arc.c source code [netbsd/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c]

1	/*
2	* CDDL HEADER START
3	*
4	* The contents of this file are subject to the terms of the
5	* Common Development and Distribution License (the "License").
6	* You may not use this file except in compliance with the License.
7	*
8	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9	* or http://www.opensolaris.org/os/licensing.
10	* See the License for the specific language governing permissions
11	* and limitations under the License.
12	*
13	* When distributing Covered Code, include this CDDL HEADER in each
14	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15	* If applicable, add the following below this CDDL HEADER, with the
16	* fields enclosed by brackets "[]" replaced with your own identifying
17	* information: Portions Copyright [yyyy] [name of copyright owner]
18	*
19	* CDDL HEADER END
20	*/
21	/*
22	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23	* Copyright (c) 2012, Joyent, Inc. All rights reserved.
24	* Copyright (c) 2011, 2016 by Delphix. All rights reserved.
25	* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26	* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
27	*/
28
29	/*
30	* DVA-based Adjustable Replacement Cache
31	*
32	* While much of the theory of operation used here is
33	* based on the self-tuning, low overhead replacement cache
34	* presented by Megiddo and Modha at FAST 2003, there are some
35	* significant differences:
36	*
37	* 1. The Megiddo and Modha model assumes any page is evictable.
38	* Pages in its cache cannot be "locked" into memory. This makes
39	* the eviction algorithm simple: evict the last page in the list.
40	* This also makes the performance characteristics easy to reason
41	* about. Our cache is not so simple. At any given moment, some
42	* subset of the blocks in the cache are un-evictable because we
43	* have handed out a reference to them. Blocks are only evictable
44	* when there are no external references active. This makes
45	* eviction far more problematic: we choose to evict the evictable
46	* blocks that are the "lowest" in the list.
47	*
48	* There are times when it is not possible to evict the requested
49	* space. In these circumstances we are unable to adjust the cache
50	* size. To prevent the cache growing unbounded at these times we
51	* implement a "cache throttle" that slows the flow of new data
52	* into the cache until we can make space available.
53	*
54	* 2. The Megiddo and Modha model assumes a fixed cache size.
55	* Pages are evicted when the cache is full and there is a cache
56	* miss. Our model has a variable sized cache. It grows with
57	* high use, but also tries to react to memory pressure from the
58	* operating system: decreasing its size when system memory is
59	* tight.
60	*
61	* 3. The Megiddo and Modha model assumes a fixed page size. All
62	* elements of the cache are therefore exactly the same size. So
63	* when adjusting the cache size following a cache miss, it's simply
64	* a matter of choosing a single page to evict. In our model, we
65	* have variable sized cache blocks (ranging from 512 bytes to
66	* 128K bytes). We therefore choose a set of blocks to evict to make
67	* space for a cache miss that approximates as closely as possible
68	* the space used by the new block.
69	*
70	* See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71	* by N. Megiddo & D. Modha, FAST 2003
72	*/
73
74	/*
75	* The locking model:
76	*
77	* A new reference to a cache buffer can be obtained in two
78	* ways: 1) via a hash table lookup using the DVA as a key,
79	* or 2) via one of the ARC lists. The arc_read() interface
80	* uses method 1, while the internal arc algorithms for
81	* adjusting the cache use method 2. We therefore provide two
82	* types of locks: 1) the hash table lock array, and 2) the
83	* arc list locks.
84	*
85	* Buffers do not have their own mutexes, rather they rely on the
86	* hash table mutexes for the bulk of their protection (i.e. most
87	* fields in the arc_buf_hdr_t are protected by these mutexes).
88	*
89	* buf_hash_find() returns the appropriate mutex (held) when it
90	* locates the requested buffer in the hash table. It returns
91	* NULL for the mutex if the buffer was not in the table.
92	*
93	* buf_hash_remove() expects the appropriate hash mutex to be
94	* already held before it is invoked.
95	*
96	* Each arc state also has a mutex which is used to protect the
97	* buffer list associated with the state. When attempting to
98	* obtain a hash table lock while holding an arc list lock you
99	* must use: mutex_tryenter() to avoid deadlock. Also note that
100	* the active state mutex must be held before the ghost state mutex.
101	*
102	* Arc buffers may have an associated eviction callback function.
103	* This function will be invoked prior to removing the buffer (e.g.
104	* in arc_do_user_evicts()). Note however that the data associated
105	* with the buffer may be evicted prior to the callback. The callback
106	* must be made with no locks held (to prevent deadlock). Additionally,
107	* the users of callbacks must ensure that their private data is
108	* protected from simultaneous callbacks from arc_clear_callback()
109	* and arc_do_user_evicts().
110	*
111	* Note that the majority of the performance stats are manipulated
112	* with atomic operations.
113	*
114	* The L2ARC uses the l2ad_mtx on each vdev for the following:
115	*
116	* - L2ARC buflist creation
117	* - L2ARC buflist eviction
118	* - L2ARC write completion, which walks L2ARC buflists
119	* - ARC header destruction, as it removes from L2ARC buflists
120	* - ARC header release, as it removes from L2ARC buflists
121	*/
122
123	/*
124	* ARC operation:
125	*
126	* Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
127	* This structure can point either to a block that is still in the cache or to
128	* one that is only accessible in an L2 ARC device, or it can provide
129	* information about a block that was recently evicted. If a block is
130	* only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
131	* information to retrieve it from the L2ARC device. This information is
132	* stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
133	* that is in this state cannot access the data directly.
134	*
135	* Blocks that are actively being referenced or have not been evicted
136	* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
137	* the arc_buf_hdr_t that will point to the data block in memory. A block can
138	* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
139	* caches data in two ways -- in a list of arc buffers (arc_buf_t) and
140	* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
141	* Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
142	* consumer, and always contains uncompressed data. The ARC will provide
143	* references to this data and will keep it cached until it is no longer in
144	* use. Typically, the arc will try to cache only the L1ARC's physical data
145	* block and will aggressively evict any arc_buf_t that is no longer referenced.
146	* The amount of memory consumed by the arc_buf_t's can be seen via the
147	* "overhead_size" kstat.
148	*
149	*
150	*                arc_buf_hdr_t
151	*                +-----------+
152	*                |           |
153	*                |           |
154	*                |           |
155	*                +-----------+
156	* l2arc_buf_hdr_t|           |
157	*                |           |
158	*                +-----------+
159	* l1arc_buf_hdr_t|           |
160	*                |           |              arc_buf_t
161	*                |    b_buf  +------------>+---------+      arc_buf_t
162	*                |           |             |b_next   +---->+---------+
163	*                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
164	*                +-----------+ |           |         |     +---------+
165	*                              |           |b_data   +-+   |         |
166	*                              |           +---------+ |   |b_data   +-+
167	*                              +->+------+             |   +---------+ |
168	*                 (potentially)   |      |             |               |
169	*                   compressed    |      |             |               |
170	*                      data       +------+             |               v
171	*                                                      +->+------+     +------+
172	*                                         uncompressed    |      |     |      |
173	*                                             data        |      |     |      |
174	*                                                         +------+     +------+
175	*
176	* The L1ARC's data pointer, however, may or may not be uncompressed. The
177	* ARC has the ability to store the physical data (b_pdata) associated with
178	* the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
179	* physical block, it will match its on-disk compression characteristics.
180	* If the block on-disk is compressed, then the physical data block
181	* in the cache will also be compressed and vice-versa. This behavior
182	* can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
183	* compressed ARC functionality is disabled, the b_pdata will point to an
184	* uncompressed version of the on-disk data.
185	*
186	* When a consumer reads a block, the ARC must first look to see if the
187	* arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
188	* then an additional arc_buf_t is allocated and the uncompressed data is
189	* bcopied from the existing arc_buf_t. If the hdr is cached but does not
190	* have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
191	* the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
192	* b_pdata is not compressed, then the block is shared with the newly
193	* allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
194	* in the arc buffer chain. Sharing the block reduces the memory overhead
195	* required when the hdr is caching uncompressed blocks or the compressed
196	* arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
197	*
198	* The diagram below shows an example of an uncompressed ARC hdr that is
199	* sharing its data with an arc_buf_t:
200	*
201	*                arc_buf_hdr_t
202	*                +-----------+
203	*                |           |
204	*                |           |
205	*                |           |
206	*                +-----------+
207	* l2arc_buf_hdr_t|           |
208	*                |           |
209	*                +-----------+
210	* l1arc_buf_hdr_t|           |
211	*                |           |              arc_buf_t (shared)
212	*                |    b_buf  +------------>+---------+      arc_buf_t
213	*                |           |             |b_next   +---->+---------+
214	*                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
215	*                +-----------+ |           |         |     +---------+
216	*                              |           |b_data   +-+   |         |
217	*                              |           +---------+ |   |b_data   +-+
218	*                              +->+------+             |   +---------+ |
219	*                                 |      |             |               |
220	*                   uncompressed  |      |             |               |
221	*                       data      +------+             |               |
222	*                                    ^                 +->+------+     |
223	*                                    |       uncompressed |      |     |
224	*                                    |           data     |      |     |
225	*                                    |                    +------+     |
226	*                                    +---------------------------------+
227	*
228	* Writing to the arc requires that the ARC first discard the b_pdata
229	* since the physical block is about to be rewritten. The new data contents
230	* will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
231	* performs the write, it may compress the data before writing it to disk.
232	* The ARC will be called with the transformed data and will bcopy the
233	* transformed on-disk block into a newly allocated b_pdata.
234	*
235	* When the L2ARC is in use, it will also take advantage of the b_pdata. The
236	* L2ARC will always write the contents of b_pdata to the L2ARC. This means
237	* that when compressed arc is enabled that the L2ARC blocks are identical
238	* to the on-disk block in the main data pool. This provides a significant
239	* advantage since the ARC can leverage the bp's checksum when reading from the
240	* L2ARC to determine if the contents are valid. However, if the compressed
241	* arc is disabled, then the L2ARC's block must be transformed to look
242	* like the physical block in the main data pool before comparing the
243	* checksum and determining its validity.
244	*/
245
246	#include <sys/spa.h>
247	#include <sys/zio.h>
248	#include <sys/spa_impl.h>
249	#include <sys/zio_compress.h>
250	#include <sys/zio_checksum.h>
251	#include <sys/zfs_context.h>
252	#include <sys/arc.h>
253	#include <sys/refcount.h>
254	#include <sys/vdev.h>
255	#include <sys/vdev_impl.h>
256	#include <sys/dsl_pool.h>
257	#include <sys/multilist.h>
258	#ifdef _KERNEL
259	#include <sys/dnlc.h>
260	#include <sys/racct.h>
261	#endif
262	#include <sys/callb.h>
263	#include <sys/kstat.h>
264	#include <sys/trim_map.h>
265	#include <zfs_fletcher.h>
266	#include <sys/sdt.h>
267
268	#include <machine/vmparam.h>
269
270	#ifdef illumos
271	#ifndef _KERNEL
272	/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
273	boolean_t arc_watch = B_FALSE;
274	int arc_procfd;
275	#endif
276	#endif /* illumos */
277
278	#ifdef __NetBSD__
279	#include <uvm/uvm.h>
280	#ifndef btop
281	#define btop(x) ((x) / PAGE_SIZE)
282	#endif
283	//#define needfree (uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0)
284	#define buf_init arc_buf_init
285	#define freemem uvmexp.free
286	#define minfree uvmexp.freemin
287	#define desfree uvmexp.freetarg
288	#define lotsfree (desfree * 2)
289	#define availrmem desfree
290	#define swapfs_minfree 0
291	#define swapfs_reserve 0
292	#undef curproc
293	#define curproc curlwp
294	#define proc_pageout uvm.pagedaemon_lwp
295
296	static void *zio_arena;
297
298	#include <sys/callback.h>
299	/* Structures used for memory and kva space reclaim. */
300	static struct callback_entry arc_kva_reclaim_entry;
301
302	#endif /* __NetBSD__ */
303
304	static kmutex_t arc_reclaim_lock;
305	static kcondvar_t arc_reclaim_thread_cv;
306	static boolean_t arc_reclaim_thread_exit;
307	static kcondvar_t arc_reclaim_waiters_cv;
308
309	#ifdef __FreeBSD__
310	static kmutex_t arc_dnlc_evicts_lock;
311	static kcondvar_t arc_dnlc_evicts_cv;
312	static boolean_t arc_dnlc_evicts_thread_exit;
313
314	uint_t arc_reduce_dnlc_percent = `3`;
315	#endif
316
317	/*
318	* The number of headers to evict in arc_evict_state_impl() before
319	* dropping the sublist lock and evicting from another sublist. A lower
320	* value means we're more likely to evict the "correct" header (i.e. the
321	* oldest header in the arc state), but comes with higher overhead
322	* (i.e. more invocations of arc_evict_state_impl()).
323	*/
324	int zfs_arc_evict_batch_limit = `10`;
325
326	/*
327	* The number of sublists used for each of the arc state lists. If this
328	* is not set to a suitable value by the user, it will be configured to
329	* the number of CPUs on the system in arc_init().
330	*/
331	int zfs_arc_num_sublists_per_state = `0`;
332
333	/* number of seconds before growing cache again */
334	static int arc_grow_retry = `60`;
335
336	/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
337	int zfs_arc_overflow_shift = `8`;
338
339	/* shift of arc_c for calculating both min and max arc_p */
340	static int arc_p_min_shift = `4`;
341
342	/* log2(fraction of arc to reclaim) */
343	static int arc_shrink_shift = `7`;
344
345	/*
346	* log2(fraction of ARC which must be free to allow growing).
347	* I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
348	* when reading a new block into the ARC, we will evict an equal-sized block
349	* from the ARC.
350	*
351	* This must be less than arc_shrink_shift, so that when we shrink the ARC,
352	* we will still not allow it to grow.
353	*/
354	int arc_no_grow_shift = `5`;
355
356
357	/*
358	* minimum lifespan of a prefetch block in clock ticks
359	* (initialized in arc_init())
360	*/
361	static int arc_min_prefetch_lifespan;
362
363	/*
364	* If this percent of memory is free, don't throttle.
365	*/
366	int arc_lotsfree_percent = `10`;
367
368	static int arc_dead;
369	extern boolean_t zfs_prefetch_disable;
370
371	/*
372	* The arc has filled available memory and has now warmed up.
373	*/
374	static boolean_t arc_warm;
375
376	/*
377	* These tunables are for performance analysis.
378	*/
379	uint64_t zfs_arc_max;
380	uint64_t zfs_arc_min;
381	uint64_t zfs_arc_meta_limit = `0`;
382	uint64_t zfs_arc_meta_min = `0`;
383	int zfs_arc_grow_retry = `0`;
384	int zfs_arc_shrink_shift = `0`;
385	int zfs_arc_p_min_shift = `0`;
386	uint64_t zfs_arc_average_blocksize = `8` * `1024`; / 8KB /
387	u_int zfs_arc_free_target = `0`;
388
389	/* Absolute min for arc min / max is 16MB. */
390	static uint64_t arc_abs_min = `16` << `20`;
391
392	boolean_t zfs_compressed_arc_enabled = B_TRUE;
393
394	#if defined(__FreeBSD__) && defined(_KERNEL)
395	static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
396	static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
397	static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
398	static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
399
	/*
	* Set the initial value of zfs_arc_free_target from
	* vm_pageout_wakeup_thresh. Hooked up through the SYSINIT() invocation
	* that follows this function; the argument is not used.
	*/
400	static void
401	arc_free_target_init(void *unused __unused)
402	{
403
404	zfs_arc_free_target = vm_pageout_wakeup_thresh;
405	}
406	SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
407	arc_free_target_init, NULL);
408
409	TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
410	TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
411	TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
412	SYSCTL_DECL(_vfs_zfs);
413	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 \| CTLFLAG_RWTUN,
414	`0`, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
415	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 \| CTLFLAG_RWTUN,
416	`0`, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
417	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
418	&zfs_arc_average_blocksize, `0`,
419	"ARC average blocksize");
420	SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
421	&arc_shrink_shift, `0`,
422	"log2(fraction of arc to reclaim)");
423	SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
424	&zfs_compressed_arc_enabled, `0`, "Enable compressed ARC");
425
426	/*
427	* We don't have a tunable for arc_free_target due to the dependency on
428	* pagedaemon initialisation.
429	*/
430	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
431	CTLTYPE_UINT \| CTLFLAG_MPSAFE \| CTLFLAG_RW, `0`, sizeof(u_int),
432	sysctl_vfs_zfs_arc_free_target, "IU",
433	"Desired number of free pages below which ARC triggers reclaim");
434
	/*
	* Handler for the arc_free_target sysctl registered under _vfs_zfs by
	* the SYSCTL_PROC() invocation above.
	*
	* The current zfs_arc_free_target is copied into a local and handed to
	* sysctl_handle_int(). If that call fails, or req->newptr is NULL
	* (presumably a request that carries no new value), its result is
	* returned unchanged. Otherwise the value is stored only when it lies
	* within [minfree, vm_cnt.v_page_count]; anything outside that range
	* yields EINVAL and leaves zfs_arc_free_target untouched.
	*
	* Returns 0 on success, EINVAL for an out-of-range value, or the error
	* reported by sysctl_handle_int().
	*/
435	static int
436	sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
437	{
438	u_int val;
439	int err;
440
441	val = zfs_arc_free_target;	/* start from the current setting */
442	err = sysctl_handle_int(oidp, &val, `0`, req);
443	if (err != `0` \|\| req->newptr == NULL)	/* failure, or nothing to store */
444	return (err);
445
446	if (val < minfree)	/* reject values below minfree */
447	return (EINVAL);
448	if (val > vm_cnt.v_page_count)	/* reject values above vm_cnt.v_page_count */
449	return (EINVAL);
450
451	zfs_arc_free_target = val;	/* range-checked: commit the new value */
452
453	return (`0`);
454	}
455
456	/*
457	* Must be declared here, before the definition of the corresponding kstat
458	* macro, which uses the same name and would otherwise confuse the compiler.
459	*/
460	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
461	CTLTYPE_U64 \| CTLFLAG_MPSAFE \| CTLFLAG_RW, `0`, sizeof(uint64_t),
462	sysctl_vfs_zfs_arc_meta_limit, "QU",
463	"ARC metadata limit");
464	#endif
465
466	/*
467	* Note that buffers can be in one of 6 states:
468	* ARC_anon - anonymous (discussed below)
469	* ARC_mru - recently used, currently cached
470	* ARC_mru_ghost - recently used, no longer in cache
471	* ARC_mfu - frequently used, currently cached
472	* ARC_mfu_ghost - frequently used, no longer in cache
473	* ARC_l2c_only - exists in L2ARC but not other states
474	* When there are no active references to the buffer, they are
475	* linked onto a list in one of these arc states. These are
476	* the only buffers that can be evicted or deleted. Within each
477	* state there are multiple lists, one for meta-data and one for
478	* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
479	* etc.) is tracked separately so that it can be managed more
480	* explicitly: favored over data, limited explicitly.
481	*
482	* Anonymous buffers are buffers that are not associated with
483	* a DVA. These are buffers that hold dirty block copies
484	* before they are written to stable storage. By definition,
485	* they are "ref'd" and are considered part of arc_mru
486	* that cannot be freed. Generally, they will acquire a DVA
487	* as they are written and migrate onto the arc_mru list.
488	*
489	* The ARC_l2c_only state is for buffers that are in the second
490	* level ARC but no longer in any of the ARC_m* lists. The second
491	* level ARC itself may also contain buffers that are in any of
492	* the ARC_m* states - meaning that a buffer can exist in two
493	* places. The reason for the ARC_l2c_only state is to keep the
494	* buffer header in the hash table, so that reads that hit the
495	* second level ARC benefit from these fast lookups.
496	*/
497
	/*
	* Bookkeeping for a single ARC state. One static instance of this type
	* is defined below for each state (ARC_anon, ARC_mru, ARC_mru_ghost,
	* ARC_mfu, ARC_mfu_ghost and ARC_l2c_only).
	*/
498	typedef struct arc_state {
499	/*
500	* lists of evictable buffers; the array has ARC_BUFC_NUMTYPES entries
501	*/
502	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
503	/*
504	* total amount of evictable data in this state (ARC_BUFC_NUMTYPES entries)
505	*/
506	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
507	/*
508	* total amount of data in this state; this includes: evictable,
509	* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
510	*/
511	refcount_t arcs_size;
512	} arc_state_t;
513
514	/* The 6 states: */
515	static arc_state_t ARC_anon;
516	static arc_state_t ARC_mru;
517	static arc_state_t ARC_mru_ghost;
518	static arc_state_t ARC_mfu;
519	static arc_state_t ARC_mfu_ghost;
520	static arc_state_t ARC_l2c_only;
521
	/*
	* Named kstat entries kept for the ARC. The arc_stats variable defined
	* below initializes these fields positionally, so the field order here
	* must stay in step with that initializer.
	*/
522	typedef struct arc_stats {
523	kstat_named_t arcstat_hits;
524	kstat_named_t arcstat_misses;
525	kstat_named_t arcstat_demand_data_hits;
526	kstat_named_t arcstat_demand_data_misses;
527	kstat_named_t arcstat_demand_metadata_hits;
528	kstat_named_t arcstat_demand_metadata_misses;
529	kstat_named_t arcstat_prefetch_data_hits;
530	kstat_named_t arcstat_prefetch_data_misses;
531	kstat_named_t arcstat_prefetch_metadata_hits;
532	kstat_named_t arcstat_prefetch_metadata_misses;
533	kstat_named_t arcstat_mru_hits;
534	kstat_named_t arcstat_mru_ghost_hits;
535	kstat_named_t arcstat_mfu_hits;
536	kstat_named_t arcstat_mfu_ghost_hits;
537	kstat_named_t arcstat_allocated;
538	kstat_named_t arcstat_deleted;
539	/*
540	* Number of buffers that could not be evicted because the hash lock
541	* was held by another thread. The lock may not necessarily be held
542	* by something using the same buffer, since hash locks are shared
543	* by multiple buffers.
544	*/
545	kstat_named_t arcstat_mutex_miss;
546	/*
547	* Number of buffers skipped because they have I/O in progress, are
548	* indirect prefetch buffers that have not lived long enough, or are
549	* not from the spa we're trying to evict from.
550	*/
551	kstat_named_t arcstat_evict_skip;
552	/*
553	* Number of times arc_evict_state() was unable to evict enough
554	* buffers to reach its target amount.
555	*/
556	kstat_named_t arcstat_evict_not_enough;
557	kstat_named_t arcstat_evict_l2_cached;
558	kstat_named_t arcstat_evict_l2_eligible;
559	kstat_named_t arcstat_evict_l2_ineligible;
560	kstat_named_t arcstat_evict_l2_skip;
561	kstat_named_t arcstat_hash_elements;
562	kstat_named_t arcstat_hash_elements_max;
563	kstat_named_t arcstat_hash_collisions;
564	kstat_named_t arcstat_hash_chains;
565	kstat_named_t arcstat_hash_chain_max;
566	kstat_named_t arcstat_p;
567	kstat_named_t arcstat_c;
568	kstat_named_t arcstat_c_min;
569	kstat_named_t arcstat_c_max;
570	kstat_named_t arcstat_size;
571	/*
572	* Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
573	* Note that the compressed bytes may match the uncompressed bytes
574	* if the block is either not compressed or compressed arc is disabled.
575	*/
576	kstat_named_t arcstat_compressed_size;
577	/*
578	* Uncompressed size of the data stored in b_pdata. If compressed
579	* arc is disabled then this value will be identical to the stat
580	* above.
581	*/
582	kstat_named_t arcstat_uncompressed_size;
583	/*
584	* Number of bytes stored in all the arc_buf_t's. This is classified
585	* as "overhead" since this data is typically short-lived and will
586	* be evicted from the arc when it becomes unreferenced unless the
587	* zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
588	* values have been set (see comment in dbuf.c for more information).
589	*/
590	kstat_named_t arcstat_overhead_size;
591	/*
592	* Number of bytes consumed by internal ARC structures necessary
593	* for tracking purposes; these structures are not actually
594	* backed by ARC buffers. This includes arc_buf_hdr_t structures
595	* (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
596	* caches), and arc_buf_t structures (allocated via arc_buf_t
597	* cache).
598	*/
599	kstat_named_t arcstat_hdr_size;
600	/*
601	* Number of bytes consumed by ARC buffers of type equal to
602	* ARC_BUFC_DATA. This is generally consumed by buffers backing
603	* on disk user data (e.g. plain file contents).
604	*/
605	kstat_named_t arcstat_data_size;
606	/*
607	* Number of bytes consumed by ARC buffers of type equal to
608	* ARC_BUFC_METADATA. This is generally consumed by buffers
609	* backing on disk data that is used for internal ZFS
610	* structures (e.g. ZAP, dnode, indirect blocks, etc).
611	*/
612	kstat_named_t arcstat_metadata_size;
613	/*
614	* Number of bytes consumed by various buffers and structures
615	* not actually backed with ARC buffers. This includes bonus
616	* buffers (allocated directly via zio_buf_* functions),
617	* dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
618	* cache), and dnode_t structures (allocated via dnode_t cache).
619	*/
620	kstat_named_t arcstat_other_size;
621	/*
622	* Total number of bytes consumed by ARC buffers residing in the
623	* arc_anon state. This includes all buffers in the arc_anon
624	* state; e.g. data, metadata, evictable, and unevictable buffers
625	* are all included in this value.
626	*/
627	kstat_named_t arcstat_anon_size;
628	/*
629	* Number of bytes consumed by ARC buffers that meet the
630	* following criteria: backing buffers of type ARC_BUFC_DATA,
631	* residing in the arc_anon state, and are eligible for eviction
632	* (e.g. have no outstanding holds on the buffer).
633	*/
634	kstat_named_t arcstat_anon_evictable_data;
635	/*
636	* Number of bytes consumed by ARC buffers that meet the
637	* following criteria: backing buffers of type ARC_BUFC_METADATA,
638	* residing in the arc_anon state, and are eligible for eviction
639	* (e.g. have no outstanding holds on the buffer).
640	*/
641	kstat_named_t arcstat_anon_evictable_metadata;
642	/*
643	* Total number of bytes consumed by ARC buffers residing in the
644	* arc_mru state. This includes all buffers in the arc_mru
645	* state; e.g. data, metadata, evictable, and unevictable buffers
646	* are all included in this value.
647	*/
648	kstat_named_t arcstat_mru_size;
649	/*
650	* Number of bytes consumed by ARC buffers that meet the
651	* following criteria: backing buffers of type ARC_BUFC_DATA,
652	* residing in the arc_mru state, and are eligible for eviction
653	* (e.g. have no outstanding holds on the buffer).
654	*/
655	kstat_named_t arcstat_mru_evictable_data;
656	/*
657	* Number of bytes consumed by ARC buffers that meet the
658	* following criteria: backing buffers of type ARC_BUFC_METADATA,
659	* residing in the arc_mru state, and are eligible for eviction
660	* (e.g. have no outstanding holds on the buffer).
661	*/
662	kstat_named_t arcstat_mru_evictable_metadata;
663	/*
664	* Total number of bytes that would have been consumed by ARC
665	* buffers in the arc_mru_ghost state. The key thing to note
666	* here, is the fact that this size doesn't actually indicate
667	* RAM consumption. The ghost lists only consist of headers and
668	* don't actually have ARC buffers linked off of these headers.
669	* Thus, if the headers had associated ARC buffers, these
670	* buffers would have consumed this number of bytes.
671	*/
672	kstat_named_t arcstat_mru_ghost_size;
673	/*
674	* Number of bytes that would have been consumed by ARC
675	* buffers that are eligible for eviction, of type
676	* ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
677	*/
678	kstat_named_t arcstat_mru_ghost_evictable_data;
679	/*
680	* Number of bytes that would have been consumed by ARC
681	* buffers that are eligible for eviction, of type
682	* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
683	*/
684	kstat_named_t arcstat_mru_ghost_evictable_metadata;
685	/*
686	* Total number of bytes consumed by ARC buffers residing in the
687	* arc_mfu state. This includes all buffers in the arc_mfu
688	* state; e.g. data, metadata, evictable, and unevictable buffers
689	* are all included in this value.
690	*/
691	kstat_named_t arcstat_mfu_size;
692	/*
693	* Number of bytes consumed by ARC buffers that are eligible for
694	* eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
695	* state.
696	*/
697	kstat_named_t arcstat_mfu_evictable_data;
698	/*
699	* Number of bytes consumed by ARC buffers that are eligible for
700	* eviction, of type ARC_BUFC_METADATA, and reside in the
701	* arc_mfu state.
702	*/
703	kstat_named_t arcstat_mfu_evictable_metadata;
704	/*
705	* Total number of bytes that would have been consumed by ARC
706	* buffers in the arc_mfu_ghost state. See the comment above
707	* arcstat_mru_ghost_size for more details.
708	*/
709	kstat_named_t arcstat_mfu_ghost_size;
710	/*
711	* Number of bytes that would have been consumed by ARC
712	* buffers that are eligible for eviction, of type
713	* ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
714	*/
715	kstat_named_t arcstat_mfu_ghost_evictable_data;
716	/*
717	* Number of bytes that would have been consumed by ARC
718	* buffers that are eligible for eviction, of type
719	* ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
720	*/
721	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
722	kstat_named_t arcstat_l2_hits;
723	kstat_named_t arcstat_l2_misses;
724	kstat_named_t arcstat_l2_feeds;
725	kstat_named_t arcstat_l2_rw_clash;
726	kstat_named_t arcstat_l2_read_bytes;
727	kstat_named_t arcstat_l2_write_bytes;
728	kstat_named_t arcstat_l2_writes_sent;
729	kstat_named_t arcstat_l2_writes_done;
730	kstat_named_t arcstat_l2_writes_error;
731	kstat_named_t arcstat_l2_writes_lock_retry;
732	kstat_named_t arcstat_l2_evict_lock_retry;
733	kstat_named_t arcstat_l2_evict_reading;
734	kstat_named_t arcstat_l2_evict_l1cached;
735	kstat_named_t arcstat_l2_free_on_write;
736	kstat_named_t arcstat_l2_abort_lowmem;
737	kstat_named_t arcstat_l2_cksum_bad;
738	kstat_named_t arcstat_l2_io_error;
739	kstat_named_t arcstat_l2_size;
740	kstat_named_t arcstat_l2_asize;
741	kstat_named_t arcstat_l2_hdr_size;
742	kstat_named_t arcstat_l2_write_trylock_fail;
743	kstat_named_t arcstat_l2_write_passed_headroom;
744	kstat_named_t arcstat_l2_write_spa_mismatch;
745	kstat_named_t arcstat_l2_write_in_l2;
746	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
747	kstat_named_t arcstat_l2_write_not_cacheable;
748	kstat_named_t arcstat_l2_write_full;
749	kstat_named_t arcstat_l2_write_buffer_iter;
750	kstat_named_t arcstat_l2_write_pios;
751	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
752	kstat_named_t arcstat_l2_write_buffer_list_iter;
753	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
754	kstat_named_t arcstat_memory_throttle_count;
755	kstat_named_t arcstat_meta_used;
756	kstat_named_t arcstat_meta_limit;
757	kstat_named_t arcstat_meta_max;
758	kstat_named_t arcstat_meta_min;
759	kstat_named_t arcstat_sync_wait_for_async;
760	kstat_named_t arcstat_demand_hit_predictive_prefetch;
761	} arc_stats_t;
762
/*
 * The single global instance of the ARC statistics.  Each entry supplies the
 * exported kstat name and data type (always KSTAT_DATA_UINT64) for one
 * arc_stats_t member.  The initializers are positional, so this list has to
 * stay in the same order as the members of arc_stats_t.
 *
 * NOTE(review): the exported name "l2_write_io_in_progress" pairs with the
 * member arcstat_l2_write_hdr_io_in_progress; the names differ, the slot does
 * not.
 */
static arc_stats_t arc_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
	{ "demand_data_hits", KSTAT_DATA_UINT64 },
	{ "demand_data_misses", KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits", KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
	{ "mru_hits", KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits", KSTAT_DATA_UINT64 },
	{ "mfu_hits", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
	{ "allocated", KSTAT_DATA_UINT64 },
	{ "deleted", KSTAT_DATA_UINT64 },
	{ "mutex_miss", KSTAT_DATA_UINT64 },
	{ "evict_skip", KSTAT_DATA_UINT64 },
	{ "evict_not_enough", KSTAT_DATA_UINT64 },
	{ "evict_l2_cached", KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
	{ "evict_l2_skip", KSTAT_DATA_UINT64 },
	{ "hash_elements", KSTAT_DATA_UINT64 },
	{ "hash_elements_max", KSTAT_DATA_UINT64 },
	{ "hash_collisions", KSTAT_DATA_UINT64 },
	{ "hash_chains", KSTAT_DATA_UINT64 },
	{ "hash_chain_max", KSTAT_DATA_UINT64 },
	{ "p", KSTAT_DATA_UINT64 },
	{ "c", KSTAT_DATA_UINT64 },
	{ "c_min", KSTAT_DATA_UINT64 },
	{ "c_max", KSTAT_DATA_UINT64 },
	{ "size", KSTAT_DATA_UINT64 },
	{ "compressed_size", KSTAT_DATA_UINT64 },
	{ "uncompressed_size", KSTAT_DATA_UINT64 },
	{ "overhead_size", KSTAT_DATA_UINT64 },
	{ "hdr_size", KSTAT_DATA_UINT64 },
	{ "data_size", KSTAT_DATA_UINT64 },
	{ "metadata_size", KSTAT_DATA_UINT64 },
	{ "other_size", KSTAT_DATA_UINT64 },
	{ "anon_size", KSTAT_DATA_UINT64 },
	{ "anon_evictable_data", KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_size", KSTAT_DATA_UINT64 },
	{ "mru_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mru_ghost_size", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits", KSTAT_DATA_UINT64 },
	{ "l2_misses", KSTAT_DATA_UINT64 },
	{ "l2_feeds", KSTAT_DATA_UINT64 },
	{ "l2_rw_clash", KSTAT_DATA_UINT64 },
	{ "l2_read_bytes", KSTAT_DATA_UINT64 },
	{ "l2_write_bytes", KSTAT_DATA_UINT64 },
	{ "l2_writes_sent", KSTAT_DATA_UINT64 },
	{ "l2_writes_done", KSTAT_DATA_UINT64 },
	{ "l2_writes_error", KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
	{ "l2_evict_reading", KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
	{ "l2_free_on_write", KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
	{ "l2_io_error", KSTAT_DATA_UINT64 },
	{ "l2_size", KSTAT_DATA_UINT64 },
	{ "l2_asize", KSTAT_DATA_UINT64 },
	{ "l2_hdr_size", KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2", KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress", KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable", KSTAT_DATA_UINT64 },
	{ "l2_write_full", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_pios", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count", KSTAT_DATA_UINT64 },
	{ "arc_meta_used", KSTAT_DATA_UINT64 },
	{ "arc_meta_limit", KSTAT_DATA_UINT64 },
	{ "arc_meta_max", KSTAT_DATA_UINT64 },
	{ "arc_meta_min", KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
859
/* Lvalue for the 64-bit value of the named arc_stats member. */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/* Add val (which may be negative) to the named statistic via atomic_add_64. */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

/* Increment / decrement the named statistic by one. */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise the named statistic to val if val is larger.  The loop re-reads the
 * current value and retries the compare-and-swap until either the stored
 * value is no longer smaller than val or the swap succeeds.
 *
 * NOTE(review): val is evaluated more than once and the expansion declares a
 * local named "m", so val must be side-effect free and must not refer to a
 * caller variable called m.  The expansion is a bare brace block rather than
 * do { } while (0), so it is not safe as the unbraced body of an if/else.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

/* Update the "<stat>_max" statistic from the current value of <stat>. */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
877
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The statistic that gets bumped is named
 * arcstat_<stat1|notstat1>_<stat2|notstat2>_<stat>, where the first
 * component is chosen by cond1 and the second by cond2.
 *
 * NOTE(review): the expansion is a bare if/else statement, so it should only
 * be used where a full statement is expected (not as the unbraced body of
 * another if that has its own else).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
897
/* kstat handle for the ARC; not static, so visible outside this file. */
kstat_t			*arc_ksp;
/*
 * Pointers to the individual ARC state objects.  They are assigned elsewhere
 * in the file (not visible in this excerpt).  The three "ghost-like" states
 * tested by GHOST_STATE() are arc_mru_ghost, arc_mfu_ghost and arc_l2c_only.
 */
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;
905
906	/*
907	* There are several ARC variables that are critical to export as kstats --
908	* but we don't want to have to grovel around in the kstat whenever we wish to
909	* manipulate them. For these variables, we therefore define them to be in
910	* terms of the statistic variable. This assures that we are not introducing
911	* the possibility of inconsistency by having shadow copies of the variables,
912	* while still allowing the code to be readable.
913	*/
914	#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
915	#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
916	#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
917	#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
918	#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
919	#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
920	#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
921	#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
922	#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
923
924	/ compressed size of entire arc /
925	#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
926	/ uncompressed size of entire arc /
927	#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
928	/ number of bytes in the arc from arc_buf_t's /
929	#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
930
931	static int arc_no_grow; / Don't try to grow cache size /
932	static uint64_t arc_tempreserve;
933	static uint64_t arc_loaned_bytes;
934
typedef struct arc_callback	arc_callback_t;

/*
 * One callback record.  Records chain through acb_next; the L1 header keeps a
 * pointer to such a record in b_acb.
 * NOTE(review): the field roles below are inferred from names and types only;
 * the code that fills them in is not part of this excerpt.
 */
struct arc_callback {
	void			*acb_private;	/* opaque pointer (presumably passed to acb_done) */
	arc_done_func_t		*acb_done;	/* completion function */
	arc_buf_t		*acb_buf;	/* buffer associated with this record */
	zio_t			*acb_zio_dummy;	/* zio pointer; usage not visible here */
	arc_callback_t		*acb_next;	/* next record in the chain */
};
944
typedef struct arc_write_callback arc_write_callback_t;

/*
 * Callback bundle for a write: one opaque pointer, four callback functions of
 * type arc_done_func_t, and the buffer they refer to.
 * NOTE(review): when each callback fires is decided by code outside this
 * excerpt; the names suggest the ready / children-ready / physdone / done
 * stages of a write zio -- confirm against the users.
 */
struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_children_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};
955
/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
987	typedef struct l1arc_buf_hdr {
988	kmutex_t b_freeze_lock;
989	zio_cksum_t *b_freeze_cksum;
990	#ifdef ZFS_DEBUG
991	/*
992	* used for debugging wtih kmem_flags - by allocating and freeing
993	* b_thawed when the buffer is thawed, we get a record of the stack
994	* trace that thawed it.
995	*/
996	void *b_thawed;
997	#endif
998
999	arc_buf_t *b_buf;
1000	uint32_t b_bufcnt;
1001	/ for waiting on writes to complete /
1002	kcondvar_t b_cv;
1003	uint8_t b_byteswap;
1004
1005	/ protected by arc state mutex /
1006	arc_state_t *b_state;
1007	multilist_node_t b_arc_node;
1008
1009	/ updated atomically /
1010	clock_t b_arc_access;
1011
1012	/ self protecting /
1013	refcount_t b_refcnt;
1014
1015	arc_callback_t *b_acb;
1016	void *b_pdata;
1017	} l1arc_buf_hdr_t;
1018
1019	typedef struct l2arc_dev l2arc_dev_t;
1020
1021	typedef struct l2arc_buf_hdr {
1022	/ protected by arc_buf_hdr mutex /
1023	l2arc_dev_t b_dev; /* L2ARC device /
1024	uint64_t b_daddr; / disk address, offset byte /
1025
1026	list_node_t b_l2node;
1027	} l2arc_buf_hdr_t;
1028
1029	struct arc_buf_hdr {
1030	/ protected by hash lock /
1031	dva_t b_dva;
1032	uint64_t b_birth;
1033
1034	arc_buf_contents_t b_type;
1035	arc_buf_hdr_t *b_hash_next;
1036	arc_flags_t b_flags;
1037
1038	/*
1039	* This field stores the size of the data buffer after
1040	* compression, and is set in the arc's zio completion handlers.
1041	* It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
1042	*
1043	* While the block pointers can store up to 32MB in their psize
1044	* field, we can only store up to 32MB minus 512B. This is due
1045	* to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
1046	* a field of zeros represents 512B in the bp). We can't use a
1047	* bias of 1 since we need to reserve a psize of zero, here, to
1048	* represent holes and embedded blocks.
1049	*
1050	* This isn't a problem in practice, since the maximum size of a
1051	* buffer is limited to 16MB, so we never need to store 32MB in
1052	* this field. Even in the upstream illumos code base, the
1053	* maximum size of a buffer is limited to 16MB.
1054	*/
1055	uint16_t b_psize;
1056
1057	/*
1058	* This field stores the size of the data buffer before
1059	* compression, and cannot change once set. It is in units
1060	* of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
1061	*/
1062	uint16_t b_lsize; / immutable /
1063	uint64_t b_spa; / immutable /
1064
1065	/ L2ARC fields. Undefined when not in L2ARC. /
1066	l2arc_buf_hdr_t b_l2hdr;
1067	/ L1ARC fields. Undefined when in l2arc_only state /
1068	l1arc_buf_hdr_t b_l1hdr;
1069	};
1070
1071	#if defined(__FreeBSD__) && defined(_KERNEL)
1072	static int
1073	sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
1074	{
1075	uint64_t val;
1076	int err;
1077
1078	val = arc_meta_limit;
1079	err = sysctl_handle_64(oidp, &val, `0`, req);
1080	if (err != `0` \|\| req->newptr == NULL)
1081	return (err);
1082
1083	if (val <= `0` \|\| val > arc_c_max)
1084	return (EINVAL);
1085
1086	arc_meta_limit = val;
1087	return (`0`);
1088	}
1089
1090	static int
1091	sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
1092	{
1093	uint64_t val;
1094	int err;
1095
1096	val = zfs_arc_max;
1097	err = sysctl_handle_64(oidp, &val, `0`, req);
1098	if (err != `0` \|\| req->newptr == NULL)
1099	return (err);
1100
1101	if (zfs_arc_max == `0`) {
1102	/ Loader tunable so blindly set /
1103	zfs_arc_max = val;
1104	return (`0`);
1105	}
1106
1107	if (val < arc_abs_min \|\| val > kmem_size())
1108	return (EINVAL);
1109	if (val < arc_c_min)
1110	return (EINVAL);
1111	if (zfs_arc_meta_limit > `0` && val < zfs_arc_meta_limit)
1112	return (EINVAL);
1113
1114	arc_c_max = val;
1115
1116	arc_c = arc_c_max;
1117	arc_p = (arc_c >> `1`);
1118
1119	if (zfs_arc_meta_limit == `0`) {
1120	/ limit meta-data to 1/4 of the arc capacity /
1121	arc_meta_limit = arc_c_max / `4`;
1122	}
1123
1124	/ if kmem_flags are set, lets try to use less memory /
1125	if (kmem_debugging())
1126	arc_c = arc_c / `2`;
1127
1128	zfs_arc_max = arc_c;
1129
1130	return (`0`);
1131	}
1132
1133	static int
1134	sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
1135	{
1136	uint64_t val;
1137	int err;
1138
1139	val = zfs_arc_min;
1140	err = sysctl_handle_64(oidp, &val, `0`, req);
1141	if (err != `0` \|\| req->newptr == NULL)
1142	return (err);
1143
1144	if (zfs_arc_min == `0`) {
1145	/ Loader tunable so blindly set /
1146	zfs_arc_min = val;
1147	return (`0`);
1148	}
1149
1150	if (val < arc_abs_min \|\| val > arc_c_max)
1151	return (EINVAL);
1152
1153	arc_c_min = val;
1154
1155	if (zfs_arc_meta_min == `0`)
1156	arc_meta_min = arc_c_min / `2`;
1157
1158	if (arc_c < arc_c_min)
1159	arc_c = arc_c_min;
1160
1161	zfs_arc_min = arc_c_min;
1162
1163	return (`0`);
1164	}
1165	#endif
1166
/*
 * True when state is one of the three states compared here: arc_mru_ghost,
 * arc_mfu_ghost or arc_l2c_only.
 */
#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)
1170
/*
 * Flag tests on an ARC header.  Each macro yields the masked b_flags value
 * (non-zero when the flag is set), not a normalized 0/1.
 */
#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_COMPRESSION_ENABLED(hdr)	\
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
/* An I/O is in progress on a header that also has an L2 sub-header. */
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

/* A header is "data" exactly when the metadata flag is clear. */
#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
/*
 * NOTE(review): the expansion already ends in ';', so "HDR_SET_COMPRESS(h, c);"
 * produces an extra empty statement.  Left as is because callers outside this
 * excerpt may rely on it.
 */
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));

/* True when buf is the last arc_buf_t in its b_next chain. */
#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
1203
1204	/*
1205	* Other sizes
1206	*/
1207
/* Size of a complete header, including the trailing L1 sub-header. */
#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
/* Size of a header truncated just before b_l1hdr (L2-only header). */
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
1210
1211	/*
1212	* Hash table routines
1213	*/
1214
/* Each hash lock is padded out to this many bytes (kernel builds only). */
#define	HT_LOCK_PAD	CACHE_LINE_SIZE

/* One hash-table lock; in the kernel it is padded to HT_LOCK_PAD bytes. */
struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * Number of hash locks.  BUF_HASH_LOCK_NTRY() selects a lock with
 * "idx & (BUF_LOCKS - 1)", which is only a uniform mapping when BUF_LOCKS is
 * a power of two.
 */
#define	BUF_LOCKS 256
/*
 * The header hash table: ht_table holds the chain heads (chained through
 * b_hash_next), ht_mask is the bucket count minus one (see buf_init()), and
 * ht_locks is the fixed array of locks shared among the buckets.
 */
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;
1232
/* Bucket index for the (spa, dva, birth) identity. */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
/*
 * Lock entry / lock pointer guarding bucket idx.  idx is parenthesized so
 * that an expression argument (e.g. "a | b") is masked as a whole.
 */
#define	BUF_HASH_LOCK_NTRY(idx) \
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
/* Hash lock covering the bucket that hdr's current identity hashes to. */
#define	HDR_LOCK(hdr)	\
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	(hdr)->b_birth)))
1239
/* CRC-64 lookup table, one entry per byte value; filled in by buf_init(). */
uint64_t zfs_crc64_table[256];
1241
1242	/*
1243	* Level 2 ARC
1244	*/
1245
/* Compile-time defaults for the L2ARC tunables defined below. */
#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/* Aliases for the corresponding kstat values (see ARCSTAT()). */
#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
1258
1259	/ L2ARC Performance Tunables /
1260	uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; / default max write size /
1261	uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; / extra write during warmup /
1262	uint64_t l2arc_headroom = L2ARC_HEADROOM; / number of dev writes /
1263	uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
1264	uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; / interval seconds /
1265	uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; / min interval milliseconds /
1266	boolean_t l2arc_noprefetch = B_TRUE; / don't cache prefetch bufs /
1267	boolean_t l2arc_feed_again = B_TRUE; / turbo warmup /
1268	boolean_t l2arc_norw = B_TRUE; / no reads during writes /
1269
1270	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
1271	&l2arc_write_max, `0`, "max write size");
1272	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
1273	&l2arc_write_boost, `0`, "extra write during warmup");
1274	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
1275	&l2arc_headroom, `0`, "number of dev writes");
1276	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
1277	&l2arc_feed_secs, `0`, "interval seconds");
1278	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
1279	&l2arc_feed_min_ms, `0`, "min interval milliseconds");
1280
1281	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
1282	&l2arc_noprefetch, `0`, "don't cache prefetch bufs");
1283	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
1284	&l2arc_feed_again, `0`, "turbo warmup");
1285	SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
1286	&l2arc_norw, `0`, "no reads during writes");
1287
1288	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1289	&ARC_anon.arcs_size.rc_count, `0`, "size of anonymous state");
1290	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
1291	&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, `0`,
1292	"size of anonymous state");
1293	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
1294	&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, `0`,
1295	"size of anonymous state");
1296
1297	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1298	&ARC_mru.arcs_size.rc_count, `0`, "size of mru state");
1299	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
1300	&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, `0`,
1301	"size of metadata in mru state");
1302	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
1303	&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, `0`,
1304	"size of data in mru state");
1305
1306	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1307	&ARC_mru_ghost.arcs_size.rc_count, `0`, "size of mru ghost state");
1308	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
1309	&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, `0`,
1310	"size of metadata in mru ghost state");
1311	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
1312	&ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, `0`,
1313	"size of data in mru ghost state");
1314
1315	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1316	&ARC_mfu.arcs_size.rc_count, `0`, "size of mfu state");
1317	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
1318	&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, `0`,
1319	"size of metadata in mfu state");
1320	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
1321	&ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, `0`,
1322	"size of data in mfu state");
1323
1324	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1325	&ARC_mfu_ghost.arcs_size.rc_count, `0`, "size of mfu ghost state");
1326	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
1327	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, `0`,
1328	"size of metadata in mfu ghost state");
1329	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
1330	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, `0`,
1331	"size of data in mfu ghost state");
1332
1333	SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1334	&ARC_l2c_only.arcs_size.rc_count, `0`, "size of mru state");
1335
1336	/*
1337	* L2ARC Internals
1338	*/
1339	struct l2arc_dev {
1340	vdev_t l2ad_vdev; /* vdev /
1341	spa_t l2ad_spa; /* spa /
1342	uint64_t l2ad_hand; / next write location /
1343	uint64_t l2ad_start; / first addr on device /
1344	uint64_t l2ad_end; / last addr on device /
1345	boolean_t l2ad_first; / first sweep through /
1346	boolean_t l2ad_writing; / currently writing /
1347	kmutex_t l2ad_mtx; / lock for buffer list /
1348	list_t l2ad_buflist; / buffer list /
1349	list_node_t l2ad_node; / device list node /
1350	refcount_t l2ad_alloc; / allocated bytes /
1351	};
1352
1353	static list_t L2ARC_dev_list; / device list /
1354	static list_t l2arc_dev_list; /* device list pointer /
1355	static kmutex_t l2arc_dev_mtx; / device list mutex /
1356	static l2arc_dev_t l2arc_dev_last; /* last device used /
1357	static list_t L2ARC_free_on_write; / free after write buf list /
1358	static list_t l2arc_free_on_write; /* free after write list ptr /
1359	static kmutex_t l2arc_free_on_write_mtx; / mutex for list /
1360	static uint64_t l2arc_ndev; / number of devices /
1361
1362	typedef struct l2arc_read_callback {
1363	arc_buf_hdr_t l2rcb_hdr; /* read buffer /
1364	blkptr_t l2rcb_bp; / original blkptr /
1365	zbookmark_phys_t l2rcb_zb; / original bookmark /
1366	int l2rcb_flags; / original flags /
1367	void l2rcb_data; /* temporary buffer /
1368	} l2arc_read_callback_t;
1369
1370	typedef struct l2arc_write_callback {
1371	l2arc_dev_t l2wcb_dev; /* device info /
1372	arc_buf_hdr_t l2wcb_head; /* head of write buflist /
1373	} l2arc_write_callback_t;
1374
1375	typedef struct l2arc_data_free {
1376	/ protected by l2arc_free_on_write_mtx /
1377	void *l2df_data;
1378	size_t l2df_size;
1379	arc_buf_contents_t l2df_type;
1380	list_node_t l2df_list_node;
1381	} l2arc_data_free_t;
1382
/*
 * Lock, condition variable and flag named for the L2ARC feed thread.
 * NOTE(review): presumably l2arc_thread_exit asks the thread to stop and is
 * used together with the lock/cv; the users are outside this excerpt.
 */
static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
1386
1387	static void arc_get_data_buf(arc_buf_hdr_t , uint64_t, void *);
1388	static void arc_free_data_buf(arc_buf_hdr_t , void* , uint64_t, void* *);
1389	static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
1390	static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
1391	static void arc_access(arc_buf_hdr_t , kmutex_t );
1392	static boolean_t arc_is_overflowing();
1393	static void arc_buf_watch(arc_buf_t *);
1394
1395	static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1396	static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1397	static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1398	static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1399
1400	static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1401	static void l2arc_read_done(zio_t *);
1402
1403	static void
1404	l2arc_trim(const arc_buf_hdr_t *hdr)
1405	{
1406	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1407
1408	ASSERT(HDR_HAS_L2HDR(hdr));
1409	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
1410
1411	if (HDR_GET_PSIZE(hdr) != `0`) {
1412	trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
1413	HDR_GET_PSIZE(hdr), `0`);
1414	}
1415	}
1416
1417	static uint64_t
1418	buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1419	{
1420	uint8_t vdva = (uint8_t )dva;
1421	uint64_t crc = -`1ULL`;
1422	int i;
1423
1424	ASSERT(zfs_crc64_table[`128`] == ZFS_CRC64_POLY);
1425
1426	for (i = `0`; i < sizeof (dva_t); i++)
1427	crc = (crc >> `8`) ^ zfs_crc64_table[(crc ^ vdva[i]) & `0xFF`];
1428
1429	crc ^= (spa>>`8`) ^ birth;
1430
1431	return (crc);
1432	}
1433
/* True when both words of the header's DVA are zero (no identity). */
#define	HDR_EMPTY(hdr)						\
	((hdr)->b_dva.dva_word[0] == 0 &&			\
	(hdr)->b_dva.dva_word[1] == 0)

/*
 * True when hdr carries exactly the identity (spa, dva, birth).  The whole
 * expansion is parenthesized so that "!HDR_EQUAL(...)" and similar uses
 * negate the full comparison rather than only its first term.
 */
#define	HDR_EQUAL(spa, dva, birth, hdr)				\
	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
1442
1443	static void
1444	buf_discard_identity(arc_buf_hdr_t *hdr)
1445	{
1446	hdr->b_dva.dva_word[`0`] = `0`;
1447	hdr->b_dva.dva_word[`1`] = `0`;
1448	hdr->b_birth = `0`;
1449	}
1450
1451	static arc_buf_hdr_t *
1452	buf_hash_find(uint64_t spa, const blkptr_t bp, kmutex_t *lockp)
1453	{
1454	const dva_t *dva = BP_IDENTITY(bp);
1455	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1456	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1457	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1458	arc_buf_hdr_t *hdr;
1459
1460	mutex_enter(hash_lock);
1461	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1462	hdr = hdr->b_hash_next) {
1463	if (HDR_EQUAL(spa, dva, birth, hdr)) {
1464	*lockp = hash_lock;
1465	return (hdr);
1466	}
1467	}
1468	mutex_exit(hash_lock);
1469	*lockp = NULL;
1470	return (NULL);
1471	}
1472
1473	/*
1474	* Insert an entry into the hash table. If there is already an element
1475	* equal to elem in the hash table, then the already existing element
1476	* will be returned and the new element will not be inserted.
1477	* Otherwise returns NULL.
1478	* If lockp == NULL, the caller is assumed to already hold the hash lock.
1479	*/
1480	static arc_buf_hdr_t *
1481	buf_hash_insert(arc_buf_hdr_t hdr, kmutex_t *lockp)
1482	{
1483	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1484	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1485	arc_buf_hdr_t *fhdr;
1486	uint32_t i;
1487
1488	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1489	ASSERT(hdr->b_birth != `0`);
1490	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1491
1492	if (lockp != NULL) {
1493	*lockp = hash_lock;
1494	mutex_enter(hash_lock);
1495	} else {
1496	ASSERT(MUTEX_HELD(hash_lock));
1497	}
1498
1499	for (fhdr = buf_hash_table.ht_table[idx], i = `0`; fhdr != NULL;
1500	fhdr = fhdr->b_hash_next, i++) {
1501	if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1502	return (fhdr);
1503	}
1504
1505	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1506	buf_hash_table.ht_table[idx] = hdr;
1507	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1508
1509	/ collect some hash table performance data /
1510	if (i > `0`) {
1511	ARCSTAT_BUMP(arcstat_hash_collisions);
1512	if (i == `1`)
1513	ARCSTAT_BUMP(arcstat_hash_chains);
1514
1515	ARCSTAT_MAX(arcstat_hash_chain_max, i);
1516	}
1517
1518	ARCSTAT_BUMP(arcstat_hash_elements);
1519	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1520
1521	return (NULL);
1522	}
1523
1524	static void
1525	buf_hash_remove(arc_buf_hdr_t *hdr)
1526	{
1527	arc_buf_hdr_t fhdr, *hdrp;
1528	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1529
1530	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1531	ASSERT(HDR_IN_HASH_TABLE(hdr));
1532
1533	hdrp = &buf_hash_table.ht_table[idx];
1534	while ((fhdr = *hdrp) != hdr) {
1535	ASSERT3P(fhdr, !=, NULL);
1536	hdrp = &fhdr->b_hash_next;
1537	}
1538	*hdrp = hdr->b_hash_next;
1539	hdr->b_hash_next = NULL;
1540	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1541
1542	/ collect some hash table performance data /
1543	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1544
1545	if (buf_hash_table.ht_table[idx] &&
1546	buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1547	ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1548	}
1549
1550	/*
1551	* Global data structures and functions for the buf kmem cache.
1552	*/
/* kmem caches created in buf_init() and destroyed in buf_fini(). */
static kmem_cache_t *hdr_full_cache;	/* objects of HDR_FULL_SIZE bytes */
static kmem_cache_t *hdr_l2only_cache;	/* objects of HDR_L2ONLY_SIZE bytes */
static kmem_cache_t *buf_cache;		/* arc_buf_t objects */
1556
1557	static void
1558	buf_fini(void)
1559	{
1560	int i;
1561
1562	kmem_free(buf_hash_table.ht_table,
1563	(buf_hash_table.ht_mask + `1`) * sizeof (void *));
1564	for (i = `0`; i < BUF_LOCKS; i++)
1565	mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1566	kmem_cache_destroy(hdr_full_cache);
1567	kmem_cache_destroy(hdr_l2only_cache);
1568	kmem_cache_destroy(buf_cache);
1569	}
1570
1571	/*
1572	* Constructor callback - called when the cache is empty
1573	* and a new buf is requested.
1574	*/
1575	/ ARGSUSED /
1576	static int
1577	hdr_full_cons(void vbuf, void* unused, int* kmflag)
1578	{
1579	arc_buf_hdr_t *hdr = vbuf;
1580
1581	bzero(hdr, HDR_FULL_SIZE);
1582	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1583	refcount_create(&hdr->b_l1hdr.b_refcnt);
1584	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1585	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1586	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1587
1588	return (`0`);
1589	}
1590
1591	/ ARGSUSED /
1592	static int
1593	hdr_l2only_cons(void vbuf, void* unused, int* kmflag)
1594	{
1595	arc_buf_hdr_t *hdr = vbuf;
1596
1597	bzero(hdr, HDR_L2ONLY_SIZE);
1598	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1599
1600	return (`0`);
1601	}
1602
1603	/ ARGSUSED /
1604	static int
1605	buf_cons(void vbuf, void* unused, int* kmflag)
1606	{
1607	arc_buf_t *buf = vbuf;
1608
1609	bzero(buf, sizeof (arc_buf_t));
1610	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1611	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1612
1613	return (`0`);
1614	}
1615
1616	/*
1617	* Destructor callback - called when a cached buf is
1618	* no longer required.
1619	*/
1620	/ ARGSUSED /
1621	static void
1622	hdr_full_dest(void vbuf, void* *unused)
1623	{
1624	arc_buf_hdr_t *hdr = vbuf;
1625
1626	ASSERT(HDR_EMPTY(hdr));
1627	cv_destroy(&hdr->b_l1hdr.b_cv);
1628	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1629	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1630	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1631	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1632	}
1633
1634	/ ARGSUSED /
1635	static void
1636	hdr_l2only_dest(void vbuf, void* *unused)
1637	{
1638	arc_buf_hdr_t *hdr = vbuf;
1639
1640	ASSERT(HDR_EMPTY(hdr));
1641	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1642	}
1643
1644	/ ARGSUSED /
1645	static void
1646	buf_dest(void vbuf, void* *unused)
1647	{
1648	arc_buf_t *buf = vbuf;
1649
1650	mutex_destroy(&buf->b_evict_lock);
1651	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1652	}
1653
1654	/*
1655	* Reclaim callback -- invoked when memory is low.
1656	*/
1657	/ ARGSUSED /
1658	static void
1659	hdr_recl(void *unused)
1660	{
1661	dprintf("hdr_recl called\n");
1662	/*
1663	* umem calls the reclaim func when we destroy the buf cache,
1664	* which is after we do arc_fini().
1665	*/
1666	if (!arc_dead)
1667	cv_signal(&arc_reclaim_thread_cv);
1668	}
1669
1670	static void
1671	buf_init(void)
1672	{
1673	uint64_t *ct;
1674	uint64_t hsize = `1ULL` << `12`;
1675	int i, j;
1676
1677	/*
1678	* The hash table is big enough to fill all of physical memory
1679	* with an average block size of zfs_arc_average_blocksize (default 8K).
1680	* By default, the table will take up
1681	* totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1682	*/
1683	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1684	hsize <<= `1`;
1685	retry:
1686	buf_hash_table.ht_mask = hsize - `1`;
1687	buf_hash_table.ht_table =
1688	kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1689	if (buf_hash_table.ht_table == NULL) {
1690	ASSERT(hsize > (`1ULL` << `8`));
1691	hsize >>= `1`;
1692	goto retry;
1693	}
1694
1695	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1696	`0`, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, `0`);
1697	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1698	HDR_L2ONLY_SIZE, `0`, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1699	NULL, NULL, `0`);
1700	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1701	`0`, buf_cons, buf_dest, NULL, NULL, NULL, `0`);
1702
1703	for (i = `0`; i < `256`; i++)
1704	for (ct = zfs_crc64_table + i, *ct = i, j = `8`; j > `0`; j--)
1705	ct = (ct >> `1`) ^ (-(*ct & `1`) & ZFS_CRC64_POLY);
1706
1707	for (i = `0`; i < BUF_LOCKS; i++) {
1708	mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1709	NULL, MUTEX_DEFAULT, NULL);
1710	}
1711	}
1712
/* ARC_MINTIME is hz >> 4, i.e. one sixteenth of hz. */
1713	#define ARC_MINTIME (hz>>4) /* 62 ms */
1714
1715	static inline boolean_t
1716	arc_buf_is_shared(arc_buf_t *buf)
1717	{
1718	boolean_t shared = (buf->b_data != NULL &&
1719	buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
1720	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1721	return (shared);
1722	}
1723
1724	static inline void
1725	arc_cksum_free(arc_buf_hdr_t *hdr)
1726	{
1727	ASSERT(HDR_HAS_L1HDR(hdr));
1728	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1729	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1730	kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1731	hdr->b_l1hdr.b_freeze_cksum = NULL;
1732	}
1733	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1734	}
1735
1736	static void
1737	arc_cksum_verify(arc_buf_t *buf)
1738	{
1739	arc_buf_hdr_t *hdr = buf->b_hdr;
1740	zio_cksum_t zc;
1741
1742	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1743	return;
1744
1745	ASSERT(HDR_HAS_L1HDR(hdr));
1746
1747	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1748	if (hdr->b_l1hdr.b_freeze_cksum == NULL \|\| HDR_IO_ERROR(hdr)) {
1749	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1750	return;
1751	}
1752	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
1753	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1754	panic("buffer modified while frozen!");
1755	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1756	}
1757
1758	static boolean_t
1759	arc_cksum_is_equal(arc_buf_hdr_t hdr, zio_t zio)
1760	{
1761	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1762	boolean_t valid_cksum;
1763
1764	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1765	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1766
1767	/*
1768	* We rely on the blkptr's checksum to determine if the block
1769	* is valid or not. When compressed arc is enabled, the l2arc
1770	* writes the block to the l2arc just as it appears in the pool.
1771	* This allows us to use the blkptr's checksum to validate the
1772	* data that we just read off of the l2arc without having to store
1773	* a separate checksum in the arc_buf_hdr_t. However, if compressed
1774	* arc is disabled, then the data written to the l2arc is always
1775	* uncompressed and won't match the block as it exists in the main
1776	* pool. When this is the case, we must first compress it if it is
1777	* compressed on the main pool before we can validate the checksum.
1778	*/
1779	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1780	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1781	uint64_t lsize = HDR_GET_LSIZE(hdr);
1782	uint64_t csize;
1783
1784	void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
1785	csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1786	ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1787	if (csize < HDR_GET_PSIZE(hdr)) {
1788	/*
1789	* Compressed blocks are always a multiple of the
1790	* smallest ashift in the pool. Ideally, we would
1791	* like to round up the csize to the next
1792	* spa_min_ashift but that value may have changed
1793	* since the block was last written. Instead,
1794	* we rely on the fact that the hdr's psize
1795	* was set to the psize of the block when it was
1796	* last written. We set the csize to that value
1797	* and zero out any part that should not contain
1798	* data.
1799	*/
1800	bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
1801	csize = HDR_GET_PSIZE(hdr);
1802	}
1803	zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
1804	}
1805
1806	/*
1807	* Block pointers always store the checksum for the logical data.
1808	* If the block pointer has the gang bit set, then the checksum
1809	* it represents is for the reconstituted data and not for an
1810	* individual gang member. The zio pipeline, however, must be able to
1811	* determine the checksum of each of the gang constituents so it
1812	* treats the checksum comparison differently than what we need
1813	* for l2arc blocks. This prevents us from using the
1814	* zio_checksum_error() interface directly. Instead we must call the
1815	* zio_checksum_error_impl() so that we can ensure the checksum is
1816	* generated using the correct checksum algorithm and accounts for the
1817	* logical I/O size and not just a gang fragment.
1818	*/
1819	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1820	BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
1821	zio->io_offset, NULL) == `0`);
1822	zio_pop_transforms(zio);
1823	return (valid_cksum);
1824	}
1825
1826	static void
1827	arc_cksum_compute(arc_buf_t *buf)
1828	{
1829	arc_buf_hdr_t *hdr = buf->b_hdr;
1830
1831	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1832	return;
1833
1834	ASSERT(HDR_HAS_L1HDR(hdr));
1835	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1836	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1837	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1838	return;
1839	}
1840	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1841	KM_SLEEP);
1842	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
1843	hdr->b_l1hdr.b_freeze_cksum);
1844	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1845	#ifdef illumos
1846	arc_buf_watch(buf);
1847	#endif
1848	}
1849
1850	#ifdef illumos
/*
 * Control message that arc_buf_watch()/arc_buf_unwatch() write to
 * arc_procfd: a command word followed by the watch description.
 * Only defined for non-kernel builds.
 */
1851	#ifndef _KERNEL
1852	typedef struct procctl {
1853	long cmd;
1854	prwatch_t prwatch;
1855	} procctl_t;
1856	#endif
1857
1858	/ ARGSUSED /
1859	static void
1860	arc_buf_unwatch(arc_buf_t *buf)
1861	{
1862	#ifndef _KERNEL
1863	if (arc_watch) {
1864	int result;
1865	procctl_t ctl;
1866	ctl.cmd = PCWATCH;
1867	ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1868	ctl.prwatch.pr_size = `0`;
1869	ctl.prwatch.pr_wflags = `0`;
1870	result = write(arc_procfd, &ctl, sizeof (ctl));
1871	ASSERT3U(result, ==, sizeof (ctl));
1872	}
1873	#endif
1874	}
1875
1876	/ ARGSUSED /
1877	static void
1878	arc_buf_watch(arc_buf_t *buf)
1879	{
1880	#ifndef _KERNEL
1881	if (arc_watch) {
1882	int result;
1883	procctl_t ctl;
1884	ctl.cmd = PCWATCH;
1885	ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1886	ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
1887	ctl.prwatch.pr_wflags = WA_WRITE;
1888	result = write(arc_procfd, &ctl, sizeof (ctl));
1889	ASSERT3U(result, ==, sizeof (ctl));
1890	}
1891	#endif
1892	}
1893	#endif /* illumos */
1894
1895	static arc_buf_contents_t
1896	arc_buf_type(arc_buf_hdr_t *hdr)
1897	{
1898	arc_buf_contents_t type;
1899	if (HDR_ISTYPE_METADATA(hdr)) {
1900	type = ARC_BUFC_METADATA;
1901	} else {
1902	type = ARC_BUFC_DATA;
1903	}
1904	VERIFY3U(hdr->b_type, ==, type);
1905	return (type);
1906	}
1907
1908	static uint32_t
1909	arc_bufc_to_flags(arc_buf_contents_t type)
1910	{
1911	switch (type) {
1912	case ARC_BUFC_DATA:
1913	/ metadata field is 0 if buffer contains normal data /
1914	return (`0`);
1915	case ARC_BUFC_METADATA:
1916	return (ARC_FLAG_BUFC_METADATA);
1917	default:
1918	break;
1919	}
1920	panic("undefined ARC buffer type!");
1921	return ((uint32_t)-`1`);
1922	}
1923
1924	void
1925	arc_buf_thaw(arc_buf_t *buf)
1926	{
1927	arc_buf_hdr_t *hdr = buf->b_hdr;
1928
1929	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1930	if (hdr->b_l1hdr.b_state != arc_anon)
1931	panic("modifying non-anon buffer!");
1932	if (HDR_IO_IN_PROGRESS(hdr))
1933	panic("modifying buffer while i/o in progress!");
1934	arc_cksum_verify(buf);
1935	}
1936
1937	ASSERT(HDR_HAS_L1HDR(hdr));
1938	arc_cksum_free(hdr);
1939
1940	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1941	#ifdef ZFS_DEBUG
1942	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1943	if (hdr->b_l1hdr.b_thawed != NULL)
1944	kmem_free(hdr->b_l1hdr.b_thawed, `1`);
1945	hdr->b_l1hdr.b_thawed = kmem_alloc(`1`, KM_SLEEP);
1946	}
1947	#endif
1948
1949	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1950
1951	#ifdef illumos
1952	arc_buf_unwatch(buf);
1953	#endif
1954	}
1955
1956	void
1957	arc_buf_freeze(arc_buf_t *buf)
1958	{
1959	arc_buf_hdr_t *hdr = buf->b_hdr;
1960	kmutex_t *hash_lock;
1961
1962	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1963	return;
1964
1965	hash_lock = HDR_LOCK(hdr);
1966	mutex_enter(hash_lock);
1967
1968	ASSERT(HDR_HAS_L1HDR(hdr));
1969	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL \|\|
1970	hdr->b_l1hdr.b_state == arc_anon);
1971	arc_cksum_compute(buf);
1972	mutex_exit(hash_lock);
1973
1974	}
1975
1976	/*
1977	* The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1978	* the following functions should be used to ensure that the flags are
1979	* updated in a thread-safe way. When manipulating the flags either
1980	* the hash_lock must be held or the hdr must be undiscoverable. This
1981	* ensures that we're not racing with any other threads when updating
1982	* the flags.
1983	*/
1984	static inline void
1985	arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1986	{
1987	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
1988	hdr->b_flags \|= flags;
1989	}
1990
1991	static inline void
1992	arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1993	{
1994	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
1995	hdr->b_flags &= ~flags;
1996	}
1997
1998	/*
1999	* Setting the compression bits in the arc_buf_hdr_t's b_flags is
2000	* done in a special way since we have to clear and set bits
2001	* at the same time. Consumers that wish to set the compression bits
2002	* must use this function to ensure that the flags are updated in
2003	* thread-safe manner.
2004	*/
2005	static void
2006	arc_hdr_set_compress(arc_buf_hdr_t hdr, enum* zio_compress cmp)
2007	{
2008	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
2009
2010	/*
2011	* Holes and embedded blocks will always have a psize = 0 so
2012	* we ignore the compression of the blkptr and set the
2013	* arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
2014	* Holes and embedded blocks remain anonymous so we don't
2015	* want to uncompress them. Mark them as uncompressed.
2016	*/
2017	if (!zfs_compressed_arc_enabled \|\| HDR_GET_PSIZE(hdr) == `0`) {
2018	arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2019	HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
2020	ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
2021	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2022	} else {
2023	arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2024	HDR_SET_COMPRESS(hdr, cmp);
2025	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
2026	ASSERT(HDR_COMPRESSION_ENABLED(hdr));
2027	}
2028	}
2029
2030	static int
2031	arc_decompress(arc_buf_t *buf)
2032	{
2033	arc_buf_hdr_t *hdr = buf->b_hdr;
2034	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
2035	int error;
2036
2037	if (arc_buf_is_shared(buf)) {
2038	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2039	} else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
2040	/*
2041	* The arc_buf_hdr_t is either not compressed or is
2042	* associated with an embedded block or a hole in which
2043	* case they remain anonymous.
2044	*/
2045	IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == `0` \|\|
2046	HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
2047	ASSERT(!HDR_SHARED_DATA(hdr));
2048	bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
2049	} else {
2050	ASSERT(!HDR_SHARED_DATA(hdr));
2051	ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
2052	error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2053	hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
2054	HDR_GET_LSIZE(hdr));
2055	if (error != `0`) {
2056	zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
2057	hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
2058	HDR_GET_LSIZE(hdr));
2059	return (SET_ERROR(EIO));
2060	}
2061	}
2062	if (bswap != DMU_BSWAP_NUMFUNCS) {
2063	ASSERT(!HDR_SHARED_DATA(hdr));
2064	ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2065	dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2066	}
2067	arc_cksum_compute(buf);
2068	return (`0`);
2069	}
2070
2071	/*
2072	* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
2073	*/
2074	static uint64_t
2075	arc_hdr_size(arc_buf_hdr_t *hdr)
2076	{
2077	uint64_t size;
2078
2079	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
2080	HDR_GET_PSIZE(hdr) > `0`) {
2081	size = HDR_GET_PSIZE(hdr);
2082	} else {
2083	ASSERT3U(HDR_GET_LSIZE(hdr), !=, `0`);
2084	size = HDR_GET_LSIZE(hdr);
2085	}
2086	return (size);
2087	}
2088
2089	/*
2090	* Increment the amount of evictable space in the arc_state_t's refcount.
2091	* We account for the space used by the hdr and the arc buf individually
2092	* so that we can add and remove them from the refcount individually.
2093	*/
2094	static void
2095	arc_evictable_space_increment(arc_buf_hdr_t hdr, arc_state_t state)
2096	{
2097	arc_buf_contents_t type = arc_buf_type(hdr);
2098	uint64_t lsize = HDR_GET_LSIZE(hdr);
2099
2100	ASSERT(HDR_HAS_L1HDR(hdr));
2101
2102	if (GHOST_STATE(state)) {
2103	ASSERT0(hdr->b_l1hdr.b_bufcnt);
2104	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2105	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2106	(void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
2107	return;
2108	}
2109
2110	ASSERT(!GHOST_STATE(state));
2111	if (hdr->b_l1hdr.b_pdata != NULL) {
2112	(void) refcount_add_many(&state->arcs_esize[type],
2113	arc_hdr_size(hdr), hdr);
2114	}
2115	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2116	buf = buf->b_next) {
2117	if (arc_buf_is_shared(buf)) {
2118	ASSERT(ARC_BUF_LAST(buf));
2119	continue;
2120	}
2121	(void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
2122	}
2123	}
2124
2125	/*
2126	* Decrement the amount of evictable space in the arc_state_t's refcount.
2127	* We account for the space used by the hdr and the arc buf individually
2128	* so that we can add and remove them from the refcount individually.
2129	*/
2130	static void
2131	arc_evitable_space_decrement(arc_buf_hdr_t hdr, arc_state_t state)
2132	{
2133	arc_buf_contents_t type = arc_buf_type(hdr);
2134	uint64_t lsize = HDR_GET_LSIZE(hdr);
2135
2136	ASSERT(HDR_HAS_L1HDR(hdr));
2137
2138	if (GHOST_STATE(state)) {
2139	ASSERT0(hdr->b_l1hdr.b_bufcnt);
2140	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2141	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2142	(void) refcount_remove_many(&state->arcs_esize[type],
2143	lsize, hdr);
2144	return;
2145	}
2146
2147	ASSERT(!GHOST_STATE(state));
2148	if (hdr->b_l1hdr.b_pdata != NULL) {
2149	(void) refcount_remove_many(&state->arcs_esize[type],
2150	arc_hdr_size(hdr), hdr);
2151	}
2152	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2153	buf = buf->b_next) {
2154	if (arc_buf_is_shared(buf)) {
2155	ASSERT(ARC_BUF_LAST(buf));
2156	continue;
2157	}
2158	(void) refcount_remove_many(&state->arcs_esize[type],
2159	lsize, buf);
2160	}
2161	}
2162
2163	/*
2164	* Add a reference to this hdr indicating that someone is actively
2165	* referencing that memory. When the refcount transitions from 0 to 1,
2166	* we remove it from the respective arc_state_t list to indicate that
2167	* it is not evictable.
2168	*/
2169	static void
2170	add_reference(arc_buf_hdr_t hdr, void* *tag)
2171	{
2172	ASSERT(HDR_HAS_L1HDR(hdr));
2173	if (!MUTEX_HELD(HDR_LOCK(hdr))) {
2174	ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2175	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2176	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2177	}
2178
2179	arc_state_t *state = hdr->b_l1hdr.b_state;
2180
2181	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == `1`) &&
2182	(state != arc_anon)) {
2183	/ We don't use the L2-only state list. /
2184	if (state != arc_l2c_only) {
2185	multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
2186	hdr);
2187	arc_evitable_space_decrement(hdr, state);
2188	}
2189	/ remove the prefetch flag if we get a reference /
2190	arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2191	}
2192	}
2193
2194	/*
2195	* Remove a reference from this hdr. When the reference transitions from
2196	* 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2197	* list making it eligible for eviction.
2198	*/
2199	static int
2200	remove_reference(arc_buf_hdr_t hdr, kmutex_t hash_lock, void *tag)
2201	{
2202	int cnt;
2203	arc_state_t *state = hdr->b_l1hdr.b_state;
2204
2205	ASSERT(HDR_HAS_L1HDR(hdr));
2206	ASSERT(state == arc_anon \|\| MUTEX_HELD(hash_lock));
2207	ASSERT(!GHOST_STATE(state));
2208
2209	/*
2210	* arc_l2c_only counts as a ghost state so we don't need to explicitly
2211	* check to prevent usage of the arc_l2c_only list.
2212	*/
2213	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == `0`) &&
2214	(state != arc_anon)) {
2215	multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
2216	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, `0`);
2217	arc_evictable_space_increment(hdr, state);
2218	}
2219	return (cnt);
2220	}
2221
2222	/*
2223	* Move the supplied buffer to the indicated state. The hash lock
2224	* for the buffer must be held by the caller.
2225	*/
2226	static void
2227	arc_change_state(arc_state_t new_state, arc_buf_hdr_t hdr,
2228	kmutex_t *hash_lock)
2229	{
2230	arc_state_t *old_state;
2231	int64_t refcnt;
2232	uint32_t bufcnt;
2233	boolean_t update_old, update_new;
2234	arc_buf_contents_t buftype = arc_buf_type(hdr);
2235
2236	/*
2237	* We almost always have an L1 hdr here, since we call arc_hdr_realloc()
2238	* in arc_read() when bringing a buffer out of the L2ARC. However, the
2239	* L1 hdr doesn't always exist when we change state to arc_anon before
2240	* destroying a header, in which case reallocating to add the L1 hdr is
2241	* pointless.
2242	*/
2243	if (HDR_HAS_L1HDR(hdr)) {
2244	old_state = hdr->b_l1hdr.b_state;
2245	refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
2246	bufcnt = hdr->b_l1hdr.b_bufcnt;
2247	update_old = (bufcnt > `0` \|\| hdr->b_l1hdr.b_pdata != NULL);
2248	} else {
2249	old_state = arc_l2c_only;
2250	refcnt = `0`;
2251	bufcnt = `0`;
2252	update_old = B_FALSE;
2253	}
2254	update_new = update_old;
2255
2256	ASSERT(MUTEX_HELD(hash_lock));
2257	ASSERT3P(new_state, !=, old_state);
2258	ASSERT(!GHOST_STATE(new_state) \|\| bufcnt == `0`);
2259	ASSERT(old_state != arc_anon \|\| bufcnt <= `1`);
2260
2261	/*
2262	* If this buffer is evictable, transfer it from the
2263	* old state list to the new state list.
2264	*/
2265	if (refcnt == `0`) {
2266	if (old_state != arc_anon && old_state != arc_l2c_only) {
2267	ASSERT(HDR_HAS_L1HDR(hdr));
2268	multilist_remove(&old_state->arcs_list[buftype], hdr);
2269
2270	if (GHOST_STATE(old_state)) {
2271	ASSERT0(bufcnt);
2272	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2273	update_old = B_TRUE;
2274	}
2275	arc_evitable_space_decrement(hdr, old_state);
2276	}
2277	if (new_state != arc_anon && new_state != arc_l2c_only) {
2278
2279	/*
2280	* An L1 header always exists here, since if we're
2281	* moving to some L1-cached state (i.e. not l2c_only or
2282	* anonymous), we realloc the header to add an L1hdr
2283	* beforehand.
2284	*/
2285	ASSERT(HDR_HAS_L1HDR(hdr));
2286	multilist_insert(&new_state->arcs_list[buftype], hdr);
2287
2288	if (GHOST_STATE(new_state)) {
2289	ASSERT0(bufcnt);
2290	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2291	update_new = B_TRUE;
2292	}
2293	arc_evictable_space_increment(hdr, new_state);
2294	}
2295	}
2296
2297	ASSERT(!HDR_EMPTY(hdr));
2298	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2299	buf_hash_remove(hdr);
2300
2301	/ adjust state sizes (ignore arc_l2c_only) /
2302
2303	if (update_new && new_state != arc_l2c_only) {
2304	ASSERT(HDR_HAS_L1HDR(hdr));
2305	if (GHOST_STATE(new_state)) {
2306	ASSERT0(bufcnt);
2307
2308	/*
2309	* When moving a header to a ghost state, we first
2310	* remove all arc buffers. Thus, we'll have a
2311	* bufcnt of zero, and no arc buffer to use for
2312	* the reference. As a result, we use the arc
2313	* header pointer for the reference.
2314	*/
2315	(void) refcount_add_many(&new_state->arcs_size,
2316	HDR_GET_LSIZE(hdr), hdr);
2317	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2318	} else {
2319	uint32_t buffers = `0`;
2320
2321	/*
2322	* Each individual buffer holds a unique reference,
2323	* thus we must remove each of these references one
2324	* at a time.
2325	*/
2326	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2327	buf = buf->b_next) {
2328	ASSERT3U(bufcnt, !=, `0`);
2329	buffers++;
2330
2331	/*
2332	* When the arc_buf_t is sharing the data
2333	* block with the hdr, the owner of the
2334	* reference belongs to the hdr. Only
2335	* add to the refcount if the arc_buf_t is
2336	* not shared.
2337	*/
2338	if (arc_buf_is_shared(buf)) {
2339	ASSERT(ARC_BUF_LAST(buf));
2340	continue;
2341	}
2342
2343	(void) refcount_add_many(&new_state->arcs_size,
2344	HDR_GET_LSIZE(hdr), buf);
2345	}
2346	ASSERT3U(bufcnt, ==, buffers);
2347
2348	if (hdr->b_l1hdr.b_pdata != NULL) {
2349	(void) refcount_add_many(&new_state->arcs_size,
2350	arc_hdr_size(hdr), hdr);
2351	} else {
2352	ASSERT(GHOST_STATE(old_state));
2353	}
2354	}
2355	}
2356
2357	if (update_old && old_state != arc_l2c_only) {
2358	ASSERT(HDR_HAS_L1HDR(hdr));
2359	if (GHOST_STATE(old_state)) {
2360	ASSERT0(bufcnt);
2361
2362	/*
2363	* When moving a header off of a ghost state,
2364	* the header will not contain any arc buffers.
2365	* We use the arc header pointer for the reference
2366	* which is exactly what we did when we put the
2367	* header on the ghost state.
2368	*/
2369
2370	(void) refcount_remove_many(&old_state->arcs_size,
2371	HDR_GET_LSIZE(hdr), hdr);
2372	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2373	} else {
2374	uint32_t buffers = `0`;
2375
2376	/*
2377	* Each individual buffer holds a unique reference,
2378	* thus we must remove each of these references one
2379	* at a time.
2380	*/
2381	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2382	buf = buf->b_next) {
2383	ASSERT3P(bufcnt, !=, `0`);
2384	buffers++;
2385
2386	/*
2387	* When the arc_buf_t is sharing the data
2388	* block with the hdr, the owner of the
2389	* reference belongs to the hdr. Only
2390	* add to the refcount if the arc_buf_t is
2391	* not shared.
2392	*/
2393	if (arc_buf_is_shared(buf)) {
2394	ASSERT(ARC_BUF_LAST(buf));
2395	continue;
2396	}
2397
2398	(void) refcount_remove_many(
2399	&old_state->arcs_size, HDR_GET_LSIZE(hdr),
2400	buf);
2401	}
2402	ASSERT3U(bufcnt, ==, buffers);
2403	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
2404	(void) refcount_remove_many(
2405	&old_state->arcs_size, arc_hdr_size(hdr), hdr);
2406	}
2407	}
2408
2409	if (HDR_HAS_L1HDR(hdr))
2410	hdr->b_l1hdr.b_state = new_state;
2411
2412	/*
2413	* L2 headers should never be on the L2 state list since they don't
2414	* have L1 headers allocated.
2415	*/
2416	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2417	multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2418	}
2419
2420	void
2421	arc_space_consume(uint64_t space, arc_space_type_t type)
2422	{
2423	ASSERT(type >= `0` && type < ARC_SPACE_NUMTYPES);
2424
2425	switch (type) {
2426	case ARC_SPACE_DATA:
2427	ARCSTAT_INCR(arcstat_data_size, space);
2428	break;
2429	case ARC_SPACE_META:
2430	ARCSTAT_INCR(arcstat_metadata_size, space);
2431	break;
2432	case ARC_SPACE_OTHER:
2433	ARCSTAT_INCR(arcstat_other_size, space);
2434	break;
2435	case ARC_SPACE_HDRS:
2436	ARCSTAT_INCR(arcstat_hdr_size, space);
2437	break;
2438	case ARC_SPACE_L2HDRS:
2439	ARCSTAT_INCR(arcstat_l2_hdr_size, space);
2440	break;
2441	}
2442
2443	if (type != ARC_SPACE_DATA)
2444	ARCSTAT_INCR(arcstat_meta_used, space);
2445
2446	atomic_add_64(&arc_size, space);
2447	}
2448
2449	void
2450	arc_space_return(uint64_t space, arc_space_type_t type)
2451	{
2452	ASSERT(type >= `0` && type < ARC_SPACE_NUMTYPES);
2453
2454	switch (type) {
2455	case ARC_SPACE_DATA:
2456	ARCSTAT_INCR(arcstat_data_size, -space);
2457	break;
2458	case ARC_SPACE_META:
2459	ARCSTAT_INCR(arcstat_metadata_size, -space);
2460	break;
2461	case ARC_SPACE_OTHER:
2462	ARCSTAT_INCR(arcstat_other_size, -space);
2463	break;
2464	case ARC_SPACE_HDRS:
2465	ARCSTAT_INCR(arcstat_hdr_size, -space);
2466	break;
2467	case ARC_SPACE_L2HDRS:
2468	ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
2469	break;
2470	}
2471
2472	if (type != ARC_SPACE_DATA) {
2473	ASSERT(arc_meta_used >= space);
2474	if (arc_meta_max < arc_meta_used)
2475	arc_meta_max = arc_meta_used;
2476	ARCSTAT_INCR(arcstat_meta_used, -space);
2477	}
2478
2479	ASSERT(arc_size >= space);
2480	atomic_add_64(&arc_size, -space);
2481	}
2482
2483	/*
2484	* Allocate an initial buffer for this hdr, subsequent buffers will
2485	* use arc_buf_clone().
2486	*/
2487	static arc_buf_t *
2488	arc_buf_alloc_impl(arc_buf_hdr_t hdr, void* *tag)
2489	{
2490	arc_buf_t *buf;
2491
2492	ASSERT(HDR_HAS_L1HDR(hdr));
2493	ASSERT3U(HDR_GET_LSIZE(hdr), >, `0`);
2494	VERIFY(hdr->b_type == ARC_BUFC_DATA \|\|
2495	hdr->b_type == ARC_BUFC_METADATA);
2496
2497	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2498	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2499	ASSERT0(hdr->b_l1hdr.b_bufcnt);
2500
2501	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2502	buf->b_hdr = hdr;
2503	buf->b_data = NULL;
2504	buf->b_next = NULL;
2505
2506	add_reference(hdr, tag);
2507
2508	/*
2509	* We're about to change the hdr's b_flags. We must either
2510	* hold the hash_lock or be undiscoverable.
2511	*/
2512	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
2513
2514	/*
2515	* If the hdr's data can be shared (no byteswapping, hdr is
2516	* uncompressed, hdr's data is not currently being written to the
2517	* L2ARC write) then we share the data buffer and set the appropriate
2518	* bit in the hdr's b_flags to indicate the hdr is sharing it's
2519	* b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
2520	* store the buf's data.
2521	*/
2522	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2523	HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
2524	buf->b_data = hdr->b_l1hdr.b_pdata;
2525	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2526	} else {
2527	buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2528	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2529	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2530	}
2531	VERIFY3P(buf->b_data, !=, NULL);
2532
2533	hdr->b_l1hdr.b_buf = buf;
2534	hdr->b_l1hdr.b_bufcnt += `1`;
2535
2536	return (buf);
2537	}
2538
2539	/*
2540	* Used when allocating additional buffers.
2541	*/
2542	static arc_buf_t *
2543	arc_buf_clone(arc_buf_t *from)
2544	{
2545	arc_buf_t *buf;
2546	arc_buf_hdr_t *hdr = from->b_hdr;
2547	uint64_t size = HDR_GET_LSIZE(hdr);
2548
2549	ASSERT(HDR_HAS_L1HDR(hdr));
2550	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2551
2552	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2553	buf->b_hdr = hdr;
2554	buf->b_data = NULL;
2555	buf->b_next = hdr->b_l1hdr.b_buf;
2556	hdr->b_l1hdr.b_buf = buf;
2557	buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2558	bcopy(from->b_data, buf->b_data, size);
2559	hdr->b_l1hdr.b_bufcnt += `1`;
2560
2561	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2562	return (buf);
2563	}
2564
/*
 * Reference tag used while a buffer is out on loan: arc_loan_buf() and
 * arc_loan_inuse_buf() take the hold with it, arc_return_buf() drops it.
 */
2565	static char *arc_onloan_tag = "onloan";
2566
2567	/*
2568	* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2569	* flight data by arc_tempreserve_space() until they are "returned". Loaned
2570	* buffers must be returned to the arc before they can be used by the DMU or
2571	* freed.
2572	*/
2573	arc_buf_t *
2574	arc_loan_buf(spa_t spa, int* size)
2575	{
2576	arc_buf_t *buf;
2577
2578	buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
2579
2580	atomic_add_64(&arc_loaned_bytes, size);
2581	return (buf);
2582	}
2583
2584	/*
2585	* Return a loaned arc buffer to the arc.
2586	*/
2587	void
2588	arc_return_buf(arc_buf_t buf, void* *tag)
2589	{
2590	arc_buf_hdr_t *hdr = buf->b_hdr;
2591
2592	ASSERT3P(buf->b_data, !=, NULL);
2593	ASSERT(HDR_HAS_L1HDR(hdr));
2594	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2595	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2596
2597	atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
2598	}
2599
2600	/ Detach an arc_buf from a dbuf (tag) /
2601	void
2602	arc_loan_inuse_buf(arc_buf_t buf, void* *tag)
2603	{
2604	arc_buf_hdr_t *hdr = buf->b_hdr;
2605
2606	ASSERT3P(buf->b_data, !=, NULL);
2607	ASSERT(HDR_HAS_L1HDR(hdr));
2608	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2609	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2610
2611	atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
2612	}
2613
2614	static void
2615	l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
2616	{
2617	l2arc_data_free_t df = kmem_alloc(sizeof* (*df), KM_SLEEP);
2618
2619	df->l2df_data = data;
2620	df->l2df_size = size;
2621	df->l2df_type = type;
2622	mutex_enter(&l2arc_free_on_write_mtx);
2623	list_insert_head(l2arc_free_on_write, df);
2624	mutex_exit(&l2arc_free_on_write_mtx);
2625	}
2626
2627	static void
2628	arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
2629	{
2630	arc_state_t *state = hdr->b_l1hdr.b_state;
2631	arc_buf_contents_t type = arc_buf_type(hdr);
2632	uint64_t size = arc_hdr_size(hdr);
2633
2634	/ protected by hash lock, if in the hash table /
2635	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2636	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2637	ASSERT(state != arc_anon && state != arc_l2c_only);
2638
2639	(void) refcount_remove_many(&state->arcs_esize[type],
2640	size, hdr);
2641	}
2642	(void) refcount_remove_many(&state->arcs_size, size, hdr);
2643	if (type == ARC_BUFC_METADATA) {
2644	arc_space_return(size, ARC_SPACE_META);
2645	} else {
2646	ASSERT(type == ARC_BUFC_DATA);
2647	arc_space_return(size, ARC_SPACE_DATA);
2648	}
2649
2650	l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
2651	}
2652
2653	/*
2654	* Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2655	* data buffer, we transfer the refcount ownership to the hdr and update
2656	* the appropriate kstats.
2657	*/
2658	static void
2659	arc_share_buf(arc_buf_hdr_t hdr, arc_buf_t buf)
2660	{
2661	arc_state_t *state = hdr->b_l1hdr.b_state;
2662
2663	ASSERT(!HDR_SHARED_DATA(hdr));
2664	ASSERT(!arc_buf_is_shared(buf));
2665	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2666	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
2667
2668	/*
2669	* Start sharing the data buffer. We transfer the
2670	* refcount ownership to the hdr since it always owns
2671	* the refcount whenever an arc_buf_t is shared.
2672	*/
2673	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
2674	hdr->b_l1hdr.b_pdata = buf->b_data;
2675	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2676
2677	/*
2678	* Since we've transferred ownership to the hdr we need
2679	* to increment its compressed and uncompressed kstats and
2680	* decrement the overhead size.
2681	*/
2682	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2683	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2684	ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
2685	}
2686
2687	static void
2688	arc_unshare_buf(arc_buf_hdr_t hdr, arc_buf_t buf)
2689	{
2690	arc_state_t *state = hdr->b_l1hdr.b_state;
2691
2692	ASSERT(HDR_SHARED_DATA(hdr));
2693	ASSERT(arc_buf_is_shared(buf));
2694	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
2695	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
2696
2697	/*
2698	* We are no longer sharing this buffer so we need
2699	* to transfer its ownership to the rightful owner.
2700	*/
2701	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
2702	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2703	hdr->b_l1hdr.b_pdata = NULL;
2704
2705	/*
2706	* Since the buffer is no longer shared between
2707	* the arc buf and the hdr, count it as overhead.
2708	*/
2709	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
2710	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2711	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2712	}
2713
2714	/*
2715	* Free up buf->b_data and if 'remove' is set, then pull the
2716	* arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2717	*/
2718	static void
2719	arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
2720	{
2721	arc_buf_t **bufp;
2722	arc_buf_hdr_t *hdr = buf->b_hdr;
2723	uint64_t size = HDR_GET_LSIZE(hdr);
2724	boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
2725
2726	/*
2727	* Free up the data associated with the buf but only
2728	* if we're not sharing this with the hdr. If we are sharing
2729	* it with the hdr, then hdr will have performed the allocation
2730	* so allow it to do the free.
2731	*/
2732	if (buf->b_data != NULL) {
2733	/*
2734	* We're about to change the hdr's b_flags. We must either
2735	* hold the hash_lock or be undiscoverable.
2736	*/
2737	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) \|\| HDR_EMPTY(hdr));
2738
2739	arc_cksum_verify(buf);
2740	#ifdef illumos
2741	arc_buf_unwatch(buf);
2742	#endif
2743
2744	if (destroyed_buf_is_shared) {
2745	ASSERT(ARC_BUF_LAST(buf));
2746	ASSERT(HDR_SHARED_DATA(hdr));
2747	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2748	} else {
2749	arc_free_data_buf(hdr, buf->b_data, size, buf);
2750	ARCSTAT_INCR(arcstat_overhead_size, -size);
2751	}
2752	buf->b_data = NULL;
2753
2754	ASSERT(hdr->b_l1hdr.b_bufcnt > `0`);
2755	hdr->b_l1hdr.b_bufcnt -= `1`;
2756	}
2757
2758	/ only remove the buf if requested /
2759	if (!remove)
2760	return;
2761
2762	/ remove the buf from the hdr list /
2763	arc_buf_t *lastbuf = NULL;
2764	bufp = &hdr->b_l1hdr.b_buf;
2765	while (*bufp != NULL) {
2766	if (*bufp == buf)
2767	*bufp = buf->b_next;
2768
2769	/*
2770	* If we've removed a buffer in the middle of
2771	* the list then update the lastbuf and update
2772	* bufp.
2773	*/
2774	if (*bufp != NULL) {
2775	lastbuf = *bufp;
2776	bufp = &(*bufp)->b_next;
2777	}
2778	}
2779	buf->b_next = NULL;
2780	ASSERT3P(lastbuf, !=, buf);
2781
2782	/*
2783	* If the current arc_buf_t is sharing its data
2784	* buffer with the hdr, then reassign the hdr's
2785	* b_pdata to share it with the new buffer at the end
2786	* of the list. The shared buffer is always the last one
2787	* on the hdr's buffer list.
2788	*/
2789	if (destroyed_buf_is_shared && lastbuf != NULL) {
2790	ASSERT(ARC_BUF_LAST(buf));
2791	ASSERT(ARC_BUF_LAST(lastbuf));
2792	VERIFY(!arc_buf_is_shared(lastbuf));
2793
2794	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
2795	arc_hdr_free_pdata(hdr);
2796
2797	/*
2798	* We must setup a new shared block between the
2799	* last buffer and the hdr. The data would have
2800	* been allocated by the arc buf so we need to transfer
2801	* ownership to the hdr since it's now being shared.
2802	*/
2803	arc_share_buf(hdr, lastbuf);
2804	} else if (HDR_SHARED_DATA(hdr)) {
2805	ASSERT(arc_buf_is_shared(lastbuf));
2806	}
2807
2808	if (hdr->b_l1hdr.b_bufcnt == `0`)
2809	arc_cksum_free(hdr);
2810
2811	/ clean up the buf /
2812	buf->b_hdr = NULL;
2813	kmem_cache_free(buf_cache, buf);
2814	}
2815
2816	static void
2817	arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
2818	{
2819	ASSERT3U(HDR_GET_LSIZE(hdr), >, `0`);
2820	ASSERT(HDR_HAS_L1HDR(hdr));
2821	ASSERT(!HDR_SHARED_DATA(hdr));
2822
2823	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2824	hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
2825	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
2826	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
2827
2828	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2829	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2830	}
2831
2832	static void
2833	arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
2834	{
2835	ASSERT(HDR_HAS_L1HDR(hdr));
2836	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
2837
2838	/*
2839	* If the hdr is currently being written to the l2arc then
2840	* we defer freeing the data by adding it to the l2arc_free_on_write
2841	* list. The l2arc will free the data once it's finished
2842	* writing it to the l2arc device.
2843	*/
2844	if (HDR_L2_WRITING(hdr)) {
2845	arc_hdr_free_on_write(hdr);
2846	ARCSTAT_BUMP(arcstat_l2_free_on_write);
2847	} else {
2848	arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
2849	arc_hdr_size(hdr), hdr);
2850	}
2851	hdr->b_l1hdr.b_pdata = NULL;
2852	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
2853
2854	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
2855	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2856	}
2857
2858	static arc_buf_hdr_t *
2859	arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
2860	enum zio_compress compress, arc_buf_contents_t type)
2861	{
2862	arc_buf_hdr_t *hdr;
2863
2864	ASSERT3U(lsize, >, `0`);
2865	VERIFY(type == ARC_BUFC_DATA \|\| type == ARC_BUFC_METADATA);
2866
2867	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
2868	ASSERT(HDR_EMPTY(hdr));
2869	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
2870	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
2871	HDR_SET_PSIZE(hdr, psize);
2872	HDR_SET_LSIZE(hdr, lsize);
2873	hdr->b_spa = spa;
2874	hdr->b_type = type;
2875	hdr->b_flags = `0`;
2876	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) \| ARC_FLAG_HAS_L1HDR);
2877	arc_hdr_set_compress(hdr, compress);
2878
2879	hdr->b_l1hdr.b_state = arc_anon;
2880	hdr->b_l1hdr.b_arc_access = `0`;
2881	hdr->b_l1hdr.b_bufcnt = `0`;
2882	hdr->b_l1hdr.b_buf = NULL;
2883
2884	/*
2885	* Allocate the hdr's buffer. This will contain either
2886	* the compressed or uncompressed data depending on the block
2887	* it references and compressed arc enablement.
2888	*/
2889	arc_hdr_alloc_pdata(hdr);
2890	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2891
2892	return (hdr);
2893	}
2894
2895	/*
2896	* Transition between the two allocation states for the arc_buf_hdr struct.
2897	* The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
2898	* (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
2899	* version is used when a cache buffer is only in the L2ARC in order to reduce
2900	* memory usage.
2901	*/
2902	static arc_buf_hdr_t *
2903	arc_hdr_realloc(arc_buf_hdr_t hdr, kmem_cache_t old, kmem_cache_t *new)
2904	{
2905	ASSERT(HDR_HAS_L2HDR(hdr));
2906
2907	arc_buf_hdr_t *nhdr;
2908	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2909
2910	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) \|\|
2911	(old == hdr_l2only_cache && new == hdr_full_cache));
2912
2913	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
2914
2915	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
2916	buf_hash_remove(hdr);
2917
2918	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
2919
2920	if (new == hdr_full_cache) {
2921	arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
2922	/*
2923	* arc_access and arc_change_state need to be aware that a
2924	* header has just come out of L2ARC, so we set its state to
2925	* l2c_only even though it's about to change.
2926	*/
2927	nhdr->b_l1hdr.b_state = arc_l2c_only;
2928
2929	/ Verify previous threads set to NULL before freeing /
2930	ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
2931	} else {
2932	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2933	ASSERT0(hdr->b_l1hdr.b_bufcnt);
2934	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
2935
2936	/*
2937	* If we've reached here, We must have been called from
2938	* arc_evict_hdr(), as such we should have already been
2939	* removed from any ghost list we were previously on
2940	* (which protects us from racing with arc_evict_state),
2941	* thus no locking is needed during this check.
2942	*/
2943	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
2944
2945	/*
2946	* A buffer must not be moved into the arc_l2c_only
2947	* state if it's not finished being written out to the
2948	* l2arc device. Otherwise, the b_l1hdr.b_pdata field
2949	* might try to be accessed, even though it was removed.
2950	*/
2951	VERIFY(!HDR_L2_WRITING(hdr));
2952	VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
2953
2954	#ifdef ZFS_DEBUG
2955	if (hdr->b_l1hdr.b_thawed != NULL) {
2956	kmem_free(hdr->b_l1hdr.b_thawed, `1`);
2957	hdr->b_l1hdr.b_thawed = NULL;
2958	}
2959	#endif
2960
2961	arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
2962	}
2963	/*
2964	* The header has been reallocated so we need to re-insert it into any
2965	* lists it was on.
2966	*/
2967	(void) buf_hash_insert(nhdr, NULL);
2968
2969	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
2970
2971	mutex_enter(&dev->l2ad_mtx);
2972
2973	/*
2974	* We must place the realloc'ed header back into the list at
2975	* the same spot. Otherwise, if it's placed earlier in the list,
2976	* l2arc_write_buffers() could find it during the function's
2977	* write phase, and try to write it out to the l2arc.
2978	*/
2979	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
2980	list_remove(&dev->l2ad_buflist, hdr);
2981
2982	mutex_exit(&dev->l2ad_mtx);
2983
2984	/*
2985	* Since we're using the pointer address as the tag when
2986	* incrementing and decrementing the l2ad_alloc refcount, we
2987	* must remove the old pointer (that we're about to destroy) and
2988	* add the new pointer to the refcount. Otherwise we'd remove
2989	* the wrong pointer address when calling arc_hdr_destroy() later.
2990	*/
2991
2992	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
2993	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
2994
2995	buf_discard_identity(hdr);
2996	kmem_cache_free(old, hdr);
2997
2998	return (nhdr);
2999	}
3000
3001	/*
3002	* Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3003	* The buf is returned thawed since we expect the consumer to modify it.
3004	*/
3005	arc_buf_t *
3006	arc_alloc_buf(spa_t spa, int32_t size, void* *tag, arc_buf_contents_t type)
3007	{
3008	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3009	ZIO_COMPRESS_OFF, type);
3010	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3011	arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
3012	arc_buf_thaw(buf);
3013	return (buf);
3014	}
3015
3016	static void
3017	arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3018	{
3019	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3020	l2arc_dev_t *dev = l2hdr->b_dev;
3021	uint64_t asize = arc_hdr_size(hdr);
3022
3023	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3024	ASSERT(HDR_HAS_L2HDR(hdr));
3025
3026	list_remove(&dev->l2ad_buflist, hdr);
3027
3028	ARCSTAT_INCR(arcstat_l2_asize, -asize);
3029	ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
3030
3031	vdev_space_update(dev->l2ad_vdev, -asize, `0`, `0`);
3032
3033	(void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
3034	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3035	}
3036
3037	static void
3038	arc_hdr_destroy(arc_buf_hdr_t *hdr)
3039	{
3040	if (HDR_HAS_L1HDR(hdr)) {
3041	ASSERT(hdr->b_l1hdr.b_buf == NULL \|\|
3042	hdr->b_l1hdr.b_bufcnt > `0`);
3043	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3044	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3045	}
3046	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3047	ASSERT(!HDR_IN_HASH_TABLE(hdr));
3048
3049	if (!HDR_EMPTY(hdr))
3050	buf_discard_identity(hdr);
3051
3052	if (HDR_HAS_L2HDR(hdr)) {
3053	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3054	boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3055
3056	if (!buflist_held)
3057	mutex_enter(&dev->l2ad_mtx);
3058
3059	/*
3060	* Even though we checked this conditional above, we
3061	* need to check this again now that we have the
3062	* l2ad_mtx. This is because we could be racing with
3063	* another thread calling l2arc_evict() which might have
3064	* destroyed this header's L2 portion as we were waiting
3065	* to acquire the l2ad_mtx. If that happens, we don't
3066	* want to re-destroy the header's L2 portion.
3067	*/
3068	if (HDR_HAS_L2HDR(hdr)) {
3069	l2arc_trim(hdr);
3070	arc_hdr_l2hdr_destroy(hdr);
3071	}
3072
3073	if (!buflist_held)
3074	mutex_exit(&dev->l2ad_mtx);
3075	}
3076
3077	if (HDR_HAS_L1HDR(hdr)) {
3078	arc_cksum_free(hdr);
3079
3080	while (hdr->b_l1hdr.b_buf != NULL)
3081	arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
3082
3083	#ifdef ZFS_DEBUG
3084	if (hdr->b_l1hdr.b_thawed != NULL) {
3085	kmem_free(hdr->b_l1hdr.b_thawed, `1`);
3086	hdr->b_l1hdr.b_thawed = NULL;
3087	}
3088	#endif
3089
3090	if (hdr->b_l1hdr.b_pdata != NULL) {
3091	arc_hdr_free_pdata(hdr);
3092	}
3093	}
3094
3095	ASSERT3P(hdr->b_hash_next, ==, NULL);
3096	if (HDR_HAS_L1HDR(hdr)) {
3097	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3098	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
3099	kmem_cache_free(hdr_full_cache, hdr);
3100	} else {
3101	kmem_cache_free(hdr_l2only_cache, hdr);
3102	}
3103	}
3104
3105	void
3106	arc_buf_destroy(arc_buf_t buf, void** tag)
3107	{
3108	arc_buf_hdr_t *hdr = buf->b_hdr;
3109	kmutex_t *hash_lock = HDR_LOCK(hdr);
3110
3111	if (hdr->b_l1hdr.b_state == arc_anon) {
3112	ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, `1`);
3113	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3114	VERIFY0(remove_reference(hdr, NULL, tag));
3115	arc_hdr_destroy(hdr);
3116	return;
3117	}
3118
3119	mutex_enter(hash_lock);
3120	ASSERT3P(hdr, ==, buf->b_hdr);
3121	ASSERT(hdr->b_l1hdr.b_bufcnt > `0`);
3122	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3123	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3124	ASSERT3P(buf->b_data, !=, NULL);
3125
3126	(void) remove_reference(hdr, hash_lock, tag);
3127	arc_buf_destroy_impl(buf, B_TRUE);
3128	mutex_exit(hash_lock);
3129	}
3130
3131	int32_t
3132	arc_buf_size(arc_buf_t *buf)
3133	{
3134	return (HDR_GET_LSIZE(buf->b_hdr));
3135	}
3136
3137	/*
3138	* Evict the arc_buf_hdr that is provided as a parameter. The resultant
3139	* state of the header is dependent on its state prior to entering this
3140	* function. The following transitions are possible:
3141	*
3142	* - arc_mru -> arc_mru_ghost
3143	* - arc_mfu -> arc_mfu_ghost
3144	* - arc_mru_ghost -> arc_l2c_only
3145	* - arc_mru_ghost -> deleted
3146	* - arc_mfu_ghost -> arc_l2c_only
3147	* - arc_mfu_ghost -> deleted
3148	*/
3149	static int64_t
3150	arc_evict_hdr(arc_buf_hdr_t hdr, kmutex_t hash_lock)
3151	{
3152	arc_state_t evicted_state, state;
3153	int64_t bytes_evicted = `0`;
3154
3155	ASSERT(MUTEX_HELD(hash_lock));
3156	ASSERT(HDR_HAS_L1HDR(hdr));
3157
3158	state = hdr->b_l1hdr.b_state;
3159	if (GHOST_STATE(state)) {
3160	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3161	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3162
3163	/*
3164	* l2arc_write_buffers() relies on a header's L1 portion
3165	* (i.e. its b_pdata field) during its write phase.
3166	* Thus, we cannot push a header onto the arc_l2c_only
3167	* state (removing it's L1 piece) until the header is
3168	* done being written to the l2arc.
3169	*/
3170	if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3171	ARCSTAT_BUMP(arcstat_evict_l2_skip);
3172	return (bytes_evicted);
3173	}
3174
3175	ARCSTAT_BUMP(arcstat_deleted);
3176	bytes_evicted += HDR_GET_LSIZE(hdr);
3177
3178	DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
3179
3180	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
3181	if (HDR_HAS_L2HDR(hdr)) {
3182	ASSERT(hdr->b_l1hdr.b_pdata == NULL);
3183	/*
3184	* This buffer is cached on the 2nd Level ARC;
3185	* don't destroy the header.
3186	*/
3187	arc_change_state(arc_l2c_only, hdr, hash_lock);
3188	/*
3189	* dropping from L1+L2 cached to L2-only,
3190	* realloc to remove the L1 header.
3191	*/
3192	hdr = arc_hdr_realloc(hdr, hdr_full_cache,
3193	hdr_l2only_cache);
3194	} else {
3195	ASSERT(hdr->b_l1hdr.b_pdata == NULL);
3196	arc_change_state(arc_anon, hdr, hash_lock);
3197	arc_hdr_destroy(hdr);
3198	}
3199	return (bytes_evicted);
3200	}
3201
3202	ASSERT(state == arc_mru \|\| state == arc_mfu);
3203	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3204
3205	/ prefetch buffers have a minimum lifespan /
3206	if (HDR_IO_IN_PROGRESS(hdr) \|\|
3207	((hdr->b_flags & (ARC_FLAG_PREFETCH \| ARC_FLAG_INDIRECT)) &&
3208	ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
3209	arc_min_prefetch_lifespan)) {
3210	ARCSTAT_BUMP(arcstat_evict_skip);
3211	return (bytes_evicted);
3212	}
3213
3214	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3215	while (hdr->b_l1hdr.b_buf) {
3216	arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3217	if (!mutex_tryenter(&buf->b_evict_lock)) {
3218	ARCSTAT_BUMP(arcstat_mutex_miss);
3219	break;
3220	}
3221	if (buf->b_data != NULL)
3222	bytes_evicted += HDR_GET_LSIZE(hdr);
3223	mutex_exit(&buf->b_evict_lock);
3224	arc_buf_destroy_impl(buf, B_TRUE);
3225	}
3226
3227	if (HDR_HAS_L2HDR(hdr)) {
3228	ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
3229	} else {
3230	if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3231	ARCSTAT_INCR(arcstat_evict_l2_eligible,
3232	HDR_GET_LSIZE(hdr));
3233	} else {
3234	ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3235	HDR_GET_LSIZE(hdr));
3236	}
3237	}
3238
3239	if (hdr->b_l1hdr.b_bufcnt == `0`) {
3240	arc_cksum_free(hdr);
3241
3242	bytes_evicted += arc_hdr_size(hdr);
3243
3244	/*
3245	* If this hdr is being evicted and has a compressed
3246	* buffer then we discard it here before we change states.
3247	* This ensures that the accounting is updated correctly
3248	* in arc_free_data_buf().
3249	*/
3250	arc_hdr_free_pdata(hdr);
3251
3252	arc_change_state(evicted_state, hdr, hash_lock);
3253	ASSERT(HDR_IN_HASH_TABLE(hdr));
3254	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
3255	DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
3256	}
3257
3258	return (bytes_evicted);
3259	}
3260
3261	static uint64_t
3262	arc_evict_state_impl(multilist_t ml, int* idx, arc_buf_hdr_t *marker,
3263	uint64_t spa, int64_t bytes)
3264	{
3265	multilist_sublist_t *mls;
3266	uint64_t bytes_evicted = `0`;
3267	arc_buf_hdr_t *hdr;
3268	kmutex_t *hash_lock;
3269	int evict_count = `0`;
3270
3271	ASSERT3P(marker, !=, NULL);
3272	IMPLY(bytes < `0`, bytes == ARC_EVICT_ALL);
3273
3274	mls = multilist_sublist_lock(ml, idx);
3275
3276	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
3277	hdr = multilist_sublist_prev(mls, marker)) {
3278	if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) \|\|
3279	(evict_count >= zfs_arc_evict_batch_limit))
3280	break;
3281
3282	/*
3283	* To keep our iteration location, move the marker
3284	* forward. Since we're not holding hdr's hash lock, we
3285	* must be very careful and not remove 'hdr' from the
3286	* sublist. Otherwise, other consumers might mistake the
3287	* 'hdr' as not being on a sublist when they call the
3288	* multilist_link_active() function (they all rely on
3289	* the hash lock protecting concurrent insertions and
3290	* removals). multilist_sublist_move_forward() was
3291	* specifically implemented to ensure this is the case
3292	* (only 'marker' will be removed and re-inserted).
3293	*/
3294	multilist_sublist_move_forward(mls, marker);
3295
3296	/*
3297	* The only case where the b_spa field should ever be
3298	* zero, is the marker headers inserted by
3299	* arc_evict_state(). It's possible for multiple threads
3300	* to be calling arc_evict_state() concurrently (e.g.
3301	* dsl_pool_close() and zio_inject_fault()), so we must
3302	* skip any markers we see from these other threads.
3303	*/
3304	if (hdr->b_spa == `0`)
3305	continue;
3306
3307	/ we're only interested in evicting buffers of a certain spa /
3308	if (spa != `0` && hdr->b_spa != spa) {
3309	ARCSTAT_BUMP(arcstat_evict_skip);
3310	continue;
3311	}
3312
3313	hash_lock = HDR_LOCK(hdr);
3314
3315	/*
3316	* We aren't calling this function from any code path
3317	* that would already be holding a hash lock, so we're
3318	* asserting on this assumption to be defensive in case
3319	* this ever changes. Without this check, it would be
3320	* possible to incorrectly increment arcstat_mutex_miss
3321	* below (e.g. if the code changed such that we called
3322	* this function with a hash lock held).
3323	*/
3324	ASSERT(!MUTEX_HELD(hash_lock));
3325
3326	if (mutex_tryenter(hash_lock)) {
3327	uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
3328	mutex_exit(hash_lock);
3329
3330	bytes_evicted += evicted;
3331
3332	/*
3333	* If evicted is zero, arc_evict_hdr() must have
3334	* decided to skip this header, don't increment
3335	* evict_count in this case.
3336	*/
3337	if (evicted != `0`)
3338	evict_count++;
3339
3340	/*
3341	* If arc_size isn't overflowing, signal any
3342	* threads that might happen to be waiting.
3343	*
3344	* For each header evicted, we wake up a single
3345	* thread. If we used cv_broadcast, we could
3346	* wake up "too many" threads causing arc_size
3347	* to significantly overflow arc_c; since
3348	* arc_get_data_buf() doesn't check for overflow
3349	* when it's woken up (it doesn't because it's
3350	* possible for the ARC to be overflowing while
3351	* full of un-evictable buffers, and the
3352	* function should proceed in this case).
3353	*
3354	* If threads are left sleeping, due to not
3355	* using cv_broadcast, they will be woken up
3356	* just before arc_reclaim_thread() sleeps.
3357	*/
3358	mutex_enter(&arc_reclaim_lock);
3359	if (!arc_is_overflowing())
3360	cv_signal(&arc_reclaim_waiters_cv);
3361	mutex_exit(&arc_reclaim_lock);
3362	} else {
3363	ARCSTAT_BUMP(arcstat_mutex_miss);
3364	}
3365	}
3366
3367	multilist_sublist_unlock(mls);
3368
3369	return (bytes_evicted);
3370	}
3371
3372	/*
3373	* Evict buffers from the given arc state, until we've removed the
3374	* specified number of bytes. Move the removed buffers to the
3375	* appropriate evict state.
3376	*
3377	* This function makes a "best effort". It skips over any buffers
3378	* it can't get a hash_lock on, and so, may not catch all candidates.
3379	* It may also return without evicting as much space as requested.
3380	*
3381	* If bytes is specified using the special value ARC_EVICT_ALL, this
3382	* will evict all available (i.e. unlocked and evictable) buffers from
3383	* the given arc state; which is used by arc_flush().
3384	*/
3385	static uint64_t
3386	arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
3387	arc_buf_contents_t type)
3388	{
3389	uint64_t total_evicted = `0`;
3390	multilist_t *ml = &state->arcs_list[type];
3391	int num_sublists;
3392	arc_buf_hdr_t **markers;
3393
3394	IMPLY(bytes < `0`, bytes == ARC_EVICT_ALL);
3395
3396	num_sublists = multilist_get_num_sublists(ml);
3397
3398	/*
3399	* If we've tried to evict from each sublist, made some
3400	* progress, but still have not hit the target number of bytes
3401	* to evict, we want to keep trying. The markers allow us to
3402	* pick up where we left off for each individual sublist, rather
3403	* than starting from the tail each time.
3404	*/
3405	markers = kmem_zalloc(sizeof (markers) num_sublists, KM_SLEEP);
3406	for (int i = `0`; i < num_sublists; i++) {
3407	markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
3408
3409	/*
3410	* A b_spa of 0 is used to indicate that this header is
3411	* a marker. This fact is used in arc_adjust_type() and
3412	* arc_evict_state_impl().
3413	*/
3414	markers[i]->b_spa = `0`;
3415
3416	multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3417	multilist_sublist_insert_tail(mls, markers[i]);
3418	multilist_sublist_unlock(mls);
3419	}
3420
3421	/*
3422	* While we haven't hit our target number of bytes to evict, or
3423	* we're evicting all available buffers.
3424	*/
3425	while (total_evicted < bytes \|\| bytes == ARC_EVICT_ALL) {
3426	/*
3427	* Start eviction using a randomly selected sublist,
3428	* this is to try and evenly balance eviction across all
3429	* sublists. Always starting at the same sublist
3430	* (e.g. index 0) would cause evictions to favor certain
3431	* sublists over others.
3432	*/
3433	int sublist_idx = multilist_get_random_index(ml);
3434	uint64_t scan_evicted = `0`;
3435
3436	for (int i = `0`; i < num_sublists; i++) {
3437	uint64_t bytes_remaining;
3438	uint64_t bytes_evicted;
3439
3440	if (bytes == ARC_EVICT_ALL)
3441	bytes_remaining = ARC_EVICT_ALL;
3442	else if (total_evicted < bytes)
3443	bytes_remaining = bytes - total_evicted;
3444	else
3445	break;
3446
3447	bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
3448	markers[sublist_idx], spa, bytes_remaining);
3449
3450	scan_evicted += bytes_evicted;
3451	total_evicted += bytes_evicted;
3452
3453	/ we've reached the end, wrap to the beginning /
3454	if (++sublist_idx >= num_sublists)
3455	sublist_idx = `0`;
3456	}
3457
3458	/*
3459	* If we didn't evict anything during this scan, we have
3460	* no reason to believe we'll evict more during another
3461	* scan, so break the loop.
3462	*/
3463	if (scan_evicted == `0`) {
3464	/ This isn't possible, let's make that obvious /
3465	ASSERT3S(bytes, !=, `0`);
3466
3467	/*
3468	* When bytes is ARC_EVICT_ALL, the only way to
3469	* break the loop is when scan_evicted is zero.
3470	* In that case, we actually have evicted enough,
3471	* so we don't want to increment the kstat.
3472	*/
3473	if (bytes != ARC_EVICT_ALL) {
3474	ASSERT3S(total_evicted, <, bytes);
3475	ARCSTAT_BUMP(arcstat_evict_not_enough);
3476	}
3477
3478	break;
3479	}
3480	}
3481
3482	for (int i = `0`; i < num_sublists; i++) {
3483	multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3484	multilist_sublist_remove(mls, markers[i]);
3485	multilist_sublist_unlock(mls);
3486
3487	kmem_cache_free(hdr_full_cache, markers[i]);
3488	}
3489	kmem_free(markers, sizeof (markers) num_sublists);
3490
3491	return (total_evicted);
3492	}
3493
3494	/*
3495	* Flush all "evictable" data of the given type from the arc state
3496	* specified. This will not evict any "active" buffers (i.e. referenced).
3497	*
3498	* When 'retry' is set to B_FALSE, the function will make a single pass
3499	* over the state and evict any buffers that it can. Since it doesn't
3500	* continually retry the eviction, it might end up leaving some buffers
3501	* in the ARC due to lock misses.
3502	*
3503	* When 'retry' is set to B_TRUE, the function will continually retry the
3504	* eviction until all evictable buffers have been removed from the
3505	* state. As a result, if concurrent insertions into the state are
3506	* allowed (e.g. if the ARC isn't shutting down), this function might
3507	* wind up in an infinite loop, continually trying to evict buffers.
3508	*/
3509	static uint64_t
3510	arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3511	boolean_t retry)
3512	{
3513	uint64_t evicted = `0`;
3514
3515	while (refcount_count(&state->arcs_esize[type]) != `0`) {
3516	evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3517
3518	if (!retry)
3519	break;
3520	}
3521
3522	return (evicted);
3523	}
3524
3525	/*
3526	* Evict the specified number of bytes from the state specified,
3527	* restricting eviction to the spa and type given. This function
3528	* prevents us from trying to evict more from a state's list than
3529	* is "evictable", and to skip evicting altogether when passed a
3530	* negative value for "bytes". In contrast, arc_evict_state() will
3531	* evict everything it can, when passed a negative value for "bytes".
3532	*/
3533	static uint64_t
3534	arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3535	arc_buf_contents_t type)
3536	{
3537	int64_t delta;
3538
3539	if (bytes > `0` && refcount_count(&state->arcs_esize[type]) > `0`) {
3540	delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
3541	return (arc_evict_state(state, spa, delta, type));
3542	}
3543
3544	return (`0`);
3545	}
3546
3547	/*
3548	* Evict metadata buffers from the cache, such that arc_meta_used is
3549	* capped by the arc_meta_limit tunable.
3550	*/
3551	static uint64_t
3552	arc_adjust_meta(void)
3553	{
3554	uint64_t total_evicted = `0`;
3555	int64_t target;
3556
3557	/*
3558	* If we're over the meta limit, we want to evict enough
3559	* metadata to get back under the meta limit. We don't want to
3560	* evict so much that we drop the MRU below arc_p, though. If
3561	* we're over the meta limit more than we're over arc_p, we
3562	* evict some from the MRU here, and some from the MFU below.
3563	*/
3564	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3565	(int64_t)(refcount_count(&arc_anon->arcs_size) +
3566	refcount_count(&arc_mru->arcs_size) - arc_p));
3567
3568	total_evicted += arc_adjust_impl(arc_mru, `0`, target, ARC_BUFC_METADATA);
3569
3570	/*
3571	* Similar to the above, we want to evict enough bytes to get us
3572	* below the meta limit, but not so much as to drop us below the
3573	* space alloted to the MFU (which is defined as arc_c - arc_p).
3574	*/
3575	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3576	(int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
3577
3578	total_evicted += arc_adjust_impl(arc_mfu, `0`, target, ARC_BUFC_METADATA);
3579
3580	return (total_evicted);
3581	}
3582
3583	/*
3584	* Return the type of the oldest buffer in the given arc state
3585	*
3586	* This function will select a random sublist of type ARC_BUFC_DATA and
3587	* a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
3588	* is compared, and the type which contains the "older" buffer will be
3589	* returned.
3590	*/
3591	static arc_buf_contents_t
3592	arc_adjust_type(arc_state_t *state)
3593	{
3594	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
3595	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
3596	int data_idx = multilist_get_random_index(data_ml);
3597	int meta_idx = multilist_get_random_index(meta_ml);
3598	multilist_sublist_t *data_mls;
3599	multilist_sublist_t *meta_mls;
3600	arc_buf_contents_t type;
3601	arc_buf_hdr_t *data_hdr;
3602	arc_buf_hdr_t *meta_hdr;
3603
3604	/*
3605	* We keep the sublist lock until we're finished, to prevent
3606	* the headers from being destroyed via arc_evict_state().
3607	*/
3608	data_mls = multilist_sublist_lock(data_ml, data_idx);
3609	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
3610
3611	/*
3612	* These two loops are to ensure we skip any markers that
3613	* might be at the tail of the lists due to arc_evict_state().
3614	*/
3615
3616	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
3617	data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
3618	if (data_hdr->b_spa != `0`)
3619	break;
3620	}
3621
3622	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
3623	meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
3624	if (meta_hdr->b_spa != `0`)
3625	break;
3626	}
3627
3628	if (data_hdr == NULL && meta_hdr == NULL) {
3629	type = ARC_BUFC_DATA;
3630	} else if (data_hdr == NULL) {
3631	ASSERT3P(meta_hdr, !=, NULL);
3632	type = ARC_BUFC_METADATA;
3633	} else if (meta_hdr == NULL) {
3634	ASSERT3P(data_hdr, !=, NULL);
3635	type = ARC_BUFC_DATA;
3636	} else {
3637	ASSERT3P(data_hdr, !=, NULL);
3638	ASSERT3P(meta_hdr, !=, NULL);
3639
3640	/ The headers can't be on the sublist without an L1 header /
3641	ASSERT(HDR_HAS_L1HDR(data_hdr));
3642	ASSERT(HDR_HAS_L1HDR(meta_hdr));
3643
3644	if (data_hdr->b_l1hdr.b_arc_access <
3645	meta_hdr->b_l1hdr.b_arc_access) {
3646	type = ARC_BUFC_DATA;
3647	} else {
3648	type = ARC_BUFC_METADATA;
3649	}
3650	}
3651
3652	multilist_sublist_unlock(meta_mls);
3653	multilist_sublist_unlock(data_mls);
3654
3655	return (type);
3656	}
3657
3658	/*
3659	* Evict buffers from the cache, such that arc_size is capped by arc_c.
3660	*/
3661	static uint64_t
3662	arc_adjust(void)
3663	{
3664	uint64_t total_evicted = `0`;
3665	uint64_t bytes;
3666	int64_t target;
3667
3668	/*
3669	* If we're over arc_meta_limit, we want to correct that before
3670	* potentially evicting data buffers below.
3671	*/
3672	total_evicted += arc_adjust_meta();
3673
3674	/*
3675	* Adjust MRU size
3676	*
3677	* If we're over the target cache size, we want to evict enough
3678	* from the list to get back to our target size. We don't want
3679	* to evict too much from the MRU, such that it drops below
3680	* arc_p. So, if we're over our target cache size more than
3681	* the MRU is over arc_p, we'll evict enough to get back to
3682	* arc_p here, and then evict more from the MFU below.
3683	*/
3684	target = MIN((int64_t)(arc_size - arc_c),
3685	(int64_t)(refcount_count(&arc_anon->arcs_size) +
3686	refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
3687
3688	/*
3689	* If we're below arc_meta_min, always prefer to evict data.
3690	* Otherwise, try to satisfy the requested number of bytes to
3691	* evict from the type which contains older buffers; in an
3692	* effort to keep newer buffers in the cache regardless of their
3693	* type. If we cannot satisfy the number of bytes from this
3694	* type, spill over into the next type.
3695	*/
3696	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3697	arc_meta_used > arc_meta_min) {
3698	bytes = arc_adjust_impl(arc_mru, `0`, target, ARC_BUFC_METADATA);
3699	total_evicted += bytes;
3700
3701	/*
3702	* If we couldn't evict our target number of bytes from
3703	* metadata, we try to get the rest from data.
3704	*/
3705	target -= bytes;
3706
3707	total_evicted +=
3708	arc_adjust_impl(arc_mru, `0`, target, ARC_BUFC_DATA);
3709	} else {
3710	bytes = arc_adjust_impl(arc_mru, `0`, target, ARC_BUFC_DATA);
3711	total_evicted += bytes;
3712
3713	/*
3714	* If we couldn't evict our target number of bytes from
3715	* data, we try to get the rest from metadata.
3716	*/
3717	target -= bytes;
3718
3719	total_evicted +=
3720	arc_adjust_impl(arc_mru, `0`, target, ARC_BUFC_METADATA);
3721	}
3722
3723	/*
3724	* Adjust MFU size
3725	*
3726	* Now that we've tried to evict enough from the MRU to get its
3727	* size back to arc_p, if we're still above the target cache
3728	* size, we evict the rest from the MFU.
3729	*/
3730	target = arc_size - arc_c;
3731
3732	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
3733	arc_meta_used > arc_meta_min) {
3734	bytes = arc_adjust_impl(arc_mfu, `0`, target, ARC_BUFC_METADATA);
3735	total_evicted += bytes;
3736
3737	/*
3738	* If we couldn't evict our target number of bytes from
3739	* metadata, we try to get the rest from data.
3740	*/
3741	target -= bytes;
3742
3743	total_evicted +=
3744	arc_adjust_impl(arc_mfu, `0`, target, ARC_BUFC_DATA);
3745	} else {
3746	bytes = arc_adjust_impl(arc_mfu, `0`, target, ARC_BUFC_DATA);
3747	total_evicted += bytes;
3748
3749	/*
3750	* If we couldn't evict our target number of bytes from
3751	* data, we try to get the rest from data.
3752	*/
3753	target -= bytes;
3754
3755	total_evicted +=
3756	arc_adjust_impl(arc_mfu, `0`, target, ARC_BUFC_METADATA);
3757	}
3758
3759	/*
3760	* Adjust ghost lists
3761	*
3762	* In addition to the above, the ARC also defines target values
3763	* for the ghost lists. The sum of the mru list and mru ghost
3764	* list should never exceed the target size of the cache, and
3765	* the sum of the mru list, mfu list, mru ghost list, and mfu
3766	* ghost list should never exceed twice the target size of the
3767	* cache. The following logic enforces these limits on the ghost
3768	* caches, and evicts from them as needed.
3769	*/
3770	target = refcount_count(&arc_mru->arcs_size) +
3771	refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3772
3773	bytes = arc_adjust_impl(arc_mru_ghost, `0`, target, ARC_BUFC_DATA);
3774	total_evicted += bytes;
3775
3776	target -= bytes;
3777
3778	total_evicted +=
3779	arc_adjust_impl(arc_mru_ghost, `0`, target, ARC_BUFC_METADATA);
3780
3781	/*
3782	* We assume the sum of the mru list and mfu list is less than
3783	* or equal to arc_c (we enforced this above), which means we
3784	* can use the simpler of the two equations below:
3785	*
3786	* mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3787	* mru ghost + mfu ghost <= arc_c
3788	*/
3789	target = refcount_count(&arc_mru_ghost->arcs_size) +
3790	refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3791
3792	bytes = arc_adjust_impl(arc_mfu_ghost, `0`, target, ARC_BUFC_DATA);
3793	total_evicted += bytes;
3794
3795	target -= bytes;
3796
3797	total_evicted +=
3798	arc_adjust_impl(arc_mfu_ghost, `0`, target, ARC_BUFC_METADATA);
3799
3800	return (total_evicted);
3801	}
3802
3803	void
3804	arc_flush(spa_t *spa, boolean_t retry)
3805	{
3806	uint64_t guid = `0`;
3807
3808	/*
3809	* If retry is B_TRUE, a spa must not be specified since we have
3810	* no good way to determine if all of a spa's buffers have been
3811	* evicted from an arc state.
3812	*/
3813	ASSERT(!retry \|\| spa == `0`);
3814
3815	if (spa != NULL)
3816	guid = spa_load_guid(spa);
3817
3818	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3819	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3820
3821	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3822	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3823
3824	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3825	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3826
3827	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3828	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3829	}
3830
3831	void
3832	arc_shrink(int64_t to_free)
3833	{
3834	if (arc_c > arc_c_min) {
3835	DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
3836	arc_c_min, uint64_t, arc_p, uint64_t, to_free);
3837	if (arc_c > arc_c_min + to_free)
3838	atomic_add_64(&arc_c, -to_free);
3839	else
3840	arc_c = arc_c_min;
3841
3842	atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3843	if (arc_c > arc_size)
3844	arc_c = MAX(arc_size, arc_c_min);
3845	if (arc_p > arc_c)
3846	arc_p = (arc_c >> `1`);
3847
3848	DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
3849	arc_p);
3850
3851	ASSERT(arc_c >= arc_c_min);
3852	ASSERT((int64_t)arc_p >= `0`);
3853	}
3854
3855	if (arc_size > arc_c) {
3856	DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
3857	uint64_t, arc_c);
3858	(void) arc_adjust();
3859	}
3860	}
3861
/*
 * Page count consulted by arc_available_memory() (scaled by PAGESIZE) and
 * by arc_reclaim_thread() (via ptob()); arc_reclaim_thread() resets it to
 * zero once it is done evicting.
 */
static long needfree = 0;

/*
 * Identifies which check in arc_available_memory() produced the lowest
 * (most constraining) value.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN,		/* initial value; no check was lowest */
	FMR_NEEDFREE,		/* needfree > 0 */
	FMR_LOTSFREE,		/* freemem vs. free target / lotsfree */
	FMR_SWAPFS_MINFREE,	/* availrmem vs. swapfs reservations */
	FMR_PAGES_PP_MAXIMUM,	/* availrmem vs. pages_pp_maximum */
	FMR_HEAP_ARENA,		/* less than 1/4 of heap_arena free */
	FMR_ZIO_ARENA,		/* zio_arena free vs. 1/16 of allocated */
	FMR_ZIO_FRAG,		/* largest free heap span too small */
} free_memory_reason_t;

/* Result and reason recorded by the last arc_available_memory() call. */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;
3887
/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.
 *
 * Each check below computes a candidate value "n"; the smallest candidate
 * wins and the check that produced it is remembered in "r".  The result
 * and reason are also stored in last_free_memory / last_free_reason.
 */
static int64_t
arc_available_memory(void)
{
	int64_t lowest = INT64_MAX;
	int64_t n;
	free_memory_reason_t r = FMR_UNKNOWN;

#ifdef _KERNEL
	/* An outstanding needfree request is reported as a deficit. */
	if (needfree > 0) {
		n = PAGESIZE * (-needfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_NEEDFREE;
		}
	}

	/*
	 * Cooperate with pagedaemon when it's time for it to scan
	 * and reclaim some pages.
	 */
	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

#ifdef illumos
	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
	    desfree - arc_swapfs_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_SWAPFS_MINFREE;
	}


	/*
	 * Check that we have enough availrmem that memory locking (e.g., via
	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
	 * stores the number of pages that cannot be locked; when availrmem
	 * drops below pages_pp_maximum, page locking mechanisms such as
	 * page_pp_lock() will fail.)
	 */
	n = PAGESIZE * (availrmem - pages_pp_maximum -
	    arc_pages_pp_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_PAGES_PP_MAXIMUM;
	}

#endif	/* illumos */
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
	if (n < lowest) {
		lowest = n;
		r = FMR_HEAP_ARENA;
	}
	/*
	 * NOTE: these macros are defined inside the function body but, being
	 * preprocessor definitions, remain visible for the rest of the file.
	 */
#define	zio_arena	NULL
#else
#define	zio_arena	heap_arena
#endif

	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this arena remains
	 * above about 1/16th free.
	 *
	 * Note: The 1/16th arena free requirement was put in place
	 * to aggressively evict memory from the arc in order to avoid
	 * memory fragmentation issues.
	 */
	if (zio_arena != NULL) {
		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_ARENA;
		}
	}

#if __FreeBSD__
	/*
	 * Above limits know nothing about real level of KVA fragmentation.
	 * Start aggressive reclamation if too little sequential KVA left.
	 *
	 * Only consulted when no earlier check already reported a deficit.
	 * When the largest free span is smaller than SPA_MAXBLOCKSIZE the
	 * reported deficit is 1/16th of the allocated heap.
	 */
	if (lowest > 0) {
		n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ?
		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
		    INT64_MAX;
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_FRAG;
		}
	}
#endif

#else	/* _KERNEL */
	/*
	 * Outside the kernel, report a small deficit whenever
	 * spa_get_random(100) returns 0.
	 */
	if (spa_get_random(100) == 0)
		lowest = -1024;
#endif	/* _KERNEL */

	last_free_memory = lowest;
	last_free_reason = r;
	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
	return (lowest);
}
4031
4032
4033	/*
4034	* Determine if the system is under memory pressure and is asking
4035	* to reclaim memory. A return value of B_TRUE indicates that the system
4036	* is under memory pressure and that the arc should adjust accordingly.
4037	*/
4038	static boolean_t
4039	arc_reclaim_needed(void)
4040	{
4041	return (arc_available_memory() < `0`);
4042	}
4043
4044	extern kmem_cache_t *zio_buf_cache[];
4045	extern kmem_cache_t *zio_data_buf_cache[];
4046	extern kmem_cache_t *range_seg_cache;
4047
4048	static __noinline void
4049	arc_kmem_reap_now(void)
4050	{
4051	size_t i;
4052	kmem_cache_t *prev_cache = NULL;
4053	kmem_cache_t *prev_data_cache = NULL;
4054
4055	DTRACE_PROBE(arc__kmem_reap_start);
4056	#ifdef _KERNEL
4057	if (arc_meta_used >= arc_meta_limit) {
4058	/*
4059	* We are exceeding our meta-data cache limit.
4060	* Purge some DNLC entries to release holds on meta-data.
4061	*/
4062	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4063	}
4064	#if defined(__i386)
4065	/*
4066	* Reclaim unused memory from all kmem caches.
4067	*/
4068	kmem_reap();
4069	#endif
4070	#endif
4071
4072	for (i = `0`; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4073	if (zio_buf_cache[i] != prev_cache) {
4074	prev_cache = zio_buf_cache[i];
4075	kmem_cache_reap_now(zio_buf_cache[i]);
4076	}
4077	if (zio_data_buf_cache[i] != prev_data_cache) {
4078	prev_data_cache = zio_data_buf_cache[i];
4079	kmem_cache_reap_now(zio_data_buf_cache[i]);
4080	}
4081	}
4082	kmem_cache_reap_now(buf_cache);
4083	kmem_cache_reap_now(hdr_full_cache);
4084	kmem_cache_reap_now(hdr_l2only_cache);
4085	kmem_cache_reap_now(range_seg_cache);
4086
4087	#ifdef illumos
4088	if (zio_arena != NULL) {
4089	/*
4090	* Ask the vmem arena to reclaim unused memory from its
4091	* quantum caches.
4092	*/
4093	vmem_qcache_reap(zio_arena);
4094	}
4095	#endif
4096	DTRACE_PROBE(arc__kmem_reap_end);
4097	}
4098
/*
 * Threads can block in arc_get_data_buf() waiting for this thread to evict
 * enough data and signal them to proceed. When this happens, the threads in
 * arc_get_data_buf() are sleeping while holding the hash lock for their
 * particular arc header. Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 *
 * The loop runs until arc_reclaim_thread_exit is set.  arc_reclaim_lock is
 * held at the top and bottom of each iteration and dropped while evicting
 * and reaping.
 */
static void
arc_reclaim_thread(void *dummy __unused)
{
	hrtime_t growtime = 0;
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_lock);
	while (!arc_reclaim_thread_exit) {
		uint64_t evicted = 0;

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		mutex_exit(&arc_reclaim_lock);

		/*
		 * We call arc_adjust() before (possibly) calling
		 * arc_kmem_reap_now(), so that we can wake up
		 * arc_get_data_buf() sooner.
		 */
		evicted = arc_adjust();

		int64_t free_memory = arc_available_memory();
		if (free_memory < 0) {

			arc_no_grow = B_TRUE;
			arc_warm = B_TRUE;

			/*
			 * Wait at least arc_grow_retry seconds
			 * before considering growing.
			 */
			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);

			arc_kmem_reap_now();

			/*
			 * If we are still low on memory, shrink the ARC
			 * so that (arc_c >> arc_shrink_shift) bytes are
			 * free.
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
				/* Free at least what needfree asks for. */
				to_free = MAX(to_free, ptob(needfree));
#endif
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			/* Not short of memory, but too close to grow. */
			arc_no_grow = B_TRUE;
		} else if (gethrtime() >= growtime) {
			/* Enough headroom and the retry delay has elapsed. */
			arc_no_grow = B_FALSE;
		}

		mutex_enter(&arc_reclaim_lock);

		/*
		 * If evicted is zero, we couldn't evict anything via
		 * arc_adjust(). This could be due to hash lock
		 * collisions, but more likely due to the majority of
		 * arc buffers being unevictable. Therefore, even if
		 * arc_size is above arc_c, another pass is unlikely to
		 * be helpful and could potentially cause us to enter an
		 * infinite loop.
		 */
		if (arc_size <= arc_c || evicted == 0) {
#ifdef _KERNEL
			needfree = 0;
#endif
			/*
			 * We're either no longer overflowing, or we
			 * can't evict anything more, so we should wake
			 * up any threads before we go to sleep.
			 */
			cv_broadcast(&arc_reclaim_waiters_cv);

			/*
			 * Block until signaled, or after one second (we
			 * might need to perform arc_kmem_reap_now()
			 * even if we aren't being signalled)
			 */
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
		}
	}

	/* Acknowledge the exit request to whoever is waiting on the CV. */
	arc_reclaim_thread_exit = B_FALSE;
	cv_broadcast(&arc_reclaim_thread_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
	thread_exit();
}
4223
4224	#ifdef __FreeBSD__
4225
4226	static u_int arc_dnlc_evicts_arg;
4227	extern struct vfsops zfs_vfsops;
4228
4229	static void
4230	arc_dnlc_evicts_thread(void *dummy __unused)
4231	{
4232	callb_cpr_t cpr;
4233	u_int percent;
4234
4235	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
4236
4237	mutex_enter(&arc_dnlc_evicts_lock);
4238	while (!arc_dnlc_evicts_thread_exit) {
4239	CALLB_CPR_SAFE_BEGIN(&cpr);
4240	(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
4241	CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
4242	if (arc_dnlc_evicts_arg != `0`) {
4243	percent = arc_dnlc_evicts_arg;
4244	mutex_exit(&arc_dnlc_evicts_lock);
4245	#ifdef _KERNEL
4246	vnlru_free(desiredvnodes * percent / `100`, &zfs_vfsops);
4247	#endif
4248	mutex_enter(&arc_dnlc_evicts_lock);
4249	/*
4250	* Clear our token only after vnlru_free()
4251	* pass is done, to avoid false queueing of
4252	* the requests.
4253	*/
4254	arc_dnlc_evicts_arg = `0`;
4255	}
4256	}
4257	arc_dnlc_evicts_thread_exit = FALSE;
4258	cv_broadcast(&arc_dnlc_evicts_cv);
4259	CALLB_CPR_EXIT(&cpr);
4260	thread_exit();
4261	}
4262
4263	void
4264	dnlc_reduce_cache(void *arg)
4265	{
4266	u_int percent;
4267
4268	percent = (u_int)(uintptr_t)arg;
4269	mutex_enter(&arc_dnlc_evicts_lock);
4270	if (arc_dnlc_evicts_arg == `0`) {
4271	arc_dnlc_evicts_arg = percent;
4272	cv_broadcast(&arc_dnlc_evicts_cv);
4273	}
4274	mutex_exit(&arc_dnlc_evicts_lock);
4275	}
4276
4277	#endif
4278
4279	/*
4280	* Adapt arc info given the number of bytes we are trying to add and
4281	* the state that we are comming from. This function is only called
4282	* when we are adding new content to the cache.
4283	*/
4284	static void
4285	arc_adapt(int bytes, arc_state_t *state)
4286	{
4287	int mult;
4288	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
4289	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
4290	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
4291
4292	if (state == arc_l2c_only)
4293	return;
4294
4295	ASSERT(bytes > `0`);
4296	/*
4297	* Adapt the target size of the MRU list:
4298	* - if we just hit in the MRU ghost list, then increase
4299	* the target size of the MRU list.
4300	* - if we just hit in the MFU ghost list, then increase
4301	* the target size of the MFU list by decreasing the
4302	* target size of the MRU list.
4303	*/
4304	if (state == arc_mru_ghost) {
4305	mult = (mrug_size >= mfug_size) ? `1` : (mfug_size / mrug_size);
4306	mult = MIN(mult, `10`); / avoid wild arc_p adjustment /
4307
4308	arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
4309	} else if (state == arc_mfu_ghost) {
4310	uint64_t delta;
4311
4312	mult = (mfug_size >= mrug_size) ? `1` : (mrug_size / mfug_size);
4313	mult = MIN(mult, `10`);
4314
4315	delta = MIN(bytes * mult, arc_p);
4316	arc_p = MAX(arc_p_min, arc_p - delta);
4317	}
4318	ASSERT((int64_t)arc_p >= `0`);
4319
4320	if (arc_reclaim_needed()) {
4321	cv_signal(&arc_reclaim_thread_cv);
4322	return;
4323	}
4324
4325	if (arc_no_grow)
4326	return;
4327
4328	if (arc_c >= arc_c_max)
4329	return;
4330
4331	/*
4332	* If we're within (2 * maxblocksize) bytes of the target
4333	* cache size, increment the target cache size
4334	*/
4335	if (arc_size > arc_c - (`2ULL` << SPA_MAXBLOCKSHIFT)) {
4336	DTRACE_PROBE1(arc__inc_adapt, int, bytes);
4337	atomic_add_64(&arc_c, (int64_t)bytes);
4338	if (arc_c > arc_c_max)
4339	arc_c = arc_c_max;
4340	else if (state == arc_anon)
4341	atomic_add_64(&arc_p, (int64_t)bytes);
4342	if (arc_p > arc_c)
4343	arc_p = arc_c;
4344	}
4345	ASSERT((int64_t)arc_p >= `0`);
4346	}
4347
4348	/*
4349	* Check if arc_size has grown past our upper threshold, determined by
4350	* zfs_arc_overflow_shift.
4351	*/
4352	static boolean_t
4353	arc_is_overflowing(void)
4354	{
4355	/ Always allow at least one block of overflow /
4356	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
4357	arc_c >> zfs_arc_overflow_shift);
4358
4359	return (arc_size >= arc_c + overflow);
4360	}
4361
4362	/*
4363	* Allocate a block and return it to the caller. If we are hitting the
4364	* hard limit for the cache size, we must sleep, waiting for the eviction
4365	* thread to catch up. If we're past the target size but below the hard
4366	* limit, we'll only signal the reclaim thread and continue on.
4367	*/
4368	static void *
4369	arc_get_data_buf(arc_buf_hdr_t hdr, uint64_t size, void* *tag)
4370	{
4371	void *datap = NULL;
4372	arc_state_t *state = hdr->b_l1hdr.b_state;
4373	arc_buf_contents_t type = arc_buf_type(hdr);
4374
4375	arc_adapt(size, state);
4376
4377	/*
4378	* If arc_size is currently overflowing, and has grown past our
4379	* upper limit, we must be adding data faster than the evict
4380	* thread can evict. Thus, to ensure we don't compound the
4381	* problem by adding more data and forcing arc_size to grow even
4382	* further past it's target size, we halt and wait for the
4383	* eviction thread to catch up.
4384	*
4385	* It's also possible that the reclaim thread is unable to evict
4386	* enough buffers to get arc_size below the overflow limit (e.g.
4387	* due to buffers being un-evictable, or hash lock collisions).
4388	* In this case, we want to proceed regardless if we're
4389	* overflowing; thus we don't use a while loop here.
4390	*/
4391	if (arc_is_overflowing()) {
4392	mutex_enter(&arc_reclaim_lock);
4393
4394	/*
4395	* Now that we've acquired the lock, we may no longer be
4396	* over the overflow limit, lets check.
4397	*
4398	* We're ignoring the case of spurious wake ups. If that
4399	* were to happen, it'd let this thread consume an ARC
4400	* buffer before it should have (i.e. before we're under
4401	* the overflow limit and were signalled by the reclaim
4402	* thread). As long as that is a rare occurrence, it
4403	* shouldn't cause any harm.
4404	*/
4405	if (arc_is_overflowing()) {
4406	cv_signal(&arc_reclaim_thread_cv);
4407	cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
4408	}
4409
4410	mutex_exit(&arc_reclaim_lock);
4411	}
4412
4413	VERIFY3U(hdr->b_type, ==, type);
4414	if (type == ARC_BUFC_METADATA) {
4415	datap = zio_buf_alloc(size);
4416	arc_space_consume(size, ARC_SPACE_META);
4417	} else {
4418	ASSERT(type == ARC_BUFC_DATA);
4419	datap = zio_data_buf_alloc(size);
4420	arc_space_consume(size, ARC_SPACE_DATA);
4421	}
4422
4423	/*
4424	* Update the state size. Note that ghost states have a
4425	* "ghost size" and so don't need to be updated.
4426	*/
4427	if (!GHOST_STATE(state)) {
4428
4429	(void) refcount_add_many(&state->arcs_size, size, tag);
4430
4431	/*
4432	* If this is reached via arc_read, the link is
4433	* protected by the hash lock. If reached via
4434	* arc_buf_alloc, the header should not be accessed by
4435	* any other thread. And, if reached via arc_read_done,
4436	* the hash lock will protect it if it's found in the
4437	* hash table; otherwise no other thread should be
4438	* trying to [add\|remove]_reference it.
4439	*/
4440	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4441	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4442	(void) refcount_add_many(&state->arcs_esize[type],
4443	size, tag);
4444	}
4445
4446	/*
4447	* If we are growing the cache, and we are adding anonymous
4448	* data, and we have outgrown arc_p, update arc_p
4449	*/
4450	if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
4451	(refcount_count(&arc_anon->arcs_size) +
4452	refcount_count(&arc_mru->arcs_size) > arc_p))
4453	arc_p = MIN(arc_c, arc_p + size);
4454	}
4455	ARCSTAT_BUMP(arcstat_allocated);
4456	return (datap);
4457	}
4458
4459	/*
4460	* Free the arc data buffer.
4461	*/
4462	static void
4463	arc_free_data_buf(arc_buf_hdr_t hdr, void* data, uint64_t size, void* *tag)
4464	{
4465	arc_state_t *state = hdr->b_l1hdr.b_state;
4466	arc_buf_contents_t type = arc_buf_type(hdr);
4467
4468	/ protected by hash lock, if in the hash table /
4469	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4470	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4471	ASSERT(state != arc_anon && state != arc_l2c_only);
4472
4473	(void) refcount_remove_many(&state->arcs_esize[type],
4474	size, tag);
4475	}
4476	(void) refcount_remove_many(&state->arcs_size, size, tag);
4477
4478	VERIFY3U(hdr->b_type, ==, type);
4479	if (type == ARC_BUFC_METADATA) {
4480	zio_buf_free(data, size);
4481	arc_space_return(size, ARC_SPACE_META);
4482	} else {
4483	ASSERT(type == ARC_BUFC_DATA);
4484	zio_data_buf_free(data, size);
4485	arc_space_return(size, ARC_SPACE_DATA);
4486	}
4487	}
4488
4489	/*
4490	* This routine is called whenever a buffer is accessed.
4491	* NOTE: the hash lock is dropped in this function.
4492	*/
4493	static void
4494	arc_access(arc_buf_hdr_t hdr, kmutex_t hash_lock)
4495	{
4496	clock_t now;
4497
4498	ASSERT(MUTEX_HELD(hash_lock));
4499	ASSERT(HDR_HAS_L1HDR(hdr));
4500
4501	if (hdr->b_l1hdr.b_state == arc_anon) {
4502	/*
4503	* This buffer is not in the cache, and does not
4504	* appear in our "ghost" list. Add the new buffer
4505	* to the MRU state.
4506	*/
4507
4508	ASSERT0(hdr->b_l1hdr.b_arc_access);
4509	hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4510	DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4511	arc_change_state(arc_mru, hdr, hash_lock);
4512
4513	} else if (hdr->b_l1hdr.b_state == arc_mru) {
4514	now = ddi_get_lbolt();
4515
4516	/*
4517	* If this buffer is here because of a prefetch, then either:
4518	* - clear the flag if this is a "referencing" read
4519	* (any subsequent access will bump this into the MFU state).
4520	* or
4521	* - move the buffer to the head of the list if this is
4522	* another prefetch (to make it less likely to be evicted).
4523	*/
4524	if (HDR_PREFETCH(hdr)) {
4525	if (refcount_count(&hdr->b_l1hdr.b_refcnt) == `0`) {
4526	/ link protected by hash lock /
4527	ASSERT(multilist_link_active(
4528	&hdr->b_l1hdr.b_arc_node));
4529	} else {
4530	arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
4531	ARCSTAT_BUMP(arcstat_mru_hits);
4532	}
4533	hdr->b_l1hdr.b_arc_access = now;
4534	return;
4535	}
4536
4537	/*
4538	* This buffer has been "accessed" only once so far,
4539	* but it is still in the cache. Move it to the MFU
4540	* state.
4541	*/
4542	if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
4543	/*
4544	* More than 125ms have passed since we
4545	* instantiated this buffer. Move it to the
4546	* most frequently used state.
4547	*/
4548	hdr->b_l1hdr.b_arc_access = now;
4549	DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4550	arc_change_state(arc_mfu, hdr, hash_lock);
4551	}
4552	ARCSTAT_BUMP(arcstat_mru_hits);
4553	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
4554	arc_state_t *new_state;
4555	/*
4556	* This buffer has been "accessed" recently, but
4557	* was evicted from the cache. Move it to the
4558	* MFU state.
4559	*/
4560
4561	if (HDR_PREFETCH(hdr)) {
4562	new_state = arc_mru;
4563	if (refcount_count(&hdr->b_l1hdr.b_refcnt) > `0`)
4564	arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
4565	DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4566	} else {
4567	new_state = arc_mfu;
4568	DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4569	}
4570
4571	hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4572	arc_change_state(new_state, hdr, hash_lock);
4573
4574	ARCSTAT_BUMP(arcstat_mru_ghost_hits);
4575	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
4576	/*
4577	* This buffer has been accessed more than once and is
4578	* still in the cache. Keep it in the MFU state.
4579	*
4580	* NOTE: an add_reference() that occurred when we did
4581	* the arc_read() will have kicked this off the list.
4582	* If it was a prefetch, we will explicitly move it to
4583	* the head of the list now.
4584	*/
4585	if ((HDR_PREFETCH(hdr)) != `0`) {
4586	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4587	/ link protected by hash_lock /
4588	ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4589	}
4590	ARCSTAT_BUMP(arcstat_mfu_hits);
4591	hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4592	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
4593	arc_state_t *new_state = arc_mfu;
4594	/*
4595	* This buffer has been accessed more than once but has
4596	* been evicted from the cache. Move it back to the
4597	* MFU state.
4598	*/
4599
4600	if (HDR_PREFETCH(hdr)) {
4601	/*
4602	* This is a prefetch access...
4603	* move this block back to the MRU state.
4604	*/
4605	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4606	new_state = arc_mru;
4607	}
4608
4609	hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4610	DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4611	arc_change_state(new_state, hdr, hash_lock);
4612
4613	ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
4614	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
4615	/*
4616	* This buffer is on the 2nd Level ARC.
4617	*/
4618
4619	hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4620	DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4621	arc_change_state(arc_mfu, hdr, hash_lock);
4622	} else {
4623	ASSERT(!"invalid arc state");
4624	}
4625	}
4626
4627	/ a generic arc_done_func_t which you can use /
4628	/ ARGSUSED /
4629	void
4630	arc_bcopy_func(zio_t zio, arc_buf_t buf, void *arg)
4631	{
4632	if (zio == NULL \|\| zio->io_error == `0`)
4633	bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
4634	arc_buf_destroy(buf, arg);
4635	}
4636
4637	/ a generic arc_done_func_t /
4638	void
4639	arc_getbuf_func(zio_t zio, arc_buf_t buf, void *arg)
4640	{
4641	arc_buf_t **bufp = arg;
4642	if (zio && zio->io_error) {
4643	arc_buf_destroy(buf, arg);
4644	*bufp = NULL;
4645	} else {
4646	*bufp = buf;
4647	ASSERT(buf->b_data);
4648	}
4649	}
4650
4651	static void
4652	arc_hdr_verify(arc_buf_hdr_t hdr, blkptr_t bp)
4653	{
4654	if (BP_IS_HOLE(bp) \|\| BP_IS_EMBEDDED(bp)) {
4655	ASSERT3U(HDR_GET_PSIZE(hdr), ==, `0`);
4656	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
4657	} else {
4658	if (HDR_COMPRESSION_ENABLED(hdr)) {
4659	ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
4660	BP_GET_COMPRESS(bp));
4661	}
4662	ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
4663	ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
4664	}
4665	}
4666
4667	static void
4668	arc_read_done(zio_t *zio)
4669	{
4670	arc_buf_hdr_t *hdr = zio->io_private;
4671	arc_buf_t abuf = NULL; /* buffer we're assigning to callback /
4672	kmutex_t *hash_lock = NULL;
4673	arc_callback_t callback_list, acb;
4674	int freeable = B_FALSE;
4675
4676	/*
4677	* The hdr was inserted into hash-table and removed from lists
4678	* prior to starting I/O. We should find this header, since
4679	* it's in the hash table, and it should be legit since it's
4680	* not possible to evict it during the I/O. The only possible
4681	* reason for it not to be found is if we were freed during the
4682	* read.
4683	*/
4684	if (HDR_IN_HASH_TABLE(hdr)) {
4685	ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4686	ASSERT3U(hdr->b_dva.dva_word[`0`], ==,
4687	BP_IDENTITY(zio->io_bp)->dva_word[`0`]);
4688	ASSERT3U(hdr->b_dva.dva_word[`1`], ==,
4689	BP_IDENTITY(zio->io_bp)->dva_word[`1`]);
4690
4691	arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
4692	&hash_lock);
4693
4694	ASSERT((found == hdr &&
4695	DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) \|\|
4696	(found == hdr && HDR_L2_READING(hdr)));
4697	ASSERT3P(hash_lock, !=, NULL);
4698	}
4699
4700	if (zio->io_error == `0`) {
4701	/ byteswap if necessary /
4702	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
4703	if (BP_GET_LEVEL(zio->io_bp) > `0`) {
4704	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
4705	} else {
4706	hdr->b_l1hdr.b_byteswap =
4707	DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
4708	}
4709	} else {
4710	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
4711	}
4712	}
4713
4714	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
4715	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
4716	arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
4717
4718	callback_list = hdr->b_l1hdr.b_acb;
4719	ASSERT3P(callback_list, !=, NULL);
4720
4721	if (hash_lock && zio->io_error == `0` &&
4722	hdr->b_l1hdr.b_state == arc_anon) {
4723	/*
4724	* Only call arc_access on anonymous buffers. This is because
4725	* if we've issued an I/O for an evicted buffer, we've already
4726	* called arc_access (to prevent any simultaneous readers from
4727	* getting confused).
4728	*/
4729	arc_access(hdr, hash_lock);
4730	}
4731
4732	/ create copies of the data buffer for the callers /
4733	for (acb = callback_list; acb; acb = acb->acb_next) {
4734	if (acb->acb_done != NULL) {
4735	/*
4736	* If we're here, then this must be a demand read
4737	* since prefetch requests don't have callbacks.
4738	* If a read request has a callback (i.e. acb_done is
4739	* not NULL), then we decompress the data for the
4740	* first request and clone the rest. This avoids
4741	* having to waste cpu resources decompressing data
4742	* that nobody is explicitly waiting to read.
4743	*/
4744	if (abuf == NULL) {
4745	acb->acb_buf = arc_buf_alloc_impl(hdr,
4746	acb->acb_private);
4747	if (zio->io_error == `0`) {
4748	zio->io_error =
4749	arc_decompress(acb->acb_buf);
4750	}
4751	abuf = acb->acb_buf;
4752	} else {
4753	add_reference(hdr, acb->acb_private);
4754	acb->acb_buf = arc_buf_clone(abuf);
4755	}
4756	}
4757	}
4758	hdr->b_l1hdr.b_acb = NULL;
4759	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
4760	if (abuf == NULL) {
4761	/*
4762	* This buffer didn't have a callback so it must
4763	* be a prefetch.
4764	*/
4765	ASSERT(HDR_PREFETCH(hdr));
4766	ASSERT0(hdr->b_l1hdr.b_bufcnt);
4767	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
4768	}
4769
4770	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) \|\|
4771	callback_list != NULL);
4772
4773	if (zio->io_error == `0`) {
4774	arc_hdr_verify(hdr, zio->io_bp);
4775	} else {
4776	arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
4777	if (hdr->b_l1hdr.b_state != arc_anon)
4778	arc_change_state(arc_anon, hdr, hash_lock);
4779	if (HDR_IN_HASH_TABLE(hdr))
4780	buf_hash_remove(hdr);
4781	freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4782	}
4783
4784	/*
4785	* Broadcast before we drop the hash_lock to avoid the possibility
4786	* that the hdr (and hence the cv) might be freed before we get to
4787	* the cv_broadcast().
4788	*/
4789	cv_broadcast(&hdr->b_l1hdr.b_cv);
4790
4791	if (hash_lock != NULL) {
4792	mutex_exit(hash_lock);
4793	} else {
4794	/*
4795	* This block was freed while we waited for the read to
4796	* complete. It has been removed from the hash table and
4797	* moved to the anonymous state (so that it won't show up
4798	* in the cache).
4799	*/
4800	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4801	freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4802	}
4803
4804	/ execute each callback and free its structure /
4805	while ((acb = callback_list) != NULL) {
4806	if (acb->acb_done)
4807	acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4808
4809	if (acb->acb_zio_dummy != NULL) {
4810	acb->acb_zio_dummy->io_error = zio->io_error;
4811	zio_nowait(acb->acb_zio_dummy);
4812	}
4813
4814	callback_list = acb->acb_next;
4815	kmem_free(acb, sizeof (arc_callback_t));
4816	}
4817
4818	if (freeable)
4819	arc_hdr_destroy(hdr);
4820	}
4821
4822	/*
4823	* "Read" the block at the specified DVA (in bp) via the
4824	* cache. If the block is found in the cache, invoke the provided
4825	* callback immediately and return. Note that the `zio' parameter
4826	* in the callback will be NULL in this case, since no IO was
4827	* required. If the block is not in the cache pass the read request
4828	* on to the spa with a substitute callback function, so that the
4829	* requested block will be added to the cache.
4830	*
4831	* If a read request arrives for a block that has a read in-progress,
4832	* either wait for the in-progress read to complete (and return the
4833	* results); or, if this is a read with a "done" func, add a record
4834	* to the read to invoke the "done" func when the read completes,
4835	* and return; or just return.
4836	*
4837	* arc_read_done() will invoke all the requested "done" functions
4838	* for readers of this block.
4839	*/
4840	int
4841	arc_read(zio_t pio, spa_t spa, const blkptr_t bp, arc_done_func_t done,
4842	void private, zio_priority_t priority, int* zio_flags,
4843	arc_flags_t arc_flags, const* zbookmark_phys_t *zb)
4844	{
4845	arc_buf_hdr_t *hdr = NULL;
4846	kmutex_t *hash_lock = NULL;
4847	zio_t *rzio;
4848	uint64_t guid = spa_load_guid(spa);
4849
4850	ASSERT(!BP_IS_EMBEDDED(bp) \|\|
4851	BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4852
4853	top:
4854	if (!BP_IS_EMBEDDED(bp)) {
4855	/*
4856	* Embedded BP's have no DVA and require no I/O to "read".
4857	* Create an anonymous arc buf to back it.
4858	*/
4859	hdr = buf_hash_find(guid, bp, &hash_lock);
4860	}
4861
4862	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
4863	arc_buf_t *buf = NULL;
4864	*arc_flags \|= ARC_FLAG_CACHED;
4865
4866	if (HDR_IO_IN_PROGRESS(hdr)) {
4867
4868	if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
4869	priority == ZIO_PRIORITY_SYNC_READ) {
4870	/*
4871	* This sync read must wait for an
4872	* in-progress async read (e.g. a predictive
4873	* prefetch). Async reads are queued
4874	* separately at the vdev_queue layer, so
4875	* this is a form of priority inversion.
4876	* Ideally, we would "inherit" the demand
4877	* i/o's priority by moving the i/o from
4878	* the async queue to the synchronous queue,
4879	* but there is currently no mechanism to do
4880	* so. Track this so that we can evaluate
4881	* the magnitude of this potential performance
4882	* problem.
4883	*
4884	* Note that if the prefetch i/o is already
4885	* active (has been issued to the device),
4886	* the prefetch improved performance, because
4887	* we issued it sooner than we would have
4888	* without the prefetch.
4889	*/
4890	DTRACE_PROBE1(arc__sync__wait__for__async,
4891	arc_buf_hdr_t *, hdr);
4892	ARCSTAT_BUMP(arcstat_sync_wait_for_async);
4893	}
4894	if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4895	arc_hdr_clear_flags(hdr,
4896	ARC_FLAG_PREDICTIVE_PREFETCH);
4897	}
4898
4899	if (*arc_flags & ARC_FLAG_WAIT) {
4900	cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4901	mutex_exit(hash_lock);
4902	goto top;
4903	}
4904	ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4905
4906	if (done) {
4907	arc_callback_t *acb = NULL;
4908
4909	acb = kmem_zalloc(sizeof (arc_callback_t),
4910	KM_SLEEP);
4911	acb->acb_done = done;
4912	acb->acb_private = private;
4913	if (pio != NULL)
4914	acb->acb_zio_dummy = zio_null(pio,
4915	spa, NULL, NULL, NULL, zio_flags);
4916
4917	ASSERT3P(acb->acb_done, !=, NULL);
4918	acb->acb_next = hdr->b_l1hdr.b_acb;
4919	hdr->b_l1hdr.b_acb = acb;
4920	mutex_exit(hash_lock);
4921	return (`0`);
4922	}
4923	mutex_exit(hash_lock);
4924	return (`0`);
4925	}
4926
4927	ASSERT(hdr->b_l1hdr.b_state == arc_mru \|\|
4928	hdr->b_l1hdr.b_state == arc_mfu);
4929
4930	if (done) {
4931	if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4932	/*
4933	* This is a demand read which does not have to
4934	* wait for i/o because we did a predictive
4935	* prefetch i/o for it, which has completed.
4936	*/
4937	DTRACE_PROBE1(
4938	arc__demand__hit__predictive__prefetch,
4939	arc_buf_hdr_t *, hdr);
4940	ARCSTAT_BUMP(
4941	arcstat_demand_hit_predictive_prefetch);
4942	arc_hdr_clear_flags(hdr,
4943	ARC_FLAG_PREDICTIVE_PREFETCH);
4944	}
4945	ASSERT(!BP_IS_EMBEDDED(bp) \|\| !BP_IS_HOLE(bp));
4946
4947	/*
4948	* If this block is already in use, create a new
4949	* copy of the data so that we will be guaranteed
4950	* that arc_release() will always succeed.
4951	*/
4952	buf = hdr->b_l1hdr.b_buf;
4953	if (buf == NULL) {
4954	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4955	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
4956	buf = arc_buf_alloc_impl(hdr, private);
4957	VERIFY0(arc_decompress(buf));
4958	} else {
4959	add_reference(hdr, private);
4960	buf = arc_buf_clone(buf);
4961	}
4962	ASSERT3P(buf->b_data, !=, NULL);
4963
4964	} else if (*arc_flags & ARC_FLAG_PREFETCH &&
4965	refcount_count(&hdr->b_l1hdr.b_refcnt) == `0`) {
4966	arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
4967	}
4968	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4969	arc_access(hdr, hash_lock);
4970	if (*arc_flags & ARC_FLAG_L2CACHE)
4971	arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
4972	mutex_exit(hash_lock);
4973	ARCSTAT_BUMP(arcstat_hits);
4974	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4975	demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4976	data, metadata, hits);
4977
4978	if (done)
4979	done(NULL, buf, private);
4980	} else {
4981	uint64_t lsize = BP_GET_LSIZE(bp);
4982	uint64_t psize = BP_GET_PSIZE(bp);
4983	arc_callback_t *acb;
4984	vdev_t *vd = NULL;
4985	uint64_t addr = `0`;
4986	boolean_t devw = B_FALSE;
4987	uint64_t size;
4988
4989	if (hdr == NULL) {
4990	/ this block is not in the cache /
4991	arc_buf_hdr_t *exists = NULL;
4992	arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4993	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
4994	BP_GET_COMPRESS(bp), type);
4995
4996	if (!BP_IS_EMBEDDED(bp)) {
4997	hdr->b_dva = *BP_IDENTITY(bp);
4998	hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4999	exists = buf_hash_insert(hdr, &hash_lock);
5000	}
5001	if (exists != NULL) {
5002	/ somebody beat us to the hash insert /
5003	mutex_exit(hash_lock);
5004	buf_discard_identity(hdr);
5005	arc_hdr_destroy(hdr);
5006	goto top; / restart the IO request /
5007	}
5008	} else {
5009	/*
5010	* This block is in the ghost cache. If it was L2-only
5011	* (and thus didn't have an L1 hdr), we realloc the
5012	* header to add an L1 hdr.
5013	*/
5014	if (!HDR_HAS_L1HDR(hdr)) {
5015	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
5016	hdr_full_cache);
5017	}
5018	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
5019	ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
5020	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5021	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5022	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
5023
5024	/*
5025	* This is a delicate dance that we play here.
5026	* This hdr is in the ghost list so we access it
5027	* to move it out of the ghost list before we
5028	* initiate the read. If it's a prefetch then
5029	* it won't have a callback so we'll remove the
5030	* reference that arc_buf_alloc_impl() created. We
5031	* do this after we've called arc_access() to
5032	* avoid hitting an assert in remove_reference().
5033	*/
5034	arc_access(hdr, hash_lock);
5035	arc_hdr_alloc_pdata(hdr);
5036	}
5037	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
5038	size = arc_hdr_size(hdr);
5039
5040	/*
5041	* If compression is enabled on the hdr, then will do
5042	* RAW I/O and will store the compressed data in the hdr's
5043	* data block. Otherwise, the hdr's data block will contain
5044	* the uncompressed data.
5045	*/
5046	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5047	zio_flags \|= ZIO_FLAG_RAW;
5048	}
5049
5050	if (*arc_flags & ARC_FLAG_PREFETCH)
5051	arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5052	if (*arc_flags & ARC_FLAG_L2CACHE)
5053	arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5054	if (BP_GET_LEVEL(bp) > `0`)
5055	arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5056	if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5057	arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5058	ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5059
5060	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5061	acb->acb_done = done;
5062	acb->acb_private = private;
5063
5064	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5065	hdr->b_l1hdr.b_acb = acb;
5066	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5067
5068	if (HDR_HAS_L2HDR(hdr) &&
5069	(vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5070	devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5071	addr = hdr->b_l2hdr.b_daddr;
5072	/*
5073	* Lock out device removal.
5074	*/
5075	if (vdev_is_dead(vd) \|\|
5076	!spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
5077	vd = NULL;
5078	}
5079
5080	if (priority == ZIO_PRIORITY_ASYNC_READ)
5081	arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5082	else
5083	arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5084
5085	if (hash_lock != NULL)
5086	mutex_exit(hash_lock);
5087
5088	/*
5089	* At this point, we have a level 1 cache miss. Try again in
5090	* L2ARC if possible.
5091	*/
5092	ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
5093
5094	DTRACE_PROBE4(arc__miss, arc_buf_hdr_t , hdr, blkptr_t , bp,
5095	uint64_t, lsize, zbookmark_phys_t *, zb);
5096	ARCSTAT_BUMP(arcstat_misses);
5097	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5098	demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5099	data, metadata, misses);
5100	#ifdef __FreeBSD__
5101	#ifdef _KERNEL
5102	#ifdef RACCT
5103	if (racct_enable) {
5104	PROC_LOCK(curproc);
5105	racct_add_force(curproc, RACCT_READBPS, size);
5106	racct_add_force(curproc, RACCT_READIOPS, `1`);
5107	PROC_UNLOCK(curproc);
5108	}
5109	#endif /* RACCT */
5110	curthread->td_ru.ru_inblock++;
5111	#endif
5112	#endif
5113
5114	if (vd != NULL && l2arc_ndev != `0` && !(l2arc_norw && devw)) {
5115	/*
5116	* Read from the L2ARC if the following are true:
5117	* 1. The L2ARC vdev was previously cached.
5118	* 2. This buffer still has L2ARC metadata.
5119	* 3. This buffer isn't currently writing to the L2ARC.
5120	* 4. The L2ARC entry wasn't evicted, which may
5121	* also have invalidated the vdev.
5122	* 5. This isn't prefetch and l2arc_noprefetch is set.
5123	*/
5124	if (HDR_HAS_L2HDR(hdr) &&
5125	!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
5126	!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
5127	l2arc_read_callback_t *cb;
5128	void* b_data;
5129
5130	DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
5131	ARCSTAT_BUMP(arcstat_l2_hits);
5132
5133	cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
5134	KM_SLEEP);
5135	cb->l2rcb_hdr = hdr;
5136	cb->l2rcb_bp = *bp;
5137	cb->l2rcb_zb = *zb;
5138	cb->l2rcb_flags = zio_flags;
5139	uint64_t asize = vdev_psize_to_asize(vd, size);
5140	if (asize != size) {
5141	b_data = zio_data_buf_alloc(asize);
5142	cb->l2rcb_data = b_data;
5143	} else {
5144	b_data = hdr->b_l1hdr.b_pdata;
5145	}
5146
5147	ASSERT(addr >= VDEV_LABEL_START_SIZE &&
5148	addr + asize < vd->vdev_psize -
5149	VDEV_LABEL_END_SIZE);
5150
5151	/*
5152	* l2arc read. The SCL_L2ARC lock will be
5153	* released by l2arc_read_done().
5154	* Issue a null zio if the underlying buffer
5155	* was squashed to zero size by compression.
5156	*/
5157	ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
5158	ZIO_COMPRESS_EMPTY);
5159	rzio = zio_read_phys(pio, vd, addr,
5160	asize, b_data,
5161	ZIO_CHECKSUM_OFF,
5162	l2arc_read_done, cb, priority,
5163	zio_flags \| ZIO_FLAG_DONT_CACHE \|
5164	ZIO_FLAG_CANFAIL \|
5165	ZIO_FLAG_DONT_PROPAGATE \|
5166	ZIO_FLAG_DONT_RETRY, B_FALSE);
5167	DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
5168	zio_t *, rzio);
5169	ARCSTAT_INCR(arcstat_l2_read_bytes, size);
5170
5171	if (*arc_flags & ARC_FLAG_NOWAIT) {
5172	zio_nowait(rzio);
5173	return (`0`);
5174	}
5175
5176	ASSERT(*arc_flags & ARC_FLAG_WAIT);
5177	if (zio_wait(rzio) == `0`)
5178	return (`0`);
5179
5180	/ l2arc read error; goto zio_read() /
5181	} else {
5182	DTRACE_PROBE1(l2arc__miss,
5183	arc_buf_hdr_t *, hdr);
5184	ARCSTAT_BUMP(arcstat_l2_misses);
5185	if (HDR_L2_WRITING(hdr))
5186	ARCSTAT_BUMP(arcstat_l2_rw_clash);
5187	spa_config_exit(spa, SCL_L2ARC, vd);
5188	}
5189	} else {
5190	if (vd != NULL)
5191	spa_config_exit(spa, SCL_L2ARC, vd);
5192	if (l2arc_ndev != `0`) {
5193	DTRACE_PROBE1(l2arc__miss,
5194	arc_buf_hdr_t *, hdr);
5195	ARCSTAT_BUMP(arcstat_l2_misses);
5196	}
5197	}
5198
5199	rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
5200	arc_read_done, hdr, priority, zio_flags, zb);
5201
5202	if (*arc_flags & ARC_FLAG_WAIT)
5203	return (zio_wait(rzio));
5204
5205	ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5206	zio_nowait(rzio);
5207	}
5208	return (`0`);
5209	}
5210
5211	/*
5212	* Notify the arc that a block was freed, and thus will never be used again.
5213	*/
5214	void
5215	arc_freed(spa_t spa, const* blkptr_t *bp)
5216	{
5217	arc_buf_hdr_t *hdr;
5218	kmutex_t *hash_lock;
5219	uint64_t guid = spa_load_guid(spa);
5220
5221	ASSERT(!BP_IS_EMBEDDED(bp));
5222
5223	hdr = buf_hash_find(guid, bp, &hash_lock);
5224	if (hdr == NULL)
5225	return;
5226
5227	/*
5228	* We might be trying to free a block that is still doing I/O
5229	* (i.e. prefetch) or has a reference (i.e. a dedup-ed,
5230	* dmu_sync-ed block). If this block is being prefetched, then it
5231	* would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
5232	* until the I/O completes. A block may also have a reference if it is
5233	* part of a dedup-ed, dmu_synced write. The dmu_sync() function would
5234	* have written the new block to its final resting place on disk but
5235	* without the dedup flag set. This would have left the hdr in the MRU
5236	* state and discoverable. When the txg finally syncs it detects that
5237	* the block was overridden in open context and issues an override I/O.
5238	* Since this is a dedup block, the override I/O will determine if the
5239	* block is already in the DDT. If so, then it will replace the io_bp
5240	* with the bp from the DDT and allow the I/O to finish. When the I/O
5241	* reaches the done callback, dbuf_write_override_done, it will
5242	* check to see if the io_bp and io_bp_override are identical.
5243	* If they are not, then it indicates that the bp was replaced with
5244	* the bp in the DDT and the override bp is freed. This allows
5245	* us to arrive here with a reference on a block that is being
5246	* freed. So if we have an I/O in progress, or a reference to
5247	* this hdr, then we don't destroy the hdr.
5248	*/
5249	if (!HDR_HAS_L1HDR(hdr) \|\| (!HDR_IO_IN_PROGRESS(hdr) &&
5250	refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
5251	arc_change_state(arc_anon, hdr, hash_lock);
5252	arc_hdr_destroy(hdr);
5253	mutex_exit(hash_lock);
5254	} else {
5255	mutex_exit(hash_lock);
5256	}
5257
5258	}
5259
5260	/*
5261	* Release this buffer from the cache, making it an anonymous buffer. This
5262	* must be done after a read and prior to modifying the buffer contents.
5263	* If the buffer has more than one reference, we must make
5264	* a new hdr for the buffer.
5265	*/
5266	void
5267	arc_release(arc_buf_t buf, void* *tag)
5268	{
5269	arc_buf_hdr_t *hdr = buf->b_hdr;
5270
5271	/*
5272	* It would be nice to assert that if it's DMU metadata (level >
5273	* 0 \|\| it's the dnode file), then it must be syncing context.
5274	* But we don't know that information at this level.
5275	*/
5276
5277	mutex_enter(&buf->b_evict_lock);
5278
5279	ASSERT(HDR_HAS_L1HDR(hdr));
5280
5281	/*
5282	* We don't grab the hash lock prior to this check, because if
5283	* the buffer's header is in the arc_anon state, it won't be
5284	* linked into the hash table.
5285	*/
5286	if (hdr->b_l1hdr.b_state == arc_anon) {
5287	mutex_exit(&buf->b_evict_lock);
5288	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5289	ASSERT(!HDR_IN_HASH_TABLE(hdr));
5290	ASSERT(!HDR_HAS_L2HDR(hdr));
5291	ASSERT(HDR_EMPTY(hdr));
5292	ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, `1`);
5293	ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, `1`);
5294	ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
5295
5296	hdr->b_l1hdr.b_arc_access = `0`;
5297
5298	/*
5299	* If the buf is being overridden then it may already
5300	* have a hdr that is not empty.
5301	*/
5302	buf_discard_identity(hdr);
5303	arc_buf_thaw(buf);
5304
5305	return;
5306	}
5307
5308	kmutex_t *hash_lock = HDR_LOCK(hdr);
5309	mutex_enter(hash_lock);
5310
5311	/*
5312	* This assignment is only valid as long as the hash_lock is
5313	* held, we must be careful not to reference state or the
5314	* b_state field after dropping the lock.
5315	*/
5316	arc_state_t *state = hdr->b_l1hdr.b_state;
5317	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5318	ASSERT3P(state, !=, arc_anon);
5319
5320	/ this buffer is not on any list /
5321	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > `0`);
5322
5323	if (HDR_HAS_L2HDR(hdr)) {
5324	mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
5325
5326	/*
5327	* We have to recheck this conditional again now that
5328	* we're holding the l2ad_mtx to prevent a race with
5329	* another thread which might be concurrently calling
5330	* l2arc_evict(). In that case, l2arc_evict() might have
5331	* destroyed the header's L2 portion as we were waiting
5332	* to acquire the l2ad_mtx.
5333	*/
5334	if (HDR_HAS_L2HDR(hdr)) {
5335	l2arc_trim(hdr);
5336	arc_hdr_l2hdr_destroy(hdr);
5337	}
5338
5339	mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
5340	}
5341
5342	/*
5343	* Do we have more than one buf?
5344	*/
5345	if (hdr->b_l1hdr.b_bufcnt > `1`) {
5346	arc_buf_hdr_t *nhdr;
5347	arc_buf_t **bufp;
5348	uint64_t spa = hdr->b_spa;
5349	uint64_t psize = HDR_GET_PSIZE(hdr);
5350	uint64_t lsize = HDR_GET_LSIZE(hdr);
5351	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
5352	arc_buf_contents_t type = arc_buf_type(hdr);
5353	VERIFY3U(hdr->b_type, ==, type);
5354
5355	ASSERT(hdr->b_l1hdr.b_buf != buf \|\| buf->b_next != NULL);
5356	(void) remove_reference(hdr, hash_lock, tag);
5357
5358	if (arc_buf_is_shared(buf)) {
5359	ASSERT(HDR_SHARED_DATA(hdr));
5360	ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
5361	ASSERT(ARC_BUF_LAST(buf));
5362	}
5363
5364	/*
5365	* Pull the data off of this hdr and attach it to
5366	* a new anonymous hdr. Also find the last buffer
5367	* in the hdr's buffer list.
5368	*/
5369	arc_buf_t *lastbuf = NULL;
5370	bufp = &hdr->b_l1hdr.b_buf;
5371	while (*bufp != NULL) {
5372	if (*bufp == buf) {
5373	*bufp = buf->b_next;
5374	}
5375
5376	/*
5377	* If we've removed a buffer in the middle of
5378	* the list then update the lastbuf and update
5379	* bufp.
5380	*/
5381	if (*bufp != NULL) {
5382	lastbuf = *bufp;
5383	bufp = &(*bufp)->b_next;
5384	}
5385	}
5386	buf->b_next = NULL;
5387	ASSERT3P(lastbuf, !=, buf);
5388	ASSERT3P(lastbuf, !=, NULL);
5389
5390	/*
5391	* If the current arc_buf_t and the hdr are sharing their data
5392	* buffer, then we must stop sharing that block, transfer
5393	* ownership and setup sharing with a new arc_buf_t at the end
5394	* of the hdr's b_buf list.
5395	*/
5396	if (arc_buf_is_shared(buf)) {
5397	ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
5398	ASSERT(ARC_BUF_LAST(lastbuf));
5399	VERIFY(!arc_buf_is_shared(lastbuf));
5400
5401	/*
5402	* First, sever the block sharing relationship between
5403	* buf and the arc_buf_hdr_t. Then, setup a new
5404	* block sharing relationship with the last buffer
5405	* on the arc_buf_t list.
5406	*/
5407	arc_unshare_buf(hdr, buf);
5408	arc_share_buf(hdr, lastbuf);
5409	VERIFY3P(lastbuf->b_data, !=, NULL);
5410	} else if (HDR_SHARED_DATA(hdr)) {
5411	ASSERT(arc_buf_is_shared(lastbuf));
5412	}
5413	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
5414	ASSERT3P(state, !=, arc_l2c_only);
5415
5416	(void) refcount_remove_many(&state->arcs_size,
5417	HDR_GET_LSIZE(hdr), buf);
5418
5419	if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
5420	ASSERT3P(state, !=, arc_l2c_only);
5421	(void) refcount_remove_many(&state->arcs_esize[type],
5422	HDR_GET_LSIZE(hdr), buf);
5423	}
5424
5425	hdr->b_l1hdr.b_bufcnt -= `1`;
5426	arc_cksum_verify(buf);
5427	#ifdef illumos
5428	arc_buf_unwatch(buf);
5429	#endif
5430
5431	mutex_exit(hash_lock);
5432
5433	/*
5434	* Allocate a new hdr. The new hdr will contain a b_pdata
5435	* buffer which will be freed in arc_write().
5436	*/
5437	nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
5438	ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
5439	ASSERT0(nhdr->b_l1hdr.b_bufcnt);
5440	ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
5441	VERIFY3U(nhdr->b_type, ==, type);
5442	ASSERT(!HDR_SHARED_DATA(nhdr));
5443
5444	nhdr->b_l1hdr.b_buf = buf;
5445	nhdr->b_l1hdr.b_bufcnt = `1`;
5446	(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
5447	buf->b_hdr = nhdr;
5448
5449	mutex_exit(&buf->b_evict_lock);
5450	(void) refcount_add_many(&arc_anon->arcs_size,
5451	HDR_GET_LSIZE(nhdr), buf);
5452	} else {
5453	mutex_exit(&buf->b_evict_lock);
5454	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == `1`);
5455	/ protected by hash lock, or hdr is on arc_anon /
5456	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
5457	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5458	arc_change_state(arc_anon, hdr, hash_lock);
5459	hdr->b_l1hdr.b_arc_access = `0`;
5460	mutex_exit(hash_lock);
5461
5462	buf_discard_identity(hdr);
5463	arc_buf_thaw(buf);
5464	}
5465	}
5466
5467	int
5468	arc_released(arc_buf_t *buf)
5469	{
5470	int released;
5471
5472	mutex_enter(&buf->b_evict_lock);
5473	released = (buf->b_data != NULL &&
5474	buf->b_hdr->b_l1hdr.b_state == arc_anon);
5475	mutex_exit(&buf->b_evict_lock);
5476	return (released);
5477	}
5478
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the current reference count of buf's hdr,
 * sampled while holding buf->b_evict_lock.  The count is returned as an
 * int, so a non-zero result means the hdr is referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int referenced;

	mutex_enter(&buf->b_evict_lock);
	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
	mutex_exit(&buf->b_evict_lock);
	return (referenced);
}
#endif
5491
5492	static void
5493	arc_write_ready(zio_t *zio)
5494	{
5495	arc_write_callback_t *callback = zio->io_private;
5496	arc_buf_t *buf = callback->awcb_buf;
5497	arc_buf_hdr_t *hdr = buf->b_hdr;
5498	uint64_t psize = BP_IS_HOLE(zio->io_bp) ? `0` : BP_GET_PSIZE(zio->io_bp);
5499
5500	ASSERT(HDR_HAS_L1HDR(hdr));
5501	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
5502	ASSERT(hdr->b_l1hdr.b_bufcnt > `0`);
5503
5504	/*
5505	* If we're reexecuting this zio because the pool suspended, then
5506	* cleanup any state that was previously set the first time the
5507	* callback as invoked.
5508	*/
5509	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
5510	arc_cksum_free(hdr);
5511	#ifdef illumos
5512	arc_buf_unwatch(buf);
5513	#endif
5514	if (hdr->b_l1hdr.b_pdata != NULL) {
5515	if (arc_buf_is_shared(buf)) {
5516	ASSERT(HDR_SHARED_DATA(hdr));
5517
5518	arc_unshare_buf(hdr, buf);
5519	} else {
5520	arc_hdr_free_pdata(hdr);
5521	}
5522	}
5523	}
5524	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
5525	ASSERT(!HDR_SHARED_DATA(hdr));
5526	ASSERT(!arc_buf_is_shared(buf));
5527
5528	callback->awcb_ready(zio, buf, callback->awcb_private);
5529
5530	if (HDR_IO_IN_PROGRESS(hdr))
5531	ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
5532
5533	arc_cksum_compute(buf);
5534	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5535
5536	enum zio_compress compress;
5537	if (BP_IS_HOLE(zio->io_bp) \|\| BP_IS_EMBEDDED(zio->io_bp)) {
5538	compress = ZIO_COMPRESS_OFF;
5539	} else {
5540	ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
5541	compress = BP_GET_COMPRESS(zio->io_bp);
5542	}
5543	HDR_SET_PSIZE(hdr, psize);
5544	arc_hdr_set_compress(hdr, compress);
5545
5546	/*
5547	* If the hdr is compressed, then copy the compressed
5548	* zio contents into arc_buf_hdr_t. Otherwise, copy the original
5549	* data buf into the hdr. Ideally, we would like to always copy the
5550	* io_data into b_pdata but the user may have disabled compressed
5551	* arc thus the on-disk block may or may not match what we maintain
5552	* in the hdr's b_pdata field.
5553	*/
5554	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5555	ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
5556	ASSERT3U(psize, >, `0`);
5557	arc_hdr_alloc_pdata(hdr);
5558	bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
5559	} else {
5560	ASSERT3P(buf->b_data, ==, zio->io_orig_data);
5561	ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
5562	ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
5563	ASSERT(!HDR_SHARED_DATA(hdr));
5564	ASSERT(!arc_buf_is_shared(buf));
5565	ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, `1`);
5566	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
5567
5568	/*
5569	* This hdr is not compressed so we're able to share
5570	* the arc_buf_t data buffer with the hdr.
5571	*/
5572	arc_share_buf(hdr, buf);
5573	VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
5574	HDR_GET_LSIZE(hdr)));
5575	}
5576	arc_hdr_verify(hdr, zio->io_bp);
5577	}
5578
5579	static void
5580	arc_write_children_ready(zio_t *zio)
5581	{
5582	arc_write_callback_t *callback = zio->io_private;
5583	arc_buf_t *buf = callback->awcb_buf;
5584
5585	callback->awcb_children_ready(zio, buf, callback->awcb_private);
5586	}
5587
5588	/*
5589	* The SPA calls this callback for each physical write that happens on behalf
5590	* of a logical write. See the comment in dbuf_write_physdone() for details.
5591	*/
5592	static void
5593	arc_write_physdone(zio_t *zio)
5594	{
5595	arc_write_callback_t *cb = zio->io_private;
5596	if (cb->awcb_physdone != NULL)
5597	cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
5598	}
5599
5600	static void
5601	arc_write_done(zio_t *zio)
5602	{
5603	arc_write_callback_t *callback = zio->io_private;
5604	arc_buf_t *buf = callback->awcb_buf;
5605	arc_buf_hdr_t *hdr = buf->b_hdr;
5606
5607	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5608
5609	if (zio->io_error == `0`) {
5610	arc_hdr_verify(hdr, zio->io_bp);
5611
5612	if (BP_IS_HOLE(zio->io_bp) \|\| BP_IS_EMBEDDED(zio->io_bp)) {
5613	buf_discard_identity(hdr);
5614	} else {
5615	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
5616	hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
5617	}
5618	} else {
5619	ASSERT(HDR_EMPTY(hdr));
5620	}
5621
5622	/*
5623	* If the block to be written was all-zero or compressed enough to be
5624	* embedded in the BP, no write was performed so there will be no
5625	* dva/birth/checksum. The buffer must therefore remain anonymous
5626	* (and uncached).
5627	*/
5628	if (!HDR_EMPTY(hdr)) {
5629	arc_buf_hdr_t *exists;
5630	kmutex_t *hash_lock;
5631
5632	ASSERT(zio->io_error == `0`);
5633
5634	arc_cksum_verify(buf);
5635
5636	exists = buf_hash_insert(hdr, &hash_lock);
5637	if (exists != NULL) {
5638	/*
5639	* This can only happen if we overwrite for
5640	* sync-to-convergence, because we remove
5641	* buffers from the hash table when we arc_free().
5642	*/
5643	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5644	if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5645	panic("bad overwrite, hdr=%p exists=%p",
5646	(void )hdr, (void* *)exists);
5647	ASSERT(refcount_is_zero(
5648	&exists->b_l1hdr.b_refcnt));
5649	arc_change_state(arc_anon, exists, hash_lock);
5650	mutex_exit(hash_lock);
5651	arc_hdr_destroy(exists);
5652	exists = buf_hash_insert(hdr, &hash_lock);
5653	ASSERT3P(exists, ==, NULL);
5654	} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5655	/ nopwrite /
5656	ASSERT(zio->io_prop.zp_nopwrite);
5657	if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5658	panic("bad nopwrite, hdr=%p exists=%p",
5659	(void )hdr, (void* *)exists);
5660	} else {
5661	/ Dedup /
5662	ASSERT(hdr->b_l1hdr.b_bufcnt == `1`);
5663	ASSERT(hdr->b_l1hdr.b_state == arc_anon);
5664	ASSERT(BP_GET_DEDUP(zio->io_bp));
5665	ASSERT(BP_GET_LEVEL(zio->io_bp) == `0`);
5666	}
5667	}
5668	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5669	/ if it's not anon, we are doing a scrub /
5670	if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
5671	arc_access(hdr, hash_lock);
5672	mutex_exit(hash_lock);
5673	} else {
5674	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5675	}
5676
5677	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5678	callback->awcb_done(zio, buf, callback->awcb_private);
5679
5680	kmem_free(callback, sizeof (arc_write_callback_t));
5681	}
5682
5683	zio_t *
5684	arc_write(zio_t pio, spa_t spa, uint64_t txg, blkptr_t bp, arc_buf_t buf,
5685	boolean_t l2arc, const zio_prop_t zp, arc_done_func_t ready,
5686	arc_done_func_t children_ready, arc_done_func_t physdone,
5687	arc_done_func_t done, void* *private, zio_priority_t priority,
5688	int zio_flags, const zbookmark_phys_t *zb)
5689	{
5690	arc_buf_hdr_t *hdr = buf->b_hdr;
5691	arc_write_callback_t *callback;
5692	zio_t *zio;
5693
5694	ASSERT3P(ready, !=, NULL);
5695	ASSERT3P(done, !=, NULL);
5696	ASSERT(!HDR_IO_ERROR(hdr));
5697	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5698	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5699	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, `0`);
5700	if (l2arc)
5701	arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5702	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
5703	callback->awcb_ready = ready;
5704	callback->awcb_children_ready = children_ready;
5705	callback->awcb_physdone = physdone;
5706	callback->awcb_done = done;
5707	callback->awcb_private = private;
5708	callback->awcb_buf = buf;
5709
5710	/*
5711	* The hdr's b_pdata is now stale, free it now. A new data block
5712	* will be allocated when the zio pipeline calls arc_write_ready().
5713	*/
5714	if (hdr->b_l1hdr.b_pdata != NULL) {
5715	/*
5716	* If the buf is currently sharing the data block with
5717	* the hdr then we need to break that relationship here.
5718	* The hdr will remain with a NULL data pointer and the
5719	* buf will take sole ownership of the block.
5720	*/
5721	if (arc_buf_is_shared(buf)) {
5722	ASSERT(ARC_BUF_LAST(buf));
5723	arc_unshare_buf(hdr, buf);
5724	} else {
5725	arc_hdr_free_pdata(hdr);
5726	}
5727	VERIFY3P(buf->b_data, !=, NULL);
5728	arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
5729	}
5730	ASSERT(!arc_buf_is_shared(buf));
5731	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
5732
5733	zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
5734	arc_write_ready,
5735	(children_ready != NULL) ? arc_write_children_ready : NULL,
5736	arc_write_physdone, arc_write_done, callback,
5737	priority, zio_flags, zb);
5738
5739	return (zio);
5740	}
5741
/*
 * Decide whether a reservation of 'reserve' bytes in transaction group
 * 'txg' should be throttled because memory is tight.
 *
 * Returns 0 when the caller may proceed, ERESTART when the pagedaemon has
 * already accumulated too much page load, and EAGAIN when another thread
 * should back off because page load is pending and the ARC needs to
 * reclaim.  Outside the kernel this always returns 0.
 *
 * page_load and last_txg are function-static and updated without any
 * locking in this function.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	uint64_t available_memory = ptob(freemem);
	static uint64_t page_load = 0;
	static uint64_t last_txg = 0;

#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	available_memory =
	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
#endif

	/* Plenty of free memory: no throttling at all. */
	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
		return (0);

	/* The accumulated page load is tracked per txg. */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}
	/*
	 * If we are in pageout, we know that memory is already tight,
	 * the arc is already going to be evicting, so we just want to
	 * continue to let page writes occur as quickly as possible.
	 */
	if (curlwp == uvm.pagedaemon_lwp) {
		if (page_load > MAX(ptob(minfree), available_memory) / 4)
			return (SET_ERROR(ERESTART));
		/* Note: reserve is inflated, so we deflate */
		page_load += reserve / 8;
		return (0);
	} else if (page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (SET_ERROR(EAGAIN));
	}
	page_load = 0;
#endif
	return (0);
}
5782
5783	void
5784	arc_tempreserve_clear(uint64_t reserve)
5785	{
5786	atomic_add_64(&arc_tempreserve, -reserve);
5787	ASSERT((int64_t)arc_tempreserve >= `0`);
5788	}
5789
5790	int
5791	arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5792	{
5793	int error;
5794	uint64_t anon_size;
5795
5796	if (reserve > arc_c/`4` && !arc_no_grow) {
5797	arc_c = MIN(arc_c_max, reserve * `4`);
5798	DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
5799	}
5800	if (reserve > arc_c)
5801	return (SET_ERROR(ENOMEM));
5802
5803	/*
5804	* Don't count loaned bufs as in flight dirty data to prevent long
5805	* network delays from blocking transactions that are ready to be
5806	* assigned to a txg.
5807	*/
5808	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5809	arc_loaned_bytes), `0`);
5810
5811	/*
5812	* Writes will, almost always, require additional memory allocations
5813	* in order to compress/encrypt/etc the data. We therefore need to
5814	* make sure that there is sufficient available memory for this.
5815	*/
5816	error = arc_memory_throttle(reserve, txg);
5817	if (error != `0`)
5818	return (error);
5819
5820	/*
5821	* Throttle writes when the amount of dirty data in the cache
5822	* gets too large. We try to keep the cache less than half full
5823	* of dirty blocks so that our sync times don't grow too large.
5824	* Note: if two requests come in concurrently, we might let them
5825	* both succeed, when one of them should fail. Not a huge deal.
5826	*/
5827
5828	if (reserve + arc_tempreserve + anon_size > arc_c / `2` &&
5829	anon_size > arc_c / `4`) {
5830	uint64_t meta_esize =
5831	refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
5832	uint64_t data_esize =
5833	refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
5834	dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5835	"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5836	arc_tempreserve >> `10`, meta_esize >> `10`,
5837	data_esize >> `10`, reserve >> `10`, arc_c >> `10`);
5838	return (SET_ERROR(ERESTART));
5839	}
5840	atomic_add_64(&arc_tempreserve, reserve);
5841	return (`0`);
5842	}
5843
5844	static void
5845	arc_kstat_update_state(arc_state_t state, kstat_named_t size,
5846	kstat_named_t evict_data, kstat_named_t evict_metadata)
5847	{
5848	size->value.ui64 = refcount_count(&state->arcs_size);
5849	evict_data->value.ui64 =
5850	refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
5851	evict_metadata->value.ui64 =
5852	refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
5853	}
5854
5855	static int
5856	arc_kstat_update(kstat_t ksp, int* rw)
5857	{
5858	arc_stats_t *as = ksp->ks_data;
5859
5860	if (rw == KSTAT_WRITE) {
5861	return (EACCES);
5862	} else {
5863	arc_kstat_update_state(arc_anon,
5864	&as->arcstat_anon_size,
5865	&as->arcstat_anon_evictable_data,
5866	&as->arcstat_anon_evictable_metadata);
5867	arc_kstat_update_state(arc_mru,
5868	&as->arcstat_mru_size,
5869	&as->arcstat_mru_evictable_data,
5870	&as->arcstat_mru_evictable_metadata);
5871	arc_kstat_update_state(arc_mru_ghost,
5872	&as->arcstat_mru_ghost_size,
5873	&as->arcstat_mru_ghost_evictable_data,
5874	&as->arcstat_mru_ghost_evictable_metadata);
5875	arc_kstat_update_state(arc_mfu,
5876	&as->arcstat_mfu_size,
5877	&as->arcstat_mfu_evictable_data,
5878	&as->arcstat_mfu_evictable_metadata);
5879	arc_kstat_update_state(arc_mfu_ghost,
5880	&as->arcstat_mfu_ghost_size,
5881	&as->arcstat_mfu_ghost_evictable_data,
5882	&as->arcstat_mfu_ghost_evictable_metadata);
5883	}
5884
5885	return (`0`);
5886	}
5887
5888	/*
5889	* This function must return indices evenly distributed between all
5890	* sublists of the multilist. This is needed due to how the ARC eviction
5891	* code is laid out; arc_evict_state() assumes ARC buffers are evenly
5892	* distributed between all sublists and uses this assumption when
5893	* deciding which sublist to evict from and how much to evict from it.
5894	*/
5895	unsigned int
5896	arc_state_multilist_index_func(multilist_t ml, void* *obj)
5897	{
5898	arc_buf_hdr_t *hdr = obj;
5899
5900	/*
5901	* We rely on b_dva to generate evenly distributed index
5902	* numbers using buf_hash below. So, as an added precaution,
5903	* let's make sure we never add empty buffers to the arc lists.
5904	*/
5905	ASSERT(!HDR_EMPTY(hdr));
5906
5907	/*
5908	* The assumption here, is the hash value for a given
5909	* arc_buf_hdr_t will remain constant throughout it's lifetime
5910	* (i.e. it's b_spa, b_dva, and b_birth fields don't change).
5911	* Thus, we don't need to store the header's sublist index
5912	* on insertion, as this index can be recalculated on removal.
5913	*
5914	* Also, the low order bits of the hash value are thought to be
5915	* distributed evenly. Otherwise, in the case that the multilist
5916	* has a power of two number of sublists, each sublists' usage
5917	* would not be evenly distributed.
5918	*/
5919	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5920	multilist_get_num_sublists(ml));
5921	}
5922
5923	#ifdef _KERNEL
5924	#ifdef __FreeBSD__
5925	static eventhandler_tag arc_event_lowmem = NULL;
5926	#endif
5927
5928	static void
5929	arc_lowmem(void arg __unused, int* howto __unused)
5930	{
5931
5932	mutex_enter(&arc_reclaim_lock);
5933	/ XXX: Memory deficit should be passed as argument. /
5934	needfree = btoc(arc_c >> arc_shrink_shift);
5935	DTRACE_PROBE(arc__needfree);
5936	cv_signal(&arc_reclaim_thread_cv);
5937
5938	/*
5939	* It is unsafe to block here in arbitrary threads, because we can come
5940	* here from ARC itself and may hold ARC locks and thus risk a deadlock
5941	* with ARC reclaim thread.
5942	*/
5943	if (curlwp == uvm.pagedaemon_lwp)
5944	(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
5945	mutex_exit(&arc_reclaim_lock);
5946	}
5947	#endif
5948
5949	static void
5950	arc_state_init(void)
5951	{
5952	arc_anon = &ARC_anon;
5953	arc_mru = &ARC_mru;
5954	arc_mru_ghost = &ARC_mru_ghost;
5955	arc_mfu = &ARC_mfu;
5956	arc_mfu_ghost = &ARC_mfu_ghost;
5957	arc_l2c_only = &ARC_l2c_only;
5958
5959	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
5960	sizeof (arc_buf_hdr_t),
5961	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5962	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5963	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
5964	sizeof (arc_buf_hdr_t),
5965	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5966	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5967	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
5968	sizeof (arc_buf_hdr_t),
5969	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5970	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5971	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
5972	sizeof (arc_buf_hdr_t),
5973	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5974	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5975	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
5976	sizeof (arc_buf_hdr_t),
5977	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5978	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5979	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
5980	sizeof (arc_buf_hdr_t),
5981	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5982	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5983	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
5984	sizeof (arc_buf_hdr_t),
5985	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5986	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5987	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
5988	sizeof (arc_buf_hdr_t),
5989	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5990	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5991	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
5992	sizeof (arc_buf_hdr_t),
5993	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5994	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5995	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
5996	sizeof (arc_buf_hdr_t),
5997	offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5998	zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5999
6000	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6001	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6002	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
6003	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
6004	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
6005	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
6006	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
6007	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
6008	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
6009	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
6010	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
6011	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
6012
6013	refcount_create(&arc_anon->arcs_size);
6014	refcount_create(&arc_mru->arcs_size);
6015	refcount_create(&arc_mru_ghost->arcs_size);
6016	refcount_create(&arc_mfu->arcs_size);
6017	refcount_create(&arc_mfu_ghost->arcs_size);
6018	refcount_create(&arc_l2c_only->arcs_size);
6019	}
6020
6021	static void
6022	arc_state_fini(void)
6023	{
6024	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6025	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6026	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
6027	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
6028	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
6029	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
6030	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
6031	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
6032	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
6033	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
6034	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
6035	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
6036
6037	refcount_destroy(&arc_anon->arcs_size);
6038	refcount_destroy(&arc_mru->arcs_size);
6039	refcount_destroy(&arc_mru_ghost->arcs_size);
6040	refcount_destroy(&arc_mfu->arcs_size);
6041	refcount_destroy(&arc_mfu_ghost->arcs_size);
6042	refcount_destroy(&arc_l2c_only->arcs_size);
6043
6044	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
6045	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
6046	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
6047	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
6048	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
6049	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
6050	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
6051	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
6052	}
6053
6054	uint64_t
6055	arc_max_bytes(void)
6056	{
6057	return (arc_c_max);
6058	}
6059
6060	void
6061	arc_init(void)
6062	{
6063	int i, prefetch_tunable_set = `0`;
6064
6065	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
6066	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
6067	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
6068
6069	#ifdef __FreeBSD__
6070	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
6071	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
6072	#endif
6073
6074	/ Convert seconds to clock ticks /
6075	arc_min_prefetch_lifespan = `1` * hz;
6076
6077	/ Start out with 1/8 of all memory /
6078	arc_c = kmem_size() / `8`;
6079
6080	#ifdef illumos
6081	#ifdef _KERNEL
6082	/*
6083	* On architectures where the physical memory can be larger
6084	* than the addressable space (intel in 32-bit mode), we may
6085	* need to limit the cache to 1/8 of VM size.
6086	*/
6087	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC \| VMEM_FREE) / `8`);
6088	#endif
6089	#endif /* illumos */
6090	/ set min cache to 1/32 of all memory, or arc_abs_min, whichever is more /
6091	arc_c_min = MAX(arc_c / `4`, arc_abs_min);
6092	/ set max to 1/2 of all memory, or all but 1GB, whichever is more /
6093	if (arc_c * `8` >= `1` << `30`)
6094	arc_c_max = (arc_c * `8`) - (`1` << `30`);
6095	else
6096	arc_c_max = arc_c_min;
6097	arc_c_max = MAX(arc_c * `5`, arc_c_max);
6098
6099	/*
6100	* In userland, there's only the memory pressure that we artificially
6101	* create (see arc_available_memory()). Don't let arc_c get too
6102	* small, because it can cause transactions to be larger than
6103	* arc_c, causing arc_tempreserve_space() to fail.
6104	*/
6105	#ifndef _KERNEL
6106	arc_c_min = arc_c_max / `2`;
6107	#endif
6108
6109	#ifdef _KERNEL
6110	/*
6111	* Allow the tunables to override our calculations if they are
6112	* reasonable.
6113	*/
6114	if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) {
6115	arc_c_max = zfs_arc_max;
6116	arc_c_min = MIN(arc_c_min, arc_c_max);
6117	}
6118	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
6119	arc_c_min = zfs_arc_min;
6120	#endif
6121
6122	arc_c = arc_c_max;
6123	arc_p = (arc_c >> `1`);
6124	arc_size = `0`;
6125
6126	/ limit meta-data to 1/4 of the arc capacity /
6127	arc_meta_limit = arc_c_max / `4`;
6128
6129	/ Allow the tunable to override if it is reasonable /
6130	if (zfs_arc_meta_limit > `0` && zfs_arc_meta_limit <= arc_c_max)
6131	arc_meta_limit = zfs_arc_meta_limit;
6132
6133	if (arc_c_min < arc_meta_limit / `2` && zfs_arc_min == `0`)
6134	arc_c_min = arc_meta_limit / `2`;
6135
6136	if (zfs_arc_meta_min > `0`) {
6137	arc_meta_min = zfs_arc_meta_min;
6138	} else {
6139	arc_meta_min = arc_c_min / `2`;
6140	}
6141
6142	if (zfs_arc_grow_retry > `0`)
6143	arc_grow_retry = zfs_arc_grow_retry;
6144
6145	if (zfs_arc_shrink_shift > `0`)
6146	arc_shrink_shift = zfs_arc_shrink_shift;
6147
6148	/*
6149	* Ensure that arc_no_grow_shift is less than arc_shrink_shift.
6150	*/
6151	if (arc_no_grow_shift >= arc_shrink_shift)
6152	arc_no_grow_shift = arc_shrink_shift - `1`;
6153
6154	if (zfs_arc_p_min_shift > `0`)
6155	arc_p_min_shift = zfs_arc_p_min_shift;
6156
6157	if (zfs_arc_num_sublists_per_state < `1`)
6158	zfs_arc_num_sublists_per_state = MAX(max_ncpus, `1`);
6159
6160	/ if kmem_flags are set, lets try to use less memory /
6161	if (kmem_debugging())
6162	arc_c = arc_c / `2`;
6163	if (arc_c < arc_c_min)
6164	arc_c = arc_c_min;
6165
6166	zfs_arc_min = arc_c_min;
6167	zfs_arc_max = arc_c_max;
6168
6169	arc_state_init();
6170	buf_init();
6171
6172	arc_reclaim_thread_exit = B_FALSE;
6173	#ifdef __FreeBSD__
6174	arc_dnlc_evicts_thread_exit = FALSE;
6175	#endif
6176
6177	arc_ksp = kstat_create("zfs", `0`, "arcstats", "misc", KSTAT_TYPE_NAMED,
6178	sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
6179
6180	if (arc_ksp != NULL) {
6181	arc_ksp->ks_data = &arc_stats;
6182	arc_ksp->ks_update = arc_kstat_update;
6183	kstat_install(arc_ksp);
6184	}
6185
6186	(void) thread_create(NULL, `0`, arc_reclaim_thread, NULL, `0`, &p0,
6187	TS_RUN, minclsyspri);
6188
6189	#ifdef __FreeBSD__
6190	#ifdef _KERNEL
6191	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
6192	EVENTHANDLER_PRI_FIRST);
6193	#endif
6194
6195	(void) thread_create(NULL, `0`, arc_dnlc_evicts_thread, NULL, `0`, &p0,
6196	TS_RUN, minclsyspri);
6197	#endif
6198
6199	arc_dead = B_FALSE;
6200	arc_warm = B_FALSE;
6201
6202	/*
6203	* Calculate maximum amount of dirty data per pool.
6204	*
6205	* If it has been set by /etc/system, take that.
6206	* Otherwise, use a percentage of physical memory defined by
6207	* zfs_dirty_data_max_percent (default 10%) with a cap at
6208	* zfs_dirty_data_max_max (default 4GB).
6209	*/
6210	if (zfs_dirty_data_max == `0`) {
6211	zfs_dirty_data_max = ptob(physmem) *
6212	zfs_dirty_data_max_percent / `100`;
6213	zfs_dirty_data_max = MIN(zfs_dirty_data_max,
6214	zfs_dirty_data_max_max);
6215	}
6216
6217	#ifdef _KERNEL
6218	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
6219	prefetch_tunable_set = `1`;
6220
6221	#ifdef __i386__
6222	if (prefetch_tunable_set == `0`) {
6223	printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
6224	"-- to enable,\n");
6225	printf(" add \"vfs.zfs.prefetch_disable=0\" "
6226	"to /boot/loader.conf.\n");
6227	zfs_prefetch_disable = `1`;
6228	}
6229	#else
6230	if ((((uint64_t)physmem * PAGESIZE) < (`1ULL` << `32`)) &&
6231	prefetch_tunable_set == `0`) {
6232	printf("ZFS NOTICE: Prefetch is disabled by default if less "
6233	"than 4GB of RAM is present;\n"
6234	" to enable, add \"vfs.zfs.prefetch_disable=0\" "
6235	"to /boot/loader.conf.\n");
6236	zfs_prefetch_disable = `1`;
6237	}
6238	#endif
6239	/ Warn about ZFS memory and address space requirements. /
6240	if (((uint64_t)physmem * PAGESIZE) < (`256` + `128` + `64`) * (`1` << `20`)) {
6241	printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
6242	"expect unstable behavior.\n");
6243	}
6244	if (kmem_size() < `512` * (`1` << `20`)) {
6245	printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
6246	"expect unstable behavior.\n");
6247	printf(" Consider tuning vm.kmem_size and "
6248	"vm.kmem_size_max\n");
6249	printf(" in /boot/loader.conf.\n");
6250	}
6251	#endif
6252	}
6253
6254	void
6255	arc_fini(void)
6256	{
6257	mutex_enter(&arc_reclaim_lock);
6258	arc_reclaim_thread_exit = B_TRUE;
6259	/*
6260	* The reclaim thread will set arc_reclaim_thread_exit back to
6261	* B_FALSE when it is finished exiting; we're waiting for that.
6262	*/
6263	while (arc_reclaim_thread_exit) {
6264	cv_signal(&arc_reclaim_thread_cv);
6265	cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
6266	}
6267	mutex_exit(&arc_reclaim_lock);
6268
6269	/ Use B_TRUE to ensure all buffers are evicted /
6270	arc_flush(NULL, B_TRUE);
6271
6272	#ifdef __FreeBSD__
6273	mutex_enter(&arc_dnlc_evicts_lock);
6274	arc_dnlc_evicts_thread_exit = TRUE;
6275
6276	/*
6277	* The user evicts thread will set arc_user_evicts_thread_exit
6278	* to FALSE when it is finished exiting; we're waiting for that.
6279	*/
6280	while (arc_dnlc_evicts_thread_exit) {
6281	cv_signal(&arc_dnlc_evicts_cv);
6282	cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
6283	}
6284	mutex_exit(&arc_dnlc_evicts_lock);
6285
6286	mutex_destroy(&arc_dnlc_evicts_lock);
6287	cv_destroy(&arc_dnlc_evicts_cv);
6288	#endif
6289
6290	arc_dead = B_TRUE;
6291
6292	if (arc_ksp != NULL) {
6293	kstat_delete(arc_ksp);
6294	arc_ksp = NULL;
6295	}
6296
6297	mutex_destroy(&arc_reclaim_lock);
6298	cv_destroy(&arc_reclaim_thread_cv);
6299	cv_destroy(&arc_reclaim_waiters_cv);
6300
6301	arc_state_fini();
6302	buf_fini();
6303
6304	ASSERT0(arc_loaned_bytes);
6305
6306	#ifdef __FreeBSD__
6307	#ifdef _KERNEL
6308	if (arc_event_lowmem != NULL)
6309	EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
6310	#endif
6311	#endif
6312	}
6313
6314	/*
6315	* Level 2 ARC
6316	*
6317	* The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
6318	* It uses dedicated storage devices to hold cached data, which are populated
6319	* using large infrequent writes. The main role of this cache is to boost
6320	* the performance of random read workloads. The intended L2ARC devices
6321	* include short-stroked disks, solid state disks, and other media with
6322	* substantially faster read latency than disk.
6323	*
 *           +-----------------------+
 *           |         ARC           |
 *           +-----------------------+
 *              |         ^     ^
 *              |         |     |
 *   l2arc_feed_thread()  |   arc_read()
 *              |         |     |
 *              |  l2arc read   |
 *              V         |     |
 *           +---------------+  |
 *           |     L2ARC     |  |
 *           +---------------+  |
 *               |    ^         |
 *      l2arc_write() |         |
 *               |    |         |
 *               V    |         |
 *             +-------+      +-------+
 *             | vdev  |      | vdev  |
 *             | cache |      | cache |
 *             +-------+      +-------+
 *             +=========+     .-----.
 *             :  L2ARC  :    |-_____-|
 *             : devices :    | Disks |
 *             +=========+    `-_____-'
6348	*
6349	* Read requests are satisfied from the following sources, in order:
6350	*
6351	* 1) ARC
6352	* 2) vdev cache of L2ARC devices
6353	* 3) L2ARC devices
6354	* 4) vdev cache of disks
6355	* 5) disks
6356	*
6357	* Some L2ARC device types exhibit extremely slow write performance.
6358	* To accommodate for this there are some significant differences between
6359	* the L2ARC and traditional cache design:
6360	*
6361	* 1. There is no eviction path from the ARC to the L2ARC. Evictions from
6362	* the ARC behave as usual, freeing buffers and placing headers on ghost
6363	* lists. The ARC does not send buffers to the L2ARC during eviction as
6364	* this would add inflated write latencies for all ARC memory pressure.
6365	*
6366	* 2. The L2ARC attempts to cache data from the ARC before it is evicted.
6367	* It does this by periodically scanning buffers from the eviction-end of
6368	* the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
6369	* not already there. It scans until a headroom of buffers is satisfied,
6370	* which itself is a buffer for ARC eviction. If a compressible buffer is
6371	* found during scanning and selected for writing to an L2ARC device, we
6372	* temporarily boost scanning headroom during the next scan cycle to make
6373	* sure we adapt to compression effects (which might significantly reduce
6374	* the data volume we write to L2ARC). The thread that does this is
6375	* l2arc_feed_thread(), illustrated below; example sizes are included to
6376	* provide a better sense of ratio than this diagram:
6377	*
 *              head -->                        tail
 *               +---------------------+----------+
 *       ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
 *               +---------------------+----------+   |   o L2ARC eligible
 *       ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
 *               +---------------------+----------+   |
 *                    15.9 Gbytes      ^ 32 Mbytes    |
 *                                  headroom          |
 *                                             l2arc_feed_thread()
 *                                                    |
 *                        l2arc write hand <--[oooo]--'
 *                                |           8 Mbyte
 *                                |          write max
 *                                V
 *               +==============================+
 *     L2ARC dev |####|#|###|###|    |####| ... |
 *               +==============================+
 *                          32 Gbytes
6396	*
6397	* 3. If an ARC buffer is copied to the L2ARC but then hit instead of
6398	* evicted, then the L2ARC has cached a buffer much sooner than it probably
6399	* needed to, potentially wasting L2ARC device bandwidth and storage. It is
6400	* safe to say that this is an uncommon case, since buffers at the end of
6401	* the ARC lists have moved there due to inactivity.
6402	*
6403	* 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
6404	* then the L2ARC simply misses copying some buffers. This serves as a
6405	* pressure valve to prevent heavy read workloads from both stalling the ARC
6406	* with waits and clogging the L2ARC with writes. This also helps prevent
6407	* the potential for the L2ARC to churn if it attempts to cache content too
6408	* quickly, such as during backups of the entire pool.
6409	*
6410	* 5. After system boot and before the ARC has filled main memory, there are
6411	* no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
6412	* lists can remain mostly static. Instead of searching from tail of these
6413	* lists as pictured, the l2arc_feed_thread() will search from the list heads
6414	* for eligible buffers, greatly increasing its chance of finding them.
6415	*
6416	* The L2ARC device write speed is also boosted during this time so that
6417	* the L2ARC warms up faster. Since there have been no ARC evictions yet,
6418	* there are no L2ARC reads, and no fear of degrading read performance
6419	* through increased writes.
6420	*
6421	* 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
6422	* the vdev queue can aggregate them into larger and fewer writes. Each
6423	* device is written to in a rotor fashion, sweeping writes through
6424	* available space then repeating.
6425	*
6426	* 7. The L2ARC does not store dirty content. It never needs to flush
6427	* write buffers back to disk based storage.
6428	*
6429	* 8. If an ARC buffer is written (and dirtied) which also exists in the
6430	* L2ARC, the now stale L2ARC buffer is immediately dropped.
6431	*
6432	* The performance of the L2ARC can be tweaked by a number of tunables, which
6433	* may be necessary for different workloads:
6434	*
6435	* l2arc_write_max max write bytes per interval
6436	* l2arc_write_boost extra write bytes during device warmup
6437	* l2arc_noprefetch skip caching prefetched buffers
6438	* l2arc_headroom number of max device writes to precache
6439	* l2arc_headroom_boost when we find compressed buffers during ARC
6440	* scanning, we multiply headroom by this
6441	* percentage factor for the next scan cycle,
6442	* since more compressed buffers are likely to
6443	* be present
6444	* l2arc_feed_secs seconds between L2ARC writing
6445	*
6446	* Tunables may be removed or added as future performance improvements are
6447	* integrated, and also may become zpool properties.
6448	*
6449	* There are three key functions that control how the L2ARC warms up:
6450	*
6451	* l2arc_write_eligible() check if a buffer is eligible to cache
6452	* l2arc_write_size() calculate how much to write
6453	* l2arc_write_interval() calculate sleep delay between writes
6454	*
6455	* These three functions determine what to write, how much, and how quickly
6456	* to send writes.
6457	*/
6458
6459	static boolean_t
6460	l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
6461	{
6462	/*
6463	* A buffer is not eligible for the L2ARC if it:
6464	* 1. belongs to a different spa.
6465	* 2. is already cached on the L2ARC.
6466	* 3. has an I/O in progress (it may be an incomplete read).
6467	* 4. is flagged not eligible (zfs property).
6468	*/
6469	if (hdr->b_spa != spa_guid) {
6470	ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
6471	return (B_FALSE);
6472	}
6473	if (HDR_HAS_L2HDR(hdr)) {
6474	ARCSTAT_BUMP(arcstat_l2_write_in_l2);
6475	return (B_FALSE);
6476	}
6477	if (HDR_IO_IN_PROGRESS(hdr)) {
6478	ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
6479	return (B_FALSE);
6480	}
6481	if (!HDR_L2CACHE(hdr)) {
6482	ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
6483	return (B_FALSE);
6484	}
6485
6486	return (B_TRUE);
6487	}
6488
6489	static uint64_t
6490	l2arc_write_size(void)
6491	{
6492	uint64_t size;
6493
6494	/*
6495	* Make sure our globals have meaningful values in case the user
6496	* altered them.
6497	*/
6498	size = l2arc_write_max;
6499	if (size == `0`) {
6500	cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
6501	"be greater than zero, resetting it to the default (%d)",
6502	L2ARC_WRITE_SIZE);
6503	size = l2arc_write_max = L2ARC_WRITE_SIZE;
6504	}
6505
6506	if (arc_warm == B_FALSE)
6507	size += l2arc_write_boost;
6508
6509	return (size);
6510
6511	}
6512
6513	static clock_t
6514	l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
6515	{
6516	clock_t interval, next, now;
6517
6518	/*
6519	* If the ARC lists are busy, increase our write rate; if the
6520	* lists are stale, idle back. This is achieved by checking
6521	* how much we previously wrote - if it was more than half of
6522	* what we wanted, schedule the next write much sooner.
6523	*/
6524	if (l2arc_feed_again && wrote > (wanted / `2`))
6525	interval = (hz * l2arc_feed_min_ms) / `1000`;
6526	else
6527	interval = hz * l2arc_feed_secs;
6528
6529	now = ddi_get_lbolt();
6530	next = MAX(now, MIN(now + interval, began + interval));
6531
6532	return (next);
6533	}
6534
6535	/*
6536	* Cycle through L2ARC devices. This is how L2ARC load balances.
6537	* If a device is returned, this also returns holding the spa config lock.
6538	*/
6539	static l2arc_dev_t *
6540	l2arc_dev_get_next(void)
6541	{
6542	l2arc_dev_t first, next = NULL;
6543
6544	/*
6545	* Lock out the removal of spas (spa_namespace_lock), then removal
6546	* of cache devices (l2arc_dev_mtx). Once a device has been selected,
6547	* both locks will be dropped and a spa config lock held instead.
6548	*/
6549	mutex_enter(&spa_namespace_lock);
6550	mutex_enter(&l2arc_dev_mtx);
6551
6552	/ if there are no vdevs, there is nothing to do /
6553	if (l2arc_ndev == `0`)
6554	goto out;
6555
6556	first = NULL;
6557	next = l2arc_dev_last;
6558	do {
6559	/ loop around the list looking for a non-faulted vdev /
6560	if (next == NULL) {
6561	next = list_head(l2arc_dev_list);
6562	} else {
6563	next = list_next(l2arc_dev_list, next);
6564	if (next == NULL)
6565	next = list_head(l2arc_dev_list);
6566	}
6567
6568	/ if we have come back to the start, bail out /
6569	if (first == NULL)
6570	first = next;
6571	else if (next == first)
6572	break;
6573
6574	} while (vdev_is_dead(next->l2ad_vdev));
6575
6576	/ if we were unable to find any usable vdevs, return NULL /
6577	if (vdev_is_dead(next->l2ad_vdev))
6578	next = NULL;
6579
6580	l2arc_dev_last = next;
6581
6582	out:
6583	mutex_exit(&l2arc_dev_mtx);
6584
6585	/*
6586	* Grab the config lock to prevent the 'next' device from being
6587	* removed while we are writing to it.
6588	*/
6589	if (next != NULL)
6590	spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
6591	mutex_exit(&spa_namespace_lock);
6592
6593	return (next);
6594	}
6595
6596	/*
6597	* Free buffers that were tagged for destruction.
6598	*/
6599	static void
6600	l2arc_do_free_on_write()
6601	{
6602	list_t *buflist;
6603	l2arc_data_free_t df, df_prev;
6604
6605	mutex_enter(&l2arc_free_on_write_mtx);
6606	buflist = l2arc_free_on_write;
6607
6608	for (df = list_tail(buflist); df; df = df_prev) {
6609	df_prev = list_prev(buflist, df);
6610	ASSERT3P(df->l2df_data, !=, NULL);
6611	if (df->l2df_type == ARC_BUFC_METADATA) {
6612	zio_buf_free(df->l2df_data, df->l2df_size);
6613	} else {
6614	ASSERT(df->l2df_type == ARC_BUFC_DATA);
6615	zio_data_buf_free(df->l2df_data, df->l2df_size);
6616	}
6617	list_remove(buflist, df);
6618	kmem_free(df, sizeof (l2arc_data_free_t));
6619	}
6620
6621	mutex_exit(&l2arc_free_on_write_mtx);
6622	}
6623
6624	/*
6625	* A write to a cache device has completed. Update all headers to allow
6626	* reads from these buffers to begin.
6627	*/
6628	static void
6629	l2arc_write_done(zio_t *zio)
6630	{
6631	l2arc_write_callback_t *cb;
6632	l2arc_dev_t *dev;
6633	list_t *buflist;
6634	arc_buf_hdr_t head, hdr, *hdr_prev;
6635	kmutex_t *hash_lock;
6636	int64_t bytes_dropped = `0`;
6637
6638	cb = zio->io_private;
6639	ASSERT3P(cb, !=, NULL);
6640	dev = cb->l2wcb_dev;
6641	ASSERT3P(dev, !=, NULL);
6642	head = cb->l2wcb_head;
6643	ASSERT3P(head, !=, NULL);
6644	buflist = &dev->l2ad_buflist;
6645	ASSERT3P(buflist, !=, NULL);
6646	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
6647	l2arc_write_callback_t *, cb);
6648
6649	if (zio->io_error != `0`)
6650	ARCSTAT_BUMP(arcstat_l2_writes_error);
6651
6652	/*
6653	* All writes completed, or an error was hit.
6654	*/
6655	top:
6656	mutex_enter(&dev->l2ad_mtx);
6657	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
6658	hdr_prev = list_prev(buflist, hdr);
6659
6660	hash_lock = HDR_LOCK(hdr);
6661
6662	/*
6663	* We cannot use mutex_enter or else we can deadlock
6664	* with l2arc_write_buffers (due to swapping the order
6665	* the hash lock and l2ad_mtx are taken).
6666	*/
6667	if (!mutex_tryenter(hash_lock)) {
6668	/*
6669	* Missed the hash lock. We must retry so we
6670	* don't leave the ARC_FLAG_L2_WRITING bit set.
6671	*/
6672	ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
6673
6674	/*
6675	* We don't want to rescan the headers we've
6676	* already marked as having been written out, so
6677	* we reinsert the head node so we can pick up
6678	* where we left off.
6679	*/
6680	list_remove(buflist, head);
6681	list_insert_after(buflist, hdr, head);
6682
6683	mutex_exit(&dev->l2ad_mtx);
6684
6685	/*
6686	* We wait for the hash lock to become available
6687	* to try and prevent busy waiting, and increase
6688	* the chance we'll be able to acquire the lock
6689	* the next time around.
6690	*/
6691	mutex_enter(hash_lock);
6692	mutex_exit(hash_lock);
6693	goto top;
6694	}
6695
6696	/*
6697	* We could not have been moved into the arc_l2c_only
6698	* state while in-flight due to our ARC_FLAG_L2_WRITING
6699	* bit being set. Let's just ensure that's being enforced.
6700	*/
6701	ASSERT(HDR_HAS_L1HDR(hdr));
6702
6703	if (zio->io_error != `0`) {
6704	/*
6705	* Error - drop L2ARC entry.
6706	*/
6707	list_remove(buflist, hdr);
6708	l2arc_trim(hdr);
6709	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
6710
6711	ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr));
6712	ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
6713
6714	bytes_dropped += arc_hdr_size(hdr);
6715	(void) refcount_remove_many(&dev->l2ad_alloc,
6716	arc_hdr_size(hdr), hdr);
6717	}
6718
6719	/*
6720	* Allow ARC to begin reads and ghost list evictions to
6721	* this L2ARC entry.
6722	*/
6723	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
6724
6725	mutex_exit(hash_lock);
6726	}
6727
6728	atomic_inc_64(&l2arc_writes_done);
6729	list_remove(buflist, head);
6730	ASSERT(!HDR_HAS_L1HDR(head));
6731	kmem_cache_free(hdr_l2only_cache, head);
6732	mutex_exit(&dev->l2ad_mtx);
6733
6734	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, `0`, `0`);
6735
6736	l2arc_do_free_on_write();
6737
6738	kmem_free(cb, sizeof (l2arc_write_callback_t));
6739	}
6740
6741	/*
6742	* A read to a cache device completed. Validate buffer contents before
6743	* handing over to the regular ARC routines.
6744	*/
6745	static void
6746	l2arc_read_done(zio_t *zio)
6747	{
6748	l2arc_read_callback_t *cb;
6749	arc_buf_hdr_t *hdr;
6750	kmutex_t *hash_lock;
6751	boolean_t valid_cksum;
6752
6753	ASSERT3P(zio->io_vd, !=, NULL);
6754	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
6755
6756	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
6757
6758	cb = zio->io_private;
6759	ASSERT3P(cb, !=, NULL);
6760	hdr = cb->l2rcb_hdr;
6761	ASSERT3P(hdr, !=, NULL);
6762
6763	hash_lock = HDR_LOCK(hdr);
6764	mutex_enter(hash_lock);
6765	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6766
6767	/*
6768	* If the data was read into a temporary buffer,
6769	* move it and free the buffer.
6770	*/
6771	if (cb->l2rcb_data != NULL) {
6772	ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
6773	if (zio->io_error == `0`) {
6774	bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata,
6775	arc_hdr_size(hdr));
6776	}
6777
6778	/*
6779	* The following must be done regardless of whether
6780	* there was an error:
6781	* - free the temporary buffer
6782	* - point zio to the real ARC buffer
6783	* - set zio size accordingly
6784	* These are required because zio is either re-used for
6785	* an I/O of the block in the case of the error
6786	* or the zio is passed to arc_read_done() and it
6787	* needs real data.
6788	*/
6789	zio_data_buf_free(cb->l2rcb_data, zio->io_size);
6790	zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
6791	zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata;
6792	}
6793
6794	ASSERT3P(zio->io_data, !=, NULL);
6795
6796	/*
6797	* Check this survived the L2ARC journey.
6798	*/
6799	ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
6800	zio->io_bp_copy = cb->l2rcb_bp; / XXX fix in L2ARC 2.0 /
6801	zio->io_bp = &zio->io_bp_copy; / XXX fix in L2ARC 2.0 /
6802
6803	valid_cksum = arc_cksum_is_equal(hdr, zio);
6804	if (valid_cksum && zio->io_error == `0` && !HDR_L2_EVICTED(hdr)) {
6805	mutex_exit(hash_lock);
6806	zio->io_private = hdr;
6807	arc_read_done(zio);
6808	} else {
6809	mutex_exit(hash_lock);
6810	/*
6811	* Buffer didn't survive caching. Increment stats and
6812	* reissue to the original storage device.
6813	*/
6814	if (zio->io_error != `0`) {
6815	ARCSTAT_BUMP(arcstat_l2_io_error);
6816	} else {
6817	zio->io_error = SET_ERROR(EIO);
6818	}
6819	if (!valid_cksum)
6820	ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6821
6822	/*
6823	* If there's no waiter, issue an async i/o to the primary
6824	* storage now. If there is a waiter, the caller must
6825	* issue the i/o in a context where it's OK to block.
6826	*/
6827	if (zio->io_waiter == NULL) {
6828	zio_t *pio = zio_unique_parent(zio);
6829
6830	ASSERT(!pio \|\| pio->io_child_type == ZIO_CHILD_LOGICAL);
6831
6832	zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
6833	hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
6834	hdr, zio->io_priority, cb->l2rcb_flags,
6835	&cb->l2rcb_zb));
6836	}
6837	}
6838
6839	kmem_free(cb, sizeof (l2arc_read_callback_t));
6840	}
6841
6842	/*
6843	* This is the list priority from which the L2ARC will search for pages to
6844	* cache. This is used within loops (0..3) to cycle through lists in the
6845	* desired order. This order can have a significant effect on cache
6846	* performance.
6847	*
6848	* Currently the metadata lists are hit first, MFU then MRU, followed by
6849	* the data lists. This function returns a locked list, and also returns
6850	* the lock pointer.
6851	*/
6852	static multilist_sublist_t *
6853	l2arc_sublist_lock(int list_num)
6854	{
6855	multilist_t *ml = NULL;
6856	unsigned int idx;
6857
6858	ASSERT(list_num >= `0` && list_num <= `3`);
6859
6860	switch (list_num) {
6861	case `0`:
6862	ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
6863	break;
6864	case `1`:
6865	ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
6866	break;
6867	case `2`:
6868	ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
6869	break;
6870	case `3`:
6871	ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
6872	break;
6873	}
6874
6875	/*
6876	* Return a randomly-selected sublist. This is acceptable
6877	* because the caller feeds only a little bit of data for each
6878	* call (8MB). Subsequent calls will result in different
6879	* sublists being selected.
6880	*/
6881	idx = multilist_get_random_index(ml);
6882	return (multilist_sublist_lock(ml, idx));
6883	}
6884
6885	/*
6886	* Evict buffers from the device write hand to the distance specified in
6887	* bytes. This distance may span populated buffers, it may span nothing.
6888	* This is clearing a region on the L2ARC device ready for writing.
6889	* If the 'all' boolean is set, every buffer is evicted.
6890	*/
6891	static void
6892	l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6893	{
6894	list_t *buflist;
6895	arc_buf_hdr_t hdr, hdr_prev;
6896	kmutex_t *hash_lock;
6897	uint64_t taddr;
6898
6899	buflist = &dev->l2ad_buflist;
6900
6901	if (!all && dev->l2ad_first) {
6902	/*
6903	* This is the first sweep through the device. There is
6904	* nothing to evict.
6905	*/
6906	return;
6907	}
6908
6909	if (dev->l2ad_hand >= (dev->l2ad_end - (`2` * distance))) {
6910	/*
6911	* When nearing the end of the device, evict to the end
6912	* before the device write hand jumps to the start.
6913	*/
6914	taddr = dev->l2ad_end;
6915	} else {
6916	taddr = dev->l2ad_hand + distance;
6917	}
6918	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t , dev, list_t , buflist,
6919	uint64_t, taddr, boolean_t, all);
6920
6921	top:
6922	mutex_enter(&dev->l2ad_mtx);
6923	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6924	hdr_prev = list_prev(buflist, hdr);
6925
6926	hash_lock = HDR_LOCK(hdr);
6927
6928	/*
6929	* We cannot use mutex_enter or else we can deadlock
6930	* with l2arc_write_buffers (due to swapping the order
6931	* the hash lock and l2ad_mtx are taken).
6932	*/
6933	if (!mutex_tryenter(hash_lock)) {
6934	/*
6935	* Missed the hash lock. Retry.
6936	*/
6937	ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
6938	mutex_exit(&dev->l2ad_mtx);
6939	mutex_enter(hash_lock);
6940	mutex_exit(hash_lock);
6941	goto top;
6942	}
6943
6944	if (HDR_L2_WRITE_HEAD(hdr)) {
6945	/*
6946	* We hit a write head node. Leave it for
6947	* l2arc_write_done().
6948	*/
6949	list_remove(buflist, hdr);
6950	mutex_exit(hash_lock);
6951	continue;
6952	}
6953
6954	if (!all && HDR_HAS_L2HDR(hdr) &&
6955	(hdr->b_l2hdr.b_daddr >= taddr \|\|
6956	hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
6957	/*
6958	* We've evicted to the target address,
6959	* or the end of the device.
6960	*/
6961	mutex_exit(hash_lock);
6962	break;
6963	}
6964
6965	ASSERT(HDR_HAS_L2HDR(hdr));
6966	if (!HDR_HAS_L1HDR(hdr)) {
6967	ASSERT(!HDR_L2_READING(hdr));
6968	/*
6969	* This doesn't exist in the ARC. Destroy.
6970	* arc_hdr_destroy() will call list_remove()
6971	* and decrement arcstat_l2_size.
6972	*/
6973	arc_change_state(arc_anon, hdr, hash_lock);
6974	arc_hdr_destroy(hdr);
6975	} else {
6976	ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6977	ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
6978	/*
6979	* Invalidate issued or about to be issued
6980	* reads, since we may be about to write
6981	* over this location.
6982	*/
6983	if (HDR_L2_READING(hdr)) {
6984	ARCSTAT_BUMP(arcstat_l2_evict_reading);
6985	arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
6986	}
6987
6988	/ Ensure this header has finished being written /
6989	ASSERT(!HDR_L2_WRITING(hdr));
6990
6991	arc_hdr_l2hdr_destroy(hdr);
6992	}
6993	mutex_exit(hash_lock);
6994	}
6995	mutex_exit(&dev->l2ad_mtx);
6996	}
6997
6998	/*
6999	* Find and write ARC buffers to the L2ARC device.
7000	*
7001	* An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
7002	* for reading until they have completed writing.
7003	* The headroom_boost is an in-out parameter used to maintain headroom boost
7004	* state between calls to this function.
7005	*
7006	* Returns the number of bytes actually written (which may be smaller than
7007	* the delta by which the device hand has changed due to alignment).
7008	*/
7009	static uint64_t
7010	l2arc_write_buffers(spa_t spa, l2arc_dev_t dev, uint64_t target_sz)
7011	{
7012	arc_buf_hdr_t hdr, hdr_prev, *head;
7013	uint64_t write_asize, write_psize, write_sz, headroom;
7014	boolean_t full;
7015	l2arc_write_callback_t *cb;
7016	zio_t pio, wzio;
7017	uint64_t guid = spa_load_guid(spa);
7018	int try;
7019
7020	ASSERT3P(dev->l2ad_vdev, !=, NULL);
7021
7022	pio = NULL;
7023	write_sz = write_asize = write_psize = `0`;
7024	full = B_FALSE;
7025	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
7026	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD \| ARC_FLAG_HAS_L2HDR);
7027
7028	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
7029	/*
7030	* Copy buffers for L2ARC writing.
7031	*/
7032	for (try = `0`; try <= `3`; try++) {
7033	multilist_sublist_t *mls = l2arc_sublist_lock(try);
7034	uint64_t passed_sz = `0`;
7035
7036	ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
7037
7038	/*
7039	* L2ARC fast warmup.
7040	*
7041	* Until the ARC is warm and starts to evict, read from the
7042	* head of the ARC lists rather than the tail.
7043	*/
7044	if (arc_warm == B_FALSE)
7045	hdr = multilist_sublist_head(mls);
7046	else
7047	hdr = multilist_sublist_tail(mls);
7048	if (hdr == NULL)
7049	ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
7050
7051	headroom = target_sz * l2arc_headroom;
7052	if (zfs_compressed_arc_enabled)
7053	headroom = (headroom * l2arc_headroom_boost) / `100`;
7054
7055	for (; hdr; hdr = hdr_prev) {
7056	kmutex_t *hash_lock;
7057
7058	if (arc_warm == B_FALSE)
7059	hdr_prev = multilist_sublist_next(mls, hdr);
7060	else
7061	hdr_prev = multilist_sublist_prev(mls, hdr);
7062	ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
7063	HDR_GET_LSIZE(hdr));
7064
7065	hash_lock = HDR_LOCK(hdr);
7066	if (!mutex_tryenter(hash_lock)) {
7067	ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
7068	/*
7069	* Skip this buffer rather than waiting.
7070	*/
7071	continue;
7072	}
7073
7074	passed_sz += HDR_GET_LSIZE(hdr);
7075	if (passed_sz > headroom) {
7076	/*
7077	* Searched too far.
7078	*/
7079	mutex_exit(hash_lock);
7080	ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
7081	break;
7082	}
7083
7084	if (!l2arc_write_eligible(guid, hdr)) {
7085	mutex_exit(hash_lock);
7086	continue;
7087	}
7088
7089	/*
7090	* We rely on the L1 portion of the header below, so
7091	* it's invalid for this header to have been evicted out
7092	* of the ghost cache, prior to being written out. The
7093	* ARC_FLAG_L2_WRITING bit ensures this won't happen.
7094	*/
7095	ASSERT(HDR_HAS_L1HDR(hdr));
7096
7097	ASSERT3U(HDR_GET_PSIZE(hdr), >, `0`);
7098	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
7099	ASSERT3U(arc_hdr_size(hdr), >, `0`);
7100	uint64_t size = arc_hdr_size(hdr);
7101	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
7102	size);
7103
7104	if ((write_psize + asize) > target_sz) {
7105	full = B_TRUE;
7106	mutex_exit(hash_lock);
7107	ARCSTAT_BUMP(arcstat_l2_write_full);
7108	break;
7109	}
7110
7111	if (pio == NULL) {
7112	/*
7113	* Insert a dummy header on the buflist so
7114	* l2arc_write_done() can find where the
7115	* write buffers begin without searching.
7116	*/
7117	mutex_enter(&dev->l2ad_mtx);
7118	list_insert_head(&dev->l2ad_buflist, head);
7119	mutex_exit(&dev->l2ad_mtx);
7120
7121	cb = kmem_alloc(
7122	sizeof (l2arc_write_callback_t), KM_SLEEP);
7123	cb->l2wcb_dev = dev;
7124	cb->l2wcb_head = head;
7125	pio = zio_root(spa, l2arc_write_done, cb,
7126	ZIO_FLAG_CANFAIL);
7127	ARCSTAT_BUMP(arcstat_l2_write_pios);
7128	}
7129
7130	hdr->b_l2hdr.b_dev = dev;
7131	hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
7132	arc_hdr_set_flags(hdr,
7133	ARC_FLAG_L2_WRITING \| ARC_FLAG_HAS_L2HDR);
7134
7135	mutex_enter(&dev->l2ad_mtx);
7136	list_insert_head(&dev->l2ad_buflist, hdr);
7137	mutex_exit(&dev->l2ad_mtx);
7138
7139	(void) refcount_add_many(&dev->l2ad_alloc, size, hdr);
7140
7141	/*
7142	* Normally the L2ARC can use the hdr's data, but if
7143	* we're sharing data between the hdr and one of its
7144	* bufs, L2ARC needs its own copy of the data so that
7145	* the ZIO below can't race with the buf consumer. To
7146	* ensure that this copy will be available for the
7147	* lifetime of the ZIO and be cleaned up afterwards, we
7148	* add it to the l2arc_free_on_write queue.
7149	*/
7150	void *to_write;
7151	if (!HDR_SHARED_DATA(hdr) && size == asize) {
7152	to_write = hdr->b_l1hdr.b_pdata;
7153	} else {
7154	arc_buf_contents_t type = arc_buf_type(hdr);
7155	if (type == ARC_BUFC_METADATA) {
7156	to_write = zio_buf_alloc(asize);
7157	} else {
7158	ASSERT3U(type, ==, ARC_BUFC_DATA);
7159	to_write = zio_data_buf_alloc(asize);
7160	}
7161
7162	bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
7163	if (asize != size)
7164	bzero(to_write + size, asize - size);
7165	l2arc_free_data_on_write(to_write, asize, type);
7166	}
7167	wzio = zio_write_phys(pio, dev->l2ad_vdev,
7168	hdr->b_l2hdr.b_daddr, asize, to_write,
7169	ZIO_CHECKSUM_OFF, NULL, hdr,
7170	ZIO_PRIORITY_ASYNC_WRITE,
7171	ZIO_FLAG_CANFAIL, B_FALSE);
7172
7173	write_sz += HDR_GET_LSIZE(hdr);
7174	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
7175	zio_t *, wzio);
7176
7177	write_asize += size;
7178	write_psize += asize;
7179	dev->l2ad_hand += asize;
7180
7181	mutex_exit(hash_lock);
7182
7183	(void) zio_nowait(wzio);
7184	}
7185
7186	multilist_sublist_unlock(mls);
7187
7188	if (full == B_TRUE)
7189	break;
7190	}
7191
7192	/ No buffers selected for writing? /
7193	if (pio == NULL) {
7194	ASSERT0(write_sz);
7195	ASSERT(!HDR_HAS_L1HDR(head));
7196	kmem_cache_free(hdr_l2only_cache, head);
7197	return (`0`);
7198	}
7199
7200	ASSERT3U(write_psize, <=, target_sz);
7201	ARCSTAT_BUMP(arcstat_l2_writes_sent);
7202	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
7203	ARCSTAT_INCR(arcstat_l2_size, write_sz);
7204	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
7205	vdev_space_update(dev->l2ad_vdev, write_asize, `0`, `0`);
7206
7207	/*
7208	* Bump device hand to the device start if it is approaching the end.
7209	* l2arc_evict() will already have evicted ahead for this case.
7210	*/
7211	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
7212	dev->l2ad_hand = dev->l2ad_start;
7213	dev->l2ad_first = B_FALSE;
7214	}
7215
7216	dev->l2ad_writing = B_TRUE;
7217	(void) zio_wait(pio);
7218	dev->l2ad_writing = B_FALSE;
7219
7220	return (write_asize);
7221	}
7222
7223	/*
7224	* This thread feeds the L2ARC at regular intervals. This is the beating
7225	* heart of the L2ARC.
7226	*/
7227	static void
7228	l2arc_feed_thread(void *dummy __unused)
7229	{
7230	callb_cpr_t cpr;
7231	l2arc_dev_t *dev;
7232	spa_t *spa;
7233	uint64_t size, wrote;
7234	clock_t begin, next = ddi_get_lbolt() + hz;
7235
7236	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
7237
7238	mutex_enter(&l2arc_feed_thr_lock);
7239
7240	while (l2arc_thread_exit == `0`) {
7241	CALLB_CPR_SAFE_BEGIN(&cpr);
7242	(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
7243	next - ddi_get_lbolt());
7244	CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
7245	next = ddi_get_lbolt() + hz;
7246
7247	/*
7248	* Quick check for L2ARC devices.
7249	*/
7250	mutex_enter(&l2arc_dev_mtx);
7251	if (l2arc_ndev == `0`) {
7252	mutex_exit(&l2arc_dev_mtx);
7253	continue;
7254	}
7255	mutex_exit(&l2arc_dev_mtx);
7256	begin = ddi_get_lbolt();
7257
7258	/*
7259	* This selects the next l2arc device to write to, and in
7260	* doing so the next spa to feed from: dev->l2ad_spa. This
7261	* will return NULL if there are now no l2arc devices or if
7262	* they are all faulted.
7263	*
7264	* If a device is returned, its spa's config lock is also
7265	* held to prevent device removal. l2arc_dev_get_next()
7266	* will grab and release l2arc_dev_mtx.
7267	*/
7268	if ((dev = l2arc_dev_get_next()) == NULL)
7269	continue;
7270
7271	spa = dev->l2ad_spa;
7272	ASSERT3P(spa, !=, NULL);
7273
7274	/*
7275	* If the pool is read-only then force the feed thread to
7276	* sleep a little longer.
7277	*/
7278	if (!spa_writeable(spa)) {
7279	next = ddi_get_lbolt() + `5` * l2arc_feed_secs * hz;
7280	spa_config_exit(spa, SCL_L2ARC, dev);
7281	continue;
7282	}
7283
7284	/*
7285	* Avoid contributing to memory pressure.
7286	*/
7287	if (arc_reclaim_needed()) {
7288	ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
7289	spa_config_exit(spa, SCL_L2ARC, dev);
7290	continue;
7291	}
7292
7293	ARCSTAT_BUMP(arcstat_l2_feeds);
7294
7295	size = l2arc_write_size();
7296
7297	/*
7298	* Evict L2ARC buffers that will be overwritten.
7299	*/
7300	l2arc_evict(dev, size, B_FALSE);
7301
7302	/*
7303	* Write ARC buffers.
7304	*/
7305	wrote = l2arc_write_buffers(spa, dev, size);
7306
7307	/*
7308	* Calculate interval between writes.
7309	*/
7310	next = l2arc_write_interval(begin, size, wrote);
7311	spa_config_exit(spa, SCL_L2ARC, dev);
7312	}
7313
7314	l2arc_thread_exit = `0`;
7315	cv_broadcast(&l2arc_feed_thr_cv);
7316	CALLB_CPR_EXIT(&cpr); / drops l2arc_feed_thr_lock /
7317	thread_exit();
7318	}
7319
7320	boolean_t
7321	l2arc_vdev_present(vdev_t *vd)
7322	{
7323	l2arc_dev_t *dev;
7324
7325	mutex_enter(&l2arc_dev_mtx);
7326	for (dev = list_head(l2arc_dev_list); dev != NULL;
7327	dev = list_next(l2arc_dev_list, dev)) {
7328	if (dev->l2ad_vdev == vd)
7329	break;
7330	}
7331	mutex_exit(&l2arc_dev_mtx);
7332
7333	return (dev != NULL);
7334	}
7335
7336	/*
7337	* Add a vdev for use by the L2ARC. By this point the spa has already
7338	* validated the vdev and opened it.
7339	*/
7340	void
7341	l2arc_add_vdev(spa_t spa, vdev_t vd)
7342	{
7343	l2arc_dev_t *adddev;
7344
7345	ASSERT(!l2arc_vdev_present(vd));
7346
7347	vdev_ashift_optimize(vd);
7348
7349	/*
7350	* Create a new l2arc device entry.
7351	*/
7352	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
7353	adddev->l2ad_spa = spa;
7354	adddev->l2ad_vdev = vd;
7355	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
7356	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
7357	adddev->l2ad_hand = adddev->l2ad_start;
7358	adddev->l2ad_first = B_TRUE;
7359	adddev->l2ad_writing = B_FALSE;
7360
7361	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
7362	/*
7363	* This is a list of all ARC buffers that are still valid on the
7364	* device.
7365	*/
7366	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
7367	offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
7368
7369	vdev_space_update(vd, `0`, `0`, adddev->l2ad_end - adddev->l2ad_hand);
7370	refcount_create(&adddev->l2ad_alloc);
7371
7372	/*
7373	* Add device to global list
7374	*/
7375	mutex_enter(&l2arc_dev_mtx);
7376	list_insert_head(l2arc_dev_list, adddev);
7377	atomic_inc_64(&l2arc_ndev);
7378	mutex_exit(&l2arc_dev_mtx);
7379	}
7380
7381	/*
7382	* Remove a vdev from the L2ARC.
7383	*/
7384	void
7385	l2arc_remove_vdev(vdev_t *vd)
7386	{
7387	l2arc_dev_t dev, nextdev, *remdev = NULL;
7388
7389	/*
7390	* Find the device by vdev
7391	*/
7392	mutex_enter(&l2arc_dev_mtx);
7393	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
7394	nextdev = list_next(l2arc_dev_list, dev);
7395	if (vd == dev->l2ad_vdev) {
7396	remdev = dev;
7397	break;
7398	}
7399	}
7400	ASSERT3P(remdev, !=, NULL);
7401
7402	/*
7403	* Remove device from global list
7404	*/
7405	list_remove(l2arc_dev_list, remdev);
7406	l2arc_dev_last = NULL; / may have been invalidated /
7407	atomic_dec_64(&l2arc_ndev);
7408	mutex_exit(&l2arc_dev_mtx);
7409
7410	/*
7411	* Clear all buflists and ARC references. L2ARC device flush.
7412	*/
7413	l2arc_evict(remdev, `0`, B_TRUE);
7414	list_destroy(&remdev->l2ad_buflist);
7415	mutex_destroy(&remdev->l2ad_mtx);
7416	refcount_destroy(&remdev->l2ad_alloc);
7417	kmem_free(remdev, sizeof (l2arc_dev_t));
7418	}
7419
7420	void
7421	l2arc_init(void)
7422	{
7423	l2arc_thread_exit = `0`;
7424	l2arc_ndev = `0`;
7425	l2arc_writes_sent = `0`;
7426	l2arc_writes_done = `0`;
7427
7428	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
7429	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
7430	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
7431	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
7432
7433	l2arc_dev_list = &L2ARC_dev_list;
7434	l2arc_free_on_write = &L2ARC_free_on_write;
7435	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
7436	offsetof(l2arc_dev_t, l2ad_node));
7437	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
7438	offsetof(l2arc_data_free_t, l2df_list_node));
7439	}
7440
7441	void
7442	l2arc_fini(void)
7443	{
7444	/*
7445	* This is called from dmu_fini(), which is called from spa_fini();
7446	* Because of this, we can assume that all l2arc devices have
7447	* already been removed when the pools themselves were removed.
7448	*/
7449
7450	l2arc_do_free_on_write();
7451
7452	mutex_destroy(&l2arc_feed_thr_lock);
7453	cv_destroy(&l2arc_feed_thr_cv);
7454	mutex_destroy(&l2arc_dev_mtx);
7455	mutex_destroy(&l2arc_free_on_write_mtx);
7456
7457	list_destroy(l2arc_dev_list);
7458	list_destroy(l2arc_free_on_write);
7459	}
7460
7461	void
7462	l2arc_start(void)
7463	{
7464	if (!(spa_mode_global & FWRITE))
7465	return;
7466
7467	(void) thread_create(NULL, `0`, l2arc_feed_thread, NULL, `0`, &p0,
7468	TS_RUN, minclsyspri);
7469	}
7470
7471	void
7472	l2arc_stop(void)
7473	{
7474	if (!(spa_mode_global & FWRITE))
7475	return;
7476
7477	mutex_enter(&l2arc_feed_thr_lock);
7478	cv_signal(&l2arc_feed_thr_cv); / kick thread out of startup /
7479	l2arc_thread_exit = `1`;
7480	while (l2arc_thread_exit != `0`)
7481	cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
7482	mutex_exit(&l2arc_feed_thr_lock);
7483	}
7484

Browse the source code of netbsd/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c