vfs_bio.c source code [netbsd/sys/kern/vfs_bio.c]

/*	$NetBSD: vfs_bio.c,v 1.278 2018/11/24 17:52:39 maxv Exp $	*/
2
3	/-*
4	* Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Andrew Doran, and by Wasabi Systems, Inc.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*/
31
32	/-*
33	* Copyright (c) 1982, 1986, 1989, 1993
34	* The Regents of the University of California. All rights reserved.
35	* (c) UNIX System Laboratories, Inc.
36	* All or some portions of this file are derived from material licensed
37	* to the University of California by American Telephone and Telegraph
38	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
39	* the permission of UNIX System Laboratories, Inc.
40	*
41	* Redistribution and use in source and binary forms, with or without
42	* modification, are permitted provided that the following conditions
43	* are met:
44	* 1. Redistributions of source code must retain the above copyright
45	* notice, this list of conditions and the following disclaimer.
46	* 2. Redistributions in binary form must reproduce the above copyright
47	* notice, this list of conditions and the following disclaimer in the
48	* documentation and/or other materials provided with the distribution.
49	* 3. Neither the name of the University nor the names of its contributors
50	* may be used to endorse or promote products derived from this software
51	* without specific prior written permission.
52	*
53	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63	* SUCH DAMAGE.
64	*
65	* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
66	*/
67
68	/-*
69	* Copyright (c) 1994 Christopher G. Demetriou
70	*
71	* Redistribution and use in source and binary forms, with or without
72	* modification, are permitted provided that the following conditions
73	* are met:
74	* 1. Redistributions of source code must retain the above copyright
75	* notice, this list of conditions and the following disclaimer.
76	* 2. Redistributions in binary form must reproduce the above copyright
77	* notice, this list of conditions and the following disclaimer in the
78	* documentation and/or other materials provided with the distribution.
79	* 3. All advertising materials mentioning features or use of this software
80	* must display the following acknowledgement:
81	* This product includes software developed by the University of
82	* California, Berkeley and its contributors.
83	* 4. Neither the name of the University nor the names of its contributors
84	* may be used to endorse or promote products derived from this software
85	* without specific prior written permission.
86	*
87	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
97	* SUCH DAMAGE.
98	*
99	* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
100	*/
101
/*
 * The buffer cache subsystem.
 *
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 *
 * Locking
 *
 * There are three locks:
 * - bufcache_lock: protects global buffer cache state.
 * - BC_BUSY: a long term per-buffer lock.
 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
 *
 * For buffers associated with vnodes (the most common case) b_objlock
 * points to the vnode_t::v_interlock.  Otherwise, it points to the generic
 * buffer_lock.
 *
 * Lock order:
 *	bufcache_lock ->
 *		buf_t::b_objlock
 */
124
125	#include <sys/cdefs.h>
126	__KERNEL_RCSID(`0`, "$NetBSD: vfs_bio.c,v 1.278 2018/11/24 17:52:39 maxv Exp $");
127
128	#ifdef _KERNEL_OPT
129	#include "opt_bufcache.h"
130	#include "opt_dtrace.h"
131	#include "opt_biohist.h"
132	#endif
133
134	#include <sys/param.h>
135	#include <sys/systm.h>
136	#include <sys/kernel.h>
137	#include <sys/proc.h>
138	#include <sys/buf.h>
139	#include <sys/vnode.h>
140	#include <sys/mount.h>
141	#include <sys/resourcevar.h>
142	#include <sys/sysctl.h>
143	#include <sys/conf.h>
144	#include <sys/kauth.h>
145	#include <sys/fstrans.h>
146	#include <sys/intr.h>
147	#include <sys/cpu.h>
148	#include <sys/wapbl.h>
149	#include <sys/bitops.h>
150	#include <sys/cprng.h>
151	#include <sys/sdt.h>
152
153	#include <uvm/uvm.h> /* extern struct uvm uvm */
154
155	#include <miscfs/specfs/specdev.h>
156
/*
 * Build-time defaults for the buffer cache size.  BUFPAGES, when non-zero,
 * hardwires the page count; BUFCACHE is a percentage and must lie in the
 * range 5..95.
 */
#ifndef	BUFPAGES
# define BUFPAGES 0
#endif

#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
#  error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif
168
169	u_int nbuf; / desired number of buffer headers /
170	u_int bufpages = BUFPAGES; / optional hardwired count /
171	u_int bufcache = BUFCACHE; / max % of RAM to use for buffer cache /
172
173	/*
174	* Definitions for the buffer free lists.
175	*/
176	#define BQUEUES 3 /* number of free buffer queues */
177
178	#define BQ_LOCKED 0 /* super-blocks &c */
179	#define BQ_LRU 1 /* lru, useful buffers */
180	#define BQ_AGE 2 /* rubbish */
181
182	struct bqueue {
183	TAILQ_HEAD(, buf) bq_queue;
184	uint64_t bq_bytes;
185	buf_t *bq_marker;
186	};
187	static struct bqueue bufqueues[BQUEUES];
188
189	/ Function prototypes /
190	static void buf_setwm(void);
191	static int buf_trim(void);
192	static void bufpool_page_alloc(struct* pool , int*);
193	static void bufpool_page_free(struct pool , void* *);
194	static buf_t bio_doread(struct* vnode , daddr_t, int, int*);
195	static buf_t getnewbuf(int, int, int*);
196	static int buf_lotsfree(void);
197	static int buf_canrelease(void);
198	static u_long buf_mempoolidx(u_long);
199	static u_long buf_roundsize(u_long);
200	static void *buf_alloc(size_t);
201	static void buf_mrelease(void *, size_t);
202	static void binsheadfree(buf_t , struct* bqueue *);
203	static void binstailfree(buf_t , struct* bqueue *);
204	#ifdef DEBUG
205	static int checkfreelist(buf_t , struct* bqueue , int*);
206	#endif
207	static void biointr(void *);
208	static void biodone2(buf_t *);
209	static void bref(buf_t *);
210	static void brele(buf_t *);
211	static void sysctl_kern_buf_setup(void);
212	static void sysctl_vm_buf_setup(void);
213
214	/ Initialization for biohist /
215
216	#include <sys/biohist.h>
217
218	BIOHIST_DEFINE(biohist);
219
220	void
221	biohist_init(void)
222	{
223
224	BIOHIST_INIT(biohist, BIOHIST_SIZE);
225	}
226
227	/*
228	* Definitions for the buffer hash lists.
229	*/
230	#define BUFHASH(dvp, lbn) \
231	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
232	LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
233	u_long bufhash;
234
235	static kcondvar_t needbuffer_cv;
236
237	/*
238	* Buffer queue lock.
239	*/
240	kmutex_t bufcache_lock;
241	kmutex_t buffer_lock;
242
243	/ Software ISR for completed transfers. /
244	static void *biodone_sih;
245
246	/ Buffer pool for I/O buffers. /
247	static pool_cache_t buf_cache;
248	static pool_cache_t bufio_cache;
249
250	#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
251	#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
252	__CTASSERT((`1` << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - `1`)) == MAXBSIZE);
253
254	/ Buffer memory pools /
255	static struct pool bmempools[NMEMPOOLS];
256
257	static struct vm_map *buf_map;
258
259	/*
260	* Buffer memory pool allocator.
261	*/
262	static void *
263	bufpool_page_alloc(struct pool pp, int* flags)
264	{
265
266	return (void *)uvm_km_alloc(buf_map,
267	MAXBSIZE, MAXBSIZE,
268	((flags & PR_WAITOK) ? `0` : UVM_KMF_NOWAIT\|UVM_KMF_TRYLOCK)
269	\| UVM_KMF_WIRED);
270	}
271
272	static void
273	bufpool_page_free(struct pool pp, void* *v)
274	{
275
276	uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
277	}
278
279	static struct pool_allocator bufmempool_allocator = {
280	.pa_alloc = bufpool_page_alloc,
281	.pa_free = bufpool_page_free,
282	.pa_pagesz = MAXBSIZE,
283	};
284
285	/ Buffer memory management variables /
286	u_long bufmem_valimit;
287	u_long bufmem_hiwater;
288	u_long bufmem_lowater;
289	u_long bufmem;
290
291	/*
292	* MD code can call this to set a hard limit on the amount
293	* of virtual memory used by the buffer cache.
294	*/
295	int
296	buf_setvalimit(vsize_t sz)
297	{
298
299	/ We need to accommodate at least NMEMPOOLS of MAXBSIZE each /
300	if (sz < NMEMPOOLS * MAXBSIZE)
301	return EINVAL;
302
303	bufmem_valimit = sz;
304	return `0`;
305	}
306
307	static void
308	buf_setwm(void)
309	{
310
311	bufmem_hiwater = buf_memcalc();
312	/ lowater is approx. 2% of memory (with bufcache = 15) /
313	#define BUFMEM_WMSHIFT 3
314	#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
315	if (bufmem_hiwater < BUFMEM_HIWMMIN)
316	/ Ensure a reasonable minimum value /
317	bufmem_hiwater = BUFMEM_HIWMMIN;
318	bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
319	}
320
#ifdef DEBUG
int debug_verify_freelist = 0;

/*
 * Debug check of free queue membership.  Returns non-zero when the
 * presence of bp on queue dp matches the expectation "ison" (non-zero:
 * must be queued, zero: must not be queued).  Always succeeds unless
 * debug_verify_freelist is set, since the check walks the whole queue.
 */
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
	buf_t *b;

	if (!debug_verify_freelist)
		return 1;

	TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
		if (b == bp)
			return ison ? 1 : 0;
	}

	return ison ? 0 : 1;
}
#endif
339
340	/*
341	* Insq/Remq for the buffer hash lists.
342	* Call with buffer queue locked.
343	*/
344	static void
345	binsheadfree(buf_t bp, struct* bqueue *dp)
346	{
347
348	KASSERT(mutex_owned(&bufcache_lock));
349	KASSERT(bp->b_freelistindex == -`1`);
350	TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
351	dp->bq_bytes += bp->b_bufsize;
352	bp->b_freelistindex = dp - bufqueues;
353	}
354
355	static void
356	binstailfree(buf_t bp, struct* bqueue *dp)
357	{
358
359	KASSERT(mutex_owned(&bufcache_lock));
360	KASSERTMSG(bp->b_freelistindex == -`1`, "double free of buffer? "
361	"bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
362	TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
363	dp->bq_bytes += bp->b_bufsize;
364	bp->b_freelistindex = dp - bufqueues;
365	}
366
367	void
368	bremfree(buf_t *bp)
369	{
370	struct bqueue *dp;
371	int bqidx = bp->b_freelistindex;
372
373	KASSERT(mutex_owned(&bufcache_lock));
374
375	KASSERT(bqidx != -`1`);
376	dp = &bufqueues[bqidx];
377	KDASSERT(checkfreelist(bp, dp, `1`));
378	KASSERT(dp->bq_bytes >= bp->b_bufsize);
379	TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
380	dp->bq_bytes -= bp->b_bufsize;
381
382	/ For the sysctl helper. /
383	if (bp == dp->bq_marker)
384	dp->bq_marker = NULL;
385
386	#if defined(DIAGNOSTIC)
387	bp->b_freelistindex = -`1`;
388	#endif /* defined(DIAGNOSTIC) */
389	}
390
391	/*
392	* Add a reference to an buffer structure that came from buf_cache.
393	*/
394	static inline void
395	bref(buf_t *bp)
396	{
397
398	KASSERT(mutex_owned(&bufcache_lock));
399	KASSERT(bp->b_refcnt > `0`);
400
401	bp->b_refcnt++;
402	}
403
404	/*
405	* Free an unused buffer structure that came from buf_cache.
406	*/
407	static inline void
408	brele(buf_t *bp)
409	{
410
411	KASSERT(mutex_owned(&bufcache_lock));
412	KASSERT(bp->b_refcnt > `0`);
413
414	if (bp->b_refcnt-- == `1`) {
415	buf_destroy(bp);
416	#ifdef DEBUG
417	memset((char )bp, `0`, sizeof(bp));
418	#endif
419	pool_cache_put(buf_cache, bp);
420	}
421	}
422
423	/*
424	* note that for some ports this is used by pmap bootstrap code to
425	* determine kva size.
426	*/
427	u_long
428	buf_memcalc(void)
429	{
430	u_long n;
431	vsize_t mapsz = `0`;
432
433	/*
434	* Determine the upper bound of memory to use for buffers.
435	*
436	* - If bufpages is specified, use that as the number
437	* pages.
438	*
439	* - Otherwise, use bufcache as the percentage of
440	* physical memory.
441	*/
442	if (bufpages != `0`) {
443	n = bufpages;
444	} else {
445	if (bufcache < `5`) {
446	printf("forcing bufcache %d -> 5", bufcache);
447	bufcache = `5`;
448	}
449	if (bufcache > `95`) {
450	printf("forcing bufcache %d -> 95", bufcache);
451	bufcache = `95`;
452	}
453	if (buf_map != NULL)
454	mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
455	n = calc_cache_size(mapsz, bufcache,
456	(buf_map != kernel_map) ? `100` : BUFCACHE_VA_MAXPCT)
457	/ PAGE_SIZE;
458	}
459
460	n <<= PAGE_SHIFT;
461	if (bufmem_valimit != `0` && n > bufmem_valimit)
462	n = bufmem_valimit;
463
464	return (n);
465	}
466
467	/*
468	* Initialize buffers and hash links for buffers.
469	*/
470	void
471	bufinit(void)
472	{
473	struct bqueue *dp;
474	int use_std;
475	u_int i;
476
477	biodone_vfs = biodone;
478
479	mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
480	mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
481	cv_init(&needbuffer_cv, "needbuf");
482
483	if (bufmem_valimit != `0`) {
484	vaddr_t minaddr = `0`, maxaddr;
485	buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
486	bufmem_valimit, `0`, false, `0`);
487	if (buf_map == NULL)
488	panic("bufinit: cannot allocate submap");
489	} else
490	buf_map = kernel_map;
491
492	/*
493	* Initialize buffer cache memory parameters.
494	*/
495	bufmem = `0`;
496	buf_setwm();
497
498	/ On "small" machines use small pool page sizes where possible /
499	use_std = (physmem < atop(`16``1024``1024`));
500
501	/*
502	* Also use them on systems that can map the pool pages using
503	* a direct-mapped segment.
504	*/
505	#ifdef PMAP_MAP_POOLPAGE
506	use_std = `1`;
507	#endif
508
509	buf_cache = pool_cache_init(sizeof(buf_t), `0`, `0`, `0`,
510	"bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
511	bufio_cache = pool_cache_init(sizeof(buf_t), `0`, `0`, `0`,
512	"biopl", NULL, IPL_BIO, NULL, NULL, NULL);
513
514	for (i = `0`; i < NMEMPOOLS; i++) {
515	struct pool_allocator *pa;
516	struct pool *pp = &bmempools[i];
517	u_int size = `1` << (i + MEMPOOL_INDEX_OFFSET);
518	char name = kmem_alloc(`8`, KM_SLEEP); /* XXX: never freed /
519	if (__predict_false(size >= `1048576`))
520	(void)snprintf(name, `8`, "buf%um", size / `1048576`);
521	else if (__predict_true(size >= `1024`))
522	(void)snprintf(name, `8`, "buf%uk", size / `1024`);
523	else
524	(void)snprintf(name, `8`, "buf%ub", size);
525	pa = (size <= PAGE_SIZE && use_std)
526	? &pool_allocator_nointr
527	: &bufmempool_allocator;
528	pool_init(pp, size, `0`, `0`, `0`, name, pa, IPL_NONE);
529	pool_setlowat(pp, `1`);
530	pool_sethiwat(pp, `1`);
531	}
532
533	/ Initialize the buffer queues /
534	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
535	TAILQ_INIT(&dp->bq_queue);
536	dp->bq_bytes = `0`;
537	}
538
539	/*
540	* Estimate hash table size based on the amount of memory we
541	* intend to use for the buffer cache. The average buffer
542	* size is dependent on our clients (i.e. filesystems).
543	*
544	* For now, use an empirical 3K per buffer.
545	*/
546	nbuf = (bufmem_hiwater / `1024`) / `3`;
547	bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
548
549	sysctl_kern_buf_setup();
550	sysctl_vm_buf_setup();
551	}
552
553	void
554	bufinit2(void)
555	{
556
557	biodone_sih = softint_establish(SOFTINT_BIO \| SOFTINT_MPSAFE, biointr,
558	NULL);
559	if (biodone_sih == NULL)
560	panic("bufinit2: can't establish soft interrupt");
561	}
562
563	static int
564	buf_lotsfree(void)
565	{
566	u_long guess;
567
568	/ Always allocate if less than the low water mark. /
569	if (bufmem < bufmem_lowater)
570	return `1`;
571
572	/ Never allocate if greater than the high water mark. /
573	if (bufmem > bufmem_hiwater)
574	return `0`;
575
576	/ If there's anything on the AGE list, it should be eaten. /
577	if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
578	return `0`;
579
580	/*
581	* The probabily of getting a new allocation is inversely
582	* proportional to the current size of the cache above
583	* the low water mark. Divide the total first to avoid overflows
584	* in the product.
585	*/
586	guess = cprng_fast32() % `16`;
587
588	if ((bufmem_hiwater - bufmem_lowater) / `16` * guess >=
589	(bufmem - bufmem_lowater))
590	return `1`;
591
592	/ Otherwise don't allocate. /
593	return `0`;
594	}
595
596	/*
597	* Return estimate of bytes we think need to be
598	* released to help resolve low memory conditions.
599	*
600	* => called with bufcache_lock held.
601	*/
602	static int
603	buf_canrelease(void)
604	{
605	int pagedemand, ninvalid = `0`;
606
607	KASSERT(mutex_owned(&bufcache_lock));
608
609	if (bufmem < bufmem_lowater)
610	return `0`;
611
612	if (bufmem > bufmem_hiwater)
613	return bufmem - bufmem_hiwater;
614
615	ninvalid += bufqueues[BQ_AGE].bq_bytes;
616
617	pagedemand = uvmexp.freetarg - uvmexp.free;
618	if (pagedemand < `0`)
619	return ninvalid;
620	return MAX(ninvalid, MIN(`2` * MAXBSIZE,
621	MIN((bufmem - bufmem_lowater) / `16`, pagedemand * PAGE_SIZE)));
622	}
623
624	/*
625	* Buffer memory allocation helper functions
626	*/
627	static u_long
628	buf_mempoolidx(u_long size)
629	{
630	u_int n = `0`;
631
632	size -= `1`;
633	size >>= MEMPOOL_INDEX_OFFSET;
634	while (size) {
635	size >>= `1`;
636	n += `1`;
637	}
638	if (n >= NMEMPOOLS)
639	panic("buf mem pool index %d", n);
640	return n;
641	}
642
643	static u_long
644	buf_roundsize(u_long size)
645	{
646	/ Round up to nearest power of 2 /
647	return (`1` << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
648	}
649
650	static void *
651	buf_alloc(size_t size)
652	{
653	u_int n = buf_mempoolidx(size);
654	void *addr;
655
656	while (`1`) {
657	addr = pool_get(&bmempools[n], PR_NOWAIT);
658	if (addr != NULL)
659	break;
660
661	/ No memory, see if we can free some. If so, try again /
662	mutex_enter(&bufcache_lock);
663	if (buf_drain(`1`) > `0`) {
664	mutex_exit(&bufcache_lock);
665	continue;
666	}
667
668	if (curlwp == uvm.pagedaemon_lwp) {
669	mutex_exit(&bufcache_lock);
670	return NULL;
671	}
672
673	/ Wait for buffers to arrive on the LRU queue /
674	cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / `4`);
675	mutex_exit(&bufcache_lock);
676	}
677
678	return addr;
679	}
680
681	static void
682	buf_mrelease(void *addr, size_t size)
683	{
684
685	pool_put(&bmempools[buf_mempoolidx(size)], addr);
686	}
687
688	/*
689	* bread()/breadn() helper.
690	*/
691	static buf_t *
692	bio_doread(struct vnode vp, daddr_t blkno, int* size, int async)
693	{
694	buf_t *bp;
695	struct mount *mp;
696
697	bp = getblk(vp, blkno, size, `0`, `0`);
698
699	/*
700	* getblk() may return NULL if we are the pagedaemon.
701	*/
702	if (bp == NULL) {
703	KASSERT(curlwp == uvm.pagedaemon_lwp);
704	return NULL;
705	}
706
707	/*
708	* If buffer does not have data valid, start a read.
709	* Note that if buffer is BC_INVAL, getblk() won't return it.
710	* Therefore, it's valid if its I/O has completed or been delayed.
711	*/
712	if (!ISSET(bp->b_oflags, (BO_DONE \| BO_DELWRI))) {
713	/ Start I/O for the buffer. /
714	SET(bp->b_flags, B_READ \| async);
715	if (async)
716	BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
717	else
718	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
719	VOP_STRATEGY(vp, bp);
720
721	/ Pay for the read. /
722	curlwp->l_ru.ru_inblock++;
723	} else if (async)
724	brelse(bp, `0`);
725
726	if (vp->v_type == VBLK)
727	mp = spec_node_getmountedfs(vp);
728	else
729	mp = vp->v_mount;
730
731	/*
732	* Collect statistics on synchronous and asynchronous reads.
733	* Reads from block devices are charged to their associated
734	* filesystem (if any).
735	*/
736	if (mp != NULL) {
737	if (async == `0`)
738	mp->mnt_stat.f_syncreads++;
739	else
740	mp->mnt_stat.f_asyncreads++;
741	}
742
743	return (bp);
744	}
745
746	/*
747	* Read a disk block.
748	* This algorithm described in Bach (p.54).
749	*/
750	int
751	bread(struct vnode vp, daddr_t blkno, int* size, int flags, buf_t **bpp)
752	{
753	buf_t *bp;
754	int error;
755
756	BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
757
758	/ Get buffer for block. /
759	bp = *bpp = bio_doread(vp, blkno, size, `0`);
760	if (bp == NULL)
761	return ENOMEM;
762
763	/ Wait for the read to complete, and return result. /
764	error = biowait(bp);
765	if (error == `0` && (flags & B_MODIFY) != `0`)
766	error = fscow_run(bp, true);
767	if (error) {
768	brelse(bp, `0`);
769	*bpp = NULL;
770	}
771
772	return error;
773	}
774
775	/*
776	* Read-ahead multiple disk blocks. The first is sync, the rest async.
777	* Trivial modification to the breada algorithm presented in Bach (p.55).
778	*/
779	int
780	breadn(struct vnode vp, daddr_t blkno, int* size, daddr_t *rablks,
781	int rasizes, int* nrablks, int flags, buf_t **bpp)
782	{
783	buf_t *bp;
784	int error, i;
785
786	BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
787
788	bp = *bpp = bio_doread(vp, blkno, size, `0`);
789	if (bp == NULL)
790	return ENOMEM;
791
792	/*
793	* For each of the read-ahead blocks, start a read, if necessary.
794	*/
795	mutex_enter(&bufcache_lock);
796	for (i = `0`; i < nrablks; i++) {
797	/ If it's in the cache, just go on to next one. /
798	if (incore(vp, rablks[i]))
799	continue;
800
801	/ Get a buffer for the read-ahead block /
802	mutex_exit(&bufcache_lock);
803	(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
804	mutex_enter(&bufcache_lock);
805	}
806	mutex_exit(&bufcache_lock);
807
808	/ Otherwise, we had to start a read for it; wait until it's valid. /
809	error = biowait(bp);
810	if (error == `0` && (flags & B_MODIFY) != `0`)
811	error = fscow_run(bp, true);
812	if (error) {
813	brelse(bp, `0`);
814	*bpp = NULL;
815	}
816
817	return error;
818	}
819
820	/*
821	* Block write. Described in Bach (p.56)
822	*/
823	int
824	bwrite(buf_t *bp)
825	{
826	int rv, sync, wasdelayed;
827	struct vnode *vp;
828	struct mount *mp;
829
830	BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
831	(uintptr_t)bp, `0`, `0`, `0`);
832
833	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
834	KASSERT(!cv_has_waiters(&bp->b_done));
835
836	vp = bp->b_vp;
837
838	/*
839	* dholland 20160728 AFAICT vp==NULL must be impossible as it
840	* will crash upon reaching VOP_STRATEGY below... see further
841	* analysis on tech-kern.
842	*/
843	KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
844
845	if (vp != NULL) {
846	KASSERT(bp->b_objlock == vp->v_interlock);
847	if (vp->v_type == VBLK)
848	mp = spec_node_getmountedfs(vp);
849	else
850	mp = vp->v_mount;
851	} else {
852	mp = NULL;
853	}
854
855	if (mp && mp->mnt_wapbl) {
856	if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
857	bdwrite(bp);
858	return `0`;
859	}
860	}
861
862	/*
863	* Remember buffer type, to switch on it later. If the write was
864	* synchronous, but the file system was mounted with MNT_ASYNC,
865	* convert it to a delayed write.
866	* XXX note that this relies on delayed tape writes being converted
867	* to async, not sync writes (which is safe, but ugly).
868	*/
869	sync = !ISSET(bp->b_flags, B_ASYNC);
870	if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
871	bdwrite(bp);
872	return (`0`);
873	}
874
875	/*
876	* Collect statistics on synchronous and asynchronous writes.
877	* Writes to block devices are charged to their associated
878	* filesystem (if any).
879	*/
880	if (mp != NULL) {
881	if (sync)
882	mp->mnt_stat.f_syncwrites++;
883	else
884	mp->mnt_stat.f_asyncwrites++;
885	}
886
887	/*
888	* Pay for the I/O operation and make sure the buf is on the correct
889	* vnode queue.
890	*/
891	bp->b_error = `0`;
892	wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
893	CLR(bp->b_flags, B_READ);
894	if (wasdelayed) {
895	mutex_enter(&bufcache_lock);
896	mutex_enter(bp->b_objlock);
897	CLR(bp->b_oflags, BO_DONE \| BO_DELWRI);
898	reassignbuf(bp, bp->b_vp);
899	mutex_exit(&bufcache_lock);
900	} else {
901	curlwp->l_ru.ru_oublock++;
902	mutex_enter(bp->b_objlock);
903	CLR(bp->b_oflags, BO_DONE \| BO_DELWRI);
904	}
905	if (vp != NULL)
906	vp->v_numoutput++;
907	mutex_exit(bp->b_objlock);
908
909	/ Initiate disk write. /
910	if (sync)
911	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
912	else
913	BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
914
915	VOP_STRATEGY(vp, bp);
916
917	if (sync) {
918	/ If I/O was synchronous, wait for it to complete. /
919	rv = biowait(bp);
920
921	/ Release the buffer. /
922	brelse(bp, `0`);
923
924	return (rv);
925	} else {
926	return (`0`);
927	}
928	}
929
930	int
931	vn_bwrite(void *v)
932	{
933	struct vop_bwrite_args *ap = v;
934
935	return (bwrite(ap->a_bp));
936	}
937
938	/*
939	* Delayed write.
940	*
941	* The buffer is marked dirty, but is not queued for I/O.
942	* This routine should be used when the buffer is expected
943	* to be modified again soon, typically a small write that
944	* partially fills a buffer.
945	*
946	* NB: magnetic tapes cannot be delayed; they must be
947	* written in the order that the writes are requested.
948	*
949	* Described in Leffler, et al. (pp. 208-213).
950	*/
951	void
952	bdwrite(buf_t *bp)
953	{
954
955	BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
956	(uintptr_t)bp, `0`, `0`, `0`);
957
958	KASSERT(bp->b_vp == NULL \|\| bp->b_vp->v_tag != VT_UFS \|\|
959	bp->b_vp->v_type == VBLK \|\| ISSET(bp->b_flags, B_COWDONE));
960	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
961	KASSERT(!cv_has_waiters(&bp->b_done));
962
963	/ If this is a tape block, write the block now. /
964	if (bdev_type(bp->b_dev) == D_TAPE) {
965	bawrite(bp);
966	return;
967	}
968
969	if (wapbl_vphaswapbl(bp->b_vp)) {
970	struct mount *mp = wapbl_vptomp(bp->b_vp);
971
972	if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
973	WAPBL_ADD_BUF(mp, bp);
974	}
975	}
976
977	/*
978	* If the block hasn't been seen before:
979	* (1) Mark it as having been seen,
980	* (2) Charge for the write,
981	* (3) Make sure it's on its vnode's correct block list.
982	*/
983	KASSERT(bp->b_vp == NULL \|\| bp->b_objlock == bp->b_vp->v_interlock);
984
985	if (!ISSET(bp->b_oflags, BO_DELWRI)) {
986	mutex_enter(&bufcache_lock);
987	mutex_enter(bp->b_objlock);
988	SET(bp->b_oflags, BO_DELWRI);
989	curlwp->l_ru.ru_oublock++;
990	reassignbuf(bp, bp->b_vp);
991	mutex_exit(&bufcache_lock);
992	} else {
993	mutex_enter(bp->b_objlock);
994	}
995	/ Otherwise, the "write" is done, so mark and release the buffer. /
996	CLR(bp->b_oflags, BO_DONE);
997	mutex_exit(bp->b_objlock);
998
999	brelse(bp, `0`);
1000	}
1001
1002	/*
1003	* Asynchronous block write; just an asynchronous bwrite().
1004	*/
1005	void
1006	bawrite(buf_t *bp)
1007	{
1008
1009	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1010	KASSERT(bp->b_vp != NULL);
1011
1012	SET(bp->b_flags, B_ASYNC);
1013	VOP_BWRITE(bp->b_vp, bp);
1014	}
1015
1016	/*
1017	* Release a buffer on to the free lists.
1018	* Described in Bach (p. 46).
1019	*/
1020	void
1021	brelsel(buf_t bp, int* set)
1022	{
1023	struct bqueue *bufq;
1024	struct vnode *vp;
1025
1026	KASSERT(bp != NULL);
1027	KASSERT(mutex_owned(&bufcache_lock));
1028	KASSERT(!cv_has_waiters(&bp->b_done));
1029	KASSERT(bp->b_refcnt > `0`);
1030
1031	SET(bp->b_cflags, set);
1032
1033	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1034	KASSERT(bp->b_iodone == NULL);
1035
1036	/ Wake up any processes waiting for any buffer to become free. /
1037	cv_signal(&needbuffer_cv);
1038
1039	/ Wake up any proceeses waiting for _this_ buffer to become free /
1040	if (ISSET(bp->b_cflags, BC_WANTED))
1041	CLR(bp->b_cflags, BC_WANTED\|BC_AGE);
1042
1043	/ If it's clean clear the copy-on-write flag. /
1044	if (ISSET(bp->b_flags, B_COWDONE)) {
1045	mutex_enter(bp->b_objlock);
1046	if (!ISSET(bp->b_oflags, BO_DELWRI))
1047	CLR(bp->b_flags, B_COWDONE);
1048	mutex_exit(bp->b_objlock);
1049	}
1050
1051	/*
1052	* Determine which queue the buffer should be on, then put it there.
1053	*/
1054
1055	/ If it's locked, don't report an error; try again later. /
1056	if (ISSET(bp->b_flags, B_LOCKED))
1057	bp->b_error = `0`;
1058
1059	/ If it's not cacheable, or an error, mark it invalid. /
1060	if (ISSET(bp->b_cflags, BC_NOCACHE) \|\| bp->b_error != `0`)
1061	SET(bp->b_cflags, BC_INVAL);
1062
1063	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1064	/*
1065	* This is a delayed write buffer that was just flushed to
1066	* disk. It is still on the LRU queue. If it's become
1067	* invalid, then we need to move it to a different queue;
1068	* otherwise leave it in its current position.
1069	*/
1070	CLR(bp->b_cflags, BC_VFLUSH);
1071	if (!ISSET(bp->b_cflags, BC_INVAL\|BC_AGE) &&
1072	!ISSET(bp->b_flags, B_LOCKED) && bp->b_error == `0`) {
1073	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], `1`));
1074	goto already_queued;
1075	} else {
1076	bremfree(bp);
1077	}
1078	}
1079
1080	KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], `0`));
1081	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], `0`));
1082	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], `0`));
1083
1084	if ((bp->b_bufsize <= `0`) \|\| ISSET(bp->b_cflags, BC_INVAL)) {
1085	/*
1086	* If it's invalid or empty, dissociate it from its vnode
1087	* and put on the head of the appropriate queue.
1088	*/
1089	if (ISSET(bp->b_flags, B_LOCKED)) {
1090	if (wapbl_vphaswapbl(vp = bp->b_vp)) {
1091	struct mount *mp = wapbl_vptomp(vp);
1092
1093	KASSERT(bp->b_iodone
1094	!= mp->mnt_wapbl_op->wo_wapbl_biodone);
1095	WAPBL_REMOVE_BUF(mp, bp);
1096	}
1097	}
1098
1099	mutex_enter(bp->b_objlock);
1100	CLR(bp->b_oflags, BO_DONE\|BO_DELWRI);
1101	if ((vp = bp->b_vp) != NULL) {
1102	KASSERT(bp->b_objlock == vp->v_interlock);
1103	reassignbuf(bp, bp->b_vp);
1104	brelvp(bp);
1105	mutex_exit(vp->v_interlock);
1106	} else {
1107	KASSERT(bp->b_objlock == &buffer_lock);
1108	mutex_exit(bp->b_objlock);
1109	}
1110
1111	if (bp->b_bufsize <= `0`)
1112	/ no data /
1113	goto already_queued;
1114	else
1115	/ invalid data /
1116	bufq = &bufqueues[BQ_AGE];
1117	binsheadfree(bp, bufq);
1118	} else {
1119	/*
1120	* It has valid data. Put it on the end of the appropriate
1121	* queue, so that it'll stick around for as long as possible.
1122	* If buf is AGE, but has dependencies, must put it on last
1123	* bufqueue to be scanned, ie LRU. This protects against the
1124	* livelock where BQ_AGE only has buffers with dependencies,
1125	* and we thus never get to the dependent buffers in BQ_LRU.
1126	*/
1127	if (ISSET(bp->b_flags, B_LOCKED)) {
1128	/ locked in core /
1129	bufq = &bufqueues[BQ_LOCKED];
1130	} else if (!ISSET(bp->b_cflags, BC_AGE)) {
1131	/ valid data /
1132	bufq = &bufqueues[BQ_LRU];
1133	} else {
1134	/ stale but valid data /
1135	bufq = &bufqueues[BQ_AGE];
1136	}
1137	binstailfree(bp, bufq);
1138	}
1139	already_queued:
1140	/ Unlock the buffer. /
1141	CLR(bp->b_cflags, BC_AGE\|BC_BUSY\|BC_NOCACHE);
1142	CLR(bp->b_flags, B_ASYNC);
1143	cv_broadcast(&bp->b_busy);
1144
1145	if (bp->b_bufsize <= `0`)
1146	brele(bp);
1147	}
1148
1149	void
1150	brelse(buf_t bp, int* set)
1151	{
1152
1153	mutex_enter(&bufcache_lock);
1154	brelsel(bp, set);
1155	mutex_exit(&bufcache_lock);
1156	}
1157
1158	/*
1159	* Determine if a block is in the cache.
1160	* Just look on what would be its hash chain. If it's there, return
1161	* a pointer to it, unless it's marked invalid. If it's marked invalid,
1162	* we normally don't return the buffer, unless the caller explicitly
1163	* wants us to.
1164	*/
1165	buf_t *
1166	incore(struct vnode *vp, daddr_t blkno)
1167	{
1168	buf_t *bp;
1169
1170	KASSERT(mutex_owned(&bufcache_lock));
1171
1172	/ Search hash chain /
1173	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1174	if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1175	!ISSET(bp->b_cflags, BC_INVAL)) {
1176	KASSERT(bp->b_objlock == vp->v_interlock);
1177	return (bp);
1178	}
1179	}
1180
1181	return (NULL);
1182	}
1183
1184	/*
1185	* Get a block of requested size that is associated with
1186	* a given vnode and block offset. If it is found in the
1187	* block cache, mark it as having been found, make it busy
1188	* and return it. Otherwise, return an empty block of the
1189	* correct size. It is up to the caller to insure that the
1190	* cached blocks be of the correct size.
1191	*/
1192	buf_t *
1193	getblk(struct vnode vp, daddr_t blkno, int* size, int slpflag, int slptimeo)
1194	{
1195	int err, preserve;
1196	buf_t *bp;
1197
1198	mutex_enter(&bufcache_lock);
1199	loop:
1200	bp = incore(vp, blkno);
1201	if (bp != NULL) {
1202	err = bbusy(bp, ((slpflag & PCATCH) != `0`), slptimeo, NULL);
1203	if (err != `0`) {
1204	if (err == EPASSTHROUGH)
1205	goto loop;
1206	mutex_exit(&bufcache_lock);
1207	return (NULL);
1208	}
1209	KASSERT(!cv_has_waiters(&bp->b_done));
1210	#ifdef DIAGNOSTIC
1211	if (ISSET(bp->b_oflags, BO_DONE\|BO_DELWRI) &&
1212	bp->b_bcount < size && vp->v_type != VBLK)
1213	panic("getblk: block size invariant failed");
1214	#endif
1215	bremfree(bp);
1216	preserve = `1`;
1217	} else {
1218	if ((bp = getnewbuf(slpflag, slptimeo, `0`)) == NULL)
1219	goto loop;
1220
1221	if (incore(vp, blkno) != NULL) {
1222	/ The block has come into memory in the meantime. /
1223	brelsel(bp, `0`);
1224	goto loop;
1225	}
1226
1227	LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
1228	bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
1229	mutex_enter(vp->v_interlock);
1230	bgetvp(vp, bp);
1231	mutex_exit(vp->v_interlock);
1232	preserve = `0`;
1233	}
1234	mutex_exit(&bufcache_lock);
1235
1236	/*
1237	* LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1238	* if we re-size buffers here.
1239	*/
1240	if (ISSET(bp->b_flags, B_LOCKED)) {
1241	KASSERT(bp->b_bufsize >= size);
1242	} else {
1243	if (allocbuf(bp, size, preserve)) {
1244	mutex_enter(&bufcache_lock);
1245	LIST_REMOVE(bp, b_hash);
1246	brelsel(bp, BC_INVAL);
1247	mutex_exit(&bufcache_lock);
1248	return NULL;
1249	}
1250	}
1251	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1252	return (bp);
1253	}
1254
1255	/*
1256	* Get an empty, disassociated buffer of given size.
1257	*/
1258	buf_t *
1259	geteblk(int size)
1260	{
1261	buf_t *bp;
1262	int error __diagused;
1263
1264	mutex_enter(&bufcache_lock);
1265	while ((bp = getnewbuf(`0`, `0`, `0`)) == NULL)
1266	;
1267
1268	SET(bp->b_cflags, BC_INVAL);
1269	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1270	mutex_exit(&bufcache_lock);
1271	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1272	error = allocbuf(bp, size, `0`);
1273	KASSERT(error == `0`);
1274	return (bp);
1275	}
1276
1277	/*
1278	* Expand or contract the actual memory allocated to a buffer.
1279	*
1280	* If the buffer shrinks, data is lost, so it's up to the
1281	* caller to have written it out first; this routine will not
1282	* start a write. If the buffer grows, it's the callers
1283	* responsibility to fill out the buffer's additional contents.
1284	*/
1285	int
1286	allocbuf(buf_t bp, int* size, int preserve)
1287	{
1288	void *addr;
1289	vsize_t oldsize, desired_size;
1290	int oldcount;
1291	int delta;
1292
1293	desired_size = buf_roundsize(size);
1294	if (desired_size > MAXBSIZE)
1295	printf("allocbuf: buffer larger than MAXBSIZE requested");
1296
1297	oldcount = bp->b_bcount;
1298
1299	bp->b_bcount = size;
1300
1301	oldsize = bp->b_bufsize;
1302	if (oldsize == desired_size) {
1303	/*
1304	* Do not short cut the WAPBL resize, as the buffer length
1305	* could still have changed and this would corrupt the
1306	* tracking of the transaction length.
1307	*/
1308	goto out;
1309	}
1310
1311	/*
1312	* If we want a buffer of a different size, re-allocate the
1313	* buffer's memory; copy old content only if needed.
1314	*/
1315	addr = buf_alloc(desired_size);
1316	if (addr == NULL)
1317	return ENOMEM;
1318	if (preserve)
1319	memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
1320	if (bp->b_data != NULL)
1321	buf_mrelease(bp->b_data, oldsize);
1322	bp->b_data = addr;
1323	bp->b_bufsize = desired_size;
1324
1325	/*
1326	* Update overall buffer memory counter (protected by bufcache_lock)
1327	*/
1328	delta = (long)desired_size - (long)oldsize;
1329
1330	mutex_enter(&bufcache_lock);
1331	if ((bufmem += delta) > bufmem_hiwater) {
1332	/*
1333	* Need to trim overall memory usage.
1334	*/
1335	while (buf_canrelease()) {
1336	if (curcpu()->ci_schedstate.spc_flags &
1337	SPCF_SHOULDYIELD) {
1338	mutex_exit(&bufcache_lock);
1339	preempt();
1340	mutex_enter(&bufcache_lock);
1341	}
1342	if (buf_trim() == `0`)
1343	break;
1344	}
1345	}
1346	mutex_exit(&bufcache_lock);
1347
1348	out:
1349	if (wapbl_vphaswapbl(bp->b_vp))
1350	WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
1351
1352	return `0`;
1353	}
1354
1355	/*
1356	* Find a buffer which is available for use.
1357	* Select something from a free list.
1358	* Preference is to AGE list, then LRU list.
1359	*
1360	* Called with the buffer queues locked.
1361	* Return buffer locked.
1362	*/
1363	static buf_t *
1364	getnewbuf(int slpflag, int slptimeo, int from_bufq)
1365	{
1366	buf_t *bp;
1367	struct vnode *vp;
1368	struct mount *transmp = NULL;
1369
1370	start:
1371	KASSERT(mutex_owned(&bufcache_lock));
1372
1373	/*
1374	* Get a new buffer from the pool.
1375	*/
1376	if (!from_bufq && buf_lotsfree()) {
1377	mutex_exit(&bufcache_lock);
1378	bp = pool_cache_get(buf_cache, PR_NOWAIT);
1379	if (bp != NULL) {
1380	memset((char )bp, `0`, sizeof(bp));
1381	buf_init(bp);
1382	SET(bp->b_cflags, BC_BUSY); / mark buffer busy /
1383	mutex_enter(&bufcache_lock);
1384	#if defined(DIAGNOSTIC)
1385	bp->b_freelistindex = -`1`;
1386	#endif /* defined(DIAGNOSTIC) */
1387	return (bp);
1388	}
1389	mutex_enter(&bufcache_lock);
1390	}
1391
1392	KASSERT(mutex_owned(&bufcache_lock));
1393	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
1394	KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
1395	} else {
1396	TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
1397	if (ISSET(bp->b_cflags, BC_VFLUSH) \|\|
1398	!ISSET(bp->b_oflags, BO_DELWRI))
1399	break;
1400	if (fstrans_start_nowait(bp->b_vp->v_mount) == `0`) {
1401	KASSERT(transmp == NULL);
1402	transmp = bp->b_vp->v_mount;
1403	break;
1404	}
1405	}
1406	}
1407	if (bp != NULL) {
1408	KASSERT(!ISSET(bp->b_cflags, BC_BUSY) \|\| ISSET(bp->b_cflags, BC_VFLUSH));
1409	bremfree(bp);
1410
1411	/ Buffer is no longer on free lists. /
1412	SET(bp->b_cflags, BC_BUSY);
1413	} else {
1414	/*
1415	* XXX: !from_bufq should be removed.
1416	*/
1417	if (!from_bufq \|\| curlwp != uvm.pagedaemon_lwp) {
1418	/ wait for a free buffer of any kind /
1419	if ((slpflag & PCATCH) != `0`)
1420	(void)cv_timedwait_sig(&needbuffer_cv,
1421	&bufcache_lock, slptimeo);
1422	else
1423	(void)cv_timedwait(&needbuffer_cv,
1424	&bufcache_lock, slptimeo);
1425	}
1426	return (NULL);
1427	}
1428
1429	#ifdef DIAGNOSTIC
1430	if (bp->b_bufsize <= `0`)
1431	panic("buffer %p: on queue but empty", bp);
1432	#endif
1433
1434	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
1435	/*
1436	* This is a delayed write buffer being flushed to disk. Make
1437	* sure it gets aged out of the queue when it's finished, and
1438	* leave it off the LRU queue.
1439	*/
1440	CLR(bp->b_cflags, BC_VFLUSH);
1441	SET(bp->b_cflags, BC_AGE);
1442	goto start;
1443	}
1444
1445	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1446	KASSERT(bp->b_refcnt > `0`);
1447	KASSERT(!cv_has_waiters(&bp->b_done));
1448
1449	/*
1450	* If buffer was a delayed write, start it and return NULL
1451	* (since we might sleep while starting the write).
1452	*/
1453	if (ISSET(bp->b_oflags, BO_DELWRI)) {
1454	/*
1455	* This buffer has gone through the LRU, so make sure it gets
1456	* reused ASAP.
1457	*/
1458	SET(bp->b_cflags, BC_AGE);
1459	mutex_exit(&bufcache_lock);
1460	bawrite(bp);
1461	KASSERT(transmp != NULL);
1462	fstrans_done(transmp);
1463	mutex_enter(&bufcache_lock);
1464	return (NULL);
1465	}
1466
1467	KASSERT(transmp == NULL);
1468
1469	vp = bp->b_vp;
1470
1471	/ clear out various other fields /
1472	bp->b_cflags = BC_BUSY;
1473	bp->b_oflags = `0`;
1474	bp->b_flags = `0`;
1475	bp->b_dev = NODEV;
1476	bp->b_blkno = `0`;
1477	bp->b_lblkno = `0`;
1478	bp->b_rawblkno = `0`;
1479	bp->b_iodone = `0`;
1480	bp->b_error = `0`;
1481	bp->b_resid = `0`;
1482	bp->b_bcount = `0`;
1483
1484	LIST_REMOVE(bp, b_hash);
1485
1486	/ Disassociate us from our vnode, if we had one... /
1487	if (vp != NULL) {
1488	mutex_enter(vp->v_interlock);
1489	brelvp(bp);
1490	mutex_exit(vp->v_interlock);
1491	}
1492
1493	return (bp);
1494	}
1495
1496	/*
1497	* Attempt to free an aged buffer off the queues.
1498	* Called with queue lock held.
1499	* Returns the amount of buffer memory freed.
1500	*/
1501	static int
1502	buf_trim(void)
1503	{
1504	buf_t *bp;
1505	long size;
1506
1507	KASSERT(mutex_owned(&bufcache_lock));
1508
1509	/ Instruct getnewbuf() to get buffers off the queues /
1510	if ((bp = getnewbuf(PCATCH, `1`, `1`)) == NULL)
1511	return `0`;
1512
1513	KASSERT((bp->b_cflags & BC_WANTED) == `0`);
1514	size = bp->b_bufsize;
1515	bufmem -= size;
1516	if (size > `0`) {
1517	buf_mrelease(bp->b_data, size);
1518	bp->b_bcount = bp->b_bufsize = `0`;
1519	}
1520	/ brelse() will return the buffer to the global buffer pool /
1521	brelsel(bp, `0`);
1522	return size;
1523	}
1524
1525	int
1526	buf_drain(int n)
1527	{
1528	int size = `0`, sz;
1529
1530	KASSERT(mutex_owned(&bufcache_lock));
1531
1532	while (size < n && bufmem > bufmem_lowater) {
1533	sz = buf_trim();
1534	if (sz <= `0`)
1535	break;
1536	size += sz;
1537	}
1538
1539	return size;
1540	}
1541
1542	SDT_PROVIDER_DEFINE(io);
1543
1544	SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf "/bp/*);
1545	SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf "/bp/*);
1546
1547	/*
1548	* Wait for operations on the buffer to complete.
1549	* When they do, extract and return the I/O's error value.
1550	*/
1551	int
1552	biowait(buf_t *bp)
1553	{
1554
1555	BIOHIST_FUNC(__func__);
1556
1557	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
1558	KASSERT(bp->b_refcnt > `0`);
1559
1560	SDT_PROBE1(io, kernel, , wait__start, bp);
1561
1562	mutex_enter(bp->b_objlock);
1563
1564	BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
1565	(uintptr_t)bp, bp->b_oflags,
1566	(uintptr_t)__builtin_return_address(`0`), `0`);
1567
1568	while (!ISSET(bp->b_oflags, BO_DONE \| BO_DELWRI)) {
1569	BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, `0`, `0`, `0`);
1570	cv_wait(&bp->b_done, bp->b_objlock);
1571	}
1572	mutex_exit(bp->b_objlock);
1573
1574	SDT_PROBE1(io, kernel, , wait__done, bp);
1575
1576	BIOHIST_LOG(biohist, "return %jd", bp->b_error, `0`, `0`, `0`);
1577
1578	return bp->b_error;
1579	}
1580
1581	/*
1582	* Mark I/O complete on a buffer.
1583	*
1584	* If a callback has been requested, e.g. the pageout
1585	* daemon, do so. Otherwise, awaken waiting processes.
1586	*
1587	* [ Leffler, et al., says on p.247:
1588	* "This routine wakes up the blocked process, frees the buffer
1589	* for an asynchronous write, or, for a request by the pagedaemon
1590	* process, invokes a procedure specified in the buffer structure" ]
1591	*
1592	* In real life, the pagedaemon (or other system processes) wants
1593	* to do async stuff too, and doesn't want the buffer brelse()'d.
1594	* (for swap pager, that puts swap buffers on the free lists (!!!),
1595	* for the vn device, that puts allocated buffers on the free lists!)
1596	*/
1597	void
1598	biodone(buf_t *bp)
1599	{
1600	int s;
1601
1602	BIOHIST_FUNC(__func__);
1603
1604	KASSERT(!ISSET(bp->b_oflags, BO_DONE));
1605
1606	if (cpu_intr_p()) {
1607	/ From interrupt mode: defer to a soft interrupt. /
1608	s = splvm();
1609	TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
1610
1611	BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
1612	(uintptr_t)bp, `0`, `0`, `0`);
1613	softint_schedule(biodone_sih);
1614	splx(s);
1615	} else {
1616	/ Process now - the buffer may be freed soon. /
1617	biodone2(bp);
1618	}
1619	}
1620
1621	SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf "/bp/*);
1622
1623	static void
1624	biodone2(buf_t *bp)
1625	{
1626	void (callout)(buf_t );
1627
1628	SDT_PROBE1(io, kernel, ,done, bp);
1629
1630	BIOHIST_FUNC(__func__);
1631	BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, `0`, `0`, `0`);
1632
1633	mutex_enter(bp->b_objlock);
1634	/ Note that the transfer is done. /
1635	if (ISSET(bp->b_oflags, BO_DONE))
1636	panic("biodone2 already");
1637	CLR(bp->b_flags, B_COWDONE);
1638	SET(bp->b_oflags, BO_DONE);
1639	BIO_SETPRIO(bp, BPRIO_DEFAULT);
1640
1641	/ Wake up waiting writers. /
1642	if (!ISSET(bp->b_flags, B_READ))
1643	vwakeup(bp);
1644
1645	if ((callout = bp->b_iodone) != NULL) {
1646	BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
1647	`0`, `0`, `0`);
1648
1649	/ Note callout done, then call out. /
1650	KASSERT(!cv_has_waiters(&bp->b_done));
1651	KERNEL_LOCK(`1`, NULL); / XXXSMP /
1652	bp->b_iodone = NULL;
1653	mutex_exit(bp->b_objlock);
1654	(*callout)(bp);
1655	KERNEL_UNLOCK_ONE(NULL); / XXXSMP /
1656	} else if (ISSET(bp->b_flags, B_ASYNC)) {
1657	/ If async, release. /
1658	BIOHIST_LOG(biohist, "async", `0`, `0`, `0`, `0`);
1659	KASSERT(!cv_has_waiters(&bp->b_done));
1660	mutex_exit(bp->b_objlock);
1661	brelse(bp, `0`);
1662	} else {
1663	/ Otherwise just wake up waiters in biowait(). /
1664	BIOHIST_LOG(biohist, "wake-up", `0`, `0`, `0`, `0`);
1665	cv_broadcast(&bp->b_done);
1666	mutex_exit(bp->b_objlock);
1667	}
1668	}
1669
1670	static void
1671	biointr(void *cookie)
1672	{
1673	struct cpu_info *ci;
1674	buf_t *bp;
1675	int s;
1676
1677	BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
1678
1679	ci = curcpu();
1680
1681	s = splvm();
1682	while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
1683	KASSERT(curcpu() == ci);
1684
1685	bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
1686	TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
1687	splx(s);
1688
1689	BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, `0`, `0`, `0`);
1690	biodone2(bp);
1691
1692	s = splvm();
1693	}
1694	splx(s);
1695	}
1696
1697	/*
1698	* Wait for all buffers to complete I/O
1699	* Return the number of "stuck" buffers.
1700	*/
1701	int
1702	buf_syncwait(void)
1703	{
1704	buf_t *bp;
1705	int iter, nbusy, nbusy_prev = `0`, ihash;
1706
1707	BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
1708
1709	for (iter = `0`; iter < `20`;) {
1710	mutex_enter(&bufcache_lock);
1711	nbusy = `0`;
1712	for (ihash = `0`; ihash < bufhash+`1`; ihash++) {
1713	LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1714	if ((bp->b_cflags & (BC_BUSY\|BC_INVAL)) == BC_BUSY)
1715	nbusy += ((bp->b_flags & B_READ) == `0`);
1716	}
1717	}
1718	mutex_exit(&bufcache_lock);
1719
1720	if (nbusy == `0`)
1721	break;
1722	if (nbusy_prev == `0`)
1723	nbusy_prev = nbusy;
1724	printf("%d ", nbusy);
1725	kpause("bflush", false, MAX(`1`, hz / `25` * iter), NULL);
1726	if (nbusy >= nbusy_prev) / we didn't flush anything /
1727	iter++;
1728	else
1729	nbusy_prev = nbusy;
1730	}
1731
1732	if (nbusy) {
1733	#if defined(DEBUG) \|\| defined(DEBUG_HALT_BUSY)
1734	printf("giving up\nPrinting vnodes for busy buffers\n");
1735	for (ihash = `0`; ihash < bufhash+`1`; ihash++) {
1736	LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
1737	if ((bp->b_cflags & (BC_BUSY\|BC_INVAL)) == BC_BUSY &&
1738	(bp->b_flags & B_READ) == `0`)
1739	vprint(NULL, bp->b_vp);
1740	}
1741	}
1742	#endif
1743	}
1744
1745	return nbusy;
1746	}
1747
1748	static void
1749	sysctl_fillbuf(const buf_t i, struct* buf_sysctl *o)
1750	{
1751	const bool allowaddr = get_expose_address(curproc);
1752
1753	memset(o, `0`, sizeof(*o));
1754
1755	o->b_flags = i->b_flags \| i->b_cflags \| i->b_oflags;
1756	o->b_error = i->b_error;
1757	o->b_prio = i->b_prio;
1758	o->b_dev = i->b_dev;
1759	o->b_bufsize = i->b_bufsize;
1760	o->b_bcount = i->b_bcount;
1761	o->b_resid = i->b_resid;
1762	COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
1763	o->b_blkno = i->b_blkno;
1764	o->b_rawblkno = i->b_rawblkno;
1765	COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
1766	COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
1767	COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
1768	COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
1769	o->b_lblkno = i->b_lblkno;
1770	}
1771
1772	#define KERN_BUFSLOP 20
1773	static int
1774	sysctl_dobuf(SYSCTLFN_ARGS)
1775	{
1776	buf_t *bp;
1777	struct buf_sysctl bs;
1778	struct bqueue *bq;
1779	char *dp;
1780	u_int i, op, arg;
1781	size_t len, needed, elem_size, out_size;
1782	int error, elem_count, retries;
1783
1784	if (namelen == `1` && name[`0`] == CTL_QUERY)
1785	return (sysctl_query(SYSCTLFN_CALL(rnode)));
1786
1787	if (namelen != `4`)
1788	return (EINVAL);
1789
1790	retries = `100`;
1791	retry:
1792	dp = oldp;
1793	len = (oldp != NULL) ? *oldlenp : `0`;
1794	op = name[`0`];
1795	arg = name[`1`];
1796	elem_size = name[`2`];
1797	elem_count = name[`3`];
1798	out_size = MIN(sizeof(bs), elem_size);
1799
1800	/*
1801	* at the moment, these are just "placeholders" to make the
1802	* API for retrieving kern.buf data more extensible in the
1803	* future.
1804	*
1805	* XXX kern.buf currently has "netbsd32" issues. hopefully
1806	* these will be resolved at a later point.
1807	*/
1808	if (op != KERN_BUF_ALL \|\| arg != KERN_BUF_ALL \|\|
1809	elem_size < `1` \|\| elem_count < `0`)
1810	return (EINVAL);
1811
1812	error = `0`;
1813	needed = `0`;
1814	sysctl_unlock();
1815	mutex_enter(&bufcache_lock);
1816	for (i = `0`; i < BQUEUES; i++) {
1817	bq = &bufqueues[i];
1818	TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
1819	bq->bq_marker = bp;
1820	if (len >= elem_size && elem_count > `0`) {
1821	sysctl_fillbuf(bp, &bs);
1822	mutex_exit(&bufcache_lock);
1823	error = copyout(&bs, dp, out_size);
1824	mutex_enter(&bufcache_lock);
1825	if (error)
1826	break;
1827	if (bq->bq_marker != bp) {
1828	/*
1829	* This sysctl node is only for
1830	* statistics. Retry; if the
1831	* queue keeps changing, then
1832	* bail out.
1833	*/
1834	if (retries-- == `0`) {
1835	error = EAGAIN;
1836	break;
1837	}
1838	mutex_exit(&bufcache_lock);
1839	sysctl_relock();
1840	goto retry;
1841	}
1842	dp += elem_size;
1843	len -= elem_size;
1844	}
1845	needed += elem_size;
1846	if (elem_count > `0` && elem_count != INT_MAX)
1847	elem_count--;
1848	}
1849	if (error != `0`)
1850	break;
1851	}
1852	mutex_exit(&bufcache_lock);
1853	sysctl_relock();
1854
1855	*oldlenp = needed;
1856	if (oldp == NULL)
1857	oldlenp += KERN_BUFSLOP sizeof(buf_t);
1858
1859	return (error);
1860	}
1861
1862	static int
1863	sysctl_bufvm_update(SYSCTLFN_ARGS)
1864	{
1865	int error, rv;
1866	struct sysctlnode node;
1867	unsigned int temp_bufcache;
1868	unsigned long temp_water;
1869
1870	/ Take a copy of the supplied node and its data /
1871	node = *rnode;
1872	if (node.sysctl_data == &bufcache) {
1873	node.sysctl_data = &temp_bufcache;
1874	temp_bufcache = (unsigned* int *)rnode->sysctl_data;
1875	} else {
1876	node.sysctl_data = &temp_water;
1877	temp_water = (unsigned* long *)rnode->sysctl_data;
1878	}
1879
1880	/ Update the copy /
1881	error = sysctl_lookup(SYSCTLFN_CALL(&node));
1882	if (error \|\| newp == NULL)
1883	return (error);
1884
1885	if (rnode->sysctl_data == &bufcache) {
1886	if (temp_bufcache > `100`)
1887	return (EINVAL);
1888	bufcache = temp_bufcache;
1889	buf_setwm();
1890	} else if (rnode->sysctl_data == &bufmem_lowater) {
1891	if (bufmem_hiwater - temp_water < `16`)
1892	return (EINVAL);
1893	bufmem_lowater = temp_water;
1894	} else if (rnode->sysctl_data == &bufmem_hiwater) {
1895	if (temp_water - bufmem_lowater < `16`)
1896	return (EINVAL);
1897	bufmem_hiwater = temp_water;
1898	} else
1899	return (EINVAL);
1900
1901	/ Drain until below new high water mark /
1902	sysctl_unlock();
1903	mutex_enter(&bufcache_lock);
1904	while (bufmem > bufmem_hiwater) {
1905	rv = buf_drain((bufmem - bufmem_hiwater) / (`2` * `1024`));
1906	if (rv <= `0`)
1907	break;
1908	}
1909	mutex_exit(&bufcache_lock);
1910	sysctl_relock();
1911
1912	return `0`;
1913	}
1914
1915	static struct sysctllog *vfsbio_sysctllog;
1916
1917	static void
1918	sysctl_kern_buf_setup(void)
1919	{
1920
1921	sysctl_createv(&vfsbio_sysctllog, `0`, NULL, NULL,
1922	CTLFLAG_PERMANENT,
1923	CTLTYPE_NODE, "buf",
1924	SYSCTL_DESCR("Kernel buffer cache information"),
1925	sysctl_dobuf, `0`, NULL, `0`,
1926	CTL_KERN, KERN_BUF, CTL_EOL);
1927	}
1928
1929	static void
1930	sysctl_vm_buf_setup(void)
1931	{
1932
1933	sysctl_createv(&vfsbio_sysctllog, `0`, NULL, NULL,
1934	CTLFLAG_PERMANENT\|CTLFLAG_READWRITE,
1935	CTLTYPE_INT, "bufcache",
1936	SYSCTL_DESCR("Percentage of physical memory to use for "
1937	"buffer cache"),
1938	sysctl_bufvm_update, `0`, &bufcache, `0`,
1939	CTL_VM, CTL_CREATE, CTL_EOL);
1940	sysctl_createv(&vfsbio_sysctllog, `0`, NULL, NULL,
1941	CTLFLAG_PERMANENT\|CTLFLAG_READONLY,
1942	CTLTYPE_LONG, "bufmem",
1943	SYSCTL_DESCR("Amount of kernel memory used by buffer "
1944	"cache"),
1945	NULL, `0`, &bufmem, `0`,
1946	CTL_VM, CTL_CREATE, CTL_EOL);
1947	sysctl_createv(&vfsbio_sysctllog, `0`, NULL, NULL,
1948	CTLFLAG_PERMANENT\|CTLFLAG_READWRITE,
1949	CTLTYPE_LONG, "bufmem_lowater",
1950	SYSCTL_DESCR("Minimum amount of kernel memory to "
1951	"reserve for buffer cache"),
1952	sysctl_bufvm_update, `0`, &bufmem_lowater, `0`,
1953	CTL_VM, CTL_CREATE, CTL_EOL);
1954	sysctl_createv(&vfsbio_sysctllog, `0`, NULL, NULL,
1955	CTLFLAG_PERMANENT\|CTLFLAG_READWRITE,
1956	CTLTYPE_LONG, "bufmem_hiwater",
1957	SYSCTL_DESCR("Maximum amount of kernel memory to use "
1958	"for buffer cache"),
1959	sysctl_bufvm_update, `0`, &bufmem_hiwater, `0`,
1960	CTL_VM, CTL_CREATE, CTL_EOL);
1961	}
1962
#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 *
 * For each free queue, prints the total number of buffers followed by
 * a "size-count" pair for every buffer size (in PAGE_SIZE steps) that
 * occurs at least once.
 */
void
vfs_bufstats(void)
{
	int i, j, count;
	buf_t *bp;
	struct bqueue *dp;
	int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
	static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		memset(counts, 0, sizeof(counts));
		TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
			/* Histogram bucket = buffer size in pages. */
			counts[bp->b_bufsize / PAGE_SIZE]++;
			count++;
		}
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
		printf("\n");
	}
}
#endif /* DEBUG */
1993
/* ------------------------------ */
1995
1996	buf_t *
1997	getiobuf(struct vnode *vp, bool waitok)
1998	{
1999	buf_t *bp;
2000
2001	bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
2002	if (bp == NULL)
2003	return bp;
2004
2005	buf_init(bp);
2006
2007	if ((bp->b_vp = vp) != NULL) {
2008	bp->b_objlock = vp->v_interlock;
2009	} else {
2010	KASSERT(bp->b_objlock == &buffer_lock);
2011	}
2012
2013	return bp;
2014	}
2015
2016	void
2017	putiobuf(buf_t *bp)
2018	{
2019
2020	buf_destroy(bp);
2021	pool_cache_put(bufio_cache, bp);
2022	}
2023
2024	/*
2025	* nestiobuf_iodone: b_iodone callback for nested buffers.
2026	*/
2027
2028	void
2029	nestiobuf_iodone(buf_t *bp)
2030	{
2031	buf_t *mbp = bp->b_private;
2032	int error;
2033	int donebytes;
2034
2035	KASSERT(bp->b_bcount <= bp->b_bufsize);
2036	KASSERT(mbp != bp);
2037
2038	error = bp->b_error;
2039	if (bp->b_error == `0` &&
2040	(bp->b_bcount < bp->b_bufsize \|\| bp->b_resid > `0`)) {
2041	/*
2042	* Not all got transfered, raise an error. We have no way to
2043	* propagate these conditions to mbp.
2044	*/
2045	error = EIO;
2046	}
2047
2048	donebytes = bp->b_bufsize;
2049
2050	putiobuf(bp);
2051	nestiobuf_done(mbp, donebytes, error);
2052	}
2053
2054	/*
2055	* nestiobuf_setup: setup a "nested" buffer.
2056	*
2057	* => 'mbp' is a "master" buffer which is being divided into sub pieces.
2058	* => 'bp' should be a buffer allocated by getiobuf.
2059	* => 'offset' is a byte offset in the master buffer.
2060	* => 'size' is a size in bytes of this nested buffer.
2061	*/
2062
2063	void
2064	nestiobuf_setup(buf_t mbp, buf_t bp, int offset, size_t size)
2065	{
2066	const int b_pass = mbp->b_flags & (B_READ\|B_MEDIA_FLAGS);
2067	struct vnode *vp = mbp->b_vp;
2068
2069	KASSERT(mbp->b_bcount >= offset + size);
2070	bp->b_vp = vp;
2071	bp->b_dev = mbp->b_dev;
2072	bp->b_objlock = mbp->b_objlock;
2073	bp->b_cflags = BC_BUSY;
2074	bp->b_flags = B_ASYNC \| b_pass;
2075	bp->b_iodone = nestiobuf_iodone;
2076	bp->b_data = (char *)mbp->b_data + offset;
2077	bp->b_resid = bp->b_bcount = size;
2078	bp->b_bufsize = bp->b_bcount;
2079	bp->b_private = mbp;
2080	BIO_COPYPRIO(bp, mbp);
2081	if (BUF_ISWRITE(bp) && vp != NULL) {
2082	mutex_enter(vp->v_interlock);
2083	vp->v_numoutput++;
2084	mutex_exit(vp->v_interlock);
2085	}
2086	}
2087
2088	/*
2089	* nestiobuf_done: propagate completion to the master buffer.
2090	*
2091	* => 'donebytes' specifies how many bytes in the 'mbp' is completed.
2092	* => 'error' is an errno(2) that 'donebytes' has been completed with.
2093	*/
2094
2095	void
2096	nestiobuf_done(buf_t mbp, int* donebytes, int error)
2097	{
2098
2099	if (donebytes == `0`) {
2100	return;
2101	}
2102	mutex_enter(mbp->b_objlock);
2103	KASSERT(mbp->b_resid >= donebytes);
2104	mbp->b_resid -= donebytes;
2105	if (error)
2106	mbp->b_error = error;
2107	if (mbp->b_resid == `0`) {
2108	if (mbp->b_error)
2109	mbp->b_resid = mbp->b_bcount;
2110	mutex_exit(mbp->b_objlock);
2111	biodone(mbp);
2112	} else
2113	mutex_exit(mbp->b_objlock);
2114	}
2115
2116	void
2117	buf_init(buf_t *bp)
2118	{
2119
2120	cv_init(&bp->b_busy, "biolock");
2121	cv_init(&bp->b_done, "biowait");
2122	bp->b_dev = NODEV;
2123	bp->b_error = `0`;
2124	bp->b_flags = `0`;
2125	bp->b_cflags = `0`;
2126	bp->b_oflags = `0`;
2127	bp->b_objlock = &buffer_lock;
2128	bp->b_iodone = NULL;
2129	bp->b_refcnt = `1`;
2130	bp->b_dev = NODEV;
2131	bp->b_vnbufs.le_next = NOLIST;
2132	BIO_SETPRIO(bp, BPRIO_DEFAULT);
2133	}
2134
2135	void
2136	buf_destroy(buf_t *bp)
2137	{
2138
2139	cv_destroy(&bp->b_done);
2140	cv_destroy(&bp->b_busy);
2141	}
2142
2143	int
2144	bbusy(buf_t bp, bool intr, int* timo, kmutex_t *interlock)
2145	{
2146	int error;
2147
2148	KASSERT(mutex_owned(&bufcache_lock));
2149
2150	if ((bp->b_cflags & BC_BUSY) != `0`) {
2151	if (curlwp == uvm.pagedaemon_lwp)
2152	return EDEADLK;
2153	bp->b_cflags \|= BC_WANTED;
2154	bref(bp);
2155	if (interlock != NULL)
2156	mutex_exit(interlock);
2157	if (intr) {
2158	error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
2159	timo);
2160	} else {
2161	error = cv_timedwait(&bp->b_busy, &bufcache_lock,
2162	timo);
2163	}
2164	brele(bp);
2165	if (interlock != NULL)
2166	mutex_enter(interlock);
2167	if (error != `0`)
2168	return error;
2169	return EPASSTHROUGH;
2170	}
2171	bp->b_cflags \|= BC_BUSY;
2172
2173	return `0`;
2174	}
2175
2176	/*
2177	* Nothing outside this file should really need to know about nbuf,
2178	* but a few things still want to read it, so give them a way to do that.
2179	*/
2180	int
2181	buf_nbuf(void)
2182	{
2183
2184	return nbuf;
2185	}
2186

Browse the source code of netbsd/sys/kern/vfs_bio.c