/* kern_exec.c source code [netbsd/sys/kern/kern_exec.c] */

/*	$NetBSD: kern_exec.c,v 1.478 2019/07/05 17:14:48 maxv Exp $	*/
2
/*-
4	* Copyright (c) 2008 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* Redistribution and use in source and binary forms, with or without
8	* modification, are permitted provided that the following conditions
9	* are met:
10	* 1. Redistributions of source code must retain the above copyright
11	* notice, this list of conditions and the following disclaimer.
12	* 2. Redistributions in binary form must reproduce the above copyright
13	* notice, this list of conditions and the following disclaimer in the
14	* documentation and/or other materials provided with the distribution.
15	*
16	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26	* POSSIBILITY OF SUCH DAMAGE.
27	*/
28
/*-
30	* Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
31	* Copyright (C) 1992 Wolfgang Solfrank.
32	* Copyright (C) 1992 TooLs GmbH.
33	* All rights reserved.
34	*
35	* Redistribution and use in source and binary forms, with or without
36	* modification, are permitted provided that the following conditions
37	* are met:
38	* 1. Redistributions of source code must retain the above copyright
39	* notice, this list of conditions and the following disclaimer.
40	* 2. Redistributions in binary form must reproduce the above copyright
41	* notice, this list of conditions and the following disclaimer in the
42	* documentation and/or other materials provided with the distribution.
43	* 3. All advertising materials mentioning features or use of this software
44	* must display the following acknowledgement:
45	* This product includes software developed by TooLs GmbH.
46	* 4. The name of TooLs GmbH may not be used to endorse or promote products
47	* derived from this software without specific prior written permission.
48	*
49	* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
50	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52	* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
53	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
54	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
55	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
56	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
57	* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
58	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59	*/
60
61	#include <sys/cdefs.h>
62	__KERNEL_RCSID(`0`, "$NetBSD: kern_exec.c,v 1.478 2019/07/05 17:14:48 maxv Exp $");
63
64	#include "opt_exec.h"
65	#include "opt_execfmt.h"
66	#include "opt_ktrace.h"
67	#include "opt_modular.h"
68	#include "opt_syscall_debug.h"
69	#include "veriexec.h"
70	#include "opt_pax.h"
71
72	#include <sys/param.h>
73	#include <sys/systm.h>
74	#include <sys/filedesc.h>
75	#include <sys/kernel.h>
76	#include <sys/proc.h>
77	#include <sys/ptrace.h>
78	#include <sys/mount.h>
79	#include <sys/kmem.h>
80	#include <sys/namei.h>
81	#include <sys/vnode.h>
82	#include <sys/file.h>
83	#include <sys/filedesc.h>
84	#include <sys/acct.h>
85	#include <sys/atomic.h>
86	#include <sys/exec.h>
87	#include <sys/ktrace.h>
88	#include <sys/uidinfo.h>
89	#include <sys/wait.h>
90	#include <sys/mman.h>
91	#include <sys/ras.h>
92	#include <sys/signalvar.h>
93	#include <sys/stat.h>
94	#include <sys/syscall.h>
95	#include <sys/kauth.h>
96	#include <sys/lwpctl.h>
97	#include <sys/pax.h>
98	#include <sys/cpu.h>
99	#include <sys/module.h>
100	#include <sys/syscallvar.h>
101	#include <sys/syscallargs.h>
102	#if NVERIEXEC > 0
103	#include <sys/verified_exec.h>
104	#endif /* NVERIEXEC > 0 */
105	#include <sys/sdt.h>
106	#include <sys/spawn.h>
107	#include <sys/prot.h>
108	#include <sys/cprng.h>
109
110	#include <uvm/uvm_extern.h>
111
112	#include <machine/reg.h>
113
114	#include <compat/common/compat_util.h>
115
/*
 * MD_TOPDOWN_INIT(epp): hook applied to a freshly initialized exec package.
 * Unless the macro is already defined, it sets EXEC_TOPDOWN_VM in ep_flags
 * when __USE_TOPDOWN_VM is defined and expands to nothing otherwise.
 */
#ifndef MD_TOPDOWN_INIT
#ifdef __USE_TOPDOWN_VM
#define	MD_TOPDOWN_INIT(epp)	((epp)->ep_flags |= EXEC_TOPDOWN_VM)
#else
#define	MD_TOPDOWN_INIT(epp)
#endif
#endif
123
124	struct execve_data;
125
126	extern int user_va0_disable;
127
128	static size_t calcargs(struct execve_data * restrict, const size_t);
129	static size_t calcstack(struct execve_data * restrict, const size_t);
130	static int copyoutargs(struct execve_data * restrict, struct lwp *,
131	char * const);
132	static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
133	static int copyinargs(struct execve_data * restrict, char * const *,
134	char * const , execve_fetch_element_t, char* **);
135	static int copyinargstrs(struct execve_data * restrict, char * const *,
136	execve_fetch_element_t, char *, size_t , void ()(const* void *, size_t));
137	static int exec_sigcode_map(struct proc , const* struct emul *);
138
/*
 * Debug output helpers.  Building with DEBUG turns on DEBUG_EXEC as well.
 * Without DEBUG_EXEC all three macros compile away.
 *
 * DPRINTF takes a fully parenthesized printf argument list:
 *	DPRINTF(("%s: ...\n", __func__, ...));
 */
#if defined(DEBUG) && !defined(DEBUG_EXEC)
#define DEBUG_EXEC
#endif

#ifndef DEBUG_EXEC
#define DPRINTF(a)
#define COPYPRINTF(s, a, b)
#define DUMPVMCMDS(p, x, e)	do {} while (0)
#else /* DEBUG_EXEC */
static void dump_vmcmds(const struct exec_package * const, size_t, int);
#define DPRINTF(a)		printf a
#define COPYPRINTF(s, a, b)					\
	printf("%s, %d: copyout%s @%p %zu\n", __func__,		\
	    __LINE__, (s), (a), (b))
#define DUMPVMCMDS(p, x, e)					\
	do {							\
		dump_vmcmds((p), (x), (e));			\
	} while (0)
#endif /* DEBUG_EXEC */
153
154	/*
155	* DTrace SDT provider definitions
156	*/
157	SDT_PROVIDER_DECLARE(proc);
158	SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
159	SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
160	SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
161
162	/*
163	* Exec function switch:
164	*
165	* Note that each makecmds function is responsible for loading the
166	* exec package with the necessary functions for any exec-type-specific
167	* handling.
168	*
169	* Functions for specific exec types should be defined in their own
170	* header file.
171	*/
172	static const struct execsw **execsw = NULL;
173	static int nexecs;
174
175	u_int exec_maxhdrsz; / must not be static - used by netbsd32 /
176
177	/ list of dynamically loaded execsw entries /
178	static LIST_HEAD(execlist_head, exec_entry) ex_head =
179	LIST_HEAD_INITIALIZER(ex_head);
180	struct exec_entry {
181	LIST_ENTRY(exec_entry) ex_list;
182	SLIST_ENTRY(exec_entry) ex_slist;
183	const struct execsw *ex_sw;
184	};
185
#ifndef __HAVE_SYSCALL_INTERN
void	syscall(void);
#endif

/* NetBSD autoloadable syscalls */
#ifdef MODULAR
#include <kern/syscalls_autoload.c>
#endif
194
195	/ NetBSD emul struct /
196	struct emul emul_netbsd = {
197	.e_name = "netbsd",
198	#ifdef EMUL_NATIVEROOT
199	.e_path = EMUL_NATIVEROOT,
200	#else
201	.e_path = NULL,
202	#endif
203	#ifndef __HAVE_MINIMAL_EMUL
204	.e_flags = EMUL_HAS_SYS___syscall,
205	.e_errno = NULL,
206	.e_nosys = SYS_syscall,
207	.e_nsysent = SYS_NSYSENT,
208	#endif
209	#ifdef MODULAR
210	.e_sc_autoload = netbsd_syscalls_autoload,
211	#endif
212	.e_sysent = sysent,
213	.e_nomodbits = sysent_nomodbits,
214	#ifdef SYSCALL_DEBUG
215	.e_syscallnames = syscallnames,
216	#else
217	.e_syscallnames = NULL,
218	#endif
219	.e_sendsig = sendsig,
220	.e_trapsignal = trapsignal,
221	.e_sigcode = NULL,
222	.e_esigcode = NULL,
223	.e_sigobject = NULL,
224	.e_setregs = setregs,
225	.e_proc_exec = NULL,
226	.e_proc_fork = NULL,
227	.e_proc_exit = NULL,
228	.e_lwp_fork = NULL,
229	.e_lwp_exit = NULL,
230	#ifdef __HAVE_SYSCALL_INTERN
231	.e_syscall_intern = syscall_intern,
232	#else
233	.e_syscall = syscall,
234	#endif
235	.e_sysctlovly = NULL,
236	.e_vm_default_addr = uvm_default_mapaddr,
237	.e_usertrap = NULL,
238	.e_ucsize = sizeof(ucontext_t),
239	.e_startlwp = startlwp
240	};
241
242	/*
243	* Exec lock. Used to control access to execsw[] structures.
244	* This must not be static so that netbsd32 can access it, too.
245	*/
246	krwlock_t exec_lock;
247
248	static kmutex_t sigobject_lock;
249
250	/*
251	* Data used between a loadvm and execve part of an "exec" operation
252	*/
253	struct execve_data {
254	struct exec_package ed_pack;
255	struct pathbuf *ed_pathbuf;
256	struct vattr ed_attr;
257	struct ps_strings ed_arginfo;
258	char *ed_argp;
259	const char *ed_pathstring;
260	char *ed_resolvedpathbuf;
261	size_t ed_ps_strings_sz;
262	int ed_szsigcode;
263	size_t ed_argslen;
264	long ed_argc;
265	long ed_envc;
266	};
267
268	/*
269	* data passed from parent lwp to child during a posix_spawn()
270	*/
271	struct spawn_exec_data {
272	struct execve_data sed_exec;
273	struct posix_spawn_file_actions
274	*sed_actions;
275	struct posix_spawnattr *sed_attrs;
276	struct proc *sed_parent;
277	kcondvar_t sed_cv_child_ready;
278	kmutex_t sed_mtx_child;
279	int sed_error;
280	volatile uint32_t sed_refcnt;
281	};
282
283	static struct vm_map *exec_map;
284	static struct pool exec_pool;
285
286	static void *
287	exec_pool_alloc(struct pool pp, int* flags)
288	{
289
290	return (void *)uvm_km_alloc(exec_map, NCARGS, `0`,
291	UVM_KMF_PAGEABLE \| UVM_KMF_WAITVA);
292	}
293
294	static void
295	exec_pool_free(struct pool pp, void* *addr)
296	{
297
298	uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
299	}
300
301	static struct pool_allocator exec_palloc = {
302	.pa_alloc = exec_pool_alloc,
303	.pa_free = exec_pool_free,
304	.pa_pagesz = NCARGS
305	};
306
307	/*
308	* check exec:
309	* given an "executable" described in the exec package's namei info,
310	* see what we can do with it.
311	*
312	* ON ENTRY:
313	* exec package with appropriate namei info
314	* lwp pointer of exec'ing lwp
315	* NO SELF-LOCKED VNODES
316	*
317	* ON EXIT:
318	* error: nothing held, etc. exec header still allocated.
319	* ok: filled exec package, executable's vnode (unlocked).
320	*
321	* EXEC SWITCH ENTRY:
322	* Locked vnode to check, exec package, proc.
323	*
324	* EXEC SWITCH EXIT:
325	* ok: return 0, filled exec package, executable's vnode (unlocked).
326	* error: destructive:
327	* everything deallocated execept exec header.
328	* non-destructive:
329	* error code, executable's vnode (unlocked),
330	* exec header unmodified.
331	*/
332	int
333	/ARGSUSED/
334	check_exec(struct lwp l, struct* exec_package epp, struct* pathbuf *pb)
335	{
336	int error, i;
337	struct vnode *vp;
338	struct nameidata nd;
339	size_t resid;
340
341	#if 1
342	// grab the absolute pathbuf here before namei() trashes it.
343	pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
344	#endif
345	NDINIT(&nd, LOOKUP, FOLLOW \| LOCKLEAF \| TRYEMULROOT, pb);
346
347	/ first get the vnode /
348	if ((error = namei(&nd)) != `0`)
349	return error;
350	epp->ep_vp = vp = nd.ni_vp;
351	#if 0
352	/*
353	* XXX: can't use nd.ni_pnbuf, because although pb contains an
354	* absolute path, nd.ni_pnbuf does not if the path contains symlinks.
355	*/
356	/ normally this can't fail /
357	error = copystr(nd.ni_pnbuf, epp->ep_resolvedname, PATH_MAX, NULL);
358	KASSERT(error == `0`);
359	#endif
360
361	#ifdef DIAGNOSTIC
362	/ paranoia (take this out once namei stuff stabilizes) /
363	memset(nd.ni_pnbuf, `'~'`, PATH_MAX);
364	#endif
365
366	/ check access and type /
367	if (vp->v_type != VREG) {
368	error = EACCES;
369	goto bad1;
370	}
371	if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != `0`)
372	goto bad1;
373
374	/ get attributes /
375	if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != `0`)
376	goto bad1;
377
378	/ Check mount point /
379	if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
380	error = EACCES;
381	goto bad1;
382	}
383	if (vp->v_mount->mnt_flag & MNT_NOSUID)
384	epp->ep_vap->va_mode &= ~(S_ISUID \| S_ISGID);
385
386	/ try to open it /
387	if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != `0`)
388	goto bad1;
389
390	/ unlock vp, since we need it unlocked from here on out. /
391	VOP_UNLOCK(vp);
392
393	#if NVERIEXEC > 0
394	error = veriexec_verify(l, vp, epp->ep_resolvedname,
395	epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
396	NULL);
397	if (error)
398	goto bad2;
399	#endif /* NVERIEXEC > 0 */
400
401	#ifdef PAX_SEGVGUARD
402	error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
403	if (error)
404	goto bad2;
405	#endif /* PAX_SEGVGUARD */
406
407	/ now we have the file, get the exec header /
408	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, `0`,
409	UIO_SYSSPACE, `0`, l->l_cred, &resid, NULL);
410	if (error)
411	goto bad2;
412	epp->ep_hdrvalid = epp->ep_hdrlen - resid;
413
414	/*
415	* Set up default address space limits. Can be overridden
416	* by individual exec packages.
417	*/
418	epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
419	epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
420
421	/*
422	* set up the vmcmds for creation of the process
423	* address space
424	*/
425	error = ENOEXEC;
426	for (i = `0`; i < nexecs; i++) {
427	int newerror;
428
429	epp->ep_esch = execsw[i];
430	newerror = (*execsw[i]->es_makecmds)(l, epp);
431
432	if (!newerror) {
433	/ Seems ok: check that entry point is not too high /
434	if (epp->ep_entry >= epp->ep_vm_maxaddr) {
435	#ifdef DIAGNOSTIC
436	printf("%s: rejecting %p due to "
437	"too high entry address (>= %p)\n",
438	__func__, (void *)epp->ep_entry,
439	(void *)epp->ep_vm_maxaddr);
440	#endif
441	error = ENOEXEC;
442	break;
443	}
444	/ Seems ok: check that entry point is not too low /
445	if (epp->ep_entry < epp->ep_vm_minaddr) {
446	#ifdef DIAGNOSTIC
447	printf("%s: rejecting %p due to "
448	"too low entry address (< %p)\n",
449	__func__, (void *)epp->ep_entry,
450	(void *)epp->ep_vm_minaddr);
451	#endif
452	error = ENOEXEC;
453	break;
454	}
455
456	/ check limits /
457	if ((epp->ep_tsize > MAXTSIZ) \|\|
458	(epp->ep_dsize > (u_quad_t)l->l_proc->p_rlimit
459	[RLIMIT_DATA].rlim_cur)) {
460	#ifdef DIAGNOSTIC
461	printf("%s: rejecting due to "
462	"limits (t=%llu > %llu \|\| d=%llu > %llu)\n",
463	__func__,
464	(unsigned long long)epp->ep_tsize,
465	(unsigned long long)MAXTSIZ,
466	(unsigned long long)epp->ep_dsize,
467	(unsigned long long)
468	l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur);
469	#endif
470	error = ENOMEM;
471	break;
472	}
473	return `0`;
474	}
475
476	/*
477	* Reset all the fields that may have been modified by the
478	* loader.
479	*/
480	KASSERT(epp->ep_emul_arg == NULL);
481	if (epp->ep_emul_root != NULL) {
482	vrele(epp->ep_emul_root);
483	epp->ep_emul_root = NULL;
484	}
485	if (epp->ep_interp != NULL) {
486	vrele(epp->ep_interp);
487	epp->ep_interp = NULL;
488	}
489	epp->ep_pax_flags = `0`;
490
491	/ make sure the first "interesting" error code is saved. /
492	if (error == ENOEXEC)
493	error = newerror;
494
495	if (epp->ep_flags & EXEC_DESTR)
496	/ Error from "#!" code, tidied up by recursive call /
497	return error;
498	}
499
500	/ not found, error /
501
502	/*
503	* free any vmspace-creation commands,
504	* and release their references
505	*/
506	kill_vmcmds(&epp->ep_vmcmds);
507
508	bad2:
509	/*
510	* close and release the vnode, restore the old one, free the
511	* pathname buf, and punt.
512	*/
513	vn_lock(vp, LK_EXCLUSIVE \| LK_RETRY);
514	VOP_CLOSE(vp, FREAD, l->l_cred);
515	vput(vp);
516	return error;
517
518	bad1:
519	/*
520	* free the namei pathname buffer, and put the vnode
521	* (which we don't yet have open).
522	*/
523	vput(vp); / was still locked /
524	return error;
525	}
526
/* One page (NBPG) when the machine stack grows upward, nothing otherwise. */
#ifndef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE	0
#else
#define STACK_PTHREADSPACE	NBPG
#endif
532
533	static int
534	execve_fetch_element(char * const array, size_t index, char* **value)
535	{
536	return copyin(array + index, value, sizeof(*value));
537	}
538
539	/*
540	* exec system call
541	*/
542	int
543	sys_execve(struct lwp l, const* struct sys_execve_args uap, register_t retval)
544	{
545	/ {*
546	syscallarg(const char ) path;*
547	syscallarg(char const ) argp;
548	syscallarg(char const ) envp;
549	} /*
550
551	return execve1(l, SCARG(uap, path), SCARG(uap, argp),
552	SCARG(uap, envp), execve_fetch_element);
553	}
554
555	int
556	sys_fexecve(struct lwp l, const* struct sys_fexecve_args *uap,
557	register_t *retval)
558	{
559	/ {*
560	syscallarg(int) fd;
561	syscallarg(char const ) argp;
562	syscallarg(char const ) envp;
563	} /*
564
565	return ENOSYS;
566	}
567
/*
 * Load modules to try and execute an image that we do not understand.
 * If no execsw entries are present, we load those likely to be needed
 * in order to run native images only.  Otherwise, we autoload all
 * possible modules that could let us run the binary.  XXX lame
 *
 * Does nothing unless the kernel is built with MODULAR.
 */
static void
exec_autoload(void)
{
#ifdef MODULAR
	static const char * const native[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		NULL
	};
	static const char * const compat[] = {
		"exec_elf32",
		"exec_elf64",
		"exec_script",
		"exec_aout",
		"exec_coff",
		"exec_ecoff",
		"compat_aoutm68k",
		"compat_netbsd32",
		"compat_sunos",
		"compat_sunos32",
		"compat_ultrix",
		NULL
	};
	char const * const *list;
	int i;

	list = (nexecs == 0 ? native : compat);
	for (i = 0; list[i] != NULL; i++) {
		/* Failures are ignored; only yield after a successful load. */
		if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) {
			continue;
		}
		yield();
	}
#endif
}
610
611	/*
612	* Copy the user or kernel supplied upath to the allocated pathbuffer pbp
613	* making it absolute in the process, by prepending the current working
614	* directory if it is not. If offs is supplied it will contain the offset
615	* where the original supplied copy of upath starts.
616	*/
617	int
618	exec_makepathbuf(struct lwp l, const* char upath, enum* uio_seg seg,
619	struct pathbuf *pbp, size_t offs)
620	{
621	char path, bp;
622	size_t len, tlen;
623	int error;
624	struct cwdinfo *cwdi;
625
626	path = PNBUF_GET();
627	if (seg == UIO_SYSSPACE) {
628	error = copystr(upath, path, MAXPATHLEN, &len);
629	} else {
630	error = copyinstr(upath, path, MAXPATHLEN, &len);
631	}
632	if (error)
633	goto err;
634
635	if (path[`0`] == `'/'`) {
636	if (offs)
637	*offs = `0`;
638	goto out;
639	}
640
641	len++;
642	if (len + `1` >= MAXPATHLEN) {
643	error = ENAMETOOLONG;
644	goto err;
645	}
646	bp = path + MAXPATHLEN - len;
647	memmove(bp, path, len);
648	*(--bp) = `'/'`;
649
650	cwdi = l->l_proc->p_cwdi;
651	rw_enter(&cwdi->cwdi_lock, RW_READER);
652	error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / `2`,
653	GETCWD_CHECK_ACCESS, l);
654	rw_exit(&cwdi->cwdi_lock);
655
656	if (error)
657	goto err;
658	tlen = path + MAXPATHLEN - bp;
659
660	memmove(path, bp, tlen);
661	path[tlen - `1`] = `'\0'`;
662	if (offs)
663	*offs = tlen - len;
664	out:
665	*pbp = pathbuf_assimilate(path);
666	return `0`;
667	err:
668	PNBUF_PUT(path);
669	return error;
670	}
671
672	vaddr_t
673	exec_vm_minaddr(vaddr_t va_min)
674	{
675	/*
676	* Increase va_min if we don't want NULL to be mappable by the
677	* process.
678	*/
679	#define VM_MIN_GUARD PAGE_SIZE
680	if (user_va0_disable && (va_min < VM_MIN_GUARD))
681	return VM_MIN_GUARD;
682	return va_min;
683	}
684
685	static int
686	execve_loadvm(struct lwp l, const* char path, char* * const *args,
687	char * const *envs, execve_fetch_element_t fetch_element,
688	struct execve_data * restrict data)
689	{
690	struct exec_package * const epp = &data->ed_pack;
691	int error;
692	struct proc *p;
693	char *dp;
694	u_int modgen;
695	size_t offs;
696
697	KASSERT(data != NULL);
698
699	p = l->l_proc;
700	modgen = `0`;
701
702	SDT_PROBE(proc, kernel, , exec, path, `0`, `0`, `0`, `0`);
703
704	/*
705	* Check if we have exceeded our number of processes limit.
706	* This is so that we handle the case where a root daemon
707	* forked, ran setuid to become the desired user and is trying
708	* to exec. The obvious place to do the reference counting check
709	* is setuid(), but we don't do the reference counting check there
710	* like other OS's do because then all the programs that use setuid()
711	* must be modified to check the return code of setuid() and exit().
712	* It is dangerous to make setuid() fail, because it fails open and
713	* the program will continue to run as root. If we make it succeed
714	* and return an error code, again we are not enforcing the limit.
715	* The best place to enforce the limit is here, when the process tries
716	* to execute a new image, because eventually the process will need
717	* to call exec in order to do something useful.
718	*/
719	retry:
720	if (p->p_flag & PK_SUGID) {
721	if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
722	p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
723	&p->p_rlimit[RLIMIT_NPROC],
724	KAUTH_ARG(RLIMIT_NPROC)) != `0` &&
725	chgproccnt(kauth_cred_getuid(l->l_cred), `0`) >
726	p->p_rlimit[RLIMIT_NPROC].rlim_cur)
727	return EAGAIN;
728	}
729
730	/*
731	* Drain existing references and forbid new ones. The process
732	* should be left alone until we're done here. This is necessary
733	* to avoid race conditions - e.g. in ptrace() - that might allow
734	* a local user to illicitly obtain elevated privileges.
735	*/
736	rw_enter(&p->p_reflock, RW_WRITER);
737
738	/*
739	* Init the namei data to point the file user's program name.
740	* This is done here rather than in check_exec(), so that it's
741	* possible to override this settings if any of makecmd/probe
742	* functions call check_exec() recursively - for example,
743	* see exec_script_makecmds().
744	*/
745	if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
746	&data->ed_pathbuf, &offs)) != `0`)
747	goto clrflg;
748	data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
749	data->ed_resolvedpathbuf = PNBUF_GET();
750
751	/*
752	* initialize the fields of the exec package.
753	*/
754	epp->ep_kname = data->ed_pathstring + offs;
755	epp->ep_resolvedname = data->ed_resolvedpathbuf;
756	epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
757	epp->ep_hdrlen = exec_maxhdrsz;
758	epp->ep_hdrvalid = `0`;
759	epp->ep_emul_arg = NULL;
760	epp->ep_emul_arg_free = NULL;
761	memset(&epp->ep_vmcmds, `0`, sizeof(epp->ep_vmcmds));
762	epp->ep_vap = &data->ed_attr;
763	epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : `0`;
764	MD_TOPDOWN_INIT(epp);
765	epp->ep_emul_root = NULL;
766	epp->ep_interp = NULL;
767	epp->ep_esch = NULL;
768	epp->ep_pax_flags = `0`;
769	memset(epp->ep_machine_arch, `0`, sizeof(epp->ep_machine_arch));
770
771	rw_enter(&exec_lock, RW_READER);
772
773	/ see if we can run it. /
774	if ((error = check_exec(l, epp, data->ed_pathbuf)) != `0`) {
775	if (error != ENOENT && error != EACCES && error != ENOEXEC) {
776	DPRINTF(("%s: check exec failed for %s, error %d\n",
777	__func__, epp->ep_kname, error));
778	}
779	goto freehdr;
780	}
781
782	/ allocate an argument buffer /
783	data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
784	KASSERT(data->ed_argp != NULL);
785	dp = data->ed_argp;
786
787	if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != `0`) {
788	goto bad;
789	}
790
791	/*
792	* Calculate the new stack size.
793	*/
794
795	#ifdef __MACHINE_STACK_GROWS_UP
796	/*
797	* copyargs() fills argc/argv/envp from the lower address even on
798	* __MACHINE_STACK_GROWS_UP machines. Reserve a few words just below the SP
799	* so that _rtld() use it.
800	*/
801	#define RTLD_GAP 32
802	#else
803	#define RTLD_GAP 0
804	#endif
805
806	const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
807
808	data->ed_argslen = calcargs(data, argenvstrlen);
809
810	const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
811
812	if (len > epp->ep_ssize) {
813	/ in effect, compare to initial limit /
814	DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
815	error = ENOMEM;
816	goto bad;
817	}
818	/ adjust "active stack depth" for process VSZ /
819	epp->ep_ssize = len;
820
821	return `0`;
822
823	bad:
824	/ free the vmspace-creation commands, and release their references /
825	kill_vmcmds(&epp->ep_vmcmds);
826	/ kill any opened file descriptor, if necessary /
827	if (epp->ep_flags & EXEC_HASFD) {
828	epp->ep_flags &= ~EXEC_HASFD;
829	fd_close(epp->ep_fd);
830	}
831	/ close and put the exec'd file /
832	vn_lock(epp->ep_vp, LK_EXCLUSIVE \| LK_RETRY);
833	VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
834	vput(epp->ep_vp);
835	pool_put(&exec_pool, data->ed_argp);
836
837	freehdr:
838	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
839	if (epp->ep_emul_root != NULL)
840	vrele(epp->ep_emul_root);
841	if (epp->ep_interp != NULL)
842	vrele(epp->ep_interp);
843
844	rw_exit(&exec_lock);
845
846	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
847	pathbuf_destroy(data->ed_pathbuf);
848	PNBUF_PUT(data->ed_resolvedpathbuf);
849
850	clrflg:
851	rw_exit(&p->p_reflock);
852
853	if (modgen != module_gen && error == ENOEXEC) {
854	modgen = module_gen;
855	exec_autoload();
856	goto retry;
857	}
858
859	SDT_PROBE(proc, kernel, , exec__failure, error, `0`, `0`, `0`, `0`);
860	return error;
861	}
862
863	static int
864	execve_dovmcmds(struct lwp l, struct* execve_data * restrict data)
865	{
866	struct exec_package * const epp = &data->ed_pack;
867	struct proc *p = l->l_proc;
868	struct exec_vmcmd *base_vcp;
869	int error = `0`;
870	size_t i;
871
872	/ record proc's vnode, for use by procfs and others /
873	if (p->p_textvp)
874	vrele(p->p_textvp);
875	vref(epp->ep_vp);
876	p->p_textvp = epp->ep_vp;
877
878	/ create the new process's VM space by running the vmcmds /
879	KASSERTMSG(epp->ep_vmcmds.evs_used != `0`, "%s: no vmcmds", __func__);
880
881	#ifdef TRACE_EXEC
882	DUMPVMCMDS(epp, `0`, `0`);
883	#endif
884
885	base_vcp = NULL;
886
887	for (i = `0`; i < epp->ep_vmcmds.evs_used && !error; i++) {
888	struct exec_vmcmd *vcp;
889
890	vcp = &epp->ep_vmcmds.evs_cmds[i];
891	if (vcp->ev_flags & VMCMD_RELATIVE) {
892	KASSERTMSG(base_vcp != NULL,
893	"%s: relative vmcmd with no base", __func__);
894	KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == `0`,
895	"%s: illegal base & relative vmcmd", __func__);
896	vcp->ev_addr += base_vcp->ev_addr;
897	}
898	error = (*vcp->ev_proc)(l, vcp);
899	if (error)
900	DUMPVMCMDS(epp, i, error);
901	if (vcp->ev_flags & VMCMD_BASE)
902	base_vcp = vcp;
903	}
904
905	/ free the vmspace-creation commands, and release their references /
906	kill_vmcmds(&epp->ep_vmcmds);
907
908	vn_lock(epp->ep_vp, LK_EXCLUSIVE \| LK_RETRY);
909	VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
910	vput(epp->ep_vp);
911
912	/ if an error happened, deallocate and punt /
913	if (error != `0`) {
914	DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - `1`, error));
915	}
916	return error;
917	}
918
919	static void
920	execve_free_data(struct execve_data *data)
921	{
922	struct exec_package * const epp = &data->ed_pack;
923
924	/ free the vmspace-creation commands, and release their references /
925	kill_vmcmds(&epp->ep_vmcmds);
926	/ kill any opened file descriptor, if necessary /
927	if (epp->ep_flags & EXEC_HASFD) {
928	epp->ep_flags &= ~EXEC_HASFD;
929	fd_close(epp->ep_fd);
930	}
931
932	/ close and put the exec'd file /
933	vn_lock(epp->ep_vp, LK_EXCLUSIVE \| LK_RETRY);
934	VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
935	vput(epp->ep_vp);
936	pool_put(&exec_pool, data->ed_argp);
937
938	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
939	if (epp->ep_emul_root != NULL)
940	vrele(epp->ep_emul_root);
941	if (epp->ep_interp != NULL)
942	vrele(epp->ep_interp);
943
944	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
945	pathbuf_destroy(data->ed_pathbuf);
946	PNBUF_PUT(data->ed_resolvedpathbuf);
947	}
948
949	static void
950	pathexec(struct proc p, const* char *resolvedname)
951	{
952	KASSERT(resolvedname[`0`] == `'/'`);
953
954	/ set command name & other accounting info /
955	strlcpy(p->p_comm, strrchr(resolvedname, `'/'`) + `1`, sizeof(p->p_comm));
956
957	kmem_strfree(p->p_path);
958	p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
959	}
960
961	/ XXX elsewhere /
962	static int
963	credexec(struct lwp l, struct* vattr *attr)
964	{
965	struct proc *p = l->l_proc;
966	int error;
967
968	/*
969	* Deal with set[ug]id. MNT_NOSUID has already been used to disable
970	* s[ug]id. It's OK to check for PSL_TRACED here as we have blocked
971	* out additional references on the process for the moment.
972	*/
973	if ((p->p_slflag & PSL_TRACED) == `0` &&
974
975	(((attr->va_mode & S_ISUID) != `0` &&
976	kauth_cred_geteuid(l->l_cred) != attr->va_uid) \|\|
977
978	((attr->va_mode & S_ISGID) != `0` &&
979	kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
980	/*
981	* Mark the process as SUGID before we do
982	* anything that might block.
983	*/
984	proc_crmod_enter();
985	proc_crmod_leave(NULL, NULL, true);
986
987	/ Make sure file descriptors 0..2 are in use. /
988	if ((error = fd_checkstd()) != `0`) {
989	DPRINTF(("%s: fdcheckstd failed %d\n",
990	__func__, error));
991	return error;
992	}
993
994	/*
995	* Copy the credential so other references don't see our
996	* changes.
997	*/
998	l->l_cred = kauth_cred_copy(l->l_cred);
999	#ifdef KTRACE
1000	/*
1001	* If the persistent trace flag isn't set, turn off.
1002	*/
1003	if (p->p_tracep) {
1004	mutex_enter(&ktrace_lock);
1005	if (!(p->p_traceflag & KTRFAC_PERSISTENT))
1006	ktrderef(p);
1007	mutex_exit(&ktrace_lock);
1008	}
1009	#endif
1010	if (attr->va_mode & S_ISUID)
1011	kauth_cred_seteuid(l->l_cred, attr->va_uid);
1012	if (attr->va_mode & S_ISGID)
1013	kauth_cred_setegid(l->l_cred, attr->va_gid);
1014	} else {
1015	if (kauth_cred_geteuid(l->l_cred) ==
1016	kauth_cred_getuid(l->l_cred) &&
1017	kauth_cred_getegid(l->l_cred) ==
1018	kauth_cred_getgid(l->l_cred))
1019	p->p_flag &= ~PK_SUGID;
1020	}
1021
1022	/*
1023	* Copy the credential so other references don't see our changes.
1024	* Test to see if this is necessary first, since in the common case
1025	* we won't need a private reference.
1026	*/
1027	if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) \|\|
1028	kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
1029	l->l_cred = kauth_cred_copy(l->l_cred);
1030	kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
1031	kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
1032	}
1033
1034	/ Update the master credentials. /
1035	if (l->l_cred != p->p_cred) {
1036	kauth_cred_t ocred;
1037
1038	kauth_cred_hold(l->l_cred);
1039	mutex_enter(p->p_lock);
1040	ocred = p->p_cred;
1041	p->p_cred = l->l_cred;
1042	mutex_exit(p->p_lock);
1043	kauth_cred_free(ocred);
1044	}
1045
1046	return `0`;
1047	}
1048
1049	static void
1050	emulexec(struct lwp l, struct* exec_package *epp)
1051	{
1052	struct proc *p = l->l_proc;
1053
1054	/ The emulation root will usually have been found when we looked*
1055	* for the elf interpreter (or similar), if not look now. */
1056	if (epp->ep_esch->es_emul->e_path != NULL &&
1057	epp->ep_emul_root == NULL)
1058	emul_find_root(l, epp);
1059
1060	/ Any old emulation root got removed by fdcloseexec /
1061	rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
1062	p->p_cwdi->cwdi_edir = epp->ep_emul_root;
1063	rw_exit(&p->p_cwdi->cwdi_lock);
1064	epp->ep_emul_root = NULL;
1065	if (epp->ep_interp != NULL)
1066	vrele(epp->ep_interp);
1067
1068	/*
1069	* Call emulation specific exec hook. This can setup per-process
1070	* p->p_emuldata or do any other per-process stuff an emulation needs.
1071	*
1072	* If we are executing process of different emulation than the
1073	* original forked process, call e_proc_exit() of the old emulation
1074	* first, then e_proc_exec() of new emulation. If the emulation is
1075	* same, the exec hook code should deallocate any old emulation
1076	* resources held previously by this process.
1077	*/
1078	if (p->p_emul && p->p_emul->e_proc_exit
1079	&& p->p_emul != epp->ep_esch->es_emul)
1080	(*p->p_emul->e_proc_exit)(p);
1081
1082	/*
1083	* This is now LWP 1.
1084	*/
1085	/ XXX elsewhere /
1086	mutex_enter(p->p_lock);
1087	p->p_nlwpid = `1`;
1088	l->l_lid = `1`;
1089	mutex_exit(p->p_lock);
1090
1091	/*
1092	* Call exec hook. Emulation code may NOT store reference to anything
1093	* from &pack.
1094	*/
1095	if (epp->ep_esch->es_emul->e_proc_exec)
1096	(*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
1097
1098	/ update p_emul, the old value is no longer needed /
1099	p->p_emul = epp->ep_esch->es_emul;
1100
1101	/ ...and the same for p_execsw /
1102	p->p_execsw = epp->ep_esch;
1103
1104	#ifdef __HAVE_SYSCALL_INTERN
1105	(*p->p_emul->e_syscall_intern)(p);
1106	#endif
1107	ktremul();
1108	}
1109
1110	static int
1111	execve_runproc(struct lwp l, struct* execve_data * restrict data,
1112	bool no_local_exec_lock, bool is_spawn)
1113	{
1114	struct exec_package * const epp = &data->ed_pack;
1115	int error = `0`;
1116	struct proc *p;
1117
1118	/*
1119	* In case of a posix_spawn operation, the child doing the exec
1120	* might not hold the reader lock on exec_lock, but the parent
1121	* will do this instead.
1122	*/
1123	KASSERT(no_local_exec_lock \|\| rw_lock_held(&exec_lock));
1124	KASSERT(!no_local_exec_lock \|\| is_spawn);
1125	KASSERT(data != NULL);
1126
1127	p = l->l_proc;
1128
1129	/ Get rid of other LWPs. /
1130	if (p->p_nlwps > `1`) {
1131	mutex_enter(p->p_lock);
1132	exit_lwps(l);
1133	mutex_exit(p->p_lock);
1134	}
1135	KDASSERT(p->p_nlwps == `1`);
1136
1137	/ Destroy any lwpctl info. /
1138	if (p->p_lwpctl != NULL)
1139	lwp_ctl_exit();
1140
1141	/ Remove POSIX timers /
1142	timers_free(p, TIMERS_POSIX);
1143
1144	/ Set the PaX flags. /
1145	pax_set_flags(epp, p);
1146
1147	/*
1148	* Do whatever is necessary to prepare the address space
1149	* for remapping. Note that this might replace the current
1150	* vmspace with another!
1151	*/
1152	if (is_spawn)
1153	uvmspace_spawn(l, epp->ep_vm_minaddr,
1154	epp->ep_vm_maxaddr,
1155	epp->ep_flags & EXEC_TOPDOWN_VM);
1156	else
1157	uvmspace_exec(l, epp->ep_vm_minaddr,
1158	epp->ep_vm_maxaddr,
1159	epp->ep_flags & EXEC_TOPDOWN_VM);
1160
1161	struct vmspace *vm;
1162	vm = p->p_vmspace;
1163	vm->vm_taddr = (void *)epp->ep_taddr;
1164	vm->vm_tsize = btoc(epp->ep_tsize);
1165	vm->vm_daddr = (void*)epp->ep_daddr;
1166	vm->vm_dsize = btoc(epp->ep_dsize);
1167	vm->vm_ssize = btoc(epp->ep_ssize);
1168	vm->vm_issize = `0`;
1169	vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
1170	vm->vm_minsaddr = (void *)epp->ep_minsaddr;
1171
1172	pax_aslr_init_vm(l, vm, epp);
1173
1174	/ Now map address space. /
1175	error = execve_dovmcmds(l, data);
1176	if (error != `0`)
1177	goto exec_abort;
1178
1179	pathexec(p, epp->ep_resolvedname);
1180
1181	char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
1182
1183	error = copyoutargs(data, l, newstack);
1184	if (error != `0`)
1185	goto exec_abort;
1186
1187	cwdexec(p);
1188	fd_closeexec(); / handle close on exec /
1189
1190	if (__predict_false(ktrace_on))
1191	fd_ktrexecfd();
1192
1193	execsigs(p); / reset caught signals /
1194
1195	mutex_enter(p->p_lock);
1196	l->l_ctxlink = NULL; / reset ucontext link /
1197	p->p_acflag &= ~AFORK;
1198	p->p_flag \|= PK_EXEC;
1199	mutex_exit(p->p_lock);
1200
1201	/*
1202	* Stop profiling.
1203	*/
1204	if ((p->p_stflag & PST_PROFIL) != `0`) {
1205	mutex_spin_enter(&p->p_stmutex);
1206	stopprofclock(p);
1207	mutex_spin_exit(&p->p_stmutex);
1208	}
1209
1210	/*
1211	* It's OK to test PL_PPWAIT unlocked here, as other LWPs have
1212	* exited and exec()/exit() are the only places it will be cleared.
1213	*/
1214	if ((p->p_lflag & PL_PPWAIT) != `0`) {
1215	lwp_t *lp;
1216
1217	mutex_enter(proc_lock);
1218	lp = p->p_vforklwp;
1219	p->p_vforklwp = NULL;
1220
1221	l->l_lwpctl = NULL; / was on loan from blocked parent /
1222	p->p_lflag &= ~PL_PPWAIT;
1223	lp->l_vforkwaiting = false;
1224
1225	cv_broadcast(&lp->l_waitcv);
1226	mutex_exit(proc_lock);
1227	}
1228
1229	error = credexec(l, &data->ed_attr);
1230	if (error)
1231	goto exec_abort;
1232
1233	#if defined(__HAVE_RAS)
1234	/*
1235	* Remove all RASs from the address space.
1236	*/
1237	ras_purgeall();
1238	#endif
1239
1240	doexechooks(p);
1241
1242	/*
1243	* Set initial SP at the top of the stack.
1244	*
1245	* Note that on machines where stack grows up (e.g. hppa), SP points to
1246	* the end of arg/env strings. Userland guesses the address of argc
1247	* via ps_strings::ps_argvstr.
1248	*/
1249
1250	/ Setup new registers and do misc. setup. /
1251	(*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
1252	if (epp->ep_esch->es_setregs)
1253	(*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
1254
1255	/ Provide a consistent LWP private setting /
1256	(void)lwp_setprivate(l, NULL);
1257
1258	/ Discard all PCU state; need to start fresh /
1259	pcu_discard_all(l);
1260
1261	/ map the process's signal trampoline code /
1262	if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != `0`) {
1263	DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
1264	goto exec_abort;
1265	}
1266
1267	pool_put(&exec_pool, data->ed_argp);
1268
1269	/ notify others that we exec'd /
1270	KNOTE(&p->p_klist, NOTE_EXEC);
1271
1272	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1273
1274	SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, `0`, `0`, `0`, `0`);
1275
1276	emulexec(l, epp);
1277
1278	/ Allow new references from the debugger/procfs. /
1279	rw_exit(&p->p_reflock);
1280	if (!no_local_exec_lock)
1281	rw_exit(&exec_lock);
1282
1283	mutex_enter(proc_lock);
1284
1285	/ posix_spawn(3) reports a single event with implied exec(3) /
1286	if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
1287	mutex_enter(p->p_lock);
1288	eventswitch(TRAP_EXEC);
1289	mutex_enter(proc_lock);
1290	}
1291
1292	if (p->p_sflag & PS_STOPEXEC) {
1293	ksiginfoq_t kq;
1294
1295	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
1296	p->p_pptr->p_nstopchild++;
1297	p->p_waited = `0`;
1298	mutex_enter(p->p_lock);
1299	ksiginfo_queue_init(&kq);
1300	sigclearall(p, &contsigmask, &kq);
1301	lwp_lock(l);
1302	l->l_stat = LSSTOP;
1303	p->p_stat = SSTOP;
1304	p->p_nrlwps--;
1305	lwp_unlock(l);
1306	mutex_exit(p->p_lock);
1307	mutex_exit(proc_lock);
1308	lwp_lock(l);
1309	mi_switch(l);
1310	ksiginfo_queue_drain(&kq);
1311	KERNEL_LOCK(l->l_biglocks, l);
1312	} else {
1313	mutex_exit(proc_lock);
1314	}
1315
1316	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
1317	pathbuf_destroy(data->ed_pathbuf);
1318	PNBUF_PUT(data->ed_resolvedpathbuf);
1319	#ifdef TRACE_EXEC
1320	DPRINTF(("%s finished\n", __func__));
1321	#endif
1322	return EJUSTRETURN;
1323
1324	exec_abort:
1325	SDT_PROBE(proc, kernel, , exec__failure, error, `0`, `0`, `0`, `0`);
1326	rw_exit(&p->p_reflock);
1327	if (!no_local_exec_lock)
1328	rw_exit(&exec_lock);
1329
1330	pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
1331	pathbuf_destroy(data->ed_pathbuf);
1332	PNBUF_PUT(data->ed_resolvedpathbuf);
1333
1334	/*
1335	* the old process doesn't exist anymore. exit gracefully.
1336	* get rid of the (new) address space we have created, if any, get rid
1337	* of our namei data and vnode, and exit noting failure
1338	*/
1339	uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
1340	VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
1341
1342	exec_free_emul_arg(epp);
1343	pool_put(&exec_pool, data->ed_argp);
1344	kmem_free(epp->ep_hdr, epp->ep_hdrlen);
1345	if (epp->ep_emul_root != NULL)
1346	vrele(epp->ep_emul_root);
1347	if (epp->ep_interp != NULL)
1348	vrele(epp->ep_interp);
1349
1350	/ Acquire the sched-state mutex (exit1() will release it). /
1351	if (!is_spawn) {
1352	mutex_enter(p->p_lock);
1353	exit1(l, error, SIGABRT);
1354	}
1355
1356	return error;
1357	}
1358
1359	int
1360	execve1(struct lwp l, const* char path, char* * const *args,
1361	char * const *envs, execve_fetch_element_t fetch_element)
1362	{
1363	struct execve_data data;
1364	int error;
1365
1366	error = execve_loadvm(l, path, args, envs, fetch_element, &data);
1367	if (error)
1368	return error;
1369	error = execve_runproc(l, &data, false, false);
1370	return error;
1371	}
1372
1373	static size_t
1374	fromptrsz(const struct exec_package *epp)
1375	{
1376	return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
1377	}
1378
1379	static size_t
1380	ptrsz(const struct exec_package *epp)
1381	{
1382	return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
1383	}
1384
1385	static size_t
1386	calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
1387	{
1388	struct exec_package * const epp = &data->ed_pack;
1389
1390	const size_t nargenvptrs =
1391	`1` + / long argc /
1392	data->ed_argc + / char argv[] /*
1393	`1` + / \0 /
1394	data->ed_envc + / char env[] /*
1395	`1`; / \0 /
1396
1397	return (nargenvptrs * ptrsz(epp)) / pointers /
1398	+ argenvstrlen / strings /
1399	+ epp->ep_esch->es_arglen; / auxinfo /
1400	}
1401
1402	static size_t
1403	calcstack(struct execve_data * restrict data, const size_t gaplen)
1404	{
1405	struct exec_package * const epp = &data->ed_pack;
1406
1407	data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
1408	epp->ep_esch->es_emul->e_sigcode;
1409
1410	data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
1411	sizeof(struct ps_strings32) : sizeof(struct ps_strings);
1412
1413	const size_t sigcode_psstr_sz =
1414	data->ed_szsigcode + / sigcode /
1415	data->ed_ps_strings_sz + / ps_strings /
1416	STACK_PTHREADSPACE; / pthread space /
1417
1418	const size_t stacklen =
1419	data->ed_argslen +
1420	gaplen +
1421	sigcode_psstr_sz;
1422
1423	/ make the stack "safely" aligned /
1424	return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
1425	}
1426
1427	static int
1428	copyoutargs(struct execve_data * restrict data, struct lwp *l,
1429	char * const newstack)
1430	{
1431	struct exec_package * const epp = &data->ed_pack;
1432	struct proc *p = l->l_proc;
1433	int error;
1434
1435	memset(&data->ed_arginfo, `0`, sizeof(data->ed_arginfo));
1436
1437	/ remember information about the process /
1438	data->ed_arginfo.ps_nargvstr = data->ed_argc;
1439	data->ed_arginfo.ps_nenvstr = data->ed_envc;
1440
1441	/*
1442	* Allocate the stack address passed to the newly execve()'ed process.
1443	*
1444	* The new stack address will be set to the SP (stack pointer) register
1445	* in setregs().
1446	*/
1447
1448	char *newargs = STACK_ALLOC(
1449	STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
1450
1451	error = (*epp->ep_esch->es_copyargs)(l, epp,
1452	&data->ed_arginfo, &newargs, data->ed_argp);
1453
1454	if (error) {
1455	DPRINTF(("%s: copyargs failed %d\n", __func__, error));
1456	return error;
1457	}
1458
1459	error = copyoutpsstrs(data, p);
1460	if (error != `0`)
1461	return error;
1462
1463	return `0`;
1464	}
1465
1466	static int
1467	copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
1468	{
1469	struct exec_package * const epp = &data->ed_pack;
1470	struct ps_strings32 arginfo32;
1471	void *aip;
1472	int error;
1473
1474	/ fill process ps_strings info /
1475	p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
1476	STACK_PTHREADSPACE), data->ed_ps_strings_sz);
1477
1478	if (epp->ep_flags & EXEC_32) {
1479	aip = &arginfo32;
1480	arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
1481	arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
1482	arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
1483	arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
1484	} else
1485	aip = &data->ed_arginfo;
1486
1487	/ copy out the process's ps_strings structure /
1488	if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
1489	!= `0`) {
1490	DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
1491	__func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
1492	return error;
1493	}
1494
1495	return `0`;
1496	}
1497
1498	static int
1499	copyinargs(struct execve_data * restrict data, char * const *args,
1500	char * const envs, execve_fetch_element_t fetch_element, char* **dpp)
1501	{
1502	struct exec_package * const epp = &data->ed_pack;
1503	char *dp;
1504	size_t i;
1505	int error;
1506
1507	dp = *dpp;
1508
1509	data->ed_argc = `0`;
1510
1511	/ copy the fake args list, if there's one, freeing it as we go /
1512	if (epp->ep_flags & EXEC_HASARGL) {
1513	struct exec_fakearg *fa = epp->ep_fa;
1514
1515	while (fa->fa_arg != NULL) {
1516	const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
1517	size_t len;
1518
1519	len = strlcpy(dp, fa->fa_arg, maxlen);
1520	/ Count NUL into len. /
1521	if (len < maxlen)
1522	len++;
1523	else {
1524	while (fa->fa_arg != NULL) {
1525	kmem_free(fa->fa_arg, fa->fa_len);
1526	fa++;
1527	}
1528	kmem_free(epp->ep_fa, epp->ep_fa_len);
1529	epp->ep_flags &= ~EXEC_HASARGL;
1530	return E2BIG;
1531	}
1532	ktrexecarg(fa->fa_arg, len - `1`);
1533	dp += len;
1534
1535	kmem_free(fa->fa_arg, fa->fa_len);
1536	fa++;
1537	data->ed_argc++;
1538	}
1539	kmem_free(epp->ep_fa, epp->ep_fa_len);
1540	epp->ep_flags &= ~EXEC_HASARGL;
1541	}
1542
1543	/*
1544	* Read and count argument strings from user.
1545	*/
1546
1547	if (args == NULL) {
1548	DPRINTF(("%s: null args\n", __func__));
1549	return EINVAL;
1550	}
1551	if (epp->ep_flags & EXEC_SKIPARG)
1552	args = (const void )((const* char *)args + fromptrsz(epp));
1553	i = `0`;
1554	error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
1555	if (error != `0`) {
1556	DPRINTF(("%s: copyin arg %d\n", __func__, error));
1557	return error;
1558	}
1559	data->ed_argc += i;
1560
1561	/*
1562	* Read and count environment strings from user.
1563	*/
1564
1565	data->ed_envc = `0`;
1566	/ environment need not be there /
1567	if (envs == NULL)
1568	goto done;
1569	i = `0`;
1570	error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
1571	if (error != `0`) {
1572	DPRINTF(("%s: copyin env %d\n", __func__, error));
1573	return error;
1574	}
1575	data->ed_envc += i;
1576
1577	done:
1578	*dpp = dp;
1579
1580	return `0`;
1581	}
1582
1583	static int
1584	copyinargstrs(struct execve_data * restrict data, char * const *strs,
1585	execve_fetch_element_t fetch_element, char *dpp, size_t ip,
1586	void (ktr)(const* void *, size_t))
1587	{
1588	char dp, sp;
1589	size_t i;
1590	int error;
1591
1592	dp = *dpp;
1593
1594	i = `0`;
1595	while (`1`) {
1596	const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
1597	size_t len;
1598
1599	if ((error = (*fetch_element)(strs, i, &sp)) != `0`) {
1600	return error;
1601	}
1602	if (!sp)
1603	break;
1604	if ((error = copyinstr(sp, dp, maxlen, &len)) != `0`) {
1605	if (error == ENAMETOOLONG)
1606	error = E2BIG;
1607	return error;
1608	}
1609	if (__predict_false(ktrace_on))
1610	(*ktr)(dp, len - `1`);
1611	dp += len;
1612	i++;
1613	}
1614
1615	*dpp = dp;
1616	*ip = i;
1617
1618	return `0`;
1619	}
1620
1621	/*
1622	* Copy argv and env strings from kernel buffer (argp) to the new stack.
1623	* Those strings are located just after auxinfo.
1624	*/
1625	int
1626	copyargs(struct lwp l, struct* exec_package pack, struct* ps_strings *arginfo,
1627	char *stackp, void* *argp)
1628	{
1629	char *cpp, dp, *sp;
1630	size_t len;
1631	void *nullp;
1632	long argc, envc;
1633	int error;
1634
1635	cpp = (char *)stackp;
1636	nullp = NULL;
1637	argc = arginfo->ps_nargvstr;
1638	envc = arginfo->ps_nenvstr;
1639
1640	/ argc on stack is long /
1641	CTASSERT(sizeof(cpp) == sizeof*(argc));
1642
1643	dp = (char *)(cpp +
1644	`1` + / long argc /
1645	argc + / char argv[] /*
1646	`1` + / \0 /
1647	envc + / char env[] /*
1648	`1`) + / \0 /
1649	pack->ep_esch->es_arglen; / auxinfo /
1650	sp = argp;
1651
1652	if ((error = copyout(&argc, cpp++, sizeof(argc))) != `0`) {
1653	COPYPRINTF("", cpp - `1`, sizeof(argc));
1654	return error;
1655	}
1656
1657	/ XXX don't copy them out, remap them! /
1658	arginfo->ps_argvstr = cpp; / remember location of argv for later /
1659
1660	for (; --argc >= `0`; sp += len, dp += len) {
1661	if ((error = copyout(&dp, cpp++, sizeof(dp))) != `0`) {
1662	COPYPRINTF("", cpp - `1`, sizeof(dp));
1663	return error;
1664	}
1665	if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != `0`) {
1666	COPYPRINTF("str", dp, (size_t)ARG_MAX);
1667	return error;
1668	}
1669	}
1670
1671	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != `0`) {
1672	COPYPRINTF("", cpp - `1`, sizeof(nullp));
1673	return error;
1674	}
1675
1676	arginfo->ps_envstr = cpp; / remember location of envp for later /
1677
1678	for (; --envc >= `0`; sp += len, dp += len) {
1679	if ((error = copyout(&dp, cpp++, sizeof(dp))) != `0`) {
1680	COPYPRINTF("", cpp - `1`, sizeof(dp));
1681	return error;
1682	}
1683	if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != `0`) {
1684	COPYPRINTF("str", dp, (size_t)ARG_MAX);
1685	return error;
1686	}
1687
1688	}
1689
1690	if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != `0`) {
1691	COPYPRINTF("", cpp - `1`, sizeof(nullp));
1692	return error;
1693	}
1694
1695	stackp = (char* *)cpp;
1696	return `0`;
1697	}
1698
1699
1700	/*
1701	* Add execsw[] entries.
1702	*/
1703	int
1704	exec_add(struct execsw esp, int* count)
1705	{
1706	struct exec_entry *it;
1707	int i;
1708
1709	if (count == `0`) {
1710	return `0`;
1711	}
1712
1713	/ Check for duplicates. /
1714	rw_enter(&exec_lock, RW_WRITER);
1715	for (i = `0`; i < count; i++) {
1716	LIST_FOREACH(it, &ex_head, ex_list) {
1717	/ assume unique (makecmds, probe_func, emulation) /
1718	if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
1719	it->ex_sw->u.elf_probe_func ==
1720	esp[i].u.elf_probe_func &&
1721	it->ex_sw->es_emul == esp[i].es_emul) {
1722	rw_exit(&exec_lock);
1723	return EEXIST;
1724	}
1725	}
1726	}
1727
1728	/ Allocate new entries. /
1729	for (i = `0`; i < count; i++) {
1730	it = kmem_alloc(sizeof(*it), KM_SLEEP);
1731	it->ex_sw = &esp[i];
1732	LIST_INSERT_HEAD(&ex_head, it, ex_list);
1733	}
1734
1735	/ update execsw[] /
1736	exec_init(`0`);
1737	rw_exit(&exec_lock);
1738	return `0`;
1739	}
1740
1741	/*
1742	* Remove execsw[] entry.
1743	*/
1744	int
1745	exec_remove(struct execsw esp, int* count)
1746	{
1747	struct exec_entry it, next;
1748	int i;
1749	const struct proclist_desc *pd;
1750	proc_t *p;
1751
1752	if (count == `0`) {
1753	return `0`;
1754	}
1755
1756	/ Abort if any are busy. /
1757	rw_enter(&exec_lock, RW_WRITER);
1758	for (i = `0`; i < count; i++) {
1759	mutex_enter(proc_lock);
1760	for (pd = proclists; pd->pd_list != NULL; pd++) {
1761	PROCLIST_FOREACH(p, pd->pd_list) {
1762	if (p->p_execsw == &esp[i]) {
1763	mutex_exit(proc_lock);
1764	rw_exit(&exec_lock);
1765	return EBUSY;
1766	}
1767	}
1768	}
1769	mutex_exit(proc_lock);
1770	}
1771
1772	/ None are busy, so remove them all. /
1773	for (i = `0`; i < count; i++) {
1774	for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
1775	next = LIST_NEXT(it, ex_list);
1776	if (it->ex_sw == &esp[i]) {
1777	LIST_REMOVE(it, ex_list);
1778	kmem_free(it, sizeof(*it));
1779	break;
1780	}
1781	}
1782	}
1783
1784	/ update execsw[] /
1785	exec_init(`0`);
1786	rw_exit(&exec_lock);
1787	return `0`;
1788	}
1789
1790	/*
1791	* Initialize exec structures. If init_boot is true, also does necessary
1792	* one-time initialization (it's called from main() that way).
1793	* Once system is multiuser, this should be called with exec_lock held,
1794	* i.e. via exec_{add\|remove}().
1795	*/
1796	int
1797	exec_init(int init_boot)
1798	{
1799	const struct execsw **sw;
1800	struct exec_entry *ex;
1801	SLIST_HEAD(,exec_entry) first;
1802	SLIST_HEAD(,exec_entry) any;
1803	SLIST_HEAD(,exec_entry) last;
1804	int i, sz;
1805
1806	if (init_boot) {
1807	/ do one-time initializations /
1808	vaddr_t vmin = `0`, vmax;
1809
1810	rw_init(&exec_lock);
1811	mutex_init(&sigobject_lock, MUTEX_DEFAULT, IPL_NONE);
1812	exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
1813	maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
1814	pool_init(&exec_pool, NCARGS, `0`, `0`, PR_NOALIGN\|PR_NOTOUCH,
1815	"execargs", &exec_palloc, IPL_NONE);
1816	pool_sethardlimit(&exec_pool, maxexec, "should not happen", `0`);
1817	} else {
1818	KASSERT(rw_write_held(&exec_lock));
1819	}
1820
1821	/ Sort each entry onto the appropriate queue. /
1822	SLIST_INIT(&first);
1823	SLIST_INIT(&any);
1824	SLIST_INIT(&last);
1825	sz = `0`;
1826	LIST_FOREACH(ex, &ex_head, ex_list) {
1827	switch(ex->ex_sw->es_prio) {
1828	case EXECSW_PRIO_FIRST:
1829	SLIST_INSERT_HEAD(&first, ex, ex_slist);
1830	break;
1831	case EXECSW_PRIO_ANY:
1832	SLIST_INSERT_HEAD(&any, ex, ex_slist);
1833	break;
1834	case EXECSW_PRIO_LAST:
1835	SLIST_INSERT_HEAD(&last, ex, ex_slist);
1836	break;
1837	default:
1838	panic("%s", __func__);
1839	break;
1840	}
1841	sz++;
1842	}
1843
1844	/*
1845	* Create new execsw[]. Ensure we do not try a zero-sized
1846	* allocation.
1847	*/
1848	sw = kmem_alloc(sz * sizeof(struct execsw *) + `1`, KM_SLEEP);
1849	i = `0`;
1850	SLIST_FOREACH(ex, &first, ex_slist) {
1851	sw[i++] = ex->ex_sw;
1852	}
1853	SLIST_FOREACH(ex, &any, ex_slist) {
1854	sw[i++] = ex->ex_sw;
1855	}
1856	SLIST_FOREACH(ex, &last, ex_slist) {
1857	sw[i++] = ex->ex_sw;
1858	}
1859
1860	/ Replace old execsw[] and free used memory. /
1861	if (execsw != NULL) {
1862	kmem_free(__UNCONST(execsw),
1863	nexecs * sizeof(struct execsw *) + `1`);
1864	}
1865	execsw = sw;
1866	nexecs = sz;
1867
1868	/ Figure out the maximum size of an exec header. /
1869	exec_maxhdrsz = sizeof(int);
1870	for (i = `0`; i < nexecs; i++) {
1871	if (execsw[i]->es_hdrsz > exec_maxhdrsz)
1872	exec_maxhdrsz = execsw[i]->es_hdrsz;
1873	}
1874
1875	return `0`;
1876	}
1877
1878	static int
1879	exec_sigcode_map(struct proc p, const* struct emul *e)
1880	{
1881	vaddr_t va;
1882	vsize_t sz;
1883	int error;
1884	struct uvm_object *uobj;
1885
1886	sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
1887
1888	if (e->e_sigobject == NULL \|\| sz == `0`) {
1889	return `0`;
1890	}
1891
1892	/*
1893	* If we don't have a sigobject for this emulation, create one.
1894	*
1895	* sigobject is an anonymous memory object (just like SYSV shared
1896	* memory) that we keep a permanent reference to and that we map
1897	* in all processes that need this sigcode. The creation is simple,
1898	* we create an object, add a permanent reference to it, map it in
1899	* kernel space, copy out the sigcode to it and unmap it.
1900	* We map it with PROT_READ\|PROT_EXEC into the process just
1901	* the way sys_mmap() would map it.
1902	*/
1903
1904	uobj = *e->e_sigobject;
1905	if (uobj == NULL) {
1906	mutex_enter(&sigobject_lock);
1907	if ((uobj = *e->e_sigobject) == NULL) {
1908	uobj = uao_create(sz, `0`);
1909	(*uobj->pgops->pgo_reference)(uobj);
1910	va = vm_map_min(kernel_map);
1911	if ((error = uvm_map(kernel_map, &va, round_page(sz),
1912	uobj, `0`, `0`,
1913	UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
1914	UVM_INH_SHARE, UVM_ADV_RANDOM, `0`)))) {
1915	printf("kernel mapping failed %d\n", error);
1916	(*uobj->pgops->pgo_detach)(uobj);
1917	mutex_exit(&sigobject_lock);
1918	return error;
1919	}
1920	memcpy((void *)va, e->e_sigcode, sz);
1921	#ifdef PMAP_NEED_PROCWR
1922	pmap_procwr(&proc0, va, sz);
1923	#endif
1924	uvm_unmap(kernel_map, va, va + round_page(sz));
1925	*e->e_sigobject = uobj;
1926	}
1927	mutex_exit(&sigobject_lock);
1928	}
1929
1930	/ Just a hint to uvm_map where to put it. /
1931	va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
1932	round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1933
1934	#ifdef __alpha__
1935	/*
1936	* Tru64 puts /sbin/loader at the end of user virtual memory,
1937	* which causes the above calculation to put the sigcode at
1938	* an invalid address. Put it just below the text instead.
1939	*/
1940	if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
1941	va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
1942	}
1943	#endif
1944
1945	(*uobj->pgops->pgo_reference)(uobj);
1946	error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
1947	uobj, `0`, `0`,
1948	UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
1949	UVM_ADV_RANDOM, `0`));
1950	if (error) {
1951	DPRINTF(("%s, %d: map %p "
1952	"uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
1953	__func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
1954	va, error));
1955	(*uobj->pgops->pgo_detach)(uobj);
1956	return error;
1957	}
1958	p->p_sigctx.ps_sigcode = (void *)va;
1959	return `0`;
1960	}
1961
1962	/*
1963	* Release a refcount on spawn_exec_data and destroy memory, if this
1964	* was the last one.
1965	*/
1966	static void
1967	spawn_exec_data_release(struct spawn_exec_data *data)
1968	{
1969	if (atomic_dec_32_nv(&data->sed_refcnt) != `0`)
1970	return;
1971
1972	cv_destroy(&data->sed_cv_child_ready);
1973	mutex_destroy(&data->sed_mtx_child);
1974
1975	if (data->sed_actions)
1976	posix_spawn_fa_free(data->sed_actions,
1977	data->sed_actions->len);
1978	if (data->sed_attrs)
1979	kmem_free(data->sed_attrs,
1980	sizeof(*data->sed_attrs));
1981	kmem_free(data, sizeof(*data));
1982	}
1983
1984	/*
1985	* A child lwp of a posix_spawn operation starts here and ends up in
1986	* cpu_spawn_return, dealing with all filedescriptor and scheduler
1987	* manipulations in between.
1988	* The parent waits for the child, as it is not clear whether the child
1989	* will be able to acquire its own exec_lock. If it can, the parent can
1990	* be released early and continue running in parallel. If not (or if the
1991	* magic debug flag is passed in the scheduler attribute struct), the
1992	* child rides on the parent's exec lock until it is ready to return to
1993	* to userland - and only then releases the parent. This method loses
1994	* concurrency, but improves error reporting.
1995	*/
1996	static void
1997	spawn_return(void *arg)
1998	{
1999	struct spawn_exec_data *spawn_data = arg;
2000	struct lwp *l = curlwp;
2001	struct proc *p = l->l_proc;
2002	int error, newfd;
2003	int ostat;
2004	size_t i;
2005	const struct posix_spawn_file_actions_entry *fae;
2006	pid_t ppid;
2007	register_t retval;
2008	bool have_reflock;
2009	bool parent_is_waiting = true;
2010
2011	/*
2012	* Check if we can release parent early.
2013	* We either need to have no sed_attrs, or sed_attrs does not
2014	* have POSIX_SPAWN_RETURNERROR or one of the flags, that require
2015	* safe access to the parent proc (passed in sed_parent).
2016	* We then try to get the exec_lock, and only if that works, we can
2017	* release the parent here already.
2018	*/
2019	ppid = spawn_data->sed_parent->p_pid;
2020	if ((!spawn_data->sed_attrs
2021	\|\| (spawn_data->sed_attrs->sa_flags
2022	& (POSIX_SPAWN_RETURNERROR\|POSIX_SPAWN_SETPGROUP)) == `0`)
2023	&& rw_tryenter(&exec_lock, RW_READER)) {
2024	parent_is_waiting = false;
2025	mutex_enter(&spawn_data->sed_mtx_child);
2026	cv_signal(&spawn_data->sed_cv_child_ready);
2027	mutex_exit(&spawn_data->sed_mtx_child);
2028	}
2029
2030	/ don't allow debugger access yet /
2031	rw_enter(&p->p_reflock, RW_WRITER);
2032	have_reflock = true;
2033
2034	error = `0`;
2035	/ handle posix_spawn_file_actions /
2036	if (spawn_data->sed_actions != NULL) {
2037	for (i = `0`; i < spawn_data->sed_actions->len; i++) {
2038	fae = &spawn_data->sed_actions->fae[i];
2039	switch (fae->fae_action) {
2040	case FAE_OPEN:
2041	if (fd_getfile(fae->fae_fildes) != NULL) {
2042	error = fd_close(fae->fae_fildes);
2043	if (error)
2044	break;
2045	}
2046	error = fd_open(fae->fae_path, fae->fae_oflag,
2047	fae->fae_mode, &newfd);
2048	if (error)
2049	break;
2050	if (newfd != fae->fae_fildes) {
2051	error = dodup(l, newfd,
2052	fae->fae_fildes, `0`, &retval);
2053	if (fd_getfile(newfd) != NULL)
2054	fd_close(newfd);
2055	}
2056	break;
2057	case FAE_DUP2:
2058	error = dodup(l, fae->fae_fildes,
2059	fae->fae_newfildes, `0`, &retval);
2060	break;
2061	case FAE_CLOSE:
2062	if (fd_getfile(fae->fae_fildes) == NULL) {
2063	error = EBADF;
2064	break;
2065	}
2066	error = fd_close(fae->fae_fildes);
2067	break;
2068	}
2069	if (error)
2070	goto report_error;
2071	}
2072	}
2073
2074	/ handle posix_spawnattr /
2075	if (spawn_data->sed_attrs != NULL) {
2076	struct sigaction sigact;
2077	memset(&sigact, `0`, sizeof(sigact));
2078	sigact._sa_u._sa_handler = SIG_DFL;
2079	sigact.sa_flags = `0`;
2080
2081	/*
2082	* set state to SSTOP so that this proc can be found by pid.
2083	* see proc_enterprp, do_sched_setparam below
2084	*/
2085	mutex_enter(proc_lock);
2086	/*
2087	* p_stat should be SACTIVE, so we need to adjust the
2088	* parent's p_nstopchild here. For safety, just make
2089	* we're on the good side of SDEAD before we adjust.
2090	*/
2091	ostat = p->p_stat;
2092	KASSERT(ostat < SSTOP);
2093	p->p_stat = SSTOP;
2094	p->p_waited = `0`;
2095	p->p_pptr->p_nstopchild++;
2096	mutex_exit(proc_lock);
2097
2098	/ Set process group /
2099	if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
2100	pid_t mypid = p->p_pid,
2101	pgrp = spawn_data->sed_attrs->sa_pgroup;
2102
2103	if (pgrp == `0`)
2104	pgrp = mypid;
2105
2106	error = proc_enterpgrp(spawn_data->sed_parent,
2107	mypid, pgrp, false);
2108	if (error)
2109	goto report_error_stopped;
2110	}
2111
2112	/ Set scheduler policy /
2113	if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
2114	error = do_sched_setparam(p->p_pid, `0`,
2115	spawn_data->sed_attrs->sa_schedpolicy,
2116	&spawn_data->sed_attrs->sa_schedparam);
2117	else if (spawn_data->sed_attrs->sa_flags
2118	& POSIX_SPAWN_SETSCHEDPARAM) {
2119	error = do_sched_setparam(ppid, `0`,
2120	SCHED_NONE, &spawn_data->sed_attrs->sa_schedparam);
2121	}
2122	if (error)
2123	goto report_error_stopped;
2124
2125	/ Reset user ID's /
2126	if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
2127	error = do_setresuid(l, -`1`,
2128	kauth_cred_getgid(l->l_cred), -`1`,
2129	ID_E_EQ_R \| ID_E_EQ_S);
2130	if (error)
2131	goto report_error_stopped;
2132	error = do_setresuid(l, -`1`,
2133	kauth_cred_getuid(l->l_cred), -`1`,
2134	ID_E_EQ_R \| ID_E_EQ_S);
2135	if (error)
2136	goto report_error_stopped;
2137	}
2138
2139	/ Set signal masks/defaults /
2140	if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
2141	mutex_enter(p->p_lock);
2142	error = sigprocmask1(l, SIG_SETMASK,
2143	&spawn_data->sed_attrs->sa_sigmask, NULL);
2144	mutex_exit(p->p_lock);
2145	if (error)
2146	goto report_error_stopped;
2147	}
2148
2149	if (spawn_data->sed_attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
2150	/*
2151	* The following sigaction call is using a sigaction
2152	* version 0 trampoline which is in the compatibility
2153	* code only. This is not a problem because for SIG_DFL
2154	* and SIG_IGN, the trampolines are now ignored. If they
2155	* were not, this would be a problem because we are
2156	* holding the exec_lock, and the compat code needs
2157	* to do the same in order to replace the trampoline
2158	* code of the process.
2159	*/
2160	for (i = `1`; i <= NSIG; i++) {
2161	if (sigismember(
2162	&spawn_data->sed_attrs->sa_sigdefault, i))
2163	sigaction1(l, i, &sigact, NULL, NULL,
2164	`0`);
2165	}
2166	}
2167	mutex_enter(proc_lock);
2168	p->p_stat = ostat;
2169	p->p_pptr->p_nstopchild--;
2170	mutex_exit(proc_lock);
2171	}
2172
2173	/ now do the real exec /
2174	error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
2175	true);
2176	have_reflock = false;
2177	if (error == EJUSTRETURN)
2178	error = `0`;
2179	else if (error)
2180	goto report_error;
2181
2182	if (parent_is_waiting) {
2183	mutex_enter(&spawn_data->sed_mtx_child);
2184	cv_signal(&spawn_data->sed_cv_child_ready);
2185	mutex_exit(&spawn_data->sed_mtx_child);
2186	}
2187
2188	/ release our refcount on the data /
2189	spawn_exec_data_release(spawn_data);
2190
2191	if (p->p_slflag & PSL_TRACED) {
2192	/ Paranoid check /
2193	mutex_enter(proc_lock);
2194	if (!(p->p_slflag & PSL_TRACED)) {
2195	mutex_exit(proc_lock);
2196	goto cpu_return;
2197	}
2198
2199	mutex_enter(p->p_lock);
2200	eventswitch(TRAP_CHLD);
2201	}
2202
2203	cpu_return:
2204	/ and finally: leave to userland for the first time /
2205	cpu_spawn_return(l);
2206
2207	/ NOTREACHED /
2208	return;
2209
2210	report_error_stopped:
2211	mutex_enter(proc_lock);
2212	p->p_stat = ostat;
2213	p->p_pptr->p_nstopchild--;
2214	mutex_exit(proc_lock);
2215	report_error:
2216	if (have_reflock) {
2217	/*
2218	* We have not passed through execve_runproc(),
2219	* which would have released the p_reflock and also
2220	* taken ownership of the sed_exec part of spawn_data,
2221	* so release/free both here.
2222	*/
2223	rw_exit(&p->p_reflock);
2224	execve_free_data(&spawn_data->sed_exec);
2225	}
2226
2227	if (parent_is_waiting) {
2228	/ pass error to parent /
2229	mutex_enter(&spawn_data->sed_mtx_child);
2230	spawn_data->sed_error = error;
2231	cv_signal(&spawn_data->sed_cv_child_ready);
2232	mutex_exit(&spawn_data->sed_mtx_child);
2233	} else {
2234	rw_exit(&exec_lock);
2235	}
2236
2237	/ release our refcount on the data /
2238	spawn_exec_data_release(spawn_data);
2239
2240	/ done, exit /
2241	mutex_enter(p->p_lock);
2242	/*
2243	* Posix explicitly asks for an exit code of 127 if we report
2244	* errors from the child process - so, unfortunately, there
2245	* is no way to report a more exact error code.
2246	* A NetBSD specific workaround is POSIX_SPAWN_RETURNERROR as
2247	* flag bit in the attrp argument to posix_spawn(2), see above.
2248	*/
2249	exit1(l, `127`, `0`);
2250	}
2251
2252	void
2253	posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
2254	{
2255
2256	for (size_t i = `0`; i < len; i++) {
2257	struct posix_spawn_file_actions_entry *fae = &fa->fae[i];
2258	if (fae->fae_action != FAE_OPEN)
2259	continue;
2260	kmem_strfree(fae->fae_path);
2261	}
2262	if (fa->len > `0`)
2263	kmem_free(fa->fae, sizeof(fa->fae) fa->len);
2264	kmem_free(fa, sizeof(*fa));
2265	}
2266
2267	static int
2268	posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
2269	const struct posix_spawn_file_actions *ufa, rlim_t lim)
2270	{
2271	struct posix_spawn_file_actions *fa;
2272	struct posix_spawn_file_actions_entry *fae;
2273	char *pbuf = NULL;
2274	int error;
2275	size_t i = `0`;
2276
2277	fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
2278	error = copyin(ufa, fa, sizeof(*fa));
2279	if (error \|\| fa->len == `0`) {
2280	kmem_free(fa, sizeof(*fa));
2281	return error; / 0 if not an error, and len == 0 /
2282	}
2283
2284	if (fa->len > lim) {
2285	kmem_free(fa, sizeof(*fa));
2286	return EINVAL;
2287	}
2288
2289	fa->size = fa->len;
2290	size_t fal = fa->len * sizeof(*fae);
2291	fae = fa->fae;
2292	fa->fae = kmem_alloc(fal, KM_SLEEP);
2293	error = copyin(fae, fa->fae, fal);
2294	if (error)
2295	goto out;
2296
2297	pbuf = PNBUF_GET();
2298	for (; i < fa->len; i++) {
2299	fae = &fa->fae[i];
2300	if (fae->fae_action != FAE_OPEN)
2301	continue;
2302	error = copyinstr(fae->fae_path, pbuf, MAXPATHLEN, &fal);
2303	if (error)
2304	goto out;
2305	fae->fae_path = kmem_alloc(fal, KM_SLEEP);
2306	memcpy(fae->fae_path, pbuf, fal);
2307	}
2308	PNBUF_PUT(pbuf);
2309
2310	*fap = fa;
2311	return `0`;
2312	out:
2313	if (pbuf)
2314	PNBUF_PUT(pbuf);
2315	posix_spawn_fa_free(fa, i);
2316	return error;
2317	}
2318
2319	int
2320	check_posix_spawn(struct lwp *l1)
2321	{
2322	int error, tnprocs, count;
2323	uid_t uid;
2324	struct proc *p1;
2325
2326	p1 = l1->l_proc;
2327	uid = kauth_cred_getuid(l1->l_cred);
2328	tnprocs = atomic_inc_uint_nv(&nprocs);
2329
2330	/*
2331	* Although process entries are dynamically created, we still keep
2332	* a global limit on the maximum number we will create.
2333	*/
2334	if (__predict_false(tnprocs >= maxproc))
2335	error = -`1`;
2336	else
2337	error = kauth_authorize_process(l1->l_cred,
2338	KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
2339
2340	if (error) {
2341	atomic_dec_uint(&nprocs);
2342	return EAGAIN;
2343	}
2344
2345	/*
2346	* Enforce limits.
2347	*/
2348	count = chgproccnt(uid, `1`);
2349	if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
2350	p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
2351	&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != `0` &&
2352	__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
2353	(void)chgproccnt(uid, -`1`);
2354	atomic_dec_uint(&nprocs);
2355	return EAGAIN;
2356	}
2357
2358	return `0`;
2359	}
2360
2361	int
2362	do_posix_spawn(struct lwp l1, pid_t pid_res, bool child_ok, const* char *path,
2363	struct posix_spawn_file_actions *fa,
2364	struct posix_spawnattr *sa,
2365	char *const argv, char* *const *envp,
2366	execve_fetch_element_t fetch)
2367	{
2368
2369	struct proc p1, p2;
2370	struct lwp *l2;
2371	int error;
2372	struct spawn_exec_data *spawn_data;
2373	vaddr_t uaddr;
2374	pid_t pid;
2375	bool have_exec_lock = false;
2376
2377	p1 = l1->l_proc;
2378
2379	/ Allocate and init spawn_data /
2380	spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
2381	spawn_data->sed_refcnt = `1`; / only parent so far /
2382	cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
2383	mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
2384	mutex_enter(&spawn_data->sed_mtx_child);
2385
2386	/*
2387	* Do the first part of the exec now, collect state
2388	* in spawn_data.
2389	*/
2390	error = execve_loadvm(l1, path, argv,
2391	envp, fetch, &spawn_data->sed_exec);
2392	if (error == EJUSTRETURN)
2393	error = `0`;
2394	else if (error)
2395	goto error_exit;
2396
2397	have_exec_lock = true;
2398
2399	/*
2400	* Allocate virtual address space for the U-area now, while it
2401	* is still easy to abort the fork operation if we're out of
2402	* kernel virtual address space.
2403	*/
2404	uaddr = uvm_uarea_alloc();
2405	if (__predict_false(uaddr == `0`)) {
2406	error = ENOMEM;
2407	goto error_exit;
2408	}
2409
2410	/*
2411	* Allocate new proc. Borrow proc0 vmspace for it, we will
2412	* replace it with its own before returning to userland
2413	* in the child.
2414	* This is a point of no return, we will have to go through
2415	* the child proc to properly clean it up past this point.
2416	*/
2417	p2 = proc_alloc();
2418	pid = p2->p_pid;
2419
2420	/*
2421	* Make a proc table entry for the new process.
2422	* Start by zeroing the section of proc that is zero-initialized,
2423	* then copy the section that is copied directly from the parent.
2424	*/
2425	memset(&p2->p_startzero, `0`,
2426	(unsigned) ((char )&p2->p_endzero - (char* *)&p2->p_startzero));
2427	memcpy(&p2->p_startcopy, &p1->p_startcopy,
2428	(unsigned) ((char )&p2->p_endcopy - (char* *)&p2->p_startcopy));
2429	p2->p_vmspace = proc0.p_vmspace;
2430
2431	TAILQ_INIT(&p2->p_sigpend.sp_info);
2432
2433	LIST_INIT(&p2->p_lwps);
2434	LIST_INIT(&p2->p_sigwaiters);
2435
2436	/*
2437	* Duplicate sub-structures as needed.
2438	* Increase reference counts on shared objects.
2439	* Inherit flags we want to keep. The flags related to SIGCHLD
2440	* handling are important in order to keep a consistent behaviour
2441	* for the child after the fork. If we are a 32-bit process, the
2442	* child will be too.
2443	*/
2444	p2->p_flag =
2445	p1->p_flag & (PK_SUGID \| PK_NOCLDWAIT \| PK_CLDSIGIGN \| PK_32);
2446	p2->p_emul = p1->p_emul;
2447	p2->p_execsw = p1->p_execsw;
2448
2449	mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
2450	mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
2451	rw_init(&p2->p_reflock);
2452	cv_init(&p2->p_waitcv, "wait");
2453	cv_init(&p2->p_lwpcv, "lwpwait");
2454
2455	p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
2456
2457	kauth_proc_fork(p1, p2);
2458
2459	p2->p_raslist = NULL;
2460	p2->p_fd = fd_copy();
2461
2462	/ XXX racy /
2463	p2->p_mqueue_cnt = p1->p_mqueue_cnt;
2464
2465	p2->p_cwdi = cwdinit();
2466
2467	/*
2468	* Note: p_limit (rlimit stuff) is copy-on-write, so normally
2469	* we just need increase pl_refcnt.
2470	*/
2471	if (!p1->p_limit->pl_writeable) {
2472	lim_addref(p1->p_limit);
2473	p2->p_limit = p1->p_limit;
2474	} else {
2475	p2->p_limit = lim_copy(p1->p_limit);
2476	}
2477
2478	p2->p_lflag = `0`;
2479	l1->l_vforkwaiting = false;
2480	p2->p_sflag = `0`;
2481	p2->p_slflag = `0`;
2482	p2->p_pptr = p1;
2483	p2->p_ppid = p1->p_pid;
2484	LIST_INIT(&p2->p_children);
2485
2486	p2->p_aio = NULL;
2487
2488	#ifdef KTRACE
2489	/*
2490	* Copy traceflag and tracefile if enabled.
2491	* If not inherited, these were zeroed above.
2492	*/
2493	if (p1->p_traceflag & KTRFAC_INHERIT) {
2494	mutex_enter(&ktrace_lock);
2495	p2->p_traceflag = p1->p_traceflag;
2496	if ((p2->p_tracep = p1->p_tracep) != NULL)
2497	ktradref(p2);
2498	mutex_exit(&ktrace_lock);
2499	}
2500	#endif
2501
2502	/*
2503	* Create signal actions for the child process.
2504	*/
2505	p2->p_sigacts = sigactsinit(p1, `0`);
2506	mutex_enter(p1->p_lock);
2507	p2->p_sflag \|=
2508	(p1->p_sflag & (PS_STOPFORK \| PS_STOPEXEC \| PS_NOCLDSTOP));
2509	sched_proc_fork(p1, p2);
2510	mutex_exit(p1->p_lock);
2511
2512	p2->p_stflag = p1->p_stflag;
2513
2514	/*
2515	* p_stats.
2516	* Copy parts of p_stats, and zero out the rest.
2517	*/
2518	p2->p_stats = pstatscopy(p1->p_stats);
2519
2520	/ copy over machdep flags to the new proc /
2521	cpu_proc_fork(p1, p2);
2522
2523	/*
2524	* Prepare remaining parts of spawn data
2525	*/
2526	spawn_data->sed_actions = fa;
2527	spawn_data->sed_attrs = sa;
2528
2529	spawn_data->sed_parent = p1;
2530
2531	/ create LWP /
2532	lwp_create(l1, p2, uaddr, `0`, NULL, `0`, spawn_return, spawn_data,
2533	&l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
2534	l2->l_ctxlink = NULL; / reset ucontext link /
2535
2536	/*
2537	* Copy the credential so other references don't see our changes.
2538	* Test to see if this is necessary first, since in the common case
2539	* we won't need a private reference.
2540	*/
2541	if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) \|\|
2542	kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
2543	l2->l_cred = kauth_cred_copy(l2->l_cred);
2544	kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
2545	kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
2546	}
2547
2548	/ Update the master credentials. /
2549	if (l2->l_cred != p2->p_cred) {
2550	kauth_cred_t ocred;
2551
2552	kauth_cred_hold(l2->l_cred);
2553	mutex_enter(p2->p_lock);
2554	ocred = p2->p_cred;
2555	p2->p_cred = l2->l_cred;
2556	mutex_exit(p2->p_lock);
2557	kauth_cred_free(ocred);
2558	}
2559
2560	*child_ok = true;
2561	spawn_data->sed_refcnt = `2`; / child gets it as well /
2562	#if 0
2563	l2->l_nopreempt = `1`; / start it non-preemptable /
2564	#endif
2565
2566	/*
2567	* It's now safe for the scheduler and other processes to see the
2568	* child process.
2569	*/
2570	mutex_enter(proc_lock);
2571
2572	if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
2573	p2->p_lflag \|= PL_CONTROLT;
2574
2575	LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
2576	p2->p_exitsig = SIGCHLD; / signal for parent on exit /
2577
2578	if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN\|PSL_TRACED)) ==
2579	(PSL_TRACEPOSIX_SPAWN\|PSL_TRACED)) {
2580	proc_changeparent(p2, p1->p_pptr);
2581	p1->p_pspid = p2->p_pid;
2582	p2->p_pspid = p1->p_pid;
2583	}
2584
2585	LIST_INSERT_AFTER(p1, p2, p_pglist);
2586	LIST_INSERT_HEAD(&allproc, p2, p_list);
2587
2588	p2->p_trace_enabled = trace_is_enabled(p2);
2589	#ifdef __HAVE_SYSCALL_INTERN
2590	(*p2->p_emul->e_syscall_intern)(p2);
2591	#endif
2592
2593	/*
2594	* Make child runnable, set start time, and add to run queue except
2595	* if the parent requested the child to start in SSTOP state.
2596	*/
2597	mutex_enter(p2->p_lock);
2598
2599	getmicrotime(&p2->p_stats->p_start);
2600
2601	lwp_lock(l2);
2602	KASSERT(p2->p_nrlwps == `1`);
2603	p2->p_nrlwps = `1`;
2604	p2->p_stat = SACTIVE;
2605	l2->l_stat = LSRUN;
2606	sched_enqueue(l2, false);
2607	lwp_unlock(l2);
2608
2609	mutex_exit(p2->p_lock);
2610	mutex_exit(proc_lock);
2611
2612	cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
2613	error = spawn_data->sed_error;
2614	mutex_exit(&spawn_data->sed_mtx_child);
2615	spawn_exec_data_release(spawn_data);
2616
2617	rw_exit(&p1->p_reflock);
2618	rw_exit(&exec_lock);
2619	have_exec_lock = false;
2620
2621	*pid_res = pid;
2622
2623	if (error)
2624	return error;
2625
2626	if (p1->p_slflag & PSL_TRACED) {
2627	/ Paranoid check /
2628	mutex_enter(proc_lock);
2629	if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN\|PSL_TRACED)) !=
2630	(PSL_TRACEPOSIX_SPAWN\|PSL_TRACED)) {
2631	mutex_exit(proc_lock);
2632	return `0`;
2633	}
2634
2635	mutex_enter(p1->p_lock);
2636	eventswitch(TRAP_CHLD);
2637	}
2638	return `0`;
2639
2640	error_exit:
2641	if (have_exec_lock) {
2642	execve_free_data(&spawn_data->sed_exec);
2643	rw_exit(&p1->p_reflock);
2644	rw_exit(&exec_lock);
2645	}
2646	mutex_exit(&spawn_data->sed_mtx_child);
2647	spawn_exec_data_release(spawn_data);
2648
2649	return error;
2650	}
2651
2652	int
2653	sys_posix_spawn(struct lwp l1, const* struct sys_posix_spawn_args *uap,
2654	register_t *retval)
2655	{
2656	/ {*
2657	syscallarg(pid_t ) pid;*
2658	syscallarg(const char ) path;*
2659	syscallarg(const struct posix_spawn_file_actions ) file_actions;*
2660	syscallarg(const struct posix_spawnattr ) attrp;*
2661	syscallarg(char const ) argv;
2662	syscallarg(char const ) envp;
2663	} /*
2664
2665	int error;
2666	struct posix_spawn_file_actions *fa = NULL;
2667	struct posix_spawnattr *sa = NULL;
2668	pid_t pid;
2669	bool child_ok = false;
2670	rlim_t max_fileactions;
2671	proc_t *p = l1->l_proc;
2672
2673	error = check_posix_spawn(l1);
2674	if (error) {
2675	*retval = error;
2676	return `0`;
2677	}
2678
2679	/ copy in file_actions struct /
2680	if (SCARG(uap, file_actions) != NULL) {
2681	max_fileactions = `2` * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
2682	maxfiles);
2683	error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
2684	max_fileactions);
2685	if (error)
2686	goto error_exit;
2687	}
2688
2689	/ copyin posix_spawnattr struct /
2690	if (SCARG(uap, attrp) != NULL) {
2691	sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
2692	error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
2693	if (error)
2694	goto error_exit;
2695	}
2696
2697	/*
2698	* Do the spawn
2699	*/
2700	error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
2701	SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
2702	if (error)
2703	goto error_exit;
2704
2705	if (error == `0` && SCARG(uap, pid) != NULL)
2706	error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
2707
2708	*retval = error;
2709	return `0`;
2710
2711	error_exit:
2712	if (!child_ok) {
2713	(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -`1`);
2714	atomic_dec_uint(&nprocs);
2715
2716	if (sa)
2717	kmem_free(sa, sizeof(*sa));
2718	if (fa)
2719	posix_spawn_fa_free(fa, fa->len);
2720	}
2721
2722	*retval = error;
2723	return `0`;
2724	}
2725
2726	void
2727	exec_free_emul_arg(struct exec_package *epp)
2728	{
2729	if (epp->ep_emul_arg_free != NULL) {
2730	KASSERT(epp->ep_emul_arg != NULL);
2731	(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
2732	epp->ep_emul_arg_free = NULL;
2733	epp->ep_emul_arg = NULL;
2734	} else {
2735	KASSERT(epp->ep_emul_arg == NULL);
2736	}
2737	}
2738
#ifdef DEBUG_EXEC
/*
 * Debug helper: print every vmcmd recorded in the exec package.
 * When error is non-zero the header line also shows the index x and the
 * error code, and the entry at index x is flagged as the failing one.
 */
static void
dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
{
	struct exec_vmcmd *cmd = &epp->ep_vmcmds.evs_cmds[0];
	size_t idx;

	if (error != 0) {
		DPRINTF(("vmcmds %zu/%u, error %d\n", x,
		    epp->ep_vmcmds.evs_used, error));
	} else {
		DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
	}

	for (idx = 0; idx < epp->ep_vmcmds.evs_used; idx++, cmd++) {
		const char *kind;

		/* Name the mapping routine this command will run. */
		if (cmd->ev_proc == vmcmd_map_pagedvn)
			kind = "pagedvn";
		else if (cmd->ev_proc == vmcmd_map_readvn)
			kind = "readvn";
		else if (cmd->ev_proc == vmcmd_map_zero)
			kind = "zero";
		else
			kind = "unknown";

		DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
		    PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
		    PRIxVSIZE" prot=0%o flags=%d\n", idx, kind,
		    cmd->ev_addr, cmd->ev_len,
		    cmd->ev_offset, cmd->ev_prot,
		    cmd->ev_flags));
		if (error != 0 && idx == x) {
			DPRINTF((" ^--- failed\n"));
		}
	}
}
#endif
2770

Browse the source code of netbsd/sys/kern/kern_exec.c