1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 2015, Joyent, Inc. All rights reserved. 29 */ 30 31 /* 32 * The LX Brand: emulation of a Linux operating environment within a zone. 33 * 34 * OVERVIEW 35 * 36 * The LX brand enables a full Linux userland -- including a C library, 37 * init(1) framework, and some set of applications -- to run unmodified 38 * within an illumos zone. Unlike illumos, where applications are expected 39 * to link against and consume functions exported from libraries, the 40 * supported Linux binary compatibility boundary is the system call 41 * interface. By accurately emulating the behaviour of Linux system calls, 42 * Linux software can be executed in this environment as if it were running 43 * on a native Linux system. 44 * 45 * EMULATING LINUX SYSTEM CALLS 46 * 47 * Linux system calls are made in 32-bit processes via the "int 0x80" 48 * instruction; in 64-bit processes the "syscall" instruction is used, as it 49 * is with native illumos processes. 
In both cases, arguments to system 50 * calls are generally passed in registers and the usermode stack is not 51 * interpreted or modified by the Linux kernel. 52 * 53 * When the emulated Linux process makes a system call, it traps into the 54 * illumos kernel. The in-kernel brand module contains various emulation 55 * routines, and can fully service some emulated system calls; e.g. read(2) 56 * and write(2). Other system calls require assistance from the illumos 57 * libc, bouncing back out to the brand library ("lx_brand.so.1") for 58 * emulation. 59 * 60 * The brand mechanism allows for the provision of an alternative trap 61 * handler for the various system call mechanisms. Traditionally this was 62 * used to immediately revector execution to the usermode emulation library, 63 * which was responsible for handling all system calls. In the interests of 64 * more accurate emulation and increased performance, much of the regular 65 * illumos system call path is now invoked. Only the argument processing and 66 * handler dispatch are replaced by the brand, via the per-LWP 67 * "lwp_brand_syscall" interposition function pointer. 68 * 69 * THE NATIVE AND BRAND STACKS 70 * 71 * Some runtime environments (e.g. the Go language) allocate very small 72 * thread stacks, preferring to grow or split the stack as necessary. The 73 * Linux kernel generally does not use the usermode stack when servicing 74 * system calls, so this is not a problem. In order for our emulation to 75 * have the same zero stack impact, we must execute usermode emulation 76 * routines on an _alternate_ stack. This is similar, in principle, to the 77 * use of sigaltstack(3C) to run signal handlers off the main thread stack. 78 * 79 * To this end, the brand library allocates and installs an alternate stack 80 * (called the "native" stack) for each LWP. 
The in-kernel brand code uses 81 * this stack for usermode emulation calls and interposed signal delivery, 82 * while the emulated Linux process sees only the data on the main thread 83 * stack, known as the "brand" stack. The stack mode is tracked in the 84 * per-LWP brand-private data, using the LX_STACK_MODE_* enum. 85 * 86 * The stack mode doubles as a system call "mode bit". When in the 87 * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux 88 * system calls. In other modes, system calls are assumed to be native 89 * illumos system calls as made during brand library initialisation and 90 * usermode emulation. 91 * 92 * USERMODE EMULATION 93 * 94 * When a Linux system call cannot be emulated within the kernel, we preserve 95 * the register state of the Linux process and revector the LWP to the brand 96 * library usermode emulation handler: the "lx_emulate()" function in 97 * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, 98 * and is performed in "lx_emulate_user()". 99 * 100 * First, the emulated process state is written out to the usermode stack of 101 * the process as a "ucontext_t" object. Arguments to the emulation routine 102 * are passed on the stack or in registers, depending on the ABI. When the 103 * usermode emulation is complete, the result is passed back to the kernel 104 * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context 105 * for restoration. 106 * 107 * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT 108 * 109 * When servicing emulated system calls in the usermode brand library, or 110 * during signal delivery, various state is preserved by the kernel so that 111 * the running LWP may be revectored to a handling routine. The context 112 * allows the kernel to restart the program at the point of interruption, 113 * either at the return of the signal handler, via setcontext(3C); or after 114 * the usermode emulation request has been serviced, via B_EMULATION_DONE. 
115 * 116 * In illumos native processes, the saved context (a "ucontext_t" object) 117 * includes the state of registers and the current signal mask at the point 118 * of interruption. The context also includes a link to the most recently 119 * saved context, forming a chain to be unwound as requests complete. The LX 120 * brand requires additional book-keeping to describe the machine state: in 121 * particular, the current stack mode and the occupied extent of the native 122 * stack. 123 * 124 * The brand code is able to interpose on the context save and restore 125 * operations in the kernel -- see "lx_savecontext()" and 126 * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to 127 * function correctly in the face of a dual stack LWP. The brand also 128 * interposes on the signal delivery mechanism -- see "lx_sendsig()" and 129 * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand 130 * library interposer on the native stack, regardless of the interrupted 131 * execution mode. Linux sigaltstack(2) emulation is performed entirely by 132 * the usermode brand library during signal handler interposition. 
133 */ 134 135 #include <sys/types.h> 136 #include <sys/kmem.h> 137 #include <sys/errno.h> 138 #include <sys/thread.h> 139 #include <sys/systm.h> 140 #include <sys/syscall.h> 141 #include <sys/proc.h> 142 #include <sys/modctl.h> 143 #include <sys/cmn_err.h> 144 #include <sys/model.h> 145 #include <sys/exec.h> 146 #include <sys/lx_impl.h> 147 #include <sys/machbrand.h> 148 #include <sys/lx_syscalls.h> 149 #include <sys/lx_misc.h> 150 #include <sys/lx_futex.h> 151 #include <sys/lx_brand.h> 152 #include <sys/param.h> 153 #include <sys/termios.h> 154 #include <sys/sunddi.h> 155 #include <sys/ddi.h> 156 #include <sys/vnode.h> 157 #include <sys/pathname.h> 158 #include <sys/auxv.h> 159 #include <sys/priv.h> 160 #include <sys/regset.h> 161 #include <sys/privregs.h> 162 #include <sys/archsystm.h> 163 #include <sys/zone.h> 164 #include <sys/brand.h> 165 #include <sys/sdt.h> 166 #include <sys/x86_archext.h> 167 #include <sys/controlregs.h> 168 #include <sys/core.h> 169 #include <sys/stack.h> 170 #include <sys/stat.h> 171 #include <sys/socket.h> 172 #include <lx_signum.h> 173 #include <util/sscanf.h> 174 175 int lx_debug = 0; 176 177 void lx_init_brand_data(zone_t *); 178 void lx_free_brand_data(zone_t *); 179 void lx_setbrand(proc_t *); 180 int lx_getattr(zone_t *, int, void *, size_t *); 181 int lx_setattr(zone_t *, int, void *, size_t); 182 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 183 uintptr_t, uintptr_t); 184 void lx_set_kern_version(zone_t *, char *); 185 void lx_copy_procdata(proc_t *, proc_t *); 186 187 extern int getsetcontext(int, void *); 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int); 189 #if defined(_SYSCALL32_IMPL) 190 extern int getsetcontext32(int, void *); 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int); 192 #endif 193 194 extern void lx_proc_exit(proc_t *); 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); 196 197 extern void lx_ioctl_init(); 198 extern void lx_ioctl_fini(); 199 extern 
void lx_socket_init(); 200 extern void lx_socket_fini(); 201 202 lx_systrace_f *lx_systrace_entry_ptr; 203 lx_systrace_f *lx_systrace_return_ptr; 204 205 static int lx_systrace_enabled; 206 207 /* 208 * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly 209 * want an MMU dependency here (and should there be a microprocessor without 210 * a hole, we don't want to start allocating from the top of the VA range). 211 */ 212 #define LX_MAXSTACK64 0x7ffffff00000 213 214 uint64_t lx_maxstack64 = LX_MAXSTACK64; 215 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 217 struct intpdata *idata, int level, long *execsz, int setid, 218 caddr_t exec_file, struct cred *cred, int *brand_action); 219 220 static boolean_t lx_native_exec(uint8_t, const char **); 221 static uint32_t lx_map32limit(proc_t *); 222 223 static void lx_savecontext(ucontext_t *); 224 static void lx_restorecontext(ucontext_t *); 225 static caddr_t lx_sendsig_stack(int); 226 static void lx_sendsig(int); 227 #if defined(_SYSCALL32_IMPL) 228 static void lx_savecontext32(ucontext32_t *); 229 #endif 230 static int lx_setid_clear(vattr_t *, cred_t *); 231 #if defined(_LP64) 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, 233 enum seg_rw); 234 #endif 235 236 237 /* lx brand */ 238 struct brand_ops lx_brops = { 239 lx_init_brand_data, /* b_init_brand_data */ 240 lx_free_brand_data, /* b_free_brand_data */ 241 lx_brandsys, /* b_brandsys */ 242 lx_setbrand, /* b_setbrand */ 243 lx_getattr, /* b_getattr */ 244 lx_setattr, /* b_setattr */ 245 lx_copy_procdata, /* b_copy_procdata */ 246 lx_proc_exit, /* b_proc_exit */ 247 lx_exec, /* b_exec */ 248 lx_setrval, /* b_lwp_setrval */ 249 lx_lwpdata_alloc, /* b_lwpdata_alloc */ 250 lx_lwpdata_free, /* b_lwpdata_free */ 251 lx_initlwp, /* b_initlwp */ 252 lx_forklwp, /* b_forklwp */ 253 lx_freelwp, /* b_freelwp */ 254 lx_exitlwp, /* b_lwpexit */ 255 lx_elfexec, /* b_elfexec */ 256 NULL, /* 
b_sigset_native_to_brand */ 257 NULL, /* b_sigset_brand_to_native */ 258 lx_sigfd_translate, /* b_sigfd_translate */ 259 NSIG, /* b_nsig */ 260 lx_exit_with_sig, /* b_exit_with_sig */ 261 lx_wait_filter, /* b_wait_filter */ 262 lx_native_exec, /* b_native_exec */ 263 lx_map32limit, /* b_map32limit */ 264 lx_stop_notify, /* b_stop_notify */ 265 lx_waitid_helper, /* b_waitid_helper */ 266 lx_sigcld_repost, /* b_sigcld_repost */ 267 lx_ptrace_issig_stop, /* b_issig_stop */ 268 lx_ptrace_sig_ignorable, /* b_sig_ignorable */ 269 lx_savecontext, /* b_savecontext */ 270 #if defined(_SYSCALL32_IMPL) 271 lx_savecontext32, /* b_savecontext32 */ 272 #endif 273 lx_restorecontext, /* b_restorecontext */ 274 lx_sendsig_stack, /* b_sendsig_stack */ 275 lx_sendsig, /* b_sendsig */ 276 lx_setid_clear, /* b_setid_clear */ 277 #if defined(_LP64) 278 lx_pagefault /* b_pagefault */ 279 #else 280 NULL 281 #endif 282 }; 283 284 struct brand_mach_ops lx_mops = { 285 NULL, 286 NULL, 287 NULL, 288 NULL, 289 NULL, 290 lx_fixsegreg, 291 lx_fsbase 292 }; 293 294 struct brand lx_brand = { 295 BRAND_VER_1, 296 "lx", 297 &lx_brops, 298 &lx_mops, 299 sizeof (struct lx_proc_data) 300 }; 301 302 static struct modlbrand modlbrand = { 303 &mod_brandops, "lx brand", &lx_brand 304 }; 305 306 static struct modlinkage modlinkage = { 307 MODREV_1, (void *)&modlbrand, NULL 308 }; 309 310 void 311 lx_proc_exit(proc_t *p) 312 { 313 lx_proc_data_t *lxpd; 314 proc_t *cp; 315 316 mutex_enter(&p->p_lock); 317 VERIFY(lxpd = ptolxproc(p)); 318 if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) { 319 mutex_exit(&p->p_lock); 320 return; 321 } 322 mutex_exit(&p->p_lock); 323 324 /* Check for children which desire notification of parental death. 
*/ 325 mutex_enter(&pidlock); 326 for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) { 327 mutex_enter(&cp->p_lock); 328 if ((lxpd = ptolxproc(cp)) == NULL) { 329 mutex_exit(&cp->p_lock); 330 continue; 331 } 332 if (lxpd->l_parent_deathsig != 0) { 333 sigtoproc(p, NULL, lxpd->l_parent_deathsig); 334 } 335 mutex_exit(&cp->p_lock); 336 } 337 mutex_exit(&pidlock); 338 } 339 340 void 341 lx_setbrand(proc_t *p) 342 { 343 /* Send SIGCHLD to parent by default when child exits */ 344 ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; 345 } 346 347 /* ARGSUSED */ 348 int 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 350 { 351 char vers[LX_VERS_MAX]; 352 353 if (attr == LX_KERN_VERSION_NUM) { 354 if (bufsize > (LX_VERS_MAX - 1)) 355 return (ERANGE); 356 bzero(vers, LX_VERS_MAX); 357 if (copyin(buf, &vers, bufsize) != 0) 358 return (EFAULT); 359 lx_set_kern_version(zone, vers); 360 return (0); 361 } 362 return (EINVAL); 363 } 364 365 /* ARGSUSED */ 366 int 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 368 { 369 if (attr == LX_KERN_VERSION_NUM) { 370 if (*bufsize < LX_VERS_MAX) 371 return (ERANGE); 372 if (copyout(lx_get_zone_kern_version(curzone), buf, 373 LX_VERS_MAX) != 0) 374 return (EFAULT); 375 *bufsize = LX_VERS_MAX; 376 return (0); 377 } 378 return (-EINVAL); 379 } 380 381 uint32_t 382 lx_map32limit(proc_t *p) 383 { 384 /* 385 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only 386 * allow mappings in the first 31 bits. This was a nuance in the 387 * original Linux implementation circa 2002, and applications have 388 * come to depend on its behavior. 389 * 390 * This is only relevant for 64-bit processes. 
391 */ 392 if (p->p_model == DATAMODEL_LP64) 393 return (1 << 31); 394 395 return ((uint32_t)USERLIMIT32); 396 } 397 398 void 399 lx_brand_systrace_enable(void) 400 { 401 VERIFY(!lx_systrace_enabled); 402 403 lx_systrace_enabled = 1; 404 } 405 406 void 407 lx_brand_systrace_disable(void) 408 { 409 VERIFY(lx_systrace_enabled); 410 411 lx_systrace_enabled = 0; 412 } 413 414 void 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) 416 { 417 VERIFY(lwpd->br_ntv_stack != 0); 418 419 /* 420 * The "brand-lx-set-ntv-stack-current" probe has arguments: 421 * arg0: stack pointer before change 422 * arg1: stack pointer after change 423 * arg2: current stack base 424 */ 425 DTRACE_PROBE3(brand__lx__set__ntv__stack__current, 426 uintptr_t, lwpd->br_ntv_stack_current, 427 uintptr_t, new_sp, 428 uintptr_t, lwpd->br_ntv_stack); 429 430 lwpd->br_ntv_stack_current = new_sp; 431 } 432 433 #if defined(_LP64) 434 static int 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, 436 enum seg_rw rw) 437 { 438 int syscall_num; 439 440 /* 441 * We only want to handle a very specific set of circumstances. 442 * Namely: this is a 64-bit LX-branded process attempting to execute an 443 * address in a page for which it does not have a valid mapping. If 444 * this is not the case, we bail out as fast as possible. 445 */ 446 VERIFY(PROC_IS_BRANDED(p)); 447 if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != 448 DATAMODEL_NATIVE) { 449 return (-1); 450 } 451 452 if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { 453 return (-1); 454 } 455 456 /* 457 * This is a valid vsyscall address. We service the system call and 458 * return 0 to signal that the pagefault has been handled completely. 
459 */ 460 lx_vsyscall_enter(p, lwp, syscall_num); 461 return (0); 462 } 463 #endif 464 465 /* 466 * This hook runs prior to sendsig() processing and allows us to nominate 467 * an alternative stack pointer for delivery of the signal handling frame. 468 * Critically, this routine should _not_ modify any LWP state as the 469 * savecontext() does not run until after this hook. 470 */ 471 static caddr_t 472 lx_sendsig_stack(int sig) 473 { 474 klwp_t *lwp = ttolwp(curthread); 475 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 476 477 /* 478 * We want to take signal delivery on the native stack, but only if 479 * one has been allocated and installed for this LWP. 480 */ 481 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 482 /* 483 * The program is not running on the native stack. Return 484 * the native stack pointer from our brand-private data so 485 * that we may switch to it for signal handling. 486 */ 487 return ((caddr_t)lwpd->br_ntv_stack_current); 488 } else { 489 struct regs *rp = lwptoregs(lwp); 490 491 /* 492 * Either the program is already running on the native stack, 493 * or one has not yet been allocated for this LWP. Use the 494 * current stack pointer value. 495 */ 496 return ((caddr_t)rp->r_sp); 497 } 498 } 499 500 /* 501 * This hook runs after sendsig() processing and allows us to update the 502 * per-LWP mode flags for system calls and stacks. The pre-signal 503 * context has already been saved and delivered to the user at this point. 504 */ 505 static void 506 lx_sendsig(int sig) 507 { 508 klwp_t *lwp = ttolwp(curthread); 509 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 510 struct regs *rp = lwptoregs(lwp); 511 512 switch (lwpd->br_stack_mode) { 513 case LX_STACK_MODE_BRAND: 514 case LX_STACK_MODE_NATIVE: 515 /* 516 * In lx_sendsig_stack(), we nominated a stack pointer from the 517 * native stack. 
Update the stack mode, and the current in-use 518 * extent of the native stack, accordingly: 519 */ 520 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 521 lx_lwp_set_native_stack_current(lwpd, rp->r_sp); 522 523 /* 524 * Fix up segment registers, etc. 525 */ 526 lx_switch_to_native(lwp); 527 break; 528 529 default: 530 /* 531 * Otherwise, the brand library has not yet installed the 532 * alternate stack for this LWP. Signals will be handled on 533 * the regular stack thread. 534 */ 535 return; 536 } 537 } 538 539 /* 540 * This hook runs prior to the context restoration, allowing us to take action 541 * or modify the context before it is loaded. 542 */ 543 static void 544 lx_restorecontext(ucontext_t *ucp) 545 { 546 klwp_t *lwp = ttolwp(curthread); 547 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 548 uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; 549 caddr_t sp = ucp->uc_brand_data[1]; 550 551 /* 552 * We have a saved native stack pointer value that we must restore 553 * into the per-LWP data. 554 */ 555 if (flags & LX_UC_RESTORE_NATIVE_SP) { 556 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); 557 } 558 559 /* 560 * We do not wish to restore the value of uc_link in this context, 561 * so replace it with the value currently in the LWP. 562 */ 563 if (flags & LX_UC_IGNORE_LINK) { 564 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; 565 } 566 567 /* 568 * Restore the stack mode: 569 */ 570 if (flags & LX_UC_STACK_NATIVE) { 571 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 572 } else if (flags & LX_UC_STACK_BRAND) { 573 lwpd->br_stack_mode = LX_STACK_MODE_BRAND; 574 } 575 576 #if defined(__amd64) 577 /* 578 * Override the fs/gsbase in the context with the value provided 579 * through the Linux arch_prctl(2) system call. 
580 */ 581 if (flags & LX_UC_STACK_BRAND) { 582 if (lwpd->br_lx_fsbase != 0) { 583 ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; 584 } 585 if (lwpd->br_lx_gsbase != 0) { 586 ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; 587 } 588 } 589 #endif 590 } 591 592 static void 593 lx_savecontext(ucontext_t *ucp) 594 { 595 klwp_t *lwp = ttolwp(curthread); 596 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 597 uintptr_t flags = 0; 598 599 /* 600 * The ucontext_t affords us three private pointer-sized members in 601 * "uc_brand_data". We pack a variety of flags into the first element, 602 * and an optional stack pointer in the second element. The flags 603 * determine which stack pointer (native or brand), if any, is stored 604 * in the second element. The third element may contain the system 605 * call number; this is analogous to the "orig_[er]ax" member of a 606 * Linux "user_regs_struct". 607 */ 608 609 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 610 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 611 /* 612 * Record the value of the native stack pointer to restore 613 * when returning to this branded context: 614 */ 615 flags |= LX_UC_RESTORE_NATIVE_SP; 616 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; 617 } 618 619 /* 620 * Save the stack mode: 621 */ 622 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 623 flags |= LX_UC_STACK_NATIVE; 624 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 625 flags |= LX_UC_STACK_BRAND; 626 } 627 628 /* 629 * If we might need to restart this system call, save that information 630 * in the context: 631 */ 632 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 633 ucp->uc_brand_data[2] = 634 (void *)(uintptr_t)lwpd->br_syscall_num; 635 if (lwpd->br_syscall_restart) { 636 flags |= LX_UC_RESTART_SYSCALL; 637 } 638 } else { 639 ucp->uc_brand_data[2] = NULL; 640 } 641 642 ucp->uc_brand_data[0] = (void *)flags; 643 } 644 645 #if defined(_SYSCALL32_IMPL) 646 static void 647 lx_savecontext32(ucontext32_t 
*ucp) 648 { 649 klwp_t *lwp = ttolwp(curthread); 650 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 651 unsigned int flags = 0; 652 653 /* 654 * The ucontext_t affords us three private pointer-sized members in 655 * "uc_brand_data". We pack a variety of flags into the first element, 656 * and an optional stack pointer in the second element. The flags 657 * determine which stack pointer (native or brand), if any, is stored 658 * in the second element. The third element may contain the system 659 * call number; this is analogous to the "orig_[er]ax" member of a 660 * Linux "user_regs_struct". 661 */ 662 663 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 664 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 665 /* 666 * Record the value of the native stack pointer to restore 667 * when returning to this branded context: 668 */ 669 flags |= LX_UC_RESTORE_NATIVE_SP; 670 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; 671 } 672 673 /* 674 * Save the stack mode: 675 */ 676 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 677 flags |= LX_UC_STACK_NATIVE; 678 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 679 flags |= LX_UC_STACK_BRAND; 680 } 681 682 /* 683 * If we might need to restart this system call, save that information 684 * in the context: 685 */ 686 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 687 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; 688 if (lwpd->br_syscall_restart) { 689 flags |= LX_UC_RESTART_SYSCALL; 690 } 691 } else { 692 ucp->uc_brand_data[2] = NULL; 693 } 694 695 ucp->uc_brand_data[0] = flags; 696 } 697 #endif 698 699 void 700 lx_init_brand_data(zone_t *zone) 701 { 702 lx_zone_data_t *data; 703 ASSERT(zone->zone_brand == &lx_brand); 704 ASSERT(zone->zone_brand_data == NULL); 705 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); 706 /* 707 * Set the default lxzd_kernel_version to 2.4. 708 * This can be changed by a call to setattr() during zone boot. 
709 */ 710 (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX); 711 712 /* 713 * Linux is not at all picky about address family when it comes to 714 * supporting interface-related ioctls. To mimic this behavior, we'll 715 * attempt those ioctls against a ksocket configured for that purpose. 716 */ 717 (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0, 718 0, zone->zone_kcred); 719 720 zone->zone_brand_data = data; 721 722 /* 723 * In Linux, if the init(1) process terminates the system panics. 724 * The zone must reboot to simulate this behaviour. 725 */ 726 zone->zone_reboot_on_init_exit = B_TRUE; 727 } 728 729 void 730 lx_free_brand_data(zone_t *zone) 731 { 732 lx_zone_data_t *data = ztolxzd(zone); 733 ASSERT(data != NULL); 734 if (data->lxzd_ioctl_sock != NULL) { 735 /* 736 * Since zone_kcred has been cleaned up already, close the 737 * socket using the global kcred. 738 */ 739 ksocket_close(data->lxzd_ioctl_sock, kcred); 740 data->lxzd_ioctl_sock = NULL; 741 } 742 zone->zone_brand_data = NULL; 743 kmem_free(data, sizeof (*data)); 744 } 745 746 void 747 lx_unsupported(char *dmsg) 748 { 749 lx_proc_data_t *pd = ttolxproc(curthread); 750 751 DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); 752 753 if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { 754 /* 755 * If this process was run with strict mode enabled 756 * (via LX_STRICT in the environment), we mark this 757 * LWP as having triggered an unsupported behaviour. 758 * This flag will be checked at an appropriate point 759 * by lx_check_strict_failure(). 
760 */ 761 lx_lwp_data_t *lwpd = ttolxlwp(curthread); 762 763 lwpd->br_strict_failure = B_TRUE; 764 } 765 } 766 767 void 768 lx_check_strict_failure(lx_lwp_data_t *lwpd) 769 { 770 proc_t *p; 771 772 if (!lwpd->br_strict_failure) { 773 return; 774 } 775 776 lwpd->br_strict_failure = B_FALSE; 777 778 /* 779 * If this process is operating in strict mode (via LX_STRICT in 780 * the environment), and has triggered a call to 781 * lx_unsupported(), we drop SIGSYS on it as we return. 782 */ 783 p = curproc; 784 mutex_enter(&p->p_lock); 785 sigtoproc(p, curthread, SIGSYS); 786 mutex_exit(&p->p_lock); 787 } 788 789 void 790 lx_trace_sysenter(int syscall_num, uintptr_t *args) 791 { 792 if (lx_systrace_enabled) { 793 VERIFY(lx_systrace_entry_ptr != NULL); 794 795 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], 796 args[2], args[3], args[4], args[5]); 797 } 798 } 799 800 void 801 lx_trace_sysreturn(int syscall_num, long ret) 802 { 803 if (lx_systrace_enabled) { 804 VERIFY(lx_systrace_return_ptr != NULL); 805 806 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); 807 } 808 } 809 810 /* 811 * Get the addresses of the user-space system call handler and attach it to 812 * the proc structure. Returning 0 indicates success; the value returned 813 * by the system call is the value stored in rval. Returning a non-zero 814 * value indicates a failure; the value returned is used to set errno, -1 815 * is returned from the syscall and the contents of rval are ignored. To 816 * set errno and have the syscall return a value other than -1 we can 817 * manually set errno and rval and return 0. 
818 */ 819 int 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 821 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) 822 { 823 kthread_t *t = curthread; 824 klwp_t *lwp = ttolwp(t); 825 proc_t *p = ttoproc(t); 826 lx_proc_data_t *pd; 827 struct termios *termios; 828 uint_t termios_len; 829 int error; 830 int code; 831 int sig; 832 lx_brand_registration_t reg; 833 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 834 835 /* 836 * There is one operation that is suppored for non-branded 837 * process. B_EXEC_BRAND. This is the equilivant of an 838 * exec call, but the new process that is created will be 839 * a branded process. 840 */ 841 if (cmd == B_EXEC_BRAND) { 842 VERIFY(p->p_zone != NULL); 843 VERIFY(p->p_zone->zone_brand == &lx_brand); 844 return (exec_common( 845 (char *)arg1, (const char **)arg2, (const char **)arg3, 846 EBA_BRAND)); 847 } 848 849 /* For all other operations this must be a branded process. */ 850 if (p->p_brand == NULL) 851 return (ENOSYS); 852 853 VERIFY(p->p_brand == &lx_brand); 854 VERIFY(p->p_brand_data != NULL); 855 856 switch (cmd) { 857 case B_REGISTER: 858 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 859 lx_print("stack mode was not PREINIT during " 860 "REGISTER\n"); 861 return (EINVAL); 862 } 863 864 if (p->p_model == DATAMODEL_NATIVE) { 865 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { 866 lx_print("Failed to copyin brand registration " 867 "at 0x%p\n", (void *)arg1); 868 return (EFAULT); 869 } 870 } 871 #ifdef _LP64 872 else { 873 /* 32-bit userland on 64-bit kernel */ 874 lx_brand_registration32_t reg32; 875 876 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { 877 lx_print("Failed to copyin brand registration " 878 "at 0x%p\n", (void *)arg1); 879 return (EFAULT); 880 } 881 882 reg.lxbr_version = (uint_t)reg32.lxbr_version; 883 reg.lxbr_handler = 884 (void *)(uintptr_t)reg32.lxbr_handler; 885 reg.lxbr_flags = reg32.lxbr_flags; 886 } 887 #endif 888 889 if (reg.lxbr_version != LX_VERSION_1) { 890 
lx_print("Invalid brand library version (%u)\n", 891 reg.lxbr_version); 892 return (EINVAL); 893 } 894 895 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { 896 lx_print("Invalid brand flags (%u)\n", 897 reg.lxbr_flags); 898 return (EINVAL); 899 } 900 901 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", 902 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); 903 pd = p->p_brand_data; 904 pd->l_handler = (uintptr_t)reg.lxbr_handler; 905 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; 906 907 return (0); 908 909 case B_TTYMODES: 910 /* This is necessary for emulating TCGETS ioctls. */ 911 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), 912 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, 913 &termios_len) != DDI_SUCCESS) 914 return (EIO); 915 916 ASSERT(termios_len == sizeof (*termios)); 917 918 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { 919 ddi_prop_free(termios); 920 return (EFAULT); 921 } 922 923 ddi_prop_free(termios); 924 return (0); 925 926 case B_ELFDATA: 927 pd = curproc->p_brand_data; 928 if (get_udatamodel() == DATAMODEL_NATIVE) { 929 if (copyout(&pd->l_elf_data, (void *)arg1, 930 sizeof (lx_elf_data_t)) != 0) { 931 return (EFAULT); 932 } 933 } 934 #if defined(_LP64) 935 else { 936 /* 32-bit userland on 64-bit kernel */ 937 lx_elf_data32_t led32; 938 939 led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; 940 led32.ed_phent = (int)pd->l_elf_data.ed_phent; 941 led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; 942 led32.ed_entry = (int)pd->l_elf_data.ed_entry; 943 led32.ed_base = (int)pd->l_elf_data.ed_base; 944 led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; 945 946 if (copyout(&led32, (void *)arg1, 947 sizeof (led32)) != 0) { 948 return (EFAULT); 949 } 950 } 951 #endif 952 return (0); 953 954 case B_EXEC_NATIVE: 955 return (exec_common((char *)arg1, (const char **)arg2, 956 (const char **)arg3, EBA_NATIVE)); 957 958 /* 959 * The B_TRUSS_POINT subcommand is used so that we can make a no-op 960 * syscall for debugging 
purposes (dtracing) from within the user-level 961 * emulation. 962 */ 963 case B_TRUSS_POINT: 964 return (0); 965 966 case B_LPID_TO_SPAIR: { 967 /* 968 * Given a Linux pid as arg1, return the Solaris pid in arg2 and 969 * the Solaris LWP in arg3. We also translate pid 1 (which is 970 * hardcoded in many applications) to the zone's init process. 971 */ 972 pid_t s_pid; 973 id_t s_tid; 974 975 if ((pid_t)arg1 == 1) { 976 s_pid = p->p_zone->zone_proc_initpid; 977 /* handle the dead/missing init(1M) case */ 978 if (s_pid == -1) 979 s_pid = 1; 980 s_tid = 1; 981 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { 982 return (ESRCH); 983 } 984 985 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || 986 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { 987 return (EFAULT); 988 } 989 990 return (0); 991 } 992 993 case B_SIGEV_THREAD_ID: { 994 /* 995 * Emulate Linux's timer_create(2) SIGEV_THREAD_ID 996 * notification method. This mechanism is only meant 997 * for userland threading libraries such as glibc and 998 * is documented as such. Therefore, assume this is 999 * only ever invoked for the purpose of alerting a 1000 * Linux threading library. Assume that the tid is a 1001 * member of the caller's process and the signal 1002 * number is valid. See lx_sigev_thread_id() for the 1003 * userland side of this emulation. 1004 * 1005 * The return code from this function is not checked 1006 * by the caller since it executes in an asynchronous 1007 * context and there is nothing much to be done. If 1008 * this function does fail then it will manifest as 1009 * Linux threads waiting for a signal they will never 1010 * receive. 
1011 * 1012 * arg1 -- Linux tid 1013 * arg2 -- Linux signal number 1014 * arg3 -- union sigval 1015 */ 1016 1017 int native_sig = lx_ltos_signo((int)arg2, 0); 1018 pid_t native_pid; 1019 int native_tid; 1020 sigqueue_t *sqp; 1021 1022 if (native_sig == 0) 1023 return (EINVAL); 1024 1025 lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid); 1026 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); 1027 mutex_enter(&curproc->p_lock); 1028 1029 if ((t = idtot(curproc, native_tid)) == NULL) { 1030 mutex_exit(&curproc->p_lock); 1031 kmem_free(sqp, sizeof (sigqueue_t)); 1032 return (ESRCH); 1033 } 1034 1035 sqp->sq_info.si_signo = native_sig; 1036 sqp->sq_info.si_code = SI_TIMER; 1037 sqp->sq_info.si_pid = curproc->p_pid; 1038 sqp->sq_info.si_zoneid = getzoneid(); 1039 sqp->sq_info.si_uid = crgetruid(CRED()); 1040 sqp->sq_info.si_value.sival_ptr = (void *)arg3; 1041 sigaddqa(curproc, t, sqp); 1042 1043 mutex_exit(&curproc->p_lock); 1044 1045 return (0); 1046 } 1047 1048 case B_SET_AFFINITY_MASK: 1049 case B_GET_AFFINITY_MASK: 1050 /* 1051 * Retrieve or store the CPU affinity mask for the 1052 * requested linux pid. 1053 * 1054 * arg1 is a linux PID (0 means curthread). 1055 * arg2 is the size of the given mask. 1056 * arg3 is the address of the affinity mask. 1057 */ 1058 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); 1059 1060 case B_PTRACE_STOP_FOR_OPT: 1061 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? 1062 B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); 1063 1064 case B_PTRACE_CLONE_BEGIN: 1065 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? 
1066 B_FALSE : B_TRUE)); 1067 1068 case B_PTRACE_KERNEL: 1069 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4)); 1070 1071 case B_HELPER_WAITID: { 1072 idtype_t idtype = (idtype_t)arg1; 1073 id_t id = (id_t)arg2; 1074 siginfo_t *infop = (siginfo_t *)arg3; 1075 int options = (int)arg4; 1076 1077 lwpd = ttolxlwp(curthread); 1078 1079 /* 1080 * Our brand-specific waitid helper only understands a subset of 1081 * the possible idtypes. Ensure we keep to that subset here: 1082 */ 1083 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { 1084 return (EINVAL); 1085 } 1086 1087 /* 1088 * Enable the return of emulated ptrace(2) stop conditions 1089 * through lx_waitid_helper, and stash the Linux-specific 1090 * extra waitid() flags. 1091 */ 1092 lwpd->br_waitid_emulate = B_TRUE; 1093 lwpd->br_waitid_flags = (int)arg5; 1094 1095 #if defined(_SYSCALL32_IMPL) 1096 if (get_udatamodel() != DATAMODEL_NATIVE) { 1097 return (waitsys32(idtype, id, infop, options)); 1098 } else 1099 #endif 1100 { 1101 return (waitsys(idtype, id, infop, options)); 1102 } 1103 1104 lwpd->br_waitid_emulate = B_FALSE; 1105 lwpd->br_waitid_flags = 0; 1106 1107 return (0); 1108 } 1109 1110 case B_UNSUPPORTED: { 1111 char dmsg[256]; 1112 1113 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { 1114 lx_print("Failed to copyin unsupported msg " 1115 "at 0x%p\n", (void *)arg1); 1116 return (EFAULT); 1117 } 1118 dmsg[255] = '\0'; 1119 lx_unsupported(dmsg); 1120 1121 lx_check_strict_failure(lwpd); 1122 1123 return (0); 1124 } 1125 1126 case B_STORE_ARGS: { 1127 /* 1128 * B_STORE_ARGS subcommand 1129 * arg1 = address of struct to be copied in 1130 * arg2 = size of the struct being copied in 1131 * arg3-arg6 ignored 1132 * rval = the amount of data copied. 
1133 */ 1134 void *buf; 1135 1136 /* only have upper limit because arg2 is unsigned */ 1137 if (arg2 > LX_BR_ARGS_SIZE_MAX) { 1138 return (EINVAL); 1139 } 1140 1141 buf = kmem_alloc(arg2, KM_SLEEP); 1142 if (copyin((void *)arg1, buf, arg2) != 0) { 1143 lx_print("Failed to copyin scall arg at 0x%p\n", 1144 (void *) arg1); 1145 kmem_free(buf, arg2); 1146 /* 1147 * Purposely not setting br_scall_args to NULL 1148 * to preserve data for debugging. 1149 */ 1150 return (EFAULT); 1151 } 1152 1153 if (lwpd->br_scall_args != NULL) { 1154 ASSERT(lwpd->br_args_size > 0); 1155 kmem_free(lwpd->br_scall_args, 1156 lwpd->br_args_size); 1157 } 1158 1159 lwpd->br_scall_args = buf; 1160 lwpd->br_args_size = arg2; 1161 *rval = arg2; 1162 return (0); 1163 } 1164 1165 case B_HELPER_CLONE: 1166 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, 1167 (void *)arg4)); 1168 1169 case B_HELPER_SETGROUPS: 1170 return (lx_helper_setgroups(arg1, (gid_t *)arg2)); 1171 1172 case B_HELPER_SIGQUEUE: 1173 return (lx_helper_rt_sigqueueinfo(arg1, arg2, 1174 (siginfo_t *)arg3)); 1175 1176 case B_HELPER_TGSIGQUEUE: 1177 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, 1178 (siginfo_t *)arg4)); 1179 1180 case B_SET_THUNK_PID: 1181 lwpd->br_lx_thunk_pid = arg1; 1182 return (0); 1183 1184 case B_GETPID: 1185 /* 1186 * The usermode clone(2) code needs to be able to call 1187 * lx_getpid() from native code: 1188 */ 1189 *rval = lx_getpid(); 1190 return (0); 1191 1192 case B_SET_NATIVE_STACK: 1193 /* 1194 * B_SET_NATIVE_STACK subcommand 1195 * arg1 = the base of the stack to use for emulation 1196 */ 1197 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 1198 lx_print("B_SET_NATIVE_STACK when stack was already " 1199 "set to %p\n", (void *)arg1); 1200 return (EEXIST); 1201 } 1202 1203 /* 1204 * We move from the PREINIT state, where we have no brand 1205 * emulation stack, to the INIT state. 
Here, we are still 1206 * running on what will become the BRAND stack, but are running 1207 * emulation (i.e. native) code. Once the initialisation 1208 * process for this thread has finished, we will jump to 1209 * brand-specific code, while moving to the BRAND mode. 1210 * 1211 * When a new LWP is created, lx_initlwp() will clear the 1212 * stack data. If that LWP is actually being duplicated 1213 * into a child process by fork(2), lx_forklwp() will copy 1214 * it so that the cloned thread will keep using the same 1215 * alternate stack. 1216 */ 1217 lwpd->br_ntv_stack = arg1; 1218 lwpd->br_stack_mode = LX_STACK_MODE_INIT; 1219 lx_lwp_set_native_stack_current(lwpd, arg1); 1220 1221 return (0); 1222 1223 case B_GET_CURRENT_CONTEXT: 1224 /* 1225 * B_GET_CURRENT_CONTEXT subcommand: 1226 * arg1 = address for pointer to current ucontext_t 1227 */ 1228 1229 #if defined(_SYSCALL32_IMPL) 1230 if (get_udatamodel() != DATAMODEL_NATIVE) { 1231 caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; 1232 1233 error = copyout(&addr, (void *)arg1, sizeof (addr)); 1234 } else 1235 #endif 1236 { 1237 error = copyout(&lwp->lwp_oldcontext, (void *)arg1, 1238 sizeof (lwp->lwp_oldcontext)); 1239 } 1240 1241 return (error != 0 ? EFAULT : 0); 1242 1243 case B_JUMP_TO_LINUX: 1244 /* 1245 * B_JUMP_TO_LINUX subcommand: 1246 * arg1 = ucontext_t pointer for jump state 1247 */ 1248 1249 if (arg1 == NULL) 1250 return (EINVAL); 1251 1252 switch (lwpd->br_stack_mode) { 1253 case LX_STACK_MODE_NATIVE: { 1254 struct regs *rp = lwptoregs(lwp); 1255 1256 /* 1257 * We are on the NATIVE stack, so we must preserve 1258 * the extent of that stack. The pointer will be 1259 * reset by a future setcontext(). 1260 */ 1261 lx_lwp_set_native_stack_current(lwpd, 1262 (uintptr_t)rp->r_sp); 1263 break; 1264 } 1265 1266 case LX_STACK_MODE_INIT: 1267 /* 1268 * The LWP is transitioning to Linux code for the first 1269 * time. 
1270 */ 1271 break; 1272 1273 case LX_STACK_MODE_PREINIT: 1274 /* 1275 * This LWP has not installed an alternate stack for 1276 * usermode emulation handling. 1277 */ 1278 return (ENOENT); 1279 1280 case LX_STACK_MODE_BRAND: 1281 /* 1282 * The LWP should not be on the BRAND stack. 1283 */ 1284 exit(CLD_KILLED, SIGSYS); 1285 return (0); 1286 } 1287 1288 /* 1289 * Transfer control to Linux: 1290 */ 1291 return (lx_runexe(lwp, (void *)arg1)); 1292 1293 case B_EMULATION_DONE: 1294 /* 1295 * B_EMULATION_DONE subcommand: 1296 * arg1 = ucontext_t * to restore 1297 * arg2 = system call number 1298 * arg3 = return code 1299 * arg4 = if operation failed, the errno value 1300 */ 1301 1302 /* 1303 * The first part of this operation is a setcontext() to 1304 * restore the register state to the copy we preserved 1305 * before vectoring to the usermode emulation routine. 1306 * If that fails, we return (hopefully) to the emulation 1307 * routine and it will handle the error. 1308 */ 1309 #if (_SYSCALL32_IMPL) 1310 if (get_udatamodel() != DATAMODEL_NATIVE) { 1311 error = getsetcontext32(SETCONTEXT, (void *)arg1); 1312 } else 1313 #endif 1314 { 1315 error = getsetcontext(SETCONTEXT, (void *)arg1); 1316 } 1317 1318 if (error != 0) { 1319 return (error); 1320 } 1321 1322 /* 1323 * The saved Linux context has been restored. We handle the 1324 * return value or errno with code common to the in-kernel 1325 * system call emulation. 
		 */
		if ((error = (int)arg4) != 0) {
			/*
			 * lx_syscall_return() looks at the errno in the LWP,
			 * so set it here:
			 */
			set_errno(error);
		}
		lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);

		return (0);

	case B_EXIT_AS_SIG:
		/*
		 * Terminate the whole process as if it had been killed by
		 * the signal in arg1, dumping core first for SIGSEGV.
		 */
		code = CLD_KILLED;
		sig = (int)arg1;
		proc_is_exiting(p);
		if (exitlwps(1) != 0) {
			/*
			 * exitlwps() failed -- presumably another exit is
			 * already in progress; just exit this LWP.
			 * NOTE(review): confirm against exitlwps() contract.
			 */
			mutex_enter(&p->p_lock);
			lwp_exit();
		}
		ttolwp(curthread)->lwp_cursig = sig;
		if (sig == SIGSEGV) {
			/* Report CLD_DUMPED only if the core was written. */
			if (core(sig, 0) == 0)
				code = CLD_DUMPED;
		}
		exit(code, sig);
		/* NOTREACHED */
		break;
	}

	return (EINVAL);
}

/*
 * Return the emulated Linux kernel version string configured for a zone.
 * The returned pointer refers to storage owned by the zone's brand data;
 * the caller must not free it.
 */
char *
lx_get_zone_kern_version(zone_t *zone)
{
	return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
}

/*
 * Set the emulated Linux kernel version string for a zone.  The string is
 * silently truncated to LX_VERS_MAX bytes (including the terminator).
 */
void
lx_set_kern_version(zone_t *zone, char *vers)
{
	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;

	(void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
}

/*
 * Compare the linux kernel version to the one set for the zone.
 * Returns greater than 0 if the zone version is higher, less than 0 if the
 * zone version is lower, and 0 if the versions are equal.  Versions are
 * parsed as up to three dot-separated decimal components ("major.minor.
 * patch"); missing components compare as 0.
 */
int
lx_kern_version_cmp(zone_t *zone, const char *vers)
{
	int zvers[3] = {0, 0, 0};
	int cvers[3] = {0, 0, 0};
	int i;

	VERIFY(zone->zone_brand == &lx_brand);

	(void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
	    &zvers[1], &zvers[2]);
	(void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);

	/* Lexicographic comparison, most significant component first. */
	for (i = 0; i < 3; i++) {
		if (zvers[i] > cvers[i]) {
			return (1);
		} else if (zvers[i] < cvers[i]) {
			return (-1);
		}
	}
	return (0);
}

/*
 * Linux unconditionally removes the setuid and setgid bits when changing
 * file ownership.
 * This brand hook overrides the illumos native behaviour,
 * which is based on the PRIV_FILE_SETID privilege.
 */
static int
lx_setid_clear(vattr_t *vap, cred_t *cr)
{
	/* cr is not consulted here; the hook signature requires it. */

	/* Directories keep their set-id bits across ownership changes. */
	if (S_ISDIR(vap->va_mode)) {
		return (0);
	}

	if (vap->va_mode & S_ISUID) {
		vap->va_mask |= AT_MODE;
		vap->va_mode &= ~S_ISUID;
	}
	/*
	 * Only clear S_ISGID when group-execute is also set.  S_ISGID
	 * without S_IXGRP conventionally marks mandatory file locking
	 * rather than set-gid execution -- NOTE(review): confirm that is
	 * the intent of the S_IXGRP test here.
	 */
	if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
		vap->va_mask |= AT_MODE;
		vap->va_mode &= ~S_ISGID;
	}

	return (0);
}

/*
 * Copy the per-process brand data from a parent proc to a child.
 * After the wholesale copy, the emulated ("fake") resource limits are
 * reset to their defaults rather than inherited from the parent.
 */
void
lx_copy_procdata(proc_t *child, proc_t *parent)
{
	lx_proc_data_t *cpd = child->p_brand_data;
	lx_proc_data_t *ppd = parent->p_brand_data;

	VERIFY(parent->p_brand == &lx_brand);
	VERIFY(child->p_brand == &lx_brand);
	VERIFY(ppd != NULL);
	VERIFY(cpd != NULL);

	*cpd = *ppd;

	cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
	cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;

	cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
	cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;

	cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
	cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;

	cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
	cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
}

#if defined(_LP64)
/*
 * Widen a 32-bit ELF file header into the 64-bit Ehdr form, field by
 * field.  Used when a 64-bit kernel loads a 32-bit Linux binary.
 */
static void
Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
{
	bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
	dst->e_type = src->e_type;
	dst->e_machine = src->e_machine;
	dst->e_version = src->e_version;
	dst->e_entry = src->e_entry;
	dst->e_phoff = src->e_phoff;
	dst->e_shoff = src->e_shoff;
	dst->e_flags = src->e_flags;
	dst->e_ehsize = src->e_ehsize;
	dst->e_phentsize =
src->e_phentsize; 1468 dst->e_phnum = src->e_phnum; 1469 dst->e_shentsize = src->e_shentsize; 1470 dst->e_shnum = src->e_shnum; 1471 dst->e_shstrndx = src->e_shstrndx; 1472 } 1473 #endif /* _LP64 */ 1474 1475 static void 1476 restoreexecenv(struct execenv *ep, stack_t *sp) 1477 { 1478 klwp_t *lwp = ttolwp(curthread); 1479 1480 setexecenv(ep); 1481 lwp->lwp_sigaltstack.ss_sp = sp->ss_sp; 1482 lwp->lwp_sigaltstack.ss_size = sp->ss_size; 1483 lwp->lwp_sigaltstack.ss_flags = sp->ss_flags; 1484 } 1485 1486 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, 1487 long *, int, caddr_t, cred_t *, int *); 1488 1489 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int, 1490 long *, int, caddr_t, cred_t *, int *); 1491 1492 /* 1493 * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux 1494 * binaries. 1495 */ 1496 static int 1497 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 1498 struct intpdata *idata, int level, long *execsz, int setid, 1499 caddr_t exec_file, struct cred *cred, int *brand_action) 1500 { 1501 int error; 1502 vnode_t *nvp; 1503 Ehdr ehdr; 1504 Addr uphdr_vaddr; 1505 intptr_t voffset; 1506 char *interp = NULL; 1507 uintptr_t ldaddr = NULL; 1508 int i; 1509 proc_t *p = ttoproc(curthread); 1510 klwp_t *lwp = ttolwp(curthread); 1511 struct execenv env; 1512 struct execenv origenv; 1513 stack_t orig_sigaltstack; 1514 struct user *up = PTOU(ttoproc(curthread)); 1515 lx_elf_data_t *edp; 1516 char *lib_path = NULL; 1517 1518 ASSERT(ttoproc(curthread)->p_brand == &lx_brand); 1519 ASSERT(ttoproc(curthread)->p_brand_data != NULL); 1520 1521 edp = &ttolxproc(curthread)->l_elf_data; 1522 1523 if (args->to_model == DATAMODEL_NATIVE) { 1524 lib_path = LX_LIB_PATH; 1525 } 1526 #if defined(_LP64) 1527 else { 1528 lib_path = LX_LIB_PATH32; 1529 } 1530 #endif 1531 1532 /* 1533 * Set the brandname and library name for the new process so that 1534 * elfexec() puts them onto the stack. 
1535 */ 1536 args->brandname = LX_BRANDNAME; 1537 args->emulator = lib_path; 1538 1539 #if defined(_LP64) 1540 /* 1541 * To conform with the way Linux lays out the address space, we clamp 1542 * the stack to be the top of the lower region of the x86-64 canonical 1543 * form address space -- which has the side-effect of laying out the 1544 * entire address space in that lower region. Note that this only 1545 * matters on 64-bit processes (this value will always be greater than 1546 * the size of a 32-bit address space) and doesn't actually affect 1547 * USERLIMIT: if a Linux-branded processes wishes to map something 1548 * into the top half of the address space, it can do so -- but with 1549 * the user stack starting at the top of the bottom region, those high 1550 * virtual addresses won't be used unless explicitly directed. 1551 */ 1552 args->maxstack = lx_maxstack64; 1553 #endif 1554 1555 /* 1556 * We will first exec the brand library, then map in the linux 1557 * executable and the linux linker. 1558 */ 1559 if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP, 1560 &nvp))) { 1561 uprintf("%s: not found.", lib_path); 1562 return (error); 1563 } 1564 1565 /* 1566 * We will eventually set the p_exec member to be the vnode for the new 1567 * executable when we call setexecenv(). However, if we get an error 1568 * before that call we need to restore the execenv to its original 1569 * values so that when we return to the caller fop_close() works 1570 * properly while cleaning up from the failed exec(). Restoring the 1571 * original value will also properly decrement the 2nd VN_RELE that we 1572 * took on the brand library. 
1573 */ 1574 origenv.ex_bssbase = p->p_bssbase; 1575 origenv.ex_brkbase = p->p_brkbase; 1576 origenv.ex_brksize = p->p_brksize; 1577 origenv.ex_vp = p->p_exec; 1578 orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp; 1579 orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size; 1580 orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; 1581 1582 if (args->to_model == DATAMODEL_NATIVE) { 1583 error = elfexec(nvp, uap, args, idata, level + 1, execsz, 1584 setid, exec_file, cred, brand_action); 1585 } 1586 #if defined(_LP64) 1587 else { 1588 error = elf32exec(nvp, uap, args, idata, level + 1, execsz, 1589 setid, exec_file, cred, brand_action); 1590 } 1591 #endif 1592 VN_RELE(nvp); 1593 if (error != 0) { 1594 restoreexecenv(&origenv, &orig_sigaltstack); 1595 return (error); 1596 } 1597 1598 /* 1599 * exec-ed in the brand library above. 1600 * The u_auxv vectors are now setup by elfexec to point to the 1601 * brand emulation library and its linker. 1602 */ 1603 1604 bzero(&env, sizeof (env)); 1605 1606 /* 1607 * map in the the Linux executable 1608 */ 1609 if (args->to_model == DATAMODEL_NATIVE) { 1610 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, 1611 &voffset, exec_file, &interp, &env.ex_bssbase, 1612 &env.ex_brkbase, &env.ex_brksize, NULL, NULL); 1613 } 1614 #if defined(_LP64) 1615 else { 1616 Elf32_Ehdr ehdr32; 1617 Elf32_Addr uphdr_vaddr32; 1618 1619 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, 1620 &voffset, exec_file, &interp, &env.ex_bssbase, 1621 &env.ex_brkbase, &env.ex_brksize, NULL, NULL); 1622 1623 Ehdr32to64(&ehdr32, &ehdr); 1624 1625 if (uphdr_vaddr32 == (Elf32_Addr)-1) 1626 uphdr_vaddr = (Addr)-1; 1627 else 1628 uphdr_vaddr = uphdr_vaddr32; 1629 } 1630 #endif 1631 if (error != 0) { 1632 restoreexecenv(&origenv, &orig_sigaltstack); 1633 1634 if (interp != NULL) 1635 kmem_free(interp, MAXPATHLEN); 1636 1637 return (error); 1638 } 1639 1640 /* 1641 * Save off the important properties of the lx executable. 
The brand 1642 * library will ask us for this data later, when it is ready to set 1643 * things up for the lx executable. 1644 */ 1645 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : 1646 voffset + uphdr_vaddr; 1647 edp->ed_entry = voffset + ehdr.e_entry; 1648 edp->ed_phent = ehdr.e_phentsize; 1649 edp->ed_phnum = ehdr.e_phnum; 1650 1651 if (interp != NULL) { 1652 if (ehdr.e_type == ET_DYN) { 1653 /* 1654 * This is a shared object executable, so we need to 1655 * pick a reasonable place to put the heap. Just don't 1656 * use the first page. 1657 */ 1658 env.ex_brkbase = (caddr_t)PAGESIZE; 1659 env.ex_bssbase = (caddr_t)PAGESIZE; 1660 } 1661 1662 /* 1663 * If the program needs an interpreter (most do), map it in and 1664 * store relevant information about it in the aux vector, where 1665 * the brand library can find it. 1666 */ 1667 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW, 1668 NULLVPP, &nvp))) { 1669 uprintf("%s: not found.", interp); 1670 restoreexecenv(&origenv, &orig_sigaltstack); 1671 kmem_free(interp, MAXPATHLEN); 1672 return (error); 1673 } 1674 1675 kmem_free(interp, MAXPATHLEN); 1676 interp = NULL; 1677 1678 /* 1679 * map in the Linux linker 1680 */ 1681 if (args->to_model == DATAMODEL_NATIVE) { 1682 error = mapexec_brand(nvp, args, &ehdr, 1683 &uphdr_vaddr, &voffset, exec_file, NULL, NULL, 1684 NULL, NULL, NULL, &ldaddr); 1685 } 1686 #if defined(_LP64) 1687 else { 1688 Elf32_Ehdr ehdr32; 1689 Elf32_Addr uphdr_vaddr32; 1690 1691 error = mapexec32_brand(nvp, args, &ehdr32, 1692 &uphdr_vaddr32, &voffset, exec_file, NULL, NULL, 1693 NULL, NULL, NULL, &ldaddr); 1694 1695 Ehdr32to64(&ehdr32, &ehdr); 1696 1697 if (uphdr_vaddr32 == (Elf32_Addr)-1) 1698 uphdr_vaddr = (Addr)-1; 1699 else 1700 uphdr_vaddr = uphdr_vaddr32; 1701 } 1702 #endif 1703 1704 VN_RELE(nvp); 1705 if (error != 0) { 1706 restoreexecenv(&origenv, &orig_sigaltstack); 1707 return (error); 1708 } 1709 1710 /* 1711 * Now that we know the base address of the brand's linker, 
1712 * we also save this for later use by the brand library. 1713 */ 1714 edp->ed_base = voffset; 1715 edp->ed_ldentry = voffset + ehdr.e_entry; 1716 } else { 1717 /* 1718 * This program has no interpreter. The lx brand library will 1719 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, 1720 * so in this case, put the entry point of the main executable 1721 * there. 1722 */ 1723 if (ehdr.e_type == ET_EXEC) { 1724 /* 1725 * An executable with no interpreter, this must be a 1726 * statically linked executable, which means we loaded 1727 * it at the address specified in the elf header, in 1728 * which case the e_entry field of the elf header is an 1729 * absolute address. 1730 */ 1731 edp->ed_ldentry = ehdr.e_entry; 1732 edp->ed_entry = ehdr.e_entry; 1733 } else { 1734 /* 1735 * A shared object with no interpreter, we use the 1736 * calculated address from above. 1737 */ 1738 edp->ed_ldentry = edp->ed_entry; 1739 1740 /* 1741 * In all situations except an ET_DYN elf object with no 1742 * interpreter, we want to leave the brk and base 1743 * values set by mapexec_brand alone. Normally when 1744 * running ET_DYN objects on Solaris (most likely 1745 * /lib/ld.so.1) the kernel sets brk and base to 0 since 1746 * it doesn't know where to put the heap, and later the 1747 * linker will call brk() to initialize the heap in: 1748 * usr/src/cmd/sgs/rtld/common/setup.c:setup() 1749 * after it has determined where to put it. (This 1750 * decision is made after the linker loads and inspects 1751 * elf properties of the target executable being run.) 1752 * 1753 * So for ET_DYN Linux executables, we also don't know 1754 * where the heap should go, so we'll set the brk and 1755 * base to 0. But in this case the Solaris linker will 1756 * not initialize the heap, so when the Linux linker 1757 * starts running there is no heap allocated. 
This 1758 * seems to be ok on Linux 2.4 based systems because the 1759 * Linux linker/libc fall back to using mmap() to 1760 * allocate memory. But on 2.6 systems, running 1761 * applications by specifying them as command line 1762 * arguments to the linker results in segfaults for an 1763 * as yet undetermined reason (which seems to indicatej 1764 * that a more permanent fix for heap initalization in 1765 * these cases may be necessary). 1766 */ 1767 if (ehdr.e_type == ET_DYN) { 1768 env.ex_bssbase = (caddr_t)0; 1769 env.ex_brkbase = (caddr_t)0; 1770 env.ex_brksize = 0; 1771 } 1772 } 1773 1774 } 1775 1776 env.ex_vp = vp; 1777 setexecenv(&env); 1778 1779 /* 1780 * We try to keep /proc's view of the aux vector consistent with 1781 * what's on the process stack. 1782 */ 1783 if (args->to_model == DATAMODEL_NATIVE) { 1784 auxv_t phdr_auxv[4] = { 1785 { AT_SUN_BRAND_LX_PHDR, 0 }, 1786 { AT_SUN_BRAND_LX_INTERP, 0 }, 1787 { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }, 1788 { AT_SUN_BRAND_AUX4, 0 } 1789 }; 1790 phdr_auxv[0].a_un.a_val = edp->ed_phdr; 1791 phdr_auxv[1].a_un.a_val = ldaddr; 1792 phdr_auxv[2].a_un.a_val = 1; /* set in lx_init */ 1793 phdr_auxv[3].a_type = AT_CLKTCK; 1794 phdr_auxv[3].a_un.a_val = hz; 1795 1796 if (copyout(&phdr_auxv, args->auxp_brand, 1797 sizeof (phdr_auxv)) == -1) 1798 return (EFAULT); 1799 } 1800 #if defined(_LP64) 1801 else { 1802 auxv32_t phdr_auxv32[3] = { 1803 { AT_SUN_BRAND_LX_PHDR, 0 }, 1804 { AT_SUN_BRAND_LX_INTERP, 0 }, 1805 { AT_SUN_BRAND_AUX3, 0 } 1806 }; 1807 phdr_auxv32[0].a_un.a_val = edp->ed_phdr; 1808 phdr_auxv32[1].a_un.a_val = ldaddr; 1809 phdr_auxv32[2].a_type = AT_CLKTCK; 1810 phdr_auxv32[2].a_un.a_val = hz; 1811 1812 if (copyout(&phdr_auxv32, args->auxp_brand, 1813 sizeof (phdr_auxv32)) == -1) 1814 return (EFAULT); 1815 } 1816 #endif 1817 1818 /* 1819 * /proc uses the AT_ENTRY aux vector entry to deduce 1820 * the location of the executable in the address space. 
The user 1821 * structure contains a copy of the aux vector that needs to have those 1822 * entries patched with the values of the real lx executable (they 1823 * currently contain the values from the lx brand library that was 1824 * elfexec'd, above). 1825 * 1826 * For live processes, AT_BASE is used to locate the linker segment, 1827 * which /proc and friends will later use to find Solaris symbols 1828 * (such as rtld_db_preinit). However, for core files, /proc uses 1829 * AT_ENTRY to find the right segment to label as the executable. 1830 * So we set AT_ENTRY to be the entry point of the linux executable, 1831 * but leave AT_BASE to be the address of the Solaris linker. 1832 */ 1833 for (i = 0; i < __KERN_NAUXV_IMPL; i++) { 1834 switch (up->u_auxv[i].a_type) { 1835 case AT_ENTRY: 1836 up->u_auxv[i].a_un.a_val = edp->ed_entry; 1837 break; 1838 1839 case AT_SUN_BRAND_LX_PHDR: 1840 up->u_auxv[i].a_un.a_val = edp->ed_phdr; 1841 break; 1842 1843 case AT_SUN_BRAND_LX_INTERP: 1844 up->u_auxv[i].a_un.a_val = ldaddr; 1845 break; 1846 1847 default: 1848 break; 1849 } 1850 } 1851 1852 return (0); 1853 } 1854 1855 boolean_t 1856 lx_native_exec(uint8_t osabi, const char **interp) 1857 { 1858 if (osabi != ELFOSABI_SOLARIS) 1859 return (B_FALSE); 1860 1861 /* 1862 * If the process root matches the zone root, prepend /native to the 1863 * interpreter path for native executables. Absolute precision from 1864 * VN_CMP is not necessary since any change of process root is likely 1865 * to make native binaries inaccessible via /native. 1866 * 1867 * Processes which chroot directly into /native will be able to 1868 * function as expected with no need for the prefix. 1869 */ 1870 if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) { 1871 *interp = "/native"; 1872 } 1873 1874 return (B_TRUE); 1875 } 1876 1877 static void 1878 lx_syscall_init(void) 1879 { 1880 int i; 1881 1882 /* 1883 * Count up the 32-bit Linux system calls. 
Note that lx_sysent32 1884 * has (LX_NSYSCALLS + 1) entries. 1885 */ 1886 for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) 1887 continue; 1888 lx_nsysent32 = i; 1889 1890 #if defined(_LP64) 1891 /* 1892 * Count up the 64-bit Linux system calls. Note that lx_sysent64 1893 * has (LX_NSYSCALLS + 1) entries. 1894 */ 1895 for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) 1896 continue; 1897 lx_nsysent64 = i; 1898 #endif 1899 } 1900 1901 int 1902 _init(void) 1903 { 1904 int err = 0; 1905 1906 lx_syscall_init(); 1907 lx_pid_init(); 1908 lx_ioctl_init(); 1909 lx_futex_init(); 1910 lx_ptrace_init(); 1911 lx_socket_init(); 1912 1913 err = mod_install(&modlinkage); 1914 if (err != 0) { 1915 cmn_err(CE_WARN, "Couldn't install lx brand module"); 1916 1917 /* 1918 * This looks drastic, but it should never happen. These 1919 * two data structures should be completely free-able until 1920 * they are used by Linux processes. Since the brand 1921 * wasn't loaded there should be no Linux processes, and 1922 * thus no way for these data structures to be modified. 1923 */ 1924 lx_pid_fini(); 1925 lx_ioctl_fini(); 1926 if (lx_futex_fini()) 1927 panic("lx brand module cannot be loaded or unloaded."); 1928 } 1929 return (err); 1930 } 1931 1932 int 1933 _info(struct modinfo *modinfop) 1934 { 1935 return (mod_info(&modlinkage, modinfop)); 1936 } 1937 1938 int 1939 _fini(void) 1940 { 1941 int err; 1942 int futex_done = 0; 1943 1944 /* 1945 * If there are any zones using this brand, we can't allow it to be 1946 * unloaded. 1947 */ 1948 if (brand_zone_count(&lx_brand)) 1949 return (EBUSY); 1950 1951 lx_ptrace_fini(); 1952 lx_pid_fini(); 1953 lx_ioctl_fini(); 1954 lx_socket_fini(); 1955 1956 if ((err = lx_futex_fini()) != 0) { 1957 goto done; 1958 } 1959 futex_done = 1; 1960 1961 err = mod_remove(&modlinkage); 1962 1963 done: 1964 if (err) { 1965 /* 1966 * If we can't unload the module, then we have to get it 1967 * back into a sane state. 
1968 */ 1969 lx_ptrace_init(); 1970 lx_pid_init(); 1971 lx_ioctl_init(); 1972 lx_socket_init(); 1973 1974 if (futex_done) { 1975 lx_futex_init(); 1976 } 1977 } 1978 1979 return (err); 1980 }