1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 2015, Joyent, Inc. All rights reserved. 29 */ 30 31 /* 32 * The LX Brand: emulation of a Linux operating environment within a zone. 33 * 34 * OVERVIEW 35 * 36 * The LX brand enables a full Linux userland -- including a C library, 37 * init(1) framework, and some set of applications -- to run unmodified 38 * within an illumos zone. Unlike illumos, where applications are expected 39 * to link against and consume functions exported from libraries, the 40 * supported Linux binary compatibility boundary is the system call 41 * interface. By accurately emulating the behaviour of Linux system calls, 42 * Linux software can be executed in this environment as if it were running 43 * on a native Linux system. 44 * 45 * EMULATING LINUX SYSTEM CALLS 46 * 47 * Linux system calls are made in 32-bit processes via the "int 0x80" 48 * instruction; in 64-bit processes the "syscall" instruction is used, as it 49 * is with native illumos processes. 
In both cases, arguments to system 50 * calls are generally passed in registers and the usermode stack is not 51 * interpreted or modified by the Linux kernel. 52 * 53 * When the emulated Linux process makes a system call, it traps into the 54 * illumos kernel. The in-kernel brand module contains various emulation 55 * routines, and can fully service some emulated system calls; e.g. read(2) 56 * and write(2). Other system calls require assistance from the illumos 57 * libc, bouncing back out to the brand library ("lx_brand.so.1") for 58 * emulation. 59 * 60 * The brand mechanism allows for the provision of an alternative trap 61 * handler for the various system call mechanisms. Traditionally this was 62 * used to immediately revector execution to the usermode emulation library, 63 * which was responsible for handling all system calls. In the interests of 64 * more accurate emulation and increased performance, much of the regular 65 * illumos system call path is now invoked. Only the argument processing and 66 * handler dispatch are replaced by the brand, via the per-LWP 67 * "lwp_brand_syscall" interposition function pointer. 68 * 69 * THE NATIVE AND BRAND STACKS 70 * 71 * Some runtime environments (e.g. the Go language) allocate very small 72 * thread stacks, preferring to grow or split the stack as necessary. The 73 * Linux kernel generally does not use the usermode stack when servicing 74 * system calls, so this is not a problem. In order for our emulation to 75 * have the same zero stack impact, we must execute usermode emulation 76 * routines on an _alternate_ stack. This is similar, in principle, to the 77 * use of sigaltstack(3C) to run signal handlers off the main thread stack. 78 * 79 * To this end, the brand library allocates and installs an alternate stack 80 * (called the "native" stack) for each LWP. 
The in-kernel brand code uses 81 * this stack for usermode emulation calls and interposed signal delivery, 82 * while the emulated Linux process sees only the data on the main thread 83 * stack, known as the "brand" stack. The stack mode is tracked in the 84 * per-LWP brand-private data, using the LX_STACK_MODE_* enum. 85 * 86 * The stack mode doubles as a system call "mode bit". When in the 87 * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux 88 * system calls. In other modes, system calls are assumed to be native 89 * illumos system calls as made during brand library initialisation and 90 * usermode emulation. 91 * 92 * USERMODE EMULATION 93 * 94 * When a Linux system call cannot be emulated within the kernel, we preserve 95 * the register state of the Linux process and revector the LWP to the brand 96 * library usermode emulation handler: the "lx_emulate()" function in 97 * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, 98 * and is performed in "lx_emulate_user()". 99 * 100 * First, the emulated process state is written out to the usermode stack of 101 * the process as a "ucontext_t" object. Arguments to the emulation routine 102 * are passed on the stack or in registers, depending on the ABI. When the 103 * usermode emulation is complete, the result is passed back to the kernel 104 * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context 105 * for restoration. 106 * 107 * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT 108 * 109 * When servicing emulated system calls in the usermode brand library, or 110 * during signal delivery, various state is preserved by the kernel so that 111 * the running LWP may be revectored to a handling routine. The context 112 * allows the kernel to restart the program at the point of interruption, 113 * either at the return of the signal handler, via setcontext(3C); or after 114 * the usermode emulation request has been serviced, via B_EMULATION_DONE. 
115 * 116 * In illumos native processes, the saved context (a "ucontext_t" object) 117 * includes the state of registers and the current signal mask at the point 118 * of interruption. The context also includes a link to the most recently 119 * saved context, forming a chain to be unwound as requests complete. The LX 120 * brand requires additional book-keeping to describe the machine state: in 121 * particular, the current stack mode and the occupied extent of the native 122 * stack. 123 * 124 * The brand code is able to interpose on the context save and restore 125 * operations in the kernel -- see "lx_savecontext()" and 126 * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to 127 * function correctly in the face of a dual stack LWP. The brand also 128 * interposes on the signal delivery mechanism -- see "lx_sendsig()" and 129 * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand 130 * library interposer on the native stack, regardless of the interrupted 131 * execution mode. Linux sigaltstack(2) emulation is performed entirely by 132 * the usermode brand library during signal handler interposition. 
133 */ 134 135 #include <sys/types.h> 136 #include <sys/kmem.h> 137 #include <sys/errno.h> 138 #include <sys/thread.h> 139 #include <sys/systm.h> 140 #include <sys/syscall.h> 141 #include <sys/proc.h> 142 #include <sys/modctl.h> 143 #include <sys/cmn_err.h> 144 #include <sys/model.h> 145 #include <sys/exec.h> 146 #include <sys/lx_impl.h> 147 #include <sys/machbrand.h> 148 #include <sys/lx_syscalls.h> 149 #include <sys/lx_misc.h> 150 #include <sys/lx_futex.h> 151 #include <sys/lx_brand.h> 152 #include <sys/param.h> 153 #include <sys/termios.h> 154 #include <sys/sunddi.h> 155 #include <sys/ddi.h> 156 #include <sys/vnode.h> 157 #include <sys/pathname.h> 158 #include <sys/auxv.h> 159 #include <sys/priv.h> 160 #include <sys/regset.h> 161 #include <sys/privregs.h> 162 #include <sys/archsystm.h> 163 #include <sys/zone.h> 164 #include <sys/brand.h> 165 #include <sys/sdt.h> 166 #include <sys/x86_archext.h> 167 #include <sys/controlregs.h> 168 #include <sys/core.h> 169 #include <sys/stack.h> 170 #include <sys/stat.h> 171 #include <sys/socket.h> 172 #include <lx_signum.h> 173 #include <util/sscanf.h> 174 175 int lx_debug = 0; 176 177 void lx_init_brand_data(zone_t *); 178 void lx_free_brand_data(zone_t *); 179 void lx_setbrand(proc_t *); 180 int lx_getattr(zone_t *, int, void *, size_t *); 181 int lx_setattr(zone_t *, int, void *, size_t); 182 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 183 uintptr_t, uintptr_t); 184 void lx_set_kern_version(zone_t *, char *); 185 void lx_copy_procdata(proc_t *, proc_t *); 186 187 extern int getsetcontext(int, void *); 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int); 189 #if defined(_SYSCALL32_IMPL) 190 extern int getsetcontext32(int, void *); 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int); 192 #endif 193 194 extern void lx_proc_exit(proc_t *); 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); 196 197 extern void lx_ioctl_init(); 198 extern void lx_ioctl_fini(); 199 extern 
void lx_socket_init(); 200 extern void lx_socket_fini(); 201 202 lx_systrace_f *lx_systrace_entry_ptr; 203 lx_systrace_f *lx_systrace_return_ptr; 204 205 static int lx_systrace_enabled; 206 207 /* 208 * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly 209 * want an MMU dependency here (and should there be a microprocessor without 210 * a hole, we don't want to start allocating from the top of the VA range). 211 */ 212 #define LX_MAXSTACK64 0x7ffffff00000 213 214 uint64_t lx_maxstack64 = LX_MAXSTACK64; 215 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 217 struct intpdata *idata, int level, long *execsz, int setid, 218 caddr_t exec_file, struct cred *cred, int *brand_action); 219 220 static boolean_t lx_native_exec(uint8_t, const char **); 221 static uint32_t lx_map32limit(proc_t *); 222 223 static void lx_savecontext(ucontext_t *); 224 static void lx_restorecontext(ucontext_t *); 225 static caddr_t lx_sendsig_stack(int); 226 static void lx_sendsig(int); 227 #if defined(_SYSCALL32_IMPL) 228 static void lx_savecontext32(ucontext32_t *); 229 #endif 230 static int lx_setid_clear(vattr_t *, cred_t *); 231 #if defined(_LP64) 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, 233 enum seg_rw); 234 #endif 235 236 237 /* lx brand */ 238 struct brand_ops lx_brops = { 239 lx_init_brand_data, /* b_init_brand_data */ 240 lx_free_brand_data, /* b_free_brand_data */ 241 lx_brandsys, /* b_brandsys */ 242 lx_setbrand, /* b_setbrand */ 243 lx_getattr, /* b_getattr */ 244 lx_setattr, /* b_setattr */ 245 lx_copy_procdata, /* b_copy_procdata */ 246 lx_proc_exit, /* b_proc_exit */ 247 lx_exec, /* b_exec */ 248 lx_setrval, /* b_lwp_setrval */ 249 lx_lwpdata_alloc, /* b_lwpdata_alloc */ 250 lx_lwpdata_free, /* b_lwpdata_free */ 251 lx_initlwp, /* b_initlwp */ 252 lx_forklwp, /* b_forklwp */ 253 lx_freelwp, /* b_freelwp */ 254 lx_exitlwp, /* b_lwpexit */ 255 lx_elfexec, /* b_elfexec */ 256 NULL, /* 
b_sigset_native_to_brand */ 257 NULL, /* b_sigset_brand_to_native */ 258 lx_sigfd_translate, /* b_sigfd_translate */ 259 NSIG, /* b_nsig */ 260 lx_exit_with_sig, /* b_exit_with_sig */ 261 lx_wait_filter, /* b_wait_filter */ 262 lx_native_exec, /* b_native_exec */ 263 lx_map32limit, /* b_map32limit */ 264 lx_stop_notify, /* b_stop_notify */ 265 lx_waitid_helper, /* b_waitid_helper */ 266 lx_sigcld_repost, /* b_sigcld_repost */ 267 lx_ptrace_issig_stop, /* b_issig_stop */ 268 lx_ptrace_sig_ignorable, /* b_sig_ignorable */ 269 lx_savecontext, /* b_savecontext */ 270 #if defined(_SYSCALL32_IMPL) 271 lx_savecontext32, /* b_savecontext32 */ 272 #endif 273 lx_restorecontext, /* b_restorecontext */ 274 lx_sendsig_stack, /* b_sendsig_stack */ 275 lx_sendsig, /* b_sendsig */ 276 lx_setid_clear, /* b_setid_clear */ 277 #if defined(_LP64) 278 lx_pagefault /* b_pagefault */ 279 #else 280 NULL 281 #endif 282 }; 283 284 struct brand_mach_ops lx_mops = { 285 NULL, 286 NULL, 287 NULL, 288 NULL, 289 NULL, 290 lx_fixsegreg, 291 lx_fsbase 292 }; 293 294 struct brand lx_brand = { 295 BRAND_VER_1, 296 "lx", 297 &lx_brops, 298 &lx_mops, 299 sizeof (struct lx_proc_data) 300 }; 301 302 static struct modlbrand modlbrand = { 303 &mod_brandops, "lx brand", &lx_brand 304 }; 305 306 static struct modlinkage modlinkage = { 307 MODREV_1, (void *)&modlbrand, NULL 308 }; 309 310 void 311 lx_proc_exit(proc_t *p) 312 { 313 lx_proc_data_t *lxpd; 314 proc_t *cp; 315 316 mutex_enter(&p->p_lock); 317 VERIFY(lxpd = ptolxproc(p)); 318 if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) { 319 mutex_exit(&p->p_lock); 320 return; 321 } 322 mutex_exit(&p->p_lock); 323 324 /* Check for children which desire notification of parental death. 
*/ 325 mutex_enter(&pidlock); 326 for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) { 327 mutex_enter(&cp->p_lock); 328 if ((lxpd = ptolxproc(cp)) == NULL) { 329 mutex_exit(&cp->p_lock); 330 continue; 331 } 332 if (lxpd->l_parent_deathsig != 0) { 333 sigtoproc(p, NULL, lxpd->l_parent_deathsig); 334 } 335 mutex_exit(&cp->p_lock); 336 } 337 mutex_exit(&pidlock); 338 } 339 340 void 341 lx_setbrand(proc_t *p) 342 { 343 /* Send SIGCHLD to parent by default when child exits */ 344 ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; 345 } 346 347 /* ARGSUSED */ 348 int 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 350 { 351 char vers[LX_VERS_MAX]; 352 353 if (attr == LX_KERN_VERSION_NUM) { 354 if (bufsize > (LX_VERS_MAX - 1)) 355 return (ERANGE); 356 bzero(vers, LX_VERS_MAX); 357 if (copyin(buf, &vers, bufsize) != 0) 358 return (EFAULT); 359 lx_set_kern_version(zone, vers); 360 return (0); 361 } 362 return (EINVAL); 363 } 364 365 /* ARGSUSED */ 366 int 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 368 { 369 if (attr == LX_KERN_VERSION_NUM) { 370 if (*bufsize < LX_VERS_MAX) 371 return (ERANGE); 372 if (copyout(lx_get_zone_kern_version(curzone), buf, 373 LX_VERS_MAX) != 0) 374 return (EFAULT); 375 *bufsize = LX_VERS_MAX; 376 return (0); 377 } 378 return (-EINVAL); 379 } 380 381 uint32_t 382 lx_map32limit(proc_t *p) 383 { 384 /* 385 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only 386 * allow mappings in the first 31 bits. This was a nuance in the 387 * original Linux implementation circa 2002, and applications have 388 * come to depend on its behavior. 389 * 390 * This is only relevant for 64-bit processes. 
391 */ 392 if (p->p_model == DATAMODEL_LP64) 393 return (1 << 31); 394 395 return ((uint32_t)USERLIMIT32); 396 } 397 398 void 399 lx_brand_systrace_enable(void) 400 { 401 VERIFY(!lx_systrace_enabled); 402 403 lx_systrace_enabled = 1; 404 } 405 406 void 407 lx_brand_systrace_disable(void) 408 { 409 VERIFY(lx_systrace_enabled); 410 411 lx_systrace_enabled = 0; 412 } 413 414 void 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) 416 { 417 VERIFY(lwpd->br_ntv_stack != 0); 418 419 /* 420 * The "brand-lx-set-ntv-stack-current" probe has arguments: 421 * arg0: stack pointer before change 422 * arg1: stack pointer after change 423 * arg2: current stack base 424 */ 425 DTRACE_PROBE3(brand__lx__set__ntv__stack__current, 426 uintptr_t, lwpd->br_ntv_stack_current, 427 uintptr_t, new_sp, 428 uintptr_t, lwpd->br_ntv_stack); 429 430 lwpd->br_ntv_stack_current = new_sp; 431 } 432 433 #if defined(_LP64) 434 static int 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, 436 enum seg_rw rw) 437 { 438 int syscall_num; 439 440 /* 441 * We only want to handle a very specific set of circumstances. 442 * Namely: this is a 64-bit LX-branded process attempting to execute an 443 * address in a page for which it does not have a valid mapping. If 444 * this is not the case, we bail out as fast as possible. 445 */ 446 VERIFY(PROC_IS_BRANDED(p)); 447 if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != 448 DATAMODEL_NATIVE) { 449 return (-1); 450 } 451 452 if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { 453 return (-1); 454 } 455 456 /* 457 * This is a valid vsyscall address. We service the system call and 458 * return 0 to signal that the pagefault has been handled completely. 
459 */ 460 lx_vsyscall_enter(p, lwp, syscall_num); 461 return (0); 462 } 463 #endif 464 465 /* 466 * This hook runs prior to sendsig() processing and allows us to nominate 467 * an alternative stack pointer for delivery of the signal handling frame. 468 * Critically, this routine should _not_ modify any LWP state as the 469 * savecontext() does not run until after this hook. 470 */ 471 static caddr_t 472 lx_sendsig_stack(int sig) 473 { 474 klwp_t *lwp = ttolwp(curthread); 475 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 476 477 /* 478 * We want to take signal delivery on the native stack, but only if 479 * one has been allocated and installed for this LWP. 480 */ 481 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 482 /* 483 * The program is not running on the native stack. Return 484 * the native stack pointer from our brand-private data so 485 * that we may switch to it for signal handling. 486 */ 487 return ((caddr_t)lwpd->br_ntv_stack_current); 488 } else { 489 struct regs *rp = lwptoregs(lwp); 490 491 /* 492 * Either the program is already running on the native stack, 493 * or one has not yet been allocated for this LWP. Use the 494 * current stack pointer value. 495 */ 496 return ((caddr_t)rp->r_sp); 497 } 498 } 499 500 /* 501 * This hook runs after sendsig() processing and allows us to update the 502 * per-LWP mode flags for system calls and stacks. The pre-signal 503 * context has already been saved and delivered to the user at this point. 504 */ 505 static void 506 lx_sendsig(int sig) 507 { 508 klwp_t *lwp = ttolwp(curthread); 509 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 510 struct regs *rp = lwptoregs(lwp); 511 512 switch (lwpd->br_stack_mode) { 513 case LX_STACK_MODE_BRAND: 514 case LX_STACK_MODE_NATIVE: 515 /* 516 * In lx_sendsig_stack(), we nominated a stack pointer from the 517 * native stack. 
Update the stack mode, and the current in-use 518 * extent of the native stack, accordingly: 519 */ 520 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 521 lx_lwp_set_native_stack_current(lwpd, rp->r_sp); 522 523 /* 524 * Fix up segment registers, etc. 525 */ 526 lx_switch_to_native(lwp); 527 break; 528 529 default: 530 /* 531 * Otherwise, the brand library has not yet installed the 532 * alternate stack for this LWP. Signals will be handled on 533 * the regular stack thread. 534 */ 535 return; 536 } 537 } 538 539 /* 540 * This hook runs prior to the context restoration, allowing us to take action 541 * or modify the context before it is loaded. 542 */ 543 static void 544 lx_restorecontext(ucontext_t *ucp) 545 { 546 klwp_t *lwp = ttolwp(curthread); 547 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 548 uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; 549 caddr_t sp = ucp->uc_brand_data[1]; 550 551 /* 552 * We have a saved native stack pointer value that we must restore 553 * into the per-LWP data. 554 */ 555 if (flags & LX_UC_RESTORE_NATIVE_SP) { 556 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); 557 } 558 559 /* 560 * We do not wish to restore the value of uc_link in this context, 561 * so replace it with the value currently in the LWP. 562 */ 563 if (flags & LX_UC_IGNORE_LINK) { 564 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; 565 } 566 567 /* 568 * Restore the stack mode: 569 */ 570 if (flags & LX_UC_STACK_NATIVE) { 571 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 572 } else if (flags & LX_UC_STACK_BRAND) { 573 lwpd->br_stack_mode = LX_STACK_MODE_BRAND; 574 } 575 576 #if defined(__amd64) 577 /* 578 * Override the fs/gsbase in the context with the value provided 579 * through the Linux arch_prctl(2) system call. 
580 */ 581 if (flags & LX_UC_STACK_BRAND) { 582 if (lwpd->br_lx_fsbase != 0) { 583 ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; 584 } 585 if (lwpd->br_lx_gsbase != 0) { 586 ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; 587 } 588 } 589 #endif 590 } 591 592 static void 593 lx_savecontext(ucontext_t *ucp) 594 { 595 klwp_t *lwp = ttolwp(curthread); 596 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 597 uintptr_t flags = 0; 598 599 /* 600 * The ucontext_t affords us three private pointer-sized members in 601 * "uc_brand_data". We pack a variety of flags into the first element, 602 * and an optional stack pointer in the second element. The flags 603 * determine which stack pointer (native or brand), if any, is stored 604 * in the second element. The third element may contain the system 605 * call number; this is analogous to the "orig_[er]ax" member of a 606 * Linux "user_regs_struct". 607 */ 608 609 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 610 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 611 /* 612 * Record the value of the native stack pointer to restore 613 * when returning to this branded context: 614 */ 615 flags |= LX_UC_RESTORE_NATIVE_SP; 616 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; 617 } 618 619 /* 620 * Save the stack mode: 621 */ 622 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 623 flags |= LX_UC_STACK_NATIVE; 624 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 625 flags |= LX_UC_STACK_BRAND; 626 } 627 628 /* 629 * If we might need to restart this system call, save that information 630 * in the context: 631 */ 632 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 633 ucp->uc_brand_data[2] = 634 (void *)(uintptr_t)lwpd->br_syscall_num; 635 if (lwpd->br_syscall_restart) { 636 flags |= LX_UC_RESTART_SYSCALL; 637 } 638 } else { 639 ucp->uc_brand_data[2] = NULL; 640 } 641 642 ucp->uc_brand_data[0] = (void *)flags; 643 } 644 645 #if defined(_SYSCALL32_IMPL) 646 static void 647 lx_savecontext32(ucontext32_t 
*ucp) 648 { 649 klwp_t *lwp = ttolwp(curthread); 650 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 651 unsigned int flags = 0; 652 653 /* 654 * The ucontext_t affords us three private pointer-sized members in 655 * "uc_brand_data". We pack a variety of flags into the first element, 656 * and an optional stack pointer in the second element. The flags 657 * determine which stack pointer (native or brand), if any, is stored 658 * in the second element. The third element may contain the system 659 * call number; this is analogous to the "orig_[er]ax" member of a 660 * Linux "user_regs_struct". 661 */ 662 663 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 664 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 665 /* 666 * Record the value of the native stack pointer to restore 667 * when returning to this branded context: 668 */ 669 flags |= LX_UC_RESTORE_NATIVE_SP; 670 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; 671 } 672 673 /* 674 * Save the stack mode: 675 */ 676 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 677 flags |= LX_UC_STACK_NATIVE; 678 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 679 flags |= LX_UC_STACK_BRAND; 680 } 681 682 /* 683 * If we might need to restart this system call, save that information 684 * in the context: 685 */ 686 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 687 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; 688 if (lwpd->br_syscall_restart) { 689 flags |= LX_UC_RESTART_SYSCALL; 690 } 691 } else { 692 ucp->uc_brand_data[2] = NULL; 693 } 694 695 ucp->uc_brand_data[0] = flags; 696 } 697 #endif 698 699 void 700 lx_init_brand_data(zone_t *zone) 701 { 702 lx_zone_data_t *data; 703 ASSERT(zone->zone_brand == &lx_brand); 704 ASSERT(zone->zone_brand_data == NULL); 705 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); 706 /* 707 * Set the default lxzd_kernel_version to 2.4. 708 * This can be changed by a call to setattr() during zone boot. 
709 */ 710 (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX); 711 712 /* 713 * Linux is not at all picky about address family when it comes to 714 * supporting interface-related ioctls. To mimic this behavior, we'll 715 * attempt those ioctls against a ksocket configured for that purpose. 716 */ 717 (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0, 718 0, zone->zone_kcred); 719 720 zone->zone_brand_data = data; 721 722 /* 723 * In Linux, if the init(1) process terminates the system panics. 724 * The zone must reboot to simulate this behaviour. 725 */ 726 zone->zone_reboot_on_init_exit = B_TRUE; 727 } 728 729 void 730 lx_free_brand_data(zone_t *zone) 731 { 732 lx_zone_data_t *data = ztolxzd(zone); 733 ASSERT(data != NULL); 734 if (data->lxzd_ioctl_sock != NULL) { 735 /* 736 * Since zone_kcred has been cleaned up already, close the 737 * socket using the global kcred. 738 */ 739 ksocket_close(data->lxzd_ioctl_sock, kcred); 740 data->lxzd_ioctl_sock = NULL; 741 } 742 zone->zone_brand_data = NULL; 743 kmem_free(data, sizeof (*data)); 744 } 745 746 void 747 lx_unsupported(char *dmsg) 748 { 749 lx_proc_data_t *pd = ttolxproc(curthread); 750 751 DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); 752 753 if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { 754 /* 755 * If this process was run with strict mode enabled 756 * (via LX_STRICT in the environment), we mark this 757 * LWP as having triggered an unsupported behaviour. 758 * This flag will be checked at an appropriate point 759 * by lx_check_strict_failure(). 
760 */ 761 lx_lwp_data_t *lwpd = ttolxlwp(curthread); 762 763 lwpd->br_strict_failure = B_TRUE; 764 } 765 } 766 767 void 768 lx_check_strict_failure(lx_lwp_data_t *lwpd) 769 { 770 proc_t *p; 771 772 if (!lwpd->br_strict_failure) { 773 return; 774 } 775 776 lwpd->br_strict_failure = B_FALSE; 777 778 /* 779 * If this process is operating in strict mode (via LX_STRICT in 780 * the environment), and has triggered a call to 781 * lx_unsupported(), we drop SIGSYS on it as we return. 782 */ 783 p = curproc; 784 mutex_enter(&p->p_lock); 785 sigtoproc(p, curthread, SIGSYS); 786 mutex_exit(&p->p_lock); 787 } 788 789 void 790 lx_trace_sysenter(int syscall_num, uintptr_t *args) 791 { 792 if (lx_systrace_enabled) { 793 VERIFY(lx_systrace_entry_ptr != NULL); 794 795 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], 796 args[2], args[3], args[4], args[5]); 797 } 798 } 799 800 void 801 lx_trace_sysreturn(int syscall_num, long ret) 802 { 803 if (lx_systrace_enabled) { 804 VERIFY(lx_systrace_return_ptr != NULL); 805 806 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); 807 } 808 } 809 810 /* 811 * Get the addresses of the user-space system call handler and attach it to 812 * the proc structure. Returning 0 indicates success; the value returned 813 * by the system call is the value stored in rval. Returning a non-zero 814 * value indicates a failure; the value returned is used to set errno, -1 815 * is returned from the syscall and the contents of rval are ignored. To 816 * set errno and have the syscall return a value other than -1 we can 817 * manually set errno and rval and return 0. 
818 */ 819 int 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 821 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) 822 { 823 kthread_t *t = curthread; 824 klwp_t *lwp = ttolwp(t); 825 proc_t *p = ttoproc(t); 826 lx_proc_data_t *pd; 827 struct termios *termios; 828 uint_t termios_len; 829 int error; 830 int code; 831 int sig; 832 lx_brand_registration_t reg; 833 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 834 835 /* 836 * There is one operation that is suppored for non-branded 837 * process. B_EXEC_BRAND. This is the equilivant of an 838 * exec call, but the new process that is created will be 839 * a branded process. 840 */ 841 if (cmd == B_EXEC_BRAND) { 842 VERIFY(p->p_zone != NULL); 843 VERIFY(p->p_zone->zone_brand == &lx_brand); 844 return (exec_common( 845 (char *)arg1, (const char **)arg2, (const char **)arg3, 846 EBA_BRAND)); 847 } 848 849 /* For all other operations this must be a branded process. */ 850 if (p->p_brand == NULL) 851 return (ENOSYS); 852 853 VERIFY(p->p_brand == &lx_brand); 854 VERIFY(p->p_brand_data != NULL); 855 856 switch (cmd) { 857 case B_REGISTER: 858 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 859 lx_print("stack mode was not PREINIT during " 860 "REGISTER\n"); 861 return (EINVAL); 862 } 863 864 if (p->p_model == DATAMODEL_NATIVE) { 865 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { 866 lx_print("Failed to copyin brand registration " 867 "at 0x%p\n", (void *)arg1); 868 return (EFAULT); 869 } 870 } 871 #ifdef _LP64 872 else { 873 /* 32-bit userland on 64-bit kernel */ 874 lx_brand_registration32_t reg32; 875 876 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { 877 lx_print("Failed to copyin brand registration " 878 "at 0x%p\n", (void *)arg1); 879 return (EFAULT); 880 } 881 882 reg.lxbr_version = (uint_t)reg32.lxbr_version; 883 reg.lxbr_handler = 884 (void *)(uintptr_t)reg32.lxbr_handler; 885 reg.lxbr_flags = reg32.lxbr_flags; 886 } 887 #endif 888 889 if (reg.lxbr_version != LX_VERSION_1) { 890 
lx_print("Invalid brand library version (%u)\n", 891 reg.lxbr_version); 892 return (EINVAL); 893 } 894 895 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { 896 lx_print("Invalid brand flags (%u)\n", 897 reg.lxbr_flags); 898 return (EINVAL); 899 } 900 901 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", 902 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); 903 pd = p->p_brand_data; 904 pd->l_handler = (uintptr_t)reg.lxbr_handler; 905 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; 906 907 return (0); 908 909 case B_TTYMODES: 910 /* This is necessary for emulating TCGETS ioctls. */ 911 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), 912 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, 913 &termios_len) != DDI_SUCCESS) 914 return (EIO); 915 916 ASSERT(termios_len == sizeof (*termios)); 917 918 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { 919 ddi_prop_free(termios); 920 return (EFAULT); 921 } 922 923 ddi_prop_free(termios); 924 return (0); 925 926 case B_ELFDATA: 927 pd = curproc->p_brand_data; 928 if (get_udatamodel() == DATAMODEL_NATIVE) { 929 if (copyout(&pd->l_elf_data, (void *)arg1, 930 sizeof (lx_elf_data_t)) != 0) { 931 return (EFAULT); 932 } 933 } 934 #if defined(_LP64) 935 else { 936 /* 32-bit userland on 64-bit kernel */ 937 lx_elf_data32_t led32; 938 939 led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; 940 led32.ed_phent = (int)pd->l_elf_data.ed_phent; 941 led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; 942 led32.ed_entry = (int)pd->l_elf_data.ed_entry; 943 led32.ed_base = (int)pd->l_elf_data.ed_base; 944 led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; 945 946 if (copyout(&led32, (void *)arg1, 947 sizeof (led32)) != 0) { 948 return (EFAULT); 949 } 950 } 951 #endif 952 return (0); 953 954 case B_EXEC_NATIVE: 955 return (exec_common((char *)arg1, (const char **)arg2, 956 (const char **)arg3, EBA_NATIVE)); 957 958 /* 959 * The B_TRUSS_POINT subcommand is used so that we can make a no-op 960 * syscall for debugging 
purposes (dtracing) from within the user-level 961 * emulation. 962 */ 963 case B_TRUSS_POINT: 964 return (0); 965 966 case B_LPID_TO_SPAIR: { 967 /* 968 * Given a Linux pid as arg1, return the Solaris pid in arg2 and 969 * the Solaris LWP in arg3. We also translate pid 1 (which is 970 * hardcoded in many applications) to the zone's init process. 971 */ 972 pid_t s_pid; 973 id_t s_tid; 974 975 if ((pid_t)arg1 == 1) { 976 s_pid = p->p_zone->zone_proc_initpid; 977 /* handle the dead/missing init(1M) case */ 978 if (s_pid == -1) 979 s_pid = 1; 980 s_tid = 1; 981 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { 982 return (ESRCH); 983 } 984 985 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || 986 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { 987 return (EFAULT); 988 } 989 990 return (0); 991 } 992 993 case B_SIGEV_THREAD_ID: { 994 /* 995 * Emulate Linux's timer_create(2) SIGEV_THREAD_ID 996 * notification method. This mechanism is only meant 997 * for userland threading libraries such as glibc and 998 * is documented as such. Therefore, assume this is 999 * only ever invoked for the purpose of alerting a 1000 * Linux threading library. Assume that the tid is a 1001 * member of the caller's process and the signal 1002 * number is valid. See lx_sigev_thread_id() for the 1003 * userland side of this emulation. 1004 * 1005 * The return code from this function is not checked 1006 * by the caller since it executes in an asynchronous 1007 * context and there is nothing much to be done. If 1008 * this function does fail then it will manifest as 1009 * Linux threads waiting for a signal they will never 1010 * receive. 
1011 * 1012 * arg1 -- Linux tid 1013 * arg2 -- signal number 1014 * arg3 -- union sigval 1015 */ 1016 1017 int native_sig = lx_stol_signo((int)arg2, LX_SIGTIMER); 1018 pid_t native_pid; 1019 int native_tid; 1020 sigqueue_t *sqp; 1021 1022 lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid); 1023 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); 1024 mutex_enter(&curproc->p_lock); 1025 1026 if ((t = idtot(curproc, native_tid)) == NULL) { 1027 mutex_exit(&curproc->p_lock); 1028 kmem_free(sqp, sizeof (sigqueue_t)); 1029 return (ESRCH); 1030 } 1031 1032 sqp->sq_info.si_signo = native_sig; 1033 sqp->sq_info.si_code = SI_TIMER; 1034 sqp->sq_info.si_pid = curproc->p_pid; 1035 sqp->sq_info.si_zoneid = getzoneid(); 1036 sqp->sq_info.si_uid = crgetruid(CRED()); 1037 sqp->sq_info.si_value = (union sigval)((void *)arg3); 1038 sigaddqa(curproc, t, sqp); 1039 1040 mutex_exit(&curproc->p_lock); 1041 kmem_free(sqp, sizeof (sigqueue_t)); 1042 1043 return (0); 1044 } 1045 1046 case B_SET_AFFINITY_MASK: 1047 case B_GET_AFFINITY_MASK: 1048 /* 1049 * Retrieve or store the CPU affinity mask for the 1050 * requested linux pid. 1051 * 1052 * arg1 is a linux PID (0 means curthread). 1053 * arg2 is the size of the given mask. 1054 * arg3 is the address of the affinity mask. 1055 */ 1056 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); 1057 1058 case B_PTRACE_STOP_FOR_OPT: 1059 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? 1060 B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); 1061 1062 case B_PTRACE_CLONE_BEGIN: 1063 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? 
1064 B_FALSE : B_TRUE)); 1065 1066 case B_PTRACE_KERNEL: 1067 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4)); 1068 1069 case B_HELPER_WAITID: { 1070 idtype_t idtype = (idtype_t)arg1; 1071 id_t id = (id_t)arg2; 1072 siginfo_t *infop = (siginfo_t *)arg3; 1073 int options = (int)arg4; 1074 1075 lwpd = ttolxlwp(curthread); 1076 1077 /* 1078 * Our brand-specific waitid helper only understands a subset of 1079 * the possible idtypes. Ensure we keep to that subset here: 1080 */ 1081 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { 1082 return (EINVAL); 1083 } 1084 1085 /* 1086 * Enable the return of emulated ptrace(2) stop conditions 1087 * through lx_waitid_helper, and stash the Linux-specific 1088 * extra waitid() flags. 1089 */ 1090 lwpd->br_waitid_emulate = B_TRUE; 1091 lwpd->br_waitid_flags = (int)arg5; 1092 1093 #if defined(_SYSCALL32_IMPL) 1094 if (get_udatamodel() != DATAMODEL_NATIVE) { 1095 return (waitsys32(idtype, id, infop, options)); 1096 } else 1097 #endif 1098 { 1099 return (waitsys(idtype, id, infop, options)); 1100 } 1101 1102 lwpd->br_waitid_emulate = B_FALSE; 1103 lwpd->br_waitid_flags = 0; 1104 1105 return (0); 1106 } 1107 1108 case B_UNSUPPORTED: { 1109 char dmsg[256]; 1110 1111 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { 1112 lx_print("Failed to copyin unsupported msg " 1113 "at 0x%p\n", (void *)arg1); 1114 return (EFAULT); 1115 } 1116 dmsg[255] = '\0'; 1117 lx_unsupported(dmsg); 1118 1119 lx_check_strict_failure(lwpd); 1120 1121 return (0); 1122 } 1123 1124 case B_STORE_ARGS: { 1125 /* 1126 * B_STORE_ARGS subcommand 1127 * arg1 = address of struct to be copied in 1128 * arg2 = size of the struct being copied in 1129 * arg3-arg6 ignored 1130 * rval = the amount of data copied. 
1131 */ 1132 void *buf; 1133 1134 /* only have upper limit because arg2 is unsigned */ 1135 if (arg2 > LX_BR_ARGS_SIZE_MAX) { 1136 return (EINVAL); 1137 } 1138 1139 buf = kmem_alloc(arg2, KM_SLEEP); 1140 if (copyin((void *)arg1, buf, arg2) != 0) { 1141 lx_print("Failed to copyin scall arg at 0x%p\n", 1142 (void *) arg1); 1143 kmem_free(buf, arg2); 1144 /* 1145 * Purposely not setting br_scall_args to NULL 1146 * to preserve data for debugging. 1147 */ 1148 return (EFAULT); 1149 } 1150 1151 if (lwpd->br_scall_args != NULL) { 1152 ASSERT(lwpd->br_args_size > 0); 1153 kmem_free(lwpd->br_scall_args, 1154 lwpd->br_args_size); 1155 } 1156 1157 lwpd->br_scall_args = buf; 1158 lwpd->br_args_size = arg2; 1159 *rval = arg2; 1160 return (0); 1161 } 1162 1163 case B_HELPER_CLONE: 1164 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, 1165 (void *)arg4)); 1166 1167 case B_HELPER_SETGROUPS: 1168 return (lx_helper_setgroups(arg1, (gid_t *)arg2)); 1169 1170 case B_HELPER_SIGQUEUE: 1171 return (lx_helper_rt_sigqueueinfo(arg1, arg2, 1172 (siginfo_t *)arg3)); 1173 1174 case B_HELPER_TGSIGQUEUE: 1175 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, 1176 (siginfo_t *)arg4)); 1177 1178 case B_SET_THUNK_PID: 1179 lwpd->br_lx_thunk_pid = arg1; 1180 return (0); 1181 1182 case B_GETPID: 1183 /* 1184 * The usermode clone(2) code needs to be able to call 1185 * lx_getpid() from native code: 1186 */ 1187 *rval = lx_getpid(); 1188 return (0); 1189 1190 case B_SET_NATIVE_STACK: 1191 /* 1192 * B_SET_NATIVE_STACK subcommand 1193 * arg1 = the base of the stack to use for emulation 1194 */ 1195 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 1196 lx_print("B_SET_NATIVE_STACK when stack was already " 1197 "set to %p\n", (void *)arg1); 1198 return (EEXIST); 1199 } 1200 1201 /* 1202 * We move from the PREINIT state, where we have no brand 1203 * emulation stack, to the INIT state. 
Here, we are still 1204 * running on what will become the BRAND stack, but are running 1205 * emulation (i.e. native) code. Once the initialisation 1206 * process for this thread has finished, we will jump to 1207 * brand-specific code, while moving to the BRAND mode. 1208 * 1209 * When a new LWP is created, lx_initlwp() will clear the 1210 * stack data. If that LWP is actually being duplicated 1211 * into a child process by fork(2), lx_forklwp() will copy 1212 * it so that the cloned thread will keep using the same 1213 * alternate stack. 1214 */ 1215 lwpd->br_ntv_stack = arg1; 1216 lwpd->br_stack_mode = LX_STACK_MODE_INIT; 1217 lx_lwp_set_native_stack_current(lwpd, arg1); 1218 1219 return (0); 1220 1221 case B_GET_CURRENT_CONTEXT: 1222 /* 1223 * B_GET_CURRENT_CONTEXT subcommand: 1224 * arg1 = address for pointer to current ucontext_t 1225 */ 1226 1227 #if defined(_SYSCALL32_IMPL) 1228 if (get_udatamodel() != DATAMODEL_NATIVE) { 1229 caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; 1230 1231 error = copyout(&addr, (void *)arg1, sizeof (addr)); 1232 } else 1233 #endif 1234 { 1235 error = copyout(&lwp->lwp_oldcontext, (void *)arg1, 1236 sizeof (lwp->lwp_oldcontext)); 1237 } 1238 1239 return (error != 0 ? EFAULT : 0); 1240 1241 case B_JUMP_TO_LINUX: 1242 /* 1243 * B_JUMP_TO_LINUX subcommand: 1244 * arg1 = ucontext_t pointer for jump state 1245 */ 1246 1247 if (arg1 == NULL) 1248 return (EINVAL); 1249 1250 switch (lwpd->br_stack_mode) { 1251 case LX_STACK_MODE_NATIVE: { 1252 struct regs *rp = lwptoregs(lwp); 1253 1254 /* 1255 * We are on the NATIVE stack, so we must preserve 1256 * the extent of that stack. The pointer will be 1257 * reset by a future setcontext(). 1258 */ 1259 lx_lwp_set_native_stack_current(lwpd, 1260 (uintptr_t)rp->r_sp); 1261 break; 1262 } 1263 1264 case LX_STACK_MODE_INIT: 1265 /* 1266 * The LWP is transitioning to Linux code for the first 1267 * time. 
1268 */ 1269 break; 1270 1271 case LX_STACK_MODE_PREINIT: 1272 /* 1273 * This LWP has not installed an alternate stack for 1274 * usermode emulation handling. 1275 */ 1276 return (ENOENT); 1277 1278 case LX_STACK_MODE_BRAND: 1279 /* 1280 * The LWP should not be on the BRAND stack. 1281 */ 1282 exit(CLD_KILLED, SIGSYS); 1283 return (0); 1284 } 1285 1286 /* 1287 * Transfer control to Linux: 1288 */ 1289 return (lx_runexe(lwp, (void *)arg1)); 1290 1291 case B_EMULATION_DONE: 1292 /* 1293 * B_EMULATION_DONE subcommand: 1294 * arg1 = ucontext_t * to restore 1295 * arg2 = system call number 1296 * arg3 = return code 1297 * arg4 = if operation failed, the errno value 1298 */ 1299 1300 /* 1301 * The first part of this operation is a setcontext() to 1302 * restore the register state to the copy we preserved 1303 * before vectoring to the usermode emulation routine. 1304 * If that fails, we return (hopefully) to the emulation 1305 * routine and it will handle the error. 1306 */ 1307 #if (_SYSCALL32_IMPL) 1308 if (get_udatamodel() != DATAMODEL_NATIVE) { 1309 error = getsetcontext32(SETCONTEXT, (void *)arg1); 1310 } else 1311 #endif 1312 { 1313 error = getsetcontext(SETCONTEXT, (void *)arg1); 1314 } 1315 1316 if (error != 0) { 1317 return (error); 1318 } 1319 1320 /* 1321 * The saved Linux context has been restored. We handle the 1322 * return value or errno with code common to the in-kernel 1323 * system call emulation. 
1324 */ 1325 if ((error = (int)arg4) != 0) { 1326 /* 1327 * lx_syscall_return() looks at the errno in the LWP, 1328 * so set it here: 1329 */ 1330 set_errno(error); 1331 } 1332 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); 1333 1334 return (0); 1335 1336 case B_EXIT_AS_SIG: 1337 code = CLD_KILLED; 1338 sig = (int)arg1; 1339 proc_is_exiting(p); 1340 if (exitlwps(1) != 0) { 1341 mutex_enter(&p->p_lock); 1342 lwp_exit(); 1343 } 1344 ttolwp(curthread)->lwp_cursig = sig; 1345 if (sig == SIGSEGV) { 1346 if (core(sig, 0) == 0) 1347 code = CLD_DUMPED; 1348 } 1349 exit(code, sig); 1350 /* NOTREACHED */ 1351 break; 1352 } 1353 1354 return (EINVAL); 1355 } 1356 1357 char * 1358 lx_get_zone_kern_version(zone_t *zone) 1359 { 1360 return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version); 1361 } 1362 1363 void 1364 lx_set_kern_version(zone_t *zone, char *vers) 1365 { 1366 lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; 1367 1368 (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX); 1369 } 1370 1371 /* 1372 * Compare linux kernel version to the one set for the zone. 1373 * Returns greater than 0 if zone version is higher, less than 0 if the zone 1374 * version is lower, and 0 if the version are equal. 1375 */ 1376 int 1377 lx_kern_version_cmp(zone_t *zone, const char *vers) 1378 { 1379 int zvers[3] = {0, 0, 0}; 1380 int cvers[3] = {0, 0, 0}; 1381 int i; 1382 1383 VERIFY(zone->zone_brand == &lx_brand); 1384 1385 (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0], 1386 &zvers[1], &zvers[2]); 1387 (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]); 1388 1389 for (i = 0; i < 3; i++) { 1390 if (zvers[i] > cvers[i]) { 1391 return (1); 1392 } else if (zvers[i] < cvers[i]) { 1393 return (-1); 1394 } 1395 } 1396 return (0); 1397 } 1398 1399 /* 1400 * Linux unconditionally removes the setuid and setgid bits when changing 1401 * file ownership. 
This brand hook overrides the illumos native behaviour, 1402 * which is based on the PRIV_FILE_SETID privilege. 1403 */ 1404 static int 1405 lx_setid_clear(vattr_t *vap, cred_t *cr) 1406 { 1407 if (S_ISDIR(vap->va_mode)) { 1408 return (0); 1409 } 1410 1411 if (vap->va_mode & S_ISUID) { 1412 vap->va_mask |= AT_MODE; 1413 vap->va_mode &= ~S_ISUID; 1414 } 1415 if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1416 vap->va_mask |= AT_MODE; 1417 vap->va_mode &= ~S_ISGID; 1418 } 1419 1420 return (0); 1421 } 1422 1423 /* 1424 * Copy the per-process brand data from a parent proc to a child. 1425 */ 1426 void 1427 lx_copy_procdata(proc_t *child, proc_t *parent) 1428 { 1429 lx_proc_data_t *cpd = child->p_brand_data; 1430 lx_proc_data_t *ppd = parent->p_brand_data; 1431 1432 VERIFY(parent->p_brand == &lx_brand); 1433 VERIFY(child->p_brand == &lx_brand); 1434 VERIFY(ppd != NULL); 1435 VERIFY(cpd != NULL); 1436 1437 *cpd = *ppd; 1438 1439 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY; 1440 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY; 1441 1442 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20; 1443 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20; 1444 1445 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY; 1446 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY; 1447 1448 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; 1449 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; 1450 } 1451 1452 #if defined(_LP64) 1453 static void 1454 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst) 1455 { 1456 bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident)); 1457 dst->e_type = src->e_type; 1458 dst->e_machine = src->e_machine; 1459 dst->e_version = src->e_version; 1460 dst->e_entry = src->e_entry; 1461 dst->e_phoff = src->e_phoff; 1462 dst->e_shoff = src->e_shoff; 1463 dst->e_flags = src->e_flags; 1464 dst->e_ehsize = src->e_ehsize; 1465 dst->e_phentsize = 
/*
 * Put the current process/LWP back the way it was before a failed exec
 * attempt: reinstall the saved exec environment with setexecenv() and
 * restore the three saved fields of the LWP's alternate signal stack.
 *
 * ep -- exec environment to reinstall
 * sp -- alternate signal stack settings to restore
 */
static void
restoreexecenv(struct execenv *ep, stack_t *sp)
{
	klwp_t *lwp = ttolwp(curthread);

	setexecenv(ep);
	lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
	lwp->lwp_sigaltstack.ss_size = sp->ss_size;
	lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
}

/* Native ELF exec routines, used below to exec the brand library itself. */
extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
    long *, int, caddr_t, cred_t *, int *);

extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
    long *, int, caddr_t, cred_t *, int *);

/*
 * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
 * binaries.
 *
 * The work is done in three stages:
 *   1. exec the native brand emulation library (elfexec()/elf32exec());
 *   2. map the Linux executable into the address space;
 *   3. if the executable names an interpreter, map that in as well.
 * The resulting addresses are recorded in the process's lx_elf_data_t and
 * in the brand aux vector entries.
 *
 * vp is the vnode of the Linux executable.  args->to_model selects the
 * 32-bit or 64-bit path; args->brandname, args->emulator and (on LP64)
 * args->maxstack are set here, and args->auxp_brand is the user address the
 * brand aux entries are copied out to.  The remaining arguments are passed
 * through to elfexec()/elf32exec(), with level incremented by one.
 *
 * Returns 0 on success.  On failure returns the error from lookupname(),
 * elfexec()/elf32exec() or mapexec_brand()/mapexec32_brand(), or EFAULT if
 * the aux vector copyout fails.  Failures before setexecenv() restore the
 * original exec environment and alternate signal stack.
 */
static int
lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
    struct intpdata *idata, int level, long *execsz, int setid,
    caddr_t exec_file, struct cred *cred, int *brand_action)
{
	int		error;
	vnode_t		*nvp;
	Ehdr		ehdr;
	Addr		uphdr_vaddr;
	intptr_t	voffset;
	char		*interp = NULL;
	uintptr_t	ldaddr = NULL;
	int		i;
	proc_t		*p = ttoproc(curthread);
	klwp_t		*lwp = ttolwp(curthread);
	struct execenv	env;
	struct execenv	origenv;
	stack_t		orig_sigaltstack;
	struct user	*up = PTOU(ttoproc(curthread));
	lx_elf_data_t	*edp;
	char		*lib_path = NULL;

	ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
	ASSERT(ttoproc(curthread)->p_brand_data != NULL);

	edp = &ttolxproc(curthread)->l_elf_data;

	/* Pick the brand library matching the target data model. */
	if (args->to_model == DATAMODEL_NATIVE) {
		lib_path = LX_LIB_PATH;
	}
#if defined(_LP64)
	else {
		lib_path = LX_LIB_PATH32;
	}
#endif

	/*
	 * Set the brandname and library name for the new process so that
	 * elfexec() puts them onto the stack.
	 */
	args->brandname = LX_BRANDNAME;
	args->emulator = lib_path;

#if defined(_LP64)
	/*
	 * To conform with the way Linux lays out the address space, we clamp
	 * the stack to be the top of the lower region of the x86-64 canonical
	 * form address space -- which has the side-effect of laying out the
	 * entire address space in that lower region.  Note that this only
	 * matters on 64-bit processes (this value will always be greater than
	 * the size of a 32-bit address space) and doesn't actually affect
	 * USERLIMIT:  if a Linux-branded process wishes to map something
	 * into the top half of the address space, it can do so -- but with
	 * the user stack starting at the top of the bottom region, those high
	 * virtual addresses won't be used unless explicitly directed.
	 */
	args->maxstack = lx_maxstack64;
#endif

	/*
	 * We will first exec the brand library, then map in the linux
	 * executable and the linux linker.
	 */
	if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
	    &nvp))) {
		uprintf("%s: not found.", lib_path);
		return (error);
	}

	/*
	 * We will eventually set the p_exec member to be the vnode for the new
	 * executable when we call setexecenv(). However, if we get an error
	 * before that call we need to restore the execenv to its original
	 * values so that when we return to the caller fop_close() works
	 * properly while cleaning up from the failed exec().  Restoring the
	 * original value will also properly decrement the 2nd VN_RELE that we
	 * took on the brand library.
	 */
	origenv.ex_bssbase = p->p_bssbase;
	origenv.ex_brkbase = p->p_brkbase;
	origenv.ex_brksize = p->p_brksize;
	origenv.ex_vp = p->p_exec;
	orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
	orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
	orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;

	if (args->to_model == DATAMODEL_NATIVE) {
		error = elfexec(nvp, uap, args, idata, level + 1, execsz,
		    setid, exec_file, cred, brand_action);
	}
#if defined(_LP64)
	else {
		error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
		    setid, exec_file, cred, brand_action);
	}
#endif
	/* Drop the hold from lookupname() whether or not the exec worked. */
	VN_RELE(nvp);
	if (error != 0) {
		restoreexecenv(&origenv, &orig_sigaltstack);
		return (error);
	}

	/*
	 * exec-ed in the brand library above.
	 * The u_auxv vectors are now setup by elfexec to point to the
	 * brand emulation library and its linker.
	 */

	bzero(&env, sizeof (env));

	/*
	 * map in the Linux executable
	 */
	if (args->to_model == DATAMODEL_NATIVE) {
		error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
		    &voffset, exec_file, &interp, &env.ex_bssbase,
		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
	}
#if defined(_LP64)
	else {
		Elf32_Ehdr	ehdr32;
		Elf32_Addr	uphdr_vaddr32;

		error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
		    &voffset, exec_file, &interp, &env.ex_bssbase,
		    &env.ex_brkbase, &env.ex_brksize, NULL, NULL);

		/* Widen the 32-bit results so the code below is common. */
		Ehdr32to64(&ehdr32, &ehdr);

		/* Preserve the "-1" sentinel across the width change. */
		if (uphdr_vaddr32 == (Elf32_Addr)-1)
			uphdr_vaddr = (Addr)-1;
		else
			uphdr_vaddr = uphdr_vaddr32;
	}
#endif
	if (error != 0) {
		restoreexecenv(&origenv, &orig_sigaltstack);

		/* The interpreter path buffer is ours to free. */
		if (interp != NULL)
			kmem_free(interp, MAXPATHLEN);

		return (error);
	}

	/*
	 * Save off the important properties of the lx executable. The brand
	 * library will ask us for this data later, when it is ready to set
	 * things up for the lx executable.
	 */
	edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
	    voffset + uphdr_vaddr;
	edp->ed_entry = voffset + ehdr.e_entry;
	edp->ed_phent = ehdr.e_phentsize;
	edp->ed_phnum = ehdr.e_phnum;

	if (interp != NULL) {
		if (ehdr.e_type == ET_DYN) {
			/*
			 * This is a shared object executable, so we need to
			 * pick a reasonable place to put the heap. Just don't
			 * use the first page.
			 */
			env.ex_brkbase = (caddr_t)PAGESIZE;
			env.ex_bssbase = (caddr_t)PAGESIZE;
		}

		/*
		 * If the program needs an interpreter (most do), map it in and
		 * store relevant information about it in the aux vector, where
		 * the brand library can find it.
		 */
		if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
		    NULLVPP, &nvp))) {
			uprintf("%s: not found.", interp);
			restoreexecenv(&origenv, &orig_sigaltstack);
			kmem_free(interp, MAXPATHLEN);
			return (error);
		}

		/* The path is no longer needed once we hold the vnode. */
		kmem_free(interp, MAXPATHLEN);
		interp = NULL;

		/*
		 * map in the Linux linker
		 */
		if (args->to_model == DATAMODEL_NATIVE) {
			error = mapexec_brand(nvp, args, &ehdr,
			    &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
			    NULL, NULL, NULL, &ldaddr);
		}
#if defined(_LP64)
		else {
			Elf32_Ehdr	ehdr32;
			Elf32_Addr	uphdr_vaddr32;

			error = mapexec32_brand(nvp, args, &ehdr32,
			    &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
			    NULL, NULL, NULL, &ldaddr);

			Ehdr32to64(&ehdr32, &ehdr);

			if (uphdr_vaddr32 == (Elf32_Addr)-1)
				uphdr_vaddr = (Addr)-1;
			else
				uphdr_vaddr = uphdr_vaddr32;
		}
#endif

		VN_RELE(nvp);
		if (error != 0) {
			restoreexecenv(&origenv, &orig_sigaltstack);
			return (error);
		}

		/*
		 * Now that we know the base address of the brand's linker,
		 * we also save this for later use by the brand library.
		 */
		edp->ed_base = voffset;
		edp->ed_ldentry = voffset + ehdr.e_entry;
	} else {
		/*
		 * This program has no interpreter. The lx brand library will
		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
		 * so in this case, put the entry point of the main executable
		 * there.
		 */
		if (ehdr.e_type == ET_EXEC) {
			/*
			 * An executable with no interpreter, this must be a
			 * statically linked executable, which means we loaded
			 * it at the address specified in the elf header, in
			 * which case the e_entry field of the elf header is an
			 * absolute address.
			 */
			edp->ed_ldentry = ehdr.e_entry;
			edp->ed_entry = ehdr.e_entry;
		} else {
			/*
			 * A shared object with no interpreter, we use the
			 * calculated address from above.
			 */
			edp->ed_ldentry = edp->ed_entry;

			/*
			 * In all situations except an ET_DYN elf object with no
			 * interpreter, we want to leave the brk and base
			 * values set by mapexec_brand alone. Normally when
			 * running ET_DYN objects on Solaris (most likely
			 * /lib/ld.so.1) the kernel sets brk and base to 0 since
			 * it doesn't know where to put the heap, and later the
			 * linker will call brk() to initialize the heap in:
			 *	usr/src/cmd/sgs/rtld/common/setup.c:setup()
			 * after it has determined where to put it.  (This
			 * decision is made after the linker loads and inspects
			 * elf properties of the target executable being run.)
			 *
			 * So for ET_DYN Linux executables, we also don't know
			 * where the heap should go, so we'll set the brk and
			 * base to 0.  But in this case the Solaris linker will
			 * not initialize the heap, so when the Linux linker
			 * starts running there is no heap allocated.  This
			 * seems to be ok on Linux 2.4 based systems because the
			 * Linux linker/libc fall back to using mmap() to
			 * allocate memory. But on 2.6 systems, running
			 * applications by specifying them as command line
			 * arguments to the linker results in segfaults for an
			 * as yet undetermined reason (which seems to indicate
			 * that a more permanent fix for heap initialization in
			 * these cases may be necessary).
			 */
			if (ehdr.e_type == ET_DYN) {
				env.ex_bssbase = (caddr_t)0;
				env.ex_brkbase = (caddr_t)0;
				env.ex_brksize = 0;
			}
		}

	}

	/*
	 * From here on the new exec environment is installed, so the error
	 * paths below no longer call restoreexecenv().
	 */
	env.ex_vp = vp;
	setexecenv(&env);

	/*
	 * We try to keep /proc's view of the aux vector consistent with
	 * what's on the process stack.
	 */
	if (args->to_model == DATAMODEL_NATIVE) {
		auxv_t phdr_auxv[4] = {
		    { AT_SUN_BRAND_LX_PHDR, 0 },
		    { AT_SUN_BRAND_LX_INTERP, 0 },
		    { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
		    { AT_SUN_BRAND_AUX4, 0 }
		};
		phdr_auxv[0].a_un.a_val = edp->ed_phdr;
		phdr_auxv[1].a_un.a_val = ldaddr;
		phdr_auxv[2].a_un.a_val = 1;	/* set in lx_init */
		/* The fourth brand slot is retyped to carry AT_CLKTCK. */
		phdr_auxv[3].a_type = AT_CLKTCK;
		phdr_auxv[3].a_un.a_val = hz;

		if (copyout(&phdr_auxv, args->auxp_brand,
		    sizeof (phdr_auxv)) == -1)
			return (EFAULT);
	}
#if defined(_LP64)
	else {
		auxv32_t phdr_auxv32[3] = {
		    { AT_SUN_BRAND_LX_PHDR, 0 },
		    { AT_SUN_BRAND_LX_INTERP, 0 },
		    { AT_SUN_BRAND_AUX3, 0 }
		};
		phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
		phdr_auxv32[1].a_un.a_val = ldaddr;
		/* The third brand slot is retyped to carry AT_CLKTCK. */
		phdr_auxv32[2].a_type = AT_CLKTCK;
		phdr_auxv32[2].a_un.a_val = hz;

		if (copyout(&phdr_auxv32, args->auxp_brand,
		    sizeof (phdr_auxv32)) == -1)
			return (EFAULT);
	}
#endif

	/*
	 * /proc uses the AT_ENTRY aux vector entry to deduce
	 * the location of the executable in the address space. The user
	 * structure contains a copy of the aux vector that needs to have those
	 * entries patched with the values of the real lx executable (they
	 * currently contain the values from the lx brand library that was
	 * elfexec'd, above).
	 *
	 * For live processes, AT_BASE is used to locate the linker segment,
	 * which /proc and friends will later use to find Solaris symbols
	 * (such as rtld_db_preinit). However, for core files, /proc uses
	 * AT_ENTRY to find the right segment to label as the executable.
	 * So we set AT_ENTRY to be the entry point of the linux executable,
	 * but leave AT_BASE to be the address of the Solaris linker.
	 */
	for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
		switch (up->u_auxv[i].a_type) {
		case AT_ENTRY:
			up->u_auxv[i].a_un.a_val = edp->ed_entry;
			break;

		case AT_SUN_BRAND_LX_PHDR:
			up->u_auxv[i].a_un.a_val = edp->ed_phdr;
			break;

		case AT_SUN_BRAND_LX_INTERP:
			up->u_auxv[i].a_un.a_val = ldaddr;
			break;

		default:
			break;
		}
	}

	return (0);
}

/*
 * Decide whether an ELF object with the given OS ABI is a native (illumos)
 * executable.  Returns B_FALSE, leaving *interp untouched, unless osabi is
 * ELFOSABI_SOLARIS.  Otherwise returns B_TRUE, and may set *interp to the
 * "/native" prefix as described below.
 */
boolean_t
lx_native_exec(uint8_t osabi, const char **interp)
{
	if (osabi != ELFOSABI_SOLARIS)
		return (B_FALSE);

	/*
	 * If the process root matches the zone root, prepend /native to the
	 * interpreter path for native executables.  Absolute precision from
	 * VN_CMP is not necessary since any change of process root is likely
	 * to make native binaries inaccessible via /native.
	 *
	 * Processes which chroot directly into /native will be able to
	 * function as expected with no need for the prefix.
	 */
	if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
		*interp = "/native";
	}

	return (B_TRUE);
}
Note that lx_sysent32 1882 * has (LX_NSYSCALLS + 1) entries. 1883 */ 1884 for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) 1885 continue; 1886 lx_nsysent32 = i; 1887 1888 #if defined(_LP64) 1889 /* 1890 * Count up the 64-bit Linux system calls. Note that lx_sysent64 1891 * has (LX_NSYSCALLS + 1) entries. 1892 */ 1893 for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) 1894 continue; 1895 lx_nsysent64 = i; 1896 #endif 1897 } 1898 1899 int 1900 _init(void) 1901 { 1902 int err = 0; 1903 1904 lx_syscall_init(); 1905 lx_pid_init(); 1906 lx_ioctl_init(); 1907 lx_futex_init(); 1908 lx_ptrace_init(); 1909 lx_socket_init(); 1910 1911 err = mod_install(&modlinkage); 1912 if (err != 0) { 1913 cmn_err(CE_WARN, "Couldn't install lx brand module"); 1914 1915 /* 1916 * This looks drastic, but it should never happen. These 1917 * two data structures should be completely free-able until 1918 * they are used by Linux processes. Since the brand 1919 * wasn't loaded there should be no Linux processes, and 1920 * thus no way for these data structures to be modified. 1921 */ 1922 lx_pid_fini(); 1923 lx_ioctl_fini(); 1924 if (lx_futex_fini()) 1925 panic("lx brand module cannot be loaded or unloaded."); 1926 } 1927 return (err); 1928 } 1929 1930 int 1931 _info(struct modinfo *modinfop) 1932 { 1933 return (mod_info(&modlinkage, modinfop)); 1934 } 1935 1936 int 1937 _fini(void) 1938 { 1939 int err; 1940 int futex_done = 0; 1941 1942 /* 1943 * If there are any zones using this brand, we can't allow it to be 1944 * unloaded. 1945 */ 1946 if (brand_zone_count(&lx_brand)) 1947 return (EBUSY); 1948 1949 lx_ptrace_fini(); 1950 lx_pid_fini(); 1951 lx_ioctl_fini(); 1952 lx_socket_fini(); 1953 1954 if ((err = lx_futex_fini()) != 0) { 1955 goto done; 1956 } 1957 futex_done = 1; 1958 1959 err = mod_remove(&modlinkage); 1960 1961 done: 1962 if (err) { 1963 /* 1964 * If we can't unload the module, then we have to get it 1965 * back into a sane state. 
1966 */ 1967 lx_ptrace_init(); 1968 lx_pid_init(); 1969 lx_ioctl_init(); 1970 lx_socket_init(); 1971 1972 if (futex_done) { 1973 lx_futex_init(); 1974 } 1975 } 1976 1977 return (err); 1978 }