1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Copyright 2015, Joyent, Inc. All rights reserved. 29 */ 30 31 /* 32 * The LX Brand: emulation of a Linux operating environment within a zone. 33 * 34 * OVERVIEW 35 * 36 * The LX brand enables a full Linux userland -- including a C library, 37 * init(1) framework, and some set of applications -- to run unmodified 38 * within an illumos zone. Unlike illumos, where applications are expected 39 * to link against and consume functions exported from libraries, the 40 * supported Linux binary compatibility boundary is the system call 41 * interface. By accurately emulating the behaviour of Linux system calls, 42 * Linux software can be executed in this environment as if it were running 43 * on a native Linux system. 44 * 45 * EMULATING LINUX SYSTEM CALLS 46 * 47 * Linux system calls are made in 32-bit processes via the "int 0x80" 48 * instruction; in 64-bit processes the "syscall" instruction is used, as it 49 * is with native illumos processes. 
In both cases, arguments to system 50 * calls are generally passed in registers and the usermode stack is not 51 * interpreted or modified by the Linux kernel. 52 * 53 * When the emulated Linux process makes a system call, it traps into the 54 * illumos kernel. The in-kernel brand module contains various emulation 55 * routines, and can fully service some emulated system calls; e.g. read(2) 56 * and write(2). Other system calls require assistance from the illumos 57 * libc, bouncing back out to the brand library ("lx_brand.so.1") for 58 * emulation. 59 * 60 * The brand mechanism allows for the provision of an alternative trap 61 * handler for the various system call mechanisms. Traditionally this was 62 * used to immediately revector execution to the usermode emulation library, 63 * which was responsible for handling all system calls. In the interests of 64 * more accurate emulation and increased performance, much of the regular 65 * illumos system call path is now invoked. Only the argument processing and 66 * handler dispatch are replaced by the brand, via the per-LWP 67 * "lwp_brand_syscall" interposition function pointer. 68 * 69 * THE NATIVE AND BRAND STACKS 70 * 71 * Some runtime environments (e.g. the Go language) allocate very small 72 * thread stacks, preferring to grow or split the stack as necessary. The 73 * Linux kernel generally does not use the usermode stack when servicing 74 * system calls, so this is not a problem. In order for our emulation to 75 * have the same zero stack impact, we must execute usermode emulation 76 * routines on an _alternate_ stack. This is similar, in principle, to the 77 * use of sigaltstack(3C) to run signal handlers off the main thread stack. 78 * 79 * To this end, the brand library allocates and installs an alternate stack 80 * (called the "native" stack) for each LWP. 
The in-kernel brand code uses 81 * this stack for usermode emulation calls and interposed signal delivery, 82 * while the emulated Linux process sees only the data on the main thread 83 * stack, known as the "brand" stack. The stack mode is tracked in the 84 * per-LWP brand-private data, using the LX_STACK_MODE_* enum. 85 * 86 * The stack mode doubles as a system call "mode bit". When in the 87 * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux 88 * system calls. In other modes, system calls are assumed to be native 89 * illumos system calls as made during brand library initialisation and 90 * usermode emulation. 91 * 92 * USERMODE EMULATION 93 * 94 * When a Linux system call cannot be emulated within the kernel, we preserve 95 * the register state of the Linux process and revector the LWP to the brand 96 * library usermode emulation handler: the "lx_emulate()" function in 97 * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, 98 * and is performed in "lx_emulate_user()". 99 * 100 * First, the emulated process state is written out to the usermode stack of 101 * the process as a "ucontext_t" object. Arguments to the emulation routine 102 * are passed on the stack or in registers, depending on the ABI. When the 103 * usermode emulation is complete, the result is passed back to the kernel 104 * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context 105 * for restoration. 106 * 107 * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT 108 * 109 * When servicing emulated system calls in the usermode brand library, or 110 * during signal delivery, various state is preserved by the kernel so that 111 * the running LWP may be revectored to a handling routine. The context 112 * allows the kernel to restart the program at the point of interruption, 113 * either at the return of the signal handler, via setcontext(3C); or after 114 * the usermode emulation request has been serviced, via B_EMULATION_DONE. 
115 * 116 * In illumos native processes, the saved context (a "ucontext_t" object) 117 * includes the state of registers and the current signal mask at the point 118 * of interruption. The context also includes a link to the most recently 119 * saved context, forming a chain to be unwound as requests complete. The LX 120 * brand requires additional book-keeping to describe the machine state: in 121 * particular, the current stack mode and the occupied extent of the native 122 * stack. 123 * 124 * The brand code is able to interpose on the context save and restore 125 * operations in the kernel -- see "lx_savecontext()" and 126 * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to 127 * function correctly in the face of a dual stack LWP. The brand also 128 * interposes on the signal delivery mechanism -- see "lx_sendsig()" and 129 * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand 130 * library interposer on the native stack, regardless of the interrupted 131 * execution mode. Linux sigaltstack(2) emulation is performed entirely by 132 * the usermode brand library during signal handler interposition. 
133 */ 134 135 #include <sys/types.h> 136 #include <sys/kmem.h> 137 #include <sys/errno.h> 138 #include <sys/thread.h> 139 #include <sys/systm.h> 140 #include <sys/syscall.h> 141 #include <sys/proc.h> 142 #include <sys/modctl.h> 143 #include <sys/cmn_err.h> 144 #include <sys/model.h> 145 #include <sys/exec.h> 146 #include <sys/lx_impl.h> 147 #include <sys/machbrand.h> 148 #include <sys/lx_syscalls.h> 149 #include <sys/lx_misc.h> 150 #include <sys/lx_futex.h> 151 #include <sys/lx_brand.h> 152 #include <sys/param.h> 153 #include <sys/termios.h> 154 #include <sys/sunddi.h> 155 #include <sys/ddi.h> 156 #include <sys/vnode.h> 157 #include <sys/pathname.h> 158 #include <sys/auxv.h> 159 #include <sys/priv.h> 160 #include <sys/regset.h> 161 #include <sys/privregs.h> 162 #include <sys/archsystm.h> 163 #include <sys/zone.h> 164 #include <sys/brand.h> 165 #include <sys/sdt.h> 166 #include <sys/x86_archext.h> 167 #include <sys/controlregs.h> 168 #include <sys/core.h> 169 #include <sys/stack.h> 170 #include <sys/stat.h> 171 #include <sys/socket.h> 172 #include <lx_signum.h> 173 #include <util/sscanf.h> 174 175 int lx_debug = 0; 176 177 void lx_init_brand_data(zone_t *); 178 void lx_free_brand_data(zone_t *); 179 void lx_setbrand(proc_t *); 180 int lx_getattr(zone_t *, int, void *, size_t *); 181 int lx_setattr(zone_t *, int, void *, size_t); 182 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, 183 uintptr_t, uintptr_t); 184 void lx_set_kern_version(zone_t *, char *); 185 void lx_copy_procdata(proc_t *, proc_t *); 186 187 extern int getsetcontext(int, void *); 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int); 189 #if defined(_SYSCALL32_IMPL) 190 extern int getsetcontext32(int, void *); 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int); 192 #endif 193 194 extern void lx_proc_exit(proc_t *); 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); 196 197 extern void lx_ioctl_init(); 198 extern void lx_ioctl_fini(); 199 extern 
void lx_socket_init(); 200 extern void lx_socket_fini(); 201 202 lx_systrace_f *lx_systrace_entry_ptr; 203 lx_systrace_f *lx_systrace_return_ptr; 204 205 static int lx_systrace_enabled; 206 207 /* 208 * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly 209 * want an MMU dependency here (and should there be a microprocessor without 210 * a hole, we don't want to start allocating from the top of the VA range). 211 */ 212 #define LX_MAXSTACK64 0x7ffffff00000 213 214 uint64_t lx_maxstack64 = LX_MAXSTACK64; 215 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 217 struct intpdata *idata, int level, long *execsz, int setid, 218 caddr_t exec_file, struct cred *cred, int *brand_action); 219 220 static boolean_t lx_native_exec(uint8_t, const char **); 221 static uint32_t lx_map32limit(proc_t *); 222 223 static void lx_savecontext(ucontext_t *); 224 static void lx_restorecontext(ucontext_t *); 225 static caddr_t lx_sendsig_stack(int); 226 static void lx_sendsig(int); 227 #if defined(_SYSCALL32_IMPL) 228 static void lx_savecontext32(ucontext32_t *); 229 #endif 230 static int lx_setid_clear(vattr_t *, cred_t *); 231 #if defined(_LP64) 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, 233 enum seg_rw); 234 #endif 235 236 237 /* lx brand */ 238 struct brand_ops lx_brops = { 239 lx_init_brand_data, /* b_init_brand_data */ 240 lx_free_brand_data, /* b_free_brand_data */ 241 lx_brandsys, /* b_brandsys */ 242 lx_setbrand, /* b_setbrand */ 243 lx_getattr, /* b_getattr */ 244 lx_setattr, /* b_setattr */ 245 lx_copy_procdata, /* b_copy_procdata */ 246 lx_proc_exit, /* b_proc_exit */ 247 lx_exec, /* b_exec */ 248 lx_setrval, /* b_lwp_setrval */ 249 lx_lwpdata_alloc, /* b_lwpdata_alloc */ 250 lx_lwpdata_free, /* b_lwpdata_free */ 251 lx_initlwp, /* b_initlwp */ 252 lx_forklwp, /* b_forklwp */ 253 lx_freelwp, /* b_freelwp */ 254 lx_exitlwp, /* b_lwpexit */ 255 lx_elfexec, /* b_elfexec */ 256 NULL, /* 
b_sigset_native_to_brand */ 257 NULL, /* b_sigset_brand_to_native */ 258 lx_sigfd_translate, /* b_sigfd_translate */ 259 NSIG, /* b_nsig */ 260 lx_exit_with_sig, /* b_exit_with_sig */ 261 lx_wait_filter, /* b_wait_filter */ 262 lx_native_exec, /* b_native_exec */ 263 lx_map32limit, /* b_map32limit */ 264 lx_stop_notify, /* b_stop_notify */ 265 lx_waitid_helper, /* b_waitid_helper */ 266 lx_sigcld_repost, /* b_sigcld_repost */ 267 lx_ptrace_issig_stop, /* b_issig_stop */ 268 lx_ptrace_sig_ignorable, /* b_sig_ignorable */ 269 lx_savecontext, /* b_savecontext */ 270 #if defined(_SYSCALL32_IMPL) 271 lx_savecontext32, /* b_savecontext32 */ 272 #endif 273 lx_restorecontext, /* b_restorecontext */ 274 lx_sendsig_stack, /* b_sendsig_stack */ 275 lx_sendsig, /* b_sendsig */ 276 lx_setid_clear, /* b_setid_clear */ 277 #if defined(_LP64) 278 lx_pagefault /* b_pagefault */ 279 #else 280 NULL 281 #endif 282 }; 283 284 struct brand_mach_ops lx_mops = { 285 NULL, 286 NULL, 287 NULL, 288 NULL, 289 NULL, 290 lx_fixsegreg, 291 lx_fsbase 292 }; 293 294 struct brand lx_brand = { 295 BRAND_VER_1, 296 "lx", 297 &lx_brops, 298 &lx_mops, 299 sizeof (struct lx_proc_data) 300 }; 301 302 static struct modlbrand modlbrand = { 303 &mod_brandops, "lx brand", &lx_brand 304 }; 305 306 static struct modlinkage modlinkage = { 307 MODREV_1, (void *)&modlbrand, NULL 308 }; 309 310 void 311 lx_proc_exit(proc_t *p) 312 { 313 lx_proc_data_t *lxpd; 314 proc_t *cp; 315 316 mutex_enter(&p->p_lock); 317 VERIFY(lxpd = ptolxproc(p)); 318 if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) { 319 mutex_exit(&p->p_lock); 320 return; 321 } 322 mutex_exit(&p->p_lock); 323 324 /* Check for children which desire notification of parental death. 
*/ 325 mutex_enter(&pidlock); 326 for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) { 327 mutex_enter(&cp->p_lock); 328 if ((lxpd = ptolxproc(cp)) == NULL) { 329 mutex_exit(&cp->p_lock); 330 continue; 331 } 332 if (lxpd->l_parent_deathsig != 0) { 333 sigtoproc(p, NULL, lxpd->l_parent_deathsig); 334 } 335 mutex_exit(&cp->p_lock); 336 } 337 mutex_exit(&pidlock); 338 } 339 340 void 341 lx_setbrand(proc_t *p) 342 { 343 /* Send SIGCHLD to parent by default when child exits */ 344 ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; 345 } 346 347 /* ARGSUSED */ 348 int 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) 350 { 351 char vers[LX_VERS_MAX]; 352 353 if (attr == LX_KERN_VERSION_NUM) { 354 if (bufsize > (LX_VERS_MAX - 1)) 355 return (ERANGE); 356 bzero(vers, LX_VERS_MAX); 357 if (copyin(buf, &vers, bufsize) != 0) 358 return (EFAULT); 359 lx_set_kern_version(zone, vers); 360 return (0); 361 } 362 return (EINVAL); 363 } 364 365 /* ARGSUSED */ 366 int 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) 368 { 369 if (attr == LX_KERN_VERSION_NUM) { 370 if (*bufsize < LX_VERS_MAX) 371 return (ERANGE); 372 if (copyout(lx_get_zone_kern_version(curzone), buf, 373 LX_VERS_MAX) != 0) 374 return (EFAULT); 375 *bufsize = LX_VERS_MAX; 376 return (0); 377 } 378 return (-EINVAL); 379 } 380 381 uint32_t 382 lx_map32limit(proc_t *p) 383 { 384 /* 385 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only 386 * allow mappings in the first 31 bits. This was a nuance in the 387 * original Linux implementation circa 2002, and applications have 388 * come to depend on its behavior. 389 * 390 * This is only relevant for 64-bit processes. 
391 */ 392 if (p->p_model == DATAMODEL_LP64) 393 return (1 << 31); 394 395 return ((uint32_t)USERLIMIT32); 396 } 397 398 void 399 lx_brand_systrace_enable(void) 400 { 401 VERIFY(!lx_systrace_enabled); 402 403 lx_systrace_enabled = 1; 404 } 405 406 void 407 lx_brand_systrace_disable(void) 408 { 409 VERIFY(lx_systrace_enabled); 410 411 lx_systrace_enabled = 0; 412 } 413 414 void 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) 416 { 417 VERIFY(lwpd->br_ntv_stack != 0); 418 419 /* 420 * The "brand-lx-set-ntv-stack-current" probe has arguments: 421 * arg0: stack pointer before change 422 * arg1: stack pointer after change 423 * arg2: current stack base 424 */ 425 DTRACE_PROBE3(brand__lx__set__ntv__stack__current, 426 uintptr_t, lwpd->br_ntv_stack_current, 427 uintptr_t, new_sp, 428 uintptr_t, lwpd->br_ntv_stack); 429 430 lwpd->br_ntv_stack_current = new_sp; 431 } 432 433 #if defined(_LP64) 434 static int 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, 436 enum seg_rw rw) 437 { 438 int syscall_num; 439 440 /* 441 * We only want to handle a very specific set of circumstances. 442 * Namely: this is a 64-bit LX-branded process attempting to execute an 443 * address in a page for which it does not have a valid mapping. If 444 * this is not the case, we bail out as fast as possible. 445 */ 446 VERIFY(PROC_IS_BRANDED(p)); 447 if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != 448 DATAMODEL_NATIVE) { 449 return (-1); 450 } 451 452 if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { 453 return (-1); 454 } 455 456 /* 457 * This is a valid vsyscall address. We service the system call and 458 * return 0 to signal that the pagefault has been handled completely. 
459 */ 460 lx_vsyscall_enter(p, lwp, syscall_num); 461 return (0); 462 } 463 #endif 464 465 /* 466 * This hook runs prior to sendsig() processing and allows us to nominate 467 * an alternative stack pointer for delivery of the signal handling frame. 468 * Critically, this routine should _not_ modify any LWP state as the 469 * savecontext() does not run until after this hook. 470 */ 471 static caddr_t 472 lx_sendsig_stack(int sig) 473 { 474 klwp_t *lwp = ttolwp(curthread); 475 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 476 477 /* 478 * We want to take signal delivery on the native stack, but only if 479 * one has been allocated and installed for this LWP. 480 */ 481 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 482 /* 483 * The program is not running on the native stack. Return 484 * the native stack pointer from our brand-private data so 485 * that we may switch to it for signal handling. 486 */ 487 return ((caddr_t)lwpd->br_ntv_stack_current); 488 } else { 489 struct regs *rp = lwptoregs(lwp); 490 491 /* 492 * Either the program is already running on the native stack, 493 * or one has not yet been allocated for this LWP. Use the 494 * current stack pointer value. 495 */ 496 return ((caddr_t)rp->r_sp); 497 } 498 } 499 500 /* 501 * This hook runs after sendsig() processing and allows us to update the 502 * per-LWP mode flags for system calls and stacks. The pre-signal 503 * context has already been saved and delivered to the user at this point. 504 */ 505 static void 506 lx_sendsig(int sig) 507 { 508 klwp_t *lwp = ttolwp(curthread); 509 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 510 struct regs *rp = lwptoregs(lwp); 511 512 switch (lwpd->br_stack_mode) { 513 case LX_STACK_MODE_BRAND: 514 case LX_STACK_MODE_NATIVE: 515 /* 516 * In lx_sendsig_stack(), we nominated a stack pointer from the 517 * native stack. 
Update the stack mode, and the current in-use 518 * extent of the native stack, accordingly: 519 */ 520 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 521 lx_lwp_set_native_stack_current(lwpd, rp->r_sp); 522 523 /* 524 * Fix up segment registers, etc. 525 */ 526 lx_switch_to_native(lwp); 527 break; 528 529 default: 530 /* 531 * Otherwise, the brand library has not yet installed the 532 * alternate stack for this LWP. Signals will be handled on 533 * the regular stack thread. 534 */ 535 return; 536 } 537 } 538 539 /* 540 * This hook runs prior to the context restoration, allowing us to take action 541 * or modify the context before it is loaded. 542 */ 543 static void 544 lx_restorecontext(ucontext_t *ucp) 545 { 546 klwp_t *lwp = ttolwp(curthread); 547 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 548 uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; 549 caddr_t sp = ucp->uc_brand_data[1]; 550 551 /* 552 * We have a saved native stack pointer value that we must restore 553 * into the per-LWP data. 554 */ 555 if (flags & LX_UC_RESTORE_NATIVE_SP) { 556 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); 557 } 558 559 /* 560 * We do not wish to restore the value of uc_link in this context, 561 * so replace it with the value currently in the LWP. 562 */ 563 if (flags & LX_UC_IGNORE_LINK) { 564 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; 565 } 566 567 /* 568 * Restore the stack mode: 569 */ 570 if (flags & LX_UC_STACK_NATIVE) { 571 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; 572 } else if (flags & LX_UC_STACK_BRAND) { 573 lwpd->br_stack_mode = LX_STACK_MODE_BRAND; 574 } 575 576 #if defined(__amd64) 577 /* 578 * Override the fs/gsbase in the context with the value provided 579 * through the Linux arch_prctl(2) system call. 
580 */ 581 if (flags & LX_UC_STACK_BRAND) { 582 if (lwpd->br_lx_fsbase != 0) { 583 ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; 584 } 585 if (lwpd->br_lx_gsbase != 0) { 586 ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; 587 } 588 } 589 #endif 590 } 591 592 static void 593 lx_savecontext(ucontext_t *ucp) 594 { 595 klwp_t *lwp = ttolwp(curthread); 596 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 597 uintptr_t flags = 0; 598 599 /* 600 * The ucontext_t affords us three private pointer-sized members in 601 * "uc_brand_data". We pack a variety of flags into the first element, 602 * and an optional stack pointer in the second element. The flags 603 * determine which stack pointer (native or brand), if any, is stored 604 * in the second element. The third element may contain the system 605 * call number; this is analogous to the "orig_[er]ax" member of a 606 * Linux "user_regs_struct". 607 */ 608 609 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 610 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 611 /* 612 * Record the value of the native stack pointer to restore 613 * when returning to this branded context: 614 */ 615 flags |= LX_UC_RESTORE_NATIVE_SP; 616 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; 617 } 618 619 /* 620 * Save the stack mode: 621 */ 622 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 623 flags |= LX_UC_STACK_NATIVE; 624 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 625 flags |= LX_UC_STACK_BRAND; 626 } 627 628 /* 629 * If we might need to restart this system call, save that information 630 * in the context: 631 */ 632 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 633 ucp->uc_brand_data[2] = 634 (void *)(uintptr_t)lwpd->br_syscall_num; 635 if (lwpd->br_syscall_restart) { 636 flags |= LX_UC_RESTART_SYSCALL; 637 } 638 } else { 639 ucp->uc_brand_data[2] = NULL; 640 } 641 642 ucp->uc_brand_data[0] = (void *)flags; 643 } 644 645 #if defined(_SYSCALL32_IMPL) 646 static void 647 lx_savecontext32(ucontext32_t 
*ucp) 648 { 649 klwp_t *lwp = ttolwp(curthread); 650 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 651 unsigned int flags = 0; 652 653 /* 654 * The ucontext_t affords us three private pointer-sized members in 655 * "uc_brand_data". We pack a variety of flags into the first element, 656 * and an optional stack pointer in the second element. The flags 657 * determine which stack pointer (native or brand), if any, is stored 658 * in the second element. The third element may contain the system 659 * call number; this is analogous to the "orig_[er]ax" member of a 660 * Linux "user_regs_struct". 661 */ 662 663 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && 664 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 665 /* 666 * Record the value of the native stack pointer to restore 667 * when returning to this branded context: 668 */ 669 flags |= LX_UC_RESTORE_NATIVE_SP; 670 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; 671 } 672 673 /* 674 * Save the stack mode: 675 */ 676 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { 677 flags |= LX_UC_STACK_NATIVE; 678 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 679 flags |= LX_UC_STACK_BRAND; 680 } 681 682 /* 683 * If we might need to restart this system call, save that information 684 * in the context: 685 */ 686 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { 687 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; 688 if (lwpd->br_syscall_restart) { 689 flags |= LX_UC_RESTART_SYSCALL; 690 } 691 } else { 692 ucp->uc_brand_data[2] = NULL; 693 } 694 695 ucp->uc_brand_data[0] = flags; 696 } 697 #endif 698 699 void 700 lx_init_brand_data(zone_t *zone) 701 { 702 lx_zone_data_t *data; 703 ASSERT(zone->zone_brand == &lx_brand); 704 ASSERT(zone->zone_brand_data == NULL); 705 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); 706 /* 707 * Set the default lxzd_kernel_version to 2.4. 708 * This can be changed by a call to setattr() during zone boot. 
709 */ 710 (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX); 711 712 /* 713 * Linux is not at all picky about address family when it comes to 714 * supporting interface-related ioctls. To mimic this behavior, we'll 715 * attempt those ioctls against a ksocket configured for that purpose. 716 */ 717 (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0, 718 0, zone->zone_kcred); 719 720 zone->zone_brand_data = data; 721 722 /* 723 * In Linux, if the init(1) process terminates the system panics. 724 * The zone must reboot to simulate this behaviour. 725 */ 726 zone->zone_reboot_on_init_exit = B_TRUE; 727 } 728 729 void 730 lx_free_brand_data(zone_t *zone) 731 { 732 lx_zone_data_t *data = ztolxzd(zone); 733 ASSERT(data != NULL); 734 if (data->lxzd_ioctl_sock != NULL) { 735 /* 736 * Since zone_kcred has been cleaned up already, close the 737 * socket using the global kcred. 738 */ 739 ksocket_close(data->lxzd_ioctl_sock, kcred); 740 data->lxzd_ioctl_sock = NULL; 741 } 742 zone->zone_brand_data = NULL; 743 kmem_free(data, sizeof (*data)); 744 } 745 746 void 747 lx_unsupported(char *dmsg) 748 { 749 lx_proc_data_t *pd = ttolxproc(curthread); 750 751 DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); 752 753 if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { 754 /* 755 * If this process was run with strict mode enabled 756 * (via LX_STRICT in the environment), we mark this 757 * LWP as having triggered an unsupported behaviour. 758 * This flag will be checked at an appropriate point 759 * by lx_check_strict_failure(). 
760 */ 761 lx_lwp_data_t *lwpd = ttolxlwp(curthread); 762 763 lwpd->br_strict_failure = B_TRUE; 764 } 765 } 766 767 void 768 lx_check_strict_failure(lx_lwp_data_t *lwpd) 769 { 770 proc_t *p; 771 772 if (!lwpd->br_strict_failure) { 773 return; 774 } 775 776 lwpd->br_strict_failure = B_FALSE; 777 778 /* 779 * If this process is operating in strict mode (via LX_STRICT in 780 * the environment), and has triggered a call to 781 * lx_unsupported(), we drop SIGSYS on it as we return. 782 */ 783 p = curproc; 784 mutex_enter(&p->p_lock); 785 sigtoproc(p, curthread, SIGSYS); 786 mutex_exit(&p->p_lock); 787 } 788 789 void 790 lx_trace_sysenter(int syscall_num, uintptr_t *args) 791 { 792 if (lx_systrace_enabled) { 793 VERIFY(lx_systrace_entry_ptr != NULL); 794 795 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], 796 args[2], args[3], args[4], args[5]); 797 } 798 } 799 800 void 801 lx_trace_sysreturn(int syscall_num, long ret) 802 { 803 if (lx_systrace_enabled) { 804 VERIFY(lx_systrace_return_ptr != NULL); 805 806 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); 807 } 808 } 809 810 /* 811 * Get the addresses of the user-space system call handler and attach it to 812 * the proc structure. Returning 0 indicates success; the value returned 813 * by the system call is the value stored in rval. Returning a non-zero 814 * value indicates a failure; the value returned is used to set errno, -1 815 * is returned from the syscall and the contents of rval are ignored. To 816 * set errno and have the syscall return a value other than -1 we can 817 * manually set errno and rval and return 0. 
818 */ 819 int 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, 821 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) 822 { 823 kthread_t *t = curthread; 824 klwp_t *lwp = ttolwp(t); 825 proc_t *p = ttoproc(t); 826 lx_proc_data_t *pd; 827 struct termios *termios; 828 uint_t termios_len; 829 int error; 830 int code; 831 int sig; 832 lx_brand_registration_t reg; 833 lx_lwp_data_t *lwpd = lwptolxlwp(lwp); 834 835 /* 836 * There is one operation that is suppored for non-branded 837 * process. B_EXEC_BRAND. This is the equilivant of an 838 * exec call, but the new process that is created will be 839 * a branded process. 840 */ 841 if (cmd == B_EXEC_BRAND) { 842 VERIFY(p->p_zone != NULL); 843 VERIFY(p->p_zone->zone_brand == &lx_brand); 844 return (exec_common( 845 (char *)arg1, (const char **)arg2, (const char **)arg3, 846 EBA_BRAND)); 847 } 848 849 /* For all other operations this must be a branded process. */ 850 if (p->p_brand == NULL) 851 return (ENOSYS); 852 853 VERIFY(p->p_brand == &lx_brand); 854 VERIFY(p->p_brand_data != NULL); 855 856 switch (cmd) { 857 case B_REGISTER: 858 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 859 lx_print("stack mode was not PREINIT during " 860 "REGISTER\n"); 861 return (EINVAL); 862 } 863 864 if (p->p_model == DATAMODEL_NATIVE) { 865 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { 866 lx_print("Failed to copyin brand registration " 867 "at 0x%p\n", (void *)arg1); 868 return (EFAULT); 869 } 870 } 871 #ifdef _LP64 872 else { 873 /* 32-bit userland on 64-bit kernel */ 874 lx_brand_registration32_t reg32; 875 876 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { 877 lx_print("Failed to copyin brand registration " 878 "at 0x%p\n", (void *)arg1); 879 return (EFAULT); 880 } 881 882 reg.lxbr_version = (uint_t)reg32.lxbr_version; 883 reg.lxbr_handler = 884 (void *)(uintptr_t)reg32.lxbr_handler; 885 reg.lxbr_flags = reg32.lxbr_flags; 886 } 887 #endif 888 889 if (reg.lxbr_version != LX_VERSION_1) { 890 
lx_print("Invalid brand library version (%u)\n", 891 reg.lxbr_version); 892 return (EINVAL); 893 } 894 895 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { 896 lx_print("Invalid brand flags (%u)\n", 897 reg.lxbr_flags); 898 return (EINVAL); 899 } 900 901 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", 902 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); 903 pd = p->p_brand_data; 904 pd->l_handler = (uintptr_t)reg.lxbr_handler; 905 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; 906 907 return (0); 908 909 case B_TTYMODES: 910 /* This is necessary for emulating TCGETS ioctls. */ 911 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), 912 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, 913 &termios_len) != DDI_SUCCESS) 914 return (EIO); 915 916 ASSERT(termios_len == sizeof (*termios)); 917 918 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { 919 ddi_prop_free(termios); 920 return (EFAULT); 921 } 922 923 ddi_prop_free(termios); 924 return (0); 925 926 case B_ELFDATA: 927 pd = curproc->p_brand_data; 928 if (get_udatamodel() == DATAMODEL_NATIVE) { 929 if (copyout(&pd->l_elf_data, (void *)arg1, 930 sizeof (lx_elf_data_t)) != 0) { 931 return (EFAULT); 932 } 933 } 934 #if defined(_LP64) 935 else { 936 /* 32-bit userland on 64-bit kernel */ 937 lx_elf_data32_t led32; 938 939 led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; 940 led32.ed_phent = (int)pd->l_elf_data.ed_phent; 941 led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; 942 led32.ed_entry = (int)pd->l_elf_data.ed_entry; 943 led32.ed_base = (int)pd->l_elf_data.ed_base; 944 led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; 945 946 if (copyout(&led32, (void *)arg1, 947 sizeof (led32)) != 0) { 948 return (EFAULT); 949 } 950 } 951 #endif 952 return (0); 953 954 case B_EXEC_NATIVE: 955 return (exec_common((char *)arg1, (const char **)arg2, 956 (const char **)arg3, EBA_NATIVE)); 957 958 /* 959 * The B_TRUSS_POINT subcommand is used so that we can make a no-op 960 * syscall for debugging 
purposes (dtracing) from within the user-level 961 * emulation. 962 */ 963 case B_TRUSS_POINT: 964 return (0); 965 966 case B_LPID_TO_SPAIR: { 967 /* 968 * Given a Linux pid as arg1, return the Solaris pid in arg2 and 969 * the Solaris LWP in arg3. We also translate pid 1 (which is 970 * hardcoded in many applications) to the zone's init process. 971 */ 972 pid_t s_pid; 973 id_t s_tid; 974 975 if ((pid_t)arg1 == 1) { 976 s_pid = p->p_zone->zone_proc_initpid; 977 /* handle the dead/missing init(1M) case */ 978 if (s_pid == -1) 979 s_pid = 1; 980 s_tid = 1; 981 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { 982 return (ESRCH); 983 } 984 985 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || 986 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { 987 return (EFAULT); 988 } 989 990 return (0); 991 } 992 993 case B_SET_AFFINITY_MASK: 994 case B_GET_AFFINITY_MASK: 995 /* 996 * Retrieve or store the CPU affinity mask for the 997 * requested linux pid. 998 * 999 * arg1 is a linux PID (0 means curthread). 1000 * arg2 is the size of the given mask. 1001 * arg3 is the address of the affinity mask. 1002 */ 1003 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); 1004 1005 case B_PTRACE_STOP_FOR_OPT: 1006 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? 1007 B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); 1008 1009 case B_PTRACE_CLONE_BEGIN: 1010 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? 1011 B_FALSE : B_TRUE)); 1012 1013 case B_PTRACE_KERNEL: 1014 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4)); 1015 1016 case B_HELPER_WAITID: { 1017 idtype_t idtype = (idtype_t)arg1; 1018 id_t id = (id_t)arg2; 1019 siginfo_t *infop = (siginfo_t *)arg3; 1020 int options = (int)arg4; 1021 1022 lwpd = ttolxlwp(curthread); 1023 1024 /* 1025 * Our brand-specific waitid helper only understands a subset of 1026 * the possible idtypes. 
Ensure we keep to that subset here: 1027 */ 1028 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { 1029 return (EINVAL); 1030 } 1031 1032 /* 1033 * Enable the return of emulated ptrace(2) stop conditions 1034 * through lx_waitid_helper, and stash the Linux-specific 1035 * extra waitid() flags. 1036 */ 1037 lwpd->br_waitid_emulate = B_TRUE; 1038 lwpd->br_waitid_flags = (int)arg5; 1039 1040 #if defined(_SYSCALL32_IMPL) 1041 if (get_udatamodel() != DATAMODEL_NATIVE) { 1042 return (waitsys32(idtype, id, infop, options)); 1043 } else 1044 #endif 1045 { 1046 return (waitsys(idtype, id, infop, options)); 1047 } 1048 1049 lwpd->br_waitid_emulate = B_FALSE; 1050 lwpd->br_waitid_flags = 0; 1051 1052 return (0); 1053 } 1054 1055 case B_UNSUPPORTED: { 1056 char dmsg[256]; 1057 1058 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { 1059 lx_print("Failed to copyin unsupported msg " 1060 "at 0x%p\n", (void *)arg1); 1061 return (EFAULT); 1062 } 1063 dmsg[255] = '\0'; 1064 lx_unsupported(dmsg); 1065 1066 lx_check_strict_failure(lwpd); 1067 1068 return (0); 1069 } 1070 1071 case B_STORE_ARGS: { 1072 /* 1073 * B_STORE_ARGS subcommand 1074 * arg1 = address of struct to be copied in 1075 * arg2 = size of the struct being copied in 1076 * arg3-arg6 ignored 1077 * rval = the amount of data copied. 1078 */ 1079 void *buf; 1080 1081 /* only have upper limit because arg2 is unsigned */ 1082 if (arg2 > LX_BR_ARGS_SIZE_MAX) { 1083 return (EINVAL); 1084 } 1085 1086 buf = kmem_alloc(arg2, KM_SLEEP); 1087 if (copyin((void *)arg1, buf, arg2) != 0) { 1088 lx_print("Failed to copyin scall arg at 0x%p\n", 1089 (void *) arg1); 1090 kmem_free(buf, arg2); 1091 /* 1092 * Purposely not setting br_scall_args to NULL 1093 * to preserve data for debugging. 
1094 */ 1095 return (EFAULT); 1096 } 1097 1098 if (lwpd->br_scall_args != NULL) { 1099 ASSERT(lwpd->br_args_size > 0); 1100 kmem_free(lwpd->br_scall_args, 1101 lwpd->br_args_size); 1102 } 1103 1104 lwpd->br_scall_args = buf; 1105 lwpd->br_args_size = arg2; 1106 *rval = arg2; 1107 return (0); 1108 } 1109 1110 case B_HELPER_CLONE: 1111 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, 1112 (void *)arg4)); 1113 1114 case B_HELPER_SETGROUPS: 1115 return (lx_helper_setgroups(arg1, (gid_t *)arg2)); 1116 1117 case B_HELPER_SIGQUEUE: 1118 return (lx_helper_rt_sigqueueinfo(arg1, arg2, 1119 (siginfo_t *)arg3)); 1120 1121 case B_HELPER_TGSIGQUEUE: 1122 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, 1123 (siginfo_t *)arg4)); 1124 1125 case B_SET_THUNK_PID: 1126 lwpd->br_lx_thunk_pid = arg1; 1127 return (0); 1128 1129 case B_GETPID: 1130 /* 1131 * The usermode clone(2) code needs to be able to call 1132 * lx_getpid() from native code: 1133 */ 1134 *rval = lx_getpid(); 1135 return (0); 1136 1137 case B_SET_NATIVE_STACK: 1138 /* 1139 * B_SET_NATIVE_STACK subcommand 1140 * arg1 = the base of the stack to use for emulation 1141 */ 1142 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { 1143 lx_print("B_SET_NATIVE_STACK when stack was already " 1144 "set to %p\n", (void *)arg1); 1145 return (EEXIST); 1146 } 1147 1148 /* 1149 * We move from the PREINIT state, where we have no brand 1150 * emulation stack, to the INIT state. Here, we are still 1151 * running on what will become the BRAND stack, but are running 1152 * emulation (i.e. native) code. Once the initialisation 1153 * process for this thread has finished, we will jump to 1154 * brand-specific code, while moving to the BRAND mode. 1155 * 1156 * When a new LWP is created, lx_initlwp() will clear the 1157 * stack data. If that LWP is actually being duplicated 1158 * into a child process by fork(2), lx_forklwp() will copy 1159 * it so that the cloned thread will keep using the same 1160 * alternate stack. 
1161 */ 1162 lwpd->br_ntv_stack = arg1; 1163 lwpd->br_stack_mode = LX_STACK_MODE_INIT; 1164 lx_lwp_set_native_stack_current(lwpd, arg1); 1165 1166 return (0); 1167 1168 case B_GET_CURRENT_CONTEXT: 1169 /* 1170 * B_GET_CURRENT_CONTEXT subcommand: 1171 * arg1 = address for pointer to current ucontext_t 1172 */ 1173 1174 #if defined(_SYSCALL32_IMPL) 1175 if (get_udatamodel() != DATAMODEL_NATIVE) { 1176 caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; 1177 1178 error = copyout(&addr, (void *)arg1, sizeof (addr)); 1179 } else 1180 #endif 1181 { 1182 error = copyout(&lwp->lwp_oldcontext, (void *)arg1, 1183 sizeof (lwp->lwp_oldcontext)); 1184 } 1185 1186 return (error != 0 ? EFAULT : 0); 1187 1188 case B_JUMP_TO_LINUX: 1189 /* 1190 * B_JUMP_TO_LINUX subcommand: 1191 * arg1 = ucontext_t pointer for jump state 1192 */ 1193 1194 if (arg1 == NULL) 1195 return (EINVAL); 1196 1197 switch (lwpd->br_stack_mode) { 1198 case LX_STACK_MODE_NATIVE: { 1199 struct regs *rp = lwptoregs(lwp); 1200 1201 /* 1202 * We are on the NATIVE stack, so we must preserve 1203 * the extent of that stack. The pointer will be 1204 * reset by a future setcontext(). 1205 */ 1206 lx_lwp_set_native_stack_current(lwpd, 1207 (uintptr_t)rp->r_sp); 1208 break; 1209 } 1210 1211 case LX_STACK_MODE_INIT: 1212 /* 1213 * The LWP is transitioning to Linux code for the first 1214 * time. 1215 */ 1216 break; 1217 1218 case LX_STACK_MODE_PREINIT: 1219 /* 1220 * This LWP has not installed an alternate stack for 1221 * usermode emulation handling. 1222 */ 1223 return (ENOENT); 1224 1225 case LX_STACK_MODE_BRAND: 1226 /* 1227 * The LWP should not be on the BRAND stack. 
1228 */ 1229 exit(CLD_KILLED, SIGSYS); 1230 return (0); 1231 } 1232 1233 /* 1234 * Transfer control to Linux: 1235 */ 1236 return (lx_runexe(lwp, (void *)arg1)); 1237 1238 case B_EMULATION_DONE: 1239 /* 1240 * B_EMULATION_DONE subcommand: 1241 * arg1 = ucontext_t * to restore 1242 * arg2 = system call number 1243 * arg3 = return code 1244 * arg4 = if operation failed, the errno value 1245 */ 1246 1247 /* 1248 * The first part of this operation is a setcontext() to 1249 * restore the register state to the copy we preserved 1250 * before vectoring to the usermode emulation routine. 1251 * If that fails, we return (hopefully) to the emulation 1252 * routine and it will handle the error. 1253 */ 1254 #if (_SYSCALL32_IMPL) 1255 if (get_udatamodel() != DATAMODEL_NATIVE) { 1256 error = getsetcontext32(SETCONTEXT, (void *)arg1); 1257 } else 1258 #endif 1259 { 1260 error = getsetcontext(SETCONTEXT, (void *)arg1); 1261 } 1262 1263 if (error != 0) { 1264 return (error); 1265 } 1266 1267 /* 1268 * The saved Linux context has been restored. We handle the 1269 * return value or errno with code common to the in-kernel 1270 * system call emulation. 
 */
		if ((error = (int)arg4) != 0) {
			/*
			 * lx_syscall_return() looks at the errno in the LWP,
			 * so set it here:
			 */
			set_errno(error);
		}
		lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);

		return (0);

	case B_EXIT_AS_SIG:
		code = CLD_KILLED;
		sig = (int)arg1;
		proc_is_exiting(p);
		/* Terminate all other LWPs before tearing down the process. */
		if (exitlwps(1) != 0) {
			mutex_enter(&p->p_lock);
			lwp_exit();
		}
		ttolwp(curthread)->lwp_cursig = sig;
		if (sig == SIGSEGV) {
			/* Attempt a core dump for SIGSEGV exits. */
			if (core(sig, 0) == 0)
				code = CLD_DUMPED;
		}
		exit(code, sig);
		/* NOTREACHED */
		break;
	}

	return (EINVAL);
}

/*
 * Return the Linux kernel version string configured for this zone.
 */
char *
lx_get_zone_kern_version(zone_t *zone)
{
	return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
}

/*
 * Set the Linux kernel version string for this zone; strlcpy() truncates
 * the copy to LX_VERS_MAX bytes (including the terminating NUL).
 */
void
lx_set_kern_version(zone_t *zone, char *vers)
{
	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;

	(void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
}

/*
 * Compare linux kernel version to the one set for the zone.
 * Returns greater than 0 if zone version is higher, less than 0 if the zone
 * version is lower, and 0 if the versions are equal.
 */
int
lx_kern_version_cmp(zone_t *zone, const char *vers)
{
	int zvers[3] = {0, 0, 0};
	int cvers[3] = {0, 0, 0};
	int i;

	VERIFY(zone->zone_brand == &lx_brand);

	/*
	 * Missing components default to 0; e.g. "3.10" compares as "3.10.0".
	 */
	(void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
	    &zvers[1], &zvers[2]);
	(void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);

	for (i = 0; i < 3; i++) {
		if (zvers[i] > cvers[i]) {
			return (1);
		} else if (zvers[i] < cvers[i]) {
			return (-1);
		}
	}
	return (0);
}

/*
 * Linux unconditionally removes the setuid and setgid bits when changing
 * file ownership.
This brand hook overrides the illumos native behaviour, 1349 * which is based on the PRIV_FILE_SETID privilege. 1350 */ 1351 static int 1352 lx_setid_clear(vattr_t *vap, cred_t *cr) 1353 { 1354 if (S_ISDIR(vap->va_mode)) { 1355 return (0); 1356 } 1357 1358 if (vap->va_mode & S_ISUID) { 1359 vap->va_mask |= AT_MODE; 1360 vap->va_mode &= ~S_ISUID; 1361 } 1362 if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1363 vap->va_mask |= AT_MODE; 1364 vap->va_mode &= ~S_ISGID; 1365 } 1366 1367 return (0); 1368 } 1369 1370 /* 1371 * Copy the per-process brand data from a parent proc to a child. 1372 */ 1373 void 1374 lx_copy_procdata(proc_t *child, proc_t *parent) 1375 { 1376 lx_proc_data_t *cpd = child->p_brand_data; 1377 lx_proc_data_t *ppd = parent->p_brand_data; 1378 1379 VERIFY(parent->p_brand == &lx_brand); 1380 VERIFY(child->p_brand == &lx_brand); 1381 VERIFY(ppd != NULL); 1382 VERIFY(cpd != NULL); 1383 1384 *cpd = *ppd; 1385 1386 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY; 1387 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY; 1388 1389 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20; 1390 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20; 1391 1392 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY; 1393 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY; 1394 1395 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; 1396 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; 1397 } 1398 1399 #if defined(_LP64) 1400 static void 1401 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst) 1402 { 1403 bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident)); 1404 dst->e_type = src->e_type; 1405 dst->e_machine = src->e_machine; 1406 dst->e_version = src->e_version; 1407 dst->e_entry = src->e_entry; 1408 dst->e_phoff = src->e_phoff; 1409 dst->e_shoff = src->e_shoff; 1410 dst->e_flags = src->e_flags; 1411 dst->e_ehsize = src->e_ehsize; 1412 dst->e_phentsize = 
src->e_phentsize; 1413 dst->e_phnum = src->e_phnum; 1414 dst->e_shentsize = src->e_shentsize; 1415 dst->e_shnum = src->e_shnum; 1416 dst->e_shstrndx = src->e_shstrndx; 1417 } 1418 #endif /* _LP64 */ 1419 1420 static void 1421 restoreexecenv(struct execenv *ep, stack_t *sp) 1422 { 1423 klwp_t *lwp = ttolwp(curthread); 1424 1425 setexecenv(ep); 1426 lwp->lwp_sigaltstack.ss_sp = sp->ss_sp; 1427 lwp->lwp_sigaltstack.ss_size = sp->ss_size; 1428 lwp->lwp_sigaltstack.ss_flags = sp->ss_flags; 1429 } 1430 1431 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, 1432 long *, int, caddr_t, cred_t *, int *); 1433 1434 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int, 1435 long *, int, caddr_t, cred_t *, int *); 1436 1437 /* 1438 * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux 1439 * binaries. 1440 */ 1441 static int 1442 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, 1443 struct intpdata *idata, int level, long *execsz, int setid, 1444 caddr_t exec_file, struct cred *cred, int *brand_action) 1445 { 1446 int error; 1447 vnode_t *nvp; 1448 Ehdr ehdr; 1449 Addr uphdr_vaddr; 1450 intptr_t voffset; 1451 char *interp = NULL; 1452 uintptr_t ldaddr = NULL; 1453 int i; 1454 proc_t *p = ttoproc(curthread); 1455 klwp_t *lwp = ttolwp(curthread); 1456 struct execenv env; 1457 struct execenv origenv; 1458 stack_t orig_sigaltstack; 1459 struct user *up = PTOU(ttoproc(curthread)); 1460 lx_elf_data_t *edp; 1461 char *lib_path = NULL; 1462 1463 ASSERT(ttoproc(curthread)->p_brand == &lx_brand); 1464 ASSERT(ttoproc(curthread)->p_brand_data != NULL); 1465 1466 edp = &ttolxproc(curthread)->l_elf_data; 1467 1468 if (args->to_model == DATAMODEL_NATIVE) { 1469 lib_path = LX_LIB_PATH; 1470 } 1471 #if defined(_LP64) 1472 else { 1473 lib_path = LX_LIB_PATH32; 1474 } 1475 #endif 1476 1477 /* 1478 * Set the brandname and library name for the new process so that 1479 * elfexec() puts them onto the stack. 
1480 */ 1481 args->brandname = LX_BRANDNAME; 1482 args->emulator = lib_path; 1483 1484 #if defined(_LP64) 1485 /* 1486 * To conform with the way Linux lays out the address space, we clamp 1487 * the stack to be the top of the lower region of the x86-64 canonical 1488 * form address space -- which has the side-effect of laying out the 1489 * entire address space in that lower region. Note that this only 1490 * matters on 64-bit processes (this value will always be greater than 1491 * the size of a 32-bit address space) and doesn't actually affect 1492 * USERLIMIT: if a Linux-branded processes wishes to map something 1493 * into the top half of the address space, it can do so -- but with 1494 * the user stack starting at the top of the bottom region, those high 1495 * virtual addresses won't be used unless explicitly directed. 1496 */ 1497 args->maxstack = lx_maxstack64; 1498 #endif 1499 1500 /* 1501 * We will first exec the brand library, then map in the linux 1502 * executable and the linux linker. 1503 */ 1504 if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP, 1505 &nvp))) { 1506 uprintf("%s: not found.", lib_path); 1507 return (error); 1508 } 1509 1510 /* 1511 * We will eventually set the p_exec member to be the vnode for the new 1512 * executable when we call setexecenv(). However, if we get an error 1513 * before that call we need to restore the execenv to its original 1514 * values so that when we return to the caller fop_close() works 1515 * properly while cleaning up from the failed exec(). Restoring the 1516 * original value will also properly decrement the 2nd VN_RELE that we 1517 * took on the brand library. 
1518 */ 1519 origenv.ex_bssbase = p->p_bssbase; 1520 origenv.ex_brkbase = p->p_brkbase; 1521 origenv.ex_brksize = p->p_brksize; 1522 origenv.ex_vp = p->p_exec; 1523 orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp; 1524 orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size; 1525 orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; 1526 1527 if (args->to_model == DATAMODEL_NATIVE) { 1528 error = elfexec(nvp, uap, args, idata, level + 1, execsz, 1529 setid, exec_file, cred, brand_action); 1530 } 1531 #if defined(_LP64) 1532 else { 1533 error = elf32exec(nvp, uap, args, idata, level + 1, execsz, 1534 setid, exec_file, cred, brand_action); 1535 } 1536 #endif 1537 VN_RELE(nvp); 1538 if (error != 0) { 1539 restoreexecenv(&origenv, &orig_sigaltstack); 1540 return (error); 1541 } 1542 1543 /* 1544 * exec-ed in the brand library above. 1545 * The u_auxv vectors are now setup by elfexec to point to the 1546 * brand emulation library and its linker. 1547 */ 1548 1549 bzero(&env, sizeof (env)); 1550 1551 /* 1552 * map in the the Linux executable 1553 */ 1554 if (args->to_model == DATAMODEL_NATIVE) { 1555 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, 1556 &voffset, exec_file, &interp, &env.ex_bssbase, 1557 &env.ex_brkbase, &env.ex_brksize, NULL, NULL); 1558 } 1559 #if defined(_LP64) 1560 else { 1561 Elf32_Ehdr ehdr32; 1562 Elf32_Addr uphdr_vaddr32; 1563 1564 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, 1565 &voffset, exec_file, &interp, &env.ex_bssbase, 1566 &env.ex_brkbase, &env.ex_brksize, NULL, NULL); 1567 1568 Ehdr32to64(&ehdr32, &ehdr); 1569 1570 if (uphdr_vaddr32 == (Elf32_Addr)-1) 1571 uphdr_vaddr = (Addr)-1; 1572 else 1573 uphdr_vaddr = uphdr_vaddr32; 1574 } 1575 #endif 1576 if (error != 0) { 1577 restoreexecenv(&origenv, &orig_sigaltstack); 1578 1579 if (interp != NULL) 1580 kmem_free(interp, MAXPATHLEN); 1581 1582 return (error); 1583 } 1584 1585 /* 1586 * Save off the important properties of the lx executable. 
The brand 1587 * library will ask us for this data later, when it is ready to set 1588 * things up for the lx executable. 1589 */ 1590 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : 1591 voffset + uphdr_vaddr; 1592 edp->ed_entry = voffset + ehdr.e_entry; 1593 edp->ed_phent = ehdr.e_phentsize; 1594 edp->ed_phnum = ehdr.e_phnum; 1595 1596 if (interp != NULL) { 1597 if (ehdr.e_type == ET_DYN) { 1598 /* 1599 * This is a shared object executable, so we need to 1600 * pick a reasonable place to put the heap. Just don't 1601 * use the first page. 1602 */ 1603 env.ex_brkbase = (caddr_t)PAGESIZE; 1604 env.ex_bssbase = (caddr_t)PAGESIZE; 1605 } 1606 1607 /* 1608 * If the program needs an interpreter (most do), map it in and 1609 * store relevant information about it in the aux vector, where 1610 * the brand library can find it. 1611 */ 1612 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW, 1613 NULLVPP, &nvp))) { 1614 uprintf("%s: not found.", interp); 1615 restoreexecenv(&origenv, &orig_sigaltstack); 1616 kmem_free(interp, MAXPATHLEN); 1617 return (error); 1618 } 1619 1620 kmem_free(interp, MAXPATHLEN); 1621 interp = NULL; 1622 1623 /* 1624 * map in the Linux linker 1625 */ 1626 if (args->to_model == DATAMODEL_NATIVE) { 1627 error = mapexec_brand(nvp, args, &ehdr, 1628 &uphdr_vaddr, &voffset, exec_file, NULL, NULL, 1629 NULL, NULL, NULL, &ldaddr); 1630 } 1631 #if defined(_LP64) 1632 else { 1633 Elf32_Ehdr ehdr32; 1634 Elf32_Addr uphdr_vaddr32; 1635 1636 error = mapexec32_brand(nvp, args, &ehdr32, 1637 &uphdr_vaddr32, &voffset, exec_file, NULL, NULL, 1638 NULL, NULL, NULL, &ldaddr); 1639 1640 Ehdr32to64(&ehdr32, &ehdr); 1641 1642 if (uphdr_vaddr32 == (Elf32_Addr)-1) 1643 uphdr_vaddr = (Addr)-1; 1644 else 1645 uphdr_vaddr = uphdr_vaddr32; 1646 } 1647 #endif 1648 1649 VN_RELE(nvp); 1650 if (error != 0) { 1651 restoreexecenv(&origenv, &orig_sigaltstack); 1652 return (error); 1653 } 1654 1655 /* 1656 * Now that we know the base address of the brand's linker, 
1657 * we also save this for later use by the brand library. 1658 */ 1659 edp->ed_base = voffset; 1660 edp->ed_ldentry = voffset + ehdr.e_entry; 1661 } else { 1662 /* 1663 * This program has no interpreter. The lx brand library will 1664 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, 1665 * so in this case, put the entry point of the main executable 1666 * there. 1667 */ 1668 if (ehdr.e_type == ET_EXEC) { 1669 /* 1670 * An executable with no interpreter, this must be a 1671 * statically linked executable, which means we loaded 1672 * it at the address specified in the elf header, in 1673 * which case the e_entry field of the elf header is an 1674 * absolute address. 1675 */ 1676 edp->ed_ldentry = ehdr.e_entry; 1677 edp->ed_entry = ehdr.e_entry; 1678 } else { 1679 /* 1680 * A shared object with no interpreter, we use the 1681 * calculated address from above. 1682 */ 1683 edp->ed_ldentry = edp->ed_entry; 1684 1685 /* 1686 * In all situations except an ET_DYN elf object with no 1687 * interpreter, we want to leave the brk and base 1688 * values set by mapexec_brand alone. Normally when 1689 * running ET_DYN objects on Solaris (most likely 1690 * /lib/ld.so.1) the kernel sets brk and base to 0 since 1691 * it doesn't know where to put the heap, and later the 1692 * linker will call brk() to initialize the heap in: 1693 * usr/src/cmd/sgs/rtld/common/setup.c:setup() 1694 * after it has determined where to put it. (This 1695 * decision is made after the linker loads and inspects 1696 * elf properties of the target executable being run.) 1697 * 1698 * So for ET_DYN Linux executables, we also don't know 1699 * where the heap should go, so we'll set the brk and 1700 * base to 0. But in this case the Solaris linker will 1701 * not initialize the heap, so when the Linux linker 1702 * starts running there is no heap allocated. 
This 1703 * seems to be ok on Linux 2.4 based systems because the 1704 * Linux linker/libc fall back to using mmap() to 1705 * allocate memory. But on 2.6 systems, running 1706 * applications by specifying them as command line 1707 * arguments to the linker results in segfaults for an 1708 * as yet undetermined reason (which seems to indicatej 1709 * that a more permanent fix for heap initalization in 1710 * these cases may be necessary). 1711 */ 1712 if (ehdr.e_type == ET_DYN) { 1713 env.ex_bssbase = (caddr_t)0; 1714 env.ex_brkbase = (caddr_t)0; 1715 env.ex_brksize = 0; 1716 } 1717 } 1718 1719 } 1720 1721 env.ex_vp = vp; 1722 setexecenv(&env); 1723 1724 /* 1725 * We try to keep /proc's view of the aux vector consistent with 1726 * what's on the process stack. 1727 */ 1728 if (args->to_model == DATAMODEL_NATIVE) { 1729 auxv_t phdr_auxv[4] = { 1730 { AT_SUN_BRAND_LX_PHDR, 0 }, 1731 { AT_SUN_BRAND_LX_INTERP, 0 }, 1732 { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }, 1733 { AT_SUN_BRAND_AUX4, 0 } 1734 }; 1735 phdr_auxv[0].a_un.a_val = edp->ed_phdr; 1736 phdr_auxv[1].a_un.a_val = ldaddr; 1737 phdr_auxv[2].a_un.a_val = 1; /* set in lx_init */ 1738 phdr_auxv[3].a_type = AT_CLKTCK; 1739 phdr_auxv[3].a_un.a_val = hz; 1740 1741 if (copyout(&phdr_auxv, args->auxp_brand, 1742 sizeof (phdr_auxv)) == -1) 1743 return (EFAULT); 1744 } 1745 #if defined(_LP64) 1746 else { 1747 auxv32_t phdr_auxv32[3] = { 1748 { AT_SUN_BRAND_LX_PHDR, 0 }, 1749 { AT_SUN_BRAND_LX_INTERP, 0 }, 1750 { AT_SUN_BRAND_AUX3, 0 } 1751 }; 1752 phdr_auxv32[0].a_un.a_val = edp->ed_phdr; 1753 phdr_auxv32[1].a_un.a_val = ldaddr; 1754 phdr_auxv32[2].a_type = AT_CLKTCK; 1755 phdr_auxv32[2].a_un.a_val = hz; 1756 1757 if (copyout(&phdr_auxv32, args->auxp_brand, 1758 sizeof (phdr_auxv32)) == -1) 1759 return (EFAULT); 1760 } 1761 #endif 1762 1763 /* 1764 * /proc uses the AT_ENTRY aux vector entry to deduce 1765 * the location of the executable in the address space. 
The user 1766 * structure contains a copy of the aux vector that needs to have those 1767 * entries patched with the values of the real lx executable (they 1768 * currently contain the values from the lx brand library that was 1769 * elfexec'd, above). 1770 * 1771 * For live processes, AT_BASE is used to locate the linker segment, 1772 * which /proc and friends will later use to find Solaris symbols 1773 * (such as rtld_db_preinit). However, for core files, /proc uses 1774 * AT_ENTRY to find the right segment to label as the executable. 1775 * So we set AT_ENTRY to be the entry point of the linux executable, 1776 * but leave AT_BASE to be the address of the Solaris linker. 1777 */ 1778 for (i = 0; i < __KERN_NAUXV_IMPL; i++) { 1779 switch (up->u_auxv[i].a_type) { 1780 case AT_ENTRY: 1781 up->u_auxv[i].a_un.a_val = edp->ed_entry; 1782 break; 1783 1784 case AT_SUN_BRAND_LX_PHDR: 1785 up->u_auxv[i].a_un.a_val = edp->ed_phdr; 1786 break; 1787 1788 case AT_SUN_BRAND_LX_INTERP: 1789 up->u_auxv[i].a_un.a_val = ldaddr; 1790 break; 1791 1792 default: 1793 break; 1794 } 1795 } 1796 1797 return (0); 1798 } 1799 1800 boolean_t 1801 lx_native_exec(uint8_t osabi, const char **interp) 1802 { 1803 if (osabi != ELFOSABI_SOLARIS) 1804 return (B_FALSE); 1805 1806 /* 1807 * If the process root matches the zone root, prepend /native to the 1808 * interpreter path for native executables. Absolute precision from 1809 * VN_CMP is not necessary since any change of process root is likely 1810 * to make native binaries inaccessible via /native. 1811 * 1812 * Processes which chroot directly into /native will be able to 1813 * function as expected with no need for the prefix. 1814 */ 1815 if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) { 1816 *interp = "/native"; 1817 } 1818 1819 return (B_TRUE); 1820 } 1821 1822 static void 1823 lx_syscall_init(void) 1824 { 1825 int i; 1826 1827 /* 1828 * Count up the 32-bit Linux system calls. 
	 * Note that lx_sysent32
	 * has (LX_NSYSCALLS + 1) entries.
	 */
	for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
		continue;
	lx_nsysent32 = i;

#if defined(_LP64)
	/*
	 * Count up the 64-bit Linux system calls.  Note that lx_sysent64
	 * has (LX_NSYSCALLS + 1) entries.
	 */
	for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
		continue;
	lx_nsysent64 = i;
#endif
}

/*
 * Module load entry point: initialize the brand's subsystems, then
 * install the module.
 */
int
_init(void)
{
	int err = 0;

	lx_syscall_init();
	lx_pid_init();
	lx_ioctl_init();
	lx_futex_init();
	lx_ptrace_init();
	lx_socket_init();

	err = mod_install(&modlinkage);
	if (err != 0) {
		cmn_err(CE_WARN, "Couldn't install lx brand module");

		/*
		 * This looks drastic, but it should never happen.  These
		 * two data structures should be completely free-able until
		 * they are used by Linux processes.  Since the brand
		 * wasn't loaded there should be no Linux processes, and
		 * thus no way for these data structures to be modified.
		 *
		 * NOTE(review): this error path does not call
		 * lx_ptrace_fini() or lx_socket_fini() even though their
		 * init counterparts ran above -- confirm whether that is
		 * intentional or a cleanup omission.
		 */
		lx_pid_fini();
		lx_ioctl_fini();
		if (lx_futex_fini())
			panic("lx brand module cannot be loaded or unloaded.");
	}
	return (err);
}

/*
 * Module information entry point.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*
 * Module unload entry point: tear down the brand's subsystems and remove
 * the module, re-initializing everything if the removal fails.
 */
int
_fini(void)
{
	int err;
	int futex_done = 0;

	/*
	 * If there are any zones using this brand, we can't allow it to be
	 * unloaded.
	 */
	if (brand_zone_count(&lx_brand))
		return (EBUSY);

	lx_ptrace_fini();
	lx_pid_fini();
	lx_ioctl_fini();
	lx_socket_fini();

	if ((err = lx_futex_fini()) != 0) {
		goto done;
	}
	futex_done = 1;

	err = mod_remove(&modlinkage);

done:
	if (err) {
		/*
		 * If we can't unload the module, then we have to get it
		 * back into a sane state.
		 */
		lx_ptrace_init();
		lx_pid_init();
		lx_ioctl_init();
		lx_socket_init();

		if (futex_done) {
			lx_futex_init();
		}
	}

	return (err);
}