1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2015, Joyent, Inc. All rights reserved.
  29  */
  30 
  31 /*
  32  * The LX Brand: emulation of a Linux operating environment within a zone.
  33  *
  34  * OVERVIEW
  35  *
  36  * The LX brand enables a full Linux userland -- including a C library,
  37  * init(1) framework, and some set of applications -- to run unmodified
  38  * within an illumos zone.  Unlike illumos, where applications are expected
  39  * to link against and consume functions exported from libraries, the
  40  * supported Linux binary compatibility boundary is the system call
  41  * interface.  By accurately emulating the behaviour of Linux system calls,
  42  * Linux software can be executed in this environment as if it were running
  43  * on a native Linux system.
  44  *
  45  * EMULATING LINUX SYSTEM CALLS
  46  *
  47  * Linux system calls are made in 32-bit processes via the "int 0x80"
  48  * instruction; in 64-bit processes the "syscall" instruction is used, as it
  49  * is with native illumos processes.  In both cases, arguments to system
  50  * calls are generally passed in registers and the usermode stack is not
  51  * interpreted or modified by the Linux kernel.
  52  *
  53  * When the emulated Linux process makes a system call, it traps into the
  54  * illumos kernel.  The in-kernel brand module contains various emulation
  55  * routines, and can fully service some emulated system calls; e.g. read(2)
  56  * and write(2).  Other system calls require assistance from the illumos
  57  * libc, bouncing back out to the brand library ("lx_brand.so.1") for
  58  * emulation.
  59  *
  60  * The brand mechanism allows for the provision of an alternative trap
  61  * handler for the various system call mechanisms.  Traditionally this was
  62  * used to immediately revector execution to the usermode emulation library,
  63  * which was responsible for handling all system calls.  In the interests of
  64  * more accurate emulation and increased performance, much of the regular
  65  * illumos system call path is now invoked.  Only the argument processing and
  66  * handler dispatch are replaced by the brand, via the per-LWP
  67  * "lwp_brand_syscall" interposition function pointer.
  68  *
  69  * THE NATIVE AND BRAND STACKS
  70  *
  71  * Some runtime environments (e.g. the Go language) allocate very small
  72  * thread stacks, preferring to grow or split the stack as necessary.  The
  73  * Linux kernel generally does not use the usermode stack when servicing
  74  * system calls, so this is not a problem.  In order for our emulation to
  75  * have the same zero stack impact, we must execute usermode emulation
  76  * routines on an _alternate_ stack.  This is similar, in principle, to the
  77  * use of sigaltstack(3C) to run signal handlers off the main thread stack.
  78  *
  79  * To this end, the brand library allocates and installs an alternate stack
  80  * (called the "native" stack) for each LWP.  The in-kernel brand code uses
  81  * this stack for usermode emulation calls and interposed signal delivery,
  82  * while the emulated Linux process sees only the data on the main thread
  83  * stack, known as the "brand" stack.  The stack mode is tracked in the
  84  * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
  85  *
  86  * The stack mode doubles as a system call "mode bit".  When in the
  87  * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
  88  * system calls.  In other modes, system calls are assumed to be native
  89  * illumos system calls as made during brand library initialisation and
  90  * usermode emulation.
  91  *
  92  * USERMODE EMULATION
  93  *
  94  * When a Linux system call cannot be emulated within the kernel, we preserve
  95  * the register state of the Linux process and revector the LWP to the brand
  96  * library usermode emulation handler: the "lx_emulate()" function in
  97  * "lx_brand.so.1".  This revectoring is modelled on the delivery of signals,
  98  * and is performed in "lx_emulate_user()".
  99  *
 100  * First, the emulated process state is written out to the usermode stack of
 101  * the process as a "ucontext_t" object.  Arguments to the emulation routine
 102  * are passed on the stack or in registers, depending on the ABI.  When the
 103  * usermode emulation is complete, the result is passed back to the kernel
 104  * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
 105  * for restoration.
 106  *
 107  * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
 108  *
 109  * When servicing emulated system calls in the usermode brand library, or
 110  * during signal delivery, various state is preserved by the kernel so that
 111  * the running LWP may be revectored to a handling routine.  The context
 112  * allows the kernel to restart the program at the point of interruption,
 113  * either at the return of the signal handler, via setcontext(3C); or after
 114  * the usermode emulation request has been serviced, via B_EMULATION_DONE.
 115  *
 116  * In illumos native processes, the saved context (a "ucontext_t" object)
 117  * includes the state of registers and the current signal mask at the point
 118  * of interruption.  The context also includes a link to the most recently
 119  * saved context, forming a chain to be unwound as requests complete.  The LX
 120  * brand requires additional book-keeping to describe the machine state: in
 121  * particular, the current stack mode and the occupied extent of the native
 122  * stack.
 123  *
 124  * The brand code is able to interpose on the context save and restore
 125  * operations in the kernel -- see "lx_savecontext()" and
 126  * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
 127  * function correctly in the face of a dual stack LWP.  The brand also
 128  * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
 129  * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
 130  * library interposer on the native stack, regardless of the interrupted
 131  * execution mode.  Linux sigaltstack(2) emulation is performed entirely by
 132  * the usermode brand library during signal handler interposition.
 133  */
 134 
 135 #include <sys/types.h>
 136 #include <sys/kmem.h>
 137 #include <sys/errno.h>
 138 #include <sys/thread.h>
 139 #include <sys/systm.h>
 140 #include <sys/syscall.h>
 141 #include <sys/proc.h>
 142 #include <sys/modctl.h>
 143 #include <sys/cmn_err.h>
 144 #include <sys/model.h>
 145 #include <sys/exec.h>
 146 #include <sys/lx_impl.h>
 147 #include <sys/machbrand.h>
 148 #include <sys/lx_syscalls.h>
 149 #include <sys/lx_misc.h>
 150 #include <sys/lx_futex.h>
 151 #include <sys/lx_brand.h>
 152 #include <sys/param.h>
 153 #include <sys/termios.h>
 154 #include <sys/sunddi.h>
 155 #include <sys/ddi.h>
 156 #include <sys/vnode.h>
 157 #include <sys/pathname.h>
 158 #include <sys/auxv.h>
 159 #include <sys/priv.h>
 160 #include <sys/regset.h>
 161 #include <sys/privregs.h>
 162 #include <sys/archsystm.h>
 163 #include <sys/zone.h>
 164 #include <sys/brand.h>
 165 #include <sys/sdt.h>
 166 #include <sys/x86_archext.h>
 167 #include <sys/controlregs.h>
 168 #include <sys/core.h>
 169 #include <sys/stack.h>
 170 #include <sys/stat.h>
 171 #include <sys/socket.h>
 172 #include <lx_signum.h>
 173 #include <util/sscanf.h>
 174 
 175 int     lx_debug = 0;
 176 
 177 void    lx_init_brand_data(zone_t *);
 178 void    lx_free_brand_data(zone_t *);
 179 void    lx_setbrand(proc_t *);
 180 int     lx_getattr(zone_t *, int, void *, size_t *);
 181 int     lx_setattr(zone_t *, int, void *, size_t);
 182 int     lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
 183                 uintptr_t, uintptr_t);
 184 void    lx_set_kern_version(zone_t *, char *);
 185 void    lx_copy_procdata(proc_t *, proc_t *);
 186 
 187 extern int getsetcontext(int, void *);
 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int);
 189 #if defined(_SYSCALL32_IMPL)
 190 extern int getsetcontext32(int, void *);
 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
 192 #endif
 193 
 194 extern void lx_proc_exit(proc_t *);
 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
 196 
 197 extern void lx_ioctl_init();
 198 extern void lx_ioctl_fini();
 199 extern void lx_socket_init();
 200 extern void lx_socket_fini();
 201 
 202 lx_systrace_f *lx_systrace_entry_ptr;
 203 lx_systrace_f *lx_systrace_return_ptr;
 204 
 205 static int lx_systrace_enabled;
 206 
 207 /*
 208  * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
 209  * want an MMU dependency here (and should there be a microprocessor without
 210  * a hole, we don't want to start allocating from the top of the VA range).
 211  */
 212 #define LX_MAXSTACK64   0x7ffffff00000
 213 
 214 uint64_t lx_maxstack64 = LX_MAXSTACK64;
 215 
 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
 217     struct intpdata *idata, int level, long *execsz, int setid,
 218     caddr_t exec_file, struct cred *cred, int *brand_action);
 219 
 220 static boolean_t lx_native_exec(uint8_t, const char **);
 221 static uint32_t lx_map32limit(proc_t *);
 222 
 223 static void lx_savecontext(ucontext_t *);
 224 static void lx_restorecontext(ucontext_t *);
 225 static caddr_t lx_sendsig_stack(int);
 226 static void lx_sendsig(int);
 227 #if defined(_SYSCALL32_IMPL)
 228 static void lx_savecontext32(ucontext32_t *);
 229 #endif
 230 static int lx_setid_clear(vattr_t *, cred_t *);
 231 #if defined(_LP64)
 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
 233     enum seg_rw);
 234 #endif
 235 
 236 
 237 /* lx brand */
 238 struct brand_ops lx_brops = {
 239         lx_init_brand_data,             /* b_init_brand_data */
 240         lx_free_brand_data,             /* b_free_brand_data */
 241         lx_brandsys,                    /* b_brandsys */
 242         lx_setbrand,                    /* b_setbrand */
 243         lx_getattr,                     /* b_getattr */
 244         lx_setattr,                     /* b_setattr */
 245         lx_copy_procdata,               /* b_copy_procdata */
 246         lx_proc_exit,                   /* b_proc_exit */
 247         lx_exec,                        /* b_exec */
 248         lx_setrval,                     /* b_lwp_setrval */
 249         lx_lwpdata_alloc,               /* b_lwpdata_alloc */
 250         lx_lwpdata_free,                /* b_lwpdata_free */
 251         lx_initlwp,                     /* b_initlwp */
 252         lx_forklwp,                     /* b_forklwp */
 253         lx_freelwp,                     /* b_freelwp */
 254         lx_exitlwp,                     /* b_lwpexit */
 255         lx_elfexec,                     /* b_elfexec */
 256         NULL,                           /* b_sigset_native_to_brand */
 257         NULL,                           /* b_sigset_brand_to_native */
 258         lx_sigfd_translate,             /* b_sigfd_translate */
 259         NSIG,                           /* b_nsig */
 260         lx_exit_with_sig,               /* b_exit_with_sig */
 261         lx_wait_filter,                 /* b_wait_filter */
 262         lx_native_exec,                 /* b_native_exec */
 263         lx_map32limit,                  /* b_map32limit */
 264         lx_stop_notify,                 /* b_stop_notify */
 265         lx_waitid_helper,               /* b_waitid_helper */
 266         lx_sigcld_repost,               /* b_sigcld_repost */
 267         lx_ptrace_issig_stop,           /* b_issig_stop */
 268         lx_ptrace_sig_ignorable,        /* b_sig_ignorable */
 269         lx_savecontext,                 /* b_savecontext */
 270 #if defined(_SYSCALL32_IMPL)
 271         lx_savecontext32,               /* b_savecontext32 */
 272 #endif
 273         lx_restorecontext,              /* b_restorecontext */
 274         lx_sendsig_stack,               /* b_sendsig_stack */
 275         lx_sendsig,                     /* b_sendsig */
 276         lx_setid_clear,                 /* b_setid_clear */
 277 #if defined(_LP64)
 278         lx_pagefault                    /* b_pagefault */
 279 #else
 280         NULL
 281 #endif
 282 };
 283 
 284 struct brand_mach_ops lx_mops = {
 285         NULL,
 286         NULL,
 287         NULL,
 288         NULL,
 289         NULL,
 290         lx_fixsegreg,
 291         lx_fsbase
 292 };
 293 
 294 struct brand lx_brand = {
 295         BRAND_VER_1,
 296         "lx",
 297         &lx_brops,
 298         &lx_mops,
 299         sizeof (struct lx_proc_data)
 300 };
 301 
 302 static struct modlbrand modlbrand = {
 303         &mod_brandops, "lx brand", &lx_brand
 304 };
 305 
 306 static struct modlinkage modlinkage = {
 307         MODREV_1, (void *)&modlbrand, NULL
 308 };
 309 
 310 void
 311 lx_proc_exit(proc_t *p)
 312 {
 313         lx_proc_data_t *lxpd;
 314         proc_t *cp;
 315 
 316         mutex_enter(&p->p_lock);
 317         VERIFY(lxpd = ptolxproc(p));
 318         if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
 319                 mutex_exit(&p->p_lock);
 320                 return;
 321         }
 322         mutex_exit(&p->p_lock);
 323 
 324         /* Check for children which desire notification of parental death. */
 325         mutex_enter(&pidlock);
 326         for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
 327                 mutex_enter(&cp->p_lock);
 328                 if ((lxpd = ptolxproc(cp)) == NULL) {
 329                         mutex_exit(&cp->p_lock);
 330                         continue;
 331                 }
 332                 if (lxpd->l_parent_deathsig != 0) {
 333                         sigtoproc(p, NULL, lxpd->l_parent_deathsig);
 334                 }
 335                 mutex_exit(&cp->p_lock);
 336         }
 337         mutex_exit(&pidlock);
 338 }
 339 
 340 void
 341 lx_setbrand(proc_t *p)
 342 {
 343         /* Send SIGCHLD to parent by default when child exits */
 344         ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
 345 }
 346 
 347 /* ARGSUSED */
 348 int
 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
 350 {
 351         char vers[LX_VERS_MAX];
 352 
 353         if (attr == LX_KERN_VERSION_NUM) {
 354                 if (bufsize > (LX_VERS_MAX - 1))
 355                         return (ERANGE);
 356                 bzero(vers, LX_VERS_MAX);
 357                 if (copyin(buf, &vers, bufsize) != 0)
 358                         return (EFAULT);
 359                 lx_set_kern_version(zone, vers);
 360                 return (0);
 361         }
 362         return (EINVAL);
 363 }
 364 
 365 /* ARGSUSED */
 366 int
 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
 368 {
 369         if (attr == LX_KERN_VERSION_NUM) {
 370                 if (*bufsize < LX_VERS_MAX)
 371                         return (ERANGE);
 372                 if (copyout(lx_get_zone_kern_version(curzone), buf,
 373                     LX_VERS_MAX) != 0)
 374                         return (EFAULT);
 375                 *bufsize = LX_VERS_MAX;
 376                 return (0);
 377         }
 378         return (-EINVAL);
 379 }
 380 
 381 uint32_t
 382 lx_map32limit(proc_t *p)
 383 {
 384         /*
 385          * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
 386          * allow mappings in the first 31 bits.  This was a nuance in the
 387          * original Linux implementation circa 2002, and applications have
 388          * come to depend on its behavior.
 389          *
 390          * This is only relevant for 64-bit processes.
 391          */
 392         if (p->p_model == DATAMODEL_LP64)
 393                 return (1 << 31);
 394 
 395         return ((uint32_t)USERLIMIT32);
 396 }
 397 
 398 void
 399 lx_brand_systrace_enable(void)
 400 {
 401         VERIFY(!lx_systrace_enabled);
 402 
 403         lx_systrace_enabled = 1;
 404 }
 405 
 406 void
 407 lx_brand_systrace_disable(void)
 408 {
 409         VERIFY(lx_systrace_enabled);
 410 
 411         lx_systrace_enabled = 0;
 412 }
 413 
 414 void
 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
 416 {
 417         VERIFY(lwpd->br_ntv_stack != 0);
 418 
 419         /*
 420          * The "brand-lx-set-ntv-stack-current" probe has arguments:
 421          *   arg0: stack pointer before change
 422          *   arg1: stack pointer after change
 423          *   arg2: current stack base
 424          */
 425         DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
 426             uintptr_t, lwpd->br_ntv_stack_current,
 427             uintptr_t, new_sp,
 428             uintptr_t, lwpd->br_ntv_stack);
 429 
 430         lwpd->br_ntv_stack_current = new_sp;
 431 }
 432 
 433 #if defined(_LP64)
 434 static int
 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
 436     enum seg_rw rw)
 437 {
 438         int syscall_num;
 439 
 440         /*
 441          * We only want to handle a very specific set of circumstances.
 442          * Namely: this is a 64-bit LX-branded process attempting to execute an
 443          * address in a page for which it does not have a valid mapping.  If
 444          * this is not the case, we bail out as fast as possible.
 445          */
 446         VERIFY(PROC_IS_BRANDED(p));
 447         if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
 448             DATAMODEL_NATIVE) {
 449                 return (-1);
 450         }
 451 
 452         if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
 453                 return (-1);
 454         }
 455 
 456         /*
 457          * This is a valid vsyscall address.  We service the system call and
 458          * return 0 to signal that the pagefault has been handled completely.
 459          */
 460         lx_vsyscall_enter(p, lwp, syscall_num);
 461         return (0);
 462 }
 463 #endif
 464 
 465 /*
 466  * This hook runs prior to sendsig() processing and allows us to nominate
 467  * an alternative stack pointer for delivery of the signal handling frame.
 468  * Critically, this routine should _not_ modify any LWP state as the
 469  * savecontext() does not run until after this hook.
 470  */
 471 static caddr_t
 472 lx_sendsig_stack(int sig)
 473 {
 474         klwp_t *lwp = ttolwp(curthread);
 475         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 476 
 477         /*
 478          * We want to take signal delivery on the native stack, but only if
 479          * one has been allocated and installed for this LWP.
 480          */
 481         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 482                 /*
 483                  * The program is not running on the native stack.  Return
 484                  * the native stack pointer from our brand-private data so
 485                  * that we may switch to it for signal handling.
 486                  */
 487                 return ((caddr_t)lwpd->br_ntv_stack_current);
 488         } else {
 489                 struct regs *rp = lwptoregs(lwp);
 490 
 491                 /*
 492                  * Either the program is already running on the native stack,
 493                  * or one has not yet been allocated for this LWP.  Use the
 494                  * current stack pointer value.
 495                  */
 496                 return ((caddr_t)rp->r_sp);
 497         }
 498 }
 499 
 500 /*
 501  * This hook runs after sendsig() processing and allows us to update the
 502  * per-LWP mode flags for system calls and stacks.  The pre-signal
 503  * context has already been saved and delivered to the user at this point.
 504  */
 505 static void
 506 lx_sendsig(int sig)
 507 {
 508         klwp_t *lwp = ttolwp(curthread);
 509         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 510         struct regs *rp = lwptoregs(lwp);
 511 
 512         switch (lwpd->br_stack_mode) {
 513         case LX_STACK_MODE_BRAND:
 514         case LX_STACK_MODE_NATIVE:
 515                 /*
 516                  * In lx_sendsig_stack(), we nominated a stack pointer from the
 517                  * native stack.  Update the stack mode, and the current in-use
 518                  * extent of the native stack, accordingly:
 519                  */
 520                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 521                 lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
 522 
 523                 /*
 524                  * Fix up segment registers, etc.
 525                  */
 526                 lx_switch_to_native(lwp);
 527                 break;
 528 
 529         default:
 530                 /*
 531                  * Otherwise, the brand library has not yet installed the
 532                  * alternate stack for this LWP.  Signals will be handled on
 533                  * the regular stack thread.
 534                  */
 535                 return;
 536         }
 537 }
 538 
 539 /*
 540  * This hook runs prior to the context restoration, allowing us to take action
 541  * or modify the context before it is loaded.
 542  */
 543 static void
 544 lx_restorecontext(ucontext_t *ucp)
 545 {
 546         klwp_t *lwp = ttolwp(curthread);
 547         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 548         uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
 549         caddr_t sp = ucp->uc_brand_data[1];
 550 
 551         /*
 552          * We have a saved native stack pointer value that we must restore
 553          * into the per-LWP data.
 554          */
 555         if (flags & LX_UC_RESTORE_NATIVE_SP) {
 556                 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
 557         }
 558 
 559         /*
 560          * We do not wish to restore the value of uc_link in this context,
 561          * so replace it with the value currently in the LWP.
 562          */
 563         if (flags & LX_UC_IGNORE_LINK) {
 564                 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
 565         }
 566 
 567         /*
 568          * Restore the stack mode:
 569          */
 570         if (flags & LX_UC_STACK_NATIVE) {
 571                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 572         } else if (flags & LX_UC_STACK_BRAND) {
 573                 lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
 574         }
 575 
 576 #if defined(__amd64)
 577         /*
 578          * Override the fs/gsbase in the context with the value provided
 579          * through the Linux arch_prctl(2) system call.
 580          */
 581         if (flags & LX_UC_STACK_BRAND) {
 582                 if (lwpd->br_lx_fsbase != 0) {
 583                         ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
 584                 }
 585                 if (lwpd->br_lx_gsbase != 0) {
 586                         ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
 587                 }
 588         }
 589 #endif
 590 }
 591 
 592 static void
 593 lx_savecontext(ucontext_t *ucp)
 594 {
 595         klwp_t *lwp = ttolwp(curthread);
 596         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 597         uintptr_t flags = 0;
 598 
 599         /*
 600          * The ucontext_t affords us three private pointer-sized members in
 601          * "uc_brand_data".  We pack a variety of flags into the first element,
 602          * and an optional stack pointer in the second element.  The flags
 603          * determine which stack pointer (native or brand), if any, is stored
 604          * in the second element.  The third element may contain the system
 605          * call number; this is analogous to the "orig_[er]ax" member of a
 606          * Linux "user_regs_struct".
 607          */
 608 
 609         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 610             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 611                 /*
 612                  * Record the value of the native stack pointer to restore
 613                  * when returning to this branded context:
 614                  */
 615                 flags |= LX_UC_RESTORE_NATIVE_SP;
 616                 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
 617         }
 618 
 619         /*
 620          * Save the stack mode:
 621          */
 622         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 623                 flags |= LX_UC_STACK_NATIVE;
 624         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 625                 flags |= LX_UC_STACK_BRAND;
 626         }
 627 
 628         /*
 629          * If we might need to restart this system call, save that information
 630          * in the context:
 631          */
 632         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 633                 ucp->uc_brand_data[2] =
 634                     (void *)(uintptr_t)lwpd->br_syscall_num;
 635                 if (lwpd->br_syscall_restart) {
 636                         flags |= LX_UC_RESTART_SYSCALL;
 637                 }
 638         } else {
 639                 ucp->uc_brand_data[2] = NULL;
 640         }
 641 
 642         ucp->uc_brand_data[0] = (void *)flags;
 643 }
 644 
 645 #if defined(_SYSCALL32_IMPL)
 646 static void
 647 lx_savecontext32(ucontext32_t *ucp)
 648 {
 649         klwp_t *lwp = ttolwp(curthread);
 650         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 651         unsigned int flags = 0;
 652 
 653         /*
 654          * The ucontext_t affords us three private pointer-sized members in
 655          * "uc_brand_data".  We pack a variety of flags into the first element,
 656          * and an optional stack pointer in the second element.  The flags
 657          * determine which stack pointer (native or brand), if any, is stored
 658          * in the second element.  The third element may contain the system
 659          * call number; this is analogous to the "orig_[er]ax" member of a
 660          * Linux "user_regs_struct".
 661          */
 662 
 663         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 664             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 665                 /*
 666                  * Record the value of the native stack pointer to restore
 667                  * when returning to this branded context:
 668                  */
 669                 flags |= LX_UC_RESTORE_NATIVE_SP;
 670                 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
 671         }
 672 
 673         /*
 674          * Save the stack mode:
 675          */
 676         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 677                 flags |= LX_UC_STACK_NATIVE;
 678         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 679                 flags |= LX_UC_STACK_BRAND;
 680         }
 681 
 682         /*
 683          * If we might need to restart this system call, save that information
 684          * in the context:
 685          */
 686         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 687                 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
 688                 if (lwpd->br_syscall_restart) {
 689                         flags |= LX_UC_RESTART_SYSCALL;
 690                 }
 691         } else {
 692                 ucp->uc_brand_data[2] = NULL;
 693         }
 694 
 695         ucp->uc_brand_data[0] = flags;
 696 }
 697 #endif
 698 
 699 void
 700 lx_init_brand_data(zone_t *zone)
 701 {
 702         lx_zone_data_t *data;
 703         ASSERT(zone->zone_brand == &lx_brand);
 704         ASSERT(zone->zone_brand_data == NULL);
 705         data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
 706         /*
 707          * Set the default lxzd_kernel_version to 2.4.
 708          * This can be changed by a call to setattr() during zone boot.
 709          */
 710         (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
 711 
 712         /*
 713          * Linux is not at all picky about address family when it comes to
 714          * supporting interface-related ioctls.  To mimic this behavior, we'll
 715          * attempt those ioctls against a ksocket configured for that purpose.
 716          */
 717         (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0,
 718             0, zone->zone_kcred);
 719 
 720         zone->zone_brand_data = data;
 721 
 722         /*
 723          * In Linux, if the init(1) process terminates the system panics.
 724          * The zone must reboot to simulate this behaviour.
 725          */
 726         zone->zone_reboot_on_init_exit = B_TRUE;
 727 }
 728 
 729 void
 730 lx_free_brand_data(zone_t *zone)
 731 {
 732         lx_zone_data_t *data = ztolxzd(zone);
 733         ASSERT(data != NULL);
 734         if (data->lxzd_ioctl_sock != NULL) {
 735                 /*
 736                  * Since zone_kcred has been cleaned up already, close the
 737                  * socket using the global kcred.
 738                  */
 739                 ksocket_close(data->lxzd_ioctl_sock, kcred);
 740                 data->lxzd_ioctl_sock = NULL;
 741         }
 742         zone->zone_brand_data = NULL;
 743         kmem_free(data, sizeof (*data));
 744 }
 745 
 746 void
 747 lx_unsupported(char *dmsg)
 748 {
 749         lx_proc_data_t *pd = ttolxproc(curthread);
 750 
 751         DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
 752 
 753         if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
 754                 /*
 755                  * If this process was run with strict mode enabled
 756                  * (via LX_STRICT in the environment), we mark this
 757                  * LWP as having triggered an unsupported behaviour.
 758                  * This flag will be checked at an appropriate point
 759                  * by lx_check_strict_failure().
 760                  */
 761                 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
 762 
 763                 lwpd->br_strict_failure = B_TRUE;
 764         }
 765 }
 766 
 767 void
 768 lx_check_strict_failure(lx_lwp_data_t *lwpd)
 769 {
 770         proc_t *p;
 771 
 772         if (!lwpd->br_strict_failure) {
 773                 return;
 774         }
 775 
 776         lwpd->br_strict_failure = B_FALSE;
 777 
 778         /*
 779          * If this process is operating in strict mode (via LX_STRICT in
 780          * the environment), and has triggered a call to
 781          * lx_unsupported(), we drop SIGSYS on it as we return.
 782          */
 783         p = curproc;
 784         mutex_enter(&p->p_lock);
 785         sigtoproc(p, curthread, SIGSYS);
 786         mutex_exit(&p->p_lock);
 787 }
 788 
 789 void
 790 lx_trace_sysenter(int syscall_num, uintptr_t *args)
 791 {
 792         if (lx_systrace_enabled) {
 793                 VERIFY(lx_systrace_entry_ptr != NULL);
 794 
 795                 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
 796                     args[2], args[3], args[4], args[5]);
 797         }
 798 }
 799 
 800 void
 801 lx_trace_sysreturn(int syscall_num, long ret)
 802 {
 803         if (lx_systrace_enabled) {
 804                 VERIFY(lx_systrace_return_ptr != NULL);
 805 
 806                 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
 807         }
 808 }
 809 
 810 /*
 811  * Get the addresses of the user-space system call handler and attach it to
 812  * the proc structure. Returning 0 indicates success; the value returned
 813  * by the system call is the value stored in rval. Returning a non-zero
 814  * value indicates a failure; the value returned is used to set errno, -1
 815  * is returned from the syscall and the contents of rval are ignored. To
 816  * set errno and have the syscall return a value other than -1 we can
 817  * manually set errno and rval and return 0.
 818  */
 819 int
 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
 821     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
 822 {
 823         kthread_t *t = curthread;
 824         klwp_t *lwp = ttolwp(t);
 825         proc_t *p = ttoproc(t);
 826         lx_proc_data_t *pd;
 827         struct termios *termios;
 828         uint_t termios_len;
 829         int error;
 830         int code;
 831         int sig;
 832         lx_brand_registration_t reg;
 833         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 834 
 835         /*
 836          * There is one operation that is suppored for non-branded
 837          * process.  B_EXEC_BRAND.  This is the equilivant of an
 838          * exec call, but the new process that is created will be
 839          * a branded process.
 840          */
 841         if (cmd == B_EXEC_BRAND) {
 842                 VERIFY(p->p_zone != NULL);
 843                 VERIFY(p->p_zone->zone_brand == &lx_brand);
 844                 return (exec_common(
 845                     (char *)arg1, (const char **)arg2, (const char **)arg3,
 846                     EBA_BRAND));
 847         }
 848 
 849         /* For all other operations this must be a branded process. */
 850         if (p->p_brand == NULL)
 851                 return (ENOSYS);
 852 
 853         VERIFY(p->p_brand == &lx_brand);
 854         VERIFY(p->p_brand_data != NULL);
 855 
 856         switch (cmd) {
 857         case B_REGISTER:
 858                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 859                         lx_print("stack mode was not PREINIT during "
 860                             "REGISTER\n");
 861                         return (EINVAL);
 862                 }
 863 
 864                 if (p->p_model == DATAMODEL_NATIVE) {
 865                         if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
 866                                 lx_print("Failed to copyin brand registration "
 867                                     "at 0x%p\n", (void *)arg1);
 868                                 return (EFAULT);
 869                         }
 870                 }
 871 #ifdef _LP64
 872                 else {
 873                         /* 32-bit userland on 64-bit kernel */
 874                         lx_brand_registration32_t reg32;
 875 
 876                         if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
 877                                 lx_print("Failed to copyin brand registration "
 878                                     "at 0x%p\n", (void *)arg1);
 879                                 return (EFAULT);
 880                         }
 881 
 882                         reg.lxbr_version = (uint_t)reg32.lxbr_version;
 883                         reg.lxbr_handler =
 884                             (void *)(uintptr_t)reg32.lxbr_handler;
 885                         reg.lxbr_flags = reg32.lxbr_flags;
 886                 }
 887 #endif
 888 
 889                 if (reg.lxbr_version != LX_VERSION_1) {
 890                         lx_print("Invalid brand library version (%u)\n",
 891                             reg.lxbr_version);
 892                         return (EINVAL);
 893                 }
 894 
 895                 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
 896                         lx_print("Invalid brand flags (%u)\n",
 897                             reg.lxbr_flags);
 898                         return (EINVAL);
 899                 }
 900 
 901                 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
 902                     (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
 903                 pd = p->p_brand_data;
 904                 pd->l_handler = (uintptr_t)reg.lxbr_handler;
 905                 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
 906 
 907                 return (0);
 908 
 909         case B_TTYMODES:
 910                 /* This is necessary for emulating TCGETS ioctls. */
 911                 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
 912                     DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
 913                     &termios_len) != DDI_SUCCESS)
 914                         return (EIO);
 915 
 916                 ASSERT(termios_len == sizeof (*termios));
 917 
 918                 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
 919                         ddi_prop_free(termios);
 920                         return (EFAULT);
 921                 }
 922 
 923                 ddi_prop_free(termios);
 924                 return (0);
 925 
 926         case B_ELFDATA:
 927                 pd = curproc->p_brand_data;
 928                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 929                         if (copyout(&pd->l_elf_data, (void *)arg1,
 930                             sizeof (lx_elf_data_t)) != 0) {
 931                                 return (EFAULT);
 932                         }
 933                 }
 934 #if defined(_LP64)
 935                 else {
 936                         /* 32-bit userland on 64-bit kernel */
 937                         lx_elf_data32_t led32;
 938 
 939                         led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
 940                         led32.ed_phent = (int)pd->l_elf_data.ed_phent;
 941                         led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
 942                         led32.ed_entry = (int)pd->l_elf_data.ed_entry;
 943                         led32.ed_base = (int)pd->l_elf_data.ed_base;
 944                         led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
 945 
 946                         if (copyout(&led32, (void *)arg1,
 947                             sizeof (led32)) != 0) {
 948                                 return (EFAULT);
 949                         }
 950                 }
 951 #endif
 952                 return (0);
 953 
 954         case B_EXEC_NATIVE:
 955                 return (exec_common((char *)arg1, (const char **)arg2,
 956                     (const char **)arg3, EBA_NATIVE));
 957 
 958         /*
 959          * The B_TRUSS_POINT subcommand is used so that we can make a no-op
 960          * syscall for debugging purposes (dtracing) from within the user-level
 961          * emulation.
 962          */
 963         case B_TRUSS_POINT:
 964                 return (0);
 965 
 966         case B_LPID_TO_SPAIR: {
 967                 /*
 968                  * Given a Linux pid as arg1, return the Solaris pid in arg2 and
 969                  * the Solaris LWP in arg3.  We also translate pid 1 (which is
 970                  * hardcoded in many applications) to the zone's init process.
 971                  */
 972                 pid_t s_pid;
 973                 id_t s_tid;
 974 
 975                 if ((pid_t)arg1 == 1) {
 976                         s_pid = p->p_zone->zone_proc_initpid;
 977                         /* handle the dead/missing init(1M) case */
 978                         if (s_pid == -1)
 979                                 s_pid = 1;
 980                         s_tid = 1;
 981                 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
 982                         return (ESRCH);
 983                 }
 984 
 985                 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
 986                     copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
 987                         return (EFAULT);
 988                 }
 989 
 990                 return (0);
 991         }
 992 
 993         case B_SIGEV_THREAD_ID: {
 994                 /*
 995                  * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
 996                  * notification method. This mechanism is only meant
 997                  * for userland threading libraries such as glibc and
 998                  * is documented as such. Therefore, assume this is
 999                  * only ever invoked for the purpose of alerting a
1000                  * Linux threading library. Assume that the tid is a
1001                  * member of the caller's process and the signal
1002                  * number is valid. See lx_sigev_thread_id() for the
1003                  * userland side of this emulation.
1004                  *
1005                  * arg1 -- Linux tid
1006                  * arg2 -- signal number
1007                  * arg3 -- union sigval
1008                  */
1009 
1010                 proc_t *pp, *cp = curproc;
1011                 int native_sig = ltos_signo[(int)arg2];
1012                 pid_t native_pid;
1013                 int native_tid;
1014                 sigqueue_t *sqp;
1015 
1016                 lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid);
1017 
1018                 mutex_enter(&pidlock);
1019                 if (((pp = prfind(native_pid)) == NULL) || (pp->p_stat == SIDL)) {
1020                         mutex_exit(&pidlock);
1021                         return (ESRCH);
1022                 }
1023                 mutex_enter(&pp->p_lock);
1024                 mutex_exit(&pidlock);
1025 
1026                 if ((t = idtot(pp, native_tid)) == NULL) {
1027                         mutex_exit(&pp->p_lock);
1028                         return (ESRCH);
1029                 }
1030 
1031                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
1032                 sqp->sq_info.si_signo = native_sig;
1033                 sqp->sq_info.si_code = SI_TIMER;
1034                 sqp->sq_info.si_pid = cp->p_pid;
1035                 sqp->sq_info.si_zoneid = getzoneid();
1036                 sqp->sq_info.si_uid = crgetruid(CRED());
1037                 sqp->sq_info.si_value = (union sigval)((void *)arg3);
1038                 sigaddqa(pp, t, sqp);
1039 
1040                 mutex_exit(&pp->p_lock);
1041                 return (0);
1042         }
1043 
1044         case B_SET_AFFINITY_MASK:
1045         case B_GET_AFFINITY_MASK:
1046                 /*
1047                  * Retrieve or store the CPU affinity mask for the
1048                  * requested linux pid.
1049                  *
1050                  * arg1 is a linux PID (0 means curthread).
1051                  * arg2 is the size of the given mask.
1052                  * arg3 is the address of the affinity mask.
1053                  */
1054                 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
1055 
1056         case B_PTRACE_STOP_FOR_OPT:
1057                 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
1058                     B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
1059 
1060         case B_PTRACE_CLONE_BEGIN:
1061                 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
1062                     B_FALSE : B_TRUE));
1063 
1064         case B_PTRACE_KERNEL:
1065                 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4));
1066 
1067         case B_HELPER_WAITID: {
1068                 idtype_t idtype = (idtype_t)arg1;
1069                 id_t id = (id_t)arg2;
1070                 siginfo_t *infop = (siginfo_t *)arg3;
1071                 int options = (int)arg4;
1072 
1073                 lwpd = ttolxlwp(curthread);
1074 
1075                 /*
1076                  * Our brand-specific waitid helper only understands a subset of
1077                  * the possible idtypes.  Ensure we keep to that subset here:
1078                  */
1079                 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
1080                         return (EINVAL);
1081                 }
1082 
1083                 /*
1084                  * Enable the return of emulated ptrace(2) stop conditions
1085                  * through lx_waitid_helper, and stash the Linux-specific
1086                  * extra waitid() flags.
1087                  */
1088                 lwpd->br_waitid_emulate = B_TRUE;
1089                 lwpd->br_waitid_flags = (int)arg5;
1090 
1091 #if defined(_SYSCALL32_IMPL)
1092                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1093                         return (waitsys32(idtype, id, infop, options));
1094                 } else
1095 #endif
1096                 {
1097                         return (waitsys(idtype, id, infop, options));
1098                 }
1099 
1100                 lwpd->br_waitid_emulate = B_FALSE;
1101                 lwpd->br_waitid_flags = 0;
1102 
1103                 return (0);
1104         }
1105 
1106         case B_UNSUPPORTED: {
1107                 char dmsg[256];
1108 
1109                 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
1110                         lx_print("Failed to copyin unsupported msg "
1111                             "at 0x%p\n", (void *)arg1);
1112                         return (EFAULT);
1113                 }
1114                 dmsg[255] = '\0';
1115                 lx_unsupported(dmsg);
1116 
1117                 lx_check_strict_failure(lwpd);
1118 
1119                 return (0);
1120         }
1121 
1122         case B_STORE_ARGS: {
1123                 /*
1124                  * B_STORE_ARGS subcommand
1125                  * arg1 = address of struct to be copied in
1126                  * arg2 = size of the struct being copied in
1127                  * arg3-arg6 ignored
1128                  * rval = the amount of data copied.
1129                  */
1130                 void *buf;
1131 
1132                 /* only have upper limit because arg2 is unsigned */
1133                 if (arg2 > LX_BR_ARGS_SIZE_MAX) {
1134                         return (EINVAL);
1135                 }
1136 
1137                 buf = kmem_alloc(arg2, KM_SLEEP);
1138                 if (copyin((void *)arg1, buf, arg2) != 0) {
1139                         lx_print("Failed to copyin scall arg at 0x%p\n",
1140                             (void *) arg1);
1141                         kmem_free(buf, arg2);
1142                         /*
1143                          * Purposely not setting br_scall_args to NULL
1144                          * to preserve data for debugging.
1145                          */
1146                         return (EFAULT);
1147                 }
1148 
1149                 if (lwpd->br_scall_args != NULL) {
1150                         ASSERT(lwpd->br_args_size > 0);
1151                         kmem_free(lwpd->br_scall_args,
1152                             lwpd->br_args_size);
1153                 }
1154 
1155                 lwpd->br_scall_args = buf;
1156                 lwpd->br_args_size = arg2;
1157                 *rval = arg2;
1158                 return (0);
1159         }
1160 
1161         case B_HELPER_CLONE:
1162                 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
1163                     (void *)arg4));
1164 
1165         case B_HELPER_SETGROUPS:
1166                 return (lx_helper_setgroups(arg1, (gid_t *)arg2));
1167 
1168         case B_HELPER_SIGQUEUE:
1169                 return (lx_helper_rt_sigqueueinfo(arg1, arg2,
1170                     (siginfo_t *)arg3));
1171 
1172         case B_HELPER_TGSIGQUEUE:
1173                 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
1174                     (siginfo_t *)arg4));
1175 
1176         case B_SET_THUNK_PID:
1177                 lwpd->br_lx_thunk_pid = arg1;
1178                 return (0);
1179 
1180         case B_GETPID:
1181                 /*
1182                  * The usermode clone(2) code needs to be able to call
1183                  * lx_getpid() from native code:
1184                  */
1185                 *rval = lx_getpid();
1186                 return (0);
1187 
1188         case B_SET_NATIVE_STACK:
1189                 /*
1190                  * B_SET_NATIVE_STACK subcommand
1191                  * arg1 = the base of the stack to use for emulation
1192                  */
1193                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
1194                         lx_print("B_SET_NATIVE_STACK when stack was already "
1195                             "set to %p\n", (void *)arg1);
1196                         return (EEXIST);
1197                 }
1198 
1199                 /*
1200                  * We move from the PREINIT state, where we have no brand
1201                  * emulation stack, to the INIT state.  Here, we are still
1202                  * running on what will become the BRAND stack, but are running
1203                  * emulation (i.e. native) code.  Once the initialisation
1204                  * process for this thread has finished, we will jump to
1205                  * brand-specific code, while moving to the BRAND mode.
1206                  *
1207                  * When a new LWP is created, lx_initlwp() will clear the
1208                  * stack data.  If that LWP is actually being duplicated
1209                  * into a child process by fork(2), lx_forklwp() will copy
1210                  * it so that the cloned thread will keep using the same
1211                  * alternate stack.
1212                  */
1213                 lwpd->br_ntv_stack = arg1;
1214                 lwpd->br_stack_mode = LX_STACK_MODE_INIT;
1215                 lx_lwp_set_native_stack_current(lwpd, arg1);
1216 
1217                 return (0);
1218 
1219         case B_GET_CURRENT_CONTEXT:
1220                 /*
1221                  * B_GET_CURRENT_CONTEXT subcommand:
1222                  * arg1 = address for pointer to current ucontext_t
1223                  */
1224 
1225 #if defined(_SYSCALL32_IMPL)
1226                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1227                         caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
1228 
1229                         error = copyout(&addr, (void *)arg1, sizeof (addr));
1230                 } else
1231 #endif
1232                 {
1233                         error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
1234                             sizeof (lwp->lwp_oldcontext));
1235                 }
1236 
1237                 return (error != 0 ? EFAULT : 0);
1238 
1239         case B_JUMP_TO_LINUX:
1240                 /*
1241                  * B_JUMP_TO_LINUX subcommand:
1242                  * arg1 = ucontext_t pointer for jump state
1243                  */
1244 
1245                 if (arg1 == NULL)
1246                         return (EINVAL);
1247 
1248                 switch (lwpd->br_stack_mode) {
1249                 case LX_STACK_MODE_NATIVE: {
1250                         struct regs *rp = lwptoregs(lwp);
1251 
1252                         /*
1253                          * We are on the NATIVE stack, so we must preserve
1254                          * the extent of that stack.  The pointer will be
1255                          * reset by a future setcontext().
1256                          */
1257                         lx_lwp_set_native_stack_current(lwpd,
1258                             (uintptr_t)rp->r_sp);
1259                         break;
1260                 }
1261 
1262                 case LX_STACK_MODE_INIT:
1263                         /*
1264                          * The LWP is transitioning to Linux code for the first
1265                          * time.
1266                          */
1267                         break;
1268 
1269                 case LX_STACK_MODE_PREINIT:
1270                         /*
1271                          * This LWP has not installed an alternate stack for
1272                          * usermode emulation handling.
1273                          */
1274                         return (ENOENT);
1275 
1276                 case LX_STACK_MODE_BRAND:
1277                         /*
1278                          * The LWP should not be on the BRAND stack.
1279                          */
1280                         exit(CLD_KILLED, SIGSYS);
1281                         return (0);
1282                 }
1283 
1284                 /*
1285                  * Transfer control to Linux:
1286                  */
1287                 return (lx_runexe(lwp, (void *)arg1));
1288 
1289         case B_EMULATION_DONE:
1290                 /*
1291                  * B_EMULATION_DONE subcommand:
1292                  * arg1 = ucontext_t * to restore
1293                  * arg2 = system call number
1294                  * arg3 = return code
1295                  * arg4 = if operation failed, the errno value
1296                  */
1297 
1298                 /*
1299                  * The first part of this operation is a setcontext() to
1300                  * restore the register state to the copy we preserved
1301                  * before vectoring to the usermode emulation routine.
1302                  * If that fails, we return (hopefully) to the emulation
1303                  * routine and it will handle the error.
1304                  */
1305 #if (_SYSCALL32_IMPL)
1306                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1307                         error = getsetcontext32(SETCONTEXT, (void *)arg1);
1308                 } else
1309 #endif
1310                 {
1311                         error = getsetcontext(SETCONTEXT, (void *)arg1);
1312                 }
1313 
1314                 if (error != 0) {
1315                         return (error);
1316                 }
1317 
1318                 /*
1319                  * The saved Linux context has been restored.  We handle the
1320                  * return value or errno with code common to the in-kernel
1321                  * system call emulation.
1322                  */
1323                 if ((error = (int)arg4) != 0) {
1324                         /*
1325                          * lx_syscall_return() looks at the errno in the LWP,
1326                          * so set it here:
1327                          */
1328                         set_errno(error);
1329                 }
1330                 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
1331 
1332                 return (0);
1333 
1334         case B_EXIT_AS_SIG:
1335                 code = CLD_KILLED;
1336                 sig = (int)arg1;
1337                 proc_is_exiting(p);
1338                 if (exitlwps(1) != 0) {
1339                         mutex_enter(&p->p_lock);
1340                         lwp_exit();
1341                 }
1342                 ttolwp(curthread)->lwp_cursig = sig;
1343                 if (sig == SIGSEGV) {
1344                         if (core(sig, 0) == 0)
1345                                 code = CLD_DUMPED;
1346                 }
1347                 exit(code, sig);
1348                 /* NOTREACHED */
1349                 break;
1350         }
1351 
1352         return (EINVAL);
1353 }
1354 
1355 char *
1356 lx_get_zone_kern_version(zone_t *zone)
1357 {
1358         return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
1359 }
1360 
1361 void
1362 lx_set_kern_version(zone_t *zone, char *vers)
1363 {
1364         lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
1365 
1366         (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
1367 }
1368 
1369 /*
1370  * Compare linux kernel version to the one set for the zone.
1371  * Returns greater than 0 if zone version is higher, less than 0 if the zone
1372  * version is lower, and 0 if the version are equal.
1373  */
1374 int
1375 lx_kern_version_cmp(zone_t *zone, const char *vers)
1376 {
1377         int zvers[3] = {0, 0, 0};
1378         int cvers[3] = {0, 0, 0};
1379         int i;
1380 
1381         VERIFY(zone->zone_brand == &lx_brand);
1382 
1383         (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
1384             &zvers[1], &zvers[2]);
1385         (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
1386 
1387         for (i = 0; i < 3; i++) {
1388                 if (zvers[i] > cvers[i]) {
1389                         return (1);
1390                 } else if (zvers[i] < cvers[i]) {
1391                         return (-1);
1392                 }
1393         }
1394         return (0);
1395 }
1396 
1397 /*
1398  * Linux unconditionally removes the setuid and setgid bits when changing
1399  * file ownership.  This brand hook overrides the illumos native behaviour,
1400  * which is based on the PRIV_FILE_SETID privilege.
1401  */
1402 static int
1403 lx_setid_clear(vattr_t *vap, cred_t *cr)
1404 {
1405         if (S_ISDIR(vap->va_mode)) {
1406                 return (0);
1407         }
1408 
1409         if (vap->va_mode & S_ISUID) {
1410                 vap->va_mask |= AT_MODE;
1411                 vap->va_mode &= ~S_ISUID;
1412         }
1413         if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1414                 vap->va_mask |= AT_MODE;
1415                 vap->va_mode &= ~S_ISGID;
1416         }
1417 
1418         return (0);
1419 }
1420 
1421 /*
1422  * Copy the per-process brand data from a parent proc to a child.
1423  */
1424 void
1425 lx_copy_procdata(proc_t *child, proc_t *parent)
1426 {
1427         lx_proc_data_t *cpd = child->p_brand_data;
1428         lx_proc_data_t *ppd = parent->p_brand_data;
1429 
1430         VERIFY(parent->p_brand == &lx_brand);
1431         VERIFY(child->p_brand == &lx_brand);
1432         VERIFY(ppd != NULL);
1433         VERIFY(cpd != NULL);
1434 
1435         *cpd = *ppd;
1436 
1437         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
1438         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
1439 
1440         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
1441         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
1442 
1443         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
1444         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
1445 
1446         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
1447         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
1448 }
1449 
1450 #if defined(_LP64)
1451 static void
1452 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
1453 {
1454         bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
1455         dst->e_type =                src->e_type;
1456         dst->e_machine =     src->e_machine;
1457         dst->e_version =     src->e_version;
1458         dst->e_entry =               src->e_entry;
1459         dst->e_phoff =               src->e_phoff;
1460         dst->e_shoff =               src->e_shoff;
1461         dst->e_flags =               src->e_flags;
1462         dst->e_ehsize =              src->e_ehsize;
1463         dst->e_phentsize =   src->e_phentsize;
1464         dst->e_phnum =               src->e_phnum;
1465         dst->e_shentsize =   src->e_shentsize;
1466         dst->e_shnum =               src->e_shnum;
1467         dst->e_shstrndx =    src->e_shstrndx;
1468 }
1469 #endif /* _LP64 */
1470 
1471 static void
1472 restoreexecenv(struct execenv *ep, stack_t *sp)
1473 {
1474         klwp_t *lwp = ttolwp(curthread);
1475 
1476         setexecenv(ep);
1477         lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
1478         lwp->lwp_sigaltstack.ss_size = sp->ss_size;
1479         lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
1480 }
1481 
1482 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
1483     long *, int, caddr_t, cred_t *, int *);
1484 
1485 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
1486     long *, int, caddr_t, cred_t *, int *);
1487 
1488 /*
1489  * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
1490  * binaries.
1491  */
1492 static int
1493 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
1494     struct intpdata *idata, int level, long *execsz, int setid,
1495     caddr_t exec_file, struct cred *cred, int *brand_action)
1496 {
1497         int             error;
1498         vnode_t         *nvp;
1499         Ehdr            ehdr;
1500         Addr            uphdr_vaddr;
1501         intptr_t        voffset;
1502         char            *interp = NULL;
1503         uintptr_t       ldaddr = NULL;
1504         int             i;
1505         proc_t          *p = ttoproc(curthread);
1506         klwp_t          *lwp = ttolwp(curthread);
1507         struct execenv  env;
1508         struct execenv  origenv;
1509         stack_t         orig_sigaltstack;
1510         struct user     *up = PTOU(ttoproc(curthread));
1511         lx_elf_data_t   *edp;
1512         char            *lib_path = NULL;
1513 
1514         ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
1515         ASSERT(ttoproc(curthread)->p_brand_data != NULL);
1516 
1517         edp = &ttolxproc(curthread)->l_elf_data;
1518 
1519         if (args->to_model == DATAMODEL_NATIVE) {
1520                 lib_path = LX_LIB_PATH;
1521         }
1522 #if defined(_LP64)
1523         else {
1524                 lib_path = LX_LIB_PATH32;
1525         }
1526 #endif
1527 
1528         /*
1529          * Set the brandname and library name for the new process so that
1530          * elfexec() puts them onto the stack.
1531          */
1532         args->brandname = LX_BRANDNAME;
1533         args->emulator = lib_path;
1534 
1535 #if defined(_LP64)
1536         /*
1537          * To conform with the way Linux lays out the address space, we clamp
1538          * the stack to be the top of the lower region of the x86-64 canonical
1539          * form address space -- which has the side-effect of laying out the
1540          * entire address space in that lower region.  Note that this only
1541          * matters on 64-bit processes (this value will always be greater than
1542          * the size of a 32-bit address space) and doesn't actually affect
1543          * USERLIMIT:  if a Linux-branded processes wishes to map something
1544          * into the top half of the address space, it can do so -- but with
1545          * the user stack starting at the top of the bottom region, those high
1546          * virtual addresses won't be used unless explicitly directed.
1547          */
1548         args->maxstack = lx_maxstack64;
1549 #endif
1550 
1551         /*
1552          * We will first exec the brand library, then map in the linux
1553          * executable and the linux linker.
1554          */
1555         if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
1556             &nvp))) {
1557                 uprintf("%s: not found.", lib_path);
1558                 return (error);
1559         }
1560 
1561         /*
1562          * We will eventually set the p_exec member to be the vnode for the new
1563          * executable when we call setexecenv(). However, if we get an error
1564          * before that call we need to restore the execenv to its original
1565          * values so that when we return to the caller fop_close() works
1566          * properly while cleaning up from the failed exec().  Restoring the
1567          * original value will also properly decrement the 2nd VN_RELE that we
1568          * took on the brand library.
1569          */
1570         origenv.ex_bssbase = p->p_bssbase;
1571         origenv.ex_brkbase = p->p_brkbase;
1572         origenv.ex_brksize = p->p_brksize;
1573         origenv.ex_vp = p->p_exec;
1574         orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
1575         orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
1576         orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
1577 
1578         if (args->to_model == DATAMODEL_NATIVE) {
1579                 error = elfexec(nvp, uap, args, idata, level + 1, execsz,
1580                     setid, exec_file, cred, brand_action);
1581         }
1582 #if defined(_LP64)
1583         else {
1584                 error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
1585                     setid, exec_file, cred, brand_action);
1586         }
1587 #endif
1588         VN_RELE(nvp);
1589         if (error != 0) {
1590                 restoreexecenv(&origenv, &orig_sigaltstack);
1591                 return (error);
1592         }
1593 
1594         /*
1595          * exec-ed in the brand library above.
1596          * The u_auxv vectors are now setup by elfexec to point to the
1597          * brand emulation library and its linker.
1598          */
1599 
1600         bzero(&env, sizeof (env));
1601 
1602         /*
         * map in the Linux executable
1604          */
1605         if (args->to_model == DATAMODEL_NATIVE) {
1606                 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
1607                     &voffset, exec_file, &interp, &env.ex_bssbase,
1608                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1609         }
1610 #if defined(_LP64)
1611         else {
1612                 Elf32_Ehdr      ehdr32;
1613                 Elf32_Addr      uphdr_vaddr32;
1614 
1615                 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
1616                     &voffset, exec_file, &interp, &env.ex_bssbase,
1617                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1618 
1619                 Ehdr32to64(&ehdr32, &ehdr);
1620 
1621                 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1622                         uphdr_vaddr = (Addr)-1;
1623                 else
1624                         uphdr_vaddr = uphdr_vaddr32;
1625         }
1626 #endif
1627         if (error != 0) {
1628                 restoreexecenv(&origenv, &orig_sigaltstack);
1629 
1630                 if (interp != NULL)
1631                         kmem_free(interp, MAXPATHLEN);
1632 
1633                 return (error);
1634         }
1635 
1636         /*
1637          * Save off the important properties of the lx executable. The brand
1638          * library will ask us for this data later, when it is ready to set
1639          * things up for the lx executable.
1640          */
1641         edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
1642             voffset + uphdr_vaddr;
1643         edp->ed_entry = voffset + ehdr.e_entry;
1644         edp->ed_phent = ehdr.e_phentsize;
1645         edp->ed_phnum = ehdr.e_phnum;
1646 
1647         if (interp != NULL) {
1648                 if (ehdr.e_type == ET_DYN) {
1649                         /*
1650                          * This is a shared object executable, so we need to
1651                          * pick a reasonable place to put the heap. Just don't
1652                          * use the first page.
1653                          */
1654                         env.ex_brkbase = (caddr_t)PAGESIZE;
1655                         env.ex_bssbase = (caddr_t)PAGESIZE;
1656                 }
1657 
1658                 /*
1659                  * If the program needs an interpreter (most do), map it in and
1660                  * store relevant information about it in the aux vector, where
1661                  * the brand library can find it.
1662                  */
1663                 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
1664                     NULLVPP, &nvp))) {
1665                         uprintf("%s: not found.", interp);
1666                         restoreexecenv(&origenv, &orig_sigaltstack);
1667                         kmem_free(interp, MAXPATHLEN);
1668                         return (error);
1669                 }
1670 
1671                 kmem_free(interp, MAXPATHLEN);
1672                 interp = NULL;
1673 
1674                 /*
1675                  * map in the Linux linker
1676                  */
1677                 if (args->to_model == DATAMODEL_NATIVE) {
1678                         error = mapexec_brand(nvp, args, &ehdr,
1679                             &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
1680                             NULL, NULL, NULL, &ldaddr);
1681                 }
1682 #if defined(_LP64)
1683                 else {
1684                         Elf32_Ehdr      ehdr32;
1685                         Elf32_Addr      uphdr_vaddr32;
1686 
1687                         error = mapexec32_brand(nvp, args, &ehdr32,
1688                             &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
1689                             NULL, NULL, NULL, &ldaddr);
1690 
1691                         Ehdr32to64(&ehdr32, &ehdr);
1692 
1693                         if (uphdr_vaddr32 == (Elf32_Addr)-1)
1694                                 uphdr_vaddr = (Addr)-1;
1695                         else
1696                                 uphdr_vaddr = uphdr_vaddr32;
1697                 }
1698 #endif
1699 
1700                 VN_RELE(nvp);
1701                 if (error != 0) {
1702                         restoreexecenv(&origenv, &orig_sigaltstack);
1703                         return (error);
1704                 }
1705 
1706                 /*
1707                  * Now that we know the base address of the brand's linker,
1708                  * we also save this for later use by the brand library.
1709                  */
1710                 edp->ed_base = voffset;
1711                 edp->ed_ldentry = voffset + ehdr.e_entry;
1712         } else {
1713                 /*
1714                  * This program has no interpreter. The lx brand library will
1715                  * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
1716                  * so in this case, put the entry point of the main executable
1717                  * there.
1718                  */
1719                 if (ehdr.e_type == ET_EXEC) {
1720                         /*
1721                          * An executable with no interpreter, this must be a
1722                          * statically linked executable, which means we loaded
1723                          * it at the address specified in the elf header, in
1724                          * which case the e_entry field of the elf header is an
1725                          * absolute address.
1726                          */
1727                         edp->ed_ldentry = ehdr.e_entry;
1728                         edp->ed_entry = ehdr.e_entry;
1729                 } else {
1730                         /*
1731                          * A shared object with no interpreter, we use the
1732                          * calculated address from above.
1733                          */
1734                         edp->ed_ldentry = edp->ed_entry;
1735 
1736                         /*
1737                          * In all situations except an ET_DYN elf object with no
1738                          * interpreter, we want to leave the brk and base
1739                          * values set by mapexec_brand alone. Normally when
1740                          * running ET_DYN objects on Solaris (most likely
1741                          * /lib/ld.so.1) the kernel sets brk and base to 0 since
1742                          * it doesn't know where to put the heap, and later the
1743                          * linker will call brk() to initialize the heap in:
1744                          *      usr/src/cmd/sgs/rtld/common/setup.c:setup()
1745                          * after it has determined where to put it.  (This
1746                          * decision is made after the linker loads and inspects
1747                          * elf properties of the target executable being run.)
1748                          *
1749                          * So for ET_DYN Linux executables, we also don't know
1750                          * where the heap should go, so we'll set the brk and
1751                          * base to 0.  But in this case the Solaris linker will
1752                          * not initialize the heap, so when the Linux linker
1753                          * starts running there is no heap allocated.  This
1754                          * seems to be ok on Linux 2.4 based systems because the
1755                          * Linux linker/libc fall back to using mmap() to
1756                          * allocate memory. But on 2.6 systems, running
1757                          * applications by specifying them as command line
1758                          * arguments to the linker results in segfaults for an
                         * as yet undetermined reason (which seems to indicate
                         * that a more permanent fix for heap initialization in
1761                          * these cases may be necessary).
1762                          */
1763                         if (ehdr.e_type == ET_DYN) {
1764                                 env.ex_bssbase = (caddr_t)0;
1765                                 env.ex_brkbase = (caddr_t)0;
1766                                 env.ex_brksize = 0;
1767                         }
1768                 }
1769 
1770         }
1771 
1772         env.ex_vp = vp;
1773         setexecenv(&env);
1774 
1775         /*
1776          * We try to keep /proc's view of the aux vector consistent with
1777          * what's on the process stack.
1778          */
1779         if (args->to_model == DATAMODEL_NATIVE) {
1780                 auxv_t phdr_auxv[4] = {
1781                     { AT_SUN_BRAND_LX_PHDR, 0 },
1782                     { AT_SUN_BRAND_LX_INTERP, 0 },
1783                     { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
1784                     { AT_SUN_BRAND_AUX4, 0 }
1785                 };
1786                 phdr_auxv[0].a_un.a_val = edp->ed_phdr;
1787                 phdr_auxv[1].a_un.a_val = ldaddr;
1788                 phdr_auxv[2].a_un.a_val = 1;    /* set in lx_init */
1789                 phdr_auxv[3].a_type = AT_CLKTCK;
1790                 phdr_auxv[3].a_un.a_val = hz;
1791 
1792                 if (copyout(&phdr_auxv, args->auxp_brand,
1793                     sizeof (phdr_auxv)) == -1)
1794                         return (EFAULT);
1795         }
1796 #if defined(_LP64)
1797         else {
1798                 auxv32_t phdr_auxv32[3] = {
1799                     { AT_SUN_BRAND_LX_PHDR, 0 },
1800                     { AT_SUN_BRAND_LX_INTERP, 0 },
1801                     { AT_SUN_BRAND_AUX3, 0 }
1802                 };
1803                 phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
1804                 phdr_auxv32[1].a_un.a_val = ldaddr;
1805                 phdr_auxv32[2].a_type = AT_CLKTCK;
1806                 phdr_auxv32[2].a_un.a_val = hz;
1807 
1808                 if (copyout(&phdr_auxv32, args->auxp_brand,
1809                     sizeof (phdr_auxv32)) == -1)
1810                         return (EFAULT);
1811         }
1812 #endif
1813 
1814         /*
1815          * /proc uses the AT_ENTRY aux vector entry to deduce
1816          * the location of the executable in the address space. The user
1817          * structure contains a copy of the aux vector that needs to have those
1818          * entries patched with the values of the real lx executable (they
1819          * currently contain the values from the lx brand library that was
1820          * elfexec'd, above).
1821          *
1822          * For live processes, AT_BASE is used to locate the linker segment,
1823          * which /proc and friends will later use to find Solaris symbols
1824          * (such as rtld_db_preinit). However, for core files, /proc uses
1825          * AT_ENTRY to find the right segment to label as the executable.
1826          * So we set AT_ENTRY to be the entry point of the linux executable,
1827          * but leave AT_BASE to be the address of the Solaris linker.
1828          */
1829         for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
1830                 switch (up->u_auxv[i].a_type) {
1831                 case AT_ENTRY:
1832                         up->u_auxv[i].a_un.a_val = edp->ed_entry;
1833                         break;
1834 
1835                 case AT_SUN_BRAND_LX_PHDR:
1836                         up->u_auxv[i].a_un.a_val = edp->ed_phdr;
1837                         break;
1838 
1839                 case AT_SUN_BRAND_LX_INTERP:
1840                         up->u_auxv[i].a_un.a_val = ldaddr;
1841                         break;
1842 
1843                 default:
1844                         break;
1845                 }
1846         }
1847 
1848         return (0);
1849 }
1850 
1851 boolean_t
1852 lx_native_exec(uint8_t osabi, const char **interp)
1853 {
1854         if (osabi != ELFOSABI_SOLARIS)
1855                 return (B_FALSE);
1856 
1857         /*
1858          * If the process root matches the zone root, prepend /native to the
1859          * interpreter path for native executables.  Absolute precision from
1860          * VN_CMP is not necessary since any change of process root is likely
1861          * to make native binaries inaccessible via /native.
1862          *
1863          * Processes which chroot directly into /native will be able to
1864          * function as expected with no need for the prefix.
1865          */
1866         if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
1867                 *interp = "/native";
1868         }
1869 
1870         return (B_TRUE);
1871 }
1872 
1873 static void
1874 lx_syscall_init(void)
1875 {
1876         int i;
1877 
1878         /*
1879          * Count up the 32-bit Linux system calls.  Note that lx_sysent32
1880          * has (LX_NSYSCALLS + 1) entries.
1881          */
1882         for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
1883                 continue;
1884         lx_nsysent32 = i;
1885 
1886 #if defined(_LP64)
1887         /*
1888          * Count up the 64-bit Linux system calls.  Note that lx_sysent64
1889          * has (LX_NSYSCALLS + 1) entries.
1890          */
1891         for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
1892                 continue;
1893         lx_nsysent64 = i;
1894 #endif
1895 }
1896 
1897 int
1898 _init(void)
1899 {
1900         int err = 0;
1901 
1902         lx_syscall_init();
1903         lx_pid_init();
1904         lx_ioctl_init();
1905         lx_futex_init();
1906         lx_ptrace_init();
1907         lx_socket_init();
1908 
1909         err = mod_install(&modlinkage);
1910         if (err != 0) {
1911                 cmn_err(CE_WARN, "Couldn't install lx brand module");
1912 
1913                 /*
1914                  * This looks drastic, but it should never happen.  These
1915                  * two data structures should be completely free-able until
1916                  * they are used by Linux processes.  Since the brand
1917                  * wasn't loaded there should be no Linux processes, and
1918                  * thus no way for these data structures to be modified.
1919                  */
1920                 lx_pid_fini();
1921                 lx_ioctl_fini();
1922                 if (lx_futex_fini())
1923                         panic("lx brand module cannot be loaded or unloaded.");
1924         }
1925         return (err);
1926 }
1927 
1928 int
1929 _info(struct modinfo *modinfop)
1930 {
1931         return (mod_info(&modlinkage, modinfop));
1932 }
1933 
1934 int
1935 _fini(void)
1936 {
1937         int err;
1938         int futex_done = 0;
1939 
1940         /*
1941          * If there are any zones using this brand, we can't allow it to be
1942          * unloaded.
1943          */
1944         if (brand_zone_count(&lx_brand))
1945                 return (EBUSY);
1946 
1947         lx_ptrace_fini();
1948         lx_pid_fini();
1949         lx_ioctl_fini();
1950         lx_socket_fini();
1951 
1952         if ((err = lx_futex_fini()) != 0) {
1953                 goto done;
1954         }
1955         futex_done = 1;
1956 
1957         err = mod_remove(&modlinkage);
1958 
1959 done:
1960         if (err) {
1961                 /*
1962                  * If we can't unload the module, then we have to get it
1963                  * back into a sane state.
1964                  */
1965                 lx_ptrace_init();
1966                 lx_pid_init();
1967                 lx_ioctl_init();
1968                 lx_socket_init();
1969 
1970                 if (futex_done) {
1971                         lx_futex_init();
1972                 }
1973         }
1974 
1975         return (err);
1976 }