1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2015, Joyent, Inc. All rights reserved.
  29  */
  30 
  31 /*
  32  * The LX Brand: emulation of a Linux operating environment within a zone.
  33  *
  34  * OVERVIEW
  35  *
  36  * The LX brand enables a full Linux userland -- including a C library,
  37  * init(1) framework, and some set of applications -- to run unmodified
  38  * within an illumos zone.  Unlike illumos, where applications are expected
  39  * to link against and consume functions exported from libraries, the
  40  * supported Linux binary compatibility boundary is the system call
  41  * interface.  By accurately emulating the behaviour of Linux system calls,
  42  * Linux software can be executed in this environment as if it were running
  43  * on a native Linux system.
  44  *
  45  * EMULATING LINUX SYSTEM CALLS
  46  *
  47  * Linux system calls are made in 32-bit processes via the "int 0x80"
  48  * instruction; in 64-bit processes the "syscall" instruction is used, as it
  49  * is with native illumos processes.  In both cases, arguments to system
  50  * calls are generally passed in registers and the usermode stack is not
  51  * interpreted or modified by the Linux kernel.
  52  *
  53  * When the emulated Linux process makes a system call, it traps into the
  54  * illumos kernel.  The in-kernel brand module contains various emulation
  55  * routines, and can fully service some emulated system calls; e.g. read(2)
  56  * and write(2).  Other system calls require assistance from the illumos
  57  * libc, bouncing back out to the brand library ("lx_brand.so.1") for
  58  * emulation.
  59  *
  60  * The brand mechanism allows for the provision of an alternative trap
  61  * handler for the various system call mechanisms.  Traditionally this was
  62  * used to immediately revector execution to the usermode emulation library,
  63  * which was responsible for handling all system calls.  In the interests of
  64  * more accurate emulation and increased performance, much of the regular
  65  * illumos system call path is now invoked.  Only the argument processing and
  66  * handler dispatch are replaced by the brand, via the per-LWP
  67  * "lwp_brand_syscall" interposition function pointer.
  68  *
  69  * THE NATIVE AND BRAND STACKS
  70  *
  71  * Some runtime environments (e.g. the Go language) allocate very small
  72  * thread stacks, preferring to grow or split the stack as necessary.  The
  73  * Linux kernel generally does not use the usermode stack when servicing
  74  * system calls, so this is not a problem.  In order for our emulation to
  75  * have the same zero stack impact, we must execute usermode emulation
  76  * routines on an _alternate_ stack.  This is similar, in principle, to the
  77  * use of sigaltstack(3C) to run signal handlers off the main thread stack.
  78  *
  79  * To this end, the brand library allocates and installs an alternate stack
  80  * (called the "native" stack) for each LWP.  The in-kernel brand code uses
  81  * this stack for usermode emulation calls and interposed signal delivery,
  82  * while the emulated Linux process sees only the data on the main thread
  83  * stack, known as the "brand" stack.  The stack mode is tracked in the
  84  * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
  85  *
  86  * The stack mode doubles as a system call "mode bit".  When in the
  87  * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
  88  * system calls.  In other modes, system calls are assumed to be native
  89  * illumos system calls as made during brand library initialisation and
  90  * usermode emulation.
  91  *
  92  * USERMODE EMULATION
  93  *
  94  * When a Linux system call cannot be emulated within the kernel, we preserve
  95  * the register state of the Linux process and revector the LWP to the brand
  96  * library usermode emulation handler: the "lx_emulate()" function in
  97  * "lx_brand.so.1".  This revectoring is modelled on the delivery of signals,
  98  * and is performed in "lx_emulate_user()".
  99  *
 100  * First, the emulated process state is written out to the usermode stack of
 101  * the process as a "ucontext_t" object.  Arguments to the emulation routine
 102  * are passed on the stack or in registers, depending on the ABI.  When the
 103  * usermode emulation is complete, the result is passed back to the kernel
 104  * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
 105  * for restoration.
 106  *
 107  * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
 108  *
 109  * When servicing emulated system calls in the usermode brand library, or
 110  * during signal delivery, various state is preserved by the kernel so that
 111  * the running LWP may be revectored to a handling routine.  The context
 112  * allows the kernel to restart the program at the point of interruption,
 113  * either at the return of the signal handler, via setcontext(3C); or after
 114  * the usermode emulation request has been serviced, via B_EMULATION_DONE.
 115  *
 116  * In illumos native processes, the saved context (a "ucontext_t" object)
 117  * includes the state of registers and the current signal mask at the point
 118  * of interruption.  The context also includes a link to the most recently
 119  * saved context, forming a chain to be unwound as requests complete.  The LX
 120  * brand requires additional book-keeping to describe the machine state: in
 121  * particular, the current stack mode and the occupied extent of the native
 122  * stack.
 123  *
 124  * The brand code is able to interpose on the context save and restore
 125  * operations in the kernel -- see "lx_savecontext()" and
 126  * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
 127  * function correctly in the face of a dual stack LWP.  The brand also
 128  * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
 129  * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
 130  * library interposer on the native stack, regardless of the interrupted
 131  * execution mode.  Linux sigaltstack(2) emulation is performed entirely by
 132  * the usermode brand library during signal handler interposition.
 133  */
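
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * The overview above notes that 64-bit Linux programs enter the kernel via
 * the "syscall" instruction with arguments passed in registers.  The
 * userland sketch below (excluded from compilation; it is not part of this
 * module) shows that convention directly for write(2), which is system call
 * number 1 in the Linux x86-64 ABI.  This register state is exactly what
 * the brand's syscall interposition sees when an emulated process traps
 * into the kernel.
 */
#if 0
static long
linux_raw_write(int fd, const void *buf, unsigned long count)
{
        long ret;

        /*
         * Linux x86-64 convention: system call number in %rax; arguments in
         * %rdi, %rsi and %rdx; "syscall" clobbers %rcx and %r11; the result
         * (or a negated errno) comes back in %rax.
         */
        __asm__ volatile (
            "syscall"
            : "=a" (ret)
            : "a" (1L),                 /* __NR_write on x86-64 */
              "D" ((long)fd), "S" (buf), "d" (count)
            : "rcx", "r11", "memory");

        return (ret);
}

int
main(void)
{
        (void) linux_raw_write(1, "hello from a raw Linux syscall\n", 31);
        return (0);
}
#endif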
 134 
 135 #include <sys/types.h>
 136 #include <sys/kmem.h>
 137 #include <sys/errno.h>
 138 #include <sys/thread.h>
 139 #include <sys/systm.h>
 140 #include <sys/syscall.h>
 141 #include <sys/proc.h>
 142 #include <sys/modctl.h>
 143 #include <sys/cmn_err.h>
 144 #include <sys/model.h>
 145 #include <sys/exec.h>
 146 #include <sys/lx_impl.h>
 147 #include <sys/machbrand.h>
 148 #include <sys/lx_syscalls.h>
 149 #include <sys/lx_misc.h>
 150 #include <sys/lx_futex.h>
 151 #include <sys/lx_brand.h>
 152 #include <sys/param.h>
 153 #include <sys/termios.h>
 154 #include <sys/sunddi.h>
 155 #include <sys/ddi.h>
 156 #include <sys/vnode.h>
 157 #include <sys/pathname.h>
 158 #include <sys/auxv.h>
 159 #include <sys/priv.h>
 160 #include <sys/regset.h>
 161 #include <sys/privregs.h>
 162 #include <sys/archsystm.h>
 163 #include <sys/zone.h>
 164 #include <sys/brand.h>
 165 #include <sys/sdt.h>
 166 #include <sys/x86_archext.h>
 167 #include <sys/controlregs.h>
 168 #include <sys/core.h>
 169 #include <sys/stack.h>
 170 #include <sys/stat.h>
 171 #include <sys/socket.h>
 172 #include <lx_signum.h>
 173 #include <util/sscanf.h>
 174 
 175 int     lx_debug = 0;
 176 
 177 void    lx_init_brand_data(zone_t *);
 178 void    lx_free_brand_data(zone_t *);
 179 void    lx_setbrand(proc_t *);
 180 int     lx_getattr(zone_t *, int, void *, size_t *);
 181 int     lx_setattr(zone_t *, int, void *, size_t);
 182 int     lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
 183                 uintptr_t, uintptr_t);
 184 void    lx_set_kern_version(zone_t *, char *);
 185 void    lx_copy_procdata(proc_t *, proc_t *);
 186 
 187 extern int getsetcontext(int, void *);
 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int);
 189 #if defined(_SYSCALL32_IMPL)
 190 extern int getsetcontext32(int, void *);
 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
 192 #endif
 193 
 194 extern void lx_proc_exit(proc_t *);
 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
 196 
 197 extern void lx_ioctl_init();
 198 extern void lx_ioctl_fini();
 199 extern void lx_socket_init();
 200 extern void lx_socket_fini();
 201 
 202 lx_systrace_f *lx_systrace_entry_ptr;
 203 lx_systrace_f *lx_systrace_return_ptr;
 204 
 205 static int lx_systrace_enabled;
 206 
 207 /*
 208  * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
 209  * want an MMU dependency here (and should there be a microprocessor without
 210  * a hole, we don't want to start allocating from the top of the VA range).
 211  */
 212 #define LX_MAXSTACK64   0x7ffffff00000
 213 
 214 uint64_t lx_maxstack64 = LX_MAXSTACK64;
 215 
 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
 217     struct intpdata *idata, int level, long *execsz, int setid,
 218     caddr_t exec_file, struct cred *cred, int *brand_action);
 219 
 220 static boolean_t lx_native_exec(uint8_t, const char **);
 221 static uint32_t lx_map32limit(proc_t *);
 222 
 223 static void lx_savecontext(ucontext_t *);
 224 static void lx_restorecontext(ucontext_t *);
 225 static caddr_t lx_sendsig_stack(int);
 226 static void lx_sendsig(int);
 227 #if defined(_SYSCALL32_IMPL)
 228 static void lx_savecontext32(ucontext32_t *);
 229 #endif
 230 static int lx_setid_clear(vattr_t *, cred_t *);
 231 #if defined(_LP64)
 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
 233     enum seg_rw);
 234 #endif
 235 
 236 
 237 /* lx brand */
 238 struct brand_ops lx_brops = {
 239         lx_init_brand_data,             /* b_init_brand_data */
 240         lx_free_brand_data,             /* b_free_brand_data */
 241         lx_brandsys,                    /* b_brandsys */
 242         lx_setbrand,                    /* b_setbrand */
 243         lx_getattr,                     /* b_getattr */
 244         lx_setattr,                     /* b_setattr */
 245         lx_copy_procdata,               /* b_copy_procdata */
 246         lx_proc_exit,                   /* b_proc_exit */
 247         lx_exec,                        /* b_exec */
 248         lx_setrval,                     /* b_lwp_setrval */
 249         lx_lwpdata_alloc,               /* b_lwpdata_alloc */
 250         lx_lwpdata_free,                /* b_lwpdata_free */
 251         lx_initlwp,                     /* b_initlwp */
 252         lx_forklwp,                     /* b_forklwp */
 253         lx_freelwp,                     /* b_freelwp */
 254         lx_exitlwp,                     /* b_lwpexit */
 255         lx_elfexec,                     /* b_elfexec */
 256         NULL,                           /* b_sigset_native_to_brand */
 257         NULL,                           /* b_sigset_brand_to_native */
 258         lx_sigfd_translate,             /* b_sigfd_translate */
 259         NSIG,                           /* b_nsig */
 260         lx_exit_with_sig,               /* b_exit_with_sig */
 261         lx_wait_filter,                 /* b_wait_filter */
 262         lx_native_exec,                 /* b_native_exec */
 263         lx_map32limit,                  /* b_map32limit */
 264         lx_stop_notify,                 /* b_stop_notify */
 265         lx_waitid_helper,               /* b_waitid_helper */
 266         lx_sigcld_repost,               /* b_sigcld_repost */
 267         lx_ptrace_issig_stop,           /* b_issig_stop */
 268         lx_ptrace_sig_ignorable,        /* b_sig_ignorable */
 269         lx_savecontext,                 /* b_savecontext */
 270 #if defined(_SYSCALL32_IMPL)
 271         lx_savecontext32,               /* b_savecontext32 */
 272 #endif
 273         lx_restorecontext,              /* b_restorecontext */
 274         lx_sendsig_stack,               /* b_sendsig_stack */
 275         lx_sendsig,                     /* b_sendsig */
 276         lx_setid_clear,                 /* b_setid_clear */
 277 #if defined(_LP64)
 278         lx_pagefault                    /* b_pagefault */
 279 #else
 280         NULL
 281 #endif
 282 };
 283 
 284 struct brand_mach_ops lx_mops = {
 285         NULL,
 286         NULL,
 287         NULL,
 288         NULL,
 289         NULL,
 290         lx_fixsegreg,
 291         lx_fsbase
 292 };
 293 
 294 struct brand lx_brand = {
 295         BRAND_VER_1,
 296         "lx",
 297         &lx_brops,
 298         &lx_mops,
 299         sizeof (struct lx_proc_data)
 300 };
 301 
 302 static struct modlbrand modlbrand = {
 303         &mod_brandops, "lx brand", &lx_brand
 304 };
 305 
 306 static struct modlinkage modlinkage = {
 307         MODREV_1, (void *)&modlbrand, NULL
 308 };
 309 
 310 void
 311 lx_proc_exit(proc_t *p)
 312 {
 313         lx_proc_data_t *lxpd;
 314         proc_t *cp;
 315 
 316         mutex_enter(&p->p_lock);
 317         VERIFY(lxpd = ptolxproc(p));
 318         if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
 319                 mutex_exit(&p->p_lock);
 320                 return;
 321         }
 322         mutex_exit(&p->p_lock);
 323 
 324         /* Check for children which desire notification of parental death. */
 325         mutex_enter(&pidlock);
 326         for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
 327                 mutex_enter(&cp->p_lock);
 328                 if ((lxpd = ptolxproc(cp)) == NULL) {
 329                         mutex_exit(&cp->p_lock);
 330                         continue;
 331                 }
 332                 if (lxpd->l_parent_deathsig != 0) {
 333                         sigtoproc(cp, NULL, lxpd->l_parent_deathsig);
 334                 }
 335                 mutex_exit(&cp->p_lock);
 336         }
 337         mutex_exit(&pidlock);
 338 }
 339 
 340 void
 341 lx_setbrand(proc_t *p)
 342 {
 343         /* Send SIGCHLD to parent by default when child exits */
 344         ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
 345 }
 346 
 347 /* ARGSUSED */
 348 int
 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
 350 {
 351         char vers[LX_VERS_MAX];
 352 
 353         if (attr == LX_KERN_VERSION_NUM) {
 354                 if (bufsize > (LX_VERS_MAX - 1))
 355                         return (ERANGE);
 356                 bzero(vers, LX_VERS_MAX);
 357                 if (copyin(buf, &vers, bufsize) != 0)
 358                         return (EFAULT);
 359                 lx_set_kern_version(zone, vers);
 360                 return (0);
 361         }
 362         return (EINVAL);
 363 }
 364 
 365 /* ARGSUSED */
 366 int
 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
 368 {
 369         if (attr == LX_KERN_VERSION_NUM) {
 370                 if (*bufsize < LX_VERS_MAX)
 371                         return (ERANGE);
 372                 if (copyout(lx_get_zone_kern_version(curzone), buf,
 373                     LX_VERS_MAX) != 0)
 374                         return (EFAULT);
 375                 *bufsize = LX_VERS_MAX;
 376                 return (0);
 377         }
 378         return (EINVAL);
 379 }
 380 
 381 uint32_t
 382 lx_map32limit(proc_t *p)
 383 {
 384         /*
 385          * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
 386          * allow mappings in the first 31 bits.  This was a nuance in the
 387          * original Linux implementation circa 2002, and applications have
 388          * come to depend on its behavior.
 389          *
 390          * This is only relevant for 64-bit processes.
 391          */
 392         if (p->p_model == DATAMODEL_LP64)
 393                 return (1 << 31);
 394 
 395         return ((uint32_t)USERLIMIT32);
 396 }
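
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * A userland sketch (excluded from compilation; Linux x86-64 only) of the
 * behaviour the hook above preserves: despite its name, MAP_32BIT places
 * the mapping in the low 31 bits (2GB) of the address space.
 */
#if 0
#define _GNU_SOURCE             /* MAP_32BIT */
#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        printf("MAP_32BIT mapping at %p (below 0x80000000)\n", p);
        return (0);
}
#endif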
 397 
 398 void
 399 lx_brand_systrace_enable(void)
 400 {
 401         VERIFY(!lx_systrace_enabled);
 402 
 403         lx_systrace_enabled = 1;
 404 }
 405 
 406 void
 407 lx_brand_systrace_disable(void)
 408 {
 409         VERIFY(lx_systrace_enabled);
 410 
 411         lx_systrace_enabled = 0;
 412 }
 413 
 414 void
 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
 416 {
 417         VERIFY(lwpd->br_ntv_stack != 0);
 418 
 419         /*
 420          * The "brand-lx-set-ntv-stack-current" probe has arguments:
 421          *   arg0: stack pointer before change
 422          *   arg1: stack pointer after change
 423          *   arg2: current stack base
 424          */
 425         DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
 426             uintptr_t, lwpd->br_ntv_stack_current,
 427             uintptr_t, new_sp,
 428             uintptr_t, lwpd->br_ntv_stack);
 429 
 430         lwpd->br_ntv_stack_current = new_sp;
 431 }
 432 
 433 #if defined(_LP64)
 434 static int
 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
 436     enum seg_rw rw)
 437 {
 438         int syscall_num;
 439 
 440         /*
 441          * We only want to handle a very specific set of circumstances.
 442          * Namely: this is a 64-bit LX-branded process attempting to execute an
 443          * address in a page for which it does not have a valid mapping.  If
 444          * this is not the case, we bail out as fast as possible.
 445          */
 446         VERIFY(PROC_IS_BRANDED(p));
 447         if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
 448             DATAMODEL_NATIVE) {
 449                 return (-1);
 450         }
 451 
 452         if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
 453                 return (-1);
 454         }
 455 
 456         /*
 457          * This is a valid vsyscall address.  We service the system call and
 458          * return 0 to signal that the pagefault has been handled completely.
 459          */
 460         lx_vsyscall_enter(p, lwp, syscall_num);
 461         return (0);
 462 }
 463 #endif
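
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * The legacy Linux vsyscall page sits at a fixed address, with time(2) at
 * offset 0x400.  Older static binaries call straight into that page; under
 * LX no mapping exists there, so the execute fault is caught by
 * lx_pagefault() above and serviced as a system call.  The userland sketch
 * below (excluded from compilation; the macro name is ours, only the
 * address is part of the Linux ABI) shows the kind of call site involved.
 */
#if 0
#include <stdio.h>
#include <time.h>

#define VSYSCALL_TIME   ((time_t (*)(time_t *))0xffffffffff600400UL)

int
main(void)
{
        time_t now = VSYSCALL_TIME(NULL);

        printf("time(2) via the vsyscall page: %ld\n", (long)now);
        return (0);
}
#endif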
 464 
 465 /*
 466  * This hook runs prior to sendsig() processing and allows us to nominate
 467  * an alternative stack pointer for delivery of the signal handling frame.
 468  * Critically, this routine should _not_ modify any LWP state as the
 469  * savecontext() does not run until after this hook.
 470  */
 471 static caddr_t
 472 lx_sendsig_stack(int sig)
 473 {
 474         klwp_t *lwp = ttolwp(curthread);
 475         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 476 
 477         /*
 478          * We want to take signal delivery on the native stack, but only if
 479          * one has been allocated and installed for this LWP.
 480          */
 481         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 482                 /*
 483                  * The program is not running on the native stack.  Return
 484                  * the native stack pointer from our brand-private data so
 485                  * that we may switch to it for signal handling.
 486                  */
 487                 return ((caddr_t)lwpd->br_ntv_stack_current);
 488         } else {
 489                 struct regs *rp = lwptoregs(lwp);
 490 
 491                 /*
 492                  * Either the program is already running on the native stack,
 493                  * or one has not yet been allocated for this LWP.  Use the
 494                  * current stack pointer value.
 495                  */
 496                 return ((caddr_t)rp->r_sp);
 497         }
 498 }
 499 
 500 /*
 501  * This hook runs after sendsig() processing and allows us to update the
 502  * per-LWP mode flags for system calls and stacks.  The pre-signal
 503  * context has already been saved and delivered to the user at this point.
 504  */
 505 static void
 506 lx_sendsig(int sig)
 507 {
 508         klwp_t *lwp = ttolwp(curthread);
 509         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 510         struct regs *rp = lwptoregs(lwp);
 511 
 512         switch (lwpd->br_stack_mode) {
 513         case LX_STACK_MODE_BRAND:
 514         case LX_STACK_MODE_NATIVE:
 515                 /*
 516                  * In lx_sendsig_stack(), we nominated a stack pointer from the
 517                  * native stack.  Update the stack mode, and the current in-use
 518                  * extent of the native stack, accordingly:
 519                  */
 520                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 521                 lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
 522 
 523                 /*
 524                  * Fix up segment registers, etc.
 525                  */
 526                 lx_switch_to_native(lwp);
 527                 break;
 528 
 529         default:
 530                 /*
 531                  * Otherwise, the brand library has not yet installed the
 532                  * alternate stack for this LWP.  Signals will be handled on
 533                  * the regular thread stack.
 534                  */
 535                 return;
 536         }
 537 }
 538 
 539 /*
 540  * This hook runs prior to the context restoration, allowing us to take action
 541  * or modify the context before it is loaded.
 542  */
 543 static void
 544 lx_restorecontext(ucontext_t *ucp)
 545 {
 546         klwp_t *lwp = ttolwp(curthread);
 547         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 548         uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
 549         caddr_t sp = ucp->uc_brand_data[1];
 550 
 551         /*
 552          * We have a saved native stack pointer value that we must restore
 553          * into the per-LWP data.
 554          */
 555         if (flags & LX_UC_RESTORE_NATIVE_SP) {
 556                 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
 557         }
 558 
 559         /*
 560          * We do not wish to restore the value of uc_link in this context,
 561          * so replace it with the value currently in the LWP.
 562          */
 563         if (flags & LX_UC_IGNORE_LINK) {
 564                 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
 565         }
 566 
 567         /*
 568          * Restore the stack mode:
 569          */
 570         if (flags & LX_UC_STACK_NATIVE) {
 571                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 572         } else if (flags & LX_UC_STACK_BRAND) {
 573                 lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
 574         }
 575 
 576 #if defined(__amd64)
 577         /*
 578          * Override the fs/gsbase in the context with the value provided
 579          * through the Linux arch_prctl(2) system call.
 580          */
 581         if (flags & LX_UC_STACK_BRAND) {
 582                 if (lwpd->br_lx_fsbase != 0) {
 583                         ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
 584                 }
 585                 if (lwpd->br_lx_gsbase != 0) {
 586                         ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
 587                 }
 588         }
 589 #endif
 590 }
 591 
 592 static void
 593 lx_savecontext(ucontext_t *ucp)
 594 {
 595         klwp_t *lwp = ttolwp(curthread);
 596         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 597         uintptr_t flags = 0;
 598 
 599         /*
 600          * The ucontext_t affords us three private pointer-sized members in
 601          * "uc_brand_data".  We pack a variety of flags into the first element,
 602          * and an optional stack pointer in the second element.  The flags
 603          * determine which stack pointer (native or brand), if any, is stored
 604          * in the second element.  The third element may contain the system
 605          * call number; this is analogous to the "orig_[er]ax" member of a
 606          * Linux "user_regs_struct".
 607          */
 608 
 609         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 610             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 611                 /*
 612                  * Record the value of the native stack pointer to restore
 613                  * when returning to this branded context:
 614                  */
 615                 flags |= LX_UC_RESTORE_NATIVE_SP;
 616                 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
 617         }
 618 
 619         /*
 620          * Save the stack mode:
 621          */
 622         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 623                 flags |= LX_UC_STACK_NATIVE;
 624         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 625                 flags |= LX_UC_STACK_BRAND;
 626         }
 627 
 628         /*
 629          * If we might need to restart this system call, save that information
 630          * in the context:
 631          */
 632         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 633                 ucp->uc_brand_data[2] =
 634                     (void *)(uintptr_t)lwpd->br_syscall_num;
 635                 if (lwpd->br_syscall_restart) {
 636                         flags |= LX_UC_RESTART_SYSCALL;
 637                 }
 638         } else {
 639                 ucp->uc_brand_data[2] = NULL;
 640         }
 641 
 642         ucp->uc_brand_data[0] = (void *)flags;
 643 }
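
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * A sketch (excluded from compilation) of how the usermode emulation
 * library might decode the brand-private data packed by lx_savecontext()
 * above.  The LX_UC_* flags come from the lx brand headers; the include
 * path and the helper name here are assumptions for illustration only.
 */
#if 0
#include <stdio.h>
#include <stdint.h>
#include <ucontext.h>
#include <sys/lx_brand.h>       /* LX_UC_* flags (path assumed) */

static void
lx_dump_brand_context(const ucontext_t *ucp)
{
        uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];

        if (flags & LX_UC_RESTORE_NATIVE_SP)
                printf("native stack pointer: %p\n", ucp->uc_brand_data[1]);

        if (flags & LX_UC_STACK_BRAND) {
                printf("saved on the brand stack during syscall %d%s\n",
                    (int)(uintptr_t)ucp->uc_brand_data[2],
                    (flags & LX_UC_RESTART_SYSCALL) ? " (restartable)" : "");
        } else if (flags & LX_UC_STACK_NATIVE) {
                printf("saved on the native stack\n");
        }
}
#endif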
 644 
 645 #if defined(_SYSCALL32_IMPL)
 646 static void
 647 lx_savecontext32(ucontext32_t *ucp)
 648 {
 649         klwp_t *lwp = ttolwp(curthread);
 650         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 651         unsigned int flags = 0;
 652 
 653         /*
 654          * The ucontext_t affords us three private pointer-sized members in
 655          * "uc_brand_data".  We pack a variety of flags into the first element,
 656          * and an optional stack pointer in the second element.  The flags
 657          * determine which stack pointer (native or brand), if any, is stored
 658          * in the second element.  The third element may contain the system
 659          * call number; this is analogous to the "orig_[er]ax" member of a
 660          * Linux "user_regs_struct".
 661          */
 662 
 663         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 664             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 665                 /*
 666                  * Record the value of the native stack pointer to restore
 667                  * when returning to this branded context:
 668                  */
 669                 flags |= LX_UC_RESTORE_NATIVE_SP;
 670                 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
 671         }
 672 
 673         /*
 674          * Save the stack mode:
 675          */
 676         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 677                 flags |= LX_UC_STACK_NATIVE;
 678         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 679                 flags |= LX_UC_STACK_BRAND;
 680         }
 681 
 682         /*
 683          * If we might need to restart this system call, save that information
 684          * in the context:
 685          */
 686         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 687                 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
 688                 if (lwpd->br_syscall_restart) {
 689                         flags |= LX_UC_RESTART_SYSCALL;
 690                 }
 691         } else {
 692                 ucp->uc_brand_data[2] = NULL;
 693         }
 694 
 695         ucp->uc_brand_data[0] = flags;
 696 }
 697 #endif
 698 
 699 void
 700 lx_init_brand_data(zone_t *zone)
 701 {
 702         lx_zone_data_t *data;
 703         ASSERT(zone->zone_brand == &lx_brand);
 704         ASSERT(zone->zone_brand_data == NULL);
 705         data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
 706         /*
 707          * Set the default lxzd_kernel_version to 2.4.
 708          * This can be changed by a call to setattr() during zone boot.
 709          */
 710         (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
 711 
 712         /*
 713          * Linux is not at all picky about address family when it comes to
 714          * supporting interface-related ioctls.  To mimic this behavior, we'll
 715          * attempt those ioctls against a ksocket configured for that purpose.
 716          */
 717         (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0,
 718             0, zone->zone_kcred);
 719 
 720         zone->zone_brand_data = data;
 721 
 722         /*
 723          * In Linux, if the init(1) process terminates, the system panics.
 724          * The zone must reboot to simulate this behaviour.
 725          */
 726         zone->zone_reboot_on_init_exit = B_TRUE;
 727 }
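
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * A sketch (excluded from compilation) of how zone tooling in the global
 * zone might override the default kernel version set above, landing in
 * lx_setattr().  It assumes the zone_setattr(3C)-style libc wrapper and the
 * LX_KERN_VERSION_NUM attribute from the lx brand headers; the helper name
 * is ours.
 */
#if 0
#include <zone.h>
#include <string.h>
#include <sys/lx_brand.h>       /* LX_KERN_VERSION_NUM (path assumed) */

static int
lx_push_kern_version(zoneid_t zid, const char *vers)
{
        /* lx_setattr() rejects buffers of LX_VERS_MAX bytes or more. */
        return (zone_setattr(zid, LX_KERN_VERSION_NUM, (void *)vers,
            strlen(vers) + 1));
}
#endif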
 728 
 729 void
 730 lx_free_brand_data(zone_t *zone)
 731 {
 732         lx_zone_data_t *data = ztolxzd(zone);
 733         ASSERT(data != NULL);
 734         if (data->lxzd_ioctl_sock != NULL) {
 735                 /*
 736                  * Since zone_kcred has been cleaned up already, close the
 737                  * socket using the global kcred.
 738                  */
 739                 ksocket_close(data->lxzd_ioctl_sock, kcred);
 740                 data->lxzd_ioctl_sock = NULL;
 741         }
 742         zone->zone_brand_data = NULL;
 743         kmem_free(data, sizeof (*data));
 744 }
 745 
 746 void
 747 lx_unsupported(char *dmsg)
 748 {
 749         lx_proc_data_t *pd = ttolxproc(curthread);
 750 
 751         DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
 752 
 753         if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
 754                 /*
 755                  * If this process was run with strict mode enabled
 756                  * (via LX_STRICT in the environment), we mark this
 757                  * LWP as having triggered an unsupported behaviour.
 758                  * This flag will be checked at an appropriate point
 759                  * by lx_check_strict_failure().
 760                  */
 761                 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
 762 
 763                 lwpd->br_strict_failure = B_TRUE;
 764         }
 765 }
 766 
 767 void
 768 lx_check_strict_failure(lx_lwp_data_t *lwpd)
 769 {
 770         proc_t *p;
 771 
 772         if (!lwpd->br_strict_failure) {
 773                 return;
 774         }
 775 
 776         lwpd->br_strict_failure = B_FALSE;
 777 
 778         /*
 779          * If this process is operating in strict mode (via LX_STRICT in
 780          * the environment), and has triggered a call to
 781          * lx_unsupported(), we drop SIGSYS on it as we return.
 782          */
 783         p = curproc;
 784         mutex_enter(&p->p_lock);
 785         sigtoproc(p, curthread, SIGSYS);
 786         mutex_exit(&p->p_lock);
 787 }
 788 
 789 void
 790 lx_trace_sysenter(int syscall_num, uintptr_t *args)
 791 {
 792         if (lx_systrace_enabled) {
 793                 VERIFY(lx_systrace_entry_ptr != NULL);
 794 
 795                 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
 796                     args[2], args[3], args[4], args[5]);
 797         }
 798 }
 799 
 800 void
 801 lx_trace_sysreturn(int syscall_num, long ret)
 802 {
 803         if (lx_systrace_enabled) {
 804                 VERIFY(lx_systrace_return_ptr != NULL);
 805 
 806                 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
 807         }
 808 }
 809 
 810 /*
 811  * Get the address of the user-space system call handler and attach it to
 812  * the proc structure.  Returning 0 indicates success; the value returned
 813  * by the system call is the value stored in rval.  Returning a non-zero
 814  * value indicates a failure; the value returned is used to set errno, -1
 815  * is returned from the syscall, and the contents of rval are ignored.  To
 816  * set errno and have the syscall return a value other than -1, we can
 817  * manually set errno and rval and return 0.
 818  */
 819 int
 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
 821     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
 822 {
 823         kthread_t *t = curthread;
 824         klwp_t *lwp = ttolwp(t);
 825         proc_t *p = ttoproc(t);
 826         lx_proc_data_t *pd;
 827         struct termios *termios;
 828         uint_t termios_len;
 829         int error;
 830         int code;
 831         int sig;
 832         lx_brand_registration_t reg;
 833         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 834 
 835         /*
 836          * There is one operation that is supported for a non-branded
 837          * process: B_EXEC_BRAND.  This is the equivalent of an
 838          * exec call, but the new process that is created will be
 839          * a branded process.
 840          */
 841         if (cmd == B_EXEC_BRAND) {
 842                 VERIFY(p->p_zone != NULL);
 843                 VERIFY(p->p_zone->zone_brand == &lx_brand);
 844                 return (exec_common(
 845                     (char *)arg1, (const char **)arg2, (const char **)arg3,
 846                     EBA_BRAND));
 847         }
 848 
 849         /* For all other operations this must be a branded process. */
 850         if (p->p_brand == NULL)
 851                 return (ENOSYS);
 852 
 853         VERIFY(p->p_brand == &lx_brand);
 854         VERIFY(p->p_brand_data != NULL);
 855 
 856         switch (cmd) {
 857         case B_REGISTER:
 858                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 859                         lx_print("stack mode was not PREINIT during "
 860                             "REGISTER\n");
 861                         return (EINVAL);
 862                 }
 863 
 864                 if (p->p_model == DATAMODEL_NATIVE) {
 865                         if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
 866                                 lx_print("Failed to copyin brand registration "
 867                                     "at 0x%p\n", (void *)arg1);
 868                                 return (EFAULT);
 869                         }
 870                 }
 871 #ifdef _LP64
 872                 else {
 873                         /* 32-bit userland on 64-bit kernel */
 874                         lx_brand_registration32_t reg32;
 875 
 876                         if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
 877                                 lx_print("Failed to copyin brand registration "
 878                                     "at 0x%p\n", (void *)arg1);
 879                                 return (EFAULT);
 880                         }
 881 
 882                         reg.lxbr_version = (uint_t)reg32.lxbr_version;
 883                         reg.lxbr_handler =
 884                             (void *)(uintptr_t)reg32.lxbr_handler;
 885                         reg.lxbr_flags = reg32.lxbr_flags;
 886                 }
 887 #endif
 888 
 889                 if (reg.lxbr_version != LX_VERSION_1) {
 890                         lx_print("Invalid brand library version (%u)\n",
 891                             reg.lxbr_version);
 892                         return (EINVAL);
 893                 }
 894 
 895                 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
 896                         lx_print("Invalid brand flags (%u)\n",
 897                             reg.lxbr_flags);
 898                         return (EINVAL);
 899                 }
 900 
 901                 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
 902                     (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
 903                 pd = p->p_brand_data;
 904                 pd->l_handler = (uintptr_t)reg.lxbr_handler;
 905                 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
 906 
 907                 return (0);
 908 
 909         case B_TTYMODES:
 910                 /* This is necessary for emulating TCGETS ioctls. */
 911                 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
 912                     DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
 913                     &termios_len) != DDI_SUCCESS)
 914                         return (EIO);
 915 
 916                 ASSERT(termios_len == sizeof (*termios));
 917 
 918                 if (copyout(termios, (void *)arg1, sizeof (*termios)) != 0) {
 919                         ddi_prop_free(termios);
 920                         return (EFAULT);
 921                 }
 922 
 923                 ddi_prop_free(termios);
 924                 return (0);
 925 
 926         case B_ELFDATA:
 927                 pd = curproc->p_brand_data;
 928                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 929                         if (copyout(&pd->l_elf_data, (void *)arg1,
 930                             sizeof (lx_elf_data_t)) != 0) {
 931                                 return (EFAULT);
 932                         }
 933                 }
 934 #if defined(_LP64)
 935                 else {
 936                         /* 32-bit userland on 64-bit kernel */
 937                         lx_elf_data32_t led32;
 938 
 939                         led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
 940                         led32.ed_phent = (int)pd->l_elf_data.ed_phent;
 941                         led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
 942                         led32.ed_entry = (int)pd->l_elf_data.ed_entry;
 943                         led32.ed_base = (int)pd->l_elf_data.ed_base;
 944                         led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
 945 
 946                         if (copyout(&led32, (void *)arg1,
 947                             sizeof (led32)) != 0) {
 948                                 return (EFAULT);
 949                         }
 950                 }
 951 #endif
 952                 return (0);
 953 
 954         case B_EXEC_NATIVE:
 955                 return (exec_common((char *)arg1, (const char **)arg2,
 956                     (const char **)arg3, EBA_NATIVE));
 957 
 958         /*
 959          * The B_TRUSS_POINT subcommand is used so that we can make a no-op
 960          * syscall for debugging purposes (dtracing) from within the user-level
 961          * emulation.
 962          */
 963         case B_TRUSS_POINT:
 964                 return (0);
 965 
 966         case B_LPID_TO_SPAIR: {
 967                 /*
 968                  * Given a Linux pid as arg1, return the Solaris pid in arg2 and
 969                  * the Solaris LWP ID in arg3.  We also translate pid 1 (which is
 970                  * hardcoded in many applications) to the zone's init process.
 971                  */
 972                 pid_t s_pid;
 973                 id_t s_tid;
 974 
 975                 if ((pid_t)arg1 == 1) {
 976                         s_pid = p->p_zone->zone_proc_initpid;
 977                         /* handle the dead/missing init(1M) case */
 978                         if (s_pid == -1)
 979                                 s_pid = 1;
 980                         s_tid = 1;
 981                 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
 982                         return (ESRCH);
 983                 }
 984 
 985                 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
 986                     copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
 987                         return (EFAULT);
 988                 }
 989 
 990                 return (0);
 991         }
 992 
 993         case B_SIGEV_THREAD_ID: {
 994                 /*
 995                  * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
 996                  * notification method. This mechanism is only meant
 997                  * for userland threading libraries such as glibc and
 998                  * is documented as such. Therefore, assume this is
 999                  * only ever invoked for the purpose of alerting a
1000                  * Linux threading library. Assume that the tid is a
1001                  * member of the caller's process and the signal
1002                  * number is valid. See lx_sigev_thread_id() for the
1003                  * userland side of this emulation.
1004                  *
1005                  * The return code from this function is not checked
1006                  * by the caller since it executes in an asynchronous
1007                  * context and there is nothing much to be done. If
1008                  * this function does fail then it will manifest as
1009                  * Linux threads waiting for a signal they will never
1010                  * receive.
1011                  *
1012                  * arg1 -- Linux tid
1013                  * arg2 -- Linux signal number
1014                  * arg3 -- union sigval
1015                  */
1016 
1017                 int native_sig = lx_ltos_signo((int)arg2, 0);
1018                 pid_t native_pid;
1019                 int native_tid;
1020                 sigqueue_t *sqp;
1021 
1022                 if (native_sig == 0)
1023                         return (EINVAL);
1024 
1025                 lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid);
1026                 sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
1027                 mutex_enter(&curproc->p_lock);
1028 
1029                 if ((t = idtot(curproc, native_tid)) == NULL) {
1030                         mutex_exit(&curproc->p_lock);
1031                         kmem_free(sqp, sizeof (sigqueue_t));
1032                         return (ESRCH);
1033                 }
1034 
1035                 sqp->sq_info.si_signo = native_sig;
1036                 sqp->sq_info.si_code = SI_TIMER;
1037                 sqp->sq_info.si_pid = curproc->p_pid;
1038                 sqp->sq_info.si_zoneid = getzoneid();
1039                 sqp->sq_info.si_uid = crgetruid(CRED());
1040                 sqp->sq_info.si_value.sival_ptr = (void *)arg3;
1041                 sigaddqa(curproc, t, sqp);
1042 
1043                 mutex_exit(&curproc->p_lock);
1044 
1045                 return (0);
1046         }
1047 
1048         case B_SET_AFFINITY_MASK:
1049         case B_GET_AFFINITY_MASK:
1050                 /*
1051                  * Retrieve or store the CPU affinity mask for the
1052                  * requested linux pid.
1053                  *
1054                  * arg1 is a linux PID (0 means curthread).
1055                  * arg2 is the size of the given mask.
1056                  * arg3 is the address of the affinity mask.
1057                  */
1058                 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
1059 
1060         case B_PTRACE_STOP_FOR_OPT:
1061                 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
1062                     B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
1063 
1064         case B_PTRACE_CLONE_BEGIN:
1065                 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
1066                     B_FALSE : B_TRUE));
1067 
1068         case B_PTRACE_KERNEL:
1069                 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4));
1070 
1071         case B_HELPER_WAITID: {
1072                 idtype_t idtype = (idtype_t)arg1;
1073                 id_t id = (id_t)arg2;
1074                 siginfo_t *infop = (siginfo_t *)arg3;
1075                 int options = (int)arg4;
1076 
1077                 lwpd = ttolxlwp(curthread);
1078 
1079                 /*
1080                  * Our brand-specific waitid helper only understands a subset of
1081                  * the possible idtypes.  Ensure we keep to that subset here:
1082                  */
1083                 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
1084                         return (EINVAL);
1085                 }
1086 
1087                 /*
1088                  * Enable the return of emulated ptrace(2) stop conditions
1089                  * through lx_waitid_helper, and stash the Linux-specific
1090                  * extra waitid() flags.
1091                  */
1092                 lwpd->br_waitid_emulate = B_TRUE;
1093                 lwpd->br_waitid_flags = (int)arg5;
1094 
1095 #if defined(_SYSCALL32_IMPL)
1096                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1097                         error = waitsys32(idtype, id, infop, options);
1098                 } else
1099 #endif
1100                 {
1101                         error = waitsys(idtype, id, infop, options);
1102                 }
1103 
1104                 lwpd->br_waitid_emulate = B_FALSE;
1105                 lwpd->br_waitid_flags = 0;
1106 
1107                 return (error);
1108         }
1109 
1110         case B_UNSUPPORTED: {
1111                 char dmsg[256];
1112 
1113                 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
1114                         lx_print("Failed to copyin unsupported msg "
1115                             "at 0x%p\n", (void *)arg1);
1116                         return (EFAULT);
1117                 }
1118                 dmsg[255] = '\0';
1119                 lx_unsupported(dmsg);
1120 
1121                 lx_check_strict_failure(lwpd);
1122 
1123                 return (0);
1124         }
1125 
1126         case B_STORE_ARGS: {
1127                 /*
1128                  * B_STORE_ARGS subcommand
1129                  * arg1 = address of struct to be copied in
1130                  * arg2 = size of the struct being copied in
1131                  * arg3-arg6 ignored
1132                  * rval = the amount of data copied.
1133                  */
1134                 void *buf;
1135 
1136                 /* only have upper limit because arg2 is unsigned */
1137                 if (arg2 > LX_BR_ARGS_SIZE_MAX) {
1138                         return (EINVAL);
1139                 }
1140 
1141                 buf = kmem_alloc(arg2, KM_SLEEP);
1142                 if (copyin((void *)arg1, buf, arg2) != 0) {
1143                         lx_print("Failed to copyin scall arg at 0x%p\n",
1144                             (void *) arg1);
1145                         kmem_free(buf, arg2);
1146                         /*
1147                          * Purposely not setting br_scall_args to NULL
1148                          * to preserve data for debugging.
1149                          */
1150                         return (EFAULT);
1151                 }
1152 
1153                 if (lwpd->br_scall_args != NULL) {
1154                         ASSERT(lwpd->br_args_size > 0);
1155                         kmem_free(lwpd->br_scall_args,
1156                             lwpd->br_args_size);
1157                 }
1158 
1159                 lwpd->br_scall_args = buf;
1160                 lwpd->br_args_size = arg2;
1161                 *rval = arg2;
1162                 return (0);
1163         }
1164 
1165         case B_HELPER_CLONE:
1166                 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
1167                     (void *)arg4));
1168 
1169         case B_HELPER_SETGROUPS:
1170                 return (lx_helper_setgroups(arg1, (gid_t *)arg2));
1171 
1172         case B_HELPER_SIGQUEUE:
1173                 return (lx_helper_rt_sigqueueinfo(arg1, arg2,
1174                     (siginfo_t *)arg3));
1175 
1176         case B_HELPER_TGSIGQUEUE:
1177                 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
1178                     (siginfo_t *)arg4));
1179 
1180         case B_SET_THUNK_PID:
1181                 lwpd->br_lx_thunk_pid = arg1;
1182                 return (0);
1183 
1184         case B_GETPID:
1185                 /*
1186                  * The usermode clone(2) code needs to be able to call
1187                  * lx_getpid() from native code:
1188                  */
1189                 *rval = lx_getpid();
1190                 return (0);
1191 
1192         case B_SET_NATIVE_STACK:
1193                 /*
1194                  * B_SET_NATIVE_STACK subcommand
1195                  * arg1 = the base of the stack to use for emulation
1196                  */
1197                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
1198                         lx_print("B_SET_NATIVE_STACK when stack was already "
1199                             "set to %p\n", (void *)arg1);
1200                         return (EEXIST);
1201                 }
1202 
1203                 /*
1204                  * We move from the PREINIT state, where we have no brand
1205                  * emulation stack, to the INIT state.  Here, we are still
1206                  * running on what will become the BRAND stack, but are running
1207                  * emulation (i.e. native) code.  Once the initialisation
1208                  * process for this thread has finished, we will jump to
1209                  * brand-specific code, while moving to the BRAND mode.
1210                  *
1211                  * When a new LWP is created, lx_initlwp() will clear the
1212                  * stack data.  If that LWP is actually being duplicated
1213                  * into a child process by fork(2), lx_forklwp() will copy
1214                  * it so that the cloned thread will keep using the same
1215                  * alternate stack.
1216                  */
1217                 lwpd->br_ntv_stack = arg1;
1218                 lwpd->br_stack_mode = LX_STACK_MODE_INIT;
1219                 lx_lwp_set_native_stack_current(lwpd, arg1);
1220 
1221                 return (0);
1222 
1223         case B_GET_CURRENT_CONTEXT:
1224                 /*
1225                  * B_GET_CURRENT_CONTEXT subcommand:
1226                  * arg1 = address for pointer to current ucontext_t
1227                  */
1228 
1229 #if defined(_SYSCALL32_IMPL)
1230                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1231                         caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
1232 
1233                         error = copyout(&addr, (void *)arg1, sizeof (addr));
1234                 } else
1235 #endif
1236                 {
1237                         error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
1238                             sizeof (lwp->lwp_oldcontext));
1239                 }
1240 
1241                 return (error != 0 ? EFAULT : 0);
1242 
1243         case B_JUMP_TO_LINUX:
1244                 /*
1245                  * B_JUMP_TO_LINUX subcommand:
1246                  * arg1 = ucontext_t pointer for jump state
1247                  */
1248 
1249                 if (arg1 == NULL)
1250                         return (EINVAL);
1251 
1252                 switch (lwpd->br_stack_mode) {
1253                 case LX_STACK_MODE_NATIVE: {
1254                         struct regs *rp = lwptoregs(lwp);
1255 
1256                         /*
1257                          * We are on the NATIVE stack, so we must preserve
1258                          * the extent of that stack.  The pointer will be
1259                          * reset by a future setcontext().
1260                          */
1261                         lx_lwp_set_native_stack_current(lwpd,
1262                             (uintptr_t)rp->r_sp);
1263                         break;
1264                 }
1265 
1266                 case LX_STACK_MODE_INIT:
1267                         /*
1268                          * The LWP is transitioning to Linux code for the first
1269                          * time.
1270                          */
1271                         break;
1272 
1273                 case LX_STACK_MODE_PREINIT:
1274                         /*
1275                          * This LWP has not installed an alternate stack for
1276                          * usermode emulation handling.
1277                          */
1278                         return (ENOENT);
1279 
1280                 case LX_STACK_MODE_BRAND:
1281                         /*
1282                          * The LWP should not be on the BRAND stack.
1283                          */
1284                         exit(CLD_KILLED, SIGSYS);
1285                         return (0);
1286                 }
1287 
1288                 /*
1289                  * Transfer control to Linux:
1290                  */
1291                 return (lx_runexe(lwp, (void *)arg1));
1292 
1293         case B_EMULATION_DONE:
1294                 /*
1295                  * B_EMULATION_DONE subcommand:
1296                  * arg1 = ucontext_t * to restore
1297                  * arg2 = system call number
1298                  * arg3 = return code
1299                  * arg4 = if operation failed, the errno value
1300                  */
1301 
1302                 /*
1303                  * The first part of this operation is a setcontext() to
1304                  * restore the register state to the copy we preserved
1305                  * before vectoring to the usermode emulation routine.
1306                  * If that fails, we return (hopefully) to the emulation
1307                  * routine and it will handle the error.
1308                  */
1309 #if defined(_SYSCALL32_IMPL)
1310                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1311                         error = getsetcontext32(SETCONTEXT, (void *)arg1);
1312                 } else
1313 #endif
1314                 {
1315                         error = getsetcontext(SETCONTEXT, (void *)arg1);
1316                 }
1317 
1318                 if (error != 0) {
1319                         return (error);
1320                 }
1321 
1322                 /*
1323                  * The saved Linux context has been restored.  We handle the
1324                  * return value or errno with code common to the in-kernel
1325                  * system call emulation.
1326                  */
1327                 if ((error = (int)arg4) != 0) {
1328                         /*
1329                          * lx_syscall_return() looks at the errno in the LWP,
1330                          * so set it here:
1331                          */
1332                         set_errno(error);
1333                 }
1334                 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
1335 
1336                 return (0);
1337 
1338         case B_EXIT_AS_SIG:
1339                 code = CLD_KILLED;
1340                 sig = (int)arg1;
1341                 proc_is_exiting(p);
1342                 if (exitlwps(1) != 0) {
1343                         mutex_enter(&p->p_lock);
1344                         lwp_exit();
1345                 }
1346                 ttolwp(curthread)->lwp_cursig = sig;
1347                 if (sig == SIGSEGV) {
1348                         if (core(sig, 0) == 0)
1349                                 code = CLD_DUMPED;
1350                 }
1351                 exit(code, sig);
1352                 /* NOTREACHED */
1353                 break;
1354         }
1355 
1356         return (EINVAL);
1357 }
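
/*
 * ILLUSTRATIVE EXAMPLE
 *
 * A sketch (excluded from compilation) of how the usermode emulation
 * library reaches lx_brandsys() above: brand subcommands are issued through
 * the native brandsys system call.  B_TRUSS_POINT deliberately does
 * nothing, which makes it a convenient anchor point for DTrace when
 * debugging the emulation path.  The extra arguments and the helper name
 * here are illustrative assumptions.
 */
#if 0
#include <sys/syscall.h>
#include <sys/lx_brand.h>       /* B_TRUSS_POINT (path assumed) */
#include <unistd.h>

static void
lx_truss_point(int lx_syscall_num, int err)
{
        (void) syscall(SYS_brand, B_TRUSS_POINT, lx_syscall_num, err);
}
#endif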
1358 
1359 char *
1360 lx_get_zone_kern_version(zone_t *zone)
1361 {
1362         return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
1363 }
1364 
1365 void
1366 lx_set_kern_version(zone_t *zone, char *vers)
1367 {
1368         lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
1369 
1370         (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
1371 }
1372 
1373 /*
1374  * Compare a Linux kernel version to the one set for the zone.
1375  * Returns greater than 0 if the zone version is higher, less than 0 if the
1376  * zone version is lower, and 0 if the versions are equal.
1377  */
1378 int
1379 lx_kern_version_cmp(zone_t *zone, const char *vers)
1380 {
1381         int zvers[3] = {0, 0, 0};
1382         int cvers[3] = {0, 0, 0};
1383         int i;
1384 
1385         VERIFY(zone->zone_brand == &lx_brand);
1386 
1387         (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
1388             &zvers[1], &zvers[2]);
1389         (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
1390 
1391         for (i = 0; i < 3; i++) {
1392                 if (zvers[i] > cvers[i]) {
1393                         return (1);
1394                 } else if (zvers[i] < cvers[i]) {
1395                         return (-1);
1396                 }
1397         }
1398         return (0);
1399 }
1400 
1401 /*
1402  * Linux unconditionally removes the setuid and setgid bits when changing
1403  * file ownership.  This brand hook overrides the illumos native behaviour,
1404  * which is based on the PRIV_FILE_SETID privilege.
1405  */
1406 static int
1407 lx_setid_clear(vattr_t *vap, cred_t *cr)
1408 {
1409         if (S_ISDIR(vap->va_mode)) {
1410                 return (0);
1411         }
1412 
1413         if (vap->va_mode & S_ISUID) {
1414                 vap->va_mask |= AT_MODE;
1415                 vap->va_mode &= ~S_ISUID;
1416         }
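             /*
              * Editorial note: the setgid bit is cleared only when group
              * execute is also set, since S_ISGID without S_IXGRP denotes
              * mandatory locking rather than set-group-ID semantics; Linux
              * appears to preserve that combination across chown as well.
              */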
1417         if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1418                 vap->va_mask |= AT_MODE;
1419                 vap->va_mode &= ~S_ISGID;
1420         }
1421 
1422         return (0);
1423 }
1424 
1425 /*
1426  * Copy the per-process brand data from a parent proc to a child.
1427  */
1428 void
1429 lx_copy_procdata(proc_t *child, proc_t *parent)
1430 {
1431         lx_proc_data_t *cpd = child->p_brand_data;
1432         lx_proc_data_t *ppd = parent->p_brand_data;
1433 
1434         VERIFY(parent->p_brand == &lx_brand);
1435         VERIFY(child->p_brand == &lx_brand);
1436         VERIFY(ppd != NULL);
1437         VERIFY(cpd != NULL);
1438 
1439         *cpd = *ppd;
1440 
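             /*
              * Explicitly re-initialize the emulated ("fake") resource limits
              * below rather than inheriting whatever values the parent held.
              */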
1441         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
1442         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
1443 
1444         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
1445         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
1446 
1447         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
1448         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
1449 
1450         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
1451         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
1452 }
1453 
1454 #if defined(_LP64)
1455 static void
1456 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
1457 {
1458         bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
1459         dst->e_type =        src->e_type;
1460         dst->e_machine =     src->e_machine;
1461         dst->e_version =     src->e_version;
1462         dst->e_entry =       src->e_entry;
1463         dst->e_phoff =       src->e_phoff;
1464         dst->e_shoff =       src->e_shoff;
1465         dst->e_flags =       src->e_flags;
1466         dst->e_ehsize =      src->e_ehsize;
1467         dst->e_phentsize =   src->e_phentsize;
1468         dst->e_phnum =       src->e_phnum;
1469         dst->e_shentsize =   src->e_shentsize;
1470         dst->e_shnum =       src->e_shnum;
1471         dst->e_shstrndx =    src->e_shstrndx;
1472 }
1473 #endif /* _LP64 */
1474 
1475 static void
1476 restoreexecenv(struct execenv *ep, stack_t *sp)
1477 {
1478         klwp_t *lwp = ttolwp(curthread);
1479 
1480         setexecenv(ep);
1481         lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
1482         lwp->lwp_sigaltstack.ss_size = sp->ss_size;
1483         lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
1484 }
1485 
1486 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
1487     long *, int, caddr_t, cred_t *, int *);
1488 
1489 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
1490     long *, int, caddr_t, cred_t *, int *);
1491 
1492 /*
1493  * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
1494  * binaries.
1495  */
1496 static int
1497 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
1498     struct intpdata *idata, int level, long *execsz, int setid,
1499     caddr_t exec_file, struct cred *cred, int *brand_action)
1500 {
1501         int             error;
1502         vnode_t         *nvp;
1503         Ehdr            ehdr;
1504         Addr            uphdr_vaddr;
1505         intptr_t        voffset;
1506         char            *interp = NULL;
1507         uintptr_t       ldaddr = NULL;
1508         int             i;
1509         proc_t          *p = ttoproc(curthread);
1510         klwp_t          *lwp = ttolwp(curthread);
1511         struct execenv  env;
1512         struct execenv  origenv;
1513         stack_t         orig_sigaltstack;
1514         struct user     *up = PTOU(ttoproc(curthread));
1515         lx_elf_data_t   *edp;
1516         char            *lib_path = NULL;
1517 
1518         ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
1519         ASSERT(ttoproc(curthread)->p_brand_data != NULL);
1520 
1521         edp = &ttolxproc(curthread)->l_elf_data;
1522 
1523         if (args->to_model == DATAMODEL_NATIVE) {
1524                 lib_path = LX_LIB_PATH;
1525         }
1526 #if defined(_LP64)
1527         else {
1528                 lib_path = LX_LIB_PATH32;
1529         }
1530 #endif
1531 
1532         /*
1533          * Set the brandname and library name for the new process so that
1534          * elfexec() puts them onto the stack.
1535          */
1536         args->brandname = LX_BRANDNAME;
1537         args->emulator = lib_path;
1538 
1539 #if defined(_LP64)
1540         /*
1541          * To conform with the way Linux lays out the address space, we clamp
1542          * the stack to be the top of the lower region of the x86-64 canonical
1543          * form address space -- which has the side-effect of laying out the
1544          * entire address space in that lower region.  Note that this only
1545          * matters on 64-bit processes (this value will always be greater than
1546          * the size of a 32-bit address space) and doesn't actually affect
1547          * USERLIMIT:  if a Linux-branded processes wishes to map something
1548          * into the top half of the address space, it can do so -- but with
1549          * the user stack starting at the top of the bottom region, those high
1550          * virtual addresses won't be used unless explicitly directed.
1551          */
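             /*
              * Editorial note: on x86-64, the lower canonical half of the
              * address space ends at 0x00007fffffffffff.
              */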
1552         args->maxstack = lx_maxstack64;
1553 #endif
1554 
1555         /*
1556          * We will first exec the brand library, then map in the Linux
1557          * executable and the Linux linker.
1558          */
1559         if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
1560             &nvp))) {
1561                 uprintf("%s: not found.", lib_path);
1562                 return (error);
1563         }
1564 
1565         /*
1566          * We will eventually set the p_exec member to be the vnode for the new
1567          * executable when we call setexecenv(). However, if we get an error
1568          * before that call we need to restore the execenv to its original
1569          * values so that when we return to the caller fop_close() works
1570          * properly while cleaning up from the failed exec().  Restoring the
1571          * original value will also ensure that the extra hold we took on the
1572          * brand library is properly released.
1573          */
1574         origenv.ex_bssbase = p->p_bssbase;
1575         origenv.ex_brkbase = p->p_brkbase;
1576         origenv.ex_brksize = p->p_brksize;
1577         origenv.ex_vp = p->p_exec;
1578         orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
1579         orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
1580         orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
1581 
1582         if (args->to_model == DATAMODEL_NATIVE) {
1583                 error = elfexec(nvp, uap, args, idata, level + 1, execsz,
1584                     setid, exec_file, cred, brand_action);
1585         }
1586 #if defined(_LP64)
1587         else {
1588                 error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
1589                     setid, exec_file, cred, brand_action);
1590         }
1591 #endif
1592         VN_RELE(nvp);
1593         if (error != 0) {
1594                 restoreexecenv(&origenv, &orig_sigaltstack);
1595                 return (error);
1596         }
1597 
1598         /*
1599          * We exec'd the brand library above.  The u_auxv vectors are now
1600          * set up by elfexec() to point to the brand emulation library and
1601          * its linker.
1602          */
1603 
1604         bzero(&env, sizeof (env));
1605 
1606         /*
1607          * map in the Linux executable
1608          */
1609         if (args->to_model == DATAMODEL_NATIVE) {
1610                 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
1611                     &voffset, exec_file, &interp, &env.ex_bssbase,
1612                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1613         }
1614 #if defined(_LP64)
1615         else {
1616                 Elf32_Ehdr      ehdr32;
1617                 Elf32_Addr      uphdr_vaddr32;
1618 
1619                 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
1620                     &voffset, exec_file, &interp, &env.ex_bssbase,
1621                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1622 
1623                 Ehdr32to64(&ehdr32, &ehdr);
1624 
1625                 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1626                         uphdr_vaddr = (Addr)-1;
1627                 else
1628                         uphdr_vaddr = uphdr_vaddr32;
1629         }
1630 #endif
1631         if (error != 0) {
1632                 restoreexecenv(&origenv, &orig_sigaltstack);
1633 
1634                 if (interp != NULL)
1635                         kmem_free(interp, MAXPATHLEN);
1636 
1637                 return (error);
1638         }
1639 
1640         /*
1641          * Save off the important properties of the lx executable. The brand
1642          * library will ask us for this data later, when it is ready to set
1643          * things up for the lx executable.
1644          */
1645         edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
1646             voffset + uphdr_vaddr;
1647         edp->ed_entry = voffset + ehdr.e_entry;
1648         edp->ed_phent = ehdr.e_phentsize;
1649         edp->ed_phnum = ehdr.e_phnum;
1650 
1651         if (interp != NULL) {
1652                 if (ehdr.e_type == ET_DYN) {
1653                         /*
1654                          * This is a shared object executable, so we need to
1655                          * pick a reasonable place to put the heap. Just don't
1656                          * use the first page.
1657                          */
1658                         env.ex_brkbase = (caddr_t)PAGESIZE;
1659                         env.ex_bssbase = (caddr_t)PAGESIZE;
1660                 }
1661 
1662                 /*
1663                  * If the program needs an interpreter (most do), map it in and
1664                  * store relevant information about it in the aux vector, where
1665                  * the brand library can find it.
1666                  */
1667                 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
1668                     NULLVPP, &nvp))) {
1669                         uprintf("%s: not found.", interp);
1670                         restoreexecenv(&origenv, &orig_sigaltstack);
1671                         kmem_free(interp, MAXPATHLEN);
1672                         return (error);
1673                 }
1674 
1675                 kmem_free(interp, MAXPATHLEN);
1676                 interp = NULL;
1677 
1678                 /*
1679                  * map in the Linux linker
1680                  */
1681                 if (args->to_model == DATAMODEL_NATIVE) {
1682                         error = mapexec_brand(nvp, args, &ehdr,
1683                             &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
1684                             NULL, NULL, NULL, &ldaddr);
1685                 }
1686 #if defined(_LP64)
1687                 else {
1688                         Elf32_Ehdr      ehdr32;
1689                         Elf32_Addr      uphdr_vaddr32;
1690 
1691                         error = mapexec32_brand(nvp, args, &ehdr32,
1692                             &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
1693                             NULL, NULL, NULL, &ldaddr);
1694 
1695                         Ehdr32to64(&ehdr32, &ehdr);
1696 
1697                         if (uphdr_vaddr32 == (Elf32_Addr)-1)
1698                                 uphdr_vaddr = (Addr)-1;
1699                         else
1700                                 uphdr_vaddr = uphdr_vaddr32;
1701                 }
1702 #endif
1703 
1704                 VN_RELE(nvp);
1705                 if (error != 0) {
1706                         restoreexecenv(&origenv, &orig_sigaltstack);
1707                         return (error);
1708                 }
1709 
1710                 /*
1711                  * Now that we know the base address of the brand's linker,
1712                  * we also save this for later use by the brand library.
1713                  */
1714                 edp->ed_base = voffset;
1715                 edp->ed_ldentry = voffset + ehdr.e_entry;
1716         } else {
1717                 /*
1718                  * This program has no interpreter. The lx brand library will
1719                  * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
1720                  * so in this case, put the entry point of the main executable
1721                  * there.
1722                  */
1723                 if (ehdr.e_type == ET_EXEC) {
1724                         /*
1725                          * An ET_EXEC object with no interpreter must be a
1726                          * statically linked executable, which means we loaded
1727                          * it at the address specified in the ELF header; in
1728                          * that case the e_entry field of the ELF header is an
1729                          * absolute address.
1730                          */
1731                         edp->ed_ldentry = ehdr.e_entry;
1732                         edp->ed_entry = ehdr.e_entry;
1733                 } else {
1734                         /*
1735                          * A shared object with no interpreter; we use the
1736                          * calculated address from above.
1737                          */
1738                         edp->ed_ldentry = edp->ed_entry;
1739 
1740                         /*
1741                          * In all situations except an ET_DYN elf object with no
1742                          * interpreter, we want to leave the brk and base
1743                          * values set by mapexec_brand alone. Normally when
1744                          * running ET_DYN objects on Solaris (most likely
1745                          * /lib/ld.so.1) the kernel sets brk and base to 0 since
1746                          * it doesn't know where to put the heap, and later the
1747                          * linker will call brk() to initialize the heap in:
1748                          *      usr/src/cmd/sgs/rtld/common/setup.c:setup()
1749                          * after it has determined where to put it.  (This
1750                          * decision is made after the linker loads and inspects
1751                          * elf properties of the target executable being run.)
1752                          *
1753                          * So for ET_DYN Linux executables, we also don't know
1754                          * where the heap should go, so we'll set the brk and
1755                          * base to 0.  But in this case the Solaris linker will
1756                          * not initialize the heap, so when the Linux linker
1757                          * starts running there is no heap allocated.  This
1758                          * seems to be ok on Linux 2.4 based systems because the
1759                          * Linux linker/libc fall back to using mmap() to
1760                          * allocate memory. But on 2.6 systems, running
1761                          * applications by specifying them as command line
1762                          * arguments to the linker results in segfaults for an
1763                          * as yet undetermined reason (which seems to indicate
1764                          * that a more permanent fix for heap initialization in
1765                          * these cases may be necessary).
1766                          */
1767                         if (ehdr.e_type == ET_DYN) {
1768                                 env.ex_bssbase = (caddr_t)0;
1769                                 env.ex_brkbase = (caddr_t)0;
1770                                 env.ex_brksize = 0;
1771                         }
1772                 }
1773 
1774         }
1775 
1776         env.ex_vp = vp;
1777         setexecenv(&env);
1778 
1779         /*
1780          * We try to keep /proc's view of the aux vector consistent with
1781          * what's on the process stack.
1782          */
1783         if (args->to_model == DATAMODEL_NATIVE) {
1784                 auxv_t phdr_auxv[4] = {
1785                     { AT_SUN_BRAND_LX_PHDR, 0 },
1786                     { AT_SUN_BRAND_LX_INTERP, 0 },
1787                     { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
1788                     { AT_SUN_BRAND_AUX4, 0 }
1789                 };
1790                 phdr_auxv[0].a_un.a_val = edp->ed_phdr;
1791                 phdr_auxv[1].a_un.a_val = ldaddr;
1792                 phdr_auxv[2].a_un.a_val = 1;    /* set in lx_init */
1793                 phdr_auxv[3].a_type = AT_CLKTCK;
1794                 phdr_auxv[3].a_un.a_val = hz;
1795 
1796                 if (copyout(&phdr_auxv, args->auxp_brand,
1797                     sizeof (phdr_auxv)) == -1)
1798                         return (EFAULT);
1799         }
1800 #if defined(_LP64)
1801         else {
1802                 auxv32_t phdr_auxv32[3] = {
1803                     { AT_SUN_BRAND_LX_PHDR, 0 },
1804                     { AT_SUN_BRAND_LX_INTERP, 0 },
1805                     { AT_SUN_BRAND_AUX3, 0 }
1806                 };
1807                 phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
1808                 phdr_auxv32[1].a_un.a_val = ldaddr;
1809                 phdr_auxv32[2].a_type = AT_CLKTCK;
1810                 phdr_auxv32[2].a_un.a_val = hz;
1811 
1812                 if (copyout(&phdr_auxv32, args->auxp_brand,
1813                     sizeof (phdr_auxv32)) == -1)
1814                         return (EFAULT);
1815         }
1816 #endif
1817 
1818         /*
1819          * /proc uses the AT_ENTRY aux vector entry to deduce
1820          * the location of the executable in the address space. The user
1821          * structure contains a copy of the aux vector that needs to have those
1822          * entries patched with the values of the real lx executable (they
1823          * currently contain the values from the lx brand library that was
1824          * elfexec'd, above).
1825          *
1826          * For live processes, AT_BASE is used to locate the linker segment,
1827          * which /proc and friends will later use to find Solaris symbols
1828          * (such as rtld_db_preinit). However, for core files, /proc uses
1829          * AT_ENTRY to find the right segment to label as the executable.
1830          * So we set AT_ENTRY to be the entry point of the linux executable,
1831          * but leave AT_BASE to be the address of the Solaris linker.
1832          */
1833         for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
1834                 switch (up->u_auxv[i].a_type) {
1835                 case AT_ENTRY:
1836                         up->u_auxv[i].a_un.a_val = edp->ed_entry;
1837                         break;
1838 
1839                 case AT_SUN_BRAND_LX_PHDR:
1840                         up->u_auxv[i].a_un.a_val = edp->ed_phdr;
1841                         break;
1842 
1843                 case AT_SUN_BRAND_LX_INTERP:
1844                         up->u_auxv[i].a_un.a_val = ldaddr;
1845                         break;
1846 
1847                 default:
1848                         break;
1849                 }
1850         }
1851 
1852         return (0);
1853 }
1854 
1855 boolean_t
1856 lx_native_exec(uint8_t osabi, const char **interp)
1857 {
1858         if (osabi != ELFOSABI_SOLARIS)
1859                 return (B_FALSE);
1860 
1861         /*
1862          * If the process root matches the zone root, prepend /native to the
1863          * interpreter path for native executables.  Absolute precision from
1864          * VN_CMP is not necessary since any change of process root is likely
1865          * to make native binaries inaccessible via /native.
1866          *
1867          * Processes which chroot directly into /native will be able to
1868          * function as expected with no need for the prefix.
1869          */
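             /*
              * Illustrative example (not in the original source): with the
              * prefix applied, a native binary whose PT_INTERP is
              * /usr/lib/ld.so.1 has its interpreter resolved as
              * /native/usr/lib/ld.so.1.
              */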
1870         if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
1871                 *interp = "/native";
1872         }
1873 
1874         return (B_TRUE);
1875 }
1876 
1877 static void
1878 lx_syscall_init(void)
1879 {
1880         int i;
1881 
1882         /*
1883          * Count up the 32-bit Linux system calls.  Note that lx_sysent32
1884          * has (LX_NSYSCALLS + 1) entries.
1885          */
1886         for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
1887                 continue;
1888         lx_nsysent32 = i;
1889 
1890 #if defined(_LP64)
1891         /*
1892          * Count up the 64-bit Linux system calls.  Note that lx_sysent64
1893          * has (LX_NSYSCALLS + 1) entries.
1894          */
1895         for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
1896                 continue;
1897         lx_nsysent64 = i;
1898 #endif
1899 }
1900 
1901 int
1902 _init(void)
1903 {
1904         int err = 0;
1905 
1906         lx_syscall_init();
1907         lx_pid_init();
1908         lx_ioctl_init();
1909         lx_futex_init();
1910         lx_ptrace_init();
1911         lx_socket_init();
1912 
1913         err = mod_install(&modlinkage);
1914         if (err != 0) {
1915                 cmn_err(CE_WARN, "Couldn't install lx brand module");
1916 
1917                 /*
1918                  * This looks drastic, but it should never happen.  These
1919                  * data structures should be completely free-able until
1920                  * they are used by Linux processes.  Since the brand
1921                  * wasn't loaded there should be no Linux processes, and
1922                  * thus no way for these data structures to be modified.
1923                  */
1924                 lx_pid_fini();
1925                 lx_ioctl_fini();
1926                 if (lx_futex_fini())
1927                         panic("lx brand module cannot be loaded or unloaded.");
1928         }
1929         return (err);
1930 }
1931 
1932 int
1933 _info(struct modinfo *modinfop)
1934 {
1935         return (mod_info(&modlinkage, modinfop));
1936 }
1937 
1938 int
1939 _fini(void)
1940 {
1941         int err;
1942         int futex_done = 0;
1943 
1944         /*
1945          * If there are any zones using this brand, we can't allow it to be
1946          * unloaded.
1947          */
1948         if (brand_zone_count(&lx_brand))
1949                 return (EBUSY);
1950 
1951         lx_ptrace_fini();
1952         lx_pid_fini();
1953         lx_ioctl_fini();
1954         lx_socket_fini();
1955 
1956         if ((err = lx_futex_fini()) != 0) {
1957                 goto done;
1958         }
1959         futex_done = 1;
1960 
1961         err = mod_remove(&modlinkage);
1962 
1963 done:
1964         if (err) {
1965                 /*
1966                  * If we can't unload the module, then we have to get it
1967                  * back into a sane state.
1968                  */
1969                 lx_ptrace_init();
1970                 lx_pid_init();
1971                 lx_ioctl_init();
1972                 lx_socket_init();
1973 
1974                 if (futex_done) {
1975                         lx_futex_init();
1976                 }
1977         }
1978 
1979         return (err);
1980 }