1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2015, Joyent, Inc. All rights reserved.
  29  */
  30 
  31 /*
  32  * The LX Brand: emulation of a Linux operating environment within a zone.
  33  *
  34  * OVERVIEW
  35  *
  36  * The LX brand enables a full Linux userland -- including a C library,
  37  * init(1) framework, and some set of applications -- to run unmodified
  38  * within an illumos zone.  Unlike illumos, where applications are expected
  39  * to link against and consume functions exported from libraries, the
  40  * supported Linux binary compatibility boundary is the system call
  41  * interface.  By accurately emulating the behaviour of Linux system calls,
  42  * Linux software can be executed in this environment as if it were running
  43  * on a native Linux system.
  44  *
  45  * EMULATING LINUX SYSTEM CALLS
  46  *
  47  * Linux system calls are made in 32-bit processes via the "int 0x80"
  48  * instruction; in 64-bit processes the "syscall" instruction is used, as it
  49  * is with native illumos processes.  In both cases, arguments to system
  50  * calls are generally passed in registers and the usermode stack is not
  51  * interpreted or modified by the Linux kernel.
  52  *
  53  * When the emulated Linux process makes a system call, it traps into the
  54  * illumos kernel.  The in-kernel brand module contains various emulation
  55  * routines, and can fully service some emulated system calls; e.g. read(2)
  56  * and write(2).  Other system calls require assistance from the illumos
  57  * libc, bouncing back out to the brand library ("lx_brand.so.1") for
  58  * emulation.
  59  *
  60  * The brand mechanism allows for the provision of an alternative trap
  61  * handler for the various system call mechanisms.  Traditionally this was
  62  * used to immediately revector execution to the usermode emulation library,
  63  * which was responsible for handling all system calls.  In the interests of
  64  * more accurate emulation and increased performance, much of the regular
  65  * illumos system call path is now invoked.  Only the argument processing and
  66  * handler dispatch are replaced by the brand, via the per-LWP
  67  * "lwp_brand_syscall" interposition function pointer.
  68  *
  69  * THE NATIVE AND BRAND STACKS
  70  *
  71  * Some runtime environments (e.g. the Go language) allocate very small
  72  * thread stacks, preferring to grow or split the stack as necessary.  The
  73  * Linux kernel generally does not use the usermode stack when servicing
  74  * system calls, so this is not a problem.  In order for our emulation to
  75  * have the same zero stack impact, we must execute usermode emulation
  76  * routines on an _alternate_ stack.  This is similar, in principle, to the
  77  * use of sigaltstack(3C) to run signal handlers off the main thread stack.
  78  *
  79  * To this end, the brand library allocates and installs an alternate stack
  80  * (called the "native" stack) for each LWP.  The in-kernel brand code uses
  81  * this stack for usermode emulation calls and interposed signal delivery,
  82  * while the emulated Linux process sees only the data on the main thread
  83  * stack, known as the "brand" stack.  The stack mode is tracked in the
  84  * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
  85  *
  86  * The stack mode doubles as a system call "mode bit".  When in the
  87  * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
  88  * system calls.  In other modes, system calls are assumed to be native
  89  * illumos system calls as made during brand library initialisation and
  90  * usermode emulation.
  91  *
  92  * USERMODE EMULATION
  93  *
  94  * When a Linux system call cannot be emulated within the kernel, we preserve
  95  * the register state of the Linux process and revector the LWP to the brand
  96  * library usermode emulation handler: the "lx_emulate()" function in
  97  * "lx_brand.so.1".  This revectoring is modelled on the delivery of signals,
  98  * and is performed in "lx_emulate_user()".
  99  *
 100  * First, the emulated process state is written out to the usermode stack of
 101  * the process as a "ucontext_t" object.  Arguments to the emulation routine
 102  * are passed on the stack or in registers, depending on the ABI.  When the
 103  * usermode emulation is complete, the result is passed back to the kernel
 104  * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
 105  * for restoration.
 106  *
 107  * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
 108  *
 109  * When servicing emulated system calls in the usermode brand library, or
 110  * during signal delivery, various state is preserved by the kernel so that
 111  * the running LWP may be revectored to a handling routine.  The context
 112  * allows the kernel to restart the program at the point of interruption,
 113  * either at the return of the signal handler, via setcontext(3C); or after
 114  * the usermode emulation request has been serviced, via B_EMULATION_DONE.
 115  *
 116  * In illumos native processes, the saved context (a "ucontext_t" object)
 117  * includes the state of registers and the current signal mask at the point
 118  * of interruption.  The context also includes a link to the most recently
 119  * saved context, forming a chain to be unwound as requests complete.  The LX
 120  * brand requires additional book-keeping to describe the machine state: in
 121  * particular, the current stack mode and the occupied extent of the native
 122  * stack.
 123  *
 124  * The brand code is able to interpose on the context save and restore
 125  * operations in the kernel -- see "lx_savecontext()" and
 126  * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
 127  * function correctly in the face of a dual stack LWP.  The brand also
 128  * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
 129  * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
 130  * library interposer on the native stack, regardless of the interrupted
 131  * execution mode.  Linux sigaltstack(2) emulation is performed entirely by
 132  * the usermode brand library during signal handler interposition.
 133  */
 134 
 135 #include <sys/types.h>
 136 #include <sys/kmem.h>
 137 #include <sys/errno.h>
 138 #include <sys/thread.h>
 139 #include <sys/systm.h>
 140 #include <sys/syscall.h>
 141 #include <sys/proc.h>
 142 #include <sys/modctl.h>
 143 #include <sys/cmn_err.h>
 144 #include <sys/model.h>
 145 #include <sys/exec.h>
 146 #include <sys/lx_impl.h>
 147 #include <sys/machbrand.h>
 148 #include <sys/lx_syscalls.h>
 149 #include <sys/lx_misc.h>
 150 #include <sys/lx_futex.h>
 151 #include <sys/lx_brand.h>
 152 #include <sys/param.h>
 153 #include <sys/termios.h>
 154 #include <sys/sunddi.h>
 155 #include <sys/ddi.h>
 156 #include <sys/vnode.h>
 157 #include <sys/pathname.h>
 158 #include <sys/auxv.h>
 159 #include <sys/priv.h>
 160 #include <sys/regset.h>
 161 #include <sys/privregs.h>
 162 #include <sys/archsystm.h>
 163 #include <sys/zone.h>
 164 #include <sys/brand.h>
 165 #include <sys/sdt.h>
 166 #include <sys/x86_archext.h>
 167 #include <sys/controlregs.h>
 168 #include <sys/core.h>
 169 #include <sys/stack.h>
 170 #include <sys/stat.h>
 171 #include <sys/socket.h>
 172 #include <lx_signum.h>
 173 #include <util/sscanf.h>
 174 
 175 int     lx_debug = 0;
 176 
 177 void    lx_init_brand_data(zone_t *);
 178 void    lx_free_brand_data(zone_t *);
 179 void    lx_setbrand(proc_t *);
 180 int     lx_getattr(zone_t *, int, void *, size_t *);
 181 int     lx_setattr(zone_t *, int, void *, size_t);
 182 int     lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
 183                 uintptr_t, uintptr_t);
 184 void    lx_set_kern_version(zone_t *, char *);
 185 void    lx_copy_procdata(proc_t *, proc_t *);
 186 
 187 extern int getsetcontext(int, void *);
 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int);
 189 #if defined(_SYSCALL32_IMPL)
 190 extern int getsetcontext32(int, void *);
 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
 192 #endif
 193 
 194 extern void lx_proc_exit(proc_t *);
 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
 196 
 197 extern void lx_ioctl_init();
 198 extern void lx_ioctl_fini();
 199 extern void lx_socket_init();
 200 extern void lx_socket_fini();
 201 
 202 lx_systrace_f *lx_systrace_entry_ptr;
 203 lx_systrace_f *lx_systrace_return_ptr;
 204 
 205 static int lx_systrace_enabled;
 206 
 207 /*
 208  * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
 209  * want an MMU dependency here (and should there be a microprocessor without
 210  * a hole, we don't want to start allocating from the top of the VA range).
 211  */
 212 #define LX_MAXSTACK64   0x7ffffff00000
 213 
 214 uint64_t lx_maxstack64 = LX_MAXSTACK64;
 215 
 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
 217     struct intpdata *idata, int level, long *execsz, int setid,
 218     caddr_t exec_file, struct cred *cred, int *brand_action);
 219 
 220 static boolean_t lx_native_exec(uint8_t, const char **);
 221 static uint32_t lx_map32limit(proc_t *);
 222 
 223 static void lx_savecontext(ucontext_t *);
 224 static void lx_restorecontext(ucontext_t *);
 225 static caddr_t lx_sendsig_stack(int);
 226 static void lx_sendsig(int);
 227 #if defined(_SYSCALL32_IMPL)
 228 static void lx_savecontext32(ucontext32_t *);
 229 #endif
 230 static int lx_setid_clear(vattr_t *, cred_t *);
 231 #if defined(_LP64)
 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
 233     enum seg_rw);
 234 #endif
 235 
 236 
  237 /*
       * lx brand: the brand operations vector, referenced by "lx_brand" below.
       *
       * The initializer is positional; the trailing comment on each entry names
       * the brand_ops member it is intended to fill.  A NULL entry means this
       * brand supplies no hook for that member.  Note that the b_savecontext32
       * entry is only present when _SYSCALL32_IMPL is defined, and that the
       * final (b_pagefault) entry is only populated for _LP64 builds.
       */
  238 struct brand_ops lx_brops = {
  239         lx_init_brand_data,             /* b_init_brand_data */
  240         lx_free_brand_data,             /* b_free_brand_data */
  241         lx_brandsys,                    /* b_brandsys */
  242         lx_setbrand,                    /* b_setbrand */
  243         lx_getattr,                     /* b_getattr */
  244         lx_setattr,                     /* b_setattr */
  245         lx_copy_procdata,               /* b_copy_procdata */
  246         lx_proc_exit,                   /* b_proc_exit */
  247         lx_exec,                        /* b_exec */
  248         lx_setrval,                     /* b_lwp_setrval */
  249         lx_lwpdata_alloc,               /* b_lwpdata_alloc */
  250         lx_lwpdata_free,                /* b_lwpdata_free */
  251         lx_initlwp,                     /* b_initlwp */
  252         lx_forklwp,                     /* b_forklwp */
  253         lx_freelwp,                     /* b_freelwp */
  254         lx_exitlwp,                     /* b_lwpexit */
  255         lx_elfexec,                     /* b_elfexec */
  256         NULL,                           /* b_sigset_native_to_brand */
  257         NULL,                           /* b_sigset_brand_to_native */
  258         lx_sigfd_translate,             /* b_sigfd_translate */
  259         NSIG,                           /* b_nsig */
  260         lx_exit_with_sig,               /* b_exit_with_sig */
  261         lx_wait_filter,                 /* b_wait_filter */
  262         lx_native_exec,                 /* b_native_exec */
  263         lx_map32limit,                  /* b_map32limit */
  264         lx_stop_notify,                 /* b_stop_notify */
  265         lx_waitid_helper,               /* b_waitid_helper */
  266         lx_sigcld_repost,               /* b_sigcld_repost */
  267         lx_ptrace_issig_stop,           /* b_issig_stop */
  268         lx_ptrace_sig_ignorable,        /* b_sig_ignorable */
  269         lx_savecontext,                 /* b_savecontext */
  270 #if defined(_SYSCALL32_IMPL)
  271         lx_savecontext32,               /* b_savecontext32 */
  272 #endif
  273         lx_restorecontext,              /* b_restorecontext */
  274         lx_sendsig_stack,               /* b_sendsig_stack */
  275         lx_sendsig,                     /* b_sendsig */
  276         lx_setid_clear,                 /* b_setid_clear */
  277 #if defined(_LP64)
  278         lx_pagefault                    /* b_pagefault */
  279 #else
  280         NULL                            /* b_pagefault (no hook) */
  281 #endif
  282 };
 283 
  284 struct brand_mach_ops lx_mops = {
              /*
               * Machine-dependent brand operations, referenced by "lx_brand"
               * below.  The initializer is positional: the first five members
               * are left NULL and only the last two are supplied
               * (lx_fixsegreg and lx_fsbase).
               * NOTE(review): member names are not visible here -- confirm
               * against the brand_mach_ops definition.
               */
  285         NULL,
  286         NULL,
  287         NULL,
  288         NULL,
  289         NULL,
  290         lx_fixsegreg,
  291         lx_fsbase
  292 };
 293 
  294 struct brand lx_brand = {
              /*
               * The brand descriptor: brand interface version, the brand name
               * ("lx"), the two operations vectors defined above, and the size
               * of struct lx_proc_data (presumably the per-process brand data
               * allocated by the framework -- TODO confirm).
               */
  295         BRAND_VER_1,
  296         "lx",
  297         &lx_brops,
  298         &lx_mops,
  299         sizeof (struct lx_proc_data)
  300 };
 301 
  302 static struct modlbrand modlbrand = {
              /*
               * Brand-module linkage element: the mod_brandops vector, the
               * descriptive string "lx brand", and a pointer to lx_brand.
               */
  303         &mod_brandops, "lx brand", &lx_brand
  304 };
 305 
  306 static struct modlinkage modlinkage = {
              /*
               * Module linkage: revision MODREV_1, with modlbrand as the only
               * linkage element followed by a terminating NULL.
               */
  307         MODREV_1, (void *)&modlbrand, NULL
  308 };
 309 
 310 void
 311 lx_proc_exit(proc_t *p)
 312 {
 313         lx_proc_data_t *lxpd;
 314         proc_t *cp;
 315 
 316         mutex_enter(&p->p_lock);
 317         VERIFY(lxpd = ptolxproc(p));
 318         if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
 319                 mutex_exit(&p->p_lock);
 320                 return;
 321         }
 322         mutex_exit(&p->p_lock);
 323 
 324         /* Check for children which desire notification of parental death. */
 325         mutex_enter(&pidlock);
 326         for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
 327                 mutex_enter(&cp->p_lock);
 328                 if ((lxpd = ptolxproc(cp)) == NULL) {
 329                         mutex_exit(&cp->p_lock);
 330                         continue;
 331                 }
 332                 if (lxpd->l_parent_deathsig != 0) {
 333                         sigtoproc(p, NULL, lxpd->l_parent_deathsig);
 334                 }
 335                 mutex_exit(&cp->p_lock);
 336         }
 337         mutex_exit(&pidlock);
 338 }
 339 
 340 void
 341 lx_setbrand(proc_t *p)
 342 {
 343         /* Send SIGCHLD to parent by default when child exits */
 344         ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
 345 }
 346 
 347 /* ARGSUSED */
 348 int
 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
 350 {
 351         char vers[LX_VERS_MAX];
 352 
 353         if (attr == LX_KERN_VERSION_NUM) {
 354                 if (bufsize > (LX_VERS_MAX - 1))
 355                         return (ERANGE);
 356                 bzero(vers, LX_VERS_MAX);
 357                 if (copyin(buf, &vers, bufsize) != 0)
 358                         return (EFAULT);
 359                 lx_set_kern_version(zone, vers);
 360                 return (0);
 361         }
 362         return (EINVAL);
 363 }
 364 
 365 /* ARGSUSED */
 366 int
 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
 368 {
 369         if (attr == LX_KERN_VERSION_NUM) {
 370                 if (*bufsize < LX_VERS_MAX)
 371                         return (ERANGE);
 372                 if (copyout(lx_get_zone_kern_version(curzone), buf,
 373                     LX_VERS_MAX) != 0)
 374                         return (EFAULT);
 375                 *bufsize = LX_VERS_MAX;
 376                 return (0);
 377         }
 378         return (-EINVAL);
 379 }
 380 
 381 uint32_t
 382 lx_map32limit(proc_t *p)
 383 {
 384         /*
 385          * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
 386          * allow mappings in the first 31 bits.  This was a nuance in the
 387          * original Linux implementation circa 2002, and applications have
 388          * come to depend on its behavior.
 389          *
 390          * This is only relevant for 64-bit processes.
 391          */
 392         if (p->p_model == DATAMODEL_LP64)
 393                 return (1 << 31);
 394 
 395         return ((uint32_t)USERLIMIT32);
 396 }
 397 
 398 void
 399 lx_brand_systrace_enable(void)
 400 {
 401         VERIFY(!lx_systrace_enabled);
 402 
 403         lx_systrace_enabled = 1;
 404 }
 405 
 406 void
 407 lx_brand_systrace_disable(void)
 408 {
 409         VERIFY(lx_systrace_enabled);
 410 
 411         lx_systrace_enabled = 0;
 412 }
 413 
 414 void
 415 lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
 416 {
 417         VERIFY(lwpd->br_ntv_stack != 0);
 418 
 419         /*
 420          * The "brand-lx-set-ntv-stack-current" probe has arguments:
 421          *   arg0: stack pointer before change
 422          *   arg1: stack pointer after change
 423          *   arg2: current stack base
 424          */
 425         DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
 426             uintptr_t, lwpd->br_ntv_stack_current,
 427             uintptr_t, new_sp,
 428             uintptr_t, lwpd->br_ntv_stack);
 429 
 430         lwpd->br_ntv_stack_current = new_sp;
 431 }
 432 
 433 #if defined(_LP64)
 434 static int
 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
 436     enum seg_rw rw)
 437 {
 438         int syscall_num;
 439 
 440         /*
 441          * We only want to handle a very specific set of circumstances.
 442          * Namely: this is a 64-bit LX-branded process attempting to execute an
 443          * address in a page for which it does not have a valid mapping.  If
 444          * this is not the case, we bail out as fast as possible.
 445          */
 446         VERIFY(PROC_IS_BRANDED(p));
 447         if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
 448             DATAMODEL_NATIVE) {
 449                 return (-1);
 450         }
 451 
 452         if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
 453                 return (-1);
 454         }
 455 
 456         /*
 457          * This is a valid vsyscall address.  We service the system call and
 458          * return 0 to signal that the pagefault has been handled completely.
 459          */
 460         lx_vsyscall_enter(p, lwp, syscall_num);
 461         return (0);
 462 }
 463 #endif
 464 
 465 /*
 466  * This hook runs prior to sendsig() processing and allows us to nominate
 467  * an alternative stack pointer for delivery of the signal handling frame.
 468  * Critically, this routine should _not_ modify any LWP state as the
 469  * savecontext() does not run until after this hook.
 470  */
 471 static caddr_t
 472 lx_sendsig_stack(int sig)
 473 {
 474         klwp_t *lwp = ttolwp(curthread);
 475         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 476 
 477         /*
 478          * We want to take signal delivery on the native stack, but only if
 479          * one has been allocated and installed for this LWP.
 480          */
 481         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 482                 /*
 483                  * The program is not running on the native stack.  Return
 484                  * the native stack pointer from our brand-private data so
 485                  * that we may switch to it for signal handling.
 486                  */
 487                 return ((caddr_t)lwpd->br_ntv_stack_current);
 488         } else {
 489                 struct regs *rp = lwptoregs(lwp);
 490 
 491                 /*
 492                  * Either the program is already running on the native stack,
 493                  * or one has not yet been allocated for this LWP.  Use the
 494                  * current stack pointer value.
 495                  */
 496                 return ((caddr_t)rp->r_sp);
 497         }
 498 }
 499 
 500 /*
 501  * This hook runs after sendsig() processing and allows us to update the
 502  * per-LWP mode flags for system calls and stacks.  The pre-signal
 503  * context has already been saved and delivered to the user at this point.
 504  */
 505 static void
 506 lx_sendsig(int sig)
 507 {
 508         klwp_t *lwp = ttolwp(curthread);
 509         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 510         struct regs *rp = lwptoregs(lwp);
 511 
 512         switch (lwpd->br_stack_mode) {
 513         case LX_STACK_MODE_BRAND:
 514         case LX_STACK_MODE_NATIVE:
 515                 /*
 516                  * In lx_sendsig_stack(), we nominated a stack pointer from the
 517                  * native stack.  Update the stack mode, and the current in-use
 518                  * extent of the native stack, accordingly:
 519                  */
 520                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 521                 lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
 522 
 523                 /*
 524                  * Fix up segment registers, etc.
 525                  */
 526                 lx_switch_to_native(lwp);
 527                 break;
 528 
 529         default:
 530                 /*
 531                  * Otherwise, the brand library has not yet installed the
 532                  * alternate stack for this LWP.  Signals will be handled on
 533                  * the regular stack thread.
 534                  */
 535                 return;
 536         }
 537 }
 538 
 539 /*
 540  * This hook runs prior to the context restoration, allowing us to take action
 541  * or modify the context before it is loaded.
 542  */
 543 static void
 544 lx_restorecontext(ucontext_t *ucp)
 545 {
 546         klwp_t *lwp = ttolwp(curthread);
 547         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 548         uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
 549         caddr_t sp = ucp->uc_brand_data[1];
 550 
 551         /*
 552          * We have a saved native stack pointer value that we must restore
 553          * into the per-LWP data.
 554          */
 555         if (flags & LX_UC_RESTORE_NATIVE_SP) {
 556                 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
 557         }
 558 
 559         /*
 560          * We do not wish to restore the value of uc_link in this context,
 561          * so replace it with the value currently in the LWP.
 562          */
 563         if (flags & LX_UC_IGNORE_LINK) {
 564                 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
 565         }
 566 
 567         /*
 568          * Restore the stack mode:
 569          */
 570         if (flags & LX_UC_STACK_NATIVE) {
 571                 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
 572         } else if (flags & LX_UC_STACK_BRAND) {
 573                 lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
 574         }
 575 
 576 #if defined(__amd64)
 577         /*
 578          * Override the fs/gsbase in the context with the value provided
 579          * through the Linux arch_prctl(2) system call.
 580          */
 581         if (flags & LX_UC_STACK_BRAND) {
 582                 if (lwpd->br_lx_fsbase != 0) {
 583                         ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
 584                 }
 585                 if (lwpd->br_lx_gsbase != 0) {
 586                         ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
 587                 }
 588         }
 589 #endif
 590 }
 591 
 592 static void
 593 lx_savecontext(ucontext_t *ucp)
 594 {
 595         klwp_t *lwp = ttolwp(curthread);
 596         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 597         uintptr_t flags = 0;
 598 
 599         /*
 600          * The ucontext_t affords us three private pointer-sized members in
 601          * "uc_brand_data".  We pack a variety of flags into the first element,
 602          * and an optional stack pointer in the second element.  The flags
 603          * determine which stack pointer (native or brand), if any, is stored
 604          * in the second element.  The third element may contain the system
 605          * call number; this is analogous to the "orig_[er]ax" member of a
 606          * Linux "user_regs_struct".
 607          */
 608 
 609         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 610             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 611                 /*
 612                  * Record the value of the native stack pointer to restore
 613                  * when returning to this branded context:
 614                  */
 615                 flags |= LX_UC_RESTORE_NATIVE_SP;
 616                 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
 617         }
 618 
 619         /*
 620          * Save the stack mode:
 621          */
 622         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 623                 flags |= LX_UC_STACK_NATIVE;
 624         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 625                 flags |= LX_UC_STACK_BRAND;
 626         }
 627 
 628         /*
 629          * If we might need to restart this system call, save that information
 630          * in the context:
 631          */
 632         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 633                 ucp->uc_brand_data[2] =
 634                     (void *)(uintptr_t)lwpd->br_syscall_num;
 635                 if (lwpd->br_syscall_restart) {
 636                         flags |= LX_UC_RESTART_SYSCALL;
 637                 }
 638         } else {
 639                 ucp->uc_brand_data[2] = NULL;
 640         }
 641 
 642         ucp->uc_brand_data[0] = (void *)flags;
 643 }
 644 
 645 #if defined(_SYSCALL32_IMPL)
 646 static void
 647 lx_savecontext32(ucontext32_t *ucp)
 648 {
 649         klwp_t *lwp = ttolwp(curthread);
 650         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 651         unsigned int flags = 0;
 652 
 653         /*
 654          * The ucontext_t affords us three private pointer-sized members in
 655          * "uc_brand_data".  We pack a variety of flags into the first element,
 656          * and an optional stack pointer in the second element.  The flags
 657          * determine which stack pointer (native or brand), if any, is stored
 658          * in the second element.  The third element may contain the system
 659          * call number; this is analogous to the "orig_[er]ax" member of a
 660          * Linux "user_regs_struct".
 661          */
 662 
 663         if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
 664             lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 665                 /*
 666                  * Record the value of the native stack pointer to restore
 667                  * when returning to this branded context:
 668                  */
 669                 flags |= LX_UC_RESTORE_NATIVE_SP;
 670                 ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
 671         }
 672 
 673         /*
 674          * Save the stack mode:
 675          */
 676         if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
 677                 flags |= LX_UC_STACK_NATIVE;
 678         } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 679                 flags |= LX_UC_STACK_BRAND;
 680         }
 681 
 682         /*
 683          * If we might need to restart this system call, save that information
 684          * in the context:
 685          */
 686         if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
 687                 ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
 688                 if (lwpd->br_syscall_restart) {
 689                         flags |= LX_UC_RESTART_SYSCALL;
 690                 }
 691         } else {
 692                 ucp->uc_brand_data[2] = NULL;
 693         }
 694 
 695         ucp->uc_brand_data[0] = flags;
 696 }
 697 #endif
 698 
 699 void
 700 lx_init_brand_data(zone_t *zone)
 701 {
 702         lx_zone_data_t *data;
 703         ASSERT(zone->zone_brand == &lx_brand);
 704         ASSERT(zone->zone_brand_data == NULL);
 705         data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
 706         /*
 707          * Set the default lxzd_kernel_version to 2.4.
 708          * This can be changed by a call to setattr() during zone boot.
 709          */
 710         (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
 711 
 712         /*
 713          * Linux is not at all picky about address family when it comes to
 714          * supporting interface-related ioctls.  To mimic this behavior, we'll
 715          * attempt those ioctls against a ksocket configured for that purpose.
 716          */
 717         (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0,
 718             0, zone->zone_kcred);
 719 
 720         zone->zone_brand_data = data;
 721 
 722         /*
 723          * In Linux, if the init(1) process terminates the system panics.
 724          * The zone must reboot to simulate this behaviour.
 725          */
 726         zone->zone_reboot_on_init_exit = B_TRUE;
 727 }
 728 
 729 void
 730 lx_free_brand_data(zone_t *zone)
 731 {
 732         lx_zone_data_t *data = ztolxzd(zone);
 733         ASSERT(data != NULL);
 734         if (data->lxzd_ioctl_sock != NULL) {
 735                 /*
 736                  * Since zone_kcred has been cleaned up already, close the
 737                  * socket using the global kcred.
 738                  */
 739                 ksocket_close(data->lxzd_ioctl_sock, kcred);
 740                 data->lxzd_ioctl_sock = NULL;
 741         }
 742         zone->zone_brand_data = NULL;
 743         kmem_free(data, sizeof (*data));
 744 }
 745 
 746 void
 747 lx_unsupported(char *dmsg)
 748 {
 749         lx_proc_data_t *pd = ttolxproc(curthread);
 750 
 751         DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
 752 
 753         if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
 754                 /*
 755                  * If this process was run with strict mode enabled
 756                  * (via LX_STRICT in the environment), we mark this
 757                  * LWP as having triggered an unsupported behaviour.
 758                  * This flag will be checked at an appropriate point
 759                  * by lx_check_strict_failure().
 760                  */
 761                 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
 762 
 763                 lwpd->br_strict_failure = B_TRUE;
 764         }
 765 }
 766 
 767 void
 768 lx_check_strict_failure(lx_lwp_data_t *lwpd)
 769 {
 770         proc_t *p;
 771 
 772         if (!lwpd->br_strict_failure) {
 773                 return;
 774         }
 775 
 776         lwpd->br_strict_failure = B_FALSE;
 777 
 778         /*
 779          * If this process is operating in strict mode (via LX_STRICT in
 780          * the environment), and has triggered a call to
 781          * lx_unsupported(), we drop SIGSYS on it as we return.
 782          */
 783         p = curproc;
 784         mutex_enter(&p->p_lock);
 785         sigtoproc(p, curthread, SIGSYS);
 786         mutex_exit(&p->p_lock);
 787 }
 788 
 789 void
 790 lx_trace_sysenter(int syscall_num, uintptr_t *args)
 791 {
 792         if (lx_systrace_enabled) {
 793                 VERIFY(lx_systrace_entry_ptr != NULL);
 794 
 795                 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
 796                     args[2], args[3], args[4], args[5]);
 797         }
 798 }
 799 
 800 void
 801 lx_trace_sysreturn(int syscall_num, long ret)
 802 {
 803         if (lx_systrace_enabled) {
 804                 VERIFY(lx_systrace_return_ptr != NULL);
 805 
 806                 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
 807         }
 808 }
 809 
 810 /*
 811  * Get the addresses of the user-space system call handler and attach it to
 812  * the proc structure. Returning 0 indicates success; the value returned
 813  * by the system call is the value stored in rval. Returning a non-zero
 814  * value indicates a failure; the value returned is used to set errno, -1
 815  * is returned from the syscall and the contents of rval are ignored. To
 816  * set errno and have the syscall return a value other than -1 we can
 817  * manually set errno and rval and return 0.
 818  */
 819 int
 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
 821     uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
 822 {
 823         kthread_t *t = curthread;
 824         klwp_t *lwp = ttolwp(t);
 825         proc_t *p = ttoproc(t);
 826         lx_proc_data_t *pd;
 827         struct termios *termios;
 828         uint_t termios_len;
 829         int error;
 830         int code;
 831         int sig;
 832         lx_brand_registration_t reg;
 833         lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
 834 
 835         /*
 836          * There is one operation that is suppored for non-branded
 837          * process.  B_EXEC_BRAND.  This is the equilivant of an
 838          * exec call, but the new process that is created will be
 839          * a branded process.
 840          */
 841         if (cmd == B_EXEC_BRAND) {
 842                 VERIFY(p->p_zone != NULL);
 843                 VERIFY(p->p_zone->zone_brand == &lx_brand);
 844                 return (exec_common(
 845                     (char *)arg1, (const char **)arg2, (const char **)arg3,
 846                     EBA_BRAND));
 847         }
 848 
 849         /* For all other operations this must be a branded process. */
 850         if (p->p_brand == NULL)
 851                 return (ENOSYS);
 852 
 853         VERIFY(p->p_brand == &lx_brand);
 854         VERIFY(p->p_brand_data != NULL);
 855 
 856         switch (cmd) {
 857         case B_REGISTER:
 858                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
 859                         lx_print("stack mode was not PREINIT during "
 860                             "REGISTER\n");
 861                         return (EINVAL);
 862                 }
 863 
 864                 if (p->p_model == DATAMODEL_NATIVE) {
 865                         if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) {
 866                                 lx_print("Failed to copyin brand registration "
 867                                     "at 0x%p\n", (void *)arg1);
 868                                 return (EFAULT);
 869                         }
 870                 }
 871 #ifdef _LP64
 872                 else {
 873                         /* 32-bit userland on 64-bit kernel */
 874                         lx_brand_registration32_t reg32;
 875 
 876                         if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) {
 877                                 lx_print("Failed to copyin brand registration "
 878                                     "at 0x%p\n", (void *)arg1);
 879                                 return (EFAULT);
 880                         }
 881 
 882                         reg.lxbr_version = (uint_t)reg32.lxbr_version;
 883                         reg.lxbr_handler =
 884                             (void *)(uintptr_t)reg32.lxbr_handler;
 885                         reg.lxbr_flags = reg32.lxbr_flags;
 886                 }
 887 #endif
 888 
 889                 if (reg.lxbr_version != LX_VERSION_1) {
 890                         lx_print("Invalid brand library version (%u)\n",
 891                             reg.lxbr_version);
 892                         return (EINVAL);
 893                 }
 894 
 895                 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
 896                         lx_print("Invalid brand flags (%u)\n",
 897                             reg.lxbr_flags);
 898                         return (EINVAL);
 899                 }
 900 
 901                 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
 902                     (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
 903                 pd = p->p_brand_data;
 904                 pd->l_handler = (uintptr_t)reg.lxbr_handler;
 905                 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
 906 
 907                 return (0);
 908 
 909         case B_TTYMODES:
 910                 /* This is necessary for emulating TCGETS ioctls. */
 911                 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
 912                     DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
 913                     &termios_len) != DDI_SUCCESS)
 914                         return (EIO);
 915 
 916                 ASSERT(termios_len == sizeof (*termios));
 917 
 918                 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
 919                         ddi_prop_free(termios);
 920                         return (EFAULT);
 921                 }
 922 
 923                 ddi_prop_free(termios);
 924                 return (0);
 925 
 926         case B_ELFDATA:
 927                 pd = curproc->p_brand_data;
 928                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 929                         if (copyout(&pd->l_elf_data, (void *)arg1,
 930                             sizeof (lx_elf_data_t)) != 0) {
 931                                 return (EFAULT);
 932                         }
 933                 }
 934 #if defined(_LP64)
 935                 else {
 936                         /* 32-bit userland on 64-bit kernel */
 937                         lx_elf_data32_t led32;
 938 
 939                         led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
 940                         led32.ed_phent = (int)pd->l_elf_data.ed_phent;
 941                         led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
 942                         led32.ed_entry = (int)pd->l_elf_data.ed_entry;
 943                         led32.ed_base = (int)pd->l_elf_data.ed_base;
 944                         led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
 945 
 946                         if (copyout(&led32, (void *)arg1,
 947                             sizeof (led32)) != 0) {
 948                                 return (EFAULT);
 949                         }
 950                 }
 951 #endif
 952                 return (0);
 953 
 954         case B_EXEC_NATIVE:
 955                 return (exec_common((char *)arg1, (const char **)arg2,
 956                     (const char **)arg3, EBA_NATIVE));
 957 
 958         /*
 959          * The B_TRUSS_POINT subcommand is used so that we can make a no-op
 960          * syscall for debugging purposes (dtracing) from within the user-level
 961          * emulation.
 962          */
 963         case B_TRUSS_POINT:
 964                 return (0);
 965 
 966         case B_LPID_TO_SPAIR: {
 967                 /*
 968                  * Given a Linux pid as arg1, return the Solaris pid in arg2 and
 969                  * the Solaris LWP in arg3.  We also translate pid 1 (which is
 970                  * hardcoded in many applications) to the zone's init process.
 971                  */
 972                 pid_t s_pid;
 973                 id_t s_tid;
 974 
 975                 if ((pid_t)arg1 == 1) {
 976                         s_pid = p->p_zone->zone_proc_initpid;
 977                         /* handle the dead/missing init(1M) case */
 978                         if (s_pid == -1)
 979                                 s_pid = 1;
 980                         s_tid = 1;
 981                 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
 982                         return (ESRCH);
 983                 }
 984 
 985                 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
 986                     copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
 987                         return (EFAULT);
 988                 }
 989 
 990                 return (0);
 991         }
 992 
 993         case B_SET_AFFINITY_MASK:
 994         case B_GET_AFFINITY_MASK:
 995                 /*
 996                  * Retrieve or store the CPU affinity mask for the
 997                  * requested linux pid.
 998                  *
 999                  * arg1 is a linux PID (0 means curthread).
1000                  * arg2 is the size of the given mask.
1001                  * arg3 is the address of the affinity mask.
1002                  */
1003                 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
1004 
1005         case B_PTRACE_STOP_FOR_OPT:
1006                 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
1007                     B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
1008 
1009         case B_PTRACE_CLONE_BEGIN:
1010                 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
1011                     B_FALSE : B_TRUE));
1012 
1013         case B_PTRACE_KERNEL:
1014                 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4));
1015 
1016         case B_HELPER_WAITID: {
1017                 idtype_t idtype = (idtype_t)arg1;
1018                 id_t id = (id_t)arg2;
1019                 siginfo_t *infop = (siginfo_t *)arg3;
1020                 int options = (int)arg4;
1021 
1022                 lwpd = ttolxlwp(curthread);
1023 
1024                 /*
1025                  * Our brand-specific waitid helper only understands a subset of
1026                  * the possible idtypes.  Ensure we keep to that subset here:
1027                  */
1028                 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
1029                         return (EINVAL);
1030                 }
1031 
1032                 /*
1033                  * Enable the return of emulated ptrace(2) stop conditions
1034                  * through lx_waitid_helper, and stash the Linux-specific
1035                  * extra waitid() flags.
1036                  */
1037                 lwpd->br_waitid_emulate = B_TRUE;
1038                 lwpd->br_waitid_flags = (int)arg5;
1039 
1040 #if defined(_SYSCALL32_IMPL)
1041                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1042                         return (waitsys32(idtype, id, infop, options));
1043                 } else
1044 #endif
1045                 {
1046                         return (waitsys(idtype, id, infop, options));
1047                 }
1048 
1049                 lwpd->br_waitid_emulate = B_FALSE;
1050                 lwpd->br_waitid_flags = 0;
1051 
1052                 return (0);
1053         }
1054 
1055         case B_UNSUPPORTED: {
1056                 char dmsg[256];
1057 
1058                 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
1059                         lx_print("Failed to copyin unsupported msg "
1060                             "at 0x%p\n", (void *)arg1);
1061                         return (EFAULT);
1062                 }
1063                 dmsg[255] = '\0';
1064                 lx_unsupported(dmsg);
1065 
1066                 lx_check_strict_failure(lwpd);
1067 
1068                 return (0);
1069         }
1070 
1071         case B_STORE_ARGS: {
1072                 /*
1073                  * B_STORE_ARGS subcommand
1074                  * arg1 = address of struct to be copied in
1075                  * arg2 = size of the struct being copied in
1076                  * arg3-arg6 ignored
1077                  * rval = the amount of data copied.
1078                  */
1079                 void *buf;
1080 
1081                 /* only have upper limit because arg2 is unsigned */
1082                 if (arg2 > LX_BR_ARGS_SIZE_MAX) {
1083                         return (EINVAL);
1084                 }
1085 
1086                 buf = kmem_alloc(arg2, KM_SLEEP);
1087                 if (copyin((void *)arg1, buf, arg2) != 0) {
1088                         lx_print("Failed to copyin scall arg at 0x%p\n",
1089                             (void *) arg1);
1090                         kmem_free(buf, arg2);
1091                         /*
1092                          * Purposely not setting br_scall_args to NULL
1093                          * to preserve data for debugging.
1094                          */
1095                         return (EFAULT);
1096                 }
1097 
1098                 if (lwpd->br_scall_args != NULL) {
1099                         ASSERT(lwpd->br_args_size > 0);
1100                         kmem_free(lwpd->br_scall_args,
1101                             lwpd->br_args_size);
1102                 }
1103 
1104                 lwpd->br_scall_args = buf;
1105                 lwpd->br_args_size = arg2;
1106                 *rval = arg2;
1107                 return (0);
1108         }
1109 
1110         case B_HELPER_CLONE:
1111                 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
1112                     (void *)arg4));
1113 
1114         case B_HELPER_SETGROUPS:
1115                 return (lx_helper_setgroups(arg1, (gid_t *)arg2));
1116 
1117         case B_HELPER_SIGQUEUE:
1118                 return (lx_helper_rt_sigqueueinfo(arg1, arg2,
1119                     (siginfo_t *)arg3));
1120 
1121         case B_HELPER_TGSIGQUEUE:
1122                 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
1123                     (siginfo_t *)arg4));
1124 
1125         case B_SET_THUNK_PID:
1126                 lwpd->br_lx_thunk_pid = arg1;
1127                 return (0);
1128 
1129         case B_GETPID:
1130                 /*
1131                  * The usermode clone(2) code needs to be able to call
1132                  * lx_getpid() from native code:
1133                  */
1134                 *rval = lx_getpid();
1135                 return (0);
1136 
1137         case B_SET_NATIVE_STACK:
1138                 /*
1139                  * B_SET_NATIVE_STACK subcommand
1140                  * arg1 = the base of the stack to use for emulation
1141                  */
1142                 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
1143                         lx_print("B_SET_NATIVE_STACK when stack was already "
1144                             "set to %p\n", (void *)arg1);
1145                         return (EEXIST);
1146                 }
1147 
1148                 /*
1149                  * We move from the PREINIT state, where we have no brand
1150                  * emulation stack, to the INIT state.  Here, we are still
1151                  * running on what will become the BRAND stack, but are running
1152                  * emulation (i.e. native) code.  Once the initialisation
1153                  * process for this thread has finished, we will jump to
1154                  * brand-specific code, while moving to the BRAND mode.
1155                  *
1156                  * When a new LWP is created, lx_initlwp() will clear the
1157                  * stack data.  If that LWP is actually being duplicated
1158                  * into a child process by fork(2), lx_forklwp() will copy
1159                  * it so that the cloned thread will keep using the same
1160                  * alternate stack.
1161                  */
1162                 lwpd->br_ntv_stack = arg1;
1163                 lwpd->br_stack_mode = LX_STACK_MODE_INIT;
1164                 lx_lwp_set_native_stack_current(lwpd, arg1);
1165 
1166                 return (0);
1167 
1168         case B_GET_CURRENT_CONTEXT:
1169                 /*
1170                  * B_GET_CURRENT_CONTEXT subcommand:
1171                  * arg1 = address for pointer to current ucontext_t
1172                  */
1173 
1174 #if defined(_SYSCALL32_IMPL)
1175                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1176                         caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
1177 
1178                         error = copyout(&addr, (void *)arg1, sizeof (addr));
1179                 } else
1180 #endif
1181                 {
1182                         error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
1183                             sizeof (lwp->lwp_oldcontext));
1184                 }
1185 
1186                 return (error != 0 ? EFAULT : 0);
1187 
1188         case B_JUMP_TO_LINUX:
1189                 /*
1190                  * B_JUMP_TO_LINUX subcommand:
1191                  * arg1 = ucontext_t pointer for jump state
1192                  */
1193 
1194                 if (arg1 == NULL)
1195                         return (EINVAL);
1196 
1197                 switch (lwpd->br_stack_mode) {
1198                 case LX_STACK_MODE_NATIVE: {
1199                         struct regs *rp = lwptoregs(lwp);
1200 
1201                         /*
1202                          * We are on the NATIVE stack, so we must preserve
1203                          * the extent of that stack.  The pointer will be
1204                          * reset by a future setcontext().
1205                          */
1206                         lx_lwp_set_native_stack_current(lwpd,
1207                             (uintptr_t)rp->r_sp);
1208                         break;
1209                 }
1210 
1211                 case LX_STACK_MODE_INIT:
1212                         /*
1213                          * The LWP is transitioning to Linux code for the first
1214                          * time.
1215                          */
1216                         break;
1217 
1218                 case LX_STACK_MODE_PREINIT:
1219                         /*
1220                          * This LWP has not installed an alternate stack for
1221                          * usermode emulation handling.
1222                          */
1223                         return (ENOENT);
1224 
1225                 case LX_STACK_MODE_BRAND:
1226                         /*
1227                          * The LWP should not be on the BRAND stack.
1228                          */
1229                         exit(CLD_KILLED, SIGSYS);
1230                         return (0);
1231                 }
1232 
1233                 /*
1234                  * Transfer control to Linux:
1235                  */
1236                 return (lx_runexe(lwp, (void *)arg1));
1237 
1238         case B_EMULATION_DONE:
1239                 /*
1240                  * B_EMULATION_DONE subcommand:
1241                  * arg1 = ucontext_t * to restore
1242                  * arg2 = system call number
1243                  * arg3 = return code
1244                  * arg4 = if operation failed, the errno value
1245                  */
1246 
1247                 /*
1248                  * The first part of this operation is a setcontext() to
1249                  * restore the register state to the copy we preserved
1250                  * before vectoring to the usermode emulation routine.
1251                  * If that fails, we return (hopefully) to the emulation
1252                  * routine and it will handle the error.
1253                  */
1254 #if (_SYSCALL32_IMPL)
1255                 if (get_udatamodel() != DATAMODEL_NATIVE) {
1256                         error = getsetcontext32(SETCONTEXT, (void *)arg1);
1257                 } else
1258 #endif
1259                 {
1260                         error = getsetcontext(SETCONTEXT, (void *)arg1);
1261                 }
1262 
1263                 if (error != 0) {
1264                         return (error);
1265                 }
1266 
1267                 /*
1268                  * The saved Linux context has been restored.  We handle the
1269                  * return value or errno with code common to the in-kernel
1270                  * system call emulation.
1271                  */
1272                 if ((error = (int)arg4) != 0) {
1273                         /*
1274                          * lx_syscall_return() looks at the errno in the LWP,
1275                          * so set it here:
1276                          */
1277                         set_errno(error);
1278                 }
1279                 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
1280 
1281                 return (0);
1282 
1283         case B_EXIT_AS_SIG:
1284                 code = CLD_KILLED;
1285                 sig = (int)arg1;
1286                 proc_is_exiting(p);
1287                 if (exitlwps(1) != 0) {
1288                         mutex_enter(&p->p_lock);
1289                         lwp_exit();
1290                 }
1291                 ttolwp(curthread)->lwp_cursig = sig;
1292                 if (sig == SIGSEGV) {
1293                         if (core(sig, 0) == 0)
1294                                 code = CLD_DUMPED;
1295                 }
1296                 exit(code, sig);
1297                 /* NOTREACHED */
1298                 break;
1299         }
1300 
1301         return (EINVAL);
1302 }
1303 
1304 char *
1305 lx_get_zone_kern_version(zone_t *zone)
1306 {
1307         return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
1308 }
1309 
1310 void
1311 lx_set_kern_version(zone_t *zone, char *vers)
1312 {
1313         lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
1314 
1315         (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
1316 }
1317 
1318 /*
1319  * Compare linux kernel version to the one set for the zone.
1320  * Returns greater than 0 if zone version is higher, less than 0 if the zone
1321  * version is lower, and 0 if the version are equal.
1322  */
1323 int
1324 lx_kern_version_cmp(zone_t *zone, const char *vers)
1325 {
1326         int zvers[3] = {0, 0, 0};
1327         int cvers[3] = {0, 0, 0};
1328         int i;
1329 
1330         VERIFY(zone->zone_brand == &lx_brand);
1331 
1332         (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
1333             &zvers[1], &zvers[2]);
1334         (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
1335 
1336         for (i = 0; i < 3; i++) {
1337                 if (zvers[i] > cvers[i]) {
1338                         return (1);
1339                 } else if (zvers[i] < cvers[i]) {
1340                         return (-1);
1341                 }
1342         }
1343         return (0);
1344 }
1345 
1346 /*
1347  * Linux unconditionally removes the setuid and setgid bits when changing
1348  * file ownership.  This brand hook overrides the illumos native behaviour,
1349  * which is based on the PRIV_FILE_SETID privilege.
1350  */
1351 static int
1352 lx_setid_clear(vattr_t *vap, cred_t *cr)
1353 {
1354         if (S_ISDIR(vap->va_mode)) {
1355                 return (0);
1356         }
1357 
1358         if (vap->va_mode & S_ISUID) {
1359                 vap->va_mask |= AT_MODE;
1360                 vap->va_mode &= ~S_ISUID;
1361         }
1362         if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1363                 vap->va_mask |= AT_MODE;
1364                 vap->va_mode &= ~S_ISGID;
1365         }
1366 
1367         return (0);
1368 }
1369 
1370 /*
1371  * Copy the per-process brand data from a parent proc to a child.
1372  */
1373 void
1374 lx_copy_procdata(proc_t *child, proc_t *parent)
1375 {
1376         lx_proc_data_t *cpd = child->p_brand_data;
1377         lx_proc_data_t *ppd = parent->p_brand_data;
1378 
1379         VERIFY(parent->p_brand == &lx_brand);
1380         VERIFY(child->p_brand == &lx_brand);
1381         VERIFY(ppd != NULL);
1382         VERIFY(cpd != NULL);
1383 
1384         *cpd = *ppd;
1385 
1386         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
1387         cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
1388 
1389         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
1390         cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
1391 
1392         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
1393         cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
1394 
1395         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
1396         cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
1397 }
1398 
1399 #if defined(_LP64)
1400 static void
1401 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
1402 {
1403         bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
1404         dst->e_type =                src->e_type;
1405         dst->e_machine =     src->e_machine;
1406         dst->e_version =     src->e_version;
1407         dst->e_entry =               src->e_entry;
1408         dst->e_phoff =               src->e_phoff;
1409         dst->e_shoff =               src->e_shoff;
1410         dst->e_flags =               src->e_flags;
1411         dst->e_ehsize =              src->e_ehsize;
1412         dst->e_phentsize =   src->e_phentsize;
1413         dst->e_phnum =               src->e_phnum;
1414         dst->e_shentsize =   src->e_shentsize;
1415         dst->e_shnum =               src->e_shnum;
1416         dst->e_shstrndx =    src->e_shstrndx;
1417 }
1418 #endif /* _LP64 */
1419 
1420 static void
1421 restoreexecenv(struct execenv *ep, stack_t *sp)
1422 {
1423         klwp_t *lwp = ttolwp(curthread);
1424 
1425         setexecenv(ep);
1426         lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
1427         lwp->lwp_sigaltstack.ss_size = sp->ss_size;
1428         lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
1429 }
1430 
1431 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
1432     long *, int, caddr_t, cred_t *, int *);
1433 
1434 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
1435     long *, int, caddr_t, cred_t *, int *);
1436 
1437 /*
1438  * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
1439  * binaries.
1440  */
1441 static int
1442 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
1443     struct intpdata *idata, int level, long *execsz, int setid,
1444     caddr_t exec_file, struct cred *cred, int *brand_action)
1445 {
1446         int             error;
1447         vnode_t         *nvp;
1448         Ehdr            ehdr;
1449         Addr            uphdr_vaddr;
1450         intptr_t        voffset;
1451         char            *interp = NULL;
1452         uintptr_t       ldaddr = NULL;
1453         int             i;
1454         proc_t          *p = ttoproc(curthread);
1455         klwp_t          *lwp = ttolwp(curthread);
1456         struct execenv  env;
1457         struct execenv  origenv;
1458         stack_t         orig_sigaltstack;
1459         struct user     *up = PTOU(ttoproc(curthread));
1460         lx_elf_data_t   *edp;
1461         char            *lib_path = NULL;
1462 
1463         ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
1464         ASSERT(ttoproc(curthread)->p_brand_data != NULL);
1465 
1466         edp = &ttolxproc(curthread)->l_elf_data;
1467 
1468         if (args->to_model == DATAMODEL_NATIVE) {
1469                 lib_path = LX_LIB_PATH;
1470         }
1471 #if defined(_LP64)
1472         else {
1473                 lib_path = LX_LIB_PATH32;
1474         }
1475 #endif
1476 
1477         /*
1478          * Set the brandname and library name for the new process so that
1479          * elfexec() puts them onto the stack.
1480          */
1481         args->brandname = LX_BRANDNAME;
1482         args->emulator = lib_path;
1483 
1484 #if defined(_LP64)
1485         /*
1486          * To conform with the way Linux lays out the address space, we clamp
1487          * the stack to be the top of the lower region of the x86-64 canonical
1488          * form address space -- which has the side-effect of laying out the
1489          * entire address space in that lower region.  Note that this only
1490          * matters on 64-bit processes (this value will always be greater than
1491          * the size of a 32-bit address space) and doesn't actually affect
1492          * USERLIMIT:  if a Linux-branded processes wishes to map something
1493          * into the top half of the address space, it can do so -- but with
1494          * the user stack starting at the top of the bottom region, those high
1495          * virtual addresses won't be used unless explicitly directed.
1496          */
1497         args->maxstack = lx_maxstack64;
1498 #endif
1499 
1500         /*
1501          * We will first exec the brand library, then map in the linux
1502          * executable and the linux linker.
1503          */
1504         if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
1505             &nvp))) {
1506                 uprintf("%s: not found.", lib_path);
1507                 return (error);
1508         }
1509 
1510         /*
1511          * We will eventually set the p_exec member to be the vnode for the new
1512          * executable when we call setexecenv(). However, if we get an error
1513          * before that call we need to restore the execenv to its original
1514          * values so that when we return to the caller fop_close() works
1515          * properly while cleaning up from the failed exec().  Restoring the
1516          * original value will also properly decrement the 2nd VN_RELE that we
1517          * took on the brand library.
1518          */
1519         origenv.ex_bssbase = p->p_bssbase;
1520         origenv.ex_brkbase = p->p_brkbase;
1521         origenv.ex_brksize = p->p_brksize;
1522         origenv.ex_vp = p->p_exec;
1523         orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
1524         orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
1525         orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
1526 
1527         if (args->to_model == DATAMODEL_NATIVE) {
1528                 error = elfexec(nvp, uap, args, idata, level + 1, execsz,
1529                     setid, exec_file, cred, brand_action);
1530         }
1531 #if defined(_LP64)
1532         else {
1533                 error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
1534                     setid, exec_file, cred, brand_action);
1535         }
1536 #endif
1537         VN_RELE(nvp);
1538         if (error != 0) {
1539                 restoreexecenv(&origenv, &orig_sigaltstack);
1540                 return (error);
1541         }
1542 
1543         /*
1544          * exec-ed in the brand library above.
1545          * The u_auxv vectors are now setup by elfexec to point to the
1546          * brand emulation library and its linker.
1547          */
1548 
1549         bzero(&env, sizeof (env));
1550 
1551         /*
1552          * map in the the Linux executable
1553          */
1554         if (args->to_model == DATAMODEL_NATIVE) {
1555                 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
1556                     &voffset, exec_file, &interp, &env.ex_bssbase,
1557                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1558         }
1559 #if defined(_LP64)
1560         else {
1561                 Elf32_Ehdr      ehdr32;
1562                 Elf32_Addr      uphdr_vaddr32;
1563 
1564                 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
1565                     &voffset, exec_file, &interp, &env.ex_bssbase,
1566                     &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1567 
1568                 Ehdr32to64(&ehdr32, &ehdr);
1569 
1570                 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1571                         uphdr_vaddr = (Addr)-1;
1572                 else
1573                         uphdr_vaddr = uphdr_vaddr32;
1574         }
1575 #endif
1576         if (error != 0) {
1577                 restoreexecenv(&origenv, &orig_sigaltstack);
1578 
1579                 if (interp != NULL)
1580                         kmem_free(interp, MAXPATHLEN);
1581 
1582                 return (error);
1583         }
1584 
1585         /*
1586          * Save off the important properties of the lx executable. The brand
1587          * library will ask us for this data later, when it is ready to set
1588          * things up for the lx executable.
1589          */
1590         edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
1591             voffset + uphdr_vaddr;
1592         edp->ed_entry = voffset + ehdr.e_entry;
1593         edp->ed_phent = ehdr.e_phentsize;
1594         edp->ed_phnum = ehdr.e_phnum;
1595 
1596         if (interp != NULL) {
1597                 if (ehdr.e_type == ET_DYN) {
1598                         /*
1599                          * This is a shared object executable, so we need to
1600                          * pick a reasonable place to put the heap. Just don't
1601                          * use the first page.
1602                          */
1603                         env.ex_brkbase = (caddr_t)PAGESIZE;
1604                         env.ex_bssbase = (caddr_t)PAGESIZE;
1605                 }
1606 
1607                 /*
1608                  * If the program needs an interpreter (most do), map it in and
1609                  * store relevant information about it in the aux vector, where
1610                  * the brand library can find it.
1611                  */
1612                 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
1613                     NULLVPP, &nvp))) {
1614                         uprintf("%s: not found.", interp);
1615                         restoreexecenv(&origenv, &orig_sigaltstack);
1616                         kmem_free(interp, MAXPATHLEN);
1617                         return (error);
1618                 }
1619 
1620                 kmem_free(interp, MAXPATHLEN);
1621                 interp = NULL;
1622 
1623                 /*
1624                  * map in the Linux linker
1625                  */
1626                 if (args->to_model == DATAMODEL_NATIVE) {
1627                         error = mapexec_brand(nvp, args, &ehdr,
1628                             &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
1629                             NULL, NULL, NULL, &ldaddr);
1630                 }
1631 #if defined(_LP64)
1632                 else {
1633                         Elf32_Ehdr      ehdr32;
1634                         Elf32_Addr      uphdr_vaddr32;
1635 
1636                         error = mapexec32_brand(nvp, args, &ehdr32,
1637                             &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
1638                             NULL, NULL, NULL, &ldaddr);
1639 
1640                         Ehdr32to64(&ehdr32, &ehdr);
1641 
1642                         if (uphdr_vaddr32 == (Elf32_Addr)-1)
1643                                 uphdr_vaddr = (Addr)-1;
1644                         else
1645                                 uphdr_vaddr = uphdr_vaddr32;
1646                 }
1647 #endif
1648 
1649                 VN_RELE(nvp);
1650                 if (error != 0) {
1651                         restoreexecenv(&origenv, &orig_sigaltstack);
1652                         return (error);
1653                 }
1654 
1655                 /*
1656                  * Now that we know the base address of the brand's linker,
1657                  * we also save this for later use by the brand library.
1658                  */
1659                 edp->ed_base = voffset;
1660                 edp->ed_ldentry = voffset + ehdr.e_entry;
1661         } else {
1662                 /*
1663                  * This program has no interpreter. The lx brand library will
1664                  * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
1665                  * so in this case, put the entry point of the main executable
1666                  * there.
1667                  */
1668                 if (ehdr.e_type == ET_EXEC) {
1669                         /*
1670                          * An executable with no interpreter, this must be a
1671                          * statically linked executable, which means we loaded
1672                          * it at the address specified in the elf header, in
1673                          * which case the e_entry field of the elf header is an
1674                          * absolute address.
1675                          */
1676                         edp->ed_ldentry = ehdr.e_entry;
1677                         edp->ed_entry = ehdr.e_entry;
1678                 } else {
1679                         /*
1680                          * A shared object with no interpreter, we use the
1681                          * calculated address from above.
1682                          */
1683                         edp->ed_ldentry = edp->ed_entry;
1684 
1685                         /*
1686                          * In all situations except an ET_DYN elf object with no
1687                          * interpreter, we want to leave the brk and base
1688                          * values set by mapexec_brand alone. Normally when
1689                          * running ET_DYN objects on Solaris (most likely
1690                          * /lib/ld.so.1) the kernel sets brk and base to 0 since
1691                          * it doesn't know where to put the heap, and later the
1692                          * linker will call brk() to initialize the heap in:
1693                          *      usr/src/cmd/sgs/rtld/common/setup.c:setup()
1694                          * after it has determined where to put it.  (This
1695                          * decision is made after the linker loads and inspects
1696                          * elf properties of the target executable being run.)
1697                          *
1698                          * So for ET_DYN Linux executables, we also don't know
1699                          * where the heap should go, so we'll set the brk and
1700                          * base to 0.  But in this case the Solaris linker will
1701                          * not initialize the heap, so when the Linux linker
1702                          * starts running there is no heap allocated.  This
1703                          * seems to be ok on Linux 2.4 based systems because the
1704                          * Linux linker/libc fall back to using mmap() to
1705                          * allocate memory. But on 2.6 systems, running
1706                          * applications by specifying them as command line
1707                          * arguments to the linker results in segfaults for an
1708                          * as yet undetermined reason (which seems to indicate
1709                          * that a more permanent fix for heap initialization in
1710                          * these cases may be necessary).
1711                          */
1712                         if (ehdr.e_type == ET_DYN) {
1713                                 env.ex_bssbase = (caddr_t)0;
1714                                 env.ex_brkbase = (caddr_t)0;
1715                                 env.ex_brksize = 0;
1716                         }
1717                 }
1718 
1719         }
1720 
1721         env.ex_vp = vp;
1722         setexecenv(&env);
1723 
1724         /*
1725          * We try to keep /proc's view of the aux vector consistent with
1726          * what's on the process stack.
1727          */
1728         if (args->to_model == DATAMODEL_NATIVE) {
1729                 auxv_t phdr_auxv[4] = {
1730                     { AT_SUN_BRAND_LX_PHDR, 0 },
1731                     { AT_SUN_BRAND_LX_INTERP, 0 },
1732                     { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
1733                     { AT_SUN_BRAND_AUX4, 0 }
1734                 };
1735                 phdr_auxv[0].a_un.a_val = edp->ed_phdr;
1736                 phdr_auxv[1].a_un.a_val = ldaddr;
1737                 phdr_auxv[2].a_un.a_val = 1;    /* set in lx_init */
1738                 phdr_auxv[3].a_type = AT_CLKTCK;
1739                 phdr_auxv[3].a_un.a_val = hz;
1740 
1741                 if (copyout(&phdr_auxv, args->auxp_brand,
1742                     sizeof (phdr_auxv)) == -1)
1743                         return (EFAULT);
1744         }
1745 #if defined(_LP64)
1746         else {
1747                 auxv32_t phdr_auxv32[3] = {
1748                     { AT_SUN_BRAND_LX_PHDR, 0 },
1749                     { AT_SUN_BRAND_LX_INTERP, 0 },
1750                     { AT_SUN_BRAND_AUX3, 0 }
1751                 };
1752                 phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
1753                 phdr_auxv32[1].a_un.a_val = ldaddr;
1754                 phdr_auxv32[2].a_type = AT_CLKTCK;
1755                 phdr_auxv32[2].a_un.a_val = hz;
1756 
1757                 if (copyout(&phdr_auxv32, args->auxp_brand,
1758                     sizeof (phdr_auxv32)) == -1)
1759                         return (EFAULT);
1760         }
1761 #endif
1762 
1763         /*
1764          * /proc uses the AT_ENTRY aux vector entry to deduce
1765          * the location of the executable in the address space. The user
1766          * structure contains a copy of the aux vector that needs to have those
1767          * entries patched with the values of the real lx executable (they
1768          * currently contain the values from the lx brand library that was
1769          * elfexec'd, above).
1770          *
1771          * For live processes, AT_BASE is used to locate the linker segment,
1772          * which /proc and friends will later use to find Solaris symbols
1773          * (such as rtld_db_preinit). However, for core files, /proc uses
1774          * AT_ENTRY to find the right segment to label as the executable.
1775          * So we set AT_ENTRY to be the entry point of the linux executable,
1776          * but leave AT_BASE to be the address of the Solaris linker.
1777          */
1778         for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
1779                 switch (up->u_auxv[i].a_type) {
1780                 case AT_ENTRY:
1781                         up->u_auxv[i].a_un.a_val = edp->ed_entry;
1782                         break;
1783 
1784                 case AT_SUN_BRAND_LX_PHDR:
1785                         up->u_auxv[i].a_un.a_val = edp->ed_phdr;
1786                         break;
1787 
1788                 case AT_SUN_BRAND_LX_INTERP:
1789                         up->u_auxv[i].a_un.a_val = ldaddr;
1790                         break;
1791 
1792                 default:
1793                         break;
1794                 }
1795         }
1796 
1797         return (0);
1798 }
1799 
1800 boolean_t
1801 lx_native_exec(uint8_t osabi, const char **interp)
1802 {
1803         if (osabi != ELFOSABI_SOLARIS)
1804                 return (B_FALSE);
1805 
1806         /*
1807          * If the process root matches the zone root, prepend /native to the
1808          * interpreter path for native executables.  Absolute precision from
1809          * VN_CMP is not necessary since any change of process root is likely
1810          * to make native binaries inaccessible via /native.
1811          *
1812          * Processes which chroot directly into /native will be able to
1813          * function as expected with no need for the prefix.
1814          */
1815         if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
1816                 *interp = "/native";
1817         }
1818 
1819         return (B_TRUE);
1820 }
1821 
1822 static void
1823 lx_syscall_init(void)
1824 {
1825         int i;
1826 
1827         /*
1828          * Count up the 32-bit Linux system calls.  Note that lx_sysent32
1829          * has (LX_NSYSCALLS + 1) entries.
1830          */
1831         for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
1832                 continue;
1833         lx_nsysent32 = i;
1834 
1835 #if defined(_LP64)
1836         /*
1837          * Count up the 64-bit Linux system calls.  Note that lx_sysent64
1838          * has (LX_NSYSCALLS + 1) entries.
1839          */
1840         for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
1841                 continue;
1842         lx_nsysent64 = i;
1843 #endif
1844 }
1845 
1846 int
1847 _init(void)
1848 {
1849         int err = 0;
1850 
1851         lx_syscall_init();
1852         lx_pid_init();
1853         lx_ioctl_init();
1854         lx_futex_init();
1855         lx_ptrace_init();
1856         lx_socket_init();
1857 
1858         err = mod_install(&modlinkage);
1859         if (err != 0) {
1860                 cmn_err(CE_WARN, "Couldn't install lx brand module");
1861 
1862                 /*
1863                  * This looks drastic, but it should never happen.  These
1864                  * two data structures should be completely free-able until
1865                  * they are used by Linux processes.  Since the brand
1866                  * wasn't loaded there should be no Linux processes, and
1867                  * thus no way for these data structures to be modified.
1868                  */
1869                 lx_pid_fini();
1870                 lx_ioctl_fini();
1871                 if (lx_futex_fini())
1872                         panic("lx brand module cannot be loaded or unloaded.");
1873         }
1874         return (err);
1875 }
1876 
1877 int
1878 _info(struct modinfo *modinfop)
1879 {
1880         return (mod_info(&modlinkage, modinfop));
1881 }
1882 
1883 int
1884 _fini(void)
1885 {
1886         int err;
1887         int futex_done = 0;
1888 
1889         /*
1890          * If there are any zones using this brand, we can't allow it to be
1891          * unloaded.
1892          */
1893         if (brand_zone_count(&lx_brand))
1894                 return (EBUSY);
1895 
1896         lx_ptrace_fini();
1897         lx_pid_fini();
1898         lx_ioctl_fini();
1899         lx_socket_fini();
1900 
1901         if ((err = lx_futex_fini()) != 0) {
1902                 goto done;
1903         }
1904         futex_done = 1;
1905 
1906         err = mod_remove(&modlinkage);
1907 
1908 done:
1909         if (err) {
1910                 /*
1911                  * If we can't unload the module, then we have to get it
1912                  * back into a sane state.
1913                  */
1914                 lx_ptrace_init();
1915                 lx_pid_init();
1916                 lx_ioctl_init();
1917                 lx_socket_init();
1918 
1919                 if (futex_done) {
1920                         lx_futex_init();
1921                 }
1922         }
1923 
1924         return (err);
1925 }