Print this page
OS-4514 [lx] SIGEV_THREAD_ID emulation needed
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ new/usr/src/uts/common/brand/lx/os/lx_brand.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2015, Joyent, Inc. All rights reserved.
29 29 */
30 30
31 31 /*
32 32 * The LX Brand: emulation of a Linux operating environment within a zone.
33 33 *
34 34 * OVERVIEW
35 35 *
36 36 * The LX brand enables a full Linux userland -- including a C library,
37 37 * init(1) framework, and some set of applications -- to run unmodified
38 38 * within an illumos zone. Unlike illumos, where applications are expected
39 39 * to link against and consume functions exported from libraries, the
40 40 * supported Linux binary compatibility boundary is the system call
41 41 * interface. By accurately emulating the behaviour of Linux system calls,
42 42 * Linux software can be executed in this environment as if it were running
43 43 * on a native Linux system.
44 44 *
45 45 * EMULATING LINUX SYSTEM CALLS
46 46 *
47 47 * Linux system calls are made in 32-bit processes via the "int 0x80"
48 48 * instruction; in 64-bit processes the "syscall" instruction is used, as it
49 49 * is with native illumos processes. In both cases, arguments to system
50 50 * calls are generally passed in registers and the usermode stack is not
51 51 * interpreted or modified by the Linux kernel.
52 52 *
53 53 * When the emulated Linux process makes a system call, it traps into the
54 54 * illumos kernel. The in-kernel brand module contains various emulation
55 55 * routines, and can fully service some emulated system calls; e.g. read(2)
56 56 * and write(2). Other system calls require assistance from the illumos
57 57 * libc, bouncing back out to the brand library ("lx_brand.so.1") for
58 58 * emulation.
59 59 *
60 60 * The brand mechanism allows for the provision of an alternative trap
61 61 * handler for the various system call mechanisms. Traditionally this was
62 62 * used to immediately revector execution to the usermode emulation library,
63 63 * which was responsible for handling all system calls. In the interests of
64 64 * more accurate emulation and increased performance, much of the regular
65 65 * illumos system call path is now invoked. Only the argument processing and
66 66 * handler dispatch are replaced by the brand, via the per-LWP
67 67 * "lwp_brand_syscall" interposition function pointer.
68 68 *
69 69 * THE NATIVE AND BRAND STACKS
70 70 *
71 71 * Some runtime environments (e.g. the Go language) allocate very small
72 72 * thread stacks, preferring to grow or split the stack as necessary. The
73 73 * Linux kernel generally does not use the usermode stack when servicing
74 74 * system calls, so this is not a problem. In order for our emulation to
75 75 * have the same zero stack impact, we must execute usermode emulation
76 76 * routines on an _alternate_ stack. This is similar, in principle, to the
77 77 * use of sigaltstack(3C) to run signal handlers off the main thread stack.
78 78 *
79 79 * To this end, the brand library allocates and installs an alternate stack
80 80 * (called the "native" stack) for each LWP. The in-kernel brand code uses
81 81 * this stack for usermode emulation calls and interposed signal delivery,
82 82 * while the emulated Linux process sees only the data on the main thread
83 83 * stack, known as the "brand" stack. The stack mode is tracked in the
84 84 * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
85 85 *
86 86 * The stack mode doubles as a system call "mode bit". When in the
87 87 * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
88 88 * system calls. In other modes, system calls are assumed to be native
89 89 * illumos system calls as made during brand library initialisation and
90 90 * usermode emulation.
91 91 *
92 92 * USERMODE EMULATION
93 93 *
94 94 * When a Linux system call cannot be emulated within the kernel, we preserve
95 95 * the register state of the Linux process and revector the LWP to the brand
96 96 * library usermode emulation handler: the "lx_emulate()" function in
97 97 * "lx_brand.so.1". This revectoring is modelled on the delivery of signals,
98 98 * and is performed in "lx_emulate_user()".
99 99 *
100 100 * First, the emulated process state is written out to the usermode stack of
101 101 * the process as a "ucontext_t" object. Arguments to the emulation routine
102 102 * are passed on the stack or in registers, depending on the ABI. When the
103 103 * usermode emulation is complete, the result is passed back to the kernel
104 104 * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
105 105 * for restoration.
106 106 *
107 107 * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
108 108 *
109 109 * When servicing emulated system calls in the usermode brand library, or
110 110 * during signal delivery, various state is preserved by the kernel so that
111 111 * the running LWP may be revectored to a handling routine. The context
112 112 * allows the kernel to restart the program at the point of interruption,
113 113 * either at the return of the signal handler, via setcontext(3C); or after
114 114 * the usermode emulation request has been serviced, via B_EMULATION_DONE.
115 115 *
116 116 * In illumos native processes, the saved context (a "ucontext_t" object)
117 117 * includes the state of registers and the current signal mask at the point
118 118 * of interruption. The context also includes a link to the most recently
119 119 * saved context, forming a chain to be unwound as requests complete. The LX
120 120 * brand requires additional book-keeping to describe the machine state: in
121 121 * particular, the current stack mode and the occupied extent of the native
122 122 * stack.
123 123 *
124 124 * The brand code is able to interpose on the context save and restore
125 125 * operations in the kernel -- see "lx_savecontext()" and
126 126 * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
127 127 * function correctly in the face of a dual stack LWP. The brand also
128 128 * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
129 129 * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
130 130 * library interposer on the native stack, regardless of the interrupted
131 131 * execution mode. Linux sigaltstack(2) emulation is performed entirely by
132 132 * the usermode brand library during signal handler interposition.
133 133 */
134 134
135 135 #include <sys/types.h>
136 136 #include <sys/kmem.h>
137 137 #include <sys/errno.h>
138 138 #include <sys/thread.h>
139 139 #include <sys/systm.h>
140 140 #include <sys/syscall.h>
141 141 #include <sys/proc.h>
142 142 #include <sys/modctl.h>
143 143 #include <sys/cmn_err.h>
144 144 #include <sys/model.h>
145 145 #include <sys/exec.h>
146 146 #include <sys/lx_impl.h>
147 147 #include <sys/machbrand.h>
148 148 #include <sys/lx_syscalls.h>
149 149 #include <sys/lx_misc.h>
150 150 #include <sys/lx_futex.h>
151 151 #include <sys/lx_brand.h>
152 152 #include <sys/param.h>
153 153 #include <sys/termios.h>
154 154 #include <sys/sunddi.h>
155 155 #include <sys/ddi.h>
156 156 #include <sys/vnode.h>
157 157 #include <sys/pathname.h>
158 158 #include <sys/auxv.h>
159 159 #include <sys/priv.h>
160 160 #include <sys/regset.h>
161 161 #include <sys/privregs.h>
162 162 #include <sys/archsystm.h>
163 163 #include <sys/zone.h>
164 164 #include <sys/brand.h>
165 165 #include <sys/sdt.h>
166 166 #include <sys/x86_archext.h>
167 167 #include <sys/controlregs.h>
168 168 #include <sys/core.h>
169 169 #include <sys/stack.h>
170 170 #include <sys/stat.h>
171 171 #include <sys/socket.h>
172 172 #include <lx_signum.h>
173 173 #include <util/sscanf.h>
174 174
175 175 int lx_debug = 0;
176 176
177 177 void lx_init_brand_data(zone_t *);
178 178 void lx_free_brand_data(zone_t *);
179 179 void lx_setbrand(proc_t *);
180 180 int lx_getattr(zone_t *, int, void *, size_t *);
181 181 int lx_setattr(zone_t *, int, void *, size_t);
182 182 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
183 183 uintptr_t, uintptr_t);
184 184 void lx_set_kern_version(zone_t *, char *);
185 185 void lx_copy_procdata(proc_t *, proc_t *);
186 186
187 187 extern int getsetcontext(int, void *);
188 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int);
189 189 #if defined(_SYSCALL32_IMPL)
190 190 extern int getsetcontext32(int, void *);
191 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
192 192 #endif
193 193
194 194 extern void lx_proc_exit(proc_t *);
195 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
196 196
197 197 extern void lx_ioctl_init();
198 198 extern void lx_ioctl_fini();
199 199 extern void lx_socket_init();
200 200 extern void lx_socket_fini();
201 201
202 202 lx_systrace_f *lx_systrace_entry_ptr;
203 203 lx_systrace_f *lx_systrace_return_ptr;
204 204
205 205 static int lx_systrace_enabled;
206 206
207 207 /*
208 208 * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
209 209 * want an MMU dependency here (and should there be a microprocessor without
210 210 * a hole, we don't want to start allocating from the top of the VA range).
211 211 */
212 212 #define LX_MAXSTACK64 0x7ffffff00000
213 213
214 214 uint64_t lx_maxstack64 = LX_MAXSTACK64;
215 215
216 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
217 217 struct intpdata *idata, int level, long *execsz, int setid,
218 218 caddr_t exec_file, struct cred *cred, int *brand_action);
219 219
220 220 static boolean_t lx_native_exec(uint8_t, const char **);
221 221 static uint32_t lx_map32limit(proc_t *);
222 222
223 223 static void lx_savecontext(ucontext_t *);
224 224 static void lx_restorecontext(ucontext_t *);
225 225 static caddr_t lx_sendsig_stack(int);
226 226 static void lx_sendsig(int);
227 227 #if defined(_SYSCALL32_IMPL)
228 228 static void lx_savecontext32(ucontext32_t *);
229 229 #endif
230 230 static int lx_setid_clear(vattr_t *, cred_t *);
231 231 #if defined(_LP64)
232 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
233 233 enum seg_rw);
234 234 #endif
235 235
236 236
/*
 * The lx brand operations vector.  Each member is a hook invoked by the
 * generic brand framework at the corresponding point in process/LWP
 * lifecycle or context management; NULL entries take the default
 * (unbranded) behaviour.
 */
struct brand_ops lx_brops = {
	lx_init_brand_data,		/* b_init_brand_data */
	lx_free_brand_data,		/* b_free_brand_data */
	lx_brandsys,			/* b_brandsys */
	lx_setbrand,			/* b_setbrand */
	lx_getattr,			/* b_getattr */
	lx_setattr,			/* b_setattr */
	lx_copy_procdata,		/* b_copy_procdata */
	lx_proc_exit,			/* b_proc_exit */
	lx_exec,			/* b_exec */
	lx_setrval,			/* b_lwp_setrval */
	lx_lwpdata_alloc,		/* b_lwpdata_alloc */
	lx_lwpdata_free,		/* b_lwpdata_free */
	lx_initlwp,			/* b_initlwp */
	lx_forklwp,			/* b_forklwp */
	lx_freelwp,			/* b_freelwp */
	lx_exitlwp,			/* b_lwpexit */
	lx_elfexec,			/* b_elfexec */
	NULL,				/* b_sigset_native_to_brand */
	NULL,				/* b_sigset_brand_to_native */
	lx_sigfd_translate,		/* b_sigfd_translate */
	NSIG,				/* b_nsig */
	lx_exit_with_sig,		/* b_exit_with_sig */
	lx_wait_filter,			/* b_wait_filter */
	lx_native_exec,			/* b_native_exec */
	lx_map32limit,			/* b_map32limit */
	lx_stop_notify,			/* b_stop_notify */
	lx_waitid_helper,		/* b_waitid_helper */
	lx_sigcld_repost,		/* b_sigcld_repost */
	lx_ptrace_issig_stop,		/* b_issig_stop */
	lx_ptrace_sig_ignorable,	/* b_sig_ignorable */
	lx_savecontext,			/* b_savecontext */
#if defined(_SYSCALL32_IMPL)
	lx_savecontext32,		/* b_savecontext32 */
#endif
	lx_restorecontext,		/* b_restorecontext */
	lx_sendsig_stack,		/* b_sendsig_stack */
	lx_sendsig,			/* b_sendsig */
	lx_setid_clear,			/* b_setid_clear */
#if defined(_LP64)
	lx_pagefault			/* b_pagefault */
#else
	NULL
#endif
};

/*
 * Machine-dependent brand hooks (segment register fixup and fsbase
 * handling); unused slots are NULL.
 */
struct brand_mach_ops lx_mops = {
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	lx_fixsegreg,
	lx_fsbase
};

/* Top-level brand descriptor registered with the brand framework. */
struct brand lx_brand = {
	BRAND_VER_1,
	"lx",
	&lx_brops,
	&lx_mops,
	sizeof (struct lx_proc_data)
};

/* Kernel module linkage for the lx brand module. */
static struct modlbrand modlbrand = {
	&mod_brandops, "lx brand", &lx_brand
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlbrand, NULL
};
309 309
310 310 void
311 311 lx_proc_exit(proc_t *p)
312 312 {
313 313 lx_proc_data_t *lxpd;
314 314 proc_t *cp;
315 315
316 316 mutex_enter(&p->p_lock);
317 317 VERIFY(lxpd = ptolxproc(p));
318 318 if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
319 319 mutex_exit(&p->p_lock);
320 320 return;
321 321 }
322 322 mutex_exit(&p->p_lock);
323 323
324 324 /* Check for children which desire notification of parental death. */
325 325 mutex_enter(&pidlock);
326 326 for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
327 327 mutex_enter(&cp->p_lock);
328 328 if ((lxpd = ptolxproc(cp)) == NULL) {
329 329 mutex_exit(&cp->p_lock);
330 330 continue;
331 331 }
332 332 if (lxpd->l_parent_deathsig != 0) {
333 333 sigtoproc(p, NULL, lxpd->l_parent_deathsig);
334 334 }
335 335 mutex_exit(&cp->p_lock);
336 336 }
337 337 mutex_exit(&pidlock);
338 338 }
339 339
340 340 void
341 341 lx_setbrand(proc_t *p)
342 342 {
343 343 /* Send SIGCHLD to parent by default when child exits */
344 344 ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
345 345 }
346 346
347 347 /* ARGSUSED */
348 348 int
349 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
350 350 {
351 351 char vers[LX_VERS_MAX];
352 352
353 353 if (attr == LX_KERN_VERSION_NUM) {
354 354 if (bufsize > (LX_VERS_MAX - 1))
355 355 return (ERANGE);
356 356 bzero(vers, LX_VERS_MAX);
357 357 if (copyin(buf, &vers, bufsize) != 0)
358 358 return (EFAULT);
359 359 lx_set_kern_version(zone, vers);
360 360 return (0);
361 361 }
362 362 return (EINVAL);
363 363 }
364 364
/* ARGSUSED */
/*
 * Brand attribute getter.  Only LX_KERN_VERSION_NUM is supported: it
 * copies out the emulated Linux kernel version string and reports the
 * copied length via *bufsize.  Returns 0 on success or an errno value.
 */
int
lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
{
	if (attr == LX_KERN_VERSION_NUM) {
		/* The caller's buffer must hold the full version string. */
		if (*bufsize < LX_VERS_MAX)
			return (ERANGE);
		/*
		 * NOTE(review): this consults curzone rather than the "zone"
		 * argument -- presumably equivalent for all callers, but
		 * confirm before relying on it.
		 */
		if (copyout(lx_get_zone_kern_version(curzone), buf,
		    LX_VERS_MAX) != 0)
			return (EFAULT);
		*bufsize = LX_VERS_MAX;
		return (0);
	}
	/*
	 * NOTE(review): returns -EINVAL while lx_setattr() returns EINVAL;
	 * confirm whether the negative value is meaningful to the caller.
	 */
	return (-EINVAL);
}
380 380
381 381 uint32_t
382 382 lx_map32limit(proc_t *p)
383 383 {
384 384 /*
385 385 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
386 386 * allow mappings in the first 31 bits. This was a nuance in the
387 387 * original Linux implementation circa 2002, and applications have
388 388 * come to depend on its behavior.
389 389 *
390 390 * This is only relevant for 64-bit processes.
391 391 */
392 392 if (p->p_model == DATAMODEL_LP64)
393 393 return (1 << 31);
394 394
395 395 return ((uint32_t)USERLIMIT32);
396 396 }
397 397
398 398 void
399 399 lx_brand_systrace_enable(void)
400 400 {
401 401 VERIFY(!lx_systrace_enabled);
402 402
403 403 lx_systrace_enabled = 1;
404 404 }
405 405
406 406 void
407 407 lx_brand_systrace_disable(void)
408 408 {
409 409 VERIFY(lx_systrace_enabled);
410 410
411 411 lx_systrace_enabled = 0;
412 412 }
413 413
/*
 * Record the current in-use extent of this LWP's native stack, firing an
 * SDT probe so the transition can be observed with DTrace.  A native stack
 * must already have been installed for the LWP (br_ntv_stack != 0).
 */
void
lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
{
	VERIFY(lwpd->br_ntv_stack != 0);

	/*
	 * The "brand-lx-set-ntv-stack-current" probe has arguments:
	 *	arg0: stack pointer before change
	 *	arg1: stack pointer after change
	 *	arg2: current stack base
	 */
	DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
	    uintptr_t, lwpd->br_ntv_stack_current,
	    uintptr_t, new_sp,
	    uintptr_t, lwpd->br_ntv_stack);

	lwpd->br_ntv_stack_current = new_sp;
}
432 432
433 433 #if defined(_LP64)
434 434 static int
435 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
436 436 enum seg_rw rw)
437 437 {
438 438 int syscall_num;
439 439
440 440 /*
441 441 * We only want to handle a very specific set of circumstances.
442 442 * Namely: this is a 64-bit LX-branded process attempting to execute an
443 443 * address in a page for which it does not have a valid mapping. If
444 444 * this is not the case, we bail out as fast as possible.
445 445 */
446 446 VERIFY(PROC_IS_BRANDED(p));
447 447 if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
448 448 DATAMODEL_NATIVE) {
449 449 return (-1);
450 450 }
451 451
452 452 if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
453 453 return (-1);
454 454 }
455 455
456 456 /*
457 457 * This is a valid vsyscall address. We service the system call and
458 458 * return 0 to signal that the pagefault has been handled completely.
459 459 */
460 460 lx_vsyscall_enter(p, lwp, syscall_num);
461 461 return (0);
462 462 }
463 463 #endif
464 464
465 465 /*
466 466 * This hook runs prior to sendsig() processing and allows us to nominate
467 467 * an alternative stack pointer for delivery of the signal handling frame.
468 468 * Critically, this routine should _not_ modify any LWP state as the
469 469 * savecontext() does not run until after this hook.
470 470 */
471 471 static caddr_t
472 472 lx_sendsig_stack(int sig)
473 473 {
474 474 klwp_t *lwp = ttolwp(curthread);
475 475 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
476 476
477 477 /*
478 478 * We want to take signal delivery on the native stack, but only if
479 479 * one has been allocated and installed for this LWP.
480 480 */
481 481 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
482 482 /*
483 483 * The program is not running on the native stack. Return
484 484 * the native stack pointer from our brand-private data so
485 485 * that we may switch to it for signal handling.
486 486 */
487 487 return ((caddr_t)lwpd->br_ntv_stack_current);
488 488 } else {
489 489 struct regs *rp = lwptoregs(lwp);
490 490
491 491 /*
492 492 * Either the program is already running on the native stack,
493 493 * or one has not yet been allocated for this LWP. Use the
494 494 * current stack pointer value.
495 495 */
496 496 return ((caddr_t)rp->r_sp);
497 497 }
498 498 }
499 499
500 500 /*
501 501 * This hook runs after sendsig() processing and allows us to update the
502 502 * per-LWP mode flags for system calls and stacks. The pre-signal
503 503 * context has already been saved and delivered to the user at this point.
504 504 */
505 505 static void
506 506 lx_sendsig(int sig)
507 507 {
508 508 klwp_t *lwp = ttolwp(curthread);
509 509 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
510 510 struct regs *rp = lwptoregs(lwp);
511 511
512 512 switch (lwpd->br_stack_mode) {
513 513 case LX_STACK_MODE_BRAND:
514 514 case LX_STACK_MODE_NATIVE:
515 515 /*
516 516 * In lx_sendsig_stack(), we nominated a stack pointer from the
517 517 * native stack. Update the stack mode, and the current in-use
518 518 * extent of the native stack, accordingly:
519 519 */
520 520 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
521 521 lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
522 522
523 523 /*
524 524 * Fix up segment registers, etc.
525 525 */
526 526 lx_switch_to_native(lwp);
527 527 break;
528 528
529 529 default:
530 530 /*
531 531 * Otherwise, the brand library has not yet installed the
532 532 * alternate stack for this LWP. Signals will be handled on
533 533 * the regular stack thread.
534 534 */
535 535 return;
536 536 }
537 537 }
538 538
/*
 * This hook runs prior to the context restoration, allowing us to take action
 * or modify the context before it is loaded.  The brand-private members of
 * the ucontext were populated by lx_savecontext(); their flags word drives
 * which pieces of per-LWP state must be put back.
 */
static void
lx_restorecontext(ucontext_t *ucp)
{
	klwp_t *lwp = ttolwp(curthread);
	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
	/* Flags and optional stack pointer packed by lx_savecontext(). */
	uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
	caddr_t sp = ucp->uc_brand_data[1];

	/*
	 * We have a saved native stack pointer value that we must restore
	 * into the per-LWP data.
	 */
	if (flags & LX_UC_RESTORE_NATIVE_SP) {
		lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
	}

	/*
	 * We do not wish to restore the value of uc_link in this context,
	 * so replace it with the value currently in the LWP.
	 */
	if (flags & LX_UC_IGNORE_LINK) {
		ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
	}

	/*
	 * Restore the stack mode:
	 */
	if (flags & LX_UC_STACK_NATIVE) {
		lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
	} else if (flags & LX_UC_STACK_BRAND) {
		lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
	}

#if defined(__amd64)
	/*
	 * Override the fs/gsbase in the context with the value provided
	 * through the Linux arch_prctl(2) system call.  Only done when
	 * returning to brand (Linux) execution, and only when a Linux
	 * value has actually been set (non-zero).
	 */
	if (flags & LX_UC_STACK_BRAND) {
		if (lwpd->br_lx_fsbase != 0) {
			ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
		}
		if (lwpd->br_lx_gsbase != 0) {
			ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
		}
	}
#endif
}
591 591
592 592 static void
593 593 lx_savecontext(ucontext_t *ucp)
594 594 {
595 595 klwp_t *lwp = ttolwp(curthread);
596 596 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
597 597 uintptr_t flags = 0;
598 598
599 599 /*
600 600 * The ucontext_t affords us three private pointer-sized members in
601 601 * "uc_brand_data". We pack a variety of flags into the first element,
602 602 * and an optional stack pointer in the second element. The flags
603 603 * determine which stack pointer (native or brand), if any, is stored
604 604 * in the second element. The third element may contain the system
605 605 * call number; this is analogous to the "orig_[er]ax" member of a
606 606 * Linux "user_regs_struct".
607 607 */
608 608
609 609 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
610 610 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
611 611 /*
612 612 * Record the value of the native stack pointer to restore
613 613 * when returning to this branded context:
614 614 */
615 615 flags |= LX_UC_RESTORE_NATIVE_SP;
616 616 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
617 617 }
618 618
619 619 /*
620 620 * Save the stack mode:
621 621 */
622 622 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
623 623 flags |= LX_UC_STACK_NATIVE;
624 624 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
625 625 flags |= LX_UC_STACK_BRAND;
626 626 }
627 627
628 628 /*
629 629 * If we might need to restart this system call, save that information
630 630 * in the context:
631 631 */
632 632 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
633 633 ucp->uc_brand_data[2] =
634 634 (void *)(uintptr_t)lwpd->br_syscall_num;
635 635 if (lwpd->br_syscall_restart) {
636 636 flags |= LX_UC_RESTART_SYSCALL;
637 637 }
638 638 } else {
639 639 ucp->uc_brand_data[2] = NULL;
640 640 }
641 641
642 642 ucp->uc_brand_data[0] = (void *)flags;
643 643 }
644 644
#if defined(_SYSCALL32_IMPL)
/*
 * 32-bit counterpart of lx_savecontext(): pack the brand-private members
 * of a ucontext32_t for a 32-bit userland on a 64-bit kernel.
 */
static void
lx_savecontext32(ucontext32_t *ucp)
{
	klwp_t *lwp = ttolwp(curthread);
	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
	unsigned int flags = 0;

	/*
	 * The three pointer-sized private slots in "uc_brand_data" are used
	 * as follows: slot 0 carries a set of flags, slot 1 optionally
	 * carries a stack pointer (which one is described by the flags),
	 * and slot 2 may carry the system call number -- analogous to the
	 * "orig_[er]ax" member of a Linux "user_regs_struct".
	 */

	if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
	    lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
		/*
		 * Preserve the native stack pointer so that it can be
		 * reinstated when this branded context is resumed.
		 */
		flags |= LX_UC_RESTORE_NATIVE_SP;
		ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
	}

	/* Encode the current stack mode into the flags. */
	switch (lwpd->br_stack_mode) {
	case LX_STACK_MODE_NATIVE:
		flags |= LX_UC_STACK_NATIVE;
		break;
	case LX_STACK_MODE_BRAND:
		flags |= LX_UC_STACK_BRAND;
		break;
	default:
		break;
	}

	/*
	 * Stash the system call number (and its restart disposition) in
	 * case this call must later be restarted.
	 */
	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
		ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
		if (lwpd->br_syscall_restart) {
			flags |= LX_UC_RESTART_SYSCALL;
		}
	} else {
		ucp->uc_brand_data[2] = NULL;
	}

	ucp->uc_brand_data[0] = flags;
}
#endif
698 698
699 699 void
700 700 lx_init_brand_data(zone_t *zone)
701 701 {
702 702 lx_zone_data_t *data;
703 703 ASSERT(zone->zone_brand == &lx_brand);
704 704 ASSERT(zone->zone_brand_data == NULL);
705 705 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
706 706 /*
707 707 * Set the default lxzd_kernel_version to 2.4.
708 708 * This can be changed by a call to setattr() during zone boot.
709 709 */
710 710 (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
711 711
712 712 /*
713 713 * Linux is not at all picky about address family when it comes to
714 714 * supporting interface-related ioctls. To mimic this behavior, we'll
715 715 * attempt those ioctls against a ksocket configured for that purpose.
716 716 */
717 717 (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0,
718 718 0, zone->zone_kcred);
719 719
720 720 zone->zone_brand_data = data;
721 721
722 722 /*
723 723 * In Linux, if the init(1) process terminates the system panics.
724 724 * The zone must reboot to simulate this behaviour.
725 725 */
726 726 zone->zone_reboot_on_init_exit = B_TRUE;
727 727 }
728 728
729 729 void
730 730 lx_free_brand_data(zone_t *zone)
731 731 {
732 732 lx_zone_data_t *data = ztolxzd(zone);
733 733 ASSERT(data != NULL);
734 734 if (data->lxzd_ioctl_sock != NULL) {
735 735 /*
736 736 * Since zone_kcred has been cleaned up already, close the
737 737 * socket using the global kcred.
738 738 */
739 739 ksocket_close(data->lxzd_ioctl_sock, kcred);
740 740 data->lxzd_ioctl_sock = NULL;
741 741 }
742 742 zone->zone_brand_data = NULL;
743 743 kmem_free(data, sizeof (*data));
744 744 }
745 745
746 746 void
747 747 lx_unsupported(char *dmsg)
748 748 {
749 749 lx_proc_data_t *pd = ttolxproc(curthread);
750 750
751 751 DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
752 752
753 753 if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
754 754 /*
755 755 * If this process was run with strict mode enabled
756 756 * (via LX_STRICT in the environment), we mark this
757 757 * LWP as having triggered an unsupported behaviour.
758 758 * This flag will be checked at an appropriate point
759 759 * by lx_check_strict_failure().
760 760 */
761 761 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
762 762
763 763 lwpd->br_strict_failure = B_TRUE;
764 764 }
765 765 }
766 766
767 767 void
768 768 lx_check_strict_failure(lx_lwp_data_t *lwpd)
769 769 {
770 770 proc_t *p;
771 771
772 772 if (!lwpd->br_strict_failure) {
773 773 return;
774 774 }
775 775
776 776 lwpd->br_strict_failure = B_FALSE;
777 777
778 778 /*
779 779 * If this process is operating in strict mode (via LX_STRICT in
780 780 * the environment), and has triggered a call to
781 781 * lx_unsupported(), we drop SIGSYS on it as we return.
782 782 */
783 783 p = curproc;
784 784 mutex_enter(&p->p_lock);
785 785 sigtoproc(p, curthread, SIGSYS);
786 786 mutex_exit(&p->p_lock);
787 787 }
788 788
789 789 void
790 790 lx_trace_sysenter(int syscall_num, uintptr_t *args)
791 791 {
792 792 if (lx_systrace_enabled) {
793 793 VERIFY(lx_systrace_entry_ptr != NULL);
794 794
795 795 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
796 796 args[2], args[3], args[4], args[5]);
797 797 }
798 798 }
799 799
800 800 void
801 801 lx_trace_sysreturn(int syscall_num, long ret)
802 802 {
803 803 if (lx_systrace_enabled) {
804 804 VERIFY(lx_systrace_return_ptr != NULL);
805 805
806 806 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
807 807 }
808 808 }
809 809
810 810 /*
811 811 * Get the addresses of the user-space system call handler and attach it to
812 812 * the proc structure. Returning 0 indicates success; the value returned
813 813 * by the system call is the value stored in rval. Returning a non-zero
814 814 * value indicates a failure; the value returned is used to set errno, -1
815 815 * is returned from the syscall and the contents of rval are ignored. To
816 816 * set errno and have the syscall return a value other than -1 we can
817 817 * manually set errno and rval and return 0.
818 818 */
819 819 int
820 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
821 821 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
822 822 {
823 823 kthread_t *t = curthread;
824 824 klwp_t *lwp = ttolwp(t);
825 825 proc_t *p = ttoproc(t);
826 826 lx_proc_data_t *pd;
827 827 struct termios *termios;
828 828 uint_t termios_len;
829 829 int error;
830 830 int code;
831 831 int sig;
832 832 lx_brand_registration_t reg;
833 833 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
834 834
	/*
	 * There is one operation that is supported for a non-branded
	 * process: B_EXEC_BRAND.  This is the equivalent of an exec
	 * call, but the new process that is created will be a branded
	 * process.
	 */
841 841 if (cmd == B_EXEC_BRAND) {
842 842 VERIFY(p->p_zone != NULL);
843 843 VERIFY(p->p_zone->zone_brand == &lx_brand);
844 844 return (exec_common(
845 845 (char *)arg1, (const char **)arg2, (const char **)arg3,
846 846 EBA_BRAND));
847 847 }
848 848
849 849 /* For all other operations this must be a branded process. */
850 850 if (p->p_brand == NULL)
851 851 return (ENOSYS);
852 852
853 853 VERIFY(p->p_brand == &lx_brand);
854 854 VERIFY(p->p_brand_data != NULL);
855 855
856 856 switch (cmd) {
857 857 case B_REGISTER:
858 858 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
859 859 lx_print("stack mode was not PREINIT during "
860 860 "REGISTER\n");
861 861 return (EINVAL);
862 862 }
863 863
864 864 if (p->p_model == DATAMODEL_NATIVE) {
865 865 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) {
866 866 lx_print("Failed to copyin brand registration "
867 867 "at 0x%p\n", (void *)arg1);
868 868 return (EFAULT);
869 869 }
870 870 }
871 871 #ifdef _LP64
872 872 else {
873 873 /* 32-bit userland on 64-bit kernel */
874 874 lx_brand_registration32_t reg32;
875 875
876 876 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) {
877 877 lx_print("Failed to copyin brand registration "
878 878 "at 0x%p\n", (void *)arg1);
879 879 return (EFAULT);
880 880 }
881 881
882 882 reg.lxbr_version = (uint_t)reg32.lxbr_version;
883 883 reg.lxbr_handler =
884 884 (void *)(uintptr_t)reg32.lxbr_handler;
885 885 reg.lxbr_flags = reg32.lxbr_flags;
886 886 }
887 887 #endif
888 888
889 889 if (reg.lxbr_version != LX_VERSION_1) {
890 890 lx_print("Invalid brand library version (%u)\n",
891 891 reg.lxbr_version);
892 892 return (EINVAL);
893 893 }
894 894
895 895 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
896 896 lx_print("Invalid brand flags (%u)\n",
897 897 reg.lxbr_flags);
898 898 return (EINVAL);
899 899 }
900 900
901 901 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
902 902 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
903 903 pd = p->p_brand_data;
904 904 pd->l_handler = (uintptr_t)reg.lxbr_handler;
905 905 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
906 906
907 907 return (0);
908 908
909 909 case B_TTYMODES:
910 910 /* This is necessary for emulating TCGETS ioctls. */
911 911 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
912 912 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
913 913 &termios_len) != DDI_SUCCESS)
914 914 return (EIO);
915 915
916 916 ASSERT(termios_len == sizeof (*termios));
917 917
918 918 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
919 919 ddi_prop_free(termios);
920 920 return (EFAULT);
921 921 }
922 922
923 923 ddi_prop_free(termios);
924 924 return (0);
925 925
926 926 case B_ELFDATA:
927 927 pd = curproc->p_brand_data;
928 928 if (get_udatamodel() == DATAMODEL_NATIVE) {
929 929 if (copyout(&pd->l_elf_data, (void *)arg1,
930 930 sizeof (lx_elf_data_t)) != 0) {
931 931 return (EFAULT);
932 932 }
933 933 }
934 934 #if defined(_LP64)
935 935 else {
936 936 /* 32-bit userland on 64-bit kernel */
937 937 lx_elf_data32_t led32;
938 938
939 939 led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
940 940 led32.ed_phent = (int)pd->l_elf_data.ed_phent;
941 941 led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
942 942 led32.ed_entry = (int)pd->l_elf_data.ed_entry;
943 943 led32.ed_base = (int)pd->l_elf_data.ed_base;
944 944 led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
945 945
946 946 if (copyout(&led32, (void *)arg1,
947 947 sizeof (led32)) != 0) {
948 948 return (EFAULT);
949 949 }
950 950 }
951 951 #endif
952 952 return (0);
953 953
954 954 case B_EXEC_NATIVE:
955 955 return (exec_common((char *)arg1, (const char **)arg2,
956 956 (const char **)arg3, EBA_NATIVE));
957 957
958 958 /*
959 959 * The B_TRUSS_POINT subcommand is used so that we can make a no-op
960 960 * syscall for debugging purposes (dtracing) from within the user-level
961 961 * emulation.
962 962 */
963 963 case B_TRUSS_POINT:
964 964 return (0);
965 965
966 966 case B_LPID_TO_SPAIR: {
967 967 /*
968 968 * Given a Linux pid as arg1, return the Solaris pid in arg2 and
969 969 * the Solaris LWP in arg3. We also translate pid 1 (which is
970 970 * hardcoded in many applications) to the zone's init process.
971 971 */
972 972 pid_t s_pid;
973 973 id_t s_tid;
974 974
975 975 if ((pid_t)arg1 == 1) {
976 976 s_pid = p->p_zone->zone_proc_initpid;
977 977 /* handle the dead/missing init(1M) case */
978 978 if (s_pid == -1)
979 979 s_pid = 1;
980 980 s_tid = 1;
981 981 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
982 982 return (ESRCH);
↓ open down ↓ |
982 lines elided |
↑ open up ↑ |
983 983 }
984 984
985 985 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
986 986 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
987 987 return (EFAULT);
988 988 }
989 989
990 990 return (0);
991 991 }
992 992
993 + case B_SIGEV_THREAD_ID: {
994 + /*
995 + * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
996 + * notification method. This mechanism is only meant
997 + * for userland threading libraries such as glibc and
998 + * is documented as such. Therefore, assume this is
999 + * only ever invoked for the purpose of alerting a
1000 + * Linux threading library. Assume that the tid is a
1001 + * member of the caller's process and the signal
1002 + * number is valid. See lx_sigev_thread_id() for the
1003 + * userland side of this emulation.
1004 + *
1005 + * The return code from this function is not checked
1006 + * by the caller since it executes in an asynchronous
1007 + * context and there is nothing much to be done. If
1008 + * this function does fail then it will manifest as
1009 + * Linux threads waiting for a signal they will never
1010 + * receive.
1011 + *
1012 + * arg1 -- Linux tid
1013 + * arg2 -- Linux signal number
1014 + * arg3 -- union sigval
1015 + */
1016 +
1017 + int native_sig = lx_ltos_signo((int)arg2, 0);
1018 + pid_t native_pid;
1019 + int native_tid;
1020 + sigqueue_t *sqp;
1021 +
1022 + if (native_sig == 0)
1023 + return (EINVAL);
1024 +
1025 + lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid);
1026 + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
1027 + mutex_enter(&curproc->p_lock);
1028 +
1029 + if ((t = idtot(curproc, native_tid)) == NULL) {
1030 + mutex_exit(&curproc->p_lock);
1031 + kmem_free(sqp, sizeof (sigqueue_t));
1032 + return (ESRCH);
1033 + }
1034 +
1035 + sqp->sq_info.si_signo = native_sig;
1036 + sqp->sq_info.si_code = SI_TIMER;
1037 + sqp->sq_info.si_pid = curproc->p_pid;
1038 + sqp->sq_info.si_zoneid = getzoneid();
1039 + sqp->sq_info.si_uid = crgetruid(CRED());
1040 + sqp->sq_info.si_value.sival_ptr = (void *)arg3;
1041 + sigaddqa(curproc, t, sqp);
1042 +
1043 + mutex_exit(&curproc->p_lock);
1044 +
1045 + return (0);
1046 + }
1047 +
993 1048 case B_SET_AFFINITY_MASK:
994 1049 case B_GET_AFFINITY_MASK:
995 1050 /*
996 1051 * Retrieve or store the CPU affinity mask for the
997 1052 * requested linux pid.
998 1053 *
999 1054 * arg1 is a linux PID (0 means curthread).
1000 1055 * arg2 is the size of the given mask.
1001 1056 * arg3 is the address of the affinity mask.
1002 1057 */
1003 1058 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
1004 1059
1005 1060 case B_PTRACE_STOP_FOR_OPT:
1006 1061 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
1007 1062 B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
1008 1063
1009 1064 case B_PTRACE_CLONE_BEGIN:
1010 1065 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
1011 1066 B_FALSE : B_TRUE));
1012 1067
1013 1068 case B_PTRACE_KERNEL:
1014 1069 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4));
1015 1070
1016 1071 case B_HELPER_WAITID: {
1017 1072 idtype_t idtype = (idtype_t)arg1;
1018 1073 id_t id = (id_t)arg2;
1019 1074 siginfo_t *infop = (siginfo_t *)arg3;
1020 1075 int options = (int)arg4;
1021 1076
1022 1077 lwpd = ttolxlwp(curthread);
1023 1078
1024 1079 /*
1025 1080 * Our brand-specific waitid helper only understands a subset of
1026 1081 * the possible idtypes. Ensure we keep to that subset here:
1027 1082 */
1028 1083 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
1029 1084 return (EINVAL);
1030 1085 }
1031 1086
1032 1087 /*
1033 1088 * Enable the return of emulated ptrace(2) stop conditions
1034 1089 * through lx_waitid_helper, and stash the Linux-specific
1035 1090 * extra waitid() flags.
1036 1091 */
1037 1092 lwpd->br_waitid_emulate = B_TRUE;
1038 1093 lwpd->br_waitid_flags = (int)arg5;
1039 1094
1040 1095 #if defined(_SYSCALL32_IMPL)
1041 1096 if (get_udatamodel() != DATAMODEL_NATIVE) {
1042 1097 return (waitsys32(idtype, id, infop, options));
1043 1098 } else
1044 1099 #endif
1045 1100 {
1046 1101 return (waitsys(idtype, id, infop, options));
1047 1102 }
1048 1103
1049 1104 lwpd->br_waitid_emulate = B_FALSE;
1050 1105 lwpd->br_waitid_flags = 0;
1051 1106
1052 1107 return (0);
1053 1108 }
1054 1109
1055 1110 case B_UNSUPPORTED: {
1056 1111 char dmsg[256];
1057 1112
1058 1113 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
1059 1114 lx_print("Failed to copyin unsupported msg "
1060 1115 "at 0x%p\n", (void *)arg1);
1061 1116 return (EFAULT);
1062 1117 }
1063 1118 dmsg[255] = '\0';
1064 1119 lx_unsupported(dmsg);
1065 1120
1066 1121 lx_check_strict_failure(lwpd);
1067 1122
1068 1123 return (0);
1069 1124 }
1070 1125
1071 1126 case B_STORE_ARGS: {
1072 1127 /*
1073 1128 * B_STORE_ARGS subcommand
1074 1129 * arg1 = address of struct to be copied in
1075 1130 * arg2 = size of the struct being copied in
1076 1131 * arg3-arg6 ignored
1077 1132 * rval = the amount of data copied.
1078 1133 */
1079 1134 void *buf;
1080 1135
1081 1136 /* only have upper limit because arg2 is unsigned */
1082 1137 if (arg2 > LX_BR_ARGS_SIZE_MAX) {
1083 1138 return (EINVAL);
1084 1139 }
1085 1140
1086 1141 buf = kmem_alloc(arg2, KM_SLEEP);
1087 1142 if (copyin((void *)arg1, buf, arg2) != 0) {
1088 1143 lx_print("Failed to copyin scall arg at 0x%p\n",
1089 1144 (void *) arg1);
1090 1145 kmem_free(buf, arg2);
1091 1146 /*
1092 1147 * Purposely not setting br_scall_args to NULL
1093 1148 * to preserve data for debugging.
1094 1149 */
1095 1150 return (EFAULT);
1096 1151 }
1097 1152
1098 1153 if (lwpd->br_scall_args != NULL) {
1099 1154 ASSERT(lwpd->br_args_size > 0);
1100 1155 kmem_free(lwpd->br_scall_args,
1101 1156 lwpd->br_args_size);
1102 1157 }
1103 1158
1104 1159 lwpd->br_scall_args = buf;
1105 1160 lwpd->br_args_size = arg2;
1106 1161 *rval = arg2;
1107 1162 return (0);
1108 1163 }
1109 1164
1110 1165 case B_HELPER_CLONE:
1111 1166 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
1112 1167 (void *)arg4));
1113 1168
1114 1169 case B_HELPER_SETGROUPS:
1115 1170 return (lx_helper_setgroups(arg1, (gid_t *)arg2));
1116 1171
1117 1172 case B_HELPER_SIGQUEUE:
1118 1173 return (lx_helper_rt_sigqueueinfo(arg1, arg2,
1119 1174 (siginfo_t *)arg3));
1120 1175
1121 1176 case B_HELPER_TGSIGQUEUE:
1122 1177 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
1123 1178 (siginfo_t *)arg4));
1124 1179
1125 1180 case B_SET_THUNK_PID:
1126 1181 lwpd->br_lx_thunk_pid = arg1;
1127 1182 return (0);
1128 1183
1129 1184 case B_GETPID:
1130 1185 /*
1131 1186 * The usermode clone(2) code needs to be able to call
1132 1187 * lx_getpid() from native code:
1133 1188 */
1134 1189 *rval = lx_getpid();
1135 1190 return (0);
1136 1191
1137 1192 case B_SET_NATIVE_STACK:
1138 1193 /*
1139 1194 * B_SET_NATIVE_STACK subcommand
1140 1195 * arg1 = the base of the stack to use for emulation
1141 1196 */
1142 1197 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
1143 1198 lx_print("B_SET_NATIVE_STACK when stack was already "
1144 1199 "set to %p\n", (void *)arg1);
1145 1200 return (EEXIST);
1146 1201 }
1147 1202
1148 1203 /*
1149 1204 * We move from the PREINIT state, where we have no brand
1150 1205 * emulation stack, to the INIT state. Here, we are still
1151 1206 * running on what will become the BRAND stack, but are running
1152 1207 * emulation (i.e. native) code. Once the initialisation
1153 1208 * process for this thread has finished, we will jump to
1154 1209 * brand-specific code, while moving to the BRAND mode.
1155 1210 *
1156 1211 * When a new LWP is created, lx_initlwp() will clear the
1157 1212 * stack data. If that LWP is actually being duplicated
1158 1213 * into a child process by fork(2), lx_forklwp() will copy
1159 1214 * it so that the cloned thread will keep using the same
1160 1215 * alternate stack.
1161 1216 */
1162 1217 lwpd->br_ntv_stack = arg1;
1163 1218 lwpd->br_stack_mode = LX_STACK_MODE_INIT;
1164 1219 lx_lwp_set_native_stack_current(lwpd, arg1);
1165 1220
1166 1221 return (0);
1167 1222
1168 1223 case B_GET_CURRENT_CONTEXT:
1169 1224 /*
1170 1225 * B_GET_CURRENT_CONTEXT subcommand:
1171 1226 * arg1 = address for pointer to current ucontext_t
1172 1227 */
1173 1228
1174 1229 #if defined(_SYSCALL32_IMPL)
1175 1230 if (get_udatamodel() != DATAMODEL_NATIVE) {
1176 1231 caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
1177 1232
1178 1233 error = copyout(&addr, (void *)arg1, sizeof (addr));
1179 1234 } else
1180 1235 #endif
1181 1236 {
1182 1237 error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
1183 1238 sizeof (lwp->lwp_oldcontext));
1184 1239 }
1185 1240
1186 1241 return (error != 0 ? EFAULT : 0);
1187 1242
1188 1243 case B_JUMP_TO_LINUX:
1189 1244 /*
1190 1245 * B_JUMP_TO_LINUX subcommand:
1191 1246 * arg1 = ucontext_t pointer for jump state
1192 1247 */
1193 1248
1194 1249 if (arg1 == NULL)
1195 1250 return (EINVAL);
1196 1251
1197 1252 switch (lwpd->br_stack_mode) {
1198 1253 case LX_STACK_MODE_NATIVE: {
1199 1254 struct regs *rp = lwptoregs(lwp);
1200 1255
1201 1256 /*
1202 1257 * We are on the NATIVE stack, so we must preserve
1203 1258 * the extent of that stack. The pointer will be
1204 1259 * reset by a future setcontext().
1205 1260 */
1206 1261 lx_lwp_set_native_stack_current(lwpd,
1207 1262 (uintptr_t)rp->r_sp);
1208 1263 break;
1209 1264 }
1210 1265
1211 1266 case LX_STACK_MODE_INIT:
1212 1267 /*
1213 1268 * The LWP is transitioning to Linux code for the first
1214 1269 * time.
1215 1270 */
1216 1271 break;
1217 1272
1218 1273 case LX_STACK_MODE_PREINIT:
1219 1274 /*
1220 1275 * This LWP has not installed an alternate stack for
1221 1276 * usermode emulation handling.
1222 1277 */
1223 1278 return (ENOENT);
1224 1279
1225 1280 case LX_STACK_MODE_BRAND:
1226 1281 /*
1227 1282 * The LWP should not be on the BRAND stack.
1228 1283 */
1229 1284 exit(CLD_KILLED, SIGSYS);
1230 1285 return (0);
1231 1286 }
1232 1287
1233 1288 /*
1234 1289 * Transfer control to Linux:
1235 1290 */
1236 1291 return (lx_runexe(lwp, (void *)arg1));
1237 1292
1238 1293 case B_EMULATION_DONE:
1239 1294 /*
1240 1295 * B_EMULATION_DONE subcommand:
1241 1296 * arg1 = ucontext_t * to restore
1242 1297 * arg2 = system call number
1243 1298 * arg3 = return code
1244 1299 * arg4 = if operation failed, the errno value
1245 1300 */
1246 1301
1247 1302 /*
1248 1303 * The first part of this operation is a setcontext() to
1249 1304 * restore the register state to the copy we preserved
1250 1305 * before vectoring to the usermode emulation routine.
1251 1306 * If that fails, we return (hopefully) to the emulation
1252 1307 * routine and it will handle the error.
1253 1308 */
1254 1309 #if (_SYSCALL32_IMPL)
1255 1310 if (get_udatamodel() != DATAMODEL_NATIVE) {
1256 1311 error = getsetcontext32(SETCONTEXT, (void *)arg1);
1257 1312 } else
1258 1313 #endif
1259 1314 {
1260 1315 error = getsetcontext(SETCONTEXT, (void *)arg1);
1261 1316 }
1262 1317
1263 1318 if (error != 0) {
1264 1319 return (error);
1265 1320 }
1266 1321
1267 1322 /*
1268 1323 * The saved Linux context has been restored. We handle the
1269 1324 * return value or errno with code common to the in-kernel
1270 1325 * system call emulation.
1271 1326 */
1272 1327 if ((error = (int)arg4) != 0) {
1273 1328 /*
1274 1329 * lx_syscall_return() looks at the errno in the LWP,
1275 1330 * so set it here:
1276 1331 */
1277 1332 set_errno(error);
1278 1333 }
1279 1334 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
1280 1335
1281 1336 return (0);
1282 1337
1283 1338 case B_EXIT_AS_SIG:
1284 1339 code = CLD_KILLED;
1285 1340 sig = (int)arg1;
1286 1341 proc_is_exiting(p);
1287 1342 if (exitlwps(1) != 0) {
1288 1343 mutex_enter(&p->p_lock);
1289 1344 lwp_exit();
1290 1345 }
1291 1346 ttolwp(curthread)->lwp_cursig = sig;
1292 1347 if (sig == SIGSEGV) {
1293 1348 if (core(sig, 0) == 0)
1294 1349 code = CLD_DUMPED;
1295 1350 }
1296 1351 exit(code, sig);
1297 1352 /* NOTREACHED */
1298 1353 break;
1299 1354 }
1300 1355
1301 1356 return (EINVAL);
1302 1357 }
1303 1358
1304 1359 char *
1305 1360 lx_get_zone_kern_version(zone_t *zone)
1306 1361 {
1307 1362 return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
1308 1363 }
1309 1364
1310 1365 void
1311 1366 lx_set_kern_version(zone_t *zone, char *vers)
1312 1367 {
1313 1368 lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
1314 1369
1315 1370 (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
1316 1371 }
1317 1372
1318 1373 /*
1319 1374 * Compare linux kernel version to the one set for the zone.
1320 1375 * Returns greater than 0 if zone version is higher, less than 0 if the zone
1321 1376  * version is lower, and 0 if the versions are equal.
1322 1377 */
1323 1378 int
1324 1379 lx_kern_version_cmp(zone_t *zone, const char *vers)
1325 1380 {
1326 1381 int zvers[3] = {0, 0, 0};
1327 1382 int cvers[3] = {0, 0, 0};
1328 1383 int i;
1329 1384
1330 1385 VERIFY(zone->zone_brand == &lx_brand);
1331 1386
1332 1387 (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
1333 1388 &zvers[1], &zvers[2]);
1334 1389 (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
1335 1390
1336 1391 for (i = 0; i < 3; i++) {
1337 1392 if (zvers[i] > cvers[i]) {
1338 1393 return (1);
1339 1394 } else if (zvers[i] < cvers[i]) {
1340 1395 return (-1);
1341 1396 }
1342 1397 }
1343 1398 return (0);
1344 1399 }
1345 1400
1346 1401 /*
1347 1402 * Linux unconditionally removes the setuid and setgid bits when changing
1348 1403 * file ownership. This brand hook overrides the illumos native behaviour,
1349 1404 * which is based on the PRIV_FILE_SETID privilege.
1350 1405 */
1351 1406 static int
1352 1407 lx_setid_clear(vattr_t *vap, cred_t *cr)
1353 1408 {
1354 1409 if (S_ISDIR(vap->va_mode)) {
1355 1410 return (0);
1356 1411 }
1357 1412
1358 1413 if (vap->va_mode & S_ISUID) {
1359 1414 vap->va_mask |= AT_MODE;
1360 1415 vap->va_mode &= ~S_ISUID;
1361 1416 }
1362 1417 if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1363 1418 vap->va_mask |= AT_MODE;
1364 1419 vap->va_mode &= ~S_ISGID;
1365 1420 }
1366 1421
1367 1422 return (0);
1368 1423 }
1369 1424
1370 1425 /*
1371 1426 * Copy the per-process brand data from a parent proc to a child.
1372 1427 */
1373 1428 void
1374 1429 lx_copy_procdata(proc_t *child, proc_t *parent)
1375 1430 {
1376 1431 lx_proc_data_t *cpd = child->p_brand_data;
1377 1432 lx_proc_data_t *ppd = parent->p_brand_data;
1378 1433
1379 1434 VERIFY(parent->p_brand == &lx_brand);
1380 1435 VERIFY(child->p_brand == &lx_brand);
1381 1436 VERIFY(ppd != NULL);
1382 1437 VERIFY(cpd != NULL);
1383 1438
1384 1439 *cpd = *ppd;
1385 1440
1386 1441 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
1387 1442 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
1388 1443
1389 1444 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
1390 1445 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
1391 1446
1392 1447 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
1393 1448 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
1394 1449
1395 1450 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
1396 1451 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
1397 1452 }
1398 1453
1399 1454 #if defined(_LP64)
1400 1455 static void
1401 1456 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
1402 1457 {
1403 1458 bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
1404 1459 dst->e_type = src->e_type;
1405 1460 dst->e_machine = src->e_machine;
1406 1461 dst->e_version = src->e_version;
1407 1462 dst->e_entry = src->e_entry;
1408 1463 dst->e_phoff = src->e_phoff;
1409 1464 dst->e_shoff = src->e_shoff;
1410 1465 dst->e_flags = src->e_flags;
1411 1466 dst->e_ehsize = src->e_ehsize;
1412 1467 dst->e_phentsize = src->e_phentsize;
1413 1468 dst->e_phnum = src->e_phnum;
1414 1469 dst->e_shentsize = src->e_shentsize;
1415 1470 dst->e_shnum = src->e_shnum;
1416 1471 dst->e_shstrndx = src->e_shstrndx;
1417 1472 }
1418 1473 #endif /* _LP64 */
1419 1474
1420 1475 static void
1421 1476 restoreexecenv(struct execenv *ep, stack_t *sp)
1422 1477 {
1423 1478 klwp_t *lwp = ttolwp(curthread);
1424 1479
1425 1480 setexecenv(ep);
1426 1481 lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
1427 1482 lwp->lwp_sigaltstack.ss_size = sp->ss_size;
1428 1483 lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
1429 1484 }
1430 1485
1431 1486 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
1432 1487 long *, int, caddr_t, cred_t *, int *);
1433 1488
1434 1489 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
1435 1490 long *, int, caddr_t, cred_t *, int *);
1436 1491
1437 1492 /*
1438 1493 * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
1439 1494 * binaries.
1440 1495 */
1441 1496 static int
1442 1497 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
1443 1498 struct intpdata *idata, int level, long *execsz, int setid,
1444 1499 caddr_t exec_file, struct cred *cred, int *brand_action)
1445 1500 {
1446 1501 int error;
1447 1502 vnode_t *nvp;
1448 1503 Ehdr ehdr;
1449 1504 Addr uphdr_vaddr;
1450 1505 intptr_t voffset;
1451 1506 char *interp = NULL;
1452 1507 uintptr_t ldaddr = NULL;
1453 1508 int i;
1454 1509 proc_t *p = ttoproc(curthread);
1455 1510 klwp_t *lwp = ttolwp(curthread);
1456 1511 struct execenv env;
1457 1512 struct execenv origenv;
1458 1513 stack_t orig_sigaltstack;
1459 1514 struct user *up = PTOU(ttoproc(curthread));
1460 1515 lx_elf_data_t *edp;
1461 1516 char *lib_path = NULL;
1462 1517
1463 1518 ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
1464 1519 ASSERT(ttoproc(curthread)->p_brand_data != NULL);
1465 1520
1466 1521 edp = &ttolxproc(curthread)->l_elf_data;
1467 1522
1468 1523 if (args->to_model == DATAMODEL_NATIVE) {
1469 1524 lib_path = LX_LIB_PATH;
1470 1525 }
1471 1526 #if defined(_LP64)
1472 1527 else {
1473 1528 lib_path = LX_LIB_PATH32;
1474 1529 }
1475 1530 #endif
1476 1531
1477 1532 /*
1478 1533 * Set the brandname and library name for the new process so that
1479 1534 * elfexec() puts them onto the stack.
1480 1535 */
1481 1536 args->brandname = LX_BRANDNAME;
1482 1537 args->emulator = lib_path;
1483 1538
1484 1539 #if defined(_LP64)
1485 1540 /*
1486 1541 * To conform with the way Linux lays out the address space, we clamp
1487 1542 * the stack to be the top of the lower region of the x86-64 canonical
1488 1543 * form address space -- which has the side-effect of laying out the
1489 1544 * entire address space in that lower region. Note that this only
1490 1545 * matters on 64-bit processes (this value will always be greater than
1491 1546 * the size of a 32-bit address space) and doesn't actually affect
1492 1547 * USERLIMIT: if a Linux-branded processes wishes to map something
1493 1548 * into the top half of the address space, it can do so -- but with
1494 1549 * the user stack starting at the top of the bottom region, those high
1495 1550 * virtual addresses won't be used unless explicitly directed.
1496 1551 */
1497 1552 args->maxstack = lx_maxstack64;
1498 1553 #endif
1499 1554
1500 1555 /*
1501 1556 * We will first exec the brand library, then map in the linux
1502 1557 * executable and the linux linker.
1503 1558 */
1504 1559 if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
1505 1560 &nvp))) {
1506 1561 uprintf("%s: not found.", lib_path);
1507 1562 return (error);
1508 1563 }
1509 1564
1510 1565 /*
1511 1566 * We will eventually set the p_exec member to be the vnode for the new
1512 1567 * executable when we call setexecenv(). However, if we get an error
1513 1568 * before that call we need to restore the execenv to its original
1514 1569 * values so that when we return to the caller fop_close() works
1515 1570 * properly while cleaning up from the failed exec(). Restoring the
1516 1571 * original value will also properly decrement the 2nd VN_RELE that we
1517 1572 * took on the brand library.
1518 1573 */
1519 1574 origenv.ex_bssbase = p->p_bssbase;
1520 1575 origenv.ex_brkbase = p->p_brkbase;
1521 1576 origenv.ex_brksize = p->p_brksize;
1522 1577 origenv.ex_vp = p->p_exec;
1523 1578 orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
1524 1579 orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
1525 1580 orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
1526 1581
1527 1582 if (args->to_model == DATAMODEL_NATIVE) {
1528 1583 error = elfexec(nvp, uap, args, idata, level + 1, execsz,
1529 1584 setid, exec_file, cred, brand_action);
1530 1585 }
1531 1586 #if defined(_LP64)
1532 1587 else {
1533 1588 error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
1534 1589 setid, exec_file, cred, brand_action);
1535 1590 }
1536 1591 #endif
1537 1592 VN_RELE(nvp);
1538 1593 if (error != 0) {
1539 1594 restoreexecenv(&origenv, &orig_sigaltstack);
1540 1595 return (error);
1541 1596 }
1542 1597
1543 1598 /*
1544 1599 * exec-ed in the brand library above.
1545 1600 * The u_auxv vectors are now setup by elfexec to point to the
1546 1601 * brand emulation library and its linker.
1547 1602 */
1548 1603
1549 1604 bzero(&env, sizeof (env));
1550 1605
1551 1606 /*
1552 1607 	 * map in the Linux executable
1553 1608 */
1554 1609 if (args->to_model == DATAMODEL_NATIVE) {
1555 1610 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
1556 1611 &voffset, exec_file, &interp, &env.ex_bssbase,
1557 1612 &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1558 1613 }
1559 1614 #if defined(_LP64)
1560 1615 else {
1561 1616 Elf32_Ehdr ehdr32;
1562 1617 Elf32_Addr uphdr_vaddr32;
1563 1618
1564 1619 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
1565 1620 &voffset, exec_file, &interp, &env.ex_bssbase,
1566 1621 &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1567 1622
1568 1623 Ehdr32to64(&ehdr32, &ehdr);
1569 1624
1570 1625 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1571 1626 uphdr_vaddr = (Addr)-1;
1572 1627 else
1573 1628 uphdr_vaddr = uphdr_vaddr32;
1574 1629 }
1575 1630 #endif
1576 1631 if (error != 0) {
1577 1632 restoreexecenv(&origenv, &orig_sigaltstack);
1578 1633
1579 1634 if (interp != NULL)
1580 1635 kmem_free(interp, MAXPATHLEN);
1581 1636
1582 1637 return (error);
1583 1638 }
1584 1639
1585 1640 /*
1586 1641 * Save off the important properties of the lx executable. The brand
1587 1642 * library will ask us for this data later, when it is ready to set
1588 1643 * things up for the lx executable.
1589 1644 */
1590 1645 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
1591 1646 voffset + uphdr_vaddr;
1592 1647 edp->ed_entry = voffset + ehdr.e_entry;
1593 1648 edp->ed_phent = ehdr.e_phentsize;
1594 1649 edp->ed_phnum = ehdr.e_phnum;
1595 1650
1596 1651 if (interp != NULL) {
1597 1652 if (ehdr.e_type == ET_DYN) {
1598 1653 /*
1599 1654 * This is a shared object executable, so we need to
1600 1655 * pick a reasonable place to put the heap. Just don't
1601 1656 * use the first page.
1602 1657 */
1603 1658 env.ex_brkbase = (caddr_t)PAGESIZE;
1604 1659 env.ex_bssbase = (caddr_t)PAGESIZE;
1605 1660 }
1606 1661
1607 1662 /*
1608 1663 * If the program needs an interpreter (most do), map it in and
1609 1664 * store relevant information about it in the aux vector, where
1610 1665 * the brand library can find it.
1611 1666 */
1612 1667 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
1613 1668 NULLVPP, &nvp))) {
1614 1669 uprintf("%s: not found.", interp);
1615 1670 restoreexecenv(&origenv, &orig_sigaltstack);
1616 1671 kmem_free(interp, MAXPATHLEN);
1617 1672 return (error);
1618 1673 }
1619 1674
1620 1675 kmem_free(interp, MAXPATHLEN);
1621 1676 interp = NULL;
1622 1677
1623 1678 /*
1624 1679 * map in the Linux linker
1625 1680 */
1626 1681 if (args->to_model == DATAMODEL_NATIVE) {
1627 1682 error = mapexec_brand(nvp, args, &ehdr,
1628 1683 &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
1629 1684 NULL, NULL, NULL, &ldaddr);
1630 1685 }
1631 1686 #if defined(_LP64)
1632 1687 else {
1633 1688 Elf32_Ehdr ehdr32;
1634 1689 Elf32_Addr uphdr_vaddr32;
1635 1690
1636 1691 error = mapexec32_brand(nvp, args, &ehdr32,
1637 1692 &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
1638 1693 NULL, NULL, NULL, &ldaddr);
1639 1694
1640 1695 Ehdr32to64(&ehdr32, &ehdr);
1641 1696
1642 1697 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1643 1698 uphdr_vaddr = (Addr)-1;
1644 1699 else
1645 1700 uphdr_vaddr = uphdr_vaddr32;
1646 1701 }
1647 1702 #endif
1648 1703
1649 1704 VN_RELE(nvp);
1650 1705 if (error != 0) {
1651 1706 restoreexecenv(&origenv, &orig_sigaltstack);
1652 1707 return (error);
1653 1708 }
1654 1709
1655 1710 /*
1656 1711 * Now that we know the base address of the brand's linker,
1657 1712 * we also save this for later use by the brand library.
1658 1713 */
1659 1714 edp->ed_base = voffset;
1660 1715 edp->ed_ldentry = voffset + ehdr.e_entry;
1661 1716 } else {
1662 1717 /*
1663 1718 * This program has no interpreter. The lx brand library will
1664 1719 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
1665 1720 * so in this case, put the entry point of the main executable
1666 1721 * there.
1667 1722 */
1668 1723 if (ehdr.e_type == ET_EXEC) {
1669 1724 /*
1670 1725 * An executable with no interpreter, this must be a
1671 1726 * statically linked executable, which means we loaded
1672 1727 * it at the address specified in the elf header, in
1673 1728 * which case the e_entry field of the elf header is an
1674 1729 * absolute address.
1675 1730 */
1676 1731 edp->ed_ldentry = ehdr.e_entry;
1677 1732 edp->ed_entry = ehdr.e_entry;
1678 1733 } else {
1679 1734 /*
1680 1735 * A shared object with no interpreter, we use the
1681 1736 * calculated address from above.
1682 1737 */
1683 1738 edp->ed_ldentry = edp->ed_entry;
1684 1739
1685 1740 /*
1686 1741 * In all situations except an ET_DYN elf object with no
1687 1742 * interpreter, we want to leave the brk and base
1688 1743 * values set by mapexec_brand alone. Normally when
1689 1744 * running ET_DYN objects on Solaris (most likely
1690 1745 * /lib/ld.so.1) the kernel sets brk and base to 0 since
1691 1746 * it doesn't know where to put the heap, and later the
1692 1747 * linker will call brk() to initialize the heap in:
1693 1748 * usr/src/cmd/sgs/rtld/common/setup.c:setup()
1694 1749 * after it has determined where to put it. (This
1695 1750 * decision is made after the linker loads and inspects
1696 1751 * elf properties of the target executable being run.)
1697 1752 *
1698 1753 * So for ET_DYN Linux executables, we also don't know
1699 1754 * where the heap should go, so we'll set the brk and
1700 1755 * base to 0. But in this case the Solaris linker will
1701 1756 * not initialize the heap, so when the Linux linker
1702 1757 * starts running there is no heap allocated. This
1703 1758 * seems to be ok on Linux 2.4 based systems because the
1704 1759 * Linux linker/libc fall back to using mmap() to
1705 1760 * allocate memory. But on 2.6 systems, running
1706 1761 * applications by specifying them as command line
1707 1762 * arguments to the linker results in segfaults for an
1708 1763 			 * as yet undetermined reason (which seems to indicate
1709 1764 			 * that a more permanent fix for heap initialization in
1710 1765 * these cases may be necessary).
1711 1766 */
1712 1767 if (ehdr.e_type == ET_DYN) {
1713 1768 env.ex_bssbase = (caddr_t)0;
1714 1769 env.ex_brkbase = (caddr_t)0;
1715 1770 env.ex_brksize = 0;
1716 1771 }
1717 1772 }
1718 1773
1719 1774 }
1720 1775
1721 1776 env.ex_vp = vp;
1722 1777 setexecenv(&env);
1723 1778
1724 1779 /*
1725 1780 * We try to keep /proc's view of the aux vector consistent with
1726 1781 * what's on the process stack.
1727 1782 */
1728 1783 if (args->to_model == DATAMODEL_NATIVE) {
1729 1784 auxv_t phdr_auxv[4] = {
1730 1785 { AT_SUN_BRAND_LX_PHDR, 0 },
1731 1786 { AT_SUN_BRAND_LX_INTERP, 0 },
1732 1787 { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
1733 1788 { AT_SUN_BRAND_AUX4, 0 }
1734 1789 };
1735 1790 phdr_auxv[0].a_un.a_val = edp->ed_phdr;
1736 1791 phdr_auxv[1].a_un.a_val = ldaddr;
1737 1792 phdr_auxv[2].a_un.a_val = 1; /* set in lx_init */
1738 1793 phdr_auxv[3].a_type = AT_CLKTCK;
1739 1794 phdr_auxv[3].a_un.a_val = hz;
1740 1795
1741 1796 if (copyout(&phdr_auxv, args->auxp_brand,
1742 1797 sizeof (phdr_auxv)) == -1)
1743 1798 return (EFAULT);
1744 1799 }
1745 1800 #if defined(_LP64)
1746 1801 else {
1747 1802 auxv32_t phdr_auxv32[3] = {
1748 1803 { AT_SUN_BRAND_LX_PHDR, 0 },
1749 1804 { AT_SUN_BRAND_LX_INTERP, 0 },
1750 1805 { AT_SUN_BRAND_AUX3, 0 }
1751 1806 };
1752 1807 phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
1753 1808 phdr_auxv32[1].a_un.a_val = ldaddr;
1754 1809 phdr_auxv32[2].a_type = AT_CLKTCK;
1755 1810 phdr_auxv32[2].a_un.a_val = hz;
1756 1811
1757 1812 if (copyout(&phdr_auxv32, args->auxp_brand,
1758 1813 sizeof (phdr_auxv32)) == -1)
1759 1814 return (EFAULT);
1760 1815 }
1761 1816 #endif
1762 1817
1763 1818 /*
1764 1819 * /proc uses the AT_ENTRY aux vector entry to deduce
1765 1820 * the location of the executable in the address space. The user
1766 1821 * structure contains a copy of the aux vector that needs to have those
1767 1822 * entries patched with the values of the real lx executable (they
1768 1823 * currently contain the values from the lx brand library that was
1769 1824 * elfexec'd, above).
1770 1825 *
1771 1826 * For live processes, AT_BASE is used to locate the linker segment,
1772 1827 * which /proc and friends will later use to find Solaris symbols
1773 1828 * (such as rtld_db_preinit). However, for core files, /proc uses
1774 1829 * AT_ENTRY to find the right segment to label as the executable.
1775 1830 * So we set AT_ENTRY to be the entry point of the linux executable,
1776 1831 * but leave AT_BASE to be the address of the Solaris linker.
1777 1832 */
1778 1833 for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
1779 1834 switch (up->u_auxv[i].a_type) {
1780 1835 case AT_ENTRY:
1781 1836 up->u_auxv[i].a_un.a_val = edp->ed_entry;
1782 1837 break;
1783 1838
1784 1839 case AT_SUN_BRAND_LX_PHDR:
1785 1840 up->u_auxv[i].a_un.a_val = edp->ed_phdr;
1786 1841 break;
1787 1842
1788 1843 case AT_SUN_BRAND_LX_INTERP:
1789 1844 up->u_auxv[i].a_un.a_val = ldaddr;
1790 1845 break;
1791 1846
1792 1847 default:
1793 1848 break;
1794 1849 }
1795 1850 }
1796 1851
1797 1852 return (0);
1798 1853 }
1799 1854
1800 1855 boolean_t
1801 1856 lx_native_exec(uint8_t osabi, const char **interp)
1802 1857 {
1803 1858 if (osabi != ELFOSABI_SOLARIS)
1804 1859 return (B_FALSE);
1805 1860
1806 1861 /*
1807 1862 * If the process root matches the zone root, prepend /native to the
1808 1863 * interpreter path for native executables. Absolute precision from
1809 1864 * VN_CMP is not necessary since any change of process root is likely
1810 1865 * to make native binaries inaccessible via /native.
1811 1866 *
1812 1867 * Processes which chroot directly into /native will be able to
1813 1868 * function as expected with no need for the prefix.
1814 1869 */
1815 1870 if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
1816 1871 *interp = "/native";
1817 1872 }
1818 1873
1819 1874 return (B_TRUE);
1820 1875 }
1821 1876
1822 1877 static void
1823 1878 lx_syscall_init(void)
1824 1879 {
1825 1880 int i;
1826 1881
1827 1882 /*
1828 1883 * Count up the 32-bit Linux system calls. Note that lx_sysent32
1829 1884 * has (LX_NSYSCALLS + 1) entries.
1830 1885 */
1831 1886 for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
1832 1887 continue;
1833 1888 lx_nsysent32 = i;
1834 1889
1835 1890 #if defined(_LP64)
1836 1891 /*
1837 1892 * Count up the 64-bit Linux system calls. Note that lx_sysent64
1838 1893 * has (LX_NSYSCALLS + 1) entries.
1839 1894 */
1840 1895 for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
1841 1896 continue;
1842 1897 lx_nsysent64 = i;
1843 1898 #endif
1844 1899 }
1845 1900
1846 1901 int
1847 1902 _init(void)
1848 1903 {
1849 1904 int err = 0;
1850 1905
1851 1906 lx_syscall_init();
1852 1907 lx_pid_init();
1853 1908 lx_ioctl_init();
1854 1909 lx_futex_init();
1855 1910 lx_ptrace_init();
1856 1911 lx_socket_init();
1857 1912
1858 1913 err = mod_install(&modlinkage);
1859 1914 if (err != 0) {
1860 1915 cmn_err(CE_WARN, "Couldn't install lx brand module");
1861 1916
1862 1917 /*
1863 1918 * This looks drastic, but it should never happen. These
1864 1919 * two data structures should be completely free-able until
1865 1920 * they are used by Linux processes. Since the brand
1866 1921 * wasn't loaded there should be no Linux processes, and
1867 1922 * thus no way for these data structures to be modified.
1868 1923 */
1869 1924 lx_pid_fini();
1870 1925 lx_ioctl_fini();
1871 1926 if (lx_futex_fini())
1872 1927 panic("lx brand module cannot be loaded or unloaded.");
1873 1928 }
1874 1929 return (err);
1875 1930 }
1876 1931
/*
 * Module information entry point: report this module's description via
 * the common module framework.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
1882 1937
1883 1938 int
1884 1939 _fini(void)
1885 1940 {
1886 1941 int err;
1887 1942 int futex_done = 0;
1888 1943
1889 1944 /*
1890 1945 * If there are any zones using this brand, we can't allow it to be
1891 1946 * unloaded.
1892 1947 */
1893 1948 if (brand_zone_count(&lx_brand))
1894 1949 return (EBUSY);
1895 1950
1896 1951 lx_ptrace_fini();
1897 1952 lx_pid_fini();
1898 1953 lx_ioctl_fini();
1899 1954 lx_socket_fini();
1900 1955
1901 1956 if ((err = lx_futex_fini()) != 0) {
1902 1957 goto done;
1903 1958 }
1904 1959 futex_done = 1;
1905 1960
1906 1961 err = mod_remove(&modlinkage);
1907 1962
1908 1963 done:
1909 1964 if (err) {
1910 1965 /*
1911 1966 * If we can't unload the module, then we have to get it
1912 1967 * back into a sane state.
1913 1968 */
1914 1969 lx_ptrace_init();
1915 1970 lx_pid_init();
1916 1971 lx_ioctl_init();
1917 1972 lx_socket_init();
1918 1973
1919 1974 if (futex_done) {
1920 1975 lx_futex_init();
1921 1976 }
1922 1977 }
1923 1978
1924 1979 return (err);
1925 1980 }
↓ open down ↓ |
923 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX