Print this page
OS-???? [lx] SIGEV_THREAD_ID emulation needed
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/brand/lx/os/lx_brand.c
+++ new/usr/src/uts/common/brand/lx/os/lx_brand.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2015, Joyent, Inc. All rights reserved.
29 29 */
30 30
31 31 /*
32 32 * The LX Brand: emulation of a Linux operating environment within a zone.
33 33 *
34 34 * OVERVIEW
35 35 *
36 36 * The LX brand enables a full Linux userland -- including a C library,
37 37 * init(1) framework, and some set of applications -- to run unmodified
38 38 * within an illumos zone. Unlike illumos, where applications are expected
39 39 * to link against and consume functions exported from libraries, the
40 40 * supported Linux binary compatibility boundary is the system call
41 41 * interface. By accurately emulating the behaviour of Linux system calls,
42 42 * Linux software can be executed in this environment as if it were running
43 43 * on a native Linux system.
44 44 *
45 45 * EMULATING LINUX SYSTEM CALLS
46 46 *
47 47 * Linux system calls are made in 32-bit processes via the "int 0x80"
48 48 * instruction; in 64-bit processes the "syscall" instruction is used, as it
49 49 * is with native illumos processes. In both cases, arguments to system
50 50 * calls are generally passed in registers and the usermode stack is not
51 51 * interpreted or modified by the Linux kernel.
52 52 *
53 53 * When the emulated Linux process makes a system call, it traps into the
54 54 * illumos kernel. The in-kernel brand module contains various emulation
55 55 * routines, and can fully service some emulated system calls; e.g. read(2)
56 56 * and write(2). Other system calls require assistance from the illumos
57 57 * libc, bouncing back out to the brand library ("lx_brand.so.1") for
58 58 * emulation.
59 59 *
60 60 * The brand mechanism allows for the provision of an alternative trap
61 61 * handler for the various system call mechanisms. Traditionally this was
62 62 * used to immediately revector execution to the usermode emulation library,
63 63 * which was responsible for handling all system calls. In the interests of
64 64 * more accurate emulation and increased performance, much of the regular
65 65 * illumos system call path is now invoked. Only the argument processing and
66 66 * handler dispatch are replaced by the brand, via the per-LWP
67 67 * "lwp_brand_syscall" interposition function pointer.
68 68 *
69 69 * THE NATIVE AND BRAND STACKS
70 70 *
71 71 * Some runtime environments (e.g. the Go language) allocate very small
72 72 * thread stacks, preferring to grow or split the stack as necessary. The
73 73 * Linux kernel generally does not use the usermode stack when servicing
74 74 * system calls, so this is not a problem. In order for our emulation to
75 75 * have the same zero stack impact, we must execute usermode emulation
76 76 * routines on an _alternate_ stack. This is similar, in principle, to the
77 77 * use of sigaltstack(3C) to run signal handlers off the main thread stack.
78 78 *
79 79 * To this end, the brand library allocates and installs an alternate stack
80 80 * (called the "native" stack) for each LWP. The in-kernel brand code uses
81 81 * this stack for usermode emulation calls and interposed signal delivery,
82 82 * while the emulated Linux process sees only the data on the main thread
83 83 * stack, known as the "brand" stack. The stack mode is tracked in the
84 84 * per-LWP brand-private data, using the LX_STACK_MODE_* enum.
85 85 *
86 86 * The stack mode doubles as a system call "mode bit". When in the
87 87 * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux
88 88 * system calls. In other modes, system calls are assumed to be native
89 89 * illumos system calls as made during brand library initialisation and
90 90 * usermode emulation.
91 91 *
92 92 * USERMODE EMULATION
93 93 *
94 94 * When a Linux system call cannot be emulated within the kernel, we preserve
95 95 * the register state of the Linux process and revector the LWP to the brand
96 96 * library usermode emulation handler: the "lx_emulate()" function in
97 97 * "lx_brand.so.1". This revectoring is modelled on the delivery of signals,
98 98 * and is performed in "lx_emulate_user()".
99 99 *
100 100 * First, the emulated process state is written out to the usermode stack of
101 101 * the process as a "ucontext_t" object. Arguments to the emulation routine
102 102 * are passed on the stack or in registers, depending on the ABI. When the
103 103 * usermode emulation is complete, the result is passed back to the kernel
104 104 * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context
105 105 * for restoration.
106 106 *
107 107 * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT
108 108 *
109 109 * When servicing emulated system calls in the usermode brand library, or
110 110 * during signal delivery, various state is preserved by the kernel so that
111 111 * the running LWP may be revectored to a handling routine. The context
112 112 * allows the kernel to restart the program at the point of interruption,
113 113 * either at the return of the signal handler, via setcontext(3C); or after
114 114 * the usermode emulation request has been serviced, via B_EMULATION_DONE.
115 115 *
116 116 * In illumos native processes, the saved context (a "ucontext_t" object)
117 117 * includes the state of registers and the current signal mask at the point
118 118 * of interruption. The context also includes a link to the most recently
119 119 * saved context, forming a chain to be unwound as requests complete. The LX
120 120 * brand requires additional book-keeping to describe the machine state: in
121 121 * particular, the current stack mode and the occupied extent of the native
122 122 * stack.
123 123 *
124 124 * The brand code is able to interpose on the context save and restore
125 125 * operations in the kernel -- see "lx_savecontext()" and
126 126 * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to
127 127 * function correctly in the face of a dual stack LWP. The brand also
128 128 * interposes on the signal delivery mechanism -- see "lx_sendsig()" and
129 129 * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand
130 130 * library interposer on the native stack, regardless of the interrupted
131 131 * execution mode. Linux sigaltstack(2) emulation is performed entirely by
132 132 * the usermode brand library during signal handler interposition.
133 133 */
134 134
135 135 #include <sys/types.h>
136 136 #include <sys/kmem.h>
137 137 #include <sys/errno.h>
138 138 #include <sys/thread.h>
139 139 #include <sys/systm.h>
140 140 #include <sys/syscall.h>
141 141 #include <sys/proc.h>
142 142 #include <sys/modctl.h>
143 143 #include <sys/cmn_err.h>
144 144 #include <sys/model.h>
145 145 #include <sys/exec.h>
146 146 #include <sys/lx_impl.h>
147 147 #include <sys/machbrand.h>
148 148 #include <sys/lx_syscalls.h>
149 149 #include <sys/lx_misc.h>
150 150 #include <sys/lx_futex.h>
151 151 #include <sys/lx_brand.h>
152 152 #include <sys/param.h>
153 153 #include <sys/termios.h>
154 154 #include <sys/sunddi.h>
155 155 #include <sys/ddi.h>
156 156 #include <sys/vnode.h>
157 157 #include <sys/pathname.h>
158 158 #include <sys/auxv.h>
159 159 #include <sys/priv.h>
160 160 #include <sys/regset.h>
161 161 #include <sys/privregs.h>
162 162 #include <sys/archsystm.h>
163 163 #include <sys/zone.h>
164 164 #include <sys/brand.h>
165 165 #include <sys/sdt.h>
166 166 #include <sys/x86_archext.h>
167 167 #include <sys/controlregs.h>
168 168 #include <sys/core.h>
169 169 #include <sys/stack.h>
170 170 #include <sys/stat.h>
171 171 #include <sys/socket.h>
172 172 #include <lx_signum.h>
173 173 #include <util/sscanf.h>
174 174
175 175 int lx_debug = 0;
176 176
177 177 void lx_init_brand_data(zone_t *);
178 178 void lx_free_brand_data(zone_t *);
179 179 void lx_setbrand(proc_t *);
180 180 int lx_getattr(zone_t *, int, void *, size_t *);
181 181 int lx_setattr(zone_t *, int, void *, size_t);
182 182 int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t,
183 183 uintptr_t, uintptr_t);
184 184 void lx_set_kern_version(zone_t *, char *);
185 185 void lx_copy_procdata(proc_t *, proc_t *);
186 186
187 187 extern int getsetcontext(int, void *);
188 188 extern int waitsys(idtype_t, id_t, siginfo_t *, int);
189 189 #if defined(_SYSCALL32_IMPL)
190 190 extern int getsetcontext32(int, void *);
191 191 extern int waitsys32(idtype_t, id_t, siginfo_t *, int);
192 192 #endif
193 193
194 194 extern void lx_proc_exit(proc_t *);
195 195 extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *);
196 196
197 197 extern void lx_ioctl_init();
198 198 extern void lx_ioctl_fini();
199 199 extern void lx_socket_init();
200 200 extern void lx_socket_fini();
201 201
/*
 * Entry/return hooks for Linux system call tracing; presumably installed by
 * the lx_systrace DTrace provider module -- confirm against that module.
 */
lx_systrace_f *lx_systrace_entry_ptr;
lx_systrace_f *lx_systrace_return_ptr;

/* Non-zero while tracing is enabled (toggled by the enable/disable hooks). */
static int lx_systrace_enabled;

/*
 * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly
 * want an MMU dependency here (and should there be a microprocessor without
 * a hole, we don't want to start allocating from the top of the VA range).
 */
#define	LX_MAXSTACK64	0x7ffffff00000

/* Upper bound for 64-bit Linux process stacks (tunable). */
uint64_t lx_maxstack64 = LX_MAXSTACK64;
215 215
216 216 static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
217 217 struct intpdata *idata, int level, long *execsz, int setid,
218 218 caddr_t exec_file, struct cred *cred, int *brand_action);
219 219
220 220 static boolean_t lx_native_exec(uint8_t, const char **);
221 221 static uint32_t lx_map32limit(proc_t *);
222 222
223 223 static void lx_savecontext(ucontext_t *);
224 224 static void lx_restorecontext(ucontext_t *);
225 225 static caddr_t lx_sendsig_stack(int);
226 226 static void lx_sendsig(int);
227 227 #if defined(_SYSCALL32_IMPL)
228 228 static void lx_savecontext32(ucontext32_t *);
229 229 #endif
230 230 static int lx_setid_clear(vattr_t *, cred_t *);
231 231 #if defined(_LP64)
232 232 static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
233 233 enum seg_rw);
234 234 #endif
235 235
236 236
/*
 * lx brand operations vector.  Each member implements (or opts out of, via
 * NULL) one of the brand hooks declared in struct brand_ops.
 */
struct brand_ops lx_brops = {
	lx_init_brand_data,		/* b_init_brand_data */
	lx_free_brand_data,		/* b_free_brand_data */
	lx_brandsys,			/* b_brandsys */
	lx_setbrand,			/* b_setbrand */
	lx_getattr,			/* b_getattr */
	lx_setattr,			/* b_setattr */
	lx_copy_procdata,		/* b_copy_procdata */
	lx_proc_exit,			/* b_proc_exit */
	lx_exec,			/* b_exec */
	lx_setrval,			/* b_lwp_setrval */
	lx_lwpdata_alloc,		/* b_lwpdata_alloc */
	lx_lwpdata_free,		/* b_lwpdata_free */
	lx_initlwp,			/* b_initlwp */
	lx_forklwp,			/* b_forklwp */
	lx_freelwp,			/* b_freelwp */
	lx_exitlwp,			/* b_lwpexit */
	lx_elfexec,			/* b_elfexec */
	NULL,				/* b_sigset_native_to_brand */
	NULL,				/* b_sigset_brand_to_native */
	lx_sigfd_translate,		/* b_sigfd_translate */
	NSIG,				/* b_nsig */
	lx_exit_with_sig,		/* b_exit_with_sig */
	lx_wait_filter,			/* b_wait_filter */
	lx_native_exec,			/* b_native_exec */
	lx_map32limit,			/* b_map32limit */
	lx_stop_notify,			/* b_stop_notify */
	lx_waitid_helper,		/* b_waitid_helper */
	lx_sigcld_repost,		/* b_sigcld_repost */
	lx_ptrace_issig_stop,		/* b_issig_stop */
	lx_ptrace_sig_ignorable,	/* b_sig_ignorable */
	lx_savecontext,			/* b_savecontext */
#if defined(_SYSCALL32_IMPL)
	lx_savecontext32,		/* b_savecontext32 */
#endif
	lx_restorecontext,		/* b_restorecontext */
	lx_sendsig_stack,		/* b_sendsig_stack */
	lx_sendsig,			/* b_sendsig */
	lx_setid_clear,			/* b_setid_clear */
#if defined(_LP64)
	lx_pagefault			/* b_pagefault */
#else
	NULL				/* b_pagefault (32-bit: unused) */
#endif
};
283 283
/*
 * Machine-specific brand operations.  Only the final two hooks are
 * implemented here (segment-register fixup and fsbase handling); the
 * meaning of each positional slot is defined by struct brand_mach_ops.
 */
struct brand_mach_ops lx_mops = {
	NULL,
	NULL,
	NULL,
	NULL,
	NULL,
	lx_fixsegreg,
	lx_fsbase
};
293 293
/*
 * The lx brand descriptor registered with the brand framework: version,
 * name, the two ops vectors above, and the size of the per-process
 * brand-private data.
 */
struct brand lx_brand = {
	BRAND_VER_1,
	"lx",
	&lx_brops,
	&lx_mops,
	sizeof (struct lx_proc_data)
};
301 301
/* Module linkage: register this file as a loadable brand module. */
static struct modlbrand modlbrand = {
	&mod_brandops, "lx brand", &lx_brand
};

static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlbrand, NULL
};
309 309
310 310 void
311 311 lx_proc_exit(proc_t *p)
312 312 {
313 313 lx_proc_data_t *lxpd;
314 314 proc_t *cp;
315 315
316 316 mutex_enter(&p->p_lock);
317 317 VERIFY(lxpd = ptolxproc(p));
318 318 if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
319 319 mutex_exit(&p->p_lock);
320 320 return;
321 321 }
322 322 mutex_exit(&p->p_lock);
323 323
324 324 /* Check for children which desire notification of parental death. */
325 325 mutex_enter(&pidlock);
326 326 for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
327 327 mutex_enter(&cp->p_lock);
328 328 if ((lxpd = ptolxproc(cp)) == NULL) {
329 329 mutex_exit(&cp->p_lock);
330 330 continue;
331 331 }
332 332 if (lxpd->l_parent_deathsig != 0) {
333 333 sigtoproc(p, NULL, lxpd->l_parent_deathsig);
334 334 }
335 335 mutex_exit(&cp->p_lock);
336 336 }
337 337 mutex_exit(&pidlock);
338 338 }
339 339
340 340 void
341 341 lx_setbrand(proc_t *p)
342 342 {
343 343 /* Send SIGCHLD to parent by default when child exits */
344 344 ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
345 345 }
346 346
347 347 /* ARGSUSED */
348 348 int
349 349 lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize)
350 350 {
351 351 char vers[LX_VERS_MAX];
352 352
353 353 if (attr == LX_KERN_VERSION_NUM) {
354 354 if (bufsize > (LX_VERS_MAX - 1))
355 355 return (ERANGE);
356 356 bzero(vers, LX_VERS_MAX);
357 357 if (copyin(buf, &vers, bufsize) != 0)
358 358 return (EFAULT);
359 359 lx_set_kern_version(zone, vers);
360 360 return (0);
361 361 }
362 362 return (EINVAL);
363 363 }
364 364
365 365 /* ARGSUSED */
366 366 int
367 367 lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize)
368 368 {
369 369 if (attr == LX_KERN_VERSION_NUM) {
370 370 if (*bufsize < LX_VERS_MAX)
371 371 return (ERANGE);
372 372 if (copyout(lx_get_zone_kern_version(curzone), buf,
373 373 LX_VERS_MAX) != 0)
374 374 return (EFAULT);
375 375 *bufsize = LX_VERS_MAX;
376 376 return (0);
377 377 }
378 378 return (-EINVAL);
379 379 }
380 380
381 381 uint32_t
382 382 lx_map32limit(proc_t *p)
383 383 {
384 384 /*
385 385 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
386 386 * allow mappings in the first 31 bits. This was a nuance in the
387 387 * original Linux implementation circa 2002, and applications have
388 388 * come to depend on its behavior.
389 389 *
390 390 * This is only relevant for 64-bit processes.
391 391 */
392 392 if (p->p_model == DATAMODEL_LP64)
393 393 return (1 << 31);
394 394
395 395 return ((uint32_t)USERLIMIT32);
396 396 }
397 397
398 398 void
399 399 lx_brand_systrace_enable(void)
400 400 {
401 401 VERIFY(!lx_systrace_enabled);
402 402
403 403 lx_systrace_enabled = 1;
404 404 }
405 405
406 406 void
407 407 lx_brand_systrace_disable(void)
408 408 {
409 409 VERIFY(lx_systrace_enabled);
410 410
411 411 lx_systrace_enabled = 0;
412 412 }
413 413
/*
 * Record the current in-use extent of the native stack for this LWP,
 * firing a DTrace probe with the before/after values.  A native stack
 * must already have been installed (br_ntv_stack non-zero).
 */
void
lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
{
	VERIFY(lwpd->br_ntv_stack != 0);

	/*
	 * The "brand-lx-set-ntv-stack-current" probe has arguments:
	 *	arg0: stack pointer before change
	 *	arg1: stack pointer after change
	 *	arg2: current stack base
	 */
	DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
	    uintptr_t, lwpd->br_ntv_stack_current,
	    uintptr_t, new_sp,
	    uintptr_t, lwpd->br_ntv_stack);

	lwpd->br_ntv_stack_current = new_sp;
}
432 432
433 433 #if defined(_LP64)
434 434 static int
435 435 lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
436 436 enum seg_rw rw)
437 437 {
438 438 int syscall_num;
439 439
440 440 /*
441 441 * We only want to handle a very specific set of circumstances.
442 442 * Namely: this is a 64-bit LX-branded process attempting to execute an
443 443 * address in a page for which it does not have a valid mapping. If
444 444 * this is not the case, we bail out as fast as possible.
445 445 */
446 446 VERIFY(PROC_IS_BRANDED(p));
447 447 if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
448 448 DATAMODEL_NATIVE) {
449 449 return (-1);
450 450 }
451 451
452 452 if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
453 453 return (-1);
454 454 }
455 455
456 456 /*
457 457 * This is a valid vsyscall address. We service the system call and
458 458 * return 0 to signal that the pagefault has been handled completely.
459 459 */
460 460 lx_vsyscall_enter(p, lwp, syscall_num);
461 461 return (0);
462 462 }
463 463 #endif
464 464
465 465 /*
466 466 * This hook runs prior to sendsig() processing and allows us to nominate
467 467 * an alternative stack pointer for delivery of the signal handling frame.
468 468 * Critically, this routine should _not_ modify any LWP state as the
469 469 * savecontext() does not run until after this hook.
470 470 */
471 471 static caddr_t
472 472 lx_sendsig_stack(int sig)
473 473 {
474 474 klwp_t *lwp = ttolwp(curthread);
475 475 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
476 476
477 477 /*
478 478 * We want to take signal delivery on the native stack, but only if
479 479 * one has been allocated and installed for this LWP.
480 480 */
481 481 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
482 482 /*
483 483 * The program is not running on the native stack. Return
484 484 * the native stack pointer from our brand-private data so
485 485 * that we may switch to it for signal handling.
486 486 */
487 487 return ((caddr_t)lwpd->br_ntv_stack_current);
488 488 } else {
489 489 struct regs *rp = lwptoregs(lwp);
490 490
491 491 /*
492 492 * Either the program is already running on the native stack,
493 493 * or one has not yet been allocated for this LWP. Use the
494 494 * current stack pointer value.
495 495 */
496 496 return ((caddr_t)rp->r_sp);
497 497 }
498 498 }
499 499
500 500 /*
501 501 * This hook runs after sendsig() processing and allows us to update the
502 502 * per-LWP mode flags for system calls and stacks. The pre-signal
503 503 * context has already been saved and delivered to the user at this point.
504 504 */
505 505 static void
506 506 lx_sendsig(int sig)
507 507 {
508 508 klwp_t *lwp = ttolwp(curthread);
509 509 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
510 510 struct regs *rp = lwptoregs(lwp);
511 511
512 512 switch (lwpd->br_stack_mode) {
513 513 case LX_STACK_MODE_BRAND:
514 514 case LX_STACK_MODE_NATIVE:
515 515 /*
516 516 * In lx_sendsig_stack(), we nominated a stack pointer from the
517 517 * native stack. Update the stack mode, and the current in-use
518 518 * extent of the native stack, accordingly:
519 519 */
520 520 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
521 521 lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
522 522
523 523 /*
524 524 * Fix up segment registers, etc.
525 525 */
526 526 lx_switch_to_native(lwp);
527 527 break;
528 528
529 529 default:
530 530 /*
531 531 * Otherwise, the brand library has not yet installed the
532 532 * alternate stack for this LWP. Signals will be handled on
533 533 * the regular stack thread.
534 534 */
535 535 return;
536 536 }
537 537 }
538 538
539 539 /*
540 540 * This hook runs prior to the context restoration, allowing us to take action
541 541 * or modify the context before it is loaded.
542 542 */
543 543 static void
544 544 lx_restorecontext(ucontext_t *ucp)
545 545 {
546 546 klwp_t *lwp = ttolwp(curthread);
547 547 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
548 548 uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
549 549 caddr_t sp = ucp->uc_brand_data[1];
550 550
551 551 /*
552 552 * We have a saved native stack pointer value that we must restore
553 553 * into the per-LWP data.
554 554 */
555 555 if (flags & LX_UC_RESTORE_NATIVE_SP) {
556 556 lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
557 557 }
558 558
559 559 /*
560 560 * We do not wish to restore the value of uc_link in this context,
561 561 * so replace it with the value currently in the LWP.
562 562 */
563 563 if (flags & LX_UC_IGNORE_LINK) {
564 564 ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
565 565 }
566 566
567 567 /*
568 568 * Restore the stack mode:
569 569 */
570 570 if (flags & LX_UC_STACK_NATIVE) {
571 571 lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
572 572 } else if (flags & LX_UC_STACK_BRAND) {
573 573 lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
574 574 }
575 575
576 576 #if defined(__amd64)
577 577 /*
578 578 * Override the fs/gsbase in the context with the value provided
579 579 * through the Linux arch_prctl(2) system call.
580 580 */
581 581 if (flags & LX_UC_STACK_BRAND) {
582 582 if (lwpd->br_lx_fsbase != 0) {
583 583 ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
584 584 }
585 585 if (lwpd->br_lx_gsbase != 0) {
586 586 ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
587 587 }
588 588 }
589 589 #endif
590 590 }
591 591
592 592 static void
593 593 lx_savecontext(ucontext_t *ucp)
594 594 {
595 595 klwp_t *lwp = ttolwp(curthread);
596 596 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
597 597 uintptr_t flags = 0;
598 598
599 599 /*
600 600 * The ucontext_t affords us three private pointer-sized members in
601 601 * "uc_brand_data". We pack a variety of flags into the first element,
602 602 * and an optional stack pointer in the second element. The flags
603 603 * determine which stack pointer (native or brand), if any, is stored
604 604 * in the second element. The third element may contain the system
605 605 * call number; this is analogous to the "orig_[er]ax" member of a
606 606 * Linux "user_regs_struct".
607 607 */
608 608
609 609 if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
610 610 lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
611 611 /*
612 612 * Record the value of the native stack pointer to restore
613 613 * when returning to this branded context:
614 614 */
615 615 flags |= LX_UC_RESTORE_NATIVE_SP;
616 616 ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
617 617 }
618 618
619 619 /*
620 620 * Save the stack mode:
621 621 */
622 622 if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
623 623 flags |= LX_UC_STACK_NATIVE;
624 624 } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
625 625 flags |= LX_UC_STACK_BRAND;
626 626 }
627 627
628 628 /*
629 629 * If we might need to restart this system call, save that information
630 630 * in the context:
631 631 */
632 632 if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
633 633 ucp->uc_brand_data[2] =
634 634 (void *)(uintptr_t)lwpd->br_syscall_num;
635 635 if (lwpd->br_syscall_restart) {
636 636 flags |= LX_UC_RESTART_SYSCALL;
637 637 }
638 638 } else {
639 639 ucp->uc_brand_data[2] = NULL;
640 640 }
641 641
642 642 ucp->uc_brand_data[0] = (void *)flags;
643 643 }
644 644
#if defined(_SYSCALL32_IMPL)
/*
 * 32-bit counterpart of lx_savecontext(): pack brand-private state into
 * the uc_brand_data slots of a 32-bit ucontext.
 */
static void
lx_savecontext32(ucontext32_t *ucp)
{
	klwp_t *lwp = ttolwp(curthread);
	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
	unsigned int flags = 0;

	/*
	 * The ucontext_t affords us three private pointer-sized members in
	 * "uc_brand_data".  Slot 0 carries a set of flags, slot 1 an
	 * optional stack pointer (the flags say which stack, if any), and
	 * slot 2 the system call number -- analogous to the "orig_[er]ax"
	 * member of a Linux "user_regs_struct".
	 */
	switch (lwpd->br_stack_mode) {
	case LX_STACK_MODE_INIT:
	case LX_STACK_MODE_PREINIT:
		/* No native stack pointer to preserve yet. */
		break;
	default:
		/*
		 * Record the value of the native stack pointer to restore
		 * when returning to this branded context:
		 */
		flags |= LX_UC_RESTORE_NATIVE_SP;
		ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
		break;
	}

	/*
	 * Save the stack mode, and for brand-mode contexts also the system
	 * call number (plus restart disposition) in case the interrupted
	 * call must be restarted:
	 */
	if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
		flags |= LX_UC_STACK_NATIVE;
	} else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
		flags |= LX_UC_STACK_BRAND;

		ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
		if (lwpd->br_syscall_restart) {
			flags |= LX_UC_RESTART_SYSCALL;
		}
	}

	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
		ucp->uc_brand_data[2] = NULL;
	}

	ucp->uc_brand_data[0] = flags;
}
#endif
698 698
699 699 void
700 700 lx_init_brand_data(zone_t *zone)
701 701 {
702 702 lx_zone_data_t *data;
703 703 ASSERT(zone->zone_brand == &lx_brand);
704 704 ASSERT(zone->zone_brand_data == NULL);
705 705 data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP);
706 706 /*
707 707 * Set the default lxzd_kernel_version to 2.4.
708 708 * This can be changed by a call to setattr() during zone boot.
709 709 */
710 710 (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX);
711 711
712 712 /*
713 713 * Linux is not at all picky about address family when it comes to
714 714 * supporting interface-related ioctls. To mimic this behavior, we'll
715 715 * attempt those ioctls against a ksocket configured for that purpose.
716 716 */
717 717 (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0,
718 718 0, zone->zone_kcred);
719 719
720 720 zone->zone_brand_data = data;
721 721
722 722 /*
723 723 * In Linux, if the init(1) process terminates the system panics.
724 724 * The zone must reboot to simulate this behaviour.
725 725 */
726 726 zone->zone_reboot_on_init_exit = B_TRUE;
727 727 }
728 728
729 729 void
730 730 lx_free_brand_data(zone_t *zone)
731 731 {
732 732 lx_zone_data_t *data = ztolxzd(zone);
733 733 ASSERT(data != NULL);
734 734 if (data->lxzd_ioctl_sock != NULL) {
735 735 /*
736 736 * Since zone_kcred has been cleaned up already, close the
737 737 * socket using the global kcred.
738 738 */
739 739 ksocket_close(data->lxzd_ioctl_sock, kcred);
740 740 data->lxzd_ioctl_sock = NULL;
741 741 }
742 742 zone->zone_brand_data = NULL;
743 743 kmem_free(data, sizeof (*data));
744 744 }
745 745
746 746 void
747 747 lx_unsupported(char *dmsg)
748 748 {
749 749 lx_proc_data_t *pd = ttolxproc(curthread);
750 750
751 751 DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg);
752 752
753 753 if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) {
754 754 /*
755 755 * If this process was run with strict mode enabled
756 756 * (via LX_STRICT in the environment), we mark this
757 757 * LWP as having triggered an unsupported behaviour.
758 758 * This flag will be checked at an appropriate point
759 759 * by lx_check_strict_failure().
760 760 */
761 761 lx_lwp_data_t *lwpd = ttolxlwp(curthread);
762 762
763 763 lwpd->br_strict_failure = B_TRUE;
764 764 }
765 765 }
766 766
767 767 void
768 768 lx_check_strict_failure(lx_lwp_data_t *lwpd)
769 769 {
770 770 proc_t *p;
771 771
772 772 if (!lwpd->br_strict_failure) {
773 773 return;
774 774 }
775 775
776 776 lwpd->br_strict_failure = B_FALSE;
777 777
778 778 /*
779 779 * If this process is operating in strict mode (via LX_STRICT in
780 780 * the environment), and has triggered a call to
781 781 * lx_unsupported(), we drop SIGSYS on it as we return.
782 782 */
783 783 p = curproc;
784 784 mutex_enter(&p->p_lock);
785 785 sigtoproc(p, curthread, SIGSYS);
786 786 mutex_exit(&p->p_lock);
787 787 }
788 788
789 789 void
790 790 lx_trace_sysenter(int syscall_num, uintptr_t *args)
791 791 {
792 792 if (lx_systrace_enabled) {
793 793 VERIFY(lx_systrace_entry_ptr != NULL);
794 794
795 795 (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1],
796 796 args[2], args[3], args[4], args[5]);
797 797 }
798 798 }
799 799
800 800 void
801 801 lx_trace_sysreturn(int syscall_num, long ret)
802 802 {
803 803 if (lx_systrace_enabled) {
804 804 VERIFY(lx_systrace_return_ptr != NULL);
805 805
806 806 (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0);
807 807 }
808 808 }
809 809
810 810 /*
811 811 * Get the addresses of the user-space system call handler and attach it to
812 812 * the proc structure. Returning 0 indicates success; the value returned
813 813 * by the system call is the value stored in rval. Returning a non-zero
814 814 * value indicates a failure; the value returned is used to set errno, -1
815 815 * is returned from the syscall and the contents of rval are ignored. To
816 816 * set errno and have the syscall return a value other than -1 we can
817 817 * manually set errno and rval and return 0.
818 818 */
819 819 int
820 820 lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2,
821 821 uintptr_t arg3, uintptr_t arg4, uintptr_t arg5)
822 822 {
823 823 kthread_t *t = curthread;
824 824 klwp_t *lwp = ttolwp(t);
825 825 proc_t *p = ttoproc(t);
826 826 lx_proc_data_t *pd;
827 827 struct termios *termios;
828 828 uint_t termios_len;
829 829 int error;
830 830 int code;
831 831 int sig;
832 832 lx_brand_registration_t reg;
833 833 lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
834 834
835 835 /*
836 836 * There is one operation that is suppored for non-branded
837 837 * process. B_EXEC_BRAND. This is the equilivant of an
838 838 * exec call, but the new process that is created will be
839 839 * a branded process.
840 840 */
841 841 if (cmd == B_EXEC_BRAND) {
842 842 VERIFY(p->p_zone != NULL);
843 843 VERIFY(p->p_zone->zone_brand == &lx_brand);
844 844 return (exec_common(
845 845 (char *)arg1, (const char **)arg2, (const char **)arg3,
846 846 EBA_BRAND));
847 847 }
848 848
849 849 /* For all other operations this must be a branded process. */
850 850 if (p->p_brand == NULL)
851 851 return (ENOSYS);
852 852
853 853 VERIFY(p->p_brand == &lx_brand);
854 854 VERIFY(p->p_brand_data != NULL);
855 855
856 856 switch (cmd) {
857 857 case B_REGISTER:
858 858 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
859 859 lx_print("stack mode was not PREINIT during "
860 860 "REGISTER\n");
861 861 return (EINVAL);
862 862 }
863 863
864 864 if (p->p_model == DATAMODEL_NATIVE) {
865 865 if (copyin((void *)arg1, ®, sizeof (reg)) != 0) {
866 866 lx_print("Failed to copyin brand registration "
867 867 "at 0x%p\n", (void *)arg1);
868 868 return (EFAULT);
869 869 }
870 870 }
871 871 #ifdef _LP64
872 872 else {
873 873 /* 32-bit userland on 64-bit kernel */
874 874 lx_brand_registration32_t reg32;
875 875
876 876 if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) {
877 877 lx_print("Failed to copyin brand registration "
878 878 "at 0x%p\n", (void *)arg1);
879 879 return (EFAULT);
880 880 }
881 881
882 882 reg.lxbr_version = (uint_t)reg32.lxbr_version;
883 883 reg.lxbr_handler =
884 884 (void *)(uintptr_t)reg32.lxbr_handler;
885 885 reg.lxbr_flags = reg32.lxbr_flags;
886 886 }
887 887 #endif
888 888
889 889 if (reg.lxbr_version != LX_VERSION_1) {
890 890 lx_print("Invalid brand library version (%u)\n",
891 891 reg.lxbr_version);
892 892 return (EINVAL);
893 893 }
894 894
895 895 if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) {
896 896 lx_print("Invalid brand flags (%u)\n",
897 897 reg.lxbr_flags);
898 898 return (EINVAL);
899 899 }
900 900
901 901 lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n",
902 902 (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p);
903 903 pd = p->p_brand_data;
904 904 pd->l_handler = (uintptr_t)reg.lxbr_handler;
905 905 pd->l_flags = reg.lxbr_flags & LX_PROC_ALL;
906 906
907 907 return (0);
908 908
909 909 case B_TTYMODES:
910 910 /* This is necessary for emulating TCGETS ioctls. */
911 911 if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(),
912 912 DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios,
913 913 &termios_len) != DDI_SUCCESS)
914 914 return (EIO);
915 915
916 916 ASSERT(termios_len == sizeof (*termios));
917 917
918 918 if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) {
919 919 ddi_prop_free(termios);
920 920 return (EFAULT);
921 921 }
922 922
923 923 ddi_prop_free(termios);
924 924 return (0);
925 925
926 926 case B_ELFDATA:
927 927 pd = curproc->p_brand_data;
928 928 if (get_udatamodel() == DATAMODEL_NATIVE) {
929 929 if (copyout(&pd->l_elf_data, (void *)arg1,
930 930 sizeof (lx_elf_data_t)) != 0) {
931 931 return (EFAULT);
932 932 }
933 933 }
934 934 #if defined(_LP64)
935 935 else {
936 936 /* 32-bit userland on 64-bit kernel */
937 937 lx_elf_data32_t led32;
938 938
939 939 led32.ed_phdr = (int)pd->l_elf_data.ed_phdr;
940 940 led32.ed_phent = (int)pd->l_elf_data.ed_phent;
941 941 led32.ed_phnum = (int)pd->l_elf_data.ed_phnum;
942 942 led32.ed_entry = (int)pd->l_elf_data.ed_entry;
943 943 led32.ed_base = (int)pd->l_elf_data.ed_base;
944 944 led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry;
945 945
946 946 if (copyout(&led32, (void *)arg1,
947 947 sizeof (led32)) != 0) {
948 948 return (EFAULT);
949 949 }
950 950 }
951 951 #endif
952 952 return (0);
953 953
954 954 case B_EXEC_NATIVE:
955 955 return (exec_common((char *)arg1, (const char **)arg2,
956 956 (const char **)arg3, EBA_NATIVE));
957 957
958 958 /*
959 959 * The B_TRUSS_POINT subcommand is used so that we can make a no-op
960 960 * syscall for debugging purposes (dtracing) from within the user-level
961 961 * emulation.
962 962 */
963 963 case B_TRUSS_POINT:
964 964 return (0);
965 965
966 966 case B_LPID_TO_SPAIR: {
967 967 /*
968 968 * Given a Linux pid as arg1, return the Solaris pid in arg2 and
969 969 * the Solaris LWP in arg3. We also translate pid 1 (which is
970 970 * hardcoded in many applications) to the zone's init process.
971 971 */
972 972 pid_t s_pid;
973 973 id_t s_tid;
974 974
975 975 if ((pid_t)arg1 == 1) {
976 976 s_pid = p->p_zone->zone_proc_initpid;
977 977 /* handle the dead/missing init(1M) case */
978 978 if (s_pid == -1)
979 979 s_pid = 1;
980 980 s_tid = 1;
981 981 } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) {
982 982 return (ESRCH);
↓ open down ↓ |
982 lines elided |
↑ open up ↑ |
983 983 }
984 984
985 985 if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 ||
986 986 copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) {
987 987 return (EFAULT);
988 988 }
989 989
990 990 return (0);
991 991 }
992 992
993 + case B_SIGEV_THREAD_ID: {
994 + /*
995 + * Emulate Linux's timer_create(2) SIGEV_THREAD_ID
996 + * notification method. This mechanism is only meant
997 + * for userland threading libraries such as glibc and
998 + * is documented as such. Therefore, assume this is
999 + * only ever invoked for the purpose of alerting a
1000 + * Linux threading library. Assume that the tid is a
1001 + * member of the caller's process and the signal
1002 + * number is valid. See lx_sigev_thread_id() for the
1003 + * userland side of this emulation.
1004 + *
1005 + * arg1 -- Linux tid
1006 + * arg2 -- signal number
1007 + * arg3 -- union sigval
1008 + */
1009 +
1010 + proc_t *pp, *cp = curproc;
1011 + int native_sig = ltos_signo[(int)arg2];
1012 + pid_t native_pid;
1013 + int native_tid;
1014 + sigqueue_t *sqp;
1015 +
1016 + lx_lpid_to_spair((pid_t)arg1, &native_pid, &native_tid);
1017 +
1018 + mutex_enter(&pidlock);
1019 + if (((pp = prfind(native_pid)) == NULL) || (pp->p_stat == SIDL)) {
1020 + mutex_exit(&pidlock);
1021 + return (ESRCH);
1022 + }
1023 + mutex_enter(&pp->p_lock);
1024 + mutex_exit(&pidlock);
1025 +
1026 + if ((t = idtot(pp, native_tid)) == NULL) {
1027 + mutex_exit(&pp->p_lock);
1028 + return (ESRCH);
1029 + }
1030 +
1031 + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
1032 + sqp->sq_info.si_signo = native_sig;
1033 + sqp->sq_info.si_code = SI_TIMER;
1034 + sqp->sq_info.si_pid = cp->p_pid;
1035 + sqp->sq_info.si_zoneid = getzoneid();
1036 + sqp->sq_info.si_uid = crgetruid(CRED());
1037 + sqp->sq_info.si_value = (union sigval)((void *)arg3);
1038 + sigaddqa(pp, t, sqp);
1039 +
1040 + mutex_exit(&pp->p_lock);
1041 + return (0);
1042 + }
1043 +
993 1044 case B_SET_AFFINITY_MASK:
994 1045 case B_GET_AFFINITY_MASK:
995 1046 /*
996 1047 * Retrieve or store the CPU affinity mask for the
997 1048 * requested linux pid.
998 1049 *
999 1050 * arg1 is a linux PID (0 means curthread).
1000 1051 * arg2 is the size of the given mask.
1001 1052 * arg3 is the address of the affinity mask.
1002 1053 */
1003 1054 return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval));
1004 1055
1005 1056 case B_PTRACE_STOP_FOR_OPT:
1006 1057 return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ?
1007 1058 B_FALSE : B_TRUE, (ulong_t)arg3, arg4));
1008 1059
1009 1060 case B_PTRACE_CLONE_BEGIN:
1010 1061 return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ?
1011 1062 B_FALSE : B_TRUE));
1012 1063
1013 1064 case B_PTRACE_KERNEL:
1014 1065 return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4));
1015 1066
1016 1067 case B_HELPER_WAITID: {
1017 1068 idtype_t idtype = (idtype_t)arg1;
1018 1069 id_t id = (id_t)arg2;
1019 1070 siginfo_t *infop = (siginfo_t *)arg3;
1020 1071 int options = (int)arg4;
1021 1072
1022 1073 lwpd = ttolxlwp(curthread);
1023 1074
1024 1075 /*
1025 1076 * Our brand-specific waitid helper only understands a subset of
1026 1077 * the possible idtypes. Ensure we keep to that subset here:
1027 1078 */
1028 1079 if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) {
1029 1080 return (EINVAL);
1030 1081 }
1031 1082
1032 1083 /*
1033 1084 * Enable the return of emulated ptrace(2) stop conditions
1034 1085 * through lx_waitid_helper, and stash the Linux-specific
1035 1086 * extra waitid() flags.
1036 1087 */
1037 1088 lwpd->br_waitid_emulate = B_TRUE;
1038 1089 lwpd->br_waitid_flags = (int)arg5;
1039 1090
1040 1091 #if defined(_SYSCALL32_IMPL)
1041 1092 if (get_udatamodel() != DATAMODEL_NATIVE) {
1042 1093 return (waitsys32(idtype, id, infop, options));
1043 1094 } else
1044 1095 #endif
1045 1096 {
1046 1097 return (waitsys(idtype, id, infop, options));
1047 1098 }
1048 1099
1049 1100 lwpd->br_waitid_emulate = B_FALSE;
1050 1101 lwpd->br_waitid_flags = 0;
1051 1102
1052 1103 return (0);
1053 1104 }
1054 1105
1055 1106 case B_UNSUPPORTED: {
1056 1107 char dmsg[256];
1057 1108
1058 1109 if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) {
1059 1110 lx_print("Failed to copyin unsupported msg "
1060 1111 "at 0x%p\n", (void *)arg1);
1061 1112 return (EFAULT);
1062 1113 }
1063 1114 dmsg[255] = '\0';
1064 1115 lx_unsupported(dmsg);
1065 1116
1066 1117 lx_check_strict_failure(lwpd);
1067 1118
1068 1119 return (0);
1069 1120 }
1070 1121
1071 1122 case B_STORE_ARGS: {
1072 1123 /*
1073 1124 * B_STORE_ARGS subcommand
1074 1125 * arg1 = address of struct to be copied in
1075 1126 * arg2 = size of the struct being copied in
1076 1127 * arg3-arg6 ignored
1077 1128 * rval = the amount of data copied.
1078 1129 */
1079 1130 void *buf;
1080 1131
1081 1132 /* only have upper limit because arg2 is unsigned */
1082 1133 if (arg2 > LX_BR_ARGS_SIZE_MAX) {
1083 1134 return (EINVAL);
1084 1135 }
1085 1136
1086 1137 buf = kmem_alloc(arg2, KM_SLEEP);
1087 1138 if (copyin((void *)arg1, buf, arg2) != 0) {
1088 1139 lx_print("Failed to copyin scall arg at 0x%p\n",
1089 1140 (void *) arg1);
1090 1141 kmem_free(buf, arg2);
1091 1142 /*
1092 1143 * Purposely not setting br_scall_args to NULL
1093 1144 * to preserve data for debugging.
1094 1145 */
1095 1146 return (EFAULT);
1096 1147 }
1097 1148
1098 1149 if (lwpd->br_scall_args != NULL) {
1099 1150 ASSERT(lwpd->br_args_size > 0);
1100 1151 kmem_free(lwpd->br_scall_args,
1101 1152 lwpd->br_args_size);
1102 1153 }
1103 1154
1104 1155 lwpd->br_scall_args = buf;
1105 1156 lwpd->br_args_size = arg2;
1106 1157 *rval = arg2;
1107 1158 return (0);
1108 1159 }
1109 1160
1110 1161 case B_HELPER_CLONE:
1111 1162 return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3,
1112 1163 (void *)arg4));
1113 1164
1114 1165 case B_HELPER_SETGROUPS:
1115 1166 return (lx_helper_setgroups(arg1, (gid_t *)arg2));
1116 1167
1117 1168 case B_HELPER_SIGQUEUE:
1118 1169 return (lx_helper_rt_sigqueueinfo(arg1, arg2,
1119 1170 (siginfo_t *)arg3));
1120 1171
1121 1172 case B_HELPER_TGSIGQUEUE:
1122 1173 return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3,
1123 1174 (siginfo_t *)arg4));
1124 1175
1125 1176 case B_SET_THUNK_PID:
1126 1177 lwpd->br_lx_thunk_pid = arg1;
1127 1178 return (0);
1128 1179
1129 1180 case B_GETPID:
1130 1181 /*
1131 1182 * The usermode clone(2) code needs to be able to call
1132 1183 * lx_getpid() from native code:
1133 1184 */
1134 1185 *rval = lx_getpid();
1135 1186 return (0);
1136 1187
1137 1188 case B_SET_NATIVE_STACK:
1138 1189 /*
1139 1190 * B_SET_NATIVE_STACK subcommand
1140 1191 * arg1 = the base of the stack to use for emulation
1141 1192 */
1142 1193 if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
1143 1194 lx_print("B_SET_NATIVE_STACK when stack was already "
1144 1195 "set to %p\n", (void *)arg1);
1145 1196 return (EEXIST);
1146 1197 }
1147 1198
1148 1199 /*
1149 1200 * We move from the PREINIT state, where we have no brand
1150 1201 * emulation stack, to the INIT state. Here, we are still
1151 1202 * running on what will become the BRAND stack, but are running
1152 1203 * emulation (i.e. native) code. Once the initialisation
1153 1204 * process for this thread has finished, we will jump to
1154 1205 * brand-specific code, while moving to the BRAND mode.
1155 1206 *
1156 1207 * When a new LWP is created, lx_initlwp() will clear the
1157 1208 * stack data. If that LWP is actually being duplicated
1158 1209 * into a child process by fork(2), lx_forklwp() will copy
1159 1210 * it so that the cloned thread will keep using the same
1160 1211 * alternate stack.
1161 1212 */
1162 1213 lwpd->br_ntv_stack = arg1;
1163 1214 lwpd->br_stack_mode = LX_STACK_MODE_INIT;
1164 1215 lx_lwp_set_native_stack_current(lwpd, arg1);
1165 1216
1166 1217 return (0);
1167 1218
1168 1219 case B_GET_CURRENT_CONTEXT:
1169 1220 /*
1170 1221 * B_GET_CURRENT_CONTEXT subcommand:
1171 1222 * arg1 = address for pointer to current ucontext_t
1172 1223 */
1173 1224
1174 1225 #if defined(_SYSCALL32_IMPL)
1175 1226 if (get_udatamodel() != DATAMODEL_NATIVE) {
1176 1227 caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext;
1177 1228
1178 1229 error = copyout(&addr, (void *)arg1, sizeof (addr));
1179 1230 } else
1180 1231 #endif
1181 1232 {
1182 1233 error = copyout(&lwp->lwp_oldcontext, (void *)arg1,
1183 1234 sizeof (lwp->lwp_oldcontext));
1184 1235 }
1185 1236
1186 1237 return (error != 0 ? EFAULT : 0);
1187 1238
1188 1239 case B_JUMP_TO_LINUX:
1189 1240 /*
1190 1241 * B_JUMP_TO_LINUX subcommand:
1191 1242 * arg1 = ucontext_t pointer for jump state
1192 1243 */
1193 1244
1194 1245 if (arg1 == NULL)
1195 1246 return (EINVAL);
1196 1247
1197 1248 switch (lwpd->br_stack_mode) {
1198 1249 case LX_STACK_MODE_NATIVE: {
1199 1250 struct regs *rp = lwptoregs(lwp);
1200 1251
1201 1252 /*
1202 1253 * We are on the NATIVE stack, so we must preserve
1203 1254 * the extent of that stack. The pointer will be
1204 1255 * reset by a future setcontext().
1205 1256 */
1206 1257 lx_lwp_set_native_stack_current(lwpd,
1207 1258 (uintptr_t)rp->r_sp);
1208 1259 break;
1209 1260 }
1210 1261
1211 1262 case LX_STACK_MODE_INIT:
1212 1263 /*
1213 1264 * The LWP is transitioning to Linux code for the first
1214 1265 * time.
1215 1266 */
1216 1267 break;
1217 1268
1218 1269 case LX_STACK_MODE_PREINIT:
1219 1270 /*
1220 1271 * This LWP has not installed an alternate stack for
1221 1272 * usermode emulation handling.
1222 1273 */
1223 1274 return (ENOENT);
1224 1275
1225 1276 case LX_STACK_MODE_BRAND:
1226 1277 /*
1227 1278 * The LWP should not be on the BRAND stack.
1228 1279 */
1229 1280 exit(CLD_KILLED, SIGSYS);
1230 1281 return (0);
1231 1282 }
1232 1283
1233 1284 /*
1234 1285 * Transfer control to Linux:
1235 1286 */
1236 1287 return (lx_runexe(lwp, (void *)arg1));
1237 1288
1238 1289 case B_EMULATION_DONE:
1239 1290 /*
1240 1291 * B_EMULATION_DONE subcommand:
1241 1292 * arg1 = ucontext_t * to restore
1242 1293 * arg2 = system call number
1243 1294 * arg3 = return code
1244 1295 * arg4 = if operation failed, the errno value
1245 1296 */
1246 1297
1247 1298 /*
1248 1299 * The first part of this operation is a setcontext() to
1249 1300 * restore the register state to the copy we preserved
1250 1301 * before vectoring to the usermode emulation routine.
1251 1302 * If that fails, we return (hopefully) to the emulation
1252 1303 * routine and it will handle the error.
1253 1304 */
1254 1305 #if (_SYSCALL32_IMPL)
1255 1306 if (get_udatamodel() != DATAMODEL_NATIVE) {
1256 1307 error = getsetcontext32(SETCONTEXT, (void *)arg1);
1257 1308 } else
1258 1309 #endif
1259 1310 {
1260 1311 error = getsetcontext(SETCONTEXT, (void *)arg1);
1261 1312 }
1262 1313
1263 1314 if (error != 0) {
1264 1315 return (error);
1265 1316 }
1266 1317
1267 1318 /*
1268 1319 * The saved Linux context has been restored. We handle the
1269 1320 * return value or errno with code common to the in-kernel
1270 1321 * system call emulation.
1271 1322 */
1272 1323 if ((error = (int)arg4) != 0) {
1273 1324 /*
1274 1325 * lx_syscall_return() looks at the errno in the LWP,
1275 1326 * so set it here:
1276 1327 */
1277 1328 set_errno(error);
1278 1329 }
1279 1330 lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3);
1280 1331
1281 1332 return (0);
1282 1333
1283 1334 case B_EXIT_AS_SIG:
1284 1335 code = CLD_KILLED;
1285 1336 sig = (int)arg1;
1286 1337 proc_is_exiting(p);
1287 1338 if (exitlwps(1) != 0) {
1288 1339 mutex_enter(&p->p_lock);
1289 1340 lwp_exit();
1290 1341 }
1291 1342 ttolwp(curthread)->lwp_cursig = sig;
1292 1343 if (sig == SIGSEGV) {
1293 1344 if (core(sig, 0) == 0)
1294 1345 code = CLD_DUMPED;
1295 1346 }
1296 1347 exit(code, sig);
1297 1348 /* NOTREACHED */
1298 1349 break;
1299 1350 }
1300 1351
1301 1352 return (EINVAL);
1302 1353 }
1303 1354
1304 1355 char *
1305 1356 lx_get_zone_kern_version(zone_t *zone)
1306 1357 {
1307 1358 return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version);
1308 1359 }
1309 1360
1310 1361 void
1311 1362 lx_set_kern_version(zone_t *zone, char *vers)
1312 1363 {
1313 1364 lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
1314 1365
1315 1366 (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX);
1316 1367 }
1317 1368
1318 1369 /*
1319 1370 * Compare linux kernel version to the one set for the zone.
1320 1371 * Returns greater than 0 if zone version is higher, less than 0 if the zone
1321 1372 * version is lower, and 0 if the version are equal.
1322 1373 */
1323 1374 int
1324 1375 lx_kern_version_cmp(zone_t *zone, const char *vers)
1325 1376 {
1326 1377 int zvers[3] = {0, 0, 0};
1327 1378 int cvers[3] = {0, 0, 0};
1328 1379 int i;
1329 1380
1330 1381 VERIFY(zone->zone_brand == &lx_brand);
1331 1382
1332 1383 (void) sscanf(ztolxzd(zone)->lxzd_kernel_version, "%d.%d.%d", &zvers[0],
1333 1384 &zvers[1], &zvers[2]);
1334 1385 (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]);
1335 1386
1336 1387 for (i = 0; i < 3; i++) {
1337 1388 if (zvers[i] > cvers[i]) {
1338 1389 return (1);
1339 1390 } else if (zvers[i] < cvers[i]) {
1340 1391 return (-1);
1341 1392 }
1342 1393 }
1343 1394 return (0);
1344 1395 }
1345 1396
1346 1397 /*
1347 1398 * Linux unconditionally removes the setuid and setgid bits when changing
1348 1399 * file ownership. This brand hook overrides the illumos native behaviour,
1349 1400 * which is based on the PRIV_FILE_SETID privilege.
1350 1401 */
1351 1402 static int
1352 1403 lx_setid_clear(vattr_t *vap, cred_t *cr)
1353 1404 {
1354 1405 if (S_ISDIR(vap->va_mode)) {
1355 1406 return (0);
1356 1407 }
1357 1408
1358 1409 if (vap->va_mode & S_ISUID) {
1359 1410 vap->va_mask |= AT_MODE;
1360 1411 vap->va_mode &= ~S_ISUID;
1361 1412 }
1362 1413 if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1363 1414 vap->va_mask |= AT_MODE;
1364 1415 vap->va_mode &= ~S_ISGID;
1365 1416 }
1366 1417
1367 1418 return (0);
1368 1419 }
1369 1420
1370 1421 /*
1371 1422 * Copy the per-process brand data from a parent proc to a child.
1372 1423 */
1373 1424 void
1374 1425 lx_copy_procdata(proc_t *child, proc_t *parent)
1375 1426 {
1376 1427 lx_proc_data_t *cpd = child->p_brand_data;
1377 1428 lx_proc_data_t *ppd = parent->p_brand_data;
1378 1429
1379 1430 VERIFY(parent->p_brand == &lx_brand);
1380 1431 VERIFY(child->p_brand == &lx_brand);
1381 1432 VERIFY(ppd != NULL);
1382 1433 VERIFY(cpd != NULL);
1383 1434
1384 1435 *cpd = *ppd;
1385 1436
1386 1437 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY;
1387 1438 cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY;
1388 1439
1389 1440 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20;
1390 1441 cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20;
1391 1442
1392 1443 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY;
1393 1444 cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY;
1394 1445
1395 1446 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY;
1396 1447 cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY;
1397 1448 }
1398 1449
1399 1450 #if defined(_LP64)
1400 1451 static void
1401 1452 Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst)
1402 1453 {
1403 1454 bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident));
1404 1455 dst->e_type = src->e_type;
1405 1456 dst->e_machine = src->e_machine;
1406 1457 dst->e_version = src->e_version;
1407 1458 dst->e_entry = src->e_entry;
1408 1459 dst->e_phoff = src->e_phoff;
1409 1460 dst->e_shoff = src->e_shoff;
1410 1461 dst->e_flags = src->e_flags;
1411 1462 dst->e_ehsize = src->e_ehsize;
1412 1463 dst->e_phentsize = src->e_phentsize;
1413 1464 dst->e_phnum = src->e_phnum;
1414 1465 dst->e_shentsize = src->e_shentsize;
1415 1466 dst->e_shnum = src->e_shnum;
1416 1467 dst->e_shstrndx = src->e_shstrndx;
1417 1468 }
1418 1469 #endif /* _LP64 */
1419 1470
1420 1471 static void
1421 1472 restoreexecenv(struct execenv *ep, stack_t *sp)
1422 1473 {
1423 1474 klwp_t *lwp = ttolwp(curthread);
1424 1475
1425 1476 setexecenv(ep);
1426 1477 lwp->lwp_sigaltstack.ss_sp = sp->ss_sp;
1427 1478 lwp->lwp_sigaltstack.ss_size = sp->ss_size;
1428 1479 lwp->lwp_sigaltstack.ss_flags = sp->ss_flags;
1429 1480 }
1430 1481
1431 1482 extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int,
1432 1483 long *, int, caddr_t, cred_t *, int *);
1433 1484
1434 1485 extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int,
1435 1486 long *, int, caddr_t, cred_t *, int *);
1436 1487
1437 1488 /*
1438 1489 * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux
1439 1490 * binaries.
1440 1491 */
1441 1492 static int
1442 1493 lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
1443 1494 struct intpdata *idata, int level, long *execsz, int setid,
1444 1495 caddr_t exec_file, struct cred *cred, int *brand_action)
1445 1496 {
1446 1497 int error;
1447 1498 vnode_t *nvp;
1448 1499 Ehdr ehdr;
1449 1500 Addr uphdr_vaddr;
1450 1501 intptr_t voffset;
1451 1502 char *interp = NULL;
1452 1503 uintptr_t ldaddr = NULL;
1453 1504 int i;
1454 1505 proc_t *p = ttoproc(curthread);
1455 1506 klwp_t *lwp = ttolwp(curthread);
1456 1507 struct execenv env;
1457 1508 struct execenv origenv;
1458 1509 stack_t orig_sigaltstack;
1459 1510 struct user *up = PTOU(ttoproc(curthread));
1460 1511 lx_elf_data_t *edp;
1461 1512 char *lib_path = NULL;
1462 1513
1463 1514 ASSERT(ttoproc(curthread)->p_brand == &lx_brand);
1464 1515 ASSERT(ttoproc(curthread)->p_brand_data != NULL);
1465 1516
1466 1517 edp = &ttolxproc(curthread)->l_elf_data;
1467 1518
1468 1519 if (args->to_model == DATAMODEL_NATIVE) {
1469 1520 lib_path = LX_LIB_PATH;
1470 1521 }
1471 1522 #if defined(_LP64)
1472 1523 else {
1473 1524 lib_path = LX_LIB_PATH32;
1474 1525 }
1475 1526 #endif
1476 1527
1477 1528 /*
1478 1529 * Set the brandname and library name for the new process so that
1479 1530 * elfexec() puts them onto the stack.
1480 1531 */
1481 1532 args->brandname = LX_BRANDNAME;
1482 1533 args->emulator = lib_path;
1483 1534
1484 1535 #if defined(_LP64)
1485 1536 /*
1486 1537 * To conform with the way Linux lays out the address space, we clamp
1487 1538 * the stack to be the top of the lower region of the x86-64 canonical
1488 1539 * form address space -- which has the side-effect of laying out the
1489 1540 * entire address space in that lower region. Note that this only
1490 1541 * matters on 64-bit processes (this value will always be greater than
1491 1542 * the size of a 32-bit address space) and doesn't actually affect
1492 1543 * USERLIMIT: if a Linux-branded processes wishes to map something
1493 1544 * into the top half of the address space, it can do so -- but with
1494 1545 * the user stack starting at the top of the bottom region, those high
1495 1546 * virtual addresses won't be used unless explicitly directed.
1496 1547 */
1497 1548 args->maxstack = lx_maxstack64;
1498 1549 #endif
1499 1550
1500 1551 /*
1501 1552 * We will first exec the brand library, then map in the linux
1502 1553 * executable and the linux linker.
1503 1554 */
1504 1555 if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP,
1505 1556 &nvp))) {
1506 1557 uprintf("%s: not found.", lib_path);
1507 1558 return (error);
1508 1559 }
1509 1560
1510 1561 /*
1511 1562 * We will eventually set the p_exec member to be the vnode for the new
1512 1563 * executable when we call setexecenv(). However, if we get an error
1513 1564 * before that call we need to restore the execenv to its original
1514 1565 * values so that when we return to the caller fop_close() works
1515 1566 * properly while cleaning up from the failed exec(). Restoring the
1516 1567 * original value will also properly decrement the 2nd VN_RELE that we
1517 1568 * took on the brand library.
1518 1569 */
1519 1570 origenv.ex_bssbase = p->p_bssbase;
1520 1571 origenv.ex_brkbase = p->p_brkbase;
1521 1572 origenv.ex_brksize = p->p_brksize;
1522 1573 origenv.ex_vp = p->p_exec;
1523 1574 orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp;
1524 1575 orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size;
1525 1576 orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags;
1526 1577
1527 1578 if (args->to_model == DATAMODEL_NATIVE) {
1528 1579 error = elfexec(nvp, uap, args, idata, level + 1, execsz,
1529 1580 setid, exec_file, cred, brand_action);
1530 1581 }
1531 1582 #if defined(_LP64)
1532 1583 else {
1533 1584 error = elf32exec(nvp, uap, args, idata, level + 1, execsz,
1534 1585 setid, exec_file, cred, brand_action);
1535 1586 }
1536 1587 #endif
1537 1588 VN_RELE(nvp);
1538 1589 if (error != 0) {
1539 1590 restoreexecenv(&origenv, &orig_sigaltstack);
1540 1591 return (error);
1541 1592 }
1542 1593
1543 1594 /*
1544 1595 * exec-ed in the brand library above.
1545 1596 * The u_auxv vectors are now setup by elfexec to point to the
1546 1597 * brand emulation library and its linker.
1547 1598 */
1548 1599
1549 1600 bzero(&env, sizeof (env));
1550 1601
1551 1602 /*
1552 1603 * map in the the Linux executable
1553 1604 */
1554 1605 if (args->to_model == DATAMODEL_NATIVE) {
1555 1606 error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr,
1556 1607 &voffset, exec_file, &interp, &env.ex_bssbase,
1557 1608 &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1558 1609 }
1559 1610 #if defined(_LP64)
1560 1611 else {
1561 1612 Elf32_Ehdr ehdr32;
1562 1613 Elf32_Addr uphdr_vaddr32;
1563 1614
1564 1615 error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32,
1565 1616 &voffset, exec_file, &interp, &env.ex_bssbase,
1566 1617 &env.ex_brkbase, &env.ex_brksize, NULL, NULL);
1567 1618
1568 1619 Ehdr32to64(&ehdr32, &ehdr);
1569 1620
1570 1621 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1571 1622 uphdr_vaddr = (Addr)-1;
1572 1623 else
1573 1624 uphdr_vaddr = uphdr_vaddr32;
1574 1625 }
1575 1626 #endif
1576 1627 if (error != 0) {
1577 1628 restoreexecenv(&origenv, &orig_sigaltstack);
1578 1629
1579 1630 if (interp != NULL)
1580 1631 kmem_free(interp, MAXPATHLEN);
1581 1632
1582 1633 return (error);
1583 1634 }
1584 1635
1585 1636 /*
1586 1637 * Save off the important properties of the lx executable. The brand
1587 1638 * library will ask us for this data later, when it is ready to set
1588 1639 * things up for the lx executable.
1589 1640 */
1590 1641 edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff :
1591 1642 voffset + uphdr_vaddr;
1592 1643 edp->ed_entry = voffset + ehdr.e_entry;
1593 1644 edp->ed_phent = ehdr.e_phentsize;
1594 1645 edp->ed_phnum = ehdr.e_phnum;
1595 1646
1596 1647 if (interp != NULL) {
1597 1648 if (ehdr.e_type == ET_DYN) {
1598 1649 /*
1599 1650 * This is a shared object executable, so we need to
1600 1651 * pick a reasonable place to put the heap. Just don't
1601 1652 * use the first page.
1602 1653 */
1603 1654 env.ex_brkbase = (caddr_t)PAGESIZE;
1604 1655 env.ex_bssbase = (caddr_t)PAGESIZE;
1605 1656 }
1606 1657
1607 1658 /*
1608 1659 * If the program needs an interpreter (most do), map it in and
1609 1660 * store relevant information about it in the aux vector, where
1610 1661 * the brand library can find it.
1611 1662 */
1612 1663 if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
1613 1664 NULLVPP, &nvp))) {
1614 1665 uprintf("%s: not found.", interp);
1615 1666 restoreexecenv(&origenv, &orig_sigaltstack);
1616 1667 kmem_free(interp, MAXPATHLEN);
1617 1668 return (error);
1618 1669 }
1619 1670
1620 1671 kmem_free(interp, MAXPATHLEN);
1621 1672 interp = NULL;
1622 1673
1623 1674 /*
1624 1675 * map in the Linux linker
1625 1676 */
1626 1677 if (args->to_model == DATAMODEL_NATIVE) {
1627 1678 error = mapexec_brand(nvp, args, &ehdr,
1628 1679 &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
1629 1680 NULL, NULL, NULL, &ldaddr);
1630 1681 }
1631 1682 #if defined(_LP64)
1632 1683 else {
1633 1684 Elf32_Ehdr ehdr32;
1634 1685 Elf32_Addr uphdr_vaddr32;
1635 1686
1636 1687 error = mapexec32_brand(nvp, args, &ehdr32,
1637 1688 &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
1638 1689 NULL, NULL, NULL, &ldaddr);
1639 1690
1640 1691 Ehdr32to64(&ehdr32, &ehdr);
1641 1692
1642 1693 if (uphdr_vaddr32 == (Elf32_Addr)-1)
1643 1694 uphdr_vaddr = (Addr)-1;
1644 1695 else
1645 1696 uphdr_vaddr = uphdr_vaddr32;
1646 1697 }
1647 1698 #endif
1648 1699
1649 1700 VN_RELE(nvp);
1650 1701 if (error != 0) {
1651 1702 restoreexecenv(&origenv, &orig_sigaltstack);
1652 1703 return (error);
1653 1704 }
1654 1705
1655 1706 /*
1656 1707 * Now that we know the base address of the brand's linker,
1657 1708 * we also save this for later use by the brand library.
1658 1709 */
1659 1710 edp->ed_base = voffset;
1660 1711 edp->ed_ldentry = voffset + ehdr.e_entry;
1661 1712 } else {
1662 1713 /*
1663 1714 * This program has no interpreter. The lx brand library will
1664 1715 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
1665 1716 * so in this case, put the entry point of the main executable
1666 1717 * there.
1667 1718 */
1668 1719 if (ehdr.e_type == ET_EXEC) {
1669 1720 /*
1670 1721 * An executable with no interpreter, this must be a
1671 1722 * statically linked executable, which means we loaded
1672 1723 * it at the address specified in the elf header, in
1673 1724 * which case the e_entry field of the elf header is an
1674 1725 * absolute address.
1675 1726 */
1676 1727 edp->ed_ldentry = ehdr.e_entry;
1677 1728 edp->ed_entry = ehdr.e_entry;
1678 1729 } else {
1679 1730 /*
1680 1731 * A shared object with no interpreter, we use the
1681 1732 * calculated address from above.
1682 1733 */
1683 1734 edp->ed_ldentry = edp->ed_entry;
1684 1735
1685 1736 /*
1686 1737 * In all situations except an ET_DYN elf object with no
1687 1738 * interpreter, we want to leave the brk and base
1688 1739 * values set by mapexec_brand alone. Normally when
1689 1740 * running ET_DYN objects on Solaris (most likely
1690 1741 * /lib/ld.so.1) the kernel sets brk and base to 0 since
1691 1742 * it doesn't know where to put the heap, and later the
1692 1743 * linker will call brk() to initialize the heap in:
1693 1744 * usr/src/cmd/sgs/rtld/common/setup.c:setup()
1694 1745 * after it has determined where to put it. (This
1695 1746 * decision is made after the linker loads and inspects
1696 1747 * elf properties of the target executable being run.)
1697 1748 *
1698 1749 * So for ET_DYN Linux executables, we also don't know
1699 1750 * where the heap should go, so we'll set the brk and
1700 1751 * base to 0. But in this case the Solaris linker will
1701 1752 * not initialize the heap, so when the Linux linker
1702 1753 * starts running there is no heap allocated. This
1703 1754 * seems to be ok on Linux 2.4 based systems because the
1704 1755 * Linux linker/libc fall back to using mmap() to
1705 1756 * allocate memory. But on 2.6 systems, running
1706 1757 * applications by specifying them as command line
1707 1758 * arguments to the linker results in segfaults for an
1708 1759 * as yet undetermined reason (which seems to indicatej
1709 1760 * that a more permanent fix for heap initalization in
1710 1761 * these cases may be necessary).
1711 1762 */
1712 1763 if (ehdr.e_type == ET_DYN) {
1713 1764 env.ex_bssbase = (caddr_t)0;
1714 1765 env.ex_brkbase = (caddr_t)0;
1715 1766 env.ex_brksize = 0;
1716 1767 }
1717 1768 }
1718 1769
1719 1770 }
1720 1771
1721 1772 env.ex_vp = vp;
1722 1773 setexecenv(&env);
1723 1774
1724 1775 /*
1725 1776 * We try to keep /proc's view of the aux vector consistent with
1726 1777 * what's on the process stack.
1727 1778 */
1728 1779 if (args->to_model == DATAMODEL_NATIVE) {
1729 1780 auxv_t phdr_auxv[4] = {
1730 1781 { AT_SUN_BRAND_LX_PHDR, 0 },
1731 1782 { AT_SUN_BRAND_LX_INTERP, 0 },
1732 1783 { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 },
1733 1784 { AT_SUN_BRAND_AUX4, 0 }
1734 1785 };
1735 1786 phdr_auxv[0].a_un.a_val = edp->ed_phdr;
1736 1787 phdr_auxv[1].a_un.a_val = ldaddr;
1737 1788 phdr_auxv[2].a_un.a_val = 1; /* set in lx_init */
1738 1789 phdr_auxv[3].a_type = AT_CLKTCK;
1739 1790 phdr_auxv[3].a_un.a_val = hz;
1740 1791
1741 1792 if (copyout(&phdr_auxv, args->auxp_brand,
1742 1793 sizeof (phdr_auxv)) == -1)
1743 1794 return (EFAULT);
1744 1795 }
1745 1796 #if defined(_LP64)
1746 1797 else {
1747 1798 auxv32_t phdr_auxv32[3] = {
1748 1799 { AT_SUN_BRAND_LX_PHDR, 0 },
1749 1800 { AT_SUN_BRAND_LX_INTERP, 0 },
1750 1801 { AT_SUN_BRAND_AUX3, 0 }
1751 1802 };
1752 1803 phdr_auxv32[0].a_un.a_val = edp->ed_phdr;
1753 1804 phdr_auxv32[1].a_un.a_val = ldaddr;
1754 1805 phdr_auxv32[2].a_type = AT_CLKTCK;
1755 1806 phdr_auxv32[2].a_un.a_val = hz;
1756 1807
1757 1808 if (copyout(&phdr_auxv32, args->auxp_brand,
1758 1809 sizeof (phdr_auxv32)) == -1)
1759 1810 return (EFAULT);
1760 1811 }
1761 1812 #endif
1762 1813
1763 1814 /*
1764 1815 * /proc uses the AT_ENTRY aux vector entry to deduce
1765 1816 * the location of the executable in the address space. The user
1766 1817 * structure contains a copy of the aux vector that needs to have those
1767 1818 * entries patched with the values of the real lx executable (they
1768 1819 * currently contain the values from the lx brand library that was
1769 1820 * elfexec'd, above).
1770 1821 *
1771 1822 * For live processes, AT_BASE is used to locate the linker segment,
1772 1823 * which /proc and friends will later use to find Solaris symbols
1773 1824 * (such as rtld_db_preinit). However, for core files, /proc uses
1774 1825 * AT_ENTRY to find the right segment to label as the executable.
1775 1826 * So we set AT_ENTRY to be the entry point of the linux executable,
1776 1827 * but leave AT_BASE to be the address of the Solaris linker.
1777 1828 */
1778 1829 for (i = 0; i < __KERN_NAUXV_IMPL; i++) {
1779 1830 switch (up->u_auxv[i].a_type) {
1780 1831 case AT_ENTRY:
1781 1832 up->u_auxv[i].a_un.a_val = edp->ed_entry;
1782 1833 break;
1783 1834
1784 1835 case AT_SUN_BRAND_LX_PHDR:
1785 1836 up->u_auxv[i].a_un.a_val = edp->ed_phdr;
1786 1837 break;
1787 1838
1788 1839 case AT_SUN_BRAND_LX_INTERP:
1789 1840 up->u_auxv[i].a_un.a_val = ldaddr;
1790 1841 break;
1791 1842
1792 1843 default:
1793 1844 break;
1794 1845 }
1795 1846 }
1796 1847
1797 1848 return (0);
1798 1849 }
1799 1850
1800 1851 boolean_t
1801 1852 lx_native_exec(uint8_t osabi, const char **interp)
1802 1853 {
1803 1854 if (osabi != ELFOSABI_SOLARIS)
1804 1855 return (B_FALSE);
1805 1856
1806 1857 /*
1807 1858 * If the process root matches the zone root, prepend /native to the
1808 1859 * interpreter path for native executables. Absolute precision from
1809 1860 * VN_CMP is not necessary since any change of process root is likely
1810 1861 * to make native binaries inaccessible via /native.
1811 1862 *
1812 1863 * Processes which chroot directly into /native will be able to
1813 1864 * function as expected with no need for the prefix.
1814 1865 */
1815 1866 if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) {
1816 1867 *interp = "/native";
1817 1868 }
1818 1869
1819 1870 return (B_TRUE);
1820 1871 }
1821 1872
1822 1873 static void
1823 1874 lx_syscall_init(void)
1824 1875 {
1825 1876 int i;
1826 1877
1827 1878 /*
1828 1879 * Count up the 32-bit Linux system calls. Note that lx_sysent32
1829 1880 * has (LX_NSYSCALLS + 1) entries.
1830 1881 */
1831 1882 for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++)
1832 1883 continue;
1833 1884 lx_nsysent32 = i;
1834 1885
1835 1886 #if defined(_LP64)
1836 1887 /*
1837 1888 * Count up the 64-bit Linux system calls. Note that lx_sysent64
1838 1889 * has (LX_NSYSCALLS + 1) entries.
1839 1890 */
1840 1891 for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++)
1841 1892 continue;
1842 1893 lx_nsysent64 = i;
1843 1894 #endif
1844 1895 }
1845 1896
1846 1897 int
1847 1898 _init(void)
1848 1899 {
1849 1900 int err = 0;
1850 1901
1851 1902 lx_syscall_init();
1852 1903 lx_pid_init();
1853 1904 lx_ioctl_init();
1854 1905 lx_futex_init();
1855 1906 lx_ptrace_init();
1856 1907 lx_socket_init();
1857 1908
1858 1909 err = mod_install(&modlinkage);
1859 1910 if (err != 0) {
1860 1911 cmn_err(CE_WARN, "Couldn't install lx brand module");
1861 1912
1862 1913 /*
1863 1914 * This looks drastic, but it should never happen. These
1864 1915 * two data structures should be completely free-able until
1865 1916 * they are used by Linux processes. Since the brand
1866 1917 * wasn't loaded there should be no Linux processes, and
1867 1918 * thus no way for these data structures to be modified.
1868 1919 */
1869 1920 lx_pid_fini();
1870 1921 lx_ioctl_fini();
1871 1922 if (lx_futex_fini())
1872 1923 panic("lx brand module cannot be loaded or unloaded.");
1873 1924 }
1874 1925 return (err);
1875 1926 }
1876 1927
1877 1928 int
1878 1929 _info(struct modinfo *modinfop)
1879 1930 {
1880 1931 return (mod_info(&modlinkage, modinfop));
1881 1932 }
1882 1933
1883 1934 int
1884 1935 _fini(void)
1885 1936 {
1886 1937 int err;
1887 1938 int futex_done = 0;
1888 1939
1889 1940 /*
1890 1941 * If there are any zones using this brand, we can't allow it to be
1891 1942 * unloaded.
1892 1943 */
1893 1944 if (brand_zone_count(&lx_brand))
1894 1945 return (EBUSY);
1895 1946
1896 1947 lx_ptrace_fini();
1897 1948 lx_pid_fini();
1898 1949 lx_ioctl_fini();
1899 1950 lx_socket_fini();
1900 1951
1901 1952 if ((err = lx_futex_fini()) != 0) {
1902 1953 goto done;
1903 1954 }
1904 1955 futex_done = 1;
1905 1956
1906 1957 err = mod_remove(&modlinkage);
1907 1958
1908 1959 done:
1909 1960 if (err) {
1910 1961 /*
1911 1962 * If we can't unload the module, then we have to get it
1912 1963 * back into a sane state.
1913 1964 */
1914 1965 lx_ptrace_init();
1915 1966 lx_pid_init();
1916 1967 lx_ioctl_init();
1917 1968 lx_socket_init();
1918 1969
1919 1970 if (futex_done) {
1920 1971 lx_futex_init();
1921 1972 }
1922 1973 }
1923 1974
1924 1975 return (err);
1925 1976 }
↓ open down ↓ |
923 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX