/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017, Joyent, Inc.
 */

/*
 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
 *
 * An instance of the structure aggr_grp_t is allocated for each
 * link aggregation group. When created, aggr_grp_t objects are
 * entered into the aggr_grp_hash hash table maintained by the modhash
 * module. The hash key is the linkid associated with the link
 * aggregation group.
 *
 * A set of MAC ports is associated with each aggregation group.
 *
 * Aggr pseudo TX rings
 * --------------------
 * The underlying ports (NICs) in an aggregation can have TX rings. To
 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * it is already present and implemented on the RX side, where they are
 * called pseudo RX rings. The same concept is extended to the TX side, where
 * each TX ring of an underlying port is reflected in aggr as a pseudo
 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
 * TX ring is given to the aggregation layer.
 *
 * With this change, the outgoing stack depth looks much better:
 *
 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
 *
 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
 *
 * In SRS_TX_AGGR mode, the mac_tx_aggr_mode() routine is called. This routine
 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
 * ring belonging to a port on which the packet has to be sent.
 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
 * policy and then uses the fanout_hint passed to it to pick a TX ring from
 * the selected port.
 *
 * In SRS_TX_BW_AGGR mode, the mac_tx_bw_mode() function is called, where the
 * bandwidth limit is applied first on the outgoing packet, and the packets
 * allowed to go out then call mac_tx_aggr_mode() to send the packet on a
 * particular TX ring.
68 */ 69 70 #include <sys/types.h> 71 #include <sys/sysmacros.h> 72 #include <sys/conf.h> 73 #include <sys/cmn_err.h> 74 #include <sys/disp.h> 75 #include <sys/list.h> 76 #include <sys/ksynch.h> 77 #include <sys/kmem.h> 78 #include <sys/stream.h> 79 #include <sys/modctl.h> 80 #include <sys/ddi.h> 81 #include <sys/sunddi.h> 82 #include <sys/atomic.h> 83 #include <sys/stat.h> 84 #include <sys/modhash.h> 85 #include <sys/id_space.h> 86 #include <sys/strsun.h> 87 #include <sys/cred.h> 88 #include <sys/dlpi.h> 89 #include <sys/zone.h> 90 #include <sys/mac_provider.h> 91 #include <sys/dls.h> 92 #include <sys/vlan.h> 93 #include <sys/aggr.h> 94 #include <sys/aggr_impl.h> 95 96 static int aggr_m_start(void *); 97 static void aggr_m_stop(void *); 98 static int aggr_m_promisc(void *, boolean_t); 99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 100 static int aggr_m_unicst(void *, const uint8_t *); 101 static int aggr_m_stat(void *, uint_t, uint64_t *); 102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *); 103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 105 const void *); 106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 107 mac_prop_info_handle_t); 108 109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 111 boolean_t *); 112 113 static void aggr_grp_capab_set(aggr_grp_t *); 114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 115 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 116 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 119 120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, 
aggr_pseudo_rx_group_t *); 122 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 123 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); 125 static int aggr_addmac(void *, const uint8_t *); 126 static int aggr_remmac(void *, const uint8_t *); 127 static mblk_t *aggr_rx_poll(void *, int); 128 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 129 const int, mac_ring_info_t *, mac_ring_handle_t); 130 static void aggr_fill_group(void *, mac_ring_type_t, const int, 131 mac_group_info_t *, mac_group_handle_t); 132 133 static kmem_cache_t *aggr_grp_cache; 134 static mod_hash_t *aggr_grp_hash; 135 static krwlock_t aggr_grp_lock; 136 static uint_t aggr_grp_cnt; 137 static id_space_t *key_ids; 138 139 #define GRP_HASHSZ 64 140 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 141 #define AGGR_PORT_NAME_DELIMIT '-' 142 143 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 144 145 #define AGGR_M_CALLBACK_FLAGS \ 146 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 147 148 static mac_callbacks_t aggr_m_callbacks = { 149 AGGR_M_CALLBACK_FLAGS, 150 aggr_m_stat, 151 aggr_m_start, 152 aggr_m_stop, 153 aggr_m_promisc, 154 aggr_m_multicst, 155 NULL, 156 NULL, 157 NULL, 158 aggr_m_ioctl, 159 aggr_m_capab_get, 160 NULL, 161 NULL, 162 aggr_m_setprop, 163 NULL, 164 aggr_m_propinfo 165 }; 166 167 /*ARGSUSED*/ 168 static int 169 aggr_grp_constructor(void *buf, void *arg, int kmflag) 170 { 171 aggr_grp_t *grp = buf; 172 173 bzero(grp, sizeof (*grp)); 174 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 175 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 176 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 177 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 178 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 179 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 180 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 181 
grp->lg_link_state = LINK_STATE_UNKNOWN; 182 return (0); 183 } 184 185 /*ARGSUSED*/ 186 static void 187 aggr_grp_destructor(void *buf, void *arg) 188 { 189 aggr_grp_t *grp = buf; 190 191 if (grp->lg_tx_ports != NULL) { 192 kmem_free(grp->lg_tx_ports, 193 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 194 } 195 196 mutex_destroy(&grp->lg_lacp_lock); 197 cv_destroy(&grp->lg_lacp_cv); 198 mutex_destroy(&grp->lg_port_lock); 199 cv_destroy(&grp->lg_port_cv); 200 rw_destroy(&grp->lg_tx_lock); 201 mutex_destroy(&grp->lg_tx_flowctl_lock); 202 cv_destroy(&grp->lg_tx_flowctl_cv); 203 } 204 205 void 206 aggr_grp_init(void) 207 { 208 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 209 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 210 aggr_grp_destructor, NULL, NULL, NULL, 0); 211 212 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash", 213 GRP_HASHSZ, mod_hash_null_valdtor); 214 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 215 aggr_grp_cnt = 0; 216 217 /* 218 * Allocate an id space to manage key values (when key is not 219 * specified). The range of the id space will be from 220 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 221 * uses a 16-bit key. 222 */ 223 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 224 ASSERT(key_ids != NULL); 225 } 226 227 void 228 aggr_grp_fini(void) 229 { 230 id_space_destroy(key_ids); 231 rw_destroy(&aggr_grp_lock); 232 mod_hash_destroy_idhash(aggr_grp_hash); 233 kmem_cache_destroy(aggr_grp_cache); 234 } 235 236 uint_t 237 aggr_grp_count(void) 238 { 239 uint_t count; 240 241 rw_enter(&aggr_grp_lock, RW_READER); 242 count = aggr_grp_cnt; 243 rw_exit(&aggr_grp_lock); 244 return (count); 245 } 246 247 /* 248 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 249 * requires the mac perimeter, this function holds a reference of the aggr 250 * and aggr won't call mac_unregister() until this reference drops to 0. 
251 */ 252 void 253 aggr_grp_port_hold(aggr_port_t *port) 254 { 255 aggr_grp_t *grp = port->lp_grp; 256 257 AGGR_PORT_REFHOLD(port); 258 mutex_enter(&grp->lg_port_lock); 259 grp->lg_port_ref++; 260 mutex_exit(&grp->lg_port_lock); 261 } 262 263 /* 264 * Release the reference of the grp and inform aggr_grp_delete() calling 265 * mac_unregister() is now safe. 266 */ 267 void 268 aggr_grp_port_rele(aggr_port_t *port) 269 { 270 aggr_grp_t *grp = port->lp_grp; 271 272 mutex_enter(&grp->lg_port_lock); 273 if (--grp->lg_port_ref == 0) 274 cv_signal(&grp->lg_port_cv); 275 mutex_exit(&grp->lg_port_lock); 276 AGGR_PORT_REFRELE(port); 277 } 278 279 /* 280 * Wait for the port's lacp timer thread and the port's notification callback 281 * to exit. 282 */ 283 void 284 aggr_grp_port_wait(aggr_grp_t *grp) 285 { 286 mutex_enter(&grp->lg_port_lock); 287 if (grp->lg_port_ref != 0) 288 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 289 mutex_exit(&grp->lg_port_lock); 290 } 291 292 /* 293 * Attach a port to a link aggregation group. 294 * 295 * A port is attached to a link aggregation group once its speed 296 * and link state have been verified. 297 * 298 * Returns B_TRUE if the group link state or speed has changed. If 299 * it's the case, the caller must notify the MAC layer via a call 300 * to mac_link(). 301 */ 302 boolean_t 303 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 304 { 305 boolean_t link_state_changed = B_FALSE; 306 307 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 308 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 309 310 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 311 return (B_FALSE); 312 313 /* 314 * Validate the MAC port link speed and update the group 315 * link speed if needed. 316 */ 317 if (port->lp_ifspeed == 0 || 318 port->lp_link_state != LINK_STATE_UP || 319 port->lp_link_duplex != LINK_DUPLEX_FULL) { 320 /* 321 * Can't attach a MAC port with unknown link speed, 322 * down link, or not in full duplex mode. 
323 */ 324 return (B_FALSE); 325 } 326 327 if (grp->lg_ifspeed == 0) { 328 /* 329 * The group inherits the speed of the first link being 330 * attached. 331 */ 332 grp->lg_ifspeed = port->lp_ifspeed; 333 link_state_changed = B_TRUE; 334 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 335 /* 336 * The link speed of the MAC port must be the same as 337 * the group link speed, as per 802.3ad. Since it is 338 * not, the attach is cancelled. 339 */ 340 return (B_FALSE); 341 } 342 343 grp->lg_nattached_ports++; 344 345 /* 346 * Update the group link state. 347 */ 348 if (grp->lg_link_state != LINK_STATE_UP) { 349 grp->lg_link_state = LINK_STATE_UP; 350 grp->lg_link_duplex = LINK_DUPLEX_FULL; 351 link_state_changed = B_TRUE; 352 } 353 354 /* 355 * Update port's state. 356 */ 357 port->lp_state = AGGR_PORT_STATE_ATTACHED; 358 359 aggr_grp_multicst_port(port, B_TRUE); 360 361 /* 362 * Set port's receive callback 363 */ 364 mac_rx_set(port->lp_mch, aggr_recv_cb, port); 365 366 /* 367 * If LACP is OFF, the port can be used to send data as soon 368 * as its link is up and verified to be compatible with the 369 * aggregation. 370 * 371 * If LACP is active or passive, notify the LACP subsystem, which 372 * will enable sending on the port following the LACP protocol. 
373 */ 374 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 375 aggr_send_port_enable(port); 376 else 377 aggr_lacp_port_attached(port); 378 379 return (link_state_changed); 380 } 381 382 boolean_t 383 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 384 { 385 boolean_t link_state_changed = B_FALSE; 386 387 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 388 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 389 390 /* update state */ 391 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 392 return (B_FALSE); 393 394 mac_rx_clear(port->lp_mch); 395 396 aggr_grp_multicst_port(port, B_FALSE); 397 398 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 399 aggr_send_port_disable(port); 400 else 401 aggr_lacp_port_detached(port); 402 403 port->lp_state = AGGR_PORT_STATE_STANDBY; 404 405 grp->lg_nattached_ports--; 406 if (grp->lg_nattached_ports == 0) { 407 /* the last attached MAC port of the group is being detached */ 408 grp->lg_ifspeed = 0; 409 grp->lg_link_state = LINK_STATE_DOWN; 410 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 411 link_state_changed = B_TRUE; 412 } 413 414 return (link_state_changed); 415 } 416 417 /* 418 * Update the MAC addresses of the constituent ports of the specified 419 * group. This function is invoked: 420 * - after creating a new aggregation group. 421 * - after adding new ports to an aggregation group. 422 * - after removing a port from a group when the MAC address of 423 * that port was used for the MAC address of the group. 424 * - after the MAC address of a port changed when the MAC address 425 * of that port was used for the MAC address of the group. 426 * 427 * Return true if the link state of the aggregation changed, for example 428 * as a result of a failure changing the MAC address of one of the 429 * constituent ports. 
430 */ 431 boolean_t 432 aggr_grp_update_ports_mac(aggr_grp_t *grp) 433 { 434 aggr_port_t *cport; 435 boolean_t link_state_changed = B_FALSE; 436 mac_perim_handle_t mph; 437 438 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 439 440 for (cport = grp->lg_ports; cport != NULL; 441 cport = cport->lp_next) { 442 mac_perim_enter_by_mh(cport->lp_mh, &mph); 443 if (aggr_port_unicst(cport) != 0) { 444 if (aggr_grp_detach_port(grp, cport)) 445 link_state_changed = B_TRUE; 446 } else { 447 /* 448 * If a port was detached because of a previous 449 * failure changing the MAC address, the port is 450 * reattached when it successfully changes the MAC 451 * address now, and this might cause the link state 452 * of the aggregation to change. 453 */ 454 if (aggr_grp_attach_port(grp, cport)) 455 link_state_changed = B_TRUE; 456 } 457 mac_perim_exit(mph); 458 } 459 return (link_state_changed); 460 } 461 462 /* 463 * Invoked when the MAC address of a port has changed. If the port's 464 * MAC address was used for the group MAC address, set mac_addr_changedp 465 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 466 * notification. If the link state changes due to detach/attach of 467 * the constituent port, set link_state_changedp to B_TRUE to indicate 468 * to the caller that it should send a MAC_NOTE_LINK notification. In both 469 * cases, it is the responsibility of the caller to invoke notification 470 * functions after releasing the the port lock. 471 */ 472 void 473 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 474 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 475 { 476 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 477 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 478 ASSERT(mac_addr_changedp != NULL); 479 ASSERT(link_state_changedp != NULL); 480 481 *mac_addr_changedp = B_FALSE; 482 *link_state_changedp = B_FALSE; 483 484 if (grp->lg_addr_fixed) { 485 /* 486 * The group is using a fixed MAC address or an automatic 487 * MAC address has not been set. 
488 */ 489 return; 490 } 491 492 if (grp->lg_mac_addr_port == port) { 493 /* 494 * The MAC address of the port was assigned to the group 495 * MAC address. Update the group MAC address. 496 */ 497 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 498 *mac_addr_changedp = B_TRUE; 499 } else { 500 /* 501 * Update the actual port MAC address to the MAC address 502 * of the group. 503 */ 504 if (aggr_port_unicst(port) != 0) { 505 *link_state_changedp = aggr_grp_detach_port(grp, port); 506 } else { 507 /* 508 * If a port was detached because of a previous 509 * failure changing the MAC address, the port is 510 * reattached when it successfully changes the MAC 511 * address now, and this might cause the link state 512 * of the aggregation to change. 513 */ 514 *link_state_changedp = aggr_grp_attach_port(grp, port); 515 } 516 } 517 } 518 519 /* 520 * Add a port to a link aggregation group. 521 */ 522 static int 523 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 524 aggr_port_t **pp) 525 { 526 aggr_port_t *port, **cport; 527 mac_perim_handle_t mph; 528 zoneid_t port_zoneid = ALL_ZONES; 529 int err; 530 531 /* The port must be int the same zone as the aggregation. */ 532 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 533 port_zoneid = GLOBAL_ZONEID; 534 if (grp->lg_zoneid != port_zoneid) 535 return (EBUSY); 536 537 /* 538 * lg_mh could be NULL when the function is called during the creation 539 * of the aggregation. 540 */ 541 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 542 543 /* create new port */ 544 err = aggr_port_create(grp, port_linkid, force, &port); 545 if (err != 0) 546 return (err); 547 548 mac_perim_enter_by_mh(port->lp_mh, &mph); 549 550 /* add port to list of group constituent ports */ 551 cport = &grp->lg_ports; 552 while (*cport != NULL) 553 cport = &((*cport)->lp_next); 554 *cport = port; 555 556 /* 557 * Back reference to the group it is member of. 
A port always 558 * holds a reference to its group to ensure that the back 559 * reference is always valid. 560 */ 561 port->lp_grp = grp; 562 AGGR_GRP_REFHOLD(grp); 563 grp->lg_nports++; 564 565 aggr_lacp_init_port(port); 566 mac_perim_exit(mph); 567 568 if (pp != NULL) 569 *pp = port; 570 571 return (0); 572 } 573 574 /* 575 * This is called in response to either our LACP state machine or a MAC 576 * notification that the link has gone down via aggr_send_port_disable(). At 577 * this point, we may need to update our default ring. To that end, we go 578 * through the set of ports (underlying datalinks in an aggregation) that are 579 * currently enabled to transmit data. If all our links have been disabled for 580 * transmit, then we don't do anything. 581 * 582 * Note, because we only have a single TX group, we don't have to worry about 583 * the rings moving between groups and the chance that mac will reassign it 584 * unless someone removes a port, at which point, we play it safe and call this 585 * again. 586 */ 587 void 588 aggr_grp_update_default(aggr_grp_t *grp) 589 { 590 aggr_port_t *port; 591 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 592 593 rw_enter(&grp->lg_tx_lock, RW_WRITER); 594 595 if (grp->lg_ntx_ports == 0) { 596 rw_exit(&grp->lg_tx_lock); 597 return; 598 } 599 600 port = grp->lg_tx_ports[0]; 601 ASSERT(port->lp_tx_ring_cnt > 0); 602 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 603 rw_exit(&grp->lg_tx_lock); 604 } 605 606 /* 607 * Add a pseudo RX ring for the given HW ring handle. 608 */ 609 static int 610 aggr_add_pseudo_rx_ring(aggr_port_t *port, 611 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 612 { 613 aggr_pseudo_rx_ring_t *ring; 614 int err; 615 int j; 616 617 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 618 ring = rx_grp->arg_rings + j; 619 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 620 break; 621 } 622 623 /* 624 * No slot for this new RX ring. 
625 */ 626 if (j == MAX_RINGS_PER_GROUP) 627 return (EIO); 628 629 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 630 ring->arr_hw_rh = hw_rh; 631 ring->arr_port = port; 632 rx_grp->arg_ring_cnt++; 633 634 /* 635 * The group is already registered, dynamically add a new ring to the 636 * mac group. 637 */ 638 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 639 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 640 ring->arr_hw_rh = NULL; 641 ring->arr_port = NULL; 642 rx_grp->arg_ring_cnt--; 643 } else { 644 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 645 mac_find_ring(rx_grp->arg_gh, j)); 646 } 647 return (err); 648 } 649 650 /* 651 * Remove the pseudo RX ring of the given HW ring handle. 652 */ 653 static void 654 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 655 { 656 aggr_pseudo_rx_ring_t *ring; 657 int j; 658 659 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 660 ring = rx_grp->arg_rings + j; 661 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 662 ring->arr_hw_rh != hw_rh) { 663 continue; 664 } 665 666 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 667 668 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 669 ring->arr_hw_rh = NULL; 670 ring->arr_port = NULL; 671 rx_grp->arg_ring_cnt--; 672 mac_hwring_teardown(hw_rh); 673 break; 674 } 675 } 676 677 /* 678 * This function is called to create pseudo rings over the hardware rings of 679 * the underlying device. Note that there is a 1:1 mapping between the pseudo 680 * RX rings of the aggr and the hardware rings of the underlying port. 
681 */ 682 static int 683 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 684 { 685 aggr_grp_t *grp = port->lp_grp; 686 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 687 aggr_unicst_addr_t *addr, *a; 688 mac_perim_handle_t pmph; 689 int hw_rh_cnt, i = 0, j; 690 int err = 0; 691 692 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 693 mac_perim_enter_by_mh(port->lp_mh, &pmph); 694 695 /* 696 * This function must be called after the aggr registers its mac 697 * and its RX group has been initialized. 698 */ 699 ASSERT(rx_grp->arg_gh != NULL); 700 701 /* 702 * Get the list the the underlying HW rings. 703 */ 704 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 705 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); 706 707 if (port->lp_hwgh != NULL) { 708 /* 709 * Quiesce the HW ring and the mac srs on the ring. Note 710 * that the HW ring will be restarted when the pseudo ring 711 * is started. At that time all the packets will be 712 * directly passed up to the pseudo RX ring and handled 713 * by mac srs created over the pseudo RX ring. 714 */ 715 mac_rx_client_quiesce(port->lp_mch); 716 mac_srs_perm_quiesce(port->lp_mch, B_TRUE); 717 } 718 719 /* 720 * Add all the unicast addresses to the newly added port. 
721 */ 722 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 723 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) 724 break; 725 } 726 727 for (i = 0; err == 0 && i < hw_rh_cnt; i++) 728 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 729 730 if (err != 0) { 731 for (j = 0; j < i; j++) 732 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 733 734 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 735 aggr_port_remmac(port, a->aua_addr); 736 737 if (port->lp_hwgh != NULL) { 738 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 739 mac_rx_client_restart(port->lp_mch); 740 port->lp_hwgh = NULL; 741 } 742 } else { 743 port->lp_rx_grp_added = B_TRUE; 744 } 745 done: 746 mac_perim_exit(pmph); 747 return (err); 748 } 749 750 /* 751 * This function is called by aggr to remove pseudo RX rings over the 752 * HW rings of the underlying port. 753 */ 754 static void 755 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 756 { 757 aggr_grp_t *grp = port->lp_grp; 758 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 759 aggr_unicst_addr_t *addr; 760 mac_group_handle_t hwgh; 761 mac_perim_handle_t pmph; 762 int hw_rh_cnt, i; 763 764 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 765 mac_perim_enter_by_mh(port->lp_mh, &pmph); 766 767 if (!port->lp_rx_grp_added) 768 goto done; 769 770 ASSERT(rx_grp->arg_gh != NULL); 771 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 772 &hwgh, hw_rh, MAC_RING_TYPE_RX); 773 774 /* 775 * If hw_rh_cnt is 0, it means that the underlying port does not 776 * support RX rings. Directly return in this case. 777 */ 778 for (i = 0; i < hw_rh_cnt; i++) 779 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 780 781 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 782 aggr_port_remmac(port, addr->aua_addr); 783 784 if (port->lp_hwgh != NULL) { 785 port->lp_hwgh = NULL; 786 787 /* 788 * First clear the permanent-quiesced flag of the RX srs then 789 * restart the HW ring and the mac srs on the ring. 
Note that 790 * the HW ring and associated SRS will soon been removed when 791 * the port is removed from the aggr. 792 */ 793 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 794 mac_rx_client_restart(port->lp_mch); 795 } 796 797 port->lp_rx_grp_added = B_FALSE; 798 done: 799 mac_perim_exit(pmph); 800 } 801 802 /* 803 * Add a pseudo TX ring for the given HW ring handle. 804 */ 805 static int 806 aggr_add_pseudo_tx_ring(aggr_port_t *port, 807 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 808 mac_ring_handle_t *pseudo_rh) 809 { 810 aggr_pseudo_tx_ring_t *ring; 811 int err; 812 int i; 813 814 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 815 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 816 ring = tx_grp->atg_rings + i; 817 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 818 break; 819 } 820 /* 821 * No slot for this new TX ring. 822 */ 823 if (i == MAX_RINGS_PER_GROUP) 824 return (EIO); 825 /* 826 * The following 4 statements needs to be done before 827 * calling mac_group_add_ring(). Otherwise it will 828 * result in an assertion failure in mac_init_ring(). 829 */ 830 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 831 ring->atr_hw_rh = hw_rh; 832 ring->atr_port = port; 833 tx_grp->atg_ring_cnt++; 834 835 /* 836 * The TX side has no concept of ring groups unlike RX groups. 837 * There is just a single group which stores all the TX rings. 838 * This group will be used to store aggr's pseudo TX rings. 839 */ 840 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 841 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 842 ring->atr_hw_rh = NULL; 843 ring->atr_port = NULL; 844 tx_grp->atg_ring_cnt--; 845 } else { 846 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 847 if (hw_rh != NULL) { 848 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 849 mac_find_ring(tx_grp->atg_gh, i)); 850 } 851 } 852 853 return (err); 854 } 855 856 /* 857 * Remove the pseudo TX ring of the given HW ring handle. 
858 */ 859 static void 860 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 861 mac_ring_handle_t pseudo_hw_rh) 862 { 863 aggr_pseudo_tx_ring_t *ring; 864 int i; 865 866 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 867 ring = tx_grp->atg_rings + i; 868 if (ring->atr_rh != pseudo_hw_rh) 869 continue; 870 871 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 872 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 873 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 874 mac_hwring_teardown(ring->atr_hw_rh); 875 ring->atr_hw_rh = NULL; 876 ring->atr_port = NULL; 877 tx_grp->atg_ring_cnt--; 878 break; 879 } 880 } 881 882 /* 883 * This function is called to create pseudo rings over hardware rings of 884 * the underlying device. There is a 1:1 mapping between the pseudo TX 885 * rings of the aggr and the hardware rings of the underlying port. 886 */ 887 static int 888 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 889 { 890 aggr_grp_t *grp = port->lp_grp; 891 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 892 mac_perim_handle_t pmph; 893 int hw_rh_cnt, i = 0, j; 894 int err = 0; 895 896 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 897 mac_perim_enter_by_mh(port->lp_mh, &pmph); 898 899 /* 900 * Get the list the the underlying HW rings. 901 */ 902 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 903 NULL, hw_rh, MAC_RING_TYPE_TX); 904 905 /* 906 * Even if the underlying NIC does not have TX rings, we 907 * still make a psuedo TX ring for that NIC with NULL as 908 * the ring handle. 
909 */ 910 if (hw_rh_cnt == 0) 911 port->lp_tx_ring_cnt = 1; 912 else 913 port->lp_tx_ring_cnt = hw_rh_cnt; 914 915 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 916 port->lp_tx_ring_cnt), KM_SLEEP); 917 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 918 port->lp_tx_ring_cnt), KM_SLEEP); 919 920 if (hw_rh_cnt == 0) { 921 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 922 NULL, &pseudo_rh)) == 0) { 923 port->lp_tx_rings[0] = NULL; 924 port->lp_pseudo_tx_rings[0] = pseudo_rh; 925 } 926 } else { 927 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 928 err = aggr_add_pseudo_tx_ring(port, 929 tx_grp, hw_rh[i], &pseudo_rh); 930 if (err != 0) 931 break; 932 port->lp_tx_rings[i] = hw_rh[i]; 933 port->lp_pseudo_tx_rings[i] = pseudo_rh; 934 } 935 } 936 937 if (err != 0) { 938 if (hw_rh_cnt != 0) { 939 for (j = 0; j < i; j++) { 940 aggr_rem_pseudo_tx_ring(tx_grp, 941 port->lp_pseudo_tx_rings[j]); 942 } 943 } 944 kmem_free(port->lp_tx_rings, 945 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 946 kmem_free(port->lp_pseudo_tx_rings, 947 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 948 port->lp_tx_ring_cnt = 0; 949 } else { 950 port->lp_tx_grp_added = B_TRUE; 951 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 952 aggr_tx_ring_update, port); 953 } 954 mac_perim_exit(pmph); 955 aggr_grp_update_default(grp); 956 return (err); 957 } 958 959 /* 960 * This function is called by aggr to remove pseudo TX rings over the 961 * HW rings of the underlying port. 
962 */ 963 static void 964 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 965 { 966 aggr_grp_t *grp = port->lp_grp; 967 mac_perim_handle_t pmph; 968 int i; 969 970 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 971 mac_perim_enter_by_mh(port->lp_mh, &pmph); 972 973 if (!port->lp_tx_grp_added) 974 goto done; 975 976 ASSERT(tx_grp->atg_gh != NULL); 977 978 for (i = 0; i < port->lp_tx_ring_cnt; i++) 979 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 980 981 kmem_free(port->lp_tx_rings, 982 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 983 kmem_free(port->lp_pseudo_tx_rings, 984 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 985 986 port->lp_tx_ring_cnt = 0; 987 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 988 port->lp_tx_grp_added = B_FALSE; 989 aggr_grp_update_default(grp); 990 done: 991 mac_perim_exit(pmph); 992 } 993 994 static int 995 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 996 { 997 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 998 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 999 } 1000 1001 static int 1002 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1003 { 1004 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1005 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1006 } 1007 1008 /* 1009 * Here we need to start the pseudo-ring. As MAC already ensures that the 1010 * underlying device is set up, all we need to do is save the ring generation. 1011 * 1012 * Note, we don't end up wanting to use the underlying mac_hwring_start/stop 1013 * functions here as those don't actually stop and start the ring, they just 1014 * quiesce the ring. Regardless of whether the aggr is logically up or not, we 1015 * want to make sure that we can receive traffic for LACP. 
1016 */ 1017 static int 1018 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1019 { 1020 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1021 1022 rr_ring->arr_gen = mr_gen; 1023 return (0); 1024 } 1025 1026 /* 1027 * Add one or more ports to an existing link aggregation group. 1028 */ 1029 int 1030 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1031 laioc_port_t *ports) 1032 { 1033 int rc, i, nadded = 0; 1034 aggr_grp_t *grp = NULL; 1035 aggr_port_t *port; 1036 boolean_t link_state_changed = B_FALSE; 1037 mac_perim_handle_t mph, pmph; 1038 1039 /* get group corresponding to linkid */ 1040 rw_enter(&aggr_grp_lock, RW_READER); 1041 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1042 (mod_hash_val_t *)&grp) != 0) { 1043 rw_exit(&aggr_grp_lock); 1044 return (ENOENT); 1045 } 1046 AGGR_GRP_REFHOLD(grp); 1047 1048 /* 1049 * Hold the perimeter so that the aggregation won't be destroyed. 1050 */ 1051 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1052 rw_exit(&aggr_grp_lock); 1053 1054 /* add the specified ports to group */ 1055 for (i = 0; i < nports; i++) { 1056 /* add port to group */ 1057 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1058 force, &port)) != 0) { 1059 goto bail; 1060 } 1061 ASSERT(port != NULL); 1062 nadded++; 1063 1064 /* check capabilities */ 1065 if (!aggr_grp_capab_check(grp, port) || 1066 !aggr_grp_sdu_check(grp, port) || 1067 !aggr_grp_margin_check(grp, port)) { 1068 rc = ENOTSUP; 1069 goto bail; 1070 } 1071 1072 /* 1073 * Create the pseudo ring for each HW ring of the underlying 1074 * port. 
1075 */ 1076 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1077 if (rc != 0) 1078 goto bail; 1079 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); 1080 if (rc != 0) 1081 goto bail; 1082 1083 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1084 1085 /* set LACP mode */ 1086 aggr_port_lacp_set_mode(grp, port); 1087 1088 /* start port if group has already been started */ 1089 if (grp->lg_started) { 1090 rc = aggr_port_start(port); 1091 if (rc != 0) { 1092 mac_perim_exit(pmph); 1093 goto bail; 1094 } 1095 1096 /* 1097 * Turn on the promiscuous mode over the port when it 1098 * is requested to be turned on to receive the 1099 * non-primary address over a port, or the promiscous 1100 * mode is enabled over the aggr. 1101 */ 1102 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1103 rc = aggr_port_promisc(port, B_TRUE); 1104 if (rc != 0) { 1105 mac_perim_exit(pmph); 1106 goto bail; 1107 } 1108 } 1109 } 1110 mac_perim_exit(pmph); 1111 1112 /* 1113 * Attach each port if necessary. 1114 */ 1115 if (aggr_port_notify_link(grp, port)) 1116 link_state_changed = B_TRUE; 1117 1118 /* 1119 * Initialize the callback functions for this port. 
1120 */ 1121 aggr_port_init_callbacks(port); 1122 } 1123 1124 /* update the MAC address of the constituent ports */ 1125 if (aggr_grp_update_ports_mac(grp)) 1126 link_state_changed = B_TRUE; 1127 1128 if (link_state_changed) 1129 mac_link_update(grp->lg_mh, grp->lg_link_state); 1130 1131 bail: 1132 if (rc != 0) { 1133 /* stop and remove ports that have been added */ 1134 for (i = 0; i < nadded; i++) { 1135 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1136 ASSERT(port != NULL); 1137 if (grp->lg_started) { 1138 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1139 (void) aggr_port_promisc(port, B_FALSE); 1140 aggr_port_stop(port); 1141 mac_perim_exit(pmph); 1142 } 1143 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1144 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1145 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1146 } 1147 } 1148 1149 mac_perim_exit(mph); 1150 AGGR_GRP_REFRELE(grp); 1151 return (rc); 1152 } 1153 1154 static int 1155 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1156 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1157 aggr_lacp_timer_t lacp_timer) 1158 { 1159 boolean_t mac_addr_changed = B_FALSE; 1160 boolean_t link_state_changed = B_FALSE; 1161 mac_perim_handle_t pmph; 1162 1163 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1164 1165 /* validate fixed address if specified */ 1166 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1167 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1168 (mac_addr[0] & 0x01))) { 1169 return (EINVAL); 1170 } 1171 1172 /* update policy if requested */ 1173 if (update_mask & AGGR_MODIFY_POLICY) 1174 aggr_send_update_policy(grp, policy); 1175 1176 /* update unicast MAC address if requested */ 1177 if (update_mask & AGGR_MODIFY_MAC) { 1178 if (mac_fixed) { 1179 /* user-supplied MAC address */ 1180 grp->lg_mac_addr_port = NULL; 1181 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1182 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1183 
mac_addr_changed = B_TRUE; 1184 } 1185 } else if (grp->lg_addr_fixed) { 1186 /* switch from user-supplied to automatic */ 1187 aggr_port_t *port = grp->lg_ports; 1188 1189 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1190 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1191 grp->lg_mac_addr_port = port; 1192 mac_addr_changed = B_TRUE; 1193 mac_perim_exit(pmph); 1194 } 1195 grp->lg_addr_fixed = mac_fixed; 1196 } 1197 1198 if (mac_addr_changed) 1199 link_state_changed = aggr_grp_update_ports_mac(grp); 1200 1201 if (update_mask & AGGR_MODIFY_LACP_MODE) 1202 aggr_lacp_update_mode(grp, lacp_mode); 1203 1204 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1205 aggr_lacp_update_timer(grp, lacp_timer); 1206 1207 if (link_state_changed) 1208 mac_link_update(grp->lg_mh, grp->lg_link_state); 1209 1210 if (mac_addr_changed) 1211 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1212 1213 return (0); 1214 } 1215 1216 /* 1217 * Update properties of an existing link aggregation group. 1218 */ 1219 int 1220 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1221 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1222 aggr_lacp_timer_t lacp_timer) 1223 { 1224 aggr_grp_t *grp = NULL; 1225 mac_perim_handle_t mph; 1226 int err; 1227 1228 /* get group corresponding to linkid */ 1229 rw_enter(&aggr_grp_lock, RW_READER); 1230 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1231 (mod_hash_val_t *)&grp) != 0) { 1232 rw_exit(&aggr_grp_lock); 1233 return (ENOENT); 1234 } 1235 AGGR_GRP_REFHOLD(grp); 1236 1237 /* 1238 * Hold the perimeter so that the aggregation won't be destroyed. 1239 */ 1240 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1241 rw_exit(&aggr_grp_lock); 1242 1243 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1244 mac_addr, lacp_mode, lacp_timer); 1245 1246 mac_perim_exit(mph); 1247 AGGR_GRP_REFRELE(grp); 1248 return (err); 1249 } 1250 1251 /* 1252 * Create a new link aggregation group upon request from administrator. 
1253 * Returns 0 on success, an errno on failure. 1254 */ 1255 int 1256 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1257 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1258 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1259 cred_t *credp) 1260 { 1261 aggr_grp_t *grp = NULL; 1262 aggr_port_t *port; 1263 mac_register_t *mac; 1264 boolean_t link_state_changed; 1265 mac_perim_handle_t mph; 1266 int err; 1267 int i; 1268 kt_did_t tid = 0; 1269 1270 /* need at least one port */ 1271 if (nports == 0) 1272 return (EINVAL); 1273 1274 rw_enter(&aggr_grp_lock, RW_WRITER); 1275 1276 /* does a group with the same linkid already exist? */ 1277 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1278 (mod_hash_val_t *)&grp); 1279 if (err == 0) { 1280 rw_exit(&aggr_grp_lock); 1281 return (EEXIST); 1282 } 1283 1284 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1285 1286 grp->lg_refs = 1; 1287 grp->lg_closing = B_FALSE; 1288 grp->lg_force = force; 1289 grp->lg_linkid = linkid; 1290 grp->lg_zoneid = crgetzoneid(credp); 1291 grp->lg_ifspeed = 0; 1292 grp->lg_link_state = LINK_STATE_UNKNOWN; 1293 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1294 grp->lg_started = B_FALSE; 1295 grp->lg_promisc = B_FALSE; 1296 grp->lg_lacp_done = B_FALSE; 1297 grp->lg_tx_notify_done = B_FALSE; 1298 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1299 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1300 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1301 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1302 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1303 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1304 MAX_RINGS_PER_GROUP), KM_SLEEP); 1305 grp->lg_tx_blocked_cnt = 0; 1306 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); 1307 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1308 aggr_lacp_init_grp(grp); 1309 1310 /* add MAC ports to group */ 1311 
grp->lg_ports = NULL; 1312 grp->lg_nports = 0; 1313 grp->lg_nattached_ports = 0; 1314 grp->lg_ntx_ports = 0; 1315 1316 /* 1317 * If key is not specified by the user, allocate the key. 1318 */ 1319 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1320 err = ENOMEM; 1321 goto bail; 1322 } 1323 grp->lg_key = key; 1324 1325 for (i = 0; i < nports; i++) { 1326 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); 1327 if (err != 0) 1328 goto bail; 1329 } 1330 1331 /* 1332 * If no explicit MAC address was specified by the administrator, 1333 * set it to the MAC address of the first port. 1334 */ 1335 grp->lg_addr_fixed = mac_fixed; 1336 if (grp->lg_addr_fixed) { 1337 /* validate specified address */ 1338 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1339 err = EINVAL; 1340 goto bail; 1341 } 1342 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1343 } else { 1344 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1345 grp->lg_mac_addr_port = grp->lg_ports; 1346 } 1347 1348 /* set the initial group capabilities */ 1349 aggr_grp_capab_set(grp); 1350 1351 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1352 err = ENOMEM; 1353 goto bail; 1354 } 1355 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1356 mac->m_driver = grp; 1357 mac->m_dip = aggr_dip; 1358 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? 
(uint_t)-1 : grp->lg_key; 1359 mac->m_src_addr = grp->lg_addr; 1360 mac->m_callbacks = &aggr_m_callbacks; 1361 mac->m_min_sdu = 0; 1362 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1363 mac->m_margin = aggr_grp_max_margin(grp); 1364 mac->m_v12n = MAC_VIRT_LEVEL1; 1365 err = mac_register(mac, &grp->lg_mh); 1366 mac_free(mac); 1367 if (err != 0) 1368 goto bail; 1369 1370 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1371 if (err != 0) { 1372 (void) mac_unregister(grp->lg_mh); 1373 grp->lg_mh = NULL; 1374 goto bail; 1375 } 1376 1377 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1378 1379 /* 1380 * Update the MAC address of the constituent ports. 1381 * None of the port is attached at this time, the link state of the 1382 * aggregation will not change. 1383 */ 1384 link_state_changed = aggr_grp_update_ports_mac(grp); 1385 ASSERT(!link_state_changed); 1386 1387 /* update outbound load balancing policy */ 1388 aggr_send_update_policy(grp, policy); 1389 1390 /* set LACP mode */ 1391 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1392 1393 /* 1394 * Attach each port if necessary. 1395 */ 1396 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1397 /* 1398 * Create the pseudo ring for each HW ring of the underlying 1399 * port. Note that this is done after the aggr registers the 1400 * mac. 1401 */ 1402 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); 1403 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); 1404 if (aggr_port_notify_link(grp, port)) 1405 link_state_changed = B_TRUE; 1406 1407 /* 1408 * Initialize the callback functions for this port. 
1409 */ 1410 aggr_port_init_callbacks(port); 1411 } 1412 1413 if (link_state_changed) 1414 mac_link_update(grp->lg_mh, grp->lg_link_state); 1415 1416 /* add new group to hash table */ 1417 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1418 (mod_hash_val_t)grp); 1419 ASSERT(err == 0); 1420 aggr_grp_cnt++; 1421 1422 mac_perim_exit(mph); 1423 rw_exit(&aggr_grp_lock); 1424 return (0); 1425 1426 bail: 1427 1428 grp->lg_closing = B_TRUE; 1429 1430 port = grp->lg_ports; 1431 while (port != NULL) { 1432 aggr_port_t *cport; 1433 1434 cport = port->lp_next; 1435 aggr_port_delete(port); 1436 port = cport; 1437 } 1438 1439 /* 1440 * Inform the lacp_rx thread to exit. 1441 */ 1442 mutex_enter(&grp->lg_lacp_lock); 1443 grp->lg_lacp_done = B_TRUE; 1444 cv_signal(&grp->lg_lacp_cv); 1445 while (grp->lg_lacp_rx_thread != NULL) 1446 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1447 mutex_exit(&grp->lg_lacp_lock); 1448 /* 1449 * Inform the tx_notify thread to exit. 1450 */ 1451 mutex_enter(&grp->lg_tx_flowctl_lock); 1452 if (grp->lg_tx_notify_thread != NULL) { 1453 tid = grp->lg_tx_notify_thread->t_did; 1454 grp->lg_tx_notify_done = B_TRUE; 1455 cv_signal(&grp->lg_tx_flowctl_cv); 1456 } 1457 mutex_exit(&grp->lg_tx_flowctl_lock); 1458 if (tid != 0) 1459 thread_join(tid); 1460 1461 kmem_free(grp->lg_tx_blocked_rings, 1462 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1463 rw_exit(&aggr_grp_lock); 1464 AGGR_GRP_REFRELE(grp); 1465 return (err); 1466 } 1467 1468 /* 1469 * Return a pointer to the member of a group with specified linkid. 1470 */ 1471 static aggr_port_t * 1472 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1473 { 1474 aggr_port_t *port; 1475 1476 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1477 1478 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1479 if (port->lp_linkid == linkid) 1480 break; 1481 } 1482 1483 return (port); 1484 } 1485 1486 /* 1487 * Stop, detach and remove a port from a link aggregation group. 
1488 */ 1489 static int 1490 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1491 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1492 { 1493 int rc = 0; 1494 aggr_port_t **pport; 1495 boolean_t mac_addr_changed = B_FALSE; 1496 boolean_t link_state_changed = B_FALSE; 1497 mac_perim_handle_t mph; 1498 uint64_t val; 1499 uint_t i; 1500 uint_t stat; 1501 1502 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1503 ASSERT(grp->lg_nports > 1); 1504 ASSERT(!grp->lg_closing); 1505 1506 /* unlink port */ 1507 for (pport = &grp->lg_ports; *pport != port; 1508 pport = &(*pport)->lp_next) { 1509 if (*pport == NULL) { 1510 rc = ENOENT; 1511 goto done; 1512 } 1513 } 1514 *pport = port->lp_next; 1515 1516 mac_perim_enter_by_mh(port->lp_mh, &mph); 1517 1518 /* 1519 * If the MAC address of the port being removed was assigned 1520 * to the group, update the group MAC address 1521 * using the MAC address of a different port. 1522 */ 1523 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1524 /* 1525 * Set the MAC address of the group to the 1526 * MAC address of its first port. 1527 */ 1528 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1529 grp->lg_mac_addr_port = grp->lg_ports; 1530 mac_addr_changed = B_TRUE; 1531 } 1532 1533 link_state_changed = aggr_grp_detach_port(grp, port); 1534 1535 /* 1536 * Add the counter statistics of the ports while it was aggregated 1537 * to the group's residual statistics. This is done by obtaining 1538 * the current counter from the underlying MAC then subtracting the 1539 * value of the counter at the moment it was added to the 1540 * aggregation. 
1541 */ 1542 for (i = 0; i < MAC_NSTAT; i++) { 1543 stat = i + MAC_STAT_MIN; 1544 if (!MAC_STAT_ISACOUNTER(stat)) 1545 continue; 1546 val = aggr_port_stat(port, stat); 1547 val -= port->lp_stat[i]; 1548 grp->lg_stat[i] += val; 1549 } 1550 for (i = 0; i < ETHER_NSTAT; i++) { 1551 stat = i + MACTYPE_STAT_MIN; 1552 if (!ETHER_STAT_ISACOUNTER(stat)) 1553 continue; 1554 val = aggr_port_stat(port, stat); 1555 val -= port->lp_ether_stat[i]; 1556 grp->lg_ether_stat[i] += val; 1557 } 1558 1559 grp->lg_nports--; 1560 mac_perim_exit(mph); 1561 1562 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1563 aggr_port_delete(port); 1564 1565 /* 1566 * If the group MAC address has changed, update the MAC address of 1567 * the remaining constituent ports according to the new MAC 1568 * address of the group. 1569 */ 1570 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1571 link_state_changed = B_TRUE; 1572 1573 done: 1574 if (mac_addr_changedp != NULL) 1575 *mac_addr_changedp = mac_addr_changed; 1576 if (link_state_changedp != NULL) 1577 *link_state_changedp = link_state_changed; 1578 1579 return (rc); 1580 } 1581 1582 /* 1583 * Remove one or more ports from an existing link aggregation group. 1584 */ 1585 int 1586 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1587 { 1588 int rc = 0, i; 1589 aggr_grp_t *grp = NULL; 1590 aggr_port_t *port; 1591 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1592 boolean_t link_state_update = B_FALSE, link_state_changed; 1593 mac_perim_handle_t mph, pmph; 1594 1595 /* get group corresponding to linkid */ 1596 rw_enter(&aggr_grp_lock, RW_READER); 1597 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1598 (mod_hash_val_t *)&grp) != 0) { 1599 rw_exit(&aggr_grp_lock); 1600 return (ENOENT); 1601 } 1602 AGGR_GRP_REFHOLD(grp); 1603 1604 /* 1605 * Hold the perimeter so that the aggregation won't be destroyed. 
1606 */ 1607 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1608 rw_exit(&aggr_grp_lock); 1609 1610 /* we need to keep at least one port per group */ 1611 if (nports >= grp->lg_nports) { 1612 rc = EINVAL; 1613 goto bail; 1614 } 1615 1616 /* first verify that all the groups are valid */ 1617 for (i = 0; i < nports; i++) { 1618 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1619 /* port not found */ 1620 rc = ENOENT; 1621 goto bail; 1622 } 1623 } 1624 1625 /* clear the promiscous mode for the specified ports */ 1626 for (i = 0; i < nports && rc == 0; i++) { 1627 /* lookup port */ 1628 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1629 ASSERT(port != NULL); 1630 1631 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1632 rc = aggr_port_promisc(port, B_FALSE); 1633 mac_perim_exit(pmph); 1634 } 1635 if (rc != 0) { 1636 for (i = 0; i < nports; i++) { 1637 port = aggr_grp_port_lookup(grp, 1638 ports[i].lp_linkid); 1639 ASSERT(port != NULL); 1640 1641 /* 1642 * Turn the promiscuous mode back on if it is required 1643 * to receive the non-primary address over a port, or 1644 * the promiscous mode is enabled over the aggr. 1645 */ 1646 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1647 if (port->lp_started && (grp->lg_promisc || 1648 port->lp_prom_addr != NULL)) { 1649 (void) aggr_port_promisc(port, B_TRUE); 1650 } 1651 mac_perim_exit(pmph); 1652 } 1653 goto bail; 1654 } 1655 1656 /* remove the specified ports from group */ 1657 for (i = 0; i < nports; i++) { 1658 /* lookup port */ 1659 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1660 ASSERT(port != NULL); 1661 1662 /* stop port if group has already been started */ 1663 if (grp->lg_started) { 1664 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1665 aggr_port_stop(port); 1666 mac_perim_exit(pmph); 1667 } 1668 1669 /* 1670 * aggr_rem_pseudo_tx_group() is not called here. Instead 1671 * it is called from inside aggr_grp_rem_port() after the 1672 * port has been detached. 
The reason is that 1673 * aggr_rem_pseudo_tx_group() removes one ring at a time 1674 * and if there is still traffic going on, then there 1675 * is the possibility of aggr_find_tx_ring() returning a 1676 * removed ring for transmission. Once the port has been 1677 * detached, that port will not be used and 1678 * aggr_find_tx_ring() will not return any rings 1679 * belonging to it. 1680 */ 1681 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1682 1683 /* remove port from group */ 1684 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1685 &link_state_changed); 1686 ASSERT(rc == 0); 1687 mac_addr_update = mac_addr_update || mac_addr_changed; 1688 link_state_update = link_state_update || link_state_changed; 1689 } 1690 1691 bail: 1692 if (mac_addr_update) 1693 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1694 if (link_state_update) 1695 mac_link_update(grp->lg_mh, grp->lg_link_state); 1696 1697 mac_perim_exit(mph); 1698 AGGR_GRP_REFRELE(grp); 1699 1700 return (rc); 1701 } 1702 1703 int 1704 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1705 { 1706 aggr_grp_t *grp = NULL; 1707 aggr_port_t *port, *cport; 1708 datalink_id_t tmpid; 1709 mod_hash_val_t val; 1710 mac_perim_handle_t mph, pmph; 1711 int err; 1712 kt_did_t tid = 0; 1713 1714 rw_enter(&aggr_grp_lock, RW_WRITER); 1715 1716 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1717 (mod_hash_val_t *)&grp) != 0) { 1718 rw_exit(&aggr_grp_lock); 1719 return (ENOENT); 1720 } 1721 1722 /* 1723 * Note that dls_devnet_destroy() must be called before lg_lock is 1724 * held. Otherwise, it will deadlock if another thread is in 1725 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1726 * dls_devnet_destroy() needs to delete. 1727 */ 1728 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1729 rw_exit(&aggr_grp_lock); 1730 return (err); 1731 } 1732 ASSERT(linkid == tmpid); 1733 1734 /* 1735 * Unregister from the MAC service module. 
Since this can 1736 * fail if a client hasn't closed the MAC port, we gracefully 1737 * fail the operation. 1738 */ 1739 if ((err = mac_disable(grp->lg_mh)) != 0) { 1740 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1741 rw_exit(&aggr_grp_lock); 1742 return (err); 1743 } 1744 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1745 ASSERT(grp == (aggr_grp_t *)val); 1746 1747 ASSERT(aggr_grp_cnt > 0); 1748 aggr_grp_cnt--; 1749 rw_exit(&aggr_grp_lock); 1750 1751 /* 1752 * Inform the lacp_rx thread to exit. 1753 */ 1754 mutex_enter(&grp->lg_lacp_lock); 1755 grp->lg_lacp_done = B_TRUE; 1756 cv_signal(&grp->lg_lacp_cv); 1757 while (grp->lg_lacp_rx_thread != NULL) 1758 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1759 mutex_exit(&grp->lg_lacp_lock); 1760 /* 1761 * Inform the tx_notify_thread to exit. 1762 */ 1763 mutex_enter(&grp->lg_tx_flowctl_lock); 1764 if (grp->lg_tx_notify_thread != NULL) { 1765 tid = grp->lg_tx_notify_thread->t_did; 1766 grp->lg_tx_notify_done = B_TRUE; 1767 cv_signal(&grp->lg_tx_flowctl_cv); 1768 } 1769 mutex_exit(&grp->lg_tx_flowctl_lock); 1770 if (tid != 0) 1771 thread_join(tid); 1772 1773 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1774 1775 grp->lg_closing = B_TRUE; 1776 /* detach and free MAC ports associated with group */ 1777 port = grp->lg_ports; 1778 while (port != NULL) { 1779 cport = port->lp_next; 1780 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1781 if (grp->lg_started) 1782 aggr_port_stop(port); 1783 (void) aggr_grp_detach_port(grp, port); 1784 mac_perim_exit(pmph); 1785 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1786 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1787 aggr_port_delete(port); 1788 port = cport; 1789 } 1790 1791 mac_perim_exit(mph); 1792 1793 kmem_free(grp->lg_tx_blocked_rings, 1794 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1795 /* 1796 * Wait for the port's lacp timer thread and its notification callback 1797 * to exit before calling mac_unregister() since both 
needs to access 1798 * the mac perimeter of the grp. 1799 */ 1800 aggr_grp_port_wait(grp); 1801 1802 VERIFY(mac_unregister(grp->lg_mh) == 0); 1803 grp->lg_mh = NULL; 1804 1805 AGGR_GRP_REFRELE(grp); 1806 return (0); 1807 } 1808 1809 void 1810 aggr_grp_free(aggr_grp_t *grp) 1811 { 1812 ASSERT(grp->lg_refs == 0); 1813 ASSERT(grp->lg_port_ref == 0); 1814 if (grp->lg_key > AGGR_MAX_KEY) { 1815 id_free(key_ids, grp->lg_key); 1816 grp->lg_key = 0; 1817 } 1818 kmem_cache_free(aggr_grp_cache, grp); 1819 } 1820 1821 int 1822 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1823 aggr_grp_info_new_grp_fn_t new_grp_fn, 1824 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1825 { 1826 aggr_grp_t *grp; 1827 aggr_port_t *port; 1828 mac_perim_handle_t mph, pmph; 1829 int rc = 0; 1830 1831 /* 1832 * Make sure that the aggregation link is visible from the caller's 1833 * zone. 1834 */ 1835 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 1836 return (ENOENT); 1837 1838 rw_enter(&aggr_grp_lock, RW_READER); 1839 1840 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1841 (mod_hash_val_t *)&grp) != 0) { 1842 rw_exit(&aggr_grp_lock); 1843 return (ENOENT); 1844 } 1845 AGGR_GRP_REFHOLD(grp); 1846 1847 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1848 rw_exit(&aggr_grp_lock); 1849 1850 rc = new_grp_fn(fn_arg, grp->lg_linkid, 1851 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 1852 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 1853 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 1854 1855 if (rc != 0) 1856 goto bail; 1857 1858 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1859 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1860 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 1861 port->lp_state, &port->lp_lacp.ActorOperPortState); 1862 mac_perim_exit(pmph); 1863 1864 if (rc != 0) 1865 goto bail; 1866 } 1867 1868 bail: 1869 mac_perim_exit(mph); 1870 AGGR_GRP_REFRELE(grp); 1871 return (rc); 1872 } 1873 1874 /*ARGSUSED*/ 1875 static void 1876 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 1877 { 1878 miocnak(q, mp, 0, ENOTSUP); 1879 } 1880 1881 static int 1882 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 1883 { 1884 aggr_port_t *port; 1885 uint_t stat_index; 1886 1887 /* We only aggregate counter statistics. */ 1888 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 1889 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 1890 return (ENOTSUP); 1891 } 1892 1893 /* 1894 * Counter statistics for a group are computed by aggregating the 1895 * counters of the members MACs while they were aggregated, plus 1896 * the residual counter of the group itself, which is updated each 1897 * time a MAC is removed from the group. 1898 */ 1899 *val = 0; 1900 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1901 /* actual port statistic */ 1902 *val += aggr_port_stat(port, stat); 1903 /* 1904 * minus the port stat when it was added, plus any residual 1905 * amount for the group. 
1906 */ 1907 if (IS_MAC_STAT(stat)) { 1908 stat_index = stat - MAC_STAT_MIN; 1909 *val -= port->lp_stat[stat_index]; 1910 *val += grp->lg_stat[stat_index]; 1911 } else if (IS_MACTYPE_STAT(stat)) { 1912 stat_index = stat - MACTYPE_STAT_MIN; 1913 *val -= port->lp_ether_stat[stat_index]; 1914 *val += grp->lg_ether_stat[stat_index]; 1915 } 1916 } 1917 return (0); 1918 } 1919 1920 int 1921 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1922 { 1923 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 1924 1925 if (rx_ring->arr_hw_rh != NULL) { 1926 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 1927 } else { 1928 aggr_port_t *port = rx_ring->arr_port; 1929 1930 *val = mac_stat_get(port->lp_mh, stat); 1931 1932 } 1933 return (0); 1934 } 1935 1936 int 1937 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1938 { 1939 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 1940 1941 if (tx_ring->atr_hw_rh != NULL) { 1942 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 1943 } else { 1944 aggr_port_t *port = tx_ring->atr_port; 1945 1946 *val = mac_stat_get(port->lp_mh, stat); 1947 } 1948 return (0); 1949 } 1950 1951 static int 1952 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 1953 { 1954 aggr_grp_t *grp = arg; 1955 mac_perim_handle_t mph; 1956 int rval = 0; 1957 1958 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1959 1960 switch (stat) { 1961 case MAC_STAT_IFSPEED: 1962 *val = grp->lg_ifspeed; 1963 break; 1964 1965 case ETHER_STAT_LINK_DUPLEX: 1966 *val = grp->lg_link_duplex; 1967 break; 1968 1969 default: 1970 /* 1971 * For all other statistics, we return the aggregated stat 1972 * from the underlying ports. aggr_grp_stat() will set 1973 * rval appropriately if the statistic isn't a counter. 
1974 */ 1975 rval = aggr_grp_stat(grp, stat, val); 1976 } 1977 1978 mac_perim_exit(mph); 1979 return (rval); 1980 } 1981 1982 static int 1983 aggr_m_start(void *arg) 1984 { 1985 aggr_grp_t *grp = arg; 1986 aggr_port_t *port; 1987 mac_perim_handle_t mph, pmph; 1988 1989 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1990 1991 /* 1992 * Attempts to start all configured members of the group. 1993 * Group members will be attached when their link-up notification 1994 * is received. 1995 */ 1996 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1997 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1998 if (aggr_port_start(port) != 0) { 1999 mac_perim_exit(pmph); 2000 continue; 2001 } 2002 2003 /* 2004 * Turn on the promiscuous mode if it is required to receive 2005 * the non-primary address over a port, or the promiscous 2006 * mode is enabled over the aggr. 2007 */ 2008 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2009 if (aggr_port_promisc(port, B_TRUE) != 0) 2010 aggr_port_stop(port); 2011 } 2012 mac_perim_exit(pmph); 2013 } 2014 2015 grp->lg_started = B_TRUE; 2016 2017 mac_perim_exit(mph); 2018 return (0); 2019 } 2020 2021 static void 2022 aggr_m_stop(void *arg) 2023 { 2024 aggr_grp_t *grp = arg; 2025 aggr_port_t *port; 2026 mac_perim_handle_t mph, pmph; 2027 2028 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2029 2030 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2031 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2032 2033 /* reset port promiscuous mode */ 2034 (void) aggr_port_promisc(port, B_FALSE); 2035 2036 aggr_port_stop(port); 2037 mac_perim_exit(pmph); 2038 } 2039 2040 grp->lg_started = B_FALSE; 2041 mac_perim_exit(mph); 2042 } 2043 2044 static int 2045 aggr_m_promisc(void *arg, boolean_t on) 2046 { 2047 aggr_grp_t *grp = arg; 2048 aggr_port_t *port; 2049 boolean_t link_state_changed = B_FALSE; 2050 mac_perim_handle_t mph, pmph; 2051 2052 AGGR_GRP_REFHOLD(grp); 2053 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2054 2055 
ASSERT(!grp->lg_closing); 2056 2057 if (on == grp->lg_promisc) 2058 goto bail; 2059 2060 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2061 int err = 0; 2062 2063 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2064 AGGR_PORT_REFHOLD(port); 2065 if (!on && (port->lp_prom_addr == NULL)) 2066 err = aggr_port_promisc(port, B_FALSE); 2067 else if (on && port->lp_started) 2068 err = aggr_port_promisc(port, B_TRUE); 2069 2070 if (err != 0) { 2071 if (aggr_grp_detach_port(grp, port)) 2072 link_state_changed = B_TRUE; 2073 } else { 2074 /* 2075 * If a port was detached because of a previous 2076 * failure changing the promiscuity, the port 2077 * is reattached when it successfully changes 2078 * the promiscuity now, and this might cause 2079 * the link state of the aggregation to change. 2080 */ 2081 if (aggr_grp_attach_port(grp, port)) 2082 link_state_changed = B_TRUE; 2083 } 2084 mac_perim_exit(pmph); 2085 AGGR_PORT_REFRELE(port); 2086 } 2087 2088 grp->lg_promisc = on; 2089 2090 if (link_state_changed) 2091 mac_link_update(grp->lg_mh, grp->lg_link_state); 2092 2093 bail: 2094 mac_perim_exit(mph); 2095 AGGR_GRP_REFRELE(grp); 2096 2097 return (0); 2098 } 2099 2100 static void 2101 aggr_grp_port_rename(const char *new_name, void *arg) 2102 { 2103 /* 2104 * aggr port's mac client name is the format of "aggr link name" plus 2105 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2106 */ 2107 int aggr_len, link_len, clnt_name_len, i; 2108 char *str_end, *str_st, *str_del; 2109 char aggr_name[MAXNAMELEN]; 2110 char link_name[MAXNAMELEN]; 2111 char *clnt_name; 2112 aggr_grp_t *aggr_grp = arg; 2113 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2114 2115 for (i = 0; i < aggr_grp->lg_nports; i++) { 2116 clnt_name = mac_client_name(aggr_port->lp_mch); 2117 clnt_name_len = strlen(clnt_name); 2118 str_st = clnt_name; 2119 str_end = &(clnt_name[clnt_name_len]); 2120 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2121 ASSERT(str_del != NULL); 2122 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2123 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2124 bzero(aggr_name, MAXNAMELEN); 2125 bzero(link_name, MAXNAMELEN); 2126 bcopy(clnt_name, aggr_name, aggr_len); 2127 bcopy(str_del, link_name, link_len + 1); 2128 bzero(clnt_name, MAXNAMELEN); 2129 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2130 link_name); 2131 2132 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2133 aggr_port = aggr_port->lp_next; 2134 } 2135 } 2136 2137 /* 2138 * Initialize the capabilities that are advertised for the group 2139 * according to the capabilities of the constituent ports. 
2140 */ 2141 static boolean_t 2142 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2143 { 2144 aggr_grp_t *grp = arg; 2145 2146 switch (cap) { 2147 case MAC_CAPAB_HCKSUM: { 2148 uint32_t *hcksum_txflags = cap_data; 2149 *hcksum_txflags = grp->lg_hcksum_txflags; 2150 break; 2151 } 2152 case MAC_CAPAB_LSO: { 2153 mac_capab_lso_t *cap_lso = cap_data; 2154 2155 if (grp->lg_lso) { 2156 *cap_lso = grp->lg_cap_lso; 2157 break; 2158 } else { 2159 return (B_FALSE); 2160 } 2161 } 2162 case MAC_CAPAB_NO_NATIVEVLAN: 2163 return (!grp->lg_vlan); 2164 case MAC_CAPAB_NO_ZCOPY: 2165 return (!grp->lg_zcopy); 2166 case MAC_CAPAB_RINGS: { 2167 mac_capab_rings_t *cap_rings = cap_data; 2168 2169 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2170 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2171 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; 2172 2173 /* 2174 * An aggregation advertises only one (pseudo) RX 2175 * group, which virtualizes the main/primary group of 2176 * the underlying devices. 2177 */ 2178 cap_rings->mr_gnum = 1; 2179 cap_rings->mr_gaddring = NULL; 2180 cap_rings->mr_gremring = NULL; 2181 } else { 2182 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2183 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2184 cap_rings->mr_gnum = 0; 2185 } 2186 cap_rings->mr_rget = aggr_fill_ring; 2187 cap_rings->mr_gget = aggr_fill_group; 2188 break; 2189 } 2190 case MAC_CAPAB_AGGR: 2191 { 2192 mac_capab_aggr_t *aggr_cap; 2193 2194 if (cap_data != NULL) { 2195 aggr_cap = cap_data; 2196 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2197 aggr_cap->mca_unicst = aggr_m_unicst; 2198 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2199 aggr_cap->mca_arg = arg; 2200 } 2201 return (B_TRUE); 2202 } 2203 default: 2204 return (B_FALSE); 2205 } 2206 return (B_TRUE); 2207 } 2208 2209 /* 2210 * Callback funtion for MAC layer to register groups. 
2211 */ 2212 static void 2213 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2214 mac_group_info_t *infop, mac_group_handle_t gh) 2215 { 2216 aggr_grp_t *grp = arg; 2217 aggr_pseudo_rx_group_t *rx_group; 2218 aggr_pseudo_tx_group_t *tx_group; 2219 2220 ASSERT(index == 0); 2221 if (rtype == MAC_RING_TYPE_RX) { 2222 rx_group = &grp->lg_rx_group; 2223 rx_group->arg_gh = gh; 2224 rx_group->arg_grp = grp; 2225 2226 infop->mgi_driver = (mac_group_driver_t)rx_group; 2227 infop->mgi_start = NULL; 2228 infop->mgi_stop = NULL; 2229 infop->mgi_addmac = aggr_addmac; 2230 infop->mgi_remmac = aggr_remmac; 2231 infop->mgi_count = rx_group->arg_ring_cnt; 2232 } else { 2233 tx_group = &grp->lg_tx_group; 2234 tx_group->atg_gh = gh; 2235 } 2236 } 2237 2238 /* 2239 * Callback funtion for MAC layer to register all rings. 2240 */ 2241 static void 2242 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2243 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2244 { 2245 aggr_grp_t *grp = arg; 2246 2247 switch (rtype) { 2248 case MAC_RING_TYPE_RX: { 2249 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; 2250 aggr_pseudo_rx_ring_t *rx_ring; 2251 mac_intr_t aggr_mac_intr; 2252 2253 ASSERT(rg_index == 0); 2254 2255 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); 2256 rx_ring = rx_group->arg_rings + index; 2257 rx_ring->arr_rh = rh; 2258 2259 /* 2260 * Entrypoint to enable interrupt (disable poll) and 2261 * disable interrupt (enable poll). 
2262 */ 2263 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2264 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2265 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2266 aggr_mac_intr.mi_ddi_handle = NULL; 2267 2268 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2269 infop->mri_start = aggr_pseudo_start_ring; 2270 infop->mri_stop = NULL; 2271 2272 infop->mri_intr = aggr_mac_intr; 2273 infop->mri_poll = aggr_rx_poll; 2274 2275 infop->mri_stat = aggr_rx_ring_stat; 2276 break; 2277 } 2278 case MAC_RING_TYPE_TX: { 2279 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2280 aggr_pseudo_tx_ring_t *tx_ring; 2281 2282 ASSERT(rg_index == -1); 2283 ASSERT(index < tx_group->atg_ring_cnt); 2284 2285 tx_ring = &tx_group->atg_rings[index]; 2286 tx_ring->atr_rh = rh; 2287 2288 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2289 infop->mri_start = NULL; 2290 infop->mri_stop = NULL; 2291 infop->mri_tx = aggr_ring_tx; 2292 infop->mri_stat = aggr_tx_ring_stat; 2293 /* 2294 * Use the hw TX ring handle to find if the ring needs 2295 * serialization or not. For NICs that do not expose 2296 * Tx rings, atr_hw_rh will be NULL. 
2297 */ 2298 if (tx_ring->atr_hw_rh != NULL) { 2299 infop->mri_flags = 2300 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2301 } 2302 break; 2303 } 2304 default: 2305 break; 2306 } 2307 } 2308 2309 static mblk_t * 2310 aggr_rx_poll(void *arg, int bytes_to_pickup) 2311 { 2312 aggr_pseudo_rx_ring_t *rr_ring = arg; 2313 aggr_port_t *port = rr_ring->arr_port; 2314 aggr_grp_t *grp = port->lp_grp; 2315 mblk_t *mp_chain, *mp, **mpp; 2316 2317 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2318 2319 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2320 return (mp_chain); 2321 2322 mpp = &mp_chain; 2323 while ((mp = *mpp) != NULL) { 2324 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2325 struct ether_header *ehp; 2326 2327 ehp = (struct ether_header *)mp->b_rptr; 2328 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2329 *mpp = mp->b_next; 2330 mp->b_next = NULL; 2331 aggr_recv_lacp(port, 2332 (mac_resource_handle_t)rr_ring, mp); 2333 continue; 2334 } 2335 } 2336 2337 if (!port->lp_collector_enabled) { 2338 *mpp = mp->b_next; 2339 mp->b_next = NULL; 2340 freemsg(mp); 2341 continue; 2342 } 2343 mpp = &mp->b_next; 2344 } 2345 return (mp_chain); 2346 } 2347 2348 static int 2349 aggr_addmac(void *arg, const uint8_t *mac_addr) 2350 { 2351 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2352 aggr_unicst_addr_t *addr, **pprev; 2353 aggr_grp_t *grp = rx_group->arg_grp; 2354 aggr_port_t *port, *p; 2355 mac_perim_handle_t mph; 2356 int err = 0; 2357 2358 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2359 2360 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2361 mac_perim_exit(mph); 2362 return (0); 2363 } 2364 2365 /* 2366 * Insert this mac address into the list of mac addresses owned by 2367 * the aggregation pseudo group. 
2368 */ 2369 pprev = &rx_group->arg_macaddr; 2370 while ((addr = *pprev) != NULL) { 2371 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2372 mac_perim_exit(mph); 2373 return (EEXIST); 2374 } 2375 pprev = &addr->aua_next; 2376 } 2377 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2378 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2379 addr->aua_next = NULL; 2380 *pprev = addr; 2381 2382 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2383 if ((err = aggr_port_addmac(port, mac_addr)) != 0) 2384 break; 2385 2386 if (err != 0) { 2387 for (p = grp->lg_ports; p != port; p = p->lp_next) 2388 aggr_port_remmac(p, mac_addr); 2389 2390 *pprev = NULL; 2391 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2392 } 2393 2394 mac_perim_exit(mph); 2395 return (err); 2396 } 2397 2398 static int 2399 aggr_remmac(void *arg, const uint8_t *mac_addr) 2400 { 2401 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2402 aggr_unicst_addr_t *addr, **pprev; 2403 aggr_grp_t *grp = rx_group->arg_grp; 2404 aggr_port_t *port; 2405 mac_perim_handle_t mph; 2406 int err = 0; 2407 2408 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2409 2410 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2411 mac_perim_exit(mph); 2412 return (0); 2413 } 2414 2415 /* 2416 * Insert this mac address into the list of mac addresses owned by 2417 * the aggregation pseudo group. 
2418 */ 2419 pprev = &rx_group->arg_macaddr; 2420 while ((addr = *pprev) != NULL) { 2421 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2422 pprev = &addr->aua_next; 2423 continue; 2424 } 2425 break; 2426 } 2427 if (addr == NULL) { 2428 mac_perim_exit(mph); 2429 return (EINVAL); 2430 } 2431 2432 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2433 aggr_port_remmac(port, mac_addr); 2434 2435 *pprev = addr->aua_next; 2436 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2437 2438 mac_perim_exit(mph); 2439 return (err); 2440 } 2441 2442 /* 2443 * Add or remove the multicast addresses that are defined for the group 2444 * to or from the specified port. 2445 * 2446 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2447 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2448 * called when the port is either stopped or detached. 2449 */ 2450 void 2451 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2452 { 2453 aggr_grp_t *grp = port->lp_grp; 2454 2455 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2456 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2457 2458 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2459 return; 2460 2461 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2462 } 2463 2464 static int 2465 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2466 { 2467 aggr_grp_t *grp = arg; 2468 aggr_port_t *port = NULL, *errport = NULL; 2469 mac_perim_handle_t mph; 2470 int err = 0; 2471 2472 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2473 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2474 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2475 !port->lp_started) { 2476 continue; 2477 } 2478 err = aggr_port_multicst(port, add, addrp); 2479 if (err != 0) { 2480 errport = port; 2481 break; 2482 } 2483 } 2484 2485 /* 2486 * At least one port caused error return and this error is returned to 2487 * mac, eventually a NAK would be sent upwards. 
2488 * Some ports have this multicast address listed now, and some don't. 2489 * Treat this error as a whole aggr failure not individual port failure. 2490 * Therefore remove this multicast address from other ports. 2491 */ 2492 if ((err != 0) && add) { 2493 for (port = grp->lg_ports; port != errport; 2494 port = port->lp_next) { 2495 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2496 !port->lp_started) { 2497 continue; 2498 } 2499 (void) aggr_port_multicst(port, B_FALSE, addrp); 2500 } 2501 } 2502 mac_perim_exit(mph); 2503 return (err); 2504 } 2505 2506 static int 2507 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2508 { 2509 aggr_grp_t *grp = arg; 2510 mac_perim_handle_t mph; 2511 int err; 2512 2513 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2514 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2515 0, 0); 2516 mac_perim_exit(mph); 2517 return (err); 2518 } 2519 2520 /* 2521 * Initialize the capabilities that are advertised for the group 2522 * according to the capabilities of the constituent ports. 
2523 */ 2524 static void 2525 aggr_grp_capab_set(aggr_grp_t *grp) 2526 { 2527 uint32_t cksum; 2528 aggr_port_t *port; 2529 mac_capab_lso_t cap_lso; 2530 2531 ASSERT(grp->lg_mh == NULL); 2532 ASSERT(grp->lg_ports != NULL); 2533 2534 grp->lg_hcksum_txflags = (uint32_t)-1; 2535 grp->lg_zcopy = B_TRUE; 2536 grp->lg_vlan = B_TRUE; 2537 2538 grp->lg_lso = B_TRUE; 2539 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2540 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2541 2542 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2543 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2544 cksum = 0; 2545 grp->lg_hcksum_txflags &= cksum; 2546 2547 grp->lg_vlan &= 2548 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2549 2550 grp->lg_zcopy &= 2551 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2552 2553 grp->lg_lso &= 2554 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2555 if (grp->lg_lso) { 2556 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2557 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2558 cap_lso.lso_basic_tcp_ipv4.lso_max) 2559 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2560 cap_lso.lso_basic_tcp_ipv4.lso_max; 2561 } 2562 } 2563 } 2564 2565 /* 2566 * Checks whether the capabilities of the port being added are compatible 2567 * with the current capabilities of the aggregation. 
2568 */ 2569 static boolean_t 2570 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2571 { 2572 uint32_t hcksum_txflags; 2573 2574 ASSERT(grp->lg_ports != NULL); 2575 2576 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2577 grp->lg_vlan) != grp->lg_vlan) { 2578 return (B_FALSE); 2579 } 2580 2581 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2582 grp->lg_zcopy) != grp->lg_zcopy) { 2583 return (B_FALSE); 2584 } 2585 2586 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2587 if (grp->lg_hcksum_txflags != 0) 2588 return (B_FALSE); 2589 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2590 grp->lg_hcksum_txflags) { 2591 return (B_FALSE); 2592 } 2593 2594 if (grp->lg_lso) { 2595 mac_capab_lso_t cap_lso; 2596 2597 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2598 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2599 grp->lg_cap_lso.lso_flags) 2600 return (B_FALSE); 2601 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2602 cap_lso.lso_basic_tcp_ipv4.lso_max) 2603 return (B_FALSE); 2604 } else { 2605 return (B_FALSE); 2606 } 2607 } 2608 2609 return (B_TRUE); 2610 } 2611 2612 /* 2613 * Returns the maximum SDU according to the SDU of the constituent ports. 2614 */ 2615 static uint_t 2616 aggr_grp_max_sdu(aggr_grp_t *grp) 2617 { 2618 uint_t max_sdu = (uint_t)-1; 2619 aggr_port_t *port; 2620 2621 ASSERT(grp->lg_ports != NULL); 2622 2623 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2624 uint_t port_sdu_max; 2625 2626 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2627 if (max_sdu > port_sdu_max) 2628 max_sdu = port_sdu_max; 2629 } 2630 2631 return (max_sdu); 2632 } 2633 2634 /* 2635 * Checks if the maximum SDU of the specified port is compatible 2636 * with the maximum SDU of the specified aggregation group, returns 2637 * B_TRUE if it is, B_FALSE otherwise. 
2638 */ 2639 static boolean_t 2640 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2641 { 2642 uint_t port_sdu_max; 2643 2644 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2645 return (port_sdu_max >= grp->lg_max_sdu); 2646 } 2647 2648 /* 2649 * Returns the maximum margin according to the margin of the constituent ports. 2650 */ 2651 static uint32_t 2652 aggr_grp_max_margin(aggr_grp_t *grp) 2653 { 2654 uint32_t margin = UINT32_MAX; 2655 aggr_port_t *port; 2656 2657 ASSERT(grp->lg_mh == NULL); 2658 ASSERT(grp->lg_ports != NULL); 2659 2660 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2661 if (margin > port->lp_margin) 2662 margin = port->lp_margin; 2663 } 2664 2665 grp->lg_margin = margin; 2666 return (margin); 2667 } 2668 2669 /* 2670 * Checks if the maximum margin of the specified port is compatible 2671 * with the maximum margin of the specified aggregation group, returns 2672 * B_TRUE if it is, B_FALSE otherwise. 2673 */ 2674 static boolean_t 2675 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 2676 { 2677 if (port->lp_margin >= grp->lg_margin) 2678 return (B_TRUE); 2679 2680 /* 2681 * See whether the current margin value is allowed to be changed to 2682 * the new value. 
2683 */ 2684 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 2685 return (B_FALSE); 2686 2687 grp->lg_margin = port->lp_margin; 2688 return (B_TRUE); 2689 } 2690 2691 /* 2692 * Set MTU on individual ports of an aggregation group 2693 */ 2694 static int 2695 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 2696 uint32_t *old_mtu) 2697 { 2698 boolean_t removed = B_FALSE; 2699 mac_perim_handle_t mph; 2700 mac_diag_t diag; 2701 int err, rv, retry = 0; 2702 2703 if (port->lp_mah != NULL) { 2704 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 2705 port->lp_mah = NULL; 2706 removed = B_TRUE; 2707 } 2708 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 2709 try_again: 2710 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 2711 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 2712 &port->lp_mah, 0, &diag)) != 0) { 2713 /* 2714 * following is a workaround for a bug in 'bge' driver. 2715 * See CR 6794654 for more information and this work around 2716 * will be removed once the CR is fixed. 2717 */ 2718 if (rv == EIO && retry++ < 3) { 2719 delay(2 * hz); 2720 goto try_again; 2721 } 2722 /* 2723 * if mac_unicast_add() failed while setting the MTU, 2724 * detach the port from the group. 2725 */ 2726 mac_perim_enter_by_mh(port->lp_mh, &mph); 2727 (void) aggr_grp_detach_port(grp, port); 2728 mac_perim_exit(mph); 2729 cmn_err(CE_WARN, "Unable to restart the port %s while " 2730 "setting MTU. 
Detaching the port from the aggregation.", 2731 mac_client_name(port->lp_mch)); 2732 } 2733 return (err); 2734 } 2735 2736 static int 2737 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 2738 { 2739 int err = 0, i, rv; 2740 aggr_port_t *port; 2741 uint32_t *mtu; 2742 2743 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2744 2745 /* 2746 * If the MTU being set is equal to aggr group's maximum 2747 * allowable value, then there is nothing to change 2748 */ 2749 if (sdu == grp->lg_max_sdu) 2750 return (0); 2751 2752 /* 0 is aggr group's min sdu */ 2753 if (sdu == 0) 2754 return (EINVAL); 2755 2756 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 2757 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 2758 port = port->lp_next, i++) { 2759 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 2760 } 2761 if (err != 0) { 2762 /* recover from error: reset the mtus of the ports */ 2763 aggr_port_t *tmp; 2764 2765 for (tmp = grp->lg_ports, i = 0; tmp != port; 2766 tmp = tmp->lp_next, i++) { 2767 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 2768 } 2769 goto bail; 2770 } 2771 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 2772 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 2773 ASSERT(rv == 0); 2774 bail: 2775 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 2776 return (err); 2777 } 2778 2779 /* 2780 * Callback functions for set/get of properties 2781 */ 2782 /*ARGSUSED*/ 2783 static int 2784 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 2785 uint_t pr_valsize, const void *pr_val) 2786 { 2787 int err = ENOTSUP; 2788 aggr_grp_t *grp = m_driver; 2789 2790 switch (pr_num) { 2791 case MAC_PROP_MTU: { 2792 uint32_t mtu; 2793 2794 if (pr_valsize < sizeof (mtu)) { 2795 err = EINVAL; 2796 break; 2797 } 2798 bcopy(pr_val, &mtu, sizeof (mtu)); 2799 err = aggr_sdu_update(grp, mtu); 2800 break; 2801 } 2802 default: 2803 break; 2804 } 2805 return (err); 2806 } 2807 2808 typedef struct rboundary { 2809 uint32_t bval; 2810 int btype; 2811 } 
rboundary_t; 2812 2813 /* 2814 * This function finds the intersection of mtu ranges stored in arrays - 2815 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 2816 * Individual arrays are assumed to contain non-overlapping ranges. 2817 * Algorithm: 2818 * A range has two boundaries - min and max. We scan all arrays and store 2819 * each boundary as a separate element in a temporary array. We also store 2820 * the boundary types, min or max, as +1 or -1 respectively in the temporary 2821 * array. Then we sort the temporary array in ascending order. We scan the 2822 * sorted array from lower to higher values and keep a cumulative sum of 2823 * boundary types. Element in the temporary array for which the sum reaches 2824 * mcount is a min boundary of a range in the result and next element will be 2825 * max boundary. 2826 * 2827 * Example for mcount = 3, 2828 * 2829 * ----|_________|-------|_______|----|__|------ mrange[0] 2830 * 2831 * -------|________|--|____________|-----|___|-- mrange[1] 2832 * 2833 * --------|________________|-------|____|------ mrange[2] 2834 * 2835 * 3 2 1 2836 * \|/ 2837 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 2838 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 2839 * 2840 * same min and max 2841 * V 2842 * --------|_____|-------|__|------------|------ intersecting ranges 2843 */ 2844 void 2845 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 2846 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 2847 { 2848 mac_propval_uint32_range_t *rval, *ur; 2849 int rmaxcnt, rcount; 2850 size_t sz_range32; 2851 rboundary_t *ta; /* temporary array */ 2852 rboundary_t temp; 2853 boolean_t range_started = B_FALSE; 2854 int i, j, m, sum; 2855 2856 sz_range32 = sizeof (mac_propval_uint32_range_t); 2857 2858 for (i = 0, rmaxcnt = 0; i < mcount; i++) 2859 rmaxcnt += mrange[i]->mpr_count; 2860 2861 /* Allocate enough space to store the results */ 2862 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 2863 2864 /* Number of boundaries are twice as many as ranges */ 2865 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 2866 2867 for (i = 0, m = 0; i < mcount; i++) { 2868 ur = &(mrange[i]->mpr_range_uint32[0]); 2869 for (j = 0; j < mrange[i]->mpr_count; j++) { 2870 ta[m].bval = ur[j].mpur_min; 2871 ta[m++].btype = 1; 2872 ta[m].bval = ur[j].mpur_max; 2873 ta[m++].btype = -1; 2874 } 2875 } 2876 2877 /* 2878 * Sort the temporary array in ascending order of bval; 2879 * if boundary values are same then sort on btype. 2880 */ 2881 for (i = 0; i < m-1; i++) { 2882 for (j = i+1; j < m; j++) { 2883 if ((ta[i].bval > ta[j].bval) || 2884 ((ta[i].bval == ta[j].bval) && 2885 (ta[i].btype < ta[j].btype))) { 2886 temp = ta[i]; 2887 ta[i] = ta[j]; 2888 ta[j] = temp; 2889 } 2890 } 2891 } 2892 2893 /* Walk through temporary array to find all ranges in the results */ 2894 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 2895 sum += ta[i].btype; 2896 if (sum == mcount) { 2897 rval[rcount].mpur_min = ta[i].bval; 2898 range_started = B_TRUE; 2899 } else if (sum < mcount && range_started) { 2900 rval[rcount++].mpur_max = ta[i].bval; 2901 range_started = B_FALSE; 2902 } 2903 } 2904 2905 *prval = rval; 2906 *prmaxcnt = rmaxcnt; 2907 *prcount = rcount; 2908 2909 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 2910 } 2911 2912 /* 2913 * Returns the mtu ranges which could be supported by aggr group. 2914 * prmaxcnt returns the size of the buffer prval, prcount returns 2915 * the number of valid entries in prval. Caller is responsible 2916 * for freeing up prval. 
2917 */ 2918 int 2919 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 2920 int *prmaxcnt, int *prcount) 2921 { 2922 mac_propval_range_t **vals; 2923 aggr_port_t *port; 2924 mac_perim_handle_t mph; 2925 uint_t i, numr; 2926 int err = 0; 2927 size_t sz_propval, sz_range32; 2928 size_t size; 2929 2930 sz_propval = sizeof (mac_propval_range_t); 2931 sz_range32 = sizeof (mac_propval_uint32_range_t); 2932 2933 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2934 2935 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 2936 KM_SLEEP); 2937 2938 for (port = grp->lg_ports, i = 0; port != NULL; 2939 port = port->lp_next, i++) { 2940 2941 size = sz_propval; 2942 vals[i] = kmem_alloc(size, KM_SLEEP); 2943 vals[i]->mpr_count = 1; 2944 2945 mac_perim_enter_by_mh(port->lp_mh, &mph); 2946 2947 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2948 NULL, 0, vals[i], NULL); 2949 if (err == ENOSPC) { 2950 /* 2951 * Not enough space to hold all ranges. 2952 * Allocate extra space as indicated and retry. 
2953 */ 2954 numr = vals[i]->mpr_count; 2955 kmem_free(vals[i], sz_propval); 2956 size = sz_propval + (numr - 1) * sz_range32; 2957 vals[i] = kmem_alloc(size, KM_SLEEP); 2958 vals[i]->mpr_count = numr; 2959 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2960 NULL, 0, vals[i], NULL); 2961 ASSERT(err != ENOSPC); 2962 } 2963 mac_perim_exit(mph); 2964 if (err != 0) { 2965 kmem_free(vals[i], size); 2966 vals[i] = NULL; 2967 break; 2968 } 2969 } 2970 2971 /* 2972 * if any of the underlying ports does not support changing MTU then 2973 * just return ENOTSUP 2974 */ 2975 if (port != NULL) { 2976 ASSERT(err != 0); 2977 goto done; 2978 } 2979 2980 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 2981 prcount); 2982 2983 done: 2984 for (i = 0; i < grp->lg_nports; i++) { 2985 if (vals[i] != NULL) { 2986 numr = vals[i]->mpr_count; 2987 size = sz_propval + (numr - 1) * sz_range32; 2988 kmem_free(vals[i], size); 2989 } 2990 } 2991 2992 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 2993 return (err); 2994 } 2995 2996 static void 2997 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 2998 mac_prop_info_handle_t prh) 2999 { 3000 aggr_grp_t *grp = m_driver; 3001 mac_propval_uint32_range_t *rval = NULL; 3002 int i, rcount, rmaxcnt; 3003 int err = 0; 3004 3005 _NOTE(ARGUNUSED(pr_name)); 3006 3007 switch (pr_num) { 3008 case MAC_PROP_MTU: 3009 3010 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3011 &rcount); 3012 if (err != 0) { 3013 ASSERT(rval == NULL); 3014 return; 3015 } 3016 for (i = 0; i < rcount; i++) { 3017 mac_prop_info_set_range_uint32(prh, 3018 rval[i].mpur_min, rval[i].mpur_max); 3019 } 3020 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3021 break; 3022 } 3023 }