1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2018 Joyent, Inc. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 28 * 29 * An instance of the structure aggr_grp_t is allocated for each 30 * link aggregation group. When created, aggr_grp_t objects are 31 * entered into the aggr_grp_hash hash table maintained by the modhash 32 * module. The hash key is the linkid associated with the link 33 * aggregation group. 34 * 35 * Each aggregation contains a set of ports. The port is represented 36 * by the aggr_port_t structure. A port consists of a single MAC 37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 38 * MAC. This client is used by the aggr to send and receive LACP 39 * traffic. Each port client takes on the same MAC unicast address -- 40 * the address of the aggregation itself (taken from the first port by 41 * default). 42 * 43 * The MAC client that hangs off each aggr port is not your typical 44 * MAC client. Not only does it have exclusive control of the MAC, but 45 * it also has no Tx or Rx SRSes. 
An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. Pseudo rings are already present and implemented on the Rx
 * side. The same concept is extended to the Tx side where each Tx ring
 * of an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
78 * 79 * With this change, the outgoing stack depth looks much better: 80 * 81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 82 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 83 * 84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 85 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 86 * 87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 89 * ring belonging to a port on which the packet has to be sent. 90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 92 * the selected port. 93 * 94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 95 * bandwidth limit is applied first on the outgoing packet and the packets 96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 97 * particular Tx ring. 98 */ 99 100 #include <sys/types.h> 101 #include <sys/sysmacros.h> 102 #include <sys/conf.h> 103 #include <sys/cmn_err.h> 104 #include <sys/disp.h> 105 #include <sys/list.h> 106 #include <sys/ksynch.h> 107 #include <sys/kmem.h> 108 #include <sys/stream.h> 109 #include <sys/modctl.h> 110 #include <sys/ddi.h> 111 #include <sys/sunddi.h> 112 #include <sys/atomic.h> 113 #include <sys/stat.h> 114 #include <sys/modhash.h> 115 #include <sys/id_space.h> 116 #include <sys/strsun.h> 117 #include <sys/cred.h> 118 #include <sys/dlpi.h> 119 #include <sys/zone.h> 120 #include <sys/mac_provider.h> 121 #include <sys/dls.h> 122 #include <sys/vlan.h> 123 #include <sys/aggr.h> 124 #include <sys/aggr_impl.h> 125 126 static int aggr_m_start(void *); 127 static void aggr_m_stop(void *); 128 static int aggr_m_promisc(void *, boolean_t); 129 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 130 static int aggr_m_unicst(void *, const uint8_t *); 131 static int aggr_m_stat(void *, uint_t, uint64_t *); 132 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 133 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 135 const void *); 136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 137 mac_prop_info_handle_t); 138 139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 141 boolean_t *); 142 143 static void aggr_grp_capab_set(aggr_grp_t *); 144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 145 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 146 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 149 150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 152 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 153 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 156 static int aggr_addmac(void *, const uint8_t *); 157 static int aggr_remmac(void *, const uint8_t *); 158 static int aggr_addvlan(mac_group_driver_t, uint16_t); 159 static int aggr_remvlan(mac_group_driver_t, uint16_t); 160 static mblk_t *aggr_rx_poll(void *, int); 161 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 162 const int, mac_ring_info_t *, mac_ring_handle_t); 163 static void aggr_fill_group(void *, mac_ring_type_t, const int, 164 mac_group_info_t *, mac_group_handle_t); 165 166 static kmem_cache_t *aggr_grp_cache; 167 static mod_hash_t *aggr_grp_hash; 168 static krwlock_t aggr_grp_lock; 169 static uint_t aggr_grp_cnt; 170 static id_space_t *key_ids; 171 172 #define GRP_HASHSZ 64 173 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 174 #define AGGR_PORT_NAME_DELIMIT '-' 175 176 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 177 178 #define AGGR_M_CALLBACK_FLAGS \ 179 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 180 181 static mac_callbacks_t aggr_m_callbacks = { 182 AGGR_M_CALLBACK_FLAGS, 183 aggr_m_stat, 184 aggr_m_start, 185 aggr_m_stop, 186 aggr_m_promisc, 187 aggr_m_multicst, 188 NULL, 189 NULL, 190 NULL, 191 aggr_m_ioctl, 192 aggr_m_capab_get, 193 NULL, 194 NULL, 195 aggr_m_setprop, 196 NULL, 197 aggr_m_propinfo 198 }; 199 200 /*ARGSUSED*/ 201 static int 202 aggr_grp_constructor(void *buf, void *arg, int kmflag) 203 { 204 aggr_grp_t *grp = buf; 205 206 bzero(grp, sizeof (*grp)); 207 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 208 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 209 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 210 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 211 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 212 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 213 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 214 grp->lg_link_state = LINK_STATE_UNKNOWN; 215 return (0); 216 } 217 218 /*ARGSUSED*/ 219 static void 220 aggr_grp_destructor(void *buf, void *arg) 221 { 222 aggr_grp_t *grp = buf; 223 224 if (grp->lg_tx_ports != NULL) { 225 kmem_free(grp->lg_tx_ports, 226 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 227 } 228 229 mutex_destroy(&grp->lg_lacp_lock); 230 cv_destroy(&grp->lg_lacp_cv); 231 mutex_destroy(&grp->lg_port_lock); 232 cv_destroy(&grp->lg_port_cv); 233 rw_destroy(&grp->lg_tx_lock); 234 mutex_destroy(&grp->lg_tx_flowctl_lock); 235 cv_destroy(&grp->lg_tx_flowctl_cv); 236 } 237 238 void 239 aggr_grp_init(void) 240 { 241 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 242 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 243 aggr_grp_destructor, NULL, NULL, NULL, 0); 244 245 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 246 GRP_HASHSZ, mod_hash_null_valdtor); 247 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 248 aggr_grp_cnt = 0; 249 250 /* 251 * Allocate an id space to manage key values (when key is not 252 * specified). The range of the id space will be from 253 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 254 * uses a 16-bit key. 255 */ 256 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 257 ASSERT(key_ids != NULL); 258 } 259 260 void 261 aggr_grp_fini(void) 262 { 263 id_space_destroy(key_ids); 264 rw_destroy(&aggr_grp_lock); 265 mod_hash_destroy_idhash(aggr_grp_hash); 266 kmem_cache_destroy(aggr_grp_cache); 267 } 268 269 uint_t 270 aggr_grp_count(void) 271 { 272 uint_t count; 273 274 rw_enter(&aggr_grp_lock, RW_READER); 275 count = aggr_grp_cnt; 276 rw_exit(&aggr_grp_lock); 277 return (count); 278 } 279 280 /* 281 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 282 * requires the mac perimeter, this function holds a reference of the aggr 283 * and aggr won't call mac_unregister() until this reference drops to 0. 284 */ 285 void 286 aggr_grp_port_hold(aggr_port_t *port) 287 { 288 aggr_grp_t *grp = port->lp_grp; 289 290 AGGR_PORT_REFHOLD(port); 291 mutex_enter(&grp->lg_port_lock); 292 grp->lg_port_ref++; 293 mutex_exit(&grp->lg_port_lock); 294 } 295 296 /* 297 * Release the reference of the grp and inform aggr_grp_delete() calling 298 * mac_unregister() is now safe. 299 */ 300 void 301 aggr_grp_port_rele(aggr_port_t *port) 302 { 303 aggr_grp_t *grp = port->lp_grp; 304 305 mutex_enter(&grp->lg_port_lock); 306 if (--grp->lg_port_ref == 0) 307 cv_signal(&grp->lg_port_cv); 308 mutex_exit(&grp->lg_port_lock); 309 AGGR_PORT_REFRELE(port); 310 } 311 312 /* 313 * Wait for the port's lacp timer thread and the port's notification callback 314 * to exit. 
315 */ 316 void 317 aggr_grp_port_wait(aggr_grp_t *grp) 318 { 319 mutex_enter(&grp->lg_port_lock); 320 if (grp->lg_port_ref != 0) 321 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 322 mutex_exit(&grp->lg_port_lock); 323 } 324 325 /* 326 * Attach a port to a link aggregation group. 327 * 328 * A port is attached to a link aggregation group once its speed 329 * and link state have been verified. 330 * 331 * Returns B_TRUE if the group link state or speed has changed. If 332 * it's the case, the caller must notify the MAC layer via a call 333 * to mac_link(). 334 */ 335 boolean_t 336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 337 { 338 boolean_t link_state_changed = B_FALSE; 339 340 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 341 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 342 343 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 344 return (B_FALSE); 345 346 /* 347 * Validate the MAC port link speed and update the group 348 * link speed if needed. 349 */ 350 if (port->lp_ifspeed == 0 || 351 port->lp_link_state != LINK_STATE_UP || 352 port->lp_link_duplex != LINK_DUPLEX_FULL) { 353 /* 354 * Can't attach a MAC port with unknown link speed, 355 * down link, or not in full duplex mode. 356 */ 357 return (B_FALSE); 358 } 359 360 mutex_enter(&grp->lg_stat_lock); 361 if (grp->lg_ifspeed == 0) { 362 /* 363 * The group inherits the speed of the first link being 364 * attached. 365 */ 366 grp->lg_ifspeed = port->lp_ifspeed; 367 link_state_changed = B_TRUE; 368 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 369 /* 370 * The link speed of the MAC port must be the same as 371 * the group link speed, as per 802.3ad. Since it is 372 * not, the attach is cancelled. 373 */ 374 mutex_exit(&grp->lg_stat_lock); 375 return (B_FALSE); 376 } 377 mutex_exit(&grp->lg_stat_lock); 378 379 grp->lg_nattached_ports++; 380 381 /* 382 * Update the group link state. 
383 */ 384 if (grp->lg_link_state != LINK_STATE_UP) { 385 grp->lg_link_state = LINK_STATE_UP; 386 mutex_enter(&grp->lg_stat_lock); 387 grp->lg_link_duplex = LINK_DUPLEX_FULL; 388 mutex_exit(&grp->lg_stat_lock); 389 link_state_changed = B_TRUE; 390 } 391 392 /* 393 * Update port's state. 394 */ 395 port->lp_state = AGGR_PORT_STATE_ATTACHED; 396 397 aggr_grp_multicst_port(port, B_TRUE); 398 399 /* 400 * The port client doesn't have an Rx SRS; instead of calling 401 * mac_rx_set() we set the client's flow callback directly. 402 * This datapath is used only when the port's driver doesn't 403 * support MAC_CAPAB_RINGS. Drivers with ring support will 404 * deliver traffic to the aggr via ring passthru. 405 */ 406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 407 408 /* 409 * If LACP is OFF, the port can be used to send data as soon 410 * as its link is up and verified to be compatible with the 411 * aggregation. 412 * 413 * If LACP is active or passive, notify the LACP subsystem, which 414 * will enable sending on the port following the LACP protocol. 
415 */ 416 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 417 aggr_send_port_enable(port); 418 else 419 aggr_lacp_port_attached(port); 420 421 return (link_state_changed); 422 } 423 424 boolean_t 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 426 { 427 boolean_t link_state_changed = B_FALSE; 428 429 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 430 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 431 432 /* update state */ 433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 434 return (B_FALSE); 435 436 mac_client_clear_flow_cb(port->lp_mch); 437 438 aggr_grp_multicst_port(port, B_FALSE); 439 440 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 441 aggr_send_port_disable(port); 442 else 443 aggr_lacp_port_detached(port); 444 445 port->lp_state = AGGR_PORT_STATE_STANDBY; 446 447 grp->lg_nattached_ports--; 448 if (grp->lg_nattached_ports == 0) { 449 /* the last attached MAC port of the group is being detached */ 450 grp->lg_link_state = LINK_STATE_DOWN; 451 mutex_enter(&grp->lg_stat_lock); 452 grp->lg_ifspeed = 0; 453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 454 mutex_exit(&grp->lg_stat_lock); 455 link_state_changed = B_TRUE; 456 } 457 458 return (link_state_changed); 459 } 460 461 /* 462 * Update the MAC addresses of the constituent ports of the specified 463 * group. This function is invoked: 464 * - after creating a new aggregation group. 465 * - after adding new ports to an aggregation group. 466 * - after removing a port from a group when the MAC address of 467 * that port was used for the MAC address of the group. 468 * - after the MAC address of a port changed when the MAC address 469 * of that port was used for the MAC address of the group. 470 * 471 * Return true if the link state of the aggregation changed, for example 472 * as a result of a failure changing the MAC address of one of the 473 * constituent ports. 
474 */ 475 boolean_t 476 aggr_grp_update_ports_mac(aggr_grp_t *grp) 477 { 478 aggr_port_t *cport; 479 boolean_t link_state_changed = B_FALSE; 480 mac_perim_handle_t mph; 481 482 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 483 484 for (cport = grp->lg_ports; cport != NULL; 485 cport = cport->lp_next) { 486 mac_perim_enter_by_mh(cport->lp_mh, &mph); 487 if (aggr_port_unicst(cport) != 0) { 488 if (aggr_grp_detach_port(grp, cport)) 489 link_state_changed = B_TRUE; 490 } else { 491 /* 492 * If a port was detached because of a previous 493 * failure changing the MAC address, the port is 494 * reattached when it successfully changes the MAC 495 * address now, and this might cause the link state 496 * of the aggregation to change. 497 */ 498 if (aggr_grp_attach_port(grp, cport)) 499 link_state_changed = B_TRUE; 500 } 501 mac_perim_exit(mph); 502 } 503 return (link_state_changed); 504 } 505 506 /* 507 * Invoked when the MAC address of a port has changed. If the port's 508 * MAC address was used for the group MAC address, set mac_addr_changedp 509 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 510 * notification. If the link state changes due to detach/attach of 511 * the constituent port, set link_state_changedp to B_TRUE to indicate 512 * to the caller that it should send a MAC_NOTE_LINK notification. In both 513 * cases, it is the responsibility of the caller to invoke notification 514 * functions after releasing the the port lock. 515 */ 516 void 517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 518 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 519 { 520 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 521 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 522 ASSERT(mac_addr_changedp != NULL); 523 ASSERT(link_state_changedp != NULL); 524 525 *mac_addr_changedp = B_FALSE; 526 *link_state_changedp = B_FALSE; 527 528 if (grp->lg_addr_fixed) { 529 /* 530 * The group is using a fixed MAC address or an automatic 531 * MAC address has not been set. 
532 */ 533 return; 534 } 535 536 if (grp->lg_mac_addr_port == port) { 537 /* 538 * The MAC address of the port was assigned to the group 539 * MAC address. Update the group MAC address. 540 */ 541 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 542 *mac_addr_changedp = B_TRUE; 543 } else { 544 /* 545 * Update the actual port MAC address to the MAC address 546 * of the group. 547 */ 548 if (aggr_port_unicst(port) != 0) { 549 *link_state_changedp = aggr_grp_detach_port(grp, port); 550 } else { 551 /* 552 * If a port was detached because of a previous 553 * failure changing the MAC address, the port is 554 * reattached when it successfully changes the MAC 555 * address now, and this might cause the link state 556 * of the aggregation to change. 557 */ 558 *link_state_changedp = aggr_grp_attach_port(grp, port); 559 } 560 } 561 } 562 563 /* 564 * Add a port to a link aggregation group. 565 */ 566 static int 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 568 aggr_port_t **pp) 569 { 570 aggr_port_t *port, **cport; 571 mac_perim_handle_t mph; 572 zoneid_t port_zoneid = ALL_ZONES; 573 int err; 574 575 /* The port must be in the same zone as the aggregation. */ 576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 577 port_zoneid = GLOBAL_ZONEID; 578 if (grp->lg_zoneid != port_zoneid) 579 return (EBUSY); 580 581 /* 582 * If we are creating the aggr, then there is no MAC handle 583 * and thus no perimeter to hold. If we are adding a port to 584 * an existing aggr, then the perimiter of the aggr's MAC must 585 * be held. 586 */ 587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 588 589 err = aggr_port_create(grp, port_linkid, force, &port); 590 if (err != 0) 591 return (err); 592 593 mac_perim_enter_by_mh(port->lp_mh, &mph); 594 595 /* Add the new port to the end of the list. 
*/ 596 cport = &grp->lg_ports; 597 while (*cport != NULL) 598 cport = &((*cport)->lp_next); 599 *cport = port; 600 601 /* 602 * Back reference to the group it is member of. A port always 603 * holds a reference to its group to ensure that the back 604 * reference is always valid. 605 */ 606 port->lp_grp = grp; 607 AGGR_GRP_REFHOLD(grp); 608 grp->lg_nports++; 609 610 aggr_lacp_init_port(port); 611 mac_perim_exit(mph); 612 613 if (pp != NULL) 614 *pp = port; 615 616 return (0); 617 } 618 619 /* 620 * This is called in response to either our LACP state machine or a MAC 621 * notification that the link has gone down via aggr_send_port_disable(). At 622 * this point, we may need to update our default ring. To that end, we go 623 * through the set of ports (underlying datalinks in an aggregation) that are 624 * currently enabled to transmit data. If all our links have been disabled for 625 * transmit, then we don't do anything. 626 * 627 * Note, because we only have a single TX group, we don't have to worry about 628 * the rings moving between groups and the chance that mac will reassign it 629 * unless someone removes a port, at which point, we play it safe and call this 630 * again. 631 */ 632 void 633 aggr_grp_update_default(aggr_grp_t *grp) 634 { 635 aggr_port_t *port; 636 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 637 638 rw_enter(&grp->lg_tx_lock, RW_WRITER); 639 640 if (grp->lg_ntx_ports == 0) { 641 rw_exit(&grp->lg_tx_lock); 642 return; 643 } 644 645 port = grp->lg_tx_ports[0]; 646 ASSERT(port->lp_tx_ring_cnt > 0); 647 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 648 rw_exit(&grp->lg_tx_lock); 649 } 650 651 /* 652 * Add a pseudo RX ring for the given HW ring handle. 
653 */ 654 static int 655 aggr_add_pseudo_rx_ring(aggr_port_t *port, 656 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 657 { 658 aggr_pseudo_rx_ring_t *ring; 659 int err; 660 int j; 661 662 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 663 ring = rx_grp->arg_rings + j; 664 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 665 break; 666 } 667 668 /* 669 * No slot for this new RX ring. 670 */ 671 if (j == MAX_RINGS_PER_GROUP) 672 return (EIO); 673 674 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 675 ring->arr_hw_rh = hw_rh; 676 ring->arr_port = port; 677 ring->arr_grp = rx_grp; 678 rx_grp->arg_ring_cnt++; 679 680 /* 681 * The group is already registered, dynamically add a new ring to the 682 * mac group. 683 */ 684 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 685 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 686 ring->arr_hw_rh = NULL; 687 ring->arr_port = NULL; 688 ring->arr_grp = NULL; 689 rx_grp->arg_ring_cnt--; 690 } else { 691 /* 692 * This must run after the MAC is registered. 693 */ 694 ASSERT3P(ring->arr_rh, !=, NULL); 695 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 696 (void *)port, (mac_resource_handle_t)ring); 697 } 698 return (err); 699 } 700 701 /* 702 * Remove the pseudo RX ring of the given HW ring handle. 703 */ 704 static void 705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 706 { 707 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 708 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 709 710 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 711 ring->arr_hw_rh != hw_rh) { 712 continue; 713 } 714 715 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 716 717 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 718 ring->arr_hw_rh = NULL; 719 ring->arr_port = NULL; 720 ring->arr_grp = NULL; 721 rx_grp->arg_ring_cnt--; 722 mac_hwring_clear_passthru(hw_rh); 723 break; 724 } 725 } 726 727 /* 728 * Create pseudo rings over the HW rings of the port. 
729 * 730 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 731 * 732 * o Program existing unicast filters on the pseudo group into the HW group. 733 * 734 * o Program existing VLAN filters on the pseudo group into the HW group. 735 */ 736 static int 737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 738 { 739 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 740 aggr_unicst_addr_t *addr, *a; 741 mac_perim_handle_t pmph; 742 aggr_vlan_t *avp; 743 uint_t hw_rh_cnt, i; 744 int err = 0; 745 uint_t g_idx = rx_grp->arg_index; 746 747 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 748 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 749 mac_perim_enter_by_mh(port->lp_mh, &pmph); 750 751 /* 752 * This function must be called after the aggr registers its 753 * MAC and its Rx groups have been initialized. 754 */ 755 ASSERT(rx_grp->arg_gh != NULL); 756 757 /* 758 * Get the list of the underlying HW rings. 759 */ 760 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 761 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 762 763 /* 764 * Add existing VLAN and unicast address filters to the port. 
765 */ 766 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 767 avp = list_next(&rx_grp->arg_vlans, avp)) { 768 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 769 goto err; 770 } 771 772 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 773 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 774 goto err; 775 } 776 777 for (i = 0; i < hw_rh_cnt; i++) { 778 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 779 if (err != 0) 780 goto err; 781 } 782 783 mac_perim_exit(pmph); 784 return (0); 785 786 err: 787 ASSERT(err != 0); 788 789 for (uint_t j = 0; j < i; j++) 790 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 791 792 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 793 aggr_port_remmac(port, g_idx, a->aua_addr); 794 795 if (avp != NULL) 796 avp = list_prev(&rx_grp->arg_vlans, avp); 797 798 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 799 int err2; 800 801 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 802 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 803 ": errno %d.", avp->av_vid, 804 mac_client_name(port->lp_mch), err2); 805 } 806 } 807 808 port->lp_hwghs[g_idx] = NULL; 809 mac_perim_exit(pmph); 810 return (err); 811 } 812 813 /* 814 * Destroy the pseudo rings mapping to this port and remove all VLAN 815 * and unicast filters from this port. Even if there are no underlying 816 * HW rings we must still remove the unicast filters to take the port 817 * out of promisc mode. 
818 */ 819 static void 820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 821 { 822 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 823 aggr_unicst_addr_t *addr; 824 mac_perim_handle_t pmph; 825 uint_t hw_rh_cnt; 826 uint_t g_idx = rx_grp->arg_index; 827 828 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 829 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 830 ASSERT3P(rx_grp->arg_gh, !=, NULL); 831 mac_perim_enter_by_mh(port->lp_mh, &pmph); 832 833 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 834 MAC_RING_TYPE_RX); 835 836 for (uint_t i = 0; i < hw_rh_cnt; i++) 837 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 838 839 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 840 aggr_port_remmac(port, g_idx, addr->aua_addr); 841 842 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 843 avp = list_next(&rx_grp->arg_vlans, avp)) { 844 int err; 845 846 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 847 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 848 ": errno %d.", avp->av_vid, 849 mac_client_name(port->lp_mch), err); 850 } 851 } 852 853 port->lp_hwghs[g_idx] = NULL; 854 mac_perim_exit(pmph); 855 } 856 857 /* 858 * Add a pseudo TX ring for the given HW ring handle. 859 */ 860 static int 861 aggr_add_pseudo_tx_ring(aggr_port_t *port, 862 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 863 mac_ring_handle_t *pseudo_rh) 864 { 865 aggr_pseudo_tx_ring_t *ring; 866 int err; 867 int i; 868 869 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 870 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 871 ring = tx_grp->atg_rings + i; 872 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 873 break; 874 } 875 /* 876 * No slot for this new TX ring. 877 */ 878 if (i == MAX_RINGS_PER_GROUP) 879 return (EIO); 880 /* 881 * The following 4 statements needs to be done before 882 * calling mac_group_add_ring(). Otherwise it will 883 * result in an assertion failure in mac_init_ring(). 
884 */ 885 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 886 ring->atr_hw_rh = hw_rh; 887 ring->atr_port = port; 888 tx_grp->atg_ring_cnt++; 889 890 /* 891 * The TX side has no concept of ring groups unlike RX groups. 892 * There is just a single group which stores all the TX rings. 893 * This group will be used to store aggr's pseudo TX rings. 894 */ 895 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 896 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 897 ring->atr_hw_rh = NULL; 898 ring->atr_port = NULL; 899 tx_grp->atg_ring_cnt--; 900 } else { 901 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 902 if (hw_rh != NULL) { 903 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 904 mac_find_ring(tx_grp->atg_gh, i)); 905 } 906 } 907 908 return (err); 909 } 910 911 /* 912 * Remove the pseudo TX ring of the given HW ring handle. 913 */ 914 static void 915 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 916 mac_ring_handle_t pseudo_hw_rh) 917 { 918 aggr_pseudo_tx_ring_t *ring; 919 int i; 920 921 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 922 ring = tx_grp->atg_rings + i; 923 if (ring->atr_rh != pseudo_hw_rh) 924 continue; 925 926 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 927 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 928 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 929 mac_hwring_teardown(ring->atr_hw_rh); 930 ring->atr_hw_rh = NULL; 931 ring->atr_port = NULL; 932 tx_grp->atg_ring_cnt--; 933 break; 934 } 935 } 936 937 /* 938 * This function is called to create pseudo rings over hardware rings of 939 * the underlying device. There is a 1:1 mapping between the pseudo TX 940 * rings of the aggr and the hardware rings of the underlying port. 
 */
static int
aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
{
	aggr_grp_t *grp = port->lp_grp;
	mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
	mac_perim_handle_t pmph;
	int hw_rh_cnt, i = 0, j;
	int err = 0;

	/* The caller holds the aggr's perimeter; take the port's as well. */
	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
	mac_perim_enter_by_mh(port->lp_mh, &pmph);

	/*
	 * Get the list of the underlying HW rings.
	 */
	hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
	    MAC_RING_TYPE_TX);

	/*
	 * Even if the underlying NIC does not have TX rings, we
	 * still make a pseudo TX ring for that NIC with NULL as
	 * the ring handle.
	 */
	if (hw_rh_cnt == 0)
		port->lp_tx_ring_cnt = 1;
	else
		port->lp_tx_ring_cnt = hw_rh_cnt;

	/*
	 * Two parallel arrays indexed by ring: the HW ring handle and the
	 * pseudo ring handle created on top of it.
	 */
	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
	    port->lp_tx_ring_cnt), KM_SLEEP);
	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
	    port->lp_tx_ring_cnt), KM_SLEEP);

	if (hw_rh_cnt == 0) {
		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
		    NULL, &pseudo_rh)) == 0) {
			port->lp_tx_rings[0] = NULL;
			port->lp_pseudo_tx_rings[0] = pseudo_rh;
		}
	} else {
		/* On failure, i is left at the index of the ring that failed. */
		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
			err = aggr_add_pseudo_tx_ring(port,
			    tx_grp, hw_rh[i], &pseudo_rh);
			if (err != 0)
				break;
			port->lp_tx_rings[i] = hw_rh[i];
			port->lp_pseudo_tx_rings[i] = pseudo_rh;
		}
	}

	if (err != 0) {
		/*
		 * Roll back: remove the pseudo rings that were added
		 * (indices 0 .. i-1), then release both arrays. The count
		 * is reset only after it has been used to size the frees.
		 */
		if (hw_rh_cnt != 0) {
			for (j = 0; j < i; j++) {
				aggr_rem_pseudo_tx_ring(tx_grp,
				    port->lp_pseudo_tx_rings[j]);
			}
		}
		kmem_free(port->lp_tx_rings,
		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
		kmem_free(port->lp_pseudo_tx_rings,
		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
		port->lp_tx_ring_cnt = 0;
	} else {
		port->lp_tx_grp_added = B_TRUE;
		/* Keep the handle; the remove path passes it back. */
		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
		    aggr_tx_ring_update, port);
	}
	mac_perim_exit(pmph);
	/* Runs on both the success and the failure path. */
	aggr_grp_update_default(grp);
	return (err);
}

/*
 * This function is called by aggr to remove pseudo TX rings over the
 * HW rings of the underlying port.
 *
 * It is a no-op when the port's lp_tx_grp_added flag is not set, so it is
 * safe to call for a port whose pseudo TX group was never (or only
 * unsuccessfully) added. The caller must hold the aggr's MAC perimeter.
 */
static void
aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
{
	aggr_grp_t *grp = port->lp_grp;
	mac_perim_handle_t pmph;
	int i;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
	mac_perim_enter_by_mh(port->lp_mh, &pmph);

	if (!port->lp_tx_grp_added)
		goto done;

	ASSERT(tx_grp->atg_gh != NULL);

	for (i = 0; i < port->lp_tx_ring_cnt; i++)
		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);

	/* Free both arrays before the count that sizes them is cleared. */
	kmem_free(port->lp_tx_rings,
	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
	kmem_free(port->lp_pseudo_tx_rings,
	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));

	port->lp_tx_ring_cnt = 0;
	/*
	 * NOTE(review): presumably a NULL callback plus the saved handle
	 * unregisters the notification set up when the group was added --
	 * confirm against mac_client_tx_notify().
	 */
	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
	port->lp_tx_grp_added = B_FALSE;
	aggr_grp_update_default(grp);
done:
	mac_perim_exit(pmph);
}

/*
 * Interrupt-disable entry point for a pseudo Rx ring: the handle is the
 * pseudo ring itself, and the request is forwarded to its HW ring.
 */
static int
aggr_pseudo_disable_intr(mac_intr_handle_t ih)
{
	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
}

/*
 * Interrupt-enable entry point for a pseudo Rx ring: the handle is the
 * pseudo ring itself, and the request is forwarded to its HW ring.
 */
static int
aggr_pseudo_enable_intr(mac_intr_handle_t ih)
{
	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
}

/*
 * Start the pseudo ring. Since the pseudo ring is just an abstraction
 * over an actual HW ring, the real task is to start the underlying HW
 * ring.
1067 */ 1068 static int 1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1070 { 1071 int err; 1072 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1073 1074 err = mac_hwring_start(rr_ring->arr_hw_rh); 1075 1076 if (err != 0) 1077 return (err); 1078 1079 rr_ring->arr_gen = mr_gen; 1080 return (err); 1081 } 1082 1083 /* 1084 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1085 * over an actual HW ring, the real task is to stop the underlying HW 1086 * ring. 1087 */ 1088 static void 1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1090 { 1091 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1092 1093 /* 1094 * The rings underlying the default group must stay up to 1095 * continue receiving LACP traffic. We would normally never 1096 * stop the default Rx rings because of the primary MAC 1097 * client; but aggr's primary MAC client doesn't call 1098 * mac_unicast_add() and thus mi_active is 0 when the last 1099 * non-primary client is deleted. 1100 */ 1101 if (rr_ring->arr_grp->arg_index != 0) 1102 mac_hwring_stop(rr_ring->arr_hw_rh); 1103 } 1104 1105 /* 1106 * Add one or more ports to an existing link aggregation group. 1107 */ 1108 int 1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1110 laioc_port_t *ports) 1111 { 1112 int rc; 1113 uint_t port_added = 0; 1114 uint_t grp_added; 1115 aggr_grp_t *grp = NULL; 1116 aggr_port_t *port; 1117 boolean_t link_state_changed = B_FALSE; 1118 mac_perim_handle_t mph, pmph; 1119 1120 /* Get the aggr corresponding to linkid. */ 1121 rw_enter(&aggr_grp_lock, RW_READER); 1122 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1123 (mod_hash_val_t *)&grp) != 0) { 1124 rw_exit(&aggr_grp_lock); 1125 return (ENOENT); 1126 } 1127 AGGR_GRP_REFHOLD(grp); 1128 1129 /* 1130 * Hold the perimeter so that the aggregation can't be destroyed. 
1131 */ 1132 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1133 rw_exit(&aggr_grp_lock); 1134 1135 /* Add the specified ports to the aggr. */ 1136 for (uint_t i = 0; i < nports; i++) { 1137 grp_added = 0; 1138 1139 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1140 force, &port)) != 0) { 1141 goto bail; 1142 } 1143 1144 ASSERT(port != NULL); 1145 port_added++; 1146 1147 /* check capabilities */ 1148 if (!aggr_grp_capab_check(grp, port) || 1149 !aggr_grp_sdu_check(grp, port) || 1150 !aggr_grp_margin_check(grp, port)) { 1151 rc = ENOTSUP; 1152 goto bail; 1153 } 1154 1155 /* 1156 * Create the pseudo ring for each HW ring of the underlying 1157 * port. 1158 */ 1159 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1160 if (rc != 0) 1161 goto bail; 1162 1163 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1164 rc = aggr_add_pseudo_rx_group(port, 1165 &grp->lg_rx_groups[j]); 1166 1167 if (rc != 0) 1168 goto bail; 1169 1170 grp_added++; 1171 } 1172 1173 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1174 1175 /* set LACP mode */ 1176 aggr_port_lacp_set_mode(grp, port); 1177 1178 /* start port if group has already been started */ 1179 if (grp->lg_started) { 1180 rc = aggr_port_start(port); 1181 if (rc != 0) { 1182 mac_perim_exit(pmph); 1183 goto bail; 1184 } 1185 1186 /* 1187 * Turn on the promiscuous mode over the port when it 1188 * is requested to be turned on to receive the 1189 * non-primary address over a port, or the promiscuous 1190 * mode is enabled over the aggr. 1191 */ 1192 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1193 rc = aggr_port_promisc(port, B_TRUE); 1194 if (rc != 0) { 1195 mac_perim_exit(pmph); 1196 goto bail; 1197 } 1198 } 1199 } 1200 mac_perim_exit(pmph); 1201 1202 /* 1203 * Attach each port if necessary. 1204 */ 1205 if (aggr_port_notify_link(grp, port)) 1206 link_state_changed = B_TRUE; 1207 1208 /* 1209 * Initialize the callback functions for this port. 
1210 */ 1211 aggr_port_init_callbacks(port); 1212 } 1213 1214 /* update the MAC address of the constituent ports */ 1215 if (aggr_grp_update_ports_mac(grp)) 1216 link_state_changed = B_TRUE; 1217 1218 if (link_state_changed) 1219 mac_link_update(grp->lg_mh, grp->lg_link_state); 1220 1221 bail: 1222 if (rc != 0) { 1223 /* stop and remove ports that have been added */ 1224 for (uint_t i = 0; i < port_added; i++) { 1225 uint_t grp_remove; 1226 1227 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1228 ASSERT(port != NULL); 1229 1230 if (grp->lg_started) { 1231 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1232 (void) aggr_port_promisc(port, B_FALSE); 1233 aggr_port_stop(port); 1234 mac_perim_exit(pmph); 1235 } 1236 1237 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1238 1239 /* 1240 * Only the last port could have a partial set 1241 * of groups added. 1242 */ 1243 grp_remove = (i + 1 == port_added) ? grp_added : 1244 grp->lg_rx_group_count; 1245 1246 for (uint_t j = 0; j < grp_remove; j++) { 1247 aggr_rem_pseudo_rx_group(port, 1248 &grp->lg_rx_groups[j]); 1249 } 1250 1251 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1252 } 1253 } 1254 1255 mac_perim_exit(mph); 1256 AGGR_GRP_REFRELE(grp); 1257 return (rc); 1258 } 1259 1260 static int 1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1262 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1263 aggr_lacp_timer_t lacp_timer) 1264 { 1265 boolean_t mac_addr_changed = B_FALSE; 1266 boolean_t link_state_changed = B_FALSE; 1267 mac_perim_handle_t pmph; 1268 1269 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1270 1271 /* validate fixed address if specified */ 1272 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1273 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1274 (mac_addr[0] & 0x01))) { 1275 return (EINVAL); 1276 } 1277 1278 /* update policy if requested */ 1279 if (update_mask & AGGR_MODIFY_POLICY) 1280 aggr_send_update_policy(grp, policy); 1281 1282 /* 
update unicast MAC address if requested */
	if (update_mask & AGGR_MODIFY_MAC) {
		if (mac_fixed) {
			/* user-supplied MAC address */
			grp->lg_mac_addr_port = NULL;
			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
				mac_addr_changed = B_TRUE;
			}
		} else if (grp->lg_addr_fixed) {
			/* switch from user-supplied to automatic */
			aggr_port_t *port = grp->lg_ports;

			/* Adopt the address of the first port in the list. */
			mac_perim_enter_by_mh(port->lp_mh, &pmph);
			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
			grp->lg_mac_addr_port = port;
			mac_addr_changed = B_TRUE;
			mac_perim_exit(pmph);
		}
		grp->lg_addr_fixed = mac_fixed;
	}

	/* Push the new address to the ports; this may change link state. */
	if (mac_addr_changed)
		link_state_changed = aggr_grp_update_ports_mac(grp);

	if (update_mask & AGGR_MODIFY_LACP_MODE)
		aggr_lacp_update_mode(grp, lacp_mode);

	if (update_mask & AGGR_MODIFY_LACP_TIMER)
		aggr_lacp_update_timer(grp, lacp_timer);

	/* Notify the MAC layer last, after all requested changes are in. */
	if (link_state_changed)
		mac_link_update(grp->lg_mh, grp->lg_link_state);

	if (mac_addr_changed)
		mac_unicst_update(grp->lg_mh, grp->lg_addr);

	return (0);
}

/*
 * Update properties of an existing link aggregation group.
 *
 * Looks up the group hashed under linkid, takes a reference on it and
 * enters its MAC perimeter, then hands the request to
 * aggr_grp_modify_common(). update_mask selects which of policy, MAC
 * address (mac_fixed / mac_addr), lacp_mode and lacp_timer are applied.
 *
 * Returns ENOENT if no group is hashed under linkid; otherwise returns
 * whatever aggr_grp_modify_common() returns.
 */
int
aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
    boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
    aggr_lacp_timer_t lacp_timer)
{
	aggr_grp_t *grp = NULL;
	mac_perim_handle_t mph;
	int err;

	/* get group corresponding to linkid */
	rw_enter(&aggr_grp_lock, RW_READER);
	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}
	AGGR_GRP_REFHOLD(grp);

	/*
	 * Hold the perimeter so that the aggregation won't be destroyed.
	 */
	mac_perim_enter_by_mh(grp->lg_mh, &mph);
	rw_exit(&aggr_grp_lock);

	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
	    mac_addr, lacp_mode, lacp_timer);

	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);
	return (err);
}

/*
 * Create a new link aggregation group upon request from administrator.
 * Returns 0 on success, an errno on failure.
 *
 * Visible failure cases: EINVAL when nports is 0 or when a fixed MAC
 * address of all zeroes is supplied, EEXIST when a group is already
 * hashed under linkid, ENOMEM when a key or the mac_register_t cannot be
 * allocated, and any error from aggr_grp_add_port(), mac_register() or
 * dls_devnet_create().
 */
int
aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
    laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
    uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
    cred_t *credp)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port;
	mac_register_t *mac;
	boolean_t link_state_changed;
	mac_perim_handle_t mph;
	int err;
	int i;
	kt_did_t tid = 0;

	/* need at least one port */
	if (nports == 0)
		return (EINVAL);

	/* Held as writer until the function returns, on every path below. */
	rw_enter(&aggr_grp_lock, RW_WRITER);

	/* does a group with the same linkid already exist?
 */
	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp);
	if (err == 0) {
		rw_exit(&aggr_grp_lock);
		return (EEXIST);
	}

	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);

	/* The initial reference is dropped at "bail" or by the delete path. */
	grp->lg_refs = 1;
	grp->lg_closing = B_FALSE;
	grp->lg_force = force;
	grp->lg_linkid = linkid;
	grp->lg_zoneid = crgetzoneid(credp);
	grp->lg_ifspeed = 0;
	grp->lg_link_state = LINK_STATE_UNKNOWN;
	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
	grp->lg_started = B_FALSE;
	grp->lg_promisc = B_FALSE;
	grp->lg_lacp_done = B_FALSE;
	grp->lg_tx_notify_done = B_FALSE;
	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
	/* Both worker threads are torn down again on the "bail" path. */
	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_notify_thread = thread_create(NULL, 0,
	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
	    MAX_RINGS_PER_GROUP), KM_SLEEP);
	grp->lg_tx_blocked_cnt = 0;
	bzero(&grp->lg_rx_groups,
	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
	aggr_lacp_init_grp(grp);

	/* add MAC ports to group */
	grp->lg_ports = NULL;
	grp->lg_nports = 0;
	grp->lg_nattached_ports = 0;
	grp->lg_ntx_ports = 0;

	/*
	 * If key is not specified by the user, allocate the key.
	 */
	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
		err = ENOMEM;
		goto bail;
	}
	grp->lg_key = key;

	for (i = 0; i < nports; i++) {
		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
		if (err != 0)
			goto bail;
	}

	/* Start from one Rx group and grow to the largest port's count. */
	grp->lg_rx_group_count = 1;

	for (i = 0, port = grp->lg_ports; port != NULL;
	    i++, port = port->lp_next) {
		uint_t num_rgroups;

		mac_perim_enter_by_mh(port->lp_mh, &mph);
		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
		mac_perim_exit(mph);

		/*
		 * Utilize all the groups in a port. If some ports
		 * have less groups than others, then traffic destined
		 * for the same unicast address may be HW classified
		 * on some ports but SW classified by aggr when
		 * arriving on other ports.
		 */
		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
		    num_rgroups);
	}

	/*
	 * There could be cases where the hardware provides more
	 * groups than aggr can support. Make sure we never go above
	 * the max aggr can support.
	 */
	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
	    MAX_GROUPS_PER_PORT);

	ASSERT3U(grp->lg_rx_group_count, >, 0);
	/*
	 * Every slot is initialized, not just the first lg_rx_group_count;
	 * the delete path likewise destroys the VLAN list of every slot.
	 */
	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		grp->lg_rx_groups[i].arg_index = i;
		grp->lg_rx_groups[i].arg_untagged = 0;
		list_create(&(grp->lg_rx_groups[i].arg_vlans),
		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
	}

	/*
	 * If no explicit MAC address was specified by the administrator,
	 * set it to the MAC address of the first port.
	 */
	grp->lg_addr_fixed = mac_fixed;
	if (grp->lg_addr_fixed) {
		/* validate specified address */
		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
			err = EINVAL;
			goto bail;
		}
		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
	} else {
		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
		grp->lg_mac_addr_port = grp->lg_ports;
	}

	/* Set the initial group capabilities. */
	aggr_grp_capab_set(grp);

	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
		err = ENOMEM;
		goto bail;
	}
	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	mac->m_driver = grp;
	mac->m_dip = aggr_dip;
	/* Keys above AGGR_MAX_KEY are not used as the instance number. */
	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
	mac->m_src_addr = grp->lg_addr;
	mac->m_callbacks = &aggr_m_callbacks;
	mac->m_min_sdu = 0;
	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
	mac->m_margin = aggr_grp_max_margin(grp);
	mac->m_v12n = MAC_VIRT_LEVEL1;
	err = mac_register(mac, &grp->lg_mh);
	/* The registration template is freed whether or not it succeeded. */
	mac_free(mac);
	if (err != 0)
		goto bail;

	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
	if (err != 0) {
		(void) mac_unregister(grp->lg_mh);
		grp->lg_mh = NULL;
		goto bail;
	}

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	/*
	 * Update the MAC address of the constituent ports: all ports take
	 * on the primary MAC address of the aggr (lg_addr). At this point,
	 * none of the ports are attached; thus the link state of the
	 * aggregation will not change.
	 */
	link_state_changed = aggr_grp_update_ports_mac(grp);
	ASSERT(!link_state_changed);

	/* Update outbound load balancing policy. */
	aggr_send_update_policy(grp, policy);

	/* Set LACP mode. */
	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);

	/*
	 * Attach each port if necessary.
	 */
	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		/*
		 * Create the pseudo ring for each HW ring of the
		 * underlying port. Note that this is done after the
		 * aggr registers its MAC.
		 */
		VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
		    ==, 0);

		for (i = 0; i < grp->lg_rx_group_count; i++) {
			VERIFY3S(aggr_add_pseudo_rx_group(port,
			    &grp->lg_rx_groups[i]), ==, 0);
		}

		if (aggr_port_notify_link(grp, port))
			link_state_changed = B_TRUE;

		/*
		 * Initialize the callback functions for this port.
		 */
		aggr_port_init_callbacks(port);
	}

	if (link_state_changed)
		mac_link_update(grp->lg_mh, grp->lg_link_state);

	/* add new group to hash table */
	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t)grp);
	ASSERT(err == 0);
	aggr_grp_cnt++;

	mac_perim_exit(mph);
	rw_exit(&aggr_grp_lock);
	return (0);

bail:

	/*
	 * Failure path: delete every port added so far, shut down both
	 * worker threads, free the blocked-ring array, and drop the
	 * initial reference taken above.
	 */
	grp->lg_closing = B_TRUE;

	port = grp->lg_ports;
	while (port != NULL) {
		aggr_port_t *cport;

		/* Save the link first; the port is gone after the delete. */
		cport = port->lp_next;
		aggr_port_delete(port);
		port = cport;
	}

	/*
	 * Inform the lacp_rx thread to exit.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify thread to exit.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	/* Join outside the lock, and only if a thread was signalled. */
	if (tid != 0)
		thread_join(tid);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	rw_exit(&aggr_grp_lock);
	AGGR_GRP_REFRELE(grp);
	return (err);
}

/*
 * Return a pointer to the member of a group with specified linkid.
 * Returns NULL if no port of the group has that linkid. The caller must
 * hold the group's MAC perimeter.
 */
static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
{
	aggr_port_t *port;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));

	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		if (port->lp_linkid == linkid)
			break;
	}

	return (port);
}

/*
 * Stop, detach and remove a port from a link aggregation group.
 *
 * Returns 0 on success or ENOENT if the port is not on the group's port
 * list. When non-NULL, *mac_addr_changedp and *link_state_changedp are
 * set to tell the caller whether the group's MAC address or link state
 * changed as a result; both are reported as B_FALSE on the ENOENT path.
 * The caller must hold the group's MAC perimeter, and the group must
 * have more than one port.
 */
static int
aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
    boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
{
	int rc = 0;
	aggr_port_t **pport;
	boolean_t mac_addr_changed = B_FALSE;
	boolean_t link_state_changed = B_FALSE;
	mac_perim_handle_t mph;
	uint64_t val;
	uint_t i;
	uint_t stat;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
	ASSERT(grp->lg_nports > 1);
	ASSERT(!grp->lg_closing);

	/* unlink port */
	for (pport = &grp->lg_ports; *pport != port;
	    pport = &(*pport)->lp_next) {
		if (*pport == NULL) {
			rc = ENOENT;
			goto done;
		}
	}
	*pport = port->lp_next;

	mac_perim_enter_by_mh(port->lp_mh, &mph);

	/*
	 * If the MAC address of the port being removed was assigned
	 * to the group, update the group MAC address
	 * using the MAC address of a different port.
	 */
	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
		/*
		 * Set the MAC address of the group to the
		 * MAC address of its first port.
		 */
		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
		grp->lg_mac_addr_port = grp->lg_ports;
		mac_addr_changed = B_TRUE;
	}

	link_state_changed = aggr_grp_detach_port(grp, port);

	/*
	 * Add the counter statistics of the ports while it was aggregated
	 * to the group's residual statistics. This is done by obtaining
	 * the current counter from the underlying MAC then subtracting the
	 * value of the counter at the moment it was added to the
	 * aggregation.
	 */
	for (i = 0; i < MAC_NSTAT; i++) {
		stat = i + MAC_STAT_MIN;
		if (!MAC_STAT_ISACOUNTER(stat))
			continue;
		val = aggr_port_stat(port, stat);
		val -= port->lp_stat[i];
		mutex_enter(&grp->lg_stat_lock);
		grp->lg_stat[i] += val;
		mutex_exit(&grp->lg_stat_lock);
	}
	for (i = 0; i < ETHER_NSTAT; i++) {
		stat = i + MACTYPE_STAT_MIN;
		if (!ETHER_STAT_ISACOUNTER(stat))
			continue;
		val = aggr_port_stat(port, stat);
		val -= port->lp_ether_stat[i];
		mutex_enter(&grp->lg_stat_lock);
		grp->lg_ether_stat[i] += val;
		mutex_exit(&grp->lg_stat_lock);
	}

	grp->lg_nports--;
	mac_perim_exit(mph);

	/* The pseudo TX rings are removed only after the port is detached. */
	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
	aggr_port_delete(port);

	/*
	 * If the group MAC address has changed, update the MAC address of
	 * the remaining constituent ports according to the new MAC
	 * address of the group.
1727 */ 1728 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1729 link_state_changed = B_TRUE; 1730 1731 done: 1732 if (mac_addr_changedp != NULL) 1733 *mac_addr_changedp = mac_addr_changed; 1734 if (link_state_changedp != NULL) 1735 *link_state_changedp = link_state_changed; 1736 1737 return (rc); 1738 } 1739 1740 /* 1741 * Remove one or more ports from an existing link aggregation group. 1742 */ 1743 int 1744 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1745 { 1746 int rc = 0, i; 1747 aggr_grp_t *grp = NULL; 1748 aggr_port_t *port; 1749 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1750 boolean_t link_state_update = B_FALSE, link_state_changed; 1751 mac_perim_handle_t mph, pmph; 1752 1753 /* get group corresponding to linkid */ 1754 rw_enter(&aggr_grp_lock, RW_READER); 1755 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1756 (mod_hash_val_t *)&grp) != 0) { 1757 rw_exit(&aggr_grp_lock); 1758 return (ENOENT); 1759 } 1760 AGGR_GRP_REFHOLD(grp); 1761 1762 /* 1763 * Hold the perimeter so that the aggregation won't be destroyed. 
1764 */ 1765 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1766 rw_exit(&aggr_grp_lock); 1767 1768 /* we need to keep at least one port per group */ 1769 if (nports >= grp->lg_nports) { 1770 rc = EINVAL; 1771 goto bail; 1772 } 1773 1774 /* first verify that all the groups are valid */ 1775 for (i = 0; i < nports; i++) { 1776 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1777 /* port not found */ 1778 rc = ENOENT; 1779 goto bail; 1780 } 1781 } 1782 1783 /* clear the promiscous mode for the specified ports */ 1784 for (i = 0; i < nports && rc == 0; i++) { 1785 /* lookup port */ 1786 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1787 ASSERT(port != NULL); 1788 1789 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1790 rc = aggr_port_promisc(port, B_FALSE); 1791 mac_perim_exit(pmph); 1792 } 1793 if (rc != 0) { 1794 for (i = 0; i < nports; i++) { 1795 port = aggr_grp_port_lookup(grp, 1796 ports[i].lp_linkid); 1797 ASSERT(port != NULL); 1798 1799 /* 1800 * Turn the promiscuous mode back on if it is required 1801 * to receive the non-primary address over a port, or 1802 * the promiscous mode is enabled over the aggr. 1803 */ 1804 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1805 if (port->lp_started && (grp->lg_promisc || 1806 port->lp_prom_addr != NULL)) { 1807 (void) aggr_port_promisc(port, B_TRUE); 1808 } 1809 mac_perim_exit(pmph); 1810 } 1811 goto bail; 1812 } 1813 1814 /* remove the specified ports from group */ 1815 for (i = 0; i < nports; i++) { 1816 /* lookup port */ 1817 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1818 ASSERT(port != NULL); 1819 1820 /* stop port if group has already been started */ 1821 if (grp->lg_started) { 1822 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1823 aggr_port_stop(port); 1824 mac_perim_exit(pmph); 1825 } 1826 1827 /* 1828 * aggr_rem_pseudo_tx_group() is not called here. Instead 1829 * it is called from inside aggr_grp_rem_port() after the 1830 * port has been detached. 
The reason is that 1831 * aggr_rem_pseudo_tx_group() removes one ring at a time 1832 * and if there is still traffic going on, then there 1833 * is the possibility of aggr_find_tx_ring() returning a 1834 * removed ring for transmission. Once the port has been 1835 * detached, that port will not be used and 1836 * aggr_find_tx_ring() will not return any rings 1837 * belonging to it. 1838 */ 1839 for (i = 0; i < grp->lg_rx_group_count; i++) 1840 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1841 1842 /* remove port from group */ 1843 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1844 &link_state_changed); 1845 ASSERT(rc == 0); 1846 mac_addr_update = mac_addr_update || mac_addr_changed; 1847 link_state_update = link_state_update || link_state_changed; 1848 } 1849 1850 bail: 1851 if (mac_addr_update) 1852 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1853 if (link_state_update) 1854 mac_link_update(grp->lg_mh, grp->lg_link_state); 1855 1856 mac_perim_exit(mph); 1857 AGGR_GRP_REFRELE(grp); 1858 1859 return (rc); 1860 } 1861 1862 int 1863 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1864 { 1865 aggr_grp_t *grp = NULL; 1866 aggr_port_t *port, *cport; 1867 datalink_id_t tmpid; 1868 mod_hash_val_t val; 1869 mac_perim_handle_t mph, pmph; 1870 int err; 1871 kt_did_t tid = 0; 1872 1873 rw_enter(&aggr_grp_lock, RW_WRITER); 1874 1875 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1876 (mod_hash_val_t *)&grp) != 0) { 1877 rw_exit(&aggr_grp_lock); 1878 return (ENOENT); 1879 } 1880 1881 /* 1882 * Note that dls_devnet_destroy() must be called before lg_lock is 1883 * held. Otherwise, it will deadlock if another thread is in 1884 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1885 * dls_devnet_destroy() needs to delete. 
	 */
	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	ASSERT(linkid == tmpid);

	/*
	 * Unregister from the MAC service module. Since this can
	 * fail if a client hasn't closed the MAC port, we gracefully
	 * fail the operation.
	 */
	if ((err = mac_disable(grp->lg_mh)) != 0) {
		/* Put back the devnet that was destroyed just above. */
		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
	ASSERT(grp == (aggr_grp_t *)val);

	ASSERT(aggr_grp_cnt > 0);
	aggr_grp_cnt--;
	rw_exit(&aggr_grp_lock);

	/*
	 * Inform the lacp_rx thread to exit.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify_thread to exit.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	/* Join outside the lock, and only if a thread was signalled. */
	if (tid != 0)
		thread_join(tid);

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	grp->lg_closing = B_TRUE;
	/* detach and free MAC ports associated with group */
	port = grp->lg_ports;
	while (port != NULL) {
		/* Save the link first; the port is gone after the delete. */
		cport = port->lp_next;
		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		if (grp->lg_started)
			aggr_port_stop(port);
		(void) aggr_grp_detach_port(grp, port);
		mac_perim_exit(pmph);
		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
		aggr_port_delete(port);
		port = cport;
	}

	mac_perim_exit(mph);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	/*
	 * Wait for the port's lacp timer thread and its notification callback
	 * to exit before calling mac_unregister() since both need to access
	 * the mac perimeter of the grp.
	 */
	aggr_grp_port_wait(grp);

	VERIFY(mac_unregister(grp->lg_mh) == 0);
	grp->lg_mh = NULL;

	/* Every slot's VLAN list is destroyed, not just those in use. */
	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
	}

	AGGR_GRP_REFRELE(grp);
	return (0);
}

/*
 * Return a group structure to the aggr_grp_cache. The group must have no
 * remaining references (lg_refs and lg_port_ref both 0). A key above
 * AGGR_MAX_KEY is handed back to the key_ids id space first.
 */
void
aggr_grp_free(aggr_grp_t *grp)
{
	ASSERT(grp->lg_refs == 0);
	ASSERT(grp->lg_port_ref == 0);
	if (grp->lg_key > AGGR_MAX_KEY) {
		id_free(key_ids, grp->lg_key);
		grp->lg_key = 0;
	}
	kmem_cache_free(aggr_grp_cache, grp);
}

/*
 * Report the configuration of the aggregation identified by linkid
 * through two caller-supplied callbacks: new_grp_fn is invoked once with
 * the group's attributes, then new_port_fn once per member port. fn_arg
 * is passed through to both callbacks untouched.
 *
 * Returns ENOENT if the link is not visible from the caller's zone or if
 * no group is hashed under linkid; otherwise returns 0, or the first
 * non-zero value returned by a callback (which also stops the walk).
 */
int
aggr_grp_info(datalink_id_t linkid, void *fn_arg,
    aggr_grp_info_new_grp_fn_t new_grp_fn,
    aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
{
	aggr_grp_t *grp;
	aggr_port_t *port;
	mac_perim_handle_t mph, pmph;
	int rc = 0;

	/*
	 * Make sure that the aggregation link is visible from the caller's
	 * zone.
	 */
	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
		return (ENOENT);

	rw_enter(&aggr_grp_lock, RW_READER);

	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}
	AGGR_GRP_REFHOLD(grp);

	mac_perim_enter_by_mh(grp->lg_mh, &mph);
	rw_exit(&aggr_grp_lock);

	/* Keys above AGGR_MAX_KEY are reported to the callback as 0. */
	rc = new_grp_fn(fn_arg, grp->lg_linkid,
	    (grp->lg_key > AGGR_MAX_KEY) ?
	    0 : grp->lg_key, grp->lg_addr,
	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);

	if (rc != 0)
		goto bail;

	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		/* Read each port's fields under that port's own perimeter. */
		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
		    port->lp_state, &port->lp_lacp.ActorOperPortState);
		mac_perim_exit(pmph);

		if (rc != 0)
			goto bail;
	}

bail:
	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);
	return (rc);
}

/*
 * ioctl entry point for the aggr MAC: no ioctls are handled here, so
 * every request is negatively acknowledged with ENOTSUP.
 */
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, ENOTSUP);
}

/*
 * Compute the group-wide value of a counter statistic into *val.
 * Returns ENOTSUP (leaving *val untouched) if stat is not a counter,
 * otherwise 0. The caller must hold grp->lg_stat_lock.
 */
static int
aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
{
	aggr_port_t *port;
	uint_t stat_index;

	ASSERT(MUTEX_HELD(&grp->lg_stat_lock));

	/* We only aggregate counter statistics. */
	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
		return (ENOTSUP);
	}

	/*
	 * Counter statistics for a group are computed by aggregating the
	 * counters of the members MACs while they were aggregated, plus
	 * the residual counter of the group itself, which is updated each
	 * time a MAC is removed from the group.
	 */
	*val = 0;
	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		/* actual port statistic */
		*val += aggr_port_stat(port, stat);
		/*
		 * minus the port stat when it was added, plus any residual
		 * amount for the group.
		 *
		 * NOTE(review): the group residual is added inside the
		 * per-port loop, i.e. once for every port rather than once
		 * per group -- confirm that this is intended.
		 */
		if (IS_MAC_STAT(stat)) {
			stat_index = stat - MAC_STAT_MIN;
			*val -= port->lp_stat[stat_index];
			*val += grp->lg_stat[stat_index];
		} else if (IS_MACTYPE_STAT(stat)) {
			stat_index = stat - MACTYPE_STAT_MIN;
			*val -= port->lp_ether_stat[stat_index];
			*val += grp->lg_ether_stat[stat_index];
		}
	}
	return (0);
}

/*
 * Statistics entry point for a pseudo Rx ring. When the pseudo ring is
 * backed by a HW ring, the value comes from that ring; otherwise it falls
 * back to the port's MAC-wide statistic. Always returns 0.
 */
int
aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
	aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;

	if (rx_ring->arr_hw_rh != NULL) {
		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
	} else {
		aggr_port_t *port = rx_ring->arr_port;

		*val = mac_stat_get(port->lp_mh, stat);

	}
	return (0);
}

/*
 * Statistics entry point for a pseudo Tx ring. When the pseudo ring is
 * backed by a HW ring, the value comes from that ring; otherwise (a NIC
 * without Tx rings gets a pseudo ring with a NULL HW handle) it falls
 * back to the port's MAC-wide statistic. Always returns 0.
 */
int
aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
{
	aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;

	if (tx_ring->atr_hw_rh != NULL) {
		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
	} else {
		aggr_port_t *port = tx_ring->atr_port;

		*val = mac_stat_get(port->lp_mh, stat);
	}
	return (0);
}

/*
 * Statistics entry point for the aggr MAC. Interface speed and link
 * duplex are answered from values cached in the group; everything else
 * is delegated to aggr_grp_stat(). The whole lookup runs under
 * lg_stat_lock.
 */
static int
aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	aggr_grp_t *grp = arg;
	int rval = 0;

	mutex_enter(&grp->lg_stat_lock);

	switch (stat) {
	case MAC_STAT_IFSPEED:
		*val = grp->lg_ifspeed;
		break;

	case ETHER_STAT_LINK_DUPLEX:
		*val = grp->lg_link_duplex;
		break;

	default:
		/*
		 * For all other statistics, we return the aggregated stat
		 * from the underlying ports. aggr_grp_stat() will set
		 * rval appropriately if the statistic isn't a counter.
2139 */ 2140 rval = aggr_grp_stat(grp, stat, val); 2141 } 2142 2143 mutex_exit(&grp->lg_stat_lock); 2144 return (rval); 2145 } 2146 2147 static int 2148 aggr_m_start(void *arg) 2149 { 2150 aggr_grp_t *grp = arg; 2151 aggr_port_t *port; 2152 mac_perim_handle_t mph, pmph; 2153 2154 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2155 2156 /* 2157 * Attempts to start all configured members of the group. 2158 * Group members will be attached when their link-up notification 2159 * is received. 2160 */ 2161 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2162 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2163 if (aggr_port_start(port) != 0) { 2164 mac_perim_exit(pmph); 2165 continue; 2166 } 2167 2168 /* 2169 * Turn on the promiscuous mode if it is required to receive 2170 * the non-primary address over a port, or the promiscous 2171 * mode is enabled over the aggr. 2172 */ 2173 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2174 if (aggr_port_promisc(port, B_TRUE) != 0) 2175 aggr_port_stop(port); 2176 } 2177 mac_perim_exit(pmph); 2178 } 2179 2180 grp->lg_started = B_TRUE; 2181 2182 mac_perim_exit(mph); 2183 return (0); 2184 } 2185 2186 static void 2187 aggr_m_stop(void *arg) 2188 { 2189 aggr_grp_t *grp = arg; 2190 aggr_port_t *port; 2191 mac_perim_handle_t mph, pmph; 2192 2193 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2194 2195 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2196 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2197 2198 /* reset port promiscuous mode */ 2199 (void) aggr_port_promisc(port, B_FALSE); 2200 2201 aggr_port_stop(port); 2202 mac_perim_exit(pmph); 2203 } 2204 2205 grp->lg_started = B_FALSE; 2206 mac_perim_exit(mph); 2207 } 2208 2209 static int 2210 aggr_m_promisc(void *arg, boolean_t on) 2211 { 2212 aggr_grp_t *grp = arg; 2213 aggr_port_t *port; 2214 boolean_t link_state_changed = B_FALSE; 2215 mac_perim_handle_t mph, pmph; 2216 2217 AGGR_GRP_REFHOLD(grp); 2218 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2219 2220 
ASSERT(!grp->lg_closing); 2221 2222 if (on == grp->lg_promisc) 2223 goto bail; 2224 2225 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2226 int err = 0; 2227 2228 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2229 AGGR_PORT_REFHOLD(port); 2230 if (!on && (port->lp_prom_addr == NULL)) 2231 err = aggr_port_promisc(port, B_FALSE); 2232 else if (on && port->lp_started) 2233 err = aggr_port_promisc(port, B_TRUE); 2234 2235 if (err != 0) { 2236 if (aggr_grp_detach_port(grp, port)) 2237 link_state_changed = B_TRUE; 2238 } else { 2239 /* 2240 * If a port was detached because of a previous 2241 * failure changing the promiscuity, the port 2242 * is reattached when it successfully changes 2243 * the promiscuity now, and this might cause 2244 * the link state of the aggregation to change. 2245 */ 2246 if (aggr_grp_attach_port(grp, port)) 2247 link_state_changed = B_TRUE; 2248 } 2249 mac_perim_exit(pmph); 2250 AGGR_PORT_REFRELE(port); 2251 } 2252 2253 grp->lg_promisc = on; 2254 2255 if (link_state_changed) 2256 mac_link_update(grp->lg_mh, grp->lg_link_state); 2257 2258 bail: 2259 mac_perim_exit(mph); 2260 AGGR_GRP_REFRELE(grp); 2261 2262 return (0); 2263 } 2264 2265 static void 2266 aggr_grp_port_rename(const char *new_name, void *arg) 2267 { 2268 /* 2269 * aggr port's mac client name is the format of "aggr link name" plus 2270 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2271 */ 2272 int aggr_len, link_len, clnt_name_len, i; 2273 char *str_end, *str_st, *str_del; 2274 char aggr_name[MAXNAMELEN]; 2275 char link_name[MAXNAMELEN]; 2276 char *clnt_name; 2277 aggr_grp_t *aggr_grp = arg; 2278 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2279 2280 for (i = 0; i < aggr_grp->lg_nports; i++) { 2281 clnt_name = mac_client_name(aggr_port->lp_mch); 2282 clnt_name_len = strlen(clnt_name); 2283 str_st = clnt_name; 2284 str_end = &(clnt_name[clnt_name_len]); 2285 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2286 ASSERT(str_del != NULL); 2287 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2288 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2289 bzero(aggr_name, MAXNAMELEN); 2290 bzero(link_name, MAXNAMELEN); 2291 bcopy(clnt_name, aggr_name, aggr_len); 2292 bcopy(str_del, link_name, link_len + 1); 2293 bzero(clnt_name, MAXNAMELEN); 2294 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2295 link_name); 2296 2297 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2298 aggr_port = aggr_port->lp_next; 2299 } 2300 } 2301 2302 /* 2303 * Initialize the capabilities that are advertised for the group 2304 * according to the capabilities of the constituent ports. 
2305 */ 2306 static boolean_t 2307 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2308 { 2309 aggr_grp_t *grp = arg; 2310 2311 switch (cap) { 2312 case MAC_CAPAB_HCKSUM: { 2313 uint32_t *hcksum_txflags = cap_data; 2314 *hcksum_txflags = grp->lg_hcksum_txflags; 2315 break; 2316 } 2317 case MAC_CAPAB_LSO: { 2318 mac_capab_lso_t *cap_lso = cap_data; 2319 2320 if (grp->lg_lso) { 2321 *cap_lso = grp->lg_cap_lso; 2322 break; 2323 } else { 2324 return (B_FALSE); 2325 } 2326 } 2327 case MAC_CAPAB_NO_NATIVEVLAN: 2328 return (!grp->lg_vlan); 2329 case MAC_CAPAB_NO_ZCOPY: 2330 return (!grp->lg_zcopy); 2331 case MAC_CAPAB_RINGS: { 2332 mac_capab_rings_t *cap_rings = cap_data; 2333 uint_t ring_cnt = 0; 2334 2335 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2336 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2337 2338 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2339 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2340 cap_rings->mr_rnum = ring_cnt; 2341 cap_rings->mr_gnum = grp->lg_rx_group_count; 2342 cap_rings->mr_gaddring = NULL; 2343 cap_rings->mr_gremring = NULL; 2344 } else { 2345 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2346 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2347 cap_rings->mr_gnum = 0; 2348 } 2349 cap_rings->mr_rget = aggr_fill_ring; 2350 cap_rings->mr_gget = aggr_fill_group; 2351 break; 2352 } 2353 case MAC_CAPAB_AGGR: 2354 { 2355 mac_capab_aggr_t *aggr_cap; 2356 2357 if (cap_data != NULL) { 2358 aggr_cap = cap_data; 2359 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2360 aggr_cap->mca_unicst = aggr_m_unicst; 2361 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2362 aggr_cap->mca_arg = arg; 2363 } 2364 return (B_TRUE); 2365 } 2366 default: 2367 return (B_FALSE); 2368 } 2369 return (B_TRUE); 2370 } 2371 2372 /* 2373 * Callback function for MAC layer to register groups. 
2374 */ 2375 static void 2376 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2377 mac_group_info_t *infop, mac_group_handle_t gh) 2378 { 2379 aggr_grp_t *grp = arg; 2380 2381 if (rtype == MAC_RING_TYPE_RX) { 2382 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2383 2384 rx_group->arg_gh = gh; 2385 rx_group->arg_grp = grp; 2386 2387 infop->mgi_driver = (mac_group_driver_t)rx_group; 2388 infop->mgi_start = NULL; 2389 infop->mgi_stop = NULL; 2390 infop->mgi_addmac = aggr_addmac; 2391 infop->mgi_remmac = aggr_remmac; 2392 infop->mgi_count = rx_group->arg_ring_cnt; 2393 2394 /* 2395 * Always set the HW VLAN callbacks. They are smart 2396 * enough to know when a port has HW VLAN filters to 2397 * program and when it doesn't. 2398 */ 2399 infop->mgi_addvlan = aggr_addvlan; 2400 infop->mgi_remvlan = aggr_remvlan; 2401 } else { 2402 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2403 2404 ASSERT3S(index, ==, 0); 2405 tx_group->atg_gh = gh; 2406 } 2407 } 2408 2409 /* 2410 * Callback funtion for MAC layer to register all rings. 2411 */ 2412 static void 2413 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2414 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2415 { 2416 aggr_grp_t *grp = arg; 2417 2418 switch (rtype) { 2419 case MAC_RING_TYPE_RX: { 2420 aggr_pseudo_rx_group_t *rx_group; 2421 aggr_pseudo_rx_ring_t *rx_ring; 2422 mac_intr_t aggr_mac_intr; 2423 2424 rx_group = &grp->lg_rx_groups[rg_index]; 2425 ASSERT3S(index, >=, 0); 2426 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2427 rx_ring = rx_group->arg_rings + index; 2428 rx_ring->arr_rh = rh; 2429 2430 /* 2431 * Entrypoint to enable interrupt (disable poll) and 2432 * disable interrupt (enable poll). 
2433 */ 2434 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2435 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2436 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2437 aggr_mac_intr.mi_ddi_handle = NULL; 2438 2439 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2440 infop->mri_start = aggr_pseudo_start_rx_ring; 2441 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2442 2443 infop->mri_intr = aggr_mac_intr; 2444 infop->mri_poll = aggr_rx_poll; 2445 2446 infop->mri_stat = aggr_rx_ring_stat; 2447 break; 2448 } 2449 case MAC_RING_TYPE_TX: { 2450 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2451 aggr_pseudo_tx_ring_t *tx_ring; 2452 2453 ASSERT(rg_index == -1); 2454 ASSERT(index < tx_group->atg_ring_cnt); 2455 2456 tx_ring = &tx_group->atg_rings[index]; 2457 tx_ring->atr_rh = rh; 2458 2459 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2460 infop->mri_start = NULL; 2461 infop->mri_stop = NULL; 2462 infop->mri_tx = aggr_ring_tx; 2463 infop->mri_stat = aggr_tx_ring_stat; 2464 /* 2465 * Use the hw TX ring handle to find if the ring needs 2466 * serialization or not. For NICs that do not expose 2467 * Tx rings, atr_hw_rh will be NULL. 
2468 */ 2469 if (tx_ring->atr_hw_rh != NULL) { 2470 infop->mri_flags = 2471 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2472 } 2473 break; 2474 } 2475 default: 2476 break; 2477 } 2478 } 2479 2480 static mblk_t * 2481 aggr_rx_poll(void *arg, int bytes_to_pickup) 2482 { 2483 aggr_pseudo_rx_ring_t *rr_ring = arg; 2484 aggr_port_t *port = rr_ring->arr_port; 2485 aggr_grp_t *grp = port->lp_grp; 2486 mblk_t *mp_chain, *mp, **mpp; 2487 2488 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2489 2490 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2491 return (mp_chain); 2492 2493 mpp = &mp_chain; 2494 while ((mp = *mpp) != NULL) { 2495 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2496 struct ether_header *ehp; 2497 2498 ehp = (struct ether_header *)mp->b_rptr; 2499 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2500 *mpp = mp->b_next; 2501 mp->b_next = NULL; 2502 aggr_recv_lacp(port, 2503 (mac_resource_handle_t)rr_ring, mp); 2504 continue; 2505 } 2506 } 2507 2508 if (!port->lp_collector_enabled) { 2509 *mpp = mp->b_next; 2510 mp->b_next = NULL; 2511 freemsg(mp); 2512 continue; 2513 } 2514 mpp = &mp->b_next; 2515 } 2516 return (mp_chain); 2517 } 2518 2519 static int 2520 aggr_addmac(void *arg, const uint8_t *mac_addr) 2521 { 2522 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2523 aggr_unicst_addr_t *addr, **pprev; 2524 aggr_grp_t *grp = rx_group->arg_grp; 2525 aggr_port_t *port, *p; 2526 mac_perim_handle_t mph; 2527 int err = 0; 2528 uint_t idx = rx_group->arg_index; 2529 2530 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2531 2532 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2533 mac_perim_exit(mph); 2534 return (0); 2535 } 2536 2537 /* 2538 * Insert this mac address into the list of mac addresses owned by 2539 * the aggregation pseudo group. 
2540 */ 2541 pprev = &rx_group->arg_macaddr; 2542 while ((addr = *pprev) != NULL) { 2543 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2544 mac_perim_exit(mph); 2545 return (EEXIST); 2546 } 2547 pprev = &addr->aua_next; 2548 } 2549 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2550 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2551 addr->aua_next = NULL; 2552 *pprev = addr; 2553 2554 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2555 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2556 break; 2557 2558 if (err != 0) { 2559 for (p = grp->lg_ports; p != port; p = p->lp_next) 2560 aggr_port_remmac(p, idx, mac_addr); 2561 2562 *pprev = NULL; 2563 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2564 } 2565 2566 mac_perim_exit(mph); 2567 return (err); 2568 } 2569 2570 static int 2571 aggr_remmac(void *arg, const uint8_t *mac_addr) 2572 { 2573 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2574 aggr_unicst_addr_t *addr, **pprev; 2575 aggr_grp_t *grp = rx_group->arg_grp; 2576 aggr_port_t *port; 2577 mac_perim_handle_t mph; 2578 int err = 0; 2579 2580 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2581 2582 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2583 mac_perim_exit(mph); 2584 return (0); 2585 } 2586 2587 /* 2588 * Insert this mac address into the list of mac addresses owned by 2589 * the aggregation pseudo group. 
2590 */ 2591 pprev = &rx_group->arg_macaddr; 2592 while ((addr = *pprev) != NULL) { 2593 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2594 pprev = &addr->aua_next; 2595 continue; 2596 } 2597 break; 2598 } 2599 if (addr == NULL) { 2600 mac_perim_exit(mph); 2601 return (EINVAL); 2602 } 2603 2604 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2605 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2606 2607 *pprev = addr->aua_next; 2608 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2609 2610 mac_perim_exit(mph); 2611 return (err); 2612 } 2613 2614 /* 2615 * Search for VID in the Rx group's list and return a pointer if 2616 * found. Otherwise return NULL. 2617 */ 2618 static aggr_vlan_t * 2619 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2620 { 2621 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2622 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2623 avp = list_next(&rx_group->arg_vlans, avp)) { 2624 if (avp->av_vid == vid) 2625 return (avp); 2626 } 2627 2628 return (NULL); 2629 } 2630 2631 /* 2632 * Accept traffic on the specified VID. 2633 * 2634 * Persist VLAN state in the aggr so that ports added later will 2635 * receive the correct filters. In the future it would be nice to 2636 * allow aggr to iterate its clients instead of duplicating state. 2637 */ 2638 static int 2639 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2640 { 2641 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2642 aggr_grp_t *aggr = rx_group->arg_grp; 2643 aggr_port_t *port, *p; 2644 mac_perim_handle_t mph; 2645 int err = 0; 2646 aggr_vlan_t *avp = NULL; 2647 uint_t idx = rx_group->arg_index; 2648 2649 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2650 2651 if (vid == MAC_VLAN_UNTAGGED) { 2652 /* 2653 * Aggr is both a MAC provider and MAC client. As a 2654 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2655 * client. As a client itself, it should pass 2656 * VLAN_ID_NONE to its ports. 
2657 */ 2658 vid = VLAN_ID_NONE; 2659 rx_group->arg_untagged++; 2660 goto update_ports; 2661 } 2662 2663 avp = aggr_find_vlan(rx_group, vid); 2664 2665 if (avp != NULL) { 2666 avp->av_refs++; 2667 mac_perim_exit(mph); 2668 return (0); 2669 } 2670 2671 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2672 avp->av_vid = vid; 2673 avp->av_refs = 1; 2674 2675 update_ports: 2676 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2677 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2678 break; 2679 2680 if (err != 0) { 2681 /* 2682 * If any of these calls fail then we are in a 2683 * situation where the ports have different HW state. 2684 * There's no reasonable action the MAC client can 2685 * take in this scenario to rectify the situation. 2686 */ 2687 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2688 int err2; 2689 2690 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2691 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2692 " from port %s: errno %d.", vid, 2693 mac_client_name(p->lp_mch), err2); 2694 } 2695 2696 } 2697 2698 if (vid == VLAN_ID_NONE) 2699 rx_group->arg_untagged--; 2700 2701 if (avp != NULL) { 2702 kmem_free(avp, sizeof (aggr_vlan_t)); 2703 avp = NULL; 2704 } 2705 } 2706 2707 if (avp != NULL) 2708 list_insert_tail(&rx_group->arg_vlans, avp); 2709 2710 done: 2711 mac_perim_exit(mph); 2712 return (err); 2713 } 2714 2715 /* 2716 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2717 */ 2718 static int 2719 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2720 { 2721 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2722 aggr_grp_t *aggr = rx_group->arg_grp; 2723 aggr_port_t *port, *p; 2724 mac_perim_handle_t mph; 2725 int err = 0; 2726 aggr_vlan_t *avp = NULL; 2727 uint_t idx = rx_group->arg_index; 2728 2729 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2730 2731 /* 2732 * See the comment in aggr_addvlan(). 
2733 */ 2734 if (vid == MAC_VLAN_UNTAGGED) { 2735 vid = VLAN_ID_NONE; 2736 rx_group->arg_untagged--; 2737 2738 if (rx_group->arg_untagged > 0) 2739 goto done; 2740 2741 goto update_ports; 2742 } 2743 2744 avp = aggr_find_vlan(rx_group, vid); 2745 2746 if (avp == NULL) { 2747 err = ENOENT; 2748 goto done; 2749 } 2750 2751 avp->av_refs--; 2752 2753 if (avp->av_refs > 0) 2754 goto done; 2755 2756 update_ports: 2757 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2758 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2759 break; 2760 2761 /* 2762 * See the comment in aggr_addvlan() for justification of the 2763 * use of VERIFY here. 2764 */ 2765 if (err != 0) { 2766 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2767 int err2; 2768 2769 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2770 cmn_err(CE_WARN, "Failed to add VLAN %u" 2771 " to port %s: errno %d.", vid, 2772 mac_client_name(p->lp_mch), err2); 2773 } 2774 } 2775 2776 if (avp != NULL) 2777 avp->av_refs++; 2778 2779 if (vid == VLAN_ID_NONE) 2780 rx_group->arg_untagged++; 2781 2782 goto done; 2783 } 2784 2785 if (err == 0 && avp != NULL) { 2786 VERIFY3U(avp->av_refs, ==, 0); 2787 list_remove(&rx_group->arg_vlans, avp); 2788 kmem_free(avp, sizeof (aggr_vlan_t)); 2789 } 2790 2791 done: 2792 mac_perim_exit(mph); 2793 return (err); 2794 } 2795 2796 /* 2797 * Add or remove the multicast addresses that are defined for the group 2798 * to or from the specified port. 2799 * 2800 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2801 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2802 * called when the port is either stopped or detached. 
2803 */ 2804 void 2805 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2806 { 2807 aggr_grp_t *grp = port->lp_grp; 2808 2809 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2810 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2811 2812 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2813 return; 2814 2815 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2816 } 2817 2818 static int 2819 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2820 { 2821 aggr_grp_t *grp = arg; 2822 aggr_port_t *port = NULL, *errport = NULL; 2823 mac_perim_handle_t mph; 2824 int err = 0; 2825 2826 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2827 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2828 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2829 !port->lp_started) { 2830 continue; 2831 } 2832 err = aggr_port_multicst(port, add, addrp); 2833 if (err != 0) { 2834 errport = port; 2835 break; 2836 } 2837 } 2838 2839 /* 2840 * At least one port caused error return and this error is returned to 2841 * mac, eventually a NAK would be sent upwards. 2842 * Some ports have this multicast address listed now, and some don't. 2843 * Treat this error as a whole aggr failure not individual port failure. 2844 * Therefore remove this multicast address from other ports. 
2845 */ 2846 if ((err != 0) && add) { 2847 for (port = grp->lg_ports; port != errport; 2848 port = port->lp_next) { 2849 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2850 !port->lp_started) { 2851 continue; 2852 } 2853 (void) aggr_port_multicst(port, B_FALSE, addrp); 2854 } 2855 } 2856 mac_perim_exit(mph); 2857 return (err); 2858 } 2859 2860 static int 2861 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2862 { 2863 aggr_grp_t *grp = arg; 2864 mac_perim_handle_t mph; 2865 int err; 2866 2867 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2868 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2869 0, 0); 2870 mac_perim_exit(mph); 2871 return (err); 2872 } 2873 2874 /* 2875 * Initialize the capabilities that are advertised for the group 2876 * according to the capabilities of the constituent ports. 2877 */ 2878 static void 2879 aggr_grp_capab_set(aggr_grp_t *grp) 2880 { 2881 uint32_t cksum; 2882 aggr_port_t *port; 2883 mac_capab_lso_t cap_lso; 2884 2885 ASSERT(grp->lg_mh == NULL); 2886 ASSERT(grp->lg_ports != NULL); 2887 2888 grp->lg_hcksum_txflags = (uint32_t)-1; 2889 grp->lg_zcopy = B_TRUE; 2890 grp->lg_vlan = B_TRUE; 2891 2892 grp->lg_lso = B_TRUE; 2893 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2894 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2895 2896 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2897 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2898 cksum = 0; 2899 grp->lg_hcksum_txflags &= cksum; 2900 2901 grp->lg_vlan &= 2902 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2903 2904 grp->lg_zcopy &= 2905 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2906 2907 grp->lg_lso &= 2908 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2909 if (grp->lg_lso) { 2910 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2911 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2912 cap_lso.lso_basic_tcp_ipv4.lso_max) 2913 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2914 
cap_lso.lso_basic_tcp_ipv4.lso_max; 2915 } 2916 } 2917 } 2918 2919 /* 2920 * Checks whether the capabilities of the port being added are compatible 2921 * with the current capabilities of the aggregation. 2922 */ 2923 static boolean_t 2924 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2925 { 2926 uint32_t hcksum_txflags; 2927 2928 ASSERT(grp->lg_ports != NULL); 2929 2930 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2931 grp->lg_vlan) != grp->lg_vlan) { 2932 return (B_FALSE); 2933 } 2934 2935 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2936 grp->lg_zcopy) != grp->lg_zcopy) { 2937 return (B_FALSE); 2938 } 2939 2940 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2941 if (grp->lg_hcksum_txflags != 0) 2942 return (B_FALSE); 2943 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2944 grp->lg_hcksum_txflags) { 2945 return (B_FALSE); 2946 } 2947 2948 if (grp->lg_lso) { 2949 mac_capab_lso_t cap_lso; 2950 2951 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2952 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2953 grp->lg_cap_lso.lso_flags) 2954 return (B_FALSE); 2955 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2956 cap_lso.lso_basic_tcp_ipv4.lso_max) 2957 return (B_FALSE); 2958 } else { 2959 return (B_FALSE); 2960 } 2961 } 2962 2963 return (B_TRUE); 2964 } 2965 2966 /* 2967 * Returns the maximum SDU according to the SDU of the constituent ports. 
2968 */ 2969 static uint_t 2970 aggr_grp_max_sdu(aggr_grp_t *grp) 2971 { 2972 uint_t max_sdu = (uint_t)-1; 2973 aggr_port_t *port; 2974 2975 ASSERT(grp->lg_ports != NULL); 2976 2977 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2978 uint_t port_sdu_max; 2979 2980 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2981 if (max_sdu > port_sdu_max) 2982 max_sdu = port_sdu_max; 2983 } 2984 2985 return (max_sdu); 2986 } 2987 2988 /* 2989 * Checks if the maximum SDU of the specified port is compatible 2990 * with the maximum SDU of the specified aggregation group, returns 2991 * B_TRUE if it is, B_FALSE otherwise. 2992 */ 2993 static boolean_t 2994 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2995 { 2996 uint_t port_sdu_max; 2997 2998 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2999 return (port_sdu_max >= grp->lg_max_sdu); 3000 } 3001 3002 /* 3003 * Returns the maximum margin according to the margin of the constituent ports. 3004 */ 3005 static uint32_t 3006 aggr_grp_max_margin(aggr_grp_t *grp) 3007 { 3008 uint32_t margin = UINT32_MAX; 3009 aggr_port_t *port; 3010 3011 ASSERT(grp->lg_mh == NULL); 3012 ASSERT(grp->lg_ports != NULL); 3013 3014 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3015 if (margin > port->lp_margin) 3016 margin = port->lp_margin; 3017 } 3018 3019 grp->lg_margin = margin; 3020 return (margin); 3021 } 3022 3023 /* 3024 * Checks if the maximum margin of the specified port is compatible 3025 * with the maximum margin of the specified aggregation group, returns 3026 * B_TRUE if it is, B_FALSE otherwise. 3027 */ 3028 static boolean_t 3029 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3030 { 3031 if (port->lp_margin >= grp->lg_margin) 3032 return (B_TRUE); 3033 3034 /* 3035 * See whether the current margin value is allowed to be changed to 3036 * the new value. 
3037 */ 3038 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3039 return (B_FALSE); 3040 3041 grp->lg_margin = port->lp_margin; 3042 return (B_TRUE); 3043 } 3044 3045 /* 3046 * Set MTU on individual ports of an aggregation group 3047 */ 3048 static int 3049 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3050 uint32_t *old_mtu) 3051 { 3052 boolean_t removed = B_FALSE; 3053 mac_perim_handle_t mph; 3054 mac_diag_t diag; 3055 int err, rv, retry = 0; 3056 3057 if (port->lp_mah != NULL) { 3058 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3059 port->lp_mah = NULL; 3060 removed = B_TRUE; 3061 } 3062 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3063 try_again: 3064 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3065 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3066 &port->lp_mah, 0, &diag)) != 0) { 3067 /* 3068 * following is a workaround for a bug in 'bge' driver. 3069 * See CR 6794654 for more information and this work around 3070 * will be removed once the CR is fixed. 3071 */ 3072 if (rv == EIO && retry++ < 3) { 3073 delay(2 * hz); 3074 goto try_again; 3075 } 3076 /* 3077 * if mac_unicast_add() failed while setting the MTU, 3078 * detach the port from the group. 3079 */ 3080 mac_perim_enter_by_mh(port->lp_mh, &mph); 3081 (void) aggr_grp_detach_port(grp, port); 3082 mac_perim_exit(mph); 3083 cmn_err(CE_WARN, "Unable to restart the port %s while " 3084 "setting MTU. 
Detaching the port from the aggregation.", 3085 mac_client_name(port->lp_mch)); 3086 } 3087 return (err); 3088 } 3089 3090 static int 3091 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3092 { 3093 int err = 0, i, rv; 3094 aggr_port_t *port; 3095 uint32_t *mtu; 3096 3097 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3098 3099 /* 3100 * If the MTU being set is equal to aggr group's maximum 3101 * allowable value, then there is nothing to change 3102 */ 3103 if (sdu == grp->lg_max_sdu) 3104 return (0); 3105 3106 /* 0 is aggr group's min sdu */ 3107 if (sdu == 0) 3108 return (EINVAL); 3109 3110 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3111 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3112 port = port->lp_next, i++) { 3113 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3114 } 3115 if (err != 0) { 3116 /* recover from error: reset the mtus of the ports */ 3117 aggr_port_t *tmp; 3118 3119 for (tmp = grp->lg_ports, i = 0; tmp != port; 3120 tmp = tmp->lp_next, i++) { 3121 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3122 } 3123 goto bail; 3124 } 3125 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3126 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3127 ASSERT(rv == 0); 3128 bail: 3129 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3130 return (err); 3131 } 3132 3133 /* 3134 * Callback functions for set/get of properties 3135 */ 3136 /*ARGSUSED*/ 3137 static int 3138 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3139 uint_t pr_valsize, const void *pr_val) 3140 { 3141 int err = ENOTSUP; 3142 aggr_grp_t *grp = m_driver; 3143 3144 switch (pr_num) { 3145 case MAC_PROP_MTU: { 3146 uint32_t mtu; 3147 3148 if (pr_valsize < sizeof (mtu)) { 3149 err = EINVAL; 3150 break; 3151 } 3152 bcopy(pr_val, &mtu, sizeof (mtu)); 3153 err = aggr_sdu_update(grp, mtu); 3154 break; 3155 } 3156 default: 3157 break; 3158 } 3159 return (err); 3160 } 3161 3162 typedef struct rboundary { 3163 uint32_t bval; 3164 int btype; 3165 } 
rboundary_t; 3166 3167 /* 3168 * This function finds the intersection of mtu ranges stored in arrays - 3169 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3170 * Individual arrays are assumed to contain non-overlapping ranges. 3171 * Algorithm: 3172 * A range has two boundaries - min and max. We scan all arrays and store 3173 * each boundary as a separate element in a temporary array. We also store 3174 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3175 * array. Then we sort the temporary array in ascending order. We scan the 3176 * sorted array from lower to higher values and keep a cumulative sum of 3177 * boundary types. Element in the temporary array for which the sum reaches 3178 * mcount is a min boundary of a range in the result and next element will be 3179 * max boundary. 3180 * 3181 * Example for mcount = 3, 3182 * 3183 * ----|_________|-------|_______|----|__|------ mrange[0] 3184 * 3185 * -------|________|--|____________|-----|___|-- mrange[1] 3186 * 3187 * --------|________________|-------|____|------ mrange[2] 3188 * 3189 * 3 2 1 3190 * \|/ 3191 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3192 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3193 * 3194 * same min and max 3195 * V 3196 * --------|_____|-------|__|------------|------ intersecting ranges 3197 */ 3198 void 3199 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3200 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3201 { 3202 mac_propval_uint32_range_t *rval, *ur; 3203 int rmaxcnt, rcount; 3204 size_t sz_range32; 3205 rboundary_t *ta; /* temporary array */ 3206 rboundary_t temp; 3207 boolean_t range_started = B_FALSE; 3208 int i, j, m, sum; 3209 3210 sz_range32 = sizeof (mac_propval_uint32_range_t); 3211 3212 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3213 rmaxcnt += mrange[i]->mpr_count; 3214 3215 /* Allocate enough space to store the results */ 3216 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3217 3218 /* Number of boundaries are twice as many as ranges */ 3219 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3220 3221 for (i = 0, m = 0; i < mcount; i++) { 3222 ur = &(mrange[i]->mpr_range_uint32[0]); 3223 for (j = 0; j < mrange[i]->mpr_count; j++) { 3224 ta[m].bval = ur[j].mpur_min; 3225 ta[m++].btype = 1; 3226 ta[m].bval = ur[j].mpur_max; 3227 ta[m++].btype = -1; 3228 } 3229 } 3230 3231 /* 3232 * Sort the temporary array in ascending order of bval; 3233 * if boundary values are same then sort on btype. 3234 */ 3235 for (i = 0; i < m-1; i++) { 3236 for (j = i+1; j < m; j++) { 3237 if ((ta[i].bval > ta[j].bval) || 3238 ((ta[i].bval == ta[j].bval) && 3239 (ta[i].btype < ta[j].btype))) { 3240 temp = ta[i]; 3241 ta[i] = ta[j]; 3242 ta[j] = temp; 3243 } 3244 } 3245 } 3246 3247 /* Walk through temporary array to find all ranges in the results */ 3248 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3249 sum += ta[i].btype; 3250 if (sum == mcount) { 3251 rval[rcount].mpur_min = ta[i].bval; 3252 range_started = B_TRUE; 3253 } else if (sum < mcount && range_started) { 3254 rval[rcount++].mpur_max = ta[i].bval; 3255 range_started = B_FALSE; 3256 } 3257 } 3258 3259 *prval = rval; 3260 *prmaxcnt = rmaxcnt; 3261 *prcount = rcount; 3262 3263 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3264 } 3265 3266 /* 3267 * Returns the mtu ranges which could be supported by aggr group. 3268 * prmaxcnt returns the size of the buffer prval, prcount returns 3269 * the number of valid entries in prval. Caller is responsible 3270 * for freeing up prval. 
3271 */ 3272 int 3273 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3274 int *prmaxcnt, int *prcount) 3275 { 3276 mac_propval_range_t **vals; 3277 aggr_port_t *port; 3278 mac_perim_handle_t mph; 3279 uint_t i, numr; 3280 int err = 0; 3281 size_t sz_propval, sz_range32; 3282 size_t size; 3283 3284 sz_propval = sizeof (mac_propval_range_t); 3285 sz_range32 = sizeof (mac_propval_uint32_range_t); 3286 3287 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3288 3289 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3290 KM_SLEEP); 3291 3292 for (port = grp->lg_ports, i = 0; port != NULL; 3293 port = port->lp_next, i++) { 3294 3295 size = sz_propval; 3296 vals[i] = kmem_alloc(size, KM_SLEEP); 3297 vals[i]->mpr_count = 1; 3298 3299 mac_perim_enter_by_mh(port->lp_mh, &mph); 3300 3301 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3302 NULL, 0, vals[i], NULL); 3303 if (err == ENOSPC) { 3304 /* 3305 * Not enough space to hold all ranges. 3306 * Allocate extra space as indicated and retry. 
3307 */ 3308 numr = vals[i]->mpr_count; 3309 kmem_free(vals[i], sz_propval); 3310 size = sz_propval + (numr - 1) * sz_range32; 3311 vals[i] = kmem_alloc(size, KM_SLEEP); 3312 vals[i]->mpr_count = numr; 3313 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3314 NULL, 0, vals[i], NULL); 3315 ASSERT(err != ENOSPC); 3316 } 3317 mac_perim_exit(mph); 3318 if (err != 0) { 3319 kmem_free(vals[i], size); 3320 vals[i] = NULL; 3321 break; 3322 } 3323 } 3324 3325 /* 3326 * if any of the underlying ports does not support changing MTU then 3327 * just return ENOTSUP 3328 */ 3329 if (port != NULL) { 3330 ASSERT(err != 0); 3331 goto done; 3332 } 3333 3334 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3335 prcount); 3336 3337 done: 3338 for (i = 0; i < grp->lg_nports; i++) { 3339 if (vals[i] != NULL) { 3340 numr = vals[i]->mpr_count; 3341 size = sz_propval + (numr - 1) * sz_range32; 3342 kmem_free(vals[i], size); 3343 } 3344 } 3345 3346 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3347 return (err); 3348 } 3349 3350 static void 3351 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3352 mac_prop_info_handle_t prh) 3353 { 3354 aggr_grp_t *grp = m_driver; 3355 mac_propval_uint32_range_t *rval = NULL; 3356 int i, rcount, rmaxcnt; 3357 int err = 0; 3358 3359 _NOTE(ARGUNUSED(pr_name)); 3360 3361 switch (pr_num) { 3362 case MAC_PROP_MTU: 3363 3364 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3365 &rcount); 3366 if (err != 0) { 3367 ASSERT(rval == NULL); 3368 return; 3369 } 3370 for (i = 0; i < rcount; i++) { 3371 mac_prop_info_set_range_uint32(prh, 3372 rval[i].mpur_min, rval[i].mpur_max); 3373 } 3374 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3375 break; 3376 } 3377 }