1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Joyent, Inc.
  24  */
  25 
  26 /*
  27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
  28  *
  29  * An instance of the structure aggr_grp_t is allocated for each
  30  * link aggregation group. When created, aggr_grp_t objects are
  31  * entered into the aggr_grp_hash hash table maintained by the modhash
  32  * module. The hash key is the linkid associated with the link
  33  * aggregation group.
  34  *
 * A set of MAC ports is associated with each link aggregation group.
  36  *
  37  * Aggr pseudo TX rings
  38  * --------------------
  39  * The underlying ports (NICs) in an aggregation can have TX rings. To
  40  * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * they are already implemented on the RX side, where they are called
 * pseudo RX rings. The same concept is extended to the TX side where
  44  * each TX ring of an underlying port is reflected in aggr as a pseudo
  45  * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
  46  * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
  47  * TX ring is given to the aggregation layer.
  48  *
  49  * With this change, the outgoing stack depth looks much better:
  50  *
  51  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
  53  *
  54  * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
  55  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
  56  *
  57  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
  58  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
  59  * ring belonging to a port on which the packet has to be sent.
  60  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
  61  * policy and then uses the fanout_hint passed to it to pick a TX ring from
  62  * the selected port.
  63  *
  64  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
  65  * bandwidth limit is applied first on the outgoing packet and the packets
  66  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
  67  * particular TX ring.
  68  */
  69 
  70 #include <sys/types.h>
  71 #include <sys/sysmacros.h>
  72 #include <sys/conf.h>
  73 #include <sys/cmn_err.h>
  74 #include <sys/disp.h>
  75 #include <sys/list.h>
  76 #include <sys/ksynch.h>
  77 #include <sys/kmem.h>
  78 #include <sys/stream.h>
  79 #include <sys/modctl.h>
  80 #include <sys/ddi.h>
  81 #include <sys/sunddi.h>
  82 #include <sys/atomic.h>
  83 #include <sys/stat.h>
  84 #include <sys/modhash.h>
  85 #include <sys/id_space.h>
  86 #include <sys/strsun.h>
  87 #include <sys/cred.h>
  88 #include <sys/dlpi.h>
  89 #include <sys/zone.h>
  90 #include <sys/mac_provider.h>
  91 #include <sys/dls.h>
  92 #include <sys/vlan.h>
  93 #include <sys/aggr.h>
  94 #include <sys/aggr_impl.h>
  95 
  96 static int aggr_m_start(void *);
  97 static void aggr_m_stop(void *);
  98 static int aggr_m_promisc(void *, boolean_t);
  99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
 100 static int aggr_m_unicst(void *, const uint8_t *);
 101 static int aggr_m_stat(void *, uint_t, uint64_t *);
 102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
 103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
 104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 105     const void *);
 106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
 107     mac_prop_info_handle_t);
 108 
 109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
 110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
 111     boolean_t *);
 112 
 113 static void aggr_grp_capab_set(aggr_grp_t *);
 114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
 115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
 116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
 117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
 118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
 119 
 120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
 123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
 124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
 125 static int aggr_addmac(void *, const uint8_t *);
 126 static int aggr_remmac(void *, const uint8_t *);
 127 static int aggr_addvlan(mac_group_driver_t, uint16_t);
 128 static int aggr_remvlan(mac_group_driver_t, uint16_t);
 129 static mblk_t *aggr_rx_poll(void *, int);
 130 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
 131     const int, mac_ring_info_t *, mac_ring_handle_t);
 132 static void aggr_fill_group(void *, mac_ring_type_t, const int,
 133     mac_group_info_t *, mac_group_handle_t);
 134 
 135 static kmem_cache_t     *aggr_grp_cache;
 136 static mod_hash_t       *aggr_grp_hash;
 137 static krwlock_t        aggr_grp_lock;
 138 static uint_t           aggr_grp_cnt;
 139 static id_space_t       *key_ids;
 140 
 141 #define GRP_HASHSZ              64
 142 #define GRP_HASH_KEY(linkid)    ((mod_hash_key_t)(uintptr_t)linkid)
 143 #define AGGR_PORT_NAME_DELIMIT '-'
 144 
 145 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
 146 
 147 #define AGGR_M_CALLBACK_FLAGS   \
 148         (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
 149 
 150 static mac_callbacks_t aggr_m_callbacks = {
 151         AGGR_M_CALLBACK_FLAGS,
 152         aggr_m_stat,
 153         aggr_m_start,
 154         aggr_m_stop,
 155         aggr_m_promisc,
 156         aggr_m_multicst,
 157         NULL,
 158         NULL,
 159         NULL,
 160         aggr_m_ioctl,
 161         aggr_m_capab_get,
 162         NULL,
 163         NULL,
 164         aggr_m_setprop,
 165         NULL,
 166         aggr_m_propinfo
 167 };
 168 
 169 /*ARGSUSED*/
 170 static int
 171 aggr_grp_constructor(void *buf, void *arg, int kmflag)
 172 {
 173         aggr_grp_t *grp = buf;
 174 
 175         bzero(grp, sizeof (*grp));
 176         mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
 177         cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
 178         rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
 179         mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
 180         cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
 181         mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
 182         cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
 183         grp->lg_link_state = LINK_STATE_UNKNOWN;
 184         return (0);
 185 }
 186 
 187 /*ARGSUSED*/
 188 static void
 189 aggr_grp_destructor(void *buf, void *arg)
 190 {
 191         aggr_grp_t *grp = buf;
 192 
 193         if (grp->lg_tx_ports != NULL) {
 194                 kmem_free(grp->lg_tx_ports,
 195                     grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 196         }
 197 
 198         mutex_destroy(&grp->lg_lacp_lock);
 199         cv_destroy(&grp->lg_lacp_cv);
 200         mutex_destroy(&grp->lg_port_lock);
 201         cv_destroy(&grp->lg_port_cv);
 202         rw_destroy(&grp->lg_tx_lock);
 203         mutex_destroy(&grp->lg_tx_flowctl_lock);
 204         cv_destroy(&grp->lg_tx_flowctl_cv);
 205 }
 206 
 207 void
 208 aggr_grp_init(void)
 209 {
 210         aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
 211             sizeof (aggr_grp_t), 0, aggr_grp_constructor,
 212             aggr_grp_destructor, NULL, NULL, NULL, 0);
 213 
 214         aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
 215             GRP_HASHSZ, mod_hash_null_valdtor);
 216         rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
 217         aggr_grp_cnt = 0;
 218 
 219         /*
 220          * Allocate an id space to manage key values (when key is not
 221          * specified). The range of the id space will be from
 222          * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
 223          * uses a 16-bit key.
 224          */
 225         key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
 226         ASSERT(key_ids != NULL);
 227 }
 228 
 229 void
 230 aggr_grp_fini(void)
 231 {
 232         id_space_destroy(key_ids);
 233         rw_destroy(&aggr_grp_lock);
 234         mod_hash_destroy_idhash(aggr_grp_hash);
 235         kmem_cache_destroy(aggr_grp_cache);
 236 }
 237 
 238 uint_t
 239 aggr_grp_count(void)
 240 {
 241         uint_t  count;
 242 
 243         rw_enter(&aggr_grp_lock, RW_READER);
 244         count = aggr_grp_cnt;
 245         rw_exit(&aggr_grp_lock);
 246         return (count);
 247 }
 248 
 249 /*
 250  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
 251  * requires the mac perimeter, this function holds a reference of the aggr
 252  * and aggr won't call mac_unregister() until this reference drops to 0.
 253  */
 254 void
 255 aggr_grp_port_hold(aggr_port_t *port)
 256 {
 257         aggr_grp_t      *grp = port->lp_grp;
 258 
 259         AGGR_PORT_REFHOLD(port);
 260         mutex_enter(&grp->lg_port_lock);
 261         grp->lg_port_ref++;
 262         mutex_exit(&grp->lg_port_lock);
 263 }
 264 
 265 /*
 266  * Release the reference of the grp and inform aggr_grp_delete() calling
 267  * mac_unregister() is now safe.
 268  */
 269 void
 270 aggr_grp_port_rele(aggr_port_t *port)
 271 {
 272         aggr_grp_t      *grp = port->lp_grp;
 273 
 274         mutex_enter(&grp->lg_port_lock);
 275         if (--grp->lg_port_ref == 0)
 276                 cv_signal(&grp->lg_port_cv);
 277         mutex_exit(&grp->lg_port_lock);
 278         AGGR_PORT_REFRELE(port);
 279 }
 280 
 281 /*
 282  * Wait for the port's lacp timer thread and the port's notification callback
 283  * to exit.
 284  */
 285 void
 286 aggr_grp_port_wait(aggr_grp_t *grp)
 287 {
 288         mutex_enter(&grp->lg_port_lock);
 289         if (grp->lg_port_ref != 0)
 290                 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
 291         mutex_exit(&grp->lg_port_lock);
 292 }
 293 
 294 /*
 295  * Attach a port to a link aggregation group.
 296  *
 297  * A port is attached to a link aggregation group once its speed
 298  * and link state have been verified.
 299  *
 300  * Returns B_TRUE if the group link state or speed has changed. If
 301  * it's the case, the caller must notify the MAC layer via a call
 302  * to mac_link().
 303  */
 304 boolean_t
 305 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
 306 {
 307         boolean_t link_state_changed = B_FALSE;
 308 
 309         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 310         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 311 
 312         if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
 313                 return (B_FALSE);
 314 
 315         /*
 316          * Validate the MAC port link speed and update the group
 317          * link speed if needed.
 318          */
 319         if (port->lp_ifspeed == 0 ||
 320             port->lp_link_state != LINK_STATE_UP ||
 321             port->lp_link_duplex != LINK_DUPLEX_FULL) {
 322                 /*
 323                  * Can't attach a MAC port with unknown link speed,
 324                  * down link, or not in full duplex mode.
 325                  */
 326                 return (B_FALSE);
 327         }
 328 
 329         mutex_enter(&grp->lg_stat_lock);
 330         if (grp->lg_ifspeed == 0) {
 331                 /*
 332                  * The group inherits the speed of the first link being
 333                  * attached.
 334                  */
 335                 grp->lg_ifspeed = port->lp_ifspeed;
 336                 link_state_changed = B_TRUE;
 337         } else if (grp->lg_ifspeed != port->lp_ifspeed) {
 338                 /*
 339                  * The link speed of the MAC port must be the same as
 340                  * the group link speed, as per 802.3ad. Since it is
 341                  * not, the attach is cancelled.
 342                  */
 343                 mutex_exit(&grp->lg_stat_lock);
 344                 return (B_FALSE);
 345         }
 346         mutex_exit(&grp->lg_stat_lock);
 347 
 348         grp->lg_nattached_ports++;
 349 
 350         /*
 351          * Update the group link state.
 352          */
 353         if (grp->lg_link_state != LINK_STATE_UP) {
 354                 grp->lg_link_state = LINK_STATE_UP;
 355                 mutex_enter(&grp->lg_stat_lock);
 356                 grp->lg_link_duplex = LINK_DUPLEX_FULL;
 357                 mutex_exit(&grp->lg_stat_lock);
 358                 link_state_changed = B_TRUE;
 359         }
 360 
 361         /*
 362          * Update port's state.
 363          */
 364         port->lp_state = AGGR_PORT_STATE_ATTACHED;
 365 
 366         aggr_grp_multicst_port(port, B_TRUE);
 367 
 368         /*
 369          * Set port's receive callback
 370          */
 371         mac_rx_set(port->lp_mch, aggr_recv_cb, port);
 372 
 373         /*
 374          * If LACP is OFF, the port can be used to send data as soon
 375          * as its link is up and verified to be compatible with the
 376          * aggregation.
 377          *
 378          * If LACP is active or passive, notify the LACP subsystem, which
 379          * will enable sending on the port following the LACP protocol.
 380          */
 381         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 382                 aggr_send_port_enable(port);
 383         else
 384                 aggr_lacp_port_attached(port);
 385 
 386         return (link_state_changed);
 387 }
 388 
 389 boolean_t
 390 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
 391 {
 392         boolean_t link_state_changed = B_FALSE;
 393 
 394         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 395         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 396 
 397         /* update state */
 398         if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
 399                 return (B_FALSE);
 400 
 401         mac_rx_clear(port->lp_mch);
 402 
 403         aggr_grp_multicst_port(port, B_FALSE);
 404 
 405         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 406                 aggr_send_port_disable(port);
 407         else
 408                 aggr_lacp_port_detached(port);
 409 
 410         port->lp_state = AGGR_PORT_STATE_STANDBY;
 411 
 412         grp->lg_nattached_ports--;
 413         if (grp->lg_nattached_ports == 0) {
 414                 /* the last attached MAC port of the group is being detached */
 415                 grp->lg_link_state = LINK_STATE_DOWN;
 416                 mutex_enter(&grp->lg_stat_lock);
 417                 grp->lg_ifspeed = 0;
 418                 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
 419                 mutex_exit(&grp->lg_stat_lock);
 420                 link_state_changed = B_TRUE;
 421         }
 422 
 423         return (link_state_changed);
 424 }
 425 
 426 /*
 427  * Update the MAC addresses of the constituent ports of the specified
 428  * group. This function is invoked:
 429  * - after creating a new aggregation group.
 430  * - after adding new ports to an aggregation group.
 431  * - after removing a port from a group when the MAC address of
 432  *   that port was used for the MAC address of the group.
 433  * - after the MAC address of a port changed when the MAC address
 434  *   of that port was used for the MAC address of the group.
 435  *
 436  * Return true if the link state of the aggregation changed, for example
 437  * as a result of a failure changing the MAC address of one of the
 438  * constituent ports.
 439  */
 440 boolean_t
 441 aggr_grp_update_ports_mac(aggr_grp_t *grp)
 442 {
 443         aggr_port_t *cport;
 444         boolean_t link_state_changed = B_FALSE;
 445         mac_perim_handle_t mph;
 446 
 447         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 448 
 449         for (cport = grp->lg_ports; cport != NULL;
 450             cport = cport->lp_next) {
 451                 mac_perim_enter_by_mh(cport->lp_mh, &mph);
 452                 if (aggr_port_unicst(cport) != 0) {
 453                         if (aggr_grp_detach_port(grp, cport))
 454                                 link_state_changed = B_TRUE;
 455                 } else {
 456                         /*
 457                          * If a port was detached because of a previous
 458                          * failure changing the MAC address, the port is
 459                          * reattached when it successfully changes the MAC
 460                          * address now, and this might cause the link state
 461                          * of the aggregation to change.
 462                          */
 463                         if (aggr_grp_attach_port(grp, cport))
 464                                 link_state_changed = B_TRUE;
 465                 }
 466                 mac_perim_exit(mph);
 467         }
 468         return (link_state_changed);
 469 }
 470 
 471 /*
 472  * Invoked when the MAC address of a port has changed. If the port's
 473  * MAC address was used for the group MAC address, set mac_addr_changedp
 474  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
 475  * notification. If the link state changes due to detach/attach of
 476  * the constituent port, set link_state_changedp to B_TRUE to indicate
 477  * to the caller that it should send a MAC_NOTE_LINK notification. In both
 478  * cases, it is the responsibility of the caller to invoke notification
 479  * functions after releasing the the port lock.
 480  */
 481 void
 482 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
 483     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
 484 {
 485         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 486         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 487         ASSERT(mac_addr_changedp != NULL);
 488         ASSERT(link_state_changedp != NULL);
 489 
 490         *mac_addr_changedp = B_FALSE;
 491         *link_state_changedp = B_FALSE;
 492 
 493         if (grp->lg_addr_fixed) {
 494                 /*
 495                  * The group is using a fixed MAC address or an automatic
 496                  * MAC address has not been set.
 497                  */
 498                 return;
 499         }
 500 
 501         if (grp->lg_mac_addr_port == port) {
 502                 /*
 503                  * The MAC address of the port was assigned to the group
 504                  * MAC address. Update the group MAC address.
 505                  */
 506                 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
 507                 *mac_addr_changedp = B_TRUE;
 508         } else {
 509                 /*
 510                  * Update the actual port MAC address to the MAC address
 511                  * of the group.
 512                  */
 513                 if (aggr_port_unicst(port) != 0) {
 514                         *link_state_changedp = aggr_grp_detach_port(grp, port);
 515                 } else {
 516                         /*
 517                          * If a port was detached because of a previous
 518                          * failure changing the MAC address, the port is
 519                          * reattached when it successfully changes the MAC
 520                          * address now, and this might cause the link state
 521                          * of the aggregation to change.
 522                          */
 523                         *link_state_changedp = aggr_grp_attach_port(grp, port);
 524                 }
 525         }
 526 }
 527 
 528 /*
 529  * Add a port to a link aggregation group.
 530  */
 531 static int
 532 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
 533     aggr_port_t **pp)
 534 {
 535         aggr_port_t *port, **cport;
 536         mac_perim_handle_t mph;
 537         zoneid_t port_zoneid = ALL_ZONES;
 538         int err;
 539 
 540         /* The port must be int the same zone as the aggregation. */
 541         if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
 542                 port_zoneid = GLOBAL_ZONEID;
 543         if (grp->lg_zoneid != port_zoneid)
 544                 return (EBUSY);
 545 
 546         /*
 547          * lg_mh could be NULL when the function is called during the creation
 548          * of the aggregation.
 549          */
 550         ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
 551 
 552         /* create new port */
 553         err = aggr_port_create(grp, port_linkid, force, &port);
 554         if (err != 0)
 555                 return (err);
 556 
 557         mac_perim_enter_by_mh(port->lp_mh, &mph);
 558 
 559         /* add port to list of group constituent ports */
 560         cport = &grp->lg_ports;
 561         while (*cport != NULL)
 562                 cport = &((*cport)->lp_next);
 563         *cport = port;
 564 
 565         /*
 566          * Back reference to the group it is member of. A port always
 567          * holds a reference to its group to ensure that the back
 568          * reference is always valid.
 569          */
 570         port->lp_grp = grp;
 571         AGGR_GRP_REFHOLD(grp);
 572         grp->lg_nports++;
 573 
 574         aggr_lacp_init_port(port);
 575         mac_perim_exit(mph);
 576 
 577         if (pp != NULL)
 578                 *pp = port;
 579 
 580         return (0);
 581 }
 582 
 583 /*
 584  * This is called in response to either our LACP state machine or a MAC
 585  * notification that the link has gone down via aggr_send_port_disable(). At
 586  * this point, we may need to update our default ring. To that end, we go
 587  * through the set of ports (underlying datalinks in an aggregation) that are
 588  * currently enabled to transmit data. If all our links have been disabled for
 589  * transmit, then we don't do anything.
 590  *
 591  * Note, because we only have a single TX group, we don't have to worry about
 592  * the rings moving between groups and the chance that mac will reassign it
 593  * unless someone removes a port, at which point, we play it safe and call this
 594  * again.
 595  */
 596 void
 597 aggr_grp_update_default(aggr_grp_t *grp)
 598 {
 599         aggr_port_t *port;
 600         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 601 
 602         rw_enter(&grp->lg_tx_lock, RW_WRITER);
 603 
 604         if (grp->lg_ntx_ports == 0) {
 605                 rw_exit(&grp->lg_tx_lock);
 606                 return;
 607         }
 608 
 609         port = grp->lg_tx_ports[0];
 610         ASSERT(port->lp_tx_ring_cnt > 0);
 611         mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
 612         rw_exit(&grp->lg_tx_lock);
 613 }
 614 
 615 /*
 616  * Add a pseudo RX ring for the given HW ring handle.
 617  */
 618 static int
 619 aggr_add_pseudo_rx_ring(aggr_port_t *port,
 620     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 621 {
 622         aggr_pseudo_rx_ring_t   *ring;
 623         int                     err;
 624         int                     j;
 625 
 626         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 627                 ring = rx_grp->arg_rings + j;
 628                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
 629                         break;
 630         }
 631 
 632         /*
 633          * No slot for this new RX ring.
 634          */
 635         if (j == MAX_RINGS_PER_GROUP)
 636                 return (EIO);
 637 
 638         ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 639         ring->arr_hw_rh = hw_rh;
 640         ring->arr_port = port;
 641         rx_grp->arg_ring_cnt++;
 642 
 643         /*
 644          * The group is already registered, dynamically add a new ring to the
 645          * mac group.
 646          */
 647         if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
 648                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 649                 ring->arr_hw_rh = NULL;
 650                 ring->arr_port = NULL;
 651                 rx_grp->arg_ring_cnt--;
 652         } else {
 653                 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 654                     mac_find_ring(rx_grp->arg_gh, j));
 655         }
 656         return (err);
 657 }
 658 
 659 /*
 660  * Remove the pseudo RX ring of the given HW ring handle.
 661  */
 662 static void
 663 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 664 {
 665         aggr_pseudo_rx_ring_t   *ring;
 666         int                     j;
 667 
 668         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 669                 ring = rx_grp->arg_rings + j;
 670                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
 671                     ring->arr_hw_rh != hw_rh) {
 672                         continue;
 673                 }
 674 
 675                 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
 676 
 677                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 678                 ring->arr_hw_rh = NULL;
 679                 ring->arr_port = NULL;
 680                 rx_grp->arg_ring_cnt--;
 681                 mac_hwring_teardown(hw_rh);
 682                 break;
 683         }
 684 }
 685 
 686 /*
 687  * Create pseudo rings over the HW rings of the port.
 688  *
 689  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
 690  *
 691  * o Program existing unicast filters on the pseudo group into the HW group.
 692  *
 693  * o Program existing VLAN filters on the pseudo group into the HW group.
 694  */
 695 static int
 696 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 697 {
 698         aggr_grp_t              *grp = port->lp_grp;
 699         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 700         aggr_unicst_addr_t      *addr, *a;
 701         mac_perim_handle_t      pmph;
 702         aggr_vlan_t             *avp;
 703         int                     hw_rh_cnt, i = 0, j;
 704         int                     err = 0;
 705 
 706         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 707         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 708 
 709         /*
 710          * This function must be called after the aggr registers its MAC
 711          * and its Rx group has been initialized.
 712          */
 713         ASSERT(rx_grp->arg_gh != NULL);
 714 
 715         /*
 716          * Get the list of the underlying HW rings.
 717          */
 718         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 719             &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
 720 
 721         if (port->lp_hwgh != NULL) {
 722                 /*
 723                  * Quiesce the HW ring and the MAC SRS on the ring. Note
 724                  * that the HW ring will be restarted when the pseudo ring
 725                  * is started. At that time all the packets will be
 726                  * directly passed up to the pseudo Rx ring and handled
 727                  * by MAC SRS created over the pseudo Rx ring.
 728                  */
 729                 mac_rx_client_quiesce(port->lp_mch);
 730                 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
 731         }
 732 
 733         /*
 734          * Add existing VLAN and unicast address filters to the port.
 735          */
 736         for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 737             avp = list_next(&rx_grp->arg_vlans, avp)) {
 738                 if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0)
 739                         goto err;
 740         }
 741 
 742         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
 743                 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
 744                         goto err;
 745         }
 746 
 747         for (i = 0; i < hw_rh_cnt; i++) {
 748                 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
 749                 if (err != 0)
 750                         goto err;
 751         }
 752 
 753         port->lp_rx_grp_added = B_TRUE;
 754         mac_perim_exit(pmph);
 755         return (0);
 756 
 757 err:
 758         ASSERT(err != 0);
 759 
 760         for (j = 0; j < i; j++)
 761                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
 762 
 763         for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
 764                 aggr_port_remmac(port, a->aua_addr);
 765 
 766         if (avp != NULL)
 767                 avp = list_prev(&rx_grp->arg_vlans, avp);
 768 
 769         for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
 770                 int err2;
 771 
 772                 if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) {
 773                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 774                             ": errno %d.", avp->av_vid,
 775                             mac_client_name(port->lp_mch), err2);
 776                 }
 777         }
 778 
 779         if (port->lp_hwgh != NULL) {
 780                 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 781                 mac_rx_client_restart(port->lp_mch);
 782                 port->lp_hwgh = NULL;
 783         }
 784 
 785         mac_perim_exit(pmph);
 786         return (err);
 787 }
 788 
 789 /*
 790  * Destroy the pseudo rings mapping to this port and remove all VLAN
 791  * and unicast filters from this port. Even if there are no underlying
 792  * HW rings we must still remove the unicast filters to take the port
 793  * out of promisc mode.
 794  */
 795 static void
 796 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 797 {
 798         aggr_grp_t              *grp = port->lp_grp;
 799         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 800         aggr_unicst_addr_t      *addr;
 801         mac_group_handle_t      hwgh;
 802         mac_perim_handle_t      pmph;
 803         int                     hw_rh_cnt, i;
 804 
 805         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 806         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 807 
 808         if (!port->lp_rx_grp_added)
 809                 goto done;
 810 
 811         ASSERT(rx_grp->arg_gh != NULL);
 812         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 813             &hwgh, hw_rh, MAC_RING_TYPE_RX);
 814 
 815         for (i = 0; i < hw_rh_cnt; i++)
 816                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
 817 
 818         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
 819                 aggr_port_remmac(port, addr->aua_addr);
 820 
 821         for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 822             avp = list_next(&rx_grp->arg_vlans, avp)) {
 823                 int err;
 824 
 825                 if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) {
 826                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 827                             ": errno %d.", avp->av_vid,
 828                             mac_client_name(port->lp_mch), err);
 829                 }
 830         }
 831 
 832         if (port->lp_hwgh != NULL) {
 833                 port->lp_hwgh = NULL;
 834 
 835                 /*
 836                  * First clear the permanent-quiesced flag of the RX srs then
 837                  * restart the HW ring and the mac srs on the ring. Note that
 838                  * the HW ring and associated SRS will soon been removed when
 839                  * the port is removed from the aggr.
 840                  */
 841                 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 842                 mac_rx_client_restart(port->lp_mch);
 843         }
 844 
 845         port->lp_rx_grp_added = B_FALSE;
 846 done:
 847         mac_perim_exit(pmph);
 848 }
 849 
 850 /*
 851  * Add a pseudo TX ring for the given HW ring handle.
 852  */
 853 static int
 854 aggr_add_pseudo_tx_ring(aggr_port_t *port,
 855     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
 856     mac_ring_handle_t *pseudo_rh)
 857 {
 858         aggr_pseudo_tx_ring_t   *ring;
 859         int                     err;
 860         int                     i;
 861 
 862         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 863         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 864                 ring = tx_grp->atg_rings + i;
 865                 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
 866                         break;
 867         }
 868         /*
 869          * No slot for this new TX ring.
 870          */
 871         if (i == MAX_RINGS_PER_GROUP)
 872                 return (EIO);
 873         /*
 874          * The following 4 statements needs to be done before
 875          * calling mac_group_add_ring(). Otherwise it will
 876          * result in an assertion failure in mac_init_ring().
 877          */
 878         ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
 879         ring->atr_hw_rh = hw_rh;
 880         ring->atr_port = port;
 881         tx_grp->atg_ring_cnt++;
 882 
 883         /*
 884          * The TX side has no concept of ring groups unlike RX groups.
 885          * There is just a single group which stores all the TX rings.
 886          * This group will be used to store aggr's pseudo TX rings.
 887          */
 888         if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
 889                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 890                 ring->atr_hw_rh = NULL;
 891                 ring->atr_port = NULL;
 892                 tx_grp->atg_ring_cnt--;
 893         } else {
 894                 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
 895                 if (hw_rh != NULL) {
 896                         mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 897                             mac_find_ring(tx_grp->atg_gh, i));
 898                 }
 899         }
 900 
 901         return (err);
 902 }
 903 
 904 /*
 905  * Remove the pseudo TX ring of the given HW ring handle.
 906  */
 907 static void
 908 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
 909     mac_ring_handle_t pseudo_hw_rh)
 910 {
 911         aggr_pseudo_tx_ring_t   *ring;
 912         int                     i;
 913 
 914         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 915                 ring = tx_grp->atg_rings + i;
 916                 if (ring->atr_rh != pseudo_hw_rh)
 917                         continue;
 918 
 919                 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
 920                 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
 921                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 922                 mac_hwring_teardown(ring->atr_hw_rh);
 923                 ring->atr_hw_rh = NULL;
 924                 ring->atr_port = NULL;
 925                 tx_grp->atg_ring_cnt--;
 926                 break;
 927         }
 928 }
 929 
 930 /*
 931  * This function is called to create pseudo rings over hardware rings of
 932  * the underlying device. There is a 1:1 mapping between the pseudo TX
 933  * rings of the aggr and the hardware rings of the underlying port.
 934  */
 935 static int
 936 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 937 {
 938         aggr_grp_t              *grp = port->lp_grp;
 939         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
 940         mac_perim_handle_t      pmph;
 941         int                     hw_rh_cnt, i = 0, j;
 942         int                     err = 0;
 943 
 944         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 945         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 946 
 947         /*
 948          * Get the list the the underlying HW rings.
 949          */
 950         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 951             NULL, hw_rh, MAC_RING_TYPE_TX);
 952 
 953         /*
 954          * Even if the underlying NIC does not have TX rings, we
 955          * still make a psuedo TX ring for that NIC with NULL as
 956          * the ring handle.
 957          */
 958         if (hw_rh_cnt == 0)
 959                 port->lp_tx_ring_cnt = 1;
 960         else
 961                 port->lp_tx_ring_cnt = hw_rh_cnt;
 962 
 963         port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 964             port->lp_tx_ring_cnt), KM_SLEEP);
 965         port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 966             port->lp_tx_ring_cnt), KM_SLEEP);
 967 
 968         if (hw_rh_cnt == 0) {
 969                 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
 970                     NULL, &pseudo_rh)) == 0) {
 971                         port->lp_tx_rings[0] = NULL;
 972                         port->lp_pseudo_tx_rings[0] = pseudo_rh;
 973                 }
 974         } else {
 975                 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
 976                         err = aggr_add_pseudo_tx_ring(port,
 977                             tx_grp, hw_rh[i], &pseudo_rh);
 978                         if (err != 0)
 979                                 break;
 980                         port->lp_tx_rings[i] = hw_rh[i];
 981                         port->lp_pseudo_tx_rings[i] = pseudo_rh;
 982                 }
 983         }
 984 
 985         if (err != 0) {
 986                 if (hw_rh_cnt != 0) {
 987                         for (j = 0; j < i; j++) {
 988                                 aggr_rem_pseudo_tx_ring(tx_grp,
 989                                     port->lp_pseudo_tx_rings[j]);
 990                         }
 991                 }
 992                 kmem_free(port->lp_tx_rings,
 993                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 994                 kmem_free(port->lp_pseudo_tx_rings,
 995                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
 996                 port->lp_tx_ring_cnt = 0;
 997         } else {
 998                 port->lp_tx_grp_added = B_TRUE;
 999                 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1000                     aggr_tx_ring_update, port);
1001         }
1002         mac_perim_exit(pmph);
1003         aggr_grp_update_default(grp);
1004         return (err);
1005 }
1006 
1007 /*
1008  * This function is called by aggr to remove pseudo TX rings over the
1009  * HW rings of the underlying port.
1010  */
1011 static void
1012 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1013 {
1014         aggr_grp_t              *grp = port->lp_grp;
1015         mac_perim_handle_t      pmph;
1016         int                     i;
1017 
1018         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1019         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1020 
1021         if (!port->lp_tx_grp_added)
1022                 goto done;
1023 
1024         ASSERT(tx_grp->atg_gh != NULL);
1025 
1026         for (i = 0; i < port->lp_tx_ring_cnt; i++)
1027                 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1028 
1029         kmem_free(port->lp_tx_rings,
1030             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1031         kmem_free(port->lp_pseudo_tx_rings,
1032             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1033 
1034         port->lp_tx_ring_cnt = 0;
1035         (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1036         port->lp_tx_grp_added = B_FALSE;
1037         aggr_grp_update_default(grp);
1038 done:
1039         mac_perim_exit(pmph);
1040 }
1041 
1042 static int
1043 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1044 {
1045         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1046         return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1047 }
1048 
1049 static int
1050 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1051 {
1052         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053         return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1054 }
1055 
1056 /*
1057  * Here we need to start the pseudo-ring. As MAC already ensures that the
1058  * underlying device is set up, all we need to do is save the ring generation.
1059  *
1060  * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
1061  * functions here as those don't actually stop and start the ring, they just
1062  * quiesce the ring. Regardless of whether the aggr is logically up or not, we
1063  * want to make sure that we can receive traffic for LACP.
1064  */
1065 static int
1066 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1067 {
1068         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1069 
1070         rr_ring->arr_gen = mr_gen;
1071         return (0);
1072 }
1073 
1074 /*
1075  * Add one or more ports to an existing link aggregation group.
1076  */
1077 int
1078 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1079     laioc_port_t *ports)
1080 {
1081         int rc, i, nadded = 0;
1082         aggr_grp_t *grp = NULL;
1083         aggr_port_t *port;
1084         boolean_t link_state_changed = B_FALSE;
1085         mac_perim_handle_t mph, pmph;
1086 
1087         /* get group corresponding to linkid */
1088         rw_enter(&aggr_grp_lock, RW_READER);
1089         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1090             (mod_hash_val_t *)&grp) != 0) {
1091                 rw_exit(&aggr_grp_lock);
1092                 return (ENOENT);
1093         }
1094         AGGR_GRP_REFHOLD(grp);
1095 
1096         /*
1097          * Hold the perimeter so that the aggregation won't be destroyed.
1098          */
1099         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1100         rw_exit(&aggr_grp_lock);
1101 
1102         /* add the specified ports to group */
1103         for (i = 0; i < nports; i++) {
1104                 /* add port to group */
1105                 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1106                     force, &port)) != 0) {
1107                         goto bail;
1108                 }
1109                 ASSERT(port != NULL);
1110                 nadded++;
1111 
1112                 /* check capabilities */
1113                 if (!aggr_grp_capab_check(grp, port) ||
1114                     !aggr_grp_sdu_check(grp, port) ||
1115                     !aggr_grp_margin_check(grp, port)) {
1116                         rc = ENOTSUP;
1117                         goto bail;
1118                 }
1119 
1120                 /*
1121                  * Create the pseudo ring for each HW ring of the underlying
1122                  * port.
1123                  */
1124                 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1125                 if (rc != 0)
1126                         goto bail;
1127                 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1128                 if (rc != 0)
1129                         goto bail;
1130 
1131                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1132 
1133                 /* set LACP mode */
1134                 aggr_port_lacp_set_mode(grp, port);
1135 
1136                 /* start port if group has already been started */
1137                 if (grp->lg_started) {
1138                         rc = aggr_port_start(port);
1139                         if (rc != 0) {
1140                                 mac_perim_exit(pmph);
1141                                 goto bail;
1142                         }
1143 
1144                         /*
1145                          * Turn on the promiscuous mode over the port when it
1146                          * is requested to be turned on to receive the
1147                          * non-primary address over a port, or the promiscous
1148                          * mode is enabled over the aggr.
1149                          */
1150                         if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1151                                 rc = aggr_port_promisc(port, B_TRUE);
1152                                 if (rc != 0) {
1153                                         mac_perim_exit(pmph);
1154                                         goto bail;
1155                                 }
1156                         }
1157                 }
1158                 mac_perim_exit(pmph);
1159 
1160                 /*
1161                  * Attach each port if necessary.
1162                  */
1163                 if (aggr_port_notify_link(grp, port))
1164                         link_state_changed = B_TRUE;
1165 
1166                 /*
1167                  * Initialize the callback functions for this port.
1168                  */
1169                 aggr_port_init_callbacks(port);
1170         }
1171 
1172         /* update the MAC address of the constituent ports */
1173         if (aggr_grp_update_ports_mac(grp))
1174                 link_state_changed = B_TRUE;
1175 
1176         if (link_state_changed)
1177                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1178 
1179 bail:
1180         if (rc != 0) {
1181                 /* stop and remove ports that have been added */
1182                 for (i = 0; i < nadded; i++) {
1183                         port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1184                         ASSERT(port != NULL);
1185                         if (grp->lg_started) {
1186                                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1187                                 (void) aggr_port_promisc(port, B_FALSE);
1188                                 aggr_port_stop(port);
1189                                 mac_perim_exit(pmph);
1190                         }
1191                         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1192                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1193                         (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1194                 }
1195         }
1196 
1197         mac_perim_exit(mph);
1198         AGGR_GRP_REFRELE(grp);
1199         return (rc);
1200 }
1201 
1202 static int
1203 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1204     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1205     aggr_lacp_timer_t lacp_timer)
1206 {
1207         boolean_t mac_addr_changed = B_FALSE;
1208         boolean_t link_state_changed = B_FALSE;
1209         mac_perim_handle_t pmph;
1210 
1211         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1212 
1213         /* validate fixed address if specified */
1214         if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1215             ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1216             (mac_addr[0] & 0x01))) {
1217                 return (EINVAL);
1218         }
1219 
1220         /* update policy if requested */
1221         if (update_mask & AGGR_MODIFY_POLICY)
1222                 aggr_send_update_policy(grp, policy);
1223 
1224         /* update unicast MAC address if requested */
1225         if (update_mask & AGGR_MODIFY_MAC) {
1226                 if (mac_fixed) {
1227                         /* user-supplied MAC address */
1228                         grp->lg_mac_addr_port = NULL;
1229                         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1230                                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1231                                 mac_addr_changed = B_TRUE;
1232                         }
1233                 } else if (grp->lg_addr_fixed) {
1234                         /* switch from user-supplied to automatic */
1235                         aggr_port_t *port = grp->lg_ports;
1236 
1237                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1238                         bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1239                         grp->lg_mac_addr_port = port;
1240                         mac_addr_changed = B_TRUE;
1241                         mac_perim_exit(pmph);
1242                 }
1243                 grp->lg_addr_fixed = mac_fixed;
1244         }
1245 
1246         if (mac_addr_changed)
1247                 link_state_changed = aggr_grp_update_ports_mac(grp);
1248 
1249         if (update_mask & AGGR_MODIFY_LACP_MODE)
1250                 aggr_lacp_update_mode(grp, lacp_mode);
1251 
1252         if (update_mask & AGGR_MODIFY_LACP_TIMER)
1253                 aggr_lacp_update_timer(grp, lacp_timer);
1254 
1255         if (link_state_changed)
1256                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1257 
1258         if (mac_addr_changed)
1259                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1260 
1261         return (0);
1262 }
1263 
1264 /*
1265  * Update properties of an existing link aggregation group.
1266  */
1267 int
1268 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1269     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1270     aggr_lacp_timer_t lacp_timer)
1271 {
1272         aggr_grp_t *grp = NULL;
1273         mac_perim_handle_t mph;
1274         int err;
1275 
1276         /* get group corresponding to linkid */
1277         rw_enter(&aggr_grp_lock, RW_READER);
1278         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1279             (mod_hash_val_t *)&grp) != 0) {
1280                 rw_exit(&aggr_grp_lock);
1281                 return (ENOENT);
1282         }
1283         AGGR_GRP_REFHOLD(grp);
1284 
1285         /*
1286          * Hold the perimeter so that the aggregation won't be destroyed.
1287          */
1288         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1289         rw_exit(&aggr_grp_lock);
1290 
1291         err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1292             mac_addr, lacp_mode, lacp_timer);
1293 
1294         mac_perim_exit(mph);
1295         AGGR_GRP_REFRELE(grp);
1296         return (err);
1297 }
1298 
/*
 * Create a new link aggregation group upon request from administrator.
 * Returns 0 on success, an errno on failure.
 *
 * linkid	datalink id of the new aggregation; also its hash key.
 * key		aggregation key; when 0 a key is allocated from key_ids.
 * nports	number of entries in ports; must be at least 1.
 * ports	links to add to the group as constituent ports.
 * policy	outbound load balancing policy.
 * mac_fixed	B_TRUE to use mac_addr as the group address, B_FALSE to
 *		take the address of the first port.
 * force	stored in lg_force and passed to aggr_grp_add_port().
 * mac_addr	group address, only read when mac_fixed is set.
 * lacp_mode,
 * lacp_timer	initial LACP settings.
 * credp	credentials used to determine the zone of the group.
 *
 * Errors visible here: EINVAL (nports is 0, or an all-zero fixed MAC
 * address), EEXIST (a group with this linkid already exists), ENOMEM
 * (key or mac_register_t allocation failed), plus whatever
 * aggr_grp_add_port(), mac_register() or dls_devnet_create() return.
 *
 * aggr_grp_lock is held as writer for the whole operation.
 */
int
aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
    laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
    uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
    cred_t *credp)
{
        aggr_grp_t *grp = NULL;
        aggr_port_t *port;
        mac_register_t *mac;
        boolean_t link_state_changed;
        mac_perim_handle_t mph;
        int err;
        int i;
        kt_did_t tid = 0;

        /* need at least one port */
        if (nports == 0)
                return (EINVAL);

        rw_enter(&aggr_grp_lock, RW_WRITER);

        /* does a group with the same linkid already exist? */
        err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
            (mod_hash_val_t *)&grp);
        if (err == 0) {
                rw_exit(&aggr_grp_lock);
                return (EEXIST);
        }

        grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);

        /* initialize the group state; the creator holds the first ref */
        grp->lg_refs = 1;
        grp->lg_closing = B_FALSE;
        grp->lg_force = force;
        grp->lg_linkid = linkid;
        grp->lg_zoneid = crgetzoneid(credp);
        grp->lg_ifspeed = 0;
        grp->lg_link_state = LINK_STATE_UNKNOWN;
        grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
        grp->lg_started = B_FALSE;
        grp->lg_promisc = B_FALSE;
        grp->lg_lacp_done = B_FALSE;
        grp->lg_tx_notify_done = B_FALSE;
        grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
        /*
         * Both worker threads are created up front, so every failure
         * path below has to go through bail to shut them down again.
         */
        grp->lg_lacp_rx_thread = thread_create(NULL, 0,
            aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
        grp->lg_tx_notify_thread = thread_create(NULL, 0,
            aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
        grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
            MAX_RINGS_PER_GROUP), KM_SLEEP);
        grp->lg_tx_blocked_cnt = 0;
        bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
        bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
        aggr_lacp_init_grp(grp);

        grp->lg_rx_group.arg_untagged = 0;
        list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t),
            offsetof(aggr_vlan_t, av_link));

        /* add MAC ports to group */
        grp->lg_ports = NULL;
        grp->lg_nports = 0;
        grp->lg_nattached_ports = 0;
        grp->lg_ntx_ports = 0;

        /*
         * If key is not specified by the user, allocate the key.
         */
        if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
                err = ENOMEM;
                goto bail;
        }
        grp->lg_key = key;

        for (i = 0; i < nports; i++) {
                err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
                if (err != 0)
                        goto bail;
        }

        /*
         * If no explicit MAC address was specified by the administrator,
         * set it to the MAC address of the first port.
         */
        grp->lg_addr_fixed = mac_fixed;
        if (grp->lg_addr_fixed) {
                /* validate specified address */
                if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
                        err = EINVAL;
                        goto bail;
                }
                bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
        } else {
                bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
                grp->lg_mac_addr_port = grp->lg_ports;
        }

        /* set the initial group capabilities */
        aggr_grp_capab_set(grp);

        /* describe the aggregation to the MAC layer and register it */
        if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
                err = ENOMEM;
                goto bail;
        }
        mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
        mac->m_driver = grp;
        mac->m_dip = aggr_dip;
        /* keys above AGGR_MAX_KEY are registered with instance -1 */
        mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
        mac->m_src_addr = grp->lg_addr;
        mac->m_callbacks = &aggr_m_callbacks;
        mac->m_min_sdu = 0;
        mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
        mac->m_margin = aggr_grp_max_margin(grp);
        mac->m_v12n = MAC_VIRT_LEVEL1;
        err = mac_register(mac, &grp->lg_mh);
        /* the mac_register_t is released whether or not registration worked */
        mac_free(mac);
        if (err != 0)
                goto bail;

        err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
        if (err != 0) {
                /* undo the registration before the common cleanup */
                (void) mac_unregister(grp->lg_mh);
                grp->lg_mh = NULL;
                goto bail;
        }

        mac_perim_enter_by_mh(grp->lg_mh, &mph);

        /*
         * Update the MAC address of the constituent ports.
         * None of the ports are attached at this time, so the link state
         * of the aggregation will not change.
         */
        link_state_changed = aggr_grp_update_ports_mac(grp);
        ASSERT(!link_state_changed);

        /* update outbound load balancing policy */
        aggr_send_update_policy(grp, policy);

        /* set LACP mode */
        aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);

        /*
         * Attach each port if necessary.
         */
        for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
                /*
                 * Create the pseudo ring for each HW ring of the underlying
                 * port. Note that this is done after the aggr registers the
                 * mac. A failure here is fatal (VERIFY), not unwound.
                 */
                VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
                VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
                if (aggr_port_notify_link(grp, port))
                        link_state_changed = B_TRUE;

                /*
                 * Initialize the callback functions for this port.
                 */
                aggr_port_init_callbacks(port);
        }

        if (link_state_changed)
                mac_link_update(grp->lg_mh, grp->lg_link_state);

        /* add new group to hash table */
        err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
            (mod_hash_val_t)grp);
        ASSERT(err == 0);
        aggr_grp_cnt++;

        mac_perim_exit(mph);
        rw_exit(&aggr_grp_lock);
        return (0);

bail:
        /*
         * Common failure path: err holds the errno to return. Delete the
         * ports added so far, stop both worker threads, free the blocked
         * rings array and drop the creator's reference.
         */

        grp->lg_closing = B_TRUE;

        port = grp->lg_ports;
        while (port != NULL) {
                aggr_port_t *cport;

                /* fetch the successor before the port is deleted */
                cport = port->lp_next;
                aggr_port_delete(port);
                port = cport;
        }

        /*
         * Inform the lacp_rx thread to exit.
         * NOTE(review): the wait below relies on something else clearing
         * lg_lacp_rx_thread and signalling lg_lacp_cv, presumably the
         * thread itself as it exits; confirm against aggr_lacp_rx_thread().
         */
        mutex_enter(&grp->lg_lacp_lock);
        grp->lg_lacp_done = B_TRUE;
        cv_signal(&grp->lg_lacp_cv);
        while (grp->lg_lacp_rx_thread != NULL)
                cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
        mutex_exit(&grp->lg_lacp_lock);
        /*
         * Inform the tx_notify thread to exit. The thread id is captured
         * under the lock and the join happens after the lock is dropped.
         */
        mutex_enter(&grp->lg_tx_flowctl_lock);
        if (grp->lg_tx_notify_thread != NULL) {
                tid = grp->lg_tx_notify_thread->t_did;
                grp->lg_tx_notify_done = B_TRUE;
                cv_signal(&grp->lg_tx_flowctl_cv);
        }
        mutex_exit(&grp->lg_tx_flowctl_lock);
        if (tid != 0)
                thread_join(tid);

        kmem_free(grp->lg_tx_blocked_rings,
            (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
        rw_exit(&aggr_grp_lock);
        AGGR_GRP_REFRELE(grp);
        return (err);
}
1519 
1520 /*
1521  * Return a pointer to the member of a group with specified linkid.
1522  */
1523 static aggr_port_t *
1524 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1525 {
1526         aggr_port_t *port;
1527 
1528         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1529 
1530         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1531                 if (port->lp_linkid == linkid)
1532                         break;
1533         }
1534 
1535         return (port);
1536 }
1537 
1538 /*
1539  * Stop, detach and remove a port from a link aggregation group.
1540  */
1541 static int
1542 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1543     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1544 {
1545         int rc = 0;
1546         aggr_port_t **pport;
1547         boolean_t mac_addr_changed = B_FALSE;
1548         boolean_t link_state_changed = B_FALSE;
1549         mac_perim_handle_t mph;
1550         uint64_t val;
1551         uint_t i;
1552         uint_t stat;
1553 
1554         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1555         ASSERT(grp->lg_nports > 1);
1556         ASSERT(!grp->lg_closing);
1557 
1558         /* unlink port */
1559         for (pport = &grp->lg_ports; *pport != port;
1560             pport = &(*pport)->lp_next) {
1561                 if (*pport == NULL) {
1562                         rc = ENOENT;
1563                         goto done;
1564                 }
1565         }
1566         *pport = port->lp_next;
1567 
1568         mac_perim_enter_by_mh(port->lp_mh, &mph);
1569 
1570         /*
1571          * If the MAC address of the port being removed was assigned
1572          * to the group, update the group MAC address
1573          * using the MAC address of a different port.
1574          */
1575         if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1576                 /*
1577                  * Set the MAC address of the group to the
1578                  * MAC address of its first port.
1579                  */
1580                 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1581                 grp->lg_mac_addr_port = grp->lg_ports;
1582                 mac_addr_changed = B_TRUE;
1583         }
1584 
1585         link_state_changed = aggr_grp_detach_port(grp, port);
1586 
1587         /*
1588          * Add the counter statistics of the ports while it was aggregated
1589          * to the group's residual statistics.  This is done by obtaining
1590          * the current counter from the underlying MAC then subtracting the
1591          * value of the counter at the moment it was added to the
1592          * aggregation.
1593          */
1594         for (i = 0; i < MAC_NSTAT; i++) {
1595                 stat = i + MAC_STAT_MIN;
1596                 if (!MAC_STAT_ISACOUNTER(stat))
1597                         continue;
1598                 val = aggr_port_stat(port, stat);
1599                 val -= port->lp_stat[i];
1600                 mutex_enter(&grp->lg_stat_lock);
1601                 grp->lg_stat[i] += val;
1602                 mutex_exit(&grp->lg_stat_lock);
1603         }
1604         for (i = 0; i < ETHER_NSTAT; i++) {
1605                 stat = i + MACTYPE_STAT_MIN;
1606                 if (!ETHER_STAT_ISACOUNTER(stat))
1607                         continue;
1608                 val = aggr_port_stat(port, stat);
1609                 val -= port->lp_ether_stat[i];
1610                 mutex_enter(&grp->lg_stat_lock);
1611                 grp->lg_ether_stat[i] += val;
1612                 mutex_exit(&grp->lg_stat_lock);
1613         }
1614 
1615         grp->lg_nports--;
1616         mac_perim_exit(mph);
1617 
1618         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1619         aggr_port_delete(port);
1620 
1621         /*
1622          * If the group MAC address has changed, update the MAC address of
1623          * the remaining constituent ports according to the new MAC
1624          * address of the group.
1625          */
1626         if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1627                 link_state_changed = B_TRUE;
1628 
1629 done:
1630         if (mac_addr_changedp != NULL)
1631                 *mac_addr_changedp = mac_addr_changed;
1632         if (link_state_changedp != NULL)
1633                 *link_state_changedp = link_state_changed;
1634 
1635         return (rc);
1636 }
1637 
1638 /*
1639  * Remove one or more ports from an existing link aggregation group.
1640  */
1641 int
1642 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1643 {
1644         int rc = 0, i;
1645         aggr_grp_t *grp = NULL;
1646         aggr_port_t *port;
1647         boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1648         boolean_t link_state_update = B_FALSE, link_state_changed;
1649         mac_perim_handle_t mph, pmph;
1650 
1651         /* get group corresponding to linkid */
1652         rw_enter(&aggr_grp_lock, RW_READER);
1653         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1654             (mod_hash_val_t *)&grp) != 0) {
1655                 rw_exit(&aggr_grp_lock);
1656                 return (ENOENT);
1657         }
1658         AGGR_GRP_REFHOLD(grp);
1659 
1660         /*
1661          * Hold the perimeter so that the aggregation won't be destroyed.
1662          */
1663         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1664         rw_exit(&aggr_grp_lock);
1665 
1666         /* we need to keep at least one port per group */
1667         if (nports >= grp->lg_nports) {
1668                 rc = EINVAL;
1669                 goto bail;
1670         }
1671 
1672         /* first verify that all the groups are valid */
1673         for (i = 0; i < nports; i++) {
1674                 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1675                         /* port not found */
1676                         rc = ENOENT;
1677                         goto bail;
1678                 }
1679         }
1680 
1681         /* clear the promiscous mode for the specified ports */
1682         for (i = 0; i < nports && rc == 0; i++) {
1683                 /* lookup port */
1684                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1685                 ASSERT(port != NULL);
1686 
1687                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1688                 rc = aggr_port_promisc(port, B_FALSE);
1689                 mac_perim_exit(pmph);
1690         }
1691         if (rc != 0) {
1692                 for (i = 0; i < nports; i++) {
1693                         port = aggr_grp_port_lookup(grp,
1694                             ports[i].lp_linkid);
1695                         ASSERT(port != NULL);
1696 
1697                         /*
1698                          * Turn the promiscuous mode back on if it is required
1699                          * to receive the non-primary address over a port, or
1700                          * the promiscous mode is enabled over the aggr.
1701                          */
1702                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1703                         if (port->lp_started && (grp->lg_promisc ||
1704                             port->lp_prom_addr != NULL)) {
1705                                 (void) aggr_port_promisc(port, B_TRUE);
1706                         }
1707                         mac_perim_exit(pmph);
1708                 }
1709                 goto bail;
1710         }
1711 
1712         /* remove the specified ports from group */
1713         for (i = 0; i < nports; i++) {
1714                 /* lookup port */
1715                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1716                 ASSERT(port != NULL);
1717 
1718                 /* stop port if group has already been started */
1719                 if (grp->lg_started) {
1720                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1721                         aggr_port_stop(port);
1722                         mac_perim_exit(pmph);
1723                 }
1724 
1725                 /*
1726                  * aggr_rem_pseudo_tx_group() is not called here. Instead
1727                  * it is called from inside aggr_grp_rem_port() after the
1728                  * port has been detached. The reason is that
1729                  * aggr_rem_pseudo_tx_group() removes one ring at a time
1730                  * and if there is still traffic going on, then there
1731                  * is the possibility of aggr_find_tx_ring() returning a
1732                  * removed ring for transmission. Once the port has been
1733                  * detached, that port will not be used and
1734                  * aggr_find_tx_ring() will not return any rings
1735                  * belonging to it.
1736                  */
1737                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1738 
1739                 /* remove port from group */
1740                 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1741                     &link_state_changed);
1742                 ASSERT(rc == 0);
1743                 mac_addr_update = mac_addr_update || mac_addr_changed;
1744                 link_state_update = link_state_update || link_state_changed;
1745         }
1746 
1747 bail:
1748         if (mac_addr_update)
1749                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1750         if (link_state_update)
1751                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1752 
1753         mac_perim_exit(mph);
1754         AGGR_GRP_REFRELE(grp);
1755 
1756         return (rc);
1757 }
1758 
1759 int
1760 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1761 {
1762         aggr_grp_t *grp = NULL;
1763         aggr_port_t *port, *cport;
1764         datalink_id_t tmpid;
1765         mod_hash_val_t val;
1766         mac_perim_handle_t mph, pmph;
1767         int err;
1768         kt_did_t tid = 0;
1769 
1770         rw_enter(&aggr_grp_lock, RW_WRITER);
1771 
1772         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1773             (mod_hash_val_t *)&grp) != 0) {
1774                 rw_exit(&aggr_grp_lock);
1775                 return (ENOENT);
1776         }
1777 
1778         /*
1779          * Note that dls_devnet_destroy() must be called before lg_lock is
1780          * held. Otherwise, it will deadlock if another thread is in
1781          * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1782          * dls_devnet_destroy() needs to delete.
1783          */
1784         if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1785                 rw_exit(&aggr_grp_lock);
1786                 return (err);
1787         }
1788         ASSERT(linkid == tmpid);
1789 
1790         /*
1791          * Unregister from the MAC service module. Since this can
1792          * fail if a client hasn't closed the MAC port, we gracefully
1793          * fail the operation.
1794          */
1795         if ((err = mac_disable(grp->lg_mh)) != 0) {
1796                 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1797                 rw_exit(&aggr_grp_lock);
1798                 return (err);
1799         }
1800         (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1801         ASSERT(grp == (aggr_grp_t *)val);
1802 
1803         ASSERT(aggr_grp_cnt > 0);
1804         aggr_grp_cnt--;
1805         rw_exit(&aggr_grp_lock);
1806 
1807         /*
1808          * Inform the lacp_rx thread to exit.
1809          */
1810         mutex_enter(&grp->lg_lacp_lock);
1811         grp->lg_lacp_done = B_TRUE;
1812         cv_signal(&grp->lg_lacp_cv);
1813         while (grp->lg_lacp_rx_thread != NULL)
1814                 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1815         mutex_exit(&grp->lg_lacp_lock);
1816         /*
1817          * Inform the tx_notify_thread to exit.
1818          */
1819         mutex_enter(&grp->lg_tx_flowctl_lock);
1820         if (grp->lg_tx_notify_thread != NULL) {
1821                 tid = grp->lg_tx_notify_thread->t_did;
1822                 grp->lg_tx_notify_done = B_TRUE;
1823                 cv_signal(&grp->lg_tx_flowctl_cv);
1824         }
1825         mutex_exit(&grp->lg_tx_flowctl_lock);
1826         if (tid != 0)
1827                 thread_join(tid);
1828 
1829         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1830 
1831         grp->lg_closing = B_TRUE;
1832         /* detach and free MAC ports associated with group */
1833         port = grp->lg_ports;
1834         while (port != NULL) {
1835                 cport = port->lp_next;
1836                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1837                 if (grp->lg_started)
1838                         aggr_port_stop(port);
1839                 (void) aggr_grp_detach_port(grp, port);
1840                 mac_perim_exit(pmph);
1841                 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1842                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1843                 aggr_port_delete(port);
1844                 port = cport;
1845         }
1846 
1847         mac_perim_exit(mph);
1848 
1849         kmem_free(grp->lg_tx_blocked_rings,
1850             (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1851         /*
1852          * Wait for the port's lacp timer thread and its notification callback
1853          * to exit before calling mac_unregister() since both needs to access
1854          * the mac perimeter of the grp.
1855          */
1856         aggr_grp_port_wait(grp);
1857 
1858         VERIFY(mac_unregister(grp->lg_mh) == 0);
1859         grp->lg_mh = NULL;
1860 
1861         list_destroy(&(grp->lg_rx_group.arg_vlans));
1862 
1863         AGGR_GRP_REFRELE(grp);
1864         return (0);
1865 }
1866 
1867 void
1868 aggr_grp_free(aggr_grp_t *grp)
1869 {
1870         ASSERT(grp->lg_refs == 0);
1871         ASSERT(grp->lg_port_ref == 0);
1872         if (grp->lg_key > AGGR_MAX_KEY) {
1873                 id_free(key_ids, grp->lg_key);
1874                 grp->lg_key = 0;
1875         }
1876         kmem_cache_free(aggr_grp_cache, grp);
1877 }
1878 
1879 int
1880 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1881     aggr_grp_info_new_grp_fn_t new_grp_fn,
1882     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1883 {
1884         aggr_grp_t      *grp;
1885         aggr_port_t     *port;
1886         mac_perim_handle_t mph, pmph;
1887         int             rc = 0;
1888 
1889         /*
1890          * Make sure that the aggregation link is visible from the caller's
1891          * zone.
1892          */
1893         if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1894                 return (ENOENT);
1895 
1896         rw_enter(&aggr_grp_lock, RW_READER);
1897 
1898         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1899             (mod_hash_val_t *)&grp) != 0) {
1900                 rw_exit(&aggr_grp_lock);
1901                 return (ENOENT);
1902         }
1903         AGGR_GRP_REFHOLD(grp);
1904 
1905         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1906         rw_exit(&aggr_grp_lock);
1907 
1908         rc = new_grp_fn(fn_arg, grp->lg_linkid,
1909             (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1910             grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1911             grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1912 
1913         if (rc != 0)
1914                 goto bail;
1915 
1916         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1917                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1918                 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1919                     port->lp_state, &port->lp_lacp.ActorOperPortState);
1920                 mac_perim_exit(pmph);
1921 
1922                 if (rc != 0)
1923                         goto bail;
1924         }
1925 
1926 bail:
1927         mac_perim_exit(mph);
1928         AGGR_GRP_REFRELE(grp);
1929         return (rc);
1930 }
1931 
1932 /*ARGSUSED*/
1933 static void
1934 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1935 {
1936         miocnak(q, mp, 0, ENOTSUP);
1937 }
1938 
1939 static int
1940 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1941 {
1942         aggr_port_t     *port;
1943         uint_t          stat_index;
1944 
1945         ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
1946 
1947         /* We only aggregate counter statistics. */
1948         if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1949             IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1950                 return (ENOTSUP);
1951         }
1952 
1953         /*
1954          * Counter statistics for a group are computed by aggregating the
1955          * counters of the members MACs while they were aggregated, plus
1956          * the residual counter of the group itself, which is updated each
1957          * time a MAC is removed from the group.
1958          */
1959         *val = 0;
1960         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1961                 /* actual port statistic */
1962                 *val += aggr_port_stat(port, stat);
1963                 /*
1964                  * minus the port stat when it was added, plus any residual
1965                  * amount for the group.
1966                  */
1967                 if (IS_MAC_STAT(stat)) {
1968                         stat_index = stat - MAC_STAT_MIN;
1969                         *val -= port->lp_stat[stat_index];
1970                         *val += grp->lg_stat[stat_index];
1971                 } else if (IS_MACTYPE_STAT(stat)) {
1972                         stat_index = stat - MACTYPE_STAT_MIN;
1973                         *val -= port->lp_ether_stat[stat_index];
1974                         *val += grp->lg_ether_stat[stat_index];
1975                 }
1976         }
1977         return (0);
1978 }
1979 
1980 int
1981 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1982 {
1983         aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1984 
1985         if (rx_ring->arr_hw_rh != NULL) {
1986                 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1987         } else {
1988                 aggr_port_t     *port = rx_ring->arr_port;
1989 
1990                 *val = mac_stat_get(port->lp_mh, stat);
1991 
1992         }
1993         return (0);
1994 }
1995 
1996 int
1997 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1998 {
1999         aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2000 
2001         if (tx_ring->atr_hw_rh != NULL) {
2002                 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2003         } else {
2004                 aggr_port_t     *port = tx_ring->atr_port;
2005 
2006                 *val = mac_stat_get(port->lp_mh, stat);
2007         }
2008         return (0);
2009 }
2010 
2011 static int
2012 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2013 {
2014         aggr_grp_t              *grp = arg;
2015         int                     rval = 0;
2016 
2017         mutex_enter(&grp->lg_stat_lock);
2018 
2019         switch (stat) {
2020         case MAC_STAT_IFSPEED:
2021                 *val = grp->lg_ifspeed;
2022                 break;
2023 
2024         case ETHER_STAT_LINK_DUPLEX:
2025                 *val = grp->lg_link_duplex;
2026                 break;
2027 
2028         default:
2029                 /*
2030                  * For all other statistics, we return the aggregated stat
2031                  * from the underlying ports.  aggr_grp_stat() will set
2032                  * rval appropriately if the statistic isn't a counter.
2033                  */
2034                 rval = aggr_grp_stat(grp, stat, val);
2035         }
2036 
2037         mutex_exit(&grp->lg_stat_lock);
2038         return (rval);
2039 }
2040 
2041 static int
2042 aggr_m_start(void *arg)
2043 {
2044         aggr_grp_t *grp = arg;
2045         aggr_port_t *port;
2046         mac_perim_handle_t mph, pmph;
2047 
2048         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2049 
2050         /*
2051          * Attempts to start all configured members of the group.
2052          * Group members will be attached when their link-up notification
2053          * is received.
2054          */
2055         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2056                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2057                 if (aggr_port_start(port) != 0) {
2058                         mac_perim_exit(pmph);
2059                         continue;
2060                 }
2061 
2062                 /*
2063                  * Turn on the promiscuous mode if it is required to receive
2064                  * the non-primary address over a port, or the promiscous
2065                  * mode is enabled over the aggr.
2066                  */
2067                 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2068                         if (aggr_port_promisc(port, B_TRUE) != 0)
2069                                 aggr_port_stop(port);
2070                 }
2071                 mac_perim_exit(pmph);
2072         }
2073 
2074         grp->lg_started = B_TRUE;
2075 
2076         mac_perim_exit(mph);
2077         return (0);
2078 }
2079 
2080 static void
2081 aggr_m_stop(void *arg)
2082 {
2083         aggr_grp_t *grp = arg;
2084         aggr_port_t *port;
2085         mac_perim_handle_t mph, pmph;
2086 
2087         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2088 
2089         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2090                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2091 
2092                 /* reset port promiscuous mode */
2093                 (void) aggr_port_promisc(port, B_FALSE);
2094 
2095                 aggr_port_stop(port);
2096                 mac_perim_exit(pmph);
2097         }
2098 
2099         grp->lg_started = B_FALSE;
2100         mac_perim_exit(mph);
2101 }
2102 
2103 static int
2104 aggr_m_promisc(void *arg, boolean_t on)
2105 {
2106         aggr_grp_t *grp = arg;
2107         aggr_port_t *port;
2108         boolean_t link_state_changed = B_FALSE;
2109         mac_perim_handle_t mph, pmph;
2110 
2111         AGGR_GRP_REFHOLD(grp);
2112         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2113 
2114         ASSERT(!grp->lg_closing);
2115 
2116         if (on == grp->lg_promisc)
2117                 goto bail;
2118 
2119         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2120                 int     err = 0;
2121 
2122                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2123                 AGGR_PORT_REFHOLD(port);
2124                 if (!on && (port->lp_prom_addr == NULL))
2125                         err = aggr_port_promisc(port, B_FALSE);
2126                 else if (on && port->lp_started)
2127                         err = aggr_port_promisc(port, B_TRUE);
2128 
2129                 if (err != 0) {
2130                         if (aggr_grp_detach_port(grp, port))
2131                                 link_state_changed = B_TRUE;
2132                 } else {
2133                         /*
2134                          * If a port was detached because of a previous
2135                          * failure changing the promiscuity, the port
2136                          * is reattached when it successfully changes
2137                          * the promiscuity now, and this might cause
2138                          * the link state of the aggregation to change.
2139                          */
2140                         if (aggr_grp_attach_port(grp, port))
2141                                 link_state_changed = B_TRUE;
2142                 }
2143                 mac_perim_exit(pmph);
2144                 AGGR_PORT_REFRELE(port);
2145         }
2146 
2147         grp->lg_promisc = on;
2148 
2149         if (link_state_changed)
2150                 mac_link_update(grp->lg_mh, grp->lg_link_state);
2151 
2152 bail:
2153         mac_perim_exit(mph);
2154         AGGR_GRP_REFRELE(grp);
2155 
2156         return (0);
2157 }
2158 
2159 static void
2160 aggr_grp_port_rename(const char *new_name, void *arg)
2161 {
2162         /*
2163          * aggr port's mac client name is the format of "aggr link name" plus
2164          * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2165          */
2166         int aggr_len, link_len, clnt_name_len, i;
2167         char *str_end, *str_st, *str_del;
2168         char aggr_name[MAXNAMELEN];
2169         char link_name[MAXNAMELEN];
2170         char *clnt_name;
2171         aggr_grp_t *aggr_grp = arg;
2172         aggr_port_t *aggr_port = aggr_grp->lg_ports;
2173 
2174         for (i = 0; i < aggr_grp->lg_nports; i++) {
2175                 clnt_name = mac_client_name(aggr_port->lp_mch);
2176                 clnt_name_len = strlen(clnt_name);
2177                 str_st = clnt_name;
2178                 str_end = &(clnt_name[clnt_name_len]);
2179                 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2180                 ASSERT(str_del != NULL);
2181                 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2182                 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2183                 bzero(aggr_name, MAXNAMELEN);
2184                 bzero(link_name, MAXNAMELEN);
2185                 bcopy(clnt_name, aggr_name, aggr_len);
2186                 bcopy(str_del, link_name, link_len + 1);
2187                 bzero(clnt_name, MAXNAMELEN);
2188                 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2189                     link_name);
2190 
2191                 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2192                 aggr_port = aggr_port->lp_next;
2193         }
2194 }
2195 
2196 /*
2197  * Initialize the capabilities that are advertised for the group
2198  * according to the capabilities of the constituent ports.
2199  */
2200 static boolean_t
2201 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2202 {
2203         aggr_grp_t *grp = arg;
2204 
2205         switch (cap) {
2206         case MAC_CAPAB_HCKSUM: {
2207                 uint32_t *hcksum_txflags = cap_data;
2208                 *hcksum_txflags = grp->lg_hcksum_txflags;
2209                 break;
2210         }
2211         case MAC_CAPAB_LSO: {
2212                 mac_capab_lso_t *cap_lso = cap_data;
2213 
2214                 if (grp->lg_lso) {
2215                         *cap_lso = grp->lg_cap_lso;
2216                         break;
2217                 } else {
2218                         return (B_FALSE);
2219                 }
2220         }
2221         case MAC_CAPAB_NO_NATIVEVLAN:
2222                 return (!grp->lg_vlan);
2223         case MAC_CAPAB_NO_ZCOPY:
2224                 return (!grp->lg_zcopy);
2225         case MAC_CAPAB_RINGS: {
2226                 mac_capab_rings_t *cap_rings = cap_data;
2227 
2228                 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2229                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2230                         cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2231 
2232                         /*
2233                          * An aggregation advertises only one (pseudo) RX
2234                          * group, which virtualizes the main/primary group of
2235                          * the underlying devices.
2236                          */
2237                         cap_rings->mr_gnum = 1;
2238                         cap_rings->mr_gaddring = NULL;
2239                         cap_rings->mr_gremring = NULL;
2240                 } else {
2241                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2242                         cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2243                         cap_rings->mr_gnum = 0;
2244                 }
2245                 cap_rings->mr_rget = aggr_fill_ring;
2246                 cap_rings->mr_gget = aggr_fill_group;
2247                 break;
2248         }
2249         case MAC_CAPAB_AGGR:
2250         {
2251                 mac_capab_aggr_t *aggr_cap;
2252 
2253                 if (cap_data != NULL) {
2254                         aggr_cap = cap_data;
2255                         aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2256                         aggr_cap->mca_unicst = aggr_m_unicst;
2257                         aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2258                         aggr_cap->mca_arg = arg;
2259                 }
2260                 return (B_TRUE);
2261         }
2262         default:
2263                 return (B_FALSE);
2264         }
2265         return (B_TRUE);
2266 }
2267 
2268 /*
2269  * Callback function for MAC layer to register groups.
2270  */
2271 static void
2272 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2273     mac_group_info_t *infop, mac_group_handle_t gh)
2274 {
2275         aggr_grp_t *grp = arg;
2276         aggr_pseudo_rx_group_t *rx_group;
2277         aggr_pseudo_tx_group_t *tx_group;
2278 
2279         ASSERT(index == 0);
2280         if (rtype == MAC_RING_TYPE_RX) {
2281                 rx_group = &grp->lg_rx_group;
2282                 rx_group->arg_gh = gh;
2283                 rx_group->arg_grp = grp;
2284 
2285                 infop->mgi_driver = (mac_group_driver_t)rx_group;
2286                 infop->mgi_start = NULL;
2287                 infop->mgi_stop = NULL;
2288                 infop->mgi_addmac = aggr_addmac;
2289                 infop->mgi_remmac = aggr_remmac;
2290                 infop->mgi_count = rx_group->arg_ring_cnt;
2291 
2292                 /*
2293                  * Always set the HW VLAN callbacks. They are smart
2294                  * enough to know when a port has HW VLAN filters to
2295                  * program and when it doesn't.
2296                  */
2297                 infop->mgi_addvlan = aggr_addvlan;
2298                 infop->mgi_remvlan = aggr_remvlan;
2299         } else {
2300                 tx_group = &grp->lg_tx_group;
2301                 tx_group->atg_gh = gh;
2302         }
2303 }
2304 
2305 /*
2306  * Callback funtion for MAC layer to register all rings.
2307  */
2308 static void
2309 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2310     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2311 {
2312         aggr_grp_t      *grp = arg;
2313 
2314         switch (rtype) {
2315         case MAC_RING_TYPE_RX: {
2316                 aggr_pseudo_rx_group_t  *rx_group = &grp->lg_rx_group;
2317                 aggr_pseudo_rx_ring_t   *rx_ring;
2318                 mac_intr_t              aggr_mac_intr;
2319 
2320                 ASSERT(rg_index == 0);
2321 
2322                 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2323                 rx_ring = rx_group->arg_rings + index;
2324                 rx_ring->arr_rh = rh;
2325 
2326                 /*
2327                  * Entrypoint to enable interrupt (disable poll) and
2328                  * disable interrupt (enable poll).
2329                  */
2330                 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2331                 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2332                 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2333                 aggr_mac_intr.mi_ddi_handle = NULL;
2334 
2335                 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2336                 infop->mri_start = aggr_pseudo_start_ring;
2337                 infop->mri_stop = NULL;
2338 
2339                 infop->mri_intr = aggr_mac_intr;
2340                 infop->mri_poll = aggr_rx_poll;
2341 
2342                 infop->mri_stat = aggr_rx_ring_stat;
2343                 break;
2344         }
2345         case MAC_RING_TYPE_TX: {
2346                 aggr_pseudo_tx_group_t  *tx_group = &grp->lg_tx_group;
2347                 aggr_pseudo_tx_ring_t   *tx_ring;
2348 
2349                 ASSERT(rg_index == -1);
2350                 ASSERT(index < tx_group->atg_ring_cnt);
2351 
2352                 tx_ring = &tx_group->atg_rings[index];
2353                 tx_ring->atr_rh = rh;
2354 
2355                 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2356                 infop->mri_start = NULL;
2357                 infop->mri_stop = NULL;
2358                 infop->mri_tx = aggr_ring_tx;
2359                 infop->mri_stat = aggr_tx_ring_stat;
2360                 /*
2361                  * Use the hw TX ring handle to find if the ring needs
2362                  * serialization or not. For NICs that do not expose
2363                  * Tx rings, atr_hw_rh will be NULL.
2364                  */
2365                 if (tx_ring->atr_hw_rh != NULL) {
2366                         infop->mri_flags =
2367                             mac_hwring_getinfo(tx_ring->atr_hw_rh);
2368                 }
2369                 break;
2370         }
2371         default:
2372                 break;
2373         }
2374 }
2375 
2376 static mblk_t *
2377 aggr_rx_poll(void *arg, int bytes_to_pickup)
2378 {
2379         aggr_pseudo_rx_ring_t *rr_ring = arg;
2380         aggr_port_t *port = rr_ring->arr_port;
2381         aggr_grp_t *grp = port->lp_grp;
2382         mblk_t *mp_chain, *mp, **mpp;
2383 
2384         mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2385 
2386         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2387                 return (mp_chain);
2388 
2389         mpp = &mp_chain;
2390         while ((mp = *mpp) != NULL) {
2391                 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2392                         struct ether_header *ehp;
2393 
2394                         ehp = (struct ether_header *)mp->b_rptr;
2395                         if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2396                                 *mpp = mp->b_next;
2397                                 mp->b_next = NULL;
2398                                 aggr_recv_lacp(port,
2399                                     (mac_resource_handle_t)rr_ring, mp);
2400                                 continue;
2401                         }
2402                 }
2403 
2404                 if (!port->lp_collector_enabled) {
2405                         *mpp = mp->b_next;
2406                         mp->b_next = NULL;
2407                         freemsg(mp);
2408                         continue;
2409                 }
2410                 mpp = &mp->b_next;
2411         }
2412         return (mp_chain);
2413 }
2414 
2415 static int
2416 aggr_addmac(void *arg, const uint8_t *mac_addr)
2417 {
2418         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2419         aggr_unicst_addr_t      *addr, **pprev;
2420         aggr_grp_t              *grp = rx_group->arg_grp;
2421         aggr_port_t             *port, *p;
2422         mac_perim_handle_t      mph;
2423         int                     err = 0;
2424 
2425         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2426 
2427         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2428                 mac_perim_exit(mph);
2429                 return (0);
2430         }
2431 
2432         /*
2433          * Insert this mac address into the list of mac addresses owned by
2434          * the aggregation pseudo group.
2435          */
2436         pprev = &rx_group->arg_macaddr;
2437         while ((addr = *pprev) != NULL) {
2438                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2439                         mac_perim_exit(mph);
2440                         return (EEXIST);
2441                 }
2442                 pprev = &addr->aua_next;
2443         }
2444         addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2445         bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2446         addr->aua_next = NULL;
2447         *pprev = addr;
2448 
2449         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2450                 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2451                         break;
2452 
2453         if (err != 0) {
2454                 for (p = grp->lg_ports; p != port; p = p->lp_next)
2455                         aggr_port_remmac(p, mac_addr);
2456 
2457                 *pprev = NULL;
2458                 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2459         }
2460 
2461         mac_perim_exit(mph);
2462         return (err);
2463 }
2464 
2465 static int
2466 aggr_remmac(void *arg, const uint8_t *mac_addr)
2467 {
2468         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2469         aggr_unicst_addr_t      *addr, **pprev;
2470         aggr_grp_t              *grp = rx_group->arg_grp;
2471         aggr_port_t             *port;
2472         mac_perim_handle_t      mph;
2473         int                     err = 0;
2474 
2475         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2476 
2477         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2478                 mac_perim_exit(mph);
2479                 return (0);
2480         }
2481 
2482         /*
2483          * Insert this mac address into the list of mac addresses owned by
2484          * the aggregation pseudo group.
2485          */
2486         pprev = &rx_group->arg_macaddr;
2487         while ((addr = *pprev) != NULL) {
2488                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2489                         pprev = &addr->aua_next;
2490                         continue;
2491                 }
2492                 break;
2493         }
2494         if (addr == NULL) {
2495                 mac_perim_exit(mph);
2496                 return (EINVAL);
2497         }
2498 
2499         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2500                 aggr_port_remmac(port, mac_addr);
2501 
2502         *pprev = addr->aua_next;
2503         kmem_free(addr, sizeof (aggr_unicst_addr_t));
2504 
2505         mac_perim_exit(mph);
2506         return (err);
2507 }
2508 
2509 /*
2510  * Search for VID in the Rx group's list and return a pointer if
2511  * found. Otherwise return NULL.
2512  */
2513 static aggr_vlan_t *
2514 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2515 {
2516         ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2517         for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2518             avp = list_next(&rx_group->arg_vlans, avp)) {
2519                 if (avp->av_vid == vid)
2520                         return (avp);
2521         }
2522 
2523         return (NULL);
2524 }
2525 
2526 /*
2527  * Accept traffic on the specified VID.
2528  *
2529  * Persist VLAN state in the aggr so that ports added later will
2530  * receive the correct filters. In the future it would be nice to
2531  * allow aggr to iterate its clients instead of duplicating state.
2532  */
2533 static int
2534 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2535 {
2536         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2537         aggr_grp_t              *aggr = rx_group->arg_grp;
2538         aggr_port_t             *port, *p;
2539         mac_perim_handle_t      mph;
2540         int                     err = 0;
2541         aggr_vlan_t             *avp = NULL;
2542 
2543         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2544 
2545         if (vid == MAC_VLAN_UNTAGGED) {
2546                 /*
2547                  * Aggr is both a MAC provider and MAC client. As a
2548                  * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2549                  * client. As a client itself, it should pass
2550                  * VLAN_ID_NONE to its ports.
2551                  */
2552                 vid = VLAN_ID_NONE;
2553                 rx_group->arg_untagged++;
2554                 goto update_ports;
2555         }
2556 
2557         avp = aggr_find_vlan(rx_group, vid);
2558 
2559         if (avp != NULL) {
2560                 avp->av_refs++;
2561                 mac_perim_exit(mph);
2562                 return (0);
2563         }
2564 
2565         avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2566         avp->av_vid = vid;
2567         avp->av_refs = 1;
2568 
2569 update_ports:
2570         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2571                 if ((err = aggr_port_addvlan(port, vid)) != 0)
2572                         break;
2573 
2574         if (err != 0) {
2575                 /*
2576                  * If any of these calls fail then we are in a
2577                  * situation where the ports have different HW state.
2578                  * There's no reasonable action the MAC client can
2579                  * take in this scenario to rectify the situation.
2580                  */
2581                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2582                         int err2;
2583 
2584                         if ((err2 = aggr_port_remvlan(p, vid)) != 0) {
2585                                 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2586                                     " from port %s: errno %d.", vid,
2587                                     mac_client_name(p->lp_mch), err2);
2588                         }
2589 
2590                 }
2591 
2592                 if (vid == VLAN_ID_NONE)
2593                         rx_group->arg_untagged--;
2594 
2595                 if (avp != NULL) {
2596                         kmem_free(avp, sizeof (aggr_vlan_t));
2597                         avp = NULL;
2598                 }
2599         }
2600 
2601         if (avp != NULL)
2602                 list_insert_tail(&rx_group->arg_vlans, avp);
2603 
2604 done:
2605         mac_perim_exit(mph);
2606         return (err);
2607 }
2608 
2609 /*
2610  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2611  */
2612 static int
2613 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2614 {
2615         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2616         aggr_grp_t              *aggr = rx_group->arg_grp;
2617         aggr_port_t             *port, *p;
2618         mac_perim_handle_t      mph;
2619         int                     err = 0;
2620         aggr_vlan_t             *avp = NULL;
2621 
2622         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2623 
2624         /*
2625          * See the comment in aggr_addvlan().
2626          */
2627         if (vid == MAC_VLAN_UNTAGGED) {
2628                 vid = VLAN_ID_NONE;
2629                 rx_group->arg_untagged--;
2630 
2631                 if (rx_group->arg_untagged > 0)
2632                         goto done;
2633 
2634                 goto update_ports;
2635         }
2636 
2637         avp = aggr_find_vlan(rx_group, vid);
2638 
2639         if (avp == NULL) {
2640                 err = ENOENT;
2641                 goto done;
2642         }
2643 
2644         avp->av_refs--;
2645 
2646         if (avp->av_refs > 0)
2647                 goto done;
2648 
2649 update_ports:
2650         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2651                 if ((err = aggr_port_remvlan(port, vid)) != 0)
2652                         break;
2653 
2654         /*
2655          * See the comment in aggr_addvlan() for justification of the
2656          * use of VERIFY here.
2657          */
2658         if (err != 0) {
2659                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2660                         int err2;
2661 
2662                         if ((err2 = aggr_port_addvlan(p, vid)) != 0) {
2663                                 cmn_err(CE_WARN, "Failed to add VLAN %u"
2664                                     " to port %s: errno %d.", vid,
2665                                     mac_client_name(p->lp_mch), err2);
2666                         }
2667                 }
2668 
2669                 if (avp != NULL)
2670                         avp->av_refs++;
2671 
2672                 if (vid == VLAN_ID_NONE)
2673                         rx_group->arg_untagged++;
2674 
2675                 goto done;
2676         }
2677 
2678         if (err == 0 && avp != NULL) {
2679                 VERIFY3U(avp->av_refs, ==, 0);
2680                 list_remove(&rx_group->arg_vlans, avp);
2681                 kmem_free(avp, sizeof (aggr_vlan_t));
2682         }
2683 
2684 done:
2685         mac_perim_exit(mph);
2686         return (err);
2687 }
2688 
2689 /*
2690  * Add or remove the multicast addresses that are defined for the group
2691  * to or from the specified port.
2692  *
2693  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2694  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2695  * called when the port is either stopped or detached.
2696  */
2697 void
2698 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2699 {
2700         aggr_grp_t *grp = port->lp_grp;
2701 
2702         ASSERT(MAC_PERIM_HELD(port->lp_mh));
2703         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2704 
2705         if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2706                 return;
2707 
2708         mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2709 }
2710 
2711 static int
2712 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2713 {
2714         aggr_grp_t *grp = arg;
2715         aggr_port_t *port = NULL, *errport = NULL;
2716         mac_perim_handle_t mph;
2717         int err = 0;
2718 
2719         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2720         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2721                 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2722                     !port->lp_started) {
2723                         continue;
2724                 }
2725                 err = aggr_port_multicst(port, add, addrp);
2726                 if (err != 0) {
2727                         errport = port;
2728                         break;
2729                 }
2730         }
2731 
2732         /*
2733          * At least one port caused error return and this error is returned to
2734          * mac, eventually a NAK would be sent upwards.
2735          * Some ports have this multicast address listed now, and some don't.
2736          * Treat this error as a whole aggr failure not individual port failure.
2737          * Therefore remove this multicast address from other ports.
2738          */
2739         if ((err != 0) && add) {
2740                 for (port = grp->lg_ports; port != errport;
2741                     port = port->lp_next) {
2742                         if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2743                             !port->lp_started) {
2744                                 continue;
2745                         }
2746                         (void) aggr_port_multicst(port, B_FALSE, addrp);
2747                 }
2748         }
2749         mac_perim_exit(mph);
2750         return (err);
2751 }
2752 
2753 static int
2754 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2755 {
2756         aggr_grp_t *grp = arg;
2757         mac_perim_handle_t mph;
2758         int err;
2759 
2760         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2761         err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2762             0, 0);
2763         mac_perim_exit(mph);
2764         return (err);
2765 }
2766 
2767 /*
2768  * Initialize the capabilities that are advertised for the group
2769  * according to the capabilities of the constituent ports.
2770  */
2771 static void
2772 aggr_grp_capab_set(aggr_grp_t *grp)
2773 {
2774         uint32_t cksum;
2775         aggr_port_t *port;
2776         mac_capab_lso_t cap_lso;
2777 
2778         ASSERT(grp->lg_mh == NULL);
2779         ASSERT(grp->lg_ports != NULL);
2780 
2781         grp->lg_hcksum_txflags = (uint32_t)-1;
2782         grp->lg_zcopy = B_TRUE;
2783         grp->lg_vlan = B_TRUE;
2784 
2785         grp->lg_lso = B_TRUE;
2786         grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2787         grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2788 
2789         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2790                 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2791                         cksum = 0;
2792                 grp->lg_hcksum_txflags &= cksum;
2793 
2794                 grp->lg_vlan &=
2795                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2796 
2797                 grp->lg_zcopy &=
2798                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2799 
2800                 grp->lg_lso &=
2801                     mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2802                 if (grp->lg_lso) {
2803                         grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2804                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2805                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2806                                 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2807                                     cap_lso.lso_basic_tcp_ipv4.lso_max;
2808                 }
2809         }
2810 }
2811 
2812 /*
2813  * Checks whether the capabilities of the port being added are compatible
2814  * with the current capabilities of the aggregation.
2815  */
2816 static boolean_t
2817 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2818 {
2819         uint32_t hcksum_txflags;
2820 
2821         ASSERT(grp->lg_ports != NULL);
2822 
2823         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2824             grp->lg_vlan) != grp->lg_vlan) {
2825                 return (B_FALSE);
2826         }
2827 
2828         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2829             grp->lg_zcopy) != grp->lg_zcopy) {
2830                 return (B_FALSE);
2831         }
2832 
2833         if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2834                 if (grp->lg_hcksum_txflags != 0)
2835                         return (B_FALSE);
2836         } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2837             grp->lg_hcksum_txflags) {
2838                 return (B_FALSE);
2839         }
2840 
2841         if (grp->lg_lso) {
2842                 mac_capab_lso_t cap_lso;
2843 
2844                 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2845                         if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2846                             grp->lg_cap_lso.lso_flags)
2847                                 return (B_FALSE);
2848                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2849                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2850                                 return (B_FALSE);
2851                 } else {
2852                         return (B_FALSE);
2853                 }
2854         }
2855 
2856         return (B_TRUE);
2857 }
2858 
2859 /*
2860  * Returns the maximum SDU according to the SDU of the constituent ports.
2861  */
2862 static uint_t
2863 aggr_grp_max_sdu(aggr_grp_t *grp)
2864 {
2865         uint_t max_sdu = (uint_t)-1;
2866         aggr_port_t *port;
2867 
2868         ASSERT(grp->lg_ports != NULL);
2869 
2870         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2871                 uint_t port_sdu_max;
2872 
2873                 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2874                 if (max_sdu > port_sdu_max)
2875                         max_sdu = port_sdu_max;
2876         }
2877 
2878         return (max_sdu);
2879 }
2880 
2881 /*
2882  * Checks if the maximum SDU of the specified port is compatible
2883  * with the maximum SDU of the specified aggregation group, returns
2884  * B_TRUE if it is, B_FALSE otherwise.
2885  */
2886 static boolean_t
2887 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2888 {
2889         uint_t port_sdu_max;
2890 
2891         mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2892         return (port_sdu_max >= grp->lg_max_sdu);
2893 }
2894 
2895 /*
2896  * Returns the maximum margin according to the margin of the constituent ports.
2897  */
2898 static uint32_t
2899 aggr_grp_max_margin(aggr_grp_t *grp)
2900 {
2901         uint32_t margin = UINT32_MAX;
2902         aggr_port_t *port;
2903 
2904         ASSERT(grp->lg_mh == NULL);
2905         ASSERT(grp->lg_ports != NULL);
2906 
2907         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2908                 if (margin > port->lp_margin)
2909                         margin = port->lp_margin;
2910         }
2911 
2912         grp->lg_margin = margin;
2913         return (margin);
2914 }
2915 
2916 /*
2917  * Checks if the maximum margin of the specified port is compatible
2918  * with the maximum margin of the specified aggregation group, returns
2919  * B_TRUE if it is, B_FALSE otherwise.
2920  */
2921 static boolean_t
2922 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2923 {
2924         if (port->lp_margin >= grp->lg_margin)
2925                 return (B_TRUE);
2926 
2927         /*
2928          * See whether the current margin value is allowed to be changed to
2929          * the new value.
2930          */
2931         if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2932                 return (B_FALSE);
2933 
2934         grp->lg_margin = port->lp_margin;
2935         return (B_TRUE);
2936 }
2937 
2938 /*
2939  * Set MTU on individual ports of an aggregation group
2940  */
2941 static int
2942 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2943     uint32_t *old_mtu)
2944 {
2945         boolean_t               removed = B_FALSE;
2946         mac_perim_handle_t      mph;
2947         mac_diag_t              diag;
2948         int                     err, rv, retry = 0;
2949 
2950         if (port->lp_mah != NULL) {
2951                 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2952                 port->lp_mah = NULL;
2953                 removed = B_TRUE;
2954         }
2955         err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2956 try_again:
2957         if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2958             MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2959             &port->lp_mah, 0, &diag)) != 0) {
2960                 /*
2961                  * following is a workaround for a bug in 'bge' driver.
2962                  * See CR 6794654 for more information and this work around
2963                  * will be removed once the CR is fixed.
2964                  */
2965                 if (rv == EIO && retry++ < 3) {
2966                         delay(2 * hz);
2967                         goto try_again;
2968                 }
2969                 /*
2970                  * if mac_unicast_add() failed while setting the MTU,
2971                  * detach the port from the group.
2972                  */
2973                 mac_perim_enter_by_mh(port->lp_mh, &mph);
2974                 (void) aggr_grp_detach_port(grp, port);
2975                 mac_perim_exit(mph);
2976                 cmn_err(CE_WARN, "Unable to restart the port %s while "
2977                     "setting MTU. Detaching the port from the aggregation.",
2978                     mac_client_name(port->lp_mch));
2979         }
2980         return (err);
2981 }
2982 
2983 static int
2984 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2985 {
2986         int                     err = 0, i, rv;
2987         aggr_port_t             *port;
2988         uint32_t                *mtu;
2989 
2990         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2991 
2992         /*
2993          * If the MTU being set is equal to aggr group's maximum
2994          * allowable value, then there is nothing to change
2995          */
2996         if (sdu == grp->lg_max_sdu)
2997                 return (0);
2998 
2999         /* 0 is aggr group's min sdu */
3000         if (sdu == 0)
3001                 return (EINVAL);
3002 
3003         mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3004         for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3005             port = port->lp_next, i++) {
3006                 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3007         }
3008         if (err != 0) {
3009                 /* recover from error: reset the mtus of the ports */
3010                 aggr_port_t *tmp;
3011 
3012                 for (tmp = grp->lg_ports, i = 0; tmp != port;
3013                     tmp = tmp->lp_next, i++) {
3014                         (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3015                 }
3016                 goto bail;
3017         }
3018         grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3019         rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3020         ASSERT(rv == 0);
3021 bail:
3022         kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3023         return (err);
3024 }
3025 
3026 /*
3027  * Callback functions for set/get of properties
3028  */
3029 /*ARGSUSED*/
3030 static int
3031 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3032     uint_t pr_valsize, const void *pr_val)
3033 {
3034         int             err = ENOTSUP;
3035         aggr_grp_t      *grp = m_driver;
3036 
3037         switch (pr_num) {
3038         case MAC_PROP_MTU: {
3039                 uint32_t        mtu;
3040 
3041                 if (pr_valsize < sizeof (mtu)) {
3042                         err = EINVAL;
3043                         break;
3044                 }
3045                 bcopy(pr_val, &mtu, sizeof (mtu));
3046                 err = aggr_sdu_update(grp, mtu);
3047                 break;
3048         }
3049         default:
3050                 break;
3051         }
3052         return (err);
3053 }
3054 
3055 typedef struct rboundary {
3056         uint32_t        bval;
3057         int             btype;
3058 } rboundary_t;
3059 
3060 /*
3061  * This function finds the intersection of mtu ranges stored in arrays -
3062  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3063  * Individual arrays are assumed to contain non-overlapping ranges.
3064  * Algorithm:
3065  *   A range has two boundaries - min and max. We scan all arrays and store
3066  * each boundary as a separate element in a temporary array. We also store
3067  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3068  * array. Then we sort the temporary array in ascending order. We scan the
3069  * sorted array from lower to higher values and keep a cumulative sum of
3070  * boundary types. Element in the temporary array for which the sum reaches
3071  * mcount is a min boundary of a range in the result and next element will be
3072  * max boundary.
3073  *
3074  * Example for mcount = 3,
3075  *
3076  *  ----|_________|-------|_______|----|__|------ mrange[0]
3077  *
3078  *  -------|________|--|____________|-----|___|-- mrange[1]
3079  *
3080  *  --------|________________|-------|____|------ mrange[2]
3081  *
3082  *                                      3 2 1
3083  *                                       \|/
3084  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3085  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3086  *
3087  *                                 same min and max
3088  *                                        V
3089  *  --------|_____|-------|__|------------|------ intersecting ranges
3090  */
3091 void
3092 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3093     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3094 {
3095         mac_propval_uint32_range_t      *rval, *ur;
3096         int                             rmaxcnt, rcount;
3097         size_t                          sz_range32;
3098         rboundary_t                     *ta; /* temporary array */
3099         rboundary_t                     temp;
3100         boolean_t                       range_started = B_FALSE;
3101         int                             i, j, m, sum;
3102 
3103         sz_range32 = sizeof (mac_propval_uint32_range_t);
3104 
3105         for (i = 0, rmaxcnt = 0; i < mcount; i++)
3106                 rmaxcnt += mrange[i]->mpr_count;
3107 
3108         /* Allocate enough space to store the results */
3109         rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3110 
3111         /* Number of boundaries are twice as many as ranges */
3112         ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3113 
3114         for (i = 0, m = 0; i < mcount; i++) {
3115                 ur = &(mrange[i]->mpr_range_uint32[0]);
3116                 for (j = 0; j < mrange[i]->mpr_count; j++) {
3117                         ta[m].bval = ur[j].mpur_min;
3118                         ta[m++].btype = 1;
3119                         ta[m].bval = ur[j].mpur_max;
3120                         ta[m++].btype = -1;
3121                 }
3122         }
3123 
3124         /*
3125          * Sort the temporary array in ascending order of bval;
3126          * if boundary values are same then sort on btype.
3127          */
3128         for (i = 0; i < m-1; i++) {
3129                 for (j = i+1; j < m; j++) {
3130                         if ((ta[i].bval > ta[j].bval) ||
3131                             ((ta[i].bval == ta[j].bval) &&
3132                             (ta[i].btype < ta[j].btype))) {
3133                                 temp = ta[i];
3134                                 ta[i] = ta[j];
3135                                 ta[j] = temp;
3136                         }
3137                 }
3138         }
3139 
3140         /* Walk through temporary array to find all ranges in the results */
3141         for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3142                 sum += ta[i].btype;
3143                 if (sum == mcount) {
3144                         rval[rcount].mpur_min = ta[i].bval;
3145                         range_started = B_TRUE;
3146                 } else if (sum < mcount && range_started) {
3147                         rval[rcount++].mpur_max = ta[i].bval;
3148                         range_started = B_FALSE;
3149                 }
3150         }
3151 
3152         *prval = rval;
3153         *prmaxcnt = rmaxcnt;
3154         *prcount = rcount;
3155 
3156         kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3157 }
3158 
3159 /*
3160  * Returns the mtu ranges which could be supported by aggr group.
3161  * prmaxcnt returns the size of the buffer prval, prcount returns
3162  * the number of valid entries in prval. Caller is responsible
3163  * for freeing up prval.
3164  */
3165 int
3166 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3167     int *prmaxcnt, int *prcount)
3168 {
3169         mac_propval_range_t             **vals;
3170         aggr_port_t                     *port;
3171         mac_perim_handle_t              mph;
3172         uint_t                          i, numr;
3173         int                             err = 0;
3174         size_t                          sz_propval, sz_range32;
3175         size_t                          size;
3176 
3177         sz_propval = sizeof (mac_propval_range_t);
3178         sz_range32 = sizeof (mac_propval_uint32_range_t);
3179 
3180         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3181 
3182         vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3183             KM_SLEEP);
3184 
3185         for (port = grp->lg_ports, i = 0; port != NULL;
3186             port = port->lp_next, i++) {
3187 
3188                 size = sz_propval;
3189                 vals[i] = kmem_alloc(size, KM_SLEEP);
3190                 vals[i]->mpr_count = 1;
3191 
3192                 mac_perim_enter_by_mh(port->lp_mh, &mph);
3193 
3194                 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3195                     NULL, 0, vals[i], NULL);
3196                 if (err == ENOSPC) {
3197                         /*
3198                          * Not enough space to hold all ranges.
3199                          * Allocate extra space as indicated and retry.
3200                          */
3201                         numr = vals[i]->mpr_count;
3202                         kmem_free(vals[i], sz_propval);
3203                         size = sz_propval + (numr - 1) * sz_range32;
3204                         vals[i] = kmem_alloc(size, KM_SLEEP);
3205                         vals[i]->mpr_count = numr;
3206                         err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3207                             NULL, 0, vals[i], NULL);
3208                         ASSERT(err != ENOSPC);
3209                 }
3210                 mac_perim_exit(mph);
3211                 if (err != 0) {
3212                         kmem_free(vals[i], size);
3213                         vals[i] = NULL;
3214                         break;
3215                 }
3216         }
3217 
3218         /*
3219          * if any of the underlying ports does not support changing MTU then
3220          * just return ENOTSUP
3221          */
3222         if (port != NULL) {
3223                 ASSERT(err != 0);
3224                 goto done;
3225         }
3226 
3227         aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3228             prcount);
3229 
3230 done:
3231         for (i = 0; i < grp->lg_nports; i++) {
3232                 if (vals[i] != NULL) {
3233                         numr = vals[i]->mpr_count;
3234                         size = sz_propval + (numr - 1) * sz_range32;
3235                         kmem_free(vals[i], size);
3236                 }
3237         }
3238 
3239         kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3240         return (err);
3241 }
3242 
3243 static void
3244 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3245     mac_prop_info_handle_t prh)
3246 {
3247         aggr_grp_t                      *grp = m_driver;
3248         mac_propval_uint32_range_t      *rval = NULL;
3249         int                             i, rcount, rmaxcnt;
3250         int                             err = 0;
3251 
3252         _NOTE(ARGUNUSED(pr_name));
3253 
3254         switch (pr_num) {
3255         case MAC_PROP_MTU:
3256 
3257                 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3258                     &rcount);
3259                 if (err != 0) {
3260                         ASSERT(rval == NULL);
3261                         return;
3262                 }
3263                 for (i = 0; i < rcount; i++) {
3264                         mac_prop_info_set_range_uint32(prh,
3265                             rval[i].mpur_min, rval[i].mpur_max);
3266                 }
3267                 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3268                 break;
3269         }
3270 }