1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Joyent, Inc.
  24  */
  25 
  26 /*
  27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
  28  *
  29  * An instance of the structure aggr_grp_t is allocated for each
  30  * link aggregation group. When created, aggr_grp_t objects are
  31  * entered into the aggr_grp_hash hash table maintained by the modhash
  32  * module. The hash key is the linkid associated with the link
  33  * aggregation group.
  34  *
  35  * Each aggregation contains a set of ports. The port is represented
  36  * by the aggr_port_t structure. A port consists of a single MAC
  37  * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
  38  * MAC. This client is used by the aggr to send and receive LACP
  39  * traffic. Each port client takes on the same MAC unicast address --
  40  * the address of the aggregation itself (taken from the first port by
  41  * default).
  42  *
  43  * The MAC client that hangs off each aggr port is not your typical
  44  * MAC client. Not only does it have exclusive control of the MAC, but
  45  * it also has no Tx or Rx SRSes. An SRS is designed to queue and
  46  * fanout traffic among L4 protocols; but the aggr is an intermediary,
  47  * not a consumer. Instead of using SRSes, the aggr puts the
  48  * underlying hardware rings into passthru mode and ships packets up
  49  * via a direct call to aggr_recv_cb(). This allows aggr to enforce
  50  * LACP while passing all other traffic up to clients of the aggr.
  51  *
  52  * Pseudo Rx Groups and Rings
  53  * --------------------------
  54  *
  55  * It is imperative for client performance that the aggr provide as
  56  * many MAC groups as possible. In order to use the underlying HW
  57  * resources, aggr creates pseudo groups to aggregate the underlying
  58  * HW groups. Every HW group gets mapped to a pseudo group; and every
  59  * HW ring in that group gets mapped to a pseudo ring. The pseudo
  60  * group at index 0 combines all the HW groups at index 0 from each
  61  * port, etc. The aggr's MAC then creates normal MAC groups and rings
  62  * out of these pseudo groups and rings to present to the aggr's
  63  * clients. To the clients, the aggr's groups and rings are absolutely
  64  * no different than a NIC's groups or rings.
  65  *
  66  * Pseudo Tx Rings
  67  * ---------------
  68  *
  69  * The underlying ports (NICs) in an aggregation can have Tx rings. To
  70  * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new; it is already present and implemented on the Rx side.
  73  * The same concept is extended to the Tx side where each Tx ring of
  74  * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
  75  * each pseudo Tx ring will map to a specific hardware Tx ring. Even
  76  * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
  77  * is given to the aggregation layer.
  78  *
  79  * With this change, the outgoing stack depth looks much better:
  80  *
  81  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
  83  *
  84  * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
  85  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
  86  *
  87  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
  88  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
  89  * ring belonging to a port on which the packet has to be sent.
  90  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
  91  * policy and then uses the fanout_hint passed to it to pick a Tx ring from
  92  * the selected port.
  93  *
  94  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
  95  * bandwidth limit is applied first on the outgoing packet and the packets
  96  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
  97  * particular Tx ring.
  98  */
  99 
 100 #include <sys/types.h>
 101 #include <sys/sysmacros.h>
 102 #include <sys/conf.h>
 103 #include <sys/cmn_err.h>
 104 #include <sys/disp.h>
 105 #include <sys/list.h>
 106 #include <sys/ksynch.h>
 107 #include <sys/kmem.h>
 108 #include <sys/stream.h>
 109 #include <sys/modctl.h>
 110 #include <sys/ddi.h>
 111 #include <sys/sunddi.h>
 112 #include <sys/atomic.h>
 113 #include <sys/stat.h>
 114 #include <sys/modhash.h>
 115 #include <sys/id_space.h>
 116 #include <sys/strsun.h>
 117 #include <sys/cred.h>
 118 #include <sys/dlpi.h>
 119 #include <sys/zone.h>
 120 #include <sys/mac_provider.h>
 121 #include <sys/dls.h>
 122 #include <sys/vlan.h>
 123 #include <sys/aggr.h>
 124 #include <sys/aggr_impl.h>
 125 
/*
 * Forward declarations for file-local functions.
 *
 * The aggr_m_*() routines are the aggr's MAC callback entry points; most of
 * them are installed in the aggr_m_callbacks vector defined below.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port membership helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group-wide capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo Rx group/ring management and the ring/group entry points. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
 165 
/*
 * Module-wide state. The cache, hash table, lock and id space are all
 * created in aggr_grp_init() and torn down in aggr_grp_fini().
 */
static kmem_cache_t     *aggr_grp_cache;        /* cache of aggr_grp_t */
static mod_hash_t       *aggr_grp_hash;         /* groups, keyed by linkid */
static krwlock_t        aggr_grp_lock;          /* held to read aggr_grp_cnt */
static uint_t           aggr_grp_cnt;           /* see aggr_grp_count() */
static id_space_t       *key_ids;               /* keys above AGGR_MAX_KEY */

/* Number of hash chains requested for aggr_grp_hash. */
#define GRP_HASHSZ              64
/* Convert a linkid into the key type used by aggr_grp_hash. */
#define GRP_HASH_KEY(linkid)    ((mod_hash_key_t)(uintptr_t)linkid)
#define AGGR_PORT_NAME_DELIMIT '-'

/* An all-zero, six byte MAC address. */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
 177 
/*
 * Flags advertising the optional callbacks supplied in aggr_m_callbacks:
 * ioctl, capability query, property set and property info.
 */
#define AGGR_M_CALLBACK_FLAGS   \
        (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * MAC callback vector for the aggr. The initializer is positional, so the
 * order of the entries (including the NULL placeholders for callbacks the
 * aggr does not supply) must follow the member order of mac_callbacks_t.
 * NOTE(review): the member order is defined outside this file, presumably
 * in <sys/mac_provider.h> -- confirm before editing.
 */
static mac_callbacks_t aggr_m_callbacks = {
        AGGR_M_CALLBACK_FLAGS,
        aggr_m_stat,
        aggr_m_start,
        aggr_m_stop,
        aggr_m_promisc,
        aggr_m_multicst,
        NULL,
        NULL,
        NULL,
        aggr_m_ioctl,
        aggr_m_capab_get,
        NULL,
        NULL,
        aggr_m_setprop,
        NULL,
        aggr_m_propinfo
};
 199 
 200 /*ARGSUSED*/
 201 static int
 202 aggr_grp_constructor(void *buf, void *arg, int kmflag)
 203 {
 204         aggr_grp_t *grp = buf;
 205 
 206         bzero(grp, sizeof (*grp));
 207         mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
 208         cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
 209         rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
 210         mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
 211         cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
 212         mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
 213         cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
 214         grp->lg_link_state = LINK_STATE_UNKNOWN;
 215         return (0);
 216 }
 217 
 218 /*ARGSUSED*/
 219 static void
 220 aggr_grp_destructor(void *buf, void *arg)
 221 {
 222         aggr_grp_t *grp = buf;
 223 
 224         if (grp->lg_tx_ports != NULL) {
 225                 kmem_free(grp->lg_tx_ports,
 226                     grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 227         }
 228 
 229         mutex_destroy(&grp->lg_lacp_lock);
 230         cv_destroy(&grp->lg_lacp_cv);
 231         mutex_destroy(&grp->lg_port_lock);
 232         cv_destroy(&grp->lg_port_cv);
 233         rw_destroy(&grp->lg_tx_lock);
 234         mutex_destroy(&grp->lg_tx_flowctl_lock);
 235         cv_destroy(&grp->lg_tx_flowctl_cv);
 236 }
 237 
 238 void
 239 aggr_grp_init(void)
 240 {
 241         aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
 242             sizeof (aggr_grp_t), 0, aggr_grp_constructor,
 243             aggr_grp_destructor, NULL, NULL, NULL, 0);
 244 
 245         aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
 246             GRP_HASHSZ, mod_hash_null_valdtor);
 247         rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
 248         aggr_grp_cnt = 0;
 249 
 250         /*
 251          * Allocate an id space to manage key values (when key is not
 252          * specified). The range of the id space will be from
 253          * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
 254          * uses a 16-bit key.
 255          */
 256         key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
 257         ASSERT(key_ids != NULL);
 258 }
 259 
/*
 * Tear down everything aggr_grp_init() created, in the reverse order of
 * creation: key id space, global lock, group hash table, object cache.
 */
void
aggr_grp_fini(void)
{
        id_space_destroy(key_ids);
        rw_destroy(&aggr_grp_lock);
        mod_hash_destroy_idhash(aggr_grp_hash);
        kmem_cache_destroy(aggr_grp_cache);
}
 268 
 269 uint_t
 270 aggr_grp_count(void)
 271 {
 272         uint_t  count;
 273 
 274         rw_enter(&aggr_grp_lock, RW_READER);
 275         count = aggr_grp_cnt;
 276         rw_exit(&aggr_grp_lock);
 277         return (count);
 278 }
 279 
 280 /*
 281  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
 282  * requires the mac perimeter, this function holds a reference of the aggr
 283  * and aggr won't call mac_unregister() until this reference drops to 0.
 284  */
 285 void
 286 aggr_grp_port_hold(aggr_port_t *port)
 287 {
 288         aggr_grp_t      *grp = port->lp_grp;
 289 
 290         AGGR_PORT_REFHOLD(port);
 291         mutex_enter(&grp->lg_port_lock);
 292         grp->lg_port_ref++;
 293         mutex_exit(&grp->lg_port_lock);
 294 }
 295 
 296 /*
 297  * Release the reference of the grp and inform aggr_grp_delete() calling
 298  * mac_unregister() is now safe.
 299  */
 300 void
 301 aggr_grp_port_rele(aggr_port_t *port)
 302 {
 303         aggr_grp_t      *grp = port->lp_grp;
 304 
 305         mutex_enter(&grp->lg_port_lock);
 306         if (--grp->lg_port_ref == 0)
 307                 cv_signal(&grp->lg_port_cv);
 308         mutex_exit(&grp->lg_port_lock);
 309         AGGR_PORT_REFRELE(port);
 310 }
 311 
 312 /*
 313  * Wait for the port's lacp timer thread and the port's notification callback
 314  * to exit.
 315  */
 316 void
 317 aggr_grp_port_wait(aggr_grp_t *grp)
 318 {
 319         mutex_enter(&grp->lg_port_lock);
 320         if (grp->lg_port_ref != 0)
 321                 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
 322         mutex_exit(&grp->lg_port_lock);
 323 }
 324 
 325 /*
 326  * Attach a port to a link aggregation group.
 327  *
 328  * A port is attached to a link aggregation group once its speed
 329  * and link state have been verified.
 330  *
 331  * Returns B_TRUE if the group link state or speed has changed. If
 332  * it's the case, the caller must notify the MAC layer via a call
 333  * to mac_link().
 334  */
 335 boolean_t
 336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
 337 {
 338         boolean_t link_state_changed = B_FALSE;
 339 
 340         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 341         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 342 
 343         if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
 344                 return (B_FALSE);
 345 
 346         /*
 347          * Validate the MAC port link speed and update the group
 348          * link speed if needed.
 349          */
 350         if (port->lp_ifspeed == 0 ||
 351             port->lp_link_state != LINK_STATE_UP ||
 352             port->lp_link_duplex != LINK_DUPLEX_FULL) {
 353                 /*
 354                  * Can't attach a MAC port with unknown link speed,
 355                  * down link, or not in full duplex mode.
 356                  */
 357                 return (B_FALSE);
 358         }
 359 
 360         mutex_enter(&grp->lg_stat_lock);
 361         if (grp->lg_ifspeed == 0) {
 362                 /*
 363                  * The group inherits the speed of the first link being
 364                  * attached.
 365                  */
 366                 grp->lg_ifspeed = port->lp_ifspeed;
 367                 link_state_changed = B_TRUE;
 368         } else if (grp->lg_ifspeed != port->lp_ifspeed) {
 369                 /*
 370                  * The link speed of the MAC port must be the same as
 371                  * the group link speed, as per 802.3ad. Since it is
 372                  * not, the attach is cancelled.
 373                  */
 374                 mutex_exit(&grp->lg_stat_lock);
 375                 return (B_FALSE);
 376         }
 377         mutex_exit(&grp->lg_stat_lock);
 378 
 379         grp->lg_nattached_ports++;
 380 
 381         /*
 382          * Update the group link state.
 383          */
 384         if (grp->lg_link_state != LINK_STATE_UP) {
 385                 grp->lg_link_state = LINK_STATE_UP;
 386                 mutex_enter(&grp->lg_stat_lock);
 387                 grp->lg_link_duplex = LINK_DUPLEX_FULL;
 388                 mutex_exit(&grp->lg_stat_lock);
 389                 link_state_changed = B_TRUE;
 390         }
 391 
 392         /*
 393          * Update port's state.
 394          */
 395         port->lp_state = AGGR_PORT_STATE_ATTACHED;
 396 
 397         aggr_grp_multicst_port(port, B_TRUE);
 398 
 399         /*
 400          * The port client doesn't have an Rx SRS; instead of calling
 401          * mac_rx_set() we set the client's flow callback directly.
 402          * This datapath is used only when the port's driver doesn't
 403          * support MAC_CAPAB_RINGS. Drivers with ring support will
 404          * deliver traffic to the aggr via ring passthru.
 405          */
 406         mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
 407 
 408         /*
 409          * If LACP is OFF, the port can be used to send data as soon
 410          * as its link is up and verified to be compatible with the
 411          * aggregation.
 412          *
 413          * If LACP is active or passive, notify the LACP subsystem, which
 414          * will enable sending on the port following the LACP protocol.
 415          */
 416         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 417                 aggr_send_port_enable(port);
 418         else
 419                 aggr_lacp_port_attached(port);
 420 
 421         return (link_state_changed);
 422 }
 423 
 424 boolean_t
 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
 426 {
 427         boolean_t link_state_changed = B_FALSE;
 428 
 429         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 430         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 431 
 432         /* update state */
 433         if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
 434                 return (B_FALSE);
 435 
 436         mac_client_clear_flow_cb(port->lp_mch);
 437 
 438         aggr_grp_multicst_port(port, B_FALSE);
 439 
 440         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 441                 aggr_send_port_disable(port);
 442         else
 443                 aggr_lacp_port_detached(port);
 444 
 445         port->lp_state = AGGR_PORT_STATE_STANDBY;
 446 
 447         grp->lg_nattached_ports--;
 448         if (grp->lg_nattached_ports == 0) {
 449                 /* the last attached MAC port of the group is being detached */
 450                 grp->lg_link_state = LINK_STATE_DOWN;
 451                 mutex_enter(&grp->lg_stat_lock);
 452                 grp->lg_ifspeed = 0;
 453                 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
 454                 mutex_exit(&grp->lg_stat_lock);
 455                 link_state_changed = B_TRUE;
 456         }
 457 
 458         return (link_state_changed);
 459 }
 460 
 461 /*
 462  * Update the MAC addresses of the constituent ports of the specified
 463  * group. This function is invoked:
 464  * - after creating a new aggregation group.
 465  * - after adding new ports to an aggregation group.
 466  * - after removing a port from a group when the MAC address of
 467  *   that port was used for the MAC address of the group.
 468  * - after the MAC address of a port changed when the MAC address
 469  *   of that port was used for the MAC address of the group.
 470  *
 471  * Return true if the link state of the aggregation changed, for example
 472  * as a result of a failure changing the MAC address of one of the
 473  * constituent ports.
 474  */
 475 boolean_t
 476 aggr_grp_update_ports_mac(aggr_grp_t *grp)
 477 {
 478         aggr_port_t *cport;
 479         boolean_t link_state_changed = B_FALSE;
 480         mac_perim_handle_t mph;
 481 
 482         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 483 
 484         for (cport = grp->lg_ports; cport != NULL;
 485             cport = cport->lp_next) {
 486                 mac_perim_enter_by_mh(cport->lp_mh, &mph);
 487                 if (aggr_port_unicst(cport) != 0) {
 488                         if (aggr_grp_detach_port(grp, cport))
 489                                 link_state_changed = B_TRUE;
 490                 } else {
 491                         /*
 492                          * If a port was detached because of a previous
 493                          * failure changing the MAC address, the port is
 494                          * reattached when it successfully changes the MAC
 495                          * address now, and this might cause the link state
 496                          * of the aggregation to change.
 497                          */
 498                         if (aggr_grp_attach_port(grp, cport))
 499                                 link_state_changed = B_TRUE;
 500                 }
 501                 mac_perim_exit(mph);
 502         }
 503         return (link_state_changed);
 504 }
 505 
 506 /*
 507  * Invoked when the MAC address of a port has changed. If the port's
 508  * MAC address was used for the group MAC address, set mac_addr_changedp
 509  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
 510  * notification. If the link state changes due to detach/attach of
 511  * the constituent port, set link_state_changedp to B_TRUE to indicate
 512  * to the caller that it should send a MAC_NOTE_LINK notification. In both
 513  * cases, it is the responsibility of the caller to invoke notification
 514  * functions after releasing the the port lock.
 515  */
 516 void
 517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
 518     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
 519 {
 520         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 521         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 522         ASSERT(mac_addr_changedp != NULL);
 523         ASSERT(link_state_changedp != NULL);
 524 
 525         *mac_addr_changedp = B_FALSE;
 526         *link_state_changedp = B_FALSE;
 527 
 528         if (grp->lg_addr_fixed) {
 529                 /*
 530                  * The group is using a fixed MAC address or an automatic
 531                  * MAC address has not been set.
 532                  */
 533                 return;
 534         }
 535 
 536         if (grp->lg_mac_addr_port == port) {
 537                 /*
 538                  * The MAC address of the port was assigned to the group
 539                  * MAC address. Update the group MAC address.
 540                  */
 541                 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
 542                 *mac_addr_changedp = B_TRUE;
 543         } else {
 544                 /*
 545                  * Update the actual port MAC address to the MAC address
 546                  * of the group.
 547                  */
 548                 if (aggr_port_unicst(port) != 0) {
 549                         *link_state_changedp = aggr_grp_detach_port(grp, port);
 550                 } else {
 551                         /*
 552                          * If a port was detached because of a previous
 553                          * failure changing the MAC address, the port is
 554                          * reattached when it successfully changes the MAC
 555                          * address now, and this might cause the link state
 556                          * of the aggregation to change.
 557                          */
 558                         *link_state_changedp = aggr_grp_attach_port(grp, port);
 559                 }
 560         }
 561 }
 562 
 563 /*
 564  * Add a port to a link aggregation group.
 565  */
 566 static int
 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
 568     aggr_port_t **pp)
 569 {
 570         aggr_port_t *port, **cport;
 571         mac_perim_handle_t mph;
 572         zoneid_t port_zoneid = ALL_ZONES;
 573         int err;
 574 
 575         /* The port must be in the same zone as the aggregation. */
 576         if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
 577                 port_zoneid = GLOBAL_ZONEID;
 578         if (grp->lg_zoneid != port_zoneid)
 579                 return (EBUSY);
 580 
 581         /*
 582          * If we are creating the aggr, then there is no MAC handle
 583          * and thus no perimeter to hold. If we are adding a port to
 584          * an existing aggr, then the perimiter of the aggr's MAC must
 585          * be held.
 586          */
 587         ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
 588 
 589         err = aggr_port_create(grp, port_linkid, force, &port);
 590         if (err != 0)
 591                 return (err);
 592 
 593         mac_perim_enter_by_mh(port->lp_mh, &mph);
 594 
 595         /* Add the new port to the end of the list. */
 596         cport = &grp->lg_ports;
 597         while (*cport != NULL)
 598                 cport = &((*cport)->lp_next);
 599         *cport = port;
 600 
 601         /*
 602          * Back reference to the group it is member of. A port always
 603          * holds a reference to its group to ensure that the back
 604          * reference is always valid.
 605          */
 606         port->lp_grp = grp;
 607         AGGR_GRP_REFHOLD(grp);
 608         grp->lg_nports++;
 609 
 610         aggr_lacp_init_port(port);
 611         mac_perim_exit(mph);
 612 
 613         if (pp != NULL)
 614                 *pp = port;
 615 
 616         return (0);
 617 }
 618 
 619 /*
 620  * This is called in response to either our LACP state machine or a MAC
 621  * notification that the link has gone down via aggr_send_port_disable(). At
 622  * this point, we may need to update our default ring. To that end, we go
 623  * through the set of ports (underlying datalinks in an aggregation) that are
 624  * currently enabled to transmit data. If all our links have been disabled for
 625  * transmit, then we don't do anything.
 626  *
 627  * Note, because we only have a single TX group, we don't have to worry about
 628  * the rings moving between groups and the chance that mac will reassign it
 629  * unless someone removes a port, at which point, we play it safe and call this
 630  * again.
 631  */
 632 void
 633 aggr_grp_update_default(aggr_grp_t *grp)
 634 {
 635         aggr_port_t *port;
 636         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 637 
 638         rw_enter(&grp->lg_tx_lock, RW_WRITER);
 639 
 640         if (grp->lg_ntx_ports == 0) {
 641                 rw_exit(&grp->lg_tx_lock);
 642                 return;
 643         }
 644 
 645         port = grp->lg_tx_ports[0];
 646         ASSERT(port->lp_tx_ring_cnt > 0);
 647         mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
 648         rw_exit(&grp->lg_tx_lock);
 649 }
 650 
 651 /*
 652  * Add a pseudo RX ring for the given HW ring handle.
 653  */
 654 static int
 655 aggr_add_pseudo_rx_ring(aggr_port_t *port,
 656     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 657 {
 658         aggr_pseudo_rx_ring_t   *ring;
 659         int                     err;
 660         int                     j;
 661 
 662         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 663                 ring = rx_grp->arg_rings + j;
 664                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
 665                         break;
 666         }
 667 
 668         /*
 669          * No slot for this new RX ring.
 670          */
 671         if (j == MAX_RINGS_PER_GROUP)
 672                 return (EIO);
 673 
 674         ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 675         ring->arr_hw_rh = hw_rh;
 676         ring->arr_port = port;
 677         ring->arr_grp = rx_grp;
 678         rx_grp->arg_ring_cnt++;
 679 
 680         /*
 681          * The group is already registered, dynamically add a new ring to the
 682          * mac group.
 683          */
 684         if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
 685                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 686                 ring->arr_hw_rh = NULL;
 687                 ring->arr_port = NULL;
 688                 ring->arr_grp = NULL;
 689                 rx_grp->arg_ring_cnt--;
 690         } else {
 691                 /*
 692                  * This must run after the MAC is registered.
 693                  */
 694                 ASSERT3P(ring->arr_rh, !=, NULL);
 695                 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
 696                     (void *)port, (mac_resource_handle_t)ring);
 697         }
 698         return (err);
 699 }
 700 
 701 /*
 702  * Remove the pseudo RX ring of the given HW ring handle.
 703  */
 704 static void
 705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 706 {
 707         for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 708                 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
 709 
 710                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
 711                     ring->arr_hw_rh != hw_rh) {
 712                         continue;
 713                 }
 714 
 715                 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
 716 
 717                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 718                 ring->arr_hw_rh = NULL;
 719                 ring->arr_port = NULL;
 720                 ring->arr_grp = NULL;
 721                 rx_grp->arg_ring_cnt--;
 722                 mac_hwring_clear_passthru(hw_rh);
 723                 break;
 724         }
 725 }
 726 
 727 /*
 728  * Create pseudo rings over the HW rings of the port.
 729  *
 730  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
 731  *
 732  * o Program existing unicast filters on the pseudo group into the HW group.
 733  *
 734  * o Program existing VLAN filters on the pseudo group into the HW group.
 735  */
 736 static int
 737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 738 {
 739         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 740         aggr_unicst_addr_t      *addr, *a;
 741         mac_perim_handle_t      pmph;
 742         aggr_vlan_t             *avp;
 743         uint_t                  hw_rh_cnt, i;
 744         int                     err = 0;
 745         uint_t                  g_idx = rx_grp->arg_index;
 746 
 747         ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
 748         ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
 749         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 750 
 751         /*
 752          * This function must be called after the aggr registers its
 753          * MAC and its Rx groups have been initialized.
 754          */
 755         ASSERT(rx_grp->arg_gh != NULL);
 756 
 757         /*
 758          * Get the list of the underlying HW rings.
 759          */
 760         hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
 761             &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
 762 
 763         /*
 764          * Add existing VLAN and unicast address filters to the port.
 765          */
 766         for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 767             avp = list_next(&rx_grp->arg_vlans, avp)) {
 768                 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
 769                         goto err;
 770         }
 771 
 772         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
 773                 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
 774                         goto err;
 775         }
 776 
 777         for (i = 0; i < hw_rh_cnt; i++) {
 778                 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
 779                 if (err != 0)
 780                         goto err;
 781         }
 782 
 783         mac_perim_exit(pmph);
 784         return (0);
 785 
 786 err:
 787         ASSERT(err != 0);
 788 
 789         for (uint_t j = 0; j < i; j++)
 790                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
 791 
 792         for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
 793                 aggr_port_remmac(port, g_idx, a->aua_addr);
 794 
 795         if (avp != NULL)
 796                 avp = list_prev(&rx_grp->arg_vlans, avp);
 797 
 798         for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
 799                 int err2;
 800 
 801                 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
 802                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 803                             ": errno %d.", avp->av_vid,
 804                             mac_client_name(port->lp_mch), err2);
 805                 }
 806         }
 807 
 808         port->lp_hwghs[g_idx] = NULL;
 809         mac_perim_exit(pmph);
 810         return (err);
 811 }
 812 
 813 /*
 814  * Destroy the pseudo rings mapping to this port and remove all VLAN
 815  * and unicast filters from this port. Even if there are no underlying
 816  * HW rings we must still remove the unicast filters to take the port
 817  * out of promisc mode.
 818  */
 819 static void
 820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 821 {
 822         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 823         aggr_unicst_addr_t      *addr;
 824         mac_perim_handle_t      pmph;
 825         uint_t                  hw_rh_cnt;
 826         uint_t                  g_idx = rx_grp->arg_index;
 827 
 828         ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
 829         ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
 830         ASSERT3P(rx_grp->arg_gh, !=, NULL);
 831         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 832 
 833         hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
 834             MAC_RING_TYPE_RX);
 835 
 836         for (uint_t i = 0; i < hw_rh_cnt; i++)
 837                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
 838 
 839         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
 840                 aggr_port_remmac(port, g_idx, addr->aua_addr);
 841 
 842         for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 843             avp = list_next(&rx_grp->arg_vlans, avp)) {
 844                 int err;
 845 
 846                 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
 847                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 848                             ": errno %d.", avp->av_vid,
 849                             mac_client_name(port->lp_mch), err);
 850                 }
 851         }
 852 
 853         port->lp_hwghs[g_idx] = NULL;
 854         mac_perim_exit(pmph);
 855 }
 856 
 857 /*
 858  * Add a pseudo TX ring for the given HW ring handle.
 859  */
 860 static int
 861 aggr_add_pseudo_tx_ring(aggr_port_t *port,
 862     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
 863     mac_ring_handle_t *pseudo_rh)
 864 {
 865         aggr_pseudo_tx_ring_t   *ring;
 866         int                     err;
 867         int                     i;
 868 
 869         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 870         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 871                 ring = tx_grp->atg_rings + i;
 872                 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
 873                         break;
 874         }
 875         /*
 876          * No slot for this new TX ring.
 877          */
 878         if (i == MAX_RINGS_PER_GROUP)
 879                 return (EIO);
 880         /*
 881          * The following 4 statements needs to be done before
 882          * calling mac_group_add_ring(). Otherwise it will
 883          * result in an assertion failure in mac_init_ring().
 884          */
 885         ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
 886         ring->atr_hw_rh = hw_rh;
 887         ring->atr_port = port;
 888         tx_grp->atg_ring_cnt++;
 889 
 890         /*
 891          * The TX side has no concept of ring groups unlike RX groups.
 892          * There is just a single group which stores all the TX rings.
 893          * This group will be used to store aggr's pseudo TX rings.
 894          */
 895         if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
 896                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 897                 ring->atr_hw_rh = NULL;
 898                 ring->atr_port = NULL;
 899                 tx_grp->atg_ring_cnt--;
 900         } else {
 901                 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
 902                 if (hw_rh != NULL) {
 903                         mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 904                             mac_find_ring(tx_grp->atg_gh, i));
 905                 }
 906         }
 907 
 908         return (err);
 909 }
 910 
 911 /*
 912  * Remove the pseudo TX ring of the given HW ring handle.
 913  */
 914 static void
 915 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
 916     mac_ring_handle_t pseudo_hw_rh)
 917 {
 918         aggr_pseudo_tx_ring_t   *ring;
 919         int                     i;
 920 
 921         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 922                 ring = tx_grp->atg_rings + i;
 923                 if (ring->atr_rh != pseudo_hw_rh)
 924                         continue;
 925 
 926                 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
 927                 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
 928                 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
 929                 mac_hwring_teardown(ring->atr_hw_rh);
 930                 ring->atr_hw_rh = NULL;
 931                 ring->atr_port = NULL;
 932                 tx_grp->atg_ring_cnt--;
 933                 break;
 934         }
 935 }
 936 
 937 /*
 938  * This function is called to create pseudo rings over hardware rings of
 939  * the underlying device. There is a 1:1 mapping between the pseudo TX
 940  * rings of the aggr and the hardware rings of the underlying port.
 941  */
 942 static int
 943 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 944 {
 945         aggr_grp_t              *grp = port->lp_grp;
 946         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
 947         mac_perim_handle_t      pmph;
 948         int                     hw_rh_cnt, i = 0, j;
 949         int                     err = 0;
 950 
 951         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 952         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 953 
 954         /*
 955          * Get the list the the underlying HW rings.
 956          */
 957         hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
 958             MAC_RING_TYPE_TX);
 959 
 960         /*
 961          * Even if the underlying NIC does not have TX rings, we
 962          * still make a psuedo TX ring for that NIC with NULL as
 963          * the ring handle.
 964          */
 965         if (hw_rh_cnt == 0)
 966                 port->lp_tx_ring_cnt = 1;
 967         else
 968                 port->lp_tx_ring_cnt = hw_rh_cnt;
 969 
 970         port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 971             port->lp_tx_ring_cnt), KM_SLEEP);
 972         port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 973             port->lp_tx_ring_cnt), KM_SLEEP);
 974 
 975         if (hw_rh_cnt == 0) {
 976                 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
 977                     NULL, &pseudo_rh)) == 0) {
 978                         port->lp_tx_rings[0] = NULL;
 979                         port->lp_pseudo_tx_rings[0] = pseudo_rh;
 980                 }
 981         } else {
 982                 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
 983                         err = aggr_add_pseudo_tx_ring(port,
 984                             tx_grp, hw_rh[i], &pseudo_rh);
 985                         if (err != 0)
 986                                 break;
 987                         port->lp_tx_rings[i] = hw_rh[i];
 988                         port->lp_pseudo_tx_rings[i] = pseudo_rh;
 989                 }
 990         }
 991 
 992         if (err != 0) {
 993                 if (hw_rh_cnt != 0) {
 994                         for (j = 0; j < i; j++) {
 995                                 aggr_rem_pseudo_tx_ring(tx_grp,
 996                                     port->lp_pseudo_tx_rings[j]);
 997                         }
 998                 }
 999                 kmem_free(port->lp_tx_rings,
1000                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1001                 kmem_free(port->lp_pseudo_tx_rings,
1002                     (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1003                 port->lp_tx_ring_cnt = 0;
1004         } else {
1005                 port->lp_tx_grp_added = B_TRUE;
1006                 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1007                     aggr_tx_ring_update, port);
1008         }
1009         mac_perim_exit(pmph);
1010         aggr_grp_update_default(grp);
1011         return (err);
1012 }
1013 
1014 /*
1015  * This function is called by aggr to remove pseudo TX rings over the
1016  * HW rings of the underlying port.
1017  */
1018 static void
1019 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1020 {
1021         aggr_grp_t              *grp = port->lp_grp;
1022         mac_perim_handle_t      pmph;
1023         int                     i;
1024 
1025         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1026         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1027 
1028         if (!port->lp_tx_grp_added)
1029                 goto done;
1030 
1031         ASSERT(tx_grp->atg_gh != NULL);
1032 
1033         for (i = 0; i < port->lp_tx_ring_cnt; i++)
1034                 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1035 
1036         kmem_free(port->lp_tx_rings,
1037             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1038         kmem_free(port->lp_pseudo_tx_rings,
1039             (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1040 
1041         port->lp_tx_ring_cnt = 0;
1042         (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1043         port->lp_tx_grp_added = B_FALSE;
1044         aggr_grp_update_default(grp);
1045 done:
1046         mac_perim_exit(pmph);
1047 }
1048 
1049 static int
1050 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1051 {
1052         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053         return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1054 }
1055 
1056 static int
1057 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1058 {
1059         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1060         return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1061 }
1062 
1063 /*
1064  * Start the pseudo ring. Since the pseudo ring is just an abstraction
1065  * over an actual HW ring, the real task is to start the underlying HW
1066  * ring.
1067  */
1068 static int
1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1070 {
1071         int err;
1072         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1073 
1074         err = mac_hwring_start(rr_ring->arr_hw_rh);
1075 
1076         if (err != 0)
1077                 return (err);
1078 
1079         rr_ring->arr_gen = mr_gen;
1080         return (err);
1081 }
1082 
1083 /*
1084  * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1085  * over an actual HW ring, the real task is to stop the underlying HW
1086  * ring.
1087  */
1088 static void
1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1090 {
1091         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1092 
1093         /*
1094          * The rings underlying the default group must stay up to
1095          * continue receiving LACP traffic. We would normally never
1096          * stop the default Rx rings because of the primary MAC
1097          * client; but aggr's primary MAC client doesn't call
1098          * mac_unicast_add() and thus mi_active is 0 when the last
1099          * non-primary client is deleted.
1100          */
1101         if (rr_ring->arr_grp->arg_index != 0)
1102                 mac_hwring_stop(rr_ring->arr_hw_rh);
1103 }
1104 
1105 /*
1106  * Add one or more ports to an existing link aggregation group.
1107  */
1108 int
1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1110     laioc_port_t *ports)
1111 {
1112         int rc;
1113         uint_t port_added = 0;
1114         uint_t grp_added;
1115         aggr_grp_t *grp = NULL;
1116         aggr_port_t *port;
1117         boolean_t link_state_changed = B_FALSE;
1118         mac_perim_handle_t mph, pmph;
1119 
1120         /* Get the aggr corresponding to linkid. */
1121         rw_enter(&aggr_grp_lock, RW_READER);
1122         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1123             (mod_hash_val_t *)&grp) != 0) {
1124                 rw_exit(&aggr_grp_lock);
1125                 return (ENOENT);
1126         }
1127         AGGR_GRP_REFHOLD(grp);
1128 
1129         /*
1130          * Hold the perimeter so that the aggregation can't be destroyed.
1131          */
1132         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1133         rw_exit(&aggr_grp_lock);
1134 
1135         /* Add the specified ports to the aggr. */
1136         for (uint_t i = 0; i < nports; i++) {
1137                 grp_added = 0;
1138 
1139                 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1140                     force, &port)) != 0) {
1141                         goto bail;
1142                 }
1143 
1144                 ASSERT(port != NULL);
1145                 port_added++;
1146 
1147                 /* check capabilities */
1148                 if (!aggr_grp_capab_check(grp, port) ||
1149                     !aggr_grp_sdu_check(grp, port) ||
1150                     !aggr_grp_margin_check(grp, port)) {
1151                         rc = ENOTSUP;
1152                         goto bail;
1153                 }
1154 
1155                 /*
1156                  * Create the pseudo ring for each HW ring of the underlying
1157                  * port.
1158                  */
1159                 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1160                 if (rc != 0)
1161                         goto bail;
1162 
1163                 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1164                         rc = aggr_add_pseudo_rx_group(port,
1165                             &grp->lg_rx_groups[j]);
1166 
1167                         if (rc != 0)
1168                                 goto bail;
1169 
1170                         grp_added++;
1171                 }
1172 
1173                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1174 
1175                 /* set LACP mode */
1176                 aggr_port_lacp_set_mode(grp, port);
1177 
1178                 /* start port if group has already been started */
1179                 if (grp->lg_started) {
1180                         rc = aggr_port_start(port);
1181                         if (rc != 0) {
1182                                 mac_perim_exit(pmph);
1183                                 goto bail;
1184                         }
1185 
1186                         /*
1187                          * Turn on the promiscuous mode over the port when it
1188                          * is requested to be turned on to receive the
1189                          * non-primary address over a port, or the promiscuous
1190                          * mode is enabled over the aggr.
1191                          */
1192                         if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1193                                 rc = aggr_port_promisc(port, B_TRUE);
1194                                 if (rc != 0) {
1195                                         mac_perim_exit(pmph);
1196                                         goto bail;
1197                                 }
1198                         }
1199                 }
1200                 mac_perim_exit(pmph);
1201 
1202                 /*
1203                  * Attach each port if necessary.
1204                  */
1205                 if (aggr_port_notify_link(grp, port))
1206                         link_state_changed = B_TRUE;
1207 
1208                 /*
1209                  * Initialize the callback functions for this port.
1210                  */
1211                 aggr_port_init_callbacks(port);
1212         }
1213 
1214         /* update the MAC address of the constituent ports */
1215         if (aggr_grp_update_ports_mac(grp))
1216                 link_state_changed = B_TRUE;
1217 
1218         if (link_state_changed)
1219                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1220 
1221 bail:
1222         if (rc != 0) {
1223                 /* stop and remove ports that have been added */
1224                 for (uint_t i = 0; i < port_added; i++) {
1225                         uint_t grp_remove;
1226 
1227                         port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1228                         ASSERT(port != NULL);
1229 
1230                         if (grp->lg_started) {
1231                                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1232                                 (void) aggr_port_promisc(port, B_FALSE);
1233                                 aggr_port_stop(port);
1234                                 mac_perim_exit(pmph);
1235                         }
1236 
1237                         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1238 
1239                         /*
1240                          * Only the last port could have a partial set
1241                          * of groups added.
1242                          */
1243                         grp_remove = (i + 1 == port_added) ? grp_added :
1244                             grp->lg_rx_group_count;
1245 
1246                         for (uint_t j = 0; j < grp_remove; j++) {
1247                                 aggr_rem_pseudo_rx_group(port,
1248                                     &grp->lg_rx_groups[j]);
1249                         }
1250 
1251                         (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1252                 }
1253         }
1254 
1255         mac_perim_exit(mph);
1256         AGGR_GRP_REFRELE(grp);
1257         return (rc);
1258 }
1259 
1260 static int
1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1262     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1263     aggr_lacp_timer_t lacp_timer)
1264 {
1265         boolean_t mac_addr_changed = B_FALSE;
1266         boolean_t link_state_changed = B_FALSE;
1267         mac_perim_handle_t pmph;
1268 
1269         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1270 
1271         /* validate fixed address if specified */
1272         if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1273             ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1274             (mac_addr[0] & 0x01))) {
1275                 return (EINVAL);
1276         }
1277 
1278         /* update policy if requested */
1279         if (update_mask & AGGR_MODIFY_POLICY)
1280                 aggr_send_update_policy(grp, policy);
1281 
1282         /* update unicast MAC address if requested */
1283         if (update_mask & AGGR_MODIFY_MAC) {
1284                 if (mac_fixed) {
1285                         /* user-supplied MAC address */
1286                         grp->lg_mac_addr_port = NULL;
1287                         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1288                                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1289                                 mac_addr_changed = B_TRUE;
1290                         }
1291                 } else if (grp->lg_addr_fixed) {
1292                         /* switch from user-supplied to automatic */
1293                         aggr_port_t *port = grp->lg_ports;
1294 
1295                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1296                         bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1297                         grp->lg_mac_addr_port = port;
1298                         mac_addr_changed = B_TRUE;
1299                         mac_perim_exit(pmph);
1300                 }
1301                 grp->lg_addr_fixed = mac_fixed;
1302         }
1303 
1304         if (mac_addr_changed)
1305                 link_state_changed = aggr_grp_update_ports_mac(grp);
1306 
1307         if (update_mask & AGGR_MODIFY_LACP_MODE)
1308                 aggr_lacp_update_mode(grp, lacp_mode);
1309 
1310         if (update_mask & AGGR_MODIFY_LACP_TIMER)
1311                 aggr_lacp_update_timer(grp, lacp_timer);
1312 
1313         if (link_state_changed)
1314                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1315 
1316         if (mac_addr_changed)
1317                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1318 
1319         return (0);
1320 }
1321 
1322 /*
1323  * Update properties of an existing link aggregation group.
1324  */
1325 int
1326 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1327     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1328     aggr_lacp_timer_t lacp_timer)
1329 {
1330         aggr_grp_t *grp = NULL;
1331         mac_perim_handle_t mph;
1332         int err;
1333 
1334         /* get group corresponding to linkid */
1335         rw_enter(&aggr_grp_lock, RW_READER);
1336         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1337             (mod_hash_val_t *)&grp) != 0) {
1338                 rw_exit(&aggr_grp_lock);
1339                 return (ENOENT);
1340         }
1341         AGGR_GRP_REFHOLD(grp);
1342 
1343         /*
1344          * Hold the perimeter so that the aggregation won't be destroyed.
1345          */
1346         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1347         rw_exit(&aggr_grp_lock);
1348 
1349         err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1350             mac_addr, lacp_mode, lacp_timer);
1351 
1352         mac_perim_exit(mph);
1353         AGGR_GRP_REFRELE(grp);
1354         return (err);
1355 }
1356 
/*
 * Create a new link aggregation group upon request from administrator.
 * Returns 0 on success, an errno on failure:
 *
 *   EINVAL  nports is 0, or a fixed MAC address was requested and the
 *           supplied address is all zeroes.
 *   EEXIST  a group with the same linkid is already in the hash table.
 *   ENOMEM  no key could be allocated, or mac_alloc() failed.
 *   other   the error returned by aggr_grp_add_port(), mac_register()
 *           or dls_devnet_create().
 *
 * aggr_grp_lock is held as writer from the duplicate check until the
 * function returns. On failure, control reaches "bail", which deletes
 * the ports added so far, stops the two helper threads created here,
 * frees the blocked-rings array and drops the initial group reference.
 */
int
aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
    laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
    uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
    cred_t *credp)
{
        aggr_grp_t *grp = NULL;
        aggr_port_t *port;
        mac_register_t *mac;
        boolean_t link_state_changed;
        mac_perim_handle_t mph;
        int err;
        int i;
        kt_did_t tid = 0;

        /* need at least one port */
        if (nports == 0)
                return (EINVAL);

        rw_enter(&aggr_grp_lock, RW_WRITER);

        /* does a group with the same linkid already exist? */
        err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
            (mod_hash_val_t *)&grp);
        if (err == 0) {
                rw_exit(&aggr_grp_lock);
                return (EEXIST);
        }

        grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);

        /* Initialize the group; it starts with a single reference. */
        grp->lg_refs = 1;
        grp->lg_closing = B_FALSE;
        grp->lg_force = force;
        grp->lg_linkid = linkid;
        grp->lg_zoneid = crgetzoneid(credp);
        grp->lg_ifspeed = 0;
        grp->lg_link_state = LINK_STATE_UNKNOWN;
        grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
        grp->lg_started = B_FALSE;
        grp->lg_promisc = B_FALSE;
        grp->lg_lacp_done = B_FALSE;
        grp->lg_tx_notify_done = B_FALSE;
        grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
        /*
         * Create the LACP receive and Tx notification threads; both
         * are told to exit again in the bail path below.
         */
        grp->lg_lacp_rx_thread = thread_create(NULL, 0,
            aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
        grp->lg_tx_notify_thread = thread_create(NULL, 0,
            aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
        grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
            MAX_RINGS_PER_GROUP), KM_SLEEP);
        grp->lg_tx_blocked_cnt = 0;
        bzero(&grp->lg_rx_groups,
            sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
        bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
        aggr_lacp_init_grp(grp);

        /* add MAC ports to group */
        grp->lg_ports = NULL;
        grp->lg_nports = 0;
        grp->lg_nattached_ports = 0;
        grp->lg_ntx_ports = 0;

        /*
         * If key is not specified by the user, allocate the key.
         */
        if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
                err = ENOMEM;
                goto bail;
        }
        grp->lg_key = key;

        for (i = 0; i < nports; i++) {
                err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
                if (err != 0)
                        goto bail;
        }

        /* There is always at least one Rx group. */
        grp->lg_rx_group_count = 1;

        for (i = 0, port = grp->lg_ports; port != NULL;
            i++, port = port->lp_next) {
                uint_t num_rgroups;

                mac_perim_enter_by_mh(port->lp_mh, &mph);
                num_rgroups = mac_get_num_rx_groups(port->lp_mh);
                mac_perim_exit(mph);

                /*
                 * Utilize all the groups in a port. If some ports
                 * have less groups than others, then traffic destined
                 * for the same unicast address may be HW classified
                 * on some ports but SW classified by aggr when
                 * arriving on other ports.
                 */
                grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
                    num_rgroups);
        }

        /*
         * There could be cases where the hardware provides more
         * groups than aggr can support. Make sure we never go above
         * the max aggr can support.
         */
        grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
            MAX_GROUPS_PER_PORT);

        ASSERT3U(grp->lg_rx_group_count, >, 0);
        /*
         * Initialize every Rx group slot, not only the first
         * lg_rx_group_count of them.
         */
        for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
                grp->lg_rx_groups[i].arg_index = i;
                grp->lg_rx_groups[i].arg_untagged = 0;
                list_create(&(grp->lg_rx_groups[i].arg_vlans),
                    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
        }

        /*
         * If no explicit MAC address was specified by the administrator,
         * set it to the MAC address of the first port.
         */
        grp->lg_addr_fixed = mac_fixed;
        if (grp->lg_addr_fixed) {
                /* validate specified address */
                if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
                        err = EINVAL;
                        goto bail;
                }
                bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
        } else {
                bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
                grp->lg_mac_addr_port = grp->lg_ports;
        }

        /* Set the initial group capabilities. */
        aggr_grp_capab_set(grp);

        /* Register the aggregation itself as a MAC. */
        if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
                err = ENOMEM;
                goto bail;
        }
        mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
        mac->m_driver = grp;
        mac->m_dip = aggr_dip;
        mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
        mac->m_src_addr = grp->lg_addr;
        mac->m_callbacks = &aggr_m_callbacks;
        mac->m_min_sdu = 0;
        mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
        mac->m_margin = aggr_grp_max_margin(grp);
        mac->m_v12n = MAC_VIRT_LEVEL1;
        err = mac_register(mac, &grp->lg_mh);
        mac_free(mac);
        if (err != 0)
                goto bail;

        err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
        if (err != 0) {
                /* Undo the MAC registration before bailing out. */
                (void) mac_unregister(grp->lg_mh);
                grp->lg_mh = NULL;
                goto bail;
        }

        mac_perim_enter_by_mh(grp->lg_mh, &mph);

        /*
         * Update the MAC address of the constituent ports. All ports
         * take on the primary MAC address of the aggr (lg_addr). At
         * this point, none of the ports are attached; thus the link
         * state of the aggregation will not change.
         */
        link_state_changed = aggr_grp_update_ports_mac(grp);
        ASSERT(!link_state_changed);

        /* Update outbound load balancing policy. */
        aggr_send_update_policy(grp, policy);

        /* Set LACP mode. */
        aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);

        /*
         * Attach each port if necessary.
         */
        for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
                /*
                 * Create the pseudo ring for each HW ring of the
                 * underlying port. Note that this is done after the
                 * aggr registers its MAC. Failure here is treated as
                 * fatal (VERIFY) rather than unwound.
                 */
                VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
                    ==, 0);

                for (i = 0; i < grp->lg_rx_group_count; i++) {
                        VERIFY3S(aggr_add_pseudo_rx_group(port,
                            &grp->lg_rx_groups[i]), ==, 0);
                }

                if (aggr_port_notify_link(grp, port))
                        link_state_changed = B_TRUE;

                /*
                 * Initialize the callback functions for this port.
                 */
                aggr_port_init_callbacks(port);
        }

        if (link_state_changed)
                mac_link_update(grp->lg_mh, grp->lg_link_state);

        /* add new group to hash table */
        err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
            (mod_hash_val_t)grp);
        ASSERT(err == 0);
        aggr_grp_cnt++;

        mac_perim_exit(mph);
        rw_exit(&aggr_grp_lock);
        return (0);

bail:

        /* Failure path: err holds the errno to return. */
        grp->lg_closing = B_TRUE;

        /* Delete every port that was added to the group. */
        port = grp->lg_ports;
        while (port != NULL) {
                aggr_port_t *cport;

                /* Fetch the successor before the port is deleted. */
                cport = port->lp_next;
                aggr_port_delete(port);
                port = cport;
        }

        /*
         * Inform the lacp_rx thread to exit, and wait until
         * lg_lacp_rx_thread has been cleared.
         */
        mutex_enter(&grp->lg_lacp_lock);
        grp->lg_lacp_done = B_TRUE;
        cv_signal(&grp->lg_lacp_cv);
        while (grp->lg_lacp_rx_thread != NULL)
                cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
        mutex_exit(&grp->lg_lacp_lock);
        /*
         * Inform the tx_notify thread to exit. Its thread id is
         * captured under the lock so it can be joined after the lock
         * has been dropped.
         */
        mutex_enter(&grp->lg_tx_flowctl_lock);
        if (grp->lg_tx_notify_thread != NULL) {
                tid = grp->lg_tx_notify_thread->t_did;
                grp->lg_tx_notify_done = B_TRUE;
                cv_signal(&grp->lg_tx_flowctl_cv);
        }
        mutex_exit(&grp->lg_tx_flowctl_lock);
        if (tid != 0)
                thread_join(tid);

        kmem_free(grp->lg_tx_blocked_rings,
            (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
        rw_exit(&aggr_grp_lock);
        /* Drop the initial reference taken when the group was set up. */
        AGGR_GRP_REFRELE(grp);
        return (err);
}
1621 
1622 /*
1623  * Return a pointer to the member of a group with specified linkid.
1624  */
1625 static aggr_port_t *
1626 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1627 {
1628         aggr_port_t *port;
1629 
1630         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1631 
1632         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1633                 if (port->lp_linkid == linkid)
1634                         break;
1635         }
1636 
1637         return (port);
1638 }
1639 
/*
 * Stop, detach and remove a port from a link aggregation group.
 *
 * The port is unlinked from the group's port list, detached, its counter
 * statistics are folded into the group's residual statistics, and it is
 * then deleted.
 *
 * mac_addr_changedp and link_state_changedp are optional (may be NULL);
 * when non-NULL they receive whether the group MAC address was reassigned
 * and whether the group link state changed, respectively.
 *
 * Returns 0 on success, or ENOENT if the port is not on the group's list
 * (in which case nothing has been modified).
 */
static int
aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
    boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
{
        int rc = 0;
        aggr_port_t **pport;
        boolean_t mac_addr_changed = B_FALSE;
        boolean_t link_state_changed = B_FALSE;
        mac_perim_handle_t mph;
        uint64_t val;
        uint_t i;
        uint_t stat;

        ASSERT(MAC_PERIM_HELD(grp->lg_mh));
        ASSERT(grp->lg_nports > 1);
        ASSERT(!grp->lg_closing);

        /*
         * unlink port: walk the list through a pointer-to-link so the
         * predecessor's lp_next (or lg_ports itself) can be rewritten
         * in place once the port is found.
         */
        for (pport = &grp->lg_ports; *pport != port;
            pport = &(*pport)->lp_next) {
                if (*pport == NULL) {
                        rc = ENOENT;
                        goto done;
                }
        }
        *pport = port->lp_next;

        mac_perim_enter_by_mh(port->lp_mh, &mph);

        /*
         * If the MAC address of the port being removed was assigned
         * to the group, update the group MAC address
         * using the MAC address of a different port.
         */
        if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
                /*
                 * Set the MAC address of the group to the
                 * MAC address of its first port.  The port being
                 * removed has already been unlinked above, so
                 * lg_ports refers to one of the remaining ports.
                 */
                bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
                grp->lg_mac_addr_port = grp->lg_ports;
                mac_addr_changed = B_TRUE;
        }

        link_state_changed = aggr_grp_detach_port(grp, port);

        /*
         * Add the counter statistics of the ports while it was aggregated
         * to the group's residual statistics.  This is done by obtaining
         * the current counter from the underlying MAC then subtracting the
         * value of the counter at the moment it was added to the
         * aggregation.
         */
        for (i = 0; i < MAC_NSTAT; i++) {
                stat = i + MAC_STAT_MIN;
                if (!MAC_STAT_ISACOUNTER(stat))
                        continue;
                val = aggr_port_stat(port, stat);
                val -= port->lp_stat[i];
                /* the residual arrays are protected by lg_stat_lock */
                mutex_enter(&grp->lg_stat_lock);
                grp->lg_stat[i] += val;
                mutex_exit(&grp->lg_stat_lock);
        }
        for (i = 0; i < ETHER_NSTAT; i++) {
                stat = i + MACTYPE_STAT_MIN;
                if (!ETHER_STAT_ISACOUNTER(stat))
                        continue;
                val = aggr_port_stat(port, stat);
                val -= port->lp_ether_stat[i];
                mutex_enter(&grp->lg_stat_lock);
                grp->lg_ether_stat[i] += val;
                mutex_exit(&grp->lg_stat_lock);
        }

        grp->lg_nports--;
        mac_perim_exit(mph);

        /*
         * The pseudo Tx group is only torn down here, after the port has
         * been detached, and the port is deleted last.
         */
        aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
        aggr_port_delete(port);

        /*
         * If the group MAC address has changed, update the MAC address of
         * the remaining constituent ports according to the new MAC
         * address of the group.
         */
        if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
                link_state_changed = B_TRUE;

done:
        /* report results through the optional out-parameters */
        if (mac_addr_changedp != NULL)
                *mac_addr_changedp = mac_addr_changed;
        if (link_state_changedp != NULL)
                *link_state_changedp = link_state_changed;

        return (rc);
}
1739 
1740 /*
1741  * Remove one or more ports from an existing link aggregation group.
1742  */
1743 int
1744 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1745 {
1746         int rc = 0, i;
1747         aggr_grp_t *grp = NULL;
1748         aggr_port_t *port;
1749         boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1750         boolean_t link_state_update = B_FALSE, link_state_changed;
1751         mac_perim_handle_t mph, pmph;
1752 
1753         /* get group corresponding to linkid */
1754         rw_enter(&aggr_grp_lock, RW_READER);
1755         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1756             (mod_hash_val_t *)&grp) != 0) {
1757                 rw_exit(&aggr_grp_lock);
1758                 return (ENOENT);
1759         }
1760         AGGR_GRP_REFHOLD(grp);
1761 
1762         /*
1763          * Hold the perimeter so that the aggregation won't be destroyed.
1764          */
1765         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1766         rw_exit(&aggr_grp_lock);
1767 
1768         /* we need to keep at least one port per group */
1769         if (nports >= grp->lg_nports) {
1770                 rc = EINVAL;
1771                 goto bail;
1772         }
1773 
1774         /* first verify that all the groups are valid */
1775         for (i = 0; i < nports; i++) {
1776                 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1777                         /* port not found */
1778                         rc = ENOENT;
1779                         goto bail;
1780                 }
1781         }
1782 
1783         /* clear the promiscous mode for the specified ports */
1784         for (i = 0; i < nports && rc == 0; i++) {
1785                 /* lookup port */
1786                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1787                 ASSERT(port != NULL);
1788 
1789                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1790                 rc = aggr_port_promisc(port, B_FALSE);
1791                 mac_perim_exit(pmph);
1792         }
1793         if (rc != 0) {
1794                 for (i = 0; i < nports; i++) {
1795                         port = aggr_grp_port_lookup(grp,
1796                             ports[i].lp_linkid);
1797                         ASSERT(port != NULL);
1798 
1799                         /*
1800                          * Turn the promiscuous mode back on if it is required
1801                          * to receive the non-primary address over a port, or
1802                          * the promiscous mode is enabled over the aggr.
1803                          */
1804                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1805                         if (port->lp_started && (grp->lg_promisc ||
1806                             port->lp_prom_addr != NULL)) {
1807                                 (void) aggr_port_promisc(port, B_TRUE);
1808                         }
1809                         mac_perim_exit(pmph);
1810                 }
1811                 goto bail;
1812         }
1813 
1814         /* remove the specified ports from group */
1815         for (i = 0; i < nports; i++) {
1816                 /* lookup port */
1817                 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1818                 ASSERT(port != NULL);
1819 
1820                 /* stop port if group has already been started */
1821                 if (grp->lg_started) {
1822                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1823                         aggr_port_stop(port);
1824                         mac_perim_exit(pmph);
1825                 }
1826 
1827                 /*
1828                  * aggr_rem_pseudo_tx_group() is not called here. Instead
1829                  * it is called from inside aggr_grp_rem_port() after the
1830                  * port has been detached. The reason is that
1831                  * aggr_rem_pseudo_tx_group() removes one ring at a time
1832                  * and if there is still traffic going on, then there
1833                  * is the possibility of aggr_find_tx_ring() returning a
1834                  * removed ring for transmission. Once the port has been
1835                  * detached, that port will not be used and
1836                  * aggr_find_tx_ring() will not return any rings
1837                  * belonging to it.
1838                  */
1839                 for (i = 0; i < grp->lg_rx_group_count; i++)
1840                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1841 
1842                 /* remove port from group */
1843                 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1844                     &link_state_changed);
1845                 ASSERT(rc == 0);
1846                 mac_addr_update = mac_addr_update || mac_addr_changed;
1847                 link_state_update = link_state_update || link_state_changed;
1848         }
1849 
1850 bail:
1851         if (mac_addr_update)
1852                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1853         if (link_state_update)
1854                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1855 
1856         mac_perim_exit(mph);
1857         AGGR_GRP_REFRELE(grp);
1858 
1859         return (rc);
1860 }
1861 
/*
 * Delete the link aggregation group identified by linkid.
 *
 * Returns ENOENT if no group with that linkid is in aggr_grp_hash, the
 * error from dls_devnet_destroy() or mac_disable() if either of those
 * fails (in which case the group is left in the hash table), or 0 once
 * the group has been torn down and the reference dropped.
 */
int
aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
{
        aggr_grp_t *grp = NULL;
        aggr_port_t *port, *cport;
        datalink_id_t tmpid;
        mod_hash_val_t val;
        mac_perim_handle_t mph, pmph;
        int err;
        kt_did_t tid = 0;

        rw_enter(&aggr_grp_lock, RW_WRITER);

        if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
            (mod_hash_val_t *)&grp) != 0) {
                rw_exit(&aggr_grp_lock);
                return (ENOENT);
        }

        /*
         * Note that dls_devnet_destroy() must be called before lg_lock is
         * held. Otherwise, it will deadlock if another thread is in
         * aggr_m_stat() and thus has a kstat_hold() on the kstats that
         * dls_devnet_destroy() needs to delete.
         */
        if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
                rw_exit(&aggr_grp_lock);
                return (err);
        }
        ASSERT(linkid == tmpid);

        /*
         * Unregister from the MAC service module. Since this can
         * fail if a client hasn't closed the MAC port, we gracefully
         * fail the operation.
         */
        if ((err = mac_disable(grp->lg_mh)) != 0) {
                /* undo the devnet destroy performed above */
                (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
                rw_exit(&aggr_grp_lock);
                return (err);
        }
        (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
        ASSERT(grp == (aggr_grp_t *)val);

        ASSERT(aggr_grp_cnt > 0);
        aggr_grp_cnt--;
        rw_exit(&aggr_grp_lock);

        /*
         * Inform the lacp_rx thread to exit, then wait until
         * lg_lacp_rx_thread has been cleared.
         */
        mutex_enter(&grp->lg_lacp_lock);
        grp->lg_lacp_done = B_TRUE;
        cv_signal(&grp->lg_lacp_cv);
        while (grp->lg_lacp_rx_thread != NULL)
                cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
        mutex_exit(&grp->lg_lacp_lock);
        /*
         * Inform the tx_notify_thread to exit.  Its thread id is captured
         * under the lock so that it can be joined after the lock has been
         * dropped.
         */
        mutex_enter(&grp->lg_tx_flowctl_lock);
        if (grp->lg_tx_notify_thread != NULL) {
                tid = grp->lg_tx_notify_thread->t_did;
                grp->lg_tx_notify_done = B_TRUE;
                cv_signal(&grp->lg_tx_flowctl_cv);
        }
        mutex_exit(&grp->lg_tx_flowctl_lock);
        if (tid != 0)
                thread_join(tid);

        mac_perim_enter_by_mh(grp->lg_mh, &mph);

        grp->lg_closing = B_TRUE;
        /* detach and free MAC ports associated with group */
        port = grp->lg_ports;
        while (port != NULL) {
                /* remember the successor before the port is deleted */
                cport = port->lp_next;
                mac_perim_enter_by_mh(port->lp_mh, &pmph);
                if (grp->lg_started)
                        aggr_port_stop(port);
                (void) aggr_grp_detach_port(grp, port);
                mac_perim_exit(pmph);
                aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
                for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
                        aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
                aggr_port_delete(port);
                port = cport;
        }

        mac_perim_exit(mph);

        kmem_free(grp->lg_tx_blocked_rings,
            (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
        /*
         * Wait for the port's lacp timer thread and its notification callback
         * to exit before calling mac_unregister() since both needs to access
         * the mac perimeter of the grp.
         */
        aggr_grp_port_wait(grp);

        VERIFY(mac_unregister(grp->lg_mh) == 0);
        grp->lg_mh = NULL;

        /* tear down the VLAN list of every pseudo Rx group slot */
        for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
                list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
        }

        AGGR_GRP_REFRELE(grp);
        return (0);
}
1972 
1973 void
1974 aggr_grp_free(aggr_grp_t *grp)
1975 {
1976         ASSERT(grp->lg_refs == 0);
1977         ASSERT(grp->lg_port_ref == 0);
1978         if (grp->lg_key > AGGR_MAX_KEY) {
1979                 id_free(key_ids, grp->lg_key);
1980                 grp->lg_key = 0;
1981         }
1982         kmem_cache_free(aggr_grp_cache, grp);
1983 }
1984 
1985 int
1986 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1987     aggr_grp_info_new_grp_fn_t new_grp_fn,
1988     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1989 {
1990         aggr_grp_t      *grp;
1991         aggr_port_t     *port;
1992         mac_perim_handle_t mph, pmph;
1993         int             rc = 0;
1994 
1995         /*
1996          * Make sure that the aggregation link is visible from the caller's
1997          * zone.
1998          */
1999         if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2000                 return (ENOENT);
2001 
2002         rw_enter(&aggr_grp_lock, RW_READER);
2003 
2004         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2005             (mod_hash_val_t *)&grp) != 0) {
2006                 rw_exit(&aggr_grp_lock);
2007                 return (ENOENT);
2008         }
2009         AGGR_GRP_REFHOLD(grp);
2010 
2011         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2012         rw_exit(&aggr_grp_lock);
2013 
2014         rc = new_grp_fn(fn_arg, grp->lg_linkid,
2015             (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2016             grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2017             grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2018 
2019         if (rc != 0)
2020                 goto bail;
2021 
2022         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2023                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2024                 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2025                     port->lp_state, &port->lp_lacp.ActorOperPortState);
2026                 mac_perim_exit(pmph);
2027 
2028                 if (rc != 0)
2029                         goto bail;
2030         }
2031 
2032 bail:
2033         mac_perim_exit(mph);
2034         AGGR_GRP_REFRELE(grp);
2035         return (rc);
2036 }
2037 
/*
 * ioctl entry point of the aggregation.  No ioctls are handled here:
 * every message is negatively acknowledged with ENOTSUP.
 */
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
        miocnak(q, mp, 0, ENOTSUP);
}
2044 
2045 static int
2046 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2047 {
2048         aggr_port_t     *port;
2049         uint_t          stat_index;
2050 
2051         ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2052 
2053         /* We only aggregate counter statistics. */
2054         if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
2055             IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
2056                 return (ENOTSUP);
2057         }
2058 
2059         /*
2060          * Counter statistics for a group are computed by aggregating the
2061          * counters of the members MACs while they were aggregated, plus
2062          * the residual counter of the group itself, which is updated each
2063          * time a MAC is removed from the group.
2064          */
2065         *val = 0;
2066         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2067                 /* actual port statistic */
2068                 *val += aggr_port_stat(port, stat);
2069                 /*
2070                  * minus the port stat when it was added, plus any residual
2071                  * amount for the group.
2072                  */
2073                 if (IS_MAC_STAT(stat)) {
2074                         stat_index = stat - MAC_STAT_MIN;
2075                         *val -= port->lp_stat[stat_index];
2076                         *val += grp->lg_stat[stat_index];
2077                 } else if (IS_MACTYPE_STAT(stat)) {
2078                         stat_index = stat - MACTYPE_STAT_MIN;
2079                         *val -= port->lp_ether_stat[stat_index];
2080                         *val += grp->lg_ether_stat[stat_index];
2081                 }
2082         }
2083         return (0);
2084 }
2085 
2086 int
2087 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2088 {
2089         aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2090 
2091         if (rx_ring->arr_hw_rh != NULL) {
2092                 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2093         } else {
2094                 aggr_port_t     *port = rx_ring->arr_port;
2095 
2096                 *val = mac_stat_get(port->lp_mh, stat);
2097 
2098         }
2099         return (0);
2100 }
2101 
2102 int
2103 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2104 {
2105         aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2106 
2107         if (tx_ring->atr_hw_rh != NULL) {
2108                 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2109         } else {
2110                 aggr_port_t     *port = tx_ring->atr_port;
2111 
2112                 *val = mac_stat_get(port->lp_mh, stat);
2113         }
2114         return (0);
2115 }
2116 
2117 static int
2118 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2119 {
2120         aggr_grp_t              *grp = arg;
2121         int                     rval = 0;
2122 
2123         mutex_enter(&grp->lg_stat_lock);
2124 
2125         switch (stat) {
2126         case MAC_STAT_IFSPEED:
2127                 *val = grp->lg_ifspeed;
2128                 break;
2129 
2130         case ETHER_STAT_LINK_DUPLEX:
2131                 *val = grp->lg_link_duplex;
2132                 break;
2133 
2134         default:
2135                 /*
2136                  * For all other statistics, we return the aggregated stat
2137                  * from the underlying ports.  aggr_grp_stat() will set
2138                  * rval appropriately if the statistic isn't a counter.
2139                  */
2140                 rval = aggr_grp_stat(grp, stat, val);
2141         }
2142 
2143         mutex_exit(&grp->lg_stat_lock);
2144         return (rval);
2145 }
2146 
2147 static int
2148 aggr_m_start(void *arg)
2149 {
2150         aggr_grp_t *grp = arg;
2151         aggr_port_t *port;
2152         mac_perim_handle_t mph, pmph;
2153 
2154         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2155 
2156         /*
2157          * Attempts to start all configured members of the group.
2158          * Group members will be attached when their link-up notification
2159          * is received.
2160          */
2161         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2162                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2163                 if (aggr_port_start(port) != 0) {
2164                         mac_perim_exit(pmph);
2165                         continue;
2166                 }
2167 
2168                 /*
2169                  * Turn on the promiscuous mode if it is required to receive
2170                  * the non-primary address over a port, or the promiscous
2171                  * mode is enabled over the aggr.
2172                  */
2173                 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2174                         if (aggr_port_promisc(port, B_TRUE) != 0)
2175                                 aggr_port_stop(port);
2176                 }
2177                 mac_perim_exit(pmph);
2178         }
2179 
2180         grp->lg_started = B_TRUE;
2181 
2182         mac_perim_exit(mph);
2183         return (0);
2184 }
2185 
2186 static void
2187 aggr_m_stop(void *arg)
2188 {
2189         aggr_grp_t *grp = arg;
2190         aggr_port_t *port;
2191         mac_perim_handle_t mph, pmph;
2192 
2193         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2194 
2195         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2196                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2197 
2198                 /* reset port promiscuous mode */
2199                 (void) aggr_port_promisc(port, B_FALSE);
2200 
2201                 aggr_port_stop(port);
2202                 mac_perim_exit(pmph);
2203         }
2204 
2205         grp->lg_started = B_FALSE;
2206         mac_perim_exit(mph);
2207 }
2208 
2209 static int
2210 aggr_m_promisc(void *arg, boolean_t on)
2211 {
2212         aggr_grp_t *grp = arg;
2213         aggr_port_t *port;
2214         boolean_t link_state_changed = B_FALSE;
2215         mac_perim_handle_t mph, pmph;
2216 
2217         AGGR_GRP_REFHOLD(grp);
2218         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2219 
2220         ASSERT(!grp->lg_closing);
2221 
2222         if (on == grp->lg_promisc)
2223                 goto bail;
2224 
2225         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2226                 int     err = 0;
2227 
2228                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2229                 AGGR_PORT_REFHOLD(port);
2230                 if (!on && (port->lp_prom_addr == NULL))
2231                         err = aggr_port_promisc(port, B_FALSE);
2232                 else if (on && port->lp_started)
2233                         err = aggr_port_promisc(port, B_TRUE);
2234 
2235                 if (err != 0) {
2236                         if (aggr_grp_detach_port(grp, port))
2237                                 link_state_changed = B_TRUE;
2238                 } else {
2239                         /*
2240                          * If a port was detached because of a previous
2241                          * failure changing the promiscuity, the port
2242                          * is reattached when it successfully changes
2243                          * the promiscuity now, and this might cause
2244                          * the link state of the aggregation to change.
2245                          */
2246                         if (aggr_grp_attach_port(grp, port))
2247                                 link_state_changed = B_TRUE;
2248                 }
2249                 mac_perim_exit(pmph);
2250                 AGGR_PORT_REFRELE(port);
2251         }
2252 
2253         grp->lg_promisc = on;
2254 
2255         if (link_state_changed)
2256                 mac_link_update(grp->lg_mh, grp->lg_link_state);
2257 
2258 bail:
2259         mac_perim_exit(mph);
2260         AGGR_GRP_REFRELE(grp);
2261 
2262         return (0);
2263 }
2264 
2265 static void
2266 aggr_grp_port_rename(const char *new_name, void *arg)
2267 {
2268         /*
2269          * aggr port's mac client name is the format of "aggr link name" plus
2270          * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2271          */
2272         int aggr_len, link_len, clnt_name_len, i;
2273         char *str_end, *str_st, *str_del;
2274         char aggr_name[MAXNAMELEN];
2275         char link_name[MAXNAMELEN];
2276         char *clnt_name;
2277         aggr_grp_t *aggr_grp = arg;
2278         aggr_port_t *aggr_port = aggr_grp->lg_ports;
2279 
2280         for (i = 0; i < aggr_grp->lg_nports; i++) {
2281                 clnt_name = mac_client_name(aggr_port->lp_mch);
2282                 clnt_name_len = strlen(clnt_name);
2283                 str_st = clnt_name;
2284                 str_end = &(clnt_name[clnt_name_len]);
2285                 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2286                 ASSERT(str_del != NULL);
2287                 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2288                 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2289                 bzero(aggr_name, MAXNAMELEN);
2290                 bzero(link_name, MAXNAMELEN);
2291                 bcopy(clnt_name, aggr_name, aggr_len);
2292                 bcopy(str_del, link_name, link_len + 1);
2293                 bzero(clnt_name, MAXNAMELEN);
2294                 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2295                     link_name);
2296 
2297                 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2298                 aggr_port = aggr_port->lp_next;
2299         }
2300 }
2301 
2302 /*
2303  * Initialize the capabilities that are advertised for the group
2304  * according to the capabilities of the constituent ports.
2305  */
2306 static boolean_t
2307 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2308 {
2309         aggr_grp_t *grp = arg;
2310 
2311         switch (cap) {
2312         case MAC_CAPAB_HCKSUM: {
2313                 uint32_t *hcksum_txflags = cap_data;
2314                 *hcksum_txflags = grp->lg_hcksum_txflags;
2315                 break;
2316         }
2317         case MAC_CAPAB_LSO: {
2318                 mac_capab_lso_t *cap_lso = cap_data;
2319 
2320                 if (grp->lg_lso) {
2321                         *cap_lso = grp->lg_cap_lso;
2322                         break;
2323                 } else {
2324                         return (B_FALSE);
2325                 }
2326         }
2327         case MAC_CAPAB_NO_NATIVEVLAN:
2328                 return (!grp->lg_vlan);
2329         case MAC_CAPAB_NO_ZCOPY:
2330                 return (!grp->lg_zcopy);
2331         case MAC_CAPAB_RINGS: {
2332                 mac_capab_rings_t *cap_rings = cap_data;
2333                 uint_t ring_cnt = 0;
2334 
2335                 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2336                         ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2337 
2338                 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2339                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2340                         cap_rings->mr_rnum = ring_cnt;
2341                         cap_rings->mr_gnum = grp->lg_rx_group_count;
2342                         cap_rings->mr_gaddring = NULL;
2343                         cap_rings->mr_gremring = NULL;
2344                 } else {
2345                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2346                         cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2347                         cap_rings->mr_gnum = 0;
2348                 }
2349                 cap_rings->mr_rget = aggr_fill_ring;
2350                 cap_rings->mr_gget = aggr_fill_group;
2351                 break;
2352         }
2353         case MAC_CAPAB_AGGR:
2354         {
2355                 mac_capab_aggr_t *aggr_cap;
2356 
2357                 if (cap_data != NULL) {
2358                         aggr_cap = cap_data;
2359                         aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2360                         aggr_cap->mca_unicst = aggr_m_unicst;
2361                         aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2362                         aggr_cap->mca_arg = arg;
2363                 }
2364                 return (B_TRUE);
2365         }
2366         default:
2367                 return (B_FALSE);
2368         }
2369         return (B_TRUE);
2370 }
2371 
2372 /*
2373  * Callback function for MAC layer to register groups.
2374  */
2375 static void
2376 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2377     mac_group_info_t *infop, mac_group_handle_t gh)
2378 {
2379         aggr_grp_t *grp = arg;
2380 
2381         if (rtype == MAC_RING_TYPE_RX) {
2382                 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2383 
2384                 rx_group->arg_gh = gh;
2385                 rx_group->arg_grp = grp;
2386 
2387                 infop->mgi_driver = (mac_group_driver_t)rx_group;
2388                 infop->mgi_start = NULL;
2389                 infop->mgi_stop = NULL;
2390                 infop->mgi_addmac = aggr_addmac;
2391                 infop->mgi_remmac = aggr_remmac;
2392                 infop->mgi_count = rx_group->arg_ring_cnt;
2393 
2394                 /*
2395                  * Always set the HW VLAN callbacks. They are smart
2396                  * enough to know when a port has HW VLAN filters to
2397                  * program and when it doesn't.
2398                  */
2399                 infop->mgi_addvlan = aggr_addvlan;
2400                 infop->mgi_remvlan = aggr_remvlan;
2401         } else {
2402                 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2403 
2404                 ASSERT3S(index, ==, 0);
2405                 tx_group->atg_gh = gh;
2406         }
2407 }
2408 
2409 /*
2410  * Callback funtion for MAC layer to register all rings.
2411  */
2412 static void
2413 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2414     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2415 {
2416         aggr_grp_t      *grp = arg;
2417 
2418         switch (rtype) {
2419         case MAC_RING_TYPE_RX: {
2420                 aggr_pseudo_rx_group_t  *rx_group;
2421                 aggr_pseudo_rx_ring_t   *rx_ring;
2422                 mac_intr_t              aggr_mac_intr;
2423 
2424                 rx_group = &grp->lg_rx_groups[rg_index];
2425                 ASSERT3S(index, >=, 0);
2426                 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2427                 rx_ring = rx_group->arg_rings + index;
2428                 rx_ring->arr_rh = rh;
2429 
2430                 /*
2431                  * Entrypoint to enable interrupt (disable poll) and
2432                  * disable interrupt (enable poll).
2433                  */
2434                 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2435                 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2436                 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2437                 aggr_mac_intr.mi_ddi_handle = NULL;
2438 
2439                 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2440                 infop->mri_start = aggr_pseudo_start_rx_ring;
2441                 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2442 
2443                 infop->mri_intr = aggr_mac_intr;
2444                 infop->mri_poll = aggr_rx_poll;
2445 
2446                 infop->mri_stat = aggr_rx_ring_stat;
2447                 break;
2448         }
2449         case MAC_RING_TYPE_TX: {
2450                 aggr_pseudo_tx_group_t  *tx_group = &grp->lg_tx_group;
2451                 aggr_pseudo_tx_ring_t   *tx_ring;
2452 
2453                 ASSERT(rg_index == -1);
2454                 ASSERT(index < tx_group->atg_ring_cnt);
2455 
2456                 tx_ring = &tx_group->atg_rings[index];
2457                 tx_ring->atr_rh = rh;
2458 
2459                 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2460                 infop->mri_start = NULL;
2461                 infop->mri_stop = NULL;
2462                 infop->mri_tx = aggr_ring_tx;
2463                 infop->mri_stat = aggr_tx_ring_stat;
2464                 /*
2465                  * Use the hw TX ring handle to find if the ring needs
2466                  * serialization or not. For NICs that do not expose
2467                  * Tx rings, atr_hw_rh will be NULL.
2468                  */
2469                 if (tx_ring->atr_hw_rh != NULL) {
2470                         infop->mri_flags =
2471                             mac_hwring_getinfo(tx_ring->atr_hw_rh);
2472                 }
2473                 break;
2474         }
2475         default:
2476                 break;
2477         }
2478 }
2479 
2480 static mblk_t *
2481 aggr_rx_poll(void *arg, int bytes_to_pickup)
2482 {
2483         aggr_pseudo_rx_ring_t *rr_ring = arg;
2484         aggr_port_t *port = rr_ring->arr_port;
2485         aggr_grp_t *grp = port->lp_grp;
2486         mblk_t *mp_chain, *mp, **mpp;
2487 
2488         mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2489 
2490         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2491                 return (mp_chain);
2492 
2493         mpp = &mp_chain;
2494         while ((mp = *mpp) != NULL) {
2495                 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2496                         struct ether_header *ehp;
2497 
2498                         ehp = (struct ether_header *)mp->b_rptr;
2499                         if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2500                                 *mpp = mp->b_next;
2501                                 mp->b_next = NULL;
2502                                 aggr_recv_lacp(port,
2503                                     (mac_resource_handle_t)rr_ring, mp);
2504                                 continue;
2505                         }
2506                 }
2507 
2508                 if (!port->lp_collector_enabled) {
2509                         *mpp = mp->b_next;
2510                         mp->b_next = NULL;
2511                         freemsg(mp);
2512                         continue;
2513                 }
2514                 mpp = &mp->b_next;
2515         }
2516         return (mp_chain);
2517 }
2518 
2519 static int
2520 aggr_addmac(void *arg, const uint8_t *mac_addr)
2521 {
2522         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2523         aggr_unicst_addr_t      *addr, **pprev;
2524         aggr_grp_t              *grp = rx_group->arg_grp;
2525         aggr_port_t             *port, *p;
2526         mac_perim_handle_t      mph;
2527         int                     err = 0;
2528         uint_t                  idx = rx_group->arg_index;
2529 
2530         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2531 
2532         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2533                 mac_perim_exit(mph);
2534                 return (0);
2535         }
2536 
2537         /*
2538          * Insert this mac address into the list of mac addresses owned by
2539          * the aggregation pseudo group.
2540          */
2541         pprev = &rx_group->arg_macaddr;
2542         while ((addr = *pprev) != NULL) {
2543                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2544                         mac_perim_exit(mph);
2545                         return (EEXIST);
2546                 }
2547                 pprev = &addr->aua_next;
2548         }
2549         addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2550         bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2551         addr->aua_next = NULL;
2552         *pprev = addr;
2553 
2554         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2555                 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2556                         break;
2557 
2558         if (err != 0) {
2559                 for (p = grp->lg_ports; p != port; p = p->lp_next)
2560                         aggr_port_remmac(p, idx, mac_addr);
2561 
2562                 *pprev = NULL;
2563                 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2564         }
2565 
2566         mac_perim_exit(mph);
2567         return (err);
2568 }
2569 
2570 static int
2571 aggr_remmac(void *arg, const uint8_t *mac_addr)
2572 {
2573         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2574         aggr_unicst_addr_t      *addr, **pprev;
2575         aggr_grp_t              *grp = rx_group->arg_grp;
2576         aggr_port_t             *port;
2577         mac_perim_handle_t      mph;
2578         int                     err = 0;
2579 
2580         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2581 
2582         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2583                 mac_perim_exit(mph);
2584                 return (0);
2585         }
2586 
2587         /*
2588          * Insert this mac address into the list of mac addresses owned by
2589          * the aggregation pseudo group.
2590          */
2591         pprev = &rx_group->arg_macaddr;
2592         while ((addr = *pprev) != NULL) {
2593                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2594                         pprev = &addr->aua_next;
2595                         continue;
2596                 }
2597                 break;
2598         }
2599         if (addr == NULL) {
2600                 mac_perim_exit(mph);
2601                 return (EINVAL);
2602         }
2603 
2604         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2605                 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2606 
2607         *pprev = addr->aua_next;
2608         kmem_free(addr, sizeof (aggr_unicst_addr_t));
2609 
2610         mac_perim_exit(mph);
2611         return (err);
2612 }
2613 
2614 /*
2615  * Search for VID in the Rx group's list and return a pointer if
2616  * found. Otherwise return NULL.
2617  */
2618 static aggr_vlan_t *
2619 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2620 {
2621         ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2622         for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2623             avp = list_next(&rx_group->arg_vlans, avp)) {
2624                 if (avp->av_vid == vid)
2625                         return (avp);
2626         }
2627 
2628         return (NULL);
2629 }
2630 
2631 /*
2632  * Accept traffic on the specified VID.
2633  *
2634  * Persist VLAN state in the aggr so that ports added later will
2635  * receive the correct filters. In the future it would be nice to
2636  * allow aggr to iterate its clients instead of duplicating state.
2637  */
2638 static int
2639 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2640 {
2641         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2642         aggr_grp_t              *aggr = rx_group->arg_grp;
2643         aggr_port_t             *port, *p;
2644         mac_perim_handle_t      mph;
2645         int                     err = 0;
2646         aggr_vlan_t             *avp = NULL;
2647         uint_t                  idx = rx_group->arg_index;
2648 
2649         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2650 
2651         if (vid == MAC_VLAN_UNTAGGED) {
2652                 /*
2653                  * Aggr is both a MAC provider and MAC client. As a
2654                  * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2655                  * client. As a client itself, it should pass
2656                  * VLAN_ID_NONE to its ports.
2657                  */
2658                 vid = VLAN_ID_NONE;
2659                 rx_group->arg_untagged++;
2660                 goto update_ports;
2661         }
2662 
2663         avp = aggr_find_vlan(rx_group, vid);
2664 
2665         if (avp != NULL) {
2666                 avp->av_refs++;
2667                 mac_perim_exit(mph);
2668                 return (0);
2669         }
2670 
2671         avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2672         avp->av_vid = vid;
2673         avp->av_refs = 1;
2674 
2675 update_ports:
2676         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2677                 if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2678                         break;
2679 
2680         if (err != 0) {
2681                 /*
2682                  * If any of these calls fail then we are in a
2683                  * situation where the ports have different HW state.
2684                  * There's no reasonable action the MAC client can
2685                  * take in this scenario to rectify the situation.
2686                  */
2687                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2688                         int err2;
2689 
2690                         if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2691                                 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2692                                     " from port %s: errno %d.", vid,
2693                                     mac_client_name(p->lp_mch), err2);
2694                         }
2695 
2696                 }
2697 
2698                 if (vid == VLAN_ID_NONE)
2699                         rx_group->arg_untagged--;
2700 
2701                 if (avp != NULL) {
2702                         kmem_free(avp, sizeof (aggr_vlan_t));
2703                         avp = NULL;
2704                 }
2705         }
2706 
2707         if (avp != NULL)
2708                 list_insert_tail(&rx_group->arg_vlans, avp);
2709 
2710 done:
2711         mac_perim_exit(mph);
2712         return (err);
2713 }
2714 
2715 /*
2716  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2717  */
2718 static int
2719 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2720 {
2721         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2722         aggr_grp_t              *aggr = rx_group->arg_grp;
2723         aggr_port_t             *port, *p;
2724         mac_perim_handle_t      mph;
2725         int                     err = 0;
2726         aggr_vlan_t             *avp = NULL;
2727         uint_t                  idx = rx_group->arg_index;
2728 
2729         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2730 
2731         /*
2732          * See the comment in aggr_addvlan().
2733          */
2734         if (vid == MAC_VLAN_UNTAGGED) {
2735                 vid = VLAN_ID_NONE;
2736                 rx_group->arg_untagged--;
2737 
2738                 if (rx_group->arg_untagged > 0)
2739                         goto done;
2740 
2741                 goto update_ports;
2742         }
2743 
2744         avp = aggr_find_vlan(rx_group, vid);
2745 
2746         if (avp == NULL) {
2747                 err = ENOENT;
2748                 goto done;
2749         }
2750 
2751         avp->av_refs--;
2752 
2753         if (avp->av_refs > 0)
2754                 goto done;
2755 
2756 update_ports:
2757         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2758                 if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2759                         break;
2760 
2761         /*
2762          * See the comment in aggr_addvlan() for justification of the
2763          * use of VERIFY here.
2764          */
2765         if (err != 0) {
2766                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2767                         int err2;
2768 
2769                         if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2770                                 cmn_err(CE_WARN, "Failed to add VLAN %u"
2771                                     " to port %s: errno %d.", vid,
2772                                     mac_client_name(p->lp_mch), err2);
2773                         }
2774                 }
2775 
2776                 if (avp != NULL)
2777                         avp->av_refs++;
2778 
2779                 if (vid == VLAN_ID_NONE)
2780                         rx_group->arg_untagged++;
2781 
2782                 goto done;
2783         }
2784 
2785         if (err == 0 && avp != NULL) {
2786                 VERIFY3U(avp->av_refs, ==, 0);
2787                 list_remove(&rx_group->arg_vlans, avp);
2788                 kmem_free(avp, sizeof (aggr_vlan_t));
2789         }
2790 
2791 done:
2792         mac_perim_exit(mph);
2793         return (err);
2794 }
2795 
2796 /*
2797  * Add or remove the multicast addresses that are defined for the group
2798  * to or from the specified port.
2799  *
2800  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2801  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2802  * called when the port is either stopped or detached.
2803  */
2804 void
2805 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2806 {
2807         aggr_grp_t *grp = port->lp_grp;
2808 
2809         ASSERT(MAC_PERIM_HELD(port->lp_mh));
2810         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2811 
2812         if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2813                 return;
2814 
2815         mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2816 }
2817 
2818 static int
2819 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2820 {
2821         aggr_grp_t *grp = arg;
2822         aggr_port_t *port = NULL, *errport = NULL;
2823         mac_perim_handle_t mph;
2824         int err = 0;
2825 
2826         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2827         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2828                 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2829                     !port->lp_started) {
2830                         continue;
2831                 }
2832                 err = aggr_port_multicst(port, add, addrp);
2833                 if (err != 0) {
2834                         errport = port;
2835                         break;
2836                 }
2837         }
2838 
2839         /*
2840          * At least one port caused error return and this error is returned to
2841          * mac, eventually a NAK would be sent upwards.
2842          * Some ports have this multicast address listed now, and some don't.
2843          * Treat this error as a whole aggr failure not individual port failure.
2844          * Therefore remove this multicast address from other ports.
2845          */
2846         if ((err != 0) && add) {
2847                 for (port = grp->lg_ports; port != errport;
2848                     port = port->lp_next) {
2849                         if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2850                             !port->lp_started) {
2851                                 continue;
2852                         }
2853                         (void) aggr_port_multicst(port, B_FALSE, addrp);
2854                 }
2855         }
2856         mac_perim_exit(mph);
2857         return (err);
2858 }
2859 
2860 static int
2861 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2862 {
2863         aggr_grp_t *grp = arg;
2864         mac_perim_handle_t mph;
2865         int err;
2866 
2867         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2868         err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2869             0, 0);
2870         mac_perim_exit(mph);
2871         return (err);
2872 }
2873 
2874 /*
2875  * Initialize the capabilities that are advertised for the group
2876  * according to the capabilities of the constituent ports.
2877  */
2878 static void
2879 aggr_grp_capab_set(aggr_grp_t *grp)
2880 {
2881         uint32_t cksum;
2882         aggr_port_t *port;
2883         mac_capab_lso_t cap_lso;
2884 
2885         ASSERT(grp->lg_mh == NULL);
2886         ASSERT(grp->lg_ports != NULL);
2887 
2888         grp->lg_hcksum_txflags = (uint32_t)-1;
2889         grp->lg_zcopy = B_TRUE;
2890         grp->lg_vlan = B_TRUE;
2891 
2892         grp->lg_lso = B_TRUE;
2893         grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2894         grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2895 
2896         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2897                 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2898                         cksum = 0;
2899                 grp->lg_hcksum_txflags &= cksum;
2900 
2901                 grp->lg_vlan &=
2902                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2903 
2904                 grp->lg_zcopy &=
2905                     !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2906 
2907                 grp->lg_lso &=
2908                     mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2909                 if (grp->lg_lso) {
2910                         grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2911                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2912                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2913                                 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2914                                     cap_lso.lso_basic_tcp_ipv4.lso_max;
2915                 }
2916         }
2917 }
2918 
2919 /*
2920  * Checks whether the capabilities of the port being added are compatible
2921  * with the current capabilities of the aggregation.
2922  */
2923 static boolean_t
2924 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2925 {
2926         uint32_t hcksum_txflags;
2927 
2928         ASSERT(grp->lg_ports != NULL);
2929 
2930         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2931             grp->lg_vlan) != grp->lg_vlan) {
2932                 return (B_FALSE);
2933         }
2934 
2935         if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2936             grp->lg_zcopy) != grp->lg_zcopy) {
2937                 return (B_FALSE);
2938         }
2939 
2940         if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2941                 if (grp->lg_hcksum_txflags != 0)
2942                         return (B_FALSE);
2943         } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2944             grp->lg_hcksum_txflags) {
2945                 return (B_FALSE);
2946         }
2947 
2948         if (grp->lg_lso) {
2949                 mac_capab_lso_t cap_lso;
2950 
2951                 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2952                         if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2953                             grp->lg_cap_lso.lso_flags)
2954                                 return (B_FALSE);
2955                         if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2956                             cap_lso.lso_basic_tcp_ipv4.lso_max)
2957                                 return (B_FALSE);
2958                 } else {
2959                         return (B_FALSE);
2960                 }
2961         }
2962 
2963         return (B_TRUE);
2964 }
2965 
2966 /*
2967  * Returns the maximum SDU according to the SDU of the constituent ports.
2968  */
2969 static uint_t
2970 aggr_grp_max_sdu(aggr_grp_t *grp)
2971 {
2972         uint_t max_sdu = (uint_t)-1;
2973         aggr_port_t *port;
2974 
2975         ASSERT(grp->lg_ports != NULL);
2976 
2977         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2978                 uint_t port_sdu_max;
2979 
2980                 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2981                 if (max_sdu > port_sdu_max)
2982                         max_sdu = port_sdu_max;
2983         }
2984 
2985         return (max_sdu);
2986 }
2987 
2988 /*
2989  * Checks if the maximum SDU of the specified port is compatible
2990  * with the maximum SDU of the specified aggregation group, returns
2991  * B_TRUE if it is, B_FALSE otherwise.
2992  */
2993 static boolean_t
2994 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2995 {
2996         uint_t port_sdu_max;
2997 
2998         mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2999         return (port_sdu_max >= grp->lg_max_sdu);
3000 }
3001 
3002 /*
3003  * Returns the maximum margin according to the margin of the constituent ports.
3004  */
3005 static uint32_t
3006 aggr_grp_max_margin(aggr_grp_t *grp)
3007 {
3008         uint32_t margin = UINT32_MAX;
3009         aggr_port_t *port;
3010 
3011         ASSERT(grp->lg_mh == NULL);
3012         ASSERT(grp->lg_ports != NULL);
3013 
3014         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3015                 if (margin > port->lp_margin)
3016                         margin = port->lp_margin;
3017         }
3018 
3019         grp->lg_margin = margin;
3020         return (margin);
3021 }
3022 
3023 /*
3024  * Checks if the maximum margin of the specified port is compatible
3025  * with the maximum margin of the specified aggregation group, returns
3026  * B_TRUE if it is, B_FALSE otherwise.
3027  */
3028 static boolean_t
3029 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3030 {
3031         if (port->lp_margin >= grp->lg_margin)
3032                 return (B_TRUE);
3033 
3034         /*
3035          * See whether the current margin value is allowed to be changed to
3036          * the new value.
3037          */
3038         if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3039                 return (B_FALSE);
3040 
3041         grp->lg_margin = port->lp_margin;
3042         return (B_TRUE);
3043 }
3044 
3045 /*
3046  * Set MTU on individual ports of an aggregation group
3047  */
3048 static int
3049 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3050     uint32_t *old_mtu)
3051 {
3052         boolean_t               removed = B_FALSE;
3053         mac_perim_handle_t      mph;
3054         mac_diag_t              diag;
3055         int                     err, rv, retry = 0;
3056 
3057         if (port->lp_mah != NULL) {
3058                 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3059                 port->lp_mah = NULL;
3060                 removed = B_TRUE;
3061         }
3062         err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3063 try_again:
3064         if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3065             MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3066             &port->lp_mah, 0, &diag)) != 0) {
3067                 /*
3068                  * following is a workaround for a bug in 'bge' driver.
3069                  * See CR 6794654 for more information and this work around
3070                  * will be removed once the CR is fixed.
3071                  */
3072                 if (rv == EIO && retry++ < 3) {
3073                         delay(2 * hz);
3074                         goto try_again;
3075                 }
3076                 /*
3077                  * if mac_unicast_add() failed while setting the MTU,
3078                  * detach the port from the group.
3079                  */
3080                 mac_perim_enter_by_mh(port->lp_mh, &mph);
3081                 (void) aggr_grp_detach_port(grp, port);
3082                 mac_perim_exit(mph);
3083                 cmn_err(CE_WARN, "Unable to restart the port %s while "
3084                     "setting MTU. Detaching the port from the aggregation.",
3085                     mac_client_name(port->lp_mch));
3086         }
3087         return (err);
3088 }
3089 
3090 static int
3091 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3092 {
3093         int                     err = 0, i, rv;
3094         aggr_port_t             *port;
3095         uint32_t                *mtu;
3096 
3097         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3098 
3099         /*
3100          * If the MTU being set is equal to aggr group's maximum
3101          * allowable value, then there is nothing to change
3102          */
3103         if (sdu == grp->lg_max_sdu)
3104                 return (0);
3105 
3106         /* 0 is aggr group's min sdu */
3107         if (sdu == 0)
3108                 return (EINVAL);
3109 
3110         mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3111         for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3112             port = port->lp_next, i++) {
3113                 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3114         }
3115         if (err != 0) {
3116                 /* recover from error: reset the mtus of the ports */
3117                 aggr_port_t *tmp;
3118 
3119                 for (tmp = grp->lg_ports, i = 0; tmp != port;
3120                     tmp = tmp->lp_next, i++) {
3121                         (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3122                 }
3123                 goto bail;
3124         }
3125         grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3126         rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3127         ASSERT(rv == 0);
3128 bail:
3129         kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3130         return (err);
3131 }
3132 
3133 /*
3134  * Callback functions for set/get of properties
3135  */
3136 /*ARGSUSED*/
3137 static int
3138 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3139     uint_t pr_valsize, const void *pr_val)
3140 {
3141         int             err = ENOTSUP;
3142         aggr_grp_t      *grp = m_driver;
3143 
3144         switch (pr_num) {
3145         case MAC_PROP_MTU: {
3146                 uint32_t        mtu;
3147 
3148                 if (pr_valsize < sizeof (mtu)) {
3149                         err = EINVAL;
3150                         break;
3151                 }
3152                 bcopy(pr_val, &mtu, sizeof (mtu));
3153                 err = aggr_sdu_update(grp, mtu);
3154                 break;
3155         }
3156         default:
3157                 break;
3158         }
3159         return (err);
3160 }
3161 
/*
 * One boundary of an MTU range, as used by
 * aggr_mtu_range_intersection().
 */
typedef struct rboundary {
        uint32_t        bval;   /* the boundary value */
        int             btype;  /* +1 for a range minimum, -1 for a maximum */
} rboundary_t;
3166 
3167 /*
3168  * This function finds the intersection of mtu ranges stored in arrays -
3169  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3170  * Individual arrays are assumed to contain non-overlapping ranges.
3171  * Algorithm:
3172  *   A range has two boundaries - min and max. We scan all arrays and store
3173  * each boundary as a separate element in a temporary array. We also store
3174  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3175  * array. Then we sort the temporary array in ascending order. We scan the
3176  * sorted array from lower to higher values and keep a cumulative sum of
3177  * boundary types. Element in the temporary array for which the sum reaches
3178  * mcount is a min boundary of a range in the result and next element will be
3179  * max boundary.
3180  *
3181  * Example for mcount = 3,
3182  *
3183  *  ----|_________|-------|_______|----|__|------ mrange[0]
3184  *
3185  *  -------|________|--|____________|-----|___|-- mrange[1]
3186  *
3187  *  --------|________________|-------|____|------ mrange[2]
3188  *
3189  *                                      3 2 1
3190  *                                       \|/
3191  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3192  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3193  *
3194  *                                 same min and max
3195  *                                        V
3196  *  --------|_____|-------|__|------------|------ intersecting ranges
3197  */
3198 void
3199 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3200     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3201 {
3202         mac_propval_uint32_range_t      *rval, *ur;
3203         int                             rmaxcnt, rcount;
3204         size_t                          sz_range32;
3205         rboundary_t                     *ta; /* temporary array */
3206         rboundary_t                     temp;
3207         boolean_t                       range_started = B_FALSE;
3208         int                             i, j, m, sum;
3209 
3210         sz_range32 = sizeof (mac_propval_uint32_range_t);
3211 
3212         for (i = 0, rmaxcnt = 0; i < mcount; i++)
3213                 rmaxcnt += mrange[i]->mpr_count;
3214 
3215         /* Allocate enough space to store the results */
3216         rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3217 
3218         /* Number of boundaries are twice as many as ranges */
3219         ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3220 
3221         for (i = 0, m = 0; i < mcount; i++) {
3222                 ur = &(mrange[i]->mpr_range_uint32[0]);
3223                 for (j = 0; j < mrange[i]->mpr_count; j++) {
3224                         ta[m].bval = ur[j].mpur_min;
3225                         ta[m++].btype = 1;
3226                         ta[m].bval = ur[j].mpur_max;
3227                         ta[m++].btype = -1;
3228                 }
3229         }
3230 
3231         /*
3232          * Sort the temporary array in ascending order of bval;
3233          * if boundary values are same then sort on btype.
3234          */
3235         for (i = 0; i < m-1; i++) {
3236                 for (j = i+1; j < m; j++) {
3237                         if ((ta[i].bval > ta[j].bval) ||
3238                             ((ta[i].bval == ta[j].bval) &&
3239                             (ta[i].btype < ta[j].btype))) {
3240                                 temp = ta[i];
3241                                 ta[i] = ta[j];
3242                                 ta[j] = temp;
3243                         }
3244                 }
3245         }
3246 
3247         /* Walk through temporary array to find all ranges in the results */
3248         for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3249                 sum += ta[i].btype;
3250                 if (sum == mcount) {
3251                         rval[rcount].mpur_min = ta[i].bval;
3252                         range_started = B_TRUE;
3253                 } else if (sum < mcount && range_started) {
3254                         rval[rcount++].mpur_max = ta[i].bval;
3255                         range_started = B_FALSE;
3256                 }
3257         }
3258 
3259         *prval = rval;
3260         *prmaxcnt = rmaxcnt;
3261         *prcount = rcount;
3262 
3263         kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3264 }
3265 
3266 /*
3267  * Returns the mtu ranges which could be supported by aggr group.
3268  * prmaxcnt returns the size of the buffer prval, prcount returns
3269  * the number of valid entries in prval. Caller is responsible
3270  * for freeing up prval.
3271  */
3272 int
3273 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3274     int *prmaxcnt, int *prcount)
3275 {
3276         mac_propval_range_t             **vals;
3277         aggr_port_t                     *port;
3278         mac_perim_handle_t              mph;
3279         uint_t                          i, numr;
3280         int                             err = 0;
3281         size_t                          sz_propval, sz_range32;
3282         size_t                          size;
3283 
3284         sz_propval = sizeof (mac_propval_range_t);
3285         sz_range32 = sizeof (mac_propval_uint32_range_t);
3286 
3287         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3288 
3289         vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3290             KM_SLEEP);
3291 
3292         for (port = grp->lg_ports, i = 0; port != NULL;
3293             port = port->lp_next, i++) {
3294 
3295                 size = sz_propval;
3296                 vals[i] = kmem_alloc(size, KM_SLEEP);
3297                 vals[i]->mpr_count = 1;
3298 
3299                 mac_perim_enter_by_mh(port->lp_mh, &mph);
3300 
3301                 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3302                     NULL, 0, vals[i], NULL);
3303                 if (err == ENOSPC) {
3304                         /*
3305                          * Not enough space to hold all ranges.
3306                          * Allocate extra space as indicated and retry.
3307                          */
3308                         numr = vals[i]->mpr_count;
3309                         kmem_free(vals[i], sz_propval);
3310                         size = sz_propval + (numr - 1) * sz_range32;
3311                         vals[i] = kmem_alloc(size, KM_SLEEP);
3312                         vals[i]->mpr_count = numr;
3313                         err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3314                             NULL, 0, vals[i], NULL);
3315                         ASSERT(err != ENOSPC);
3316                 }
3317                 mac_perim_exit(mph);
3318                 if (err != 0) {
3319                         kmem_free(vals[i], size);
3320                         vals[i] = NULL;
3321                         break;
3322                 }
3323         }
3324 
3325         /*
3326          * if any of the underlying ports does not support changing MTU then
3327          * just return ENOTSUP
3328          */
3329         if (port != NULL) {
3330                 ASSERT(err != 0);
3331                 goto done;
3332         }
3333 
3334         aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3335             prcount);
3336 
3337 done:
3338         for (i = 0; i < grp->lg_nports; i++) {
3339                 if (vals[i] != NULL) {
3340                         numr = vals[i]->mpr_count;
3341                         size = sz_propval + (numr - 1) * sz_range32;
3342                         kmem_free(vals[i], size);
3343                 }
3344         }
3345 
3346         kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3347         return (err);
3348 }
3349 
3350 static void
3351 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3352     mac_prop_info_handle_t prh)
3353 {
3354         aggr_grp_t                      *grp = m_driver;
3355         mac_propval_uint32_range_t      *rval = NULL;
3356         int                             i, rcount, rmaxcnt;
3357         int                             err = 0;
3358 
3359         _NOTE(ARGUNUSED(pr_name));
3360 
3361         switch (pr_num) {
3362         case MAC_PROP_MTU:
3363 
3364                 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3365                     &rcount);
3366                 if (err != 0) {
3367                         ASSERT(rval == NULL);
3368                         return;
3369                 }
3370                 for (i = 0; i < rcount; i++) {
3371                         mac_prop_info_set_range_uint32(prh,
3372                             rval[i].mpur_min, rval[i].mpur_max);
3373                 }
3374                 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3375                 break;
3376         }
3377 }