Print this page
11493 aggr needs support for multiple pseudo rx groups
Portions contributed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Joyent, Inc.
  24  */
  25 
  26 /*
  27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
  28  *
  29  * An instance of the structure aggr_grp_t is allocated for each
  30  * link aggregation group. When created, aggr_grp_t objects are
  31  * entered into the aggr_grp_hash hash table maintained by the modhash
  32  * module. The hash key is the linkid associated with the link
  33  * aggregation group.
  34  *
  35  * A set of MAC ports is associated with each aggregation group.






  36  *
  37  * Aggr pseudo TX rings
  38  * --------------------
  39  * The underlying ports (NICs) in an aggregation can have TX rings. To
  40  * enhance aggr's performance, these TX rings are made available to the
  41  * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
  42  * it is already implemented on the RX side, where they are called
  43  * pseudo RX rings. The same concept is extended to the TX side where
  44  * each TX ring of an underlying port is reflected in aggr as a pseudo
  45  * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
  46  * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
  47  * TX ring is given to the aggregation layer.
  48  *



























  49  * With this change, the outgoing stack depth looks much better:
  50  *
  51  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
  52  * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
  53  *
  54  * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
  55  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
  56  *
  57  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
  58  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
  59  * ring belonging to a port on which the packet has to be sent.
  60  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
  61  * policy and then uses the fanout_hint passed to it to pick a TX ring from
  62  * the selected port.
  63  *
  64  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
  65  * bandwidth limit is applied first on the outgoing packet and the packets
  66  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
  67  * particular TX ring.
  68  */
  69 
  70 #include <sys/types.h>
  71 #include <sys/sysmacros.h>
  72 #include <sys/conf.h>
  73 #include <sys/cmn_err.h>
  74 #include <sys/disp.h>
  75 #include <sys/list.h>
  76 #include <sys/ksynch.h>
  77 #include <sys/kmem.h>
  78 #include <sys/stream.h>
  79 #include <sys/modctl.h>
  80 #include <sys/ddi.h>
  81 #include <sys/sunddi.h>
  82 #include <sys/atomic.h>
  83 #include <sys/stat.h>
  84 #include <sys/modhash.h>
  85 #include <sys/id_space.h>
  86 #include <sys/strsun.h>
  87 #include <sys/cred.h>


 104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 105     const void *);
 106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
 107     mac_prop_info_handle_t);
 108 
     /* Port lookup and removal within a group. */
 109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
 110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
 111     boolean_t *);
 112 
     /* Group capability, SDU and margin helpers. */
 113 static void aggr_grp_capab_set(aggr_grp_t *);
 114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
 115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
 116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
 117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
 118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
 119 
     /* Pseudo RX group and pseudo RX ring handling. */
 120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
 123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
 124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);

     /*
      * Unicast address, VLAN, polling and ring/group fill routines.
      * NOTE(review): presumably entry points handed to the MAC layer; their
      * registration is not visible in this part of the file -- confirm.
      */
 125 static int aggr_addmac(void *, const uint8_t *);
 126 static int aggr_remmac(void *, const uint8_t *);
 127 static int aggr_addvlan(mac_group_driver_t, uint16_t);
 128 static int aggr_remvlan(mac_group_driver_t, uint16_t);
 129 static mblk_t *aggr_rx_poll(void *, int);
 130 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
 131     const int, mac_ring_info_t *, mac_ring_handle_t);
 132 static void aggr_fill_group(void *, mac_ring_type_t, const int,
 133     mac_group_info_t *, mac_group_handle_t);
 134 
 135 static kmem_cache_t     *aggr_grp_cache;
     /* Maps a group's linkid (see GRP_HASH_KEY) to its aggr_grp_t. */
 136 static mod_hash_t       *aggr_grp_hash;
     /* Held as reader around aggr_grp_hash lookups. */
 137 static krwlock_t        aggr_grp_lock;
 138 static uint_t           aggr_grp_cnt;
     /* ID space a group key is allocated from when the user supplies none. */
 139 static id_space_t       *key_ids;
 140 
 141 #define GRP_HASHSZ              64
     /*
      * Convert a linkid into a modhash key. The argument is parenthesized so
      * that an expression argument is cast as a whole rather than only its
      * first operand.
      */
 142 #define GRP_HASH_KEY(linkid)    ((mod_hash_key_t)(uintptr_t)(linkid))
 143 #define AGGR_PORT_NAME_DELIMIT '-'
 143 #define AGGR_PORT_NAME_DELIMIT '-'
 144 


 349 
 350         /*
 351          * Update the group link state.
 352          */
 353         if (grp->lg_link_state != LINK_STATE_UP) {
 354                 grp->lg_link_state = LINK_STATE_UP;
 355                 mutex_enter(&grp->lg_stat_lock);
 356                 grp->lg_link_duplex = LINK_DUPLEX_FULL;
 357                 mutex_exit(&grp->lg_stat_lock);
 358                 link_state_changed = B_TRUE;
 359         }
 360 
 361         /*
 362          * Update port's state.
 363          */
 364         port->lp_state = AGGR_PORT_STATE_ATTACHED;
 365 
 366         aggr_grp_multicst_port(port, B_TRUE);
 367 
 368         /*
 369          * Set port's receive callback




 370          */
 371         mac_rx_set(port->lp_mch, aggr_recv_cb, port);
 372 
 373         /*
 374          * If LACP is OFF, the port can be used to send data as soon
 375          * as its link is up and verified to be compatible with the
 376          * aggregation.
 377          *
 378          * If LACP is active or passive, notify the LACP subsystem, which
 379          * will enable sending on the port following the LACP protocol.
 380          */
 381         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 382                 aggr_send_port_enable(port);
 383         else
 384                 aggr_lacp_port_attached(port);
 385 
 386         return (link_state_changed);
 387 }
 388 
 389 boolean_t
 390 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
 391 {
 392         boolean_t link_state_changed = B_FALSE;
 393 
 394         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 395         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 396 
 397         /* update state */
 398         if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
 399                 return (B_FALSE);
 400 
 401         mac_rx_clear(port->lp_mch);
 402 
 403         aggr_grp_multicst_port(port, B_FALSE);
 404 
 405         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 406                 aggr_send_port_disable(port);
 407         else
 408                 aggr_lacp_port_detached(port);
 409 
 410         port->lp_state = AGGR_PORT_STATE_STANDBY;
 411 
 412         grp->lg_nattached_ports--;
 413         if (grp->lg_nattached_ports == 0) {
 414                 /* the last attached MAC port of the group is being detached */
 415                 grp->lg_link_state = LINK_STATE_DOWN;
 416                 mutex_enter(&grp->lg_stat_lock);
 417                 grp->lg_ifspeed = 0;
 418                 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
 419                 mutex_exit(&grp->lg_stat_lock);
 420                 link_state_changed = B_TRUE;
 421         }


 520                          * address now, and this might cause the link state
 521                          * of the aggregation to change.
 522                          */
 523                         *link_state_changedp = aggr_grp_attach_port(grp, port);
 524                 }
 525         }
 526 }
 527 
 528 /*
 529  * Add a port to a link aggregation group.
 530  */
 531 static int
 532 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
 533     aggr_port_t **pp)
 534 {
 535         aggr_port_t *port, **cport;
 536         mac_perim_handle_t mph;
 537         zoneid_t port_zoneid = ALL_ZONES;
 538         int err;
 539 
 540         /* The port must be in the same zone as the aggregation. */
 541         if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
 542                 port_zoneid = GLOBAL_ZONEID;
 543         if (grp->lg_zoneid != port_zoneid)
 544                 return (EBUSY);
 545 
 546         /*
 547          * lg_mh could be NULL when the function is called during the creation
 548          * of the aggregation.


 549          */
 550         ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
 551 
 552         /* create new port */
 553         err = aggr_port_create(grp, port_linkid, force, &port);
 554         if (err != 0)
 555                 return (err);
 556 
 557         mac_perim_enter_by_mh(port->lp_mh, &mph);
 558 
 559         /* add port to list of group constituent ports */
 560         cport = &grp->lg_ports;
 561         while (*cport != NULL)
 562                 cport = &((*cport)->lp_next);
 563         *cport = port;
 564 
 565         /*
 566          * Back reference to the group it is member of. A port always
 567          * holds a reference to its group to ensure that the back
 568          * reference is always valid.
 569          */
 570         port->lp_grp = grp;
 571         AGGR_GRP_REFHOLD(grp);
 572         grp->lg_nports++;
 573 
 574         aggr_lacp_init_port(port);
 575         mac_perim_exit(mph);
 576 
 577         if (pp != NULL)
 578                 *pp = port;
 579 


 621 {
 622         aggr_pseudo_rx_ring_t   *ring;
 623         int                     err;
 624         int                     j;
 625 
 626         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 627                 ring = rx_grp->arg_rings + j;
 628                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
 629                         break;
 630         }
 631 
 632         /*
 633          * No slot for this new RX ring.
 634          */
 635         if (j == MAX_RINGS_PER_GROUP)
 636                 return (EIO);
 637 
 638         ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 639         ring->arr_hw_rh = hw_rh;
 640         ring->arr_port = port;

 641         rx_grp->arg_ring_cnt++;
 642 
 643         /*
 644          * The group is already registered, dynamically add a new ring to the
 645          * mac group.
 646          */
 647         if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
 648                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 649                 ring->arr_hw_rh = NULL;
 650                 ring->arr_port = NULL;

 651                 rx_grp->arg_ring_cnt--;
 652         } else {
 653                 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
 654                     mac_find_ring(rx_grp->arg_gh, j));




 655         }
 656         return (err);
 657 }
 658 
 659 /*
 660  * Remove the pseudo RX ring of the given HW ring handle.
      *
      * The slots of rx_grp->arg_rings are scanned for the in-use entry whose
      * arr_hw_rh equals hw_rh. That entry is removed from the MAC group, its
      * slot is cleared, the group's ring count is decremented and the HW ring
      * is torn down. If no slot matches, nothing is done.
 661  */
 662 static void
 663 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 664 {
 665         aggr_pseudo_rx_ring_t   *ring;
 666         int                     j;
 667 
 668         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 669                 ring = rx_grp->arg_rings + j;
                     /* Skip free slots and slots mapped to other HW rings. */
 670                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
 671                     ring->arr_hw_rh != hw_rh) {
 672                         continue;
 673                 }
 674 
 675                 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
 676 
                     /* Release the slot so it can be reused for another ring. */
 677                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 678                 ring->arr_hw_rh = NULL;
 679                 ring->arr_port = NULL;

 680                 rx_grp->arg_ring_cnt--;
 681                 mac_hwring_teardown(hw_rh);
                     /* Only the first matching slot is handled. */
 682                 break;
 683         }
 684 }
 685 
 686 /*
 687  * Create pseudo rings over the HW rings of the port.
 688  *
 689  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
 690  *
 691  * o Program existing unicast filters on the pseudo group into the HW group.
 692  *
 693  * o Program existing VLAN filters on the pseudo group into the HW group.
      *
      * The caller must hold the group's MAC perimeter; the port's perimeter is
      * entered here and released before returning.
      *
      * Returns 0 on success, in which case port->lp_rx_grp_added is set. On
      * failure the error from aggr_port_addvlan(), aggr_port_addmac() or
      * aggr_add_pseudo_rx_ring() is returned after everything programmed so
      * far (pseudo rings, unicast addresses and VLANs) has been undone and,
      * if the port has a HW group, its RX client has been restarted.
 694  */
 695 static int
 696 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 697 {
 698         aggr_grp_t              *grp = port->lp_grp;
 699         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
             /*
              * addr starts at the list head so that the unwind code removes no
              * unicast address when we fail before any was programmed.
              */
 700         aggr_unicst_addr_t      *addr = rx_grp->arg_macaddr, *a;
 701         mac_perim_handle_t      pmph;
 702         aggr_vlan_t             *avp;
 703         int                     hw_rh_cnt, i = 0, j;
 704         int                     err = 0;
 705 
 706         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 707         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 708 
 709         /*
 710          * This function must be called after the aggr registers its MAC
 711          * and its Rx group has been initialized.
 712          */
 713         ASSERT(rx_grp->arg_gh != NULL);
 714 
 715         /*
 716          * Get the list of the underlying HW rings.
 717          */
 718         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 719             &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
 720 
 721         if (port->lp_hwgh != NULL) {
 722                 /*
 723                  * Quiesce the HW ring and the MAC SRS on the ring. Note
 724                  * that the HW ring will be restarted when the pseudo ring
 725                  * is started. At that time all the packets will be
 726                  * directly passed up to the pseudo Rx ring and handled
 727                  * by MAC SRS created over the pseudo Rx ring.
 728                  */
 729                 mac_rx_client_quiesce(port->lp_mch);
 730                 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
 731         }
 732 
 733         /*
 734          * Add existing VLAN and unicast address filters to the port.
 735          */
 736         for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 737             avp = list_next(&rx_grp->arg_vlans, avp)) {
 738                 if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0)
 739                         goto err;
 740         }
 741 
 742         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
 743                 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
 744                         goto err;
 745         }
 746 
 747         for (i = 0; i < hw_rh_cnt; i++) {
 748                 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
 749                 if (err != 0)
 750                         goto err;
 751         }
 752 
 753         port->lp_rx_grp_added = B_TRUE;
 754         mac_perim_exit(pmph);
 755         return (0);
 756 
 757 err:
 758         ASSERT(err != 0);
 759 
             /* Undo the pseudo rings created before the failure. */
 760         for (j = 0; j < i; j++)
 761                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
 762 
             /*
              * Undo the unicast addresses programmed before the failure: addr
              * is the entry that failed, the list head if none was attempted,
              * or NULL if all of them were added.
              */
 763         for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
 764                 aggr_port_remmac(port, a->aua_addr);
 765 
             /*
              * Undo the VLANs. A non-NULL avp is the entry that failed, so
              * start with the one before it. A NULL avp means the VLAN loop
              * ran to completion and a later step failed, so every VLAN on
              * the list was added and must be removed, starting at the tail.
              */
 766         if (avp != NULL)
 767                 avp = list_prev(&rx_grp->arg_vlans, avp);
             else
                     avp = list_tail(&rx_grp->arg_vlans);
 768 
 769         for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
 770                 int err2;
 771 
 772                 if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) {
 773                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 774                             ": errno %d.", avp->av_vid,
 775                             mac_client_name(port->lp_mch), err2);
 776                 }
 777         }
 778 
             /* Reverse the quiesce done above so the port receives again. */
 779         if (port->lp_hwgh != NULL) {
 780                 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 781                 mac_rx_client_restart(port->lp_mch);
 782                 port->lp_hwgh = NULL;
 783         }
 784 
 785         mac_perim_exit(pmph);
 786         return (err);
 787 }
 788 
 789 /*
 790  * Destroy the pseudo rings mapping to this port and remove all VLAN
 791  * and unicast filters from this port. Even if there are no underlying
 792  * HW rings we must still remove the unicast filters to take the port
 793  * out of promisc mode.
      *
      * The caller must hold the group's MAC perimeter; the port's perimeter is
      * entered here and released before returning. If the pseudo RX group was
      * never added to this port (lp_rx_grp_added is false) nothing is done.
 794  */
 795 static void
 796 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 797 {
 798         aggr_grp_t              *grp = port->lp_grp;
 799         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 800         aggr_unicst_addr_t      *addr;
             /* Only used as the group out-argument of mac_hwrings_get(). */
 801         mac_group_handle_t      hwgh;
 802         mac_perim_handle_t      pmph;
 803         int                     hw_rh_cnt, i;

 804 
 805         ASSERT(MAC_PERIM_HELD(grp->lg_mh));


 806         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 807 
 808         if (!port->lp_rx_grp_added)
 809                 goto done;
 810 
 811         ASSERT(rx_grp->arg_gh != NULL);
 812         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 813             &hwgh, hw_rh, MAC_RING_TYPE_RX);
 814 
             /* Drop the pseudo ring that maps to each of the port's HW rings. */
 815         for (i = 0; i < hw_rh_cnt; i++)
 816                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
 817 
             /* Remove every unicast address of the pseudo group from the port. */
 818         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
 819                 aggr_port_remmac(port, addr->aua_addr);
 820 
             /* Remove every VLAN of the pseudo group; failures are only logged. */
 821         for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 822             avp = list_next(&rx_grp->arg_vlans, avp)) {
 823                 int err;
 824 
 825                 if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) {
 826                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 827                             ": errno %d.", avp->av_vid,
 828                             mac_client_name(port->lp_mch), err);
 829                 }
 830         }
 831 
 832         if (port->lp_hwgh != NULL) {
 833                 port->lp_hwgh = NULL;
 834 
 835                 /*
 836                  * First clear the permanent-quiesced flag of the RX srs then
 837                  * restart the HW ring and the mac srs on the ring. Note that
 838                  * the HW ring and associated SRS will soon be removed when
 839                  * the port is removed from the aggr.
 840                  */
 841                 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
 842                 mac_rx_client_restart(port->lp_mch);
 843         }
 844 
 845         port->lp_rx_grp_added = B_FALSE;
 846 done:
 847         mac_perim_exit(pmph);
 848 }
 849 
 850 /*
 851  * Add a pseudo TX ring for the given HW ring handle.
 852  */
 853 static int
 854 aggr_add_pseudo_tx_ring(aggr_port_t *port,
 855     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
 856     mac_ring_handle_t *pseudo_rh)
 857 {
 858         aggr_pseudo_tx_ring_t   *ring;
 859         int                     err;
 860         int                     i;
 861 
 862         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 863         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 864                 ring = tx_grp->atg_rings + i;
 865                 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
 866                         break;


 930 /*
 931  * This function is called to create pseudo rings over hardware rings of
 932  * the underlying device. There is a 1:1 mapping between the pseudo TX
 933  * rings of the aggr and the hardware rings of the underlying port.
 934  */
 935 static int
 936 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 937 {
 938         aggr_grp_t              *grp = port->lp_grp;
 939         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
 940         mac_perim_handle_t      pmph;
 941         int                     hw_rh_cnt, i = 0, j;
 942         int                     err = 0;
 943 
 944         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 945         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 946 
 947         /*
 948          * Get the list of the underlying HW rings.
 949          */
 950         hw_rh_cnt = mac_hwrings_get(port->lp_mch,
 951             NULL, hw_rh, MAC_RING_TYPE_TX);
 952 
 953         /*
 954          * Even if the underlying NIC does not have TX rings, we
 955          * still make a pseudo TX ring for that NIC with NULL as
 956          * the ring handle.
 957          */
 958         if (hw_rh_cnt == 0)
 959                 port->lp_tx_ring_cnt = 1;
 960         else
 961                 port->lp_tx_ring_cnt = hw_rh_cnt;
 962 
 963         port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 964             port->lp_tx_ring_cnt), KM_SLEEP);
 965         port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 966             port->lp_tx_ring_cnt), KM_SLEEP);
 967 
 968         if (hw_rh_cnt == 0) {
 969                 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
 970                     NULL, &pseudo_rh)) == 0) {
 971                         port->lp_tx_rings[0] = NULL;


1037         aggr_grp_update_default(grp);
1038 done:
1039         mac_perim_exit(pmph);
1040 }
1041 
     /*
      * Disable interrupts for a pseudo RX ring by forwarding the request to
      * the HW ring it maps to. ih is the aggr_pseudo_rx_ring_t of the pseudo
      * ring; the result of mac_hwring_disable_intr() is returned unchanged.
      */
1042 static int
1043 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1044 {
1045         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1046         return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1047 }
1048 
     /*
      * Enable interrupts for a pseudo RX ring by forwarding the request to
      * the HW ring it maps to. ih is the aggr_pseudo_rx_ring_t of the pseudo
      * ring; the result of mac_hwring_enable_intr() is returned unchanged.
      */
1049 static int
1050 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1051 {
1052         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053         return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1054 }
1055 
1056 /*
1057  * Here we need to start the pseudo-ring. As MAC already ensures that the
1058  * underlying device is set up, all we need to do is save the ring generation.
1059  *
1060  * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
1061  * functions here as those don't actually stop and start the ring, they just
1062  * quiesce the ring. Regardless of whether the aggr is logically up or not, we
1063  * want to make sure that we can receive traffic for LACP.
      *
      * arg is the aggr_pseudo_rx_ring_t being started and mr_gen is the ring
      * generation number, which is stored in arr_gen. Always returns 0.
1064  */
1065 static int
1066 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1067 {

1068         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1069 





1070         rr_ring->arr_gen = mr_gen;
1071         return (0);
1072 }
1073 
1074 /*
1075  * Add one or more ports to an existing link aggregation group.
      *
      * linkid identifies the group, ports[] holds the nports links to add and
      * force is passed through to aggr_grp_add_port().
      *
      * Returns 0 on success (including when nports is 0), ENOENT if no group
      * is registered for linkid, ENOTSUP if a port fails the capability, SDU
      * or margin check, or the error of the step that failed. On failure
      * every port added by this call is stopped (if the group is started),
      * stripped of its pseudo TX/RX groups and removed from the group again.
1076  */
1077 int
1078 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1079     laioc_port_t *ports)
1080 {
             /*
              * rc must start at 0: with nports == 0 the loop below never
              * assigns it, yet it is tested at "bail" and returned.
              */
1081         int rc = 0, i, nadded = 0;
1082         aggr_grp_t *grp = NULL;
1083         aggr_port_t *port;
1084         boolean_t link_state_changed = B_FALSE;
1085         mac_perim_handle_t mph, pmph;
1086 
1087         /* get group corresponding to linkid */
1088         rw_enter(&aggr_grp_lock, RW_READER);
1089         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1090             (mod_hash_val_t *)&grp) != 0) {
1091                 rw_exit(&aggr_grp_lock);
1092                 return (ENOENT);
1093         }
1094         AGGR_GRP_REFHOLD(grp);
1095 
1096         /*
1097          * Hold the perimeter so that the aggregation won't be destroyed.
1098          */
1099         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1100         rw_exit(&aggr_grp_lock);
1101 
1102         /* add the specified ports to group */
1103         for (i = 0; i < nports; i++) {
1104                 /* add port to group */
1105                 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1106                     force, &port)) != 0) {
1107                         goto bail;
1108                 }
1109                 ASSERT(port != NULL);
                     /*
                      * Count the port right away so that the cleanup at
                      * "bail" also removes it if a later step fails.
                      */
1110                 nadded++;
1111 
1112                 /* check capabilities */
1113                 if (!aggr_grp_capab_check(grp, port) ||
1114                     !aggr_grp_sdu_check(grp, port) ||
1115                     !aggr_grp_margin_check(grp, port)) {
1116                         rc = ENOTSUP;
1117                         goto bail;
1118                 }
1119 
1120                 /*
1121                  * Create the pseudo ring for each HW ring of the underlying
1122                  * port.
1123                  */
1124                 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1125                 if (rc != 0)
1126                         goto bail;
1127                 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1128                 if (rc != 0)
1129                         goto bail;
1130 
1131                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1132 
1133                 /* set LACP mode */
1134                 aggr_port_lacp_set_mode(grp, port);
1135 
1136                 /* start port if group has already been started */
1137                 if (grp->lg_started) {
1138                         rc = aggr_port_start(port);
1139                         if (rc != 0) {
1140                                 mac_perim_exit(pmph);
1141                                 goto bail;
1142                         }
1143 
1144                         /*
1145                          * Turn on the promiscuous mode over the port when it
1146                          * is requested to be turned on to receive the
1147                          * non-primary address over a port, or the promiscuous
1148                          * mode is enabled over the aggr.
1149                          */
1150                         if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1151                                 rc = aggr_port_promisc(port, B_TRUE);
1152                                 if (rc != 0) {
1153                                         mac_perim_exit(pmph);
1154                                         goto bail;
1155                                 }
1156                         }
1157                 }
1158                 mac_perim_exit(pmph);
1159 
1160                 /*
1161                  * Attach each port if necessary.
1162                  */
1163                 if (aggr_port_notify_link(grp, port))
1164                         link_state_changed = B_TRUE;
1165 
1166                 /*
1167                  * Initialize the callback functions for this port.
1168                  */
1169                 aggr_port_init_callbacks(port);
1170         }
1171 
1172         /* update the MAC address of the constituent ports */
1173         if (aggr_grp_update_ports_mac(grp))
1174                 link_state_changed = B_TRUE;
1175 
1176         if (link_state_changed)
1177                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1178 
1179 bail:
1180         if (rc != 0) {
1181                 /* stop and remove ports that have been added */
1182                 for (i = 0; i < nadded; i++) {
1183                         port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1184                         ASSERT(port != NULL);
1185                         if (grp->lg_started) {
1186                                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1187                                 (void) aggr_port_promisc(port, B_FALSE);
1188                                 aggr_port_stop(port);
1189                                 mac_perim_exit(pmph);
1190                         }
1191                         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1192                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1193                         (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1194                 }
1195         }
1196 
1197         mac_perim_exit(mph);
1198         AGGR_GRP_REFRELE(grp);
1199         return (rc);
1200 }
1201 
1202 static int
1203 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1204     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1205     aggr_lacp_timer_t lacp_timer)
1206 {
1207         boolean_t mac_addr_changed = B_FALSE;
1208         boolean_t link_state_changed = B_FALSE;
1209         mac_perim_handle_t pmph;
1210 
1211         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1212 


1334         grp->lg_refs = 1;
1335         grp->lg_closing = B_FALSE;
1336         grp->lg_force = force;
1337         grp->lg_linkid = linkid;
1338         grp->lg_zoneid = crgetzoneid(credp);
1339         grp->lg_ifspeed = 0;
1340         grp->lg_link_state = LINK_STATE_UNKNOWN;
1341         grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1342         grp->lg_started = B_FALSE;
1343         grp->lg_promisc = B_FALSE;
1344         grp->lg_lacp_done = B_FALSE;
1345         grp->lg_tx_notify_done = B_FALSE;
1346         grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1347         grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1348             aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1349         grp->lg_tx_notify_thread = thread_create(NULL, 0,
1350             aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1351         grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1352             MAX_RINGS_PER_GROUP), KM_SLEEP);
1353         grp->lg_tx_blocked_cnt = 0;
1354         bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));

1355         bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1356         aggr_lacp_init_grp(grp);
1357 
1358         grp->lg_rx_group.arg_untagged = 0;
1359         list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t),
1360             offsetof(aggr_vlan_t, av_link));
1361 
1362         /* add MAC ports to group */
1363         grp->lg_ports = NULL;
1364         grp->lg_nports = 0;
1365         grp->lg_nattached_ports = 0;
1366         grp->lg_ntx_ports = 0;
1367 
1368         /*
1369          * If key is not specified by the user, allocate the key.
1370          */
1371         if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1372                 err = ENOMEM;
1373                 goto bail;
1374         }
1375         grp->lg_key = key;
1376 
1377         for (i = 0; i < nports; i++) {
1378                 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1379                 if (err != 0)
1380                         goto bail;
1381         }
1382 










1383         /*



























1384          * If no explicit MAC address was specified by the administrator,
1385          * set it to the MAC address of the first port.
1386          */
1387         grp->lg_addr_fixed = mac_fixed;
1388         if (grp->lg_addr_fixed) {
1389                 /* validate specified address */
1390                 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1391                         err = EINVAL;
1392                         goto bail;
1393                 }
1394                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1395         } else {
1396                 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1397                 grp->lg_mac_addr_port = grp->lg_ports;
1398         }
1399 
1400         /* set the initial group capabilities */
1401         aggr_grp_capab_set(grp);
1402 
1403         if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1404                 err = ENOMEM;
1405                 goto bail;
1406         }
1407         mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1408         mac->m_driver = grp;
1409         mac->m_dip = aggr_dip;
1410         mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1411         mac->m_src_addr = grp->lg_addr;
1412         mac->m_callbacks = &aggr_m_callbacks;
1413         mac->m_min_sdu = 0;
1414         mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1415         mac->m_margin = aggr_grp_max_margin(grp);
1416         mac->m_v12n = MAC_VIRT_LEVEL1;
1417         err = mac_register(mac, &grp->lg_mh);
1418         mac_free(mac);
1419         if (err != 0)
1420                 goto bail;
1421 
1422         err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1423         if (err != 0) {
1424                 (void) mac_unregister(grp->lg_mh);
1425                 grp->lg_mh = NULL;
1426                 goto bail;
1427         }
1428 
1429         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1430 
1431         /*
1432          * Update the MAC address of the constituent ports.
1433          * None of the ports are attached at this time, so the link state of
1434          * the aggregation will not change.




1435          */
1436         link_state_changed = aggr_grp_update_ports_mac(grp);
1437         ASSERT(!link_state_changed);
1438 
1439         /* update outbound load balancing policy */
1440         aggr_send_update_policy(grp, policy);
1441 
1442         /* set LACP mode */
1443         aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1444 
1445         /*
1446          * Attach each port if necessary.
1447          */
1448         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1449                 /*
1450                  * Create the pseudo ring for each HW ring of the underlying
1451                  * port. Note that this is done after the aggr registers the
1452                  * mac.
1453                  */
1454                 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1455                 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);






1456                 if (aggr_port_notify_link(grp, port))
1457                         link_state_changed = B_TRUE;
1458 
1459                 /*
1460                  * Initialize the callback functions for this port.
1461                  */
1462                 aggr_port_init_callbacks(port);
1463         }
1464 
1465         if (link_state_changed)
1466                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1467 
1468         /* add new group to hash table */
1469         err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1470             (mod_hash_val_t)grp);
1471         ASSERT(err == 0);
1472         aggr_grp_cnt++;
1473 
1474         mac_perim_exit(mph);
1475         rw_exit(&aggr_grp_lock);


1717 
1718                 /* stop port if group has already been started */
1719                 if (grp->lg_started) {
1720                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1721                         aggr_port_stop(port);
1722                         mac_perim_exit(pmph);
1723                 }
1724 
1725                 /*
1726                  * aggr_rem_pseudo_tx_group() is not called here. Instead
1727                  * it is called from inside aggr_grp_rem_port() after the
1728                  * port has been detached. The reason is that
1729                  * aggr_rem_pseudo_tx_group() removes one ring at a time
1730                  * and if there is still traffic going on, then there
1731                  * is the possibility of aggr_find_tx_ring() returning a
1732                  * removed ring for transmission. Once the port has been
1733                  * detached, that port will not be used and
1734                  * aggr_find_tx_ring() will not return any rings
1735                  * belonging to it.
1736                  */
1737                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);

1738 
1739                 /* remove port from group */
1740                 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1741                     &link_state_changed);
1742                 ASSERT(rc == 0);
1743                 mac_addr_update = mac_addr_update || mac_addr_changed;
1744                 link_state_update = link_state_update || link_state_changed;
1745         }
1746 
1747 bail:
1748         if (mac_addr_update)
1749                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1750         if (link_state_update)
1751                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1752 
1753         mac_perim_exit(mph);
1754         AGGR_GRP_REFRELE(grp);
1755 
1756         return (rc);
1757 }


1822                 grp->lg_tx_notify_done = B_TRUE;
1823                 cv_signal(&grp->lg_tx_flowctl_cv);
1824         }
1825         mutex_exit(&grp->lg_tx_flowctl_lock);
1826         if (tid != 0)
1827                 thread_join(tid);
1828 
1829         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1830 
1831         grp->lg_closing = B_TRUE;
1832         /* detach and free MAC ports associated with group */
1833         port = grp->lg_ports;
1834         while (port != NULL) {
1835                 cport = port->lp_next;
1836                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1837                 if (grp->lg_started)
1838                         aggr_port_stop(port);
1839                 (void) aggr_grp_detach_port(grp, port);
1840                 mac_perim_exit(pmph);
1841                 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1842                 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);

1843                 aggr_port_delete(port);
1844                 port = cport;
1845         }
1846 
1847         mac_perim_exit(mph);
1848 
1849         kmem_free(grp->lg_tx_blocked_rings,
1850             (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1851         /*
1852          * Wait for the port's lacp timer thread and its notification callback
1853          * to exit before calling mac_unregister() since both need to access
1854          * the mac perimeter of the grp.
1855          */
1856         aggr_grp_port_wait(grp);
1857 
1858         VERIFY(mac_unregister(grp->lg_mh) == 0);
1859         grp->lg_mh = NULL;
1860 
1861         list_destroy(&(grp->lg_rx_group.arg_vlans));


1862 
1863         AGGR_GRP_REFRELE(grp);
1864         return (0);
1865 }
1866 
1867 void
1868 aggr_grp_free(aggr_grp_t *grp)
1869 {
             /*
              * Release a group structure back to aggr_grp_cache. The caller
              * must have dropped every group reference and every port
              * reference first; both counts are asserted to be zero.
              */
1870         ASSERT(grp->lg_refs == 0);
1871         ASSERT(grp->lg_port_ref == 0);
             /*
              * Keys above AGGR_MAX_KEY are handed back to the key_ids id
              * space. NOTE(review): presumably these are the keys that were
              * allocated from key_ids when the user did not specify one --
              * confirm against the create path.
              */
1872         if (grp->lg_key > AGGR_MAX_KEY) {
1873                 id_free(key_ids, grp->lg_key);
1874                 grp->lg_key = 0;
1875         }
1876         kmem_cache_free(aggr_grp_cache, grp);
1877 }
1878 
1879 int
1880 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1881     aggr_grp_info_new_grp_fn_t new_grp_fn,


2207                 uint32_t *hcksum_txflags = cap_data;
2208                 *hcksum_txflags = grp->lg_hcksum_txflags;
2209                 break;
2210         }
2211         case MAC_CAPAB_LSO: {
2212                 mac_capab_lso_t *cap_lso = cap_data;
2213 
2214                 if (grp->lg_lso) {
2215                         *cap_lso = grp->lg_cap_lso;
2216                         break;
2217                 } else {
2218                         return (B_FALSE);
2219                 }
2220         }
2221         case MAC_CAPAB_NO_NATIVEVLAN:
2222                 return (!grp->lg_vlan);
2223         case MAC_CAPAB_NO_ZCOPY:
2224                 return (!grp->lg_zcopy);
2225         case MAC_CAPAB_RINGS: {
2226                 mac_capab_rings_t *cap_rings = cap_data;

2227 



2228                 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2229                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2230                         cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2231 
2232                         /*
2233                          * An aggregation advertises only one (pseudo) RX
2234                          * group, which virtualizes the main/primary group of
2235                          * the underlying devices.
2236                          */
2237                         cap_rings->mr_gnum = 1;
2238                         cap_rings->mr_gaddring = NULL;
2239                         cap_rings->mr_gremring = NULL;
2240                 } else {
2241                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2242                         cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2243                         cap_rings->mr_gnum = 0;
2244                 }
2245                 cap_rings->mr_rget = aggr_fill_ring;
2246                 cap_rings->mr_gget = aggr_fill_group;
2247                 break;
2248         }
2249         case MAC_CAPAB_AGGR:
2250         {
2251                 mac_capab_aggr_t *aggr_cap;
2252 
2253                 if (cap_data != NULL) {
2254                         aggr_cap = cap_data;
2255                         aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2256                         aggr_cap->mca_unicst = aggr_m_unicst;
2257                         aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2258                         aggr_cap->mca_arg = arg;
2259                 }
2260                 return (B_TRUE);
2261         }
2262         default:
2263                 return (B_FALSE);
2264         }
2265         return (B_TRUE);
2266 }
2267 
2268 /*
2269  * Callback function for MAC layer to register groups.
      *
      * arg is the aggr_grp_t, rtype selects the Rx or Tx side, index is the
      * group index (only index 0 is accepted here), infop is the group info
      * structure filled in for the MAC layer, and gh is the MAC layer's
      * handle for the group, which is saved in the matching pseudo group.
2270  */
2271 static void
2272 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2273     mac_group_info_t *infop, mac_group_handle_t gh)
2274 {
2275         aggr_grp_t *grp = arg;
2276         aggr_pseudo_rx_group_t *rx_group;
2277         aggr_pseudo_tx_group_t *tx_group;
2278 
2279         ASSERT(index == 0);
2280         if (rtype == MAC_RING_TYPE_RX) {
2281                 rx_group = &grp->lg_rx_group;

                     /* Remember the MAC group handle and the owning aggr. */
2282                 rx_group->arg_gh = gh;
2283                 rx_group->arg_grp = grp;
2284 
                     /*
                      * The pseudo Rx group itself is the driver handle passed
                      * back to the add/remove MAC address callbacks. No group
                      * start/stop entry points are provided.
                      */
2285                 infop->mgi_driver = (mac_group_driver_t)rx_group;
2286                 infop->mgi_start = NULL;
2287                 infop->mgi_stop = NULL;
2288                 infop->mgi_addmac = aggr_addmac;
2289                 infop->mgi_remmac = aggr_remmac;
2290                 infop->mgi_count = rx_group->arg_ring_cnt;
2291 
2292                 /*
2293                  * Always set the HW VLAN callbacks. They are smart
2294                  * enough to know when a port has HW VLAN filters to
2295                  * program and when it doesn't.
2296                  */
2297                 infop->mgi_addvlan = aggr_addvlan;
2298                 infop->mgi_remvlan = aggr_remvlan;
2299         } else {
                     /*
                      * For the Tx side only the group handle is recorded;
                      * infop is left untouched.
                      */
2300                 tx_group = &grp->lg_tx_group;


2301                 tx_group->atg_gh = gh;
2302         }
2303 }
2304 
2305 /*
2306  * Callback function for MAC layer to register all rings.
2307  */
2308 static void
2309 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2310     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2311 {
2312         aggr_grp_t      *grp = arg;
2313 
2314         switch (rtype) {
2315         case MAC_RING_TYPE_RX: {
2316                 aggr_pseudo_rx_group_t  *rx_group = &grp->lg_rx_group;
2317                 aggr_pseudo_rx_ring_t   *rx_ring;
2318                 mac_intr_t              aggr_mac_intr;
2319 
2320                 ASSERT(rg_index == 0);
2321 
2322                 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2323                 rx_ring = rx_group->arg_rings + index;
2324                 rx_ring->arr_rh = rh;
2325 
2326                 /*
2327                  * Entrypoint to enable interrupt (disable poll) and
2328                  * disable interrupt (enable poll).
2329                  */
2330                 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2331                 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2332                 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2333                 aggr_mac_intr.mi_ddi_handle = NULL;
2334 
2335                 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2336                 infop->mri_start = aggr_pseudo_start_ring;
2337                 infop->mri_stop = NULL;
2338 
2339                 infop->mri_intr = aggr_mac_intr;
2340                 infop->mri_poll = aggr_rx_poll;
2341 
2342                 infop->mri_stat = aggr_rx_ring_stat;
2343                 break;
2344         }
2345         case MAC_RING_TYPE_TX: {
2346                 aggr_pseudo_tx_group_t  *tx_group = &grp->lg_tx_group;
2347                 aggr_pseudo_tx_ring_t   *tx_ring;
2348 
2349                 ASSERT(rg_index == -1);
2350                 ASSERT(index < tx_group->atg_ring_cnt);
2351 
2352                 tx_ring = &tx_group->atg_rings[index];
2353                 tx_ring->atr_rh = rh;
2354 
2355                 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2356                 infop->mri_start = NULL;
2357                 infop->mri_stop = NULL;


2404                 if (!port->lp_collector_enabled) {
2405                         *mpp = mp->b_next;
2406                         mp->b_next = NULL;
2407                         freemsg(mp);
2408                         continue;
2409                 }
2410                 mpp = &mp->b_next;
2411         }
2412         return (mp_chain);
2413 }
2414 
2415 static int
2416 aggr_addmac(void *arg, const uint8_t *mac_addr)
2417 {
             /*
              * Add a unicast MAC address to the pseudo Rx group passed in
              * arg and program it on every port of the aggregation.
              *
              * Returns 0 on success (or when mac_addr is the aggregation's
              * own address, in which case nothing is done), EEXIST when the
              * address is already on the group's list, or the error from
              * aggr_port_addmac() when programming a port fails.
              */
2418         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2419         aggr_unicst_addr_t      *addr, **pprev;
2420         aggr_grp_t              *grp = rx_group->arg_grp;
2421         aggr_port_t             *port, *p;
2422         mac_perim_handle_t      mph;
2423         int                     err = 0;

2424 
2425         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2426 
             /* The aggregation's own address is not tracked on the list. */
2427         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2428                 mac_perim_exit(mph);
2429                 return (0);
2430         }
2431 
2432         /*
2433          * Insert this mac address into the list of mac addresses owned by
2434          * the aggregation pseudo group.
2435          */
             /*
              * Walk to the end of the list, rejecting duplicates. When the
              * loop finishes pprev points at the tail link, so the new
              * entry is appended.
              */
2436         pprev = &rx_group->arg_macaddr;
2437         while ((addr = *pprev) != NULL) {
2438                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2439                         mac_perim_exit(mph);
2440                         return (EEXIST);
2441                 }
2442                 pprev = &addr->aua_next;
2443         }
2444         addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2445         bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2446         addr->aua_next = NULL;
2447         *pprev = addr;
2448 
             /* Program the address on each port; stop at the first failure. */
2449         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2450                 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2451                         break;
2452 
             /*
              * On failure, undo the ports that were programmed before the
              * failing one, then unlink the new entry (it is the tail, so
              * resetting the link to NULL removes it) and free it.
              */
2453         if (err != 0) {
2454                 for (p = grp->lg_ports; p != port; p = p->lp_next)
2455                         aggr_port_remmac(p, mac_addr);
2456 
2457                 *pprev = NULL;
2458                 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2459         }
2460 
2461         mac_perim_exit(mph);
2462         return (err);
2463 }
2464 
2465 static int
2466 aggr_remmac(void *arg, const uint8_t *mac_addr)
2467 {
2468         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2469         aggr_unicst_addr_t      *addr, **pprev;
2470         aggr_grp_t              *grp = rx_group->arg_grp;
2471         aggr_port_t             *port;
2472         mac_perim_handle_t      mph;
2473         int                     err = 0;
2474 
2475         mac_perim_enter_by_mh(grp->lg_mh, &mph);


2480         }
2481 
2482         /*
2483          * Look up this mac address in the list of mac addresses owned by
2484          * the aggregation pseudo group so that it can be removed.
2485          */
2486         pprev = &rx_group->arg_macaddr;
2487         while ((addr = *pprev) != NULL) {
2488                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2489                         pprev = &addr->aua_next;
2490                         continue;
2491                 }
2492                 break;
2493         }
2494         if (addr == NULL) {
2495                 mac_perim_exit(mph);
2496                 return (EINVAL);
2497         }
2498 
2499         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2500                 aggr_port_remmac(port, mac_addr);
2501 
2502         *pprev = addr->aua_next;
2503         kmem_free(addr, sizeof (aggr_unicst_addr_t));
2504 
2505         mac_perim_exit(mph);
2506         return (err);
2507 }
2508 
2509 /*
2510  * Search for VID in the Rx group's list and return a pointer if
2511  * found. Otherwise return NULL.
2512  */
2513 static aggr_vlan_t *
2514 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2515 {
2516         ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2517         for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2518             avp = list_next(&rx_group->arg_vlans, avp)) {
2519                 if (avp->av_vid == vid)
2520                         return (avp);


2522 
2523         return (NULL);
2524 }
2525 
2526 /*
2527  * Accept traffic on the specified VID.
2528  *
2529  * Persist VLAN state in the aggr so that ports added later will
2530  * receive the correct filters. In the future it would be nice to
2531  * allow aggr to iterate its clients instead of duplicating state.
      *
      * gdriver is the pseudo Rx group. Returns 0 on success, or the error
      * from aggr_port_addvlan() if programming any port fails; in that
      * case the ports already updated are rolled back and no state is
      * kept for the VID.
2532  */
2533 static int
2534 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2535 {
2536         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2537         aggr_grp_t              *aggr = rx_group->arg_grp;
2538         aggr_port_t             *port, *p;
2539         mac_perim_handle_t      mph;
2540         int                     err = 0;
2541         aggr_vlan_t             *avp = NULL;

2542 
2543         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2544 
2545         if (vid == MAC_VLAN_UNTAGGED) {
2546                 /*
2547                  * Aggr is both a MAC provider and MAC client. As a
2548                  * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2549                  * client. As a client itself, it should pass
2550                  * VLAN_ID_NONE to its ports.
2551                  */
                     /*
                      * Untagged use is tracked by a counter on the group
                      * rather than by an aggr_vlan_t entry, so avp stays
                      * NULL on this path.
                      */
2552                 vid = VLAN_ID_NONE;
2553                 rx_group->arg_untagged++;
2554                 goto update_ports;
2555         }
2556 
2557         avp = aggr_find_vlan(rx_group, vid);
2558 
             /* VID already known: just take another reference. */
2559         if (avp != NULL) {
2560                 avp->av_refs++;
2561                 mac_perim_exit(mph);
2562                 return (0);
2563         }
2564 
             /*
              * First use of this VID. The entry is only inserted into the
              * group's list once every port has accepted the VLAN.
              */
2565         avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2566         avp->av_vid = vid;
2567         avp->av_refs = 1;
2568 
2569 update_ports:
2570         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2571                 if ((err = aggr_port_addvlan(port, vid)) != 0)
2572                         break;
2573 
2574         if (err != 0) {
2575                 /*
2576                  * If any of these calls fail then we are in a
2577                  * situation where the ports have different HW state.
2578                  * There's no reasonable action the MAC client can
2579                  * take in this scenario to rectify the situation.
2580                  */
                     /* Roll back only the ports before the one that failed. */
2581                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2582                         int err2;
2583 
2584                         if ((err2 = aggr_port_remvlan(p, vid)) != 0) {
2585                                 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2586                                     " from port %s: errno %d.", vid,
2587                                     mac_client_name(p->lp_mch), err2);
2588                         }
2589 
2590                 }
2591 
                     /* Undo the untagged count taken above. */
2592                 if (vid == VLAN_ID_NONE)
2593                         rx_group->arg_untagged--;
2594 
                     /* Discard the entry that was never inserted. */
2595                 if (avp != NULL) {
2596                         kmem_free(avp, sizeof (aggr_vlan_t));
2597                         avp = NULL;
2598                 }
2599         }
2600 
2601         if (avp != NULL)
2602                 list_insert_tail(&rx_group->arg_vlans, avp);
2603 
2604 done:
2605         mac_perim_exit(mph);
2606         return (err);
2607 }
2608 
2609 /*
2610  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2611  */
2612 static int
2613 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2614 {
2615         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2616         aggr_grp_t              *aggr = rx_group->arg_grp;
2617         aggr_port_t             *port, *p;
2618         mac_perim_handle_t      mph;
2619         int                     err = 0;
2620         aggr_vlan_t             *avp = NULL;

2621 
2622         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2623 
2624         /*
2625          * See the comment in aggr_addvlan().
2626          */
2627         if (vid == MAC_VLAN_UNTAGGED) {
2628                 vid = VLAN_ID_NONE;
2629                 rx_group->arg_untagged--;
2630 
2631                 if (rx_group->arg_untagged > 0)
2632                         goto done;
2633 
2634                 goto update_ports;
2635         }
2636 
2637         avp = aggr_find_vlan(rx_group, vid);
2638 
2639         if (avp == NULL) {
2640                 err = ENOENT;
2641                 goto done;
2642         }
2643 
2644         avp->av_refs--;
2645 
2646         if (avp->av_refs > 0)
2647                 goto done;
2648 
2649 update_ports:
2650         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2651                 if ((err = aggr_port_remvlan(port, vid)) != 0)
2652                         break;
2653 
2654         /*
2655          * See the comment in aggr_addvlan() for justification of the
2656          * use of VERIFY here.
2657          */
2658         if (err != 0) {
2659                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2660                         int err2;
2661 
2662                         if ((err2 = aggr_port_addvlan(p, vid)) != 0) {
2663                                 cmn_err(CE_WARN, "Failed to add VLAN %u"
2664                                     " to port %s: errno %d.", vid,
2665                                     mac_client_name(p->lp_mch), err2);
2666                         }
2667                 }
2668 
2669                 if (avp != NULL)
2670                         avp->av_refs++;
2671 
2672                 if (vid == VLAN_ID_NONE)
2673                         rx_group->arg_untagged++;
2674 
2675                 goto done;
2676         }
2677 
2678         if (err == 0 && avp != NULL) {
2679                 VERIFY3U(avp->av_refs, ==, 0);
2680                 list_remove(&rx_group->arg_vlans, avp);
2681                 kmem_free(avp, sizeof (aggr_vlan_t));
2682         }




  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2018 Joyent, Inc.
  24  */
  25 
  26 /*
  27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
  28  *
  29  * An instance of the structure aggr_grp_t is allocated for each
  30  * link aggregation group. When created, aggr_grp_t objects are
  31  * entered into the aggr_grp_hash hash table maintained by the modhash
  32  * module. The hash key is the linkid associated with the link
  33  * aggregation group.
  34  *
  35  * Each aggregation contains a set of ports. The port is represented
  36  * by the aggr_port_t structure. A port consists of a single MAC
  37  * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
  38  * MAC. This client is used by the aggr to send and receive LACP
  39  * traffic. Each port client takes on the same MAC unicast address --
  40  * the address of the aggregation itself (taken from the first port by
  41  * default).
  42  *
  43  * The MAC client that hangs off each aggr port is not your typical
  44  * MAC client. Not only does it have exclusive control of the MAC, but
  45  * it also has no Tx or Rx SRSes. An SRS is designed to queue and
  46  * fanout traffic among L4 protocols; but the aggr is an intermediary,
  47  * not a consumer. Instead of using SRSes, the aggr puts the
  48  * underlying hardware rings into passthru mode and ships packets up
  49  * via a direct call to aggr_recv_cb(). This allows aggr to enforce
  50  * LACP while passing all other traffic up to clients of the aggr.



  51  *
  52  * Pseudo Rx Groups and Rings
  53  * --------------------------
  54  *
  55  * It is imperative for client performance that the aggr provide as
  56  * many MAC groups as possible. In order to use the underlying HW
  57  * resources, aggr creates pseudo groups to aggregate the underlying
  58  * HW groups. Every HW group gets mapped to a pseudo group; and every
  59  * HW ring in that group gets mapped to a pseudo ring. The pseudo
  60  * group at index 0 combines all the HW groups at index 0 from each
  61  * port, etc. The aggr's MAC then creates normal MAC groups and rings
  62  * out of these pseudo groups and rings to present to the aggr's
  63  * clients. To the clients, the aggr's groups and rings are absolutely
  64  * no different than a NIC's groups or rings.
  65  *
  66  * Pseudo Tx Rings
  67  * ---------------
  68  *
  69  * The underlying ports (NICs) in an aggregation can have Tx rings. To
  70  * enhance aggr's performance, these Tx rings are made available to
  71  * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
  72  * not new. They are already present and implemented on the Rx side.
  73  * The same concept is extended to the Tx side where each Tx ring of
  74  * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
  75  * each pseudo Tx ring will map to a specific hardware Tx ring. Even
  76  * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
  77  * is given to the aggregation layer.
  78  *
  79  * With this change, the outgoing stack depth looks much better:
  80  *
  81  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
  82  * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx()
  83  *
  84  * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
  85  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
  86  *
  87  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
  88  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
  89  * ring belonging to a port on which the packet has to be sent.
  90  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
  91  * policy and then uses the fanout_hint passed to it to pick a Tx ring from
  92  * the selected port.
  93  *
  94  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
  95  * bandwidth limit is applied first on the outgoing packet and the packets
  96  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
  97  * particular Tx ring.
  98  */
  99 
 100 #include <sys/types.h>
 101 #include <sys/sysmacros.h>
 102 #include <sys/conf.h>
 103 #include <sys/cmn_err.h>
 104 #include <sys/disp.h>
 105 #include <sys/list.h>
 106 #include <sys/ksynch.h>
 107 #include <sys/kmem.h>
 108 #include <sys/stream.h>
 109 #include <sys/modctl.h>
 110 #include <sys/ddi.h>
 111 #include <sys/sunddi.h>
 112 #include <sys/atomic.h>
 113 #include <sys/stat.h>
 114 #include <sys/modhash.h>
 115 #include <sys/id_space.h>
 116 #include <sys/strsun.h>
 117 #include <sys/cred.h>


 134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
 135     const void *);
 136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
 137     mac_prop_info_handle_t);
 138 
 139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
 140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
 141     boolean_t *);
 142 
 143 static void aggr_grp_capab_set(aggr_grp_t *);
 144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
 145 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
 146 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
 147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
 148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
 149 
 150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
 152 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
 153 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
 154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
 155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
 156 static int aggr_addmac(void *, const uint8_t *);
 157 static int aggr_remmac(void *, const uint8_t *);
 158 static int aggr_addvlan(mac_group_driver_t, uint16_t);
 159 static int aggr_remvlan(mac_group_driver_t, uint16_t);
 160 static mblk_t *aggr_rx_poll(void *, int);
 161 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
 162     const int, mac_ring_info_t *, mac_ring_handle_t);
 163 static void aggr_fill_group(void *, mac_ring_type_t, const int,
 164     mac_group_info_t *, mac_group_handle_t);
 165 
 166 static kmem_cache_t     *aggr_grp_cache;
 167 static mod_hash_t       *aggr_grp_hash;
 168 static krwlock_t        aggr_grp_lock;
 169 static uint_t           aggr_grp_cnt;
 170 static id_space_t       *key_ids;
 171 
 172 #define GRP_HASHSZ              64
 173 #define GRP_HASH_KEY(linkid)    ((mod_hash_key_t)(uintptr_t)linkid)
 174 #define AGGR_PORT_NAME_DELIMIT '-'
 175 


 380 
 381         /*
 382          * Update the group link state.
 383          */
 384         if (grp->lg_link_state != LINK_STATE_UP) {
 385                 grp->lg_link_state = LINK_STATE_UP;
 386                 mutex_enter(&grp->lg_stat_lock);
 387                 grp->lg_link_duplex = LINK_DUPLEX_FULL;
 388                 mutex_exit(&grp->lg_stat_lock);
 389                 link_state_changed = B_TRUE;
 390         }
 391 
 392         /*
 393          * Update port's state.
 394          */
 395         port->lp_state = AGGR_PORT_STATE_ATTACHED;
 396 
 397         aggr_grp_multicst_port(port, B_TRUE);
 398 
 399         /*
 400          * The port client doesn't have an Rx SRS; instead of calling
 401          * mac_rx_set() we set the client's flow callback directly.
 402          * This datapath is used only when the port's driver doesn't
 403          * support MAC_CAPAB_RINGS. Drivers with ring support will
 404          * deliver traffic to the aggr via ring passthru.
 405          */
 406         mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
 407 
 408         /*
 409          * If LACP is OFF, the port can be used to send data as soon
 410          * as its link is up and verified to be compatible with the
 411          * aggregation.
 412          *
 413          * If LACP is active or passive, notify the LACP subsystem, which
 414          * will enable sending on the port following the LACP protocol.
 415          */
 416         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 417                 aggr_send_port_enable(port);
 418         else
 419                 aggr_lacp_port_attached(port);
 420 
 421         return (link_state_changed);
 422 }
 423 
 424 boolean_t
 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
 426 {
 427         boolean_t link_state_changed = B_FALSE;
 428 
 429         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 430         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 431 
 432         /* update state */
 433         if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
 434                 return (B_FALSE);
 435 
 436         mac_client_clear_flow_cb(port->lp_mch);
 437 
 438         aggr_grp_multicst_port(port, B_FALSE);
 439 
 440         if (grp->lg_lacp_mode == AGGR_LACP_OFF)
 441                 aggr_send_port_disable(port);
 442         else
 443                 aggr_lacp_port_detached(port);
 444 
 445         port->lp_state = AGGR_PORT_STATE_STANDBY;
 446 
 447         grp->lg_nattached_ports--;
 448         if (grp->lg_nattached_ports == 0) {
 449                 /* the last attached MAC port of the group is being detached */
 450                 grp->lg_link_state = LINK_STATE_DOWN;
 451                 mutex_enter(&grp->lg_stat_lock);
 452                 grp->lg_ifspeed = 0;
 453                 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
 454                 mutex_exit(&grp->lg_stat_lock);
 455                 link_state_changed = B_TRUE;
 456         }


 555                          * address now, and this might cause the link state
 556                          * of the aggregation to change.
 557                          */
 558                         *link_state_changedp = aggr_grp_attach_port(grp, port);
 559                 }
 560         }
 561 }
 562 
 563 /*
 564  * Add a port to a link aggregation group.
 565  */
 566 static int
 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
 568     aggr_port_t **pp)
 569 {
 570         aggr_port_t *port, **cport;
 571         mac_perim_handle_t mph;
 572         zoneid_t port_zoneid = ALL_ZONES;
 573         int err;
 574 
 575         /* The port must be in the same zone as the aggregation. */
 576         if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
 577                 port_zoneid = GLOBAL_ZONEID;
 578         if (grp->lg_zoneid != port_zoneid)
 579                 return (EBUSY);
 580 
 581         /*
 582          * If we are creating the aggr, then there is no MAC handle
 583          * and thus no perimeter to hold. If we are adding a port to
 584          * an existing aggr, then the perimeter of the aggr's MAC must
 585          * be held.
 586          */
 587         ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
 588 

 589         err = aggr_port_create(grp, port_linkid, force, &port);
 590         if (err != 0)
 591                 return (err);
 592 
 593         mac_perim_enter_by_mh(port->lp_mh, &mph);
 594 
 595         /* Add the new port to the end of the list. */
 596         cport = &grp->lg_ports;
 597         while (*cport != NULL)
 598                 cport = &((*cport)->lp_next);
 599         *cport = port;
 600 
 601         /*
 602          * Back reference to the group it is member of. A port always
 603          * holds a reference to its group to ensure that the back
 604          * reference is always valid.
 605          */
 606         port->lp_grp = grp;
 607         AGGR_GRP_REFHOLD(grp);
 608         grp->lg_nports++;
 609 
 610         aggr_lacp_init_port(port);
 611         mac_perim_exit(mph);
 612 
 613         if (pp != NULL)
 614                 *pp = port;
 615 


 657 {
 658         aggr_pseudo_rx_ring_t   *ring;
 659         int                     err;
 660         int                     j;
 661 
 662         for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 663                 ring = rx_grp->arg_rings + j;
 664                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
 665                         break;
 666         }
 667 
 668         /*
 669          * No slot for this new RX ring.
 670          */
 671         if (j == MAX_RINGS_PER_GROUP)
 672                 return (EIO);
 673 
 674         ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
 675         ring->arr_hw_rh = hw_rh;
 676         ring->arr_port = port;
 677         ring->arr_grp = rx_grp;
 678         rx_grp->arg_ring_cnt++;
 679 
 680         /*
 681          * The group is already registered, dynamically add a new ring to the
 682          * mac group.
 683          */
 684         if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
 685                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 686                 ring->arr_hw_rh = NULL;
 687                 ring->arr_port = NULL;
 688                 ring->arr_grp = NULL;
 689                 rx_grp->arg_ring_cnt--;
 690         } else {
 691                 /*
 692                  * This must run after the MAC is registered.
 693                  */
 694                 ASSERT3P(ring->arr_rh, !=, NULL);
 695                 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
 696                     (void *)port, (mac_resource_handle_t)ring);
 697         }
 698         return (err);
 699 }
 700 
 701 /*
 702  * Remove the pseudo RX ring of the given HW ring handle.
      *
      * Scans the ring slots of rx_grp for the in-use pseudo ring whose
      * arr_hw_rh equals hw_rh. The first match is removed from the MAC
      * group, its slot is cleared so that it can be reused, the group's
      * ring count is decremented, and passthru is cleared on the HW ring.
      * If no in-use slot maps to hw_rh, nothing is changed.
 703  */
 704 static void
 705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
 706 {
 707         for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
 708                 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
 709 


                     /* Skip free slots and slots backed by another HW ring. */
 710                 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
 711                     ring->arr_hw_rh != hw_rh) {
 712                         continue;
 713                 }
 714 
 715                 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
 716 
                     /* Return the slot to the free state. */
 717                 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
 718                 ring->arr_hw_rh = NULL;
 719                 ring->arr_port = NULL;
 720                 ring->arr_grp = NULL;
 721                 rx_grp->arg_ring_cnt--;
 722                 mac_hwring_clear_passthru(hw_rh);
                     /* Only the first matching slot is removed. */
 723                 break;
 724         }
 725 }
 726 
 727 /*
 728  * Create pseudo rings over the HW rings of the port.
 729  *
 730  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
 731  *
 732  * o Program existing unicast filters on the pseudo group into the HW group.
 733  *
 734  * o Program existing VLAN filters on the pseudo group into the HW group.
      *
      * The caller must hold the aggr's MAC perimeter; the port's perimeter
      * is entered for the duration of the call.
      *
      * Returns 0 on success. On failure the error of the step that failed
      * is returned after every VLAN filter, unicast address and pseudo ring
      * added by this call has been removed again and port->lp_hwghs[g_idx]
      * has been reset to NULL.
 735  */
 736 static int
 737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 738 {
 739         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
             /*
              * addr and i are read by the rollback code below even when
              * the failure happens before their loops are reached, so they
              * start out meaning "no address added" and "no ring added".
              */
 740         aggr_unicst_addr_t      *addr = rx_grp->arg_macaddr, *a;
 741         mac_perim_handle_t      pmph;
 742         aggr_vlan_t             *avp;
 743         uint_t                  hw_rh_cnt, i = 0;
 744         int                     err = 0;
 745         uint_t                  g_idx = rx_grp->arg_index;
 746 
 747         ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
 748         ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
 749         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 750 
 751         /*
 752          * This function must be called after the aggr registers its
 753          * MAC and its Rx groups have been initialized.
 754          */
 755         ASSERT(rx_grp->arg_gh != NULL);
 756 
 757         /*
 758          * Get the list of the underlying HW rings.
 759          */
 760         hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
 761             &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
 762 
 763         /*
 764          * Add existing VLAN and unicast address filters to the port.
 765          */
 766         for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 767             avp = list_next(&rx_grp->arg_vlans, avp)) {
 768                 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
 769                         goto err;
 770         }
 771 
 772         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
 773                 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
 774                         goto err;
 775         }
 776 
 777         for (i = 0; i < hw_rh_cnt; i++) {
 778                 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
 779                 if (err != 0)
 780                         goto err;
 781         }
 782 
 783         mac_perim_exit(pmph);
 784         return (0);
 785 
 786 err:
 787         ASSERT(err != 0);
 788 
             /* Remove the pseudo rings added before the failure, if any. */
 789         for (uint_t j = 0; j < i; j++)
 790                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
 791 
             /*
              * Remove the addresses added before the failure. addr is the
              * list head if the address loop never ran, the failing entry
              * if it failed part way, and NULL if it completed.
              */
 792         for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
 793                 aggr_port_remmac(port, g_idx, a->aua_addr);
 794 
             /*
              * A non-NULL avp is the VLAN whose add failed, so the rollback
              * starts at its predecessor. A NULL avp means the VLAN loop
              * ran to completion and a later step failed, in which case
              * every VLAN on the list was added and must be removed.
              */
 795         if (avp != NULL)
 796                 avp = list_prev(&rx_grp->arg_vlans, avp);
             else
                     avp = list_tail(&rx_grp->arg_vlans);
 797 
 798         for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
 799                 int err2;
 800 
 801                 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
 802                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 803                             ": errno %d.", avp->av_vid,
 804                             mac_client_name(port->lp_mch), err2);
 805                 }
 806         }
 807 
 808         port->lp_hwghs[g_idx] = NULL;
 809         mac_perim_exit(pmph);
 810         return (err);
 811 }
 812 
 813 /*
 814  * Destroy the pseudo rings mapping to this port and remove all VLAN
 815  * and unicast filters from this port. Even if there are no underlying
 816  * HW rings we must still remove the unicast filters to take the port
 817  * out of promisc mode.
      *
      * The caller must hold the aggr's MAC perimeter; the port's perimeter
      * is entered for the duration of the call. A VLAN filter that cannot
      * be removed is reported with a warning and otherwise ignored. On
      * return port->lp_hwghs[g_idx] is NULL.
 818  */
 819 static void
 820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
 821 {

 822         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP];
 823         aggr_unicst_addr_t      *addr;

 824         mac_perim_handle_t      pmph;
 825         uint_t                  hw_rh_cnt;
 826         uint_t                  g_idx = rx_grp->arg_index;
 827 
 828         ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
 829         ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
 830         ASSERT3P(rx_grp->arg_gh, !=, NULL);
 831         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 832 
             /*
              * Fetch the port's HW rings for this group index and drop the
              * pseudo ring that maps to each of them.
              */
 833         hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
 834             MAC_RING_TYPE_RX);
 835 
 836         for (uint_t i = 0; i < hw_rh_cnt; i++)




 837                 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
 838 
             /* Remove every unicast address of the pseudo group from the port. */
 839         for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
 840                 aggr_port_remmac(port, g_idx, addr->aua_addr);
 841 
             /* Remove every VLAN filter of the pseudo group from the port. */
 842         for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
 843             avp = list_next(&rx_grp->arg_vlans, avp)) {
 844                 int err;
 845 
 846                 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
 847                         cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
 848                             ": errno %d.", avp->av_vid,
 849                             mac_client_name(port->lp_mch), err);
 850                 }
 851         }
 852 
 853         port->lp_hwghs[g_idx] = NULL;














 854         mac_perim_exit(pmph);
 855 }
 856 
 857 /*
 858  * Add a pseudo TX ring for the given HW ring handle.
 859  */
 860 static int
 861 aggr_add_pseudo_tx_ring(aggr_port_t *port,
 862     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
 863     mac_ring_handle_t *pseudo_rh)
 864 {
 865         aggr_pseudo_tx_ring_t   *ring;
 866         int                     err;
 867         int                     i;
 868 
 869         ASSERT(MAC_PERIM_HELD(port->lp_mh));
 870         for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
 871                 ring = tx_grp->atg_rings + i;
 872                 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
 873                         break;


 937 /*
 938  * This function is called to create pseudo rings over hardware rings of
 939  * the underlying device. There is a 1:1 mapping between the pseudo TX
 940  * rings of the aggr and the hardware rings of the underlying port.
 941  */
 942 static int
 943 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
 944 {
 945         aggr_grp_t              *grp = port->lp_grp;
 946         mac_ring_handle_t       hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
 947         mac_perim_handle_t      pmph;
 948         int                     hw_rh_cnt, i = 0, j;
 949         int                     err = 0;
 950 
 951         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
 952         mac_perim_enter_by_mh(port->lp_mh, &pmph);
 953 
 954         /*
 955          * Get the list of the underlying HW rings.
 956          */
 957         hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
 958             MAC_RING_TYPE_TX);
 959 
 960         /*
 961          * Even if the underlying NIC does not have TX rings, we
 962          * still make a pseudo TX ring for that NIC with NULL as
 963          * the ring handle.
 964          */
 965         if (hw_rh_cnt == 0)
 966                 port->lp_tx_ring_cnt = 1;
 967         else
 968                 port->lp_tx_ring_cnt = hw_rh_cnt;
 969 
 970         port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 971             port->lp_tx_ring_cnt), KM_SLEEP);
 972         port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
 973             port->lp_tx_ring_cnt), KM_SLEEP);
 974 
 975         if (hw_rh_cnt == 0) {
 976                 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
 977                     NULL, &pseudo_rh)) == 0) {
 978                         port->lp_tx_rings[0] = NULL;


1044         aggr_grp_update_default(grp);
1045 done:
1046         mac_perim_exit(pmph);
1047 }
1048 
1049 static int
1050 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1051 {
             /*
              * The interrupt handle is the pseudo RX ring. Disable
              * interrupts on the HW ring it maps to and hand back the
              * result of mac_hwring_disable_intr().
              */
1052         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053         return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1054 }
1055 
1056 static int
1057 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1058 {
             /*
              * The interrupt handle is the pseudo RX ring. Enable
              * interrupts on the HW ring it maps to and hand back the
              * result of mac_hwring_enable_intr().
              */
1059         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1060         return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1061 }
1062 
1063 /*
1064  * Start the pseudo ring. Since the pseudo ring is just an abstraction
1065  * over an actual HW ring, the real task is to start the underlying HW
1066  * ring.
      *
      * arg is the pseudo RX ring. Returns the result of mac_hwring_start();
      * the generation number mr_gen is recorded in the pseudo ring only
      * when the HW ring was started successfully.
1067  */
1068 static int
1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1070 {
1071         int err;
1072         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1073 
1074         err = mac_hwring_start(rr_ring->arr_hw_rh);
1075 
             /* Leave arr_gen untouched if the HW ring failed to start. */
1076         if (err != 0)
1077                 return (err);
1078 
1079         rr_ring->arr_gen = mr_gen;
1080         return (err);
1081 }
1082 
1083 /*
1084  * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1085  * over an actual HW ring, the real task is to stop the underlying HW
1086  * ring.
      *
      * arg is the pseudo RX ring. Rings that belong to the pseudo group
      * with index 0 are never stopped; see the comment in the body.
1087  */
1088 static void
1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1090 {
1091         aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1092 
1093         /*
1094          * The rings underlying the default group must stay up to
1095          * continue receiving LACP traffic. We would normally never
1096          * stop the default Rx rings because of the primary MAC
1097          * client; but aggr's primary MAC client doesn't call
1098          * mac_unicast_add() and thus mi_active is 0 when the last
1099          * non-primary client is deleted.
1100          */
1101         if (rr_ring->arr_grp->arg_index != 0)
1102                 mac_hwring_stop(rr_ring->arr_hw_rh);
1103 }
1104 
1105 /*
1106  * Add one or more ports to an existing link aggregation group.
      *
      * linkid identifies the aggregation, ports[0 .. nports - 1] name the
      * links to add, and force is handed through to aggr_grp_add_port().
      *
      * Returns 0 on success, ENOENT if no aggregation is registered under
      * linkid, ENOTSUP if a port fails the capability, SDU or margin check,
      * or the error of whichever step failed. On failure every port added
      * by this call is stopped, stripped of its pseudo TX/RX groups and
      * removed from the aggregation again.
1107  */
1108 int
1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1110     laioc_port_t *ports)
1111 {
             /* rc is read at bail: even when the loop below never runs. */
1112         int rc = 0;
1113         uint_t port_added = 0;
             /* Number of Rx groups set up on the most recently added port. */
1114         uint_t grp_added = 0;
1115         aggr_grp_t *grp = NULL;
1116         aggr_port_t *port;
1117         boolean_t link_state_changed = B_FALSE;
1118         mac_perim_handle_t mph, pmph;
1119 
1120         /* Get the aggr corresponding to linkid. */
1121         rw_enter(&aggr_grp_lock, RW_READER);
1122         if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1123             (mod_hash_val_t *)&grp) != 0) {
1124                 rw_exit(&aggr_grp_lock);
1125                 return (ENOENT);
1126         }
1127         AGGR_GRP_REFHOLD(grp);
1128 
1129         /*
1130          * Hold the perimeter so that the aggregation can't be destroyed.
1131          */
1132         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1133         rw_exit(&aggr_grp_lock);
1134 
1135         /* Add the specified ports to the aggr. */
1136         for (uint_t i = 0; i < nports; i++) {
1139                 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1140                     force, &port)) != 0) {
1141                         goto bail;
1142                 }
1143 
1144                 ASSERT(port != NULL);
1145                 port_added++;
 
                     /*
                      * Restart the Rx group count only once the new port
                      * has been counted. If aggr_grp_add_port() fails
                      * above, grp_added must still describe the previous
                      * port, which is then the last one the bail path has
                      * to tear down.
                      */
                     grp_added = 0;
1146 
1147                 /* check capabilities */
1148                 if (!aggr_grp_capab_check(grp, port) ||
1149                     !aggr_grp_sdu_check(grp, port) ||
1150                     !aggr_grp_margin_check(grp, port)) {
1151                         rc = ENOTSUP;
1152                         goto bail;
1153                 }
1154 
1155                 /*
1156                  * Create the pseudo ring for each HW ring of the underlying
1157                  * port.
1158                  */
1159                 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1160                 if (rc != 0)
1161                         goto bail;
1162 
1163                 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1164                         rc = aggr_add_pseudo_rx_group(port,
1165                             &grp->lg_rx_groups[j]);
1166 
1167                         if (rc != 0)
1168                                 goto bail;
1169 
1170                         grp_added++;
1171                 }
1172 
1173                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1174 
1175                 /* set LACP mode */
1176                 aggr_port_lacp_set_mode(grp, port);
1177 
1178                 /* start port if group has already been started */
1179                 if (grp->lg_started) {
1180                         rc = aggr_port_start(port);
1181                         if (rc != 0) {
1182                                 mac_perim_exit(pmph);
1183                                 goto bail;
1184                         }
1185 
1186                         /*
1187                          * Turn on the promiscuous mode over the port when it
1188                          * is requested to be turned on to receive the
1189                          * non-primary address over a port, or the promiscuous
1190                          * mode is enabled over the aggr.
1191                          */
1192                         if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1193                                 rc = aggr_port_promisc(port, B_TRUE);
1194                                 if (rc != 0) {
1195                                         mac_perim_exit(pmph);
1196                                         goto bail;
1197                                 }
1198                         }
1199                 }
1200                 mac_perim_exit(pmph);
1201 
1202                 /*
1203                  * Attach each port if necessary.
1204                  */
1205                 if (aggr_port_notify_link(grp, port))
1206                         link_state_changed = B_TRUE;
1207 
1208                 /*
1209                  * Initialize the callback functions for this port.
1210                  */
1211                 aggr_port_init_callbacks(port);
1212         }
1213 
1214         /* update the MAC address of the constituent ports */
1215         if (aggr_grp_update_ports_mac(grp))
1216                 link_state_changed = B_TRUE;
1217 
1218         if (link_state_changed)
1219                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1220 
1221 bail:
1222         if (rc != 0) {
1223                 /* stop and remove ports that have been added */
1224                 for (uint_t i = 0; i < port_added; i++) {
1225                         uint_t grp_remove;
1226 
1227                         port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1228                         ASSERT(port != NULL);
1229 
1230                         if (grp->lg_started) {
1231                                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1232                                 (void) aggr_port_promisc(port, B_FALSE);
1233                                 aggr_port_stop(port);
1234                                 mac_perim_exit(pmph);
1235                         }
1236 
1237                         aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1238 
1239                         /*
1240                          * Only the last port could have a partial set
1241                          * of groups added.
1242                          */
1243                         grp_remove = (i + 1 == port_added) ? grp_added :
1244                             grp->lg_rx_group_count;
1245 
1246                         for (uint_t j = 0; j < grp_remove; j++) {
1247                                 aggr_rem_pseudo_rx_group(port,
1248                                     &grp->lg_rx_groups[j]);
1249                         }
1250 
1251                         (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1252                 }
1253         }
1254 
1255         mac_perim_exit(mph);
1256         AGGR_GRP_REFRELE(grp);
1257         return (rc);
1258 }
1259 
1260 static int
1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1262     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1263     aggr_lacp_timer_t lacp_timer)
1264 {
1265         boolean_t mac_addr_changed = B_FALSE;
1266         boolean_t link_state_changed = B_FALSE;
1267         mac_perim_handle_t pmph;
1268 
1269         ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1270 


1392         grp->lg_refs = 1;
1393         grp->lg_closing = B_FALSE;
1394         grp->lg_force = force;
1395         grp->lg_linkid = linkid;
1396         grp->lg_zoneid = crgetzoneid(credp);
1397         grp->lg_ifspeed = 0;
1398         grp->lg_link_state = LINK_STATE_UNKNOWN;
1399         grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1400         grp->lg_started = B_FALSE;
1401         grp->lg_promisc = B_FALSE;
1402         grp->lg_lacp_done = B_FALSE;
1403         grp->lg_tx_notify_done = B_FALSE;
1404         grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1405         grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1406             aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1407         grp->lg_tx_notify_thread = thread_create(NULL, 0,
1408             aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1409         grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1410             MAX_RINGS_PER_GROUP), KM_SLEEP);
1411         grp->lg_tx_blocked_cnt = 0;
1412         bzero(&grp->lg_rx_groups,
1413             sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1414         bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1415         aggr_lacp_init_grp(grp);
1416 




1417         /* add MAC ports to group */
1418         grp->lg_ports = NULL;
1419         grp->lg_nports = 0;
1420         grp->lg_nattached_ports = 0;
1421         grp->lg_ntx_ports = 0;
1422 
1423         /*
1424          * If key is not specified by the user, allocate the key.
1425          */
1426         if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1427                 err = ENOMEM;
1428                 goto bail;
1429         }
1430         grp->lg_key = key;
1431 
1432         for (i = 0; i < nports; i++) {
1433                 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1434                 if (err != 0)
1435                         goto bail;
1436         }
1437 
1438         grp->lg_rx_group_count = 1;
1439 
1440         for (i = 0, port = grp->lg_ports; port != NULL;
1441             i++, port = port->lp_next) {
1442                 uint_t num_rgroups;
1443 
1444                 mac_perim_enter_by_mh(port->lp_mh, &mph);
1445                 num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1446                 mac_perim_exit(mph);
1447 
1448                 /*
1449                  * Utilize all the groups in a port. If some ports
1450                  * have less groups than others, then traffic destined
1451                  * for the same unicast address may be HW classified
1452                  * on some ports but SW classified by aggr when
1453                  * arriving on other ports.
1454                  */
1455                 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1456                     num_rgroups);
1457         }
1458 
1459         /*
1460          * There could be cases where the hardware provides more
1461          * groups than aggr can support. Make sure we never go above
1462          * the max aggr can support.
1463          */
1464         grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1465             MAX_GROUPS_PER_PORT);
1466 
1467         ASSERT3U(grp->lg_rx_group_count, >, 0);
1468         for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1469                 grp->lg_rx_groups[i].arg_index = i;
1470                 grp->lg_rx_groups[i].arg_untagged = 0;
1471                 list_create(&(grp->lg_rx_groups[i].arg_vlans),
1472                     sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1473         }
1474 
1475         /*
1476          * If no explicit MAC address was specified by the administrator,
1477          * set it to the MAC address of the first port.
1478          */
1479         grp->lg_addr_fixed = mac_fixed;
1480         if (grp->lg_addr_fixed) {
1481                 /* validate specified address */
1482                 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1483                         err = EINVAL;
1484                         goto bail;
1485                 }
1486                 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1487         } else {
1488                 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1489                 grp->lg_mac_addr_port = grp->lg_ports;
1490         }
1491 
1492         /* Set the initial group capabilities. */
1493         aggr_grp_capab_set(grp);
1494 
1495         if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1496                 err = ENOMEM;
1497                 goto bail;
1498         }
1499         mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1500         mac->m_driver = grp;
1501         mac->m_dip = aggr_dip;
1502         mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1503         mac->m_src_addr = grp->lg_addr;
1504         mac->m_callbacks = &aggr_m_callbacks;
1505         mac->m_min_sdu = 0;
1506         mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1507         mac->m_margin = aggr_grp_max_margin(grp);
1508         mac->m_v12n = MAC_VIRT_LEVEL1;
1509         err = mac_register(mac, &grp->lg_mh);
1510         mac_free(mac);
1511         if (err != 0)
1512                 goto bail;
1513 
1514         err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1515         if (err != 0) {
1516                 (void) mac_unregister(grp->lg_mh);
1517                 grp->lg_mh = NULL;
1518                 goto bail;
1519         }
1520 
1521         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1522 
1523         /*
1524          * Update the MAC address of the constituent ports.
1525          * None of the ports are attached at this time; the link state of the
1526          * aggregation will not change.
1527          *
1528          * All ports take on the primary MAC address of the aggr
1529          * (lg_aggr). At this point, none of the ports are attached;
1530          * thus the link state of the aggregation will not change.
1531          */
1532         link_state_changed = aggr_grp_update_ports_mac(grp);
1533         ASSERT(!link_state_changed);
1534 
1535         /* Update outbound load balancing policy. */
1536         aggr_send_update_policy(grp, policy);
1537 
1538         /* Set LACP mode. */
1539         aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1540 
1541         /*
1542          * Attach each port if necessary.
1543          */
1544         for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1545                 /*
1546                  * Create the pseudo ring for each HW ring of the
1547                  * underlying port. Note that this is done after the
1548                  * aggr registers its MAC.
1549                  */
1550                 VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
1551                     ==, 0);
1552 
1553                 for (i = 0; i < grp->lg_rx_group_count; i++) {
1554                         VERIFY3S(aggr_add_pseudo_rx_group(port,
1555                             &grp->lg_rx_groups[i]), ==, 0);
1556                 }
1557 
1558                 if (aggr_port_notify_link(grp, port))
1559                         link_state_changed = B_TRUE;
1560 
1561                 /*
1562                  * Initialize the callback functions for this port.
1563                  */
1564                 aggr_port_init_callbacks(port);
1565         }
1566 
1567         if (link_state_changed)
1568                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1569 
1570         /* add new group to hash table */
1571         err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1572             (mod_hash_val_t)grp);
1573         ASSERT(err == 0);
1574         aggr_grp_cnt++;
1575 
1576         mac_perim_exit(mph);
1577         rw_exit(&aggr_grp_lock);


1819 
1820                 /* stop port if group has already been started */
1821                 if (grp->lg_started) {
1822                         mac_perim_enter_by_mh(port->lp_mh, &pmph);
1823                         aggr_port_stop(port);
1824                         mac_perim_exit(pmph);
1825                 }
1826 
1827                 /*
1828                  * aggr_rem_pseudo_tx_group() is not called here. Instead
1829                  * it is called from inside aggr_grp_rem_port() after the
1830                  * port has been detached. The reason is that
1831                  * aggr_rem_pseudo_tx_group() removes one ring at a time
1832                  * and if there is still traffic going on, then there
1833                  * is the possibility of aggr_find_tx_ring() returning a
1834                  * removed ring for transmission. Once the port has been
1835                  * detached, that port will not be used and
1836                  * aggr_find_tx_ring() will not return any rings
1837                  * belonging to it.
1838                  */
1839                 for (i = 0; i < grp->lg_rx_group_count; i++)
1840                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1841 
1842                 /* remove port from group */
1843                 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1844                     &link_state_changed);
1845                 ASSERT(rc == 0);
1846                 mac_addr_update = mac_addr_update || mac_addr_changed;
1847                 link_state_update = link_state_update || link_state_changed;
1848         }
1849 
1850 bail:
1851         if (mac_addr_update)
1852                 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1853         if (link_state_update)
1854                 mac_link_update(grp->lg_mh, grp->lg_link_state);
1855 
1856         mac_perim_exit(mph);
1857         AGGR_GRP_REFRELE(grp);
1858 
1859         return (rc);
1860 }


1925                 grp->lg_tx_notify_done = B_TRUE;
1926                 cv_signal(&grp->lg_tx_flowctl_cv);
1927         }
1928         mutex_exit(&grp->lg_tx_flowctl_lock);
1929         if (tid != 0)
1930                 thread_join(tid);
1931 
1932         mac_perim_enter_by_mh(grp->lg_mh, &mph);
1933 
1934         grp->lg_closing = B_TRUE;
1935         /* detach and free MAC ports associated with group */
1936         port = grp->lg_ports;
1937         while (port != NULL) {
1938                 cport = port->lp_next;
1939                 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1940                 if (grp->lg_started)
1941                         aggr_port_stop(port);
1942                 (void) aggr_grp_detach_port(grp, port);
1943                 mac_perim_exit(pmph);
1944                 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1945                 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
1946                         aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1947                 aggr_port_delete(port);
1948                 port = cport;
1949         }
1950 
1951         mac_perim_exit(mph);
1952 
1953         kmem_free(grp->lg_tx_blocked_rings,
1954             (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1955         /*
1956          * Wait for the port's lacp timer thread and its notification callback
1957          * to exit before calling mac_unregister() since both needs to access
1958          * the mac perimeter of the grp.
1959          */
1960         aggr_grp_port_wait(grp);
1961 
1962         VERIFY(mac_unregister(grp->lg_mh) == 0);
1963         grp->lg_mh = NULL;
1964 
1965         for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1966                 list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
1967         }
1968 
1969         AGGR_GRP_REFRELE(grp);
1970         return (0);
1971 }
1972 
/*
 * Free an aggregation group structure.
 *
 * Both reference counts (lg_refs and lg_port_ref) are asserted to be zero
 * on entry. The structure is handed back to aggr_grp_cache.
 */
1973 void
1974 aggr_grp_free(aggr_grp_t *grp)
1975 {
1976         ASSERT(grp->lg_refs == 0);
1977         ASSERT(grp->lg_port_ref == 0);
             /*
              * Only keys above AGGR_MAX_KEY are returned to the key_ids id
              * space; presumably those are the ones that were allocated from
              * it -- TODO confirm against the group creation path.
              */
1978         if (grp->lg_key > AGGR_MAX_KEY) {
1979                 id_free(key_ids, grp->lg_key);
1980                 grp->lg_key = 0;
1981         }
1982         kmem_cache_free(aggr_grp_cache, grp);
1983 }
1984 
1985 int
1986 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1987     aggr_grp_info_new_grp_fn_t new_grp_fn,


2313                 uint32_t *hcksum_txflags = cap_data;
2314                 *hcksum_txflags = grp->lg_hcksum_txflags;
2315                 break;
2316         }
2317         case MAC_CAPAB_LSO: {
2318                 mac_capab_lso_t *cap_lso = cap_data;
2319 
2320                 if (grp->lg_lso) {
2321                         *cap_lso = grp->lg_cap_lso;
2322                         break;
2323                 } else {
2324                         return (B_FALSE);
2325                 }
2326         }
2327         case MAC_CAPAB_NO_NATIVEVLAN:
2328                 return (!grp->lg_vlan);
2329         case MAC_CAPAB_NO_ZCOPY:
2330                 return (!grp->lg_zcopy);
2331         case MAC_CAPAB_RINGS: {
2332                 mac_capab_rings_t *cap_rings = cap_data;
2333                 uint_t ring_cnt = 0;
2334 
2335                 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2336                         ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2337 
2338                 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2339                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2340                         cap_rings->mr_rnum = ring_cnt;
2341                         cap_rings->mr_gnum = grp->lg_rx_group_count;






2342                         cap_rings->mr_gaddring = NULL;
2343                         cap_rings->mr_gremring = NULL;
2344                 } else {
2345                         cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2346                         cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2347                         cap_rings->mr_gnum = 0;
2348                 }
2349                 cap_rings->mr_rget = aggr_fill_ring;
2350                 cap_rings->mr_gget = aggr_fill_group;
2351                 break;
2352         }
2353         case MAC_CAPAB_AGGR:
2354         {
2355                 mac_capab_aggr_t *aggr_cap;
2356 
2357                 if (cap_data != NULL) {
2358                         aggr_cap = cap_data;
2359                         aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2360                         aggr_cap->mca_unicst = aggr_m_unicst;
2361                         aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2362                         aggr_cap->mca_arg = arg;
2363                 }
2364                 return (B_TRUE);
2365         }
2366         default:
2367                 return (B_FALSE);
2368         }
2369         return (B_TRUE);
2370 }
2371 
2372 /*
2373  * Callback function for MAC layer to register groups.
      *
      * arg is the aggr_grp_t being registered, rtype selects the RX or TX
      * side, index is the group index, infop is the group info structure to
      * fill in, and gh is the MAC layer's handle for the group.
      *
      * For RX, lg_rx_groups[index] records gh and a back pointer to the
      * aggregation, and infop is populated with the pseudo group's driver
      * handle, ring count and the MAC address / VLAN filter callbacks.
      * For TX, only gh is recorded in lg_tx_group; infop is not written.
2374  */
2375 static void
2376 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2377     mac_group_info_t *infop, mac_group_handle_t gh)
2378 {
2379         aggr_grp_t *grp = arg;


2380 

2381         if (rtype == MAC_RING_TYPE_RX) {
2382                 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2383 
2384                 rx_group->arg_gh = gh;
2385                 rx_group->arg_grp = grp;
2386 
                     /*
                      * The pseudo group itself is the driver handle; the
                      * aggr_addmac(), aggr_remmac(), aggr_addvlan() and
                      * aggr_remvlan() callbacks cast their first argument
                      * back to aggr_pseudo_rx_group_t. No start/stop entry
                      * points are supplied.
                      */
2387                 infop->mgi_driver = (mac_group_driver_t)rx_group;
2388                 infop->mgi_start = NULL;
2389                 infop->mgi_stop = NULL;
2390                 infop->mgi_addmac = aggr_addmac;
2391                 infop->mgi_remmac = aggr_remmac;
2392                 infop->mgi_count = rx_group->arg_ring_cnt;
2393 
2394                 /*
2395                  * Always set the HW VLAN callbacks. They are smart
2396                  * enough to know when a port has HW VLAN filters to
2397                  * program and when it doesn't.
2398                  */
2399                 infop->mgi_addvlan = aggr_addvlan;
2400                 infop->mgi_remvlan = aggr_remvlan;
2401         } else {
2402                 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2403 
                     /* There is a single pseudo TX group, lg_tx_group. */
2404                 ASSERT3S(index, ==, 0);
2405                 tx_group->atg_gh = gh;
2406         }
2407 }
2408 
2409 /*
2410  * Callback function for MAC layer to register all rings.
2411  */
2412 static void
2413 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2414     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2415 {
2416         aggr_grp_t      *grp = arg;
2417 
2418         switch (rtype) {
2419         case MAC_RING_TYPE_RX: {
2420                 aggr_pseudo_rx_group_t  *rx_group;
2421                 aggr_pseudo_rx_ring_t   *rx_ring;
2422                 mac_intr_t              aggr_mac_intr;
2423 
2424                 rx_group = &grp->lg_rx_groups[rg_index];
2425                 ASSERT3S(index, >=, 0);
2426                 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2427                 rx_ring = rx_group->arg_rings + index;
2428                 rx_ring->arr_rh = rh;
2429 
2430                 /*
2431                  * Entrypoint to enable interrupt (disable poll) and
2432                  * disable interrupt (enable poll).
2433                  */
2434                 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2435                 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2436                 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2437                 aggr_mac_intr.mi_ddi_handle = NULL;
2438 
2439                 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2440                 infop->mri_start = aggr_pseudo_start_rx_ring;
2441                 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2442 
2443                 infop->mri_intr = aggr_mac_intr;
2444                 infop->mri_poll = aggr_rx_poll;
2445 
2446                 infop->mri_stat = aggr_rx_ring_stat;
2447                 break;
2448         }
2449         case MAC_RING_TYPE_TX: {
2450                 aggr_pseudo_tx_group_t  *tx_group = &grp->lg_tx_group;
2451                 aggr_pseudo_tx_ring_t   *tx_ring;
2452 
2453                 ASSERT(rg_index == -1);
2454                 ASSERT(index < tx_group->atg_ring_cnt);
2455 
2456                 tx_ring = &tx_group->atg_rings[index];
2457                 tx_ring->atr_rh = rh;
2458 
2459                 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2460                 infop->mri_start = NULL;
2461                 infop->mri_stop = NULL;


2508                 if (!port->lp_collector_enabled) {
2509                         *mpp = mp->b_next;
2510                         mp->b_next = NULL;
2511                         freemsg(mp);
2512                         continue;
2513                 }
2514                 mpp = &mp->b_next;
2515         }
2516         return (mp_chain);
2517 }
2518 
/*
 * Add a unicast MAC address to a pseudo RX group.
 *
 * arg is the aggr_pseudo_rx_group_t being programmed and mac_addr is the
 * address to add. The work is done inside the aggregation's MAC perimeter.
 *
 * Returns 0 on success (including when mac_addr equals the aggregation's
 * own lg_addr, in which case nothing is recorded or programmed), EEXIST if
 * the address is already on the group's list, or the first non-zero error
 * returned by aggr_port_addmac(). On such a failure the ports that had
 * already been programmed are rolled back and the new list entry is
 * removed again.
 */
2519 static int
2520 aggr_addmac(void *arg, const uint8_t *mac_addr)
2521 {
2522         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2523         aggr_unicst_addr_t      *addr, **pprev;
2524         aggr_grp_t              *grp = rx_group->arg_grp;
2525         aggr_port_t             *port, *p;
2526         mac_perim_handle_t      mph;
2527         int                     err = 0;
2528         uint_t                  idx = rx_group->arg_index;
2529 
2530         mac_perim_enter_by_mh(grp->lg_mh, &mph);
2531 
             /* The aggregation's own address needs no entry of its own. */
2532         if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2533                 mac_perim_exit(mph);
2534                 return (0);
2535         }
2536 
2537         /*
2538          * Insert this mac address into the list of mac addresses owned by
2539          * the aggregation pseudo group.
2540          */
2541         pprev = &rx_group->arg_macaddr;
2542         while ((addr = *pprev) != NULL) {
2543                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2544                         mac_perim_exit(mph);
2545                         return (EEXIST);
2546                 }
2547                 pprev = &addr->aua_next;
2548         }
             /* pprev now refers to the list's terminating link: append. */
2549         addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2550         bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2551         addr->aua_next = NULL;
2552         *pprev = addr;
2553 
             /* Program every port, stopping at the first failure. */
2554         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2555                 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2556                         break;
2557 
2558         if (err != 0) {
                     /*
                      * Undo the ports that precede the failing one, then
                      * unlink the entry appended above (it is the tail, so
                      * clearing *pprev removes it) and free it.
                      */
2559                 for (p = grp->lg_ports; p != port; p = p->lp_next)
2560                         aggr_port_remmac(p, idx, mac_addr);
2561 
2562                 *pprev = NULL;
2563                 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2564         }
2565 
2566         mac_perim_exit(mph);
2567         return (err);
2568 }
2569 
2570 static int
2571 aggr_remmac(void *arg, const uint8_t *mac_addr)
2572 {
2573         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
2574         aggr_unicst_addr_t      *addr, **pprev;
2575         aggr_grp_t              *grp = rx_group->arg_grp;
2576         aggr_port_t             *port;
2577         mac_perim_handle_t      mph;
2578         int                     err = 0;
2579 
2580         mac_perim_enter_by_mh(grp->lg_mh, &mph);


2585         }
2586 
2587         /*
2588          * Look up this mac address in the list of mac addresses owned by
2589          * the aggregation pseudo group so that it can be removed.
2590          */
2591         pprev = &rx_group->arg_macaddr;
2592         while ((addr = *pprev) != NULL) {
2593                 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2594                         pprev = &addr->aua_next;
2595                         continue;
2596                 }
2597                 break;
2598         }
2599         if (addr == NULL) {
2600                 mac_perim_exit(mph);
2601                 return (EINVAL);
2602         }
2603 
2604         for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2605                 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2606 
2607         *pprev = addr->aua_next;
2608         kmem_free(addr, sizeof (aggr_unicst_addr_t));
2609 
2610         mac_perim_exit(mph);
2611         return (err);
2612 }
2613 
2614 /*
2615  * Search for VID in the Rx group's list and return a pointer if
2616  * found. Otherwise return NULL.
2617  */
2618 static aggr_vlan_t *
2619 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2620 {
2621         ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2622         for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2623             avp = list_next(&rx_group->arg_vlans, avp)) {
2624                 if (avp->av_vid == vid)
2625                         return (avp);


2627 
2628         return (NULL);
2629 }
2630 
2631 /*
2632  * Accept traffic on the specified VID.
2633  *
2634  * Persist VLAN state in the aggr so that ports added later will
2635  * receive the correct filters. In the future it would be nice to
2636  * allow aggr to iterate its clients instead of duplicating state.
      *
      * gdriver is the aggr_pseudo_rx_group_t the filter applies to and vid
      * is the VLAN ID, or MAC_VLAN_UNTAGGED. The work is done inside the
      * aggregation's MAC perimeter.
      *
      * Untagged requests are counted in arg_untagged; tagged VIDs are
      * tracked as reference-counted aggr_vlan_t entries on arg_vlans.
      * A VID that is already tracked only gains a reference and the ports
      * are not touched.
      *
      * Returns 0 on success, or the first non-zero error returned by
      * aggr_port_addvlan(), in which case the ports already updated are
      * rolled back and the bookkeeping change is undone.
2637  */
2638 static int
2639 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2640 {
2641         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2642         aggr_grp_t              *aggr = rx_group->arg_grp;
2643         aggr_port_t             *port, *p;
2644         mac_perim_handle_t      mph;
2645         int                     err = 0;
2646         aggr_vlan_t             *avp = NULL;
2647         uint_t                  idx = rx_group->arg_index;
2648 
2649         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2650 
2651         if (vid == MAC_VLAN_UNTAGGED) {
2652                 /*
2653                  * Aggr is both a MAC provider and MAC client. As a
2654                  * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2655                  * client. As a client itself, it should pass
2656                  * VLAN_ID_NONE to its ports.
2657                  */
2658                 vid = VLAN_ID_NONE;
2659                 rx_group->arg_untagged++;
2660                 goto update_ports;
2661         }
2662 
2663         avp = aggr_find_vlan(rx_group, vid);
2664 
             /* Already tracked: take another reference and return. */
2665         if (avp != NULL) {
2666                 avp->av_refs++;
2667                 mac_perim_exit(mph);
2668                 return (0);
2669         }
2670 
             /* First use of this VID; it is linked in only on success. */
2671         avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2672         avp->av_vid = vid;
2673         avp->av_refs = 1;
2674 
2675 update_ports:
2676         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2677                 if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2678                         break;
2679 
2680         if (err != 0) {
2681                 /*
2682                  * If any of these calls fail then we are in a
2683                  * situation where the ports have different HW state.
2684                  * There's no reasonable action the MAC client can
2685                  * take in this scenario to rectify the situation.
2686                  */
                     /* Roll back the ports that precede the failing one. */
2687                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2688                         int err2;
2689 
2690                         if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2691                                 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2692                                     " from port %s: errno %d.", vid,
2693                                     mac_client_name(p->lp_mch), err2);
2694                         }
2695 
2696                 }
2697 
                     /*
                      * Drop the arg_untagged count taken on the untagged
                      * path above. NOTE(review): this tests the VID value,
                      * so it would also fire if a caller passed VLAN_ID_NONE
                      * directly -- confirm callers never do.
                      */
2698                 if (vid == VLAN_ID_NONE)
2699                         rx_group->arg_untagged--;
2700 
                     /* The new entry was never linked in; just free it. */
2701                 if (avp != NULL) {
2702                         kmem_free(avp, sizeof (aggr_vlan_t));
2703                         avp = NULL;
2704                 }
2705         }
2706 
             /* avp is non-NULL only for a new tagged VID that succeeded. */
2707         if (avp != NULL)
2708                 list_insert_tail(&rx_group->arg_vlans, avp);
2709 
             /* NOTE(review): no goto in this function targets this label. */
2710 done:
2711         mac_perim_exit(mph);
2712         return (err);
2713 }
2714 
2715 /*
2716  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2717  */
2718 static int
2719 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2720 {
2721         aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2722         aggr_grp_t              *aggr = rx_group->arg_grp;
2723         aggr_port_t             *port, *p;
2724         mac_perim_handle_t      mph;
2725         int                     err = 0;
2726         aggr_vlan_t             *avp = NULL;
2727         uint_t                  idx = rx_group->arg_index;
2728 
2729         mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2730 
2731         /*
2732          * See the comment in aggr_addvlan().
2733          */
2734         if (vid == MAC_VLAN_UNTAGGED) {
2735                 vid = VLAN_ID_NONE;
2736                 rx_group->arg_untagged--;
2737 
2738                 if (rx_group->arg_untagged > 0)
2739                         goto done;
2740 
2741                 goto update_ports;
2742         }
2743 
2744         avp = aggr_find_vlan(rx_group, vid);
2745 
2746         if (avp == NULL) {
2747                 err = ENOENT;
2748                 goto done;
2749         }
2750 
2751         avp->av_refs--;
2752 
2753         if (avp->av_refs > 0)
2754                 goto done;
2755 
2756 update_ports:
2757         for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2758                 if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2759                         break;
2760 
2761         /*
2762          * See the comment in aggr_addvlan() for why a failure to restore
2763          * a port's filter during this rollback is only logged.
2764          */
2765         if (err != 0) {
2766                 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2767                         int err2;
2768 
2769                         if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2770                                 cmn_err(CE_WARN, "Failed to add VLAN %u"
2771                                     " to port %s: errno %d.", vid,
2772                                     mac_client_name(p->lp_mch), err2);
2773                         }
2774                 }
2775 
2776                 if (avp != NULL)
2777                         avp->av_refs++;
2778 
2779                 if (vid == VLAN_ID_NONE)
2780                         rx_group->arg_untagged++;
2781 
2782                 goto done;
2783         }
2784 
2785         if (err == 0 && avp != NULL) {
2786                 VERIFY3U(avp->av_refs, ==, 0);
2787                 list_remove(&rx_group->arg_vlans, avp);
2788                 kmem_free(avp, sizeof (aggr_vlan_t));
2789         }