15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Joyent, Inc.
24 */
25
26 /*
27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28 *
29 * An instance of the structure aggr_grp_t is allocated for each
30 * link aggregation group. When created, aggr_grp_t objects are
31 * entered into the aggr_grp_hash hash table maintained by the modhash
32 * module. The hash key is the linkid associated with the link
33 * aggregation group.
34 *
35 * A set of MAC ports is associated with each aggregation group.
36 *
37 * Aggr pseudo TX rings
38 * --------------------
39 * The underlying ports (NICs) in an aggregation can have TX rings. To
40 * enhance aggr's performance, these TX rings are made available to the
41 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
42 * it is already implemented on the RX side, where the rings are called
43 * pseudo RX rings. The same concept is extended to the TX side where
44 * each TX ring of an underlying port is reflected in aggr as a pseudo
45 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
46 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
47 * TX ring is given to the aggregation layer.
48 *
49 * With this change, the outgoing stack depth looks much better:
50 *
51 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
52 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
53 *
54 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
55 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
56 *
57 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
58 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
59 * ring belonging to a port on which the packet has to be sent.
60 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
61 * policy and then uses the fanout_hint passed to it to pick a TX ring from
62 * the selected port.
63 *
64 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
65 * bandwidth limit is applied first on the outgoing packet and the packets
66 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
67 * particular TX ring.
68 */
69
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
105 const void *);
106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
107 mac_prop_info_handle_t);
108
109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
111 boolean_t *);
112
113 static void aggr_grp_capab_set(aggr_grp_t *);
114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
119
120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
125 static int aggr_addmac(void *, const uint8_t *);
126 static int aggr_remmac(void *, const uint8_t *);
127 static int aggr_addvlan(mac_group_driver_t, uint16_t);
128 static int aggr_remvlan(mac_group_driver_t, uint16_t);
129 static mblk_t *aggr_rx_poll(void *, int);
130 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
131 const int, mac_ring_info_t *, mac_ring_handle_t);
132 static void aggr_fill_group(void *, mac_ring_type_t, const int,
133 mac_group_info_t *, mac_group_handle_t);
134
135 static kmem_cache_t *aggr_grp_cache;
136 static mod_hash_t *aggr_grp_hash;
137 static krwlock_t aggr_grp_lock;
138 static uint_t aggr_grp_cnt;
139 static id_space_t *key_ids;
140
/* Size passed when creating the group hash table. */
#define	GRP_HASHSZ		64
/*
 * Convert a datalink id into a modhash key. The argument is parenthesized
 * so that an expression argument is converted as a whole instead of having
 * the cast bind to its first operand only.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))
/* Character that delimits the components of an aggregation port name. */
#define	AGGR_PORT_NAME_DELIMIT	'-'
144
349
350 /*
351 * Update the group link state.
352 */
353 if (grp->lg_link_state != LINK_STATE_UP) {
354 grp->lg_link_state = LINK_STATE_UP;
355 mutex_enter(&grp->lg_stat_lock);
356 grp->lg_link_duplex = LINK_DUPLEX_FULL;
357 mutex_exit(&grp->lg_stat_lock);
358 link_state_changed = B_TRUE;
359 }
360
361 /*
362 * Update port's state.
363 */
364 port->lp_state = AGGR_PORT_STATE_ATTACHED;
365
366 aggr_grp_multicst_port(port, B_TRUE);
367
368 /*
369 * Set port's receive callback
370 */
371 mac_rx_set(port->lp_mch, aggr_recv_cb, port);
372
373 /*
374 * If LACP is OFF, the port can be used to send data as soon
375 * as its link is up and verified to be compatible with the
376 * aggregation.
377 *
378 * If LACP is active or passive, notify the LACP subsystem, which
379 * will enable sending on the port following the LACP protocol.
380 */
381 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
382 aggr_send_port_enable(port);
383 else
384 aggr_lacp_port_attached(port);
385
386 return (link_state_changed);
387 }
388
389 boolean_t
390 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
391 {
392 boolean_t link_state_changed = B_FALSE;
393
394 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
395 ASSERT(MAC_PERIM_HELD(port->lp_mh));
396
397 /* update state */
398 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
399 return (B_FALSE);
400
401 mac_rx_clear(port->lp_mch);
402
403 aggr_grp_multicst_port(port, B_FALSE);
404
405 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
406 aggr_send_port_disable(port);
407 else
408 aggr_lacp_port_detached(port);
409
410 port->lp_state = AGGR_PORT_STATE_STANDBY;
411
412 grp->lg_nattached_ports--;
413 if (grp->lg_nattached_ports == 0) {
414 /* the last attached MAC port of the group is being detached */
415 grp->lg_link_state = LINK_STATE_DOWN;
416 mutex_enter(&grp->lg_stat_lock);
417 grp->lg_ifspeed = 0;
418 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
419 mutex_exit(&grp->lg_stat_lock);
420 link_state_changed = B_TRUE;
421 }
520 * address now, and this might cause the link state
521 * of the aggregation to change.
522 */
523 *link_state_changedp = aggr_grp_attach_port(grp, port);
524 }
525 }
526 }
527
528 /*
529 * Add a port to a link aggregation group.
530 */
531 static int
532 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
533 aggr_port_t **pp)
534 {
535 aggr_port_t *port, **cport;
536 mac_perim_handle_t mph;
537 zoneid_t port_zoneid = ALL_ZONES;
538 int err;
539
540 /* The port must be in the same zone as the aggregation. */
541 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
542 port_zoneid = GLOBAL_ZONEID;
543 if (grp->lg_zoneid != port_zoneid)
544 return (EBUSY);
545
546 /*
547 * lg_mh could be NULL when the function is called during the creation
548 * of the aggregation.
549 */
550 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
551
552 /* create new port */
553 err = aggr_port_create(grp, port_linkid, force, &port);
554 if (err != 0)
555 return (err);
556
557 mac_perim_enter_by_mh(port->lp_mh, &mph);
558
559 /* add port to list of group constituent ports */
560 cport = &grp->lg_ports;
561 while (*cport != NULL)
562 cport = &((*cport)->lp_next);
563 *cport = port;
564
565 /*
566 * Back reference to the group it is member of. A port always
567 * holds a reference to its group to ensure that the back
568 * reference is always valid.
569 */
570 port->lp_grp = grp;
571 AGGR_GRP_REFHOLD(grp);
572 grp->lg_nports++;
573
574 aggr_lacp_init_port(port);
575 mac_perim_exit(mph);
576
577 if (pp != NULL)
578 *pp = port;
579
621 {
622 aggr_pseudo_rx_ring_t *ring;
623 int err;
624 int j;
625
626 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
627 ring = rx_grp->arg_rings + j;
628 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
629 break;
630 }
631
632 /*
633 * No slot for this new RX ring.
634 */
635 if (j == MAX_RINGS_PER_GROUP)
636 return (EIO);
637
638 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
639 ring->arr_hw_rh = hw_rh;
640 ring->arr_port = port;
641 rx_grp->arg_ring_cnt++;
642
643 /*
644 * The group is already registered, dynamically add a new ring to the
645 * mac group.
646 */
647 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
648 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
649 ring->arr_hw_rh = NULL;
650 ring->arr_port = NULL;
651 rx_grp->arg_ring_cnt--;
652 } else {
653 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
654 mac_find_ring(rx_grp->arg_gh, j));
655 }
656 return (err);
657 }
658
659 /*
660 * Remove the pseudo RX ring of the given HW ring handle.
661 */
662 static void
663 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
664 {
665 aggr_pseudo_rx_ring_t *ring;
666 int j;
667
668 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
669 ring = rx_grp->arg_rings + j;
670 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
671 ring->arr_hw_rh != hw_rh) {
672 continue;
673 }
674
675 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
676
677 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
678 ring->arr_hw_rh = NULL;
679 ring->arr_port = NULL;
680 rx_grp->arg_ring_cnt--;
681 mac_hwring_teardown(hw_rh);
682 break;
683 }
684 }
685
686 /*
687 * Create pseudo rings over the HW rings of the port.
688 *
689 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
690 *
691 * o Program existing unicast filters on the pseudo group into the HW group.
692 *
693 * o Program existing VLAN filters on the pseudo group into the HW group.
694 */
695 static int
696 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
697 {
698 aggr_grp_t *grp = port->lp_grp;
699 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
700 aggr_unicst_addr_t *addr, *a;
701 mac_perim_handle_t pmph;
702 aggr_vlan_t *avp;
703 int hw_rh_cnt, i = 0, j;
704 int err = 0;
705
706 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
707 mac_perim_enter_by_mh(port->lp_mh, &pmph);
708
709 /*
710 * This function must be called after the aggr registers its MAC
711 * and its Rx group has been initialized.
712 */
713 ASSERT(rx_grp->arg_gh != NULL);
714
715 /*
716 * Get the list of the underlying HW rings.
717 */
718 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
719 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
720
721 if (port->lp_hwgh != NULL) {
722 /*
723 * Quiesce the HW ring and the MAC SRS on the ring. Note
724 * that the HW ring will be restarted when the pseudo ring
725 * is started. At that time all the packets will be
726 * directly passed up to the pseudo Rx ring and handled
727 * by MAC SRS created over the pseudo Rx ring.
728 */
729 mac_rx_client_quiesce(port->lp_mch);
730 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
731 }
732
733 /*
734 * Add existing VLAN and unicast address filters to the port.
735 */
736 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
737 avp = list_next(&rx_grp->arg_vlans, avp)) {
738 if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0)
739 goto err;
740 }
741
742 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
743 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
744 goto err;
745 }
746
747 for (i = 0; i < hw_rh_cnt; i++) {
748 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
749 if (err != 0)
750 goto err;
751 }
752
753 port->lp_rx_grp_added = B_TRUE;
754 mac_perim_exit(pmph);
755 return (0);
756
757 err:
758 ASSERT(err != 0);
759
760 for (j = 0; j < i; j++)
761 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
762
763 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
764 aggr_port_remmac(port, a->aua_addr);
765
766 if (avp != NULL)
767 avp = list_prev(&rx_grp->arg_vlans, avp);
768
769 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
770 int err2;
771
772 if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) {
773 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
774 ": errno %d.", avp->av_vid,
775 mac_client_name(port->lp_mch), err2);
776 }
777 }
778
779 if (port->lp_hwgh != NULL) {
780 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
781 mac_rx_client_restart(port->lp_mch);
782 port->lp_hwgh = NULL;
783 }
784
785 mac_perim_exit(pmph);
786 return (err);
787 }
788
789 /*
790 * Destroy the pseudo rings mapping to this port and remove all VLAN
791 * and unicast filters from this port. Even if there are no underlying
792 * HW rings we must still remove the unicast filters to take the port
793 * out of promisc mode.
794 */
795 static void
796 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
797 {
798 aggr_grp_t *grp = port->lp_grp;
799 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
800 aggr_unicst_addr_t *addr;
801 mac_group_handle_t hwgh;
802 mac_perim_handle_t pmph;
803 int hw_rh_cnt, i;
804
805 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
806 mac_perim_enter_by_mh(port->lp_mh, &pmph);
807
808 if (!port->lp_rx_grp_added)
809 goto done;
810
811 ASSERT(rx_grp->arg_gh != NULL);
812 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
813 &hwgh, hw_rh, MAC_RING_TYPE_RX);
814
815 for (i = 0; i < hw_rh_cnt; i++)
816 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
817
818 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
819 aggr_port_remmac(port, addr->aua_addr);
820
821 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
822 avp = list_next(&rx_grp->arg_vlans, avp)) {
823 int err;
824
825 if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) {
826 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
827 ": errno %d.", avp->av_vid,
828 mac_client_name(port->lp_mch), err);
829 }
830 }
831
832 if (port->lp_hwgh != NULL) {
833 port->lp_hwgh = NULL;
834
835 /*
836 * First clear the permanent-quiesced flag of the RX srs then
837 * restart the HW ring and the mac srs on the ring. Note that
838 * the HW ring and associated SRS will soon been removed when
839 * the port is removed from the aggr.
840 */
841 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
842 mac_rx_client_restart(port->lp_mch);
843 }
844
845 port->lp_rx_grp_added = B_FALSE;
846 done:
847 mac_perim_exit(pmph);
848 }
849
850 /*
851 * Add a pseudo TX ring for the given HW ring handle.
852 */
853 static int
854 aggr_add_pseudo_tx_ring(aggr_port_t *port,
855 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
856 mac_ring_handle_t *pseudo_rh)
857 {
858 aggr_pseudo_tx_ring_t *ring;
859 int err;
860 int i;
861
862 ASSERT(MAC_PERIM_HELD(port->lp_mh));
863 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
864 ring = tx_grp->atg_rings + i;
865 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
866 break;
930 /*
931 * This function is called to create pseudo rings over hardware rings of
932 * the underlying device. There is a 1:1 mapping between the pseudo TX
933 * rings of the aggr and the hardware rings of the underlying port.
934 */
935 static int
936 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
937 {
938 aggr_grp_t *grp = port->lp_grp;
939 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
940 mac_perim_handle_t pmph;
941 int hw_rh_cnt, i = 0, j;
942 int err = 0;
943
944 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
945 mac_perim_enter_by_mh(port->lp_mh, &pmph);
946
947 /*
948 * Get the list of the underlying HW rings.
949 */
950 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
951 NULL, hw_rh, MAC_RING_TYPE_TX);
952
953 /*
954 * Even if the underlying NIC does not have TX rings, we
955 * still make a pseudo TX ring for that NIC with NULL as
956 * the ring handle.
957 */
958 if (hw_rh_cnt == 0)
959 port->lp_tx_ring_cnt = 1;
960 else
961 port->lp_tx_ring_cnt = hw_rh_cnt;
962
963 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
964 port->lp_tx_ring_cnt), KM_SLEEP);
965 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
966 port->lp_tx_ring_cnt), KM_SLEEP);
967
968 if (hw_rh_cnt == 0) {
969 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
970 NULL, &pseudo_rh)) == 0) {
971 port->lp_tx_rings[0] = NULL;
1037 aggr_grp_update_default(grp);
1038 done:
1039 mac_perim_exit(pmph);
1040 }
1041
1042 static int
1043 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1044 {
1045 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1046 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1047 }
1048
1049 static int
1050 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1051 {
1052 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1054 }
1055
1056 /*
1057 * Here we need to start the pseudo-ring. As MAC already ensures that the
1058 * underlying device is set up, all we need to do is save the ring generation.
1059 *
1060 * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
1061 * functions here as those don't actually stop and start the ring, they just
1062 * quiesce the ring. Regardless of whether the aggr is logically up or not, we
1063 * want to make sure that we can receive traffic for LACP.
1064 */
1065 static int
1066 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1067 {
1068 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1069
1070 rr_ring->arr_gen = mr_gen;
1071 return (0);
1072 }
1073
1074 /*
1075 * Add one or more ports to an existing link aggregation group.
1076 */
1077 int
1078 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1079 laioc_port_t *ports)
1080 {
1081 int rc, i, nadded = 0;
1082 aggr_grp_t *grp = NULL;
1083 aggr_port_t *port;
1084 boolean_t link_state_changed = B_FALSE;
1085 mac_perim_handle_t mph, pmph;
1086
1087 /* get group corresponding to linkid */
1088 rw_enter(&aggr_grp_lock, RW_READER);
1089 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1090 (mod_hash_val_t *)&grp) != 0) {
1091 rw_exit(&aggr_grp_lock);
1092 return (ENOENT);
1093 }
1094 AGGR_GRP_REFHOLD(grp);
1095
1096 /*
1097 * Hold the perimeter so that the aggregation won't be destroyed.
1098 */
1099 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1100 rw_exit(&aggr_grp_lock);
1101
1102 /* add the specified ports to group */
1103 for (i = 0; i < nports; i++) {
1104 /* add port to group */
1105 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1106 force, &port)) != 0) {
1107 goto bail;
1108 }
1109 ASSERT(port != NULL);
1110 nadded++;
1111
1112 /* check capabilities */
1113 if (!aggr_grp_capab_check(grp, port) ||
1114 !aggr_grp_sdu_check(grp, port) ||
1115 !aggr_grp_margin_check(grp, port)) {
1116 rc = ENOTSUP;
1117 goto bail;
1118 }
1119
1120 /*
1121 * Create the pseudo ring for each HW ring of the underlying
1122 * port.
1123 */
1124 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1125 if (rc != 0)
1126 goto bail;
1127 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1128 if (rc != 0)
1129 goto bail;
1130
1131 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1132
1133 /* set LACP mode */
1134 aggr_port_lacp_set_mode(grp, port);
1135
1136 /* start port if group has already been started */
1137 if (grp->lg_started) {
1138 rc = aggr_port_start(port);
1139 if (rc != 0) {
1140 mac_perim_exit(pmph);
1141 goto bail;
1142 }
1143
1144 /*
1145 * Turn on the promiscuous mode over the port when it
1146 * is requested to be turned on to receive the
1147 * non-primary address over a port, or the promiscous
1148 * mode is enabled over the aggr.
1149 */
1150 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1151 rc = aggr_port_promisc(port, B_TRUE);
1152 if (rc != 0) {
1153 mac_perim_exit(pmph);
1154 goto bail;
1155 }
1156 }
1157 }
1158 mac_perim_exit(pmph);
1159
1160 /*
1161 * Attach each port if necessary.
1162 */
1163 if (aggr_port_notify_link(grp, port))
1164 link_state_changed = B_TRUE;
1165
1166 /*
1167 * Initialize the callback functions for this port.
1168 */
1169 aggr_port_init_callbacks(port);
1170 }
1171
1172 /* update the MAC address of the constituent ports */
1173 if (aggr_grp_update_ports_mac(grp))
1174 link_state_changed = B_TRUE;
1175
1176 if (link_state_changed)
1177 mac_link_update(grp->lg_mh, grp->lg_link_state);
1178
1179 bail:
1180 if (rc != 0) {
1181 /* stop and remove ports that have been added */
1182 for (i = 0; i < nadded; i++) {
1183 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1184 ASSERT(port != NULL);
1185 if (grp->lg_started) {
1186 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1187 (void) aggr_port_promisc(port, B_FALSE);
1188 aggr_port_stop(port);
1189 mac_perim_exit(pmph);
1190 }
1191 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1192 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1193 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1194 }
1195 }
1196
1197 mac_perim_exit(mph);
1198 AGGR_GRP_REFRELE(grp);
1199 return (rc);
1200 }
1201
1202 static int
1203 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1204 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1205 aggr_lacp_timer_t lacp_timer)
1206 {
1207 boolean_t mac_addr_changed = B_FALSE;
1208 boolean_t link_state_changed = B_FALSE;
1209 mac_perim_handle_t pmph;
1210
1211 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1212
1334 grp->lg_refs = 1;
1335 grp->lg_closing = B_FALSE;
1336 grp->lg_force = force;
1337 grp->lg_linkid = linkid;
1338 grp->lg_zoneid = crgetzoneid(credp);
1339 grp->lg_ifspeed = 0;
1340 grp->lg_link_state = LINK_STATE_UNKNOWN;
1341 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1342 grp->lg_started = B_FALSE;
1343 grp->lg_promisc = B_FALSE;
1344 grp->lg_lacp_done = B_FALSE;
1345 grp->lg_tx_notify_done = B_FALSE;
1346 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1347 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1348 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1349 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1350 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1351 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1352 MAX_RINGS_PER_GROUP), KM_SLEEP);
1353 grp->lg_tx_blocked_cnt = 0;
1354 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1355 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1356 aggr_lacp_init_grp(grp);
1357
1358 grp->lg_rx_group.arg_untagged = 0;
1359 list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t),
1360 offsetof(aggr_vlan_t, av_link));
1361
1362 /* add MAC ports to group */
1363 grp->lg_ports = NULL;
1364 grp->lg_nports = 0;
1365 grp->lg_nattached_ports = 0;
1366 grp->lg_ntx_ports = 0;
1367
1368 /*
1369 * If key is not specified by the user, allocate the key.
1370 */
1371 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1372 err = ENOMEM;
1373 goto bail;
1374 }
1375 grp->lg_key = key;
1376
1377 for (i = 0; i < nports; i++) {
1378 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1379 if (err != 0)
1380 goto bail;
1381 }
1382
1383 /*
1384 * If no explicit MAC address was specified by the administrator,
1385 * set it to the MAC address of the first port.
1386 */
1387 grp->lg_addr_fixed = mac_fixed;
1388 if (grp->lg_addr_fixed) {
1389 /* validate specified address */
1390 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1391 err = EINVAL;
1392 goto bail;
1393 }
1394 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1395 } else {
1396 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1397 grp->lg_mac_addr_port = grp->lg_ports;
1398 }
1399
1400 /* set the initial group capabilities */
1401 aggr_grp_capab_set(grp);
1402
1403 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1404 err = ENOMEM;
1405 goto bail;
1406 }
1407 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1408 mac->m_driver = grp;
1409 mac->m_dip = aggr_dip;
1410 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1411 mac->m_src_addr = grp->lg_addr;
1412 mac->m_callbacks = &aggr_m_callbacks;
1413 mac->m_min_sdu = 0;
1414 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1415 mac->m_margin = aggr_grp_max_margin(grp);
1416 mac->m_v12n = MAC_VIRT_LEVEL1;
1417 err = mac_register(mac, &grp->lg_mh);
1418 mac_free(mac);
1419 if (err != 0)
1420 goto bail;
1421
1422 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1423 if (err != 0) {
1424 (void) mac_unregister(grp->lg_mh);
1425 grp->lg_mh = NULL;
1426 goto bail;
1427 }
1428
1429 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1430
1431 /*
1432 * Update the MAC address of the constituent ports.
1433 * None of the ports is attached at this time, so the link state of
1434 * the aggregation will not change.
1435 */
1436 link_state_changed = aggr_grp_update_ports_mac(grp);
1437 ASSERT(!link_state_changed);
1438
1439 /* update outbound load balancing policy */
1440 aggr_send_update_policy(grp, policy);
1441
1442 /* set LACP mode */
1443 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1444
1445 /*
1446 * Attach each port if necessary.
1447 */
1448 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1449 /*
1450 * Create the pseudo ring for each HW ring of the underlying
1451 * port. Note that this is done after the aggr registers the
1452 * mac.
1453 */
1454 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1455 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1456 if (aggr_port_notify_link(grp, port))
1457 link_state_changed = B_TRUE;
1458
1459 /*
1460 * Initialize the callback functions for this port.
1461 */
1462 aggr_port_init_callbacks(port);
1463 }
1464
1465 if (link_state_changed)
1466 mac_link_update(grp->lg_mh, grp->lg_link_state);
1467
1468 /* add new group to hash table */
1469 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1470 (mod_hash_val_t)grp);
1471 ASSERT(err == 0);
1472 aggr_grp_cnt++;
1473
1474 mac_perim_exit(mph);
1475 rw_exit(&aggr_grp_lock);
1717
1718 /* stop port if group has already been started */
1719 if (grp->lg_started) {
1720 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1721 aggr_port_stop(port);
1722 mac_perim_exit(pmph);
1723 }
1724
1725 /*
1726 * aggr_rem_pseudo_tx_group() is not called here. Instead
1727 * it is called from inside aggr_grp_rem_port() after the
1728 * port has been detached. The reason is that
1729 * aggr_rem_pseudo_tx_group() removes one ring at a time
1730 * and if there is still traffic going on, then there
1731 * is the possibility of aggr_find_tx_ring() returning a
1732 * removed ring for transmission. Once the port has been
1733 * detached, that port will not be used and
1734 * aggr_find_tx_ring() will not return any rings
1735 * belonging to it.
1736 */
1737 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1738
1739 /* remove port from group */
1740 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1741 &link_state_changed);
1742 ASSERT(rc == 0);
1743 mac_addr_update = mac_addr_update || mac_addr_changed;
1744 link_state_update = link_state_update || link_state_changed;
1745 }
1746
1747 bail:
1748 if (mac_addr_update)
1749 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1750 if (link_state_update)
1751 mac_link_update(grp->lg_mh, grp->lg_link_state);
1752
1753 mac_perim_exit(mph);
1754 AGGR_GRP_REFRELE(grp);
1755
1756 return (rc);
1757 }
1822 grp->lg_tx_notify_done = B_TRUE;
1823 cv_signal(&grp->lg_tx_flowctl_cv);
1824 }
1825 mutex_exit(&grp->lg_tx_flowctl_lock);
1826 if (tid != 0)
1827 thread_join(tid);
1828
1829 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1830
1831 grp->lg_closing = B_TRUE;
1832 /* detach and free MAC ports associated with group */
1833 port = grp->lg_ports;
1834 while (port != NULL) {
1835 cport = port->lp_next;
1836 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1837 if (grp->lg_started)
1838 aggr_port_stop(port);
1839 (void) aggr_grp_detach_port(grp, port);
1840 mac_perim_exit(pmph);
1841 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1842 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1843 aggr_port_delete(port);
1844 port = cport;
1845 }
1846
1847 mac_perim_exit(mph);
1848
1849 kmem_free(grp->lg_tx_blocked_rings,
1850 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1851 /*
1852 * Wait for the port's lacp timer thread and its notification callback
1853 * to exit before calling mac_unregister() since both need to access
1854 * the mac perimeter of the grp.
1855 */
1856 aggr_grp_port_wait(grp);
1857
1858 VERIFY(mac_unregister(grp->lg_mh) == 0);
1859 grp->lg_mh = NULL;
1860
1861 list_destroy(&(grp->lg_rx_group.arg_vlans));
1862
1863 AGGR_GRP_REFRELE(grp);
1864 return (0);
1865 }
1866
1867 void
1868 aggr_grp_free(aggr_grp_t *grp)
1869 {
1870 ASSERT(grp->lg_refs == 0);
1871 ASSERT(grp->lg_port_ref == 0);
1872 if (grp->lg_key > AGGR_MAX_KEY) {
1873 id_free(key_ids, grp->lg_key);
1874 grp->lg_key = 0;
1875 }
1876 kmem_cache_free(aggr_grp_cache, grp);
1877 }
1878
1879 int
1880 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1881 aggr_grp_info_new_grp_fn_t new_grp_fn,
2207 uint32_t *hcksum_txflags = cap_data;
2208 *hcksum_txflags = grp->lg_hcksum_txflags;
2209 break;
2210 }
2211 case MAC_CAPAB_LSO: {
2212 mac_capab_lso_t *cap_lso = cap_data;
2213
2214 if (grp->lg_lso) {
2215 *cap_lso = grp->lg_cap_lso;
2216 break;
2217 } else {
2218 return (B_FALSE);
2219 }
2220 }
2221 case MAC_CAPAB_NO_NATIVEVLAN:
2222 return (!grp->lg_vlan);
2223 case MAC_CAPAB_NO_ZCOPY:
2224 return (!grp->lg_zcopy);
2225 case MAC_CAPAB_RINGS: {
2226 mac_capab_rings_t *cap_rings = cap_data;
2227
2228 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2229 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2230 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2231
2232 /*
2233 * An aggregation advertises only one (pseudo) RX
2234 * group, which virtualizes the main/primary group of
2235 * the underlying devices.
2236 */
2237 cap_rings->mr_gnum = 1;
2238 cap_rings->mr_gaddring = NULL;
2239 cap_rings->mr_gremring = NULL;
2240 } else {
2241 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2242 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2243 cap_rings->mr_gnum = 0;
2244 }
2245 cap_rings->mr_rget = aggr_fill_ring;
2246 cap_rings->mr_gget = aggr_fill_group;
2247 break;
2248 }
2249 case MAC_CAPAB_AGGR:
2250 {
2251 mac_capab_aggr_t *aggr_cap;
2252
2253 if (cap_data != NULL) {
2254 aggr_cap = cap_data;
2255 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2256 aggr_cap->mca_unicst = aggr_m_unicst;
2257 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2258 aggr_cap->mca_arg = arg;
2259 }
2260 return (B_TRUE);
2261 }
2262 default:
2263 return (B_FALSE);
2264 }
2265 return (B_TRUE);
2266 }
2267
2268 /*
2269 * Callback function for MAC layer to register groups.
2270 */
2271 static void
2272 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2273 mac_group_info_t *infop, mac_group_handle_t gh)
2274 {
2275 aggr_grp_t *grp = arg;
2276 aggr_pseudo_rx_group_t *rx_group;
2277 aggr_pseudo_tx_group_t *tx_group;
2278
2279 ASSERT(index == 0);
2280 if (rtype == MAC_RING_TYPE_RX) {
2281 rx_group = &grp->lg_rx_group;
2282 rx_group->arg_gh = gh;
2283 rx_group->arg_grp = grp;
2284
2285 infop->mgi_driver = (mac_group_driver_t)rx_group;
2286 infop->mgi_start = NULL;
2287 infop->mgi_stop = NULL;
2288 infop->mgi_addmac = aggr_addmac;
2289 infop->mgi_remmac = aggr_remmac;
2290 infop->mgi_count = rx_group->arg_ring_cnt;
2291
2292 /*
2293 * Always set the HW VLAN callbacks. They are smart
2294 * enough to know when a port has HW VLAN filters to
2295 * program and when it doesn't.
2296 */
2297 infop->mgi_addvlan = aggr_addvlan;
2298 infop->mgi_remvlan = aggr_remvlan;
2299 } else {
2300 tx_group = &grp->lg_tx_group;
2301 tx_group->atg_gh = gh;
2302 }
2303 }
2304
2305 /*
 * Callback function for MAC layer to register all rings.
2307 */
2308 static void
2309 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2310 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2311 {
2312 aggr_grp_t *grp = arg;
2313
2314 switch (rtype) {
2315 case MAC_RING_TYPE_RX: {
2316 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
2317 aggr_pseudo_rx_ring_t *rx_ring;
2318 mac_intr_t aggr_mac_intr;
2319
2320 ASSERT(rg_index == 0);
2321
2322 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2323 rx_ring = rx_group->arg_rings + index;
2324 rx_ring->arr_rh = rh;
2325
2326 /*
2327 * Entrypoint to enable interrupt (disable poll) and
2328 * disable interrupt (enable poll).
2329 */
2330 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2331 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2332 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2333 aggr_mac_intr.mi_ddi_handle = NULL;
2334
2335 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2336 infop->mri_start = aggr_pseudo_start_ring;
2337 infop->mri_stop = NULL;
2338
2339 infop->mri_intr = aggr_mac_intr;
2340 infop->mri_poll = aggr_rx_poll;
2341
2342 infop->mri_stat = aggr_rx_ring_stat;
2343 break;
2344 }
2345 case MAC_RING_TYPE_TX: {
2346 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2347 aggr_pseudo_tx_ring_t *tx_ring;
2348
2349 ASSERT(rg_index == -1);
2350 ASSERT(index < tx_group->atg_ring_cnt);
2351
2352 tx_ring = &tx_group->atg_rings[index];
2353 tx_ring->atr_rh = rh;
2354
2355 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2356 infop->mri_start = NULL;
2357 infop->mri_stop = NULL;
2404 if (!port->lp_collector_enabled) {
2405 *mpp = mp->b_next;
2406 mp->b_next = NULL;
2407 freemsg(mp);
2408 continue;
2409 }
2410 mpp = &mp->b_next;
2411 }
2412 return (mp_chain);
2413 }
2414
2415 static int
2416 aggr_addmac(void *arg, const uint8_t *mac_addr)
2417 {
2418 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2419 aggr_unicst_addr_t *addr, **pprev;
2420 aggr_grp_t *grp = rx_group->arg_grp;
2421 aggr_port_t *port, *p;
2422 mac_perim_handle_t mph;
2423 int err = 0;
2424
2425 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2426
2427 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2428 mac_perim_exit(mph);
2429 return (0);
2430 }
2431
2432 /*
2433 * Insert this mac address into the list of mac addresses owned by
2434 * the aggregation pseudo group.
2435 */
2436 pprev = &rx_group->arg_macaddr;
2437 while ((addr = *pprev) != NULL) {
2438 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2439 mac_perim_exit(mph);
2440 return (EEXIST);
2441 }
2442 pprev = &addr->aua_next;
2443 }
2444 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2445 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2446 addr->aua_next = NULL;
2447 *pprev = addr;
2448
2449 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2450 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2451 break;
2452
2453 if (err != 0) {
2454 for (p = grp->lg_ports; p != port; p = p->lp_next)
2455 aggr_port_remmac(p, mac_addr);
2456
2457 *pprev = NULL;
2458 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2459 }
2460
2461 mac_perim_exit(mph);
2462 return (err);
2463 }
2464
2465 static int
2466 aggr_remmac(void *arg, const uint8_t *mac_addr)
2467 {
2468 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2469 aggr_unicst_addr_t *addr, **pprev;
2470 aggr_grp_t *grp = rx_group->arg_grp;
2471 aggr_port_t *port;
2472 mac_perim_handle_t mph;
2473 int err = 0;
2474
2475 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2480 }
2481
2482 /*
	 * Locate this mac address in the list of mac addresses owned by
	 * the aggregation pseudo group so that it can be removed.
2485 */
2486 pprev = &rx_group->arg_macaddr;
2487 while ((addr = *pprev) != NULL) {
2488 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2489 pprev = &addr->aua_next;
2490 continue;
2491 }
2492 break;
2493 }
2494 if (addr == NULL) {
2495 mac_perim_exit(mph);
2496 return (EINVAL);
2497 }
2498
2499 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2500 aggr_port_remmac(port, mac_addr);
2501
2502 *pprev = addr->aua_next;
2503 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2504
2505 mac_perim_exit(mph);
2506 return (err);
2507 }
2508
2509 /*
2510 * Search for VID in the Rx group's list and return a pointer if
2511 * found. Otherwise return NULL.
2512 */
2513 static aggr_vlan_t *
2514 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2515 {
2516 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2517 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2518 avp = list_next(&rx_group->arg_vlans, avp)) {
2519 if (avp->av_vid == vid)
2520 return (avp);
2522
2523 return (NULL);
2524 }
2525
2526 /*
2527 * Accept traffic on the specified VID.
2528 *
2529 * Persist VLAN state in the aggr so that ports added later will
2530 * receive the correct filters. In the future it would be nice to
2531 * allow aggr to iterate its clients instead of duplicating state.
2532 */
2533 static int
2534 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2535 {
2536 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2537 aggr_grp_t *aggr = rx_group->arg_grp;
2538 aggr_port_t *port, *p;
2539 mac_perim_handle_t mph;
2540 int err = 0;
2541 aggr_vlan_t *avp = NULL;
2542
2543 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2544
2545 if (vid == MAC_VLAN_UNTAGGED) {
2546 /*
2547 * Aggr is both a MAC provider and MAC client. As a
2548 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2549 * client. As a client itself, it should pass
2550 * VLAN_ID_NONE to its ports.
2551 */
2552 vid = VLAN_ID_NONE;
2553 rx_group->arg_untagged++;
2554 goto update_ports;
2555 }
2556
2557 avp = aggr_find_vlan(rx_group, vid);
2558
2559 if (avp != NULL) {
2560 avp->av_refs++;
2561 mac_perim_exit(mph);
2562 return (0);
2563 }
2564
2565 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2566 avp->av_vid = vid;
2567 avp->av_refs = 1;
2568
2569 update_ports:
2570 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2571 if ((err = aggr_port_addvlan(port, vid)) != 0)
2572 break;
2573
2574 if (err != 0) {
2575 /*
2576 * If any of these calls fail then we are in a
2577 * situation where the ports have different HW state.
2578 * There's no reasonable action the MAC client can
2579 * take in this scenario to rectify the situation.
2580 */
2581 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2582 int err2;
2583
2584 if ((err2 = aggr_port_remvlan(p, vid)) != 0) {
2585 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2586 " from port %s: errno %d.", vid,
2587 mac_client_name(p->lp_mch), err2);
2588 }
2589
2590 }
2591
2592 if (vid == VLAN_ID_NONE)
2593 rx_group->arg_untagged--;
2594
2595 if (avp != NULL) {
2596 kmem_free(avp, sizeof (aggr_vlan_t));
2597 avp = NULL;
2598 }
2599 }
2600
2601 if (avp != NULL)
2602 list_insert_tail(&rx_group->arg_vlans, avp);
2603
2604 done:
2605 mac_perim_exit(mph);
2606 return (err);
2607 }
2608
2609 /*
2610 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2611 */
2612 static int
2613 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2614 {
2615 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2616 aggr_grp_t *aggr = rx_group->arg_grp;
2617 aggr_port_t *port, *p;
2618 mac_perim_handle_t mph;
2619 int err = 0;
2620 aggr_vlan_t *avp = NULL;
2621
2622 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2623
2624 /*
2625 * See the comment in aggr_addvlan().
2626 */
2627 if (vid == MAC_VLAN_UNTAGGED) {
2628 vid = VLAN_ID_NONE;
2629 rx_group->arg_untagged--;
2630
2631 if (rx_group->arg_untagged > 0)
2632 goto done;
2633
2634 goto update_ports;
2635 }
2636
2637 avp = aggr_find_vlan(rx_group, vid);
2638
2639 if (avp == NULL) {
2640 err = ENOENT;
2641 goto done;
2642 }
2643
2644 avp->av_refs--;
2645
2646 if (avp->av_refs > 0)
2647 goto done;
2648
2649 update_ports:
2650 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2651 if ((err = aggr_port_remvlan(port, vid)) != 0)
2652 break;
2653
2654 /*
2655 * See the comment in aggr_addvlan() for justification of the
2656 * use of VERIFY here.
2657 */
2658 if (err != 0) {
2659 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2660 int err2;
2661
2662 if ((err2 = aggr_port_addvlan(p, vid)) != 0) {
2663 cmn_err(CE_WARN, "Failed to add VLAN %u"
2664 " to port %s: errno %d.", vid,
2665 mac_client_name(p->lp_mch), err2);
2666 }
2667 }
2668
2669 if (avp != NULL)
2670 avp->av_refs++;
2671
2672 if (vid == VLAN_ID_NONE)
2673 rx_group->arg_untagged++;
2674
2675 goto done;
2676 }
2677
2678 if (err == 0 && avp != NULL) {
2679 VERIFY3U(avp->av_refs, ==, 0);
2680 list_remove(&rx_group->arg_vlans, avp);
2681 kmem_free(avp, sizeof (aggr_vlan_t));
2682 }
|
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Joyent, Inc.
24 */
25
26 /*
27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28 *
29 * An instance of the structure aggr_grp_t is allocated for each
30 * link aggregation group. When created, aggr_grp_t objects are
31 * entered into the aggr_grp_hash hash table maintained by the modhash
32 * module. The hash key is the linkid associated with the link
33 * aggregation group.
34 *
35 * Each aggregation contains a set of ports. The port is represented
36 * by the aggr_port_t structure. A port consists of a single MAC
37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
38 * MAC. This client is used by the aggr to send and receive LACP
39 * traffic. Each port client takes on the same MAC unicast address --
40 * the address of the aggregation itself (taken from the first port by
41 * default).
42 *
43 * The MAC client that hangs off each aggr port is not your typical
44 * MAC client. Not only does it have exclusive control of the MAC, but
45 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
46 * fanout traffic among L4 protocols; but the aggr is an intermediary,
47 * not a consumer. Instead of using SRSes, the aggr puts the
48 * underlying hardware rings into passthru mode and ships packets up
49 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
50 * LACP while passing all other traffic up to clients of the aggr.
51 *
52 * Pseudo Rx Groups and Rings
53 * --------------------------
54 *
55 * It is imperative for client performance that the aggr provide as
56 * many MAC groups as possible. In order to use the underlying HW
57 * resources, aggr creates pseudo groups to aggregate the underlying
58 * HW groups. Every HW group gets mapped to a pseudo group; and every
59 * HW ring in that group gets mapped to a pseudo ring. The pseudo
60 * group at index 0 combines all the HW groups at index 0 from each
61 * port, etc. The aggr's MAC then creates normal MAC groups and rings
62 * out of these pseudo groups and rings to present to the aggr's
63 * clients. To the clients, the aggr's groups and rings are absolutely
64 * no different than a NIC's groups or rings.
65 *
66 * Pseudo Tx Rings
67 * ---------------
68 *
69 * The underlying ports (NICs) in an aggregation can have Tx rings. To
70 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. They are already present and implemented on the Rx side.
73 * The same concept is extended to the Tx side where each Tx ring of
74 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
75 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
76 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
77 * is given to the aggregation layer.
78 *
79 * With this change, the outgoing stack depth looks much better:
80 *
81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
83 *
84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
85 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
86 *
87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
89 * ring belonging to a port on which the packet has to be sent.
90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from
92 * the selected port.
93 *
94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
95 * bandwidth limit is applied first on the outgoing packet and the packets
96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
97 * particular Tx ring.
98 */
99
100 #include <sys/types.h>
101 #include <sys/sysmacros.h>
102 #include <sys/conf.h>
103 #include <sys/cmn_err.h>
104 #include <sys/disp.h>
105 #include <sys/list.h>
106 #include <sys/ksynch.h>
107 #include <sys/kmem.h>
108 #include <sys/stream.h>
109 #include <sys/modctl.h>
110 #include <sys/ddi.h>
111 #include <sys/sunddi.h>
112 #include <sys/atomic.h>
113 #include <sys/stat.h>
114 #include <sys/modhash.h>
115 #include <sys/id_space.h>
116 #include <sys/strsun.h>
117 #include <sys/cred.h>
134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
135 const void *);
136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
137 mac_prop_info_handle_t);
138
139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
141 boolean_t *);
142
143 static void aggr_grp_capab_set(aggr_grp_t *);
144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
145 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
146 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
149
150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
152 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
153 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
156 static int aggr_addmac(void *, const uint8_t *);
157 static int aggr_remmac(void *, const uint8_t *);
158 static int aggr_addvlan(mac_group_driver_t, uint16_t);
159 static int aggr_remvlan(mac_group_driver_t, uint16_t);
160 static mblk_t *aggr_rx_poll(void *, int);
161 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
162 const int, mac_ring_info_t *, mac_ring_handle_t);
163 static void aggr_fill_group(void *, mac_ring_type_t, const int,
164 mac_group_info_t *, mac_group_handle_t);
165
166 static kmem_cache_t *aggr_grp_cache;
167 static mod_hash_t *aggr_grp_hash;
168 static krwlock_t aggr_grp_lock;
169 static uint_t aggr_grp_cnt;
170 static id_space_t *key_ids;
171
/* Size passed when creating the aggr group hash table. */
#define	GRP_HASHSZ		64
/*
 * Turn a link id into a modhash key. The argument is parenthesized so
 * that the cast applies to the whole expression handed to the macro,
 * not just to its first operand.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))
#define	AGGR_PORT_NAME_DELIMIT	'-'
175
380
381 /*
382 * Update the group link state.
383 */
384 if (grp->lg_link_state != LINK_STATE_UP) {
385 grp->lg_link_state = LINK_STATE_UP;
386 mutex_enter(&grp->lg_stat_lock);
387 grp->lg_link_duplex = LINK_DUPLEX_FULL;
388 mutex_exit(&grp->lg_stat_lock);
389 link_state_changed = B_TRUE;
390 }
391
392 /*
393 * Update port's state.
394 */
395 port->lp_state = AGGR_PORT_STATE_ATTACHED;
396
397 aggr_grp_multicst_port(port, B_TRUE);
398
399 /*
400 * The port client doesn't have an Rx SRS; instead of calling
401 * mac_rx_set() we set the client's flow callback directly.
402 * This datapath is used only when the port's driver doesn't
403 * support MAC_CAPAB_RINGS. Drivers with ring support will
404 * deliver traffic to the aggr via ring passthru.
405 */
406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
407
408 /*
409 * If LACP is OFF, the port can be used to send data as soon
410 * as its link is up and verified to be compatible with the
411 * aggregation.
412 *
413 * If LACP is active or passive, notify the LACP subsystem, which
414 * will enable sending on the port following the LACP protocol.
415 */
416 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
417 aggr_send_port_enable(port);
418 else
419 aggr_lacp_port_attached(port);
420
421 return (link_state_changed);
422 }
423
424 boolean_t
425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
426 {
427 boolean_t link_state_changed = B_FALSE;
428
429 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
430 ASSERT(MAC_PERIM_HELD(port->lp_mh));
431
432 /* update state */
433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
434 return (B_FALSE);
435
436 mac_client_clear_flow_cb(port->lp_mch);
437
438 aggr_grp_multicst_port(port, B_FALSE);
439
440 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
441 aggr_send_port_disable(port);
442 else
443 aggr_lacp_port_detached(port);
444
445 port->lp_state = AGGR_PORT_STATE_STANDBY;
446
447 grp->lg_nattached_ports--;
448 if (grp->lg_nattached_ports == 0) {
449 /* the last attached MAC port of the group is being detached */
450 grp->lg_link_state = LINK_STATE_DOWN;
451 mutex_enter(&grp->lg_stat_lock);
452 grp->lg_ifspeed = 0;
453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
454 mutex_exit(&grp->lg_stat_lock);
455 link_state_changed = B_TRUE;
456 }
555 * address now, and this might cause the link state
556 * of the aggregation to change.
557 */
558 *link_state_changedp = aggr_grp_attach_port(grp, port);
559 }
560 }
561 }
562
563 /*
564 * Add a port to a link aggregation group.
565 */
566 static int
567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
568 aggr_port_t **pp)
569 {
570 aggr_port_t *port, **cport;
571 mac_perim_handle_t mph;
572 zoneid_t port_zoneid = ALL_ZONES;
573 int err;
574
575 /* The port must be in the same zone as the aggregation. */
576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
577 port_zoneid = GLOBAL_ZONEID;
578 if (grp->lg_zoneid != port_zoneid)
579 return (EBUSY);
580
581 /*
582 * If we are creating the aggr, then there is no MAC handle
583 * and thus no perimeter to hold. If we are adding a port to
	 * an existing aggr, then the perimeter of the aggr's MAC must
585 * be held.
586 */
587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
588
589 err = aggr_port_create(grp, port_linkid, force, &port);
590 if (err != 0)
591 return (err);
592
593 mac_perim_enter_by_mh(port->lp_mh, &mph);
594
595 /* Add the new port to the end of the list. */
596 cport = &grp->lg_ports;
597 while (*cport != NULL)
598 cport = &((*cport)->lp_next);
599 *cport = port;
600
601 /*
602 * Back reference to the group it is member of. A port always
603 * holds a reference to its group to ensure that the back
604 * reference is always valid.
605 */
606 port->lp_grp = grp;
607 AGGR_GRP_REFHOLD(grp);
608 grp->lg_nports++;
609
610 aggr_lacp_init_port(port);
611 mac_perim_exit(mph);
612
613 if (pp != NULL)
614 *pp = port;
615
657 {
658 aggr_pseudo_rx_ring_t *ring;
659 int err;
660 int j;
661
662 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
663 ring = rx_grp->arg_rings + j;
664 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
665 break;
666 }
667
668 /*
669 * No slot for this new RX ring.
670 */
671 if (j == MAX_RINGS_PER_GROUP)
672 return (EIO);
673
674 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
675 ring->arr_hw_rh = hw_rh;
676 ring->arr_port = port;
677 ring->arr_grp = rx_grp;
678 rx_grp->arg_ring_cnt++;
679
680 /*
681 * The group is already registered, dynamically add a new ring to the
682 * mac group.
683 */
684 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
685 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
686 ring->arr_hw_rh = NULL;
687 ring->arr_port = NULL;
688 ring->arr_grp = NULL;
689 rx_grp->arg_ring_cnt--;
690 } else {
691 /*
692 * This must run after the MAC is registered.
693 */
694 ASSERT3P(ring->arr_rh, !=, NULL);
695 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
696 (void *)port, (mac_resource_handle_t)ring);
697 }
698 return (err);
699 }
700
701 /*
702 * Remove the pseudo RX ring of the given HW ring handle.
703 */
704 static void
705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
706 {
707 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
708 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
709
710 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
711 ring->arr_hw_rh != hw_rh) {
712 continue;
713 }
714
715 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
716
717 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
718 ring->arr_hw_rh = NULL;
719 ring->arr_port = NULL;
720 ring->arr_grp = NULL;
721 rx_grp->arg_ring_cnt--;
722 mac_hwring_clear_passthru(hw_rh);
723 break;
724 }
725 }
726
727 /*
728 * Create pseudo rings over the HW rings of the port.
729 *
730 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
731 *
732 * o Program existing unicast filters on the pseudo group into the HW group.
733 *
734 * o Program existing VLAN filters on the pseudo group into the HW group.
735 */
736 static int
737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
738 {
739 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
740 aggr_unicst_addr_t *addr, *a;
741 mac_perim_handle_t pmph;
742 aggr_vlan_t *avp;
743 uint_t hw_rh_cnt, i;
744 int err = 0;
745 uint_t g_idx = rx_grp->arg_index;
746
747 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
748 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
749 mac_perim_enter_by_mh(port->lp_mh, &pmph);
750
751 /*
752 * This function must be called after the aggr registers its
753 * MAC and its Rx groups have been initialized.
754 */
755 ASSERT(rx_grp->arg_gh != NULL);
756
757 /*
758 * Get the list of the underlying HW rings.
759 */
760 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
761 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
762
763 /*
764 * Add existing VLAN and unicast address filters to the port.
765 */
766 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
767 avp = list_next(&rx_grp->arg_vlans, avp)) {
768 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
769 goto err;
770 }
771
772 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
773 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
774 goto err;
775 }
776
777 for (i = 0; i < hw_rh_cnt; i++) {
778 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
779 if (err != 0)
780 goto err;
781 }
782
783 mac_perim_exit(pmph);
784 return (0);
785
786 err:
787 ASSERT(err != 0);
788
789 for (uint_t j = 0; j < i; j++)
790 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
791
792 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
793 aggr_port_remmac(port, g_idx, a->aua_addr);
794
795 if (avp != NULL)
796 avp = list_prev(&rx_grp->arg_vlans, avp);
797
798 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
799 int err2;
800
801 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
802 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
803 ": errno %d.", avp->av_vid,
804 mac_client_name(port->lp_mch), err2);
805 }
806 }
807
808 port->lp_hwghs[g_idx] = NULL;
809 mac_perim_exit(pmph);
810 return (err);
811 }
812
813 /*
814 * Destroy the pseudo rings mapping to this port and remove all VLAN
815 * and unicast filters from this port. Even if there are no underlying
816 * HW rings we must still remove the unicast filters to take the port
817 * out of promisc mode.
818 */
819 static void
820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
821 {
822 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
823 aggr_unicst_addr_t *addr;
824 mac_perim_handle_t pmph;
825 uint_t hw_rh_cnt;
826 uint_t g_idx = rx_grp->arg_index;
827
828 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
829 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
830 ASSERT3P(rx_grp->arg_gh, !=, NULL);
831 mac_perim_enter_by_mh(port->lp_mh, &pmph);
832
833 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
834 MAC_RING_TYPE_RX);
835
836 for (uint_t i = 0; i < hw_rh_cnt; i++)
837 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
838
839 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
840 aggr_port_remmac(port, g_idx, addr->aua_addr);
841
842 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
843 avp = list_next(&rx_grp->arg_vlans, avp)) {
844 int err;
845
846 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
847 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
848 ": errno %d.", avp->av_vid,
849 mac_client_name(port->lp_mch), err);
850 }
851 }
852
853 port->lp_hwghs[g_idx] = NULL;
854 mac_perim_exit(pmph);
855 }
856
857 /*
858 * Add a pseudo TX ring for the given HW ring handle.
859 */
860 static int
861 aggr_add_pseudo_tx_ring(aggr_port_t *port,
862 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
863 mac_ring_handle_t *pseudo_rh)
864 {
865 aggr_pseudo_tx_ring_t *ring;
866 int err;
867 int i;
868
869 ASSERT(MAC_PERIM_HELD(port->lp_mh));
870 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
871 ring = tx_grp->atg_rings + i;
872 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
873 break;
937 /*
938 * This function is called to create pseudo rings over hardware rings of
939 * the underlying device. There is a 1:1 mapping between the pseudo TX
940 * rings of the aggr and the hardware rings of the underlying port.
941 */
942 static int
943 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
944 {
945 aggr_grp_t *grp = port->lp_grp;
946 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
947 mac_perim_handle_t pmph;
948 int hw_rh_cnt, i = 0, j;
949 int err = 0;
950
951 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
952 mac_perim_enter_by_mh(port->lp_mh, &pmph);
953
954 /*
	 * Get the list of the underlying HW rings.
956 */
957 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
958 MAC_RING_TYPE_TX);
959
960 /*
961 * Even if the underlying NIC does not have TX rings, we
	 * still make a pseudo TX ring for that NIC with NULL as
963 * the ring handle.
964 */
965 if (hw_rh_cnt == 0)
966 port->lp_tx_ring_cnt = 1;
967 else
968 port->lp_tx_ring_cnt = hw_rh_cnt;
969
970 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
971 port->lp_tx_ring_cnt), KM_SLEEP);
972 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
973 port->lp_tx_ring_cnt), KM_SLEEP);
974
975 if (hw_rh_cnt == 0) {
976 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
977 NULL, &pseudo_rh)) == 0) {
978 port->lp_tx_rings[0] = NULL;
1044 aggr_grp_update_default(grp);
1045 done:
1046 mac_perim_exit(pmph);
1047 }
1048
1049 static int
1050 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1051 {
1052 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1054 }
1055
1056 static int
1057 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1058 {
1059 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1060 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1061 }
1062
1063 /*
1064 * Start the pseudo ring. Since the pseudo ring is just an abstraction
1065 * over an actual HW ring, the real task is to start the underlying HW
1066 * ring.
1067 */
1068 static int
1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1070 {
1071 int err;
1072 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1073
1074 err = mac_hwring_start(rr_ring->arr_hw_rh);
1075
1076 if (err != 0)
1077 return (err);
1078
1079 rr_ring->arr_gen = mr_gen;
1080 return (err);
1081 }
1082
1083 /*
1084 * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1085 * over an actual HW ring, the real task is to stop the underlying HW
1086 * ring.
1087 */
1088 static void
1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1090 {
1091 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1092
1093 /*
1094 * The rings underlying the default group must stay up to
1095 * continue receiving LACP traffic. We would normally never
1096 * stop the default Rx rings because of the primary MAC
1097 * client; but aggr's primary MAC client doesn't call
1098 * mac_unicast_add() and thus mi_active is 0 when the last
1099 * non-primary client is deleted.
1100 */
1101 if (rr_ring->arr_grp->arg_index != 0)
1102 mac_hwring_stop(rr_ring->arr_hw_rh);
1103 }
1104
1105 /*
1106 * Add one or more ports to an existing link aggregation group.
1107 */
1108 int
1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1110 laioc_port_t *ports)
1111 {
1112 int rc;
1113 uint_t port_added = 0;
1114 uint_t grp_added;
1115 aggr_grp_t *grp = NULL;
1116 aggr_port_t *port;
1117 boolean_t link_state_changed = B_FALSE;
1118 mac_perim_handle_t mph, pmph;
1119
1120 /* Get the aggr corresponding to linkid. */
1121 rw_enter(&aggr_grp_lock, RW_READER);
1122 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1123 (mod_hash_val_t *)&grp) != 0) {
1124 rw_exit(&aggr_grp_lock);
1125 return (ENOENT);
1126 }
1127 AGGR_GRP_REFHOLD(grp);
1128
1129 /*
1130 * Hold the perimeter so that the aggregation can't be destroyed.
1131 */
1132 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1133 rw_exit(&aggr_grp_lock);
1134
1135 /* Add the specified ports to the aggr. */
1136 for (uint_t i = 0; i < nports; i++) {
1137 grp_added = 0;
1138
1139 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1140 force, &port)) != 0) {
1141 goto bail;
1142 }
1143
1144 ASSERT(port != NULL);
1145 port_added++;
1146
1147 /* check capabilities */
1148 if (!aggr_grp_capab_check(grp, port) ||
1149 !aggr_grp_sdu_check(grp, port) ||
1150 !aggr_grp_margin_check(grp, port)) {
1151 rc = ENOTSUP;
1152 goto bail;
1153 }
1154
1155 /*
1156 * Create the pseudo ring for each HW ring of the underlying
1157 * port.
1158 */
1159 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1160 if (rc != 0)
1161 goto bail;
1162
1163 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1164 rc = aggr_add_pseudo_rx_group(port,
1165 &grp->lg_rx_groups[j]);
1166
1167 if (rc != 0)
1168 goto bail;
1169
1170 grp_added++;
1171 }
1172
1173 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1174
1175 /* set LACP mode */
1176 aggr_port_lacp_set_mode(grp, port);
1177
1178 /* start port if group has already been started */
1179 if (grp->lg_started) {
1180 rc = aggr_port_start(port);
1181 if (rc != 0) {
1182 mac_perim_exit(pmph);
1183 goto bail;
1184 }
1185
1186 /*
1187 * Turn on the promiscuous mode over the port when it
1188 * is requested to be turned on to receive the
1189 * non-primary address over a port, or the promiscuous
1190 * mode is enabled over the aggr.
1191 */
1192 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1193 rc = aggr_port_promisc(port, B_TRUE);
1194 if (rc != 0) {
1195 mac_perim_exit(pmph);
1196 goto bail;
1197 }
1198 }
1199 }
1200 mac_perim_exit(pmph);
1201
1202 /*
1203 * Attach each port if necessary.
1204 */
1205 if (aggr_port_notify_link(grp, port))
1206 link_state_changed = B_TRUE;
1207
1208 /*
1209 * Initialize the callback functions for this port.
1210 */
1211 aggr_port_init_callbacks(port);
1212 }
1213
1214 /* update the MAC address of the constituent ports */
1215 if (aggr_grp_update_ports_mac(grp))
1216 link_state_changed = B_TRUE;
1217
1218 if (link_state_changed)
1219 mac_link_update(grp->lg_mh, grp->lg_link_state);
1220
1221 bail:
1222 if (rc != 0) {
1223 /* stop and remove ports that have been added */
1224 for (uint_t i = 0; i < port_added; i++) {
1225 uint_t grp_remove;
1226
1227 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1228 ASSERT(port != NULL);
1229
1230 if (grp->lg_started) {
1231 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1232 (void) aggr_port_promisc(port, B_FALSE);
1233 aggr_port_stop(port);
1234 mac_perim_exit(pmph);
1235 }
1236
1237 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1238
1239 /*
1240 * Only the last port could have a partial set
1241 * of groups added.
1242 */
1243 grp_remove = (i + 1 == port_added) ? grp_added :
1244 grp->lg_rx_group_count;
1245
1246 for (uint_t j = 0; j < grp_remove; j++) {
1247 aggr_rem_pseudo_rx_group(port,
1248 &grp->lg_rx_groups[j]);
1249 }
1250
1251 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1252 }
1253 }
1254
1255 mac_perim_exit(mph);
1256 AGGR_GRP_REFRELE(grp);
1257 return (rc);
1258 }
1259
1260 static int
1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1262 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1263 aggr_lacp_timer_t lacp_timer)
1264 {
1265 boolean_t mac_addr_changed = B_FALSE;
1266 boolean_t link_state_changed = B_FALSE;
1267 mac_perim_handle_t pmph;
1268
1269 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1270
1392 grp->lg_refs = 1;
1393 grp->lg_closing = B_FALSE;
1394 grp->lg_force = force;
1395 grp->lg_linkid = linkid;
1396 grp->lg_zoneid = crgetzoneid(credp);
1397 grp->lg_ifspeed = 0;
1398 grp->lg_link_state = LINK_STATE_UNKNOWN;
1399 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1400 grp->lg_started = B_FALSE;
1401 grp->lg_promisc = B_FALSE;
1402 grp->lg_lacp_done = B_FALSE;
1403 grp->lg_tx_notify_done = B_FALSE;
1404 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1405 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1406 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1407 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1408 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1409 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1410 MAX_RINGS_PER_GROUP), KM_SLEEP);
1411 grp->lg_tx_blocked_cnt = 0;
1412 bzero(&grp->lg_rx_groups,
1413 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1414 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1415 aggr_lacp_init_grp(grp);
1416
1417 /* add MAC ports to group */
1418 grp->lg_ports = NULL;
1419 grp->lg_nports = 0;
1420 grp->lg_nattached_ports = 0;
1421 grp->lg_ntx_ports = 0;
1422
1423 /*
1424 * If key is not specified by the user, allocate the key.
1425 */
1426 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1427 err = ENOMEM;
1428 goto bail;
1429 }
1430 grp->lg_key = key;
1431
1432 for (i = 0; i < nports; i++) {
1433 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1434 if (err != 0)
1435 goto bail;
1436 }
1437
1438 grp->lg_rx_group_count = 1;
1439
1440 for (i = 0, port = grp->lg_ports; port != NULL;
1441 i++, port = port->lp_next) {
1442 uint_t num_rgroups;
1443
1444 mac_perim_enter_by_mh(port->lp_mh, &mph);
1445 num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1446 mac_perim_exit(mph);
1447
1448 /*
1449 * Utilize all the groups in a port. If some ports
1450 * have fewer groups than others, then traffic destined
1451 * for the same unicast address may be HW classified
1452 * on some ports but SW classified by aggr when
1453 * arriving on other ports.
1454 */
1455 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1456 num_rgroups);
1457 }
1458
1459 /*
1460 * There could be cases where the hardware provides more
1461 * groups than aggr can support. Make sure we never go above
1462 * the max aggr can support.
1463 */
1464 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1465 MAX_GROUPS_PER_PORT);
1466
1467 ASSERT3U(grp->lg_rx_group_count, >, 0);
1468 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1469 grp->lg_rx_groups[i].arg_index = i;
1470 grp->lg_rx_groups[i].arg_untagged = 0;
1471 list_create(&(grp->lg_rx_groups[i].arg_vlans),
1472 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1473 }
1474
1475 /*
1476 * If no explicit MAC address was specified by the administrator,
1477 * set it to the MAC address of the first port.
1478 */
1479 grp->lg_addr_fixed = mac_fixed;
1480 if (grp->lg_addr_fixed) {
1481 /* validate specified address */
1482 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1483 err = EINVAL;
1484 goto bail;
1485 }
1486 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1487 } else {
1488 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1489 grp->lg_mac_addr_port = grp->lg_ports;
1490 }
1491
1492 /* Set the initial group capabilities. */
1493 aggr_grp_capab_set(grp);
1494
1495 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1496 err = ENOMEM;
1497 goto bail;
1498 }
1499 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1500 mac->m_driver = grp;
1501 mac->m_dip = aggr_dip;
1502 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1503 mac->m_src_addr = grp->lg_addr;
1504 mac->m_callbacks = &aggr_m_callbacks;
1505 mac->m_min_sdu = 0;
1506 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1507 mac->m_margin = aggr_grp_max_margin(grp);
1508 mac->m_v12n = MAC_VIRT_LEVEL1;
1509 err = mac_register(mac, &grp->lg_mh);
1510 mac_free(mac);
1511 if (err != 0)
1512 goto bail;
1513
1514 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1515 if (err != 0) {
1516 (void) mac_unregister(grp->lg_mh);
1517 grp->lg_mh = NULL;
1518 goto bail;
1519 }
1520
1521 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1522
1523 /*
1524 * Update the MAC address of the constituent ports.
2525 * None of the ports are attached at this time, so the link state of
2526 * the aggregation will not change.
2527 *
2528 * All ports take on the primary MAC address of the aggr
2529 * (lg_addr). At this point, none of the ports are attached;
1530 * thus the link state of the aggregation will not change.
1531 */
1532 link_state_changed = aggr_grp_update_ports_mac(grp);
1533 ASSERT(!link_state_changed);
1534
1535 /* Update outbound load balancing policy. */
1536 aggr_send_update_policy(grp, policy);
1537
1538 /* Set LACP mode. */
1539 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1540
1541 /*
1542 * Attach each port if necessary.
1543 */
1544 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1545 /*
1546 * Create the pseudo ring for each HW ring of the
1547 * underlying port. Note that this is done after the
1548 * aggr registers its MAC.
1549 */
1550 VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
1551 ==, 0);
1552
1553 for (i = 0; i < grp->lg_rx_group_count; i++) {
1554 VERIFY3S(aggr_add_pseudo_rx_group(port,
1555 &grp->lg_rx_groups[i]), ==, 0);
1556 }
1557
1558 if (aggr_port_notify_link(grp, port))
1559 link_state_changed = B_TRUE;
1560
1561 /*
1562 * Initialize the callback functions for this port.
1563 */
1564 aggr_port_init_callbacks(port);
1565 }
1566
1567 if (link_state_changed)
1568 mac_link_update(grp->lg_mh, grp->lg_link_state);
1569
1570 /* add new group to hash table */
1571 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1572 (mod_hash_val_t)grp);
1573 ASSERT(err == 0);
1574 aggr_grp_cnt++;
1575
1576 mac_perim_exit(mph);
1577 rw_exit(&aggr_grp_lock);
1819
1820 /* stop port if group has already been started */
1821 if (grp->lg_started) {
1822 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1823 aggr_port_stop(port);
1824 mac_perim_exit(pmph);
1825 }
1826
1827 /*
1828 * aggr_rem_pseudo_tx_group() is not called here. Instead
1829 * it is called from inside aggr_grp_rem_port() after the
1830 * port has been detached. The reason is that
1831 * aggr_rem_pseudo_tx_group() removes one ring at a time
1832 * and if there is still traffic going on, then there
1833 * is the possibility of aggr_find_tx_ring() returning a
1834 * removed ring for transmission. Once the port has been
1835 * detached, that port will not be used and
1836 * aggr_find_tx_ring() will not return any rings
1837 * belonging to it.
1838 */
1839 for (i = 0; i < grp->lg_rx_group_count; i++)
1840 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1841
1842 /* remove port from group */
1843 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1844 &link_state_changed);
1845 ASSERT(rc == 0);
1846 mac_addr_update = mac_addr_update || mac_addr_changed;
1847 link_state_update = link_state_update || link_state_changed;
1848 }
1849
1850 bail:
1851 if (mac_addr_update)
1852 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1853 if (link_state_update)
1854 mac_link_update(grp->lg_mh, grp->lg_link_state);
1855
1856 mac_perim_exit(mph);
1857 AGGR_GRP_REFRELE(grp);
1858
1859 return (rc);
1860 }
1925 grp->lg_tx_notify_done = B_TRUE;
1926 cv_signal(&grp->lg_tx_flowctl_cv);
1927 }
1928 mutex_exit(&grp->lg_tx_flowctl_lock);
1929 if (tid != 0)
1930 thread_join(tid);
1931
1932 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1933
1934 grp->lg_closing = B_TRUE;
1935 /* detach and free MAC ports associated with group */
1936 port = grp->lg_ports;
1937 while (port != NULL) {
1938 cport = port->lp_next;
1939 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1940 if (grp->lg_started)
1941 aggr_port_stop(port);
1942 (void) aggr_grp_detach_port(grp, port);
1943 mac_perim_exit(pmph);
1944 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1945 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
1946 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1947 aggr_port_delete(port);
1948 port = cport;
1949 }
1950
1951 mac_perim_exit(mph);
1952
1953 kmem_free(grp->lg_tx_blocked_rings,
1954 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1955 /*
1956 * Wait for the port's lacp timer thread and its notification callback
1957 * to exit before calling mac_unregister() since both need to access
1958 * the mac perimeter of the grp.
1959 */
1960 aggr_grp_port_wait(grp);
1961
1962 VERIFY(mac_unregister(grp->lg_mh) == 0);
1963 grp->lg_mh = NULL;
1964
1965 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1966 list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
1967 }
1968
1969 AGGR_GRP_REFRELE(grp);
1970 return (0);
1971 }
1972
1973 void
1974 aggr_grp_free(aggr_grp_t *grp)
1975 {
1976 ASSERT(grp->lg_refs == 0);
1977 ASSERT(grp->lg_port_ref == 0);
1978 if (grp->lg_key > AGGR_MAX_KEY) {
1979 id_free(key_ids, grp->lg_key);
1980 grp->lg_key = 0;
1981 }
1982 kmem_cache_free(aggr_grp_cache, grp);
1983 }
1984
1985 int
1986 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1987 aggr_grp_info_new_grp_fn_t new_grp_fn,
2313 uint32_t *hcksum_txflags = cap_data;
2314 *hcksum_txflags = grp->lg_hcksum_txflags;
2315 break;
2316 }
2317 case MAC_CAPAB_LSO: {
2318 mac_capab_lso_t *cap_lso = cap_data;
2319
2320 if (grp->lg_lso) {
2321 *cap_lso = grp->lg_cap_lso;
2322 break;
2323 } else {
2324 return (B_FALSE);
2325 }
2326 }
2327 case MAC_CAPAB_NO_NATIVEVLAN:
2328 return (!grp->lg_vlan);
2329 case MAC_CAPAB_NO_ZCOPY:
2330 return (!grp->lg_zcopy);
2331 case MAC_CAPAB_RINGS: {
2332 mac_capab_rings_t *cap_rings = cap_data;
2333 uint_t ring_cnt = 0;
2334
2335 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2336 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2337
2338 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2339 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2340 cap_rings->mr_rnum = ring_cnt;
2341 cap_rings->mr_gnum = grp->lg_rx_group_count;
2342 cap_rings->mr_gaddring = NULL;
2343 cap_rings->mr_gremring = NULL;
2344 } else {
2345 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2346 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2347 cap_rings->mr_gnum = 0;
2348 }
2349 cap_rings->mr_rget = aggr_fill_ring;
2350 cap_rings->mr_gget = aggr_fill_group;
2351 break;
2352 }
2353 case MAC_CAPAB_AGGR:
2354 {
2355 mac_capab_aggr_t *aggr_cap;
2356
2357 if (cap_data != NULL) {
2358 aggr_cap = cap_data;
2359 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2360 aggr_cap->mca_unicst = aggr_m_unicst;
2361 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2362 aggr_cap->mca_arg = arg;
2363 }
2364 return (B_TRUE);
2365 }
2366 default:
2367 return (B_FALSE);
2368 }
2369 return (B_TRUE);
2370 }
2371
2372 /*
2373 * Callback function for MAC layer to register groups.
2374 */
2375 static void
2376 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2377 mac_group_info_t *infop, mac_group_handle_t gh)
2378 {
2379 aggr_grp_t *grp = arg;
2380
2381 if (rtype == MAC_RING_TYPE_RX) {
2382 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2383
2384 rx_group->arg_gh = gh;
2385 rx_group->arg_grp = grp;
2386
2387 infop->mgi_driver = (mac_group_driver_t)rx_group;
2388 infop->mgi_start = NULL;
2389 infop->mgi_stop = NULL;
2390 infop->mgi_addmac = aggr_addmac;
2391 infop->mgi_remmac = aggr_remmac;
2392 infop->mgi_count = rx_group->arg_ring_cnt;
2393
2394 /*
2395 * Always set the HW VLAN callbacks. They are smart
2396 * enough to know when a port has HW VLAN filters to
2397 * program and when it doesn't.
2398 */
2399 infop->mgi_addvlan = aggr_addvlan;
2400 infop->mgi_remvlan = aggr_remvlan;
2401 } else {
2402 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2403
2404 ASSERT3S(index, ==, 0);
2405 tx_group->atg_gh = gh;
2406 }
2407 }
2408
2409 /*
2410 * Callback function for MAC layer to register all rings.
2411 */
2412 static void
2413 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2414 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2415 {
2416 aggr_grp_t *grp = arg;
2417
2418 switch (rtype) {
2419 case MAC_RING_TYPE_RX: {
2420 aggr_pseudo_rx_group_t *rx_group;
2421 aggr_pseudo_rx_ring_t *rx_ring;
2422 mac_intr_t aggr_mac_intr;
2423
2424 rx_group = &grp->lg_rx_groups[rg_index];
2425 ASSERT3S(index, >=, 0);
2426 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2427 rx_ring = rx_group->arg_rings + index;
2428 rx_ring->arr_rh = rh;
2429
2430 /*
2431 * Entrypoint to enable interrupt (disable poll) and
2432 * disable interrupt (enable poll).
2433 */
2434 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2435 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2436 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2437 aggr_mac_intr.mi_ddi_handle = NULL;
2438
2439 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2440 infop->mri_start = aggr_pseudo_start_rx_ring;
2441 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2442
2443 infop->mri_intr = aggr_mac_intr;
2444 infop->mri_poll = aggr_rx_poll;
2445
2446 infop->mri_stat = aggr_rx_ring_stat;
2447 break;
2448 }
2449 case MAC_RING_TYPE_TX: {
2450 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2451 aggr_pseudo_tx_ring_t *tx_ring;
2452
2453 ASSERT(rg_index == -1);
2454 ASSERT(index < tx_group->atg_ring_cnt);
2455
2456 tx_ring = &tx_group->atg_rings[index];
2457 tx_ring->atr_rh = rh;
2458
2459 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2460 infop->mri_start = NULL;
2461 infop->mri_stop = NULL;
2508 if (!port->lp_collector_enabled) {
2509 *mpp = mp->b_next;
2510 mp->b_next = NULL;
2511 freemsg(mp);
2512 continue;
2513 }
2514 mpp = &mp->b_next;
2515 }
2516 return (mp_chain);
2517 }
2518
2519 static int
2520 aggr_addmac(void *arg, const uint8_t *mac_addr)
2521 {
2522 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2523 aggr_unicst_addr_t *addr, **pprev;
2524 aggr_grp_t *grp = rx_group->arg_grp;
2525 aggr_port_t *port, *p;
2526 mac_perim_handle_t mph;
2527 int err = 0;
2528 uint_t idx = rx_group->arg_index;
2529
2530 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2531
2532 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2533 mac_perim_exit(mph);
2534 return (0);
2535 }
2536
2537 /*
2538 * Insert this mac address into the list of mac addresses owned by
2539 * the aggregation pseudo group.
2540 */
2541 pprev = &rx_group->arg_macaddr;
2542 while ((addr = *pprev) != NULL) {
2543 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2544 mac_perim_exit(mph);
2545 return (EEXIST);
2546 }
2547 pprev = &addr->aua_next;
2548 }
2549 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2550 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2551 addr->aua_next = NULL;
2552 *pprev = addr;
2553
2554 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2555 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2556 break;
2557
2558 if (err != 0) {
2559 for (p = grp->lg_ports; p != port; p = p->lp_next)
2560 aggr_port_remmac(p, idx, mac_addr);
2561
2562 *pprev = NULL;
2563 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2564 }
2565
2566 mac_perim_exit(mph);
2567 return (err);
2568 }
2569
2570 static int
2571 aggr_remmac(void *arg, const uint8_t *mac_addr)
2572 {
2573 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2574 aggr_unicst_addr_t *addr, **pprev;
2575 aggr_grp_t *grp = rx_group->arg_grp;
2576 aggr_port_t *port;
2577 mac_perim_handle_t mph;
2578 int err = 0;
2579
2580 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2585 }
2586
2587 /*
2588 * Find this mac address in the list of mac addresses owned by
2589 * the aggregation pseudo group so that it can be removed.
2590 */
2591 pprev = &rx_group->arg_macaddr;
2592 while ((addr = *pprev) != NULL) {
2593 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2594 pprev = &addr->aua_next;
2595 continue;
2596 }
2597 break;
2598 }
2599 if (addr == NULL) {
2600 mac_perim_exit(mph);
2601 return (EINVAL);
2602 }
2603
2604 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2605 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2606
2607 *pprev = addr->aua_next;
2608 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2609
2610 mac_perim_exit(mph);
2611 return (err);
2612 }
2613
2614 /*
2615 * Search for VID in the Rx group's list and return a pointer if
2616 * found. Otherwise return NULL.
2617 */
2618 static aggr_vlan_t *
2619 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2620 {
2621 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2622 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2623 avp = list_next(&rx_group->arg_vlans, avp)) {
2624 if (avp->av_vid == vid)
2625 return (avp);
2627
2628 return (NULL);
2629 }
2630
2631 /*
2632 * Accept traffic on the specified VID.
2633 *
2634 * Persist VLAN state in the aggr so that ports added later will
2635 * receive the correct filters. In the future it would be nice to
2636 * allow aggr to iterate its clients instead of duplicating state.
2637 */
2638 static int
2639 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2640 {
2641 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2642 aggr_grp_t *aggr = rx_group->arg_grp;
2643 aggr_port_t *port, *p;
2644 mac_perim_handle_t mph;
2645 int err = 0;
2646 aggr_vlan_t *avp = NULL;
2647 uint_t idx = rx_group->arg_index;
2648
2649 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2650
2651 if (vid == MAC_VLAN_UNTAGGED) {
2652 /*
2653 * Aggr is both a MAC provider and MAC client. As a
2654 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2655 * client. As a client itself, it should pass
2656 * VLAN_ID_NONE to its ports.
2657 */
2658 vid = VLAN_ID_NONE;
2659 rx_group->arg_untagged++;
2660 goto update_ports;
2661 }
2662
2663 avp = aggr_find_vlan(rx_group, vid);
2664
2665 if (avp != NULL) {
2666 avp->av_refs++;
2667 mac_perim_exit(mph);
2668 return (0);
2669 }
2670
2671 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2672 avp->av_vid = vid;
2673 avp->av_refs = 1;
2674
2675 update_ports:
2676 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2677 if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2678 break;
2679
2680 if (err != 0) {
2681 /*
2682 * If any of these calls fail then we are in a
2683 * situation where the ports have different HW state.
2684 * There's no reasonable action the MAC client can
2685 * take in this scenario to rectify the situation.
2686 */
2687 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2688 int err2;
2689
2690 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2691 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2692 " from port %s: errno %d.", vid,
2693 mac_client_name(p->lp_mch), err2);
2694 }
2695
2696 }
2697
2698 if (vid == VLAN_ID_NONE)
2699 rx_group->arg_untagged--;
2700
2701 if (avp != NULL) {
2702 kmem_free(avp, sizeof (aggr_vlan_t));
2703 avp = NULL;
2704 }
2705 }
2706
2707 if (avp != NULL)
2708 list_insert_tail(&rx_group->arg_vlans, avp);
2709
2710 done:
2711 mac_perim_exit(mph);
2712 return (err);
2713 }
2714
2715 /*
2716 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2717 */
2718 static int
2719 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2720 {
2721 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2722 aggr_grp_t *aggr = rx_group->arg_grp;
2723 aggr_port_t *port, *p;
2724 mac_perim_handle_t mph;
2725 int err = 0;
2726 aggr_vlan_t *avp = NULL;
2727 uint_t idx = rx_group->arg_index;
2728
2729 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2730
2731 /*
2732 * See the comment in aggr_addvlan().
2733 */
2734 if (vid == MAC_VLAN_UNTAGGED) {
2735 vid = VLAN_ID_NONE;
2736 rx_group->arg_untagged--;
2737
2738 if (rx_group->arg_untagged > 0)
2739 goto done;
2740
2741 goto update_ports;
2742 }
2743
2744 avp = aggr_find_vlan(rx_group, vid);
2745
2746 if (avp == NULL) {
2747 err = ENOENT;
2748 goto done;
2749 }
2750
2751 avp->av_refs--;
2752
2753 if (avp->av_refs > 0)
2754 goto done;
2755
2756 update_ports:
2757 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2758 if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2759 break;
2760
2761 /*
2762 * See the comment in aggr_addvlan() for justification of the
2763 * use of VERIFY here.
2764 */
2765 if (err != 0) {
2766 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2767 int err2;
2768
2769 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2770 cmn_err(CE_WARN, "Failed to add VLAN %u"
2771 " to port %s: errno %d.", vid,
2772 mac_client_name(p->lp_mch), err2);
2773 }
2774 }
2775
2776 if (avp != NULL)
2777 avp->av_refs++;
2778
2779 if (vid == VLAN_ID_NONE)
2780 rx_group->arg_untagged++;
2781
2782 goto done;
2783 }
2784
2785 if (err == 0 && avp != NULL) {
2786 VERIFY3U(avp->av_refs, ==, 0);
2787 list_remove(&rx_group->arg_vlans, avp);
2788 kmem_free(avp, sizeof (aggr_vlan_t));
2789 }
|