1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Joyent, Inc.
24 */
25
26 /*
27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28 *
29 * An instance of the structure aggr_grp_t is allocated for each
30 * link aggregation group. When created, aggr_grp_t objects are
31 * entered into the aggr_grp_hash hash table maintained by the modhash
32 * module. The hash key is the linkid associated with the link
33 * aggregation group.
34 *
 * A set of MAC ports is associated with each aggregation group.
36 *
37 * Aggr pseudo TX rings
38 * --------------------
39 * The underlying ports (NICs) in an aggregation can have TX rings. To
40 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new;
 * it is already implemented on the RX side, where they are called pseudo
 * RX rings. The same concept is extended to the TX side where
44 * each TX ring of an underlying port is reflected in aggr as a pseudo
45 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
46 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
47 * TX ring is given to the aggregation layer.
48 *
49 * With this change, the outgoing stack depth looks much better:
50 *
51 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
53 *
54 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
55 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
56 *
57 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
58 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
59 * ring belonging to a port on which the packet has to be sent.
60 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
61 * policy and then uses the fanout_hint passed to it to pick a TX ring from
62 * the selected port.
63 *
64 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
65 * bandwidth limit is applied first on the outgoing packet and the packets
66 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
67 * particular TX ring.
68 */
69
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
88 #include <sys/dlpi.h>
89 #include <sys/zone.h>
90 #include <sys/mac_provider.h>
91 #include <sys/dls.h>
92 #include <sys/vlan.h>
93 #include <sys/aggr.h>
94 #include <sys/aggr_impl.h>
95
/*
 * MAC entry points implemented by the aggregation driver. Most of these
 * are handed to the MAC layer through the aggr_m_callbacks table below.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port lookup and removal helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group-wide capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo RX group/ring management and the ring/group driver entry points. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
134
/*
 * Module-wide state. All of it is set up in aggr_grp_init() and torn down
 * in aggr_grp_fini().
 */
static kmem_cache_t *aggr_grp_cache;	/* cache of aggr_grp_t objects */
static mod_hash_t *aggr_grp_hash;	/* groups, hashed by linkid */
static krwlock_t aggr_grp_lock;		/* held when reading aggr_grp_cnt */
static uint_t aggr_grp_cnt;		/* value returned by aggr_grp_count() */
static id_space_t *key_ids;		/* keys above AGGR_MAX_KEY */

/* Number of hash chains requested for aggr_grp_hash. */
#define GRP_HASHSZ 64
/* Convert a linkid into the key type expected by the modhash routines. */
#define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
#define AGGR_PORT_NAME_DELIMIT '-'

/* An all-zero Ethernet address. */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};

/* Flag word stored as the first member of aggr_m_callbacks. */
#define AGGR_M_CALLBACK_FLAGS \
    (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
149
/*
 * MAC callback vector for the aggregation driver. The initializer is
 * positional: the first member is the flag word AGGR_M_CALLBACK_FLAGS and
 * the remaining members are entry points, with NULL wherever this file
 * supplies none.
 * NOTE(review): the meaning of each slot is fixed by the declaration of
 * mac_callbacks_t, which is not visible here -- confirm against the MAC
 * provider header before reordering anything.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};
168
169 /*ARGSUSED*/
170 static int
171 aggr_grp_constructor(void *buf, void *arg, int kmflag)
172 {
173 aggr_grp_t *grp = buf;
174
175 bzero(grp, sizeof (*grp));
176 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
177 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
178 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
179 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
180 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
181 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
182 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
183 grp->lg_link_state = LINK_STATE_UNKNOWN;
184 return (0);
185 }
186
187 /*ARGSUSED*/
188 static void
189 aggr_grp_destructor(void *buf, void *arg)
190 {
191 aggr_grp_t *grp = buf;
192
193 if (grp->lg_tx_ports != NULL) {
194 kmem_free(grp->lg_tx_ports,
195 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
196 }
197
198 mutex_destroy(&grp->lg_lacp_lock);
199 cv_destroy(&grp->lg_lacp_cv);
200 mutex_destroy(&grp->lg_port_lock);
201 cv_destroy(&grp->lg_port_cv);
202 rw_destroy(&grp->lg_tx_lock);
203 mutex_destroy(&grp->lg_tx_flowctl_lock);
204 cv_destroy(&grp->lg_tx_flowctl_cv);
205 }
206
207 void
208 aggr_grp_init(void)
209 {
210 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
211 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
212 aggr_grp_destructor, NULL, NULL, NULL, 0);
213
214 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
215 GRP_HASHSZ, mod_hash_null_valdtor);
216 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
217 aggr_grp_cnt = 0;
218
219 /*
220 * Allocate an id space to manage key values (when key is not
221 * specified). The range of the id space will be from
222 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
223 * uses a 16-bit key.
224 */
225 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
226 ASSERT(key_ids != NULL);
227 }
228
/*
 * Release the module-wide state created by aggr_grp_init(): the key id
 * space, the group lock, the linkid hash table and the aggr_grp_t cache.
 */
void
aggr_grp_fini(void)
{
	id_space_destroy(key_ids);
	rw_destroy(&aggr_grp_lock);
	mod_hash_destroy_idhash(aggr_grp_hash);
	kmem_cache_destroy(aggr_grp_cache);
}
237
238 uint_t
239 aggr_grp_count(void)
240 {
241 uint_t count;
242
243 rw_enter(&aggr_grp_lock, RW_READER);
244 count = aggr_grp_cnt;
245 rw_exit(&aggr_grp_lock);
246 return (count);
247 }
248
249 /*
250 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
251 * requires the mac perimeter, this function holds a reference of the aggr
252 * and aggr won't call mac_unregister() until this reference drops to 0.
253 */
254 void
255 aggr_grp_port_hold(aggr_port_t *port)
256 {
257 aggr_grp_t *grp = port->lp_grp;
258
259 AGGR_PORT_REFHOLD(port);
260 mutex_enter(&grp->lg_port_lock);
261 grp->lg_port_ref++;
262 mutex_exit(&grp->lg_port_lock);
263 }
264
265 /*
266 * Release the reference of the grp and inform aggr_grp_delete() calling
267 * mac_unregister() is now safe.
268 */
269 void
270 aggr_grp_port_rele(aggr_port_t *port)
271 {
272 aggr_grp_t *grp = port->lp_grp;
273
274 mutex_enter(&grp->lg_port_lock);
275 if (--grp->lg_port_ref == 0)
276 cv_signal(&grp->lg_port_cv);
277 mutex_exit(&grp->lg_port_lock);
278 AGGR_PORT_REFRELE(port);
279 }
280
281 /*
282 * Wait for the port's lacp timer thread and the port's notification callback
283 * to exit.
284 */
285 void
286 aggr_grp_port_wait(aggr_grp_t *grp)
287 {
288 mutex_enter(&grp->lg_port_lock);
289 if (grp->lg_port_ref != 0)
290 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
291 mutex_exit(&grp->lg_port_lock);
292 }
293
294 /*
295 * Attach a port to a link aggregation group.
296 *
297 * A port is attached to a link aggregation group once its speed
298 * and link state have been verified.
299 *
300 * Returns B_TRUE if the group link state or speed has changed. If
301 * it's the case, the caller must notify the MAC layer via a call
302 * to mac_link().
303 */
304 boolean_t
305 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
306 {
307 boolean_t link_state_changed = B_FALSE;
308
309 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
310 ASSERT(MAC_PERIM_HELD(port->lp_mh));
311
312 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
313 return (B_FALSE);
314
315 /*
316 * Validate the MAC port link speed and update the group
317 * link speed if needed.
318 */
319 if (port->lp_ifspeed == 0 ||
320 port->lp_link_state != LINK_STATE_UP ||
321 port->lp_link_duplex != LINK_DUPLEX_FULL) {
322 /*
323 * Can't attach a MAC port with unknown link speed,
324 * down link, or not in full duplex mode.
325 */
326 return (B_FALSE);
327 }
328
329 mutex_enter(&grp->lg_stat_lock);
330 if (grp->lg_ifspeed == 0) {
331 /*
332 * The group inherits the speed of the first link being
333 * attached.
334 */
335 grp->lg_ifspeed = port->lp_ifspeed;
336 link_state_changed = B_TRUE;
337 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
338 /*
339 * The link speed of the MAC port must be the same as
340 * the group link speed, as per 802.3ad. Since it is
341 * not, the attach is cancelled.
342 */
343 mutex_exit(&grp->lg_stat_lock);
344 return (B_FALSE);
345 }
346 mutex_exit(&grp->lg_stat_lock);
347
348 grp->lg_nattached_ports++;
349
350 /*
351 * Update the group link state.
352 */
353 if (grp->lg_link_state != LINK_STATE_UP) {
354 grp->lg_link_state = LINK_STATE_UP;
355 mutex_enter(&grp->lg_stat_lock);
356 grp->lg_link_duplex = LINK_DUPLEX_FULL;
357 mutex_exit(&grp->lg_stat_lock);
358 link_state_changed = B_TRUE;
359 }
360
361 /*
362 * Update port's state.
363 */
364 port->lp_state = AGGR_PORT_STATE_ATTACHED;
365
366 aggr_grp_multicst_port(port, B_TRUE);
367
368 /*
369 * Set port's receive callback
370 */
371 mac_rx_set(port->lp_mch, aggr_recv_cb, port);
372
373 /*
374 * If LACP is OFF, the port can be used to send data as soon
375 * as its link is up and verified to be compatible with the
376 * aggregation.
377 *
378 * If LACP is active or passive, notify the LACP subsystem, which
379 * will enable sending on the port following the LACP protocol.
380 */
381 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
382 aggr_send_port_enable(port);
383 else
384 aggr_lacp_port_attached(port);
385
386 return (link_state_changed);
387 }
388
389 boolean_t
390 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
391 {
392 boolean_t link_state_changed = B_FALSE;
393
394 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
395 ASSERT(MAC_PERIM_HELD(port->lp_mh));
396
397 /* update state */
398 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
399 return (B_FALSE);
400
401 mac_rx_clear(port->lp_mch);
402
403 aggr_grp_multicst_port(port, B_FALSE);
404
405 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
406 aggr_send_port_disable(port);
407 else
408 aggr_lacp_port_detached(port);
409
410 port->lp_state = AGGR_PORT_STATE_STANDBY;
411
412 grp->lg_nattached_ports--;
413 if (grp->lg_nattached_ports == 0) {
414 /* the last attached MAC port of the group is being detached */
415 grp->lg_link_state = LINK_STATE_DOWN;
416 mutex_enter(&grp->lg_stat_lock);
417 grp->lg_ifspeed = 0;
418 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
419 mutex_exit(&grp->lg_stat_lock);
420 link_state_changed = B_TRUE;
421 }
422
423 return (link_state_changed);
424 }
425
426 /*
427 * Update the MAC addresses of the constituent ports of the specified
428 * group. This function is invoked:
429 * - after creating a new aggregation group.
430 * - after adding new ports to an aggregation group.
431 * - after removing a port from a group when the MAC address of
432 * that port was used for the MAC address of the group.
433 * - after the MAC address of a port changed when the MAC address
434 * of that port was used for the MAC address of the group.
435 *
436 * Return true if the link state of the aggregation changed, for example
437 * as a result of a failure changing the MAC address of one of the
438 * constituent ports.
439 */
440 boolean_t
441 aggr_grp_update_ports_mac(aggr_grp_t *grp)
442 {
443 aggr_port_t *cport;
444 boolean_t link_state_changed = B_FALSE;
445 mac_perim_handle_t mph;
446
447 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
448
449 for (cport = grp->lg_ports; cport != NULL;
450 cport = cport->lp_next) {
451 mac_perim_enter_by_mh(cport->lp_mh, &mph);
452 if (aggr_port_unicst(cport) != 0) {
453 if (aggr_grp_detach_port(grp, cport))
454 link_state_changed = B_TRUE;
455 } else {
456 /*
457 * If a port was detached because of a previous
458 * failure changing the MAC address, the port is
459 * reattached when it successfully changes the MAC
460 * address now, and this might cause the link state
461 * of the aggregation to change.
462 */
463 if (aggr_grp_attach_port(grp, cport))
464 link_state_changed = B_TRUE;
465 }
466 mac_perim_exit(mph);
467 }
468 return (link_state_changed);
469 }
470
471 /*
472 * Invoked when the MAC address of a port has changed. If the port's
473 * MAC address was used for the group MAC address, set mac_addr_changedp
474 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
475 * notification. If the link state changes due to detach/attach of
476 * the constituent port, set link_state_changedp to B_TRUE to indicate
477 * to the caller that it should send a MAC_NOTE_LINK notification. In both
478 * cases, it is the responsibility of the caller to invoke notification
479 * functions after releasing the the port lock.
480 */
481 void
482 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
483 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
484 {
485 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
486 ASSERT(MAC_PERIM_HELD(port->lp_mh));
487 ASSERT(mac_addr_changedp != NULL);
488 ASSERT(link_state_changedp != NULL);
489
490 *mac_addr_changedp = B_FALSE;
491 *link_state_changedp = B_FALSE;
492
493 if (grp->lg_addr_fixed) {
494 /*
495 * The group is using a fixed MAC address or an automatic
496 * MAC address has not been set.
497 */
498 return;
499 }
500
501 if (grp->lg_mac_addr_port == port) {
502 /*
503 * The MAC address of the port was assigned to the group
504 * MAC address. Update the group MAC address.
505 */
506 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
507 *mac_addr_changedp = B_TRUE;
508 } else {
509 /*
510 * Update the actual port MAC address to the MAC address
511 * of the group.
512 */
513 if (aggr_port_unicst(port) != 0) {
514 *link_state_changedp = aggr_grp_detach_port(grp, port);
515 } else {
516 /*
517 * If a port was detached because of a previous
518 * failure changing the MAC address, the port is
519 * reattached when it successfully changes the MAC
520 * address now, and this might cause the link state
521 * of the aggregation to change.
522 */
523 *link_state_changedp = aggr_grp_attach_port(grp, port);
524 }
525 }
526 }
527
528 /*
529 * Add a port to a link aggregation group.
530 */
531 static int
532 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
533 aggr_port_t **pp)
534 {
535 aggr_port_t *port, **cport;
536 mac_perim_handle_t mph;
537 zoneid_t port_zoneid = ALL_ZONES;
538 int err;
539
540 /* The port must be int the same zone as the aggregation. */
541 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
542 port_zoneid = GLOBAL_ZONEID;
543 if (grp->lg_zoneid != port_zoneid)
544 return (EBUSY);
545
546 /*
547 * lg_mh could be NULL when the function is called during the creation
548 * of the aggregation.
549 */
550 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
551
552 /* create new port */
553 err = aggr_port_create(grp, port_linkid, force, &port);
554 if (err != 0)
555 return (err);
556
557 mac_perim_enter_by_mh(port->lp_mh, &mph);
558
559 /* add port to list of group constituent ports */
560 cport = &grp->lg_ports;
561 while (*cport != NULL)
562 cport = &((*cport)->lp_next);
563 *cport = port;
564
565 /*
566 * Back reference to the group it is member of. A port always
567 * holds a reference to its group to ensure that the back
568 * reference is always valid.
569 */
570 port->lp_grp = grp;
571 AGGR_GRP_REFHOLD(grp);
572 grp->lg_nports++;
573
574 aggr_lacp_init_port(port);
575 mac_perim_exit(mph);
576
577 if (pp != NULL)
578 *pp = port;
579
580 return (0);
581 }
582
583 /*
584 * This is called in response to either our LACP state machine or a MAC
585 * notification that the link has gone down via aggr_send_port_disable(). At
586 * this point, we may need to update our default ring. To that end, we go
587 * through the set of ports (underlying datalinks in an aggregation) that are
588 * currently enabled to transmit data. If all our links have been disabled for
589 * transmit, then we don't do anything.
590 *
591 * Note, because we only have a single TX group, we don't have to worry about
592 * the rings moving between groups and the chance that mac will reassign it
593 * unless someone removes a port, at which point, we play it safe and call this
594 * again.
595 */
596 void
597 aggr_grp_update_default(aggr_grp_t *grp)
598 {
599 aggr_port_t *port;
600 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
601
602 rw_enter(&grp->lg_tx_lock, RW_WRITER);
603
604 if (grp->lg_ntx_ports == 0) {
605 rw_exit(&grp->lg_tx_lock);
606 return;
607 }
608
609 port = grp->lg_tx_ports[0];
610 ASSERT(port->lp_tx_ring_cnt > 0);
611 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
612 rw_exit(&grp->lg_tx_lock);
613 }
614
615 /*
616 * Add a pseudo RX ring for the given HW ring handle.
617 */
618 static int
619 aggr_add_pseudo_rx_ring(aggr_port_t *port,
620 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
621 {
622 aggr_pseudo_rx_ring_t *ring;
623 int err;
624 int j;
625
626 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
627 ring = rx_grp->arg_rings + j;
628 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
629 break;
630 }
631
632 /*
633 * No slot for this new RX ring.
634 */
635 if (j == MAX_RINGS_PER_GROUP)
636 return (EIO);
637
638 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
639 ring->arr_hw_rh = hw_rh;
640 ring->arr_port = port;
641 rx_grp->arg_ring_cnt++;
642
643 /*
644 * The group is already registered, dynamically add a new ring to the
645 * mac group.
646 */
647 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
648 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
649 ring->arr_hw_rh = NULL;
650 ring->arr_port = NULL;
651 rx_grp->arg_ring_cnt--;
652 } else {
653 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
654 mac_find_ring(rx_grp->arg_gh, j));
655 }
656 return (err);
657 }
658
659 /*
660 * Remove the pseudo RX ring of the given HW ring handle.
661 */
662 static void
663 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
664 {
665 aggr_pseudo_rx_ring_t *ring;
666 int j;
667
668 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
669 ring = rx_grp->arg_rings + j;
670 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
671 ring->arr_hw_rh != hw_rh) {
672 continue;
673 }
674
675 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
676
677 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
678 ring->arr_hw_rh = NULL;
679 ring->arr_port = NULL;
680 rx_grp->arg_ring_cnt--;
681 mac_hwring_teardown(hw_rh);
682 break;
683 }
684 }
685
686 /*
687 * Create pseudo rings over the HW rings of the port.
688 *
689 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
690 *
691 * o Program existing unicast filters on the pseudo group into the HW group.
692 *
693 * o Program existing VLAN filters on the pseudo group into the HW group.
694 */
695 static int
696 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
697 {
698 aggr_grp_t *grp = port->lp_grp;
699 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
700 aggr_unicst_addr_t *addr, *a;
701 mac_perim_handle_t pmph;
702 aggr_vlan_t *avp;
703 int hw_rh_cnt, i = 0, j;
704 int err = 0;
705
706 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
707 mac_perim_enter_by_mh(port->lp_mh, &pmph);
708
709 /*
710 * This function must be called after the aggr registers its MAC
711 * and its Rx group has been initialized.
712 */
713 ASSERT(rx_grp->arg_gh != NULL);
714
715 /*
716 * Get the list of the underlying HW rings.
717 */
718 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
719 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
720
721 if (port->lp_hwgh != NULL) {
722 /*
723 * Quiesce the HW ring and the MAC SRS on the ring. Note
724 * that the HW ring will be restarted when the pseudo ring
725 * is started. At that time all the packets will be
726 * directly passed up to the pseudo Rx ring and handled
727 * by MAC SRS created over the pseudo Rx ring.
728 */
729 mac_rx_client_quiesce(port->lp_mch);
730 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
731 }
732
733 /*
734 * Add existing VLAN and unicast address filters to the port.
735 */
736 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
737 avp = list_next(&rx_grp->arg_vlans, avp)) {
738 if ((err = aggr_port_addvlan(port, avp->av_vid)) != 0)
739 goto err;
740 }
741
742 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
743 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
744 goto err;
745 }
746
747 for (i = 0; i < hw_rh_cnt; i++) {
748 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
749 if (err != 0)
750 goto err;
751 }
752
753 port->lp_rx_grp_added = B_TRUE;
754 mac_perim_exit(pmph);
755 return (0);
756
757 err:
758 ASSERT(err != 0);
759
760 for (j = 0; j < i; j++)
761 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
762
763 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
764 aggr_port_remmac(port, a->aua_addr);
765
766 if (avp != NULL)
767 avp = list_prev(&rx_grp->arg_vlans, avp);
768
769 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
770 int err2;
771
772 if ((err2 = aggr_port_remvlan(port, avp->av_vid)) != 0) {
773 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
774 ": errno %d.", avp->av_vid,
775 mac_client_name(port->lp_mch), err2);
776 }
777 }
778
779 if (port->lp_hwgh != NULL) {
780 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
781 mac_rx_client_restart(port->lp_mch);
782 port->lp_hwgh = NULL;
783 }
784
785 mac_perim_exit(pmph);
786 return (err);
787 }
788
789 /*
790 * Destroy the pseudo rings mapping to this port and remove all VLAN
791 * and unicast filters from this port. Even if there are no underlying
792 * HW rings we must still remove the unicast filters to take the port
793 * out of promisc mode.
794 */
795 static void
796 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
797 {
798 aggr_grp_t *grp = port->lp_grp;
799 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
800 aggr_unicst_addr_t *addr;
801 mac_group_handle_t hwgh;
802 mac_perim_handle_t pmph;
803 int hw_rh_cnt, i;
804
805 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
806 mac_perim_enter_by_mh(port->lp_mh, &pmph);
807
808 if (!port->lp_rx_grp_added)
809 goto done;
810
811 ASSERT(rx_grp->arg_gh != NULL);
812 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
813 &hwgh, hw_rh, MAC_RING_TYPE_RX);
814
815 for (i = 0; i < hw_rh_cnt; i++)
816 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
817
818 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
819 aggr_port_remmac(port, addr->aua_addr);
820
821 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
822 avp = list_next(&rx_grp->arg_vlans, avp)) {
823 int err;
824
825 if ((err = aggr_port_remvlan(port, avp->av_vid)) != 0) {
826 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
827 ": errno %d.", avp->av_vid,
828 mac_client_name(port->lp_mch), err);
829 }
830 }
831
832 if (port->lp_hwgh != NULL) {
833 port->lp_hwgh = NULL;
834
835 /*
836 * First clear the permanent-quiesced flag of the RX srs then
837 * restart the HW ring and the mac srs on the ring. Note that
838 * the HW ring and associated SRS will soon been removed when
839 * the port is removed from the aggr.
840 */
841 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
842 mac_rx_client_restart(port->lp_mch);
843 }
844
845 port->lp_rx_grp_added = B_FALSE;
846 done:
847 mac_perim_exit(pmph);
848 }
849
850 /*
851 * Add a pseudo TX ring for the given HW ring handle.
852 */
853 static int
854 aggr_add_pseudo_tx_ring(aggr_port_t *port,
855 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
856 mac_ring_handle_t *pseudo_rh)
857 {
858 aggr_pseudo_tx_ring_t *ring;
859 int err;
860 int i;
861
862 ASSERT(MAC_PERIM_HELD(port->lp_mh));
863 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
864 ring = tx_grp->atg_rings + i;
865 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
866 break;
867 }
868 /*
869 * No slot for this new TX ring.
870 */
871 if (i == MAX_RINGS_PER_GROUP)
872 return (EIO);
873 /*
874 * The following 4 statements needs to be done before
875 * calling mac_group_add_ring(). Otherwise it will
876 * result in an assertion failure in mac_init_ring().
877 */
878 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
879 ring->atr_hw_rh = hw_rh;
880 ring->atr_port = port;
881 tx_grp->atg_ring_cnt++;
882
883 /*
884 * The TX side has no concept of ring groups unlike RX groups.
885 * There is just a single group which stores all the TX rings.
886 * This group will be used to store aggr's pseudo TX rings.
887 */
888 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
889 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
890 ring->atr_hw_rh = NULL;
891 ring->atr_port = NULL;
892 tx_grp->atg_ring_cnt--;
893 } else {
894 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
895 if (hw_rh != NULL) {
896 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
897 mac_find_ring(tx_grp->atg_gh, i));
898 }
899 }
900
901 return (err);
902 }
903
904 /*
905 * Remove the pseudo TX ring of the given HW ring handle.
906 */
907 static void
908 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
909 mac_ring_handle_t pseudo_hw_rh)
910 {
911 aggr_pseudo_tx_ring_t *ring;
912 int i;
913
914 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
915 ring = tx_grp->atg_rings + i;
916 if (ring->atr_rh != pseudo_hw_rh)
917 continue;
918
919 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
920 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
921 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
922 mac_hwring_teardown(ring->atr_hw_rh);
923 ring->atr_hw_rh = NULL;
924 ring->atr_port = NULL;
925 tx_grp->atg_ring_cnt--;
926 break;
927 }
928 }
929
930 /*
931 * This function is called to create pseudo rings over hardware rings of
932 * the underlying device. There is a 1:1 mapping between the pseudo TX
933 * rings of the aggr and the hardware rings of the underlying port.
934 */
935 static int
936 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
937 {
938 aggr_grp_t *grp = port->lp_grp;
939 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
940 mac_perim_handle_t pmph;
941 int hw_rh_cnt, i = 0, j;
942 int err = 0;
943
944 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
945 mac_perim_enter_by_mh(port->lp_mh, &pmph);
946
947 /*
948 * Get the list the the underlying HW rings.
949 */
950 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
951 NULL, hw_rh, MAC_RING_TYPE_TX);
952
953 /*
954 * Even if the underlying NIC does not have TX rings, we
955 * still make a psuedo TX ring for that NIC with NULL as
956 * the ring handle.
957 */
958 if (hw_rh_cnt == 0)
959 port->lp_tx_ring_cnt = 1;
960 else
961 port->lp_tx_ring_cnt = hw_rh_cnt;
962
963 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
964 port->lp_tx_ring_cnt), KM_SLEEP);
965 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
966 port->lp_tx_ring_cnt), KM_SLEEP);
967
968 if (hw_rh_cnt == 0) {
969 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
970 NULL, &pseudo_rh)) == 0) {
971 port->lp_tx_rings[0] = NULL;
972 port->lp_pseudo_tx_rings[0] = pseudo_rh;
973 }
974 } else {
975 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
976 err = aggr_add_pseudo_tx_ring(port,
977 tx_grp, hw_rh[i], &pseudo_rh);
978 if (err != 0)
979 break;
980 port->lp_tx_rings[i] = hw_rh[i];
981 port->lp_pseudo_tx_rings[i] = pseudo_rh;
982 }
983 }
984
985 if (err != 0) {
986 if (hw_rh_cnt != 0) {
987 for (j = 0; j < i; j++) {
988 aggr_rem_pseudo_tx_ring(tx_grp,
989 port->lp_pseudo_tx_rings[j]);
990 }
991 }
992 kmem_free(port->lp_tx_rings,
993 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
994 kmem_free(port->lp_pseudo_tx_rings,
995 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
996 port->lp_tx_ring_cnt = 0;
997 } else {
998 port->lp_tx_grp_added = B_TRUE;
999 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1000 aggr_tx_ring_update, port);
1001 }
1002 mac_perim_exit(pmph);
1003 aggr_grp_update_default(grp);
1004 return (err);
1005 }
1006
1007 /*
1008 * This function is called by aggr to remove pseudo TX rings over the
1009 * HW rings of the underlying port.
1010 */
1011 static void
1012 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1013 {
1014 aggr_grp_t *grp = port->lp_grp;
1015 mac_perim_handle_t pmph;
1016 int i;
1017
1018 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1019 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1020
1021 if (!port->lp_tx_grp_added)
1022 goto done;
1023
1024 ASSERT(tx_grp->atg_gh != NULL);
1025
1026 for (i = 0; i < port->lp_tx_ring_cnt; i++)
1027 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1028
1029 kmem_free(port->lp_tx_rings,
1030 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1031 kmem_free(port->lp_pseudo_tx_rings,
1032 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1033
1034 port->lp_tx_ring_cnt = 0;
1035 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1036 port->lp_tx_grp_added = B_FALSE;
1037 aggr_grp_update_default(grp);
1038 done:
1039 mac_perim_exit(pmph);
1040 }
1041
1042 static int
1043 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1044 {
1045 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1046 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1047 }
1048
1049 static int
1050 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1051 {
1052 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1054 }
1055
1056 /*
1057 * Here we need to start the pseudo-ring. As MAC already ensures that the
1058 * underlying device is set up, all we need to do is save the ring generation.
1059 *
1060 * Note, we don't end up wanting to use the underlying mac_hwring_start/stop
1061 * functions here as those don't actually stop and start the ring, they just
1062 * quiesce the ring. Regardless of whether the aggr is logically up or not, we
1063 * want to make sure that we can receive traffic for LACP.
1064 */
1065 static int
1066 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1067 {
1068 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1069
1070 rr_ring->arr_gen = mr_gen;
1071 return (0);
1072 }
1073
1074 /*
1075 * Add one or more ports to an existing link aggregation group.
1076 */
1077 int
1078 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1079 laioc_port_t *ports)
1080 {
1081 int rc, i, nadded = 0;
1082 aggr_grp_t *grp = NULL;
1083 aggr_port_t *port;
1084 boolean_t link_state_changed = B_FALSE;
1085 mac_perim_handle_t mph, pmph;
1086
1087 /* get group corresponding to linkid */
1088 rw_enter(&aggr_grp_lock, RW_READER);
1089 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1090 (mod_hash_val_t *)&grp) != 0) {
1091 rw_exit(&aggr_grp_lock);
1092 return (ENOENT);
1093 }
1094 AGGR_GRP_REFHOLD(grp);
1095
1096 /*
1097 * Hold the perimeter so that the aggregation won't be destroyed.
1098 */
1099 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1100 rw_exit(&aggr_grp_lock);
1101
1102 /* add the specified ports to group */
1103 for (i = 0; i < nports; i++) {
1104 /* add port to group */
1105 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1106 force, &port)) != 0) {
1107 goto bail;
1108 }
1109 ASSERT(port != NULL);
1110 nadded++;
1111
1112 /* check capabilities */
1113 if (!aggr_grp_capab_check(grp, port) ||
1114 !aggr_grp_sdu_check(grp, port) ||
1115 !aggr_grp_margin_check(grp, port)) {
1116 rc = ENOTSUP;
1117 goto bail;
1118 }
1119
1120 /*
1121 * Create the pseudo ring for each HW ring of the underlying
1122 * port.
1123 */
1124 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1125 if (rc != 0)
1126 goto bail;
1127 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1128 if (rc != 0)
1129 goto bail;
1130
1131 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1132
1133 /* set LACP mode */
1134 aggr_port_lacp_set_mode(grp, port);
1135
1136 /* start port if group has already been started */
1137 if (grp->lg_started) {
1138 rc = aggr_port_start(port);
1139 if (rc != 0) {
1140 mac_perim_exit(pmph);
1141 goto bail;
1142 }
1143
1144 /*
1145 * Turn on the promiscuous mode over the port when it
1146 * is requested to be turned on to receive the
1147 * non-primary address over a port, or the promiscous
1148 * mode is enabled over the aggr.
1149 */
1150 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1151 rc = aggr_port_promisc(port, B_TRUE);
1152 if (rc != 0) {
1153 mac_perim_exit(pmph);
1154 goto bail;
1155 }
1156 }
1157 }
1158 mac_perim_exit(pmph);
1159
1160 /*
1161 * Attach each port if necessary.
1162 */
1163 if (aggr_port_notify_link(grp, port))
1164 link_state_changed = B_TRUE;
1165
1166 /*
1167 * Initialize the callback functions for this port.
1168 */
1169 aggr_port_init_callbacks(port);
1170 }
1171
1172 /* update the MAC address of the constituent ports */
1173 if (aggr_grp_update_ports_mac(grp))
1174 link_state_changed = B_TRUE;
1175
1176 if (link_state_changed)
1177 mac_link_update(grp->lg_mh, grp->lg_link_state);
1178
1179 bail:
1180 if (rc != 0) {
1181 /* stop and remove ports that have been added */
1182 for (i = 0; i < nadded; i++) {
1183 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1184 ASSERT(port != NULL);
1185 if (grp->lg_started) {
1186 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1187 (void) aggr_port_promisc(port, B_FALSE);
1188 aggr_port_stop(port);
1189 mac_perim_exit(pmph);
1190 }
1191 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1192 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1193 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1194 }
1195 }
1196
1197 mac_perim_exit(mph);
1198 AGGR_GRP_REFRELE(grp);
1199 return (rc);
1200 }
1201
1202 static int
1203 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1204 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1205 aggr_lacp_timer_t lacp_timer)
1206 {
1207 boolean_t mac_addr_changed = B_FALSE;
1208 boolean_t link_state_changed = B_FALSE;
1209 mac_perim_handle_t pmph;
1210
1211 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1212
1213 /* validate fixed address if specified */
1214 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1215 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1216 (mac_addr[0] & 0x01))) {
1217 return (EINVAL);
1218 }
1219
1220 /* update policy if requested */
1221 if (update_mask & AGGR_MODIFY_POLICY)
1222 aggr_send_update_policy(grp, policy);
1223
1224 /* update unicast MAC address if requested */
1225 if (update_mask & AGGR_MODIFY_MAC) {
1226 if (mac_fixed) {
1227 /* user-supplied MAC address */
1228 grp->lg_mac_addr_port = NULL;
1229 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1230 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1231 mac_addr_changed = B_TRUE;
1232 }
1233 } else if (grp->lg_addr_fixed) {
1234 /* switch from user-supplied to automatic */
1235 aggr_port_t *port = grp->lg_ports;
1236
1237 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1238 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1239 grp->lg_mac_addr_port = port;
1240 mac_addr_changed = B_TRUE;
1241 mac_perim_exit(pmph);
1242 }
1243 grp->lg_addr_fixed = mac_fixed;
1244 }
1245
1246 if (mac_addr_changed)
1247 link_state_changed = aggr_grp_update_ports_mac(grp);
1248
1249 if (update_mask & AGGR_MODIFY_LACP_MODE)
1250 aggr_lacp_update_mode(grp, lacp_mode);
1251
1252 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1253 aggr_lacp_update_timer(grp, lacp_timer);
1254
1255 if (link_state_changed)
1256 mac_link_update(grp->lg_mh, grp->lg_link_state);
1257
1258 if (mac_addr_changed)
1259 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1260
1261 return (0);
1262 }
1263
1264 /*
1265 * Update properties of an existing link aggregation group.
1266 */
1267 int
1268 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1269 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1270 aggr_lacp_timer_t lacp_timer)
1271 {
1272 aggr_grp_t *grp = NULL;
1273 mac_perim_handle_t mph;
1274 int err;
1275
1276 /* get group corresponding to linkid */
1277 rw_enter(&aggr_grp_lock, RW_READER);
1278 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1279 (mod_hash_val_t *)&grp) != 0) {
1280 rw_exit(&aggr_grp_lock);
1281 return (ENOENT);
1282 }
1283 AGGR_GRP_REFHOLD(grp);
1284
1285 /*
1286 * Hold the perimeter so that the aggregation won't be destroyed.
1287 */
1288 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1289 rw_exit(&aggr_grp_lock);
1290
1291 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1292 mac_addr, lacp_mode, lacp_timer);
1293
1294 mac_perim_exit(mph);
1295 AGGR_GRP_REFRELE(grp);
1296 return (err);
1297 }
1298
1299 /*
1300 * Create a new link aggregation group upon request from administrator.
1301 * Returns 0 on success, an errno on failure.
1302 */
1303 int
1304 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1305 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1306 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1307 cred_t *credp)
1308 {
1309 aggr_grp_t *grp = NULL;
1310 aggr_port_t *port;
1311 mac_register_t *mac;
1312 boolean_t link_state_changed;
1313 mac_perim_handle_t mph;
1314 int err;
1315 int i;
1316 kt_did_t tid = 0;
1317
1318 /* need at least one port */
1319 if (nports == 0)
1320 return (EINVAL);
1321
1322 rw_enter(&aggr_grp_lock, RW_WRITER);
1323
1324 /* does a group with the same linkid already exist? */
1325 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1326 (mod_hash_val_t *)&grp);
1327 if (err == 0) {
1328 rw_exit(&aggr_grp_lock);
1329 return (EEXIST);
1330 }
1331
1332 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1333
1334 grp->lg_refs = 1;
1335 grp->lg_closing = B_FALSE;
1336 grp->lg_force = force;
1337 grp->lg_linkid = linkid;
1338 grp->lg_zoneid = crgetzoneid(credp);
1339 grp->lg_ifspeed = 0;
1340 grp->lg_link_state = LINK_STATE_UNKNOWN;
1341 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1342 grp->lg_started = B_FALSE;
1343 grp->lg_promisc = B_FALSE;
1344 grp->lg_lacp_done = B_FALSE;
1345 grp->lg_tx_notify_done = B_FALSE;
1346 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1347 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1348 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1349 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1350 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1351 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1352 MAX_RINGS_PER_GROUP), KM_SLEEP);
1353 grp->lg_tx_blocked_cnt = 0;
1354 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1355 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1356 aggr_lacp_init_grp(grp);
1357
1358 grp->lg_rx_group.arg_untagged = 0;
1359 list_create(&(grp->lg_rx_group.arg_vlans), sizeof (aggr_vlan_t),
1360 offsetof(aggr_vlan_t, av_link));
1361
1362 /* add MAC ports to group */
1363 grp->lg_ports = NULL;
1364 grp->lg_nports = 0;
1365 grp->lg_nattached_ports = 0;
1366 grp->lg_ntx_ports = 0;
1367
1368 /*
1369 * If key is not specified by the user, allocate the key.
1370 */
1371 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1372 err = ENOMEM;
1373 goto bail;
1374 }
1375 grp->lg_key = key;
1376
1377 for (i = 0; i < nports; i++) {
1378 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1379 if (err != 0)
1380 goto bail;
1381 }
1382
1383 /*
1384 * If no explicit MAC address was specified by the administrator,
1385 * set it to the MAC address of the first port.
1386 */
1387 grp->lg_addr_fixed = mac_fixed;
1388 if (grp->lg_addr_fixed) {
1389 /* validate specified address */
1390 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1391 err = EINVAL;
1392 goto bail;
1393 }
1394 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1395 } else {
1396 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1397 grp->lg_mac_addr_port = grp->lg_ports;
1398 }
1399
1400 /* set the initial group capabilities */
1401 aggr_grp_capab_set(grp);
1402
1403 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1404 err = ENOMEM;
1405 goto bail;
1406 }
1407 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1408 mac->m_driver = grp;
1409 mac->m_dip = aggr_dip;
1410 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1411 mac->m_src_addr = grp->lg_addr;
1412 mac->m_callbacks = &aggr_m_callbacks;
1413 mac->m_min_sdu = 0;
1414 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1415 mac->m_margin = aggr_grp_max_margin(grp);
1416 mac->m_v12n = MAC_VIRT_LEVEL1;
1417 err = mac_register(mac, &grp->lg_mh);
1418 mac_free(mac);
1419 if (err != 0)
1420 goto bail;
1421
1422 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1423 if (err != 0) {
1424 (void) mac_unregister(grp->lg_mh);
1425 grp->lg_mh = NULL;
1426 goto bail;
1427 }
1428
1429 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1430
1431 /*
1432 * Update the MAC address of the constituent ports.
1433 * None of the port is attached at this time, the link state of the
1434 * aggregation will not change.
1435 */
1436 link_state_changed = aggr_grp_update_ports_mac(grp);
1437 ASSERT(!link_state_changed);
1438
1439 /* update outbound load balancing policy */
1440 aggr_send_update_policy(grp, policy);
1441
1442 /* set LACP mode */
1443 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1444
1445 /*
1446 * Attach each port if necessary.
1447 */
1448 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1449 /*
1450 * Create the pseudo ring for each HW ring of the underlying
1451 * port. Note that this is done after the aggr registers the
1452 * mac.
1453 */
1454 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1455 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1456 if (aggr_port_notify_link(grp, port))
1457 link_state_changed = B_TRUE;
1458
1459 /*
1460 * Initialize the callback functions for this port.
1461 */
1462 aggr_port_init_callbacks(port);
1463 }
1464
1465 if (link_state_changed)
1466 mac_link_update(grp->lg_mh, grp->lg_link_state);
1467
1468 /* add new group to hash table */
1469 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1470 (mod_hash_val_t)grp);
1471 ASSERT(err == 0);
1472 aggr_grp_cnt++;
1473
1474 mac_perim_exit(mph);
1475 rw_exit(&aggr_grp_lock);
1476 return (0);
1477
1478 bail:
1479
1480 grp->lg_closing = B_TRUE;
1481
1482 port = grp->lg_ports;
1483 while (port != NULL) {
1484 aggr_port_t *cport;
1485
1486 cport = port->lp_next;
1487 aggr_port_delete(port);
1488 port = cport;
1489 }
1490
1491 /*
1492 * Inform the lacp_rx thread to exit.
1493 */
1494 mutex_enter(&grp->lg_lacp_lock);
1495 grp->lg_lacp_done = B_TRUE;
1496 cv_signal(&grp->lg_lacp_cv);
1497 while (grp->lg_lacp_rx_thread != NULL)
1498 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1499 mutex_exit(&grp->lg_lacp_lock);
1500 /*
1501 * Inform the tx_notify thread to exit.
1502 */
1503 mutex_enter(&grp->lg_tx_flowctl_lock);
1504 if (grp->lg_tx_notify_thread != NULL) {
1505 tid = grp->lg_tx_notify_thread->t_did;
1506 grp->lg_tx_notify_done = B_TRUE;
1507 cv_signal(&grp->lg_tx_flowctl_cv);
1508 }
1509 mutex_exit(&grp->lg_tx_flowctl_lock);
1510 if (tid != 0)
1511 thread_join(tid);
1512
1513 kmem_free(grp->lg_tx_blocked_rings,
1514 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1515 rw_exit(&aggr_grp_lock);
1516 AGGR_GRP_REFRELE(grp);
1517 return (err);
1518 }
1519
1520 /*
1521 * Return a pointer to the member of a group with specified linkid.
1522 */
1523 static aggr_port_t *
1524 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1525 {
1526 aggr_port_t *port;
1527
1528 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1529
1530 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1531 if (port->lp_linkid == linkid)
1532 break;
1533 }
1534
1535 return (port);
1536 }
1537
1538 /*
1539 * Stop, detach and remove a port from a link aggregation group.
1540 */
1541 static int
1542 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1543 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1544 {
1545 int rc = 0;
1546 aggr_port_t **pport;
1547 boolean_t mac_addr_changed = B_FALSE;
1548 boolean_t link_state_changed = B_FALSE;
1549 mac_perim_handle_t mph;
1550 uint64_t val;
1551 uint_t i;
1552 uint_t stat;
1553
1554 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1555 ASSERT(grp->lg_nports > 1);
1556 ASSERT(!grp->lg_closing);
1557
1558 /* unlink port */
1559 for (pport = &grp->lg_ports; *pport != port;
1560 pport = &(*pport)->lp_next) {
1561 if (*pport == NULL) {
1562 rc = ENOENT;
1563 goto done;
1564 }
1565 }
1566 *pport = port->lp_next;
1567
1568 mac_perim_enter_by_mh(port->lp_mh, &mph);
1569
1570 /*
1571 * If the MAC address of the port being removed was assigned
1572 * to the group, update the group MAC address
1573 * using the MAC address of a different port.
1574 */
1575 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1576 /*
1577 * Set the MAC address of the group to the
1578 * MAC address of its first port.
1579 */
1580 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1581 grp->lg_mac_addr_port = grp->lg_ports;
1582 mac_addr_changed = B_TRUE;
1583 }
1584
1585 link_state_changed = aggr_grp_detach_port(grp, port);
1586
1587 /*
1588 * Add the counter statistics of the ports while it was aggregated
1589 * to the group's residual statistics. This is done by obtaining
1590 * the current counter from the underlying MAC then subtracting the
1591 * value of the counter at the moment it was added to the
1592 * aggregation.
1593 */
1594 for (i = 0; i < MAC_NSTAT; i++) {
1595 stat = i + MAC_STAT_MIN;
1596 if (!MAC_STAT_ISACOUNTER(stat))
1597 continue;
1598 val = aggr_port_stat(port, stat);
1599 val -= port->lp_stat[i];
1600 mutex_enter(&grp->lg_stat_lock);
1601 grp->lg_stat[i] += val;
1602 mutex_exit(&grp->lg_stat_lock);
1603 }
1604 for (i = 0; i < ETHER_NSTAT; i++) {
1605 stat = i + MACTYPE_STAT_MIN;
1606 if (!ETHER_STAT_ISACOUNTER(stat))
1607 continue;
1608 val = aggr_port_stat(port, stat);
1609 val -= port->lp_ether_stat[i];
1610 mutex_enter(&grp->lg_stat_lock);
1611 grp->lg_ether_stat[i] += val;
1612 mutex_exit(&grp->lg_stat_lock);
1613 }
1614
1615 grp->lg_nports--;
1616 mac_perim_exit(mph);
1617
1618 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1619 aggr_port_delete(port);
1620
1621 /*
1622 * If the group MAC address has changed, update the MAC address of
1623 * the remaining constituent ports according to the new MAC
1624 * address of the group.
1625 */
1626 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1627 link_state_changed = B_TRUE;
1628
1629 done:
1630 if (mac_addr_changedp != NULL)
1631 *mac_addr_changedp = mac_addr_changed;
1632 if (link_state_changedp != NULL)
1633 *link_state_changedp = link_state_changed;
1634
1635 return (rc);
1636 }
1637
1638 /*
1639 * Remove one or more ports from an existing link aggregation group.
1640 */
1641 int
1642 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1643 {
1644 int rc = 0, i;
1645 aggr_grp_t *grp = NULL;
1646 aggr_port_t *port;
1647 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1648 boolean_t link_state_update = B_FALSE, link_state_changed;
1649 mac_perim_handle_t mph, pmph;
1650
1651 /* get group corresponding to linkid */
1652 rw_enter(&aggr_grp_lock, RW_READER);
1653 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1654 (mod_hash_val_t *)&grp) != 0) {
1655 rw_exit(&aggr_grp_lock);
1656 return (ENOENT);
1657 }
1658 AGGR_GRP_REFHOLD(grp);
1659
1660 /*
1661 * Hold the perimeter so that the aggregation won't be destroyed.
1662 */
1663 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1664 rw_exit(&aggr_grp_lock);
1665
1666 /* we need to keep at least one port per group */
1667 if (nports >= grp->lg_nports) {
1668 rc = EINVAL;
1669 goto bail;
1670 }
1671
1672 /* first verify that all the groups are valid */
1673 for (i = 0; i < nports; i++) {
1674 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1675 /* port not found */
1676 rc = ENOENT;
1677 goto bail;
1678 }
1679 }
1680
1681 /* clear the promiscous mode for the specified ports */
1682 for (i = 0; i < nports && rc == 0; i++) {
1683 /* lookup port */
1684 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1685 ASSERT(port != NULL);
1686
1687 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1688 rc = aggr_port_promisc(port, B_FALSE);
1689 mac_perim_exit(pmph);
1690 }
1691 if (rc != 0) {
1692 for (i = 0; i < nports; i++) {
1693 port = aggr_grp_port_lookup(grp,
1694 ports[i].lp_linkid);
1695 ASSERT(port != NULL);
1696
1697 /*
1698 * Turn the promiscuous mode back on if it is required
1699 * to receive the non-primary address over a port, or
1700 * the promiscous mode is enabled over the aggr.
1701 */
1702 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1703 if (port->lp_started && (grp->lg_promisc ||
1704 port->lp_prom_addr != NULL)) {
1705 (void) aggr_port_promisc(port, B_TRUE);
1706 }
1707 mac_perim_exit(pmph);
1708 }
1709 goto bail;
1710 }
1711
1712 /* remove the specified ports from group */
1713 for (i = 0; i < nports; i++) {
1714 /* lookup port */
1715 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1716 ASSERT(port != NULL);
1717
1718 /* stop port if group has already been started */
1719 if (grp->lg_started) {
1720 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1721 aggr_port_stop(port);
1722 mac_perim_exit(pmph);
1723 }
1724
1725 /*
1726 * aggr_rem_pseudo_tx_group() is not called here. Instead
1727 * it is called from inside aggr_grp_rem_port() after the
1728 * port has been detached. The reason is that
1729 * aggr_rem_pseudo_tx_group() removes one ring at a time
1730 * and if there is still traffic going on, then there
1731 * is the possibility of aggr_find_tx_ring() returning a
1732 * removed ring for transmission. Once the port has been
1733 * detached, that port will not be used and
1734 * aggr_find_tx_ring() will not return any rings
1735 * belonging to it.
1736 */
1737 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1738
1739 /* remove port from group */
1740 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1741 &link_state_changed);
1742 ASSERT(rc == 0);
1743 mac_addr_update = mac_addr_update || mac_addr_changed;
1744 link_state_update = link_state_update || link_state_changed;
1745 }
1746
1747 bail:
1748 if (mac_addr_update)
1749 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1750 if (link_state_update)
1751 mac_link_update(grp->lg_mh, grp->lg_link_state);
1752
1753 mac_perim_exit(mph);
1754 AGGR_GRP_REFRELE(grp);
1755
1756 return (rc);
1757 }
1758
1759 int
1760 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1761 {
1762 aggr_grp_t *grp = NULL;
1763 aggr_port_t *port, *cport;
1764 datalink_id_t tmpid;
1765 mod_hash_val_t val;
1766 mac_perim_handle_t mph, pmph;
1767 int err;
1768 kt_did_t tid = 0;
1769
1770 rw_enter(&aggr_grp_lock, RW_WRITER);
1771
1772 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1773 (mod_hash_val_t *)&grp) != 0) {
1774 rw_exit(&aggr_grp_lock);
1775 return (ENOENT);
1776 }
1777
1778 /*
1779 * Note that dls_devnet_destroy() must be called before lg_lock is
1780 * held. Otherwise, it will deadlock if another thread is in
1781 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1782 * dls_devnet_destroy() needs to delete.
1783 */
1784 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1785 rw_exit(&aggr_grp_lock);
1786 return (err);
1787 }
1788 ASSERT(linkid == tmpid);
1789
1790 /*
1791 * Unregister from the MAC service module. Since this can
1792 * fail if a client hasn't closed the MAC port, we gracefully
1793 * fail the operation.
1794 */
1795 if ((err = mac_disable(grp->lg_mh)) != 0) {
1796 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1797 rw_exit(&aggr_grp_lock);
1798 return (err);
1799 }
1800 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1801 ASSERT(grp == (aggr_grp_t *)val);
1802
1803 ASSERT(aggr_grp_cnt > 0);
1804 aggr_grp_cnt--;
1805 rw_exit(&aggr_grp_lock);
1806
1807 /*
1808 * Inform the lacp_rx thread to exit.
1809 */
1810 mutex_enter(&grp->lg_lacp_lock);
1811 grp->lg_lacp_done = B_TRUE;
1812 cv_signal(&grp->lg_lacp_cv);
1813 while (grp->lg_lacp_rx_thread != NULL)
1814 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1815 mutex_exit(&grp->lg_lacp_lock);
1816 /*
1817 * Inform the tx_notify_thread to exit.
1818 */
1819 mutex_enter(&grp->lg_tx_flowctl_lock);
1820 if (grp->lg_tx_notify_thread != NULL) {
1821 tid = grp->lg_tx_notify_thread->t_did;
1822 grp->lg_tx_notify_done = B_TRUE;
1823 cv_signal(&grp->lg_tx_flowctl_cv);
1824 }
1825 mutex_exit(&grp->lg_tx_flowctl_lock);
1826 if (tid != 0)
1827 thread_join(tid);
1828
1829 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1830
1831 grp->lg_closing = B_TRUE;
1832 /* detach and free MAC ports associated with group */
1833 port = grp->lg_ports;
1834 while (port != NULL) {
1835 cport = port->lp_next;
1836 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1837 if (grp->lg_started)
1838 aggr_port_stop(port);
1839 (void) aggr_grp_detach_port(grp, port);
1840 mac_perim_exit(pmph);
1841 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1842 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1843 aggr_port_delete(port);
1844 port = cport;
1845 }
1846
1847 mac_perim_exit(mph);
1848
1849 kmem_free(grp->lg_tx_blocked_rings,
1850 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1851 /*
1852 * Wait for the port's lacp timer thread and its notification callback
1853 * to exit before calling mac_unregister() since both needs to access
1854 * the mac perimeter of the grp.
1855 */
1856 aggr_grp_port_wait(grp);
1857
1858 VERIFY(mac_unregister(grp->lg_mh) == 0);
1859 grp->lg_mh = NULL;
1860
1861 list_destroy(&(grp->lg_rx_group.arg_vlans));
1862
1863 AGGR_GRP_REFRELE(grp);
1864 return (0);
1865 }
1866
1867 void
1868 aggr_grp_free(aggr_grp_t *grp)
1869 {
1870 ASSERT(grp->lg_refs == 0);
1871 ASSERT(grp->lg_port_ref == 0);
1872 if (grp->lg_key > AGGR_MAX_KEY) {
1873 id_free(key_ids, grp->lg_key);
1874 grp->lg_key = 0;
1875 }
1876 kmem_cache_free(aggr_grp_cache, grp);
1877 }
1878
1879 int
1880 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1881 aggr_grp_info_new_grp_fn_t new_grp_fn,
1882 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1883 {
1884 aggr_grp_t *grp;
1885 aggr_port_t *port;
1886 mac_perim_handle_t mph, pmph;
1887 int rc = 0;
1888
1889 /*
1890 * Make sure that the aggregation link is visible from the caller's
1891 * zone.
1892 */
1893 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1894 return (ENOENT);
1895
1896 rw_enter(&aggr_grp_lock, RW_READER);
1897
1898 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1899 (mod_hash_val_t *)&grp) != 0) {
1900 rw_exit(&aggr_grp_lock);
1901 return (ENOENT);
1902 }
1903 AGGR_GRP_REFHOLD(grp);
1904
1905 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1906 rw_exit(&aggr_grp_lock);
1907
1908 rc = new_grp_fn(fn_arg, grp->lg_linkid,
1909 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1910 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1911 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1912
1913 if (rc != 0)
1914 goto bail;
1915
1916 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1917 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1918 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1919 port->lp_state, &port->lp_lacp.ActorOperPortState);
1920 mac_perim_exit(pmph);
1921
1922 if (rc != 0)
1923 goto bail;
1924 }
1925
1926 bail:
1927 mac_perim_exit(mph);
1928 AGGR_GRP_REFRELE(grp);
1929 return (rc);
1930 }
1931
1932 /*ARGSUSED*/
1933 static void
1934 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1935 {
1936 miocnak(q, mp, 0, ENOTSUP);
1937 }
1938
1939 static int
1940 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1941 {
1942 aggr_port_t *port;
1943 uint_t stat_index;
1944
1945 ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
1946
1947 /* We only aggregate counter statistics. */
1948 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1949 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1950 return (ENOTSUP);
1951 }
1952
1953 /*
1954 * Counter statistics for a group are computed by aggregating the
1955 * counters of the members MACs while they were aggregated, plus
1956 * the residual counter of the group itself, which is updated each
1957 * time a MAC is removed from the group.
1958 */
1959 *val = 0;
1960 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1961 /* actual port statistic */
1962 *val += aggr_port_stat(port, stat);
1963 /*
1964 * minus the port stat when it was added, plus any residual
1965 * amount for the group.
1966 */
1967 if (IS_MAC_STAT(stat)) {
1968 stat_index = stat - MAC_STAT_MIN;
1969 *val -= port->lp_stat[stat_index];
1970 *val += grp->lg_stat[stat_index];
1971 } else if (IS_MACTYPE_STAT(stat)) {
1972 stat_index = stat - MACTYPE_STAT_MIN;
1973 *val -= port->lp_ether_stat[stat_index];
1974 *val += grp->lg_ether_stat[stat_index];
1975 }
1976 }
1977 return (0);
1978 }
1979
1980 int
1981 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1982 {
1983 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1984
1985 if (rx_ring->arr_hw_rh != NULL) {
1986 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1987 } else {
1988 aggr_port_t *port = rx_ring->arr_port;
1989
1990 *val = mac_stat_get(port->lp_mh, stat);
1991
1992 }
1993 return (0);
1994 }
1995
1996 int
1997 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1998 {
1999 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2000
2001 if (tx_ring->atr_hw_rh != NULL) {
2002 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2003 } else {
2004 aggr_port_t *port = tx_ring->atr_port;
2005
2006 *val = mac_stat_get(port->lp_mh, stat);
2007 }
2008 return (0);
2009 }
2010
2011 static int
2012 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2013 {
2014 aggr_grp_t *grp = arg;
2015 int rval = 0;
2016
2017 mutex_enter(&grp->lg_stat_lock);
2018
2019 switch (stat) {
2020 case MAC_STAT_IFSPEED:
2021 *val = grp->lg_ifspeed;
2022 break;
2023
2024 case ETHER_STAT_LINK_DUPLEX:
2025 *val = grp->lg_link_duplex;
2026 break;
2027
2028 default:
2029 /*
2030 * For all other statistics, we return the aggregated stat
2031 * from the underlying ports. aggr_grp_stat() will set
2032 * rval appropriately if the statistic isn't a counter.
2033 */
2034 rval = aggr_grp_stat(grp, stat, val);
2035 }
2036
2037 mutex_exit(&grp->lg_stat_lock);
2038 return (rval);
2039 }
2040
2041 static int
2042 aggr_m_start(void *arg)
2043 {
2044 aggr_grp_t *grp = arg;
2045 aggr_port_t *port;
2046 mac_perim_handle_t mph, pmph;
2047
2048 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2049
2050 /*
2051 * Attempts to start all configured members of the group.
2052 * Group members will be attached when their link-up notification
2053 * is received.
2054 */
2055 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2056 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2057 if (aggr_port_start(port) != 0) {
2058 mac_perim_exit(pmph);
2059 continue;
2060 }
2061
2062 /*
2063 * Turn on the promiscuous mode if it is required to receive
2064 * the non-primary address over a port, or the promiscous
2065 * mode is enabled over the aggr.
2066 */
2067 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2068 if (aggr_port_promisc(port, B_TRUE) != 0)
2069 aggr_port_stop(port);
2070 }
2071 mac_perim_exit(pmph);
2072 }
2073
2074 grp->lg_started = B_TRUE;
2075
2076 mac_perim_exit(mph);
2077 return (0);
2078 }
2079
2080 static void
2081 aggr_m_stop(void *arg)
2082 {
2083 aggr_grp_t *grp = arg;
2084 aggr_port_t *port;
2085 mac_perim_handle_t mph, pmph;
2086
2087 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2088
2089 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2090 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2091
2092 /* reset port promiscuous mode */
2093 (void) aggr_port_promisc(port, B_FALSE);
2094
2095 aggr_port_stop(port);
2096 mac_perim_exit(pmph);
2097 }
2098
2099 grp->lg_started = B_FALSE;
2100 mac_perim_exit(mph);
2101 }
2102
2103 static int
2104 aggr_m_promisc(void *arg, boolean_t on)
2105 {
2106 aggr_grp_t *grp = arg;
2107 aggr_port_t *port;
2108 boolean_t link_state_changed = B_FALSE;
2109 mac_perim_handle_t mph, pmph;
2110
2111 AGGR_GRP_REFHOLD(grp);
2112 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2113
2114 ASSERT(!grp->lg_closing);
2115
2116 if (on == grp->lg_promisc)
2117 goto bail;
2118
2119 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2120 int err = 0;
2121
2122 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2123 AGGR_PORT_REFHOLD(port);
2124 if (!on && (port->lp_prom_addr == NULL))
2125 err = aggr_port_promisc(port, B_FALSE);
2126 else if (on && port->lp_started)
2127 err = aggr_port_promisc(port, B_TRUE);
2128
2129 if (err != 0) {
2130 if (aggr_grp_detach_port(grp, port))
2131 link_state_changed = B_TRUE;
2132 } else {
2133 /*
2134 * If a port was detached because of a previous
2135 * failure changing the promiscuity, the port
2136 * is reattached when it successfully changes
2137 * the promiscuity now, and this might cause
2138 * the link state of the aggregation to change.
2139 */
2140 if (aggr_grp_attach_port(grp, port))
2141 link_state_changed = B_TRUE;
2142 }
2143 mac_perim_exit(pmph);
2144 AGGR_PORT_REFRELE(port);
2145 }
2146
2147 grp->lg_promisc = on;
2148
2149 if (link_state_changed)
2150 mac_link_update(grp->lg_mh, grp->lg_link_state);
2151
2152 bail:
2153 mac_perim_exit(mph);
2154 AGGR_GRP_REFRELE(grp);
2155
2156 return (0);
2157 }
2158
2159 static void
2160 aggr_grp_port_rename(const char *new_name, void *arg)
2161 {
2162 /*
2163 * aggr port's mac client name is the format of "aggr link name" plus
2164 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2165 */
2166 int aggr_len, link_len, clnt_name_len, i;
2167 char *str_end, *str_st, *str_del;
2168 char aggr_name[MAXNAMELEN];
2169 char link_name[MAXNAMELEN];
2170 char *clnt_name;
2171 aggr_grp_t *aggr_grp = arg;
2172 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2173
2174 for (i = 0; i < aggr_grp->lg_nports; i++) {
2175 clnt_name = mac_client_name(aggr_port->lp_mch);
2176 clnt_name_len = strlen(clnt_name);
2177 str_st = clnt_name;
2178 str_end = &(clnt_name[clnt_name_len]);
2179 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2180 ASSERT(str_del != NULL);
2181 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2182 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2183 bzero(aggr_name, MAXNAMELEN);
2184 bzero(link_name, MAXNAMELEN);
2185 bcopy(clnt_name, aggr_name, aggr_len);
2186 bcopy(str_del, link_name, link_len + 1);
2187 bzero(clnt_name, MAXNAMELEN);
2188 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2189 link_name);
2190
2191 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2192 aggr_port = aggr_port->lp_next;
2193 }
2194 }
2195
2196 /*
2197 * Initialize the capabilities that are advertised for the group
2198 * according to the capabilities of the constituent ports.
2199 */
2200 static boolean_t
2201 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2202 {
2203 aggr_grp_t *grp = arg;
2204
2205 switch (cap) {
2206 case MAC_CAPAB_HCKSUM: {
2207 uint32_t *hcksum_txflags = cap_data;
2208 *hcksum_txflags = grp->lg_hcksum_txflags;
2209 break;
2210 }
2211 case MAC_CAPAB_LSO: {
2212 mac_capab_lso_t *cap_lso = cap_data;
2213
2214 if (grp->lg_lso) {
2215 *cap_lso = grp->lg_cap_lso;
2216 break;
2217 } else {
2218 return (B_FALSE);
2219 }
2220 }
2221 case MAC_CAPAB_NO_NATIVEVLAN:
2222 return (!grp->lg_vlan);
2223 case MAC_CAPAB_NO_ZCOPY:
2224 return (!grp->lg_zcopy);
2225 case MAC_CAPAB_RINGS: {
2226 mac_capab_rings_t *cap_rings = cap_data;
2227
2228 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2229 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2230 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2231
2232 /*
2233 * An aggregation advertises only one (pseudo) RX
2234 * group, which virtualizes the main/primary group of
2235 * the underlying devices.
2236 */
2237 cap_rings->mr_gnum = 1;
2238 cap_rings->mr_gaddring = NULL;
2239 cap_rings->mr_gremring = NULL;
2240 } else {
2241 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2242 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2243 cap_rings->mr_gnum = 0;
2244 }
2245 cap_rings->mr_rget = aggr_fill_ring;
2246 cap_rings->mr_gget = aggr_fill_group;
2247 break;
2248 }
2249 case MAC_CAPAB_AGGR:
2250 {
2251 mac_capab_aggr_t *aggr_cap;
2252
2253 if (cap_data != NULL) {
2254 aggr_cap = cap_data;
2255 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2256 aggr_cap->mca_unicst = aggr_m_unicst;
2257 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2258 aggr_cap->mca_arg = arg;
2259 }
2260 return (B_TRUE);
2261 }
2262 default:
2263 return (B_FALSE);
2264 }
2265 return (B_TRUE);
2266 }
2267
2268 /*
2269 * Callback function for MAC layer to register groups.
2270 */
2271 static void
2272 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2273 mac_group_info_t *infop, mac_group_handle_t gh)
2274 {
2275 aggr_grp_t *grp = arg;
2276 aggr_pseudo_rx_group_t *rx_group;
2277 aggr_pseudo_tx_group_t *tx_group;
2278
2279 ASSERT(index == 0);
2280 if (rtype == MAC_RING_TYPE_RX) {
2281 rx_group = &grp->lg_rx_group;
2282 rx_group->arg_gh = gh;
2283 rx_group->arg_grp = grp;
2284
2285 infop->mgi_driver = (mac_group_driver_t)rx_group;
2286 infop->mgi_start = NULL;
2287 infop->mgi_stop = NULL;
2288 infop->mgi_addmac = aggr_addmac;
2289 infop->mgi_remmac = aggr_remmac;
2290 infop->mgi_count = rx_group->arg_ring_cnt;
2291
2292 /*
2293 * Always set the HW VLAN callbacks. They are smart
2294 * enough to know when a port has HW VLAN filters to
2295 * program and when it doesn't.
2296 */
2297 infop->mgi_addvlan = aggr_addvlan;
2298 infop->mgi_remvlan = aggr_remvlan;
2299 } else {
2300 tx_group = &grp->lg_tx_group;
2301 tx_group->atg_gh = gh;
2302 }
2303 }
2304
2305 /*
2306 * Callback funtion for MAC layer to register all rings.
2307 */
2308 static void
2309 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2310 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2311 {
2312 aggr_grp_t *grp = arg;
2313
2314 switch (rtype) {
2315 case MAC_RING_TYPE_RX: {
2316 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
2317 aggr_pseudo_rx_ring_t *rx_ring;
2318 mac_intr_t aggr_mac_intr;
2319
2320 ASSERT(rg_index == 0);
2321
2322 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2323 rx_ring = rx_group->arg_rings + index;
2324 rx_ring->arr_rh = rh;
2325
2326 /*
2327 * Entrypoint to enable interrupt (disable poll) and
2328 * disable interrupt (enable poll).
2329 */
2330 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2331 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2332 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2333 aggr_mac_intr.mi_ddi_handle = NULL;
2334
2335 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2336 infop->mri_start = aggr_pseudo_start_ring;
2337 infop->mri_stop = NULL;
2338
2339 infop->mri_intr = aggr_mac_intr;
2340 infop->mri_poll = aggr_rx_poll;
2341
2342 infop->mri_stat = aggr_rx_ring_stat;
2343 break;
2344 }
2345 case MAC_RING_TYPE_TX: {
2346 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2347 aggr_pseudo_tx_ring_t *tx_ring;
2348
2349 ASSERT(rg_index == -1);
2350 ASSERT(index < tx_group->atg_ring_cnt);
2351
2352 tx_ring = &tx_group->atg_rings[index];
2353 tx_ring->atr_rh = rh;
2354
2355 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2356 infop->mri_start = NULL;
2357 infop->mri_stop = NULL;
2358 infop->mri_tx = aggr_ring_tx;
2359 infop->mri_stat = aggr_tx_ring_stat;
2360 /*
2361 * Use the hw TX ring handle to find if the ring needs
2362 * serialization or not. For NICs that do not expose
2363 * Tx rings, atr_hw_rh will be NULL.
2364 */
2365 if (tx_ring->atr_hw_rh != NULL) {
2366 infop->mri_flags =
2367 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2368 }
2369 break;
2370 }
2371 default:
2372 break;
2373 }
2374 }
2375
2376 static mblk_t *
2377 aggr_rx_poll(void *arg, int bytes_to_pickup)
2378 {
2379 aggr_pseudo_rx_ring_t *rr_ring = arg;
2380 aggr_port_t *port = rr_ring->arr_port;
2381 aggr_grp_t *grp = port->lp_grp;
2382 mblk_t *mp_chain, *mp, **mpp;
2383
2384 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2385
2386 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2387 return (mp_chain);
2388
2389 mpp = &mp_chain;
2390 while ((mp = *mpp) != NULL) {
2391 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2392 struct ether_header *ehp;
2393
2394 ehp = (struct ether_header *)mp->b_rptr;
2395 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2396 *mpp = mp->b_next;
2397 mp->b_next = NULL;
2398 aggr_recv_lacp(port,
2399 (mac_resource_handle_t)rr_ring, mp);
2400 continue;
2401 }
2402 }
2403
2404 if (!port->lp_collector_enabled) {
2405 *mpp = mp->b_next;
2406 mp->b_next = NULL;
2407 freemsg(mp);
2408 continue;
2409 }
2410 mpp = &mp->b_next;
2411 }
2412 return (mp_chain);
2413 }
2414
2415 static int
2416 aggr_addmac(void *arg, const uint8_t *mac_addr)
2417 {
2418 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2419 aggr_unicst_addr_t *addr, **pprev;
2420 aggr_grp_t *grp = rx_group->arg_grp;
2421 aggr_port_t *port, *p;
2422 mac_perim_handle_t mph;
2423 int err = 0;
2424
2425 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2426
2427 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2428 mac_perim_exit(mph);
2429 return (0);
2430 }
2431
2432 /*
2433 * Insert this mac address into the list of mac addresses owned by
2434 * the aggregation pseudo group.
2435 */
2436 pprev = &rx_group->arg_macaddr;
2437 while ((addr = *pprev) != NULL) {
2438 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2439 mac_perim_exit(mph);
2440 return (EEXIST);
2441 }
2442 pprev = &addr->aua_next;
2443 }
2444 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2445 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2446 addr->aua_next = NULL;
2447 *pprev = addr;
2448
2449 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2450 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2451 break;
2452
2453 if (err != 0) {
2454 for (p = grp->lg_ports; p != port; p = p->lp_next)
2455 aggr_port_remmac(p, mac_addr);
2456
2457 *pprev = NULL;
2458 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2459 }
2460
2461 mac_perim_exit(mph);
2462 return (err);
2463 }
2464
2465 static int
2466 aggr_remmac(void *arg, const uint8_t *mac_addr)
2467 {
2468 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2469 aggr_unicst_addr_t *addr, **pprev;
2470 aggr_grp_t *grp = rx_group->arg_grp;
2471 aggr_port_t *port;
2472 mac_perim_handle_t mph;
2473 int err = 0;
2474
2475 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2476
2477 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2478 mac_perim_exit(mph);
2479 return (0);
2480 }
2481
2482 /*
2483 * Insert this mac address into the list of mac addresses owned by
2484 * the aggregation pseudo group.
2485 */
2486 pprev = &rx_group->arg_macaddr;
2487 while ((addr = *pprev) != NULL) {
2488 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2489 pprev = &addr->aua_next;
2490 continue;
2491 }
2492 break;
2493 }
2494 if (addr == NULL) {
2495 mac_perim_exit(mph);
2496 return (EINVAL);
2497 }
2498
2499 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2500 aggr_port_remmac(port, mac_addr);
2501
2502 *pprev = addr->aua_next;
2503 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2504
2505 mac_perim_exit(mph);
2506 return (err);
2507 }
2508
2509 /*
2510 * Search for VID in the Rx group's list and return a pointer if
2511 * found. Otherwise return NULL.
2512 */
2513 static aggr_vlan_t *
2514 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2515 {
2516 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2517 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2518 avp = list_next(&rx_group->arg_vlans, avp)) {
2519 if (avp->av_vid == vid)
2520 return (avp);
2521 }
2522
2523 return (NULL);
2524 }
2525
2526 /*
2527 * Accept traffic on the specified VID.
2528 *
2529 * Persist VLAN state in the aggr so that ports added later will
2530 * receive the correct filters. In the future it would be nice to
2531 * allow aggr to iterate its clients instead of duplicating state.
2532 */
2533 static int
2534 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2535 {
2536 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2537 aggr_grp_t *aggr = rx_group->arg_grp;
2538 aggr_port_t *port, *p;
2539 mac_perim_handle_t mph;
2540 int err = 0;
2541 aggr_vlan_t *avp = NULL;
2542
2543 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2544
2545 if (vid == MAC_VLAN_UNTAGGED) {
2546 /*
2547 * Aggr is both a MAC provider and MAC client. As a
2548 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2549 * client. As a client itself, it should pass
2550 * VLAN_ID_NONE to its ports.
2551 */
2552 vid = VLAN_ID_NONE;
2553 rx_group->arg_untagged++;
2554 goto update_ports;
2555 }
2556
2557 avp = aggr_find_vlan(rx_group, vid);
2558
2559 if (avp != NULL) {
2560 avp->av_refs++;
2561 mac_perim_exit(mph);
2562 return (0);
2563 }
2564
2565 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2566 avp->av_vid = vid;
2567 avp->av_refs = 1;
2568
2569 update_ports:
2570 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2571 if ((err = aggr_port_addvlan(port, vid)) != 0)
2572 break;
2573
2574 if (err != 0) {
2575 /*
2576 * If any of these calls fail then we are in a
2577 * situation where the ports have different HW state.
2578 * There's no reasonable action the MAC client can
2579 * take in this scenario to rectify the situation.
2580 */
2581 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2582 int err2;
2583
2584 if ((err2 = aggr_port_remvlan(p, vid)) != 0) {
2585 cmn_err(CE_WARN, "Failed to remove VLAN %u"
2586 " from port %s: errno %d.", vid,
2587 mac_client_name(p->lp_mch), err2);
2588 }
2589
2590 }
2591
2592 if (vid == VLAN_ID_NONE)
2593 rx_group->arg_untagged--;
2594
2595 if (avp != NULL) {
2596 kmem_free(avp, sizeof (aggr_vlan_t));
2597 avp = NULL;
2598 }
2599 }
2600
2601 if (avp != NULL)
2602 list_insert_tail(&rx_group->arg_vlans, avp);
2603
2604 done:
2605 mac_perim_exit(mph);
2606 return (err);
2607 }
2608
2609 /*
2610 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2611 */
2612 static int
2613 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2614 {
2615 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2616 aggr_grp_t *aggr = rx_group->arg_grp;
2617 aggr_port_t *port, *p;
2618 mac_perim_handle_t mph;
2619 int err = 0;
2620 aggr_vlan_t *avp = NULL;
2621
2622 mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2623
2624 /*
2625 * See the comment in aggr_addvlan().
2626 */
2627 if (vid == MAC_VLAN_UNTAGGED) {
2628 vid = VLAN_ID_NONE;
2629 rx_group->arg_untagged--;
2630
2631 if (rx_group->arg_untagged > 0)
2632 goto done;
2633
2634 goto update_ports;
2635 }
2636
2637 avp = aggr_find_vlan(rx_group, vid);
2638
2639 if (avp == NULL) {
2640 err = ENOENT;
2641 goto done;
2642 }
2643
2644 avp->av_refs--;
2645
2646 if (avp->av_refs > 0)
2647 goto done;
2648
2649 update_ports:
2650 for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2651 if ((err = aggr_port_remvlan(port, vid)) != 0)
2652 break;
2653
2654 /*
2655 * See the comment in aggr_addvlan() for justification of the
2656 * use of VERIFY here.
2657 */
2658 if (err != 0) {
2659 for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2660 int err2;
2661
2662 if ((err2 = aggr_port_addvlan(p, vid)) != 0) {
2663 cmn_err(CE_WARN, "Failed to add VLAN %u"
2664 " to port %s: errno %d.", vid,
2665 mac_client_name(p->lp_mch), err2);
2666 }
2667 }
2668
2669 if (avp != NULL)
2670 avp->av_refs++;
2671
2672 if (vid == VLAN_ID_NONE)
2673 rx_group->arg_untagged++;
2674
2675 goto done;
2676 }
2677
2678 if (err == 0 && avp != NULL) {
2679 VERIFY3U(avp->av_refs, ==, 0);
2680 list_remove(&rx_group->arg_vlans, avp);
2681 kmem_free(avp, sizeof (aggr_vlan_t));
2682 }
2683
2684 done:
2685 mac_perim_exit(mph);
2686 return (err);
2687 }
2688
2689 /*
2690 * Add or remove the multicast addresses that are defined for the group
2691 * to or from the specified port.
2692 *
2693 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2694 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2695 * called when the port is either stopped or detached.
2696 */
2697 void
2698 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2699 {
2700 aggr_grp_t *grp = port->lp_grp;
2701
2702 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2703 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2704
2705 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2706 return;
2707
2708 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2709 }
2710
2711 static int
2712 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2713 {
2714 aggr_grp_t *grp = arg;
2715 aggr_port_t *port = NULL, *errport = NULL;
2716 mac_perim_handle_t mph;
2717 int err = 0;
2718
2719 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2720 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2721 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2722 !port->lp_started) {
2723 continue;
2724 }
2725 err = aggr_port_multicst(port, add, addrp);
2726 if (err != 0) {
2727 errport = port;
2728 break;
2729 }
2730 }
2731
2732 /*
2733 * At least one port caused error return and this error is returned to
2734 * mac, eventually a NAK would be sent upwards.
2735 * Some ports have this multicast address listed now, and some don't.
2736 * Treat this error as a whole aggr failure not individual port failure.
2737 * Therefore remove this multicast address from other ports.
2738 */
2739 if ((err != 0) && add) {
2740 for (port = grp->lg_ports; port != errport;
2741 port = port->lp_next) {
2742 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2743 !port->lp_started) {
2744 continue;
2745 }
2746 (void) aggr_port_multicst(port, B_FALSE, addrp);
2747 }
2748 }
2749 mac_perim_exit(mph);
2750 return (err);
2751 }
2752
2753 static int
2754 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2755 {
2756 aggr_grp_t *grp = arg;
2757 mac_perim_handle_t mph;
2758 int err;
2759
2760 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2761 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2762 0, 0);
2763 mac_perim_exit(mph);
2764 return (err);
2765 }
2766
2767 /*
2768 * Initialize the capabilities that are advertised for the group
2769 * according to the capabilities of the constituent ports.
2770 */
2771 static void
2772 aggr_grp_capab_set(aggr_grp_t *grp)
2773 {
2774 uint32_t cksum;
2775 aggr_port_t *port;
2776 mac_capab_lso_t cap_lso;
2777
2778 ASSERT(grp->lg_mh == NULL);
2779 ASSERT(grp->lg_ports != NULL);
2780
2781 grp->lg_hcksum_txflags = (uint32_t)-1;
2782 grp->lg_zcopy = B_TRUE;
2783 grp->lg_vlan = B_TRUE;
2784
2785 grp->lg_lso = B_TRUE;
2786 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2787 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2788
2789 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2790 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2791 cksum = 0;
2792 grp->lg_hcksum_txflags &= cksum;
2793
2794 grp->lg_vlan &=
2795 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2796
2797 grp->lg_zcopy &=
2798 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2799
2800 grp->lg_lso &=
2801 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2802 if (grp->lg_lso) {
2803 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2804 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2805 cap_lso.lso_basic_tcp_ipv4.lso_max)
2806 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2807 cap_lso.lso_basic_tcp_ipv4.lso_max;
2808 }
2809 }
2810 }
2811
2812 /*
2813 * Checks whether the capabilities of the port being added are compatible
2814 * with the current capabilities of the aggregation.
2815 */
2816 static boolean_t
2817 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2818 {
2819 uint32_t hcksum_txflags;
2820
2821 ASSERT(grp->lg_ports != NULL);
2822
2823 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2824 grp->lg_vlan) != grp->lg_vlan) {
2825 return (B_FALSE);
2826 }
2827
2828 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2829 grp->lg_zcopy) != grp->lg_zcopy) {
2830 return (B_FALSE);
2831 }
2832
2833 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2834 if (grp->lg_hcksum_txflags != 0)
2835 return (B_FALSE);
2836 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2837 grp->lg_hcksum_txflags) {
2838 return (B_FALSE);
2839 }
2840
2841 if (grp->lg_lso) {
2842 mac_capab_lso_t cap_lso;
2843
2844 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2845 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2846 grp->lg_cap_lso.lso_flags)
2847 return (B_FALSE);
2848 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2849 cap_lso.lso_basic_tcp_ipv4.lso_max)
2850 return (B_FALSE);
2851 } else {
2852 return (B_FALSE);
2853 }
2854 }
2855
2856 return (B_TRUE);
2857 }
2858
2859 /*
2860 * Returns the maximum SDU according to the SDU of the constituent ports.
2861 */
2862 static uint_t
2863 aggr_grp_max_sdu(aggr_grp_t *grp)
2864 {
2865 uint_t max_sdu = (uint_t)-1;
2866 aggr_port_t *port;
2867
2868 ASSERT(grp->lg_ports != NULL);
2869
2870 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2871 uint_t port_sdu_max;
2872
2873 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2874 if (max_sdu > port_sdu_max)
2875 max_sdu = port_sdu_max;
2876 }
2877
2878 return (max_sdu);
2879 }
2880
2881 /*
2882 * Checks if the maximum SDU of the specified port is compatible
2883 * with the maximum SDU of the specified aggregation group, returns
2884 * B_TRUE if it is, B_FALSE otherwise.
2885 */
2886 static boolean_t
2887 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2888 {
2889 uint_t port_sdu_max;
2890
2891 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2892 return (port_sdu_max >= grp->lg_max_sdu);
2893 }
2894
2895 /*
2896 * Returns the maximum margin according to the margin of the constituent ports.
2897 */
2898 static uint32_t
2899 aggr_grp_max_margin(aggr_grp_t *grp)
2900 {
2901 uint32_t margin = UINT32_MAX;
2902 aggr_port_t *port;
2903
2904 ASSERT(grp->lg_mh == NULL);
2905 ASSERT(grp->lg_ports != NULL);
2906
2907 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2908 if (margin > port->lp_margin)
2909 margin = port->lp_margin;
2910 }
2911
2912 grp->lg_margin = margin;
2913 return (margin);
2914 }
2915
2916 /*
2917 * Checks if the maximum margin of the specified port is compatible
2918 * with the maximum margin of the specified aggregation group, returns
2919 * B_TRUE if it is, B_FALSE otherwise.
2920 */
2921 static boolean_t
2922 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2923 {
2924 if (port->lp_margin >= grp->lg_margin)
2925 return (B_TRUE);
2926
2927 /*
2928 * See whether the current margin value is allowed to be changed to
2929 * the new value.
2930 */
2931 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2932 return (B_FALSE);
2933
2934 grp->lg_margin = port->lp_margin;
2935 return (B_TRUE);
2936 }
2937
2938 /*
2939 * Set MTU on individual ports of an aggregation group
2940 */
2941 static int
2942 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2943 uint32_t *old_mtu)
2944 {
2945 boolean_t removed = B_FALSE;
2946 mac_perim_handle_t mph;
2947 mac_diag_t diag;
2948 int err, rv, retry = 0;
2949
2950 if (port->lp_mah != NULL) {
2951 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2952 port->lp_mah = NULL;
2953 removed = B_TRUE;
2954 }
2955 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2956 try_again:
2957 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2958 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2959 &port->lp_mah, 0, &diag)) != 0) {
2960 /*
2961 * following is a workaround for a bug in 'bge' driver.
2962 * See CR 6794654 for more information and this work around
2963 * will be removed once the CR is fixed.
2964 */
2965 if (rv == EIO && retry++ < 3) {
2966 delay(2 * hz);
2967 goto try_again;
2968 }
2969 /*
2970 * if mac_unicast_add() failed while setting the MTU,
2971 * detach the port from the group.
2972 */
2973 mac_perim_enter_by_mh(port->lp_mh, &mph);
2974 (void) aggr_grp_detach_port(grp, port);
2975 mac_perim_exit(mph);
2976 cmn_err(CE_WARN, "Unable to restart the port %s while "
2977 "setting MTU. Detaching the port from the aggregation.",
2978 mac_client_name(port->lp_mch));
2979 }
2980 return (err);
2981 }
2982
2983 static int
2984 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2985 {
2986 int err = 0, i, rv;
2987 aggr_port_t *port;
2988 uint32_t *mtu;
2989
2990 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2991
2992 /*
2993 * If the MTU being set is equal to aggr group's maximum
2994 * allowable value, then there is nothing to change
2995 */
2996 if (sdu == grp->lg_max_sdu)
2997 return (0);
2998
2999 /* 0 is aggr group's min sdu */
3000 if (sdu == 0)
3001 return (EINVAL);
3002
3003 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3004 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3005 port = port->lp_next, i++) {
3006 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3007 }
3008 if (err != 0) {
3009 /* recover from error: reset the mtus of the ports */
3010 aggr_port_t *tmp;
3011
3012 for (tmp = grp->lg_ports, i = 0; tmp != port;
3013 tmp = tmp->lp_next, i++) {
3014 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3015 }
3016 goto bail;
3017 }
3018 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3019 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3020 ASSERT(rv == 0);
3021 bail:
3022 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3023 return (err);
3024 }
3025
3026 /*
3027 * Callback functions for set/get of properties
3028 */
3029 /*ARGSUSED*/
3030 static int
3031 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3032 uint_t pr_valsize, const void *pr_val)
3033 {
3034 int err = ENOTSUP;
3035 aggr_grp_t *grp = m_driver;
3036
3037 switch (pr_num) {
3038 case MAC_PROP_MTU: {
3039 uint32_t mtu;
3040
3041 if (pr_valsize < sizeof (mtu)) {
3042 err = EINVAL;
3043 break;
3044 }
3045 bcopy(pr_val, &mtu, sizeof (mtu));
3046 err = aggr_sdu_update(grp, mtu);
3047 break;
3048 }
3049 default:
3050 break;
3051 }
3052 return (err);
3053 }
3054
/*
 * One boundary (endpoint) of an MTU range, as used by
 * aggr_mtu_range_intersection() in its temporary array.
 */
typedef struct rboundary {
	uint32_t bval;	/* value of the boundary */
	int btype;	/* +1 for a range minimum, -1 for a range maximum */
} rboundary_t;
3059
3060 /*
3061 * This function finds the intersection of mtu ranges stored in arrays -
3062 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3063 * Individual arrays are assumed to contain non-overlapping ranges.
3064 * Algorithm:
3065 * A range has two boundaries - min and max. We scan all arrays and store
3066 * each boundary as a separate element in a temporary array. We also store
3067 * the boundary types, min or max, as +1 or -1 respectively in the temporary
3068 * array. Then we sort the temporary array in ascending order. We scan the
3069 * sorted array from lower to higher values and keep a cumulative sum of
3070 * boundary types. Element in the temporary array for which the sum reaches
3071 * mcount is a min boundary of a range in the result and next element will be
3072 * max boundary.
3073 *
3074 * Example for mcount = 3,
3075 *
3076 * ----|_________|-------|_______|----|__|------ mrange[0]
3077 *
3078 * -------|________|--|____________|-----|___|-- mrange[1]
3079 *
3080 * --------|________________|-------|____|------ mrange[2]
3081 *
3082 * 3 2 1
3083 * \|/
3084 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
3085 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3086 *
3087 * same min and max
3088 * V
3089 * --------|_____|-------|__|------------|------ intersecting ranges
3090 */
3091 void
3092 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3093 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3094 {
3095 mac_propval_uint32_range_t *rval, *ur;
3096 int rmaxcnt, rcount;
3097 size_t sz_range32;
3098 rboundary_t *ta; /* temporary array */
3099 rboundary_t temp;
3100 boolean_t range_started = B_FALSE;
3101 int i, j, m, sum;
3102
3103 sz_range32 = sizeof (mac_propval_uint32_range_t);
3104
3105 for (i = 0, rmaxcnt = 0; i < mcount; i++)
3106 rmaxcnt += mrange[i]->mpr_count;
3107
3108 /* Allocate enough space to store the results */
3109 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3110
3111 /* Number of boundaries are twice as many as ranges */
3112 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3113
3114 for (i = 0, m = 0; i < mcount; i++) {
3115 ur = &(mrange[i]->mpr_range_uint32[0]);
3116 for (j = 0; j < mrange[i]->mpr_count; j++) {
3117 ta[m].bval = ur[j].mpur_min;
3118 ta[m++].btype = 1;
3119 ta[m].bval = ur[j].mpur_max;
3120 ta[m++].btype = -1;
3121 }
3122 }
3123
3124 /*
3125 * Sort the temporary array in ascending order of bval;
3126 * if boundary values are same then sort on btype.
3127 */
3128 for (i = 0; i < m-1; i++) {
3129 for (j = i+1; j < m; j++) {
3130 if ((ta[i].bval > ta[j].bval) ||
3131 ((ta[i].bval == ta[j].bval) &&
3132 (ta[i].btype < ta[j].btype))) {
3133 temp = ta[i];
3134 ta[i] = ta[j];
3135 ta[j] = temp;
3136 }
3137 }
3138 }
3139
3140 /* Walk through temporary array to find all ranges in the results */
3141 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3142 sum += ta[i].btype;
3143 if (sum == mcount) {
3144 rval[rcount].mpur_min = ta[i].bval;
3145 range_started = B_TRUE;
3146 } else if (sum < mcount && range_started) {
3147 rval[rcount++].mpur_max = ta[i].bval;
3148 range_started = B_FALSE;
3149 }
3150 }
3151
3152 *prval = rval;
3153 *prmaxcnt = rmaxcnt;
3154 *prcount = rcount;
3155
3156 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3157 }
3158
3159 /*
3160 * Returns the mtu ranges which could be supported by aggr group.
3161 * prmaxcnt returns the size of the buffer prval, prcount returns
3162 * the number of valid entries in prval. Caller is responsible
3163 * for freeing up prval.
3164 */
3165 int
3166 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3167 int *prmaxcnt, int *prcount)
3168 {
3169 mac_propval_range_t **vals;
3170 aggr_port_t *port;
3171 mac_perim_handle_t mph;
3172 uint_t i, numr;
3173 int err = 0;
3174 size_t sz_propval, sz_range32;
3175 size_t size;
3176
3177 sz_propval = sizeof (mac_propval_range_t);
3178 sz_range32 = sizeof (mac_propval_uint32_range_t);
3179
3180 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3181
3182 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3183 KM_SLEEP);
3184
3185 for (port = grp->lg_ports, i = 0; port != NULL;
3186 port = port->lp_next, i++) {
3187
3188 size = sz_propval;
3189 vals[i] = kmem_alloc(size, KM_SLEEP);
3190 vals[i]->mpr_count = 1;
3191
3192 mac_perim_enter_by_mh(port->lp_mh, &mph);
3193
3194 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3195 NULL, 0, vals[i], NULL);
3196 if (err == ENOSPC) {
3197 /*
3198 * Not enough space to hold all ranges.
3199 * Allocate extra space as indicated and retry.
3200 */
3201 numr = vals[i]->mpr_count;
3202 kmem_free(vals[i], sz_propval);
3203 size = sz_propval + (numr - 1) * sz_range32;
3204 vals[i] = kmem_alloc(size, KM_SLEEP);
3205 vals[i]->mpr_count = numr;
3206 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3207 NULL, 0, vals[i], NULL);
3208 ASSERT(err != ENOSPC);
3209 }
3210 mac_perim_exit(mph);
3211 if (err != 0) {
3212 kmem_free(vals[i], size);
3213 vals[i] = NULL;
3214 break;
3215 }
3216 }
3217
3218 /*
3219 * if any of the underlying ports does not support changing MTU then
3220 * just return ENOTSUP
3221 */
3222 if (port != NULL) {
3223 ASSERT(err != 0);
3224 goto done;
3225 }
3226
3227 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3228 prcount);
3229
3230 done:
3231 for (i = 0; i < grp->lg_nports; i++) {
3232 if (vals[i] != NULL) {
3233 numr = vals[i]->mpr_count;
3234 size = sz_propval + (numr - 1) * sz_range32;
3235 kmem_free(vals[i], size);
3236 }
3237 }
3238
3239 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3240 return (err);
3241 }
3242
3243 static void
3244 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3245 mac_prop_info_handle_t prh)
3246 {
3247 aggr_grp_t *grp = m_driver;
3248 mac_propval_uint32_range_t *rval = NULL;
3249 int i, rcount, rmaxcnt;
3250 int err = 0;
3251
3252 _NOTE(ARGUNUSED(pr_name));
3253
3254 switch (pr_num) {
3255 case MAC_PROP_MTU:
3256
3257 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3258 &rcount);
3259 if (err != 0) {
3260 ASSERT(rval == NULL);
3261 return;
3262 }
3263 for (i = 0; i < rcount; i++) {
3264 mac_prop_info_set_range_uint32(prh,
3265 rval[i].mpur_min, rval[i].mpur_max);
3266 }
3267 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3268 break;
3269 }
3270 }