1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2018 Joyent, Inc.
24 */
25
26 /*
27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28 *
29 * An instance of the structure aggr_grp_t is allocated for each
30 * link aggregation group. When created, aggr_grp_t objects are
31 * entered into the aggr_grp_hash hash table maintained by the modhash
32 * module. The hash key is the linkid associated with the link
33 * aggregation group.
34 *
35 * Each aggregation contains a set of ports. The port is represented
36 * by the aggr_port_t structure. A port consists of a single MAC
37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
38 * MAC. This client is used by the aggr to send and receive LACP
39 * traffic. Each port client takes on the same MAC unicast address --
40 * the address of the aggregation itself (taken from the first port by
41 * default).
42 *
43 * The MAC client that hangs off each aggr port is not your typical
44 * MAC client. Not only does it have exclusive control of the MAC, but
45 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
46 * fanout traffic among L4 protocols; but the aggr is an intermediary,
47 * not a consumer. Instead of using SRSes, the aggr puts the
48 * underlying hardware rings into passthru mode and ships packets up
49 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
50 * LACP while passing all other traffic up to clients of the aggr.
51 *
52 * Pseudo Rx Groups and Rings
53 * --------------------------
54 *
55 * It is imperative for client performance that the aggr provide as
56 * many MAC groups as possible. In order to use the underlying HW
57 * resources, aggr creates pseudo groups to aggregate the underlying
58 * HW groups. Every HW group gets mapped to a pseudo group; and every
59 * HW ring in that group gets mapped to a pseudo ring. The pseudo
60 * group at index 0 combines all the HW groups at index 0 from each
61 * port, etc. The aggr's MAC then creates normal MAC groups and rings
62 * out of these pseudo groups and rings to present to the aggr's
63 * clients. To the clients, the aggr's groups and rings are absolutely
64 * no different than a NIC's groups or rings.
65 *
66 * Pseudo Tx Rings
67 * ---------------
68 *
69 * The underlying ports (NICs) in an aggregation can have Tx rings. To
70 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
72 * not new. They are already present and implemented on the Rx side.
73 * The same concept is extended to the Tx side where each Tx ring of
74 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
75 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
76 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
77 * is given to the aggregation layer.
78 *
79 * With this change, the outgoing stack depth looks much better:
80 *
81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
83 *
84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
85 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
86 *
87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
89 * ring belonging to a port on which the packet has to be sent.
90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from
92 * the selected port.
93 *
94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
95 * bandwidth limit is applied first on the outgoing packet and the packets
96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
97 * particular Tx ring.
98 */
99
100 #include <sys/types.h>
101 #include <sys/sysmacros.h>
102 #include <sys/conf.h>
103 #include <sys/cmn_err.h>
104 #include <sys/disp.h>
105 #include <sys/list.h>
106 #include <sys/ksynch.h>
107 #include <sys/kmem.h>
108 #include <sys/stream.h>
109 #include <sys/modctl.h>
110 #include <sys/ddi.h>
111 #include <sys/sunddi.h>
112 #include <sys/atomic.h>
113 #include <sys/stat.h>
114 #include <sys/modhash.h>
115 #include <sys/id_space.h>
116 #include <sys/strsun.h>
117 #include <sys/cred.h>
118 #include <sys/dlpi.h>
119 #include <sys/zone.h>
120 #include <sys/mac_provider.h>
121 #include <sys/dls.h>
122 #include <sys/vlan.h>
123 #include <sys/aggr.h>
124 #include <sys/aggr_impl.h>
125
/*
 * MAC callback entry points implemented by the aggr. Most of these are
 * referenced from the aggr_m_callbacks table in this file.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port lookup and removal helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo Rx group/ring management and the ring/group callbacks. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);

/* kmem cache from which aggr_grp_t objects are allocated. */
static kmem_cache_t *aggr_grp_cache;
/* Hash table of aggregation groups; the key is the group's linkid. */
static mod_hash_t *aggr_grp_hash;
/*
 * Global group lock; aggr_grp_count() reads aggr_grp_cnt under it.
 * NOTE(review): presumably also guards aggr_grp_hash -- confirm.
 */
static krwlock_t aggr_grp_lock;
/* Number of aggregation groups, as reported by aggr_grp_count(). */
static uint_t aggr_grp_cnt;
/* Id space used to hand out key values when no key is specified. */
static id_space_t *key_ids;

/* Size passed to mod_hash_create_idhash() for aggr_grp_hash. */
#define	GRP_HASHSZ			64
/* Convert a linkid into a modhash key. */
#define	GRP_HASH_KEY(linkid)		((mod_hash_key_t)(uintptr_t)linkid)
#define	AGGR_PORT_NAME_DELIMIT '-'

/* Six zero bytes, i.e. an all-zero MAC address. */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
177
/*
 * Flags for the optional callbacks supplied in aggr_m_callbacks below:
 * ioctl, capability query, property set and property info.
 */
#define	AGGR_M_CALLBACK_FLAGS	\
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * MAC callback table for the aggr. This is a positional initializer, so
 * the entries must stay in the member order of mac_callbacks_t, which is
 * declared outside this file. NULL entries are callbacks that the aggr
 * does not supply through this table. Note that aggr_m_unicst() is
 * declared in this file but is not referenced here.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};
199
200 /*ARGSUSED*/
201 static int
202 aggr_grp_constructor(void *buf, void *arg, int kmflag)
203 {
204 aggr_grp_t *grp = buf;
205
206 bzero(grp, sizeof (*grp));
207 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
208 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
209 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
210 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
211 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
212 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
213 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
214 grp->lg_link_state = LINK_STATE_UNKNOWN;
215 return (0);
216 }
217
218 /*ARGSUSED*/
219 static void
220 aggr_grp_destructor(void *buf, void *arg)
221 {
222 aggr_grp_t *grp = buf;
223
224 if (grp->lg_tx_ports != NULL) {
225 kmem_free(grp->lg_tx_ports,
226 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
227 }
228
229 mutex_destroy(&grp->lg_lacp_lock);
230 cv_destroy(&grp->lg_lacp_cv);
231 mutex_destroy(&grp->lg_port_lock);
232 cv_destroy(&grp->lg_port_cv);
233 rw_destroy(&grp->lg_tx_lock);
234 mutex_destroy(&grp->lg_tx_flowctl_lock);
235 cv_destroy(&grp->lg_tx_flowctl_cv);
236 }
237
238 void
239 aggr_grp_init(void)
240 {
241 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
242 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
243 aggr_grp_destructor, NULL, NULL, NULL, 0);
244
245 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
246 GRP_HASHSZ, mod_hash_null_valdtor);
247 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
248 aggr_grp_cnt = 0;
249
250 /*
251 * Allocate an id space to manage key values (when key is not
252 * specified). The range of the id space will be from
253 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
254 * uses a 16-bit key.
255 */
256 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
257 ASSERT(key_ids != NULL);
258 }
259
/*
 * Tear down the module-wide group state created by aggr_grp_init(): the
 * key id space, the global group lock, the group hash table and the
 * aggr_grp_t kmem cache.
 */
void
aggr_grp_fini(void)
{
	id_space_destroy(key_ids);
	rw_destroy(&aggr_grp_lock);
	mod_hash_destroy_idhash(aggr_grp_hash);
	kmem_cache_destroy(aggr_grp_cache);
}
268
269 uint_t
270 aggr_grp_count(void)
271 {
272 uint_t count;
273
274 rw_enter(&aggr_grp_lock, RW_READER);
275 count = aggr_grp_cnt;
276 rw_exit(&aggr_grp_lock);
277 return (count);
278 }
279
280 /*
281 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
282 * requires the mac perimeter, this function holds a reference of the aggr
283 * and aggr won't call mac_unregister() until this reference drops to 0.
284 */
285 void
286 aggr_grp_port_hold(aggr_port_t *port)
287 {
288 aggr_grp_t *grp = port->lp_grp;
289
290 AGGR_PORT_REFHOLD(port);
291 mutex_enter(&grp->lg_port_lock);
292 grp->lg_port_ref++;
293 mutex_exit(&grp->lg_port_lock);
294 }
295
296 /*
297 * Release the reference of the grp and inform aggr_grp_delete() calling
298 * mac_unregister() is now safe.
299 */
300 void
301 aggr_grp_port_rele(aggr_port_t *port)
302 {
303 aggr_grp_t *grp = port->lp_grp;
304
305 mutex_enter(&grp->lg_port_lock);
306 if (--grp->lg_port_ref == 0)
307 cv_signal(&grp->lg_port_cv);
308 mutex_exit(&grp->lg_port_lock);
309 AGGR_PORT_REFRELE(port);
310 }
311
312 /*
313 * Wait for the port's lacp timer thread and the port's notification callback
314 * to exit.
315 */
316 void
317 aggr_grp_port_wait(aggr_grp_t *grp)
318 {
319 mutex_enter(&grp->lg_port_lock);
320 if (grp->lg_port_ref != 0)
321 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
322 mutex_exit(&grp->lg_port_lock);
323 }
324
325 /*
326 * Attach a port to a link aggregation group.
327 *
328 * A port is attached to a link aggregation group once its speed
329 * and link state have been verified.
330 *
331 * Returns B_TRUE if the group link state or speed has changed. If
332 * it's the case, the caller must notify the MAC layer via a call
333 * to mac_link().
334 */
335 boolean_t
336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
337 {
338 boolean_t link_state_changed = B_FALSE;
339
340 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
341 ASSERT(MAC_PERIM_HELD(port->lp_mh));
342
343 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
344 return (B_FALSE);
345
346 /*
347 * Validate the MAC port link speed and update the group
348 * link speed if needed.
349 */
350 if (port->lp_ifspeed == 0 ||
351 port->lp_link_state != LINK_STATE_UP ||
352 port->lp_link_duplex != LINK_DUPLEX_FULL) {
353 /*
354 * Can't attach a MAC port with unknown link speed,
355 * down link, or not in full duplex mode.
356 */
357 return (B_FALSE);
358 }
359
360 mutex_enter(&grp->lg_stat_lock);
361 if (grp->lg_ifspeed == 0) {
362 /*
363 * The group inherits the speed of the first link being
364 * attached.
365 */
366 grp->lg_ifspeed = port->lp_ifspeed;
367 link_state_changed = B_TRUE;
368 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
369 /*
370 * The link speed of the MAC port must be the same as
371 * the group link speed, as per 802.3ad. Since it is
372 * not, the attach is cancelled.
373 */
374 mutex_exit(&grp->lg_stat_lock);
375 return (B_FALSE);
376 }
377 mutex_exit(&grp->lg_stat_lock);
378
379 grp->lg_nattached_ports++;
380
381 /*
382 * Update the group link state.
383 */
384 if (grp->lg_link_state != LINK_STATE_UP) {
385 grp->lg_link_state = LINK_STATE_UP;
386 mutex_enter(&grp->lg_stat_lock);
387 grp->lg_link_duplex = LINK_DUPLEX_FULL;
388 mutex_exit(&grp->lg_stat_lock);
389 link_state_changed = B_TRUE;
390 }
391
392 /*
393 * Update port's state.
394 */
395 port->lp_state = AGGR_PORT_STATE_ATTACHED;
396
397 aggr_grp_multicst_port(port, B_TRUE);
398
399 /*
400 * The port client doesn't have an Rx SRS; instead of calling
401 * mac_rx_set() we set the client's flow callback directly.
402 * This datapath is used only when the port's driver doesn't
403 * support MAC_CAPAB_RINGS. Drivers with ring support will
404 * deliver traffic to the aggr via ring passthru.
405 */
406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
407
408 /*
409 * If LACP is OFF, the port can be used to send data as soon
410 * as its link is up and verified to be compatible with the
411 * aggregation.
412 *
413 * If LACP is active or passive, notify the LACP subsystem, which
414 * will enable sending on the port following the LACP protocol.
415 */
416 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
417 aggr_send_port_enable(port);
418 else
419 aggr_lacp_port_attached(port);
420
421 return (link_state_changed);
422 }
423
424 boolean_t
425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
426 {
427 boolean_t link_state_changed = B_FALSE;
428
429 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
430 ASSERT(MAC_PERIM_HELD(port->lp_mh));
431
432 /* update state */
433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
434 return (B_FALSE);
435
436 mac_client_clear_flow_cb(port->lp_mch);
437
438 aggr_grp_multicst_port(port, B_FALSE);
439
440 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
441 aggr_send_port_disable(port);
442 else
443 aggr_lacp_port_detached(port);
444
445 port->lp_state = AGGR_PORT_STATE_STANDBY;
446
447 grp->lg_nattached_ports--;
448 if (grp->lg_nattached_ports == 0) {
449 /* the last attached MAC port of the group is being detached */
450 grp->lg_link_state = LINK_STATE_DOWN;
451 mutex_enter(&grp->lg_stat_lock);
452 grp->lg_ifspeed = 0;
453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
454 mutex_exit(&grp->lg_stat_lock);
455 link_state_changed = B_TRUE;
456 }
457
458 return (link_state_changed);
459 }
460
461 /*
462 * Update the MAC addresses of the constituent ports of the specified
463 * group. This function is invoked:
464 * - after creating a new aggregation group.
465 * - after adding new ports to an aggregation group.
466 * - after removing a port from a group when the MAC address of
467 * that port was used for the MAC address of the group.
468 * - after the MAC address of a port changed when the MAC address
469 * of that port was used for the MAC address of the group.
470 *
471 * Return true if the link state of the aggregation changed, for example
472 * as a result of a failure changing the MAC address of one of the
473 * constituent ports.
474 */
475 boolean_t
476 aggr_grp_update_ports_mac(aggr_grp_t *grp)
477 {
478 aggr_port_t *cport;
479 boolean_t link_state_changed = B_FALSE;
480 mac_perim_handle_t mph;
481
482 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
483
484 for (cport = grp->lg_ports; cport != NULL;
485 cport = cport->lp_next) {
486 mac_perim_enter_by_mh(cport->lp_mh, &mph);
487 if (aggr_port_unicst(cport) != 0) {
488 if (aggr_grp_detach_port(grp, cport))
489 link_state_changed = B_TRUE;
490 } else {
491 /*
492 * If a port was detached because of a previous
493 * failure changing the MAC address, the port is
494 * reattached when it successfully changes the MAC
495 * address now, and this might cause the link state
496 * of the aggregation to change.
497 */
498 if (aggr_grp_attach_port(grp, cport))
499 link_state_changed = B_TRUE;
500 }
501 mac_perim_exit(mph);
502 }
503 return (link_state_changed);
504 }
505
506 /*
507 * Invoked when the MAC address of a port has changed. If the port's
508 * MAC address was used for the group MAC address, set mac_addr_changedp
509 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
510 * notification. If the link state changes due to detach/attach of
511 * the constituent port, set link_state_changedp to B_TRUE to indicate
512 * to the caller that it should send a MAC_NOTE_LINK notification. In both
513 * cases, it is the responsibility of the caller to invoke notification
514 * functions after releasing the the port lock.
515 */
516 void
517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
518 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
519 {
520 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
521 ASSERT(MAC_PERIM_HELD(port->lp_mh));
522 ASSERT(mac_addr_changedp != NULL);
523 ASSERT(link_state_changedp != NULL);
524
525 *mac_addr_changedp = B_FALSE;
526 *link_state_changedp = B_FALSE;
527
528 if (grp->lg_addr_fixed) {
529 /*
530 * The group is using a fixed MAC address or an automatic
531 * MAC address has not been set.
532 */
533 return;
534 }
535
536 if (grp->lg_mac_addr_port == port) {
537 /*
538 * The MAC address of the port was assigned to the group
539 * MAC address. Update the group MAC address.
540 */
541 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
542 *mac_addr_changedp = B_TRUE;
543 } else {
544 /*
545 * Update the actual port MAC address to the MAC address
546 * of the group.
547 */
548 if (aggr_port_unicst(port) != 0) {
549 *link_state_changedp = aggr_grp_detach_port(grp, port);
550 } else {
551 /*
552 * If a port was detached because of a previous
553 * failure changing the MAC address, the port is
554 * reattached when it successfully changes the MAC
555 * address now, and this might cause the link state
556 * of the aggregation to change.
557 */
558 *link_state_changedp = aggr_grp_attach_port(grp, port);
559 }
560 }
561 }
562
563 /*
564 * Add a port to a link aggregation group.
565 */
566 static int
567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
568 aggr_port_t **pp)
569 {
570 aggr_port_t *port, **cport;
571 mac_perim_handle_t mph;
572 zoneid_t port_zoneid = ALL_ZONES;
573 int err;
574
575 /* The port must be in the same zone as the aggregation. */
576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
577 port_zoneid = GLOBAL_ZONEID;
578 if (grp->lg_zoneid != port_zoneid)
579 return (EBUSY);
580
581 /*
582 * If we are creating the aggr, then there is no MAC handle
583 * and thus no perimeter to hold. If we are adding a port to
584 * an existing aggr, then the perimiter of the aggr's MAC must
585 * be held.
586 */
587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
588
589 err = aggr_port_create(grp, port_linkid, force, &port);
590 if (err != 0)
591 return (err);
592
593 mac_perim_enter_by_mh(port->lp_mh, &mph);
594
595 /* Add the new port to the end of the list. */
596 cport = &grp->lg_ports;
597 while (*cport != NULL)
598 cport = &((*cport)->lp_next);
599 *cport = port;
600
601 /*
602 * Back reference to the group it is member of. A port always
603 * holds a reference to its group to ensure that the back
604 * reference is always valid.
605 */
606 port->lp_grp = grp;
607 AGGR_GRP_REFHOLD(grp);
608 grp->lg_nports++;
609
610 aggr_lacp_init_port(port);
611 mac_perim_exit(mph);
612
613 if (pp != NULL)
614 *pp = port;
615
616 return (0);
617 }
618
619 /*
620 * This is called in response to either our LACP state machine or a MAC
621 * notification that the link has gone down via aggr_send_port_disable(). At
622 * this point, we may need to update our default ring. To that end, we go
623 * through the set of ports (underlying datalinks in an aggregation) that are
624 * currently enabled to transmit data. If all our links have been disabled for
625 * transmit, then we don't do anything.
626 *
627 * Note, because we only have a single TX group, we don't have to worry about
628 * the rings moving between groups and the chance that mac will reassign it
629 * unless someone removes a port, at which point, we play it safe and call this
630 * again.
631 */
632 void
633 aggr_grp_update_default(aggr_grp_t *grp)
634 {
635 aggr_port_t *port;
636 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
637
638 rw_enter(&grp->lg_tx_lock, RW_WRITER);
639
640 if (grp->lg_ntx_ports == 0) {
641 rw_exit(&grp->lg_tx_lock);
642 return;
643 }
644
645 port = grp->lg_tx_ports[0];
646 ASSERT(port->lp_tx_ring_cnt > 0);
647 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
648 rw_exit(&grp->lg_tx_lock);
649 }
650
651 /*
652 * Add a pseudo RX ring for the given HW ring handle.
653 */
654 static int
655 aggr_add_pseudo_rx_ring(aggr_port_t *port,
656 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
657 {
658 aggr_pseudo_rx_ring_t *ring;
659 int err;
660 int j;
661
662 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
663 ring = rx_grp->arg_rings + j;
664 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
665 break;
666 }
667
668 /*
669 * No slot for this new RX ring.
670 */
671 if (j == MAX_RINGS_PER_GROUP)
672 return (EIO);
673
674 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
675 ring->arr_hw_rh = hw_rh;
676 ring->arr_port = port;
677 ring->arr_grp = rx_grp;
678 rx_grp->arg_ring_cnt++;
679
680 /*
681 * The group is already registered, dynamically add a new ring to the
682 * mac group.
683 */
684 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
685 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
686 ring->arr_hw_rh = NULL;
687 ring->arr_port = NULL;
688 ring->arr_grp = NULL;
689 rx_grp->arg_ring_cnt--;
690 } else {
691 /*
692 * This must run after the MAC is registered.
693 */
694 ASSERT3P(ring->arr_rh, !=, NULL);
695 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
696 (void *)port, (mac_resource_handle_t)ring);
697 }
698 return (err);
699 }
700
701 /*
702 * Remove the pseudo RX ring of the given HW ring handle.
703 */
704 static void
705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
706 {
707 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
708 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
709
710 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
711 ring->arr_hw_rh != hw_rh) {
712 continue;
713 }
714
715 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
716
717 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
718 ring->arr_hw_rh = NULL;
719 ring->arr_port = NULL;
720 ring->arr_grp = NULL;
721 rx_grp->arg_ring_cnt--;
722 mac_hwring_clear_passthru(hw_rh);
723 break;
724 }
725 }
726
727 /*
728 * Create pseudo rings over the HW rings of the port.
729 *
730 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
731 *
732 * o Program existing unicast filters on the pseudo group into the HW group.
733 *
734 * o Program existing VLAN filters on the pseudo group into the HW group.
735 */
736 static int
737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
738 {
739 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
740 aggr_unicst_addr_t *addr, *a;
741 mac_perim_handle_t pmph;
742 aggr_vlan_t *avp;
743 uint_t hw_rh_cnt, i;
744 int err = 0;
745 uint_t g_idx = rx_grp->arg_index;
746
747 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
748 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
749 mac_perim_enter_by_mh(port->lp_mh, &pmph);
750
751 /*
752 * This function must be called after the aggr registers its
753 * MAC and its Rx groups have been initialized.
754 */
755 ASSERT(rx_grp->arg_gh != NULL);
756
757 /*
758 * Get the list of the underlying HW rings.
759 */
760 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
761 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
762
763 /*
764 * Add existing VLAN and unicast address filters to the port.
765 */
766 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
767 avp = list_next(&rx_grp->arg_vlans, avp)) {
768 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
769 goto err;
770 }
771
772 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
773 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
774 goto err;
775 }
776
777 for (i = 0; i < hw_rh_cnt; i++) {
778 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
779 if (err != 0)
780 goto err;
781 }
782
783 mac_perim_exit(pmph);
784 return (0);
785
786 err:
787 ASSERT(err != 0);
788
789 for (uint_t j = 0; j < i; j++)
790 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
791
792 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
793 aggr_port_remmac(port, g_idx, a->aua_addr);
794
795 if (avp != NULL)
796 avp = list_prev(&rx_grp->arg_vlans, avp);
797
798 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
799 int err2;
800
801 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
802 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
803 ": errno %d.", avp->av_vid,
804 mac_client_name(port->lp_mch), err2);
805 }
806 }
807
808 port->lp_hwghs[g_idx] = NULL;
809 mac_perim_exit(pmph);
810 return (err);
811 }
812
813 /*
814 * Destroy the pseudo rings mapping to this port and remove all VLAN
815 * and unicast filters from this port. Even if there are no underlying
816 * HW rings we must still remove the unicast filters to take the port
817 * out of promisc mode.
818 */
819 static void
820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
821 {
822 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
823 aggr_unicst_addr_t *addr;
824 mac_perim_handle_t pmph;
825 uint_t hw_rh_cnt;
826 uint_t g_idx = rx_grp->arg_index;
827
828 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
829 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
830 ASSERT3P(rx_grp->arg_gh, !=, NULL);
831 mac_perim_enter_by_mh(port->lp_mh, &pmph);
832
833 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
834 MAC_RING_TYPE_RX);
835
836 for (uint_t i = 0; i < hw_rh_cnt; i++)
837 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
838
839 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
840 aggr_port_remmac(port, g_idx, addr->aua_addr);
841
842 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
843 avp = list_next(&rx_grp->arg_vlans, avp)) {
844 int err;
845
846 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
847 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
848 ": errno %d.", avp->av_vid,
849 mac_client_name(port->lp_mch), err);
850 }
851 }
852
853 port->lp_hwghs[g_idx] = NULL;
854 mac_perim_exit(pmph);
855 }
856
857 /*
858 * Add a pseudo TX ring for the given HW ring handle.
859 */
860 static int
861 aggr_add_pseudo_tx_ring(aggr_port_t *port,
862 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
863 mac_ring_handle_t *pseudo_rh)
864 {
865 aggr_pseudo_tx_ring_t *ring;
866 int err;
867 int i;
868
869 ASSERT(MAC_PERIM_HELD(port->lp_mh));
870 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
871 ring = tx_grp->atg_rings + i;
872 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
873 break;
874 }
875 /*
876 * No slot for this new TX ring.
877 */
878 if (i == MAX_RINGS_PER_GROUP)
879 return (EIO);
880 /*
881 * The following 4 statements needs to be done before
882 * calling mac_group_add_ring(). Otherwise it will
883 * result in an assertion failure in mac_init_ring().
884 */
885 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
886 ring->atr_hw_rh = hw_rh;
887 ring->atr_port = port;
888 tx_grp->atg_ring_cnt++;
889
890 /*
891 * The TX side has no concept of ring groups unlike RX groups.
892 * There is just a single group which stores all the TX rings.
893 * This group will be used to store aggr's pseudo TX rings.
894 */
895 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
896 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
897 ring->atr_hw_rh = NULL;
898 ring->atr_port = NULL;
899 tx_grp->atg_ring_cnt--;
900 } else {
901 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
902 if (hw_rh != NULL) {
903 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
904 mac_find_ring(tx_grp->atg_gh, i));
905 }
906 }
907
908 return (err);
909 }
910
911 /*
912 * Remove the pseudo TX ring of the given HW ring handle.
913 */
914 static void
915 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
916 mac_ring_handle_t pseudo_hw_rh)
917 {
918 aggr_pseudo_tx_ring_t *ring;
919 int i;
920
921 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
922 ring = tx_grp->atg_rings + i;
923 if (ring->atr_rh != pseudo_hw_rh)
924 continue;
925
926 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
927 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
928 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
929 mac_hwring_teardown(ring->atr_hw_rh);
930 ring->atr_hw_rh = NULL;
931 ring->atr_port = NULL;
932 tx_grp->atg_ring_cnt--;
933 break;
934 }
935 }
936
937 /*
938 * This function is called to create pseudo rings over hardware rings of
939 * the underlying device. There is a 1:1 mapping between the pseudo TX
940 * rings of the aggr and the hardware rings of the underlying port.
941 */
942 static int
943 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
944 {
945 aggr_grp_t *grp = port->lp_grp;
946 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
947 mac_perim_handle_t pmph;
948 int hw_rh_cnt, i = 0, j;
949 int err = 0;
950
951 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
952 mac_perim_enter_by_mh(port->lp_mh, &pmph);
953
954 /*
955 * Get the list the the underlying HW rings.
956 */
957 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
958 MAC_RING_TYPE_TX);
959
960 /*
961 * Even if the underlying NIC does not have TX rings, we
962 * still make a psuedo TX ring for that NIC with NULL as
963 * the ring handle.
964 */
965 if (hw_rh_cnt == 0)
966 port->lp_tx_ring_cnt = 1;
967 else
968 port->lp_tx_ring_cnt = hw_rh_cnt;
969
970 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
971 port->lp_tx_ring_cnt), KM_SLEEP);
972 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
973 port->lp_tx_ring_cnt), KM_SLEEP);
974
975 if (hw_rh_cnt == 0) {
976 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
977 NULL, &pseudo_rh)) == 0) {
978 port->lp_tx_rings[0] = NULL;
979 port->lp_pseudo_tx_rings[0] = pseudo_rh;
980 }
981 } else {
982 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
983 err = aggr_add_pseudo_tx_ring(port,
984 tx_grp, hw_rh[i], &pseudo_rh);
985 if (err != 0)
986 break;
987 port->lp_tx_rings[i] = hw_rh[i];
988 port->lp_pseudo_tx_rings[i] = pseudo_rh;
989 }
990 }
991
992 if (err != 0) {
993 if (hw_rh_cnt != 0) {
994 for (j = 0; j < i; j++) {
995 aggr_rem_pseudo_tx_ring(tx_grp,
996 port->lp_pseudo_tx_rings[j]);
997 }
998 }
999 kmem_free(port->lp_tx_rings,
1000 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1001 kmem_free(port->lp_pseudo_tx_rings,
1002 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1003 port->lp_tx_ring_cnt = 0;
1004 } else {
1005 port->lp_tx_grp_added = B_TRUE;
1006 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1007 aggr_tx_ring_update, port);
1008 }
1009 mac_perim_exit(pmph);
1010 aggr_grp_update_default(grp);
1011 return (err);
1012 }
1013
1014 /*
1015 * This function is called by aggr to remove pseudo TX rings over the
1016 * HW rings of the underlying port.
1017 */
1018 static void
1019 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1020 {
1021 aggr_grp_t *grp = port->lp_grp;
1022 mac_perim_handle_t pmph;
1023 int i;
1024
1025 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1026 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1027
1028 if (!port->lp_tx_grp_added)
1029 goto done;
1030
1031 ASSERT(tx_grp->atg_gh != NULL);
1032
1033 for (i = 0; i < port->lp_tx_ring_cnt; i++)
1034 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1035
1036 kmem_free(port->lp_tx_rings,
1037 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1038 kmem_free(port->lp_pseudo_tx_rings,
1039 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1040
1041 port->lp_tx_ring_cnt = 0;
1042 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1043 port->lp_tx_grp_added = B_FALSE;
1044 aggr_grp_update_default(grp);
1045 done:
1046 mac_perim_exit(pmph);
1047 }
1048
1049 static int
1050 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1051 {
1052 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1053 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1054 }
1055
1056 static int
1057 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1058 {
1059 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1060 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1061 }
1062
1063 /*
1064 * Start the pseudo ring. Since the pseudo ring is just an abstraction
1065 * over an actual HW ring, the real task is to start the underlying HW
1066 * ring.
1067 */
1068 static int
1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1070 {
1071 int err;
1072 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1073
1074 err = mac_hwring_start(rr_ring->arr_hw_rh);
1075
1076 if (err != 0)
1077 return (err);
1078
1079 rr_ring->arr_gen = mr_gen;
1080 return (err);
1081 }
1082
1083 /*
1084 * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1085 * over an actual HW ring, the real task is to stop the underlying HW
1086 * ring.
1087 */
1088 static void
1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1090 {
1091 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1092
1093 /*
1094 * The rings underlying the default group must stay up to
1095 * continue receiving LACP traffic. We would normally never
1096 * stop the default Rx rings because of the primary MAC
1097 * client; but aggr's primary MAC client doesn't call
1098 * mac_unicast_add() and thus mi_active is 0 when the last
1099 * non-primary client is deleted.
1100 */
1101 if (rr_ring->arr_grp->arg_index != 0)
1102 mac_hwring_stop(rr_ring->arr_hw_rh);
1103 }
1104
1105 /*
1106 * Add one or more ports to an existing link aggregation group.
1107 */
1108 int
1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1110 laioc_port_t *ports)
1111 {
1112 int rc;
1113 uint_t port_added = 0;
1114 uint_t grp_added;
1115 aggr_grp_t *grp = NULL;
1116 aggr_port_t *port;
1117 boolean_t link_state_changed = B_FALSE;
1118 mac_perim_handle_t mph, pmph;
1119
1120 /* Get the aggr corresponding to linkid. */
1121 rw_enter(&aggr_grp_lock, RW_READER);
1122 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1123 (mod_hash_val_t *)&grp) != 0) {
1124 rw_exit(&aggr_grp_lock);
1125 return (ENOENT);
1126 }
1127 AGGR_GRP_REFHOLD(grp);
1128
1129 /*
1130 * Hold the perimeter so that the aggregation can't be destroyed.
1131 */
1132 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1133 rw_exit(&aggr_grp_lock);
1134
1135 /* Add the specified ports to the aggr. */
1136 for (uint_t i = 0; i < nports; i++) {
1137 grp_added = 0;
1138
1139 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1140 force, &port)) != 0) {
1141 goto bail;
1142 }
1143
1144 ASSERT(port != NULL);
1145 port_added++;
1146
1147 /* check capabilities */
1148 if (!aggr_grp_capab_check(grp, port) ||
1149 !aggr_grp_sdu_check(grp, port) ||
1150 !aggr_grp_margin_check(grp, port)) {
1151 rc = ENOTSUP;
1152 goto bail;
1153 }
1154
1155 /*
1156 * Create the pseudo ring for each HW ring of the underlying
1157 * port.
1158 */
1159 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1160 if (rc != 0)
1161 goto bail;
1162
1163 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1164 rc = aggr_add_pseudo_rx_group(port,
1165 &grp->lg_rx_groups[j]);
1166
1167 if (rc != 0)
1168 goto bail;
1169
1170 grp_added++;
1171 }
1172
1173 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1174
1175 /* set LACP mode */
1176 aggr_port_lacp_set_mode(grp, port);
1177
1178 /* start port if group has already been started */
1179 if (grp->lg_started) {
1180 rc = aggr_port_start(port);
1181 if (rc != 0) {
1182 mac_perim_exit(pmph);
1183 goto bail;
1184 }
1185
1186 /*
1187 * Turn on the promiscuous mode over the port when it
1188 * is requested to be turned on to receive the
1189 * non-primary address over a port, or the promiscuous
1190 * mode is enabled over the aggr.
1191 */
1192 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1193 rc = aggr_port_promisc(port, B_TRUE);
1194 if (rc != 0) {
1195 mac_perim_exit(pmph);
1196 goto bail;
1197 }
1198 }
1199 }
1200 mac_perim_exit(pmph);
1201
1202 /*
1203 * Attach each port if necessary.
1204 */
1205 if (aggr_port_notify_link(grp, port))
1206 link_state_changed = B_TRUE;
1207
1208 /*
1209 * Initialize the callback functions for this port.
1210 */
1211 aggr_port_init_callbacks(port);
1212 }
1213
1214 /* update the MAC address of the constituent ports */
1215 if (aggr_grp_update_ports_mac(grp))
1216 link_state_changed = B_TRUE;
1217
1218 if (link_state_changed)
1219 mac_link_update(grp->lg_mh, grp->lg_link_state);
1220
1221 bail:
1222 if (rc != 0) {
1223 /* stop and remove ports that have been added */
1224 for (uint_t i = 0; i < port_added; i++) {
1225 uint_t grp_remove;
1226
1227 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1228 ASSERT(port != NULL);
1229
1230 if (grp->lg_started) {
1231 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1232 (void) aggr_port_promisc(port, B_FALSE);
1233 aggr_port_stop(port);
1234 mac_perim_exit(pmph);
1235 }
1236
1237 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1238
1239 /*
1240 * Only the last port could have a partial set
1241 * of groups added.
1242 */
1243 grp_remove = (i + 1 == port_added) ? grp_added :
1244 grp->lg_rx_group_count;
1245
1246 for (uint_t j = 0; j < grp_remove; j++) {
1247 aggr_rem_pseudo_rx_group(port,
1248 &grp->lg_rx_groups[j]);
1249 }
1250
1251 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1252 }
1253 }
1254
1255 mac_perim_exit(mph);
1256 AGGR_GRP_REFRELE(grp);
1257 return (rc);
1258 }
1259
1260 static int
1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1262 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1263 aggr_lacp_timer_t lacp_timer)
1264 {
1265 boolean_t mac_addr_changed = B_FALSE;
1266 boolean_t link_state_changed = B_FALSE;
1267 mac_perim_handle_t pmph;
1268
1269 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1270
1271 /* validate fixed address if specified */
1272 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1273 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1274 (mac_addr[0] & 0x01))) {
1275 return (EINVAL);
1276 }
1277
1278 /* update policy if requested */
1279 if (update_mask & AGGR_MODIFY_POLICY)
1280 aggr_send_update_policy(grp, policy);
1281
1282 /* update unicast MAC address if requested */
1283 if (update_mask & AGGR_MODIFY_MAC) {
1284 if (mac_fixed) {
1285 /* user-supplied MAC address */
1286 grp->lg_mac_addr_port = NULL;
1287 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1288 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1289 mac_addr_changed = B_TRUE;
1290 }
1291 } else if (grp->lg_addr_fixed) {
1292 /* switch from user-supplied to automatic */
1293 aggr_port_t *port = grp->lg_ports;
1294
1295 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1296 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1297 grp->lg_mac_addr_port = port;
1298 mac_addr_changed = B_TRUE;
1299 mac_perim_exit(pmph);
1300 }
1301 grp->lg_addr_fixed = mac_fixed;
1302 }
1303
1304 if (mac_addr_changed)
1305 link_state_changed = aggr_grp_update_ports_mac(grp);
1306
1307 if (update_mask & AGGR_MODIFY_LACP_MODE)
1308 aggr_lacp_update_mode(grp, lacp_mode);
1309
1310 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1311 aggr_lacp_update_timer(grp, lacp_timer);
1312
1313 if (link_state_changed)
1314 mac_link_update(grp->lg_mh, grp->lg_link_state);
1315
1316 if (mac_addr_changed)
1317 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1318
1319 return (0);
1320 }
1321
1322 /*
1323 * Update properties of an existing link aggregation group.
1324 */
1325 int
1326 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1327 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1328 aggr_lacp_timer_t lacp_timer)
1329 {
1330 aggr_grp_t *grp = NULL;
1331 mac_perim_handle_t mph;
1332 int err;
1333
1334 /* get group corresponding to linkid */
1335 rw_enter(&aggr_grp_lock, RW_READER);
1336 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1337 (mod_hash_val_t *)&grp) != 0) {
1338 rw_exit(&aggr_grp_lock);
1339 return (ENOENT);
1340 }
1341 AGGR_GRP_REFHOLD(grp);
1342
1343 /*
1344 * Hold the perimeter so that the aggregation won't be destroyed.
1345 */
1346 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1347 rw_exit(&aggr_grp_lock);
1348
1349 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1350 mac_addr, lacp_mode, lacp_timer);
1351
1352 mac_perim_exit(mph);
1353 AGGR_GRP_REFRELE(grp);
1354 return (err);
1355 }
1356
/*
 * Create a new link aggregation group upon request from administrator.
 *
 *	linkid		datalink ID the new aggregation is registered under
 *	key		aggregation key; 0 means allocate one from key_ids
 *	nports		number of entries in ports[]; must be at least 1
 *	ports		initial member ports (only lp_linkid is read here)
 *	policy		outbound load balancing policy
 *	mac_fixed	B_TRUE if mac_addr is to be used as the group address
 *	force		stored in the group and passed to aggr_grp_add_port()
 *	mac_addr	the fixed MAC address; only read when mac_fixed is set
 *	lacp_mode	initial LACP mode
 *	lacp_timer	initial LACP timer
 *	credp		credentials of the caller; supplies the zone ID
 *
 * Returns 0 on success, an errno on failure: EINVAL if nports is 0 or the
 * fixed MAC address is all zeroes, EEXIST if linkid is already in the
 * group hash, ENOMEM if key or mac_register_t allocation fails, or the
 * error returned by aggr_grp_add_port(), mac_register() or
 * dls_devnet_create().
 *
 * aggr_grp_lock is held as writer from the hash lookup until return.
 */
int
aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
    laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
    uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
    cred_t *credp)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port;
	mac_register_t *mac;
	boolean_t link_state_changed;
	mac_perim_handle_t mph;
	int err;
	int i;
	kt_did_t tid = 0;

	/* need at least one port */
	if (nports == 0)
		return (EINVAL);

	rw_enter(&aggr_grp_lock, RW_WRITER);

	/* does a group with the same linkid already exist? */
	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp);
	if (err == 0) {
		rw_exit(&aggr_grp_lock);
		return (EEXIST);
	}

	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);

	/*
	 * Initialize the group state. The single reference taken here is
	 * the one dropped by AGGR_GRP_REFRELE() on the bail path below.
	 */
	grp->lg_refs = 1;
	grp->lg_closing = B_FALSE;
	grp->lg_force = force;
	grp->lg_linkid = linkid;
	grp->lg_zoneid = crgetzoneid(credp);
	grp->lg_ifspeed = 0;
	grp->lg_link_state = LINK_STATE_UNKNOWN;
	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
	grp->lg_started = B_FALSE;
	grp->lg_promisc = B_FALSE;
	grp->lg_lacp_done = B_FALSE;
	grp->lg_tx_notify_done = B_FALSE;
	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
	/* Both helper threads are created before any port is added. */
	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_notify_thread = thread_create(NULL, 0,
	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
	    MAX_RINGS_PER_GROUP), KM_SLEEP);
	grp->lg_tx_blocked_cnt = 0;
	bzero(&grp->lg_rx_groups,
	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
	aggr_lacp_init_grp(grp);

	/* add MAC ports to group */
	grp->lg_ports = NULL;
	grp->lg_nports = 0;
	grp->lg_nattached_ports = 0;
	grp->lg_ntx_ports = 0;

	/*
	 * If key is not specified by the user, allocate the key.
	 */
	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
		err = ENOMEM;
		goto bail;
	}
	grp->lg_key = key;

	for (i = 0; i < nports; i++) {
		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
		if (err != 0)
			goto bail;
	}

	/* Start with one Rx group and grow to the largest port's count. */
	grp->lg_rx_group_count = 1;

	for (i = 0, port = grp->lg_ports; port != NULL;
	    i++, port = port->lp_next) {
		uint_t num_rgroups;

		mac_perim_enter_by_mh(port->lp_mh, &mph);
		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
		mac_perim_exit(mph);

		/*
		 * Utilize all the groups in a port. If some ports
		 * have less groups than others, then traffic destined
		 * for the same unicast address may be HW classified
		 * on some ports but SW classified by aggr when
		 * arriving on other ports.
		 */
		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
		    num_rgroups);
	}

	/*
	 * There could be cases where the hardware provides more
	 * groups than aggr can support. Make sure we never go above
	 * the max aggr can support.
	 */
	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
	    MAX_GROUPS_PER_PORT);

	ASSERT3U(grp->lg_rx_group_count, >, 0);
	/*
	 * Every slot is initialized, not just the lg_rx_group_count in
	 * use, so each slot has a created VLAN list.
	 */
	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		grp->lg_rx_groups[i].arg_index = i;
		grp->lg_rx_groups[i].arg_untagged = 0;
		list_create(&(grp->lg_rx_groups[i].arg_vlans),
		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
	}

	/*
	 * If no explicit MAC address was specified by the administrator,
	 * set it to the MAC address of the first port.
	 */
	grp->lg_addr_fixed = mac_fixed;
	if (grp->lg_addr_fixed) {
		/* validate specified address */
		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
			err = EINVAL;
			goto bail;
		}
		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
	} else {
		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
		grp->lg_mac_addr_port = grp->lg_ports;
	}

	/* Set the initial group capabilities. */
	aggr_grp_capab_set(grp);

	/* Register the aggregation itself as a MAC. */
	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
		err = ENOMEM;
		goto bail;
	}
	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	mac->m_driver = grp;
	mac->m_dip = aggr_dip;
	/* Keys above AGGR_MAX_KEY are not usable as an instance number. */
	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
	mac->m_src_addr = grp->lg_addr;
	mac->m_callbacks = &aggr_m_callbacks;
	mac->m_min_sdu = 0;
	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
	mac->m_margin = aggr_grp_max_margin(grp);
	mac->m_v12n = MAC_VIRT_LEVEL1;
	err = mac_register(mac, &grp->lg_mh);
	mac_free(mac);
	if (err != 0)
		goto bail;

	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
	if (err != 0) {
		(void) mac_unregister(grp->lg_mh);
		grp->lg_mh = NULL;
		goto bail;
	}

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	/*
	 * Update the MAC address of the constituent ports.
	 *
	 * All ports take on the primary MAC address of the aggr
	 * (lg_addr). At this point, none of the ports are attached;
	 * thus the link state of the aggregation will not change.
	 */
	link_state_changed = aggr_grp_update_ports_mac(grp);
	ASSERT(!link_state_changed);

	/* Update outbound load balancing policy. */
	aggr_send_update_policy(grp, policy);

	/* Set LACP mode. */
	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);

	/*
	 * Attach each port if necessary.
	 */
	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		/*
		 * Create the pseudo ring for each HW ring of the
		 * underlying port. Note that this is done after the
		 * aggr registers its MAC. A failure here is fatal
		 * (VERIFY) rather than unwound.
		 */
		VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
		    ==, 0);

		for (i = 0; i < grp->lg_rx_group_count; i++) {
			VERIFY3S(aggr_add_pseudo_rx_group(port,
			    &grp->lg_rx_groups[i]), ==, 0);
		}

		if (aggr_port_notify_link(grp, port))
			link_state_changed = B_TRUE;

		/*
		 * Initialize the callback functions for this port.
		 */
		aggr_port_init_callbacks(port);
	}

	if (link_state_changed)
		mac_link_update(grp->lg_mh, grp->lg_link_state);

	/* add new group to hash table */
	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t)grp);
	ASSERT(err == 0);
	aggr_grp_cnt++;

	mac_perim_exit(mph);
	rw_exit(&aggr_grp_lock);
	return (0);

bail:

	/*
	 * Failure path: every jump here happens before the aggr's
	 * perimeter is entered, with aggr_grp_lock still held as writer.
	 */
	grp->lg_closing = B_TRUE;

	/* Delete whatever ports were added before the failure. */
	port = grp->lg_ports;
	while (port != NULL) {
		aggr_port_t *cport;

		cport = port->lp_next;
		aggr_port_delete(port);
		port = cport;
	}

	/*
	 * Inform the lacp_rx thread to exit, then wait until
	 * lg_lacp_rx_thread has been cleared.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify thread to exit. Its thread ID is captured
	 * under the lock and joined after the lock is dropped.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	rw_exit(&aggr_grp_lock);
	/* Drop the creation reference taken at the top of this function. */
	AGGR_GRP_REFRELE(grp);
	return (err);
}
1621
1622 /*
1623 * Return a pointer to the member of a group with specified linkid.
1624 */
1625 static aggr_port_t *
1626 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1627 {
1628 aggr_port_t *port;
1629
1630 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1631
1632 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1633 if (port->lp_linkid == linkid)
1634 break;
1635 }
1636
1637 return (port);
1638 }
1639
/*
 * Stop, detach and remove a port from a link aggregation group.
 *
 *	grp			the group; its MAC perimeter must be held,
 *				it must have more than one port and must
 *				not be closing
 *	port			the member port to remove
 *	mac_addr_changedp	if not NULL, set to B_TRUE when the group's
 *				MAC address was switched to another port's
 *	link_state_changedp	if not NULL, set to B_TRUE when detaching
 *				the port or updating the remaining ports'
 *				MAC address changed the link state
 *
 * Returns 0 on success, or ENOENT if port is not on the group's port
 * list (in which case nothing is modified and both out parameters are
 * set to B_FALSE).
 */
static int
aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
    boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
{
	int rc = 0;
	aggr_port_t **pport;
	boolean_t mac_addr_changed = B_FALSE;
	boolean_t link_state_changed = B_FALSE;
	mac_perim_handle_t mph;
	uint64_t val;
	uint_t i;
	uint_t stat;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
	ASSERT(grp->lg_nports > 1);
	ASSERT(!grp->lg_closing);

	/*
	 * unlink port: pport walks the chain of "next" pointers until it
	 * addresses the one that points at port.
	 */
	for (pport = &grp->lg_ports; *pport != port;
	    pport = &(*pport)->lp_next) {
		if (*pport == NULL) {
			rc = ENOENT;
			goto done;
		}
	}
	*pport = port->lp_next;

	mac_perim_enter_by_mh(port->lp_mh, &mph);

	/*
	 * If the MAC address of the port being removed was assigned
	 * to the group, update the group MAC address
	 * using the MAC address of a different port.
	 */
	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
		/*
		 * Set the MAC address of the group to the
		 * MAC address of its first port. The port being removed
		 * has already been unlinked, so lg_ports is a remaining
		 * member.
		 */
		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
		grp->lg_mac_addr_port = grp->lg_ports;
		mac_addr_changed = B_TRUE;
	}

	link_state_changed = aggr_grp_detach_port(grp, port);

	/*
	 * Add the counter statistics of the ports while it was aggregated
	 * to the group's residual statistics. This is done by obtaining
	 * the current counter from the underlying MAC then subtracting the
	 * value of the counter at the moment it was added to the
	 * aggregation.
	 */
	for (i = 0; i < MAC_NSTAT; i++) {
		stat = i + MAC_STAT_MIN;
		if (!MAC_STAT_ISACOUNTER(stat))
			continue;
		val = aggr_port_stat(port, stat);
		val -= port->lp_stat[i];
		/* lg_stat_lock is only held around the residual update. */
		mutex_enter(&grp->lg_stat_lock);
		grp->lg_stat[i] += val;
		mutex_exit(&grp->lg_stat_lock);
	}
	for (i = 0; i < ETHER_NSTAT; i++) {
		stat = i + MACTYPE_STAT_MIN;
		if (!ETHER_STAT_ISACOUNTER(stat))
			continue;
		val = aggr_port_stat(port, stat);
		val -= port->lp_ether_stat[i];
		mutex_enter(&grp->lg_stat_lock);
		grp->lg_ether_stat[i] += val;
		mutex_exit(&grp->lg_stat_lock);
	}

	grp->lg_nports--;
	mac_perim_exit(mph);

	/*
	 * The pseudo TX group is removed only now, after the port has
	 * been detached, and then the port itself is deleted.
	 */
	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
	aggr_port_delete(port);

	/*
	 * If the group MAC address has changed, update the MAC address of
	 * the remaining constituent ports according to the new MAC
	 * address of the group.
	 */
	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
		link_state_changed = B_TRUE;

done:
	/* Report the results through whichever out parameters were given. */
	if (mac_addr_changedp != NULL)
		*mac_addr_changedp = mac_addr_changed;
	if (link_state_changedp != NULL)
		*link_state_changedp = link_state_changed;

	return (rc);
}
1739
1740 /*
1741 * Remove one or more ports from an existing link aggregation group.
1742 */
1743 int
1744 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1745 {
1746 int rc = 0, i;
1747 aggr_grp_t *grp = NULL;
1748 aggr_port_t *port;
1749 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1750 boolean_t link_state_update = B_FALSE, link_state_changed;
1751 mac_perim_handle_t mph, pmph;
1752
1753 /* get group corresponding to linkid */
1754 rw_enter(&aggr_grp_lock, RW_READER);
1755 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1756 (mod_hash_val_t *)&grp) != 0) {
1757 rw_exit(&aggr_grp_lock);
1758 return (ENOENT);
1759 }
1760 AGGR_GRP_REFHOLD(grp);
1761
1762 /*
1763 * Hold the perimeter so that the aggregation won't be destroyed.
1764 */
1765 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1766 rw_exit(&aggr_grp_lock);
1767
1768 /* we need to keep at least one port per group */
1769 if (nports >= grp->lg_nports) {
1770 rc = EINVAL;
1771 goto bail;
1772 }
1773
1774 /* first verify that all the groups are valid */
1775 for (i = 0; i < nports; i++) {
1776 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1777 /* port not found */
1778 rc = ENOENT;
1779 goto bail;
1780 }
1781 }
1782
1783 /* clear the promiscous mode for the specified ports */
1784 for (i = 0; i < nports && rc == 0; i++) {
1785 /* lookup port */
1786 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1787 ASSERT(port != NULL);
1788
1789 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1790 rc = aggr_port_promisc(port, B_FALSE);
1791 mac_perim_exit(pmph);
1792 }
1793 if (rc != 0) {
1794 for (i = 0; i < nports; i++) {
1795 port = aggr_grp_port_lookup(grp,
1796 ports[i].lp_linkid);
1797 ASSERT(port != NULL);
1798
1799 /*
1800 * Turn the promiscuous mode back on if it is required
1801 * to receive the non-primary address over a port, or
1802 * the promiscous mode is enabled over the aggr.
1803 */
1804 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1805 if (port->lp_started && (grp->lg_promisc ||
1806 port->lp_prom_addr != NULL)) {
1807 (void) aggr_port_promisc(port, B_TRUE);
1808 }
1809 mac_perim_exit(pmph);
1810 }
1811 goto bail;
1812 }
1813
1814 /* remove the specified ports from group */
1815 for (i = 0; i < nports; i++) {
1816 /* lookup port */
1817 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1818 ASSERT(port != NULL);
1819
1820 /* stop port if group has already been started */
1821 if (grp->lg_started) {
1822 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1823 aggr_port_stop(port);
1824 mac_perim_exit(pmph);
1825 }
1826
1827 /*
1828 * aggr_rem_pseudo_tx_group() is not called here. Instead
1829 * it is called from inside aggr_grp_rem_port() after the
1830 * port has been detached. The reason is that
1831 * aggr_rem_pseudo_tx_group() removes one ring at a time
1832 * and if there is still traffic going on, then there
1833 * is the possibility of aggr_find_tx_ring() returning a
1834 * removed ring for transmission. Once the port has been
1835 * detached, that port will not be used and
1836 * aggr_find_tx_ring() will not return any rings
1837 * belonging to it.
1838 */
1839 for (i = 0; i < grp->lg_rx_group_count; i++)
1840 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1841
1842 /* remove port from group */
1843 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1844 &link_state_changed);
1845 ASSERT(rc == 0);
1846 mac_addr_update = mac_addr_update || mac_addr_changed;
1847 link_state_update = link_state_update || link_state_changed;
1848 }
1849
1850 bail:
1851 if (mac_addr_update)
1852 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1853 if (link_state_update)
1854 mac_link_update(grp->lg_mh, grp->lg_link_state);
1855
1856 mac_perim_exit(mph);
1857 AGGR_GRP_REFRELE(grp);
1858
1859 return (rc);
1860 }
1861
/*
 * Destroy the link aggregation group registered under linkid.
 *
 *	linkid	datalink ID of the aggregation to delete
 *	cred	caller's credentials; only used to obtain the zone ID when
 *		the devnet has to be re-created after mac_disable() fails
 *
 * Returns 0 on success, ENOENT if no group is registered under linkid,
 * or the error from dls_devnet_destroy() or mac_disable(). In the two
 * latter cases the group is left in the hash table.
 */
int
aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port, *cport;
	datalink_id_t tmpid;
	mod_hash_val_t val;
	mac_perim_handle_t mph, pmph;
	int err;
	kt_did_t tid = 0;

	rw_enter(&aggr_grp_lock, RW_WRITER);

	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}

	/*
	 * Note that dls_devnet_destroy() must be called before lg_lock is
	 * held. Otherwise, it will deadlock if another thread is in
	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
	 * dls_devnet_destroy() needs to delete.
	 */
	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	ASSERT(linkid == tmpid);

	/*
	 * Unregister from the MAC service module. Since this can
	 * fail if a client hasn't closed the MAC port, we gracefully
	 * fail the operation. The devnet destroyed above is re-created
	 * before returning the error.
	 */
	if ((err = mac_disable(grp->lg_mh)) != 0) {
		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	/* Past this point the deletion is not backed out. */
	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
	ASSERT(grp == (aggr_grp_t *)val);

	ASSERT(aggr_grp_cnt > 0);
	aggr_grp_cnt--;
	rw_exit(&aggr_grp_lock);

	/*
	 * Inform the lacp_rx thread to exit, then wait until
	 * lg_lacp_rx_thread has been cleared.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify_thread to exit. Its thread ID is captured
	 * under the lock and joined after the lock is dropped.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	grp->lg_closing = B_TRUE;
	/* detach and free MAC ports associated with group */
	port = grp->lg_ports;
	while (port != NULL) {
		/* Save the successor first; the port is deleted below. */
		cport = port->lp_next;
		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		if (grp->lg_started)
			aggr_port_stop(port);
		(void) aggr_grp_detach_port(grp, port);
		mac_perim_exit(pmph);
		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
		aggr_port_delete(port);
		port = cport;
	}

	mac_perim_exit(mph);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	/*
	 * Wait for the port's lacp timer thread and its notification callback
	 * to exit before calling mac_unregister() since both need to access
	 * the mac perimeter of the grp.
	 */
	aggr_grp_port_wait(grp);

	VERIFY(mac_unregister(grp->lg_mh) == 0);
	grp->lg_mh = NULL;

	/* Destroy the VLAN list of every Rx group slot. */
	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
	}

	AGGR_GRP_REFRELE(grp);
	return (0);
}
1972
1973 void
1974 aggr_grp_free(aggr_grp_t *grp)
1975 {
1976 ASSERT(grp->lg_refs == 0);
1977 ASSERT(grp->lg_port_ref == 0);
1978 if (grp->lg_key > AGGR_MAX_KEY) {
1979 id_free(key_ids, grp->lg_key);
1980 grp->lg_key = 0;
1981 }
1982 kmem_cache_free(aggr_grp_cache, grp);
1983 }
1984
1985 int
1986 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1987 aggr_grp_info_new_grp_fn_t new_grp_fn,
1988 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1989 {
1990 aggr_grp_t *grp;
1991 aggr_port_t *port;
1992 mac_perim_handle_t mph, pmph;
1993 int rc = 0;
1994
1995 /*
1996 * Make sure that the aggregation link is visible from the caller's
1997 * zone.
1998 */
1999 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2000 return (ENOENT);
2001
2002 rw_enter(&aggr_grp_lock, RW_READER);
2003
2004 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2005 (mod_hash_val_t *)&grp) != 0) {
2006 rw_exit(&aggr_grp_lock);
2007 return (ENOENT);
2008 }
2009 AGGR_GRP_REFHOLD(grp);
2010
2011 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2012 rw_exit(&aggr_grp_lock);
2013
2014 rc = new_grp_fn(fn_arg, grp->lg_linkid,
2015 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2016 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2017 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2018
2019 if (rc != 0)
2020 goto bail;
2021
2022 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2023 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2024 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2025 port->lp_state, &port->lp_lacp.ActorOperPortState);
2026 mac_perim_exit(pmph);
2027
2028 if (rc != 0)
2029 goto bail;
2030 }
2031
2032 bail:
2033 mac_perim_exit(mph);
2034 AGGR_GRP_REFRELE(grp);
2035 return (rc);
2036 }
2037
/*
 * ioctl handler for the aggr MAC: no ioctl is supported, so every
 * message is negatively acknowledged with ENOTSUP. The driver argument
 * is not used.
 */
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, ENOTSUP);
}
2044
2045 static int
2046 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2047 {
2048 aggr_port_t *port;
2049 uint_t stat_index;
2050
2051 ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2052
2053 /* We only aggregate counter statistics. */
2054 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
2055 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
2056 return (ENOTSUP);
2057 }
2058
2059 /*
2060 * Counter statistics for a group are computed by aggregating the
2061 * counters of the members MACs while they were aggregated, plus
2062 * the residual counter of the group itself, which is updated each
2063 * time a MAC is removed from the group.
2064 */
2065 *val = 0;
2066 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2067 /* actual port statistic */
2068 *val += aggr_port_stat(port, stat);
2069 /*
2070 * minus the port stat when it was added, plus any residual
2071 * amount for the group.
2072 */
2073 if (IS_MAC_STAT(stat)) {
2074 stat_index = stat - MAC_STAT_MIN;
2075 *val -= port->lp_stat[stat_index];
2076 *val += grp->lg_stat[stat_index];
2077 } else if (IS_MACTYPE_STAT(stat)) {
2078 stat_index = stat - MACTYPE_STAT_MIN;
2079 *val -= port->lp_ether_stat[stat_index];
2080 *val += grp->lg_ether_stat[stat_index];
2081 }
2082 }
2083 return (0);
2084 }
2085
2086 int
2087 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2088 {
2089 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2090
2091 if (rx_ring->arr_hw_rh != NULL) {
2092 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2093 } else {
2094 aggr_port_t *port = rx_ring->arr_port;
2095
2096 *val = mac_stat_get(port->lp_mh, stat);
2097
2098 }
2099 return (0);
2100 }
2101
2102 int
2103 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2104 {
2105 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2106
2107 if (tx_ring->atr_hw_rh != NULL) {
2108 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2109 } else {
2110 aggr_port_t *port = tx_ring->atr_port;
2111
2112 *val = mac_stat_get(port->lp_mh, stat);
2113 }
2114 return (0);
2115 }
2116
2117 static int
2118 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2119 {
2120 aggr_grp_t *grp = arg;
2121 int rval = 0;
2122
2123 mutex_enter(&grp->lg_stat_lock);
2124
2125 switch (stat) {
2126 case MAC_STAT_IFSPEED:
2127 *val = grp->lg_ifspeed;
2128 break;
2129
2130 case ETHER_STAT_LINK_DUPLEX:
2131 *val = grp->lg_link_duplex;
2132 break;
2133
2134 default:
2135 /*
2136 * For all other statistics, we return the aggregated stat
2137 * from the underlying ports. aggr_grp_stat() will set
2138 * rval appropriately if the statistic isn't a counter.
2139 */
2140 rval = aggr_grp_stat(grp, stat, val);
2141 }
2142
2143 mutex_exit(&grp->lg_stat_lock);
2144 return (rval);
2145 }
2146
2147 static int
2148 aggr_m_start(void *arg)
2149 {
2150 aggr_grp_t *grp = arg;
2151 aggr_port_t *port;
2152 mac_perim_handle_t mph, pmph;
2153
2154 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2155
2156 /*
2157 * Attempts to start all configured members of the group.
2158 * Group members will be attached when their link-up notification
2159 * is received.
2160 */
2161 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2162 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2163 if (aggr_port_start(port) != 0) {
2164 mac_perim_exit(pmph);
2165 continue;
2166 }
2167
2168 /*
2169 * Turn on the promiscuous mode if it is required to receive
2170 * the non-primary address over a port, or the promiscous
2171 * mode is enabled over the aggr.
2172 */
2173 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2174 if (aggr_port_promisc(port, B_TRUE) != 0)
2175 aggr_port_stop(port);
2176 }
2177 mac_perim_exit(pmph);
2178 }
2179
2180 grp->lg_started = B_TRUE;
2181
2182 mac_perim_exit(mph);
2183 return (0);
2184 }
2185
2186 static void
2187 aggr_m_stop(void *arg)
2188 {
2189 aggr_grp_t *grp = arg;
2190 aggr_port_t *port;
2191 mac_perim_handle_t mph, pmph;
2192
2193 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2194
2195 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2196 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2197
2198 /* reset port promiscuous mode */
2199 (void) aggr_port_promisc(port, B_FALSE);
2200
2201 aggr_port_stop(port);
2202 mac_perim_exit(pmph);
2203 }
2204
2205 grp->lg_started = B_FALSE;
2206 mac_perim_exit(mph);
2207 }
2208
2209 static int
2210 aggr_m_promisc(void *arg, boolean_t on)
2211 {
2212 aggr_grp_t *grp = arg;
2213 aggr_port_t *port;
2214 boolean_t link_state_changed = B_FALSE;
2215 mac_perim_handle_t mph, pmph;
2216
2217 AGGR_GRP_REFHOLD(grp);
2218 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2219
2220 ASSERT(!grp->lg_closing);
2221
2222 if (on == grp->lg_promisc)
2223 goto bail;
2224
2225 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2226 int err = 0;
2227
2228 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2229 AGGR_PORT_REFHOLD(port);
2230 if (!on && (port->lp_prom_addr == NULL))
2231 err = aggr_port_promisc(port, B_FALSE);
2232 else if (on && port->lp_started)
2233 err = aggr_port_promisc(port, B_TRUE);
2234
2235 if (err != 0) {
2236 if (aggr_grp_detach_port(grp, port))
2237 link_state_changed = B_TRUE;
2238 } else {
2239 /*
2240 * If a port was detached because of a previous
2241 * failure changing the promiscuity, the port
2242 * is reattached when it successfully changes
2243 * the promiscuity now, and this might cause
2244 * the link state of the aggregation to change.
2245 */
2246 if (aggr_grp_attach_port(grp, port))
2247 link_state_changed = B_TRUE;
2248 }
2249 mac_perim_exit(pmph);
2250 AGGR_PORT_REFRELE(port);
2251 }
2252
2253 grp->lg_promisc = on;
2254
2255 if (link_state_changed)
2256 mac_link_update(grp->lg_mh, grp->lg_link_state);
2257
2258 bail:
2259 mac_perim_exit(mph);
2260 AGGR_GRP_REFRELE(grp);
2261
2262 return (0);
2263 }
2264
2265 static void
2266 aggr_grp_port_rename(const char *new_name, void *arg)
2267 {
2268 /*
2269 * aggr port's mac client name is the format of "aggr link name" plus
2270 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2271 */
2272 int aggr_len, link_len, clnt_name_len, i;
2273 char *str_end, *str_st, *str_del;
2274 char aggr_name[MAXNAMELEN];
2275 char link_name[MAXNAMELEN];
2276 char *clnt_name;
2277 aggr_grp_t *aggr_grp = arg;
2278 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2279
2280 for (i = 0; i < aggr_grp->lg_nports; i++) {
2281 clnt_name = mac_client_name(aggr_port->lp_mch);
2282 clnt_name_len = strlen(clnt_name);
2283 str_st = clnt_name;
2284 str_end = &(clnt_name[clnt_name_len]);
2285 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2286 ASSERT(str_del != NULL);
2287 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2288 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2289 bzero(aggr_name, MAXNAMELEN);
2290 bzero(link_name, MAXNAMELEN);
2291 bcopy(clnt_name, aggr_name, aggr_len);
2292 bcopy(str_del, link_name, link_len + 1);
2293 bzero(clnt_name, MAXNAMELEN);
2294 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2295 link_name);
2296
2297 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2298 aggr_port = aggr_port->lp_next;
2299 }
2300 }
2301
2302 /*
2303 * Initialize the capabilities that are advertised for the group
2304 * according to the capabilities of the constituent ports.
2305 */
2306 static boolean_t
2307 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2308 {
2309 aggr_grp_t *grp = arg;
2310
2311 switch (cap) {
2312 case MAC_CAPAB_HCKSUM: {
2313 uint32_t *hcksum_txflags = cap_data;
2314 *hcksum_txflags = grp->lg_hcksum_txflags;
2315 break;
2316 }
2317 case MAC_CAPAB_LSO: {
2318 mac_capab_lso_t *cap_lso = cap_data;
2319
2320 if (grp->lg_lso) {
2321 *cap_lso = grp->lg_cap_lso;
2322 break;
2323 } else {
2324 return (B_FALSE);
2325 }
2326 }
2327 case MAC_CAPAB_NO_NATIVEVLAN:
2328 return (!grp->lg_vlan);
2329 case MAC_CAPAB_NO_ZCOPY:
2330 return (!grp->lg_zcopy);
2331 case MAC_CAPAB_RINGS: {
2332 mac_capab_rings_t *cap_rings = cap_data;
2333 uint_t ring_cnt = 0;
2334
2335 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2336 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2337
2338 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2339 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2340 cap_rings->mr_rnum = ring_cnt;
2341 cap_rings->mr_gnum = grp->lg_rx_group_count;
2342 cap_rings->mr_gaddring = NULL;
2343 cap_rings->mr_gremring = NULL;
2344 } else {
2345 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2346 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2347 cap_rings->mr_gnum = 0;
2348 }
2349 cap_rings->mr_rget = aggr_fill_ring;
2350 cap_rings->mr_gget = aggr_fill_group;
2351 break;
2352 }
2353 case MAC_CAPAB_AGGR:
2354 {
2355 mac_capab_aggr_t *aggr_cap;
2356
2357 if (cap_data != NULL) {
2358 aggr_cap = cap_data;
2359 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2360 aggr_cap->mca_unicst = aggr_m_unicst;
2361 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2362 aggr_cap->mca_arg = arg;
2363 }
2364 return (B_TRUE);
2365 }
2366 default:
2367 return (B_FALSE);
2368 }
2369 return (B_TRUE);
2370 }
2371
2372 /*
2373 * Callback function for MAC layer to register groups.
2374 */
2375 static void
2376 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2377 mac_group_info_t *infop, mac_group_handle_t gh)
2378 {
2379 aggr_grp_t *grp = arg;
2380
2381 if (rtype == MAC_RING_TYPE_RX) {
2382 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2383
2384 rx_group->arg_gh = gh;
2385 rx_group->arg_grp = grp;
2386
2387 infop->mgi_driver = (mac_group_driver_t)rx_group;
2388 infop->mgi_start = NULL;
2389 infop->mgi_stop = NULL;
2390 infop->mgi_addmac = aggr_addmac;
2391 infop->mgi_remmac = aggr_remmac;
2392 infop->mgi_count = rx_group->arg_ring_cnt;
2393
2394 /*
2395 * Always set the HW VLAN callbacks. They are smart
2396 * enough to know when a port has HW VLAN filters to
2397 * program and when it doesn't.
2398 */
2399 infop->mgi_addvlan = aggr_addvlan;
2400 infop->mgi_remvlan = aggr_remvlan;
2401 } else {
2402 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2403
2404 ASSERT3S(index, ==, 0);
2405 tx_group->atg_gh = gh;
2406 }
2407 }
2408
2409 /*
2410 * Callback funtion for MAC layer to register all rings.
2411 */
2412 static void
2413 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2414 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2415 {
2416 aggr_grp_t *grp = arg;
2417
2418 switch (rtype) {
2419 case MAC_RING_TYPE_RX: {
2420 aggr_pseudo_rx_group_t *rx_group;
2421 aggr_pseudo_rx_ring_t *rx_ring;
2422 mac_intr_t aggr_mac_intr;
2423
2424 rx_group = &grp->lg_rx_groups[rg_index];
2425 ASSERT3S(index, >=, 0);
2426 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2427 rx_ring = rx_group->arg_rings + index;
2428 rx_ring->arr_rh = rh;
2429
2430 /*
2431 * Entrypoint to enable interrupt (disable poll) and
2432 * disable interrupt (enable poll).
2433 */
2434 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2435 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2436 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2437 aggr_mac_intr.mi_ddi_handle = NULL;
2438
2439 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2440 infop->mri_start = aggr_pseudo_start_rx_ring;
2441 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2442
2443 infop->mri_intr = aggr_mac_intr;
2444 infop->mri_poll = aggr_rx_poll;
2445
2446 infop->mri_stat = aggr_rx_ring_stat;
2447 break;
2448 }
2449 case MAC_RING_TYPE_TX: {
2450 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2451 aggr_pseudo_tx_ring_t *tx_ring;
2452
2453 ASSERT(rg_index == -1);
2454 ASSERT(index < tx_group->atg_ring_cnt);
2455
2456 tx_ring = &tx_group->atg_rings[index];
2457 tx_ring->atr_rh = rh;
2458
2459 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2460 infop->mri_start = NULL;
2461 infop->mri_stop = NULL;
2462 infop->mri_tx = aggr_ring_tx;
2463 infop->mri_stat = aggr_tx_ring_stat;
2464 /*
2465 * Use the hw TX ring handle to find if the ring needs
2466 * serialization or not. For NICs that do not expose
2467 * Tx rings, atr_hw_rh will be NULL.
2468 */
2469 if (tx_ring->atr_hw_rh != NULL) {
2470 infop->mri_flags =
2471 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2472 }
2473 break;
2474 }
2475 default:
2476 break;
2477 }
2478 }
2479
2480 static mblk_t *
2481 aggr_rx_poll(void *arg, int bytes_to_pickup)
2482 {
2483 aggr_pseudo_rx_ring_t *rr_ring = arg;
2484 aggr_port_t *port = rr_ring->arr_port;
2485 aggr_grp_t *grp = port->lp_grp;
2486 mblk_t *mp_chain, *mp, **mpp;
2487
2488 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2489
2490 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2491 return (mp_chain);
2492
2493 mpp = &mp_chain;
2494 while ((mp = *mpp) != NULL) {
2495 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2496 struct ether_header *ehp;
2497
2498 ehp = (struct ether_header *)mp->b_rptr;
2499 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2500 *mpp = mp->b_next;
2501 mp->b_next = NULL;
2502 aggr_recv_lacp(port,
2503 (mac_resource_handle_t)rr_ring, mp);
2504 continue;
2505 }
2506 }
2507
2508 if (!port->lp_collector_enabled) {
2509 *mpp = mp->b_next;
2510 mp->b_next = NULL;
2511 freemsg(mp);
2512 continue;
2513 }
2514 mpp = &mp->b_next;
2515 }
2516 return (mp_chain);
2517 }
2518
2519 static int
2520 aggr_addmac(void *arg, const uint8_t *mac_addr)
2521 {
2522 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2523 aggr_unicst_addr_t *addr, **pprev;
2524 aggr_grp_t *grp = rx_group->arg_grp;
2525 aggr_port_t *port, *p;
2526 mac_perim_handle_t mph;
2527 int err = 0;
2528 uint_t idx = rx_group->arg_index;
2529
2530 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2531
2532 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2533 mac_perim_exit(mph);
2534 return (0);
2535 }
2536
2537 /*
2538 * Insert this mac address into the list of mac addresses owned by
2539 * the aggregation pseudo group.
2540 */
2541 pprev = &rx_group->arg_macaddr;
2542 while ((addr = *pprev) != NULL) {
2543 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2544 mac_perim_exit(mph);
2545 return (EEXIST);
2546 }
2547 pprev = &addr->aua_next;
2548 }
2549 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2550 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2551 addr->aua_next = NULL;
2552 *pprev = addr;
2553
2554 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2555 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2556 break;
2557
2558 if (err != 0) {
2559 for (p = grp->lg_ports; p != port; p = p->lp_next)
2560 aggr_port_remmac(p, idx, mac_addr);
2561
2562 *pprev = NULL;
2563 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2564 }
2565
2566 mac_perim_exit(mph);
2567 return (err);
2568 }
2569
2570 static int
2571 aggr_remmac(void *arg, const uint8_t *mac_addr)
2572 {
2573 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2574 aggr_unicst_addr_t *addr, **pprev;
2575 aggr_grp_t *grp = rx_group->arg_grp;
2576 aggr_port_t *port;
2577 mac_perim_handle_t mph;
2578 int err = 0;
2579
2580 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2581
2582 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2583 mac_perim_exit(mph);
2584 return (0);
2585 }
2586
2587 /*
2588 * Insert this mac address into the list of mac addresses owned by
2589 * the aggregation pseudo group.
2590 */
2591 pprev = &rx_group->arg_macaddr;
2592 while ((addr = *pprev) != NULL) {
2593 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2594 pprev = &addr->aua_next;
2595 continue;
2596 }
2597 break;
2598 }
2599 if (addr == NULL) {
2600 mac_perim_exit(mph);
2601 return (EINVAL);
2602 }
2603
2604 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2605 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2606
2607 *pprev = addr->aua_next;
2608 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2609
2610 mac_perim_exit(mph);
2611 return (err);
2612 }
2613
2614 /*
2615 * Search for VID in the Rx group's list and return a pointer if
2616 * found. Otherwise return NULL.
2617 */
2618 static aggr_vlan_t *
2619 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2620 {
2621 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2622 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2623 avp = list_next(&rx_group->arg_vlans, avp)) {
2624 if (avp->av_vid == vid)
2625 return (avp);
2626 }
2627
2628 return (NULL);
2629 }
2630
/*
 * Accept traffic on the specified VID.
 *
 * Persist VLAN state in the aggr so that ports added later will
 * receive the correct filters. In the future it would be nice to
 * allow aggr to iterate its clients instead of duplicating state.
 *
 * Untagged requests are counted in arg_untagged; tagged VIDs are
 * reference counted through entries on the group's arg_vlans list.
 * Returns 0 on success, or the error from the first port that failed
 * to add the VLAN.
 */
static int
aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t *aggr = rx_group->arg_grp;
	aggr_port_t *port, *p;
	mac_perim_handle_t mph;
	int err = 0;
	aggr_vlan_t *avp = NULL;
	uint_t idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	if (vid == MAC_VLAN_UNTAGGED) {
		/*
		 * Aggr is both a MAC provider and MAC client. As a
		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
		 * client. As a client itself, it should pass
		 * VLAN_ID_NONE to its ports.
		 */
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged++;
		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	/* VID already tracked: take another reference and we are done. */
	if (avp != NULL) {
		avp->av_refs++;
		mac_perim_exit(mph);
		return (0);
	}

	/* First use of this VID; the entry is listed only on success. */
	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
	avp->av_vid = vid;
	avp->av_refs = 1;

update_ports:
	/* Stop at the first failing port; `port' then identifies it. */
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
			break;

	if (err != 0) {
		/*
		 * If any of these calls fail then we are in a
		 * situation where the ports have different HW state.
		 * There's no reasonable action the MAC client can
		 * take in this scenario to rectify the situation.
		 */
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to remove VLAN %u"
				    " from port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}

		}

		/* Undo the bookkeeping done before update_ports. */
		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged--;

		if (avp != NULL) {
			kmem_free(avp, sizeof (aggr_vlan_t));
			avp = NULL;
		}
	}

	/* avp is non-NULL only for a new tagged VID that succeeded. */
	if (avp != NULL)
		list_insert_tail(&rx_group->arg_vlans, avp);

done:
	mac_perim_exit(mph);
	return (err);
}
2714
/*
 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
 *
 * Returns 0 on success, ENOENT if a tagged VID is not on the group's
 * VLAN list, or the error from the first port that failed to remove
 * the VLAN.
 */
static int
aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t *aggr = rx_group->arg_grp;
	aggr_port_t *port, *p;
	mac_perim_handle_t mph;
	int err = 0;
	aggr_vlan_t *avp = NULL;
	uint_t idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	/*
	 * See the comment in aggr_addvlan().
	 */
	if (vid == MAC_VLAN_UNTAGGED) {
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged--;

		/* Other untagged users remain; leave the ports alone. */
		if (rx_group->arg_untagged > 0)
			goto done;

		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	if (avp == NULL) {
		err = ENOENT;
		goto done;
	}

	avp->av_refs--;

	/* Other references remain; leave the ports alone. */
	if (avp->av_refs > 0)
		goto done;

update_ports:
	/* Stop at the first failing port; `port' then identifies it. */
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
			break;

	/*
	 * On failure, re-add the VLAN on the ports that were already
	 * updated and restore the counts dropped above.  See the comment
	 * in aggr_addvlan() about ports ending up with different HW state.
	 */
	if (err != 0) {
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to add VLAN %u"
				    " to port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}
		}

		if (avp != NULL)
			avp->av_refs++;

		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged++;

		goto done;
	}

	/* Last reference to a tagged VID is gone: drop its list entry. */
	if (err == 0 && avp != NULL) {
		VERIFY3U(avp->av_refs, ==, 0);
		list_remove(&rx_group->arg_vlans, avp);
		kmem_free(avp, sizeof (aggr_vlan_t));
	}

done:
	mac_perim_exit(mph);
	return (err);
}
2795
2796 /*
2797 * Add or remove the multicast addresses that are defined for the group
2798 * to or from the specified port.
2799 *
2800 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2801 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2802 * called when the port is either stopped or detached.
2803 */
2804 void
2805 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2806 {
2807 aggr_grp_t *grp = port->lp_grp;
2808
2809 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2810 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2811
2812 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2813 return;
2814
2815 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2816 }
2817
2818 static int
2819 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2820 {
2821 aggr_grp_t *grp = arg;
2822 aggr_port_t *port = NULL, *errport = NULL;
2823 mac_perim_handle_t mph;
2824 int err = 0;
2825
2826 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2827 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2828 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2829 !port->lp_started) {
2830 continue;
2831 }
2832 err = aggr_port_multicst(port, add, addrp);
2833 if (err != 0) {
2834 errport = port;
2835 break;
2836 }
2837 }
2838
2839 /*
2840 * At least one port caused error return and this error is returned to
2841 * mac, eventually a NAK would be sent upwards.
2842 * Some ports have this multicast address listed now, and some don't.
2843 * Treat this error as a whole aggr failure not individual port failure.
2844 * Therefore remove this multicast address from other ports.
2845 */
2846 if ((err != 0) && add) {
2847 for (port = grp->lg_ports; port != errport;
2848 port = port->lp_next) {
2849 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2850 !port->lp_started) {
2851 continue;
2852 }
2853 (void) aggr_port_multicst(port, B_FALSE, addrp);
2854 }
2855 }
2856 mac_perim_exit(mph);
2857 return (err);
2858 }
2859
/*
 * Unicast address entry point for the aggregation.  Enters the group's
 * MAC perimeter and asks aggr_grp_modify_common() to apply an
 * AGGR_MODIFY_MAC change with `macaddr'; that call's result is returned.
 */
static int
aggr_m_unicst(void *arg, const uint8_t *macaddr)
{
	aggr_grp_t *grp = arg;
	mac_perim_handle_t mph;
	int err;

	mac_perim_enter_by_mh(grp->lg_mh, &mph);
	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
	    0, 0);
	mac_perim_exit(mph);
	return (err);
}
2873
2874 /*
2875 * Initialize the capabilities that are advertised for the group
2876 * according to the capabilities of the constituent ports.
2877 */
2878 static void
2879 aggr_grp_capab_set(aggr_grp_t *grp)
2880 {
2881 uint32_t cksum;
2882 aggr_port_t *port;
2883 mac_capab_lso_t cap_lso;
2884
2885 ASSERT(grp->lg_mh == NULL);
2886 ASSERT(grp->lg_ports != NULL);
2887
2888 grp->lg_hcksum_txflags = (uint32_t)-1;
2889 grp->lg_zcopy = B_TRUE;
2890 grp->lg_vlan = B_TRUE;
2891
2892 grp->lg_lso = B_TRUE;
2893 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2894 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2895
2896 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2897 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2898 cksum = 0;
2899 grp->lg_hcksum_txflags &= cksum;
2900
2901 grp->lg_vlan &=
2902 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2903
2904 grp->lg_zcopy &=
2905 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2906
2907 grp->lg_lso &=
2908 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2909 if (grp->lg_lso) {
2910 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2911 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2912 cap_lso.lso_basic_tcp_ipv4.lso_max)
2913 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2914 cap_lso.lso_basic_tcp_ipv4.lso_max;
2915 }
2916 }
2917 }
2918
2919 /*
2920 * Checks whether the capabilities of the port being added are compatible
2921 * with the current capabilities of the aggregation.
2922 */
2923 static boolean_t
2924 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2925 {
2926 uint32_t hcksum_txflags;
2927
2928 ASSERT(grp->lg_ports != NULL);
2929
2930 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2931 grp->lg_vlan) != grp->lg_vlan) {
2932 return (B_FALSE);
2933 }
2934
2935 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2936 grp->lg_zcopy) != grp->lg_zcopy) {
2937 return (B_FALSE);
2938 }
2939
2940 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2941 if (grp->lg_hcksum_txflags != 0)
2942 return (B_FALSE);
2943 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2944 grp->lg_hcksum_txflags) {
2945 return (B_FALSE);
2946 }
2947
2948 if (grp->lg_lso) {
2949 mac_capab_lso_t cap_lso;
2950
2951 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2952 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2953 grp->lg_cap_lso.lso_flags)
2954 return (B_FALSE);
2955 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2956 cap_lso.lso_basic_tcp_ipv4.lso_max)
2957 return (B_FALSE);
2958 } else {
2959 return (B_FALSE);
2960 }
2961 }
2962
2963 return (B_TRUE);
2964 }
2965
2966 /*
2967 * Returns the maximum SDU according to the SDU of the constituent ports.
2968 */
2969 static uint_t
2970 aggr_grp_max_sdu(aggr_grp_t *grp)
2971 {
2972 uint_t max_sdu = (uint_t)-1;
2973 aggr_port_t *port;
2974
2975 ASSERT(grp->lg_ports != NULL);
2976
2977 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2978 uint_t port_sdu_max;
2979
2980 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2981 if (max_sdu > port_sdu_max)
2982 max_sdu = port_sdu_max;
2983 }
2984
2985 return (max_sdu);
2986 }
2987
2988 /*
2989 * Checks if the maximum SDU of the specified port is compatible
2990 * with the maximum SDU of the specified aggregation group, returns
2991 * B_TRUE if it is, B_FALSE otherwise.
2992 */
2993 static boolean_t
2994 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2995 {
2996 uint_t port_sdu_max;
2997
2998 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2999 return (port_sdu_max >= grp->lg_max_sdu);
3000 }
3001
3002 /*
3003 * Returns the maximum margin according to the margin of the constituent ports.
3004 */
3005 static uint32_t
3006 aggr_grp_max_margin(aggr_grp_t *grp)
3007 {
3008 uint32_t margin = UINT32_MAX;
3009 aggr_port_t *port;
3010
3011 ASSERT(grp->lg_mh == NULL);
3012 ASSERT(grp->lg_ports != NULL);
3013
3014 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3015 if (margin > port->lp_margin)
3016 margin = port->lp_margin;
3017 }
3018
3019 grp->lg_margin = margin;
3020 return (margin);
3021 }
3022
3023 /*
3024 * Checks if the maximum margin of the specified port is compatible
3025 * with the maximum margin of the specified aggregation group, returns
3026 * B_TRUE if it is, B_FALSE otherwise.
3027 */
3028 static boolean_t
3029 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3030 {
3031 if (port->lp_margin >= grp->lg_margin)
3032 return (B_TRUE);
3033
3034 /*
3035 * See whether the current margin value is allowed to be changed to
3036 * the new value.
3037 */
3038 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3039 return (B_FALSE);
3040
3041 grp->lg_margin = port->lp_margin;
3042 return (B_TRUE);
3043 }
3044
/*
 * Set MTU on individual ports of an aggregation group
 *
 * If the port has a unicast address handle (lp_mah), it is removed
 * before the MTU is changed and added back afterwards, whether or not
 * the MTU change succeeded.  `old_mtu' is passed through to
 * mac_set_mtu().  Returns the result of mac_set_mtu(); a failure to
 * add the unicast address back is not returned, it causes the port to
 * be detached from the group instead.
 */
static int
aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
    uint32_t *old_mtu)
{
	boolean_t removed = B_FALSE;
	mac_perim_handle_t mph;
	mac_diag_t diag;
	int err, rv, retry = 0;

	if (port->lp_mah != NULL) {
		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
		port->lp_mah = NULL;
		removed = B_TRUE;
	}
	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
try_again:
	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
	    &port->lp_mah, 0, &diag)) != 0) {
		/*
		 * following is a workaround for a bug in 'bge' driver.
		 * See CR 6794654 for more information and this work around
		 * will be removed once the CR is fixed.
		 *
		 * On EIO, wait and retry the add, at most three times.
		 */
		if (rv == EIO && retry++ < 3) {
			delay(2 * hz);
			goto try_again;
		}
		/*
		 * if mac_unicast_add() failed while setting the MTU,
		 * detach the port from the group.
		 */
		mac_perim_enter_by_mh(port->lp_mh, &mph);
		(void) aggr_grp_detach_port(grp, port);
		mac_perim_exit(mph);
		cmn_err(CE_WARN, "Unable to restart the port %s while "
		    "setting MTU. Detaching the port from the aggregation.",
		    mac_client_name(port->lp_mch));
	}
	return (err);
}
3089
3090 static int
3091 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3092 {
3093 int err = 0, i, rv;
3094 aggr_port_t *port;
3095 uint32_t *mtu;
3096
3097 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3098
3099 /*
3100 * If the MTU being set is equal to aggr group's maximum
3101 * allowable value, then there is nothing to change
3102 */
3103 if (sdu == grp->lg_max_sdu)
3104 return (0);
3105
3106 /* 0 is aggr group's min sdu */
3107 if (sdu == 0)
3108 return (EINVAL);
3109
3110 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3111 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3112 port = port->lp_next, i++) {
3113 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3114 }
3115 if (err != 0) {
3116 /* recover from error: reset the mtus of the ports */
3117 aggr_port_t *tmp;
3118
3119 for (tmp = grp->lg_ports, i = 0; tmp != port;
3120 tmp = tmp->lp_next, i++) {
3121 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3122 }
3123 goto bail;
3124 }
3125 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3126 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3127 ASSERT(rv == 0);
3128 bail:
3129 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3130 return (err);
3131 }
3132
3133 /*
3134 * Callback functions for set/get of properties
3135 */
3136 /*ARGSUSED*/
3137 static int
3138 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3139 uint_t pr_valsize, const void *pr_val)
3140 {
3141 int err = ENOTSUP;
3142 aggr_grp_t *grp = m_driver;
3143
3144 switch (pr_num) {
3145 case MAC_PROP_MTU: {
3146 uint32_t mtu;
3147
3148 if (pr_valsize < sizeof (mtu)) {
3149 err = EINVAL;
3150 break;
3151 }
3152 bcopy(pr_val, &mtu, sizeof (mtu));
3153 err = aggr_sdu_update(grp, mtu);
3154 break;
3155 }
3156 default:
3157 break;
3158 }
3159 return (err);
3160 }
3161
/*
 * One boundary of an MTU range, as used by aggr_mtu_range_intersection().
 */
typedef struct rboundary {
	uint32_t bval;	/* the boundary value */
	int btype;	/* +1 for a range minimum, -1 for a range maximum */
} rboundary_t;
3166
3167 /*
3168 * This function finds the intersection of mtu ranges stored in arrays -
3169 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3170 * Individual arrays are assumed to contain non-overlapping ranges.
3171 * Algorithm:
3172 * A range has two boundaries - min and max. We scan all arrays and store
3173 * each boundary as a separate element in a temporary array. We also store
3174 * the boundary types, min or max, as +1 or -1 respectively in the temporary
3175 * array. Then we sort the temporary array in ascending order. We scan the
3176 * sorted array from lower to higher values and keep a cumulative sum of
3177 * boundary types. Element in the temporary array for which the sum reaches
3178 * mcount is a min boundary of a range in the result and next element will be
3179 * max boundary.
3180 *
3181 * Example for mcount = 3,
3182 *
3183 * ----|_________|-------|_______|----|__|------ mrange[0]
3184 *
3185 * -------|________|--|____________|-----|___|-- mrange[1]
3186 *
3187 * --------|________________|-------|____|------ mrange[2]
3188 *
3189 * 3 2 1
3190 * \|/
3191 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
3192 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3193 *
3194 * same min and max
3195 * V
3196 * --------|_____|-------|__|------------|------ intersecting ranges
3197 */
3198 void
3199 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3200 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3201 {
3202 mac_propval_uint32_range_t *rval, *ur;
3203 int rmaxcnt, rcount;
3204 size_t sz_range32;
3205 rboundary_t *ta; /* temporary array */
3206 rboundary_t temp;
3207 boolean_t range_started = B_FALSE;
3208 int i, j, m, sum;
3209
3210 sz_range32 = sizeof (mac_propval_uint32_range_t);
3211
3212 for (i = 0, rmaxcnt = 0; i < mcount; i++)
3213 rmaxcnt += mrange[i]->mpr_count;
3214
3215 /* Allocate enough space to store the results */
3216 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3217
3218 /* Number of boundaries are twice as many as ranges */
3219 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3220
3221 for (i = 0, m = 0; i < mcount; i++) {
3222 ur = &(mrange[i]->mpr_range_uint32[0]);
3223 for (j = 0; j < mrange[i]->mpr_count; j++) {
3224 ta[m].bval = ur[j].mpur_min;
3225 ta[m++].btype = 1;
3226 ta[m].bval = ur[j].mpur_max;
3227 ta[m++].btype = -1;
3228 }
3229 }
3230
3231 /*
3232 * Sort the temporary array in ascending order of bval;
3233 * if boundary values are same then sort on btype.
3234 */
3235 for (i = 0; i < m-1; i++) {
3236 for (j = i+1; j < m; j++) {
3237 if ((ta[i].bval > ta[j].bval) ||
3238 ((ta[i].bval == ta[j].bval) &&
3239 (ta[i].btype < ta[j].btype))) {
3240 temp = ta[i];
3241 ta[i] = ta[j];
3242 ta[j] = temp;
3243 }
3244 }
3245 }
3246
3247 /* Walk through temporary array to find all ranges in the results */
3248 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3249 sum += ta[i].btype;
3250 if (sum == mcount) {
3251 rval[rcount].mpur_min = ta[i].bval;
3252 range_started = B_TRUE;
3253 } else if (sum < mcount && range_started) {
3254 rval[rcount++].mpur_max = ta[i].bval;
3255 range_started = B_FALSE;
3256 }
3257 }
3258
3259 *prval = rval;
3260 *prmaxcnt = rmaxcnt;
3261 *prcount = rcount;
3262
3263 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3264 }
3265
3266 /*
3267 * Returns the mtu ranges which could be supported by aggr group.
3268 * prmaxcnt returns the size of the buffer prval, prcount returns
3269 * the number of valid entries in prval. Caller is responsible
3270 * for freeing up prval.
3271 */
3272 int
3273 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3274 int *prmaxcnt, int *prcount)
3275 {
3276 mac_propval_range_t **vals;
3277 aggr_port_t *port;
3278 mac_perim_handle_t mph;
3279 uint_t i, numr;
3280 int err = 0;
3281 size_t sz_propval, sz_range32;
3282 size_t size;
3283
3284 sz_propval = sizeof (mac_propval_range_t);
3285 sz_range32 = sizeof (mac_propval_uint32_range_t);
3286
3287 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3288
3289 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3290 KM_SLEEP);
3291
3292 for (port = grp->lg_ports, i = 0; port != NULL;
3293 port = port->lp_next, i++) {
3294
3295 size = sz_propval;
3296 vals[i] = kmem_alloc(size, KM_SLEEP);
3297 vals[i]->mpr_count = 1;
3298
3299 mac_perim_enter_by_mh(port->lp_mh, &mph);
3300
3301 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3302 NULL, 0, vals[i], NULL);
3303 if (err == ENOSPC) {
3304 /*
3305 * Not enough space to hold all ranges.
3306 * Allocate extra space as indicated and retry.
3307 */
3308 numr = vals[i]->mpr_count;
3309 kmem_free(vals[i], sz_propval);
3310 size = sz_propval + (numr - 1) * sz_range32;
3311 vals[i] = kmem_alloc(size, KM_SLEEP);
3312 vals[i]->mpr_count = numr;
3313 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3314 NULL, 0, vals[i], NULL);
3315 ASSERT(err != ENOSPC);
3316 }
3317 mac_perim_exit(mph);
3318 if (err != 0) {
3319 kmem_free(vals[i], size);
3320 vals[i] = NULL;
3321 break;
3322 }
3323 }
3324
3325 /*
3326 * if any of the underlying ports does not support changing MTU then
3327 * just return ENOTSUP
3328 */
3329 if (port != NULL) {
3330 ASSERT(err != 0);
3331 goto done;
3332 }
3333
3334 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3335 prcount);
3336
3337 done:
3338 for (i = 0; i < grp->lg_nports; i++) {
3339 if (vals[i] != NULL) {
3340 numr = vals[i]->mpr_count;
3341 size = sz_propval + (numr - 1) * sz_range32;
3342 kmem_free(vals[i], size);
3343 }
3344 }
3345
3346 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3347 return (err);
3348 }
3349
3350 static void
3351 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3352 mac_prop_info_handle_t prh)
3353 {
3354 aggr_grp_t *grp = m_driver;
3355 mac_propval_uint32_range_t *rval = NULL;
3356 int i, rcount, rmaxcnt;
3357 int err = 0;
3358
3359 _NOTE(ARGUNUSED(pr_name));
3360
3361 switch (pr_num) {
3362 case MAC_PROP_MTU:
3363
3364 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3365 &rcount);
3366 if (err != 0) {
3367 ASSERT(rval == NULL);
3368 return;
3369 }
3370 for (i = 0; i < rcount; i++) {
3371 mac_prop_info_set_range_uint32(prh,
3372 rval[i].mpur_min, rval[i].mpur_max);
3373 }
3374 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3375 break;
3376 }
3377 }