4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2018 Joyent, Inc.
25 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * MAC data path
30 *
31 * The MAC data path is concerned with the flow of traffic from mac clients --
32 * DLS, IP, etc. -- to various GLDv3 device drivers -- e1000g, vnic, aggr,
33 * ixgbe, etc. -- and from the GLDv3 device drivers back to clients.
34 *
35 * -----------
36 * Terminology
37 * -----------
38 *
39 * MAC uses a lot of different, but related terms that are associated with the
40 * design and structure of the data path. Before we cover other aspects, first
41 * let's review the terminology that MAC uses.
42 *
43 * MAC
44 *
283 * to a soft ring set.
284 *
285 * After frames reach a soft ring set, and any potential bandwidth related
286 * accounting has been performed, they may be fanned out based on one of the
287 * following three modes:
288 *
289 * o No Fanout
290 * o Protocol level fanout
291 * o Full software ring protocol fanout
292 *
293 * MAC determines which of these modes a given soft ring set uses based on
294 * parameters such as whether or not it's the primary mac client, whether it's
295 * on a 10 GbE or faster device, user controlled dladm(1M) properties, and the
296 * nature of the hardware and the resources that it has.
297 *
298 * When there is no fanout, MAC does not create any soft rings for a device and
299 * the device has frames delivered directly to the MAC client.
300 *
301 * Otherwise, all fanout is performed by software. MAC divides incoming frames
302 * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and
303 * everything else. Regardless of the type of fanout, these three categories
304 * or buckets are always used.
305 *
306 * The difference between protocol level fanout and full software ring protocol
307 * fanout is the number of software rings that end up getting created. The
308 * system always uses the same number of software rings per protocol bucket. So
309 * in the first case, when we're doing protocol level fanout, we create one
310 * software ring each for IPv4 TCP traffic, IPv4 UDP traffic, and everything
311 * else.
312 *
313 * In the case where we do full software ring protocol fanout, we generally use
314 * mac_compute_soft_ring_count() to determine the number of rings. There are
315 * other combinations of properties and devices that may send us down other
316 * paths, but this is a common starting point. If it's a non-bandwidth enforced
317 * device and we're on at least a 10 GbE link, then we'll use eight soft rings
318 * per protocol bucket as a starting point. See mac_compute_soft_ring_count()
319 * for more information on the total number.
320 *
321 * For each of these rings, we create a mac_soft_ring_t and an associated worker
322 * thread. Particularly when doing full software ring protocol fanout, we bind
323 * each of the worker threads to individual CPUs.
324 *
1457
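/*
 * To make the ring-count policy above concrete, here is a minimal,
 * standalone sketch of the starting-point heuristic described in the
 * big theory statement. It is illustrative only: the function name,
 * the CPU cap, and the parameters are assumptions, not the actual
 * mac_compute_soft_ring_count() implementation.
 */
#include <stdint.h>

static unsigned
example_soft_ring_count(uint64_t link_speed_mbps, int bw_enforced,
    unsigned ncpus)
{
	unsigned nrings;

	if (bw_enforced || link_speed_mbps < 10000) {
		/* Protocol level fanout: one soft ring per bucket. */
		nrings = 1;
	} else {
		/* >= 10 GbE and no bandwidth cap: start at eight. */
		nrings = 8;
	}

	/* Assumed cap: never fan out wider than the available CPUs. */
	if (nrings > ncpus)
		nrings = ncpus;
	return (nrings);
}
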
1458 #define MAC_FANOUT_DEFAULT 0
1459 #define MAC_FANOUT_RND_ROBIN 1
1460 int mac_fanout_type = MAC_FANOUT_DEFAULT;
1461
1462 #define MAX_SR_TYPES 3
1463 /* fanout types for port based hashing */
1464 enum pkt_type {
1465 V4_TCP = 0,
1466 V4_UDP,
1467 OTH,
1468 UNDEF
1469 };
1470
1471 /*
1472 * Pair of local and remote ports in the transport header
1473 */
1474 #define PORTS_SIZE 4
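
/*
 * The PORTS_SIZE bytes immediately after the IP header (the 16-bit
 * source and destination ports) are what the hash based fanout below
 * feeds into HASH_ADDR/COMPUTE_INDEX. A simplified, standalone
 * illustration of the idea follows; the hash mix here is an assumed
 * stand-in, not the real macros.
 */
#include <stdint.h>
#include <string.h>

static unsigned
example_port_fanout_index(const uint8_t *l4hdr, unsigned nrings)
{
	uint32_t ports;

	/*
	 * Read the 4-byte port pair as one value; memcpy avoids
	 * unaligned loads on strict-alignment CPUs.
	 */
	(void) memcpy(&ports, l4hdr, sizeof (ports));

	/* Mix the two ports together and reduce to a ring index. */
	ports ^= ports >> 16;
	return (ports % nrings);
}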
1475
1476 /*
1477 * This routine delivers packets destined for an SRS into one of the
1478 * protocol soft rings.
1479 *
1480 * Given a chain of packets we need to split it up into multiple sub
1481 * chains destined for the TCP, UDP, or OTH soft rings. Instead of
1482 * entering a soft ring one packet at a time, we want to enter it as a
1483 * chain; otherwise we get a start/stop behaviour where the worker
1484 * thread goes to sleep and then the next packet comes in, forcing it
1485 * to wake up.
1486 */
1487 static void
1488 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
1489 {
1490 struct ether_header *ehp;
1491 struct ether_vlan_header *evhp;
1492 uint32_t sap;
1493 ipha_t *ipha;
1494 uint8_t *dstaddr;
1495 size_t hdrsize;
1496 mblk_t *mp;
1497 mblk_t *headmp[MAX_SR_TYPES];
1498 mblk_t *tailmp[MAX_SR_TYPES];
1499 int cnt[MAX_SR_TYPES];
1500 size_t sz[MAX_SR_TYPES];
1501 size_t sz1;
1502 boolean_t bw_ctl;
1503 boolean_t hw_classified;
1504 boolean_t dls_bypass;
1505 boolean_t is_ether;
1506 boolean_t is_unicast;
1507 enum pkt_type type;
1508 mac_client_impl_t *mcip = mac_srs->srs_mcip;
1509
1510 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1511 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1512
1513 /*
1514 * If we don't have an Rx ring, S/W classification would have done
1515 * its job and it's a packet meant for us. If we were polling on
1516 * the default ring (i.e. there was a ring assigned to this SRS),
1517 * then we need to make sure that the mac address really belongs
1518 * to us.
1519 */
1520 hw_classified = mac_srs->srs_ring != NULL &&
1521 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1522
1523 /*
1524 * Some clients, such as non-ethernet, need DLS processing in
1525 * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
1526 * DLS bypass may also be disabled via the
1527 * MCIS_RX_BYPASS_DISABLE flag.
1528 */
1529 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1530 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1531
1532 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
1533 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
1534 bzero(cnt, MAX_SR_TYPES * sizeof (int));
1535 bzero(sz, MAX_SR_TYPES * sizeof (size_t));
1536
1537 /*
1538 * We have a chain from SRS that we need to split across the
1539 * soft rings. The squeues for the TCP and IPv4 SAPs use their
1540 * own soft rings to allow polling from the squeue. The rest of
1541 * the packets are delivered on the OTH soft ring which cannot
1542 * be polled.
1543 */
1544 while (head != NULL) {
1545 mp = head;
1546 head = head->b_next;
1547 mp->b_next = NULL;
1548
1549 type = OTH;
1550 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1551
1552 if (is_ether) {
1553 /*
1554 * At this point we can be sure the packet at least
1555 * has an ether header.
1556 */
1557 if (sz1 < sizeof (struct ether_header)) {
1558 mac_rx_drop_pkt(mac_srs, mp);
1559 continue;
1560 }
1561 ehp = (struct ether_header *)mp->b_rptr;
1562
1563 /*
1564 * Determine if this is a VLAN or non-VLAN packet.
1565 */
1566 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1567 evhp = (struct ether_vlan_header *)mp->b_rptr;
1568 sap = ntohs(evhp->ether_type);
1569 hdrsize = sizeof (struct ether_vlan_header);
1570
1571 /*
1572 * Check if the VID of the packet, if
1573 * any, belongs to this client.
1574 * Technically, if this packet came up
1575 * via a HW classified ring then we
1576 * don't need to perform this check.
1577 * Perhaps a future optimization.
1578 */
1579 if (!mac_client_check_flow_vid(mcip,
1580 VLAN_ID(ntohs(evhp->ether_tci)))) {
1581 mac_rx_drop_pkt(mac_srs, mp);
1582 continue;
1583 }
1584 } else {
1585 hdrsize = sizeof (struct ether_header);
1586 }
1587 is_unicast =
1588 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1589 dstaddr = (uint8_t *)&ehp->ether_dhost;
1590 } else {
1591 mac_header_info_t mhi;
1592
1593 if (mac_header_info((mac_handle_t)mcip->mci_mip,
1594 mp, &mhi) != 0) {
1595 mac_rx_drop_pkt(mac_srs, mp);
1596 continue;
1597 }
1622 type = UNDEF;
1623 rw_exit(&mcip->mci_rw_lock);
1624 } else if (is_unicast) {
1625 type = UNDEF;
1626 }
1627 }
1628
1629 /*
1630 * This needs to become a contract with the driver for
1631 * the fast path.
1632 *
1633 * In the normal case the packet will have at least the L2
1634 * header and the IP + Transport header in the same mblk.
1635 * This is usually the case when the NIC driver sends up
1636 * the packet. This is also true when the stack generates
1637 * a packet that is looped back and when the stack uses the
1638 * fastpath mechanism. The normal case is optimized for
1639 * performance and may bypass DLS. All other cases go through
1640 * the 'OTH' type path without DLS bypass.
1641 */
1642 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1643 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
1644 type = OTH;
1645
1646 if (type == OTH) {
1647 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
1648 cnt[type], bw_ctl, sz[type], sz1, mp);
1649 continue;
1650 }
1651
1652 ASSERT(type == UNDEF);
1653
1654 /*
1655 * Determine the type from the IP protocol value. If
1656 * classified as TCP or UDP, then update the read
1657 * pointer to the beginning of the IP header.
1658 * Otherwise leave the message as is for further
1659 * processing by DLS.
1660 */
1661 switch (ipha->ipha_protocol) {
1662 case IPPROTO_TCP:
1663 type = V4_TCP;
1664 mp->b_rptr += hdrsize;
1665 break;
1666 case IPPROTO_UDP:
1667 type = V4_UDP;
1668 mp->b_rptr += hdrsize;
1669 break;
1670 default:
1671 type = OTH;
1672 break;
1673 }
1674
1675 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
1676 bw_ctl, sz[type], sz1, mp);
1677 }
1678
1679 for (type = V4_TCP; type < UNDEF; type++) {
1683 ASSERT(tailmp[type]->b_next == NULL);
1684 switch (type) {
1685 case V4_TCP:
1686 softring = mac_srs->srs_tcp_soft_rings[0];
1687 break;
1688 case V4_UDP:
1689 softring = mac_srs->srs_udp_soft_rings[0];
1690 break;
1691 case OTH:
1692 softring = mac_srs->srs_oth_soft_rings[0];
1693 }
1694 mac_rx_soft_ring_process(mcip, softring,
1695 headmp[type], tailmp[type], cnt[type], sz[type]);
1696 }
1697 }
1698 }
1699
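/*
 * The head/tail accumulation used by mac_rx_srs_proto_fanout() above is
 * a generic pattern: walk the b_next chain once, append each packet to
 * its bucket's tail, then hand each bucket to its soft ring as a single
 * chain. A minimal userland sketch, using an assumed stripped-down
 * stand-in for mblk_t:
 */
#include <stddef.h>

#define EX_NTYPES	3		/* TCP, UDP, OTH */

struct ex_blk {
	struct ex_blk	*b_next;
	int		type;		/* 0 = TCP, 1 = UDP, 2 = OTH */
};

static void
example_split_chain(struct ex_blk *head, struct ex_blk *headmp[EX_NTYPES],
    struct ex_blk *tailmp[EX_NTYPES], int cnt[EX_NTYPES])
{
	struct ex_blk *mp;
	int i;

	for (i = 0; i < EX_NTYPES; i++) {
		headmp[i] = tailmp[i] = NULL;
		cnt[i] = 0;
	}

	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		/* Append to the bucket's tail, preserving arrival order. */
		if (headmp[mp->type] == NULL)
			headmp[mp->type] = mp;
		else
			tailmp[mp->type]->b_next = mp;
		tailmp[mp->type] = mp;
		cnt[mp->type]++;
	}
}
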
1700 int fanout_unaligned = 0;
1701
1702 /*
1703 * The fanout routine for any clients with DLS bypass disabled or for
1704 * traffic classified as "other". Returns -1 on an error (drop the
1705 * packet due to a malformed packet), 0 on success, with values
1706 * written in *indx and *type.
1707 */
1708 static int
1709 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
1710 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
1711 {
1712 ip6_t *ip6h;
1713 ipha_t *ipha;
1714 uint8_t *whereptr;
1715 uint_t hash;
1716 uint16_t remlen;
1717 uint8_t nexthdr;
1718 uint16_t hdr_len;
1719 uint32_t src_val, dst_val;
1720 boolean_t modifiable = B_TRUE;
1721 boolean_t v6;
1722
1723 ASSERT(MBLKL(mp) >= hdrsize);
1724
1725 if (sap == ETHERTYPE_IPV6) {
1726 v6 = B_TRUE;
1852 *(uint32_t *)whereptr);
1853 *indx = COMPUTE_INDEX(hash,
1854 mac_srs->srs_udp_ring_count);
1855 } else {
1856 *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
1857 mac_srs->srs_ind++;
1858 }
1859 *type = OTH;
1860 break;
1861 }
1862 return (0);
1863
1864 src_dst_based_fanout:
1865 hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
1866 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
1867 *type = OTH;
1868 return (0);
1869 }
1870
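/*
 * mac_rx_srs_long_fanout() above picks a ring index either by hashing
 * header fields (so a given flow stays on one ring) or, when no usable
 * fields exist, by round-robining srs_ind. A simplified standalone
 * sketch of those two strategies; the hash mix and names are assumed
 * stand-ins for HASH_ADDR/COMPUTE_INDEX:
 */
#include <stdint.h>

static unsigned example_rr_ind;		/* analogous to srs_ind */

static unsigned
example_pick_ring(uint32_t src, uint32_t dst, int hashable, unsigned nrings)
{
	uint32_t hash;

	if (hashable) {
		/* The same address pair always maps to the same ring. */
		hash = src ^ dst;
		hash ^= hash >> 16;
		return (hash % nrings);
	}

	/* No usable fields: spread packets round robin instead. */
	return (example_rr_ind++ % nrings);
}
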
1871 /*
1872 * This routine delivers packets destined for an SRS into a soft ring member
1873 * of the set.
1874 *
1875 * Given a chain of packets we need to split it up into multiple sub
1876 * chains destined for the TCP, UDP, or OTH soft rings. Instead of
1877 * entering a soft ring one packet at a time, we want to enter it as a
1878 * chain; otherwise we get a start/stop behaviour where the worker
1879 * thread goes to sleep and then the next packet comes in, forcing it
1880 * to wake up.
1881 *
1882 * Note:
1883 * Since we know the maximum fanout possible, we create a 2D array
1884 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
1885 * variables so that we can enter the softrings with a chain. We need
1886 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
1887 * for each packet would be expensive). If we ever want the ability to
1888 * have unlimited fanout, we should probably declare a head, tail, cnt
1889 * and sz with each soft ring (a data struct which contains a softring
1890 * along with these members) and create an array of this uber struct so
1891 * we don't have to do kmem_alloc.
1892 */
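
/*
 * The declarations themselves are elided from this excerpt, but the
 * shape the note describes looks roughly like the sketch below (the
 * names and the MAX_SR_FANOUT value are assumptions): all per-(type,
 * ring) chain state is stack allocated and sized by the compile-time
 * maximum, so no per-drain kmem_alloc is needed.
 */
#include <stddef.h>
#include <string.h>

#define EX_SR_TYPES	3	/* cf. MAX_SR_TYPES */
#define EX_SR_FANOUT	8	/* cf. MAX_SR_FANOUT (assumed value) */

struct ex_pkt;			/* opaque packet type for illustration */

static void
example_fanout_state(void)
{
	struct ex_pkt	*headmp[EX_SR_TYPES][EX_SR_FANOUT];
	struct ex_pkt	*tailmp[EX_SR_TYPES][EX_SR_FANOUT];
	int		cnt[EX_SR_TYPES][EX_SR_FANOUT];
	size_t		sz[EX_SR_TYPES][EX_SR_FANOUT];

	/* Matches the bzero() calls in mac_rx_srs_fanout() below. */
	(void) memset(headmp, 0, sizeof (headmp));
	(void) memset(tailmp, 0, sizeof (tailmp));
	(void) memset(cnt, 0, sizeof (cnt));
	(void) memset(sz, 0, sizeof (sz));
}
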
1893 int fanout_oth1 = 0;
1894 int fanout_oth2 = 0;
1895 int fanout_oth3 = 0;
1896 int fanout_oth4 = 0;
1897 int fanout_oth5 = 0;
1898
1899 static void
1900 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
1921 boolean_t is_ether;
1922 boolean_t is_unicast;
1923 int fanout_cnt;
1924 enum pkt_type type;
1925 mac_client_impl_t *mcip = mac_srs->srs_mcip;
1926
1927 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1928 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1929
1930 /*
1931 * If we don't have an Rx ring, S/W classification would have done
1932 * its job and it's a packet meant for us. If we were polling on
1933 * the default ring (i.e. there was a ring assigned to this SRS),
1934 * then we need to make sure that the mac address really belongs
1935 * to us.
1936 */
1937 hw_classified = mac_srs->srs_ring != NULL &&
1938 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1939
1940 /*
1941 * Some clients, such as non-Ethernet clients, need DLS processing in
1942 * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
1943 * DLS bypass may also be disabled via the
1944 * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by
1945 * sun4v vsw currently.
1946 */
1947 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1948 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1949
1950 /*
1951 * Since the softrings are never destroyed and we always
1952 * create an equal number of softrings for TCP, UDP and the
1953 * rest, it's OK to check one of them for the count and use
1954 * it without any lock. In the future, if soft rings get
1955 * destroyed because of a reduction in fanout, we will need
1956 * to ensure that happens behind the SRS_PROC.
1957 */
1958 fanout_cnt = mac_srs->srs_tcp_ring_count;
1959
1960 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1961 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1962 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1963 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1964
1965 /*
1966 * We have a chain from SRS that we need to split across the
1967 * soft rings. The squeues for the TCP and IPv4 SAPs use their own
1968 * soft rings to allow polling from the squeue; the rest of the
1969 * packets go to the OTH soft rings, which cannot be polled.
1970 */
1971 while (head != NULL) {
1972 mp = head;
1973 head = head->b_next;
1974 mp->b_next = NULL;
1975
1976 type = OTH;
1977 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1978
1979 if (is_ether) {
1980 /*
1981 * At this point we can be sure the packet at least
1982 * has an ether header.
1983 */
1984 if (sz1 < sizeof (struct ether_header)) {
1985 mac_rx_drop_pkt(mac_srs, mp);
1986 continue;
1987 }
1988 ehp = (struct ether_header *)mp->b_rptr;
1989
1990 /*
1991 * Determine if this is a VLAN or non-VLAN packet.
1992 */
1993 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1994 evhp = (struct ether_vlan_header *)mp->b_rptr;
1995 sap = ntohs(evhp->ether_type);
1996 hdrsize = sizeof (struct ether_vlan_header);
1997
1998 /*
1999 * Check if the VID of the packet, if
2000 * any, belongs to this client.
2001 * Technically, if this packet came up
2002 * via a HW classified ring then we
2003 * don't need to perform this check.
2004 * Perhaps a future optimization.
2005 */
2006 if (!mac_client_check_flow_vid(mcip,
2007 VLAN_ID(ntohs(evhp->ether_tci)))) {
2008 mac_rx_drop_pkt(mac_srs, mp);
2009 continue;
2010 }
2011 } else {
2012 hdrsize = sizeof (struct ether_header);
2013 }
2014 is_unicast =
2015 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
2016 dstaddr = (uint8_t *)&ehp->ether_dhost;
2017 } else {
2018 mac_header_info_t mhi;
2019
2020 if (mac_header_info((mac_handle_t)mcip->mci_mip,
2021 mp, &mhi) != 0) {
2022 mac_rx_drop_pkt(mac_srs, mp);
2023 continue;
2024 }
2025 hdrsize = mhi.mhi_hdrsize;
2026 sap = mhi.mhi_bindsap;
2027 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
2028 dstaddr = (uint8_t *)mhi.mhi_daddr;
2029 }
2030
2031 if (!dls_bypass) {
2032 if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
2033 hdrsize, &type, &indx) == -1) {
2034 mac_rx_drop_pkt(mac_srs, mp);
2035 continue;
2036 }
2037
2038 FANOUT_ENQUEUE_MP(headmp[type][indx],
2039 tailmp[type][indx], cnt[type][indx], bw_ctl,
2040 sz[type][indx], sz1, mp);
2041 continue;
2042 }
2043
2044 /*
2045 * If we are using the default Rx ring where H/W or S/W
2046 * classification has not happened, we need to verify if
2047 * this unicast packet really belongs to us.
2048 */
2049 if (sap == ETHERTYPE_IP) {
2050 /*
2051 * If we are H/W classified, but we have promisc
2052 * on, then we need to check for the unicast address.
2053 */
2054 if (hw_classified && mcip->mci_promisc_list != NULL) {
2055 mac_address_t *map;
2056
2057 rw_enter(&mcip->mci_rw_lock, RW_READER);
2058 map = mcip->mci_unicast;
2059 if (bcmp(dstaddr, map->ma_addr,
2060 map->ma_len) == 0)
2061 type = UNDEF;
2062 rw_exit(&mcip->mci_rw_lock);
2063 } else if (is_unicast) {
2612 */
2613 MAC_SRS_POLL_RING(mac_srs);
2614 }
2615
2616 again:
2617 head = mac_srs->srs_first;
2618 mac_srs->srs_first = NULL;
2619 tail = mac_srs->srs_last;
2620 mac_srs->srs_last = NULL;
2621 cnt = mac_srs->srs_count;
2622 mac_srs->srs_count = 0;
2623
2624 ASSERT(head != NULL);
2625 ASSERT(tail != NULL);
2626
2627 if ((tid = mac_srs->srs_tid) != NULL)
2628 mac_srs->srs_tid = NULL;
2629
2630 mac_srs->srs_state |= (SRS_PROC|proc_type);
2631
2632 /*
2633 * mcip is NULL for broadcast and multicast flows. The promisc
2634 * callbacks for broadcast and multicast packets are delivered from
2635 * mac_rx() and we don't need to worry about that case in this path.
2636 */
2637 if (mcip != NULL) {
2638 if (mcip->mci_promisc_list != NULL) {
2639 mutex_exit(&mac_srs->srs_lock);
2640 mac_promisc_client_dispatch(mcip, head);
2641 mutex_enter(&mac_srs->srs_lock);
2642 }
2643 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
2644 mutex_exit(&mac_srs->srs_lock);
2645 mac_protect_intercept_dynamic(mcip, head);
2646 mutex_enter(&mac_srs->srs_lock);
2647 }
2648 }
2649
2650 /*
2651 * Check if SRS itself is doing the processing. This direct path does not
2652 * apply when subflows are present; those packets must go to soft rings.
2653 */
2654 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
2655 mac_direct_rx_t proc;
2656 void *arg1;
2657 mac_resource_handle_t arg2;
2658
2659 /*
2660 * This is the case when a Rx is directly
2661 * assigned and we have a fully classified
2662 * protocol chain. We can deal with it in
2663 * one shot.
2664 */
2665 proc = srs_rx->sr_func;
2666 arg1 = srs_rx->sr_arg1;
2667 arg2 = srs_rx->sr_arg2;
2668
2669 mac_srs->srs_state |= SRS_CLIENT_PROC;
2670 mutex_exit(&mac_srs->srs_lock);
2671 if (tid != NULL) {
2672 (void) untimeout(tid);
4644 * flows as well.
4645 */
4646 /* ARGSUSED */
4647 void
4648 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
4649 mac_header_info_t *arg3)
4650 {
4651 mac_client_impl_t *mcip = arg1;
4652
4653 if (mcip->mci_nvids == 1 &&
4654 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
4655 /*
4656 * If the client has exactly one VID associated with it
4657 * and stripping of the VLAN header is not disabled,
4658 * remove the VLAN tag from the packet before
4659 * passing it on to the client's receive callback.
4660 * Note that this needs to be done after we dispatch
4661 * the packet to the promiscuous listeners of the
4662 * client, since they expect to see the whole
4663 * frame including the VLAN headers.
4664 *
4665 * The MCIS_STRIP_DISABLE flag is only set when sun4v
4666 * vsw is in play.
4667 */
4668 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
4669 }
4670
4671 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
4672 }
4673
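/*
 * mac_strip_vlan_tag_chain() operates on mblk chains; the underlying
 * transformation on a single frame is simply dropping the 4-byte
 * 802.1Q tag that sits between the MAC addresses and the EtherType.
 * A standalone sketch on a flat buffer (illustrative only; the caller
 * is assumed to have already verified the TPID):
 */
#include <stdint.h>
#include <string.h>

#define EX_ETH_ALEN	6	/* bytes in a MAC address */
#define EX_VLAN_TAGSZ	4	/* TPID + TCI */

static size_t
example_strip_vlan_tag(uint8_t *frame, size_t len)
{
	const size_t addrs = 2 * EX_ETH_ALEN;	/* dst + src */

	if (len < addrs + EX_VLAN_TAGSZ)
		return (len);		/* too short to carry a tag */

	/* Slide the EtherType and payload down over the tag. */
	(void) memmove(frame + addrs, frame + addrs + EX_VLAN_TAGSZ,
	    len - addrs - EX_VLAN_TAGSZ);
	return (len - EX_VLAN_TAGSZ);
}
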
4674 /*
4675 * Process a chain for a given soft ring. If the number of packets
4676 * queued in the SRS and its associated soft rings (including this
4677 * one) is very small (tracked by srs_poll_pkt_cnt) then allow the
4678 * entering thread (interrupt or poll thread) to process the chain
4679 * inline. This is meant to reduce latency under low load.
4680 *
4681 * The proc and arg for each mblk is already stored in the mblk in
4682 * appropriate places.
4683 */
4684 /* ARGSUSED */
4685 void
4686 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
4687 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
4688 {
4689 mac_direct_rx_t proc;
4690 void *arg1;
4691 mac_resource_handle_t arg2;
4692 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
4693
4694 ASSERT(ringp != NULL);
4695 ASSERT(mp_chain != NULL);
4696 ASSERT(tail != NULL);
4697 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4698
4699 mutex_enter(&ringp->s_ring_lock);
4700 ringp->s_ring_total_inpkt += cnt;
4718 */
4719 if (ringp->s_ring_first == NULL) {
4720 /*
4721 * Fast-path, ok to process and nothing queued.
4722 */
4723 ringp->s_ring_run = curthread;
4724 ringp->s_ring_state |= (S_RING_PROC);
4725
4726 mutex_exit(&ringp->s_ring_lock);
4727
4728 /*
4729 * Our chain is a single packet, so
4730 * take this fast path.
4731 */
4732 ASSERT(mp_chain->b_next == NULL);
4733
4734 (*proc)(arg1, arg2, mp_chain, NULL);
4735
4736 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4737 /*
4738 * If we have an SRS performing bandwidth
4739 * control then we need to decrement the size
4740 * and count so the SRS has an accurate count
4741 * of the data queued between the SRS and its
4742 * soft rings. We decrement the counters only
4743 * when the packet is processed by both the
4744 * SRS and the soft ring.
4745 */
4746 mutex_enter(&mac_srs->srs_lock);
4747 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
4748 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
4749 mutex_exit(&mac_srs->srs_lock);
4750
4751 mutex_enter(&ringp->s_ring_lock);
4752 ringp->s_ring_run = NULL;
4753 ringp->s_ring_state &= ~S_RING_PROC;
4754 if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
4755 cv_signal(&ringp->s_ring_client_cv);
4756
4757 if ((ringp->s_ring_first == NULL) ||
4758 (ringp->s_ring_state & S_RING_BLANK)) {
4759 /*
4760 * We processed a single packet inline
4761 * and nothing new has arrived or our
4762 * receiver doesn't want to receive
4763 * any packets. We are done.
4764 */
4765 mutex_exit(&ringp->s_ring_lock);
4766 return;
4767 }
4768 } else {
4769 SOFT_RING_ENQUEUE_CHAIN(ringp,
4770 mp_chain, tail, cnt, sz);
4771 }
4772
4773 /*
4774 * We are here because either we couldn't do inline
4775 * processing (because something was already
4776 * queued), or we had a chain of more than one
4777 * packet, or something else arrived after we were
4778 * done with inline processing.
4779 */
4780 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
4781 ASSERT(ringp->s_ring_first != NULL);