Print this page
11490 SRS ring polling disabled for VLANs
11491 Want DLS bypass for VLAN traffic
11492 add VLVF bypass to ixgbe core
2869 duplicate packets with vnics over aggrs
11489 DLS stat delete and aggr kstat can deadlock
Portions contributed by: Theo Schlossnagle <jesus@omniti.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2017 Joyent, Inc.
  25  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 
  28 /*
  29  * MAC data path
  30  *
  31  * The MAC data path is concerned with the flow of traffic from mac clients --
  32  * DLS, IP, etc. -- to various GLDv3 device drivers -- e1000g, vnic, aggr,
  33  * ixgbe, etc. -- and from the GLDv3 device drivers back to clients.
  34  *
  35  * -----------
  36  * Terminology
  37  * -----------
  38  *
  39  * MAC uses a lot of different, but related terms that are associated with the
  40  * design and structure of the data path. Before we cover other aspects, first
  41  * let's review the terminology that MAC uses.
  42  *
  43  * MAC
  44  *


 283  * to a soft ring set.
 284  *
 285  * After frames reach a soft ring set and account for any potential bandwidth
 286  * related accounting, they may be fanned out based on one of the following
 287  * three modes:
 288  *
 289  *     o No Fanout
 290  *     o Protocol level fanout
 291  *     o Full software ring protocol fanout
 292  *
 293  * MAC makes the determination as to which of these modes a given soft ring set
 294  * obtains based on parameters such as whether or not it's the primary mac
 295  * client, whether it's on a 10 GbE or faster device, user controlled dladm(1M)
 296  * properties, and the nature of the hardware and the resources that it has.
 297  *
 298  * When there is no fanout, MAC does not create any soft rings for a device and
 299  * the device has frames delivered directly to the MAC client.
 300  *
 301  * Otherwise, all fanout is performed by software. MAC divides incoming frames
 302  * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and
 303  * everything else. Note, VLAN tagged traffic is considered other, regardless of
 304  * the interior EtherType. Regardless of the type of fanout, these three
 305  * categories or buckets are always used.
 306  *
 307  * The difference between protocol level fanout and full software ring protocol
 308  * fanout is the number of software rings that end up getting created. The
 309  * system always uses the same number of software rings per protocol bucket. So
 310  * in the first case when we're just doing protocol level fanout, we just create
 311  * one software ring each for IPv4 TCP traffic, IPv4 UDP traffic, and everything
 312  * else.
 313  *
 314  * In the case where we do full software ring protocol fanout, we generally use
 315  * mac_compute_soft_ring_count() to determine the number of rings. There are
 316  * other combinations of properties and devices that may send us down other
 317  * paths, but this is a common starting point. If it's a non-bandwidth enforced
 318  * device and we're on at least a 10 GbE link, then we'll use eight soft rings
 319  * per protocol bucket as a starting point. See mac_compute_soft_ring_count()
 320  * for more information on the total number.
 321  *
 322  * For each of these rings, we create a mac_soft_ring_t and an associated worker
 323  * thread. Particularly when doing full software ring protocol fanout, we bind
 324  * each of the worker threads to individual CPUs.
 325  *


1458 
1459 #define MAC_FANOUT_DEFAULT      0
1460 #define MAC_FANOUT_RND_ROBIN    1
1461 int mac_fanout_type = MAC_FANOUT_DEFAULT;       /* starts as the default */
1462 
1463 #define MAX_SR_TYPES    3       /* V4_TCP, V4_UDP and OTH */
1464 /* fanout types for port based hashing */
1465 enum pkt_type {
1466         V4_TCP = 0,     /* IPv4 TCP traffic */
1467         V4_UDP,         /* IPv4 UDP traffic */
1468         OTH,            /* everything else */
1469         UNDEF           /* not yet classified; not a soft ring type */
1470 };
1471 
1472 /*
1473  * Bytes taken by the local and remote port pair in the transport header
1474  */
1475 #define PORTS_SIZE 4
1476 
1477 /*
1478  * mac_rx_srs_proto_fanout
1479  *
1480  * This routine delivers packets destined to an SRS into one of the
1481  * protocol soft rings.
1482  *
1483  * Given a chain of packets we need to split it up into multiple sub chains
1484  * destined into TCP, UDP or OTH soft ring. Instead of entering
1485  * the soft ring one packet at a time, we want to enter it in the form of a
1486  * chain otherwise we get this start/stop behaviour where the worker thread
1487  * goes to sleep and then the next packet comes in, forcing it to wake up, etc.

1488  */
1489 static void
1490 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
1491 {
1492         struct ether_header             *ehp;
1493         struct ether_vlan_header        *evhp;
1494         uint32_t                        sap;
1495         ipha_t                          *ipha;
1496         uint8_t                         *dstaddr;
1497         size_t                          hdrsize;
1498         mblk_t                          *mp;
1499         mblk_t                          *headmp[MAX_SR_TYPES];
1500         mblk_t                          *tailmp[MAX_SR_TYPES];
1501         int                             cnt[MAX_SR_TYPES];
1502         size_t                          sz[MAX_SR_TYPES];
1503         size_t                          sz1;
1504         boolean_t                       bw_ctl;
1505         boolean_t                       hw_classified;
1506         boolean_t                       dls_bypass;
1507         boolean_t                       is_ether;
1508         boolean_t                       is_unicast;
1509         enum pkt_type                   type;
1510         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
1511 
1512         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1513         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1514 
1515         /*
1516          * If we don't have a Rx ring, S/W classification would have done
1517          * its job and it's a packet meant for us. If we were polling on
1518          * the default ring (i.e. there was a ring assigned to this SRS),
1519          * then we need to make sure that the mac address really belongs
1520          * to us.
1521          */
1522         hw_classified = mac_srs->srs_ring != NULL &&
1523             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1524 
1525         /*
1526          * Special clients (eg. VLAN, non ether, etc) need DLS
1527          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
1528          * such SRSs. Another way of disabling bypass is to set the
1529          * MCIS_RX_BYPASS_DISABLE flag.
1530          */
1531         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1532             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1533 
1534         bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
1535         bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
1536         bzero(cnt, MAX_SR_TYPES * sizeof (int));
1537         bzero(sz, MAX_SR_TYPES * sizeof (size_t));
1538 
1539         /*
1540          * We got a chain from SRS that we need to send to the soft rings.
1541          * Since squeues for TCP & IPv4 sap poll their soft rings (for
1542          * performance reasons), we need to separate out v4_tcp, v4_udp
1543          * and the rest goes in other.

1544          */
1545         while (head != NULL) {
1546                 mp = head;
1547                 head = head->b_next;
1548                 mp->b_next = NULL;
1549 
1550                 type = OTH;
1551                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1552 
1553                 if (is_ether) {
1554                         /*
1555                          * At this point we can be sure the packet at least
1556                          * has an ether header.
1557                          */
1558                         if (sz1 < sizeof (struct ether_header)) {
1559                                 mac_rx_drop_pkt(mac_srs, mp);
1560                                 continue;
1561                         }
1562                         ehp = (struct ether_header *)mp->b_rptr;
1563 
1564                         /*
1565                          * Determine if this is a VLAN or non-VLAN packet.
1566                          */
1567                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1568                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1569                                 sap = ntohs(evhp->ether_type);
1570                                 hdrsize = sizeof (struct ether_vlan_header);

1571                                 /*
1572                                  * Check if the VID of the packet, if any,
1573                                  * belongs to this client.




1574                                  */
1575                                 if (!mac_client_check_flow_vid(mcip,
1576                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
1577                                         mac_rx_drop_pkt(mac_srs, mp);
1578                                         continue;
1579                                 }
1580                         } else {
1581                                 hdrsize = sizeof (struct ether_header);
1582                         }
1583                         is_unicast =
1584                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1585                         dstaddr = (uint8_t *)&ehp->ether_dhost;
1586                 } else {
1587                         mac_header_info_t               mhi;
1588 
1589                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
1590                             mp, &mhi) != 0) {
1591                                 mac_rx_drop_pkt(mac_srs, mp);
1592                                 continue;
1593                         }


1618                                         type = UNDEF;
1619                                 rw_exit(&mcip->mci_rw_lock);
1620                         } else if (is_unicast) {
1621                                 type = UNDEF;
1622                         }
1623                 }
1624 
1625                 /*
1626                  * This needs to become a contract with the driver for
1627                  * the fast path.
1628                  *
1629                  * In the normal case the packet will have at least the L2
1630                  * header and the IP + Transport header in the same mblk.
1631                  * This is usually the case when the NIC driver sends up
1632                  * the packet. This is also true when the stack generates
1633                  * a packet that is looped back and when the stack uses the
1634                  * fastpath mechanism. The normal case is optimized for
1635                  * performance and may bypass DLS. All other cases go through
1636                  * the 'OTH' type path without DLS bypass.
1637                  */
1638 
1639                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1640                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
1641                         type = OTH;
1642 
1643                 if (type == OTH) {
1644                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
1645                             cnt[type], bw_ctl, sz[type], sz1, mp);
1646                         continue;
1647                 }
1648 
1649                 ASSERT(type == UNDEF);

1650                 /*
1651                  * We look for at least 4 bytes past the IP header to get
1652                  * the port information. If we get an IP fragment, we don't
1653                  * have the port information, and we use just the protocol
1654                  * information.

1655                  */
1656                 switch (ipha->ipha_protocol) {
1657                 case IPPROTO_TCP:
1658                         type = V4_TCP;
1659                         mp->b_rptr += hdrsize;
1660                         break;
1661                 case IPPROTO_UDP:
1662                         type = V4_UDP;
1663                         mp->b_rptr += hdrsize;
1664                         break;
1665                 default:
1666                         type = OTH;
1667                         break;
1668                 }
1669 
1670                 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
1671                     bw_ctl, sz[type], sz1, mp);
1672         }
1673 
1674         for (type = V4_TCP; type < UNDEF; type++) {


1678                         ASSERT(tailmp[type]->b_next == NULL);
1679                         switch (type) {
1680                         case V4_TCP:
1681                                 softring = mac_srs->srs_tcp_soft_rings[0];
1682                                 break;
1683                         case V4_UDP:
1684                                 softring = mac_srs->srs_udp_soft_rings[0];
1685                                 break;
1686                         case OTH:
1687                                 softring = mac_srs->srs_oth_soft_rings[0];
1688                         }
1689                         mac_rx_soft_ring_process(mcip, softring,
1690                             headmp[type], tailmp[type], cnt[type], sz[type]);
1691                 }
1692         }
1693 }
1694 
1695 int     fanout_unaligned = 0;   /* presumably a counter -- TODO confirm */
1696 
1697 /*
1698  * mac_rx_srs_long_fanout
1699  *
1700  * The fanout routine for VLANs, and for anything else that isn't performing
1701  * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
1702  * malformed packet), 0 on success, with values written in *indx and *type.
1703  */
1704 static int
1705 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
1706     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
1707 {
1708         ip6_t           *ip6h;
1709         ipha_t          *ipha;
1710         uint8_t         *whereptr;
1711         uint_t          hash;
1712         uint16_t        remlen;
1713         uint8_t         nexthdr;
1714         uint16_t        hdr_len;
1715         uint32_t        src_val, dst_val;
1716         boolean_t       modifiable = B_TRUE;
1717         boolean_t       v6;
1718 
1719         ASSERT(MBLKL(mp) >= hdrsize);
1720 
1721         if (sap == ETHERTYPE_IPV6) {
1722                 v6 = B_TRUE;


1848                             *(uint32_t *)whereptr);
1849                         *indx = COMPUTE_INDEX(hash,
1850                             mac_srs->srs_udp_ring_count);
1851                 } else {
1852                         *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
1853                         mac_srs->srs_ind++;
1854                 }
1855                 *type = OTH;
1856                 break;
1857         }
1858         return (0);
1859 
1860 src_dst_based_fanout:
1861         hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
1862         *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
1863         *type = OTH;
1864         return (0);
1865 }
1866 
1867 /*
1868  * mac_rx_srs_fanout
1869  *
1870  * This routine delivers packets destined to an SRS into a soft ring member
1871  * of the set.
1872  *
1873  * Given a chain of packets we need to split it up into multiple sub chains
1874  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
1875  * the soft ring one packet at a time, we want to enter it in the form of a
1876  * chain otherwise we get this start/stop behaviour where the worker thread
1877  * goes to sleep and then the next packet comes in, forcing it to wake up, etc.

1878  *
1879  * Note:
1880  * Since we know what is the maximum fanout possible, we create a 2D array
1881  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
1882  * variables so that we can enter the softrings with chain. We need the
1883  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
1884  * for each packet would be expensive). If we ever want to have the
1885  * ability to have unlimited fanout, we should probably declare a head,
1886  * tail, cnt, sz with each soft ring (a data struct which contains a softring
1887  * along with these members) and create an array of this uber struct so we
1888  * don't have to do kmem_alloc.
1889  */
1890 int     fanout_oth1 = 0;  /* oth1-5: presumably counters -- TODO confirm */
1891 int     fanout_oth2 = 0;
1892 int     fanout_oth3 = 0;
1893 int     fanout_oth4 = 0;
1894 int     fanout_oth5 = 0;
1895 
1896 static void
1897 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)


1918         boolean_t                       is_ether;
1919         boolean_t                       is_unicast;
1920         int                             fanout_cnt;
1921         enum pkt_type                   type;
1922         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
1923 
1924         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1925         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1926 
1927         /*
1928          * If we don't have a Rx ring, S/W classification would have done
1929          * its job and it's a packet meant for us. If we were polling on
1930          * the default ring (i.e. there was a ring assigned to this SRS),
1931          * then we need to make sure that the mac address really belongs
1932          * to us.
1933          */
1934         hw_classified = mac_srs->srs_ring != NULL &&
1935             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1936 
1937         /*
1938          * Special clients (eg. VLAN, non ether, etc) need DLS
1939          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
1940          * such SRSs. Another way of disabling bypass is to set the
1941          * MCIS_RX_BYPASS_DISABLE flag.

1942          */
1943         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1944             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1945 
1946         /*
1947          * Since the softrings are never destroyed and we always
1948          * create equal number of softrings for TCP, UDP and rest,
1949          * it's OK to check one of them for count and use it without
1950          * any lock. In future, if soft rings get destroyed because
1951          * of reduction in fanout, we will need to ensure that happens
1952          * behind the SRS_PROC.
1953          */
1954         fanout_cnt = mac_srs->srs_tcp_ring_count;
1955 
1956         bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1957         bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1958         bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1959         bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1960 
1961         /*
1962          * We got a chain from SRS that we need to send to the soft rings.
1963          * Since squeues for TCP & IPv4 sap poll their soft rings (for
1964          * performance reasons), we need to separate out v4_tcp, v4_udp
1965          * and the rest goes in other.
1966          */
1967         while (head != NULL) {
1968                 mp = head;
1969                 head = head->b_next;
1970                 mp->b_next = NULL;
1971 
1972                 type = OTH;
1973                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1974 
1975                 if (is_ether) {
1976                         /*
1977                          * At this point we can be sure the packet at least
1978                          * has an ether header.
1979                          */
1980                         if (sz1 < sizeof (struct ether_header)) {
1981                                 mac_rx_drop_pkt(mac_srs, mp);
1982                                 continue;
1983                         }
1984                         ehp = (struct ether_header *)mp->b_rptr;
1985 
1986                         /*
1987                          * Determine if this is a VLAN or non-VLAN packet.
1988                          */
1989                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1990                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1991                                 sap = ntohs(evhp->ether_type);
1992                                 hdrsize = sizeof (struct ether_vlan_header);

1993                                 /*
1994                                  * Check if the VID of the packet, if any,
1995                                  * belongs to this client.




1996                                  */
1997                                 if (!mac_client_check_flow_vid(mcip,
1998                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
1999                                         mac_rx_drop_pkt(mac_srs, mp);
2000                                         continue;
2001                                 }
2002                         } else {
2003                                 hdrsize = sizeof (struct ether_header);
2004                         }
2005                         is_unicast =
2006                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
2007                         dstaddr = (uint8_t *)&ehp->ether_dhost;
2008                 } else {
2009                         mac_header_info_t               mhi;
2010 
2011                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
2012                             mp, &mhi) != 0) {
2013                                 mac_rx_drop_pkt(mac_srs, mp);
2014                                 continue;
2015                         }
2016                         hdrsize = mhi.mhi_hdrsize;
2017                         sap = mhi.mhi_bindsap;
2018                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
2019                         dstaddr = (uint8_t *)mhi.mhi_daddr;
2020                 }
2021 
2022                 if (!dls_bypass) {
2023                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
2024                             hdrsize, &type, &indx) == -1) {
2025                                 mac_rx_drop_pkt(mac_srs, mp);
2026                                 continue;
2027                         }
2028 
2029                         FANOUT_ENQUEUE_MP(headmp[type][indx],
2030                             tailmp[type][indx], cnt[type][indx], bw_ctl,
2031                             sz[type][indx], sz1, mp);
2032                         continue;
2033                 }
2034 
2035 
2036                 /*
2037                  * If we are using the default Rx ring where H/W or S/W
2038                  * classification has not happened, we need to verify if
2039                  * this unicast packet really belongs to us.
2040                  */
2041                 if (sap == ETHERTYPE_IP) {
2042                         /*
2043                          * If we are H/W classified, but we have promisc
2044                          * on, then we need to check for the unicast address.
2045                          */
2046                         if (hw_classified && mcip->mci_promisc_list != NULL) {
2047                                 mac_address_t           *map;
2048 
2049                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
2050                                 map = mcip->mci_unicast;
2051                                 if (bcmp(dstaddr, map->ma_addr,
2052                                     map->ma_len) == 0)
2053                                         type = UNDEF;
2054                                 rw_exit(&mcip->mci_rw_lock);
2055                         } else if (is_unicast) {


2604                  */
2605                 MAC_SRS_POLL_RING(mac_srs);
2606         }
2607 
2608 again:
2609         head = mac_srs->srs_first;
2610         mac_srs->srs_first = NULL;
2611         tail = mac_srs->srs_last;
2612         mac_srs->srs_last = NULL;
2613         cnt = mac_srs->srs_count;
2614         mac_srs->srs_count = 0;
2615 
2616         ASSERT(head != NULL);
2617         ASSERT(tail != NULL);
2618 
2619         if ((tid = mac_srs->srs_tid) != NULL)
2620                 mac_srs->srs_tid = NULL;
2621 
2622         mac_srs->srs_state |= (SRS_PROC|proc_type);
2623 
2624 
2625         /*
2626          * mcip is NULL for broadcast and multicast flows. The promisc
2627          * callbacks for broadcast and multicast packets are delivered from
2628          * mac_rx() and we don't need to worry about that case in this path
2629          */
2630         if (mcip != NULL) {
2631                 if (mcip->mci_promisc_list != NULL) {
2632                         mutex_exit(&mac_srs->srs_lock);
2633                         mac_promisc_client_dispatch(mcip, head);
2634                         mutex_enter(&mac_srs->srs_lock);
2635                 }
2636                 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
2637                         mutex_exit(&mac_srs->srs_lock);
2638                         mac_protect_intercept_dynamic(mcip, head);
2639                         mutex_enter(&mac_srs->srs_lock);
2640                 }
2641         }
2642 
2643         /*
2644          * Check if SRS itself is doing the processing
2645          * This direct path does not apply when subflows are present. In this
2646          * case, packets need to be dispatched to a soft ring according to the
2647          * flow's bandwidth and other resource constraints.
2648          */
2649         if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
2650                 mac_direct_rx_t         proc;
2651                 void                    *arg1;
2652                 mac_resource_handle_t   arg2;
2653 
2654                 /*
2655                  * This is the case when a Rx is directly
2656                  * assigned and we have a fully classified
2657                  * protocol chain. We can deal with it in
2658                  * one shot.
2659                  */
2660                 proc = srs_rx->sr_func;
2661                 arg1 = srs_rx->sr_arg1;
2662                 arg2 = srs_rx->sr_arg2;
2663 
2664                 mac_srs->srs_state |= SRS_CLIENT_PROC;
2665                 mutex_exit(&mac_srs->srs_lock);
2666                 if (tid != NULL) {
2667                         (void) untimeout(tid);


4639  * flows as well.
4640  */
4641 /* ARGSUSED */
4642 void
4643 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
4644     mac_header_info_t *arg3)
4645 {
4646         mac_client_impl_t *mcip = arg1;
4647         /* arg1 is the mac client; arg3 is unused (hence ARGSUSED). */
4648         if (mcip->mci_nvids == 1 &&
4649             !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
4650                 /*
4651                  * If the client has exactly one VID associated with it
4652                  * and stripping of the VLAN header is not disabled,
4653                  * remove the VLAN tag from the packet before
4654                  * passing it on to the client's receive callback.
4655                  * Note that this needs to be done after we dispatch
4656                  * the packet to the promiscuous listeners of the
4657                  * client, since they expect to see the whole
4658                  * frame including the VLAN headers.



4659                  */
4660                 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
4661         }
4662         /* Hand the (possibly untagged) chain to the client's Rx callback. */
4663         mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
4664 }
4665 
4666 /*
4667  * mac_rx_soft_ring_process




4668  *
4669  * Process a chain for a given soft ring. If the number of packets queued
4670  * in the SRS and its associated soft rings (including this one) is
4671  * very small (tracked by srs_poll_pkt_cnt), then allow the entering
4672  * thread (interrupt or poll thread) to do inline processing. This
4673  * helps keep the latency down under low load.
4674  *
4675  * The proc and arg for each mblk is already stored in the mblk in
4676  * appropriate places.
4677  */
4678 /* ARGSUSED */
4679 void
4680 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
4681     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
4682 {
4683         mac_direct_rx_t         proc;
4684         void                    *arg1;
4685         mac_resource_handle_t   arg2;
4686         mac_soft_ring_set_t     *mac_srs = ringp->s_ring_set;
4687 
4688         ASSERT(ringp != NULL);
4689         ASSERT(mp_chain != NULL);
4690         ASSERT(tail != NULL);
4691         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4692 
4693         mutex_enter(&ringp->s_ring_lock);
4694         ringp->s_ring_total_inpkt += cnt;


4712                  */
4713                 if (ringp->s_ring_first == NULL) {
4714                         /*
4715                          * Fast-path, ok to process and nothing queued.
4716                          */
4717                         ringp->s_ring_run = curthread;
4718                         ringp->s_ring_state |= (S_RING_PROC);
4719 
4720                         mutex_exit(&ringp->s_ring_lock);
4721 
4722                         /*
4723                          * We have a chain of 1 packet, so
4724                          * go through this fast path.
4725                          */
4726                         ASSERT(mp_chain->b_next == NULL);
4727 
4728                         (*proc)(arg1, arg2, mp_chain, NULL);
4729 
4730                         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4731                         /*
4732                          * If we have a soft ring set which is doing
4733                          * bandwidth control, we need to decrement
4734                          * srs_size and count so that the SRS can have an
4735                          * accurate idea of what is the real data
4736                          * queued between SRS and its soft rings. We
4737                          * decrement the counters only when the packet
4738                          * gets processed by both SRS and the soft ring.
4739                          */
4740                         mutex_enter(&mac_srs->srs_lock);
4741                         MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
4742                         MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
4743                         mutex_exit(&mac_srs->srs_lock);
4744 
4745                         mutex_enter(&ringp->s_ring_lock);
4746                         ringp->s_ring_run = NULL;
4747                         ringp->s_ring_state &= ~S_RING_PROC;
4748                         if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
4749                                 cv_signal(&ringp->s_ring_client_cv);
4750 
4751                         if ((ringp->s_ring_first == NULL) ||
4752                             (ringp->s_ring_state & S_RING_BLANK)) {
4753                                 /*
4754                                  * We processed inline our packet and
4755                                  * nothing new has arrived or our
4756                                  * receiver doesn't want to receive
4757                                  * any packets. We are done.
4758                                  */
4759                                 mutex_exit(&ringp->s_ring_lock);
4760                                 return;
4761                         }
4762                 } else {
4763                         SOFT_RING_ENQUEUE_CHAIN(ringp,
4764                             mp_chain, tail, cnt, sz);
4765                 }
4766 
4767                 /*
4768                  * We are here because either we couldn't do inline
4769                  * processing (because something was already
4770                  * queued), or we had a chain of more than one
4771                  * packet, or something else arrived after we were
4772                  * done with inline processing.
4773                  */
4774                 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
4775                 ASSERT(ringp->s_ring_first != NULL);




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2018 Joyent, Inc.
  25  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  26  */
  27 
  28 /*
  29  * MAC data path
  30  *
  31  * The MAC data path is concerned with the flow of traffic from mac clients --
  32  * DLS, IP, etc. -- to various GLDv3 device drivers -- e1000g, vnic, aggr,
  33  * ixgbe, etc. -- and from the GLDv3 device drivers back to clients.
  34  *
  35  * -----------
  36  * Terminology
  37  * -----------
  38  *
  39  * MAC uses a lot of different, but related terms that are associated with the
  40  * design and structure of the data path. Before we cover other aspects, first
  41  * let's review the terminology that MAC uses.
  42  *
  43  * MAC
  44  *


 283  * to a soft ring set.
 284  *
 285  * After frames reach a soft ring set and account for any potential bandwidth
 286  * related accounting, they may be fanned out based on one of the following
 287  * three modes:
 288  *
 289  *     o No Fanout
 290  *     o Protocol level fanout
 291  *     o Full software ring protocol fanout
 292  *
 293  * MAC makes the determination as to which of these modes a given soft ring set
 294  * obtains based on parameters such as whether or not it's the primary mac
 295  * client, whether it's on a 10 GbE or faster device, user controlled dladm(1M)
 296  * properties, and the nature of the hardware and the resources that it has.
 297  *
 298  * When there is no fanout, MAC does not create any soft rings for a device and
 299  * the device has frames delivered directly to the MAC client.
 300  *
 301  * Otherwise, all fanout is performed by software. MAC divides incoming frames
 302  * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and
 303  * everything else. Regardless of the type of fanout, these three categories
 304  * or buckets are always used.

 305  *
 306  * The difference between protocol level fanout and full software ring protocol
 307  * fanout is the number of software rings that end up getting created. The
 308  * system always uses the same number of software rings per protocol bucket. So
 309  * in the first case when we're just doing protocol level fanout, we just create
 310  * one software ring each for IPv4 TCP traffic, IPv4 UDP traffic, and everything
 311  * else.
 312  *
 313  * In the case where we do full software ring protocol fanout, we generally use
 314  * mac_compute_soft_ring_count() to determine the number of rings. There are
 315  * other combinations of properties and devices that may send us down other
 316  * paths, but this is a common starting point. If it's a non-bandwidth enforced
 317  * device and we're on at least a 10 GbE link, then we'll use eight soft rings
 318  * per protocol bucket as a starting point. See mac_compute_soft_ring_count()
 319  * for more information on the total number.
 320  *
 321  * For each of these rings, we create a mac_soft_ring_t and an associated worker
 322  * thread. Particularly when doing full software ring protocol fanout, we bind
 323  * each of the worker threads to individual CPUs.
 324  *


1457 
1458 #define MAC_FANOUT_DEFAULT      0
1459 #define MAC_FANOUT_RND_ROBIN    1
1460 int mac_fanout_type = MAC_FANOUT_DEFAULT;
1461 
1462 #define MAX_SR_TYPES    3
1463 /* fanout types for port based hashing; the first MAX_SR_TYPES index the per-type chains */
1464 enum pkt_type {
1465         V4_TCP = 0,     /* IPv4 TCP; delivered to the srs_tcp_soft_rings */
1466         V4_UDP,         /* IPv4 UDP; delivered to the srs_udp_soft_rings */
1467         OTH,            /* everything else; delivered to the srs_oth_soft_rings */
1468         UNDEF           /* transient: IP protocol not yet examined; also the loop bound */
1469 };
1470 
1471 /*
1472  * Pair of local and remote ports in the transport header
1473  */
1474 #define PORTS_SIZE 4
1475 
1476 /*
1477  * This routine delivers packets destined for an SRS into one of the


1478  * protocol soft rings.
1479  *
1480  * Given a chain of packets we need to split it up into multiple sub
1481  * chains: TCP, UDP or OTH soft ring. Instead of entering the soft
1482  * ring one packet at a time, we want to enter it in the form of a
1483  * chain otherwise we get this start/stop behaviour where the worker
1484  * thread goes to sleep and then next packet comes in forcing it to
1485  * wake up.
1486  */
1487 static void
1488 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
1489 {
1490         struct ether_header             *ehp;
1491         struct ether_vlan_header        *evhp;
1492         uint32_t                        sap;
1493         ipha_t                          *ipha;
1494         uint8_t                         *dstaddr;
1495         size_t                          hdrsize;
1496         mblk_t                          *mp;
1497         mblk_t                          *headmp[MAX_SR_TYPES];
1498         mblk_t                          *tailmp[MAX_SR_TYPES];
1499         int                             cnt[MAX_SR_TYPES];
1500         size_t                          sz[MAX_SR_TYPES];
1501         size_t                          sz1;
1502         boolean_t                       bw_ctl;
1503         boolean_t                       hw_classified;
1504         boolean_t                       dls_bypass;
1505         boolean_t                       is_ether;
1506         boolean_t                       is_unicast;
1507         enum pkt_type                   type;
1508         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
1509 
1510         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1511         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1512 
1513         /*
1514          * If we don't have a Rx ring, S/W classification would have done
1515          * its job and it's a packet meant for us. If we were polling on
1516          * the default ring (i.e. there was a ring assigned to this SRS),
1517          * then we need to make sure that the mac address really belongs
1518          * to us.
1519          */
1520         hw_classified = mac_srs->srs_ring != NULL &&
1521             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1522 
1523         /*
1524          * Some clients, such as non-ethernet, need DLS processing in
1525          * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
1526          * DLS bypass may also be disabled via the
1527          * MCIS_RX_BYPASS_DISABLE flag.
1528          */
1529         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1530             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1531 
1532         bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
1533         bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
1534         bzero(cnt, MAX_SR_TYPES * sizeof (int));
1535         bzero(sz, MAX_SR_TYPES * sizeof (size_t));
1536 
1537         /*
1538          * We have a chain from SRS that we need to split across the
1539          * soft rings. The squeues for the TCP and IPv4 SAPs use their
1540          * own soft rings to allow polling from the squeue. The rest of
1541          * the packets are delivered on the OTH soft ring which cannot
1542          * be polled.
1543          */
1544         while (head != NULL) {
1545                 mp = head;
1546                 head = head->b_next;
1547                 mp->b_next = NULL;
1548 
1549                 type = OTH;
1550                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1551 
1552                 if (is_ether) {
1553                         /*
1554                          * At this point we can be sure the packet at least
1555                          * has an ether header.
1556                          */
1557                         if (sz1 < sizeof (struct ether_header)) {
1558                                 mac_rx_drop_pkt(mac_srs, mp);
1559                                 continue;
1560                         }
1561                         ehp = (struct ether_header *)mp->b_rptr;
1562 
1563                         /*
1564                          * Determine if this is a VLAN or non-VLAN packet.
1565                          */
1566                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1567                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1568                                 sap = ntohs(evhp->ether_type);
1569                                 hdrsize = sizeof (struct ether_vlan_header);
1570 
1571                                 /*
1572                                  * Check if the VID of the packet, if
1573                                  * any, belongs to this client.
1574                                  * Technically, if this packet came up
1575                                  * via a HW classified ring then we
1576                                  * don't need to perform this check.
1577                                  * Perhaps a future optimization.
1578                                  */
1579                                 if (!mac_client_check_flow_vid(mcip,
1580                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
1581                                         mac_rx_drop_pkt(mac_srs, mp);
1582                                         continue;
1583                                 }
1584                         } else {
1585                                 hdrsize = sizeof (struct ether_header);
1586                         }
1587                         is_unicast =
1588                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1589                         dstaddr = (uint8_t *)&ehp->ether_dhost;
1590                 } else {
1591                         mac_header_info_t               mhi;
1592 
1593                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
1594                             mp, &mhi) != 0) {
1595                                 mac_rx_drop_pkt(mac_srs, mp);
1596                                 continue;
1597                         }


1622                                         type = UNDEF;
1623                                 rw_exit(&mcip->mci_rw_lock);
1624                         } else if (is_unicast) {
1625                                 type = UNDEF;
1626                         }
1627                 }
1628 
1629                 /*
1630                  * This needs to become a contract with the driver for
1631                  * the fast path.
1632                  *
1633                  * In the normal case the packet will have at least the L2
1634                  * header and the IP + Transport header in the same mblk.
1635                  * This is usually the case when the NIC driver sends up
1636                  * the packet. This is also true when the stack generates
1637                  * a packet that is looped back and when the stack uses the
1638                  * fastpath mechanism. The normal case is optimized for
1639                  * performance and may bypass DLS. All other cases go through
1640                  * the 'OTH' type path without DLS bypass.
1641                  */

1642                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1643                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
1644                         type = OTH;
1645 
1646                 if (type == OTH) {
1647                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
1648                             cnt[type], bw_ctl, sz[type], sz1, mp);
1649                         continue;
1650                 }
1651 
1652                 ASSERT(type == UNDEF);
1653 
1654                 /*
1655                  * Determine the type from the IP protocol value. If
1656                  * classified as TCP or UDP, then update the read
1657                  * pointer to the beginning of the IP header.
1658                  * Otherwise leave the message as is for further
1659                  * processing by DLS.
1660                  */
1661                 switch (ipha->ipha_protocol) {
1662                 case IPPROTO_TCP:
1663                         type = V4_TCP;
1664                         mp->b_rptr += hdrsize;
1665                         break;
1666                 case IPPROTO_UDP:
1667                         type = V4_UDP;
1668                         mp->b_rptr += hdrsize;
1669                         break;
1670                 default:
1671                         type = OTH;
1672                         break;
1673                 }
1674 
1675                 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
1676                     bw_ctl, sz[type], sz1, mp);
1677         }
1678 
1679         for (type = V4_TCP; type < UNDEF; type++) {


1683                         ASSERT(tailmp[type]->b_next == NULL);
1684                         switch (type) {
1685                         case V4_TCP:
1686                                 softring = mac_srs->srs_tcp_soft_rings[0];
1687                                 break;
1688                         case V4_UDP:
1689                                 softring = mac_srs->srs_udp_soft_rings[0];
1690                                 break;
1691                         case OTH:
1692                                 softring = mac_srs->srs_oth_soft_rings[0];
1693                         }
1694                         mac_rx_soft_ring_process(mcip, softring,
1695                             headmp[type], tailmp[type], cnt[type], sz[type]);
1696                 }
1697         }
1698 }
1699 
1700 int     fanout_unaligned = 0;
1701 
1702 /*
1703  * The fanout routine for any clients with DLS bypass disabled or for
1704  * traffic classified as "other". Returns -1 on an error (drop the
1705  * packet due to a malformed packet), 0 on success, with values
1706  * written in *indx and *type.

1707  */
1708 static int
1709 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
1710     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
1711 {
1712         ip6_t           *ip6h;
1713         ipha_t          *ipha;
1714         uint8_t         *whereptr;
1715         uint_t          hash;
1716         uint16_t        remlen;
1717         uint8_t         nexthdr;
1718         uint16_t        hdr_len;
1719         uint32_t        src_val, dst_val;
1720         boolean_t       modifiable = B_TRUE;
1721         boolean_t       v6;
1722 
1723         ASSERT(MBLKL(mp) >= hdrsize);
1724 
1725         if (sap == ETHERTYPE_IPV6) {
1726                 v6 = B_TRUE;


1852                             *(uint32_t *)whereptr);
1853                         *indx = COMPUTE_INDEX(hash,
1854                             mac_srs->srs_udp_ring_count);
1855                 } else {
1856                         *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
1857                         mac_srs->srs_ind++;
1858                 }
1859                 *type = OTH;
1860                 break;
1861         }
1862         return (0);
1863 
1864 src_dst_based_fanout:
1865         hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
1866         *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
1867         *type = OTH;
1868         return (0);
1869 }
1870 
1871 /*
1872  * This routine delivers packets destined for an SRS into a soft ring member


1873  * of the set.
1874  *
1875  * Given a chain of packets we need to split it up into multiple sub
1876  * chains: TCP, UDP or OTH soft ring. Instead of entering the soft
1877  * ring one packet at a time, we want to enter it in the form of a
1878  * chain otherwise we get this start/stop behaviour where the worker
1879  * thread goes to sleep and then next packet comes in forcing it to
1880  * wake up.
1881  *
1882  * Note:
1883  * Since we know what is the maximum fanout possible, we create a 2D array
1884  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
1885  * variables so that we can enter the softrings with a chain. We need the
1886  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
1887  * for each packet would be expensive). If we ever want to have the
1888  * ability to have unlimited fanout, we should probably declare a head,
1889  * tail, cnt, sz with each soft ring (a data struct which contains a softring
1890  * along with these members) and create an array of this uber struct so we
1891  * don't have to do kmem_alloc.
1892  */
1893 int     fanout_oth1 = 0;
1894 int     fanout_oth2 = 0;
1895 int     fanout_oth3 = 0;
1896 int     fanout_oth4 = 0;
1897 int     fanout_oth5 = 0;
1898 
1899 static void
1900 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)


1921         boolean_t                       is_ether;
1922         boolean_t                       is_unicast;
1923         int                             fanout_cnt;
1924         enum pkt_type                   type;
1925         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
1926 
1927         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
1928         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
1929 
1930         /*
1931          * If we don't have a Rx ring, S/W classification would have done
1932          * its job and it's a packet meant for us. If we were polling on
1933          * the default ring (i.e. there was a ring assigned to this SRS),
1934          * then we need to make sure that the mac address really belongs
1935          * to us.
1936          */
1937         hw_classified = mac_srs->srs_ring != NULL &&
1938             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
1939 
1940         /*
1941          * Some clients, such as non Ethernet, need DLS processing in
1942          * the Rx path. Such clients clear the SRST_DLS_BYPASS flag.
1943          * DLS bypass may also be disabled via the
1944          * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by
1945          * sun4v vsw currently.
1946          */
1947         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
1948             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1949 
1950         /*
1951          * Since the softrings are never destroyed and we always
1952          * create equal number of softrings for TCP, UDP and rest,
1953          * it's OK to check one of them for count and use it without
1954          * any lock. In future, if soft rings get destroyed because
1955          * of reduction in fanout, we will need to ensure that happens
1956          * behind the SRS_PROC.
1957          */
1958         fanout_cnt = mac_srs->srs_tcp_ring_count;
1959 
1960         bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1961         bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1962         bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1963         bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1964 
1965         /*
1966          * We got a chain from SRS that we need to send to the soft rings.
1967          * Since squeues for TCP & IPv4 SAP poll their soft rings (for
1968          * performance reasons), we need to separate out v4_tcp, v4_udp
1969          * and the rest goes in other.
1970          */
1971         while (head != NULL) {
1972                 mp = head;
1973                 head = head->b_next;
1974                 mp->b_next = NULL;
1975 
1976                 type = OTH;
1977                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1978 
1979                 if (is_ether) {
1980                         /*
1981                          * At this point we can be sure the packet at least
1982                          * has an ether header.
1983                          */
1984                         if (sz1 < sizeof (struct ether_header)) {
1985                                 mac_rx_drop_pkt(mac_srs, mp);
1986                                 continue;
1987                         }
1988                         ehp = (struct ether_header *)mp->b_rptr;
1989 
1990                         /*
1991                          * Determine if this is a VLAN or non-VLAN packet.
1992                          */
1993                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1994                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1995                                 sap = ntohs(evhp->ether_type);
1996                                 hdrsize = sizeof (struct ether_vlan_header);
1997 
1998                                 /*
1999                                  * Check if the VID of the packet, if
2000                                  * any, belongs to this client.
2001                                  * Technically, if this packet came up
2002                                  * via a HW classified ring then we
2003                                  * don't need to perform this check.
2004                                  * Perhaps a future optimization.
2005                                  */
2006                                 if (!mac_client_check_flow_vid(mcip,
2007                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
2008                                         mac_rx_drop_pkt(mac_srs, mp);
2009                                         continue;
2010                                 }
2011                         } else {
2012                                 hdrsize = sizeof (struct ether_header);
2013                         }
2014                         is_unicast =
2015                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
2016                         dstaddr = (uint8_t *)&ehp->ether_dhost;
2017                 } else {
2018                         mac_header_info_t               mhi;
2019 
2020                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
2021                             mp, &mhi) != 0) {
2022                                 mac_rx_drop_pkt(mac_srs, mp);
2023                                 continue;
2024                         }
2025                         hdrsize = mhi.mhi_hdrsize;
2026                         sap = mhi.mhi_bindsap;
2027                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
2028                         dstaddr = (uint8_t *)mhi.mhi_daddr;
2029                 }
2030 
2031                 if (!dls_bypass) {
2032                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
2033                             hdrsize, &type, &indx) == -1) {
2034                                 mac_rx_drop_pkt(mac_srs, mp);
2035                                 continue;
2036                         }
2037 
2038                         FANOUT_ENQUEUE_MP(headmp[type][indx],
2039                             tailmp[type][indx], cnt[type][indx], bw_ctl,
2040                             sz[type][indx], sz1, mp);
2041                         continue;
2042                 }
2043 

2044                 /*
2045                  * If we are using the default Rx ring where H/W or S/W
2046                  * classification has not happened, we need to verify if
2047                  * this unicast packet really belongs to us.
2048                  */
2049                 if (sap == ETHERTYPE_IP) {
2050                         /*
2051                          * If we are H/W classified, but we have promisc
2052                          * on, then we need to check for the unicast address.
2053                          */
2054                         if (hw_classified && mcip->mci_promisc_list != NULL) {
2055                                 mac_address_t           *map;
2056 
2057                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
2058                                 map = mcip->mci_unicast;
2059                                 if (bcmp(dstaddr, map->ma_addr,
2060                                     map->ma_len) == 0)
2061                                         type = UNDEF;
2062                                 rw_exit(&mcip->mci_rw_lock);
2063                         } else if (is_unicast) {


2612                  */
2613                 MAC_SRS_POLL_RING(mac_srs);
2614         }
2615 
2616 again:
2617         head = mac_srs->srs_first;
2618         mac_srs->srs_first = NULL;
2619         tail = mac_srs->srs_last;
2620         mac_srs->srs_last = NULL;
2621         cnt = mac_srs->srs_count;
2622         mac_srs->srs_count = 0;
2623 
2624         ASSERT(head != NULL);
2625         ASSERT(tail != NULL);
2626 
2627         if ((tid = mac_srs->srs_tid) != NULL)
2628                 mac_srs->srs_tid = NULL;
2629 
2630         mac_srs->srs_state |= (SRS_PROC|proc_type);
2631 

2632         /*
2633          * mcip is NULL for broadcast and multicast flows. The promisc
2634          * callbacks for broadcast and multicast packets are delivered from
2635          * mac_rx() and we don't need to worry about that case in this path
2636          */
2637         if (mcip != NULL) {
2638                 if (mcip->mci_promisc_list != NULL) {
2639                         mutex_exit(&mac_srs->srs_lock);
2640                         mac_promisc_client_dispatch(mcip, head);
2641                         mutex_enter(&mac_srs->srs_lock);
2642                 }
2643                 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
2644                         mutex_exit(&mac_srs->srs_lock);
2645                         mac_protect_intercept_dynamic(mcip, head);
2646                         mutex_enter(&mac_srs->srs_lock);
2647                 }
2648         }
2649 
2650         /*
2651          * Check if SRS itself is doing the processing. This direct
2652          * path applies only when subflows are present.


2653          */
2654         if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
2655                 mac_direct_rx_t         proc;
2656                 void                    *arg1;
2657                 mac_resource_handle_t   arg2;
2658 
2659                 /*
2660                  * This is the case when a Rx is directly
2661                  * assigned and we have a fully classified
2662                  * protocol chain. We can deal with it in
2663                  * one shot.
2664                  */
2665                 proc = srs_rx->sr_func;
2666                 arg1 = srs_rx->sr_arg1;
2667                 arg2 = srs_rx->sr_arg2;
2668 
2669                 mac_srs->srs_state |= SRS_CLIENT_PROC;
2670                 mutex_exit(&mac_srs->srs_lock);
2671                 if (tid != NULL) {
2672                         (void) untimeout(tid);


4644  * flows as well.
4645  */
4646 /* ARGSUSED */
4647 void
4648 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
4649     mac_header_info_t *arg3)
4650 {
4651         mac_client_impl_t *mcip = arg1; /* arg1 carries the MAC client */
4652 
4653         if (mcip->mci_nvids == 1 &&
4654             !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
4655                 /*
4656                  * If the client has exactly one VID associated with it
4657                  * and stripping of the VLAN header is not disabled,
4658                  * remove the VLAN tag from the packet before
4659                  * passing it on to the client's receive callback.
4660                  * Note that this needs to be done after we dispatch
4661                  * the packet to the promiscuous listeners of the
4662                  * client, since they expect to see the whole
4663                  * frame including the VLAN headers.
4664                  *
4665                  * The MCIS_STRIP_DISABLE is only issued when sun4v
4666                  * vsw is in play.
4667                  */
4668                 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
4669         }
4670 
4671         mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
4672 }
4673 
4674 /*
4675  * Process a chain for a given soft ring. If the number of packets
4676  * queued in the SRS and its associated soft rings (including this
4677  * one) is very small (tracked by srs_poll_pkt_cnt) then allow the
4678  * entering thread (interrupt or poll thread) to process the chain
4679  * inline. This is meant to reduce latency under low load.
4680  *






4681  * The proc and arg for each mblk is already stored in the mblk in
4682  * appropriate places.
4683  */
4684 /* ARGSUSED */
4685 void
4686 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
4687     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
4688 {
4689         mac_direct_rx_t         proc;
4690         void                    *arg1;
4691         mac_resource_handle_t   arg2;
4692         mac_soft_ring_set_t     *mac_srs = ringp->s_ring_set;
4693 
4694         ASSERT(ringp != NULL);
4695         ASSERT(mp_chain != NULL);
4696         ASSERT(tail != NULL);
4697         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4698 
4699         mutex_enter(&ringp->s_ring_lock);
4700         ringp->s_ring_total_inpkt += cnt;


4718                  */
4719                 if (ringp->s_ring_first == NULL) {
4720                         /*
4721                          * Fast-path, ok to process and nothing queued.
4722                          */
4723                         ringp->s_ring_run = curthread;
4724                         ringp->s_ring_state |= (S_RING_PROC);
4725 
4726                         mutex_exit(&ringp->s_ring_lock);
4727 
4728                         /*
4729                          * We are the chain of 1 packet so
4730                          * go through this fast path.
4731                          */
4732                         ASSERT(mp_chain->b_next == NULL);
4733 
4734                         (*proc)(arg1, arg2, mp_chain, NULL);
4735 
4736                         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
4737                         /*
4738                          * If we have an SRS performing bandwidth
4739                          * control then we need to decrement the size
4740                          * and count so the SRS has an accurate count
4741                          * of the data queued between the SRS and its
4742                          * soft rings. We decrement the counters only
4743                          * when the packet is processed by both the
4744                          * SRS and the soft ring.
4745                          */
4746                         mutex_enter(&mac_srs->srs_lock);
4747                         MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
4748                         MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
4749                         mutex_exit(&mac_srs->srs_lock);
4750 
4751                         mutex_enter(&ringp->s_ring_lock);
4752                         ringp->s_ring_run = NULL;
4753                         ringp->s_ring_state &= ~S_RING_PROC;
4754                         if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
4755                                 cv_signal(&ringp->s_ring_client_cv);
4756 
4757                         if ((ringp->s_ring_first == NULL) ||
4758                             (ringp->s_ring_state & S_RING_BLANK)) {
4759                                 /*
4760                                  * We processed a single packet inline
4761                                  * and nothing new has arrived or our
4762                                  * receiver doesn't want to receive
4763                                  * any packets. We are done.
4764                                  */
4765                                 mutex_exit(&ringp->s_ring_lock);
4766                                 return;
4767                         }
4768                 } else {
4769                         SOFT_RING_ENQUEUE_CHAIN(ringp,
4770                             mp_chain, tail, cnt, sz);
4771                 }
4772 
4773                 /*
4774                  * We are here because either we couldn't do inline
4775                  * processing (because something was already
4776                  * queued), or we had a chain of more than one
4777                  * packet, or something else arrived after we were
4778                  * done with inline processing.
4779                  */
4780                 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
4781                 ASSERT(ringp->s_ring_first != NULL);