1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  *
  25  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  26  * Copyright 2018 Joyent, Inc.
  27  */
  28 /* Copyright (c) 1990 Mentat Inc. */
  29 
  30 #include <sys/types.h>
  31 #include <sys/stream.h>
  32 #include <sys/dlpi.h>
  33 #include <sys/stropts.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/strsubr.h>
  36 #include <sys/strlog.h>
  37 #include <sys/strsun.h>
  38 #include <sys/zone.h>
  39 #define _SUN_TPI_VERSION 2
  40 #include <sys/tihdr.h>
  41 #include <sys/xti_inet.h>
  42 #include <sys/ddi.h>
  43 #include <sys/sunddi.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/debug.h>
  46 #include <sys/kobj.h>
  47 #include <sys/modctl.h>
  48 #include <sys/atomic.h>
  49 #include <sys/policy.h>
  50 #include <sys/priv.h>
  51 
  52 #include <sys/systm.h>
  53 #include <sys/param.h>
  54 #include <sys/kmem.h>
  55 #include <sys/sdt.h>
  56 #include <sys/socket.h>
  57 #include <sys/vtrace.h>
  58 #include <sys/isa_defs.h>
  59 #include <sys/mac.h>
  60 #include <net/if.h>
  61 #include <net/if_arp.h>
  62 #include <net/route.h>
  63 #include <sys/sockio.h>
  64 #include <netinet/in.h>
  65 #include <net/if_dl.h>
  66 
  67 #include <inet/common.h>
  68 #include <inet/mi.h>
  69 #include <inet/mib2.h>
  70 #include <inet/nd.h>
  71 #include <inet/arp.h>
  72 #include <inet/snmpcom.h>
  73 #include <inet/kstatcom.h>
  74 
  75 #include <netinet/igmp_var.h>
  76 #include <netinet/ip6.h>
  77 #include <netinet/icmp6.h>
  78 #include <netinet/sctp.h>
  79 
  80 #include <inet/ip.h>
  81 #include <inet/ip_impl.h>
  82 #include <inet/ip6.h>
  83 #include <inet/ip6_asp.h>
  84 #include <inet/optcom.h>
  85 #include <inet/tcp.h>
  86 #include <inet/tcp_impl.h>
  87 #include <inet/ip_multi.h>
  88 #include <inet/ip_if.h>
  89 #include <inet/ip_ire.h>
  90 #include <inet/ip_ftable.h>
  91 #include <inet/ip_rts.h>
  92 #include <inet/ip_ndp.h>
  93 #include <inet/ip_listutils.h>
  94 #include <netinet/igmp.h>
  95 #include <netinet/ip_mroute.h>
  96 #include <inet/ipp_common.h>
  97 
  98 #include <net/pfkeyv2.h>
  99 #include <inet/sadb.h>
 100 #include <inet/ipsec_impl.h>
 101 #include <inet/ipdrop.h>
 102 #include <inet/ip_netinfo.h>
 103 #include <inet/ilb_ip.h>
 104 #include <sys/squeue_impl.h>
 105 #include <sys/squeue.h>
 106 
 107 #include <sys/ethernet.h>
 108 #include <net/if_types.h>
 109 #include <sys/cpuvar.h>
 110 
 111 #include <ipp/ipp.h>
 112 #include <ipp/ipp_impl.h>
 113 #include <ipp/ipgpc/ipgpc.h>
 114 
 115 #include <sys/pattr.h>
 116 #include <inet/ipclassifier.h>
 117 #include <inet/sctp_ip.h>
 118 #include <inet/sctp/sctp_impl.h>
 119 #include <inet/udp_impl.h>
 120 #include <sys/sunddi.h>
 121 
 122 #include <sys/tsol/label.h>
 123 #include <sys/tsol/tnet.h>
 124 
 125 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */
 126 
 127 #ifdef  DEBUG
 128 extern boolean_t skip_sctp_cksum;
 129 #endif
 130 
 131 static void     ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
 132     ip_recv_attr_t *);
 133 
 134 static void     ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
 135     ip_recv_attr_t *);
 136 static void     ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
 137     ip_recv_attr_t *);
 138 
 139 #pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
 140 
 141 /*
 142  * Direct read side procedure capable of dealing with chains. GLDv3 based
 143  * drivers call this function directly with mblk chains while STREAMS
 144  * read side procedure ip_rput() calls this for single packet with ip_ring
 145  * set to NULL to process one packet at a time.
 146  *
 147  * The ill will always be valid if this function is called directly from
 148  * the driver.
 149  *
 150  * If this chain is part of a VLAN stream, then the VLAN tag is
 151  * stripped from the MAC header before being delivered to this
 152  * function.
 153  *
 154  * If the IP header in packet is not 32-bit aligned, every message in the
 155  * chain will be aligned before further operations. This is required on SPARC
 156  * platform.
 157  */
 158 void
 159 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 160     struct mac_header_info_s *mhip)
 161 {
 162         (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
 163             NULL);
 164 }
 165 
 166 /*
 167  * ip_accept_tcp() - This function is called by the squeue when it retrieves
 168  * a chain of packets in the poll mode. The packets have gone through the
 169  * data link processing but not IP processing. For performance and latency
 170  * reasons, the squeue wants to process the chain in line instead of feeding
 171  * it back via ip_input path.
 172  *
 173  * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
 174  * will pass back any TCP packets matching the target sqp to
 175  * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
 176  * ip_input_v4 and ip_fanout_v4 as normal.
 177  * The TCP packets that match the target squeue are returned to the caller
 178  * as a b_next chain after each packet has been prepend with an mblk
 179  * from ip_recv_attr_to_mblk.
 180  */
 181 mblk_t *
 182 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
 183     mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
 184 {
 185         return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
 186             last, cnt));
 187 }
 188 
 189 /*
 190  * Used by ip_input and ip_accept_tcp
 191  * The last three arguments are only used by ip_accept_tcp, and mhip is
 192  * only used by ip_input.
 193  */
 194 mblk_t *
 195 ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 196     struct mac_header_info_s *mhip, squeue_t *target_sqp,
 197     mblk_t **last, uint_t *cnt)
 198 {
 199         mblk_t          *mp;
 200         ipha_t          *ipha;
 201         ip_recv_attr_t  iras;   /* Receive attributes */
 202         rtc_t           rtc;
 203         iaflags_t       chain_flags = 0;        /* Fixed for chain */
 204         mblk_t          *ahead = NULL;  /* Accepted head */
 205         mblk_t          *atail = NULL;  /* Accepted tail */
 206         uint_t          acnt = 0;       /* Accepted count */
 207 
 208         ASSERT(mp_chain != NULL);
 209         ASSERT(ill != NULL);
 210 
 211         /* These ones do not change as we loop over packets */
 212         iras.ira_ill = iras.ira_rill = ill;
 213         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
 214         iras.ira_rifindex = iras.ira_ruifindex;
 215         iras.ira_sqp = NULL;
 216         iras.ira_ring = ip_ring;
 217         /* For ECMP and outbound transmit ring selection */
 218         iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
 219 
 220         iras.ira_target_sqp = target_sqp;
 221         iras.ira_target_sqp_mp = NULL;
 222         if (target_sqp != NULL)
 223                 chain_flags |= IRAF_TARGET_SQP;
 224 
 225         /*
 226          * We try to have a mhip pointer when possible, but
 227          * it might be NULL in some cases. In those cases we
 228          * have to assume unicast.
 229          */
 230         iras.ira_mhip = mhip;
 231         iras.ira_flags = 0;
 232         if (mhip != NULL) {
 233                 switch (mhip->mhi_dsttype) {
 234                 case MAC_ADDRTYPE_MULTICAST :
 235                         chain_flags |= IRAF_L2DST_MULTICAST;
 236                         break;
 237                 case MAC_ADDRTYPE_BROADCAST :
 238                         chain_flags |= IRAF_L2DST_BROADCAST;
 239                         break;
 240                 }
 241         }
 242 
 243         /*
 244          * Initialize the one-element route cache.
 245          *
 246          * We do ire caching from one iteration to
 247          * another. In the event the packet chain contains
 248          * all packets from the same dst, this caching saves
 249          * an ire_route_recursive for each of the succeeding
 250          * packets in a packet chain.
 251          */
 252         rtc.rtc_ire = NULL;
 253         rtc.rtc_ipaddr = INADDR_ANY;
 254 
 255         /* Loop over b_next */
 256         for (mp = mp_chain; mp != NULL; mp = mp_chain) {
 257                 mp_chain = mp->b_next;
 258                 mp->b_next = NULL;
 259 
 260                 ASSERT(DB_TYPE(mp) == M_DATA);
 261 
 262 
 263                 /*
 264                  * if db_ref > 1 then copymsg and free original. Packet
 265                  * may be changed and we do not want the other entity
 266                  * who has a reference to this message to trip over the
 267                  * changes. This is a blind change because trying to
 268                  * catch all places that might change the packet is too
 269                  * difficult.
 270                  *
 271                  * This corresponds to the fast path case, where we have
 272                  * a chain of M_DATA mblks.  We check the db_ref count
 273                  * of only the 1st data block in the mblk chain. There
 274                  * doesn't seem to be a reason why a device driver would
 275                  * send up data with varying db_ref counts in the mblk
 276                  * chain. In any case the Fast path is a private
 277                  * interface, and our drivers don't do such a thing.
 278                  * Given the above assumption, there is no need to walk
 279                  * down the entire mblk chain (which could have a
 280                  * potential performance problem)
 281                  *
 282                  * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
 283                  * to here because of exclusive ip stacks and vnics.
 284                  * Packets transmitted from exclusive stack over vnic
 285                  * can have db_ref > 1 and when it gets looped back to
 286                  * another vnic in a different zone, you have ip_input()
 287                  * getting dblks with db_ref > 1. So if someone
 288                  * complains of TCP performance under this scenario,
 289                  * take a serious look here on the impact of copymsg().
 290                  */
 291                 if (DB_REF(mp) > 1) {
 292                         if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
 293                                 /* mhip might point into 1st packet in chain */
 294                                 iras.ira_mhip = NULL;
 295                                 continue;
 296                         }
 297                 }
 298 
 299                 /*
 300                  * IP header ptr not aligned?
 301                  * OR IP header not complete in first mblk
 302                  */
 303                 ipha = (ipha_t *)mp->b_rptr;
 304                 if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
 305                         mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
 306                             &iras);
 307                         if (mp == NULL) {
 308                                 /* mhip might point into 1st packet in chain */
 309                                 iras.ira_mhip = NULL;
 310                                 continue;
 311                         }
 312                         ipha = (ipha_t *)mp->b_rptr;
 313                 }
 314 
 315                 /* Protect against a mix of Ethertypes and IP versions */
 316                 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
 317                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
 318                         ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
 319                         freemsg(mp);
 320                         /* mhip might point into 1st packet in the chain. */
 321                         iras.ira_mhip = NULL;
 322                         continue;
 323                 }
 324 
 325                 /*
 326                  * Check for Martian addrs; we have to explicitly
 327                  * test for for zero dst since this is also used as
 328                  * an indication that the rtc is not used.
 329                  */
 330                 if (ipha->ipha_dst == INADDR_ANY) {
 331                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 332                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 333                         freemsg(mp);
 334                         /* mhip might point into 1st packet in the chain. */
 335                         iras.ira_mhip = NULL;
 336                         continue;
 337                 }
 338 
 339                 /*
 340                  * Keep L2SRC from a previous packet in chain since mhip
 341                  * might point into an earlier packet in the chain.
 342                  * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
 343                  * source check in forwarding path.
 344                  */
 345                 chain_flags |= (iras.ira_flags &
 346                     (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
 347 
 348                 iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
 349                     IRAF_VERIFY_ULP_CKSUM | chain_flags;
 350                 iras.ira_free_flags = 0;
 351                 iras.ira_cred = NULL;
 352                 iras.ira_cpid = NOPID;
 353                 iras.ira_tsl = NULL;
 354                 iras.ira_zoneid = ALL_ZONES;    /* Default for forwarding */
 355 
 356                 /*
 357                  * We must count all incoming packets, even if they end
 358                  * up being dropped later on. Defer counting bytes until
 359                  * we have the whole IP header in first mblk.
 360                  */
 361                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 362 
 363                 iras.ira_pktlen = ntohs(ipha->ipha_length);
 364                 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
 365                     iras.ira_pktlen);
 366 
 367                 /*
 368                  * Call one of:
 369                  *      ill_input_full_v4
 370                  *      ill_input_short_v4
 371                  * The former is used in unusual cases. See ill_set_inputfn().
 372                  */
 373                 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
 374 
 375                 /* Any references to clean up? No hold on ira_ill */
 376                 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
 377                         ira_cleanup(&iras, B_FALSE);
 378 
 379                 if (iras.ira_target_sqp_mp != NULL) {
 380                         /* Better be called from ip_accept_tcp */
 381                         ASSERT(target_sqp != NULL);
 382 
 383                         /* Found one packet to accept */
 384                         mp = iras.ira_target_sqp_mp;
 385                         iras.ira_target_sqp_mp = NULL;
 386                         ASSERT(ip_recv_attr_is_mblk(mp));
 387 
 388                         if (atail != NULL)
 389                                 atail->b_next = mp;
 390                         else
 391                                 ahead = mp;
 392                         atail = mp;
 393                         acnt++;
 394                         mp = NULL;
 395                 }
 396                 /* mhip might point into 1st packet in the chain. */
 397                 iras.ira_mhip = NULL;
 398         }
 399         /* Any remaining references to the route cache? */
 400         if (rtc.rtc_ire != NULL) {
 401                 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
 402                 ire_refrele(rtc.rtc_ire);
 403         }
 404 
 405         if (ahead != NULL) {
 406                 /* Better be called from ip_accept_tcp */
 407                 ASSERT(target_sqp != NULL);
 408                 *last = atail;
 409                 *cnt = acnt;
 410                 return (ahead);
 411         }
 412 
 413         return (NULL);
 414 }
 415 
 416 /*
 417  * This input function is used when
 418  *  - is_system_labeled()
 419  *  - CGTP filtering
 420  *  - DHCP unicast before we have an IP address configured
 421  *  - there is an listener for IPPROTO_RSVP
 422  */
 423 void
 424 ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 425     ip_recv_attr_t *ira, rtc_t *rtc)
 426 {
 427         ipha_t          *ipha = (ipha_t *)iph_arg;
 428         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 429         ill_t           *ill = ira->ira_ill;
 430         ip_stack_t      *ipst = ill->ill_ipst;
 431         int             cgtp_flt_pkt;
 432 
 433         ASSERT(ira->ira_tsl == NULL);
 434 
 435         /*
 436          * Attach any necessary label information to
 437          * this packet
 438          */
 439         if (is_system_labeled()) {
 440                 ira->ira_flags |= IRAF_SYSTEM_LABELED;
 441 
 442                 /*
 443                  * This updates ira_cred, ira_tsl and ira_free_flags based
 444                  * on the label.
 445                  */
 446                 if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
 447                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 448                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
 449                         freemsg(mp);
 450                         return;
 451                 }
 452                 /* Note that ira_tsl can be NULL here. */
 453 
 454                 /* tsol_get_pkt_label sometimes does pullupmsg */
 455                 ipha = (ipha_t *)mp->b_rptr;
 456         }
 457 
 458         /*
 459          * Invoke the CGTP (multirouting) filtering module to process
 460          * the incoming packet. Packets identified as duplicates
 461          * must be discarded. Filtering is active only if the
 462          * the ip_cgtp_filter ndd variable is non-zero.
 463          */
 464         cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
 465         if (ipst->ips_ip_cgtp_filter &&
 466             ipst->ips_ip_cgtp_filter_ops != NULL) {
 467                 netstackid_t stackid;
 468 
 469                 stackid = ipst->ips_netstack->netstack_stackid;
 470                 /*
 471                  * CGTP and IPMP are mutually exclusive so
 472                  * phyint_ifindex is fine here.
 473                  */
 474                 cgtp_flt_pkt =
 475                     ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
 476                     ill->ill_phyint->phyint_ifindex, mp);
 477                 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
 478                         ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
 479                         freemsg(mp);
 480                         return;
 481                 }
 482         }
 483 
 484         /*
 485          * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
 486          * server to unicast DHCP packets to a DHCP client using the
 487          * IP address it is offering to the client.  This can be
 488          * disabled through the "broadcast bit", but not all DHCP
 489          * servers honor that bit.  Therefore, to interoperate with as
 490          * many DHCP servers as possible, the DHCP client allows the
 491          * server to unicast, but we treat those packets as broadcast
 492          * here.  Note that we don't rewrite the packet itself since
 493          * (a) that would mess up the checksums and (b) the DHCP
 494          * client conn is bound to INADDR_ANY so ip_fanout_udp() will
 495          * hand it the packet regardless.
 496          */
 497         if (ill->ill_dhcpinit != 0 &&
 498             ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
 499             ipha->ipha_protocol == IPPROTO_UDP) {
 500                 udpha_t *udpha;
 501 
 502                 ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
 503                 if (ipha == NULL) {
 504                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 505                         ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
 506                         freemsg(mp);
 507                         return;
 508                 }
 509                 /* Reload since pullupmsg() can change b_rptr. */
 510                 udpha = (udpha_t *)&ipha[1];
 511 
 512                 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
 513                         DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
 514                             mblk_t *, mp);
 515                         /*
 516                          * This assumes that we deliver to all conns for
 517                          * multicast and broadcast packets.
 518                          */
 519                         nexthop = INADDR_BROADCAST;
 520                         ira->ira_flags |= IRAF_DHCP_UNICAST;
 521                 }
 522         }
 523 
 524         /*
 525          * If rsvpd is running, let RSVP daemon handle its processing
 526          * and forwarding of RSVP multicast/unicast packets.
 527          * If rsvpd is not running but mrouted is running, RSVP
 528          * multicast packets are forwarded as multicast traffic
 529          * and RSVP unicast packets are forwarded by unicast router.
 530          * If neither rsvpd nor mrouted is running, RSVP multicast
 531          * packets are not forwarded, but the unicast packets are
 532          * forwarded like unicast traffic.
 533          */
 534         if (ipha->ipha_protocol == IPPROTO_RSVP &&
 535             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
 536                 /* RSVP packet and rsvpd running. Treat as ours */
 537                 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
 538                 /*
 539                  * We use a multicast address to get the packet to
 540                  * ire_recv_multicast_v4. There will not be a membership
 541                  * check since we set IRAF_RSVP
 542                  */
 543                 nexthop = htonl(INADDR_UNSPEC_GROUP);
 544                 ira->ira_flags |= IRAF_RSVP;
 545         }
 546 
 547         ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
 548 }
 549 
 550 /*
 551  * This is the tail-end of the full receive side packet handling.
 552  * It can be used directly when the configuration is simple.
 553  */
 554 void
 555 ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 556     ip_recv_attr_t *ira, rtc_t *rtc)
 557 {
 558         ire_t           *ire;
 559         uint_t          opt_len;
 560         ill_t           *ill = ira->ira_ill;
 561         ip_stack_t      *ipst = ill->ill_ipst;
 562         uint_t          pkt_len;
 563         ssize_t         len;
 564         ipha_t          *ipha = (ipha_t *)iph_arg;
 565         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 566         ilb_stack_t     *ilbs = ipst->ips_netstack->netstack_ilb;
 567         uint_t          irr_flags;
 568 #define rptr    ((uchar_t *)ipha)
 569 
 570         ASSERT(DB_TYPE(mp) == M_DATA);
 571 
 572         /*
 573          * The following test for loopback is faster than
 574          * IP_LOOPBACK_ADDR(), because it avoids any bitwise
 575          * operations.
 576          * Note that these addresses are always in network byte order
 577          */
 578         if (((*(uchar_t *)&ipha->ipha_dst) == IN_LOOPBACKNET) ||
 579             ((*(uchar_t *)&ipha->ipha_src) == IN_LOOPBACKNET)) {
 580                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 581                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 582                 freemsg(mp);
 583                 return;
 584         }
 585 
 586         len = mp->b_wptr - rptr;
 587         pkt_len = ira->ira_pktlen;
 588 
 589         /* multiple mblk or too short */
 590         len -= pkt_len;
 591         if (len != 0) {
 592                 mp = ip_check_length(mp, rptr, len, pkt_len,
 593                     IP_SIMPLE_HDR_LENGTH, ira);
 594                 if (mp == NULL)
 595                         return;
 596                 ipha = (ipha_t *)mp->b_rptr;
 597         }
 598 
 599         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 600             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
 601             int, 0);
 602 
 603         /*
 604          * The event for packets being received from a 'physical'
 605          * interface is placed after validation of the source and/or
 606          * destination address as being local so that packets can be
 607          * redirected to loopback addresses using ipnat.
 608          */
 609         DTRACE_PROBE4(ip4__physical__in__start,
 610             ill_t *, ill, ill_t *, NULL,
 611             ipha_t *, ipha, mblk_t *, mp);
 612 
 613         if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
 614                 int     ll_multicast = 0;
 615                 int     error;
 616                 ipaddr_t orig_dst = ipha->ipha_dst;
 617 
 618                 if (ira->ira_flags & IRAF_L2DST_MULTICAST)
 619                         ll_multicast = HPE_MULTICAST;
 620                 else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
 621                         ll_multicast = HPE_BROADCAST;
 622 
 623                 FW_HOOKS(ipst->ips_ip4_physical_in_event,
 624                     ipst->ips_ipv4firewall_physical_in,
 625                     ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
 626 
 627                 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
 628 
 629                 if (mp == NULL)
 630                         return;
 631                 /* The length could have changed */
 632                 ipha = (ipha_t *)mp->b_rptr;
 633                 ira->ira_pktlen = ntohs(ipha->ipha_length);
 634                 pkt_len = ira->ira_pktlen;
 635 
 636                 /*
 637                  * In case the destination changed we override any previous
 638                  * change to nexthop.
 639                  */
 640                 if (orig_dst != ipha->ipha_dst)
 641                         nexthop = ipha->ipha_dst;
 642                 if (nexthop == INADDR_ANY) {
 643                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 644                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 645                         freemsg(mp);
 646                         return;
 647                 }
 648         }
 649 
 650         if (ipst->ips_ip4_observe.he_interested) {
 651                 zoneid_t dzone;
 652 
 653                 /*
 654                  * On the inbound path the src zone will be unknown as
 655                  * this packet has come from the wire.
 656                  */
 657                 dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
 658                 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
 659         }
 660 
 661         /*
 662          * If there is a good HW IP header checksum we clear the need
  663          * to look at the IP header checksum.
 664          */
 665         if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
 666             ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 667                 /* Header checksum was ok. Clear the flag */
 668                 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 669                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 670         }
 671 
 672         /*
  673          * Here we check to see if the machine is set up as an
  674          * L3 load balancer and if the incoming packet is for a VIP
 675          *
 676          * Check the following:
 677          * - there is at least a rule
 678          * - protocol of the packet is supported
 679          */
 680         if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
 681                 ipaddr_t        lb_dst;
 682                 int             lb_ret;
 683 
 684                 /* For convenience, we pull up the mblk. */
 685                 if (mp->b_cont != NULL) {
 686                         if (pullupmsg(mp, -1) == 0) {
 687                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 688                                 ip_drop_input("ipIfStatsInDiscards - pullupmsg",
 689                                     mp, ill);
 690                                 freemsg(mp);
 691                                 return;
 692                         }
 693                         ipha = (ipha_t *)mp->b_rptr;
 694                 }
 695 
 696                 /*
 697                  * We just drop all fragments going to any VIP, at
 698                  * least for now....
 699                  */
 700                 if (ntohs(ipha->ipha_fragment_offset_and_flags) &
 701                     (IPH_MF | IPH_OFFSET)) {
 702                         if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
 703                                 goto after_ilb;
 704                         }
 705 
 706                         ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
 707                         ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
 708                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 709                         ip_drop_input("ILB fragment", mp, ill);
 710                         freemsg(mp);
 711                         return;
 712                 }
 713                 lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
 714                     (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
 715 
 716                 if (lb_ret == ILB_DROPPED) {
 717                         /* Is this the right counter to increase? */
 718                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 719                         ip_drop_input("ILB_DROPPED", mp, ill);
 720                         freemsg(mp);
 721                         return;
 722                 }
 723                 if (lb_ret == ILB_BALANCED) {
 724                         /* Set the dst to that of the chosen server */
 725                         nexthop = lb_dst;
 726                         DB_CKSUMFLAGS(mp) = 0;
 727                 }
 728         }
 729 
 730 after_ilb:
 731         opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
 732         ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
 733         if (opt_len != 0) {
 734                 int error = 0;
 735 
 736                 ira->ira_ip_hdr_length += (opt_len << 2);
 737                 ira->ira_flags |= IRAF_IPV4_OPTIONS;
 738 
 739                 /* IP Options present!  Validate the length. */
 740                 mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
 741                 if (mp == NULL)
 742                         return;
 743 
 744                 /* Might have changed */
 745                 ipha = (ipha_t *)mp->b_rptr;
 746 
 747                 /* Verify IP header checksum before parsing the options */
 748                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
 749                     ip_csum_hdr(ipha)) {
 750                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
 751                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
 752                         freemsg(mp);
 753                         return;
 754                 }
 755                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 756 
 757                 /*
 758                  * Go off to ip_input_options which returns the next hop
 759                  * destination address, which may have been affected
 760                  * by source routing.
 761                  */
 762                 IP_STAT(ipst, ip_opt);
 763 
 764                 nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
 765                 if (error != 0) {
 766                         /*
 767                          * An ICMP error has been sent and the packet has
 768                          * been dropped.
 769                          */
 770                         return;
 771                 }
 772         }
 773 
 774         if (ill->ill_flags & ILLF_ROUTER)
 775                 irr_flags = IRR_ALLOCATE;
 776         else
 777                 irr_flags = IRR_NONE;
 778 
 779         /* Can not use route cache with TX since the labels can differ */
 780         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
 781                 if (CLASSD(nexthop)) {
 782                         ire = ire_multicast(ill);
 783                 } else {
 784                         /* Match destination and label */
 785                         ire = ire_route_recursive_v4(nexthop, 0, NULL,
 786                             ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
 787                             irr_flags, ira->ira_xmit_hint, ipst, NULL, NULL,
 788                             NULL);
 789                 }
 790                 /* Update the route cache so we do the ire_refrele */
 791                 ASSERT(ire != NULL);
 792                 if (rtc->rtc_ire != NULL)
 793                         ire_refrele(rtc->rtc_ire);
 794                 rtc->rtc_ire = ire;
 795                 rtc->rtc_ipaddr = nexthop;
 796         } else if (nexthop == rtc->rtc_ipaddr && rtc->rtc_ire != NULL) {
 797                 /* Use the route cache */
 798                 ire = rtc->rtc_ire;
 799         } else {
 800                 /* Update the route cache */
 801                 if (CLASSD(nexthop)) {
 802                         ire = ire_multicast(ill);
 803                 } else {
 804                         /* Just match the destination */
 805                         ire = ire_route_recursive_dstonly_v4(nexthop, irr_flags,
 806                             ira->ira_xmit_hint, ipst);
 807                 }
 808                 ASSERT(ire != NULL);
 809                 if (rtc->rtc_ire != NULL)
 810                         ire_refrele(rtc->rtc_ire);
 811                 rtc->rtc_ire = ire;
 812                 rtc->rtc_ipaddr = nexthop;
 813         }
 814 
 815         ire->ire_ib_pkt_count++;
 816 
 817         /*
 818          * Based on ire_type and ire_flags call one of:
 819          *      ire_recv_local_v4 - for IRE_LOCAL
 820          *      ire_recv_loopback_v4 - for IRE_LOOPBACK
 821          *      ire_recv_multirt_v4 - if RTF_MULTIRT
 822          *      ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
 823          *      ire_recv_multicast_v4 - for IRE_MULTICAST
 824          *      ire_recv_broadcast_v4 - for IRE_BROADCAST
 825          *      ire_recv_noaccept_v4 - for ire_noaccept ones
 826          *      ire_recv_forward_v4 - for the rest.
 827          */
 828         (*ire->ire_recvfn)(ire, mp, ipha, ira);
 829 }
 830 #undef rptr
 831 
 832 /*
 833  * ire_recvfn for IREs that need forwarding
 834  */
 835 void
 836 ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
 837 {
 838         ipha_t          *ipha = (ipha_t *)iph_arg;
 839         ill_t           *ill = ira->ira_ill;
 840         ip_stack_t      *ipst = ill->ill_ipst;
 841         ill_t           *dst_ill;
 842         nce_t           *nce;
 843         ipaddr_t        src = ipha->ipha_src;
 844         uint32_t        added_tx_len;
 845         uint32_t        mtu, iremtu;
 846 
 847         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
 848                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 849                 ip_drop_input("l2 multicast not forwarded", mp, ill);
 850                 freemsg(mp);
 851                 return;
 852         }
 853 
 854         if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
 855                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 856                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 857                 freemsg(mp);
 858                 return;
 859         }
 860 
 861         /*
 862          * Either ire_nce_capable or ire_dep_parent would be set for the IRE
 863          * when it is found by ire_route_recursive, but that some other thread
 864          * could have changed the routes with the effect of clearing
 865          * ire_dep_parent. In that case we'd end up dropping the packet, or
 866          * finding a new nce below.
 867          * Get, allocate, or update the nce.
 868          * We get a refhold on ire_nce_cache as a result of this to avoid races
 869          * where ire_nce_cache is deleted.
 870          *
 871          * This ensures that we don't forward if the interface is down since
 872          * ipif_down removes all the nces.
 873          */
 874         mutex_enter(&ire->ire_lock);
 875         nce = ire->ire_nce_cache;
 876         if (nce == NULL) {
 877                 /* Not yet set up - try to set one up */
 878                 mutex_exit(&ire->ire_lock);
 879                 (void) ire_revalidate_nce(ire);
 880                 mutex_enter(&ire->ire_lock);
 881                 nce = ire->ire_nce_cache;
 882                 if (nce == NULL) {
 883                         mutex_exit(&ire->ire_lock);
 884                         /* The ire_dep_parent chain went bad, or no memory */
 885                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 886                         ip_drop_input("No ire_dep_parent", mp, ill);
 887                         freemsg(mp);
 888                         return;
 889                 }
 890         }
 891         nce_refhold(nce);
 892         mutex_exit(&ire->ire_lock);
 893 
 894         if (nce->nce_is_condemned) {
 895                 nce_t *nce1;
 896 
 897                 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
 898                 nce_refrele(nce);
 899                 if (nce1 == NULL) {
 900                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 901                         ip_drop_input("No nce", mp, ill);
 902                         freemsg(mp);
 903                         return;
 904                 }
 905                 nce = nce1;
 906         }
 907         dst_ill = nce->nce_ill;
 908 
 909         /*
 910          * Unless we are forwarding, drop the packet.
 911          * We have to let source routed packets through if they go out
 912          * the same interface i.e., they are 'ping -l' packets.
 913          */
 914         if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
 915             !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
 916                 if (ip_source_routed(ipha, ipst)) {
 917                         ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
 918                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
 919                         nce_refrele(nce);
 920                         return;
 921                 }
 922                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 923                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 924                 freemsg(mp);
 925                 nce_refrele(nce);
 926                 return;
 927         }
 928 
 929         if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
 930                 ipaddr_t        dst = ipha->ipha_dst;
 931 
 932                 ire->ire_ib_pkt_count--;
 933                 /*
 934                  * Should only use IREs that are visible from the
 935                  * global zone for forwarding.
 936                  * Take a source route into account the same way as ip_input
 937                  * did.
 938                  */
 939                 if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
 940                         int             error = 0;
 941 
 942                         dst = ip_input_options(ipha, dst, mp, ira, &error);
 943                         ASSERT(error == 0);     /* ip_input checked */
 944                 }
 945                 ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
 946                     ira->ira_tsl, MATCH_IRE_SECATTR,
 947                     (ill->ill_flags & ILLF_ROUTER) ? IRR_ALLOCATE : IRR_NONE,
 948                     ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
 949                 ire->ire_ib_pkt_count++;
 950                 (*ire->ire_recvfn)(ire, mp, ipha, ira);
 951                 ire_refrele(ire);
 952                 nce_refrele(nce);
 953                 return;
 954         }
 955 
 956         /*
 957          * ipIfStatsHCInForwDatagrams should only be increment if there
 958          * will be an attempt to forward the packet, which is why we
 959          * increment after the above condition has been checked.
 960          */
 961         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
 962 
 963         /* Initiate Read side IPPF processing */
 964         if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
 965                 /* ip_process translates an IS_UNDER_IPMP */
 966                 mp = ip_process(IPP_FWD_IN, mp, ill, ill);
 967                 if (mp == NULL) {
 968                         /* ip_drop_packet and MIB done */
 969                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
 970                             "during IPPF processing\n"));
 971                         nce_refrele(nce);
 972                         return;
 973                 }
 974         }
 975 
 976         DTRACE_PROBE4(ip4__forwarding__start,
 977             ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);
 978 
 979         if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
 980                 int error;
 981 
 982                 FW_HOOKS(ipst->ips_ip4_forwarding_event,
 983                     ipst->ips_ipv4firewall_forwarding,
 984                     ill, dst_ill, ipha, mp, mp, 0, ipst, error);
 985 
 986                 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
 987 
 988                 if (mp == NULL) {
 989                         nce_refrele(nce);
 990                         return;
 991                 }
 992                 /*
 993                  * Even if the destination was changed by the filter we use the
 994                  * forwarding decision that was made based on the address
 995                  * in ip_input.
 996                  */
 997 
 998                 /* Might have changed */
 999                 ipha = (ipha_t *)mp->b_rptr;
1000                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1001         }
1002 
1003         /* Packet is being forwarded. Turning off hwcksum flag. */
1004         DB_CKSUMFLAGS(mp) = 0;
1005 
1006         /*
1007          * Martian Address Filtering [RFC 1812, Section 5.3.7]
1008          * The loopback address check for both src and dst has already
1009          * been checked in ip_input
1010          * In the future one can envision adding RPF checks using number 3.
1011          * If we already checked the same source address we can skip this.
1012          */
1013         if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
1014             src != ira->ira_verified_src) {
1015                 switch (ipst->ips_src_check) {
1016                 case 0:
1017                         break;
1018                 case 2:
1019                         if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
1020                                 BUMP_MIB(ill->ill_ip_mib,
1021                                     ipIfStatsForwProhibits);
1022                                 BUMP_MIB(ill->ill_ip_mib,
1023                                     ipIfStatsInAddrErrors);
1024                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1025                                 freemsg(mp);
1026                                 nce_refrele(nce);
1027                                 return;
1028                         }
1029                         /* FALLTHRU */
1030 
1031                 case 1:
1032                         if (CLASSD(src)) {
1033                                 BUMP_MIB(ill->ill_ip_mib,
1034                                     ipIfStatsForwProhibits);
1035                                 BUMP_MIB(ill->ill_ip_mib,
1036                                     ipIfStatsInAddrErrors);
1037                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1038                                 freemsg(mp);
1039                                 nce_refrele(nce);
1040                                 return;
1041                         }
1042                         break;
1043                 }
1044                 /* Remember for next packet */
1045                 ira->ira_flags |= IRAF_VERIFIED_SRC;
1046                 ira->ira_verified_src = src;
1047         }
1048 
1049         /*
1050          * Check if packet is going out the same link on which it arrived.
1051          * Means we might need to send a redirect.
1052          */
1053         if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
1054                 ip_send_potential_redirect_v4(mp, ipha, ire, ira);
1055         }
1056 
1057         added_tx_len = 0;
1058         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
1059                 mblk_t          *mp1;
1060                 uint32_t        old_pkt_len = ira->ira_pktlen;
1061 
1062                 /* Verify IP header checksum before adding/removing options */
1063                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
1064                     ip_csum_hdr(ipha)) {
1065                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1066                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1067                         freemsg(mp);
1068                         nce_refrele(nce);
1069                         return;
1070                 }
1071                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
1072 
1073                 /*
1074                  * Check if it can be forwarded and add/remove
1075                  * CIPSO options as needed.
1076                  */
1077                 if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
1078                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1079                         ip_drop_input("tsol_ip_forward", mp, ill);
1080                         freemsg(mp);
1081                         nce_refrele(nce);
1082                         return;
1083                 }
1084                 /*
1085                  * Size may have changed. Remember amount added in case
1086                  * IP needs to send an ICMP too big.
1087                  */
1088                 mp = mp1;
1089                 ipha = (ipha_t *)mp->b_rptr;
1090                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1091                 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
1092                 if (ira->ira_pktlen > old_pkt_len)
1093                         added_tx_len = ira->ira_pktlen - old_pkt_len;
1094 
1095                 /* Options can have been added or removed */
1096                 if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
1097                         ira->ira_flags |= IRAF_IPV4_OPTIONS;
1098                 else
1099                         ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
1100         }
1101 
1102         mtu = dst_ill->ill_mtu;
1103         if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
1104                 mtu = iremtu;
1105         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
1106         nce_refrele(nce);
1107 }
1108 
1109 /*
1110  * Used for sending out unicast and multicast packets that are
1111  * forwarded.
1112  */
1113 void
1114 ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1115     ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
1116 {
1117         ill_t           *dst_ill = nce->nce_ill;
1118         uint32_t        pkt_len;
1119         uint32_t        sum;
1120         iaflags_t       iraflags = ira->ira_flags;
1121         ip_stack_t      *ipst = ill->ill_ipst;
1122         iaflags_t       ixaflags;
1123 
1124         if (ipha->ipha_ttl <= 1) {
1125                 /* Perhaps the checksum was bad */
1126                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1127                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1128                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1129                         freemsg(mp);
1130                         return;
1131                 }
1132                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1133                 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
1134                 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
1135                 return;
1136         }
1137         ipha->ipha_ttl--;
1138         /* Adjust the checksum to reflect the ttl decrement. */
1139         sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
1140         ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
1141 
1142         /* Check if there are options to update */
1143         if (iraflags & IRAF_IPV4_OPTIONS) {
1144                 ASSERT(ipha->ipha_version_and_hdr_length !=
1145                     IP_SIMPLE_HDR_VERSION);
1146                 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
1147 
1148                 if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
1149                         /* ipIfStatsForwProhibits and ip_drop_input done */
1150                         return;
1151                 }
1152 
1153                 ipha->ipha_hdr_checksum = 0;
1154                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1155         }
1156 
1157         /* Initiate Write side IPPF processing before any fragmentation */
1158         if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
1159                 /* ip_process translates an IS_UNDER_IPMP */
1160                 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
1161                 if (mp == NULL) {
1162                         /* ip_drop_packet and MIB done */
1163                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \
1164                             " during IPPF processing\n"));
1165                         return;
1166                 }
1167         }
1168 
1169         pkt_len = ira->ira_pktlen;
1170 
1171         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
1172 
1173         ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
1174 
1175         if (pkt_len > mtu) {
1176                 /*
1177                  * It needs fragging on its way out.  If we haven't
1178                  * verified the header checksum yet we do it now since
1179                  * are going to put a surely good checksum in the
1180                  * outgoing header, we have to make sure that it
1181                  * was good coming in.
1182                  */
1183                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1184                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1185                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1186                         freemsg(mp);
1187                         return;
1188                 }
1189                 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
1190                         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
1191                         ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
1192                         if (iraflags & IRAF_SYSTEM_LABELED) {
1193                                 /*
1194                                  * Remove any CIPSO option added by
1195                                  * tsol_ip_forward, and make sure we report
1196                                  * a path MTU so that there
1197                                  * is room to add such a CIPSO option for future
1198                                  * packets.
1199                                  */
1200                                 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
1201                                     AF_INET);
1202                         }
1203 
1204                         icmp_frag_needed(mp, mtu, ira);
1205                         return;
1206                 }
1207 
1208                 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
1209                     ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
1210                 return;
1211         }
1212 
1213         ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
1214         if (iraflags & IRAF_LOOPBACK_COPY) {
1215                 /*
1216                  * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
1217                  * is don't care
1218                  */
1219                 (void) ip_postfrag_loopcheck(mp, nce,
1220                     ixaflags | IXAF_LOOPBACK_COPY,
1221                     pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
1222         } else {
1223                 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
1224                     GLOBAL_ZONEID, 0, NULL);
1225         }
1226 }
1227 
1228 /*
1229  * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
1230  * which is what ire_route_recursive returns when there is no matching ire.
1231  * Send ICMP unreachable unless blackhole.
1232  */
1233 void
1234 ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1235 {
1236         ipha_t          *ipha = (ipha_t *)iph_arg;
1237         ill_t           *ill = ira->ira_ill;
1238         ip_stack_t      *ipst = ill->ill_ipst;
1239 
1240         /* Would we have forwarded this packet if we had a route? */
1241         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
1242                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1243                 ip_drop_input("l2 multicast not forwarded", mp, ill);
1244                 freemsg(mp);
1245                 return;
1246         }
1247 
1248         if (!(ill->ill_flags & ILLF_ROUTER)) {
1249                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1250                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
1251                 freemsg(mp);
1252                 return;
1253         }
1254         /*
1255          * If we had a route this could have been forwarded. Count as such.
1256          *
1257          * ipIfStatsHCInForwDatagrams should only be increment if there
1258          * will be an attempt to forward the packet, which is why we
1259          * increment after the above condition has been checked.
1260          */
1261         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
1262 
1263         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1264 
1265         ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
1266             ipst);
1267 
1268         if (ire->ire_flags & RTF_BLACKHOLE) {
1269                 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
1270                 freemsg(mp);
1271         } else {
1272                 ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
1273 
1274                 if (ip_source_routed(ipha, ipst)) {
1275                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
1276                 } else {
1277                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
1278                 }
1279         }
1280 }
1281 
1282 /*
1283  * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
1284  * VRRP when in noaccept mode.
1285  * We silently drop the packet. ARP handles packets even if noaccept is set.
1286  */
1287 /* ARGSUSED */
1288 void
1289 ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1290     ip_recv_attr_t *ira)
1291 {
1292         ill_t           *ill = ira->ira_ill;
1293 
1294         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1295         ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
1296         freemsg(mp);
1297 }
1298 
1299 /*
1300  * ire_recvfn for IRE_BROADCAST.
1301  */
1302 void
1303 ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1304     ip_recv_attr_t *ira)
1305 {
1306         ipha_t          *ipha = (ipha_t *)iph_arg;
1307         ill_t           *ill = ira->ira_ill;
1308         ill_t           *dst_ill = ire->ire_ill;
1309         ip_stack_t      *ipst = ill->ill_ipst;
1310         ire_t           *alt_ire;
1311         nce_t           *nce;
1312         ipaddr_t        ipha_dst;
1313 
1314         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
1315 
1316         /* Tag for higher-level protocols */
1317         ira->ira_flags |= IRAF_BROADCAST;
1318 
1319         /*
1320          * Whether local or directed broadcast forwarding: don't allow
1321          * for TCP.
1322          */
1323         if (ipha->ipha_protocol == IPPROTO_TCP) {
1324                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1325                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1326                 freemsg(mp);
1327                 return;
1328         }
1329 
1330         /*
1331          * So that we don't end up with dups, only one ill an IPMP group is
1332          * nominated to receive broadcast traffic.
1333          * If we have no cast_ill we are liberal and accept everything.
1334          */
1335         if (IS_UNDER_IPMP(ill)) {
1336                 /* For an under ill_grp can change under lock */
1337                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1338                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1339                     ill->ill_grp->ig_cast_ill != NULL) {
1340                         rw_exit(&ipst->ips_ill_g_lock);
1341                         /* No MIB since this is normal operation */
1342                         ip_drop_input("not nom_cast", mp, ill);
1343                         freemsg(mp);
1344                         return;
1345                 }
1346                 rw_exit(&ipst->ips_ill_g_lock);
1347 
1348                 ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1349         }
1350 
1351         /*
1352          * After reassembly and IPsec we will need to duplicate the
1353          * broadcast packet for all matching zones on the ill.
1354          */
1355         ira->ira_zoneid = ALL_ZONES;
1356 
1357         /*
1358          * Check for directed broadcast i.e. ire->ire_ill is different than
1359          * the incoming ill.
1360          * The same broadcast address can be assigned to multiple interfaces
1361          * so have to check explicitly for that case by looking up the alt_ire
1362          */
1363         if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
1364                 /* Reassemble on the ill on which the packet arrived */
1365                 ip_input_local_v4(ire, mp, ipha, ira);
1366                 /* Restore */
1367                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1368                 return;
1369         }
1370 
1371         /* Is there an IRE_BROADCAST on the incoming ill? */
1372         ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
1373             ipha->ipha_dst);
1374         alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
1375             ALL_ZONES, ira->ira_tsl,
1376             MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
1377         if (alt_ire != NULL) {
1378                 /* Not a directed broadcast */
1379                 /*
1380                  * In the special case of multirouted broadcast
1381                  * packets, we unconditionally need to "gateway"
1382                  * them to the appropriate interface here so that reassembly
1383                  * works. We know that the IRE_BROADCAST on cgtp0 doesn't
1384                  * have RTF_MULTIRT set so we look for such an IRE in the
1385                  * bucket.
1386                  */
1387                 if (alt_ire->ire_flags & RTF_MULTIRT) {
1388                         irb_t           *irb;
1389                         ire_t           *ire1;
1390 
1391                         irb = ire->ire_bucket;
1392                         irb_refhold(irb);
1393                         for (ire1 = irb->irb_ire; ire1 != NULL;
1394                             ire1 = ire1->ire_next) {
1395                                 if (IRE_IS_CONDEMNED(ire1))
1396                                         continue;
1397                                 if (!(ire1->ire_type & IRE_BROADCAST) ||
1398                                     (ire1->ire_flags & RTF_MULTIRT))
1399                                         continue;
1400                                 ill = ire1->ire_ill;
1401                                 ill_refhold(ill);
1402                                 break;
1403                         }
1404                         irb_refrele(irb);
1405                         if (ire1 != NULL) {
1406                                 ill_t *orig_ill = ira->ira_ill;
1407 
1408                                 ire_refrele(alt_ire);
1409                                 /* Reassemble on the new ill */
1410                                 ira->ira_ill = ill;
1411                                 ip_input_local_v4(ire, mp, ipha, ira);
1412                                 ill_refrele(ill);
1413                                 /* Restore */
1414                                 ira->ira_ill = orig_ill;
1415                                 ira->ira_ruifindex =
1416                                     orig_ill->ill_phyint->phyint_ifindex;
1417                                 return;
1418                         }
1419                 }
1420                 ire_refrele(alt_ire);
1421                 /* Reassemble on the ill on which the packet arrived */
1422                 ip_input_local_v4(ire, mp, ipha, ira);
1423                 goto done;
1424         }
1425 
1426         /*
1427          * This is a directed broadcast
1428          *
1429          * If directed broadcast is allowed, then forward the packet out
1430          * the destination interface with IXAF_LOOPBACK_COPY set. That will
1431          * result in ip_input() receiving a copy of the packet on the
1432          * appropriate ill. (We could optimize this to avoid the extra trip
1433          * via ip_input(), but since directed broadcasts are normally disabled
1434          * it doesn't make sense to optimize it.)
1435          */
1436         if (!ipst->ips_ip_g_forward_directed_bcast ||
1437             (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
1438                 ip_drop_input("directed broadcast not allowed", mp, ill);
1439                 freemsg(mp);
1440                 goto done;
1441         }
1442         if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1443                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1444                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1445                 freemsg(mp);
1446                 goto done;
1447         }
1448 
1449         /*
1450          * Clear the indication that this may have hardware
1451          * checksum as we are not using it for forwarding.
1452          */
1453         DB_CKSUMFLAGS(mp) = 0;
1454 
        /*
         * Set the ttl to ips_ip_broadcast_ttl + 1; the forward engine will
         * decrement it by one.
         */
1458         ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
1459         ipha->ipha_hdr_checksum = 0;
1460         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1461 
        /*
         * We use ip_forward_xmit_v4 to do any fragmentation
         * and the loopback copy on the outbound interface.
         *
         * Arrange for IXAF_LOOPBACK_COPY to be set on the transmit side.
         */
1468         ira->ira_flags |= IRAF_LOOPBACK_COPY;
1469 
1470         nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
1471         if (nce == NULL) {
1472                 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
1473                 ip_drop_output("No nce", mp, dst_ill);
1474                 freemsg(mp);
1475                 goto done;
1476         }
1477 
1478         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0);
1479         nce_refrele(nce);
1480 done:
1481         /* Restore */
1482         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1483 }
1484 
1485 /*
1486  * ire_recvfn for IRE_MULTICAST.
1487  */
1488 void
1489 ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1490     ip_recv_attr_t *ira)
1491 {
1492         ipha_t          *ipha = (ipha_t *)iph_arg;
1493         ill_t           *ill = ira->ira_ill;
1494         ip_stack_t      *ipst = ill->ill_ipst;
1495 
1496         ASSERT(ire->ire_ill == ira->ira_ill);
1497 
1498         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
1499         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
1500 
1501         /* RSVP hook */
1502         if (ira->ira_flags & IRAF_RSVP)
1503                 goto forus;
1504 
1505         /* Tag for higher-level protocols */
1506         ira->ira_flags |= IRAF_MULTICAST;
1507 
1508         /*
1509          * So that we don't end up with dups, only one ill an IPMP group is
1510          * nominated to receive multicast traffic.
1511          * If we have no cast_ill we are liberal and accept everything.
1512          */
1513         if (IS_UNDER_IPMP(ill)) {
1514                 ip_stack_t      *ipst = ill->ill_ipst;
1515 
1516                 /* For an under ill_grp can change under lock */
1517                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1518                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1519                     ill->ill_grp->ig_cast_ill != NULL) {
1520                         rw_exit(&ipst->ips_ill_g_lock);
1521                         ip_drop_input("not on cast ill", mp, ill);
1522                         freemsg(mp);
1523                         return;
1524                 }
1525                 rw_exit(&ipst->ips_ill_g_lock);
1526                 /*
1527                  * We switch to the upper ill so that mrouter and hasmembers
1528                  * can operate on upper here and in ip_input_multicast.
1529                  */
1530                 ill = ipmp_ill_hold_ipmp_ill(ill);
1531                 if (ill != NULL) {
1532                         ASSERT(ill != ira->ira_ill);
1533                         ASSERT(ire->ire_ill == ira->ira_ill);
1534                         ira->ira_ill = ill;
1535                         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1536                 } else {
1537                         ill = ira->ira_ill;
1538                 }
1539         }
1540 
1541         /*
1542          * Check if we are a multicast router - send ip_mforward a copy of
1543          * the packet.
1544          * Due to mroute_decap tunnels we consider forwarding packets even if
1545          * mrouted has not joined the allmulti group on this interface.
1546          */
1547         if (ipst->ips_ip_g_mrouter) {
1548                 int retval;
1549 
1550                 /*
1551                  * Clear the indication that this may have hardware
1552                  * checksum as we are not using it for forwarding.
1553                  */
1554                 DB_CKSUMFLAGS(mp) = 0;
1555 
1556                 /*
1557                  * ip_mforward helps us make these distinctions: If received
1558                  * on tunnel and not IGMP, then drop.
1559                  * If IGMP packet, then don't check membership
1560                  * If received on a phyint and IGMP or PIM, then
1561                  * don't check membership
1562                  */
1563                 retval = ip_mforward(mp, ira);
1564                 /* ip_mforward updates mib variables if needed */
1565 
1566                 switch (retval) {
1567                 case 0:
1568                         /*
1569                          * pkt is okay and arrived on phyint.
1570                          *
1571                          * If we are running as a multicast router
1572                          * we need to see all IGMP and/or PIM packets.
1573                          */
1574                         if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
1575                             (ipha->ipha_protocol == IPPROTO_PIM)) {
1576                                 goto forus;
1577                         }
1578                         break;
1579                 case -1:
1580                         /* pkt is mal-formed, toss it */
1581                         freemsg(mp);
1582                         goto done;
1583                 case 1:
1584                         /*
1585                          * pkt is okay and arrived on a tunnel
1586                          *
1587                          * If we are running a multicast router
1588                          * we need to see all igmp packets.
1589                          */
1590                         if (ipha->ipha_protocol == IPPROTO_IGMP) {
1591                                 goto forus;
1592                         }
1593                         ip_drop_input("Multicast on tunnel ignored", mp, ill);
1594                         freemsg(mp);
1595                         goto done;
1596                 }
1597         }
1598 
1599         /*
1600          * Check if we have members on this ill. This is not necessary for
1601          * correctness because even if the NIC/GLD had a leaky filter, we
1602          * filter before passing to each conn_t.
1603          */
1604         if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
1605                 /*
1606                  * Nobody interested
1607                  *
1608                  * This might just be caused by the fact that
1609                  * multiple IP Multicast addresses map to the same
1610                  * link layer multicast - no need to increment counter!
1611                  */
1612                 ip_drop_input("Multicast with no members", mp, ill);
1613                 freemsg(mp);
1614                 goto done;
1615         }
1616 forus:
1617         ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
1618             ntohl(ipha->ipha_dst)));
1619 
1620         /*
1621          * After reassembly and IPsec we will need to duplicate the
1622          * multicast packet for all matching zones on the ill.
1623          */
1624         ira->ira_zoneid = ALL_ZONES;
1625 
1626         /* Reassemble on the ill on which the packet arrived */
1627         ip_input_local_v4(ire, mp, ipha, ira);
1628 done:
1629         if (ill != ire->ire_ill) {
1630                 ill_refrele(ill);
1631                 ira->ira_ill = ire->ire_ill;
1632                 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
1633         }
1634 }
1635 
1636 /*
1637  * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
1638  * Drop packets since we don't forward out multirt routes.
1639  */
1640 /* ARGSUSED */
1641 void
1642 ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1643 {
1644         ill_t           *ill = ira->ira_ill;
1645 
1646         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1647         ip_drop_input("Not forwarding out MULTIRT", mp, ill);
1648         freemsg(mp);
1649 }
1650 
1651 /*
1652  * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
1653  * has rewritten the packet to have a loopback destination address (We
1654  * filter out packet with a loopback destination from arriving over the wire).
1655  * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
1656  */
1657 void
1658 ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1659 {
1660         ipha_t          *ipha = (ipha_t *)iph_arg;
1661         ill_t           *ill = ira->ira_ill;
1662         ill_t           *ire_ill = ire->ire_ill;
1663 
1664         ira->ira_zoneid = GLOBAL_ZONEID;
1665 
1666         /* Switch to the lo0 ill for further processing  */
1667         if (ire_ill != ill) {
1668                 /*
1669                  * Update ira_ill to be the ILL on which the IP address
1670                  * is hosted.
1671                  * No need to hold the ill since we have a hold on the ire
1672                  */
1673                 ASSERT(ira->ira_ill == ira->ira_rill);
1674                 ira->ira_ill = ire_ill;
1675 
1676                 ip_input_local_v4(ire, mp, ipha, ira);
1677 
1678                 /* Restore */
1679                 ASSERT(ira->ira_ill == ire_ill);
1680                 ira->ira_ill = ill;
1681                 return;
1682 
1683         }
1684         ip_input_local_v4(ire, mp, ipha, ira);
1685 }
1686 
1687 /*
1688  * ire_recvfn for IRE_LOCAL.
1689  */
1690 void
1691 ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1692 {
1693         ipha_t          *ipha = (ipha_t *)iph_arg;
1694         ill_t           *ill = ira->ira_ill;
1695         ill_t           *ire_ill = ire->ire_ill;
1696 
1697         /* Make a note for DAD that this address is in use */
1698         ire->ire_last_used_time = LBOLT_FASTPATH;
1699 
1700         /* Only target the IRE_LOCAL with the right zoneid. */
1701         ira->ira_zoneid = ire->ire_zoneid;
1702 
1703         /*
1704          * If the packet arrived on the wrong ill, we check that
1705          * this is ok.
1706          * If it is, then we ensure that we do the reassembly on
1707          * the ill on which the address is hosted. We keep ira_rill as
1708          * the one on which the packet arrived, so that IP_PKTINFO and
1709          * friends can report this.
1710          */
1711         if (ire_ill != ill) {
1712                 ire_t *new_ire;
1713 
1714                 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
1715                 if (new_ire == NULL) {
1716                         /* Drop packet */
1717                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1718                         ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
1719                         freemsg(mp);
1720                         return;
1721                 }
1722                 /*
1723                  * Update ira_ill to be the ILL on which the IP address
1724                  * is hosted. No need to hold the ill since we have a
1725                  * hold on the ire. Note that we do the switch even if
1726                  * new_ire == ire (for IPMP, ire would be the one corresponding
1727                  * to the IPMP ill).
1728                  */
1729                 ASSERT(ira->ira_ill == ira->ira_rill);
1730                 ira->ira_ill = new_ire->ire_ill;
1731 
1732                 /* ira_ruifindex tracks the upper for ira_rill */
1733                 if (IS_UNDER_IPMP(ill))
1734                         ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1735 
1736                 ip_input_local_v4(new_ire, mp, ipha, ira);
1737 
1738                 /* Restore */
1739                 ASSERT(ira->ira_ill == new_ire->ire_ill);
1740                 ira->ira_ill = ill;
1741                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1742 
1743                 if (new_ire != ire)
1744                         ire_refrele(new_ire);
1745                 return;
1746         }
1747 
1748         ip_input_local_v4(ire, mp, ipha, ira);
1749 }
1750 
1751 /*
1752  * Common function for packets arriving for the host. Handles
1753  * checksum verification, reassembly checks, etc.
1754  */
1755 static void
1756 ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1757 {
1758         ill_t           *ill = ira->ira_ill;
1759         iaflags_t       iraflags = ira->ira_flags;
1760 
1761         /*
1762          * Verify IP header checksum. If the packet was AH or ESP then
1763          * this flag has already been cleared. Likewise if the packet
1764          * had a hardware checksum.
1765          */
1766         if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1767                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1768                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1769                 freemsg(mp);
1770                 return;
1771         }
1772 
1773         if (iraflags & IRAF_IPV4_OPTIONS) {
1774                 if (!ip_input_local_options(mp, ipha, ira)) {
1775                         /* Error has been sent and mp consumed */
1776                         return;
1777                 }
1778                 /*
1779                  * Some old hardware does partial checksum by including the
1780                  * whole IP header, so the partial checksum value might have
1781                  * become invalid if any option in the packet have been
1782                  * updated. Always clear partial checksum flag here.
1783                  */
1784                 DB_CKSUMFLAGS(mp) &= ~HCK_PARTIALCKSUM;
1785         }
1786 
1787         /*
1788          * Is packet part of fragmented IP packet?
1789          * We compare against defined values in network byte order
1790          */
1791         if (ipha->ipha_fragment_offset_and_flags &
1792             (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
1793                 /*
1794                  * Make sure we have ira_l2src before we loose the original
1795                  * mblk
1796                  */
1797                 if (!(ira->ira_flags & IRAF_L2SRC_SET))
1798                         ip_setl2src(mp, ira, ira->ira_rill);
1799 
1800                 mp = ip_input_fragment(mp, ipha, ira);
1801                 if (mp == NULL)
1802                         return;
1803                 /* Completed reassembly */
1804                 ipha = (ipha_t *)mp->b_rptr;
1805         }
1806 
1807         /*
1808          * For broadcast and multicast we need some extra work before
1809          * we call ip_fanout_v4(), since in the case of shared-IP zones
1810          * we need to pretend that a packet arrived for each zoneid.
1811          */
1812         if (iraflags & IRAF_MULTIBROADCAST) {
1813                 if (iraflags & IRAF_BROADCAST)
1814                         ip_input_broadcast_v4(ire, mp, ipha, ira);
1815                 else
1816                         ip_input_multicast_v4(ire, mp, ipha, ira);
1817                 return;
1818         }
1819         ip_fanout_v4(mp, ipha, ira);
1820 }
1821 
1822 
1823 /*
1824  * Handle multiple zones which match the same broadcast address
1825  * and ill by delivering a packet to each of them.
1826  * Walk the bucket and look for different ire_zoneid but otherwise
1827  * the same IRE (same ill/addr/mask/type).
1828  * Note that ire_add() tracks IREs that are identical in all
1829  * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
1830  * increasing ire_identical_cnt. Thus we don't need to be concerned
1831  * about those.
1832  */
1833 static void
1834 ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1835 {
1836         ill_t           *ill = ira->ira_ill;
1837         ip_stack_t      *ipst = ill->ill_ipst;
1838         netstack_t      *ns = ipst->ips_netstack;
1839         irb_t           *irb;
1840         ire_t           *ire1;
1841         mblk_t          *mp1;
1842         ipha_t          *ipha1;
1843         uint_t          ira_pktlen = ira->ira_pktlen;
1844         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1845 
1846         irb = ire->ire_bucket;
1847 
1848         /*
1849          * If we don't have more than one shared-IP zone, or if
1850          * there can't be more than one IRE_BROADCAST for this
1851          * IP address, then just set the zoneid and proceed.
1852          */
1853         if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
1854                 ira->ira_zoneid = ire->ire_zoneid;
1855 
1856                 ip_fanout_v4(mp, ipha, ira);
1857                 return;
1858         }
1859         irb_refhold(irb);
1860         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1861                 /* We do the main IRE after the end of the loop */
1862                 if (ire1 == ire)
1863                         continue;
1864 
1865                 /*
1866                  * Only IREs for the same IP address should be in the same
1867                  * bucket.
1868                  * But could have IRE_HOSTs in the case of CGTP.
1869                  */
1870                 ASSERT(ire1->ire_addr == ire->ire_addr);
1871                 if (!(ire1->ire_type & IRE_BROADCAST))
1872                         continue;
1873 
1874                 if (IRE_IS_CONDEMNED(ire1))
1875                         continue;
1876 
1877                 mp1 = copymsg(mp);
1878                 if (mp1 == NULL) {
1879                         /* Failed to deliver to one zone */
1880                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1881                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1882                         continue;
1883                 }
1884                 ira->ira_zoneid = ire1->ire_zoneid;
1885                 ipha1 = (ipha_t *)mp1->b_rptr;
1886                 ip_fanout_v4(mp1, ipha1, ira);
1887                 /*
1888                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1889                  * so we restore them for a potential next iteration
1890                  */
1891                 ira->ira_pktlen = ira_pktlen;
1892                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1893         }
1894         irb_refrele(irb);
1895         /* Do the main ire */
1896         ira->ira_zoneid = ire->ire_zoneid;
1897         ip_fanout_v4(mp, ipha, ira);
1898 }
1899 
1900 /*
1901  * Handle multiple zones which want to receive the same multicast packets
1902  * on this ill by delivering a packet to each of them.
1903  *
1904  * Note that for packets delivered to transports we could instead do this
1905  * as part of the fanout code, but since we need to handle icmp_inbound
1906  * it is simpler to have multicast work the same as broadcast.
1907  *
1908  * The ip_fanout matching for multicast matches based on ilm independent of
1909  * zoneid since the zoneid restriction is applied when joining a multicast
1910  * group.
1911  */
1912 /* ARGSUSED */
1913 static void
1914 ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1915 {
1916         ill_t           *ill = ira->ira_ill;
1917         iaflags_t       iraflags = ira->ira_flags;
1918         ip_stack_t      *ipst = ill->ill_ipst;
1919         netstack_t      *ns = ipst->ips_netstack;
1920         zoneid_t        zoneid;
1921         mblk_t          *mp1;
1922         ipha_t          *ipha1;
1923         uint_t          ira_pktlen = ira->ira_pktlen;
1924         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1925 
1926         /* ire_recv_multicast has switched to the upper ill for IPMP */
1927         ASSERT(!IS_UNDER_IPMP(ill));
1928 
1929         /*
1930          * If we don't have more than one shared-IP zone, or if
1931          * there are no members in anything but the global zone,
1932          * then just set the zoneid and proceed.
1933          */
1934         if (ns->netstack_numzones == 1 ||
1935             !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1936             GLOBAL_ZONEID)) {
1937                 ira->ira_zoneid = GLOBAL_ZONEID;
1938 
1939                 /* If sender didn't want this zone to receive it, drop */
1940                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1941                     ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1942                         ip_drop_input("Multicast but wrong zoneid", mp, ill);
1943                         freemsg(mp);
1944                         return;
1945                 }
1946                 ip_fanout_v4(mp, ipha, ira);
1947                 return;
1948         }
1949 
1950         /*
1951          * Here we loop over all zoneids that have members in the group
1952          * and deliver a packet to ip_fanout for each zoneid.
1953          *
1954          * First find any members in the lowest numeric zoneid by looking for
1955          * first zoneid larger than -1 (ALL_ZONES).
1956          * We terminate the loop when we receive -1 (ALL_ZONES).
1957          */
1958         zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
1959         for (; zoneid != ALL_ZONES;
1960             zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
1961                 /*
1962                  * Avoid an extra copymsg/freemsg by skipping global zone here
1963                  * and doing that at the end.
1964                  */
1965                 if (zoneid == GLOBAL_ZONEID)
1966                         continue;
1967 
1968                 ira->ira_zoneid = zoneid;
1969 
1970                 /* If sender didn't want this zone to receive it, skip */
1971                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1972                     ira->ira_no_loop_zoneid == ira->ira_zoneid)
1973                         continue;
1974 
1975                 mp1 = copymsg(mp);
1976                 if (mp1 == NULL) {
1977                         /* Failed to deliver to one zone */
1978                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1979                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1980                         continue;
1981                 }
1982                 ipha1 = (ipha_t *)mp1->b_rptr;
1983                 ip_fanout_v4(mp1, ipha1, ira);
1984                 /*
1985                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1986                  * so we restore them for a potential next iteration
1987                  */
1988                 ira->ira_pktlen = ira_pktlen;
1989                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1990         }
1991 
1992         /* Do the main ire */
1993         ira->ira_zoneid = GLOBAL_ZONEID;
1994         /* If sender didn't want this zone to receive it, drop */
1995         if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1996             ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1997                 ip_drop_input("Multicast but wrong zoneid", mp, ill);
1998                 freemsg(mp);
1999         } else {
2000                 ip_fanout_v4(mp, ipha, ira);
2001         }
2002 }
2003 
2004 
2005 /*
2006  * Determine the zoneid and IRAF_TX_* flags if trusted extensions
2007  * is in use. Updates ira_zoneid and ira_flags as a result.
2008  */
2009 static void
2010 ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
2011     uint_t ip_hdr_length, ip_recv_attr_t *ira)
2012 {
2013         uint16_t        *up;
2014         uint16_t        lport;
2015         zoneid_t        zoneid;
2016 
2017         ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
2018 
2019         /*
2020          * If the packet is unlabeled we might allow read-down
2021          * for MAC_EXEMPT. Below we clear this if it is a multi-level
2022          * port (MLP).
2023          * Note that ira_tsl can be NULL here.
2024          */
2025         if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
2026                 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
2027 
2028         if (ira->ira_zoneid != ALL_ZONES)
2029                 return;
2030 
2031         ira->ira_flags |= IRAF_TX_SHARED_ADDR;
2032 
2033         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2034         switch (protocol) {
2035         case IPPROTO_TCP:
2036         case IPPROTO_SCTP:
2037         case IPPROTO_UDP:
2038                 /* Caller ensures this */
2039                 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
2040 
2041                 /*
2042                  * Only these transports support MLP.
2043                  * We know their destination port numbers is in
2044                  * the same place in the header.
2045                  */
2046                 lport = up[1];
2047 
2048                 /*
2049                  * No need to handle exclusive-stack zones
2050                  * since ALL_ZONES only applies to the shared IP instance.
2051                  */
2052                 zoneid = tsol_mlp_findzone(protocol, lport);
2053                 /*
2054                  * If no shared MLP is found, tsol_mlp_findzone returns
2055                  * ALL_ZONES.  In that case, we assume it's SLP, and
2056                  * search for the zone based on the packet label.
2057                  *
2058                  * If there is such a zone, we prefer to find a
2059                  * connection in it.  Otherwise, we look for a
2060                  * MAC-exempt connection in any zone whose label
2061                  * dominates the default label on the packet.
2062                  */
2063                 if (zoneid == ALL_ZONES)
2064                         zoneid = tsol_attr_to_zoneid(ira);
2065                 else
2066                         ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
2067                 break;
2068         default:
2069                 /* Handle shared address for other protocols */
2070                 zoneid = tsol_attr_to_zoneid(ira);
2071                 break;
2072         }
2073         ira->ira_zoneid = zoneid;
2074 }
2075 
2076 /*
2077  * Increment checksum failure statistics
2078  */
2079 static void
2080 ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
2081 {
2082         ip_stack_t      *ipst = ill->ill_ipst;
2083 
2084         switch (protocol) {
2085         case IPPROTO_TCP:
2086                 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
2087 
2088                 if (hck_flags & HCK_FULLCKSUM)
2089                         IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
2090                 else if (hck_flags & HCK_PARTIALCKSUM)
2091                         IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
2092                 else
2093                         IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
2094                 break;
2095         case IPPROTO_UDP:
2096                 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
2097                 if (hck_flags & HCK_FULLCKSUM)
2098                         IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
2099                 else if (hck_flags & HCK_PARTIALCKSUM)
2100                         IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
2101                 else
2102                         IP_STAT(ipst, ip_udp_in_sw_cksum_err);
2103                 break;
2104         case IPPROTO_ICMP:
2105                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2106                 break;
2107         default:
2108                 ASSERT(0);
2109                 break;
2110         }
2111 }
2112 
2113 /* Calculate the IPv4 pseudo-header checksum */
2114 uint32_t
2115 ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
2116 {
2117         uint_t          ulp_len;
2118         uint32_t        cksum;
2119         uint8_t         protocol = ira->ira_protocol;
2120         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2121 
2122 #define iphs    ((uint16_t *)ipha)
2123 
2124         switch (protocol) {
2125         case IPPROTO_TCP:
2126                 ulp_len = ira->ira_pktlen - ip_hdr_length;
2127 
2128                 /* Protocol and length */
2129                 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
2130                 /* IP addresses */
2131                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2132                 break;
2133 
2134         case IPPROTO_UDP: {
2135                 udpha_t         *udpha;
2136 
2137                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2138 
2139                 /* Protocol and length */
2140                 cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
2141                 /* IP addresses */
2142                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2143                 break;
2144         }
2145 
2146         default:
2147                 cksum = 0;
2148                 break;
2149         }
2150 #undef  iphs
2151         return (cksum);
2152 }
2153 
2154 
2155 /*
2156  * Software verification of the ULP checksums.
2157  * Returns B_TRUE if ok.
2158  * Increments statistics of failed.
2159  */
2160 static boolean_t
2161 ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2162 {
2163         ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2164         uint32_t        cksum;
2165         uint8_t         protocol = ira->ira_protocol;
2166         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2167 
2168         IP_STAT(ipst, ip_in_sw_cksum);
2169 
2170         ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
2171 
2172         cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2173         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
2174         if (cksum == 0)
2175                 return (B_TRUE);
2176 
2177         ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
2178         return (B_FALSE);
2179 }
2180 
2181 /*
2182  * Verify the ULP checksums.
2183  * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
2184  * algorithm.
2185  * Increments statistics if failed.
2186  */
2187 static boolean_t
2188 ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
2189     ip_recv_attr_t *ira)
2190 {
2191         ill_t           *ill = ira->ira_rill;
2192         uint16_t        hck_flags;
2193         uint32_t        cksum;
2194         mblk_t          *mp1;
2195         int32_t         len;
2196         uint8_t         protocol = ira->ira_protocol;
2197         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2198 
2199 
2200         switch (protocol) {
2201         case IPPROTO_TCP:
2202                 break;
2203 
2204         case IPPROTO_UDP: {
2205                 udpha_t         *udpha;
2206 
2207                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2208                 if (udpha->uha_checksum == 0) {
2209                         /* Packet doesn't have a UDP checksum */
2210                         return (B_TRUE);
2211                 }
2212                 break;
2213         }
2214         case IPPROTO_SCTP: {
2215                 sctp_hdr_t      *sctph;
2216                 uint32_t        pktsum;
2217 
2218                 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
2219 #ifdef  DEBUG
2220                 if (skip_sctp_cksum)
2221                         return (B_TRUE);
2222 #endif
2223                 pktsum = sctph->sh_chksum;
2224                 sctph->sh_chksum = 0;
2225                 cksum = sctp_cksum(mp, ip_hdr_length);
2226                 sctph->sh_chksum = pktsum;
2227                 if (cksum == pktsum)
2228                         return (B_TRUE);
2229 
2230                 /*
2231                  * Defer until later whether a bad checksum is ok
2232                  * in order to allow RAW sockets to use Adler checksum
2233                  * with SCTP.
2234                  */
2235                 ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
2236                 return (B_TRUE);
2237         }
2238 
2239         default:
2240                 /* No ULP checksum to verify. */
2241                 return (B_TRUE);
2242         }
2243         /*
2244          * Revert to software checksum calculation if the interface
2245          * isn't capable of checksum offload.
2246          * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
2247          * Note: IRAF_NO_HW_CKSUM is not currently used.
2248          */
2249         ASSERT(!IS_IPMP(ill));
2250         if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
2251             !dohwcksum) {
2252                 return (ip_input_sw_cksum_v4(mp, ipha, ira));
2253         }
2254 
2255         /*
2256          * We apply this for all ULP protocols. Does the HW know to
2257          * not set the flags for SCTP and other protocols.
2258          */
2259 
2260         hck_flags = DB_CKSUMFLAGS(mp);
2261 
2262         if (hck_flags & HCK_FULLCKSUM_OK) {
2263                 /*
2264                  * Hardware has already verified the checksum.
2265                  */
2266                 return (B_TRUE);
2267         }
2268 
2269         if (hck_flags & HCK_FULLCKSUM) {
2270                 /*
2271                  * Full checksum has been computed by the hardware
2272                  * and has been attached.  If the driver wants us to
2273                  * verify the correctness of the attached value, in
2274                  * order to protect against faulty hardware, compare
2275                  * it against -0 (0xFFFF) to see if it's valid.
2276                  */
2277                 cksum = DB_CKSUM16(mp);
2278                 if (cksum == 0xFFFF)
2279                         return (B_TRUE);
2280                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2281                 return (B_FALSE);
2282         }
2283 
2284         mp1 = mp->b_cont;
2285         if ((hck_flags & HCK_PARTIALCKSUM) &&
2286             (mp1 == NULL || mp1->b_cont == NULL) &&
2287             ip_hdr_length >= DB_CKSUMSTART(mp) &&
2288             ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
2289                 uint32_t        adj;
2290                 uchar_t         *cksum_start;
2291 
2292                 cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2293 
2294                 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
2295 
2296                 /*
2297                  * Partial checksum has been calculated by hardware
2298                  * and attached to the packet; in addition, any
2299                  * prepended extraneous data is even byte aligned,
2300                  * and there are at most two mblks associated with
2301                  * the packet.  If any such data exists, we adjust
2302                  * the checksum; also take care any postpended data.
2303                  */
2304                 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
2305                 /*
2306                  * One's complement subtract extraneous checksum
2307                  */
2308                 cksum += DB_CKSUM16(mp);
2309                 if (adj >= cksum)
2310                         cksum = ~(adj - cksum) & 0xFFFF;
2311                 else
2312                         cksum -= adj;
2313                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2314                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2315                 if (!(~cksum & 0xFFFF))
2316                         return (B_TRUE);
2317 
2318                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2319                 return (B_FALSE);
2320         }
2321         return (ip_input_sw_cksum_v4(mp, ipha, ira));
2322 }
2323 
2324 
2325 /*
2326  * Handle fanout of received packets.
2327  * Unicast packets that are looped back (from ire_send_local_v4) and packets
2328  * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
2329  *
2330  * IPQoS Notes
2331  * Before sending it to the client, invoke IPPF processing. Policy processing
2332  * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
2333  */
2334 void
2335 ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2336 {
2337         ill_t           *ill = ira->ira_ill;
2338         iaflags_t       iraflags = ira->ira_flags;
2339         ip_stack_t      *ipst = ill->ill_ipst;
2340         uint8_t         protocol = ipha->ipha_protocol;
2341         conn_t          *connp;
2342 #define rptr    ((uchar_t *)ipha)
2343         uint_t          ip_hdr_length;
2344         uint_t          min_ulp_header_length;
2345         int             offset;
2346         ssize_t         len;
2347         netstack_t      *ns = ipst->ips_netstack;
2348         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2349         ill_t           *rill = ira->ira_rill;
2350 
2351         ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
2352 
2353         ip_hdr_length = ira->ira_ip_hdr_length;
2354         ira->ira_protocol = protocol;
2355 
2356         /*
2357          * Time for IPP once we've done reassembly and IPsec.
2358          * We skip this for loopback packets since we don't do IPQoS
2359          * on loopback.
2360          */
2361         if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
2362             !(iraflags & IRAF_LOOPBACK) &&
2363             (protocol != IPPROTO_ESP && protocol != IPPROTO_AH)) {
2364                 /*
2365                  * Use the interface on which the packet arrived - not where
2366                  * the IP address is hosted.
2367                  */
2368                 /* ip_process translates an IS_UNDER_IPMP */
2369                 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
2370                 if (mp == NULL) {
2371                         /* ip_drop_packet and MIB done */
2372                         return;
2373                 }
2374         }
2375 
2376         /* Determine the minimum required size of the upper-layer header */
2377         /* Need to do this for at least the set of ULPs that TX handles. */
2378         switch (protocol) {
2379         case IPPROTO_TCP:
2380                 min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
2381                 break;
2382         case IPPROTO_SCTP:
2383                 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
2384                 break;
2385         case IPPROTO_UDP:
2386                 min_ulp_header_length = UDPH_SIZE;
2387                 break;
2388         case IPPROTO_ICMP:
2389                 min_ulp_header_length = ICMPH_SIZE;
2390                 break;
2391         default:
2392                 min_ulp_header_length = 0;
2393                 break;
2394         }
2395         /* Make sure we have the min ULP header length */
2396         len = mp->b_wptr - rptr;
2397         if (len < ip_hdr_length + min_ulp_header_length) {
2398                 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
2399                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2400                         ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2401                         freemsg(mp);
2402                         return;
2403                 }
2404                 IP_STAT(ipst, ip_recv_pullup);
2405                 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
2406                     ira);
2407                 if (ipha == NULL)
2408                         goto discard;
2409                 len = mp->b_wptr - rptr;
2410         }
2411 
2412         /*
2413          * If trusted extensions then determine the zoneid and TX specific
2414          * ira_flags.
2415          */
2416         if (iraflags & IRAF_SYSTEM_LABELED) {
2417                 /* This can update ira->ira_flags and ira->ira_zoneid */
2418                 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
2419                 iraflags = ira->ira_flags;
2420         }
2421 
2422 
2423         /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
2424         if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
2425                 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
2426                         /* Bad checksum. Stats are already incremented */
2427                         ip_drop_input("Bad ULP checksum", mp, ill);
2428                         freemsg(mp);
2429                         return;
2430                 }
2431                 /* IRAF_SCTP_CSUM_ERR could have been set */
2432                 iraflags = ira->ira_flags;
2433         }
2434         switch (protocol) {
2435         case IPPROTO_TCP:
2436                 /* For TCP, discard broadcast and multicast packets. */
2437                 if (iraflags & IRAF_MULTIBROADCAST)
2438                         goto discard;
2439 
2440                 /* First mblk contains IP+TCP headers per above check */
2441                 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
2442 
2443                 /* TCP options present? */
2444                 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
2445                 if (offset != 5) {
2446                         if (offset < 5)
2447                                 goto discard;
2448 
2449                         /*
2450                          * There must be TCP options.
2451                          * Make sure we can grab them.
2452                          */
2453                         offset <<= 2;
2454                         offset += ip_hdr_length;
2455                         if (len < offset) {
2456                                 if (ira->ira_pktlen < offset) {
2457                                         BUMP_MIB(ill->ill_ip_mib,
2458                                             ipIfStatsInTruncatedPkts);
2459                                         ip_drop_input(
2460                                             "ipIfStatsInTruncatedPkts",
2461                                             mp, ill);
2462                                         freemsg(mp);
2463                                         return;
2464                                 }
2465                                 IP_STAT(ipst, ip_recv_pullup);
2466                                 ipha = ip_pullup(mp, offset, ira);
2467                                 if (ipha == NULL)
2468                                         goto discard;
2469                                 len = mp->b_wptr - rptr;
2470                         }
2471                 }
2472 
2473                 /*
2474                  * Pass up a squeue hint to tcp.
2475                  * If ira_sqp is already set (this is loopback) we leave it
2476                  * alone.
2477                  */
2478                 if (ira->ira_sqp == NULL) {
2479                         ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2480                 }
2481 
2482                 /* Look for AF_INET or AF_INET6 that matches */
2483                 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
2484                     ira, ipst);
2485                 if (connp == NULL) {
2486                         /* Send the TH_RST */
2487                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2488                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2489                         return;
2490                 }
2491                 if (connp->conn_incoming_ifindex != 0 &&
2492                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2493                         CONN_DEC_REF(connp);
2494 
2495                         /* Send the TH_RST */
2496                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2497                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2498                         return;
2499                 }
2500                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2501                     (iraflags & IRAF_IPSEC_SECURE)) {
2502                         mp = ipsec_check_inbound_policy(mp, connp,
2503                             ipha, NULL, ira);
2504                         if (mp == NULL) {
2505                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2506                                 /* Note that mp is NULL */
2507                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2508                                 CONN_DEC_REF(connp);
2509                                 return;
2510                         }
2511                 }
2512                 /* Found a client; up it goes */
2513                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2514                 ira->ira_ill = ira->ira_rill = NULL;
2515                 if (!IPCL_IS_TCP(connp)) {
2516                         /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2517                         (connp->conn_recv)(connp, mp, NULL, ira);
2518                         CONN_DEC_REF(connp);
2519                         ira->ira_ill = ill;
2520                         ira->ira_rill = rill;
2521                         return;
2522                 }
2523 
2524                 /*
2525                  * We do different processing whether called from
2526                  * ip_accept_tcp and we match the target, don't match
2527                  * the target, and when we are called by ip_input.
2528                  */
2529                 if (iraflags & IRAF_TARGET_SQP) {
2530                         if (ira->ira_target_sqp == connp->conn_sqp) {
2531                                 mblk_t  *attrmp;
2532 
2533                                 attrmp = ip_recv_attr_to_mblk(ira);
2534                                 if (attrmp == NULL) {
2535                                         BUMP_MIB(ill->ill_ip_mib,
2536                                             ipIfStatsInDiscards);
2537                                         ip_drop_input("ipIfStatsInDiscards",
2538                                             mp, ill);
2539                                         freemsg(mp);
2540                                         CONN_DEC_REF(connp);
2541                                 } else {
2542                                         SET_SQUEUE(attrmp, connp->conn_recv,
2543                                             connp);
2544                                         attrmp->b_cont = mp;
2545                                         ASSERT(ira->ira_target_sqp_mp == NULL);
2546                                         ira->ira_target_sqp_mp = attrmp;
2547                                         /*
2548                                          * Conn ref release when drained from
2549                                          * the squeue.
2550                                          */
2551                                 }
2552                         } else {
2553                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2554                                     connp->conn_recv, connp, ira, SQ_FILL,
2555                                     SQTAG_IP_TCP_INPUT);
2556                         }
2557                 } else {
2558                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2559                             connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
2560                 }
2561                 ira->ira_ill = ill;
2562                 ira->ira_rill = rill;
2563                 return;
2564 
2565         case IPPROTO_SCTP: {
2566                 sctp_hdr_t      *sctph;
2567                 in6_addr_t      map_src, map_dst;
2568                 uint32_t        ports;  /* Source and destination ports */
2569                 sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
2570 
2571                 /* For SCTP, discard broadcast and multicast packets. */
2572                 if (iraflags & IRAF_MULTIBROADCAST)
2573                         goto discard;
2574 
2575                 /*
2576                  * Since there is no SCTP h/w cksum support yet, just
2577                  * clear the flag.
2578                  */
2579                 DB_CKSUMFLAGS(mp) = 0;
2580 
2581                 /* Length ensured above */
2582                 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
2583                 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
2584 
2585                 /* get the ports */
2586                 ports = *(uint32_t *)&sctph->sh_sport;
2587 
2588                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
2589                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
2590                 if (iraflags & IRAF_SCTP_CSUM_ERR) {
2591                         /*
2592                          * No potential sctp checksum errors go to the Sun
2593                          * sctp stack however they might be Adler-32 summed
2594                          * packets a userland stack bound to a raw IP socket
2595                          * could reasonably use. Note though that Adler-32 is
2596                          * a long deprecated algorithm and customer sctp
2597                          * networks should eventually migrate to CRC-32 at
2598                          * which time this facility should be removed.
2599                          */
2600                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2601                         return;
2602                 }
2603                 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp,
2604                     sctps, sctph);
2605                 if (connp == NULL) {
2606                         /* Check for raw socket or OOTB handling */
2607                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2608                         return;
2609                 }
2610                 if (connp->conn_incoming_ifindex != 0 &&
2611                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2612                         CONN_DEC_REF(connp);
2613                         /* Check for raw socket or OOTB handling */
2614                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2615                         return;
2616                 }
2617 
2618                 /* Found a client; up it goes */
2619                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2620                 sctp_input(connp, ipha, NULL, mp, ira);
2621                 /* sctp_input does a rele of the sctp_t */
2622                 return;
2623         }
2624 
2625         case IPPROTO_UDP:
2626                 /* First mblk contains IP+UDP headers as checked above */
2627                 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
2628 
2629                 if (iraflags & IRAF_MULTIBROADCAST) {
2630                         uint16_t *up;   /* Pointer to ports in ULP header */
2631 
2632                         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2633                         ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
2634                         return;
2635                 }
2636 
2637                 /* Look for AF_INET or AF_INET6 that matches */
2638                 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
2639                     ira, ipst);
2640                 if (connp == NULL) {
2641         no_udp_match:
2642                         if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
2643                             connf_head != NULL) {
2644                                 ASSERT(ira->ira_protocol == IPPROTO_UDP);
2645                                 ip_fanout_proto_v4(mp, ipha, ira);
2646                         } else {
2647                                 ip_fanout_send_icmp_v4(mp,
2648                                     ICMP_DEST_UNREACHABLE,
2649                                     ICMP_PORT_UNREACHABLE, ira);
2650                         }
2651                         return;
2652 
2653                 }
2654                 if (connp->conn_incoming_ifindex != 0 &&
2655                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2656                         CONN_DEC_REF(connp);
2657                         goto no_udp_match;
2658                 }
2659                 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
2660                     !canputnext(connp->conn_rq)) {
2661                         CONN_DEC_REF(connp);
2662                         BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
2663                         ip_drop_input("udpIfStatsInOverflows", mp, ill);
2664                         freemsg(mp);
2665                         return;
2666                 }
2667                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2668                     (iraflags & IRAF_IPSEC_SECURE)) {
2669                         mp = ipsec_check_inbound_policy(mp, connp,
2670                             ipha, NULL, ira);
2671                         if (mp == NULL) {
2672                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2673                                 /* Note that mp is NULL */
2674                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2675                                 CONN_DEC_REF(connp);
2676                                 return;
2677                         }
2678                 }
2679                 /*
2680                  * Remove 0-spi if it's 0, or move everything behind
2681                  * the UDP header over it and forward to ESP via
2682                  * ip_fanout_v4().
2683                  */
2684                 if (connp->conn_udp->udp_nat_t_endpoint) {
2685                         if (iraflags & IRAF_IPSEC_SECURE) {
2686                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2687                                     DROPPER(ipss, ipds_esp_nat_t_ipsec),
2688                                     &ipss->ipsec_dropper);
2689                                 CONN_DEC_REF(connp);
2690                                 return;
2691                         }
2692 
2693                         mp = zero_spi_check(mp, ira);
2694                         if (mp == NULL) {
2695                                 /*
2696                                  * Packet was consumed - probably sent to
2697                                  * ip_fanout_v4.
2698                                  */
2699                                 CONN_DEC_REF(connp);
2700                                 return;
2701                         }
2702                         /* Else continue like a normal UDP packet. */
2703                         ipha = (ipha_t *)mp->b_rptr;
2704                         protocol = ipha->ipha_protocol;
2705                         ira->ira_protocol = protocol;
2706                 }
2707                 /* Found a client; up it goes */
2708                 IP_STAT(ipst, ip_udp_fannorm);
2709                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2710                 ira->ira_ill = ira->ira_rill = NULL;
2711                 (connp->conn_recv)(connp, mp, NULL, ira);
2712                 CONN_DEC_REF(connp);
2713                 ira->ira_ill = ill;
2714                 ira->ira_rill = rill;
2715                 return;
2716         default:
2717                 break;
2718         }
2719 
2720         /*
2721          * Clear hardware checksumming flag as it is currently only
2722          * used by TCP and UDP.
2723          */
2724         DB_CKSUMFLAGS(mp) = 0;
2725 
2726         switch (protocol) {
2727         case IPPROTO_ICMP:
2728                 /*
2729                  * We need to accommodate icmp messages coming in clear
2730                  * until we get everything secure from the wire. If
2731                  * icmp_accept_clear_messages is zero we check with
2732                  * the global policy and act accordingly. If it is
2733                  * non-zero, we accept the message without any checks.
2734                  * But *this does not mean* that this will be delivered
2735                  * to RAW socket clients. By accepting we might send
2736                  * replies back, change our MTU value etc.,
2737                  * but delivery to the ULP/clients depends on their
2738                  * policy dispositions.
2739                  */
2740                 if (ipst->ips_icmp_accept_clear_messages == 0) {
2741                         mp = ipsec_check_global_policy(mp, NULL,
2742                             ipha, NULL, ira, ns);
2743                         if (mp == NULL)
2744                                 return;
2745                 }
2746 
2747                 /*
2748                  * On a labeled system, we have to check whether the zone
2749                  * itself is permitted to receive raw traffic.
2750                  */
2751                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2752                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
2753                                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
2754                                 ip_drop_input("tsol_can_accept_raw", mp, ill);
2755                                 freemsg(mp);
2756                                 return;
2757                         }
2758                 }
2759 
2760                 /*
2761                  * ICMP header checksum, including checksum field,
2762                  * should be zero.
2763                  */
2764                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2765                         BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2766                         ip_drop_input("icmpInCksumErrs", mp, ill);
2767                         freemsg(mp);
2768                         return;
2769                 }
2770                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2771                 mp = icmp_inbound_v4(mp, ira);
2772                 if (mp == NULL) {
2773                         /* No need to pass to RAW sockets */
2774                         return;
2775                 }
2776                 break;
2777 
2778         case IPPROTO_IGMP:
2779                 /*
2780                  * If we are not willing to accept IGMP packets in clear,
2781                  * then check with global policy.
2782                  */
2783                 if (ipst->ips_igmp_accept_clear_messages == 0) {
2784                         mp = ipsec_check_global_policy(mp, NULL,
2785                             ipha, NULL, ira, ns);
2786                         if (mp == NULL)
2787                                 return;
2788                 }
2789                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2790                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2791                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2792                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2793                         freemsg(mp);
2794                         return;
2795                 }
2796                 /*
2797                  * Validate checksum
2798                  */
2799                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2800                         ++ipst->ips_igmpstat.igps_rcv_badsum;
2801                         ip_drop_input("igps_rcv_badsum", mp, ill);
2802                         freemsg(mp);
2803                         return;
2804                 }
2805 
2806                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2807                 mp = igmp_input(mp, ira);
2808                 if (mp == NULL) {
2809                         /* Bad packet - discarded by igmp_input */
2810                         return;
2811                 }
2812                 break;
2813         case IPPROTO_PIM:
2814                 /*
2815                  * If we are not willing to accept PIM packets in clear,
2816                  * then check with global policy.
2817                  */
2818                 if (ipst->ips_pim_accept_clear_messages == 0) {
2819                         mp = ipsec_check_global_policy(mp, NULL,
2820                             ipha, NULL, ira, ns);
2821                         if (mp == NULL)
2822                                 return;
2823                 }
2824                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2825                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2826                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2827                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2828                         freemsg(mp);
2829                         return;
2830                 }
2831                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2832 
2833                 /* Checksum is verified in pim_input */
2834                 mp = pim_input(mp, ira);
2835                 if (mp == NULL) {
2836                         /* Bad packet - discarded by pim_input */
2837                         return;
2838                 }
2839                 break;
2840         case IPPROTO_AH:
2841         case IPPROTO_ESP: {
2842                 /*
2843                  * Fast path for AH/ESP.
2844                  */
2845                 netstack_t *ns = ipst->ips_netstack;
2846                 ipsec_stack_t *ipss = ns->netstack_ipsec;
2847 
2848                 IP_STAT(ipst, ipsec_proto_ahesp);
2849 
2850                 if (!ipsec_loaded(ipss)) {
2851                         ip_proto_not_sup(mp, ira);
2852                         return;
2853                 }
2854 
2855                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2856                 /* select inbound SA and have IPsec process the pkt */
2857                 if (protocol == IPPROTO_ESP) {
2858                         esph_t *esph;
2859                         boolean_t esp_in_udp_sa;
2860                         boolean_t esp_in_udp_packet;
2861 
2862                         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
2863                         if (mp == NULL)
2864                                 return;
2865 
2866                         ASSERT(esph != NULL);
2867                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2868                         ASSERT(ira->ira_ipsec_esp_sa != NULL);
2869                         ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
2870 
2871                         esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
2872                             IPSA_F_NATT) != 0);
2873                         esp_in_udp_packet =
2874                             (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
2875 
2876                         /*
2877                          * The following is a fancy, but quick, way of saying:
2878                          * ESP-in-UDP SA and Raw ESP packet --> drop
2879                          *    OR
2880                          * ESP SA and ESP-in-UDP packet --> drop
2881                          */
2882                         if (esp_in_udp_sa != esp_in_udp_packet) {
2883                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2884                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2885                                     DROPPER(ipss, ipds_esp_no_sa),
2886                                     &ipss->ipsec_dropper);
2887                                 return;
2888                         }
2889                         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
2890                             ira);
2891                 } else {
2892                         ah_t *ah;
2893 
2894                         mp = ipsec_inbound_ah_sa(mp, ira, &ah);
2895                         if (mp == NULL)
2896                                 return;
2897 
2898                         ASSERT(ah != NULL);
2899                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2900                         ASSERT(ira->ira_ipsec_ah_sa != NULL);
2901                         ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
2902                         mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
2903                             ira);
2904                 }
2905 
2906                 if (mp == NULL) {
2907                         /*
2908                          * Either it failed or is pending. In the former case
2909                          * ipIfStatsInDiscards was increased.
2910                          */
2911                         return;
2912                 }
2913                 /* we're done with IPsec processing, send it up */
2914                 ip_input_post_ipsec(mp, ira);
2915                 return;
2916         }
2917         case IPPROTO_ENCAP: {
2918                 ipha_t          *inner_ipha;
2919 
2920                 /*
2921                  * Handle self-encapsulated packets (IP-in-IP where
2922                  * the inner addresses == the outer addresses).
2923                  */
2924                 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
2925                     mp->b_wptr) {
2926                         if (ira->ira_pktlen <
2927                             ip_hdr_length + sizeof (ipha_t)) {
2928                                 BUMP_MIB(ill->ill_ip_mib,
2929                                     ipIfStatsInTruncatedPkts);
2930                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2931                                     mp, ill);
2932                                 freemsg(mp);
2933                                 return;
2934                         }
2935                         ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
2936                             sizeof (ipha_t) - mp->b_rptr, ira);
2937                         if (ipha == NULL) {
2938                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2939                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2940                                 freemsg(mp);
2941                                 return;
2942                         }
2943                 }
2944                 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
2945                 /*
2946                  * Check the sanity of the inner IP header.
2947                  */
2948                 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
2949                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2950                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2951                         freemsg(mp);
2952                         return;
2953                 }
2954                 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
2955                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2956                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2957                         freemsg(mp);
2958                         return;
2959                 }
2960                 if (inner_ipha->ipha_src != ipha->ipha_src ||
2961                     inner_ipha->ipha_dst != ipha->ipha_dst) {
2962                         /* We fallthru to iptun fanout below */
2963                         goto iptun;
2964                 }
2965 
2966                 /*
2967                  * Self-encapsulated tunnel packet. Remove
2968                  * the outer IP header and fanout again.
2969                  * We also need to make sure that the inner
2970                  * header is pulled up until options.
2971                  */
2972                 mp->b_rptr = (uchar_t *)inner_ipha;
2973                 ipha = inner_ipha;
2974                 ip_hdr_length = IPH_HDR_LENGTH(ipha);
2975                 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
2976                         if (ira->ira_pktlen <
2977                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
2978                                 BUMP_MIB(ill->ill_ip_mib,
2979                                     ipIfStatsInTruncatedPkts);
2980                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2981                                     mp, ill);
2982                                 freemsg(mp);
2983                                 return;
2984                         }
2985                         ipha = ip_pullup(mp,
2986                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
2987                         if (ipha == NULL) {
2988                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2989                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2990                                 freemsg(mp);
2991                                 return;
2992                         }
2993                 }
2994                 if (ip_hdr_length > sizeof (ipha_t)) {
2995                         /* We got options on the inner packet. */
2996                         ipaddr_t        dst = ipha->ipha_dst;
2997                         int             error = 0;
2998 
2999                         dst = ip_input_options(ipha, dst, mp, ira, &error);
3000                         if (error != 0) {
3001                                 /*
3002                                  * An ICMP error has been sent and the packet
3003                                  * has been dropped.
3004                                  */
3005                                 return;
3006                         }
3007                         if (dst != ipha->ipha_dst) {
3008                                 /*
3009                                  * Someone put a source-route in
3010                                  * the inside header of a self-
3011                                  * encapsulated packet.  Drop it
3012                                  * with extreme prejudice and let
3013                                  * the sender know.
3014                                  */
3015                                 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
3016                                     mp, ill);
3017                                 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
3018                                     ira);
3019                                 return;
3020                         }
3021                 }
3022                 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
3023                         /*
3024                          * This means that somebody is sending
3025                          * Self-encapsulated packets without AH/ESP.
3026                          *
3027                          * Send this packet to find a tunnel endpoint.
3028                          * if I can't find one, an ICMP
3029                          * PROTOCOL_UNREACHABLE will get sent.
3030                          */
3031                         protocol = ipha->ipha_protocol;
3032                         ira->ira_protocol = protocol;
3033                         goto iptun;
3034                 }
3035 
3036                 /* Update based on removed IP header */
3037                 ira->ira_ip_hdr_length = ip_hdr_length;
3038                 ira->ira_pktlen = ntohs(ipha->ipha_length);
3039 
3040                 if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
3041                         /*
3042                          * This packet is self-encapsulated multiple
3043                          * times. We don't want to recurse infinitely.
3044                          * To keep it simple, drop the packet.
3045                          */
3046                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3047                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3048                         freemsg(mp);
3049                         return;
3050                 }
3051                 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3052                 ira->ira_flags |= IRAF_IPSEC_DECAPS;
3053 
3054                 ip_input_post_ipsec(mp, ira);
3055                 return;
3056         }
3057 
3058         iptun:  /* IPPROTO_ENCAPS that is not self-encapsulated */
3059         case IPPROTO_IPV6:
3060                 /* iptun will verify trusted label */
3061                 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
3062                     ira, ipst);
3063                 if (connp != NULL) {
3064                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
3065                         ira->ira_ill = ira->ira_rill = NULL;
3066                         (connp->conn_recv)(connp, mp, NULL, ira);
3067                         CONN_DEC_REF(connp);
3068                         ira->ira_ill = ill;
3069                         ira->ira_rill = rill;
3070                         return;
3071                 }
3072                 /* FALLTHRU */
3073         default:
3074                 /*
3075                  * On a labeled system, we have to check whether the zone
3076                  * itself is permitted to receive raw traffic.
3077                  */
3078                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
3079                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
3080                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3081                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3082                                 freemsg(mp);
3083                                 return;
3084                         }
3085                 }
3086                 break;
3087         }
3088 
3089         /*
3090          * The above input functions may have returned the pulled up message.
3091          * So ipha needs to be reinitialized.
3092          */
3093         ipha = (ipha_t *)mp->b_rptr;
3094         ira->ira_protocol = protocol = ipha->ipha_protocol;
3095         if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
3096                 /*
3097                  * No user-level listener for these packets.
3098                  * Check for IPPROTO_ENCAP...
3099                  */
3100                 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
3101                         /*
3102                          * Check policy here,
3103                          * THEN ship off to ip_mroute_decap().
3104                          *
3105                          * BTW,  If I match a configured IP-in-IP
3106                          * tunnel above, this path will not be reached, and
3107                          * ip_mroute_decap will never be called.
3108                          */
3109                         mp = ipsec_check_global_policy(mp, connp,
3110                             ipha, NULL, ira, ns);
3111                         if (mp != NULL) {
3112                                 ip_mroute_decap(mp, ira);
3113                         } /* Else we already freed everything! */
3114                 } else {
3115                         ip_proto_not_sup(mp, ira);
3116                 }
3117                 return;
3118         }
3119 
3120         /*
3121          * Handle fanout to raw sockets.  There
3122          * can be more than one stream bound to a particular
3123          * protocol.  When this is the case, each one gets a copy
3124          * of any incoming packets.
3125          */
3126         ASSERT(ira->ira_protocol == ipha->ipha_protocol);
3127         ip_fanout_proto_v4(mp, ipha, ira);
3128         return;
3129 
3130 discard:
3131         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3132         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3133         freemsg(mp);
3134 #undef rptr
3135 }