143
Linux Network Architecture: Network Layer. Isaac Y. Tsai <eplusplus@gmail.com>

Linux Network Architecture Network Layer

  • Upload
    lucie

  • View
    106

  • Download
    2

Embed Size (px)

DESCRIPTION

Linux Network Architecture Network Layer. Isaac Y. Tsai . Outline. Network Layer in Linux Network filter and iptable framework PF Ring architecture. Interface between device driver and network layer. Network layer functions. /net/ipv4/ip_input.c - PowerPoint PPT Presentation

Citation preview

Page 1: Linux Network Architecture Network Layer

Linux Network Architecture

Network Layer

Isaac Y. Tsai <[email protected]>

Page 2: Linux Network Architecture Network Layer

2010/09/17 © by

Outline

  • Network Layer in Linux
  • Network filter and iptable framework
  • PF Ring architecture

Page 3: Linux Network Architecture Network Layer

2010/09/17 © by

Interface between device driver and network layer

Page 4: Linux Network Architecture Network Layer

2010/09/17 © by

Network layer functions

<kernel src>/net/ipv4/ip_input.c: ip_rcv(skb), ip_rcv_finish(skb), ip_local_deliver(skb), ip_local_deliver_finish(skb)

<kernel src>/net/ipv4/ip_forward.c: ip_forward(skb), ip_forward_finish(skb)

<kernel src>/net/ipv4/ipmr.c: int ip_mr_input(skb)

<kernel src>/net/ipv4/ip_output.c: ip_queue_xmit(skb, ipfragok), ip_local_out(skb), __ip_local_out(skb), ip_output(skb), ip_finish_output(skb), ip_finish_output2(skb), ip_mc_output(skb)

Page 5: Linux Network Architecture Network Layer

2010/09/17 © by

netif_receive_skb()<kernel src>/net/core/dev.cint netif_receive_skb(struct sk_buff *skb){

struct packet_type *ptype, *pt_prev;struct net_device *orig_dev, *master, *null_or_orig, *null_or_bond;int ret = NET_RX_DROP;__be16 type;if (!skb->tstamp.tv64) net_timestamp(skb);if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))

return NET_RX_SUCCESS;if (netpoll_receive_skb(skb)) return NET_RX_DROP;if (!skb->skb_iif) skb->skb_iif = skb->dev->ifindex;null_or_orig = NULL; orig_dev = skb->dev;master = ACCESS_ONCE(orig_dev->master);

Page 6: Linux Network Architecture Network Layer

2010/09/17 © by

netif_receive_skb() (cont’ed)if (master) {

if (skb_bond_should_drop(skb, master)) null_or_orig = orig_dev;

else skb->dev = master;}__get_cpu_var(netdev_rx_stat).total++;skb_reset_network_header(skb); skb_reset_transport_header(skb);skb->mac_len = skb->network_header - skb->mac_header; pt_prev = NULL;rcu_read_lock();

#ifdef CONFIG_NET_CLS_ACTif (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); goto ncls; }

#endiflist_for_each_entry_rcu(ptype, &ptype_all, list) {

if (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev) {

if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev);pt_prev = ptype;

}}

Page 7: Linux Network Architecture Network Layer

2010/09/17 © by

netif_receive_skb() (cont’ed)#ifdef CONFIG_NET_CLS_ACT

skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out;ncls:#endif

skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);if (!skb) goto out;skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);if (!skb) goto out;null_or_bond = NULL;if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) && (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {

null_or_bond = vlan_dev_real_dev(skb->dev);}type = skb->protocol;

Page 8: Linux Network Architecture Network Layer

2010/09/17 © by

netif_receive_skb() (cont’ed)list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {

if (ptype->type == type && (ptype->dev == null_or_orig || ptype->dev == skb->dev || ptype->dev == orig_dev || ptype->dev == null_or_bond)) {

if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev);pt_prev = ptype;

}}if (pt_prev) {

ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);} else {

kfree_skb(skb);ret = NET_RX_DROP;

}out:

rcu_read_unlock();return ret;

}

Page 9: Linux Network Architecture Network Layer

2010/09/17 © by

net_rx_action()<kernel src>/net/core/dev.c

static void net_rx_action(struct softirq_action *h){

struct list_head *list = &__get_cpu_var(softnet_data).poll_list;unsigned long time_limit = jiffies + 2;int budget = netdev_budget;void *have;local_irq_disable();while (!list_empty(list)) {

struct napi_struct *n;int work, weight;if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))

goto softnet_break;local_irq_enable();n = list_first_entry(list, struct napi_struct, poll_list);

Page 10: Linux Network Architecture Network Layer

2010/09/17 © by

net_rx_action() (cont’ed)have = netpoll_poll_lock(n);weight = n->weight; work = 0;if (test_bit(NAPI_STATE_SCHED, &n->state)) {

work = n->poll(n, weight);trace_napi_poll(n);

}WARN_ON_ONCE(work > weight);budget -= work;local_irq_disable();if (unlikely(work == weight)) {

if (unlikely(napi_disable_pending(n))) {local_irq_enable();napi_complete(n);local_irq_disable();

} elselist_move_tail(&n->poll_list, list);

}netpoll_poll_unlock(have);

}

Page 11: Linux Network Architecture Network Layer

2010/09/17 © by

net_rx_action() (cont’ed)out:

local_irq_enable();#ifdef CONFIG_NET_DMA

/* * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */dma_issue_pending_all();

#endifreturn;

softnet_break:__get_cpu_var(netdev_rx_stat).time_squeeze++;__raise_softirq_irqoff(NET_RX_SOFTIRQ);goto out;

}

Page 12: Linux Network Architecture Network Layer

2010/09/17 © by

Packet reception path: ip_rcv()

Network layer packet reception code ip_rcv()

ip_rcv() first performs some error checking related to packet type, packet header and it keeps some packet statistics. At the end of the code, it makes a macro function call to NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

Page 13: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv()<kernel src>/net/ipv4/ip_input.c

int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)

{struct iphdr *iph;u32 len;if (skb->pkt_type == PACKET_OTHERHOST) goto drop;IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {

IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);goto out;

}if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error;

Page 14: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv() (cont’ed)iph = ip_hdr(skb);if (iph->ihl < 5 || iph->version != 4) goto inhdr_error;if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error;iph = ip_hdr(skb);if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error;len = ntohs(iph->tot_len);if (skb->len < len) {

IP_INC_STATS_BH(dev_net(dev),IPSTATS_MIB_INTRUNCATEDPKTS);goto drop;

} else if (len < (iph->ihl*4)) goto inhdr_error;

Page 15: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv() (cont’ed)if (pskb_trim_rcsum(skb, len)) {

IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);goto drop;

}/* Remove any debris in the socket control block */memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));/* Must drop socket now because of tproxy. */skb_orphan(skb);return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

inhdr_error:IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);

drop:kfree_skb(skb);

out:return NET_RX_DROP;

}

Page 16: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv_finish()

ip_rcv_finish() calls ip_route_input(). The skb->dst pointer of the socket buffer is set to an entry in the routing cache, which stores not only the destination on the IP level, but also a pointer to an entry in the hard header cache (cache for layer-2 frame packet headers), if present. If ip_route_input() cannot find a route, then the packet is discarded. Finally, in ip_rcv_finish(), the procedure of the IP protocol reaches the junction between packets addressed to the local computer and packets to be forwarded. The information about the further path of an IP packet is stored in the routing entry skb->dst. Notice that a trick often used in the Linux kernel is used here: if a switch (variable value) would be used to select between different functions, a pointer to the selected function is stored instead. This saves an if or switch instruction for each decision of how the program should continue. In the example used here, the pointer skb->dst->input() points to the function that should be used to handle a packet further:

Page 17: Linux Network Architecture Network Layer

2010/09/17 © by

The pointer skb->dst->input() points to the function that should be used to handle a packet further:

ip_local_deliver() is entered in the case of unicast and multicast packets that should be delivered to the local computer.

ip_forward() handles all unicast packets that should be forwarded.

ip_mr_input() is used for multicast packets that should be forwarded.

Page 18: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv_finish(skb)<kernel src>/net/ipv4/ip_input.c

static int ip_rcv_finish(struct sk_buff *skb){

const struct iphdr *iph = ip_hdr(skb);struct rtable *rt;if (skb_dst(skb) == NULL) {

int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev);if (unlikely(err)) {

if (err == -EHOSTUNREACH)IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INADDRERRORS);

else if (err == -ENETUNREACH)

IP_INC_STATS_BH(dev_net(skb->dev), IPSTATS_MIB_INNOROUTES);goto drop;

}}

Page 19: Linux Network Architecture Network Layer

2010/09/17 © by

ip_rcv_finish(skb) (cont’ed)#ifdef CONFIG_NET_CLS_ROUTE

if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id()); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++;

st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len;

}#endif

if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop;rt = skb_rtable(skb);if (rt->rt_type == RTN_MULTICAST) {

IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST, skb->len);} else if (rt->rt_type == RTN_BROADCAST)

IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST, skb->len);return dst_input(skb);

drop:kfree_skb(skb); return NET_RX_DROP;

}

Page 20: Linux Network Architecture Network Layer

2010/09/17 © by

ip_local_deliver(skb)

<kernel src>/net/ipv4/ip_input.c/* Deliver IP Packets to the higher protocol layers. */int ip_local_deliver(struct sk_buff *skb){

/* Reassemble IP fragments. */if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {

if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))return 0;

}return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,

ip_local_deliver_finish);}

Page 21: Linux Network Architecture Network Layer

2010/09/17 © by

ip_local_deliver_finish(skb)<kernel src>/net/ipv4/ip_input.c

static int ip_local_deliver_finish(struct sk_buff *skb){

struct net *net = dev_net(skb->dev);__skb_pull(skb, ip_hdrlen(skb));

/* Point into the IP datagram, just past the header. */skb_reset_transport_header(skb);rcu_read_lock();{

int protocol = ip_hdr(skb)->protocol;int hash, raw;const struct net_protocol *ipprot;

resubmit:raw = raw_local_deliver(skb, protocol);hash = protocol & (MAX_INET_PROTOS - 1);ipprot = rcu_dereference(inet_protos[hash]);

Page 22: Linux Network Architecture Network Layer

2010/09/17 © by

ip_local_deliver_finish(skb) (cont’ed)if (ipprot != NULL) {

int ret; if (!net_eq(net, &init_net) && !ipprot->netns_ok) { if (net_ratelimit())printk("%s: proto %d isn't netns-ready\n", __func__, protocol); kfree_skb(skb); goto out; } if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {

kfree_skb(skb); goto out; } nf_reset(skb); } ret = ipprot->handler(skb); if (ret < 0) { protocol = -ret; goto resubmit; } IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);} else {

Page 23: Linux Network Architecture Network Layer

2010/09/17 © by

ip_local_deliver_finish(skb) (cont’ed)

if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {

IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } } else IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); kfree_skb(skb);

}}

out:rcu_read_unlock();return 0;

}

Page 24: Linux Network Architecture Network Layer

2010/09/17 © by

dst_input(skb)

<net/dst.h>static inline int dst_input(struct sk_buff *skb) {

return skb_dst(skb)->input(skb);}

<linux/skbuff.h>static inline struct dst_entry *skb_dst(const struct sk_buff *skb){

return (struct dst_entry *)skb->_skb_dst;}static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst){

skb->_skb_dst = (unsigned long)dst;}

Page 25: Linux Network Architecture Network Layer

2010/09/17 © by

dst_output(skb)<net/dst.h>/* Output packet to network from transport. */static inline int dst_output(struct sk_buff *skb){

return skb_dst(skb)->output(skb);}

Page 26: Linux Network Architecture Network Layer

2010/09/17 © by

struct dst_entry<net/dst.h>struct dst_entry {

struct rcu_head rcu_head;struct dst_entry *child;struct net_device *dev;short error, obsolete;int flags;unsigned long expires;unsigned short header_len, trailer_len; /* space to reserve at tail */unsigned int rate_tokens;unsigned long rate_last; /* rate limiting for ICMP */struct dst_entry *path;struct neighbour *neighbour;struct hh_cache *hh;

#ifdef CONFIG_XFRMstruct xfrm_state *xfrm;

#else

Page 27: Linux Network Architecture Network Layer

2010/09/17 © by

struct dst_entry (cont’ed)void *__pad1;

#endifint (*input)(struct sk_buff*);int (*output)(struct sk_buff*);struct dst_ops *ops;u32 metrics[RTAX_MAX];

#ifdef CONFIG_NET_CLS_ROUTE__u32 tclassid;

#else__u32 __pad2;

#endif/* Align __refcnt to a 64 bytes alignment */

#ifdef CONFIG_64BITlong __pad_to_align_refcnt[1];

#endif

Page 28: Linux Network Architecture Network Layer

2010/09/17 © by

struct dst_entry (cont’ed)/* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */atomic_t __refcnt; /* client references */int __use;unsigned long lastuse;union {

struct dst_entry *next;struct rtable *rt_next;struct rt6_info *rt6_next;struct dn_route *dn_next;

};};

Page 29: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward(skb)

The primary task of ip_forward(skb) is to process a few conditions of the Internet Protocol (e.g., a packet's lifetime) and packet options. First, packets not marked with pkt_type == PACKET_HOST are deleted. Next, the reach of the packet is checked. If the value in its TTL field is 1 (before it is decremented), then the packet is deleted. RFC 791 specifies that, if such an action occurs, an ICMP packet has to be returned to the sender to inform the latter (ICMP_TIME_EXCEEDED).

Once a redirect message has been checked, if applicable, the socket buffer is checked to see if there is sufficient memory for the headroom. This means that the function skb_cow(skb, headroom) is used to check whether there is still sufficient space for the MAC header in the output network device (out_dev->hard_header_len). If this is not the case, then skb_realloc_headroom() creates sufficient space. Subsequently, the TTL field of the IP packet is decremented by one.

When the actual packet length (including the MAC header) is known, it is checked for whether it really fits into the frame format of the new output network device. If it is too long (skb->len > mtu), and if no fragmenting is allowed because the Don't-Fragment bit is set in the IP header, then the packet is discarded, and the ICMP message ICMP_FRAG_NEEDED is transmitted to the sender. In any case, the packet is not fragmented yet; fragmenting is delayed. The early test for such cases prevents potential Don't-Fragment candidates from running through the entire IP protocol-handling process, only to be dropped eventually.

Page 30: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward(skb)<kernel src>/net/ipv4/ip_forward.cint ip_forward(struct sk_buff *skb){

struct iphdr *iph;/* Our header */struct rtable *rt; /* Route we use */struct ip_options * opt = &(IPCB(skb)->opt);if (skb_warn_if_lro(skb)) goto drop;if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))

goto drop;if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))

return NET_RX_SUCCESS;if (skb->pkt_type != PACKET_HOST)

goto drop;skb_forward_csum(skb);/* According to the RFC, we must first decrease the TTL field. If

that reaches zero, we must reply an ICMP control message telling that the packet's lifetime expired. */if (ip_hdr(skb)->ttl <= 1) goto too_many_hops;if (!xfrm4_route_forward(skb)) goto drop;

Page 31: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward(skb) (cont’ed)rt = skb_rtable(skb);if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto sr_failed;if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&

(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS);icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,

htonl(dst_mtu(&rt->u.dst)));goto drop;

}/* We are about to mangle packet. Copy it! */if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))

goto drop;iph = ip_hdr(skb); /* Decrease ttl after skb cow done */ip_decrease_ttl(iph);

/* now generate an ICMP HOST REDIRECT giving the route calculated. */if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))

ip_rt_send_redirect(skb);

Page 32: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward(skb) (cont’ed)skb->priority = rt_tos2priority(iph->tos);

return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev, ip_forward_finish);

sr_failed:/* Strict routing permits no gatewaying */ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); goto drop;

too_many_hops:/* Tell the sender its packet died... */IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);

drop:kfree_skb(skb);return NET_RX_DROP;

}

Page 33: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward_finish(skb)<kernel src>/net/ipv4/ip_forward.c

static int ip_forward_finish(struct sk_buff *skb){

struct ip_options * opt = &(IPCB(skb)->opt);IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);if (unlikely(opt->optlen)) ip_forward_options(skb);return dst_output(skb);

}

ip_forward_finish(): This function has very little functionality of its own (unless FASTROUTE is enabled). Once the IP options, if used, have been processed in ip_forward_options(), the packet is passed on for output, where it is checked whether the packet has to be fragmented and, if applicable, the fragmentation is done. (This description names ip_send() as the function invoked for that step; in the code shown above, ip_forward_finish() calls dst_output(skb) instead.)

Page 34: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward_options(skb)<kernel src>/net/ipv4/ip_forward.cvoid ip_forward_options(struct sk_buff *skb){

struct ip_options * opt = &(IPCB(skb)->opt);unsigned char * optptr;struct rtable *rt = skb_rtable(skb);unsigned char *raw = skb_network_header(skb);if (opt->rr_needaddr) {

optptr = (unsigned char *)raw + opt->rr;ip_rt_get_source(&optptr[optptr[2]-5], rt); opt->is_changed = 1;

}if (opt->srr_is_hit) {

int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) {

if (srrptr + 3 > srrspace) break;if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) break;

}

Page 35: Linux Network Architecture Network Layer

2010/09/17 © by

ip_forward_options(skb) (cont’ed)

if (srrptr + 3 <= srrspace) {opt->is_changed = 1;ip_rt_get_source(&optptr[srrptr-1], rt);ip_hdr(skb)->daddr = rt->rt_dst;optptr[2] = srrptr+4;} else if (net_ratelimit())printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");if (opt->ts_needaddr) {optptr = raw + opt->ts;ip_rt_get_source(&optptr[optptr[2]-9], rt);opt->is_changed = 1;}

}if (opt->is_changed) {

opt->is_changed = 0;ip_send_check(ip_hdr(skb));

}}

Page 36: Linux Network Architecture Network Layer

2010/09/17 © by

ip_send_check(iph)

<kernel src>/net/ipv4/ip_output.c

/* Generate a checksum for an outgoing IP datagram. */__inline__ void ip_send_check(struct iphdr *iph){ iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);}

Page 37: Linux Network Architecture Network Layer

2010/09/17 © by

ip_queue_xmit(skb, ipfragok)<kernel src>/net/ipv4/ip_output.c

int ip_queue_xmit(struct sk_buff *skb, int ipfragok){

struct sock *sk = skb->sk;struct inet_sock *inet = inet_sk(sk);struct ip_options *opt = inet->opt;struct rtable *rt;struct iphdr *iph;rt = skb_rtable(skb);if (rt != NULL) goto packet_routed;/* Make sure we can route this packet. */rt = (struct rtable *)__sk_dst_check(sk, 0);if (rt == NULL) {

__be32 daddr;/* Use correct destination address if we have options. */daddr = inet->inet_daddr;if(opt && opt->srr) daddr = opt->faddr;

Page 38: Linux Network Architecture Network Layer

2010/09/17 © by

ip_queue_xmit(skb, ipfragok) (cont’ed)

{ struct flowi fl = { .oif = sk->sk_bound_dev_if,

.mark = sk->sk_mark,

.nl_u = { .ip4_u = { .daddr = daddr,.saddr = inet->inet_saddr,.tos = RT_CONN_FLAGS(sk) } },

.proto = sk->sk_protocol,

.flags = inet_sk_flowi_flags(sk),

.uli_u = { .ports = { .sport = inet->inet_sport, .dport = inet->inet_dport } } };

security_sk_classify_flow(sk, &fl);if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))

goto no_route;}

Page 39: Linux Network Architecture Network Layer

2010/09/17 © by

ip_queue_xmit(skb, ipfragok) (cont’ed)

sk_setup_caps(sk, &rt->u.dst);}skb_dst_set(skb, dst_clone(&rt->u.dst));

packet_routed:if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)

goto no_route;skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));skb_reset_network_header(skb);iph = ip_hdr(skb);*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)

iph->frag_off = htons(IP_DF);else

iph->frag_off = 0;iph->ttl = ip_select_ttl(inet, &rt->u.dst);iph->protocol = sk->sk_protocol;iph->saddr = rt->rt_src; iph->daddr = rt->rt_dst;

Page 40: Linux Network Architecture Network Layer

2010/09/17 © by

ip_queue_xmit(skb, ipfragok) (cont’ed)

if (opt && opt->optlen) {iph->ihl += opt->optlen >> 2;ip_options_build(skb, opt, inet->inet_daddr, rt, 0);

}ip_select_ident_more(iph, &rt->u.dst, sk,

(skb_shinfo(skb)->gso_segs ?: 1) - 1);skb->priority = sk->sk_priority;skb->mark = sk->sk_mark;return ip_local_out(skb);

no_route:IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);kfree_skb(skb);return -EHOSTUNREACH;

}

Page 41: Linux Network Architecture Network Layer

2010/09/17 © by

ip_local_out(skb)

<kernel src>/net/ipv4/ip_output.c

int ip_local_out(struct sk_buff *skb){

int err;err = __ip_local_out(skb);if (likely(err == 1)) err = dst_output(skb);return err;

}EXPORT_SYMBOL_GPL(ip_local_out);

Page 42: Linux Network Architecture Network Layer

2010/09/17 © by

__ip_local_out(skb)<kernel src>/net/ipv4/ip_output.c

int __ip_local_out(struct sk_buff *skb){

struct iphdr *iph = ip_hdr(skb);iph->tot_len = htons(skb->len);ip_send_check(iph);return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb,

NULL, skb_dst(skb)->dev, dst_output);}

Page 43: Linux Network Architecture Network Layer

2010/09/17 © by

ip_output(skb)<kernel src>/net/ipv4/ip_output.c

int ip_output(struct sk_buff *skb){

struct net_device *dev = skb_dst(skb)->dev;IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);skb->dev = dev;skb->protocol = htons(ETH_P_IP);return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev, ip_finish_output,!(IPCB(skb)->flags & IPSKB_REROUTED));

}

Page 44: Linux Network Architecture Network Layer

2010/09/17 © by

ip_finish_output(skb)<kernel src>/net/ipv4/ip_output.c

static int ip_finish_output(struct sk_buff *skb){#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)

/* Policy lookup after SNAT yielded a new policy */if (skb_dst(skb)->xfrm != NULL) {

IPCB(skb)->flags |= IPSKB_REROUTED;return dst_output(skb);

}#endif

if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))return ip_fragment(skb, ip_finish_output2);

elsereturn ip_finish_output2(skb);

}

Page 45: Linux Network Architecture Network Layer

2010/09/17 © by

ip_finish_output2(skb)<kernel src>/net/ipv4/ip_output.c

static inline int ip_finish_output2(struct sk_buff *skb){

struct dst_entry *dst = skb_dst(skb);struct rtable *rt = (struct rtable *)dst;struct net_device *dev = dst->dev;unsigned int hh_len = LL_RESERVED_SPACE(dev);if (rt->rt_type == RTN_MULTICAST) {

IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);} else if (rt->rt_type == RTN_BROADCAST)

IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);/* Be paranoid, rather than too clever. */if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {

struct sk_buff *skb2;skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));

Page 46: Linux Network Architecture Network Layer

2010/09/17 © by

ip_finish_output2(skb) (cont’ed)

if (skb2 == NULL) {kfree_skb(skb);return -ENOMEM;

}if (skb->sk)

skb_set_owner_w(skb2, skb->sk);kfree_skb(skb);skb = skb2;

}if (dst->hh) return neigh_hh_output(dst->hh, skb);else if (dst->neighbour) return dst->neighbour->output(skb);if (net_ratelimit())

printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");kfree_skb(skb);return -EINVAL;

}

Page 47: Linux Network Architecture Network Layer

2010/09/17 © by

Netfilter hooks for connection tracking

Page 48: Linux Network Architecture Network Layer

2010/09/17 © by

NF_HOOK()<linux/netfilter.h>static inline intNF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,

struct net_device *in, struct net_device *out,int (*okfn)(struct sk_buff *))

{return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);

}static inline intNF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,

struct net_device *in, struct net_device *out, int (*okfn)(struct sk_buff *), int thresh)

{int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);if (ret == 1) ret = okfn(skb);return ret;

}

Page 49: Linux Network Architecture Network Layer

2010/09/17 © by

Arguments of NF_HOOK macro

pf (protocol family): This is the identifier of the protocol family: PF_INET for IP Version 4, PF_INET6 for IP Version 6.

hook: This is the hook identifier. All valid identifiers for each protocol family are defined in a header file (e.g., <linux/netfilter_ipv4.h>).

skb: This is a pointer to the sk_buff structure with the packet to be handled.

indev (input device): This is a pointer to the net_device structure of the network device that received the packet. It is set to NULL in the above example, because the packet is an outgoing packet.

outdev (output device): This is a pointer to the net_device structure of the network device that should be used by the packet to leave the local computer. In the above example, the device used has to be determined first by use of the routing table (rt).

okfn() (okay function): This function is invoked when all filter functions registered with this hook returned NF_ACCEPT, thereby okaying the packet's transit.

Page 50: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hook()<linux/netfilter.h>static inline int nf_hook(u_int8_t pf, unsigned int hook,

struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *))

{return nf_hook_thresh(pf, hook, skb, indev, outdev, okfn, INT_MIN);

}

static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh)

{#ifndef CONFIG_NETFILTER_DEBUG

if (list_empty(&nf_hooks[pf][hook]))return 1;

#endifreturn nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);

}

Page 51: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hook_thresh()<linux/netfilter.h>/** nf_hook_thresh - call a netfilter hook Returns 1 if the hook has allowed the packet to pass. The function

okfn must be invoked by the caller in this case. Any other return value indicates the packet has been consumed by the hook.

*/static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int thresh){#ifndef CONFIG_NETFILTER_DEBUG

if (list_empty(&nf_hooks[pf][hook])) return 1;#endif

return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);}

Page 52: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hook_slow()<kernel src>/net/netfilter/core.c

/* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */

int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh){

struct list_head *elem;unsigned int verdict; int ret = 0;rcu_read_lock();elem = &nf_hooks[pf][hook];

next_hook:verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,

outdev, &elem, okfn, hook_thresh);

Page 53: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hook_slow() (cont’ed)

if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1;} else if (verdict == NF_DROP) { kfree_skb(skb); ret = -EPERM;} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS))

goto next_hook;}rcu_read_unlock(); return ret;

}EXPORT_SYMBOL(nf_hook_slow);

Page 54: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hook_slow()<kernel src>/net/netfilter/core.c/* Returns 1 if okfn() needs to be executed by the caller, -EPERM for NF_DROP, 0 otherwise. */int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev,struct net_device *outdev, int (*okfn)(struct sk_buff *), int hook_thresh){

struct list_head *elem; unsigned int verdict; int ret = 0;rcu_read_lock(); /* We may already have this, but read-locks nest anyway */elem = &nf_hooks[pf][hook];

next_hook:verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, outdev, &elem, okfn, hook_thresh);if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1;} else if (verdict == NF_DROP) {

kfree_skb(skb); ret = -EPERM;} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,verdict >> NF_VERDICT_BITS))

goto next_hook;}rcu_read_unlock(); return ret;

}EXPORT_SYMBOL(nf_hook_slow);

Page 55: Linux Network Architecture Network Layer

2010/09/17 © by

nf_iterate()<kernel src>/net/netfilter/core.c

unsigned int nf_iterate(struct list_head *head,struct sk_buff *skb, unsigned int hook,const struct net_device *indev, const struct net_device *outdev,struct list_head **i, int (*okfn)(struct sk_buff *), int hook_thresh)

{unsigned int verdict;

/* The caller must not block between calls to this function because of risk of continuing from deleted element. */list_for_each_continue_rcu(*i, head) {

struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;if (hook_thresh > elem->priority) continue;

/* Optimization: we don't need to hold module reference here, since function can't sleep. --RR */

verdict = elem->hook(hook, skb, indev, outdev, okfn);if (verdict != NF_ACCEPT) {

Page 56: Linux Network Architecture Network Layer

2010/09/17 © by

nf_iterate() (cont’ed)

#ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", elem->hook, hook); continue; }

#endif if (verdict != NF_REPEAT) return verdict;

*i = (*i)->prev;}

}return NF_ACCEPT;

}

Page 57: Linux Network Architecture Network Layer

2010/09/17 © by

Netfilter hook identifier<linux/netfilter_ipv4.h>

NF_IP_PRE_ROUTING (0): Incoming packets pass this hook in the ip_rcv() function before they are processed by the routing code. Prior to that, only a few simple consistency checks with regard to the version, length, and checksum fields in the IP header are done. Meaningful opportunities to use this hook result whenever incoming packets should be caught before they are processed—for example, to detect certain types of denial-of-service attacks that operate on poorly built IP packets, or for address-translation mechanisms (NAT), or for accounting functions (counting of incoming packets).

NF_IP_LOCAL_IN (1): All incoming packets addressed to the local computer

pass this hook in the function ip_local_deliver(). At this point, the iptables module hooks the INPUT rules list into place to filter incoming data packets. This corresponds to the input rules list in ipchains.

Page 58: Linux Network Architecture Network Layer

2010/09/17 © by

Netfilter hook identifier (cont’ed)

NF_IP_FORWARD (2): All incoming packets not addressed to the local computer pass this hook in the function ip_forward()—that is, packets to be forwarded and leaving the computer over a different network interface. This includes any packet whose address was modified by NAT. At this point, the iptables module hooks the FORWARD rules list into place to filter forwarded data packets. This corresponds to the forward rules list in ipchains.

NF_IP_LOCAL_OUT (3): All outgoing packets created in the local computer pass this hook in the function ip_build_and_send_pkt(). At this point, the iptables module hooks the OUTPUT rules list into place to filter outgoing data packets. This corresponds to the output rules list in ipchains.

NF_IP_POST_ROUTING (4): This hook in the ip_finish_output() function represents the last chance to access all outgoing (forwarded or locally created) packets before they leave the computer over a network device. Like the NF_IP_PRE_ROUTING hook, this is a good place to integrate accounting functions.

Page 59: Linux Network Architecture Network Layer

2010/09/17 © by

nf_hookfn

The packet-filter functions that are actually hooked into the netfilter hooks are so-called hook functions of the type nf_hookfn. The parameters (except for the protocol family identifier) correspond exactly to those of the NF_HOOK macro.

<linux/netfilter.h>typedef unsigned int nf_hookfn(unsigned int hooknum,

struct sk_buff *skb, const struct net_device *in,const struct net_device *out, int (*okfn)(struct sk_buff *));

Page 60: Linux Network Architecture Network Layer

2010/09/17 © by

Return value of a packet-filter function

The return value of a packet-filter function specifies what should happen to the packet. These are defined in <linux/netfilter.h>.

NF_DROP (0): The active rules list processing is stopped, and the packet is dropped.

NF_ACCEPT (1): The packet is passed to the next packet filter function in the rules list. Once the end of the list has been reached, the packet is released by okfn() for further processing.

NF_STOLEN (2): The packet filter function withholds the packet for further processing, so that the active rules list processing is stopped. In contrast to NF_DROP, however, the packet does not have to be explicitly dropped.

NF_QUEUE (3): The function nf_queue() (net/core/netfilter.c) puts the packet in a queue from which it can be removed and processed (e.g., by a user space program). Subsequently, nf_reinject() has to be invoked to return the packet to the Linux kernel for further processing by netfilter.

NF_REPEAT (4): In contrast to NF_ACCEPT, rather than a continuation of processing at the next packet-filter function, the current filter function is invoked again.

Page 61: Linux Network Architecture Network Layer

2010/09/17 © by

nf_register_hook(), nf_unregister_hook()

nf_register_hook() and nf_unregister_hook() register and unregister a packet-filter function with the Linux kernel. The parameter passed is an nf_hook_ops structure, which includes all the information required.

<linux/netfilter.h>

/*
 * Descriptor for one packet-filter function; it is the parameter passed to
 * nf_register_hook() and nf_unregister_hook().
 */
struct nf_hook_ops {
	struct list_head list;	/* linkage in the list of hook descriptors */
	nf_hookfn *hook;	/* the packet-filter function itself */
	struct module *owner;
	u_int8_t pf;		/* protocol family identifier, e.g. PF_INET */
	unsigned int hooknum;	/* hook identifier, e.g. NF_IP_PRE_ROUTING */
	/* Hooks are ordered in ascending priority. */
	int priority;

};

Page 62: Linux Network Architecture Network Layer

2010/09/17 © by

struct nf_hook_ops

list: The nf_hook_ops structures are maintained in a linked list within the Linux kernel.

hook(): This is a pointer to the actual packet-filter function of the type nf_hookfn.

pf, hooknum: The protocol family identifier (e.g., PF_INET or PF_INET6) and the hook identifier (e.g., NF_IP_LOCAL_IN) are used to determine the hook for this packet-filter function.

priority: Packet-filter functions within the rules list of a hook are sorted by the priority field in ascending order, so that they will be invoked in this order when a packet transits. Priority values are defined as follows, e.g., in <linux/netfilter_ipv4.h>:

/*
 * Priorities for IPv4 netfilter hook functions. Hook functions are sorted by
 * this value in ascending order, so a numerically lower priority runs earlier.
 */
enum nf_ip_hook_priorities {
	NF_IP_PRI_FIRST = INT_MIN,
	NF_IP_PRI_CONNTRACK = -200,
	NF_IP_PRI_MANGLE = -150,
	NF_IP_PRI_NAT_DST = -100,
	NF_IP_PRI_FILTER = 0,
	NF_IP_PRI_NAT_SRC = 100,	/* upper case, like every other enumerator */
	NF_IP_PRI_LAST = INT_MAX,
};

Page 63: Linux Network Architecture Network Layer

2010/09/17 © by

First netfilter example module/* Sample code to install a Netfilter hook function that will * drop all incoming packets. */#define __KERNEL__#define MODULE#include <linux/kernel.h>#include <linux/module.h>#include <linux/netfilter.h>#include <linux/netfilter_ipv4.h>#include <linux/skbuff.h>

/* Hook descriptor that init_module() fills in and registers with netfilter. */
static struct nf_hook_ops nfho;

/*
 * Packet-filter function for the first example: it unconditionally returns
 * NF_DROP, so none of its parameters are inspected.
 */
unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { return NF_DROP; /* Drop ALL packets */ }

Page 64: Linux Network Architecture Network Layer

2010/09/17 © by

First netfilter example module (cont’ed)

/*
 * Module entry point: fill in the hook descriptor and register it.
 *
 * Returns the result of nf_register_hook(), so a failed registration makes
 * the module load fail instead of being silently reported as success.
 *
 * NOTE(review): the function is itself named init_module and is also passed
 * to module_init(); confirm that this builds on the targeted kernel version.
 */
static int __init init_module(void)
{
	/* Fill in our hook structure */
	nfho.hook = my_hookfunc;		/* Handler function */
	nfho.hooknum = NF_IP_PRE_ROUTING;	/* First hook for IPv4 */
	nfho.pf = PF_INET;
	nfho.priority = NF_IP_PRI_FIRST;	/* Make our function first */

	return nf_register_hook(&nfho);
}

/* Module exit point: unregister the hook descriptor set up by init_module(). */
static void __exit cleanup_module(void)
{
	nf_unregister_hook(&nfho);
}

module_init(init_module);
module_exit(cleanup_module);

Page 65: Linux Network Architecture Network Layer

2010/09/17 © by

Second netfilter example module//For any packet, get the ip header and check the protocol field//if the protocol number equal to UDP (17), log in var/log/messages//default action of module to let all packets through #include <linux/kernel.h>#include <linux/module.h>#include <linux/netfilter.h>#include <linux/netfilter_ipv4.h>#include <linux/skbuff.h>#include <linux/udp.h>#include <linux/ip.h>static struct nf_hook_ops nfho; //net filter hook option structstruct sk_buff *sock_buff;struct udphdr *udp_header; struct iphdr *ip_header; //ip header struct

Page 66: Linux Network Architecture Network Layer

2010/09/17 © by

Second netfilter example module (cont’ed)

unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *))

{sock_buff = *skb;

ip_header = (struct iphdr *)skb_network_header(sock_buff); if(!sock_buff) { return NF_ACCEPT; }

if (ip_header->protocol==17) { udp_header = (struct udphdr *)skb_transport_header(sock_buff); printk(KERN_INFO "got udp packet \n"); //log to /var/log/messages return NF_DROP; } return NF_ACCEPT; }

Page 67: Linux Network Architecture Network Layer

2010/09/17 © by

Second netfilter example module (cont’ed)

/*
 * Module entry point: fill in the hook descriptor and register it.
 *
 * Returns the result of nf_register_hook(), so a failed registration makes
 * the module load fail instead of being silently reported as success.
 */
static int __init init_module(void)
{
	nfho.hook = my_hookfunc;
	nfho.hooknum = NF_IP_PRE_ROUTING;
	nfho.pf = PF_INET;
	nfho.priority = NF_IP_PRI_FIRST;

	return nf_register_hook(&nfho);
}

/* Module exit point: unregister the hook descriptor set up by init_module(). */
static void __exit cleanup_module(void)
{
	nf_unregister_hook(&nfho);
}

module_init(init_module);
module_exit(cleanup_module);

Page 68: Linux Network Architecture Network Layer

2010/09/17 © by

Third netfilter example module

/* Sample code to install a Netfilter hook function that will drop all incoming packets from an IP address we specify */#define __KERNEL__#define MODULE#include <linux/kernel.h>#include <linux/module.h>#include <linux/netfilter.h>#include <linux/netfilter_ipv4.h>#include <linux/skbuff.h>#include <linux/udp.h>#include <linux/ip.h>

/* The structure used to register filter function */
static struct nf_hook_ops nfho;

/*
 * IP address we want to drop packets from, in network byte order.
 * The bytes are written as hex escapes so the literal is exactly the four
 * address octets 0x7f 0x00 0x00 0x01 (127.0.0.1).
 */
static unsigned char *drop_ip = "\x7f\x00\x00\x01"; /* 127.0.0.1 */

Page 69: Linux Network Architecture Network Layer

2010/09/17 © by

Third netfilter example module (cont’ed)

/*
 * This is the hook function itself.
 *
 * Compares the source address of the packet's IP header with the four bytes
 * at drop_ip. On a match it logs the dotted-quad address and returns NF_DROP;
 * otherwise it returns NF_ACCEPT.
 */
unsigned int my_hookfunc(unsigned int hooknum, struct sk_buff **skb,
			 const struct net_device *in,
			 const struct net_device *out,
			 int (*okfn)(struct sk_buff *))
{
	struct sk_buff *sb = *skb;

	if (sb->nh.iph->saddr == *(unsigned int *)drop_ip) {
		/* Each %d gets one address byte, so the first argument is *drop_ip. */
		printk("Dropped packet from... %d.%d.%d.%d\n",
		       *drop_ip, *(drop_ip + 1), *(drop_ip + 2), *(drop_ip + 3));
		return NF_DROP;
	} else {
		return NF_ACCEPT;
	}
}

Page 70: Linux Network Architecture Network Layer

2010/09/17 © by

Third netfilter example module (cont’ed)

/*
 * Module entry point: fill in the hook descriptor and register it.
 *
 * Returns the result of nf_register_hook(), so a failed registration makes
 * the module load fail instead of being silently reported as success.
 */
static int __init init_module(void)
{
	nfho.hook = my_hookfunc;
	nfho.hooknum = NF_IP_PRE_ROUTING;	/* First hook for IPv4 */
	nfho.pf = PF_INET;
	nfho.priority = NF_IP_PRI_FIRST;	/* Make our function first */

	return nf_register_hook(&nfho);
}

/* Cleanup routine */
static void __exit cleanup_module(void)
{
	nf_unregister_hook(&nfho);
}

module_init(init_module);
module_exit(cleanup_module);

Page 71: Linux Network Architecture Network Layer

2010/09/17 © by

The module interface of the connection-tracking module is located in the file net/ipv4/netfilter/ip_conntrack_standalone.c. The file net/ipv4/netfilter/ip_conntrack_core.c contains the actual connection-tracking functionality. The connection-tracking module hooks itself into the netfilter hooks NF_IP_PRE_ROUTING and NF_IP_LOCAL_OUT with very high priority (the NF_IP_PRI_CONNTRACK is set to -200 in <linux/netfilter_ipv4.h>).

<linux/netfilter_ipv4.h>enum nf_ip_hook_priorities {

NF_IP_PRI_FIRST = INT_MIN,NF_IP_PRI_CONNTRACK_DEFRAG = -400, NF_IP_PRI_RAW = -300,NF_IP_PRI_SELINUX_FIRST = -225, NF_IP_PRI_CONNTRACK = -200,NF_IP_PRI_MANGLE = -150, NF_IP_PRI_NAT_DST = -100, NF_IP_PRI_FILTER = 0, NF_IP_PRI_SECURITY = 50,NF_IP_PRI_NAT_SRC = 100, NF_IP_PRI_SELINUX_LAST = 225,NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,NF_IP_PRI_LAST = INT_MAX, };

Page 72: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input()<kernel src>/net/ipv4/route.c

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev)

{struct rtable * rth;unsigned hash;int iif = dev->ifindex;struct net *net;net = dev_net(dev);if (!rt_caching(net)) goto skip_cache;tos &= IPTOS_RT_MASK;hash = rt_hash(daddr, saddr, iif, rt_genid(net));rcu_read_lock();

Page 73: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input() (cont’ed) for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;

rth = rcu_dereference(rth->u.dst.rt_next)) { if (((rth->fl.fl4_dst ^ daddr) | (rth->fl.fl4_src ^ saddr) | (rth->fl.iif ^ iif) | rth->fl.oif | (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search);

} rcu_read_unlock();

Page 74: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input() (cont’ed)skip_cache:

/* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting network acquires a lot of useless route cache entries, sort of SDR messages from all the world. Now we try to get rid of them. Really, provided software IP multicast filter is organized reasonably (at least, hashed), it does not result in a slowdown comparing with route cache reject entries. Note, that multicast routers are not affected, because route cache entry is created eventually. */if (ipv4_is_multicast(daddr)) {

struct in_device *in_dev;rcu_read_lock();

Page 75: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input() (cont’ed)if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, ip_hdr(skb)->protocol); if (our

#ifdef CONFIG_IP_MROUTE||

(!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))

#endif ) {

rcu_read_unlock();return ip_route_input_mc(skb, daddr, saddr, tos, dev, our);

}}rcu_read_unlock(); return -EINVAL;

} return ip_route_input_slow(skb, daddr, saddr, tos, dev);}

Page 76: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow()<kernel src>/net/ipv4/route.cstatic int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,

u8 tos, struct net_device *dev){

struct fib_result res;struct in_device *in_dev = in_dev_get(dev);struct flowi fl = { .nl_u = { .ip4_u =

{ .daddr = daddr, .saddr = saddr,.tos = tos, .scope = RT_SCOPE_UNIVERSE,

} }, .mark = skb->mark, .iif = dev->ifindex };

unsigned flags = 0; u32 itag = 0;struct rtable * rth;unsigned hash;__be32 spec_dst;int err = -EINVAL, free_res = 0;

Page 77: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)struct net * net = dev_net(dev);

/* IP on this device is disabled. */if (!in_dev) goto out;

/*Check for the most weird martians, which can be not detected by fib_lookup. */if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_loopback(saddr)) goto martian_source;if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))

goto brd_input;/* Accept zero addresses only to limited broadcast;

I even do not know to fix it or not. Waiting for complains :-) */if (ipv4_is_zeronet(saddr)) goto martian_source;if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))

goto martian_destination;/* Now we are ready to route packet. */if ((err = fib_lookup(net, &fl, &res)) != 0) {

if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach;goto no_route;

}

Page 78: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

free_res = 1;RT_CACHE_STAT_INC(in_slow_tot);if (res.type == RTN_BROADCAST) goto brd_input;if (res.type == RTN_LOCAL) {

int result;result = fib_validate_source(saddr, daddr, tos,

net->loopback_dev->ifindex, dev, &spec_dst, &itag, skb->mark);

if (result < 0) goto martian_source;if (result) flags |= RTCF_DIRECTSRC;spec_dst = daddr;goto local_input;

}if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach;if (res.type != RTN_UNICAST) goto martian_destination;err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);

Page 79: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

done:in_dev_put(in_dev);if (free_res) fib_res_put(&res);

out: return err;brd_input:

if (skb->protocol != htons(ETH_P_IP)) goto e_inval;if (ipv4_is_zeronet(saddr))

spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);else {

err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag, skb->mark);

if (err < 0) goto martian_source;if (err) flags |= RTCF_DIRECTSRC;

}flags |= RTCF_BROADCAST;res.type = RTN_BROADCAST;RT_CACHE_STAT_INC(in_brd);

Page 80: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

local_input:rth = dst_alloc(&ipv4_dst_ops);if (!rth) goto e_nobufs;rth->u.dst.output= ip_rt_bug;rth->u.dst.obsolete = -1;rth->rt_genid = rt_genid(net);atomic_set(&rth->u.dst.__refcnt, 1);rth->u.dst.flags= DST_HOST;if (IN_DEV_CONF_GET(in_dev, NOPOLICY))

rth->u.dst.flags |= DST_NOPOLICY;rth->fl.fl4_dst = daddr;rth->rt_dst = daddr;rth->fl.fl4_tos = tos;rth->fl.mark = skb->mark;rth->fl.fl4_src = saddr;rth->rt_src = saddr;

#ifdef CONFIG_NET_CLS_ROUTErth->u.dst.tclassid = itag;

#endif

Page 81: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

rth->rt_iif =rth->fl.iif = dev->ifindex;rth->u.dst.dev = net->loopback_dev;dev_hold(rth->u.dst.dev);rth->idev = in_dev_get(rth->u.dst.dev);rth->rt_gateway = daddr;rth->rt_spec_dst= spec_dst;rth->u.dst.input= ip_local_deliver;rth->rt_flags = flags|RTCF_LOCAL;if (res.type == RTN_UNREACHABLE) {

rth->u.dst.input= ip_error;rth->u.dst.error= -err;rth->rt_flags &= ~RTCF_LOCAL;

}rth->rt_type = res.type;hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);goto done;

Page 82: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

no_route:RT_CACHE_STAT_INC(in_no_route);spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);res.type = RTN_UNREACHABLE;if (err == -ESRCH) err = -ENETUNREACH;goto local_input;

/* Do not cache martian addresses: they should be logged (RFC1812) */martian_destination:

RT_CACHE_STAT_INC(in_martian_dst);#ifdef CONFIG_IP_ROUTE_VERBOSE

if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",

&daddr, &saddr, dev->name);#endife_hostunreach:

err = -EHOSTUNREACH;goto done;

Page 83: Linux Network Architecture Network Layer

2010/09/17 © by

ip_route_input_slow() (cont’ed)

e_inval:err = -EINVAL;goto done;

e_nobufs:err = -ENOBUFS;goto done;

martian_source:ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);goto e_inval;

}

Page 84: Linux Network Architecture Network Layer

2010/09/17 © by

ip_handle_martian_source()static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) {

RT_CACHE_STAT_INC(in_martian_src);#ifdef CONFIG_IP_ROUTE_VERBOSE

if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {/* RFC1812, if source is martian, the only hint is MAC header*/

printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",&daddr, &saddr, dev->name);

if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; const unsigned char *p = skb_mac_header(skb); printk(KERN_WARNING "ll header: "); for (i = 0; i < dev->hard_header_len; i++, p++) { printk("%02x", *p); if (i < (dev->hard_header_len - 1)) printk(":"); } printk("\n");}

}#endif}

Page 85: Linux Network Architecture Network Layer

2010/09/17 © by

PF_RING architecture

Page 86: Linux Network Architecture Network Layer

2010/09/17 © by

PF_RINGPF_RING is a new type of socket based interface. It includes

three software modules. A kernel module called ‘PF_RING’, which is written as a new socket protocol type, handles all the socket buffers in both packet reception and transmission. A user-space library, ‘libpfring’, helps user applications access the underlying socket-based ring buffer management scheme. The third part is a set of example user applications that demonstrate how to use PF_RING.

Kernel module called PF_RING (pf_ring.h and pf_ring.c)
User library libpfring.a or libpfring.o (pfring.h and pfring.c)
Example user application programs such as pfcount.c

Page 87: Linux Network Architecture Network Layer

2010/09/17 © by

Some pf_ring.c global variablesstatic struct proto ring_proto;

static struct list_head ring_table;static u_int ring_table_size;static struct list_head ring_cluster_list;/* List of all devices on which PF_RING has been registered */static struct list_head ring_aware_device_list;/* List of all dna (direct nic access) devices */static struct list_head ring_dna_devices_list;static u_int dna_devices_list_size = 0;/* pf_ring.h #define MAX_NUM_DEVICES 256 */static struct list_head device_ring_list[MAX_NUM_DEVICES];static struct net_proto_family ring_family_ops = { .family = PF_RING, .create = ring_create, .owner = THIS_MODULE,};/* Dummy 'any' device */static struct net_device any_dev, none_dev;

Page 88: Linux Network Architecture Network Layer

2010/09/17 © by

struct proto<net/sock.h>/* Networking protocol blocks attached to sockets. socket layer -> transport layer

interface transport -> network interface is defined by struct inet_proto */struct proto {

void (*close)(struct sock *sk, long timeout);int (*connect)(struct sock *sk,struct sockaddr *uaddr, int addr_len);int (*disconnect)(struct sock *sk, int flags);struct sock * (*accept) (struct sock *sk, int flags, int *err);int (*ioctl)(struct sock *sk, int cmd, unsigned long arg);int (*init)(struct sock *sk);void (*destroy)(struct sock *sk);void (*shutdown)(struct sock *sk, int how);int (*setsockopt)(struct sock *sk, int level, int optname,

char __user *optval, unsigned int optlen);int (*getsockopt)(struct sock *sk, int level,

int optname, char __user *optval, int __user *option);

Page 89: Linux Network Architecture Network Layer

2010/09/17 © by

struct proto (cont’ed)#ifdef CONFIG_COMPAT

int (*compat_setsockopt)(struct sock *sk, int level,int optname, char __user *optval, unsigned int optlen);

int (*compat_getsockopt)(struct sock *sk, int level,int optname, char __user *optval, int __user *option);

#endifint (*sendmsg)(struct kiocb *iocb, struct sock *sk,

struct msghdr *msg, size_t len);int (*recvmsg)(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,

size_t len, int noblock, int flags, int *addr_len);int (*sendpage)(struct sock *sk, struct page *page,

int offset, size_t size, int flags);int (*bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len);int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb);/* Keeping track of sk's, looking them up, and port selection methods. */void (*hash)(struct sock *sk);void (*unhash)(struct sock *sk);int (*get_port)(struct sock *sk, unsigned short snum);

Page 90: Linux Network Architecture Network Layer

2010/09/17 © by

struct proto (cont’ed)#ifdef CONFIG_PROC_FS

unsigned int inuse_idx;#endif

void (*enter_memory_pressure)(struct sock *sk);atomic_t *memory_allocated;/* Current allocated memory. */struct percpu_counter *sockets_allocated;/* Current num of sockets. */int *memory_pressure;int *sysctl_mem;int *sysctl_wmem;int *sysctl_rmem;int max_header;struct kmem_cache *slab;unsigned int obj_size;int slab_flags;struct percpu_counter *orphan_count;struct request_sock_ops *rsk_prot;struct timewait_sock_ops *twsk_prot;

Page 91: Linux Network Architecture Network Layer

2010/09/17 © by

struct proto (cont’ed)

union {struct inet_hashinfo *hashinfo;struct udp_table *udp_table;struct raw_hashinfo *raw_hash;

} h;struct module *owner;char name[32];struct list_head node;

#ifdef SOCK_REFCNT_DEBUGatomic_t socks;

#endif};

Page 92: Linux Network Architecture Network Layer

2010/09/17 © by

struct ring_opt/* Ring options */struct ring_opt { u_int8_t ring_active, num_rx_channels; struct net_device *ring_netdev; u_short ring_pid; u_int32_t ring_id; char *appl_name; /* String that id the application bound to the socket */ packet_direction direction; /* Specify the capture direction for packets */ struct ring_opt *master_ring; /* Master Ring */ u_int8_t mmap_count; dna_device *dna_device; /* Direct NIC Access */ u_short cluster_id; /* Cluster, 0 = no cluster */ int32_t channel_id; /* Channel, -1 = any channel */ struct net_device *reflector_dev; /* Reflector device */ unsigned long order; /* Packet buffers */ void *ring_memory; /* Ring Slots */

Page 93: Linux Network Architecture Network Layer

2010/09/17 © by

struct ring_opt (cont’ed) u_int32_t bucket_len; FlowSlotInfo *slots_info; /* Points to ring_memory */ char *ring_slots; /* Points to ring_memory+sizeof(FlowSlotInfo) */ u_int32_t pktToSample, sample_rate; /* Packet Sampling */ struct sk_filter *bpfFilter; /* BPF Filter */ filtering_hash_bucket **filtering_hash; /* Filtering Rules */ u_int16_t num_filtering_rules; u_int8_t rules_default_accept_policy; /*1=default is accept,drop otherwise */ struct list_head rules; atomic_t num_ring_users;/* Locks */ wait_queue_head_t ring_slots_waitqueue; rwlock_t ring_index_lock, ring_rules_lock; u_int insert_page_id, insert_slot_id;/* Indexes (Internal) */ do_handle_filtering_hash_bucket handle_hash_rule;/* Function pointer */};

Page 94: Linux Network Architecture Network Layer

2010/09/17 © by

struct pfring_hooks/* Hack to jump from a device directly to PF_RING */

/*
 * Table of PF_RING entry points. ring_notifier() stores a pointer to an
 * instance of this structure in a device's pfring_ptr and later checks
 * magic == PF_RING before removing the device from the ring list.
 */
struct pfring_hooks {
  u_int32_t magic; /* Should be set to PF_RING and be the first one */
  unsigned int *transparent_mode;
  handle_ring_skb ring_handler;
  handle_ring_buffer buffer_ring_handler;
  handle_add_hdr_to_ring buffer_add_hdr_to_ring;
  register_pfring_plugin pfring_registration;
  unregister_pfring_plugin pfring_unregistration;
  handle_ring_dna_device ring_dna_device_handler;
  read_device_pfring_free_slots pfring_free_device_slots;
};

Page 95: Linux Network Architecture Network Layer

2010/09/17 © by

Global variable ring_hooks/* pf_ring.h */#define PF_RING 27 /* Packet Ring */#define SOCK_RING PF_RING

/* pf_ring.c */
/*
 * The hook table that ring_notifier() attaches to each registered device.
 * pfring_free_device_slots has no initializer here, so as a member of a
 * static object it starts out zero-initialized.
 */
static struct pfring_hooks ring_hooks = {
  .magic = PF_RING,
  .transparent_mode = &transparent_mode,
  .ring_handler = skb_ring_handler,
  .buffer_ring_handler = buffer_ring_handler,
  .buffer_add_hdr_to_ring = add_hdr_to_ring,
  .pfring_registration = register_plugin,
  .pfring_unregistration = unregister_plugin,
  .ring_dna_device_handler = dna_device_handler,
};

Page 96: Linux Network Architecture Network Layer

2010/09/17 © by

ring_init()kernel/pf_ring.cstatic int __init ring_init(void){ int i, rc; if((rc = proto_register(&ring_proto, 0)) != 0) return(rc); INIT_LIST_HEAD(&ring_table); INIT_LIST_HEAD(&ring_cluster_list); INIT_LIST_HEAD(&ring_aware_device_list); INIT_LIST_HEAD(&ring_dna_devices_list); for (i = 0; i < MAX_NUM_DEVICES; i++) INIT_LIST_HEAD(&device_ring_list[i]); memset(&any_dev, 0, sizeof(any_dev)); strcpy(any_dev.name, "any"); memset(&none_dev, 0, sizeof(none_dev)); strcpy(none_dev.name, "none"); ring_proc_init(); sock_register(&ring_family_ops); register_netdevice_notifier(&ring_netdev_notifier); /* Sanity check */ if(transparent_mode > driver2pf_ring_non_transparent) transparent_mode = standard_linux_path;

Page 97: Linux Network Architecture Network Layer

2010/09/17 © by

ring_init() (cont’ed)

printk("[PF_RING] Ring slots %d\n", num_slots); printk("[PF_RING] Slot version %d\n", RING_FLOWSLOT_VERSION); printk("[PF_RING] Capture TX %s\n", enable_tx_capture ? "Yes [RX+TX]" :

"No [RX only]"); printk("[PF_RING] Transparent Mode %d\n", transparent_mode); printk("[PF_RING] IP Defragment %s\n", enable_ip_defrag ? "Yes" : "No"); printk("[PF_RING] Initialized correctly\n"); register_device_handler(); pfring_enabled = 1; return 0;}

Page 98: Linux Network Architecture Network Layer

2010/09/17 © by

ring_proc_init()static void ring_proc_init(void){ ring_proc_dir = proc_mkdir("pf_ring",#if(LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24))

init_net.#endif

proc_net); if(ring_proc_dir) {#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)) ring_proc_dir->owner = THIS_MODULE;#endif ring_proc_dev_dir = proc_mkdir(PROC_DEV, ring_proc_dir); ring_proc = create_proc_read_entry(PROC_INFO, 0,

ring_proc_dir, ring_proc_get_info, NULL);

Page 99: Linux Network Architecture Network Layer

2010/09/17 © by

ring_proc_init() (cont’ed) ring_proc_plugins_info = create_proc_read_entry(PROC_PLUGINS_INFO, 0, ring_proc_dir,

ring_proc_get_plugin_info, NULL); if(!ring_proc || !ring_proc_plugins_info) printk("[PF_RING] unable to register proc file\n"); else {#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30)) ring_proc->owner = THIS_MODULE; ring_proc_plugins_info->owner = THIS_MODULE;#endif printk("[PF_RING] registered /proc/net/pf_ring/\n"); } } else printk("[PF_RING] unable to create /proc/net/pf_ring\n");}

Page 100: Linux Network Architecture Network Layer

2010/09/17 © by

INIT_LIST_HEAD()

<linux/list.h>

/* Node of a circular doubly linked list. */
struct list_head {
	struct list_head *next;
	struct list_head *prev;
};

/* Turn @list into an empty list: both links point back at the node itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	struct list_head *self = list;

	self->next = self;
	self->prev = self;
}

Page 101: Linux Network Architecture Network Layer

2010/09/17 © by

register_netdevice_notifier(nb)<linux/netdevice.h>

<kernel src>/net/core/dev.cint register_netdevice_notifier(struct notifier_block *nb){

struct net_device *dev; struct net_device *last;struct net *net; int err;rtnl_lock();err = raw_notifier_chain_register(&netdev_chain, nb);if (err) goto unlock;if (dev_boot_phase) goto unlock;for_each_net(net) {

for_each_netdev(net, dev) {err = nb->notifier_call(nb, NETDEV_REGISTER, dev);err = notifier_to_errno(err);if (err) goto rollback;if (!(dev->flags & IFF_UP)) continue;nb->notifier_call(nb, NETDEV_UP, dev);

}}

Page 102: Linux Network Architecture Network Layer

2010/09/17 © by

register_netdevice_notifier(nb) (cont’ed)

unlock:rtnl_unlock(); return err;

rollback:last = dev;for_each_net(net) { for_each_netdev(net, dev) {

if (dev == last) break;if (dev->flags & IFF_UP) { nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); nb->notifier_call(nb, NETDEV_DOWN, dev);}nb->notifier_call(nb, NETDEV_UNREGISTER, dev);nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);

}}raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock;

}EXPORT_SYMBOL(register_netdevice_notifier);

Page 103: Linux Network Architecture Network Layer

2010/09/17 © by

dev_add_pack(pt)<linux/netdevice.h><kernel src>/net/core/dev.c

/*
 * Add the packet handler @pt to a handler list under ptype_lock.
 *
 * A handler whose type is ETH_P_ALL goes on ptype_all. Any other handler goes
 * on the ptype_base bucket selected by its host-order type masked with
 * PTYPE_HASH_MASK.
 */
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *bucket;

	spin_lock_bh(&ptype_lock);
	if (pt->type != htons(ETH_P_ALL))
		bucket = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
	else
		bucket = &ptype_all;
	list_add_rcu(&pt->list, bucket);
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

Page 104: Linux Network Architecture Network Layer

2010/09/17 © by

ring_notifier()static struct notifier_block ring_netdev_notifier = { .notifier_call = ring_notifier,};

static int ring_notifier(struct notifier_block *this, unsigned long msg, void *data)

{ struct net_device *dev = data; struct pfring_hooks *hook; switch(msg) { case NETDEV_UP: break; case NETDEV_DOWN: break; case NETDEV_REGISTER:#ifdef RING_DEBUG printk("[PF_RING] packet_notifier(%s) [REGISTER][pfring_ptr=%p]\n",

dev->name, dev->pfring_ptr);#endif

Page 105: Linux Network Architecture Network Layer

2010/09/17 © by

ring_notifier() (cont’ed) if(dev->pfring_ptr == NULL) { dev->pfring_ptr = &ring_hooks; add_device_to_ring_list(dev); } break; case NETDEV_UNREGISTER:#ifdef RING_DEBUG printk("[PF_RING] packet_notifier(%s) [UNREGISTER][pfring_ptr=%p]\n",

dev->name, dev->pfring_ptr);#endif hook = (struct pfring_hooks*)dev->pfring_ptr; if(hook->magic == PF_RING) { remove_device_from_ring_list(dev); dev->pfring_ptr = NULL; } break; case NETDEV_CHANGE: /* Interface state change */ case NETDEV_CHANGEADDR: break;

Page 106: Linux Network Architecture Network Layer

2010/09/17 © by

ring_notifier() (cont’ed)case NETDEV_CHANGENAME: /* Rename interface ethX -> ethY */ { struct list_head *ptr, *tmp_ptr;#if defined(RING_DEBUG) printk("[PF_RING] device change name %s\n", dev->name);#endif list_for_each_safe(ptr, tmp_ptr, &ring_aware_device_list) {

ring_device_element *dev_ptr = list_entry(ptr, ring_device_element, list); if(dev_ptr->dev == dev) {

#if defined(RING_DEBUG) printk("[PF_RING] ==>> FOUND device change name %s\n", dev->name);#endif

dev_ptr->proc_entry->name = dev->name; break; } } } break;

default: printk("[PF_RING] packet_notifier(%s): unhandled message [msg=%lu][pfring_ptr=%p]\n", dev->name, msg, dev->pfring_ptr); break; } return NOTIFY_DONE;}

Page 107: Linux Network Architecture Network Layer

2010/09/17 © by

proto_register()<net/sock.h><kernel src>/net/core/sock.c

int proto_register(struct proto *prot, int alloc_slab){

if (alloc_slab) {prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,

SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL); if (prot->slab == NULL) {printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", prot->name);

goto out; } if (prot->rsk_prot != NULL) {

prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);

Page 108: Linux Network Architecture Network Layer

2010/09/17 © by

proto_register() (cont’ed) if (prot->rsk_prot->slab_name == NULL) goto out_free_sock_slab;

prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,

prot->rsk_prot->obj_size, 0,SLAB_HWCACHE_ALIGN, NULL);

if (prot->rsk_prot->slab == NULL) {printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",

prot->name);goto out_free_request_sock_slab_name;

}}if (prot->twsk_prot != NULL) {

prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

Page 109: Linux Network Architecture Network Layer

2010/09/17 © by

proto_register() (cont’ed)if (prot->twsk_prot->twsk_slab_name == NULL)

goto out_free_request_sock_slab;prot->twsk_prot->twsk_slab =

kmem_cache_create(prot->twsk_prot->twsk_slab_name,prot->twsk_prot->twsk_obj_size, 0,SLAB_HWCACHE_ALIGN | prot->slab_flags, NULL);

if (prot->twsk_prot->twsk_slab == NULL)goto out_free_timewait_sock_slab_name;

}}write_lock(&proto_list_lock);list_add(&prot->node, &proto_list);assign_proto_idx(prot);write_unlock(&proto_list_lock);return 0;

out_free_timewait_sock_slab_name:kfree(prot->twsk_prot->twsk_slab_name);

Page 110: Linux Network Architecture Network Layer

2010/09/17 © by

proto_register() (cont’ed)out_free_request_sock_slab:

if (prot->rsk_prot && prot->rsk_prot->slab) {kmem_cache_destroy(prot->rsk_prot->slab);prot->rsk_prot->slab = NULL;

}out_free_request_sock_slab_name:

if (prot->rsk_prot) kfree(prot->rsk_prot->slab_name);out_free_sock_slab:

kmem_cache_destroy(prot->slab); prot->slab = NULL;out:

return -ENOBUFS;}EXPORT_SYMBOL(proto_register);

Page 111: Linux Network Architecture Network Layer

2010/09/17 © by

sock_register()<linux/net.h><kernel src>/net/socket.c/*sock_register - add a socket protocol handler */int sock_register(const struct net_proto_family *ops){

int err;if (ops->family >= NPROTO) {

printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);return -ENOBUFS;

}spin_lock(&net_family_lock);if (net_families[ops->family]) err = -EEXIST;else {

net_families[ops->family] = ops; err = 0;}spin_unlock(&net_family_lock);

printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);return err;

}

Page 112: Linux Network Architecture Network Layer

2010/09/17 © by

register_device_handler(void)

/* Protocol hook */static struct packet_type prot_hook;

void register_device_handler(void) {if(transparent_mode != standard_linux_path) return; prot_hook.func = packet_rcv;prot_hook.type = htons(ETH_P_ALL);dev_add_pack(&prot_hook);

}

Page 113: Linux Network Architecture Network Layer

2010/09/17 © by

sk_alloc() <net/sock.h> <kernel src>/net/core/sock.c /** sk_alloc - All socket objects are allocated here * @net: the applicable net namespace * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance */struct sock *sk_alloc(struct net *net, int family, gfp_t priority,

struct proto *prot){

struct sock *sk;sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);if (sk) {

sk->sk_family = family;sk->sk_prot = sk->sk_prot_creator = prot;sock_lock_init(sk); sock_net_set(sk, get_net(net));atomic_set(&sk->sk_wmem_alloc, 1);

}return sk;

}EXPORT_SYMBOL(sk_alloc);

Page 114: Linux Network Architecture Network Layer

2010/09/17 © by

ring_create()static int ring_create(struct net *net,struct socket *sock, int protocol, int kern){ struct sock *sk; struct ring_opt *pfr; int err;#if defined(RING_DEBUG) printk("[PF_RING] ring_create()\n");#endif if(!capable(CAP_NET_ADMIN)) return -EPERM; if(sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if(protocol != htons(ETH_P_ALL)) return -EPROTONOSUPPORT; err = -ENOMEM;#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL);#else#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24))sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1);#else sk = sk_alloc(net, PF_INET, GFP_KERNEL, &ring_proto);#endif#endif

Page 115: Linux Network Architecture Network Layer

2010/09/17 © by

ring_create() (cont’ed) if(sk == NULL) goto out; sock->ops = &ring_ops; sock_init_data(sock, sk);#if(LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) sk_set_owner(sk, THIS_MODULE);#endif err = -ENOMEM; ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL)); if(!(pfr = ring_sk(sk))) { sk_free(sk); goto out; } memset(pfr, 0, sizeof(*pfr)); pfr->ring_active = 0; /* Activate as soon as somebody waits for pakts */ pfr->num_rx_channels = UNKNOWN_NUM_RX_CHANNELS; pfr->channel_id = RING_ANY_CHANNEL; pfr->bucket_len = DEFAULT_BUCKET_LEN; pfr->handle_hash_rule = handle_filtering_hash_bucket;

Page 116: Linux Network Architecture Network Layer

2010/09/17 © by

ring_create() (cont’ed) init_waitqueue_head(&pfr->ring_slots_waitqueue); rwlock_init(&pfr->ring_index_lock); rwlock_init(&pfr->ring_rules_lock); atomic_set(&pfr->num_ring_users, 0); INIT_LIST_HEAD(&pfr->rules); sk->sk_family = PF_RING; sk->sk_destruct = ring_sock_destruct; ring_insert(sk); pfr->master_ring = NULL; pfr->ring_netdev = &none_dev; /* Unbound socket */ pfr->sample_rate = 1; /* No sampling */ pfr->ring_pid = current->pid; pfr->ring_id = ring_id_serial++; ring_proc_add(pfr);#if defined(RING_DEBUG) printk("[PF_RING] ring_create(): created\n");#endif return(0); out: return err;}

Page 117: Linux Network Architecture Network Layer

2010/09/17 © by

packet_rcv(skb,dev,pt,orig_dev)

static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)

{ int rc; if(skb->pkt_type != PACKET_LOOPBACK) { rc = skb_ring_handler(skb,

(skb->pkt_type == PACKET_OUTGOING) ? 0 : 1,1, UNKNOWN_RX_CHANNEL, UNKNOWN_NUM_RX_CHANNELS);

} else rc = 0; kfree_skb(skb); return(rc);}

Page 118: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler()static int skb_ring_handler(struct sk_buff *skb,

u_char recv_packet, u_char real_skb /* 1=real skb, 0=faked skb */ , u_int8_t channel_id, u_int8_t num_rx_channels)

{ struct sock *skElement; int rc = 0, is_ip_pkt, displ; struct list_head *ptr; struct pfring_pkthdr hdr; struct sk_buff *skk = NULL, *orig_skb = skb;

Page 119: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed) if((!skb) ||((!enable_tx_capture) && (!recv_packet))) { /* An outgoing packet is about to be sent out but we decided not to handle transmitted packets. */

return(0); } if(recv_packet) { /* Hack for identifying a packet received by the e1000 */ if(real_skb) displ = SKB_DISPLACEMENT; else displ = 0; /* Received by the e1000 wrapper */ } else displ = 0; is_ip_pkt = parse_pkt(skb, displ, &hdr);

Page 120: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed)if(enable_ip_defrag && real_skb && is_ip_pkt && recv_packet && (ring_table_size > 0)) { struct sk_buff *cloned = NULL; struct iphdr *iphdr = NULL; skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_set_network_header(skb, ETH_HLEN - displ); iphdr = ip_hdr(skb); if(iphdr) { if(iphdr->frag_off & htons(IP_MF | IP_OFFSET)) {

if((cloned = skb_clone(skb, GFP_ATOMIC)) != NULL) { skk = ring_gather_frags(cloned); if(skk != NULL) { skb = skk; parse_pkt(skb, displ, &hdr); hdr.len = hdr.caplen = skb->len + displ; } else { return(0); /* mask rcvd fragments */ } }

} } }

Page 121: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed) if(skb->tstamp.tv64 == 0) __net_timestamp(skb); hdr.ts = ktime_to_timeval(skb->tstamp); hdr.len = hdr.caplen = skb->len + displ; /* Avoid the ring to be manipulated while playing with it */ read_lock_bh(&ring_mgmt_lock); /* [1] Check unclustered sockets */ list_for_each(ptr, &ring_table) { struct ring_opt *pfr; struct ring_element *entry; entry = list_entry(ptr, struct ring_element, list); skElement = entry->sk; pfr = ring_sk(skElement);

Page 122: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed) if( (pfr != NULL) && (pfr->ring_netdev != &none_dev)

&& (pfr->cluster_id == 0 )&& (pfr->ring_slots != NULL) && is_valid_skb_direction(pfr->direction, recv_packet)

&& ((pfr->ring_netdev == skb->dev) || (pfr->ring_netdev == &any_dev) /* Socket bound to 'any' */ || ((skb->dev->flags & IFF_SLAVE)

&& (pfr->ring_netdev == skb->dev->master)))) { /* We've found the ring where the packet can be stored */ int old_caplen = hdr.caplen; /* Keep old lenght */ hdr.caplen = min(hdr.caplen, pfr->bucket_len); add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ, channel_id, num_rx_channels); hdr.caplen = old_caplen; rc = 1; /* Ring found: we've done our job */ } }

Page 123: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed) /* [2] Check socket clusters */ list_for_each(ptr, &ring_cluster_list) { ring_cluster_element *cluster_ptr; struct ring_opt *pfr; cluster_ptr = list_entry(ptr, ring_cluster_element, list); if(cluster_ptr->cluster.num_cluster_elements > 0) { u_int skb_hash = hash_pkt_cluster(cluster_ptr, &hdr); u_short num_iterations; for(num_iterations = 0;

num_iterations < cluster_ptr->cluster.num_cluster_elements; num_iterations++) { skElement = cluster_ptr->cluster.sk[skb_hash]; if(skElement != NULL) { pfr = ring_sk(skElement);

Page 124: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed) if((pfr != NULL) && (pfr->ring_slots != NULL) && ((pfr->ring_netdev == skb->dev)

|| ((skb->dev->flags & IFF_SLAVE) && (pfr->ring_netdev == skb->dev->master)))

&& is_valid_skb_direction(pfr->direction, recv_packet) ) { FlowSlot *theSlot = get_insert_slot(pfr); if((theSlot == NULL) || (theSlot->slot_state == 0 /* Not full */)) { /* We've found the ring where the packet can be stored */ add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ,

channel_id, num_rx_channels); rc = 1; /* Ring found: we've done our job */ break; } }}

Page 125: Linux Network Architecture Network Layer

2010/09/17 © by

skb_ring_handler() (cont’ed)if(cluster_ptr->cluster.hashing_mode != cluster_round_robin) break;else

skb_hash = (skb_hash + 1) % cluster_ptr->cluster.num_cluster_elements; } } } /* Clustering */ read_unlock_bh(&ring_mgmt_lock);/* Fragment handling */ if(skk != NULL) kfree_skb(skk); if(rc == 1) { if(transparent_mode != driver2pf_ring_non_transparent) { rc = 0; } else { if(recv_packet && real_skb) {

kfree_skb(orig_skb); } } }return(rc);/* 0 = packet not handled */}

Page 126: Linux Network Architecture Network Layer

2010/09/17 © by

User space library: All library functions begin with the ‘pfring_’ prefix. A struct pfring in user space keeps all the information needed to use the underlying PF_RING module.

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len, struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet);

pfring* pfring_open(char *device_name, u_int8_t promisc, u_int32_t caplen, u_int8_t reentrant);

int pfring_bind(pfring *ring, char *device_name);

int pfring_read(pfring *ring, char* buffer, u_int buffer_len, struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet,

u_int8_t consume_packet_immediately);

Page 127: Linux Network Architecture Network Layer

2010/09/17 © by

struct pfringtypedef struct { /* DNA (Direct NIC Access) */

u_char dna_mapped_device; u_int32_t tot_dna_read_pkts, rx_reg; dna_device dna_dev; u_int32_t *rx_reg_ptr[MAX_NUM_RX_CHANNELS]; /* All devices */char *buffer, *slots, *device_name;int fd;FlowSlotInfo *slots_info; FlowSlot *last_slot_to_update;u_int page_id, slot_id, pkts_per_page;u_int poll_sleep; u_int8_t clear_promisc, reentrant; u_long num_poll_calls;pthread_spinlock_t spinlock;

} pfring;

Page 128: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_open()pfring* pfring_open(char *device_name, u_int8_t promisc,

u_int32_t caplen, u_int8_t _reentrant) { int err = 0; pfring *ring = (pfring*)malloc(sizeof(pfring)); if(ring == NULL) return(NULL); else memset(ring, 0, sizeof(pfring)); ring->reentrant = _reentrant; ring->fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL)); if(ring->fd > 0) { int rc; u_int memSlotsLen; if(caplen > MAX_CAPLEN) caplen = MAX_CAPLEN; setsockopt(ring->fd, 0, SO_RING_BUCKET_LEN, &caplen, sizeof(caplen)); if((device_name == NULL) || (strcmp(device_name, "none") == 0)) { rc = 0; /* No binding yet */ } else rc = pfring_bind(ring, device_name);

Page 129: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_open() (cont’ed) if(rc == 0) { ring->buffer = (char *)mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE,

MAP_SHARED, ring->fd, 0); if(ring->buffer == MAP_FAILED) {

printf("mmap() failed");free(ring); return(NULL); } ring->slots_info = (FlowSlotInfo *)ring->buffer; if(ring->slots_info->version != RING_FLOWSLOT_VERSION) {printf("Wrong RING version: kernel is %i, libpfring was compiled with %i\n",

ring->slots_info->version, RING_FLOWSLOT_VERSION); free(ring); return(NULL);

} memSlotsLen = ring->slots_info->tot_mem; munmap(ring->buffer, PAGE_SIZE); ring->buffer = (char *)mmap(NULL, memSlotsLen, PROT_READ|PROT_WRITE,

MAP_SHARED, ring->fd, 0); if(ring->buffer == MAP_FAILED) {

printf("mmap() failed"); free(ring); return(NULL); }

Page 130: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_open() (cont’ed)ring->slots_info = (FlowSlotInfo *)ring->buffer;ring->slots = (char *)(ring->buffer+sizeof(FlowSlotInfo));if(ring->slots_info->remove_idx >= ring->slots_info->tot_slots)

ring->slots_info->remove_idx = 0;ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;ring->device_name = strdup(device_name);

if(promisc) { if(set_if_promisc(device_name, 1) == 0) ring->clear_promisc = 1; } } else { close(ring->fd); err = -1; } } else { err = -1; free(ring); } if(err == 0) { if(ring->reentrant) pthread_spin_init(&ring->spinlock, PTHREAD_PROCESS_PRIVATE); return(ring); } else return(NULL);}

Page 131: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_bind()int pfring_bind(pfring *ring, char *device_name) { struct sockaddr sa; char *at; int32_t channel_id = -1; int rc = 0; if((device_name==NULL) || (strcmp(device_name, "none") ==0)) return(-1); at = strchr(device_name, '@'); if(at != NULL) { char *tok, *pos = NULL; at[0] = '\0';/* Syntax : ethX@1,5 channel 1 and 5, ethX@1-5 channel 1,2...5, ethX@1-3,5-7 channel 1,2,3,5,6,7 */ tok = strtok_r(&at[1], ",", &pos); channel_id = 0;

Page 132: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_bind() (cont’ed) while(tok != NULL) { char *dash = strchr(tok, '-'); int32_t min_val, max_val, i; if(dash) {

dash[0] = '\0'; min_val = atoi(tok); max_val = atoi(&dash[1]); } else min_val = max_val = atoi(tok); for(i = min_val; i <= max_val; i++) channel_id |= 1 << i; tok = strtok_r(NULL, ",", &pos); } } sa.sa_family = PF_RING; snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", device_name); rc = bind(ring->fd, (struct sockaddr *)&sa, sizeof(sa)); if(rc == 0) { if(channel_id != -1) { int rc = pfring_set_channel_id(ring, channel_id); if(rc != 0) printf("pfring_set_channel_id() failed: %d\n", rc); } } return(rc);}

Page 133: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_recv(): pfring_recv() is just a wrapper around the pfring_read() function.

int pfring_recv(pfring *ring, char* buffer, u_int buffer_len,struct pfring_pkthdr *hdr, u_int8_t wait_for_incoming_packet)

{ return(pfring_read(ring, buffer, buffer_len,

hdr, wait_for_incoming_packet, 1));}

Page 134: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read()int pfring_read(pfring *ring, char* buffer, u_int buffer_len, struct pfring_pkthdr *hdr,

u_int8_t wait_for_incoming_packet,u_int8_t consume_packet_immediately) { if(ring == NULL) return(-1); if(ring->reentrant) { /* Late packet consumers is not supported in multithreaded env. as threads can steal each other's packets */

consume_packet_immediately = 1; } if(ring->dna_mapped_device) { char *pkt = NULL; if(wait_for_incoming_packet) { if(ring->reentrant) pthread_spin_lock(&ring->spinlock); switch(ring->dna_dev.device_model) { case intel_e1000:

e1000_there_is_a_packet_to_read(ring, wait_for_incoming_packet); break;

default: return(0); } if(ring->reentrant) pthread_spin_unlock(&ring->spinlock); }

Page 135: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read() (cont’ed) switch(ring->dna_dev.device_model) { case intel_e1000: pkt=get_next_e1000_packet(ring,buffer,buffer_len,hdr); break; case intel_igb: pkt = NULL, hdr->len = 0; break; case intel_ixgbe: pkt = NULL, hdr->len = 0; break; } if(pkt && (hdr->len > 0)) { /* Set the (1) below to (0) for enabling packet parsing for DNA devices */ if(1) hdr->parsed_header_len = 0; else parse_pkt(buffer, hdr); return(1); } else return(0); } else { FlowSlot *slot; u_int32_t queuedPkts;#ifdef USE_ADAPTIVE_WAIT u_int32_t num_loops = 0;#endif if((ring == NULL) || (ring->buffer == NULL)) return(-1); if(ring->last_slot_to_update) pfring_notify(ring, REFLECT_PACKET_DEVICE_NONE);

Page 136: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read() (cont’ed) do_pfring_recv: if(ring->reentrant) pthread_spin_lock(&ring->spinlock); slot = (FlowSlot*)&ring->slots[ring->slots_info->remove_idx*ring->slots_info->slot_len];

if(ring->slots_info->tot_insert >= ring->slots_info->tot_read) queuedPkts = ring->slots_info->tot_insert - ring->slots_info->tot_read; else queuedPkts = ring->slots_info->tot_slots + ring->slots_info->tot_insert - ring->slots_info->tot_read;

if(queuedPkts && (slot->slot_state == 1 /* There's a packet to read */)) { char *bucket = (char*)&slot->bucket; struct pfring_pkthdr *_hdr = (struct pfring_pkthdr*)bucket; int bktLen = _hdr->caplen+_hdr->parsed_header_len; if(bktLen > buffer_len) bktLen = buffer_len-1; if(buffer && (bktLen > 0)) {

memcpy(buffer, &bucket[sizeof(struct pfring_pkthdr)], bktLen); bucket[bktLen] = '\0';

}

Page 137: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read() (cont’ed) if(ring->slots_info->remove_idx >= (ring->slots_info->tot_slots-1)) {

ring->slots_info->remove_idx = 0;ring->page_id = PAGE_SIZE, ring->slot_id = 0, ring->pkts_per_page = 0;

} else {ring->slots_info->remove_idx++;ring->pkts_per_page++, ring->slot_id += ring->slots_info->slot_len;

} if(hdr) memcpy(hdr, _hdr, sizeof(struct pfring_pkthdr)); ring->slots_info->tot_read++; if(consume_packet_immediately) {

ring->last_slot_to_update = NULL, slot->slot_state = 0; /* Empty slot */ } else {

/* We do not notify pf_ring that the packet has been read hence this slot will not be available for storing a new packet until we notify pf_ring */

ring->last_slot_to_update = slot; }

Page 138: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read() (cont’ed) if(ring->reentrant) pthread_spin_unlock(&ring->spinlock); return(1); } else { if(ring->reentrant) pthread_spin_unlock(&ring->spinlock); if(wait_for_incoming_packet) { struct pollfd pfd; int rc;#ifdef USE_ADAPTIVE_WAIT

/* Spin in userland for a while and if no packet arrives then it's time to poll the kernel. Only do poll() if there is no chance to avoid it, as a call to poll() is too costly */ if(num_loops < MAX_NUM_LOOPS) { num_loops++; if(num_loops % YIELD_MULTIPLIER) { sched_yield(); } }

#endif

Page 139: Linux Network Architecture Network Layer

2010/09/17 © by

pfring_read() (cont’ed)/* Sleep when nothing is happening */pfd.fd = ring->fd;pfd.events = POLLIN|POLLERR;pfd.revents = 0;errno = 0;rc = poll(&pfd, 1, -1);ring->num_poll_calls++;if(rc == -1) return(-1);else goto do_pfring_recv;

} } return(-1); /* Not reached */ }}

Page 140: Linux Network Architecture Network Layer

2010/09/17 © by

pfcount.c main() { /* Omitted.. argument processing codes */ if(device == NULL) device = DEFAULT_DEVICE; if(num_threads > MAX_NUM_THREADS) num_threads=MAX_NUM_THREADS; printf("Capturing from %s\n", device); /* hardcode: promisc=1, to_ms=500 */ promisc = 1; if(num_threads > 0) pthread_rwlock_init(&statsLock, NULL); if(!dna_mode) pd = pfring_open(device, promisc, snaplen, (num_threads > 0) ? 1 : 0);#ifdef ENABLE_DNA_SUPPORT else pd = pfring_open_dna(device, 0 /* we don't use threads */);#endif /* Omitted … check pd to see if pfring_open() error */ /* Omitted … set filtering rule */

Page 141: Linux Network Architecture Network Layer

2010/09/17 © by

pfcount.c (cont’ed) signal(SIGINT, sigproc); signal(SIGTERM, sigproc); signal(SIGINT, sigproc); if(!verbose) { signal(SIGALRM, my_sigalarm); alarm(ALARM_SLEEP); } if(dna_mode) num_threads = 1; else { if(num_threads > 0) wait_for_packet = 1; } if(!wait_for_packet) pfring_enable_ring(pd); if(num_threads > 1) { pthread_t my_thread; int i; for(i=1; i<num_threads; i++) pthread_create(&my_thread, NULL, packet_consumer_thread, (void*)i); } packet_consumer_thread(0); pfring_close(pd); sleep(3); return(0);}

Page 142: Linux Network Architecture Network Layer

2010/09/17 © by

packet_consumer_thread()void* packet_consumer_thread(void* _id) { while(1) { struct simple_stats { u_int64_t num_pkts, num_bytes; }; u_char buffer[2048]; struct simple_stats stats; struct pfring_pkthdr hdr; int rc; u_int len; if(do_shutdown) break; if(pfring_recv(pd, (char*)buffer, sizeof(buffer), &hdr, wait_for_packet) > 0) { if(do_shutdown) break; dummyProcesssPacket(&hdr, buffer); } if(0) { len = sizeof(stats); rc = pfring_get_filtering_rule_stats(pd, 5, (char*)&stats, &len); if(rc < 0) printf("pfring_get_filtering_rule_stats() failed [rc=%d]\n", rc); else { printf("[Pkts=%u][Bytes=%u]\n", (unsigned int)stats.num_pkts,

(unsigned int)stats.num_bytes); } } } return(NULL);}

Page 143: Linux Network Architecture Network Layer

2010/09/17 © by

Questions?