
  • The Linux Kernel Source Code

    Source code layout:

    /usr/src/linux-2.4.2
    /usr/src/linux-2.4.2/net/
    /usr/src/linux-2.4.2/net/core/
    /usr/src/linux-2.4.2/net/core/dev.c
    /usr/src/linux-2.4.2/net/ipv4/
    /usr/src/linux-2.4.2/net/ipv4/ip_output.c
    /usr/src/linux-2.4.2/net/ipv4/af_inet.c
    /usr/src/linux-2.4.2/net/ipv4/ip_input.c
    /usr/src/linux-2.4.2/net/ipv4/route.c
    /usr/src/linux-2.4.2/net/ipv4/ip_forward.c
    /usr/src/linux-2.4.2/net/sched/
    /usr/src/linux-2.4.2/net/sched/sch_generic.c
    /usr/src/linux-2.4.2/include/
    /usr/src/linux-2.4.2/include/linux/
    /usr/src/linux-2.4.2/include/linux/netdevice.h
    /usr/src/linux-2.4.2/include/linux/if_ether.h
    /usr/src/linux-2.4.2/include/linux/interrupt.h
    /usr/src/linux-2.4.2/include/linux/skbuff.h
    /usr/src/linux-2.4.2/include/linux/ip.h
    /usr/src/linux-2.4.2/include/net/
    /usr/src/linux-2.4.2/include/net/pkt_sched.h
    /usr/src/linux-2.4.2/include/net/ip.h
    /usr/src/linux-2.4.2/kernel/
    /usr/src/linux-2.4.2/kernel/softirq.c
    /usr/src/linux-2.4.2/drivers/
    /usr/src/linux-2.4.2/drivers/net/
    /usr/src/linux-2.4.2/drivers/net/eepro100.c

    1

  • When a network interface card receives an Ethernet frame that matches the card’s MAC address (or a broadcast), the NIC generates a hardware interrupt to the card driver. The driver handles the interrupt, allocates a skbuff (skb), and places the data frame into main memory. See drivers/net/eepro100.c, and the call to netif_rx.
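    The pattern the driver follows is the standard one for Ethernet drivers. A minimal sketch (not the real eepro100 code; rx_one_frame, buf, and len are hypothetical names for a frame the hardware has already placed in a driver buffer):

    #include <linux/netdevice.h>
    #include <linux/etherdevice.h>
    #include <linux/skbuff.h>
    #include <linux/string.h>

    static void rx_one_frame(struct net_device *dev, unsigned char *buf, int len)
    {
            struct sk_buff *skb;

            skb = dev_alloc_skb(len + 2);             /* room for the frame plus alignment pad     */
            if (skb == NULL)
                    return;                           /* out of memory: silently drop the frame    */
            skb->dev = dev;
            skb_reserve(skb, 2);                      /* align the IP header on a 16-byte boundary */
            memcpy(skb_put(skb, len), buf, len);      /* copy the frame into the skb data area     */
            skb->protocol = eth_type_trans(skb, dev); /* fills in skb->mac and the protocol id     */
            netif_rx(skb);                            /* hand the skb to the network core          */
    }

    The real driver below is more involved only because it manages a DMA receive ring and avoids the copy for large packets.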

    2

  • 1 static int2 speedo rx(struct net device * dev)3 {4 struct speedo private * sp = (struct speedo private * )dev->priv;5 int entry = sp->cur rx % RX RING SIZE;6 int rx work limit = sp->dirty rx + RX RING SIZE - sp->cur rx;7 int alloc ok = 1;89 if (speedo debug > 4)

    10 printk(KERN DEBUG " In speedo rx(). \n");11 / * If we own the next entry, it’s a new packet. Send it up. * /12 while (sp->rx ringp[entry] != NULL) {13 int status;14 int pkt len;1516 pci dma sync single(sp->pdev, sp->rx ring dma[entry],17 sizeof(struct RxFD), PCI DMAFROMDEVICE);18 status = le32 to cpu(sp->rx ringp[entry]->status);19 pkt len = le32 to cpu(sp->rx ringp[entry]->count) & 0x3fff;2021 if (!(status & RxComplete))22 break;2324 if (--rx work limit < 0)25 break;2627 / * Check for a rare out-of-memory case: the current buffer is28 the last buffer allocated in the RX ring. --SAW * /29 if (sp->last rxf == sp->rx ringp[entry]) {30 / * Postpone the packet. It’ll be reaped at an interrupt when thi s31 packet is no longer the last packet in the ring. * /32 if (speedo debug > 2)33 printk(KERN DEBUG "%s: RX packet postponed! \n",34 dev->name);35 sp->rx ring state |= RrPostponed;36 break;37 }3839 if (speedo debug > 4)40 printk(KERN DEBUG " speedo rx() status %8.8x len %d. \n", status,41 pkt len);42 if ((status & (RxErrTooBig|RxOK|0x0f90)) != RxOK) {43 if (status & RxErrTooBig)44 printk(KERN ERR "%s: Ethernet frame overran the Rx buffer, "45 "status %8.8x! \n", dev->name, status);46 else if (! (status & RxOK)) {

    Program eepro100.c

    3

  • 47 / * There was a fatal error. This * should * be impossible. * /48 sp->stats.rx errors++;49 printk(KERN ERR "%s: Anomalous event in speedo rx(), "50 "status %8.8x. \n",51 dev->name, status);52 }53 } else {54 struct sk buff * skb;5556 / * Check if the packet is long enough to just accept without57 copying to a properly sized skbuff. * /58 if (pkt len < rx copybreak59 && (skb = dev alloc skb(pkt len + 2)) != 0) {60 skb->dev = dev;61 skb reserve(skb, 2); / * Align IP on 16 byte boundaries * /62 / * ’skb put()’ points to the start of sk buff data area. * /63 pci dma sync single(sp->pdev, sp->rx ring dma[entry],64 sizeof(struct RxFD) + pkt len, PCI DMAFROMDEVICE);6566 #if 1 || USE IP CSUM67 / * Packet is in one chunk -- we can copy + cksum. * /68 eth copy and sum(skb, sp->rx skbuff[entry]->tail, pkt len, 0);69 skb put(skb, pkt len);70 #else71 memcpy(skb put(skb, pkt len), sp->rx skbuff[entry]->tail,72 pkt len);73 #endif74 } else {75 / * Pass up the already-filled skbuff. * /76 skb = sp->rx skbuff[entry];77 if (skb == NULL) {78 printk(KERN ERR "%s: Inconsistent Rx descriptor chain. \n",79 dev->name);80 break;81 }82 sp->rx skbuff[entry] = NULL;83 skb put(skb, pkt len);84 sp->rx ringp[entry] = NULL;85 pci unmap single(sp->pdev, sp->rx ring dma[entry],86 PKT BUF SZ + sizeof(struct RxFD), PCI DMAFROMDEVICE);87 }88 skb->protocol = eth type trans(skb, dev);89 netif rx(skb);90 sp->stats.rx packets++;91 sp->stats.rx bytes += pkt len;92 }

    Program eepro100.c (continued)

    4

  • 93 entry = (++sp->cur rx) % RX RING SIZE;94 sp->rx ring state &= ˜RrPostponed;95 / * Refill the recently taken buffers.96 Do it one-by-one to handle traffic bursts better. * /97 if (alloc ok && speedo refill rx buf(dev, 0) == -1)98 alloc ok = 0;99 }

    100101 / * Try hard to refill the recently taken buffers. * /102 speedo refill rx buffers(dev, 1);103104 sp->last rx time = jiffies;105106 return 0;107 }

    Program eepro100.c (continued)

    The socket buffer (skbuff, or skb) is the generic buffer that Linux uses to handle all network packets. Pointers to skbuffs are allocated and passed up and down the protocol stack; see include/linux/skbuff.h. Note the use of the C/C++ union construct, which lets several differently typed pointers share the same memory location. Also see the definition of the IP header in include/linux/ip.h.
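    To make the union idea concrete, here is a small illustration (peek_headers is a hypothetical helper; it assumes skb->data currently points at the IP header of an IPv4/TCP packet):

    #include <linux/kernel.h>
    #include <linux/skbuff.h>
    #include <linux/ip.h>
    #include <linux/tcp.h>

    static void peek_headers(struct sk_buff *skb)
    {
            struct iphdr  *iph;
            struct tcphdr *th;

            skb->nh.raw = skb->data;                 /* network header starts here             */
            iph = skb->nh.iph;                       /* same address, viewed as an IP header   */

            skb->h.raw = skb->nh.raw + iph->ihl * 4; /* transport header follows the IP header */
            th = skb->h.th;                          /* same address, viewed as a TCP header   */

            printk(KERN_DEBUG "saddr %08x sport %u\n",
                   ntohl(iph->saddr), ntohs(th->source));
    }

    Because h, nh, and mac are unions, each layer stores exactly one pointer per header while still getting a typed view of it.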

    5

  • 1 / *2 * Definitions for the ’struct sk buff’ memory handlers.3 *4 * Authors:5 * Alan Cox, 6 * Florian La Roche, 7 *8 * This program is free software; you can redistribute it and/o r9 * modify it under the terms of the GNU General Public License

    10 * as published by the Free Software Foundation; either versio n11 * 2 of the License, or (at your option) any later version.12 * /1314 #ifndef LINUX SKBUFFH15 #define LINUX SKBUFFH1617 #include 18 #include 19 #include 20 #include 2122 #include 23 #include 24 #include 2526 #define HAVE ALLOCSKB /* For the drivers to know * /27 #define HAVE ALIGNABLE SKB /* Ditto 8) * /28 #define SLAB SKB /* Slabified skbuffs * /2930 #define CHECKSUM NONE 031 #define CHECKSUM HW 132 #define CHECKSUM UNNECESSARY 23334 #ifdef i38635 #define NET CALLER(arg) ( * (((void ** )&arg)-1))36 #else37 #define NET CALLER(arg) builtin return address(0)38 #endif3940 #ifdef CONFIG NETFILTER41 struct nf conntrack {42 atomic t use;43 void ( * destroy)(struct nf conntrack * );44 };4546 struct nf ct info {

    Program skbuff.h

    6

  • 47 struct nf conntrack * master;48 };49 #endif5051 struct sk buff head {52 / * These two members must be first. * /53 struct sk buff * next;54 struct sk buff * prev;5556 u32 qlen;57 spinlock t lock;58 };5960 struct sk buff {61 / * These two members must be first. * /62 struct sk buff * next; / * Next buffer in list * /63 struct sk buff * prev; / * Previous buffer in list * /6465 struct sk buff head * list; / * List we are on * /66 struct sock * sk; / * Socket we are owned by * /67 struct timeval stamp; / * Time we arrived * /68 struct net device * dev; / * Device we arrived on/are leaving by * /6970 / * Transport layer header * /71 union72 {73 struct tcphdr * th;74 struct udphdr * uh;75 struct icmphdr * icmph;76 struct igmphdr * igmph;77 struct iphdr * ipiph;78 struct spxhdr * spxh;79 unsigned char * raw;80 } h;8182 / * Network layer header * /83 union84 {85 struct iphdr * iph;86 struct ipv6hdr * ipv6h;87 struct arphdr * arph;88 struct ipxhdr * ipxh;89 unsigned char * raw;90 } nh;9192 / * Link layer header * /

    Program skbuff.h (continued)

    7

  • 93 union94 {95 struct ethhdr * ethernet;96 unsigned char * raw;97 } mac;9899 struct dst entry * dst;

    100101 / *102 * This is the control buffer. It is free to use for every103 * layer. Please put your private variables there. If you104 * want to keep them across layers you have to do a skb clone()105 * first. This is owned by whoever has the skb queued ATM.106 * /107 char cb[48];108109 unsigned int len; / * Length of actual data * /110 unsigned int csum; / * Checksum * /111 volatile char used; / * Data moved to user and not MSG PEEK * /112 unsigned char cloned, / * head may be cloned (check refcnt to be sure). * /113 pkt type, / * Packet class * /114 ip summed; / * Driver fed us an IP checksum * /115 u32 priority; / * Packet queueing priority * /116 atomic t users; / * User count - see datagram.c,tcp.c * /117 unsigned short protocol; / * Packet protocol from driver. * /118 unsigned short security; / * Security level of packet * /119 unsigned int truesize; / * Buffer size * /120121 unsigned char * head; / * Head of buffer * /122 unsigned char * data; / * Data head pointer * /123 unsigned char * tail; / * Tail pointer * /124 unsigned char * end; / * End pointer * /125 void ( * destructor)(struct sk buff * ); / * Destruct function * /126 #ifdef CONFIG NETFILTER127 / * Can be used for communication between hooks. * /128 unsigned long nfmark;129 / * Cache info * /130 u32 nfcache;131 / * Associated connection, if any * /132 struct nf ct info * nfct;133 #ifdef CONFIG NETFILTER DEBUG134 unsigned int nf debug;135 #endif136 #endif / * CONFIGNETFILTER* /137138 #if defined(CONFIG HIPPI)

    Program skbuff.h (continued)

    8

  • 139 union {140 u32 ifield;141 } private;142 #endif143144 #ifdef CONFIG NET SCHED145 u32 tc index; / * traffic control index * /146 #endif147 };148149 #define SK WMEMMAX 65535150 #define SK RMEMMAX 65535151152 #endif / * LINUX SKBUFFH * /

    Program skbuff.h (continued)

    9

  • 1 / *2 * INET An implementation of the TCP/IP protocol suite for the L INUX3 * operating system. INET is implemented using the BSD Socket4 * interface as the means of communication with the user level.5 *6 * Definitions for the IP protocol.7 *8 * Version: @(#)ip.h 1.0.2 04/28/939 *

    10 * Authors: Fred N. van Kempen, 11 *12 * This program is free software; you can redistribute it and/o r13 * modify it under the terms of the GNU General Public License14 * as published by the Free Software Foundation; either versio n15 * 2 of the License, or (at your option) any later version.16 * /17 #ifndef LINUX IP H18 #define LINUX IP H19 #include 2021 / * SOL IP socket options * /2223 #define IPTOS TOSMASK 0x1E24 #define IPTOS TOS(tos) ((tos)&IPTOS TOSMASK)25 #define IPTOS LOWDELAY 0x1026 #define IPTOS THROUGHPUT 0x0827 #define IPTOS RELIABILITY 0x0428 #define IPTOS MINCOST 0x022930 #define IPTOS PRECMASK 0xE031 #define IPTOS PREC(tos) ((tos)&IPTOS PRECMASK)32 #define IPTOS PRECNETCONTROL 0xe033 #define IPTOS PRECINTERNETCONTROL 0xc034 #define IPTOS PRECCRITIC ECP 0xa035 #define IPTOS PRECFLASHOVERRIDE 0x8036 #define IPTOS PRECFLASH 0x6037 #define IPTOS PRECIMMEDIATE 0x4038 #define IPTOS PRECPRIORITY 0x2039 #define IPTOS PRECROUTINE 0x00404142 / * IP options * /43 #define IPOPT COPY 0x8044 #define IPOPT CLASSMASK 0x6045 #define IPOPT NUMBERMASK 0x1f46

    Program ip.h

    10

  • 47 #define IPOPT COPIED(o) ((o)&IPOPT COPY)48 #define IPOPT CLASS(o) ((o)&IPOPT CLASSMASK)49 #define IPOPT NUMBER(o) ((o)&IPOPT NUMBERMASK)5051 #define IPOPT CONTROL 0x0052 #define IPOPT RESERVED1 0x2053 #define IPOPT MEASUREMENT 0x4054 #define IPOPT RESERVED2 0x605556 #define IPOPT END (0 |IPOPT CONTROL)57 #define IPOPT NOOP (1 |IPOPT CONTROL)58 #define IPOPT SEC (2 |IPOPT CONTROL|IPOPTCOPY)59 #define IPOPT LSRR (3 |IPOPT CONTROL|IPOPTCOPY)60 #define IPOPT TIMESTAMP (4 |IPOPT MEASUREMENT)61 #define IPOPT RR (7 |IPOPT CONTROL)62 #define IPOPT SID (8 |IPOPT CONTROL|IPOPTCOPY)63 #define IPOPT SSRR (9 |IPOPT CONTROL|IPOPTCOPY)64 #define IPOPT RA (20|IPOPT CONTROL|IPOPTCOPY)6566 #define IPVERSION 467 #define MAXTTL 25568 #define IPDEFTTL 646970 / * struct timestamp, struct route and MAX ROUTES are removed.7172 REASONS: it is clear that nobody used them because:73 - MAX ROUTES value was wrong.74 - "struct route" was wrong.75 - "struct timestamp" had fatally misaligned bitfields and w as completely unusable.76 * /7778 #define IPOPT OPTVAL 079 #define IPOPT OLEN 180 #define IPOPT OFFSET 281 #define IPOPT MINOFF 482 #define MAX IPOPTLEN 4083 #define IPOPT NOP IPOPTNOOP84 #define IPOPT EOL IPOPTEND85 #define IPOPT TS IPOPT TIMESTAMP8687 #define IPOPT TS TSONLY 0 /* timestamps only * /88 #define IPOPT TS TSANDADDR 1 /* timestamps and addresses * /89 #define IPOPT TS PRESPEC 3 /* specified modules only * /9091 #ifdef KERNEL92

    Program ip.h (continued)

    11

  • 93 struct ip options {94 u32 faddr; / * Saved first hop address * /95 unsigned char optlen;96 unsigned char srr;97 unsigned char rr;98 unsigned char ts;99 unsigned char is setbyuser:1, / * Set by setsockopt? * /

    100 is data:1, / * Options in data, rather than skb * /101 is strictroute:1, / * Strict source route * /102 srr is hit:1, / * Packet destination addr was our one * /103 is changed:1, / * IP checksum more not valid * /104 rr needaddr:1, / * Need to record addr of outgoing dev * /105 ts needtime:1, / * Need to record timestamp * /106 ts needaddr:1; / * Need to record addr of outgoing dev * /107 unsigned char router alert;108 unsigned char pad1;109 unsigned char pad2;110 unsigned char data[0];111 };112113 #define optlength(opt) (sizeof(struct ip options) + opt->optlen)114 #endif115116 struct iphdr {117 #if defined( LITTLE ENDIAN BITFIELD)118 u8 ihl:4,119 version:4;120 #elif defined ( BIG ENDIAN BITFIELD)121 u8 version:4,122 ihl:4;123 #else124 #error "Please fix "125 #endif126 u8 tos;127 u16 tot len;128 u16 id;129 u16 frag off;130 u8 ttl;131 u8 protocol;132 u16 check;133 u32 saddr;134 u32 daddr;135 / * The options start here. * /136 };137138 #endif / * LINUX IP H * /

    Program ip.h (continued)

    12

  • The skb is placed in a queue for the CPU that is handling this packet. If the queue is full, the packet is dropped. After being placed on the queue, a receive soft interrupt is marked for execution at the next convenient time in the kernel processing. See net/core/dev.c, subroutine netif_rx.
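    Stripped of its flow-control and statistics code, the core of that path looks roughly like the sketch below (netif_rx_sketch is a hypothetical, simplified stand-in; compare it with the real netif_rx in the dev.c listing that follows):

    #include <linux/netdevice.h>
    #include <linux/interrupt.h>
    #include <linux/skbuff.h>
    #include <linux/smp.h>
    #include <asm/system.h>

    static int netif_rx_sketch(struct sk_buff *skb)
    {
            int cpu = smp_processor_id();
            struct softnet_data *queue = &softnet_data[cpu];
            unsigned long flags;

            local_irq_save(flags);
            if (queue->input_pkt_queue.qlen >= netdev_max_backlog) {
                    local_irq_restore(flags);
                    kfree_skb(skb);                         /* per-CPU backlog full: drop it   */
                    return NET_RX_DROP;
            }
            __skb_queue_tail(&queue->input_pkt_queue, skb); /* park the skb for later          */
            cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);         /* net_rx_action() will run soon   */
            local_irq_restore(flags);
            return NET_RX_SUCCESS;
    }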

    13

  • 1 / **2 * dev queue xmit - transmit a buffer3 * @skb: buffer to transmit4 *5 * Queue a buffer for transmission to a network device. The call er must6 * have set the device and priority and built the buffer before c alling this7 * function. The function can be called from an interrupt.8 *9 * A negative errno code is returned on a failure. A success does not

    10 * guarantee the frame will be transmitted as it may be dropped d ue11 * to congestion or traffic shaping.12 * /1314 int dev queue xmit(struct sk buff * skb)15 {16 struct net device * dev = skb->dev;17 struct Qdisc * q;1819 / * Grab device queue * /20 spin lock bh(&dev->queue lock);21 q = dev->qdisc;22 if (q->enqueue) {23 int ret = q->enqueue(skb, q);2425 qdisc run(dev);2627 spin unlock bh(&dev->queue lock);28 return ret == NET XMIT BYPASS ? NETXMIT SUCCESS : ret;29 }3031 / * The device has no queue. Common case for software devices:32 loopback, all the sorts of tunnels...3334 Really, it is unlikely that xmit lock protection is necessary here.35 (f.e. loopback and IP tunnels are clean ignoring statistics counters.)36 However, it is possible, that they rely on protection37 made by us here.3839 Check this and shot the lock. It is not prone from deadlocks.40 Either shot noqueue qdisc, it is even simpler 8)41 * /42 if (dev->flags&IFF UP) {43 int cpu = smp processor id();4445 if (dev->xmit lock owner != cpu) {46 spin unlock(&dev->queue lock);

    Program dev.c

    14

  • 47 spin lock(&dev->xmit lock);48 dev->xmit lock owner = cpu;4950 if (!netif queue stopped(dev)) {51 if (netdev nit)52 dev queue xmit nit(skb,dev);5354 if (dev->hard start xmit(skb, dev) == 0) {55 dev->xmit lock owner = -1;56 spin unlock bh(&dev->xmit lock);57 return 0;58 }59 }60 dev->xmit lock owner = -1;61 spin unlock bh(&dev->xmit lock);62 if (net ratelimit())63 printk(KERN DEBUG "Virtual device %s asks to queue packet! \n", dev->name);64 kfree skb(skb);65 return -ENETDOWN;66 } else {67 / * Recursion is detected! It is possible, unfortunately * /68 if (net ratelimit())69 printk(KERN DEBUG "Dead loop on virtual device %s, fix it urgently! \n", dev->name);70 }71 }72 spin unlock bh(&dev->queue lock);7374 kfree skb(skb);75 return -ENETDOWN;76 }77787980 / **81 * netif rx - post buffer to the network code82 * @skb: buffer to post83 *84 * This function receives a packet from a device driver and queu es it for85 * the upper (protocol) levels to process. It always succeeds. The buffer86 * may be dropped during processing for congestion control or b y the87 * protocol layers.88 *89 * return values:90 * NET RX SUCCESS (no congestion)91 * NET RX CNLOW (low congestion)92 * NET RX CNMOD (moderate congestion)

    Program dev.c (continued)

    15

  • 93 * NET RX CNHIGH (high congestion)94 * NET RX DROP (packet was dropped)95 *96 *97 * /9899 int netif rx(struct sk buff * skb)

    100 {101 int this cpu = smp processor id();102 struct softnet data * queue;103 unsigned long flags;104105 if (skb->stamp.tv sec == 0)106 get fast time(&skb->stamp);107108 / * The code is rearranged so that the path is the most109 short when CPU is congested, but is still operating.110 * /111 queue = &softnet data[this cpu];112113 local irq save(flags);114115 netdev rx stat[this cpu].total++;116 if (queue->input pkt queue.qlen input pkt queue.qlen) {118 if (queue->throttle)119 goto drop;120121 enqueue:122 dev hold(skb->dev);123 skb queue tail(&queue->input pkt queue,skb);124 cpu raise softirq(this cpu, NET RX SOFTIRQ);125 local irq restore(flags);126 #ifndef OFFLINE SAMPLE127 get sample stats(this cpu);128 #endif129 return softnet data[this cpu].cng level;130 }131132 if (queue->throttle) {133 queue->throttle = 0;134 #ifdef CONFIG NET HWFLOWCONTROL135 if (atomic dec and test(&netdev dropping))136 netdev wakeup();137 #endif138 }

    Program dev.c (continued)

    16

  • 139 goto enqueue;140 }141142 if (queue->throttle == 0) {143 queue->throttle = 1;144 netdev rx stat[this cpu].throttled++;145 #ifdef CONFIG NET HWFLOWCONTROL146 atomic inc(&netdev dropping);147 #endif148 }149150 drop:151 netdev rx stat[this cpu].dropped++;152 local irq restore(flags);153154 kfree skb(skb);155 return NET RX DROP;156 }157158 / * Deliver skb to an old protocol, which is not threaded well159 or which do not understand shared skbs.160 * /161 static int deliver to old ones(struct packet type * pt, struct sk buff * skb, int last)162 {163 static spinlock t net bh lock = SPIN LOCKUNLOCKED;164 int ret = NET RX DROP;165166167 if (!last) {168 skb = skb clone(skb, GFP ATOMIC);169 if (skb == NULL)170 return ret;171 }172173 / * The assumption (correct one) is that old protocols174 did not depened on BHs different of NET BH and TIMER BH.175 * /176177 / * Emulate NET BH with special spinlock * /178 spin lock(&net bh lock);179180 / * Disable timers and wait for all timers completion * /181 tasklet disable(bh task vec+TIMER BH);182183 ret = pt->func(skb, skb->dev, pt);184

    Program dev.c (continued)

    17

  • 185 tasklet enable(bh task vec+TIMER BH);186 spin unlock(&net bh lock);187 return ret;188 }189190 / * Reparent skb to master device. This function is called191 * only from net rx action under BR NETPROTOLOCK. It is misuse192 * of BR NETPROTOLOCK, but it is OK for now.193 * /194 static inline void skb bond(struct sk buff * skb)195 {196 struct net device * dev = skb->dev;197198 if (dev->master) {199 dev hold(dev->master);200 skb->dev = dev->master;201 dev put(dev);202 }203 }204205 static void net tx action(struct softirq action * h)206 {207 int cpu = smp processor id();208209 if (softnet data[cpu].completion queue) {210 struct sk buff * clist;211212 local irq disable();213 clist = softnet data[cpu].completion queue;214 softnet data[cpu].completion queue = NULL;215 local irq enable();216217 while (clist != NULL) {218 struct sk buff * skb = clist;219 clist = clist->next;220221 BUGTRAP(atomic read(&skb->users) == 0);222 kfree skb(skb);223 }224 }225226 if (softnet data[cpu].output queue) {227 struct net device * head;228229 local irq disable();230 head = softnet data[cpu].output queue;

    Program dev.c (continued)

    18

  • 231 softnet data[cpu].output queue = NULL;232 local irq enable();233234 while (head != NULL) {235 struct net device * dev = head;236 head = head->next sched;237238 smp mb before clear bit();239 clear bit( LINK STATESCHED, &dev->state);240241 if (spin trylock(&dev->queue lock)) {242 qdisc run(dev);243 spin unlock(&dev->queue lock);244 } else {245 netif schedule(dev);246 }247 }248 }249 }250251 / **252 * net call rx atomic253 * @fn: function to call254 *255 * Make a function call that is atomic with respect to the protoc ol256 * layers.257 * /258259 void net call rx atomic(void ( * fn)(void))260 {261 br write lock bh(BR NETPROTOLOCK);262 fn();263 br write unlock bh(BR NETPROTOLOCK);264 }265266 #if defined(CONFIG BRIDGE) || defined(CONFIG BRIDGEMODULE)267 void ( * br handle frame hook)(struct sk buff * skb) = NULL;268 #endif269270 static int inline handle bridge(struct sk buff * skb,271 struct packet type * pt prev)272 {273 int ret = NET RX DROP;274275 if (pt prev) {276 if (!pt prev->data)

    Program dev.c (continued)

    19

  • 277 ret = deliver to old ones(pt prev, skb, 0);278 else {279 atomic inc(&skb->users);280 ret = pt prev->func(skb, skb->dev, pt prev);281 }282 }283284 br handle frame hook(skb);285 return ret;286 }287288289 #ifdef CONFIG NET DIVERT290 static inline void handle diverter(struct sk buff * skb)291 {292 / * if diversion is supported on device, then divert * /293 if (skb->dev->divert && skb->dev->divert->divert)294 divert frame(skb);295 }296 #endif / * CONFIGNET DIVERT * /297298299 static void net rx action(struct softirq action * h)300 {301 int this cpu = smp processor id();302 struct softnet data * queue = &softnet data[this cpu];303 unsigned long start time = jiffies;304 int bugdet = netdev max backlog;305306 br read lock(BR NETPROTOLOCK);307308 for (;;) {309 struct sk buff * skb;310 struct net device * rx dev;311312 local irq disable();313 skb = skb dequeue(&queue->input pkt queue);314 local irq enable();315316 if (skb == NULL)317 break;318319 skb bond(skb);320321 rx dev = skb->dev;322

    Program dev.c (continued)

    20

  • 323 #ifdef CONFIG NET FASTROUTE324 if (skb->pkt type == PACKET FASTROUTE) {325 netdev rx stat[this cpu].fastroute deferred out++;326 dev queue xmit(skb);327 dev put(rx dev);328 continue;329 }330 #endif331 skb->h.raw = skb->nh.raw = skb->data;332 {333 struct packet type * ptype, * pt prev;334 unsigned short type = skb->protocol;335336 pt prev = NULL;337 for (ptype = ptype all; ptype; ptype = ptype->next) {338 if (!ptype->dev || ptype->dev == skb->dev) {339 if (pt prev) {340 if (!pt prev->data) {341 deliver to old ones(pt prev, skb, 0);342 } else {343 atomic inc(&skb->users);344 pt prev->func(skb,345 skb->dev,346 pt prev);347 }348 }349 pt prev = ptype;350 }351 }352353 #ifdef CONFIG NET DIVERT354 if (skb->dev->divert && skb->dev->divert->divert)355 handle diverter(skb);356 #endif / * CONFIGNET DIVERT * /357358359 #if defined(CONFIG BRIDGE) || defined(CONFIG BRIDGEMODULE)360 if (skb->dev->br port != NULL &&361 br handle frame hook != NULL) {362 handle bridge(skb, pt prev);363 dev put(rx dev);364 continue;365 }366 #endif367368 for (ptype=ptype base[ntohs(type)&15];ptype;ptype=ptype->next) {

    Program dev.c (continued)

    21

  • 369 if (ptype->type == type &&370 (!ptype->dev || ptype->dev == skb->dev)) {371 if (pt prev) {372 if (!pt prev->data)373 deliver to old ones(pt prev, skb, 0);374 else {375 atomic inc(&skb->users);376 pt prev->func(skb,377 skb->dev,378 pt prev);379 }380 }381 pt prev = ptype;382 }383 }384385 if (pt prev) {386 if (!pt prev->data)387 deliver to old ones(pt prev, skb, 1);388 else389 pt prev->func(skb, skb->dev, pt prev);390 } else391 kfree skb(skb);392 }393394 dev put(rx dev);395396 if (bugdet-- < 0 || jiffies - start time > 1)397 goto softnet break;398399 #ifdef CONFIG NET HWFLOWCONTROL400 if (queue->throttle && queue->input pkt queue.qlen < no cong thresh ) {401 if (atomic dec and test(&netdev dropping)) {402 queue->throttle = 0;403 netdev wakeup();404 goto softnet break;405 }406 }407 #endif408409 }410 br read unlock(BR NETPROTOLOCK);411412 local irq disable();413 if (queue->throttle) {414 queue->throttle = 0;

    Program dev.c (continued)

    22

  • 415 #ifdef CONFIG NET HWFLOWCONTROL416 if (atomic dec and test(&netdev dropping))417 netdev wakeup();418 #endif419 }420 local irq enable();421422 NET PROFILE LEAVE(softnet process);423 return;424425 softnet break:426 br read unlock(BR NETPROTOLOCK);427428 local irq disable();429 netdev rx stat[this cpu].time squeeze++;430 cpu raise softirq(this cpu, NET RX SOFTIRQ);431 local irq enable();432433 NET PROFILE LEAVE(softnet process);434 return;435 }436437 static gifconf func t * gifconf list [NPROTO];

    Program dev.c (continued)

    23

  • The soft interrupt processing is done in the kernel in one of three places:

    1. In the main process scheduler, kernel/sched.c.

    2. Any time the kernel returns from a system call, kernel/entry.s.

    3. During periodic checking, kernel/irq.c.

    In any case, the soft interrupt eventually calls net_rx_action in net/core/dev.c. Here, the skb is dequeued from the receive queue and sent to the next higher-layer protocol handler (in the example we assume it is IPv4). Note the code in dev.c looking through the registered protocols to find a matching type and calling pt_prev->func(...). This ends up calling the function ip_rcv in net/ipv4/ip_input.c.
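    The matching works because each protocol registers a struct packet_type (declared in include/linux/netdevice.h) with dev_add_pack(); IPv4 does this in net/ipv4/af_inet.c with an entry whose func member is ip_rcv. A hedged sketch of an equivalent registration (my_proto_rcv and my_packet_type are made-up names):

    #include <linux/module.h>
    #include <linux/netdevice.h>
    #include <linux/skbuff.h>
    #include <linux/if_ether.h>

    static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
                            struct packet_type *pt)
    {
            /* Called from net_rx_action() for every frame whose skb->protocol
             * matches my_packet_type.type on a matching device. */
            kfree_skb(skb);
            return 0;
    }

    static struct packet_type my_packet_type;

    int init_module(void)
    {
            my_packet_type.type = __constant_htons(ETH_P_IP); /* matched against skb->protocol        */
            my_packet_type.dev  = NULL;                       /* NULL means "any device"              */
            my_packet_type.func = my_proto_rcv;               /* handler invoked by net_rx_action()   */
            my_packet_type.data = (void *)1;                  /* non-NULL: handler copes with shared skbs */
            dev_add_pack(&my_packet_type);
            return 0;
    }

    void cleanup_module(void)
    {
            dev_remove_pack(&my_packet_type);
    }

    Registering for ETH_P_IP as shown would make my_proto_rcv run alongside ip_rcv for every IP frame, which is how sniffers and diagnostic modules tap the receive path.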

    24

  • 12 / *3 * SNMP management statistics4 * /56 struct ip mib ip statistics[NR CPUS* 2];78 / *9 * Process Router Attention IP option

    10 * /11 int ip call ra chain(struct sk buff * skb)12 {13 struct ip ra chain * ra;14 u8 protocol = skb->nh.iph->protocol;15 struct sock * last = NULL;1617 read lock(&ip ra lock);18 for (ra = ip ra chain; ra; ra = ra->next) {19 struct sock * sk = ra->sk;2021 / * If socket is bound to an interface, only report22 * the packet if it came from that interface.23 * /24 if (sk && sk->num == protocol25 && ((sk->bound dev if == 0)26 || (sk->bound dev if == skb->dev->ifindex))) {27 if (skb->nh.iph->frag off & htons(IP MF|IP OFFSET)) {28 skb = ip defrag(skb);29 if (skb == NULL) {30 read unlock(&ip ra lock);31 return 1;32 }33 }34 if (last) {35 struct sk buff * skb2 = skb clone(skb, GFP ATOMIC);36 if (skb2)37 raw rcv(last, skb2);38 }39 last = sk;40 }41 }4243 if (last) {44 raw rcv(last, skb);45 read unlock(&ip ra lock);46 return 1;

    Program ip_input.c

    25

  • 47 }48 read unlock(&ip ra lock);49 return 0;50 }5152 / * Handle this out of line, it is rare. * /53 static int ip run ipprot(struct sk buff * skb, struct iphdr * iph,54 struct inet protocol * ipprot, int force copy)55 {56 int ret = 0;5758 do {59 if (ipprot->protocol == iph->protocol) {60 struct sk buff * skb2 = skb;61 if (ipprot->copy || force copy)62 skb2 = skb clone(skb, GFP ATOMIC);63 if(skb2 != NULL) {64 ret = 1;65 ipprot->handler(skb2,66 ntohs(iph->tot len) - (iph->ihl * 4));67 }68 }69 ipprot = (struct inet protocol * ) ipprot->next;70 } while(ipprot != NULL);7172 return ret;73 }7475 static inline int ip local deliver finish(struct sk buff * skb)76 {77 struct iphdr * iph = skb->nh.iph;7879 #ifdef CONFIG NETFILTER DEBUG80 nf debug ip local deliver(skb);81 #endif / * CONFIGNETFILTER DEBUG* /8283 / * Point into the IP datagram, just past the header. * /84 skb->h.raw = skb->nh.raw + iph->ihl * 4;8586 {87 / * Note: See raw.c and net/raw.h, RAWV4 HTABLESIZE==MAX INET PROTOS* /88 int hash = iph->protocol & (MAX INET PROTOS - 1);89 struct sock * raw sk = raw v4 htable[hash];90 struct inet protocol * ipprot;91 int flag;92

    Program ip_input.c (continued)

    26

  • 93 / * If there maybe a raw socket we must check - if not we94 * don’t care less95 * /96 if(raw sk != NULL)97 raw sk = raw v4 input(skb, iph, hash);9899 ipprot = (struct inet protocol * ) inet protos[hash];

    100 flag = 0;101 if(ipprot != NULL) {102 if(raw sk == NULL &&103 ipprot->next == NULL &&104 ipprot->protocol == iph->protocol) {105 int ret;106107 / * Fast path... * /108 ret = ipprot->handler(skb, (ntohs(iph->tot len) -109 (iph->ihl * 4)));110111 return ret;112 } else {113 flag = ip run ipprot(skb, iph, ipprot, (raw sk != NULL));114 }115 }116117 / * All protocols checked.118 * If this packet was a broadcast, we may * not * reply to it, since that119 * causes (proven, grin) ARP storms and a leakage of memory (i.e . all120 * ICMP reply messages get queued up for transmission...)121 * /122 if(raw sk != NULL) { / * Shift to last raw user * /123 raw rcv(raw sk, skb);124 sock put(raw sk);125 } else if (!flag) { / * Free and report errors * /126 icmp send(skb, ICMP DESTUNREACH, ICMPPROTUNREACH, 0);127 kfree skb(skb);128 }129 }130131 return 0;132 }133134 / *135 * Deliver IP Packets to the higher protocol layers.136 * /137 int ip local deliver(struct sk buff * skb)138 {

    Program ip_input.c (continued)

    27

  • 139 struct iphdr * iph = skb->nh.iph;140141 / *142 * Reassemble IP fragments.143 * /144145 if (iph->frag off & htons(IP MF|IP OFFSET)) {146 skb = ip defrag(skb);147 if (!skb)148 return 0;149 }150151 return NF HOOK(PFINET, NF IP LOCALIN, skb, skb->dev, NULL,152 ip local deliver finish);153 }154155 static inline int ip rcv finish(struct sk buff * skb)156 {157 struct net device * dev = skb->dev;158 struct iphdr * iph = skb->nh.iph;159160 / *161 * Initialise the virtual path cache for the packet. It describ es162 * how the packet travels inside Linux networking.163 * /164 if (skb->dst == NULL) {165 if (ip route input(skb, iph->daddr, iph->saddr, iph->tos, dev))166 goto drop;167 }168169 #ifdef CONFIG NET CLS ROUTE170 if (skb->dst->tclassid) {171 struct ip rt acct * st = ip rt acct + 256 * smp processor id();172 u32 idx = skb->dst->tclassid;173 st[idx&0xFF].o packets++;174 st[idx&0xFF].o bytes+=skb->len;175 st[(idx>>16)&0xFF].i packets++;176 st[(idx>>16)&0xFF].i bytes+=skb->len;177 }178 #endif179180 if (iph->ihl > 5) {181 struct ip options * opt;182183 / * It looks as overkill, because not all184 IP options require packet mangling.

    Program ip_input.c (continued)

    28

  • 185 But it is the easiest for now, especially taking186 into account that combination of IP options187 and running sniffer is extremely rare condition.188 --ANK (980813)189 * /190191 skb = skb cow(skb, skb headroom(skb));192 if (skb == NULL)193 return NET RX DROP;194 iph = skb->nh.iph;195196 skb->ip summed = 0;197 if (ip options compile(NULL, skb))198 goto inhdr error;199200 opt = &(IPCB(skb)->opt);201 if (opt->srr) {202 struct in device * in dev = in dev get(dev);203 if (in dev) {204 if (!IN DEVSOURCEROUTE(in dev)) {205 if (IN DEVLOGMARTIANS(in dev) && net ratelimit())206 printk(KERN INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u \n",207 NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));208 in dev put(in dev);209 goto drop;210 }211 in dev put(in dev);212 }213 if (ip options rcv srr(skb))214 goto drop;215 }216 }217218 return skb->dst->input(skb);219220 inhdr error:221 IP INC STATSBH(IpInHdrErrors);222 drop:223 kfree skb(skb);224 return NET RX DROP;225 }226227 / *228 * Main IP Receive routine.229 * /230 int ip rcv(struct sk buff * skb, struct net device * dev, struct packet type * pt)

    Program ip_input.c (continued)

    29

  • 231 {232 struct iphdr * iph = skb->nh.iph;233234 / * When the interface is in promisc. mode, drop all the crap235 * that it receives, do not try to analyse it.236 * /237 if (skb->pkt type == PACKET OTHERHOST)238 goto drop;239240 IP INC STATSBH(IpInReceives);241242 if ((skb = skb share check(skb, GFP ATOMIC)) == NULL)243 goto out;244245 / *246 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fai ls the checksum.247 *248 * Is the datagram acceptable?249 *250 * 1. Length at least the size of an ip header251 * 2. Version of 4252 * 3. Checksums correctly. [Speed optimisation for later, ski p loopback checksums]253 * 4. Doesn’t have a bogus length254 * /255256 if (skb->len < sizeof(struct iphdr) || skb->len < (iph->ihl version != 4 || ip fast csum((u8 * )iph, iph->ihl) != 0)259 goto inhdr error;260261 {262 u32 len = ntohs(iph->tot len);263 if (skb->len < len || len < (iph->ihltot len).269 * /270 skb trim(skb, len);271 }272273 return NF HOOK(PFINET, NF IP PREROUTING, skb, dev, NULL,274 ip rcv finish);275276 inhdr error:

    Program ip_input.c (continued)

    30

  • 277 IP INC STATSBH(IpInHdrErrors);278 drop:279 kfree skb(skb);280 out:281 return NET RX DROP;282 }

    Program ip_input.c (continued)

    ip_rcv is the main IPv4 receive function. Note the checks for a legal packet (proper length, version, and checksum). If the packet does not validate, it is dropped silently. Note the call to netfilter, which is the way the kernel allows new functions to be integrated into the packet processing without changing the IPv4 code. See the call to NF_HOOK.
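    Distilled from those checks (ip_header_looks_valid is a hypothetical helper collecting the RFC 1122 sanity tests that ip_rcv performs before anything else):

    #include <linux/types.h>
    #include <linux/skbuff.h>
    #include <linux/ip.h>
    #include <asm/checksum.h>

    static int ip_header_looks_valid(struct sk_buff *skb, struct iphdr *iph)
    {
            if (skb->len < sizeof(struct iphdr) || skb->len < (iph->ihl << 2))
                    return 0;                  /* too short to contain the header it claims */
            if (iph->ihl < 5 || iph->version != 4)
                    return 0;                  /* bad header length or not IPv4             */
            if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
                    return 0;                  /* header checksum does not verify           */
            if (skb->len < ntohs(iph->tot_len) || ntohs(iph->tot_len) < (iph->ihl << 2))
                    return 0;                  /* total length field is bogus               */
            return 1;                          /* acceptable; ip_rcv then trims any padding */
    }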

    The netfilter hooks (sometimes called net hooks) are called in the IPv4 protocol stack in five places, allowing new code to be incorporated, for example to support new features or to implement a firewall. Parts of the kernel can register to receive hook callbacks at various points in the packet processing. When called, a hook function can decide to alter, discard, or pass the packet unmodified. It can also pass the packet to a user-space function.
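    For example, a kernel module can attach itself to one of those five IPv4 hook points roughly as follows (a sketch only; count_forwarded and count_ops are made-up names, and this hook simply lets every packet through):

    #include <linux/module.h>
    #include <linux/skbuff.h>
    #include <linux/netdevice.h>
    #include <linux/socket.h>
    #include <linux/netfilter.h>
    #include <linux/netfilter_ipv4.h>

    static unsigned int count_forwarded(unsigned int hooknum,
                                        struct sk_buff **pskb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
    {
            /* A hook may instead return NF_DROP, NF_STOLEN, NF_QUEUE (handing
             * the packet to a user-space queue handler), or NF_REPEAT. */
            return NF_ACCEPT;
    }

    static struct nf_hook_ops count_ops;

    int init_module(void)
    {
            count_ops.hook     = count_forwarded;
            count_ops.pf       = PF_INET;
            count_ops.hooknum  = NF_IP_FORWARD;   /* one of the five IPv4 hook points */
            count_ops.priority = 0;
            return nf_register_hook(&count_ops);
    }

    void cleanup_module(void)
    {
            nf_unregister_hook(&count_ops);
    }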

    Upon completion of all net hook processing, control flow ends up at ip_rcv_finish. Here, the next-hop route is looked up by calling the function ip_route_input in net/ipv4/route.c. Eventually ip_route_input_slow is called. Here, the packet can be sent to one of four places (a sketch of how the result is consumed follows the list below).

    1. If the packet is addressed to the local machine, it is passed to the layer 4 processor. See ip_input.c:ip_local_deliver().

    2. If the destination is not the local system, forward the packet to the next hop. See net/ipv4/ip_forward.c:ip_forward().

    3. If we are unable to find an appropriate routing entry, we call route.c:ip_error().

    4. If it is a multicast packet, we need special handling (not shown here).
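    The four-way split is realized through a function pointer: the route lookup attaches a dst_entry to the skb, and ip_rcv_finish simply jumps through dst->input. Which function that pointer ends up naming is my reading of route.c (the assignments are made in ip_route_input_slow and ip_route_input_mc, which this handout does not reproduce), so treat the comments below as a guide rather than a quotation; deliver_by_route itself is a made-up wrapper:

    #include <linux/skbuff.h>
    #include <net/dst.h>

    static int deliver_by_route(struct sk_buff *skb)
    {
            /* skb->dst->input was set by the route lookup to one of:
             *   ip_local_deliver  - case 1: the packet is addressed to this host
             *   ip_forward        - case 2: the packet must be forwarded
             *   ip_error          - case 3: no usable route was found
             *   ip_mr_input       - case 4: multicast needing multicast routing
             * This is exactly the tail of ip_rcv_finish() in the listing above. */
            return skb->dst->input(skb);
    }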

    31

  • 1 int ip route input(struct sk buff * skb, u32 daddr, u32 saddr,2 u8 tos, struct net device * dev)3 {4 struct rtable * rth;5 unsigned hash;6 int iif = dev->ifindex;78 tos &= IPTOS RT MASK;9 hash = rt hash code(daddr, saddr (̂iifkey.dst == daddr &&14 rth->key.src == saddr &&15 rth->key.iif == iif &&16 rth->key.oif == 0 &&17 #ifdef CONFIG IP ROUTEFWMARK18 rth->key.fwmark == skb->nfmark &&19 #endif20 rth->key.tos == tos) {21 rth->u.dst.lastuse = jiffies;22 dst hold(&rth->u.dst);23 rth->u.dst. use++;24 read unlock(&rt hash table[hash].lock);25 skb->dst = (struct dst entry * )rth;26 return 0;27 }28 }29 read unlock(&rt hash table[hash].lock);3031 / * Multicast recognition logic is moved from route cache to her e.32 The problem was that too many Ethernet cards have broken/mis sing33 hardware multicast filters :-( As result the host on multica sting34 network acquires a lot of useless route cache entries, sort o f35 SDR messages from all the world. Now we try to get rid of them.36 Really, provided software IP multicast filter is organized37 reasonably (at least, hashed), it does not result in a slowdo wn38 comparing with route cache reject entries.39 Note, that multicast routers are not affected, because40 route cache entry is created eventually.41 * /42 if (MULTICAST(daddr)) {43 struct in device * in dev;4445 read lock(&inetdev lock);46 if ((in dev = in dev get(dev)) != NULL) {

    Program route.c

    32

  • 47 int our = ip check mc(in dev, daddr);48 if (our49 #ifdef CONFIG IP MROUTE50 || (!LOCAL MCAST(daddr) && IN DEVMFORWARD(indev))51 #endif52 ) {53 read unlock(&inetdev lock);54 return ip route input mc(skb, daddr, saddr, tos, dev, our);55 }56 }57 read unlock(&inetdev lock);58 return -EINVAL;59 }60 return ip route input slow(skb, daddr, saddr, tos, dev);61 }6263 / *64 * Major route resolver routine.65 * /6667 int ip route output slow(struct rtable ** rp, const struct rt key * oldkey)68 {69 struct rt key key;70 struct fib result res;71 unsigned flags = 0;72 struct rtable * rth;73 struct net device * dev out = NULL;74 unsigned hash;75 int free res = 0;76 int err;77 u32 tos;7879 tos = oldkey->tos & (IPTOS RT MASK|RTOONLINK);80 key.dst = oldkey->dst;81 key.src = oldkey->src;82 key.tos = tos&IPTOS RT MASK;83 key.iif = loopback dev.ifindex;84 key.oif = oldkey->oif;85 #ifdef CONFIG IP ROUTEFWMARK86 key.fwmark = oldkey->fwmark;87 #endif88 key.scope = (tos&RTO ONLINK) ? RT SCOPELINK : RT SCOPEUNIVERSE;89 res.fi = NULL;90 #ifdef CONFIG IP MULTIPLE TABLES91 res.r = NULL;92 #endif

    Program route.c (continued)

    33

  • 9394 if (oldkey->src) {95 if (MULTICAST(oldkey->src)96 || BADCLASS(oldkey->src)97 || ZERONET(oldkey->src))98 return -EINVAL;99

    100 / * It is equivalent to inet addr type(saddr) == RTN LOCAL * /101 dev out = ip dev find(oldkey->src);102 if (dev out == NULL)103 return -EINVAL;104105 / * I removed check for oif == dev out->oif here.106 It was wrong by three reasons:107 1. ip dev find(saddr) can return wrong iface, if saddr is108 assigned to multiple interfaces.109 2. Moreover, we are allowed to send packets with saddr110 of another iface. --ANK111 * /112113 if (oldkey->oif == 0114 && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) ) {115 / * Special hack: user can direct multicasts116 and limited broadcast via necessary interface117 without fiddling with IP MULTICASTIF or IP PKTINFO.118 This hack is not just for fun, it allows119 vic,vat and friends to work.120 They bind socket to loopback, set ttl to zero121 and expect that it will work.122 From the viewpoint of routing cache they are broken,123 because we are not allowed to build multicast path124 with loopback source addr (look, routing cache125 cannot know, that ttl is zero, so that packet126 will not leave this host and route is valid).127 Luckily, this hack is good workaround.128 * /129130 key.oif = dev out->ifindex;131 goto make route;132 }133 if (dev out)134 dev put(dev out);135 dev out = NULL;136 }137 if (oldkey->oif) {138 dev out = dev get by index(oldkey->oif);

    Program route.c (continued)

    34

  • 139 if (dev out == NULL)140 return -ENODEV;141 if ( in dev get(dev out) == NULL) {142 dev put(dev out);143 return -ENODEV; / * Wrong error code * /144 }145146 if (LOCAL MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {147 if (!key.src)148 key.src = inet select addr(dev out, 0, RT SCOPELINK);149 goto make route;150 }151 if (!key.src) {152 if (MULTICAST(oldkey->dst))153 key.src = inet select addr(dev out, 0, key.scope);154 else if (!oldkey->dst)155 key.src = inet select addr(dev out, 0, RT SCOPEHOST);156 }157 }158159 if (!key.dst) {160 key.dst = key.src;161 if (!key.dst)162 key.dst = key.src = htonl(INADDR LOOPBACK);163 if (dev out)164 dev put(dev out);165 dev out = &loopback dev;166 dev hold(dev out);167 key.oif = loopback dev.ifindex;168 res.type = RTN LOCAL;169 flags |= RTCF LOCAL;170 goto make route;171 }172173 if (fib lookup(&key, &res)) {174 res.fi = NULL;175 if (oldkey->oif) {176 / * Apparently, routing tables are wrong. Assume,177 that the destination is on link.178179 WHY? DW.180 Because we are allowed to send to iface181 even if it has NO routes and NO assigned182 addresses. When oif is specified, routing183 tables are looked up with only one purpose:184 to catch if destination is gatewayed, rather than

    Program route.c (continued)

    35

  • 185 direct. Moreover, if MSG DONTROUTE is set,186 we send packet, ignoring both routing tables187 and ifaddr state. --ANK188189190 We could make it even if oif is unknown,191 likely IPv6, but we do not.192 * /193194 if (key.src == 0)195 key.src = inet select addr(dev out, 0, RT SCOPELINK);196 res.type = RTN UNICAST;197 goto make route;198 }199 if (dev out)200 dev put(dev out);201 return -ENETUNREACH;202 }203 free res = 1;204205 if (res.type == RTN NAT)206 goto e inval;207208 if (res.type == RTN LOCAL) {209 if (!key.src)210 key.src = key.dst;211 if (dev out)212 dev put(dev out);213 dev out = &loopback dev;214 dev hold(dev out);215 key.oif = dev out->ifindex;216 if (res.fi)217 fib info put(res.fi);218 res.fi = NULL;219 flags |= RTCF LOCAL;220 goto make route;221 }222223 #ifdef CONFIG IP ROUTEMULTIPATH224 if (res.fi->fib nhs > 1 && key.oif == 0)225 fib select multipath(&key, &res);226 else227 #endif228 if (res.prefixlen==0 && res.type == RTN UNICAST && key.oif == 0)229 fib select default(&key, &res);230

    Program route.c (continued)

    36

  • 231 if (!key.src)232 key.src = FIB RESPREFSRC(res);233234 if (dev out)235 dev put(dev out);236 dev out = FIB RESDEV(res);237 dev hold(dev out);238 key.oif = dev out->ifindex;239240 make route:241 if (LOOPBACK(key.src) && !(dev out->flags&IFF LOOPBACK))242 goto e inval;243244 if (key.dst == 0xFFFFFFFF)245 res.type = RTN BROADCAST;246 else if (MULTICAST(key.dst))247 res.type = RTN MULTICAST;248 else if (BADCLASS(key.dst) || ZERONET(key.dst))249 goto e inval;250251 if (dev out->flags&IFF LOOPBACK)252 flags |= RTCF LOCAL;253254 if (res.type == RTN BROADCAST){255 flags |= RTCF BROADCAST|RTCFLOCAL;256 if (res.fi) {257 fib info put(res.fi);258 res.fi = NULL;259 }260 } else if (res.type == RTN MULTICAST) {261 flags |= RTCF MULTICAST|RTCFLOCAL;262 read lock(&inetdev lock);263 if (! in dev get(dev out) || !ip check mc( in dev get(dev out), oldkey->dst))264 flags &= ˜RTCF LOCAL;265 read unlock(&inetdev lock);266 / * If multicast route do not exist use267 default one, but do not gateway in this case.268 Yes, it is hack.269 * /270 if (res.fi && res.prefixlen < 4) {271 fib info put(res.fi);272 res.fi = NULL;273 }274 }275276 rth = dst alloc(&ipv4 dst ops);

    Program route.c (continued)

    37

  • 277 if (!rth)278 goto e nobufs;279280 atomic set(&rth->u.dst. refcnt, 1);281 rth->u.dst.flags= DST HOST;282 rth->key.dst = oldkey->dst;283 rth->key.tos = tos;284 rth->key.src = oldkey->src;285 rth->key.iif = 0;286 rth->key.oif = oldkey->oif;287 #ifdef CONFIG IP ROUTEFWMARK288 rth->key.fwmark = oldkey->fwmark;289 #endif290 rth->rt dst = key.dst;291 rth->rt src = key.src;292 #ifdef CONFIG IP ROUTENAT293 rth->rt dst map = key.dst;294 rth->rt src map = key.src;295 #endif296 rth->rt iif = oldkey->oif ? : dev out->ifindex;297 rth->u.dst.dev = dev out;298 dev hold(dev out);299 rth->rt gateway = key.dst;300 rth->rt spec dst= key.src;301302 rth->u.dst.output=ip output;303304 if (flags&RTCF LOCAL) {305 rth->u.dst.input = ip local deliver;306 rth->rt spec dst = key.dst;307 }308 if (flags&(RTCF BROADCAST|RTCFMULTICAST)) {309 rth->rt spec dst = key.src;310 if (flags&RTCF LOCAL && !(dev out->flags&IFF LOOPBACK))311 rth->u.dst.output = ip mc output;312 #ifdef CONFIG IP MROUTE313 if (res.type == RTN MULTICAST) {314 struct in device * in dev = in dev get(dev out);315 if (in dev) {316 if (IN DEVMFORWARD(indev) && !LOCAL MCAST(oldkey->dst)) {317 rth->u.dst.input = ip mr input;318 rth->u.dst.output = ip mc output;319 }320 in dev put(in dev);321 }322 }

    Program route.c (continued)

    38

  • 323 #endif324 }325326 rt set nexthop(rth, &res, 0);327328 rth->rt flags = flags;329330 hash = rt hash code(oldkey->dst, oldkey->src (̂oldkey->oif

  • 1 static inline int ip forward finish(struct sk buff * skb)2 {3 struct ip options * opt = &(IPCB(skb)->opt);45 IP INC STATSBH(IpForwDatagrams);67 if (opt->optlen == 0) {8 #ifdef CONFIG NET FASTROUTE9 struct rtable * rt = (struct rtable * )skb->dst;

    1011 if (rt->rt flags&RTCF FAST && !netdev fastroute obstacles) {12 struct dst entry * old dst;13 unsigned h = (( * (u8 * )&rt->key.dst) (̂ * (u8 * )&rt->key.src))&NETDEV FASTROUTEHMASK;1415 write lock irq(&skb->dev->fastpath lock);16 old dst = skb->dev->fastpath[h];17 skb->dev->fastpath[h] = dst clone(&rt->u.dst);18 write unlock irq(&skb->dev->fastpath lock);1920 dst release(old dst);21 }22 #endif23 return (ip send(skb));24 }2526 ip forward options(skb);27 return (ip send(skb));28 }2930 int ip forward(struct sk buff * skb)31 {32 struct net device * dev2; / * Output device * /33 struct iphdr * iph; / * Our header * /34 struct rtable * rt; / * Route we use * /35 struct ip options * opt = &(IPCB(skb)->opt);36 unsigned short mtu;3738 if (IPCB(skb)->opt.router alert && ip call ra chain(skb))39 return NET RX SUCCESS;4041 if (skb->pkt type != PACKET HOST)42 goto drop;4344 / *45 * According to the RFC, we must first decrease the TTL field. If46 * that reaches zero, we must reply an ICMP control message tell ing

    Program ip_forward.c

    40

  • 47 * that the packet’s lifetime expired.48 * /4950 iph = skb->nh.iph;51 rt = (struct rtable * )skb->dst;5253 if (iph->ttl is strictroute && rt->rt dst != rt->rt gateway)57 goto sr failed;5859 / *60 * Having picked a route we can now send the frame out61 * after asking the firewall permission to do so.62 * /6364 skb->priority = rt tos2priority(iph->tos);65 dev2 = rt->u.dst.dev;66 mtu = rt->u.dst.pmtu;6768 / *69 * We now generate an ICMP HOST REDIRECT giving the route70 * we calculated.71 * /72 if (rt->rt flags&RTCF DOREDIRECT && !opt->srr)73 ip rt send redirect(skb);7475 / * We are about to mangle packet. Copy it! * /76 if ((skb = skb cow(skb, dev2->hard header len)) == NULL)77 return NET RX DROP;78 iph = skb->nh.iph;79 opt = &(IPCB(skb)->opt);8081 / * Decrease ttl after skb cow done * /82 ip decrease ttl(iph);8384 / *85 * We now may allocate a new buffer, and copy the datagram into it .86 * If the indicated interface is up and running, kick it.87 * /8889 if (skb->len > mtu && (ntohs(iph->frag off) & IP DF))90 goto frag needed;9192 #ifdef CONFIG IP ROUTENAT

    Program ip_forward.c (continued)

    41

  • 93 if (rt->rt flags & RTCF NAT) {94 if (ip do nat(skb)) {95 kfree skb(skb);96 return NET RX BAD;97 }98 }99 #endif

    100101 return NF HOOK(PFINET, NF IP FORWARD, skb, skb->dev, dev2,102 ip forward finish);103104 frag needed:105 IP INC STATSBH(IpFragFails);106 icmp send(skb, ICMP DESTUNREACH, ICMPFRAGNEEDED, htonl(mtu));107 goto drop;108109 sr failed:110 / *111 * Strict routing permits no gatewaying112 * /113 icmp send(skb, ICMP DESTUNREACH, ICMPSR FAILED, 0);114 goto drop;115116 too many hops:117 / * Tell the sender its packet died... * /118 icmp send(skb, ICMP TIME EXCEEDED, ICMPEXCTTL, 0);119 drop:120 kfree skb(skb);121 return NET RX DROP;122 }

    Program ip_forward.c (continued)

  • Assuming the packet is addressed locally (addressed to this machine), see ip_input.c:ip_local_deliver. There is another net hook here, allowing additional processing before moving up the stack. After all net hooks are called, see ip_local_deliver_finish(). If there is a raw socket, the packet is not delivered to layer 4 but to the user process; see lines 96-97 in the handout.
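    "Up the stack" works through another registration table: each transport protocol hands the kernel a struct inet_protocol whose handler member ip_local_deliver_finish() finds in the inet_protos hash (see the fast path in the listing above; the real entries for TCP, UDP, and ICMP are set up elsewhere in net/ipv4). A hedged sketch of such a registration (my_l4_rcv, my_inet_protocol, and the protocol number 200 are invented for illustration):

    #include <linux/module.h>
    #include <linux/skbuff.h>
    #include <net/protocol.h>

    /* Invented layer-4 receive handler: gets the skb with skb->h.raw already
     * pointing just past the IP header (set in ip_local_deliver_finish). */
    static int my_l4_rcv(struct sk_buff *skb, unsigned short len)
    {
            kfree_skb(skb);
            return 0;
    }

    static struct inet_protocol my_inet_protocol;

    int init_module(void)
    {
            my_inet_protocol.handler  = my_l4_rcv;
            my_inet_protocol.protocol = 200;        /* unassigned IP protocol number, example only */
            my_inet_protocol.name     = "my-proto";
            inet_add_protocol(&my_inet_protocol);
            return 0;
    }

    void cleanup_module(void)
    {
            inet_del_protocol(&my_inet_protocol);
    }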

    Then the packet is forwarded up the stack normally; we will look at this later.

    Assuming the packet must be forwarded, see ip_forward.c:ip_forward(). Note the TTL check, and the dropping of the packet if the TTL has expired. After the net hooks are done, we end up at ip_forward_finish. After dealing with any possible IP options (not discussed here), we end up at ip_send(), and then finally at ip_output.c:ip_finish_output(). After calling the net hooks we end up at ip_finish_output2(). Then the new layer 2 header is added and the function hh_output(skb) is called. Eventually, we end up in dev.c:dev_queue_xmit(), and finally in sch_generic.c:pfifo_fast_enqueue().
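    ip_send() itself is not in any of the listings; in this kernel it appears to be a short inline in include/net/ip.h that either fragments or hands the skb straight to ip_finish_output(). Roughly (my paraphrase, check the header for the exact text):

    /* Paraphrase of ip_send() from include/net/ip.h (2.4): fragment if the
     * packet exceeds the path MTU cached on the route, otherwise output it. */
    static inline int ip_send(struct sk_buff *skb)
    {
            if (skb->len > skb->dst->pmtu)
                    return ip_fragment(skb, ip_finish_output);  /* split, then output each piece */
            return ip_finish_output(skb);                       /* on to NF_IP_POST_ROUTING      */
    }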

    As an aside, the choice of queueing discipline (for example, whether some packets should have priority over others) is made in sch_generic.c:dev_activate.
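    Each device's Qdisc is essentially a pair of enqueue/dequeue function pointers that dev_queue_xmit() and qdisc_run() call. The default pfifo_fast discipline keeps three FIFO bands and maps skb->priority onto a band, so higher-priority traffic is always dequeued first. A simplified illustration of that idea (not the kernel's code; MY_BANDS, prio2band_sketch, and both functions are made up, and the bands array is assumed to have been initialized with skb_queue_head_init):

    #include <linux/skbuff.h>

    #define MY_BANDS 3
    static const int prio2band_sketch[8] = { 1, 2, 2, 2, 1, 2, 0, 0 };

    static int prio_enqueue_sketch(struct sk_buff *skb, struct sk_buff_head *bands)
    {
            int band = prio2band_sketch[skb->priority & 7]; /* map priority to one of 3 bands */
            skb_queue_tail(&bands[band], skb);              /* FIFO order within a band       */
            return 0;
    }

    static struct sk_buff *prio_dequeue_sketch(struct sk_buff_head *bands)
    {
            int band;
            struct sk_buff *skb;

            for (band = 0; band < MY_BANDS; band++) {       /* strict priority: band 0 drains first */
                    skb = skb_dequeue(&bands[band]);
                    if (skb != NULL)
                            return skb;
            }
            return NULL;                                    /* nothing queued on any band */
    }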

    42

  • 1 / *2 * INET An implementation of the TCP/IP protocol suite for the L INUX3 * operating system. INET is implemented using the BSD Socket4 * interface as the means of communication with the user level.5 *6 * The Internet Protocol (IP) output module.7 *8 * Version: $Id: ip output.c,v 1.87 2000/10/25 20:07:22 davem Exp $9 *

    10 * Authors: Ross Biro, 11 * Fred N. van Kempen, 12 * Donald Becker, 13 * Alan Cox, 14 * Richard Underwood15 * Stefan Becker, 16 * Jorge Cwik, 17 * Arnt Gulbrandsen, 18 *19 * See ip input.c for original log20 *21 * Fixes:22 * Alan Cox : Missing nonblock feature in ip build xmit.23 * Mike Kilburn : htons() missing in ip build xmit.24 * Bradford Johnson: Fix faulty handling of some frames when25 * no route is found.26 * Alexander Demenshin: Missing sk/skb free in ip queue xmit27 * (in case if packet not accepted by28 * output firewall rules)29 * Mike McLagan : Routing by source30 * Alexey Kuznetsov: use new route cache31 * Andi Kleen: Fix broken PMTU recovery and remove32 * some redundant tests.33 * Vitaly E. Lavrov : Transparent proxy revived after year coma .34 * Andi Kleen : Replace ip reply with ip send reply.35 * Andi Kleen : Split fast and slow ip build xmit path36 * for decreased register pressure on x8637 * and more readibility.38 * Marc Boucher : When call out firewall returns FW QUEUE,39 * silently drop skb instead of failing with -EPERM.40 * /4142 #include 43 #include 44 #include 45 #include 46 #include

    Program ip_output.c

    43

  • 47 #include 48 #include 49 #include 50 #include 5152 #include 53 #include 54 #include 55 #include 56 #include 57 #include 58 #include 59 #include 60 #include 6162 #include 63 #include 64 #include 65 #include 66 #include 67 #include 68 #include 69 #include 70 #include 71 #include 72 #include 73 #include 74 #include 75 #include 76 #include 77 #include 78 #include 7980 / *81 * Shall we try to damage output packets if routing dev changes?82 * /8384 int sysctl ip dynaddr = 0;85 int sysctl ip default ttl = IPDEFTTL;8687 / * Generate a checksum for an outgoing IP datagram. * /88 inline void ip send check(struct iphdr * iph)89 {90 iph->check = 0;91 iph->check = ip fast csum((unsigned char * )iph, iph->ihl);92 }

    Program ip_output.c (continued)

    44

  • 9394 / * dev loopback xmit for use with netfilter. * /95 static int ip dev loopback xmit(struct sk buff * newskb)96 {97 newskb->mac.raw = newskb->data;98 skb pull(newskb, newskb->nh.raw - newskb->data);99 newskb->pkt type = PACKET LOOPBACK;

    100 newskb->ip summed = CHECKSUMUNNECESSARY;101 BUGTRAP(newskb->dst);102103 #ifdef CONFIG NETFILTER DEBUG104 nf debug ip loopback xmit(newskb);105 #endif106 netif rx(newskb);107 return 0;108 }109110 / * Don’t just hand NF HOOK skb->dst->output, in case netfilter hook111 changes route * /112 static inline int113 output maybe reroute(struct sk buff * skb)114 {115 return skb->dst->output(skb);116 }117118 / *119 * Add an ip header to a skbuff and send it out.120 * /121 int ip build and send pkt(struct sk buff * skb, struct sock * sk,122 u32 saddr, u32 daddr, struct ip options * opt)123 {124 struct rtable * rt = (struct rtable * )skb->dst;125 struct iphdr * iph;126127 / * Build the IP header. * /128 if (opt)129 iph=(struct iphdr * )skb push(skb,sizeof(struct iphdr) + opt->optlen);130 else131 iph=(struct iphdr * )skb push(skb,sizeof(struct iphdr));132133 iph->version = 4;134 iph->ihl = 5;135 iph->tos = sk->protinfo.af inet.tos;136 iph->frag off = 0;137 if (ip dont fragment(sk, &rt->u.dst))138 iph->frag off |= htons(IP DF);

    Program ip_output.c (continued)

    45

  • 139 iph->ttl = sk->protinfo.af inet.ttl;140 iph->daddr = rt->rt dst;141 iph->saddr = rt->rt src;142 iph->protocol = sk->protocol;143 iph->tot len = htons(skb->len);144 ip select ident(iph, &rt->u.dst);145 skb->nh.iph = iph;146147 if (opt && opt->optlen) {148 iph->ihl += opt->optlen>>2;149 ip options build(skb, opt, daddr, rt, 0);150 }151 ip send check(iph);152153 / * Send it out. * /154 return NF HOOK(PFINET, NF IP LOCALOUT, skb, NULL, rt->u.dst.dev,155 output maybe reroute);156 }157158 static inline int ip finish output2(struct sk buff * skb)159 {160 struct dst entry * dst = skb->dst;161 struct hh cache * hh = dst->hh;162163 #ifdef CONFIG NETFILTER DEBUG164 nf debug ip finish output2(skb);165 #endif / * CONFIGNETFILTER DEBUG* /166167 if (hh) {168 read lock bh(&hh->hh lock);169 memcpy(skb->data - 16, hh->hh data, 16);170 read unlock bh(&hh->hh lock);171 skb push(skb, hh->hh len);172 return hh->hh output(skb);173 } else if (dst->neighbour)174 return dst->neighbour->output(skb);175176 printk(KERN DEBUG "khm\n");177 kfree skb(skb);178 return -EINVAL;179 }180181 inline int ip finish output(struct sk buff * skb)182 {183 struct net device * dev = skb->dst->dev;184

    Program ip_output.c (continued)

    46

  • 185 skb->dev = dev;186 skb->protocol = constant htons(ETH P IP);187188 return NF HOOK(PFINET, NF IP POSTROUTING, skb, NULL, dev,189 ip finish output2);190 }191192 int ip mc output(struct sk buff * skb)193 {194 struct sock * sk = skb->sk;195 struct rtable * rt = (struct rtable * )skb->dst;196 struct net device * dev = rt->u.dst.dev;197198 / *199 * If the indicated interface is up and running, send the packet .200 * /201 IP INC STATS(IpOutRequests);202 #ifdef CONFIG IP ROUTENAT203 if (rt->rt flags & RTCF NAT)204 ip do nat(skb);205 #endif206207 skb->dev = dev;208 skb->protocol = constant htons(ETH P IP);209210 / *211 * Multicasts are looped back for other local users212 * /213214 if (rt->rt flags&RTCF MULTICAST && (!sk || sk->protinfo.af inet.mc loop)) {215 #ifdef CONFIG IP MROUTE216 / * Small optimization: do not loopback not local frames,217 which returned after forwarding; they will be dropped218 by ip mr input in any case.219 Note, that local frames are looped back to be delivered220 to local recipients.221222 This check is duplicated in ip mr input at the moment.223 * /224 if ((rt->rt flags&RTCF LOCAL) || !(IPCB(skb)->flags&IPSKB FORWARDED))225 #endif226 {227 struct sk buff * newskb = skb clone(skb, GFP ATOMIC);228 if (newskb)229 NF HOOK(PFINET, NF IP POSTROUTING, newskb, NULL,230 newskb->dev,

    Program ip_output.c (continued)

    47

  • 231 ip dev loopback xmit);232 }233234 / * Multicasts with ttl 0 must not go beyond the host * /235236 if (skb->nh.iph->ttl == 0) {237 kfree skb(skb);238 return 0;239 }240 }241242 if (rt->rt flags&RTCF BROADCAST){243 struct sk buff * newskb = skb clone(skb, GFP ATOMIC);244 if (newskb)245 NF HOOK(PFINET, NF IP POSTROUTING, newskb, NULL,246 newskb->dev, ip dev loopback xmit);247 }248249 return ip finish output(skb);250 }251252 int ip output(struct sk buff * skb)253 {254 #ifdef CONFIG IP ROUTENAT255 struct rtable * rt = (struct rtable * )skb->dst;256 #endif257258 IP INC STATS(IpOutRequests);259260 #ifdef CONFIG IP ROUTENAT261 if (rt->rt flags&RTCF NAT)262 ip do nat(skb);263 #endif264265 return ip finish output(skb);266 }267268 / * Queues a packet to be sent, and starts the transmitter if nece ssary.269 * This routine also needs to put in the total length and compute the270 * checksum. We use to do this in two stages, ip build header() then271 * this, but that scheme created a mess when routes disappeared etc.272 * So we do it all here, and the TCP send engine has been changed to273 * match. (No more unroutable FIN disasters, etc. wheee...) Th is will274 * most likely make other reliable transport layers above IP ea sier275 * to implement under Linux.276 * /

    Program ip_output.c (continued)

    48

  • 277 static inline int ip queue xmit2(struct sk buff * skb)278 {279 struct sock * sk = skb->sk;280 struct rtable * rt = (struct rtable * )skb->dst;281 struct net device * dev;282 struct iphdr * iph = skb->nh.iph;283284 dev = rt->u.dst.dev;285286 / * This can happen when the transport layer has segments queued287 * with a cached route, and by the time we get here things are288 * re-routed to a device with a different MTU than the original289 * device. Sick, but we must cover it.290 * /291 if (skb headroom(skb) < dev->hard header len && dev->hard header) {292 struct sk buff * skb2;293294 skb2 = skb realloc headroom(skb, (dev->hard header len + 15) & ˜15);295 kfree skb(skb);296 if (skb2 == NULL)297 return -ENOMEM;298 if (sk)299 skb set owner w(skb2, sk);300 skb = skb2;301 iph = skb->nh.iph;302 }303304 if (skb->len > rt->u.dst.pmtu)305 goto fragment;306307 if (ip dont fragment(sk, &rt->u.dst))308 iph->frag off |= constant htons(IP DF);309310 ip select ident(iph, &rt->u.dst);311312 / * Add an IP checksum. * /313 ip send check(iph);314315 skb->priority = sk->priority;316 return skb->dst->output(skb);317318 fragment:319 if (ip dont fragment(sk, &rt->u.dst)) {320 / * Reject packet ONLY if TCP might fragment321 * it itself, if were careful enough.322 * /

    Program ip_output.c (continued)

    49

  • 323 iph->frag off |= constant htons(IP DF);324 NETDEBUG(printk(KERN DEBUG "sending pkt too big to self \n"));325326 icmp send(skb, ICMP DESTUNREACH, ICMPFRAGNEEDED,327 htonl(rt->u.dst.pmtu));328 kfree skb(skb);329 return -EMSGSIZE;330 }331 ip select ident(iph, &rt->u.dst);332 return ip fragment(skb, skb->dst->output);333 }334335 int ip queue xmit(struct sk buff * skb)336 {337 struct sock * sk = skb->sk;338 struct ip options * opt = sk->protinfo.af inet.opt;339 struct rtable * rt;340 struct iphdr * iph;341342 / * Make sure we can route this packet. * /343 rt = (struct rtable * ) sk dst check(sk, 0);344 if (rt == NULL) {345 u32 daddr;346347 / * Use correct destination address if we have options. * /348 daddr = sk->daddr;349 if(opt && opt->srr)350 daddr = opt->faddr;351352 / * If this fails, retransmit mechanism of transport layer will353 * keep trying until route appears or the connection times itse lf354 * out.355 * /356 if (ip route output(&rt, daddr, sk->saddr,357 RT TOS(sk->protinfo.af inet.tos) | RTO CONN | sk->localroute,358 sk->bound dev if))359 goto no route;360 sk dst set(sk, &rt->u.dst);361 }362 skb->dst = dst clone(&rt->u.dst);363364 if (opt && opt->is strictroute && rt->rt dst != rt->rt gateway)365 goto no route;366367 / * OK, we know where to send it, allocate and build IP header. * /368 iph = (struct iphdr * ) skb push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));

    Program ip_output.c (continued)

    50

  • 369 * (( u16 * )iph) = htons((4 tot len = htons(skb->len);371 iph->frag off = 0;372 iph->ttl = sk->protinfo.af inet.ttl;373 iph->protocol = sk->protocol;374 iph->saddr = rt->rt src;375 iph->daddr = rt->rt dst;376 skb->nh.iph = iph;377 / * Transport layer set skb->h.foo itself. * /378379 if(opt && opt->optlen) {380 iph->ihl += opt->optlen >> 2;381 ip options build(skb, opt, sk->daddr, rt, 0);382 }383384 return NF HOOK(PFINET, NF IP LOCALOUT, skb, NULL, rt->u.dst.dev,385 ip queue xmit2);386387 no route:388 IP INC STATS(IpOutNoRoutes);389 kfree skb(skb);390 return -EHOSTUNREACH;391 }392393 / *394 * Build and send a packet, with as little as one copy395 *396 * Doesn’t care much about ip options... option length can be397 * different for fragment at 0 and other fragments.398 *399 * Note that the fragment at the highest offset is sent first,400 * so the getfrag routine can fill in the TCP/UDP checksum heade r401 * field in the last fragment it sends... actually it also helps402 * the reassemblers, they can put most packets in at the head of403 * the fragment queue, and they know the total size in advance. T his404 * last feature will measurably improve the Linux fragment han dler one405 * day.406 *407 * The callback has five args, an arbitrary pointer (copy of fra g),408 * the source IP address (may depend on the routing table), the409 * destination address (char * ), the offset to copy from, and the410 * length to be copied.411 * /412413 static int ip build xmit slow(struct sock * sk,414 int getfrag (const void * ,

    Program ip_output.c (continued)

    51

  • 415 char * ,416 unsigned int,417 unsigned int),418 const void * frag,419 unsigned length,420 struct ipcm cookie * ipc,421 struct rtable * rt,422 int flags)423 {424 unsigned int fraglen, maxfraglen, fragheaderlen;425 int err;426 int offset, mf;427 int mtu;428 u16 id = 0;429430 int hh len = (rt->u.dst.dev->hard header len + 15)&˜15;431 int nfrags=0;432 struct ip options * opt = ipc->opt;433 int df = 0;434435 mtu = rt->u.dst.pmtu;436 if (ip dont fragment(sk, &rt->u.dst))437 df = htons(IP DF);438439 length -= sizeof(struct iphdr);440441 if (opt) {442 fragheaderlen = sizeof(struct iphdr) + opt->optlen;443 maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ˜ 7) + fragheaderlen;444 } else {445 fragheaderlen = sizeof(struct iphdr);446447 / *448 * Fragheaderlen is the size of ’overhead’ on each buffer. Now w ork449 * out the size of the frames to send.450 * /451452 maxfraglen = ((mtu-sizeof(struct iphdr)) & ˜7) + fragheade rlen;453 }454455 if (length + fragheaderlen > 0xFFFF) {456 ip local error(sk, EMSGSIZE, rt->rt dst, sk->dport, mtu);457 return -EMSGSIZE;458 }459460 / *

    Program ip_output.c (continued)

    52

  • 461 * Start at the end of the frame by handling the remainder.462 * /463464 offset = length - (length % (maxfraglen - fragheaderlen));465466 / *467 * Amount of memory to allocate for final fragment.468 * /469470 fraglen = length - offset + fragheaderlen;471472 if (length-offset==0) {473 fraglen = maxfraglen;474 offset -= maxfraglen-fragheaderlen;475 }476477 / *478 * The last fragment will not have MF (more fragments) set.479 * /480481 mf = 0;482483 / *484 * Don’t fragment packets for path mtu discovery.485 * /486487 if (offset > 0 && sk->protinfo.af inet.pmtudisc==IP PMTUDISCDO) {488 ip local error(sk, EMSGSIZE, rt->rt dst, sk->dport, mtu);489 return -EMSGSIZE;490 }491 if (flags&MSG PROBE)492 goto out;493494 / *495 * Begin outputting the bytes.496 * /497498 do {499 char * data;500 struct sk buff * skb;501502 / *503 * Get the memory we require with some space left for alignment.504 * /505506 skb = sock alloc send skb(sk, fraglen+hh len+15, 0, flags&MSG DONTWAIT, &err);

    Program ip_output.c (continued)

    53

  • 507 if (skb == NULL)508 goto error;509510 / *511 * Fill in the control structures512 * /513514 skb->priority = sk->priority;515 skb->dst = dst clone(&rt->u.dst);516 skb reserve(skb, hh len);517518 / *519 * Find where to start putting bytes.520 * /521522 data = skb put(skb, fraglen);523 skb->nh.iph = (struct iphdr * )data;524525 / *526 * Only write IP header onto non-raw packets527 * /528529 {530 struct iphdr * iph = (struct iphdr * )data;531532 iph->version = 4;533 iph->ihl = 5;534 if (opt) {535 iph->ihl += opt->optlen>>2;536 ip options build(skb, opt,537 ipc->addr, rt, offset);538 }539 iph->tos = sk->protinfo.af inet.tos;540 iph->tot len = htons(fraglen - fragheaderlen + iph->ihl * 4);541 iph->frag off = htons(offset>>3)|mf|df;542 iph->id = id;543 if (!mf) {544 if (offset || !df) {545 / * Select an unpredictable ident only546 * for packets without DF or having547 * been fragmented.548 * /549 ip select ident(iph, &rt->u.dst);550 id = iph->id;551 }552

    Program ip_output.c (continued)

    54

  • 553 / *554 * Any further fragments will have MF set.555 * /556 mf = htons(IP MF);557 }558 if (rt->rt type == RTN MULTICAST)559 iph->ttl = sk->protinfo.af inet.mc ttl;560 else561 iph->ttl = sk->protinfo.af inet.ttl;562 iph->protocol = sk->protocol;563 iph->check = 0;564 iph->saddr = rt->rt src;565 iph->daddr = rt->rt dst;566 iph->check = ip fast csum((unsigned char * )iph, iph->ihl);567 data += iph->ihl * 4;568 }569570 / *571 * User data callback572 * /573574 if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {575 err = -EFAULT;576 kfree skb(skb);577 goto error;578 }579580 offset -= (maxfraglen-fragheaderlen);581 fraglen = maxfraglen;582583 nfrags++;584585 err = NF HOOK(PFINET, NF IP LOCALOUT, skb, NULL,586 skb->dst->dev, output maybe reroute);587 if (err) {588 if (err > 0)589 err = sk->protinfo.af inet.recverr ? net xmit errno(err) : 0;590 if (err)591 goto error;592 }593 } while (offset >= 0);594595 if (nfrags>1)596 ip statistics[smp processor id() * 2 + !in softirq()].IpFragCreates += nfrags;597 out:598 return 0;

    Program ip_output.c (continued)

    55

  • 599600 error:601 IP INC STATS(IpOutDiscards);602 if (nfrags>1)603 ip statistics[smp processor id() * 2 + !in softirq()].IpFragCreates += nfrags;604 return err;605 }606607 / *608 * Fast path for unfragmented packets.609 * /610 int ip build xmit(struct sock * sk,611 int getfrag (const void * ,612 char * ,613 unsigned int,614 unsigned int),615 const void * frag,616 unsigned length,617 struct ipcm cookie * ipc,618 struct rtable * rt,619 int flags)620 {621 int err;622 struct sk buff * skb;623 int df;624 struct iphdr * iph;625626 / *627 * Try the simple case first. This leaves fragmented frames, an d by628 * choice RAW frames within 20 bytes of maximum size(rare) to th e long path629 * /630631 if (!sk->protinfo.af inet.hdrincl) {632 length += sizeof(struct iphdr);633634 / *635 * Check for slow path.636 * /637 if (length > rt->u.dst.pmtu || ipc->opt != NULL)638 return ip build xmit slow(sk,getfrag,frag,length,ipc,rt,flags);639 } else {640 if (length > rt->u.dst.dev->mtu) {641 ip local error(sk, EMSGSIZE, rt->rt dst, sk->dport, rt->u.dst.dev->mtu);642 return -EMSGSIZE;643 }644 }

    Program ip_output.c (continued)

    56

  • 645 if (flags&MSG PROBE)646 goto out;647648 / *649 * Do path mtu discovery if needed.650 * /651 df = 0;652 if (ip dont fragment(sk, &rt->u.dst))653 df = htons(IP DF);654655 / *656 * Fast path for unfragmented frames without options.657 * /658 {659 int hh len = (rt->u.dst.dev->hard header len + 15)&˜15;660661 skb = sock alloc send skb(sk, length+hh len+15,662 0, flags&MSG DONTWAIT, &err);663 if(skb==NULL)664 goto error;665 skb reserve(skb, hh len);666 }667668 skb->priority = sk->priority;669 skb->dst = dst clone(&rt->u.dst);670671 skb->nh.iph = iph = (struct iphdr * )skb put(skb, length);672673 if(!sk->protinfo.af inet.hdrincl) {674 iph->version=4;675 iph->ihl=5;676 iph->tos=sk->protinfo.af inet.tos;677 iph->tot len = htons(length);678 iph->frag off = df;679 iph->ttl=sk->protinfo.af inet.mc ttl;680 ip select ident(iph, &rt->u.dst);681 if (rt->rt type != RTN MULTICAST)682 iph->ttl=sk->protinfo.af inet.ttl;683 iph->protocol=sk->protocol;684 iph->saddr=rt->rt src;685 iph->daddr=rt->rt dst;686 iph->check=0;687 iph->check = ip fast csum((unsigned char * )iph, iph->ihl);688 err = getfrag(frag, ((char * )iph)+iph->ihl * 4,0, length-iph->ihl * 4);689 }690 else

    Program ip_output.c (continued)

    57

  • 691 err = getfrag(frag, (void * )iph, 0, length);692693 if (err)694 goto error fault;695696 err = NF HOOK(PFINET, NF IP LOCALOUT, skb, NULL, rt->u.dst.dev,697 output maybe reroute);698 if (err > 0)699 err = sk->protinfo.af inet.recverr ? net xmit errno(err) : 0;700 if (err)701 goto error;702 out:703 return 0;704705 error fault:706 err = -EFAULT;707 kfree skb(skb);708 error:709 IP INC STATS(IpOutDiscards);710 return err;711 }712713 / *714 * This IP datagram is too large to be sent in one piece. Break it u p into715 * smaller pieces (each of size equal to IP header plus716 * a block of the data of the original IP data part) that will yet f it in a717 * single device frame, and queue such a frame for sending.718 *719 * Yes this is inefficient, feel free to submit a quicker one.720 * /721722 int ip fragment(struct sk buff * skb, int ( * output)(struct sk buff * ))723 {724 struct iphdr * iph;725 unsigned char * raw;726 unsigned char * ptr;727 struct net device * dev;728 struct sk buff * skb2;729 unsigned int mtu, hlen, left, len;730 int offset;731 int not last frag;732 struct rtable * rt = (struct rtable * )skb->dst;733 int err = 0;734735 dev = rt->u.dst.dev;736

    Program ip_output.c (continued)

    58

  • 737 / *738 * Point into the IP datagram header.739 * /740741 raw = skb->nh.raw;742 iph = (struct iphdr * )raw;743744 / *745 * Setup starting values.746 * /747748 hlen = iph->ihl * 4;749 left = ntohs(iph->tot len) - hlen; / * Space per frame * /750 mtu = rt->u.dst.pmtu - hlen; / * Size of data space * /751 ptr = raw + hlen; / * Where to start from * /752753 / *754 * Fragment the datagram.755 * /756757 offset = (ntohs(iph->frag off) & IP OFFSET) frag off & htons(IP MF);759760 / *761 * Keep copying data until we run out.762 * /763764 while(left > 0) {765 len = left;766 / * IF: it doesn’t fit, use ’mtu’ - the data space left * /767 if (len > mtu)768 len = mtu;769 / * IF: we are not sending upto and including the packet end770 then align the next start on an eight byte boundary * /771 if (len < left) {772 len &= ˜7;773 }774 / *775 * Allocate buffer.776 * /777778 if ((skb2 = alloc skb(len+hlen+dev->hard header len+15,GFP ATOMIC)) == NULL) {779 NETDEBUG(printk(KERN INFO "IP: frag: no memory for new fragment! \n"));780 err = -ENOMEM;781 goto fail;782 }

    Program ip_output.c (continued)

    59

  • 783784 / *785 * Set up data on packet786 * /787788 skb2->pkt type = skb->pkt type;789 skb2->priority = skb->priority;790 skb reserve(skb2, (dev->hard header len+15)&˜15);791 skb put(skb2, len + hlen);792 skb2->nh.raw = skb2->data;793 skb2->h.raw = skb2->data + hlen;794795 / *796 * Charge the memory for the fragment to any owner797 * it might possess798 * /799800 if (skb->sk)801 skb set owner w(skb2, skb->sk);802 skb2->dst = dst clone(skb->dst);803 skb2->dev = skb->dev;804805 / *806 * Copy the packet header into the new buffer.807 * /808809 memcpy(skb2->nh.raw, raw, hlen);810811 / *812 * Copy a block of the IP datagram.813 * /814 memcpy(skb2->h.raw, ptr, len);815 left -= len;816817 / *818 * Fill in the new header fields.819 * /820 iph = skb2->nh.iph;821 iph->frag off = htons((offset >> 3));822823 / * ANK: dirty, but effective trick. Upgrade options only if824 * the segment to be fragmented was THE FIRST (otherwise,825 * options are already fixed) and make it ONCE826 * on the initial skb, so that all the following fragments827 * will inherit fixed options.828 * /

    Program ip_output.c (continued)

    60

  • 829 if (offset == 0)830 ip options fragment(skb);831832 / *833 * Added AC : If we are fragmenting a fragment that’s not the834 * last fragment then keep MF on each bit835 * /836 if (left > 0 || not last frag)837 iph->frag off |= htons(IP MF);838 ptr += len;839 offset += len;840841 #ifdef CONFIG NETFILTER842 / * Connection association is same as pre-frag packet * /843 skb2->nfct = skb->nfct;844 nf conntrack get(skb2->nfct);845 #ifdef CONFIG NETFILTER DEBUG846 skb2->nf debug = skb->nf debug;847 #endif848 #endif849850 / *851 * Put this fragment into the sending queue.852 * /853854 IP INC STATS(IpFragCreates);855856 iph->tot len = htons(len + hlen);857858 ip send check(iph);859860 err = output(skb2);861 if (err)862 goto fail;863 }864 kfree skb(skb);865 IP INC STATS(IpFragOKs);866 return err;867868 fail:869 kfree skb(skb);870 IP INC STATS(IpFragFails);871 return err;872 }873874 / *

    Program ip_output.c (continued)

    61

  • 875 * Fetch data from kernel space and fill in checksum if needed.876 * /877 static int ip reply glue bits(const void * dptr, char * to, unsigned int offset,878 unsigned int fraglen)879 {880 struct ip reply arg * dp = (struct ip reply arg * )dptr;881 u16 * pktp = (u16 * )to;882 struct iovec * iov;883 int len;884 int hdrflag = 1;885886 iov = &dp->iov[0];887 if (offset >= iov->iov len) {888 offset -= iov->iov len;889 iov++;890 hdrflag = 0;891 }892 len = iov->iov len - offset;893 if (fraglen > len) { / * overlapping. * /894 dp->csum = csum partial copy nocheck(iov->iov base+offset, to, len,895 dp->csum);896 offset = 0;897 fraglen -= len;898 to += len;899 iov++;900 }901902 dp->csum = csum partial copy nocheck(iov->iov base+offset, to, fraglen,903 dp->csum);904905 if (hdrflag && dp->csumoffset)906 * (pktp + dp->csumoffset) = csum fold(dp->csum); / * fill in checksum * /907 return 0;908 }909910 / *911 * Generic function to send a packet as reply to another packet.912 * Used to send TCP resets so far. ICMP should use this function t oo.913 *914 * Should run single threaded per socket because it uses the soc k915 * structure to pass arguments.916 * /917 void ip send reply(struct sock * sk, struct sk buff * skb, struct ip reply arg * arg,918 unsigned int len)919 {920 struct {

    Program ip_output.c (continued)

    62

  • 921 struct ip options opt;922 char data[40];923 } replyopts;924 struct ipcm cookie ipc;925 u32 daddr;926 struct rtable * rt = (struct rtable * )skb->dst;927928 if (ip options echo(&replyopts.opt, skb))929 return;930931 daddr = ipc.addr = rt->rt src;932 ipc.opt = NULL;933934 if (replyopts.opt.optlen) {935 ipc.opt = &replyopts.opt;936937 if (ipc.opt->srr)938 daddr = replyopts.opt.faddr;939 }940941 if (ip route output(&rt, daddr, rt->rt spec dst, RT TOS(skb->nh.iph->tos), 0))942 return;943944 / * And let IP do all the hard work.945946 This chunk is not reenterable, hence spinlock.947 Note that it uses the fact, that this function is called948 with locally disabled BH and that sk cannot be already spinlo cked.949 * /950 bh lock sock(sk);951 sk->protinfo.af inet.tos = skb->nh.iph->tos;952 sk->priority = skb->priority;953 sk->protocol = skb->nh.iph->protocol;954 ip build xmit(sk, ip reply glue bits, arg, len, &ipc, rt, MSG DONTWAIT);955 bh unlock sock(sk);956957 ip rt put(rt);958 }959960 / *961 * IP protocol layer initialiser962 * /963964 static struct packet type ip packet type =965 {966 constant htons(ETH P IP),

    Program ip_output.c (continued)

    63

967         NULL,                   /* All devices */
968         ip_rcv,
969         (void *)1,
970         NULL,
971 };
972
973 /*
974  * IP registers the packet type and then calls the subprotocol initialisers
975  */
976
977 void __init ip_init(void)
978 {
979         dev_add_pack(&ip_packet_type);
980
981         ip_rt_init();
982         inet_initpeers();
983
984 #ifdef CONFIG_IP_MULTICAST
985         proc_net_create("igmp", 0, ip_mc_procinfo);
986 #endif
987 }

    Program ip_output.c (continued)

    64

  • 12 #include / * struct sock * /34 struct inet skb parm5 {6 struct ip options opt; / * Compiled IP options * /7 unsigned char flags;89 #define IPSKB MASQUERADED 1

    10 #define IPSKB TRANSLATED 211 #define IPSKB FORWARDED 412 };1314 struct ipcm cookie15 {16 u32 addr;17 int oif;18 struct ip options * opt;19 };2021 #define IPCB(skb) ((struct inet skb parm* )((skb)->cb))2223 struct ip ra chain24 {25 struct ip ra chain * next;26 struct sock * sk;27 void ( * destructor)(struct sock * );28 };2930 extern struct ip ra chain * ip ra chain;31 extern rwlock t ip ra lock;3233 / * IP flags. * /34 #define IP CE 0x8000 / * Flag: "Congestion" * /35 #define IP DF 0x4000 / * Flag: "Don’t Fragment" * /36 #define IP MF 0x2000 / * Flag: "More Fragments" * /37 #define IP OFFSET 0x1FFF / * "Fragment Offset" part * /3839 #define IP FRAGTIME (30 * HZ) / * fragment lifetime * /4041 extern void ip mc dropsocket(struct sock * );42 extern void ip mc dropdevice(struct net device * dev);43 extern int ip mc procinfo(char * , char ** , off t, int);4445 / *46 * Functions provided by ip.c

    Program net/ip.h

    65

  • 47 * /4849 extern int ip build and send pkt(struct sk buff * skb, struct sock * sk,50 u32 saddr, u32 daddr,51 struct ip options * opt);52 extern int ip rcv(struct sk buff * skb, struct net device * dev,53 struct packet type * pt);54 extern int ip local deliver(struct sk buff * skb);55 extern int ip mr input(struct sk buff * skb);56 extern int ip output(struct sk buff * skb);57 extern int ip mc output(struct sk buff * skb);58 extern int ip fragment(struct sk buff * skb, int ( * out)(struct sk buff * ));59 extern int ip do nat(struct sk buff * skb);60 extern void ip send check(struct iphdr * ip);61 extern int ip queue xmit(struct sk buff * skb);62 extern void ip init(void);63 extern int ip build xmit(struct sock * sk,64 int getfrag (const void * ,65 char * ,66 unsigned int,67 unsigned int),68 const void * frag,69 unsigned length,70 struct ipcm cookie * ipc,71 struct rtable * rt,72 int flags);7374 / *75 * Map a multicast IP onto multicast MAC for type Token Ring.76 * This conforms to RFC1469 Option 2 Multicasting i.e.77 * using a functional address to transmit / receive78 * multicast packets.79 * /8081 static inline void ip tr mc map(u32 addr, char * buf)82 {83 buf[0]=0xC0;84 buf[1]=0x00;85 buf[2]=0x00;86 buf[3]=0x04;87 buf[4]=0x00;88 buf[5]=0x00;89 }9091 struct ip reply arg {92 struct iovec iov[2];

    Program net/ip.h (continued)

    66

  • 93 int n iov; / * redundant * /94 u32 csum;95 int csumoffset; / * u16 offset of csum in iov[0].iov base * /96 / * -1 if not needed * /97 };9899 void ip send reply(struct sock * sk, struct sk buff * skb, struct ip reply arg * arg,

    100 unsigned int len);101102 extern inline int ip finish output(struct sk buff * skb);103104 struct ipv4 config105 {106 int log martians;107 int autoconfig;108 int no pmtu disc;109 };110111 extern struct ipv4 config ipv4 config;112 extern struct ip mib ip statistics[NR CPUS* 2];113 #define IP INC STATS(field) SNMP INC STATS(ip statistics, field)114 #define IP INC STATSBH(field) SNMP INC STATSBH(ip statistics, field)115 #define IP INC STATSUSER(field) SNMP INC STATSUSER(ip statistics, field)116 extern struct linux mib net statistics[NR CPUS* 2];117 #define NET INC STATS(field) SNMP INC STATS(net statistics, field)118 #define NET INC STATSBH(field) SNMP INC STATSBH(net statistics, field)119 #define NET INC STATSUSER(field) SNMP INC STATSUSER(net statistics, field)120121 extern int sysctl local port range[2];122 extern int sysctl ip default ttl;123124 #ifdef CONFIG INET125 static inline int ip send(struct sk buff * skb)126 {127 if (skb->len > skb->dst->pmtu)128 return ip fragment(skb, ip finish output);129 else130 return ip finish output(skb);131 }132133 / * The function in 2.2 was invalid, producing wrong result for134 * check=0xFEFF. It was noticed by Arthur Skawina year ago. --ANK(000625) * /135 static inline136 int ip decrease ttl(struct iphdr * iph)137 {138 u32 check = iph->check;

    Program net/ip.h (continued)

    67

  • 139 check += constant htons(0x0100);140 iph->check = check + (check>=0xFFFF);141 return --iph->ttl;142 }143144 static inline145 int ip dont fragment(struct sock * sk, struct dst entry * dst)146 {147 return (sk->protinfo.af inet.pmtudisc == IP PMTUDISCDO ||148 (sk->protinfo.af inet.pmtudisc == IP PMTUDISCWANT &&149 !(dst->mxlock&(1id = 0;158 else159 ip select ident(iph, dst);160 }161162 / *163 * Map a multicast IP onto multicast MAC for type ethernet.164 * /165166 static inline void ip eth mc map(u32 addr, char * buf)167 {168 addr=ntohl(addr);169 buf[0]=0x01;170 buf[1]=0x00;171 buf[2]=0x5e;172 buf[5]=addr&0xFF;173 addr>>=8;174 buf[4]=addr&0xFF;175 addr>>=8;176 buf[3]=addr&0x7F;177 }178179 #endif180181 extern int ip call ra chain(struct sk buff * skb);182183 / *184 * Functions provided by ip fragment.o

    Program net/ip.h (continued)

    68

  • 185 * /186187 struct sk buff * ip defrag(struct sk buff * skb);188 extern int ip frag nqueues;189 extern atomic t ip frag mem;190191 / *192 * Functions provided by ip forward.c193 * /194195 extern int ip forward(struct sk buff * skb);196 extern int ip net unreachable(struct sk buff * skb);197198 / *199 * Functions provided by ip options.c200 * /201202 extern void ip options build(struct sk buff * skb, struct ip options * opt, u32 daddr,203 extern int ip options echo(struct ip options * dopt, struct sk buff * skb);204 extern void ip options fragment(struct sk buff * skb);205 extern int ip options compile(struct ip options * opt, struct sk buff * skb);206 extern int ip options get(struct ip options ** optp, unsigned char * data, int optlen,207 extern void ip options undo(struct ip options * opt);208 extern void ip forward options(struct sk buff * skb);209 extern int ip options rcv srr(struct sk buff * skb);210211 / *212 * Functions provided by ip sockglue.c213 * /214215 extern void ip cmsg recv(struct msghdr * msg, struct sk buff * skb);216 extern int ip cmsg send(struct msghdr * msg, struct ipcm cookie * ipc);217 extern int ip setsockopt(struct sock * sk, int level, int optname, char * optval, int218 extern int ip getsockopt(struct sock * sk, int level, int optname, char * optval, int219 extern int ip ra control(struct sock * sk, unsigned char on, void ( * destructor)(stru220221 extern int ip recv error(struct sock * sk, struct msghdr * msg, int len);222 extern void ip icmp error(struct sock * sk, struct sk buff * skb, int err,223 u16 port, u32 info, u8 * payload);224 extern void ip local error(struct sock * sk, int err, u32 daddr, u16 dport,225 u32 info);226227 #endif / * IP H * /

    Program net/ip.h (continued)

    69

1
2 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
3    under all circumstances. It is difficult to invent anything faster or
4    cheaper.
5  */
6
7 static int
8 noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
9 {
10         kfree_skb(skb);
11         return NET_XMIT_CN;
12 }
13
14 static struct sk_buff *
15 noop_dequeue(struct Qdisc *qdisc)
16 {
17         return NULL;
18 }
19
20 static int
21 noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
22 {
23         if (net_ratelimit())
24                 printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
25         kfree_skb(skb);
26         return NET_XMIT_CN;
27 }
28
29 struct Qdisc_ops noop_qdisc_ops =
30 {
31         NULL,
32         NULL,
33         "noop",
34         0,
35
36         noop_enqueue,
37         noop_dequeue,
38         noop_requeue,
39 };
40
41 struct Qdisc noop_qdisc =
42 {
43         noop_enqueue,
44         noop_dequeue,
45         TCQ_F_BUILTIN,
46         &noop_qdisc_ops,

    Program sch_generic.c

    70

47 };
48
49
50 struct Qdisc_ops noqueue_qdisc_ops =
51 {
52         NULL,
53         NULL,
54         "noqueue",
55         0,
56
57         noop_enqueue,
58         noop_dequeue,
59         noop_requeue,
60
61 };
62
63 struct Qdisc noqueue_qdisc =
64 {
65         NULL,
66         noop_dequeue,
67         TCQ_F_BUILTIN,
68         &noqueue_qdisc_ops,
69 };
70
71
72 static const u8 prio2band[TC_PRIO_MAX+1] =
73 { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
74
75 /* 3-band FIFO queue: old style, but should be a bit faster than
76    generic prio+fifo combination.
77  */
78
79 static int
80 pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
81 {
82         struct sk_buff_head *list;
83
84         list = ((struct sk_buff_head *)qdisc->data) +
85                 prio2band[skb->priority&TC_PRIO_MAX];
86
87         if (list->qlen <= skb->dev->tx_queue_len) {
88                 __skb_queue_tail(list, skb);
89                 qdisc->q.qlen++;
90                 return 0;
91         }
92         qdisc->stats.drops++;

    Program sch_generic.c (continued)

    71

93         kfree_skb(skb);
94         return NET_XMIT_DROP;
95 }
96
97 static struct sk_buff *
98 pfifo_fast_dequeue(struct Qdisc *qdisc)
99 {
100         int prio;
101         struct sk_buff_head *list = ((struct sk_buff_head *)qdisc->data);
102         struct sk_buff *skb;
103
104         for (prio = 0; prio < 3; prio++, list++) {
105                 skb = __skb_dequeue(list);
106                 if (skb) {
107                         qdisc->q.qlen--;
108                         return skb;
109                 }
110         }
111         return NULL;
112 }
113
114 static int
115 pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
116 {
117         struct sk_buff_head *list;
118
119         list = ((struct sk_buff_head *)qdisc->data) +
120                 prio2band[skb->priority&TC_PRIO_MAX];
121
122         __skb_queue_head(list, skb);
123         qdisc->q.qlen++;
124         return 0;
125 }
126
127 static void
128 pfifo_fast_reset(struct Qdisc *qdisc)
129 {
130         int prio;
131         struct sk_buff_head *list = ((struct sk_buff_head *)qdisc->data);
132
133         for (prio=0; prio < 3; prio++)
134                 skb_queue_purge(list+prio);
135         qdisc->q.qlen = 0;
136 }
137
138 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)

    Program sch_generic.c (continued)

    72

139 {
140         int i;
141         struct sk_buff_head *list;
142
143         list = ((struct sk_buff_head *)qdisc->data);
144
145         for (i=0; i<3; i++)

    [lines 146-169 were lost when this handout was rendered to text; they contain the
     rest of pfifo_fast_init(), the pfifo_fast_ops table, and the start of
     qdisc_create_dflt()]

170         int size = sizeof(*sch) + ops->priv_size;
171
172         sch = kmalloc(size, GFP_KERNEL);
173         if (!sch)
174                 return NULL;
175         memset(sch, 0, size);
176
177         skb_queue_head_init(&sch->q);
178         sch->ops = ops;
179         sch->enqueue = ops->enqueue;
180         sch->dequeue = ops->dequeue;
181         sch->dev = dev;
182         sch->stats.lock = &dev->queue_lock;
183         atomic_set(&sch->refcnt, 1);
184         if (!ops->init || ops->init(sch, NULL) == 0)

    Program sch_generic.c (continued)

    73

185                 return sch;
186
187         kfree(sch);
188         return NULL;
189 }
190
191 int qdisc_restart(struct net_device *dev)
192 {
193         struct Qdisc *q = dev->qdisc;
194         struct sk_buff *skb;
195
196         /* Dequeue packet */
197         if ((skb = q->dequeue(q)) != NULL) {
198                 if (spin_trylock(&dev->xmit_lock)) {
199                         /* Remember that the driver is grabbed by us. */
200                         dev->xmit_lock_owner = smp_processor_id();
201
202                         /* And release queue */
203                         spin_unlock(&dev->queue_lock);
204
205                         if (!netif_queue_stopped(dev)) {
206                                 if (netdev_nit)
207                                         dev_queue_xmit_nit(skb, dev);
208
209                                 if (dev->hard_start_xmit(skb, dev) == 0) {
210                                         dev->xmit_lock_owner = -1;
211                                         spin_unlock(&dev->xmit_lock);
212
213                                         spin_lock(&dev->queue_lock);
214                                         return -1;
215                                 }
216                         }
217
218                         /* Release the driver */
219                         dev->xmit_lock_owner = -1;
220                         spin_unlock(&dev->xmit_lock);
221                         spin_lock(&dev->queue_lock);
222                         q = dev->qdisc;
223                 } else {
224                         /* So, someone grabbed the driver. */
225
226                         /* It may be transient configuration error,
227                            when hard_start_xmit() recurses. We detect
228                            it by checking xmit owner and drop the
229                            packet when deadloop is detected.
230                          */

    Program sch_generic.c (continued)

    74

231                         if (dev->xmit_lock_owner == smp_processor_id()) {
232                                 kfree_skb(skb);
233                                 if (net_ratelimit())
234                                         printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
235                                 return -1;
236                         }
237                         netdev_rx_stat[smp_processor_id()].cpu_collision++;
238                 }
239
240                 /* Device kicked us out :(
241                    This is possible in three cases:
242
243                    0. driver is locked
244                    1. fastroute is enabled
245                    2. device cannot determine busy state
246                       before start of transmission (f.e. dialout)
247                    3. device is buggy (ppp)
248                  */
249
250                 q->ops->requeue(skb, q);
251                 netif_schedule(dev);
252                 return 1;
253         }
254         return q->q.qlen;
255 }

    Program sch_generic.c (continued)

    In the generic scheduler, pay particular attention to the sequence of code at lines 72 - 92. To finish up the packet output sequence, after enqueuing the packet we end up calling pkt_sched.h:qdisc_run() and eventually sch_generic.c:qdisc_restart(). Here the highest-priority packet is dequeued and the device driver's dev->hard_start_xmit() is called to transmit it. This calls the transmit function in eepro100.c, and the packet is sent on the network. If the device driver is busy, the packet is re-queued and the transmission is attempted later.
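    To see the shape of that enqueue / dequeue / requeue interplay outside the kernel, the short user-space sketch below models the same pattern in plain C. It is an illustration only, not kernel code: struct pkt, the band arrays, driver_xmit() and restart() are hypothetical stand-ins for struct sk_buff, the three pfifo_fast bands, dev->hard_start_xmit() and qdisc_restart(), and none of the kernel's locking or softirq scheduling is modelled.

/* sketch.c -- simplified, user-space model of a 3-band queue with a
 * qdisc_restart()-style retry loop.  Build with: cc -o sketch sketch.c */
#include <stdio.h>
#include <stdlib.h>

#define NBANDS 3                        /* pfifo_fast also uses three bands */

struct pkt {                            /* stand-in for struct sk_buff */
        int id;
        int band;
        struct pkt *next;
};

static struct pkt *head[NBANDS], *tail[NBANDS];

/* Enqueue at the tail of the band chosen from the priority,
 * like pfifo_fast_enqueue() indexing prio2band[]. */
static void enqueue(struct pkt *p, int prio)
{
        p->band = prio % NBANDS;
        p->next = NULL;
        if (tail[p->band])
                tail[p->band]->next = p;
        else
                head[p->band] = p;
        tail[p->band] = p;
}

/* Take from the first non-empty band, like pfifo_fast_dequeue(). */
static struct pkt *dequeue(void)
{
        for (int b = 0; b < NBANDS; b++) {
                if (head[b]) {
                        struct pkt *p = head[b];
                        head[b] = p->next;
                        if (!head[b])
                                tail[b] = NULL;
                        return p;
                }
        }
        return NULL;
}

/* Put a packet back at the head of its band, like pfifo_fast_requeue(). */
static void requeue(struct pkt *p)
{
        p->next = head[p->band];
        head[p->band] = p;
        if (!tail[p->band])
                tail[p->band] = p;
}

/* Stand-in for dev->hard_start_xmit(): report "busy" exactly once. */
static int driver_xmit(struct pkt *p)
{
        static int busy_once = 1;

        if (busy_once) {
                busy_once = 0;
                return 1;               /* non-zero: driver refused the packet */
        }
        printf("transmitted packet %d from band %d\n", p->id, p->band);
        return 0;
}

/* The qdisc_restart()-like loop: dequeue, try to send, requeue on failure. */
static void restart(void)
{
        struct pkt *p;

        while ((p = dequeue()) != NULL) {
                if (driver_xmit(p)) {
                        requeue(p);     /* kernel would also call netif_schedule() */
                        printf("driver busy, packet %d requeued\n", p->id);
                        return;
                }
                free(p);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct pkt *p = malloc(sizeof(*p));
                p->id = i;
                enqueue(p, i);          /* one packet per band */
        }
        restart();                      /* first pass hits the "busy" driver */
        restart();                      /* second pass drains all bands */
        return 0;
}

    Running the sketch, the first restart() pass finds the "driver" busy and puts the packet back at the head of its band; the second pass drains the bands in priority order, which is essentially what qdisc_restart() and netif_schedule() arrange in the kernel.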

    75