diff --git a/include/net/route.h b/include/net/route.h
index 486e37a..acc84e8 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -72,6 +72,8 @@ struct rtable
 	/* Cache lookup keys */
 	struct flowi		fl;
 
+	struct leaf		*parent;
+
 	/* Miscellaneous cached information */
 	__be32			rt_spec_dst; /* RFC1122 specific destination */
 	struct inet_peer	*peer;	/* long-living peer info */
@@ -109,15 +111,14 @@ extern struct ip_rt_acct *ip_rt_acct;
 
 struct in_device;
 extern int		ip_rt_init(void);
-extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
-				       __be32 src, struct net_device *dev);
+extern void		ip_rt_redirect(struct iphdr *iph, __be16 sprt, __be16 dprt,
+				       __be32 old_gw, __be32 new_gw,
+				       struct net_device *dev);
 extern void		ip_rt_advice(struct rtable **rp, int advice);
 extern void		rt_cache_flush(int how);
 extern int		__ip_route_output_key(struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
 extern int		ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
-extern unsigned short	ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu);
+extern unsigned short	ip_rt_frag_needed(struct iphdr *iph, __be16 sprt, __be16 dprt,
+					  unsigned short new_mtu);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 extern unsigned		inet_addr_type(__be32 addr);
@@ -129,6 +130,34 @@ extern int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb);
 struct in_ifaddr;
 extern void fib_add_ifaddr(struct in_ifaddr *);
 
+
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
+		(rth->u.dst.expires &&
+		 time_before_eq(jiffies, rth->u.dst.expires));
+}
+
+/* Bits of score are:
+ * 31: very valuable
+ * 30: not quite useless
+ * 29..0: usage counter
+ */
+static inline u32 rt_score(struct rtable *rt)
+{
+	u32 score = jiffies - rt->u.dst.lastuse;
+
+	score = ~score & ~(3<<30);
+
+	if (rt_valuable(rt))
+		score |= (1<<31);
+
+	if (!rt->fl.iif ||
+	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
+		score |= (1<<30);
+
+	return score;
+}
+
 static inline void ip_rt_put(struct rtable * rt)
 {
 	if (rt)
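Note (not part of the patch): rt_valuable() and rt_score() move into route.h, with the route.c copies removed further down, so the unicache garbage collector can reuse the scoring; rt_valuable() also gains a time_before_eq() check so an already-expired entry no longer counts as valuable. A standalone sketch of how the two high bits order eviction candidates, with hypothetical ages — rt_intern_flow() below evicts the entry with the lowest score:

```c
/* Standalone illustration of the rt_score() bit layout above.
 * "age" stands in for jiffies - lastuse; values are hypothetical. */
#include <stdio.h>
#include <stdint.h>

static uint32_t score(uint32_t age, int valuable, int output_or_unicast)
{
	uint32_t s = ~age & ~(3u << 30);	/* younger entries score higher */

	if (valuable)
		s |= 1u << 31;			/* redirected/notify/expiring: keep */
	if (output_or_unicast)
		s |= 1u << 30;			/* not broadcast/multicast/local input */
	return s;
}

int main(void)
{
	/* An old broadcast input route scores below a fresh unicast
	 * output route, so it is evicted first. */
	printf("%08x < %08x\n", score(5000, 0, 0), score(10, 0, 1));
	return 0;
}
```

The ~(3<<30) mask keeps the age counter out of the two flag bits, so "very valuable" always outranks "not quite useless", which always outranks plain recency.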
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b3b9b60..fbb3b5b 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -23,6 +23,7 @@
 #define NETLINK_GENERIC		16
 /* leave room for NETLINK_DM (DM Events) */
 #define NETLINK_SCSITRANSPORT	18	/* SCSI Transports */
+#define NETLINK_UNICACHE	19
 
 #define MAX_LINKS 32
 
diff --git a/include/linux/in_route.h b/include/linux/in_route.h
index 61f25c3..ba41816 100644
--- a/include/linux/in_route.h
+++ b/include/linux/in_route.h
@@ -9,6 +9,11 @@
 /* Obsolete flag. About to be deleted */
 #define RTCF_NOPMTUDISC RTM_F_NOPMTUDISC
 
+#define RTCF_TCP_SPARE	0x00001000
+#define RTCF_TCP_EST	0x00002000
+#define RTCF_TCP_SYN	0x00004000
+#define RTCF_TCP_FIN	0x00008000
+
 #define RTCF_NOTIFY	0x00010000
 #define RTCF_DIRECTDST	0x00020000
 #define RTCF_REDIRECTED	0x00040000
diff --git a/net/core/Makefile b/net/core/Makefile
index 73272d5..df411f1 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
-	 gen_stats.o gen_estimator.o
+	 gen_stats.o gen_estimator.o trie_core.o unicache.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8640096..e395284 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -115,6 +115,8 @@
 #ifdef CONFIG_IP_MROUTE
 #include
 #endif
+#include
+#include
 
 DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
 
@@ -1345,6 +1347,8 @@ static int __init inet_init(void)
 
 	dev_add_pack(&ip_packet_type);
 
+	unicache_init();
+
 	rc = 0;
 out:
 	return rc;
@@ -1374,8 +1378,14 @@ static int __init ipv4_proc_init(void)
 		goto out_fib;
 	if (ip_misc_proc_init())
 		goto out_misc;
+	if (unicache_proc_init())
+		goto out_unicache;
+
 out:
 	return rc;
+
+out_unicache:
+	unicache_proc_exit();
 out_misc:
 	fib_proc_exit();
 out_fib:
@@ -1415,3 +1425,6 @@ EXPORT_SYMBOL(inet_stream_ops);
 EXPORT_SYMBOL(inet_unregister_protosw);
 EXPORT_SYMBOL(net_statistics);
 EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
+
+
+
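Note: route.c below leans on a trie/unicache API implemented by the new net/core/trie_core.c and net/core/unicache.c, neither of which appears in this diff (the #include lines above also lost their header names in transit and are left bare). The sketch below reconstructs that interface purely from the call sites; every prototype here is an assumption, not the actual header:

```c
/* Assumed interface, reconstructed from the call sites in route.c and
 * icmp.c.  The real declarations live in the unicache header this patch
 * includes, which is not shown in the diff. */
struct trie;				/* full-key trie ("trash") */
struct leaf { void *obj; };		/* head of an rt_next chain of rtables */
struct trie_ops;			/* per-user callbacks, e.g. ->dump() */

extern struct trie *t_unicache;
extern spinlock_t trie_write_lock;	/* writers; readers rely on RCU */

struct leaf *unicache_insert_key(const struct flowi *flp, int *err);
struct rtable *unicache_lookup(struct trie *t, u32 *key);
struct rtable *rt_next(struct trie *t, struct rtable *rth); /* full walk */

void rt_del(struct trie *t, struct rtable *rt, struct trie_ops *ops);
void __rt_del(struct trie *t, struct rtable *rt, struct trie_ops *ops);
void trie_remove(struct trie *t, struct leaf *l, struct trie_ops *ops);
void trie_flush(struct trie *t, struct trie_ops *ops);

void unicache_garbage_collect(struct trie *t);
unsigned unicache_garbage_collect_active(struct trie *t, struct sk_buff *skb);
int unicache_tcp_establish(struct sk_buff *skb);

void fill_key_ipv4_trash(u32 *key, __be32 saddr, __be32 daddr,
			 u32 prts, u32 proto);
void flow_to_key(const struct flowi *flp, u32 *key);
```

In particular, route.c treats leaf->obj as the head of an rt_next-chained list of struct rtable, and the new rt->parent back-pointer lets rt_free() drop the leaf once its chain empties.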
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2daa0dc..369d153 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -57,6 +57,8 @@
  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  *	Ilia Sotnikov		:	Removed TOS from hash calculations
+ *	Robert Olsson		:	Major rework for full key lookup based trie/trash
+ *				:	datastructure with active GC and flow accounting etc.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -105,10 +107,16 @@
 #include
 #include
 #include
+#include
+#include
 #ifdef CONFIG_SYSCTL
 #include
 #endif
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+#error
+#endif
+
 #define RT_FL_TOS(oldflp) \
     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -127,11 +135,11 @@ static int ip_rt_redirect_load = HZ / 50;
 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
 static int ip_rt_error_cost = HZ;
 static int ip_rt_error_burst = 5 * HZ;
-static int ip_rt_gc_elasticity = 8;
+static int ip_rt_gc_elasticity = 3;
 static int ip_rt_mtu_expires = 10 * 60 * HZ;
 static int ip_rt_min_pmtu = 512 + 20 + 20;
 static int ip_rt_min_advmss = 256;
-static int ip_rt_secret_interval = 10 * 60 * HZ;
+static int ip_rt_secret_interval = 24 * 10 * 60 * HZ;
 static unsigned long rt_deadline;
 
 #define RTprint(a...)	printk(KERN_DEBUG a)
@@ -167,6 +175,11 @@ static struct dst_ops ipv4_dst_ops = {
 	.entry_size =		sizeof(struct rtable),
 };
 
+extern struct trie *t_unicache;
+extern spinlock_t trie_write_lock;
+extern struct trie_ops unicache_ops;
+extern unsigned long unicache_flow_cnt;
+
 #define ECN_OR_COST(class)	TC_PRIO_##class
 
 __u8 ip_tos2prio[16] = {
@@ -246,20 +259,17 @@ static spinlock_t	*rt_hash_locks;
 static struct rt_hash_bucket	*rt_hash_table;
 static unsigned			rt_hash_mask;
 static int			rt_hash_log;
-static unsigned int		rt_hash_rnd;
+unsigned int			rt_hash_rnd;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) \
 	(__raw_get_cpu_var(rt_cache_stat).field++)
 
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
-				struct rtable **res);
+#define RT_CACHE_STAT_ADD(field, cnt) \
+	((__raw_get_cpu_var(rt_cache_stat).field) += cnt)
 
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
-{
-	return (jhash_2words(daddr, saddr, rt_hash_rnd)
-		& rt_hash_mask);
-}
+static int rt_intern_flow(const struct flowi *fl, struct rtable *rth,
+				struct rtable **res);
 
 #define rt_hash(daddr, saddr, idx) \
 	rt_hash_code((__force u32)(__be32)(daddr),\
@@ -268,36 +278,31 @@ static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	int bucket;
+	struct rtable *last;
 };
 
 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 {
-	struct rtable *r = NULL;
 	struct rt_cache_iter_state *st = seq->private;
 
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
-		if (r)
-			break;
+	rcu_read_lock_bh();
+
+	st->last = rt_next(t_unicache, NULL);
+
+	if (!st->last)
 		rcu_read_unlock_bh();
-	}
-	return r;
+
+	return st->last;
 }
 
 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 {
 	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
 
-	r = r->u.rt_next;
-	while (!r) {
+	st->last = rt_next(t_unicache, st->last);
+	if (!st->last)
 		rcu_read_unlock_bh();
-		if (--st->bucket < 0)
-			break;
-		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
-	}
-	return r;
+
+	return st->last;
 }
 
 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
@@ -494,12 +499,33 @@ static struct file_operations rt_cpu_seq_fops = {
 
 #endif /* CONFIG_PROC_FS */
 
-static __inline__ void rt_free(struct rtable *rt)
+void __rt_free(struct rtable *rt)
 {
 	multipath_remove(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
+void rt_free(struct rtable *rt)
+{
+	struct leaf *l;
+
+	l = rt->parent;
+	BUG_ON(!l);
+
+	unicache_ops.dump(rt);
+	unicache_flow_cnt++;
+
+	__rt_free(rt);
+
+	/*
+	 * All leaf rt-entries should be removed
+	 * before leaf is removed
+	 */
+
+	if (l->obj == NULL)
+		trie_remove(t_unicache, l, &unicache_ops);
+}
+
 static __inline__ void rt_drop(struct rtable *rt)
 {
 	multipath_remove(rt);
@@ -512,13 +538,7 @@ static __inline__ int rt_fast_clean(struct rtable *rth)
 	/* Kill broadcast/multicast entries very aggresively, if they
 	   collide in hash table with more useful entries */
 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rth->fl.iif && rth->u.rt_next;
-}
-
-static __inline__ int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		rth->u.dst.expires;
+		rth->fl.iif; /* && rth->u.rt_next */
 }
 
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -543,27 +563,6 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
 out:	return ret;
 }
 
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->u.dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (!rt->fl.iif ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -624,65 +623,69 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
 
 /* This runs via a timer and thus is always in BH context. */
 static void rt_check_expire(unsigned long dummy)
 {
-	static unsigned int rover;
-	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct trie *t = t_unicache;
+	struct rtable *rth, *rth_next = NULL;
 	unsigned long now = jiffies;
+	unsigned long tmo = ip_rt_gc_timeout;
+	unsigned goal;
 	u64 mult;
-
-	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+	int i = 0;
+
+	mult = ((u64)ip_rt_gc_interval) * t->size;
 	if (ip_rt_gc_timeout > 1)
 		do_div(mult, ip_rt_gc_timeout);
 	goal = (unsigned int)mult;
-	if (goal > rt_hash_mask)
-		goal = rt_hash_mask + 1;
-	for (; goal > 0; goal--) {
-		unsigned long tmo = ip_rt_gc_timeout;
-
-		i = (i + 1) & rt_hash_mask;
-		rthp = &rt_hash_table[i].chain;
+	if (goal > t->size)
+		goal = t->size/4;
 
-		if (*rthp == 0)
+	spin_lock(&trie_write_lock);
+
+	rth = t->token;
+
+/*
+	printk("rt_check_expire goal=%d rth=%p", goal, rth);
+*/
+	for (; goal > 0; goal--, rth = rth_next) {
+
+		rth_next = rt_next(t, rth);
+
+		if (!rth)
 			continue;
-		spin_lock(rt_hash_lock_addr(i));
-		while ((rth = *rthp) != NULL) {
-			if (rth->u.dst.expires) {
-				/* Entry is expired even if it is in use */
-				if (time_before_eq(now, rth->u.dst.expires)) {
-					tmo >>= 1;
-					rthp = &rth->u.rt_next;
-					continue;
-				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.rt_next;
+
+		if (rth->u.dst.expires) {
+			/* Entry is expired even if it is in use */
+			if (time_before_eq(now, rth->u.dst.expires))
 				continue;
-			}
+		} else if (!rt_may_expire(rth, tmo/2, tmo))
+			continue;
 
-			/* Cleanup aged off entries. */
+		/* Cleanup aged off entries. */
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-			/* remove all related balanced entries if necessary */
-			if (rth->u.dst.flags & DST_BALANCED) {
-				rthp = rt_remove_balanced_route(
-					&rt_hash_table[i].chain,
-					rth, NULL);
-				if (!rthp)
-					break;
-			} else {
-				*rthp = rth->u.rt_next;
-				rt_free(rth);
-			}
+		/* remove all related balanced entries if necessary */
+		if (rth->u.dst.flags & DST_BALANCED) {
+			rthp = rt_remove_balanced_route(
+				&rt_hash_table[i].chain,
+				rth, NULL);
+			if (!rthp)
+				break;
+		} else
+			__rt_del(t, rth, &unicache_ops);
 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
-			*rthp = rth->u.rt_next;
-			rt_free(rth);
+		__rt_del(t, rth, &unicache_ops);
+		i++;
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
-		}
-		spin_unlock(rt_hash_lock_addr(i));
-
+
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
 			break;
 	}
-	rover = i;
+	t->token = rth_next;
+	spin_unlock(&trie_write_lock);
+
+/*
+	printk(" expired=%d\n", i);
+*/
 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 }
 
@@ -691,25 +694,8 @@ static void rt_check_expire(unsigned long dummy)
  */
 static void rt_run_flush(unsigned long dummy)
 {
-	int i;
-	struct rtable *rth, *next;
-
-	rt_deadline = 0;
-
-	get_random_bytes(&rt_hash_rnd, 4);
-
-	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(rt_hash_lock_addr(i));
-		rth = rt_hash_table[i].chain;
-		if (rth)
-			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; rth; rth = next) {
-			next = rth->u.rt_next;
-			rt_free(rth);
-		}
-	}
+	if (t_unicache)
+		trie_flush(t_unicache, &unicache_ops);
 }
 
 static DEFINE_SPINLOCK(rt_flush_lock);
@@ -794,13 +780,21 @@ static int rt_garbage_collect(void)
 	 */
 
 	RT_CACHE_STAT_INC(gc_total);
 
 	if (now - last_gc < ip_rt_gc_min_interval &&
 	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
+/*		RT_CACHE_STAT_INC(gc_ignored); Reused now */
 		goto out;
 	}
 
+	last_gc = now;
+
+	spin_lock(&trie_write_lock);
+	unicache_garbage_collect(t_unicache);
+	spin_unlock(&trie_write_lock);
+
+	return 0;
+
 	/* Calculate number of entries, which we want to expire now. */
 	goal = atomic_read(&ipv4_dst_ops.entries) -
 		(ip_rt_gc_elasticity << rt_hash_log);
@@ -886,7 +880,7 @@ static int rt_garbage_collect(void)
 		   We will not spin here for long time in any case.
 		 */
-		RT_CACHE_STAT_INC(gc_goal_miss);
+/*		RT_CACHE_STAT_INC(gc_goal_miss); reused */
 
 		if (expire == 0)
 			break;
@@ -920,13 +914,32 @@ work_done:
 out:	return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+void print_flow(struct flowi *fl)
+{
+	printk("daddr=%08x saddr=%08x tos=%x scope=%x ",
+	       fl->fl4_dst,
+	       fl->fl4_src,
+	       fl->fl4_tos,
+	       fl->fl4_scope);
+
+	printk("mark=%x ", fl->mark);
+
+	printk("iif=%d, proto=%d, sport=%d, dport=%d\n",
+	       fl->iif,
+	       fl->proto,
+	       fl->fl_ip_sport,
+	       fl->fl_ip_dport);
+}
+
+static int rt_intern_flow(const struct flowi *flp, struct rtable *rt,
+			  struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
+	struct leaf *l;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32		min_score;
 	int		chain_length;
+	int err;
 	int attempts = !in_softirq();
 
 restart:
@@ -936,9 +949,21 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
-	rthp = &rt_hash_table[hash].chain;
+	spin_lock_bh(&trie_write_lock);
+
+	l = unicache_insert_key(flp, &err);
+
+	if (!l) {
+		rt_drop(rt);
+		spin_unlock_bh(&trie_write_lock);
+		if (net_ratelimit())
+			printk(KERN_WARNING "unicache overflow\n");
+		RT_CACHE_STAT_INC(gc_dst_overflow);
+		return -ENOBUFS;
+	}
+
+	rthp = (struct rtable **) &l->obj;
 
-	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 		if (!(rth->u.dst.flags & DST_BALANCED) &&
 		    compare_keys(&rth->fl, &rt->fl)) {
 #else
 		if (compare_keys(&rth->fl, &rt->fl)) {
 #endif
+#if 0
 			/* Put it first */
 			*rthp = rth->u.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->u.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
+			rt->u.rt_next = (struct rtable *) l->obj;
+			l->obj = (void *) rt;
+#endif
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(rt_hash_lock_addr(hash));
+			spin_unlock_bh(&trie_write_lock);
 
 			rt_drop(rt);
 			*rp = rth;
@@ -980,9 +996,7 @@ restart:
 				min_score = score;
 			}
 		}
-
 		chain_length++;
-
 		rthp = &rth->u.rt_next;
 	}
 
@@ -1002,16 +1016,17 @@ restart:
 	/* Try to bind route to arp only if it is output
 	   route or unicast forwarding path.
 	 */
+
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			spin_unlock_bh(&trie_write_lock);
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
 				return err;
 			}
-
 			/* Neighbour tables are full and nothing
 			   can be released. Try to shrink route cache,
 			   it is most likely it holds some neighbour records.
@@ -1030,24 +1045,25 @@ restart:
 			if (net_ratelimit())
 				printk(KERN_WARNING "Neighbour table overflow.\n");
 			rt_drop(rt);
+			spin_unlock_bh(&trie_write_lock);
 			return -ENOBUFS;
 		}
 	}
-
-	rt->u.rt_next = rt_hash_table[hash].chain;
+	rt->u.rt_next = (struct rtable *) l->obj;
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.rt_next) {
 		struct rtable *trt;
-		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
+		printk(KERN_DEBUG "rt_cache: %u.%u.%u.%u",
 		       NIPQUAD(rt->rt_dst));
 		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
 			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
 		printk("\n");
 	}
 #endif
-	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(rt_hash_lock_addr(hash));
+	rt->parent = l;
+	rcu_assign_pointer(l->obj, rt);
 	*rp = rt;
+	spin_unlock_bh(&trie_write_lock);
 	return 0;
 }
@@ -1110,31 +1126,20 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 	ip_select_fb_ident(iph);
 }
 
-static void rt_del(unsigned hash, struct rtable *rt)
-{
-	struct rtable **rthp;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	for (rthp = &rt_hash_table[hash].chain; *rthp;
-	     rthp = &(*rthp)->u.rt_next)
-		if (*rthp == rt) {
-			*rthp = rt->u.rt_next;
-			rt_free(rt);
-			break;
-		}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
-void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
-		    __be32 saddr, struct net_device *dev)
+void ip_rt_redirect(struct iphdr *iph, __be16 sprt, __be16 dprt,
+		    __be32 old_gw, __be32 new_gw, struct net_device *dev)
 {
 	int i, k;
 	struct in_device *in_dev = in_dev_get(dev);
 	struct rtable *rth, **rthp;
+	__be32  saddr = iph->saddr;
+	__be32  daddr = iph->daddr;
 	__be32  skeys[2] = { saddr, 0 };
 	int  ikeys[2] = { dev->ifindex, 0 };
 	struct netevent_redirect netevent;
+	u32 key[LPK];
+	u32 prts;
+	u8 proto = iph->protocol;
 
 	if (!in_dev)
 		return;
@@ -1153,13 +1158,24 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			goto reject_redirect;
 	}
 
+	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+		dprt = 0;
+		sprt = 0;
+	}
+
+	prts = (dprt << 16) + sprt;
+
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
-
-			rthp=&rt_hash_table[hash].chain;
+
+			fill_key_ipv4_trash(key, skeys[i], daddr,
+					    prts, (proto<<16));
 
 			rcu_read_lock();
+
+			rth = unicache_lookup(t_unicache, key);
+			rthp = &rth;
+
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
@@ -1231,9 +1247,11 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				call_netevent_notifiers(NETEVENT_REDIRECT,
 							&netevent);
 
-				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt))
+				rcu_read_unlock();
+				rt_del(t_unicache, rth, &unicache_ops);
+				if (!rt_intern_flow(&rt->fl, rt, &rt))
 					ip_rt_put(rt);
+				rcu_read_lock();
 				goto do_next;
 			}
 			rcu_read_unlock();
@@ -1267,14 +1285,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->u.dst.expires) {
-			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-						rt->fl.oif);
+			rt_del(t_unicache, rt, &unicache_ops);
 #if RT_CACHE_DEBUG >= 1
 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
-					  "%u.%u.%u.%u/%02x dropped\n",
-				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+			       "%u.%u.%u.%u/%02x dropped\n",
+			       NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
 #endif
-			rt_del(hash, rt);
 			ret = NULL;
 		}
 	}
@@ -1399,7 +1415,7 @@ static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 		return 68;
 }
 
-unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+unsigned short ip_rt_frag_needed(struct iphdr *iph, __be16 sprt, __be16 dprt,
+				 unsigned short new_mtu)
 {
 	int i;
 	unsigned short old_mtu = ntohs(iph->tot_len);
@@ -1407,15 +1423,27 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 	__be32 skeys[2] = { iph->saddr, 0, };
 	__be32 daddr = iph->daddr;
 	unsigned short est_mtu = 0;
+	u32 key[LPK];
+	u32 prts;
+	u8 proto = iph->protocol;
 
 	if (ipv4_config.no_pmtu_disc)
 		return 0;
 
+	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
+		dprt = 0;
+		sprt = 0;
+	}
+
+	prts = (dprt << 16) + sprt;
+
 	for (i = 0; i < 2; i++) {
-		unsigned hash = rt_hash(daddr, skeys[i], 0);
+
+		fill_key_ipv4_trash(key, skeys[i], daddr,
+				    prts, (proto<<16));
 
 		rcu_read_lock();
-		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+
+		for (rth = unicache_lookup(t_unicache, key); rth;
 		     rth = rcu_dereference(rth->u.rt_next)) {
 			if (rth->fl.fl4_dst == daddr &&
 			    rth->fl.fl4_src == skeys[i] &&
@@ -1603,15 +1631,18 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 	rt->rt_type = res->type;
 }
 
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-				u8 tos, struct net_device *dev, int our)
+static int ip_route_input_mc(struct sk_buff *skb, struct flowi *fl,
+			     struct net_device *dev, int our)
 {
-	unsigned hash;
 	struct rtable *rth;
 	__be32 spec_dst;
 	struct in_device *in_dev = in_dev_get(dev);
 	u32 itag = 0;
-
+
+	__be32 saddr = fl->fl4_src;
+	__be32 daddr = fl->fl4_dst;
+	u8 tos = fl->fl4_tos;
+
 	/* Primary sanity checks. */
 
 	if (in_dev == NULL)
@@ -1670,8 +1701,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	RT_CACHE_STAT_INC(in_slow_mc);
 
 	in_dev_put(in_dev);
-	hash = rt_hash(daddr, saddr, dev->ifindex);
-	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+	return rt_intern_flow(fl, rth, (struct rtable**) &skb->dst);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -1817,11 +1847,11 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
 				       struct fib_result* res,
 				       const struct flowi *fl,
 				       struct in_device *in_dev,
-				       __be32 daddr, __be32 saddr, u32 tos)
+				       __be32 daddr, __be32 saddr,
+				       u32 tos)
 {
 	struct rtable* rth = NULL;
 	int err;
-	unsigned hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
@@ -1833,16 +1863,21 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
 	if (err)
 		return err;
 
+	rth->fl.fl_ip_sport = fl->fl_ip_sport;
+	rth->fl.fl_ip_dport = fl->fl_ip_dport;
+	rth->fl.proto = fl->proto;
+
 	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl->iif);
-	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+
+	return rt_intern_flow(fl, rth, (struct rtable**)&skb->dst);
 }
 
 static inline int ip_mkroute_input(struct sk_buff *skb,
 				   struct fib_result* res,
 				   const struct flowi *fl,
 				   struct in_device *in_dev,
-				   __be32 daddr, __be32 saddr, u32 tos)
+				   __be32 daddr, __be32 saddr,
+				   u32 tos)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
 	struct rtable* rth = NULL, *rtres;
@@ -1894,6 +1929,58 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
 }
 
+void get_flow(struct flowi *fl, struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+	u8 proto = iph->protocol;
+	u16 dprt, sprt;
+
+	fl->fl4_dst = iph->daddr;
+	fl->fl4_src = iph->saddr;
+	fl->proto = proto;
+	fl->fl4_tos = iph->tos;
+	fl->mark = skb->mark;
+
+	if (skb->protocol == htons(ETH_P_ARP)) {
+
+		/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+		if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
+					 (4 * sizeof(u32)))))
+			goto discard_it;
+	}
+
+	if (proto == IPPROTO_TCP) {
+		struct tcphdr *th;
+
+		if (!pskb_may_pull(skb, iph->ihl*4))
+			goto discard_it;
+
+		th = (struct tcphdr *)(skb->data + iph->ihl*4);
+		dprt = th->dest;
+		sprt = th->source;
+	}
+	else if (proto == IPPROTO_UDP) {
+		struct udphdr *uh;
+
+		if (!pskb_may_pull(skb, iph->ihl*4))
+			goto discard_it;
+
+		uh = (struct udphdr *) (skb->data + iph->ihl*4);
+		dprt = uh->dest;
+		sprt = uh->source;
+	}
+	else {
+		/* Not valid */
+
+		dprt = 0;
+		sprt = 0;
+	}
+
+	fl->fl_ip_sport = sprt;
+	fl->fl_ip_dport = dprt;
+
+discard_it:;
+}
+
 
 /*
  * NOTE. We drop all the packets that has local source
@@ -1904,25 +1991,17 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
 *	1. Not simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% of guarantee.
 */
-
-static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			       u8 tos, struct net_device *dev)
+static int ip_route_input_slow(struct sk_buff *skb, struct flowi *fl,
+			       struct net_device *dev)
 {
 	struct fib_result res;
 	struct in_device *in_dev = in_dev_get(dev);
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = daddr,
-					.saddr = saddr,
-					.tos = tos,
-					.scope = RT_SCOPE_UNIVERSE,
-				      } },
-			    .mark = skb->mark,
-			    .iif = dev->ifindex };
 	unsigned	flags = 0;
 	u32		itag = 0;
 	struct rtable * rth;
-	unsigned	hash;
 	__be32		spec_dst;
+	__be32 daddr = fl->fl4_dst;
+	__be32 saddr = fl->fl4_src;
+	u8 tos = fl->fl4_tos;
 	int		err = -EINVAL;
 	int		free_res = 0;
@@ -1934,7 +2013,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	/* Check for the most weird martians, which can be not detected
 	   by fib_lookup.
 	 */
-
+
 	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
 		goto martian_source;
@@ -1953,7 +2032,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	/*
 	 *	Now we are ready to route packet.
 	 */
-	if ((err = fib_lookup(&fl, &res)) != 0) {
+	if ((err = fib_lookup(fl, &res)) != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
 			goto e_hostunreach;
 		goto no_route;
@@ -1983,12 +2062,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type != RTN_UNICAST)
 		goto martian_destination;
 
-	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+	err = ip_mkroute_input(skb, &res, fl, in_dev, daddr, saddr, tos);
+
 	if (err == -ENOBUFS)
 		goto e_nobufs;
 	if (err == -EINVAL)
 		goto e_inval;
-
+
 done:
 	in_dev_put(in_dev);
 	if (free_res)
@@ -2009,6 +2089,7 @@ brd_input:
 		if (err)
 			flags |= RTCF_DIRECTSRC;
 	}
+
 	flags |= RTCF_BROADCAST;
 	res.type = RTN_BROADCAST;
 	RT_CACHE_STAT_INC(in_brd);
@@ -2035,6 +2116,11 @@ local_input:
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
+
+	rth->fl.fl_ip_sport = fl->fl_ip_sport;
+	rth->fl.fl_ip_dport = fl->fl_ip_dport;
+	rth->fl.proto = fl->proto;
+
 	rth->u.dst.dev	= &loopback_dev;
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
@@ -2048,8 +2134,9 @@ local_input:
 		rth->rt_flags	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
-	hash = rt_hash(daddr, saddr, fl.iif);
-	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+
+	err = rt_intern_flow(fl, rth, (struct rtable**)&skb->dst);
+	skb->dst = &rth->u.dst;
 	goto done;
 
 no_route:
@@ -2091,27 +2178,53 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		   u8 tos, struct net_device *dev)
 {
 	struct rtable * rth;
-	unsigned	hash;
 	int iif = dev->ifindex;
-
+	u32 key[LPK];
+	int err;
+	struct flowi *fl;
+
 	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif);
+
+	fl = kmalloc(sizeof(struct flowi), GFP_ATOMIC);
+	if (!fl)
+		return -ENOBUFS;
+
+	memset(fl, 0, sizeof(struct flowi));
+	get_flow(fl, skb);
+
+	fl->iif = iif;
+	fl->fl4_dst = daddr;
+	fl->fl4_src = saddr;
+	fl->fl4_tos = tos;
+	fl->fl4_scope = RT_SCOPE_UNIVERSE;
+
+	flow_to_key(fl, key);
 
 	rcu_read_lock();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.rt_next)) {
-		if (rth->fl.fl4_dst == daddr &&
-		    rth->fl.fl4_src == saddr &&
-		    rth->fl.iif == iif &&
-		    rth->fl.oif == 0 &&
-		    rth->fl.mark == skb->mark &&
-		    rth->fl.fl4_tos == tos) {
+
+	rth = unicache_lookup(t_unicache, key);
+
+	for (; rth; rth = rcu_dereference(rth->u.rt_next)) {
+		if (compare_keys(&rth->fl, fl)) {
+			unsigned agc;
+
 			rth->u.dst.lastuse = jiffies;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
 			skb->dst = (struct dst_entry*)rth;
+
+			/*
+			 * Look for state changes and flow termination
+			 */
+
+			if (unicache_tcp_establish(skb))
+				;	/* printk("TCP est\n"); */
+
+			agc = unicache_garbage_collect_active(t_unicache, skb);
+
+			RT_CACHE_STAT_ADD(gc_ignored, agc); /* Reuse counter */
+
+			kfree(fl);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2142,14 +2255,17 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 		    ) {
 			rcu_read_unlock();
-			return ip_route_input_mc(skb, daddr, saddr,
-						 tos, dev, our);
+			err = ip_route_input_mc(skb, fl, dev, our);
+			kfree(fl);
+			return err;
 		}
 	}
 	rcu_read_unlock();
 	return -EINVAL;
 }
-	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+	err = ip_route_input_slow(skb, fl, dev);
+	kfree(fl);
+	return err;
 }
 
 static inline int __mkroute_output(struct rtable **result,
@@ -2229,6 +2345,9 @@ static inline int __mkroute_output(struct rtable **result,
 	rth->fl.fl4_src	= oldflp->fl4_src;
 	rth->fl.oif	= oldflp->oif;
 	rth->fl.mark	= oldflp->mark;
+	rth->fl.fl_ip_sport = oldflp->fl_ip_sport;
+	rth->fl.fl_ip_dport = oldflp->fl_ip_dport;
+	rth->fl.proto	= oldflp->proto;
 	rth->rt_dst	= fl->fl4_dst;
 	rth->rt_src	= fl->fl4_src;
 	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
@@ -2287,11 +2406,8 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
 {
 	struct rtable *rth = NULL;
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
-	unsigned hash;
-	if (err == 0) {
-		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
-		err = rt_intern_hash(hash, rth, rp);
-	}
+	if (err == 0)
+		err = rt_intern_flow(oldflp, rth, rp);
 	return err;
 }
@@ -2556,14 +2672,13 @@ out:	return err;
 
 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 {
-	unsigned hash;
 	struct rtable *rth;
+	u32 key[LPK];
 
-	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
+	flow_to_key(flp, key);
 
 	rcu_read_lock_bh();
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->u.rt_next)) {
+
+	for (rth = unicache_lookup(t_unicache, key); rth;
+	     rth = rcu_dereference(rth->u.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
 		    rth->fl.fl4_src == flp->fl4_src &&
 		    rth->fl.iif == 0 &&
@@ -2593,7 +2708,6 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 		RT_CACHE_STAT_INC(out_hlist_search);
 	}
 	rcu_read_unlock_bh();
-
 	return ip_route_output_slow(rp, flp);
 }
@@ -2851,6 +2965,12 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 	rt_cache_flush(0);
 }
 
+void ip_rt_new_size(struct trie *t)
+{
+	ipv4_dst_ops.gc_thresh = t->gc_thresh * ip_rt_gc_elasticity;
+	ip_rt_max_size = t->gc_thresh * (ip_rt_gc_elasticity + 1);
+}
+
 #ifdef CONFIG_SYSCTL
 
 static int flush_delay;
@@ -3151,8 +3271,14 @@ int __init ip_rt_init(void)
 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
 	rt_hash_lock_init();
 
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	/*
+	 * Size of trash (GC_THRESH) is the key parameter. We scale
+	 * max_size and gc_thresh after it.
+	 *
+	 */
+
+	ipv4_dst_ops.gc_thresh = GC_THRESH * ip_rt_gc_elasticity;
+	ip_rt_max_size = GC_THRESH * (ip_rt_gc_elasticity + 1);
 
 	devinet_init();
 	ip_fib_init();
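Note: ip_rt_redirect() and ip_rt_frag_needed() above build the u32 key[LPK] by hand with fill_key_ipv4_trash(), while the fast path goes through flow_to_key(). The actual packing lives in the unshown unicache code; the sketch below is merely consistent with the call sites — LPK and the word order are guesses, not taken from the patch:

```c
/* Sketch of the key packing implied by the call sites.  LPK and the
 * word order are assumptions; the real fill_key_ipv4_trash() is in
 * net/core/unicache.c, which this diff does not include. */
#define LPK 4	/* assumed: 4 x 32 bit = 128 bit full-flow key */

static void fill_key_ipv4_trash(u32 *key, __be32 saddr, __be32 daddr,
				u32 prts, u32 proto)
{
	key[0] = (__force u32) saddr;
	key[1] = (__force u32) daddr;
	key[2] = prts;		/* (dprt << 16) + sprt, as built by callers */
	key[3] = proto;		/* callers pass iph->protocol << 16 */
}
```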
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 40cf0d0..1893f2a 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -603,6 +603,9 @@ static void icmp_unreach(struct sk_buff *skb)
 	struct net_protocol *ipprot;
 	struct sock *raw_sk;
 	u32 info = 0;
+	struct udphdr *phdr;
+	__be16 sprt;
+	__be16 dprt;
 
 	/*
 	 *	Incomplete header ?
@@ -627,14 +630,27 @@ static void icmp_unreach(struct sk_buff *skb)
 	case ICMP_PORT_UNREACH:
 		break;
 	case ICMP_FRAG_NEEDED:
+		/*
+		 * We need sprt/dprt for the unicache lookup; TCP and UDP
+		 * keep the ports at the same offsets, so a UDP cast works.
+		 */
+
+		if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+			goto out;
+
+		phdr = (struct udphdr *) (skb->data + iph->ihl*4);
+		dprt = phdr->dest;
+		sprt = phdr->source;
+
+		printk("ICMP sprt=%d dprt=%d proto=%d\n", sprt, dprt, iph->protocol);
+
 		if (ipv4_config.no_pmtu_disc) {
 			LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: "
 					       "fragmentation needed "
 					       "and DF set.\n",
 				       NIPQUAD(iph->daddr));
 		} else {
-			info = ip_rt_frag_needed(iph,
-						 ntohs(icmph->un.frag.mtu));
+			info = ip_rt_frag_needed(iph, sprt, dprt,
+						 ntohs(icmph->un.frag.mtu));
 			if (!info)
 				goto out;
 		}
@@ -731,6 +747,9 @@ out_err:
 static void icmp_redirect(struct sk_buff *skb)
 {
 	struct iphdr *iph;
+	struct udphdr *phdr;
+	__be16 sprt, dprt;
+
 	if (skb->len < sizeof(struct iphdr))
 		goto out_err;
@@ -742,6 +761,19 @@ static void icmp_redirect(struct sk_buff *skb)
 		goto out;
 
 	iph = (struct iphdr *)skb->data;
+
+	/*
+	 * We need sprt/dprt for the unicache lookup; TCP and UDP
+	 * keep the ports at the same offsets, so a UDP cast works.
+	 */
+
+	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+		goto out;
+
+	phdr = (struct udphdr *) (skb->data + iph->ihl*4);
+	dprt = phdr->dest;
+	sprt = phdr->source;
 
 	switch (skb->h.icmph->code & 7) {
 	case ICMP_REDIR_NET:
@@ -751,9 +783,10 @@ static void icmp_redirect(struct sk_buff *skb)
 	 */
 	case ICMP_REDIR_HOST:
 	case ICMP_REDIR_HOSTTOS:
-		ip_rt_redirect(skb->nh.iph->saddr, iph->daddr,
+		ip_rt_redirect(iph, sprt, dprt,
+			       skb->nh.iph->saddr,
 			       skb->h.icmph->un.gateway,
-			       iph->saddr, skb->dev);
+			       skb->dev);
 		break;
 	}
 out:
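Note: both ICMP handlers read the embedded transport ports through a struct udphdr even when the inner packet is TCP, as the comments say. That is safe because TCP and UDP both carry the 16-bit source and destination ports in the first four bytes of the header. A hypothetical compile-time assertion of that invariant (not part of the patch):

```c
/* Hypothetical compile-time check of the invariant the ICMP code above
 * relies on: TCP and UDP place source (offset 0) and dest (offset 2)
 * ports identically, so one udphdr cast serves both protocols. */
#include <linux/kernel.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <stddef.h>

static inline void unicache_icmp_port_offsets_check(void)
{
	BUILD_BUG_ON(offsetof(struct tcphdr, source) !=
		     offsetof(struct udphdr, source));
	BUILD_BUG_ON(offsetof(struct tcphdr, dest) !=
		     offsetof(struct udphdr, dest));
}
```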