Subject: [PATCH] nethost: ipv4 rework the routing so everything actually works. From: Eric W. Biederman Date: 1133975916 -0700 Linux IP routing is by destination address. A few other parameters are considered, but nothing else really matters. This is a problem if you have multiple hosts on one machine. - For loopback addresses you need to know which host you are in, in order to route to yourself. - Even worse, if you don't specify a source address (selecting only by destination), the kernel can easily pick a source address from another host. --- include/linux/inetdevice.h | 2 - include/net/ip_fib.h | 2 - include/net/route.h | 5 +- net/atm/clip.c | 2 - net/core/sock.c | 3 + net/ipv4/arp.c | 8 ++- net/ipv4/devinet.c | 37 ++++++++++------ net/ipv4/fib_frontend.c | 8 ++- net/ipv4/fib_semantics.c | 2 - net/ipv4/icmp.c | 8 ++- net/ipv4/igmp.c | 6 +-- net/ipv4/ip_input.c | 2 - net/ipv4/ip_output.c | 2 - net/ipv4/ipmr.c | 4 +- net/ipv4/netfilter.c | 4 +- net/ipv4/raw.c | 2 - net/ipv4/route.c | 104 ++++++++++++++++++++++++++++++-------------- net/ipv4/syncookies.c | 2 - net/ipv4/tcp_ipv4.c | 2 - net/ipv4/udp.c | 2 - 20 files changed, 129 insertions(+), 78 deletions(-) 324fe4ce72762b3e59e5161302b62ac18118a3c8 diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index e8b315c..2b22334 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -117,7 +117,7 @@ extern u32 inet_select_addr(struct neth extern u32 inet_confirm_addr(struct nethost *host, const struct net_device *dev, u32 dst, u32 local, int scope); -extern struct nethost * inet_dev_host(int ifindex, u32 addr); +extern struct nethost * inet_host(struct net_device *dev, u32 addr); extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask); extern void inet_forward_change(void); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 0cd66a4..e3d5a36 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -234,7 +234,7 @@ extern int
inet_rtm_delroute(struct sk_b extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb); -extern int fib_validate_source(struct nethost *host, u32 src, u32 dst, u8 tos, int oif, +extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); diff --git a/include/net/route.h b/include/net/route.h index fff3acd..e1a0d02 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -75,7 +75,8 @@ struct rtable /* Miscellaneous cached information */ __u32 rt_spec_dst; /* RFC1122 specific destination */ struct inet_peer *peer; /* long-living peer info */ - struct nethost *rt_host; /* Only used on local input routes */ + struct nethost *rt_dhost; /* Host we are sending to */ + struct nethost *rt_shost; /* Host we are sending from */ }; struct ip_rt_acct @@ -115,7 +116,7 @@ extern void ip_rt_redirect(u32 old_gw, extern void ip_rt_advice(struct rtable **rp, int advice); extern void rt_cache_flush(int how); extern int __ip_route_output_key(struct nethost *host, struct rtable **, const struct flowi *flp); -extern int ip_route_output_key(struct rtable **, struct flowi *flp); +extern int ip_route_output_key(struct nethost *host, struct rtable **, struct flowi *flp); extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin); extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); diff --git a/net/atm/clip.c b/net/atm/clip.c index 4f54c9a..039436e 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -537,7 +537,7 @@ static int clip_setentry(struct atm_vcc unlink_clip_vcc(clip_vcc); return 0; } - error 
= ip_route_output_key(&rt,&fl); + error = ip_route_output_key(NULL, &rt,&fl); if (error) return error; neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1); ip_rt_put(rt); diff --git a/net/core/sock.c b/net/core/sock.c index b0f8918..1ded9c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -774,6 +774,9 @@ struct sock *sk_clone(const struct sock if (newsk->sk_prot->sockets_allocated) atomic_inc(newsk->sk_prot->sockets_allocated); + + if (sk->sk_host) + get_host(sk->sk_host); } out: return newsk; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9586d33..744ea0a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -421,13 +421,15 @@ static int arp_ignore(struct in_device * static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev) { + struct nethost *host; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, .saddr = tip } } }; struct rtable *rt; int flag = 0; /*unsigned long now; */ - if (ip_route_output_key(&rt, &fl) < 0) + host = inet_host(dev, tip); + if (ip_route_output_key(host, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); @@ -1002,7 +1004,7 @@ static int arp_req_set(struct arpreq *r, struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(current->host, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -1108,7 +1110,7 @@ static int arp_req_delete(struct arpreq struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = RTO_ONLINK } } }; struct rtable * rt; - if ((err = ip_route_output_key(&rt, &fl)) != 0) + if ((err = ip_route_output_key(current->host, &rt, &fl)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f499f74..a00a305 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -956,33 +956,42 @@ u32 inet_confirm_addr(struct nethost *ho } /** - * inet_dev_host - Find the host for a 
device, interface pair - * @ifindex: index of device - * @addr: ip address on interface for host + * inet_host - Attempt to find the host for an incoming ipaddress + * @dev: Network device we saw the address on + * @addr: ip address whose host we are seeking */ -struct nethost *inet_dev_host(int ifindex, u32 addr) +struct nethost *inet_host(struct net_device *dev, u32 addr) { - struct net_device *dev; + struct net_device *odev; struct in_device *in_dev; - struct nethost *host = NULL; /* ERR_PTR(-EADDRNOTAVAIL); */ - dev = dev_get_by_index(ifindex); - if (!dev) - goto out; + struct nethost *host = NULL; rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev))) { for_ifa(in_dev) { if (ifa->ifa_address == addr) { host = ifa->ifa_host; - break; + goto found; } } endfor_ifa(in_dev); } - rcu_read_unlock(); - if (!host && dev->type == ARPHRD_LOOPBACK) { + + for (odev = dev_base ; odev ; odev = odev->next) { /* not on dev: try every other device */ + if (odev == dev) + continue; + if ((in_dev = __in_dev_get_rcu(odev))) { + for_ifa(in_dev) { + if (ifa->ifa_address == addr) { + host = ifa->ifa_host; + goto found; + } + } endfor_ifa(in_dev); + } + } + if (dev->type == ARPHRD_LOOPBACK) { host = container_of(dev, struct nethost, loopback_dev); } - dev_put(dev); -out: +found: + rcu_read_unlock(); return host; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0515fcf..f9a2db0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -158,7 +158,7 @@ unsigned inet_addr_type(u32 addr) - check, that packet arrived from expected physical interface.
*/ -int fib_validate_source(struct nethost *host, u32 src, u32 dst, u8 tos, int oif, +int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag) { struct in_device *in_dev; @@ -187,7 +187,7 @@ int fib_validate_source(struct nethost * goto last_resort; if (res.type != RTN_UNICAST) goto e_inval_res; - *spec_dst = FIB_RES_PREFSRC(host, res); + *spec_dst = FIB_RES_PREFSRC(NULL, res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) @@ -209,7 +209,7 @@ int fib_validate_source(struct nethost * ret = 0; if (fib_lookup(&fl, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(host, res); + *spec_dst = FIB_RES_PREFSRC(NULL, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } fib_res_put(&res); @@ -219,7 +219,7 @@ int fib_validate_source(struct nethost * last_resort: if (rpf) goto e_inval; - *spec_dst = inet_select_addr(host, dev, 0, RT_SCOPE_UNIVERSE); + *spec_dst = inet_select_addr(NULL, dev, 0, RT_SCOPE_UNIVERSE); *itag = 0; return 0; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 15a3972..cd68986 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -937,7 +937,7 @@ u32 fib_res_prefsrc(struct nethost *host u32 addr = 0; if (res->fi->fib_prefsrc) { addr = res->fi->fib_prefsrc; - if (host && host != inet_dev_host(FIB_RES_DEV(*res)->ifindex, addr)) + if (host && host != inet_host(FIB_RES_DEV(*res), addr)) addr = 0; } if (!addr) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cbbd593..5c6663e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -406,7 +406,7 @@ static void icmp_reply(struct icmp_bxm * .saddr = rt->rt_spec_dst, .tos = RT_TOS(skb->nh.iph->tos) } }, .proto = IPPROTO_ICMP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) goto out_unlock; } if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, @@ -515,7 +515,7 @@ void 
icmp_send(struct sk_buff *skb_in, i if (!(rt->rt_flags & RTCF_LOCAL)) { if (sysctl_icmp_errors_use_inbound_ifaddr) { struct nethost *host; - host = inet_dev_host(skb_in->dev->ifindex, iph->saddr); + host = inet_host(skb_in->dev, iph->saddr); saddr = inet_select_addr(host, skb_in->dev, 0, RT_SCOPE_LINK); } else @@ -564,7 +564,7 @@ void icmp_send(struct sk_buff *skb_in, i } } }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) goto out_unlock; } @@ -694,7 +694,7 @@ static void icmp_unreach(struct sk_buff iph = (struct iphdr *)skb->data; protocol = iph->protocol; - host = inet_dev_host(skb->dev->ifindex, iph->saddr); + host = inet_host(skb->dev, iph->saddr); /* * Deliver ICMP message to raw sockets. Pretty useless feature? diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 8b6d393..2fe7572 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -286,7 +286,7 @@ static struct sk_buff *igmpv3_newpack(st .nl_u = { .ip4_u = { .daddr = IGMPV3_ALL_MCR } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(NULL, &rt, &fl)) { kfree_skb(skb); return NULL; } @@ -630,7 +630,7 @@ static int igmp_send_report(struct in_de struct flowi fl = { .oif = dev->ifindex, .nl_u = { .ip4_u = { .daddr = dst } }, .proto = IPPROTO_IGMP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(NULL, &rt, &fl)) return -1; } if (rt->rt_src == 0) { @@ -1317,7 +1317,7 @@ static struct in_device * ip_mc_find_dev __dev_put(dev); } - if (!dev && !ip_route_output_key(&rt, &fl)) { + if (!dev && !ip_route_output_key(NULL, &rt, &fl)) { dev = rt->u.dst.dev; ip_rt_put(rt); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4d4dcee..af69668 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -267,7 +267,7 @@ int ip_local_deliver(struct sk_buff *skb { WARN_ON(!MULTICAST(skb->nh.iph->daddr) && (skb->nh.iph->daddr != 0xffffffff) && - !(((struct rtable *)skb->dst)->rt_host)); + !(((struct rtable 
*)skb->dst)->rt_dhost)); /* * Reassemble IP fragments. */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3f1a263..b924c81 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1284,7 +1284,7 @@ void ip_send_reply(struct sock *sk, stru { .sport = skb->h.th->dest, .dport = skb->h.th->source } }, .proto = sk->sk_protocol }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) return; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb..8aad5b9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1162,7 +1162,7 @@ static void ipmr_queue_xmit(struct sk_bu .saddr = vif->local, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(NULL, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { @@ -1171,7 +1171,7 @@ static void ipmr_queue_xmit(struct sk_bu { .daddr = iph->daddr, .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; - if (ip_route_output_key(&rt, &fl)) + if (ip_route_output_key(NULL, &rt, &fl)) goto out_free; } diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index ae0779d..63440d1 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -34,7 +34,7 @@ int ip_route_me_harder(struct sk_buff ** fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; #endif fl.proto = iph->protocol; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(NULL, &rt, &fl) != 0) return -1; /* Drop old route. */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct sk_buff ** /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. 
*/ fl.nl_u.ip4_u.daddr = iph->saddr; - if (ip_route_output_key(&rt, &fl) != 0) + if (ip_route_output_key(NULL, &rt, &fl) != 0) return -1; odst = (*pskb)->dst; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2f2b0b7..c66b9d3 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -162,7 +162,7 @@ int raw_v4_input(struct sk_buff *skb, st head = &raw_v4_htable[hash]; if (hlist_empty(head)) goto out; - host = inet_dev_host(skb->dev->ifindex, iph->daddr); + host = inet_host(skb->dev, iph->daddr); sk = __raw_v4_lookup(__sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex, host); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d500c4d..dbf6a17 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -927,9 +927,11 @@ restart: while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && - compare_keys(&rth->fl, &rt->fl)) { + compare_keys(&rth->fl, &rt->fl) && + (rth->rt_shost == rt->rt_shost)) { #else - if (compare_keys(&rth->fl, &rt->fl)) { + if (compare_keys(&rth->fl, &rt->fl) && + (rth->rt_shost == rt->rt_shost)) { #endif /* Put it first */ *rthp = rth->u.rt_next; @@ -1612,8 +1614,8 @@ static int ip_route_input_mc(struct sk_b if (!LOCAL_MCAST(daddr)) goto e_inval; spec_dst = inet_select_addr(NULL, dev, 0, RT_SCOPE_LINK); - } else if (fib_validate_source(NULL, saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) + } else if (fib_validate_source(saddr, 0, tos, 0, + dev, &spec_dst, &itag) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); @@ -1646,7 +1648,8 @@ static int ip_route_input_mc(struct sk_b rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; - rth->rt_host = NULL; + rth->rt_dhost = NULL; + rth->rt_shost = NULL; rth->rt_flags = RTCF_MULTICAST; if (our) { rth->u.dst.input= ip_local_deliver; @@ -1727,7 +1730,7 @@ static inline int __mkroute_input(struct } - err = fib_validate_source(NULL, saddr, daddr, tos, FIB_RES_OIF(*res), + err = fib_validate_source(saddr, 
daddr, tos, FIB_RES_OIF(*res), in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, @@ -1792,6 +1795,9 @@ static inline int __mkroute_input(struct rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; + rth->rt_dhost = NULL; + rth->rt_shost = NULL; + rt_set_nexthop(rth, res, itag); rth->rt_flags = flags; @@ -1919,6 +1925,7 @@ static int ip_route_input_slow(struct sk int err = -EINVAL; int free_res = 0; struct nethost *host = NULL; + struct net_device *loopback_dev = &init_host.loopback_dev; /* IP on this device is disabled. */ @@ -1961,11 +1968,12 @@ static int ip_route_input_slow(struct sk if (res.type == RTN_LOCAL) { int result; - host = inet_dev_host(dev->ifindex, daddr); + host = inet_host(dev, daddr); if (!host) goto martian_destination; - result = fib_validate_source(host, saddr, daddr, tos, - host->loopback_dev.ifindex, + loopback_dev = &host->loopback_dev; + result = fib_validate_source(saddr, daddr, tos, + loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1996,11 +2004,11 @@ brd_input: if (skb->protocol != htons(ETH_P_IP)) goto e_inval; - host = inet_dev_host(dev->ifindex, daddr); + host = NULL; if (ZERONET(saddr)) - spec_dst = inet_select_addr(host, dev, 0, RT_SCOPE_LINK); + spec_dst = inet_select_addr(NULL, dev, 0, RT_SCOPE_LINK); else { - err = fib_validate_source(host, saddr, 0, tos, 0, dev, + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag); if (err < 0) goto martian_source; @@ -2011,10 +2019,6 @@ brd_input: res.type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); - /* FIXME how do I handle broadcast traffic through loopback interfaces? 
*/ - if (!host) - host = &init_host; - local_input: rth = dst_alloc(&ipv4_dst_ops); if (!rth) @@ -2039,7 +2043,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &host->loopback_dev; + rth->u.dst.dev = loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2052,7 +2056,8 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - rth->rt_host = host; /* FIXME doesn't this need to be null for broadcast/multicast */ + rth->rt_dhost = host; + rth->rt_shost = NULL; hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); goto done; @@ -2112,6 +2117,7 @@ int ip_route_input(struct sk_buff *skb, #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == skb->nfmark && #endif + rth->rt_shost == NULL && rth->fl.fl4_tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); @@ -2160,7 +2166,8 @@ int ip_route_input(struct sk_buff *skb, } static inline int __mkroute_output(struct rtable **result, - struct fib_result* res, + struct fib_result* res, + struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, @@ -2248,20 +2255,21 @@ static inline int __mkroute_output(struc rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; + rth->rt_shost = host; rth->u.dst.output=ip_output; RT_CACHE_STAT_INC(out_slow_tot); if (dev_out->type == ARPHRD_LOOPBACK) { - rth->rt_host = container_of(dev_out, struct nethost, loopback_dev); + rth->rt_dhost = container_of(dev_out, struct nethost, loopback_dev); } if (flags & RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_host = NULL; + rth->rt_dhost = NULL; rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2293,13 +2301,14 @@ static inline int __mkroute_output(struc 
static inline int ip_mkroute_output_def(struct rtable **rp, struct fib_result* res, + struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned flags) { struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); + int err = __mkroute_output(&rth, res, host, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { u32 tos = RT_FL_TOS(oldflp); @@ -2314,6 +2323,7 @@ static inline int ip_mkroute_output_def( static inline int ip_mkroute_output(struct rtable** rp, struct fib_result* res, + struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, @@ -2342,7 +2352,7 @@ static inline int ip_mkroute_output(stru if (hop) ip_rt_put(*rp); - err = __mkroute_output(&rth, res, fl, oldflp, + err = __mkroute_output(&rth, res, host, fl, oldflp, dev2nexthop, flags); if (err != 0) @@ -2368,11 +2378,11 @@ static inline int ip_mkroute_output(stru } return err; } else { - return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, + return ip_mkroute_output_def(rp, res, host, fl, oldflp, dev_out, flags); } #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); + return ip_mkroute_output_def(rp, res, host, fl, oldflp, dev_out, flags); #endif } @@ -2532,15 +2542,35 @@ static int ip_route_output_slow(struct n free_res = 1; if (res.type == RTN_LOCAL) { + struct nethost *dest_host; + /* Special case for loopback addresses. + * + * Routing decisions in general and fib_lookup in particular + * make decisions based upon the destination address. + * + * In the context of multiple hosts each host has its + * own loopback interface. Each loopback interface is + * configured with the same loopback address. Therefore + * we need to know which host we are starting from + * to stay there. The destination address which fib_lookup + * uses is not enough information. + * + * Luckily this is easy.
+ */ + if (LOOPBACK(fl.fl4_dst)) + dest_host = host; + else + dest_host = inet_host(FIB_RES_DEV(res), fl.fl4_dst); + BUG_ON(!dest_host); if (!fl.fl4_src) { - if (!host || (inet_dev_host(FIB_RES_DEV(res)->ifindex, fl.fl4_dst) == host)) + if (!host || (dest_host == host)) fl.fl4_src = fl.fl4_dst; else fl.fl4_src = FIB_RES_PREFSRC(host, res); } if (dev_out) dev_put(dev_out); - dev_out = loopback_dev; + dev_out = &dest_host->loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2569,7 +2599,7 @@ static int ip_route_output_slow(struct n make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + err = ip_mkroute_output(rp, &res, host, &fl, oldflp, dev_out, flags); if (free_res) @@ -2596,6 +2626,7 @@ int __ip_route_output_key(struct nethost #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == flp->fl4_fwmark && #endif + rth->rt_shost == host && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { @@ -2626,11 +2657,11 @@ int __ip_route_output_key(struct nethost EXPORT_SYMBOL_GPL(__ip_route_output_key); -int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +static int __ip_route_output_flow(struct nethost *host, struct rtable **rp, + struct flowi *flp, struct sock *sk, int flags) { int err; - struct nethost *host = NULL; - + if (sk) host = sk->sk_host; @@ -2648,11 +2679,16 @@ int ip_route_output_flow(struct rtable * return 0; } +int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +{ + return __ip_route_output_flow(NULL, rp, flp, sk, flags); +} + EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct rtable **rp, struct flowi *flp) +int ip_route_output_key(struct nethost *host, struct rtable **rp, struct flowi *flp) { - return ip_route_output_flow(rp, flp, NULL, 0); + return __ip_route_output_flow(host, rp, flp, NULL, 0); } static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, @@ -2806,7 +2842,7 @@ int 
inet_rtm_getroute(struct sk_buff *in if (rta[RTA_OIF - 1]) memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); fl.oif = oif; - err = ip_route_output_key(&rt, &fl); + err = ip_route_output_key(in_skb->sk->sk_host, &rt, &fl); } if (err) goto out_free; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60e..20d60d4 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -259,7 +259,7 @@ struct sock *cookie_v4_check(struct sock .uli_u = { .ports = { .sport = skb->h.th->dest, .dport = skb->h.th->source } } }; - if (ip_route_output_key(&rt, &fl)) { + if (ip_route_output_key(sk->sk_host, &rt, &fl)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc9dd2d..68ce4a5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1239,7 +1239,7 @@ int tcp_v4_rcv(struct sk_buff *skb) rt = (struct rtable*)skb->dst; sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, ntohs(th->dest), - inet_iif(skb), rt->rt_host); + inet_iif(skb), rt->rt_dhost); if (!sk) goto no_tcp_socket; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 129268a..c65f8aa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1157,7 +1157,7 @@ int udp_rcv(struct sk_buff *skb) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, - skb->dev->ifindex, rt->rt_host); + skb->dev->ifindex, rt->rt_dhost); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); -- 1.0.GIT