Subject: [PATCH] nethost: rework to allow ipv4 to not need extra fields in the routing cache. From: Eric W. Biederman Date: 1134053657 -0700 Add flag RTCF_OUTPUT to tag output routes instead of using fl.iif = 0. This allows different logical hosts to set iif to their local loopback device. Setting iif to the host local loopback device has the nice property of tagging which host a route is for in a way that can be displayed with /bin/ip. Implement loopback_host, a function that can go from the loopback device to its host. I have implemented it a dozen times in inline code; this removes that duplication and makes the intent of my actions clear. Remove rt_dhost and rt_shost from struct rtable. I can derive rt_shost from fl.iif and rt_dhost from rt->u.dst.dev. Add the function rt_output to test if a route entry is an output route. This is used by several devices to detect packet loops. Add the function rt_dhost to return the destination host of a route. We use this fairly frequently now. Modify ip_route_connect, ip_route_output_key, and ip_route_output_flow to set fl.iif to the loopback interface of our current host, or to leave it 0 if we don't have a host. With the host now in fl.iif, remove the host parameter from the rest of the output routing functions. Fix inet_bind to properly confirm an IP address is a member of our current host using inet_confirm_addr. Fix raw_bind to use the proper test to confirm an IP address is for our current host as well. Finally, get the proc output working properly for udp and raw protocols. Modify inet_del_ifa and inet_insert_ifa to allow one primary interface for each host even if another host has a primary ifa on that subnet. This is important for finding usable packet source addresses. Fix inet_host to correctly look on other interfaces if the current interface does not list the current host. Modify ip_route_output_slow - so it derives host and loopback from fl.iif. 
- Disallows loopback addresses as a source address if I don't have a host. - Confirms the provided source address is usable on my current host. - Make it an error to have no destination address and no host. - When I am routing between two local hosts report the scope as RT_SCOPE_LINK and not RT_SCOPE_HOST. This prevents me from using the loopback interface address as a source and creating packets that cannot be replied to. - Hold a reference to my local loopback_dev while I am making routing decisions; this ensures struct nethost won't disappear during ip_route_output_slow. --- include/linux/in_route.h | 1 include/linux/nethost.h | 8 +++ include/net/route.h | 22 +++++++- net/bridge/br_netfilter.c | 6 +- net/ipv4/af_inet.c | 9 ++- net/ipv4/devinet.c | 18 +++---- net/ipv4/icmp.c | 4 +- net/ipv4/igmp.c | 2 - net/ipv4/ip_gre.c | 2 - net/ipv4/ip_input.c | 3 - net/ipv4/ip_output.c | 2 - net/ipv4/ipmr.c | 2 - net/ipv4/raw.c | 6 +- net/ipv4/route.c | 120 ++++++++++++++++++++++++++------------------- net/ipv4/tcp_ipv4.c | 2 - net/ipv4/udp.c | 4 +- net/ipv4/xfrm4_policy.c | 2 - 17 files changed, 127 insertions(+), 86 deletions(-) cd17f5a42af791f7c6d0da5dee9b6ea065622938 diff --git a/include/linux/in_route.h b/include/linux/in_route.h index 61f25c3..16b796c 100644 --- a/include/linux/in_route.h +++ b/include/linux/in_route.h @@ -14,6 +14,7 @@ #define RTCF_REDIRECTED 0x00040000 #define RTCF_TPROXY 0x00080000 +#define RTCF_OUTPUT 0x00100000 /* This is a cached output route */ #define RTCF_FAST 0x00200000 #define RTCF_MASQ 0x00400000 #define RTCF_SNAT 0x00800000 diff --git a/include/linux/nethost.h b/include/linux/nethost.h index 0cd135d..9e394b4 100644 --- a/include/linux/nethost.h +++ b/include/linux/nethost.h @@ -77,4 +77,12 @@ static inline void exit_host(struct task put_host(host); } +static inline struct nethost *loopback_host(struct net_device *dev) +{ + struct nethost *host = NULL; + if (dev && (dev->flags & IFF_LOOPBACK)) + host = container_of(dev, struct nethost, 
loopback_dev); + return host; +} + #endif /* _LINUX_NETHOST_H */ diff --git a/include/net/route.h b/include/net/route.h index e1a0d02..1271713 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifndef __KERNEL__ #warning This file is not supposed to be used outside of kernel. @@ -75,8 +76,6 @@ struct rtable /* Miscellaneous cached information */ __u32 rt_spec_dst; /* RFC1122 specific destination */ struct inet_peer *peer; /* long-living peer info */ - struct nethost *rt_dhost; /* Host we are sending to */ - struct nethost *rt_shost; /* Host we are sending from */ }; struct ip_rt_acct @@ -115,7 +114,7 @@ extern void ip_rt_redirect(u32 old_gw, u32 src, u8 tos, struct net_device *dev); extern void ip_rt_advice(struct rtable **rp, int advice); extern void rt_cache_flush(int how); -extern int __ip_route_output_key(struct nethost *host, struct rtable **, const struct flowi *flp); +extern int __ip_route_output_key(struct rtable **, const struct flowi *flp); extern int ip_route_output_key(struct nethost *host, struct rtable **, struct flowi *flp); extern int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct net_device *devin); @@ -148,6 +147,7 @@ static inline int ip_route_connect(struc u16 sport, u16 dport, struct sock *sk) { struct flowi fl = { .oif = oif, + .iif = sk->sk_host->loopback_dev.ifindex, .nl_u = { .ip4_u = { .daddr = dst, .saddr = src, .tos = tos } }, @@ -158,7 +158,7 @@ static inline int ip_route_connect(struc int err; if (!dst || !src) { - err = __ip_route_output_key(sk->sk_host, rp, &fl); + err = __ip_route_output_key(rp, &fl); if (err) return err; fl.fl4_dst = (*rp)->rt_dst; @@ -197,6 +197,20 @@ static inline struct inet_peer *rt_get_p return rt->peer; } +static inline int rt_output(struct rtable *rt) +{ + return (rt->rt_flags & RTCF_OUTPUT); +} + +static inline struct nethost 
*rt_dhost(struct rtable *rt) +{ + struct net_device *dev = rt->u.dst.dev; + struct nethost *host = NULL; + if (!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) + host = loopback_host(dev); + return host; +} + extern ctl_table ipv4_route_table[]; #endif /* _ROUTE_H */ diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index d8e36b7..29e15e4 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -164,8 +164,8 @@ static void __br_dnat_complain(void) * Let us now consider the case that ip_route_input() fails: * * After a "echo '0' > /proc/sys/net/ipv4/ip_forward" ip_route_input() - * will fail, while __ip_route_output_key() will return success. The source - * address for __ip_route_output_key() is set to zero, so __ip_route_output_key + * will fail, while ip_route_output_key() will return success. The source + * address for ip_route_output_key() is set to zero, so ip_route_output_key * thinks we're handling a locally generated packet and won't care * if IP forwarding is allowed. We send a warning message to the users's * log telling her to put IP forwarding on. @@ -213,7 +213,7 @@ static int br_nf_pre_routing_finish(stru { .ip4_u = { .daddr = iph->daddr, .saddr = 0 , .tos = RT_TOS(iph->tos)} }, .proto = 0}; - if (!ip_route_output_key(&rt, &fl)) { + if (!ip_route_output_key(NULL, &rt, &fl)) { /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. * - Deal with redirected traffic. */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c6e0361..bc2f5aa 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -437,10 +437,11 @@ int inet_bind(struct socket *sock, struc goto out; /* Verify local addresses are for the current host */ - /* FIXME do I need to handle inet->freebind and sysctl_ip_nonlocal_bind here? 
*/ - if ((chk_addr_ret == RTN_LOCAL) && - !inet_confirm_addr(sk->sk_host, NULL, 0, addr->sin_addr.s_addr, - RT_SCOPE_UNIVERSE)) + if (!sysctl_ip_nonlocal_bind && + !inet->freebind && + chk_addr_ret == RTN_LOCAL && + !inet_confirm_addr(sk->sk_host, NULL, 0, addr->sin_addr.s_addr, + RT_SCOPE_NOWHERE)) goto out; snum = ntohs(addr->sin_port); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index a00a305..0b8e7ca 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -185,6 +185,8 @@ static void inethost_destroy(struct neth { /* FIXME is the lock correct in this function? */ struct net_device *dev; + if (!host) + return; read_lock(&dev_base_lock); rcu_read_lock(); for (dev = dev_base; dev; dev = dev->next) { @@ -277,6 +279,7 @@ static void inet_del_ifa(struct in_devic while ((ifa = *ifap1) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) || + ifa1->ifa_host != ifa->ifa_host || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; @@ -357,7 +360,8 @@ static int inet_insert_ifa(struct in_ifa inet_free_ifa(ifa); return -EINVAL; } - ifa->ifa_flags |= IFA_F_SECONDARY; + if (ifa1->ifa_host == ifa->ifa_host) + ifa->ifa_flags |= IFA_F_SECONDARY; } } @@ -978,7 +982,7 @@ struct nethost *inet_host(struct net_dev for (odev = dev_base ; odev ; odev = dev->next) { if (odev == dev) continue; - if ((in_dev = __in_dev_get_rcu(dev))) { + if ((in_dev = __in_dev_get_rcu(odev))) { for_ifa(in_dev) { if (ifa->ifa_address == addr) { host = ifa->ifa_host; @@ -987,9 +991,7 @@ struct nethost *inet_host(struct net_dev } endfor_ifa(in_dev); } } - if (dev->type == ARPHRD_LOOPBACK) { - host = container_of(dev, struct nethost, loopback_dev); - } + host = loopback_host(dev); found: rcu_read_unlock(); return host; @@ -1069,7 +1071,7 @@ static int inetdev_event(struct notifier if (dev->type == ARPHRD_LOOPBACK) { struct in_ifaddr *ifa; if ((ifa = inet_alloc_ifa()) != NULL) { - ifa->ifa_host = container_of(dev, struct nethost, loopback_dev); + 
ifa->ifa_host = loopback_host(dev); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1091,9 +1093,7 @@ static int inetdev_event(struct notifier break; /* MTU falled under 68, disable IP */ case NETDEV_UNREGISTER: - if (dev->type == ARPHRD_LOOPBACK) { - inethost_destroy(container_of(dev, struct nethost, loopback_dev)); - } + inethost_destroy(loopback_host(dev)); inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5c6663e..f08ca46 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -406,7 +406,7 @@ static void icmp_reply(struct icmp_bxm * .saddr = rt->rt_spec_dst, .tos = RT_TOS(skb->nh.iph->tos) } }, .proto = IPPROTO_ICMP }; - if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) + if (ip_route_output_key(rt_dhost(rt), &rt, &fl)) goto out_unlock; } if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, @@ -564,7 +564,7 @@ void icmp_send(struct sk_buff *skb_in, i } } }; - if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) + if (ip_route_output_key(rt_dhost(rt), &rt, &fl)) goto out_unlock; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2fe7572..d77ba2d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -888,7 +888,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ - if (((struct rtable*)skb->dst)->fl.iif == 0) + if (rt_output((struct rtable *)skb->dst)) break; igmp_heard_report(in_dev, ih->group); break; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f..c7f2d40 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -623,7 +623,7 @@ static int ipgre_rcv(struct sk_buff *skb #ifdef CONFIG_NET_IPGRE_BROADCAST if (MULTICAST(iph->daddr)) { /* Looped back packet, drop it! 
*/ - if (((struct rtable*)skb->dst)->fl.iif == 0) + if (rt_output((struct rtable *)skb->dst)) goto drop; tunnel->stat.multicast++; skb->pkt_type = PACKET_BROADCAST; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index af69668..473d0f2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -265,9 +265,6 @@ static inline int ip_local_deliver_finis */ int ip_local_deliver(struct sk_buff *skb) { - WARN_ON(!MULTICAST(skb->nh.iph->daddr) && - (skb->nh.iph->daddr != 0xffffffff) && - !(((struct rtable *)skb->dst)->rt_dhost)); /* * Reassemble IP fragments. */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b924c81..4e4e9a1 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1284,7 +1284,7 @@ void ip_send_reply(struct sock *sk, stru { .sport = skb->h.th->dest, .dport = skb->h.th->source } }, .proto = sk->sk_protocol }; - if (ip_route_output_key(rt->rt_dhost, &rt, &fl)) + if (ip_route_output_key(rt_dhost(rt), &rt, &fl)) return; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8aad5b9..82ff9d7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1261,7 +1261,7 @@ static int ip_mr_forward(struct sk_buff if (vif_table[vif].dev != skb->dev) { int true_vifi; - if (((struct rtable*)skb->dst)->fl.iif == 0) { + if (rt_output((struct rtable *)skb->dst)) { /* It is our own packet, looped back. Very complicated situation... 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 15cd673..5d70aae 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -162,7 +162,7 @@ int raw_v4_input(struct sk_buff *skb, st head = &raw_v4_htable[hash]; if (hlist_empty(head)) goto out; - host = inet_host(skb->dev, iph->daddr); + host = rt_dhost((struct rtable *)skb->dst); sk = __raw_v4_lookup(__sk_head(head), iph->protocol, iph->saddr, iph->daddr, skb->dev->ifindex, host); @@ -562,7 +562,7 @@ static int raw_bind(struct sock *sk, str /* Verify local addresses are for the current host */ if ((chk_addr_ret == RTN_LOCAL) && !inet_confirm_addr(sk->sk_host, NULL, 0, addr->sin_addr.s_addr, - RT_SCOPE_UNIVERSE)) + RT_SCOPE_NOWHERE)) goto out; inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; @@ -780,7 +780,7 @@ static struct sock *raw_get_next(struct sk = sk_next(sk); try_again: ; - } while ((sk && sk->sk_family != PF_INET) || (sk->sk_host == state->host)); + } while (sk && ((sk->sk_family != PF_INET) || (sk->sk_host == state->host))); if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { sk = sk_head(&raw_v4_htable[state->bucket]); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index dbf6a17..604f31e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -501,7 +501,7 @@ static __inline__ int rt_fast_clean(stru /* Kill broadcast/multicast entries very aggresively, if they collide in hash table with more useful entries */ return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && - rth->fl.iif && rth->u.rt_next; + !(rth->rt_flags & RTCF_OUTPUT) && rth->u.rt_next; } static __inline__ int rt_valuable(struct rtable *rth) @@ -546,7 +546,7 @@ static inline u32 rt_score(struct rtable if (rt_valuable(rt)) score |= (1<<31); - if (!rt->fl.iif || + if ((rt->rt_flags & RTCF_OUTPUT) || !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) score |= (1<<30); @@ -927,11 +927,9 @@ restart: while ((rth = *rthp) != NULL) { #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED if (!(rth->u.dst.flags & DST_BALANCED) && - 
compare_keys(&rth->fl, &rt->fl) && - (rth->rt_shost == rt->rt_shost)) { + compare_keys(&rth->fl, &rt->fl)) { #else - if (compare_keys(&rth->fl, &rt->fl) && - (rth->rt_shost == rt->rt_shost)) { + if (compare_keys(&rth->fl, &rt->fl)) { #endif /* Put it first */ *rthp = rth->u.rt_next; @@ -989,7 +987,7 @@ restart: /* Try to bind route to arp only if it is output route or unicast forwarding path. */ - if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { + if (rt->rt_type == RTN_UNICAST || (rt->rt_flags & RTCF_OUTPUT)) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); @@ -1157,7 +1155,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd rth->fl.fl4_src != skeys[i] || rth->fl.fl4_tos != tos || rth->fl.oif != ikeys[k] || - rth->fl.iif != 0) { + !(rth->rt_flags & RTCF_OUTPUT)) { rthp = &rth->u.rt_next; continue; } @@ -1411,7 +1409,7 @@ unsigned short ip_rt_frag_needed(struct rth->rt_dst == daddr && rth->rt_src == iph->saddr && rth->fl.fl4_tos == tos && - rth->fl.iif == 0 && + (rth->rt_flags & RTCF_OUTPUT) && !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { unsigned short mtu = new_mtu; @@ -1529,7 +1527,7 @@ void ip_rt_get_source(struct nethost *ho u32 src; struct fib_result res; - if (rt->fl.iif == 0) + if (rt->rt_flags & RTCF_OUTPUT) src = rt->rt_src; else if (fib_lookup(&rt->fl, &res) == 0) { src = FIB_RES_PREFSRC(host, res); @@ -1648,8 +1646,6 @@ static int ip_route_input_mc(struct sk_b rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; - rth->rt_dhost = NULL; - rth->rt_shost = NULL; rth->rt_flags = RTCF_MULTICAST; if (our) { rth->u.dst.input= ip_local_deliver; @@ -1795,9 +1791,6 @@ static inline int __mkroute_input(struct rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->rt_dhost = NULL; - rth->rt_shost = NULL; - rt_set_nexthop(rth, res, itag); rth->rt_flags = flags; @@ -2056,8 +2049,6 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - rth->rt_dhost = host; - 
rth->rt_shost = NULL; hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); goto done; @@ -2117,7 +2108,6 @@ int ip_route_input(struct sk_buff *skb, #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == skb->nfmark && #endif - rth->rt_shost == NULL && rth->fl.fl4_tos == tos) { rth->u.dst.lastuse = jiffies; dst_hold(&rth->u.dst); @@ -2167,7 +2157,6 @@ int ip_route_input(struct sk_buff *skb, static inline int __mkroute_output(struct rtable **result, struct fib_result* res, - struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, @@ -2188,6 +2177,7 @@ static inline int __mkroute_output(struc else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) return -EINVAL; + flags |= RTCF_OUTPUT; if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; @@ -2242,6 +2232,7 @@ static inline int __mkroute_output(struc rth->fl.fl4_tos = tos; rth->fl.fl4_src = oldflp->fl4_src; rth->fl.oif = oldflp->oif; + rth->fl.iif = oldflp->iif; #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= oldflp->fl4_fwmark; #endif @@ -2255,21 +2246,16 @@ static inline int __mkroute_output(struc rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; - rth->rt_shost = host; rth->u.dst.output=ip_output; RT_CACHE_STAT_INC(out_slow_tot); - if (dev_out->type == ARPHRD_LOOPBACK) { - rth->rt_dhost = container_of(dev_out, struct nethost, loopback_dev); - } if (flags & RTCF_LOCAL) { rth->u.dst.input = ip_local_deliver; rth->rt_spec_dst = fl->fl4_dst; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_dhost = NULL; rth->rt_spec_dst = fl->fl4_src; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { @@ -2301,14 +2287,13 @@ static inline int __mkroute_output(struc static inline int ip_mkroute_output_def(struct rtable **rp, struct fib_result* res, - struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned 
flags) { struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, host, fl, oldflp, dev_out, flags); + int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { u32 tos = RT_FL_TOS(oldflp); @@ -2323,7 +2308,6 @@ static inline int ip_mkroute_output_def( static inline int ip_mkroute_output(struct rtable** rp, struct fib_result* res, - struct nethost *host, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, @@ -2352,7 +2336,7 @@ static inline int ip_mkroute_output(stru if (hop) ip_rt_put(*rp); - err = __mkroute_output(&rth, res, host, fl, oldflp, + err = __mkroute_output(&rth, res, fl, oldflp, dev2nexthop, flags); if (err != 0) @@ -2378,11 +2362,11 @@ static inline int ip_mkroute_output(stru } return err; } else { - return ip_mkroute_output_def(rp, res, host, fl, oldflp, dev_out, + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); } #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ - return ip_mkroute_output_def(rp, res, host, fl, oldflp, dev_out, flags); + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); #endif } @@ -2390,10 +2374,9 @@ static inline int ip_mkroute_output(stru * Major route resolver routine. */ -static int ip_route_output_slow(struct nethost *host, struct rtable **rp, const struct flowi *oldflp) +static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) { u32 tos = RT_FL_TOS(oldflp); - struct net_device *loopback_dev = host? &host->loopback_dev : &init_host.loopback_dev; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = oldflp->fl4_dst, .saddr = oldflp->fl4_src, @@ -2405,8 +2388,10 @@ static int ip_route_output_slow(struct n .fwmark = oldflp->fl4_fwmark #endif } }, - .iif = loopback_dev->ifindex, + .iif = oldflp->iif ? 
: init_host.loopback_dev.ifindex, .oif = oldflp->oif }; + struct net_device *loopback_dev = NULL; + struct nethost *host = NULL; struct fib_result res; unsigned flags = 0; struct net_device *dev_out = NULL; @@ -2419,6 +2404,14 @@ static int ip_route_output_slow(struct n res.r = NULL; #endif + if (oldflp->iif) { + err = -ENODEV; + loopback_dev = dev_get_by_index(fl.iif); + host = loopback_host(loopback_dev); + if (!host) + goto out; + } + if (oldflp->fl4_src) { err = -EINVAL; if (MULTICAST(oldflp->fl4_src) || @@ -2426,9 +2419,16 @@ static int ip_route_output_slow(struct n ZERONET(oldflp->fl4_src)) goto out; - /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); - if (dev_out == NULL) + /* Loopback addresses must have a host to be used */ + if (!host && LOOPBACK(oldflp->fl4_src)) + goto out; + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL + * In addition we verify the source ip address is for the + * correct host. + */ + if (!inet_confirm_addr(host, NULL, 0, oldflp->fl4_src, + RT_SCOPE_NOWHERE)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2456,12 +2456,13 @@ static int ip_route_output_slow(struct n Luckily, this hack is good workaround. 
*/ + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + fl.oif = dev_out->ifindex; goto make_route; } - if (dev_out) - dev_put(dev_out); - dev_out = NULL; } @@ -2494,11 +2495,14 @@ static int ip_route_output_slow(struct n } if (!fl.fl4_dst) { + if (dev_out) + dev_put(dev_out); + err = -EINVAL; + if (!host) + goto out; fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); dev_out = loopback_dev; dev_hold(dev_out); fl.oif = loopback_dev->ifindex; @@ -2565,9 +2569,19 @@ static int ip_route_output_slow(struct n if (!fl.fl4_src) { if (!host || (dest_host == host)) fl.fl4_src = fl.fl4_dst; - else + else { + if (res.scope >= RT_SCOPE_HOST) + res.scope = RT_SCOPE_LINK; fl.fl4_src = FIB_RES_PREFSRC(host, res); + } } + /* If fl.fl4_src is still 0 it is technically an error. + * However this only occurs if we don't have a valid + * source address we can use. Further protocols like + * DHCP expect 0 to be used as your source address when + * no source address is known. 
So we can't return + * an error here :( + */ if (dev_out) dev_put(dev_out); dev_out = &dest_host->loopback_dev; @@ -2599,17 +2613,20 @@ static int ip_route_output_slow(struct n make_route: - err = ip_mkroute_output(rp, &res, host, &fl, oldflp, dev_out, flags); + err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); if (free_res) fib_res_put(&res); if (dev_out) dev_put(dev_out); -out: return err; +out: + if (loopback_dev) + dev_put(loopback_dev); + return err; } -int __ip_route_output_key(struct nethost *host, struct rtable **rp, const struct flowi *flp) +int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) { unsigned hash; struct rtable *rth; @@ -2621,12 +2638,11 @@ int __ip_route_output_key(struct nethost rth = rcu_dereference(rth->u.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && - rth->fl.iif == 0 && + rth->fl.iif == flp->iif && rth->fl.oif == flp->oif && #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == flp->fl4_fwmark && #endif - rth->rt_shost == host && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { @@ -2652,7 +2668,7 @@ int __ip_route_output_key(struct nethost } rcu_read_unlock_bh(); - return ip_route_output_slow(host, rp, flp); + return ip_route_output_slow(rp, flp); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2665,7 +2681,11 @@ static int __ip_route_output_flow(struct if (sk) host = sk->sk_host; - if ((err = __ip_route_output_key(host, rp, flp)) != 0) + /* Add the host loopback interface to the flow */ + if (host) + flp->iif = host->loopback_dev.ifindex; + + if ((err = __ip_route_output_key(rp, flp)) != 0) return err; if (flp->proto) { @@ -2733,7 +2753,7 @@ static int rt_fill_info(struct sk_buff * RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); } #endif - if (rt->fl.iif) + if (!(rt->rt_flags & RTCF_OUTPUT)) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); else if (rt->rt_src != rt->fl.fl4_src) RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); diff --git a/net/ipv4/tcp_ipv4.c 
b/net/ipv4/tcp_ipv4.c index 68ce4a5..fff3b41 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1239,7 +1239,7 @@ int tcp_v4_rcv(struct sk_buff *skb) rt = (struct rtable*)skb->dst; sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, ntohs(th->dest), - inet_iif(skb), rt->rt_dhost); + inet_iif(skb), rt_dhost(rt)); if (!sk) goto no_tcp_socket; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5ccf867..7c0b7e1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1157,7 +1157,7 @@ int udp_rcv(struct sk_buff *skb) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, - skb->dev->ifindex, rt->rt_dhost); + skb->dev->ifindex, rt_dhost(rt)); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); @@ -1411,7 +1411,7 @@ static struct sock *udp_get_next(struct sk = sk_next(sk); try_again: ; - } while ((sk && sk->sk_family != state->family) || (sk->sk_host != state->host)); + } while (sk && ((sk->sk_family != state->family) || (sk->sk_host != state->host))); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(&udp_hash[state->bucket]); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bc54ffe..af33399 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -31,7 +31,7 @@ static int xfrm4_dst_lookup(struct xfrm_ * Interestingly enough this feels like a way to export a GPL * symbol non-gpl as well. */ - return __ip_route_output_key(NULL, (struct rtable**)dst, fl); + return __ip_route_output_key((struct rtable**)dst, fl); } static struct dst_entry * -- 1.0.GIT