Subject: [PATCH] nethost af_netlink: first working version From: Eric W. Biederman Date: 1133776458 -0700 There are probably a few bugs left, but this version of af_netlink generally works in the nethost context. Some debugging code surrounded by #if 1 blocks has been left in; it needs to be removed before this becomes production code. The biggest challenge in getting this working was figuring out how to deal with the sockets the kernel opens to talk to user space. When the kernel opens these sockets a reader function is passed as well, and that function is called when any data is present on those sockets. The practical problem is that with the naive implementation there is no knowledge of which host to send the replies to. The technique I am using is to give each host its own separate pid space, and user space sockets cannot talk between hosts. For the magic kernel socket, when user space creates a socket I dup the kernel socket to create a kernel socket for that host, which I use only for delivering messages to the kernel. This allows replies to work as long as they are sent back on the socket from which they came. Getting there was tricky, and a few bugs had to be fixed so that the replies were always sent on the socket the skbs came from, but it seems reliable, is easy to maintain, and preserves the existing kernel API, so most of the kernel does not need to change. 
--- net/netlink/af_netlink.c | 168 ++++++++++++++++++++++++++++++++++------------ 1 files changed, 125 insertions(+), 43 deletions(-) b05c613d01047b77810502d58ef48592b21aec1c diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 59219f6..bea05f1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -77,6 +77,7 @@ struct netlink_sock { wait_queue_head_t wait; struct netlink_callback *cb; spinlock_t cb_lock; + struct sock *kern_sk; void (*data_ready)(struct sock *sk, int bytes); struct module *module; }; @@ -110,6 +111,7 @@ struct netlink_table { unsigned int groups; struct module *module; int registered; + void (*data_ready)(struct sock *sk, int bytes); }; static struct netlink_table *nl_table; @@ -118,6 +120,8 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_ static int netlink_dump(struct sock *sk); static void netlink_destroy_callback(struct netlink_callback *cb); +static void netlink_data_ready(struct sock *sk, int len); +static int netlink_release_sock(struct sock *sk); static DEFINE_RWLOCK(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); @@ -206,24 +210,25 @@ static __inline__ struct sock *netlink_l struct sock *sk; struct hlist_node *node; - if (pid == 0) - host = NULL; - /* FIXME how do I modify netlink so when I transmit - * to userspace I always have enough information to - * unique identify the process. - * Do I perhaps need to open new kernel sockets when - * I get a new network namespace? 
- */ - WARN_ON(pid && !host); read_lock(&nl_table_lock); head = nl_pid_hashfn(hash, pid); sk_for_each(sk, node, head) { - if ((!host || (sk->sk_host == host)) && - (nlk_sk(sk)->pid == pid)) { + if ((sk->sk_host == host) && (nlk_sk(sk)->pid == pid)) { sock_hold(sk); goto found; } } +#if 1 + WARN_ON(!host); + if (!host) { + sk_for_each(sk, node, head) { + if (nlk_sk(sk)->pid == pid) { + sock_hold(sk); + goto found; + } + } + } +#endif sk = NULL; found: read_unlock(&nl_table_lock); @@ -362,16 +367,18 @@ static struct proto netlink_proto = { .obj_size = sizeof(struct netlink_sock), }; -static int __netlink_create(struct socket *sock, int protocol) +static struct sock *__netlink_create(struct nethost *host, struct socket *sock, + int protocol) { struct sock *sk; struct netlink_sock *nlk; - sock->ops = &netlink_ops; + if (sock) + sock->ops = &netlink_ops; sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); if (!sk) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sock_init_data(sock, sk); @@ -381,13 +388,78 @@ static int __netlink_create(struct socke sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; - return 0; + sk->sk_host = host; + if (host) + get_host(host); + return sk; +} + +struct sock *__netlink_kernel_create(struct nethost *host, struct socket *sock, + int unit, void (*input)(struct sock *sk, int len)) +{ + struct sock *sk; + int err; + + if (IS_ERR(sk = __netlink_create(host, sock, unit))) + goto out; + + sk->sk_data_ready = netlink_data_ready; + if (input) + nlk_sk(sk)->data_ready = input; + + if ((err = netlink_insert(sk, 0)) < 0) + goto out_release_sock; + +out: + return sk; + +out_release_sock: + netlink_release_sock(sk); + sk = ERR_PTR(err); + goto out; +} + + +static int get_kern_sk(struct netlink_sock *nlk) +{ + struct nethost *host; + struct sock *sk; + void (*data_ready)(struct sock *sk, int bytes); + int protocol; + int err; + + host = nlk->sk.sk_host; + protocol = nlk->sk.sk_protocol; + +retry: + err = 0; + sk = 
netlink_lookup(host, protocol, 0); + if (sk == NULL) { + + netlink_table_grab(); + data_ready = nl_table[protocol].data_ready; + netlink_table_ungrab(); + if (!data_ready) + goto out; + + sk = __netlink_kernel_create(host, NULL, protocol, data_ready); + if (IS_ERR(sk)) { + err = PTR_ERR(sk); + sk = NULL; + } + if (err == -EADDRINUSE) + goto retry; + } +out: + nlk->kern_sk = sk; + return err; } static int netlink_create(struct socket *sock, int protocol) { struct module *module = NULL; struct netlink_sock *nlk; + struct sock *sk; unsigned int groups; int err = 0; @@ -413,32 +485,40 @@ static int netlink_create(struct socket groups = nl_table[protocol].groups; netlink_unlock_table(); +#if 1 err = -EPROTONOSUPPORT; if (!nl_table[protocol].nl_multihost && (current->host != &init_host)) goto out_module; +#endif - if ((err = __netlink_create(sock, protocol) < 0)) + err = -ENOMEM; + if (IS_ERR(sk = __netlink_create(current->host, sock, protocol))) goto out_module; - sock->sk->sk_host = current->host; - get_host(sock->sk->sk_host); - nlk = nlk_sk(sock->sk); + nlk = nlk_sk(sk); nlk->module = module; + + if ((err = get_kern_sk(nlk)) < 0) + goto out_release_sock; + out: return err; +out_release_sock: + netlink_release_sock(sk); out_module: module_put(module); goto out; } -static int netlink_release(struct socket *sock) +static int netlink_release_sock(struct sock *sk) { - struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; + if (sk->sk_socket) + sk->sk_socket->sk = NULL; netlink_remove(sk); nlk = nlk_sk(sk); @@ -450,16 +530,11 @@ static int netlink_release(struct socket nlk->cb = NULL; } spin_unlock(&nlk->cb_lock); - if (sk->sk_host) { - put_host(sk->sk_host); - sk->sk_host = NULL; - } /* OK. 
Socket is unlinked, and, therefore, no new packets will arrive */ sock_orphan(sk); - sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); @@ -484,11 +559,17 @@ static int netlink_release(struct socket kfree(nlk->groups); nlk->groups = NULL; + netlink_release_sock(nlk->kern_sk); sock_put(sk); return 0; } +static int netlink_release(struct socket *sock) +{ + return netlink_release_sock(sock->sk); +} + static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; @@ -864,7 +945,7 @@ static inline int do_one_broadcast(struc if (p->exclude_sk == sk) goto out; - if (p->host && sk->sk_host && p->host != sk->sk_host) + if (p->host && p->host != sk->sk_host) goto out; if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups || @@ -1179,7 +1260,7 @@ static int netlink_sendmsg(struct kiocb if (dst_group) { atomic_inc(&skb->users); - netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); + netlink_host_broadcast(sk, skb, sk->sk_host, dst_pid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); @@ -1272,25 +1353,18 @@ netlink_kernel_create(int unit, unsigned struct sock *sk; struct netlink_sock *nlk; + sk = NULL; + if (!nl_table) - return NULL; + goto out; if (unit<0 || unit>=MAX_LINKS) - return NULL; - - if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) - return NULL; - - if (__netlink_create(sock, unit) < 0) - goto out_sock_release; + goto out; - sk = sock->sk; - sk->sk_host = NULL; - sk->sk_data_ready = netlink_data_ready; - if (input) - nlk_sk(sk)->data_ready = input; + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock) < 0) + goto out; - if (netlink_insert(sk, 0)) + if (IS_ERR(sk = __netlink_kernel_create(NULL, sock, unit, input))) goto out_sock_release; nlk = nlk_sk(sk); @@ -1300,13 +1374,16 @@ netlink_kernel_create(int unit, unsigned nl_table[unit].groups = groups < 32 ? 
32 : groups; nl_table[unit].module = module; nl_table[unit].registered = 1; + nl_table[unit].data_ready = input; netlink_table_ungrab(); +out: return sk; out_sock_release: sock_release(sock); - return NULL; + sk = NULL; + goto out; } void netlink_set_nonroot(int protocol, unsigned int flags) @@ -1315,11 +1392,13 @@ void netlink_set_nonroot(int protocol, u nl_table[protocol].nl_nonroot = flags; } +#if 1 void netlink_set_multihost(int protocol) { if ((unsigned int)protocol < MAX_LINKS) nl_table[protocol].nl_multihost = 1; } +#endif static void netlink_destroy_callback(struct netlink_callback *cb) { @@ -1706,12 +1785,15 @@ core_initcall(netlink_proto_init); EXPORT_SYMBOL(netlink_ack); EXPORT_SYMBOL(netlink_broadcast); +EXPORT_SYMBOL(netlink_host_broadcast); EXPORT_SYMBOL(netlink_dump_start); EXPORT_SYMBOL(netlink_kernel_create); EXPORT_SYMBOL(netlink_register_notifier); EXPORT_SYMBOL(netlink_set_err); EXPORT_SYMBOL(netlink_set_nonroot); +#if 1 EXPORT_SYMBOL(netlink_set_multihost); +#endif EXPORT_SYMBOL(netlink_unicast); EXPORT_SYMBOL(netlink_unregister_notifier); -- 1.0.GIT