GIT 8bf89c20914d1323c80602ba60405b2873a6ef60 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6.18.git

commit 8bf89c20914d1323c80602ba60405b2873a6ef60
Author: Herbert Xu
Date: Sat May 27 23:06:33 2006 -0700

[IPSEC] xfrm: Use IPPROTO_MAX instead of 256

The size of the type_map array (256) comes from the number of IP protocols, i.e., IPPROTO_MAX. This patch is based on a suggestion from Ingo Oeser.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

commit 064a98507dfa570812b4786e105fc74599d8dc6a
Author: Herbert Xu
Date: Sat May 27 23:06:13 2006 -0700

[IPSEC] proto: Move transport mode input path into xfrm_mode_transport

Now that we have xfrm_mode objects we can move the transport mode specific input decapsulation code into xfrm_mode_transport. This removes duplicate code as well as unnecessary header movement in the case of tunnel mode SAs, since we will discard the original IP header immediately. This also fixes a minor bug for transport-mode ESP where the IP payload length is set to the correct value minus the header length (with extension headers for IPv6). Of course the other neat thing is that we no longer have to allocate temporary buffers to hold the IP headers for ESP and IPComp.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

commit 1e2e845def12536acae93aa22fe883ce27e83bb7
Author: Herbert Xu
Date: Sat May 27 23:05:54 2006 -0700

[IPSEC] xfrm: Abstract out encapsulation modes

This patch adds the structure xfrm_mode. It is meant to represent the operations carried out by transport/tunnel modes. By doing this we allow additional encapsulation modes to be added without clogging up the xfrm_input/xfrm_output paths. Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and BEET modes.

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

commit eac6df415e3df902761da60a504dffd586ac1a2b
Author: Herbert Xu
Date: Sat May 27 23:03:58 2006 -0700

[IPSEC] xfrm: Undo afinfo lock proliferation

The number of locks used to manage afinfo structures can easily be reduced down to one each for policy and state respectively. This is based on the observation that the write locks are only held by module insertion/removal, which are very rare events, so there is no need to further differentiate between the insertion of modules like ipv6 versus esp6.

The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look suspicious at first. However, after you realise that nobody ever takes the corresponding write lock you'll feel better :) As far as I can gather it's an attempt to guard against the removal of the corresponding modules. Since neither module can be unloaded at all, we can leave it to whoever fixes up IPv6 unloading :)

Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller

commit 4218cc7d8e53ceb7241e1466303059725404e606
Author: Michael Chan
Date: Fri May 26 17:48:48 2006 -0700

[TG3]: update version and reldate

Update version to 3.59.

Signed-off-by: Michael Chan
Signed-off-by: David S. Miller

commit d4d568d82088ea654db193974d7720ac33cde09c
Author: Michael Chan
Date: Fri May 26 17:48:07 2006 -0700

[TG3]: Add recovery logic when MMIOs are re-ordered

Add recovery logic when we suspect that the system is re-ordering MMIOs. Re-ordered MMIOs to the send mailbox can cause bogus tx completions and hit the BUG_ON() in the tx completion path. tg3 already has logic to handle re-ordered MMIOs by flushing the MMIOs that must be strictly ordered (such as the send mailbox). Determining when to enable the flush is currently a manual process of adding known chipsets to a list.

The new code replaces the BUG_ON() in the tx completion path with a call to tg3_tx_recover(). It will set the TG3_FLAG_MBOX_WRITE_REORDER flag and reset the chip later in the workqueue to recover and start flushing MMIOs to the mailbox. A message reporting the problem will be printed. We will then decide whether or not to add the host bridge to the list of chipsets that do re-ordering. We may add some additional code later to print the host bridge's ID so that the user can report it more easily. The assumption that re-ordering can only happen on x86 systems is also removed.

Signed-off-by: Michael Chan
Signed-off-by: David S. Miller

commit 3015279dd148a564e223b58a1936cbe5932afdd7
Author: Michael Chan
Date: Fri May 26 17:44:45 2006 -0700

[TG3]: Add 5786 PCI ID

Add the PCI ID for BCM5786, which is a variant of the 5787.

Signed-off-by: Michael Chan
Signed-off-by: David S. Miller

commit c77e1e899b8b0456100a671eab9aafca0bac63f9
Author: Samuel Ortiz
Date: Thu May 25 16:21:10 2006 -0700

[IRDA]: ali-ircc: using device model power management

This patch gets rid of the old power management code and now uses the device model for the ali-ircc driver.

Signed-off-by: Samuel Ortiz
Signed-off-by: David S. Miller

commit fd9c940a8904579f7aa11f44bdea00150ae1b98e
Author: Christoph Hellwig
Date: Thu May 25 16:20:19 2006 -0700

[IRDA]: stir4200, switching to the kthread API

stir4200 uses a kernel thread for its TX/RX operations, and it is now converted to the kernel kthread API. Tested on an STIR4200 based dongle.

Signed-off-by: Christoph Hellwig
Signed-off-by: Samuel Ortiz
Signed-off-by: David S. Miller

commit 1da35708d2e16f738bab1b06ab6f6d14384736c1
Author: Samuel Ortiz
Date: Thu May 25 16:19:22 2006 -0700

[IRDA]: Initial support for MCS7780 based dongles

The MosChip MCS7780 chipset is an IrDA USB bridge that doesn't conform to the IrDA-USB standard and thus needs its own driver. Tested on an actual MCS7780 based dongle. Original implementation by Brian Pugh.

Signed-off-by: Samuel Ortiz
Signed-off-by: David S. Miller

commit e0036d210cb7fa19300c9b47faaa1b7815f3413b
Author: David S. Miller
Date: Thu May 25 16:11:14 2006 -0700

[TCP]: tcp_rcv_rtt_measure_ts() call in pure-ACK path is superfluous

We only want to take receive RTT measurements for data-bearing frames; here in the header prediction fast path for a pure sender, we know that we have a pure ACK and thus the checks in tcp_rcv_rtt_measure_ts() will not pass.

Signed-off-by: David S. Miller

commit 640fad4699a3ca56d14bbbb71b66c0a19a88ee95
Author: Stephen Hemminger
Date: Thu May 25 16:00:12 2006 -0700

[BRIDGE]: netlink interface for link management

Add basic netlink support to the Ethernet bridge, including:
* dump interfaces in bridges
* monitor link status changes
* change state of bridge port

For some demo programs see: http://developer.osdl.org/shemminger/prototypes/brnl.tar.gz

These are to allow building a daemon that does alternative implementations of the Spanning Tree Protocol.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit 980c6039c54be1ba3a241348b7fdc99044d8f903
Author: Stephen Hemminger
Date: Thu May 25 15:59:33 2006 -0700

[BRIDGE]: fix module startup error handling

Return "address in use" if some other kernel code has the SAP. Propagate out error codes from netfilter registration and unwind.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit 5f68f45f44b8057a3775bede1d409bba34b0ed85
Author: Stephen Hemminger
Date: Thu May 25 15:58:54 2006 -0700

[BRIDGE]: optimize conditional in forward path

Small optimizations of the bridge forwarding path.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit aa9ca056ca9d72f01196616b0b4452be8830cc76
Author: Stephen Hemminger
Date: Thu May 25 15:10:37 2006 -0700

[LLC]: add multicast support for datagrams

Allow multicast reception of datagrams (similar to UDP). All sockets bound to the same SAP receive a clone.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit 06aa0fc96b0597d41dbbdb2920c6fb455e5af6d9
Author: Stephen Hemminger
Date: Thu May 25 15:10:02 2006 -0700

[LLC]: allow applications to get copy of kernel datagrams

It is legal for an application to bind to a SAP that is also being used by the kernel. This happens if the bridge module binds to the STP SAP and the user wants to have a daemon for STP as well. It is possible to have the kernel doing STP on one bridge, but let an application do RSTP on another bridge.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit b99cb8d1619b0be05ee40391d3eece1cf8ef3564
Author: Stephen Hemminger
Date: Thu May 25 15:09:37 2006 -0700

[LLC]: use rcu_dereference on receive handler

The receive handler pointer might be modified during network protocol changes, so use rcu_dereference (only matters on Alpha).

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit 2a3d8aacc113d0cc36fd9657d20f388b600e6302
Author: Stephen Hemminger
Date: Thu May 25 15:08:59 2006 -0700

[LLC]: allow datagram recvmsg

LLC receive is broken for SOCK_DGRAM. If an application does recv() on a datagram socket and there is no data present, don't return "not connected". Instead, just do normal datagram semantics.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller

commit 058bdb0cf13444e87eeb875b4527f1cdc7c734cf
Author: Stephen Hemminger
Date: Thu May 25 15:08:30 2006 -0700

[LLC]: use more efficient ether address routines

Use the more cache-efficient Ethernet address manipulation functions in etherdevice.h.

Signed-off-by: Stephen Hemminger

commit f9256a57025107ae7191efe69c8f2d4b04078c99
Author: Andrew Morton
Date: Thu May 25 13:26:53 2006 -0700

[I/OAT]: Do not use for_each_cpu().

for_each_cpu() is going away (and is gone in -mm).

Signed-off-by: Andrew Morton
Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit fab2200dc8c97188a4194cf5e8888c78036794c2
Author: Chris Leech
Date: Tue May 23 18:05:53 2006 -0700

[I/OAT]: TCP recv offload to I/OAT

Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 51dc314cc8661b001a435fb553080f58edefc9e0
Author: Chris Leech
Date: Tue May 23 18:02:55 2006 -0700

[I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold

Any socket recv of less than this amount will not be offloaded.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 7577cbcead050b9486e976a7d811d65a6f8ead81
Author: Chris Leech
Date: Tue May 23 18:01:28 2006 -0700

[I/OAT]: Make sk_eat_skb I/OAT aware.

Add an extra argument to sk_eat_skb, and make it move early-copied packets to the async_wait_queue instead of freeing them.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 24df1ec5e3414b1c4928bf68f03fe9b4ec19f5fd
Author: Chris Leech
Date: Tue May 23 18:00:16 2006 -0700

[I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 8d4c3913343ed6c383e549c8fedc58977f3c31c7
Author: Chris Leech
Date: Tue May 23 17:55:33 2006 -0700

[I/OAT]: Structure changes for TCP recv offload to I/OAT

Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 78e00acbdbfb7823d8c8bd87721606c15b38a574
Author: Chris Leech
Date: Tue May 23 17:50:37 2006 -0700

[I/OAT]: Utility functions for offloading sk_buff to iovec copies

Provides for pinning user-space pages in memory, copying to iovecs, and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 7d823651c485e7b7bc0f4130f3e2c4c4704e7981
Author: Chris Leech
Date: Tue May 23 17:45:03 2006 -0700

[I/OAT]: Setup the networking subsystem as a DMA client

Attempts to allocate per-CPU DMA channels.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit 2a7fb9692c1707d3560be3b7794ec1da30881b80
Author: David S. Miller
Date: Tue May 23 17:39:49 2006 -0700

[I/OAT]: Move PCI_DEVICE_ID_INTEL_IOAT to linux/pci_ids.h

Signed-off-by: David S. Miller

commit b5200f9a172620f25057622511684919ce354a28
Author: David S. Miller
Date: Tue May 23 17:37:58 2006 -0700

[I/OAT]: ioatdma.c needs linux/dma-mapping.h

For DMA_*_MASK defines.

Signed-off-by: David S. Miller

commit f719fc9325e3593cb9d29b0ff8e7329833dc9180
Author: Chris Leech
Date: Tue May 23 17:35:34 2006 -0700

[I/OAT]: Driver for the Intel(R) I/OAT DMA engine

Adds a new ioatdma driver.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller

commit e0ab2968fb9758bbb6eacc029efc576f014cf803
Author: Chris Leech
Date: Tue May 23 17:18:44 2006 -0700

[I/OAT]: DMA memcpy subsystem

Provides an API for offloading memory copies to DMA devices.

Signed-off-by: Chris Leech
Signed-off-by: David S. Miller
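Before the diffstat, a short reader's aid: the sketch below illustrates how a kernel-side client is expected to consume the dmaengine API introduced by the [I/OAT] commits above (dma_async_client_register(), dma_async_client_chan_request(), dma_async_memcpy_buf_to_buf(), dma_async_memcpy_issue_pending(), dma_async_memcpy_complete()). It is an illustrative sketch, not part of the patch: the prototypes are assumed to mirror the exported symbols and the ioatdma device operations quoted in the diff, the callback shape and DMA_RESOURCE_* events are taken from the dma_chans_rebalance() calls in drivers/dma/dmaengine.c, and every example_* identifier is hypothetical.

/*
 * Illustrative sketch only -- not part of this patch.
 */
#include <linux/module.h>
#include <linux/string.h>
#include <linux/dmaengine.h>

static struct dma_client *example_client;
static struct dma_chan *example_chan;

/* The core calls this when a channel is handed to (or taken from) the client. */
static void example_event(struct dma_client *client, struct dma_chan *chan,
			  enum dma_event event)
{
	switch (event) {
	case DMA_RESOURCE_ADDED:
		example_chan = chan;
		break;
	case DMA_RESOURCE_REMOVED:
		example_chan = NULL;
		break;
	default:
		break;
	}
}

/* Offload one copy to the engine, falling back to memcpy() without a channel. */
static int example_copy(void *dest, void *src, size_t len)
{
	dma_cookie_t cookie;

	if (!example_chan) {
		memcpy(dest, src, len);
		return 0;
	}

	cookie = dma_async_memcpy_buf_to_buf(example_chan, dest, src, len);
	if (cookie < 0)
		return cookie;
	dma_async_memcpy_issue_pending(example_chan);

	/* Busy-poll for completion; a real user would overlap other work here. */
	while (dma_async_memcpy_complete(example_chan, cookie, NULL, NULL)
			!= DMA_SUCCESS)
		cpu_relax();

	return 0;
}

static int __init example_init(void)
{
	example_client = dma_async_client_register(example_event);
	if (!example_client)
		return -ENOMEM;

	/* Ask the core for one channel; delivery is signalled via example_event(). */
	dma_async_client_chan_request(example_client, 1);
	return 0;
}

static void __exit example_exit(void)
{
	dma_async_client_unregister(example_client);
}

module_init(example_init);
module_exit(example_exit);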
---

Signed-off-by: Andrew Morton
---

 drivers/Kconfig | 2
 drivers/Makefile | 1
 drivers/dma/Kconfig | 34 +
 drivers/dma/Makefile | 3
 drivers/dma/dmaengine.c | 408 ++++++++++++
 drivers/dma/ioatdma.c | 840 ++++++++++++++++++++++++
 drivers/dma/ioatdma.h | 125 +++
 drivers/dma/ioatdma_hw.h | 52 +
 drivers/dma/ioatdma_io.h | 118 +++
 drivers/dma/ioatdma_registers.h | 126 +++
 drivers/dma/iovlock.c | 301 ++++++++
 drivers/net/irda/Kconfig | 15
 drivers/net/irda/Makefile | 1
 drivers/net/irda/ali-ircc.c | 106 +--
 drivers/net/irda/mcs7780.c | 1009 ++++++++++++++++++++++++++++++
 drivers/net/irda/mcs7780.h | 167 ++++
 drivers/net/irda/stir4200.c | 38 -
 drivers/net/tg3.c | 59 +
 drivers/net/tg3.h | 5
 include/linux/dmaengine.h | 359 ++++++++++
 include/linux/netdevice.h | 4
 include/linux/pci_ids.h | 2
 include/linux/skbuff.h | 4
 include/linux/sysctl.h | 1
 include/linux/tcp.h | 8
 include/linux/xfrm.h | 4
 include/net/llc_if.h | 17
 include/net/netdma.h | 44 +
 include/net/sock.h | 15
 include/net/tcp.h | 10
 include/net/xfrm.h | 26
 net/bridge/Makefile | 2
 net/bridge/br.c | 28
 net/bridge/br_forward.c | 12
 net/bridge/br_netlink.c | 199 +++++
 net/bridge/br_notify.c | 2
 net/bridge/br_private.h | 12
 net/bridge/br_stp_if.c | 4
 net/core/Makefile | 1
 net/core/dev.c | 104 +++
 net/core/sock.c | 6
 net/core/user_dma.c | 131 +++
 net/dccp/proto.c | 4
 net/ipv4/Kconfig | 18
 net/ipv4/ah4.c | 15
 net/ipv4/esp4.c | 18
 net/ipv4/ipcomp.c | 23
 net/ipv4/sysctl_net_ipv4.c | 10
 net/ipv4/tcp.c | 117 ++-
 net/ipv4/tcp_input.c | 76 +-
 net/ipv4/tcp_ipv4.c | 18
 net/ipv4/xfrm4_input.c | 28
 net/ipv4/xfrm4_mode_transport.c | 83 ++
 net/ipv4/xfrm4_mode_tunnel.c | 125 +++
 net/ipv4/xfrm4_output.c | 61 -
 net/ipv4/xfrm4_policy.c | 6
 net/ipv4/xfrm4_state.c | 1
 net/ipv6/Kconfig | 20
 net/ipv6/Makefile | 2
 net/ipv6/ah6.c | 10
 net/ipv6/esp6.c | 20
 net/ipv6/ip6_output.c | 2
 net/ipv6/ipcomp6.c | 27
 net/ipv6/tcp_ipv6.c | 12
 net/ipv6/xfrm6_input.c | 29
 net/ipv6/xfrm6_mode_transport.c | 88 ++
 net/ipv6/xfrm6_mode_tunnel.c | 121 +++
 net/ipv6/xfrm6_output.c | 63 -
 net/ipv6/xfrm6_policy.c | 6
 net/ipv6/xfrm6_state.c | 1
 net/llc/af_llc.c | 6
 net/llc/llc_if.c | 2
 net/llc/llc_input.c | 11
 net/llc/llc_sap.c | 59 +
 net/xfrm/xfrm_policy.c | 141 +++-
 net/xfrm/xfrm_state.c | 15
 76 files changed, 5172 insertions(+), 471 deletions(-)

diff -puN /dev/null drivers/dma/dmaengine.c
--- /dev/null 2006-05-29 10:18:53.280907750 -0700
+++ devel-akpm/drivers/dma/dmaengine.c 2006-05-29 15:05:00.000000000 -0700
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem.
+ * It provides a HW-neutral interface for other kernel code to use
+ * asynchronous memory copy capabilities, if present, and allows different
+ * HW DMA drivers to register as providing this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered; it's just set up by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client->lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered. When the
+ * class_device is released, the corresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs. When the channel is freed, the corresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's "bigref,"
+ * with a kref and a per_cpu local_t. A single reference is set on an
+ * ADDED event, and removed with a REMOVE event. Net DMA client takes an
+ * extra reference per outstanding transaction. The release function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_possible_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+	unsigned long count = 0;
+	int i;
+
+	for_each_possible_cpu(i)
+		count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+	return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+	struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+	return sprintf(buf, "%d\n", (chan->client ?
1 : 0)); +} + +static struct class_device_attribute dma_class_attrs[] = { + __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL), + __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL), + __ATTR(in_use, S_IRUGO, show_in_use, NULL), + __ATTR_NULL +}; + +static void dma_async_device_cleanup(struct kref *kref); + +static void dma_class_dev_release(struct class_device *cd) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + kref_put(&chan->device->refcount, dma_async_device_cleanup); +} + +static struct class dma_devclass = { + .name = "dma", + .class_dev_attrs = dma_class_attrs, + .release = dma_class_dev_release, +}; + +/* --- client and device registration --- */ + +/** + * dma_client_chan_alloc - try to allocate a channel to a client + * @client: &dma_client + * + * Called with dma_list_mutex held. + */ +static struct dma_chan *dma_client_chan_alloc(struct dma_client *client) +{ + struct dma_device *device; + struct dma_chan *chan; + unsigned long flags; + int desc; /* allocated descriptor count */ + + /* Find a channel, any DMA engine will do */ + list_for_each_entry(device, &dma_device_list, global_node) { + list_for_each_entry(chan, &device->channels, device_node) { + if (chan->client) + continue; + + desc = chan->device->device_alloc_chan_resources(chan); + if (desc >= 0) { + kref_get(&device->refcount); + kref_init(&chan->refcount); + chan->slow_ref = 0; + INIT_RCU_HEAD(&chan->rcu); + chan->client = client; + spin_lock_irqsave(&client->lock, flags); + list_add_tail_rcu(&chan->client_node, + &client->channels); + spin_unlock_irqrestore(&client->lock, flags); + return chan; + } + } + } + + return NULL; +} + +/** + * dma_client_chan_free - release a DMA channel + * @chan: &dma_chan + */ +void dma_chan_cleanup(struct kref *kref) +{ + struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); + chan->device->device_free_chan_resources(chan); + chan->client = NULL; + kref_put(&chan->device->refcount, dma_async_device_cleanup); +} + +static void dma_chan_free_rcu(struct rcu_head *rcu) +{ + struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu); + int bias = 0x7FFFFFFF; + int i; + for_each_possible_cpu(i) + bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount); + atomic_sub(bias, &chan->refcount.refcount); + kref_put(&chan->refcount, dma_chan_cleanup); +} + +static void dma_client_chan_free(struct dma_chan *chan) +{ + atomic_add(0x7FFFFFFF, &chan->refcount.refcount); + chan->slow_ref = 1; + call_rcu(&chan->rcu, dma_chan_free_rcu); +} + +/** + * dma_chans_rebalance - reallocate channels to clients + * + * When the number of DMA channel in the system changes, + * channels need to be rebalanced among clients + */ +static void dma_chans_rebalance(void) +{ + struct dma_client *client; + struct dma_chan *chan; + unsigned long flags; + + mutex_lock(&dma_list_mutex); + + list_for_each_entry(client, &dma_client_list, global_node) { + while (client->chans_desired > client->chan_count) { + chan = dma_client_chan_alloc(client); + if (!chan) + break; + client->chan_count++; + client->event_callback(client, + chan, + DMA_RESOURCE_ADDED); + } + while (client->chans_desired < client->chan_count) { + spin_lock_irqsave(&client->lock, flags); + chan = list_entry(client->channels.next, + struct dma_chan, + client_node); + list_del_rcu(&chan->client_node); + spin_unlock_irqrestore(&client->lock, flags); + client->chan_count--; + client->event_callback(client, + chan, + DMA_RESOURCE_REMOVED); + dma_client_chan_free(chan); + } + } + + 
mutex_unlock(&dma_list_mutex); +} + +/** + * dma_async_client_register - allocate and register a &dma_client + * @event_callback: callback for notification of channel addition/removal + */ +struct dma_client *dma_async_client_register(dma_event_callback event_callback) +{ + struct dma_client *client; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return NULL; + + INIT_LIST_HEAD(&client->channels); + spin_lock_init(&client->lock); + client->chans_desired = 0; + client->chan_count = 0; + client->event_callback = event_callback; + + mutex_lock(&dma_list_mutex); + list_add_tail(&client->global_node, &dma_client_list); + mutex_unlock(&dma_list_mutex); + + return client; +} + +/** + * dma_async_client_unregister - unregister a client and free the &dma_client + * @client: + * + * Force frees any allocated DMA channels, frees the &dma_client memory + */ +void dma_async_client_unregister(struct dma_client *client) +{ + struct dma_chan *chan; + + if (!client) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(chan, &client->channels, client_node) + dma_client_chan_free(chan); + rcu_read_unlock(); + + mutex_lock(&dma_list_mutex); + list_del(&client->global_node); + mutex_unlock(&dma_list_mutex); + + kfree(client); + dma_chans_rebalance(); +} + +/** + * dma_async_client_chan_request - request DMA channels + * @client: &dma_client + * @number: count of DMA channels requested + * + * Clients call dma_async_client_chan_request() to specify how many + * DMA channels they need, 0 to free all currently allocated. + * The resulting allocations/frees are indicated to the client via the + * event callback. + */ +void dma_async_client_chan_request(struct dma_client *client, + unsigned int number) +{ + client->chans_desired = number; + dma_chans_rebalance(); +} + +/** + * dma_async_device_register - + * @device: &dma_device + */ +int dma_async_device_register(struct dma_device *device) +{ + static int id; + int chancnt = 0; + struct dma_chan* chan; + + if (!device) + return -ENODEV; + + init_completion(&device->done); + kref_init(&device->refcount); + device->dev_id = id++; + + /* represent channels in sysfs. 
Probably want devs too */ + list_for_each_entry(chan, &device->channels, device_node) { + chan->local = alloc_percpu(typeof(*chan->local)); + if (chan->local == NULL) + continue; + + chan->chan_id = chancnt++; + chan->class_dev.class = &dma_devclass; + chan->class_dev.dev = NULL; + snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d", + device->dev_id, chan->chan_id); + + kref_get(&device->refcount); + class_device_register(&chan->class_dev); + } + + mutex_lock(&dma_list_mutex); + list_add_tail(&device->global_node, &dma_device_list); + mutex_unlock(&dma_list_mutex); + + dma_chans_rebalance(); + + return 0; +} + +/** + * dma_async_device_unregister - + * @device: &dma_device + */ +static void dma_async_device_cleanup(struct kref *kref) +{ + struct dma_device *device; + + device = container_of(kref, struct dma_device, refcount); + complete(&device->done); +} + +void dma_async_device_unregister(struct dma_device* device) +{ + struct dma_chan *chan; + unsigned long flags; + + mutex_lock(&dma_list_mutex); + list_del(&device->global_node); + mutex_unlock(&dma_list_mutex); + + list_for_each_entry(chan, &device->channels, device_node) { + if (chan->client) { + spin_lock_irqsave(&chan->client->lock, flags); + list_del(&chan->client_node); + chan->client->chan_count--; + spin_unlock_irqrestore(&chan->client->lock, flags); + chan->client->event_callback(chan->client, + chan, + DMA_RESOURCE_REMOVED); + dma_client_chan_free(chan); + } + class_device_unregister(&chan->class_dev); + } + dma_chans_rebalance(); + + kref_put(&device->refcount, dma_async_device_cleanup); + wait_for_completion(&device->done); +} + +static int __init dma_bus_init(void) +{ + mutex_init(&dma_list_mutex); + return class_register(&dma_devclass); +} + +subsys_initcall(dma_bus_init); + +EXPORT_SYMBOL(dma_async_client_register); +EXPORT_SYMBOL(dma_async_client_unregister); +EXPORT_SYMBOL(dma_async_client_chan_request); +EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); +EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); +EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); +EXPORT_SYMBOL(dma_async_memcpy_complete); +EXPORT_SYMBOL(dma_async_memcpy_issue_pending); +EXPORT_SYMBOL(dma_async_device_register); +EXPORT_SYMBOL(dma_async_device_unregister); +EXPORT_SYMBOL(dma_chan_cleanup); diff -puN /dev/null drivers/dma/ioatdma.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/ioatdma.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,840 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This driver supports an Intel I/OAT DMA engine, which does asynchronous + * copy operations. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "ioatdma.h" +#include "ioatdma_io.h" +#include "ioatdma_registers.h" +#include "ioatdma_hw.h" + +#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common) +#define to_ioat_device(dev) container_of(dev, struct ioat_device, common) +#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) + +/* internal functions */ +static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void __devexit ioat_remove(struct pci_dev *pdev); + +static int enumerate_dma_channels(struct ioat_device *device) +{ + u8 xfercap_scale; + u32 xfercap; + int i; + struct ioat_dma_chan *ioat_chan; + + device->common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); + xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); + + for (i = 0; i < device->common.chancnt; i++) { + ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL); + if (!ioat_chan) { + device->common.chancnt = i; + break; + } + + ioat_chan->device = device; + ioat_chan->reg_base = device->reg_base + (0x80 * (i + 1)); + ioat_chan->xfercap = xfercap; + spin_lock_init(&ioat_chan->cleanup_lock); + spin_lock_init(&ioat_chan->desc_lock); + INIT_LIST_HEAD(&ioat_chan->free_desc); + INIT_LIST_HEAD(&ioat_chan->used_desc); + /* This should be made common somewhere in dmaengine.c */ + ioat_chan->common.device = &device->common; + ioat_chan->common.client = NULL; + list_add_tail(&ioat_chan->common.device_node, + &device->common.channels); + } + return device->common.chancnt; +} + +static struct ioat_desc_sw *ioat_dma_alloc_descriptor( + struct ioat_dma_chan *ioat_chan, + int flags) +{ + struct ioat_dma_descriptor *desc; + struct ioat_desc_sw *desc_sw; + struct ioat_device *ioat_device; + dma_addr_t phys; + + ioat_device = to_ioat_device(ioat_chan->common.device); + desc = pci_pool_alloc(ioat_device->dma_pool, flags, &phys); + if (unlikely(!desc)) + return NULL; + + desc_sw = kzalloc(sizeof(*desc_sw), flags); + if (unlikely(!desc_sw)) { + pci_pool_free(ioat_device->dma_pool, desc, phys); + return NULL; + } + + memset(desc, 0, sizeof(*desc)); + desc_sw->hw = desc; + desc_sw->phys = phys; + + return desc_sw; +} + +#define INITIAL_IOAT_DESC_COUNT 128 + +static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan); + +/* returns the actual number of allocated descriptors */ +static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_desc_sw *desc = NULL; + u16 chanctrl; + u32 chanerr; + int i; + LIST_HEAD(tmp_list); + + /* + * In-use bit automatically set by reading chanctrl + * If 0, we got it, if 1, someone else did + */ + chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE) + return -EBUSY; + + /* Setup register to interrupt and write completion status on error */ + chanctrl = IOAT_CHANCTRL_CHANNEL_IN_USE | + IOAT_CHANCTRL_ERR_INT_EN | + IOAT_CHANCTRL_ANY_ERR_ABORT_EN | + IOAT_CHANCTRL_ERR_COMPLETION_EN; + ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); + + chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); + if (chanerr) { + printk("IOAT: CHANERR = %x, clearing\n", chanerr); + ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); + } + + /* Allocate descriptors */ + for (i = 0; i < INITIAL_IOAT_DESC_COUNT; i++) { + desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL); + 
if (!desc) { + printk(KERN_ERR "IOAT: Only %d initial descriptors\n", i); + break; + } + list_add_tail(&desc->node, &tmp_list); + } + spin_lock_bh(&ioat_chan->desc_lock); + list_splice(&tmp_list, &ioat_chan->free_desc); + spin_unlock_bh(&ioat_chan->desc_lock); + + /* allocate a completion writeback area */ + /* doing 2 32bit writes to mmio since 1 64b write doesn't work */ + ioat_chan->completion_virt = + pci_pool_alloc(ioat_chan->device->completion_pool, + GFP_KERNEL, + &ioat_chan->completion_addr); + memset(ioat_chan->completion_virt, 0, + sizeof(*ioat_chan->completion_virt)); + ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, + ((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF); + ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, + ((u64) ioat_chan->completion_addr) >> 32); + + ioat_start_null_desc(ioat_chan); + return i; +} + +static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan); + +static void ioat_dma_free_chan_resources(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_device *ioat_device = to_ioat_device(chan->device); + struct ioat_desc_sw *desc, *_desc; + u16 chanctrl; + int in_use_descs = 0; + + ioat_dma_memcpy_cleanup(ioat_chan); + + ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); + + spin_lock_bh(&ioat_chan->desc_lock); + list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) { + in_use_descs++; + list_del(&desc->node); + pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); + kfree(desc); + } + list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) { + list_del(&desc->node); + pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); + kfree(desc); + } + spin_unlock_bh(&ioat_chan->desc_lock); + + pci_pool_free(ioat_device->completion_pool, + ioat_chan->completion_virt, + ioat_chan->completion_addr); + + /* one is ok since we left it on there on purpose */ + if (in_use_descs > 1) + printk(KERN_ERR "IOAT: Freeing %d in use descriptors!\n", + in_use_descs - 1); + + ioat_chan->last_completion = ioat_chan->completion_addr = 0; + + /* Tell hw the chan is free */ + chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE; + ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); +} + +/** + * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction + * @chan: IOAT DMA channel handle + * @dest: DMA destination address + * @src: DMA source address + * @len: transaction length in bytes + */ + +static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, + dma_addr_t dest, + dma_addr_t src, + size_t len) +{ + struct ioat_desc_sw *first; + struct ioat_desc_sw *prev; + struct ioat_desc_sw *new; + dma_cookie_t cookie; + LIST_HEAD(new_chain); + u32 copy; + size_t orig_len; + dma_addr_t orig_src, orig_dst; + unsigned int desc_count = 0; + unsigned int append = 0; + + if (!ioat_chan || !dest || !src) + return -EFAULT; + + if (!len) + return ioat_chan->common.cookie; + + orig_len = len; + orig_src = src; + orig_dst = dest; + + first = NULL; + prev = NULL; + + spin_lock_bh(&ioat_chan->desc_lock); + + while (len) { + if (!list_empty(&ioat_chan->free_desc)) { + new = to_ioat_desc(ioat_chan->free_desc.next); + list_del(&new->node); + } else { + /* try to get another desc */ + new = ioat_dma_alloc_descriptor(ioat_chan, GFP_ATOMIC); + /* will this ever happen? 
*/ + /* TODO add upper limit on these */ + BUG_ON(!new); + } + + copy = min((u32) len, ioat_chan->xfercap); + + new->hw->size = copy; + new->hw->ctl = 0; + new->hw->src_addr = src; + new->hw->dst_addr = dest; + new->cookie = 0; + + /* chain together the physical address list for the HW */ + if (!first) + first = new; + else + prev->hw->next = (u64) new->phys; + + prev = new; + + len -= copy; + dest += copy; + src += copy; + + list_add_tail(&new->node, &new_chain); + desc_count++; + } + new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS; + new->hw->next = 0; + + /* cookie incr and addition to used_list must be atomic */ + + cookie = ioat_chan->common.cookie; + cookie++; + if (cookie < 0) + cookie = 1; + ioat_chan->common.cookie = new->cookie = cookie; + + pci_unmap_addr_set(new, src, orig_src); + pci_unmap_addr_set(new, dst, orig_dst); + pci_unmap_len_set(new, src_len, orig_len); + pci_unmap_len_set(new, dst_len, orig_len); + + /* write address into NextDescriptor field of last desc in chain */ + to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys; + list_splice_init(&new_chain, ioat_chan->used_desc.prev); + + ioat_chan->pending += desc_count; + if (ioat_chan->pending >= 20) { + append = 1; + ioat_chan->pending = 0; + } + + spin_unlock_bh(&ioat_chan->desc_lock); + + if (append) + ioatdma_chan_write8(ioat_chan, + IOAT_CHANCMD_OFFSET, + IOAT_CHANCMD_APPEND); + return cookie; +} + +/** + * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs + * @chan: IOAT DMA channel handle + * @dest: DMA destination address + * @src: DMA source address + * @len: transaction length in bytes + */ + +static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan, + void *dest, + void *src, + size_t len) +{ + dma_addr_t dest_addr; + dma_addr_t src_addr; + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + dest_addr = pci_map_single(ioat_chan->device->pdev, + dest, len, PCI_DMA_FROMDEVICE); + src_addr = pci_map_single(ioat_chan->device->pdev, + src, len, PCI_DMA_TODEVICE); + + return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); +} + +/** + * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page + * @chan: IOAT DMA channel handle + * @page: pointer to the page to copy to + * @offset: offset into that page + * @src: DMA source address + * @len: transaction length in bytes + */ + +static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan, + struct page *page, + unsigned int offset, + void *src, + size_t len) +{ + dma_addr_t dest_addr; + dma_addr_t src_addr; + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + dest_addr = pci_map_page(ioat_chan->device->pdev, + page, offset, len, PCI_DMA_FROMDEVICE); + src_addr = pci_map_single(ioat_chan->device->pdev, + src, len, PCI_DMA_TODEVICE); + + return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); +} + +/** + * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages + * @chan: IOAT DMA channel handle + * @dest_pg: pointer to the page to copy to + * @dest_off: offset into that page + * @src_pg: pointer to the page to copy from + * @src_off: offset into that page + * @len: transaction length in bytes. This is guaranteed to not make a copy + * across a page boundary. 
+ */ + +static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan, + struct page *dest_pg, + unsigned int dest_off, + struct page *src_pg, + unsigned int src_off, + size_t len) +{ + dma_addr_t dest_addr; + dma_addr_t src_addr; + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + dest_addr = pci_map_page(ioat_chan->device->pdev, + dest_pg, dest_off, len, PCI_DMA_FROMDEVICE); + src_addr = pci_map_page(ioat_chan->device->pdev, + src_pg, src_off, len, PCI_DMA_TODEVICE); + + return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); +} + +/** + * ioat_dma_memcpy_issue_pending - push potentially unrecognoized appended descriptors to hw + * @chan: DMA channel handle + */ + +static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + + if (ioat_chan->pending != 0) { + ioat_chan->pending = 0; + ioatdma_chan_write8(ioat_chan, + IOAT_CHANCMD_OFFSET, + IOAT_CHANCMD_APPEND); + } +} + +static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan) +{ + unsigned long phys_complete; + struct ioat_desc_sw *desc, *_desc; + dma_cookie_t cookie = 0; + + prefetch(chan->completion_virt); + + if (!spin_trylock(&chan->cleanup_lock)) + return; + + /* The completion writeback can happen at any time, + so reads by the driver need to be atomic operations + The descriptor physical addresses are limited to 32-bits + when the CPU can only do a 32-bit mov */ + +#if (BITS_PER_LONG == 64) + phys_complete = + chan->completion_virt->full & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR; +#else + phys_complete = chan->completion_virt->low & IOAT_LOW_COMPLETION_MASK; +#endif + + if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) == + IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) { + printk("IOAT: Channel halted, chanerr = %x\n", + ioatdma_chan_read32(chan, IOAT_CHANERR_OFFSET)); + + /* TODO do something to salvage the situation */ + } + + if (phys_complete == chan->last_completion) { + spin_unlock(&chan->cleanup_lock); + return; + } + + spin_lock_bh(&chan->desc_lock); + list_for_each_entry_safe(desc, _desc, &chan->used_desc, node) { + + /* + * Incoming DMA requests may use multiple descriptors, due to + * exceeding xfercap, perhaps. If so, only the last one will + * have a cookie, and require unmapping. + */ + if (desc->cookie) { + cookie = desc->cookie; + + /* yes we are unmapping both _page and _single alloc'd + regions with unmap_page. Is this *really* that bad? + */ + pci_unmap_page(chan->device->pdev, + pci_unmap_addr(desc, dst), + pci_unmap_len(desc, dst_len), + PCI_DMA_FROMDEVICE); + pci_unmap_page(chan->device->pdev, + pci_unmap_addr(desc, src), + pci_unmap_len(desc, src_len), + PCI_DMA_TODEVICE); + } + + if (desc->phys != phys_complete) { + /* a completed entry, but not the last, so cleanup */ + list_del(&desc->node); + list_add_tail(&desc->node, &chan->free_desc); + } else { + /* last used desc. Do not remove, so we can append from + it, but don't look at it next time, either */ + desc->cookie = 0; + + /* TODO check status bits? 
*/ + break; + } + } + + spin_unlock_bh(&chan->desc_lock); + + chan->last_completion = phys_complete; + if (cookie != 0) + chan->completed_cookie = cookie; + + spin_unlock(&chan->cleanup_lock); +} + +/** + * ioat_dma_is_complete - poll the status of a IOAT DMA transaction + * @chan: IOAT DMA channel handle + * @cookie: DMA transaction identifier + */ + +static enum dma_status ioat_dma_is_complete(struct dma_chan *chan, + dma_cookie_t cookie, + dma_cookie_t *done, + dma_cookie_t *used) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + enum dma_status ret; + + last_used = chan->cookie; + last_complete = ioat_chan->completed_cookie; + + if (done) + *done= last_complete; + if (used) + *used = last_used; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret == DMA_SUCCESS) + return ret; + + ioat_dma_memcpy_cleanup(ioat_chan); + + last_used = chan->cookie; + last_complete = ioat_chan->completed_cookie; + + if (done) + *done= last_complete; + if (used) + *used = last_used; + + return dma_async_is_complete(cookie, last_complete, last_used); +} + +/* PCI API */ + +static struct pci_device_id ioat_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, + { 0, } +}; + +static struct pci_driver ioat_pci_drv = { + .name = "ioatdma", + .id_table = ioat_pci_tbl, + .probe = ioat_probe, + .remove = __devexit_p(ioat_remove), +}; + +static irqreturn_t ioat_do_interrupt(int irq, void *data, struct pt_regs *regs) +{ + struct ioat_device *instance = data; + unsigned long attnstatus; + u8 intrctrl; + + intrctrl = ioatdma_read8(instance, IOAT_INTRCTRL_OFFSET); + + if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN)) + return IRQ_NONE; + + if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) { + ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); + return IRQ_NONE; + } + + attnstatus = ioatdma_read32(instance, IOAT_ATTNSTATUS_OFFSET); + + printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus); + + ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); + return IRQ_HANDLED; +} + +static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan) +{ + struct ioat_desc_sw *desc; + + spin_lock_bh(&ioat_chan->desc_lock); + + if (!list_empty(&ioat_chan->free_desc)) { + desc = to_ioat_desc(ioat_chan->free_desc.next); + list_del(&desc->node); + } else { + /* try to get another desc */ + spin_unlock_bh(&ioat_chan->desc_lock); + desc = ioat_dma_alloc_descriptor(ioat_chan, GFP_KERNEL); + spin_lock_bh(&ioat_chan->desc_lock); + /* will this ever happen? */ + BUG_ON(!desc); + } + + desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL; + desc->hw->next = 0; + + list_add_tail(&desc->node, &ioat_chan->used_desc); + spin_unlock_bh(&ioat_chan->desc_lock); + +#if (BITS_PER_LONG == 64) + ioatdma_chan_write64(ioat_chan, IOAT_CHAINADDR_OFFSET, desc->phys); +#else + ioatdma_chan_write32(ioat_chan, + IOAT_CHAINADDR_OFFSET_LOW, + (u32) desc->phys); + ioatdma_chan_write32(ioat_chan, IOAT_CHAINADDR_OFFSET_HIGH, 0); +#endif + ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_START); +} + +/* + * Perform a IOAT transaction to verify the HW works. 
+ */ +#define IOAT_TEST_SIZE 2000 + +static int ioat_self_test(struct ioat_device *device) +{ + int i; + u8 *src; + u8 *dest; + struct dma_chan *dma_chan; + dma_cookie_t cookie; + int err = 0; + + src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL); + if (!src) + return -ENOMEM; + dest = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, SLAB_KERNEL); + if (!dest) { + kfree(src); + return -ENOMEM; + } + + /* Fill in src buffer */ + for (i = 0; i < IOAT_TEST_SIZE; i++) + src[i] = (u8)i; + + /* Start copy, using first DMA channel */ + dma_chan = container_of(device->common.channels.next, + struct dma_chan, + device_node); + if (ioat_dma_alloc_chan_resources(dma_chan) < 1) { + err = -ENODEV; + goto out; + } + + cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE); + ioat_dma_memcpy_issue_pending(dma_chan); + msleep(1); + + if (ioat_dma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + printk(KERN_ERR "ioatdma: Self-test copy timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + if (memcmp(src, dest, IOAT_TEST_SIZE)) { + printk(KERN_ERR "ioatdma: Self-test copy failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + +free_resources: + ioat_dma_free_chan_resources(dma_chan); +out: + kfree(src); + kfree(dest); + return err; +} + +static int __devinit ioat_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int err; + unsigned long mmio_start, mmio_len; + void *reg_base; + struct ioat_device *device; + + err = pci_enable_device(pdev); + if (err) + goto err_enable_device; + + err = pci_set_dma_mask(pdev, DMA_64BIT_MASK); + if (err) + err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (err) + goto err_set_dma_mask; + + err = pci_request_regions(pdev, ioat_pci_drv.name); + if (err) + goto err_request_regions; + + mmio_start = pci_resource_start(pdev, 0); + mmio_len = pci_resource_len(pdev, 0); + + reg_base = ioremap(mmio_start, mmio_len); + if (!reg_base) { + err = -ENOMEM; + goto err_ioremap; + } + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + err = -ENOMEM; + goto err_kzalloc; + } + + /* DMA coherent memory pool for DMA descriptor allocations */ + device->dma_pool = pci_pool_create("dma_desc_pool", pdev, + sizeof(struct ioat_dma_descriptor), 64, 0); + if (!device->dma_pool) { + err = -ENOMEM; + goto err_dma_pool; + } + + device->completion_pool = pci_pool_create("completion_pool", pdev, sizeof(u64), SMP_CACHE_BYTES, SMP_CACHE_BYTES); + if (!device->completion_pool) { + err = -ENOMEM; + goto err_completion_pool; + } + + device->pdev = pdev; + pci_set_drvdata(pdev, device); +#ifdef CONFIG_PCI_MSI + if (pci_enable_msi(pdev) == 0) { + device->msi = 1; + } else { + device->msi = 0; + } +#endif + err = request_irq(pdev->irq, &ioat_do_interrupt, SA_SHIRQ, "ioat", + device); + if (err) + goto err_irq; + + device->reg_base = reg_base; + + ioatdma_write8(device, IOAT_INTRCTRL_OFFSET, IOAT_INTRCTRL_MASTER_INT_EN); + pci_set_master(pdev); + + INIT_LIST_HEAD(&device->common.channels); + enumerate_dma_channels(device); + + device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources; + device->common.device_free_chan_resources = ioat_dma_free_chan_resources; + device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf; + device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg; + device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg; + device->common.device_memcpy_complete = ioat_dma_is_complete; + device->common.device_memcpy_issue_pending = 
ioat_dma_memcpy_issue_pending; + printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n", + device->common.chancnt); + + err = ioat_self_test(device); + if (err) + goto err_self_test; + + dma_async_device_register(&device->common); + + return 0; + +err_self_test: +err_irq: + pci_pool_destroy(device->completion_pool); +err_completion_pool: + pci_pool_destroy(device->dma_pool); +err_dma_pool: + kfree(device); +err_kzalloc: + iounmap(reg_base); +err_ioremap: + pci_release_regions(pdev); +err_request_regions: +err_set_dma_mask: + pci_disable_device(pdev); +err_enable_device: + return err; +} + +static void __devexit ioat_remove(struct pci_dev *pdev) +{ + struct ioat_device *device; + struct dma_chan *chan, *_chan; + struct ioat_dma_chan *ioat_chan; + + device = pci_get_drvdata(pdev); + dma_async_device_unregister(&device->common); + + free_irq(device->pdev->irq, device); +#ifdef CONFIG_PCI_MSI + if (device->msi) + pci_disable_msi(device->pdev); +#endif + pci_pool_destroy(device->dma_pool); + pci_pool_destroy(device->completion_pool); + iounmap(device->reg_base); + pci_release_regions(pdev); + pci_disable_device(pdev); + list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) { + ioat_chan = to_ioat_chan(chan); + list_del(&chan->device_node); + kfree(ioat_chan); + } + kfree(device); +} + +/* MODULE API */ +MODULE_VERSION("1.7"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Intel Corporation"); + +static int __init ioat_init_module(void) +{ + /* it's currently unsafe to unload this module */ + /* if forced, worst case is that rmmod hangs */ + if (THIS_MODULE != NULL) + THIS_MODULE->unsafe = 1; + + return pci_module_init(&ioat_pci_drv); +} + +module_init(ioat_init_module); + +static void __exit ioat_exit_module(void) +{ + pci_unregister_driver(&ioat_pci_drv); +} + +module_exit(ioat_exit_module); diff -puN /dev/null drivers/dma/ioatdma.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/ioatdma.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,125 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#ifndef IOATDMA_H +#define IOATDMA_H + +#include +#include "ioatdma_hw.h" +#include +#include +#include +#include + +#define IOAT_LOW_COMPLETION_MASK 0xffffffc0 + +extern struct list_head dma_device_list; +extern struct list_head dma_client_list; + +/** + * struct ioat_device - internal representation of a IOAT device + * @pdev: PCI-Express device + * @reg_base: MMIO register space base address + * @dma_pool: for allocating DMA descriptors + * @common: embedded struct dma_device + * @msi: Message Signaled Interrupt number + */ + +struct ioat_device { + struct pci_dev *pdev; + void *reg_base; + struct pci_pool *dma_pool; + struct pci_pool *completion_pool; + + struct dma_device common; + u8 msi; +}; + +/** + * struct ioat_dma_chan - internal representation of a DMA channel + * @device: + * @reg_base: + * @sw_in_use: + * @completion: + * @completion_low: + * @completion_high: + * @completed_cookie: last cookie seen completed on cleanup + * @cookie: value of last cookie given to client + * @last_completion: + * @xfercap: + * @desc_lock: + * @free_desc: + * @used_desc: + * @resource: + * @device_node: + */ + +struct ioat_dma_chan { + + void *reg_base; + + dma_cookie_t completed_cookie; + unsigned long last_completion; + + u32 xfercap; /* XFERCAP register value expanded out */ + + spinlock_t cleanup_lock; + spinlock_t desc_lock; + struct list_head free_desc; + struct list_head used_desc; + + int pending; + + struct ioat_device *device; + struct dma_chan common; + + dma_addr_t completion_addr; + union { + u64 full; /* HW completion writeback */ + struct { + u32 low; + u32 high; + }; + } *completion_virt; +}; + +/* wrapper around hardware descriptor format + additional software fields */ + +/** + * struct ioat_desc_sw - wrapper around hardware descriptor + * @hw: hardware DMA descriptor + * @node: + * @cookie: + * @phys: + */ + +struct ioat_desc_sw { + struct ioat_dma_descriptor *hw; + struct list_head node; + dma_cookie_t cookie; + dma_addr_t phys; + DECLARE_PCI_UNMAP_ADDR(src) + DECLARE_PCI_UNMAP_LEN(src_len) + DECLARE_PCI_UNMAP_ADDR(dst) + DECLARE_PCI_UNMAP_LEN(dst_len) +}; + +#endif /* IOATDMA_H */ + diff -puN /dev/null drivers/dma/ioatdma_hw.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/ioatdma_hw.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,52 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#ifndef _IOAT_HW_H_ +#define _IOAT_HW_H_ + +/* PCI Configuration Space Values */ +#define IOAT_PCI_VID 0x8086 +#define IOAT_PCI_DID 0x1A38 +#define IOAT_PCI_RID 0x00 +#define IOAT_PCI_SVID 0x8086 +#define IOAT_PCI_SID 0x8086 +#define IOAT_VER 0x12 /* Version 1.2 */ + +struct ioat_dma_descriptor { + uint32_t size; + uint32_t ctl; + uint64_t src_addr; + uint64_t dst_addr; + uint64_t next; + uint64_t rsv1; + uint64_t rsv2; + uint64_t user1; + uint64_t user2; +}; + +#define IOAT_DMA_DESCRIPTOR_CTL_INT_GN 0x00000001 +#define IOAT_DMA_DESCRIPTOR_CTL_SRC_SN 0x00000002 +#define IOAT_DMA_DESCRIPTOR_CTL_DST_SN 0x00000004 +#define IOAT_DMA_DESCRIPTOR_CTL_CP_STS 0x00000008 +#define IOAT_DMA_DESCRIPTOR_CTL_FRAME 0x00000010 +#define IOAT_DMA_DESCRIPTOR_NUL 0x00000020 +#define IOAT_DMA_DESCRIPTOR_OPCODE 0xFF000000 + +#endif diff -puN /dev/null drivers/dma/ioatdma_io.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/ioatdma_io.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,118 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#ifndef IOATDMA_IO_H +#define IOATDMA_IO_H + +#include + +/* + * device and per-channel MMIO register read and write functions + * this is a lot of anoying inline functions, but it's typesafe + */ + +static inline u8 ioatdma_read8(struct ioat_device *device, + unsigned int offset) +{ + return readb(device->reg_base + offset); +} + +static inline u16 ioatdma_read16(struct ioat_device *device, + unsigned int offset) +{ + return readw(device->reg_base + offset); +} + +static inline u32 ioatdma_read32(struct ioat_device *device, + unsigned int offset) +{ + return readl(device->reg_base + offset); +} + +static inline void ioatdma_write8(struct ioat_device *device, + unsigned int offset, u8 value) +{ + writeb(value, device->reg_base + offset); +} + +static inline void ioatdma_write16(struct ioat_device *device, + unsigned int offset, u16 value) +{ + writew(value, device->reg_base + offset); +} + +static inline void ioatdma_write32(struct ioat_device *device, + unsigned int offset, u32 value) +{ + writel(value, device->reg_base + offset); +} + +static inline u8 ioatdma_chan_read8(struct ioat_dma_chan *chan, + unsigned int offset) +{ + return readb(chan->reg_base + offset); +} + +static inline u16 ioatdma_chan_read16(struct ioat_dma_chan *chan, + unsigned int offset) +{ + return readw(chan->reg_base + offset); +} + +static inline u32 ioatdma_chan_read32(struct ioat_dma_chan *chan, + unsigned int offset) +{ + return readl(chan->reg_base + offset); +} + +static inline void ioatdma_chan_write8(struct ioat_dma_chan *chan, + unsigned int offset, u8 value) +{ + writeb(value, chan->reg_base + offset); +} + +static inline void ioatdma_chan_write16(struct ioat_dma_chan *chan, + unsigned int offset, u16 value) +{ + writew(value, chan->reg_base + offset); +} + +static inline void ioatdma_chan_write32(struct ioat_dma_chan *chan, + unsigned int offset, u32 value) +{ + writel(value, chan->reg_base + offset); +} + +#if (BITS_PER_LONG == 64) +static inline u64 ioatdma_chan_read64(struct ioat_dma_chan *chan, + unsigned int offset) +{ + return readq(chan->reg_base + offset); +} + +static inline void ioatdma_chan_write64(struct ioat_dma_chan *chan, + unsigned int offset, u64 value) +{ + writeq(value, chan->reg_base + offset); +} +#endif + +#endif /* IOATDMA_IO_H */ + diff -puN /dev/null drivers/dma/ioatdma_registers.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/ioatdma_registers.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,126 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#ifndef _IOAT_REGISTERS_H_ +#define _IOAT_REGISTERS_H_ + + +/* MMIO Device Registers */ +#define IOAT_CHANCNT_OFFSET 0x00 /* 8-bit */ + +#define IOAT_XFERCAP_OFFSET 0x01 /* 8-bit */ +#define IOAT_XFERCAP_4KB 12 +#define IOAT_XFERCAP_8KB 13 +#define IOAT_XFERCAP_16KB 14 +#define IOAT_XFERCAP_32KB 15 +#define IOAT_XFERCAP_32GB 0 + +#define IOAT_GENCTRL_OFFSET 0x02 /* 8-bit */ +#define IOAT_GENCTRL_DEBUG_EN 0x01 + +#define IOAT_INTRCTRL_OFFSET 0x03 /* 8-bit */ +#define IOAT_INTRCTRL_MASTER_INT_EN 0x01 /* Master Interrupt Enable */ +#define IOAT_INTRCTRL_INT_STATUS 0x02 /* ATTNSTATUS -or- Channel Int */ +#define IOAT_INTRCTRL_INT 0x04 /* INT_STATUS -and- MASTER_INT_EN */ + +#define IOAT_ATTNSTATUS_OFFSET 0x04 /* Each bit is a channel */ + +#define IOAT_VER_OFFSET 0x08 /* 8-bit */ +#define IOAT_VER_MAJOR_MASK 0xF0 +#define IOAT_VER_MINOR_MASK 0x0F +#define GET_IOAT_VER_MAJOR(x) ((x) & IOAT_VER_MAJOR_MASK) +#define GET_IOAT_VER_MINOR(x) ((x) & IOAT_VER_MINOR_MASK) + +#define IOAT_PERPORTOFFSET_OFFSET 0x0A /* 16-bit */ + +#define IOAT_INTRDELAY_OFFSET 0x0C /* 16-bit */ +#define IOAT_INTRDELAY_INT_DELAY_MASK 0x3FFF /* Interrupt Delay Time */ +#define IOAT_INTRDELAY_COALESE_SUPPORT 0x8000 /* Interrupt Coalesing Supported */ + +#define IOAT_DEVICE_STATUS_OFFSET 0x0E /* 16-bit */ +#define IOAT_DEVICE_STATUS_DEGRADED_MODE 0x0001 + + +#define IOAT_CHANNEL_MMIO_SIZE 0x80 /* Each Channel MMIO space is this size */ + +/* DMA Channel Registers */ +#define IOAT_CHANCTRL_OFFSET 0x00 /* 16-bit Channel Control Register */ +#define IOAT_CHANCTRL_CHANNEL_PRIORITY_MASK 0xF000 +#define IOAT_CHANCTRL_CHANNEL_IN_USE 0x0100 +#define IOAT_CHANCTRL_DESCRIPTOR_ADDR_SNOOP_CONTROL 0x0020 +#define IOAT_CHANCTRL_ERR_INT_EN 0x0010 +#define IOAT_CHANCTRL_ANY_ERR_ABORT_EN 0x0008 +#define IOAT_CHANCTRL_ERR_COMPLETION_EN 0x0004 +#define IOAT_CHANCTRL_INT_DISABLE 0x0001 + +#define IOAT_DMA_COMP_OFFSET 0x02 /* 16-bit DMA channel compatability */ +#define IOAT_DMA_COMP_V1 0x0001 /* Compatability with DMA version 1 */ + +#define IOAT_CHANSTS_OFFSET 0x04 /* 64-bit Channel Status Register */ +#define IOAT_CHANSTS_OFFSET_LOW 0x04 +#define IOAT_CHANSTS_OFFSET_HIGH 0x08 +#define IOAT_CHANSTS_COMPLETED_DESCRIPTOR_ADDR 0xFFFFFFFFFFFFFFC0 +#define IOAT_CHANSTS_SOFT_ERR 0x0000000000000010 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS 0x0000000000000007 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE 0x0 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE 0x1 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_SUSPENDED 0x2 +#define IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED 0x3 + +#define IOAT_CHAINADDR_OFFSET 0x0C /* 64-bit Descriptor Chain Address Register */ +#define IOAT_CHAINADDR_OFFSET_LOW 0x0C +#define IOAT_CHAINADDR_OFFSET_HIGH 0x10 + +#define IOAT_CHANCMD_OFFSET 0x14 /* 8-bit DMA Channel Command Register */ +#define IOAT_CHANCMD_RESET 0x20 +#define IOAT_CHANCMD_RESUME 0x10 +#define IOAT_CHANCMD_ABORT 0x08 +#define IOAT_CHANCMD_SUSPEND 0x04 +#define IOAT_CHANCMD_APPEND 0x02 +#define IOAT_CHANCMD_START 0x01 + +#define IOAT_CHANCMP_OFFSET 0x18 /* 64-bit Channel Completion Address Register */ +#define IOAT_CHANCMP_OFFSET_LOW 0x18 +#define IOAT_CHANCMP_OFFSET_HIGH 0x1C + +#define IOAT_CDAR_OFFSET 0x20 /* 64-bit Current Descriptor Address Register */ +#define IOAT_CDAR_OFFSET_LOW 0x20 +#define IOAT_CDAR_OFFSET_HIGH 0x24 + +#define IOAT_CHANERR_OFFSET 0x28 /* 32-bit Channel Error Register */ +#define IOAT_CHANERR_DMA_TRANSFER_SRC_ADDR_ERR 0x0001 +#define IOAT_CHANERR_DMA_TRANSFER_DEST_ADDR_ERR 0x0002 +#define IOAT_CHANERR_NEXT_DESCRIPTOR_ADDR_ERR 0x0004 
+#define IOAT_CHANERR_NEXT_DESCRIPTOR_ALIGNMENT_ERR 0x0008 +#define IOAT_CHANERR_CHAIN_ADDR_VALUE_ERR 0x0010 +#define IOAT_CHANERR_CHANCMD_ERR 0x0020 +#define IOAT_CHANERR_CHIPSET_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0040 +#define IOAT_CHANERR_DMA_UNCORRECTABLE_DATA_INTEGRITY_ERR 0x0080 +#define IOAT_CHANERR_READ_DATA_ERR 0x0100 +#define IOAT_CHANERR_WRITE_DATA_ERR 0x0200 +#define IOAT_CHANERR_DESCRIPTOR_CONTROL_ERR 0x0400 +#define IOAT_CHANERR_DESCRIPTOR_LENGTH_ERR 0x0800 +#define IOAT_CHANERR_COMPLETION_ADDR_ERR 0x1000 +#define IOAT_CHANERR_INT_CONFIGURATION_ERR 0x2000 +#define IOAT_CHANERR_SOFT_ERR 0x4000 + +#define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */ + +#endif /* _IOAT_REGISTERS_H_ */ diff -puN /dev/null drivers/dma/iovlock.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/iovlock.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,301 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include +#include +#include /* for memcpy_toiovec */ +#include +#include + +int num_pages_spanned(struct iovec *iov) +{ + return + ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* + * Pin down all the iovec pages needed for len bytes. + * Return a struct dma_pinned_list to keep track of pages pinned down. + * + * We are allocating a single chunk of memory, and then carving it up into + * 3 sections, the latter 2 whose size depends on the number of iovecs and the + * total number of pages, respectively. 
+ */ +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) +{ + struct dma_pinned_list *local_list; + struct page **pages; + int i; + int ret; + int nr_iovecs = 0; + int iovec_len_used = 0; + int iovec_pages_used = 0; + long err; + + /* don't pin down non-user-based iovecs */ + if (segment_eq(get_fs(), KERNEL_DS)) + return NULL; + + /* determine how many iovecs/pages there are, up front */ + do { + iovec_len_used += iov[nr_iovecs].iov_len; + iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); + nr_iovecs++; + } while (iovec_len_used < len); + + /* single kmalloc for pinned list, page_list[], and the page arrays */ + local_list = kmalloc(sizeof(*local_list) + + (nr_iovecs * sizeof (struct dma_page_list)) + + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); + if (!local_list) { + err = -ENOMEM; + goto out; + } + + /* list of pages starts right after the page list array */ + pages = (struct page **) &local_list->page_list[nr_iovecs]; + + for (i = 0; i < nr_iovecs; i++) { + struct dma_page_list *page_list = &local_list->page_list[i]; + + len -= iov[i].iov_len; + + if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) { + err = -EFAULT; + goto unpin; + } + + page_list->nr_pages = num_pages_spanned(&iov[i]); + page_list->base_address = iov[i].iov_base; + + page_list->pages = pages; + pages += page_list->nr_pages; + + /* pin pages down */ + down_read(¤t->mm->mmap_sem); + ret = get_user_pages( + current, + current->mm, + (unsigned long) iov[i].iov_base, + page_list->nr_pages, + 1, /* write */ + 0, /* force */ + page_list->pages, + NULL); + up_read(¤t->mm->mmap_sem); + + if (ret != page_list->nr_pages) { + err = -ENOMEM; + goto unpin; + } + + local_list->nr_iovecs = i + 1; + } + + return local_list; + +unpin: + dma_unpin_iovec_pages(local_list); +out: + return ERR_PTR(err); +} + +void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) +{ + int i, j; + + if (!pinned_list) + return; + + for (i = 0; i < pinned_list->nr_iovecs; i++) { + struct dma_page_list *page_list = &pinned_list->page_list[i]; + for (j = 0; j < page_list->nr_pages; j++) { + set_page_dirty_lock(page_list->pages[j]); + page_cache_release(page_list->pages[j]); + } + } + + kfree(pinned_list); +} + +static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct + iovec *iov, unsigned char *kdata, size_t len) +{ + dma_cookie_t dma_cookie = 0; + + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + dma_cookie = dma_async_memcpy_buf_to_buf( + chan, + iov->iov_base, + kdata, + copy); + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return dma_cookie; +} + +/* + * We have already pinned down the pages we will be using in the iovecs. + * Each entry in iov array has corresponding entry in pinned_list->page_list. + * Using array indexing to keep iov[] and page_list[] in sync. + * Initial elements in iov array's iov->iov_len will be 0 if already copied into + * by another call. + * iov array length remaining guaranteed to be bigger than len. + */ +dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) +{ + int iov_byte_offset; + int copy; + dma_cookie_t dma_cookie = 0; + int iovec_idx; + int page_idx; + + if (!chan) + return memcpy_toiovec(iov, kdata, len); + + /* -> kernel copies (e.g. 
smbfs) */ + if (!pinned_list) + return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len); + + iovec_idx = 0; + while (iovec_idx < pinned_list->nr_iovecs) { + struct dma_page_list *page_list; + + /* skip already used-up iovecs */ + while (!iov[iovec_idx].iov_len) + iovec_idx++; + + page_list = &pinned_list->page_list[iovec_idx]; + + iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); + page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) + - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; + + /* break up copies to not cross page boundary */ + while (iov[iovec_idx].iov_len) { + copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); + copy = min_t(int, copy, iov[iovec_idx].iov_len); + + dma_cookie = dma_async_memcpy_buf_to_pg(chan, + page_list->pages[page_idx], + iov_byte_offset, + kdata, + copy); + + len -= copy; + iov[iovec_idx].iov_len -= copy; + iov[iovec_idx].iov_base += copy; + + if (!len) + return dma_cookie; + + kdata += copy; + iov_byte_offset = 0; + page_idx++; + } + iovec_idx++; + } + + /* really bad if we ever run out of iovecs */ + BUG(); + return -EFAULT; +} + +dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, struct page *page, + unsigned int offset, size_t len) +{ + int iov_byte_offset; + int copy; + dma_cookie_t dma_cookie = 0; + int iovec_idx; + int page_idx; + int err; + + /* this needs as-yet-unimplemented buf-to-buff, so punt. */ + /* TODO: use dma for this */ + if (!chan || !pinned_list) { + u8 *vaddr = kmap(page); + err = memcpy_toiovec(iov, vaddr + offset, len); + kunmap(page); + return err; + } + + iovec_idx = 0; + while (iovec_idx < pinned_list->nr_iovecs) { + struct dma_page_list *page_list; + + /* skip already used-up iovecs */ + while (!iov[iovec_idx].iov_len) + iovec_idx++; + + page_list = &pinned_list->page_list[iovec_idx]; + + iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); + page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) + - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; + + /* break up copies to not cross page boundary */ + while (iov[iovec_idx].iov_len) { + copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); + copy = min_t(int, copy, iov[iovec_idx].iov_len); + + dma_cookie = dma_async_memcpy_pg_to_pg(chan, + page_list->pages[page_idx], + iov_byte_offset, + page, + offset, + copy); + + len -= copy; + iov[iovec_idx].iov_len -= copy; + iov[iovec_idx].iov_base += copy; + + if (!len) + return dma_cookie; + + offset += copy; + iov_byte_offset = 0; + page_idx++; + } + iovec_idx++; + } + + /* really bad if we ever run out of iovecs */ + BUG(); + return -EFAULT; +} diff -puN /dev/null drivers/dma/Kconfig --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/Kconfig 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,34 @@ +# +# DMA engine configuration +# + +menu "DMA Engine support" + +config DMA_ENGINE + bool "Support for DMA engines" + ---help--- + DMA engines offload copy operations from the CPU to dedicated + hardware, allowing the copies to happen asynchronously. + +comment "DMA Clients" + +config NET_DMA + bool "Network: TCP receive copy offload" + depends on DMA_ENGINE && NET + default y + ---help--- + This enables the use of DMA engines in the network stack to + offload receive copy-to-user operations, freeing CPU cycles. + Since this is the main user of the DMA engine, it should be enabled; + say Y here. 
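The NET_DMA client above (iovlock.c) pins the destination iovec pages up front so the DMA engine can later write into them asynchronously. A minimal user-space sketch of the page-span arithmetic behind num_pages_spanned(), assuming 4 KiB pages; the PAGE_* macros and main() below are illustrative stand-ins for the kernel's own PAGE_ALIGN/PAGE_MASK, not driver code:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

/* same computation as num_pages_spanned(): round the last byte up and the
 * first byte down to page boundaries, then count the pages in between */
static unsigned long pages_spanned(unsigned long base, unsigned long len)
{
	return (PAGE_ALIGN(base + len) - (base & PAGE_MASK)) >> PAGE_SHIFT;
}

int main(void)
{
	printf("%lu\n", pages_spanned(0x1000, 100));	/* 1: fits in one page */
	printf("%lu\n", pages_spanned(0x1fc0, 100));	/* 2: straddles a boundary */
	printf("%lu\n", pages_spanned(0x1800, 8192));	/* 3: 8 KiB starting mid-page */
	return 0;
}

dma_pin_iovec_pages() sums this count over every iovec and then sizes its single kmalloc() as sizeof(*local_list) + nr_iovecs * sizeof(struct dma_page_list) + total_pages * sizeof(struct page *), which is the three-section carving described in its comment.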
+ +comment "DMA Devices" + +config INTEL_IOATDMA + tristate "Intel I/OAT DMA support" + depends on DMA_ENGINE && PCI + default m + ---help--- + Enable support for the Intel(R) I/OAT DMA engine. + +endmenu diff -puN /dev/null drivers/dma/Makefile --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/dma/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,3 @@ +obj-$(CONFIG_DMA_ENGINE) += dmaengine.o +obj-$(CONFIG_NET_DMA) += iovlock.o +obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff -puN drivers/Kconfig~git-net drivers/Kconfig --- devel/drivers/Kconfig~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/Kconfig 2006-05-29 15:05:00.000000000 -0700 @@ -72,4 +72,6 @@ source "drivers/edac/Kconfig" source "drivers/rtc/Kconfig" +source "drivers/dma/Kconfig" + endmenu diff -puN drivers/Makefile~git-net drivers/Makefile --- devel/drivers/Makefile~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_DMA_ENGINE) += dma/ diff -puN drivers/net/irda/ali-ircc.c~git-net drivers/net/irda/ali-ircc.c --- devel/drivers/net/irda/ali-ircc.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/irda/ali-ircc.c 2006-05-29 15:05:00.000000000 -0700 @@ -34,14 +34,12 @@ #include #include #include +#include #include #include #include -#include -#include - #include #include #include @@ -51,7 +49,19 @@ #define CHIP_IO_EXTENT 8 #define BROKEN_DONGLE_ID -static char *driver_name = "ali-ircc"; +#define ALI_IRCC_DRIVER_NAME "ali-ircc" + +/* Power Management */ +static int ali_ircc_suspend(struct platform_device *dev, pm_message_t state); +static int ali_ircc_resume(struct platform_device *dev); + +static struct platform_driver ali_ircc_driver = { + .suspend = ali_ircc_suspend, + .resume = ali_ircc_resume, + .driver = { + .name = ALI_IRCC_DRIVER_NAME, + }, +}; /* Module parameters */ static int qos_mtt_bits = 0x07; /* 1 ms or more */ @@ -97,10 +107,7 @@ static int ali_ircc_is_receiving(struct static int ali_ircc_net_open(struct net_device *dev); static int ali_ircc_net_close(struct net_device *dev); static int ali_ircc_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -static int ali_ircc_pmproc(struct pm_dev *dev, pm_request_t rqst, void *data); static void ali_ircc_change_speed(struct ali_ircc_cb *self, __u32 baud); -static void ali_ircc_suspend(struct ali_ircc_cb *self); -static void ali_ircc_wakeup(struct ali_ircc_cb *self); static struct net_device_stats *ali_ircc_net_get_stats(struct net_device *dev); /* SIR function */ @@ -145,6 +152,14 @@ static int __init ali_ircc_init(void) int i = 0; IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__); + + ret = platform_driver_register(&ali_ircc_driver); + if (ret) { + IRDA_ERROR("%s, Can't register driver!\n", + ALI_IRCC_DRIVER_NAME); + return ret; + } + /* Probe for all the ALi chipsets we know about */ for (chip= chips; chip->name; chip++, i++) @@ -214,6 +229,10 @@ static int __init ali_ircc_init(void) } IRDA_DEBUG(2, "%s(), ----------------- End -----------------\n", __FUNCTION__); + + if (ret) + platform_driver_unregister(&ali_ircc_driver); + return ret; } @@ -228,14 +247,14 @@ static void __exit ali_ircc_cleanup(void int i; IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__); - - pm_unregister_all(ali_ircc_pmproc); for (i=0; i < 4; i++) { if (dev_self[i]) 
ali_ircc_close(dev_self[i]); } + platform_driver_unregister(&ali_ircc_driver); + IRDA_DEBUG(2, "%s(), ----------------- End -----------------\n", __FUNCTION__); } @@ -249,7 +268,6 @@ static int ali_ircc_open(int i, chipio_t { struct net_device *dev; struct ali_ircc_cb *self; - struct pm_dev *pmdev; int dongle_id; int err; @@ -284,7 +302,8 @@ static int ali_ircc_open(int i, chipio_t self->io.fifo_size = 16; /* SIR: 16, FIR: 32 Benjamin 2000/11/1 */ /* Reserve the ioports that we need */ - if (!request_region(self->io.fir_base, self->io.fir_ext, driver_name)) { + if (!request_region(self->io.fir_base, self->io.fir_ext, + ALI_IRCC_DRIVER_NAME)) { IRDA_WARNING("%s(), can't get iobase of 0x%03x\n", __FUNCTION__, self->io.fir_base); err = -ENODEV; @@ -354,13 +373,10 @@ static int ali_ircc_open(int i, chipio_t /* Check dongle id */ dongle_id = ali_ircc_read_dongle_id(i, info); - IRDA_MESSAGE("%s(), %s, Found dongle: %s\n", __FUNCTION__, driver_name, dongle_types[dongle_id]); + IRDA_MESSAGE("%s(), %s, Found dongle: %s\n", __FUNCTION__, + ALI_IRCC_DRIVER_NAME, dongle_types[dongle_id]); self->io.dongle_id = dongle_id; - - pmdev = pm_register(PM_SYS_DEV, PM_SYS_IRDA, ali_ircc_pmproc); - if (pmdev) - pmdev->data = self; IRDA_DEBUG(2, "%s(), ----------------- End -----------------\n", __FUNCTION__); @@ -548,12 +564,11 @@ static int ali_ircc_setup(chipio_t *info /* Should be 0x00 in the M1535/M1535D */ if(version != 0x00) { - IRDA_ERROR("%s, Wrong chip version %02x\n", driver_name, version); + IRDA_ERROR("%s, Wrong chip version %02x\n", + ALI_IRCC_DRIVER_NAME, version); return -1; } - // IRDA_MESSAGE("%s, Found chip at base=0x%03x\n", driver_name, info->cfg_base); - /* Set FIR FIFO Threshold Register */ switch_bank(iobase, BANK1); outb(RX_FIFO_Threshold, iobase+FIR_FIFO_TR); @@ -583,7 +598,8 @@ static int ali_ircc_setup(chipio_t *info /* Switch to SIR space */ FIR2SIR(iobase); - IRDA_MESSAGE("%s, driver loaded (Benjamin Kong)\n", driver_name); + IRDA_MESSAGE("%s, driver loaded (Benjamin Kong)\n", + ALI_IRCC_DRIVER_NAME); /* Enable receive interrupts */ // outb(UART_IER_RDI, iobase+UART_IER); //benjamin 2000/11/23 01:25PM @@ -647,7 +663,8 @@ static irqreturn_t ali_ircc_interrupt(in IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__); if (!dev) { - IRDA_WARNING("%s: irq %d for unknown device.\n", driver_name, irq); + IRDA_WARNING("%s: irq %d for unknown device.\n", + ALI_IRCC_DRIVER_NAME, irq); return IRQ_NONE; } @@ -1328,7 +1345,8 @@ static int ali_ircc_net_open(struct net_ /* Request IRQ and install Interrupt Handler */ if (request_irq(self->io.irq, ali_ircc_interrupt, 0, dev->name, dev)) { - IRDA_WARNING("%s, unable to allocate irq=%d\n", driver_name, + IRDA_WARNING("%s, unable to allocate irq=%d\n", + ALI_IRCC_DRIVER_NAME, self->io.irq); return -EAGAIN; } @@ -1338,7 +1356,8 @@ static int ali_ircc_net_open(struct net_ * failure. 
*/ if (request_dma(self->io.dma, dev->name)) { - IRDA_WARNING("%s, unable to allocate dma=%d\n", driver_name, + IRDA_WARNING("%s, unable to allocate dma=%d\n", + ALI_IRCC_DRIVER_NAME, self->io.dma); free_irq(self->io.irq, self); return -EAGAIN; @@ -2108,61 +2127,38 @@ static struct net_device_stats *ali_ircc return &self->stats; } -static void ali_ircc_suspend(struct ali_ircc_cb *self) +static int ali_ircc_suspend(struct platform_device *dev, pm_message_t state) { - IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__ ); + struct ali_ircc_cb *self = platform_get_drvdata(dev); - IRDA_MESSAGE("%s, Suspending\n", driver_name); + IRDA_MESSAGE("%s, Suspending\n", ALI_IRCC_DRIVER_NAME); if (self->io.suspended) - return; + return 0; ali_ircc_net_close(self->netdev); self->io.suspended = 1; - IRDA_DEBUG(2, "%s(), ----------------- End ------------------\n", __FUNCTION__ ); + return 0; } -static void ali_ircc_wakeup(struct ali_ircc_cb *self) +static int ali_ircc_resume(struct platform_device *dev) { - IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__ ); + struct ali_ircc_cb *self = platform_get_drvdata(dev); if (!self->io.suspended) - return; + return 0; ali_ircc_net_open(self->netdev); - IRDA_MESSAGE("%s, Waking up\n", driver_name); + IRDA_MESSAGE("%s, Waking up\n", ALI_IRCC_DRIVER_NAME); self->io.suspended = 0; - - IRDA_DEBUG(2, "%s(), ----------------- End ------------------\n", __FUNCTION__ ); -} -static int ali_ircc_pmproc(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - struct ali_ircc_cb *self = (struct ali_ircc_cb*) dev->data; - - IRDA_DEBUG(2, "%s(), ---------------- Start ----------------\n", __FUNCTION__ ); - - if (self) { - switch (rqst) { - case PM_SUSPEND: - ali_ircc_suspend(self); - break; - case PM_RESUME: - ali_ircc_wakeup(self); - break; - } - } - - IRDA_DEBUG(2, "%s(), ----------------- End ------------------\n", __FUNCTION__ ); - return 0; } - /* ALi Chip Function */ static void SetCOMInterrupts(struct ali_ircc_cb *self , unsigned char enable) diff -puN drivers/net/irda/Kconfig~git-net drivers/net/irda/Kconfig --- devel/drivers/net/irda/Kconfig~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/irda/Kconfig 2006-05-29 15:05:00.000000000 -0700 @@ -417,5 +417,20 @@ config PXA_FICP available capabilities may vary from one PXA2xx target to another. +config MCS_FIR + tristate "MosChip MCS7780 IrDA-USB dongle" + depends on IRDA && USB && EXPERIMENTAL + help + Say Y or M here if you want to build support for the MosChip + MCS7780 IrDA-USB bridge device driver. + + USB bridge based on the MosChip MCS7780 don't conform to the + IrDA-USB device class specification, and therefore need their + own specific driver. Those dongles support SIR and FIR (4Mbps) + speeds. + + To compile it as a module, choose M here: the module will be called + mcs7780. 
+ endmenu diff -puN drivers/net/irda/Makefile~git-net drivers/net/irda/Makefile --- devel/drivers/net/irda/Makefile~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/irda/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -19,6 +19,7 @@ obj-$(CONFIG_ALI_FIR) += ali-ircc.o obj-$(CONFIG_VLSI_FIR) += vlsi_ir.o obj-$(CONFIG_VIA_FIR) += via-ircc.o obj-$(CONFIG_PXA_FICP) += pxaficp_ir.o +obj-$(CONFIG_MCS_FIR) += mcs7780.o # Old dongle drivers for old SIR drivers obj-$(CONFIG_ESI_DONGLE_OLD) += esi.o obj-$(CONFIG_TEKRAM_DONGLE_OLD) += tekram.o diff -puN /dev/null drivers/net/irda/mcs7780.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/net/irda/mcs7780.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,1009 @@ +/***************************************************************************** +* +* Filename: mcs7780.c +* Version: 0.4-alpha +* Description: Irda MosChip USB Dongle Driver +* Authors: Lukasz Stelmach +* Brian Pugh +* Judy Fischbach +* +* Based on stir4200 driver, but some things done differently. +* Based on earlier driver by Paul Stewart +* +* Copyright (C) 2000, Roman Weissgaerber +* Copyright (C) 2001, Dag Brattli +* Copyright (C) 2001, Jean Tourrilhes +* Copyright (C) 2004, Stephen Hemminger +* Copyright (C) 2005, Lukasz Stelmach +* Copyright (C) 2005, Brian Pugh +* Copyright (C) 2005, Judy Fischbach +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +*****************************************************************************/ + +/* + * MCS7780 is a simple USB to IrDA bridge by MosChip. It is neither + * compatibile with irda-usb nor with stir4200. Although it is quite + * similar to the later as far as general idea of operation is concerned. + * That is it requires the software to do all the framing job at SIR speeds. + * The hardware does take care of the framing at MIR and FIR speeds. 
+ * It supports all speeds from 2400 through 4Mbps + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include "mcs7780.h" + +#define MCS_VENDOR_ID 0x9710 +#define MCS_PRODUCT_ID 0x7780 + +static struct usb_device_id mcs_table[] = { + /* MosChip Corp., MCS7780 FIR-USB Adapter */ + {USB_DEVICE(MCS_VENDOR_ID, MCS_PRODUCT_ID)}, + {}, +}; + +MODULE_AUTHOR("Brian Pugh "); +MODULE_DESCRIPTION("IrDA-USB Dongle Driver for MosChip MCS7780"); +MODULE_VERSION("0.3alpha"); +MODULE_LICENSE("GPL"); + +MODULE_DEVICE_TABLE(usb, mcs_table); + +static int qos_mtt_bits = 0x07 /* > 1ms */ ; +module_param(qos_mtt_bits, int, 0); +MODULE_PARM_DESC(qos_mtt_bits, "Minimum Turn Time"); + +static int receive_mode = 0x1; +module_param(receive_mode, int, 0); +MODULE_PARM_DESC(receive_mode, + "Receive mode of the device (1:fast, 0:slow, default:1)"); + +static int sir_tweak = 1; +module_param(sir_tweak, int, 0444); +MODULE_PARM_DESC(sir_tweak, + "Default pulse width (1:1.6us, 0:3/16 bit, default:1)."); + +static int transceiver_type = MCS_TSC_VISHAY; +module_param(transceiver_type, int, 0444); +MODULE_PARM_DESC(transceiver_type, "IR transceiver type, see mcs7780.h."); + +struct usb_driver mcs_driver = { + .name = "mcs7780", + .probe = mcs_probe, + .disconnect = mcs_disconnect, + .id_table = mcs_table, +}; + +/* speed flag selection by direct addressing. +addr = (speed >> 8) & 0x0f + +0x1 57600 0x2 115200 0x4 1152000 0x5 9600 +0x6 38400 0x9 2400 0xa 576000 0xb 19200 + +4Mbps (or 2400) must be checked separately. Since it also has +to be programmed in a different manner that is not a big problem. +*/ +static __u16 mcs_speed_set[16] = { 0, + MCS_SPEED_57600, + MCS_SPEED_115200, + 0, + MCS_SPEED_1152000, + MCS_SPEED_9600, + MCS_SPEED_38400, + 0, 0, + MCS_SPEED_2400, + MCS_SPEED_576000, + MCS_SPEED_19200, + 0, 0, 0, +}; + +/* Set given 16 bit register with a 16 bit value. Send control message + * to set dongle register. */ +static int mcs_set_reg(struct mcs_cb *mcs, __u16 reg, __u16 val) +{ + struct usb_device *dev = mcs->usbdev; + return usb_control_msg(dev, usb_sndctrlpipe(dev, 0), MCS_WRREQ, + MCS_WR_RTYPE, val, reg, NULL, 0, + msecs_to_jiffies(MCS_CTRL_TIMEOUT)); +} + +/* Get 16 bit register value. Send contol message to read dongle register. */ +static int mcs_get_reg(struct mcs_cb *mcs, __u16 reg, __u16 * val) +{ + struct usb_device *dev = mcs->usbdev; + int ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, + MCS_RD_RTYPE, 0, reg, val, 2, + msecs_to_jiffies(MCS_CTRL_TIMEOUT)); + + return ret; +} + +/* Setup a communication between mcs7780 and TFDU chips. It is described + * in more detail in the data sheet. The setup sequence puts the the + * vishay tranceiver into high speed mode. It will also receive SIR speed + * packets but at reduced sensitivity. + */ + +/* 0: OK 1:ERROR */ +static inline int mcs_setup_transceiver_vishay(struct mcs_cb *mcs) +{ + int ret = 0; + __u16 rval; + + /* mcs_get_reg should read exactly two bytes from the dongle */ + ret = mcs_get_reg(mcs, MCS_XCVR_REG, &rval); + if (unlikely(ret != 2)) { + ret = -EIO; + goto error; + } + + /* The MCS_XCVR_CONF bit puts the transceiver into configuration + * mode. The MCS_MODE0 bit must start out high (1) and then + * transition to low and the MCS_STFIR and MCS_MODE1 bits must + * be low. 
+ */ + rval |= (MCS_MODE0 | MCS_XCVR_CONF); + rval &= ~MCS_STFIR; + rval &= ~MCS_MODE1; + ret = mcs_set_reg(mcs, MCS_XCVR_REG, rval); + if (unlikely(ret)) + goto error; + + rval &= ~MCS_MODE0; + ret = mcs_set_reg(mcs, MCS_XCVR_REG, rval); + if (unlikely(ret)) + goto error; + + rval &= ~MCS_XCVR_CONF; + ret = mcs_set_reg(mcs, MCS_XCVR_REG, rval); + if (unlikely(ret)) + goto error; + + ret = 0; + error: + return ret; +} + +/* Setup a communication between mcs7780 and agilent chip. */ +static inline int mcs_setup_transceiver_agilent(struct mcs_cb *mcs) +{ + IRDA_WARNING("This transceiver type is not supported yet."); + return 1; +} + +/* Setup a communication between mcs7780 and sharp chip. */ +static inline int mcs_setup_transceiver_sharp(struct mcs_cb *mcs) +{ + IRDA_WARNING("This transceiver type is not supported yet."); + return 1; +} + +/* Common setup for all transceivers */ +static inline int mcs_setup_transceiver(struct mcs_cb *mcs) +{ + int ret = 0; + __u16 rval; + char *msg; + + msg = "Basic transceiver setup error."; + + /* read value of MODE Register, set the DRIVER and RESET bits + * and write value back out to MODE Register + */ + ret = mcs_get_reg(mcs, MCS_MODE_REG, &rval); + if(unlikely(ret != 2)) + goto error; + rval |= MCS_DRIVER; /* put the mcs7780 into configuration mode. */ + ret = mcs_set_reg(mcs, MCS_MODE_REG, rval); + if(unlikely(ret)) + goto error; + + rval = 0; /* set min pulse width to 0 initially. */ + ret = mcs_set_reg(mcs, MCS_MINRXPW_REG, rval); + if(unlikely(ret)) + goto error; + + ret = mcs_get_reg(mcs, MCS_MODE_REG, &rval); + if(unlikely(ret != 2)) + goto error; + + rval &= ~MCS_FIR; /* turn off fir mode. */ + if(mcs->sir_tweak) + rval |= MCS_SIR16US; /* 1.6us pulse width */ + else + rval &= ~MCS_SIR16US; /* 3/16 bit time pulse width */ + + /* make sure ask mode and back to back packets are off. */ + rval &= ~(MCS_BBTG | MCS_ASK); + + rval &= ~MCS_SPEED_MASK; + rval |= MCS_SPEED_9600; /* make sure initial speed is 9600. */ + mcs->speed = 9600; + mcs->new_speed = 0; /* new_speed is set to 0 */ + rval &= ~MCS_PLLPWDN; /* disable power down. */ + + /* make sure device determines direction and that the auto send sip + * pulse are on. + */ + rval |= MCS_DTD | MCS_SIPEN; + + ret = mcs_set_reg(mcs, MCS_MODE_REG, rval); + if(unlikely(ret)) + goto error; + + msg = "transceiver model specific setup error."; + switch (mcs->transceiver_type) { + case MCS_TSC_VISHAY: + ret = mcs_setup_transceiver_vishay(mcs); + break; + + case MCS_TSC_SHARP: + ret = mcs_setup_transceiver_sharp(mcs); + break; + + case MCS_TSC_AGILENT: + ret = mcs_setup_transceiver_agilent(mcs); + break; + + default: + IRDA_WARNING("Unknown transceiver type: %d", + mcs->transceiver_type); + ret = 1; + } + if (unlikely(ret)) + goto error; + + /* If transceiver is not SHARP, then if receive mode set + * on the RXFAST bit in the XCVR Register otherwise unset it + */ + if (mcs->transceiver_type != MCS_TSC_SHARP) { + + ret = mcs_get_reg(mcs, MCS_XCVR_REG, &rval); + if (unlikely(ret != 2)) + goto error; + if (mcs->receive_mode) + rval |= MCS_RXFAST; + else + rval &= ~MCS_RXFAST; + ret = mcs_set_reg(mcs, MCS_XCVR_REG, rval); + if (unlikely(ret)) + goto error; + } + + msg = "transceiver reset."; + + ret = mcs_get_reg(mcs, MCS_MODE_REG, &rval); + if (unlikely(ret != 2)) + goto error; + + /* reset the mcs7780 so all changes take effect. 
*/ + rval &= ~MCS_RESET; + ret = mcs_set_reg(mcs, MCS_MODE_REG, rval); + if (unlikely(ret)) + goto error; + else + return ret; + +error: + IRDA_ERROR("%s", msg); + return ret; +} + +/* Wraps the data in format for SIR */ +static inline int mcs_wrap_sir_skb(struct sk_buff *skb, __u8 * buf) +{ + int wraplen; + + /* 2: full frame length, including "the length" */ + wraplen = async_wrap_skb(skb, buf + 2, 4094); + + wraplen += 2; + buf[0] = wraplen & 0xff; + buf[1] = (wraplen >> 8) & 0xff; + + return wraplen; +} + +/* Wraps the data in format for FIR */ +static unsigned mcs_wrap_fir_skb(const struct sk_buff *skb, __u8 *buf) +{ + unsigned int len = 0; + __u32 fcs = ~(crc32_le(~0, skb->data, skb->len)); + + /* add 2 bytes for length value and 4 bytes for fcs. */ + len = skb->len + 6; + + /* The mcs7780 requires that the first two bytes are the packet + * length in little endian order. Note: the length value includes + * the two bytes for the length value itself. + */ + buf[0] = len & 0xff; + buf[1] = (len >> 8) & 0xff; + /* copy the data into the tx buffer. */ + memcpy(buf+2, skb->data, skb->len); + /* put the fcs in the last four bytes in little endian order. */ + buf[len - 4] = fcs & 0xff; + buf[len - 3] = (fcs >> 8) & 0xff; + buf[len - 2] = (fcs >> 16) & 0xff; + buf[len - 1] = (fcs >> 24) & 0xff; + + return len; +} + +/* Wraps the data in format for MIR */ +static unsigned mcs_wrap_mir_skb(const struct sk_buff *skb, __u8 *buf) +{ + __u16 fcs = 0; + int len = skb->len + 4; + + fcs = ~(irda_calc_crc16(~fcs, skb->data, skb->len)); + /* put the total packet length in first. Note: packet length + * value includes the two bytes that hold the packet length + * itself. + */ + buf[0] = len & 0xff; + buf[1] = (len >> 8) & 0xff; + /* copy the data */ + memcpy(buf+2, skb->data, skb->len); + /* put the fcs in last two bytes in little endian order. */ + buf[len - 2] = fcs & 0xff; + buf[len - 1] = (fcs >> 8) & 0xff; + + return len; +} + +/* Unwrap received packets at MIR speed. A 16 bit crc_ccitt checksum is + * used for the fcs. When performed over the entire packet the result + * should be GOOD_FCS = 0xf0b8. Hands the unwrapped data off to the IrDA + * layer via a sk_buff. + */ +static void mcs_unwrap_mir(struct mcs_cb *mcs, __u8 *buf, int len) +{ + __u16 fcs; + int new_len; + struct sk_buff *skb; + + /* Assume that the frames are going to fill a single packet + * rather than span multiple packets. + */ + + new_len = len - 2; + if(unlikely(new_len <= 0)) { + IRDA_ERROR("%s short frame length %d\n", + mcs->netdev->name, new_len); + ++mcs->stats.rx_errors; + ++mcs->stats.rx_length_errors; + return; + } + fcs = 0; + fcs = irda_calc_crc16(~fcs, buf, len); + + if(fcs != GOOD_FCS) { + IRDA_ERROR("crc error calc 0x%x len %d\n", + fcs, new_len); + mcs->stats.rx_errors++; + mcs->stats.rx_crc_errors++; + return; + } + + skb = dev_alloc_skb(new_len + 1); + if(unlikely(!skb)) { + ++mcs->stats.rx_dropped; + return; + } + + skb_reserve(skb, 1); + memcpy(skb->data, buf, new_len); + skb_put(skb, new_len); + skb->mac.raw = skb->data; + skb->protocol = htons(ETH_P_IRDA); + skb->dev = mcs->netdev; + + netif_rx(skb); + + mcs->stats.rx_packets++; + mcs->stats.rx_bytes += new_len; + + return; +} + +/* Unwrap received packets at FIR speed. A 32 bit crc_ccitt checksum is + * used for the fcs. Hands the unwrapped data off to the IrDA + * layer via a sk_buff. 
+ */ +static void mcs_unwrap_fir(struct mcs_cb *mcs, __u8 *buf, int len) +{ + __u32 fcs; + int new_len; + struct sk_buff *skb; + + /* Assume that the frames are going to fill a single packet + * rather than span multiple packets. This is most likely a false + * assumption. + */ + + new_len = len - 4; + if(unlikely(new_len <= 0)) { + IRDA_ERROR("%s short frame length %d\n", + mcs->netdev->name, new_len); + ++mcs->stats.rx_errors; + ++mcs->stats.rx_length_errors; + return; + } + + fcs = ~(crc32_le(~0, buf, new_len)); + if(fcs != le32_to_cpu(get_unaligned((u32 *)(buf+new_len)))) { + IRDA_ERROR("crc error calc 0x%x len %d\n", fcs, new_len); + mcs->stats.rx_errors++; + mcs->stats.rx_crc_errors++; + return; + } + + skb = dev_alloc_skb(new_len + 1); + if(unlikely(!skb)) { + ++mcs->stats.rx_dropped; + return; + } + + skb_reserve(skb, 1); + memcpy(skb->data, buf, new_len); + skb_put(skb, new_len); + skb->mac.raw = skb->data; + skb->protocol = htons(ETH_P_IRDA); + skb->dev = mcs->netdev; + + netif_rx(skb); + + mcs->stats.rx_packets++; + mcs->stats.rx_bytes += new_len; + + return; +} + + +/* Allocates urbs for both receive and transmit. + * If alloc fails return error code 0 (fail) otherwise + * return error code 1 (success). + */ +static inline int mcs_setup_urbs(struct mcs_cb *mcs) +{ + mcs->rx_urb = NULL; + + mcs->tx_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!mcs->tx_urb) + return 0; + + mcs->rx_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!mcs->rx_urb) + return 0; + + return 1; +} + +/* Sets up state to be initially outside frame, gets receive urb, + * sets status to successful and then submits the urb to start + * receiving the data. + */ +static inline int mcs_receive_start(struct mcs_cb *mcs) +{ + mcs->rx_buff.in_frame = FALSE; + mcs->rx_buff.state = OUTSIDE_FRAME; + + usb_fill_bulk_urb(mcs->rx_urb, mcs->usbdev, + usb_rcvbulkpipe(mcs->usbdev, mcs->ep_in), + mcs->in_buf, 4096, mcs_receive_irq, mcs); + + mcs->rx_urb->status = 0; + return usb_submit_urb(mcs->rx_urb, GFP_KERNEL); +} + +/* Finds the in and out endpoints for the mcs control block */ +static inline int mcs_find_endpoints(struct mcs_cb *mcs, + struct usb_host_endpoint *ep, int epnum) +{ + int i; + int ret = 0; + + /* If no place to store the endpoints just return */ + if (!ep) + return ret; + + /* cycle through all endpoints, find the first two that are DIR_IN */ + for (i = 0; i < epnum; i++) { + if (ep[i].desc.bEndpointAddress & USB_DIR_IN) + mcs->ep_in = ep[i].desc.bEndpointAddress; + else + mcs->ep_out = ep[i].desc.bEndpointAddress; + + /* MosChip says that the chip has only two bulk + * endpoints. Find one for each direction and move on. + */ + if ((mcs->ep_in != 0) && (mcs->ep_out != 0)) { + ret = 1; + break; + } + } + + return ret; +} + +static void mcs_speed_work(void *arg) +{ + struct mcs_cb *mcs = arg; + struct net_device *netdev = mcs->netdev; + + mcs_speed_change(mcs); + netif_wake_queue(netdev); +} + +/* Function to change the speed of the mcs7780. Fully supports SIR, + * MIR, and FIR speeds. 
+ */ +static int mcs_speed_change(struct mcs_cb *mcs) +{ + int ret = 0; + int rst = 0; + int cnt = 0; + __u16 nspeed; + __u16 rval; + + nspeed = mcs_speed_set[(mcs->new_speed >> 8) & 0x0f]; + + do { + mcs_get_reg(mcs, MCS_RESV_REG, &rval); + } while(cnt++ < 100 && (rval & MCS_IRINTX)); + + if(cnt >= 100) { + IRDA_ERROR("unable to change speed"); + ret = -EIO; + goto error; + } + + mcs_get_reg(mcs, MCS_MODE_REG, &rval); + + /* MINRXPW values recomended by MosChip */ + if (mcs->new_speed <= 115200) { + rval &= ~MCS_FIR; + + if ((rst = (mcs->speed > 115200))) + mcs_set_reg(mcs, MCS_MINRXPW_REG, 0); + + } else if (mcs->new_speed <= 1152000) { + rval &= ~MCS_FIR; + + if ((rst = !(mcs->speed == 576000 || mcs->speed == 1152000))) + mcs_set_reg(mcs, MCS_MINRXPW_REG, 5); + + } else { + rval |= MCS_FIR; + + if ((rst = (mcs->speed != 4000000))) + mcs_set_reg(mcs, MCS_MINRXPW_REG, 5); + + } + + rval &= ~MCS_SPEED_MASK; + rval |= nspeed; + + ret = mcs_set_reg(mcs, MCS_MODE_REG, rval); + if (unlikely(ret)) + goto error; + + if (rst) + switch (mcs->transceiver_type) { + case MCS_TSC_VISHAY: + ret = mcs_setup_transceiver_vishay(mcs); + break; + + case MCS_TSC_SHARP: + ret = mcs_setup_transceiver_sharp(mcs); + break; + + case MCS_TSC_AGILENT: + ret = mcs_setup_transceiver_agilent(mcs); + break; + + default: + ret = 1; + IRDA_WARNING("Unknown transceiver type: %d", + mcs->transceiver_type); + } + if (unlikely(ret)) + goto error; + + mcs_get_reg(mcs, MCS_MODE_REG, &rval); + rval &= ~MCS_RESET; + ret = mcs_set_reg(mcs, MCS_MODE_REG, rval); + + mcs->speed = mcs->new_speed; + error: + mcs->new_speed = 0; + return ret; +} + +/* Ioctl calls not supported at this time. Can be an area of future work. */ +static int mcs_net_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd) +{ + /* struct if_irda_req *irq = (struct if_irda_req *)rq; */ + /* struct mcs_cb *mcs = netdev_priv(netdev); */ + int ret = 0; + + switch (cmd) { + default: + ret = -EOPNOTSUPP; + } + + return ret; +} + +/* Network device is taken down, done by "ifconfig irda0 down" */ +static int mcs_net_close(struct net_device *netdev) +{ + int ret = 0; + struct mcs_cb *mcs = netdev_priv(netdev); + + /* Stop transmit processing */ + netif_stop_queue(netdev); + + /* kill and free the receive and transmit URBs */ + usb_kill_urb(mcs->rx_urb); + usb_free_urb(mcs->rx_urb); + usb_kill_urb(mcs->tx_urb); + usb_free_urb(mcs->tx_urb); + + /* Stop and remove instance of IrLAP */ + if (mcs->irlap) + irlap_close(mcs->irlap); + + mcs->irlap = NULL; + return ret; +} + +/* Network device is taken up, done by "ifconfig irda0 up" */ +static int mcs_net_open(struct net_device *netdev) +{ + struct mcs_cb *mcs = netdev_priv(netdev); + char hwname[16]; + int ret = 0; + + ret = usb_clear_halt(mcs->usbdev, + usb_sndbulkpipe(mcs->usbdev, mcs->ep_in)); + if (ret) + goto error1; + ret = usb_clear_halt(mcs->usbdev, + usb_rcvbulkpipe(mcs->usbdev, mcs->ep_out)); + if (ret) + goto error1; + + ret = mcs_setup_transceiver(mcs); + if (ret) + goto error1; + + ret = -ENOMEM; + + /* Initialize for SIR/FIR to copy data directly into skb. */ + mcs->receiving = 0; + mcs->rx_buff.truesize = IRDA_SKB_MAX_MTU; + mcs->rx_buff.skb = dev_alloc_skb(IRDA_SKB_MAX_MTU); + if (!mcs->rx_buff.skb) + goto error1; + + skb_reserve(mcs->rx_buff.skb, 1); + mcs->rx_buff.head = mcs->rx_buff.skb->data; + do_gettimeofday(&mcs->rx_time); + + /* + * Now that everything should be initialized properly, + * Open new IrLAP layer instance to take care of us... + * Note : will send immediately a speed change... 
+ */ + sprintf(hwname, "usb#%d", mcs->usbdev->devnum); + mcs->irlap = irlap_open(netdev, &mcs->qos, hwname); + if (!mcs->irlap) { + IRDA_ERROR("mcs7780: irlap_open failed"); + goto error2; + } + + if (!mcs_setup_urbs(mcs)) + goto error3; + + ret = mcs_receive_start(mcs); + if (ret) + goto error3; + + netif_start_queue(netdev); + return 0; + + error3: + irlap_close(mcs->irlap); + error2: + kfree_skb(mcs->rx_buff.skb); + error1: + return ret; +} + + +/* Get device stats for /proc/net/dev and ifconfig */ +static struct net_device_stats *mcs_net_get_stats(struct net_device *netdev) +{ + struct mcs_cb *mcs = netdev_priv(netdev); + return &mcs->stats; +} + +/* Receive callback function. */ +static void mcs_receive_irq(struct urb *urb, struct pt_regs *regs) +{ + __u8 *bytes; + struct mcs_cb *mcs = urb->context; + int i; + int ret; + + if (!netif_running(mcs->netdev)) + return; + + if (urb->status) + return; + + if (urb->actual_length > 0) { + bytes = urb->transfer_buffer; + + /* MCS returns frames without BOF and EOF + * I assume it returns whole frames. + */ + /* SIR speed */ + if(mcs->speed < 576000) { + async_unwrap_char(mcs->netdev, &mcs->stats, + &mcs->rx_buff, 0xc0); + + for (i = 0; i < urb->actual_length; i++) + async_unwrap_char(mcs->netdev, &mcs->stats, + &mcs->rx_buff, bytes[i]); + + async_unwrap_char(mcs->netdev, &mcs->stats, + &mcs->rx_buff, 0xc1); + } + /* MIR speed */ + else if(mcs->speed == 576000 || mcs->speed == 1152000) { + mcs_unwrap_mir(mcs, urb->transfer_buffer, + urb->actual_length); + } + /* FIR speed */ + else { + mcs_unwrap_fir(mcs, urb->transfer_buffer, + urb->actual_length); + } + mcs->netdev->last_rx = jiffies; + do_gettimeofday(&mcs->rx_time); + } + + ret = usb_submit_urb(urb, GFP_ATOMIC); +} + +/* Transmit callback funtion. */ +static void mcs_send_irq(struct urb *urb, struct pt_regs *regs) +{ + struct mcs_cb *mcs = urb->context; + struct net_device *ndev = mcs->netdev; + + if (unlikely(mcs->new_speed)) + schedule_work(&mcs->work); + else + netif_wake_queue(ndev); +} + +/* Transmit callback funtion. */ +static int mcs_hard_xmit(struct sk_buff *skb, struct net_device *ndev) +{ + unsigned long flags; + struct mcs_cb *mcs; + int wraplen; + int ret = 0; + + + if (skb == NULL || ndev == NULL) + return -EINVAL; + + netif_stop_queue(ndev); + mcs = netdev_priv(ndev); + + spin_lock_irqsave(&mcs->lock, flags); + + mcs->new_speed = irda_get_next_speed(skb); + if (likely(mcs->new_speed == mcs->speed)) + mcs->new_speed = 0; + + /* SIR speed */ + if(mcs->speed < 576000) { + wraplen = mcs_wrap_sir_skb(skb, mcs->out_buf); + } + /* MIR speed */ + else if(mcs->speed == 576000 || mcs->speed == 1152000) { + wraplen = mcs_wrap_mir_skb(skb, mcs->out_buf); + } + /* FIR speed */ + else { + wraplen = mcs_wrap_fir_skb(skb, mcs->out_buf); + } + usb_fill_bulk_urb(mcs->tx_urb, mcs->usbdev, + usb_sndbulkpipe(mcs->usbdev, mcs->ep_out), + mcs->out_buf, wraplen, mcs_send_irq, mcs); + + if ((ret = usb_submit_urb(mcs->tx_urb, GFP_ATOMIC))) { + IRDA_ERROR("failed tx_urb: %d", ret); + switch (ret) { + case -ENODEV: + case -EPIPE: + break; + default: + mcs->stats.tx_errors++; + netif_start_queue(ndev); + } + } else { + mcs->stats.tx_packets++; + mcs->stats.tx_bytes += skb->len; + } + + dev_kfree_skb(skb); + spin_unlock_irqrestore(&mcs->lock, flags); + return ret; +} + +/* + * This function is called by the USB subsystem for each new device in the + * system. Need to verify the device and if it is, then start handling it. 
+ */ +static int mcs_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(intf); + struct net_device *ndev = NULL; + struct mcs_cb *mcs; + int ret = -ENOMEM; + + ndev = alloc_irdadev(sizeof(*mcs)); + if (!ndev) + goto error1; + + IRDA_DEBUG(1, "MCS7780 USB-IrDA bridge found at %d.", udev->devnum); + + /* what is it realy for? */ + SET_MODULE_OWNER(ndev); + SET_NETDEV_DEV(ndev, &intf->dev); + + ret = usb_reset_configuration(udev); + if (ret != 0) { + IRDA_ERROR("mcs7780: usb reset configuration failed"); + goto error2; + } + + mcs = netdev_priv(ndev); + mcs->usbdev = udev; + mcs->netdev = ndev; + spin_lock_init(&mcs->lock); + + /* Initialize QoS for this device */ + irda_init_max_qos_capabilies(&mcs->qos); + + /* That's the Rx capability. */ + mcs->qos.baud_rate.bits &= + IR_2400 | IR_9600 | IR_19200 | IR_38400 | IR_57600 | IR_115200 + | IR_576000 | IR_1152000 | (IR_4000000 << 8); + + + mcs->qos.min_turn_time.bits &= qos_mtt_bits; + irda_qos_bits_to_value(&mcs->qos); + + /* Speed change work initialisation*/ + INIT_WORK(&mcs->work, mcs_speed_work, mcs); + + /* Override the network functions we need to use */ + ndev->hard_start_xmit = mcs_hard_xmit; + ndev->open = mcs_net_open; + ndev->stop = mcs_net_close; + ndev->get_stats = mcs_net_get_stats; + ndev->do_ioctl = mcs_net_ioctl; + + if (!intf->cur_altsetting) + goto error2; + + ret = mcs_find_endpoints(mcs, intf->cur_altsetting->endpoint, + intf->cur_altsetting->desc.bNumEndpoints); + if (!ret) { + ret = -ENODEV; + goto error2; + } + + ret = register_netdev(ndev); + if (ret != 0) + goto error2; + + IRDA_DEBUG(1, "IrDA: Registered MosChip MCS7780 device as %s", + ndev->name); + + mcs->transceiver_type = transceiver_type; + mcs->sir_tweak = sir_tweak; + mcs->receive_mode = receive_mode; + + usb_set_intfdata(intf, mcs); + return 0; + + error2: + free_netdev(ndev); + + error1: + return ret; +} + +/* The current device is removed, the USB layer tells us to shut down. */ +static void mcs_disconnect(struct usb_interface *intf) +{ + struct mcs_cb *mcs = usb_get_intfdata(intf); + + if (!mcs) + return; + + flush_scheduled_work(); + + unregister_netdev(mcs->netdev); + free_netdev(mcs->netdev); + + usb_set_intfdata(intf, NULL); + IRDA_DEBUG(0, "MCS7780 now disconnected."); +} + +/* Module insertion */ +static int __init mcs_init(void) +{ + int result; + + /* register this driver with the USB subsystem */ + result = usb_register(&mcs_driver); + if (result) + IRDA_ERROR("usb_register failed. Error number %d", result); + + return result; +} +module_init(mcs_init); + +/* Module removal */ +static void __exit mcs_exit(void) +{ + /* deregister this driver with the USB subsystem */ + usb_deregister(&mcs_driver); +} +module_exit(mcs_exit); + diff -puN /dev/null drivers/net/irda/mcs7780.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/drivers/net/irda/mcs7780.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,167 @@ +/***************************************************************************** +* +* Filename: mcs7780.h +* Version: 0.2-alpha +* Description: Irda MosChip USB Dongle +* Status: Experimental +* Authors: Lukasz Stelmach +* Brian Pugh +* +* Copyright (C) 2005, Lukasz Stelmach +* Copyright (C) 2005, Brian Pugh +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +*****************************************************************************/ +#ifndef _MCS7780_H +#define _MCS7780_H + +#define MCS_MODE_SIR 0 +#define MCS_MODE_MIR 1 +#define MCS_MODE_FIR 2 + +#define MCS_CTRL_TIMEOUT 500 +#define MCS_XMIT_TIMEOUT 500 +/* Possible transceiver types */ +#define MCS_TSC_VISHAY 0 /* Vishay TFD, default choice */ +#define MCS_TSC_AGILENT 1 /* Agilent 3602/3600 */ +#define MCS_TSC_SHARP 2 /* Sharp GP2W1000YP */ + +/* Requests */ +#define MCS_RD_RTYPE 0xC0 +#define MCS_WR_RTYPE 0x40 +#define MCS_RDREQ 0x0F +#define MCS_WRREQ 0x0E + +/* Register 0x00 */ +#define MCS_MODE_REG 0 +#define MCS_FIR ((__u16)0x0001) +#define MCS_SIR16US ((__u16)0x0002) +#define MCS_BBTG ((__u16)0x0004) +#define MCS_ASK ((__u16)0x0008) +#define MCS_PARITY ((__u16)0x0010) + +/* SIR/MIR speed constants */ +#define MCS_SPEED_SHIFT 5 +#define MCS_SPEED_MASK ((__u16)0x00E0) +#define MCS_SPEED(x) ((x & MCS_SPEED_MASK) >> MCS_SPEED_SHIFT) +#define MCS_SPEED_2400 ((0 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_9600 ((1 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_19200 ((2 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_38400 ((3 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_57600 ((4 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_115200 ((5 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_576000 ((6 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) +#define MCS_SPEED_1152000 ((7 << MCS_SPEED_SHIFT) & MCS_SPEED_MASK) + +#define MCS_PLLPWDN ((__u16)0x0100) +#define MCS_DRIVER ((__u16)0x0200) +#define MCS_DTD ((__u16)0x0400) +#define MCS_DIR ((__u16)0x0800) +#define MCS_SIPEN ((__u16)0x1000) +#define MCS_SENDSIP ((__u16)0x2000) +#define MCS_CHGDIR ((__u16)0x4000) +#define MCS_RESET ((__u16)0x8000) + +/* Register 0x02 */ +#define MCS_XCVR_REG 2 +#define MCS_MODE0 ((__u16)0x0001) +#define MCS_STFIR ((__u16)0x0002) +#define MCS_XCVR_CONF ((__u16)0x0004) +#define MCS_RXFAST ((__u16)0x0008) +/* TXCUR [6:4] */ +#define MCS_TXCUR_SHIFT 4 +#define MCS_TXCUR_MASK ((__u16)0x0070) +#define MCS_TXCUR(x) ((x & MCS_TXCUR_MASK) >> MCS_TXCUR_SHIFT) +#define MCS_SETTXCUR(x,y) \ + ((x & ~MCS_TXCUR_MASK) | (y << MCS_TXCUR_SHIFT) & MCS_TXCUR_MASK) + +#define MCS_MODE1 ((__u16)0x0080) +#define MCS_SMODE0 ((__u16)0x0100) +#define MCS_SMODE1 ((__u16)0x0200) +#define MCS_INVTX ((__u16)0x0400) +#define MCS_INVRX ((__u16)0x0800) + +#define MCS_MINRXPW_REG 4 + +#define MCS_RESV_REG 7 +#define MCS_IRINTX ((__u16)0x0001) +#define MCS_IRINRX ((__u16)0x0002) + +struct mcs_cb { + struct usb_device *usbdev; /* init: probe_irda */ + struct net_device *netdev; /* network layer */ + struct irlap_cb *irlap; /* The link layer we are binded to */ + struct net_device_stats stats; /* network statistics */ + struct qos_info qos; + unsigned int speed; /* Current speed */ + unsigned int new_speed; /* new speed */ + + struct work_struct work; /* Change speed work */ + + struct sk_buff *tx_pending; + char in_buf[4096]; /* transmit/receive buffer */ + char out_buf[4096]; /* transmit/receive buffer */ + __u8 *fifo_status; + + iobuff_t rx_buff; /* 
receive unwrap state machine */ + struct timeval rx_time; + spinlock_t lock; + int receiving; + + __u8 ep_in; + __u8 ep_out; + + struct urb *rx_urb; + struct urb *tx_urb; + + int transceiver_type; + int sir_tweak; + int receive_mode; +}; + +static int mcs_set_reg(struct mcs_cb *mcs, __u16 reg, __u16 val); +static int mcs_get_reg(struct mcs_cb *mcs, __u16 reg, __u16 * val); + +static inline int mcs_setup_transceiver_vishay(struct mcs_cb *mcs); +static inline int mcs_setup_transceiver_agilent(struct mcs_cb *mcs); +static inline int mcs_setup_transceiver_sharp(struct mcs_cb *mcs); +static inline int mcs_setup_transceiver(struct mcs_cb *mcs); +static inline int mcs_wrap_sir_skb(struct sk_buff *skb, __u8 * buf); +static unsigned mcs_wrap_fir_skb(const struct sk_buff *skb, __u8 *buf); +static unsigned mcs_wrap_mir_skb(const struct sk_buff *skb, __u8 *buf); +static void mcs_unwrap_mir(struct mcs_cb *mcs, __u8 *buf, int len); +static void mcs_unwrap_fir(struct mcs_cb *mcs, __u8 *buf, int len); +static inline int mcs_setup_urbs(struct mcs_cb *mcs); +static inline int mcs_receive_start(struct mcs_cb *mcs); +static inline int mcs_find_endpoints(struct mcs_cb *mcs, + struct usb_host_endpoint *ep, int epnum); + +static int mcs_speed_change(struct mcs_cb *mcs); + +static int mcs_net_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd); +static int mcs_net_close(struct net_device *netdev); +static int mcs_net_open(struct net_device *netdev); +static struct net_device_stats *mcs_net_get_stats(struct net_device *netdev); + +static void mcs_receive_irq(struct urb *urb, struct pt_regs *regs); +static void mcs_send_irq(struct urb *urb, struct pt_regs *regs); +static int mcs_hard_xmit(struct sk_buff *skb, struct net_device *netdev); + +static int mcs_probe(struct usb_interface *intf, + const struct usb_device_id *id); +static void mcs_disconnect(struct usb_interface *intf); + +#endif /* _MCS7780_H */ diff -puN drivers/net/irda/stir4200.c~git-net drivers/net/irda/stir4200.c --- devel/drivers/net/irda/stir4200.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/irda/stir4200.c 2006-05-29 15:05:00.000000000 -0700 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -173,9 +174,7 @@ struct stir_cb { struct qos_info qos; unsigned speed; /* Current speed */ - wait_queue_head_t thr_wait; /* transmit thread wakeup */ - struct completion thr_exited; - pid_t thr_pid; + struct task_struct *thread; /* transmit thread */ struct sk_buff *tx_pending; void *io_buf; /* transmit/receive buffer */ @@ -577,7 +576,7 @@ static int stir_hard_xmit(struct sk_buff SKB_LINEAR_ASSERT(skb); skb = xchg(&stir->tx_pending, skb); - wake_up(&stir->thr_wait); + wake_up_process(stir->thread); /* this should never happen unless stop/wakeup problem */ if (unlikely(skb)) { @@ -753,13 +752,7 @@ static int stir_transmit_thread(void *ar struct net_device *dev = stir->netdev; struct sk_buff *skb; - daemonize("%s", dev->name); - allow_signal(SIGTERM); - - while (netif_running(dev) - && netif_device_present(dev) - && !signal_pending(current)) - { + while (!kthread_should_stop()) { #ifdef CONFIG_PM /* if suspending, then power off and wait */ if (unlikely(freezing(current))) { @@ -813,10 +806,11 @@ static int stir_transmit_thread(void *ar } /* sleep if nothing to send */ - wait_event_interruptible(stir->thr_wait, stir->tx_pending); - } + set_current_state(TASK_INTERRUPTIBLE); + schedule(); - complete_and_exit (&stir->thr_exited, 0); + } + return 0; } @@ -859,7 +853,7 @@ static void 
stir_rcv_irq(struct urb *urb warn("%s: usb receive submit error: %d", stir->netdev->name, err); stir->receiving = 0; - wake_up(&stir->thr_wait); + wake_up_process(stir->thread); } } @@ -928,10 +922,10 @@ static int stir_net_open(struct net_devi } /** Start kernel thread for transmit. */ - stir->thr_pid = kernel_thread(stir_transmit_thread, stir, - CLONE_FS|CLONE_FILES); - if (stir->thr_pid < 0) { - err = stir->thr_pid; + stir->thread = kthread_run(stir_transmit_thread, stir, + "%s", stir->netdev->name); + if (IS_ERR(stir->thread)) { + err = PTR_ERR(stir->thread); err("stir4200: unable to start kernel thread"); goto err_out6; } @@ -968,8 +962,7 @@ static int stir_net_close(struct net_dev netif_stop_queue(netdev); /* Kill transmit thread */ - kill_proc(stir->thr_pid, SIGTERM, 1); - wait_for_completion(&stir->thr_exited); + kthread_stop(stir->thread); kfree(stir->fifo_status); /* Mop up receive urb's */ @@ -1084,9 +1077,6 @@ static int stir_probe(struct usb_interfa stir->qos.min_turn_time.bits &= qos_mtt_bits; irda_qos_bits_to_value(&stir->qos); - init_completion (&stir->thr_exited); - init_waitqueue_head (&stir->thr_wait); - /* Override the network functions we need to use */ net->hard_start_xmit = stir_hard_xmit; net->open = stir_net_open; diff -puN drivers/net/tg3.c~git-net drivers/net/tg3.c --- devel/drivers/net/tg3.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/tg3.c 2006-05-29 15:05:00.000000000 -0700 @@ -69,8 +69,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.58" -#define DRV_MODULE_RELDATE "May 22, 2006" +#define DRV_MODULE_VERSION "3.59" +#define DRV_MODULE_RELDATE "May 26, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -229,6 +229,8 @@ static struct pci_device_id tg3_pci_tbl[ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5755M, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, + { PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5786, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5787M, @@ -2965,6 +2967,29 @@ static int tg3_setup_phy(struct tg3 *tp, return err; } +/* This is called whenever we suspect that the system chipset is re- + * ordering the sequence of MMIO to the tx send mailbox. The symptom + * is bogus tx completions. We try to recover by setting the + * TG3_FLAG_MBOX_WRITE_REORDER flag and resetting the chip later + * in the workqueue. + */ +static void tg3_tx_recover(struct tg3 *tp) +{ + BUG_ON((tp->tg3_flags & TG3_FLAG_MBOX_WRITE_REORDER) || + tp->write32_tx_mbox == tg3_write_indirect_mbox); + + printk(KERN_WARNING PFX "%s: The system may be re-ordering memory-" + "mapped I/O cycles to the network device, attempting to " + "recover. Please report the problem to the driver maintainer " + "and include system chipset information.\n", tp->dev->name); + + spin_lock(&tp->lock); + spin_lock(&tp->tx_lock); + tp->tg3_flags |= TG3_FLAG_TX_RECOVERY_PENDING; + spin_unlock(&tp->tx_lock); + spin_unlock(&tp->lock); +} + /* Tigon3 never reports partial packet sends. So we do not * need special logic to handle SKBs that have not had all * of their frags sent yet, like SunGEM does. 
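Once tg3_tx_recover() has flagged TG3_FLAG_MBOX_WRITE_REORDER, the reset task below switches the mailbox accessors to flushing variants: each mailbox write is followed by a read-back so the posted MMIO write reaches the chip before anything later can be re-ordered ahead of it. A minimal sketch of that flush idiom, using only the generic writel/readl accessors; this is a hypothetical helper for illustration, not the driver's actual tg3_write_flush_reg32():

#include <linux/io.h>
#include <linux/types.h>

/* Flush a posted MMIO write by reading the same register straight back;
 * the read cannot complete until the preceding write has reached the
 * device, so subsequent writes can no longer pass it. */
static inline void mbox_write_flush(void __iomem *regs, u32 off, u32 val)
{
	writel(val, regs + off);
	readl(regs + off);
}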
@@ -2977,9 +3002,13 @@ static void tg3_tx(struct tg3 *tp) while (sw_idx != hw_idx) { struct tx_ring_info *ri = &tp->tx_buffers[sw_idx]; struct sk_buff *skb = ri->skb; - int i; + int i, tx_bug = 0; + + if (unlikely(skb == NULL)) { + tg3_tx_recover(tp); + return; + } - BUG_ON(skb == NULL); pci_unmap_single(tp->pdev, pci_unmap_addr(ri, mapping), skb_headlen(skb), @@ -2990,10 +3019,9 @@ static void tg3_tx(struct tg3 *tp) sw_idx = NEXT_TX(sw_idx); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - BUG_ON(sw_idx == hw_idx); - ri = &tp->tx_buffers[sw_idx]; - BUG_ON(ri->skb != NULL); + if (unlikely(ri->skb != NULL || sw_idx == hw_idx)) + tx_bug = 1; pci_unmap_page(tp->pdev, pci_unmap_addr(ri, mapping), @@ -3004,6 +3032,11 @@ static void tg3_tx(struct tg3 *tp) } dev_kfree_skb(skb); + + if (unlikely(tx_bug)) { + tg3_tx_recover(tp); + return; + } } tp->tx_cons = sw_idx; @@ -3331,6 +3364,11 @@ static int tg3_poll(struct net_device *n /* run TX completion thread */ if (sblk->idx[0].tx_consumer != tp->tx_cons) { tg3_tx(tp); + if (unlikely(tp->tg3_flags & TG3_FLAG_TX_RECOVERY_PENDING)) { + netif_rx_complete(netdev); + schedule_work(&tp->reset_task); + return 0; + } } /* run RX thread, within the bounds set by NAPI. @@ -3579,6 +3617,13 @@ static void tg3_reset_task(void *_data) restart_timer = tp->tg3_flags2 & TG3_FLG2_RESTART_TIMER; tp->tg3_flags2 &= ~TG3_FLG2_RESTART_TIMER; + if (tp->tg3_flags & TG3_FLAG_TX_RECOVERY_PENDING) { + tp->write32_tx_mbox = tg3_write32_tx_mbox; + tp->write32_rx_mbox = tg3_write_flush_reg32; + tp->tg3_flags |= TG3_FLAG_MBOX_WRITE_REORDER; + tp->tg3_flags &= ~TG3_FLAG_TX_RECOVERY_PENDING; + } + tg3_halt(tp, RESET_KIND_SHUTDOWN, 0); tg3_init_hw(tp, 1); diff -puN drivers/net/tg3.h~git-net drivers/net/tg3.h --- devel/drivers/net/tg3.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/drivers/net/tg3.h 2006-05-29 15:05:00.000000000 -0700 @@ -2155,11 +2155,7 @@ struct tg3 { #define TG3_FLAG_ENABLE_ASF 0x00000020 #define TG3_FLAG_5701_REG_WRITE_BUG 0x00000040 #define TG3_FLAG_POLL_SERDES 0x00000080 -#if defined(CONFIG_X86) #define TG3_FLAG_MBOX_WRITE_REORDER 0x00000100 -#else -#define TG3_FLAG_MBOX_WRITE_REORDER 0 /* disables code too */ -#endif #define TG3_FLAG_PCIX_TARGET_HWBUG 0x00000200 #define TG3_FLAG_WOL_SPEED_100MB 0x00000400 #define TG3_FLAG_WOL_ENABLE 0x00000800 @@ -2172,6 +2168,7 @@ struct tg3 { #define TG3_FLAG_PCI_HIGH_SPEED 0x00040000 #define TG3_FLAG_PCI_32BIT 0x00080000 #define TG3_FLAG_SRAM_USE_CONFIG 0x00100000 +#define TG3_FLAG_TX_RECOVERY_PENDING 0x00200000 #define TG3_FLAG_SERDES_WOL_CAP 0x00400000 #define TG3_FLAG_JUMBO_RING_ENABLE 0x00800000 #define TG3_FLAG_10_100_ONLY 0x01000000 diff -puN /dev/null include/linux/dmaengine.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/include/linux/dmaengine.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,359 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef DMAENGINE_H +#define DMAENGINE_H +#include +#ifdef CONFIG_DMA_ENGINE + +#include +#include +#include +#include +#include + +/** + * enum dma_event - resource PNP/power managment events + * @DMA_RESOURCE_SUSPEND: DMA device going into low power state + * @DMA_RESOURCE_RESUME: DMA device returning to full power + * @DMA_RESOURCE_ADDED: DMA device added to the system + * @DMA_RESOURCE_REMOVED: DMA device removed from the system + */ +enum dma_event { + DMA_RESOURCE_SUSPEND, + DMA_RESOURCE_RESUME, + DMA_RESOURCE_ADDED, + DMA_RESOURCE_REMOVED, +}; + +/** + * typedef dma_cookie_t + * + * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code + */ +typedef s32 dma_cookie_t; + +#define dma_submit_error(cookie) ((cookie) < 0 ? 1 : 0) + +/** + * enum dma_status - DMA transaction status + * @DMA_SUCCESS: transaction completed successfully + * @DMA_IN_PROGRESS: transaction not yet processed + * @DMA_ERROR: transaction failed + */ +enum dma_status { + DMA_SUCCESS, + DMA_IN_PROGRESS, + DMA_ERROR, +}; + +/** + * struct dma_chan_percpu - the per-CPU part of struct dma_chan + * @refcount: local_t used for open-coded "bigref" counting + * @memcpy_count: transaction counter + * @bytes_transferred: byte counter + */ + +struct dma_chan_percpu { + local_t refcount; + /* stats */ + unsigned long memcpy_count; + unsigned long bytes_transferred; +}; + +/** + * struct dma_chan - devices supply DMA channels, clients use them + * @client: ptr to the client user of this chan, will be NULL when unused + * @device: ptr to the dma device who supplies this channel, always !NULL + * @cookie: last cookie value returned to client + * @chan_id: + * @class_dev: + * @refcount: kref, used in "bigref" slow-mode + * @slow_ref: + * @rcu: + * @client_node: used to add this to the client chan list + * @device_node: used to add this to the device chan list + * @local: per-cpu pointer to a struct dma_chan_percpu + */ +struct dma_chan { + struct dma_client *client; + struct dma_device *device; + dma_cookie_t cookie; + + /* sysfs */ + int chan_id; + struct class_device class_dev; + + struct kref refcount; + int slow_ref; + struct rcu_head rcu; + + struct list_head client_node; + struct list_head device_node; + struct dma_chan_percpu *local; +}; + +void dma_chan_cleanup(struct kref *kref); + +static inline void dma_chan_get(struct dma_chan *chan) +{ + if (unlikely(chan->slow_ref)) + kref_get(&chan->refcount); + else { + local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); + put_cpu(); + } +} + +static inline void dma_chan_put(struct dma_chan *chan) +{ + if (unlikely(chan->slow_ref)) + kref_put(&chan->refcount, dma_chan_cleanup); + else { + local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); + put_cpu(); + } +} + +/* + * typedef dma_event_callback - function pointer to a DMA event callback + */ +typedef void (*dma_event_callback) (struct dma_client *client, + struct dma_chan *chan, enum dma_event event); + +/** + * struct dma_client - info on the entity making use of DMA services + * @event_callback: func ptr to call when something happens + * @chan_count: number of chans allocated + * @chans_desired: number of chans requested. 
Can be +/- chan_count + * @lock: protects access to the channels list + * @channels: the list of DMA channels allocated + * @global_node: list_head for global dma_client_list + */ +struct dma_client { + dma_event_callback event_callback; + unsigned int chan_count; + unsigned int chans_desired; + + spinlock_t lock; + struct list_head channels; + struct list_head global_node; +}; + +/** + * struct dma_device - info on the entity supplying DMA services + * @chancnt: how many DMA channels are supported + * @channels: the list of struct dma_chan + * @global_node: list_head for global dma_device_list + * @refcount: + * @done: + * @dev_id: + * Other func ptrs: used to make use of this device's capabilities + */ +struct dma_device { + + unsigned int chancnt; + struct list_head channels; + struct list_head global_node; + + struct kref refcount; + struct completion done; + + int dev_id; + + int (*device_alloc_chan_resources)(struct dma_chan *chan); + void (*device_free_chan_resources)(struct dma_chan *chan); + dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan, + void *dest, void *src, size_t len); + dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan, + struct page *page, unsigned int offset, void *kdata, + size_t len); + dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan, + struct page *dest_pg, unsigned int dest_off, + struct page *src_pg, unsigned int src_off, size_t len); + enum dma_status (*device_memcpy_complete)(struct dma_chan *chan, + dma_cookie_t cookie, dma_cookie_t *last, + dma_cookie_t *used); + void (*device_memcpy_issue_pending)(struct dma_chan *chan); +}; + +/* --- public DMA engine API --- */ + +struct dma_client *dma_async_client_register(dma_event_callback event_callback); +void dma_async_client_unregister(struct dma_client *client); +void dma_async_client_chan_request(struct dma_client *client, + unsigned int number); + +/** + * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses + * @chan: DMA channel to offload copy to + * @dest: destination address (virtual) + * @src: source address (virtual) + * @len: length + * + * Both @dest and @src must be mappable to a bus address according to the + * DMA mapping API rules for streaming mappings. + * Both @dest and @src must stay memory resident (kernel memory or locked + * user space pages) + */ +static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, + void *dest, void *src, size_t len) +{ + int cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len); +} + +/** + * dma_async_memcpy_buf_to_pg - offloaded copy + * @chan: DMA channel to offload copy to + * @page: destination page + * @offset: offset in page to copy to + * @kdata: source address (virtual) + * @len: length + * + * Both @page/@offset and @kdata must be mappable to a bus address according + * to the DMA mapping API rules for streaming mappings. 
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or + * locked user space pages) + */ +static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, + struct page *page, unsigned int offset, void *kdata, size_t len) +{ + int cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return chan->device->device_memcpy_buf_to_pg(chan, page, offset, + kdata, len); +} + +/** + * dma_async_memcpy_buf_to_pg - offloaded copy + * @chan: DMA channel to offload copy to + * @dest_page: destination page + * @dest_off: offset in page to copy to + * @src_page: source page + * @src_off: offset in page to copy from + * @len: length + * + * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus + * address according to the DMA mapping API rules for streaming mappings. + * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident + * (kernel memory or locked user space pages) + */ +static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, + struct page *dest_pg, unsigned int dest_off, struct page *src_pg, + unsigned int src_off, size_t len) +{ + int cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off, + src_pg, src_off, len); +} + +/** + * dma_async_memcpy_issue_pending - flush pending copies to HW + * @chan: + * + * This allows drivers to push copies to HW in batches, + * reducing MMIO writes where possible. + */ +static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan) +{ + return chan->device->device_memcpy_issue_pending(chan); +} + +/** + * dma_async_memcpy_complete - poll for transaction completion + * @chan: DMA channel + * @cookie: transaction identifier to check status of + * @last: returns last completed cookie, can be NULL + * @used: returns last issued cookie, can be NULL + * + * If @last and @used are passed in, upon return they reflect the driver + * internal state and can be used with dma_async_is_complete() to check + * the status of multiple cookies without re-checking hardware state. 
+ */ +static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan, + dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) +{ + return chan->device->device_memcpy_complete(chan, cookie, last, used); +} + +/** + * dma_async_is_complete - test a cookie against chan state + * @cookie: transaction identifier to test status of + * @last_complete: last know completed transaction + * @last_used: last cookie value handed out + * + * dma_async_is_complete() is used in dma_async_memcpy_complete() + * the test logic is seperated for lightweight testing of multiple cookies + */ +static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, + dma_cookie_t last_complete, dma_cookie_t last_used) +{ + if (last_complete <= last_used) { + if ((cookie <= last_complete) || (cookie > last_used)) + return DMA_SUCCESS; + } else { + if ((cookie <= last_complete) && (cookie > last_used)) + return DMA_SUCCESS; + } + return DMA_IN_PROGRESS; +} + + +/* --- DMA device --- */ + +int dma_async_device_register(struct dma_device *device); +void dma_async_device_unregister(struct dma_device *device); + +/* --- Helper iov-locking functions --- */ + +struct dma_page_list { + char *base_address; + int nr_pages; + struct page **pages; +}; + +struct dma_pinned_list { + int nr_iovecs; + struct dma_page_list page_list[0]; +}; + +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len); +void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list); + +dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len); +dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, struct page *page, + unsigned int offset, size_t len); + +#endif /* CONFIG_DMA_ENGINE */ +#endif /* DMAENGINE_H */ diff -puN include/linux/netdevice.h~git-net include/linux/netdevice.h --- devel/include/linux/netdevice.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/netdevice.h 2006-05-29 15:05:00.000000000 -0700 @@ -36,6 +36,7 @@ #include #include +#include struct divert_blk; struct vlan_group; @@ -592,6 +593,9 @@ struct softnet_data struct sk_buff *completion_queue; struct net_device backlog_dev; /* Sorry. 
8) */ +#ifdef CONFIG_NET_DMA + struct dma_chan *net_dma; +#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff -puN include/linux/pci_ids.h~git-net include/linux/pci_ids.h --- devel/include/linux/pci_ids.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/pci_ids.h 2006-05-29 15:05:00.000000000 -0700 @@ -1912,6 +1912,7 @@ #define PCI_DEVICE_ID_TIGON3_5751F 0x167e #define PCI_DEVICE_ID_TIGON3_5787M 0x1693 #define PCI_DEVICE_ID_TIGON3_5782 0x1696 +#define PCI_DEVICE_ID_TIGON3_5786 0x169a #define PCI_DEVICE_ID_TIGON3_5787 0x169b #define PCI_DEVICE_ID_TIGON3_5788 0x169c #define PCI_DEVICE_ID_TIGON3_5789 0x169d @@ -2068,6 +2069,7 @@ #define PCI_DEVICE_ID_INTEL_80960_RP 0x1960 #define PCI_DEVICE_ID_INTEL_82840_HB 0x1a21 #define PCI_DEVICE_ID_INTEL_82845_HB 0x1a30 +#define PCI_DEVICE_ID_INTEL_IOAT 0x1a38 #define PCI_DEVICE_ID_INTEL_82801AA_0 0x2410 #define PCI_DEVICE_ID_INTEL_82801AA_1 0x2411 #define PCI_DEVICE_ID_INTEL_82801AA_3 0x2413 diff -puN include/linux/skbuff.h~git-net include/linux/skbuff.h --- devel/include/linux/skbuff.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/skbuff.h 2006-05-29 15:05:00.000000000 -0700 @@ -28,6 +28,7 @@ #include #include #include +#include #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ @@ -284,6 +285,9 @@ struct sk_buff { __u16 tc_verd; /* traffic control verdict */ #endif #endif +#ifdef CONFIG_NET_DMA + dma_cookie_t dma_cookie; +#endif /* These elements must be at the end, see alloc_skb() for details. */ diff -puN include/linux/sysctl.h~git-net include/linux/sysctl.h --- devel/include/linux/sysctl.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/sysctl.h 2006-05-29 15:05:00.000000000 -0700 @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff -puN include/linux/tcp.h~git-net include/linux/tcp.h --- devel/include/linux/tcp.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/tcp.h 2006-05-29 15:05:00.000000000 -0700 @@ -18,6 +18,7 @@ #define _LINUX_TCP_H #include +#include #include struct tcphdr { @@ -232,6 +233,13 @@ struct tcp_sock { struct iovec *iov; int memory; int len; +#ifdef CONFIG_NET_DMA + /* members for async copy */ + struct dma_chan *dma_chan; + int wakeup; + struct dma_pinned_list *pinned_list; + dma_cookie_t dma_cookie; +#endif } ucopy; __u32 snd_wl1; /* Sequence for window update */ diff -puN include/linux/xfrm.h~git-net include/linux/xfrm.h --- devel/include/linux/xfrm.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/linux/xfrm.h 2006-05-29 15:05:00.000000000 -0700 @@ -118,6 +118,10 @@ enum XFRM_SHARE_UNIQUE /* Use once */ }; +#define XFRM_MODE_TRANSPORT 0 +#define XFRM_MODE_TUNNEL 1 +#define XFRM_MODE_MAX 2 + /* Netlink configuration messages. */ enum { XFRM_MSG_BASE = 0x10, diff -puN include/net/llc_if.h~git-net include/net/llc_if.h --- devel/include/net/llc_if.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/net/llc_if.h 2006-05-29 15:05:00.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #include #define LLC_DATAUNIT_PRIM 1 @@ -61,8 +62,6 @@ #define LLC_STATUS_CONFLICT 7 /* disconnect conn */ #define LLC_STATUS_RESET_DONE 8 /* */ -extern u8 llc_mac_null_var[IFHWADDRLEN]; - /** * llc_mac_null - determines if a address is a null mac address * @mac: Mac address to test if null. 
@@ -70,16 +69,20 @@ extern u8 llc_mac_null_var[IFHWADDRLEN]; * Determines if a given address is a null mac address. Returns 0 if the * address is not a null mac, 1 if the address is a null mac. */ -static __inline__ int llc_mac_null(u8 *mac) +static inline int llc_mac_null(const u8 *mac) { - return !memcmp(mac, llc_mac_null_var, IFHWADDRLEN); + return is_zero_ether_addr(mac); } -static __inline__ int llc_addrany(struct llc_addr *addr) +static inline int llc_addrany(const struct llc_addr *addr) { return llc_mac_null(addr->mac) && !addr->lsap; } +static inline int llc_mac_multicast(const u8 *mac) +{ + return is_multicast_ether_addr(mac); +} /** * llc_mac_match - determines if two mac addresses are the same * @mac1: First mac address to compare. @@ -89,9 +92,9 @@ static __inline__ int llc_addrany(struct * is not a complete match up to len, 1 if a complete match up to len is * found. */ -static __inline__ int llc_mac_match(u8 *mac1, u8 *mac2) +static inline int llc_mac_match(const u8 *mac1, const u8 *mac2) { - return !memcmp(mac1, mac2, IFHWADDRLEN); + return !compare_ether_addr(mac1, mac2); } extern int llc_establish_connection(struct sock *sk, u8 *lmac, diff -puN /dev/null include/net/netdma.h --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/include/net/netdma.h 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,44 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ +#ifndef NETDMA_H +#define NETDMA_H +#include +#ifdef CONFIG_NET_DMA +#include +#include + +static inline struct dma_chan *get_softnet_dma(void) +{ + struct dma_chan *chan; + rcu_read_lock(); + chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + if (chan) + dma_chan_get(chan); + rcu_read_unlock(); + return chan; +} + +int dma_skb_copy_datagram_iovec(struct dma_chan* chan, + const struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list); + +#endif /* CONFIG_NET_DMA */ +#endif /* NETDMA_H */ diff -puN include/net/sock.h~git-net include/net/sock.h --- devel/include/net/sock.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/net/sock.h 2006-05-29 15:05:00.000000000 -0700 @@ -131,6 +131,7 @@ struct sock_common { * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue + * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -204,6 +205,7 @@ struct sock { atomic_t sk_omem_alloc; struct sk_buff_head sk_receive_queue; struct sk_buff_head sk_write_queue; + struct sk_buff_head sk_async_wait_queue; int sk_wmem_queued; int sk_forward_alloc; gfp_t sk_allocation; @@ -1270,11 +1272,22 @@ sock_recv_timestamp(struct msghdr *msg, * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, &sk->sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(&sk->sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff -puN include/net/tcp.h~git-net include/net/tcp.h --- devel/include/net/tcp.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/net/tcp.h 2006-05-29 15:05:00.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -217,6 +218,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -292,6 +294,8 @@ extern int tcp_rcv_established(struct extern void tcp_rcv_space_adjust(struct sock *sk); +extern void tcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); @@ -816,6 +820,12 @@ static inline void tcp_prequeue_init(str tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + tp->ucopy.wakeup = 0; + tp->ucopy.pinned_list = NULL; + tp->ucopy.dma_cookie = 0; +#endif } /* Packet is added to VJ-style prequeue for processing in process diff -puN include/net/xfrm.h~git-net include/net/xfrm.h --- devel/include/net/xfrm.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/include/net/xfrm.h 2006-05-29 15:05:00.000000000 -0700 @@ -20,6 +20,8 @@ #include 
#define XFRM_ALIGN8(len) (((len) + 7) & ~7) +#define MODULE_ALIAS_XFRM_MODE(family, encap) \ + MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) extern struct sock *xfrm_nl; extern u32 sysctl_xfrm_aevent_etime; @@ -164,6 +166,7 @@ struct xfrm_state /* Reference to data common to all the instances of this * transformer. */ struct xfrm_type *type; + struct xfrm_mode *mode; /* Security context */ struct xfrm_sec_ctx *security; @@ -204,8 +207,8 @@ struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { unsigned short family; - rwlock_t lock; - struct xfrm_type_map *type_map; + struct xfrm_type *type_map[IPPROTO_MAX]; + struct xfrm_mode *mode_map[XFRM_MODE_MAX]; struct dst_ops *dst_ops; void (*garbage_collect)(void); int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); @@ -232,7 +235,6 @@ extern int __xfrm_state_delete(struct xf struct xfrm_state_afinfo { unsigned short family; - rwlock_t lock; struct list_head *state_bydst; struct list_head *state_byspi; int (*init_flags)(struct xfrm_state *x); @@ -264,16 +266,24 @@ struct xfrm_type u32 (*get_max_size)(struct xfrm_state *, int size); }; -struct xfrm_type_map { - rwlock_t lock; - struct xfrm_type *map[256]; -}; - extern int xfrm_register_type(struct xfrm_type *type, unsigned short family); extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); extern void xfrm_put_type(struct xfrm_type *type); +struct xfrm_mode { + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct sk_buff *skb); + + struct module *owner; + unsigned int encap; +}; + +extern int xfrm_register_mode(struct xfrm_mode *mode, int family); +extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family); +extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family); +extern void xfrm_put_mode(struct xfrm_mode *mode); + struct xfrm_tmpl { /* id in template is interpreted as: diff -puN net/bridge/br.c~git-net net/bridge/br.c --- devel/net/bridge/br.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/br.c 2006-05-29 15:05:00.000000000 -0700 @@ -30,36 +30,46 @@ static struct llc_sap *br_stp_sap; static int __init br_init(void) { + int err; + br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); if (!br_stp_sap) { printk(KERN_ERR "bridge: can't register sap for STP\n"); - return -EBUSY; + return -EADDRINUSE; } br_fdb_init(); -#ifdef CONFIG_BRIDGE_NETFILTER - if (br_netfilter_init()) - return 1; -#endif + err = br_netfilter_init(); + if (err) + goto err_out1; + + err = register_netdevice_notifier(&br_device_notifier); + if (err) + goto err_out2; + + br_netlink_init(); brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; br_fdb_get_hook = br_fdb_get; br_fdb_put_hook = br_fdb_put; - register_netdevice_notifier(&br_device_notifier); - return 0; + +err_out2: + br_netfilter_fini(); +err_out1: + llc_sap_put(br_stp_sap); + return err; } static void __exit br_deinit(void) { rcu_assign_pointer(br_stp_sap->rcv_func, NULL); -#ifdef CONFIG_BRIDGE_NETFILTER + br_netlink_fini(); br_netfilter_fini(); -#endif unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff -puN net/bridge/br_forward.c~git-net net/bridge/br_forward.c --- devel/net/bridge/br_forward.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/br_forward.c 2006-05-29 15:05:00.000000000 -0700 @@ -20,14 +20,11 @@ #include #include "br_private.h" +/* Don't forward packets to originating 
port or forwarding diasabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { - if (skb->dev == p->dev || - p->state != BR_STATE_FORWARDING) - return 0; - - return 1; + return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING); } static inline unsigned packet_length(const struct sk_buff *skb) @@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buf int br_forward_finish(struct sk_buff *skb) { - NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, - br_dev_queue_push_xmit); + return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + br_dev_queue_push_xmit); - return 0; } static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) diff -puN /dev/null net/bridge/br_netlink.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/bridge/br_netlink.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,199 @@ +/* + * Bridge netlink control interface + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "br_private.h" + +/* + * Create one netlink message for one interface + * Contains port and master info as well as carrier and bridge state. + */ +static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, + u32 pid, u32 seq, int event, unsigned int flags) +{ + const struct net_bridge *br = port->br; + const struct net_device *dev = port->dev; + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + u32 mtu = dev->mtu; + u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; + u8 portstate = port->state; + + pr_debug("br_fill_info event %d port %s master %s\n", + event, dev->name, br->dev->name); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); + r = NLMSG_DATA(nlh); + r->ifi_family = AF_BRIDGE; + r->__ifi_pad = 0; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex); + + if (dev->addr_len) + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + + + RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate); + + if (event == RTM_NEWLINK) + RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate); + + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: +rtattr_failure: + + skb_trim(skb, b - skb->data); + return -EINVAL; +} + +/* + * Notify listeners of a change in port information + */ +void br_ifinfo_notify(int event, struct net_bridge_port *port) +{ + struct sk_buff *skb; + int err = -ENOMEM; + + pr_debug("bridge notify event=%d\n", event); + skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128), + GFP_ATOMIC); + if (!skb) + goto err_out; + + err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0); + if (err) + goto err_kfree; + + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); + return; + +err_kfree: + kfree_skb(skb); +err_out: + netlink_set_err(rtnl, 0, RTNLGRP_LINK, err); +} + +/* + * Dump information about all ports, in response to GETLINK + */ +static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + int idx; + int s_idx = cb->args[0]; + int err = 0; + + read_lock(&dev_base_lock); + for (dev = dev_base, idx = 0; dev; dev = dev->next) { + struct net_bridge_port *p = dev->br_port; + + /* not a bridge port */ + if (!p) + continue; + + if (idx < s_idx) + continue; + + err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); + if (err <= 0) + break; + ++idx; + } + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + + return skb->len; +} + +/* + * Change state of port (ie from forwarding to blocking etc) + * Used by spanning tree in user space. 
+ */ +static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + struct net_device *dev; + struct net_bridge_port *p; + u8 new_state; + + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + /* Must pass valid state as PROTINFO */ + if (rta[IFLA_PROTINFO-1]) { + u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]); + new_state = *pstate; + } else + return -EINVAL; + + if (new_state > BR_STATE_BLOCKING) + return -EINVAL; + + /* Find bridge port */ + dev = __dev_get_by_index(ifm->ifi_index); + if (!dev) + return -ENODEV; + + p = dev->br_port; + if (!p) + return -EINVAL; + + /* if kernel STP is running, don't allow changes */ + if (p->br->stp_enabled) + return -EBUSY; + + if (!netif_running(dev)) + return -ENETDOWN; + + if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED) + return -ENETDOWN; + + p->state = new_state; + br_log_state(p); + return 0; +} + + +static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = { + [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, }, + [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, }, +}; + +void __init br_netlink_init(void) +{ + rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table; +} + +void __exit br_netlink_fini(void) +{ + rtnetlink_links[PF_BRIDGE] = NULL; +} + diff -puN net/bridge/br_notify.c~git-net net/bridge/br_notify.c --- devel/net/bridge/br_notify.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/br_notify.c 2006-05-29 15:05:00.000000000 -0700 @@ -14,6 +14,7 @@ */ #include +#include #include "br_private.h" @@ -49,6 +50,7 @@ static int br_device_event(struct notifi case NETDEV_CHANGEADDR: br_fdb_changeaddr(p, dev->dev_addr); + br_ifinfo_notify(RTM_NEWLINK, p); br_stp_recalculate_bridge_id(br); break; diff -puN net/bridge/br_private.h~git-net net/bridge/br_private.h --- devel/net/bridge/br_private.h~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/br_private.h 2006-05-29 15:05:00.000000000 -0700 @@ -29,7 +29,7 @@ #define BR_PORT_DEBOUNCE (HZ/10) -#define BR_VERSION "2.1" +#define BR_VERSION "2.2" typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; @@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_devic extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); /* br_netfilter.c */ +#ifdef CONFIG_BRIDGE_NETFILTER extern int br_netfilter_init(void); extern void br_netfilter_fini(void); +#else +#define br_netfilter_init() (0) +#define br_netfilter_fini() do { } while(0) +#endif /* br_stp.c */ extern void br_log_state(const struct net_bridge_port *p); @@ -232,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); +/* br_netlink.c */ +extern void br_netlink_init(void); +extern void br_netlink_fini(void); +extern void br_ifinfo_notify(int event, struct net_bridge_port *port); + #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ extern struct sysfs_ops brport_sysfs_ops; diff -puN net/bridge/br_stp_if.c~git-net net/bridge/br_stp_if.c --- devel/net/bridge/br_stp_if.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/br_stp_if.c 2006-05-29 15:05:00.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #include "br_private.h" #include "br_private_stp.h" @@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_br void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); + br_ifinfo_notify(RTM_NEWLINK, p); br_port_state_selection(p->br); } @@ -99,6 
+101,8 @@ void br_stp_disable_port(struct net_brid printk(KERN_INFO "%s: port %i(%s) entering %s state\n", br->dev->name, p->port_no, p->dev->name, "disabled"); + br_ifinfo_notify(RTM_DELLINK, p); + wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; diff -puN net/bridge/Makefile~git-net net/bridge/Makefile --- devel/net/bridge/Makefile~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/bridge/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o + br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff -puN net/core/dev.c~git-net net/core/dev.c --- devel/net/core/dev.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/core/dev.c 2006-05-29 15:05:00.000000000 -0700 @@ -115,6 +115,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtnl * semaphore. @@ -1846,6 +1853,19 @@ static void net_rx_action(struct softirq } } out: +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3300,6 +3320,88 @@ static int dev_cpu_callback(struct notif } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n; + struct dma_chan *chan; + + lock_cpu_hotplug(); + + if (net_dma_count == 0) { + for_each_online_cpu(cpu) + rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); + unlock_cpu_hotplug(); + return; + } + + i = 0; + cpu = first_cpu(cpu_online_map); + + rcu_read_lock(); + list_for_each_entry(chan, &net_dma_client->channels, client_node) { + n = ((num_online_cpus() / net_dma_count) + + (i < (num_online_cpus() % net_dma_count) ? 
1 : 0)); + + while(n) { + per_cpu(softnet_data.net_dma, cpu) = chan; + cpu = next_cpu(cpu, cpu_online_map); + n--; + } + i++; + } + rcu_read_unlock(); + + unlock_cpu_hotplug(); +} + +/** + * netdev_dma_event - event callback for the net_dma_client + * @client: should always be net_dma_client + * @chan: + * @event: + */ +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_event event) +{ + spin_lock(&net_dma_event_lock); + switch (event) { + case DMA_RESOURCE_ADDED: + net_dma_count++; + net_dma_rebalance(); + break; + case DMA_RESOURCE_REMOVED: + net_dma_count--; + net_dma_rebalance(); + break; + default: + break; + } + spin_unlock(&net_dma_event_lock); +} + +/** + * netdev_dma_regiser - register the networking subsystem as a DMA client + */ +static int __init netdev_dma_register(void) +{ + spin_lock_init(&net_dma_event_lock); + net_dma_client = dma_async_client_register(netdev_dma_event); + if (net_dma_client == NULL) + return -ENOMEM; + + dma_async_client_chan_request(net_dma_client, num_online_cpus()); + return 0; +} + +#else +static int __init netdev_dma_register(void) { return -ENODEV; } +#endif /* CONFIG_NET_DMA */ /* * Initialize the DEV module. At boot time this walks the device list and @@ -3353,6 +3455,8 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } + netdev_dma_register(); + dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); diff -puN net/core/Makefile~git-net net/core/Makefile --- devel/net/core/Makefile~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/core/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_WIRELESS_EXT) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NET_DMA) += user_dma.o diff -puN net/core/sock.c~git-net net/core/sock.c --- devel/net/core/sock.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/core/sock.c 2006-05-29 15:05:00.000000000 -0700 @@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif rwlock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif sk->sk_send_head = NULL; diff -puN /dev/null net/core/user_dma.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/core/user_dma.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,131 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include +#include +#include /* for BUG_TRAP */ +#include + +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + +/** + * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * @pinned_list - locked iovec buffer data + * + * Note: the iovec is modified during the copy. + */ +int dma_skb_copy_datagram_iovec(struct dma_chan *chan, + struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + dma_cookie_t cookie = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_memcpy_to_iovec(chan, to, pinned_list, + skb->data + offset, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + copy = end - offset; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + + cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, + frag->page_offset + offset - start, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, list, + offset - start, to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + } + +end: + if (!len) { + skb->dma_cookie = cookie; + return cookie; + } + +fault: + return -EFAULT; +} diff -puN net/dccp/proto.c~git-net net/dccp/proto.c --- devel/net/dccp/proto.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/dccp/proto.c 2006-05-29 15:05:00.000000000 -0700 @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str } dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff -puN net/ipv4/ah4.c~git-net net/ipv4/ah4.c --- devel/net/ipv4/ah4.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/ah4.c 2006-05-29 15:05:00.000000000 -0700 @@ -119,6 +119,7 @@ error: static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; + int 
ihl; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; @@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x ah = (struct ip_auth_hdr*)skb->data; iph = skb->nh.iph; - memcpy(work_buf, iph, iph->ihl*4); + ihl = skb->data - skb->nh.raw; + memcpy(work_buf, iph, ihl); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; - if (iph->ihl != 5) { + if (ihl > sizeof(*iph)) { u32 dummy; if (ip_clear_mutable_options(iph, &dummy)) goto out; @@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x u8 auth_data[MAX_AH_AUTH_LEN]; memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, ihl); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { x->stats.integrity_failed++; @@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, work_buf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); - skb_pull(skb, skb->nh.iph->ihl*4); - skb->h.raw = skb->data; + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + __skb_pull(skb, ah_hlen + ihl); return 0; diff -puN net/ipv4/esp4.c~git-net net/ipv4/esp4.c --- devel/net/ipv4/esp4.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/esp4.c 2006-05-29 15:05:00.000000000 -0700 @@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state * int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; int nfrags; - int encap_len = 0; + int ihl; u8 nexthdr[2]; struct scatterlist *sg; - u8 workbuf[60]; int padlen; if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) @@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state * skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr*)skb->data; - iph = skb->nh.iph; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state * /* ... check padding bits here. Silly. 
:-) */ + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - - uh = (struct udphdr *)(iph + 1); - encap_len = (void*)esph - (void*)uh; + struct udphdr *uh = (void *)(skb->nh.raw + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state * iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - memcpy(workbuf, skb->nh.raw, iph->ihl*4); - skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, workbuf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; return 0; diff -puN net/ipv4/ipcomp.c~git-net net/ipv4/ipcomp.c --- devel/net/ipv4/ipcomp.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/ipcomp.c 2006-05-29 15:05:00.000000000 -0700 @@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { int err, plen, dlen; - struct iphdr *iph; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; struct crypto_tfm *tfm; @@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); - iph = skb->nh.iph; - iph->tot_len = htons(dlen + iph->ihl * 4); out: put_cpu(); return err; @@ -83,14 +80,9 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - u8 nexthdr; int err = 0; struct iphdr *iph; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; - + struct ip_comp_hdr *ipch; if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && skb_linearize(skb, GFP_ATOMIC) != 0) { @@ -102,15 +94,10 @@ static int ipcomp_input(struct xfrm_stat /* Remove ipcomp header and decompress original payload */ iph = skb->nh.iph; - memcpy(&tmp_iph, iph, iph->ihl * 4); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ip_comp_hdr)); - skb->nh.raw += sizeof(struct ip_comp_hdr); - memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); - iph = skb->nh.iph; - iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); - iph->protocol = nexthdr; - skb->h.raw = skb->data; + ipch = (void *)skb->data; + iph->protocol = ipch->nexthdr; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); out: diff -puN net/ipv4/Kconfig~git-net net/ipv4/Kconfig --- devel/net/ipv4/Kconfig~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/Kconfig 2006-05-29 15:05:00.000000000 -0700 @@ -414,6 +414,24 @@ config INET_TUNNEL tristate default n +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. 
+ config INET_DIAG tristate "INET: socket monitoring interface" default y diff -puN net/ipv4/Makefile~git-net net/ipv4/Makefile diff -puN net/ipv4/sysctl_net_ipv4.c~git-net net/ipv4/sysctl_net_ipv4.c --- devel/net/ipv4/sysctl_net_ipv4.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/sysctl_net_ipv4.c 2006-05-29 15:05:00.000000000 -0700 @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif { .ctl_name = 0 } }; diff -puN net/ipv4/tcp.c~git-net net/ipv4/tcp.c --- devel/net/ipv4/tcp.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/tcp.c 2006-05-29 15:05:00.000000000 -0700 @@ -263,7 +263,7 @@ #include #include #include - +#include #include #include @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_ break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc->count) break; } @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_ /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, stru target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1220,7 +1232,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1345,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! 
*/ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1397,19 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len > 0); @@ -1386,12 +1432,42 @@ skip_copy: tp->ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + while (dma_async_memcpy_complete(tp->ucopy.dma_chan, + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { + /* do partial cleanup of sk_async_wait_queue */ + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + dma_chan_put(tp->ucopy.dma_chan); + tp->ucopy.dma_chan = NULL; + } + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1658,6 +1734,9 @@ int tcp_disconnect(struct sock *sk, int __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif inet->dport = 0; @@ -1858,7 +1937,7 @@ static int do_tcp_setsockopt(struct sock (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } diff -puN net/ipv4/tcp_input.c~git-net net/ipv4/tcp_input.c --- devel/net/ipv4/tcp_input.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_input.c 2006-05-29 15:05:00.000000000 -0700 @@ -71,6 +71,7 @@ #include #include #include +#include int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; @@ -3787,6 +3788,50 @@ static inline int tcp_checksum_complete_ __tcp_checksum_complete_user(sk, skb); } +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; 
+ tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + /* * TCP receive function for the ESTABLISHED state. * @@ -3888,8 +3933,6 @@ int tcp_rcv_established(struct sock *sk, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); - /* We know that such packets are checksummed * on entry. */ @@ -3903,14 +3946,23 @@ int tcp_rcv_established(struct sock *sk, } } else { int eaten = 0; + int copied_early = 0; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -3926,8 +3978,9 @@ int tcp_rcv_established(struct sock *sk, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -3968,6 +4021,11 @@ int tcp_rcv_established(struct sock *sk, __tcp_ack_snd_check(sk, 0); no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif if (eaten) __kfree_skb(skb); else diff -puN net/ipv4/tcp_ipv4.c~git-net net/ipv4/tcp_ipv4.c --- devel/net/ipv4/tcp_ipv4.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/tcp_ipv4.c 2006-05-29 15:05:00.000000000 -0700 @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -1091,8 +1092,18 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff -puN net/ipv4/xfrm4_input.c~git-net net/ipv4/xfrm4_input.c --- devel/net/ipv4/xfrm4_input.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/xfrm4_input.c 2006-05-29 15:05:00.000000000 -0700 @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) EXPORT_SYMBOL(xfrm4_rcv); -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} - static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) { switch (nexthdr) { @@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, xfrm_vec[xfrm_nr++] = x; - iph = skb->nh.iph; + if (x->mode->input(x, skb)) + goto drop; if (x->props.mode) { - if (iph->protocol != IPPROTO_IPIP) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); decaps = 1; break; } diff -puN /dev/null net/ipv4/xfrm4_mode_transport.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/ipv4/xfrm4_mode_transport.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,83 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x; + struct iphdr *iph; + int ihl; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + ihl = iph->ihl * 4; + skb->h.raw += ihl; + + x = skb->dst->xfrm; + skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. 
+ */ +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.iph->tot_len = htons(skb->len + ihl); + skb->h.raw = skb->data; + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff -puN /dev/null net/ipv4/xfrm4_mode_tunnel.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/ipv4/xfrm4_mode_tunnel.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,125 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int flags; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 
+ 0 : (iph->frag_off & htons(IP_DF)); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst->child, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + return 0; +} + +static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int err = -EINVAL; + + if (iph->protocol != IPPROTO_IPIP) + goto out; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input = xfrm4_tunnel_input, + .output = xfrm4_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm4_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_tunnel_init); +module_exit(xfrm4_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff -puN net/ipv4/xfrm4_output.c~git-net net/ipv4/xfrm4_output.c --- devel/net/ipv4/xfrm4_output.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/xfrm4_output.c 2006-05-29 15:05:00.000000000 -0700 @@ -12,67 +12,10 @@ #include #include #include -#include #include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header will be moved forward to make space - * for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. - */ -static void xfrm4_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph, *top_iph; - int flags; - - iph = skb->nh.iph; - skb->h.ipiph = iph; - - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; - - if (!x->props.mode) { - skb->h.raw += iph->ihl*4; - memmove(top_iph, iph, iph->ihl*4); - return; - } - - top_iph->ihl = 5; - top_iph->version = 4; - - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? 
- 0 : (iph->frag_off & htons(IP_DF)); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); - - top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - top_iph->protocol = IPPROTO_IPIP; - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -} - static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_bu if (err) goto error; - xfrm4_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff -puN net/ipv4/xfrm4_policy.c~git-net net/ipv4/xfrm4_policy.c --- devel/net/ipv4/xfrm4_policy.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/xfrm4_policy.c 2006-05-29 15:05:00.000000000 -0700 @@ -17,8 +17,6 @@ static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { return __ip_route_output_key((struct rtable**)dst, fl); @@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, st static inline int xfrm4_garbage_collect(void) { - read_lock(&xfrm4_policy_afinfo.lock); xfrm4_policy_afinfo.garbage_collect(); - read_unlock(&xfrm4_policy_afinfo.lock); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); } @@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm4_type_map, .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .find_bundle = __xfrm4_find_bundle, diff -puN net/ipv4/xfrm4_state.c~git-net net/ipv4/xfrm4_state.c --- devel/net/ipv4/xfrm4_state.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv4/xfrm4_state.c 2006-05-29 15:05:00.000000000 -0700 @@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .state_lookup = __xfrm4_state_lookup, diff -puN net/ipv6/ah6.c~git-net net/ipv6/ah6.c --- devel/net/ipv6/ah6.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/ah6.c 2006-05-29 15:05:00.000000000 -0700 @@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state * memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, hdr_len); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); @@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state * } } - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); - skb_pull(skb, hdr_len); - skb->h.raw = skb->data; - + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); kfree(tmp_hdr); diff -puN net/ipv6/esp6.c~git-net net/ipv6/esp6.c --- devel/net/ipv6/esp6.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/esp6.c 2006-05-29 15:05:00.000000000 -0700 @@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state int hdr_len = skb->h.raw - skb->nh.raw; int nfrags; - unsigned char *tmp_hdr = NULL; int ret = 0; if 
(!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { ret = -EINVAL; - goto out_nofree; + goto out; } if (elen <= 0 || (elen & (blksize-1))) { ret = -EINVAL; - goto out_nofree; - } - - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) { - ret = -ENOMEM; - goto out_nofree; + goto out; } - memcpy(tmp_hdr, skb->nh.raw, hdr_len); /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { @@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state /* ... check padding bits here. Silly. :-) */ pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ret = nexthdr[1]; } + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; + out: - kfree(tmp_hdr); -out_nofree: return ret; } diff -puN net/ipv6/ip6_output.c~git-net net/ipv6/ip6_output.c --- devel/net/ipv6/ip6_output.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/ip6_output.c 2006-05-29 15:05:00.000000000 -0700 @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff * return offset; } +EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { diff -puN net/ipv6/ipcomp6.c~git-net net/ipv6/ipcomp6.c --- devel/net/ipv6/ipcomp6.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/ipcomp6.c 2006-05-29 15:05:00.000000000 -0700 @@ -66,10 +66,8 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; - u8 nexthdr = 0; - int hdr_len = skb->h.raw - skb->nh.raw; - unsigned char *tmp_hdr = NULL; struct ipv6hdr *iph; + struct ipv6_comp_hdr *ipch; int plen, dlen; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; @@ -86,17 +84,9 @@ static int ipcomp6_input(struct xfrm_sta /* Remove ipcomp header and decompress original payload */ iph = skb->nh.ipv6h; - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) - goto out; - memcpy(tmp_hdr, iph, hdr_len); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ipv6_comp_hdr)); - skb->nh.raw += sizeof(struct ipv6_comp_hdr); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - iph = skb->nh.ipv6h; - iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); - skb->h.raw = skb->data; + ipch = (void *)skb->data; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); /* decompression */ plen = skb->len; @@ -125,18 +115,11 @@ static int ipcomp6_input(struct xfrm_sta skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); + err = ipch->nexthdr; - iph = skb->nh.ipv6h; - iph->payload_len = htons(skb->len); - out_put_cpu: put_cpu(); out: - kfree(tmp_hdr); - if (err) - goto error_out; - return nexthdr; -error_out: return err; } diff -puN net/ipv6/Kconfig~git-net net/ipv6/Kconfig --- devel/net/ipv6/Kconfig~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/Kconfig 2006-05-29 15:05:00.000000000 -0700 @@ -106,6 +106,26 @@ config INET6_TUNNEL tristate default n +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. 
+ +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" select INET6_TUNNEL diff -puN net/ipv6/Makefile~git-net net/ipv6/Makefile --- devel/net/ipv6/Makefile~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/Makefile 2006-05-29 15:05:00.000000000 -0700 @@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff -puN net/ipv6/tcp_ipv6.c~git-net net/ipv6/tcp_ipv6.c --- devel/net/ipv6/tcp_ipv6.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/tcp_ipv6.c 2006-05-29 15:05:00.000000000 -0700 @@ -1218,8 +1218,16 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); diff -puN net/ipv6/xfrm6_input.c~git-net net/ipv6/xfrm6_input.c --- devel/net/ipv6/xfrm6_input.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/xfrm6_input.c 2006-05-29 15:05:00.000000000 -0700 @@ -13,21 +13,9 @@ #include #include #include -#include -#include -#include #include #include -static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) -{ - struct ipv6hdr *outer_iph = skb->nh.ipv6h; - struct ipv6hdr *inner_iph = skb->h.ipv6h; - - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); -} - int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) { int err; @@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u xfrm_vec[xfrm_nr++] = x; + if (x->mode->input(x, skb)) + goto drop; + if (x->props.mode) { /* XXX */ - if (nexthdr != IPPROTO_IPV6) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; decaps = 1; break; } diff -puN /dev/null net/ipv6/xfrm6_mode_transport.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/ipv6/xfrm6_mode_transport.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,88 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. 
+ * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x = skb->dst->xfrm; + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.ipv6h->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb->h.raw = skb->data; + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff -puN /dev/null net/ipv6/xfrm6_mode_tunnel.c --- /dev/null 2006-05-29 10:18:53.280907750 -0700 +++ devel-akpm/net/ipv6/xfrm6_mode_tunnel.c 2006-05-29 15:05:00.000000000 -0700 @@ -0,0 +1,121 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. 
+ */ +static int xfrm6_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff -puN net/ipv6/xfrm6_output.c~git-net net/ipv6/xfrm6_output.c --- devel/net/ipv6/xfrm6_output.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/xfrm6_output.c 2006-05-29 15:05:00.000000000 -0700 @@ -14,68 +14,9 @@ #include #include #include -#include -#include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header and mutable extension headers will be moved - * forward to make space for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * payload_len - * - * On exit, skb->h will be set to the start of the encapsulation header to be - * filled in by x->type->output and skb->nh will be set to the nextheader field - * of the extension header directly preceding the encapsulation header, or in - * its absence, that of the top IP header. The value of skb->data will always - * point to the top IP header. 
- */ -static void xfrm6_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct ipv6hdr *iph, *top_iph; - int dsfield; - - skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; - - if (!x->props.mode) { - u8 *prevhdr; - int hdr_len; - - hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; - memmove(skb->data, iph, hdr_len); - return; - } - - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; - - top_iph->version = 6; - top_iph->priority = iph->priority; - top_iph->flow_lbl[0] = iph->flow_lbl[0]; - top_iph->flow_lbl[1] = iph->flow_lbl[1]; - top_iph->flow_lbl[2] = iph->flow_lbl[2]; - dsfield = ipv6_get_dsfield(top_iph); - dsfield = INET_ECN_encapsulate(dsfield, dsfield); - if (x->props.flags & XFRM_STATE_NOECN) - dsfield &= ~INET_ECN_MASK; - ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->nexthdr = IPPROTO_IPV6; - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); -} - static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_bu if (err) goto error; - xfrm6_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff -puN net/ipv6/xfrm6_policy.c~git-net net/ipv6/xfrm6_policy.c --- devel/net/ipv6/xfrm6_policy.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/xfrm6_policy.c 2006-05-29 15:05:00.000000000 -0700 @@ -23,8 +23,6 @@ static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { int err = 0; @@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, st static inline int xfrm6_garbage_collect(void) { - read_lock(&xfrm6_policy_afinfo.lock); xfrm6_policy_afinfo.garbage_collect(); - read_unlock(&xfrm6_policy_afinfo.lock); return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); } @@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm6_type_map, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .find_bundle = __xfrm6_find_bundle, diff -puN net/ipv6/xfrm6_state.c~git-net net/ipv6/xfrm6_state.c --- devel/net/ipv6/xfrm6_state.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/ipv6/xfrm6_state.c 2006-05-29 15:05:00.000000000 -0700 @@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, .init_tempsel = __xfrm6_init_tempsel, .state_lookup = __xfrm6_state_lookup, .find_acq = __xfrm6_find_acq, diff -puN net/llc/af_llc.c~git-net net/llc/af_llc.c --- devel/net/llc/af_llc.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/llc/af_llc.c 2006-05-29 15:05:00.000000000 -0700 @@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb * lock_sock(sk); copied = -ENOTCONN; - if (sk->sk_state == TCP_LISTEN) + if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); @@ -733,7 +733,7 @@ static int 
llc_ui_recvmsg(struct kiocb * if (sk->sk_shutdown & RCV_SHUTDOWN) break; - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb * continue; if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len > 0); diff -puN net/llc/llc_if.c~git-net net/llc/llc_if.c --- devel/net/llc/llc_if.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/llc/llc_if.c 2006-05-29 15:05:00.000000000 -0700 @@ -26,8 +26,6 @@ #include #include -u8 llc_mac_null_var[IFHWADDRLEN]; - /** * llc_build_and_send_pkt - Connection data sending for upper layers. * @sk: connection diff -puN net/llc/llc_input.c~git-net net/llc/llc_input.c --- devel/net/llc/llc_input.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/llc/llc_input.c 2006-05-29 15:05:00.000000000 -0700 @@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; + int (*rcv)(struct sk_buff *, struct net_device *, + struct packet_type *, struct net_device *); /* * When the interface is in promisc. mode, drop all the crap that it @@ -169,9 +171,12 @@ int llc_rcv(struct sk_buff *skb, struct * First the upper layer protocols that don't need the full * LLC functionality */ - if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt, orig_dev); - goto out_put; + rcv = rcu_dereference(sap->rcv_func); + if (rcv) { + struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); + if (cskb) + rcv(cskb, dev, pt, orig_dev); + rcv(skb, dev, pt, orig_dev); } dest = llc_pdu_type(skb); if (unlikely(!dest || !llc_type_handlers[dest - 1])) diff -puN net/llc/llc_sap.c~git-net net/llc/llc_sap.c --- devel/net/llc/llc_sap.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/llc/llc_sap.c 2006-05-29 15:05:00.000000000 -0700 @@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap * * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, - struct llc_addr *laddr) + const struct llc_addr *laddr) { struct sock *rc; struct hlist_node *node; @@ -304,19 +304,62 @@ found: return rc; } +/** + * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connections with same sap. + * Deliver clone to each. 
+ */ +static void llc_sap_mcast(struct llc_sap *sap, + const struct llc_addr *laddr, + struct sk_buff *skb) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(sk, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *skb1; + + if (sk->sk_type != SOCK_DGRAM) + continue; + + if (llc->laddr.lsap != laddr->lsap) + continue; + + skb1 = skb_clone(skb, GFP_ATOMIC); + if (!skb1) + break; + + sock_hold(sk); + skb_set_owner_r(skb1, sk); + llc_sap_rcv(sap, skb1); + sock_put(sk); + } + read_unlock_bh(&sap->sk_list.lock); +} + + void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; - struct sock *sk; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); - sk = llc_lookup_dgram(sap, &laddr); - if (sk) { - skb_set_owner_r(skb, sk); - llc_sap_rcv(sap, skb); - sock_put(sk); - } else + if (llc_mac_multicast(laddr.mac)) { + llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); + } else { + struct sock *sk = llc_lookup_dgram(sap, &laddr); + if (sk) { + skb_set_owner_r(skb, sk); + llc_sap_rcv(sap, skb); + sock_put(sk); + } else + kfree_skb(skb); + } } diff -puN net/xfrm/xfrm_policy.c~git-net net/xfrm/xfrm_policy.c --- devel/net/xfrm/xfrm_policy.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ devel-akpm/net/xfrm/xfrm_policy.c 2006-05-29 15:05:00.000000000 -0700 @@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lo static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family); +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo); int xfrm_register_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (likely(typemap->map[type->proto] == NULL)) - typemap->map[type->proto] = type; + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; else err = -EEXIST; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_type); int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (unlikely(typemap->map[type->proto] != type)) + if (unlikely(typemap[type->proto] != type)) err = -ENOENT; else - typemap->map[type->proto] = NULL; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + typemap[type->proto] = NULL; + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type); struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { struct xfrm_policy_afinfo *afinfo; - struct xfrm_type_map *typemap; + struct xfrm_type **typemap; struct xfrm_type *type; int modload_attempted = 0; @@ -102,11 +100,9 @@ retry: return NULL; 
typemap = afinfo->type_map; - read_lock(&typemap->lock); - type = typemap->map[proto]; + type = typemap[proto]; if (unlikely(type && !try_module_get(type->owner))) type = NULL; - read_unlock(&typemap->lock); if (!type && !modload_attempted) { xfrm_policy_put_afinfo(afinfo); request_module("xfrm-type-%d-%d", @@ -142,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *typ module_put(type->owner); } +int xfrm_register_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -EEXIST; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == NULL)) { + modemap[mode->encap] = mode; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_mode); + +int xfrm_unregister_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -ENOENT; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == mode)) { + modemap[mode->encap] = NULL; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_mode); + +struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode *mode; + int modload_attempted = 0; + + if (unlikely(encap >= XFRM_MODE_MAX)) + return NULL; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + + mode = afinfo->mode_map[encap]; + if (unlikely(mode && !try_module_get(mode->owner))) + mode = NULL; + if (!mode && !modload_attempted) { + xfrm_policy_put_afinfo(afinfo); + request_module("xfrm-mode-%d-%d", family, encap); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return mode; +} + +void xfrm_put_mode(struct xfrm_mode *mode) +{ + module_put(mode->owner); +} + static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) @@ -1306,17 +1385,31 @@ static struct xfrm_policy_afinfo *xfrm_p return NULL; read_lock(&xfrm_policy_afinfo_lock); afinfo = xfrm_policy_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_policy_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_policy_afinfo_lock); return afinfo; } static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); +} + +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + write_lock_bh(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (unlikely(!afinfo)) + write_unlock_bh(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + write_unlock_bh(&xfrm_policy_afinfo_lock); } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) diff -puN net/xfrm/xfrm_state.c~git-net net/xfrm/xfrm_state.c --- devel/net/xfrm/xfrm_state.c~git-net 2006-05-29 15:05:00.000000000 -0700 +++ 
devel-akpm/net/xfrm/xfrm_state.c 2006-05-29 15:05:00.000000000 -0700 @@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct kfree(x->ealg); kfree(x->calg); kfree(x->encap); + if (x->mode) + xfrm_put_mode(x->mode); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -1103,17 +1105,14 @@ static struct xfrm_state_afinfo *xfrm_st return NULL; read_lock(&xfrm_state_afinfo_lock); afinfo = xfrm_state_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_state_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_state_afinfo_lock); return afinfo; } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); } /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ @@ -1196,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x if (err) goto error; + x->mode = xfrm_get_mode(x->props.mode, family); + if (x->mode == NULL) + goto error; + x->km.state = XFRM_STATE_VALID; error: _
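
The xfrm_mode interface introduced in the hunks above is deliberately small: a mode supplies an input and an output hook, registers itself per address family with xfrm_register_mode(), and is looked up on demand through xfrm_get_mode() (with request_module() as a fallback). Below is a minimal sketch of a mode module following that pattern, mirroring net/ipv4/xfrm4_mode_transport.c from this patch; the hook bodies, the module name and the encap value used are placeholders for illustration only, not part of the patch.

/* Sketch only: mirrors the registration pattern of
 * net/ipv4/xfrm4_mode_transport.c above.  The hook bodies and the
 * encap constant here are placeholders, not real kernel code.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/xfrm.h>

static int example_mode_input(struct xfrm_state *x, struct sk_buff *skb)
{
	/* Undo whatever encapsulation example_mode_output() added,
	 * fixing up skb->nh/skb->h as the mode requires. */
	return 0;
}

static int example_mode_output(struct sk_buff *skb)
{
	/* Push x->props.header_len bytes of encapsulation header and
	 * leave skb->h/skb->nh positioned for x->type->output(). */
	return 0;
}

static struct xfrm_mode example_mode = {
	.input	= example_mode_input,
	.output	= example_mode_output,
	.owner	= THIS_MODULE,
	.encap	= XFRM_MODE_TRANSPORT,	/* placeholder encap value */
};

static int __init example_mode_init(void)
{
	return xfrm_register_mode(&example_mode, AF_INET);
}

static void __exit example_mode_exit(void)
{
	BUG_ON(xfrm_unregister_mode(&example_mode, AF_INET));
}

module_init(example_mode_init);
module_exit(example_mode_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);

As the xfrm_state.c hunks above show, xfrm_init_state() then binds the registered mode to a state via xfrm_get_mode(x->props.mode, family), and xfrm_state_gc_destroy() drops the module reference again with xfrm_put_mode().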