GIT fa15c844d5ca3550bf3ef97f248f26c4653a9635 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git#ALL

commit 4725123c035c5e86563bb55ec90324b74e0ec73e
Author: Mark Fasheh
Date:   Thu Dec 7 17:43:53 2006 -0800

    ocfs2: comment protocol version

    We should be recording why we bumped O2NET_PROTOCOL_VERSION.

    Signed-off-by: Mark Fasheh

commit d78ed1d09ba78f68f7c93e1c139c0554f059439a
Author: Sunil Mushran
Date:   Tue Dec 5 17:56:35 2006 -0800

    ocfs2: local mounts

    This allows users to format an ocfs2 file system with a special flag,
    OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT. When the file system sees this
    flag, it will not use any cluster services, nor will it require a
    cluster configuration, thus acting like a 'local' file system.

    Signed-off-by: Sunil Mushran
    Signed-off-by: Mark Fasheh

commit cac1f5bdd44ced5afbee3e73f6474334b44f82ab
Author: Andrew Beekhof
Date:   Mon Dec 4 14:04:55 2006 +0100

    [patch 3/3] OCFS2 Configurable timeouts - Protocol changes

    Modify the OCFS2 handshake to ensure essential timeouts are configured
    identically on all nodes. Only allow changes when there are no
    connected peers.

    Improves the logic in o2net_advance_rx(), which broke once
    sizeof(struct o2net_handshake) became greater than
    sizeof(struct o2net_msg).

    Included is the field for the userspace-heartbeat timeout to avoid the
    need for further protocol changes.

    Uses a global spinlock to ensure the decisions to update configfs
    entries are made on the correct value. The region covered by the
    spinlock when incrementing the counter is much larger, as this is the
    more critical case.

    Signed-off-by: Andrew Beekhof
    Signed-off-by: Mark Fasheh

commit d9d0abe8b5c627eedf5df661094e177bd766bb14
Author: Jeff Mahoney
Date:   Mon Dec 4 14:04:54 2006 +0100

    [patch 2/3] OCFS2 Configurable timeouts

    Allow configuration of OCFS2 timeouts from userspace via configfs.

    Signed-off-by: Andrew Beekhof
    Signed-off-by: Mark Fasheh

commit d7facdb00c7b7cca06976b1616fd8f0a6900b84a
Author: Andrew Beekhof
Date:   Mon Dec 4 14:04:53 2006 +0100

    [patch 1/3] OCFS2 - Expose struct o2nm_cluster

    Subsequent patches (namely userspace heartbeat and configurable
    timeouts) require access to the o2nm_cluster struct. This patch does
    the necessary shuffling.

    Signed-off-by: Andrew Beekhof
    Signed-off-by: Mark Fasheh

commit 98885367babbd17a1e2fdb5606315834d0175991
Author: Tiger Yang
Date:   Tue Dec 5 10:09:17 2006 +0800

    ocfs2: update mount option documentation

    We forgot to document the atime_quantum mount option in ocfs2.txt.
    This adds a proper description of how it works.

    Signed-off-by: Tiger Yang
    Signed-off-by: Mark Fasheh

commit 8b5b26723fb5f95ed3072a6ca400832367c7efa8
Author: Kurt Hackel
Date:   Fri Dec 1 14:47:20 2006 -0800

    ocfs2: fix cluster-wide refcounting of lock resources

    This was previously broken, and migration of some locks had to be
    temporarily disabled. We use a new (and backward-incompatible) set of
    network messages to account for all references to a lock resource
    held across the cluster. Once these are all freed, the master node
    may free the lock resource memory when its local references are
    dropped.

    Signed-off-by: Kurt Hackel
    Signed-off-by: Mark Fasheh

commit e47575895914402e1e72f00b3db539f468b849e3
Author: Joel Becker
Date:   Fri Oct 6 17:33:23 2006 -0700

    configfs: accessing item hierarchy during rmdir(2)

    Add a notification callback, ops->disconnect_notify(). It has the
    same prototype as ->drop_item(), but it will be called just before
    the item linkage is broken. This way, configfs users who want to do
    work while the object is still in the hierarchy have a chance.
    Client drivers will still need to config_item_put() in their
    ->drop_item(), if they implement it. They need do nothing in
    ->disconnect_notify(). They don't have to provide it if they don't
    care. But someone who wants to be notified before ci_parent is set
    to NULL can now be notified.

    Signed-off-by: Joel Becker

 Documentation/filesystems/configfs/configfs.txt |   12 +
 Documentation/filesystems/ocfs2.txt             |    3
 fs/configfs/dir.c                               |   29 ++
 fs/ocfs2/cluster/nodemanager.c                  |  192 +++++++++++
 fs/ocfs2/cluster/nodemanager.h                  |   17 +
 fs/ocfs2/cluster/tcp.c                          |  152 ++++++++-
 fs/ocfs2/cluster/tcp.h                          |    8
 fs/ocfs2/cluster/tcp_internal.h                 |   18 +
 fs/ocfs2/dlm/dlmcommon.h                        |   75 ++++
 fs/ocfs2/dlm/dlmdebug.c                         |   18 +
 fs/ocfs2/dlm/dlmdomain.c                        |  117 +++++--
 fs/ocfs2/dlm/dlmlock.c                          |    4
 fs/ocfs2/dlm/dlmmaster.c                        |  394 +++++++++++++++++++++--
 fs/ocfs2/dlm/dlmrecovery.c                      |  123 +++++++
 fs/ocfs2/dlm/dlmthread.c                        |  167 ++++------
 fs/ocfs2/dlmglue.c                              |   79 +++--
 fs/ocfs2/heartbeat.c                            |    9 +
 fs/ocfs2/inode.c                                |    3
 fs/ocfs2/journal.c                              |   46 ++-
 fs/ocfs2/journal.h                              |    5
 fs/ocfs2/mmap.c                                 |    6
 fs/ocfs2/namei.c                                |    8
 fs/ocfs2/ocfs2.h                                |    5
 fs/ocfs2/ocfs2_fs.h                             |    5
 fs/ocfs2/super.c                                |   90 ++++-
 fs/ocfs2/vote.c                                 |    3
 include/linux/configfs.h                        |    1
 27 files changed, 1309 insertions(+), 280 deletions(-)

diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index b34cdb5..c199484 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -238,6 +238,8 @@ config_item_type.
 	struct config_group *(*make_group)(struct config_group *group,
 					   const char *name);
 	int (*commit_item)(struct config_item *item);
+	void (*disconnect_notify)(struct config_group *group,
+				  struct config_item *item);
 	void (*drop_item)(struct config_group *group,
 			  struct config_item *item);
 };
@@ -268,6 +270,16 @@ the item in other threads, the memory is
 for the item to actually disappear from the subsystem's usage. But it
 is gone from configfs.

+When drop_item() is called, the item's linkage has already been torn
+down. It no longer has a reference on its parent and has no place in
+the item hierarchy. If a client needs to do some cleanup before this
+teardown happens, the subsystem can implement the
+ct_group_ops->disconnect_notify() method. The method is called after
+configfs has removed the item from the filesystem view but before the
+item is removed from its parent group. Like drop_item(),
+disconnect_notify() is void and cannot fail. Client subsystems should
+not drop any references here, as they still must do it in drop_item().
+
 A config_group cannot be removed while it still has child items. This
 is implemented in the configfs rmdir(2) code. ->drop_item() will not
 be called, as the item has not been dropped. rmdir(2) will fail, as the
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index af6defd..8ccf0c1 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -54,3 +54,6 @@ errors=panic		Panic and halt the machine
 intr		(*)	Allow signals to interrupt cluster operations.
 nointr			Do not allow signals to interrupt cluster operations.
+atime_quantum=60(*)	OCFS2 will not update atime unless this number
+			of seconds has passed since the last update.
+			Set to zero to always update atime.
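For illustration, a minimal configfs client could split its teardown
between the two callbacks as follows. This is only a sketch against the
interface documented above, not code from this series; the example_*
names and the bookkeeping list are hypothetical:

#include <linux/configfs.h>
#include <linux/list.h>

struct example_item {
	struct config_item item;
	struct list_head entry;		/* client-private bookkeeping linkage */
};

static inline struct example_item *to_example_item(struct config_item *item)
{
	return container_of(item, struct example_item, item);
}

/* Called while the item is still in the hierarchy (ci_parent is valid),
 * so navigating to the parent is safe. Per the documentation above, no
 * references may be dropped here. */
static void example_disconnect_notify(struct config_group *group,
				      struct config_item *item)
{
	list_del_init(&to_example_item(item)->entry);
}

/* Called after the linkage is torn down: drop the reference taken by
 * ->make_item(), exactly as clients did before disconnect_notify()
 * existed. */
static void example_drop_item(struct config_group *group,
			      struct config_item *item)
{
	config_item_put(item);
}

static struct configfs_group_operations example_group_ops = {
	.disconnect_notify	= example_disconnect_notify,
	.drop_item		= example_drop_item,
};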
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 1814ba4..c8d234d 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -715,6 +715,28 @@ static void configfs_detach_group(struct } /* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + +/* * Drop the initial reference from make_item()/make_group() * This function assumes that reference is held on item * and that item holds a valid reference to the parent. Also, it @@ -734,7 +756,7 @@ static void client_drop_item(struct conf */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), - item); + item); else config_item_put(item); } @@ -843,11 +865,14 @@ out_unlink: if (ret) { /* Tear down everything we built up */ down(&subsys->su_sem); + + client_disconnect_notify(parent_item, item); if (group) unlink_group(group); else unlink_obj(item); client_drop_item(parent_item, item); + up(&subsys->su_sem); if (module_got) @@ -912,11 +937,13 @@ static int configfs_rmdir(struct inode * configfs_detach_group(item); down(&subsys->su_sem); + client_disconnect_notify(parent_item, item); unlink_group(to_config_group(item)); } else { configfs_detach_item(item); down(&subsys->su_sem); + client_disconnect_notify(parent_item, item); unlink_obj(item); } diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index d11753c..357f1d5 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,7 +35,7 @@ #include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling * cluster references throughout where nodes are looked up */ -static struct o2nm_cluster *o2nm_single_cluster = NULL; +struct o2nm_cluster *o2nm_single_cluster = NULL; #define OCFS2_MAX_HB_CTL_PATH 256 static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; @@ -97,17 +97,6 @@ const char *o2nm_get_hb_ctl_path(void) } EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); -struct o2nm_cluster { - struct config_group cl_group; - unsigned cl_has_local:1; - u8 cl_local_node; - rwlock_t cl_nodes_lock; - struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; - struct rb_root cl_node_ip_tree; - /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ - unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; -}; - struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -543,6 +532,179 @@ static struct o2nm_node_group *to_o2nm_n } #endif +struct o2nm_cluster_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_cluster *, char *); + ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); +}; + +static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, + unsigned int *val) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u32)-1) + return -ERANGE; + + *val = tmp; + + return count; +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); +} + +static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_idle_timeout_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change idle timeout after " + "the first peer has agreed to it." + " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val <= cluster->cl_keepalive_delay_ms) { + mlog(ML_NOTICE, "o2net: idle timeout must be larger " + "than keepalive delay\n"); + ret = -EINVAL; + } else { + cluster->cl_idle_timeout_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); +} + +static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + ssize_t ret; + unsigned int val; + + ret = o2nm_cluster_attr_write(page, count, &val); + + if (ret > 0) { + if (cluster->cl_keepalive_delay_ms != val + && o2net_num_connected_peers()) { + mlog(ML_NOTICE, + "o2net: cannot change keepalive delay after" + " the first peer has agreed to it." 
+ " %d connected peers\n", + o2net_num_connected_peers()); + ret = -EINVAL; + } else if (val >= cluster->cl_idle_timeout_ms) { + mlog(ML_NOTICE, "o2net: keepalive delay must be " + "smaller than idle timeout\n"); + ret = -EINVAL; + } else { + cluster->cl_keepalive_delay_ms = val; + } + } + + return ret; +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( + struct o2nm_cluster *cluster, char *page) +{ + return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); +} + +static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + return o2nm_cluster_attr_write(page, count, + &cluster->cl_reconnect_delay_ms); +} +static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "idle_timeout_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_idle_timeout_ms_read, + .store = o2nm_cluster_attr_idle_timeout_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "keepalive_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_keepalive_delay_ms_read, + .store = o2nm_cluster_attr_keepalive_delay_ms_write, +}; + +static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "reconnect_delay_ms", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_reconnect_delay_ms_read, + .store = o2nm_cluster_attr_reconnect_delay_ms_write, +}; + +static struct configfs_attribute *o2nm_cluster_attrs[] = { + &o2nm_cluster_attr_idle_timeout_ms.attr, + &o2nm_cluster_attr_keepalive_delay_ms.attr, + &o2nm_cluster_attr_reconnect_delay_ms.attr, + NULL, +}; +static ssize_t o2nm_cluster_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret = 0; + + if (o2nm_cluster_attr->show) + ret = o2nm_cluster_attr->show(cluster, page); + return ret; +} + +static ssize_t o2nm_cluster_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + struct o2nm_cluster_attribute *o2nm_cluster_attr = + container_of(attr, struct o2nm_cluster_attribute, attr); + ssize_t ret; + + if (o2nm_cluster_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + ret = o2nm_cluster_attr->store(cluster, page, count); + if (ret < count) + goto out; +out: + return ret; +} + static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) { @@ -624,10 +786,13 @@ static void o2nm_cluster_release(struct static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, + .show_attribute = o2nm_cluster_show, + .store_attribute = o2nm_cluster_store, }; static struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, + .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, }; @@ -678,6 +843,9 @@ static struct config_group *o2nm_cluster cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; + cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; + cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; + cluster->cl_keepalive_delay_ms = 
O2NET_KEEPALIVE_DELAY_MS_DEFAULT; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index fce8033..8fb23ca 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -53,6 +53,23 @@ struct o2nm_node { unsigned long nd_set_attributes; }; +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + unsigned int cl_idle_timeout_ms; + unsigned int cl_keepalive_delay_ms; + unsigned int cl_reconnect_delay_ms; + + /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +extern struct o2nm_cluster *o2nm_single_cluster; + u8 o2nm_this_node(void); int o2nm_configured_node_map(unsigned long *map, unsigned bytes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9b3209d..266e6db 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -147,6 +147,28 @@ static void o2net_listen_data_ready(stru static void o2net_sc_send_keep_req(struct work_struct *work); static void o2net_idle_timer(unsigned long data); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); + +/* + * FIXME: These should use to_o2nm_cluster_from_node(), but we end up + * losing our parent link to the cluster during shutdown. This can be + * solved by adding a pre-removal callback to configfs, or passing + * around the cluster with the node. -jeffm + */ +static inline int o2net_reconnect_delay(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_reconnect_delay_ms; +} + +static inline int o2net_keepalive_delay(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_keepalive_delay_ms; +} + +static inline int o2net_idle_timeout(struct o2nm_node *node) +{ + return o2nm_single_cluster->cl_idle_timeout_ms; +} static inline int o2net_sys_err_to_errno(enum o2net_system_error err) { @@ -271,6 +293,8 @@ static void sc_kref_release(struct kref { struct o2net_sock_container *sc = container_of(kref, struct o2net_sock_container, sc_kref); + BUG_ON(timer_pending(&sc->sc_idle_timeout)); + sclog(sc, "releasing\n"); if (sc->sc_sock) { @@ -356,6 +380,13 @@ static void o2net_sc_cancel_delayed_work sc_put(sc); } +atomic_t o2net_connected_peers = ATOMIC_INIT(0); + +int o2net_num_connected_peers(void) +{ + return atomic_read(&o2net_connected_peers); +} + static void o2net_set_nn_state(struct o2net_node *nn, struct o2net_sock_container *sc, unsigned valid, int err) @@ -366,6 +397,11 @@ static void o2net_set_nn_state(struct o2 assert_spin_locked(&nn->nn_lock); + if (old_sc && !sc) + atomic_dec(&o2net_connected_peers); + else if (!old_sc && sc) + atomic_inc(&o2net_connected_peers); + /* the node num comparison and single connect/accept path should stop * an non-null sc from being overwritten with another */ BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); @@ -424,9 +460,9 @@ static void o2net_set_nn_state(struct o2 /* delay if we're withing a RECONNECT_DELAY of the * last attempt */ delay = (nn->nn_last_connect_attempt + - msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) - jiffies; - if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + if (delay > msecs_to_jiffies(o2net_reconnect_delay(sc->sc_node))) delay = 0; mlog(ML_CONN, "queueing conn 
attempt in %lu jiffies\n", delay); queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); @@ -1099,13 +1135,51 @@ static int o2net_check_handshake(struct return -1; } + /* + * Ensure timeouts are consistent with other nodes, otherwise + * we can end up with one node thinking that the other must be down, + * but isn't. This can ultimately cause corruption. + */ + if (be32_to_cpu(hand->o2net_idle_timeout_ms) != + o2net_idle_timeout(sc->sc_node)) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout(sc->sc_node)); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != + o2net_keepalive_delay(sc->sc_node)) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay(sc->sc_node)); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != + O2HB_MAX_WRITE_TIMEOUT_MS) { + mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " + "%u ms, but we use %u ms locally. disconnecting\n", + SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + sc->sc_handshake_ok = 1; spin_lock(&nn->nn_lock); /* set valid and queue the idle timers only if it hasn't been * shut down already */ if (nn->nn_sc == sc) { - o2net_sc_postpone_idle(sc); + o2net_sc_reset_idle_timer(sc); o2net_set_nn_state(nn, sc, 1, 0); } spin_unlock(&nn->nn_lock); @@ -1131,6 +1205,23 @@ static int o2net_advance_rx(struct o2net sclog(sc, "receiving\n"); do_gettimeofday(&sc->sc_tv_advance_start); + if (unlikely(sc->sc_handshake_ok == 0)) { + if(sc->sc_page_off < sizeof(struct o2net_handshake)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_handshake) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + } + + if (sc->sc_page_off == sizeof(struct o2net_handshake)) { + o2net_check_handshake(sc); + if (unlikely(sc->sc_handshake_ok == 0)) + ret = -EPROTO; + } + goto out; + } + /* do we need more header? */ if (sc->sc_page_off < sizeof(struct o2net_msg)) { data = page_address(sc->sc_page) + sc->sc_page_off; @@ -1138,15 +1229,6 @@ static int o2net_advance_rx(struct o2net ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); if (ret > 0) { sc->sc_page_off += ret; - - /* this working relies on the handshake being - * smaller than the normal message header */ - if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& - !sc->sc_handshake_ok && o2net_check_handshake(sc)) { - ret = -EPROTO; - goto out; - } - /* only swab incoming here.. 
we can * only get here once as we cross from * being under to over */ @@ -1248,6 +1330,18 @@ static int o2net_set_nodelay(struct sock return ret; } +static void o2net_initialize_handshake(void) +{ + o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( + O2HB_MAX_WRITE_TIMEOUT_MS); + o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( + o2net_idle_timeout(NULL)); + o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( + o2net_keepalive_delay(NULL)); + o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( + o2net_reconnect_delay(NULL)); +} + /* ------------------------------------------------------------ */ /* called when a connect completes and after a sock is accepted. the @@ -1262,6 +1356,7 @@ static void o2net_sc_connect_completed(s (unsigned long long)O2NET_PROTOCOL_VERSION, (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + o2net_initialize_handshake(); o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); sc_put(sc); } @@ -1287,8 +1382,10 @@ static void o2net_idle_timer(unsigned lo do_gettimeofday(&now); - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 " - "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), + o2net_idle_timeout(sc->sc_node) / 1000, + o2net_idle_timeout(sc->sc_node) % 1000); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", @@ -1306,14 +1403,21 @@ static void o2net_idle_timer(unsigned lo o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } -static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) { o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, - O2NET_KEEPALIVE_DELAY_SECS * HZ); + msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); do_gettimeofday(&sc->sc_tv_timer); mod_timer(&sc->sc_idle_timeout, - jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ)); + jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + /* Only push out an existing timer */ + if (timer_pending(&sc->sc_idle_timeout)) + o2net_sc_reset_idle_timer(sc); } /* this work func is kicked whenever a path sets the nn state which doesn't @@ -1435,9 +1539,12 @@ static void o2net_connect_expired(struct spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { + struct o2nm_node *node = nn->nn_sc->sc_node; mlog(ML_ERROR, "no connection established with node %u after " - "%u seconds, giving up and returning errors.\n", - o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS); + "%u.%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), + o2net_idle_timeout(node) / 1000, + o2net_idle_timeout(node) % 1000); o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); } @@ -1478,6 +1585,8 @@ static void o2net_hb_node_down_cb(struct if (node_num != o2nm_this_node()) o2net_disconnect_node(node); + + BUG_ON(atomic_read(&o2net_connected_peers) < 0); } static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, @@ -1489,14 +1598,14 @@ static void o2net_hb_node_up_cb(struct o /* ensure an immediate connect attempt */ nn->nn_last_connect_attempt = jiffies - - (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1); + (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); if (node_num != o2nm_this_node()) { /* heartbeat doesn't 
work unless a local node number is * configured and doing so brings up the o2net_wq, so we can * use it.. */ queue_delayed_work(o2net_wq, &nn->nn_connect_expired, - O2NET_IDLE_TIMEOUT_SECS * HZ); + msecs_to_jiffies(o2net_idle_timeout(node))); /* believe it or not, accept and node hearbeating testing * can succeed for this node before we got here.. so @@ -1641,6 +1750,7 @@ static int o2net_accept_one(struct socke o2net_register_callbacks(sc->sc_sock->sk, sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); + o2net_initialize_handshake(); o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); out: diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 616ff2b..21a4e43 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -54,6 +54,13 @@ typedef int (o2net_msg_handler_func)(str #define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 + +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 + + /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) { @@ -101,6 +108,7 @@ void o2net_unregister_hb_callbacks(void) int o2net_start_listening(struct o2nm_node *node); void o2net_stop_listening(struct o2nm_node *node); void o2net_disconnect_node(struct o2nm_node *node); +int o2net_num_connected_peers(void); int o2net_init(void); void o2net_exit(void); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index daebbd3..775c911 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -27,23 +27,23 @@ #define O2NET_MSG_STATUS_MAGIC ((u16) #define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) -/* same as hb delay, we're waiting for another node to recognize our hb */ -#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS - /* we're delaying our quorum decision so that heartbeat will have timed * out truly dead nodes by the time we come around to making decisions * on their number */ #define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) -#define O2NET_KEEPALIVE_DELAY_SECS 5 -#define O2NET_IDLE_TIMEOUT_SECS 10 - /* * This version number represents quite a lot, unfortunately. It not * only represents the raw network message protocol on the wire but also * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 6: + * - DLM lockres remote refcount fixes. + * + * New in version 5: + * - Network timeout checking protocol + * * New in version 4: * - Remove i_generation from lock names for better stat performance. 
* @@ -54,10 +54,14 @@ #define O2NET_IDLE_TIMEOUT_SECS 10 * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 4ULL +#define O2NET_PROTOCOL_VERSION 6ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; + __be32 o2hb_heartbeat_timeout_ms; + __be32 o2net_idle_timeout_ms; + __be32 o2net_keepalive_delay_ms; + __be32 o2net_reconnect_delay_ms; }; struct o2net_node { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 6b6ff76..9fa4271 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -222,6 +222,7 @@ #define DLM_LOCK_RES_READY #define DLM_LOCK_RES_DIRTY 0x00000008 #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +#define DLM_LOCK_RES_DROPPING_REF 0x00000040 /* max milliseconds to wait to sync up a network failure with a node death */ #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) @@ -265,6 +266,8 @@ struct dlm_lock_resource u8 owner; //node which owns the lock resource, or unknown u16 state; char lvb[DLM_LVB_LEN]; + unsigned int inflight_locks; + unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; }; struct dlm_migratable_lock @@ -367,7 +370,7 @@ enum { DLM_CONVERT_LOCK_MSG, /* 504 */ DLM_PROXY_AST_MSG, /* 505 */ DLM_UNLOCK_LOCK_MSG, /* 506 */ - DLM_UNUSED_MSG2, /* 507 */ + DLM_DEREF_LOCKRES_MSG, /* 507 */ DLM_MIGRATE_REQUEST_MSG, /* 508 */ DLM_MIG_LOCKRES_MSG, /* 509 */ DLM_QUERY_JOIN_MSG, /* 510 */ @@ -417,6 +420,9 @@ struct dlm_master_request u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_ASSERT_RESPONSE_REASSERT 0x00000001 +#define DLM_ASSERT_RESPONSE_MASTERY_REF 0x00000002 + #define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 #define DLM_ASSERT_MASTER_REQUERY 0x00000002 #define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 @@ -430,6 +436,8 @@ struct dlm_assert_master u8 name[O2NM_MAX_NAME_LEN]; }; +#define DLM_MIGRATE_RESPONSE_MASTERY_REF 0x00000001 + struct dlm_migrate_request { u8 master; @@ -648,6 +656,16 @@ struct dlm_finalize_reco __be32 pad2; }; +struct dlm_deref_lockres +{ + u32 pad1; + u16 pad2; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + static inline enum dlm_status __dlm_lockres_state_to_status(struct dlm_lock_resource *res) { @@ -721,8 +739,8 @@ void __dlm_lockres_calc_usage(struct dlm struct dlm_lock_resource *res); void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); -void dlm_purge_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); +int dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static inline void dlm_lockres_get(struct dlm_lock_resource *res) { /* This is called on every lookup, so it might be worth @@ -733,6 +751,10 @@ void dlm_lockres_put(struct dlm_lock_res void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len, @@ -753,6 +775,47 @@ struct dlm_lock_resource *dlm_new_lockre const char *name, unsigned int namelen); +#define dlm_lockres_set_refmap_bit(bit,res) \ + __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) +#define dlm_lockres_clear_refmap_bit(bit,res) \ + __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) + +static inline void 
__dlm_lockres_set_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: setting bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + set_bit(bit, res->refmap); +} + +static inline void __dlm_lockres_clear_refmap_bit(int bit, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + //printk("%s:%d:%.*s: clearing bit %d\n", file, line, + // res->lockname.len, res->lockname.name, bit); + clear_bit(bit, res->refmap); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line); +void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line); +#define dlm_lockres_drop_inflight_ref(d,r) \ + __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) +#define dlm_lockres_grab_inflight_ref_new(d,r) \ + __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) + void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, @@ -805,6 +868,7 @@ int dlm_lockres_is_dirty(struct dlm_ctxt int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 target); +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 old_master); @@ -814,6 +878,7 @@ void __dlm_lockres_reserve_ast(struct dl int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); @@ -856,10 +921,12 @@ static inline void __dlm_wait_on_lockres int dlm_init_mle_cache(void); void dlm_destroy_mle_cache(void); void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); - +int __dlm_lockres_has_locks(struct dlm_lock_resource *res); int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 3f6c8d8..1015cc7 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -53,6 +53,23 @@ void dlm_print_one_lock_resource(struct spin_unlock(&res->spinlock); } +static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) +{ + int bit; + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, " refmap nodes: [ "); + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + printk("%u ", bit); + bit++; + } + printk("], inflight=%u\n", res->inflight_locks); +} + void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { struct list_head *iter2; @@ -65,6 +82,7 @@ void __dlm_print_one_lock_resource(struc res->owner, res->state); mlog(ML_NOTICE, " last used: %lu, on purge 
list: %s\n", res->last_used, list_empty(&res->purge) ? "no" : "yes"); + dlm_print_lockres_refmap(res); mlog(ML_NOTICE, " granted queue: \n"); list_for_each(iter2, &res->granted) { lock = list_entry(iter2, struct dlm_lock, list); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 420a375..21e7a2a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -125,10 +125,10 @@ void __dlm_insert_lockres(struct dlm_ctx hlist_add_head(&res->hash_node, bucket); } -struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len, - unsigned int hash) +struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) { struct hlist_head *bucket; struct hlist_node *list; @@ -154,6 +154,37 @@ struct dlm_lock_resource * __dlm_lookup_ return NULL; } +/* intended to be called by functions which do not care about lock + * resources which are being purged (most net _handler functions). + * this will return NULL for any lock resource which is found but + * currently in the process of dropping its mastery reference. + * use __dlm_lookup_lockres_full when you need the lock resource + * regardless (e.g. dlm_get_lock_resource) */ +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len, + unsigned int hash) +{ + struct dlm_lock_resource *res = NULL; + + mlog_entry("%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + res = __dlm_lookup_lockres_full(dlm, name, len, hash); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + return NULL; + } + spin_unlock(&res->spinlock); + } + + return res; +} + struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len) @@ -330,43 +361,60 @@ static void dlm_complete_dlm_shutdown(st wake_up(&dlm_domain_events); } -static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) +static int dlm_migrate_all_locks(struct dlm_ctxt *dlm) { - int i; + int i, num, n, ret = 0; struct dlm_lock_resource *res; + struct hlist_node *iter; + struct hlist_head *bucket; + int dropped; mlog(0, "Migrating locks from domain %s\n", dlm->name); -restart: + + num = 0; spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(dlm_lockres_hash(dlm, i))) { - res = hlist_entry(dlm_lockres_hash(dlm, i)->first, - struct dlm_lock_resource, hash_node); - /* need reference when manually grabbing lockres */ +redo_bucket: + n = 0; + bucket = dlm_lockres_hash(dlm, i); + iter = bucket->first; + while (iter) { + n++; + res = hlist_entry(iter, struct dlm_lock_resource, + hash_node); dlm_lockres_get(res); - /* this should unhash the lockres - * and exit with dlm->spinlock */ - mlog(0, "purging res=%p\n", res); - if (dlm_lockres_is_dirty(dlm, res)) { - /* HACK! this should absolutely go. - * need to figure out why some empty - * lockreses are still marked dirty */ - mlog(ML_ERROR, "lockres %.*s dirty!\n", - res->lockname.len, res->lockname.name); - - spin_unlock(&dlm->spinlock); - dlm_kick_thread(dlm, res); - wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); - dlm_lockres_put(res); - goto restart; - } - dlm_purge_lockres(dlm, res); + /* migrate, if necessary. this will drop the dlm + * spinlock and retake it if it does migration. 
*/ + dropped = dlm_empty_lockres(dlm, res); + + spin_lock(&res->spinlock); + __dlm_lockres_calc_usage(dlm, res); + iter = res->hash_node.next; + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + cond_resched_lock(&dlm->spinlock); + + if (dropped) + goto redo_bucket; } + num += n; + mlog(0, "%s: touched %d lockreses in bucket %d " + "(tot=%d)\n", dlm->name, n, i, num); } spin_unlock(&dlm->spinlock); - + wake_up(&dlm->dlm_thread_wq); + + /* let the dlm thread take care of purging, keep scanning until + * nothing remains in the hash */ + if (num) { + mlog(0, "%s: %d lock resources in hash last pass\n", + dlm->name, num); + ret = -EAGAIN; + } mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); + return ret; } static int dlm_no_joining_node(struct dlm_ctxt *dlm) @@ -571,7 +619,9 @@ void dlm_unregister_domain(struct dlm_ct /* We changed dlm state, notify the thread */ dlm_kick_thread(dlm, NULL); - dlm_migrate_all_locks(dlm); + while (dlm_migrate_all_locks(dlm)) { + mlog(0, "%s: more migration to do\n", dlm->name); + } dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); dlm_complete_dlm_shutdown(dlm); @@ -1082,6 +1132,13 @@ static int dlm_register_domain_handlers( if (status) goto bail; + status = o2net_register_handler(DLM_DEREF_LOCKRES_MSG, dlm->key, + sizeof(struct dlm_deref_lockres), + dlm_deref_lockres_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, sizeof(struct dlm_migrate_request), dlm_migrate_request_handler, diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 42a1b91..21939e4 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -163,6 +163,10 @@ static enum dlm_status dlmlock_master(st kick_thread = 1; } } + /* reduce the inflight count, this may result in the lockres + * being purged below during calc_usage */ + if (lock->ml.node == dlm->node_num) + dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 856012b..287b199 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -99,9 +99,9 @@ static void dlm_mle_node_up(struct dlm_c int idx); static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags); static inline int dlm_mle_equal(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -237,7 +237,8 @@ static int dlm_find_mle(struct dlm_ctxt struct dlm_master_list_entry **mle, char *name, unsigned int namelen); -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to); static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, @@ -687,6 +688,7 @@ static void dlm_init_lockres(struct dlm_ INIT_LIST_HEAD(&res->purge); atomic_set(&res->asts_reserved, 0); res->migration_pending = 0; + res->inflight_locks = 0; kref_init(&res->refs); @@ -700,6 +702,7 @@ static void dlm_init_lockres(struct dlm_ res->last_used = 0; memset(res->lvb, 0, DLM_LVB_LEN); + memset(res->refmap, 0, sizeof(res->refmap)); } struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, @@ -722,6 +725,42 @@ struct dlm_lock_resource *dlm_new_lockre return res; } +void 
__dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int new_lockres, + const char *file, + int line) +{ + if (!new_lockres) + assert_spin_locked(&res->spinlock); + + if (!test_bit(dlm->node_num, res->refmap)) { + BUG_ON(res->inflight_locks != 0); + dlm_lockres_set_refmap_bit(dlm->node_num, res); + } + res->inflight_locks++; + mlog(0, "%s:%.*s: inflight++: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); +} + +void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *file, + int line) +{ + assert_spin_locked(&res->spinlock); + + BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; + mlog(0, "%s:%.*s: inflight--: now %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->inflight_locks); + if (res->inflight_locks == 0) + dlm_lockres_clear_refmap_bit(dlm->node_num, res); + wake_up(&res->wq); +} + /* * lookup a lock resource by name. * may already exist in the hashtable. @@ -752,6 +791,7 @@ struct dlm_lock_resource * dlm_get_lock_ unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; + int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -761,9 +801,30 @@ struct dlm_lock_resource * dlm_get_lock_ lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); + tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { + int dropping_ref = 0; + + spin_lock(&tmpres->spinlock); + if (tmpres->owner == dlm->node_num) { + BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); + dlm_lockres_grab_inflight_ref(dlm, tmpres); + } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) + dropping_ref = 1; + spin_unlock(&tmpres->spinlock); spin_unlock(&dlm->spinlock); + + /* wait until done messaging the master, drop our ref to allow + * the lockres to be purged, start over. */ + if (dropping_ref) { + spin_lock(&tmpres->spinlock); + __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + mlog(0, "found in hash!\n"); if (res) dlm_lockres_put(res); @@ -793,6 +854,7 @@ lookup: spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, dlm->node_num); __dlm_insert_lockres(dlm, res); + dlm_lockres_grab_inflight_ref(dlm, res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* lockres still marked IN_PROGRESS */ @@ -805,29 +867,40 @@ lookup: /* if we found a block, wait for lock to be mastered by another node */ blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); if (blocked) { + int mig; if (mle->type == DLM_MLE_MASTER) { mlog(ML_ERROR, "master entry for nonexistent lock!\n"); BUG(); - } else if (mle->type == DLM_MLE_MIGRATION) { - /* migration is in progress! */ - /* the good news is that we now know the - * "current" master (mle->master). */ - + } + mig = (mle->type == DLM_MLE_MIGRATION); + /* if there is a migration in progress, let the migration + * finish before continuing. we can wait for the absence + * of the MIGRATION mle: either the migrate finished or + * one of the nodes died and the mle was cleaned up. + * if there is a BLOCK here, but it already has a master + * set, we are too late. the master does not have a ref + * for us in the refmap. detach the mle and drop it. + * either way, go back to the top and start over. */ + if (mig || mle->master != O2NM_MAX_NODES) { + BUG_ON(mig && mle->master == dlm->node_num); + /* we arrived too late. the master does not + * have a ref for us. 
retry. */ + mlog(0, "%s:%.*s: late on %s\n", + dlm->name, namelen, lockid, + mig ? "MIGRATION" : "BLOCK"); spin_unlock(&dlm->master_lock); - assert_spin_locked(&dlm->spinlock); - - /* set the lockres owner and hash it */ - spin_lock(&res->spinlock); - dlm_set_lockres_owner(dlm, res, mle->master); - __dlm_insert_lockres(dlm, res); - spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); /* master is known, detach */ - dlm_mle_detach_hb_events(dlm, mle); + if (!mig) + dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); mle = NULL; - goto wake_waiters; + /* this is lame, but we cant wait on either + * the mle or lockres waitqueue here */ + if (mig) + msleep(100); + goto lookup; } } else { /* go ahead and try to master lock on this node */ @@ -858,6 +931,13 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); + /* since this lockres is new it doesnt not require the spinlock */ + dlm_lockres_grab_inflight_ref_new(dlm, res); + + /* if this node does not become the master make sure to drop + * this inflight reference below */ + drop_inflight_if_nonlocal = 1; + /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we @@ -910,7 +990,7 @@ redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { - ret = dlm_do_master_request(mle, nodenum); + ret = dlm_do_master_request(res, mle, nodenum); if (ret < 0) mlog_errno(ret); if (mle->master != O2NM_MAX_NODES) { @@ -960,6 +1040,8 @@ wait: wake_waiters: spin_lock(&res->spinlock); + if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) + dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -998,7 +1080,7 @@ recheck: /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ if (res->owner != dlm->node_num) { - ret = dlm_do_master_request(mle, res->owner); + ret = dlm_do_master_request(res, mle, res->owner); if (ret < 0) { /* give recovery a chance to run */ mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); @@ -1062,6 +1144,8 @@ recheck: * now tell other nodes that I am * mastering this. 
*/ mle->master = dlm->node_num; + /* ref was grabbed in get_lock_resource + * will be dropped in dlmlock_master */ assert = 1; sleep = 0; } @@ -1087,7 +1171,8 @@ recheck: (atomic_read(&mle->woken) == 1), timeo); if (res->owner == O2NM_MAX_NODES) { - mlog(0, "waiting again\n"); + mlog(0, "%s:%.*s: waiting again\n", dlm->name, + res->lockname.len, res->lockname.name); goto recheck; } mlog(0, "done waiting, master is %u\n", res->owner); @@ -1100,8 +1185,7 @@ recheck: m = dlm->node_num; mlog(0, "about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, m); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, mle->vote_map, 0); + ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); if (ret) { /* This is a failure in the network path, * not in the response to the assert_master @@ -1117,6 +1201,8 @@ recheck: /* set the lockres owner */ spin_lock(&res->spinlock); + /* mastery reference obtained either during + * assert_master_handler or in get_lock_resource */ dlm_change_lockres_owner(dlm, res, m); spin_unlock(&res->spinlock); @@ -1283,7 +1369,8 @@ static int dlm_restart_lock_mastery(stru * */ -static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +static int dlm_do_master_request(struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, int to) { struct dlm_ctxt *dlm = mle->dlm; struct dlm_master_request request; @@ -1339,6 +1426,9 @@ again: case DLM_MASTER_RESP_YES: set_bit(to, mle->response_map); mlog(0, "node %u is the master, response=YES\n", to); + mlog(0, "%s:%.*s: master node %u now knows I have a " + "reference\n", dlm->name, res->lockname.len, + res->lockname.name, to); mle->master = to; break; case DLM_MASTER_RESP_NO: @@ -1428,8 +1518,10 @@ way_up_top: } if (res->owner == dlm->node_num) { + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, request->node_idx); + dlm_lockres_set_refmap_bit(request->node_idx, res); spin_unlock(&res->spinlock); - // mlog(0, "this node is the master\n"); response = DLM_MASTER_RESP_YES; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1477,7 +1569,6 @@ way_up_top: mlog(0, "node %u is master, but trying to migrate to " "node %u.\n", tmpmle->master, tmpmle->new_master); if (tmpmle->master == dlm->node_num) { - response = DLM_MASTER_RESP_YES; mlog(ML_ERROR, "no owner on lockres, but this " "node is trying to migrate it to %u?!\n", tmpmle->new_master); @@ -1494,6 +1585,10 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; + dlm_lockres_set_refmap_bit(request->node_idx, res); + mlog(0, "%s:%.*s: setting bit %u in refmap\n", + dlm->name, namelen, name, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1607,15 +1702,17 @@ send_response: * can periodically run all locks owned by this node * and re-assert across the cluster... */ -static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, - unsigned int namelen, void *nodemap, - u32 flags) +int dlm_do_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + void *nodemap, u32 flags) { struct dlm_assert_master assert; int to, tmpret; struct dlm_node_iter iter; int ret = 0; int reassert; + const char *lockname = res->lockname.name; + unsigned int namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); again: @@ -1647,6 +1744,7 @@ again: mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; + r = 0; } else if (r < 0) { /* ok, something horribly messed. kill thyself. 
*/ mlog(ML_ERROR,"during assert master of %.*s to %u, " @@ -1661,12 +1759,29 @@ again: spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); BUG(); - } else if (r == EAGAIN) { + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT && + !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { + mlog(ML_ERROR, "%.*s: very strange, " + "master MLE but no lockres on %u\n", + namelen, lockname, to); + } + + if (r & DLM_ASSERT_RESPONSE_REASSERT) { mlog(0, "%.*s: node %u create mles on other " "nodes and requests a re-assert\n", namelen, lockname, to); reassert = 1; } + if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { + mlog(0, "%.*s: node %u has a reference to this " + "lockres, set the bit in the refmap\n", + namelen, lockname, to); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(to, res); + spin_unlock(&res->spinlock); + } } if (reassert) @@ -1693,7 +1808,7 @@ int dlm_assert_master_handler(struct o2n char *name; unsigned int namelen, hash; u32 flags; - int master_request = 0; + int master_request = 0, have_lockres_ref = 0; int ret = 0; if (!dlm_grab(dlm)) @@ -1864,6 +1979,7 @@ ok: dlm_change_lockres_owner(dlm, res, mle->master); } spin_unlock(&res->spinlock); + have_lockres_ref = 1; } /* master is known, detach if not already detached. @@ -1918,7 +2034,19 @@ done: dlm_put(dlm); if (master_request) { mlog(0, "need to tell master to reassert\n"); - ret = EAGAIN; // positive. negative would shoot down the node. + /* positive. negative would shoot down the node. */ + ret |= DLM_ASSERT_RESPONSE_REASSERT; + if (!have_lockres_ref) { + mlog(ML_ERROR, "strange, got assert from %u, MASTER " + "mle present here for %s:%.*s, but no lockres!\n", + assert->node_idx, dlm->name, namelen, name); + } + } + if (have_lockres_ref) { + /* let the master know we have a reference to the lockres */ + ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: got assert from %u, need a ref\n", + dlm->name, namelen, name, assert->node_idx); } return ret; @@ -2023,9 +2151,7 @@ static void dlm_assert_master_worker(str * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", res->lockname.len, res->lockname.name, dlm->node_num); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, - nodemap, flags); + ret = dlm_do_assert_master(dlm, res, nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ if (!dlm_is_host_down(ret)) @@ -2097,6 +2223,104 @@ static int dlm_pre_master_reco_lockres(s return ret; } +/* + * DLM_DEREF_LOCKRES_MSG + */ + +int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + struct dlm_deref_lockres deref; + int ret = 0, r; + const char *lockname; + unsigned int namelen; + + lockname = res->lockname.name; + namelen = res->lockname.len; + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + mlog(0, "%s:%.*s: sending deref to %d\n", + dlm->name, namelen, lockname, res->owner); + memset(&deref, 0, sizeof(deref)); + deref.node_idx = dlm->node_num; + deref.namelen = namelen; + memcpy(deref.name, lockname, namelen); + + ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, + &deref, sizeof(deref), res->owner, &r); + if (ret < 0) + mlog_errno(ret); + else if (r < 0) { + /* BAD. other node says I did not have a ref. 
*/ + mlog(ML_ERROR,"while dropping ref on %s:%.*s " + "(master=%u) got %d.\n", dlm->name, namelen, + lockname, res->owner, r); + dlm_print_one_lock_resource(res); + BUG(); + } + return ret; +} + +int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + int ret = -EINVAL; + u8 node; + unsigned int hash; + + if (!dlm_grab(dlm)) + return 0; + + name = deref->name; + namelen = deref->namelen; + node = deref->node_idx; + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + if (deref->node_idx >= O2NM_MAX_NODES) { + mlog(ML_ERROR, "Invalid node number: %u\n", node); + goto done; + } + + hash = dlm_lockid_hash(name, namelen); + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", + dlm->name, namelen, name); + goto done; + } + spin_unlock(&dlm->spinlock); + + spin_lock(&res->spinlock); + BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + if (test_bit(node, res->refmap)) { + ret = 0; + dlm_lockres_clear_refmap_bit(node, res); + } else { + mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " + "but it is already dropped!\n", dlm->name, namelen, + name, node); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&res->spinlock); + + if (!ret) + dlm_lockres_calc_usage(dlm, res); +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + return ret; +} + /* * DLM_MIGRATE_LOCKRES @@ -2376,6 +2600,53 @@ leave: return ret; } +#define DLM_MIGRATION_RETRY_MS 100 + +/* Should be called only after beginning the domain leave process. + * There should not be any remaining locks on nonlocal lock resources, + * and there should be no local locks left on locally mastered resources. + * + * Called with the dlm spinlock held, may drop it to do migration, but + * will re-acquire before exit. + * + * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ +int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + int lock_dropped = 0; + + if (res->owner != dlm->node_num) { + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s:%.*s: this node is not master, " + "trying to free this but locks remain\n", + dlm->name, res->lockname.len, res->lockname.name); + } + goto leave; + } + + /* Wheee! Migrate lockres here! Will sleep so drop spinlock. 
*/ + spin_unlock(&dlm->spinlock); + lock_dropped = 1; + while (1) { + ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); + if (ret >= 0) + break; + if (ret == -ENOTEMPTY) { + mlog(ML_ERROR, "lockres %.*s still has local locks!\n", + res->lockname.len, res->lockname.name); + BUG(); + } + + mlog(0, "lockres %.*s: migrate failed, " + "retrying\n", res->lockname.len, + res->lockname.name); + msleep(DLM_MIGRATION_RETRY_MS); + } + spin_lock(&dlm->spinlock); +leave: + return lock_dropped; +} + int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { int ret; @@ -2490,7 +2761,7 @@ static void dlm_remove_nonlocal_locks(st { struct list_head *iter, *iter2; struct list_head *queue = &res->granted; - int i; + int i, bit; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2508,12 +2779,28 @@ static void dlm_remove_nonlocal_locks(st BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); + dlm_lockres_clear_refmap_bit(lock->ml.node, res); list_del_init(&lock->list); dlm_lock_put(lock); } } queue++; } + bit = 0; + while (1) { + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); + if (bit >= O2NM_MAX_NODES) + break; + /* do not clear the local node reference, if there is a + * process holding this, let it drop the ref itself */ + if (bit != dlm->node_num) { + mlog(0, "%s:%.*s: node %u had a ref to this " + "migrating lockres, clearing\n", dlm->name, + res->lockname.len, res->lockname.name, bit); + dlm_lockres_clear_refmap_bit(bit, res); + } + bit++; + } } /* for now this is not too intelligent. we will @@ -2601,6 +2888,16 @@ static int dlm_do_migrate_request(struct mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; + } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { + /* during the migration request we short-circuited + * the mastery of the lockres. make sure we have + * a mastery ref for nodenum */ + mlog(0, "%s:%.*s: need ref for node %u\n", + dlm->name, res->lockname.len, res->lockname.name, + nodenum); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(nodenum, res); + spin_unlock(&res->spinlock); } } @@ -2745,7 +3042,13 @@ static int dlm_add_migration_mle(struct /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); - __dlm_mle_detach_hb_events(dlm, mle); + /* this was obviously WRONG. mle is uninited here. should be tmp. */ + __dlm_mle_detach_hb_events(dlm, tmp); + ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; + mlog(0, "%s:%.*s: master=%u, newmaster=%u, " + "telling master to get ref for cleared out mle " + "during migration\n", dlm->name, namelen, name, + master, new_master); } spin_unlock(&tmp->spinlock); } @@ -2753,6 +3056,8 @@ static int dlm_add_migration_mle(struct /* now add a migration mle to the tail of the list */ dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); mle->new_master = new_master; + /* the new master will be sending an assert master for this. + * at that point we will get the refmap reference */ mle->master = master; /* do this for consistency with other mle types */ set_bit(new_master, mle->maybe_map); @@ -2902,6 +3207,13 @@ int dlm_finish_migration(struct dlm_ctxt clear_bit(dlm->node_num, iter.node_map); spin_unlock(&dlm->spinlock); + /* ownership of the lockres is changing. 
account for the + * mastery reference here since old_master will briefly have + * a reference after the migration completes */ + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(old_master, res); + spin_unlock(&res->spinlock); + mlog(0, "now time to do a migrate request to other nodes\n"); ret = dlm_do_migrate_request(dlm, res, old_master, dlm->node_num, &iter); @@ -2914,8 +3226,7 @@ int dlm_finish_migration(struct dlm_ctxt res->lockname.len, res->lockname.name); /* this call now finishes out the nodemap * even if one or more nodes die */ - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { /* no longer need to retry. all living nodes contacted. */ @@ -2927,8 +3238,7 @@ int dlm_finish_migration(struct dlm_ctxt set_bit(old_master, iter.node_map); mlog(0, "doing assert master of %.*s back to %u\n", res->lockname.len, res->lockname.name, old_master); - ret = dlm_do_assert_master(dlm, res->lockname.name, - res->lockname.len, iter.node_map, + ret = dlm_do_assert_master(dlm, res, iter.node_map, DLM_ASSERT_MASTER_FINISH_MIGRATION); if (ret < 0) { mlog(0, "assert master to original master failed " diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fb3e2b0..f1f3c9d 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1129,6 +1129,11 @@ static int dlm_send_mig_lockres_msg(stru if (total_locks == mres_total_locks) mres->flags |= DLM_MRES_ALL_DONE; + mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + send_to); + /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, sz, send_to, &status); @@ -1213,6 +1218,34 @@ static int dlm_add_lock_to_array(struct return 0; } +static void dlm_add_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres) +{ + struct dlm_lock dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.ml.cookie = 0; + dummy.ml.type = LKM_IVMODE; + dummy.ml.convert_type = LKM_IVMODE; + dummy.ml.highest_blocked = LKM_IVMODE; + dummy.lksb = NULL; + dummy.ml.node = dlm->node_num; + dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST); +} + +static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm, + struct dlm_migratable_lock *ml, + u8 *nodenum) +{ + if (unlikely(ml->cookie == 0 && + ml->type == LKM_IVMODE && + ml->convert_type == LKM_IVMODE && + ml->highest_blocked == LKM_IVMODE && + ml->list == DLM_BLOCKED_LIST)) { + *nodenum = ml->node; + return 1; + } + return 0; +} int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_migratable_lockres *mres, @@ -1260,6 +1293,14 @@ int dlm_send_one_lockres(struct dlm_ctxt goto error; } } + if (total_locks == 0) { + /* send a dummy lock to indicate a mastery reference only */ + mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n", + dlm->name, res->lockname.len, res->lockname.name, + send_to, flags & DLM_MRES_RECOVERY ?
"recovery" : + "migration"); + dlm_add_dummy_lock(dlm, mres); + } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); if (ret < 0) @@ -1386,13 +1427,16 @@ int dlm_mig_lockres_handler(struct o2net /* add an extra ref for just-allocated lockres * otherwise the lockres will be purged immediately */ dlm_lockres_get(res); - } /* at this point we have allocated everything we need, * and we have a hashed lockres with an extra ref and * the proper res->state flags. */ ret = 0; + spin_lock(&res->spinlock); + /* drop this either when master requery finds a different master + * or when a lock is added by the recovery worker */ + dlm_lockres_grab_inflight_ref(dlm, res); if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { /* migration cannot have an unknown master */ BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); @@ -1400,10 +1444,11 @@ int dlm_mig_lockres_handler(struct o2net "unknown owner.. will need to requery: " "%.*s\n", mres->lockname_len, mres->lockname); } else { - spin_lock(&res->spinlock); + /* take a reference now to pin the lockres, drop it + * when locks are added in the worker */ dlm_change_lockres_owner(dlm, res, dlm->node_num); - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* queue up work for dlm_mig_lockres_worker */ dlm_grab(dlm); /* get an extra ref for the work item */ @@ -1459,6 +1504,9 @@ again: "this node will take it.\n", res->lockname.len, res->lockname.name); } else { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); mlog(0, "master needs to respond to sender " "that node %u still owns %.*s\n", real_master, res->lockname.len, @@ -1666,10 +1714,25 @@ static int dlm_process_recovery_data(str int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; + u8 from = O2NM_MAX_NODES; + unsigned int added = 0; mlog(0, "running %d locks for this lockres\n", mres->num_locks); for (i=0; i<mres->num_locks; i++) { ml = &(mres->ml[i]); + + if (dlm_is_dummy_lock(dlm, ml, &from)) { + /* placeholder, just need to set the refmap bit */ + BUG_ON(mres->num_locks != 1); + mlog(0, "%s:%.*s: dummy lock for %u\n", + dlm->name, mres->lockname_len, mres->lockname, + from); + spin_lock(&res->spinlock); + dlm_lockres_set_refmap_bit(from, res); + spin_unlock(&res->spinlock); + added++; + break; + } BUG_ON(ml->highest_blocked != LKM_IVMODE); newlock = NULL; lksb = NULL; @@ -1711,6 +1774,7 @@ static int dlm_process_recovery_data(str /* do not alter lock refcount. switching lists. 
*/ list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); + added++; mlog(0, "just reordered a local lock!\n"); continue; @@ -1817,12 +1881,24 @@ skip_lvb: if (!bad) { dlm_lock_get(newlock); list_add_tail(&newlock->list, queue); + mlog(0, "%s:%.*s: added lock for node %u, " + "setting refmap bit\n", dlm->name, + res->lockname.len, res->lockname.name, ml->node); + dlm_lockres_set_refmap_bit(ml->node, res); + added++; } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); leave: + /* balance the ref taken when the work was queued */ + if (added > 0) { + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + } + if (ret < 0) { mlog_errno(ret); if (newlock) @@ -1935,9 +2011,11 @@ static void dlm_finish_local_lockres_rec if (res->owner == dead_node) { list_del_init(&res->recovering); spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -1977,9 +2055,11 @@ static void dlm_finish_local_lockres_rec dlm_lockres_put(res); } spin_lock(&res->spinlock); + /* new_master has our reference from + * the lock state sent during recovery */ dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - if (!__dlm_lockres_unused(res)) + if (__dlm_lockres_has_locks(res)) __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -2048,6 +2128,7 @@ static void dlm_free_dead_locks(struct d { struct list_head *iter, *tmpiter; struct dlm_lock *lock; + unsigned int freed = 0; /* this node is the lockres master: * 1) remove any stale locks for the dead node @@ -2062,6 +2143,7 @@ static void dlm_free_dead_locks(struct d if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->converting) { @@ -2069,6 +2151,7 @@ static void dlm_free_dead_locks(struct d if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } list_for_each_safe(iter, tmpiter, &res->blocked) { @@ -2076,9 +2159,23 @@ static void dlm_free_dead_locks(struct d if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); + freed++; } } + if (freed) { + mlog(0, "%s:%.*s: freed %u locks for dead node %u, " + "dropping ref from lockres\n", dlm->name, + res->lockname.len, res->lockname.name, freed, dead_node); + BUG_ON(!test_bit(dead_node, res->refmap)); + dlm_lockres_clear_refmap_bit(dead_node, res); + } else if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", dlm->name, + res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dead_node, res); + } + /* do not kick thread yet */ __dlm_dirty_lockres(dlm, res); } @@ -2141,9 +2238,21 @@ static void dlm_do_local_recovery_cleanu spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); - if (res->owner == dead_node) + if (res->owner == dead_node) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) + mlog(0, "%s:%.*s: owned by " + "dead node %u, this node was " + "dropping its ref when it died. 
" + "continue, dropping the flag.\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + + /* the wake_up for this will happen when the + * RECOVERING flag is dropped later */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + dlm_move_lockres_to_recovery_list(dlm, res); - else if (res->owner == dlm->node_num) { + } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 0c822f3..620eb82 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -54,9 +54,6 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_ #include "cluster/masklog.h" static int dlm_thread(void *data); -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres); - static void dlm_flush_asts(struct dlm_ctxt *dlm); #define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) @@ -82,14 +79,33 @@ repeat: current->state = TASK_RUNNING; } - -int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_has_locks(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && - list_empty(&res->blocked) && - list_empty(&res->dirty)) - return 1; + list_empty(&res->blocked)) + return 0; + return 1; +} + +/* "unused": the lockres has no locks, is not on the dirty list, + * has no inflight locks (in the gap between mastery and acquiring + * the first lock), and has no bits in its refmap. + * truly ready to be freed. */ +int __dlm_lockres_unused(struct dlm_lock_resource *res) +{ + if (!__dlm_lockres_has_locks(res) && + list_empty(&res->dirty)) { + /* try not to scan the bitmap unless the first two + * conditions are already true */ + int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* since the bit for dlm->node_num is not + * set, inflight_locks better be zero */ + BUG_ON(res->inflight_locks != 0); + return 1; + } + } return 0; } @@ -106,46 +122,21 @@ void __dlm_lockres_calc_usage(struct dlm assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ - /* For now, just keep any resource we master */ - if (res->owner == dlm->node_num) - { - if (!list_empty(&res->purge)) { - mlog(0, "we master %s:%.*s, but it is on " - "the purge list. Removing\n", - dlm->name, res->lockname.len, - res->lockname.name); - list_del_init(&res->purge); - dlm->purge_count--; - } - return; - } - if (list_empty(&res->purge)) { - mlog(0, "putting lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "putting lockres %.*s:%p onto purge list\n", + res->lockname.len, res->lockname.name, res); res->last_used = jiffies; + dlm_lockres_get(res); list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; - - /* if this node is not the owner, there is - * no way to keep track of who the owner could be. - * unhash it to avoid serious problems. 
*/ - if (res->owner != dlm->node_num) { - mlog(0, "%s:%.*s: doing immediate " - "purge of lockres owned by %u\n", - dlm->name, res->lockname.len, - res->lockname.name, res->owner); - - dlm_purge_lockres_now(dlm, res); - } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list, " - "owner=%u\n", res->lockname.len, res->lockname.name, - res->owner); + mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n", + res->lockname.len, res->lockname.name, res, res->owner); list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } } @@ -163,68 +154,60 @@ void dlm_lockres_calc_usage(struct dlm_c spin_unlock(&dlm->spinlock); } -/* TODO: Eventual API: Called with the dlm spinlock held, may drop it - * to do migration, but will re-acquire before exit. */ -void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres) +int dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int master; - int ret; - - spin_lock(&lockres->spinlock); - master = lockres->owner == dlm->node_num; - spin_unlock(&lockres->spinlock); - - mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len, - lockres->lockname.name, master); + int ret = 0; - /* Non master is the easy case -- no migration required, just - * quit. */ + spin_lock(&res->spinlock); + if (!__dlm_lockres_unused(res)) { + spin_unlock(&res->spinlock); + mlog(0, "%s:%.*s: tried to purge but not unused\n", + dlm->name, res->lockname.len, res->lockname.name); + return -ENOTEMPTY; + } + master = (res->owner == dlm->node_num); if (!master) - goto finish; - - /* Wheee! Migrate lockres here! */ - spin_unlock(&dlm->spinlock); -again: + res->state |= DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); - ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES); - if (ret == -ENOTEMPTY) { - mlog(ML_ERROR, "lockres %.*s still has local locks!\n", - lockres->lockname.len, lockres->lockname.name); + mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, + res->lockname.name, master); - BUG(); - } else if (ret < 0) { - mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", - lockres->lockname.len, lockres->lockname.name); - msleep(100); - goto again; + if (!master) { + /* drop spinlock to do messaging, retake below */ + spin_unlock(&dlm->spinlock); + /* clear our bit from the master's refmap, ignore errors */ + ret = dlm_drop_lockres_ref(dlm, res); + if (ret < 0) { + mlog_errno(ret); + if (!dlm_is_host_down(ret)) + BUG(); + } + mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", + dlm->name, res->lockname.len, res->lockname.name, ret); + spin_lock(&dlm->spinlock); } - spin_lock(&dlm->spinlock); - -finish: - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); + if (!list_empty(&res->purge)) { + mlog(0, "removing lockres %.*s:%p from purgelist, " + "master = %d\n", res->lockname.len, res->lockname.name, + res, master); + list_del_init(&res->purge); + dlm_lockres_put(res); dlm->purge_count--; } - __dlm_unhash_lockres(lockres); -} - -/* make an unused lockres go away immediately. - * as soon as the dlm spinlock is dropped, this lockres - * will not be found. kfree still happens on last put. 
*/ -static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, - struct dlm_lock_resource *lockres) -{ - assert_spin_locked(&dlm->spinlock); - assert_spin_locked(&lockres->spinlock); - - BUG_ON(!__dlm_lockres_unused(lockres)); + __dlm_unhash_lockres(res); - if (!list_empty(&lockres->purge)) { - list_del_init(&lockres->purge); - dlm->purge_count--; + /* lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. */ + if (!master) { + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_DROPPING_REF; + spin_unlock(&res->spinlock); + wake_up(&res->wq); } - __dlm_unhash_lockres(lockres); + return 0; } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -268,13 +251,17 @@ static void dlm_run_purge_list(struct dl break; } + mlog(0, "removing lockres %.*s:%p from purgelist\n", + lockres->lockname.len, lockres->lockname.name, lockres); list_del_init(&lockres->purge); + dlm_lockres_put(lockres); dlm->purge_count--; /* This may drop and reacquire the dlm spinlock if it * has to do migration. */ mlog(0, "calling dlm_purge_lockres!\n"); - dlm_purge_lockres(dlm, lockres); + if (dlm_purge_lockres(dlm, lockres)) + BUG(); mlog(0, "DONE calling dlm_purge_lockres!\n"); /* Avoid adding any scheduling latencies */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 69fba16..e622013 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -770,7 +770,7 @@ static int ocfs2_lock_create(struct ocfs int dlm_flags) { int ret = 0; - enum dlm_status status; + enum dlm_status status = DLM_NORMAL; unsigned long flags; mlog_entry_void(); @@ -1138,6 +1138,7 @@ int ocfs2_rw_lock(struct inode *inode, i { int status, level; struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!inode); @@ -1147,6 +1148,9 @@ int ocfs2_rw_lock(struct inode *inode, i (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); + if (ocfs2_mount_local(osb)) + return 0; + lockres = &OCFS2_I(inode)->ip_rw_lockres; level = write ? LKM_EXMODE : LKM_PRMODE; @@ -1164,6 +1168,7 @@ void ocfs2_rw_unlock(struct inode *inode { int level = write ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1171,7 +1176,8 @@ void ocfs2_rw_unlock(struct inode *inode (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); - ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); } @@ -1182,6 +1188,7 @@ int ocfs2_data_lock_full(struct inode *i { int status = 0, level; struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!inode); @@ -1201,6 +1208,9 @@ int ocfs2_data_lock_full(struct inode *i goto out; } + if (ocfs2_mount_local(osb)) + goto out; + lockres = &OCFS2_I(inode)->ip_data_lockres; level = write ? LKM_EXMODE : LKM_PRMODE; @@ -1269,6 +1279,7 @@ void ocfs2_data_unlock(struct inode *ino { int level = write ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1276,7 +1287,8 @@ void ocfs2_data_unlock(struct inode *ino (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? 
"EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); @@ -1467,8 +1479,9 @@ static int ocfs2_meta_lock_update(struct { int status = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres; + struct ocfs2_lock_res *lockres = NULL; struct ocfs2_dinode *fe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1483,10 +1496,12 @@ static int ocfs2_meta_lock_update(struct } spin_unlock(&oi->ip_lock); - lockres = &oi->ip_meta_lockres; + if (!ocfs2_mount_local(osb)) { + lockres = &oi->ip_meta_lockres; - if (!ocfs2_should_refresh_lock_res(lockres)) - goto bail; + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + } /* This will discard any caching information we might have had * for the inode metadata. */ @@ -1496,7 +1511,7 @@ static int ocfs2_meta_lock_update(struct * map (directories, bitmap files, etc) */ ocfs2_extent_map_trunc(inode, 0); - if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { + if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); ocfs2_refresh_inode_from_lvb(inode); @@ -1543,7 +1558,8 @@ static int ocfs2_meta_lock_update(struct status = 0; bail_refresh: - ocfs2_complete_lock_res_refresh(lockres, status); + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, status); bail: mlog_exit(status); return status; @@ -1585,7 +1601,7 @@ int ocfs2_meta_lock_full(struct inode *i int arg_flags) { int status, level, dlm_flags, acquired; - struct ocfs2_lock_res *lockres; + struct ocfs2_lock_res *lockres = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; @@ -1607,6 +1623,9 @@ int ocfs2_meta_lock_full(struct inode *i goto bail; } + if (ocfs2_mount_local(osb)) + goto local; + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); @@ -1636,6 +1655,7 @@ int ocfs2_meta_lock_full(struct inode *i wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); +local: /* * We only see this flag if we're being called from * ocfs2_read_locked_inode(). It means we're locking an inode @@ -1644,7 +1664,8 @@ int ocfs2_meta_lock_full(struct inode *i */ if (inode->i_state & I_NEW) { status = 0; - ocfs2_complete_lock_res_refresh(lockres, 0); + if (lockres) + ocfs2_complete_lock_res_refresh(lockres, 0); goto bail; } @@ -1767,6 +1788,7 @@ void ocfs2_meta_unlock(struct inode *ino { int level = ex ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); @@ -1774,7 +1796,8 @@ void ocfs2_meta_unlock(struct inode *ino (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? "EXMODE" : "PRMODE"); - if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && + !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); mlog_exit_void(); @@ -1783,7 +1806,7 @@ void ocfs2_meta_unlock(struct inode *ino int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { - int status; + int status = 0; int level = ex ? 
LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; struct buffer_head *bh; @@ -1794,6 +1817,9 @@ int ocfs2_super_lock(struct ocfs2_super if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + goto bail; + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); if (status < 0) { mlog_errno(status); @@ -1832,7 +1858,8 @@ void ocfs2_super_unlock(struct ocfs2_sup int level = ex ? LKM_EXMODE : LKM_PRMODE; struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; - ocfs2_cluster_unlock(osb, lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); } int ocfs2_rename_lock(struct ocfs2_super *osb) @@ -1843,6 +1870,9 @@ int ocfs2_rename_lock(struct ocfs2_super if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + return 0; + status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); if (status < 0) mlog_errno(status); @@ -1854,7 +1884,8 @@ void ocfs2_rename_unlock(struct ocfs2_su { struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; - ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); } int ocfs2_dentry_lock(struct dentry *dentry, int ex) @@ -1869,6 +1900,9 @@ int ocfs2_dentry_lock(struct dentry *den if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ocfs2_mount_local(osb)) + return 0; + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); if (ret < 0) mlog_errno(ret); @@ -1882,7 +1916,8 @@ void ocfs2_dentry_unlock(struct dentry * struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); } /* Reference counting of the dlm debug structure. We want this because @@ -2145,12 +2180,15 @@ static void ocfs2_dlm_shutdown_debug(str int ocfs2_dlm_init(struct ocfs2_super *osb) { - int status; + int status = 0; u32 dlm_key; - struct dlm_ctxt *dlm; + struct dlm_ctxt *dlm = NULL; mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto local; + status = ocfs2_dlm_init_debug(osb); if (status < 0) { mlog_errno(status); @@ -2178,11 +2216,12 @@ int ocfs2_dlm_init(struct ocfs2_super *o goto bail; } + dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); + +local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); - osb->dlm = dlm; status = 0; diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index cbfd45a..8fc52d6 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -154,6 +154,9 @@ int ocfs2_register_hb_callbacks(struct o { int status; + if (ocfs2_mount_local(osb)) + return 0; + status = o2hb_register_callback(&osb->osb_hb_down); if (status < 0) { mlog_errno(status); @@ -172,6 +175,9 @@ void ocfs2_clear_hb_callbacks(struct ocf { int status; + if (ocfs2_mount_local(osb)) + return; + status = o2hb_unregister_callback(&osb->osb_hb_down); if (status < 0) mlog_errno(status); @@ -186,6 +192,9 @@ void ocfs2_stop_heartbeat(struct ocfs2_s int ret; char *argv[5], *envp[3]; + if (ocfs2_mount_local(osb)) + return; + if (!osb->uuid_str) { /* This can happen if we don't get far enough in mount... 
*/ mlog(0, "No UUID with which to stop heartbeat!\n\n"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 42e361f..e4d9149 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -423,7 +423,8 @@ static int ocfs2_read_locked_inode(struc * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) - && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK); + && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) + && !ocfs2_mount_local(osb); /* * To maintain backwards compatibility with older versions of diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1d7f4ab..825cb0a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -144,8 +144,10 @@ handle_t *ocfs2_start_trans(struct ocfs2 ocfs2_abort(osb->sb, "Detected aborted journal"); handle = ERR_PTR(-EROFS); } - } else - atomic_inc(&(osb->journal->j_num_trans)); + } else { + if (!ocfs2_mount_local(osb)) + atomic_inc(&(osb->journal->j_num_trans)); + } return handle; } @@ -507,9 +509,23 @@ void ocfs2_journal_shutdown(struct ocfs2 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); - status = ocfs2_journal_toggle_dirty(osb, 0); - if (status < 0) - mlog_errno(status); + if (ocfs2_mount_local(osb)) { + journal_lock_updates(journal->j_journal); + status = journal_flush(journal->j_journal); + journal_unlock_updates(journal->j_journal); + if (status < 0) + mlog_errno(status); + } + + if (status == 0) { + /* + * Do not toggle if flush was unsuccessful otherwise + * will leave dirty metadata in a "clean" journal + */ + status = ocfs2_journal_toggle_dirty(osb, 0); + if (status < 0) + mlog_errno(status); + } /* Shutdown the kernel journal system */ journal_destroy(journal->j_journal); @@ -549,7 +565,7 @@ static void ocfs2_clear_journal_error(st } } -int ocfs2_journal_load(struct ocfs2_journal *journal) +int ocfs2_journal_load(struct ocfs2_journal *journal, int local) { int status = 0; struct ocfs2_super *osb; @@ -576,14 +592,18 @@ int ocfs2_journal_load(struct ocfs2_jour } /* Launch the commit thread */ - osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt"); - if (IS_ERR(osb->commit_task)) { - status = PTR_ERR(osb->commit_task); + if (!local) { + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, + "ocfs2cmt"); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, " + "error=%d", status); + goto done; + } + } else osb->commit_task = NULL; - mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", - status); - goto done; - } done: mlog_exit(status); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 899112a..e121636 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -157,7 +157,7 @@ int ocfs2_journal_init(struct ocfs2_j void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); -int ocfs2_journal_load(struct ocfs2_journal *journal); +int ocfs2_journal_load(struct ocfs2_journal *journal, int local); int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); @@ -174,6 +174,9 @@ static inline void ocfs2_checkpoint_inod { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + if (ocfs2_mount_local(osb)) + return; + if (!ocfs2_inode_fully_checkpointed(inode)) { /* WARNING: This only kicks off a single * checkpoint. 
If someone races you and adds more diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 69f85ae..51b0204 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -83,10 +83,12 @@ static struct vm_operations_struct ocfs2 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { int ret = 0, lock_level = 0; + struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); /* We don't want to support shared writable mappings yet. */ - if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) - && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { + if (!ocfs2_mount_local(osb) && + ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && + ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); /* This is -EINVAL because generic_file_readonly_mmap * returns it in a similar situation. */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 21db45d..9637039 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -587,9 +587,11 @@ static int ocfs2_mknod_locked(struct ocf } ocfs2_inode_set_new(osb, inode); - status = ocfs2_create_new_inode_locks(inode); - if (status < 0) - mlog_errno(status); + if (!ocfs2_mount_local(osb)) { + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + } status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index b767fd7..db8e77c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -349,6 +349,11 @@ static inline int ocfs2_is_soft_readonly return ret; } +static inline int ocfs2_mount_local(struct ocfs2_super *osb) +{ + return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); +} + #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3330a5d..2e90e89 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -86,7 +86,7 @@ #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb, OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP 0 -#define OCFS2_FEATURE_INCOMPAT_SUPP 0 +#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 /* @@ -97,6 +97,9 @@ #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 +/* Used to denote a non-clustered volume */ +#define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 + /* * Flags on ocfs2_dinode.i_flags */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4bf3954..a6d2f8c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -508,6 +508,27 @@ bail: return status; } +static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) +{ + if (ocfs2_mount_local(osb)) { + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + mlog(ML_ERROR, "Cannot heartbeat on a locally " + "mounted device.\n"); + return -EINVAL; + } + } + + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { + mlog(ML_ERROR, "Heartbeat has to be started to mount " + "a read-write clustered device.\n"); + return -EINVAL; + } + } + + return 0; +} + static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -516,16 +537,24 @@ static int ocfs2_fill_super(struct super struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; + char nodestr[8]; mlog_entry("%p, %p, %i", sb, data, silent); - /* for now we only have one cluster/node, 
make sure we see it - * in the heartbeat universe */ - if (!o2hb_check_local_node_heartbeating()) { + if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { status = -EINVAL; goto read_super_error; } + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (parsed_opt & OCFS2_MOUNT_HB_LOCAL) { + if (!o2hb_check_local_node_heartbeating()) { + status = -EINVAL; + goto read_super_error; + } + } + /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size); if (status < 0) { @@ -541,11 +570,6 @@ static int ocfs2_fill_super(struct super } brelse(bh); bh = NULL; - - if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { - status = -EINVAL; - goto read_super_error; - } osb->s_mount_opt = parsed_opt; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -588,21 +612,16 @@ static int ocfs2_fill_super(struct super } if (!ocfs2_is_hard_readonly(osb)) { - /* If this isn't a hard readonly mount, then we need - * to make sure that heartbeat is in a valid state, - * and that we mark ourselves soft readonly is -oro - * was specified. */ - if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { - mlog(ML_ERROR, "No heartbeat for device (%s)\n", - sb->s_id); - status = -EINVAL; - goto read_super_error; - } - if (sb->s_flags & MS_RDONLY) ocfs2_set_ro_flag(osb, 0); } + status = ocfs2_verify_heartbeat(osb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); if (!osb->osb_debug_root) { @@ -635,9 +654,14 @@ static int ocfs2_fill_super(struct super ocfs2_complete_mount_recovery(osb); - printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) " + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + + printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " "with %s data mode.\n", - osb->dev_str, osb->node_num, osb->slot_num, + osb->dev_str, nodestr, osb->slot_num, osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); @@ -999,7 +1023,11 @@ static int ocfs2_fill_local_node_info(st /* XXX hold a ref on the node while mounte? easy enough, if * desirable. */ - osb->node_num = o2nm_this_node(); + if (ocfs2_mount_local(osb)) + osb->node_num = 0; + else + osb->node_num = o2nm_this_node(); + if (osb->node_num == O2NM_MAX_NODES) { mlog(ML_ERROR, "could not find this host's node number\n"); status = -ENOENT; @@ -1084,6 +1112,9 @@ static int ocfs2_mount_volume(struct sup goto leave; } + if (ocfs2_mount_local(osb)) + goto leave; + /* This should be sent *after* we recovered our journal as it * will cause other nodes to unmark us as needing * recovery. 
However, we need to send it *before* dropping the @@ -1114,6 +1145,7 @@ static void ocfs2_dismount_volume(struct { int tmp; struct ocfs2_super *osb = NULL; + char nodestr[8]; mlog_entry("(0x%p)\n", sb); @@ -1177,8 +1209,13 @@ static void ocfs2_dismount_volume(struct atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); - printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n", - osb->dev_str, osb->node_num); + if (ocfs2_mount_local(osb)) + snprintf(nodestr, sizeof(nodestr), "local"); + else + snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); + + printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", + osb->dev_str, nodestr); ocfs2_delete_osb(osb); kfree(osb); @@ -1536,6 +1573,7 @@ static int ocfs2_check_volume(struct ocf { int status = 0; int dirty; + int local; struct ocfs2_dinode *local_alloc = NULL; /* only used if we * recover * ourselves. */ @@ -1563,8 +1601,10 @@ static int ocfs2_check_volume(struct ocf "recovering volume.\n"); } + local = ocfs2_mount_local(osb); + /* will play back anything left in the journal. */ - ocfs2_journal_load(osb->journal); + ocfs2_journal_load(osb->journal, local); if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index 5b4dca7..0315a8b 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -1000,6 +1000,9 @@ int ocfs2_register_net_handlers(struct o { int status = 0; + if (ocfs2_mount_local(osb)) + return 0; + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, osb->net_key, sizeof(struct ocfs2_response_msg), diff --git a/include/linux/configfs.h b/include/linux/configfs.h index a7f0150..11100ca 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -157,6 +157,7 @@ struct configfs_group_operations { struct config_item *(*make_item)(struct config_group *group, const char *name); struct config_group *(*make_group)(struct config_group *group, const char *name); int (*commit_item)(struct config_item *item); + void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); };
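
Note on the refmap helpers: dlm_lockres_set_refmap_bit() and dlm_lockres_clear_refmap_bit(), called throughout the dlmmaster.c, dlmrecovery.c and dlmthread.c hunks, are declared in fs/ocfs2/dlm/dlmcommon.h and do not appear in the hunks themselves. A minimal sketch of the contract the callers assume, one bit per node in res->refmap, flipped only under res->spinlock; the bodies below are illustrative, not the committed implementation:

/* Sketch only. res->refmap holds one bit per possible cluster node
 * (O2NM_MAX_NODES bits). The master sets a node's bit when that node
 * gains a reference to the lock resource, and clears it when the node
 * sends DLM_DEREF_LOCKRES_MSG or is declared dead. */
static inline void dlm_lockres_set_refmap_bit(int bit,
					      struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	set_bit(bit, res->refmap);
}

static inline void dlm_lockres_clear_refmap_bit(int bit,
						struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	clear_bit(bit, res->refmap);
}

Under that contract, __dlm_lockres_unused() only needs to scan the bitmap once the lock lists and the dirty list are already known to be empty, which is why the bitmap check comes last.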
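
dlm_assert_master_handler() now returns a bitmask rather than the old positive EAGAIN, so one network status can report both "create your mles and re-assert" and "I hold a reference, set my bit in the refmap". The committed flag definitions live in dlmcommon.h; any two distinct positive bits satisfy the r & ... tests in dlm_do_assert_master(), for example:

/* Assumed values, for illustration only. The handler must keep its
 * return positive: a negative status would shoot down the asserting
 * node rather than annotate the response. */
#define DLM_ASSERT_RESPONSE_REASSERT		0x00000001
#define DLM_ASSERT_RESPONSE_MASTERY_REF	0x00000002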
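
The inflight-reference helpers used by dlm_mig_lockres_handler() and its worker are likewise outside these hunks. The BUG_ON in __dlm_lockres_unused() implies an invariant: a nonzero res->inflight_locks is always accompanied by the local node's refmap bit. A sketch consistent with that invariant, again an assumption rather than the committed code:

/* Sketch only: pin a lockres in the window between gaining mastery
 * (or receiving migrated state) and attaching the first real lock,
 * so the purge path in dlmthread.c cannot free it underneath us. */
static void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
					  struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	if (!test_bit(dlm->node_num, res->refmap)) {
		BUG_ON(res->inflight_locks != 0);
		dlm_lockres_set_refmap_bit(dlm->node_num, res);
	}
	res->inflight_locks++;
}

static void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
					  struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	BUG_ON(res->inflight_locks == 0);
	res->inflight_locks--;
	if (!res->inflight_locks) {
		/* last inflight lock: drop our own refmap bit and let
		 * dlm_lockres_calc_usage() consider the lockres for purge */
		dlm_lockres_clear_refmap_bit(dlm->node_num, res);
		wake_up(&res->wq);
	}
}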