ocfs2: Remove i_generation from inode lock names
OCFS2 puts inode meta data in the "lock value block" provided by the DLM.
Typically, i_generation is encoded in the lock name so that a deleted inode
and a new one in the same block don't share the same LVB.
Unfortunately, that scheme means that the read in ocfs2_read_locked_inode()
is potentially thrown away as soon as the meta data lock is taken - we
cannot encode the lock name without first knowing i_generation, which
requires a disk read.
This patch encodes i_generation in the inode meta data lvb, and removes the
value from the inode meta data lock name. This way, the read can be covered
by a lock, and at the same time we can distinguish between an up-to-date and
a stale LVB.
This will help cold-cache stat(2) performance in particular.
Since this patch changes the protocol version, we take the opportunity to do
a minor re-organization of two of the LVB fields.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6cd84df..ecb3cba 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,6 +320,7 @@
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
+ unsigned int generation,
struct inode *inode)
{
struct ocfs2_lock_res_ops *ops;
@@ -341,7 +342,7 @@
};
ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
- inode->i_generation, res->l_name);
+ generation, res->l_name);
ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
}
@@ -1173,17 +1174,19 @@
int ocfs2_create_new_lock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres,
- int ex)
+ int ex,
+ int local)
{
int level = ex ? LKM_EXMODE : LKM_PRMODE;
unsigned long flags;
+ int lkm_flags = local ? LKM_LOCAL : 0;
spin_lock_irqsave(&lockres->l_lock, flags);
BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- return ocfs2_lock_create(osb, lockres, level, LKM_LOCAL);
+ return ocfs2_lock_create(osb, lockres, level, lkm_flags);
}
/* Grants us an EX lock on the data and metadata resources, skipping
@@ -1212,19 +1215,23 @@
* on a resource which has an invalid one -- we'll set it
* valid when we release the EX. */
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1);
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
if (ret) {
mlog_errno(ret);
goto bail;
}
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1);
+ /*
+ * We don't want to use LKM_LOCAL on a meta data lock as they
+ * don't use a generation in their lock names.
+ */
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0);
if (ret) {
mlog_errno(ret);
goto bail;
}
- ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1);
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1);
if (ret) {
mlog_errno(ret);
goto bail;
@@ -1413,6 +1420,16 @@
lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
+ /*
+ * Invalidate the LVB of a deleted inode - this way other
+ * nodes are forced to go to disk and discover the new inode
+ * status.
+ */
+ if (oi->ip_flags & OCFS2_INODE_DELETED) {
+ lvb->lvb_version = 0;
+ goto out;
+ }
+
lvb->lvb_version = OCFS2_LVB_VERSION;
lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
@@ -1429,6 +1446,7 @@
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
+out:
mlog_meta_lvb(0, lockres);
mlog_exit_void();
@@ -1727,6 +1745,18 @@
wait_event(osb->recovery_event,
ocfs2_node_map_is_empty(osb, &osb->recovery_map));
+ /*
+ * We only see this flag if we're being called from
+ * ocfs2_read_locked_inode(). It means we're locking an inode
+ * which hasn't been populated yet, so clear the refresh flag
+ * and let the caller handle it.
+ */
+ if (inode->i_state & I_NEW) {
+ status = 0;
+ ocfs2_complete_lock_res_refresh(lockres, 0);
+ goto bail;
+ }
+
/* This is fun. The caller may want a bh back, or it may
* not. ocfs2_meta_lock_update definitely wants one in, but
* may or may not read one, depending on what's in the