Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6 * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6: (107 commits) vfs: use ERR_CAST for err-ptr tossing in lookup_instantiate_filp isofs: Remove global fs lock jffs2: fix IN_DELETE_SELF on overwriting rename() killing a directory fix IN_DELETE_SELF on overwriting rename() on ramfs et.al. mm/truncate.c: fix build for CONFIG_BLOCK not enabled fs:update the NOTE of the file_operations structure Remove dead code in dget_parent() AFS: Fix silly characters in a comment switch d_add_ci() to d_splice_alias() in "found negative" case as well simplify gfs2_lookup() jfs_lookup(): don't bother with . or .. get rid of useless dget_parent() in btrfs rename() and link() get rid of useless dget_parent() in fs/btrfs/ioctl.c fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers drivers: fix up various ->llseek() implementations fs: handle SEEK_HOLE/SEEK_DATA properly in all fs's that define their own llseek Ext4: handle SEEK_HOLE/SEEK_DATA generically Btrfs: implement our own ->llseek fs: add SEEK_HOLE and SEEK_DATA flags reiserfs: make reiserfs default to barrier=flush ... Fix up trivial conflicts in fs/xfs/linux-2.6/xfs_super.c due to the new shrinker callout for the inode cache, that clashed with the xfs code to start the periodic workers later.
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 57d827d6..ca7e252 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking
@@ -52,7 +52,7 @@ void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -412,7 +412,7 @@ int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t start, loff_t end, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -438,9 +438,7 @@ locking rules: All may block except for ->setlease. - No VFS locks held on entry except for ->fsync and ->setlease. - -->fsync() has i_mutex on inode. + No VFS locks held on entry except for ->setlease. ->setlease has the file_list_lock held and must not sleep.
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 6e29954..7f8861d 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting
@@ -400,10 +400,31 @@ -- [mandatory] - --- -[mandatory] ->get_sb() is gone. Switch to use of ->mount(). Typically it's just a matter of switching from calling get_sb_... to mount_... and changing the function type. If you were doing it manually, just switch from setting ->mnt_root to some pointer to returning that pointer. On errors return ERR_PTR(...). + +-- +[mandatory] + ->permission(), generic_permission() and ->check_acl() have lost flags +argument; instead of passing IPERM_FLAG_RCU we add MAY_NOT_BLOCK into mask. + generic_permission() has also lost the check_acl argument; if you want +non-NULL to be used for that inode, put it into ->i_op->check_acl. + +-- +[mandatory] + If you implement your own ->llseek() you must handle SEEK_HOLE and +SEEK_DATA. You can hanle this by returning -EINVAL, but it would be nicer to +support it in some way. The generic handler assumes that the entire file is +data and there is a virtual hole at the end of the file. So if the provided +offset is less than i_size and SEEK_DATA is specified, return the same offset. +If the above is true for the offset and you are given SEEK_HOLE, return the end +of the file. If the offset is i_size or greater return -ENXIO in either case. + +[mandatory] + If you have your own ->fsync() you must make sure to call +filemap_write_and_wait_range() so that all dirty pages are synced out properly. +You must also keep in mind that ->fsync() is not called with i_mutex held +anymore, so if you require i_mutex locking you must make sure to take it and +release it yourself.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 88b9f55..eff6617 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt
@@ -229,6 +229,8 @@ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); }; All methods are called without any locks being held, unless otherwise @@ -301,6 +303,26 @@ quota_write: called by the VFS to write to filesystem quota file. + nr_cached_objects: called by the sb cache shrinking function for the + filesystem to return the number of freeable cached objects it contains. + Optional. + + free_cache_objects: called by the sb cache shrinking function for the + filesystem to scan the number of objects indicated to try to free them. + Optional, but any filesystem implementing this method needs to also + implement ->nr_cached_objects for it to be called correctly. + + We can't do anything with any errors that the filesystem might + encountered, hence the void return type. This will never be called if + the VM is trying to reclaim under GFP_NOFS conditions, hence this + method does not need to handle that situation itself. + + Implementations must include conditional reschedule calls inside any + scanning loop that is done. This allows the VFS to determine + appropriate scan batch sizes without having to worry about whether + implementations will cause holdoff problems due to large scan batch + sizes. + Whoever sets up the inode is responsible for filling in the "i_op" field. This is a pointer to a "struct inode_operations" which describes the methods that can be performed on individual inodes. @@ -333,8 +355,8 @@ void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int, unsigned int); + int (*permission) (struct inode *, int); + int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -423,7 +445,7 @@ permission: called by the VFS to check for access rights on a POSIX-like filesystem. - May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk + May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk mode, the filesystem must check the permission without blocking or storing to the inode. @@ -755,7 +777,7 @@ int (*open) (struct inode *, struct file *); int (*flush) (struct file *); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *);
diff --git a/arch/arm/mach-tegra/clock.c b/arch/arm/mach-tegra/clock.c index e028320..f8d41ff 100644 --- a/arch/arm/mach-tegra/clock.c +++ b/arch/arm/mach-tegra/clock.c
@@ -585,7 +585,7 @@ static int clk_debugfs_register_one(struct clk *c) { - struct dentry *d, *child, *child_tmp; + struct dentry *d; d = debugfs_create_dir(c->name, clk_debugfs_root); if (!d) @@ -614,10 +614,7 @@ return 0; err_out: - d = c->dent; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(c->dent); + debugfs_remove_recursive(c->dent); return -ENOMEM; }
diff --git a/arch/arm/mach-ux500/clock.c b/arch/arm/mach-ux500/clock.c index 32ce908..7d107be 100644 --- a/arch/arm/mach-ux500/clock.c +++ b/arch/arm/mach-ux500/clock.c
@@ -635,16 +635,13 @@ static struct dentry *clk_debugfs_register_dir(struct clk *c, struct dentry *p_dentry) { - struct dentry *d, *clk_d, *child, *child_tmp; - char s[255]; - char *p = s; + struct dentry *d, *clk_d; + const char *p = c->name; - if (c->name == NULL) - p += sprintf(p, "BUG"); - else - p += sprintf(p, "%s", c->name); + if (!p) + p = "BUG"; - clk_d = debugfs_create_dir(s, p_dentry); + clk_d = debugfs_create_dir(p, p_dentry); if (!clk_d) return NULL; @@ -666,24 +663,10 @@ return clk_d; err_out: - d = clk_d; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(clk_d); + debugfs_remove_recursive(clk_d); return NULL; } -static void clk_debugfs_remove_dir(struct dentry *cdentry) -{ - struct dentry *d, *child, *child_tmp; - - d = cdentry; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(cdentry); - return ; -} - static int clk_debugfs_register_one(struct clk *c) { struct clk *pa = c->parent_periph; @@ -700,7 +683,7 @@ c->dent_bus = clk_debugfs_register_dir(c, bpa->dent_bus ? bpa->dent_bus : bpa->dent); if ((!c->dent_bus) && (c->dent)) { - clk_debugfs_remove_dir(c->dent); + debugfs_remove_recursive(c->dent); c->dent = NULL; return -ENOMEM; }
diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c index c9122dd..964704f 100644 --- a/arch/arm/plat-omap/clock.c +++ b/arch/arm/plat-omap/clock.c
@@ -480,13 +480,10 @@ static int clk_debugfs_register_one(struct clk *c) { int err; - struct dentry *d, *child, *child_tmp; + struct dentry *d; struct clk *pa = c->parent; - char s[255]; - char *p = s; - p += sprintf(p, "%s", c->name); - d = debugfs_create_dir(s, pa ? pa->dent : clk_debugfs_root); + d = debugfs_create_dir(c->name, pa ? pa->dent : clk_debugfs_root); if (!d) return -ENOMEM; c->dent = d; @@ -509,10 +506,7 @@ return 0; err_out: - d = c->dent; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(c->dent); + debugfs_remove_recursive(c->dent); return err; }
diff --git a/arch/arm/plat-samsung/clock.c b/arch/arm/plat-samsung/clock.c index 7728928..0c9f95d 100644 --- a/arch/arm/plat-samsung/clock.c +++ b/arch/arm/plat-samsung/clock.c
@@ -458,7 +458,7 @@ static int clk_debugfs_register_one(struct clk *c) { int err; - struct dentry *d, *child, *child_tmp; + struct dentry *d; struct clk *pa = c->parent; char s[255]; char *p = s; @@ -488,10 +488,7 @@ return 0; err_out: - d = c->dent; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(c->dent); + debugfs_remove_recursive(c->dent); return err; }
diff --git a/arch/arm/plat-spear/clock.c b/arch/arm/plat-spear/clock.c index 6fa474c..67dd003 100644 --- a/arch/arm/plat-spear/clock.c +++ b/arch/arm/plat-spear/clock.c
@@ -916,7 +916,7 @@ static int clk_debugfs_register_one(struct clk *c) { int err; - struct dentry *d, *child; + struct dentry *d; struct clk *pa = c->pclk; char s[255]; char *p = s; @@ -951,10 +951,7 @@ return 0; err_out: - d = c->dent; - list_for_each_entry(child, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(c->dent); + debugfs_remove_recursive(c->dent); return err; }
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 3c7c3f8..fb59c46 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1850,9 +1850,16 @@ return ret; } -static int spufs_mfc_fsync(struct file *file, int datasync) +static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return spufs_mfc_flush(file, NULL); + struct inode *inode = file->f_path.dentry->d_inode; + int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (!err) { + mutex_lock(&inode->i_mutex); + err = spufs_mfc_flush(file, NULL); + mutex_unlock(&inode->i_mutex); + } + return err; } static int spufs_mfc_fasync(int fd, struct file *file, int on)
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 856e9c3..e481f6b 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -611,15 +611,14 @@ static struct file_system_type spufs_type; -long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode, - struct file *filp) +long spufs_create(struct path *path, struct dentry *dentry, + unsigned int flags, mode_t mode, struct file *filp) { - struct dentry *dentry; int ret; ret = -EINVAL; /* check if we are on spufs */ - if (nd->path.dentry->d_sb->s_type != &spufs_type) + if (path->dentry->d_sb->s_type != &spufs_type) goto out; /* don't accept undefined flags */ @@ -627,33 +626,27 @@ goto out; /* only threads can be underneath a gang */ - if (nd->path.dentry != nd->path.dentry->d_sb->s_root) { + if (path->dentry != path->dentry->d_sb->s_root) { if ((flags & SPU_CREATE_GANG) || - !SPUFS_I(nd->path.dentry->d_inode)->i_gang) + !SPUFS_I(path->dentry->d_inode)->i_gang) goto out; } - dentry = lookup_create(nd, 1); - ret = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_dir; - mode &= ~current_umask(); if (flags & SPU_CREATE_GANG) - ret = spufs_create_gang(nd->path.dentry->d_inode, - dentry, nd->path.mnt, mode); + ret = spufs_create_gang(path->dentry->d_inode, + dentry, path->mnt, mode); else - ret = spufs_create_context(nd->path.dentry->d_inode, - dentry, nd->path.mnt, flags, mode, + ret = spufs_create_context(path->dentry->d_inode, + dentry, path->mnt, flags, mode, filp); if (ret >= 0) - fsnotify_mkdir(nd->path.dentry->d_inode, dentry); + fsnotify_mkdir(path->dentry->d_inode, dentry); return ret; -out_dir: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); out: + mutex_unlock(&path->dentry->d_inode->i_mutex); return ret; }
diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index c448bac..099245f 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -248,7 +248,7 @@ /* system call implementation */ extern struct spufs_calls spufs_calls; long spufs_run_spu(struct spu_context *ctx, u32 *npc, u32 *status); -long spufs_create(struct nameidata *nd, unsigned int flags, +long spufs_create(struct path *nd, struct dentry *dentry, unsigned int flags, mode_t mode, struct file *filp); /* ELF coredump callbacks for writing SPU ELF notes */ extern int spufs_coredump_extra_notes_size(void);
diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index a3d2ce5..609e016 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -62,21 +62,17 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, mode_t mode, struct file *neighbor) { - char *tmp; + struct path path; + struct dentry *dentry; int ret; - tmp = getname(pathname); - ret = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - struct nameidata nd; - - ret = kern_path_parent(tmp, &nd); - if (!ret) { - nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; - ret = spufs_create(&nd, flags, mode, neighbor); - path_put(&nd.path); - } - putname(tmp); + dentry = user_path_create(AT_FDCWD, pathname, &path, 1); + ret = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + ret = spufs_create(&path, dentry, flags, mode, neighbor); + mutex_unlock(&path.dentry->d_inode->i_mutex); + dput(dentry); + path_put(&path); } return ret;
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 82bbb59..6d678c9 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c
@@ -21,12 +21,11 @@ #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> -#include <linux/cred.h> #include <linux/sched.h> -#include <linux/init_task.h> #include <linux/slab.h> +#include <linux/kthread.h> -static struct vfsmount *dev_mnt; +static struct task_struct *thread; #if defined CONFIG_DEVTMPFS_MOUNT static int mount_dev = 1; @@ -34,7 +33,16 @@ static int mount_dev; #endif -static DEFINE_MUTEX(dirlock); +static DEFINE_SPINLOCK(req_lock); + +static struct req { + struct req *next; + struct completion done; + int err; + const char *name; + mode_t mode; /* 0 => delete */ + struct device *dev; +} *requests; static int __init mount_param(char *str) { @@ -68,131 +76,152 @@ static inline int is_blockdev(struct device *dev) { return 0; } #endif +int devtmpfs_create_node(struct device *dev) +{ + const char *tmp = NULL; + struct req req; + + if (!thread) + return 0; + + req.mode = 0; + req.name = device_get_devnode(dev, &req.mode, &tmp); + if (!req.name) + return -ENOMEM; + + if (req.mode == 0) + req.mode = 0600; + if (is_blockdev(dev)) + req.mode |= S_IFBLK; + else + req.mode |= S_IFCHR; + + req.dev = dev; + + init_completion(&req.done); + + spin_lock(&req_lock); + req.next = requests; + requests = &req; + spin_unlock(&req_lock); + + wake_up_process(thread); + wait_for_completion(&req.done); + + kfree(tmp); + + return req.err; +} + +int devtmpfs_delete_node(struct device *dev) +{ + const char *tmp = NULL; + struct req req; + + if (!thread) + return 0; + + req.name = device_get_devnode(dev, NULL, &tmp); + if (!req.name) + return -ENOMEM; + + req.mode = 0; + req.dev = dev; + + init_completion(&req.done); + + spin_lock(&req_lock); + req.next = requests; + requests = &req; + spin_unlock(&req_lock); + + wake_up_process(thread); + wait_for_completion(&req.done); + + kfree(tmp); + return req.err; +} + static int dev_mkdir(const char *name, mode_t mode) { - struct nameidata nd; struct dentry *dentry; + struct path path; int err; - err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, - name, LOOKUP_PARENT, &nd); - if (err) - return err; + dentry = kern_path_create(AT_FDCWD, name, &path, 1); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - dentry = lookup_create(&nd, 1); - if (!IS_ERR(dentry)) { - err = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); - if (!err) - /* mark as kernel-created inode */ - dentry->d_inode->i_private = &dev_mnt; - dput(dentry); - } else { - err = PTR_ERR(dentry); - } - - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + err = vfs_mkdir(path.dentry->d_inode, dentry, mode); + if (!err) + /* mark as kernel-created inode */ + dentry->d_inode->i_private = &thread; + dput(dentry); + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); return err; } static int create_path(const char *nodepath) { + char *path; + char *s; int err; - mutex_lock(&dirlock); - err = dev_mkdir(nodepath, 0755); - if (err == -ENOENT) { - char *path; - char *s; + /* parent directories do not exist, create them */ + path = kstrdup(nodepath, GFP_KERNEL); + if (!path) + return -ENOMEM; - /* parent directories do not exist, create them */ - path = kstrdup(nodepath, GFP_KERNEL); - if (!path) { - err = -ENOMEM; - goto out; - } - s = path; - for (;;) { - s = strchr(s, '/'); - if (!s) - break; - s[0] = '\0'; - err = dev_mkdir(path, 0755); - if (err && err != -EEXIST) - break; - s[0] = '/'; - s++; - } - kfree(path); + s = path; + for (;;) { + s = strchr(s, '/'); + if (!s) + break; + s[0] = '\0'; + err = dev_mkdir(path, 0755); + if (err && err != -EEXIST) + break; + s[0] = '/'; + s++; } -out: - mutex_unlock(&dirlock); + kfree(path); return err; } -int devtmpfs_create_node(struct device *dev) +static int handle_create(const char *nodename, mode_t mode, struct device *dev) { - const char *tmp = NULL; - const char *nodename; - const struct cred *curr_cred; - mode_t mode = 0; - struct nameidata nd; struct dentry *dentry; + struct path path; int err; - if (!dev_mnt) - return 0; - - nodename = device_get_devnode(dev, &mode, &tmp); - if (!nodename) - return -ENOMEM; - - if (mode == 0) - mode = 0600; - if (is_blockdev(dev)) - mode |= S_IFBLK; - else - mode |= S_IFCHR; - - curr_cred = override_creds(&init_cred); - - err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, - nodename, LOOKUP_PARENT, &nd); - if (err == -ENOENT) { + dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, - nodename, LOOKUP_PARENT, &nd); + dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); } - if (err) - goto out; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - dentry = lookup_create(&nd, 0); - if (!IS_ERR(dentry)) { - err = vfs_mknod(nd.path.dentry->d_inode, - dentry, mode, dev->devt); - if (!err) { - struct iattr newattrs; + err = vfs_mknod(path.dentry->d_inode, + dentry, mode, dev->devt); + if (!err) { + struct iattr newattrs; - /* fixup possibly umasked mode */ - newattrs.ia_mode = mode; - newattrs.ia_valid = ATTR_MODE; - mutex_lock(&dentry->d_inode->i_mutex); - notify_change(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); + /* fixup possibly umasked mode */ + newattrs.ia_mode = mode; + newattrs.ia_valid = ATTR_MODE; + mutex_lock(&dentry->d_inode->i_mutex); + notify_change(dentry, &newattrs); + mutex_unlock(&dentry->d_inode->i_mutex); - /* mark as kernel-created inode */ - dentry->d_inode->i_private = &dev_mnt; - } - dput(dentry); - } else { - err = PTR_ERR(dentry); + /* mark as kernel-created inode */ + dentry->d_inode->i_private = &thread; } + dput(dentry); - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); -out: - kfree(tmp); - revert_creds(curr_cred); + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); return err; } @@ -202,8 +231,7 @@ struct dentry *dentry; int err; - err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, - name, LOOKUP_PARENT, &nd); + err = kern_path_parent(name, &nd); if (err) return err; @@ -211,7 +239,7 @@ dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); if (!IS_ERR(dentry)) { if (dentry->d_inode) { - if (dentry->d_inode->i_private == &dev_mnt) + if (dentry->d_inode->i_private == &thread) err = vfs_rmdir(nd.path.dentry->d_inode, dentry); else @@ -238,7 +266,6 @@ if (!path) return -ENOMEM; - mutex_lock(&dirlock); for (;;) { char *base; @@ -250,7 +277,6 @@ if (err) break; } - mutex_unlock(&dirlock); kfree(path); return err; @@ -259,7 +285,7 @@ static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) { /* did we create it */ - if (inode->i_private != &dev_mnt) + if (inode->i_private != &thread) return 0; /* does the dev_t match */ @@ -277,29 +303,17 @@ return 1; } -int devtmpfs_delete_node(struct device *dev) +static int handle_remove(const char *nodename, struct device *dev) { - const char *tmp = NULL; - const char *nodename; - const struct cred *curr_cred; struct nameidata nd; struct dentry *dentry; struct kstat stat; int deleted = 1; int err; - if (!dev_mnt) - return 0; - - nodename = device_get_devnode(dev, NULL, &tmp); - if (!nodename) - return -ENOMEM; - - curr_cred = override_creds(&init_cred); - err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, - nodename, LOOKUP_PARENT, &nd); + err = kern_path_parent(nodename, &nd); if (err) - goto out; + return err; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); @@ -337,9 +351,6 @@ path_put(&nd.path); if (deleted && strchr(nodename, '/')) delete_path(nodename); -out: - kfree(tmp); - revert_creds(curr_cred); return err; } @@ -354,7 +365,7 @@ if (!mount_dev) return 0; - if (!dev_mnt) + if (!thread) return 0; err = sys_mount("devtmpfs", (char *)mntdir, "devtmpfs", MS_SILENT, NULL); @@ -365,31 +376,79 @@ return err; } +static __initdata DECLARE_COMPLETION(setup_done); + +static int handle(const char *name, mode_t mode, struct device *dev) +{ + if (mode) + return handle_create(name, mode, dev); + else + return handle_remove(name, dev); +} + +static int devtmpfsd(void *p) +{ + char options[] = "mode=0755"; + int *err = p; + *err = sys_unshare(CLONE_NEWNS); + if (*err) + goto out; + *err = sys_mount("devtmpfs", "/", "devtmpfs", MS_SILENT, options); + if (*err) + goto out; + sys_chdir("/.."); /* will traverse into overmounted root */ + sys_chroot("."); + complete(&setup_done); + while (1) { + spin_lock(&req_lock); + while (requests) { + struct req *req = requests; + requests = NULL; + spin_unlock(&req_lock); + while (req) { + req->err = handle(req->name, req->mode, req->dev); + complete(&req->done); + req = req->next; + } + spin_lock(&req_lock); + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&req_lock); + schedule(); + __set_current_state(TASK_RUNNING); + } + return 0; +out: + complete(&setup_done); + return *err; +} + /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { - int err; - struct vfsmount *mnt; - char options[] = "mode=0755"; - - err = register_filesystem(&dev_fs_type); + int err = register_filesystem(&dev_fs_type); if (err) { printk(KERN_ERR "devtmpfs: unable to register devtmpfs " "type %i\n", err); return err; } - mnt = kern_mount_data(&dev_fs_type, options); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); + if (!IS_ERR(thread)) { + wait_for_completion(&setup_done); + } else { + err = PTR_ERR(thread); + thread = NULL; + } + + if (err) { printk(KERN_ERR "devtmpfs: unable to create devtmpfs %i\n", err); unregister_filesystem(&dev_fs_type); return err; } - dev_mnt = mnt; printk(KERN_INFO "devtmpfs: initialized\n"); return 0;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 07a382e..e133f09 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c
@@ -1206,7 +1206,7 @@ if (!sb) return 0; - if (!sb->s_op || !sb->s_op->relocate_blocks) + if (!sb->s_op->relocate_blocks) goto out; old_block = pkt->sector / (CD_FRAMESIZE >> 9);
diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c index 0e941b5..6c4f4b5 100644 --- a/drivers/char/generic_nvram.c +++ b/drivers/char/generic_nvram.c
@@ -34,12 +34,16 @@ static loff_t nvram_llseek(struct file *file, loff_t offset, int origin) { switch (origin) { + case 0: + break; case 1: offset += file->f_pos; break; case 2: offset += nvram_len; break; + default: + offset = -1; } if (offset < 0) return -EINVAL;
diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c index 166f1e7..da3cfee 100644 --- a/drivers/char/nvram.c +++ b/drivers/char/nvram.c
@@ -224,6 +224,8 @@ case 2: offset += NVRAM_BYTES; break; + default: + return -EINVAL; } return (offset >= 0) ? (file->f_pos = offset) : -EINVAL;
diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c index 85c004a..d0c57c2 100644 --- a/drivers/char/ps3flash.c +++ b/drivers/char/ps3flash.c
@@ -101,12 +101,16 @@ mutex_lock(&file->f_mapping->host->i_mutex); switch (origin) { + case 0: + break; case 1: offset += file->f_pos; break; case 2: offset += dev->regions[dev->region_idx].size*dev->blk_size; break; + default: + offset = -1; } if (offset < 0) { res = -EINVAL; @@ -305,9 +309,14 @@ return ps3flash_writeback(ps3flash_dev); } -static int ps3flash_fsync(struct file *file, int datasync) +static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return ps3flash_writeback(ps3flash_dev); + struct inode *inode = file->f_path.dentry->d_inode; + int err; + mutex_lock(&inode->i_mutex); + err = ps3flash_writeback(ps3flash_dev); + mutex_unlock(&inode->i_mutex); + return err; } static irqreturn_t ps3flash_interrupt(int irq, void *data)
diff --git a/drivers/macintosh/nvram.c b/drivers/macintosh/nvram.c index a271c82..f0e03e7 100644 --- a/drivers/macintosh/nvram.c +++ b/drivers/macintosh/nvram.c
@@ -21,12 +21,16 @@ static loff_t nvram_llseek(struct file *file, loff_t offset, int origin) { switch (origin) { + case 0: + break; case 1: offset += file->f_pos; break; case 2: offset += NVRAM_SIZE; break; + default: + offset = -1; } if (offset < 0) return -EINVAL;
diff --git a/drivers/md/md.c b/drivers/md/md.c index 91e31e2..dfc9425 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -6394,16 +6394,11 @@ mddev_put(mddev); } -struct mdstat_info { - int event; -}; - static int md_seq_show(struct seq_file *seq, void *v) { mddev_t *mddev = v; sector_t sectors; mdk_rdev_t *rdev; - struct mdstat_info *mi = seq->private; struct bitmap *bitmap; if (v == (void*)1) { @@ -6415,7 +6410,7 @@ spin_unlock(&pers_lock); seq_printf(seq, "\n"); - mi->event = atomic_read(&md_event_count); + seq->poll_event = atomic_read(&md_event_count); return 0; } if (v == (void*)2) { @@ -6527,26 +6522,21 @@ static int md_seq_open(struct inode *inode, struct file *file) { + struct seq_file *seq; int error; - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); - if (mi == NULL) - return -ENOMEM; error = seq_open(file, &md_seq_ops); if (error) - kfree(mi); - else { - struct seq_file *p = file->private_data; - p->private = mi; - mi->event = atomic_read(&md_event_count); - } + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); return error; } static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { - struct seq_file *m = filp->private_data; - struct mdstat_info *mi = m->private; + struct seq_file *seq = filp->private_data; int mask; poll_wait(filp, &md_event_waiters, wait); @@ -6554,7 +6544,7 @@ /* always allow read */ mask = POLLIN | POLLRDNORM; - if (mi->event != atomic_read(&md_event_count)) + if (seq->poll_event != atomic_read(&md_event_count)) mask |= POLLERR | POLLPRI; return mask; }
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 191f3bb..3320a50 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c
@@ -189,12 +189,16 @@ return new_offset; } -static int vol_cdev_fsync(struct file *file, int datasync) +static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct ubi_volume_desc *desc = file->private_data; struct ubi_device *ubi = desc->vol->ubi; - - return ubi_sync(ubi->ubi_num); + struct inode *inode = file->f_path.dentry->d_inode; + int err; + mutex_lock(&inode->i_mutex); + err = ubi_sync(ubi->ubi_num); + mutex_unlock(&inode->i_mutex); + return err; }
diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c index 7e9c399..d6702e5 100644 --- a/drivers/sh/clk/core.c +++ b/drivers/sh/clk/core.c
@@ -670,7 +670,7 @@ static int clk_debugfs_register_one(struct clk *c) { int err; - struct dentry *d, *child, *child_tmp; + struct dentry *d; struct clk *pa = c->parent; char s[255]; char *p = s; @@ -699,10 +699,7 @@ return 0; err_out: - d = c->dentry; - list_for_each_entry_safe(child, child_tmp, &d->d_subdirs, d_u.d_child) - debugfs_remove(child); - debugfs_remove(c->dentry); + debugfs_remove_recursive(c->dentry); return err; }
diff --git a/drivers/staging/pohmelfs/dir.c b/drivers/staging/pohmelfs/dir.c index 9732a96..7598e77 100644 --- a/drivers/staging/pohmelfs/dir.c +++ b/drivers/staging/pohmelfs/dir.c
@@ -512,7 +512,7 @@ int err, lock_type = POHMELFS_READ_LOCK, need_lock = 1; struct qstr str = dentry->d_name; - if ((nd->intent.open.flags & O_ACCMODE) > 1) + if ((nd->intent.open.flags & O_ACCMODE) != O_RDONLY) lock_type = POHMELFS_WRITE_LOCK; if (test_bit(NETFS_INODE_OWNED, &parent->state)) {
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index c0f0ac7..f3c6060 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c
@@ -887,11 +887,16 @@ /* * We want fsync() to work on POHMELFS. */ -static int pohmelfs_fsync(struct file *file, int datasync) +static int pohmelfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; - - return sync_inode_metadata(inode, 1); + int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (!err) { + mutex_lock(&inode->i_mutex); + err = sync_inode_metadata(inode, 1); + mutex_unlock(&inode->i_mutex); + } + return err; } ssize_t pohmelfs_write(struct file *file, const char __user *buf,
diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c index 271ef94..978e6a1 100644 --- a/drivers/usb/gadget/printer.c +++ b/drivers/usb/gadget/printer.c
@@ -795,12 +795,14 @@ } static int -printer_fsync(struct file *fd, int datasync) +printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) { struct printer_dev *dev = fd->private_data; + struct inode *inode = fd->f_path.dentry->d_inode; unsigned long flags; int tx_list_empty; + mutex_lock(&inode->i_mutex); spin_lock_irqsave(&dev->lock, flags); tx_list_empty = (likely(list_empty(&dev->tx_reqs))); spin_unlock_irqrestore(&dev->lock, flags); @@ -810,6 +812,7 @@ wait_event_interruptible(dev->tx_flush_wait, (likely(list_empty(&dev->tx_reqs_active)))); } + mutex_unlock(&inode->i_mutex); return 0; }
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 8040001..32814e8 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c
@@ -66,19 +66,26 @@ return 0; } -int fb_deferred_io_fsync(struct file *file, int datasync) +int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fb_info *info = file->private_data; + struct inode *inode = file->f_path.dentry->d_inode; + int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; /* Skip if deferred io is compiled-in but disabled on this fbdev */ if (!info->fbdefio) return 0; + mutex_lock(&inode->i_mutex); /* Kill off the delayed work */ cancel_delayed_work_sync(&info->deferred_work); /* Run it immediately */ - return schedule_delayed_work(&info->deferred_work, 0); + err = schedule_delayed_work(&info->deferred_work, 0); + mutex_unlock(&inode->i_mutex); + return err; } EXPORT_SYMBOL_GPL(fb_deferred_io_fsync);
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 535ab6e..e98f56d 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c
@@ -96,12 +96,12 @@ return acl; } -int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags) +int v9fs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; struct v9fs_session_info *v9ses; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; v9ses = v9fs_inode2v9ses(inode);
diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 7ef3ac9..59e18c2 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h
@@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern int v9fs_check_acl(struct inode *inode, int mask); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, struct posix_acl *, struct posix_acl *);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 4014160..46ce357 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h
@@ -70,7 +70,8 @@ ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); void v9fs_blank_wstat(struct p9_wstat *wstat); int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); -int v9fs_file_fsync_dotl(struct file *filp, int datasync); +int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, + int datasync); ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, const char __user *, size_t, loff_t *, int); int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index ffed558..3c173fc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c
@@ -519,32 +519,50 @@ } -static int v9fs_file_fsync(struct file *filp, int datasync) +static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct p9_fid *fid; + struct inode *inode = filp->f_mapping->host; struct p9_wstat wstat; int retval; + retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (retval) + return retval; + + mutex_lock(&inode->i_mutex); P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); + mutex_unlock(&inode->i_mutex); + return retval; } -int v9fs_file_fsync_dotl(struct file *filp, int datasync) +int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, + int datasync) { struct p9_fid *fid; + struct inode *inode = filp->f_mapping->host; int retval; + retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (retval) + return retval; + + mutex_lock(&inode->i_mutex); P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); + mutex_unlock(&inode->i_mutex); + return retval; }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7f6c677..7f9976a 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c
@@ -633,8 +633,8 @@ fid = NULL; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); - if (nd && nd->flags & LOOKUP_OPEN) - flags = nd->intent.open.flags - 1; + if (nd) + flags = nd->intent.open.flags; else flags = O_RDWR; @@ -649,7 +649,7 @@ v9fs_invalidate_inode_attr(dir); /* if we are opening a file, assign the open fid to the file */ - if (nd && nd->flags & LOOKUP_OPEN) { + if (nd) { v9inode = V9FS_I(dentry->d_inode); mutex_lock(&v9inode->v_mutex); if (v9ses->cache && !v9inode->writeback_fid &&
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 691c78f..32bbbe5 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c
@@ -173,8 +173,8 @@ struct posix_acl *pacl = NULL, *dacl = NULL; v9ses = v9fs_inode2v9ses(dir); - if (nd && nd->flags & LOOKUP_OPEN) - flags = nd->intent.open.flags - 1; + if (nd) + flags = nd->intent.open.flags; else { /* * create call without LOOKUP_OPEN is due
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0e95f73..c2b9c79 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h
@@ -182,7 +182,7 @@ void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); -int affs_file_fsync(struct file *, int); +int affs_file_fsync(struct file *, loff_t, loff_t, int); /* dir.c */
diff --git a/fs/affs/file.c b/fs/affs/file.c index acf321b..2f4c935 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c
@@ -923,14 +923,20 @@ affs_free_prealloc(inode); } -int affs_file_fsync(struct file *filp, int datasync) +int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; int ret, err; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h index 8bbefe0..800f607 100644 --- a/fs/afs/afs_vl.h +++ b/fs/afs/afs_vl.h
@@ -49,7 +49,7 @@ AFSVL_BADVOLOPER = 363542, /* Bad volume operation code */ AFSVL_BADRELLOCKTYPE = 363543, /* Bad release lock type */ AFSVL_RERELEASE = 363544, /* Status report: last release was aborted */ - AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server °ag */ + AFSVL_BADSERVERFLAG = 363545, /* Invalid replication site server flag */ AFSVL_PERM = 363546, /* No permission access */ AFSVL_NOMEM = 363547, /* malloc/realloc failed to alloc enough memory */ };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5a9b684..d2b0888 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h
@@ -627,7 +627,7 @@ extern void afs_cache_permit(struct afs_vnode *, struct key *, long); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); -extern int afs_permission(struct inode *, int, unsigned int); +extern int afs_permission(struct inode *, int); /* * server.c @@ -750,7 +750,7 @@ extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern int afs_writeback_all(struct afs_vnode *); -extern int afs_fsync(struct file *, int); +extern int afs_fsync(struct file *, loff_t, loff_t, int); /*****************************************************************************/
diff --git a/fs/afs/security.c b/fs/afs/security.c index f44b9d3..8d01042 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c
@@ -285,14 +285,14 @@ * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask, unsigned int flags) +int afs_permission(struct inode *inode, int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t uninitialized_var(access); struct key *key; int ret; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; _enter("{{%x:%u},%lx},%x,", @@ -350,7 +350,7 @@ } key_put(key); - ret = generic_permission(inode, mask, flags, NULL); + ret = generic_permission(inode, mask); _leave(" = %d", ret); return ret;
diff --git a/fs/afs/write.c b/fs/afs/write.c index b806285..9aa52d9 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c
@@ -681,9 +681,10 @@ * - the return status from this call provides a reliable indication of * whether any write errors occurred for this process. */ -int afs_fsync(struct file *file, int datasync) +int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; + struct inode *inode = file->f_mapping->host; struct afs_writeback *wb, *xwb; struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); int ret; @@ -692,12 +693,19 @@ vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, datasync); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either * completed or rejected */ wb = kzalloc(sizeof(*wb), GFP_KERNEL); - if (!wb) - return -ENOMEM; + if (!wb) { + ret = -ENOMEM; + goto out; + } wb->vnode = vnode; wb->first = 0; wb->last = -1; @@ -720,7 +728,7 @@ if (ret < 0) { afs_put_writeback(wb); _leave(" = %d [wb]", ret); - return ret; + goto out; } /* wait for the preceding writes to actually complete */ @@ -729,6 +737,8 @@ vnode->writebacks.next == &wb->link); afs_put_writeback(wb); _leave(" = %d", ret); +out: + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/attr.c b/fs/attr.c index caf2aa5..538e279 100644 --- a/fs/attr.c +++ b/fs/attr.c
@@ -232,17 +232,11 @@ if (error) return error; - if (ia_valid & ATTR_SIZE) - down_write(&dentry->d_inode->i_alloc_sem); - if (inode->i_op->setattr) error = inode->i_op->setattr(dentry, attr); else error = simple_setattr(dentry, attr); - if (ia_valid & ATTR_SIZE) - up_write(&dentry->d_inode->i_alloc_sem); - if (!error) fsnotify_change(dentry, ia_valid);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c index bfcb18f..9205cf2 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c
@@ -87,7 +87,8 @@ return -EIO; } -static int bad_file_fsync(struct file *file, int datasync) +static int bad_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { return -EIO; } @@ -229,7 +230,7 @@ return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) +static int bad_inode_permission(struct inode *inode, int mask) { return -EIO; }
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 303983f..dd0fdfc 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c
@@ -668,8 +668,7 @@ * mm->dumpable = 0 regardless of the interpreter's * permissions. */ - if (file_permission(interpreter, MAY_READ) < 0) - bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + would_dump(bprm, interpreter); retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2bc5dc6..30745f4 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c
@@ -245,8 +245,7 @@ * mm->dumpable = 0 regardless of the interpreter's * permissions. */ - if (file_permission(interpreter, MAY_READ) < 0) - bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + would_dump(bprm, interpreter); retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 1befe2e..ba1a1ae 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c
@@ -149,8 +149,7 @@ /* if the binary is not readable than enforce mm->dumpable=0 regardless of the interpreter's permissions */ - if (file_permission(bprm->file, MAY_READ)) - bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + would_dump(bprm, bprm->file); allow_write_access(bprm->file); bprm->file = NULL;
diff --git a/fs/block_dev.c b/fs/block_dev.c index 610e8e0..9fb0b15 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c
@@ -355,25 +355,30 @@ mutex_lock(&bd_inode->i_mutex); size = i_size_read(bd_inode); + retval = -EINVAL; switch (origin) { - case 2: + case SEEK_END: offset += size; break; - case 1: + case SEEK_CUR: offset += file->f_pos; + case SEEK_SET: + break; + default: + goto out; } - retval = -EINVAL; if (offset >= 0 && offset <= size) { if (offset != file->f_pos) { file->f_pos = offset; } retval = offset; } +out: mutex_unlock(&bd_inode->i_mutex); return retval; } -int blkdev_fsync(struct file *filp, int datasync) +int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bd_inode = filp->f_mapping->host; struct block_device *bdev = I_BDEV(bd_inode); @@ -384,14 +389,10 @@ * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); if (error == -EOPNOTSUPP) error = 0; - mutex_lock(&bd_inode->i_mutex); - return error; } EXPORT_SYMBOL(blkdev_fsync);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f66fc99..9f62ab2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c
@@ -195,14 +195,13 @@ return ret; } -int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags) +int btrfs_check_acl(struct inode *inode, int mask) { int error = -EAGAIN; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) error = -ECHILD; - } else { struct posix_acl *acl; acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3b859a3..82be74ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h
@@ -1219,7 +1219,7 @@ * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ - struct super_block anon_super; + dev_t anon_dev; }; struct btrfs_ioctl_defrag_range_args { @@ -2510,6 +2510,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); /* inode.c */ +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create); /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ #if defined(ClearPageFsMisc) && !defined(ClearPageChecked) @@ -2602,7 +2605,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_sync_file(struct file *file, int datasync); +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; @@ -2642,7 +2645,7 @@ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); +int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ac8db5d..b231ae1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c
@@ -1077,12 +1077,7 @@ init_completion(&root->kobj_unregister); root->defrag_running = 0; root->root_key.objectid = objectid; - root->anon_super.s_root = NULL; - root->anon_super.s_dev = 0; - INIT_LIST_HEAD(&root->anon_super.s_list); - INIT_LIST_HEAD(&root->anon_super.s_instances); - init_rwsem(&root->anon_super.s_umount); - + root->anon_dev = 0; return 0; } @@ -1311,7 +1306,7 @@ spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - ret = set_anon_super(&root->anon_super, NULL); + ret = get_anon_bdev(&root->anon_dev); if (ret) goto fail; @@ -2393,10 +2388,8 @@ { iput(root->cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); - if (root->anon_super.s_dev) { - down_write(&root->anon_super.s_umount); - kill_anon_super(&root->anon_super); - } + if (root->anon_dev) + free_anon_bdev(root->anon_dev); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa4ef18..59cbdb1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c
@@ -1452,7 +1452,7 @@ * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ -int btrfs_sync_file(struct file *file, int datasync) +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -1462,9 +1462,13 @@ trace_btrfs_sync_file(file, datasync); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + /* we wait first, since the writeback may change the inode */ root->log_batch++; - /* the VFS called filemap_fdatawrite for us */ btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; @@ -1472,8 +1476,10 @@ * check the transaction that last modified this inode * and see if its already been committed */ - if (!BTRFS_I(inode)->last_trans) + if (!BTRFS_I(inode)->last_trans) { + mutex_unlock(&inode->i_mutex); goto out; + } /* * if the last transaction that changed this file was before @@ -1484,6 +1490,7 @@ if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&inode->i_mutex); goto out; } @@ -1496,12 +1503,15 @@ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + mutex_unlock(&inode->i_mutex); goto out; } ret = btrfs_log_dentry_safe(trans, root, dentry); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&inode->i_mutex); goto out; + } /* we've logged all the items and now have a consistent * version of the file in the log. It is possible that @@ -1513,7 +1523,7 @@ * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); if (ret != BTRFS_NO_LOG_SYNC) { if (ret > 0) { @@ -1528,7 +1538,6 @@ } else { ret = btrfs_end_transaction(trans, root); } - mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? -EIO : ret; } @@ -1664,8 +1673,154 @@ return ret; } +static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map *em; + struct extent_state *cached_state = NULL; + u64 lockstart = *offset; + u64 lockend = i_size_read(inode); + u64 start = *offset; + u64 orig_start = *offset; + u64 len = i_size_read(inode); + u64 last_end = 0; + int ret = 0; + + lockend = max_t(u64, root->sectorsize, lockend); + if (lockend <= lockstart) + lockend = lockstart + root->sectorsize; + + len = lockend - lockstart + 1; + + len = max_t(u64, len, root->sectorsize); + if (inode->i_size == 0) + return -ENXIO; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + &cached_state, GFP_NOFS); + + /* + * Delalloc is such a pain. If we have a hole and we have pending + * delalloc for a portion of the hole we will get back a hole that + * exists for the entire range since it hasn't been actually written + * yet. So to take care of this case we need to look for an extent just + * before the position we want in case there is outstanding delalloc + * going on here. + */ + if (origin == SEEK_HOLE && start != 0) { + if (start <= root->sectorsize) + em = btrfs_get_extent_fiemap(inode, NULL, 0, 0, + root->sectorsize, 0); + else + em = btrfs_get_extent_fiemap(inode, NULL, 0, + start - root->sectorsize, + root->sectorsize, 0); + if (IS_ERR(em)) { + ret = -ENXIO; + goto out; + } + last_end = em->start + em->len; + if (em->block_start == EXTENT_MAP_DELALLOC) + last_end = min_t(u64, last_end, inode->i_size); + free_extent_map(em); + } + + while (1) { + em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = -ENXIO; + break; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + if (last_end <= orig_start) { + free_extent_map(em); + ret = -ENXIO; + break; + } + } + + if (origin == SEEK_HOLE) { + *offset = start; + free_extent_map(em); + break; + } + } else { + if (origin == SEEK_DATA) { + if (em->block_start == EXTENT_MAP_DELALLOC) { + if (start >= inode->i_size) { + free_extent_map(em); + ret = -ENXIO; + break; + } + } + + *offset = start; + free_extent_map(em); + break; + } + } + + start = em->start + em->len; + last_end = em->start + em->len; + + if (em->block_start == EXTENT_MAP_DELALLOC) + last_end = min_t(u64, last_end, inode->i_size); + + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + free_extent_map(em); + ret = -ENXIO; + break; + } + free_extent_map(em); + cond_resched(); + } + if (!ret) + *offset = min(*offset, inode->i_size); +out: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + return ret; +} + +static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + case SEEK_CUR: + offset = generic_file_llseek_unlocked(file, offset, origin); + goto out; + case SEEK_DATA: + case SEEK_HOLE: + ret = find_desired_extent(inode, &offset, origin); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + } + + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (offset > inode->i_sb->s_maxbytes) + return -EINVAL; + + /* Special lock needed here? */ + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + const struct file_operations btrfs_file_operations = { - .llseek = generic_file_llseek, + .llseek = btrfs_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3601f0a..2548a04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c
@@ -4079,13 +4079,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct inode *inode; - - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - return d_splice_alias(inode, dentry); + return d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); } unsigned char btrfs_filetype_table[] = { @@ -4772,11 +4766,10 @@ if (err) { drop_inode = 1; } else { - struct dentry *parent = dget_parent(dentry); + struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); btrfs_log_new_name(trans, inode, NULL, parent); - dput(parent); } nr = trans->blocks_used; @@ -6900,7 +6893,7 @@ { struct inode *inode = dentry->d_inode; generic_fillattr(inode, stat); - stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; stat->blocks = (inode_get_bytes(inode) + BTRFS_I(inode)->delalloc_bytes) >> 9; @@ -7068,9 +7061,8 @@ BUG_ON(ret); if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - struct dentry *parent = dget_parent(new_dentry); + struct dentry *parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); - dput(parent); btrfs_end_log_trans(root); } out_fail: @@ -7331,7 +7323,7 @@ return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) +static int btrfs_permission(struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -7339,7 +7331,7 @@ return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; - return generic_permission(inode, mask, flags, btrfs_check_acl); + return generic_permission(inode, mask); } static const struct inode_operations btrfs_dir_inode_operations = { @@ -7359,10 +7351,12 @@ .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, + .check_acl = btrfs_check_acl, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, + .check_acl = btrfs_check_acl, }; static const struct file_operations btrfs_dir_file_operations = { @@ -7431,6 +7425,7 @@ .removexattr = btrfs_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, + .check_acl = btrfs_check_acl, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, @@ -7440,6 +7435,7 @@ .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, + .check_acl = btrfs_check_acl, }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, @@ -7451,6 +7447,7 @@ .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, + .check_acl = btrfs_check_acl, }; const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a3c4751..6225433 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c
@@ -323,7 +323,7 @@ struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *new_root; - struct dentry *parent = dget_parent(dentry); + struct dentry *parent = dentry->d_parent; struct inode *dir; int ret; int err; @@ -332,10 +332,8 @@ u64 index = 0; ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); - if (ret) { - dput(parent); + if (ret) return ret; - } dir = parent->d_inode; @@ -346,10 +344,8 @@ * 2 - dir items */ trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) { - dput(parent); + if (IS_ERR(trans)) return PTR_ERR(trans); - } leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -439,7 +435,6 @@ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - dput(parent); if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -456,7 +451,6 @@ bool readonly) { struct inode *inode; - struct dentry *parent; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; @@ -504,9 +498,7 @@ if (ret) goto fail; - parent = dget_parent(dentry); - inode = btrfs_lookup_dentry(parent->d_inode, dentry); - dput(parent); + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index a2603e7..622f469 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c
@@ -129,8 +129,6 @@ !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || !root->d_inode->i_op->getxattr || - !root->d_sb || - !root->d_sb->s_op || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs) goto error_unsupported;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f605753..8d74ad7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c
@@ -1811,7 +1811,7 @@ spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, int datasync) +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1822,9 +1822,10 @@ dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); sync_write_wait(inode); - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) return ret; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, NULL, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -1841,6 +1842,7 @@ } dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef8f08c..1065ac7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c
@@ -252,7 +252,7 @@ off = 1; } if (filp->f_pos == 1) { - ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; + ino_t ino = parent_ino(filp->f_dentry); dout("readdir off 1 -> '..'\n"); if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), ceph_translate_ino(inode->i_sb, ino), @@ -446,14 +446,19 @@ loff_t retval; mutex_lock(&inode->i_mutex); + retval = -EINVAL; switch (origin) { case SEEK_END: offset += inode->i_size + 2; /* FIXME */ break; case SEEK_CUR: offset += file->f_pos; + case SEEK_SET: + break; + default: + goto out; } - retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { if (offset != file->f_pos) { file->f_pos = offset; @@ -477,6 +482,7 @@ if (offset > old_offset) fi->dir_release_count--; } +out: mutex_unlock(&inode->i_mutex); return retval; } @@ -566,7 +572,6 @@ /* open (but not create!) intent? */ if (nd && (nd->flags & LOOKUP_OPEN) && - (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ !(nd->intent.open.flags & O_CREAT)) { int mode = nd->intent.open.create_mode & ~current->fs->umask; return ceph_lookup_open(dir, dentry, nd, mode, 1); @@ -1113,7 +1118,8 @@ * an fsync() on a dir will wait for any uncommitted directory * operations to commit. */ -static int ceph_dir_fsync(struct file *file, int datasync) +static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_path.dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1123,6 +1129,11 @@ int ret = 0; dout("dir_fsync %p\n", inode); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; @@ -1156,6 +1167,8 @@ } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); + mutex_unlock(&inode->i_mutex); + return ret; }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4698a5c..0d0eae0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c
@@ -226,7 +226,7 @@ struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; int err; - int flags = nd->intent.open.flags - 1; /* silly vfs! */ + int flags = nd->intent.open.flags; dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); @@ -768,13 +768,16 @@ mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - switch (origin) { - case SEEK_END: + if (origin != SEEK_CUR || origin != SEEK_SET) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; goto out; } + } + + switch (origin) { + case SEEK_END: offset += inode->i_size; break; case SEEK_CUR: @@ -790,6 +793,19 @@ } offset += file->f_pos; break; + case SEEK_DATA: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + break; + case SEEK_HOLE: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + offset = inode->i_size; + break; } if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d8858e9..dfb2831 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c
@@ -1795,17 +1795,17 @@ * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask, unsigned int flags) +int ceph_permission(struct inode *inode, int mask) { int err; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); if (!err) - err = generic_permission(inode, mask, flags, NULL); + err = generic_permission(inode, mask); return err; }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f5cabef..30446b1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h
@@ -692,7 +692,7 @@ extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); -extern int ceph_permission(struct inode *inode, int mask, unsigned int flags); +extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); @@ -728,7 +728,8 @@ extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, int datasync); +extern int ceph_fsync(struct file *file, loff_t start, loff_t end, + int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bc4b12c..8655174 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c
@@ -224,7 +224,7 @@ return 0; } -static int cifs_permission(struct inode *inode, int mask, unsigned int flags) +static int cifs_permission(struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -239,7 +239,7 @@ on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(inode, mask, flags, NULL); + return generic_permission(inode, mask); } static struct kmem_cache *cifs_inode_cachep; @@ -704,8 +704,11 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) { - /* origin == SEEK_END => we must revalidate the cached file length */ - if (origin == SEEK_END) { + /* + * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (origin != SEEK_SET || origin != SEEK_CUR) { int rc; struct inode *inode = file->f_path.dentry->d_inode;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 036ca83..fbd050c 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h
@@ -91,8 +91,8 @@ extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int cifs_lock(struct file *, int, struct file_lock *); -extern int cifs_fsync(struct file *, int); -extern int cifs_strict_fsync(struct file *, int); +extern int cifs_fsync(struct file *, loff_t, loff_t, int); +extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int); extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccc1afa..e66297b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c
@@ -320,9 +320,10 @@ } static int -cifs_demultiplex_thread(struct TCP_Server_Info *server) +cifs_demultiplex_thread(void *p) { int length; + struct TCP_Server_Info *server = p; unsigned int pdu_length, total_read; struct smb_hdr *smb_buffer = NULL; struct smb_hdr *bigbuf = NULL; @@ -1791,7 +1792,7 @@ * this will succeed. No need for try_module_get(). */ __module_get(THIS_MODULE); - tcp_ses->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, + tcp_ses->tsk = kthread_run(cifs_demultiplex_thread, tcp_ses, "cifsd"); if (IS_ERR(tcp_ses->tsk)) { rc = PTR_ERR(tcp_ses->tsk);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fa8c21d9..14d602f 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c
@@ -179,7 +179,7 @@ if (oplockEnabled) oplock = REQ_OPLOCK; - if (nd && (nd->flags & LOOKUP_OPEN)) + if (nd) oflags = nd->intent.open.file->f_flags; else oflags = O_RDONLY | O_CREAT; @@ -214,7 +214,7 @@ which should be rare for path not covered on files) */ } - if (nd && (nd->flags & LOOKUP_OPEN)) { + if (nd) { /* if the file is going to stay open, then we need to set the desired access properly */ desiredAccess = 0; @@ -328,7 +328,7 @@ else cFYI(1, "Create worked, get_inode_info failed rc = %d", rc); - if (newinode && nd && (nd->flags & LOOKUP_OPEN)) { + if (newinode && nd) { struct cifsFileInfo *pfile_info; struct file *filp; @@ -568,7 +568,7 @@ * reduction in network traffic in the other paths. */ if (pTcon->unix_ext) { - if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && + if (nd && !(nd->flags & LOOKUP_DIRECTORY) && (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && (nd->intent.open.file->f_flags & O_CREAT)) { rc = cifs_posix_open(full_path, &newInode, @@ -663,10 +663,8 @@ * case sensitive name which is specified by user if this is * for creation. */ - if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) - return 0; - } + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; if (time_after(jiffies, direntry->d_time + HZ) || !lookupCacheEnabled) return 0;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a9b4a24..378acdaf 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c
@@ -1401,7 +1401,8 @@ return rc; } -int cifs_strict_fsync(struct file *file, int datasync) +int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { int xid; int rc = 0; @@ -1410,6 +1411,11 @@ struct inode *inode = file->f_path.dentry->d_inode; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + mutex_lock(&inode->i_mutex); + xid = GetXid(); cFYI(1, "Sync file - name: %s datasync: 0x%x", @@ -1428,16 +1434,23 @@ rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); FreeXid(xid); + mutex_unlock(&inode->i_mutex); return rc; } -int cifs_fsync(struct file *file, int datasync) +int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int xid; int rc = 0; struct cifs_tcon *tcon; struct cifsFileInfo *smbfile = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct inode *inode = file->f_mapping->host; + + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + mutex_lock(&inode->i_mutex); xid = GetXid(); @@ -1449,6 +1462,7 @@ rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); FreeXid(xid); + mutex_unlock(&inode->i_mutex); return rc; }
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6751e74..965a3af 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c
@@ -796,7 +796,7 @@ file->f_pos++; case 1: if (filldir(direntry, "..", 2, file->f_pos, - file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { + parent_ino(file->f_path.dentry), DT_DIR) < 0) { cERROR(1, "Filldir for parent dir failed"); rc = -ENOMEM; break;
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index 6b443ff..b7143cf 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h
@@ -11,7 +11,7 @@ void coda_destroy_inodecache(void); int coda_init_inodecache(void); -int coda_fsync(struct file *coda_file, int datasync); +int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync); void coda_sysctl_init(void); void coda_sysctl_clean(void);
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index 9b0c532..44e17e9 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h
@@ -39,7 +39,7 @@ /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask, unsigned int flags); +int coda_permission(struct inode *inode, int mask); int coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 2b8dae4..0239433 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c
@@ -132,11 +132,11 @@ } -int coda_permission(struct inode *inode, int mask, unsigned int flags) +int coda_permission(struct inode *inode, int mask) { int error; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; mask &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -449,8 +449,7 @@ struct file *host_file; struct dentry *de; struct venus_dirent *vdir; - unsigned long vdir_size = - (unsigned long)(&((struct venus_dirent *)0)->d_name); + unsigned long vdir_size = offsetof(struct venus_dirent, d_name); unsigned int type; struct qstr name; ino_t ino; @@ -474,7 +473,7 @@ coda_file->f_pos++; } if (coda_file->f_pos == 1) { - ret = filldir(buf, "..", 2, 1, de->d_parent->d_inode->i_ino, DT_DIR); + ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR); if (ret < 0) goto out; result++;
diff --git a/fs/coda/file.c b/fs/coda/file.c index 0433057..8edd404 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c
@@ -199,7 +199,7 @@ return 0; } -int coda_fsync(struct file *coda_file, int datasync) +int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) { struct file *host_file; struct inode *coda_inode = coda_file->f_path.dentry->d_inode; @@ -210,6 +210,11 @@ S_ISLNK(coda_inode->i_mode))) return -EINVAL; + err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&coda_inode->i_mutex); + cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; @@ -217,6 +222,7 @@ err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); + mutex_unlock(&coda_inode->i_mutex); return err; }
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index cb140ef..ee0981f 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c
@@ -24,7 +24,7 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags); +static int coda_ioctl_permission(struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -41,7 +41,7 @@ }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) +static int coda_ioctl_permission(struct inode *inode, int mask) { return (mask & MAY_EXEC) ? -EACCES : 0; }
diff --git a/fs/dcache.c b/fs/dcache.c index fbdcbca..be18598 100644 --- a/fs/dcache.c +++ b/fs/dcache.c
@@ -344,6 +344,24 @@ EXPORT_SYMBOL(d_drop); /* + * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag + * @dentry: dentry to drop + * + * This is called when we do a lookup on a placeholder dentry that needed to be + * looked up. The dentry should have been hashed in order for it to be found by + * the lookup code, but now needs to be unhashed while we do the actual lookup + * and clear the DCACHE_NEED_LOOKUP flag. + */ +void d_clear_need_lookup(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + dentry->d_flags &= ~DCACHE_NEED_LOOKUP; + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_clear_need_lookup); + +/* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. * If ref is non-zero, then decrement the refcount too. @@ -432,8 +450,13 @@ if (d_unhashed(dentry)) goto kill_it; - /* Otherwise leave it cached and ensure it's on the LRU */ - dentry->d_flags |= DCACHE_REFERENCED; + /* + * If this dentry needs lookup, don't set the referenced flag so that it + * is more likely to be cleaned up by the dcache shrinker in case of + * memory pressure. + */ + if (!d_need_lookup(dentry)) + dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_count--; @@ -526,10 +549,6 @@ */ rcu_read_lock(); ret = dentry->d_parent; - if (!ret) { - rcu_read_unlock(); - goto out; - } spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); @@ -540,7 +559,6 @@ BUG_ON(!ret->d_count); ret->d_count++; spin_unlock(&ret->d_lock); -out: return ret; } EXPORT_SYMBOL(dget_parent); @@ -720,13 +738,11 @@ * * If flags contains DCACHE_REFERENCED reference dentries will not be pruned. */ -static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags) +static void __shrink_dcache_sb(struct super_block *sb, int count, int flags) { - /* called from prune_dcache() and shrink_dcache_parent() */ struct dentry *dentry; LIST_HEAD(referenced); LIST_HEAD(tmp); - int cnt = *count; relock: spin_lock(&dcache_lru_lock); @@ -754,7 +770,7 @@ } else { list_move_tail(&dentry->d_lru, &tmp); spin_unlock(&dentry->d_lock); - if (!--cnt) + if (!--count) break; } cond_resched_lock(&dcache_lru_lock); @@ -764,83 +780,22 @@ spin_unlock(&dcache_lru_lock); shrink_dentry_list(&tmp); - - *count = cnt; } /** - * prune_dcache - shrink the dcache - * @count: number of entries to try to free + * prune_dcache_sb - shrink the dcache + * @nr_to_scan: number of entries to try to free * - * Shrink the dcache. This is done when we need more memory, or simply when we - * need to unmount something (at which point we need to unuse all dentries). + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is + * done when we need more memory an called from the superblock shrinker + * function. * - * This function may fail to free any resources if all the dentries are in use. + * This function may fail to free any resources if all the dentries are in + * use. */ -static void prune_dcache(int count) +void prune_dcache_sb(struct super_block *sb, int nr_to_scan) { - struct super_block *sb, *p = NULL; - int w_count; - int unused = dentry_stat.nr_unused; - int prune_ratio; - int pruned; - - if (unused == 0 || count == 0) - return; - if (count >= unused) - prune_ratio = 1; - else - prune_ratio = unused / count; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) - continue; - if (sb->s_nr_dentry_unused == 0) - continue; - sb->s_count++; - /* Now, we reclaim unused dentrins with fairness. - * We reclaim them same percentage from each superblock. - * We calculate number of dentries to scan on this sb - * as follows, but the implementation is arranged to avoid - * overflows: - * number of dentries to scan on this sb = - * count * (number of dentries on this sb / - * number of dentries in the machine) - */ - spin_unlock(&sb_lock); - if (prune_ratio != 1) - w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1; - else - w_count = sb->s_nr_dentry_unused; - pruned = w_count; - /* - * We need to be sure this filesystem isn't being unmounted, - * otherwise we could race with generic_shutdown_super(), and - * end up holding a reference to an inode while the filesystem - * is unmounted. So we try to get s_umount, and make sure - * s_root isn't NULL. - */ - if (down_read_trylock(&sb->s_umount)) { - if ((sb->s_root != NULL) && - (!list_empty(&sb->s_dentry_lru))) { - __shrink_dcache_sb(sb, &w_count, - DCACHE_REFERENCED); - pruned -= w_count; - } - up_read(&sb->s_umount); - } - spin_lock(&sb_lock); - if (p) - __put_super(p); - count -= pruned; - p = sb; - /* more work left to do? */ - if (count <= 0) - break; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); + __shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED); } /** @@ -1215,45 +1170,13 @@ int found; while ((found = select_parent(parent)) != 0) - __shrink_dcache_sb(sb, &found, 0); + __shrink_dcache_sb(sb, found, 0); } EXPORT_SYMBOL(shrink_dcache_parent); -/* - * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. - * - * We need to avoid reentering the filesystem if the caller is performing a - * GFP_NOFS allocation attempt. One example deadlock is: - * - * ext2_new_block->getblk->GFP->shrink_dcache_memory->prune_dcache-> - * prune_one_dentry->dput->dentry_iput->iput->inode->i_sb->s_op->put_inode-> - * ext2_discard_prealloc->ext2_free_blocks->lock_super->DEADLOCK. - * - * In this case we return -1 to tell the caller that we baled. - */ -static int shrink_dcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr) { - if (!(gfp_mask & __GFP_FS)) - return -1; - prune_dcache(nr); - } - - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; -} - -static struct shrinker dcache_shrinker = { - .shrink = shrink_dcache_memory, - .seeks = DEFAULT_SEEKS, -}; - /** - * d_alloc - allocate a dcache entry - * @parent: parent of entry to allocate + * __d_alloc - allocate a dcache entry + * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory @@ -1261,7 +1184,7 @@ * copied and the copy passed in may be reused after this call. */ -struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) +struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; @@ -1291,8 +1214,8 @@ spin_lock_init(&dentry->d_lock); seqcount_init(&dentry->d_seq); dentry->d_inode = NULL; - dentry->d_parent = NULL; - dentry->d_sb = NULL; + dentry->d_parent = dentry; + dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); @@ -1300,36 +1223,47 @@ INIT_LIST_HEAD(&dentry->d_subdirs); INIT_LIST_HEAD(&dentry->d_alias); INIT_LIST_HEAD(&dentry->d_u.d_child); - - if (parent) { - spin_lock(&parent->d_lock); - /* - * don't need child lock because it is not subject - * to concurrency here - */ - __dget_dlock(parent); - dentry->d_parent = parent; - dentry->d_sb = parent->d_sb; - d_set_d_op(dentry, dentry->d_sb->s_d_op); - list_add(&dentry->d_u.d_child, &parent->d_subdirs); - spin_unlock(&parent->d_lock); - } + d_set_d_op(dentry, dentry->d_sb->s_d_op); this_cpu_inc(nr_dentry); return dentry; } + +/** + * d_alloc - allocate a dcache entry + * @parent: parent of entry to allocate + * @name: qstr of the name + * + * Allocates a dentry. It returns %NULL if there is insufficient memory + * available. On a success the dentry is returned. The name passed in is + * copied and the copy passed in may be reused after this call. + */ +struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) +{ + struct dentry *dentry = __d_alloc(parent->d_sb, name); + if (!dentry) + return NULL; + + spin_lock(&parent->d_lock); + /* + * don't need child lock because it is not subject + * to concurrency here + */ + __dget_dlock(parent); + dentry->d_parent = parent; + list_add(&dentry->d_u.d_child, &parent->d_subdirs); + spin_unlock(&parent->d_lock); + + return dentry; +} EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { - struct dentry *dentry = d_alloc(NULL, name); - if (dentry) { - dentry->d_sb = sb; - d_set_d_op(dentry, dentry->d_sb->s_d_op); - dentry->d_parent = dentry; + struct dentry *dentry = __d_alloc(sb, name); + if (dentry) dentry->d_flags |= DCACHE_DISCONNECTED; - } return dentry; } EXPORT_SYMBOL(d_alloc_pseudo); @@ -1499,13 +1433,9 @@ if (root_inode) { static const struct qstr name = { .name = "/", .len = 1 }; - res = d_alloc(NULL, &name); - if (res) { - res->d_sb = root_inode->i_sb; - d_set_d_op(res, res->d_sb->s_d_op); - res->d_parent = res; + res = __d_alloc(root_inode->i_sb, &name); + if (res) d_instantiate(res, root_inode); - } } return res; } @@ -1566,13 +1496,11 @@ if (res) goto out_iput; - tmp = d_alloc(NULL, &anonstring); + tmp = __d_alloc(inode->i_sb, &anonstring); if (!tmp) { res = ERR_PTR(-ENOMEM); goto out_iput; } - tmp->d_parent = tmp; /* make sure dput doesn't croak */ - spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); @@ -1584,8 +1512,6 @@ /* attach a disconnected dentry */ spin_lock(&tmp->d_lock); - tmp->d_sb = inode->i_sb; - d_set_d_op(tmp, tmp->d_sb->s_d_op); tmp->d_inode = inode; tmp->d_flags |= DCACHE_DISCONNECTED; list_add(&tmp->d_alias, &inode->i_dentry); @@ -1626,6 +1552,9 @@ { struct dentry *new = NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); new = __d_find_alias(inode, 1); @@ -1708,29 +1637,22 @@ } /* + * We are going to instantiate this dentry, unhash it and clear the + * lookup flag so we can do that. + */ + if (unlikely(d_need_lookup(found))) + d_clear_need_lookup(found); + + /* * Negative dentry: instantiate it unless the inode is a directory and * already has a dentry. */ - spin_lock(&inode->i_lock); - if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) { - __d_instantiate(found, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(found, inode); - return found; + new = d_splice_alias(inode, found); + if (new) { + dput(found); + found = new; } - - /* - * In case a directory already has a (disconnected) entry grab a - * reference to it, move it in place and use it. - */ - new = list_entry(inode->i_dentry.next, struct dentry, d_alias); - __dget(new); - spin_unlock(&inode->i_lock); - security_d_instantiate(found, inode); - d_move(new, found); - iput(inode); - dput(found); - return new; + return found; err_out: iput(inode); @@ -3045,8 +2967,6 @@ */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); - - register_shrinker(&dcache_shrinker); /* Hash may have been set up in dcache_init_early */ if (!hashdist)
diff --git a/fs/direct-io.c b/fs/direct-io.c index ac5f164..01d2d9e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c
@@ -135,6 +135,50 @@ struct page *pages[DIO_PAGES]; /* page buffer */ }; +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wait); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL_GPL(inode_dio_wait); + +/* + * inode_dio_done - signal finish of a direct I/O requests + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. + */ +void inode_dio_done(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} +EXPORT_SYMBOL_GPL(inode_dio_done); + /* * How many pages are in the queue? */ @@ -249,14 +293,12 @@ if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, dio->map_bh.b_private, ret, is_async); - } else if (is_async) { - aio_complete(dio->iocb, ret, 0); + } else { + if (is_async) + aio_complete(dio->iocb, ret, 0); + inode_dio_done(dio->inode); } - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - return ret; } @@ -980,9 +1022,6 @@ return ret; } -/* - * Releases both i_mutex and i_alloc_sem - */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, @@ -1146,15 +1185,16 @@ * For writes this function is called under i_mutex and returns with * i_mutex held, for reads, i_mutex is not held on entry, but it is * taken and dropped again before returning. - * For reads and writes i_alloc_sem is taken in shared mode and released - * on I/O completion (which may happen asynchronously after returning to - * the caller). - * * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize * direct I/O reads/writes versus each other and truncate. - * For reads and writes both i_mutex and i_alloc_sem are not held on - * entry and are never taken. + * + * To help with locking against truncate we incremented the i_dio_count + * counter before starting direct I/O, and decrement it once we are done. + * Truncate can wait for it to reach zero to provide exclusion. It is + * expected that filesystem provide exclusion between new direct I/O + * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * but other filesystems need to take care of this on their own. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1200,6 +1240,10 @@ } } + /* watch out for a 0 len io from a tricksy fs */ + if (rw == READ && end == offset) + return 0; + dio = kmalloc(sizeof(*dio), GFP_KERNEL); retval = -ENOMEM; if (!dio) @@ -1213,8 +1257,7 @@ dio->flags = flags; if (dio->flags & DIO_LOCKING) { - /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && end > offset) { + if (rw == READ) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1229,15 +1272,14 @@ goto out; } } - - /* - * Will be released at I/O completion, possibly in a - * different thread. - */ - down_read_non_owner(&inode->i_alloc_sem); } /* + * Will be decremented at I/O completion time. + */ + atomic_inc(&inode->i_dio_count); + + /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So * even for AIO, we need to wait for i/o to complete before
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 4ec9eb0..c6ac98c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c
@@ -270,14 +270,15 @@ } static int -ecryptfs_fsync(struct file *file, int datasync) +ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int rc = 0; - rc = generic_file_fsync(file, datasync); + rc = generic_file_fsync(file, start, end, datasync); if (rc) goto out; - rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync); + rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end, + datasync); out: return rc; }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 7349ade..340c657 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c
@@ -147,7 +147,6 @@ * @lower_dir_inode: inode of the parent in the lower fs of the new file * @dentry: New file's dentry * @mode: The mode of the new file - * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * * Creates the file in the lower file system. * @@ -155,31 +154,10 @@ */ static int ecryptfs_create_underlying_file(struct inode *lower_dir_inode, - struct dentry *dentry, int mode, - struct nameidata *nd) + struct dentry *dentry, int mode) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); - struct dentry *dentry_save; - struct vfsmount *vfsmount_save; - unsigned int flags_save; - int rc; - - if (nd) { - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - flags_save = nd->flags; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; - nd->flags &= ~LOOKUP_OPEN; - } - rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); - if (nd) { - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; - nd->flags = flags_save; - } - return rc; + return vfs_create(lower_dir_inode, lower_dentry, mode, NULL); } /** @@ -197,8 +175,7 @@ */ static int ecryptfs_do_create(struct inode *directory_inode, - struct dentry *ecryptfs_dentry, int mode, - struct nameidata *nd) + struct dentry *ecryptfs_dentry, int mode) { int rc; struct dentry *lower_dentry; @@ -213,7 +190,7 @@ goto out; } rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, - ecryptfs_dentry, mode, nd); + ecryptfs_dentry, mode); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -294,7 +271,7 @@ int rc; /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); + rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); if (unlikely(rc)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); @@ -942,10 +919,8 @@ } static int -ecryptfs_permission(struct inode *inode, int mask, unsigned int flags) +ecryptfs_permission(struct inode *inode, int mask) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return inode_permission(ecryptfs_inode_to_lower(inode), mask); }
diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 1511bf9..832b10d 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c
@@ -60,14 +60,11 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { efs_ino_t inodenum; - struct inode * inode = NULL; + struct inode *inode = NULL; inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); - if (inodenum) { + if (inodenum) inode = efs_iget(dir->i_sb, inodenum); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } return d_splice_alias(inode, dentry); }
diff --git a/fs/exec.c b/fs/exec.c index d9576f2..842d570 100644 --- a/fs/exec.c +++ b/fs/exec.c
@@ -1114,6 +1114,13 @@ } EXPORT_SYMBOL(flush_old_exec); +void would_dump(struct linux_binprm *bprm, struct file *file) +{ + if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0) + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; +} +EXPORT_SYMBOL(would_dump); + void setup_new_exec(struct linux_binprm * bprm) { int i, ch; @@ -1153,9 +1160,10 @@ if (bprm->cred->uid != current_euid() || bprm->cred->gid != current_egid()) { current->pdeath_signal = 0; - } else if (file_permission(bprm->file, MAY_READ) || - bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) { - set_dumpable(current->mm, suid_dumpable); + } else { + would_dump(bprm, bprm->file); + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) + set_dumpable(current->mm, suid_dumpable); } /*
diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 45ca323..491c6c0 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c
@@ -42,11 +42,19 @@ * Note, in exofs all metadata is written as part of inode, regardless. * The writeout is synchronous */ -static int exofs_file_fsync(struct file *filp, int datasync) +static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { + struct inode *inode = filp->f_mapping->host; int ret; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); ret = sync_inode_metadata(filp->f_mapping->host, 1); + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4d70db1..b54c437 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c
@@ -55,12 +55,7 @@ return ERR_PTR(-ENAMETOOLONG); ino = exofs_inode_by_name(dir, dentry); - inode = NULL; - if (ino) { - inode = exofs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } + inode = ino ? exofs_iget(dir->i_sb, ino) : NULL; return d_splice_alias(inode, dentry); }
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index abea5a1..bfe651f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c
@@ -232,11 +232,11 @@ } int -ext2_check_acl(struct inode *inode, int mask, unsigned int flags) +ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; return -EAGAIN;
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index c939b7b..3ff6cbb 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h
@@ -54,7 +54,7 @@ #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_check_acl (struct inode *, int, unsigned int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 645be9e..af9fc89 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h
@@ -150,7 +150,8 @@ extern const struct file_operations ext2_dir_operations; /* file.c */ -extern int ext2_fsync(struct file *file, int datasync); +extern int ext2_fsync(struct file *file, loff_t start, loff_t end, + int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 49eec94..82e0632 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ return 0; } -int ext2_fsync(struct file *file, int datasync) +int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; struct super_block *sb = file->f_mapping->host->i_sb; struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = generic_file_fsync(file, datasync); + ret = generic_file_fsync(file, start, end, datasync); if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { /* We don't really know where the IO error happened... */ ext2_error(sb, __func__,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 788e09a..a8a58f6 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c
@@ -843,8 +843,8 @@ struct inode *inode = mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, ext2_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + iov_length(iov, nr_segs)); return ret; @@ -1184,6 +1184,8 @@ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + inode_dio_wait(inode); + if (mapping_is_xip(inode->i_mapping)) error = xip_truncate_page(inode->i_mapping, newsize); else if (test_opt(inode->i_sb, NOBH))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index ed5c5d4..d60b709 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c
@@ -67,15 +67,11 @@ inode = NULL; if (ino) { inode = ext2_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ESTALE) { - ext2_error(dir->i_sb, __func__, - "deleted inode referenced: %lu", - (unsigned long) ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ESTALE)) { + ext2_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + (unsigned long) ino); + return ERR_PTR(-EIO); } } return d_splice_alias(inode, dentry);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 9d021c0..edfeb29 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c
@@ -240,11 +240,11 @@ } int -ext3_check_acl(struct inode *inode, int mask, unsigned int flags) +ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; return -EAGAIN;
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 5faf8048..5973346 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h
@@ -54,7 +54,7 @@ #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_check_acl (struct inode *, int, unsigned int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 09b13bb..0bcf63a 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c
@@ -43,7 +43,7 @@ * inode to disk. */ -int ext3_sync_file(struct file *file, int datasync) +int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ext3_inode_info *ei = EXT3_I(inode); @@ -54,6 +54,17 @@ if (inode->i_sb->s_flags & MS_RDONLY) return 0; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + /* + * Taking the mutex here just to keep consistent with how fsync was + * called previously, however it looks like we don't need to take + * i_mutex at all. + */ + mutex_lock(&inode->i_mutex); + J_ASSERT(ext3_journal_current_handle() == NULL); /* @@ -70,8 +81,10 @@ * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext3_should_journal_data(inode)) + if (ext3_should_journal_data(inode)) { + mutex_unlock(&inode->i_mutex); return ext3_force_commit(inode->i_sb); + } if (datasync) commit_tid = atomic_read(&ei->i_datasync_tid); @@ -91,5 +104,6 @@ */ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 3451d23..2978a2a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c
@@ -1816,9 +1816,8 @@ } retry: - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext3_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. @@ -3216,6 +3215,9 @@ ext3_journal_stop(handle); } + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { handle_t *handle;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 34b6d9b..c095cf5 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c
@@ -1038,15 +1038,11 @@ return ERR_PTR(-EIO); } inode = ext3_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ESTALE) { - ext3_error(dir->i_sb, __func__, - "deleted inode referenced: %lu", - ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ESTALE)) { + ext3_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); } } return d_splice_alias(inode, dentry);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index aad153e..b57ea2f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c
@@ -1718,6 +1718,8 @@ sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + /* enable barriers by default */ + set_opt(sbi->s_mount_opt, BARRIER); set_opt(sbi->s_mount_opt, RESERVATION); if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 21eacd7..60d900f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c
@@ -238,11 +238,11 @@ } int -ext4_check_acl(struct inode *inode, int mask, unsigned int flags) +ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; return -EAGAIN;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index dec8211..9d843d5 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h
@@ -54,7 +54,7 @@ #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_check_acl(struct inode *, int, unsigned int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1921392..fa44df8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h
@@ -1758,7 +1758,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file(struct file *, int); +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); extern int ext4_flush_completed_IO(struct inode *); /* hash.c */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2c09723..ce766f9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c
@@ -236,6 +236,27 @@ } offset += file->f_pos; break; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if (offset >= inode->i_size) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + offset = inode->i_size; + break; } if (offset < 0 || offset > maxbytes) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index ce66d2f..da3bed3 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c
@@ -151,6 +151,32 @@ return ret; } +/** + * __sync_file - generic_file_fsync without the locking and filemap_write + * @inode: inode to sync + * @datasync: only sync essential metadata if true + * + * This is just generic_file_fsync without the locking. This is needed for + * nojournal mode to make sure this inodes data/metadata makes it to disk + * properly. The i_mutex should be held already. + */ +static int __sync_inode(struct inode *inode, int datasync) +{ + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + return ret; +} + /* * akpm: A new design for ext4_sync_file(). * @@ -165,7 +191,7 @@ * i_mutex lock is held when entering and exiting this function */ -int ext4_sync_file(struct file *file, int datasync) +int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); @@ -178,15 +204,20 @@ trace_ext4_sync_file_enter(file, datasync); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + if (inode->i_sb->s_flags & MS_RDONLY) - return 0; + goto out; ret = ext4_flush_completed_IO(inode); if (ret < 0) goto out; if (!journal) { - ret = generic_file_fsync(file, datasync); + ret = __sync_inode(inode, datasync); if (!ret && !list_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; @@ -220,6 +251,7 @@ if (needs_barrier) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); out: + mutex_unlock(&inode->i_mutex); trace_ext4_sync_file_exit(inode, ret); return ret; }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c0..678cde8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c
@@ -3501,10 +3501,8 @@ offset, nr_segs, ext4_get_block, NULL, NULL, 0); else { - ret = blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); @@ -3575,6 +3573,7 @@ ssize_t size, void *private, int ret, bool is_async) { + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; unsigned long flags; @@ -3596,6 +3595,7 @@ out: if (is_async) aio_complete(iocb, ret, 0); + inode_dio_done(inode); return; } @@ -3616,6 +3616,9 @@ /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); iocb->private = NULL; + + /* XXX: probably should move into the real I/O completion handler */ + inode_dio_done(inode); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -3748,11 +3751,13 @@ EXT4_I(inode)->cur_aio_dio = iocb->private; } - ret = blockdev_direct_IO(rw, iocb, inode, + ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block_write, - ext4_end_io_dio); + ext4_end_io_dio, + NULL, + DIO_LOCKING | DIO_SKIP_HOLES); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; /* @@ -5351,6 +5356,8 @@ } if (attr->ia_valid & ATTR_SIZE) { + inode_dio_wait(inode); + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5843,80 +5850,84 @@ struct page *page = vmf->page; loff_t size; unsigned long len; - int ret = -EINVAL; - void *fsdata; + int ret; struct file *file = vma->vm_file; struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; + handle_t *handle; + get_block_t *get_block; + int retries = 0; /* - * Get i_alloc_sem to stop truncates messing with the inode. We cannot - * get i_mutex because we are already holding mmap_sem. + * This check is racy but catches the common case. We rely on + * __block_page_mkwrite() to do a reliable check. */ - down_read(&inode->i_alloc_sem); - size = i_size_read(inode); - if (page->mapping != mapping || size <= page_offset(page) - || !PageUptodate(page)) { - /* page got truncated from under us? */ - goto out_unlock; + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && + !ext4_nonda_switch(inode->i_sb)) { + do { + ret = __block_page_mkwrite(vma, vmf, + ext4_da_get_block_prep); + } while (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + goto out_ret; } - ret = 0; lock_page(page); - wait_on_page_writeback(page); - if (PageMappedToDisk(page)) { - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (page->mapping != mapping || page_offset(page) > size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; } if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - /* - * return if we have all the buffers mapped. This avoid - * the need to call write_begin/write_end which does a - * journal_start/journal_stop which can block and take - * long time + * Return if we have all the buffers mapped. This avoids the need to do + * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; + /* Wait so that we don't change page under IO */ + wait_on_page_writeback(page); + ret = VM_FAULT_LOCKED; + goto out; } } unlock_page(page); - /* - * OK, we need to fill the hole... Do write_begin write_end - * to do block allocation/reservation.We are not holding - * inode.i__mutex here. That allow * parallel write_begin, - * write_end call. lock_page prevent this from happening - * on the same page though - */ - ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), - len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); - if (ret < 0) - goto out_unlock; - ret = mapping->a_ops->write_end(file, mapping, page_offset(page), - len, len, page, fsdata); - if (ret < 0) - goto out_unlock; - ret = 0; - - /* - * write_begin/end might have created a dirty page and someone - * could wander in and start the IO. Make sure that hasn't - * happened. - */ - lock_page(page); - wait_on_page_writeback(page); - up_read(&inode->i_alloc_sem); - return VM_FAULT_LOCKED; -out_unlock: - if (ret) + /* OK, we need to fill the hole... */ + if (ext4_should_dioread_nolock(inode)) + get_block = ext4_get_block_write; + else + get_block = ext4_get_block; +retry_alloc: + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; - up_read(&inode->i_alloc_sem); + goto out; + } + ret = __block_page_mkwrite(vma, vmf, get_block); + if (!ret && ext4_should_journal_data(inode)) { + if (walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_alloc; +out_ret: + ret = block_page_mkwrite_return(ret); +out: return ret; }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b77..707d605 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c
@@ -1037,15 +1037,11 @@ return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { - if (PTR_ERR(inode) == -ESTALE) { - EXT4_ERROR_INODE(dir, - "deleted inode referenced: %u", - ino); - return ERR_PTR(-EIO); - } else { - return ERR_CAST(inode); - } + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + return ERR_PTR(-EIO); } } return d_splice_alias(inode, dentry);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 8276cc2..a5d3853 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h
@@ -109,6 +109,7 @@ int i_attrs; /* unused attribute bits */ loff_t i_pos; /* on-disk position of directory entry or 0 */ struct hlist_node i_fat_hash; /* hash by i_location */ + struct rw_semaphore truncate_lock; /* protect bmap against truncate */ struct inode vfs_inode; }; @@ -309,7 +310,8 @@ extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern int fat_file_fsync(struct file *file, int datasync); +extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c index 7018e1d..c118acf 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c
@@ -149,12 +149,12 @@ return 0; } -int fat_file_fsync(struct file *filp, int datasync) +int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; int res, err; - res = generic_file_fsync(filp, datasync); + res = generic_file_fsync(filp, start, end, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); return res ? res : err; @@ -397,6 +397,8 @@ * sequence. */ if (attr->ia_valid & ATTR_SIZE) { + inode_dio_wait(inode); + if (attr->ia_size > inode->i_size) { error = fat_cont_expand(inode, attr->ia_size); if (error || attr->ia_valid == ATTR_SIZE) @@ -429,8 +431,10 @@ } if (attr->ia_valid & ATTR_SIZE) { + down_write(&MSDOS_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); fat_truncate_blocks(inode, attr->ia_size); + up_write(&MSDOS_I(inode)->truncate_lock); } setattr_copy(inode, attr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index cb8d839..5942fec 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c
@@ -211,8 +211,8 @@ * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, fat_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + fat_get_block); if (ret < 0 && (rw & WRITE)) fat_write_failed(mapping, offset + iov_length(iov, nr_segs)); @@ -224,9 +224,9 @@ sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ - down_read(&mapping->host->i_alloc_sem); + down_read(&MSDOS_I(mapping->host)->truncate_lock); blocknr = generic_block_bmap(mapping, block, fat_get_block); - up_read(&mapping->host->i_alloc_sem); + up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; } @@ -510,6 +510,8 @@ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + init_rwsem(&ei->truncate_lock); return &ei->vfs_inode; }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 3b222da..66e83b8 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c
@@ -209,29 +209,20 @@ int err; lock_super(sb); - err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo); - if (err) { - if (err == -ENOENT) { - inode = NULL; - goto out; - } - goto error; + switch (err) { + case -ENOENT: + inode = NULL; + break; + case 0: + inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); + brelse(sinfo.bh); + break; + default: + inode = ERR_PTR(err); } - - inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); - brelse(sinfo.bh); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto error; - } -out: unlock_super(sb); return d_splice_alias(inode, dentry); - -error: - unlock_super(sb); - return ERR_PTR(err); } /***** Creates a directory entry (name is already formatted). */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 20b4ea5..bb3f29c 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c
@@ -82,10 +82,8 @@ * case sensitive name which is specified by user if this is * for creation. */ - if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) - return 0; - } + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; return vfat_revalidate_shortname(dentry); }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0..b8c507c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c
@@ -461,32 +461,6 @@ } /* - * For background writeback the caller does not have the sb pinned - * before calling writeback. So make sure that we do pin it, so it doesn't - * go away while we are writing inodes from it. - */ -static bool pin_sb_for_writeback(struct super_block *sb) -{ - spin_lock(&sb_lock); - if (list_empty(&sb->s_instances)) { - spin_unlock(&sb_lock); - return false; - } - - sb->s_count++; - spin_unlock(&sb_lock); - - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - return true; - up_read(&sb->s_umount); - } - - put_super(sb); - return false; -} - -/* * Write a portion of b_io inodes which belong to @sb. * * If @only_this_sb is true, then find and write all such @@ -585,7 +559,7 @@ struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!pin_sb_for_writeback(sb)) { + if (!grab_super_passive(sb)) { requeue_io(inode); continue; }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d5016071..9f63e49 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c
@@ -382,7 +382,7 @@ struct fuse_entry_out outentry; struct fuse_file *ff; struct file *file; - int flags = nd->intent.open.flags - 1; + int flags = nd->intent.open.flags; if (fc->no_create) return -ENOSYS; @@ -576,7 +576,7 @@ static int fuse_create(struct inode *dir, struct dentry *entry, int mode, struct nameidata *nd) { - if (nd && (nd->flags & LOOKUP_OPEN)) { + if (nd) { int err = fuse_create_open(dir, entry, mode, nd); if (err != -ENOSYS) return err; @@ -971,9 +971,9 @@ return err; } -static int fuse_perm_getattr(struct inode *inode, int flags) +static int fuse_perm_getattr(struct inode *inode, int mask) { - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; return fuse_do_getattr(inode, NULL, NULL); @@ -992,7 +992,7 @@ * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask, unsigned int flags) +static int fuse_permission(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; @@ -1011,23 +1011,22 @@ if (fi->i_time < get_jiffies_64()) { refreshed = true; - err = fuse_perm_getattr(inode, flags); + err = fuse_perm_getattr(inode, mask); if (err) return err; } } if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { - err = generic_permission(inode, mask, flags, NULL); + err = generic_permission(inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root node will at first have no permissions */ if (err == -EACCES && !refreshed) { - err = fuse_perm_getattr(inode, flags); + err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(inode, mask, - flags, NULL); + err = generic_permission(inode, mask); } /* Note: the opposite of the above test does not @@ -1035,7 +1034,7 @@ noticed immediately, only after the attribute timeout has expired */ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; err = fuse_access(inode, mask); @@ -1044,7 +1043,7 @@ if (refreshed) return -EACCES; - err = fuse_perm_getattr(inode, flags); + err = fuse_perm_getattr(inode, mask); if (!err && !(inode->i_mode & S_IXUGO)) return -EACCES; } @@ -1177,9 +1176,10 @@ return 0; } -static int fuse_dir_fsync(struct file *file, int datasync) +static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { - return fuse_fsync_common(file, datasync, 1); + return fuse_fsync_common(file, start, end, datasync, 1); } static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 82a6646..7bb685c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c
@@ -400,7 +400,8 @@ fuse_release_nowrite(inode); } -int fuse_fsync_common(struct file *file, int datasync, int isdir) +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir) { struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -412,9 +413,15 @@ if (is_bad_inode(inode)) return -EIO; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; + mutex_lock(&inode->i_mutex); + /* * Start writeback against all dirty pages of the inode, then * wait for all outstanding writes, before sending the FSYNC @@ -422,13 +429,15 @@ */ err = write_inode_now(inode, 0); if (err) - return err; + goto out; fuse_sync_writes(inode); req = fuse_get_req(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -448,12 +457,15 @@ fc->no_fsync = 1; err = 0; } +out: + mutex_unlock(&inode->i_mutex); return err; } -static int fuse_fsync(struct file *file, int datasync) +static int fuse_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { - return fuse_fsync_common(file, datasync, 0); + return fuse_fsync_common(file, start, end, datasync, 0); } void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, @@ -1600,15 +1612,32 @@ struct inode *inode = file->f_path.dentry->d_inode; mutex_lock(&inode->i_mutex); - switch (origin) { - case SEEK_END: + if (origin != SEEK_CUR || origin != SEEK_SET) { retval = fuse_update_attributes(inode, NULL, file, NULL); if (retval) goto exit; + } + + switch (origin) { + case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: offset += file->f_pos; + break; + case SEEK_DATA: + if (offset >= i_size_read(inode)) { + retval = -ENXIO; + goto exit; + } + break; + case SEEK_HOLE: + if (offset >= i_size_read(inode)) { + retval = -ENXIO; + goto exit; + } + offset = i_size_read(inode); + break; } retval = -EINVAL; if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b788bec..c6aa2d4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h
@@ -589,7 +589,8 @@ /** * Send FSYNC or FSYNCDIR request */ -int fuse_fsync_common(struct file *file, int datasync, int isdir); +int fuse_fsync_common(struct file *file, loff_t start, loff_t end, + int datasync, int isdir); /** * Notify poll wakeup
diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 8f26d1a..70e90b4 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c
@@ -190,9 +190,9 @@ } int -generic_check_acl(struct inode *inode, int mask, unsigned int flags) +generic_check_acl(struct inode *inode, int mask) { - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; } else {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index cbc0715..8ef1079 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c
@@ -75,12 +75,12 @@ * Returns: errno */ -int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +int gfs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int error; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; return -EAGAIN;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index a93907c..b522b0c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h
@@ -16,7 +16,7 @@ #define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES 25 -extern int gfs2_check_acl(struct inode *inode, int mask, unsigned int); +extern int gfs2_check_acl(struct inode *inode, int mask); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); extern const struct xattr_handler gfs2_xattr_system_handler;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 42e477f..7878c47 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c
@@ -1216,6 +1216,8 @@ if (ret) return ret; + inode_dio_wait(inode); + oldsize = inode->i_size; if (newsize >= oldsize) return do_grow(inode, newsize);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index bc2590e..edeb9e8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c
@@ -245,7 +245,7 @@ !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE, 0); + error = gfs2_permission(inode, MAY_WRITE); if (error) goto out; } @@ -546,7 +546,9 @@ /** * gfs2_fsync - sync the dirty data for a file (across the cluster) - * @file: the file that points to the dentry (we ignore this) + * @file: the file that points to the dentry + * @start: the start position in the file to sync + * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * * The VFS will flush data for us. We only need to worry @@ -555,23 +557,32 @@ * Returns: errno */ -static int gfs2_fsync(struct file *file, int datasync) +static int gfs2_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); struct gfs2_inode *ip = GFS2_I(inode); int ret; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); - if (ret) + if (ret) { + mutex_unlock(&inode->i_mutex); return ret; + } gfs2_ail_flush(ip->i_gl); } + mutex_unlock(&inode->i_mutex); return 0; }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 03e0c52..0fb51a9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c
@@ -307,7 +307,7 @@ } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC, 0); + error = gfs2_permission(dir, MAY_EXEC); if (error) goto out; } @@ -337,7 +337,7 @@ { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -792,13 +792,8 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct inode *inode = NULL; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode && IS_ERR(inode)) - return ERR_CAST(inode); - - if (inode) { + struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode && !IS_ERR(inode)) { struct gfs2_glock *gl = GFS2_I(inode)->i_gl; struct gfs2_holder gh; int error; @@ -808,11 +803,8 @@ return ERR_PTR(error); } gfs2_glock_dq_uninit(&gh); - return d_splice_alias(inode, dentry); } - d_add(dentry, inode); - - return NULL; + return d_splice_alias(inode, dentry); } /** @@ -857,7 +849,7 @@ if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -990,7 +982,7 @@ if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1336,7 +1328,7 @@ } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1371,7 +1363,7 @@ /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); + error = gfs2_permission(odentry->d_inode, MAY_WRITE); if (error) goto out_gunlock; } @@ -1543,7 +1535,7 @@ * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask, unsigned int flags) +int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; @@ -1553,7 +1545,7 @@ ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) @@ -1564,7 +1556,7 @@ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else - error = generic_permission(inode, mask, flags, gfs2_check_acl); + error = generic_permission(inode, mask); if (unlock) gfs2_glock_dq_uninit(&i_gh); @@ -1854,6 +1846,7 @@ .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, + .check_acl = gfs2_check_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1874,6 +1867,7 @@ .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, + .check_acl = gfs2_check_acl, }; const struct inode_operations gfs2_symlink_iops = { @@ -1888,5 +1882,6 @@ .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, + .check_acl = gfs2_check_acl, };
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 3160607..8d90e0c 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h
@@ -108,7 +108,7 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); +extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index fff16c9..96a1b62 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c
@@ -123,8 +123,8 @@ struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, hfs_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -615,6 +615,8 @@ if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; @@ -625,12 +627,18 @@ return 0; } -static int hfs_file_fsync(struct file *filp, int datasync) +static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -647,6 +655,7 @@ err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 81dfd1e..d7674d0 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h
@@ -404,7 +404,8 @@ int hfsplus_cat_write_inode(struct inode *); struct inode *hfsplus_new_inode(struct super_block *, int); void hfsplus_delete_inode(struct inode *); -int hfsplus_file_fsync(struct file *file, int datasync); +int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 010cd36..4cc1e3a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c
@@ -119,8 +119,8 @@ struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, hfsplus_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + hfsplus_get_block); /* * In case of error extending write may have instantiated a few @@ -298,6 +298,8 @@ if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + error = vmtruncate(inode, attr->ia_size); if (error) return error; @@ -308,13 +310,19 @@ return 0; } -int hfsplus_file_fsync(struct file *file, int datasync) +int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); int error = 0, error2; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + mutex_lock(&inode->i_mutex); + /* * Sync inode metadata into the catalog and extent trees. */ @@ -342,6 +350,8 @@ if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); + return error; }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2638c834e..0d22afd 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c
@@ -362,9 +362,20 @@ return 0; } -int hostfs_fsync(struct file *file, int datasync) +int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync); + struct inode *inode = file->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + ret = fsync_file(HOSTFS_I(inode)->fd, datasync); + mutex_unlock(&inode->i_mutex); + + return ret; } static const struct file_operations hostfs_file_fops = { @@ -748,12 +759,12 @@ return err; } -int hostfs_permission(struct inode *ino, int desired, unsigned int flags) +int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; - if (flags & IPERM_FLAG_RCU) + if (desired & MAY_NOT_BLOCK) return -ECHILD; if (desired & MAY_READ) r = 1; @@ -770,7 +781,7 @@ err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired, flags, NULL); + err = generic_permission(ino, desired); return err; }
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index f46ae02..96a8ed9 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c
@@ -29,6 +29,10 @@ struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct super_block *s = i->i_sb; + /* Somebody else will have to figure out what to do here */ + if (whence == SEEK_DATA || whence == SEEK_HOLE) + return -EINVAL; + hpfs_lock(s); /*printk("dir lseek\n");*/
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 89c500e..89d2a58 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c
@@ -18,9 +18,14 @@ return 0; } -int hpfs_file_fsync(struct file *file, int datasync) +int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + int ret; + + ret = filemap_write_and_wait_range(file->f_mapping, start, end); + if (ret) + return ret; return sync_blockdev(inode->i_sb->s_bdev); }
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index dd552f8..331b5e2 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h
@@ -258,7 +258,7 @@ /* file.c */ -int hpfs_file_fsync(struct file *, int); +int hpfs_file_fsync(struct file *, loff_t, loff_t, int); extern const struct file_operations hpfs_file_ops; extern const struct inode_operations hpfs_file_iops; extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index acf95da..2df69e2 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c
@@ -398,7 +398,7 @@ hpfs_unlock(dir->i_sb); return -ENOSPC; } - if (generic_permission(inode, MAY_WRITE, 0, NULL) || + if (generic_permission(inode, MAY_WRITE) || !S_ISREG(inode->i_mode) || get_write_access(inode)) { d_rehash(dentry);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 85c098a..8635be5 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c
@@ -573,9 +573,10 @@ return err; } -static int hppfs_fsync(struct file *file, int datasync) +static int hppfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { - return 0; + return filemap_write_and_wait_range(file->f_mapping, start, end); } static const struct file_operations hppfs_dir_fops = {
diff --git a/fs/inode.c b/fs/inode.c index 43566d1..96c77b8 100644 --- a/fs/inode.c +++ b/fs/inode.c
@@ -33,8 +33,8 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode_lru_lock protects: - * inode_lru, inode->i_lru + * inode->i_sb->s_inode_lru_lock protects: + * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * inode_wb_list_lock protects: @@ -46,7 +46,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode_lru_lock + * inode->i_sb->s_inode_lru_lock * * inode_wb_list_lock * inode->i_lock @@ -64,24 +64,10 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -static LIST_HEAD(inode_lru); -static DEFINE_SPINLOCK(inode_lru_lock); - __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* - * iprune_sem provides exclusion between the icache shrinking and the - * umount path. - * - * We don't actually need it to protect anything in the umount path, - * but only need to cycle through it to make sure any inode that - * prune_icache took off the LRU list has been fully torn down by the - * time we are past evict_inodes. - */ -static DECLARE_RWSEM(iprune_sem); - -/* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ @@ -95,6 +81,7 @@ struct inodes_stat_t inodes_stat; static DEFINE_PER_CPU(unsigned int, nr_inodes); +static DEFINE_PER_CPU(unsigned int, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; @@ -109,7 +96,11 @@ static inline int get_nr_inodes_unused(void) { - return inodes_stat.nr_unused; + int i; + int sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_unused, i); + return sum < 0 ? 0 : sum; } int get_nr_dirty_inodes(void) @@ -127,6 +118,7 @@ void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); + inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_dointvec(table, write, buffer, lenp, ppos); } #endif @@ -176,8 +168,7 @@ mutex_init(&inode->i_mutex); lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); - init_rwsem(&inode->i_alloc_sem); - lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); + atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; @@ -337,22 +328,24 @@ static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode_lru_lock); + spin_lock(&inode->i_sb->s_inode_lru_lock); if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode_lru); - inodes_stat.nr_unused++; + list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); + inode->i_sb->s_nr_inodes_unused++; + this_cpu_inc(nr_unused); } - spin_unlock(&inode_lru_lock); + spin_unlock(&inode->i_sb->s_inode_lru_lock); } static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode_lru_lock); + spin_lock(&inode->i_sb->s_inode_lru_lock); if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); - inodes_stat.nr_unused--; + inode->i_sb->s_nr_inodes_unused--; + this_cpu_dec(nr_unused); } - spin_unlock(&inode_lru_lock); + spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -537,14 +530,6 @@ spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); - - /* - * Cycle through iprune_sem to make sure any inode that prune_icache - * moved off the list before we took the lock has been fully torn - * down. - */ - down_write(&iprune_sem); - up_write(&iprune_sem); } /** @@ -607,8 +592,10 @@ } /* - * Scan `goal' inodes on the unused list for freeable ones. They are moved to a - * temporary list and then are freed outside inode_lru_lock by dispose_list(). + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -622,29 +609,28 @@ * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -static void prune_icache(int nr_to_scan) +void prune_icache_sb(struct super_block *sb, int nr_to_scan) { LIST_HEAD(freeable); int nr_scanned; unsigned long reap = 0; - down_read(&iprune_sem); - spin_lock(&inode_lru_lock); - for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { + spin_lock(&sb->s_inode_lru_lock); + for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { struct inode *inode; - if (list_empty(&inode_lru)) + if (list_empty(&sb->s_inode_lru)) break; - inode = list_entry(inode_lru.prev, struct inode, i_lru); + inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); /* - * we are inverting the inode_lru_lock/inode->i_lock here, + * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, * so use a trylock. If we fail to get the lock, just move the * inode to the back of the list so we don't spin on it. */ if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); continue; } @@ -656,28 +642,29 @@ (inode->i_state & ~I_REFERENCED)) { list_del_init(&inode->i_lru); spin_unlock(&inode->i_lock); - inodes_stat.nr_unused--; + sb->s_nr_inodes_unused--; + this_cpu_dec(nr_unused); continue; } /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lru_lock); + spin_unlock(&sb->s_inode_lru_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); - spin_lock(&inode_lru_lock); + spin_lock(&sb->s_inode_lru_lock); - if (inode != list_entry(inode_lru.next, + if (inode != list_entry(sb->s_inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ /* avoid lock inversions with trylock */ @@ -693,51 +680,18 @@ spin_unlock(&inode->i_lock); list_move(&inode->i_lru, &freeable); - inodes_stat.nr_unused--; + sb->s_nr_inodes_unused--; + this_cpu_dec(nr_unused); } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&inode_lru_lock); + spin_unlock(&sb->s_inode_lru_lock); dispose_list(&freeable); - up_read(&iprune_sem); } -/* - * shrink_icache_memory() will attempt to reclaim some unused inodes. Here, - * "unused" means that no dentries are referring to the inodes: the files are - * not open and the dcache references to those inodes have already been - * reclaimed. - * - * This function is passed the number of inodes to scan, and it returns the - * total number of remaining possibly-reclaimable inodes. - */ -static int shrink_icache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr) { - /* - * Nasty deadlock avoidance. We may hold various FS locks, - * and we don't want to recurse into the FS that called us - * in clear_inode() and friends.. - */ - if (!(gfp_mask & __GFP_FS)) - return -1; - prune_icache(nr); - } - return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure; -} - -static struct shrinker icache_shrinker = { - .shrink = shrink_icache_memory, - .seeks = DEFAULT_SEEKS, -}; - static void __wait_on_freeing_inode(struct inode *inode); /* * Called with the inode lock held. @@ -1331,7 +1285,7 @@ WARN_ON(inode->i_state & I_NEW); - if (op && op->drop_inode) + if (op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); @@ -1617,7 +1571,6 @@ (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_MEM_SPREAD), init_once); - register_shrinker(&icache_shrinker); /* Hash may have been set up in inode_init_early */ if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h index b29c46e..fe327c2 100644 --- a/fs/internal.h +++ b/fs/internal.h
@@ -97,6 +97,7 @@ * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern bool grab_super_passive(struct super_block *sb); extern void __put_super(struct super_block *sb); extern void put_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, @@ -135,3 +136,8 @@ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); + +/* + * dcache.c + */ +extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 0542b6e..f20437c 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c
@@ -254,19 +254,16 @@ char *tmpname; struct iso_directory_record *tmpde; struct inode *inode = filp->f_path.dentry->d_inode; - struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) return -ENOMEM; - mutex_lock(&sbi->s_mutex); tmpde = (struct iso_directory_record *) (tmpname+1024); result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); free_page((unsigned long) tmpname); - mutex_unlock(&sbi->s_mutex); return result; }
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index b3cc858..a5d0367 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c
@@ -863,7 +863,6 @@ sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; sbi->s_overriderockperm = opt.overriderockperm; - mutex_init(&sbi->s_mutex); /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 2882dc0..7d33de8 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h
@@ -55,7 +55,6 @@ gid_t s_gid; uid_t s_uid; struct nls_table *s_nls_iocharset; /* Native language support table */ - struct mutex s_mutex; /* replaces BKL, please remove if possible */ }; #define ISOFS_INVALID_MODE ((mode_t) -1)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 4fb3e80..1e2946f 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c
@@ -168,7 +168,6 @@ int found; unsigned long uninitialized_var(block); unsigned long uninitialized_var(offset); - struct isofs_sb_info *sbi = ISOFS_SB(dir->i_sb); struct inode *inode; struct page *page; @@ -176,21 +175,13 @@ if (!page) return ERR_PTR(-ENOMEM); - mutex_lock(&sbi->s_mutex); found = isofs_find_entry(dir, dentry, &block, &offset, page_address(page), 1024 + page_address(page)); __free_page(page); - inode = NULL; - if (found) { - inode = isofs_iget(dir->i_sb, block, offset); - if (IS_ERR(inode)) { - mutex_unlock(&sbi->s_mutex); - return ERR_CAST(inode); - } - } - mutex_unlock(&sbi->s_mutex); + inode = found ? isofs_iget(dir->i_sb, block, offset) : NULL; + return d_splice_alias(inode, dentry); }
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index f9cd04d..1fbc7de 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c
@@ -678,7 +678,6 @@ init_rock_state(&rs, inode); block = ei->i_iget5_block; - mutex_lock(&sbi->s_mutex); bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; @@ -748,7 +747,6 @@ goto fail; brelse(bh); *rpnt = '\0'; - mutex_unlock(&sbi->s_mutex); SetPageUptodate(page); kunmap(page); unlock_page(page); @@ -765,7 +763,6 @@ printk("symlink spans iso9660 blocks\n"); fail: brelse(bh); - mutex_unlock(&sbi->s_mutex); error: SetPageError(page); kunmap(page);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 828a0e1..3675b3c 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c
@@ -259,12 +259,12 @@ return rc; } -int jffs2_check_acl(struct inode *inode, int mask, unsigned int flags) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 3119f59..5e42de8 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_check_acl(struct inode *, int, unsigned int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 4bca6a2..5f243cd 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c
@@ -102,10 +102,8 @@ mutex_unlock(&dir_f->sem); if (ino) { inode = jffs2_iget(dir_i->i_sb, ino); - if (IS_ERR(inode)) { + if (IS_ERR(inode)) printk(KERN_WARNING "iget() failed for ino #%u\n", ino); - return ERR_CAST(inode); - } } return d_splice_alias(inode, target); @@ -822,7 +820,10 @@ if (victim_f) { /* There was a victim. Kill it off nicely */ - drop_nlink(new_dentry->d_inode); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + clear_nlink(new_dentry->d_inode); + else + drop_nlink(new_dentry->d_inode); /* Don't oops if the victim was a dirent pointing to an inode which didn't exist. */ if (victim_f->inocache) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1c0a08d..3989f7e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c
@@ -27,13 +27,20 @@ struct page **pagep, void **fsdata); static int jffs2_readpage (struct file *filp, struct page *pg); -int jffs2_fsync(struct file *filp, int datasync) +int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + int ret; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); + mutex_unlock(&inode->i_mutex); return 0; }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 65c6c43..9c25283 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; -int jffs2_fsync(struct file *, int); +int jffs2_fsync(struct file *, loff_t, loff_t, int); int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); /* ioctl.c */
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index e5de942..8a0a066 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c
@@ -114,11 +114,11 @@ return rc; } -int jfs_check_acl(struct inode *inode, int mask, unsigned int flags) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2f3f531..7527855 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c
@@ -28,19 +28,26 @@ #include "jfs_acl.h" #include "jfs_debug.h" -int jfs_fsync(struct file *file, int datasync) +int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int rc = 0; + rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (rc) + return rc; + + mutex_lock(&inode->i_mutex); if (!(inode->i_state & I_DIRTY) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); + mutex_unlock(&inode->i_mutex); return rc; } rc |= jfs_commit_inode(inode, 1); + mutex_unlock(&inode->i_mutex); return rc ? -EIO : 0; } @@ -110,6 +117,8 @@ if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + rc = vmtruncate(inode, iattr->ia_size); if (rc) return rc;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1096559..77b69b2 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c
@@ -329,8 +329,8 @@ struct inode *inode = file->f_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, jfs_get_block, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + jfs_get_block); /* * In case of error extending write may have instantiated a few
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index f9285c4..54e0755 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_check_acl(struct inode *, int, unsigned int flags); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_acl_chmod(struct inode *inode);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index ec2fb8b..9271cfe4 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); -extern int jfs_fsync(struct file *, int); +extern int jfs_fsync(struct file *, loff_t, loff_t, int); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index eaaf2b5..03787ef 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c
@@ -1456,34 +1456,23 @@ ino_t inum; struct inode *ip; struct component_name key; - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; int rc; - jfs_info("jfs_lookup: name = %s", name); + jfs_info("jfs_lookup: name = %s", dentry->d_name.name); - if ((name[0] == '.') && (len == 1)) - inum = dip->i_ino; - else if (strcmp(name, "..") == 0) - inum = PARENT(dip); - else { - if ((rc = get_UCSname(&key, dentry))) - return ERR_PTR(rc); - rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); - free_UCSname(&key); - if (rc == -ENOENT) { - d_add(dentry, NULL); - return NULL; - } else if (rc) { - jfs_err("jfs_lookup: dtSearch returned %d", rc); - return ERR_PTR(rc); - } - } - - ip = jfs_iget(dip->i_sb, inum); - if (IS_ERR(ip)) { - jfs_err("jfs_lookup: iget failed on inum %d", (uint) inum); - return ERR_CAST(ip); + if ((rc = get_UCSname(&key, dentry))) + return ERR_PTR(rc); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + free_UCSname(&key); + if (rc == -ENOENT) { + ip = NULL; + } else if (rc) { + jfs_err("jfs_lookup: dtSearch returned %d", rc); + ip = ERR_PTR(rc); + } else { + ip = jfs_iget(dip->i_sb, inum); + if (IS_ERR(ip)) + jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); } return d_splice_alias(ip, dentry); @@ -1597,8 +1586,6 @@ static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd && nd->flags & LOOKUP_RCU) - return -ECHILD; /* * This is not negative dentry. Always valid. * @@ -1624,10 +1611,8 @@ * case sensitive name which is specified by user if this is * for creation. */ - if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { - if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) - return 0; - } + if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; return 1; }
diff --git a/fs/libfs.c b/fs/libfs.c index 275ca474..c18e9a1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c
@@ -16,6 +16,8 @@ #include <asm/uaccess.h> +#include "internal.h" + static inline int simple_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); @@ -246,13 +248,11 @@ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; - dentry = d_alloc(NULL, &d_name); + dentry = __d_alloc(s, &d_name); if (!dentry) { iput(root); goto Enomem; } - dentry->d_sb = s; - dentry->d_parent = dentry; d_instantiate(dentry, root); s->s_root = dentry; s->s_d_op = dops; @@ -328,8 +328,10 @@ if (new_dentry->d_inode) { simple_unlink(new_dir, new_dentry); - if (they_are_dirs) + if (they_are_dirs) { + drop_nlink(new_dentry->d_inode); drop_nlink(old_dir); + } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); @@ -905,21 +907,29 @@ * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ -int generic_file_fsync(struct file *file, int datasync) +int generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY)) - return ret; + goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; + goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; +out: + mutex_unlock(&inode->i_mutex); return ret; } EXPORT_SYMBOL(generic_file_fsync); @@ -956,7 +966,7 @@ /* * No-op implementation of ->fsync for in-memory filesystems. */ -int noop_fsync(struct file *file, int datasync) +int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; }
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 1afae26..b3ff3d8 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c
@@ -371,11 +371,9 @@ page_cache_release(page); inode = logfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) { + if (IS_ERR(inode)) printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", ino, dir->i_ino, index); - return ERR_CAST(inode); - } return d_splice_alias(inode, dentry); }
diff --git a/fs/logfs/file.c b/fs/logfs/file.c index c2ad702..b548c87 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c
@@ -219,11 +219,20 @@ } } -int logfs_fsync(struct file *file, int datasync) +int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct super_block *sb = file->f_mapping->host->i_sb; + struct inode *inode = file->f_mapping->host; + int ret; + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); logfs_write_anchor(sb); + mutex_unlock(&inode->i_mutex); + return 0; }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 57afd4a..f22d108 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops; int logfs_readpage(struct file *file, struct page *page); long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, int datasync); +int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* gc.c */ u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index adcdc0a..e7d23e25 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c
@@ -596,8 +596,7 @@ int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *dir = dentry->d_parent->d_inode; - struct super_block *sb = dir->i_sb; + struct super_block *sb = dentry->d_sb; generic_fillattr(dentry->d_inode, stat); if (INODE_VERSION(dentry->d_inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
diff --git a/fs/namei.c b/fs/namei.c index 14ab8d3..b7fad00 100644 --- a/fs/namei.c +++ b/fs/namei.c
@@ -176,12 +176,12 @@ /* * This does basic POSIX ACL permission checking */ -static int acl_permission_check(struct inode *inode, int mask, unsigned int flags, - int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +static int acl_permission_check(struct inode *inode, int mask) { + int (*check_acl)(struct inode *inode, int mask); unsigned int mode = inode->i_mode; - mask &= MAY_READ | MAY_WRITE | MAY_EXEC; + mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; if (current_user_ns() != inode_userns(inode)) goto other_perms; @@ -189,8 +189,9 @@ if (current_fsuid() == inode->i_uid) mode >>= 6; else { + check_acl = inode->i_op->check_acl; if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { - int error = check_acl(inode, mask, flags); + int error = check_acl(inode, mask); if (error != -EAGAIN) return error; } @@ -203,7 +204,7 @@ /* * If the DACs are ok we don't need any capability check. */ - if ((mask & ~mode) == 0) + if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -212,8 +213,6 @@ * generic_permission - check for access rights on a Posix-like filesystem * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * @flags: IPERM_FLAG_ flags. * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions @@ -224,24 +223,32 @@ * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. */ -int generic_permission(struct inode *inode, int mask, unsigned int flags, - int (*check_acl)(struct inode *inode, int mask, unsigned int flags)) +int generic_permission(struct inode *inode, int mask) { int ret; /* * Do the basic POSIX ACL permission checks. */ - ret = acl_permission_check(inode, mask, flags, check_acl); + ret = acl_permission_check(inode, mask); if (ret != -EACCES) return ret; + if (S_ISDIR(inode->i_mode)) { + /* DACs are overridable for directories */ + if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) + return 0; + if (!(mask & MAY_WRITE)) + if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) + return 0; + return -EACCES; + } /* * Read/write DACs are always overridable. - * Executable DACs are overridable for all directories and - * for non-directories that have least one exec bit set. + * Executable DACs are overridable when there is + * at least one exec bit set. */ - if (!(mask & MAY_EXEC) || execute_ok(inode)) + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) return 0; @@ -249,7 +256,7 @@ * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; - if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (mask == MAY_READ) if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH)) return 0; @@ -288,10 +295,9 @@ } if (inode->i_op->permission) - retval = inode->i_op->permission(inode, mask, 0); + retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, 0, - inode->i_op->check_acl); + retval = generic_permission(inode, mask); if (retval) return retval; @@ -304,69 +310,6 @@ } /** - * file_permission - check for additional access rights to a given file - * @file: file to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * - * Used to check for read/write/execute permissions on an already opened - * file. - * - * Note: - * Do not use this function in new code. All access checks should - * be done using inode_permission(). - */ -int file_permission(struct file *file, int mask) -{ - return inode_permission(file->f_path.dentry->d_inode, mask); -} - -/* - * get_write_access() gets write permission for a file. - * put_write_access() releases this write permission. - * This is used for regular files. - * We cannot support write (and maybe mmap read-write shared) accesses and - * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode - * can have the following values: - * 0: no writers, no VM_DENYWRITE mappings - * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist - * > 0: (i_writecount) users are writing to the file. - * - * Normally we operate on that counter with atomic_{inc,dec} and it's safe - * except for the cases where we don't hold i_writecount yet. Then we need to - * use {get,deny}_write_access() - these functions check the sign and refuse - * to do the change if sign is wrong. Exclusion between them is provided by - * the inode->i_lock spinlock. - */ - -int get_write_access(struct inode * inode) -{ - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) < 0) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_inc(&inode->i_writecount); - spin_unlock(&inode->i_lock); - - return 0; -} - -int deny_write_access(struct file * file) -{ - struct inode *inode = file->f_path.dentry->d_inode; - - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 0) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; - } - atomic_dec(&inode->i_writecount); - spin_unlock(&inode->i_lock); - - return 0; -} - -/** * path_get - get a reference to a path * @path: path to get the reference to * @@ -492,28 +435,6 @@ return dentry->d_op->d_revalidate(dentry, nd); } -static struct dentry * -do_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - int status = d_revalidate(dentry, nd); - if (unlikely(status <= 0)) { - /* - * The dentry failed validation. - * If d_revalidate returned 0 attempt to invalidate - * the dentry otherwise d_revalidate is asking us - * to return a fail status. - */ - if (status < 0) { - dput(dentry); - dentry = ERR_PTR(status); - } else if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } - return dentry; -} - /** * complete_walk - successful completion of path walk * @nd: pointer nameidata @@ -568,40 +489,6 @@ return status; } -/* - * Short-cut version of permission(), for calling on directories - * during pathname resolution. Combines parts of permission() - * and generic_permission(), and tests ONLY for MAY_EXEC permission. - * - * If appropriate, check DAC only. If not appropriate, or - * short-cut DAC fails, then call ->permission() to do more - * complete permission check. - */ -static inline int exec_permission(struct inode *inode, unsigned int flags) -{ - int ret; - struct user_namespace *ns = inode_userns(inode); - - if (inode->i_op->permission) { - ret = inode->i_op->permission(inode, MAY_EXEC, flags); - } else { - ret = acl_permission_check(inode, MAY_EXEC, flags, - inode->i_op->check_acl); - } - if (likely(!ret)) - goto ok; - if (ret == -ECHILD) - return ret; - - if (ns_capable(ns, CAP_DAC_OVERRIDE) || - ns_capable(ns, CAP_DAC_READ_SEARCH)) - goto ok; - - return ret; -ok: - return security_inode_exec_permission(inode, flags); -} - static __always_inline void set_root(struct nameidata *nd) { if (!nd->root.mnt) @@ -776,7 +663,7 @@ /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT * and this is the terminal part of the path. */ - if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_CONTINUE)) + if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) return -EISDIR; /* we actually want to stop here */ /* We want to mount if someone is trying to open/create a file of any @@ -788,7 +675,7 @@ * appended a '/' to the name. */ if (!(flags & LOOKUP_FOLLOW) && - !(flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY | + !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE))) return -EISDIR; @@ -807,7 +694,7 @@ * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_CONTINUE)) + if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } @@ -1134,6 +1021,30 @@ } /* + * We already have a dentry, but require a lookup to be performed on the parent + * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error. + * parent->d_inode->i_mutex must be held. d_lookup must have verified that no + * child exists while under i_mutex. + */ +static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = parent->d_inode; + struct dentry *old; + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(inode))) + return ERR_PTR(-ENOENT); + + old = inode->i_op->lookup(inode, dentry, nd); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; +} + +/* * It's more convoluted than I'd like it to be, but... it's still fairly * small and for now I'd prefer to have fast path as straight as possible. * It _is_ time-critical. @@ -1172,6 +1083,8 @@ goto unlazy; } } + if (unlikely(d_need_lookup(dentry))) + goto unlazy; path->mnt = mnt; path->dentry = dentry; if (unlikely(!__follow_mount_rcu(nd, path, inode))) @@ -1186,6 +1099,10 @@ dentry = __d_lookup(parent, name); } + if (dentry && unlikely(d_need_lookup(dentry))) { + dput(dentry); + dentry = NULL; + } retry: if (unlikely(!dentry)) { struct inode *dir = parent->d_inode; @@ -1202,6 +1119,15 @@ /* known good */ need_reval = 0; status = 1; + } else if (unlikely(d_need_lookup(dentry))) { + dentry = d_inode_lookup(parent, dentry, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; } mutex_unlock(&dir->i_mutex); } @@ -1234,13 +1160,13 @@ static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; if (unlazy_walk(nd, NULL)) return -ECHILD; } - return exec_permission(nd->inode, 0); + return inode_permission(nd->inode, MAY_EXEC); } static inline int handle_dots(struct nameidata *nd, int type) @@ -1354,7 +1280,6 @@ { struct path next; int err; - unsigned int lookup_flags = nd->flags; while (*name=='/') name++; @@ -1368,8 +1293,6 @@ unsigned int c; int type; - nd->flags |= LOOKUP_CONTINUE; - err = may_lookup(nd); if (err) break; @@ -1431,8 +1354,6 @@ /* here ends the main loop */ last_component: - /* Clear LOOKUP_CONTINUE iff it was previously unset */ - nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; nd->last = this; nd->last_type = type; return 0; @@ -1515,7 +1436,7 @@ if (!S_ISDIR(dentry->d_inode->i_mode)) goto fput_fail; - retval = file_permission(file, MAY_EXEC); + retval = inode_permission(dentry->d_inode, MAY_EXEC); if (retval) goto fput_fail; } @@ -1653,16 +1574,22 @@ * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags - * @nd: pointer to nameidata + * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, - struct nameidata *nd) + struct path *path) { - nd->root.dentry = dentry; - nd->root.mnt = mnt; + struct nameidata nd; + int err; + nd.root.dentry = dentry; + nd.root.mnt = mnt; + BUG_ON(flags & LOOKUP_PARENT); /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ - return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); + err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); + if (!err) + *path = nd.path; + return err; } static struct dentry *__lookup_hash(struct qstr *name, @@ -1672,7 +1599,7 @@ struct dentry *dentry; int err; - err = exec_permission(inode, 0); + err = inode_permission(inode, MAY_EXEC); if (err) return ERR_PTR(err); @@ -1683,8 +1610,34 @@ */ dentry = d_lookup(base, name); - if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) - dentry = do_revalidate(dentry, nd); + if (dentry && d_need_lookup(dentry)) { + /* + * __lookup_hash is called with the parent dir's i_mutex already + * held, so we are good to go here. + */ + dentry = d_inode_lookup(base, dentry, nd); + if (IS_ERR(dentry)) + return dentry; + } + + if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) { + int status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + /* + * The dentry failed validation. + * If d_revalidate returned 0 attempt to invalidate + * the dentry otherwise d_revalidate is asking us + * to return a fail status. + */ + if (status < 0) { + dput(dentry); + return ERR_PTR(status); + } else if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + } if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); @@ -2012,27 +1965,10 @@ return error; } -/* - * Note that while the flag value (low two bits) for sys_open means: - * 00 - read-only - * 01 - write-only - * 10 - read-write - * 11 - special - * it is changed into - * 00 - no permissions needed - * 01 - read-permission - * 10 - write-permission - * 11 - read-write - * for the internal routines (ie open_namei()/follow_link() etc) - * This is more logical, and also allows the 00 "no perm needed" - * to be used for symlinks (where the permissions are checked - * later). - * -*/ static inline int open_to_namei_flags(int flag) { - if ((flag+1) & O_ACCMODE) - flag++; + if ((flag & O_ACCMODE) == 3) + flag--; return flag; } @@ -2327,35 +2263,29 @@ return file; } -/** - * lookup_create - lookup a dentry, creating it if it doesn't exist - * @nd: nameidata info - * @is_dir: directory flag - * - * Simple function to lookup and return a dentry and create it - * if it doesn't exist. Is SMP-safe. - * - * Returns with nd->path.dentry->d_inode->i_mutex locked. - */ -struct dentry *lookup_create(struct nameidata *nd, int is_dir) +struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) { struct dentry *dentry = ERR_PTR(-EEXIST); + struct nameidata nd; + int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); + if (error) + return ERR_PTR(error); - mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ - if (nd->last_type != LAST_NORM) - goto fail; - nd->flags &= ~LOOKUP_PARENT; - nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL; - nd->intent.open.flags = O_EXCL; + if (nd.last_type != LAST_NORM) + goto out; + nd.flags &= ~LOOKUP_PARENT; + nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; + nd.intent.open.flags = O_EXCL; /* * Do the final lookup. */ - dentry = lookup_hash(nd); + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_hash(&nd); if (IS_ERR(dentry)) goto fail; @@ -2367,18 +2297,35 @@ * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && nd->last.name[nd->last.len])) { + if (unlikely(!is_dir && nd.last.name[nd.last.len])) { dput(dentry); dentry = ERR_PTR(-ENOENT); + goto fail; } + *path = nd.path; return dentry; eexist: dput(dentry); dentry = ERR_PTR(-EEXIST); fail: + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out: + path_put(&nd.path); return dentry; } -EXPORT_SYMBOL_GPL(lookup_create); +EXPORT_SYMBOL(kern_path_create); + +struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) +{ + char *tmp = getname(pathname); + struct dentry *res; + if (IS_ERR(tmp)) + return ERR_CAST(tmp); + res = kern_path_create(dfd, tmp, path, is_dir); + putname(tmp); + return res; +} +EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { @@ -2428,54 +2375,46 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, unsigned, dev) { - int error; - char *tmp; struct dentry *dentry; - struct nameidata nd; + struct path path; + int error; if (S_ISDIR(mode)) return -EPERM; - error = user_path_parent(dfd, filename, &nd, &tmp); - if (error) - return error; + dentry = user_path_create(dfd, filename, &path, 0); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - dentry = lookup_create(&nd, 0); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_unlock; - } - if (!IS_POSIXACL(nd.path.dentry->d_inode)) + if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_dput; - error = security_path_mknod(&nd.path, dentry, mode, dev); + error = security_path_mknod(&path, dentry, mode, dev); if (error) goto out_drop_write; switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); + error = vfs_create(path.dentry->d_inode,dentry,mode,NULL); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode, + error = vfs_mknod(path.dentry->d_inode,dentry,mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); + error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } out_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); out_dput: dput(dentry); -out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - putname(tmp); + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); return error; } @@ -2508,38 +2447,29 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) { - int error = 0; - char * tmp; struct dentry *dentry; - struct nameidata nd; + struct path path; + int error; - error = user_path_parent(dfd, pathname, &nd, &tmp); - if (error) - goto out_err; - - dentry = lookup_create(&nd, 1); - error = PTR_ERR(dentry); + dentry = user_path_create(dfd, pathname, &path, 1); if (IS_ERR(dentry)) - goto out_unlock; + return PTR_ERR(dentry); - if (!IS_POSIXACL(nd.path.dentry->d_inode)) + if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_dput; - error = security_path_mkdir(&nd.path, dentry, mode); + error = security_path_mkdir(&path, dentry, mode); if (error) goto out_drop_write; - error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); out_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); out_dput: dput(dentry); -out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - putname(tmp); -out_err: + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); return error; } @@ -2799,38 +2729,31 @@ { int error; char *from; - char *to; struct dentry *dentry; - struct nameidata nd; + struct path path; from = getname(oldname); if (IS_ERR(from)) return PTR_ERR(from); - error = user_path_parent(newdfd, newname, &nd, &to); - if (error) - goto out_putname; - - dentry = lookup_create(&nd, 0); + dentry = user_path_create(newdfd, newname, &path, 0); error = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto out_unlock; + goto out_putname; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto out_dput; - error = security_path_symlink(&nd.path, dentry, from); + error = security_path_symlink(&path, dentry, from); if (error) goto out_drop_write; - error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); + error = vfs_symlink(path.dentry->d_inode, dentry, from); out_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); out_dput: dput(dentry); -out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - putname(to); + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); out_putname: putname(from); return error; @@ -2895,11 +2818,9 @@ int, newdfd, const char __user *, newname, int, flags) { struct dentry *new_dentry; - struct nameidata nd; - struct path old_path; + struct path old_path, new_path; int how = 0; int error; - char *to; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; @@ -2921,32 +2842,27 @@ if (error) return error; - error = user_path_parent(newdfd, newname, &nd, &to); - if (error) - goto out; - error = -EXDEV; - if (old_path.mnt != nd.path.mnt) - goto out_release; - new_dentry = lookup_create(&nd, 0); + new_dentry = user_path_create(newdfd, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) - goto out_unlock; - error = mnt_want_write(nd.path.mnt); + goto out; + + error = -EXDEV; + if (old_path.mnt != new_path.mnt) + goto out_dput; + error = mnt_want_write(new_path.mnt); if (error) goto out_dput; - error = security_path_link(old_path.dentry, &nd.path, new_dentry); + error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_drop_write; - error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); out_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(new_path.mnt); out_dput: dput(new_dentry); -out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -out_release: - path_put(&nd.path); - putname(to); + mutex_unlock(&new_path.dentry->d_inode->i_mutex); + path_put(&new_path); out: path_put(&old_path); @@ -3352,11 +3268,9 @@ EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); EXPORT_SYMBOL(vfs_follow_link);
diff --git a/fs/namespace.c b/fs/namespace.c index fe59bd1..cda50fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c
@@ -934,8 +934,8 @@ int res = 0; br_read_lock(vfsmount_lock); - if (p->event != ns->event) { - p->event = ns->event; + if (p->m.poll_event != ns->event) { + p->m.poll_event = ns->event; res = 1; } br_read_unlock(vfsmount_lock);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 0ed65e0..64a3264 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c
@@ -20,9 +20,9 @@ #include "ncp_fs.h" -static int ncp_fsync(struct file *file, int datasync) +static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return 0; + return filemap_write_and_wait_range(file->f_mapping, start, end); } /*
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 8469031..c98b439 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c
@@ -113,19 +113,18 @@ int nfs_cache_register(struct cache_detail *cd) { - struct nameidata nd; struct vfsmount *mnt; + struct path path; int ret; mnt = rpc_get_mount(); if (IS_ERR(mnt)) return PTR_ERR(mnt); - ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &path); if (ret) goto err; - ret = sunrpc_cache_register_pipefs(nd.path.dentry, - cd->name, 0600, cd); - path_put(&nd.path); + ret = sunrpc_cache_register_pipefs(path.dentry, cd->name, 0600, cd); + path_put(&path); if (!ret) return ret; err:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ededdbd..57f578e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c
@@ -56,7 +56,7 @@ static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -static int nfs_fsync_dir(struct file *, int); +static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct page*); @@ -945,15 +945,19 @@ * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. */ -static int nfs_fsync_dir(struct file *filp, int datasync) +static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, + int datasync) { struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); + mutex_lock(&inode->i_mutex); nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC); + mutex_unlock(&inode->i_mutex); return 0; } @@ -997,14 +1001,12 @@ * Return the intent data that applies to this particular path component * * Note that the current set of intents only apply to the very last - * component of the path. - * We check for this using LOOKUP_CONTINUE and LOOKUP_PARENT. + * component of the path and none of them is set before that last + * component. */ static inline unsigned int nfs_lookup_check_intent(struct nameidata *nd, unsigned int mask) { - if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) - return 0; return nd->flags & mask; } @@ -1338,25 +1340,31 @@ return 0; /* Are we trying to write to a read only partition? */ if (__mnt_is_readonly(nd->path.mnt) && - (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + (nd->intent.open.flags & (O_CREAT|O_TRUNC|O_ACCMODE))) return 0; return 1; } -static struct nfs_open_context *nameidata_to_nfs_open_context(struct dentry *dentry, struct nameidata *nd) +static fmode_t flags_to_mode(int flags) { - struct path path = { - .mnt = nd->path.mnt, - .dentry = dentry, - }; + fmode_t res = (__force fmode_t)flags & FMODE_EXEC; + if ((flags & O_ACCMODE) != O_WRONLY) + res |= FMODE_READ; + if ((flags & O_ACCMODE) != O_RDONLY) + res |= FMODE_WRITE; + return res; +} + +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +{ struct nfs_open_context *ctx; struct rpc_cred *cred; - fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + fmode_t fmode = flags_to_mode(open_flags); cred = rpc_lookup_cred(); if (IS_ERR(cred)) return ERR_CAST(cred); - ctx = alloc_nfs_open_context(&path, cred, fmode); + ctx = alloc_nfs_open_context(dentry, cred, fmode); put_rpccred(cred); if (ctx == NULL) return ERR_PTR(-ENOMEM); @@ -1376,13 +1384,13 @@ /* If the open_intent is for execute, we have an extra check to make */ if (ctx->mode & FMODE_EXEC) { - ret = nfs_may_open(ctx->path.dentry->d_inode, + ret = nfs_may_open(ctx->dentry->d_inode, ctx->cred, nd->intent.open.flags); if (ret < 0) goto out; } - filp = lookup_instantiate_filp(nd, ctx->path.dentry, do_open); + filp = lookup_instantiate_filp(nd, ctx->dentry, do_open); if (IS_ERR(filp)) ret = PTR_ERR(filp); else @@ -1420,12 +1428,13 @@ goto out; } - ctx = nameidata_to_nfs_open_context(dentry, nd); + open_flags = nd->intent.open.flags; + + ctx = create_nfs_open_context(dentry, open_flags); res = ERR_CAST(ctx); if (IS_ERR(ctx)) goto out; - open_flags = nd->intent.open.flags; if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; @@ -1463,8 +1472,8 @@ res = d_add_unique(dentry, inode); nfs_unblock_sillyrename(dentry->d_parent); if (res != NULL) { - dput(ctx->path.dentry); - ctx->path.dentry = dget(res); + dput(ctx->dentry); + ctx->dentry = dget(res); dentry = res; } err = nfs_intent_set_file(nd, ctx); @@ -1517,7 +1526,7 @@ /* We can't create new files, or truncate existing ones here */ openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); - ctx = nameidata_to_nfs_open_context(dentry, nd); + ctx = create_nfs_open_context(dentry, openflags); ret = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -1570,7 +1579,7 @@ struct nfs_open_context *ctx = NULL; struct iattr attr; int error; - int open_flags = 0; + int open_flags = O_CREAT|O_EXCL; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1578,27 +1587,27 @@ attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if ((nd->flags & LOOKUP_CREATE) != 0) { + if (nd) open_flags = nd->intent.open.flags; - ctx = nameidata_to_nfs_open_context(dentry, nd); - error = PTR_ERR(ctx); - if (IS_ERR(ctx)) - goto out_err_drop; - } + ctx = create_nfs_open_context(dentry, open_flags); + error = PTR_ERR(ctx); + if (IS_ERR(ctx)) + goto out_err_drop; error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, ctx); if (error != 0) goto out_put_ctx; - if (ctx != NULL) { + if (nd) { error = nfs_intent_set_file(nd, ctx); if (error < 0) goto out_err; + } else { + put_nfs_open_context(ctx); } return 0; out_put_ctx: - if (ctx != NULL) - put_nfs_open_context(ctx); + put_nfs_open_context(ctx); out_err_drop: d_drop(dentry); out_err: @@ -1660,7 +1669,7 @@ { struct iattr attr; int error; - int open_flags = 0; + int open_flags = O_CREAT|O_EXCL; dfprintk(VFS, "NFS: create(%s/%ld), %s\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); @@ -1668,7 +1677,7 @@ attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; - if ((nd->flags & LOOKUP_CREATE) != 0) + if (nd) open_flags = nd->intent.open.flags; error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL); @@ -2259,11 +2268,11 @@ { int mask = 0; - if (openflags & FMODE_READ) + if ((openflags & O_ACCMODE) != O_WRONLY) mask |= MAY_READ; - if (openflags & FMODE_WRITE) + if ((openflags & O_ACCMODE) != O_RDONLY) mask |= MAY_WRITE; - if (openflags & FMODE_EXEC) + if (openflags & __FMODE_EXEC) mask |= MAY_EXEC; return mask; } @@ -2273,12 +2282,12 @@ return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); } -int nfs_permission(struct inode *inode, int mask, unsigned int flags) +int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; int res = 0; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; nfs_inc_stats(inode, NFSIOS_VFSACCESS); @@ -2328,7 +2337,7 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask, flags, NULL); + res = generic_permission(inode, mask); goto out; }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8eea253..b35d25b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c
@@ -284,7 +284,7 @@ loff_t pos) { struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; size_t rsize = NFS_SERVER(inode)->rsize; @@ -715,7 +715,7 @@ loff_t pos, int sync) { struct nfs_open_context *ctx = dreq->ctx; - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; struct rpc_task *task;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2f093ed..28b8c3f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c
@@ -55,7 +55,7 @@ static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, int datasync); +static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); @@ -187,8 +187,11 @@ filp->f_path.dentry->d_name.name, offset, origin); - /* origin == SEEK_END => we must revalidate the cached file length */ - if (origin == SEEK_END) { + /* + * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate + * the cached file length + */ + if (origin != SEEK_SET || origin != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); @@ -305,7 +308,7 @@ * fall back to doing a synchronous write. */ static int -nfs_file_fsync(struct file *file, int datasync) +nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); @@ -313,11 +316,15 @@ int have_error, status; int ret = 0; - dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + nfs_inc_stats(inode, NFSIOS_VFSFSYNC); have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); status = nfs_commit_inode(inode, FLUSH_SYNC); @@ -329,6 +336,7 @@ if (!ret && !datasync) /* application has asked for meta-data sync */ ret = pnfs_layoutcommit_inode(inode, true); + mutex_unlock(&inode->i_mutex); return ret; }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6f4850d..fe12037 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c
@@ -567,7 +567,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); @@ -594,7 +594,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; @@ -620,7 +620,7 @@ return; if (!is_sync) return; - inode = ctx->path.dentry->d_inode; + inode = ctx->dentry->d_inode; if (!list_empty(&NFS_I(inode)->open_files)) return; server = NFS_SERVER(inode); @@ -629,14 +629,14 @@ nfs_revalidate_inode(server, inode); } -struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode) { struct nfs_open_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { - ctx->path = *path; - path_get(&ctx->path); + nfs_sb_active(dentry->d_sb); + ctx->dentry = dget(dentry); ctx->cred = get_rpccred(cred); ctx->state = NULL; ctx->mode = f_mode; @@ -658,7 +658,8 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode = ctx->dentry->d_inode; + struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) @@ -671,7 +672,8 @@ NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) put_rpccred(ctx->cred); - path_put(&ctx->path); + dput(ctx->dentry); + nfs_sb_deactive(sb); kfree(ctx); } @@ -741,7 +743,7 @@ cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); - ctx = alloc_nfs_open_context(&filp->f_path, cred, filp->f_mode); + ctx = alloc_nfs_open_context(filp->f_path.dentry, cred, filp->f_mode); put_rpccred(cred); if (ctx == NULL) return -ENOMEM;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c4a6983..b788f2e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h
@@ -238,7 +238,7 @@ extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); -extern int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); +extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); @@ -341,8 +341,8 @@ extern void nfs4_put_state_owner(struct nfs4_state_owner *); extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); -extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); -extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); +extern void nfs4_close_state(struct nfs4_state *, fmode_t); +extern void nfs4_close_sync(struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); @@ -373,8 +373,8 @@ #else -#define nfs4_close_state(a, b, c) do { } while (0) -#define nfs4_close_sync(a, b, c) do { } while (0) +#define nfs4_close_state(a, b) do { } while (0) +#define nfs4_close_sync(a, b) do { } while (0) #endif /* CONFIG_NFS_V4 */ #endif /* __LINUX_FS_NFS_NFS4_FS.H */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5879b23..26bece8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c
@@ -763,8 +763,8 @@ struct nfs_open_confirmres c_res; struct nfs_fattr f_attr; struct nfs_fattr dir_attr; - struct path path; struct dentry *dir; + struct dentry *dentry; struct nfs4_state_owner *owner; struct nfs4_state *state; struct iattr attrs; @@ -786,12 +786,12 @@ nfs_fattr_init(&p->dir_attr); } -static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, +static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, const struct iattr *attrs, gfp_t gfp_mask) { - struct dentry *parent = dget_parent(path->dentry); + struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *p; @@ -802,8 +802,8 @@ p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); if (p->o_arg.seqid == NULL) goto err_free; - path_get(path); - p->path = *path; + nfs_sb_active(dentry->d_sb); + p->dentry = dget(dentry); p->dir = parent; p->owner = sp; atomic_inc(&sp->so_count); @@ -812,7 +812,7 @@ p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id = sp->so_owner_id.id; - p->o_arg.name = &p->path.dentry->d_name; + p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; @@ -842,13 +842,15 @@ { struct nfs4_opendata *p = container_of(kref, struct nfs4_opendata, kref); + struct super_block *sb = p->dentry->d_sb; nfs_free_seqid(p->o_arg.seqid); if (p->state != NULL) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); dput(p->dir); - path_put(&p->path); + dput(p->dentry); + nfs_sb_deactive(sb); kfree(p); } @@ -1130,7 +1132,7 @@ { struct nfs4_opendata *opendata; - opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL, GFP_NOFS); + opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, NULL, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -1154,7 +1156,7 @@ newstate = nfs4_opendata_to_nfs4_state(opendata); if (IS_ERR(newstate)) return PTR_ERR(newstate); - nfs4_close_state(&opendata->path, newstate, fmode); + nfs4_close_state(newstate, fmode); *res = newstate; return 0; } @@ -1352,7 +1354,7 @@ goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.fmode); + nfs4_close_state(state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } @@ -1497,7 +1499,7 @@ goto out_free; state = nfs4_opendata_to_nfs4_state(data); if (!IS_ERR(state)) - nfs4_close_state(&data->path, state, data->o_arg.fmode); + nfs4_close_state(state, data->o_arg.fmode); out_free: nfs4_opendata_put(data); } @@ -1648,7 +1650,7 @@ return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); if (ret == -ESTALE) - d_drop(ctx->path.dentry); + d_drop(ctx->dentry); nfs4_opendata_put(opendata); return ret; } @@ -1706,7 +1708,7 @@ /* * Returns a referenced nfs4_state */ -static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) +static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; @@ -1723,15 +1725,15 @@ status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; - if (path->dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode); + if (dentry->d_inode != NULL) + nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; - opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr, GFP_KERNEL); + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; - if (path->dentry->d_inode != NULL) - opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); + if (dentry->d_inode != NULL) + opendata->state = nfs4_get_open_state(dentry->d_inode, sp); status = _nfs4_proc_open(opendata); if (status != 0) @@ -1769,14 +1771,14 @@ } -static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) +static struct nfs4_state *nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred) { struct nfs4_exception exception = { }; struct nfs4_state *res; int status; do { - status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res); + status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res); if (status == 0) break; /* NOTE: BAD_SEQID means the server and client disagree about the @@ -1873,7 +1875,6 @@ } struct nfs4_closedata { - struct path path; struct inode *inode; struct nfs4_state *state; struct nfs_closeargs arg; @@ -1888,13 +1889,14 @@ { struct nfs4_closedata *calldata = data; struct nfs4_state_owner *sp = calldata->state->owner; + struct super_block *sb = calldata->state->inode->i_sb; if (calldata->roc) pnfs_roc_release(calldata->state->inode); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - path_put(&calldata->path); + nfs_sb_deactive(sb); kfree(calldata); } @@ -2014,7 +2016,7 @@ * * NOTE: Caller must be holding the sp->so_owner semaphore! */ -int nfs4_do_close(struct path *path, struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) +int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_closedata *calldata; @@ -2050,8 +2052,7 @@ calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; calldata->roc = roc; - path_get(path); - calldata->path = *path; + nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; @@ -2080,7 +2081,7 @@ struct nfs4_state *state; /* Protect against concurrent sillydeletes */ - state = nfs4_do_open(dir, &ctx->path, ctx->mode, open_flags, attr, ctx->cred); + state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr, ctx->cred); if (IS_ERR(state)) return ERR_CAST(state); ctx->state = state; @@ -2092,9 +2093,9 @@ if (ctx->state == NULL) return; if (is_sync) - nfs4_close_sync(&ctx->path, ctx->state, ctx->mode); + nfs4_close_sync(ctx->state, ctx->mode); else - nfs4_close_state(&ctx->path, ctx->state, ctx->mode); + nfs4_close_state(ctx->state, ctx->mode); } static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) @@ -2616,10 +2617,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags, struct nfs_open_context *ctx) { - struct path my_path = { - .dentry = dentry, - }; - struct path *path = &my_path; + struct dentry *de = dentry; struct nfs4_state *state; struct rpc_cred *cred = NULL; fmode_t fmode = 0; @@ -2627,11 +2625,11 @@ if (ctx != NULL) { cred = ctx->cred; - path = &ctx->path; + de = ctx->dentry; fmode = ctx->mode; } sattr->ia_mode &= ~current_umask(); - state = nfs4_do_open(dir, path, fmode, flags, sattr, cred); + state = nfs4_do_open(dir, de, fmode, flags, sattr, cred); d_drop(dentry); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -2642,7 +2640,7 @@ if (ctx != NULL) ctx->state = state; else - nfs4_close_sync(path, state, fmode); + nfs4_close_sync(state, fmode); out: return status; } @@ -4294,7 +4292,7 @@ memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, sizeof(data->lsp->ls_stateid.data)); data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; - renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); + renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); } out: dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e97dd21..7acfe88 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c
@@ -641,7 +641,7 @@ /* * Close the current file. */ -static void __nfs4_close(struct path *path, struct nfs4_state *state, +static void __nfs4_close(struct nfs4_state *state, fmode_t fmode, gfp_t gfp_mask, int wait) { struct nfs4_state_owner *owner = state->owner; @@ -685,18 +685,18 @@ } else { bool roc = pnfs_roc(state->inode); - nfs4_do_close(path, state, gfp_mask, wait, roc); + nfs4_do_close(state, gfp_mask, wait, roc); } } -void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) +void nfs4_close_state(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, fmode, GFP_NOFS, 0); + __nfs4_close(state, fmode, GFP_NOFS, 0); } -void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) +void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) { - __nfs4_close(path, state, fmode, GFP_KERNEL, 1); + __nfs4_close(state, fmode, GFP_KERNEL, 1); } /*
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 0098557..18449f4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c
@@ -114,7 +114,7 @@ if (!nfs_lock_request_dontget(req)) return 0; if (test_bit(PG_MAPPED, &req->wb_flags)) - radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); + radix_tree_tag_set(&NFS_I(req->wb_context->dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } @@ -124,7 +124,7 @@ void nfs_clear_page_tag_locked(struct nfs_page *req) { if (test_bit(PG_MAPPED, &req->wb_flags)) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 20a7f95..a68679f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c
@@ -144,7 +144,7 @@ static void nfs_readpage_release(struct nfs_page *req) { - struct inode *d_inode = req->wb_context->path.dentry->d_inode; + struct inode *d_inode = req->wb_context->dentry->d_inode; if (PageUptodate(req->wb_page)) nfs_readpage_to_fscache(d_inode, req->wb_page, 0); @@ -152,8 +152,8 @@ unlock_page(req->wb_page); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_context->dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -207,7 +207,7 @@ unsigned int count, unsigned int offset, struct pnfs_layout_segment *lseg) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; data->req = req; data->inode = inode;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce40e5c..b961cea 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c
@@ -2773,16 +2773,12 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { - struct nameidata *nd = NULL; struct mnt_namespace *ns_private; struct super_block *s; struct dentry *dentry; + struct path path; int ret; - nd = kmalloc(sizeof(*nd), GFP_KERNEL); - if (nd == NULL) - return ERR_PTR(-ENOMEM); - ns_private = create_mnt_ns(root_mnt); ret = PTR_ERR(ns_private); if (IS_ERR(ns_private)) @@ -2793,7 +2789,7 @@ goto out_put_mnt_ns; ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW, nd); + export_path, LOOKUP_FOLLOW, &path); nfs_referral_loop_unprotect(); put_mnt_ns(ns_private); @@ -2801,12 +2797,11 @@ if (ret != 0) goto out_err; - s = nd->path.mnt->mnt_sb; + s = path.mnt->mnt_sb; atomic_inc(&s->s_active); - dentry = dget(nd->path.dentry); + dentry = dget(path.dentry); - path_put(&nd->path); - kfree(nd); + path_put(&path); down_write(&s->s_umount); return dentry; out_put_mnt_ns: @@ -2814,7 +2809,6 @@ out_mntput: mntput(root_mnt); out_err: - kfree(nd); return ERR_PTR(ret); }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7271680..0857931 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c
@@ -409,7 +409,7 @@ */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); BUG_ON (!NFS_WBACK_BUSY(req)); @@ -438,7 +438,7 @@ static void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); @@ -852,13 +852,13 @@ struct pnfs_layout_segment *lseg, int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = req->wb_context->dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; - data->inode = inode = req->wb_context->path.dentry->d_inode; + data->inode = inode = req->wb_context->dentry->d_inode; data->cred = req->wb_context->cred; data->lseg = get_lseg(lseg); @@ -1053,9 +1053,9 @@ dprintk("NFS: %5u write(%s/%lld %d@%lld)", task->tk_pid, - data->req->wb_context->path.dentry->d_inode->i_sb->s_id, + data->req->wb_context->dentry->d_inode->i_sb->s_id, (long long) - NFS_FILEID(data->req->wb_context->path.dentry->d_inode), + NFS_FILEID(data->req->wb_context->dentry->d_inode), data->req->wb_bytes, (long long)req_offset(data->req)); nfs_writeback_done(task, data); @@ -1148,8 +1148,8 @@ dprintk("NFS: %5u write (%s/%lld %d@%lld)", data->task.tk_pid, - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_context->dentry->d_inode->i_sb->s_id, + (long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); @@ -1347,7 +1347,7 @@ struct pnfs_layout_segment *lseg) { struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; + struct inode *inode = first->wb_context->dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1435,8 +1435,8 @@ nfs_clear_request_commit(req); dprintk("NFS: commit (%s/%lld %d@%lld)", - req->wb_context->path.dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), + req->wb_context->dentry->d_sb->s_id, + (long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (status < 0) {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index ffb59ef..29d77f6 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c
@@ -191,52 +191,42 @@ } static int -nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) +nfsd4_list_rec_dir(recdir_func *f) { const struct cred *original_cred; - struct file *filp; + struct dentry *dir = rec_file->f_path.dentry; LIST_HEAD(names); - struct name_list *entry; - struct dentry *dentry; int status; - if (!rec_file) - return 0; - status = nfs4_save_creds(&original_cred); if (status < 0) return status; - filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, - current_cred()); - status = PTR_ERR(filp); - if (IS_ERR(filp)) - goto out; - status = vfs_readdir(filp, nfsd4_build_namelist, &names); - fput(filp); + status = vfs_llseek(rec_file, 0, SEEK_SET); + if (status < 0) { + nfs4_reset_creds(original_cred); + return status; + } + + status = vfs_readdir(rec_file, nfsd4_build_namelist, &names); mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); while (!list_empty(&names)) { + struct name_list *entry; entry = list_entry(names.next, struct name_list, list); - - dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); - if (IS_ERR(dentry)) { - status = PTR_ERR(dentry); - break; + if (!status) { + struct dentry *dentry; + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + break; + } + status = f(dir, dentry); + dput(dentry); } - status = f(dir, dentry); - dput(dentry); - if (status) - break; list_del(&entry->list); kfree(entry); } mutex_unlock(&dir->d_inode->i_mutex); -out: - while (!list_empty(&names)) { - entry = list_entry(names.next, struct name_list, list); - list_del(&entry->list); - kfree(entry); - } nfs4_reset_creds(original_cred); return status; } @@ -322,7 +312,7 @@ status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; - status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); + status = nfsd4_list_rec_dir(purge_old); if (status == 0) vfs_fsync(rec_file, 0); mnt_drop_write(rec_file->f_path.mnt); @@ -352,7 +342,7 @@ if (!rec_file) return 0; - status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); + status = nfsd4_list_rec_dir(load_recdir); if (status) printk("nfsd4: failed loading clients from recovery" " directory %s\n", rec_file->f_path.dentry->d_name.name);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index d7eeca6..2660152 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@ #include "nilfs.h" #include "segment.h" -int nilfs_sync_file(struct file *file, int datasync) +int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { /* * Called from fsync() system call @@ -40,8 +40,15 @@ struct inode *inode = file->f_mapping->host; int err; - if (!nilfs_inode_dirty(inode)) + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&inode->i_mutex); + + if (!nilfs_inode_dirty(inode)) { + mutex_unlock(&inode->i_mutex); return 0; + } if (datasync) err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, @@ -49,6 +56,7 @@ else err = nilfs_construct_segment(inode->i_sb); + mutex_unlock(&inode->i_mutex); return err; }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b9b45fc..666628b 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c
@@ -259,8 +259,8 @@ return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, nilfs_get_block, NULL); + size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + nilfs_get_block); /* * In case of error extending write may have instantiated a few @@ -778,6 +778,8 @@ if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + err = vmtruncate(inode, iattr->ia_size); if (unlikely(err)) goto out_err; @@ -799,14 +801,14 @@ return err; } -int nilfs_permission(struct inode *inode, int mask, unsigned int flags) +int nilfs_permission(struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask, flags, NULL); + return generic_permission(inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 546849b..a314199 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c
@@ -72,12 +72,7 @@ return ERR_PTR(-ENAMETOOLONG); ino = nilfs_inode_by_name(dir, &dentry->d_name); - inode = NULL; - if (ino) { - inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } + inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL; return d_splice_alias(inode, dentry); }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f02b9ad..255d5e1 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h
@@ -235,7 +235,7 @@ struct page *, struct inode *); /* file.c */ -extern int nilfs_sync_file(struct file *, int); +extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ long nilfs_ioctl(struct file *, unsigned int, unsigned long); @@ -264,7 +264,7 @@ extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); -int nilfs_permission(struct inode *inode, int mask, unsigned int flags); +int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 0f48e7c..99e3610 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c
@@ -1527,13 +1527,20 @@ * this problem for now. We do write the $BITMAP attribute if it is present * which is the important one for a directory so things are not too bad. */ -static int ntfs_dir_fsync(struct file *filp, int datasync) +static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bmp_vi, *vi = filp->f_mapping->host; int err, ret; ntfs_attr na; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = filemap_write_and_wait_range(vi->i_mapping, start, end); + if (err) + return err; + mutex_lock(&vi->i_mutex); + BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. */ na.mft_no = vi->i_ino; @@ -1555,6 +1562,7 @@ else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); + mutex_unlock(&vi->i_mutex); return ret; }
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f4b1057..c587e2d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c
@@ -1832,9 +1832,8 @@ * fails again. */ if (unlikely(NInoTruncateFailed(ni))) { - down_write(&vi->i_alloc_sem); + inode_dio_wait(vi); err = ntfs_truncate(vi); - up_write(&vi->i_alloc_sem); if (err || NInoTruncateFailed(ni)) { if (!err) err = -EIO; @@ -2153,12 +2152,19 @@ * with this inode but since we have no simple way of getting to them we ignore * this problem for now. */ -static int ntfs_file_fsync(struct file *filp, int datasync) +static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *vi = filp->f_mapping->host; int err, ret = 0; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = filemap_write_and_wait_range(vi->i_mapping, start, end); + if (err) + return err; + mutex_lock(&vi->i_mutex); + BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) ret = __ntfs_write_inode(vi, 1); @@ -2176,6 +2182,7 @@ else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); + mutex_unlock(&vi->i_mutex); return ret; }
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index c05d6dc..1371487 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c
@@ -2357,12 +2357,7 @@ * * Returns 0 on success or -errno on error. * - * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for - * writing. The only case in the kernel where ->i_alloc_sem is not held is - * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called - * with the current i_size as the offset. The analogous place in NTFS is in - * fs/ntfs/file.c::ntfs_file_buffered_write() where we call vmtruncate() again - * without holding ->i_alloc_sem. + * Called with ->i_mutex held. */ int ntfs_truncate(struct inode *vi) { @@ -2887,8 +2882,7 @@ * We also abort all changes of user, group, and mode as we do not implement * the NTFS ACLs yet. * - * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also - * called with ->i_alloc_sem held for writing. + * Called with ->i_mutex held. */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) {
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e913ad1..1cee970 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c
@@ -290,14 +290,14 @@ return ret; } -int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) +int ocfs2_check_acl(struct inode *inode, int mask) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; int ret = -EAGAIN; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; osb = OCFS2_SB(inode->i_sb);
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 4fe7c9c..5c5d31f 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ __le32 e_id; }; -extern int ocfs2_check_acl(struct inode *, int, unsigned int); +extern int ocfs2_check_acl(struct inode *, int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ac97bca..c1efe93 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c
@@ -551,9 +551,8 @@ /* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. Like the core uses - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - * truncation on another. + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. */ static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, @@ -568,10 +567,8 @@ /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (ocfs2_iocb_is_sem_locked(iocb)) { - up_read(&inode->i_alloc_sem); + if (ocfs2_iocb_is_sem_locked(iocb)) ocfs2_iocb_clear_sem_locked(iocb); - } ocfs2_iocb_clear_rw_locked(iocb); @@ -580,6 +577,7 @@ if (is_async) aio_complete(iocb, ret, 0); + inode_dio_done(inode); } /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b1e35a3..0fc2bd3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c
@@ -171,7 +171,8 @@ return 0; } -static int ocfs2_sync_file(struct file *file, int datasync) +static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, + int datasync) { int err = 0; journal_t *journal; @@ -184,6 +185,16 @@ file->f_path.dentry->d_name.name, (unsigned long long)datasync); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + /* + * Probably don't need the i_mutex at all in here, just putting it here + * to be consistent with how fsync used to be called, someone more + * familiar with the fs could possibly remove it. + */ + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { /* * We still have to flush drive's caches to get data to the @@ -200,6 +211,7 @@ bail: if (err) mlog_errno(err); + mutex_unlock(&inode->i_mutex); return (err < 0) ? -EIO : 0; } @@ -1142,6 +1154,8 @@ if (status) goto bail_unlock; + inode_dio_wait(inode); + if (i_size_read(inode) > attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, @@ -1279,11 +1293,11 @@ return err; } -int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) +int ocfs2_permission(struct inode *inode, int mask) { int ret; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = ocfs2_inode_lock(inode, NULL, 0); @@ -1293,7 +1307,7 @@ goto out; } - ret = generic_permission(inode, mask, flags, ocfs2_check_acl); + ret = generic_permission(inode, mask); ocfs2_inode_unlock(inode, 0); out: @@ -2236,9 +2250,8 @@ ocfs2_iocb_clear_sem_locked(iocb); relock: - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - down_read(&inode->i_alloc_sem); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2290,7 +2303,6 @@ */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - up_read(&inode->i_alloc_sem); have_alloc_sem = 0; rw_level = -1; @@ -2361,8 +2373,7 @@ /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. (it's the clustered equivalent of - * i_alloc_sem; protects truncate from racing with pending ios). + * it can unlock our rw lock. * Unfortunately there are error cases which call end_io and others * that don't. so we don't have to unlock the rw_lock if either an * async dio is going to do it in the future or an end_io after an @@ -2378,10 +2389,8 @@ ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } mutex_unlock(&inode->i_mutex); @@ -2531,7 +2540,6 @@ * need locks to protect pending reads from racing with truncate. */ if (filp->f_flags & O_DIRECT) { - down_read(&inode->i_alloc_sem); have_alloc_sem = 1; ocfs2_iocb_set_sem_locked(iocb); @@ -2574,10 +2582,9 @@ } bail: - if (have_alloc_sem) { - up_read(&inode->i_alloc_sem); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); @@ -2593,12 +2600,14 @@ .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, + .check_acl = ocfs2_check_acl, }; const struct inode_operations ocfs2_special_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .check_acl = ocfs2_check_acl, }; /*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index f5afbbe..97bf761 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); +int ocfs2_permission(struct inode *inode, int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e5d738c..33889dc 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c
@@ -2498,4 +2498,5 @@ .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, + .check_acl = ocfs2_check_acl, };
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ebfd382..cf78233 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c
@@ -4368,25 +4368,6 @@ return inode_permission(dir, MAY_WRITE | MAY_EXEC); } -/* copied from user_path_parent. */ -static int ocfs2_user_path_parent(const char __user *path, - struct nameidata *nd, char **name) -{ - char *s = getname(path); - int error; - - if (IS_ERR(s)) - return PTR_ERR(s); - - error = kern_path_parent(s, nd); - if (error) - putname(s); - else - *name = s; - - return error; -} - /** * ocfs2_vfs_reflink - Create a reference-counted link * @@ -4460,10 +4441,8 @@ bool preserve) { struct dentry *new_dentry; - struct nameidata nd; - struct path old_path; + struct path old_path, new_path; int error; - char *to = NULL; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; @@ -4474,39 +4453,33 @@ return error; } - error = ocfs2_user_path_parent(newname, &nd, &to); - if (error) { + new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { mlog_errno(error); goto out; } error = -EXDEV; - if (old_path.mnt != nd.path.mnt) - goto out_release; - new_dentry = lookup_create(&nd, 0); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) { + if (old_path.mnt != new_path.mnt) { mlog_errno(error); - goto out_unlock; + goto out_dput; } - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(new_path.mnt); if (error) { mlog_errno(error); goto out_dput; } error = ocfs2_vfs_reflink(old_path.dentry, - nd.path.dentry->d_inode, + new_path.dentry->d_inode, new_dentry, preserve); - mnt_drop_write(nd.path.mnt); + mnt_drop_write(new_path.mnt); out_dput: dput(new_dentry); -out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -out_release: - path_put(&nd.path); - putname(to); + mutex_unlock(&new_path.dentry->d_inode->i_mutex); + path_put(&new_path); out: path_put(&old_path);
diff --git a/fs/open.c b/fs/open.c index b52cf01..739b751 100644 --- a/fs/open.c +++ b/fs/open.c
@@ -793,7 +793,7 @@ return nd->intent.open.file; out_err: release_open_intent(nd); - nd->intent.open.file = (struct file *)dentry; + nd->intent.open.file = ERR_CAST(dentry); goto out; } EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
diff --git a/fs/proc/base.c b/fs/proc/base.c index c47719a..91fb655 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c
@@ -673,7 +673,7 @@ p->m.private = p; p->ns = ns; p->root = root; - p->event = ns->event; + p->m.poll_event = ns->event; return 0; @@ -2167,9 +2167,9 @@ * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) +static int proc_fd_permission(struct inode *inode, int mask) { - int rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode))
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d167de3..1a77dbe 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c
@@ -294,7 +294,7 @@ return ret; } -static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) +static int proc_sys_permission(struct inode *inode, int mask) { /* * sysctl entries that are not writeable, @@ -316,7 +316,7 @@ if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(head->root, table, mask); + error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error;
diff --git a/fs/read_write.c b/fs/read_write.c index 5520f8a..5907b49 100644 --- a/fs/read_write.c +++ b/fs/read_write.c
@@ -64,6 +64,23 @@ return file->f_pos; offset += file->f_pos; break; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if (offset >= inode->i_size) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if (offset >= inode->i_size) + return -ENXIO; + offset = inode->i_size; + break; } if (offset < 0 && !unsigned_offsets(file)) @@ -128,12 +145,13 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) { + struct inode *inode = file->f_path.dentry->d_inode; loff_t retval; - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); + offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { @@ -141,6 +159,26 @@ goto out; } offset += file->f_pos; + break; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as + * long as offset isn't at the end of the file then the + * offset is data. + */ + if (offset >= inode->i_size) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so + * as long as offset isn't i_size or larger, return + * i_size. + */ + if (offset >= inode->i_size) + return -ENXIO; + offset = inode->i_size; + break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { @@ -151,7 +189,7 @@ retval = offset; } out: - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&inode->i_mutex); return retval; } EXPORT_SYMBOL(default_llseek);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 198dabf..133e935 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c
@@ -14,7 +14,8 @@ extern const struct reiserfs_key MIN_KEY; static int reiserfs_readdir(struct file *, void *, filldir_t); -static int reiserfs_dir_fsync(struct file *filp, int datasync); +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); const struct file_operations reiserfs_dir_operations = { .llseek = generic_file_llseek, @@ -27,13 +28,21 @@ #endif }; -static int reiserfs_dir_fsync(struct file *filp, int datasync) +static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *inode = filp->f_mapping->host; int err; + + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); + mutex_unlock(&inode->i_mutex); if (err < 0) return err; return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 91f080c..c7156dc 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c
@@ -140,12 +140,18 @@ * be removed... */ -static int reiserfs_sync_file(struct file *filp, int datasync) +static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *inode = filp->f_mapping->host; int err; int barrier_done; + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + + mutex_lock(&inode->i_mutex); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -153,6 +159,7 @@ reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + mutex_unlock(&inode->i_mutex); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; @@ -312,4 +319,5 @@ .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, + .check_acl = reiserfs_check_acl, };
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 4fd5bb3..2922b90 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c
@@ -3068,9 +3068,8 @@ struct inode *inode = file->f_mapping->host; ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, - reiserfs_get_blocks_direct_io, NULL); + ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, + reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few @@ -3114,6 +3113,9 @@ error = -EFBIG; goto out; } + + inode_dio_wait(inode); + /* fill in hole pointers in the expanding truncate case. */ if (attr->ia_size > inode->i_size) { error = generic_cont_expand_simple(inode, attr->ia_size);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 1186626..551f1b7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c
@@ -1529,6 +1529,7 @@ .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, + .check_acl = reiserfs_check_acl, }; /* @@ -1545,6 +1546,7 @@ .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, + .check_acl = reiserfs_check_acl, }; @@ -1558,5 +1560,5 @@ .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - + .check_acl = reiserfs_check_acl, };
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index aa91089..14363b9 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c
@@ -1643,6 +1643,7 @@ /* Set default values for options: non-aggressive tails, RO on errors */ REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH); /* no preallocation minimum, be smart in reiserfs_file_write instead */ REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d780896..6938d8c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c
@@ -555,11 +555,10 @@ reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - down_write(&dentry->d_inode->i_alloc_sem); + inode_dio_wait(dentry->d_inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_setattr(dentry, &newattrs); - up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); } else update_ctime(inode); @@ -868,12 +867,18 @@ return err; } -static int reiserfs_check_acl(struct inode *inode, int mask, unsigned int flags) +int reiserfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int error = -EAGAIN; /* do regular unix permission checks by default */ - if (flags & IPERM_FLAG_RCU) + /* + * Stat data v1 doesn't support ACLs. + */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EAGAIN; + + if (mask & MAY_NOT_BLOCK) return -ECHILD; acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -952,7 +957,7 @@ return 0; } -int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) +int reiserfs_permission(struct inode *inode, int mask) { /* * We don't do permission checks on the internal objects. @@ -961,15 +966,7 @@ if (IS_PRIVATE(inode)) return 0; -#ifdef CONFIG_REISERFS_FS_XATTR - /* - * Stat data v1 doesn't support ACLs. - */ - if (get_inode_sd_version(inode) != STAT_DATA_V1) - return generic_permission(inode, mask, flags, - reiserfs_check_acl); -#endif - return generic_permission(inode, mask, flags, NULL); + return generic_permission(inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 4bc63ac..0682b38 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c
@@ -220,11 +220,6 @@ blk, off, ino_num); inode = squashfs_iget(dir->i_sb, ino, ino_num); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto failed; - } - goto exit_lookup; } } @@ -232,10 +227,7 @@ exit_lookup: kfree(dire); - if (inode) - return d_splice_alias(inode, dentry); - d_add(dentry, inode); - return ERR_PTR(0); + return d_splice_alias(inode, dentry); data_error: err = -EIO;
diff --git a/fs/super.c b/fs/super.c index ab3d672..7943f04 100644 --- a/fs/super.c +++ b/fs/super.c
@@ -38,6 +38,69 @@ LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +/* + * One thing we have to be careful of with a per-sb shrinker is that we don't + * drop the last active reference to the superblock from within the shrinker. + * If that happens we could trigger unregistering the shrinker from within the + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * take a passive reference to the superblock to avoid this from occurring. + */ +static int prune_super(struct shrinker *shrink, struct shrink_control *sc) +{ + struct super_block *sb; + int fs_objects = 0; + int total_objects; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Deadlock avoidance. We may hold various FS locks, and we don't want + * to recurse into the FS that called us in clear_inode() and friends.. + */ + if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) + return -1; + + if (!grab_super_passive(sb)) + return -1; + + if (sb->s_op && sb->s_op->nr_cached_objects) + fs_objects = sb->s_op->nr_cached_objects(sb); + + total_objects = sb->s_nr_dentry_unused + + sb->s_nr_inodes_unused + fs_objects + 1; + + if (sc->nr_to_scan) { + int dentries; + int inodes; + + /* proportion the scan between the caches */ + dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) / + total_objects; + inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) / + total_objects; + if (fs_objects) + fs_objects = (sc->nr_to_scan * fs_objects) / + total_objects; + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + */ + prune_dcache_sb(sb, dentries); + prune_icache_sb(sb, inodes); + + if (fs_objects && sb->s_op->free_cached_objects) { + sb->s_op->free_cached_objects(sb, fs_objects); + fs_objects = sb->s_op->nr_cached_objects(sb); + } + total_objects = sb->s_nr_dentry_unused + + sb->s_nr_inodes_unused + fs_objects; + } + + total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure; + drop_super(sb); + return total_objects; +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -77,6 +140,8 @@ INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_inode_lru); + spin_lock_init(&s->s_inode_lru_lock); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -114,6 +179,10 @@ s->s_op = &default_op; s->s_time_gran = 1000000000; s->cleancache_poolid = -1; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.shrink = prune_super; + s->s_shrink.batch = 1024; } out: return s; @@ -181,6 +250,10 @@ if (atomic_dec_and_test(&s->s_active)) { cleancache_flush_fs(s); fs->kill_sb(s); + + /* caches are now gone, we can safely kill the shrinker now */ + unregister_shrinker(&s->s_shrink); + /* * We need to call rcu_barrier so all the delayed rcu free * inodes are flushed before we release the fs module. @@ -241,6 +314,39 @@ } /* + * grab_super_passive - acquire a passive reference + * @s: reference we are trying to grab + * + * Tries to acquire a passive reference. This is used in places where we + * cannot take an active reference but we need to ensure that the + * superblock does not go away while we are working on it. It returns + * false if a reference was not gained, and returns true with the s_umount + * lock held in read mode if a reference is gained. On successful return, + * the caller must drop the s_umount lock and the passive reference when + * done. + */ +bool grab_super_passive(struct super_block *sb) +{ + spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + + sb->s_count++; + spin_unlock(&sb_lock); + + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) + return true; + up_read(&sb->s_umount); + } + + put_super(sb); + return false; +} + +/* * Superblock locking. We really ought to get rid of these two. */ void lock_super(struct super_block * sb) @@ -276,7 +382,6 @@ { const struct super_operations *sop = sb->s_op; - if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); @@ -364,6 +469,7 @@ list_add(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); + register_shrinker(&s->s_shrink); return s; } @@ -452,6 +558,42 @@ } /** + * iterate_supers_type - call function for superblocks of given type + * @type: fs type + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers_type(struct file_system_type *type, + void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &type->fs_supers, s_instances) { + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +EXPORT_SYMBOL(iterate_supers_type); + +/** * get_super - get the superblock of a device * @bdev: device to get the superblock for * @@ -657,7 +799,7 @@ static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ static int unnamed_dev_start = 0; /* don't bother trying below it */ -int set_anon_super(struct super_block *s, void *data) +int get_anon_bdev(dev_t *p) { int dev; int error; @@ -684,23 +826,37 @@ spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); - s->s_bdi = &noop_backing_dev_info; + *p = MKDEV(0, dev & MINORMASK); return 0; } +EXPORT_SYMBOL(get_anon_bdev); + +void free_anon_bdev(dev_t dev) +{ + int slot = MINOR(dev); + spin_lock(&unnamed_dev_lock); + ida_remove(&unnamed_dev_ida, slot); + if (slot < unnamed_dev_start) + unnamed_dev_start = slot; + spin_unlock(&unnamed_dev_lock); +} +EXPORT_SYMBOL(free_anon_bdev); + +int set_anon_super(struct super_block *s, void *data) +{ + int error = get_anon_bdev(&s->s_dev); + if (!error) + s->s_bdi = &noop_backing_dev_info; + return error; +} EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { - int slot = MINOR(sb->s_dev); - + dev_t dev = sb->s_dev; generic_shutdown_super(sb); - spin_lock(&unnamed_dev_lock); - ida_remove(&unnamed_dev_ida, slot); - if (slot < unnamed_dev_start) - unnamed_dev_start = slot; - spin_unlock(&unnamed_dev_lock); + free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super);
diff --git a/fs/sync.c b/fs/sync.c index c38ec16..c98a747 100644 --- a/fs/sync.c +++ b/fs/sync.c
@@ -165,28 +165,9 @@ */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { - struct address_space *mapping = file->f_mapping; - int err, ret; - - if (!file->f_op || !file->f_op->fsync) { - ret = -EINVAL; - goto out; - } - - ret = filemap_write_and_wait_range(mapping, start, end); - - /* - * We need to protect against concurrent writers, which could cause - * livelocks in fsync_buffers_list(). - */ - mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, datasync); - if (!ret) - ret = err; - mutex_unlock(&mapping->host->i_mutex); - -out: - return ret; + if (!file->f_op || !file->f_op->fsync) + return -EINVAL; + return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 0a12eb8..e3f091a 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c
@@ -349,11 +349,11 @@ return -ENOENT; } -int sysfs_permission(struct inode *inode, int mask, unsigned int flags) +int sysfs_permission(struct inode *inode, int mask) { struct sysfs_dirent *sd; - if (flags & IPERM_FLAG_RCU) + if (mask & MAY_NOT_BLOCK) return -ECHILD; sd = inode->i_private; @@ -362,5 +362,5 @@ sysfs_refresh_inode(sd, inode); mutex_unlock(&sysfs_mutex); - return generic_permission(inode, mask, flags, NULL); + return generic_permission(inode, mask); }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 2ed2404..845ab3a 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h
@@ -201,7 +201,7 @@ struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); void sysfs_evict_inode(struct inode *inode); int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask, unsigned int flags); +int sysfs_permission(struct inode *inode, int mask); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7cf738a..f9c234b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c
@@ -1304,7 +1304,7 @@ return NULL; } -int ubifs_fsync(struct file *file, int datasync) +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1319,14 +1319,16 @@ */ return 0; - /* - * VFS has already synchronized dirty pages for this inode. Synchronize - * the inode unless this is a 'datasync()' call. - */ + err = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (err) + return err; + mutex_lock(&inode->i_mutex); + + /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) - return err; + goto out; } /* @@ -1334,10 +1336,9 @@ * them. */ err = ubifs_sync_wbufs_by_inode(c, inode); - if (err) - return err; - - return 0; +out: + mutex_unlock(&inode->i_mutex); + return err; } /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 702b792..27f2255 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h
@@ -1729,7 +1729,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ -int ubifs_fsync(struct file *file, int datasync); +int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct dentry *dentry, struct iattr *attr); /* dir.c */
diff --git a/fs/udf/file.c b/fs/udf/file.c index 2a346bb..d8ffa7c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c
@@ -150,7 +150,7 @@ long old_block, new_block; int result = -EINVAL; - if (file_permission(filp, MAY_READ) != 0) { + if (inode_permission(inode, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); result = -EPERM; goto out;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index b57aab9..639d491 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c
@@ -59,8 +59,6 @@ if (ino) inode = ufs_iget(dir->i_sb, ino); unlock_ufs(dir->i_sb); - if (IS_ERR(inode)) - return ERR_CAST(inode); return d_splice_alias(inode, dentry); }
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 115ac69..cac48fe 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -219,7 +219,7 @@ } int -xfs_check_acl(struct inode *inode, int mask, unsigned int flags) +xfs_check_acl(struct inode *inode, int mask) { struct xfs_inode *ip; struct posix_acl *acl; @@ -235,7 +235,7 @@ if (!XFS_IFORK_Q(ip)) return -EAGAIN; - if (flags & IPERM_FLAG_RCU) { + if (mask & MAY_NOT_BLOCK) { if (!negative_cached_acl(inode, ACL_TYPE_ACCESS)) return -ECHILD; return -EAGAIN;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 26384fe..63e971e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1329,6 +1329,9 @@ } else { xfs_finish_ioend_sync(ioend); } + + /* XXX: probably should move into the real I/O completion handler */ + inode_dio_done(ioend->io_inode); } STATIC ssize_t
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 8073f61..cca00f4 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -127,6 +127,8 @@ STATIC int xfs_file_fsync( struct file *file, + loff_t start, + loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -138,6 +140,10 @@ trace_xfs_file_fsync(ip); + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); @@ -875,18 +881,11 @@ /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; - int error, error2; xfs_rw_iunlock(ip, iolock); - error = filemap_write_and_wait_range(mapping, pos, end); + ret = -xfs_file_fsync(file, pos, end, + (file->f_flags & __O_SYNC) ? 0 : 1); xfs_rw_ilock(ip, iolock); - - error2 = -xfs_file_fsync(file, - (file->f_flags & __O_SYNC) ? 0 : 1); - if (error) - ret = error; - else if (error2) - ret = error2; } out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 25fd2cd..9a72dda 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1024,11 +1024,6 @@ { struct xfs_mount *mp = XFS_M(sb); - /* - * Unregister the memory shrinker before we tear down the mount - * structure so we don't have memory reclaim racing with us here. - */ - xfs_inode_shrinker_unregister(mp); xfs_syncd_stop(mp); /* @@ -1411,8 +1406,6 @@ sb->s_time_gran = 1; set_posix_acl_flag(sb); - xfs_inode_shrinker_register(mp); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1439,7 +1432,6 @@ return 0; out_filestream_unmount: - xfs_inode_shrinker_unregister(mp); xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1458,8 +1450,6 @@ out_syncd_stop: xfs_syncd_stop(mp); out_unmount: - xfs_inode_shrinker_unregister(mp); - /* * Blow away any referenced inode in the filestreams cache. * This can and will cause log traffic as inodes go inactive @@ -1483,6 +1473,21 @@ return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } +static int +xfs_fs_nr_cached_objects( + struct super_block *sb) +{ + return xfs_reclaim_inodes_count(XFS_M(sb)); +} + +static void +xfs_fs_free_cached_objects( + struct super_block *sb, + int nr_to_scan) +{ + xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1496,6 +1501,8 @@ .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, }; static struct file_system_type xfs_fs_type = {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 5cc158e..e4c938a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -179,6 +179,8 @@ if (error == EFSCORRUPTED) break; + cond_resched(); + } while (nr_found && !done); if (skipped) { @@ -984,6 +986,8 @@ *nr_to_scan -= XFS_LOOKUP_BATCH; + cond_resched(); + } while (nr_found && !done && *nr_to_scan > 0); if (trylock && !done) @@ -1001,7 +1005,7 @@ * ensure that when we get more reclaimers than AGs we block rather * than spin trying to execute reclaim. */ - if (trylock && skipped && *nr_to_scan > 0) { + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { trylock = 0; goto restart; } @@ -1019,44 +1023,38 @@ } /* - * Inode cache shrinker. + * Scan a certain number of inodes for reclaim. * * When called we make sure that there is a background (fast) inode reclaim in - * progress, while we will throttle the speed of reclaim via doiing synchronous + * progress, while we will throttle the speed of reclaim via doing synchronous * reclaim of inodes. That means if we come across dirty inodes, we wait for * them to be cleaned, which we hope will not be very long due to the * background walker having already kicked the IO off on those dirty inodes. */ -static int -xfs_reclaim_inode_shrink( - struct shrinker *shrink, - struct shrink_control *sc) +void +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) { - struct xfs_mount *mp; - struct xfs_perag *pag; - xfs_agnumber_t ag; - int reclaimable; - int nr_to_scan = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; + /* kick background reclaimer and push the AIL */ + xfs_syncd_queue_reclaim(mp); + xfs_ail_push_all(mp->m_ail); - mp = container_of(shrink, struct xfs_mount, m_inode_shrink); - if (nr_to_scan) { - /* kick background reclaimer and push the AIL */ - xfs_syncd_queue_reclaim(mp); - xfs_ail_push_all(mp->m_ail); + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} - if (!(gfp_mask & __GFP_FS)) - return -1; +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, - &nr_to_scan); - /* terminate if we don't exhaust the scan */ - if (nr_to_scan > 0) - return -1; - } - - reclaimable = 0; - ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { ag = pag->pag_agno + 1; reclaimable += pag->pag_ici_reclaimable; @@ -1065,18 +1063,3 @@ return reclaimable; } -void -xfs_inode_shrinker_register( - struct xfs_mount *mp) -{ - mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; - mp->m_inode_shrink.seeks = DEFAULT_SEEKS; - register_shrinker(&mp->m_inode_shrink); -} - -void -xfs_inode_shrinker_unregister( - struct xfs_mount *mp) -{ - unregister_shrinker(&mp->m_inode_shrink); -}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index e914fd6..941202e 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -35,6 +35,8 @@ void xfs_flush_inodes(struct xfs_inode *ip); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); @@ -46,7 +48,4 @@ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags); -void xfs_inode_shrinker_register(struct xfs_mount *mp); -void xfs_inode_shrinker_unregister(struct xfs_mount *mp); - #endif
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 11dd720..0135e2a 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h
@@ -42,7 +42,7 @@ #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) #ifdef CONFIG_XFS_POSIX_ACL -extern int xfs_check_acl(struct inode *inode, int mask, unsigned int flags); +extern int xfs_check_acl(struct inode *inode, int mask); extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); extern int xfs_acl_chmod(struct inode *inode);
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 69a21e0..8013a45 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h
@@ -8,6 +8,8 @@ #ifndef _LINUX_ANON_INODES_H #define _LINUX_ANON_INODES_H +struct file_operations; + struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags);
diff --git a/include/linux/atomic.h b/include/linux/atomic.h index ee456c7..bc6615d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h
@@ -34,6 +34,32 @@ } #endif +#ifndef atomic_inc_unless_negative +static inline int atomic_inc_unless_negative(atomic_t *p) +{ + int v, v1; + for (v = 0; v >= 0; v = v1) { + v1 = atomic_cmpxchg(p, v, v + 1); + if (likely(v1 == v)) + return 1; + } + return 0; +} +#endif + +#ifndef atomic_dec_unless_positive +static inline int atomic_dec_unless_positive(atomic_t *p) +{ + int v, v1; + for (v = 0; v <= 0; v = v1) { + v1 = atomic_cmpxchg(p, v, v - 1); + if (likely(v1 == v)) + return 1; + } + return 0; +} +#endif + #ifndef CONFIG_ARCH_HAS_ATOMIC_OR static inline void atomic_or(int i, atomic_t *v) {
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 8845613..fd88a39 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h
@@ -111,6 +111,7 @@ extern int search_binary_handler(struct linux_binprm *, struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); extern void setup_new_exec(struct linux_binprm * bprm); +extern void would_dump(struct linux_binprm *, struct file *); extern int suid_dumpable; #define SUID_DUMP_DISABLE 0 /* No setuid dumping */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 19d90a5..3f22d8d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h
@@ -216,6 +216,7 @@ #define DCACHE_MOUNTED 0x10000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x20000 /* handle automount on this dir */ #define DCACHE_MANAGE_TRANSIT 0x40000 /* manage transit from this dirent */ +#define DCACHE_NEED_LOOKUP 0x80000 /* dentry requires i_op->lookup */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) @@ -416,7 +417,12 @@ return dentry->d_flags & DCACHE_MOUNTED; } -extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); +static inline bool d_need_lookup(struct dentry *dentry) +{ + return dentry->d_flags & DCACHE_NEED_LOOKUP; +} + +extern void d_clear_need_lookup(struct dentry *dentry); extern int sysctl_vfs_cache_pressure;
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 5e06acf..0c473fd 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h
@@ -877,7 +877,7 @@ extern void ext3_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext3_sync_file(struct file *, int); +extern int ext3_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext3fs_dirhash(const char *name, int len, struct
diff --git a/include/linux/fb.h b/include/linux/fb.h index 6a82748..1d6836c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h
@@ -1043,7 +1043,8 @@ struct inode *inode, struct file *file); extern void fb_deferred_io_cleanup(struct fb_info *info); -extern int fb_deferred_io_fsync(struct file *file, int datasync); +extern int fb_deferred_io_fsync(struct file *file, loff_t start, + loff_t end, int datasync); static inline bool fb_be_math(struct fb_info *info) {
diff --git a/include/linux/fs.h b/include/linux/fs.h index b5b9792..b224dc4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h
@@ -32,7 +32,9 @@ #define SEEK_SET 0 /* seek relative to beginning of file */ #define SEEK_CUR 1 /* seek relative to current file position */ #define SEEK_END 2 /* seek relative to end of file */ -#define SEEK_MAX SEEK_END +#define SEEK_DATA 3 /* seek to the next data */ +#define SEEK_HOLE 4 /* seek to the next hole */ +#define SEEK_MAX SEEK_HOLE struct fstrim_range { __u64 start; @@ -63,6 +65,7 @@ #define MAY_ACCESS 16 #define MAY_OPEN 32 #define MAY_CHDIR 64 +#define MAY_NOT_BLOCK 128 /* called from RCU mode, don't block */ /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond @@ -392,8 +395,9 @@ #include <linux/semaphore.h> #include <linux/fiemap.h> #include <linux/rculist_bl.h> +#include <linux/shrinker.h> +#include <linux/atomic.h> -#include <asm/atomic.h> #include <asm/byteorder.h> struct export_operations; @@ -777,7 +781,7 @@ struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - struct rw_semaphore i_alloc_sem; + atomic_t i_dio_count; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space *i_mapping; @@ -1396,6 +1400,11 @@ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ + /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ + spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; + struct list_head s_inode_lru; /* unused inode lru */ + int s_nr_inodes_unused; /* # of inodes on lru */ + struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; @@ -1438,8 +1447,14 @@ * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; + + struct shrinker s_shrink; /* per-sb shrinker handle */ }; +/* superblock cache pruning functions */ +extern void prune_icache_sb(struct super_block *sb, int nr_to_scan); +extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan); + extern struct timespec current_fs_time(struct super_block *sb); /* @@ -1490,7 +1505,6 @@ /* * VFS file helper functions. */ -extern int file_permission(struct file *, int); extern void inode_init_owner(struct inode *inode, const struct inode *dir, mode_t mode); /* @@ -1538,11 +1552,6 @@ #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 -/* - * NOTE: - * all file operations except setlease can be called without - * the big kernel lock held in all filesystems. - */ struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); @@ -1558,7 +1567,7 @@ int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); - int (*fsync) (struct file *, int datasync); + int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); @@ -1573,13 +1582,11 @@ loff_t len); }; -#define IPERM_FLAG_RCU 0x0001 - struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int, unsigned int); - int (*check_acl)(struct inode *, int, unsigned int); + int (*permission) (struct inode *, int); + int (*check_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -1645,6 +1652,8 @@ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); + int (*nr_cached_objects)(struct super_block *); + void (*free_cached_objects)(struct super_block *, int); }; /* @@ -1693,6 +1702,10 @@ * set during data writeback, and cleared with a wakeup * on the bit address once it is done. * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). + * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ #define I_DIRTY_SYNC (1 << 0) @@ -1706,6 +1719,8 @@ #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) +#define __I_DIO_WAKEUP 9 +#define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1816,7 +1831,6 @@ struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; - struct lock_class_key i_alloc_sem_key; }; extern struct dentry *mount_ns(struct file_system_type *fs_type, int flags, @@ -1837,6 +1851,8 @@ void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); +int get_anon_bdev(dev_t *); +void free_anon_bdev(dev_t); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), @@ -2188,16 +2204,38 @@ #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, unsigned int, - int (*check_acl)(struct inode *, int, unsigned int)); +extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } -extern int get_write_access(struct inode *); -extern int deny_write_access(struct file *); +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. + */ +static inline int get_write_access(struct inode *inode) +{ + return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; +} +static inline int deny_write_access(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); @@ -2317,7 +2355,8 @@ /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int blkdev_fsync(struct file *filp, int datasync); +extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, @@ -2368,6 +2407,8 @@ }; void dio_end_io(struct bio *bio, int error); +void inode_dio_wait(struct inode *inode); +void inode_dio_done(struct inode *inode); ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, @@ -2375,14 +2416,17 @@ dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, - struct inode *inode, struct block_device *bdev, const struct iovec *iov, - loff_t offset, unsigned long nr_segs, get_block_t get_block, - dio_iodone_t end_io) + struct inode *inode, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, NULL, + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } +#else +static inline void inode_dio_wait(struct inode *inode) +{ +} #endif extern const struct file_operations generic_ro_fops; @@ -2432,6 +2476,8 @@ extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); +extern void iterate_supers_type(struct file_system_type *, + void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); @@ -2444,7 +2490,7 @@ extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -extern int noop_fsync(struct file *, int); +extern int noop_fsync(struct file *, loff_t, loff_t, int); extern int simple_empty(struct dentry *); extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, @@ -2469,7 +2515,7 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); -extern int generic_file_fsync(struct file *, int); +extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64);
diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 0437e37..574bea4 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h
@@ -10,6 +10,6 @@ int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask, unsigned int flags); +int generic_check_acl(struct inode *inode, int mask); #endif /* LINUX_GENERIC_ACL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h index c70a326..8a45ad2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -15,6 +15,7 @@ #include <linux/range.h> #include <linux/pfn.h> #include <linux/bit_spinlock.h> +#include <linux/shrinker.h> struct mempolicy; struct anon_vma; @@ -1121,44 +1122,6 @@ } #endif -/* - * This struct is used to pass information from page reclaim to the shrinkers. - * We consolidate the values for easier extention later. - */ -struct shrink_control { - gfp_t gfp_mask; - - /* How many slab objects shrinker() should scan and try to reclaim */ - unsigned long nr_to_scan; -}; - -/* - * A callback you can register to apply pressure to ageable caches. - * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. - * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. - */ -struct shrinker { - int (*shrink)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ - - /* These are for internal use */ - struct list_head list; - long nr; /* objs pending delete */ -}; -#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -extern void register_shrinker(struct shrinker *); -extern void unregister_shrinker(struct shrinker *); - int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 0b89efc..2930485 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h
@@ -18,7 +18,6 @@ struct seq_file m; /* must be the first element */ struct mnt_namespace *ns; struct path root; - int event; }; struct fs_struct;
diff --git a/include/linux/namei.h b/include/linux/namei.h index eba45ea..76fe2c6 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h
@@ -48,7 +48,6 @@ */ #define LOOKUP_FOLLOW 0x0001 #define LOOKUP_DIRECTORY 0x0002 -#define LOOKUP_CONTINUE 0x0004 #define LOOKUP_PARENT 0x0010 #define LOOKUP_REVAL 0x0020 @@ -75,9 +74,11 @@ extern int kern_path(const char *, unsigned, struct path *); +extern struct dentry *kern_path_create(int, const char *, struct path *, int); +extern struct dentry *user_path_create(int, const char __user *, struct path *, int); extern int kern_path_parent(const char *, struct nameidata *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, - const char *, unsigned int, struct nameidata *); + const char *, unsigned int, struct path *); extern struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, int (*open)(struct inode *, struct file *));
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1b93b9c..8b579be 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h
@@ -85,7 +85,7 @@ struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; - struct path path; + struct dentry *dentry; struct rpc_cred *cred; struct nfs4_state *state; fmode_t mode; @@ -360,7 +360,7 @@ extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int, unsigned int); +extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); @@ -372,7 +372,7 @@ extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); -extern struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred, fmode_t f_mode); +extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred, fmode_t f_mode); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 50d20ab..cc37a55 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h
@@ -68,6 +68,7 @@ void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct fs_struct *); +int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) {
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index 6deef5d..57958c0 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h
@@ -41,10 +41,11 @@ int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); +int reiserfs_permission(struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) +int reiserfs_check_acl(struct inode *inode, int mask); ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); int reiserfs_setxattr(struct dentry *dentry, const char *name, @@ -122,6 +123,7 @@ #define reiserfs_setxattr NULL #define reiserfs_listxattr NULL #define reiserfs_removexattr NULL +#define reiserfs_check_acl NULL static inline void reiserfs_init_xattr_rwsem(struct inode *inode) {
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index a8afe9c..77950df 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h
@@ -124,19 +124,9 @@ */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); -/* - * Take/release a lock when not the owner will release it. - * - * [ This API should be avoided as much as possible - the - * proper abstraction for this case is completions. ] - */ -extern void down_read_non_owner(struct rw_semaphore *sem); -extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nested(sem, subclass) down_write(sem) -# define down_read_non_owner(sem) down_read(sem) -# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
diff --git a/include/linux/security.h b/include/linux/security.h index 8ce59ef..ebd2a53 100644 --- a/include/linux/security.h +++ b/include/linux/security.h
@@ -1456,7 +1456,7 @@ struct inode *new_dir, struct dentry *new_dentry); int (*inode_readlink) (struct dentry *dentry); int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd); - int (*inode_permission) (struct inode *inode, int mask, unsigned flags); + int (*inode_permission) (struct inode *inode, int mask); int (*inode_setattr) (struct dentry *dentry, struct iattr *attr); int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry); int (*inode_setxattr) (struct dentry *dentry, const char *name, @@ -1720,7 +1720,6 @@ int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); -int security_inode_exec_permission(struct inode *inode, unsigned int flags); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); int security_inode_setxattr(struct dentry *dentry, const char *name, @@ -2113,12 +2112,6 @@ return 0; } -static inline int security_inode_exec_permission(struct inode *inode, - unsigned int flags) -{ - return 0; -} - static inline int security_inode_setattr(struct dentry *dentry, struct iattr *attr) {
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 03c0232..be720cd 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h
@@ -23,6 +23,7 @@ u64 version; struct mutex lock; const struct seq_operations *op; + int poll_event; void *private; };
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h new file mode 100644 index 0000000..790651b --- /dev/null +++ b/include/linux/shrinker.h
@@ -0,0 +1,42 @@ +#ifndef _LINUX_SHRINKER_H +#define _LINUX_SHRINKER_H + +/* + * This struct is used to pass information from page reclaim to the shrinkers. + * We consolidate the values for easier extention later. + */ +struct shrink_control { + gfp_t gfp_mask; + + /* How many slab objects shrinker() should scan and try to reclaim */ + unsigned long nr_to_scan; +}; + +/* + * A callback you can register to apply pressure to ageable caches. + * + * 'sc' is passed shrink_control which includes a count 'nr_to_scan' + * and a 'gfpmask'. It should look through the least-recently-used + * 'nr_to_scan' entries and attempt to free them up. It should return + * the number of objects which remain in the cache. If it returns -1, it means + * it cannot do any scanning at this time (eg. there is a risk of deadlock). + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. + * + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is + * querying the cache size, so a fastpath for that case is appropriate. + */ +struct shrinker { + int (*shrink)(struct shrinker *, struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ + long batch; /* reclaim batch size, 0 = default */ + + /* These are for internal use */ + struct list_head list; + long nr; /* objs pending delete */ +}; +#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ +extern void register_shrinker(struct shrinker *); +extern void unregister_shrinker(struct shrinker *); +#endif
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index b2c33bd..36851f7 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h
@@ -179,6 +179,83 @@ TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_shrink_slab_start, + TP_PROTO(struct shrinker *shr, struct shrink_control *sc, + long nr_objects_to_shrink, unsigned long pgs_scanned, + unsigned long lru_pgs, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan), + + TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, + cache_items, delta, total_scan), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, nr_objects_to_shrink) + __field(gfp_t, gfp_flags) + __field(unsigned long, pgs_scanned) + __field(unsigned long, lru_pgs) + __field(unsigned long, cache_items) + __field(unsigned long long, delta) + __field(unsigned long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->nr_objects_to_shrink = nr_objects_to_shrink; + __entry->gfp_flags = sc->gfp_mask; + __entry->pgs_scanned = pgs_scanned; + __entry->lru_pgs = lru_pgs; + __entry->cache_items = cache_items; + __entry->delta = delta; + __entry->total_scan = total_scan; + ), + + TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + __entry->shrink, + __entry->shr, + __entry->nr_objects_to_shrink, + show_gfp_flags(__entry->gfp_flags), + __entry->pgs_scanned, + __entry->lru_pgs, + __entry->cache_items, + __entry->delta, + __entry->total_scan) +); + +TRACE_EVENT(mm_shrink_slab_end, + TP_PROTO(struct shrinker *shr, int shrinker_retval, + long unused_scan_cnt, long new_scan_cnt), + + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, unused_scan) + __field(long, new_scan) + __field(int, retval) + __field(long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->unused_scan = unused_scan_cnt; + __entry->new_scan = new_scan_cnt; + __entry->retval = shrinker_retval; + __entry->total_scan = new_scan_cnt - unused_scan_cnt; + ), + + TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + __entry->shrink, + __entry->shr, + __entry->unused_scan, + __entry->new_scan, + __entry->total_scan, + __entry->retval) +); DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
diff --git a/ipc/shm.c b/ipc/shm.c index ab3385a..27884ad 100644 --- a/ipc/shm.c +++ b/ipc/shm.c
@@ -277,13 +277,13 @@ return 0; } -static int shm_fsync(struct file *file, int datasync) +static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct shm_file_data *sfd = shm_file_data(file); if (!sfd->file->f_op->fsync) return -EINVAL; - return sfd->file->f_op->fsync(sfd->file, datasync); + return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } static unsigned long shm_get_unmapped_area(struct file *file,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2731d11..e1c72c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c
@@ -3542,7 +3542,8 @@ } /* the process need read permission on control file */ - ret = file_permission(cfile, MAY_READ); + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); if (ret < 0) goto fail;
diff --git a/kernel/fork.c b/kernel/fork.c index ca339c5..aeae5b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c
@@ -1585,6 +1585,7 @@ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); + nsproxy_cache_init(); } /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index d6a00f3..9aeab4b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c
@@ -271,10 +271,8 @@ return err; } -static int __init nsproxy_cache_init(void) +int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); return 0; } - -module_init(nsproxy_cache_init);
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b..176e5e5 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c
@@ -117,15 +117,6 @@ EXPORT_SYMBOL(down_read_nested); -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,13 +127,6 @@ EXPORT_SYMBOL(down_write_nested); -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - #endif
diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8..f820e60 100644 --- a/mm/filemap.c +++ b/mm/filemap.c
@@ -78,9 +78,6 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * * inode_wb_list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode)
diff --git a/mm/madvise.c b/mm/madvise.c index 2221491..74bf193 100644 --- a/mm/madvise.c +++ b/mm/madvise.c
@@ -218,7 +218,7 @@ endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + /* vmtruncate_range needs to take i_mutex */ up_read(¤t->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); down_read(¤t->mm->mmap_sem);
diff --git a/mm/rmap.c b/mm/rmap.c index 23295f65..2540a39 100644 --- a/mm/rmap.c +++ b/mm/rmap.c
@@ -21,7 +21,6 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex
diff --git a/mm/swapfile.c b/mm/swapfile.c index ff8dc1a..1b8c339 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ } #ifdef CONFIG_PROC_FS -struct proc_swaps { - struct seq_file seq; - int event; -}; - static unsigned swaps_poll(struct file *file, poll_table *wait) { - struct proc_swaps *s = file->private_data; + struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); - if (s->event != atomic_read(&proc_poll_event)) { - s->event = atomic_read(&proc_poll_event); + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); return POLLIN | POLLRDNORM | POLLERR | POLLPRI; } @@ -1783,24 +1778,16 @@ static int swaps_open(struct inode *inode, struct file *file) { - struct proc_swaps *s; + struct seq_file *seq; int ret; - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); - if (!s) - return -ENOMEM; - - file->private_data = s; - ret = seq_open(file, &swaps_op); - if (ret) { - kfree(s); + if (ret) return ret; - } - s->seq.private = s; - s->event = atomic_read(&proc_poll_event); - return ret; + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; } static const struct file_operations proc_swaps_operations = {
diff --git a/mm/truncate.c b/mm/truncate.c index e13f22e..003c6c6 100644 --- a/mm/truncate.c +++ b/mm/truncate.c
@@ -622,12 +622,11 @@ return -ENOSYS; mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); + inode_dio_wait(inode); unmap_mapping_range(mapping, offset, (end - offset), 1); inode->i_op->truncate_range(inode, offset, end); /* unmap again to remove racily COWed private pages */ unmap_mapping_range(mapping, offset, (end - offset), 1); - up_write(&inode->i_alloc_sem); mutex_unlock(&inode->i_mutex); return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c index d036e59..febbc04 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - total_scan = shrinker->nr; - shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; + while (total_scan >= batch_size) { int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); shrink_ret = do_shrinker_shrink(shrinker, shrink, - this_scan); + batch_size); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } - shrinker->nr += total_scan; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); out:
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7389b7d..c50818f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c
@@ -97,8 +97,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { static uint32_t clntid; - struct nameidata nd; - struct path path; + struct path path, dir; char name[15]; struct qstr q = { .name = name, @@ -113,7 +112,7 @@ path.mnt = rpc_get_mount(); if (IS_ERR(path.mnt)) return PTR_ERR(path.mnt); - error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &nd); + error = vfs_path_lookup(path.mnt->mnt_root, path.mnt, dir_name, 0, &dir); if (error) goto err; @@ -121,7 +120,7 @@ q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; q.hash = full_name_hash(q.name, q.len); - path.dentry = rpc_create_client_dir(nd.path.dentry, &q, clnt); + path.dentry = rpc_create_client_dir(dir.dentry, &q, clnt); if (!IS_ERR(path.dentry)) break; error = PTR_ERR(path.dentry); @@ -132,11 +131,11 @@ goto err_path_put; } } - path_put(&nd.path); + path_put(&dir); clnt->cl_path = path; return 0; err_path_put: - path_put(&nd.path); + path_put(&dir); err: rpc_put_mount(); return error;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0722a25..ec68e1c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c
@@ -808,8 +808,9 @@ struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; + char *sun_path = sunaddr->sun_path; struct dentry *dentry = NULL; - struct nameidata nd; + struct path path; int err; unsigned hash; struct unix_address *addr; @@ -845,48 +846,44 @@ addr->hash = hash ^ sk->sk_type; atomic_set(&addr->refcnt, 1); - if (sunaddr->sun_path[0]) { + if (sun_path[0]) { unsigned int mode; err = 0; /* * Get the parent directory, calculate the hash for last * component. */ - err = kern_path_parent(sunaddr->sun_path, &nd); - if (err) - goto out_mknod_parent; - - dentry = lookup_create(&nd, 0); + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); err = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto out_mknod_unlock; + goto out_mknod_parent; /* * All right, let's create it. */ mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = mnt_want_write(nd.path.mnt); + err = mnt_want_write(path.mnt); if (err) goto out_mknod_dput; - err = security_path_mknod(&nd.path, dentry, mode, 0); + err = security_path_mknod(&path, dentry, mode, 0); if (err) goto out_mknod_drop_write; - err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); out_mknod_drop_write: - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); if (err) goto out_mknod_dput; - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - dput(nd.path.dentry); - nd.path.dentry = dentry; + mutex_unlock(&path.dentry->d_inode->i_mutex); + dput(path.dentry); + path.dentry = dentry; addr->hash = UNIX_HASH_SIZE; } spin_lock(&unix_table_lock); - if (!sunaddr->sun_path[0]) { + if (!sun_path[0]) { err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { @@ -897,8 +894,8 @@ list = &unix_socket_table[addr->hash]; } else { list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->dentry = nd.path.dentry; - u->mnt = nd.path.mnt; + u->dentry = path.dentry; + u->mnt = path.mnt; } err = 0; @@ -915,9 +912,8 @@ out_mknod_dput: dput(dentry); -out_mknod_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); + mutex_unlock(&path.dentry->d_inode->i_mutex); + path_put(&path); out_mknod_parent: if (err == -EEXIST) err = -EADDRINUSE;
diff --git a/security/capability.c b/security/capability.c index bbb5115..2984ea4 100644 --- a/security/capability.c +++ b/security/capability.c
@@ -181,7 +181,7 @@ return 0; } -static int cap_inode_permission(struct inode *inode, int mask, unsigned flags) +static int cap_inode_permission(struct inode *inode, int mask) { return 0; }
diff --git a/security/security.c b/security/security.c index 4ba6d4c..0e4fccf 100644 --- a/security/security.c +++ b/security/security.c
@@ -518,14 +518,7 @@ { if (unlikely(IS_PRIVATE(inode))) return 0; - return security_ops->inode_permission(inode, mask, 0); -} - -int security_inode_exec_permission(struct inode *inode, unsigned int flags) -{ - if (unlikely(IS_PRIVATE(inode))) - return 0; - return security_ops->inode_permission(inode, MAY_EXEC, flags); + return security_ops->inode_permission(inode, mask); } int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
diff --git a/security/selinux/avc.c b/security/selinux/avc.c index d515b21..dca1c22 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c
@@ -527,7 +527,7 @@ * happened a little later. */ if ((a->type == LSM_AUDIT_DATA_INODE) && - (flags & IPERM_FLAG_RCU)) + (flags & MAY_NOT_BLOCK)) return -ECHILD; a->selinux_audit_data.tclass = tclass;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 4225155..9f4c77d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c
@@ -2659,12 +2659,13 @@ return dentry_has_perm(cred, dentry, FILE__READ); } -static int selinux_inode_permission(struct inode *inode, int mask, unsigned flags) +static int selinux_inode_permission(struct inode *inode, int mask) { const struct cred *cred = current_cred(); struct common_audit_data ad; u32 perms; bool from_access; + unsigned flags = mask & MAY_NOT_BLOCK; from_access = mask & MAY_ACCESS; mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 9831a39..f375eb2 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c
@@ -689,9 +689,10 @@ * * Returns 0 if access is permitted, -EACCES otherwise */ -static int smack_inode_permission(struct inode *inode, int mask, unsigned flags) +static int smack_inode_permission(struct inode *inode, int mask) { struct smk_audit_info ad; + int no_block = mask & MAY_NOT_BLOCK; mask &= (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND); /* @@ -701,7 +702,7 @@ return 0; /* May be droppable after audit */ - if (flags & IPERM_FLAG_RCU) + if (no_block) return -ECHILD; smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE); smk_ad_setfield_u_fs_inode(&ad, inode);
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d1e05b0..8d95e91 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c
@@ -103,7 +103,7 @@ if (!buf) break; /* Get better name for socket. */ - if (dentry->d_sb && dentry->d_sb->s_magic == SOCKFS_MAGIC) { + if (dentry->d_sb->s_magic == SOCKFS_MAGIC) { struct inode *inode = dentry->d_inode; struct socket *sock = inode ? SOCKET_I(inode) : NULL; struct sock *sk = sock ? sock->sk : NULL;