blob: 6104676857f5c6567bcbb5a375cb91e90a78956f [file] [log] [blame]
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
static void lock_chunks(struct btrfs_root *root)
{
mutex_lock(&root->fs_info->chunk_mutex);
}
static void unlock_chunks(struct btrfs_root *root)
{
mutex_unlock(&root->fs_info->chunk_mutex);
}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->list);
return fs_devs;
}
/**
* alloc_fs_devices - allocate struct btrfs_fs_devices
* @fsid: a pointer to UUID for this FS. If NULL a new UUID is
* generated.
*
* Return: a pointer to a new &struct btrfs_fs_devices on success;
* ERR_PTR() on error. Returned struct is not linked onto any lists and
* can be destroyed with kfree() right away.
*/
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = __alloc_fs_devices();
if (IS_ERR(fs_devs))
return fs_devs;
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
else
generate_random_uuid(fs_devs->fsid);
return fs_devs;
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
while (!list_empty(&fs_devices->devices)) {
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
rcu_string_free(device->name);
kfree(device);
}
kfree(fs_devices);
}
static void btrfs_kobject_uevent(struct block_device *bdev,
enum kobject_action action)
{
int ret;
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
action,
kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
struct btrfs_fs_devices, list);
list_del(&fs_devices->list);
free_fs_devices(fs_devices);
}
}
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
dev = kzalloc(sizeof(*dev), GFP_NOFS);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
spin_lock_init(&dev->io_lock);
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
return dev;
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
u64 devid, u8 *uuid)
{
struct btrfs_device *dev;
list_for_each_entry(dev, head, dev_list) {
if (dev->devid == devid &&
(!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
return dev;
}
}
return NULL;
}
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
list_for_each_entry(fs_devices, &fs_uuids, list) {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
return NULL;
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
struct buffer_head **bh)
{
int ret;
*bdev = blkdev_get_by_path(device_path, flags, holder);
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
if (flush)
filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
ret = set_blocksize(*bdev, 4096);
if (ret) {
blkdev_put(*bdev, flags);
goto error;
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
if (!*bh) {
ret = -EINVAL;
blkdev_put(*bdev, flags);
goto error;
}
return 0;
error:
*bdev = NULL;
*bh = NULL;
return ret;
}
static void requeue_list(struct btrfs_pending_bios *pending_bios,
struct bio *head, struct bio *tail)
{
struct bio *old_head;
old_head = pending_bios->head;
pending_bios->head = head;
if (pending_bios->tail)
tail->bi_next = old_head;
else
pending_bios->tail = tail;
}
/*
* we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly
* improves the schedulers ability to collect and merge the bios.
*
* But, it also turns into a long list of bios to process and that is sure
* to eventually make the worker thread block. The solution here is to
* make some progress and then put this work struct back at the end of
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
struct btrfs_fs_info *fs_info;
struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
int again = 0;
unsigned long num_run;
unsigned long batch_run = 0;
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
int sync_pending = 0;
struct blk_plug plug;
/*
* this function runs all the bios we've collected for
* a particular device. We don't want to wander off to
* another device without first sending all of these down.
* So, setup a plug here and finish it off before we return
*/
blk_start_plug(&plug);
bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
loop:
spin_lock(&device->io_lock);
loop_lock:
num_run = 0;
/* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
if (!force_reg && device->pending_sync_bios.head) {
pending_bios = &device->pending_sync_bios;
force_reg = 1;
} else {
pending_bios = &device->pending_bios;
force_reg = 0;
}
pending = pending_bios->head;
tail = pending_bios->tail;
WARN_ON(pending && !tail);
/*
* if pending was null this time around, no bios need processing
* at all and we can stop. Otherwise it'll loop back up again
* and do an additional check so no bios are missed.
*
* device->running_pending is used to synchronize with the
* schedule_bio code.
*/
if (device->pending_sync_bios.head == NULL &&
device->pending_bios.head == NULL) {
again = 0;
device->running_pending = 0;
} else {
again = 1;
device->running_pending = 1;
}
pending_bios->head = NULL;
pending_bios->tail = NULL;
spin_unlock(&device->io_lock);
while (pending) {
rmb();
/* we want to work on both lists, but do more bios on the
* sync list than the regular list
*/
if ((num_run > 32 &&
pending_bios != &device->pending_sync_bios &&
device->pending_sync_bios.head) ||
(num_run > 64 && pending_bios == &device->pending_sync_bios &&
device->pending_bios.head)) {
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
goto loop_lock;
}
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
/*
* if we're doing the sync list, record that our
* plug has some sync requests on it
*
* If we're doing the regular list and there are
* sync requests sitting around, unplug before
* we add more
*/
if (pending_bios == &device->pending_sync_bios) {
sync_pending = 1;
} else if (sync_pending) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
if (need_resched())
cond_resched();
/*
* we made progress, there is more work to do and the bdi
* is now congested. Back off and let other work structs
* run instead
*/
if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
ioc = current->io_context;
/*
* the main goal here is that we don't want to
* block if we're going to be able to submit
* more requests without blocking.
*
* This code does two great things, it pokes into
* the elevator code from a filesystem _and_
* it makes assumptions about how batching works.
*/
if (ioc && ioc->nr_batch_requests > 0 &&
time_before(jiffies, ioc->last_waited + HZ/50UL) &&
(last_waited == 0 ||
ioc->last_waited == last_waited)) {
/*
* we want to go through our batch of
* requests and stop. So, we copy out
* the ioc->last_waited time and test
* against it before looping
*/
last_waited = ioc->last_waited;
if (need_resched())
cond_resched();
continue;
}
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
device->running_pending = 1;
spin_unlock(&device->io_lock);
btrfs_queue_work(fs_info->submit_workers,
&device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
if (batch_run % 64 == 0) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
}
cond_resched();
if (again)
goto loop;
spin_lock(&device->io_lock);
if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock;
spin_unlock(&device->io_lock);
done:
blk_finish_plug(&plug);
}
static void pending_bios_fn(struct btrfs_work *work)
{
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, work);
run_scheduled_bios(device);
}
/*
* Add new device to list of registered devices
*
* Returns:
* 1 - first time device is seen
* 0 - device already known
* < 0 - error
*/
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
disk_super->dev_item.uuid);
}
if (!device) {
if (fs_devices->opened)
return -EBUSY;
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
return PTR_ERR(device);
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
kfree(device);
return -ENOMEM;
}
rcu_assign_pointer(device->name, name);
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
if (device->missing) {
fs_devices->missing_devices--;
device->missing = 0;
}
}
if (found_transid > fs_devices->latest_trans) {
fs_devices->latest_devid = devid;
fs_devices->latest_trans = found_transid;
}
*fs_devices_ret = fs_devices;
return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
struct btrfs_device *orig_dev;
fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->latest_devid = orig->latest_devid;
fs_devices->latest_trans = orig->latest_trans;
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid);
if (IS_ERR(device))
goto error;
/*
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name) {
name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
if (!name) {
kfree(device);
goto error;
}
rcu_assign_pointer(device->name, name);
}
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
return fs_devices;
error:
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct block_device *latest_bdev = NULL;
u64 latest_devid = 0;
u64 latest_transid = 0;
mutex_lock(&uuid_mutex);
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
if (!device->is_tgtdev_for_dev_replace &&
(!latest_transid ||
device->generation > latest_transid)) {
latest_devid = device->devid;
latest_transid = device->generation;
latest_bdev = device->bdev;
}
continue;
}
if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
/*
* In the first step, keep the device which has
* the correct fsid and the devid that is used
* for the dev_replace procedure.
* In the second step, the dev_replace state is
* read from the device tree and it is known
* whether the procedure is really active or
* not, which means whether this device is
* used or whether it should be removed.
*/
if (step == 0 || device->is_tgtdev_for_dev_replace) {
continue;
}
}
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
device->writeable = 0;
if (!device->is_tgtdev_for_dev_replace)
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
rcu_string_free(device->name);
kfree(device);
}
if (fs_devices->seed) {
fs_devices = fs_devices->seed;
goto again;
}
fs_devices->latest_bdev = latest_bdev;
fs_devices->latest_devid = latest_devid;
fs_devices->latest_trans = latest_transid;
mutex_unlock(&uuid_mutex);
}
static void __free_device(struct work_struct *work)
{
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, rcu_work);
if (device->bdev)
blkdev_put(device->bdev, device->mode);
rcu_string_free(device->name);
kfree(device);
}
static void free_device(struct rcu_head *head)
{
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
INIT_WORK(&device->rcu_work, __free_device);
schedule_work(&device->rcu_work);
}
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
if (device->can_discard)
fs_devices->num_can_discard--;
if (device->missing)
fs_devices->missing_devices--;
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
/* Safe because we are under uuid_mutex */
if (device->name) {
name = rcu_string_strdup(device->name->str, GFP_NOFS);
BUG_ON(!name); /* -ENOMEM */
rcu_assign_pointer(new_device->name, name);
}
list_replace_rcu(&device->dev_list, &new_device->dev_list);
new_device->fs_devices = device->fs_devices;
call_rcu(&device->rcu, free_device);
}
mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = 0;
return 0;
}
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_fs_devices *seed_devices = NULL;
int ret;
mutex_lock(&uuid_mutex);
ret = __btrfs_close_devices(fs_devices);
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
}
mutex_unlock(&uuid_mutex);
while (seed_devices) {
fs_devices = seed_devices;
seed_devices = fs_devices->seed;
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
/*
* Wait for rcu kworkers under __btrfs_close_devices
* to finish all blkdev_puts so device is really
* free when umount is done.
*/
rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
struct request_queue *q;
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct block_device *latest_bdev = NULL;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 latest_devid = 0;
u64 latest_transid = 0;
u64 devid;
int seeding = 1;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, head, dev_list) {
if (device->bdev)
continue;
if (!device->name)
continue;
/* Just open everything we can; ignore failures here */
if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
&bdev, &bh))
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
goto error_brelse;
if (memcmp(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE))
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
if (!latest_transid || device->generation > latest_transid) {
latest_devid = devid;
latest_transid = device->generation;
latest_bdev = bdev;
}
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
device->writeable = 0;
} else {
device->writeable = !bdev_read_only(bdev);
seeding = 0;
}
q = bdev_get_queue(bdev);
if (blk_queue_discard(q)) {
device->can_discard = 1;
fs_devices->num_can_discard++;
}
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = 1;
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
}
brelse(bh);
continue;
error_brelse:
brelse(bh);
blkdev_put(bdev, flags);
continue;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_bdev;
fs_devices->latest_devid = latest_devid;
fs_devices->latest_trans = latest_transid;
fs_devices->total_rw_bytes = 0;
out:
return ret;
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
int ret;
mutex_lock(&uuid_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
} else {
ret = __btrfs_open_devices(fs_devices, flags, holder);
}
mutex_unlock(&uuid_mutex);
return ret;
}
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
* is read via pagecache
*/
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
struct block_device *bdev;
struct page *page;
void *p;
int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
u64 bytenr;
pgoff_t index;
/*
* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error;
}
/* make sure our super fits in the device */
if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
goto error_bdev_put;
/* make sure our super fits in the page */
if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
goto error_bdev_put;
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_CACHE_SHIFT;
if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
goto error_bdev_put;
/* pull in the page with our super */
page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
index, GFP_NOFS);
if (IS_ERR_OR_NULL(page))
goto error_bdev_put;
p = kmap(page);
/* align our pointer to the offset of the super block */
disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
if (btrfs_super_bytenr(disk_super) != bytenr ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC)
goto error_unmap;
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (ret > 0) {
if (disk_super->label[0]) {
if (disk_super->label[BTRFS_LABEL_SIZE - 1])
disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
} else {
printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
}
printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
ret = 0;
}
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
error_unmap:
kunmap(page);
page_cache_release(page);
error_bdev_put:
blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
return ret;
}
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 extent_end;
int ret;
int slot;
struct extent_buffer *l;
*length = 0;
if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (key.offset <= start && extent_end > end) {
*length = end - start + 1;
break;
} else if (key.offset <= start && extent_end > start)
*length += extent_end - start;
else if (key.offset > start && extent_end <= end)
*length += extent_end - key.offset;
else if (key.offset > start && key.offset <= end) {
*length += end - key.offset + 1;
break;
} else if (key.offset > end)
break;
next:
path->slots[0]++;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
static int contains_pending_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 *start, u64 len)
{
struct extent_map *em;
int ret = 0;
list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
struct map_lookup *map;
int i;
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev != device)
continue;
if (map->stripes[i].physical >= *start + len ||
map->stripes[i].physical + em->orig_block_len <=
*start)
continue;
*start = map->stripes[i].physical +
em->orig_block_len;
ret = 1;
}
}
return ret;
}
/*
* find_free_dev_extent - find free space in the specified device
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
* @len: the size of the free space. that we find, or the size of the max
* free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
* of the max free space.
*
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
max_hole_start = search_start;
max_hole_size = 0;
hole_size = 0;
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
goto out;
}
path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
hole_size = key.offset - search_start;
/*
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
if (contains_pending_extent(trans, device,
&search_start,
hole_size))
hole_size = 0;
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/*
* If this free space is greater than which we need,
* it must be the max free space that we have found
* until now, so max_hole_start must point to the start
* of this free space and the length of this free space
* is stored in max_hole_size. Thus, we return
* max_hole_start and max_hole_size and go back to the
* caller.
*/
if (hole_size >= num_bytes) {
ret = 0;
goto out;
}
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (extent_end > search_start)
search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
/*
* At this point, search_start should be the end of
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
if (search_end > search_start)
hole_size = search_end - search_start;
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
if (contains_pending_extent(trans, device, &search_start, hole_size)) {
btrfs_release_path(path);
goto again;
}
/* See above. */
if (hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
out:
btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->dev_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
struct btrfs_dev_extent *extent = NULL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
key = found_key;
btrfs_release_path(path);
goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
btrfs_error(root->fs_info, ret, "Slot search failed");
goto out;
}
if (device->bytes_used > 0) {
u64 len = btrfs_dev_extent_length(leaf, extent);
device->bytes_used -= len;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += len;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
}
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
WARN_ON(!device->in_fs_metadata);
WARN_ON(device->is_tgtdev_for_dev_replace);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret)
goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
}
read_unlock(&em_tree->lock);
return ret;
}
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
u64 *devid_ret)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(fs_info->chunk_root, path,
BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
if (ret) {
*devid_ret = 1;
} else {
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
/*
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
static int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*dev_item));
if (ret)
goto out;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_generation(leaf, dev_item, 0);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
btrfs_set_device_start_offset(leaf, dev_item, 0);
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
*/
static void update_dev_time(char *path_name)
{
struct file *filp;
filp = filp_open(path_name, O_RDWR, 0);
if (!filp)
return;
file_update_time(filp);
filp_close(filp, NULL);
return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_trans_handle *trans;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
lock_chunks(root);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret)
goto out;
out:
btrfs_free_path(path);
unlock_chunks(root);
btrfs_commit_transaction(trans, root);
return ret;
}
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
struct btrfs_device *device;
struct btrfs_device *next_device;
struct block_device *bdev;
struct buffer_head *bh = NULL;
struct btrfs_super_block *disk_super;
struct btrfs_fs_devices *cur_devices;
u64 all_avail;
u64 devid;
u64 num_devices;
u8 *dev_uuid;
unsigned seq;
int ret = 0;
bool clear_super = false;
mutex_lock(&uuid_mutex);
do {
seq = read_seqbegin(&root->fs_info->profiles_lock);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
root->fs_info->avail_metadata_alloc_bits;
} while (read_seqretry(&root->fs_info->profiles_lock, seq));
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
root->fs_info->fs_devices->rw_devices <= 2) {
ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
root->fs_info->fs_devices->rw_devices <= 3) {
ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
device = NULL;
devices = &root->fs_info->fs_devices->devices;
/*
* It is safe to read the devices since the volume_mutex
* is held.
*/
list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata &&
!tmp->is_tgtdev_for_dev_replace &&
!tmp->bdev) {
device = tmp;
break;
}
}
bdev = NULL;
bh = NULL;
disk_super = NULL;
if (!device) {
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
ret = btrfs_get_bdev_and_sb(device_path,
FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder, 0,
&bdev, &bh);
if (ret)
goto out;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
if (!device) {
ret = -ENOENT;
goto error_brelse;
}
}
if (device->is_tgtdev_for_dev_replace) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto error_brelse;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
if (device->writeable) {
lock_chunks(root);
list_del_init(&device->dev_alloc_list);
unlock_chunks(root);
root->fs_info->fs_devices->rw_devices--;
clear_super = true;
}
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
/*
* TODO: the superblock still includes this device in its num_devices
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
goto error_undo;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space = device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root->fs_info, device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
* the device supers. Whoever is writing all supers, should
* lock the device list mutex before getting the number of
* devices in the super block (super_copy). Conversely,
* whoever updates the number of devices in the super block
* (super_copy) should hold the device list mutex.
*/
cur_devices = device->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
device->fs_devices->num_devices--;
device->fs_devices->total_devices--;
if (device->missing)
root->fs_info->fs_devices->missing_devices--;
next_device = list_entry(root->fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
if (device->bdev == root->fs_info->sb->s_bdev)
root->fs_info->sb->s_bdev = next_device->bdev;
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev)
device->fs_devices->open_devices--;
/* remove sysfs entry */
btrfs_kobj_rm_device(root->fs_info, device);
call_rcu(&device->rcu, free_device);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
break;
}
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
lock_chunks(root);
__btrfs_close_devices(cur_devices);
unlock_chunks(root);
free_fs_devices(cur_devices);
}
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
/*
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
if (clear_super && disk_super) {
u64 bytenr;
int i;
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
/* clear the mirror copies of super block on the disk
* being removed, 0th copy is been taken care above and
* the below would take of the rest
*/
for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
i_size_read(bdev->bd_inode))
break;
brelse(bh);
bh = __bread(bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh)
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
if (btrfs_super_bytenr(disk_super) != bytenr ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
continue;
}
memset(&disk_super->magic, 0,
sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
}
}
ret = 0;
if (bdev) {
/* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
update_dev_time(device_path);
}
error_brelse:
brelse(bh);
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
if (device->writeable) {
lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
unlock_chunks(root);
root->fs_info->fs_devices->rw_devices++;
}
goto error_brelse;
}
void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
list_del_rcu(&srcdev->dev_list);
list_del_rcu(&srcdev->dev_alloc_list);
fs_info->fs_devices->num_devices--;
if (srcdev->missing) {
fs_info->fs_devices->missing_devices--;
fs_info->fs_devices->rw_devices++;
}
if (srcdev->can_discard)
fs_info->fs_devices->num_can_discard--;
if (srcdev->bdev) {
fs_info->fs_devices->open_devices--;
/* zero out the old super */
btrfs_scratch_superblock(srcdev);
}
call_rcu(&srcdev->rcu, free_device);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
struct btrfs_device *next_device;
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
if (tgtdev->can_discard)
fs_info->fs_devices->num_can_discard++;
next_device = list_entry(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
if (tgtdev->bdev == fs_info->sb->s_bdev)
fs_info->sb->s_bdev = next_device->bdev;
if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
fs_info->fs_devices->latest_bdev = next_device->bdev;
list_del_rcu(&tgtdev->dev_list);
call_rcu(&tgtdev->rcu, free_device);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
struct btrfs_device **device)
{
int ret = 0;
struct btrfs_super_block *disk_super;
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
*device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
root->fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
return ret;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
brelse(bh);
if (!*device)
ret = -ENOENT;
blkdev_put(bdev, FMODE_READ);
return ret;
}
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device)
{
*device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
devices = &root->fs_info->fs_devices->devices;
/*
* It is safe to read the devices since the volume_mutex
* is held by the caller.
*/
list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata && !tmp->bdev) {
*device = tmp;
break;
}
}
if (!*device) {
btrfs_err(root->fs_info, "no missing device found");
return -ENOENT;
}
return 0;
} else {
return btrfs_find_device_by_path(root, device_path, device);
}
}
/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
BUG_ON(!mutex_is_locked(&uuid_mutex));
if (!fs_devices->seeding)
return -EINVAL;
seed_devices = __alloc_fs_devices();
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
return PTR_ERR(old_devices);
}
list_add(&old_devices->list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
list_for_each_entry(device, &seed_devices->devices, dev_list) {
device->fs_devices = seed_devices;
}
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
return 0;
}
/*
* strore the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
struct btrfs_key key;
u8 fs_uuid[BTRFS_UUID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
u64 devid;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
root = root->fs_info->chunk_root;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto error;
leaf = path->nodes[0];
next_slot:
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret > 0)
break;
if (ret < 0)
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
key.type != BTRFS_DEV_ITEM_KEY)
break;
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
device = btrfs_find_device(root->fs_info, devid, dev_uuid,
fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
btrfs_mark_buffer_dirty(leaf);
}
path->slots[0]++;
goto next_slot;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
struct rcu_string *name;
u64 total_bytes;
int seeding_dev = 0;
int ret = 0;
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (root->fs_info->fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &root->fs_info->fs_devices->devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
goto error;
}
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
device = btrfs_alloc_device(root->fs_info, NULL, NULL);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_NOFS);
if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
rcu_string_free(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
}
lock_chunks(root);
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
device->writeable = 1;
device->generation = trans->transid;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 0;
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
ret = btrfs_prepare_sprout(root);
BUG_ON(ret); /* -ENOMEM */
}
device->fs_devices = root->fs_info->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
root->fs_info->fs_devices->num_devices++;
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
root->fs_info->fs_devices->total_devices++;
if (device->can_discard)
root->fs_info->fs_devices->num_can_discard++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&root->fs_info->free_chunk_lock);
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
total_bytes + device->total_bytes);
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
/* add sysfs device entry */
btrfs_kobj_add_device(root->fs_info, device);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
/* Sprouting would change fsid of the mounted root,
* so rename the fsid on the sysfs
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
}
/*
* we've got more storage, clear any full flags on the space
* infos
*/
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
if (ret) /* transaction commit */
return ret;
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
btrfs_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans, root);
}
/* Update ctime/mtime for libblkid */
update_dev_time(device_path);
return ret;
error_trans:
unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
return ret;
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
struct btrfs_device **device_out)
{
struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
if (fs_info->fs_devices->seeding)
return -EINVAL;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
goto error;
}
}
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_NOFS);
if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
device->writeable = 1;
device->generation = 0;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 1;
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
device->fs_devices = fs_info->fs_devices;
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
if (device->can_discard)
fs_info->fs_devices->num_can_discard++;
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
*device_out = device;
return ret;
error:
blkdev_put(bdev, FMODE_EXCL);
return ret;
}
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
WARN_ON(fs_info->fs_devices->rw_devices == 0);
tgtdev->io_width = fs_info->dev_root->sectorsize;
tgtdev->io_align = fs_info->dev_root->sectorsize;
tgtdev->sector_size = fs_info->dev_root->sectorsize;
tgtdev->dev_root = fs_info->dev_root;
tgtdev->in_fs_metadata = 1;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
root = device->dev_root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 diff = new_size - device->total_bytes;
if (!device->writeable)
return -EACCES;
if (new_size <= device->total_bytes ||
device->is_tgtdev_for_dev_replace)
return -EINVAL;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
device->total_bytes = new_size;
device->disk_total_bytes = new_size;
btrfs_clear_space_info_full(device->dev_root->fs_info);
return btrfs_update_device(trans, device);
}
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
int ret;
lock_chunks(device->dev_root);
ret = __btrfs_grow_device(trans, device, new_size);
unlock_chunks(device->dev_root);
return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset)
{
int ret;
struct btrfs_path *path;
struct btrfs_key key;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = chunk_objectid;
key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
btrfs_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
btrfs_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur;
struct btrfs_key key;
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
cur = 0;
while (cur < array_size) {
disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
len = sizeof(*disk_key);
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)(ptr + len);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
len += btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
if (key.objectid == chunk_objectid &&
key.offset == chunk_offset) {
memmove(ptr, ptr + len, array_size - (cur + len));
array_size -= len;
btrfs_set_super_sys_array_size(super_copy, array_size);
} else {
ptr += len;
cur += len;
}
}
return ret;
}
static int btrfs_relocate_chunk(struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset)
{
struct extent_map_tree *em_tree;
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
struct extent_map *em;
struct map_lookup *map;
int ret;
int i;
root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
if (ret)
return ret;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_std_error(root->fs_info, ret);
return ret;
}
lock_chunks(root);
/*
* step two, delete the device extents and the
* chunk tree entries
*/
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
BUG_ON(!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset);
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
map->stripes[i].physical);
BUG_ON(ret);
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
BUG_ON(ret);
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
BUG_ON(ret);
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
BUG_ON(ret);
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for the tree */
free_extent_map(em);
/* once for us */
free_extent_map(em);
unlock_chunks(root);
btrfs_end_transaction(trans, root);
return 0;
}
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
struct btrfs_root *chunk_root = root->fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
bool retried = false;
int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret < 0)
goto error;
if (ret > 0)
break;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
failed++;
else if (ret)
BUG();
}
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
ret = 0;
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
btrfs_free_path(path);
return ret;
}
static int insert_balance_item(struct btrfs_root *root,
struct btrfs_balance_control *bctl)
{
struct btrfs_trans_handle *trans;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*item));
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
btrfs_set_balance_data(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
btrfs_set_balance_flags(leaf, item, bctl->flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
return ret;
}
static int del_balance_item(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
return ret;
}
/*
* This is a heuristic used to reduce the number of chunks balanced on
* resume after balance was interrupted.
*/
static void update_balance_args(struct btrfs_balance_control *bctl)
{
/*
* Turn on soft mode for chunk types that were being converted.
*/
if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
/*
* Turn on usage filter if is not already used. The idea is
* that chunks that we have already balanced should be
* reasonably full. Don't do it for chunks that are being
* converted - that will keep us from relocating unconverted
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
}
}
/*
* Should be called with both balance and volume mutexes held to
* serialize other volume operations (add_dev/rm_dev/resize) with
* restriper. Same goes for unset_balance_control.
*/
static void set_balance_control(struct btrfs_balance_control *bctl)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
}
static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
BUG_ON(!fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
}
/*
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
static int chunk_profiles_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
return 0;
return 1;
}
static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
if (bargs->usage == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
else
user_thresh = div_factor_fine(cache->key.offset,
bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_devid_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
int i;
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
return 0;
}
return 1;
}
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
factor = num_stripes / 2;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
factor = num_stripes - 1;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
factor = num_stripes - 2;
} else {
factor = num_stripes;
}
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
continue;
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
do_div(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
return 0;
}
return 1;
}
/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
return 0;
return 1;
}
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return 0;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
return 1;
return 0;
}
static int should_balance_chunk(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
return 0;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
bargs = &bctl->data;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
bargs = &bctl->sys;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
bargs = &bctl->meta;
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
return 0;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
return 0;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
return 0;
}
/*
* limited by count, must be the last filter
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
return 0;
else
bargs->limit--;
}
return 1;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_root *dev_root = fs_info->dev_root;
struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_trans_handle *trans;
struct extent_buffer *leaf;
int slot;
int ret;
int enospc_errors = 0;
bool counting = true;
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = device->total_bytes;
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
device->total_bytes - device->bytes_used > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
if (ret == -ENOSPC)
break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 0);
BUG_ON(IS_ERR(trans));
ret = btrfs_grow_device(trans, device, old_size);
BUG_ON(ret);
btrfs_end_transaction(trans, dev_root);
}
/* step two, relocate all the chunks */
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto error;
}
/* zero out stat counters */
spin_lock(&fs_info->balance_lock);
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;