drbd: RCU for disk_conf

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index b8ea480..ea62838 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -384,7 +384,8 @@
 	rcu_read_lock();
 	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
 		if (get_ldev_if_state(mdev, D_CONSISTENT)) {
-			fp = max_t(enum drbd_fencing_p, fp, mdev->ldev->dc.fencing);
+			fp = max_t(enum drbd_fencing_p, fp,
+				   rcu_dereference(mdev->ldev->disk_conf)->fencing);
 			put_ldev(mdev);
 		}
 	}
@@ -678,7 +679,12 @@
 				       struct drbd_backing_dev *bdev)
 {
 	sector_t md_size_sect = 0;
-	switch (bdev->dc.meta_dev_idx) {
+	int meta_dev_idx;
+
+	rcu_read_lock();
+	meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+
+	switch (meta_dev_idx) {
 	default:
 		/* v07 style fixed size indexed meta data */
 		bdev->md.md_size_sect = MD_RESERVED_SECT;
@@ -713,6 +719,7 @@
 		bdev->md.bm_offset   = -md_size_sect + MD_AL_OFFSET;
 		break;
 	}
+	rcu_read_unlock();
 }
 
 /* input size is expected to be in KB */
@@ -803,7 +810,9 @@
 	/* TODO: should only be some assert here, not (re)init... */
 	drbd_md_set_sector_offsets(mdev, mdev->ldev);
 
-	u_size = mdev->ldev->dc.disk_size;
+	rcu_read_lock();
+	u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
 	size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
 
 	if (drbd_get_capacity(mdev->this_bdev) != size ||
@@ -979,7 +988,9 @@
 		struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
 
 		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
-		max_segments = mdev->ldev->dc.max_bio_bvecs;
+		rcu_read_lock();
+		max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
+		rcu_read_unlock();
 		put_ldev(mdev);
 	}
 
@@ -1095,7 +1106,7 @@
 {
 	enum drbd_ret_code retcode;
 	struct drbd_conf *mdev;
-	struct disk_conf *new_disk_conf;
+	struct disk_conf *new_disk_conf, *old_disk_conf;
 	int err, fifo_size;
 	int *rs_plan_s = NULL;
 
@@ -1114,19 +1125,15 @@
 		goto out;
 	}
 
-/* FIXME freeze IO, cluster wide.
- *
- * We should make sure no-one uses
- * some half-updated struct when we
- * assign it later. */
-
-	new_disk_conf = kmalloc(sizeof(*new_disk_conf), GFP_KERNEL);
+	new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
 	if (!new_disk_conf) {
 		retcode = ERR_NOMEM;
 		goto fail;
 	}
 
-	memcpy(new_disk_conf, &mdev->ldev->dc, sizeof(*new_disk_conf));
+	mutex_lock(&mdev->tconn->conf_update);
+	old_disk_conf = mdev->ldev->disk_conf;
+	*new_disk_conf = *old_disk_conf;
 	if (should_set_defaults(info))
 		set_disk_conf_defaults(new_disk_conf);
 
@@ -1151,7 +1158,7 @@
 		if (!rs_plan_s) {
 			dev_err(DEV, "kmalloc of fifo_buffer failed");
 			retcode = ERR_NOMEM;
-			goto fail;
+			goto fail_unlock;
 		}
 	}
 
@@ -1171,31 +1178,37 @@
 
 	if (err) {
 		retcode = ERR_NOMEM;
-		goto fail;
+		goto fail_unlock;
 	}
 
-	/* FIXME
-	 * To avoid someone looking at a half-updated struct, we probably
-	 * should have a rw-semaphor on net_conf and disk_conf.
-	 */
 	write_lock_irq(&global_state_lock);
 	retcode = drbd_sync_after_valid(mdev, new_disk_conf->resync_after);
 	if (retcode == NO_ERROR) {
-		mdev->ldev->dc = *new_disk_conf;
+		rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
 		drbd_sync_after_changed(mdev);
 	}
 	write_unlock_irq(&global_state_lock);
 
-	drbd_md_sync(mdev);
+	if (retcode != NO_ERROR)
+		goto fail_unlock;
 
+	drbd_md_sync(mdev);
 
 	if (mdev->state.conn >= C_CONNECTED)
 		drbd_send_sync_param(mdev);
 
+	mutex_unlock(&mdev->tconn->conf_update);
+	synchronize_rcu();
+	kfree(old_disk_conf);
+	goto success;
+
+fail_unlock:
+	mutex_unlock(&mdev->tconn->conf_update);
  fail:
-	put_ldev(mdev);
 	kfree(new_disk_conf);
 	kfree(rs_plan_s);
+success:
+	put_ldev(mdev);
  out:
 	drbd_adm_finish(info, retcode);
 	return 0;
@@ -1210,6 +1223,7 @@
 	sector_t max_possible_sectors;
 	sector_t min_md_device_sectors;
 	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct disk_conf *new_disk_conf = NULL;
 	struct block_device *bdev;
 	struct lru_cache *resync_lru = NULL;
 	union drbd_state ns, os;
@@ -1243,17 +1257,22 @@
 		retcode = ERR_NOMEM;
 		goto fail;
 	}
+	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+	if (!new_disk_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	nbc->disk_conf = new_disk_conf;
 
-	set_disk_conf_defaults(&nbc->dc);
-
-	err = disk_conf_from_attrs(&nbc->dc, info);
+	set_disk_conf_defaults(new_disk_conf);
+	err = disk_conf_from_attrs(new_disk_conf, info);
 	if (err) {
 		retcode = ERR_MANDATORY_TAG;
 		drbd_msg_put_info(from_attrs_err_to_txt(err));
 		goto fail;
 	}
 
-	if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
 		retcode = ERR_MD_IDX_INVALID;
 		goto fail;
 	}
@@ -1261,7 +1280,7 @@
 	rcu_read_lock();
 	nc = rcu_dereference(mdev->tconn->net_conf);
 	if (nc) {
-		if (nbc->dc.fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
 			rcu_read_unlock();
 			retcode = ERR_STONITH_AND_PROT_A;
 			goto fail;
@@ -1269,10 +1288,10 @@
 	}
 	rcu_read_unlock();
 
-	bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+	bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
 				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
 	if (IS_ERR(bdev)) {
-		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_DISK;
 		goto fail;
@@ -1287,12 +1306,12 @@
 	 * should check it for you already; but if you don't, or
 	 * someone fooled it, we need to double check here)
 	 */
-	bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+	bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
 				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-				  (nbc->dc.meta_dev_idx < 0) ?
+				  (new_disk_conf->meta_dev_idx < 0) ?
 				  (void *)mdev : (void *)drbd_m_holder);
 	if (IS_ERR(bdev)) {
-		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+		dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
 			PTR_ERR(bdev));
 		retcode = ERR_OPEN_MD_DISK;
 		goto fail;
@@ -1300,8 +1319,8 @@
 	nbc->md_bdev = bdev;
 
 	if ((nbc->backing_bdev == nbc->md_bdev) !=
-	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
-	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
 		retcode = ERR_MD_IDX_INVALID;
 		goto fail;
 	}
@@ -1317,21 +1336,21 @@
 	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
 	drbd_md_set_sector_offsets(mdev, nbc);
 
-	if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
 		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
 			(unsigned long long) drbd_get_max_capacity(nbc),
-			(unsigned long long) nbc->dc.disk_size);
+			(unsigned long long) new_disk_conf->disk_size);
 		retcode = ERR_DISK_TO_SMALL;
 		goto fail;
 	}
 
-	if (nbc->dc.meta_dev_idx < 0) {
+	if (new_disk_conf->meta_dev_idx < 0) {
 		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
 		/* at least one MB, otherwise it does not make sense */
 		min_md_device_sectors = (2<<10);
 	} else {
 		max_possible_sectors = DRBD_MAX_SECTORS;
-		min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+		min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
 	}
 
 	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1356,7 +1375,7 @@
 		dev_warn(DEV, "==> truncating very big lower level device "
 			"to currently maximum possible %llu sectors <==\n",
 			(unsigned long long) max_possible_sectors);
-		if (nbc->dc.meta_dev_idx >= 0)
+		if (new_disk_conf->meta_dev_idx >= 0)
 			dev_warn(DEV, "==>> using internal or flexible "
 				      "meta data may help <<==\n");
 	}
@@ -1399,14 +1418,14 @@
 	}
 
 	/* Since we are diskless, fix the activity log first... */
-	if (drbd_check_al_size(mdev, &nbc->dc)) {
+	if (drbd_check_al_size(mdev, new_disk_conf)) {
 		retcode = ERR_NOMEM;
 		goto force_diskless_dec;
 	}
 
 	/* Prevent shrinking of consistent devices ! */
 	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
-	    drbd_new_dev_size(mdev, nbc, nbc->dc.disk_size, 0) < nbc->md.la_size_sect) {
+	    drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
 		dev_warn(DEV, "refusing to truncate a consistent device\n");
 		retcode = ERR_DISK_TO_SMALL;
 		goto force_diskless_dec;
@@ -1419,11 +1438,13 @@
 
 	/* Reset the "barriers don't work" bits here, then force meta data to
 	 * be written, to ensure we determine if barriers are supported. */
-	if (nbc->dc.no_md_flush)
+	if (new_disk_conf->no_md_flush)
 		set_bit(MD_NO_FUA, &mdev->flags);
 	else
 		clear_bit(MD_NO_FUA, &mdev->flags);
 
+	/* FIXME Missing stuff: rs_plan_s, clip al range */
+
 	/* Point of no return reached.
 	 * Devices and memory are no longer released by error cleanup below.
 	 * now mdev takes over responsibility, and the state engine should
@@ -1433,6 +1454,7 @@
 	mdev->resync = resync_lru;
 	nbc = NULL;
 	resync_lru = NULL;
+	new_disk_conf = NULL;
 
 	mdev->write_ordering = WO_bdev_flush;
 	drbd_bump_write_ordering(mdev, WO_bdev_flush);
@@ -1530,9 +1552,11 @@
 	if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
 		ns.pdsk = D_OUTDATED;
 
-	if ( ns.disk == D_CONSISTENT &&
-	    (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+	rcu_read_lock();
+	if (ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
 		ns.disk = D_UP_TO_DATE;
+	rcu_read_unlock();
 
 	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
 	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
@@ -1589,6 +1613,7 @@
 				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		kfree(nbc);
 	}
+	kfree(new_disk_conf);
 	lc_destroy(resync_lru);
 
  finish:
@@ -1691,7 +1716,7 @@
 
 	idr_for_each_entry(&tconn->volumes, mdev, i) {
 		if (get_ldev(mdev)) {
-			enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
+			enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
 			put_ldev(mdev);
 			if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
 				return ERR_STONITH_AND_PROT_A;
@@ -2159,11 +2184,13 @@
 
 int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 {
+	struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
 	struct resize_parms rs;
 	struct drbd_conf *mdev;
 	enum drbd_ret_code retcode;
 	enum determine_dev_size dd;
 	enum dds_flags ddsf;
+	sector_t u_size;
 	int err;
 
 	retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
@@ -2204,10 +2231,31 @@
 		goto fail;
 	}
 
+	rcu_read_lock();
+	u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
+	if (u_size != (sector_t)rs.resize_size) {
+		new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+		if (!new_disk_conf) {
+			retcode = ERR_NOMEM;
+			goto fail;
+		}
+	}
+
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
 		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
 
-	mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+	if (new_disk_conf) {
+		mutex_lock(&mdev->tconn->conf_update);
+		old_disk_conf = mdev->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+		new_disk_conf->disk_size = (sector_t)rs.resize_size;
+		rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+		mutex_unlock(&mdev->tconn->conf_update);
+		synchronize_rcu();
+		kfree(old_disk_conf);
+	}
+
 	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
 	dd = drbd_determine_dev_size(mdev, ddsf);
 	drbd_md_sync(mdev);
@@ -2501,11 +2549,11 @@
 	if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
 		goto nla_put_failure;
 
+	rcu_read_lock();
 	if (got_ldev)
-		if (disk_conf_to_skb(skb, &mdev->ldev->dc, exclude_sensitive))
+		if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
 			goto nla_put_failure;
 
-	rcu_read_lock();
 	nc = rcu_dereference(mdev->tconn->net_conf);
 	if (nc)
 		err = net_conf_to_skb(skb, nc, exclude_sensitive);