[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1229521998-7870-6-git-send-email-zbr@ioremap.net>
Date: Wed, 17 Dec 2008 16:53:16 +0300
From: Evgeniy Polyakov <zbr@...emap.net>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org, Evgeniy Polyakov <zbr@...emap.net>
Subject: [5/7] dst: transactions.
DST uses transaction model, when each store has to be explicitly acked
from the remote node to be considered as successfully written. There
may be lots of in-flight transactions. When remote host does not ack
the transaction it will be resent predefined number of times with specified
timeouts between them. All those parameters are configurable. Transactions
are marked as failed after all resends completed unsuccessfully, having
long enough resend timeout and/or large number of resends allows not to
return error to the higher (FS usually) layer in case of short network
problems or remote node outages. In case of network RAID setup this means
that storage will not degrade until transactions are marked as failed, and
thus will not force checksum recalculation and data rebuild. In case of
connection failure DST will try to reconnect to the remote node automatically.
DST sends ping commands at idle time to detect if remote node is alive.
Because of transactional model it is possible to use zero-copy sending
without worry of data corruption (which in turn could be detected by the
strong checksums though).
Transactions are handled in this patch: allocation/freeing/completion,
scanning for stall and to-be-resent transactions and overall management
of the storing tree.
Signed-off-by: Evgeniy Polyakov <zbr@...emap.net>
diff --git a/drivers/block/dst/trans.c b/drivers/block/dst/trans.c
new file mode 100644
index 0000000..557d372
--- /dev/null
+++ b/drivers/block/dst/trans.c
@@ -0,0 +1,335 @@
+/*
+ * 2007+ Copyright (c) Evgeniy Polyakov <zbr@...emap.net>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/bio.h>
+#include <linux/dst.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+
+/*
+ * Transaction memory pool size.
+ */
+static int dst_mempool_num = 32;
+module_param(dst_mempool_num, int, 0644);
+
+/*
+ * Transaction tree management.
+ */
+static inline int dst_trans_cmp(dst_gen_t gen, dst_gen_t new)
+{
+ if (gen < new)
+ return 1;
+ if (gen > new)
+ return -1;
+ return 0;
+}
+
+struct dst_trans *dst_trans_search(struct dst_node *node, dst_gen_t gen)
+{
+ struct rb_root *root = &node->trans_root;
+ struct rb_node *n = root->rb_node;
+ struct dst_trans *t, *ret = NULL;
+ int cmp;
+
+ while (n) {
+ t = rb_entry(n, struct dst_trans, trans_entry);
+
+ cmp = dst_trans_cmp(t->gen, gen);
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else {
+ ret = t;
+ break;
+ }
+ }
+
+ dprintk("%s: %s transaction: id: %llu.\n", __func__,
+ (ret)?"found":"not found", gen);
+
+ return ret;
+}
+
+static int dst_trans_insert(struct dst_trans *new)
+{
+ struct rb_root *root = &new->n->trans_root;
+ struct rb_node **n = &root->rb_node, *parent = NULL;
+ struct dst_trans *ret = NULL, *t;
+ int cmp;
+
+ while (*n) {
+ parent = *n;
+
+ t = rb_entry(parent, struct dst_trans, trans_entry);
+
+ cmp = dst_trans_cmp(t->gen, new->gen);
+ if (cmp < 0)
+ n = &parent->rb_left;
+ else if (cmp > 0)
+ n = &parent->rb_right;
+ else {
+ ret = t;
+ break;
+ }
+ }
+
+ new->send_time = jiffies;
+ if (ret) {
+ printk("%s: exist: old: gen: %llu, bio: %llu/%u, send_time: %lu, "
+ "new: gen: %llu, bio: %llu/%u, send_time: %lu.\n",
+ __func__,
+ ret->gen, (u64)ret->bio->bi_sector,
+ ret->bio->bi_size, ret->send_time,
+ new->gen, (u64)new->bio->bi_sector,
+ new->bio->bi_size, new->send_time);
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->trans_entry, parent, n);
+ rb_insert_color(&new->trans_entry, root);
+
+ dprintk("%s: inserted: gen: %llu, bio: %llu/%u, send_time: %lu.\n",
+ __func__, new->gen, (u64)new->bio->bi_sector,
+ new->bio->bi_size, new->send_time);
+
+ return 0;
+}
+
+int dst_trans_remove_nolock(struct dst_trans *t)
+{
+ struct dst_node *n = t->n;
+
+ if (t->trans_entry.rb_parent_color) {
+ rb_erase(&t->trans_entry, &n->trans_root);
+ t->trans_entry.rb_parent_color = 0;
+ }
+ return 0;
+}
+
+int dst_trans_remove(struct dst_trans *t)
+{
+ int ret;
+ struct dst_node *n = t->n;
+
+ mutex_lock(&n->trans_lock);
+ ret = dst_trans_remove_nolock(t);
+ mutex_unlock(&n->trans_lock);
+
+ return ret;
+}
+
+/*
+ * When transaction is completed and there are no more users,
+ * we complete appriate block IO request with given error status.
+ */
+void dst_trans_put(struct dst_trans *t)
+{
+ if (atomic_dec_and_test(&t->refcnt)) {
+ struct bio *bio = t->bio;
+
+ dprintk("%s: completed t: %p, gen: %llu, bio: %p.\n",
+ __func__, t, t->gen, bio);
+
+ bio_endio(bio, t->error);
+ bio_put(bio);
+
+ dst_node_put(t->n);
+ mempool_free(t, t->n->trans_pool);
+ }
+}
+
+/*
+ * Process given block IO request: allocate transaction, insert it into the tree
+ * and send/schedule crypto processing.
+ */
+int dst_process_bio(struct dst_node *n, struct bio *bio)
+{
+ struct dst_trans *t;
+ int err = -ENOMEM;
+
+ t = mempool_alloc(n->trans_pool, GFP_NOFS);
+ if (!t)
+ goto err_out_exit;
+
+ t->n = dst_node_get(n);
+ t->bio = bio;
+ t->error = 0;
+ t->retries = 0;
+ atomic_set(&t->refcnt, 1);
+ t->gen = atomic_long_inc_return(&n->gen);
+
+ t->enc = bio_data_dir(bio);
+ dst_bio_to_cmd(bio, &t->cmd, DST_IO, t->gen);
+
+ mutex_lock(&n->trans_lock);
+ err = dst_trans_insert(t);
+ mutex_unlock(&n->trans_lock);
+ if (err)
+ goto err_out_free;
+
+ dprintk("%s: gen: %llu, bio: %llu/%u, dir/enc: %d, need_crypto: %d.\n",
+ __func__, t->gen, (u64)bio->bi_sector,
+ bio->bi_size, t->enc, dst_need_crypto(n));
+
+ if (dst_need_crypto(n) && t->enc)
+ dst_trans_crypto(t);
+ else
+ dst_trans_send(t);
+
+ return 0;
+
+err_out_free:
+ dst_node_put(n);
+ mempool_free(t, n->trans_pool);
+err_out_exit:
+ bio_endio(bio, err);
+ bio_put(bio);
+ return err;
+}
+
+/*
+ * Scan for timeout/stale transactions.
+ * Each transaction is being resent multiple times before error completion.
+ */
+static void dst_trans_scan(struct work_struct *work)
+{
+ struct dst_node *n = container_of(work, struct dst_node, trans_work.work);
+ struct rb_node *rb_node;
+ struct dst_trans *t;
+ unsigned long timeout = n->trans_scan_timeout;
+ int num = 10 * n->trans_max_retries;
+
+ mutex_lock(&n->trans_lock);
+
+ for (rb_node = rb_first(&n->trans_root); rb_node; ) {
+ t = rb_entry(rb_node, struct dst_trans, trans_entry);
+
+ if (timeout && time_after(t->send_time + timeout, jiffies)
+ && t->retries == 0)
+ break;
+#if 0
+ dprintk("%s: t: %p, gen: %llu, n: %s, retries: %u, max: %u.\n",
+ __func__, t, t->gen, n->name,
+ t->retries, n->trans_max_retries);
+#endif
+ if (--num == 0)
+ break;
+
+ dst_trans_get(t);
+
+ rb_node = rb_next(rb_node);
+
+ if (timeout && (++t->retries < n->trans_max_retries)) {
+ dst_trans_send(t);
+ } else {
+ t->error = -ETIMEDOUT;
+ dst_trans_remove_nolock(t);
+ dst_trans_put(t);
+ }
+
+ dst_trans_put(t);
+ }
+
+ mutex_unlock(&n->trans_lock);
+
+ /*
+ * If no timeout specified then system is in the middle of exiting process,
+ * so no need to reschedule scanning process again.
+ */
+ if (timeout) {
+ if (!num)
+ timeout = HZ;
+ schedule_delayed_work(&n->trans_work, timeout);
+ }
+}
+
+/*
+ * Flush all transactions and mark them as timed out.
+ * Destroy transaction pools.
+ */
+void dst_node_trans_exit(struct dst_node *n)
+{
+ struct dst_trans *t;
+ struct rb_node *rb_node;
+
+ if (!n->trans_cache)
+ return;
+
+ dprintk("%s: n: %p, cancelling the work.\n", __func__, n);
+ cancel_delayed_work_sync(&n->trans_work);
+ flush_scheduled_work();
+ dprintk("%s: n: %p, work has been cancelled.\n", __func__, n);
+
+ for (rb_node = rb_first(&n->trans_root); rb_node; ) {
+ t = rb_entry(rb_node, struct dst_trans, trans_entry);
+
+ dprintk("%s: t: %p, gen: %llu, n: %s.\n",
+ __func__, t, t->gen, n->name);
+
+ rb_node = rb_next(rb_node);
+
+ t->error = -ETIMEDOUT;
+ dst_trans_remove_nolock(t);
+ dst_trans_put(t);
+ }
+
+ mempool_destroy(n->trans_pool);
+ kmem_cache_destroy(n->trans_cache);
+}
+
+/*
+ * Initialize transaction storage for given node.
+ * Transaction stores not only control information,
+ * but also network command and crypto data (if needed)
+ * to reduce number of allocations. Thus transaction size
+ * differs from node to node.
+ */
+int dst_node_trans_init(struct dst_node *n, unsigned int size)
+{
+ /*
+ * We need this, since node with given name can be dropped from the
+ * hash table, but be still alive, so subsequent creation of the node
+ * with the same name may collide with existing cache name.
+ */
+
+ snprintf(n->cache_name, sizeof(n->cache_name), "%s-%p", n->name, n);
+
+ n->trans_cache = kmem_cache_create(n->cache_name,
+ size + n->crypto.crypto_attached_size,
+ 0, 0, NULL);
+ if (!n->trans_cache)
+ goto err_out_exit;
+
+ n->trans_pool = mempool_create_slab_pool(dst_mempool_num, n->trans_cache);
+ if (!n->trans_pool)
+ goto err_out_cache_destroy;
+
+ mutex_init(&n->trans_lock);
+ n->trans_root = RB_ROOT;
+
+ INIT_DELAYED_WORK(&n->trans_work, dst_trans_scan);
+ schedule_delayed_work(&n->trans_work, n->trans_scan_timeout);
+
+ dprintk("%s: n: %p, size: %u, crypto: %u.\n",
+ __func__, n, size, n->crypto.crypto_attached_size);
+
+ return 0;
+
+err_out_cache_destroy:
+ kmem_cache_destroy(n->trans_cache);
+err_out_exit:
+ return -ENOMEM;
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists