lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Date:	Fri, 23 Dec 2011 18:01:24 +0300
From:	Evgeniy Polyakov <zbr@...emap.net>
To:	linux-kernel@...r.kernel.org
Subject: POHMELFS: Happy New Year and Merry Christmas

Hi

I know, most of you asked Santa Claus for new distributed filesystem.
So instead of increasing entropy and size of the kernel Sata brings you
old but completely redesigned distributed filesystem.

Pohmelfs moved away from old and basically non-working design of
parallel NFS to real-life working distributed storage named elliptics [1]

It implements key-value storage which by default works as DHT.
As every distributed system it has replicas, automatic recovery,
integrity checksums, no single-points-of-failure aka master server/name
node and so on, solumn data store, automatic repartitioning on new
servers, compression support, server-side scripting and so on
Server-side scripting allows to perform arbitrary actions on provided
data, for example pohmelfs directory support implemented that way - all
transactions in elliptics are atomic (on single replica) by default, so
server-side may read data, update it the way it likes and push it back
into the storage.

Elliptics is used in production about 2 years now and hosts clusters
from several billions of small objects (Yandex.Maps/partially
Yandex.Photos) to 1 Pb storage (Yandex.Music)

Main goal of this storage is ability to work with multiple datacenters
out of the box (replication unit aka group fits quite well into single
datacenter abstraction) and is aimed at high IO operations per second
workload type instead of boring bulk MB/s IOs in 21 century.
Getting its p2p nature bulk IO is not an issue either.

Pohmelfs is a POSIX frontend to elliptics. It supports weak
synchronization between mounted nodes in that regard, that data
read/written into local page cache is not synced with the storage until
timeout fires. Directory content when read is populated directly into
dentry/inode cache instead of doing that per-entry to make directory
reading fast.

Yet there are number of features to complete:
 - quorum read. pohmelfs supports quorum write only so far
 - http compatibility mode - we do want to upload data via pohmelfs and
   read it through http applications. And vice versa actually too.
 - column read-write or more generally file-as-directory feature
 - even more testing - abusing dcache is fun, but likely there are
   hiddens stones
 - replace drivers/staging/pohmelfs with this code

That was a geek-style present from Santa. See you next year!
Thank you.

Signed-off-by: Evgeniy "Geeky Santa" Polyakov <zbr@...emap.net>

1. Elliptics network
http://www.ioremap.net/projects/elliptics

2. Pohmelfs
http://www.ioremap.net/taxonomy/term/4


diff --git a/fs/Kconfig b/fs/Kconfig
index 9fe0b34..7232749 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -259,6 +259,7 @@ config NFS_COMMON
 source "net/sunrpc/Kconfig"
 source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
+source "fs/pohmelfs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
 source "fs/afs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index afc1096..36664fe 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,3 +123,4 @@ obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
+obj-$(CONFIG_POHMELFS)		+= pohmelfs/
diff --git a/fs/pohmelfs/Kconfig b/fs/pohmelfs/Kconfig
new file mode 100644
index 0000000..b91e56d
--- /dev/null
+++ b/fs/pohmelfs/Kconfig
@@ -0,0 +1,11 @@
+config POHMELFS
+	tristate "POHMELFS distributed filesystem"
+	depends on INET && EXPERIMENTAL
+	select CRYPTO_HASH
+	help
+	  POHMELFS is a POSIX frontend to Elliptics network
+
+	  Elliptics is a key/value storage, which by default imlpements
+	  distributed hash table structure.
+
+	  More information can be found at http://www.ioremap.net/projects/elliptics
diff --git a/fs/pohmelfs/Makefile b/fs/pohmelfs/Makefile
new file mode 100644
index 0000000..ad358d7
--- /dev/null
+++ b/fs/pohmelfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux ext2-filesystem routines.
+#
+
+obj-$(CONFIG_POHMELFS) += pohmelfs.o
+
+pohmelfs-y := dir.o file.o inode.o net.o route.o super.o trans.o symlink.o
diff --git a/fs/pohmelfs/Module.symvers b/fs/pohmelfs/Module.symvers
new file mode 100644
index 0000000..e69de29
diff --git a/fs/pohmelfs/dir.c b/fs/pohmelfs/dir.c
new file mode 100644
index 0000000..a7aa093
--- /dev/null
+++ b/fs/pohmelfs/dir.c
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/fs.h>
+#include <linux/dcache.h>
+
+#include "pohmelfs.h"
+
+#define POHMELFS_LOOKUP_SCRIPT				"pohmelfs_lookup.py"
+#define POHMELFS_UNLINK_SCRIPT				"pohmelfs_unlink.py"
+#define POHMELFS_RENAME_SCRIPT				"pohmelfs_rename.py"
+#define POHMELFS_INODE_INFO_SCRIPT_INSERT		"pohmelfs_inode_info_insert.py"
+#define POHMELFS_DENTRY_NAME_SCRIPT			"pohmelfs_dentry_name="
+
+static void pohmelfs_inode_dirty(struct pohmelfs_inode *parent, struct pohmelfs_inode *pi)
+{
+	struct inode *inode = &pi->vfs_inode;
+	struct inode *dir = &parent->vfs_inode;
+
+	pi->parent_id = parent->id;
+	inode_init_owner(inode, dir, inode->i_mode);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	dir->i_mtime = CURRENT_TIME;
+
+	mark_inode_dirty(inode);
+	mark_inode_dirty(dir);
+}
+
+struct pohmelfs_script_req {
+	char			*obj_name;
+	int			obj_len;
+
+	char			*script_name;
+	int			script_namelen;
+
+	void			*binary;
+	int			binary_size;
+
+	int			group_id;
+
+	int			sync;
+
+	struct dnet_raw_id	*id;
+
+	int			(* complete)(struct pohmelfs_trans *t, struct pohmelfs_state *recv);
+	void			*ret;
+};
+
+static int pohmelfs_send_inode_info_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(t->inode);
+	struct pohmelfs_wait *wait = t->priv;
+	struct dnet_cmd *cmd = &recv->cmd;
+	unsigned long long trans = cmd->trans & ~DNET_TRANS_REPLY;
+
+	if (cmd->flags & DNET_FLAGS_MORE) {
+		if (cmd->status == 0 && cmd->size != sizeof(struct dnet_attr) + 2)
+			cmd->status = -EINVAL;
+
+		pr_debug("pohmelfs: %s: pohmelfs_send_inode_info_complete: %llu, cmd_size: %llu, flags: %x, status: %d\n",
+				pohmelfs_dump_id(pi->id.id), trans, cmd->size, cmd->flags, cmd->status);
+
+		if (!cmd->status)
+			wait->condition = 1;
+		else
+			wait->condition = cmd->status;
+	}
+
+	return 0;
+}
+
+static int pohmelfs_send_inode_info_init(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	pohmelfs_wait_get(wait);
+	return 0;
+}
+
+static void pohmelfs_send_inode_info_destroy(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	wake_up(&wait->wq);
+	pohmelfs_wait_put(wait);
+}
+
+static int pohmelfs_lookup_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_inode *parent = pohmelfs_inode(t->inode);
+	struct pohmelfs_wait *wait = t->priv;
+	struct dnet_cmd *cmd = &recv->cmd;
+	unsigned long long trans = cmd->trans & ~DNET_TRANS_REPLY;
+	int err = cmd->status;
+
+	if (err)
+		goto err_out_exit;
+
+	if (cmd->flags & DNET_FLAGS_MORE) {
+		struct pohmelfs_inode_info *info;
+		struct pohmelfs_inode *pi;
+
+		if (cmd->size != sizeof(struct dnet_attr) + sizeof(struct pohmelfs_inode_info)) {
+			err = -ENOENT;
+			goto err_out_exit;
+		}
+
+		pr_debug("pohmelfs: %s: pohmelfs_lookup_complete: %llu, size: %llu, min size: %zu, flags: %x, status: %d\n",
+				pohmelfs_dump_id(parent->id.id), trans, cmd->size,
+				sizeof(struct dnet_attr) + sizeof(struct pohmelfs_inode_info), cmd->flags, cmd->status);
+
+
+		info = t->recv_data + sizeof(struct dnet_attr);
+		pohmelfs_convert_inode_info(info);
+
+		pi = pohmelfs_existing_inode(pohmelfs_sb(t->inode->i_sb), info);
+		if (IS_ERR(pi)) {
+			err = PTR_ERR(pi);
+			goto err_out_exit;
+		}
+
+		pi->parent_id = parent->id;
+		pi->received = 1;
+		wait->ret = pi;
+	}
+
+err_out_exit:
+	if (err)
+		wait->condition = err;
+	else
+		wait->condition = 1;
+
+	return 0;
+}
+
+static int pohmelfs_send_script_request(struct pohmelfs_inode *parent, struct pohmelfs_script_req *req)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(parent->vfs_inode.i_sb);
+	struct pohmelfs_wait *wait;
+	struct pohmelfs_io *pio;
+	struct dnet_exec *e;
+	int script_len;
+	long ret;
+	int err;
+
+	/* 2 commas, \n and 0-byte, which is accounted in sizeof(string) */
+	script_len = sizeof(POHMELFS_DENTRY_NAME_SCRIPT) + req->obj_len + 3;
+
+	wait = pohmelfs_wait_alloc(parent);
+	if (!wait) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pio = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!pio) {
+		err = -ENOMEM;
+		goto err_out_wait_put;
+	}
+
+	e = kmalloc(sizeof(struct dnet_exec) + req->script_namelen + script_len + req->binary_size, GFP_NOIO);
+	if (!e) {
+		err = -ENOMEM;
+		goto err_out_free_pio;
+	}
+
+	memset(e, 0, sizeof(struct dnet_exec));
+
+	snprintf(e->data, req->script_namelen + script_len, "%s%s'%s'\n", req->script_name, POHMELFS_DENTRY_NAME_SCRIPT, req->obj_name);
+	script_len--; /* do not include last 0-byte in the script */
+
+	memcpy(e->data + req->script_namelen + script_len, req->binary, req->binary_size);
+
+	e->type = DNET_EXEC_PYTHON_SCRIPT_NAME;
+	e->name_size = req->script_namelen;
+	e->script_size = script_len;
+	e->binary_size = req->binary_size;
+	dnet_convert_exec(e);
+
+	pio->pi = parent;
+	pio->id = req->id;
+	pio->group_id = req->group_id;
+	pio->cflags = DNET_FLAGS_NEED_ACK;
+	if (req->complete == pohmelfs_lookup_complete)
+		pio->cflags |= DNET_FLAGS_NOLOCK;
+
+	pio->cmd = DNET_CMD_EXEC;
+	pio->size = sizeof(struct dnet_exec) + req->script_namelen + script_len + req->binary_size;
+	pio->data = e;
+	pio->priv = wait;
+	pio->cb.init = pohmelfs_send_inode_info_init;
+	pio->cb.destroy = pohmelfs_send_inode_info_destroy;
+	pio->cb.complete = req->complete;
+
+	if (pio->group_id) {
+		err = pohmelfs_send_buf_single(pio, NULL);
+	} else {
+		err = pohmelfs_send_buf(pio);
+	}
+	if (err)
+		goto err_out_free;
+
+	if (req->sync) {
+		ret = wait_event_interruptible_timeout(wait->wq, wait->condition != 0, msecs_to_jiffies(psb->read_wait_timeout));
+		if (ret <= 0) {
+			err = ret;
+			if (ret == 0)
+				err = -ETIMEDOUT;
+			goto err_out_free;
+		}
+
+		if (wait->condition < 0)
+			err = wait->condition;
+
+		req->ret = wait->ret;
+	}
+
+	{
+		int len = 6;
+		char parent_id_str[len*2+1];
+
+		pr_debug("pohmelfs: %.*s: %s: inode->id: %s, ino: %lu, object: %s, binary size: %d\n",
+				req->script_namelen, req->script_name,
+				pohmelfs_dump_id(req->id->id),
+				pohmelfs_dump_id_len_raw(parent->id.id, len, parent_id_str),
+				parent->vfs_inode.i_ino, req->obj_name, req->binary_size);
+	}
+
+err_out_free:
+	kfree(e);
+err_out_free_pio:
+	kmem_cache_free(pohmelfs_io_cache, pio);
+err_out_wait_put:
+	pohmelfs_wait_put(wait);
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_send_inode_info(struct pohmelfs_inode *pi, struct dnet_raw_id *id, const char *sname, int len, int sync)
+{
+	struct pohmelfs_inode_info_binary_package *bin;
+	struct pohmelfs_script_req req;
+	int err;
+
+	if (!len) {
+		err = -EINVAL;
+		goto err_out_exit;
+	}
+
+	bin = kmem_cache_alloc(pohmelfs_inode_info_binary_package_cache, GFP_NOIO);
+	if (!bin) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	req.script_name = POHMELFS_INODE_INFO_SCRIPT_INSERT;
+	req.script_namelen = sizeof(POHMELFS_INODE_INFO_SCRIPT_INSERT) - 1; /* not including 0-byte */
+
+	req.obj_name = (char *)sname;
+	req.obj_len = len;
+
+	req.binary = bin;
+	req.binary_size = sizeof(struct pohmelfs_inode_info) + sizeof(struct dnet_raw_id);
+
+	req.group_id = 0;
+	req.id = id;
+
+	req.sync = sync;
+
+	memcpy(&bin->parent, id, sizeof(struct dnet_raw_id));
+	pohmelfs_fill_inode_info(&pi->vfs_inode, &bin->info);
+	bin->info.namelen = len;
+
+	pohmelfs_convert_inode_info(&bin->info);
+
+	req.complete = pohmelfs_send_inode_info_complete;
+
+	err = pohmelfs_send_script_request(pi, &req);
+	if (err)
+		goto err_out_free;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_inode_info_binary_package_cache, bin);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	struct pohmelfs_inode *parent = pohmelfs_inode(dir);
+	struct pohmelfs_inode *pi;
+	int err;
+
+	pi = pohmelfs_new_inode(psb, mode);
+	if (IS_ERR(pi)) {
+		err = PTR_ERR(pi);
+		goto err_out_exit;
+	}
+
+	pohmelfs_inode_dirty(parent, pi);
+
+	pr_debug("pohmelfs: create: %s, ino: %lu, parent dir: %lu, object: %s\n",
+			pohmelfs_dump_id(pi->id.id), pi->vfs_inode.i_ino,
+			dir->i_ino, dentry->d_name.name);
+
+	/*
+	 * calling d_instantiate() implies that
+	 * ->lookup() used d_splice_alias() with NULL inode
+	 *  when it failed to find requested object
+	 */
+	d_instantiate(dentry, &pi->vfs_inode);
+
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+struct pohmelfs_readdir_header {
+	char				magic[8];
+	unsigned short			version;
+	unsigned short			chunk_size;
+	unsigned int			chunk_num;
+} __attribute__((packed));
+
+static void pohmelfs_convert_readdir_header(struct pohmelfs_readdir_header *h)
+{
+	h->version = dnet_bswap16(h->version);
+	h->chunk_size = dnet_bswap16(h->chunk_size);
+	h->chunk_num = dnet_bswap32(h->chunk_num);
+}
+
+struct pohmelfs_readdir_chunk_header {
+	unsigned short			length;
+	unsigned short			num;
+	unsigned short			key_size;
+	unsigned short			payload_size;
+} __attribute__((packed));
+
+static void pohmelfs_convert_readdir_chunk_header(struct pohmelfs_readdir_chunk_header *h)
+{
+	h->length = dnet_bswap16(h->length);
+	h->num = dnet_bswap16(h->num);
+	h->key_size = dnet_bswap16(h->key_size);
+	h->payload_size = dnet_bswap16(h->payload_size);
+}
+
+/* Chunk size = maximum file name length + sizeof header + sizeof pohmelfs_inode_info
+ * It allows to store whole file entry on 1 chunk
+ */
+#define POHMELFS_CHUNK_SIZE	(NAME_MAX + 1 + sizeof(struct pohmelfs_inode_info) + sizeof(struct pohmelfs_readdir_chunk_header))
+
+enum pohmelfs_readdir_states {
+	POHMELFS_READDIR_WANT_HEADER = 1,
+	POHMELFS_READDIR_WANT_RECV_CHUNK,
+};
+
+struct pohmelfs_readdir_priv {
+	struct pohmelfs_wait		*wait;
+
+	struct kref			refcnt;
+
+	struct pohmelfs_readdir_header	header;
+
+	int				state;
+	int				read_total; /* number of inode offsets read or processed total (in all chunks summed) */
+	int				read_in_inode; /* offset of name+pohmelfs_inode_info read in below buffer */
+
+	char				chunk[POHMELFS_CHUNK_SIZE];
+};
+
+static void pohmelfs_readdir_free(struct kref *kref)
+{
+	struct pohmelfs_readdir_priv *priv = container_of(kref, struct pohmelfs_readdir_priv, refcnt);
+
+	if (priv->wait)
+		pohmelfs_wait_put(priv->wait);
+	kfree(priv);
+}
+
+static void pohmelfs_readdir_destroy(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_readdir_priv *priv = t->priv;
+	struct pohmelfs_wait *wait = priv->wait;
+
+	wake_up(&wait->wq);
+	kref_put(&priv->refcnt, pohmelfs_readdir_free);
+}
+
+static int pohmelfs_readdir_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_readdir_priv *priv = t->priv;
+	struct pohmelfs_wait *wait = priv->wait;
+	struct dnet_cmd *cmd = &recv->cmd;
+
+	if (t->recv_data) {
+		kfree(t->recv_data);
+		t->recv_data = NULL;
+	}
+
+	if (!(cmd->flags & DNET_FLAGS_MORE)) {
+		wait->condition = cmd->status;
+		if (!wait->condition)
+			wait->condition = 1;
+	}
+
+	return 0;
+}
+
+static int pohmelfs_dentry_add(struct pohmelfs_inode *parent, struct pohmelfs_inode *pi, char *name, int len)
+{
+	struct inode *inode = &pi->vfs_inode;
+	struct inode *dir = &parent->vfs_inode;
+	struct dentry *dentry, *parent_dentry, *old;
+	struct qstr str;
+	int err;
+
+	str.name = name;
+	str.len = len;
+	str.hash = full_name_hash(str.name, str.len);
+
+	/* we do not need to hold dir->i_mutex here, don't we? :) */
+	parent_dentry = d_find_alias(dir);
+	if (!parent_dentry) {
+		err = -ENOENT;
+		goto err_out_exit;
+	}
+
+	dentry = d_lookup(parent_dentry, &str);
+	if (dentry) {
+		err = -EEXIST;
+
+		dentry->d_fsdata = NULL;
+		dput(dentry);
+		goto err_out_put_parent;
+	}
+	/*
+	 * if things are ok, dentry has 2 references -
+	 * one in parent dir, and another its own,
+	 * which we should drop
+	 */
+	dentry = d_alloc(parent_dentry, &str);
+	if (!dentry) {
+		err = -ENOMEM;
+		goto err_out_put_parent;
+	}
+
+	old = d_splice_alias(inode, dentry);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	} else {
+		dput(dentry);
+	}
+
+	dput(parent_dentry);
+	return 0;
+
+err_out_put_parent:
+	dput(parent_dentry);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_update_inode(struct pohmelfs_inode *parent, struct pohmelfs_inode_info *info, char *name)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(parent->vfs_inode.i_sb);
+	struct pohmelfs_inode *pi;
+	struct inode *inode;
+	int err = 0;
+	int existing = 0;
+
+	pi = pohmelfs_sb_inode_lookup(psb, &info->id);
+	if (pi) {
+		inode = &pi->vfs_inode;
+		pohmelfs_fill_inode(inode, info);
+		existing = 1;
+	} else {
+		pi = pohmelfs_existing_inode(psb, info);
+		if (IS_ERR(pi)) {
+			err = PTR_ERR(pi);
+			goto err_out_exit;
+		}
+		inode = &pi->vfs_inode;
+
+		pi->parent_id = parent->id;
+	}
+
+	err = pohmelfs_dentry_add(parent, pi, name, info->namelen);
+	inode->i_version = 0;
+	pi->received = 1;
+
+	/*
+	 * We incremented refcnt for existing inodes,
+	 * but if there is no dentry for inode in question,
+	 * then we will allocate and connect them, otherwise
+	 * we have to drop its reference counter (i.e. when
+	 * dentry for this inode already exists)
+	 */
+
+	pr_debug("pohmelfs: %s: update inode: %lu, existing: %d, refcnt: %d, err: %d\n",
+			pohmelfs_dump_id(pi->id.id), inode->i_ino, existing,
+			atomic_read(&inode->i_count), err);
+	if (err)
+		iput(inode);
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_readdir_recv_reply(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_readdir_priv *priv = t->priv;
+	struct dnet_cmd *cmd = &recv->cmd;
+	int attr_size = sizeof(struct dnet_attr) + sizeof(struct dnet_io_attr);
+	long old_recv_offset;
+	void *data;
+	int size;
+	int err = 0;
+
+	if (t->recv_offset < attr_size) {
+		data = &t->cmd.attr;
+
+		data += t->recv_offset;
+		size = attr_size - t->recv_offset;
+
+		err = pohmelfs_recv(t, recv, data, size);
+		if (err < 0)
+			goto err_out_exit;
+
+		if (t->recv_offset == attr_size) {
+			dnet_convert_attr(&t->cmd.attr);
+			dnet_convert_io_attr(&t->cmd.p.io);
+
+			pr_debug("pohmelfs: %d:%s: cmd size: %llu, io size: %llu\n",
+				cmd->id.group_id, pohmelfs_dump_id(cmd->id.id),
+				(unsigned long long)cmd->size, (unsigned long long)t->cmd.p.io.size);
+
+			priv->state = POHMELFS_READDIR_WANT_HEADER;
+		}
+	}
+
+	if (priv->state == POHMELFS_READDIR_WANT_HEADER) {
+		int header_size_to_read = sizeof(struct pohmelfs_readdir_header) - (t->recv_offset - attr_size);
+
+		data = &priv->header;
+		data += sizeof(struct pohmelfs_readdir_header) - header_size_to_read;
+
+		err = pohmelfs_recv(t, recv, data, header_size_to_read);
+		if (err < 0)
+			goto err_out_exit;
+
+		pohmelfs_convert_readdir_header(&priv->header);
+		priv->read_total = 0;
+		priv->read_in_inode = 0;
+		priv->state = POHMELFS_READDIR_WANT_RECV_CHUNK;
+
+		pr_debug("pohmelfs: %d:%s: header: header size: %d, version: %hd, chunk_size: %hd, chunk_num: %d\n",
+			cmd->id.group_id, pohmelfs_dump_id(cmd->id.id), header_size_to_read,
+			priv->header.version, priv->header.chunk_size, priv->header.chunk_num);
+
+		if (priv->header.chunk_size > POHMELFS_CHUNK_SIZE) {
+			err = -E2BIG;
+			goto err_out_exit;
+		}
+	}
+
+get_new_chunk:
+	if (priv->read_total == priv->header.chunk_num) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	if (priv->state == POHMELFS_READDIR_WANT_RECV_CHUNK) {
+		data = priv->chunk + priv->read_in_inode;
+		size = POHMELFS_CHUNK_SIZE - priv->read_in_inode;
+
+		old_recv_offset = t->recv_offset;
+
+		err = pohmelfs_recv(t, recv, data, size);
+		if (err < 0)
+			goto err_out_exit;
+
+		priv->read_in_inode += t->recv_offset - old_recv_offset;
+
+		if (priv->read_in_inode == POHMELFS_CHUNK_SIZE) {
+			struct pohmelfs_readdir_chunk_header *chunk_header;
+			struct pohmelfs_inode_info *info;
+			char *filename;
+
+			priv->read_in_inode = 0;
+			priv->read_total++;
+
+			chunk_header = (struct pohmelfs_readdir_chunk_header *)priv->chunk;
+			pohmelfs_convert_readdir_chunk_header(chunk_header);
+
+			/*
+			 * Here we assume that record always fits in 1 chunk.
+			 * In future this code should be changed to read several chunks
+			 * and concatenate it to build continous buffer for
+			 * file name and pohmelfs_inode_info structure
+			 */
+
+			info = (struct pohmelfs_inode_info *)(priv->chunk + sizeof(struct pohmelfs_readdir_chunk_header) + chunk_header->key_size);
+			pohmelfs_convert_inode_info(info);
+
+			filename = (char *)(priv->chunk + sizeof(struct pohmelfs_readdir_chunk_header));
+
+			err = pohmelfs_update_inode(priv->wait->pi, info, filename);
+			pr_debug("pohmelfs: %d:%s: inode: %llu, namelen: %d, name: %.*s: %d\n",
+				cmd->id.group_id, pohmelfs_dump_id(info->id.id), (unsigned long long)info->ino,
+				info->namelen, info->namelen, filename, err);
+		} else {
+			err = -EAGAIN;
+			goto err_out_exit;
+		}
+
+		if ((priv->read_total < priv->header.chunk_num) && (t->recv_offset < cmd->size))
+			goto get_new_chunk;
+	}
+
+	return 0;
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_readdir_init(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_readdir_priv *priv = t->priv;
+
+	kref_get(&priv->refcnt);
+	return 0;
+}
+
+static int pohmelfs_warm_dir_group(struct inode *dir, int group_id)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	struct pohmelfs_inode *parent = pohmelfs_inode(dir);
+	struct pohmelfs_io *io;
+	struct pohmelfs_readdir_priv *priv;
+	struct pohmelfs_wait *wait;
+	long ret;
+	int err;
+
+	io = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!io) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	priv = kzalloc(sizeof(struct pohmelfs_readdir_priv), GFP_NOIO);
+	if (!priv) {
+		err = -ENOMEM;
+		goto err_out_free;
+	}
+
+	kref_init(&priv->refcnt);
+
+	wait = pohmelfs_wait_alloc(parent);
+	if (!wait) {
+		err = -ENOMEM;
+		goto err_out_put;
+	}
+
+	priv->wait = wait;
+
+	io->pi = parent;
+	io->id = &parent->id;
+	io->cflags = DNET_FLAGS_NEED_ACK | DNET_FLAGS_NOLOCK;
+	io->cmd = DNET_CMD_READ;
+	io->cb.recv_reply = pohmelfs_readdir_recv_reply;
+	io->cb.complete = pohmelfs_readdir_complete;
+	io->cb.destroy = pohmelfs_readdir_destroy;
+	io->cb.init = pohmelfs_readdir_init;
+	io->priv = priv;
+
+	err = pohmelfs_send_io_group(io, group_id);
+	if (err)
+		goto err_out_put;
+
+	/* destruction callback will drop reference */
+	ret = wait_event_interruptible_timeout(wait->wq, wait->condition != 0, msecs_to_jiffies(psb->read_wait_timeout));
+	if (ret <= 0) {
+		err = ret;
+		if (ret == 0)
+			err = -ETIMEDOUT;
+		goto err_out_put;
+	}
+
+	if (wait->condition < 0) {
+		err = wait->condition;
+		goto err_out_put;
+	}
+
+	/* drop the reference we grabbed at creation time */
+	kref_put(&priv->refcnt, pohmelfs_readdir_free);
+	kmem_cache_free(pohmelfs_io_cache, io);
+	return 0;
+
+err_out_put:
+	kref_put(&priv->refcnt, pohmelfs_readdir_free);
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, io);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_warm_dir(struct inode *dir)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	int i, err = -ENOENT;
+
+	for (i = 0; i < psb->group_num; ++i) {
+		err = pohmelfs_warm_dir_group(dir, psb->groups[i]);
+		if (err)
+			continue;
+
+		return 0;
+	}
+
+	return err;
+}
+
+static struct pohmelfs_inode *pohmelfs_lookup_group(struct inode *dir, struct dentry *dentry, int group_id)
+{
+	struct pohmelfs_inode *parent = pohmelfs_inode(dir);
+	struct pohmelfs_script_req req;
+	struct pohmelfs_inode *pi;
+	int err;
+
+	req.script_name = POHMELFS_LOOKUP_SCRIPT;
+	req.script_namelen = sizeof(POHMELFS_LOOKUP_SCRIPT) - 1; /* not including 0-byte */
+
+	req.obj_name = (char *)dentry->d_name.name;
+	req.obj_len = dentry->d_name.len;
+
+	req.binary = &parent->id;
+	req.binary_size = sizeof(struct dnet_raw_id);
+
+	req.id = &parent->id;
+	req.complete = pohmelfs_lookup_complete;
+
+	req.group_id = group_id;
+	req.sync = 1;
+
+	err = pohmelfs_send_script_request(parent, &req);
+	if (err)
+		goto err_out_exit;
+
+	pi = req.ret;
+	if (!pi) {
+		err = -ENOENT;
+		goto err_out_exit;
+	}
+
+	return pi;
+
+err_out_exit:
+	pr_debug("pohmelfs: pohmelfs_lookup_group: %s: group: %d: parent ino: %lu, name: %s: %d\n",
+		pohmelfs_dump_id(parent->id.id), group_id, parent->vfs_inode.i_ino, dentry->d_name.name, err);
+	return ERR_PTR(err);
+}
+
+static struct dentry *pohmelfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	struct inode *inode = NULL;
+	struct pohmelfs_inode *pi;
+	int i, err = -ENOENT;
+
+	for (i = 0; i < psb->group_num; ++i) {
+		pi = pohmelfs_lookup_group(dir, dentry, psb->groups[i]);
+		if (IS_ERR(pi)) {
+			err = PTR_ERR(pi);
+			continue;
+		}
+
+		inode = &pi->vfs_inode;
+		err = 0;
+		break;
+	}
+
+	if (err && (err != -ENOENT) && (err != -EOPNOTSUPP))
+		return ERR_PTR(err);
+
+	return d_splice_alias(inode, dentry);
+}
+
+static int pohmelfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	struct pohmelfs_inode *parent = pohmelfs_inode(dir);
+	struct pohmelfs_inode *pi;
+	int err;
+
+	inode_inc_link_count(dir);
+
+	pi = pohmelfs_new_inode(psb, mode | S_IFDIR);
+	if (IS_ERR(pi)) {
+		err = PTR_ERR(pi);
+		goto err_out_dir;
+	}
+
+	pohmelfs_inode_dirty(parent, pi);
+
+	d_instantiate(dentry, &pi->vfs_inode);
+	pr_debug("pohmelfs: mkdir: %s, ino: %lu, parent dir: %lu, object: %s, refcnt: %d\n",
+			pohmelfs_dump_id(pi->id.id), pi->vfs_inode.i_ino,
+			dir->i_ino, dentry->d_name.name, dentry->d_count);
+
+	return 0;
+
+err_out_dir:
+	inode_dec_link_count(dir);
+	return err;
+}
+
+static int pohmelfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct pohmelfs_inode *parent = pohmelfs_inode(dir);
+	struct pohmelfs_script_req req;
+
+	req.script_name = POHMELFS_UNLINK_SCRIPT;
+	req.script_namelen = sizeof(POHMELFS_UNLINK_SCRIPT) - 1; /* not including 0-byte */
+
+	req.obj_name = (char *)dentry->d_name.name;
+	req.obj_len = dentry->d_name.len;
+
+	req.binary = &parent->id;
+	req.binary_size = sizeof(struct dnet_raw_id);
+
+	req.group_id = 0;
+	req.id = &parent->id;
+	req.complete = pohmelfs_send_inode_info_complete;
+
+	req.sync = 0;
+
+	return pohmelfs_send_script_request(parent, &req);
+}
+
+static int pohmelfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	return pohmelfs_unlink(dir, dentry);
+}
+
+struct pohmelfs_rename_req {
+	struct dnet_raw_id		old_dir_id;
+	struct dnet_raw_id		new_dir_id;
+	int				new_len;
+	char				new_name[0];
+} __attribute__ ((packed));
+
+static int pohmelfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct pohmelfs_inode *old_parent = pohmelfs_inode(old_dir);
+	struct pohmelfs_script_req req;
+	struct pohmelfs_rename_req *r;
+	int size = sizeof(struct pohmelfs_rename_req) + new_dentry->d_name.len;
+	int err;
+
+	r = kmalloc(size, GFP_NOIO);
+	if (!r) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	r->old_dir_id = pohmelfs_inode(old_dir)->id;
+	r->new_dir_id = pohmelfs_inode(new_dir)->id;
+	r->new_len = cpu_to_le32(new_dentry->d_name.len);
+	memcpy(r->new_name, new_dentry->d_name.name, new_dentry->d_name.len);
+
+	req.script_name = POHMELFS_RENAME_SCRIPT;
+	req.script_namelen = sizeof(POHMELFS_RENAME_SCRIPT) - 1; /* not including 0-byte */
+
+	req.obj_name = (char *)old_dentry->d_name.name;
+	req.obj_len = old_dentry->d_name.len;
+
+	req.binary = r;
+	req.binary_size = size;
+
+	req.sync = 0;
+	req.group_id = 0;
+	req.id = &old_parent->id;
+	req.complete = pohmelfs_send_inode_info_complete;
+
+	err = pohmelfs_send_script_request(old_parent, &req);
+	if (err)
+		goto err_out_free;
+
+err_out_free:
+	kfree(r);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(dir->i_sb);
+	struct pohmelfs_inode *pi;
+	struct inode *inode;
+	unsigned len = strlen(symname)+1;
+	int err = 0;
+
+	pi = pohmelfs_new_inode(psb, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(pi)) {
+		err = PTR_ERR(pi);
+		goto err_out_exit;
+	}
+
+	pohmelfs_inode_dirty(pohmelfs_inode(dir), pi);
+	inode = &pi->vfs_inode;
+
+	err = page_symlink(inode, symname, len);
+	if (err)
+		goto err_out_put;
+
+	d_instantiate(dentry, inode);
+
+	return 0;
+
+err_out_put:
+	iput(inode);
+err_out_exit:
+	return err;
+}
+
+const struct inode_operations pohmelfs_dir_inode_operations = {
+	.create		= pohmelfs_create,
+	.lookup 	= pohmelfs_lookup,
+	.mkdir		= pohmelfs_mkdir,
+	.unlink		= pohmelfs_unlink,
+	.rmdir		= pohmelfs_rmdir,
+	.rename		= pohmelfs_rename,
+	.symlink	= pohmelfs_symlink,
+};
+
+static int pohmelfs_dir_open(struct inode *dir, struct file *file)
+{
+	struct pohmelfs_inode *pi;
+	struct dentry *parent_dentry = file->f_path.dentry;
+	struct dentry *dentry, *tmp;
+	LIST_HEAD(kill_list);
+	int err;
+	u64 magic_version = 0x100;
+	void *magic_data = (void *)(0x1234);
+
+	spin_lock(&parent_dentry->d_lock);
+	list_for_each_entry_safe(dentry, tmp, &file->f_path.dentry->d_subdirs, d_u.d_child) {
+		pi = pohmelfs_inode(dentry->d_inode);
+
+		if (dentry->d_inode && pi->received) {
+			pi->vfs_inode.i_version = magic_version;
+			dentry->d_fsdata = magic_data;
+		}
+	}
+	spin_unlock(&parent_dentry->d_lock);
+
+	pohmelfs_warm_dir(dir);
+
+	spin_lock(&parent_dentry->d_lock);
+	list_for_each_entry_safe(dentry, tmp, &file->f_path.dentry->d_subdirs, d_u.d_child) {
+		pi = pohmelfs_inode(dentry->d_inode);
+
+		if ((dentry->d_inode && pi->received && (pi->vfs_inode.i_version == magic_version)) || (dentry->d_fsdata == magic_data)) {
+			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+			__d_drop(dentry);
+			dget_dlock(dentry);
+
+			list_move(&dentry->d_u.d_child, &kill_list);
+			spin_unlock(&dentry->d_lock);
+		}
+	}
+	spin_unlock(&parent_dentry->d_lock);
+
+	list_for_each_entry_safe(dentry, tmp, &kill_list, d_u.d_child) {
+		d_delete(dentry);
+		dput(dentry);
+	}
+
+	err = dcache_dir_open(dir, file);
+	if (err)
+		goto err_out_exit;
+
+err_out_exit:
+	return err;
+}
+
+const struct file_operations pohmelfs_dir_fops = {
+	.open		= pohmelfs_dir_open,
+	.release	= dcache_dir_close,
+	.llseek		= dcache_dir_lseek,
+
+	.read		= generic_read_dir,
+	.readdir	= dcache_readdir,
+};
diff --git a/fs/pohmelfs/file.c b/fs/pohmelfs/file.c
new file mode 100644
index 0000000..a6d94e9
--- /dev/null
+++ b/fs/pohmelfs/file.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/fs.h>
+
+#include "pohmelfs.h"
+
+static int pohmelfs_write_init(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	pohmelfs_wait_get(wait);
+	return 0;
+}
+
+static void pohmelfs_write_destroy(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	wake_up(&wait->wq);
+	pohmelfs_wait_put(wait);
+}
+
+static int pohmelfs_write_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_wait *wait = t->priv;
+	struct pohmelfs_inode *pi = pohmelfs_inode(t->inode);
+	struct dnet_cmd *cmd = &recv->cmd;
+	unsigned long long trans = cmd->trans & ~DNET_TRANS_REPLY;
+
+	pr_debug("pohmelfs: %s: write complete: %llu, flags: %x, status: %d\n",
+			pohmelfs_dump_id(pi->id.id), trans, cmd->flags, cmd->status);
+
+	if (cmd->flags & DNET_FLAGS_MORE)
+		return 0;
+
+	wait->condition = cmd->status;
+	if (!wait->condition)
+		wait->condition = 1;
+
+	return 0;
+}
+
+static int pohmelfs_send_write_metadata(struct pohmelfs_inode *pi, struct pohmelfs_io *pio, struct pohmelfs_wait *wait)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(pi->vfs_inode.i_sb);
+	struct timespec ts = CURRENT_TIME;
+	struct dnet_meta_update *mu;
+	struct dnet_meta *m;
+	int err, size;
+	void *data;
+
+	size = sizeof(struct dnet_meta) * 4 +
+			sizeof(struct dnet_meta_check_status) +
+			sizeof(struct dnet_meta_update) +
+			psb->fsid_len +
+			psb->group_num * sizeof(int);
+
+	data = kzalloc(size, GFP_NOIO);
+	if (!data) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	m = data;
+	m->type = DNET_META_GROUPS;
+	m->size = psb->group_num * sizeof(int);
+	memcpy(m->data, psb->groups, m->size);
+	dnet_convert_meta(m);
+
+	m = (struct dnet_meta *)(m->data + le32_to_cpu(m->size));
+	m->type = DNET_META_NAMESPACE;
+	m->size = psb->fsid_len;
+	memcpy(m->data, psb->fsid, psb->fsid_len);
+	dnet_convert_meta(m);
+
+	m = (struct dnet_meta *)(m->data + le32_to_cpu(m->size));
+	m->type = DNET_META_UPDATE;
+	m->size = sizeof(struct dnet_meta_update);
+	mu = (struct dnet_meta_update *)m->data;
+	mu->tm.tsec = ts.tv_sec;
+	mu->tm.tnsec = ts.tv_nsec;
+	dnet_convert_meta_update(mu);
+	dnet_convert_meta(m);
+
+	m = (struct dnet_meta *)(m->data + le32_to_cpu(m->size));
+	m->type = DNET_META_CHECK_STATUS;
+	m->size = sizeof(struct dnet_meta_check_status);
+	/* do not fill, it will be updated on server */
+	dnet_convert_meta(m);
+
+	pio->pi = pi;
+	pio->id = &pi->id;
+	pio->cmd = DNET_CMD_WRITE;
+	pio->ioflags = DNET_IO_FLAGS_OVERWRITE | DNET_IO_FLAGS_META;
+	pio->cflags = DNET_FLAGS_NEED_ACK;
+	pio->type = 1;
+	pio->cb.init = pohmelfs_write_init;
+	pio->cb.destroy = pohmelfs_write_destroy;
+	pio->cb.complete = pohmelfs_write_complete;
+	pio->priv = wait;
+	pio->data = data;
+	pio->size = size;
+
+	err = pohmelfs_send_io(pio);
+	if (err)
+		goto err_out_free;
+
+err_out_free:
+	kfree(data);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_write_command_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct dnet_cmd *cmd = &recv->cmd;
+	struct pohmelfs_write_ctl *ctl = t->wctl;
+
+	if (cmd->flags & DNET_FLAGS_MORE)
+		return 0;
+
+	if (cmd->status == 0)
+		atomic_inc(&ctl->good_writes);
+	else {
+		struct inode *inode = t->inode;
+		struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+		unsigned long long size = le64_to_cpu(t->cmd.p.io.size);
+		unsigned long long offset = le64_to_cpu(t->cmd.p.io.offset);
+
+		pr_debug("pohmelfs: %s: write failed: ino: %lu, isize: %llu, offset: %llu, size: %llu: %d\n",
+				pohmelfs_dump_id(pi->id.id), inode->i_ino, inode->i_size, offset, size, cmd->status);
+	}
+
+	return 0;
+}
+
+static int pohmelfs_write_command_init(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_write_ctl *ctl = t->wctl;
+
+	kref_get(&ctl->refcnt);
+	return 0;
+}
+
+static void pohmelfs_write_command_destroy(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_write_ctl *ctl = t->wctl;
+
+	kref_put(&ctl->refcnt, pohmelfs_write_ctl_release);
+}
+
+static int pohmelfs_write_prepare_commit(struct pohmelfs_inode *pi, struct pohmelfs_write_ctl *ctl,
+		uint64_t prepare_size, loff_t offset, size_t len)
+{
+	int err;
+	struct inode *inode = &pi->vfs_inode;
+	struct pohmelfs_io *pio;
+
+	pio = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!pio) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pio->pi = pi;
+	pio->id = &pi->id;
+	pio->cmd = DNET_CMD_WRITE;
+	pio->offset = offset;
+	pio->size = len;
+	pio->cflags = DNET_FLAGS_NEED_ACK;
+
+	/*
+	 * We always set prepare bit, since elliptics/eblob reuses existing (previously prepared/reserved) area
+	 * But it also allows to 'miss' prepare message (for example if we sent prepare bit when node was offline)
+	 */
+	pio->ioflags = DNET_IO_FLAGS_OVERWRITE | DNET_IO_FLAGS_PLAIN_WRITE | DNET_IO_FLAGS_PREPARE;
+
+	pio->num = prepare_size;
+
+	/* commit when whole inode is written */
+	if (offset + len == prepare_size) {
+		pio->ioflags |= DNET_IO_FLAGS_COMMIT;
+	}
+
+	pio->wctl = ctl;
+	pio->priv = ctl;
+	pio->cb.complete = pohmelfs_write_command_complete;
+	pio->cb.init = pohmelfs_write_command_init;
+	pio->cb.destroy = pohmelfs_write_command_destroy;
+
+	pr_debug("pohmelfs_write_prepare_commit: %s: ino: %lu, offset: %llu, len: %zu, total size: %llu\n",
+			pohmelfs_dump_id(pi->id.id), inode->i_ino, (unsigned long long)offset, len, inode->i_size);
+
+	err = pohmelfs_send_io(pio);
+	if (err)
+		goto err_out_free;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, pio);
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_write_command(struct pohmelfs_inode *pi, struct pohmelfs_write_ctl *ctl, loff_t offset, size_t len)
+{
+	return pohmelfs_write_prepare_commit(pi, ctl, i_size_read(&pi->vfs_inode), offset, len);
+}
+
+int pohmelfs_metadata_inode(struct pohmelfs_inode *pi, int sync)
+{
+	struct inode *inode = &pi->vfs_inode;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	struct pohmelfs_io *pio;
+	struct pohmelfs_wait *wait;
+	long ret;
+	int err;
+
+	wait = pohmelfs_wait_alloc(pi);
+	if (!wait) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pio = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!pio) {
+		err = -ENOMEM;
+		goto err_out_put;
+	}
+
+	err = pohmelfs_send_write_metadata(pi, pio, wait);
+	if (err)
+		goto err_out_free;
+
+	if (sync) {
+		ret = wait_event_interruptible_timeout(wait->wq,
+				wait->condition != 0 && atomic_read(&wait->refcnt.refcount) <= 2,
+				msecs_to_jiffies(psb->write_wait_timeout));
+		if (ret <= 0) {
+			err = ret;
+			if (ret == 0)
+				err = -ETIMEDOUT;
+			goto err_out_free;
+		}
+
+		if (wait->condition < 0) {
+			err = wait->condition;
+			goto err_out_free;
+		}
+	}
+
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, pio);
+err_out_put:
+	pohmelfs_wait_put(wait);
+err_out_exit:
+	return 0;
+}
+
+static long pohmelfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	struct pohmelfs_io *pio;
+	int err;
+
+	if (offset + len < i_size_read(inode)) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	pio = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!pio) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pio->pi = pi;
+	pio->id = &pi->id;
+	pio->cmd = DNET_CMD_WRITE;
+	pio->cflags = DNET_FLAGS_NEED_ACK;
+	pio->ioflags = DNET_IO_FLAGS_PREPARE;
+	pio->num = i_size_read(inode);
+
+	pr_info("pohmelfs_fallocate: %s: ino: %lu, offset: %llu, len: %llu, total size: %llu\n",
+			pohmelfs_dump_id(pi->id.id), inode->i_ino,
+			(unsigned long long)offset, (unsigned long long)len, inode->i_size);
+
+	err = pohmelfs_send_io(pio);
+	if (err)
+		goto err_out_free;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, pio);
+err_out_exit:
+	return err;
+}
+
+const struct file_operations pohmelfs_file_ops = {
+	.open		= generic_file_open,
+
+	.llseek		= generic_file_llseek,
+
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+
+	.mmap		= generic_file_mmap,
+
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
+
+	.fallocate	= pohmelfs_fallocate,
+};
+
+const struct inode_operations pohmelfs_file_inode_operations = {
+};
diff --git a/fs/pohmelfs/inode.c b/fs/pohmelfs/inode.c
new file mode 100644
index 0000000..a765ec6
--- /dev/null
+++ b/fs/pohmelfs/inode.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/cred.h>
+#include <linux/fiemap.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/pagevec.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/writeback.h>
+
+#include "pohmelfs.h"
+
+char *pohmelfs_dump_id_len_raw(const unsigned char *id, unsigned int len, char *dst)
+{
+	unsigned int i;
+
+	if (len > SHA512_DIGEST_SIZE)
+		len = SHA512_DIGEST_SIZE;
+
+	for (i=0; i<len; ++i)
+		sprintf(&dst[2*i], "%02x", id[i]);
+	return dst;
+}
+
+#define pohmelfs_dump_len 6
+typedef struct {
+	char			id_str[pohmelfs_dump_len * 2 + 1];
+} pohmelfs_dump_t;
+static DEFINE_PER_CPU(pohmelfs_dump_t, pohmelfs_dump_per_cpu);
+
+char *pohmelfs_dump_id(const unsigned char *id)
+{
+	pohmelfs_dump_t *ptr;
+
+	ptr = &get_cpu_var(pohmelfs_dump_per_cpu);
+	pohmelfs_dump_id_len_raw(id, pohmelfs_dump_len, ptr->id_str);
+	put_cpu_var(ptr);
+
+	return ptr->id_str;
+}
+
+#define dnet_raw_id_scratch 6
+typedef struct {
+	unsigned long 			rand;
+	struct timespec			ts;
+} dnet_raw_id_scratch_t;
+static DEFINE_PER_CPU(dnet_raw_id_scratch_t, dnet_raw_id_scratch_per_cpu);
+
+static int pohmelfs_gen_id(struct pohmelfs_sb *psb, struct dnet_raw_id *id)
+{
+	dnet_raw_id_scratch_t *sc;
+	int err;
+	long rand;
+
+	get_random_bytes(&rand, sizeof(sc->rand));
+
+	sc = &get_cpu_var(dnet_raw_id_scratch_per_cpu);
+	sc->rand ^= rand;
+	sc->ts = CURRENT_TIME;
+
+	err = pohmelfs_hash(psb, sc, sizeof(dnet_raw_id_scratch_t), id);
+	put_cpu_var(sc);
+
+	return err;
+}
+
+static int pohmelfs_sb_inode_insert(struct pohmelfs_sb *psb, struct pohmelfs_inode *pi)
+{
+	struct rb_node **n = &psb->inode_root.rb_node, *parent = NULL;
+	struct pohmelfs_inode *tmp;
+	int cmp, err = 0;
+
+	spin_lock(&psb->inode_lock);
+	while (*n) {
+		parent = *n;
+
+		tmp = rb_entry(parent, struct pohmelfs_inode, node);
+
+		cmp = dnet_id_cmp_str(tmp->id.id, pi->id.id);
+		if (cmp < 0)
+			n = &parent->rb_left;
+		else if (cmp > 0)
+			n = &parent->rb_right;
+		else {
+			err = -EEXIST;
+			goto err_out_unlock;
+		}
+	}
+
+	rb_link_node(&pi->node, parent, n);
+	rb_insert_color(&pi->node, &psb->inode_root);
+
+err_out_unlock:
+	spin_unlock(&psb->inode_lock);
+
+	return err;
+}
+
+struct pohmelfs_inode *pohmelfs_sb_inode_lookup(struct pohmelfs_sb *psb, struct dnet_raw_id *id)
+{
+	struct rb_node *n = psb->inode_root.rb_node;
+	struct pohmelfs_inode *pi, *found = NULL;
+	int cmp;
+
+	spin_lock(&psb->inode_lock);
+	while (n) {
+		pi = rb_entry(n, struct pohmelfs_inode, node);
+
+		cmp = dnet_id_cmp_str(pi->id.id, id->id);
+		if (cmp < 0) {
+			n = n->rb_left;
+		} else if (cmp > 0)
+			n = n->rb_right;
+		else {
+			found = pi;
+			break;
+		}
+	}
+	if (found) {
+		if (!igrab(&found->vfs_inode))
+			found = NULL;
+	}
+	spin_unlock(&psb->inode_lock);
+
+	return found;
+}
+
+struct inode *pohmelfs_alloc_inode(struct super_block *sb)
+{
+	struct pohmelfs_inode *pi;
+
+	pi = kmem_cache_zalloc(pohmelfs_inode_cache, GFP_NOIO);
+	if (!pi)
+		goto err_out_exit;
+
+	inode_init_once(&pi->vfs_inode);
+
+	pi->received = 0;
+
+	rb_init_node(&pi->node);
+
+	return &pi->vfs_inode;
+
+err_out_exit:
+	return NULL;
+}
+
+void pohmelfs_destroy_inode(struct inode *inode)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+
+	pr_debug("pohmelfs: %s: destroy: ino: %ld, dirty: %lx\n", pohmelfs_dump_id(pi->id.id), inode->i_ino, inode->i_state & I_DIRTY);
+
+	kmem_cache_free(pohmelfs_inode_cache, pi);
+}
+
+int pohmelfs_hash(struct pohmelfs_sb *psb, const void *data, const size_t size, struct dnet_raw_id *id)
+{
+	struct scatterlist sg;
+	struct hash_desc desc;
+
+	sg_init_table(&sg, 1);
+	sg_set_buf(&sg, data, size);
+
+	desc.tfm = psb->hash;
+	desc.flags = 0;
+
+	return crypto_hash_digest(&desc, &sg, size, id->id);
+}
+
+static void pohmelfs_readpages_destroy(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	wake_up(&wait->wq);
+	pohmelfs_wait_put(wait);
+}
+
+static int pohmelfs_readpages_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_wait *wait = t->priv;
+	struct dnet_cmd *cmd = &recv->cmd;
+
+	if (!(cmd->flags & DNET_FLAGS_MORE)) {
+		if (!wait->condition) {
+			wait->condition = cmd->status;
+			if (!wait->condition)
+				wait->condition = 1;
+		}
+	}
+
+	pr_debug("pohmelfs: %d:%s: pohmelfs_readpages_complete: read: %ld, wait: %d\n",
+		cmd->id.group_id, pohmelfs_dump_id(wait->pi->id.id), atomic_long_read(&wait->count), wait->condition);
+
+	return 0;
+}
+
+static int pohmelfs_readpages_init(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_wait *wait = t->priv;
+
+	pohmelfs_wait_get(wait);
+	return 0;
+}
+
+static int pohmelfs_readpages_recv_reply(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_wait *wait = t->priv;
+	struct pohmelfs_inode *pi = wait->pi;
+	struct address_space *mapping = wait->ret;
+	unsigned int asize = sizeof(struct dnet_attr) + sizeof(struct dnet_io_attr);
+	void *data = &t->cmd.attr; /* overwrite send buffer used for attr/ioattr */
+	struct dnet_cmd *cmd = &recv->cmd;
+	pgoff_t offset;
+	struct page *page;
+	int err, size;
+
+	if (t->recv_offset < asize) {
+		size = asize - t->recv_offset;
+		data += t->recv_offset;
+		err = pohmelfs_recv(t, recv, data, size);
+		if (err < 0)
+			goto err_out_exit;
+
+		dnet_convert_io_attr(&t->cmd.p.io);
+	}
+
+	while (t->recv_offset != cmd->size) {
+		offset = (t->recv_offset - asize) & (PAGE_CACHE_SIZE - 1);
+		size = PAGE_CACHE_SIZE - offset;
+
+		if (size > cmd->size - t->recv_offset)
+			size = cmd->size - t->recv_offset;
+
+		page = find_or_create_page(mapping, (t->recv_offset - asize + t->cmd.p.io.offset) >> PAGE_CACHE_SHIFT, GFP_NOIO);
+		if (!page) {
+			err = -ENOMEM;
+			goto err_out_exit;
+		}
+
+		data = kmap(page);
+		err = pohmelfs_recv(t, recv, data + offset, size);
+		kunmap(page);
+
+		if (err > 0 && ((err + offset == PAGE_CACHE_SIZE) || (t->recv_offset == cmd->size)))  {
+			SetPageUptodate(page);
+		}
+
+		unlock_page(page);
+		page_cache_release(page);
+
+		if (err < 0)
+			goto err_out_exit;
+
+		atomic_long_add(err, &wait->count);
+	}
+
+	err = 0;
+
+err_out_exit:
+	if ((err < 0) && (err != -ENOENT) && (err != -EAGAIN))
+		pr_info("pohmelfs: %d:%s: pohmelfs_readpages_recv_data: offset: %lld, data size: %llu, err: %d\n",
+			cmd->id.group_id, pohmelfs_dump_id(pi->id.id), t->recv_offset - asize + t->cmd.p.io.offset,
+			(unsigned long long)cmd->size - asize, err);
+
+	return err;
+}
+
+static int pohmelfs_readpages_group(struct address_space *mapping, int group_id, pgoff_t offset, size_t size)
+{
+	struct inode *inode = mapping->host;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	struct pohmelfs_wait *wait;
+	struct pohmelfs_io *io;
+	long ret;
+	int err;
+
+	wait = pohmelfs_wait_alloc(pi);
+	if (!wait) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	io = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!io) {
+		err = -ENOMEM;
+		goto err_out_put;
+	}
+
+	io->pi = pi;
+	io->id = &pi->id;
+	io->cmd = DNET_CMD_READ;
+	io->cflags = DNET_FLAGS_NEED_ACK;
+	io->offset = offset;
+	io->size = size;
+	if (psb->no_read_csum)
+		io->ioflags = DNET_IO_FLAGS_NOCSUM;
+	io->cb.init = pohmelfs_readpages_init;
+	io->cb.complete = pohmelfs_readpages_complete;
+	io->cb.destroy = pohmelfs_readpages_destroy;
+	io->cb.recv_reply = pohmelfs_readpages_recv_reply;
+	io->priv = wait;
+
+	/* it is safe, since we hold a reference to corresponding inode in wait->pi */
+	wait->ret = mapping;
+
+	err = pohmelfs_send_io_group(io, group_id);
+	if (err)
+		goto err_out_free;
+
+	ret = wait_event_interruptible_timeout(wait->wq, wait->condition != 0, msecs_to_jiffies(psb->read_wait_timeout));
+	if (ret <= 0) {
+		err = ret;
+		if (ret == 0)
+			err = -ETIMEDOUT;
+		goto err_out_free;
+	}
+
+	if (wait->condition < 0) {
+		err = wait->condition;
+		goto err_out_free;
+	}
+
+	err = atomic_long_read(&wait->count);
+
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, io);
+err_out_put:
+	pohmelfs_wait_put(wait);
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_readpages(struct file *filp, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	int i, err = -ENOENT;
+	pgoff_t offset = 0;
+	struct page *tmp, *page;
+
+	list_for_each_entry_safe(page, tmp, pages, lru) {
+		list_del(&page->lru);
+
+		if (page_offset(page) < offset)
+			offset = page_offset(page);
+
+		/*
+		 * we do not really care about these pages
+		 * completion callback will try to find it in mapping
+		 * and will allocate new pages if mapping is empty
+		 */
+		if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL))
+			unlock_page(page);
+		page_cache_release(page);
+	}
+
+	for (i = 0; i < psb->group_num; ++i) {
+		err = pohmelfs_readpages_group(mapping, psb->groups[i], offset, nr_pages * PAGE_CACHE_SIZE);
+		if (err < 0)
+			continue;
+
+		err = 0;
+		break;
+	}
+
+	pr_debug("pohmelfs: %s: readpages: ino: %lu, offset: %lu, pages: %u: %d\n",
+			pohmelfs_dump_id(pi->id.id), inode->i_ino, offset, nr_pages, err);
+
+	return err;
+}
+
+static int pohmelfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	int i, err = -ENOENT;
+
+	if (inode->i_size <= page->index << PAGE_CACHE_SHIFT) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	unlock_page(page);
+
+	for (i = 0; i < psb->group_num; ++i) {
+		err = pohmelfs_readpages_group(page->mapping, psb->groups[i], page_offset(page), PAGE_CACHE_SIZE);
+		if (err < 0)
+			continue;
+
+		err = 0;
+		break;
+	}
+
+	if ((err < 0) && (err != -ENOENT))
+		pr_err("pohmelfs: %s: readpage: ino: %lu, offset: %lu, uptodate: %d, err: %d\n",
+			pohmelfs_dump_id(pohmelfs_inode(inode)->id.id), inode->i_ino, (long)page_offset(page),
+			PageUptodate(page), err);
+	return err;
+}
+
+void pohmelfs_write_ctl_release(struct kref *kref)
+{
+	struct pohmelfs_write_ctl *ctl = container_of(kref, struct pohmelfs_write_ctl, refcnt);
+	int bad_write = atomic_read(&ctl->good_writes) < ctl->psb->group_num / 2 + 1;
+	struct page *page;
+	unsigned int i;
+
+	if (bad_write) {
+		struct inode *inode = ctl->pvec.pages[0]->mapping->host;
+		struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+		unsigned long long offset = page_offset(ctl->pvec.pages[0]);
+
+		pr_debug("pohmelfs: %s: bad write: ino: %lu, isize: %llu, offset: %llu: %d/%d\n",
+				pohmelfs_dump_id(pi->id.id), inode->i_ino, inode->i_size, offset,
+				atomic_read(&ctl->good_writes), ctl->psb->group_num);
+	}
+
+	for (i = 0; i < pagevec_count(&ctl->pvec); ++i) {
+		page = ctl->pvec.pages[i];
+
+		if (PageLocked(page)) {
+			end_page_writeback(page);
+
+			if (bad_write) {
+				SetPageError(page);
+				set_page_dirty(page);
+			}
+			unlock_page(page);
+		}
+	}
+
+	pagevec_release(&ctl->pvec);
+	kmem_cache_free(pohmelfs_write_cache, ctl);
+}
+
+static int pohmelfs_writepages_chunk(struct pohmelfs_inode *pi, struct pohmelfs_write_ctl *ctl, struct writeback_control *wbc)
+{
+	struct inode *inode = &pi->vfs_inode;
+	uint64_t offset, size;
+	unsigned i;
+	int err;
+
+	offset = page_offset(ctl->pvec.pages[0]);
+
+	size = 0;
+	/* we will lookup them again when doing actual send */
+	for (i = 0; i< pagevec_count(&ctl->pvec); ++i) {
+		struct page *page = ctl->pvec.pages[i];
+
+		lock_page(page);
+		/* just write all pages even if they were truncated - this is handled by inode info metadata */
+#if 0
+		if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+			unlock_page(page);
+			continue;
+		}
+
+		if (!PageDirty(page))
+			goto continue_unlock;
+
+		if (!clear_page_dirty_for_io(page))
+			goto continue_unlock;
+#else
+		clear_page_dirty_for_io(page);
+#endif
+
+		set_page_writeback(page);
+
+		size += PAGE_CACHE_SIZE;
+		wbc->nr_to_write--;
+	}
+
+	if (offset + size > inode->i_size)
+		size = inode->i_size - offset;
+
+	err = pohmelfs_write_command(pi, ctl, offset, size);
+	if (err)
+		goto err_out_exit;
+
+err_out_exit:
+	kref_put(&ctl->refcnt, pohmelfs_write_ctl_release);
+	return err;
+}
+
+static int pohmelfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	struct pohmelfs_write_ctl *ctl;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int nr_pages, err = 0;
+
+	index = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	pr_debug("pohmelfs: %s: writepages: ino: %ld, nr: %ld, index: %llu, end: %llu, total_size: %lu, sync: %d\n",
+			pohmelfs_dump_id(pohmelfs_inode(inode)->id.id), inode->i_ino,
+			wbc->nr_to_write, wbc->range_start, wbc->range_end, (unsigned long)inode->i_size, wbc->sync_mode);
+
+	if ((!wbc->range_start && !wbc->range_end) || !inode->i_size) {
+		err = 0;
+		goto err_out_exit;
+	}
+
+	while (index <= end) {
+		ctl = kmem_cache_zalloc(pohmelfs_write_cache, GFP_NOIO);
+		if (!ctl) {
+			err = -ENOMEM;
+			goto err_out_exit;
+		}
+
+		kref_init(&ctl->refcnt);
+		atomic_set(&ctl->good_writes, 0);
+		ctl->psb = pohmelfs_sb(inode->i_sb);
+
+		nr_pages = pagevec_lookup_tag(&ctl->pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (!nr_pages) {
+			err = 0;
+			kmem_cache_free(pohmelfs_write_cache, ctl);
+			break;
+		}
+
+		err = pohmelfs_writepages_chunk(pi, ctl, wbc);
+		if (err)
+			goto err_out_exit;
+	}
+
+	err = pohmelfs_metadata_inode(pi, wbc->sync_mode != WB_SYNC_NONE);
+	if (err)
+		goto err_out_exit;
+
+err_out_exit:
+	return err;
+}
+
+static const struct address_space_operations pohmelfs_aops = {
+	.write_begin		= simple_write_begin,
+	.write_end		= simple_write_end,
+	.writepages		= pohmelfs_writepages,
+	.readpage		= pohmelfs_readpage,
+	.readpages		= pohmelfs_readpages,
+	.set_page_dirty 	= __set_page_dirty_nobuffers,
+};
+
+void pohmelfs_convert_inode_info(struct pohmelfs_inode_info *info)
+{
+	info->ino = cpu_to_le64(info->ino);
+	info->mode = cpu_to_le64(info->mode);
+	info->nlink = cpu_to_le64(info->nlink);
+	info->uid = cpu_to_le32(info->uid);
+	info->gid = cpu_to_le32(info->gid);
+	info->namelen = cpu_to_le32(info->namelen);
+	info->blocks = cpu_to_le64(info->blocks);
+	info->rdev = cpu_to_le64(info->rdev);
+	info->size = cpu_to_le64(info->size);
+	info->version = cpu_to_le64(info->version);
+	info->blocksize = cpu_to_le64(info->blocksize);
+	info->flags = cpu_to_le64(info->flags);
+
+	dnet_convert_time(&info->ctime);
+	dnet_convert_time(&info->mtime);
+	dnet_convert_time(&info->atime);
+}
+
+void pohmelfs_fill_inode_info(struct inode *inode, struct pohmelfs_inode_info *info)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+
+	memcpy(info->id.id, pi->id.id, DNET_ID_SIZE);
+
+	info->ino = inode->i_ino;
+	info->mode = inode->i_mode;
+	info->nlink = inode->i_nlink;
+	info->uid = inode->i_uid;
+	info->gid = inode->i_gid;
+	info->blocks = inode->i_blocks;
+	info->rdev = inode->i_rdev;
+	info->size = inode->i_size;
+	info->version = inode->i_version;
+	info->blocksize = 1 << inode->i_blkbits;
+
+	info->ctime.tsec = inode->i_ctime.tv_sec;
+	info->ctime.tnsec = inode->i_ctime.tv_nsec;
+
+	info->mtime.tsec = inode->i_mtime.tv_sec;
+	info->mtime.tnsec = inode->i_mtime.tv_nsec;
+
+	info->atime.tsec = inode->i_atime.tv_sec;
+	info->atime.tnsec = inode->i_atime.tv_nsec;
+
+	info->flags = 0;
+}
+
+void pohmelfs_fill_inode(struct inode *inode, struct pohmelfs_inode_info *info)
+{
+	pr_debug("pohmelfs: %s: ino: %lu inode is regular: %d, dir: %d, link: %d, mode: %o, "
+			"namelen: %u, size: %llu, state: %lx, mtime: %llu.%llu/%lu.%lu\n",
+			pohmelfs_dump_id(info->id.id), inode->i_ino,
+			S_ISREG(inode->i_mode), S_ISDIR(inode->i_mode),
+			S_ISLNK(inode->i_mode), inode->i_mode, info->namelen, inode->i_size, inode->i_state,
+			(unsigned long long)info->mtime.tsec, (unsigned long long)info->mtime.tnsec,
+			inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec);
+
+	if (info->mtime.tsec < inode->i_mtime.tv_sec)
+		return;
+	if ((info->mtime.tsec == inode->i_mtime.tv_sec) &&
+			(info->mtime.tnsec < inode->i_mtime.tv_nsec))
+		return;
+
+	pohmelfs_inode(inode)->id = info->id;
+
+	inode->i_mode = info->mode;
+	inode->i_nlink = info->nlink;
+	inode->i_uid = info->uid;
+	inode->i_gid = info->gid;
+	inode->i_blocks = info->blocks;
+	inode->i_rdev = info->rdev;
+	inode->i_size = info->size;
+	inode->i_version = info->version;
+	inode->i_blkbits = ffs(info->blocksize);
+
+	inode->i_mtime = pohmelfs_date(&info->mtime);
+	inode->i_atime = pohmelfs_date(&info->atime);
+	inode->i_ctime = pohmelfs_date(&info->ctime);
+}
+
+static void pohmelfs_inode_info_current(struct pohmelfs_sb *psb, struct pohmelfs_inode_info *info)
+{
+	struct timespec ts = CURRENT_TIME;
+	struct dnet_time dtime;
+
+	info->nlink = S_ISDIR(info->mode) ? 2 : 1;
+	info->uid = current_fsuid();
+	info->gid = current_fsgid();
+	info->size = 0;
+	info->blocksize = PAGE_SIZE;
+	info->blocks = 0;
+	info->rdev = 0;
+	info->version = 0;
+
+	dtime.tsec = ts.tv_sec;
+	dtime.tnsec = ts.tv_nsec;
+
+	info->ctime = dtime;
+	info->mtime = dtime;
+	info->atime = dtime;
+
+	pohmelfs_gen_id(psb, &info->id);
+}
+
+struct pohmelfs_inode *pohmelfs_existing_inode(struct pohmelfs_sb *psb, struct pohmelfs_inode_info *info)
+{
+	struct pohmelfs_inode *pi;
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(psb->sb, atomic_long_inc_return(&psb->ino));
+	if (!inode) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pi = pohmelfs_inode(inode);
+
+	if (inode->i_state & I_NEW) {
+		pohmelfs_fill_inode(inode, info);
+		/*
+		 * i_mapping is a pointer to i_data during inode initialization.
+		 */
+		inode->i_data.a_ops = &pohmelfs_aops;
+
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_fop = &pohmelfs_file_ops;
+			inode->i_op = &pohmelfs_file_inode_operations;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_fop = &pohmelfs_dir_fops;
+			inode->i_op = &pohmelfs_dir_inode_operations;
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &pohmelfs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &pohmelfs_aops;
+		} else {
+			inode->i_fop = &generic_ro_fops;
+		}
+
+		err = pohmelfs_sb_inode_insert(psb, pi);
+		if (err)
+			goto err_out_put;
+
+		unlock_new_inode(inode);
+	}
+
+	return pi;
+
+err_out_put:
+	unlock_new_inode(inode);
+	iput(inode);
+err_out_exit:
+	return ERR_PTR(err);
+}
+
+struct pohmelfs_inode *pohmelfs_new_inode(struct pohmelfs_sb *psb, int mode)
+{
+	struct pohmelfs_inode *pi;
+	struct pohmelfs_inode_info *info;
+	int err;
+
+	info = kmem_cache_zalloc(pohmelfs_inode_info_cache, GFP_NOIO);
+	if (!info) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	info->mode = mode;
+
+	pohmelfs_inode_info_current(psb, info);
+
+	pi = pohmelfs_existing_inode(psb, info);
+	if (IS_ERR(pi)) {
+		err = PTR_ERR(pi);
+		goto err_out_free;
+	}
+
+	kmem_cache_free(pohmelfs_inode_info_cache, info);
+	return pi;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_inode_info_cache, info);
+err_out_exit:
+	return ERR_PTR(err);
+}
+
+struct pohmelfs_wait *pohmelfs_wait_alloc(struct pohmelfs_inode *pi)
+{
+	struct pohmelfs_wait *wait;
+
+	wait = kmem_cache_zalloc(pohmelfs_wait_cache, GFP_NOIO);
+	if (!wait) {
+		goto err_out_exit;
+	}
+
+	if (!igrab(&pi->vfs_inode))
+		goto err_out_free;
+
+	wait->pi = pi;
+
+	atomic_long_set(&wait->count, 0);
+	init_waitqueue_head(&wait->wq);
+	kref_init(&wait->refcnt);
+
+	return wait;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_wait_cache, wait);
+err_out_exit:
+	return NULL;
+}
+
+static void pohmelfs_wait_free(struct kref *kref)
+{
+	struct pohmelfs_wait *wait = container_of(kref, struct pohmelfs_wait, refcnt);
+	struct inode *inode = &wait->pi->vfs_inode;
+
+	iput(inode);
+	kmem_cache_free(pohmelfs_wait_cache, wait);
+}
+
+void pohmelfs_wait_put(struct pohmelfs_wait *wait)
+{
+	kref_put(&wait->refcnt, pohmelfs_wait_free);
+}
diff --git a/fs/pohmelfs/net.c b/fs/pohmelfs/net.c
new file mode 100644
index 0000000..c0ccc10
--- /dev/null
+++ b/fs/pohmelfs/net.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/net.h>
+
+#include <net/sock.h>
+
+#include "pohmelfs.h"
+
+void *pohmelfs_scratch_buf;
+int pohmelfs_scratch_buf_size = 4096;
+
+void pohmelfs_print_addr(struct sockaddr_storage *addr, const char *fmt, ...)
+{
+	struct sockaddr *sa = (struct sockaddr *)addr;
+	va_list args;
+	char *ptr;
+
+	va_start(args, fmt);
+	ptr = kvasprintf(GFP_NOIO, fmt, args);
+	if (!ptr)
+		goto err_out_exit;
+
+	if (sa->sa_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+		pr_info("pohmelfs: %pI4:%d: %s", &sin->sin_addr.s_addr, ntohs(sin->sin_port), ptr);
+	} else if (sa->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin = (struct sockaddr_in6 *)addr;
+		pr_info("pohmelfs: %pI6:%d: %s", &sin->sin6_addr, ntohs(sin->sin6_port), ptr);
+	}
+
+err_out_exit:
+	va_end(args);
+}
+
+/*
+ * Basic network sending/receiving functions.
+ * Blocked mode is used.
+ */
+int pohmelfs_data_recv(struct pohmelfs_state *st, void *buf, u64 size, unsigned int flags)
+{
+	struct msghdr msg;
+	struct kvec iov;
+	int err;
+
+	BUG_ON(!size);
+
+	iov.iov_base = buf;
+	iov.iov_len = size;
+
+	msg.msg_iov = (struct iovec *)&iov;
+	msg.msg_iovlen = 1;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = flags;
+
+	err = kernel_recvmsg(st->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
+	if (err <= 0) {
+		if (err == 0)
+			err = -ECONNRESET;
+		goto err_out_exit;
+	}
+
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_recv(struct pohmelfs_trans *t, struct pohmelfs_state *recv, void *data, int size)
+{
+	int err;
+
+	err = pohmelfs_data_recv(recv, data, size, MSG_DONTWAIT);
+	if (err < 0)
+		return err;
+
+	t->recv_offset += err;
+	return err;
+}
+
+static int pohmelfs_data_send(struct pohmelfs_trans *t)
+{
+	struct msghdr msg;
+	struct iovec io[2];
+	int err, ionum = 1;
+
+	io[0].iov_base = &t->cmd;
+	io[0].iov_len = t->header_size;
+
+	if (t->data) {
+		io[1].iov_base = t->data;
+		io[1].iov_len = t->data_size;
+		ionum = 2;
+	}
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_WAITALL;
+
+	msg.msg_iov = io;
+	msg.msg_iovlen = ionum;
+
+	err = kernel_sendmsg(t->st->sock, &msg, (struct kvec *)msg.msg_iov, ionum, t->data_size + t->header_size);
+	if (err <= 0) {
+		if (err == 0)
+			err = -ECONNRESET;
+		goto err_out_exit;
+	}
+
+	err = 0;
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_page_send(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_write_ctl *ctl = t->wctl;
+	size_t size = le64_to_cpu(t->cmd.p.io.size);
+	pgoff_t offset = le64_to_cpu(t->cmd.p.io.offset);
+	struct msghdr msg;
+	struct iovec io;
+	unsigned i;
+	int err;
+
+	io.iov_base = &t->cmd;
+	io.iov_len = t->header_size;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_WAITALL;
+
+	msg.msg_iov = &io;
+	msg.msg_iovlen = 1;
+
+	err = kernel_sendmsg(t->st->sock, &msg, (struct kvec *)msg.msg_iov, 1, t->header_size);
+	if (err <= 0) {
+		if (err == 0)
+			err = -ECONNRESET;
+		goto err_out_exit;
+	}
+
+	for (i = 0; i< pagevec_count(&ctl->pvec); ++i) {
+		struct page *page = ctl->pvec.pages[i];
+		pgoff_t off = offset & (PAGE_CACHE_SIZE - 1);
+		size_t sz = PAGE_CACHE_SIZE - off;
+
+		if (sz > size)
+			sz = size;
+
+		err = kernel_sendpage(t->st->sock, page, off, sz, msg.msg_flags);
+		if (err <= 0) {
+			if (err == 0)
+				err = -ECONNRESET;
+
+			goto err_out_reset;
+		}
+
+		size -= err;
+		offset += err;
+
+	}
+
+	return 0;
+
+err_out_reset:
+err_out_exit:
+	return err;
+}
+
+/*
+ * Polling machinery.
+ */
+
+struct pohmelfs_poll_helper {
+	poll_table 		pt;
+	struct pohmelfs_state	*st;
+};
+
+static int pohmelfs_queue_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct pohmelfs_state *st = container_of(wait, struct pohmelfs_state, wait);
+
+	queue_work(st->psb->wq, &st->recv_work);
+	return 1;
+}
+
+static void pohmelfs_queue_func(struct file *file, wait_queue_head_t *whead, poll_table *pt)
+{
+	struct pohmelfs_state *st = container_of(pt, struct pohmelfs_poll_helper, pt)->st;
+
+	st->whead = whead;
+
+	init_waitqueue_func_entry(&st->wait, pohmelfs_queue_wake);
+	add_wait_queue(whead, &st->wait);
+}
+
+static void pohmelfs_poll_exit(struct pohmelfs_state *st)
+{
+	if (st->whead) {
+		remove_wait_queue(st->whead, &st->wait);
+		st->whead = NULL;
+	}
+}
+
+static int pohmelfs_poll_init(struct pohmelfs_state *st)
+{
+	struct pohmelfs_poll_helper ph;
+
+	ph.st = st;
+	init_poll_funcptr(&ph.pt, &pohmelfs_queue_func);
+
+	st->sock->ops->poll(NULL, st->sock, &ph.pt);
+	return 0;
+}
+
+static void pohmelfs_state_send_work(struct work_struct *work)
+{
+	struct pohmelfs_state *st = container_of(work, struct pohmelfs_state, send_work);
+	struct pohmelfs_trans *t;
+	int err;
+
+	while (1) {
+		t = NULL;
+
+		mutex_lock(&st->trans_lock);
+		if (!list_empty(&st->trans_list)) {
+			t = list_first_entry(&st->trans_list, struct pohmelfs_trans, trans_entry);
+			list_move(&t->trans_entry, &st->sent_trans_list);
+		}
+		mutex_unlock(&st->trans_lock);
+
+		if (!t)
+			break;
+
+		if (t->wctl)
+			err = pohmelfs_page_send(t);
+		else
+			err = pohmelfs_data_send(t);
+
+		if (err) {
+			pohmelfs_print_addr(&st->sa, "send error: %d\n", err);
+
+			pohmelfs_state_add_reconnect(st);
+			break;
+		}
+	}
+}
+
+static void pohmelfs_suck_scratch(struct pohmelfs_state *st)
+{
+	struct dnet_cmd *cmd = &st->cmd;
+	int err = 0;
+
+	pr_debug("pohmelfs_suck_scratch: %llu\n", (unsigned long long)cmd->size);
+
+	while (cmd->size) {
+		int sz = pohmelfs_scratch_buf_size;
+
+		if (cmd->size < sz)
+			sz = cmd->size;
+
+		err = pohmelfs_data_recv(st, pohmelfs_scratch_buf, sz, MSG_WAITALL);
+		if (err < 0) {
+			pohmelfs_print_addr(&st->sa, "recv-scratch err: %d\n", err);
+			goto err_out_exit;
+		}
+
+		cmd->size -= err;
+	}
+
+err_out_exit:
+	st->cmd_read = 1;
+}
+
+static void pohmelfs_state_recv_work(struct work_struct *work)
+{
+	struct pohmelfs_state *st = container_of(work, struct pohmelfs_state, recv_work);
+	struct dnet_cmd *cmd = &st->cmd;
+	struct pohmelfs_trans *t;
+	unsigned long long trans;
+	unsigned int revents;
+	int err = 0;
+
+	while (1) {
+		revents = st->sock->ops->poll(NULL, st->sock, NULL);
+		if (!(revents & POLLIN))
+			break;
+
+		if (st->cmd_read) {
+			err = pohmelfs_data_recv(st, cmd, sizeof(struct dnet_cmd), MSG_WAITALL);
+			if (err < 0) {
+				pohmelfs_print_addr(&st->sa, "recv error: %d\n", err);
+				goto err_out_exit;
+			}
+
+			dnet_convert_cmd(cmd);
+
+			trans = cmd->trans & ~DNET_TRANS_REPLY;
+			st->cmd_read = 0;
+		}
+
+		t = pohmelfs_trans_lookup(st, cmd);
+		if (!t) {
+			pohmelfs_suck_scratch(st);
+
+			err = 0;
+			goto err_out_continue;
+		}
+		if (cmd->size && (t->recv_offset != cmd->size)) {
+			err = t->cb.recv_reply(t, st);
+			if (err && (err != -EAGAIN)) {
+				pohmelfs_print_addr(&st->sa, "recv-reply error: %d\n", err);
+				goto err_out_remove;
+			}
+
+			if (t->recv_offset != cmd->size)
+				goto err_out_continue_put;
+		}
+
+		err = t->cb.complete(t, st);
+		if (err) {
+			pohmelfs_print_addr(&st->sa, "recv-complete err: %d\n", err);
+		}
+
+		kfree(t->recv_data);
+		t->recv_data = NULL;
+		t->recv_offset = 0;
+
+err_out_remove:
+		/* only remove and free transaction if there is error or there will be no more replies */
+		if (!(cmd->flags & DNET_FLAGS_MORE) || err) {
+			mutex_lock(&st->trans_lock);
+			list_del(&t->trans_entry);
+			mutex_unlock(&st->trans_lock);
+
+			/*
+			 * refcnt was grabbed twice:
+			 *   in pohmelfs_trans_lookup()
+			 *   and at transaction creation
+			 */
+			pohmelfs_trans_put(t);
+		}
+		st->cmd_read = 1;
+		if (err) {
+			cmd->size -= t->recv_offset;
+			t->recv_offset = 0;
+		}
+err_out_continue_put:
+		pohmelfs_trans_put(t);
+
+err_out_continue:
+		if (err && (err != -EAGAIN)) {
+			//pohmelfs_suck_scratch(st);
+			goto err_out_exit;
+		}
+
+		continue;
+	}
+
+err_out_exit:
+	if (err && err != -EAGAIN)
+		pohmelfs_state_add_reconnect(st);
+	return;
+}
+
+struct pohmelfs_state *pohmelfs_addr_exist(struct pohmelfs_sb *psb, struct sockaddr_storage *sa, int addrlen)
+{
+	struct pohmelfs_state *st;
+
+	list_for_each_entry(st, &psb->state_list, state_entry) {
+		if (st->addrlen != addrlen)
+			continue;
+
+		if (!memcmp(&st->sa, sa, addrlen)) {
+			return st;
+		}
+	}
+
+	return 0;
+}
+
+struct pohmelfs_state *pohmelfs_state_create(struct pohmelfs_sb *psb, struct sockaddr_storage *sa, int addrlen,
+		int ask_route, int group_id)
+{
+	int err = 0;
+	struct pohmelfs_state *st;
+	struct sockaddr *addr = (struct sockaddr *)sa;
+
+	/* early check - this state can be inserted into route table, no need to create state and check again */
+	spin_lock(&psb->state_lock);
+	if (pohmelfs_addr_exist(psb, sa, addrlen))
+		err = -EEXIST;
+	spin_unlock(&psb->state_lock);
+
+	if (err)
+		goto err_out_exit;
+
+	st = kzalloc(sizeof(struct pohmelfs_state), GFP_KERNEL);
+	if (!st) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	st->psb = psb;
+	mutex_init(&st->trans_lock);
+	INIT_LIST_HEAD(&st->trans_list);
+	INIT_LIST_HEAD(&st->sent_trans_list);
+
+	st->group_id = group_id;
+
+	kref_init(&st->refcnt);
+
+	INIT_WORK(&st->send_work, pohmelfs_state_send_work);
+	INIT_WORK(&st->recv_work, pohmelfs_state_recv_work);
+
+	st->cmd_read = 1;
+
+	err = sock_create(addr->sa_family, SOCK_STREAM, IPPROTO_TCP, &st->sock);
+	if (err) {
+		pohmelfs_print_addr(sa, "sock_create: failed family: %d, err: %d\n", addr->sa_family, err);
+		goto err_out_free;
+	}
+
+	st->sock->sk->sk_allocation = GFP_NOIO;
+	st->sock->sk->sk_sndtimeo = st->sock->sk->sk_rcvtimeo = msecs_to_jiffies(60000);
+
+	err = kernel_connect(st->sock, (struct sockaddr *)addr, addrlen, 0);
+	if (err) {
+		pohmelfs_print_addr(sa, "kernel_connect: failed family: %d, err: %d\n", addr->sa_family, err);
+		goto err_out_release;
+	}
+	st->sock->sk->sk_sndtimeo = st->sock->sk->sk_rcvtimeo = msecs_to_jiffies(60000);
+
+	memcpy(&st->sa, sa, sizeof(struct sockaddr_storage));
+	st->addrlen = addrlen;
+
+	pohmelfs_print_addr(sa, "connected\n");
+
+	err = pohmelfs_poll_init(st);
+	if (err)
+		goto err_out_shutdown;
+
+
+	spin_lock(&psb->state_lock);
+	err = -EEXIST;
+	if (!pohmelfs_addr_exist(psb, sa, addrlen)) {
+		list_add_tail(&st->state_entry, &psb->state_list);
+		err = 0;
+	}
+	spin_unlock(&psb->state_lock);
+
+	if (err)
+		goto err_out_poll_exit;
+
+	if (ask_route) {
+		err = pohmelfs_route_request(st);
+		if (err)
+			goto err_out_poll_exit;
+	}
+
+	return st;
+
+err_out_poll_exit:
+	pohmelfs_poll_exit(st);
+err_out_shutdown:
+	st->sock->ops->shutdown(st->sock, 2);
+err_out_release:
+	sock_release(st->sock);
+err_out_free:
+	kfree(st);
+err_out_exit:
+	if (err != -EEXIST) {
+		pohmelfs_print_addr(sa, "state creation failed: %d\n", err);
+	}
+	return ERR_PTR(err);
+}
+
+static void pohmelfs_state_exit(struct pohmelfs_state *st)
+{
+	if (!st->sock)
+		return;
+
+	pohmelfs_poll_exit(st);
+	st->sock->ops->shutdown(st->sock, 2);
+
+	pohmelfs_print_addr(&st->sa, "disconnected\n");
+	sock_release(st->sock);
+}
+
+static void pohmelfs_state_release(struct kref *kref)
+{
+	struct pohmelfs_state *st = container_of(kref, struct pohmelfs_state, refcnt);
+	pohmelfs_state_exit(st);
+}
+
+void pohmelfs_state_put(struct pohmelfs_state *st)
+{
+	kref_put(&st->refcnt, pohmelfs_state_release);
+}
+
+static void pohmelfs_state_clean(struct pohmelfs_state *st)
+{
+	struct pohmelfs_trans *t, *tmp;
+
+	pohmelfs_route_remove_all(st);
+
+	mutex_lock(&st->trans_lock);
+	list_for_each_entry_safe(t, tmp, &st->trans_list, trans_entry) {
+		list_del(&t->trans_entry);
+		pohmelfs_trans_put(t);
+	}
+
+	list_for_each_entry_safe(t, tmp, &st->sent_trans_list, trans_entry) {
+		list_del(&t->trans_entry);
+		pohmelfs_trans_put(t);
+	}
+	mutex_unlock(&st->trans_lock);
+
+	cancel_work_sync(&st->send_work);
+	cancel_work_sync(&st->recv_work);
+}
+
+void pohmelfs_state_kill(struct pohmelfs_state *st)
+{
+	BUG_ON(!list_empty(&st->state_entry));
+
+	pohmelfs_state_clean(st);
+	pohmelfs_state_put(st);
+}
+
+void pohmelfs_state_schedule(struct pohmelfs_state *st)
+{
+	struct pohmelfs_sb *psb = st->psb;
+
+	queue_work(psb->wq, &st->send_work);
+}
+
+int pohmelfs_state_add_reconnect(struct pohmelfs_state *st)
+{
+	struct pohmelfs_sb *psb = st->psb;
+	struct pohmelfs_reconnect *r, *tmp;
+	int err = 0;
+
+	pohmelfs_route_remove_all(st);
+
+	/*
+	 * Remove state from route table
+	 */
+	spin_lock(&psb->state_lock);
+	list_move(&st->state_entry, &psb->kill_state_list);
+	spin_unlock(&psb->state_lock);
+
+	r = kzalloc(sizeof(struct pohmelfs_reconnect), GFP_NOIO);
+	if (!r) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&r->sa, &st->sa, sizeof(struct sockaddr_storage));
+	r->addrlen = st->addrlen;
+	r->group_id = st->group_id;
+
+	mutex_lock(&psb->reconnect_lock);
+	list_for_each_entry(tmp, &psb->reconnect_list, reconnect_entry) {
+		if (tmp->addrlen != r->addrlen)
+			continue;
+
+		if (memcmp(&tmp->sa, &r->sa, r->addrlen))
+			continue;
+
+		err = -EEXIST;
+		break;
+	}
+
+	if (!err) {
+		list_add_tail(&r->reconnect_entry, &psb->reconnect_list);
+	}
+	mutex_unlock(&psb->reconnect_lock);
+
+	if (err)
+		goto err_out_free;
+
+	/* we do not really care if this work will not be processed immediately */
+	queue_delayed_work(psb->wq, &psb->reconnect_work, 0);
+
+	pohmelfs_print_addr(&st->sa, "reconnection added\n");
+	err = 0;
+	goto err_out_exit;
+
+err_out_free:
+	kfree(r);
+err_out_exit:
+	return err;
+}
diff --git a/fs/pohmelfs/packet.h b/fs/pohmelfs/packet.h
new file mode 100644
index 0000000..f432987
--- /dev/null
+++ b/fs/pohmelfs/packet.h
@@ -0,0 +1,752 @@
+/*
+ * 2008+ Copyright (c) Evgeniy Polyakov <zbr@...emap.net>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __DNET_PACKET_H
+#define __DNET_PACKET_H
+
+#ifndef __KERNEL__
+#include <sys/time.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+
+#include <string.h>
+#include <stdint.h>
+
+#include <elliptics/typedefs.h>
+#include <elliptics/core.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum dnet_commands {
+	DNET_CMD_LOOKUP = 1,			/* Lookup address by ID and per-object info: size, permissions and so on*/
+	DNET_CMD_REVERSE_LOOKUP,		/* Lookup ID by address */
+	DNET_CMD_JOIN,				/* Join the network - force remote nodes to update
+						 * their route tables to include given node with given
+						 * address
+						 */
+	DNET_CMD_WRITE,
+	DNET_CMD_READ,				/* IO commands. They have to follow by the
+						 * IO attribute which will have offset and size
+						 * parameters.
+						 */
+	DNET_CMD_LIST,				/* List all objects for given node ID */
+	DNET_CMD_EXEC,				/* Execute given command on the remote node */
+	DNET_CMD_ROUTE_LIST,			/* Receive route table from given node */
+	DNET_CMD_STAT,				/* Gather remote VM, LA and FS statistics */
+	DNET_CMD_NOTIFY,			/* Notify when object in question was modified */
+	DNET_CMD_DEL,				/* Remove given object from the storage */
+	DNET_CMD_STAT_COUNT,			/* Gather remote per-cmd statistics */
+	DNET_CMD_STATUS,			/* Change elliptics node status */
+	DNET_CMD_READ_RANGE,			/* Read range of objects */
+	DNET_CMD_DEL_RANGE,			/* Remove range of objects */
+	DNET_CMD_AUTH,				/* Authentification cookie check */
+	DNET_CMD_BULK_READ,			/* Read a number of ids at one time */
+
+	DNET_CMD_UNKNOWN,			/* This slot is allocated for statistics gathered for unknown commands */
+	__DNET_CMD_MAX,
+};
+
+enum dnet_counters {
+	DNET_CNTR_LA1 = __DNET_CMD_MAX*2,	/* Load average for 1 min */
+	DNET_CNTR_LA5,				/* Load average for 5 min */
+	DNET_CNTR_LA15,				/* Load average for 15 min */
+	DNET_CNTR_BSIZE,			/* Block size */
+	DNET_CNTR_FRSIZE,			/* Fragment size */
+	DNET_CNTR_BLOCKS,			/* Filesystem size in frsize units */
+	DNET_CNTR_BFREE,			/* # free blocks */
+	DNET_CNTR_BAVAIL,			/* # free blocks for non-root */
+	DNET_CNTR_FILES,			/* # inodes */
+	DNET_CNTR_FFREE,			/* # free inodes */
+	DNET_CNTR_FAVAIL,			/* # free inodes for non-root */
+	DNET_CNTR_FSID,				/* File system ID */
+	DNET_CNTR_VM_ACTIVE,			/* Active memory */
+	DNET_CNTR_VM_INACTIVE,			/* Inactive memory */
+	DNET_CNTR_VM_TOTAL,			/* Total memory */
+	DNET_CNTR_VM_FREE,			/* Free memory */
+	DNET_CNTR_VM_CACHED,			/* Used for cache */
+	DNET_CNTR_VM_BUFFERS,			/* Used for buffers */
+	DNET_CNTR_NODE_FILES,			/* # files in meta */
+	DNET_CNTR_NODE_LAST_MERGE,		/* Result of the last merge */
+	DNET_CNTR_NODE_CHECK_COPY,		/* Result of the last check copies */
+	DNET_CNTR_DBR_NOREC,			/* Kyoto Cabinet DB read error KCENOREC */
+	DNET_CNTR_DBR_SYSTEM,			/* Kyoto Cabinet DB read error KCESYSTEM */
+	DNET_CNTR_DBR_ERROR,			/* Kyoto Cabinet DB read error */
+	DNET_CNTR_DBW_SYSTEM,			/* Kyoto Cabinet DB write error KCESYSTEM */
+	DNET_CNTR_DBW_ERROR,			/* Kyoto Cabinet DB write error */
+	DNET_CNTR_UNKNOWN,			/* This slot is allocated for statistics gathered for unknown counters */
+	__DNET_CNTR_MAX,
+};
+
+/*
+ * Transaction ID direction bit.
+ * When set, data is a reply for the given transaction.
+ */
+#define DNET_TRANS_REPLY		0x8000000000000000ULL
+
+/*
+ * Command flags.
+ */
+
+/*
+ * When set, node will generate a reply when transaction
+ * is completed and put completion status into cmd.status
+ * field.
+ */
+#define DNET_FLAGS_NEED_ACK		(1<<0)
+
+/* There will be more commands with the same parameters (transaction number and id) */
+#define DNET_FLAGS_MORE			(1<<1)
+
+/* Transaction is about to be destroyed */
+#define DNET_FLAGS_DESTROY		(1<<2)
+
+/* Do not forward requst to antoher node even if given ID does not belong to our range */
+#define DNET_FLAGS_DIRECT		(1<<3)
+
+/* Do not locks operations - must be set for script callers or recursive operations */
+#define DNET_FLAGS_NOLOCK		(1<<4)
+
+struct dnet_id {
+	uint8_t			id[DNET_ID_SIZE];
+	uint32_t		group_id;
+	int			type;
+} __attribute__ ((packed));
+
+struct dnet_raw_id {
+	uint8_t			id[DNET_ID_SIZE];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_raw_id(struct dnet_raw_id *id __attribute__ ((unused)))
+{
+}
+
+static inline void dnet_setup_id(struct dnet_id *id, unsigned int group_id, unsigned char *raw)
+{
+	memcpy(id->id, raw, DNET_ID_SIZE);
+	id->group_id = group_id;
+}
+
+struct dnet_cmd
+{
+	struct dnet_id		id;
+	uint32_t		flags;
+	int			status;
+	uint64_t		trans;
+	uint64_t		size;
+	uint8_t			data[0];
+} __attribute__ ((packed));
+
+/* kernel (pohmelfs) provides own defines for byteorder changes */
+#ifndef __KERNEL__
+#ifdef WORDS_BIGENDIAN
+
+#define dnet_bswap16(x)		((((x) >> 8) & 0xff) | (((x) & 0xff) << 8))
+
+#define dnet_bswap32(x) \
+     ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) |		      \
+      (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
+
+#define dnet_bswap64(x) \
+     ((((x) & 0xff00000000000000ull) >> 56)				      \
+      | (((x) & 0x00ff000000000000ull) >> 40)				      \
+      | (((x) & 0x0000ff0000000000ull) >> 24)				      \
+      | (((x) & 0x000000ff00000000ull) >> 8)				      \
+      | (((x) & 0x00000000ff000000ull) << 8)				      \
+      | (((x) & 0x0000000000ff0000ull) << 24)				      \
+      | (((x) & 0x000000000000ff00ull) << 40)				      \
+      | (((x) & 0x00000000000000ffull) << 56))
+#else
+#define dnet_bswap16(x) (x)
+#define dnet_bswap32(x) (x)
+#define dnet_bswap64(x) (x)
+#endif
+#endif
+
+static inline void dnet_convert_id(struct dnet_id *id)
+{
+	id->group_id = dnet_bswap32(id->group_id);
+	id->type = dnet_bswap32(id->type);
+}
+
+static inline void dnet_convert_cmd(struct dnet_cmd *cmd)
+{
+	dnet_convert_id(&cmd->id);
+	cmd->flags = dnet_bswap32(cmd->flags);
+	cmd->status = dnet_bswap32(cmd->status);
+	cmd->size = dnet_bswap64(cmd->size);
+	cmd->trans = dnet_bswap64(cmd->trans);
+}
+
+/* Completely remove object history and metadata */
+#define DNET_ATTR_DELETE_HISTORY		(1<<0)
+
+/* What type of counters to fetch */
+#define DNET_ATTR_CNTR_GLOBAL			(1<<0)
+
+/* Bulk request for checking files */
+#define DNET_ATTR_BULK_CHECK			(1<<0)
+
+/* Fill ctime/mtime from metadata when processing DNET_CMD_LOOKUP */
+#define DNET_ATTR_META_TIMES			(1<<1)
+
+/* Do not verify checksum */
+#define DNET_ATTR_NOCSUM			(1<<2)
+
+/*
+ * ascending sort data before returning range request to user
+ * c++ bindings only
+ */
+#define DNET_ATTR_SORT				(1<<3)
+
+/*
+ * This flag will force its parent CMD not to lock operation
+ * Flag will be propagated to cmd->flags
+ */
+#define DNET_ATTR_NOLOCK			(1<<4)
+
+struct dnet_attr
+{
+	uint64_t		size;
+	uint32_t		cmd;
+	uint32_t		flags;
+	uint32_t		unused[2];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_attr(struct dnet_attr *a)
+{
+	a->size = dnet_bswap64(a->size);
+	a->cmd = dnet_bswap32(a->cmd);
+	a->flags = dnet_bswap32(a->flags);
+}
+
+#define DNET_ADDR_SIZE		28
+
+struct dnet_addr
+{
+	uint8_t			addr[DNET_ADDR_SIZE];
+	uint32_t		addr_len;
+} __attribute__ ((packed));
+
+struct dnet_list
+{
+	struct dnet_id		id;
+	uint32_t		size;
+	uint8_t			data[0];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_list(struct dnet_list *l)
+{
+	dnet_convert_id(&l->id);
+	l->size = dnet_bswap32(l->size);
+}
+
+struct dnet_addr_attr
+{
+	uint16_t		sock_type;
+	uint16_t		family;
+	uint32_t		proto;
+	struct dnet_addr	addr;
+} __attribute__ ((packed));
+
+static inline void dnet_convert_addr_attr(struct dnet_addr_attr *a)
+{
+	a->addr.addr_len = dnet_bswap32(a->addr.addr_len);
+	a->proto = dnet_bswap32(a->proto);
+	a->sock_type = dnet_bswap16(a->sock_type);
+	a->family = dnet_bswap16(a->family);
+}
+
+struct dnet_addr_cmd
+{
+	struct dnet_cmd		cmd;
+	struct dnet_attr	a;
+	struct dnet_addr_attr	addr;
+} __attribute__ ((packed));
+
+static inline void dnet_convert_addr_cmd(struct dnet_addr_cmd *l)
+{
+	dnet_convert_cmd(&l->cmd);
+	dnet_convert_attr(&l->a);
+	dnet_convert_addr_attr(&l->addr);
+}
+
+/* Do not update history for given transaction */
+#define DNET_IO_FLAGS_SKIP_SENDING	(1<<0)
+
+/* Append given data at the end of the object */
+#define DNET_IO_FLAGS_APPEND		(1<<1)
+
+#define DNET_IO_FLAGS_COMPRESS		(1<<2)
+
+/* Metada IO request */
+#define DNET_IO_FLAGS_META		(1<<3)
+
+/* eblob prepare/commit phase */
+#define DNET_IO_FLAGS_PREPARE		(1<<4)
+#define DNET_IO_FLAGS_COMMIT		(1<<5)
+
+/* Object was removed */
+#define DNET_IO_FLAGS_REMOVED		(1<<6)
+
+/* Overwrite data */
+#define DNET_IO_FLAGS_OVERWRITE		(1<<7)
+
+/* Do not checksum data */
+#define DNET_IO_FLAGS_NOCSUM		(1<<8)
+
+/*
+ * this flag is used when we want backend not to perform any additional actions
+ * except than write data at given offset. This is no-op in filesystem backend,
+ * but eblob one should disable prepare/commit operations.
+ */
+#define DNET_IO_FLAGS_PLAIN_WRITE	(1<<9)
+
+/* Do not really send data in range request.
+ * Send only statistics instead.
+ *
+ * -- we do not care if it matches above DNET_IO_FLAGS_PLAIN_WRITE,
+ *  since using plain write and nodata (read) is useless anyway
+ */
+#define DNET_IO_FLAGS_NODATA		(1<<9)
+
+struct dnet_io_attr
+{
+	uint8_t			parent[DNET_ID_SIZE];
+	uint8_t			id[DNET_ID_SIZE];
+
+	/*
+	 * used in range request as start and number for LIMIT(start, num) 
+	 *
+	 * write prepare request uses @num is used as a placeholder
+	 * for number of bytes to reserve on disk
+	 */
+	uint64_t		start, num;
+	int			type;
+	uint32_t		flags;
+	uint64_t		offset;
+	uint64_t		size;
+} __attribute__ ((packed));
+
+static inline void dnet_convert_io_attr(struct dnet_io_attr *a)
+{
+	a->start = dnet_bswap64(a->start);
+	a->num = dnet_bswap64(a->num);
+
+	a->flags = dnet_bswap32(a->flags);
+	a->offset = dnet_bswap64(a->offset);
+	a->size = dnet_bswap64(a->size);
+}
+
+struct dnet_history_entry
+{
+	uint8_t			id[DNET_ID_SIZE];
+	uint32_t		flags;
+	uint64_t		reserved;
+	uint64_t		tsec, tnsec;
+	uint64_t		offset;
+	uint64_t		size;
+} __attribute__ ((packed));
+
+/*
+ * Helper structure and set of functions to map history file and perform basic checks.
+ */
+struct dnet_history_map
+{
+	struct dnet_history_entry	*ent;
+	long				num;
+	ssize_t				size;
+	int				fd;
+};
+
+static inline void dnet_convert_history_entry(struct dnet_history_entry *a)
+{
+	a->flags = dnet_bswap32(a->flags);
+	a->offset = dnet_bswap64(a->offset);
+	a->size = dnet_bswap64(a->size);
+	a->tsec = dnet_bswap64(a->tsec);
+	a->tnsec = dnet_bswap64(a->tnsec);
+}
+
+static inline void dnet_setup_history_entry(struct dnet_history_entry *e,
+		unsigned char *id, uint64_t size, uint64_t offset,
+		struct timespec *ts, uint32_t flags)
+{
+	if (!ts) {
+		struct timeval tv;
+
+		gettimeofday(&tv, NULL);
+
+		e->tsec = tv.tv_sec;
+		e->tnsec = tv.tv_usec * 1000;
+	} else {
+		e->tsec = ts->tv_sec;
+		e->tnsec = ts->tv_nsec;
+	}
+
+	memcpy(e->id, id, DNET_ID_SIZE);
+
+	e->size = size;
+	e->offset = offset;
+	e->flags = flags;
+	e->reserved = 0;
+
+	dnet_convert_history_entry(e);
+}
+
+struct dnet_stat
+{
+	/* Load average from the target system multiplied by 100 */
+	uint16_t		la[3];
+
+	uint16_t		namemax;	/* maximum filename length */
+
+	uint64_t		bsize;		/* Block size */
+	uint64_t		frsize;		/* Fragment size */
+	uint64_t		blocks;		/* Filesystem size in frsize units */
+	uint64_t		bfree;		/* # free blocks */
+	uint64_t		bavail;		/* # free blocks for non-root */
+	uint64_t		files;		/* # inodes */
+	uint64_t		ffree;		/* # free inodes */
+	uint64_t		favail;		/* # free inodes for non-root */
+	uint64_t		fsid;		/* file system ID */
+	uint64_t		flag;		/* mount flags */
+
+	/*
+	 * VM counters in KB (1024) units.
+	 * On FreeBSD vm_buffers is used for wire counter.
+	 */
+	uint64_t		vm_active;
+	uint64_t		vm_inactive;
+	uint64_t		vm_total;
+	uint64_t		vm_free;
+	uint64_t		vm_cached;
+	uint64_t		vm_buffers;
+
+	/*
+	 * Per node IO statistics will live here.
+	 * Reserved for future use.
+	 */
+	uint64_t		reserved[32];
+};
+
+static inline void dnet_convert_stat(struct dnet_stat *st)
+{
+	int i;
+
+	for (i=0; i<3; ++i)
+		st->la[i] = dnet_bswap16(st->la[i]);
+
+	st->bsize = dnet_bswap64(st->bsize);
+	st->frsize = dnet_bswap64(st->frsize);
+	st->blocks = dnet_bswap64(st->blocks);
+	st->bfree = dnet_bswap64(st->bfree);
+	st->bavail = dnet_bswap64(st->bavail);
+	st->files = dnet_bswap64(st->files);
+	st->ffree = dnet_bswap64(st->ffree);
+	st->favail = dnet_bswap64(st->favail);
+	st->fsid = dnet_bswap64(st->fsid);
+	st->namemax = dnet_bswap16(st->namemax);
+
+	st->vm_active = dnet_bswap64(st->vm_active);
+	st->vm_inactive = dnet_bswap64(st->vm_inactive);
+	st->vm_total = dnet_bswap64(st->vm_total);
+	st->vm_free = dnet_bswap64(st->vm_free);
+	st->vm_buffers = dnet_bswap64(st->vm_buffers);
+	st->vm_cached = dnet_bswap64(st->vm_cached);
+}
+
+struct dnet_io_notification
+{
+	struct dnet_addr_attr		addr;
+	struct dnet_io_attr		io;
+};
+
+static inline void dnet_convert_io_notification(struct dnet_io_notification *n)
+{
+	dnet_convert_addr_attr(&n->addr);
+	dnet_convert_io_attr(&n->io);
+}
+
+struct dnet_stat_count
+{
+	uint64_t			count;
+	uint64_t			err;
+};
+
+static inline void dnet_convert_stat_count(struct dnet_stat_count *st, int num)
+{
+	int i;
+
+	for (i=0; i<num; ++i) {
+		st[i].count = dnet_bswap64(st[i].count);
+		st[i].err = dnet_bswap64(st[i].err);
+	}
+}
+
+struct dnet_addr_stat
+{
+	struct dnet_addr		addr;
+	int				num;
+	int				cmd_num;
+	struct dnet_stat_count		count[0];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_addr_stat(struct dnet_addr_stat *st, int num)
+{
+	st->addr.addr_len = dnet_bswap32(st->addr.addr_len);
+	st->num = dnet_bswap32(st->num);
+	if (!num)
+		num = st->num;
+	st->cmd_num = dnet_bswap32(st->cmd_num);
+
+	dnet_convert_stat_count(st->count, num);
+}
+
+static inline void dnet_stat_inc(struct dnet_stat_count *st, int cmd, int err)
+{
+	if (cmd >= __DNET_CMD_MAX)
+		cmd = DNET_CMD_UNKNOWN;
+
+	if (!err)
+		st[cmd].count++;
+	else
+		st[cmd].err++;
+}
+
+struct dnet_time {
+	uint64_t		tsec, tnsec;
+};
+
+static inline void dnet_convert_time(struct dnet_time *tm)
+{
+	tm->tsec = dnet_bswap64(tm->tsec);
+	tm->tnsec = dnet_bswap64(tm->tnsec);
+}
+
+static inline void dnet_current_time(struct dnet_time *t)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+
+	t->tsec = tv.tv_sec;
+	t->tnsec = tv.tv_usec * 1000;
+}
+
+struct dnet_file_info {
+	int			flen;		/* filename length, which goes after this structure */
+	unsigned char		checksum[DNET_CSUM_SIZE];
+
+	unsigned int		nlink;
+
+	uint64_t		mode;
+
+	uint64_t		dev;
+	uint64_t		rdev;
+
+	uint64_t		ino;
+
+	uint64_t		uid;
+	uint64_t		gid;
+
+	uint64_t		blksize;
+	uint64_t		blocks;
+
+	uint64_t		size;
+	uint64_t		offset;		/* offset within eblob */
+
+	struct dnet_time	atime;
+	struct dnet_time	ctime;
+	struct dnet_time	mtime;
+};
+
+static inline void dnet_convert_file_info(struct dnet_file_info *info)
+{
+	info->flen = dnet_bswap32(info->flen);
+	info->nlink = dnet_bswap32(info->nlink);
+
+	info->mode = dnet_bswap64(info->mode);
+	info->dev = dnet_bswap64(info->dev);
+	info->ino = dnet_bswap64(info->ino);
+	info->uid = dnet_bswap64(info->uid);
+	info->gid = dnet_bswap64(info->gid);
+	info->blksize = dnet_bswap64(info->blksize);
+	info->blocks = dnet_bswap64(info->blocks);
+	info->rdev = dnet_bswap64(info->rdev);
+	info->size = dnet_bswap64(info->size);
+	info->offset = dnet_bswap64(info->offset);
+
+	dnet_convert_time(&info->atime);
+	dnet_convert_time(&info->ctime);
+	dnet_convert_time(&info->mtime);
+}
+
+static inline void dnet_info_from_stat(struct dnet_file_info *info, struct stat *st)
+{
+	info->nlink = st->st_nlink;
+	info->mode = st->st_mode;
+	info->dev = st->st_dev;
+	info->ino = st->st_ino;
+	info->uid = st->st_uid;
+	info->gid = st->st_gid;
+	info->blksize = st->st_blksize;
+	info->blocks = st->st_blocks;
+	info->rdev = st->st_rdev;
+	info->size = st->st_size;
+	info->offset = 0;
+
+	info->atime.tsec = st->st_atime;
+	info->ctime.tsec = st->st_ctime;
+	info->mtime.tsec = st->st_mtime;
+
+	info->atime.tnsec = 0;
+	info->ctime.tnsec = 0;
+	info->mtime.tnsec = 0;
+}
+
+/* Elliptics node status - if set, status will be changed */
+#define DNET_ATTR_STATUS_CHANGE		(1<<0)
+
+/* Elliptics node should exit */
+#define DNET_STATUS_EXIT		(1<<0)
+
+/* Ellipitcs node goes ro/rw */
+#define DNET_STATUS_RO			(1<<1)
+
+struct dnet_node_status {
+	int nflags;
+	int status_flags;  /* DNET_STATUS_EXIT, DNET_STATUS_RO should be specified here */
+	uint32_t log_mask;
+};
+
+static inline void dnet_convert_node_status(struct dnet_node_status *st)
+{
+	st->nflags = dnet_bswap32(st->nflags);
+	st->status_flags = dnet_bswap32(st->status_flags);
+	st->log_mask = dnet_bswap32(st->log_mask);
+}
+
+enum cmd_type {
+	DNET_EXEC_SHELL = 0,
+	DNET_EXEC_PYTHON_SCRIPT_NAME,
+	DNET_EXEC_PYTHON,
+};
+
+struct dnet_exec {
+	int			type;
+	int			flags;
+	uint64_t		script_size, name_size, binary_size;
+	uint64_t		reserved[2];
+
+	/*
+	 * we pack script name first, then user's script content and then binary data,
+	 * which will be pushed into server's object
+	 */
+	char			data[0];
+} __attribute__((packed));
+
+static inline void dnet_convert_exec(struct dnet_exec *e)
+{
+	e->type = dnet_bswap32(e->type);
+	e->script_size = dnet_bswap64(e->script_size);
+	e->name_size = dnet_bswap64(e->name_size);
+	e->binary_size = dnet_bswap64(e->binary_size);
+	e->flags = dnet_bswap32(e->flags);
+}
+
+#define DNET_AUTH_COOKIE_SIZE	32
+
+struct dnet_auth {
+	char			cookie[DNET_AUTH_COOKIE_SIZE];
+	uint64_t		flags;
+	uint64_t		unused[3];
+};
+
+static inline void dnet_convert_auth(struct dnet_auth *a)
+{
+	a->flags = dnet_bswap64(a->flags);
+}
+
+enum dnet_meta_types {
+	DNET_META_PARENT_OBJECT = 1,	/* parent object name */
+	DNET_META_GROUPS,		/* this object has copies in given groups */
+	DNET_META_CHECK_STATUS,		/* last checking status: timestamp and so on */
+	DNET_META_NAMESPACE,		/* namespace where given object lives */
+	DNET_META_UPDATE,		/* last update information (timestamp, flags) */
+	DNET_META_CHECKSUM,		/* checksum (sha512) of the whole data object calculated on server */
+	__DNET_META_MAX,
+};
+
+struct dnet_meta
+{
+	uint32_t			type;
+	uint32_t			size;
+	uint64_t			common;
+	uint8_t				tmp[16];
+	uint8_t				data[0];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_meta(struct dnet_meta *m)
+{
+	m->type = dnet_bswap32(m->type);
+	m->size = dnet_bswap32(m->size);
+	m->common = dnet_bswap64(m->common);
+}
+
+struct dnet_meta_update {
+	int			unused_gap;
+	int			group_id;
+	uint64_t		flags;
+	struct dnet_time	tm;
+	uint64_t		reserved[4];
+} __attribute__((packed));
+
+static inline void dnet_convert_meta_update(struct dnet_meta_update *m)
+{
+	dnet_convert_time(&m->tm);
+	m->flags = dnet_bswap64(m->flags);
+}
+
+struct dnet_meta_check_status {
+	int			status;
+	int			pad;
+	struct dnet_time	tm;
+	uint64_t		reserved[4];
+} __attribute__ ((packed));
+
+static inline void dnet_convert_meta_check_status(struct dnet_meta_check_status *c)
+{
+	c->status = dnet_bswap32(c->status);
+	dnet_convert_time(&c->tm);
+}
+
+struct dnet_meta_checksum {
+	uint8_t			checksum[DNET_CSUM_SIZE];
+	struct dnet_time	tm;
+} __attribute__ ((packed));
+
+static inline void dnet_convert_meta_checksum(struct dnet_meta_checksum *c)
+{
+	dnet_convert_time(&c->tm);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __DNET_PACKET_H */
diff --git a/fs/pohmelfs/pohmelfs.h b/fs/pohmelfs/pohmelfs.h
new file mode 100644
index 0000000..8cce946
--- /dev/null
+++ b/fs/pohmelfs/pohmelfs.h
@@ -0,0 +1,396 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#ifndef __POHMELFS_H
+#define __POHMELFS_H
+
+#include <linux/backing-dev.h>
+#include <linux/crypto.h>
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/workqueue.h>
+
+#include <crypto/sha.h>
+
+#define dnet_bswap16(x)		cpu_to_le16(x)
+#define dnet_bswap32(x)		cpu_to_le32(x)
+#define dnet_bswap64(x)		cpu_to_le64(x)
+
+/* theese are needed for packet.h below to compile */
+#define DNET_ID_SIZE	SHA512_DIGEST_SIZE
+#define DNET_CSUM_SIZE	SHA512_DIGEST_SIZE
+
+/*
+ * is not used in kernel, but we want to share the same header
+ * with userspace, so I put it here for compiler to shut up
+ */
+int gettimeofday(struct timeval *, struct timezone *);
+
+#include "packet.h"
+
+static inline struct timespec pohmelfs_date(struct dnet_time *tm)
+{
+	struct timespec ts;
+
+	ts.tv_sec = tm->tsec;
+	ts.tv_nsec = tm->tnsec;
+
+	return ts;
+}
+
+struct pohmelfs_cmd {
+	struct dnet_cmd		cmd;
+	struct dnet_attr	attr;
+	union {
+		struct dnet_io_attr	io;
+	} p;
+};
+
+/*
+ * Compare two IDs.
+ * Returns  1 when id1 > id2
+ *         -1 when id1 < id2
+ *          0 when id1 = id2
+ */
+static inline int dnet_id_cmp_str(const unsigned char *id1, const unsigned char *id2)
+{
+	unsigned int i = 0;
+
+	for (i*=sizeof(unsigned long); i<DNET_ID_SIZE; ++i) {
+		if (id1[i] < id2[i])
+			return -1;
+		if (id1[i] > id2[i])
+			return 1;
+	}
+
+	return 0;
+}
+
+struct pohmelfs_state;
+struct pohmelfs_sb;
+struct pohmelfs_trans;
+
+struct pohmelfs_trans_cb {
+	int				(* init)(struct pohmelfs_trans *t);
+	int				(* complete)(struct pohmelfs_trans *t, struct pohmelfs_state *recv);
+	int				(* recv_reply)(struct pohmelfs_trans *t, struct pohmelfs_state *recv);
+	void				(* destroy)(struct pohmelfs_trans *t);
+};
+
+struct pohmelfs_trans {
+	struct list_head	trans_entry;
+
+	struct kref		refcnt;
+
+	unsigned long		trans;
+
+	struct inode		*inode;
+
+	struct pohmelfs_state	*st;
+
+	struct pohmelfs_cmd	cmd;
+
+	u64			header_size, data_size;
+
+	void			*data;
+
+	unsigned long long	recv_offset;
+	void			*recv_data;
+
+	struct pohmelfs_write_ctl	*wctl;
+	void			*priv;
+
+	struct pohmelfs_trans_cb	cb;
+};
+
+struct pohmelfs_trans *pohmelfs_trans_alloc(struct inode *inode);
+struct pohmelfs_trans *pohmelfs_trans_alloc_io_buf(struct inode *inode, int group, int command,
+		void *data, u64 offset, u64 size, int aflags, int ioflags, int type);
+void pohmelfs_trans_put(struct pohmelfs_trans *t);
+
+int pohmelfs_trans_insert(struct pohmelfs_trans *t);
+struct pohmelfs_trans *pohmelfs_trans_lookup(struct pohmelfs_state *st, struct dnet_cmd *cmd);
+
+struct pohmelfs_state {
+	struct pohmelfs_sb	*psb;
+	struct list_head	state_entry;
+
+	struct sockaddr_storage	sa;
+	int			addrlen;
+	struct socket		*sock;
+
+	int			group_id;
+
+	struct mutex		trans_lock;
+	struct list_head	trans_list;
+	struct list_head	sent_trans_list;
+
+	struct kref		refcnt;
+
+	int			routes;
+
+	/* Waiting/polling machinery */
+	wait_queue_t		wait;
+	wait_queue_head_t	*whead;
+
+	struct work_struct	send_work;
+	struct work_struct	recv_work;
+
+	/* is set when dnet_cmd is being read, otherwise attached data */
+	int			cmd_read;
+	/* currently read command reply */
+	struct dnet_cmd		cmd;
+};
+
+struct pohmelfs_state *pohmelfs_state_create(struct pohmelfs_sb *psb, struct sockaddr_storage *sa, int addrlen,
+		int ask_route, int group_id);
+struct pohmelfs_state *pohmelfs_state_lookup(struct pohmelfs_sb *psb, struct dnet_raw_id *id, int group);
+
+static inline void pohmelfs_state_get(struct pohmelfs_state *st)
+{
+	kref_get(&st->refcnt);
+}
+
+void pohmelfs_state_put(struct pohmelfs_state *st);
+void pohmelfs_state_kill(struct pohmelfs_state *st);
+
+struct pohmelfs_state *pohmelfs_addr_exist(struct pohmelfs_sb *psb, struct sockaddr_storage *sa, int addrlen);
+
+void pohmelfs_state_schedule(struct pohmelfs_state *st);
+
+__attribute__ ((format (printf, 2, 3))) void pohmelfs_print_addr(struct sockaddr_storage *addr, const char *fmt, ...);
+
+#define POHMELFS_INODE_INFO_REMOVED		(1<<0)
+
+struct pohmelfs_inode_info {
+	struct dnet_raw_id	id;
+
+	unsigned int		mode;
+	unsigned int		nlink;
+	unsigned int		uid;
+	unsigned int		gid;
+	unsigned int		blocksize;
+	unsigned int		namelen;
+	__u64			ino;
+	__u64			blocks;
+	__u64			rdev;
+	__u64			size;
+	__u64			version;
+
+	__u64			flags;
+
+	struct dnet_time	ctime;
+	struct dnet_time	mtime;
+	struct dnet_time	atime;
+} __attribute__ ((packed));
+
+void pohmelfs_fill_inode_info(struct inode *inode, struct pohmelfs_inode_info *info);
+void pohmelfs_fill_inode(struct inode *inode, struct pohmelfs_inode_info *info);
+void pohmelfs_convert_inode_info(struct pohmelfs_inode_info *info);
+
+struct pohmelfs_inode {
+	struct inode		vfs_inode;
+	struct dnet_raw_id	id;
+	struct dnet_raw_id	parent_id;
+
+	int			received;
+
+	struct rb_node		node;
+};
+
+int pohmelfs_send_inode_info(struct pohmelfs_inode *pi, struct dnet_raw_id *id, const char *sname, int len, int sync);
+struct pohmelfs_inode *pohmelfs_sb_inode_lookup(struct pohmelfs_sb *psb, struct dnet_raw_id *id);
+
+
+struct pohmelfs_reconnect {
+	struct list_head	reconnect_entry;
+	struct sockaddr_storage	sa;
+	int			addrlen;
+	int			group_id;
+};
+
+int pohmelfs_state_add_reconnect(struct pohmelfs_state *st);
+
+
+struct pohmelfs_sb {
+	struct super_block	*sb;
+	struct backing_dev_info	bdi;
+
+	struct pohmelfs_inode	*root;
+
+	spinlock_t		inode_lock;
+	struct rb_root		inode_root;
+
+	int			sync;
+	int			use_http_compat;
+
+	int			bdi_num;
+
+	struct rb_root		route_root;
+	struct list_head	state_list;
+	spinlock_t		state_lock;
+
+	long			read_wait_timeout;
+	long			write_wait_timeout;
+	long			sync_timeout;
+
+	char			*fsid;
+	int			fsid_len;
+
+	int			no_read_csum;
+
+	atomic_long_t		ino;
+	atomic_long_t		trans;
+
+	struct crypto_hash	*hash;
+
+	struct workqueue_struct	*wq;
+
+	int			*groups;
+	int			group_num;
+
+	struct mutex		reconnect_lock;
+	struct list_head	reconnect_list;
+	struct list_head	kill_state_list;
+	struct delayed_work	reconnect_work;
+	long			reconnect_timeout;
+};
+
+static inline struct pohmelfs_sb *pohmelfs_sb(struct super_block *sb)
+{
+	return (struct pohmelfs_sb *)sb->s_fs_info;
+}
+
+static inline struct pohmelfs_inode *pohmelfs_inode(struct inode *inode)
+{
+	return container_of(inode, struct pohmelfs_inode, vfs_inode);
+}
+
+struct pohmelfs_inode_info_binary_package {
+	struct dnet_raw_id		parent;
+	struct pohmelfs_inode_info	info;
+};
+
+struct pohmelfs_write_ctl {
+	struct pagevec		pvec;
+
+	struct pohmelfs_sb	*psb;
+
+	struct kref		refcnt;
+
+	atomic_t		good_writes;
+};
+
+extern struct kmem_cache *pohmelfs_inode_cache;
+extern struct kmem_cache *pohmelfs_trans_cache;
+extern struct kmem_cache *pohmelfs_inode_info_cache;
+extern struct kmem_cache *pohmelfs_route_cache;
+extern struct kmem_cache *pohmelfs_wait_cache;
+extern struct kmem_cache *pohmelfs_io_cache;
+extern struct kmem_cache *pohmelfs_inode_info_binary_package_cache;
+extern struct kmem_cache *pohmelfs_write_cache;
+
+struct inode *pohmelfs_alloc_inode(struct super_block *sb);
+void pohmelfs_destroy_inode(struct inode *);
+
+struct pohmelfs_inode *pohmelfs_existing_inode(struct pohmelfs_sb *psb, struct pohmelfs_inode_info *info);
+struct pohmelfs_inode *pohmelfs_new_inode(struct pohmelfs_sb *psb, int mode);
+int pohmelfs_hash(struct pohmelfs_sb *psb, const void *data, const size_t size, struct dnet_raw_id *id);
+
+char *pohmelfs_dump_id(const unsigned char *id);
+char *pohmelfs_dump_id_len_raw(const unsigned char *id, unsigned int len, char *dst);
+
+int pohmelfs_write_command(struct pohmelfs_inode *pi, struct pohmelfs_write_ctl *ctl, loff_t offset, size_t len);
+void pohmelfs_write_ctl_release(struct kref *kref);
+int pohmelfs_metadata_inode(struct pohmelfs_inode *pi, int sync);
+
+extern const struct file_operations pohmelfs_dir_fops;
+extern const struct inode_operations pohmelfs_dir_inode_operations;
+
+extern const struct file_operations pohmelfs_file_ops;
+extern const struct inode_operations pohmelfs_file_inode_operations;
+
+extern const struct inode_operations pohmelfs_symlink_inode_operations;
+
+extern void *pohmelfs_scratch_buf;
+extern int pohmelfs_scratch_buf_size;
+
+/*
+ * if this flag is set, pohmelfs_inode_info->data is owned by the caller,
+ * so sending path may use it on its own and free (using kfree) when it's done
+ *
+ * This logic does not work for shared buffers or
+ * when multiple transactions will be sent for single pohmelfs_inode_info
+ */
+#define POHMELFS_IO_OWN			(1<<0)
+
+struct pohmelfs_io {
+	struct pohmelfs_inode		*pi;
+
+	struct dnet_raw_id		*id;
+
+	int				cmd;
+	int				type;
+
+	u64				offset, size;
+	u64				start, num;
+
+	u32				cflags;
+	u32				aflags;
+	u32				ioflags;
+
+	int				group_id;
+
+	u32				alloc_flags;
+	void				*data;
+
+	struct pohmelfs_write_ctl	*wctl;
+	void				*priv;
+
+	struct pohmelfs_trans_cb	cb;
+};
+
+int pohmelfs_send_io_group(struct pohmelfs_io *pio, int group_id);
+int pohmelfs_send_io(struct pohmelfs_io *pio);
+int pohmelfs_send_buf_single(struct pohmelfs_io *pio, struct pohmelfs_state *st);
+int pohmelfs_send_buf(struct pohmelfs_io *pio);
+
+int pohmelfs_data_recv(struct pohmelfs_state *st, void *buf, u64 size, unsigned int flags);
+int pohmelfs_recv(struct pohmelfs_trans *t, struct pohmelfs_state *recv, void *data, int size);
+
+struct pohmelfs_route {
+	struct rb_node			node;
+	int				group_id;
+	struct dnet_raw_id		id;
+	struct pohmelfs_state		*st;
+};
+
+int pohmelfs_route_request(struct pohmelfs_state *st);
+void pohmelfs_route_remove_all(struct pohmelfs_state *st);
+
+struct pohmelfs_wait {
+	wait_queue_head_t	wq;
+	struct pohmelfs_inode	*pi;
+	void			*ret;
+	atomic_long_t		count;
+	int			condition;
+	struct kref		refcnt;
+};
+
+struct pohmelfs_wait *pohmelfs_wait_alloc(struct pohmelfs_inode *pi);
+void pohmelfs_wait_put(struct pohmelfs_wait *wait);
+static inline void pohmelfs_wait_get(struct pohmelfs_wait *wait)
+{
+	kref_get(&wait->refcnt);
+}
+
+#endif /* __POHMELFS_H */
diff --git a/fs/pohmelfs/route.c b/fs/pohmelfs/route.c
new file mode 100644
index 0000000..6a0400d
--- /dev/null
+++ b/fs/pohmelfs/route.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "pohmelfs.h"
+
+
+static inline int pohmelfs_route_cmp_raw(const struct pohmelfs_route *rt, const struct dnet_raw_id *raw, int group_id)
+{
+	if (rt->group_id < group_id)
+		return -1;
+	if (rt->group_id > group_id)
+		return 1;
+
+	return dnet_id_cmp_str(rt->id.id, raw->id);
+}
+
+static inline int pohmelfs_route_cmp(const struct pohmelfs_route *id1, const struct pohmelfs_route *id2)
+{
+	return pohmelfs_route_cmp_raw(id1, &id2->id, id2->group_id);
+}
+
+static int pohmelfs_route_insert(struct pohmelfs_sb *psb, struct pohmelfs_route *rt)
+{
+	struct rb_node **n = &psb->route_root.rb_node, *parent = NULL;
+	struct pohmelfs_route *tmp;
+	int cmp, err = 0;
+
+	spin_lock(&psb->state_lock);
+	while (*n) {
+		parent = *n;
+
+		tmp = rb_entry(parent, struct pohmelfs_route, node);
+
+		cmp = pohmelfs_route_cmp(tmp, rt);
+		if (cmp < 0)
+			n = &parent->rb_left;
+		else if (cmp > 0)
+			n = &parent->rb_right;
+		else {
+			err = -EEXIST;
+			goto err_out_unlock;
+		}
+	}
+
+	rb_link_node(&rt->node, parent, n);
+	rb_insert_color(&rt->node, &psb->route_root);
+
+err_out_unlock:
+	spin_unlock(&psb->state_lock);
+	return err;
+	
+}
+
+static int pohmelfs_route_add(struct pohmelfs_state *st, struct dnet_raw_id *id, int group_id)
+{
+	struct pohmelfs_sb *psb = st->psb;
+	struct pohmelfs_route *rt;
+	int err;
+
+	rt = kmem_cache_zalloc(pohmelfs_route_cache, GFP_NOIO);
+	if (!rt) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	memcpy(&rt->id, id, sizeof(struct dnet_raw_id));
+	rt->group_id = group_id;
+	rt->st = st;
+
+	pohmelfs_state_get(st);
+
+	err = pohmelfs_route_insert(psb, rt);
+	if (err)
+		goto err_out_put;
+
+	rt->st->routes++;
+	return 0;
+
+err_out_put:
+	pohmelfs_state_put(st);
+	kmem_cache_free(pohmelfs_route_cache, rt);
+err_out_exit:
+	return err;
+}
+
+struct pohmelfs_state *pohmelfs_state_lookup(struct pohmelfs_sb *psb, struct dnet_raw_id *id, int group_id)
+{
+	struct rb_node *n = psb->route_root.rb_node;
+	struct pohmelfs_route *rt;
+	struct pohmelfs_state *st = NULL;
+	int cmp;
+
+	spin_lock(&psb->state_lock);
+	while (n) {
+		rt = rb_entry(n, struct pohmelfs_route, node);
+
+		cmp = pohmelfs_route_cmp_raw(rt, id, group_id);
+
+		if (!st && (rt->group_id == group_id)) {
+			st = rt->st;
+		}
+
+		if (cmp < 0) {
+			n = n->rb_left;
+
+			if (rt->group_id == group_id) {
+				st = rt->st;
+			}
+		} else if (cmp > 0)
+			n = n->rb_right;
+		else {
+			st = rt->st;
+			break;
+		}
+	}
+	if (st)
+		pohmelfs_state_get(st);
+
+	spin_unlock(&psb->state_lock);
+
+	return st;
+}
+
+static void pohmelfs_route_remove_nolock(struct pohmelfs_sb *psb, struct pohmelfs_route *rt)
+{
+	rt->st->routes--;
+	rb_erase(&rt->node, &psb->route_root);
+	pohmelfs_state_put(rt->st);
+	kmem_cache_free(pohmelfs_route_cache, rt);
+}
+
+void pohmelfs_route_remove_all(struct pohmelfs_state *st)
+{
+	struct pohmelfs_sb *psb = st->psb;
+	struct pohmelfs_route *rt;
+	struct rb_node *n;
+	int found = 1;
+
+	spin_lock(&psb->state_lock);
+
+	while (found) {
+		n = rb_first(&psb->route_root);
+		found = 0;
+
+		while (n) {
+			rt = rb_entry(n, struct pohmelfs_route, node);
+
+			if (rt->st == st) {
+				pohmelfs_route_remove_nolock(psb, rt);
+				found = 1;
+				break;
+			}
+
+			n = rb_next(&rt->node);
+		}
+	}
+
+	spin_unlock(&psb->state_lock);
+}
+
+static int pohmelfs_route_request_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(t->inode->i_sb);
+	struct dnet_cmd *cmd = &recv->cmd;
+	struct pohmelfs_state *st;
+	struct dnet_attr *attr;
+	struct dnet_addr_attr *a;
+	struct dnet_raw_id *ids;
+	int err = 0;
+
+	if (!t->recv_offset)
+		goto err_out_exit;
+
+	attr = t->recv_data;
+	dnet_convert_attr(attr);
+
+	if (attr->size > sizeof(struct dnet_addr_attr)) {
+		int i, num = (attr->size - sizeof(struct dnet_addr_attr)) / sizeof(struct dnet_raw_id);
+
+		a = (struct dnet_addr_attr *)(attr + 1);
+		dnet_convert_addr_attr(a);
+		ids = (struct dnet_raw_id *)(a + 1);
+
+		st = pohmelfs_state_create(psb, (struct sockaddr_storage *)&a->addr.addr, a->addr.addr_len, 0, cmd->id.group_id);
+		if (IS_ERR(st)) {
+			err = PTR_ERR(st);
+
+			if (err == -EEXIST) {
+				spin_lock(&psb->state_lock);
+				st = pohmelfs_addr_exist(psb, (struct sockaddr_storage *)&a->addr.addr, a->addr.addr_len);
+				if (st) {
+					st->group_id = cmd->id.group_id;
+					pohmelfs_state_get(st);
+					err = 0;
+				}
+				spin_unlock(&psb->state_lock);
+			}
+
+			if (err)
+				goto err_out_exit;
+		} else {
+			/*
+			 * reference grab logic should be the same
+			 * as in case when state exist - we will drop
+			 * it at the end, so we would not check whether
+			 * it is new state (and refcnt == 1) or
+			 * existing (refcnt > 1)
+			 */
+			pohmelfs_state_get(st);
+		}
+
+		for (i = 0; i < num; ++i) {
+			dnet_convert_raw_id(&ids[i]);
+#if 0
+			pohmelfs_print_addr((struct sockaddr_storage *)&a->addr.addr, "%d:%s\n",
+					cmd->id.group_id, pohmelfs_dump_id(ids[i].id));
+#endif
+
+			err = pohmelfs_route_add(st, &ids[i], cmd->id.group_id);
+			if (err) {
+				if (err != -EEXIST) {
+					/* remove this state from route table */
+					spin_lock(&psb->state_lock);
+					list_del_init(&st->state_entry);
+					spin_unlock(&psb->state_lock);
+
+					/* drop abovementioned refcnt */
+					pohmelfs_state_put(st);
+
+					pohmelfs_state_kill(st);
+					goto err_out_exit;
+				}
+
+				err = 0;
+			}
+		}
+
+		/* drop abovementioned refcnt */
+		pohmelfs_state_put(st);
+	}
+
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_route_request(struct pohmelfs_state *st)
+{
+	struct pohmelfs_sb *psb = st->psb;
+	struct pohmelfs_io *pio;
+	int err;
+
+	pio = kmem_cache_zalloc(pohmelfs_io_cache, GFP_NOIO);
+	if (!pio) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	pio->pi = psb->root;
+	pio->id = &psb->root->id;
+	pio->cmd = DNET_CMD_ROUTE_LIST;
+	pio->cflags = DNET_FLAGS_DIRECT | DNET_FLAGS_NEED_ACK;
+	pio->cb.complete = pohmelfs_route_request_complete;
+
+	err = pohmelfs_send_buf_single(pio, st);
+	if (err) {
+		pohmelfs_print_addr(&st->sa, "pohmelfs: pohmelfs_route_request: %d\n", err);
+		goto err_out_free;
+	}
+	pohmelfs_print_addr(&st->sa, "route request sent\n");
+
+err_out_free:
+	kmem_cache_free(pohmelfs_io_cache, pio);
+err_out_exit:
+	return err;
+}
diff --git a/fs/pohmelfs/super.c b/fs/pohmelfs/super.c
new file mode 100644
index 0000000..635a32f
--- /dev/null
+++ b/fs/pohmelfs/super.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/quotaops.h>
+#include <asm/uaccess.h>
+
+#include "pohmelfs.h"
+
+#define POHMELFS_MAGIC_NUM	0x504f482e
+
+struct kmem_cache *pohmelfs_inode_cache;
+struct kmem_cache *pohmelfs_trans_cache;
+struct kmem_cache *pohmelfs_inode_info_cache;
+struct kmem_cache *pohmelfs_route_cache;
+struct kmem_cache *pohmelfs_wait_cache;
+struct kmem_cache *pohmelfs_io_cache;
+struct kmem_cache *pohmelfs_inode_info_binary_package_cache;
+struct kmem_cache *pohmelfs_write_cache;
+
+static atomic_t psb_bdi_num = ATOMIC_INIT(0);
+
+static void pohmelfs_cleanup_psb(struct pohmelfs_sb *psb)
+{
+	struct pohmelfs_state *st, *tmp;
+	struct pohmelfs_reconnect *r, *rtmp;
+
+	cancel_delayed_work(&psb->reconnect_work);
+
+	list_for_each_entry_safe(st, tmp, &psb->state_list, state_entry) {
+		list_del_init(&st->state_entry);
+
+		pohmelfs_state_kill(st);
+	}
+
+	list_for_each_entry_safe(st, tmp, &psb->kill_state_list, state_entry) {
+		list_del_init(&st->state_entry);
+		pohmelfs_state_kill(st);
+	}
+
+	list_for_each_entry_safe(r, rtmp, &psb->reconnect_list, reconnect_entry) {
+		list_del(&r->reconnect_entry);
+		kfree(r);
+	}
+
+	destroy_workqueue(psb->wq);
+	crypto_free_hash(psb->hash);
+
+	kfree(psb->groups);
+	kfree(psb->fsid);
+}
+
+static void pohmelfs_put_super(struct super_block *sb)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(sb);
+
+	pohmelfs_cleanup_psb(psb);
+	bdi_destroy(&psb->bdi);
+}
+
+static int pohmelfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+
+	/*
+	 * There are no filesystem size limits yet.
+	 */
+	memset(buf, 0, sizeof(struct kstatfs));
+
+	buf->f_type = POHMELFS_MAGIC_NUM; /* 'POH.' */
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_files = 0;
+	buf->f_namelen = 4096;
+	buf->f_files = 0;
+	buf->f_bfree = buf->f_bavail = ~0ULL >> PAGE_SHIFT;
+	buf->f_blocks = ~0ULL >> PAGE_SHIFT;
+
+	return 0;
+}
+
+static int pohmelfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(vfs->mnt_sb);
+
+	if (psb->no_read_csum)
+		seq_printf(seq, ",noreadcsum");
+	seq_printf(seq, ",sync_timeout=%u", psb->sync);
+	if (psb->fsid)
+		seq_printf(seq, ",fsid=%s", psb->fsid);
+	return 0;
+}
+
+/*
+ * This is tricky function - inode cache can be shrunk and inode is about to be dropped,
+ * since its last reference is dropped. But then icache can __iget() on this inode and
+ * later iput() it, which will again call ->drop_inode() callback.
+ *
+ * So, ->drop_inode() can be called multiple times for single inode without its reintialization
+ * And we better to be ready for this
+ */
+static int pohmelfs_drop_inode(struct inode *inode)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+
+	pr_debug("pohmelfs: %s: drop ino: %ld, mapping: %p\n", pohmelfs_dump_id(pi->id.id), inode->i_ino, inode->i_mapping);
+
+	spin_lock(&psb->inode_lock);
+	if (rb_parent(&pi->node) != &pi->node)
+		rb_erase(&pi->node, &psb->inode_root);
+	rb_init_node(&pi->node);
+	spin_unlock(&psb->inode_lock);
+
+	return generic_drop_inode(inode);
+}
+
+static int pohmelfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(inode);
+	struct dentry *dentry;
+	int err = 0;
+	int sync = 0;
+
+	if (wbc)
+		sync = wbc->sync_mode == WB_SYNC_ALL;
+
+	if (pi == pohmelfs_sb(inode->i_sb)->root)
+		return 0;
+
+	dentry = d_find_alias(inode);
+	if (dentry) {
+		err = pohmelfs_send_inode_info(pi, &pi->parent_id, dentry->d_name.name, dentry->d_name.len, sync);
+		dput(dentry);
+	}
+
+	return err;
+}
+
+static int pohmelfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(sb);
+
+	flush_workqueue(psb->wq);
+	return 0;
+}
+
+static int pohmelfs_parse_options(struct pohmelfs_sb *psb, char *data);
+
+static int pohmelfs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(sb);
+
+	return pohmelfs_parse_options(psb, data);
+}
+
+static const struct super_operations pohmelfs_sb_ops = {
+	.alloc_inode	= pohmelfs_alloc_inode,
+	.destroy_inode	= pohmelfs_destroy_inode,
+	.drop_inode	= pohmelfs_drop_inode,
+	.write_inode	= pohmelfs_write_inode,
+	.put_super	= pohmelfs_put_super,
+	.show_options   = pohmelfs_show_options,
+	.statfs		= pohmelfs_statfs,
+	.sync_fs	= pohmelfs_sync_fs,
+	.remount_fs	= pohmelfs_remount_fs,
+};
+
+static void pohmelfs_reconnect(struct work_struct *work)
+{
+	struct pohmelfs_sb *psb = container_of(to_delayed_work(work), struct pohmelfs_sb, reconnect_work);
+	struct pohmelfs_reconnect *r, *tmp;
+	struct pohmelfs_state *st, *stmp;
+	LIST_HEAD(head);
+	int err;
+
+	mutex_lock(&psb->reconnect_lock);
+	list_for_each_entry_safe(r, tmp, &psb->reconnect_list, reconnect_entry) {
+		st = pohmelfs_state_create(psb, &r->sa, r->addrlen, 1, r->group_id);
+		if (IS_ERR(st)) {
+			err = PTR_ERR(st);
+
+			if (err != -EEXIST)
+				continue;
+		} else {
+			pohmelfs_print_addr(&st->sa, "reconnected\n");
+		}
+
+		list_del(&r->reconnect_entry);
+		kfree(r);
+	}
+	mutex_unlock(&psb->reconnect_lock);
+
+	spin_lock(&psb->state_lock);
+	list_for_each_entry_safe(st, stmp, &psb->kill_state_list, state_entry) {
+		list_move(&st->state_entry, &head);
+	}
+	spin_unlock(&psb->state_lock);
+
+	list_for_each_entry_safe(st, stmp, &head, state_entry) {
+		list_del_init(&st->state_entry);
+		pohmelfs_state_kill(st);
+	}
+
+	if (!list_empty(&psb->reconnect_list))
+		queue_delayed_work(psb->wq, &psb->reconnect_work, psb->reconnect_timeout);
+}
+
+static int pohmelfs_init_psb(struct pohmelfs_sb *psb, struct super_block *sb)
+{
+	int err;
+	char name[16];
+
+	INIT_LIST_HEAD(&psb->state_list);
+	psb->route_root = RB_ROOT;
+
+	psb->inode_root = RB_ROOT;
+	spin_lock_init(&psb->inode_lock);
+
+	spin_lock_init(&psb->state_lock);
+
+	atomic_long_set(&psb->ino, 0);
+	atomic_long_set(&psb->trans, 0);
+
+	sb->s_fs_info = psb;
+	sb->s_op = &pohmelfs_sb_ops;
+	sb->s_magic = POHMELFS_MAGIC_NUM;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_bdi = &psb->bdi;
+	sb->s_time_gran = 0;
+
+	psb->read_wait_timeout = 5000;
+	psb->write_wait_timeout = 5000;
+
+	psb->sync_timeout = msecs_to_jiffies(30000);
+
+	psb->sb = sb;
+
+	psb->hash = crypto_alloc_hash("sha512", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(psb->hash)) {
+		err = PTR_ERR(psb->hash);
+		goto err_out_exit;
+	}
+
+	snprintf(name, sizeof(name), "pohmelfs-%d", psb->bdi_num);
+	psb->wq = alloc_workqueue(name, WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
+	if (!psb->wq) {
+		err = -ENOMEM;
+		goto err_out_crypto_free;
+	}
+
+	INIT_DELAYED_WORK(&psb->reconnect_work, pohmelfs_reconnect);
+	mutex_init(&psb->reconnect_lock);
+	INIT_LIST_HEAD(&psb->reconnect_list);
+	INIT_LIST_HEAD(&psb->kill_state_list);
+	psb->reconnect_timeout = msecs_to_jiffies(30000);
+
+	return 0;
+
+err_out_crypto_free:
+	crypto_free_hash(psb->hash);
+err_out_exit:
+	psb->sb = NULL;
+	sb->s_fs_info = NULL;
+	return err;
+}
+
+static int pohmelfs_parse_addr(char *addr, struct sockaddr_storage *a, int *addrlen)
+{
+	int family, port;
+	char *ptr;
+	int err = -EINVAL;
+
+	ptr = strrchr(addr, ':');
+	if (!ptr)
+		goto err_out_print_wrong_param;
+	*ptr++ = 0;
+	if (!ptr)
+		goto err_out_print_wrong_param;
+
+	family = simple_strtol(ptr, NULL, 10);
+
+	ptr = strrchr(addr, ':');
+	if (!ptr)
+		goto err_out_print_wrong_param;
+	*ptr++ = 0;
+	if (!ptr)
+		goto err_out_print_wrong_param;
+
+	port = simple_strtol(ptr, NULL, 10);
+
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)a;
+
+		sin->sin_family = family;
+		sin->sin_port = htons(port);
+
+		err = in4_pton(addr, strlen(addr), (u8 *)&sin->sin_addr, ':', NULL);
+		*addrlen = sizeof(struct sockaddr_in);
+	} else if (family == AF_INET6) {
+		struct sockaddr_in6 *sin = (struct sockaddr_in6 *)a;
+
+		sin->sin6_family = family;
+		sin->sin6_port = htons(port);
+		err = in6_pton(addr, strlen(addr), (u8 *)&sin->sin6_addr, ':', NULL);
+		*addrlen = sizeof(struct sockaddr_in6);
+	} else {
+		err = -ENOTSUPP;
+	}
+
+	if (err == 1)
+		err = 0;
+	else if (!err)
+		err = -EINVAL;
+
+	if (err)
+		goto err_out_print_wrong_param;
+
+	return 0;
+
+err_out_print_wrong_param:
+	pr_err("pohmelfs: %s: wrong addr: '%s', should be 'addr:port:family': %d.\n", __func__, addr, err);
+	return err;
+}
+
+static int pohmelfs_option(char *option, char *data, int *lenp, int have_data)
+{
+	int len;
+	char *ptr;
+
+	if (!strncmp(option, data, strlen(option))) {
+		len = strlen(option);
+		ptr = data + len;
+
+		if (have_data && (!ptr || !*ptr))
+			return 0;
+
+		*lenp = len;
+		return 1;
+	}
+
+	return 0;
+}
+
+static int pohmelfs_set_groups(struct pohmelfs_sb *psb, char *value, int len)
+{
+	int i, num = 0, start = 0, pos = 0;
+	char *ptr = value;
+
+	for (i = 0; i < len; ++i) {
+		if (value[i] == ':')
+			start = 0;
+		else if (!start) {
+			start = 1;
+			num++;
+		}
+	}
+
+	if (!num) {
+		return -ENOENT;
+	}
+
+	psb->groups = kzalloc(sizeof(int) * num, GFP_KERNEL);
+	if (!psb->groups)
+		return -ENOMEM;
+	psb->group_num = num;
+
+	start = 0;
+	for (i = 0; i < len; ++i) {
+		if (value[i] == ':') {
+			value[i] = '\0';
+			if (start) {
+				psb->groups[pos] = simple_strtol(ptr, NULL, 10);
+				pos++;
+				start = 0;
+			}
+		} else if (!start) {
+			ptr = &value[i];
+			start = 1;
+		}
+	}
+
+	if (start) {
+		psb->groups[pos] = simple_strtol(ptr, NULL, 10);
+		pos++;
+	}
+
+	return 0;
+}
+
+static int pohmelfs_parse_option(struct pohmelfs_sb *psb, char *data)
+{
+	int len;
+	int err = 0;
+
+	pr_debug("pohmelfs: %s: option: %s\n", __func__, data);
+
+	if (pohmelfs_option("server=", data, &len, 1)) {
+		int addrlen;
+		char *addr_str = data + len;
+		struct sockaddr_storage sa;
+		struct pohmelfs_state *st;
+
+		memset(&sa, 0, sizeof(struct sockaddr_storage));
+		err = pohmelfs_parse_addr(addr_str, &sa, &addrlen);
+		if (err)
+			goto err_out_exit;
+
+		st = pohmelfs_state_create(psb, &sa, addrlen, 1, 0);
+		if (IS_ERR(st)) {
+			err = PTR_ERR(st);
+			goto err_out_exit;
+		}
+	} else if (pohmelfs_option("fsid=", data, &len, 1)) {
+		data += len;
+		len = strlen(data);
+
+		psb->fsid = kmalloc(len + 1, GFP_KERNEL);
+		if (!psb->fsid) {
+			err = -ENOMEM;
+			goto err_out_exit;
+		}
+
+		snprintf(psb->fsid, len + 1, "%s", data);
+		psb->fsid_len = len;
+	} else if (pohmelfs_option("sync_timeout=", data, &len, 1)) {
+		psb->sync_timeout = msecs_to_jiffies(1000 * simple_strtol(data + len, NULL, 10));
+	} else if (pohmelfs_option("use_http_compat", data, &len, 0)) {
+		psb->use_http_compat = 1;
+	} else if (pohmelfs_option("groups=", data, &len, 1)) {
+		data += len;
+		len = strlen(data);
+
+		err = pohmelfs_set_groups(psb, data, len);
+	} else if (pohmelfs_option("noatime", data, &len, 0)) {
+		psb->sb->s_flags |= FS_NOATIME_FL;
+	} else if (pohmelfs_option("relatime", data, &len, 0)) {
+		psb->sb->s_flags |= MS_RELATIME;
+	} else if (pohmelfs_option("noreadcsum", data, &len, 0)) {
+		psb->no_read_csum = 1;
+	} else if (pohmelfs_option("readcsum", data, &len, 0)) {
+		psb->no_read_csum = 0;
+	} else {
+		err = -ENOTSUPP;
+	}
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_parse_options(struct pohmelfs_sb *psb, char *data)
+{
+	int err = -ENOENT;
+	char *ptr, *start;
+
+	ptr = start = data;
+
+	while (ptr && *ptr) {
+		if (*ptr == ',') {
+			*ptr = '\0';
+			err = pohmelfs_parse_option(psb, start);
+			if (err)
+				goto err_out_exit;
+			ptr++;
+			if (ptr && *ptr)
+				start = ptr;
+
+			continue;
+		}
+
+		ptr++;
+	}
+
+	if (start != ptr) {
+		err = pohmelfs_parse_option(psb, start);
+		if (err)
+			goto err_out_exit;
+	}
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct pohmelfs_sb *psb;
+	int err;
+
+	psb = kzalloc(sizeof(struct pohmelfs_sb), GFP_KERNEL);
+	if (!psb) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	psb->bdi_num = atomic_inc_return(&psb_bdi_num);
+
+	err = bdi_init(&psb->bdi);
+	if (err)
+		goto err_out_free_psb;
+
+	err = bdi_register(&psb->bdi, NULL, "pfs-%d", psb->bdi_num);
+	if (err) {
+		bdi_destroy(&psb->bdi);
+		goto err_out_free_psb;
+	}
+
+	err = pohmelfs_init_psb(psb, sb);
+	if (err)
+		goto err_out_free_bdi;
+
+	psb->root = pohmelfs_new_inode(psb, 0755|S_IFDIR);
+	if (IS_ERR(psb->root)) {
+		err = PTR_ERR(psb->root);
+		goto err_out_cleanup_psb;
+	}
+
+	err = pohmelfs_parse_options(psb, data);
+	if (err)
+		goto err_out_put_root;
+
+	if (!psb->group_num) {
+		err = -EINVAL;
+		pr_err("pohmelfs: you did not specify groups option, which is mandatory\n");
+		goto err_out_put_root;
+	}
+
+	if (!psb->fsid_len) {
+		char str[] = "pohmelfs";
+		err = pohmelfs_hash(psb, str, 8, &psb->root->id);
+	} else {
+		err = pohmelfs_hash(psb, psb->fsid, psb->fsid_len, &psb->root->id);
+	}
+	if (err)
+		goto err_out_put_root;
+
+	psb->root->parent_id = psb->root->id;
+
+	sb->s_root = d_alloc_root(&psb->root->vfs_inode);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto err_out_put_root;
+	}
+
+	return 0;
+
+err_out_put_root:
+	iput(&psb->root->vfs_inode);
+err_out_cleanup_psb:
+	pohmelfs_cleanup_psb(psb);
+err_out_free_bdi:
+	bdi_destroy(&psb->bdi);
+err_out_free_psb:
+	kfree(psb);
+err_out_exit:
+	pr_err("pohmelfs: %s: error: %d\n", __func__, err);
+	return err;
+}
+
+static struct dentry *pohmelfs_mount(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, pohmelfs_fill_super);
+}
+
+static void pohmelfs_kill_sb(struct super_block *sb)
+{
+	sync_inodes_sb(sb);
+	kill_anon_super(sb);
+}
+
+static struct file_system_type pohmelfs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "pohmelfs",
+	.mount		= pohmelfs_mount,
+	.kill_sb	= pohmelfs_kill_sb,
+};
+
+static void pohmelfs_cleanup_cache(void)
+{
+	kmem_cache_destroy(pohmelfs_trans_cache);
+	kmem_cache_destroy(pohmelfs_inode_cache);
+	kmem_cache_destroy(pohmelfs_inode_info_cache);
+	kmem_cache_destroy(pohmelfs_route_cache);
+	kmem_cache_destroy(pohmelfs_wait_cache);
+	kmem_cache_destroy(pohmelfs_io_cache);
+	kmem_cache_destroy(pohmelfs_inode_info_binary_package_cache);
+	kfree(pohmelfs_scratch_buf);
+	kmem_cache_destroy(pohmelfs_write_cache);
+}
+
+static int pohmelfs_init_cache(void)
+{
+	int err = -ENOMEM;
+
+	pohmelfs_inode_cache = KMEM_CACHE(pohmelfs_inode, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_inode_cache)
+		goto err_out_exit;
+
+	pohmelfs_trans_cache = KMEM_CACHE(pohmelfs_trans, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_trans_cache)
+		goto err_out_destroy_inode_cache;
+
+	pohmelfs_inode_info_cache = KMEM_CACHE(pohmelfs_inode_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_inode_info_cache)
+		goto err_out_destroy_trans_cache;
+
+	pohmelfs_route_cache = KMEM_CACHE(pohmelfs_route, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_route_cache)
+		goto err_out_destroy_inode_info_cache;
+
+	pohmelfs_wait_cache = KMEM_CACHE(pohmelfs_wait, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_wait_cache)
+		goto err_out_destroy_inode_info_cache;
+
+	pohmelfs_io_cache = KMEM_CACHE(pohmelfs_io, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_io_cache)
+		goto err_out_destroy_wait_cache;
+
+	pohmelfs_scratch_buf = kmalloc(pohmelfs_scratch_buf_size, GFP_KERNEL);
+	if (!pohmelfs_scratch_buf) {
+		err = -ENOMEM;
+		goto err_out_destroy_io_cache;
+	}
+
+	pohmelfs_inode_info_binary_package_cache = KMEM_CACHE(pohmelfs_inode_info_binary_package, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_inode_info_binary_package_cache)
+		goto err_out_free_scratch;
+
+	pohmelfs_write_cache = KMEM_CACHE(pohmelfs_write_ctl, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (!pohmelfs_write_cache)
+		goto err_out_destroy_inode_info_binary_package_cache;
+
+	return 0;
+
+err_out_destroy_inode_info_binary_package_cache:
+	kmem_cache_destroy(pohmelfs_inode_info_binary_package_cache);
+err_out_free_scratch:
+	kfree(pohmelfs_scratch_buf);
+err_out_destroy_io_cache:
+	kmem_cache_destroy(pohmelfs_io_cache);
+err_out_destroy_wait_cache:
+	kmem_cache_destroy(pohmelfs_wait_cache);
+err_out_destroy_inode_info_cache:
+	kmem_cache_destroy(pohmelfs_inode_info_cache);
+err_out_destroy_trans_cache:
+	kmem_cache_destroy(pohmelfs_trans_cache);
+err_out_destroy_inode_cache:
+	kmem_cache_destroy(pohmelfs_inode_cache);
+err_out_exit:
+	return err;
+}
+
+static int __init pohmelfs_init(void)
+{
+	int err;
+
+	err = pohmelfs_init_cache();
+	if (err)
+		goto err_out_exit;
+
+        err = register_filesystem(&pohmelfs_type);
+	if (err)
+		goto err_out_cleanup_cache;
+
+	return 0;
+
+err_out_cleanup_cache:
+	pohmelfs_cleanup_cache();
+err_out_exit:
+	return err;
+}
+
+static void __exit pohmelfs_exit(void)
+{
+	unregister_filesystem(&pohmelfs_type);
+	pohmelfs_cleanup_cache();
+}
+
+module_init(pohmelfs_init)
+module_exit(pohmelfs_exit)
+
+MODULE_AUTHOR("Evgeniy Polyakov <zbr@...emap.net>");
+MODULE_DESCRIPTION("POHMELFS");
+MODULE_LICENSE("GPL");
diff --git a/fs/pohmelfs/symlink.c b/fs/pohmelfs/symlink.c
new file mode 100644
index 0000000..80a9d87
--- /dev/null
+++ b/fs/pohmelfs/symlink.c
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/namei.h>
+
+#include "pohmelfs.h"
+
+const struct inode_operations pohmelfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
diff --git a/fs/pohmelfs/trans.c b/fs/pohmelfs/trans.c
new file mode 100644
index 0000000..fcd1aa4
--- /dev/null
+++ b/fs/pohmelfs/trans.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright (C) 2011+ Evgeniy Polyakov <zbr@...emap.net>
+ */
+
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "pohmelfs.h"
+
+static void pohmelfs_trans_free(struct pohmelfs_trans *t)
+{
+	iput(t->inode);
+
+	kmem_cache_free(pohmelfs_trans_cache, t);
+}
+
+static void pohmelfs_trans_release(struct kref *kref)
+{
+	struct pohmelfs_trans *t = container_of(kref, struct pohmelfs_trans, refcnt);
+	struct pohmelfs_inode *pi = pohmelfs_inode(t->inode);
+
+	pr_debug("pohmelfs: %s: trans freed: %lu, recv_offset: %llu, ino: %ld\n",
+			pohmelfs_dump_id(pi->id.id), t->trans, t->recv_offset, t->inode->i_ino);
+
+	if (t->cb.destroy)
+		t->cb.destroy(t);
+
+	pohmelfs_state_put(t->st);
+
+	kfree(t->data);
+	kfree(t->recv_data);
+	pohmelfs_trans_free(t);
+}
+
+void pohmelfs_trans_put(struct pohmelfs_trans *t)
+{
+	kref_put(&t->refcnt, pohmelfs_trans_release);
+}
+
+struct pohmelfs_trans *pohmelfs_trans_alloc(struct inode *inode)
+{
+	struct pohmelfs_trans *t;
+	int err;
+
+	t = kmem_cache_zalloc(pohmelfs_trans_cache, GFP_NOIO);
+	if (!t) {
+		err = -ENOMEM;
+		goto err_out_exit;
+	}
+
+	kref_init(&t->refcnt);
+
+	t->inode = igrab(inode);
+	if (!t->inode) {
+		err = -ENOENT;
+		goto err_out_free;
+	}
+
+	return t;
+
+err_out_free:
+	kmem_cache_free(pohmelfs_trans_cache, t);
+err_out_exit:
+	return ERR_PTR(err);
+}
+
+static int pohmelfs_buf_complete(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct pohmelfs_inode *pi = pohmelfs_inode(t->inode);
+	struct dnet_cmd *cmd = &recv->cmd;
+	unsigned long long trans = cmd->trans & ~DNET_TRANS_REPLY;
+
+	pr_debug("pohmelfs: %s: trans complete: %llu, flags: %x\n",
+			pohmelfs_dump_id(pi->id.id), trans, cmd->flags);
+
+	return 0;
+}
+
+static int pohmelfs_buf_recv(struct pohmelfs_trans *t, struct pohmelfs_state *recv)
+{
+	struct dnet_cmd *cmd = &recv->cmd;
+	int err;
+
+	if (!t->recv_data) {
+		t->recv_data = kmalloc(cmd->size, GFP_NOIO);
+		if (!t->recv_data) {
+			err = -ENOMEM;
+			goto err_out_exit;
+		}
+
+		t->recv_offset = 0;
+	}
+
+	err = pohmelfs_data_recv(recv, t->recv_data + t->recv_offset, cmd->size - t->recv_offset, MSG_DONTWAIT);
+	if (err < 0)
+		goto err_out_exit;
+
+	t->recv_offset += err;
+	err = 0;
+
+err_out_exit:
+	return err;
+}
+
+static int pohmelfs_init_callbacks(struct pohmelfs_trans *t, struct pohmelfs_io *pio)
+{
+	int err = 0;
+	struct pohmelfs_state *st = t->st;
+
+	t->priv = pio->priv;
+	t->cb = pio->cb;
+
+	if (!t->cb.complete)
+		t->cb.complete = pohmelfs_buf_complete;
+
+	if (!t->cb.recv_reply)
+		t->cb.recv_reply = pohmelfs_buf_recv;
+
+	if (t->cb.init) {
+		err = t->cb.init(t);
+		if (err)
+			goto err_out_exit;
+	}
+
+	pohmelfs_trans_insert(t);
+
+	pohmelfs_state_schedule(st);
+	pohmelfs_state_put(st);
+
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_send_io_group(struct pohmelfs_io *pio, int group)
+{
+	struct pohmelfs_inode *pi = pio->pi;
+	struct inode *inode = &pi->vfs_inode;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	struct pohmelfs_state *st;
+	struct pohmelfs_trans *t;
+	struct dnet_cmd *cmd;
+	struct dnet_attr *attr;
+	struct dnet_io_attr *io;
+	u64 iosize = pio->size;
+	u64 alloc_io_size = pio->size;
+	int err;
+
+	/* Dirty hack to prevent setting cmd/attr size to pio->size,
+	 * since in read command we specify in io->size number bytes we want,
+	 * and it should not be accounted in the packet we send to remote node
+	 */
+	if (pio->cmd == DNET_CMD_READ)
+		alloc_io_size = 0;
+
+	t = pohmelfs_trans_alloc(inode);
+	if (IS_ERR(t)) {
+		err = PTR_ERR(t);
+		goto err_out_exit;
+	}
+
+	st = pohmelfs_state_lookup(psb, pio->id, group);
+	if (!st) {
+		err = -ENOENT;
+		goto err_out_free;
+	}
+
+	t->st = st;
+	pohmelfs_state_get(st);
+
+	cmd = &t->cmd.cmd;
+	attr = &t->cmd.attr;
+	io = &t->cmd.p.io;
+
+	dnet_setup_id(&cmd->id, group, pio->id->id);
+	cmd->flags = pio->cflags;
+	cmd->trans = t->trans = atomic_long_inc_return(&psb->trans);
+	cmd->size = alloc_io_size + sizeof(struct dnet_io_attr) + sizeof(struct dnet_attr);
+
+	attr->cmd = pio->cmd;
+	attr->size = alloc_io_size + sizeof(struct dnet_io_attr);
+	attr->flags = pio->aflags;
+
+	memcpy(io->id, pio->id->id, DNET_ID_SIZE);
+	memcpy(io->parent, pio->id->id, DNET_ID_SIZE);
+	io->flags = pio->ioflags;
+	io->size = iosize;
+	io->offset = pio->offset;
+	io->type = pio->type;
+	io->start = pio->start;
+	io->num = pio->num;
+
+	t->header_size = sizeof(struct dnet_cmd) + sizeof(struct dnet_attr) + sizeof(struct dnet_io_attr);
+	t->data_size = alloc_io_size;
+
+	dnet_convert_cmd(cmd);
+	dnet_convert_attr(attr);
+	dnet_convert_io_attr(io);
+
+	t->wctl = pio->wctl;
+
+	if (pio->data) {
+		if (pio->alloc_flags & POHMELFS_IO_OWN) {
+			t->data = pio->data;
+		} else {
+			t->data = kmalloc(alloc_io_size, GFP_NOIO);
+			if (!t->data) {
+				err = -ENOMEM;
+				goto err_out_put_state;
+			}
+
+			memcpy(t->data, pio->data, alloc_io_size);
+		}
+	}
+
+	err = pohmelfs_init_callbacks(t, pio);
+	if (err)
+		goto err_out_put_state;
+
+
+	return 0;
+
+err_out_put_state:
+	pohmelfs_state_put(t->st);
+err_out_free:
+	pohmelfs_trans_free(t);
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_send_io(struct pohmelfs_io *pio)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(pio->pi->vfs_inode.i_sb);
+	int i, err, err_num;
+
+	err = -ENOENT;
+	err_num = 0;
+
+	for (i = 0; i < psb->group_num; ++i) {
+		err = pohmelfs_send_io_group(pio, psb->groups[i]);
+		if (err)
+			err_num++;
+	}
+
+	return (err_num == psb->group_num) ? err : 0;
+}
+
+int pohmelfs_trans_insert(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_state *st = t->st;
+
+	mutex_lock(&st->trans_lock);
+	list_add_tail(&t->trans_entry, &st->trans_list);
+	mutex_unlock(&st->trans_lock);
+
+	return 0;
+}
+
+void pohmelfs_trans_remove(struct pohmelfs_trans *t)
+{
+	struct pohmelfs_state *st = t->st;
+
+	mutex_lock(&st->trans_lock);
+	list_del(&t->trans_entry);
+	mutex_unlock(&st->trans_lock);
+}
+
+struct pohmelfs_trans *pohmelfs_trans_lookup(struct pohmelfs_state *st, struct dnet_cmd *cmd)
+{
+	struct pohmelfs_trans *t, *found = NULL;
+	u64 trans = cmd->trans & ~DNET_TRANS_REPLY;
+
+	mutex_lock(&st->trans_lock);
+	list_for_each_entry(t, &st->sent_trans_list, trans_entry) {
+		if (trans == t->trans) {
+			found = t;
+
+			kref_get(&t->refcnt);
+			break;
+		}
+	}
+	mutex_unlock(&st->trans_lock);
+
+	return found;
+}
+
+int pohmelfs_send_buf_single(struct pohmelfs_io *pio, struct pohmelfs_state *st)
+{
+	struct pohmelfs_inode *pi = pio->pi;
+	struct inode *inode = &pi->vfs_inode;
+	struct pohmelfs_sb *psb = pohmelfs_sb(inode->i_sb);
+	struct pohmelfs_trans *t;
+	struct dnet_cmd *cmd;
+	struct dnet_attr *attr;
+	int err;
+
+	t = pohmelfs_trans_alloc(inode);
+	if (IS_ERR(t)) {
+		err = PTR_ERR(t);
+		goto err_out_exit;
+	}
+
+	if (!st) {
+		st = pohmelfs_state_lookup(psb, pio->id, pio->group_id);
+		if (!st) {
+			err = -ENOENT;
+			goto err_out_free;
+		}
+	} else {
+		pohmelfs_state_get(st);
+	}
+
+	t->st = st;
+	pohmelfs_state_get(st);
+
+	cmd = &t->cmd.cmd;
+	attr = &t->cmd.attr;
+
+	dnet_setup_id(&cmd->id, st->group_id, pio->id->id);
+	cmd->flags = pio->cflags;
+	cmd->trans = t->trans = atomic_long_inc_return(&psb->trans);
+	cmd->size = pio->size + sizeof(struct dnet_attr);
+
+	attr->cmd = pio->cmd;
+	attr->size = pio->size;
+	attr->flags = pio->aflags;
+
+	t->header_size = sizeof(struct dnet_cmd) + sizeof(struct dnet_attr);
+	t->data_size = pio->size;
+
+	dnet_convert_cmd(cmd);
+	dnet_convert_attr(attr);
+
+	if (pio->data) {
+		if (pio->alloc_flags & POHMELFS_IO_OWN) {
+			t->data = pio->data;
+		} else {
+			t->data = kmalloc(pio->size, GFP_NOIO);
+			if (!t->data) {
+				err = -ENOMEM;
+				goto err_out_put_state;
+			}
+
+			memcpy(t->data, pio->data, pio->size);
+		}
+	}
+
+	err = pohmelfs_init_callbacks(t, pio);
+	if (err)
+		goto err_out_put_state;
+
+	return 0;
+
+err_out_put_state:
+	pohmelfs_state_put(t->st);
+err_out_free:
+	pohmelfs_trans_free(t);
+err_out_exit:
+	return err;
+}
+
+int pohmelfs_send_buf(struct pohmelfs_io *pio)
+{
+	struct pohmelfs_sb *psb = pohmelfs_sb(pio->pi->vfs_inode.i_sb);
+	int i, err, err_num;
+
+	err = -ENOENT;
+	err_num = 0;
+
+	for (i = 0; i < psb->group_num; ++i) {
+		pio->group_id = psb->groups[i];
+
+		err = pohmelfs_send_buf_single(pio, NULL);
+		if (err)
+			err_num++;
+	}
+
+	return (err_num == psb->group_num) ? err : 0;
+}

-- 
	Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ