[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <07626129f7c0665c7b65061cb091c6a2e2e4deb6.1684913827.git.durui@linux.alibaba.com>
Date: Wed, 24 May 2023 15:40:52 +0800
From: Du Rui <durui@...ux.alibaba.com>
To: dm-devel@...hat.com
Cc: linux-kernel@...r.kernel.org, Alasdair Kergon <agk@...hat.com>,
Mike Snitzer <snitzer@...nel.org>,
Du Rui <durui@...ux.alibaba.com>,
Yuan Yifan <yifan.yuan@...ux.alibaba.com>,
Liu Lanzheng <liulz@...ux.alibaba.com>
Subject: [RFC PATCH v3] dm overlaybd: targets mapping OverlayBD image
OverlayBD is a generic layering block-level image format, designed
for containers and secure containers and applicable to virtual machines,
published in USENIX ATC '20
https://www.usenix.org/system/files/atc20-li-huiba.pdf
OverlayBD already has a ContainerD non-core sub-project implementation
in userspace, as an accelerated container image service
https://github.com/containerd/accelerated-container-image
It could be much more efficient to do the decompression and mapping work
in the kernel within the device-mapper framework, in many circumstances,
such as secure container runtimes, mobile devices, etc.
We hope it can be used not only for container images, but also in other
situations that need read-only layering block device images.
This patch contains a module, dm-overlaybd, which provides two kinds of
targets, dm-zfile and dm-lsmt, to expose a group of block devices containing
an OverlayBD image as an overlaid read-only block device.
Signed-off-by: Du Rui <durui@...ux.alibaba.com>
Signed-off-by: Yuan Yifan <yifan.yuan@...ux.alibaba.com>
Signed-off-by: Liu Lanzheng <liulz@...ux.alibaba.com>
---
Sorry for sending not-well checked mail.
v3 Changed:
- Fix some bugs
- Fix member sign-off address
.../device-mapper/dm-overlaybd.rst | 71 +++
drivers/md/Kconfig | 2 +
drivers/md/Makefile | 1 +
drivers/md/overlaybd/Kconfig | 37 ++
drivers/md/overlaybd/Makefile | 4 +
drivers/md/overlaybd/dm-lsmt.c | 162 +++++
drivers/md/overlaybd/dm-lsmtformat.c | 574 ++++++++++++++++++
drivers/md/overlaybd/dm-ovbd-blkfile.c | 134 ++++
drivers/md/overlaybd/dm-ovbd.c | 45 ++
drivers/md/overlaybd/dm-ovbd.h | 45 ++
drivers/md/overlaybd/dm-zfile.c | 154 +++++
drivers/md/overlaybd/dm-zfileformat.c | 441 ++++++++++++++
12 files changed, 1670 insertions(+)
create mode 100644 Documentation/admin-guide/device-mapper/dm-overlaybd.rst
create mode 100644 drivers/md/overlaybd/Kconfig
create mode 100644 drivers/md/overlaybd/Makefile
create mode 100644 drivers/md/overlaybd/dm-lsmt.c
create mode 100644 drivers/md/overlaybd/dm-lsmtformat.c
create mode 100644 drivers/md/overlaybd/dm-ovbd-blkfile.c
create mode 100644 drivers/md/overlaybd/dm-ovbd.c
create mode 100644 drivers/md/overlaybd/dm-ovbd.h
create mode 100644 drivers/md/overlaybd/dm-zfile.c
create mode 100644 drivers/md/overlaybd/dm-zfileformat.c
diff --git a/Documentation/admin-guide/device-mapper/dm-overlaybd.rst b/Documentation/admin-guide/device-mapper/dm-overlaybd.rst
new file mode 100644
index 000000000000..ad48cc7b57c7
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/dm-overlaybd.rst
@@ -0,0 +1,71 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+dm-overlaybd
+============
+
+The device-mapper OverlayBD (dm-overlaybd) module merges block devices that
+contain OverlayBD format layers into a read-only OverlayBD block device.
+
+The OverlayBD is a block-device based container images system, based on
+random-readable compressed format ZFile, and multi-layer overlay block-device
+format LSMTFile.
+
+The following targets are provided by dm-overlaybd
+--------------------------------------------------
+
+- dm-zfile
+- dm-lsmt
+
+
+dm-zfile Table parameters
+-------------------------
+ <dev path> <ZFile size>
+
+Parameters:
+
+ <dev path>:
+ Full pathname to the underlying block-device,
+ <ZFile size>:
+ ZFile data length, in unit of byte.
+
+Examples:
+
+With ZFile format data lying on /dev/vdb and a ZFile length of 658971539
+bytes, the table should look like:
+
+zfile_decompressed /dev/vdb 658971539
+
+
+dm-lsmt Table parameters
+------------------------
+ <baselayer dev path> <baselayer LSMTFile size>
+ [<upperlayer dev path> <upperlayer LSMTFile size>]
+
+Mandatory parameters:
+
+ <baselayer dev path>:
+ Full pathname to the underlying block-device, usually is a mapped
+ ZFile device.
+ <baselayer LSMTFile size>:
+ LSMTFile data length, in bytes.
+
+
+Optional parameter:
+
+ <upperlayer dev path> <upperlayer LSMTFile size>:
+ Multi-layer LSMTFile could overlay as stack, just like container
+ images.
+ The upper layers also described in table as optional parameters.
+ Each layer should tell the Full pathname of underlying block-device,
+ and a file size described LSMTFile data length in unit of byte.
+
+
+Examples:
+
+The base layer is mapped by dm-zfile in /dev/mapper/lsmt_base, and one upper
+layer is /dev/mapper/lsmt_upper that described changes from base layer.
+Assume that base layer LSMTFile length is 1,006,923,776 bytes, and upper layer is
+682,206,208 bytes.
+
+merged /dev/mapper/lsmt_base 1006923776 /dev/mapper/lsmt_upper 682206208
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b0a22e99bade..4ae4fb628712 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -656,4 +656,6 @@ config DM_AUDIT
Enables audit logging of several security relevant events in the
particular device-mapper targets, especially the integrity target.
+source "drivers/md/overlaybd/Kconfig"
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 84291e38dca8..01b6da500a6a 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
obj-$(CONFIG_DM_ZONED) += dm-zoned.o
obj-$(CONFIG_DM_WRITECACHE) += dm-writecache.o
obj-$(CONFIG_SECURITY_LOADPIN_VERITY) += dm-verity-loadpin.o
+obj-$(CONFIG_DM_OVERLAYBD) += overlaybd/
ifeq ($(CONFIG_DM_INIT),y)
dm-mod-objs += dm-init.o
diff --git a/drivers/md/overlaybd/Kconfig b/drivers/md/overlaybd/Kconfig
new file mode 100644
index 000000000000..a602e2a2f0c7
--- /dev/null
+++ b/drivers/md/overlaybd/Kconfig
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config DM_OVERLAYBD
+ tristate "OverlayBD device mapper target"
+ select LZ4_DECOMPRESS
+ select DM_BUFIO
+ default m
+ help
+ Allows OverlayBD images, such as compressed block devices, to be loaded
+ by device-mapper. This module provides two targets: dm-zfile for
+ decompression, and dm-lsmt for block-device overlay.
+ OverlayBD is now one of the ContainerD on-demand image solutions.
+
+if DM_OVERLAYBD
+
+ config ZFILE_READAHEAD
+ bool "OverlayBD ZFile readahead support"
+ depends on DM_OVERLAYBD
+ default y
+ help
+ Fetch ZFile compressed data with readahead.
+ Read-ahead prefetching is performed when reading compressed data.
+ This option may improve read performance, but consumes more
+ memory and may read data that is never used.
+
+ config ZFILE_CLEANUP_CACHE
+ bool "OverlayBD ZFile compressed data cache cleanup support"
+ depends on DM_OVERLAYBD
+ default y
+ help
+ Clean up the compressed data cache once a block has been decompressed.
+ If set, drop the buffer when the rear end of a compressed page was read.
+ Because VFS keeps a page cache for decompressed data, the compressed
+ pages are usually read only once in the short term. This option usually
+ improves cache performance.
+
+endif #DM_OVERLAYBD
diff --git a/drivers/md/overlaybd/Makefile b/drivers/md/overlaybd/Makefile
new file mode 100644
index 000000000000..9967e03e6400
--- /dev/null
+++ b/drivers/md/overlaybd/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DM_OVERLAYBD) += dm-overlaybd.o
+dm-overlaybd-y := dm-ovbd-blkfile.o dm-lsmtformat.o dm-zfileformat.o dm-lsmt.o dm-zfile.o dm-ovbd.o
\ No newline at end of file
diff --git a/drivers/md/overlaybd/dm-lsmt.c b/drivers/md/overlaybd/dm-lsmt.c
new file mode 100644
index 000000000000..c183cd471c15
--- /dev/null
+++ b/drivers/md/overlaybd/dm-lsmt.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dm-ovbd.h"
+
+struct lsmt_dm_target {
+ struct dm_dev *dev[256];
+ struct vfile *lsmt;
+ struct vfile *bf[256];
+ unsigned int nr;
+};
+
+static int lsmt_target_map(struct dm_target *ti, struct bio *bio)
+{
+ struct lsmt_dm_target *mdt = (struct lsmt_dm_target *)ti->private;
+
+ if (!mdt) {
+ pr_err("LSMT DM Target not ready!!\n");
+ return DM_MAPIO_REQUEUE;
+ }
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ return mdt->lsmt->ops->bio_remap((struct vfile *)mdt->lsmt, bio,
+ mdt->dev, mdt->nr);
+ default:
+ pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+ bio_op(bio), bio->bi_status);
+ return DM_MAPIO_KILL;
+ }
+}
+
+static int lsmt_target_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
+{
+ if (bio->bi_status != BLK_STS_OK) {
+ pr_err("DONE NOT OK %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+ bio_op(bio), bio->bi_status);
+ return DM_ENDIO_REQUEUE;
+ }
+ return DM_ENDIO_DONE;
+}
+
+/*
+ * Constructor. argv carries "<dev path> <LSMTFile size in bytes>" pairs, base
+ * layer first. Each device is opened as a vfile and the LSMT index is built on
+ * top. On failure everything acquired so far is released again.
+ */
+static int lsmt_target_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct lsmt_dm_target *mdt;
+	const char *devname;
+	const char *tail;
+	struct dm_arg_set args = { .argc = argc, .argv = argv };
+	unsigned long len;
+	int i;
+
+	pr_debug("\n >>in function %s\n", __func__);
+
+	if (argc < 2 || argc > 2 * ARRAY_SIZE(mdt->dev)) {
+		pr_warn("\n Invalid no.of arguments.\n");
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	/* Zeroed so nr, dev[] and bf[] are well defined for the unwind below. */
+	mdt = kzalloc(sizeof(*mdt), GFP_KERNEL);
+	if (!mdt) {
+		ti->error = "dm-lsmt_target: Cannot allocate context";
+		return -ENOMEM;
+	}
+
+	while (args.argc >= 2) {
+		i = mdt->nr;
+		devname = dm_shift_arg(&args);
+		tail = dm_shift_arg(&args);
+		if (kstrtoul(tail, 10, &len) < 0) {
+			pr_warn("Invalid parameter");
+			goto error_out;
+		}
+		pr_info("\nlsmt-md: load dev %s\n", devname);
+		if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
+				  &mdt->dev[i])) {
+			ti->error = "dm-lsmt_target: Device lookup failed";
+			goto error_out;
+		}
+		/* Count the device right away so error_out puts it back. */
+		mdt->nr = i + 1;
+		mdt->bf[i] = open_blkdev_as_vfile(mdt->dev[i]->bdev, len);
+		if (!mdt->bf[i]) {
+			pr_warn("failed to open dev as vfile\n");
+			goto error_out;
+		}
+		pr_info("lsmt: file %d size %zu", i,
+			mdt->bf[i]->ops->len(mdt->bf[i]));
+	}
+
+	/*
+	 * NOTE(review): only the first layer is opened even when several were
+	 * given - presumably because lsmt_open_files() reorders bf[] but not
+	 * dev[]; confirm before passing mdt->nr here.
+	 */
+	mdt->lsmt = lsmt_open_files(mdt->bf, 1);
+	if (!mdt->lsmt) {
+		pr_crit("Failed to open lsmt file");
+		goto error_out;
+	}
+
+	pr_info("dm-lsmt: blk size is %zu\n", mdt->lsmt->ops->len(mdt->lsmt));
+	ti->private = mdt;
+	pr_debug("\n>>out function %s\n", __func__);
+	return 0;
+
+error_out:
+	for (i = 0; i < mdt->nr; i++) {
+		if (mdt->bf[i])
+			mdt->bf[i]->ops->close(mdt->bf[i]);
+		dm_put_device(ti, mdt->dev[i]);
+	}
+	kfree(mdt);
+	pr_debug("\n>>out function %s with error\n", __func__);
+	return -EINVAL;
+}
+
+/* Destructor; mdt->lsmt does not own the layer vfiles, so close them here. */
+static void lsmt_target_dtr(struct dm_target *ti)
+{
+	struct lsmt_dm_target *mdt = ti->private;
+
+	if (mdt->lsmt)
+		mdt->lsmt->ops->close(mdt->lsmt);
+	for (unsigned int i = 0; i < mdt->nr; i++) {
+		mdt->bf[i]->ops->close(mdt->bf[i]);
+		dm_put_device(ti, mdt->dev[i]);
+	}
+	kfree(mdt);
+}
+
+static struct target_type lsmt_target = {
+ .features = 0,
+ .name = "lsmt_target",
+ .version = { 1, 0, 0 },
+ .module = THIS_MODULE,
+ .ctr = lsmt_target_ctr,
+ .dtr = lsmt_target_dtr,
+ .map = lsmt_target_map,
+ .end_io = lsmt_target_end_io,
+};
+
+int init_lsmt_target(void)
+{
+ int result;
+
+ result = dm_register_target(&lsmt_target);
+ if (result < 0)
+ pr_warn("\n Error in registering target\n");
+ return 0;
+}
+
+void cleanup_lsmt_target(void)
+{
+ dm_unregister_target(&lsmt_target);
+}
diff --git a/drivers/md/overlaybd/dm-lsmtformat.c b/drivers/md/overlaybd/dm-lsmtformat.c
new file mode 100644
index 000000000000..be881f000ab8
--- /dev/null
+++ b/drivers/md/overlaybd/dm-lsmtformat.c
@@ -0,0 +1,574 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dm-ovbd.h"
+#include <linux/vmalloc.h>
+
+#define REVERSE_ARRAY(type, begin, back) \
+ { \
+ type *l = (begin); \
+ type *r = (back); \
+ while (l < r) { \
+ type tmp = *l; \
+ *l = *r; \
+ *r = tmp; \
+ l++; \
+ r--; \
+ } \
+ }
+
+#define UINT64_MAX 0xFFFFFFFFFFFFFFFFULL
+#define ALIGNMENT 512U
+
+#define TYPE_SEGMENT 0
+#define TYPE_SEGMENT_MAPPING 1
+#define TYPE_FILDES 2
+#define TYPE_LSMT_RO_INDEX 3
+
+#define OVBD_MAX_LAYERS 256
+
+static const u64 INVALID_OFFSET = ((u64)1 << 50) - 1;
+static const u32 HT_SPACE = 4096;
+static u64 *MAGIC0 = (u64 *)"LSMT\0\1\2";
+static const uuid_t MAGIC1 = UUID_INIT(0x657e63d2, 0x9444, 0x084c, 0xa2, 0xd2,
+ 0xc8, 0xec, 0x4f, 0xcf, 0xae, 0x8a);
+
+struct lsmt_ht {
+ u64 magic0;
+ uuid_t magic1;
+ // offset 24, 28
+ u32 size; //= sizeof(HeaderTrailer);
+ u32 flags; //= 0;
+ // offset 32, 40, 48
+ u64 index_offset; // in bytes
+ u64 index_size; // # of SegmentMappings
+ u64 virtual_size; // in bytes
+} __packed;
+
+struct segment {
+ u64 offset : 50;
+ u32 length : 14;
+};
+
+struct segment_mapping { /* 8 + 8 bytes */
+ u64 offset : 50; // offset (0.5 PB if in sector)
+ u32 length : 14;
+ u64 moffset : 55; // mapped offset (2^64 B if in sector)
+ u32 zeroed : 1; // indicating a zero-filled segment
+ u8 tag;
+} __packed;
+
+struct lsmt_ro_index {
+ const struct segment_mapping *pbegin;
+ const struct segment_mapping *pend;
+ struct segment_mapping *mapping;
+};
+
+struct lsmt_ro_file {
+ struct vfile_operations *ops;
+ bool ownership;
+ int nr;
+ struct lsmt_ht ht;
+ struct lsmt_ro_index *index;
+ struct bio_set split_set;
+ struct vfile *fp[0];
+};
+
+static size_t lsmt_len(struct vfile *fp);
+static void lsmt_close(struct vfile *ctx);
+static int lsmt_bioremap(struct vfile *ctx, struct bio *bio,
+ struct dm_dev **dev, unsigned int nr);
+
+static struct vfile_operations lsmt_ops = { .len = lsmt_len,
+ .blkdev = NULL,
+ .pread = NULL,
+ .close = lsmt_close,
+ .bio_remap = lsmt_bioremap };
+
+static u64 segment_end(const void *s)
+{
+ return ((struct segment *)s)->offset + ((struct segment *)s)->length;
+}
+
+static void forward_offset_to(void *m, u64 x, int8_t type)
+{
+ struct segment *s = (struct segment *)m;
+ u64 delta = x - s->offset;
+
+ s->offset = x;
+ s->length -= delta;
+ if (type == TYPE_SEGMENT_MAPPING) {
+ struct segment_mapping *tmp = (struct segment_mapping *)m;
+
+ if (!tmp->zeroed)
+ tmp->moffset += delta;
+ }
+}
+
+static void backward_end_to(void *m, u64 x)
+{
+ struct segment *s = (struct segment *)m;
+
+ s->length = x - s->offset;
+}
+
+static void trim_edge(void *m, const struct segment *bound_segment, u8 type)
+{
+ if (((struct segment *)m)->offset < bound_segment->offset)
+ forward_offset_to(m, bound_segment->offset, type);
+ if (segment_end(m) > segment_end(bound_segment))
+ backward_end_to(m, segment_end(bound_segment));
+}
+
+static const struct segment_mapping *
+ro_index_lower_bound(const struct lsmt_ro_index *index, u64 offset)
+{
+ const struct segment_mapping *l = index->pbegin;
+ const struct segment_mapping *r = index->pend - 1;
+ const struct segment_mapping *pret;
+ int ret = -1;
+
+ while (l <= r) {
+ int m = ((l - index->pbegin) + (r - index->pbegin)) >> 1;
+ const struct segment_mapping *cmp = index->pbegin + m;
+
+ if (offset >= segment_end(cmp)) {
+ ret = m;
+ l = index->pbegin + (m + 1);
+ } else {
+ r = index->pbegin + (m - 1);
+ }
+ }
+ pret = index->pbegin + (ret + 1);
+ if (pret >= index->pend)
+ return index->pend;
+ else
+ return pret;
+}
+
+static int ro_index_lookup(const struct lsmt_ro_index *index,
+ const struct segment *query_segment,
+ struct segment_mapping *ret_mappings, size_t n)
+{
+ const struct segment_mapping *lb;
+ const struct segment_mapping *it;
+ int cnt;
+
+ if (query_segment->length == 0)
+ return 0;
+ lb = ro_index_lower_bound(index, query_segment->offset);
+ cnt = 0;
+ for (it = lb; it != index->pend; it++) {
+ if (it->offset >= segment_end(query_segment))
+ break;
+ ret_mappings[cnt++] = *it;
+ if (cnt == n)
+ break;
+ }
+ if (cnt == 0)
+ return 0;
+ trim_edge(&ret_mappings[0], query_segment, TYPE_SEGMENT_MAPPING);
+ if (cnt > 1) {
+ trim_edge(&ret_mappings[cnt - 1], query_segment,
+ TYPE_SEGMENT_MAPPING);
+ }
+ return cnt;
+}
+
+static size_t ro_index_size(const struct lsmt_ro_index *index)
+{
+ return index->pend - index->pbegin;
+}
+
+static struct lsmt_ro_index *
+create_memory_index(const struct segment_mapping *pmappings, size_t n,
+ u64 moffset_begin, u64 moffset_end)
+{
+ struct lsmt_ro_index *ret = NULL;
+
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+ ret->pbegin = pmappings;
+ ret->pend = pmappings + n;
+ ret->mapping = (struct segment_mapping *)pmappings;
+ pr_info("create memory index done. {index_count: %zu}", n);
+ return ret;
+};
+
+static int lsmt_bioremap(struct vfile *ctx, struct bio *bio,
+ struct dm_dev **dev, unsigned int nr)
+{
+ struct lsmt_ro_file *fp = (struct lsmt_ro_file *)ctx;
+ struct segment s;
+ struct segment_mapping m[16];
+ struct bio *subbio;
+ size_t i = 0;
+ int n;
+ loff_t offset = bio->bi_iter.bi_sector;
+
+ if (bio_op(bio) != REQ_OP_READ) {
+ pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+ bio_op(bio), bio->bi_status);
+ return DM_MAPIO_KILL;
+ }
+
+ if ((offset << SECTOR_SHIFT) > fp->ht.virtual_size) {
+ pr_info("LSMT: %lld over tail %lld\n", offset,
+ fp->ht.virtual_size);
+ pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+ bio_op(bio), bio->bi_status);
+ return DM_MAPIO_KILL;
+ }
+
+ // till here, take this bio, assume it will be submitted
+
+ // actually, split bio by segment, summit and call endio when all split bio
+ // are done
+
+ bio->bi_status = BLK_STS_OK;
+ while (true) {
+ s.offset = bio->bi_iter.bi_sector;
+ s.length = bio_sectors(bio);
+ n = ro_index_lookup(fp->index, &s, m, 16);
+ for (i = 0; i < n; ++i) {
+ s.offset = bio->bi_iter.bi_sector;
+ s.length = bio_sectors(bio);
+ if (s.offset < m[i].offset) {
+ // hole
+ if (m[i].offset - s.offset < s.length) {
+ subbio = bio_split(bio,
+ m[i].offset - s.offset,
+ GFP_NOIO, &fp->split_set);
+ bio_chain(subbio, bio);
+ zero_fill_bio(subbio);
+ bio_endio(subbio);
+ } else {
+ zero_fill_bio(bio);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ }
+ s.offset = bio->bi_iter.bi_sector;
+ s.length = bio_sectors(bio);
+ // zeroe block
+ if (m[i].zeroed) {
+ if (m[i].length < s.length) {
+ subbio = bio_split(bio, m[i].length,
+ GFP_NOIO,
+ &fp->split_set);
+ bio_chain(subbio, bio);
+ zero_fill_bio(subbio);
+ bio_endio(subbio);
+ } else {
+ zero_fill_bio(bio);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ } else {
+ bio_set_dev(bio, dev[m[i].tag]->bdev);
+ if (m[i].length < s.length) {
+ subbio = bio_split(bio, m[i].length,
+ GFP_NOIO,
+ &fp->split_set);
+ subbio->bi_iter.bi_sector =
+ m[i].moffset;
+ bio_chain(subbio, bio);
+ submit_bio(subbio);
+ } else {
+ bio->bi_iter.bi_sector = m[i].moffset;
+ submit_bio(bio);
+ return DM_MAPIO_SUBMITTED;
+ }
+ }
+ }
+ if (n < 16)
+ break;
+ }
+ if (s.length > 0)
+ zero_fill_bio(bio);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+}
+
+static size_t lsmt_len(struct vfile *fp)
+{
+ return ((struct lsmt_ro_file *)fp)->ht.virtual_size;
+}
+
+/* Return true when @fp starts with a header carrying both LSMT magic values. */
+static bool is_lsmtfile(struct vfile *fp)
+{
+	struct lsmt_ht ht;
+	ssize_t ret;
+
+	if (!fp)
+		return false;
+
+	pr_info("LSMT: read header(vfile: %p)", fp);
+	ret = fp->ops->pread(fp, &ht, sizeof(struct lsmt_ht), 0);
+	if (ret < (ssize_t)sizeof(struct lsmt_ht)) {
+		pr_err("failed to load header");
+		return false;
+	}
+
+	return ht.magic0 == *MAGIC0 && uuid_equal(&ht.magic1, &MAGIC1);
+}
+
+static void lsmt_close(struct vfile *ctx)
+{
+ struct lsmt_ro_file *lsmt_file = (struct lsmt_ro_file *)ctx;
+
+ if (lsmt_file->ownership)
+ for (int i = 0; i < lsmt_file->nr; i++)
+ lsmt_file->fp[i]->ops->close(lsmt_file->fp[i]);
+ vfree(lsmt_file->index->mapping);
+ kfree(lsmt_file->index);
+ bioset_exit(&lsmt_file->split_set);
+ kfree(lsmt_file);
+}
+
+static void *lsmt_alloc_copy(void *ptr, size_t bs, size_t *from_size,
+ size_t to_size)
+{
+ void *ret = vmalloc(to_size * bs);
+
+ if (IS_ERR_OR_NULL(ret))
+ return ret;
+ memcpy(ret, ptr, *from_size * bs);
+ *from_size = to_size;
+ vfree(ptr);
+ return ret;
+}
+
+/*
+ * Emit into *mappings the parts of [start, end) covered by indexes[level],
+ * recursing into level + 1 for the gaps; each emitted segment is tagged with
+ * its level. Returns 0 on success or -1 when the output array cannot grow.
+ */
+static int merge_indexes(int level, struct lsmt_ro_index **indexes, size_t n,
+			 struct segment_mapping **mappings, size_t *size,
+			 size_t *capacity, u64 start, u64 end)
+{
+	const struct segment_mapping *p, *pend;
+	struct segment_mapping *grown;
+	struct segment_mapping it;
+
+	if (level >= n)
+		return 0;
+	p = ro_index_lower_bound(indexes[level], start);
+	pend = indexes[level]->pend;
+	if (p == pend)
+		return merge_indexes(level + 1, indexes, n, mappings, size,
+				     capacity, start, end);
+	it = *p;
+	if (start > it.offset)
+		forward_offset_to(&it, start, TYPE_SEGMENT_MAPPING);
+	while (p != pend) {
+		if (end <= it.offset)
+			break;
+		if (start < it.offset &&
+		    merge_indexes(level + 1, indexes, n, mappings, size,
+				  capacity, start, it.offset))
+			return -1;
+		if (end < segment_end(&it))
+			backward_end_to(&it, end);
+		if (*size == *capacity) {
+			/* Element size is that of a mapping, not of the pointer. */
+			grown = lsmt_alloc_copy(*mappings, sizeof(**mappings),
+						capacity, (*capacity) << 1);
+			if (!grown)
+				return -1;
+			*mappings = grown;
+		}
+		it.tag = level;
+		(*mappings)[(*size)++] = it;
+		start = segment_end(p);
+		/* Only load the next entry when there is one; pend is one past. */
+		if (++p != pend)
+			it = *p;
+	}
+	if (start < end)
+		return merge_indexes(level + 1, indexes, n, mappings, size,
+				     capacity, start, end);
+	return 0;
+}
+
+/* Merge @n layer indexes (indexes[0] wins) into a new index; NULL on failure. */
+static struct lsmt_ro_index *
+merge_memory_indexes(struct lsmt_ro_index **indexes, size_t n)
+{
+	size_t size = 0;
+	size_t capacity = ro_index_size(indexes[0]);
+	struct lsmt_ro_index *ret;
+	struct segment_mapping *mappings, *shrunk;
+
+	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+	mappings = vmalloc(sizeof(*mappings) * capacity);
+	pr_debug("init capacity: %zu\n", capacity);
+	if (!ret || !mappings) {
+		pr_err("Failed to alloc mapping memory\n");
+		goto err_ret;
+	}
+	pr_debug("start merge indexes, layers: %zu", n);
+	if (merge_indexes(0, indexes, n, &mappings, &size, &capacity, 0,
+			  UINT64_MAX) < 0)
+		goto err_ret;
+	pr_info("merge done, index size: %zu", size);
+	/* Trimming to the used size is best effort; else keep the larger array. */
+	shrunk = lsmt_alloc_copy(mappings, sizeof(*mappings), &size, size);
+	if (shrunk)
+		mappings = shrunk;
+	ret->pbegin = mappings;
+	ret->pend = mappings + size;
+	ret->mapping = mappings;
+	pr_info("ret index done. size: %zu", size);
+	return ret;
+err_ret:
+	vfree(mappings);
+	kfree(ret);
+	return NULL;
+}
+
+static ssize_t do_load_index(struct vfile *fp, struct segment_mapping *p,
+ struct lsmt_ht *ht)
+{
+ ssize_t index_bytes = ht->index_size * sizeof(struct segment_mapping);
+ ssize_t readn;
+ size_t valid = 0;
+
+ pr_info("LSMT: loadindex off: %llu cnt: %llu", ht->index_offset,
+ ht->index_size);
+ readn = fp->ops->pread(fp, p, index_bytes, ht->index_offset);
+ if (readn < index_bytes) {
+ pr_err("failed to read index");
+ return -1;
+ }
+ for (off_t idx = 0; idx < ht->index_size; idx++) {
+ if (p[idx].offset != INVALID_OFFSET) {
+ p[valid] = p[idx];
+ p[valid].tag = 0;
+ pr_debug("valid index %zu {offset: %lu, length: %u}",
+ valid, p[idx].offset + 0UL, p[idx].length);
+ valid++;
+ }
+ }
+ pr_info("valid index count: %zu", valid);
+ ht->index_size = valid;
+ return valid;
+}
+
+static ssize_t lsmt_load_ht(struct vfile *fp, struct lsmt_ht *ht)
+{
+ ssize_t file_size;
+ loff_t tailer_offset;
+ ssize_t ret;
+
+ if (!is_lsmtfile(fp)) {
+ pr_info("LSMT: fp is not a lsmtfile(%p)\n", fp);
+ return -1;
+ }
+ file_size = fp->ops->len(fp);
+ pr_info("LSMT: file len is %zd\n", file_size);
+ tailer_offset = file_size - HT_SPACE;
+ ret = fp->ops->pread(fp, ht, sizeof(struct lsmt_ht), tailer_offset);
+ if (ret < (ssize_t)sizeof(struct lsmt_ht)) {
+ pr_err("failed to load tailer(%p)\n", fp);
+ return -1;
+ }
+ pr_info("LSMT(%p), index_offset %llu: index_count: %llu", fp,
+ ht->index_offset, ht->index_size);
+
+ return 0;
+}
+
+/*
+ * Load the index of each of the @n layers in @files and merge them into one.
+ * @files is reversed in place; @ht keeps the last trailer read. NULL on failure.
+ */
+static struct lsmt_ro_index *load_merge_index(struct vfile *files[], size_t n,
+					      struct lsmt_ht *ht)
+{
+	struct lsmt_ro_index **indexes;
+	struct lsmt_ro_index *pmi = NULL;
+	struct segment_mapping *p;
+	int i;
+
+	if (n > OVBD_MAX_LAYERS) {
+		pr_err("too many indexes to merge, %d at most!", OVBD_MAX_LAYERS);
+		return NULL;
+	}
+	indexes = kzalloc(sizeof(*indexes) * OVBD_MAX_LAYERS, GFP_KERNEL);
+	if (!indexes)
+		return NULL;
+	for (i = 0; i < n; ++i) {
+		pr_info("read %d-th LSMT info", i);
+		if (lsmt_load_ht(files[i], ht) < 0)
+			goto out;
+		/* Bound the entry count itself so the byte size cannot wrap. */
+		if (ht->index_size == 0 ||
+		    ht->index_size > 1024UL * 1024 * 1024 / sizeof(*p))
+			goto out;
+		p = vmalloc(ht->index_size * sizeof(*p));
+		if (!p || do_load_index(files[i], p, ht) == -1) {
+			vfree(p);
+			pr_err("failed to load index from %d-th file", i);
+			goto out;
+		}
+		indexes[i] = create_memory_index(p, ht->index_size,
+						 HT_SPACE / ALIGNMENT,
+						 ht->index_offset / ALIGNMENT);
+		if (!indexes[i]) {
+			pr_err("failed to create memory index! ( %d-th file )", i);
+			vfree(p);
+			goto out;
+		}
+	}
+	pr_info("reverse index.");
+	REVERSE_ARRAY(struct vfile *, &files[0], &files[n - 1]);
+	REVERSE_ARRAY(struct lsmt_ro_index *, &indexes[0], &indexes[n - 1]);
+	pmi = merge_memory_indexes(indexes, n);
+	if (!pmi)
+		pr_err("failed to merge indexes");
+out:
+	/* The merged index holds copies, so the per-layer ones are always freed. */
+	for (i = 0; i < n && indexes[i]; i++) {
+		vfree(indexes[i]->mapping);
+		kfree(indexes[i]);
+	}
+	kfree(indexes);
+	return pmi;
+}
+
+/* Open @n layer files as one read-only LSMT vfile; the layers stay caller-owned. */
+struct vfile *lsmt_open_files(struct vfile *zfiles[], int n)
+{
+	struct lsmt_ro_file *ret;
+	struct lsmt_ht ht;
+	struct lsmt_ro_index *idx;
+
+	pr_info("LSMT open_files, layers: %d", n);
+	ret = kzalloc(sizeof(*ret) + n * sizeof(ret->fp[0]), GFP_KERNEL);
+	if (!ret)
+		return NULL;
+	/* Set up the bio set first so a failure here has no index to unwind. */
+	if (bioset_init(&ret->split_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) {
+		pr_err("Initial bio set failed");
+		goto error_out;
+	}
+	idx = load_merge_index(zfiles, n, &ht);
+	if (!idx) {
+		pr_err("load merge index failed.");
+		bioset_exit(&ret->split_set);
+		goto error_out;
+	}
+	ret->nr = n;
+	ret->index = idx;
+	ret->ownership = false;
+	ret->ops = &lsmt_ops;
+	ret->ht.virtual_size = ht.virtual_size;
+	memcpy(&ret->fp[0], &zfiles[0], n * sizeof(struct vfile *));
+	return (struct vfile *)ret;
+error_out:
+	kfree(ret);
+	return NULL;
+}
diff --git a/drivers/md/overlaybd/dm-ovbd-blkfile.c b/drivers/md/overlaybd/dm-ovbd-blkfile.c
new file mode 100644
index 000000000000..19a75ce40033
--- /dev/null
+++ b/drivers/md/overlaybd/dm-ovbd-blkfile.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dm-ovbd.h"
+#include <linux/dm-bufio.h>
+
+struct blkdev_as_vfile {
+ struct vfile_operations *ops;
+ struct block_device *blkdev;
+ loff_t len;
+ struct dm_bufio_client *c;
+};
+
+static struct block_device *blkdev_getblkdev(struct vfile *f)
+{
+ return ((struct blkdev_as_vfile *)f)->blkdev;
+}
+
+/*
+ * Synchronously copy up to @count bytes at byte @offset of the device into
+ * @buf, page by page through dm-bufio; returns the bytes copied (may be short).
+ */
+static ssize_t sync_read_blkdev(struct blkdev_as_vfile *f, void *buf,
+				size_t count, loff_t offset)
+{
+	void *mem = NULL;
+	loff_t left = offset & PAGE_MASK;
+	loff_t right = (offset + count + PAGE_SIZE - 1) & PAGE_MASK;
+	loff_t i = 0;
+	size_t sg_len = 0;
+	ssize_t ret = 0;
+	int nr_pages = 0;
+	size_t dsize = f->len;
+	struct dm_buffer *dbuf = NULL;
+
+	/* NOTE(review): f->len looks like a byte count already - confirm the shift. */
+	if (right > (dsize << SECTOR_SHIFT))
+		right = (dsize << SECTOR_SHIFT);
+	nr_pages = (right - left + PAGE_SIZE - 1) / PAGE_SIZE;
+	dm_bufio_prefetch(f->c, left >> PAGE_SHIFT, nr_pages);
+	for (i = 0; i < nr_pages; i++) {
+		if (left > offset + count)
+			break;
+		sg_len = PAGE_SIZE;
+		if (left + sg_len > offset + count)
+			sg_len = offset + count - left;
+		if (offset > left)
+			sg_len = sg_len - (offset - left);
+		mem = dm_bufio_read(f->c, left >> PAGE_SHIFT, &dbuf);
+		/* Failure is reported through the returned pointer, not @dbuf. */
+		if (IS_ERR(mem))
+			break;
+		memcpy(buf, mem + (offset - left), sg_len);
+		dm_bufio_release(dbuf);
+		buf += sg_len;
+		offset += sg_len;
+		left += PAGE_SIZE;
+		ret += sg_len;
+		count -= sg_len;
+	}
+	return ret;
+}
+
+static size_t blkdev_len(struct vfile *ctx)
+{
+ struct blkdev_as_vfile *bf = (struct blkdev_as_vfile *)ctx;
+
+ pr_debug("%s %lld\n", __func__, bf->len);
+ return bf->len;
+}
+
+// Read @count bytes at @offset, <= 4 pages per chunk; returns bytes read or < 0.
+static ssize_t blkdev_pread(struct vfile *ctx, void *buf, size_t count,
+			    loff_t offset)
+{
+	struct blkdev_as_vfile *bf = (struct blkdev_as_vfile *)ctx;
+	size_t split_count;
+	ssize_t ret = 0, tr;
+
+	while (count) {
+		split_count = min((size_t)(PAGE_SIZE << 2), count);
+		tr = sync_read_blkdev(bf, buf, split_count, offset);
+		if (tr < 0)
+			return tr;
+		if (tr == 0)
+			return ret;
+		ret += tr;
+		buf += tr;
+		offset += tr;
+		count -= tr;
+	}
+	return ret;
+}
+
+static void blkdev_close(struct vfile *ctx)
+{
+ struct blkdev_as_vfile *bf;
+
+ bf = (struct blkdev_as_vfile *)ctx;
+ if (ctx) {
+ dm_bufio_client_destroy(bf->c);
+ kfree(ctx);
+ }
+}
+
+static struct vfile_operations blkdev_op = {
+ .blkdev = blkdev_getblkdev,
+ .len = blkdev_len,
+ .pread = blkdev_pread,
+ .bio_remap = NULL,
+ .close = blkdev_close,
+};
+
+struct vfile *open_blkdev_as_vfile(struct block_device *blk, loff_t len)
+{
+ struct blkdev_as_vfile *ret;
+
+ if (IS_ERR(blk))
+ return NULL;
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret)
+ return NULL;
+ ret->ops = &blkdev_op;
+ ret->blkdev = blk;
+ ret->c = dm_bufio_client_create(blk, 4096, 1, 0, NULL, NULL, 0);
+ if (IS_ERR(ret->c))
+ goto errout;
+ if (len == -1)
+ len = get_capacity(blk->bd_disk) << SECTOR_SHIFT;
+ ret->len = len;
+ return (struct vfile *)ret;
+errout:
+ kfree(ret);
+ return NULL;
+}
diff --git a/drivers/md/overlaybd/dm-ovbd.c b/drivers/md/overlaybd/dm-ovbd.c
new file mode 100644
index 000000000000..87aa9ec17208
--- /dev/null
+++ b/drivers/md/overlaybd/dm-ovbd.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/module.h>
+#include "dm-ovbd.h"
+
+static struct ovbd_context global_ovbd_context;
+
+/* Module init: allocate the workqueue, then register the lsmt and zfile targets. */
+static int __init init_ovbd_target(void)
+{
+	global_ovbd_context.wq =
+		alloc_workqueue("ovbd", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 0);
+	/* alloc_workqueue() returns NULL on failure, so IS_ERR() never fires. */
+	if (!global_ovbd_context.wq)
+		return -ENOMEM;
+	if (init_lsmt_target() < 0 || init_zfile_target() < 0)
+		goto error_out;
+	pr_info("OVBD initialized");
+	return 0;
+error_out:
+	destroy_workqueue(global_ovbd_context.wq);
+	return -1;
+}
+
+static void __exit cleanup_ovbd_target(void)
+{
+ cleanup_zfile_target();
+ cleanup_lsmt_target();
+ flush_workqueue(global_ovbd_context.wq);
+ destroy_workqueue(global_ovbd_context.wq);
+ global_ovbd_context.wq = NULL;
+ pr_info("OVBD cleared");
+}
+
+struct ovbd_context *get_ovbd_context(void)
+{
+ return &global_ovbd_context;
+}
+
+module_init(init_ovbd_target);
+module_exit(cleanup_ovbd_target);
+
+MODULE_AUTHOR("Du Rui <durui@...ux.alibaba.com>");
+MODULE_DESCRIPTION("DADI OverlayBD implementation as device mapper target");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/overlaybd/dm-ovbd.h b/drivers/md/overlaybd/dm-ovbd.h
new file mode 100644
index 000000000000..94f5303d7399
--- /dev/null
+++ b/drivers/md/overlaybd/dm-ovbd.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __DM_OVBD_HEADER__
+#define __DM_OVBD_HEADER__
+
+#include <linux/device-mapper.h>
+#include <linux/bio.h>
+
+struct ovbd_context {
+ struct workqueue_struct *wq;
+};
+
+struct ovbd_context *get_ovbd_context(void);
+
+int init_lsmt_target(void);
+
+void cleanup_lsmt_target(void);
+
+int init_zfile_target(void);
+
+void cleanup_zfile_target(void);
+
+struct vfile;
+
+struct vfile_operations {
+ struct block_device *(*blkdev)(struct vfile *file);
+ size_t (*len)(struct vfile *file);
+ ssize_t (*pread)(struct vfile *file, void *buffer, size_t count,
+ loff_t offset);
+ int (*bio_remap)(struct vfile *file, struct bio *bio,
+ struct dm_dev **devs, unsigned int nr_dev);
+ void (*close)(struct vfile *file);
+};
+
+struct vfile {
+ struct vfile_operations *ops;
+};
+
+struct vfile *open_blkdev_as_vfile(struct block_device *blk, loff_t len);
+
+struct vfile *zfile_open(struct vfile *file);
+
+struct vfile *lsmt_open_files(struct vfile *zf[], int n);
+
+#endif
diff --git a/drivers/md/overlaybd/dm-zfile.c b/drivers/md/overlaybd/dm-zfile.c
new file mode 100644
index 000000000000..f0a27014c0b7
--- /dev/null
+++ b/drivers/md/overlaybd/dm-zfile.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dm-ovbd.h"
+
+/* Per-target private data, stored in dm_target->private by the constructor. */
+struct zfile_dm_target {
+ struct dm_dev *dev; /* underlying device obtained with dm_get_device() */
+ struct vfile *zfile; /* ZFile view returned by zfile_open(bf) */
+ struct vfile *bf; /* raw block-device vfile wrapping dev->bdev */
+};
+
+/*
+ * .map hook: READ bios are handed to the zfile's bio_remap operation; any
+ * other operation is logged and killed.
+ */
+static int zfile_target_map(struct dm_target *ti, struct bio *bio)
+{
+	struct zfile_dm_target *tgt = ti->private;
+	struct vfile *zf = tgt->zfile;
+
+	if (bio_op(bio) == REQ_OP_READ)
+		return zf->ops->bio_remap(zf, bio, &tgt->dev, 1);
+
+	pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+	       bio_op(bio), bio->bi_status);
+	return DM_MAPIO_KILL;
+}
+
+/*
+ * .end_io hook: a bio that completed successfully is done; a bio carrying
+ * any error status is logged and handed back for requeue.
+ */
+static int zfile_target_end_io(struct dm_target *ti, struct bio *bio,
+			       blk_status_t *error)
+{
+	if (bio->bi_status == BLK_STS_OK)
+		return DM_ENDIO_DONE;
+
+	pr_err("DONE NOT OK %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+	       bio_op(bio), bio->bi_status);
+	return DM_ENDIO_REQUEUE;
+}
+
+/*
+ * Target constructor.
+ *
+ * Table arguments: <device> <zfile length in bytes>
+ *
+ * Acquires the device, wraps it as a block-device vfile of the given length
+ * and opens a ZFile on top of it. On success the context is stored in
+ * ti->private and 0 is returned; on failure everything acquired so far is
+ * released, ti->error describes the problem and a negative errno is returned.
+ */
+static int zfile_target_ctr(struct dm_target *ti, unsigned int argc,
+			    char **argv)
+{
+	struct zfile_dm_target *mdt;
+	const char *devname, *tail;
+	struct dm_arg_set args = { .argc = argc, .argv = argv };
+	/* kstrtoul() stores through an unsigned long *, which is not size_t on 32-bit. */
+	unsigned long zflen;
+	int ret;
+
+	pr_debug("\n >>in function %s\n", __func__);
+
+	if (argc < 2) {
+		pr_info("\n Invalid no.of arguments.\n");
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	mdt = kzalloc(sizeof(*mdt), GFP_KERNEL);
+	if (!mdt) {
+		ti->error = "dm-zfile_target: Cannot allocate context";
+		return -ENOMEM;
+	}
+
+	devname = dm_shift_arg(&args);
+	pr_info("\nzfile-md: load dev %s\n", devname);
+	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table),
+			    &mdt->dev);
+	if (ret) {
+		ti->error = "dm-zfile_target: Device lookup failed";
+		goto bad;
+	}
+
+	/* Every failure from here on is reported as -EINVAL, as before. */
+	ret = -EINVAL;
+
+	if (!mdt->dev || !mdt->dev->bdev) {
+		pr_warn("failed to get mdt dev or bdev\n");
+		ti->error = "dm-zfile_target: No block device";
+		goto error_out;
+	}
+
+	tail = dm_shift_arg(&args);
+	if (kstrtoul(tail, 10, &zflen) < 0) {
+		pr_warn("failed to get file length\n");
+		ti->error = "dm-zfile_target: Invalid file length";
+		goto error_out;
+	}
+
+	mdt->bf = open_blkdev_as_vfile(mdt->dev->bdev, zflen);
+	if (!mdt->bf) {
+		pr_crit("Failed to open blkdev\n");
+		ti->error = "dm-zfile_target: Cannot open block device";
+		goto error_out;
+	}
+
+	mdt->zfile = zfile_open(mdt->bf);
+	if (!mdt->zfile) {
+		pr_crit("Failed to open zfile file\n");
+		ti->error = "dm-zfile_target: Cannot open zfile";
+		goto error_out;
+	}
+
+	pr_info("zfile: size is %zu\n", mdt->zfile->ops->len(mdt->zfile));
+
+	ti->private = mdt;
+
+	pr_debug("\n>>out function %s\n", __func__);
+	return 0;
+
+error_out:
+	if (mdt->zfile)
+		mdt->zfile->ops->close(mdt->zfile);
+	if (mdt->bf)
+		mdt->bf->ops->close(mdt->bf);
+	if (mdt->dev)
+		dm_put_device(ti, mdt->dev);
+bad:
+	kfree(mdt);
+	pr_debug("\n>>out function %s with error\n", __func__);
+	return ret;
+}
+
+/*
+ * Target destructor: closes the ZFile view and the backing vfile, drops the
+ * device reference and frees the private context.
+ */
+static void zfile_target_dtr(struct dm_target *ti)
+{
+	struct zfile_dm_target *tgt = ti->private;
+	struct vfile *zf = tgt->zfile;
+	struct vfile *backing = tgt->bf;
+
+	pr_debug("\n<<in function %s\n", __func__);
+	if (zf)
+		zf->ops->close(zf);
+	if (backing)
+		backing->ops->close(backing);
+	dm_put_device(ti, tgt->dev);
+	kfree(tgt);
+	pr_debug("\n>>out function %s\n", __func__);
+}
+
+/* Registration record tying the "zfile_target" name to the hooks above. */
+static struct target_type zfile_target = {
+	.name = "zfile_target",
+	.version = { 1, 0, 0 },
+	.features = 0,
+	.module = THIS_MODULE,
+	.map = zfile_target_map,
+	.end_io = zfile_target_end_io,
+	.ctr = zfile_target_ctr,
+	.dtr = zfile_target_dtr,
+};
+
+/*
+ * Register the zfile target with device-mapper.
+ *
+ * Returns 0 on success or the negative errno from dm_register_target(), so
+ * the caller can abort module initialisation instead of continuing with an
+ * unregistered target (which would later be unregistered on cleanup).
+ */
+int init_zfile_target(void)
+{
+	int result;
+
+	result = dm_register_target(&zfile_target);
+	if (result < 0)
+		pr_info("\n Error in registering target\n");
+	return result;
+}
+
+/* Remove the zfile target type from device-mapper. */
+void cleanup_zfile_target(void)
+{
+ dm_unregister_target(&zfile_target);
+}
diff --git a/drivers/md/overlaybd/dm-zfileformat.c b/drivers/md/overlaybd/dm-zfileformat.c
new file mode 100644
index 000000000000..04c8153fcb0a
--- /dev/null
+++ b/drivers/md/overlaybd/dm-zfileformat.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dm-ovbd.h"
+#include <linux/lz4.h>
+#include <linux/vmalloc.h>
+#include <linux/prefetch.h>
+#include <linux/kthread.h>
+#include <linux/uuid.h>
+#include <linux/dm-bufio.h>
+#include <linux/build_bug.h>
+
+/*
+ * Used as the byte offset of the first compressed block and as the distance
+ * from the end of the file at which the trailer is read.
+ */
+static const u32 ZF_SPACE = 512;
+/* First magic: 8 raw bytes compared against zfile_ht.magic0. */
+static u64 *MAGIC0 = (u64 *)"ZFile\0\1";
+/* Second magic: UUID compared against zfile_ht.magic1. */
+static const uuid_t MAGIC1 = UUID_INIT(0x74756a69, 0x2e79, 0x7966, 0x40, 0x41,
+ 0x6c, 0x69, 0x62, 0x61, 0x62, 0x61);
+
+/*
+ * Compression parameters embedded in the ZFile header/trailer. The struct is
+ * packed and must stay exactly 24 bytes; the number after each member is the
+ * byte offset at which that member ends.
+ */
+struct compress_options {
+ u32 block_size; // 4
+ u8 type; // 5
+ u8 level; // 6
+ u8 use_dict; // 7
+ u8 __padding0; // 8
+ u32 args; // 12
+ u32 dict_size; // 16
+ u8 verify; // 17 (non-zero: each compressed block carries a 4-byte suffix that is not decompressed)
+ u8 __padding1[7]; //24
+} __packed;
+
+static_assert(sizeof(struct compress_options) == 24, "CO size not fit");
+
+/*
+ * ZFile header/trailer record as read from the device. Packed, exactly 96
+ * bytes; the number after each member is the byte offset at which it ends.
+ */
+struct zfile_ht {
+ u64 magic0; // 8
+ uuid_t magic1; // 24
+
+ // till here offset = 24
+ u32 size_ht; //= sizeof(HeaderTrailer); // 28
+ u8 __padding[4]; // 32
+ u64 flags; //= 0; // 40 (bit positions are the FLAG_SHIFT_* values)
+
+ // till here offset = 40
+ u64 index_offset; // in bytes 48
+ u64 index_size; // number of u32 index entries 56
+
+ u64 vsize; // 64 (value reported by zfile_len())
+ u64 reserved_0; // 72
+
+ struct compress_options opt; // ends at 96
+} __packed;
+
+static_assert(sizeof(struct zfile_ht) == 96, "Header size not fit");
+
+/*
+ * One in-memory index entry per compressed block, built by build_jump_table():
+ * partial_offset is the byte offset of the block on the device and delta is
+ * its stored length. delta is only 16 bits wide while the on-disk index
+ * entries are u32, so larger entries are truncated when stored.
+ */
+struct jump_table {
+ u64 partial_offset : 48; // 48 bits logical offset + 16 bits partial minimum
+ uint16_t delta : 16;
+} __packed;
+
+// zfile can be treated as file with extends
+/*
+ * Open ZFile. ops must stay the first member because a struct zfile * is
+ * cast to and from struct vfile * throughout this file.
+ */
+struct zfile {
+ struct vfile_operations *ops; /* set to &zfile_ops by zfile_open() */
+ struct vfile *fp; /* underlying file; zfile_close() only clears the pointer, it does not close it */
+ bool ownership; /* not assigned anywhere in this file */
+ struct block_device *blkdev; /* not assigned anywhere in this file */
+ struct zfile_ht header; /* header, or trailer when the overwrite flag is clear */
+ struct jump_table *jump; /* vmalloc'ed index, freed in zfile_close() */
+ mempool_t cmdpool; /* pool of struct decompress_work */
+ struct dm_bufio_client *c; /* PAGE_SIZE-block cache over the compressed device */
+ struct ovbd_context *ovbd; /* module context providing the workqueue */
+};
+
+/* Bit positions inside zfile_ht.flags. */
+/* 1:header 0:trailer */
+#define FLAG_SHIFT_HEADER 0
+/* 1:data file, 0:index file */
+#define FLAG_SHIFT_TYPE 1
+/* 1:YES 0:NO (skipped for now) */
+#define FLAG_SHIFT_SEALED 2
+/* set: the header record is used as-is; clear: the trailer is read and used instead */
+#define FLAG_SHIFT_HEADER_OVERWRITE 3
+
+/* Extra pages requested from dm-bufio when ZFILE_READAHEAD is defined. */
+#define PREFETCH_PAGE_NR 32
+/* Minimum number of decompress_work elements kept in the mempool. */
+#define CMDPOOL_SIZE 4096
+/* Upper bound, in bytes, on the on-disk index accepted by zfile_open(). */
+#define MAX_JUMPTABLE_SIZE (1024UL * 1024 * 1024)
+
+static size_t zfile_len(struct vfile *fp);
+static void zfile_close(struct vfile *ctx);
+static int zfile_bioremap(struct vfile *ctx, struct bio *bio, struct dm_dev **dev,
+ unsigned int nr);
+
+/* vfile methods of a ZFile; blkdev and pread are not provided and stay NULL. */
+static struct vfile_operations zfile_ops = { .len = zfile_len,
+ .bio_remap = zfile_bioremap,
+ .close = zfile_close };
+
+/*
+ * Return 1 if bit number shift of the header flags is set, 0 otherwise.
+ *
+ * The 64-bit flags word is shifted down instead of building the mask with
+ * an int shift: 1 << shift is undefined for shift >= 31, and a mask above
+ * bit 31 would be lost when the result is narrowed to u32.
+ */
+static u32 get_flag_bit(struct zfile_ht *ht, u32 shift)
+{
+	return (ht->flags >> shift) & 1;
+}
+
+/* True when the HEADER_OVERWRITE flag is set in the given record. */
+static bool is_header_overwrite(struct zfile_ht *ht)
+{
+	u32 bit = get_flag_bit(ht, FLAG_SHIFT_HEADER_OVERWRITE);
+
+	return bit != 0;
+}
+
+/* vfile .len method: reports the vsize field of the loaded header. */
+static size_t zfile_len(struct vfile *zfile)
+{
+	struct zfile *zf = (struct zfile *)zfile;
+
+	return zf->header.vsize;
+}
+
+/*
+ * Build the in-memory jump table from the on-disk index.
+ *
+ * @jt_saved: header.index_size compressed-block lengths read from the device
+ * @zf:       zfile whose ->jump is populated
+ *
+ * Entry i receives the length of block i and the running byte offset at
+ * which it starts; the first block starts at ZF_SPACE. On allocation
+ * failure zf->jump is left NULL instead of being dereferenced. The table is
+ * zero-filled so the entries past the last block hold defined values.
+ */
+static void build_jump_table(u32 *jt_saved, struct zfile *zf)
+{
+	size_t i;
+	struct jump_table *jt;
+
+	jt = vzalloc((zf->header.index_size + 2) * sizeof(struct jump_table));
+	zf->jump = jt;
+	if (!jt) {
+		pr_err("zfile: failed to allocate jump table\n");
+		return;
+	}
+
+	jt[0].partial_offset = ZF_SPACE;
+	for (i = 0; i < zf->header.index_size; i++) {
+		jt[i].delta = jt_saved[i];
+		jt[i + 1].partial_offset = jt[i].partial_offset + jt_saved[i];
+	}
+}
+
+/* Result codes of zf_decompress() / do_decompress(). */
+enum decompress_result {
+ ZFILE_DECOMP_ERROR = -1, /* decompression failed; do_decompress() fails the bio */
+ ZFILE_DECOMP_OK = 0,
+ ZFILE_DECOMP_NOT_READY = 1, /* compressed data unavailable; decompress_fn() requeues with force set */
+};
+
+/*
+ * Fetch one PAGE_SIZE block of compressed data through dm-bufio.
+ *
+ * With force set the block is read from the device (may sleep); otherwise it
+ * is only looked up in the cache. Returns the data pointer with *buf holding
+ * a reference the caller must release, or NULL with *buf cleared on failure.
+ * *buf is cleared on failure so the caller never releases a buffer it does
+ * not hold.
+ */
+static void *zf_get_block(struct zfile *zf, sector_t block, bool force,
+			  struct dm_buffer **buf)
+{
+	void *data = force ? dm_bufio_read(zf->c, block, buf) :
+			     dm_bufio_get(zf->c, block, buf);
+
+	if (IS_ERR_OR_NULL(data) || IS_ERR_OR_NULL(*buf)) {
+		*buf = NULL;
+		return NULL;
+	}
+	return data;
+}
+
+/*
+ * Decompress the block that backs byte offset into page.
+ *
+ * @zf:     open zfile
+ * @page:   destination page; a full PAGE_SIZE is written from its start
+ * @offset: byte offset inside the decompressed image
+ * @force:  read missing compressed data from the device rather than only
+ *          consulting the dm-bufio cache
+ *
+ * Returns ZFILE_DECOMP_OK, ZFILE_DECOMP_NOT_READY (only when !force, so a
+ * forced retry always terminates) or ZFILE_DECOMP_ERROR.
+ */
+static int zf_decompress(struct zfile *zf, struct page *page, loff_t offset,
+			 bool force)
+{
+	void *dst;
+	void *src;
+	void *tmp = NULL;
+	size_t idx, c_cnt, suffix;
+	loff_t begin, left, right, i;
+	int ret = ZFILE_DECOMP_OK;
+	int decomp_cnt;
+	/* Without force a miss is retried; with force it is a hard failure. */
+	const int miss = force ? ZFILE_DECOMP_ERROR : ZFILE_DECOMP_NOT_READY;
+	struct dm_buffer *buf = NULL;
+
+	idx = offset >> PAGE_SHIFT;
+	if (unlikely(idx >= zf->header.index_size)) {
+		pr_err("zfile: block index %zu out of range\n", idx);
+		return ZFILE_DECOMP_ERROR;
+	}
+
+	begin = zf->jump[idx].partial_offset;
+	suffix = zf->header.opt.verify ? sizeof(u32) : 0;
+	/* A block no longer than its suffix would make c_cnt wrap. */
+	if (unlikely(zf->jump[idx].delta <= suffix)) {
+		pr_err("zfile: corrupt index entry %zu\n", idx);
+		return ZFILE_DECOMP_ERROR;
+	}
+	c_cnt = zf->jump[idx].delta - suffix;
+	left = begin & PAGE_MASK;
+	right = ((begin + c_cnt) + (PAGE_SIZE - 1)) & PAGE_MASK;
+
+	if (likely(right - left == PAGE_SIZE)) {
+		/* Compressed data sits inside one cached page: use it in place. */
+		src = zf_get_block(zf, left >> PAGE_SHIFT, force, &buf);
+		if (!src) {
+			ret = miss;
+			goto out;
+		}
+		src = src + (begin - left);
+	} else {
+		/* Data straddles pages: gather it into a contiguous buffer. */
+		tmp = kmalloc(right - left, GFP_NOIO);
+		if (!tmp) {
+			ret = ZFILE_DECOMP_ERROR;
+			goto out;
+		}
+		for (i = left; i < right; i += PAGE_SIZE) {
+			void *d = zf_get_block(zf, i >> PAGE_SHIFT, force,
+					       &buf);
+
+			if (!d) {
+				ret = miss;
+				goto out;
+			}
+			memcpy(tmp + i - left, d, PAGE_SIZE);
+			dm_bufio_release(buf);
+			buf = NULL;
+		}
+		src = tmp + (begin - left);
+	}
+
+	dst = kmap_local_page(page);
+
+	prefetchw(dst);
+
+	/*
+	 * The image comes from an untrusted device, so use the variant that
+	 * bounds both the source and the destination.
+	 */
+	decomp_cnt = LZ4_decompress_safe(src, dst, c_cnt, PAGE_SIZE);
+	/* Do not expose stale page contents after a short final block. */
+	if (decomp_cnt >= 0 && decomp_cnt < PAGE_SIZE)
+		memset(dst + decomp_cnt, 0, PAGE_SIZE - decomp_cnt);
+
+	kunmap_local(dst);
+
+	if (decomp_cnt < 0) {
+		pr_err("Decompress error\n");
+		ret = ZFILE_DECOMP_ERROR;
+		goto out;
+	}
+
+out:
+	if (buf)
+		dm_bufio_release(buf);
+	kfree(tmp);
+
+	return ret;
+}
+
+/*
+ * Decompress every segment of bio in turn. The bio is completed here on
+ * success and failed here on ZFILE_DECOMP_ERROR; on ZFILE_DECOMP_NOT_READY
+ * it is left untouched for the caller to retry. left and nr are unused.
+ */
+static int do_decompress(struct zfile *zf, struct bio *bio, size_t left, int nr,
+			 bool force)
+{
+	struct bio_vec seg;
+	struct bvec_iter it;
+
+	bio_for_each_segment(seg, bio, it) {
+		loff_t byte_off = it.bi_sector << SECTOR_SHIFT;
+		int rc = zf_decompress(zf, seg.bv_page, byte_off, force);
+
+		if (likely(rc == ZFILE_DECOMP_OK))
+			continue;
+		if (rc == ZFILE_DECOMP_ERROR)
+			bio_io_error(bio);
+		return rc;
+	}
+
+	bio_endio(bio);
+	return ZFILE_DECOMP_OK;
+}
+
+/* One queued read request; allocated from zfile.cmdpool in zfile_bioremap(). */
+struct decompress_work {
+ struct work_struct work; /* runs decompress_fn() */
+ struct zfile *zf;
+ struct bio *bio; /* the read being serviced */
+ bool force; /* false on the first pass (cache lookup only), true after a requeue */
+};
+
+/*
+ * Ask dm-bufio to prefetch nr pages of compressed data starting at byte
+ * offset left, plus PREFETCH_PAGE_NR more when ZFILE_READAHEAD is defined.
+ */
+static inline void zfile_prefetch(struct zfile *zf, size_t left, size_t nr)
+{
+#ifdef ZFILE_READAHEAD
+ size_t prefetch_page = PREFETCH_PAGE_NR;
+#else
+ size_t prefetch_page = 0;
+#endif
+ dm_bufio_prefetch(zf->c, left >> PAGE_SHIFT, nr + prefetch_page);
+}
+
+/*
+ * Drop nr cached compressed pages starting at byte offset left. This is a
+ * no-op unless ZFILE_CLEANUP_CACHE is defined.
+ */
+static inline void zfile_cleanup_compressed_cache(struct zfile *zf, size_t left,
+ size_t nr)
+{
+#ifdef ZFILE_CLEANUP_CACHE
+ dm_bufio_forget_buffers(zf->c, left >> PAGE_SHIFT, nr);
+#endif
+}
+
+/*
+ * Workqueue handler servicing one read bio.
+ *
+ * Works out which compressed pages cover the request, prefetches them and
+ * decompresses into the bio. If compressed data was not available the work
+ * item requeues itself with force set so the next pass reads from the
+ * device. Otherwise the bio has already been completed or failed by
+ * do_decompress() and the command goes back to the mempool.
+ */
+static void decompress_fn(struct work_struct *work)
+{
+	size_t start_idx, end_idx, begin, range, left, right;
+	loff_t offset, count, nr;
+	size_t bs;
+	struct decompress_work *cmd;
+	struct zfile *zf;
+
+	/* Must be tested before container_of(), which never yields NULL. */
+	if (!work)
+		return;
+	cmd = container_of(work, struct decompress_work, work);
+	zf = cmd->zf;
+
+	offset = cmd->bio->bi_iter.bi_sector;
+	count = bio_sectors(cmd->bio);
+	bs = zf->header.opt.block_size;
+	/* block_size comes from the image; never divide by zero. */
+	if (unlikely(!bs))
+		goto fail;
+
+	start_idx = (offset << SECTOR_SHIFT) / bs;
+	end_idx = ((offset + count - 1) << SECTOR_SHIFT) / bs;
+	/* Only entries below index_size describe real blocks. */
+	if (unlikely(start_idx > end_idx || end_idx >= zf->header.index_size))
+		goto fail;
+
+	begin = zf->jump[start_idx].partial_offset;
+	range = zf->jump[end_idx].partial_offset + zf->jump[end_idx].delta -
+		begin;
+	left = begin & PAGE_MASK;
+	right = (begin + range + PAGE_SIZE - 1) & PAGE_MASK;
+	nr = (right - left) >> PAGE_SHIFT;
+
+	zfile_prefetch(zf, left, nr);
+
+	if (unlikely(do_decompress(zf, cmd->bio, left, nr, cmd->force) ==
+		     ZFILE_DECOMP_NOT_READY))
+		goto resubmit;
+
+	/* Keep the last page cached when the next block also starts in it. */
+	zfile_cleanup_compressed_cache(zf, left,
+				       nr - ((right > begin + range) ? 1 : 0));
+
+	mempool_free(cmd, &zf->cmdpool);
+
+	return;
+
+fail:
+	pr_err("zfile: invalid block geometry for bio\n");
+	bio_io_error(cmd->bio);
+	mempool_free(cmd, &zf->cmdpool);
+	return;
+
+resubmit:
+	cmd->force = true;
+	queue_work(zf->ovbd->wq, work);
+}
+
+/*
+ * vfile .bio_remap method. Validates the request (exactly one device, READ
+ * only, fully inside the image), then queues a decompress_work for it on
+ * the module workqueue, pinned to the submitting CPU.
+ *
+ * Returns DM_MAPIO_SUBMITTED once queued, DM_MAPIO_KILL for a rejected
+ * request, or DM_MAPIO_DELAY_REQUEUE if no command could be allocated.
+ */
+static int zfile_bioremap(struct vfile *ctx, struct bio *bio, struct dm_dev **dm_dev,
+			  unsigned int dev_nr)
+{
+	struct zfile *zf = (struct zfile *)ctx;
+	loff_t first = bio->bi_iter.bi_sector;
+	size_t nsect = bio_sectors(bio);
+	struct decompress_work *job;
+
+	if (unlikely(dev_nr != 1 || !dm_dev[0])) {
+		pr_err("ZFile: nr wrong\n");
+		pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+		       bio_op(bio), bio->bi_status);
+		return DM_MAPIO_KILL;
+	}
+
+	if (unlikely(bio_op(bio) != REQ_OP_READ)) {
+		pr_err("ZFile: REQ not read\n");
+		pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+		       bio_op(bio), bio->bi_status);
+		return DM_MAPIO_KILL;
+	}
+
+	/* The request must start inside the image ... */
+	if (unlikely((first << SECTOR_SHIFT) >= zf->header.vsize)) {
+		pr_err("ZFile: %lld over tail\n", first);
+		pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+		       bio_op(bio), bio->bi_status);
+		return DM_MAPIO_KILL;
+	}
+
+	/* ... and must not run past its end. */
+	if (unlikely(((first + nsect) << SECTOR_SHIFT) > zf->header.vsize)) {
+		pr_err("ZFile: %lld over tail\n", first);
+		pr_err("DM_MAPIO_KILL %s:%d op=%d sts=%d\n", __FILE__, __LINE__,
+		       bio_op(bio), bio->bi_status);
+		return DM_MAPIO_KILL;
+	}
+
+	job = mempool_alloc(&zf->cmdpool, GFP_NOIO);
+	if (IS_ERR_OR_NULL(job))
+		return DM_MAPIO_DELAY_REQUEUE;
+
+	job->zf = zf;
+	job->bio = bio;
+	job->force = false;
+	INIT_WORK(&job->work, decompress_fn);
+
+	queue_work_on(raw_smp_processor_id(), zf->ovbd->wq, &job->work);
+	return DM_MAPIO_SUBMITTED;
+}
+
+static bool load_zfile_header(struct vfile *file, struct zfile_ht *ht);
+
+/*
+ * Open a ZFile stored in file.
+ *
+ * Reads and checks the header, switches to the trailer unless the header
+ * carries the HEADER_OVERWRITE flag, validates the index geometry, loads
+ * the index into a jump table and creates the command mempool and the
+ * dm-bufio client used for reading compressed data.
+ *
+ * file is borrowed, not owned: closing the returned vfile does not close it.
+ *
+ * Returns the new vfile, or NULL on any failure (nothing is leaked).
+ */
+struct vfile *zfile_open(struct vfile *file)
+{
+	u32 *jt_saved;
+	size_t jt_size;
+	struct zfile *zfile;
+	ssize_t nread;
+	size_t file_size;
+	loff_t tailer_offset;
+	struct block_device *bdev;
+
+	if (!file)
+		return NULL;
+	bdev = file->ops->blkdev(file);
+
+	zfile = kzalloc(sizeof(*zfile), GFP_KERNEL);
+	/* Check the allocation before the header is read into it. */
+	if (!zfile)
+		return NULL;
+
+	if (!load_zfile_header(file, &zfile->header)) {
+		kfree(zfile);
+		return NULL;
+	}
+	zfile->fp = file;
+
+	// should verify header
+	if (!is_header_overwrite(&zfile->header)) {
+		file_size = zfile->fp->ops->len(zfile->fp);
+		if (file_size < ZF_SPACE) {
+			pr_err("zfile: file too small for a trailer\n");
+			goto error_out;
+		}
+		tailer_offset = file_size - ZF_SPACE;
+		pr_info("zfile: file_size=%zu tail_offset=%lld\n", file_size,
+			tailer_offset);
+		nread = zfile->fp->ops->pread(zfile->fp, &zfile->header,
+					      sizeof(struct zfile_ht),
+					      tailer_offset);
+		if (nread < (ssize_t)sizeof(struct zfile_ht)) {
+			pr_err("zfile: failed to fetch zfile tailer\n");
+			goto error_out;
+		}
+		pr_info("zfile: Trailer vsize=%llu index_offset=%llu index_size=%llu verify=%d\n",
+			zfile->header.vsize, zfile->header.index_offset,
+			zfile->header.index_size, zfile->header.opt.verify);
+	} else {
+		pr_info("zfile header overwrite: size=%llu index_offset=%llu index_size=%llu verify=%d\n",
+			zfile->header.vsize, zfile->header.index_offset,
+			zfile->header.index_size, zfile->header.opt.verify);
+	}
+
+	/* Bound the entry count first so the byte size cannot overflow. */
+	if (zfile->header.index_size == 0 ||
+	    zfile->header.index_size > MAX_JUMPTABLE_SIZE / sizeof(u32))
+		goto error_out;
+	jt_size = zfile->header.index_size * sizeof(u32);
+	pr_info("get index_size %zu, index_offset %llu\n", jt_size,
+		zfile->header.index_offset);
+
+	/* The read path decompresses exactly one page per compressed block. */
+	if (zfile->header.opt.block_size != PAGE_SIZE) {
+		pr_err("zfile: unsupported block size %u\n",
+		       zfile->header.opt.block_size);
+		goto error_out;
+	}
+	/* Every page of the image needs an index entry. */
+	if (zfile->header.vsize > (zfile->header.index_size << PAGE_SHIFT)) {
+		pr_err("zfile: index does not cover the image\n");
+		goto error_out;
+	}
+
+	jt_saved = vmalloc(jt_size);
+	if (!jt_saved)
+		goto error_out;
+
+	nread = zfile->fp->ops->pread(zfile->fp, jt_saved, jt_size,
+				      zfile->header.index_offset);
+	if (nread < (ssize_t)jt_size) {
+		pr_err("zfile: failed to read index\n");
+		vfree(jt_saved);
+		goto error_out;
+	}
+
+	build_jump_table(jt_saved, zfile);
+
+	vfree(jt_saved);
+
+	if (!zfile->jump)
+		goto error_out;
+
+	zfile->ops = &zfile_ops;
+
+	if (mempool_init_kmalloc_pool(&zfile->cmdpool, CMDPOOL_SIZE,
+				      sizeof(struct decompress_work)))
+		goto error_out;
+
+	zfile->c = dm_bufio_client_create(bdev, PAGE_SIZE, 1, 0, NULL, NULL, 0);
+	if (IS_ERR_OR_NULL(zfile->c))
+		goto error_out;
+
+	zfile->ovbd = get_ovbd_context();
+
+	return (struct vfile *)zfile;
+
+error_out:
+	/* zfile_close() copes with a partially initialised object. */
+	zfile_close((struct vfile *)zfile);
+	return NULL;
+}
+
+/*
+ * Read the header record from offset 0 of file into ht and check both
+ * magic values. Returns true only for a complete read with matching magics.
+ *
+ * The first magic is compared with memcmp() rather than by dereferencing
+ * MAGIC0, because a string literal is not guaranteed to be aligned for a
+ * u64 load.
+ */
+static bool load_zfile_header(struct vfile *file, struct zfile_ht *ht)
+{
+	ssize_t ret;
+
+	if (!file)
+		return false;
+
+	ret = file->ops->pread(file, ht, sizeof(struct zfile_ht), 0);
+	if (ret < (ssize_t)sizeof(struct zfile_ht)) {
+		pr_info("zfile: failed to load header %zd\n", ret);
+		return false;
+	}
+	return !memcmp(&ht->magic0, MAGIC0, sizeof(ht->magic0)) &&
+	       uuid_equal(&ht->magic1, &MAGIC1);
+}
+
+/*
+ * vfile .close method: frees the jump table, the command pool, the dm-bufio
+ * client and the zfile itself. The underlying file is only detached, not
+ * closed. A NULL argument is accepted.
+ */
+static void zfile_close(struct vfile *f)
+{
+	struct zfile *zf = (struct zfile *)f;
+
+	pr_info("close(%p)", (void *)f);
+	if (!zf)
+		return;
+
+	/* vfree() ignores NULL, so no guard is needed. */
+	vfree(zf->jump);
+	zf->jump = NULL;
+	zf->fp = NULL;
+	mempool_exit(&zf->cmdpool);
+	if (!IS_ERR_OR_NULL(zf->c))
+		dm_bufio_client_destroy(zf->c);
+	kfree(zf);
+}
Powered by blists - more mailing lists