Message-Id: <1268842164-5590-44-git-send-email-orenl@cs.columbia.edu>
Date: Wed, 17 Mar 2010 12:08:31 -0400
From: Oren Laadan <orenl@...columbia.edu>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
linux-api@...r.kernel.org, Serge Hallyn <serue@...ibm.com>,
Ingo Molnar <mingo@...e.hu>,
containers@...ts.linux-foundation.org,
Oren Laadan <orenl@...columbia.edu>
Subject: [C/R v20][PATCH 43/96] c/r: restore memory address space (private memory)
Restoring the memory address space begins with nuking the existing
address space of the current process, then reading back the vma state
and contents: we call do_mmap_pgoff() for each vma and then read in
its data.
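In outline, the restart path added below does roughly the following
(a simplified sketch; locking and error handling omitted):

    /* executed in the context of the restarting process */
    destroy_mm(current->mm);            /* nuke all existing vmas */
    for (nr = h->map_count; nr; nr--)
        restore_vma(ctx, mm);           /* do_mmap_pgoff() + contents */
    restore_mm_context(ctx, mm);        /* arch state: LDT, vdso */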
Changelog[v20]:
- Only use arch_setup_additional_pages() if supported by arch
Changelog[v19]:
- [Serge Hallyn] do_munmap(): remove unused local vars
- [Serge Hallyn] Checkpoint saved_auxv as u64s
Changelog[v19-rc3]:
- [Serge Hallyn] move destroy_mm into mmap.c and remove size check
- [Serge Hallyn] fill vdso (syscall32_setup_pages) for TIF_IA32/x86_64
- Do not hold mmap_sem when reading memory pages on restart
Changelog[v19-rc2]:
- Expose page write functions
- [Serge Hallyn] Fix return value of read_pages_contents()
Changelog[v18]:
- Tighten checks on supported vma to checkpoint or restart
Changelog[v17]:
- Restore mm->{flags,def_flags,saved_auxv}
- Fix bogus warning in do_restore_mm()
Changelog[v16]:
- Restore mm->exe_file
Changelog[v14]:
- Introduce per vma-type restore() function
- Merge restart code into same file as checkpoint (memory.c)
- Compare saved 'vdso' field of mm_context with current value
- Check whether calls to ckpt_hbuf_get() fail
- Discard field 'h->parent'
- Revert change to pr_debug(), back to ckpt_debug()
Changelog[v13]:
- Avoid access to hh->vma_type after the header is freed
- Test for no vma's in exit_mmap() before calling unmap_vmas() (or it
may crash if restart fails after having removed all vma's)
Changelog[v12]:
- Replace obsolete ckpt_debug() with pr_debug()
Changelog[v9]:
- Introduce ckpt_ctx_checkpoint() for checkpoint-specific ctx setup
Changelog[v7]:
- Fix argument given to kunmap_atomic() in memory dump/restore
Changelog[v6]:
- Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
(even though it's not really needed)
Changelog[v5]:
- Improve memory restore code (following Dave Hansen's comments)
- Change dump format (and code) to allow chunks of <vaddrs, pages>
instead of one long list of each
- Memory restore now maps user pages explicitly to copy data into them,
instead of reading directly to user space; got rid of mprotect_fixup()
Changelog[v4]:
- Use standard list_... for ckpt_pgarr
Signed-off-by: Oren Laadan <orenl@...columbia.edu>
Acked-by: Serge E. Hallyn <serue@...ibm.com>
Tested-by: Serge E. Hallyn <serue@...ibm.com>
---
arch/x86/include/asm/ldt.h | 7 +
arch/x86/kernel/checkpoint.c | 64 ++++++
checkpoint/memory.c | 476 ++++++++++++++++++++++++++++++++++++++++
checkpoint/objhash.c | 1 +
checkpoint/process.c | 3 +
checkpoint/restart.c | 3 +
fs/exec.c | 2 +-
include/linux/checkpoint.h | 8 +
include/linux/checkpoint_hdr.h | 2 +-
include/linux/mm.h | 14 ++
mm/filemap.c | 23 ++-
mm/mmap.c | 77 ++++++-
12 files changed, 669 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
index 46727eb..f2845f9 100644
--- a/arch/x86/include/asm/ldt.h
+++ b/arch/x86/include/asm/ldt.h
@@ -37,4 +37,11 @@ struct user_desc {
#define MODIFY_LDT_CONTENTS_CODE 2
#endif /* !__ASSEMBLY__ */
+
+#ifdef __KERNEL__
+#include <linux/linkage.h>
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+ unsigned long bytecount);
+#endif
+
#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/kernel/checkpoint.c b/arch/x86/kernel/checkpoint.c
index dec824c..cf86b7a 100644
--- a/arch/x86/kernel/checkpoint.c
+++ b/arch/x86/kernel/checkpoint.c
@@ -13,6 +13,7 @@
#include <asm/desc.h>
#include <asm/i387.h>
+#include <asm/elf.h>
#include <linux/checkpoint.h>
#include <linux/checkpoint_hdr.h>
@@ -465,3 +466,66 @@ int restore_read_header_arch(struct ckpt_ctx *ctx)
ckpt_hdr_put(ctx, h);
return ret;
}
+
+int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct ckpt_hdr_mm_context *h;
+ unsigned int n;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM_CONTEXT);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ckpt_debug("nldt %d vdso %#lx (%p)\n",
+ h->nldt, (unsigned long) h->vdso, mm->context.vdso);
+
+ ret = -EINVAL;
+ if (h->vdso != (unsigned long) mm->context.vdso)
+ goto out;
+ if (h->ldt_entry_size != LDT_ENTRY_SIZE)
+ goto out;
+
+ ret = _ckpt_read_obj_type(ctx, NULL,
+ h->nldt * LDT_ENTRY_SIZE,
+ CKPT_HDR_MM_CONTEXT_LDT);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * to utilize the syscall modify_ldt() we first convert the data
+ * in the checkpoint image from 'struct desc_struct' to 'struct
+ * user_desc' with reverse logic of include/asm/desc.h:fill_ldt()
+ */
+ for (n = 0; n < h->nldt; n++) {
+ struct user_desc info;
+ struct desc_struct desc;
+ mm_segment_t old_fs;
+
+ ret = ckpt_kread(ctx, &desc, LDT_ENTRY_SIZE);
+ if (ret < 0)
+ break;
+
+ info.entry_number = n;
+ info.base_addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+ info.limit = desc.limit0;
+ info.seg_32bit = desc.d;
+ info.contents = desc.type >> 2;
+ info.read_exec_only = (desc.type >> 1) ^ 1;
+ info.limit_in_pages = desc.g;
+ info.seg_not_present = desc.p ^ 1;
+ info.useable = desc.avl;
+
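+ /*
+ * set_fs(get_ds()) temporarily lifts the user-pointer check so
+ * that sys_modify_ldt() accepts the on-stack kernel buffer below
+ */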
+ old_fs = get_fs();
+ set_fs(get_ds());
+ ret = sys_modify_ldt(1, (struct user_desc __user *) &info,
+ sizeof(info));
+ set_fs(old_fs);
+
+ if (ret < 0)
+ break;
+ }
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index e82d240..3016521 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -16,6 +16,9 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/aio.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/mm_types.h>
#include <linux/proc_fs.h>
@@ -721,3 +724,476 @@ int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t)
return ret;
}
+
+/***********************************************************************
+ * Restart
+ *
+ * Unlike checkpoint, restart is executed in the context of each restarting
+ * process: vma regions are restored via a call to mmap(), and the data is
+ * read into the address space of the current process.
+ */
+
+/**
+ * read_pages_vaddrs - read addresses of pages to page-array chain
+ * @ctx: restart context
+ * @nr_pages: number of addresses to read
+ */
+static int read_pages_vaddrs(struct ckpt_ctx *ctx, unsigned long nr_pages)
+{
+ struct ckpt_pgarr *pgarr;
+ unsigned long *vaddrp;
+ int nr, ret;
+
+ while (nr_pages) {
+ pgarr = pgarr_current(ctx);
+ if (!pgarr)
+ return -ENOMEM;
+ nr = pgarr_nr_free(pgarr);
+ if (nr > nr_pages)
+ nr = nr_pages;
+ vaddrp = &pgarr->vaddrs[pgarr->nr_used];
+ ret = ckpt_kread(ctx, vaddrp, nr * sizeof(unsigned long));
+ if (ret < 0)
+ return ret;
+ pgarr->nr_used += nr;
+ nr_pages -= nr;
+ }
+ return 0;
+}
+
+int restore_read_page(struct ckpt_ctx *ctx, struct page *page)
+{
+ void *ptr;
+ int ret;
+
+ ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
+ if (ret < 0)
+ return ret;
+
+ ptr = kmap_atomic(page, KM_USER1);
+ memcpy(ptr, ctx->scratch_page, PAGE_SIZE);
+ kunmap_atomic(ptr, KM_USER1);
+
+ return 0;
+}
+
+/**
+ * read_pages_contents - read in data of pages in page-array chain
+ * @ctx: restart context
+ */
+static int read_pages_contents(struct ckpt_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+ struct ckpt_pgarr *pgarr;
+ unsigned long *vaddrs;
+ int i, ret = 0;
+
+ list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
+ vaddrs = pgarr->vaddrs;
+ for (i = 0; i < pgarr->nr_used; i++) {
+ struct page *page;
+
+ /* TODO: do in chunks to reduce mmap_sem overhead */
+ _ckpt_debug(CKPT_DPAGE, "got page %#lx\n", vaddrs[i]);
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, mm, vaddrs[i],
+ 1, 1, 1, &page, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (ret < 0)
+ return ret;
+
+ ret = restore_read_page(ctx, page);
+ page_cache_release(page);
+
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return ret;
+}
+
+/**
+ * restore_memory_contents - restore contents of a VMA with private memory
+ * @ctx: restart context
+ *
+ * Reads a header that specifies how many pages will follow, then reads
+ * a list of virtual addresses into ctx->pgarr_list page-array chain,
+ * followed by the actual contents of the corresponding pages. Iterates
+ * these steps until reaching a header specifying "0" pages, which marks
+ * the end of the contents.
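+ *
+ * Per iteration, the image layout is thus roughly:
+ *
+ *   struct ckpt_hdr_pgarr   (nr_pages = N)
+ *   N page addresses        (one unsigned long each)
+ *   N page contents         (PAGE_SIZE bytes each)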
+ */
+static int restore_memory_contents(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_pgarr *h;
+ unsigned long nr_pages;
+ int len, ret = 0;
+
+ while (1) {
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_PGARR);
+ if (IS_ERR(h)) {
+ ret = PTR_ERR(h); /* propagate the error, don't drop it */
+ break;
+ }
+
+ ckpt_debug("total pages %ld\n", (unsigned long) h->nr_pages);
+
+ nr_pages = h->nr_pages;
+ ckpt_hdr_put(ctx, h);
+
+ if (!nr_pages)
+ break;
+
+ len = nr_pages * (sizeof(unsigned long) + PAGE_SIZE);
+ ret = _ckpt_read_buffer(ctx, NULL, len);
+ if (ret < 0)
+ break;
+
+ ret = read_pages_vaddrs(ctx, nr_pages);
+ if (ret < 0)
+ break;
+ ret = read_pages_contents(ctx);
+ if (ret < 0)
+ break;
+ pgarr_reset_all(ctx);
+ }
+
+ return ret;
+}
+
+/**
+ * calc_map_prot_bits - convert vm_flags to mmap protection
+ * orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_prot_bits(unsigned long orig_vm_flags)
+{
+ unsigned long vm_prot = 0;
+
+ if (orig_vm_flags & VM_READ)
+ vm_prot |= PROT_READ;
+ if (orig_vm_flags & VM_WRITE)
+ vm_prot |= PROT_WRITE;
+ if (orig_vm_flags & VM_EXEC)
+ vm_prot |= PROT_EXEC;
+ if (orig_vm_flags & PROT_SEM) /* only (?) with IPC-SHM */
+ vm_prot |= PROT_SEM;
+
+ return vm_prot;
+}
+
+/**
+ * calc_map_flags_bits - convert vm_flags to mmap flags
+ * @orig_vm_flags: source vm_flags
+ */
+static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
+{
+ unsigned long vm_flags = 0;
+
+ vm_flags = MAP_FIXED;
+ if (orig_vm_flags & VM_GROWSDOWN)
+ vm_flags |= MAP_GROWSDOWN;
+ if (orig_vm_flags & VM_DENYWRITE)
+ vm_flags |= MAP_DENYWRITE;
+ if (orig_vm_flags & VM_EXECUTABLE)
+ vm_flags |= MAP_EXECUTABLE;
+ if (orig_vm_flags & VM_MAYSHARE)
+ vm_flags |= MAP_SHARED;
+ else
+ vm_flags |= MAP_PRIVATE;
+
+ return vm_flags;
+}
+
+/**
+ * generic_vma_restore - restore a vma
+ * @mm: address space
+ * @file: file to map (NULL for anonymous)
+ * @h: vma header data
+ */
+static unsigned long generic_vma_restore(struct mm_struct *mm,
+ struct file *file,
+ struct ckpt_hdr_vma *h)
+{
+ unsigned long vm_size, vm_start, vm_flags, vm_prot, vm_pgoff;
+ unsigned long addr;
+
+ if (h->vm_end < h->vm_start)
+ return -EINVAL;
+ if (h->vma_objref < 0)
+ return -EINVAL;
+
+ vm_start = h->vm_start;
+ vm_pgoff = h->vm_pgoff;
+ vm_size = h->vm_end - h->vm_start;
+ vm_prot = calc_map_prot_bits(h->vm_flags);
+ vm_flags = calc_map_flags_bits(h->vm_flags);
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap_pgoff(file, vm_start, vm_size,
+ vm_prot, vm_flags, vm_pgoff);
+ up_write(&mm->mmap_sem);
+ ckpt_debug("size %#lx prot %#lx flag %#lx pgoff %#lx => %#lx\n",
+ vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+
+ return addr;
+}
+
+/**
+ * private_vma_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @file: file to use for mapping
+ * @h: vma header data
+ */
+int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct file *file, struct ckpt_hdr_vma *h)
+{
+ unsigned long addr;
+
+ if (h->vm_flags & (VM_SHARED | VM_MAYSHARE))
+ return -EINVAL;
+
+ addr = generic_vma_restore(mm, file, h);
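+ /* on failure, generic_vma_restore() encodes the errno in the address */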
+ if (IS_ERR((void *) addr))
+ return PTR_ERR((void *) addr);
+
+ return restore_memory_contents(ctx);
+}
+
+/**
+ * anon_private_restore - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ * @h: vma header data
+ */
+static int anon_private_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ /*
+ * vm_pgoff for an anonymous mapping is the "global" page
+ * offset (i.e., counted from address 0x0), so we force it to zero
+ */
+ h->vm_pgoff = 0;
+
+ return private_vma_restore(ctx, mm, NULL, h);
+}
+
+/* callbacks to restore vma per its type: */
+struct restore_vma_ops {
+ char *vma_name;
+ enum vma_type vma_type;
+ int (*restore) (struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *ptr);
+};
+
+static struct restore_vma_ops restore_vma_ops[] = {
+ /* ignored vma */
+ {
+ .vma_name = "IGNORE",
+ .vma_type = CKPT_VMA_IGNORE,
+ .restore = NULL,
+ },
+ /* special mapping (vdso) */
+ {
+ .vma_name = "VDSO",
+ .vma_type = CKPT_VMA_VDSO,
+ .restore = special_mapping_restore,
+ },
+ /* anonymous private */
+ {
+ .vma_name = "ANON PRIVATE",
+ .vma_type = CKPT_VMA_ANON,
+ .restore = anon_private_restore,
+ },
+ /* file-mapped private */
+ {
+ .vma_name = "FILE PRIVATE",
+ .vma_type = CKPT_VMA_FILE,
+ .restore = filemap_restore,
+ },
+};
+
+/**
+ * restore_vma - read vma data, recreate it and read contents
+ * @ctx: checkpoint context
+ * @mm: memory address space
+ */
+static int restore_vma(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct ckpt_hdr_vma *h;
+ struct restore_vma_ops *ops;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VMA);
+ if (IS_ERR(h))
+ return PTR_ERR(h);
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx type %d vmaref %d\n",
+ (unsigned long) h->vm_start, (unsigned long) h->vm_end,
+ (unsigned long) h->vm_flags, (int) h->vma_type,
+ (int) h->vma_objref);
+
+ ret = -EINVAL;
+ if (h->vm_end < h->vm_start)
+ goto out;
+ if (h->vma_objref < 0)
+ goto out;
+ if (h->vma_type >= CKPT_VMA_MAX)
+ goto out;
+ if (h->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ ret = -ENOSYS; /* goto out so the header is freed */
+ goto out;
+ }
+
+ ops = &restore_vma_ops[h->vma_type];
+
+ /* make sure we don't change this accidentally */
+ BUG_ON(ops->vma_type != h->vma_type);
+
+ if (ops->restore) {
+ ckpt_debug("vma type %s\n", ops->vma_name);
+ ret = ops->restore(ctx, mm, h);
+ } else {
+ ckpt_debug("vma ignored\n");
+ ret = 0;
+ }
+ out:
+ ckpt_hdr_put(ctx, h);
+ return ret;
+}
+
+static int ckpt_read_auxv(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ int i, ret;
+ u64 *buf = kmalloc(CKPT_AT_SZ, GFP_KERNEL);
+
+ if (!buf)
+ return -ENOMEM;
+ ret = _ckpt_read_buffer(ctx, buf, CKPT_AT_SZ);
+ if (ret < 0)
+ goto out;
+
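+ /* entries were saved as u64; on 32-bit each must fit in a long */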
+ ret = -E2BIG;
+ for (i = 0; i < AT_VECTOR_SIZE; i++)
+ if (buf[i] > (u64) ULONG_MAX)
+ goto out;
+
+ for (i = 0; i < AT_VECTOR_SIZE - 1; i++)
+ mm->saved_auxv[i] = buf[i];
+ /* sanitize the input: force AT_NULL in last entry */
+ mm->saved_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+
+ ret = 0;
+ out:
+ kfree(buf);
+ return ret;
+}
+
+static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
+{
+ struct ckpt_hdr_mm *h;
+ struct mm_struct *mm = NULL;
+ struct file *file;
+ unsigned int nr;
+ int ret;
+
+ h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_MM);
+ if (IS_ERR(h))
+ return (struct mm_struct *) h;
+
+ ckpt_debug("map_count %d\n", h->map_count);
+
+ /* XXX need more sanity checks */
+
+ ret = -EINVAL;
+ if ((h->start_code > h->end_code) ||
+ (h->start_data > h->end_data))
+ goto out;
+ if (h->exe_objref < 0)
+ goto out;
+ if (h->def_flags & ~VM_LOCKED)
+ goto out;
+ if (h->flags & ~(MMF_DUMP_FILTER_MASK |
+ ((1 << MMF_DUMP_FILTER_BITS) - 1)))
+ goto out;
+
+ mm = current->mm;
+
+ /* point of no return -- destroy the current mm */
+ down_write(&mm->mmap_sem);
+ ret = destroy_mm(mm);
+ if (ret < 0) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ mm->flags = h->flags;
+ mm->def_flags = h->def_flags;
+
+ mm->start_code = h->start_code;
+ mm->end_code = h->end_code;
+ mm->start_data = h->start_data;
+ mm->end_data = h->end_data;
+ mm->start_brk = h->start_brk;
+ mm->brk = h->brk;
+ mm->start_stack = h->start_stack;
+ mm->arg_start = h->arg_start;
+ mm->arg_end = h->arg_end;
+ mm->env_start = h->env_start;
+ mm->env_end = h->env_end;
+
+ /* restore the ->exe_file */
+ if (h->exe_objref) {
+ file = ckpt_obj_fetch(ctx, h->exe_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(file)) {
+ up_write(&mm->mmap_sem);
+ ret = PTR_ERR(file);
+ goto out;
+ }
+ set_mm_exe_file(mm, file);
+ }
+ up_write(&mm->mmap_sem);
+
+ ret = ckpt_read_auxv(ctx, mm);
+ if (ret < 0) {
+ ckpt_err(ctx, ret, "Error restoring auxv\n");
+ goto out;
+ }
+
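+ /* one CKPT_HDR_VMA header (plus any contents) follows per vma */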
+ for (nr = h->map_count; nr; nr--) {
+ ret = restore_vma(ctx, mm);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = restore_mm_context(ctx, mm);
+ out:
+ ckpt_hdr_put(ctx, h);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ /* restore_obj() expects an extra reference */
+ atomic_inc(&mm->mm_users);
+ return mm;
+}
+
+void *restore_mm(struct ckpt_ctx *ctx)
+{
+ return (void *) do_restore_mm(ctx);
+}
+
+int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = ckpt_obj_fetch(ctx, mm_objref, CKPT_OBJ_MM);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
+ if (mm == current->mm)
+ return 0;
+
+ ret = exec_mmap(mm);
+ if (ret < 0)
+ return ret;
+
+ atomic_inc(&mm->mm_users);
+ return 0;
+}
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 16bb6cb..3243bb4 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -148,6 +148,7 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
.ref_grab = obj_mm_grab,
.ref_users = obj_mm_users,
.checkpoint = checkpoint_mm,
+ .restore = restore_mm,
},
};
diff --git a/checkpoint/process.c b/checkpoint/process.c
index cc858c3..91999ee 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -372,6 +372,9 @@ static int restore_task_objs(struct ckpt_ctx *ctx)
ret = restore_obj_file_table(ctx, h->files_objref);
ckpt_debug("file_table: ret %d (%p)\n", ret, current->files);
+ ret = restore_obj_mm(ctx, h->mm_objref);
+ ckpt_debug("mm: ret %d (%p)\n", ret, current->mm);
+
ckpt_hdr_put(ctx, h);
return ret;
}
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index d33b18a..325d03a 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -563,6 +563,9 @@ static int check_kernel_const(struct ckpt_const *h)
/* task */
if (h->task_comm_len != sizeof(tsk->comm))
return -EINVAL;
+ /* mm->saved_auxv size */
+ if (h->at_vector_size != AT_VECTOR_SIZE)
+ return -EINVAL;
/* uts */
if (h->uts_release_len != sizeof(uts->release))
return -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index cce6bbd..ed3b98a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -710,7 +710,7 @@ int kernel_read(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_read);
-static int exec_mmap(struct mm_struct *mm)
+int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct * old_mm, *active_mm;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 2f050ef..0b47f46 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -84,6 +84,7 @@ extern char *ckpt_fill_fname(struct path *path, struct path *root,
char *buf, int *len);
extern int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page);
+extern int restore_read_page(struct ckpt_ctx *ctx, struct page *page);
/* ckpt kflags */
#define ckpt_set_ctx_kflag(__ctx, __kflag) \
@@ -157,6 +158,7 @@ extern int checkpoint_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int restore_read_header_arch(struct ckpt_ctx *ctx);
extern int restore_thread(struct ckpt_ctx *ctx);
extern int restore_cpu(struct ckpt_ctx *ctx);
+extern int restore_mm_context(struct ckpt_ctx *ctx, struct mm_struct *mm);
extern int checkpoint_restart_block(struct ckpt_ctx *ctx,
struct task_struct *t);
@@ -197,9 +199,15 @@ extern int private_vma_checkpoint(struct ckpt_ctx *ctx,
int vma_objref);
extern int checkpoint_obj_mm(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_obj_mm(struct ckpt_ctx *ctx, int mm_objref);
extern int ckpt_collect_mm(struct ckpt_ctx *ctx, struct task_struct *t);
extern int checkpoint_mm(struct ckpt_ctx *ctx, void *ptr);
+extern void *restore_mm(struct ckpt_ctx *ctx);
+
+extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct file *file, struct ckpt_hdr_vma *h);
+
#define CKPT_VMA_NOT_SUPPORTED \
(VM_SHARED | VM_MAYSHARE | VM_IO | VM_HUGETLB | \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b3dc6fa..0687b61 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -307,7 +307,7 @@ struct ckpt_hdr_mm {
__u64 arg_start, arg_end, env_start, env_end;
} __attribute__((aligned(8)));
-/* vma subtypes */
+/* vma subtypes - index into restore_vma_ops[] */
enum vma_type {
CKPT_VMA_IGNORE = 0,
#define CKPT_VMA_IGNORE CKPT_VMA_IGNORE
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef3e6b4..bdeb0b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1176,9 +1176,13 @@ out:
}
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+extern int destroy_mm(struct mm_struct *);
extern unsigned long do_brk(unsigned long, unsigned long);
+/* fs/exec.c */
+extern int exec_mmap(struct mm_struct *mm);
+
/* filemap.c */
extern unsigned long page_unuse(struct page *);
extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -1197,6 +1201,16 @@ extern int filemap_checkpoint(struct ckpt_ctx *, struct vm_area_struct *);
int write_one_page(struct page *page, int wait);
void task_dirty_inc(struct task_struct *tsk);
+
+/* checkpoint/restart */
+#ifdef CONFIG_CHECKPOINT
+struct ckpt_hdr_vma;
+extern int filemap_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct ckpt_hdr_vma *hh);
+extern int special_mapping_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+ struct ckpt_hdr_vma *hh);
+#endif
+
/* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
diff --git a/mm/filemap.c b/mm/filemap.c
index 85998c5..f53223f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1611,9 +1611,28 @@ int filemap_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
return private_vma_checkpoint(ctx, vma, CKPT_VMA_FILE, vma_objref);
}
EXPORT_SYMBOL(filemap_checkpoint);
-#else
+
+int filemap_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ struct file *file;
+ int ret;
+
+ if (h->vma_type == CKPT_VMA_FILE &&
+ (h->vm_flags & (VM_SHARED | VM_MAYSHARE)))
+ return -EINVAL;
+
+ file = ckpt_obj_fetch(ctx, h->vma_objref, CKPT_OBJ_FILE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = private_vma_restore(ctx, mm, file, h);
+ return ret;
+}
+#else /* !CONFIG_CHECKPOINT */
#define filemap_checkpoint NULL
-#endif /* CONFIG_CHECKPOINT */
+#endif
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
diff --git a/mm/mmap.c b/mm/mmap.c
index 3fac497..6573e51 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1934,14 +1934,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@...p.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+int do_munmap_nocheck(struct mm_struct *mm, unsigned long start, size_t len)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
- if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
-
if ((len = PAGE_ALIGN(len)) == 0)
return -EINVAL;
@@ -2015,8 +2012,39 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
return 0;
}
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+ if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ return do_munmap_nocheck(mm, start, len);
+}
+
EXPORT_SYMBOL(do_munmap);
+/*
+ * called with mm->mmap_sem held
+ * only called from checkpoint/memory.c:do_restore_mm()
+ */
+int destroy_mm(struct mm_struct *mm)
+{
+ struct vm_area_struct *vmnext = mm->mmap;
+ struct vm_area_struct *vma;
+ int ret;
+
+ while (vmnext) {
+ vma = vmnext;
+ vmnext = vmnext->vm_next;
+ ret = do_munmap_nocheck(mm, vma->vm_start,
+ vma->vm_end-vma->vm_start);
+ if (ret < 0) {
+ pr_warning("%s: failed munmap (%d)\n", __func__, ret);
+ return ret;
+ }
+ }
+ return 0;
+}
+
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
int ret;
@@ -2172,7 +2200,7 @@ void exit_mmap(struct mm_struct *mm)
tlb = tlb_gather_mmu(mm, 1);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
- end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
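+ /* vma may be NULL if a failed restart already unmapped everything */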
+ end = vma ? unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL) : 0;
vm_unacct_memory(nr_accounted);
free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
@@ -2332,6 +2360,14 @@ static void special_mapping_close(struct vm_area_struct *vma)
}
#ifdef CONFIG_CHECKPOINT
+/*
+ * FIX:
+ * - checkpoint vdso pages (once per distinct vdso is enough)
+ * - check for compatibility between saved and current vdso
+ * - accommodate dynamic kernel data in the vdso page
+ *
+ * Currently, we require COMPAT_VDSO, which somewhat mitigates the issue
+ */
static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
struct vm_area_struct *vma)
{
@@ -2353,9 +2389,36 @@ static int special_mapping_checkpoint(struct ckpt_ctx *ctx,
return generic_vma_checkpoint(ctx, vma, CKPT_VMA_VDSO, 0);
}
-#else
+
+int special_mapping_restore(struct ckpt_ctx *ctx,
+ struct mm_struct *mm,
+ struct ckpt_hdr_vma *h)
+{
+ int ret = 0;
+
+ /*
+ * FIX:
+ * Currently, we only handle the VDSO/vsyscall special mapping.
+ * Even that is very basic: we call arch_setup_additional_pages(),
+ * requiring the same mapping (start address) as before.
+ */
+
+ BUG_ON(h->vma_type != CKPT_VMA_VDSO);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+ if (test_thread_flag(TIF_IA32))
+ ret = syscall32_setup_pages(NULL, h->vm_start, 0);
+ else
+#endif
+ ret = arch_setup_additional_pages(NULL, h->vm_start, 0);
+#endif
+
+ return ret;
+}
+#else /* !CONFIG_CHECKPOINT */
#define special_mapping_checkpoint NULL
-#endif /* CONFIG_CHECKPOINT */
+#endif
static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
--
1.6.3.3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/