lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1221080631.6781.57.camel@nimitz>
Date:	Wed, 10 Sep 2008 14:03:51 -0700
From:	Dave Hansen <dave@...ux.vnet.ibm.com>
To:	Oren Laadan <orenl@...columbia.edu>
Cc:	containers@...ts.linux-foundation.org, jeremy@...p.org,
	arnd@...db.de, linux-kernel@...r.kernel.org
Subject: Cleanups for [PATCH 4/9] Memory management (dump)

This is a lot of changes.  But, they're all kinda intertwined
so it's hard to make individual patches out of them.  I've
tried to explain each of the changes as you look through
the patch sequentially.

Note that this patch removes more code than it adds, and I think it
makes everything more readable.  There are a few things I need to fix up
in the restore patch (like the use of nr_free), but nothing really
fundamental.

- Remove use of get_free_pages() for buffers, use kmalloc()
  for its debugging advantages, and not having to deal with
  "orders".
- Now that we use kfree(), don't check for NULL pages/vaddr
  since kfree() does it for us.
- Zero out the pgarr as we remove things from it.  Bug
  hunters will thank us.
- Change ctx->pgarr name to pgarr_list.
- Change the ordering of the pgarr_list so that the first
  entry is always the old "pgcurr".  Make a function to
  find the first entry, and kill "pgcurr".
- Get rid of pgarr->nr_free.  It's redundant with nr_used
  and the fixed-size allocation of all pgarrs.  Create
  a helper function (pgarr_is_full()) to replace it.
- Remove cr_pgarr_prep(), just use cr_pgarr_alloc().
- Create cr_add_to_pgarr() helper which also does some
  checking of the page states that come back from
  follow_page().
- Rename cr_vma_fill_pgarr() to cr_private_vma() to make
  it painfully obvious that it does not deal with
  shared memory of any kind.
- Don't fault in pages with handle_mm_fault(), we do not
  need them to be present, nor should we be wasting
  space on pagetables that need to be created for sparse
  memory areas.
- Add parenthesis around 'page_mapping(page) != NULL' check
- Don't bother releasing pgarr pages for a VMA since it
  will never free all of the VMA's pages anyway
- Don't track total pages.  If we really need this, we can
  track the number of full 'pgarr's on the list, and add
  the used pages from the first one.
- Reverse list iteration since we changed the list order
- Give cr_pgarr_reset() a better name: cr_reset_all_pgarrs()



---

 linux-2.6.git-dave/checkpoint/ckpt_mem.c |  262 +++++++++++++------------------
 linux-2.6.git-dave/checkpoint/ckpt_mem.h |    3 
 linux-2.6.git-dave/include/linux/ckpt.h  |    4 
 3 files changed, 115 insertions(+), 154 deletions(-)

diff -puN checkpoint/ckpt_mem.c~p4-dave checkpoint/ckpt_mem.c
--- linux-2.6.git/checkpoint/ckpt_mem.c~p4-dave	2008-09-10 13:58:55.000000000 -0700
+++ linux-2.6.git-dave/checkpoint/ckpt_mem.c	2008-09-10 14:00:04.000000000 -0700
@@ -25,41 +25,41 @@
  * (common to ckpt_mem.c and rstr_mem.c).
  *
  * The checkpoint context structure has two members for page-arrays:
- *   ctx->pgarr: list head of the page-array chain
- *   ctx->pgcur: tracks the "current" position in the chain
+ *   ctx->pgarr_list: list head of the page-array chain
  *
  * During checkpoint (and restart) the chain tracks the dirty pages (page
  * pointer and virtual address) of each MM. For a particular MM, these are
- * always added to the "current" page-array (ctx->pgcur). The "current"
+ * always added to the first entry in the ctx->pgarr_list.  This "current"
  * page-array advances as necessary, and new page-array descriptors are
  * allocated on-demand. Before the next MM, the chain is reset but not
- * freed (that is, dereference page pointers and reset ctx->pgcur).
+ * freed (that is, dereference page pointers).
  */
 
-#define CR_PGARR_ORDER  0
-#define CR_PGARR_TOTAL  ((PAGE_SIZE << CR_PGARR_ORDER) / sizeof(void *))
+#define CR_PGARR_TOTAL  (PAGE_SIZE / sizeof(void *))
 
 /* release pages referenced by a page-array */
-void cr_pgarr_unref_pages(struct cr_pgarr *pgarr)
+void cr_pgarr_release_pages(struct cr_pgarr *pgarr)
 {
 	int n;
 
-	/* only checkpoint keeps references to pages */
-	if (pgarr->pages) {
-		cr_debug("nr_used %d\n", pgarr->nr_used);
-		for (n = pgarr->nr_used; n--; )
-			page_cache_release(pgarr->pages[n]);
+	/*
+	 * No need to check pgarr->pages, since
+	 * nr_used will be 0 if it is NULL.
+	 */
+	cr_debug("nr_used %d\n", pgarr->nr_used);
+	for (n = pgarr->nr_used; n >= 0; n--) {
+		page_cache_release(pgarr->pages[n]);
+		pgarr->pages[n] = NULL;
+		pgarr->vaddrs[n] = 0;
 	}
 }
 
 /* free a single page-array object */
 static void cr_pgarr_free_one(struct cr_pgarr *pgarr)
 {
-	cr_pgarr_unref_pages(pgarr);
-	if (pgarr->pages)
-		free_pages((unsigned long) pgarr->pages, CR_PGARR_ORDER);
-	if (pgarr->vaddrs)
-		free_pages((unsigned long) pgarr->vaddrs, CR_PGARR_ORDER);
+	cr_pgarr_release_pages(pgarr);
+	kfree(pgarr->pages);
+	kfree(pgarr->vaddrs);
 	kfree(pgarr);
 }
 
@@ -68,11 +68,10 @@ void cr_pgarr_free(struct cr_ctx *ctx)
 {
 	struct cr_pgarr *pgarr, *tmp;
 
-	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr, list) {
+	list_for_each_entry_safe(pgarr, tmp, &ctx->pgarr_list, list) {
 		list_del(&pgarr->list);
 		cr_pgarr_free_one(pgarr);
 	}
-	ctx->pgcur = NULL;
 }
 
 /* allocate a single page-array object */
@@ -84,13 +83,11 @@ static struct cr_pgarr *cr_pgarr_alloc_o
 	if (!pgarr)
 		return NULL;
 
-	pgarr->nr_free = CR_PGARR_TOTAL;
 	pgarr->nr_used = 0;
-
-	pgarr->pages = (struct page **)
-		__get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
-	pgarr->vaddrs = (unsigned long *)
-		__get_free_pages(GFP_KERNEL, CR_PGARR_ORDER);
+	pgarr->pages  = kmalloc(CR_PGARR_TOTAL * sizeof(struct page *),
+				GFP_KERNEL);
+	pgarr->vaddrs = kmalloc(CR_PGARR_TOTAL * sizeof(unsigned long *),
+				GFP_KERNEL);
 	if (!pgarr->pages || !pgarr->vaddrs) {
 		cr_pgarr_free_one(pgarr);
 		return NULL;
@@ -99,57 +96,56 @@ static struct cr_pgarr *cr_pgarr_alloc_o
 	return pgarr;
 }
 
-/* cr_pgarr_alloc - return the next available pgarr in the page-array chain
+static int pgarr_is_full(struct cr_pgarr *pgarr)
+{
+	if (pgarr->nr_used > CR_PGARR_TOTAL)
+		return 1;
+	return 0;
+}
+
+static struct cr_pgarr *cr_first_pgarr(struct cr_ctx *ctx)
+{
+	return list_first_entry(&ctx->pgarr_list, struct cr_pgarr, list);
+}
+
+/* cr_get_empty_pgarr - return the next available pgarr in the page-array chain
  * @ctx: checkpoint context
  *
- * Return the page-array following ctx->pgcur, extending the chain if needed
+ * Return the first page-array in the list with space.  Extend the
+ * list if none has space.
  */
-struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx)
+struct cr_pgarr *cr_get_empty_pgarr(struct cr_ctx *ctx)
 {
 	struct cr_pgarr *pgarr;
 
-	/* can reuse next element after ctx->pgcur ? */
-	pgarr = ctx->pgcur;
-	if (pgarr && !list_is_last(&pgarr->list, &ctx->pgarr)) {
-		pgarr = list_entry(pgarr->list.next, struct cr_pgarr, list);
+	/*
+	 * This could just as easily be a list_for_each() if we
+	 * need to do a more comprehensive search.
+	 */
+	pgarr = cr_first_pgarr(ctx);
+	if (!pgarr_is_full(pgarr))
 		goto out;
-	}
 
-	/* nope, need to extend the page-array chain */
+	ctx->nr_full_pgarrs++;
 	pgarr = cr_pgarr_alloc_one();
 	if (!pgarr)
 		return NULL;
 
-	list_add_tail(&pgarr->list, &ctx->pgarr);
+	list_add(&pgarr->list, &ctx->pgarr_list);
  out:
-	ctx->pgcur = pgarr;
 	return pgarr;
 
 }
 
 /* reset the page-array chain (dropping page references if necessary) */
-void cr_pgarr_reset(struct cr_ctx *ctx)
+void cr_reset_all_pgarrs(struct cr_ctx *ctx)
 {
 	struct cr_pgarr *pgarr;
 
-	list_for_each_entry(pgarr, &ctx->pgarr, list) {
-		cr_pgarr_unref_pages(pgarr);
-		pgarr->nr_free = CR_PGARR_TOTAL;
+	list_for_each_entry(pgarr, &ctx->pgarr_list, list) {
+		cr_pgarr_release_pages(pgarr);
 		pgarr->nr_used = 0;
 	}
-	ctx->pgcur = NULL;
-}
-
-
-/* return current page-array (and allocate if needed) */
-struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx
-)
-{
-	struct cr_pgarr *pgarr = ctx->pgcur;
-
-	if (!pgarr->nr_free)
-		pgarr = cr_pgarr_alloc(ctx);
-	return pgarr;
 }
 
 /*
@@ -161,116 +157,84 @@ struct cr_pgarr *cr_pgarr_prep(struct cr
  * dumped to the file descriptor.
  */
 
+/*
+ * You must ensure that the pgarr has space before
+ * calling this function.
+ */
+static inline void cr_add_to_pgarr(struct cr_pgarr *pgarr, struct page *page,
+				  unsigned long vaddr)
+{
+	/*
+	 * We're really just handing the result of the
+	 * follow_page() here.
+	 */
+	if (page == NULL)
+		return;
+	if (page == ZERO_PAGE(0))
+		return;
+
+	get_page(page);
+	pgarr->pages[pgarr->nr_used] = page;
+	pgarr->vaddrs[pgarr->nr_used] = vaddr;
+	pgarr->nr_used++;
+}
+
+static inline void cr_pgarr_release_index(struct cr_pgarr *pgarr, int index)
+{
+	page_cache_release(pgarr->pages[index]);
+	pgarr->pages[index] = NULL;
+	pgarr->vaddrs[index] = 0;
+	pgarr->nr_used--;
+}
+
 /**
- * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
+ * cr_private_vma - fill the ctx structure with pgarrs containing
+ * 		    the contents of this VMA
  * @ctx - checkpoint context
- * @pgarr - page-array to fill
  * @vma - vma to scan
- * @start - start address (updated)
  */
-static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
-			     struct vm_area_struct *vma, unsigned long *start)
+static int cr_private_vma(struct cr_ctx *ctx,
+			struct vm_area_struct *vma)
 {
-	unsigned long end = vma->vm_end;
-	unsigned long addr = *start;
-	struct page **pagep;
-	unsigned long *addrp;
-	int cow, nr, ret = 0;
-
-	nr = pgarr->nr_free;
-	pagep = &pgarr->pages[pgarr->nr_used];
-	addrp = &pgarr->vaddrs[pgarr->nr_used];
-	cow = !!vma->vm_file;
+	struct cr_pgarr *pgarr;
+	unsigned long addr = vma->vm_start;
+	int ret = 0;
+	int cow = 0;
+	int orig_nr_used;
 
-	while (addr < end) {
-		struct page *page;
+reload:
+	pgarr = cr_get_empty_pgarr(ctx);
+	if (!pgarr)
+		return -ENOMEM;
 
-		/*
-		 * simplified version of get_user_pages(): already have vma,
-		 * only need FOLL_TOUCH, and (for now) ignore fault stats.
-		 *
-		 * FIXME: consolidate with get_user_pages()
-		 */
+       	orig_nr_used = pgarr->nr_used;
+	/*
+	 * This function is only for private mappings.  If
+	 * the vma is file backed, it must be a cow.
+	 */
+	if (vma->vm_file)
+		cow = 1;
 
-		cond_resched();
-		while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
-			ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
-			if (ret & VM_FAULT_ERROR) {
-				if (ret & VM_FAULT_OOM)
-					ret = -ENOMEM;
-				else if (ret & VM_FAULT_SIGBUS)
-					ret = -EFAULT;
-				else
-					BUG();
-				break;
-			}
-			cond_resched();
-			ret = 0;
-		}
+	while (addr < vma->vm_end) {
+		struct page *page;
 
+		cond_resched();
+		page = follow_page(vma, addr, FOLL_TOUCH);
 		if (IS_ERR(page))
 			ret = PTR_ERR(page);
 
 		if (ret < 0)
 			break;
 
-		if (page == ZERO_PAGE(0)) {
-			page = NULL;	/* zero page: ignore */
-		} else if (cow && page_mapping(page) != NULL) {
-			page = NULL;	/* clean cow: ignore */
-		} else {
-			get_page(page);
-			*(addrp++) = addr;
-			*(pagep++) = page;
-			if (--nr == 0) {
-				addr += PAGE_SIZE;
-				break;
-			}
-		}
-
+		if (cow && (page_mapping(page) != NULL))
+			page = NULL;
+		cr_add_to_pgarr(pgarr, page, addr);
 		addr += PAGE_SIZE;
+		if (pgarr_is_full(pgarr))
+			goto reload;
 	}
 
-	if (unlikely(ret < 0)) {
-		nr = pgarr->nr_free - nr;
-		while (nr--)
-			page_cache_release(*(--pagep));
-		return ret;
-	}
-
-	*start = addr;
-	return pgarr->nr_free - nr;
-}
-
-/**
- * cr_vma_scan_pages - scan vma for pages that will need to be dumped
- * @ctx - checkpoint context
- * @vma - vma to scan
- *
- * lists of page pointes and corresponding virtual addresses are tracked
- * inside ctx->pgarr page-array chain
- */
-static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma)
-{
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	struct cr_pgarr *pgarr;
-	int nr, total = 0;
-
-	while (addr < end) {
-		pgarr = cr_pgarr_prep(ctx);
-		if (!pgarr)
-			return -ENOMEM;
-		nr = cr_vma_fill_pgarr(ctx, pgarr, vma, &addr);
-		if (nr < 0)
-			return nr;
-		pgarr->nr_free -= nr;
-		pgarr->nr_used += nr;
-		total += nr;
-	}
-
-	cr_debug("total %d\n", total);
-	return total;
+	return ret;
 }
 
 static int cr_page_write(struct cr_ctx *ctx, struct page *page, char *buf)
@@ -300,7 +264,7 @@ static int cr_vma_dump_pages(struct cr_c
 	if (!total)
 		return 0;
 
-	list_for_each_entry(pgarr, &ctx->pgarr, list) {
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
 		ret = cr_kwrite(ctx, pgarr->vaddrs,
 				pgarr->nr_used * sizeof(*pgarr->vaddrs));
 		if (ret < 0)
@@ -311,7 +275,7 @@ static int cr_vma_dump_pages(struct cr_c
 	if (!buf)
 		return -ENOMEM;
 
-	list_for_each_entry(pgarr, &ctx->pgarr, list) {
+	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
 		for (i = 0; i < pgarr->nr_used; i++) {
 			ret = cr_page_write(ctx, pgarr->pages[i], buf);
 			if (ret < 0)
@@ -366,7 +330,7 @@ static int cr_write_vma(struct cr_ctx *c
 	/* (1) scan: scan through the PTEs of the vma to count the pages
 	 * to dump (and later make those pages COW), and keep the list of
 	 * pages (and a reference to each page) on the checkpoint ctx */
-	nr = cr_vma_scan_pages(ctx, vma);
+	nr = cr_private_vma(ctx, vma);
 	if (nr < 0)
 		return nr;
 
@@ -387,7 +351,7 @@ static int cr_write_vma(struct cr_ctx *c
 	ret = cr_vma_dump_pages(ctx, nr);
 
 	/* (3) free: release the extra references to the pages in the list */
-	cr_pgarr_reset(ctx);
+	cr_reset_all_pgarrs(ctx);
 
 	return ret;
 }
diff -puN checkpoint/ckpt_mem.h~p4-dave checkpoint/ckpt_mem.h
--- linux-2.6.git/checkpoint/ckpt_mem.h~p4-dave	2008-09-10 13:58:55.000000000 -0700
+++ linux-2.6.git-dave/checkpoint/ckpt_mem.h	2008-09-10 13:58:55.000000000 -0700
@@ -23,13 +23,10 @@ struct cr_pgarr {
 	unsigned long *vaddrs;
 	struct page **pages;
 	unsigned int nr_used;	/* how many entries already used */
-	unsigned int nr_free;	/* how many entries still free */
 	struct list_head list;
 };
 
-void cr_pgarr_reset(struct cr_ctx *ctx);
 void cr_pgarr_free(struct cr_ctx *ctx);
-struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx);
 struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx);
 
 #endif /* _CHECKPOINT_CKPT_MEM_H_ */
diff -puN include/linux/ckpt.h~p4-dave include/linux/ckpt.h
--- linux-2.6.git/include/linux/ckpt.h~p4-dave	2008-09-10 13:58:55.000000000 -0700
+++ linux-2.6.git-dave/include/linux/ckpt.h	2008-09-10 13:58:55.000000000 -0700
@@ -28,8 +28,8 @@ struct cr_ctx {
 	void *hbuf;		/* temporary buffer for headers */
 	int hpos;		/* position in headers buffer */
 
-	struct list_head pgarr;	/* page array for dumping VMA contents */
-	struct cr_pgarr *pgcur;	/* current position in page array */
+	struct list_head pgarr_list;	/* page array for dumping VMA contents */
+	unsigned long nr_full_pgarrs;
 
 	struct path *vfsroot;	/* container root (FIXME) */
 };
_



-- Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ