lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20110215163948.429be561@mschwide.boeblingen.de.ibm.com>
Date:	Tue, 15 Feb 2011 16:39:48 +0100
From:	Martin Schwidefsky <schwidefsky@...ibm.com>
To:	Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc:	Hugh Dickins <hughd@...gle.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Benjamin Herrenschmidt <benh@...nel.crashing.org>,
	David Miller <davem@...emloft.net>,
	Nick Piggin <npiggin@...nel.dk>, linux-kernel@...r.kernel.org,
	linux-arch@...r.kernel.org, linux-mm@...ck.org,
	Andrea Arcangeli <aarcange@...hat.com>
Subject: Re: [PATCH 00/21] mm: Preemptibility -v6

On Tue, 15 Feb 2011 15:00:17 +0100
Martin Schwidefsky <schwidefsky@...ibm.com> wrote:

> On Wed, 19 Jan 2011 18:10:39 +0100
> Peter Zijlstra <a.p.zijlstra@...llo.nl> wrote:
> 
> > Martin, while doing the below DEFINE_PER_CPU removal I saw you had a
> > bunch of RCU table removal thingies in arch/s390/mm/pgtable.c, could
> > s390 use the generic bits like sparc and powerpc (see patch 16)?
> 
> That should do it. 229 deletions vs. 74 insertions, not bad. And the
> tlb flushing code actually got simpler. Even better :-)

Darn, I forgot to run "quilt refresh". The last patch I sent was the old,
broken one. This one is better.

--
Subject: [PATCH] s390: use generic RCU page-table freeing

From: Martin Schwidefsky <schwidefsky@...ibm.com>

Now that we have a generic implementation for RCU based page table
freeing, use it for s390 as well. It saves a couple of lines.

Signed-off-by: Martin Schwidefsky <schwidefsky@...ibm.com>
---
 arch/s390/Kconfig               |    1 
 arch/s390/include/asm/pgalloc.h |   19 +--
 arch/s390/include/asm/tlb.h     |   91 ++++++++----------
 arch/s390/mm/pgtable.c          |  192 +++++-----------------------------------
 4 files changed, 74 insertions(+), 229 deletions(-)

--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -87,6 +87,7 @@ config S390
 	select HAVE_KERNEL_LZO
 	select HAVE_GET_USER_PAGES_FAST
 	select HAVE_ARCH_MUTEX_CPU_RELAX
+	select HAVE_RCU_TABLE_FREE
 	select ARCH_INLINE_SPIN_TRYLOCK
 	select ARCH_INLINE_SPIN_TRYLOCK_BH
 	select ARCH_INLINE_SPIN_LOCK
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -20,12 +20,11 @@
 #define check_pgt_cache()	do {} while (0)
 
 unsigned long *crst_table_alloc(struct mm_struct *, int);
-void crst_table_free(struct mm_struct *, unsigned long *);
-void crst_table_free_rcu(struct mm_struct *, unsigned long *);
+void crst_table_free(unsigned long *);
 
 unsigned long *page_table_alloc(struct mm_struct *);
-void page_table_free(struct mm_struct *, unsigned long *);
-void page_table_free_rcu(struct mm_struct *, unsigned long *);
+void page_table_free(unsigned long *);
+
 void disable_noexec(struct mm_struct *, struct task_struct *);
 
 static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
@@ -95,7 +94,7 @@ static inline pud_t *pud_alloc_one(struc
 		crst_table_init(table, _REGION3_ENTRY_EMPTY);
 	return (pud_t *) table;
 }
-#define pud_free(mm, pud) crst_table_free(mm, (unsigned long *) pud)
+#define pud_free(mm, pud) crst_table_free((unsigned long *) pud)
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
 {
@@ -104,7 +103,7 @@ static inline pmd_t *pmd_alloc_one(struc
 		crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
 	return (pmd_t *) table;
 }
-#define pmd_free(mm, pmd) crst_table_free(mm, (unsigned long *) pmd)
+#define pmd_free(mm, pmd) crst_table_free((unsigned long *) pmd)
 
 static inline void pgd_populate_kernel(struct mm_struct *mm,
 				       pgd_t *pgd, pud_t *pud)
@@ -148,7 +147,7 @@ static inline pgd_t *pgd_alloc(struct mm
 	return (pgd_t *)
 		crst_table_alloc(mm, user_mode == SECONDARY_SPACE_MODE);
 }
-#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
+#define pgd_free(mm, pgd) crst_table_free((unsigned long *) pgd)
 
 static inline void pmd_populate_kernel(struct mm_struct *mm,
 				       pmd_t *pmd, pte_t *pte)
@@ -175,9 +174,7 @@ static inline void pmd_populate(struct m
 #define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm))
 #define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm))
 
-#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
-#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
-
-extern void rcu_table_freelist_finish(void);
+#define pte_free_kernel(mm, pte) page_table_free((unsigned long *) pte)
+#define pte_free(mm, pte) page_table_free((unsigned long *) pte)
 
 #endif /* _S390_PGALLOC_H */
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -29,50 +29,42 @@
 #include <asm/smp.h>
 #include <asm/tlbflush.h>
 
+struct mmu_table_batch {
+	struct rcu_head rcu;
+	unsigned int nr;
+	void *tables[0];
+};
+
+#define MAX_TABLE_BATCH \
+	((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))
+
+void tlb_table_flush(struct mmu_gather *);
+void tlb_remove_table(struct mmu_gather *, void *);
+
 struct mmu_gather {
 	struct mm_struct *mm;
 	unsigned int fullmm;
-	unsigned int nr_ptes;
-	unsigned int nr_pxds;
-	unsigned int max;
-	void **array;
-	void *local[8];
+	struct mmu_table_batch *batch;
+	/* need_flush is used only for page tables */
+	unsigned int need_flush : 1;
 };
 
-static inline void __tlb_alloc_page(struct mmu_gather *tlb)
-{
-	unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
-
-	if (addr) {
-		tlb->array = (void *) addr;
-		tlb->max = PAGE_SIZE / sizeof(void *);
-	}
-}
-
 static inline void tlb_gather_mmu(struct mmu_gather *tlb,
 				  struct mm_struct *mm,
 				  unsigned int full_mm_flush)
 {
 	tlb->mm = mm;
-	tlb->max = ARRAY_SIZE(tlb->local);
-	tlb->array = tlb->local;
 	tlb->fullmm = full_mm_flush;
 	if (tlb->fullmm)
 		__tlb_flush_mm(mm);
-	else
-		__tlb_alloc_page(tlb);
-	tlb->nr_ptes = 0;
-	tlb->nr_pxds = tlb->max;
+	tlb->batch = NULL;
+	tlb->need_flush = 0;
 }
 
 static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 {
-	if (!tlb->fullmm && (tlb->nr_ptes > 0 || tlb->nr_pxds < tlb->max))
-		__tlb_flush_mm(tlb->mm);
-	while (tlb->nr_ptes > 0)
-		page_table_free_rcu(tlb->mm, tlb->array[--tlb->nr_ptes]);
-	while (tlb->nr_pxds < tlb->max)
-		crst_table_free_rcu(tlb->mm, tlb->array[tlb->nr_pxds++]);
+	if (tlb->need_flush)
+		tlb_table_flush(tlb);
 }
 
 static inline void tlb_finish_mmu(struct mmu_gather *tlb,
@@ -80,13 +72,8 @@ static inline void tlb_finish_mmu(struct
 {
 	tlb_flush_mmu(tlb);
 
-	rcu_table_freelist_finish();
-
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
-
-	if (tlb->array != tlb->local)
-		free_pages((unsigned long) tlb->array, 0);
 }
 
 /*
@@ -113,12 +100,11 @@ static inline void tlb_remove_page(struc
 static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 				unsigned long address)
 {
-	if (!tlb->fullmm) {
-		tlb->array[tlb->nr_ptes++] = pte;
-		if (tlb->nr_ptes >= tlb->nr_pxds)
-			tlb_flush_mmu(tlb);
-	} else
-		page_table_free(tlb->mm, (unsigned long *) pte);
+	if (!tlb->fullmm)
+		/* Use LSB to distinguish crst table vs. page table */
+		tlb_remove_table(tlb, (void *) pte + 1);
+	else
+		page_table_free((unsigned long *) pte);
 }
 
 /*
@@ -134,12 +120,10 @@ static inline void pmd_free_tlb(struct m
 #ifdef __s390x__
 	if (tlb->mm->context.asce_limit <= (1UL << 31))
 		return;
-	if (!tlb->fullmm) {
-		tlb->array[--tlb->nr_pxds] = pmd;
-		if (tlb->nr_ptes >= tlb->nr_pxds)
-			tlb_flush_mmu(tlb);
-	} else
-		crst_table_free(tlb->mm, (unsigned long *) pmd);
+	if (!tlb->fullmm)
+		tlb_remove_table(tlb, pmd);
+	else
+		crst_table_free((unsigned long *) pmd);
 #endif
 }
 
@@ -156,15 +140,22 @@ static inline void pud_free_tlb(struct m
 #ifdef __s390x__
 	if (tlb->mm->context.asce_limit <= (1UL << 42))
 		return;
-	if (!tlb->fullmm) {
-		tlb->array[--tlb->nr_pxds] = pud;
-		if (tlb->nr_ptes >= tlb->nr_pxds)
-			tlb_flush_mmu(tlb);
-	} else
-		crst_table_free(tlb->mm, (unsigned long *) pud);
+	if (!tlb->fullmm)
+		tlb_remove_table(tlb, pud);
+	else
+		crst_table_free((unsigned long *) pud);
 #endif
 }
 
+static inline void __tlb_remove_table(void *table)
+{
+	/* Use LSB to distinguish crst table vs. page table */
+	if ((unsigned long) table & 1)
+		page_table_free(table - 1);
+	else
+		crst_table_free(table);
+}
+
 #define tlb_start_vma(tlb, vma)			do { } while (0)
 #define tlb_end_vma(tlb, vma)			do { } while (0)
 #define tlb_remove_tlb_entry(tlb, ptep, addr)	do { } while (0)
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -24,92 +24,17 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
-struct rcu_table_freelist {
-	struct rcu_head rcu;
-	struct mm_struct *mm;
-	unsigned int pgt_index;
-	unsigned int crst_index;
-	unsigned long *table[0];
-};
-
-#define RCU_FREELIST_SIZE \
-	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
-	  / sizeof(unsigned long))
-
-static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
-
-static void __page_table_free(struct mm_struct *mm, unsigned long *table);
-static void __crst_table_free(struct mm_struct *mm, unsigned long *table);
-
-static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
-{
-	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
-	struct rcu_table_freelist *batch = *batchp;
-
-	if (batch)
-		return batch;
-	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
-	if (batch) {
-		batch->mm = mm;
-		batch->pgt_index = 0;
-		batch->crst_index = RCU_FREELIST_SIZE;
-		*batchp = batch;
-	}
-	return batch;
-}
-
-static void rcu_table_freelist_callback(struct rcu_head *head)
-{
-	struct rcu_table_freelist *batch =
-		container_of(head, struct rcu_table_freelist, rcu);
-
-	while (batch->pgt_index > 0)
-		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
-	while (batch->crst_index < RCU_FREELIST_SIZE)
-		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
-	free_page((unsigned long) batch);
-}
-
-void rcu_table_freelist_finish(void)
-{
-	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
-
-	if (!batch)
-		return;
-	call_rcu(&batch->rcu, rcu_table_freelist_callback);
-	__get_cpu_var(rcu_table_freelist) = NULL;
-}
-
-static void smp_sync(void *arg)
-{
-}
 
 #ifndef CONFIG_64BIT
 #define ALLOC_ORDER	1
 #define TABLES_PER_PAGE	4
 #define FRAG_MASK	15UL
 #define SECOND_HALVES	10UL
-
-void clear_table_pgstes(unsigned long *table)
-{
-	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
-	memset(table + 256, 0, PAGE_SIZE/4);
-	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
-	memset(table + 768, 0, PAGE_SIZE/4);
-}
-
 #else
 #define ALLOC_ORDER	2
 #define TABLES_PER_PAGE	2
 #define FRAG_MASK	3UL
 #define SECOND_HALVES	2UL
-
-void clear_table_pgstes(unsigned long *table)
-{
-	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
-	memset(table + 256, 0, PAGE_SIZE/2);
-}
-
 #endif
 
 unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
@@ -138,6 +63,7 @@ unsigned long *crst_table_alloc(struct m
 			return NULL;
 		}
 		page->index = page_to_phys(shadow);
+		page->private = (unsigned long) mm;
 	}
 	spin_lock_bh(&mm->context.list_lock);
 	list_add(&page->lru, &mm->context.crst_list);
@@ -145,47 +71,19 @@ unsigned long *crst_table_alloc(struct m
 	return (unsigned long *) page_to_phys(page);
 }
 
-static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	unsigned long *shadow = get_shadow_table(table);
-
-	if (shadow)
-		free_pages((unsigned long) shadow, ALLOC_ORDER);
-	free_pages((unsigned long) table, ALLOC_ORDER);
-}
-
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	struct page *page = virt_to_page(table);
-
-	spin_lock_bh(&mm->context.list_lock);
-	list_del(&page->lru);
-	spin_unlock_bh(&mm->context.list_lock);
-	__crst_table_free(mm, table);
-}
-
-void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
+void crst_table_free(unsigned long *table)
 {
-	struct rcu_table_freelist *batch;
 	struct page *page = virt_to_page(table);
+	struct mm_struct *mm = (struct mm_struct *) page->private;
+	unsigned long *shadow = get_shadow_table(table);
 
 	spin_lock_bh(&mm->context.list_lock);
 	list_del(&page->lru);
+	page->private = 0;
 	spin_unlock_bh(&mm->context.list_lock);
-	if (atomic_read(&mm->mm_users) < 2 &&
-	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
-		__crst_table_free(mm, table);
-		return;
-	}
-	batch = rcu_table_freelist_get(mm);
-	if (!batch) {
-		smp_call_function(smp_sync, NULL, 1);
-		__crst_table_free(mm, table);
-		return;
-	}
-	batch->table[--batch->crst_index] = table;
-	if (batch->pgt_index >= batch->crst_index)
-		rcu_table_freelist_finish();
+	if (shadow)
+		free_pages((unsigned long) shadow, ALLOC_ORDER);
+	free_pages((unsigned long) table, ALLOC_ORDER);
 }
 
 #ifdef CONFIG_64BIT
@@ -223,7 +121,7 @@ repeat:
 	}
 	spin_unlock_bh(&mm->page_table_lock);
 	if (table)
-		crst_table_free(mm, table);
+		crst_table_free(table);
 	if (mm->context.asce_limit < limit)
 		goto repeat;
 	update_mm(mm, current);
@@ -257,7 +155,7 @@ void crst_table_downgrade(struct mm_stru
 		}
 		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
 		mm->task_size = mm->context.asce_limit;
-		crst_table_free(mm, (unsigned long *) pgd);
+		crst_table_free((unsigned long *) pgd);
 	}
 	update_mm(mm, current);
 }
@@ -288,11 +186,7 @@ unsigned long *page_table_alloc(struct m
 			return NULL;
 		pgtable_page_ctor(page);
 		page->flags &= ~FRAG_MASK;
-		table = (unsigned long *) page_to_phys(page);
-		if (mm->context.has_pgste)
-			clear_table_pgstes(table);
-		else
-			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
+		page->private = (unsigned long) mm;
 		spin_lock_bh(&mm->context.list_lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
 	}
@@ -305,42 +199,34 @@ unsigned long *page_table_alloc(struct m
 	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
 		list_move_tail(&page->lru, &mm->context.pgtable_list);
 	spin_unlock_bh(&mm->context.list_lock);
+	clear_table(table, _PAGE_TYPE_EMPTY, PTRS_PER_PTE * sizeof(long));
+	if (mm->context.noexec)
+		clear_table(table + 256, _PAGE_TYPE_EMPTY,
+			    PTRS_PER_PTE * sizeof(long));
+	else if (mm->context.has_pgste)
+		clear_table(table + 256, 0, PTRS_PER_PTE * sizeof(long));
 	return table;
 }
 
-static void __page_table_free(struct mm_struct *mm, unsigned long *table)
+void page_table_free(unsigned long *table)
 {
-	struct page *page;
-	unsigned long bits;
-
-	bits = ((unsigned long) table) & 15;
-	table = (unsigned long *)(((unsigned long) table) ^ bits);
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	page->flags ^= bits;
-	if (!(page->flags & FRAG_MASK)) {
-		pgtable_page_dtor(page);
-		__free_page(page);
-	}
-}
-
-void page_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	struct page *page;
+	struct page *page = virt_to_page(table);
+	struct mm_struct *mm = (struct mm_struct *) page->private;
 	unsigned long bits;
 
 	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 	spin_lock_bh(&mm->context.list_lock);
 	page->flags ^= bits;
 	if (page->flags & FRAG_MASK) {
 		/* Page now has some free pgtable fragments. */
-		if (!list_empty(&page->lru))
-			list_move(&page->lru, &mm->context.pgtable_list);
+		list_move(&page->lru, &mm->context.pgtable_list);
 		page = NULL;
-	} else
+	} else {
 		/* All fragments of the 4K page have been freed. */
 		list_del(&page->lru);
+		page->private = 0;
+	}
 	spin_unlock_bh(&mm->context.list_lock);
 	if (page) {
 		pgtable_page_dtor(page);
@@ -348,36 +234,6 @@ void page_table_free(struct mm_struct *m
 	}
 }
 
-void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
-{
-	struct rcu_table_freelist *batch;
-	struct page *page;
-	unsigned long bits;
-
-	if (atomic_read(&mm->mm_users) < 2 &&
-	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
-		page_table_free(mm, table);
-		return;
-	}
-	batch = rcu_table_freelist_get(mm);
-	if (!batch) {
-		smp_call_function(smp_sync, NULL, 1);
-		page_table_free(mm, table);
-		return;
-	}
-	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
-	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	spin_lock_bh(&mm->context.list_lock);
-	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
-	list_del_init(&page->lru);
-	spin_unlock_bh(&mm->context.list_lock);
-	table = (unsigned long *)(((unsigned long) table) | bits);
-	batch->table[batch->pgt_index++] = table;
-	if (batch->pgt_index >= batch->crst_index)
-		rcu_table_freelist_finish();
-}
-
 void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
 {
 	struct page *page;

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ