diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 26f15c4..022043e 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -60,8 +60,6 @@ int handle_page_fault(unsigned long address, unsigned long ip, goto good_area; else if(!(vma->vm_flags & VM_GROWSDOWN)) goto out; - else if(is_user && !ARCH_IS_STACKGROW(address)) - goto out; else if(expand_stack(vma, address)) goto out; diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 185399b..a2731fc 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -232,9 +232,6 @@ do { \ #define load_elf_binary load_elf32_binary #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) -#define setup_arg_pages(bprm, stack_top, exec_stack) \ - ia32_setup_arg_pages(bprm, stack_top, exec_stack) -int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack); #undef start_thread #define start_thread(regs,new_rip,new_rsp) do { \ @@ -289,55 +286,7 @@ static void elf32_init(struct pt_regs *regs) int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long stack_base; - struct vm_area_struct *mpnt; - struct mm_struct *mm = current->mm; - int i, ret; - - stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE; - mm->arg_start = bprm->p + stack_base; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(&mm->mmap_sem); - { - mpnt->vm_mm = mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = stack_top; - if (executable_stack == EXSTACK_ENABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; - else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
- PAGE_COPY_EXEC : PAGE_COPY; - if ((ret = insert_vm_struct(mm, mpnt))) { - up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - mm->stack_vm = mm->total_vm = vma_pages(mpnt); - } - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(&mm->mmap_sem); - - return 0; + return setup_arg_pages(bprm, stack_top, executable_stack); } EXPORT_SYMBOL(ia32_setup_arg_pages); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9cc4f0a..fa0cf77 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -254,8 +254,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, size_t len; if (__put_user((elf_addr_t)p, argv++)) return -EFAULT; - len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE*MAX_ARG_PAGES) + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return 0; p += len; } @@ -266,8 +266,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, size_t len; if (__put_user((elf_addr_t)p, envp++)) return -EFAULT; - len = strnlen_user((void __user *)p, PAGE_SIZE*MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE*MAX_ARG_PAGES) + len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return 0; p += len; } @@ -777,10 +777,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } /* OK, This is the point of no return */ - current->mm->start_data = 0; - current->mm->end_data = 0; - current->mm->end_code = 0; - current->mm->mmap = NULL; current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; @@ -985,9 +981,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; - create_elf_tables(bprm, &loc->elf_ex, + retval = create_elf_tables(bprm, &loc->elf_ex, 
(interpreter_type == INTERPRETER_AOUT), load_addr, interp_load_addr); + if (retval < 0) { + send_sig(SIGSEGV, current, 0); + goto out; + } /* N.B. passed_fileno might not be initialized? */ if (interpreter_type == INTERPRETER_AOUT) current->mm->arg_start += strlen(passed_fileno) + 1; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index f3ddca4..f3dc3ca 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -622,8 +622,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, p = (char __user *) current->mm->arg_start; for (loop = bprm->argc; loop > 0; loop--) { __put_user((elf_caddr_t) p, argv++); - len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE * MAX_ARG_PAGES) + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } @@ -634,8 +634,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, current->mm->env_start = (unsigned long) p; for (loop = bprm->envc; loop > 0; loop--) { __put_user((elf_caddr_t)(unsigned long) p, envp++); - len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES); - if (!len || len > PAGE_SIZE * MAX_ARG_PAGES) + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e6f5799..aa9ae99 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -126,7 +126,9 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { - remove_arg_zero(bprm); + retval = remove_arg_zero(bprm); + if (retval) + goto _ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) { diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 1edbcca..5b6309f 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -68,7 +68,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) * This is done in reverse order, because of how the * user environment and arguments are stored. 
*/ - remove_arg_zero(bprm); + retval = remove_arg_zero(bprm); + if (retval) + return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) return retval; bprm->argc++; diff --git a/fs/compat.c b/fs/compat.c index 72e5e69..5279745 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1391,6 +1391,7 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, { struct page *kmapped_page = NULL; char *kaddr = NULL; + unsigned long kpos = 0; int ret; while (argc-- > 0) { @@ -1399,92 +1400,84 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, unsigned long pos; if (get_user(str, argv+argc) || - !(len = strnlen_user(compat_ptr(str), bprm->p))) { + !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) { ret = -EFAULT; goto out; } - if (bprm->p < len) { + if (MAX_ARG_STRLEN < len) { ret = -E2BIG; goto out; } - bprm->p -= len; - /* XXX: add architecture specific overflow check here. */ + /* We're going to work our way backwards. */ pos = bprm->p; + str += len; + bprm->p -= len; while (len > 0) { - int i, new, err; int offset, bytes_to_copy; - struct page *page; offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) { - ret = -ENOMEM; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + +#ifdef CONFIG_STACK_GROWSUP + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) { + /* We've exceeded the stack rlimit. */ + ret = -E2BIG; + goto out; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, 1, 1, &page, NULL); + if (ret <= 0) { + /* We've exceeded the stack rlimit. 
*/ + ret = -E2BIG; goto out; } - new = 1; - } - if (page != kmapped_page) { - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_page(kmapped_page); + } kmapped_page = page; kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_cache_page(bprm->vma, kpos, + page_to_pfn(kmapped_page)); } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr+offset, compat_ptr(str), - bytes_to_copy); - if (err) { + if (copy_from_user(kaddr+offset, compat_ptr(str), + bytes_to_copy)) { ret = -EFAULT; goto out; } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; } } ret = 0; out: - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); - return ret; -} - -#ifdef CONFIG_MMU - -#define free_arg_pages(bprm) do { } while (0) - -#else - -static inline void free_arg_pages(struct linux_binprm *bprm) -{ - int i; - - for (i = 0; i < MAX_ARG_PAGES; i++) { - if (bprm->page[i]) - __free_page(bprm->page[i]); - bprm->page[i] = NULL; + put_page(kmapped_page); } + return ret; } -#endif /* CONFIG_MMU */ - /* * compat_do_execve() is mostly a copy of do_execve(), with the exception * that it processes 32 bit argv and envp pointers. 
@@ -1497,7 +1490,6 @@ int compat_do_execve(char * filename, struct linux_binprm *bprm; struct file *file; int retval; - int i; retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); @@ -1511,24 +1503,19 @@ int compat_do_execve(char * filename, sched_exec(); - bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); bprm->file = file; bprm->filename = filename; bprm->interp = filename; - bprm->mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm->mm) - goto out_file; - retval = init_new_context(current, bprm->mm); - if (retval < 0) - goto out_mm; + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; - bprm->argc = compat_count(argv, bprm->p / sizeof(compat_uptr_t)); + bprm->argc = compat_count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out_mm; - bprm->envc = compat_count(envp, bprm->p / sizeof(compat_uptr_t)); + bprm->envc = compat_count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) goto out_mm; @@ -1553,10 +1540,8 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm, regs); + retval = search_binary_handler(bprm,regs); if (retval >= 0) { - free_arg_pages(bprm); - /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); @@ -1565,19 +1550,12 @@ int compat_do_execve(char * filename, } out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm->page[i]; - if (page) - __free_page(page); - } - if (bprm->security) security_bprm_free(bprm); out_mm: if (bprm->mm) - mmdrop(bprm->mm); + mmput (bprm->mm); out_file: if (bprm->file) { diff --git a/fs/exec.c b/fs/exec.c index 3155e91..8637630 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include +#include #ifdef CONFIG_KMOD #include @@ -173,6 +174,153 @@ exit: goto out; } +#ifdef CONFIG_MMU + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + 
+#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + ret = get_user_pages(current, bprm->mm, pos, + 1, write, 1, &page, NULL); + if (ret <= 0) + return NULL; + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_page(page); +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +#else + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +#endif /* CONFIG_MMU */ + +/* Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the + * stack flags, permissions, and offset, so we use temporary values. We'll + * update them later in setup_arg_pages(). */ +int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + struct vm_area_struct *vma = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + if ((err = init_new_context(current, mm))) + goto err; + +#ifdef CONFIG_MMU + bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + err = -ENOMEM; + if (!vma) + goto err; + + down_write(&mm->mmap_sem); + { + vma->vm_mm = mm; + + /* Place the stack at the top of user memory. Later, we'll + * move this to an appropriate place. 
We don't use STACK_TOP + * because that can depend on attributes which aren't + * configured yet. */ + vma->vm_end = TASK_SIZE; + vma->vm_start = vma->vm_end - PAGE_SIZE; + + vma->vm_flags = VM_STACK_FLAGS; + vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; + if ((err = insert_vm_struct(mm, vma))) { + up_write(&mm->mmap_sem); + goto err; + } + + mm->stack_vm = mm->total_vm = 1; + } + up_write(&mm->mmap_sem); + + bprm->p = vma->vm_end - sizeof(void *); +#else + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); +#endif + + return 0; + +err: +#ifdef CONFIG_MMU + if (vma) { + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + } +#endif + + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + +EXPORT_SYMBOL(bprm_mm_init); + /* * count() counts the number of strings in array ARGV. */ @@ -198,15 +346,16 @@ static int count(char __user * __user * argv, int max) } /* - * 'copy_strings()' copies argument/environment strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. + * 'copy_strings()' copies argument/environment strings from the old + * process's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. 
*/ static int copy_strings(int argc, char __user * __user * argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; + unsigned long kpos = 0; int ret; while (argc-- > 0) { @@ -215,69 +364,77 @@ static int copy_strings(int argc, char __user * __user * argv, unsigned long pos; if (get_user(str, argv+argc) || - !(len = strnlen_user(str, bprm->p))) { + !(len = strnlen_user(str, MAX_ARG_STRLEN))) { ret = -EFAULT; goto out; } - if (bprm->p < len) { +#ifdef CONFIG_MMU + if (MAX_ARG_STRLEN < len) { ret = -E2BIG; goto out; } +#else + if (bprm->p < len) { + ret = -E2BIG; + goto out; + } +#endif - bprm->p -= len; - /* XXX: add architecture specific overflow check here. */ + /* We're going to work our way backwards. */ pos = bprm->p; + str += len; + bprm->p -= len; while (len > 0) { - int i, new, err; int offset, bytes_to_copy; - struct page *page; offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); if (!page) { - ret = -ENOMEM; + ret = -E2BIG; goto out; } - new = 1; - } - if (page != kmapped_page) { - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_arg_page(kmapped_page); + } kmapped_page = page; kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_cache_page(bprm->vma, kpos, + page_to_pfn(kmapped_page)); } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = 
copy_from_user(kaddr+offset, str, bytes_to_copy); - if (err) { + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; } } ret = 0; out: - if (kmapped_page) + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); kunmap(kmapped_page); + put_arg_page(kmapped_page); + } return ret; } @@ -297,154 +454,157 @@ int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) EXPORT_SYMBOL(copy_strings_kernel); #ifdef CONFIG_MMU -/* - * This routine is used to map in a page into an address space: needed by - * execve() for the initial stack and environment pages. - * - * vma->vm_mm->mmap_sem is held for writing. - */ -void install_arg_page(struct vm_area_struct *vma, - struct page *page, unsigned long address) + +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; - pte_t * pte; - spinlock_t *ptl; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start + shift; + unsigned long new_end = old_end + shift; + struct mmu_gather *tlb; + + BUG_ON(new_start > new_end); + + if (new_start < old_start) { + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + vma_adjust(vma, new_start, old_end, + vma->vm_pgoff - (-shift >> PAGE_SHIFT), NULL); + + if (length != move_page_tables(vma, old_start, + vma, new_start, length)) + return -ENOMEM; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? 
vma->vm_next->vm_start : 0); + tlb_finish_mmu(tlb, new_end, old_end); + + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + } else { + struct vm_area_struct *tmp, *prev; + + tmp = find_vma_prev(mm, new_end, &prev); + if ((tmp && tmp->vm_start < new_end) || prev != vma) + return -EFAULT; - if (unlikely(anon_vma_prepare(vma))) - goto out; + find_vma_prev(mm, vma->vm_start, &prev); - flush_dcache_page(page); - pte = get_locked_pte(mm, address, &ptl); - if (!pte) - goto out; - if (!pte_none(*pte)) { - pte_unmap_unlock(pte, ptl); - goto out; + vma_adjust(vma, old_start, new_end, vma->vm_pgoff, NULL); + + if (length != move_page_tables_up(vma, old_start, + vma, new_start, length)) + return -ENOMEM; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + free_pgd_range(&tlb, old_start, new_start, + prev ? prev->vm_end: 0, new_start); + tlb_finish_mmu(tlb, old_start, new_start); + + vma_adjust(vma, new_start, new_end, + vma->vm_pgoff + (shift >> PAGE_SHIFT), NULL); } - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( - page, vma->vm_page_prot)))); - page_add_new_anon_rmap(page, vma, address); - pte_unmap_unlock(pte, ptl); - - /* no need for flush_tlb */ - return; -out: - __free_page(page); - force_sig(SIGKILL, current); + + return 0; } #define EXTRA_STACK_VM_PAGES 20 /* random */ +/* Finalizes the stack vm_area_struct. The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { - unsigned long stack_base; - struct vm_area_struct *mpnt; + unsigned long ret; + unsigned long stack_base, stack_shift; struct mm_struct *mm = current->mm; - int i, ret; - long arg_size; + struct vm_area_struct *vma = bprm->vma; #ifdef CONFIG_STACK_GROWSUP - /* Move the argument and environment strings to the bottom of the - * stack space. 
- */ - int offset, j; - char *to, *from; - - /* Start by shifting all the pages down */ - i = 0; - for (j = 0; j < MAX_ARG_PAGES; j++) { - struct page *page = bprm->page[j]; - if (!page) - continue; - bprm->page[i++] = page; - } - - /* Now move them within their pages */ - offset = bprm->p % PAGE_SIZE; - to = kmap(bprm->page[0]); - for (j = 1; j < i; j++) { - memmove(to, to + offset, PAGE_SIZE - offset); - from = kmap(bprm->page[j]); - memcpy(to + PAGE_SIZE - offset, from, offset); - kunmap(bprm->page[j - 1]); - to = from; - } - memmove(to, to + offset, PAGE_SIZE - offset); - kunmap(bprm->page[j - 1]); - /* Limit stack size to 1GB */ stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; if (stack_base > (1 << 30)) stack_base = 1 << 30; stack_base = PAGE_ALIGN(stack_top - stack_base); - /* Adjust bprm->p to point to the end of the strings. */ - bprm->p = stack_base + PAGE_SIZE * i - offset; - - mm->arg_start = stack_base; - arg_size = i << PAGE_SHIFT; + /* Make sure we didn't let the argument array grow too large. */ + if (vma->vm_end - vma->vm_start > STACK_TOP - stack_base) + return -ENOMEM; - /* zero pages that were copied above */ - while (i < MAX_ARG_PAGES) - bprm->page[i++] = NULL; + stack_shift = stack_base - vma->vm_start; + mm->arg_start = bprm->p + stack_shift; + bprm->p = vma->vm_end + stack_shift; #else - stack_base = arch_align_stack(stack_top - MAX_ARG_PAGES*PAGE_SIZE); + BUG_ON(stack_top > STACK_TOP); + BUG_ON(stack_top & ~PAGE_MASK); + + stack_base = arch_align_stack(stack_top - mm->stack_vm*PAGE_SIZE); stack_base = PAGE_ALIGN(stack_base); - bprm->p += stack_base; + + /* Make sure we didn't let the argument array grow too large. 
*/ + if (stack_base > stack_top) + return -ENOMEM; + + stack_shift = stack_base - (bprm->p & PAGE_MASK); + bprm->p += stack_shift; mm->arg_start = bprm->p; - arg_size = stack_top - (PAGE_MASK & (unsigned long) mm->arg_start); #endif - arg_size += EXTRA_STACK_VM_PAGES * PAGE_SIZE; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!mpnt) - return -ENOMEM; + bprm->loader += stack_shift; + bprm->exec += stack_shift; down_write(&mm->mmap_sem); { - mpnt->vm_mm = mm; -#ifdef CONFIG_STACK_GROWSUP - mpnt->vm_start = stack_base; - mpnt->vm_end = stack_base + arg_size; -#else - mpnt->vm_end = stack_top; - mpnt->vm_start = mpnt->vm_end - arg_size; -#endif + struct vm_area_struct *prev = NULL; + unsigned long vm_flags = vma->vm_flags; + /* Adjust stack execute permissions; explicitly enable * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X * and leave alone (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) - mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; + vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) - mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; - else - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_flags |= mm->def_flags; - mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7]; - if ((ret = insert_vm_struct(mm, mpnt))) { + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) { up_write(&mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); return ret; } - mm->stack_vm = mm->total_vm = vma_pages(mpnt); - } + BUG_ON(prev != vma); + + /* Move stack pages down in memory. 
*/ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) { + up_write(&mm->mmap_sem); + return ret; + } + } - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - install_arg_page(mpnt, page, stack_base); +#ifdef CONFIG_STACK_GROWSUP + if (expand_stack(vma, vma->vm_end + + EXTRA_STACK_VM_PAGES * PAGE_SIZE)) { + up_write(&mm->mmap_sem); + return -EFAULT; + } +#else + if (expand_stack(vma, stack_base - + EXTRA_STACK_VM_PAGES * PAGE_SIZE)) { + up_write(&mm->mmap_sem); + return -EFAULT; } - stack_base += PAGE_SIZE; +#endif } up_write(&mm->mmap_sem); @@ -453,21 +613,6 @@ int setup_arg_pages(struct linux_binprm *bprm, EXPORT_SYMBOL(setup_arg_pages); -#define free_arg_pages(bprm) do { } while (0) - -#else - -static inline void free_arg_pages(struct linux_binprm *bprm) -{ - int i; - - for (i = 0; i < MAX_ARG_PAGES; i++) { - if (bprm->page[i]) - __free_page(bprm->page[i]); - bprm->page[i] = NULL; - } -} - #endif /* CONFIG_MMU */ struct file *open_exec(const char *name) @@ -985,28 +1130,47 @@ void compute_creds(struct linux_binprm *bprm) EXPORT_SYMBOL(compute_creds); -void remove_arg_zero(struct linux_binprm *bprm) +/* + * Arguments are '\0' separated strings found at the location bprm->p + * points to; chop off the first by relocating bprm->p to right after + * the first '\0' encountered. 
+ */ +int remove_arg_zero(struct linux_binprm *bprm) { - if (bprm->argc) { - unsigned long offset; - char * kaddr; - struct page *page; + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; - offset = bprm->p % PAGE_SIZE; - goto inside; + if (!bprm->argc) + return 0; - while (bprm->p++, *(kaddr+offset++)) { - if (offset != PAGE_SIZE) - continue; - offset = 0; - kunmap_atomic(kaddr, KM_USER0); -inside: - page = bprm->page[bprm->p/PAGE_SIZE]; - kaddr = kmap_atomic(page, KM_USER0); + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; } + kaddr = kmap_atomic(page, KM_USER0); + + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; + kunmap_atomic(kaddr, KM_USER0); - bprm->argc--; - } + put_arg_page(page); + + if (offset == PAGE_SIZE) + free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); + } while (offset == PAGE_SIZE); + + bprm->p++; + bprm->argc--; + ret = 0; + +out: + return ret; } EXPORT_SYMBOL(remove_arg_zero); @@ -1033,7 +1197,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) fput(bprm->file); bprm->file = NULL; - loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + loader = bprm->vma->vm_end - sizeof(void *); file = open_exec("/sbin/loader"); retval = PTR_ERR(file); @@ -1125,8 +1289,8 @@ int do_execve(char * filename, { struct linux_binprm *bprm; struct file *file; + unsigned long tmp; int retval; - int i; retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); @@ -1140,25 +1304,19 @@ int do_execve(char * filename, sched_exec(); - bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - bprm->file = file; bprm->filename = filename; bprm->interp = filename; - bprm->mm = mm_alloc(); - retval = -ENOMEM; - if (!bprm->mm) - goto out_file; - retval = init_new_context(current, bprm->mm); - if (retval < 0) - goto out_mm; + retval = bprm_mm_init(bprm); + if (retval) + goto out_file; - bprm->argc = count(argv, bprm->p / sizeof(void *)); + 
bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) goto out_mm; - bprm->envc = count(envp, bprm->p / sizeof(void *)); + bprm->envc = count(envp, MAX_ARG_STRINGS); if ((retval = bprm->envc) < 0) goto out_mm; @@ -1179,15 +1337,16 @@ int do_execve(char * filename, if (retval < 0) goto out; + tmp = bprm->p; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; + bprm->argv_len = tmp - bprm->p; retval = search_binary_handler(bprm,regs); if (retval >= 0) { - free_arg_pages(bprm); - /* execve success */ + free_arg_pages(bprm); security_bprm_free(bprm); acct_update_integrals(current); kfree(bprm); @@ -1195,26 +1354,19 @@ int do_execve(char * filename, } out: - /* Something went wrong, return the inode and free the argument pages*/ - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm->page[i]; - if (page) - __free_page(page); - } - + free_arg_pages(bprm); if (bprm->security) security_bprm_free(bprm); out_mm: if (bprm->mm) - mmdrop(bprm->mm); + mmput (bprm->mm); out_file: if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } - out_kfree: kfree(bprm); diff --git a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h index 595f1c3..869c236 100644 --- a/include/asm-um/processor-i386.h +++ b/include/asm-um/processor-i386.h @@ -67,9 +67,6 @@ static inline void rep_nop(void) #define current_text_addr() \ ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) -#define ARCH_IS_STACKGROW(address) \ - (address + 32 >= UPT_SP(&current->thread.regs.regs)) - #define KSTK_EIP(tsk) KSTK_REG(tsk, EIP) #define KSTK_ESP(tsk) KSTK_REG(tsk, UESP) #define KSTK_EBP(tsk) KSTK_REG(tsk, EBP) diff --git a/include/asm-um/processor-x86_64.h b/include/asm-um/processor-x86_64.h index 31c2d4d..64d7bdd 100644 --- a/include/asm-um/processor-x86_64.h +++ b/include/asm-um/processor-x86_64.h @@ -44,9 +44,6 @@ static inline void arch_copy_thread(struct arch_thread *from, #define current_text_addr() \ ({ void *pc; __asm__("movq 
$1f,%0\n1:":"=g" (pc)); pc; }) -#define ARCH_IS_STACKGROW(address) \ - (address + 128 >= UPT_SP(&current->thread.regs.regs)) - #define KSTK_EIP(tsk) KSTK_REG(tsk, RIP) #define KSTK_ESP(tsk) KSTK_REG(tsk, RSP) diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 2d956cd..8ac2277 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -5,12 +5,9 @@ struct pt_regs; -/* - * MAX_ARG_PAGES defines the number of pages allocated for arguments - * and envelope for the new program. 32 should suffice, this gives - * a maximum env+arg of 128kB w/4KB pages! - */ -#define MAX_ARG_PAGES 32 +/* FIXME: Find real limits, or none. */ +#define MAX_ARG_STRLEN (PAGE_SIZE * 32) +#define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ #define BINPRM_BUF_SIZE 128 @@ -22,7 +19,12 @@ struct pt_regs; */ struct linux_binprm{ char buf[BINPRM_BUF_SIZE]; +#ifdef CONFIG_MMU + struct vm_area_struct *vma; +#else +# define MAX_ARG_PAGES 32 struct page *page[MAX_ARG_PAGES]; +#endif struct mm_struct *mm; unsigned long p; /* current top of mem */ int sh_bang; @@ -38,6 +40,7 @@ struct linux_binprm{ unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; + unsigned long argv_len; }; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 @@ -66,7 +69,7 @@ extern int register_binfmt(struct linux_binfmt *); extern int unregister_binfmt(struct linux_binfmt *); extern int prepare_binprm(struct linux_binprm *); -extern void remove_arg_zero(struct linux_binprm *); +extern int __must_check remove_arg_zero(struct linux_binprm *); extern int search_binary_handler(struct linux_binprm *,struct pt_regs *); extern int flush_old_exec(struct linux_binprm * bprm); @@ -83,6 +86,7 @@ extern int suid_dumpable; extern int setup_arg_pages(struct linux_binprm * bprm, unsigned long stack_top, int executable_stack); +extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); extern void compute_creds(struct 
linux_binprm *binprm); extern int do_coredump(long signr, int exit_code, struct pt_regs * regs); diff --git a/include/linux/mm.h b/include/linux/mm.h index 60e0e4a..ceee062 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -771,7 +771,6 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); @@ -788,9 +787,18 @@ int FASTCALL(set_page_dirty(struct page *page)); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +extern unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len); +extern unsigned long move_page_tables_up(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len); extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); +extern int mprotect_fixup(struct vm_area_struct *vma, + struct vm_area_struct **pprev, unsigned long start, + unsigned long end, unsigned long newflags); /* * Prototype to add a shrinker callback for ageable caches. @@ -1091,6 +1099,9 @@ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); #ifdef CONFIG_IA64 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif +#ifdef CONFIG_STACK_GROWSUP +extern int expand_downwards(struct vm_area_struct *vma, unsigned long address); +#endif /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 47f1c53..b97ca11 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -165,6 +165,7 @@ enum KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_AUDIT_ARGV=77, /* int: max size of argv array for audit logging */ }; diff --git a/kernel/audit.c b/kernel/audit.c index 4e9d208..9b08f55 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -170,6 +170,22 @@ void audit_panic(const char *message) } } +void audit_kill(const char *message) +{ + switch (audit_failure) + { + case AUDIT_FAIL_SILENT: + break; + case AUDIT_FAIL_PRINTK: + printk(KERN_ERR "audit: %s\n", message); + break; + case AUDIT_FAIL_PANIC: + printk(KERN_ERR "audit: %s\n", message); + send_sig(SIGKILL, current, 0); + break; + } +} + static inline int audit_rate_check(void) { static unsigned long last_check = 0; diff --git a/kernel/audit.h b/kernel/audit.h index a337023..9cad5ce 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -116,6 +116,7 @@ extern void audit_send_reply(int pid, int seq, int type, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); +extern void audit_kill(const char *message); struct audit_netlink_list { int pid; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 628c7ac..402252e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -155,7 +155,7 @@ struct audit_aux_data_execve { struct audit_aux_data d; int argc; int envc; - char mem[0]; + struct mm_struct *mm; }; struct audit_aux_data_socketcall { @@ -795,6 +795,48 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk audit_log_task_context(ab); } +static void audit_log_execve_info(struct audit_buffer *ab, + struct audit_aux_data_execve *axi) +{ + int i; + 
long len; + const char __user *p = (const char __user *)axi->mm->arg_start; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + for (i = 0; i < axi->argc; i++, p += len) { + long ret; + char *tmp; + + len = strnlen_user(p, MAX_ARG_STRLEN); + /* + * We just created this mm, if we can't find the strings + * we just copied in something is _very_ wrong. + */ + BUG_ON(!len); + + tmp = kmalloc(len, GFP_KERNEL); + if (!tmp) { + audit_kill("out of memory for argv string," + " terminating process\n"); + break; + } + + ret = copy_from_user(tmp, p, len); + /* + * There is no reason for this copy to be short. + */ + BUG_ON(ret); + + audit_log_format(ab, "a%d=", i); + audit_log_untrustedstring(ab, tmp); + audit_log_format(ab, "\n"); + + kfree(tmp); + } +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -935,13 +977,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; - int i; - const char *p; - for (i = 0, p = axi->mem; i < axi->argc; i++) { - audit_log_format(ab, "a%d=", i); - p = audit_log_untrustedstring(ab, p); - audit_log_format(ab, "\n"); - } + audit_log_execve_info(ab, axi); break; } case AUDIT_SOCKETCALL: { @@ -1761,32 +1797,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode return 0; } +int audit_argv_kb = 32; + int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - unsigned long p, next; - void *to; if (likely(!audit_enabled || !context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, - GFP_KERNEL); + /* + * Even though the stack code doesn't limit the arg+env size any more, + * the audit code requires that _all_ arguments be logged in a single + * netlink skb. 
Hence cap it :-( + */ + if (bprm->argv_len > (audit_argv_kb << 10)) + return -E2BIG; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->argc = bprm->argc; ax->envc = bprm->envc; - for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { - struct page *page = bprm->page[p / PAGE_SIZE]; - void *kaddr = kmap(page); - next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); - memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); - to += next - p; - kunmap(page); - } - + ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; context->aux = (void *)ax; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748..5dfd4d7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int audit_argv_kb; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -603,6 +604,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_AUDITSYSCALL + { + .ctl_name = KERN_AUDIT_ARGV, + .procname = "audit_argv_kb", + .data = &audit_argv_kb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; diff --git a/mm/mmap.c b/mm/mmap.c index 88da687..8c6ceb5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1569,33 +1569,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ -#ifdef CONFIG_STACK_GROWSUP -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - return expand_upwards(vma, address); -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return 
NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +#ifndef CONFIG_STACK_GROWSUP +static inline +#endif +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1632,6 +1612,34 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) return error; } +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) { + make_pages_present(addr, prev->vm_end); + } + return prev; +} +#else +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_downwards(vma, address); +} + struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 3b8f3c0..e8346c3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -128,7 +128,7 @@ static void change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -static int +int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags) { diff --git a/mm/mremap.c b/mm/mremap.c index 5d4bd4f..858a36b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -118,9 +118,63 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(&mapping->i_mmap_lock); } +static void move_ptes_up(struct vm_area_struct *vma, pmd_t *old_pmd, + unsigned long old_addr, unsigned long old_end, + struct
vm_area_struct *new_vma, pmd_t *new_pmd, + unsigned long new_addr) +{ /* moves the ptes of [old_addr, old_end) to new_addr, starting at the page holding old_end-1 and stepping downwards one page at a time */ + struct address_space *mapping = NULL; + struct mm_struct *mm = vma->vm_mm; + pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl, *new_ptl; + unsigned long new_end = new_addr + (old_end - old_addr); + + if (vma->vm_file) { + /* + * Subtle point from Rajesh Venkatasubramanian: before + * moving file-based ptes, we must lock vmtruncate out, + * since it might clean the dst vma before the src vma, + * and we propagate stale pages into the dst afterward. + */ + mapping = vma->vm_file->f_mapping; + spin_lock(&mapping->i_mmap_lock); + if (new_vma->vm_truncate_count && + new_vma->vm_truncate_count != vma->vm_truncate_count) + new_vma->vm_truncate_count = 0; + } + + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. + */ + old_pte = pte_offset_map_lock(mm, old_pmd, old_end-1, &old_ptl); + new_pte = pte_offset_map_nested(new_pmd, new_end-1); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); + + for (; old_end > old_addr; old_pte--, old_end -= PAGE_SIZE, + new_pte--, new_end -= PAGE_SIZE) { + if (pte_none(*old_pte)) + continue; + pte = ptep_clear_flush(vma, old_end-1, old_pte); + pte = move_pte(pte, new_vma->vm_page_prot, old_end-1, new_end-1); + set_pte_at(mm, new_end-1, new_pte, pte); + } + + arch_leave_lazy_mmu_mode(); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + pte_unmap_nested(new_pte + 1); /* the loop decremented both pte pointers to one slot below the first entry it visited, so +1 (not -1 as in the upward-walking move_ptes) is the lowest pte the loop touched */ + pte_unmap_unlock(old_pte + 1, old_ptl); /* same correction for the source pte pointer */ + if (mapping) + spin_unlock(&mapping->i_mmap_lock); +} + #define LATENCY_LIMIT (64 * PAGE_SIZE) -static unsigned long move_page_tables(struct vm_area_struct *vma, +unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { @@ -132,21 +186,25 @@ static unsigned long move_page_tables(struct
vm_area_struct *vma, for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); + next = (old_addr + PMD_SIZE) & PMD_MASK; if (next - 1 > old_end) next = old_end; extent = next - old_addr; + old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); if (!new_pmd) break; + next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) extent = next - new_addr; if (extent > LATENCY_LIMIT) extent = LATENCY_LIMIT; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, new_pmd, new_addr); } @@ -154,6 +212,51 @@ static unsigned long move_page_tables(struct vm_area_struct *vma, return len + old_addr - old_end; /* how much done */ } +unsigned long move_page_tables_up(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len) +{ + unsigned long extent, prev, old_end, new_end; + pmd_t *old_pmd, *new_pmd; + + old_end = old_addr + len; + new_end = new_addr + len; + flush_cache_range(vma, old_addr, old_end); + + for (; old_end > old_addr; old_end -= extent, new_end -= extent) { + cond_resched(); + + /* + * calculate how far till prev PMD boundary for old + */ + prev = (old_end - 1) & PMD_MASK; + if (prev < old_addr) + prev = old_addr; + extent = old_end - prev; + + old_pmd = get_old_pmd(vma->vm_mm, old_end-1); + if (!old_pmd) + continue; + new_pmd = alloc_new_pmd(vma->vm_mm, new_end-1); + if (!new_pmd) + break; + + /* + * calculate and clip to prev PMD boundary for new + */ + prev = (new_end - 1) & PMD_MASK; + if (extent > new_end - prev) + extent = new_end - prev; + if (extent > LATENCY_LIMIT) + extent = LATENCY_LIMIT; + + move_ptes_up(vma, old_pmd, old_end - extent, old_end, + new_vma, new_pmd, new_end - extent); + } + + return old_addr + len - old_end; +} + static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr)