[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120407190129.9726.96427.stgit@zurg>
Date: Sat, 07 Apr 2012 23:01:29 +0400
From: Konstantin Khlebnikov <khlebnikov@...nvz.org>
To: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>,
linux-kernel@...r.kernel.org
Cc: Cyrill Gorcunov <gorcunov@...nvz.org>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Matt Helsley <matthltc@...ibm.com>,
Oleg Nesterov <oleg@...hat.com>
Subject: [PATCH v2 08/10] mm: kill vma flag VM_EXECUTABLE
Currently the kernel sets mm->exe_file during sys_execve() and then tracks the
number of vmas with the VM_EXECUTABLE flag in mm->num_exe_file_vmas. As soon as
this counter drops to zero, the kernel resets mm->exe_file to NULL. It also
resets mm->exe_file at the last mmput(), when mm->mm_users drops to zero.
A vma with the VM_EXECUTABLE flag appears after mapping a file with the
MAP_EXECUTABLE flag; such vmas can appear only at sys_execve() or after vma
splitting, because sys_mmap ignores this flag. Usually the binfmt module sets
mm->exe_file and mmaps some executable vmas with this file; they hold
mm->exe_file while the task is running.
Comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"),
where all this stuff was introduced:
> The kernel implements readlink of /proc/pid/exe by getting the file from
> the first executable VMA. Then the path to the file is reconstructed and
> reported as the result.
>
> Because of the VMA walk the code is slightly different on nommu systems.
> This patch avoids separate /proc/pid/exe code on nommu systems. Instead of
> walking the VMAs to find the first executable file-backed VMA we store a
> reference to the exec'd file in the mm_struct.
>
> That reference would prevent the filesystem holding the executable file
> from being unmounted even after unmapping the VMAs. So we track the number
> of VM_EXECUTABLE VMAs and drop the new reference when the last one is
> unmapped. This avoids pinning the mounted filesystem.
After this patch we track the number of VMAs with vma->vm_file == mm->exe_file,
instead of the number of VMAs with VM_EXECUTABLE. Behaviour is nearly the same:
the kernel will reset mm->exe_file as soon as the task unmaps its executable file.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@...nvz.org>
Cc: Matt Helsley <matthltc@...ibm.com>
Cc: Oleg Nesterov <oleg@...hat.com>
Cc: Cyrill Gorcunov <gorcunov@...nvz.org>
---
include/linux/mm.h | 1 -
include/linux/mman.h | 1 -
mm/mmap.c | 12 ++++++------
mm/nommu.c | 11 ++++++-----
4 files changed, 12 insertions(+), 13 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 553d134..8e82b79 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -88,7 +88,6 @@ extern unsigned int kobjsize(const void *objp);
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
-#define VM_EXECUTABLE 0x00001000
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8b74e9b..77cec2f 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
- _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}
#endif /* __KERNEL__ */
diff --git a/mm/mmap.c b/mm/mmap.c
index 3d254ca..bc67ed7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
vma->vm_ops->close(vma);
if (vma->vm_file) {
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
+ if (vma->vm_file == vma->vm_mm->exe_file)
removed_exe_file_vma(vma->vm_mm);
}
mpol_put(vma_policy(vma));
@@ -618,7 +618,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (remove_next) {
if (file) {
fput(file);
- if (next->vm_flags & VM_EXECUTABLE)
+ if (file == mm->exe_file)
removed_exe_file_vma(mm);
}
if (next->anon_vma)
@@ -1293,7 +1293,7 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if (vm_flags & VM_EXECUTABLE)
+ if (file == mm->exe_file)
added_exe_file_vma(mm);
/* Can addr have changed??
@@ -1971,7 +1971,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (new->vm_file) {
get_file(new->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
+ if (new->vm_file == mm->exe_file)
added_exe_file_vma(mm);
}
@@ -1992,7 +1992,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
if (new->vm_file) {
- if (vma->vm_flags & VM_EXECUTABLE)
+ if (new->vm_file == mm->exe_file)
removed_exe_file_vma(mm);
fput(new->vm_file);
}
@@ -2379,7 +2379,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
new_vma->vm_pgoff = pgoff;
if (new_vma->vm_file) {
get_file(new_vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
+ if (new_vma->vm_file == mm->exe_file)
added_exe_file_vma(mm);
}
if (new_vma->vm_ops && new_vma->vm_ops->open)
diff --git a/mm/nommu.c b/mm/nommu.c
index afa0a15..db8da78 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -791,7 +791,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_ops->close(vma);
if (vma->vm_file) {
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
+ if (vma->vm_file == mm->exe_file)
removed_exe_file_vma(mm);
}
put_nommu_region(vma->vm_region);
@@ -1287,7 +1287,7 @@ unsigned long do_mmap_pgoff(struct file *file,
get_file(file);
vma->vm_file = file;
get_file(file);
- if (vm_flags & VM_EXECUTABLE) {
+ if (file == current->mm->exe_file) {
added_exe_file_vma(current->mm);
vma->vm_mm = current->mm;
}
@@ -1441,10 +1441,11 @@ error:
if (region->vm_file)
fput(region->vm_file);
kmem_cache_free(vm_region_jar, region);
- if (vma->vm_file)
+ if (vma->vm_file) {
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
+ if (vma->vm_file == vma->vm_mm->exe_file)
+ removed_exe_file_vma(vma->vm_mm);
+ }
kmem_cache_free(vm_area_cachep, vma);
kleave(" = %d", ret);
return ret;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists