Message-ID: <20100813212501.GF22102@kroah.com>
Date:	Fri, 13 Aug 2010 14:25:01 -0700
From:	Greg KH <gregkh@...e.de>
To:	linux-kernel@...r.kernel.org,
	Andrew Morton <akpm@...ux-foundation.org>,
	torvalds@...ux-foundation.org, stable@...nel.org, lwn@....net
Subject: Re: Linux 2.6.32.19

diff --git a/Makefile b/Makefile
index e86dbf7..64e538c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 32
-EXTRAVERSION = .18
+EXTRAVERSION = .19
 NAME = Man-Eating Seals of Antiquity
 
 # *DOCUMENTATION*
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1c4119c..e516aa2 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -887,6 +887,18 @@ config ARM_ERRATA_460075
 	  ACTLR register. Note that setting specific bits in the ACTLR register
 	  may not be available in non-secure mode.
 
+config ARM_ERRATA_720789
+	bool "ARM errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID"
+	depends on CPU_V7 && SMP
+	help
+	  This option enables the workaround for the 720789 Cortex-A9 (prior to
+	  r2p0) erratum. A faulty ASID can be sent to the other CPUs for the
+	  broadcasted CP15 TLB maintenance operations TLBIASIDIS and TLBIMVAIS.
+	  As a consequence of this erratum, some TLB entries which should be
+	  invalidated are not, resulting in an incoherency in the system page
+	  tables. The workaround changes the TLB flushing routines to invalidate
+	  entries regardless of the ASID.
+
 endmenu
 
 source "arch/arm/common/Kconfig"
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index c2f1605..00c1cba 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -369,7 +369,11 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	if (tlb_flag(TLB_V6_I_ASID))
 		asm("mcr p15, 0, %0, c8, c5, 2" : : "r" (asid) : "cc");
 	if (tlb_flag(TLB_V7_UIS_ASID))
+#ifdef CONFIG_ARM_ERRATA_720789
+		asm("mcr p15, 0, %0, c8, c3, 0" : : "r" (zero) : "cc");
+#else
 		asm("mcr p15, 0, %0, c8, c3, 2" : : "r" (asid) : "cc");
+#endif
 
 	if (tlb_flag(TLB_BTB)) {
 		/* flush the branch target cache */
@@ -409,7 +413,11 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 	if (tlb_flag(TLB_V6_I_PAGE))
 		asm("mcr p15, 0, %0, c8, c5, 1" : : "r" (uaddr) : "cc");
 	if (tlb_flag(TLB_V7_UIS_PAGE))
+#ifdef CONFIG_ARM_ERRATA_720789
+		asm("mcr p15, 0, %0, c8, c3, 3" : : "r" (uaddr & PAGE_MASK) : "cc");
+#else
 		asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (uaddr) : "cc");
+#endif
 
 	if (tlb_flag(TLB_BTB)) {
 		/* flush the branch target cache */
diff --git a/arch/arm/plat-mxc/include/mach/gpio.h b/arch/arm/plat-mxc/include/mach/gpio.h
index 6bd932c..7a0dc5a 100644
--- a/arch/arm/plat-mxc/include/mach/gpio.h
+++ b/arch/arm/plat-mxc/include/mach/gpio.h
@@ -19,6 +19,7 @@
 #ifndef __ASM_ARCH_MXC_GPIO_H__
 #define __ASM_ARCH_MXC_GPIO_H__
 
+#include <linux/spinlock.h>
 #include <mach/hardware.h>
 #include <asm-generic/gpio.h>
 
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 674a837..01ae69b 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -677,12 +677,19 @@ sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
 			spin_unlock_irqrestore(&ioc->saved_lock, flags);
 
 			pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
-			if (unlikely(pide >= (ioc->res_size << 3)))
-				panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n",
-				      ioc->ioc_hpa);
+			if (unlikely(pide >= (ioc->res_size << 3))) {
+				printk(KERN_WARNING "%s: I/O MMU @ %p is "
+				       "out of mapping resources, %u %u %lx\n",
+				       __func__, ioc->ioc_hpa, ioc->res_size,
+				       pages_needed, dma_get_seg_boundary(dev));
+				return -1;
+			}
 #else
-			panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n",
-			      ioc->ioc_hpa);
+			printk(KERN_WARNING "%s: I/O MMU @ %p is "
+			       "out of mapping resources, %u %u %lx\n",
+			       __func__, ioc->ioc_hpa, ioc->res_size,
+			       pages_needed, dma_get_seg_boundary(dev));
+			return -1;
 #endif
 		}
 	}
@@ -965,6 +972,8 @@ static dma_addr_t sba_map_page(struct device *dev, struct page *page,
 #endif
 
 	pide = sba_alloc_range(ioc, dev, size);
+	if (pide < 0)
+		return 0;
 
 	iovp = (dma_addr_t) pide << iovp_shift;
 
@@ -1320,6 +1329,7 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
 	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
 	int n_mappings = 0;
 	unsigned int max_seg_size = dma_get_max_seg_size(dev);
+	int idx;
 
 	while (nents > 0) {
 		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
@@ -1418,16 +1428,22 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
 		vcontig_sg->dma_length = vcontig_len;
 		dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
 		ASSERT(dma_len <= DMA_CHUNK_SIZE);
-		dma_sg->dma_address = (dma_addr_t) (PIDE_FLAG
-			| (sba_alloc_range(ioc, dev, dma_len) << iovp_shift)
-			| dma_offset);
+		idx = sba_alloc_range(ioc, dev, dma_len);
+		if (idx < 0) {
+			dma_sg->dma_length = 0;
+			return -1;
+		}
+		dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
+						   | dma_offset);
 		n_mappings++;
 	}
 
 	return n_mappings;
 }
 
-
+static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
+			       int nents, enum dma_data_direction dir,
+			       struct dma_attrs *attrs);
 /**
  * sba_map_sg - map Scatter/Gather list
  * @dev: instance of PCI owned by the driver that's asking.
@@ -1493,6 +1509,10 @@ static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
 	** Access to the virtual address is what forces a two pass algorithm.
 	*/
 	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
+	if (coalesced < 0) {
+		sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
+		return 0;
+	}
 
 	/*
 	** Program the I/O Pdir
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 4990495..a35c661 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -473,7 +473,7 @@ void update_vsyscall_tz(void)
 {
 }
 
-void update_vsyscall(struct timespec *wall, struct clocksource *c)
+void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
 {
         unsigned long flags;
 
@@ -481,7 +481,7 @@ void update_vsyscall(struct timespec *wall, struct clocksource *c)
 
         /* copy fsyscall clock data */
         fsyscall_gtod_data.clk_mask = c->mask;
-        fsyscall_gtod_data.clk_mult = c->mult;
+        fsyscall_gtod_data.clk_mult = mult;
         fsyscall_gtod_data.clk_shift = c->shift;
         fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
         fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
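
The signature change above is one site of a tree-wide fix in this release: update_vsyscall() now receives the NTP-adjusted multiplier explicitly instead of reading the raw clock->mult, so the user-space fast path applies the same frequency correction as the core timekeeping code. A minimal sketch of how such gtod data is consumed (hypothetical helper, not the kernel's fsyscall implementation):

	/* Sketch: the standard clocksource conversion. 'mult' must be the
	 * NTP-adjusted multiplier passed to update_vsyscall(); using the
	 * raw clock->mult here would let gettimeofday() drift away from
	 * the kernel's view of time. */
	static inline unsigned long long cycles_to_ns(unsigned long long cycles,
						      unsigned long long cycle_last,
						      unsigned int mult,
						      unsigned int shift)
	{
		return ((cycles - cycle_last) * mult) >> shift;
	}
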
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 1a54a3b..7546e2c 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -158,9 +158,11 @@ drivers-$(CONFIG_OPROFILE)	+= arch/powerpc/oprofile/
 # Default to zImage, override when needed
 all: zImage
 
-BOOT_TARGETS = zImage zImage.initrd uImage zImage% dtbImage% treeImage.% cuImage.% simpleImage.%
+# With make 3.82 we cannot mix normal and wildcard targets
+BOOT_TARGETS1 := zImage zImage.initrd uImage
+BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.%
 
-PHONY += $(BOOT_TARGETS)
+PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
 
 boot := arch/$(ARCH)/boot
 
@@ -175,10 +177,16 @@ relocs_check: arch/powerpc/relocs_check.pl vmlinux
 zImage: relocs_check
 endif
 
-$(BOOT_TARGETS): vmlinux
+$(BOOT_TARGETS1): vmlinux
+	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+$(BOOT_TARGETS2): vmlinux
+	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+
+
+bootwrapper_install:
 	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
-bootwrapper_install %.dtb:
+%.dtb:
 	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
 define archhelp
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 2828f9d..fa6648a 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -137,6 +137,11 @@ struct device_node * find_device_pe(struct device_node *dn);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
 
+static inline const char *eeh_pci_name(struct pci_dev *pdev)
+{
+	return pdev ? pci_name(pdev) : "<null>";
+}
+
 #endif /* CONFIG_EEH */
 
 #else /* CONFIG_PCI */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 02aab7f..7143d4c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -864,7 +864,8 @@ static cycle_t timebase_read(struct clocksource *cs)
 	return (cycle_t)get_tb();
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	u64 t2x, stamp_xsec;
 
@@ -877,7 +878,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 
 	/* XXX this assumes clock->shift == 22 */
 	/* 4611686018 ~= 2^(20+64-22) / 1e9 */
-	t2x = (u64) clock->mult * 4611686018ULL;
+	t2x = (u64) mult * 4611686018ULL;
 	stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
 	do_div(stamp_xsec, 1000000000);
 	stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index ccd8dd0..3304f32 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -491,7 +491,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
 		ignored_check++;
 		pr_debug("EEH: Ignored check (%x) for %s %s\n",
-			 pdn->eeh_mode, pci_name (dev), dn->full_name);
+			 pdn->eeh_mode, eeh_pci_name(dev), dn->full_name);
 		return 0;
 	}
 
@@ -515,7 +515,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 			printk (KERN_ERR "EEH: %d reads ignored for recovering device at "
 				"location=%s driver=%s pci addr=%s\n",
 				pdn->eeh_check_count, location,
-				dev->driver->name, pci_name(dev));
+				dev->driver->name, eeh_pci_name(dev));
 			printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n",
 				dev->driver->name);
 			dump_stack();
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 0e8db67..52c4b40 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -353,7 +353,7 @@ struct pci_dn * handle_eeh_events (struct eeh_event *event)
 		location = location ? location : "unknown";
 		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
 		                "for location=%s pci addr=%s\n",
-		        location, pci_name(event->dev));
+		        location, eeh_pci_name(event->dev));
 		return NULL;
 	}
 
@@ -384,7 +384,7 @@ struct pci_dn * handle_eeh_events (struct eeh_event *event)
 		pci_str = pci_name (frozen_pdn->pcidev);
 		drv_str = pcid_name (frozen_pdn->pcidev);
 	} else {
-		pci_str = pci_name (event->dev);
+		pci_str = eeh_pci_name(event->dev);
 		drv_str = pcid_name (event->dev);
 	}
 	
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index ddb80f5..ec5df8f 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -80,7 +80,7 @@ static int eeh_event_handler(void * dummy)
 	eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
 
 	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-	       pci_name(event->dev));
+	       eeh_pci_name(event->dev));
 
 	pdn = handle_eeh_events(event);
 
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index f23961a..258ba88 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -183,6 +183,7 @@ struct s390_idle_data {
 	unsigned long long idle_count;
 	unsigned long long idle_enter;
 	unsigned long long idle_time;
+	int nohz_delay;
 };
 
 DECLARE_PER_CPU(struct s390_idle_data, s390_idle);
@@ -198,4 +199,11 @@ static inline void s390_idle_check(void)
 		vtime_start_cpu();
 }
 
+static inline int s390_nohz_delay(int cpu)
+{
+	return per_cpu(s390_idle, cpu).nohz_delay != 0;
+}
+
+#define arch_needs_cpu(cpu) s390_nohz_delay(cpu)
+
 #endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/kernel/s390_ext.c b/arch/s390/kernel/s390_ext.c
index 0de305b..59618bc 100644
--- a/arch/s390/kernel/s390_ext.c
+++ b/arch/s390/kernel/s390_ext.c
@@ -126,6 +126,8 @@ void __irq_entry do_extint(struct pt_regs *regs, unsigned short code)
 		/* Serve timer interrupts first. */
 		clock_comparator_work();
 	kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++;
+	if (code != 0x1004)
+		__get_cpu_var(s390_idle).nohz_delay = 1;
         index = ext_hash(code);
 	for (p = ext_int_hash[index]; p; p = p->next) {
 		if (likely(p->code == code))
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 34162a0..68e1ecf 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -214,7 +214,8 @@ struct clocksource * __init clocksource_default_clock(void)
 	return &clocksource_tod;
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	if (clock != &clocksource_tod)
 		return;
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index c41bb0d..b59a812 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -167,6 +167,8 @@ void vtime_stop_cpu(void)
 	/* Wait for external, I/O or machine check interrupt. */
 	psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT;
 
+	idle->nohz_delay = 0;
+
 	/* Check if the CPU timer needs to be reprogrammed. */
 	if (vq->do_spt) {
 		__u64 vmax = VTIMER_MAX_SLICE;
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ee1931b..5af5051 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -34,12 +34,12 @@ static inline void __set_64bit(unsigned long long *ptr,
 			       unsigned int low, unsigned int high)
 {
 	asm volatile("\n1:\t"
-		     "movl (%0), %%eax\n\t"
-		     "movl 4(%0), %%edx\n\t"
-		     LOCK_PREFIX "cmpxchg8b (%0)\n\t"
+		     "movl (%1), %%eax\n\t"
+		     "movl 4(%1), %%edx\n\t"
+		     LOCK_PREFIX "cmpxchg8b %0\n\t"
 		     "jnz 1b"
-		     : /* no outputs */
-		     : "D"(ptr),
+		     : "=m"(*ptr)
+		     : "D" (ptr),
 		       "b"(low),
 		       "c"(high)
 		     : "ax", "dx", "memory");
@@ -82,20 +82,20 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
 	switch (size) {
 	case 1:
 		asm volatile("xchgb %b0,%1"
-			     : "=q" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=q" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	case 2:
 		asm volatile("xchgw %w0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=r" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	case 4:
 		asm volatile("xchgl %0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=r" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	}
@@ -139,21 +139,21 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgl %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
@@ -172,21 +172,21 @@ static inline unsigned long __sync_cmpxchg(volatile void *ptr,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile("lock; cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile("lock; cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile("lock; cmpxchgl %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgl %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
@@ -200,21 +200,21 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile("cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile("cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile("cmpxchgl %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgl %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
@@ -226,11 +226,10 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr,
 					     unsigned long long new)
 {
 	unsigned long long prev;
-	asm volatile(LOCK_PREFIX "cmpxchg8b %3"
-		     : "=A"(prev)
+	asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+		     : "=A"(prev), "+m" (*__xg(ptr))
 		     : "b"((unsigned long)new),
 		       "c"((unsigned long)(new >> 32)),
-		       "m"(*__xg(ptr)),
 		       "0"(old)
 		     : "memory");
 	return prev;
@@ -241,11 +240,10 @@ static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
 						   unsigned long long new)
 {
 	unsigned long long prev;
-	asm volatile("cmpxchg8b %3"
-		     : "=A"(prev)
+	asm volatile("cmpxchg8b %1"
+		     : "=A"(prev), "+m"(*__xg(ptr))
 		     : "b"((unsigned long)new),
 		       "c"((unsigned long)(new >> 32)),
-		       "m"(*__xg(ptr)),
 		       "0"(old)
 		     : "memory");
 	return prev;
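
The constraint changes in this file are the same fix repeated for each operand size: the memory operand moves from the input list ("m") to the output list ("=m" or "+m"), telling GCC that the asm writes *ptr, so the compiler may not cache that location in a register across the statement. A reduced sketch of the corrected pattern (hypothetical wrapper, GCC extended asm):

	/* Sketch: "+m"(*ptr) marks the pointed-to word as read AND written,
	 * which is what xchg actually does to memory (xchg with a memory
	 * operand is implicitly locked on x86). */
	static inline unsigned long my_xchg(volatile unsigned long *ptr,
					    unsigned long val)
	{
		asm volatile("xchg %0,%1"
			     : "=r" (val), "+m" (*ptr)
			     : "0" (val)
			     : "memory");
		return val;
	}
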
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e..1871cb0 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -26,26 +26,26 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
 	switch (size) {
 	case 1:
 		asm volatile("xchgb %b0,%1"
-			     : "=q" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=q" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	case 2:
 		asm volatile("xchgw %w0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=r" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	case 4:
 		asm volatile("xchgl %k0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=r" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	case 8:
 		asm volatile("xchgq %0,%1"
-			     : "=r" (x)
-			     : "m" (*__xg(ptr)), "0" (x)
+			     : "=r" (x), "+m" (*__xg(ptr))
+			     : "0" (x)
 			     : "memory");
 		break;
 	}
@@ -66,27 +66,27 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgl %k2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 8:
-		asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile(LOCK_PREFIX "cmpxchgq %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
@@ -105,21 +105,27 @@ static inline unsigned long __sync_cmpxchg(volatile void *ptr,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile("lock; cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile("lock; cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile("lock; cmpxchgl %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("lock; cmpxchgl %k2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
+			     : "memory");
+		return prev;
+	case 8:
+		asm volatile("lock; cmpxchgq %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
@@ -133,27 +139,27 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 	unsigned long prev;
 	switch (size) {
 	case 1:
-		asm volatile("cmpxchgb %b1,%2"
-			     : "=a"(prev)
-			     : "q"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgb %b2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "q"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 2:
-		asm volatile("cmpxchgw %w1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgw %w2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 4:
-		asm volatile("cmpxchgl %k1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgl %k2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	case 8:
-		asm volatile("cmpxchgq %1,%2"
-			     : "=a"(prev)
-			     : "r"(new), "m"(*__xg(ptr)), "0"(old)
+		asm volatile("cmpxchgq %2,%1"
+			     : "=a"(prev), "+m"(*__xg(ptr))
+			     : "r"(new), "0"(old)
 			     : "memory");
 		return prev;
 	}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc4f486..9f9fded 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1484,7 +1484,7 @@ static struct {
 
 static void __init setup_IO_APIC_irqs(void)
 {
-	int apic_id = 0, pin, idx, irq;
+	int apic_id, pin, idx, irq;
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
@@ -1492,14 +1492,7 @@ static void __init setup_IO_APIC_irqs(void)
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-#ifdef CONFIG_ACPI
-	if (!acpi_disabled && acpi_ioapic) {
-		apic_id = mp_find_ioapic(0);
-		if (apic_id < 0)
-			apic_id = 0;
-	}
-#endif
-
+	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
 	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
 		idx = find_irq_entry(apic_id, pin, mp_INT);
 		if (idx == -1) {
@@ -1521,6 +1514,9 @@ static void __init setup_IO_APIC_irqs(void)
 
 		irq = pin_2_irq(idx, apic_id, pin);
 
+		if ((apic_id > 0) && (irq > 16))
+			continue;
+
 		/*
 		 * Skip the timer IRQ if there's a quirk handler
 		 * installed and if it returns 1:
@@ -4083,27 +4079,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 #ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
-	int pin, ioapic = 0, irq, irq_entry;
+	int pin, ioapic, irq, irq_entry;
 	struct irq_desc *desc;
 	const struct cpumask *mask;
 
 	if (skip_ioapic_setup == 1)
 		return;
 
-#ifdef CONFIG_ACPI
-	if (!acpi_disabled && acpi_ioapic) {
-		ioapic = mp_find_ioapic(0);
-		if (ioapic < 0)
-			ioapic = 0;
-	}
-#endif
-
+	for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
 	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
 		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
 		if (irq_entry == -1)
 			continue;
 		irq = pin_2_irq(irq_entry, ioapic, pin);
 
+		if ((ioapic > 0) && (irq > 16))
+			continue;
+
 		desc = irq_to_desc(irq);
 
 		/*
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97..9580152 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/dmi.h>
+#include <linux/jiffies.h>
 #include <asm/div64.h>
 #include <asm/vmware.h>
 #include <asm/x86_init.h>
@@ -50,7 +51,7 @@ static inline int __vmware_platform(void)
 
 static unsigned long vmware_get_tsc_khz(void)
 {
-	uint64_t tsc_hz;
+	uint64_t tsc_hz, lpj;
 	uint32_t eax, ebx, ecx, edx;
 
 	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -61,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
 	printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
 			 (unsigned long) tsc_hz / 1000,
 			 (unsigned long) tsc_hz % 1000);
+
+	if (!preset_lpj) {
+		lpj = ((u64)tsc_hz * 1000);
+		do_div(lpj, HZ);
+		preset_lpj = lpj;
+	}
+
 	return tsc_hz;
 }
 
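
For scale: vmware_get_tsc_khz() reports kilohertz despite the variable name, hence the factor of 1000 when seeding preset_lpj (TSC cycles per jiffy). With a reported rate of 2,000,000 kHz (a 2 GHz TSC) and HZ=250, preset_lpj = 2,000,000 * 1000 / 250 = 8,000,000. Seeding it lets the guest skip delay-loop calibration, which is unreliable when the hypervisor can steal time mid-measurement.
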
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974..62f39d7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
+		     u32 mult)
 {
 	unsigned long flags;
 
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 	vsyscall_gtod_data.clock.vread = clock->vread;
 	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
 	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = clock->mult;
+	vsyscall_gtod_data.clock.mult = mult;
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee90..1739358 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -801,8 +801,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	up_read(&mm->mmap_sem);
 
 	/* Kernel mode? Handle exceptions or die: */
-	if (!(error_code & PF_USER))
+	if (!(error_code & PF_USER)) {
 		no_context(regs, error_code, address);
+		return;
+	}
 
 	/* User-space => ok to do another page fault: */
 	if (is_prefetch(regs, error_code, address))
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a..7d095ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -49,6 +49,7 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <linux/bootmem.h>
 
 static unsigned long dma_reserve __initdata;
 
@@ -615,6 +616,21 @@ void __init paging_init(void)
  */
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+	unsigned long end_pfn = PFN_UP(start + size);
+
+	if (end_pfn > max_pfn) {
+		max_pfn = end_pfn;
+		max_low_pfn = end_pfn;
+		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+	}
+}
+
+/*
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
@@ -633,6 +649,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	ret = __add_pages(nid, zone, start_pfn, nr_pages);
 	WARN_ON_ONCE(ret);
 
+	/* update max_pfn, max_low_pfn and high_memory */
+	update_end_of_memory_vars(start, size);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
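
PFN_UP() above rounds the end of the hotplugged range up to a whole page frame: PFN_UP(x) is ((x) + PAGE_SIZE - 1) >> PAGE_SHIFT. As a worked example with 4 KiB pages, adding 1 GiB at physical address 0x100000000 gives end_pfn = PFN_UP(0x140000000) = 0x140000, and max_pfn, max_low_pfn and high_memory are raised only when that new end lies beyond the current end of memory.
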
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 6d5b746..2a4106d 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1477,9 +1477,54 @@ static int alg_test_cprng(const struct alg_test_desc *desc, const char *driver,
 	return err;
 }
 
+static int alg_test_null(const struct alg_test_desc *desc,
+			     const char *driver, u32 type, u32 mask)
+{
+	return 0;
+}
+
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
+		.alg = "__driver-cbc-aes-aesni",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
+	}, {
+		.alg = "__driver-ecb-aes-aesni",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
+	}, {
+		.alg = "__ghash-pclmulqdqni",
+		.test = alg_test_null,
+		.suite = {
+			.hash = {
+				.vecs = NULL,
+				.count = 0
+			}
+		}
+	}, {
 		.alg = "ansi_cprng",
 		.test = alg_test_cprng,
 		.fips_allowed = 1,
@@ -1623,6 +1668,30 @@ static const struct alg_test_desc alg_test_descs[] = {
 			}
 		}
 	}, {
+		.alg = "cryptd(__driver-ecb-aes-aesni)",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
+	}, {
+		.alg = "cryptd(__ghash-pclmulqdqni)",
+		.test = alg_test_null,
+		.suite = {
+			.hash = {
+				.vecs = NULL,
+				.count = 0
+			}
+		}
+	}, {
 		.alg = "ctr(aes)",
 		.test = alg_test_skcipher,
 		.fips_allowed = 1,
@@ -1669,6 +1738,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 			}
 		}
 	}, {
+		.alg = "ecb(__aes-aesni)",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
+	}, {
 		.alg = "ecb(aes)",
 		.test = alg_test_skcipher,
 		.fips_allowed = 1,
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 8ba0ed0..40d395e 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -356,7 +356,11 @@ static int acpi_processor_get_performance_info(struct acpi_processor *pr)
 	if (result)
 		goto update_bios;
 
-	return 0;
+	/* We need to call _PPC once when cpufreq starts */
+	if (ignore_ppc != 1)
+		result = acpi_processor_get_platform_limit(pr);
+
+	return result;
 
 	/*
 	 * Having _PPC but missing frequencies (_PSS, _PCT) is a very good hint that
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 4f94e22..c33591d 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -157,6 +157,7 @@ struct piix_map_db {
 struct piix_host_priv {
 	const int *map;
 	u32 saved_iocfg;
+	spinlock_t sidpr_lock;	/* FIXME: remove once locking in EH is fixed */
 	void __iomem *sidpr;
 };
 
@@ -948,12 +949,15 @@ static int piix_sidpr_scr_read(struct ata_link *link,
 			       unsigned int reg, u32 *val)
 {
 	struct piix_host_priv *hpriv = link->ap->host->private_data;
+	unsigned long flags;
 
 	if (reg >= ARRAY_SIZE(piix_sidx_map))
 		return -EINVAL;
 
+	spin_lock_irqsave(&hpriv->sidpr_lock, flags);
 	piix_sidpr_sel(link, reg);
 	*val = ioread32(hpriv->sidpr + PIIX_SIDPR_DATA);
+	spin_unlock_irqrestore(&hpriv->sidpr_lock, flags);
 	return 0;
 }
 
@@ -961,12 +965,15 @@ static int piix_sidpr_scr_write(struct ata_link *link,
 				unsigned int reg, u32 val)
 {
 	struct piix_host_priv *hpriv = link->ap->host->private_data;
+	unsigned long flags;
 
 	if (reg >= ARRAY_SIZE(piix_sidx_map))
 		return -EINVAL;
 
+	spin_lock_irqsave(&hpriv->sidpr_lock, flags);
 	piix_sidpr_sel(link, reg);
 	iowrite32(val, hpriv->sidpr + PIIX_SIDPR_DATA);
+	spin_unlock_irqrestore(&hpriv->sidpr_lock, flags);
 	return 0;
 }
 
@@ -1555,6 +1562,7 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
 	hpriv = devm_kzalloc(dev, sizeof(*hpriv), GFP_KERNEL);
 	if (!hpriv)
 		return -ENOMEM;
+	spin_lock_init(&hpriv->sidpr_lock);
 
 	/* Save IOCFG, this will be used for cable detection, quirk
 	 * detection and restoration on detach.  This is necessary
diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index c5f5186..a73f102 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -774,7 +774,8 @@ static struct atm_vcc *find_vcc(struct atm_dev *dev, short vpi, int vci)
 	sk_for_each(s, node, head) {
 		vcc = atm_sk(s);
 		if (vcc->dev == dev && vcc->vci == vci &&
-		    vcc->vpi == vpi && vcc->qos.rxtp.traffic_class != ATM_NONE)
+		    vcc->vpi == vpi && vcc->qos.rxtp.traffic_class != ATM_NONE &&
+		    test_bit(ATM_VF_READY, &vcc->flags))
 			goto out;
 	}
 	vcc = NULL;
@@ -900,6 +901,10 @@ static void pclose(struct atm_vcc *vcc)
 	clear_bit(ATM_VF_ADDR, &vcc->flags);
 	clear_bit(ATM_VF_READY, &vcc->flags);
 
+	/* Hold up vcc_destroy_socket() (our caller) until solos_bh() in the
+	   tasklet has finished processing any incoming packets (and, more to
+	   the point, using the vcc pointer). */
+	tasklet_unlock_wait(&card->tlet);
 	return;
 }
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bd112c8..1c21a3f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -238,6 +238,8 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 		if (ret)
 			goto fail;
 
+		file_update_time(file);
+
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
 		copied = size;
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 1be7631..0e9c564 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -59,6 +59,9 @@ static struct usb_device_id btusb_table[] = {
 	/* Generic Bluetooth USB device */
 	{ USB_DEVICE_INFO(0xe0, 0x01, 0x01) },
 
+	/* Apple iMac11,1 */
+	{ USB_DEVICE(0x05ac, 0x8215) },
+
 	/* AVM BlueFRITZ! USB v2.0 */
 	{ USB_DEVICE(0x057c, 0x3800) },
 
diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c
index 88cee40..71f0d72 100644
--- a/drivers/char/nvram.c
+++ b/drivers/char/nvram.c
@@ -265,10 +265,16 @@ static ssize_t nvram_write(struct file *file, const char __user *buf,
 	unsigned char contents[NVRAM_BYTES];
 	unsigned i = *ppos;
 	unsigned char *tmp;
-	int len;
 
-	len = (NVRAM_BYTES - i) < count ? (NVRAM_BYTES - i) : count;
-	if (copy_from_user(contents, buf, len))
+	if (i >= NVRAM_BYTES)
+		return 0;	/* Past EOF */
+
+	if (count > NVRAM_BYTES - i)
+		count = NVRAM_BYTES - i;
+	if (count > NVRAM_BYTES)
+		return -EFAULT;	/* Can't happen, but prove it to gcc */
+
+	if (copy_from_user(contents, buf, count))
 		return -EFAULT;
 
 	spin_lock_irq(&rtc_lock);
@@ -276,7 +282,7 @@ static ssize_t nvram_write(struct file *file, const char __user *buf,
 	if (!__nvram_check_checksum())
 		goto checksum_err;
 
-	for (tmp = contents; count-- > 0 && i < NVRAM_BYTES; ++i, ++tmp)
+	for (tmp = contents; count--; ++i, ++tmp)
 		__nvram_write_byte(*tmp, i);
 
 	__nvram_set_checksum();
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 259ec21..1097dec 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -565,8 +565,8 @@ struct drm_display_mode *drm_mode_std(struct drm_device *dev,
 		mode = drm_cvt_mode(dev, hsize, vsize, vrefresh_rate, 0, 0,
 				    false);
 		mode->hdisplay = 1366;
-		mode->vsync_start = mode->vsync_start - 1;
-		mode->vsync_end = mode->vsync_end - 1;
+		mode->hsync_start = mode->hsync_start - 1;
+		mode->hsync_end = mode->hsync_end - 1;
 		return mode;
 	}
 	mode = NULL;
diff --git a/drivers/gpu/drm/i915/dvo_tfp410.c b/drivers/gpu/drm/i915/dvo_tfp410.c
index 9ecc907..16dce84 100644
--- a/drivers/gpu/drm/i915/dvo_tfp410.c
+++ b/drivers/gpu/drm/i915/dvo_tfp410.c
@@ -214,7 +214,7 @@ static enum drm_connector_status tfp410_detect(struct intel_dvo_device *dvo)
 	uint8_t ctl2;
 
 	if (tfp410_readb(dvo, TFP410_CTL_2, &ctl2)) {
-		if (ctl2 & TFP410_CTL_2_HTPLG)
+		if (ctl2 & TFP410_CTL_2_RSEN)
 			ret = connector_status_connected;
 		else
 			ret = connector_status_disconnected;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 64207df..2de76cc 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -506,15 +506,22 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 	return (flags & REQ_FAILED) ? -EIO : 0;
 }
 
-static void ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
+/*
+ * returns true if rq has been completed
+ */
+static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
 {
 	unsigned int nr_bytes = cmd->nbytes - cmd->nleft;
 
 	if (cmd->tf_flags & IDE_TFLAG_WRITE)
 		nr_bytes -= cmd->last_xfer_len;
 
-	if (nr_bytes > 0)
+	if (nr_bytes > 0) {
 		ide_complete_rq(drive, 0, nr_bytes);
+		return true;
+	}
+
+	return false;
 }
 
 static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
@@ -679,7 +686,8 @@ out_end:
 		}
 
 		if (uptodate == 0 && rq->bio)
-			ide_cd_error_cmd(drive, cmd);
+			if (ide_cd_error_cmd(drive, cmd))
+				return ide_stopped;
 
 		/* make sure it's fully ended */
 		if (blk_fs_request(rq) == 0) {
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a5e5f2f..4783153 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1317,7 +1317,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 {
 	if (!bitmap) return;
 	if (behind) {
-		atomic_dec(&bitmap->behind_writes);
+		if (atomic_dec_and_test(&bitmap->behind_writes))
+			wake_up(&bitmap->behind_wait);
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
@@ -1629,6 +1630,7 @@ int bitmap_create(mddev_t *mddev)
 	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
 	init_waitqueue_head(&bitmap->overflow_wait);
+	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 7e38d13..86950bc 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -254,6 +254,9 @@ struct bitmap {
 	wait_queue_head_t write_wait;
 	wait_queue_head_t overflow_wait;
 
+#ifndef __GENKSYMS__
+	wait_queue_head_t behind_wait;
+#endif
 };
 
 /* the bitmap API */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5ccad28..791e195 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -845,6 +845,15 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		}
 		mirror = conf->mirrors + rdisk;
 
+		if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+		    bitmap) {
+			/* Reading from a write-mostly device must
+			 * take care not to over-take any writes
+			 * that are 'behind'
+			 */
+			wait_event(bitmap->behind_wait,
+				   atomic_read(&bitmap->behind_writes) == 0);
+		}
 		r1_bio->read_disk = rdisk;
 
 		read_bio = bio_clone(bio, GFP_NOIO);
@@ -922,9 +931,13 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		set_bit(R1BIO_Degraded, &r1_bio->state);
 	}
 
-	/* do behind I/O ? */
+	/* do behind I/O ?
+	 * Not if there are too many, or cannot allocate memory,
+	 * or a reader on WriteMostly is waiting for behind writes
+	 * to flush */
 	if (bitmap &&
 	    atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
+	    !waitqueue_active(&bitmap->behind_wait) &&
 	    (behind_pages = alloc_behind_pages(bio)) != NULL)
 		set_bit(R1BIO_BehindIO, &r1_bio->state);
 
@@ -2105,15 +2118,13 @@ static int stop(mddev_t *mddev)
 {
 	conf_t *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
-	int behind_wait = 0;
 
 	/* wait for behind writes to complete */
-	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
-		behind_wait++;
-		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ); /* wait a second */
+	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev));
 		/* need to kick something here to make sure I/O goes? */
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
 	}
 
 	raise_barrier(conf);
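
The raid1 hunks above replace a poll-and-sleep loop with the standard waitqueue pairing: the completion path drops the counter with atomic_dec_and_test() and wakes the queue when it reaches zero, while readers and stop() sleep in wait_event() until the count drains. A stripped-down sketch of the idiom (hypothetical names, header locations as of 2.6.32):

	#include <linux/wait.h>
	#include <asm/atomic.h>

	static atomic_t pending = ATOMIC_INIT(0);
	static DECLARE_WAIT_QUEUE_HEAD(drain_wait);

	static void op_done(void)		/* completion side */
	{
		if (atomic_dec_and_test(&pending))
			wake_up(&drain_wait);	/* last one out wakes waiters */
	}

	static void drain_all(void)		/* waiting side */
	{
		/* the condition is re-evaluated after every wakeup, so
		 * spurious wakeups and lost races are harmless */
		wait_event(drain_wait, atomic_read(&pending) == 0);
	}
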
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6c2a35b..1b4e232 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -824,11 +824,29 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		 */
 		bp = bio_split(bio,
 			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
+
+		/* Each of these 'make_request' calls will call 'wait_barrier'.
+		 * If the first succeeds but the second blocks due to the resync
+		 * thread raising the barrier, we will deadlock because the
+		 * IO to the underlying device will be queued in generic_make_request
+		 * and will never complete, so will never reduce nr_pending.
+		 * So increment nr_waiting here so no new raise_barriers will
+		 * succeed, and so the second wait_barrier cannot block.
+		 */
+		spin_lock_irq(&conf->resync_lock);
+		conf->nr_waiting++;
+		spin_unlock_irq(&conf->resync_lock);
+
 		if (make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
 		if (make_request(q, &bp->bio2))
 			generic_make_request(&bp->bio2);
 
+		spin_lock_irq(&conf->resync_lock);
+		conf->nr_waiting--;
+		wake_up(&conf->wait_barrier);
+		spin_unlock_irq(&conf->resync_lock);
+
 		bio_pair_release(bp);
 		return 0;
 	bad_map:
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index f9cdcbc..b496fa6 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -85,8 +85,7 @@ struct smsc911x_data {
 	 */
 	spinlock_t mac_lock;
 
-	/* spinlock to ensure 16-bit accesses are serialised.
-	 * unused with a 32-bit bus */
+	/* spinlock to ensure register accesses are serialised */
 	spinlock_t dev_lock;
 
 	struct phy_device *phy_dev;
@@ -119,37 +118,33 @@ struct smsc911x_data {
 	unsigned int hashlo;
 };
 
-/* The 16-bit access functions are significantly slower, due to the locking
- * necessary.  If your bus hardware can be configured to do this for you
- * (in response to a single 32-bit operation from software), you should use
- * the 32-bit access functions instead. */
-
-static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
+static inline u32 __smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
 {
 	if (pdata->config.flags & SMSC911X_USE_32BIT)
 		return readl(pdata->ioaddr + reg);
 
-	if (pdata->config.flags & SMSC911X_USE_16BIT) {
-		u32 data;
-		unsigned long flags;
-
-		/* these two 16-bit reads must be performed consecutively, so
-		 * must not be interrupted by our own ISR (which would start
-		 * another read operation) */
-		spin_lock_irqsave(&pdata->dev_lock, flags);
-		data = ((readw(pdata->ioaddr + reg) & 0xFFFF) |
+	if (pdata->config.flags & SMSC911X_USE_16BIT)
+		return ((readw(pdata->ioaddr + reg) & 0xFFFF) |
 			((readw(pdata->ioaddr + reg + 2) & 0xFFFF) << 16));
-		spin_unlock_irqrestore(&pdata->dev_lock, flags);
-
-		return data;
-	}
 
 	BUG();
 	return 0;
 }
 
-static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
-				      u32 val)
+static inline u32 smsc911x_reg_read(struct smsc911x_data *pdata, u32 reg)
+{
+	u32 data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+	data = __smsc911x_reg_read(pdata, reg);
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+
+	return data;
+}
+
+static inline void __smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
+					u32 val)
 {
 	if (pdata->config.flags & SMSC911X_USE_32BIT) {
 		writel(val, pdata->ioaddr + reg);
@@ -157,44 +152,54 @@ static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
 	}
 
 	if (pdata->config.flags & SMSC911X_USE_16BIT) {
-		unsigned long flags;
-
-		/* these two 16-bit writes must be performed consecutively, so
-		 * must not be interrupted by our own ISR (which would start
-		 * another read operation) */
-		spin_lock_irqsave(&pdata->dev_lock, flags);
 		writew(val & 0xFFFF, pdata->ioaddr + reg);
 		writew((val >> 16) & 0xFFFF, pdata->ioaddr + reg + 2);
-		spin_unlock_irqrestore(&pdata->dev_lock, flags);
 		return;
 	}
 
 	BUG();
 }
 
+static inline void smsc911x_reg_write(struct smsc911x_data *pdata, u32 reg,
+				      u32 val)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+	__smsc911x_reg_write(pdata, reg, val);
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
+}
+
 /* Writes a packet to the TX_DATA_FIFO */
 static inline void
 smsc911x_tx_writefifo(struct smsc911x_data *pdata, unsigned int *buf,
 		      unsigned int wordcount)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+
 	if (pdata->config.flags & SMSC911X_SWAP_FIFO) {
 		while (wordcount--)
-			smsc911x_reg_write(pdata, TX_DATA_FIFO, swab32(*buf++));
-		return;
+			__smsc911x_reg_write(pdata, TX_DATA_FIFO,
+					     swab32(*buf++));
+		goto out;
 	}
 
 	if (pdata->config.flags & SMSC911X_USE_32BIT) {
 		writesl(pdata->ioaddr + TX_DATA_FIFO, buf, wordcount);
-		return;
+		goto out;
 	}
 
 	if (pdata->config.flags & SMSC911X_USE_16BIT) {
 		while (wordcount--)
-			smsc911x_reg_write(pdata, TX_DATA_FIFO, *buf++);
-		return;
+			__smsc911x_reg_write(pdata, TX_DATA_FIFO, *buf++);
+		goto out;
 	}
 
 	BUG();
+out:
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
 }
 
 /* Reads a packet out of the RX_DATA_FIFO */
@@ -202,24 +207,31 @@ static inline void
 smsc911x_rx_readfifo(struct smsc911x_data *pdata, unsigned int *buf,
 		     unsigned int wordcount)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&pdata->dev_lock, flags);
+
 	if (pdata->config.flags & SMSC911X_SWAP_FIFO) {
 		while (wordcount--)
-			*buf++ = swab32(smsc911x_reg_read(pdata, RX_DATA_FIFO));
-		return;
+			*buf++ = swab32(__smsc911x_reg_read(pdata,
+							    RX_DATA_FIFO));
+		goto out;
 	}
 
 	if (pdata->config.flags & SMSC911X_USE_32BIT) {
 		readsl(pdata->ioaddr + RX_DATA_FIFO, buf, wordcount);
-		return;
+		goto out;
 	}
 
 	if (pdata->config.flags & SMSC911X_USE_16BIT) {
 		while (wordcount--)
-			*buf++ = smsc911x_reg_read(pdata, RX_DATA_FIFO);
-		return;
+			*buf++ = __smsc911x_reg_read(pdata, RX_DATA_FIFO);
+		goto out;
 	}
 
 	BUG();
+out:
+	spin_unlock_irqrestore(&pdata->dev_lock, flags);
 }
 
 /* waits for MAC not busy, with timeout.  Only called by smsc911x_mac_read
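
The smsc911x rework above follows a common kernel idiom: double-underscore accessors assume the caller already holds dev_lock, while the plain-named wrappers take the lock themselves, so the FIFO paths can hold the lock once across a whole burst instead of once per word. A generic sketch of the pattern (hypothetical names):

	#include <linux/types.h>
	#include <linux/spinlock.h>
	#include <linux/io.h>

	static DEFINE_SPINLOCK(dev_lock);

	static u32 __reg_read(void __iomem *base, u32 reg)	/* caller holds lock */
	{
		return readl(base + reg);
	}

	static u32 reg_read(void __iomem *base, u32 reg)	/* locking wrapper */
	{
		unsigned long flags;
		u32 val;

		spin_lock_irqsave(&dev_lock, flags);
		val = __reg_read(base, reg);
		spin_unlock_irqrestore(&dev_lock, flags);
		return val;
	}

	/* burst helper: one lock round trip for the whole FIFO drain */
	static void fifo_read(void __iomem *base, u32 reg, u32 *buf, int words)
	{
		unsigned long flags;

		spin_lock_irqsave(&dev_lock, flags);
		while (words--)
			*buf++ = __reg_read(base, reg);
		spin_unlock_irqrestore(&dev_lock, flags);
	}
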
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index baa051d..1a11d95 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1619,6 +1619,7 @@ static void backend_changed(struct xenbus_device *dev,
 		if (xennet_connect(netdev) != 0)
 			break;
 		xenbus_switch_state(dev, XenbusStateConnected);
+		netif_notify_peers(netdev);
 		break;
 
 	case XenbusStateClosing:
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c0de0b9..bf6bd67 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2084,6 +2084,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS480, quirk_disabl
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3336, quirk_disable_all_msi);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3351, quirk_disable_all_msi);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3364, quirk_disable_all_msi);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8380_0, quirk_disable_all_msi);
 
 /* Disable MSI on chipsets that are known to not support it */
 static void __devinit quirk_disable_msi(struct pci_dev *dev)
@@ -2356,6 +2357,9 @@ static void __devinit __nv_msi_ht_cap_quirk(struct pci_dev *dev, int all)
 	int pos;
 	int found;
 
+	if (!pci_msi_enabled())
+		return;
+
 	/* check if there is HT MSI cap or enabled on this device */
 	found = ht_check_msi_mapping(dev);
 
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 138124f..126f240 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -618,6 +618,7 @@ void __irq_entry do_IRQ(struct pt_regs *regs)
 	old_regs = set_irq_regs(regs);
 	s390_idle_check();
 	irq_enter();
+	__get_cpu_var(s390_idle).nohz_delay = 1;
 	if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator)
 		/* Serve timer interrupts first. */
 		clock_comparator_work();
diff --git a/drivers/scsi/aic7xxx/aic79xx_core.c b/drivers/scsi/aic7xxx/aic79xx_core.c
index 63b521d..3e89f8e 100644
--- a/drivers/scsi/aic7xxx/aic79xx_core.c
+++ b/drivers/scsi/aic7xxx/aic79xx_core.c
@@ -3171,13 +3171,16 @@ ahd_handle_nonpkt_busfree(struct ahd_softc *ahd)
 				tinfo->curr.transport_version = 2;
 				tinfo->goal.transport_version = 2;
 				tinfo->goal.ppr_options = 0;
-				/*
-				 * Remove any SCBs in the waiting for selection
-				 * queue that may also be for this target so
-				 * that command ordering is preserved.
-				 */
-				ahd_freeze_devq(ahd, scb);
-				ahd_qinfifo_requeue_tail(ahd, scb);
+				if (scb != NULL) {
+					/*
+					 * Remove any SCBs in the waiting
+					 * for selection queue that may
+					 * also be for this target so that
+					 * command ordering is preserved.
+					 */
+					ahd_freeze_devq(ahd, scb);
+					ahd_qinfifo_requeue_tail(ahd, scb);
+				}
 				printerror = 0;
 			}
 		} else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_WDTR, FALSE)
@@ -3194,13 +3197,16 @@ ahd_handle_nonpkt_busfree(struct ahd_softc *ahd)
 				      MSG_EXT_WDTR_BUS_8_BIT,
 				      AHD_TRANS_CUR|AHD_TRANS_GOAL,
 				      /*paused*/TRUE);
-			/*
-			 * Remove any SCBs in the waiting for selection
-			 * queue that may also be for this target so that
-			 * command ordering is preserved.
-			 */
-			ahd_freeze_devq(ahd, scb);
-			ahd_qinfifo_requeue_tail(ahd, scb);
+			if (scb != NULL) {
+				/*
+				 * Remove any SCBs in the waiting for
+				 * selection queue that may also be for
+				 * this target so that command ordering
+				 * is preserved.
+				 */
+				ahd_freeze_devq(ahd, scb);
+				ahd_qinfifo_requeue_tail(ahd, scb);
+			}
 			printerror = 0;
 		} else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_SDTR, FALSE)
 			&& ppr_busfree == 0) {
@@ -3217,13 +3223,16 @@ ahd_handle_nonpkt_busfree(struct ahd_softc *ahd)
 					/*ppr_options*/0,
 					AHD_TRANS_CUR|AHD_TRANS_GOAL,
 					/*paused*/TRUE);
-			/*
-			 * Remove any SCBs in the waiting for selection
-			 * queue that may also be for this target so that
-			 * command ordering is preserved.
-			 */
-			ahd_freeze_devq(ahd, scb);
-			ahd_qinfifo_requeue_tail(ahd, scb);
+			if (scb != NULL) {
+				/*
+				 * Remove any SCBs in the waiting for
+				 * selection queue that may also be for
+				 * this target so that command ordering
+				 * is preserved.
+				 */
+				ahd_freeze_devq(ahd, scb);
+				ahd_qinfifo_requeue_tail(ahd, scb);
+			}
 			printerror = 0;
 		} else if ((ahd->msg_flags & MSG_FLAG_EXPECT_IDE_BUSFREE) != 0
 			&& ahd_sent_msg(ahd, AHDMSG_1B,
@@ -3251,7 +3260,7 @@ ahd_handle_nonpkt_busfree(struct ahd_softc *ahd)
 	 * the message phases.  We check it last in case we
 	 * had to send some other message that caused a busfree.
 	 */
-	if (printerror != 0
+	if (scb != NULL && printerror != 0
 	 && (lastphase == P_MESGIN || lastphase == P_MESGOUT)
 	 && ((ahd->msg_flags & MSG_FLAG_EXPECT_PPR_BUSFREE) != 0)) {
 
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index bb2c696..2d66fac 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1969,7 +1969,7 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device,
 	DECLARE_COMPLETION_ONSTACK(comp);
 	int wait;
 	unsigned long flags;
-	signed long timeout = init_timeout * HZ;
+	signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ;
 
 	ENTER;
 	do {
@@ -2720,6 +2720,7 @@ static struct ibmvfc_async_crq *ibmvfc_next_async_crq(struct ibmvfc_host *vhost)
 	if (crq->valid & 0x80) {
 		if (++async_crq->cur == async_crq->size)
 			async_crq->cur = 0;
+		rmb();
 	} else
 		crq = NULL;
 
@@ -2742,6 +2743,7 @@ static struct ibmvfc_crq *ibmvfc_next_crq(struct ibmvfc_host *vhost)
 	if (crq->valid & 0x80) {
 		if (++queue->cur == queue->size)
 			queue->cur = 0;
+		rmb();
 	} else
 		crq = NULL;
 
@@ -2790,12 +2792,14 @@ static void ibmvfc_tasklet(void *data)
 		while ((async = ibmvfc_next_async_crq(vhost)) != NULL) {
 			ibmvfc_handle_async(async, vhost);
 			async->valid = 0;
+			wmb();
 		}
 
 		/* Pull all the valid messages off the CRQ */
 		while ((crq = ibmvfc_next_crq(vhost)) != NULL) {
 			ibmvfc_handle_crq(crq, vhost);
 			crq->valid = 0;
+			wmb();
 		}
 
 		vio_enable_interrupts(vdev);
@@ -2803,10 +2807,12 @@ static void ibmvfc_tasklet(void *data)
 			vio_disable_interrupts(vdev);
 			ibmvfc_handle_async(async, vhost);
 			async->valid = 0;
+			wmb();
 		} else if ((crq = ibmvfc_next_crq(vhost)) != NULL) {
 			vio_disable_interrupts(vdev);
 			ibmvfc_handle_crq(crq, vhost);
 			crq->valid = 0;
+			wmb();
 		} else
 			done = 1;
 	}
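
The barriers added to the ibmvfc CRQ paths order both halves of each ring-slot handoff with the firmware: rmb() keeps payload reads from being reordered ahead of the valid-bit check, and wmb() orders the valid = 0 store before anything the driver does next, so a slot is not visibly recycled while still being read. A reduced consumer-side sketch (hypothetical entry layout, not the ibmvfc structures):

	#include <linux/types.h>
	#include <asm/system.h>		/* rmb()/wmb() as of 2.6.32 */

	struct ring_entry {
		u8 valid;		/* producer sets 0x80 last */
		u8 payload[31];
	};

	extern void handle(u8 *payload);	/* hypothetical handler */

	static void consume(struct ring_entry *e)
	{
		if (!(e->valid & 0x80))
			return;
		rmb();			/* valid check before payload reads */
		handle(e->payload);
		e->valid = 0;
		wmb();			/* release the slot only after use */
	}
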
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 007fa1c..ef8e9f8 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -38,6 +38,7 @@
 #define IBMVFC_ADISC_PLUS_CANCEL_TIMEOUT	\
 		(IBMVFC_ADISC_TIMEOUT + IBMVFC_ADISC_CANCEL_TIMEOUT)
 #define IBMVFC_INIT_TIMEOUT		120
+#define IBMVFC_ABORT_WAIT_TIMEOUT	40
 #define IBMVFC_MAX_REQUESTS_DEFAULT	100
 
 #define IBMVFC_DEBUG			0
diff --git a/drivers/staging/line6/Kconfig b/drivers/staging/line6/Kconfig
index 7852d4a..bc1ffbe 100644
--- a/drivers/staging/line6/Kconfig
+++ b/drivers/staging/line6/Kconfig
@@ -2,6 +2,7 @@ config LINE6_USB
 	tristate "Line6 USB support"
 	depends on USB && SND
 	select SND_RAWMIDI
+	select SND_PCM
 	help
 	  This is a driver for the guitar amp, cab, and effects modeller
 	  PODxt Pro by Line6 (and similar devices), supporting the
diff --git a/drivers/staging/panel/panel.c b/drivers/staging/panel/panel.c
index 4ce399b..c39a25f 100644
--- a/drivers/staging/panel/panel.c
+++ b/drivers/staging/panel/panel.c
@@ -2181,6 +2181,7 @@ int panel_init(void)
 		if (pprt) {
 			parport_release(pprt);
 			parport_unregister_device(pprt);
+			pprt = NULL;
 		}
 		parport_unregister_driver(&panel_driver);
 		printk(KERN_ERR "Panel driver version " PANEL_VERSION
@@ -2230,6 +2231,7 @@ static void __exit panel_cleanup_module(void)
 		/* TODO: free all input signals */
 		parport_release(pprt);
 		parport_unregister_device(pprt);
+		pprt = NULL;
 	}
 	parport_unregister_driver(&panel_driver);
 }
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index ed3aa7a..7456e29 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -22,6 +22,7 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
+#include <linux/usb/quirks.h>
 
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
@@ -1768,7 +1769,6 @@ int usb_new_device(struct usb_device *udev)
 	if (udev->parent)
 		usb_autoresume_device(udev->parent);
 
-	usb_detect_quirks(udev);
 	err = usb_enumerate_device(udev);	/* Read descriptors */
 	if (err < 0)
 		goto fail;
@@ -3063,6 +3063,10 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 		if (status < 0)
 			goto loop;
 
+		usb_detect_quirks(udev);
+		if (udev->quirks & USB_QUIRK_DELAY_INIT)
+			msleep(1000);
+
 		/* consecutive bus-powered hubs aren't reliable; they can
 		 * violate the voltage drop budget.  if the new child has
 		 * a "powered" LED, users should notice we didn't enable it
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index a61f160..80b062b 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -38,6 +38,9 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Creative SB Audigy 2 NX */
 	{ USB_DEVICE(0x041e, 0x3020), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* Logitech Harmony 700-series */
+	{ USB_DEVICE(0x046d, 0xc122), .driver_info = USB_QUIRK_DELAY_INIT },
+
 	/* Philips PSC805 audio device */
 	{ USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME },
 
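Together, the hub.c and quirks.c hunks move usb_detect_quirks() to the
connect path, before descriptors are read, and introduce
USB_QUIRK_DELAY_INIT, a one-second settle delay for devices such as the
Harmony 700 series whose firmware is not ready immediately after
connect. A sketch of how a quirk table entry and the delay fit together
(the vendor/product IDs and helper are hypothetical):

	#include <linux/usb.h>
	#include <linux/usb/quirks.h>
	#include <linux/delay.h>

	static const struct usb_device_id slow_boot_quirks[] = {
		/* device that needs time before enumeration */
		{ USB_DEVICE(0x1234, 0x5678),
		  .driver_info = USB_QUIRK_DELAY_INIT },
		{ }	/* terminating entry */
	};

	static void settle_before_enumeration(struct usb_device *udev)
	{
		if (udev->quirks & USB_QUIRK_DELAY_INIT)
			msleep(1000);	/* let slow firmware finish booting */
	}
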
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 0885d4a..da9a2b8 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -137,6 +137,16 @@ void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor)
 }
 EXPORT_SYMBOL_GPL(usb_anchor_urb);
 
+/* Callers must hold anchor->lock */
+static void __usb_unanchor_urb(struct urb *urb, struct usb_anchor *anchor)
+{
+	urb->anchor = NULL;
+	list_del(&urb->anchor_list);
+	usb_put_urb(urb);
+	if (list_empty(&anchor->urb_list))
+		wake_up(&anchor->wait);
+}
+
 /**
  * usb_unanchor_urb - unanchors an URB
  * @urb: pointer to the urb to anchor
@@ -156,17 +166,14 @@ void usb_unanchor_urb(struct urb *urb)
 		return;
 
 	spin_lock_irqsave(&anchor->lock, flags);
-	if (unlikely(anchor != urb->anchor)) {
-		/* we've lost the race to another thread */
-		spin_unlock_irqrestore(&anchor->lock, flags);
-		return;
-	}
-	urb->anchor = NULL;
-	list_del(&urb->anchor_list);
+	/*
+	 * At this point, we could be competing with another thread which
+	 * has the same intention. To protect the urb from being unanchored
+	 * twice, only the winner of the race gets the job.
+	 */
+	if (likely(anchor == urb->anchor))
+		__usb_unanchor_urb(urb, anchor);
 	spin_unlock_irqrestore(&anchor->lock, flags);
-	usb_put_urb(urb);
-	if (list_empty(&anchor->urb_list))
-		wake_up(&anchor->wait);
 }
 EXPORT_SYMBOL_GPL(usb_unanchor_urb);
 
@@ -725,20 +732,11 @@ EXPORT_SYMBOL_GPL(usb_unpoison_anchored_urbs);
 void usb_unlink_anchored_urbs(struct usb_anchor *anchor)
 {
 	struct urb *victim;
-	unsigned long flags;
 
-	spin_lock_irqsave(&anchor->lock, flags);
-	while (!list_empty(&anchor->urb_list)) {
-		victim = list_entry(anchor->urb_list.prev, struct urb,
-				    anchor_list);
-		usb_get_urb(victim);
-		spin_unlock_irqrestore(&anchor->lock, flags);
-		/* this will unanchor the URB */
+	while ((victim = usb_get_from_anchor(anchor)) != NULL) {
 		usb_unlink_urb(victim);
 		usb_put_urb(victim);
-		spin_lock_irqsave(&anchor->lock, flags);
 	}
-	spin_unlock_irqrestore(&anchor->lock, flags);
 }
 EXPORT_SYMBOL_GPL(usb_unlink_anchored_urbs);
 
@@ -775,12 +773,11 @@ struct urb *usb_get_from_anchor(struct usb_anchor *anchor)
 		victim = list_entry(anchor->urb_list.next, struct urb,
 				    anchor_list);
 		usb_get_urb(victim);
-		spin_unlock_irqrestore(&anchor->lock, flags);
-		usb_unanchor_urb(victim);
+		__usb_unanchor_urb(victim, anchor);
 	} else {
-		spin_unlock_irqrestore(&anchor->lock, flags);
 		victim = NULL;
 	}
+	spin_unlock_irqrestore(&anchor->lock, flags);
 
 	return victim;
 }
@@ -802,12 +799,7 @@ void usb_scuttle_anchored_urbs(struct usb_anchor *anchor)
 	while (!list_empty(&anchor->urb_list)) {
 		victim = list_entry(anchor->urb_list.prev, struct urb,
 				    anchor_list);
-		usb_get_urb(victim);
-		spin_unlock_irqrestore(&anchor->lock, flags);
-		/* this may free the URB */
-		usb_unanchor_urb(victim);
-		usb_put_urb(victim);
-		spin_lock_irqsave(&anchor->lock, flags);
+		__usb_unanchor_urb(victim, anchor);
 	}
 	spin_unlock_irqrestore(&anchor->lock, flags);
 }
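
The urb.c rework closes a window in which an URB could be unanchored
twice: all list surgery now lives in __usb_unanchor_urb(), which by the
leading-underscore convention must be entered with anchor->lock held,
and the list walkers no longer drop the lock between picking a victim
and unanchoring it. A generic sketch of that locked-helper convention
(names hypothetical):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	/* Callers must hold *lock; does the actual removal. */
	static void __take_entry(struct list_head *entry)
	{
		list_del_init(entry);
	}

	static void take_entry(spinlock_t *lock, struct list_head *entry)
	{
		unsigned long flags;

		spin_lock_irqsave(lock, flags);
		__take_entry(entry);	/* no unlock/relock window */
		spin_unlock_irqrestore(lock, flags);
	}
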
diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c
index a9f06d7..d6a2ef3 100644
--- a/drivers/usb/misc/usbtest.c
+++ b/drivers/usb/misc/usbtest.c
@@ -1382,7 +1382,6 @@ static void iso_callback (struct urb *urb)
 			break;
 		}
 	}
-	simple_free_urb (urb);
 
 	ctx->pending--;
 	if (ctx->pending == 0) {
@@ -1499,6 +1498,7 @@ test_iso_queue (struct usbtest_dev *dev, struct usbtest_param *param,
 			}
 
 			simple_free_urb (urbs [i]);
+			urbs[i] = NULL;
 			context.pending--;
 			context.submit_error = 1;
 			break;
@@ -1508,6 +1508,10 @@ test_iso_queue (struct usbtest_dev *dev, struct usbtest_param *param,
 
 	wait_for_completion (&context.done);
 
+	for (i = 0; i < param->sglen; i++) {
+		if (urbs[i])
+			simple_free_urb(urbs[i]);
+	}
 	/*
 	 * Isochronous transfers are expected to fail sometimes.  As an
 	 * arbitrary limit, we will report an error if any submissions
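
The usbtest change moves URB freeing out of the completion callback and
into the submitter: slots in urbs[] are NULLed when freed early on a
submit error, and whatever survives is released only after
wait_for_completion() returns, so no URB is freed while the host
controller driver may still touch it. A sketch of that ownership rule
(array and helper names hypothetical; the driver itself uses a
simple_free_urb() wrapper):

	#include <linux/usb.h>
	#include <linux/completion.h>

	/* Reap the queue only after every completion has fired; slots
	 * already freed on a submit error were set to NULL earlier. */
	static void reap_urbs(struct urb **urbs, unsigned int n,
			      struct completion *done)
	{
		unsigned int i;

		wait_for_completion(done);
		for (i = 0; i < n; i++)
			if (urbs[i])
				usb_free_urb(urbs[i]);
	}
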
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index 10f3205..9231b25 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -971,7 +971,7 @@ static int mon_bin_ioctl(struct inode *inode, struct file *file,
 
 		mutex_lock(&rp->fetch_lock);
 		spin_lock_irqsave(&rp->b_lock, flags);
-		mon_free_buff(rp->b_vec, size/CHUNK_SIZE);
+		mon_free_buff(rp->b_vec, rp->b_size/CHUNK_SIZE);
 		kfree(rp->b_vec);
 		rp->b_vec  = vec;
 		rp->b_size = size;
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index e3e087e..ae4b181 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -128,6 +128,10 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */
 	{ USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
 	{ USB_DEVICE(0x413C, 0x9500) }, /* DW700 GPS USB interface */
+	{ USB_DEVICE(0x16DC, 0x0010) }, /* W-IE-NE-R Plein & Baus GmbH PL512 Power Supply */
+	{ USB_DEVICE(0x16DC, 0x0011) }, /* W-IE-NE-R Plein & Baus GmbH RCM Remote Control for MARATON Power Supply */
+	{ USB_DEVICE(0x16DC, 0x0012) }, /* W-IE-NE-R Plein & Baus GmbH MPOD Multi Channel Power Supply */
+	{ USB_DEVICE(0x16DC, 0x0015) }, /* W-IE-NE-R Plein & Baus GmbH CML Control, Monitoring and Data Logger */
 	{ } /* Terminating Entry */
 };
 
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 8de8572..b2353a9 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -162,6 +162,9 @@ static struct usb_device_id id_table_combined [] = {
 	{ USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_5_PID) },
 	{ USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_6_PID) },
 	{ USB_DEVICE(FTDI_VID, FTDI_SCS_DEVICE_7_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_USINT_CAT_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_USINT_WKEY_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_USINT_RS232_PID) },
 	{ USB_DEVICE(FTDI_VID, FTDI_ACTZWAVE_PID) },
 	{ USB_DEVICE(FTDI_VID, FTDI_IRTRANS_PID) },
 	{ USB_DEVICE(FTDI_VID, FTDI_IPLUS_PID) },
@@ -752,6 +755,7 @@ static struct usb_device_id id_table_combined [] = {
 		.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
 	{ USB_DEVICE(FTDI_VID, XVERVE_SIGNALYZER_SH4_PID),
 		.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
+	{ USB_DEVICE(FTDI_VID, SEGWAY_RMP200_PID) },
 	{ },					/* Optional parameter entry */
 	{ }					/* Terminating entry */
 };
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index ffdcec7..a0a7796 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -40,6 +40,11 @@
 
 #define FTDI_NXTCAM_PID		0xABB8 /* NXTCam for Mindstorms NXT */
 
+/* US Interface Navigator (http://www.usinterface.com/) */
+#define FTDI_USINT_CAT_PID	0xb810	/* Navigator CAT and 2nd PTT lines */
+#define FTDI_USINT_WKEY_PID	0xb811	/* Navigator WKEY and FSK lines */
+#define FTDI_USINT_RS232_PID	0xb812	/* Navigator RS232 and CONFIG lines */
+
 /* OOCDlink by Joern Kaipf <joernk@....de>
  * (http://www.joernonline.de/dw/doku.php?id=start&idx=projects:oocdlink) */
 #define FTDI_OOCDLINK_PID	0xbaf8	/* Amontec JTAGkey */
@@ -1039,3 +1044,8 @@
 #define XVERVE_SIGNALYZER_SH2_PID	0xBCA2
 #define XVERVE_SIGNALYZER_SH4_PID	0xBCA4
 
+/*
+ * Segway Robotic Mobility Platform USB interface (using VID 0x0403)
+ * Submitted by John G. Rogers
+ */
+#define SEGWAY_RMP200_PID	0xe729
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 2586023..e864052 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -165,7 +165,10 @@ static int  option_resume(struct usb_serial *serial);
 #define HUAWEI_PRODUCT_E143D			0x143D
 #define HUAWEI_PRODUCT_E143E			0x143E
 #define HUAWEI_PRODUCT_E143F			0x143F
+#define HUAWEI_PRODUCT_K4505			0x1464
+#define HUAWEI_PRODUCT_K3765			0x1465
 #define HUAWEI_PRODUCT_E14AC			0x14AC
+#define HUAWEI_PRODUCT_ETS1220			0x1803
 
 #define QUANTA_VENDOR_ID			0x0408
 #define QUANTA_PRODUCT_Q101			0xEA02
@@ -469,6 +472,9 @@ static struct usb_device_id option_ids[] = {
 	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143D, 0xff, 0xff, 0xff) },
 	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143E, 0xff, 0xff, 0xff) },
 	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E143F, 0xff, 0xff, 0xff) },
+	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K4505, 0xff, 0xff, 0xff) },
+	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_K3765, 0xff, 0xff, 0xff) },
+	{ USB_DEVICE_AND_INTERFACE_INFO(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_ETS1220, 0xff, 0xff, 0xff) },
 	{ USB_DEVICE(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E14AC) },
 	{ USB_DEVICE(AMOI_VENDOR_ID, AMOI_PRODUCT_9508) },
 	{ USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, NOVATELWIRELESS_PRODUCT_V640) }, /* Novatel Merlin V640/XV620 */
@@ -1007,6 +1013,13 @@ static int option_probe(struct usb_serial *serial,
 		serial->interface->cur_altsetting->desc.bInterfaceClass != 0xff)
 		return -ENODEV;
 
+	/* Don't bind network interfaces on Huawei K3765 & K4505 */
+	if (serial->dev->descriptor.idVendor == HUAWEI_VENDOR_ID &&
+		(serial->dev->descriptor.idProduct == HUAWEI_PRODUCT_K3765 ||
+			serial->dev->descriptor.idProduct == HUAWEI_PRODUCT_K4505) &&
+		serial->interface->cur_altsetting->desc.bInterfaceNumber == 1)
+		return -ENODEV;
+
 	data = serial->private = kzalloc(sizeof(struct option_intf_private), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
diff --git a/drivers/video/w100fb.c b/drivers/video/w100fb.c
index 2376f68..5a1dad2 100644
--- a/drivers/video/w100fb.c
+++ b/drivers/video/w100fb.c
@@ -857,9 +857,9 @@ unsigned long w100fb_gpio_read(int port)
 void w100fb_gpio_write(int port, unsigned long value)
 {
 	if (port==W100_GPIO_PORT_A)
-		value = writel(value, remapped_regs + mmGPIO_DATA);
+		writel(value, remapped_regs + mmGPIO_DATA);
 	else
-		value = writel(value, remapped_regs + mmGPIO_DATA2);
+		writel(value, remapped_regs + mmGPIO_DATA2);
 }
 EXPORT_SYMBOL(w100fb_gpio_read);
 EXPORT_SYMBOL(w100fb_gpio_write);
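
The w100fb fix drops assignments of writel()'s result: the accessor is
specified to return void, so `value = writel(...)` is invalid wherever
writel() is a true function and only happened to compile where it
expands to an expression-style macro. The corrected call shape, for
reference (wrapper hypothetical):

	#include <linux/io.h>

	static void gpio_reg_write(void __iomem *reg, unsigned long value)
	{
		writel(value, reg);	/* writel() returns void */
	}
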
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index ce602dd..30e0467 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -535,6 +535,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 	if (irq < 0)
 		return irq;
 
+	irqflags |= IRQF_NO_SUSPEND;
 	retval = request_irq(irq, handler, irqflags, devname, dev_id);
 	if (retval != 0) {
 		unbind_from_irq(irq);
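The Xen change forces IRQF_NO_SUSPEND onto every IPI binding: IPI event
channels have to keep firing while the rest of the system is suspended,
otherwise the suspend path can stall waiting on cross-CPU calls that
will never be delivered. Shape of the resulting request (handler and
name hypothetical):

	#include <linux/interrupt.h>

	static int bind_my_ipi(unsigned int irq, irq_handler_t handler,
			       unsigned long irqflags, void *dev_id)
	{
		/* IPIs must stay armed across suspend */
		return request_irq(irq, handler,
				   irqflags | IRQF_NO_SUSPEND,
				   "my-ipi", dev_id);
	}
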
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9b9e3dc..e65efa2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1175,10 +1175,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
-	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-	if (ret != 0) {
-		bdput(bdev);
-		return ret;
+	if (!for_part) {
+		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+		if (ret != 0) {
+			bdput(bdev);
+			return ret;
+		}
 	}
 
 	lock_kernel();
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 38ebe78..12d7be8 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type,
 /*
  * Needs to be called with fs_mutex held
  */
-static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+			 struct inode *inode, struct posix_acl *acl, int type)
 {
 	int ret, size = 0;
 	const char *name;
@@ -111,12 +112,14 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		mode = inode->i_mode;
-		ret = posix_acl_equiv_mode(acl, &mode);
-		if (ret < 0)
-			return ret;
-		ret = 0;
-		inode->i_mode = mode;
 		name = POSIX_ACL_XATTR_ACCESS;
+		if (acl) {
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			inode->i_mode = mode;
+		}
+		ret = 0;
 		break;
 	case ACL_TYPE_DEFAULT:
 		if (!S_ISDIR(inode->i_mode))
@@ -140,8 +143,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 			goto out;
 	}
 
-	ret = __btrfs_setxattr(inode, name, value, size, 0);
-
+	ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
 out:
 	kfree(value);
 
@@ -154,7 +156,7 @@ out:
 static int btrfs_xattr_set_acl(struct inode *inode, int type,
 			       const void *value, size_t size)
 {
-	int ret = 0;
+	int ret;
 	struct posix_acl *acl = NULL;
 
 	if (!is_owner_or_cap(inode))
@@ -170,7 +172,7 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
 		}
 	}
 
-	ret = btrfs_set_acl(inode, acl, type);
+	ret = btrfs_set_acl(NULL, inode, acl, type);
 
 	posix_acl_release(acl);
 
@@ -224,7 +226,8 @@ int btrfs_check_acl(struct inode *inode, int mask)
  * stuff has been fixed to work with that.  If the locking stuff changes, we
  * need to re-evaluate the acl locking stuff.
  */
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+		   struct inode *inode, struct inode *dir)
 {
 	struct posix_acl *acl = NULL;
 	int ret = 0;
@@ -249,7 +252,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 		mode_t mode;
 
 		if (S_ISDIR(inode->i_mode)) {
-			ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+			ret = btrfs_set_acl(trans, inode, acl,
+					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
@@ -264,10 +268,11 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 			inode->i_mode = mode;
 			if (ret > 0) {
 				/* we need an acl */
-				ret = btrfs_set_acl(inode, clone,
+				ret = btrfs_set_acl(trans, inode, clone,
 						    ACL_TYPE_ACCESS);
 			}
 		}
+		posix_acl_release(clone);
 	}
 failed:
 	posix_acl_release(acl);
@@ -297,7 +302,7 @@ int btrfs_acl_chmod(struct inode *inode)
 
 	ret = posix_acl_chmod_masq(clone, inode->i_mode);
 	if (!ret)
-		ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+		ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
 
 	posix_acl_release(clone);
 
@@ -323,7 +328,8 @@ int btrfs_acl_chmod(struct inode *inode)
 	return 0;
 }
 
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+		   struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
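
The acl.c changes thread a btrfs_trans_handle through btrfs_set_acl()
and btrfs_init_acl() so the xattr write can join the caller's
transaction (passing NULL keeps the old start-our-own behaviour),
tolerate a NULL acl when ACL_TYPE_ACCESS is being cleared, and plug a
leak by releasing the cloned ACL on every path. The reference rule the
leak fix enforces, in isolation (function name hypothetical):

	#include <linux/posix_acl.h>

	/* Every posix_acl_clone() must be paired with posix_acl_release(),
	 * whether or not the clone ends up being stored. */
	static int apply_masked_acl(struct posix_acl *acl, mode_t *mode)
	{
		struct posix_acl *clone;
		int ret;

		clone = posix_acl_clone(acl, GFP_NOFS);
		if (!clone)
			return -ENOMEM;
		ret = posix_acl_create_masq(clone, mode);
		/* ... on ret > 0 the clone would be written out ... */
		posix_acl_release(clone);	/* drop our reference */
		return ret < 0 ? ret : 0;
	}
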
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f6783a4..3f1f50d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,9 +44,6 @@ struct btrfs_inode {
 	 */
 	struct extent_io_tree io_failure_tree;
 
-	/* held while inesrting or deleting extents from files */
-	struct mutex extent_mutex;
-
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
@@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 
 static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 {
-	inode->i_size = size;
+	i_size_write(inode, size);
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
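btrfs_i_size_write() now goes through i_size_write() instead of storing
to inode->i_size directly: on 32-bit SMP kernels i_size is a 64-bit
value protected by a seqcount, and a bare store can let concurrent
readers observe a torn size. The accessor pair, for reference (wrapper
names hypothetical; writers must still serialize against each other,
e.g. under i_mutex):

	#include <linux/fs.h>

	/* writer side: updates i_size under its seqcount */
	static void set_size(struct inode *inode, loff_t size)
	{
		i_size_write(inode, size);
	}

	/* reader side: retries until it sees a consistent value */
	static loff_t get_size(struct inode *inode)
	{
		return i_size_read(inode);
	}
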
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec96f3a..c4bc570 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
+static int setup_items_for_insert(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *cpu_key, u32 *data_size,
+			u32 total_data, u32 total_size, int nr);
+
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  parent_start, root->root_key.objectid,
-				  level, 0);
+		btrfs_free_tree_block(trans, root, buf->start, buf->len,
+				parent_start, root->root_key.objectid, level);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  parent_start, root->root_key.objectid,
-				  level, 0);
+		btrfs_free_tree_block(trans, root, buf->start, buf->len,
+				parent_start, root->root_key.objectid, level);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
-					0, root->root_key.objectid, level, 1);
+		ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
+					    0, root->root_key.objectid, level);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize, 0,
-						 root->root_key.objectid,
-						 level, 0);
+			wret = btrfs_free_tree_block(trans, root,
+						     bytenr, blocksize, 0,
+						     root->root_key.objectid,
+						     level);
 			if (wret)
 				ret = wret;
 		} else {
@@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					 0, root->root_key.objectid,
-					 level, 0);
+		wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
+					 0, root->root_key.objectid, level);
 		if (wret)
 			ret = wret;
 	} else {
@@ -2997,75 +2999,85 @@ again:
 	return ret;
 }
 
-/*
- * This function splits a single item into two items,
- * giving 'new_key' to the new item and splitting the
- * old one at split_offset (from the start of the item).
- *
- * The path may be released by this operation.  After
- * the split, the path is pointing to the old item.  The
- * new item is going to be in the same node as the old one.
- *
- * Note, the item being split must be smaller enough to live alone on
- * a tree block with room for one extra struct btrfs_item
- *
- * This allows us to split the item in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_split_item(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_path *path,
-		     struct btrfs_key *new_key,
-		     unsigned long split_offset)
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_path *path, int ins_len)
 {
-	u32 item_size;
+	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct btrfs_key orig_key;
-	struct btrfs_item *item;
-	struct btrfs_item *new_item;
-	int ret = 0;
-	int slot;
-	u32 nritems;
-	u32 orig_offset;
-	struct btrfs_disk_key disk_key;
-	char *buf;
+	struct btrfs_file_extent_item *fi;
+	u64 extent_len = 0;
+	u32 item_size;
+	int ret;
 
 	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
-	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
-		goto split;
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+	       key.type != BTRFS_EXTENT_CSUM_KEY);
+
+	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+		return 0;
 
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (key.type == BTRFS_EXTENT_DATA_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+	}
 	btrfs_release_path(root, path);
 
-	path->search_for_split = 1;
 	path->keep_locks = 1;
-
-	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+	path->search_for_split = 1;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	path->search_for_split = 0;
+	if (ret < 0)
+		goto err;
 
+	ret = -EAGAIN;
+	leaf = path->nodes[0];
 	/* if our item isn't there or got smaller, return now */
-	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
-							path->slots[0])) {
-		path->keep_locks = 0;
-		return -EAGAIN;
+	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+		goto err;
+
+	if (key.type == BTRFS_EXTENT_DATA_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+			goto err;
 	}
 
 	btrfs_set_path_blocking(path);
-	ret = split_leaf(trans, root, &orig_key, path,
-			 sizeof(struct btrfs_item), 1);
-	path->keep_locks = 0;
+	ret = split_leaf(trans, root, &key, path, ins_len, 1);
 	BUG_ON(ret);
 
+	path->keep_locks = 0;
 	btrfs_unlock_up_safe(path, 1);
+	return 0;
+err:
+	path->keep_locks = 0;
+	return ret;
+}
+
+static noinline int split_item(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *new_key,
+			       unsigned long split_offset)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	struct btrfs_item *new_item;
+	int slot;
+	char *buf;
+	u32 nritems;
+	u32 item_size;
+	u32 orig_offset;
+	struct btrfs_disk_key disk_key;
+
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
-split:
-	/*
-	 * make sure any changes to the path from split_leaf leave it
-	 * in a blocking state
-	 */
 	btrfs_set_path_blocking(path);
 
 	item = btrfs_item_nr(leaf, path->slots[0]);
@@ -3073,19 +3085,19 @@ split:
 	item_size = btrfs_item_size(leaf, item);
 
 	buf = kmalloc(item_size, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
 	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
 			    path->slots[0]), item_size);
-	slot = path->slots[0] + 1;
-	leaf = path->nodes[0];
 
+	slot = path->slots[0] + 1;
 	nritems = btrfs_header_nritems(leaf);
-
 	if (slot != nritems) {
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
-			      btrfs_item_nr_offset(slot),
-			      (nritems - slot) * sizeof(struct btrfs_item));
-
+				btrfs_item_nr_offset(slot),
+				(nritems - slot) * sizeof(struct btrfs_item));
 	}
 
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3113,16 +3125,81 @@ split:
 			    item_size - split_offset);
 	btrfs_mark_buffer_dirty(leaf);
 
-	ret = 0;
-	if (btrfs_leaf_free_space(root, leaf) < 0) {
-		btrfs_print_leaf(root, leaf);
-		BUG();
-	}
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
 	kfree(buf);
+	return 0;
+}
+
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation.  After
+ * the split, the path is pointing to the old item.  The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be small enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *new_key,
+		     unsigned long split_offset)
+{
+	int ret;
+	ret = setup_leaf_for_split(trans, root, path,
+				   sizeof(struct btrfs_item));
+	if (ret)
+		return ret;
+
+	ret = split_item(trans, root, path, new_key, split_offset);
 	return ret;
 }
 
 /*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path,
+			 struct btrfs_key *new_key)
+{
+	struct extent_buffer *leaf;
+	int ret;
+	u32 item_size;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ret = setup_leaf_for_split(trans, root, path,
+				   item_size + sizeof(struct btrfs_item));
+	if (ret)
+		return ret;
+
+	path->slots[0]++;
+	ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
+				     item_size, item_size +
+				     sizeof(struct btrfs_item), 1);
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	memcpy_extent_buffer(leaf,
+			     btrfs_item_ptr_offset(leaf, path->slots[0]),
+			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+			     item_size);
+	return 0;
+}
+
+/*
  * make the item pointed to by the path smaller.  new_size indicates
  * how small to make it, and from_end tells us if we just chop bytes
  * off the end of the item or if we shift the item to chop bytes off
@@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
-				0, root->root_key.objectid, 0, 0);
+	ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
+				    0, root->root_key.objectid, 0);
 	return ret;
 }
 /*
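
The ctree.c refactoring splits btrfs_split_item() into
setup_leaf_for_split(), which guarantees the leaf has ins_len bytes free
and returns -EAGAIN when the search must be retried, and split_item(),
which performs the in-place surgery; the new btrfs_duplicate_item()
reuses the same setup to clone an item next to the original so a file
extent can later be split without dropping the leaf lock. It also
replaces raw btrfs_free_extent() calls on tree blocks with
btrfs_free_tree_block(), which additionally debits the root item's
bytes-used counter. A sketch of how a caller drives the duplicate
helper, mirroring its later use in btrfs_drop_extents() (wrapper name
hypothetical):

	static int dup_extent_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path, u64 split_at)
	{
		struct btrfs_key new_key;

		btrfs_item_key_to_cpu(path->nodes[0], &new_key,
				      path->slots[0]);
		new_key.offset = split_at;	/* key the copy at the split */
		/* on success the path points at the new (second) item;
		 * -EAGAIN means release the path and retry the search */
		return btrfs_duplicate_item(trans, root, path, &new_key);
	}
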
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 444b3e9..9f806dd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -310,6 +310,9 @@ struct btrfs_header {
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
+#define BTRFS_MAX_XATTR_SIZE(r)	(BTRFS_LEAF_DATA_SIZE(r) - \
+				 sizeof(struct btrfs_item) -\
+				 sizeof(struct btrfs_dir_item))
 
 
 /*
@@ -859,8 +862,9 @@ struct btrfs_fs_info {
 	struct mutex ordered_operations_mutex;
 	struct rw_semaphore extent_commit_sem;
 
-	struct rw_semaphore subvol_sem;
+	struct rw_semaphore cleanup_work_sem;
 
+	struct rw_semaphore subvol_sem;
 	struct srcu_struct subvol_srcu;
 
 	struct list_head trans_list;
@@ -868,6 +872,9 @@ struct btrfs_fs_info {
 	struct list_head dead_roots;
 	struct list_head caching_block_groups;
 
+	spinlock_t delayed_iput_lock;
+	struct list_head delayed_iputs;
+
 	atomic_t nr_async_submits;
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
@@ -1034,12 +1041,12 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
+	int clean_orphans;
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
 	int defrag_running;
-	int defrag_level;
 	char *name;
 	int in_sysfs;
 
@@ -1975,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					u64 parent, u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
 					u64 hint, u64 empty_size);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 bytenr, u32 blocksize,
+			  u64 parent, u64 root_objectid, int level);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
@@ -2089,6 +2100,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 		     struct btrfs_path *path,
 		     struct btrfs_key *new_key,
 		     unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path,
+			 struct btrfs_key *new_key);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
@@ -2196,9 +2211,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_path *path,
 			      struct btrfs_dir_item *di);
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, const char *name,
-			    u16 name_len, const void *data, u16 data_len,
-			    u64 dir);
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 objectid,
+			    const char *name, u16 name_len,
+			    const void *data, u16 data_len);
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 dir,
@@ -2292,7 +2308,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
@@ -2332,6 +2348,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2345,12 +2363,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_block, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+		       u64 start, u64 end, u64 *hint_byte, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
 
@@ -2380,7 +2395,8 @@ int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
 #endif
-int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+		   struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
 /* relocation.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f3a6075..e9103b3 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
  * into the tree
  */
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, const char *name,
-			    u16 name_len, const void *data, u16 data_len,
-			    u64 dir)
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 objectid,
+			    const char *name, u16 name_len,
+			    const void *data, u16 data_len)
 {
 	int ret = 0;
-	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
 	unsigned long name_ptr, data_ptr;
 	struct btrfs_key key, location;
@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	u32 data_size;
 
-	key.objectid = dir;
+	BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+
+	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
 	key.offset = btrfs_name_hash(name, name_len);
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	if (name_len + data_len + sizeof(struct btrfs_dir_item) >
-	    BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
-		return -ENOSPC;
 
 	data_size = sizeof(*dir_item) + name_len + data_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	write_extent_buffer(leaf, data, data_ptr, data_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
-	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 02b6afb..2b59201 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->stripesize = stripesize;
 	root->ref_cows = 0;
 	root->track_dirty = 0;
+	root->in_radix = 0;
+	root->clean_orphans = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->defrag_trans_start = fs_info->generation;
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
-	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
 	root->anon_super.s_root = NULL;
 	root->anon_super.s_dev = 0;
@@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				    0, &start, &end, EXTENT_DIRTY);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
 		if (ret)
 			break;
 
-		clear_extent_dirty(&log_root_tree->dirty_log_pages,
-				   start, end, GFP_NOFS);
+		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
+				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
 	}
 	eb = fs_info->log_root_tree->node;
 
@@ -1210,8 +1211,10 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
-	if (ret == 0)
+	if (ret == 0) {
 		root->in_radix = 1;
+		root->clean_orphans = 1;
+	}
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1225,10 +1228,6 @@ again:
 	ret = btrfs_find_dead_roots(fs_info->tree_root,
 				    root->root_key.objectid);
 	WARN_ON(ret);
-
-	if (!(fs_info->sb->s_flags & MS_RDONLY))
-		btrfs_orphan_cleanup(root);
-
 	return root;
 fail:
 	free_fs_root(root);
@@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg)
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
 		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
+			btrfs_run_delayed_iputs(root);
 			btrfs_clean_old_snapshots(root);
 			mutex_unlock(&root->fs_info->cleaner_mutex);
 		}
@@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
+	INIT_LIST_HEAD(&fs_info->delayed_iputs);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
@@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
+	spin_lock_init(&fs_info->delayed_iput_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	init_rwsem(&fs_info->extent_commit_sem);
+	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -1979,7 +1982,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_recover_relocation(tree_root);
-		BUG_ON(ret);
+		if (ret < 0) {
+			printk(KERN_WARNING
+			       "btrfs: failed to recover relocation\n");
+			err = -EINVAL;
+			goto fail_trans_kthread;
+		}
 	}
 
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1990,6 +1998,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
 
+	if (!(sb->s_flags & MS_RDONLY)) {
+		down_read(&fs_info->cleanup_work_sem);
+		btrfs_orphan_cleanup(fs_info->fs_root);
+		up_read(&fs_info->cleanup_work_sem);
+	}
+
 	return tree_root;
 
 fail_trans_kthread:
@@ -2386,8 +2400,14 @@ int btrfs_commit_super(struct btrfs_root *root)
 	int ret;
 
 	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(root);
 	btrfs_clean_old_snapshots(root);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
+
+	/* wait until ongoing cleanup work is done */
+	down_write(&root->fs_info->cleanup_work_sem);
+	up_write(&root->fs_info->cleanup_work_sem);
+
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
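
The disk-io.c changes introduce delayed iputs, a per-fs list drained by
the cleaner kthread and by btrfs_commit_super(), plus cleanup_work_sem,
taken for read around orphan cleanup and write-locked once at commit to
wait out stragglers; orphan cleanup itself moves out of root lookup and
runs once the fs root is fully set up. Deferring iput() matters because
the final reference drop can evict the inode and issue I/O, which is
unsafe from some calling contexts. A sketch of the delayed-iput pattern
(all names hypothetical):

	#include <linux/fs.h>
	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct delayed_iput {
		struct list_head list;
		struct inode *inode;
	};

	/* defer a final iput() to a context where blocking is safe */
	static void add_delayed_iput(spinlock_t *lock, struct list_head *head,
				     struct delayed_iput *di)
	{
		spin_lock(lock);
		list_add_tail(&di->list, head);
		spin_unlock(lock);
	}

	/* run from a cleaner thread */
	static void run_delayed_iputs(spinlock_t *lock, struct list_head *head)
	{
		struct delayed_iput *di;

		spin_lock(lock);
		while (!list_empty(head)) {
			di = list_first_entry(head, struct delayed_iput, list);
			list_del(&di->list);
			spin_unlock(lock);
			iput(di->inode);	/* may block or recurse */
			kfree(di);
			spin_lock(lock);
		}
		spin_unlock(lock);
	}
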
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94627c4..559f724 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 	return (cache->flags & bits) == bits;
 }
 
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+{
+	atomic_inc(&cache->count);
+}
+
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count))
+		kfree(cache);
+}
+
 /*
  * this adds the block group to the fs_info rb tree for the block group
  * cache
@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 		}
 	}
 	if (ret)
-		atomic_inc(&ret->count);
+		btrfs_get_block_group(ret);
 	spin_unlock(&info->block_group_cache_lock);
 
 	return ret;
@@ -195,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root,
 	int stripe_len;
 	int i, nr, ret;
 
+	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+		cache->bytes_super += stripe_len;
+		ret = add_excluded_extent(root, cache->key.objectid,
+					  stripe_len);
+		BUG_ON(ret);
+	}
+
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
@@ -255,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 		if (ret)
 			break;
 
-		if (extent_start == start) {
+		if (extent_start <= start) {
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
@@ -399,6 +418,8 @@ err:
 
 	put_caching_control(caching_ctl);
 	atomic_dec(&block_group->space_info->caching_threads);
+	btrfs_put_block_group(block_group);
+
 	return 0;
 }
 
@@ -439,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
 	up_write(&fs_info->extent_commit_sem);
 
 	atomic_inc(&cache->space_info->caching_threads);
+	btrfs_get_block_group(cache);
 
 	tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
 			  cache->key.objectid);
@@ -478,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
 	return cache;
 }
 
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
-{
-	if (atomic_dec_and_test(&cache->count))
-		kfree(cache);
-}
-
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
@@ -2574,7 +2590,7 @@ next_block_group(struct btrfs_root *root,
 	if (node) {
 		cache = rb_entry(node, struct btrfs_block_group_cache,
 				 cache_node);
-		atomic_inc(&cache->count);
+		btrfs_get_block_group(cache);
 	} else
 		cache = NULL;
 	spin_unlock(&root->fs_info->block_group_cache_lock);
@@ -2880,9 +2896,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work)
 	root = async->root;
 	info = async->info;
 
-	btrfs_start_delalloc_inodes(root);
+	btrfs_start_delalloc_inodes(root, 0);
 	wake_up(&info->flush_wait);
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_wait_ordered_extents(root, 0, 0);
 
 	spin_lock(&info->lock);
 	info->flushing = 0;
@@ -2956,8 +2972,8 @@ static void flush_delalloc(struct btrfs_root *root,
 	return;
 
 flush:
-	btrfs_start_delalloc_inodes(root);
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_start_delalloc_inodes(root, 0);
+	btrfs_wait_ordered_extents(root, 0, 0);
 
 	spin_lock(&info->lock);
 	info->flushing = 0;
@@ -3454,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	else
 		old_val -= num_bytes;
 	btrfs_set_super_bytes_used(&info->super_copy, old_val);
-
-	/* block accounting for root item */
-	old_val = btrfs_root_used(&root->root_item);
-	if (alloc)
-		old_val += num_bytes;
-	else
-		old_val -= num_bytes;
-	btrfs_set_root_used(&root->root_item, old_val);
 	spin_unlock(&info->delalloc_lock);
 
 	while (total) {
@@ -4049,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 bytenr, u32 blocksize,
+			  u64 parent, u64 root_objectid, int level)
+{
+	u64 used;
+	spin_lock(&root->node_lock);
+	used = btrfs_root_used(&root->root_item) - blocksize;
+	btrfs_set_root_used(&root->root_item, used);
+	spin_unlock(&root->node_lock);
+
+	return btrfs_free_extent(trans, root, bytenr, blocksize,
+				 parent, root_objectid, level, 0);
+}
+
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
 	u64 mask = ((u64)root->stripesize - 1);
@@ -4212,7 +4235,7 @@ search:
 		u64 offset;
 		int cached;
 
-		atomic_inc(&block_group->count);
+		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
 have_block_group:
@@ -4300,7 +4323,7 @@ have_block_group:
 
 				btrfs_put_block_group(block_group);
 				block_group = last_ptr->block_group;
-				atomic_inc(&block_group->count);
+				btrfs_get_block_group(block_group);
 				spin_unlock(&last_ptr->lock);
 				spin_unlock(&last_ptr->refill_lock);
 
@@ -4578,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 search_start = 0;
-	struct btrfs_fs_info *info = root->fs_info;
 
 	data = btrfs_get_alloc_profile(root, data);
 again:
@@ -4586,17 +4608,9 @@ again:
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
 	 */
-	if (empty_size || root->ref_cows) {
-		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
-			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     2 * 1024 * 1024,
-				     BTRFS_BLOCK_GROUP_METADATA |
-				     (info->metadata_alloc_profile &
-				      info->avail_metadata_alloc_bits), 0);
-		}
+	if (empty_size || root->ref_cows)
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 				     num_bytes + 2 * 1024 * 1024, data, 0);
-	}
 
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -4897,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
 					extent_op);
 		BUG_ON(ret);
 	}
+
+	if (root_objectid == root->root_key.objectid) {
+		u64 used;
+		spin_lock(&root->node_lock);
+		used = btrfs_root_used(&root->root_item) + num_bytes;
+		btrfs_set_root_used(&root->root_item, used);
+		spin_unlock(&root->node_lock);
+	}
 	return ret;
 }
 
@@ -4919,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_set_buffer_uptodate(buf);
 
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-		set_extent_dirty(&root->dirty_log_pages, buf->start,
-			 buf->start + buf->len - 1, GFP_NOFS);
+		/*
+		 * we allow two log transactions at a time; use different
+		 * EXTENT bits to differentiate dirty pages.
+		 */
+		if (root->log_transid % 2 == 0)
+			set_extent_dirty(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1, GFP_NOFS);
+		else
+			set_extent_new(&root->dirty_log_pages, buf->start,
+					buf->start + buf->len - 1, GFP_NOFS);
 	} else {
 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
@@ -5372,10 +5402,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while (level >= 0) {
-		if (path->slots[level] >=
-		    btrfs_header_nritems(path->nodes[level]))
-			break;
-
 		ret = walk_down_proc(trans, root, path, wc, lookup_info);
 		if (ret > 0)
 			break;
@@ -5383,6 +5409,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (level == 0)
 			break;
 
+		if (path->slots[level] >=
+		    btrfs_header_nritems(path->nodes[level]))
+			break;
+
 		ret = do_walk_down(trans, root, path, wc, &lookup_info);
 		if (ret > 0) {
 			path->slots[level]++;
@@ -7373,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 			wait_block_group_cache_done(block_group);
 
 		btrfs_remove_free_space_cache(block_group);
-
-		WARN_ON(atomic_read(&block_group->count) != 1);
-		kfree(block_group);
+		btrfs_put_block_group(block_group);
 
 		spin_lock(&info->block_group_cache_lock);
 	}
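
The extent-tree.c hunks route all block-group reference counting through
btrfs_get_block_group()/btrfs_put_block_group() and, crucially, take a
reference on behalf of the caching kthread so the group cannot be freed
while the thread still runs; btrfs_free_block_groups() then drops its
reference through the put helper instead of a bare kfree(). The
underlying get/put pattern (types and names hypothetical):

	#include <linux/slab.h>
	#include <asm/atomic.h>

	struct cache_obj {
		atomic_t count;
		/* ... payload ... */
	};

	static void cache_get(struct cache_obj *c)
	{
		atomic_inc(&c->count);
	}

	static void cache_put(struct cache_obj *c)
	{
		/* last reference frees; any holder keeps it alive */
		if (atomic_dec_and_test(&c->count))
			kfree(c);
	}
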
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8..b177ed3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		spin_unlock(&tree->buffer_lock);
 		goto free_eb;
 	}
-	spin_unlock(&tree->buffer_lock);
-
 	/* add one reference for the tree */
 	atomic_inc(&eb->refs);
+	spin_unlock(&tree->buffer_lock);
 	return eb;
 
 free_eb:
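
The alloc_extent_buffer() fix moves the atomic_inc() of eb->refs inside
tree->buffer_lock: taking the tree's reference only after dropping the
lock left a window in which another CPU could find the buffer and free
it first. The rule being applied, as a sketch (lookup helper
hypothetical): claim the reference under the same lock that made the
object findable.

	spin_lock(&tree_lock);
	obj = lookup_or_insert(tree, key);	/* hypothetical lookup */
	if (obj)
		atomic_inc(&obj->refs);	/* still protected here */
	spin_unlock(&tree_lock);
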
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 06550af..a7fd9f3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		}
 		flags = em->flags;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-			if (em->start <= start &&
-			    (!testend || em->start + em->len >= start + len)) {
+			if (testend && em->start + em->len >= start + len) {
 				free_extent_map(em);
 				write_unlock(&em_tree->lock);
 				break;
 			}
-			if (start < em->start) {
-				len = em->start - start;
-			} else {
+			start = em->start + em->len;
+			if (testend)
 				len = start + len - (em->start + em->len);
-				start = em->start + em->len;
-			}
 			free_extent_map(em);
 			write_unlock(&em_tree->lock);
 			continue;
@@ -265,324 +261,253 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * If an extent intersects the range but is not entirely inside the range
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
- *
- * inline_limit is used to tell this code which offsets in the file to keep
- * if they contain inline extents.
  */
-noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 locked_end,
-		       u64 inline_limit, u64 *hint_byte, int drop_cache)
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
+		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
 {
-	u64 extent_end = 0;
-	u64 search_start = start;
-	u64 ram_bytes = 0;
-	u64 disk_bytenr = 0;
-	u64 orig_locked_end = locked_end;
-	u8 compression;
-	u8 encryption;
-	u16 other_encoding = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
-	struct btrfs_file_extent_item *extent;
+	struct btrfs_file_extent_item *fi;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_file_extent_item old;
-	int keep;
-	int slot;
-	int bookend;
-	int found_type = 0;
-	int found_extent;
-	int found_inline;
+	struct btrfs_key new_key;
+	u64 search_start = start;
+	u64 disk_bytenr = 0;
+	u64 num_bytes = 0;
+	u64 extent_offset = 0;
+	u64 extent_end = 0;
+	int del_nr = 0;
+	int del_slot = 0;
+	int extent_type;
 	int recow;
 	int ret;
 
-	inline_limit = 0;
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
 	while (1) {
 		recow = 0;
-		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 					       search_start, -1);
 		if (ret < 0)
-			goto out;
-		if (ret > 0) {
-			if (path->slots[0] == 0) {
-				ret = 0;
-				goto out;
-			}
-			path->slots[0]--;
+			break;
+		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+			if (key.objectid == inode->i_ino &&
+			    key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
 		}
+		ret = 0;
 next_slot:
-		keep = 0;
-		bookend = 0;
-		found_extent = 0;
-		found_inline = 0;
-		compression = 0;
-		encryption = 0;
-		extent = NULL;
 		leaf = path->nodes[0];
-		slot = path->slots[0];
-		ret = 0;
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
-		    key.offset >= end) {
-			goto out;
-		}
-		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
-		    key.objectid != inode->i_ino) {
-			goto out;
-		}
-		if (recow) {
-			search_start = max(key.offset, start);
-			continue;
-		}
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
-			extent = btrfs_item_ptr(leaf, slot,
-						struct btrfs_file_extent_item);
-			found_type = btrfs_file_extent_type(leaf, extent);
-			compression = btrfs_file_extent_compression(leaf,
-								    extent);
-			encryption = btrfs_file_extent_encryption(leaf,
-								  extent);
-			other_encoding = btrfs_file_extent_other_encoding(leaf,
-								  extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG ||
-			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
-				extent_end =
-				     btrfs_file_extent_disk_bytenr(leaf,
-								   extent);
-				if (extent_end)
-					*hint_byte = extent_end;
-
-				extent_end = key.offset +
-				     btrfs_file_extent_num_bytes(leaf, extent);
-				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
-								extent);
-				found_extent = 1;
-			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				found_inline = 1;
-				extent_end = key.offset +
-				     btrfs_file_extent_inline_len(leaf, extent);
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			BUG_ON(del_nr > 0);
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				break;
+			if (ret > 0) {
+				ret = 0;
+				break;
 			}
+			leaf = path->nodes[0];
+			recow = 1;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid > inode->i_ino ||
+		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+			break;
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+			extent_offset = btrfs_file_extent_offset(leaf, fi);
+			extent_end = key.offset +
+				btrfs_file_extent_num_bytes(leaf, fi);
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			extent_end = key.offset +
+				btrfs_file_extent_inline_len(leaf, fi);
 		} else {
+			WARN_ON(1);
 			extent_end = search_start;
 		}
 
-		/* we found nothing we can drop */
-		if ((!found_extent && !found_inline) ||
-		    search_start >= extent_end) {
-			int nextret;
-			u32 nritems;
-			nritems = btrfs_header_nritems(leaf);
-			if (slot >= nritems - 1) {
-				nextret = btrfs_next_leaf(root, path);
-				if (nextret)
-					goto out;
-				recow = 1;
-			} else {
-				path->slots[0]++;
-			}
+		if (extent_end <= search_start) {
+			path->slots[0]++;
 			goto next_slot;
 		}
 
-		if (end <= extent_end && start >= key.offset && found_inline)
-			*hint_byte = EXTENT_MAP_INLINE;
-
-		if (found_extent) {
-			read_extent_buffer(leaf, &old, (unsigned long)extent,
-					   sizeof(old));
-		}
-
-		if (end < extent_end && end >= key.offset) {
-			bookend = 1;
-			if (found_inline && start <= key.offset)
-				keep = 1;
+		search_start = max(key.offset, start);
+		if (recow) {
+			btrfs_release_path(root, path);
+			continue;
 		}
 
-		if (bookend && found_extent) {
-			if (locked_end < extent_end) {
-				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-						locked_end, extent_end - 1,
-						GFP_NOFS);
-				if (!ret) {
-					btrfs_release_path(root, path);
-					lock_extent(&BTRFS_I(inode)->io_tree,
-						locked_end, extent_end - 1,
-						GFP_NOFS);
-					locked_end = extent_end;
-					continue;
-				}
-				locked_end = extent_end;
+		/*
+		 *     | - range to drop - |
+		 *  | -------- extent -------- |
+		 */
+		if (start > key.offset && end < extent_end) {
+			BUG_ON(del_nr > 0);
+			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.offset = start;
+			ret = btrfs_duplicate_item(trans, root, path,
+						   &new_key);
+			if (ret == -EAGAIN) {
+				btrfs_release_path(root, path);
+				continue;
 			}
-			disk_bytenr = le64_to_cpu(old.disk_bytenr);
-			if (disk_bytenr != 0) {
+			if (ret < 0)
+				break;
+
+			leaf = path->nodes[0];
+			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							start - key.offset);
+
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+
+			extent_offset += start - key.offset;
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - start);
+			btrfs_mark_buffer_dirty(leaf);
+
+			if (disk_bytenr > 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
-					   disk_bytenr,
-					   le64_to_cpu(old.disk_num_bytes), 0,
-					   root->root_key.objectid,
-					   key.objectid, key.offset -
-					   le64_to_cpu(old.offset));
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid,
+						new_key.objectid,
+						start - extent_offset);
 				BUG_ON(ret);
+				*hint_byte = disk_bytenr;
 			}
+			key.offset = start;
 		}
+		/*
+		 *  | ---- range to drop ----- |
+		 *      | -------- extent -------- |
+		 */
+		if (start <= key.offset && end < extent_end) {
+			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
 
-		if (found_inline) {
-			u64 mask = root->sectorsize - 1;
-			search_start = (extent_end + mask) & ~mask;
-		} else
-			search_start = extent_end;
-
-		/* truncate existing extent */
-		if (start > key.offset) {
-			u64 new_num;
-			u64 old_num;
-			keep = 1;
-			WARN_ON(start & (root->sectorsize - 1));
-			if (found_extent) {
-				new_num = start - key.offset;
-				old_num = btrfs_file_extent_num_bytes(leaf,
-								      extent);
-				*hint_byte =
-					btrfs_file_extent_disk_bytenr(leaf,
-								      extent);
-				if (btrfs_file_extent_disk_bytenr(leaf,
-								  extent)) {
-					inode_sub_bytes(inode, old_num -
-							new_num);
-				}
-				btrfs_set_file_extent_num_bytes(leaf,
-							extent, new_num);
-				btrfs_mark_buffer_dirty(leaf);
-			} else if (key.offset < inline_limit &&
-				   (end > extent_end) &&
-				   (inline_limit < extent_end)) {
-				u32 new_size;
-				new_size = btrfs_file_extent_calc_inline_size(
-						   inline_limit - key.offset);
-				inode_sub_bytes(inode, extent_end -
-						inline_limit);
-				btrfs_set_file_extent_ram_bytes(leaf, extent,
-							new_size);
-				if (!compression && !encryption) {
-					btrfs_truncate_item(trans, root, path,
-							    new_size, 1);
-				}
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.offset = end;
+			btrfs_set_item_key_safe(trans, root, path, &new_key);
+
+			extent_offset += end - key.offset;
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - end);
+			btrfs_mark_buffer_dirty(leaf);
+			if (disk_bytenr > 0) {
+				inode_sub_bytes(inode, end - key.offset);
+				*hint_byte = disk_bytenr;
 			}
+			break;
 		}
-		/* delete the entire extent */
-		if (!keep) {
-			if (found_inline)
-				inode_sub_bytes(inode, extent_end -
-						key.offset);
-			ret = btrfs_del_item(trans, root, path);
-			/* TODO update progress marker and return */
-			BUG_ON(ret);
-			extent = NULL;
-			btrfs_release_path(root, path);
-			/* the extent will be freed later */
-		}
-		if (bookend && found_inline && start <= key.offset) {
-			u32 new_size;
-			new_size = btrfs_file_extent_calc_inline_size(
-						   extent_end - end);
-			inode_sub_bytes(inode, end - key.offset);
-			btrfs_set_file_extent_ram_bytes(leaf, extent,
-							new_size);
-			if (!compression && !encryption)
-				ret = btrfs_truncate_item(trans, root, path,
-							  new_size, 0);
-			BUG_ON(ret);
-		}
-		/* create bookend, splitting the extent in two */
-		if (bookend && found_extent) {
-			struct btrfs_key ins;
-			ins.objectid = inode->i_ino;
-			ins.offset = end;
-			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
-			btrfs_release_path(root, path);
-			path->leave_spinning = 1;
-			ret = btrfs_insert_empty_item(trans, root, path, &ins,
-						      sizeof(*extent));
-			BUG_ON(ret);
+		search_start = extent_end;
+		/*
+		 *       | ---- range to drop ----- |
+		 *  | -------- extent -------- |
+		 */
+		if (start > key.offset && end >= extent_end) {
+			BUG_ON(del_nr > 0);
+			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
 
-			leaf = path->nodes[0];
-			extent = btrfs_item_ptr(leaf, path->slots[0],
-						struct btrfs_file_extent_item);
-			write_extent_buffer(leaf, &old,
-					    (unsigned long)extent, sizeof(old));
-
-			btrfs_set_file_extent_compression(leaf, extent,
-							  compression);
-			btrfs_set_file_extent_encryption(leaf, extent,
-							 encryption);
-			btrfs_set_file_extent_other_encoding(leaf, extent,
-							     other_encoding);
-			btrfs_set_file_extent_offset(leaf, extent,
-				    le64_to_cpu(old.offset) + end - key.offset);
-			WARN_ON(le64_to_cpu(old.num_bytes) <
-				(extent_end - end));
-			btrfs_set_file_extent_num_bytes(leaf, extent,
-							extent_end - end);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							start - key.offset);
+			btrfs_mark_buffer_dirty(leaf);
+			if (disk_bytenr > 0) {
+				inode_sub_bytes(inode, extent_end - start);
+				*hint_byte = disk_bytenr;
+			}
+			if (end == extent_end)
+				break;
 
-			/*
-			 * set the ram bytes to the size of the full extent
-			 * before splitting.  This is a worst case flag,
-			 * but its the best we can do because we don't know
-			 * how splitting affects compression
-			 */
-			btrfs_set_file_extent_ram_bytes(leaf, extent,
-							ram_bytes);
-			btrfs_set_file_extent_type(leaf, extent, found_type);
-
-			btrfs_unlock_up_safe(path, 1);
-			btrfs_mark_buffer_dirty(path->nodes[0]);
-			btrfs_set_lock_blocking(path->nodes[0]);
-
-			path->leave_spinning = 0;
-			btrfs_release_path(root, path);
-			if (disk_bytenr != 0)
-				inode_add_bytes(inode, extent_end - end);
+			path->slots[0]++;
+			goto next_slot;
 		}
 
-		if (found_extent && !keep) {
-			u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
+		/*
+		 *  | ---- range to drop ----- |
+		 *    | ------ extent ------ |
+		 */
+		if (start <= key.offset && end >= extent_end) {
+			if (del_nr == 0) {
+				del_slot = path->slots[0];
+				del_nr = 1;
+			} else {
+				BUG_ON(del_slot + del_nr != path->slots[0]);
+				del_nr++;
+			}
 
-			if (old_disk_bytenr != 0) {
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 				inode_sub_bytes(inode,
-						le64_to_cpu(old.num_bytes));
+						extent_end - key.offset);
+				extent_end = ALIGN(extent_end,
+						   root->sectorsize);
+			} else if (disk_bytenr > 0) {
 				ret = btrfs_free_extent(trans, root,
-						old_disk_bytenr,
-						le64_to_cpu(old.disk_num_bytes),
-						0, root->root_key.objectid,
+						disk_bytenr, num_bytes, 0,
+						root->root_key.objectid,
 						key.objectid, key.offset -
-						le64_to_cpu(old.offset));
+						extent_offset);
 				BUG_ON(ret);
-				*hint_byte = old_disk_bytenr;
+				inode_sub_bytes(inode,
+						extent_end - key.offset);
+				*hint_byte = disk_bytenr;
 			}
-		}
 
-		if (search_start >= end) {
-			ret = 0;
-			goto out;
+			if (end == extent_end)
+				break;
+
+			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+				path->slots[0]++;
+				goto next_slot;
+			}
+
+			ret = btrfs_del_items(trans, root, path, del_slot,
+					      del_nr);
+			BUG_ON(ret);
+
+			del_nr = 0;
+			del_slot = 0;
+
+			btrfs_release_path(root, path);
+			continue;
 		}
+
+		BUG_ON(1);
 	}
-out:
-	btrfs_free_path(path);
-	if (locked_end > orig_locked_end) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
-			      locked_end - 1, GFP_NOFS);
+
+	if (del_nr > 0) {
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		BUG_ON(ret);
 	}
+
+	btrfs_free_path(path);
 	return ret;
 }
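
The rewritten btrfs_drop_extents() above handles four ways the drop range
[start, end) can overlap a file extent [key.offset, extent_end), one per
ASCII diagram: split the extent in two, trim its head, trim its tail, or
delete the item outright.  A minimal userspace C sketch of just that
classification (illustrative names, not the btrfs API):

#include <stdio.h>

typedef unsigned long long u64;

/*
 * How does the drop range [start, end) overlap the extent
 * [off, ext_end)?  Mirrors the four cases (and their order) in the
 * rewritten btrfs_drop_extents().
 */
static int classify(u64 start, u64 end, u64 off, u64 ext_end)
{
	if (start > off && end < ext_end)
		return 1;	/* range inside extent: split it in two */
	if (start <= off && end < ext_end)
		return 2;	/* trim the head of the extent */
	if (start > off && end >= ext_end)
		return 3;	/* trim the tail of the extent */
	return 4;		/* extent fully covered: delete the item */
}

int main(void)
{
	printf("%d\n", classify(10, 20, 5, 30));	/* 1 */
	printf("%d\n", classify(0, 20, 5, 30));		/* 2 */
	printf("%d\n", classify(10, 40, 5, 30));	/* 3 */
	printf("%d\n", classify(0, 40, 5, 30));		/* 4 */
	return 0;
}
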
 
 static int extent_mergeable(struct extent_buffer *leaf, int slot,
-			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
+			    u64 objectid, u64 bytenr, u64 orig_offset,
+			    u64 *start, u64 *end)
 {
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
@@ -598,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
 	    btrfs_file_extent_compression(leaf, fi) ||
 	    btrfs_file_extent_encryption(leaf, fi) ||
 	    btrfs_file_extent_other_encoding(leaf, fi))
@@ -620,23 +546,24 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
  * two or three.
  */
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
 			      struct inode *inode, u64 start, u64 end)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	struct btrfs_path *path;
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
+	struct btrfs_key new_key;
 	u64 bytenr;
 	u64 num_bytes;
 	u64 extent_end;
 	u64 orig_offset;
 	u64 other_start;
 	u64 other_end;
-	u64 split = start;
-	u64 locked_end = end;
-	int extent_type;
-	int split_end = 1;
+	u64 split;
+	int del_nr = 0;
+	int del_slot = 0;
+	int recow;
 	int ret;
 
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -644,12 +571,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 again:
+	recow = 0;
+	split = start;
 	key.objectid = inode->i_ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
-	if (split == start)
-		key.offset = split;
-	else
-		key.offset = split - 1;
+	key.offset = split;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0 && path->slots[0] > 0)
@@ -661,159 +587,158 @@ again:
 	       key.type != BTRFS_EXTENT_DATA_KEY);
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
-	extent_type = btrfs_file_extent_type(leaf, fi);
-	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
+	       BTRFS_FILE_EXTENT_PREALLOC);
 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 	BUG_ON(key.offset > start || extent_end < end);
 
 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+	memcpy(&new_key, &key, sizeof(new_key));
 
-	if (key.offset == start)
-		split = end;
-
-	if (key.offset == start && extent_end == end) {
-		int del_nr = 0;
-		int del_slot = 0;
-		other_start = end;
-		other_end = 0;
-		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
-				     bytenr, &other_start, &other_end)) {
-			extent_end = other_end;
-			del_slot = path->slots[0] + 1;
-			del_nr++;
-			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						0, root->root_key.objectid,
-						inode->i_ino, orig_offset);
-			BUG_ON(ret);
-		}
+	if (start == key.offset && end < extent_end) {
 		other_start = 0;
 		other_end = start;
-		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
-				     bytenr, &other_start, &other_end)) {
-			key.offset = other_start;
-			del_slot = path->slots[0];
-			del_nr++;
-			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						0, root->root_key.objectid,
-						inode->i_ino, orig_offset);
-			BUG_ON(ret);
-		}
-		split_end = 0;
-		if (del_nr == 0) {
-			btrfs_set_file_extent_type(leaf, fi,
-						   BTRFS_FILE_EXTENT_REG);
-			goto done;
-		}
-
-		fi = btrfs_item_ptr(leaf, del_slot - 1,
-				    struct btrfs_file_extent_item);
-		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
-		btrfs_set_file_extent_num_bytes(leaf, fi,
-						extent_end - key.offset);
-		btrfs_mark_buffer_dirty(leaf);
-
-		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-		BUG_ON(ret);
-		goto release;
-	} else if (split == start) {
-		if (locked_end < extent_end) {
-			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
-			if (!ret) {
-				btrfs_release_path(root, path);
-				lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
-				locked_end = extent_end;
-				goto again;
-			}
-			locked_end = extent_end;
+		if (extent_mergeable(leaf, path->slots[0] - 1,
+				     inode->i_ino, bytenr, orig_offset,
+				     &other_start, &other_end)) {
+			new_key.offset = end;
+			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_end - end);
+			btrfs_set_file_extent_offset(leaf, fi,
+						     end - orig_offset);
+			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							end - other_start);
+			btrfs_mark_buffer_dirty(leaf);
+			goto out;
 		}
-		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
-	} else  {
-		BUG_ON(key.offset != start);
-		key.offset = split;
-		btrfs_set_file_extent_offset(leaf, fi, key.offset -
-					     orig_offset);
-		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
-		btrfs_set_item_key_safe(trans, root, path, &key);
-		extent_end = split;
 	}
 
-	if (extent_end == end) {
-		split_end = 0;
-		extent_type = BTRFS_FILE_EXTENT_REG;
-	}
-	if (extent_end == end && split == start) {
+	if (start > key.offset && end == extent_end) {
 		other_start = end;
 		other_end = 0;
-		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
-				     bytenr, &other_start, &other_end)) {
-			path->slots[0]++;
+		if (extent_mergeable(leaf, path->slots[0] + 1,
+				     inode->i_ino, bytenr, orig_offset,
+				     &other_start, &other_end)) {
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			key.offset = split;
-			btrfs_set_item_key_safe(trans, root, path, &key);
-			btrfs_set_file_extent_offset(leaf, fi, key.offset -
-						     orig_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
-							other_end - split);
-			goto done;
-		}
-	}
-	if (extent_end == end && split == end) {
-		other_start = 0;
-		other_end = start;
-		if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
-				     bytenr, &other_start, &other_end)) {
-			path->slots[0]--;
+							start - key.offset);
+			path->slots[0]++;
+			new_key.offset = start;
+			btrfs_set_item_key_safe(trans, root, path, &new_key);
+
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
-							other_start);
-			goto done;
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							other_end - start);
+			btrfs_set_file_extent_offset(leaf, fi,
+						     start - orig_offset);
+			btrfs_mark_buffer_dirty(leaf);
+			goto out;
 		}
 	}
 
-	btrfs_mark_buffer_dirty(leaf);
+	while (start > key.offset || end < extent_end) {
+		if (key.offset == start)
+			split = end;
 
-	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
-				   root->root_key.objectid,
-				   inode->i_ino, orig_offset);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
+		new_key.offset = split;
+		ret = btrfs_duplicate_item(trans, root, path, &new_key);
+		if (ret == -EAGAIN) {
+			btrfs_release_path(root, path);
+			goto again;
+		}
+		BUG_ON(ret < 0);
 
-	key.offset = start;
-	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
-	BUG_ON(ret);
+		leaf = path->nodes[0];
+		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						split - key.offset);
 
-	leaf = path->nodes[0];
-	fi = btrfs_item_ptr(leaf, path->slots[0],
-			    struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-	btrfs_set_file_extent_type(leaf, fi, extent_type);
-	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
-	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
-	btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
-	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
-	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
-	btrfs_set_file_extent_compression(leaf, fi, 0);
-	btrfs_set_file_extent_encryption(leaf, fi, 0);
-	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
-done:
-	btrfs_mark_buffer_dirty(leaf);
-
-release:
-	btrfs_release_path(root, path);
-	if (split_end && split == start) {
-		split = end;
-		goto again;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - split);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+					   root->root_key.objectid,
+					   inode->i_ino, orig_offset);
+		BUG_ON(ret);
+
+		if (split == start) {
+			key.offset = start;
+		} else {
+			BUG_ON(start != key.offset);
+			path->slots[0]--;
+			extent_end = end;
+		}
+		recow = 1;
+	}
+
+	other_start = end;
+	other_end = 0;
+	if (extent_mergeable(leaf, path->slots[0] + 1,
+			     inode->i_ino, bytenr, orig_offset,
+			     &other_start, &other_end)) {
+		if (recow) {
+			btrfs_release_path(root, path);
+			goto again;
+		}
+		extent_end = other_end;
+		del_slot = path->slots[0] + 1;
+		del_nr++;
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					0, root->root_key.objectid,
+					inode->i_ino, orig_offset);
+		BUG_ON(ret);
+	}
+	other_start = 0;
+	other_end = start;
+	if (extent_mergeable(leaf, path->slots[0] - 1,
+			     inode->i_ino, bytenr, orig_offset,
+			     &other_start, &other_end)) {
+		if (recow) {
+			btrfs_release_path(root, path);
+			goto again;
+		}
+		key.offset = other_start;
+		del_slot = path->slots[0];
+		del_nr++;
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					0, root->root_key.objectid,
+					inode->i_ino, orig_offset);
+		BUG_ON(ret);
 	}
-	if (locked_end > end) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
-			      GFP_NOFS);
+	if (del_nr == 0) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+			   struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi,
+					   BTRFS_FILE_EXTENT_REG);
+		btrfs_mark_buffer_dirty(leaf);
+	} else {
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+			   struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi,
+					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - key.offset);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		BUG_ON(ret);
 	}
+out:
 	btrfs_free_path(path);
 	return 0;
 }
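
As the comment before btrfs_mark_extent_written() notes, converting the
written part of a preallocated extent to a regular one can leave two or
three extent items, before any merging with neighbours.  A standalone
sketch of that piece count (illustrative only):

#include <assert.h>

typedef unsigned long long u64;

/*
 * Number of extent items after marking [start, end) written inside a
 * preallocated extent [off, ext_end), ignoring merges with neighbours:
 * 1 if the whole extent was written, 2 if the written range touches
 * one edge, 3 if it sits strictly inside.
 */
static int pieces_after_split(u64 off, u64 ext_end, u64 start, u64 end)
{
	int n = 1;

	if (start > off)
		n++;		/* untouched prealloc head survives */
	if (end < ext_end)
		n++;		/* untouched prealloc tail survives */
	return n;
}

int main(void)
{
	assert(pieces_after_split(0, 100, 0, 100) == 1);
	assert(pieces_after_split(0, 100, 0, 50) == 2);
	assert(pieces_after_split(0, 100, 25, 75) == 3);
	return 0;
}
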
@@ -1210,7 +1135,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_lock(&dentry->d_inode->i_mutex);
 out:
-	return ret > 0 ? EIO : ret;
+	return ret > 0 ? -EIO : ret;
 }
 
 static const struct vm_operations_struct btrfs_file_vm_ops = {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3ad168..e03a836 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
 
-static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+				     struct inode *inode,  struct inode *dir)
 {
 	int err;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_acl(trans, inode, dir);
 	if (!err)
-		err = btrfs_xattr_security_init(inode, dir);
+		err = btrfs_xattr_security_init(trans, inode, dir);
 	return err;
 }
 
@@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
+	/*
+	 * this is an inline extent, so nobody can
+	 * extend the file past i_size without locking
+	 * a page we already have locked.
+	 *
+	 * We must do any isize and inode updates
+	 * before we unlock the pages.  Otherwise we
+	 * could end up racing with unlink.
+	 */
 	BTRFS_I(inode)->disk_i_size = inode->i_size;
 	btrfs_update_inode(trans, root, inode);
+
 	return 0;
 fail:
 	btrfs_free_path(path);
@@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, start,
+	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
 				 &hint_byte, 1);
 	BUG_ON(ret);
 
@@ -416,7 +426,6 @@ again:
 						    start, end,
 						    total_compressed, pages);
 		}
-		btrfs_end_transaction(trans, root);
 		if (ret == 0) {
 			/*
 			 * inline extent creation worked, we don't need
@@ -430,9 +439,11 @@ again:
 			     EXTENT_CLEAR_DELALLOC |
 			     EXTENT_CLEAR_ACCOUNTING |
 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
-			ret = 0;
+
+			btrfs_end_transaction(trans, root);
 			goto free_pages_out;
 		}
+		btrfs_end_transaction(trans, root);
 	}
 
 	if (will_compress) {
@@ -543,7 +554,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
 	if (list_empty(&async_cow->extents))
 		return 0;
 
-	trans = btrfs_join_transaction(root, 1);
 
 	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
@@ -590,19 +600,15 @@ retry:
 		lock_extent(io_tree, async_extent->start,
 			    async_extent->start + async_extent->ram_size - 1,
 			    GFP_NOFS);
-		/*
-		 * here we're doing allocation and writeback of the
-		 * compressed pages
-		 */
-		btrfs_drop_extent_cache(inode, async_extent->start,
-					async_extent->start +
-					async_extent->ram_size - 1, 0);
 
+		trans = btrfs_join_transaction(root, 1);
 		ret = btrfs_reserve_extent(trans, root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
 					   0, alloc_hint,
 					   (u64)-1, &ins, 1);
+		btrfs_end_transaction(trans, root);
+
 		if (ret) {
 			int i;
 			for (i = 0; i < async_extent->nr_pages; i++) {
@@ -618,6 +624,14 @@ retry:
 			goto retry;
 		}
 
+		/*
+		 * here we're doing allocation and writeback of the
+		 * compressed pages
+		 */
+		btrfs_drop_extent_cache(inode, async_extent->start,
+					async_extent->start +
+					async_extent->ram_size - 1, 0);
+
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
@@ -649,8 +663,6 @@ retry:
 					       BTRFS_ORDERED_COMPRESSED);
 		BUG_ON(ret);
 
-		btrfs_end_transaction(trans, root);
-
 		/*
 		 * clear dirty, set writeback and unlock the pages.
 		 */
@@ -672,13 +684,11 @@ retry:
 				    async_extent->nr_pages);
 
 		BUG_ON(ret);
-		trans = btrfs_join_transaction(root, 1);
 		alloc_hint = ins.objectid + ins.offset;
 		kfree(async_extent);
 		cond_resched();
 	}
 
-	btrfs_end_transaction(trans, root);
 	return 0;
 }
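
The hunk above narrows the transaction scope in
submit_compressed_extents(): instead of one joined transaction held
across the whole loop (including page writeback), a transaction is now
opened only around btrfs_reserve_extent() and ended immediately.  A
userspace sketch of the same scope-narrowing idiom, with a mutex
standing in for the transaction (illustrative, not kernel code):

#include <pthread.h>

static pthread_mutex_t trans = PTHREAD_MUTEX_INITIALIZER;

struct work { int nr; };

static void reserve_space(struct work *w) { (void)w; /* fast step */ }
static void write_pages(struct work *w)   { (void)w; /* slow I/O */ }

static void submit_all(struct work *items, int n)
{
	for (int i = 0; i < n; i++) {
		/* hold the shared resource only for the short step... */
		pthread_mutex_lock(&trans);
		reserve_space(&items[i]);
		pthread_mutex_unlock(&trans);

		/* ...and do the long I/O without blocking others */
		write_pages(&items[i]);
	}
}

int main(void)
{
	struct work items[4] = { {0}, {1}, {2}, {3} };
	submit_all(items, 4);
	return 0;
}
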
 
@@ -742,6 +752,7 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
+
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
@@ -1596,7 +1607,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 				       struct inode *inode, u64 file_pos,
 				       u64 disk_bytenr, u64 disk_num_bytes,
 				       u64 num_bytes, u64 ram_bytes,
-				       u64 locked_end,
 				       u8 compression, u8 encryption,
 				       u16 other_encoding, int extent_type)
 {
@@ -1622,9 +1632,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, locked_end,
-				 file_pos, &hint, 0);
+	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+				 &hint, 0);
 	BUG_ON(ret);
 
 	ins.objectid = inode->i_ino;
@@ -1671,24 +1680,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
  * before we start the transaction.  It limits the amount of btree
  * reads required while inside the transaction.
  */
-static noinline void reada_csum(struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_ordered_extent *ordered_extent)
-{
-	struct btrfs_ordered_sum *sum;
-	u64 bytenr;
-
-	sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
-			 list);
-	bytenr = sum->sums[0].bytenr;
-
-	/*
-	 * we don't care about the results, the point of this search is
-	 * just to get the btree leaves into ram
-	 */
-	btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
-}
-
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1699,7 +1690,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_path *path;
 	int compressed = 0;
 	int ret;
 
@@ -1707,46 +1697,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (!ret)
 		return 0;
 
-	/*
-	 * before we join the transaction, try to do some of our IO.
-	 * This will limit the amount of IO that we have to do with
-	 * the transaction running.  We're unlikely to need to do any
-	 * IO if the file extents are new, the disk_i_size checks
-	 * covers the most common case.
-	 */
-	if (start < BTRFS_I(inode)->disk_i_size) {
-		path = btrfs_alloc_path();
-		if (path) {
-			ret = btrfs_lookup_file_extent(NULL, root, path,
-						       inode->i_ino,
-						       start, 0);
-			ordered_extent = btrfs_lookup_ordered_extent(inode,
-								     start);
-			if (!list_empty(&ordered_extent->list)) {
-				btrfs_release_path(root, path);
-				reada_csum(root, path, ordered_extent);
-			}
-			btrfs_free_path(path);
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	BUG_ON(!ordered_extent);
+
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+		BUG_ON(!list_empty(&ordered_extent->list));
+		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (!ret) {
+			trans = btrfs_join_transaction(root, 1);
+			ret = btrfs_update_inode(trans, root, inode);
+			BUG_ON(ret);
+			btrfs_end_transaction(trans, root);
 		}
+		goto out;
 	}
 
-	trans = btrfs_join_transaction(root, 1);
-
-	if (!ordered_extent)
-		ordered_extent = btrfs_lookup_ordered_extent(inode, start);
-	BUG_ON(!ordered_extent);
-	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
-		goto nocow;
-
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
 
+	trans = btrfs_join_transaction(root, 1);
+
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compressed = 1;
 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
 		BUG_ON(compressed);
-		ret = btrfs_mark_extent_written(trans, root, inode,
+		ret = btrfs_mark_extent_written(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->file_offset +
 						ordered_extent->len);
@@ -1758,8 +1734,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 						ordered_extent->disk_len,
 						ordered_extent->len,
 						ordered_extent->len,
-						ordered_extent->file_offset +
-						ordered_extent->len,
 						compressed, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
@@ -1770,22 +1744,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
-nocow:
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	mutex_lock(&BTRFS_I(inode)->extent_mutex);
-	btrfs_ordered_update_i_size(inode, ordered_extent);
-	btrfs_update_inode(trans, root, inode);
-	btrfs_remove_ordered_extent(inode, ordered_extent);
-	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
+	/* this also removes the ordered extent from the tree */
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
+	btrfs_end_transaction(trans, root);
+out:
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	btrfs_end_transaction(trans, root);
 	return 0;
 }
 
@@ -2008,6 +1980,54 @@ zeroit:
 	return -EIO;
 }
 
+struct delayed_iput {
+	struct list_head list;
+	struct inode *inode;
+};
+
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct delayed_iput *delayed;
+
+	if (atomic_add_unless(&inode->i_count, -1, 1))
+		return;
+
+	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+	delayed->inode = inode;
+
+	spin_lock(&fs_info->delayed_iput_lock);
+	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+	spin_unlock(&fs_info->delayed_iput_lock);
+}
+
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+	LIST_HEAD(list);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct delayed_iput *delayed;
+	int empty;
+
+	spin_lock(&fs_info->delayed_iput_lock);
+	empty = list_empty(&fs_info->delayed_iputs);
+	spin_unlock(&fs_info->delayed_iput_lock);
+	if (empty)
+		return;
+
+	down_read(&root->fs_info->cleanup_work_sem);
+	spin_lock(&fs_info->delayed_iput_lock);
+	list_splice_init(&fs_info->delayed_iputs, &list);
+	spin_unlock(&fs_info->delayed_iput_lock);
+
+	while (!list_empty(&list)) {
+		delayed = list_entry(list.next, struct delayed_iput, list);
+		list_del(&delayed->list);
+		iput(delayed->inode);
+		kfree(delayed);
+	}
+	up_read(&root->fs_info->cleanup_work_sem);
+}
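
btrfs_add_delayed_iput() relies on
atomic_add_unless(&inode->i_count, -1, 1): drop a reference only if it
is not the last one, and otherwise queue the inode so the final iput()
runs later, outside the current context.  A userspace model of that
drop-unless-last primitive using C11 atomics (a sketch, not kernel
code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Return true if we dropped a reference, false if ours was the last. */
static bool put_unless_last(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 1) {
		if (atomic_compare_exchange_weak(count, &old, old - 1))
			return true;
	}
	return false;	/* caller must defer the final release */
}

int main(void)
{
	atomic_int refs = 2;

	printf("%d\n", put_unless_last(&refs));	/* 1: dropped 2 -> 1 */
	printf("%d\n", put_unless_last(&refs));	/* 0: last ref, defer */
	return 0;
}
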
+
 /*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
@@ -2080,16 +2100,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
-	path = btrfs_alloc_path();
-	if (!path)
+	if (!xchg(&root->clean_orphans, 0))
 		return;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 	path->reada = -1;
 
 	key.objectid = BTRFS_ORPHAN_OBJECTID;
 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
 	key.offset = (u64)-1;
 
-
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0) {
@@ -2834,37 +2855,40 @@ out:
  * min_type is the minimum key type to truncate down to.  If set to 0, this
  * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct inode *inode,
-					u64 new_size, u32 min_type)
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct inode *inode,
+			       u64 new_size, u32 min_type)
 {
-	int ret;
 	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	u32 found_type = (u8)-1;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
 	u64 extent_start = 0;
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
+	u64 mask = root->sectorsize - 1;
+	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
 	int encoding;
-	u64 mask = root->sectorsize - 1;
+	int ret;
+	int err = 0;
+
+	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
 
 	if (root->ref_cows)
 		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	path->reada = -1;
 
-	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
@@ -2872,17 +2896,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 search_again:
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0)
-		goto error;
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
 
 	if (ret > 0) {
 		/* there are no items in the tree for us to truncate, we're
 		 * done
 		 */
-		if (path->slots[0] == 0) {
-			ret = 0;
-			goto error;
-		}
+		if (path->slots[0] == 0)
+			goto out;
 		path->slots[0]--;
 	}
 
@@ -2917,28 +2941,17 @@ search_again:
 			}
 			item_end--;
 		}
-		if (item_end < new_size) {
-			if (found_type == BTRFS_DIR_ITEM_KEY)
-				found_type = BTRFS_INODE_ITEM_KEY;
-			else if (found_type == BTRFS_EXTENT_ITEM_KEY)
-				found_type = BTRFS_EXTENT_DATA_KEY;
-			else if (found_type == BTRFS_EXTENT_DATA_KEY)
-				found_type = BTRFS_XATTR_ITEM_KEY;
-			else if (found_type == BTRFS_XATTR_ITEM_KEY)
-				found_type = BTRFS_INODE_REF_KEY;
-			else if (found_type)
-				found_type--;
-			else
+		if (found_type > min_type) {
+			del_item = 1;
+		} else {
+			if (item_end < new_size)
 				break;
-			btrfs_set_key_type(&key, found_type);
-			goto next;
+			if (found_key.offset >= new_size)
+				del_item = 1;
+			else
+				del_item = 0;
 		}
-		if (found_key.offset >= new_size)
-			del_item = 1;
-		else
-			del_item = 0;
 		found_extent = 0;
-
 		/* FIXME, shrink the extent if the ref count is only 1 */
 		if (found_type != BTRFS_EXTENT_DATA_KEY)
 			goto delete;
@@ -3025,42 +3038,36 @@ delete:
 						inode->i_ino, extent_offset);
 			BUG_ON(ret);
 		}
-next:
-		if (path->slots[0] == 0) {
-			if (pending_del_nr)
-				goto del_pending;
-			btrfs_release_path(root, path);
-			if (found_type == BTRFS_INODE_ITEM_KEY)
-				break;
-			goto search_again;
-		}
 
-		path->slots[0]--;
-		if (pending_del_nr &&
-		    path->slots[0] + 1 != pending_del_slot) {
-			struct btrfs_key debug;
-del_pending:
-			btrfs_item_key_to_cpu(path->nodes[0], &debug,
-					      pending_del_slot);
-			ret = btrfs_del_items(trans, root, path,
-					      pending_del_slot,
-					      pending_del_nr);
-			BUG_ON(ret);
-			pending_del_nr = 0;
+		if (found_type == BTRFS_INODE_ITEM_KEY)
+			break;
+
+		if (path->slots[0] == 0 ||
+		    path->slots[0] != pending_del_slot) {
+			if (root->ref_cows) {
+				err = -EAGAIN;
+				goto out;
+			}
+			if (pending_del_nr) {
+				ret = btrfs_del_items(trans, root, path,
+						pending_del_slot,
+						pending_del_nr);
+				BUG_ON(ret);
+				pending_del_nr = 0;
+			}
 			btrfs_release_path(root, path);
-			if (found_type == BTRFS_INODE_ITEM_KEY)
-				break;
 			goto search_again;
+		} else {
+			path->slots[0]--;
 		}
 	}
-	ret = 0;
-error:
+out:
 	if (pending_del_nr) {
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
 	}
 	btrfs_free_path(path);
-	return ret;
+	return err;
 }
 
 /*
@@ -3180,10 +3187,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
-	if (err)
-		return err;
-
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
@@ -3196,9 +3199,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		btrfs_put_ordered_extent(ordered);
 	}
 
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
 	cur_offset = hole_start;
 	while (1) {
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
@@ -3206,40 +3206,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		BUG_ON(IS_ERR(em) || !em);
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
-		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
-			err = btrfs_drop_extents(trans, root, inode,
-						 cur_offset,
-						 cur_offset + hole_size,
-						 block_end,
-						 cur_offset, &hint_byte, 1);
-			if (err)
-				break;
 
-			err = btrfs_reserve_metadata_space(root, 1);
+			err = btrfs_reserve_metadata_space(root, 2);
 			if (err)
 				break;
 
+			trans = btrfs_start_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+
+			err = btrfs_drop_extents(trans, inode, cur_offset,
+						 cur_offset + hole_size,
+						 &hint_byte, 1);
+			BUG_ON(err);
+
 			err = btrfs_insert_file_extent(trans, root,
 					inode->i_ino, cur_offset, 0,
 					0, hole_size, 0, hole_size,
 					0, 0, 0);
+			BUG_ON(err);
+
 			btrfs_drop_extent_cache(inode, hole_start,
 					last_byte - 1, 0);
-			btrfs_unreserve_metadata_space(root, 1);
+
+			btrfs_end_transaction(trans, root);
+			btrfs_unreserve_metadata_space(root, 2);
 		}
 		free_extent_map(em);
 		cur_offset = last_byte;
-		if (err || cur_offset >= block_end)
+		if (cur_offset >= block_end)
 			break;
 	}
 
-	btrfs_end_transaction(trans, root);
 	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 	return err;
 }
 
+static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	unsigned long nr;
+	int ret;
+
+	if (attr->ia_size == inode->i_size)
+		return 0;
+
+	if (attr->ia_size > inode->i_size) {
+		unsigned long limit;
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (attr->ia_size > inode->i_sb->s_maxbytes)
+			return -EFBIG;
+		if (limit != RLIM_INFINITY && attr->ia_size > limit) {
+			send_sig(SIGXFSZ, current, 0);
+			return -EFBIG;
+		}
+	}
+
+	ret = btrfs_reserve_metadata_space(root, 1);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = btrfs_orphan_add(trans, inode);
+	BUG_ON(ret);
+
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+	btrfs_unreserve_metadata_space(root, 1);
+	btrfs_btree_balance_dirty(root, nr);
+
+	if (attr->ia_size > inode->i_size) {
+		ret = btrfs_cont_expand(inode, attr->ia_size);
+		if (ret) {
+			btrfs_truncate(inode);
+			return ret;
+		}
+
+		i_size_write(inode, attr->ia_size);
+		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+		if (inode->i_nlink > 0) {
+			ret = btrfs_orphan_del(trans, inode);
+			BUG_ON(ret);
+		}
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+		return 0;
+	}
+
+	/*
+	 * We're truncating a file that used to have good data down to
+	 * zero. Make sure it gets into the ordered flush list so that
+	 * any new writes get down to disk quickly.
+	 */
+	if (attr->ia_size == 0)
+		BTRFS_I(inode)->ordered_data_close = 1;
+
+	/* we don't support swapfiles, so vmtruncate shouldn't fail */
+	ret = vmtruncate(inode, attr->ia_size);
+	BUG_ON(ret);
+
+	return 0;
+}
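
When growing the file, btrfs_setattr_size() now performs the s_maxbytes
and RLIMIT_FSIZE checks itself before doing any work.  The resource
limit part of that check can be mirrored from userspace (a sketch; the
kernel reads current->signal->rlim directly and sends SIGXFSZ on
failure):

#include <sys/resource.h>
#include <errno.h>
#include <stdio.h>

/* Mirror the grow-path check: refuse sizes beyond RLIMIT_FSIZE. */
static int check_new_size(unsigned long long new_size)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_FSIZE, &rl) != 0)
		return -errno;
	if (rl.rlim_cur != RLIM_INFINITY && new_size > rl.rlim_cur) {
		/* the kernel would send_sig(SIGXFSZ, current, 0) here */
		return -EFBIG;
	}
	return 0;
}

int main(void)
{
	printf("check: %d\n", check_new_size(1ULL << 20));
	return 0;
}
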
+
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -3250,23 +3330,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return err;
 
 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-		if (attr->ia_size > inode->i_size) {
-			err = btrfs_cont_expand(inode, attr->ia_size);
-			if (err)
-				return err;
-		} else if (inode->i_size > 0 &&
-			   attr->ia_size == 0) {
-
-			/* we're truncating a file that used to have good
-			 * data down to zero.  Make sure it gets into
-			 * the ordered flush list so that any new writes
-			 * get down to disk quickly.
-			 */
-			BTRFS_I(inode)->ordered_data_close = 1;
-		}
+		err = btrfs_setattr_size(inode, attr);
+		if (err)
+			return err;
 	}
+	attr->ia_valid &= ~ATTR_SIZE;
 
-	err = inode_setattr(inode, attr);
+	if (attr->ia_valid)
+		err = inode_setattr(inode, attr);
 
 	if (!err && ((attr->ia_valid & ATTR_MODE)))
 		err = btrfs_acl_chmod(inode);
@@ -3287,36 +3358,43 @@ void btrfs_delete_inode(struct inode *inode)
 	}
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+	if (root->fs_info->log_root_recovering) {
+		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+		goto no_delete;
+	}
+
 	if (inode->i_nlink > 0) {
 		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
 		goto no_delete;
 	}
 
 	btrfs_i_size_write(inode, 0);
-	trans = btrfs_join_transaction(root, 1);
 
-	btrfs_set_trans_block_group(trans, inode);
-	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
-	if (ret) {
-		btrfs_orphan_del(NULL, inode);
-		goto no_delete_lock;
-	}
+	while (1) {
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
 
-	btrfs_orphan_del(trans, inode);
+		if (ret != -EAGAIN)
+			break;
 
-	nr = trans->blocks_used;
-	clear_inode(inode);
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+		trans = NULL;
+		btrfs_btree_balance_dirty(root, nr);
+	}
 
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
-	return;
+	if (ret == 0) {
+		ret = btrfs_orphan_del(trans, inode);
+		BUG_ON(ret);
+	}
 
-no_delete_lock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
 no_delete:
 	clear_inode(inode);
+	return;
 }
 
 /*
@@ -3569,7 +3647,6 @@ static noinline void init_btrfs_i(struct inode *inode)
 	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
 }
 
@@ -3695,6 +3772,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
+	if (root != sub_root) {
+		down_read(&root->fs_info->cleanup_work_sem);
+		if (!(inode->i_sb->s_flags & MS_RDONLY))
+			btrfs_orphan_cleanup(sub_root);
+		up_read(&root->fs_info->cleanup_work_sem);
+	}
+
 	return inode;
 }
 
@@ -3869,7 +3953,11 @@ skip:
 
 	/* Reached end of directory/root. Bump pos past the last item. */
 	if (key_type == BTRFS_DIR_INDEX_KEY)
-		filp->f_pos = INT_LIMIT(off_t);
+		/*
+		 * 32-bit glibc will use getdents64, but then strtol -
+		 * so the last number we can serve is this.
+		 */
+		filp->f_pos = 0x7fffffff;
 	else
 		filp->f_pos++;
 nopos:
@@ -4219,7 +4307,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -4290,7 +4378,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -4336,6 +4424,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		return -ENOENT;
 
+	/* do not allow sys_link's with other subvols of the same device */
+	if (root->objectid != BTRFS_I(inode)->root->objectid)
+		return -EPERM;
+
 	/*
 	 * 1 item for inode ref
 	 * 2 items for dir items
@@ -4423,7 +4515,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	drop_on_err = 1;
 
-	err = btrfs_init_inode_security(inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir);
 	if (err)
 		goto out_fail;
 
@@ -5074,17 +5166,20 @@ static void btrfs_truncate(struct inode *inode)
 	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
 
-	if (!S_ISREG(inode->i_mode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+	if (!S_ISREG(inode->i_mode)) {
+		WARN_ON(1);
 		return;
+	}
 
 	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
 	if (ret)
 		return;
+
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
@@ -5106,21 +5201,32 @@ static void btrfs_truncate(struct inode *inode)
 	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
 		btrfs_add_ordered_operation(trans, root, inode);
 
-	btrfs_set_trans_block_group(trans, inode);
-	btrfs_i_size_write(inode, inode->i_size);
+	while (1) {
+		ret = btrfs_truncate_inode_items(trans, root, inode,
+						 inode->i_size,
+						 BTRFS_EXTENT_DATA_KEY);
+		if (ret != -EAGAIN)
+			break;
 
-	ret = btrfs_orphan_add(trans, inode);
-	if (ret)
-		goto out;
-	/* FIXME, add redo link to tree so we don't leak on crash */
-	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
-				      BTRFS_EXTENT_DATA_KEY);
-	btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
 
-	ret = btrfs_orphan_del(trans, inode);
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+	}
+
+	if (ret == 0 && inode->i_nlink > 0) {
+		ret = btrfs_orphan_del(trans, inode);
+		BUG_ON(ret);
+	}
+
+	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 
-out:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 	BUG_ON(ret);
@@ -5217,9 +5323,9 @@ void btrfs_destroy_inode(struct inode *inode)
 
 	spin_lock(&root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
-		       " list\n", inode->i_ino);
-		dump_stack();
+		printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
+		       inode->i_ino);
+		list_del_init(&BTRFS_I(inode)->i_orphan);
 	}
 	spin_unlock(&root->list_lock);
 
@@ -5476,7 +5582,7 @@ out_fail:
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
@@ -5495,7 +5601,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
 			filemap_flush(inode->i_mapping);
-			iput(inode);
+			if (delay_iput)
+				btrfs_add_delayed_iput(inode);
+			else
+				iput(inode);
 		}
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -5569,7 +5678,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -5641,57 +5750,77 @@ out_fail:
 	return err;
 }
 
-static int prealloc_file_range(struct btrfs_trans_handle *trans,
-			       struct inode *inode, u64 start, u64 end,
-			       u64 locked_end, u64 alloc_hint, int mode)
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+			u64 alloc_hint, int mode, loff_t actual_len)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 alloc_size;
 	u64 cur_offset = start;
 	u64 num_bytes = end - start;
 	int ret = 0;
+	u64 i_size;
 
 	while (num_bytes > 0) {
 		alloc_size = min(num_bytes, root->fs_info->max_extent);
 
-		ret = btrfs_reserve_metadata_space(root, 1);
-		if (ret)
-			goto out;
+		trans = btrfs_start_transaction(root, 1);
 
 		ret = btrfs_reserve_extent(trans, root, alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
-			goto out;
+			goto stop_trans;
 		}
+
+		ret = btrfs_reserve_metadata_space(root, 3);
+		if (ret) {
+			btrfs_free_reserved_extent(root, ins.objectid,
+						   ins.offset);
+			goto stop_trans;
+		}
+
 		ret = insert_reserved_file_extent(trans, inode,
 						  cur_offset, ins.objectid,
 						  ins.offset, ins.offset,
-						  ins.offset, locked_end,
-						  0, 0, 0,
+						  ins.offset, 0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		BUG_ON(ret);
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
+
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		alloc_hint = ins.objectid + ins.offset;
-		btrfs_unreserve_metadata_space(root, 1);
-	}
-out:
-	if (cur_offset > start) {
+
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-		    cur_offset > i_size_read(inode))
-			btrfs_i_size_write(inode, cur_offset);
+			(actual_len > inode->i_size) &&
+			(cur_offset > inode->i_size)) {
+
+			if (cur_offset > actual_len)
+				i_size = actual_len;
+			else
+				i_size = cur_offset;
+			i_size_write(inode, i_size);
+			btrfs_ordered_update_i_size(inode, i_size, NULL);
+		}
+
 		ret = btrfs_update_inode(trans, root, inode);
 		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, root);
+		btrfs_unreserve_metadata_space(root, 3);
 	}
+	return ret;
 
+stop_trans:
+	btrfs_end_transaction(trans, root);
 	return ret;
+
 }
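
prealloc_file_range() now advances i_size inside the allocation loop,
clamped so that a sector-rounded allocation never pushes i_size past
actual_len, the caller's requested end.  The clamp in isolation (a
sketch):

#include <assert.h>

typedef unsigned long long u64;

/*
 * New i_size after allocating up to cur_offset for a request of
 * actual_len bytes: advance, but never past what was asked for.
 */
static u64 clamp_isize(u64 cur_offset, u64 actual_len)
{
	return cur_offset > actual_len ? actual_len : cur_offset;
}

int main(void)
{
	assert(clamp_isize(4096, 1000) == 1000);  /* rounded-up alloc */
	assert(clamp_isize(512, 1000) == 512);    /* partial progress */
	return 0;
}
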
 
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5705,8 +5834,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	u64 locked_end;
 	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
 	int ret;
 
 	alloc_start = offset & ~mask;
@@ -5725,9 +5852,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			goto out;
 	}
 
-	root = BTRFS_I(inode)->root;
-
-	ret = btrfs_check_data_free_space(root, inode,
+	ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
 					  alloc_end - alloc_start);
 	if (ret)
 		goto out;
@@ -5736,12 +5861,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 
-		trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
-		if (!trans) {
-			ret = -EIO;
-			goto out_free;
-		}
-
 		/* the extent lock is ordered inside the running
 		 * transaction
 		 */
@@ -5755,8 +5874,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent(&BTRFS_I(inode)->io_tree,
 				      alloc_start, locked_end, GFP_NOFS);
-			btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-
 			/*
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
@@ -5777,10 +5894,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		BUG_ON(IS_ERR(em) || !em);
 		last_byte = min(extent_map_end(em), alloc_end);
 		last_byte = (last_byte + mask) & ~mask;
-		if (em->block_start == EXTENT_MAP_HOLE) {
-			ret = prealloc_file_range(trans, inode, cur_offset,
-					last_byte, locked_end + 1,
-					alloc_hint, mode);
+		if (em->block_start == EXTENT_MAP_HOLE ||
+		    (cur_offset >= inode->i_size &&
+		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+			ret = prealloc_file_range(inode,
+						  cur_offset, last_byte,
+						alloc_hint, mode, offset+len);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
@@ -5799,9 +5918,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 		      GFP_NOFS);
 
-	btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-out_free:
-	btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
+				       alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
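
btrfs_fallocate() aligns the requested range outward to sector
boundaries with mask arithmetic (alloc_start rounded down, alloc_end
rounded up), which assumes the sector size is a power of two.  A tiny
sketch of that arithmetic:

#include <assert.h>

typedef unsigned long long u64;

static void align_range(u64 offset, u64 len, u64 sectorsize,
			u64 *alloc_start, u64 *alloc_end)
{
	u64 mask = sectorsize - 1;	/* power-of-two assumption */

	*alloc_start = offset & ~mask;		    /* round down */
	*alloc_end = (offset + len + mask) & ~mask; /* round up */
}

int main(void)
{
	u64 s, e;

	align_range(1000, 3000, 4096, &s, &e);
	assert(s == 0 && e == 4096);
	return 0;
}
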
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b9840fa..0bc5776 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
-	unsigned long nr = 1;
 
 	/*
 	 * 1 - inode item
@@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	btrfs_set_root_generation(&root_item, trans->transid);
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 1);
-	btrfs_set_root_used(&root_item, 0);
+	btrfs_set_root_used(&root_item, leaf->len);
 	btrfs_set_root_last_snapshot(&root_item, 0);
 
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
@@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
-	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
 
 	btrfs_unreserve_metadata_space(root, 6);
-	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 			   char *name, int namelen)
 {
+	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
 	struct btrfs_trans_handle *trans;
-	int ret = 0;
-	int err;
-	unsigned long nr = 0;
+	int ret;
 
 	if (!root->ref_cows)
 		return -EINVAL;
@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	 */
 	ret = btrfs_reserve_metadata_space(root, 6);
 	if (ret)
-		goto fail_unlock;
+		goto fail;
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot) {
 		ret = -ENOMEM;
 		btrfs_unreserve_metadata_space(root, 6);
-		goto fail_unlock;
+		goto fail;
 	}
 	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
 	if (!pending_snapshot->name) {
 		ret = -ENOMEM;
 		kfree(pending_snapshot);
 		btrfs_unreserve_metadata_space(root, 6);
-		goto fail_unlock;
+		goto fail;
 	}
 	memcpy(pending_snapshot->name, name, namelen);
 	pending_snapshot->name[namelen] = '\0';
@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	pending_snapshot->root = root;
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	err = btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+	btrfs_unreserve_metadata_space(root, 6);
 
-fail_unlock:
-	btrfs_btree_balance_dirty(root, nr);
+	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto fail;
+	}
+	BUG_ON(!inode);
+	d_instantiate(dentry, inode);
+	ret = 0;
+fail:
 	return ret;
 }
 
@@ -1032,8 +1037,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	BUG_ON(!trans);
 
 	/* punch hole in destination first */
-	btrfs_drop_extents(trans, root, inode, off, off + len,
-			   off + len, 0, &hint_byte, 1);
+	btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
 
 	/* clone data */
 	key.objectid = src->i_ino;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5799bc4..5c2a9e7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 
 /*
  * remove an ordered extent from the tree.  No references are dropped
- * but, anyone waiting on this extent is woken up.
+ * and you must wake_up entry->wait.  You must hold the tree mutex
+ * while you call this function.
  */
-int btrfs_remove_ordered_extent(struct inode *inode,
+static int __btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
 	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
 	tree->last = NULL;
@@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	}
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
+	return 0;
+}
+
+/*
+ * remove an ordered extent from the tree.  No references are dropped
+ * but any waiters are woken.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	int ret;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	ret = __btrfs_remove_ordered_extent(inode, entry);
 	mutex_unlock(&tree->mutex);
 	wake_up(&entry->wait);
-	return 0;
+
+	return ret;
 }
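
The refactor above is the usual double-underscore split:
__btrfs_remove_ordered_extent() does the work and documents that the
caller holds the tree mutex, while btrfs_remove_ordered_extent() takes
the mutex, delegates, and wakes waiters after dropping it.  A userspace
rendering of the pattern with pthreads (illustrative):

#include <pthread.h>

struct tree {
	pthread_mutex_t mutex;
	pthread_cond_t wait;
	int count;
};

/* Caller must hold t->mutex. */
static int __remove_entry(struct tree *t)
{
	t->count--;
	return 0;
}

/* Locked wrapper: take the mutex, do the work, wake waiters. */
static int remove_entry(struct tree *t)
{
	int ret;

	pthread_mutex_lock(&t->mutex);
	ret = __remove_entry(t);
	pthread_mutex_unlock(&t->mutex);
	pthread_cond_broadcast(&t->wait);
	return ret;
}

int main(void)
{
	struct tree t = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1
	};
	return remove_entry(&t);
}
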
 
 /*
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+			       int nocow_only, int delay_iput)
 {
 	struct list_head splice;
 	struct list_head *cur;
@@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 		if (inode) {
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
-			iput(inode);
+			if (delay_iput)
+				btrfs_add_delayed_iput(inode);
+			else
+				iput(inode);
 		} else {
 			btrfs_put_ordered_extent(ordered);
 		}
@@ -430,7 +451,7 @@ again:
 				btrfs_wait_ordered_range(inode, 0, (u64)-1);
 			else
 				filemap_flush(inode->i_mapping);
-			iput(inode);
+			btrfs_add_delayed_iput(inode);
 		}
 
 		cond_resched();
@@ -589,7 +610,7 @@ out:
  * After an extent is done, call this to conditionally update the on disk
  * i_size.  i_size is updated to cover any fully written part of the file.
  */
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered)
 {
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
@@ -597,18 +618,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	u64 disk_i_size;
 	u64 new_i_size;
 	u64 i_size_test;
+	u64 i_size = i_size_read(inode);
 	struct rb_node *node;
+	struct rb_node *prev = NULL;
 	struct btrfs_ordered_extent *test;
+	int ret = 1;
+
+	if (ordered)
+		offset = entry_end(ordered);
+	else
+		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
 
 	mutex_lock(&tree->mutex);
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 
+	/* truncate file */
+	if (disk_i_size > i_size) {
+		BTRFS_I(inode)->disk_i_size = i_size;
+		ret = 0;
+		goto out;
+	}
+
 	/*
 	 * if the disk i_size is already at the inode->i_size, or
 	 * this ordered extent is inside the disk i_size, we're done
 	 */
-	if (disk_i_size >= inode->i_size ||
-	    ordered->file_offset + ordered->len <= disk_i_size) {
+	if (disk_i_size == i_size || offset <= disk_i_size) {
 		goto out;
 	}
 
@@ -616,8 +651,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * we can't update the disk_isize if there are delalloc bytes
 	 * between disk_i_size and  this ordered extent
 	 */
-	if (test_range_bit(io_tree, disk_i_size,
-			   ordered->file_offset + ordered->len - 1,
+	if (test_range_bit(io_tree, disk_i_size, offset - 1,
 			   EXTENT_DELALLOC, 0, NULL)) {
 		goto out;
 	}
@@ -626,20 +660,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * if we find an ordered extent then we can't update disk i_size
 	 * yet
 	 */
-	node = &ordered->rb_node;
-	while (1) {
-		node = rb_prev(node);
-		if (!node)
-			break;
+	if (ordered) {
+		node = rb_prev(&ordered->rb_node);
+	} else {
+		prev = tree_search(tree, offset);
+		/*
+		 * we insert file extents without involving ordered struct,
+	 * so there should be no ordered struct covering this offset
+		 */
+		if (prev) {
+			test = rb_entry(prev, struct btrfs_ordered_extent,
+					rb_node);
+			BUG_ON(offset_in_entry(test, offset));
+		}
+		node = prev;
+	}
+	while (node) {
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 		if (test->file_offset + test->len <= disk_i_size)
 			break;
-		if (test->file_offset >= inode->i_size)
+		if (test->file_offset >= i_size)
 			break;
 		if (test->file_offset >= disk_i_size)
 			goto out;
+		node = rb_prev(node);
 	}
-	new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+	new_i_size = min_t(u64, offset, i_size);
 
 	/*
 	 * at this point, we know we can safely update i_size to at least
@@ -647,7 +693,14 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * walk forward and see if ios from higher up in the file have
 	 * finished.
 	 */
-	node = rb_next(&ordered->rb_node);
+	if (ordered) {
+		node = rb_next(&ordered->rb_node);
+	} else {
+		if (prev)
+			node = rb_next(prev);
+		else
+			node = rb_first(&tree->tree);
+	}
 	i_size_test = 0;
 	if (node) {
 		/*
@@ -655,10 +708,10 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 		 * between our ordered extent and the next one.
 		 */
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > entry_end(ordered))
+		if (test->file_offset > offset)
 			i_size_test = test->file_offset;
 	} else {
-		i_size_test = i_size_read(inode);
+		i_size_test = i_size;
 	}
 
 	/*
@@ -667,15 +720,25 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * are no delalloc bytes in this area, it is safe to update
 	 * disk_i_size to the end of the region.
 	 */
-	if (i_size_test > entry_end(ordered) &&
-	    !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
-			   EXTENT_DELALLOC, 0, NULL)) {
-		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+	if (i_size_test > offset &&
+	    !test_range_bit(io_tree, offset, i_size_test - 1,
+			    EXTENT_DELALLOC, 0, NULL)) {
+		new_i_size = min_t(u64, i_size_test, i_size);
 	}
 	BTRFS_I(inode)->disk_i_size = new_i_size;
+	ret = 0;
 out:
+	/*
+	 * we need to remove the ordered extent with the tree lock held
+	 * so that other people calling this function don't find our fully
+	 * processed ordered entry and skip updating the i_size
+	 */
+	if (ordered)
+		__btrfs_remove_ordered_extent(inode, ordered);
 	mutex_unlock(&tree->mutex);
-	return 0;
+	if (ordered)
+		wake_up(&ordered->wait);
+	return ret;
 }
 
 /*
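
The new delay_iput switch above exists because iput() on the last
reference can recurse into inode eviction, which may itself need to
start a transaction; doing that while flushing for a commit can
deadlock, so such callers queue the inode and the final put runs later
from a safe context (btrfs_run_delayed_iputs, called in transaction.c
below). A minimal user-space sketch of the deferred-release idea; the
names (struct obj, delayed_put, run_delayed_puts) are illustrative,
not btrfs's:

#include <stdlib.h>

struct obj {
	int refcount;
	struct obj *next;
};

static struct obj *deferred;		/* real code guards this with a lock */

static void delayed_put(struct obj *o)
{
	o->next = deferred;		/* queue instead of releasing now */
	deferred = o;
}

static void run_delayed_puts(void)	/* called from a safe context */
{
	while (deferred) {
		struct obj *o = deferred;

		deferred = o->next;
		if (--o->refcount == 0)
			free(o);	/* the actual final release */
	}
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	o->refcount = 1;
	delayed_put(o);			/* safe while "locked" */
	run_delayed_puts();		/* the real release happens here */
	return 0;
}
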
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f82e874..1fe1282 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+			       int nocow_only, int delay_iput);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfcc93c..ab7ab53 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
+static void put_inodes(struct list_head *list)
+{
+	struct inodevec *ivec;
+	while (!list_empty(list)) {
+		ivec = list_entry(list->next, struct inodevec, list);
+		list_del(&ivec->list);
+		while (ivec->nr > 0) {
+			ivec->nr--;
+			iput(ivec->inode[ivec->nr]);
+		}
+		kfree(ivec);
+	}
+}
+
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key)
 
@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
 		btrfs_btree_balance_dirty(root, nr);
 
+		/*
+		 * put inodes outside the transaction, otherwise we may deadlock.
+		 */
+		put_inodes(&inode_list);
+
 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
 			invalidate_extent_cache(root, &key, &next_key);
 	}
@@ -1752,19 +1771,7 @@ out:
 
 	btrfs_btree_balance_dirty(root, nr);
 
-	/*
-	 * put inodes while we aren't holding the tree locks
-	 */
-	while (!list_empty(&inode_list)) {
-		struct inodevec *ivec;
-		ivec = list_entry(inode_list.next, struct inodevec, list);
-		list_del(&ivec->list);
-		while (ivec->nr > 0) {
-			ivec->nr--;
-			iput(ivec->inode[ivec->nr]);
-		}
-		kfree(ivec);
-	}
+	put_inodes(&inode_list);
 
 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
 		invalidate_extent_cache(root, &key, &next_key);
@@ -3274,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 		return -ENOMEM;
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		kfree(cluster);
 		return -ENOMEM;
+	}
 
 	rc->extents_found = 0;
 	rc->extents_skipped = 0;
@@ -3534,8 +3543,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	       (unsigned long long)rc->block_group->key.objectid,
 	       (unsigned long long)rc->block_group->flags);
 
-	btrfs_start_delalloc_inodes(fs_info->tree_root);
-	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
 
 	while (1) {
 		rc->extents_found = 0;
@@ -3755,6 +3764,8 @@ out:
 				       BTRFS_DATA_RELOC_TREE_OBJECTID);
 		if (IS_ERR(fs_root))
 			err = PTR_ERR(fs_root);
+		else
+			btrfs_orphan_cleanup(fs_root);
 	}
 	return err;
 }
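
The relocate_block_group() hunk above also plugs a small error-path
leak: when the second of two allocations fails, the first must be
unwound before returning. A self-contained sketch of that pattern, with
hypothetical names (setup, cluster, path), not btrfs code:

#include <stdlib.h>

#define ENOMEM 12

static int setup(void)
{
	void *cluster = calloc(1, 64);
	void *path;

	if (!cluster)
		return -ENOMEM;

	path = calloc(1, 64);
	if (!path) {
		free(cluster);		/* without this, cluster leaks */
		return -ENOMEM;
	}

	/* ... use both ... */
	free(path);
	free(cluster);
	return 0;
}

int main(void)
{
	return setup() ? 1 : 0;
}
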
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 752a546..a649305 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -126,8 +126,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	substring_t args[MAX_OPT_ARGS];
-	char *p, *num;
+	char *p, *num, *orig;
 	int intarg;
+	int ret = 0;
 
 	if (!options)
 		return 0;
@@ -140,6 +141,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	if (!options)
 		return -ENOMEM;
 
+	orig = options;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -262,12 +264,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_discard:
 			btrfs_set_opt(info->mount_opt, DISCARD);
 			break;
+		case Opt_err:
+			printk(KERN_INFO "btrfs: unrecognized mount option "
+			       "'%s'\n", p);
+			ret = -EINVAL;
+			goto out;
 		default:
 			break;
 		}
 	}
-	kfree(options);
-	return 0;
+out:
+	kfree(orig);
+	return ret;
 }
 
 /*
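
The orig bookkeeping above matters because strsep() advances the
pointer it is given: by the end of the loop, options is NULL, so the
old kfree(options) freed nothing and leaked the duplicated string (and
the new Opt_err early exit would otherwise have leaked it too). A
runnable user-space illustration of the same pitfall:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *options = strdup("noatime,discard,bogus");
	char *orig = options;	/* keep the address strdup() returned */
	char *p;

	while ((p = strsep(&options, ",")) != NULL)
		printf("token: %s\n", p);

	/* 'options' is NULL here; freeing it would leak the buffer */
	free(orig);
	return 0;
}
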
@@ -405,8 +413,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_start_delalloc_inodes(root);
-	btrfs_wait_ordered_extents(root, 0);
+	btrfs_start_delalloc_inodes(root, 0);
+	btrfs_wait_ordered_extents(root, 0, 0);
 
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
@@ -450,6 +458,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",notreelog");
 	if (btrfs_test_opt(root, FLUSHONCOMMIT))
 		seq_puts(seq, ",flushoncommit");
+	if (btrfs_test_opt(root, DISCARD))
+		seq_puts(seq, ",discard");
 	if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
 		seq_puts(seq, ",noacl");
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c207e8c..b2acc79 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
+	if (throttle)
+		btrfs_run_delayed_iputs(root);
+
 	return 0;
 }
 
@@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
  * those extents are sent to disk but does not wait on them
  */
 int btrfs_write_marked_extents(struct btrfs_root *root,
-			       struct extent_io_tree *dirty_pages)
+			       struct extent_io_tree *dirty_pages, int mark)
 {
 	int ret;
 	int err = 0;
@@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    EXTENT_DIRTY);
+					    mark);
 		if (ret)
 			break;
 		while (start <= end) {
@@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
  * on all the pages and clear them from the dirty pages state tree
  */
 int btrfs_wait_marked_extents(struct btrfs_root *root,
-			      struct extent_io_tree *dirty_pages)
+			      struct extent_io_tree *dirty_pages, int mark)
 {
 	int ret;
 	int err = 0;
@@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	unsigned long index;
 
 	while (1) {
-		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
-					    EXTENT_DIRTY);
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+					    mark);
 		if (ret)
 			break;
 
-		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
@@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
  * those extents are on disk for transaction or log commit
  */
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages)
+				struct extent_io_tree *dirty_pages, int mark)
 {
 	int ret;
 	int ret2;
 
-	ret = btrfs_write_marked_extents(root, dirty_pages);
-	ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
 	return ret || ret2;
 }
 
@@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 		return filemap_write_and_wait(btree_inode->i_mapping);
 	}
 	return btrfs_write_and_wait_marked_extents(root,
-					   &trans->transaction->dirty_pages);
+					   &trans->transaction->dirty_pages,
+					   EXTENT_DIRTY);
 }
 
 /*
@@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 old_root_bytenr;
+	u64 old_root_used;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 
+	old_root_used = btrfs_root_used(&root->root_item);
 	btrfs_write_dirty_block_groups(trans, root);
 
 	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
-		if (old_root_bytenr == root->node->start)
+		if (old_root_bytenr == root->node->start &&
+		    old_root_used == btrfs_root_used(&root->root_item))
 			break;
 
 		btrfs_set_root_node(&root->root_item, root->node);
@@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 					&root->root_item);
 		BUG_ON(ret);
 
+		old_root_used = btrfs_root_used(&root->root_item);
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		BUG_ON(ret);
 	}
@@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(&pending->root_key, &key, sizeof(key));
 fail:
 	kfree(new_root_item);
-	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
@@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 	u64 index = 0;
 	struct btrfs_trans_handle *trans;
 	struct inode *parent_inode;
-	struct inode *inode;
 	struct btrfs_root *parent_root;
 
 	parent_inode = pending->dentry->d_parent->d_inode;
@@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 
 	BUG_ON(ret);
 
-	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
-	d_instantiate(pending->dentry, inode);
 fail:
 	btrfs_end_transaction(trans, fs_info->fs_root);
 	return ret;
@@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 
 		if (flush_on_commit) {
-			btrfs_start_delalloc_inodes(root);
-			ret = btrfs_wait_ordered_extents(root, 0);
+			btrfs_start_delalloc_inodes(root, 1);
+			ret = btrfs_wait_ordered_extents(root, 0, 1);
 			BUG_ON(ret);
 		} else if (snap_pending) {
-			ret = btrfs_wait_ordered_extents(root, 1);
+			ret = btrfs_wait_ordered_extents(root, 0, 1);
 			BUG_ON(ret);
 		}
 
@@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		current->journal_info = NULL;
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+	if (current != root->fs_info->transaction_kthread)
+		btrfs_run_delayed_iputs(root);
+
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d4e3e7a..93c7ccb 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages);
+				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_write_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages);
+				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages);
+				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 741666a..4a9434b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, root, inode,
-			 start, extent_end, extent_end, start, &alloc_hint, 1);
+	ret = btrfs_drop_extents(trans, inode, start, extent_end,
+				 &alloc_hint, 1);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -930,6 +930,17 @@ out_nowrite:
 	return 0;
 }
 
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root, u64 offset)
+{
+	int ret;
+	ret = btrfs_find_orphan_item(root, offset);
+	if (ret > 0)
+		ret = btrfs_insert_orphan_item(trans, root, offset);
+	return ret;
+}
+
 /*
  * There are a few corners where the link count of the file can't
  * be properly maintained during replay.  So, instead of adding
@@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	}
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 
-	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
-		ret = replay_dir_deletes(trans, root, NULL, path,
-					 inode->i_ino, 1);
+	if (inode->i_nlink == 0) {
+		if (S_ISDIR(inode->i_mode)) {
+			ret = replay_dir_deletes(trans, root, NULL, path,
+						 inode->i_ino, 1);
+			BUG_ON(ret);
+		}
+		ret = insert_orphan_item(trans, root, inode->i_ino);
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
@@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 		/* inode keys are done during the first stage */
 		if (key.type == BTRFS_INODE_ITEM_KEY &&
 		    wc->stage == LOG_WALK_REPLAY_INODES) {
-			struct inode *inode;
 			struct btrfs_inode_item *inode_item;
 			u32 mode;
 
@@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 					     eb, i, &key);
 			BUG_ON(ret);
 
-			/* for regular files, truncate away
-			 * extents past the new EOF
+			/* for regular files, make sure the corresponding
+			 * orphan item exists. extents past the new EOF
+			 * will be truncated later by orphan cleanup.
 			 */
 			if (S_ISREG(mode)) {
-				inode = read_one_inode(root,
-						       key.objectid);
-				BUG_ON(!inode);
-
-				ret = btrfs_truncate_inode_items(wc->trans,
-					root, inode, inode->i_size,
-					BTRFS_EXTENT_DATA_KEY);
+				ret = insert_orphan_item(wc->trans, root,
+							 key.objectid);
 				BUG_ON(ret);
-
-				/* if the nlink count is zero here, the iput
-				 * will free the inode.  We bump it to make
-				 * sure it doesn't get freed until the link
-				 * count fixup is done
-				 */
-				if (inode->i_nlink == 0) {
-					btrfs_inc_nlink(inode);
-					btrfs_update_inode(wc->trans,
-							   root, inode);
-				}
-				iput(inode);
 			}
+
 			ret = link_to_fixup_dir(wc->trans, root,
 						path, key.objectid);
 			BUG_ON(ret);
@@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 {
 	int index1;
 	int index2;
+	int mark;
 	int ret;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
-	u64 log_transid = 0;
+	unsigned long log_transid = 0;
 
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
@@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	log_transid = root->log_transid;
+	if (log_transid % 2 == 0)
+		mark = EXTENT_DIRTY;
+	else
+		mark = EXTENT_NEW;
+
 	/* we start IO on  all the marked extents here, but we don't actually
 	 * wait for them until later.
 	 */
-	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
+	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
 	BUG_ON(ret);
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
 	root->log_batch = 0;
-	log_transid = root->log_transid;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
 	smp_mb();
 	/*
-	 * log tree has been flushed to disk, new modifications of
-	 * the log will be written to new positions. so it's safe to
-	 * allow log writers to go in.
+	 * IO has been started, blocks of the log tree have the WRITTEN flag
+	 * set in their headers. new modifications of the log will be written
+	 * to new positions, so it's safe to allow log writers to go in.
 	 */
 	mutex_unlock(&root->log_mutex);
 
@@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
-		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * check the full commit flag again
 	 */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
-		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out_wake_log_root;
 	}
 
 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
-				&log_root_tree->dirty_log_pages);
+				&log_root_tree->dirty_log_pages,
+				EXTENT_DIRTY | EXTENT_NEW);
 	BUG_ON(ret);
-	btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
 				log_root_tree->node->start);
@@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				    0, &start, &end, EXTENT_DIRTY);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
 		if (ret)
 			break;
 
-		clear_extent_dirty(&log->dirty_log_pages,
-				   start, end, GFP_NOFS);
+		clear_extent_bits(&log->dirty_log_pages, start, end,
+				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
 	}
 
 	if (log->log_transid > 0) {
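
The tree-log changes above give the two in-flight log transids distinct
extent bits (EXTENT_DIRTY for even transids, EXTENT_NEW for odd) so
that writing out or waiting on one log commit cannot clear or wait on
pages that belong to the other. A toy of the even/odd tagging; the
constant values here are stand-ins, not the kernel's:

#include <stdio.h>

#define EXTENT_DIRTY	(1 << 0)
#define EXTENT_NEW	(1 << 1)	/* values are stand-ins */

int main(void)
{
	unsigned long log_transid;

	for (log_transid = 0; log_transid < 4; log_transid++) {
		int mark = (log_transid % 2 == 0) ? EXTENT_DIRTY
						  : EXTENT_NEW;
		printf("log transid %lu tagged with %#x\n",
		       log_transid, mark);
	}
	return 0;
}
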
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7eda483..41ecbb2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    root->fs_info->fs_devices->rw_devices <= 4) {
+	    root->fs_info->fs_devices->num_devices <= 4) {
 		printk(KERN_ERR "btrfs: unable to go below four devices "
 		       "on raid10\n");
 		ret = -EINVAL;
@@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    root->fs_info->fs_devices->rw_devices <= 2) {
+	    root->fs_info->fs_devices->num_devices <= 2) {
 		printk(KERN_ERR "btrfs: unable to go below two "
 		       "devices on raid1\n");
 		ret = -EINVAL;
@@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EINVAL;
 
 	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-	if (!bdev)
-		return -EIO;
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
 
 	if (root->fs_info->fs_devices->seeding) {
 		seeding_dev = 1;
@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		max_chunk_size = 10 * calc_size;
 		min_stripe_size = 64 * 1024 * 1024;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		max_chunk_size = 4 * calc_size;
+		max_chunk_size = 256 * 1024 * 1024;
 		min_stripe_size = 32 * 1024 * 1024;
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		calc_size = 8 * 1024 * 1024;
@@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	if (!em)
 		return 1;
 
+	if (btrfs_test_opt(root, DEGRADED)) {
+		free_extent_map(em);
+		return 0;
+	}
+
 	map = (struct map_lookup *)em->bdev;
 	for (i = 0; i < map->num_stripes; i++) {
 		if (!map->stripes[i].dev->writeable) {
@@ -2649,8 +2654,10 @@ again:
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
 
-	if (!em && unplug_page)
+	if (!em && unplug_page) {
+		kfree(multi);
 		return 0;
+	}
 
 	if (!em) {
 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
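
The open_bdev_exclusive() fix above is worth spelling out: that
function reports failure with an ERR_PTR-encoded errno, never NULL, so
the old !bdev test could never fire and a failed open would be used as
if it had succeeded. A user-space model of the convention; the macros
mirror the spirit of include/linux/err.h but are redefined here so the
sketch builds standalone:

#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)(-MAX_ERRNO))

static void *open_device(int fail)
{
	static int dev;			/* stand-in for a real handle */

	return fail ? ERR_PTR(-5) : (void *)&dev;	/* -5 == -EIO */
}

int main(void)
{
	void *bdev = open_device(1);

	if (IS_ERR(bdev)) {		/* '!bdev' would never be true */
		printf("open failed: %ld\n", PTR_ERR(bdev));
		return 1;
	}
	return 0;
}
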
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b6dd596..193b58f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,22 +85,23 @@ out:
 	return ret;
 }
 
-int __btrfs_setxattr(struct inode *inode, const char *name,
-			    const void *value, size_t size, int flags)
+static int do_setxattr(struct btrfs_trans_handle *trans,
+		       struct inode *inode, const char *name,
+		       const void *value, size_t size, int flags)
 {
 	struct btrfs_dir_item *di;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
-	int ret = 0, mod = 0;
+	size_t name_len = strlen(name);
+	int ret = 0;
+
+	if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+		return -ENOSPC;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_join_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
 	/* first lets see if we already have this xattr */
 	di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
 				strlen(name), -1);
@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
 		}
 
 		ret = btrfs_delete_one_dir_name(trans, root, path, di);
-		if (ret)
-			goto out;
+		BUG_ON(ret);
 		btrfs_release_path(root, path);
 
 		/* if we don't have a value then we are removing the xattr */
-		if (!value) {
-			mod = 1;
+		if (!value)
 			goto out;
-		}
 	} else {
 		btrfs_release_path(root, path);
 
@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
 	}
 
 	/* ok we have to create a completely new xattr */
-	ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
-				      value, size, inode->i_ino);
+	ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+				      name, name_len, value, size);
+	BUG_ON(ret);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+		     struct inode *inode, const char *name,
+		     const void *value, size_t size, int flags)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (trans)
+		return do_setxattr(trans, inode, name, value, size, flags);
+
+	ret = btrfs_reserve_metadata_space(root, 2);
 	if (ret)
-		goto out;
-	mod = 1;
+		return ret;
 
-out:
-	if (mod) {
-		inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_update_inode(trans, root, inode);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto out;
 	}
+	btrfs_set_trans_block_group(trans, inode);
 
-	btrfs_end_transaction(trans, root);
-	btrfs_free_path(path);
+	ret = do_setxattr(trans, inode, name, value, size, flags);
+	if (ret)
+		goto out;
+
+	inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
+out:
+	btrfs_end_transaction_throttle(trans, root);
+	btrfs_unreserve_metadata_space(root, 2);
 	return ret;
 }
 
@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
-	return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+
+	return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+				flags);
 }
 
 int btrfs_removexattr(struct dentry *dentry, const char *name)
@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 
 	if (!btrfs_is_valid_xattr(name))
 		return -EOPNOTSUPP;
-	return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+
+	return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+				XATTR_REPLACE);
 }
 
-int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct inode *dir)
 {
 	int err;
 	size_t len;
@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
 	} else {
 		strcpy(name, XATTR_SECURITY_PREFIX);
 		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-		err = __btrfs_setxattr(inode, name, value, len, 0);
+		err = __btrfs_setxattr(trans, inode, name, value, len, 0);
 		kfree(name);
 	}
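
The xattr rework above threads an optional transaction handle through
__btrfs_setxattr(): callers that already hold a transaction (inode
creation, security/ACL setup) pass it in, while the plain setxattr path
reserves metadata space and opens its own. A small sketch of the
"join the caller's context or open your own" pattern; struct trans,
do_set and set_attr are invented names:

#include <stdio.h>

struct trans { int id; };

static int do_set(struct trans *t, const char *name)
{
	printf("set '%s' in transaction %d\n", name, t->id);
	return 0;
}

static int set_attr(struct trans *t, const char *name)
{
	struct trans own = { .id = 42 };
	int ret;

	if (t)				/* join the caller's transaction */
		return do_set(t, name);

	/* standalone path: open, do the work, commit */
	ret = do_set(&own, name);
	printf("commit transaction %d\n", own.id);
	return ret;
}

int main(void)
{
	struct trans outer = { .id = 7 };

	set_attr(&outer, "user.a");	/* runs inside the caller's trans */
	set_attr(NULL, "user.b");	/* opens and commits its own */
	return 0;
}
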
 
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index c71e9c3..721efa0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[];
 
 extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 		void *buffer, size_t size);
-extern int __btrfs_setxattr(struct inode *inode, const char *name,
-		const void *value, size_t size, int flags);
-
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+			    struct inode *inode, const char *name,
+			    const void *value, size_t size, int flags);
 extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 		void *buffer, size_t size);
 extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
-extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+				     struct inode *inode, struct inode *dir);
 
 #endif /* __XATTR__ */
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad60..4314f0d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type, bastmode);
+		dlm_user_add_ast(lkb, type, mode);
 		return;
 	}
 
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+		lkb->lkb_ast_first = type;
 	}
+
+	/* sanity check, this should not happen */
+
+	if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
+		log_print("repeat cast %d castmode %d lock %x %s",
+			  mode, lkb->lkb_castmode,
+			  lkb->lkb_id, lkb->lkb_resource->res_name);
+
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
 	struct dlm_ls *ls = NULL;
 	struct dlm_rsb *r = NULL;
 	struct dlm_lkb *lkb;
-	void (*cast) (void *astparam);
-	void (*bast) (void *astparam, int mode);
-	int type = 0, bastmode;
+	void (*castfn) (void *astparam);
+	void (*bastfn) (void *astparam, int mode);
+	int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
 
 repeat:
 	spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
 		list_del(&lkb->lkb_astqueue);
 		type = lkb->lkb_ast_type;
 		lkb->lkb_ast_type = 0;
+		first = lkb->lkb_ast_first;
+		lkb->lkb_ast_first = 0;
 		bastmode = lkb->lkb_bastmode;
-
+		castmode = lkb->lkb_castmode;
+		castfn = lkb->lkb_astfn;
+		bastfn = lkb->lkb_bastfn;
 		spin_unlock(&ast_queue_lock);
-		cast = lkb->lkb_astfn;
-		bast = lkb->lkb_bastfn;
-
-		if ((type & AST_COMP) && cast)
-			cast(lkb->lkb_astparam);
 
-		if ((type & AST_BAST) && bast)
-			bast(lkb->lkb_astparam, bastmode);
+		do_cast = (type & AST_COMP) && castfn;
+		do_bast = (type & AST_BAST) && bastfn;
+
+		/* Skip a bast if its blocking mode is compatible with the
+		   granted mode of the preceding cast. */
+
+		if (do_bast) {
+			if (first == AST_COMP)
+				last_castmode = castmode;
+			else
+				last_castmode = lkb->lkb_castmode_done;
+			if (dlm_modes_compat(bastmode, last_castmode))
+				do_bast = 0;
+		}
+
+		if (first == AST_COMP) {
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+		} else if (first == AST_BAST) {
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+		} else {
+			log_error(ls, "bad ast_first %d ast_type %d",
+				  first, type);
+		}
+
+		if (do_cast)
+			lkb->lkb_castmode_done = castmode;
+		if (do_bast)
+			lkb->lkb_bastmode_done = bastmode;
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
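
The ast.c rework above tracks both the queued cast mode and the last
delivered one, preserves the cast/bast delivery order, and drops a bast
whose blocking mode does not actually conflict with what the preceding
cast granted. A runnable toy of the compatibility check; the mode names
and matrix follow the usual VMS/DLM convention, but this is a sketch,
not dlm's own table:

#include <stdbool.h>
#include <stdio.h>

enum { NL, CR, CW, PR, PW, EX };	/* lock modes, weakest first */

static bool modes_compat(int m1, int m2)
{
	static const bool c[6][6] = {
		/*       NL CR CW PR PW EX */
		/* NL */ { 1, 1, 1, 1, 1, 1 },
		/* CR */ { 1, 1, 1, 1, 1, 0 },
		/* CW */ { 1, 1, 1, 0, 0, 0 },
		/* PR */ { 1, 1, 0, 1, 0, 0 },
		/* PW */ { 1, 1, 0, 0, 0, 0 },
		/* EX */ { 1, 0, 0, 0, 0, 0 },
	};
	return c[m1][m2];
}

int main(void)
{
	int last_castmode = CR;		/* what the last cast granted */
	int bastmode = CW;		/* what the queued bast asks for */

	if (modes_compat(bastmode, last_castmode))
		printf("bast skipped: CW and CR do not conflict\n");
	return 0;
}
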
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f..bcb1aab 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f..0df2438 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g,
 	struct dlm_comms *cms = NULL;
 	void *gps = NULL;
 
-	cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
-	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
-	cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
 
 	if (!cl || !gps || !sps || !cms)
 		goto fail;
@@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
 	struct dlm_nodes *nds = NULL;
 	void *gps = NULL;
 
-	sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
-	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-	nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
+	sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
 
 	if (!sp || !gps || !nds)
 		goto fail;
@@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
 {
 	struct dlm_comm *cm;
 
-	cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
 	if (!cm)
 		return ERR_PTR(-ENOMEM);
 
@@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
 	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
 	struct dlm_node *nd;
 
-	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
+	nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
 	if (!nd)
 		return ERR_PTR(-ENOMEM);
 
@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
 		return -ENOSPC;
 
-	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+	addr = kzalloc(sizeof(*addr), GFP_NOFS);
 	if (!addr)
 		return -ENOMEM;
 
@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 
 	ids_count = sp->members_count;
 
-	ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
+	ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
 	if (!ids) {
 		rv = -ENOMEM;
 		goto out;
@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 	if (!new_count)
 		goto out_ids;
 
-	new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
+	new = kcalloc(new_count, sizeof(int), GFP_NOFS);
 	if (!new) {
 		kfree(ids);
 		rv = -ENOMEM;
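
The GFP_KERNEL to GFP_NOFS conversions in this and the following dlm
files share one rationale: these allocations run on paths that
filesystems (GFS2, OCFS2) reach while holding fs state, and a
GFP_KERNEL allocation under memory pressure may recurse into filesystem
writeback and deadlock; GFP_NOFS forbids that recursion. A toy model of
the hazard, with invented names (fs_lock_held, alloc_under_pressure):

#include <stdio.h>

static int fs_lock_held;

/* under memory pressure the allocator runs reclaim; reclaim may write
 * back filesystem pages unless told not to (the GFP_NOFS analogue) */
static void reclaim(int may_enter_fs)
{
	if (may_enter_fs && fs_lock_held)
		printf("deadlock: reclaim re-enters the fs under its lock\n");
	else
		printf("reclaim skipped fs writeback, all fine\n");
}

static void alloc_under_pressure(int may_enter_fs)
{
	reclaim(may_enter_fs);
}

int main(void)
{
	fs_lock_held = 1;
	alloc_under_pressure(1);	/* GFP_KERNEL-like: hazard */
	alloc_under_pressure(0);	/* GFP_NOFS-like: safe */
	return 0;
}
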
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c..375a235 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 	if (bucket >= ls->ls_rsbtbl_size)
 		return NULL;
 
-	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
+	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
 	if (!ri)
 		return NULL;
 	if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1d..7b84c1d 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
 	spin_unlock(&ls->ls_recover_list_lock);
 
 	if (!found)
-		de = kzalloc(sizeof(struct dlm_direntry) + len,
-			     ls->ls_allocation);
+		de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
 	return de;
 }
 
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
 
 	dlm_dir_clear(ls);
 
-	last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
 	if (!last_name)
 		goto out;
 
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 	if (namelen > DLM_RESNAME_MAXLEN)
 		return -EINVAL;
 
-	de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
+	de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
 	if (!de)
 		return -ENOMEM;
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a..f632b58 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
 	int8_t			lkb_status;     /* granted, waiting, convert */
 	int8_t			lkb_rqmode;	/* requested lock mode */
 	int8_t			lkb_grmode;	/* granted lock mode */
-	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
+	int8_t			lkb_ast_first;	/* type of first ast queued */
+
+	int8_t			lkb_bastmode;	/* req mode of queued bast */
+	int8_t			lkb_castmode;	/* gr mode of queued cast */
+	int8_t			lkb_bastmode_done; /* last delivered bastmode */
+	int8_t			lkb_castmode_done; /* last delivered castmode */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
@@ -473,7 +479,6 @@ struct dlm_ls {
 	int			ls_low_nodeid;
 	int			ls_total_weight;
 	int			*ls_node_array;
-	gfp_t			ls_allocation;
 
 	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
 	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c4..d0e43a3 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP, 0);
+	dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error = 0;
@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_granted(r, lkb, 1, &deadlk)) {
 		grant_lock(r, lkb);
 		queue_cast(r, lkb, 0);
-		grant_pending_locks(r);
 		goto out;
 	}
 
@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		if (_can_be_granted(r, lkb, 1)) {
 			grant_lock(r, lkb);
 			queue_cast(r, lkb, 0);
-			grant_pending_locks(r);
 			goto out;
 		}
 		/* else fall through and move to convert queue */
@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case 0:
+		grant_pending_locks(r);
+		/* grant_pending_locks also sends basts */
+		break;
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	remove_lock(r, lkb);
 	queue_cast(r, lkb, -DLM_EUNLOCK);
-	grant_pending_locks(r);
 	return -DLM_EUNLOCK;
 }
 
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	grant_pending_locks(r);
+}
+
 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
  
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	error = revert_lock(r, lkb);
 	if (error) {
 		queue_cast(r, lkb, -DLM_ECANCEL);
-		grant_pending_locks(r);
 		return -DLM_ECANCEL;
 	}
 	return 0;
 }
 
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (error)
+		grant_pending_locks(r);
+}
+
 /*
  * Four stage 3 varieties:
  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_request() calls do_request() on remote node */
 		error = send_request(r, lkb);
-	else
+	} else {
 		error = do_request(r, lkb);
+		/* for remote locks the request_reply is sent
+		   between do_request and do_request_effects */
+		do_request_effects(r, lkb, error);
+	}
  out:
 	return error;
 }
@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_convert() calls do_convert() on remote node */
 		error = send_convert(r, lkb);
-	else
+	} else {
 		error = do_convert(r, lkb);
+		/* for remote locks the convert_reply is sent
+		   between do_convert and do_convert_effects */
+		do_convert_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_unlock() calls do_unlock() on remote node */
 		error = send_unlock(r, lkb);
-	else
+	} else {
 		error = do_unlock(r, lkb);
+		/* for remote locks the unlock_reply is sent
+		   between do_unlock and do_unlock_effects */
+		do_unlock_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_cancel() calls do_cancel() on remote node */
 		error = send_cancel(r, lkb);
-	else
+	} else {
 		error = do_cancel(r, lkb);
+		/* for remote locks the cancel_reply is sent
+		   between do_cancel and do_cancel_effects */
+		do_cancel_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2689,7 +2738,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
 	   pass into lowcomms_commit and a message buffer (mb) that we
 	   write our data into */
 
-	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
 	if (!mh)
 		return -ENOBUFS;
 
@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 	attach_lkb(r, lkb);
 	error = do_request(r, lkb);
 	send_request_reply(r, lkb, error);
+	do_request_effects(r, lkb, error);
 
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_convert_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_convert_reply(r, lkb, error);
+		goto out;
+	}
+
 	reply = !down_conversion(lkb);
 
 	error = do_convert(r, lkb);
- out_reply:
 	if (reply)
 		send_convert_reply(r, lkb, error);
+	do_convert_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_unlock_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_unlock_reply(r, lkb, error);
+		goto out;
+	}
 
 	error = do_unlock(r, lkb);
- out_reply:
 	send_unlock_reply(r, lkb, error);
+	do_unlock_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = do_cancel(r, lkb);
 	send_cancel_reply(r, lkb, error);
+	do_cancel_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -4512,7 +4570,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	}
 
 	if (flags & DLM_LKF_VALBLK) {
-		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			kfree(ua);
 			__put_lkb(ls, lkb);
@@ -4582,7 +4640,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	ua = lkb->lkb_ua;
 
 	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
-		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			error = -ENOMEM;
 			goto out_put;
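
The do_request()/do_convert()/do_unlock()/do_cancel() splits above all
follow one pattern: the decision and its queued completion ast come
first, the reply to the remote node is sent next, and only then do the
side effects (blocking asts, granting pending locks) run, so a remote
requester sees its completion before competing holders see blocking
callbacks. A compact sketch of that ordering, with the dlm calls
reduced to prints (decide, reply, effects are stand-in names):

#include <stdio.h>

static int decide(void)
{
	printf("1: decide result, queue completion ast\n");
	return 0;
}

static void reply(int err)
{
	printf("2: reply sent to requesting node (err=%d)\n", err);
}

static void effects(int err)
{
	printf("3: side effects (blocking asts, grants) for err=%d\n", err);
}

int main(void)
{
	int err = decide();

	reply(err);		/* the requester learns its result first */
	effects(err);		/* other holders get basts afterwards */
	return 0;
}
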
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc..c010ecf 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 
 	error = -ENOMEM;
 
-	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
 	if (!ls)
 		goto out;
 	memcpy(ls->ls_name, name, namelen);
@@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	if (flags & DLM_LSFL_TIMEWARN)
 		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
 
-	if (flags & DLM_LSFL_FS)
-		ls->ls_allocation = GFP_NOFS;
-	else
-		ls->ls_allocation = GFP_KERNEL;
-
 	/* ls_exflags are forced to match among nodes, and we don't
 	   need to require all nodes to have some flags set */
 	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_rsbtbl_size;
 	ls->ls_rsbtbl_size = size;
 
-	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
 	if (!ls->ls_rsbtbl)
 		goto out_lsfree;
 	for (i = 0; i < size; i++) {
@@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_lkbtbl_size;
 	ls->ls_lkbtbl_size = size;
 
-	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
 	if (!ls->ls_lkbtbl)
 		goto out_rsbfree;
 	for (i = 0; i < size; i++) {
@@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	size = dlm_config.ci_dirtbl_size;
 	ls->ls_dirtbl_size = size;
 
-	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
 	if (!ls->ls_dirtbl)
 		goto out_lkbfree;
 	for (i = 0; i < size; i++) {
@@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
 	mutex_init(&ls->ls_requestqueue_mutex);
 	mutex_init(&ls->ls_clear_proc_locks);
 
-	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
 	if (!ls->ls_recover_buf)
 		goto out_dirfree;
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb..52cab16 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1060,7 +1060,7 @@ static void init_local(void)
 		if (dlm_our_addr(&sas, i))
 			break;
 
-		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		addr = kmalloc(sizeof(*addr), GFP_NOFS);
 		if (!addr)
 			break;
 		memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
 	struct sockaddr_storage localaddr;
 	struct sctp_event_subscribe subscribe;
 	int result = -EINVAL, num = 1, i, addr_len;
-	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int bufsize = NEEDED_RMEM;
 
 	if (!con)
@@ -1171,7 +1171,7 @@ out:
 static int tcp_listen_for_all(void)
 {
 	struct socket *sock = NULL;
-	struct connection *con = nodeid2con(0, GFP_KERNEL);
+	struct connection *con = nodeid2con(0, GFP_NOFS);
 	int result = -EINVAL;
 
 	if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775..84f70bf 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
 	struct dlm_member *memb;
 	int w, error;
 
-	memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+	memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
 	if (!memb)
 		return -ENOMEM;
 
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
 
 	ls->ls_total_weight = total;
 
-	array = kmalloc(sizeof(int) * total, ls->ls_allocation);
+	array = kmalloc(sizeof(int) * total, GFP_NOFS);
 	if (!array)
 		return;
 
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 			continue;
 		log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
 
-		memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
+		memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
 		if (!memb)
 			return -ENOMEM;
 		memb->nodeid = rv->new[i];
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
 	int *ids = NULL, *new = NULL;
 	int error, ids_count = 0, new_count = 0;
 
-	rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
+	rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
 	if (!rv)
 		return -ENOMEM;
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b8..8e0d00d 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
 	char *p;
 
-	p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
+	p = kzalloc(ls->ls_lvblen, GFP_NOFS);
 	return p;
 }
 
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 
 	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
 
-	r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
+	r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
 	return r;
 }
 
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 
-	lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
+	lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
 	return lkb;
 }
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369..052095c 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
 	struct sk_buff *skb;
 	void *data;
 
-	skb = genlmsg_new(size, GFP_KERNEL);
+	skb = genlmsg_new(size, GFP_NOFS);
 	if (!skb)
 		return -ENOMEM;
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e..2863deb 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	xop = kzalloc(sizeof(*xop), GFP_KERNEL);
+	xop = kzalloc(sizeof(*xop), GFP_NOFS);
 	if (!xop) {
 		rv = -ENOMEM;
 		goto out;
@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	op = kzalloc(sizeof(*op), GFP_NOFS);
 	if (!op) {
 		rv = -ENOMEM;
 		goto out;
@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (!ls)
 		return -EINVAL;
 
-	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	op = kzalloc(sizeof(*op), GFP_NOFS);
 	if (!op) {
 		rv = -ENOMEM;
 		goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c2..3c83a49 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
 	char *mb;
 	int mb_len = sizeof(struct dlm_rcom) + len;
 
-	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
 	if (!mh) {
 		log_print("create_rcom to %d type %d len %d ENOBUFS",
 			  to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c..a44fa22 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
 	struct rq_entry *e;
 	int length = ms->m_header.h_length - sizeof(struct dlm_message);
 
-	e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
+	e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
 	if (!e) {
 		log_print("dlm_add_requestqueue: out of memory len %d", length);
 		return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994..a4bfd31 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
@@ -267,7 +269,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
 		goto out;
 	}
 
-	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
 	if (!ua)
 		goto out;
 	ua->proc = proc;
@@ -307,7 +309,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
 	if (!ls)
 		return -ENOENT;
 
-	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
 	if (!ua)
 		goto out;
 	ua->proc = proc;
@@ -352,7 +354,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
 
 	error = -ENOMEM;
 	len = strlen(name) + strlen(name_prefix) + 2;
-	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+	ls->ls_device.name = kzalloc(len, GFP_NOFS);
 	if (!ls->ls_device.name)
 		goto fail;
 
@@ -520,7 +522,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #endif
 		return -EINVAL;
 
-	kbuf = kzalloc(count + 1, GFP_KERNEL);
+	kbuf = kzalloc(count + 1, GFP_NOFS);
 	if (!kbuf)
 		return -ENOMEM;
 
@@ -546,7 +548,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 		/* add 1 after namelen so that the name string is terminated */
 		kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
-			       GFP_KERNEL);
+			       GFP_NOFS);
 		if (!kbuf) {
 			kfree(k32buf);
 			return -ENOMEM;
@@ -648,7 +650,7 @@ static int device_open(struct inode *inode, struct file *file)
 	if (!ls)
 		return -ENOENT;
 
-	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
 	if (!proc) {
 		dlm_put_lockspace(ls);
 		return -ENOMEM;
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c96864..f196091 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 1744f17..4e25328 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -198,7 +198,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 			       "the persistent file for the dentry with name "
 			       "[%s]; rc = [%d]\n", __func__,
 			       ecryptfs_dentry->d_name.name, rc);
-			goto out;
+			goto out_free;
 		}
 	}
 	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
@@ -206,7 +206,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 		rc = -EPERM;
 		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
 		       "file must hence be opened RO\n", __func__);
-		goto out;
+		goto out_free;
 	}
 	ecryptfs_set_file_lower(
 		file, ecryptfs_inode_to_private(inode)->lower_file);
@@ -293,12 +293,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
 	return rc;
 }
 
-static int ecryptfs_ioctl(struct inode *inode, struct file *file,
-			  unsigned int cmd, unsigned long arg);
+static long
+ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = NULL;
+	long rc = -ENOTTY;
+
+	if (ecryptfs_file_to_private(file))
+		lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+		rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+	return rc;
+}
+
+#ifdef CONFIG_COMPAT
+static long
+ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct file *lower_file = NULL;
+	long rc = -ENOIOCTLCMD;
+
+	if (ecryptfs_file_to_private(file))
+		lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
+		rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+	return rc;
+}
+#endif
 
 const struct file_operations ecryptfs_dir_fops = {
 	.readdir = ecryptfs_readdir,
-	.ioctl = ecryptfs_ioctl,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
 	.mmap = generic_file_mmap,
 	.open = ecryptfs_open,
 	.flush = ecryptfs_flush,
@@ -315,7 +343,10 @@ const struct file_operations ecryptfs_main_fops = {
 	.write = do_sync_write,
 	.aio_write = generic_file_aio_write,
 	.readdir = ecryptfs_readdir,
-	.ioctl = ecryptfs_ioctl,
+	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = ecryptfs_compat_ioctl,
+#endif
 	.mmap = generic_file_mmap,
 	.open = ecryptfs_open,
 	.flush = ecryptfs_flush,
@@ -324,20 +355,3 @@ const struct file_operations ecryptfs_main_fops = {
 	.fasync = ecryptfs_fasync,
 	.splice_read = generic_file_splice_read,
 };
-
-static int
-ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-	       unsigned long arg)
-{
-	int rc = 0;
-	struct file *lower_file = NULL;
-
-	if (ecryptfs_file_to_private(file))
-		lower_file = ecryptfs_file_to_lower(file);
-	if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
-		rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
-					     lower_file, cmd, arg);
-	else
-		rc = -ENOTTY;
-	return rc;
-}
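
[Annotation: the conversion above drops eCryptfs' legacy ->ioctl, which was
called under the big kernel lock with an inode argument, in favor of
->unlocked_ioctl plus a CONFIG_COMPAT ->compat_ioctl, each forwarding to the
lower filesystem. For reference, a minimal sketch of the modern entry points
as any driver might define them; the names are illustrative:]

#include <linux/fs.h>

static long demo_unlocked_ioctl(struct file *filp, unsigned int cmd,
				unsigned long arg)
{
	/* Called without the BKL; serialize internally if needed. */
	return -ENOTTY;
}

static const struct file_operations demo_fops = {
	.unlocked_ioctl	= demo_unlocked_ioctl,
#ifdef CONFIG_COMPAT
	/* Reusing the native handler is fine when no ABI layouts differ. */
	.compat_ioctl	= demo_unlocked_ioctl,
#endif
};
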
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 268b7d1..585987c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -272,7 +272,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		printk(KERN_ERR "%s: Out of memory whilst attempting "
 		       "to allocate ecryptfs_dentry_info struct\n",
 			__func__);
-		goto out_dput;
+		goto out_put;
 	}
 	ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
 	ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
@@ -345,8 +345,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 out_free_kmem:
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
 	goto out;
-out_dput:
+out_put:
 	dput(lower_dentry);
+	mntput(lower_mnt);
 	d_drop(ecryptfs_dentry);
 out:
 	return rc;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5b14d11..a73ed78 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -958,6 +958,9 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 54a05cc..f27e045 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -227,7 +227,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
-	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	vfs_check_frozen(sb, SB_FREEZE_TRANS);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -1218,6 +1218,11 @@ static int parse_options(char *options, struct super_block *sb,
 		if (!*p)
 			continue;
 
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1503,10 +1508,11 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, BARRIER);
 			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option)) {
-				set_opt(sbi->s_mount_opt, BARRIER);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1579,10 +1585,11 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		case Opt_auto_da_alloc:
-			if (match_int(&args[0], &option)) {
-				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
 			else
@@ -3390,7 +3397,7 @@ int ext4_force_commit(struct super_block *sb)
 
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
-		vfs_check_frozen(sb, SB_FREEZE_WRITE);
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
 		ret = ext4_journal_force_commit(journal);
 	}
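
[Annotation: the Opt_barrier and Opt_auto_da_alloc fixes above hinge on
knowing whether match_token() captured an "=<n>" argument at all: args[0].from
is zeroed before the call, so a bare option and a malformed number can be
told apart, and the latter now fails the mount instead of being silently
accepted. A hedged sketch of the pattern with an invented "feature" option,
not the ext4 token table itself:]

#include <linux/errno.h>
#include <linux/parser.h>

enum { Opt_feature, Opt_err };

static const match_table_t demo_tokens = {
	{ Opt_feature,	"feature=%u" },
	{ Opt_feature,	"feature" },
	{ Opt_err,	NULL }
};

static int demo_parse(char *p, int *on)
{
	substring_t args[MAX_OPT_ARGS];
	int option;

	args[0].to = args[0].from = NULL;	/* mark "no argument seen" */
	switch (match_token(p, demo_tokens, args)) {
	case Opt_feature:
		if (args[0].from) {		/* matched "feature=<n>" */
			if (match_int(&args[0], &option))
				return -EINVAL;	/* malformed number */
		} else				/* matched bare "feature" */
			option = 1;
		*on = !!option;
		return 0;
	}
	return -EINVAL;
}
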
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index fad3645..8b0da9b 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -85,46 +85,25 @@ struct ea_buffer {
 #define EA_MALLOC	0x0008
 
 
+static int is_known_namespace(const char *name)
+{
+	if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
+	    strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+		return false;
+
+	return true;
+}
+
 /*
  * These three routines are used to recognize on-disk extended attributes
  * that are in a recognized namespace.  If the attribute is not recognized,
  * "os2." is prepended to the name
  */
-static inline int is_os2_xattr(struct jfs_ea *ea)
+static int is_os2_xattr(struct jfs_ea *ea)
 {
-	/*
-	 * Check for "system."
-	 */
-	if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "user."
-	 */
-	if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "security."
-	 */
-	if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_SECURITY_PREFIX,
-		     XATTR_SECURITY_PREFIX_LEN))
-		return false;
-	/*
-	 * Check for "trusted."
-	 */
-	if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
-	    !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
-		return false;
-	/*
-	 * Add any other valid namespace prefixes here
-	 */
-
-	/*
-	 * We assume it's OS/2's flat namespace
-	 */
-	return true;
+	return !is_known_namespace(ea->name);
 }
 
 static inline int name_size(struct jfs_ea *ea)
@@ -762,13 +741,23 @@ static int can_set_xattr(struct inode *inode, const char *name,
 	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
 		return can_set_system_xattr(inode, name, value, value_len);
 
+	if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
+		/*
+		 * This makes sure that we aren't trying to set an
+		 * attribute in a different namespace by prefixing it
+		 * with "os2."
+		 */
+		if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
+				return -EOPNOTSUPP;
+		return 0;
+	}
+
 	/*
 	 * Don't allow setting an attribute in an unknown namespace.
 	 */
 	if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
 	    strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
-	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
-	    strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN))
+	    strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
 		return -EOPNOTSUPP;
 
 	return 0;
@@ -950,19 +939,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
 	int xattr_size;
 	ssize_t size;
 	int namelen = strlen(name);
-	char *os2name = NULL;
 	char *value;
 
-	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
-		os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1,
-				  GFP_KERNEL);
-		if (!os2name)
-			return -ENOMEM;
-		strcpy(os2name, name + XATTR_OS2_PREFIX_LEN);
-		name = os2name;
-		namelen -= XATTR_OS2_PREFIX_LEN;
-	}
-
 	down_read(&JFS_IP(inode)->xattr_sem);
 
 	xattr_size = ea_get(inode, &ea_buf, 0);
@@ -1000,8 +978,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data,
       out:
 	up_read(&JFS_IP(inode)->xattr_sem);
 
-	kfree(os2name);
-
 	return size;
 }
 
@@ -1010,6 +986,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
 {
 	int err;
 
+	if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
+		/*
+		 * skip past "os2." prefix
+		 */
+		name += XATTR_OS2_PREFIX_LEN;
+		/*
+		 * Don't allow retrieving properly prefixed attributes
+		 * by prepending them with "os2."
+		 */
+		if (is_known_namespace(name))
+			return -EOPNOTSUPP;
+	}
+
 	err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
 
 	return err;
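
[Annotation: with the jfs change above, the "os2." prefix is stripped before
lookup, and a name that would then collide with a real namespace is rejected
rather than aliased onto it. A userspace illustration; the mount path and
attribute names are assumed:]

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t n;

	/* "os2.comment" maps to the bare on-disk name "comment". */
	n = getxattr("/mnt/jfs/file", "os2.comment", buf, sizeof(buf));
	if (n >= 0)
		printf("os2.comment = %.*s\n", (int)n, buf);

	/* Aliasing a real namespace through "os2." is now refused. */
	if (getxattr("/mnt/jfs/file", "os2.user.secret", buf, sizeof(buf)) < 0)
		perror("os2.user.secret");	/* EOPNOTSUPP after this fix */
	return 0;
}
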
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5..5661db1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1765,9 +1765,9 @@ set_and_inc:
  *
  * The array index of the subtree root is passed back.
  */
-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
-				   struct ocfs2_path *left,
-				   struct ocfs2_path *right)
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+			    struct ocfs2_path *left,
+			    struct ocfs2_path *right)
 {
 	int i = 0;
 
@@ -2872,8 +2872,8 @@ out:
  * This looks similar, but is subtly different to
  * ocfs2_find_cpos_for_left_leaf().
  */
-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
-					  struct ocfs2_path *path, u32 *cpos)
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+				   struct ocfs2_path *path, u32 *cpos)
 {
 	int i, j, ret = 0;
 	u64 blkno;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d5..1db4359 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
 			      handle_t *handle,
 			      struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+				   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+			    struct ocfs2_path *left,
+			    struct ocfs2_path *right);
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac62..b5cb3ed 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 03a1ab8..bb0465f 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -969,6 +969,103 @@ out:
 }
 
 /*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_extent_block *eb,
+				       struct ocfs2_extent_list *el,
+				       int index,  u32 *cpos_end)
+{
+	int ret, i, subtree_root;
+	u32 cpos;
+	u64 blkno;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	struct ocfs2_path *left_path = NULL, *right_path = NULL;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_extent_list *tmp_el;
+
+	if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+		/*
+		 * We have a extent rec after index, so just use the e_cpos
+		 * of the next extent rec.
+		 */
+		*cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+		return 0;
+	}
+
+	if (!eb || (eb && !eb->h_next_leaf_blk)) {
+		/*
+		 * We are the last extent rec, so any high cpos should
+		 * be stored in this leaf refcount block.
+		 */
+		*cpos_end = UINT_MAX;
+		return 0;
+	}
+
+	/*
+	 * If the extent block isn't the last one, we have to find
+	 * the subtree root between this extent block and the next
+	 * leaf extent block and get the corresponding e_cpos from
+	 * the subroot. Otherwise we may corrupt the b-tree.
+	 */
+	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+
+	left_path = ocfs2_new_path_from_et(&et);
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+	ret = ocfs2_find_path(ci, left_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	right_path = ocfs2_new_path_from_path(left_path);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(ci, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	subtree_root = ocfs2_find_subtree_root(&et, left_path,
+					       right_path);
+
+	tmp_el = left_path->p_node[subtree_root].el;
+	blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+	for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
+		if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+			*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+			break;
+		}
+	}
+
+	BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
+
+out:
+	ocfs2_free_path(left_path);
+	ocfs2_free_path(right_path);
+	return ret;
+}
+
+/*
  * Given a cpos and len, try to find the refcount record which contains cpos.
  * 1. If cpos can be found in one refcount record, return the record.
  * 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
 				  struct buffer_head **ret_bh)
 {
 	int ret = 0, i, found;
-	u32 low_cpos;
+	u32 low_cpos, uninitialized_var(cpos_end);
 	struct ocfs2_extent_list *el;
-	struct ocfs2_extent_rec *tmp, *rec = NULL;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	struct ocfs2_extent_block *eb = NULL;
 	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct ocfs2_refcount_block *rb =
@@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
 		}
 	}
 
-	/* adjust len when we have ocfs2_extent_rec after it. */
-	if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
-		tmp = &el->l_recs[i+1];
+	if (found) {
+		ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
+						  eb, el, i, &cpos_end);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-		if (le32_to_cpu(tmp->e_cpos) < cpos + len)
-			len = le32_to_cpu(tmp->e_cpos) - cpos;
+		if (cpos_end < low_cpos + len)
+			len = cpos_end - low_cpos;
 	}
 
 	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2..9f55be4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -701,6 +701,10 @@ unlock_osb:
 
 		if (!ocfs2_is_hard_readonly(osb))
 			ocfs2_set_journal_params(osb);
+
+		sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+			((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+							MS_POSIXACL : 0);
 	}
 out:
 	unlock_kernel();
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650..42fdc76 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -405,7 +405,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
-			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
@@ -416,9 +415,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_cputime(task, &cputime);
-			utime = cputime.utime;
-			stime = cputime.stime;
+			thread_group_times(task, &utime, &stime);
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9062220..b5fe0aa 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2184,6 +2184,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2422,12 +2431,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed.  start with the first log block, find
 	 ** all the valid transactions, and pick out the oldest.
 	 */
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index a92c879..b37b13b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -75,7 +75,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c..d98bea8 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -87,6 +87,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
 		 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
 		 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
 		break;
 	case __SI_POLL:
 		err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -110,6 +111,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
 		err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
 		err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
 		err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+		err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
 		break;
 	default:
 		/*
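
[Annotation: the two hunks above copy si_int into ssi_int for the timer and
rt/message-queue cases, so a payload attached to a signal is no longer
dropped on the signalfd path. A userspace sketch using a POSIX timer; link
with -lrt, error checks omitted for brevity:]

#include <signal.h>
#include <stdio.h>
#include <sys/signalfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	sigset_t mask;
	struct signalfd_siginfo fdsi;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,
		.sigev_value.sival_int = 42,	/* the payload in question */
	};
	struct itimerspec its = { .it_value.tv_nsec = 1000000 };
	timer_t tid;
	int sfd;

	sigemptyset(&mask);
	sigaddset(&mask, SIGRTMIN);
	sigprocmask(SIG_BLOCK, &mask, NULL);
	sfd = signalfd(-1, &mask, 0);

	timer_create(CLOCK_MONOTONIC, &sev, &tid);
	timer_settime(tid, 0, &its, NULL);

	read(sfd, &fdsi, sizeof(fdsi));		/* blocks until the timer fires */
	printf("ssi_int = %d\n", fdsi.ssi_int);	/* 42 with this fix */
	return 0;
}
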
diff --git a/fs/splice.c b/fs/splice.c
index 7394e9e..e5efbb9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -365,17 +365,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
-			/*
-			 * If in nonblock mode then dont block on waiting
-			 * for an in-flight io page
-			 */
-			if (flags & SPLICE_F_NONBLOCK) {
-				if (!trylock_page(page)) {
-					error = -EAGAIN;
-					break;
-				}
-			} else
-				lock_page(page);
+			lock_page(page);
 
 			/*
 			 * Page was truncated, or invalidated by the
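
[Annotation: after the splice change above, SPLICE_F_NONBLOCK no longer
avoids waiting for a page under I/O; it continues to govern only the pipe
side. A userspace sketch of the resulting semantics; the input path is
assumed, error checks trimmed:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int p[2];
	int fd = open("/etc/hostname", O_RDONLY);
	ssize_t n;

	pipe(p);
	/* May now block on page I/O like read(2) would; EAGAIN here
	   means "pipe full", not "page not ready yet". */
	n = splice(fd, NULL, p[1], NULL, 4096, SPLICE_F_NONBLOCK);
	printf("spliced %zd bytes\n", n);
	return 0;
}
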
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 64b1a4c..f73bc1b 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -282,10 +282,12 @@ extern struct clocksource * __init __weak clocksource_default_clock(void);
 extern void clocksource_mark_unstable(struct clocksource *cs);
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
-extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
+extern void
+update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
 extern void update_vsyscall_tz(void);
 #else
-static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
+static inline void
+update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
 {
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b6a5622..1ff0962 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -145,11 +145,11 @@ struct inodes_stat_t {
  *
  */
 #define RW_MASK		1
-#define RWA_MASK	2
+#define RWA_MASK		16
 #define READ 0
 #define WRITE 1
-#define READA 2		/* read-ahead  - don't block if no resources */
-#define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
+#define READA			16 /* readahead - don't block if no resources */
+#define SWRITE			17 /* for ll_rw_block(), wait for buffer lock */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7ca72b7..c49d6f5 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -52,16 +52,21 @@
  * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
  *                Used by threaded interrupts which need to keep the
  *                irq line disabled until the threaded handler has been run.
+ * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend
+ *
  */
 #define IRQF_DISABLED		0x00000020
 #define IRQF_SAMPLE_RANDOM	0x00000040
 #define IRQF_SHARED		0x00000080
 #define IRQF_PROBE_SHARED	0x00000100
-#define IRQF_TIMER		0x00000200
+#define __IRQF_TIMER		0x00000200
 #define IRQF_PERCPU		0x00000400
 #define IRQF_NOBALANCING	0x00000800
 #define IRQF_IRQPOLL		0x00001000
 #define IRQF_ONESHOT		0x00002000
+#define IRQF_NO_SUSPEND		0x00004000
+
+#define IRQF_TIMER		(__IRQF_TIMER | IRQF_NO_SUSPEND)
 
 /*
  * Bits used by threaded handlers:
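
[Annotation: IRQF_TIMER is now a composite. The new IRQF_NO_SUSPEND bit
carries the "keep me enabled across suspend" meaning on its own, and the
kernel/irq/manage.c hunk further down switches the suspend path to test it,
so a wakeup-critical driver no longer has to masquerade as a timer
interrupt. A driver-side sketch; the device name and handler are
illustrative:]

#include <linux/interrupt.h>

static irqreturn_t pmic_irq_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int pmic_setup_irq(int irq, void *dev)
{
	/* Stays enabled while other IRQs are disabled for suspend. */
	return request_irq(irq, pmic_irq_handler, IRQF_NO_SUSPEND,
			   "pmic", dev);
}
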
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 812a5f3..ec12f8c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1560,6 +1560,8 @@ extern void netif_carrier_on(struct net_device *dev);
 
 extern void netif_carrier_off(struct net_device *dev);
 
+extern void netif_notify_peers(struct net_device *dev);
+
 /**
  *	netif_dormant_on - mark device as dormant.
  *	@dev: network device
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 44428d2..5ecdb50 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -201,6 +201,7 @@ static inline int notifier_to_errno(int ret)
 #define NETDEV_PRE_UP		0x000D
 #define NETDEV_BONDING_OLDTYPE  0x000E
 #define NETDEV_BONDING_NEWTYPE  0x000F
+#define NETDEV_NOTIFY_PEERS	0x0013
 
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 99928dc..7fa02b4 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -70,6 +70,11 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
 void reiserfs_security_free(struct reiserfs_security_handle *sec);
 #endif
 
+static inline int reiserfs_xattrs_initialized(struct super_block *sb)
+{
+	return REISERFS_SB(sb)->priv_root != NULL;
+}
+
 #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
 static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b253434..cc24beb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -628,6 +628,9 @@ struct signal_struct {
 	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	cputime_t prev_utime, prev_stime;
+#endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 	unsigned long inblock, oublock, cinblock, coublock;
@@ -1725,6 +1728,7 @@ static inline void put_task_struct(struct task_struct *t)
 extern cputime_t task_utime(struct task_struct *p);
 extern cputime_t task_stime(struct task_struct *p);
 extern cputime_t task_gtime(struct task_struct *p);
+extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0482229..8dc0821 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -98,6 +98,9 @@ extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
 extern void tick_check_idle(int cpu);
 extern int tick_oneshot_mode_active(void);
+#  ifndef arch_needs_cpu
+#   define arch_needs_cpu(cpu) (0)
+#  endif
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index 2526f3b..dd0bde1 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h
@@ -19,4 +19,8 @@
 /* device can't handle its Configuration or Interface strings */
 #define USB_QUIRK_CONFIG_INTF_STRINGS	0x00000008
 
+/* device needs a pause during initialization, after we read the device
+   descriptor */
+#define USB_QUIRK_DELAY_INIT		0x00000040
+
 #endif /* __LINUX_USB_QUIRKS_H */
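
[Annotation: the new quirk flag is consumed during device enumeration; a
misbehaving device gets tagged in the core quirks table. A sketch of such an
entry as it would appear in drivers/usb/core/quirks.c; the vendor/product
IDs are invented:]

#include <linux/usb.h>
#include <linux/usb/quirks.h>

static const struct usb_device_id demo_quirk_list[] = {
	/* Hypothetical device that wedges without a post-descriptor delay */
	{ USB_DEVICE(0x1234, 0x5678), .driver_info = USB_QUIRK_DELAY_INIT },
	{ }	/* terminating entry */
};
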
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac..4a0e062 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -110,8 +110,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, task_utime(tsk));
-		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -1205,6 +1205,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
 
 		/*
 		 * The resource counters for the group leader are in its
@@ -1220,20 +1221,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
 		 */
+		thread_group_times(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		psig->cutime =
 			cputime_add(psig->cutime,
-			cputime_add(p->utime,
-			cputime_add(sig->utime,
-				    sig->cutime)));
+			cputime_add(tgutime,
+				    sig->cutime));
 		psig->cstime =
 			cputime_add(psig->cstime,
-			cputime_add(p->stime,
-			cputime_add(sig->stime,
-				    sig->cstime)));
+			cputime_add(tgstime,
+				    sig->cstime));
 		psig->cgtime =
 			cputime_add(psig->cgtime,
 			cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 28b4874..ce2f585 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+	sig->prev_utime = sig->prev_stime = cputime_zero;
+#endif
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa4bdd4..f34e231 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -200,7 +200,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
 void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
 {
 	if (suspend) {
-		if (!desc->action || (desc->action->flags & IRQF_TIMER))
+		if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
 			return;
 		desc->status |= IRQ_SUSPENDED;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index d0958da..9990074 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5215,45 +5215,90 @@ cputime_t task_stime(struct task_struct *p)
 {
 	return p->stime;
 }
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputime(p, &cputime);
+
+	*ut = cputime.utime;
+	*st = cputime.stime;
+}
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting. (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct signal_struct *sig = p->signal;
+	struct task_cputime cputime;
+	cputime_t rtime, utime, total;
+
+	thread_group_cputime(p, &cputime);
+
+	total = cputime_add(cputime.utime, cputime.stime);
+	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+	if (total) {
+		u64 temp;
+
+		temp = (u64)(rtime * cputime.utime);
+		do_div(temp, total);
+		utime = (cputime_t)temp;
+	} else
+		utime = rtime;
+
+	sig->prev_utime = max(sig->prev_utime, utime);
+	sig->prev_stime = max(sig->prev_stime,
+			      cputime_sub(rtime, sig->prev_utime));
+
+	*ut = sig->prev_utime;
+	*st = sig->prev_stime;
+}
 #endif
 
 inline cputime_t task_gtime(struct task_struct *p)
@@ -5545,7 +5590,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 	 * the mutex owner just released it and exited.
 	 */
 	if (probe_kernel_address(&owner->cpu, cpu))
-		goto out;
+		return 0;
 #else
 	cpu = owner->cpu;
 #endif
@@ -5555,14 +5600,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 	 * the cpu field may no longer be valid.
 	 */
 	if (cpu >= nr_cpumask_bits)
-		goto out;
+		return 0;
 
 	/*
 	 * We need to validate that we can do a
 	 * get_cpu() and that we have the percpu area.
 	 */
 	if (!cpu_online(cpu))
-		goto out;
+		return 0;
 
 	rq = cpu_rq(cpu);
 
@@ -5581,7 +5626,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 
 		cpu_relax();
 	}
-out:
+
 	return 1;
 }
 #endif
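
[Annotation: thread_group_times() above splits the group's precise CFS
runtime (rtime) in the utime:stime ratio that the tick samples recorded,
then clamps both outputs under the caller's siglock so they never go
backwards. The arithmetic in isolation, as standalone C with cputime_t
modeled as a plain integer; not kernel code:]

#include <stdio.h>

typedef unsigned long long cputime_t;

struct grp { cputime_t prev_utime, prev_stime; };

static void toy_thread_group_times(struct grp *g, cputime_t utime,
				   cputime_t stime, cputime_t rtime,
				   cputime_t *ut, cputime_t *st)
{
	cputime_t total = utime + stime;

	if (total)
		utime = rtime * utime / total;	/* scale tick-based utime */
	else
		utime = rtime;

	if (utime > g->prev_utime)		/* monotonic, like max() */
		g->prev_utime = utime;
	if (rtime - g->prev_utime > g->prev_stime)
		g->prev_stime = rtime - g->prev_utime;

	*ut = g->prev_utime;
	*st = g->prev_stime;
}

int main(void)
{
	struct grp g = { 0, 0 };
	cputime_t ut, st;

	/* Ticks sampled 30u/10s, but CFS measured 100 units of runtime. */
	toy_thread_group_times(&g, 30, 10, 100, &ut, &st);
	printf("utime=%llu stime=%llu\n", ut, st);	/* 75 and 25 */
	return 0;
}
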
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760..26e4b8a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -911,16 +911,15 @@ change_okay:
 
 void do_sys_times(struct tms *tms)
 {
-	struct task_cputime cputime;
-	cputime_t cutime, cstime;
+	cputime_t tgutime, tgstime, cutime, cstime;
 
-	thread_group_cputime(current, &cputime);
 	spin_lock_irq(&current->sighand->siglock);
+	thread_group_times(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
-	tms->tms_utime = cputime_to_clock_t(cputime.utime);
-	tms->tms_stime = cputime_to_clock_t(cputime.stime);
+	tms->tms_utime = cputime_to_clock_t(tgutime);
+	tms->tms_stime = cputime_to_clock_t(tgstime);
 	tms->tms_cutime = cputime_to_clock_t(cutime);
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
-	cputime_t utime, stime;
-	struct task_cputime cputime;
+	cputime_t tgutime, tgstime, utime, stime;
 	unsigned long maxrss = 0;
 
 	memset((char *) r, 0, sizeof *r);
@@ -1373,9 +1371,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 				break;
 
 		case RUSAGE_SELF:
-			thread_group_cputime(p, &cputime);
-			utime = cputime_add(utime, cputime.utime);
-			stime = cputime_add(stime, cputime.stime);
+			thread_group_times(p, &tgutime, &tgstime);
+			utime = cputime_add(utime, tgutime);
+			stime = cputime_add(stime, tgstime);
 			r->ru_nvcsw += p->signal->nvcsw;
 			r->ru_nivcsw += p->signal->nivcsw;
 			r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 44320b1..b63cfeb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
  * value. We do this unconditionally on any cpu, as we don't know whether the
  * cpu, which has the update task assigned is in a long sleep.
  */
-static void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	unsigned long flags;
-	ktime_t now;
-
-	if (!ts->tick_stopped)
-		return;
 
 	cpumask_clear_cpu(cpu, nohz_cpu_mask);
-	now = ktime_get();
 	ts->idle_waketime = now;
 
 	local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
 	touch_softlockup_watchdog();
 }
 
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t delta;
 
-	if (ts->idle_active) {
-		ktime_t now, delta;
-		now = ktime_get();
-		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_lastupdate = now;
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-		ts->idle_active = 0;
+	delta = ktime_sub(now, ts->idle_entrytime);
+	ts->idle_lastupdate = now;
+	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+	ts->idle_active = 0;
 
-		sched_clock_idle_wakeup_event(0);
-	}
+	sched_clock_idle_wakeup_event(0);
 }
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -289,12 +281,15 @@ void tick_nohz_stop_sched_tick(int inidle)
 			time_delta = KTIME_MAX;
 	} while (read_seqretry(&xtime_lock, seq));
 
-	/* Get the next timer wheel timer */
-	next_jiffies = get_next_timer_interrupt(last_jiffies);
-	delta_jiffies = next_jiffies - last_jiffies;
-
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
+	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
+	    arch_needs_cpu(cpu)) {
+		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
+	} else {
+		/* Get the next timer wheel timer */
+		next_jiffies = get_next_timer_interrupt(last_jiffies);
+		delta_jiffies = next_jiffies - last_jiffies;
+	}
 	/*
 	 * Do not stop the tick, if we are only one off
 	 * or if the cpu is required for rcu
@@ -460,7 +455,11 @@ void tick_nohz_restart_sched_tick(void)
 	ktime_t now;
 
 	local_irq_disable();
-	tick_nohz_stop_idle(cpu);
+	if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+		now = ktime_get();
+
+	if (ts->idle_active)
+		tick_nohz_stop_idle(cpu, now);
 
 	if (!ts->inidle || !ts->tick_stopped) {
 		ts->inidle = 0;
@@ -474,7 +473,6 @@ void tick_nohz_restart_sched_tick(void)
 
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
-	now = ktime_get();
 	tick_do_update_jiffies64(now);
 	cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
@@ -608,22 +606,18 @@ static void tick_nohz_switch_to_nohz(void)
  * timer and do not touch the other magic bits which need to be done
  * when idle is left.
  */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
 	/* Switch back to 2.6.27 behaviour */
 
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t delta, now;
-
-	if (!ts->tick_stopped)
-		return;
+	ktime_t delta;
 
 	/*
 	 * Do not touch the tick device, when the next expiry is either
 	 * already reached or less/equal than the tick period.
 	 */
-	now = ktime_get();
 	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
 	if (delta.tv64 <= tick_period.tv64)
 		return;
@@ -632,9 +626,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
 
+static inline void tick_check_nohz(int cpu)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now;
+
+	if (!ts->idle_active && !ts->tick_stopped)
+		return;
+	now = ktime_get();
+	if (ts->idle_active)
+		tick_nohz_stop_idle(cpu, now);
+	if (ts->tick_stopped) {
+		tick_nohz_update_jiffies(now);
+		tick_nohz_kick_tick(cpu, now);
+	}
+}
+
 #else
 
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 
 #endif /* NO_HZ */
 
@@ -644,11 +655,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
 	tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
-	tick_nohz_stop_idle(cpu);
-	tick_nohz_update_jiffies();
-	tick_nohz_kick_tick(cpu);
-#endif
+	tick_check_nohz(cpu);
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8b709de..26e2f37 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
 {
 	xtime.tv_sec += leapsecond;
 	wall_to_monotonic.tv_sec -= leapsecond;
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 
 #ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
 	timekeeper.ntp_error = 0;
 	ntp_clear();
 
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 
@@ -822,7 +822,7 @@ void update_wall_time(void)
 	update_xtime_cache(nsecs);
 
 	/* check to see if there is a new clocksource to use */
-	update_vsyscall(&xtime, timekeeper.clock);
+	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 
 /**
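
[Annotation: all three callers above now hand timekeeper.mult to
update_vsyscall(), so the vDSO's gettimeofday sees the NTP-adjusted
multiplier rather than the clocksource's raw one and stays in step with the
syscall path. A hedged sketch of what an architecture's hook might now
store; the structure and names are illustrative, not any real arch's
implementation:]

#include <linux/clocksource.h>
#include <linux/time.h>

static struct demo_gtod_data {
	struct timespec	wall_time;
	cycle_t		cycle_last;
	u32		mult;
	u32		shift;
} demo_gtod_data;

void update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
{
	demo_gtod_data.wall_time  = *ts;
	demo_gtod_data.cycle_last = c->cycle_last;
	demo_gtod_data.mult       = mult;	/* was: c->mult */
	demo_gtod_data.shift      = c->shift;
}
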
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc641..8aeba53 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -589,7 +589,6 @@ static struct page_state {
 
 	{ lru|dirty,	lru|dirty,	"LRU",		me_pagecache_dirty },
 	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
-	{ swapbacked,	swapbacked,	"anonymous",	me_pagecache_clean },
 
 	/*
 	 * Catchall entry: must be at end.
@@ -638,7 +637,7 @@ static int page_action(struct page_state *ps, struct page *p,
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
-static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 				  int trapno)
 {
 	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -648,15 +647,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int i;
 	int kill = 1;
 
-	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
-		return;
+	if (PageReserved(p) || PageSlab(p))
+		return SWAP_SUCCESS;
 
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
 	 */
 	if (!page_mapped(p))
-		return;
+		return SWAP_SUCCESS;
+
+	if (PageCompound(p) || PageKsm(p))
+		return SWAP_FAIL;
 
 	if (PageSwapCache(p)) {
 		printk(KERN_ERR
@@ -718,6 +720,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 */
 	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
 		      ret != SWAP_SUCCESS, pfn);
+
+	return ret;
 }
 
 int __memory_failure(unsigned long pfn, int trapno, int ref)
@@ -787,8 +791,13 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
 
 	/*
 	 * Now take care of user space mappings.
+	 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
 	 */
-	hwpoison_user_mappings(p, pfn, trapno);
+	if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
+		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+		res = -EBUSY;
+		goto out;
+	}
 
 	/*
 	 * Torn down by someone else?
diff --git a/mm/memory.c b/mm/memory.c
index 348e206..76d1b21 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2630,6 +2630,26 @@ out_release:
 }
 
 /*
+ * This is like a special single-page "expand_downwards()",
+ * except we must first make sure that 'address-PAGE_SIZE'
+ * doesn't hit another vma.
+ *
+ * The "find_vma()" will do the right thing even if we wrap
+ */
+static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
+{
+	address &= PAGE_MASK;
+	if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
+		address -= PAGE_SIZE;
+		if (find_vma(vma->vm_mm, address) != vma)
+			return -ENOMEM;
+
+		expand_stack(vma, address);
+	}
+	return 0;
+}
+
+/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2642,6 +2662,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
+	if (check_stack_guard_page(vma, address) < 0) {
+		pte_unmap(page_table);
+		return VM_FAULT_SIGBUS;
+	}
+
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
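
[Annotation: check_stack_guard_page() above refuses an anonymous fault on
the lowest page of a grows-down stack when another mapping sits directly
below it, and otherwise extends the stack by one page first. The rule in
isolation, applied to a toy VMA table; standalone C, not kernel code:]

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define VM_GROWSDOWN 0x1

struct toy_vma { unsigned long start, end, flags; };

/* First vma whose end is above addr, or NULL: mimics find_vma(). */
static struct toy_vma *toy_find_vma(struct toy_vma *v, int n,
				    unsigned long addr)
{
	for (int i = 0; i < n; i++)
		if (addr < v[i].end)
			return &v[i];
	return NULL;
}

static int toy_check_stack_guard_page(struct toy_vma *tbl, int n,
				      struct toy_vma *vma, unsigned long addr)
{
	addr &= ~(PAGE_SIZE - 1);
	if ((vma->flags & VM_GROWSDOWN) && addr == vma->start) {
		addr -= PAGE_SIZE;
		if (toy_find_vma(tbl, n, addr) != vma)
			return -1;	/* another mapping below: refuse */
		vma->start = addr;	/* otherwise grow the stack down */
	}
	return 0;
}

int main(void)
{
	struct toy_vma tbl[] = {
		{ 0x10000, 0x11000, 0 },		/* some mapping */
		{ 0x20000, 0x30000, VM_GROWSDOWN },	/* the stack */
	};
	struct toy_vma tbl2[] = {
		{ 0x1e000, 0x1f000, 0 },		/* mapping just below */
		{ 0x1f000, 0x30000, VM_GROWSDOWN },
	};

	/* Page below the stack is free: fault grows the stack (0). */
	printf("%d\n", toy_check_stack_guard_page(tbl, 2, &tbl[1], 0x20010));
	/* A mapping abuts the stack from below: fault is refused (-1). */
	printf("%d\n", toy_check_stack_guard_page(tbl2, 2, &tbl2[1], 0x1f010));
	return 0;
}
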
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590ee..270e136 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -330,8 +330,10 @@ checks:
 	if (offset > si->highest_bit)
 		scan_base = offset = si->lowest_bit;
 
-	/* reuse swap entry of cache-only swap if not busy. */
-	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+	/* reuse swap entry of cache-only swap if not hibernation. */
+	if (vm_swap_full()
+		&& cache == SWAP_CACHE
+		&& si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
 		spin_unlock(&swap_lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cc35645..f84f6dd 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1076,6 +1076,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 		}
 		ip_mc_up(in_dev);
 		/* fall through */
+	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_CHANGEADDR:
 		/* Send gratuitous ARP to notify of link change */
 		if (IN_DEV_ARP_NOTIFY(in_dev)) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4ae6aa5..3028001 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -330,6 +330,24 @@ void netif_carrier_off(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_carrier_off);
 
+/**
+ * 	netif_notify_peers - notify network peers about existence of @dev
+ * 	@dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void netif_notify_peers(struct net_device *dev)
+{
+	rtnl_lock();
+	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(netif_notify_peers);
+
 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
    under all circumstances. It is difficult to invent anything faster or
    cheaper.
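
[Annotation: netif_notify_peers() takes the RTNL, so it must be called in
process context; the devinet.c hunk above maps the new notifier event onto
the existing gratuitous-ARP path. A caller-side sketch for a hypothetical
virtualized NIC driver that learns of a migration from a config interrupt:]

#include <linux/netdevice.h>
#include <linux/workqueue.h>

struct demo_priv {
	struct net_device *dev;
	struct work_struct announce_work;
};

static void demo_announce(struct work_struct *work)
{
	struct demo_priv *p = container_of(work, struct demo_priv,
					   announce_work);

	netif_notify_peers(p->dev);	/* takes rtnl_lock: not from IRQ */
}

/* From the config-change interrupt: schedule_work(&priv->announce_work); */
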
