linux-kernel - RE: [PATCH v8 01/33] x86/traps: let common_interrupt() handle IRQ_MOVE_CLEANUP

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <SA1PR11MB67344FE6C8A83810D0CAF175A85FA@SA1PR11MB6734.namprd11.prod.outlook.com>
Date:   Mon, 19 Jun 2023 08:00:34 +0000
From:   "Li, Xin3" <xin3.li@...el.com>
To:     Thomas Gleixner <tglx@...utronix.de>,
        "H. Peter Anvin" <hpa@...or.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "x86@...nel.org" <x86@...nel.org>,
        "kvm@...r.kernel.org" <kvm@...r.kernel.org>
CC:     "mingo@...hat.com" <mingo@...hat.com>,
        "bp@...en8.de" <bp@...en8.de>,
        "dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "andrew.cooper3@...rix.com" <andrew.cooper3@...rix.com>,
        "Christopherson,, Sean" <seanjc@...gle.com>,
        "pbonzini@...hat.com" <pbonzini@...hat.com>,
        "Shankar, Ravi V" <ravi.v.shankar@...el.com>,
        "jiangshanlai@...il.com" <jiangshanlai@...il.com>,
        "Kang, Shan" <shan.kang@...el.com>
Subject: RE: [PATCH v8 01/33] x86/traps: let common_interrupt() handle
 IRQ_MOVE_CLEANUP_VECTOR

> The untested below should do the trick. Wants to be split in several
> patches, but you get the idea.

Hi Thomas,

How do you want to split the patch?

To me it's better to keep the changes in one patch, thus the differences
are more obvious.

We need a second patch to do vector cleanup in lapic_offline() in case the
vector cleanup timer has not expired.  But I don't see a good way to split
your draft patch (I only have a minor fix on it).

Thanks!
Xin

> 
> Thanks,
> 
>         tglx
> ---
> Subject: x86/vector: Get rid of IRQ_MOVE_CLEANUP_VECTOR
> From: Thomas Gleixner <tglx@...utronix.de>
> 
> No point to waste a vector for cleaning up the leftovers of a moved
> interrupt. Aside of that this must be the lowest priority of all vectors
> which makes FRED systems utilizing vectors 0x10-0x1f more complicated
> than necessary.
> 
> Schedule a timer instead.
> 
> Not-Yet-Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> ---
>  arch/x86/include/asm/hw_irq.h       |    4 -
>  arch/x86/include/asm/idtentry.h     |    1
>  arch/x86/include/asm/irq_vectors.h  |    7 ---
>  arch/x86/kernel/apic/vector.c       |   83 ++++++++++++++++++++++++++----------
>  arch/x86/kernel/idt.c               |    1
>  arch/x86/platform/uv/uv_irq.c       |    2
>  drivers/iommu/amd/iommu.c           |    2
>  drivers/iommu/hyperv-iommu.c        |    4 -
>  drivers/iommu/intel/irq_remapping.c |    2
>  9 files changed, 68 insertions(+), 38 deletions(-)
> 
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct i
>  extern void lock_vector_lock(void);
>  extern void unlock_vector_lock(void);
>  #ifdef CONFIG_SMP
> -extern void send_cleanup_vector(struct irq_cfg *);
> +extern void vector_schedule_cleanup(struct irq_cfg *);
>  extern void irq_complete_move(struct irq_cfg *cfg);
>  #else
> -static inline void send_cleanup_vector(struct irq_cfg *c) { }
> +static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
>  static inline void irq_complete_move(struct irq_cfg *c) { }
>  #endif
> 
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI
> 
>  #ifdef CONFIG_SMP
>  DECLARE_IDTENTRY(RESCHEDULE_VECTOR,
> 	sysvec_reschedule_ipi);
> -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR,
> 	sysvec_irq_move_cleanup);
>  DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR,			sysvec_reboot);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,
> 	sysvec_call_function_single);
>  DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR,
> 	sysvec_call_function);
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -35,13 +35,6 @@
>   */
>  #define FIRST_EXTERNAL_VECTOR		0x20
> 
> -/*
> - * Reserve the lowest usable vector (and hence lowest priority)  0x20 for
> - * triggering cleanup after irq migration. 0x21-0x2f will still be used
> - * for device interrupts.
> - */
> -#define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
> -
>  #define IA32_SYSCALL_VECTOR		0x80
> 
>  /*
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask;
>  static struct irq_chip lapic_controller;
>  static struct irq_matrix *vector_matrix;
>  #ifdef CONFIG_SMP
> -static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
> +
> +static void vector_cleanup_callback(struct timer_list *tmr);
> +
> +struct vector_cleanup {
> +	struct hlist_head	head;
> +	struct timer_list	timer;
> +};
> +
> +static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
> +	.head	= HLIST_HEAD_INIT,
> +	.timer	= __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
> +};
>  #endif
> 
>  void lock_vector_lock(void)
> @@ -843,8 +854,12 @@ void lapic_online(void)
> 
>  void lapic_offline(void)
>  {
> +	struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
> +
>  	lock_vector_lock();
>  	irq_matrix_offline(vector_matrix);
> +	WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
> +	WARN_ON_ONCE(!hlist_empty(&cl->head));
>  	unlock_vector_lock();
>  }
> 
> @@ -934,62 +949,86 @@ static void free_moved_vector(struct api
>  	apicd->move_in_progress = 0;
>  }
> 
> -DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
> +static void vector_cleanup_callback(struct timer_list *tmr)
>  {
> -	struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
> +	struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
>  	struct apic_chip_data *apicd;
>  	struct hlist_node *tmp;
> +	bool rearm = false;
> 
> -	ack_APIC_irq();
>  	/* Prevent vectors vanishing under us */
> -	raw_spin_lock(&vector_lock);
> +	raw_spin_lock_irq(&vector_lock);
> 
> -	hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
> +	hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
>  		unsigned int irr, vector = apicd->prev_vector;
> 
>  		/*
>  		 * Paranoia: Check if the vector that needs to be cleaned
> -		 * up is registered at the APICs IRR. If so, then this is
> -		 * not the best time to clean it up. Clean it up in the
> -		 * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
> -		 * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
> -		 * priority external vector, so on return from this
> -		 * interrupt the device interrupt will happen first.
> +		 * up is registered at the APICs IRR. That's clearly a
> +		 * hardware issue if the vector arrived on the old target
> +		 * _after_ interrupts were disabled above. Keep @apicd
> +		 * on the list and schedule the timer again to give the CPU
> +		 * a chance to handle the pending interrupt.
>  		 */
>  		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
>  		if (irr & (1U << (vector % 32))) {
> -			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
> +			pr_warn_once("Moved interrupt pending in old target APIC
> %u\n", apicd->irq);
> +			rearm = true;
>  			continue;
>  		}
>  		free_moved_vector(apicd);
>  	}
> 
> -	raw_spin_unlock(&vector_lock);
> +	/*
> +	 * Must happen under vector_lock to make the timer_pending() check
> +	 * in __vector_schedule_cleanup() race free against the rearm here.
> +	 */
> +	if (rearm)
> +		mod_timer(tmr, jiffies + 1);
> +
> +	raw_spin_unlock_irq(&vector_lock);
>  }
> 
> -static void __send_cleanup_vector(struct apic_chip_data *apicd)
> +static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
>  {
> -	unsigned int cpu;
> +	unsigned int cpu = apicd->prev_cpu;
> 
>  	raw_spin_lock(&vector_lock);
>  	apicd->move_in_progress = 0;
> -	cpu = apicd->prev_cpu;
>  	if (cpu_online(cpu)) {
> -		hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
> -		apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
> +		struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
> +
> +		/*
> +		 * The lockless timer_pending() check is safe here. If it
> +		 * returns true, then the callback will observe this new
> +		 * apic data in the hlist as everything is serialized by
> +		 * vector lock.
> +		 *
> +		 * If it returns false then the timer is either not armed
> +		 * or the other CPU executes the callback, which again
> +		 * would be blocked on vector lock. Rearming it in the
> +		 * latter case makes it fire for nothing.
> +		 *
> +		 * This is also safe against the callback rearming the timer
> +		 * because that's serialized via vector lock too.
> +		 */
> +		if (!timer_pending(&cl->timer)) {
> +			cl->timer.expires = jiffies + 1;
> +			add_timer_on(&cl->timer, cpu);
> +		}
>  	} else {
>  		apicd->prev_vector = 0;
>  	}
>  	raw_spin_unlock(&vector_lock);
>  }
> 
> -void send_cleanup_vector(struct irq_cfg *cfg)
> +void vector_schedule_cleanup(struct irq_cfg *cfg)
>  {
>  	struct apic_chip_data *apicd;
> 
>  	apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
>  	if (apicd->move_in_progress)
> -		__send_cleanup_vector(apicd);
> +		__vector_schedule_cleanup(apicd);
>  }
> 
>  void irq_complete_move(struct irq_cfg *cfg)
> @@ -1007,7 +1046,7 @@ void irq_complete_move(struct irq_cfg *c
>  	 * on the same CPU.
>  	 */
>  	if (apicd->cpu == smp_processor_id())
> -		__send_cleanup_vector(apicd);
> +		__vector_schedule_cleanup(apicd);
>  }
> 
>  /*
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -131,7 +131,6 @@ static const __initconst struct idt_data
>  	INTG(RESCHEDULE_VECTOR,
> 	asm_sysvec_reschedule_ipi),
>  	INTG(CALL_FUNCTION_VECTOR,		asm_sysvec_call_function),
>  	INTG(CALL_FUNCTION_SINGLE_VECTOR,	asm_sysvec_call_function_single),
> -	INTG(IRQ_MOVE_CLEANUP_VECTOR,
> 	asm_sysvec_irq_move_cleanup),
>  	INTG(REBOOT_VECTOR,			asm_sysvec_reboot),
>  #endif
> 
> --- a/arch/x86/platform/uv/uv_irq.c
> +++ b/arch/x86/platform/uv/uv_irq.c
> @@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *dat
>  	ret = parent->chip->irq_set_affinity(parent, mask, force);
>  	if (ret >= 0) {
>  		uv_program_mmr(cfg, data->chip_data);
> -		send_cleanup_vector(cfg);
> +		vector_schedule_cleanup(cfg);
>  	}
> 
>  	return ret;
> --- a/drivers/iommu/amd/iommu.c
> +++ b/drivers/iommu/amd/iommu.c
> @@ -3639,7 +3639,7 @@ static int amd_ir_set_affinity(struct ir
>  	 * at the new destination. So, time to cleanup the previous
>  	 * vector allocation.
>  	 */
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
> 
>  	return IRQ_SET_MASK_OK_DONE;
>  }
> --- a/drivers/iommu/hyperv-iommu.c
> +++ b/drivers/iommu/hyperv-iommu.c
> @@ -51,7 +51,7 @@ static int hyperv_ir_set_affinity(struct
>  	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
>  		return ret;
> 
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
> 
>  	return 0;
>  }
> @@ -257,7 +257,7 @@ static int hyperv_root_ir_set_affinity(s
>  	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
>  		return ret;
> 
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
> 
>  	return 0;
>  }
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -1180,7 +1180,7 @@ intel_ir_set_affinity(struct irq_data *d
>  	 * at the new destination. So, time to cleanup the previous
>  	 * vector allocation.
>  	 */
> -	send_cleanup_vector(cfg);
> +	vector_schedule_cleanup(cfg);
> 
>  	return IRQ_SET_MASK_OK_DONE;
>  }