netdev - Re: [PATCH v5] ptp: Add support for the AMZNC10C 'vmclock' device

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5c0303f4-c609-4a4e-a012-c27f08cfa6f9@redhat.com>
Date: Tue, 3 Sep 2024 15:56:07 +0200
From: Paolo Abeni <pabeni@...hat.com>
To: David Woodhouse <dwmw2@...radead.org>,
 Richard Cochran <richardcochran@...il.com>,
 Peter Hilber <peter.hilber@...nsynergy.com>, linux-kernel@...r.kernel.org,
 virtualization@...ts.linux.dev, linux-arm-kernel@...ts.infradead.org,
 linux-rtc@...r.kernel.org, "Ridoux, Julien" <ridouxj@...zon.com>,
 virtio-dev@...ts.linux.dev, "Luu, Ryan" <rluu@...zon.com>,
 "Chashper, David" <chashper@...zon.com>,
 "Mohamed Abuelfotoh, Hazem" <abuehaze@...zon.com>
Cc: "Christopher S . Hall" <christopher.s.hall@...el.com>,
 Jason Wang <jasowang@...hat.com>, John Stultz <jstultz@...gle.com>,
 "Michael S . Tsirkin" <mst@...hat.com>, netdev@...r.kernel.org,
 Stephen Boyd <sboyd@...nel.org>, Thomas Gleixner <tglx@...utronix.de>,
 Xuan Zhuo <xuanzhuo@...ux.alibaba.com>, Marc Zyngier <maz@...nel.org>,
 Mark Rutland <mark.rutland@....com>,
 Daniel Lezcano <daniel.lezcano@...aro.org>,
 Alessandro Zummo <a.zummo@...ertech.it>,
 Alexandre Belloni <alexandre.belloni@...tlin.com>,
 qemu-devel <qemu-devel@...gnu.org>, Simon Horman <horms@...nel.org>
Subject: Re: [PATCH v5] ptp: Add support for the AMZNC10C 'vmclock' device

On 8/23/24 12:09, David Woodhouse wrote:
> diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
> index 604541dcb320..e98c9767e0ef 100644
> --- a/drivers/ptp/Kconfig
> +++ b/drivers/ptp/Kconfig
> @@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM
>   	  To compile this driver as a module, choose M here: the module
>   	  will be called ptp_kvm.
>   
> +config PTP_1588_CLOCK_VMCLOCK
> +	tristate "Virtual machine PTP clock"
> +	depends on X86_TSC || ARM_ARCH_TIMER
> +	depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128
> +	default y
> +	help
> +	  This driver adds support for using a virtual precision clock
> +	  advertised by the hypervisor. This clock is only useful in virtual
> +	  machines where such a device is present.
> +
> +	  To compile this driver as a module, choose M here: the module
> +	  will be called ptp_vmclock.
> +
>   config PTP_1588_CLOCK_IDT82P33
>   	tristate "IDT 82P33xxx PTP clock"
>   	depends on PTP_1588_CLOCK && I2C
> diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
> index 68bf02078053..01b5cd91eb61 100644
> --- a/drivers/ptp/Makefile
> +++ b/drivers/ptp/Makefile
> @@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
>   obj-$(CONFIG_PTP_1588_CLOCK_INES)	+= ptp_ines.o
>   obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
>   obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
> +obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK)	+= ptp_vmclock.o
>   obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)	+= ptp-qoriq.o
>   ptp-qoriq-y				+= ptp_qoriq.o
>   ptp-qoriq-$(CONFIG_DEBUG_FS)		+= ptp_qoriq_debugfs.o
> diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
> new file mode 100644
> index 000000000000..f772bcb23599
> --- /dev/null
> +++ b/drivers/ptp/ptp_vmclock.c
> @@ -0,0 +1,607 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Virtual PTP 1588 clock for use with LM-safe VMclock device.
> + *
> + * Copyright © 2024 Amazon.com, Inc. or its affiliates.
> + */
> +
> +#include <linux/acpi.h>
> +#include <linux/device.h>
> +#include <linux/err.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/miscdevice.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +
> +#include <uapi/linux/vmclock-abi.h>
> +
> +#include <linux/ptp_clock_kernel.h>
> +
> +#ifdef CONFIG_X86
> +#include <asm/pvclock.h>
> +#include <asm/kvmclock.h>
> +#endif
> +
> +#ifdef CONFIG_KVM_GUEST
> +#define SUPPORT_KVMCLOCK
> +#endif
> +
> +static DEFINE_IDA(vmclock_ida);
> +
> +ACPI_MODULE_NAME("vmclock");
> +
> +struct vmclock_state {
> +	struct resource res;
> +	struct vmclock_abi *clk;
> +	struct miscdevice miscdev;
> +	struct ptp_clock_info ptp_clock_info;
> +	struct ptp_clock *ptp_clock;
> +	enum clocksource_ids cs_id, sys_cs_id;
> +	int index;
> +	char *name;
> +};
> +
> +#define VMCLOCK_MAX_WAIT ms_to_ktime(100)
> +
> +/* Require at least the flags field to be present. All else can be optional. */
> +#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad)
> +
> +#define VMCLOCK_FIELD_PRESENT(_c, _f)			  \
> +	le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) +	\
> +				    sizeof((_c)->_f))
> +
> +/*
> + * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64
> + * and add the fractional second part of the reference time.
> + *
> + * The result is a 128-bit value, the top 64 bits of which are seconds, and
> + * the low 64 bits are (seconds >> 64).
> + */
> +static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta,
> +					uint64_t period, uint8_t shift,
> +					uint64_t frac_sec)

I'm sorry for the late feedback.

uint64_t should be u64 (in a lot of places), mutatis mutandis uint32_t, 
etc...

> +{
> +	unsigned __int128 res = (unsigned __int128)delta * period;
> +
> +	res >>= shift;
> +	res += frac_sec;
> +	*res_hi = res >> 64;
> +	return (uint64_t)res;
> +}
> +
> +static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec)
> +{
> +	if (likely(clk->time_type == VMCLOCK_TIME_UTC))
> +		return true;
> +
> +	if (clk->time_type == VMCLOCK_TIME_TAI &&
> +	    (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) {
> +		if (sec)
> +			*sec += (int16_t)le16_to_cpu(clk->tai_offset_sec);
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static int vmclock_get_crosststamp(struct vmclock_state *st,
> +				   struct ptp_system_timestamp *sts,
> +				   struct system_counterval_t *system_counter,
> +				   struct timespec64 *tspec)
> +{
> +	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
> +	struct system_time_snapshot systime_snapshot;
> +	uint64_t cycle, delta, seq, frac_sec;
> +
> +#ifdef CONFIG_X86
> +	/*
> +	 * We'd expect the hypervisor to know this and to report the clock
> +	 * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
> +	 */
> +	if (check_tsc_unstable())
> +		return -EINVAL;
> +#endif
> +
> +	while (1) {
> +		seq = le32_to_cpu(st->clk->seq_count) & ~1ULL;
> +
> +		/*
> +		 * This pairs with a write barrier in the hypervisor
> +		 * which populates this structure.
> +		 */
> +		virt_rmb();
> +
> +		if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE)
> +			return -EINVAL;
> +
> +		/*
> +		 * When invoked for gettimex64(), fill in the pre/post system
> +		 * times. The simple case is when system time is based on the
> +		 * same counter as st->cs_id, in which case all three times
> +		 * will be derived from the *same* counter value.
> +		 *
> +		 * If the system isn't using the same counter, then the value
> +		 * from ktime_get_snapshot() will still be used as pre_ts, and
> +		 * ptp_read_system_postts() is called to populate postts after
> +		 * calling get_cycles().
> +		 *
> +		 * The conversion to timespec64 happens further down, outside
> +		 * the seq_count loop.
> +		 */
> +		if (sts) {
> +			ktime_get_snapshot(&systime_snapshot);
> +			if (systime_snapshot.cs_id == st->cs_id) {
> +				cycle = systime_snapshot.cycles;
> +			} else {
> +				cycle = get_cycles();
> +				ptp_read_system_postts(sts);
> +			}
> +		} else {
> +			cycle = get_cycles();
> +		}
> +
> +		delta = cycle - le64_to_cpu(st->clk->counter_value);
> +
> +		frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta,
> +						   le64_to_cpu(st->clk->counter_period_frac_sec),
> +						   st->clk->counter_period_shift,
> +						   le64_to_cpu(st->clk->time_frac_sec));
> +		tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64);
> +		tspec->tv_sec += le64_to_cpu(st->clk->time_sec);
> +
> +		if (!tai_adjust(st->clk, &tspec->tv_sec))
> +			return -EINVAL;
> +
> +		virt_rmb();

Even this one deserves a comment.

> +		if (seq == le32_to_cpu(st->clk->seq_count))
> +			break;
> +
> +		if (ktime_after(ktime_get(), deadline))
> +			return -ETIMEDOUT;
> +	}
> +
> +	if (system_counter) {
> +		system_counter->cycles = cycle;
> +		system_counter->cs_id = st->cs_id;
> +	}
> +
> +	if (sts) {
> +		sts->pre_ts = ktime_to_timespec64(systime_snapshot.real);
> +		if (systime_snapshot.cs_id == st->cs_id)
> +			sts->post_ts = sts->pre_ts;
> +	}
> +
> +	return 0;
> +}
> +
> +#ifdef SUPPORT_KVMCLOCK
> +/*
> + * In the case where the system is using the KVM clock for timekeeping, convert
> + * the TSC value into a KVM clock time in order to return a paired reading that
> + * get_device_system_crosststamp() can cope with.
> + */
> +static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st,
> +					    struct ptp_system_timestamp *sts,
> +					    struct system_counterval_t *system_counter,
> +					    struct timespec64 *tspec)
> +{
> +	struct pvclock_vcpu_time_info *pvti = this_cpu_pvti();
> +	unsigned pvti_ver;

unsigned int

> +	int ret;
> +
> +	preempt_disable_notrace();
> +
> +	do {
> +		pvti_ver = pvclock_read_begin(pvti);
> +
> +		ret = vmclock_get_crosststamp(st, sts, system_counter, tspec);
> +		if (ret)
> +			break;
> +
> +		system_counter->cycles = __pvclock_read_cycles(pvti,
> +							       system_counter->cycles);
> +		system_counter->cs_id = CSID_X86_KVM_CLK;
> +
> +		/*
> +		 * This retry should never really happen; if the TSC is
> +		 * stable and reliable enough across vCPUS that it is sane
> +		 * for the hypervisor to expose a VMCLOCK device which uses
> +		 * it as the reference counter, then the KVM clock sohuld be
> +		 * in 'master clock mode' and basically never changed. But
> +		 * the KVM clock is a fickle and often broken thing, so do
> +		 * it "properly" just in case.
> +		 */
> +	} while (pvclock_read_retry(pvti, pvti_ver));
> +
> +	preempt_enable_notrace();
> +
> +	return ret;
> +}
> +#endif
> +
> +static int ptp_vmclock_get_time_fn(ktime_t *device_time,
> +				   struct system_counterval_t *system_counter,
> +				   void *ctx)
> +{
> +	struct vmclock_state *st = ctx;
> +	struct timespec64 tspec;
> +	int ret;
> +
> +#ifdef SUPPORT_KVMCLOCK
> +	if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK)
> +		ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter,
> +						       &tspec);
> +	else
> +#endif
> +		ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec);
> +
> +	if (!ret)
> +		*device_time = timespec64_to_ktime(tspec);
> +
> +	return ret;
> +}
> +
> +static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp,
> +				      struct system_device_crosststamp *xtstamp)
> +{
> +	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
> +						ptp_clock_info);
> +	int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st,
> +						NULL, xtstamp);
> +#ifdef SUPPORT_KVMCLOCK
> +	/*
> +	 * On x86, the KVM clock may be used for the system time. We can
> +	 * actually convert a TSC reading to that, and return a paired
> +	 * timestamp that get_device_system_crosststamp() *can* handle.
> +	 */
> +	if (ret == -ENODEV) {
> +		struct system_time_snapshot systime_snapshot;

Please insert an empty line after the variable declarations.

> +		ktime_get_snapshot(&systime_snapshot);
> +
> +		if (systime_snapshot.cs_id == CSID_X86_TSC ||
> +		    systime_snapshot.cs_id == CSID_X86_KVM_CLK) {
> +			WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id);
> +			ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn,
> +							    st, NULL, xtstamp);
> +		}
> +	}
> +#endif
> +	return ret;
> +}
> +
> +/*
> + * PTP clock operations
> + */
> +
> +static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int ptp_vmclock_settime(struct ptp_clock_info *ptp,
> +			   const struct timespec64 *ts)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts,
> +				struct ptp_system_timestamp *sts)
> +{
> +	struct vmclock_state *st = container_of(ptp, struct vmclock_state,
> +						ptp_clock_info);
> +
> +	return vmclock_get_crosststamp(st, sts, NULL, ts);
> +}
> +
> +static int ptp_vmclock_enable(struct ptp_clock_info *ptp,
> +			  struct ptp_clock_request *rq, int on)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static const struct ptp_clock_info ptp_vmclock_info = {
> +	.owner		= THIS_MODULE,
> +	.max_adj	= 0,
> +	.n_ext_ts	= 0,
> +	.n_pins		= 0,
> +	.pps		= 0,
> +	.adjfine	= ptp_vmclock_adjfine,
> +	.adjtime	= ptp_vmclock_adjtime,
> +	.gettimex64	= ptp_vmclock_gettimex,
> +	.settime64	= ptp_vmclock_settime,
> +	.enable		= ptp_vmclock_enable,
> +	.getcrosststamp = ptp_vmclock_getcrosststamp,
> +};
> +
> +static struct ptp_clock *vmclock_ptp_register(struct device *dev,
> +					      struct vmclock_state *st)
> +{
> +	enum clocksource_ids cs_id;
> +
> +	if (IS_ENABLED(CONFIG_ARM64) &&
> +	    st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) {
> +		/* Can we check it's the virtual counter? */
> +		cs_id = CSID_ARM_ARCH_COUNTER;
> +	} else if (IS_ENABLED(CONFIG_X86) &&
> +		   st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) {
> +		cs_id = CSID_X86_TSC;
> +	} else {
> +		return NULL;
> +	}
> +
> +	/* Only UTC, or TAI with offset */
> +	if (!tai_adjust(st->clk, NULL)) {
> +		dev_info(dev, "vmclock does not provide unambiguous UTC\n");
> +		return NULL;
> +	}
> +
> +	st->sys_cs_id = st->cs_id = cs_id;

Please avoid multiple assignments in the same statement.

> +	st->ptp_clock_info = ptp_vmclock_info;
> +	strscpy(st->ptp_clock_info.name, st->name);
> +
> +	return ptp_clock_register(&st->ptp_clock_info, dev);
> +}
> +
> +static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
> +{
> +	struct vmclock_state *st = container_of(fp->private_data,
> +						struct vmclock_state, miscdev);
> +
> +	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
> +		return -EROFS;
> +
> +	if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff)
> +		return -EINVAL;
> +
> +        if (io_remap_pfn_range(vma, vma->vm_start,
> +			       st->res.start >> PAGE_SHIFT, PAGE_SIZE,
> +                               vma->vm_page_prot))
> +                return -EAGAIN;
> +
> +        return 0;

This chunk looks whitespace-damaged, use tab for indentation.

> +}
> +
> +static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	struct vmclock_state *st = container_of(fp->private_data,
> +						struct vmclock_state, miscdev);
> +	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
> +	size_t max_count;
> +	uint32_t seq;
> +
> +	if (*ppos >= PAGE_SIZE)
> +		return 0;
> +
> +	max_count = PAGE_SIZE - *ppos;
> +	if (count > max_count)
> +		count = max_count;
> +
> +	while (1) {
> +		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
> +		virt_rmb();
> +
> +		if (copy_to_user(buf, ((char *)st->clk) + *ppos, count))
> +			return -EFAULT;
> +
> +		virt_rmb();
> +		if (seq == le32_to_cpu(st->clk->seq_count))
> +			break;
> +
> +		if (ktime_after(ktime_get(), deadline))
> +			return -ETIMEDOUT;
> +	}
> +
> +	*ppos += count;
> +	return count;
> +}
> +
> +static const struct file_operations vmclock_miscdev_fops = {
> +        .mmap = vmclock_miscdev_mmap,
> +        .read = vmclock_miscdev_read,
> +};
> +
> +/* module operations */
> +
> +static void vmclock_remove(struct platform_device *pdev)
> +{
> +	struct device *dev = &pdev->dev;
> +	struct vmclock_state *st = dev_get_drvdata(dev);
> +
> +	if (st->ptp_clock)
> +		ptp_clock_unregister(st->ptp_clock);
> +
> +	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
> +		misc_deregister(&st->miscdev);
> +}
> +
> +static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
> +{
> +	struct vmclock_state *st = data;
> +	struct resource_win win;
> +	struct resource *res = &(win.res);

Unnecessary parentheses

There are several checkpatch offenders, please double check your next 
version before the submission.

Thanks!

Paolo