From 8a60136b1b805746451b5ff4d82eaa251bfd8c6c Mon Sep 17 00:00:00 2001
From: Shanker Donthineni
Date: Mon, 7 Nov 2016 23:08:28 -0600
Subject: [PATCH 2/2] irqchip/gicv3-its: Test code for measuring Read-allocate hints performance

Apply this patch on top of the v4.9-rc4 kernel and perform the two steps
below to measure the performance improvement with Read-allocate hints:

  mount -t debugfs none /sys/kernel/debug
  echo 10 > /sys/kernel/debug/lpitest

The test displays the CPU cycles spent delivering LPI events.

Example:
  [   93.139710] CPU[1] niter=10 cycles=0xdd8c avg=0x1627 min=0x13a3

Signed-off-by: Shanker Donthineni
---
 arch/arm64/mm/cache.S              |  55 +++++++++++++
 drivers/irqchip/irq-gic-v3-its.c   | 160 +++++++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v3.c       |   7 ++
 include/linux/irqchip/arm-gic-v3.h |  23 ++++++
 4 files changed, 245 insertions(+)

diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a90..0a03420 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -198,3 +198,58 @@ ENTRY(__dma_unmap_area)
 	b.ne	__dma_inv_area
 	ret
 ENDPIPROC(__dma_unmap_area)
+
+/*
+ *	flush_dcache_all(), Flush the whole D-cache.
+ */
+ENTRY(flush_dcache_all)
+	mov	x12, lr				// keep the return address; the final ret uses x12
+	dmb	sy				// ensure ordering with previous memory accesses
+	mrs	x0, clidr_el1			// read clidr
+	and	x3, x0, #0x7000000		// extract loc from clidr (bits 26:24)
+	lsr	x3, x3, #23			// x3 = loc << 1, same scale as x10 (which steps by 2)
+	cbz	x3, finished			// if loc is 0, then no need to clean
+	mov	x10, #0				// start clean at cache level 0
+loop1:
+	add	x2, x10, x10, lsr #1		// work out 3x current cache level
+	lsr	x1, x0, x2			// extract cache type bits from clidr
+	and	x1, x1, #7			// mask off the bits for current cache only
+	cmp	x1, #2				// see what cache we have at this level
+	b.lt	skip				// skip if no cache, or just i-cache
+	mrs	x9, daif			// save daif; restored after ccsidr is read
+	disable_irq				// (macro) keep the csselr write/ccsidr read pair together
+	msr	csselr_el1, x10			// select current cache level in csselr
+	isb					// isb to sync the new csselr & ccsidr
+	mrs	x1, ccsidr_el1			// read the new ccsidr
+	msr	daif, x9			// restore the daif value saved above
+	and	x2, x1, #7			// extract the length of the cache lines
+	add	x2, x2, #4			// add 4 (line length offset)
+	mov	x4, #0x3ff			// 10-bit mask for the field at ccsidr[12:3]
+	and	x4, x4, x1, lsr #3		// find maximum number on the way size
+	clz	w5, w4				// find bit position of way size increment
+	mov	x7, #0x7fff			// 15-bit mask for the field at ccsidr[27:13]
+	and	x7, x7, x1, lsr #13		// extract max number of the index size
+loop2:
+	mov	x9, x4				// create working copy of max way size
+loop3:
+	lsl	x6, x9, x5			// shift the way number into position
+	orr	x11, x10, x6			// factor way and cache number into x11
+	lsl	x6, x7, x2			// shift the index number into position
+	orr	x11, x11, x6			// factor index number into x11
+	dc	cisw, x11			// clean & invalidate by set/way
+	subs	x9, x9, #1			// decrement the way
+	b.ge	loop3				// next way until x9 goes negative
+	subs	x7, x7, #1			// decrement the index
+	b.ge	loop2				// next index until x7 goes negative
+skip:
+	add	x10, x10, #2			// increment cache number
+	cmp	x3, x10				// compare against loc << 1
+	b.gt	loop1				// more levels below loc remain
+finished:
+	mov	x10, #0				// switch back to cache level 0
+	msr	csselr_el1, x10			// select current cache level in csselr
+	dsb	sy
+	isb
+	mov	x0, #0
+	ret	x12				// return through the address saved on entry
+ENDPROC(flush_dcache_all)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 227a1eb..7aec44f 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1882,3 +1882,163 @@ int __init
its_init(struct fwnode_handle *handle, struct rdists *rdists,
 
 	return 0;
 }
+#include <linux/debugfs.h>
+
+struct lpitest_cntx lpitest1 = {
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(lpitest1.wq),
+};
+struct lpitest_cntx *lpitest = &lpitest1;
+
+static struct its_device *lpi_its_dev;	/* set by its_lpitest_init() */
+
+/* Fill @cmd with ITS command 0x03 for desc->its_inv_cmd's device and event. */
+static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd,
+						struct its_cmd_desc *desc)
+{
+	struct its_collection *col;
+
+	col = dev_event_to_col(desc->its_inv_cmd.dev,
+			       desc->its_inv_cmd.event_id);
+
+	its_encode_cmd(cmd, 0x03);
+	its_encode_devid(cmd, desc->its_inv_cmd.dev->device_id);
+	its_encode_event_id(cmd, desc->its_inv_cmd.event_id);
+
+	its_fixup_cmd(cmd);
+
+	return col;
+}
+
+/* Issue the command built above for @event_id of @dev on dev->its. */
+static void its_send_int(struct its_device *dev, u32 event_id)
+{
+	struct its_cmd_desc desc;
+
+	desc.its_inv_cmd.dev = dev;
+	desc.its_inv_cmd.event_id = event_id;
+
+	its_send_single_command(dev->its, its_build_int_cmd, &desc);
+}
+
+/* Parse an iteration count and time that many LPI deliveries to this CPU. */
+static ssize_t lpitest_write(struct file *file, const char __user *buffer,
+			     size_t count, loff_t *pos)
+{
+	unsigned long val, lcnt, spin;
+	u64 cycles, dcycles, mcycles = ~0;
+	int cpu, ret;
+
+	ret = kstrtoul_from_user(buffer, count, 10, &val);
+	if (ret || !val)	/* 0 iterations must not look like a 0-byte write */
+		return ret ? ret : -EINVAL;
+
+	preempt_disable();	/* stay on this CPU; nothing below may sleep */
+	flush_dcache_all();
+
+	cpu = smp_processor_id();
+	*pos += count;
+	lpitest->irqnr = lpi_its_dev->event_map.lpi_base + cpu;
+
+	lpitest->total_cycles = 0;
+	for (lcnt = 0; lcnt < val; lcnt++) {
+		cycles = pmu_read_cycles();
+		WRITE_ONCE(lpitest->done, 1);
+		its_send_int(lpi_its_dev, cpu);
+		for (spin = 10000000; spin && READ_ONCE(lpitest->done); spin--)
+			cpu_relax();	/* bounded poll; the IRQ handler clears ->done */
+		if (!spin)
+			break;
+		dcycles = lpitest->end_cycles - cycles;
+		lpitest->total_cycles += dcycles;
+		if (mcycles > dcycles)
+			mcycles = dcycles;
+	}
+	preempt_enable();
+	if (lcnt < val)		/* an LPI was never observed */
+		return -ETIMEDOUT;
+
+	pr_info("CPU[%d] niter=%lu cycles=0x%lx avg=0x%lx min=0x%lx\n", cpu,
+		lcnt, (unsigned long)lpitest->total_cycles,
+		(unsigned long)lpitest->total_cycles/lcnt,
+		(unsigned long)mcycles);
+
+	return count;
+}
+
+static const struct file_operations lpitest_fops = {
+	.owner = THIS_MODULE,
+	.write = lpitest_write,	/* write-only: no seq_file state to set up or leak */
+};
+
+/* Run on every CPU: program the PMU registers used by pmu_read_cycles(). */
+static void pmu_enable_cycle_counter(void *discard)
+{
+	u64 tmp;
+
+	asm volatile("mrs %0, pmcr_el0\n"
+		     "orr %0, %0, #(1 << 0)\n"
+		     "orr %0, %0, #(1 << 2)\n"
+		     "bic %0, %0, #(1 << 3)\n"
+		     "orr %0, %0, #(1 << 6)\n"
+		     "msr pmcr_el0, %0\n"
+		     "mov %0, #0b11111\n"
+		     "msr pmselr_el0, %0\n"
+		     "isb \n"
+		     "mrs %0, pmxevtyper_el0\n"
+		     "orr %0, %0, #(1 << 27)\n"
+		     "bic %0, %0, #(3 << 30)\n"
+		     "bic %0, %0, #(3 << 28)\n"
+		     "msr pmxevtyper_el0, %0\n"
+		     "mrs %0, pmcntenset_el0\n"
+		     "orr %0, %0, #(1 << 31)\n"
+		     "msr pmcntenset_el0, %0\n"
+		     : "=r" (tmp));
+}
+
+/* Create the test ITS device, enable its LPIs, then expose the debugfs file. */
+static int __init its_lpitest_init(void)
+{
+	struct its_device *its_dev;
+	struct its_node *its;
+	struct dentry *dentry;
+	irq_hw_number_t hwirq;
+	int i, nvec = 64;
+	u8 *cfg;
+
+	if (list_empty(&its_nodes))
+		return 0;
+	its = list_first_entry(&its_nodes, struct its_node, entry);
+
+	its_dev = its_create_device(its, 0xFFFF, nvec);
+	if (!its_dev) {
+		pr_err("failed to create its device for lpitest\n");
+		return -ENOMEM;
+	}
+
+	lpi_its_dev = its_dev;
+	hwirq = its_dev->event_map.lpi_base;
+	cfg = page_address(gic_rdists->prop_page) + hwirq - 8192;
+
+	for (i = 0; i < nvec; i++) {
+		lpi_its_dev->event_map.col_map[i] = i;
+		its_send_mapvi(its_dev, hwirq + i, i);
+		*cfg |= LPI_PROP_ENABLED;
+		dsb(ishst);
+		its_send_inv(its_dev, i);
+		cfg++;
+	}
+
+	on_each_cpu(pmu_enable_cycle_counter, NULL, 1);
+
+	/* Created last so a write can never run before lpi_its_dev is set. */
+	dentry = debugfs_create_file("lpitest", 0200, NULL, NULL, &lpitest_fops);
+	if (!dentry) {
+		pr_err("failed to create debugfs for its-lpitest\n");
+		return -ENOMEM;
+	}
+
+	pr_info("lpitest successfully initialized lpi_base=%u\n", (u32)hwirq);
+
+	return 0;
+}
+late_initcall(its_lpitest_init);
diff --git a/drivers/irqchip/irq-gic-v3.c
b/drivers/irqchip/irq-gic-v3.c
index 19d642e..ab44a0f 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -355,6 +355,13 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
 			if (static_key_true(&supports_deactivate))
 				gic_write_eoir(irqnr);
 
+			if (irqnr == lpitest->irqnr) {
+				lpitest->end_cycles = pmu_read_cycles();
+				lpitest->done = 0;
+				wake_up_interruptible(&lpitest->wq);
+				continue;
+			}
+
 			err = handle_domain_irq(gic_data.domain, irqnr, regs);
 			if (err) {
 				WARN_ONCE(true, "Unexpected interrupt received!\n");
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index b7e3431..986b7f4 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -450,6 +450,29 @@ static inline bool gic_enable_sre(void)
 	return !!(val & ICC_SRE_EL1_SRE);
 }
 
+#include <linux/wait.h>	/* wait_queue_head_t, used by struct lpitest_cntx */
+
+extern void flush_dcache_all(void);
+
+static __always_inline u64 pmu_read_cycles(void)	/* returns pmccntr_el0 */
+{
+	u64 cycles;
+
+	asm volatile("mrs %[reg], pmccntr_el0\n"
+		     "isb \n\t": [reg] "=r" (cycles));
+	return cycles;
+}
+
+struct lpitest_cntx {		/* state shared by the LPI test and gic_handle_irq() */
+	u64 total_cycles;	/* sum of the per-iteration cycle deltas */
+	u64 end_cycles;		/* counter value sampled in gic_handle_irq() */
+	u32 irqnr;		/* interrupt number gic_handle_irq() matches on */
+	u32 done;		/* set before the INT is sent, cleared by the handler */
+	wait_queue_head_t wq;	/* woken by the handler after clearing done */
+};
+
+extern struct lpitest_cntx *lpitest;
+
 #endif
 
 #endif
-- 
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.