[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20120319183800.bec23f19adeabf00c3cbc973@freescale.com>
Date: Mon, 19 Mar 2012 18:38:00 -0500
From: Kim Phillips <kim.phillips@...escale.com>
To: <horia.geanta@...escale.com>
CC: <linux-crypto@...r.kernel.org>, <netdev@...r.kernel.org>,
<herbert@...dor.hengli.com.au>, <davem@...emloft.net>,
<Sandeep.Malik@...escale.com>
Subject: balancing crypto with NAPI flow vs. tasklet (was: Re: Hi,)
On Mon, 12 Mar 2012 14:44:27 +0200
<horia.geanta@...escale.com> wrote:
> This patch replaces the back-half implementation of talitos crypto engine from
> tasklet to NAPI. The decision to do this was based on improved performance
> (around 7%).
> A similiar patch (not posted yet) was tested for caam crypto engine, with
> 10-15% improvement over tasklet.
>
> Since having crypto engines use the net softirq is probably not acceptable,
> I would like to hear your comments on what options do I have to make this
> upstreamable.
> Besides current approach, I am considering the following:
> - defining a new softirq for crypto engines, having a higher priority than the
> NET_RX_SOFTIRQ
but the priority just has to be equal to NET_RX_SOFTIRQ to get the
net--crypto performance balance, not necessarily higher IIRC.
> - using tasklet_hi_schedule instead of tasklet_schedule
this has done close to nothing for performance in my experience -
but there is no other tasklet competing for the slice in my IPSec
fwding tests either.
> Let me know if any of these two fits better or if something else is preferred.
Herbert/Dave,
Is it ok for a crypto driver to depend on NET? If not, how should
the NAPI-style flow be abstracted out of NET?
Thanks,
Kim
> Thank you
>
> From 20f30ef6fdfe641f1c30f94320891715ffee33a2 Mon Sep 17 00:00:00 2001
> From: Sandeep Malik <Sandeep.Malik@...escale.com>
> Date: Sat, 12 Jun 2010 14:08:47 +0800
> Subject: [RFC,PATCH] crypto: talitos - Replace the tasklet implementation with NAPI
>
> This patch updates the current tasklet implement to NAPI so as
> the system is more balanced in the terms that the packet submission
> and the packet forwarding after being processed can be done at
> the same priority.
>
> Signed-off-by: Sandeep Malik <Sandeep.Malik@...escale.com>
> Signed-off-by: Horia Geanta <horia.geanta@...escale.com>
> ---
> drivers/crypto/Kconfig | 2 +-
> drivers/crypto/talitos.c | 145 +++++++++++++++++++++++++++++++++-------------
> drivers/crypto/talitos.h | 4 +-
> 3 files changed, 109 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> index e707979..682096b 100644
> --- a/drivers/crypto/Kconfig
> +++ b/drivers/crypto/Kconfig
> @@ -218,7 +218,7 @@ config CRYPTO_DEV_TALITOS
> select CRYPTO_ALGAPI
> select CRYPTO_AUTHENC
> select HW_RANDOM
> - depends on FSL_SOC
> + depends on FSL_SOC && NET
> help
> Say 'Y' here to use the Freescale Security Engine (SEC)
> to offload cryptographic algorithm computation.
> diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
> index dc641c7..f368579 100644
> --- a/drivers/crypto/talitos.c
> +++ b/drivers/crypto/talitos.c
> @@ -1,7 +1,7 @@
> /*
> * talitos - Freescale Integrated Security Engine (SEC) device driver
> *
> - * Copyright (c) 2008-2011 Freescale Semiconductor, Inc.
> + * Copyright (c) 2008-2012 Freescale Semiconductor, Inc.
> *
> * Scatterlist Crypto API glue code copied from files with the following:
> * Copyright (c) 2006-2007 Herbert Xu <herbert@...dor.apana.org.au>
> @@ -37,6 +37,7 @@
> #include <linux/io.h>
> #include <linux/spinlock.h>
> #include <linux/rtnetlink.h>
> +#include <linux/netdevice.h>
> #include <linux/slab.h>
>
> #include <crypto/algapi.h>
> @@ -121,6 +122,7 @@ struct talitos_channel {
> struct talitos_private {
> struct device *dev;
> struct platform_device *ofdev;
> + struct net_device __percpu *netdev;
> void __iomem *reg;
> int irq[2];
>
> @@ -145,8 +147,8 @@ struct talitos_private {
> /* next channel to be assigned next incoming descriptor */
> atomic_t last_chan ____cacheline_aligned;
>
> - /* request callback tasklet */
> - struct tasklet_struct done_task[2];
> + /* request callback napi */
> + struct napi_struct __percpu *done_task[2];
>
> /* list of registered algorithms */
> struct list_head alg_list;
> @@ -349,17 +351,18 @@ static int talitos_submit(struct device *dev, int ch, struct talitos_desc *desc,
> /*
> * process what was done, notify callback of error if not
> */
> -static void flush_channel(struct device *dev, int ch, int error, int reset_ch)
> +static int flush_channel(struct device *dev, int ch, int error, int reset_ch,
> + int weight)
> {
> struct talitos_private *priv = dev_get_drvdata(dev);
> struct talitos_request *request, saved_req;
> unsigned long flags;
> - int tail, status;
> + int tail, status, count = 0;
>
> spin_lock_irqsave(&priv->chan[ch].tail_lock, flags);
>
> tail = priv->chan[ch].tail;
> - while (priv->chan[ch].fifo[tail].desc) {
> + while (priv->chan[ch].fifo[tail].desc && (count < weight)) {
> request = &priv->chan[ch].fifo[tail];
>
> /* descriptors with their done bits set don't get the error */
> @@ -396,43 +399,55 @@ static void flush_channel(struct device *dev, int ch, int error, int reset_ch)
> status);
> /* channel may resume processing in single desc error case */
> if (error && !reset_ch && status == error)
> - return;
> + return 0;
> spin_lock_irqsave(&priv->chan[ch].tail_lock, flags);
> tail = priv->chan[ch].tail;
> + count++;
> }
>
> spin_unlock_irqrestore(&priv->chan[ch].tail_lock, flags);
> +
> + return count;
> }
>
> /*
> * process completed requests for channels that have done status
> */
> -#define DEF_TALITOS_DONE(name, ch_done_mask) \
> -static void talitos_done_##name(unsigned long data) \
> +#define DEF_TALITOS_DONE(name, ch_done_mask, num_ch) \
> +static int talitos_done_##name(struct napi_struct *napi, int budget) \
> { \
> - struct device *dev = (struct device *)data; \
> + struct device *dev = &napi->dev->dev; \
> struct talitos_private *priv = dev_get_drvdata(dev); \
> + int budget_per_ch, work_done = 0; \
> \
> + budget_per_ch = budget / num_ch; \
> if (ch_done_mask & 1) \
> - flush_channel(dev, 0, 0, 0); \
> + work_done += flush_channel(dev, 0, 0, 0, budget_per_ch);\
> if (priv->num_channels == 1) \
> goto out; \
> if (ch_done_mask & (1 << 2)) \
> - flush_channel(dev, 1, 0, 0); \
> + work_done += flush_channel(dev, 1, 0, 0, budget_per_ch);\
> if (ch_done_mask & (1 << 4)) \
> - flush_channel(dev, 2, 0, 0); \
> + work_done += flush_channel(dev, 2, 0, 0, budget_per_ch);\
> if (ch_done_mask & (1 << 6)) \
> - flush_channel(dev, 3, 0, 0); \
> + work_done += flush_channel(dev, 3, 0, 0, budget_per_ch);\
> \
> out: \
> - /* At this point, all completed channels have been processed */ \
> - /* Unmask done interrupts for channels completed later on. */ \
> - setbits32(priv->reg + TALITOS_IMR, ch_done_mask); \
> - setbits32(priv->reg + TALITOS_IMR_LO, TALITOS_IMR_LO_INIT); \
> + if (work_done < budget) { \
> + napi_complete(napi); \
> + /* At this point, all completed channels have been */ \
> + /* processed. Unmask done interrupts for channels */ \
> + /* completed later on. */ \
> + setbits32(priv->reg + TALITOS_IMR, ch_done_mask); \
> + setbits32(priv->reg + TALITOS_IMR_LO, \
> + TALITOS_IMR_LO_INIT); \
> + } \
> + \
> + return work_done; \
> }
> -DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE)
> -DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE)
> -DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE)
> +DEF_TALITOS_DONE(4ch, TALITOS_ISR_4CHDONE, 4)
> +DEF_TALITOS_DONE(ch0_2, TALITOS_ISR_CH_0_2_DONE, 2)
> +DEF_TALITOS_DONE(ch1_3, TALITOS_ISR_CH_1_3_DONE, 2)
>
> /*
> * locate current (offending) descriptor
> @@ -582,7 +597,7 @@ static void talitos_error(struct device *dev, u32 isr, u32 isr_lo)
> if (v_lo & TALITOS_CCPSR_LO_SRL)
> dev_err(dev, "scatter return/length error\n");
>
> - flush_channel(dev, ch, error, reset_ch);
> + flush_channel(dev, ch, error, reset_ch, priv->fifo_len);
>
> if (reset_ch) {
> reset_channel(dev, ch);
> @@ -606,14 +621,14 @@ static void talitos_error(struct device *dev, u32 isr, u32 isr_lo)
>
> /* purge request queues */
> for (ch = 0; ch < priv->num_channels; ch++)
> - flush_channel(dev, ch, -EIO, 1);
> + flush_channel(dev, ch, -EIO, 1, priv->fifo_len);
>
> /* reset and reinitialize the device */
> init_device(dev);
> }
> }
>
> -#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, tlet) \
> +#define DEF_TALITOS_INTERRUPT(name, ch_done_mask, ch_err_mask, sirq) \
> static irqreturn_t talitos_interrupt_##name(int irq, void *data) \
> { \
> struct device *dev = data; \
> @@ -633,7 +648,8 @@ static irqreturn_t talitos_interrupt_##name(int irq, void *data) \
> /* mask further done interrupts. */ \
> clrbits32(priv->reg + TALITOS_IMR, ch_done_mask); \
> /* done_task will unmask done interrupts at exit */ \
> - tasklet_schedule(&priv->done_task[tlet]); \
> + napi_schedule(per_cpu_ptr(priv->done_task[sirq], \
> + smp_processor_id())); \
> } \
> \
> return (isr & (ch_done_mask | ch_err_mask) || isr_lo) ? IRQ_HANDLED : \
> @@ -2555,7 +2571,7 @@ static int talitos_remove(struct platform_device *ofdev)
> struct device *dev = &ofdev->dev;
> struct talitos_private *priv = dev_get_drvdata(dev);
> struct talitos_crypto_alg *t_alg, *n;
> - int i;
> + int i, j;
>
> list_for_each_entry_safe(t_alg, n, &priv->alg_list, entry) {
> switch (t_alg->algt.type) {
> @@ -2574,25 +2590,32 @@ static int talitos_remove(struct platform_device *ofdev)
> if (hw_supports(dev, DESC_HDR_SEL0_RNG))
> talitos_unregister_rng(dev);
>
> - for (i = 0; i < priv->num_channels; i++)
> - kfree(priv->chan[i].fifo);
> -
> - kfree(priv->chan);
> -
> for (i = 0; i < 2; i++)
> if (priv->irq[i]) {
> free_irq(priv->irq[i], dev);
> irq_dispose_mapping(priv->irq[i]);
> +
> + for_each_possible_cpu(j) {
> + napi_disable(per_cpu_ptr(priv->done_task[i],
> + j));
> + netif_napi_del(per_cpu_ptr(priv->done_task[i],
> + j));
> + }
> +
> + free_percpu(priv->done_task[i]);
> }
>
> - tasklet_kill(&priv->done_task[0]);
> - if (priv->irq[1])
> - tasklet_kill(&priv->done_task[1]);
> + for (i = 0; i < priv->num_channels; i++)
> + kfree(priv->chan[i].fifo);
> +
> + kfree(priv->chan);
>
> iounmap(priv->reg);
>
> dev_set_drvdata(dev, NULL);
>
> + free_percpu(priv->netdev);
> +
> kfree(priv);
>
> return 0;
> @@ -2718,19 +2741,61 @@ static int talitos_probe(struct platform_device *ofdev)
> dev_set_drvdata(dev, priv);
>
> priv->ofdev = ofdev;
> + priv->dev = dev;
> +
> + priv->netdev = alloc_percpu(struct net_device);
> + if (!priv->netdev) {
> + dev_err(dev, "failed to allocate netdevice\n");
> + err = -ENOMEM;
> + goto err_out;
> + }
> +
> + for_each_possible_cpu(i) {
> + err = init_dummy_netdev(per_cpu_ptr(priv->netdev, i));
> + if (err) {
> + dev_err(dev, "failed to initialize dummy netdevice\n");
> + goto err_out;
> + }
> + (per_cpu_ptr(priv->netdev, i))->dev = *dev;
> + }
>
> err = talitos_probe_irq(ofdev);
> if (err)
> goto err_out;
>
> + priv->done_task[0] = alloc_percpu(struct napi_struct);
> + if (!priv->done_task[0]) {
> + dev_err(dev, "failed to allocate napi for 1st irq\n");
> + err = -ENOMEM;
> + goto err_out;
> + }
> +
> if (!priv->irq[1]) {
> - tasklet_init(&priv->done_task[0], talitos_done_4ch,
> - (unsigned long)dev);
> + for_each_possible_cpu(i) {
> + netif_napi_add(per_cpu_ptr(priv->netdev, i),
> + per_cpu_ptr(priv->done_task[0], i),
> + talitos_done_4ch, TALITOS_NAPI_WEIGHT);
> + napi_enable(per_cpu_ptr(priv->done_task[0], i));
> + }
> } else {
> - tasklet_init(&priv->done_task[0], talitos_done_ch0_2,
> - (unsigned long)dev);
> - tasklet_init(&priv->done_task[1], talitos_done_ch1_3,
> - (unsigned long)dev);
> + priv->done_task[1] = alloc_percpu(struct napi_struct);
> + if (!priv->done_task[1]) {
> + dev_err(dev, "failed to allocate napi for 2nd irq\n");
> + err = -ENOMEM;
> + goto err_out;
> + }
> +
> + for_each_possible_cpu(i) {
> + netif_napi_add(per_cpu_ptr(priv->netdev, i),
> + per_cpu_ptr(priv->done_task[0], i),
> + talitos_done_ch0_2, TALITOS_NAPI_WEIGHT);
> + napi_enable(per_cpu_ptr(priv->done_task[0], i));
> +
> + netif_napi_add(per_cpu_ptr(priv->netdev, i),
> + per_cpu_ptr(priv->done_task[1], i),
> + talitos_done_ch1_3, TALITOS_NAPI_WEIGHT);
> + napi_enable(per_cpu_ptr(priv->done_task[1], i));
> + }
> }
>
> INIT_LIST_HEAD(&priv->alg_list);
> diff --git a/drivers/crypto/talitos.h b/drivers/crypto/talitos.h
> index 3c17395..ba62abc 100644
> --- a/drivers/crypto/talitos.h
> +++ b/drivers/crypto/talitos.h
> @@ -1,7 +1,7 @@
> /*
> * Freescale SEC (talitos) device register and descriptor header defines
> *
> - * Copyright (c) 2006-2011 Freescale Semiconductor, Inc.
> + * Copyright (c) 2006-2012 Freescale Semiconductor, Inc.
> *
> * Redistribution and use in source and binary forms, with or without
> * modification, are permitted provided that the following conditions
> @@ -28,6 +28,8 @@
> *
> */
>
> +#define TALITOS_NAPI_WEIGHT 12
> +
> /*
> * TALITOS_xxx_LO addresses point to the low data bits (32-63) of the register
> */
> --
> 1.7.3.4
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists