[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1428d9ae-0427-4f3d-b3fa-41ee030b805d@amd.com>
Date: Wed, 29 Oct 2025 16:44:26 +0000
From: Alejandro Lucero Palau <alucerop@....com>
To: Robert Richter <rrichter@....com>,
Alison Schofield <alison.schofield@...el.com>,
Vishal Verma <vishal.l.verma@...el.com>, Ira Weiny <ira.weiny@...el.com>,
Dan Williams <dan.j.williams@...el.com>,
Jonathan Cameron <Jonathan.Cameron@...wei.com>,
Dave Jiang <dave.jiang@...el.com>, Davidlohr Bueso <dave@...olabs.net>
Cc: linux-cxl@...r.kernel.org, linux-kernel@...r.kernel.org,
Gregory Price <gourry@...rry.net>, Terry Bowman <terry.bowman@....com>,
Joshua Hahn <joshua.hahnjy@...il.com>
Subject: Re: [PATCH] cxl/region: Support multi-level interleaving with smaller
granularities for lower levels
On 10/28/25 09:47, Robert Richter wrote:
> The CXL specification supports multi-level interleaving "as long as
> all the levels use different, but consecutive, HPA bits to select the
> target and no Interleave Set has more than 8 devices" (from 3.2).
>
> Currently the kernel expects that a decoder's "interleave granularity
> is a multiple of @parent_port granularity". That is, the granularity
> of a lower level is bigger than those of the parent and uses the outer
> HPA bits as selector. It works e.g. for the following 8-way config:
>
> * cross-link (cross-hostbridge config in CFMWS):
> * 4-way
> * 256 granularity
> * Selector: HPA[8:9]
> * sub-link (CXL Host bridge config of the HDM):
> * 2-way
> * 1024 granularity
> * Selector: HPA[10]
>
> Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way
> config could look like this:
>
> * cross-link (cross-hostbridge config in CFMWS):
> * 4-way
> * 512 granularity
> * Selector: HPA[9:10]
> * sub-link (CXL Host bridge config of the HDM):
> * 2-way
> * 256 granularity
> * Selector: HPA[8]
>
> The enumeration of decoders for this configuration fails then with
> following error:
>
> cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200]
> cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff
> cxl_port endpoint12: failed to attach decoder12.0 to region0: -6
>
> Note that this happens only if firmware is setting up the decoders
> (CXL_REGION_F_AUTO). For userspace region assembly the granularities
> are chosen to increase from root down to the lower levels. That is,
> outer HPA bits are always used for lower interleaving levels.
>
> Rework the implementation to also support multi-level interleaving
> with smaller granularities for lower levels. Determine the interleave
> set of autodetected decoders. Check that it is a subset of the root
> interleave.
>
> The HPA selector bits are extracted for all decoders of the set and
> checked that there is no overlap and bits are consecutive. All
> decoders can be programmed now to use any bit range within the
> region's target selector.
>
> Signed-off-by: Robert Richter <rrichter@....com>
> ---
> drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------
> 1 file changed, 108 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index b06fee1978ba..79d35def7c79 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1323,57 +1323,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
> return 0;
> }
>
> +static inline u64 get_selector(u64 ways, u64 gran)
> +{
> + if (!is_power_of_2(ways))
> + ways /= 3;
> +
> + if (!is_power_of_2(ways) || !is_power_of_2(gran))
> + return 0;
> +
> + return (ways - 1) * gran;
> +}
> +
> static int cxl_port_setup_targets(struct cxl_port *port,
> struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled)
> {
> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> - int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
> struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
> struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
> struct cxl_region_params *p = &cxlr->params;
> struct cxl_decoder *cxld = cxl_rr->decoder;
> - struct cxl_switch_decoder *cxlsd;
> + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev);
> struct cxl_port *iter = port;
> - u16 eig, peig;
> - u8 eiw, peiw;
> + int ig, iw = cxl_rr->nr_targets, rc, inc = 0, pos = cxled->pos;
> + int distance, parent_distance;
> + u64 selector, cxlr_sel;
> + u16 eig;
> + u8 eiw;
>
Hi Robert,
The number of local variables suggest to me maybe it is worth to
refactor this code for simplicity, factoring out the selector and
distance calculation. You added a new helper above which should include
most of this, IMO.
Alejandro
> /*
> * While root level decoders support x3, x6, x12, switch level
> * decoders only support powers of 2 up to x16.
> */
> - if (!is_power_of_2(cxl_rr->nr_targets)) {
> + if (!is_power_of_2(iw)) {
> dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev),
> - cxl_rr->nr_targets);
> + dev_name(port->uport_dev), dev_name(&port->dev), iw);
> return -EINVAL;
> }
>
> - cxlsd = to_cxl_switch_decoder(&cxld->dev);
> - if (cxl_rr->nr_targets_set) {
> - int i, distance = 1;
> - struct cxl_region_ref *cxl_rr_iter;
> + if (iw > 8 || iw > cxlsd->nr_targets) {
> + dev_dbg(&cxlr->dev,
> + "%s:%s:%s: ways: %d overflows targets: %d\n",
> + dev_name(port->uport_dev), dev_name(&port->dev),
> + dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> + return -ENXIO;
> + }
>
> - /*
> - * The "distance" between peer downstream ports represents which
> - * endpoint positions in the region interleave a given port can
> - * host.
> - *
> - * For example, at the root of a hierarchy the distance is
> - * always 1 as every index targets a different host-bridge. At
> - * each subsequent switch level those ports map every Nth region
> - * position where N is the width of the switch == distance.
> - */
> - do {
> - cxl_rr_iter = cxl_rr_load(iter, cxlr);
> - distance *= cxl_rr_iter->nr_targets;
> - iter = to_cxl_port(iter->dev.parent);
> - } while (!is_cxl_root(iter));
> - distance *= cxlrd->cxlsd.cxld.interleave_ways;
> + /*
> + * Calculate the effective granularity and ways to determine
> + * HPA bits used as target selectors of the interleave set.
> + * Use this to check if the root decoder and all subsequent
> + * HDM decoders only use bits from that range as selectors.
> + *
> + * The "distance" between peer downstream ports represents which
> + * endpoint positions in the region interleave a given port can
> + * host.
> + *
> + * For example, at the root of a hierarchy the distance is
> + * always 1 as every index targets a different host-bridge. At
> + * each subsequent switch level those ports map every Nth region
> + * position where N is the width of the switch == distance.
> + */
> +
> + /* Start with the root decoders selector and distance. */
> + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways,
> + cxlrd->cxlsd.cxld.interleave_granularity);
> + distance = cxlrd->cxlsd.cxld.interleave_ways;
> + if (!is_power_of_2(distance))
> + distance /= 3;
> +
> + for (iter = parent_port; !is_cxl_root(iter);
> + iter = to_cxl_port(iter->dev.parent)) {
> + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr);
> + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder;
> + u64 cxld_sel;
> +
> + if (cxld_iter->interleave_ways == 1)
> + continue;
> +
> + cxld_sel = get_selector(cxld_iter->interleave_ways,
> + cxld_iter->interleave_granularity);
> +
> + if (cxld_sel & selector) {
> + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n",
> + dev_name(iter->uport_dev),
> + dev_name(&iter->dev), cxld_sel, selector);
> + return -ENXIO;
> + }
>
> - for (i = 0; i < cxl_rr->nr_targets_set; i++)
> + selector |= cxld_sel;
> + distance *= cxl_rr_iter->nr_targets;
> + }
> +
> + parent_distance = distance;
> + distance *= iw;
> +
> + /* The combined selector bits must fit the region selector. */
> + cxlr_sel = get_selector(p->interleave_ways,
> + p->interleave_granularity);
> +
> + if ((cxlr_sel & selector) != selector) {
> + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n",
> + dev_name(iter->uport_dev),
> + dev_name(&iter->dev), cxlr_sel, selector);
> + return -ENXIO;
> + }
> +
> + /* Calculate remaining selector bits available for use. */
> + selector = cxlr_sel & ~selector;
> +
> + if (cxl_rr->nr_targets_set) {
> + for (int i = 0; i < cxl_rr->nr_targets_set; i++)
> if (ep->dport == cxlsd->target[i]) {
> rc = check_last_peer(cxled, ep, cxl_rr,
> distance);
> @@ -1384,87 +1446,40 @@ static int cxl_port_setup_targets(struct cxl_port *port,
> goto add_target;
> }
>
> - if (is_cxl_root(parent_port)) {
> + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
> + ig = cxld->interleave_granularity;
> + else
> /*
> + * Set the interleave granularity with each interleave
> + * level to a multiple of it's parent port interleave
> + * ways. Beginning with the granularity of the root
> + * decoder set to the region granularity (starting
> + * with the inner selector bits of the HPA), the
> + * granularity is increased with each level. Calculate
> + * this using the parent distance and region
> + * granularity.
> + *
> * Root decoder IG is always set to value in CFMWS which
> * may be different than this region's IG. We can use the
> * region's IG here since interleave_granularity_store()
> * does not allow interleaved host-bridges with
> * root IG != region IG.
> */
> - parent_ig = p->interleave_granularity;
> - parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
> - /*
> - * For purposes of address bit routing, use power-of-2 math for
> - * switch ports.
> - */
> - if (!is_power_of_2(parent_iw))
> - parent_iw /= 3;
> - } else {
> - struct cxl_region_ref *parent_rr;
> - struct cxl_decoder *parent_cxld;
> -
> - parent_rr = cxl_rr_load(parent_port, cxlr);
> - parent_cxld = parent_rr->decoder;
> - parent_ig = parent_cxld->interleave_granularity;
> - parent_iw = parent_cxld->interleave_ways;
> - }
> -
> - rc = granularity_to_eig(parent_ig, &peig);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
> - dev_name(parent_port->uport_dev),
> - dev_name(&parent_port->dev), parent_ig);
> - return rc;
> - }
> -
> - rc = ways_to_eiw(parent_iw, &peiw);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
> - dev_name(parent_port->uport_dev),
> - dev_name(&parent_port->dev), parent_iw);
> - return rc;
> - }
> + ig = p->interleave_granularity * parent_distance;
>
> - iw = cxl_rr->nr_targets;
> rc = ways_to_eiw(iw, &eiw);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev), iw);
> - return rc;
> - }
> -
> - /*
> - * Interleave granularity is a multiple of @parent_port granularity.
> - * Multiplier is the parent port interleave ways.
> - */
> - rc = granularity_to_eig(parent_ig * parent_iw, &eig);
> - if (rc) {
> - dev_dbg(&cxlr->dev,
> - "%s: invalid granularity calculation (%d * %d)\n",
> - dev_name(&parent_port->dev), parent_ig, parent_iw);
> - return rc;
> - }
> -
> - rc = eig_to_granularity(eig, &ig);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev),
> - 256 << eig);
> - return rc;
> - }
> + if (!rc)
> + rc = granularity_to_eig(ig, &eig);
>
> - if (iw > 8 || iw > cxlsd->nr_targets) {
> - dev_dbg(&cxlr->dev,
> - "%s:%s:%s: ways: %d overflows targets: %d\n",
> + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) {
> + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n",
> dev_name(port->uport_dev), dev_name(&port->dev),
> - dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> + iw, ig, selector);
> return -ENXIO;
> }
>
> if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
> if (cxld->interleave_ways != iw ||
> - (iw > 1 && cxld->interleave_granularity != ig) ||
> !region_res_match_cxl_range(p, &cxld->hpa_range) ||
> ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
> dev_err(&cxlr->dev,
Powered by blists - more mailing lists