[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <1428d9ae-0427-4f3d-b3fa-41ee030b805d@amd.com>
Date: Wed, 29 Oct 2025 16:44:26 +0000
From: Alejandro Lucero Palau <alucerop@....com>
To: Robert Richter <rrichter@....com>,
 Alison Schofield <alison.schofield@...el.com>,
 Vishal Verma <vishal.l.verma@...el.com>, Ira Weiny <ira.weiny@...el.com>,
 Dan Williams <dan.j.williams@...el.com>,
 Jonathan Cameron <Jonathan.Cameron@...wei.com>,
 Dave Jiang <dave.jiang@...el.com>, Davidlohr Bueso <dave@...olabs.net>
Cc: linux-cxl@...r.kernel.org, linux-kernel@...r.kernel.org,
 Gregory Price <gourry@...rry.net>, Terry Bowman <terry.bowman@....com>,
 Joshua Hahn <joshua.hahnjy@...il.com>
Subject: Re: [PATCH] cxl/region: Support multi-level interleaving with smaller
 granularities for lower levels
On 10/28/25 09:47, Robert Richter wrote:
> The CXL specification supports multi-level interleaving "as long as
> all the levels use different, but consecutive, HPA bits to select the
> target and no Interleave Set has more than 8 devices" (from 3.2).
>
> Currently the kernel expects that a decoder's "interleave granularity
> is a multiple of @parent_port granularity". That is, the granularity
> of a lower level is bigger than those of the parent and uses the outer
> HPA bits as selector. It works e.g. for the following 8-way config:
>
>   * cross-link (cross-hostbridge config in CFMWS):
>     * 4-way
>     * 256 granularity
>     * Selector: HPA[8:9]
>   * sub-link (CXL Host bridge config of the HDM):
>     * 2-way
>     * 1024 granularity
>     * Selector: HPA[10]
>
> Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way
> config could look like this:
>
>   * cross-link (cross-hostbridge config in CFMWS):
>     * 4-way
>     * 512 granularity
>     * Selector: HPA[9:10]
>   * sub-link (CXL Host bridge config of the HDM):
>     * 2-way
>     * 256 granularity
>     * Selector: HPA[8]
>
> The enumeration of decoders for this configuration fails then with
> following error:
>
>   cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200]
>   cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff
>   cxl_port endpoint12: failed to attach decoder12.0 to region0: -6
>
> Note that this happens only if firmware is setting up the decoders
> (CXL_REGION_F_AUTO). For userspace region assembly the granularities
> are chosen to increase from root down to the lower levels. That is,
> outer HPA bits are always used for lower interleaving levels.
>
> Rework the implementation to also support multi-level interleaving
> with smaller granularities for lower levels. Determine the interleave
> set of autodetected decoders. Check that it is a subset of the root
> interleave.
>
> The HPA selector bits are extracted for all decoders of the set and
> checked that there is no overlap and bits are consecutive. All
> decoders can be programmed now to use any bit range within the
> region's target selector.
>
> Signed-off-by: Robert Richter <rrichter@....com>
> ---
>   drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------
>   1 file changed, 108 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index b06fee1978ba..79d35def7c79 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1323,57 +1323,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
>   	return 0;
>   }
>   
> +static inline u64 get_selector(u64 ways, u64 gran)
> +{
> +	if (!is_power_of_2(ways))
> +		ways /= 3;
> +
> +	if (!is_power_of_2(ways) || !is_power_of_2(gran))
> +		return 0;
> +
> +	return (ways - 1) * gran;
> +}
> +
>   static int cxl_port_setup_targets(struct cxl_port *port,
>   				  struct cxl_region *cxlr,
>   				  struct cxl_endpoint_decoder *cxled)
>   {
>   	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> -	int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
>   	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
>   	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
>   	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>   	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
>   	struct cxl_region_params *p = &cxlr->params;
>   	struct cxl_decoder *cxld = cxl_rr->decoder;
> -	struct cxl_switch_decoder *cxlsd;
> +	struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev);
>   	struct cxl_port *iter = port;
> -	u16 eig, peig;
> -	u8 eiw, peiw;
> +	int ig, iw = cxl_rr->nr_targets, rc, inc = 0, pos = cxled->pos;
> +	int distance, parent_distance;
> +	u64 selector, cxlr_sel;
> +	u16 eig;
> +	u8 eiw;
>   
Hi Robert,
The number of local variables suggest to me maybe it is worth to 
refactor this code for simplicity, factoring out the selector and 
distance calculation. You added a new helper above which should include 
most of this, IMO.
Alejandro
>   	/*
>   	 * While root level decoders support x3, x6, x12, switch level
>   	 * decoders only support powers of 2 up to x16.
>   	 */
> -	if (!is_power_of_2(cxl_rr->nr_targets)) {
> +	if (!is_power_of_2(iw)) {
>   		dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
> -			dev_name(port->uport_dev), dev_name(&port->dev),
> -			cxl_rr->nr_targets);
> +			dev_name(port->uport_dev), dev_name(&port->dev), iw);
>   		return -EINVAL;
>   	}
>   
> -	cxlsd = to_cxl_switch_decoder(&cxld->dev);
> -	if (cxl_rr->nr_targets_set) {
> -		int i, distance = 1;
> -		struct cxl_region_ref *cxl_rr_iter;
> +	if (iw > 8 || iw > cxlsd->nr_targets) {
> +		dev_dbg(&cxlr->dev,
> +			"%s:%s:%s: ways: %d overflows targets: %d\n",
> +			dev_name(port->uport_dev), dev_name(&port->dev),
> +			dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> +		return -ENXIO;
> +	}
>   
> -		/*
> -		 * The "distance" between peer downstream ports represents which
> -		 * endpoint positions in the region interleave a given port can
> -		 * host.
> -		 *
> -		 * For example, at the root of a hierarchy the distance is
> -		 * always 1 as every index targets a different host-bridge. At
> -		 * each subsequent switch level those ports map every Nth region
> -		 * position where N is the width of the switch == distance.
> -		 */
> -		do {
> -			cxl_rr_iter = cxl_rr_load(iter, cxlr);
> -			distance *= cxl_rr_iter->nr_targets;
> -			iter = to_cxl_port(iter->dev.parent);
> -		} while (!is_cxl_root(iter));
> -		distance *= cxlrd->cxlsd.cxld.interleave_ways;
> +	/*
> +	 * Calculate the effective granularity and ways to determine
> +	 * HPA bits used as target selectors of the interleave set.
> +	 * Use this to check if the root decoder and all subsequent
> +	 * HDM decoders only use bits from that range as selectors.
> +	 *
> +	 * The "distance" between peer downstream ports represents which
> +	 * endpoint positions in the region interleave a given port can
> +	 * host.
> +	 *
> +	 * For example, at the root of a hierarchy the distance is
> +	 * always 1 as every index targets a different host-bridge. At
> +	 * each subsequent switch level those ports map every Nth region
> +	 * position where N is the width of the switch == distance.
> +	 */
> +
> +	/* Start with the root decoders selector and distance. */
> +	selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways,
> +				cxlrd->cxlsd.cxld.interleave_granularity);
> +	distance = cxlrd->cxlsd.cxld.interleave_ways;
> +	if (!is_power_of_2(distance))
> +		distance /= 3;
> +
> +	for (iter = parent_port; !is_cxl_root(iter);
> +	     iter = to_cxl_port(iter->dev.parent)) {
> +		struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr);
> +		struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder;
> +		u64 cxld_sel;
> +
> +		if (cxld_iter->interleave_ways == 1)
> +			continue;
> +
> +		cxld_sel = get_selector(cxld_iter->interleave_ways,
> +					cxld_iter->interleave_granularity);
> +
> +		if (cxld_sel & selector) {
> +			dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n",
> +				dev_name(iter->uport_dev),
> +				dev_name(&iter->dev), cxld_sel, selector);
> +			return -ENXIO;
> +		}
>   
> -		for (i = 0; i < cxl_rr->nr_targets_set; i++)
> +		selector |= cxld_sel;
> +		distance *= cxl_rr_iter->nr_targets;
> +	}
> +
> +	parent_distance = distance;
> +	distance *= iw;
> +
> +	/* The combined selector bits must fit the region selector. */
> +	cxlr_sel = get_selector(p->interleave_ways,
> +				p->interleave_granularity);
> +
> +	if ((cxlr_sel & selector) != selector) {
> +		dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n",
> +			dev_name(iter->uport_dev),
> +			dev_name(&iter->dev), cxlr_sel, selector);
> +		return -ENXIO;
> +	}
> +
> +	/* Calculate remaining selector bits available for use. */
> +	selector = cxlr_sel & ~selector;
> +
> +	if (cxl_rr->nr_targets_set) {
> +		for (int i = 0; i < cxl_rr->nr_targets_set; i++)
>   			if (ep->dport == cxlsd->target[i]) {
>   				rc = check_last_peer(cxled, ep, cxl_rr,
>   						     distance);
> @@ -1384,87 +1446,40 @@ static int cxl_port_setup_targets(struct cxl_port *port,
>   		goto add_target;
>   	}
>   
> -	if (is_cxl_root(parent_port)) {
> +	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
> +		ig = cxld->interleave_granularity;
> +	else
>   		/*
> +		 * Set the interleave granularity with each interleave
> +		 * level to a multiple of it's parent port interleave
> +		 * ways. Beginning with the granularity of the root
> +		 * decoder set to the region granularity (starting
> +		 * with the inner selector bits of the HPA), the
> +		 * granularity is increased with each level. Calculate
> +		 * this using the parent distance and region
> +		 * granularity.
> +		 *
>   		 * Root decoder IG is always set to value in CFMWS which
>   		 * may be different than this region's IG.  We can use the
>   		 * region's IG here since interleave_granularity_store()
>   		 * does not allow interleaved host-bridges with
>   		 * root IG != region IG.
>   		 */
> -		parent_ig = p->interleave_granularity;
> -		parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
> -		/*
> -		 * For purposes of address bit routing, use power-of-2 math for
> -		 * switch ports.
> -		 */
> -		if (!is_power_of_2(parent_iw))
> -			parent_iw /= 3;
> -	} else {
> -		struct cxl_region_ref *parent_rr;
> -		struct cxl_decoder *parent_cxld;
> -
> -		parent_rr = cxl_rr_load(parent_port, cxlr);
> -		parent_cxld = parent_rr->decoder;
> -		parent_ig = parent_cxld->interleave_granularity;
> -		parent_iw = parent_cxld->interleave_ways;
> -	}
> -
> -	rc = granularity_to_eig(parent_ig, &peig);
> -	if (rc) {
> -		dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
> -			dev_name(parent_port->uport_dev),
> -			dev_name(&parent_port->dev), parent_ig);
> -		return rc;
> -	}
> -
> -	rc = ways_to_eiw(parent_iw, &peiw);
> -	if (rc) {
> -		dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
> -			dev_name(parent_port->uport_dev),
> -			dev_name(&parent_port->dev), parent_iw);
> -		return rc;
> -	}
> +		ig = p->interleave_granularity * parent_distance;
>   
> -	iw = cxl_rr->nr_targets;
>   	rc = ways_to_eiw(iw, &eiw);
> -	if (rc) {
> -		dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
> -			dev_name(port->uport_dev), dev_name(&port->dev), iw);
> -		return rc;
> -	}
> -
> -	/*
> -	 * Interleave granularity is a multiple of @parent_port granularity.
> -	 * Multiplier is the parent port interleave ways.
> -	 */
> -	rc = granularity_to_eig(parent_ig * parent_iw, &eig);
> -	if (rc) {
> -		dev_dbg(&cxlr->dev,
> -			"%s: invalid granularity calculation (%d * %d)\n",
> -			dev_name(&parent_port->dev), parent_ig, parent_iw);
> -		return rc;
> -	}
> -
> -	rc = eig_to_granularity(eig, &ig);
> -	if (rc) {
> -		dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
> -			dev_name(port->uport_dev), dev_name(&port->dev),
> -			256 << eig);
> -		return rc;
> -	}
> +	if (!rc)
> +		rc = granularity_to_eig(ig, &eig);
>   
> -	if (iw > 8 || iw > cxlsd->nr_targets) {
> -		dev_dbg(&cxlr->dev,
> -			"%s:%s:%s: ways: %d overflows targets: %d\n",
> +	if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) {
> +		dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n",
>   			dev_name(port->uport_dev), dev_name(&port->dev),
> -			dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> +			iw, ig, selector);
>   		return -ENXIO;
>   	}
>   
>   	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
>   		if (cxld->interleave_ways != iw ||
> -		    (iw > 1 && cxld->interleave_granularity != ig) ||
>   		    !region_res_match_cxl_range(p, &cxld->hpa_range) ||
>   		    ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
>   			dev_err(&cxlr->dev,
Powered by blists - more mailing lists
 
