This file contains functions for handling services provided to other kernel
modules that use the GRU.  Examples include the SGI xpmem & xpc drivers.

Signed-off-by: Jack Steiner <steiner@sgi.com>

---
 drivers/misc/gru/grukservices.c |  706 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 706 insertions(+)

Index: linux/drivers/misc/gru/grukservices.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/misc/gru/grukservices.c	2008-06-09 13:53:05.880788446 -0500
@@ -0,0 +1,706 @@
+/*
+ * SN Platform GRU Driver
+ *
+ * KERNEL SERVICES THAT USE THE GRU
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/interrupt.h>
+#include <linux/uaccess.h>
+#include "gru.h"
+#include "grulib.h"
+#include "grutables.h"
+#include "grukservices.h"
+#include "gru_instructions.h"
+#include <asm/uv/uv_hub.h>
+
+/*
+ * Kernel GRU Usage
+ *
+ * The following is an interim algorithm for management of kernel GRU
+ * resources. This will likely be replaced when we better understand the
+ * kernel/user requirements.
+ *
+ * At boot time, the kernel permanently reserves a fixed number of
+ * CBRs/DSRs for each cpu to use. The resources are all taken from
+ * the GRU chiplet 1 on the blade. This leaves the full set of resources
+ * of chiplet 0 available to be allocated to a single user.
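+ *
+ * As a rough worked example (the cpu count here is only illustrative):
+ * with GRU_NUM_KERNEL_CBR = 1 and GRU_NUM_KERNEL_DSR_BYTES = 256, a blade
+ * with 16 possible cpus permanently ties up 16 CBRs and 16 * 256 = 4096
+ * bytes of DSR space on chiplet 1; see gru_kservices_init() below.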
+ */ + +/* Blade percpu resources PERMANENTLY reserved for kernel use */ +#define GRU_NUM_KERNEL_CBR 1 +#define GRU_NUM_KERNEL_DSR_BYTES 256 +#define KERNEL_CTXNUM 15 + +/* GRU instruction attributes for all instructions */ +#define IMA IMA_CB_DELAY + +#define __gru_cacheline_aligned__ \ + __attribute__((__aligned__(GRU_CACHE_LINE_BYTES))) + +#define MAGIC 0x1234567887654321UL + +/* Default retry count for GRU errors on kernel instructions */ +#define EXCEPTION_RETRY_LIMIT 3 + +/* Status of message queue sections */ +#define MQS_EMPTY 0 +#define MQS_FULL 1 +#define MQS_NOOP 2 + +/* Additional errors not exported to caller */ +#define MQE_AGAIN 20 /* try again */ + +/*----------------- RESOURCE MANAGEMENT -------------------------------------*/ +/* optimized for x86_64 */ +struct message_queue { + union gru_mesqhead head __gru_cacheline_aligned__; /* CL 0 */ + int qlines; /* DW 1 */ + long hstatus[2]; + void *next __gru_cacheline_aligned__;/* CL 1 */ + void *limit; + void *start; + void *start2; + char data ____cacheline_aligned; /* CL 2 */ +}; + +/* First word in every message - used by mesq interface */ +struct message_header { + char present; + char present2; + char lines; + char fill; +}; + +static int gru_get_cpu_resources(int dsr_bytes, void **cb, void **dsr) +{ + struct gru_blade_state *bs; + int lcpu; + + BUG_ON(dsr_bytes > GRU_NUM_KERNEL_DSR_BYTES); + preempt_disable(); + bs = gru_base[uv_numa_blade_id()]; + lcpu = uv_blade_processor_id(); + *cb = bs->kernel_cb + lcpu * GRU_HANDLE_STRIDE; + *dsr = bs->kernel_dsr + lcpu * GRU_NUM_KERNEL_DSR_BYTES; + return 0; +} + +static void gru_free_cpu_resources(void *cb, void *dsr) +{ + preempt_enable(); +} + +int gru_get_cb_exception_detail(void *cb, + struct control_block_extended_exc_detail *excdet) +{ + struct gru_control_block_extended *cbe; + + cbe = cb_to_cbe(cb, CBNUM(cb)); + excdet->opc = cbe->opccpy; + excdet->exopc = cbe->exopccpy; + excdet->ecause = cbe->ecause; + excdet->exceptdet0 = cbe->idef1upd; + excdet->exceptdet1 = cbe->idef3upd; + return 0; +} + +char *gru_get_cb_exception_detail_str(int ret, void *cb, + char *buf, int size) +{ + struct gru_control_block_status *gen = (void *)cb; + struct control_block_extended_exc_detail excdet; + + if (ret > 0 && gen->istatus == CBS_EXCEPTION) { + gru_get_cb_exception_detail(cb, &excdet); + snprintf(buf, size, + "GRU exception: cb %p, opc %d, exopc %d, ecause 0x%x," + "excdet0 0x%lx, excdet1 0x%x", + gen, excdet.opc, excdet.exopc, excdet.ecause, + excdet.exceptdet0, excdet.exceptdet1); + } else { + snprintf(buf, size, "No exception"); + } + return buf; +} + +static int gru_wait_idle_or_exception(struct gru_control_block_status *gen) +{ + while (gen->istatus >= CBS_ACTIVE) { + cpu_relax(); + barrier(); + } + return gen->istatus; +} + +static int gru_retry_exception(void *cb) +{ + struct gru_control_block_status *gen = (void *)cb; + struct control_block_extended_exc_detail excdet; + int retry = EXCEPTION_RETRY_LIMIT; + + while (1) { + if (gru_get_cb_message_queue_substatus(cb)) + break; + if (gru_wait_idle_or_exception(gen) == CBS_IDLE) + return CBS_IDLE; + + gru_get_cb_exception_detail(cb, &excdet); + if (excdet.ecause & ~EXCEPTION_RETRY_BITS) + break; + if (retry-- == 0) + break; + gen->icmd = 1; + gru_flush_cache(gen); + } + return CBS_EXCEPTION; +} + +int gru_check_status_proc(void *cb) +{ + struct gru_control_block_status *gen = (void *)cb; + int ret; + + ret = gen->istatus; + if (ret != CBS_EXCEPTION) + return ret; + return gru_retry_exception(cb); + +} + +int gru_wait_proc(void *cb) 
+{ + struct gru_control_block_status *gen = (void *)cb; + int ret; + + ret = gru_wait_idle_or_exception(gen); + if (ret == CBS_EXCEPTION) + ret = gru_retry_exception(cb); + + return ret; +} + +void gru_abort(int ret, void *cb, char *str) +{ + char buf[GRU_EXC_STR_SIZE]; + + panic("GRU FATAL ERROR: %s - %s\n", str, + gru_get_cb_exception_detail_str(ret, cb, buf, sizeof(buf))); +} + +void gru_wait_abort_proc(void *cb) +{ + int ret; + + ret = gru_wait_proc(cb); + if (ret) + gru_abort(ret, cb, "gru_wait_abort"); +} + + +/*------------------------------ MESSAGE QUEUES -----------------------------*/ + +/* + * Save/restore the "present" flag that is in the second line of 2-line + * messages + */ +static inline int get_present2(void *p) +{ + struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES; + return mhdr->present; +} + +static inline void restore_present2(void *p, int val) +{ + struct message_header *mhdr = p + GRU_CACHE_LINE_BYTES; + mhdr->present = val; +} + +/* + * Create a message queue. + * qlines - message queue size in cache lines. Includes 2-line header. + */ +int gru_create_message_queue(void *p, unsigned int bytes) +{ + struct message_queue *mq = p; + unsigned int qlines; + + qlines = bytes / GRU_CACHE_LINE_BYTES - 2; + memset(mq, 0, bytes); + mq->start = &mq->data; + mq->start2 = &mq->data + (qlines / 2 - 1) * GRU_CACHE_LINE_BYTES; + mq->next = &mq->data; + mq->limit = &mq->data + (qlines - 2) * GRU_CACHE_LINE_BYTES; + mq->qlines = qlines; + mq->hstatus[0] = 0; + mq->hstatus[1] = 1; + mq->head = gru_mesq_head(2, qlines / 2 + 1); + return 0; +} +EXPORT_SYMBOL_GPL(gru_create_message_queue); + +/* + * Send a NOOP message to a message queue + * Returns: + * 0 - if queue is full after the send. This is the normal case + * but various races can change this. + * -1 - if mesq sent successfully but queue not full + * >0 - unexpected error. MQE_xxx returned + */ +static int send_noop_message(void *cb, + struct message_queue *xmq, void *mesg) +{ + const struct message_header noop_header = { + .present = MQS_NOOP, .lines = 1}; + void *m; + int substatus, ret; + struct message_header save_mhdr, *mhdr = mesg; + + STAT(mesq_noop); + save_mhdr = *mhdr; + *mhdr = noop_header; + gru_mesq(cb, xmq, gru_get_tri(mhdr), 1, IMA); + ret = gru_wait(cb); + + if (ret) { + substatus = gru_get_cb_message_queue_substatus(cb); + switch (substatus) { + case CBSS_NO_ERROR: + STAT(mesq_noop_unexpected_error); + ret = MQE_UNEXPECTED_CB_ERR; + break; + case CBSS_LB_OVERFLOWED: + STAT(mesq_noop_lb_overflow); + ret = MQE_CONGESTION; + break; + case CBSS_QLIMIT_REACHED: + STAT(mesq_noop_qlimit_reached); + ret = 0; + break; + case CBSS_AMO_NACKED: + STAT(mesq_noop_amo_nacked); + ret = MQE_CONGESTION; + break; + case CBSS_PUT_NACKED: + STAT(mesq_noop_put_nacked); + m = (void *)xmq + (gru_get_amo_value_head(cb) << 6); + gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, 1, 1, + IMA); + if (gru_wait(cb) == CBS_IDLE) + ret = MQE_AGAIN; + else + ret = MQE_UNEXPECTED_CB_ERR; + break; + case CBSS_PAGE_OVERFLOW: + default: + BUG(); + } + } + *mhdr = save_mhdr; + return ret; +} + +/* + * Handle a gru_mesq full. + */ +static int send_message_queue_full(void *cb, + struct message_queue *xmq, void *mesg, int lines) +{ + union gru_mesqhead mqh; + unsigned int limit, head; + unsigned long avalue; + int half, qlines, save; + + /* Determine if switching to first/second half of q */ + avalue = gru_get_amo_value(cb); + head = gru_get_amo_value_head(cb); + limit = gru_get_amo_value_limit(cb); + + /* + * Fetch "qlines" from the queue header. 
Since the queue may be + * in memory that can't be accessed using socket addresses, use + * the GRU to access the data. Use DSR space from the message. + */ + save = *(int *)mesg; + gru_vload(cb, &xmq->qlines, gru_get_tri(mesg), XTYPE_W, 1, 1, IMA); + if (gru_wait(cb) != CBS_IDLE) + goto cberr; + qlines = *(int *)mesg; + *(int *)mesg = save; + half = (limit != qlines); + + if (half) + mqh = gru_mesq_head(qlines / 2 + 1, qlines); + else + mqh = gru_mesq_head(2, qlines / 2 + 1); + + /* Try to get lock for switching head pointer */ + gru_gamir(cb, EOP_IR_CLR, &xmq->hstatus[half], XTYPE_DW, IMA); + if (gru_wait(cb) != CBS_IDLE) + goto cberr; + if (!gru_get_amo_value(cb)) { + STAT(mesq_qf_locked); + return MQE_QUEUE_FULL; + } + + /* Got the lock. Send optional NOP if queue not full, */ + if (head != limit) { + if (send_noop_message(cb, xmq, mesg)) { + gru_gamir(cb, EOP_IR_INC, &xmq->hstatus[half], + XTYPE_DW, IMA); + if (gru_wait(cb) != CBS_IDLE) + goto cberr; + STAT(mesq_qf_noop_not_full); + return MQE_AGAIN; + } + avalue++; + } + + /* Then flip queuehead to other half of queue. */ + gru_gamer(cb, EOP_ERR_CSWAP, xmq, XTYPE_DW, mqh.val, avalue, IMA); + if (gru_wait(cb) != CBS_IDLE) + goto cberr; + + /* If not successfully in swapping queue head, clear the hstatus lock */ + if (gru_get_amo_value(cb) != avalue) { + STAT(mesq_qf_switch_head_failed); + gru_gamir(cb, EOP_IR_INC, &xmq->hstatus[half], XTYPE_DW, IMA); + if (gru_wait(cb) != CBS_IDLE) + goto cberr; + } + return MQE_AGAIN; +cberr: + STAT(mesq_qf_unexpected_error); + return MQE_UNEXPECTED_CB_ERR; +} + + +/* + * Handle a gru_mesq failure. Some of these failures are software recoverable + * or retryable. + */ +static int send_message_failure(void *cb, + struct message_queue *xmq, + void *mesg, + int lines) +{ + int substatus, ret = 0; + void *m; + + substatus = gru_get_cb_message_queue_substatus(cb); + switch (substatus) { + case CBSS_NO_ERROR: + STAT(mesq_send_unexpected_error); + ret = MQE_UNEXPECTED_CB_ERR; + break; + case CBSS_LB_OVERFLOWED: + STAT(mesq_send_lb_overflow); + ret = MQE_CONGESTION; + break; + case CBSS_QLIMIT_REACHED: + STAT(mesq_send_qlimit_reached); + ret = send_message_queue_full(cb, xmq, mesg, lines); + break; + case CBSS_AMO_NACKED: + STAT(mesq_send_amo_nacked); + ret = MQE_CONGESTION; + break; + case CBSS_PUT_NACKED: + STAT(mesq_send_put_nacked); + m = (void *)xmq + (gru_get_amo_value_head(cb) << 6); + gru_vstore(cb, m, gru_get_tri(mesg), XTYPE_CL, lines, 1, IMA); + if (gru_wait(cb) == CBS_IDLE) + ret = MQE_OK; + else + ret = MQE_UNEXPECTED_CB_ERR; + break; + default: + BUG(); + } + return ret; +} + +/* + * Send a message to a message queue + * cb GRU control block to use to send message + * xmq message queue + * mesg message. 
Must be vaddr within a GSEG
+ *	bytes	message size (<= 2 CL)
+ */
+int gru_send_message_gpa(void *xmq, void *mesg, unsigned int bytes)
+{
+	struct message_header *mhdr;
+	void *cb;
+	void *dsr;
+	int istatus, clines, ret;
+
+	STAT(mesq_send);
+	BUG_ON(bytes < sizeof(int) || bytes > 2 * GRU_CACHE_LINE_BYTES);
+
+	clines = (bytes + GRU_CACHE_LINE_BYTES - 1) / GRU_CACHE_LINE_BYTES;
+	if (gru_get_cpu_resources(bytes, &cb, &dsr))
+		return MQE_BUG_NO_RESOURCES;
+	memcpy(dsr, mesg, bytes);
+	mhdr = dsr;
+	mhdr->present = MQS_FULL;
+	mhdr->lines = clines;
+	if (clines == 2) {
+		mhdr->present2 = get_present2(mhdr);
+		restore_present2(mhdr, MQS_FULL);
+	}
+
+	do {
+		ret = MQE_OK;
+		gru_mesq(cb, xmq, gru_get_tri(mhdr), clines, IMA);
+		istatus = gru_wait(cb);
+		if (istatus != CBS_IDLE)
+			ret = send_message_failure(cb, xmq, dsr, clines);
+	} while (ret == MQE_AGAIN);
+	gru_free_cpu_resources(cb, dsr);
+
+	if (ret)
+		STAT(mesq_send_failed);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(gru_send_message_gpa);
+
+/*
+ * Advance the receive pointer for the queue to the next message.
+ */
+void gru_next_message(void *rmq)
+{
+	struct message_queue *mq = rmq;
+	struct message_header *mhdr = mq->next;
+	void *next, *pnext;
+	int half = -1;
+	int lines = mhdr->lines;
+
+	if (lines == 2)
+		restore_present2(mhdr, MQS_EMPTY);
+	mhdr->present = MQS_EMPTY;
+
+	pnext = mq->next;
+	next = pnext + GRU_CACHE_LINE_BYTES * lines;
+	if (next == mq->limit) {
+		next = mq->start;
+		half = 1;
+	} else if (pnext < mq->start2 && next >= mq->start2) {
+		half = 0;
+	}
+
+	if (half >= 0)
+		mq->hstatus[half] = 1;
+	mq->next = next;
+}
+EXPORT_SYMBOL_GPL(gru_next_message);
+
+/*
+ * Get next message from message queue.  Return NULL if no message
+ * present.  User must call next_message() to move to next message.
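+ *
+ * A minimal receive loop in a client module might look like the sketch
+ * below ("handle_payload" is a placeholder for whatever the caller does
+ * with a message; it is not part of this interface):
+ *
+ *	void *m;
+ *
+ *	while ((m = gru_receive_message(rmq)) != NULL) {
+ *		handle_payload(m);
+ *		gru_next_message(rmq);
+ *	}
+ *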
+ * rmq message queue + */ +void *gru_receive_message(void *rmq) +{ + struct message_queue *mq = rmq; + struct message_header *mhdr = mq->next; + int present = mhdr->present; + + /* skip NOOP messages */ + STAT(mesq_receive); + while (present == MQS_NOOP) { + gru_next_message(rmq); + mhdr = mq->next; + present = mhdr->present; + } + + /* Wait for both halves of 2 line messages */ + if (present == MQS_FULL && mhdr->lines == 2 && + get_present2(mhdr) == MQS_EMPTY) + present = MQS_EMPTY; + + if (!present) { + STAT(mesq_receive_none); + return NULL; + } + + if (mhdr->lines == 2) + restore_present2(mhdr, mhdr->present2); + + return mhdr; +} +EXPORT_SYMBOL_GPL(gru_receive_message); + +/* ---------------------- GRU DATA COPY FUNCTIONS ---------------------------*/ + +/* + * Copy a block of data using the GRU resources + */ +int gru_copy_gpa(void *dest_gpa, void *src_gpa, unsigned int bytes) +{ + void *cb; + void *dsr; + int ret; + + STAT(mesq_send); + if (gru_get_cpu_resources(GRU_NUM_KERNEL_DSR_BYTES, &cb, &dsr)) + return MQE_BUG_NO_RESOURCES; + gru_bcopy(cb, src_gpa, dest_gpa, gru_get_tri(dsr), + XTYPE_B, bytes, GRU_NUM_KERNEL_DSR_BYTES, IMA); + ret = gru_wait(cb); + gru_free_cpu_resources(cb, dsr); + return ret; +} + +/* ---------------------- KERNEL QUICKTESTS RUN AT STARTUP ------------------*/ +static __cacheline_aligned unsigned long word0; +static __cacheline_aligned unsigned long word1; + +static int quicktest(struct gru_state *gru) +{ + void *cb; + void *dsr; + unsigned long *p; + + cb = GRU_GSEG_CB(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0); + dsr = GRU_GSEG_DS(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0); + p = dsr; + word0 = MAGIC; + + gru_vload(cb, uv_vgpa(&word0), 0, XTYPE_DW, 1, 1, IMA); + if (gru_wait(cb) != CBS_IDLE) + BUG(); + + if (*(unsigned long *)dsr != MAGIC) + BUG(); + gru_vstore(cb, uv_vgpa(&word1), 0, XTYPE_DW, 1, 1, IMA); + if (gru_wait(cb) != CBS_IDLE) + BUG(); + + if (word0 != word1 || word0 != MAGIC) { + printk + ("GRU quicktest err: gru %d, found 0x%lx, expected 0x%lx\n", + gru->gs_gid, word1, MAGIC); + BUG(); /* ZZZ should not be fatal */ + } + + return 0; +} + +#define ALIGNUP(p, q) ((void *)(((unsigned long)(p) + (q) - 1) & ~(q - 1))) + +int quicktest2(void) +{ + void *p, *mq; + unsigned long *dw; + int i, ret; + char mes[GRU_CACHE_LINE_BYTES], *m; + + /* Need 1K cacheline aligned that does not cross page boundary */ + p = kmalloc(4096, 0); + mq = ALIGNUP(p, 1024); + memset(mes, 0xee, sizeof(mes)); + dw = mq; + + gru_create_message_queue(mq, 18 * GRU_CACHE_LINE_BYTES); + for (i = 0; i < 16; i++) { + mes[8] = i; + do { + ret = gru_send_message_gpa(uv_vgpa(mq), mes, + sizeof(mes)); + } while (ret == MQE_CONGESTION); + if (ret) + break; + } + BUG_ON(ret != MQE_QUEUE_FULL || i != 14); + + for (i = 0; i < 16; i++) { + m = gru_receive_message(mq); + if (!m || m[8] != i) + break; + gru_next_message(mq); + } + BUG_ON(i != 14); + kfree(p); + return 0; +} + +int gru_kservices_init(struct gru_state *gru) +{ + struct gru_blade_state *bs; + struct gru_context_configuration_handle *cch; + unsigned long cbr_map, dsr_map; + int err, num, cpus_possible; + + /* + * Currently, resources are reserved ONLY on the second chiplet + * on each blade. This leaves ALL resources on chiplet 0 available + * for user code. 
+ */
+	bs = gru->gs_blade;
+	if (gru != &bs->bs_grus[1])
+		return 0;
+
+	cpus_possible = uv_blade_nr_possible_cpus(gru->gs_blade_id);
+
+	num = GRU_NUM_KERNEL_CBR * cpus_possible;
+	cbr_map = reserve_gru_cb_resources(gru, GRU_CB_COUNT_TO_AU(num), NULL);
+	gru->gs_reserved_cbrs += num;
+
+	num = GRU_NUM_KERNEL_DSR_BYTES * cpus_possible;
+	dsr_map = reserve_gru_ds_resources(gru, GRU_DS_BYTES_TO_AU(num), NULL);
+	gru->gs_reserved_dsr_bytes += num;
+
+	gru->gs_active_contexts++;
+	__set_bit(KERNEL_CTXNUM, &gru->gs_context_map);
+	cch = GRU_CCH(gru->gs_gru_base_vaddr, KERNEL_CTXNUM);
+
+	bs->kernel_cb = GRU_GSEG_CB(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
+	bs->kernel_dsr = GRU_GSEG_DS(gru->gs_gru_base_vaddr, KERNEL_CTXNUM, 0);
+
+	lock_cch_handle(cch);
+	cch->tfm_fault_bit_enable = 0;
+	cch->tlb_int_enable = 0;
+	cch->tfm_done_bit_enable = 0;
+	cch->unmap_enable = 1;
+	err = cch_allocate(cch, 0, cbr_map, dsr_map);
+	if (err) {
+		gru_dbg(grudev,
+			"Unable to allocate kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	err = cch_start(cch);
+	if (err) {
+		gru_dbg(grudev, "Unable to start kernel CCH: gru %d, err %d\n",
+			gru->gs_gid, err);
+		BUG();
+	}
+	unlock_cch_handle(cch);
+
+	if (options & GRU_QUICKLOOK)
+		quicktest(gru);
+	return 0;
+}

--
Thanks

Jack Steiner (steiner@sgi.com)          651-683-5302
Sr Principal Engineer                   SGI - Silicon Graphics, Inc.
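P.S. For anyone reading the interface for the first time, here is a rough
client-side sketch of the message-queue calls exported above. It mirrors
quicktest2(): the 4K buffer, the 18-cacheline queue size, the payload byte
and the retry policy are illustrative assumptions rather than requirements
of the API, and ex_mesq_demo() is a made-up name.

	/*
	 * Client-side sketch (not part of the patch): create a queue and
	 * push one message into it, following the conventions used by
	 * quicktest2() above.
	 */
	static int ex_mesq_demo(void)
	{
		char mes[GRU_CACHE_LINE_BYTES];
		void *p, *mq, *m;
		int ret;

		/* Queue memory: 1K aligned, must not cross a page boundary */
		p = kmalloc(4096, GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		mq = (void *)(((unsigned long)p + 1023) & ~1023UL);

		gru_create_message_queue(mq, 18 * GRU_CACHE_LINE_BYTES);

		/* First word of the message is the header; payload follows */
		memset(mes, 0, sizeof(mes));
		mes[8] = 0x5a;

		do {
			ret = gru_send_message_gpa(uv_vgpa(mq), mes,
						   sizeof(mes));
		} while (ret == MQE_CONGESTION);

		if (!ret) {
			m = gru_receive_message(mq);	/* NULL if none pending */
			if (m)
				gru_next_message(mq);
		}

		kfree(p);
		return ret;
	}

In real use the sender would normally run on another blade or partition and
be handed the queue's global address out of band; the sketch keeps both ends
local only to show the call sequence.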