linux-kernel - [PATCH V2] perf bench sched cpu-matrix benchmark

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20111114085741.GA3051@linux.vnet.ibm.com>
Date:	Mon, 14 Nov 2011 14:27:41 +0530
From:	Kamalesh Babulal <kamalesh@...ux.vnet.ibm.com>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	linux-kernel@...r.kernel.org,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Paul Turner <pjt@...gle.com>,
	Venki Pallipadi <venki@...gle.com>,
	Vaidyanathan Srinivasan <svaidy@...ux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>,
	Arnaldo Carvalho de Melo <acme@...hat.com>
Subject: [PATCH V2] perf bench sched cpu-matrix benchmark

Hi Ingo,
 Thanks for the review.

Changes from v1:
- Addressed the review comments from Vatsa and Ingo.
- Added support for creating master/worker thread(s) with individual policy 
  and priority.
- Changed the format used to print the multiplication progress of the thread(s).

perf bench: Add sched cpu-matrix benchmark

perf bench sched cpu-matrix benchmark is a matrix multiplication
workload, which can be used as a replacement for the traditional
while(1) cpu hog.

Example of usage:

% perf bench sched cpu-matrix
# Running sched/cpu-matrix benchmark...

Multiplication of [20] x [20] matrix, using [1] threads
 Total time: 0.000170 [sec]

% perf bench sched cpu-matrix -s1024 -t10 -p1000
# Running sched/cpu-matrix benchmark...
57701987 multiplications over 1.000069 [sec]
60361191 multiplications over 1.000119 [sec]
...
20753139 multiplications over 1.000151 [sec]

Multiplication of [1K] x [1K] matrix, of unsigned int using [10] threads
 Total time: 19.002301 [sec]

% perf bench --format=simple sched cpu-matrix -s1k -t10
 Total time: 18.601030 [sec]

% perf bench sched cpu-matrix -s1k -t10 -wr99 -p1000
# Running sched/cpu-matrix benchmark...
56894683 multiplications over 1.000071 [sec]
53085285 multiplications over 1.000115 [sec]
...
37727908 multiplications over 1.000127 [sec]

Multiplication of [1K] x [1K] matrix, of unsigned int using [10] threads
 Total time: 19.002302 [sec]

Signed-off-by: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
Signed-off-by: Kamalesh Babulal <kamalesh@...ux.vnet.ibm.com>
---
 tools/perf/Makefile           |    1 +
 tools/perf/bench/bench.h      |    2 +
 tools/perf/bench/cpu-matrix.c |  573 +++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin-bench.c    |    3 +
 4 files changed, 579 insertions(+), 0 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b98e307..02bd562 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -363,6 +363,7 @@ ifeq ($(RAW_ARCH),x86_64)
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
 endif
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+BUILTIN_OBJS += $(OUTPUT)bench/cpu-matrix.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
 BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index f7781c6..174465a 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -4,6 +4,8 @@
 extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
 extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
 extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used);
+extern int bench_cpu_matrix(int argc, const char **argv,
+				const char *prefix __used);
 
 #define BENCH_FORMAT_DEFAULT_STR	"default"
 #define BENCH_FORMAT_DEFAULT		0
diff --git a/tools/perf/bench/cpu-matrix.c b/tools/perf/bench/cpu-matrix.c
new file mode 100644
index 0000000..be47c74
--- /dev/null
+++ b/tools/perf/bench/cpu-matrix.c
@@ -0,0 +1,573 @@
+/*
+ * cpu matrix multiplication benchmark
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) IBM Corporation, 2011
+ *
+ * Authors: Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>
+ *	    Kamalesh Babulal <kamalesh@...ux.vnet.ibm.com>
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../builtin.h"
+#include "bench.h"
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <math.h>
+#include <sched.h>
+
+/* Values used for options that are not given on the command line. */
+#define DEFAULT_ITERATIONS 1
+#define DEFAULT_MATRIX_SIZE 20
+#define DEFAULT_NUM_THREADS 1
+#define DEFAULT_SLEEP_MSEC 0
+#define DEFAULT_THREAD_POLICY SCHED_OTHER
+#define DEFAULT_THREAD_PRIO 20
+
+/*
+ * Scheduling policy numbers referenced by parse_policy_prio().
+ * NOTE(review): presumably defined locally because the libc headers do
+ * not always expose them - confirm.
+ */
+#define SCHED_NORMAL	0
+#define SCHED_BATCH	3
+#define	SCHED_IDLE	5
+
+/* Iteration count (-i), raw and parsed matrix size (-s), thread count (-t). */
+static int iterations = DEFAULT_ITERATIONS;
+static const char *mat_size_str = "20";
+static unsigned int mat_size = DEFAULT_MATRIX_SIZE;
+static int num_threads = DEFAULT_NUM_THREADS;
+
+/* Number of worker threads that have started; incremented under ready_lock. */
+static int ready_count;
+/* Progress interval taken from the -p option. */
+static int sleep_msec = DEFAULT_SLEEP_MSEC;
+/* Raw <policy><prio> strings (-m / -w) and the values parsed from them. */
+static const char *master_prio_str = "O0";
+static const char *worker_prio_str = "O0";
+static int master_policy = DEFAULT_THREAD_POLICY;
+static int worker_policy = DEFAULT_THREAD_POLICY;
+static int master_prio = DEFAULT_THREAD_PRIO;
+static int worker_prio = DEFAULT_THREAD_PRIO;
+static pthread_mutex_t ready_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Report a fatal error and terminate the benchmark with exit status 1.
+ *
+ * @str: message describing what failed.
+ *
+ * When errno is set the message goes through perror() so that the system
+ * error text is appended; otherwise only @str is printed.  Both paths
+ * write to stderr, keeping diagnostics out of the benchmark results that
+ * are printed on stdout.
+ */
+static inline void barf(const char *str)
+{
+	if (errno)
+		perror(str);
+	else
+		fprintf(stderr, "%s\n", str);
+
+	exit(1);
+}
+
+/*
+ * malloc() wrapper for callers that cannot continue without memory:
+ * when the allocation fails the benchmark is terminated through barf().
+ */
+static inline void *galloc(size_t size)
+{
+	void *mem;
+
+	mem = malloc(size);
+	if (mem == NULL)
+		barf("malloc ");
+
+	return mem;
+}
+
+
+/*
+ * Fill a square matrix with pseudo-random values in the range 0..99.
+ *
+ * @matrix: buffer holding at least @size * @size elements.
+ * @size:   number of rows (and columns) of the matrix.
+ *
+ * The element count and the loop counter share one wide type.  With a
+ * 32-bit counter compared against a 64-bit count the loop can never
+ * terminate once @size * @size exceeds UINT_MAX, because the counter
+ * wraps back to zero first.
+ */
+static inline void populate_matrix(unsigned int *matrix, int size)
+{
+	size_t nr_elems = (size_t)size * (size_t)size;
+	size_t i;
+
+	for (i = 0; i < nr_elems; i++)
+		matrix[i] = random() % 100;
+}
+
+/*
+ * Command line options of the cpu-matrix benchmark.  Each entry stores
+ * its value in one of the file-scope variables; bench_cpu_matrix()
+ * validates them after parse_options() has run.
+ */
+static const struct option options[] = {
+	OPT_STRING('s', "size", &mat_size_str, "20",
+			"Specify the size of the square matrix.\n"
+			"\t\t\t  Available unit: K, M (upper and lower)"),
+	OPT_INTEGER('i', "iterations", &iterations,
+			"Specify number of iterations"),
+	OPT_INTEGER('t', "threads", &num_threads,
+			"Specify number of threads"),
+	OPT_INTEGER('p', "sleep", &sleep_msec,
+			"Progress to be printed every P milliseconds."),
+	OPT_STRING('m', "master<policy><prio>", &master_prio_str, "O0",
+			"Create master thread(s) with <policy><priority>\n"
+			"\t\t\t  Supported policies are:\n"
+			"\t\t\t\t\t\t F = SCHED_FIFO\n"
+			"\t\t\t\t\t\t R = SCHED_RR\n"
+			"\t\t\t\t\t\t O = SCHED_OTHER <default>\n"
+			"\t\t\t\t\t\t B = SCHED_BATCH\n"
+			"\t\t\t\t\t\t I = SCHED_IDLE"),
+	OPT_STRING('w', "worker<policy><prio>", &worker_prio_str, "O0",
+			"Create worker thread(s) with <policy><priority>\n"
+			"\t\t\t  Supported policies are:\n"
+			"\t\t\t\t\t\t F = SCHED_FIFO\n"
+			"\t\t\t\t\t\t R = SCHED_RR\n"
+			"\t\t\t\t\t\t O = SCHED_OTHER <default>\n"
+			"\t\t\t\t\t\t B = SCHED_BATCH\n"
+			"\t\t\t\t\t\t I = SCHED_IDLE"),
+	OPT_END()
+};
+
+/*
+ * Usage text handed to parse_options().  The sub-command is spelled
+ * "cpu-matrix", matching the name under which the benchmark is
+ * registered in the sched suite.
+ */
+static const char *const bench_cpu_matrix_usage[] = {
+	"perf bench sched cpu-matrix <options>",
+	NULL
+};
+
+/*
+ * Per-thread work descriptor.  matrix_multiply() fills in one entry per
+ * worker thread and hands it to thread_fn().  The worker updates
+ * progress and done while the thread running print_progress() reads
+ * them; no lock is taken around those accesses.
+ */
+struct thread_work {
+	unsigned int *a, *b, *c;	/* Matrix A, B, C */
+	unsigned int matrix_size;	/* Matrix size */
+	unsigned int progress;		/* Multiplication count */
+	unsigned int prev_progress;	/* Used for calculating delta */
+	int size;			/* No. of rows handled by a thread */
+	int start_row;			/* Row to start multiplication */
+	int done;			/* Indication of the thread job done */
+	int num_threads;		/* Number of threads */
+	int iter_count;			/* Number of iterations */
+	int policy;			/* Sched policy */
+	int prio;			/* Priority of thread */
+};
+
+/*
+ * Add up, over all threads, the number of multiplications performed
+ * since the previous call.  Each thread's checkpoint (prev_progress) is
+ * advanced to the value sampled here, so successive calls report
+ * successive deltas.
+ */
+static u64 thread_progress(struct thread_work *work, int thread_count)
+{
+	u64 sum = 0;
+	int t = 0;
+
+	while (t < thread_count) {
+		struct thread_work *w = &work[t++];
+		unsigned int now = w->progress;
+		unsigned int since_last = now - w->prev_progress;
+
+		w->prev_progress = now;
+		sum += since_last;
+	}
+
+	return sum;
+}
+
+/*
+ * Tell whether every thread has flagged completion.
+ * Returns 1 when no entry with a clear done flag is found, 0 otherwise.
+ */
+static inline int all_threads_done(struct thread_work *work, int thread_count)
+{
+	int t;
+
+	for (t = 0; t < thread_count; t++) {
+		/* A single unfinished thread settles the answer. */
+		if (!work[t].done)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Apply a scheduling policy and priority to a thread.
+ *
+ * @thread: thread to modify.
+ * @policy: scheduling policy (a SCHED_* value).
+ * @prio:   value stored in sched_param.sched_priority.
+ *
+ * Nothing is done when both values equal the compile-time defaults.
+ * A failure is fatal and is reported through barf().
+ */
+static void set_thread_policy_prio(pthread_t thread, int policy, int prio)
+{
+	int ret;
+	struct sched_param sched_param;
+
+	/*
+	 * If the policy and priority are the same as the default,
+	 * do nothing.
+	 */
+	if (policy == DEFAULT_THREAD_POLICY &&
+		prio == DEFAULT_THREAD_PRIO)
+		return;
+
+	sched_param.sched_priority = prio;
+
+	/*
+	 * pthread_setschedparam() reports failure by returning a positive
+	 * error number and leaves errno alone, so testing for a negative
+	 * value would never catch an error.  Copy the code into errno to
+	 * let barf() print the reason.
+	 */
+	ret = pthread_setschedparam(thread, policy, &sched_param);
+	if (ret != 0) {
+		errno = ret;
+		barf("Unable to set task policy/priority ");
+	}
+}
+/*
+ * Print, once per interval, how many multiplications all threads have
+ * performed since the previous report, until every thread is done.
+ *
+ * @work:         array of per-thread work descriptors.
+ * @thread_count: number of entries in @work.
+ * @sleep_usec:   reporting interval in microseconds; a value <= 0
+ *                disables reporting and returns immediately.
+ */
+static void print_progress(struct thread_work *work, int thread_count,
+				int sleep_usec)
+{
+	struct timeval start, end, delta;
+	u64 curr_progress;
+
+	/* Decide from the argument, not from file-scope state. */
+	if (sleep_usec <= 0)
+		return;
+
+	gettimeofday(&start, NULL);
+
+	do {
+		/*
+		 * usleep() is only required to accept intervals below one
+		 * second, so whole seconds are slept separately.
+		 */
+		if (sleep_usec >= 1000000)
+			sleep(sleep_usec / 1000000);
+		usleep(sleep_usec % 1000000);
+
+		gettimeofday(&end, NULL);
+		curr_progress = thread_progress(work, thread_count);
+		timersub(&end, &start, &delta);
+		start = end;
+
+		/* Cast explicitly: the timeval field types vary by platform. */
+		printf("%lu multiplications over %lu.%06lu [sec]\n",
+				(unsigned long)curr_progress,
+				(unsigned long)delta.tv_sec,
+				(unsigned long)delta.tv_usec);
+		fflush(stdout);
+
+	} while (!all_threads_done(work, thread_count));
+}
+
+/*
+ * Multiply a single row of A by a single column of B and store the
+ * resulting element in C.
+ *
+ * @a, @b:   input matrices, @size x @size elements each, row-major.
+ * @c:       output matrix, @size x @size elements, row-major.
+ * @row_num: row of @a to use, also the row of the result element.
+ * @col_num: column of @b to use, also the column of the result element.
+ * @size:    number of rows (and columns) of the matrices.
+ * @work:    descriptor whose progress counter is bumped once per
+ *           multiplication.
+ *
+ * Element offsets are computed in size_t: "row_num * size" overflows an
+ * int for matrix sizes above 46340.  The sum is accumulated as unsigned
+ * int, matching the element type, so that it wraps instead of running
+ * into signed overflow.
+ */
+static void row_col_multiply(unsigned int *a, unsigned int *b, unsigned int *c,
+				int row_num, int col_num, int size,
+				struct thread_work *work)
+{
+	const size_t n = (size_t)size;
+	const size_t row_off = (size_t)row_num * n;
+	unsigned int sum = 0;
+	int i;
+
+	for (i = 0; i < size; ++i) {
+		sum += a[row_off + i] * b[(size_t)i * n + col_num];
+		work->progress++;
+	}
+
+	c[row_off + col_num] = sum;
+}
+
+/*
+ * Worker thread body.
+ *
+ * @arg: the struct thread_work describing the rows this thread handles.
+ *
+ * Applies the requested scheduling policy/priority, waits until all
+ * workers have started, then multiplies its share of rows iter_count
+ * times and finally sets the done flag.  Always returns NULL.
+ */
+static void *thread_fn(void *arg)
+{
+	struct thread_work *work = arg;
+	unsigned int *a = work->a, *b = work->b, *c = work->c;
+	int nr_ready, k;
+
+	set_thread_policy_prio(pthread_self(), work->policy, work->prio);
+
+	pthread_mutex_lock(&ready_lock);
+	ready_count++;
+	pthread_mutex_unlock(&ready_lock);
+
+	/*
+	 * Wait for all the threads to start up.  The counter is sampled
+	 * under the same lock that protects its increment, so the wait
+	 * does not rely on an unsynchronised read of shared data.
+	 */
+	for (;;) {
+		pthread_mutex_lock(&ready_lock);
+		nr_ready = ready_count;
+		pthread_mutex_unlock(&ready_lock);
+
+		if (nr_ready >= work->num_threads)
+			break;
+
+		cpu_relax();
+	}
+
+	/*
+	 * Iteration loop
+	 */
+	for (k = 0; k < work->iter_count; k++) {
+		/*
+		 * Rows this thread is supposed to work on
+		 */
+		int row = work->start_row;
+		int i;
+
+		for (i = 0; i < work->size; i++, row++) {
+			unsigned int col;
+
+			/*
+			 * Every column of the current row
+			 */
+			for (col = 0; col < work->matrix_size; col++)
+				row_col_multiply(a, b, c, row, (int)col,
+						 work->matrix_size, work);
+		}
+	}
+
+	work->done = 1;
+
+	return NULL;
+}
+
+/*
+ * Core function to create threads and assign work to them.
+ *
+ * @a, @b:           input matrices.
+ * @c:               output matrix.
+ * @matrix_size:     number of rows (and columns) of the matrices.
+ * @iter_count:      how many times each thread repeats its share.
+ * @thread_count:    number of worker threads to create (must be > 0).
+ * @sleep_usec:      progress reporting interval handed to
+ *                   print_progress().
+ * @mpolicy, @mprio: policy/priority applied to the calling thread.
+ * @wpolicy, @wprio: policy/priority applied by every worker thread.
+ *
+ * Rows are split evenly between the threads and the last thread also
+ * takes the remainder.  Returns after all workers have been joined;
+ * thread creation and join failures are fatal.
+ */
+static void matrix_multiply(unsigned int *a, unsigned int *b, unsigned int *c,
+				int matrix_size, int iter_count,
+				int thread_count, int sleep_usec,
+				int mpolicy, int mprio, int wpolicy, int wprio)
+{
+	int i;
+	unsigned int per_thread_work, rem, row_idx = 0;
+	struct thread_work *work_arr;
+	pthread_t *thread_ids;
+
+	assert(thread_count > 0);
+
+	work_arr = galloc(thread_count * sizeof(*work_arr));
+	thread_ids = galloc(thread_count * sizeof(*thread_ids));
+
+	per_thread_work = matrix_size / thread_count;
+	rem = matrix_size;
+
+	for (i = 0; i < thread_count; ++i) {
+		int num_rows, rc;
+
+		/*
+		 * If the thread is the last thread, assign it all the
+		 * remaining rows
+		 */
+		if (i == (thread_count - 1))
+			num_rows = rem;
+		else
+			num_rows = (rem > per_thread_work) ?
+						 per_thread_work : rem;
+
+		rem -= num_rows;
+		work_arr[i].a = a;
+		work_arr[i].b = b;
+		work_arr[i].c = c;
+		work_arr[i].matrix_size = matrix_size;
+		work_arr[i].iter_count = iter_count;
+		work_arr[i].size = num_rows;
+		work_arr[i].start_row = row_idx;
+		work_arr[i].progress = 0;
+		work_arr[i].prev_progress = 0;
+		work_arr[i].done = 0;
+		work_arr[i].num_threads = thread_count;
+		work_arr[i].policy = wpolicy;
+		work_arr[i].prio = wprio;
+
+		row_idx += num_rows;
+
+		/*
+		 * pthread functions return the error number instead of
+		 * setting errno; hand it to barf() through errno.
+		 */
+		rc = pthread_create(&thread_ids[i], NULL, thread_fn,
+							 &work_arr[i]);
+		if (rc != 0) {
+			errno = rc;
+			barf("pthread_create ");
+		}
+	}
+
+	/* Check against the argument, not against file-scope state. */
+	assert(!rem);
+	assert(row_idx == (unsigned int)matrix_size);
+
+	set_thread_policy_prio(pthread_self(), mpolicy, mprio);
+
+	print_progress(work_arr, thread_count, sleep_usec);
+
+	for (i = 0; i < thread_count; ++i) {
+		int rc = pthread_join(thread_ids[i], NULL);
+
+		if (rc != 0) {
+			errno = rc;
+			barf("pthread_join ");
+		}
+	}
+
+	free(work_arr);
+	free(thread_ids);
+}
+
+/*
+ * Format a matrix dimension for display.
+ *
+ * @length: matrix dimension.
+ *
+ * Positive multiples of 1024 * 1024 are printed with an 'M' suffix,
+ * other positive multiples of 1024 with a 'K' suffix, everything else as
+ * a plain number (e.g. 1048576 -> "1M", 2048 -> "2K", 20 -> "20").
+ *
+ * Returns a string allocated with galloc(); the caller must free() it.
+ */
+static inline char *matrix_string(s64 length)
+{
+	/* Enough for a signed 32-bit decimal number, a unit and the NUL. */
+	const size_t buf_len = 16;
+	const s64 kilo = 1024;
+	const s64 mega = kilo * kilo;
+	char *matrix_size = galloc(buf_len);
+
+	if (length > 0 && (length % mega) == 0)
+		snprintf(matrix_size, buf_len, "%d%c",
+				(int)(length / mega), 'M');
+	else if (length > 0 && (length % kilo) == 0)
+		snprintf(matrix_size, buf_len, "%d%c",
+				(int)(length / kilo), 'K');
+	else
+		snprintf(matrix_size, buf_len, "%d", (int)length);
+
+	return matrix_size;
+}
+
+#define K 1024LL
+
+/*
+ * Parse a matrix size given as a decimal number with an optional unit.
+ *
+ * @str: text to parse: digits, optionally followed by exactly one of
+ *       'k'/'K' (x 1024) or 'm'/'M' (x 1024 * 1024) as the last
+ *       character.
+ *
+ * Returns the size, or -1 when @str is malformed or the result does not
+ * fit in an int.  errno is left as it was found.
+ */
+static int parse_mat_size(const char *str)
+{
+	char *end = NULL;
+	long long value, unit = 1;
+	int saved_errno = errno;
+
+	if (!isdigit((unsigned char)str[0]))
+		return -1;
+
+	/*
+	 * strtoll() saturates on overflow, which the range check below
+	 * rejects; its ERANGE report is therefore not needed and errno is
+	 * restored for the caller.
+	 */
+	value = strtoll(str, &end, 10);
+	errno = saved_errno;
+
+	switch (*end) {
+	case '\0':
+		break;
+	case 'k':
+	case 'K':
+		unit = K;
+		end++;
+		break;
+	case 'm':
+	case 'M':
+		unit = K * K;
+		end++;
+		break;
+	default:
+		return -1;
+	}
+
+	/* Nothing may follow the unit: "1k2" or "1kk" is malformed. */
+	if (*end != '\0')
+		return -1;
+
+	if (value > INT_MAX / unit)
+		return -1;
+
+	return (int)(value * unit);
+}
+
+
+/*
+ * Parse a "<policy><priority>" specification such as "F50" or "O0".
+ *
+ * @str:    text to parse.  The first character selects the policy
+ *          (F, R, O, B or I, in either case); the remaining characters,
+ *          if any, must all be digits and form the priority.  A missing
+ *          priority counts as 0.
+ * @policy: receives the SCHED_* value on success.
+ * @prio:   receives the priority on success.
+ *
+ * SCHED_FIFO and SCHED_RR take a priority of 1..99; all other policies
+ * only take priority 0.
+ *
+ * Returns 0 on success and -1 on a malformed or out-of-range
+ * specification, in which case @policy and @prio are left untouched.
+ */
+static int parse_policy_prio(const char *str, int *policy, int *prio)
+{
+	const char *p;
+	int new_policy, min_prio, max_prio;
+	long value;
+
+	/*
+	 * First letter should be the policy
+	 */
+	switch (str[0]) {
+	case 'f':
+	case 'F':
+		new_policy = SCHED_FIFO;
+		break;
+	case 'r':
+	case 'R':
+		new_policy = SCHED_RR;
+		break;
+	case 'o':
+	case 'O':
+		new_policy = SCHED_OTHER;
+		break;
+	case 'b':
+	case 'B':
+		new_policy = SCHED_BATCH;
+		break;
+	case 'i':
+	case 'I':
+		new_policy = SCHED_IDLE;
+		break;
+	default:
+		return -1;
+	}
+
+	/*
+	 * Everything after the policy letter must be a digit; this also
+	 * keeps signs and white space away from strtol().
+	 */
+	for (p = str + 1; *p != '\0'; p++) {
+		if (!isdigit((unsigned char)*p))
+			return -1;
+	}
+
+	/* An overlong number saturates and fails the range check below. */
+	value = strtol(str + 1, NULL, 10);
+
+	if (new_policy == SCHED_FIFO || new_policy == SCHED_RR) {
+		min_prio = 1;
+		max_prio = 99;
+	} else {
+		min_prio = 0;
+		max_prio = 0;
+	}
+
+	if (value < min_prio || value > max_prio)
+		return -1;
+
+	*policy = new_policy;
+	*prio = (int)value;
+
+	return 0;
+}
+
+
+/*
+ * Entry point of "perf bench sched cpu-matrix".
+ *
+ * @argc, @argv: command line of the benchmark.
+ * @prefix:      unused.
+ *
+ * Parses and validates the options, fills two random square matrices,
+ * multiplies them with the requested number of threads and prints the
+ * elapsed time in the selected bench_format.
+ *
+ * Returns 0; every error terminates the program through barf().
+ */
+int bench_cpu_matrix(int argc, const char **argv,
+			const char *prefix __used)
+{
+	unsigned int *mat_a, *mat_b, *mat_c;
+	struct timeval start, stop, diff;
+	u64 nr_elems;
+	size_t alloc_size;
+	char *matrix_str;
+	int sleep_usec;
+
+	errno = 0;
+
+	parse_options(argc, argv, options, bench_cpu_matrix_usage, 0);
+	mat_size = parse_mat_size(mat_size_str);
+	if ((int)mat_size <= 0)
+		barf("Invalid size of matrix ");
+
+	if (parse_policy_prio(master_prio_str, &master_policy,
+				&master_prio) != 0)
+		barf("Invalid master policy/prio ");
+
+	if (parse_policy_prio(worker_prio_str, &worker_policy,
+				&worker_prio) != 0)
+		barf("Invalid worker policy/prio ");
+
+	if (iterations <= 0)
+		barf("Invalid loop(s) of iterations ");
+
+	if (num_threads <= 0)
+		barf("Invalid number of threads ");
+
+	/*
+	 * Convert the interval into a local variable: the option variable
+	 * keeps the unit its name promises, and the multiplication is
+	 * checked so that it can neither overflow nor go negative.
+	 */
+	if (sleep_msec < 0 || sleep_msec > INT_MAX / 1000)
+		barf("Invalid progress interval ");
+	sleep_usec = sleep_msec * 1000;
+
+	/* Refuse sizes whose byte count does not fit in size_t. */
+	nr_elems = (u64)mat_size * (u64)mat_size;
+	if (nr_elems > (size_t)-1 / sizeof(unsigned int))
+		barf("Matrix too large ");
+	alloc_size = (size_t)nr_elems * sizeof(unsigned int);
+
+	mat_a = galloc(alloc_size);
+	mat_b = galloc(alloc_size);
+	mat_c = galloc(alloc_size);
+
+	populate_matrix(mat_a, mat_size);
+	populate_matrix(mat_b, mat_size);
+
+	gettimeofday(&start, NULL);
+
+	matrix_multiply(mat_a, mat_b, mat_c, mat_size, iterations,
+			num_threads, sleep_usec,
+			master_policy, master_prio,
+			worker_policy, worker_prio);
+
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+
+	matrix_str = matrix_string(mat_size);
+
+	/* Cast explicitly: the timeval field types vary by platform. */
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("\nMultiplication of [%s] x [%s] matrix,"
+			" of unsigned int using [%d] threads\n",
+				 matrix_str,  matrix_str,
+				 num_threads);
+		printf(" %s: %lu.%06lu [sec]\n", "Total time",
+				(unsigned long)diff.tv_sec,
+				(unsigned long)diff.tv_usec);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf(" %s: %lu.%06lu [sec]\n", "Total time",
+				(unsigned long)diff.tv_sec,
+				(unsigned long)diff.tv_usec);
+		break;
+	default:
+		barf("Unknown benchmark format");
+		break;
+	}
+
+	free(mat_a);
+	free(mat_b);
+	free(mat_c);
+	free(matrix_str);
+	return 0;
+}
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index fcb9626..df84428 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -42,6 +42,9 @@ static struct bench_suite sched_suites[] = {
 	{ "pipe",
 	  "Flood of communication over pipe() between two processes",
 	  bench_sched_pipe      },
+	{ "cpu-matrix",
+	  "Benchmark to run cpu matrix multiplication",
+	  bench_cpu_matrix      },
 	suite_all,
 	{ NULL,
 	  NULL,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/