aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFlorian Fischer <florian.fl.fischer@fau.de>2020-04-08 16:20:29 +0200
committerFlorian Fischer <florian.fl.fischer@fau.de>2020-04-08 16:20:29 +0200
commitd58dc6c95d9044ffafa08b4327f5abbf0f5b54e0 (patch)
tree5e66b5ceebdfcd8bb9e1e492287fd0d69d53f727
parentf7c6f7142e38e4bf42f95bb706c37c9ae61a04df (diff)
downloadallocbench-d58dc6c95d9044ffafa08b4327f5abbf0f5b54e0.tar.gz
allocbench-d58dc6c95d9044ffafa08b4327f5abbf0f5b54e0.zip
add micro benchmark measuring malloc using rdtsc
-rw-r--r--src/benchmarks/rdtsc.py75
-rw-r--r--src/benchmarks/rdtsc/Makefile25
-rw-r--r--src/benchmarks/rdtsc/rdtsc.c104
3 files changed, 204 insertions, 0 deletions
diff --git a/src/benchmarks/rdtsc.py b/src/benchmarks/rdtsc.py
new file mode 100644
index 0000000..b0cd808
--- /dev/null
+++ b/src/benchmarks/rdtsc.py
@@ -0,0 +1,75 @@
+# Copyright 2020 Florian Fischer <florian.fl.fischer@fau.de>
+#
+# This file is part of allocbench.
+#
+# allocbench is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# allocbench is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with allocbench. If not, see <http://www.gnu.org/licenses/>.
+"""Definition of the rdtsc micro benchmark
+
+This benchmark measures the clock cycles used by malloc.
+It tries to spread the spawned threads across all cores except the first one.
+See: https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched/
+"""
+
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+from src.benchmark import Benchmark
+import src.globalvars
+import src.plots
+
+
+class BenchmarkRdtsc(Benchmark):
+ """rdtsc micro benchmark
+
+ This benchmark allocates and frees n blocks in t concurrent threads measuring the used cycles.
+ """
+ def __init__(self):
+ name = "rdtsc"
+
+ self.cmd = "rdtsc {mode} 100000 64 {threads}"
+ self.measure_cmd = ""
+
+ self.args = {"threads": [1],
+ "mode": ['fresh', 'cached']}
+
+ self.requirements = ["rdtsc"]
+ super().__init__(name)
+
+ def process_output(self, result, stdout, stderr, alloc, perm):
+ all_cycles = []
+ for line in stdout.splitlines():
+ all_cycles.append(int(line.split()[1]))
+ result["cycles"] = all_cycles
+
+ def summary(self):
+ for perm in self.iterate_args(args=self.results['args']):
+ label = f'rdtsc_{perm}_cycles'
+ fig = plt.figure(label)
+ src.plots.FIGURES[label] = fig
+
+ axes = plt.axes()
+ axes.set_ylim([50, 800])
+
+ for alloc in self.results['allocators']:
+ d = np.sort(self.results[alloc][perm][0]['cycles'])
+ plt.plot(d, label=alloc, color=src.plots._get_alloc_color(self, alloc))
+
+ fig.savefig(f'{label}.{src.globalvars.summary_file_ext}')
+ plt.legend()
+ plt.title(str(perm))
+ plt.show()
+
+
+# Module-level benchmark instance.
+# NOTE(review): presumably looked up by name by the allocbench benchmark loader - confirm.
+rdtsc = BenchmarkRdtsc()
diff --git a/src/benchmarks/rdtsc/Makefile b/src/benchmarks/rdtsc/Makefile
new file mode 100644
index 0000000..f81a84b
--- /dev/null
+++ b/src/benchmarks/rdtsc/Makefile
@@ -0,0 +1,25 @@
+# Build the rdtsc micro benchmark into $(OBJDIR)/rdtsc.
+# All variables use ?= so they can be overridden from the environment or command line.
+OBJDIR ?= obj
+
+CC ?= gcc
+
+WARNFLAGS ?= -Wall -Wextra
+# -fno-builtin keeps the compiler from treating malloc/free as builtins.
+COMMONFLAGS ?= -fno-builtin -pthread
+# NOTE(review): -O0 is presumably deliberate so the timed malloc call is not optimized - confirm.
+OPTFLAGS ?= -O0 -g
+
+CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDFLAGS ?= -pthread -static-libgcc
+
+# .PHONY is a special target, not a variable: it must be declared with ':'.
+.PHONY: all clean
+
+all: $(OBJDIR)/rdtsc
+
+# Rebuild when the source or this Makefile changes; $(OBJDIR) is order-only.
+$(OBJDIR)/rdtsc: rdtsc.c Makefile | $(OBJDIR)
+	@echo compiling $@...
+	$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $<
+
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/rdtsc/rdtsc.c b/src/benchmarks/rdtsc/rdtsc.c
new file mode 100644
index 0000000..33a8626
--- /dev/null
+++ b/src/benchmarks/rdtsc/rdtsc.c
@@ -0,0 +1,104 @@
+#define _GNU_SOURCE /* See feature_test_macros(7) */
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+
+/* Benchmark configuration shared by main() and the worker threads. */
+
+/* Non-zero: each block is freed right after its timed malloc call. */
+int mode = 0;
+/* Size in bytes passed to every timed malloc call. */
+int size = 64;
+/* Number of timed malloc calls performed by each thread. */
+int iterations = 100000;
+/* Number of processors, set from get_nprocs() in main(). */
+int num_cpus;
+
+/*
+ * Read the time stamp counter at the START of a measured region.
+ *
+ * cpuid serializes the instruction stream so earlier instructions cannot
+ * leak into the measured region; rdtsc then reads the counter.
+ * cpuid is always executed with leaf 0 so its cost does not depend on
+ * whatever happened to be in EAX, and both statements carry a "memory"
+ * clobber so the compiler does not move memory accesses across them.
+ *
+ * Returns the 64 bit counter value (EDX:EAX).
+ */
+static __inline__ int64_t rdtsc_s(void)
+{
+	unsigned leaf = 0, lo, hi;
+
+	__asm__ volatile("cpuid" : "+a" (leaf) : : "%rbx", "%rcx", "%rdx", "memory");
+	__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi) : : "memory");
+	return (int64_t)(((uint64_t)hi << 32) | lo);
+}
+
+/*
+ * Read the time stamp counter at the END of a measured region.
+ *
+ * rdtscp waits for all previous instructions to finish before reading the
+ * counter; the following cpuid keeps later instructions from starting
+ * before the read. rdtscp also writes IA32_TSC_AUX to ECX, so RCX must be
+ * listed as clobbered. Both statements carry a "memory" clobber so the
+ * compiler does not move memory accesses across them.
+ *
+ * Returns the 64 bit counter value (EDX:EAX).
+ */
+static __inline__ int64_t rdtsc_e(void)
+{
+	unsigned leaf = 0, lo, hi;
+
+	__asm__ volatile("rdtscp" : "=a" (lo), "=d" (hi) : : "%rcx", "memory");
+	__asm__ volatile("cpuid" : "+a" (leaf) : : "%rbx", "%rcx", "%rdx", "memory");
+	return (int64_t)(((uint64_t)hi << 32) | lo);
+}
+
+/*
+ * Worker thread: time `iterations` calls of malloc(size) and print them.
+ *
+ * arg carries the thread index as an integer stored in the pointer value
+ * (NULL means index 0). The thread pins itself to one CPU, skipping CPU0
+ * when more than one CPU is available, measures every malloc call with
+ * rdtsc_s()/rdtsc_e() and afterwards prints one line per call to stdout:
+ *   "malloc(<size>): <cycles> cycles"
+ * Results are buffered and printed after the loop so the output does not
+ * disturb the measurement.
+ *
+ * Always returns NULL; aborts if the result buffer cannot be allocated.
+ */
+static void* test_thread_func(void* arg) {
+	int64_t clock_before, clock_after;
+	void* p;
+
+	int64_t* clocks = malloc((size_t)iterations * sizeof *clocks);
+	if (!clocks)
+		abort();
+
+	// set cpu affinity to prevent cpu switching
+	int64_t tid = (int64_t)(intptr_t) arg;
+	cpu_set_t my_cpu;
+	/* Skip CPU0 - let the OS run on that one.
+	 * With a single CPU there is nothing to skip (and the modulo would
+	 * divide by zero), so fall back to CPU0. */
+	int my_cpu_num = num_cpus > 1 ? (int)(tid % (num_cpus - 1)) + 1 : 0;
+
+	CPU_ZERO (&my_cpu);
+	CPU_SET (my_cpu_num, &my_cpu);
+	if (sched_setaffinity (0, sizeof(my_cpu), &my_cpu) == -1)
+		perror ("setaffinity failed");
+
+	for (int i = 0; i < iterations; i++) {
+		clock_before = rdtsc_s();
+		p = malloc(size);
+		clock_after = rdtsc_e();
+
+		// measure potentially cached allocations
+		if (mode)
+			free(p);
+
+		clocks[i] = clock_after - clock_before;
+	}
+
+	for (int i = 0; i < iterations; i++) {
+		/* clocks[] holds int64_t values: %d would be undefined behaviour */
+		printf("malloc(%d): %" PRId64 " cycles\n", size, clocks[i]);
+	}
+
+	free(clocks);
+	return NULL;
+}
+
+/* Print the command line synopsis to stderr. */
+static void print_usage(const char *prog)
+{
+	fprintf(stderr, "Usage: %s <mode> <iterations> <size> <num threads>\n", prog);
+}
+
+/*
+ * Parse s as a decimal int that is at least min.
+ * Stores the value in *out and returns 0 on success; returns -1 and leaves
+ * *out untouched if s is not a number, has trailing garbage or is out of range.
+ */
+static int parse_int_arg(const char *s, int min, int *out)
+{
+	char *end;
+	long value;
+
+	errno = 0;
+	value = strtol(s, &end, 10);
+	if (end == s || *end != '\0' || errno == ERANGE || value < min || value > INT_MAX)
+		return -1;
+
+	*out = (int)value;
+	return 0;
+}
+
+/*
+ * Usage: rdtsc [<mode> [<iterations> [<size> [<num threads>]]]]
+ *
+ * mode "cached" frees every block right after it was measured, so later
+ * allocations can be served from freed memory; any other mode keeps all
+ * blocks allocated. Omitted arguments keep their defaults.
+ *
+ * Spawns <num threads> worker threads running test_thread_func, passing each
+ * its index, and waits for all of them.
+ * Returns 0 on success and 1 on invalid arguments or thread errors.
+ */
+int main(int argc, char* argv[]) {
+	pthread_t* threads;
+	int num_threads = 1;
+	int rc;
+
+	num_cpus = get_nprocs();
+
+	if (argc > 5) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	/* strcmp returns 0 on a match */
+	if (argc > 1 && strcmp(argv[1], "cached") == 0) mode = 1;
+
+	if ((argc > 2 && parse_int_arg(argv[2], 1, &iterations) != 0) ||
+	    (argc > 3 && parse_int_arg(argv[3], 0, &size) != 0) ||
+	    (argc > 4 && parse_int_arg(argv[4], 1, &num_threads) != 0)) {
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	fprintf(stderr, "iterations = %d; size = %d; threads = %d\n", iterations, size, num_threads);
+
+	threads = malloc((size_t)num_threads * sizeof *threads);
+	if (!threads) {
+		perror("malloc");
+		return 1;
+	}
+
+	for (int i = 0; i < num_threads; i++) {
+		/* pthread functions return the error number instead of setting errno */
+		rc = pthread_create(&threads[i], NULL, test_thread_func, (void*)(intptr_t) i);
+		if (rc != 0) {
+			fprintf(stderr, "pthread_create: %s\n", strerror(rc));
+			return 1;
+		}
+	}
+
+	for (int i = 0; i < num_threads; i++) {
+		rc = pthread_join(threads[i], NULL);
+		if (rc != 0) {
+			fprintf(stderr, "pthread_join: %s\n", strerror(rc));
+			return 1;
+		}
+	}
+
+	free(threads);
+	return 0;
+}