diff options
| author | Florian Fischer <florian.fl.fischer@fau.de> | 2020-04-08 16:20:29 +0200 |
|---|---|---|
| committer | Florian Fischer <florian.fl.fischer@fau.de> | 2020-04-08 16:20:29 +0200 |
| commit | d58dc6c95d9044ffafa08b4327f5abbf0f5b54e0 (patch) | |
| tree | 5e66b5ceebdfcd8bb9e1e492287fd0d69d53f727 | |
| parent | f7c6f7142e38e4bf42f95bb706c37c9ae61a04df (diff) | |
| download | allocbench-d58dc6c95d9044ffafa08b4327f5abbf0f5b54e0.tar.gz allocbench-d58dc6c95d9044ffafa08b4327f5abbf0f5b54e0.zip | |
add micro benchmark measuring malloc using rdtsc
| -rw-r--r-- | src/benchmarks/rdtsc.py | 82 | ||||
| -rw-r--r-- | src/benchmarks/rdtsc/Makefile | 26 | ||||
| -rw-r--r-- | src/benchmarks/rdtsc/rdtsc.c | 119 |
3 files changed, 227 insertions, 0 deletions
diff --git a/src/benchmarks/rdtsc.py b/src/benchmarks/rdtsc.py
new file mode 100644
index 0000000..b0cd808
--- /dev/null
+++ b/src/benchmarks/rdtsc.py
@@ -0,0 +1,82 @@
+# Copyright 2020 Florian Fischer <florian.fl.fischer@fau.de>
+#
+# This file is part of allocbench.
+#
+# allocbench is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# allocbench is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with allocbench. If not, see <http://www.gnu.org/licenses/>.
+"""Definition of the rdtsc micro benchmark
+
+This benchmark measures the clock cycles used by malloc.
+It tries to spread the spawned threads on all cores except the first one.
+See: https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched/
+"""
+
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+
+from src.benchmark import Benchmark
+import src.globalvars
+import src.plots
+
+
+class BenchmarkRdtsc(Benchmark):
+    """rdtsc micro benchmark
+
+    This benchmark allocates and frees n blocks in t concurrent threads measuring the used cycles.
+    """
+    def __init__(self):
+        name = "rdtsc"
+
+        # Argument order must match rdtsc.c: <mode> <iterations> <size> <threads>
+        self.cmd = "rdtsc {mode} 100000 64 {threads}"
+        self.measure_cmd = ""
+
+        self.args = {"threads": [1],
+                     "mode": ['fresh', 'cached']}
+
+        self.requirements = ["rdtsc"]
+        super().__init__(name)
+
+    def process_output(self, result, stdout, stderr, alloc, perm):
+        """Collect the cycle count of every measured malloc call
+
+        Each stdout line has the form "malloc(<size>): <cycles> cycles".
+        """
+        all_cycles = []
+        for line in stdout.splitlines():
+            all_cycles.append(int(line.split()[1]))
+        result["cycles"] = all_cycles
+
+    def summary(self):
+        """Plot the sorted cycle counts of all allocators for each argument permutation"""
+        for perm in self.iterate_args(args=self.results['args']):
+            label = f'rdtsc_{perm}_cycles'
+            fig = plt.figure(label)
+            src.plots.FIGURES[label] = fig
+
+            axes = plt.axes()
+            axes.set_ylim([50, 800])
+
+            for alloc in self.results['allocators']:
+                d = np.sort(self.results[alloc][perm][0]['cycles'])
+                plt.plot(d, label=alloc, color=src.plots._get_alloc_color(self, alloc))
+
+            # Add legend and title before saving so they end up in the written file
+            plt.legend()
+            plt.title(str(perm))
+            fig.savefig(f'{label}.{src.globalvars.summary_file_ext}')
+            plt.show()
+
+
+rdtsc = BenchmarkRdtsc()
diff --git a/src/benchmarks/rdtsc/Makefile b/src/benchmarks/rdtsc/Makefile
new file mode 100644
index 0000000..f81a84b
--- /dev/null
+++ b/src/benchmarks/rdtsc/Makefile
@@ -0,0 +1,26 @@
+OBJDIR ?= obj
+
+CC ?= gcc
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -pthread
+OPTFLAGS ?= -O0 -g
+
+CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDFLAGS ?= -pthread -static-libgcc
+
+.PHONY: all clean
+
+all: $(OBJDIR)/rdtsc
+
+# $(OBJDIR) is an order-only prerequisite: it must exist but never triggers a rebuild
+$(OBJDIR)/rdtsc: rdtsc.c Makefile | $(OBJDIR)
+	@echo compiling $@...
+	$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $<
+
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/rdtsc/rdtsc.c b/src/benchmarks/rdtsc/rdtsc.c
new file mode 100644
index 0000000..33a8626
--- /dev/null
+++ b/src/benchmarks/rdtsc/rdtsc.c
@@ -0,0 +1,119 @@
+#define _GNU_SOURCE /* See feature_test_macros(7) */
+#include <inttypes.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+
+/* Benchmark parameters, set once in main() before any thread is started */
+int mode = 0; /* 0: fresh - never free; 1: cached - free each block right away */
+int size = 64;
+int iterations = 100000;
+int num_cpus;
+
+/* Read the time stamp counter at the start of a measurement.
+ * cpuid is a serializing instruction, so earlier instructions finish first. */
+static __inline__ int64_t rdtsc_s(void)
+{
+	unsigned a, d;
+	asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+	asm volatile("rdtsc" : "=a" (a), "=d" (d));
+	return ((uint64_t)a) | (((uint64_t)d) << 32);
+}
+
+/* Read the time stamp counter at the end of a measurement.
+ * rdtscp waits for all previous instructions and also writes ecx, so rcx is
+ * clobbered; the trailing cpuid keeps later instructions out of the timing. */
+static __inline__ int64_t rdtsc_e(void)
+{
+	unsigned a, d;
+	asm volatile("rdtscp" : "=a" (a), "=d" (d) :: "%rcx");
+	asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+	return ((uint64_t)a) | (((uint64_t)d) << 32);
+}
+
+/* Thread body: pin the thread to one CPU, time `iterations` calls of
+ * malloc(size) and print one line per call to stdout.
+ * arg carries the zero-based thread index. */
+static void* test_thread_func(void* arg) {
+	int64_t* clocks = malloc(iterations * sizeof(int64_t));
+	if (!clocks)
+		abort();
+
+	// set cpu affinity to prevent cpu switching
+	int64_t tid = (int64_t)(intptr_t) arg;
+	cpu_set_t my_cpu;
+	/* Skip CPU0 - let the OS run on that one - unless it is the only CPU */
+	int my_cpu_num = (num_cpus > 1) ? (tid % (num_cpus-1))+1 : 0;
+
+	CPU_ZERO (&my_cpu);
+	CPU_SET (my_cpu_num, &my_cpu);
+	if (sched_setaffinity (0, sizeof(my_cpu), &my_cpu) == -1)
+		perror ("setaffinity failed");
+
+	for(int i = 0; i < iterations; i++) {
+		int64_t clock_before = rdtsc_s();
+		void* p = malloc(size);
+		int64_t clock_after = rdtsc_e();
+
+		// measure potentially cached allocations
+		if (mode)
+			free(p);
+
+		clocks[i] = clock_after - clock_before;
+	}
+
+	/* Results are printed only after all measurements have been taken */
+	for(int i = 0; i < iterations; i++) {
+		printf("malloc(%d): %" PRId64 " cycles\n", size, clocks[i]);
+	}
+
+	free(clocks);
+	return NULL;
+}
+
+/* Usage: rdtsc [fresh|cached] [iterations] [size] [num threads] */
+int main(int argc, char* argv[]) {
+	int num_threads = 1;
+
+	num_cpus = get_nprocs();
+
+	if (argc > 5) {
+		fprintf(stderr, "Usage: %s <fresh|cached> <iterations> <size> <num threads>\n", argv[0]);
+		return 1;
+	}
+
+	/* strncmp() returns 0 on a match */
+	if (argc > 1 && strncmp(argv[1], "cached", strlen("cached")) == 0) mode = 1;
+	if (argc > 2) iterations = atoi(argv[2]);
+	if (argc > 3) size = atoi(argv[3]);
+	if (argc > 4) num_threads = atoi(argv[4]);
+
+	fprintf(stderr, "iterations = %d; size = %d; threads = %d\n", iterations, size, num_threads);
+
+	pthread_t* threads = malloc(num_threads * sizeof(pthread_t));
+	if (num_threads > 0 && !threads)
+		abort();
+
+	for (int i = 0; i < num_threads; i++) {
+		/* pthread functions return the error number instead of setting errno */
+		int err = pthread_create(&threads[i], NULL, test_thread_func, (void*)(intptr_t) i);
+		if (err != 0) {
+			fprintf(stderr, "pthread_create: %s\n", strerror(err));
+			return 1;
+		}
+	}
+
+	for(int i = 0; i < num_threads; i++) {
+		int err = pthread_join(threads[i], NULL);
+		if (err != 0) {
+			fprintf(stderr, "pthread_join: %s\n", strerror(err));
+			return 1;
+		}
+	}
+
+	free(threads);
+	return 0;
+}
