/*
 * path: root/src/benchmarks/rdtsc/rdtsc.c
 * blob: fec84d6cbe2b0d5a033c3d1efa6ec5af1178f5ef
 */
#define _GNU_SOURCE             /* See feature_test_macros(7) */
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysinfo.h>

/* Non-zero: each allocation is freed right after it has been timed.
 * Set to 1 by main() when the first argument begins with "cached". */
int mode = 0;
/* Byte count passed to every timed malloc() call. */
int size = 64;
/* Number of timed malloc() calls performed by each worker thread. */
int iterations = 100000;
/* Processor count as reported by get_nprocs(); assigned in main(). */
int num_cpus;

/*
 * Read the time-stamp counter at the START of a measured region.
 *
 * CPUID is issued first; it is a serializing instruction, so instructions
 * preceding the measurement have retired before RDTSC samples the counter.
 *
 * Returns the counter value assembled from EDX:EAX.  x86-64 only.
 */
static inline int64_t rdtsc_s(void)
{
	uint32_t lo, hi;

	/* __asm__/__volatile__ are accepted in strict ISO modes (-std=c11),
	 * unlike the bare GNU "asm" keyword. */
	__asm__ __volatile__("cpuid" : : : "%rax", "%rbx", "%rcx", "%rdx");
	/* "memory" keeps the compiler from moving memory accesses of the
	 * measured code above the timestamp read. */
	__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi) : : "memory");

	/* Widen with a fixed 64-bit type so the shift is well defined
	 * regardless of the width of unsigned long. */
	return (int64_t)(((uint64_t)hi << 32) | lo);
}

/*
 * Read the time-stamp counter at the END of a measured region.
 *
 * RDTSCP waits for preceding instructions to execute before sampling the
 * counter; the trailing CPUID keeps later instructions from starting
 * before the read has completed.
 *
 * Returns the counter value assembled from EDX:EAX.  x86-64 only.
 */
static inline int64_t rdtsc_e(void)
{
	uint32_t lo, hi;

	/* RDTSCP also stores IA32_TSC_AUX in ECX, so RCX must be declared as
	 * clobbered; otherwise the compiler may keep a live value there.
	 * "memory" keeps memory accesses of the measured code from being
	 * moved below the timestamp read. */
	__asm__ __volatile__("rdtscp" : "=a" (lo), "=d" (hi) : : "%rcx", "memory");
	__asm__ __volatile__("cpuid" : : : "%rax", "%rbx", "%rcx", "%rdx");

	/* Widen with a fixed 64-bit type so the shift is well defined
	 * regardless of the width of unsigned long. */
	return (int64_t)(((uint64_t)hi << 32) | lo);
}

/*
 * Worker thread: times `iterations` calls to malloc(size) with the TSC and
 * prints one line per call to stdout.
 *
 * arg     - thread index smuggled through the void* (cast via intptr_t);
 *           used only to choose the CPU the thread is pinned to.
 * returns - always NULL.
 *
 * Aborts the process if the sample buffer cannot be allocated.
 */
static void* test_thread_func(void* arg) {
	int64_t clock_before, clock_after;
	void* p;

	/* Nothing to measure; also avoids converting a negative count into a
	 * huge size_t below. */
	if (iterations <= 0)
		return NULL;

	int64_t* clocks = malloc((size_t)iterations * sizeof *clocks);
	if (!clocks)
		abort();

	// set cpu affinity to prevent cpu switching
	intptr_t tid = (intptr_t) arg;
	cpu_set_t my_cpu;
	/* Skip CPU0 - let the OS run on that one.  With a single CPU there is
	 * nothing to skip to, and the modulo would divide by zero. */
	int my_cpu_num = (num_cpus > 1) ? (int)(tid % (num_cpus - 1)) + 1 : 0;

	CPU_ZERO (&my_cpu);
	CPU_SET (my_cpu_num, &my_cpu);
	/* Pinning is best effort: report the failure and measure anyway. */
	if (sched_setaffinity (0, sizeof(my_cpu), &my_cpu) == -1)
	  perror ("setaffinity failed");

	for(int i = 0; i < iterations; i++) {
		clock_before = rdtsc_s();
		p = malloc(size);
		clock_after = rdtsc_e();

		// measure potentially cached allocations
		// (when mode is 0 the block is not released by this function)
		if (mode)
			free(p);

		clocks[i] = clock_after - clock_before;
	}

	/* Results are printed only after all measurements are taken, so the
	 * printf calls are not interleaved with the timed mallocs. */
	for(int i = 0; i < iterations; i++) {
		/* PRId64 matches int64_t; "%d" was undefined behaviour here. */
		printf("malloc(%d): %" PRId64 " cycles\n", size, clocks[i]);
	}

	free(clocks);
	return NULL;
}

/*
 * Parse a base-10 integer no smaller than `min` from `text`.
 * Stores the value in *out and returns 0 on success; returns -1 (leaving
 * *out untouched) on empty input, trailing garbage or out-of-range values.
 */
static int parse_int_at_least(const char* text, int min, int* out) {
	char* end;
	long value;

	errno = 0;
	value = strtol(text, &end, 10);
	if (end == text || *end != '\0' || errno == ERANGE)
		return -1;
	if (value < min || value > INT_MAX)
		return -1;

	*out = (int) value;
	return 0;
}

/*
 * Entry point.
 *
 * Arguments (all optional, positional):
 *   argv[1]  mode        - a value beginning with "cached" frees each block
 *                          right after timing it; anything else does not
 *   argv[2]  iterations  - timed mallocs per thread (>= 1)
 *   argv[3]  size        - bytes per malloc (>= 0)
 *   argv[4]  num threads - worker threads to start (>= 1)
 *
 * Returns 0 on success, 1 on a usage error or a thread/allocation failure.
 */
int main(int argc, char* argv[]) {
	pthread_t* threads;
	int num_threads = 1;
	int rc;

	num_cpus = get_nprocs();

	if (argc > 5) {
		fprintf(stderr, "Usage: %s [cached|uncached] [<iterations>] [<size>] [<num threads>]\n", argv[0]);
		return 1;
	}

	if (argc > 1) {
		if (strncmp(argv[1], "cached", strlen("cached")) == 0) mode = 1;
	}
	if (argc > 2 && parse_int_at_least(argv[2], 1, &iterations) != 0) {
		fprintf(stderr, "invalid iterations: %s\n", argv[2]);
		return 1;
	}
	if (argc > 3 && parse_int_at_least(argv[3], 0, &size) != 0) {
		fprintf(stderr, "invalid size: %s\n", argv[3]);
		return 1;
	}
	if (argc > 4 && parse_int_at_least(argv[4], 1, &num_threads) != 0) {
		fprintf(stderr, "invalid number of threads: %s\n", argv[4]);
		return 1;
	}

	fprintf(stderr, "iterations = %d; size = %d; threads = %d\n", iterations, size, num_threads);

	threads = malloc((size_t) num_threads * sizeof *threads);
	if (!threads) {
		perror("malloc");
		return 1;
	}

	for (int i = 0; i < num_threads; i++) {
		/* Hand each worker its index so the workers spread over the
		 * available CPUs instead of all computing the same CPU number. */
		rc = pthread_create(&threads[i], NULL, test_thread_func, (void*)(intptr_t) i);
		if (rc != 0) {
			/* pthread functions return the error number; errno is not set. */
			fprintf(stderr, "pthread_create: %s\n", strerror(rc));
			return 1;
		}
	}

	for(int i = 0; i < num_threads; i++) {
		rc = pthread_join(threads[i], NULL);
		if (rc != 0) {
			fprintf(stderr, "pthread_join: %s\n", strerror(rc));
			return 1;
		}
	}

	free(threads);
	return 0;
}