1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
|
///-*-C++-*-//////////////////////////////////////////////////////////////////
//
// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
// for Shared-Memory Multiprocessors
// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
//
// This library is free software; you can redistribute it and/or modify
// it under the terms of the GNU Library General Public License as
// published by the Free Software Foundation, http://www.fsf.org.
//
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Library General Public License for more details.
//
//////////////////////////////////////////////////////////////////////////////
/**
* @file cache-scratch.cpp
*
* cache-scratch is a benchmark that exercises a heap's cache locality.
* An allocator that allows multiple threads to re-use the same small
* object (possibly all in one cache-line) will scale poorly, while
* an allocator like Hoard will exhibit near-linear scaling.
*
* Usage: cache-scratch nthreads iterations objSize repetitions
*
* Try the following (on a P-processor machine):
*
* cache-scratch 1 1000 1 1000000
* cache-scratch P 1000 1 1000000
*
* cache-scratch-hoard 1 1000 1 1000000
* cache-scratch-hoard P 1000 1 1000000
*
* The ideal is a P-fold speedup.
*/
#include <stdio.h>
#include <stdlib.h>
#include "fred.h"
#include "cpuinfo.h"
#include "timer.h"
// This class just holds arguments to each thread.
class workerArg {
public:
workerArg() {}
workerArg (char * obj, int objSize, int repetitions, int iterations)
: _object (obj),
_objSize (objSize),
_iterations (iterations),
_repetitions (repetitions)
{}
char * _object;
int _objSize;
int _iterations;
int _repetitions;
};
#if defined(_WIN32)
extern "C" void worker (void * arg)
#else
extern "C" void * worker (void * arg)
#endif
{
// free the object we were given.
// Then, repeatedly do the following:
// malloc a given-sized object,
// repeatedly write on it,
// then free it.
workerArg * w = (workerArg *) arg;
delete w->_object;
workerArg w1 = *w;
for (int i = 0; i < w1._iterations; i++) {
// Allocate the object.
char * obj = new char[w1._objSize];
// Write into it a bunch of times.
for (int j = 0; j < w1._repetitions; j++) {
for (int k = 0; k < w1._objSize; k++) {
obj[k] = (char) k;
volatile char ch = obj[k];
ch++;
}
}
// Free the object.
delete [] obj;
}
#if !defined(_WIN32)
return NULL;
#endif
}
int main (int argc, char * argv[])
{
int nthreads;
int iterations;
int objSize;
int repetitions;
if (argc > 4) {
nthreads = atoi(argv[1]);
iterations = atoi(argv[2]);
objSize = atoi(argv[3]);
repetitions = atoi(argv[4]);
} else {
fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]);
return 1;
}
HL::Fred * threads = new HL::Fred[nthreads];
HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
workerArg * w = new workerArg[nthreads];
int i;
// Allocate nthreads objects and distribute them among the threads.
char ** objs = new char * [nthreads];
for (i = 0; i < nthreads; i++) {
objs[i] = new char[objSize];
}
HL::Timer t;
t.start();
for (i = 0; i < nthreads; i++) {
w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations);
threads[i].create (&worker, (void *) &w[i]);
}
for (i = 0; i < nthreads; i++) {
threads[i].join();
}
t.stop();
delete [] threads;
delete [] objs;
delete [] w;
printf ("Time elapsed = %f seconds.\n", (double) t);
return 0;
}
|