diff options
| author | Florian Fischer <florian.fl.fischer@fau.de> | 2018-07-24 11:17:48 +0200 |
|---|---|---|
| committer | Florian Fischer <florian.fl.fischer@fau.de> | 2018-07-24 11:17:48 +0200 |
| commit | 8f6e9a172923b67ccffaf9fd519642ae242db868 (patch) | |
| tree | b38624306297c73011756dbd2bcbbcf9ea6c1f37 | |
| parent | fce50c833496d8c07a1d189807d81be15875431c (diff) | |
| download | allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.tar.gz allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.zip | |
add falsesharing benchmarks from berger
| -rw-r--r-- | benchmarks/cache-scratch.cc | 147 | ||||
| -rw-r--r-- | benchmarks/cache-thrash.cc | 134 | ||||
| -rw-r--r-- | benchmarks/cpuinfo.h | 202 | ||||
| -rw-r--r-- | benchmarks/fred.h | 97 | ||||
| -rw-r--r-- | benchmarks/timer.h | 372 | ||||
| -rw-r--r-- | falsesharing.py | 124 |
6 files changed, 1076 insertions, 0 deletions
diff --git a/benchmarks/cache-scratch.cc b/benchmarks/cache-scratch.cc new file mode 100644 index 0000000..2cb9b28 --- /dev/null +++ b/benchmarks/cache-scratch.cc @@ -0,0 +1,147 @@ +///-*-C++-*-////////////////////////////////////////////////////////////////// +// +// Hoard: A Fast, Scalable, and Memory-Efficient Allocator +// for Shared-Memory Multiprocessors +// Contact author: Emery Berger, http://www.cs.umass.edu/~emery +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Library General Public License as +// published by the Free Software Foundation, http://www.fsf.org. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +////////////////////////////////////////////////////////////////////////////// + +/** + * @file cache-scratch.cpp + * + * cache-scratch is a benchmark that exercises a heap's cache locality. + * An allocator that allows multiple threads to re-use the same small + * object (possibly all in one cache-line) will scale poorly, while + * an allocator like Hoard will exhibit near-linear scaling. + * + * Try the following (on a P-processor machine): + * + * cache-scratch 1 1000 1 1000000 + * cache-scratch P 1000 1 1000000 + * + * cache-scratch-hoard 1 1000 1 1000000 + * cache-scratch-hoard P 1000 1 1000000 + * + * The ideal is a P-fold speedup. +*/ + +#include <stdio.h> +#include <stdlib.h> + +#include "fred.h" +#include "cpuinfo.h" +#include "timer.h" + +// This class just holds arguments to each thread. 
+class workerArg { +public: + + workerArg() {} + + workerArg (char * obj, int objSize, int repetitions, int iterations) + : _object (obj), + _objSize (objSize), + _iterations (iterations), + _repetitions (repetitions) + {} + + char * _object; + int _objSize; + int _iterations; + int _repetitions; +}; + + +#if defined(_WIN32) +extern "C" void worker (void * arg) +#else +extern "C" void * worker (void * arg) +#endif +{ + // free the object we were given. + // Then, repeatedly do the following: + // malloc a given-sized object, + // repeatedly write on it, + // then free it. + workerArg * w = (workerArg *) arg; + delete w->_object; + workerArg w1 = *w; + for (int i = 0; i < w1._iterations; i++) { + // Allocate the object. + char * obj = new char[w1._objSize]; + // Write into it a bunch of times. + for (int j = 0; j < w1._repetitions; j++) { + for (int k = 0; k < w1._objSize; k++) { + obj[k] = (char) k; + volatile char ch = obj[k]; + ch++; + } + } + // Free the object. + delete [] obj; + } + +#if !defined(_WIN32) + return NULL; +#endif +} + + +int main (int argc, char * argv[]) +{ + int nthreads; + int iterations; + int objSize; + int repetitions; + + if (argc > 4) { + nthreads = atoi(argv[1]); + iterations = atoi(argv[2]); + objSize = atoi(argv[3]); + repetitions = atoi(argv[4]); + } else { + fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]); + return 1; + } + + HL::Fred * threads = new HL::Fred[nthreads]; + HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors()); + + workerArg * w = new workerArg[nthreads]; + + int i; + + // Allocate nthreads objects and distribute them among the threads. 
+ char ** objs = new char * [nthreads]; + for (i = 0; i < nthreads; i++) { + objs[i] = new char[objSize]; + } + + HL::Timer t; + t.start(); + + for (i = 0; i < nthreads; i++) { + w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations); + threads[i].create (&worker, (void *) &w[i]); + } + for (i = 0; i < nthreads; i++) { + threads[i].join(); + } + t.stop(); + + delete [] threads; + delete [] objs; + delete [] w; + + printf ("Time elapsed = %f seconds.\n", (double) t); + return 0; +} diff --git a/benchmarks/cache-thrash.cc b/benchmarks/cache-thrash.cc new file mode 100644 index 0000000..79242eb --- /dev/null +++ b/benchmarks/cache-thrash.cc @@ -0,0 +1,134 @@ +///-*-C++-*-////////////////////////////////////////////////////////////////// +// +// Hoard: A Fast, Scalable, and Memory-Efficient Allocator +// for Shared-Memory Multiprocessors +// Contact author: Emery Berger, http://www.cs.umass.edu/~emery +// +// Copyright (c) 1998-2003, The University of Texas at Austin. +// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Library General Public License as +// published by the Free Software Foundation, http://www.fsf.org. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +////////////////////////////////////////////////////////////////////////////// + +/** + * @file cache-thrash.cpp + * @brief cache-thrash is a benchmark that exercises a heap's cache-locality. + * + * Try the following (on a P-processor machine): + * + * cache-thrash 1 1000 1 1000000 + * cache-thrash P 1000 1 1000000 + * + * cache-thrash-hoard 1 1000 1 1000000 + * cache-thrash-hoard P 1000 1 1000000 + * + * The ideal is a P-fold speedup. 
+*/
+
+
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+#include "cpuinfo.h"
+#include "fred.h"
+#include "timer.h"
+
+/* This class just holds arguments to each thread.
+ * Unlike the cache-scratch variant there is no pre-allocated object here:
+ * every object a worker touches is allocated by that worker itself.
+ */
+class workerArg {
+public:
+  workerArg() {}
+  workerArg (int objSize, int repetitions, int iterations)
+    : _objSize (objSize),
+      _iterations (iterations),
+      _repetitions (repetitions)
+  {}
+
+  int _objSize;      // size in bytes of every object the worker allocates.
+  int _iterations;   // number of allocate/write/free rounds.
+  int _repetitions;  // number of write passes over each allocated object.
+};
+
+/**
+ * Thread entry point.
+ *
+ * @param arg  pointer to the workerArg describing this thread's work; it is
+ *             copied into a local before the loop starts.
+ * @return NULL (nothing is returned on Windows, where the function is void).
+ */
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+  // Repeatedly do the following:
+  //   malloc a given-sized object,
+  //   repeatedly write on it,
+  //   then free it.
+  workerArg * w = (workerArg *) arg;
+  workerArg w1 = *w;
+  for (int i = 0; i < w1._iterations; i++) {
+    // Allocate the object.
+    char * obj = new char[w1._objSize];
+    // printf ("obj = %p\n", obj);
+    // Write into it a bunch of times.
+    for (int j = 0; j < w1._repetitions; j++) {
+      for (int k = 0; k < w1._objSize; k++) {
+        obj[k] = (char) k;
+        volatile char ch = obj[k];  // volatile read-back; presumably keeps the accesses from being optimized away
+        ch++;
+      }
+    }
+    // Free the object.
+    delete [] obj;
+  }
+#if !defined(_WIN32)
+  return NULL;
+#endif
+}
+
+/**
+ * Usage: cache-thrash nthreads iterations objSize repetitions
+ *
+ * Starts nthreads workers, waits for all of them, and prints the elapsed
+ * time. Exits with status 1 when fewer than four arguments are given.
+ */
+int main (int argc, char * argv[])
+{
+  int nthreads;
+  int iterations;
+  int objSize;
+  int repetitions;
+
+  if (argc > 4) {
+    nthreads = atoi(argv[1]);
+    iterations = atoi(argv[2]);
+    objSize = atoi(argv[3]);
+    repetitions = atoi(argv[4]);
+  } else {
+    cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl;
+    exit(1);
+  }
+
+  HL::Fred * threads = new HL::Fred[nthreads];
+  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+  int i;
+
+  HL::Timer t;
+  t.start();  // note: the timed region includes the workerArg array allocation below
+
+  workerArg * w = new workerArg[nthreads];
+
+  for (i = 0; i < nthreads; i++) {
+    // The total repetition count is split evenly (integer division) across threads.
+    w[i] = workerArg (objSize, repetitions / nthreads, iterations);
+    threads[i].create (&worker, (void *) &w[i]);
+  }
+  for (i = 0; i < nthreads; i++) {
+    threads[i].join();
+  }
+  t.stop();
+
+  delete [] threads;
+  delete [] w;
+
+  cout << "Time elapsed = " << (double) t << " seconds." << endl;
+}
diff --git a/benchmarks/cpuinfo.h b/benchmarks/cpuinfo.h
new file mode 100644
index 0000000..1ed1f36
--- /dev/null
+++ b/benchmarks/cpuinfo.h
@@ -0,0 +1,202 @@
+// -*- C++ -*-
+
+/*
+
+ Heap Layers: An Extensible Memory Allocation Infrastructure
+
+ Copyright (C) 2000-2003 by Emery Berger
+ http://www.cs.umass.edu/~emery
+ emery@cs.umass.edu
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+*/
+
+
+
+#ifndef HL_CPUINFO_H
+#define HL_CPUINFO_H
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#endif
+
+#if defined(__SVR4) // Solaris
+#include <sys/lwp.h>
+extern "C" unsigned int lwp_self(void);
+#include <thread.h>
+extern "C" int _thr_self(void);
+#endif
+
+#if defined(__linux)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(__sgi)
+#include <sys/types.h>
+#include <sys/sysmp.h>
+#include <sys/sysinfo.h>
+#endif
+
+#if defined(hpux)
+#include <sys/mpctl.h>
+#endif
+
+#if defined(_WIN32)
+extern __declspec(thread) int localThreadId;
+#endif
+
+#if defined(__SVR4) && defined(MAP_ALIGN)
+extern volatile int anyThreadStackCreated;
+#endif
+
+namespace HL {
+
+/**
+ * @class CPUInfo
+ * @author Emery Berger <http://www.cs.umass.edu/~emery>
+ *
+ * @brief Architecture-independent wrapper to get number of CPUs.
+ */
+
+class CPUInfo {
+public:
+  CPUInfo() {}
+
+  /// Processor count, computed on the first call and reused afterwards.
+  static inline int getNumProcessors() {
+    static int cachedCount = computeNumProcessors();
+    return cachedCount;
+  }
+
+  /// Asks the platform for the processor count (defined after the class).
+  static inline int computeNumProcessors();
+
+  /// Platform-specific identifier of the calling thread (defined after the class).
+  static inline unsigned long getThreadId();
+
+};
+
+
+/**
+ * @brief Asks the platform how many processors are available.
+ *
+ * The result is kept in a function-local static, so the platform is queried
+ * again only while no valid count has been stored.
+ *
+ * @return the processor count, always >= 1. If the platform is unsupported
+ *         or the query fails (e.g. sysconf returning -1), 2 is returned.
+ */
+int CPUInfo::computeNumProcessors (void)
+{
+ static int np = 0;
+ if (np > 0) {
+ return np;
+ }
+#if defined(__linux) || defined(__APPLE__)
+ np = (int) sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_WIN32)
+ SYSTEM_INFO infoReturn[1];
+ GetSystemInfo (infoReturn);
+ np = (int) (infoReturn->dwNumberOfProcessors);
+#elif defined(__sgi)
+ np = (int) sysmp(MP_NAPROCS);
+#elif defined(hpux)
+ np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
+#elif defined(_SC_NPROCESSORS_ONLN)
+ np = (int) (sysconf(_SC_NPROCESSORS_ONLN));
+#else
+ np = 2;
+ // Unsupported platform.
+ // Pretend we have at least two processors. This approach avoids the risk of assuming
+ // we're on a uniprocessor, which might lead clever allocators to avoid using atomic
+ // operations for all locks.
+#endif
+ if (np < 1) {
+ // The query failed or returned nonsense. Fall back to the same value as
+ // the unsupported-platform case instead of caching an error code as the
+ // processor count.
+ np = 2;
+ }
+ return np;
+}
+
+// Note: when the stacksize argument of pthread_attr_setstacksize is NULL [Solaris],
+// the stack size is 1 MB on a 32-bit architecture and 2 MB on a 64-bit architecture.
+// pthread_attr_getstacksize
+// pthread_attr_setstackaddr
+// pthread_attr_getstackaddr
+// PTHREAD_STACK_SIZE is the minimum.
+// Or should we just assume we have __declspec(thread) or __thread?
+
+#if defined(USE_THREAD_KEYWORD)
+ extern __thread int localThreadId;
+#endif
+
+ // FIX ME FIXME
+ //#include <stdio.h>
+/**
+ * @brief Returns an identifier for the calling thread.
+ *
+ * Exactly one of the preprocessor branches below is compiled in; each one
+ * derives the identifier from a different platform facility. When no branch
+ * matches, 0 is returned for every thread.
+ *
+ * @return the platform-specific thread identifier as an unsigned long.
+ */
+unsigned long CPUInfo::getThreadId (void) {
+#if defined(__SVR4)
+ size_t THREAD_STACK_SIZE;
+ if (sizeof(size_t) <= 4) {
+ THREAD_STACK_SIZE = 1048576;
+ } else {
+ // 64-bits.
+ THREAD_STACK_SIZE = 1048576 * 2;
+ }
+ if (0) { // !anyThreadStackCreated) {
+ // (This branch is disabled by the constant 0 condition above.)
+ // We know a priori that all stack variables
+ // are on different stacks. Since no one has created
+ // a special one, we are in control, and thus all stacks
+ // are 1 MB in size and on 1 MB boundaries.
+ // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.)
+ char buf;
+ return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20;
+ } else {
+ return (int) pthread_self(); // NOTE(review): narrowed to int before being widened to the unsigned long return type
+ }
+#elif defined(_WIN32)
+ // It looks like thread id's are always multiples of 4, so...
+ return GetCurrentThreadId() >> 2;
+#elif defined(__APPLE__)
+ // Consecutive thread id's in Mac OS are 4096 apart;
+ // dividing off the 4096 gives us an appropriate thread id.
+ int tid = (int) ((unsigned long) pthread_self()) >> 12; // NOTE(review): the (int) cast binds before the shift, so the value is narrowed first - confirm this is intended
+ return tid;
+#elif defined(__BEOS__)
+ return find_thread(0);
+#elif defined(USE_THREAD_KEYWORD)
+ return localThreadId;
+#elif defined(__linux) || defined(PTHREAD_KEYS_MAX)
+ // Consecutive thread id's in Linux are 1024 apart;
+ // dividing off the 1024 gives us an appropriate thread id.
+ return (unsigned long) pthread_self() >> 10;
+#elif defined(POSIX)
+ return (unsigned long) pthread_self();
+#elif USE_SPROC // tested by value here, not with defined() like the other branches
+ // This hairiness has the same effect as calling getpid(),
+ // but it's MUCH faster since it avoids making a system call
+ // and just accesses the sproc-local data directly.
+ unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid;
+ return pid;
+#else
+ return 0;
+#endif
+}
+
+}
+
+#endif
diff --git a/benchmarks/fred.h b/benchmarks/fred.h
new file mode 100644
index 0000000..b0198a7
--- /dev/null
+++ b/benchmarks/fred.h
@@ -0,0 +1,97 @@
+// -*- C++ -*-
+
+#ifndef HL_FRED_H
+#define HL_FRED_H
+
+/// A thread-wrapper of childlike simplicity :).
+
+#if defined(_WIN32)
+
+  #include <windows.h>
+  #include <process.h>
+
+#elif defined(__SVR4)
+
+  #include <thread.h>
+  #include <pthread.h>
+  #include <unistd.h>
+
+#else
+
+  #include <pthread.h>
+  #include <unistd.h>
+
+#endif
+
+typedef void * (*ThreadFunctionType) (void *);  // signature every thread entry point must have
+
+namespace HL {
+/**
+ * @class Fred
+ * @brief Wraps one native thread: a HANDLE from CreateThread on Windows,
+ *        a pthread_t (plus its attribute object) everywhere else.
+ */
+class Fred {
+public:
+  /**
+   * Initializes the pthread attribute object and requests
+   * PTHREAD_SCOPE_SYSTEM for it. Does nothing on Windows.
+   */
+  Fred() {
+#if !defined(_WIN32)
+    pthread_attr_init (&attr);
+    pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
+#endif
+  }
+  /** Destroys the pthread attribute object. Does nothing on Windows. */
+  ~Fred() {
+#if !defined(_WIN32)
+    pthread_attr_destroy (&attr);
+#endif
+  }
+  /**
+   * Starts a new thread running function(arg).
+   *
+   * @param function  entry point of the new thread.
+   * @param arg       opaque pointer passed through to the entry point.
+   *
+   * NOTE(review): the result of CreateThread / pthread_create is not checked,
+   * so a failed thread creation goes unnoticed here.
+   */
+  void create (ThreadFunctionType function, void * arg) {
+#if defined(_WIN32)
+    t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0);
+#else
+    pthread_create (&t, &attr, function, arg);
+#endif
+  }
+  /** Blocks until the thread started by create() has finished. */
+  void join (void) {
+#if defined(_WIN32)
+    WaitForSingleObject (t, INFINITE);
+#else
+    pthread_join (t, NULL);  // the thread's return value is discarded
+#endif
+  }
+  /** Gives up the processor so that another thread may run. */
+  static void yield (void) {
+#if defined(_WIN32)
+    Sleep (0);
+#elif defined(__SVR4)
+    thr_yield();
+#else
+    sched_yield();
+#endif
+  }
+
+  /**
+   * Passes the desired concurrency level n to the thread library.
+   * Compiles to an empty function on Windows.
+   */
+  static void setConcurrency (int n) {
+#if defined(_WIN32)
+#elif defined(__SVR4)
+    thr_setconcurrency (n);
+#else
+    pthread_setconcurrency (n);
+#endif
+  }
+
+
+private:
+#if defined(_WIN32)
+  typedef HANDLE FredType;
+#else
+  typedef pthread_t FredType;
+  pthread_attr_t attr;  // attributes used when the thread is created
+#endif
+
+  FredType t;  // native thread handle; only assigned by create()
+};
+
+}
+
+
+#endif
diff --git a/benchmarks/timer.h b/benchmarks/timer.h
new file mode 100644
index 0000000..d4d42c7
--- /dev/null
+++ b/benchmarks/timer.h
@@ -0,0 +1,372 @@
+/* -*- C++ -*- */
+
+/*
+
+  Heap Layers: An Extensible Memory Allocation Infrastructure
+
+  Copyright (C) 2000-2003 by Emery Berger
+  http://www.cs.umass.edu/~emery
+  emery@cs.umass.edu
+
+  This program is free software; you can
redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + +#include <cassert> +#include <stdio.h> + + +#ifndef _TIMER_H_ +#define _TIMER_H_ + +/** + * @class Timer + * @brief A portable class for high-resolution timing. + * + * This class simplifies timing measurements across a number of platforms. + * + * @code + * Timer t; + * t.start(); + * // do some work + * t.stop(); + * cout << "That took " << (double) t << " seconds." << endl; + * @endcode + * + */ + +#ifdef __APPLE__ +#include <sys/time.h> +#endif + +#if defined(__linux__) && defined(__GNUG__) && defined(__i386__) + +#include <stdio.h> +#include <limits.h> +#include <time.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> + +static void getTime (unsigned long& tlo, unsigned long& thi) { + asm volatile ("rdtsc" + : "=a"(tlo), + "=d" (thi)); +} + + +static double getFrequency (void) { + static double freq = 0.0; + static bool initialized = false; + unsigned long LTime0, LTime1, HTime0, HTime1; + if (!initialized) { + + freq = 2600000.0; + +#if 0 + // Compute MHz directly. + // Wait for approximately one second. 
+ + getTime (LTime0, HTime0); + // printf ("waiting...\n"); + struct timespec rqtp, rmtp; + rqtp.tv_sec = 1; + rqtp.tv_nsec = 0; + nanosleep (&rqtp, &rmtp); + // printf ("done.\n"); + getTime (LTime1, HTime1); + + freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0); + if (LTime1 < LTime0) { + freq -= (double)UINT_MAX; + } +#endif + initialized = true; + + } else { + // printf ("wha?\n"); + } + return freq; +} + + +namespace HL { + +class Timer { +public: + Timer (void) + : timeElapsed (0.0) + { + _frequency = getFrequency(); + // printf ("wooo!\n"); + // printf ("freq = %lf\n", frequency); + } + void start (void) { + getTime (currentLo, currentHi); + } + void stop (void) { + unsigned long lo, hi; + getTime (lo, hi); + double now = (double) hi * 4294967296.0 + lo; + double prev = (double) currentHi * 4294967296.0 + currentLo; + timeElapsed = (now - prev) / _frequency; + } + + operator double (void) { + return timeElapsed; + } + +private: + double timeElapsed; + unsigned long currentLo, currentHi; + double _frequency; +}; + +}; + +#else + + +#ifdef __SVR4 // Solaris +#include <sys/time.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/procfs.h> +#include <stdio.h> +#endif // __SVR4 + +#include <time.h> + +#if defined(unix) || defined(__linux) +#include <sys/time.h> +#include <unistd.h> +#endif + + +#ifdef __sgi +#include <sys/types.h> +#include <sys/times.h> +#include <limits.h> +#endif + + +#if defined(_WIN32) +#include <windows.h> +#endif + + +#if defined(__BEOS__) +#include <OS.h> +#endif + + +namespace HL { + +class Timer { + +public: + + /// Initializes the timer. + Timer (void) +#if !defined(_WIN32) + : _starttime (0), + _elapsedtime (0) +#endif + { + } + + /// Start the timer. + void start (void) { _starttime = _time(); } + + /// Stop the timer. + void stop (void) { _elapsedtime += _time() - _starttime; } + + /// Reset the timer. + void reset (void) { _starttime = _elapsedtime; } + +#if 0 + // Set the timer. 
+ void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);} +#endif + + /// Return the number of seconds elapsed. + operator double (void) { return _timetosec (_elapsedtime); } + + static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); } + + +private: + + // The _timer variable will be different depending on the OS. + // We try to use the best timer available. + +#ifdef __sgi +#define TIMER_FOUND + + long _starttime, _elapsedtime; + + long _time (void) { + struct tms t; + long ticks = times (&t); + return ticks; + } + + static double _timetosec (long t) { + return ((double) (t) / CLK_TCK); + } + + static long _sectotime (double sec) { + return (long) sec * CLK_TCK; + } +#endif + +#ifdef __SVR4 // Solaris +#define TIMER_FOUND + typedef hrtime_t TimeType; + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + return gethrtime(); + } + + static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); } + + static double _timetosec (TimeType& t) { + return ((double) (t) / 1.0e9); + } +#endif // __SVR4 + +#if defined(MAC) || defined(macintosh) +#define TIMER_FOUND + double _starttime, _elapsedtime; + + double _time (void) { + return get_Mac_microseconds(); + } + + double _timetosec (hrtime_t& t) { + return t; + } +#endif // MAC + +#ifdef _WIN32 +#define TIMER_FOUND + +#ifndef __GNUC__ + class TimeType { + public: + TimeType (void) + { + largeInt.QuadPart = 0; + } + operator double& (void) { return (double&) largeInt.QuadPart; } + operator LARGE_INTEGER& (void) { return largeInt; } + double timeToSec (void) { + return (double) largeInt.QuadPart / getFreq(); + } + private: + double getFreq (void) { + QueryPerformanceFrequency (&freq); + return (double) freq.QuadPart; + } + + LARGE_INTEGER largeInt; + LARGE_INTEGER freq; + }; + + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + TimeType t; + int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t)); + assert (r); + return t; + 
} + + static double _timetosec (TimeType& t) { + return t.timeToSec(); + } +#else + typedef DWORD TimeType; + DWORD _starttime, _elapsedtime; + static DWORD _time (void) { + return GetTickCount(); + } + + static double _timetosec (DWORD& t) { + return (double) t / 100000.0; + } + static unsigned long _sectotime (double sec) { + return (unsigned long)(sec); + } +#endif +#endif // _WIN32 + + +#ifdef __BEOS__ +#define TIMER_FOUND + bigtime_t _starttime, _elapsedtime; + bigtime_t _time(void) { + return system_time(); + } + double _timetosec (bigtime_t& t) { + return (double) t / 1000000.0; + } + + bigtime_t _sectotime (double sec) { + return (bigtime_t)(sec * 1000000.0); + } +#endif // __BEOS__ + +#ifndef TIMER_FOUND + + typedef long TimeType; + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + struct timeval t; + gettimeofday (&t, NULL); + return t.tv_sec * 1000000 + t.tv_usec; + } + + static double _timetosec (TimeType t) { + return ((double) (t) / 1000000.0); + } + + static TimeType _sectotime (double sec) { + return (TimeType) (sec * 1000000.0); + } + +#endif // TIMER_FOUND + +#undef TIMER_FOUND + +}; + + +#ifdef __SVR4 // Solaris +class VirtualTimer : public Timer { +public: + hrtime_t _time (void) { + return gethrvtime(); + } +}; +#endif + +} + +#endif + +#endif diff --git a/falsesharing.py b/falsesharing.py new file mode 100644 index 0000000..627acc0 --- /dev/null +++ b/falsesharing.py @@ -0,0 +1,124 @@ +import csv +import pickle +import matplotlib.pyplot as plt +import multiprocessing +import numpy as np +import os +import re +import subprocess + +from benchmark import Benchmark +from common_targets import common_targets + +cmd = ("perf stat -x\; -e cpu-clock:k,cache-references,cache-misses,cycles," + "instructions,branches,faults,migrations " + "build/cache-{}{} {} 100 8 1000000") + +class Benchmark_Falsesharing( Benchmark ): + def __init__(self): + self.name = "falsesharing" + self.descrition = """This benchmarks makes small allocations 
and writes + to them multiple times. If the allocated objects are + on the same cache line the writes will be expensive because + of cache trashing.""", + self.targets = common_targets + self.nthreads = range(1, multiprocessing.cpu_count() * 2 + 1) + + self.results = {"args" : {"nthreads" : self.nthreads}, + "targets" : self.targets, + "thrash": {}, + "scratch": {}} + + def prepare(self, verbose=False): + req = ["build/cache-thrash", "build/cache-scratch"] + for r in req: + if not os.path.isfile(r): + print(r, "not found") + return False + if not os.access(r, os.X_OK): + print(r, "not executable") + return False + if verbose: + print(r, "found and executable.") + return True + + + def run(self, verbose=False, runs=3): + for run in range(1, runs + 1): + print(str(run) + ". run") + + n = len(self.nthreads) + for i, threads in enumerate(list(range(1, n + 1)) * 2): + print(i + 1, "of", n*2, "\r", end='') + + # run cmd for each target + for tname, t in self.targets.items(): + result = {} + + os.environ["LD_PRELOAD"] = t[1] + + for bench in ["thrash", "scratch"]: + + target_cmd = cmd.format(bench, t[0], threads).split(" ") + if verbose: + print("\n" + tname, t, "\n", " ".join(target_cmd), "\n") + + p = subprocess.run(target_cmd, + env=os.environ, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + universal_newlines=True) + + output = p.stdout + + if p.returncode != 0: + print("\n" + " ".join(target_cmd), "exited with", + p.returncode, ".\n Aborting Benchmark.") + print(tname, t) + print(output) + print(p.stdout) + return False + + if "ERROR: ld.so" in output: + print("\nPreloading of", t[1], "failed for", tname, + ".\n Aborting Benchmark.") + print(output) + return False + + # Handle perf output + time = float(re.search("(\d*\.\d*)", str(output))[1]) + key = (tname, threads) + if not key in self.results[bench]: + self.results[bench][key] = [time] + else: + self.results[bench][key].append(time) + + print() + return True + + def summary(self): + # Speedup thrash + 
nthreads = self.results["args"]["nthreads"] + targets = self.results["targets"] + + y_mapping = {v : i for i, v in enumerate(nthreads)} + for bench in ["thrash", "scratch"]: + for target in targets: + y_vals = [0] * len(nthreads) + single_threaded = np.mean(self.results[bench][(target, 1)]) + y_vals[0] = single_threaded + for mid, measures in self.results[bench].items(): + print(measures) + if mid[0] == target and mid[1] != 1: + y_vals[y_mapping[mid[1]]] = single_threaded / np.mean(measures) + print(target, single_threaded, y_vals) + plt.plot(nthreads, y_vals, marker='.', linestyle='-', label=target) + + plt.legend() + plt.xlabel("threads") + plt.ylabel("speedup") + plt.title(bench) + plt.savefig(self.name + "." + bench + ".png") + plt.clf() + +falsesharing= Benchmark_Falsesharing() |
