add falsesharing benchmarks from berger

author: Florian Fischer <florian.fl.fischer@fau.de> 2018-07-24 11:17:48 +0200
committer: Florian Fischer <florian.fl.fischer@fau.de> 2018-07-24 11:17:48 +0200
commit: 8f6e9a172923b67ccffaf9fd519642ae242db868 (patch)
tree: b38624306297c73011756dbd2bcbbcf9ea6c1f37
parent: fce50c833496d8c07a1d189807d81be15875431c (diff)
download: allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.tar.gz
allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.zip
6 files changed, 1076 insertions, 0 deletions
diff --git a/benchmarks/cache-scratch.cc b/benchmarks/cache-scratch.cc
new file mode 100644
index 0000000..2cb9b28
--- /dev/null
+++ b/benchmarks/cache-scratch.cc
@@ -0,0 +1,147 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+//        for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file cache-scratch.cc
+ *
+ * cache-scratch is a benchmark that exercises a heap's cache locality.
+ * An allocator that allows multiple threads to re-use the same small
+ * object (possibly all in one cache-line) will scale poorly, while
+ * an allocator like Hoard will exhibit near-linear scaling.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ *  cache-scratch 1 1000 1 1000000
+ *  cache-scratch P 1000 1 1000000
+ *
+ *  cache-scratch-hoard 1 1000 1 1000000
+ *  cache-scratch-hoard P 1000 1 1000000
+ *
+ *  The ideal is a P-fold speedup.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "fred.h"
+#include "cpuinfo.h"
+#include "timer.h"
+
+// This class just holds arguments to each thread.
+class workerArg {
+public:
+
+  workerArg() {}
+
+  workerArg (char * obj, int objSize, int repetitions, int iterations)
+    : _object (obj),
+      _objSize (objSize),
+      _iterations (iterations),
+      _repetitions (repetitions)
+  {}
+
+  char * _object;
+  int _objSize;
+  int _iterations;
+  int _repetitions;
+};
+
+
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+  // Thread body; arg points to this thread's workerArg.
+  // Free the object we were given, then repeatedly do the following:
+  //   allocate an object of _objSize bytes,
+  //   write to every byte of it _repetitions times,
+  //   then free it.
+  workerArg * w = (workerArg *) arg;
+  delete [] w->_object;  // handed over by main(), which allocated it with new char[]
+  workerArg w1 = *w;     // work on a local copy of the arguments
+  for (int i = 0; i < w1._iterations; i++) {
+    // Allocate the object.
+    char * obj = new char[w1._objSize];
+    // Write into it a bunch of times.
+    for (int j = 0; j < w1._repetitions; j++) {
+      for (int k = 0; k < w1._objSize; k++) {
+	obj[k] = (char) k;
+	volatile char ch = obj[k];  // read the byte back through a volatile local
+	ch++;
+      }
+    }
+    // Free the object.
+    delete [] obj;
+  }
+
+#if !defined(_WIN32)
+  return NULL;
+#endif
+}
+
+
+int main (int argc, char * argv[])
+{
+  int nthreads = 0;
+  int iterations = 0;
+  int objSize = 0;
+  int repetitions = 0;
+
+  if (argc > 4) {
+    nthreads = atoi(argv[1]);
+    iterations = atoi(argv[2]);
+    objSize = atoi(argv[3]);
+    repetitions = atoi(argv[4]);
+  }
+  // nthreads is used as an array length and divisor, objSize as an array length.
+  if (nthreads < 1 || objSize < 0) {
+    fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]);
+    return 1;
+  }
+
+  HL::Fred * threads = new HL::Fred[nthreads];
+  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+  workerArg * w = new workerArg[nthreads];
+
+  // Allocate nthreads objects and distribute them among the threads.
+  char ** objs = new char * [nthreads];
+  for (int i = 0; i < nthreads; i++) {
+    objs[i] = new char[objSize];
+  }
+
+  HL::Timer t;  // times thread creation through the last join
+  t.start();
+
+  for (int i = 0; i < nthreads; i++) {
+    w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations);  // equal share of the repetitions per thread
+    threads[i].create (&worker, (void *) &w[i]);
+  }
+  for (int i = 0; i < nthreads; i++) {
+    threads[i].join();
+  }
+  t.stop();
+
+  delete [] threads;
+  delete [] objs;  // only the pointer table; each worker freed its own object
+  delete [] w;
+
+  printf ("Time elapsed = %f seconds.\n", (double) t);
+  return 0;
+}
diff --git a/benchmarks/cache-thrash.cc b/benchmarks/cache-thrash.cc
new file mode 100644
index 0000000..79242eb
--- /dev/null
+++ b/benchmarks/cache-thrash.cc
@@ -0,0 +1,134 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+//        for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// Copyright (c) 1998-2003, The University of Texas at Austin.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file  cache-thrash.cc
+ * @brief cache-thrash is a benchmark that exercises a heap's cache-locality.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ *  cache-thrash 1 1000 1 1000000
+ *  cache-thrash P 1000 1 1000000
+ *
+ *  cache-thrash-hoard 1 1000 1 1000000
+ *  cache-thrash-hoard P 1000 1 1000000
+ *
+ *  The ideal is a P-fold speedup.
+*/
+
+
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+#include "cpuinfo.h"
+#include "fred.h"
+#include "timer.h"
+
+// This class just holds arguments to each thread.
+class workerArg {
+public:
+	workerArg() {}
+	workerArg (int objSize, int repetitions, int iterations)
+	: _objSize (objSize),
+	 _iterations (iterations),
+	 _repetitions (repetitions)
+	{}
+
+	int _objSize;
+	int _iterations;
+	int _repetitions;
+};
+
+
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+	// Repeatedly do the following:
+	//   malloc a given-sized object,
+	//   repeatedly write on it,
+	//   then free it.
+	workerArg * w = (workerArg *) arg;
+	workerArg w1 = *w;
+	for (int i = 0; i < w1._iterations; i++) {
+	// Allocate the object.
+		char * obj = new char[w1._objSize];
+		//    printf ("obj = %p\n", obj);
+		// Write into it a bunch of times.
+		for (int j = 0; j < w1._repetitions; j++) {
+			for (int k = 0; k < w1._objSize; k++) {
+				obj[k] = (char) k;
+				volatile char ch = obj[k];
+				ch++;
+			}
+		}
+		// Free the object.
+		delete [] obj;
+	}
+#if !defined(_WIN32)
+	return NULL;
+#endif
+}
+
+
+int main (int argc, char * argv[])
+{
+	int nthreads = 0;
+	int iterations = 0;
+	int objSize = 0;
+	int repetitions = 0;
+
+	if (argc > 4) {
+		nthreads = atoi(argv[1]);
+		iterations = atoi(argv[2]);
+		objSize = atoi(argv[3]);
+		repetitions = atoi(argv[4]);
+	}
+	// nthreads is used as an array length and divisor, objSize as an array length.
+	if (nthreads < 1 || objSize < 0) {
+		cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl;
+		exit(1);
+	}
+
+	HL::Fred * threads = new HL::Fred[nthreads];
+	HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+	HL::Timer t;  // times argument setup, thread creation and the joins
+	t.start();
+
+	workerArg * w = new workerArg[nthreads];
+
+	for (int i = 0; i < nthreads; i++) {
+		w[i] = workerArg (objSize, repetitions / nthreads, iterations);  // equal share of the repetitions per thread
+		threads[i].create (&worker, (void *) &w[i]);
+	}
+	for (int i = 0; i < nthreads; i++) {
+		threads[i].join();
+	}
+	t.stop();
+
+	delete [] threads;
+	delete [] w;
+
+	cout << "Time elapsed = " << (double) t << " seconds." << endl;
+}
diff --git a/benchmarks/cpuinfo.h b/benchmarks/cpuinfo.h
new file mode 100644
index 0000000..1ed1f36
--- /dev/null
+++ b/benchmarks/cpuinfo.h
@@ -0,0 +1,202 @@
+// -*- C++ -*-
+
+/*
+
+  Heap Layers: An Extensible Memory Allocation Infrastructure
+  
+  Copyright (C) 2000-2003 by Emery Berger
+  http://www.cs.umass.edu/~emery
+  emery@cs.umass.edu
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+  
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+
+
+
+#ifndef HL_CPUINFO_H
+#define HL_CPUINFO_H
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#endif
+
+#if defined(__SVR4) // Solaris
+#include <sys/lwp.h>
+extern "C" unsigned int lwp_self(void);
+#include <thread.h>
+extern "C" int _thr_self(void);
+#endif
+
+#if defined(__linux)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(__sgi)
+#include <sys/types.h>
+#include <sys/sysmp.h>
+#include <sys/sysinfo.h>
+#endif
+
+#if defined(hpux)
+#include <sys/mpctl.h>
+#endif
+
+#if defined(_WIN32)
+extern __declspec(thread) int localThreadId;
+#endif
+
+#if defined(__SVR4) && defined(MAP_ALIGN)
+extern volatile int anyThreadStackCreated;
+#endif
+
+namespace HL {
+
+/**
+ * @class CPUInfo
+ * @author Emery Berger <http://www.cs.umass.edu/~emery>
+ *
+ * @brief Architecture-independent wrapper to get number of CPUs. 
+ */
+
+class CPUInfo {
+public:
+  CPUInfo (void)
+  {}
+
+  inline static int getNumProcessors (void) {
+    static int _numProcessors = computeNumProcessors();
+    return _numProcessors;
+  }
+
+  static inline unsigned long getThreadId (void);
+  inline static int computeNumProcessors (void);
+
+};
+
+
+int CPUInfo::computeNumProcessors (void)
+{
+  static int np = 0;  // cached result; 0 means "not computed yet"
+  if (!np) {
+#if defined(__linux) || defined(__APPLE__)
+    np = (int) sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_WIN32)
+    SYSTEM_INFO infoReturn[1];
+    GetSystemInfo (infoReturn);
+    np = (int) (infoReturn->dwNumberOfProcessors);
+#elif defined(__sgi)
+    np = (int) sysmp(MP_NAPROCS);
+#elif defined(hpux)
+    np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
+#elif defined(_SC_NPROCESSORS_ONLN)
+    np = (int) (sysconf(_SC_NPROCESSORS_ONLN));
+#else
+    np = 2;
+    // Unsupported platform.
+    // Pretend we have at least two processors. This approach avoids the risk of assuming
+    // we're on a uniprocessor, which might lead clever allocators to avoid using atomic
+    // operations for all locks.
+#endif
+    // sysconf() reports failure as -1; never hand callers a processor count below 1.
+    if (np < 1) { np = 1; }
+  }
+  return np;
+}
+
+  // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris],
+// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch.
+// pthread_attr_getstacksize
+// pthread_attr_setstackaddr
+// pthread_attr_getstackaddr
+// PTHREAD_STACK_SIZE is minimum.
+// or should we just assume we have __declspec(thread) or __thread?
+
+#if defined(USE_THREAD_KEYWORD)
+  extern __thread int localThreadId;
+#endif
+
+  // FIX ME FIXME
+  //#include <stdio.h>
+
+unsigned long CPUInfo::getThreadId (void) {
+#if defined(__SVR4)
+  size_t THREAD_STACK_SIZE;
+  if (sizeof(size_t) <= 4) {
+    THREAD_STACK_SIZE = 1048576;
+  } else {
+    // 64-bits.
+    THREAD_STACK_SIZE = 1048576 * 2;
+  }
+  if (0) { // !anyThreadStackCreated) {
+    // We know a priori that all stack variables
+    // are on different stacks. Since no one has created
+    // a special one, we are in control, and thus all stacks
+    // are 1 MB in size and on 1 MB boundaries.
+    // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.)
+    char buf;
+    return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20;
+  } else {
+    return (int) pthread_self();
+  }
+#elif defined(_WIN32)
+  // It looks like thread id's are always multiples of 4, so...
+  return GetCurrentThreadId() >> 2;
+#elif defined(__APPLE__)
+  // Consecutive thread id's in Mac OS are 4096 apart;
+  // dividing off the 4096 gives us an appropriate thread id.
+  int tid = (int) ((unsigned long) pthread_self()) >> 12;
+  return tid;
+#elif defined(__BEOS__)
+  return find_thread(0);
+#elif defined(USE_THREAD_KEYWORD)
+  return localThreadId;
+#elif defined(__linux) || defined(PTHREAD_KEYS_MAX)
+  // Consecutive thread id's in Linux are 1024 apart;
+  // dividing off the 1024 gives us an appropriate thread id.
+  return (unsigned long) pthread_self() >> 10;
+#elif defined(POSIX)
+  return (unsigned long) pthread_self();
+#elif USE_SPROC
+  // This hairiness has the same effect as calling getpid(),
+  // but it's MUCH faster since it avoids making a system call
+  // and just accesses the sproc-local data directly.
+  unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid;
+  return pid;
+#else
+  return 0;
+#endif
+}
+
+}
+
+#endif
diff --git a/benchmarks/fred.h b/benchmarks/fred.h
new file mode 100644
index 0000000..b0198a7
--- /dev/null
+++ b/benchmarks/fred.h
@@ -0,0 +1,97 @@
+// -*- C++ -*-
+
+#ifndef HL_FRED_H
+#define HL_FRED_H
+
+/// A thread-wrapper of childlike simplicity :).
+
+#if defined(_WIN32)
+
+  #include <windows.h>
+  #include <process.h>
+
+#elif defined(__SVR4)
+
+  #include <thread.h>
+  #include <pthread.h>
+  #include <unistd.h>
+
+#else
+
+  #include <pthread.h>
+  #include <unistd.h>
+
+#endif
+
+typedef void * (*ThreadFunctionType) (void *);
+
+namespace HL {
+
+class Fred {
+public:
+
+  Fred() {
+#if !defined(_WIN32)
+    pthread_attr_init (&attr);
+    pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
+#endif
+  }
+
+  ~Fred() {
+#if !defined(_WIN32)
+    pthread_attr_destroy (&attr);
+#endif
+  }
+
+  void create (ThreadFunctionType function, void * arg) {
+#if defined(_WIN32)
+    t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0);
+#else
+    pthread_create (&t, &attr, function, arg);
+#endif
+  }
+
+  void join (void) {
+#if defined(_WIN32)
+    WaitForSingleObject (t, INFINITE);
+#else
+    pthread_join (t, NULL);
+#endif
+  }
+
+  static void yield (void) {
+#if defined(_WIN32)
+    Sleep (0);
+#elif defined(__SVR4)
+    thr_yield();
+#else
+    sched_yield();
+#endif
+  }
+
+
+  static void setConcurrency (int n) {
+#if defined(_WIN32)
+#elif defined(__SVR4)
+    thr_setconcurrency (n);
+#else
+    pthread_setconcurrency (n);
+#endif
+  }
+
+
+private:
+#if defined(_WIN32)
+  typedef HANDLE FredType;
+#else
+  typedef pthread_t FredType;
+  pthread_attr_t attr;
+#endif
+
+  FredType t;
+};
+
+}
+
+
+#endif
diff --git a/benchmarks/timer.h b/benchmarks/timer.h
new file mode 100644
index 0000000..d4d42c7
--- /dev/null
+++ b/benchmarks/timer.h
@@ -0,0 +1,372 @@
+/* -*- C++ -*- */
+
+/*
+
+  Heap Layers: An Extensible Memory Allocation Infrastructure
+  
+  Copyright (C) 2000-2003 by Emery Berger
+  http://www.cs.umass.edu/~emery
+  emery@cs.umass.edu
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+  
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+
+#include <cassert>
+#include <stdio.h>
+
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+/**
+ * @class Timer
+ * @brief A portable class for high-resolution timing.
+ *
+ * This class simplifies timing measurements across a number of platforms.
+ * 
+ * @code
+ *  Timer t;
+ *  t.start();
+ *  // do some work
+ *  t.stop();
+ *  cout << "That took " << (double) t << " seconds." << endl;
+ * @endcode
+ *
+ */
+
+#ifdef __APPLE__
+#include <sys/time.h>
+#endif
+
+#if defined(__linux__) && defined(__GNUG__) && defined(__i386__)
+
+#include <stdio.h>
+#include <limits.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+
+static void getTime (unsigned long& tlo, unsigned long& thi) {
+  asm volatile ("rdtsc"
+		: "=a"(tlo),
+		"=d" (thi));
+}
+
+
+static double getFrequency (void) {
+  static double freq = 0.0;
+  static bool initialized = false;
+  unsigned long LTime0, LTime1, HTime0, HTime1;
+  if (!initialized) { 
+
+    freq = 2600000.0;
+
+#if 0
+    // Compute MHz directly.
+    // Wait for approximately one second.
+    
+    getTime (LTime0, HTime0);
+    //    printf ("waiting...\n");
+    struct timespec rqtp, rmtp;
+    rqtp.tv_sec = 1;
+    rqtp.tv_nsec = 0;
+    nanosleep (&rqtp, &rmtp);
+    // printf ("done.\n");
+    getTime (LTime1, HTime1);
+
+    freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0);
+    if (LTime1 < LTime0) {
+      freq -= (double)UINT_MAX;
+    }
+#endif
+    initialized = true;
+
+  } else {
+    // printf ("wha?\n");
+  }
+  return freq;
+}
+
+
+namespace HL {
+
+class Timer {
+public:
+  Timer (void)
+    : timeElapsed (0.0)
+  {
+    _frequency = getFrequency();
+    //    printf ("wooo!\n");
+    //  printf ("freq = %lf\n", frequency);
+  }
+  void start (void) {
+    getTime (currentLo, currentHi);
+  }
+  void stop (void) {
+    unsigned long lo, hi;
+    getTime (lo, hi);
+    double now = (double) hi * 4294967296.0 + lo;
+    double prev = (double) currentHi * 4294967296.0 + currentLo;
+    timeElapsed = (now - prev) / _frequency;
+  }
+
+  operator double (void) {
+    return timeElapsed;
+  }
+
+private:
+  double timeElapsed;
+  unsigned long currentLo, currentHi;
+  double _frequency;
+};
+
+};
+
+#else
+
+
+#ifdef __SVR4 // Solaris
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#include <stdio.h>
+#endif // __SVR4
+
+#include <time.h>
+
+#if defined(unix) || defined(__linux)
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+
+#ifdef __sgi
+#include <sys/types.h>
+#include <sys/times.h>
+#include <limits.h>
+#endif
+
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+
+#if defined(__BEOS__)
+#include <OS.h>
+#endif
+
+
+namespace HL {
+
+class Timer {
+
+public:
+
+  /// Initializes the timer.
+  Timer (void)
+#if !defined(_WIN32)
+    : _starttime (0),
+      _elapsedtime (0)
+#endif
+  {
+  }
+
+  /// Start the timer.
+  void start (void) { _starttime = _time(); }
+
+  /// Stop the timer.
+  void stop (void) { _elapsedtime += _time() - _starttime; }
+
+  /// Reset the timer.
+  void reset (void) { _starttime = _elapsedtime; }
+
+#if 0
+  // Set the timer.
+  void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);}
+#endif
+
+  /// Return the number of seconds elapsed.
+  operator double (void) { return _timetosec (_elapsedtime); }
+
+  static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); }
+
+
+private:
+
+  // The _timer variable will be different depending on the OS.
+  // We try to use the best timer available.
+
+#ifdef __sgi
+#define TIMER_FOUND
+
+  long _starttime, _elapsedtime;
+
+  long _time (void) {
+    struct tms t;
+    long ticks = times (&t);
+    return ticks;
+  }
+
+  static double _timetosec (long t) {
+    return ((double) (t) / CLK_TCK);
+  }
+
+  static long _sectotime (double sec) {
+    return (long) (sec * CLK_TCK);  // scale to ticks first, then truncate (inverse of _timetosec)
+  }
+#endif
+
+#ifdef __SVR4 // Solaris
+#define TIMER_FOUND
+  typedef hrtime_t TimeType;
+  TimeType	_starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    return gethrtime();
+  }
+
+  static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); }
+
+  static double _timetosec (TimeType& t) {
+    return ((double) (t) / 1.0e9);
+  }
+#endif // __SVR4
+
+#if defined(MAC) || defined(macintosh)
+#define TIMER_FOUND
+  double		_starttime, _elapsedtime;
+
+  double _time (void) {
+    return get_Mac_microseconds();
+  }
+
+  double _timetosec (hrtime_t& t) {
+    return t;
+  }
+#endif // MAC
+
+#ifdef _WIN32
+#define TIMER_FOUND
+
+#ifndef __GNUC__
+  class TimeType {
+  public:
+    TimeType (void)
+    {
+      largeInt.QuadPart = 0;
+    }
+    operator double& (void) { return (double&) largeInt.QuadPart; }
+    operator LARGE_INTEGER& (void) { return largeInt; }
+    double timeToSec (void) {
+      return (double) largeInt.QuadPart / getFreq();
+    }
+  private:
+    double getFreq (void) {
+      QueryPerformanceFrequency (&freq);
+      return (double) freq.QuadPart;
+    }
+
+    LARGE_INTEGER largeInt;
+    LARGE_INTEGER freq;
+  };
+
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    TimeType t;
+    int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t));
+    assert (r);
+    return t;
+  }
+
+  static double _timetosec (TimeType& t) {
+    return t.timeToSec();
+  }
+#else
+  typedef DWORD TimeType;
+  DWORD _starttime, _elapsedtime;
+  static DWORD _time (void) {
+    return GetTickCount();
+  }
+
+  static double _timetosec (DWORD& t) {
+    return (double) t / 1000.0;  // _time() is GetTickCount(), which counts milliseconds
+  }
+  static unsigned long _sectotime (double sec) {
+    return (unsigned long)(sec * 1000.0);  // seconds -> millisecond ticks (inverse of _timetosec)
+  }
+#endif
+#endif // _WIN32
+
+
+#ifdef __BEOS__
+#define TIMER_FOUND
+  bigtime_t _starttime, _elapsedtime;
+  bigtime_t _time(void) {
+    return system_time();
+  }
+  double _timetosec (bigtime_t& t) {
+    return (double) t / 1000000.0;
+  }
+  
+  bigtime_t _sectotime (double sec) {
+    return (bigtime_t)(sec * 1000000.0);
+  }
+#endif // __BEOS__
+
+#ifndef TIMER_FOUND
+
+  typedef long TimeType;
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    struct timeval t;
+    gettimeofday (&t, NULL);
+    return t.tv_sec * 1000000 + t.tv_usec;
+  }
+
+  static double _timetosec (TimeType t) {
+    return ((double) (t) / 1000000.0);
+  }
+
+  static TimeType _sectotime (double sec) {
+    return (TimeType) (sec * 1000000.0);
+  }
+
+#endif // TIMER_FOUND
+
+#undef TIMER_FOUND
+
+};
+
+
+#ifdef __SVR4 // Solaris
+class VirtualTimer : public Timer {
+public:
+  hrtime_t _time (void) {
+    return gethrvtime();
+  }
+};  
+#endif
+
+}
+
+#endif
+
+#endif
diff --git a/falsesharing.py b/falsesharing.py
new file mode 100644
index 0000000..627acc0
--- /dev/null
+++ b/falsesharing.py
@@ -0,0 +1,124 @@
+import csv
+import pickle
+import matplotlib.pyplot as plt
+import multiprocessing
+import numpy as np
+import os
+import re
+import subprocess
+
+from benchmark import Benchmark
+from common_targets import common_targets
+
+cmd = ("perf stat -x\; -e cpu-clock:k,cache-references,cache-misses,cycles,"
+       "instructions,branches,faults,migrations "
+       "build/cache-{}{} {} 100 8 1000000")
+
+class Benchmark_Falsesharing( Benchmark ):
+    def __init__(self):
+        self.name = "falsesharing"
+        self.descrition = """This benchmarks makes small allocations and writes
+                            to them multiple times. If the allocated objects are
+                            on the same cache line the writes will be expensive because
+                            of cache trashing."""  # plain str; a trailing comma here would make it a tuple
+        self.targets = common_targets
+        self.nthreads = range(1, multiprocessing.cpu_count() * 2 + 1)
+        # Measurements are stored per benchmark, keyed by (target name, thread count).
+        self.results = {"args" : {"nthreads" : self.nthreads},
+                        "targets" : self.targets,
+                        "thrash": {},
+                        "scratch": {}}
+
+    def prepare(self, verbose=False):
+        req = ["build/cache-thrash", "build/cache-scratch"]
+        for r in req:
+            if not os.path.isfile(r):
+                print(r, "not found")
+                return False
+            if not os.access(r, os.X_OK):
+                print(r, "not executable")
+                return False
+            if verbose:
+                print(r, "found and executable.")
+        return True
+
+
+    def run(self, verbose=False, runs=3):
+        """Run both cache benchmarks `runs` times per target and thread count; return False on failure."""
+        for run in range(1, runs + 1):
+            print(str(run) + ". run")
+
+            n = len(self.nthreads)
+            for i, threads in enumerate(list(range(1, n + 1)) * 2):
+                print(i + 1, "of", n*2, "\r", end='')
+
+                # run cmd for each target
+                for tname, t in self.targets.items():
+                    # t[0] is the binary name suffix, t[1] the LD_PRELOAD value for this target.
+
+                    os.environ["LD_PRELOAD"] = t[1]
+
+                    for bench in ["thrash", "scratch"]:
+                        target_cmd = cmd.format(bench, t[0], threads).split(" ")
+                        if verbose:
+                            print("\n" + tname, t, "\n", " ".join(target_cmd), "\n")
+
+                        p = subprocess.run(target_cmd,
+                                             env=os.environ,
+                                             stderr=subprocess.PIPE,
+                                             stdout=subprocess.PIPE,
+                                             universal_newlines=True)
+
+                        output = p.stdout
+
+                        if p.returncode != 0:
+                            print("\n" + " ".join(target_cmd), "exited with",
+                                    p.returncode, ".\n Aborting Benchmark.")
+                            print(tname, t)
+                            print(output)
+                            print(p.stderr)
+                            return False
+
+                        if "ERROR: ld.so" in p.stderr:  # the dynamic loader reports preload failures on stderr
+                            print("\nPreloading of", t[1], "failed for", tname,
+                                    ".\n Aborting Benchmark.")
+                            print(p.stderr)
+                            return False
+
+                        # First decimal number on stdout; presumably the benchmark's "Time elapsed" value
+                        time = float(re.search(r"(\d*\.\d*)", str(output))[1])
+                        key = (tname, threads)
+                        if not key in self.results[bench]:
+                            self.results[bench][key] = [time]
+                        else:
+                            self.results[bench][key].append(time)
+
+            print()
+        return True
+
+    def summary(self):
+        """Plot each target's speedup over its single-threaded run; writes one PNG per benchmark."""
+        nthreads = self.results["args"]["nthreads"]
+        targets = self.results["targets"]
+
+        y_mapping = {v : i for i, v in enumerate(nthreads)}  # thread count -> index into y_vals
+        for bench in ["thrash", "scratch"]:
+            for target in targets:
+                y_vals = [0] * len(nthreads)
+                single_threaded = np.mean(self.results[bench][(target, 1)])
+                y_vals[0] = 1.0  # the single-threaded baseline has a speedup of 1 by definition
+                for mid, measures in self.results[bench].items():
+                    print(measures)
+                    if mid[0] == target and mid[1] != 1:
+                        y_vals[y_mapping[mid[1]]] = single_threaded / np.mean(measures)
+                print(target, single_threaded, y_vals)
+                plt.plot(nthreads, y_vals, marker='.', linestyle='-', label=target)
+
+            plt.legend()
+            plt.xlabel("threads")
+            plt.ylabel("speedup")
+            plt.title(bench)
+            plt.savefig(self.name + "." + bench + ".png")
+            plt.clf()
+
+falsesharing= Benchmark_Falsesharing()
author	Florian Fischer <florian.fl.fischer@fau.de>	2018-07-24 11:17:48 +0200
committer	Florian Fischer <florian.fl.fischer@fau.de>	2018-07-24 11:17:48 +0200
commit	8f6e9a172923b67ccffaf9fd519642ae242db868 (patch)
tree	b38624306297c73011756dbd2bcbbcf9ea6c1f37
parent	fce50c833496d8c07a1d189807d81be15875431c (diff)
download	allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.tar.gz allocbench-8f6e9a172923b67ccffaf9fd519642ae242db868.zip