about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/__init__.py 0
-rw-r--r-- src/benchmarks/bench_loop.c 87
-rw-r--r-- src/benchmarks/cache-scratch.cc 147
-rw-r--r-- src/benchmarks/cache-thrash.cc 134
-rw-r--r-- src/benchmarks/cpuinfo.h 202
-rw-r--r-- src/benchmarks/fred.h 97
-rw-r--r-- src/benchmarks/larson.cc 744
-rw-r--r-- src/benchmarks/timer.h 372
-rw-r--r-- src/chattymalloc.c 161
-rw-r--r-- src/chattyparser.py 158
-rw-r--r-- src/larson.py 54
-rw-r--r-- src/print_status_on_exit.c 37
-rw-r--r-- src/trace_run.c 750
13 files changed, 2943 insertions, 0 deletions
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/__init__.py
diff --git a/src/benchmarks/bench_loop.c b/src/benchmarks/bench_loop.c
new file mode 100644
index 0000000..bc15808
--- /dev/null
+++ b/src/benchmarks/bench_loop.c
@@ -0,0 +1,87 @@
+#include <assert.h>
+#include <malloc.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/*
+ * Thread-local linear congruential generator (multiplier 1103515245,
+ * increment 12345, modulus 2^31). Every thread starts from the same seed,
+ * 123456789, and advances its own copy of the state.
+ *
+ * Returns the next value of the sequence, always in [0, 2^31).
+ */
+static size_t _rand() {
+    static __thread size_t seed = 123456789;
+    const size_t a = 1103515245;
+    const size_t c = 12345;
+    /* The shift must be done in size_t: `1 << 31` overflows a 32-bit int,
+     * which is undefined behaviour and in practice produced a sign-extended
+     * modulus instead of 2^31. */
+    const size_t m = (size_t)1 << 31;
+    seed = (a * seed + c) % m;
+    return seed;
+}
+
+typedef struct ThreadArgs { // parameters for the worker threads; main() hands the same instance to every thread
+ double benchmark; // neither written nor read anywhere in this file
+ int allocations; // number of malloc/free iterations each worker performs
+ int max_size; // workers request between 1 and max_size bytes per allocation
+} ThreadArgs;
+
+/*
+ * Allocate `size` bytes and hand the block back to the caller.
+ * The write the name refers to is currently disabled, so the memory is
+ * returned untouched; the result is whatever malloc() returned (may be NULL).
+ */
+static void* malloc_then_write(size_t size) {
+    return malloc(size);
+}
+
+/*
+ * Release a block obtained from malloc_then_write().
+ * The read the name refers to is currently disabled, so this only frees.
+ */
+static void read_then_free(void* ptr) {
+    free(ptr);
+}
+/*
+ * Thread body: performs `allocations` rounds of "allocate a block of random
+ * size in [1, max_size], then free it". `arg` points to a ThreadArgs.
+ * Always returns NULL.
+ */
+static void* test_thread_func(void* arg) {
+    const ThreadArgs* cfg = (const ThreadArgs*)arg;
+    int remaining = cfg->allocations;
+
+    while (remaining-- > 0) {
+        size_t size = (_rand() % cfg->max_size) + 1;
+        read_then_free(malloc_then_write(size));
+    }
+    return NULL;
+}
+
+/*
+ * Usage: bench_loop <num threads> <num allocations> <max size> [<file>|stdout]
+ *
+ * Starts <num threads> workers that each run <num allocations> malloc/free
+ * rounds with sizes up to <max size>, waits for all of them and, when a
+ * fifth argument is present, writes the malloc_info() report to that file
+ * (or to standard output for the literal name "stdout").
+ *
+ * Returns 0 on success and 1 on bad arguments or any failed system call.
+ */
+int main(int argc, char* argv[]) {
+    pthread_t* threads;
+    int num_threads;
+    struct ThreadArgs thread_args;
+    int rc;
+
+    if (argc < 4) {
+        fprintf(stderr, "Usage: %s <num threads> <num allocations> <max size>\n", argv[0]);
+        return 1;
+    }
+
+    num_threads = atoi(argv[1]);
+    thread_args.benchmark = 0.0;
+    thread_args.allocations = atoi(argv[2]);
+    thread_args.max_size = atoi(argv[3]);
+
+    /* max_size is used as a modulus by the workers (0 would divide by zero)
+     * and a negative thread count would turn into a huge allocation size. */
+    if (num_threads < 0 || thread_args.max_size < 1) {
+        fprintf(stderr, "%s: <num threads> must be >= 0 and <max size> must be >= 1\n", argv[0]);
+        return 1;
+    }
+
+    threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t));
+    if (threads == NULL && num_threads > 0) {
+        perror("malloc");
+        return 1;
+    }
+
+    for (int i = 0; i < num_threads; i++) {
+        /* pthread functions return the error code instead of setting errno,
+         * so perror() would print an unrelated message. */
+        rc = pthread_create(&threads[i], NULL, test_thread_func, &thread_args);
+        if (rc != 0) {
+            fprintf(stderr, "pthread_create: %s\n", strerror(rc));
+            return 1;
+        }
+    }
+
+    for (int i = 0; i < num_threads; i++) {
+        rc = pthread_join(threads[i], NULL);
+        if (rc != 0) {
+            fprintf(stderr, "pthread_join: %s\n", strerror(rc));
+            return 1;
+        }
+    }
+
+    if (argc == 5) {
+        FILE* f = stdout;
+        int to_file = strcmp(argv[4], "stdout") != 0;
+
+        if (to_file) {
+            f = fopen(argv[4], "w");
+            if (f == NULL) {
+                /* malloc_info() must not be handed a NULL stream. */
+                perror(argv[4]);
+                free(threads);
+                return 1;
+            }
+        }
+        malloc_info(0, f);
+        if (to_file)
+            fclose(f);
+    }
+
+    /* Released only after the report so the statistics are unchanged. */
+    free(threads);
+    return 0;
+}
diff --git a/src/benchmarks/cache-scratch.cc b/src/benchmarks/cache-scratch.cc
new file mode 100644
index 0000000..2cb9b28
--- /dev/null
+++ b/src/benchmarks/cache-scratch.cc
@@ -0,0 +1,147 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+// for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file cache-scratch.cc
+ *
+ * cache-scratch is a benchmark that exercises a heap's cache locality.
+ * An allocator that allows multiple threads to re-use the same small
+ * object (possibly all in one cache-line) will scale poorly, while
+ * an allocator like Hoard will exhibit near-linear scaling.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ * cache-scratch 1 1000 1 1000000
+ * cache-scratch P 1000 1 1000000
+ *
+ * cache-scratch-hoard 1 1000 1 1000000
+ * cache-scratch-hoard P 1000 1 1000000
+ *
+ * The ideal is a P-fold speedup.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "fred.h"
+#include "cpuinfo.h"
+#include "timer.h"
+
+// This class just holds arguments to each thread.
+// Plain bundle of the parameters handed to one worker() thread.
+class workerArg {
+public:
+
+  // Default state: no object and zero-sized work. The members used to be
+  // left indeterminate, including the pointer that worker() releases.
+  workerArg()
+    : _object (NULL),
+      _objSize (0),
+      _iterations (0),
+      _repetitions (0)
+  {}
+
+  // obj:         object the worker frees before it starts its loop
+  // objSize:     size in bytes of every object the worker allocates
+  // repetitions: number of write passes over each allocated object
+  // iterations:  number of allocate/write/free rounds
+  workerArg (char * obj, int objSize, int repetitions, int iterations)
+    : _object (obj),
+      _objSize (objSize),
+      _iterations (iterations),
+      _repetitions (repetitions)
+  {}
+
+  char * _object;
+  int _objSize;
+  int _iterations;
+  int _repetitions;
+};
+
+
+// Thread body. `arg` points to a workerArg.
+// First frees the object that was allocated for this thread by main(), then
+// runs `_iterations` rounds of: allocate `_objSize` bytes, write and read
+// back every byte `_repetitions` times, free the object.
+// Returns NULL (nothing on Windows, where the thread function is void).
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+  workerArg * w = (workerArg *) arg;
+  // main() creates the object with new char[...], so it has to be released
+  // with the array form; a scalar delete here is undefined behaviour.
+  delete [] w->_object;
+  workerArg w1 = *w;
+  for (int i = 0; i < w1._iterations; i++) {
+    // Allocate the object.
+    char * obj = new char[w1._objSize];
+    // Write into it a bunch of times.
+    for (int j = 0; j < w1._repetitions; j++) {
+      for (int k = 0; k < w1._objSize; k++) {
+        obj[k] = (char) k;
+        // volatile keeps the read-back from being optimised away.
+        volatile char ch = obj[k];
+        ch++;
+      }
+    }
+    // Free the object.
+    delete [] obj;
+  }
+
+#if !defined(_WIN32)
+  return NULL;
+#endif
+}
+
+
+// Usage: cache-scratch nthreads iterations objSize repetitions
+//
+// Allocates one object per thread in the main thread, hands each object to
+// a worker (which frees it and then allocates its own), and prints the
+// wall-clock time the workers needed. The repetitions are divided evenly
+// among the threads. Returns 0 on success, 1 on bad arguments.
+int main (int argc, char * argv[])
+{
+  int nthreads;
+  int iterations;
+  int objSize;
+  int repetitions;
+
+  if (argc > 4) {
+    nthreads = atoi(argv[1]);
+    iterations = atoi(argv[2]);
+    objSize = atoi(argv[3]);
+    repetitions = atoi(argv[4]);
+  } else {
+    fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]);
+    return 1;
+  }
+
+  // Both values are used as array-new sizes below, and nthreads is also a
+  // divisor; reject values that cannot describe a run.
+  if (nthreads < 1 || objSize < 0) {
+    fprintf (stderr, "%s: nthreads must be at least 1 and objSize must not be negative\n", argv[0]);
+    return 1;
+  }
+
+  HL::Fred * threads = new HL::Fred[nthreads];
+  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+  workerArg * w = new workerArg[nthreads];
+
+  int i;
+
+  // Allocate nthreads objects and distribute them among the threads.
+  // Each worker frees the object it is given.
+  char ** objs = new char * [nthreads];
+  for (i = 0; i < nthreads; i++) {
+    objs[i] = new char[objSize];
+  }
+
+  HL::Timer t;
+  t.start();
+
+  for (i = 0; i < nthreads; i++) {
+    w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations);
+    threads[i].create (&worker, (void *) &w[i]);
+  }
+  for (i = 0; i < nthreads; i++) {
+    threads[i].join();
+  }
+  t.stop();
+
+  delete [] threads;
+  delete [] objs;
+  delete [] w;
+
+  printf ("Time elapsed = %f seconds.\n", (double) t);
+  return 0;
+}
diff --git a/src/benchmarks/cache-thrash.cc b/src/benchmarks/cache-thrash.cc
new file mode 100644
index 0000000..79242eb
--- /dev/null
+++ b/src/benchmarks/cache-thrash.cc
@@ -0,0 +1,134 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+// for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// Copyright (c) 1998-2003, The University of Texas at Austin.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file cache-thrash.cc
+ * @brief cache-thrash is a benchmark that exercises a heap's cache-locality.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ * cache-thrash 1 1000 1 1000000
+ * cache-thrash P 1000 1 1000000
+ *
+ * cache-thrash-hoard 1 1000 1 1000000
+ * cache-thrash-hoard P 1000 1 1000000
+ *
+ * The ideal is a P-fold speedup.
+*/
+
+
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+#include "cpuinfo.h"
+#include "fred.h"
+#include "timer.h"
+
+// This class just holds arguments to each thread.
+// Plain bundle of the parameters handed to one worker() thread.
+class workerArg {
+public:
+  // Default state: zero-sized work. The members used to be left
+  // indeterminate.
+  workerArg()
+    : _objSize (0),
+      _iterations (0),
+      _repetitions (0)
+  {}
+
+  // objSize:     size in bytes of every object the worker allocates
+  // repetitions: number of write passes over each allocated object
+  // iterations:  number of allocate/write/free rounds
+  workerArg (int objSize, int repetitions, int iterations)
+    : _objSize (objSize),
+      _iterations (iterations),
+      _repetitions (repetitions)
+  {}
+
+  int _objSize;
+  int _iterations;
+  int _repetitions;
+};
+
+
+// Thread body. `arg` points to a workerArg, which is copied on entry.
+// Runs `_iterations` rounds of: allocate `_objSize` bytes, write and read
+// back every byte `_repetitions` times, free the object.
+// Returns NULL (nothing on Windows, where the thread function is void).
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+  const workerArg params = *((workerArg *) arg);
+
+  for (int round = 0; round < params._iterations; ++round) {
+    char * buffer = new char[params._objSize];
+
+    for (int pass = 0; pass < params._repetitions; ++pass) {
+      for (int pos = 0; pos < params._objSize; ++pos) {
+        buffer[pos] = (char) pos;
+        // volatile keeps the read-back from being optimised away.
+        volatile char seen = buffer[pos];
+        seen++;
+      }
+    }
+
+    delete [] buffer;
+  }
+#if !defined(_WIN32)
+  return NULL;
+#endif
+}
+
+
+// Usage: cache-thrash nthreads iterations objSize repetitions
+//
+// Starts nthreads workers that each allocate, repeatedly write and free
+// objects of objSize bytes, and prints the wall-clock time they needed.
+// The repetitions are divided evenly among the threads.
+// Returns 0 on success; exits with status 1 on bad arguments.
+int main (int argc, char * argv[])
+{
+  int nthreads;
+  int iterations;
+  int objSize;
+  int repetitions;
+
+  if (argc > 4) {
+    nthreads = atoi(argv[1]);
+    iterations = atoi(argv[2]);
+    objSize = atoi(argv[3]);
+    repetitions = atoi(argv[4]);
+  } else {
+    cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl;
+    exit(1);
+  }
+
+  // nthreads is an array-new size and a divisor below, objSize an array-new
+  // size in the workers; reject values that cannot describe a run.
+  if (nthreads < 1 || objSize < 0) {
+    cerr << argv[0] << ": nthreads must be at least 1 and objSize must not be negative" << endl;
+    exit(1);
+  }
+
+  HL::Fred * threads = new HL::Fred[nthreads];
+  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+  int i;
+
+  HL::Timer t;
+  t.start();
+
+  workerArg * w = new workerArg[nthreads];
+
+  for (i = 0; i < nthreads; i++) {
+    w[i] = workerArg (objSize, repetitions / nthreads, iterations);
+    threads[i].create (&worker, (void *) &w[i]);
+  }
+  for (i = 0; i < nthreads; i++) {
+    threads[i].join();
+  }
+  t.stop();
+
+  delete [] threads;
+  delete [] w;
+
+  cout << "Time elapsed = " << (double) t << " seconds." << endl;
+  return 0;
+}
diff --git a/src/benchmarks/cpuinfo.h b/src/benchmarks/cpuinfo.h
new file mode 100644
index 0000000..1ed1f36
--- /dev/null
+++ b/src/benchmarks/cpuinfo.h
@@ -0,0 +1,202 @@
+// -*- C++ -*-
+
+/*
+
+ Heap Layers: An Extensible Memory Allocation Infrastructure
+
+ Copyright (C) 2000-2003 by Emery Berger
+ http://www.cs.umass.edu/~emery
+ emery@cs.umass.edu
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+*/
+
+
+
+#ifndef HL_CPUINFO_H
+#define HL_CPUINFO_H
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#endif
+
+#if defined(__SVR4) // Solaris
+#include <sys/lwp.h>
+extern "C" unsigned int lwp_self(void);
+#include <thread.h>
+extern "C" int _thr_self(void);
+#endif
+
+#if defined(__linux)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(__sgi)
+#include <sys/types.h>
+#include <sys/sysmp.h>
+#include <sys/sysinfo.h>
+#endif
+
+#if defined(hpux)
+#include <sys/mpctl.h>
+#endif
+
+#if defined(_WIN32)
+extern __declspec(thread) int localThreadId;
+#endif
+
+#if defined(__SVR4) && defined(MAP_ALIGN)
+extern volatile int anyThreadStackCreated;
+#endif
+
+namespace HL {
+
+/**
+ * @class CPUInfo
+ * @author Emery Berger <http://www.cs.umass.edu/~emery>
+ *
+ * @brief Architecture-independent wrapper to get number of CPUs.
+ */
+
+class CPUInfo {
+public:
+ CPUInfo (void)
+ {}
+
+ inline static int getNumProcessors (void) { // returns the result of computeNumProcessors(), kept in a function-local static after the first call
+ static int _numProcessors = computeNumProcessors();
+ return _numProcessors;
+ }
+
+ static inline unsigned long getThreadId (void); // defined after the class; numeric id for the calling thread
+ inline static int computeNumProcessors (void); // defined after the class; asks the platform for the processor count
+
+};
+
+
+// Asks the platform how many processors are available and remembers the
+// answer in a function-local static, so the query runs only until the first
+// successful call. Returns a value of at least 1.
+int CPUInfo::computeNumProcessors (void)
+{
+  static int np = 0;
+  if (np) {
+    return np;
+  }
+#if defined(__linux) || defined(__APPLE__)
+  np = (int) sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_WIN32)
+  SYSTEM_INFO infoReturn[1];
+  GetSystemInfo (infoReturn);
+  np = (int) (infoReturn->dwNumberOfProcessors);
+#elif defined(__sgi)
+  np = (int) sysmp(MP_NAPROCS);
+#elif defined(hpux)
+  np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
+#elif defined(_SC_NPROCESSORS_ONLN)
+  np = (int) (sysconf(_SC_NPROCESSORS_ONLN));
+#else
+  np = 2;
+  // Unsupported platform.
+  // Pretend we have at least two processors. This approach avoids the risk of assuming
+  // we're on a uniprocessor, which might lead clever allocators to avoid using atomic
+  // operations for all locks.
+#endif
+  if (np < 1) {
+    // sysconf() reports failure as -1; never cache or return a
+    // non-positive count. Use the same fallback as an unsupported platform.
+    np = 2;
+  }
+  return np;
+}
+
+ // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris],
+// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch.
+// pthread_attr_getstacksize
+// pthread_attr_setstackaddr
+// pthread_attr_getstackaddr
+// PTHREAD_STACK_SIZE is minimum.
+// or should we just assume we have __declspec(thread) or __thread?
+
+#if defined(USE_THREAD_KEYWORD)
+ extern __thread int localThreadId;
+#endif
+
+ // FIX ME FIXME
+ //#include <stdio.h>
+
+unsigned long CPUInfo::getThreadId (void) { // returns a numeric id for the calling thread; exactly one of the preprocessor branches below is compiled
+#if defined(__SVR4)
+ size_t THREAD_STACK_SIZE;
+ if (sizeof(size_t) <= 4) {
+ THREAD_STACK_SIZE = 1048576;
+ } else {
+ // 64-bits.
+ THREAD_STACK_SIZE = 1048576 * 2;
+ }
+ if (0) { // !anyThreadStackCreated) {
+ // We know a priori that all stack variables
+ // are on different stacks. Since no one has created
+ // a special one, we are in control, and thus all stacks
+ // are 1 MB in size and on 1 MB boundaries.
+ // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.)
+ char buf;
+ return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20;
+ } else {
+ return (int) pthread_self(); // the only path taken on this platform, because the condition above is the constant 0
+ }
+#elif defined(_WIN32)
+ // It looks like thread id's are always multiples of 4, so...
+ return GetCurrentThreadId() >> 2;
+#elif defined(__APPLE__)
+ // Consecutive thread id's in Mac OS are 4096 apart;
+ // dividing off the 4096 gives us an appropriate thread id.
+ int tid = (int) ((unsigned long) pthread_self()) >> 12; // NOTE(review): the (int) cast binds tighter than >>, so the value is narrowed to int before it is shifted - confirm this is intended
+ return tid;
+#elif defined(__BEOS__)
+ return find_thread(0);
+#elif defined(USE_THREAD_KEYWORD)
+ return localThreadId;
+#elif defined(__linux) || defined(PTHREAD_KEYS_MAX)
+ // Consecutive thread id's in Linux are 1024 apart;
+ // dividing off the 1024 gives us an appropriate thread id.
+ return (unsigned long) pthread_self() >> 10;
+#elif defined(POSIX)
+ return (unsigned long) pthread_self();
+#elif USE_SPROC
+ // This hairiness has the same effect as calling getpid(),
+ // but it's MUCH faster since it avoids making a system call
+ // and just accesses the sproc-local data directly.
+ unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid;
+ return pid;
+#else
+ return 0; // no thread-id source selected: every thread reports 0
+#endif
+}
+
+}
+
+#endif
diff --git a/src/benchmarks/fred.h b/src/benchmarks/fred.h
new file mode 100644
index 0000000..b0198a7
--- /dev/null
+++ b/src/benchmarks/fred.h
@@ -0,0 +1,97 @@
+// -*- C++ -*-
+
+#ifndef HL_FRED_H
+#define HL_FRED_H
+
+/// A thread-wrapper of childlike simplicity :).
+
+#if defined(_WIN32)
+
+ #include <windows.h>
+ #include <process.h>
+
+#elif defined(__SVR4)
+
+ #include <thread.h>
+ #include <pthread.h>
+ #include <unistd.h>
+
+#else
+
+ #include <pthread.h>
+ #include <unistd.h>
+
+#endif
+
+typedef void * (*ThreadFunctionType) (void *);
+
+namespace HL {
+
+class Fred { // wraps one thread: a HANDLE on Windows, a pthread_t plus its attribute object elsewhere
+public:
+
+ Fred() {
+#if !defined(_WIN32)
+ pthread_attr_init (&attr);
+ pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM); // request system contention scope for threads created from this object
+#endif
+ }
+
+ ~Fred() {
+#if !defined(_WIN32)
+ pthread_attr_destroy (&attr); // only the attribute object is released; the thread itself is neither joined nor detached here
+#endif
+ }
+
+ void create (ThreadFunctionType function, void * arg) { // starts a thread running function(arg); the result of the creation call is not checked
+#if defined(_WIN32)
+ t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0);
+#else
+ pthread_create (&t, &attr, function, arg);
+#endif
+ }
+
+ void join (void) { // blocks until the thread started by create() has ended; its return value is discarded
+#if defined(_WIN32)
+ WaitForSingleObject (t, INFINITE);
+#else
+ pthread_join (t, NULL);
+#endif
+ }
+
+ static void yield (void) { // gives up the processor for the calling thread
+#if defined(_WIN32)
+ Sleep (0);
+#elif defined(__SVR4)
+ thr_yield();
+#else
+ sched_yield();
+#endif
+ }
+
+
+ static void setConcurrency (int n) { // forwards n as a concurrency hint to the thread library; does nothing on Windows
+#if defined(_WIN32)
+#elif defined(__SVR4)
+ thr_setconcurrency (n);
+#else
+ pthread_setconcurrency (n);
+#endif
+ }
+
+
+private:
+#if defined(_WIN32)
+ typedef HANDLE FredType;
+#else
+ typedef pthread_t FredType;
+ pthread_attr_t attr; // attributes applied by create()
+#endif
+
+ FredType t; // the thread, valid once create() has been called
+};
+
+}
+
+
+#endif
diff --git a/src/benchmarks/larson.cc b/src/benchmarks/larson.cc
new file mode 100644
index 0000000..be8038f
--- /dev/null
+++ b/src/benchmarks/larson.cc
@@ -0,0 +1,744 @@
+#include <assert.h>
+#include <stdio.h>
+
+#if defined(_WIN32)
+#define __WIN32__
+#endif
+
+#ifdef __WIN32__
+#include <windows.h>
+#include <conio.h>
+#include <process.h>
+
+#else
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+#ifndef __SVR4
+//extern "C" int pthread_setconcurrency (int) throw();
+#include <pthread.h>
+#endif
+
+
+typedef void * LPVOID;
+typedef long long LONGLONG;
+typedef long DWORD;
+typedef long LONG;
+typedef unsigned long ULONG;
+typedef union _LARGE_INTEGER { // non-Windows stand-in for the Win32 LARGE_INTEGER type
+ struct {
+ DWORD LowPart;
+ LONG HighPart;
+ } foo;
+ LONGLONG QuadPart; // In Visual C++, a typedef to __int64
+} LARGE_INTEGER;
+typedef long long _int64;
+#ifndef TRUE
+enum { TRUE = 1, FALSE = 0 };
+#endif
+#include <assert.h>
+#define _ASSERTE(x) assert(x)
+#define _inline inline
+// Stand-in for the Win32 Sleep(): blocks the caller for x milliseconds.
+// Values of zero or less return immediately.
+void Sleep (long x)
+{
+  // printf ("sleeping for %ld seconds.\n", x/1000);
+  if (x <= 0) {
+    // A negative count used to be converted to a huge unsigned sleep().
+    return;
+  }
+  sleep(x/1000);
+  // sleep() only takes whole seconds. The remainder used to be dropped,
+  // which turned every request shorter than a second into no wait at all.
+  usleep((x%1000)*1000);
+}
+
+// Stand-in for the Win32 QueryPerformanceCounter(): stores the current
+// wall-clock time in *x as microseconds, the unit announced by
+// QueryPerformanceFrequency().
+void QueryPerformanceCounter (long * x)
+{
+  struct timeval tv;
+  // POSIX leaves the behaviour unspecified for a non-null timezone
+  // argument, and the value was never used, so pass NULL.
+  gettimeofday (&tv, NULL);
+  *x = tv.tv_sec * 1000000L + tv.tv_usec;
+}
+
+// Stand-in for the Win32 QueryPerformanceFrequency(): stores the number of
+// counter ticks per second in *x. The counter counts microseconds.
+void QueryPerformanceFrequency(long * x)
+{
+  const long ticks_per_second = 1000L * 1000L;
+  *x = ticks_per_second;
+}
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <ctype.h>
+#include <time.h>
+#include <assert.h>
+
+#define _REENTRANT 1
+#include <pthread.h>
+#ifdef __sun
+#include <thread.h>
+#endif
+typedef void * VoidFunction (void *);
+// Stand-in for the Win32 _beginthread(): starts a detached thread that runs
+// x(z). The second parameter (the Win32 stack size) is ignored. A failure to
+// create the thread is reported on stderr; nothing is returned.
+void _beginthread (VoidFunction x, int, void * z)
+{
+  pthread_t pt;
+  pthread_attr_t pa;
+  pthread_attr_init (&pa);
+
+#if 1//defined(__SVR4)
+  pthread_attr_setscope (&pa, PTHREAD_SCOPE_SYSTEM); /* bound behavior */
+#endif
+  // The thread id is dropped when this function returns, so nothing can
+  // ever join the thread. Create it detached so its resources are released
+  // when it ends; the workers keep starting successors, so joinable threads
+  // would pile up for the whole run.
+  pthread_attr_setdetachstate (&pa, PTHREAD_CREATE_DETACHED);
+
+  // printf ("creating a thread.\n");
+  int v = pthread_create(&pt, &pa, x, z);
+  if (v != 0) {
+    fprintf (stderr, "pthread_create failed: %s\n", strerror(v));
+  }
+  pthread_attr_destroy (&pa);
+}
+#endif
+
+
+#if 0
+static char buf[65536];
+
+#define malloc(v) &buf
+#define free(p)
+#endif
+
+#undef CPP
+//#define CPP
+//#include "arch-specific.h"
+
+#if USE_ROCKALL
+//#include "FastHeap.hpp"
+//FAST_HEAP theFastHeap (1024 * 1024, true, true, true);
+
+typedef int SBIT32;
+
+#include "SmpHeap.hpp"
+SMP_HEAP theFastHeap (1024 * 1024, true, true, true);
+
+void * operator new( unsigned int cb ) // NOTE(review): the replaceable form of operator new takes size_t; this unsigned int signature presumably only matches where the two types coincide - confirm before enabling USE_ROCKALL
+{
+ void *pRet = theFastHeap.New ((size_t)cb) ; // every new-expression is served from theFastHeap
+ return pRet;
+}
+
+void operator delete(void *pUserData ) // counterpart of the operator new above: hands the block back to theFastHeap
+{
+ theFastHeap.Delete (pUserData) ;
+}
+#endif
+
+#if 0
+extern "C" void * hdmalloc (size_t sz) ;
+extern "C" void hdfree (void * ptr) ;
+extern "C" void hdmalloc_stats (void) ;
+void * operator new( unsigned int cb )
+{
+ void *pRet = hdmalloc((size_t)cb) ;
+ return pRet;
+}
+
+void operator delete(void *pUserData )
+{
+ hdfree(pUserData) ;
+}
+#endif
+
+
+
+/* Test driver for memory allocators */
+/* Author: Paul Larson, palarson@microsoft.com */
+#define MAX_THREADS 100
+#define MAX_BLOCKS 20000000
+
+int volatile stopflag=FALSE ;
+
+struct lran2_st { // state of one lran2 generator, filled by lran2_init() and advanced by lran2()
+ long x, y, v[97]; // x: latest raw value, y: latest returned value (also selects the next table slot), v: 97-entry table
+};
+
+int TotalAllocs=0 ;
+
+typedef struct thr_data { // per-thread work description and counters, passed to exercise_heap()
+
+ int threadno ; // 1-based thread number assigned by runthreads()
+ int NumBlocks ; // number of free/allocate steps one exercise_heap() call performs
+ int seed ; // seed used to initialise rgen
+
+ int min_size ; // smallest block size requested
+ int max_size ; // upper limit for block sizes
+
+ char * *array ; // this thread's slice of the global block pointer table
+ int *blksize ; // this thread's slice of the global block size table
+ int asize ; // number of entries in array / blksize
+
+ unsigned long cAllocs ; // allocations performed
+ unsigned long cFrees ; // frees performed
+ int cThreads ; // number of exercise_heap() runs that used this record
+ unsigned long cBytesAlloced ; // not updated by any code visible here
+
+ volatile int finished ; // set to TRUE by the worker when it ends, polled by runthreads()
+ struct lran2_st rgen ; // private random number generator state
+
+} thread_data;
+
+void runthreads(long sleep_cnt, int min_threads, int max_threads,
+ int chperthread, int num_rounds) ;
+void runloops(long sleep_cnt, int num_chunks ) ;
+static void warmup(char **blkp, int num_chunks );
+static void * exercise_heap( void *pinput) ;
+static void lran2_init(struct lran2_st* d, long seed) ;
+static long lran2(struct lran2_st* d) ;
+ULONG CountReservedSpace() ;
+
+char ** blkp = new char *[MAX_BLOCKS] ;
+int * blksize = new int[MAX_BLOCKS] ;
+long seqlock=0 ;
+struct lran2_st rgen ;
+int min_size=10, max_size=500 ;
+int num_threads ;
+ULONG init_space ;
+
+extern int cLockSleeps ;
+extern int cAllocedChunks ;
+extern int cAllocedSpace ;
+extern int cUsedSpace ;
+extern int cFreeChunks ;
+extern int cFreeSpace ;
+
+int cChecked=0 ;
+
+#if defined(_WIN32)
+extern "C" {
+ extern HANDLE crtheap;
+};
+#endif
+
+// Entry point of the Larson allocator benchmark.
+//
+// With at least seven command-line arguments the parameters are taken from
+// argv in this order: runtime in seconds, minimum and maximum chunk size,
+// chunks per thread, number of rounds, random seed, number of threads.
+// Otherwise they are read interactively from standard input.
+// Returns 0 after the run, or 1 when more than MAX_BLOCKS chunks are asked for.
+int main (int argc, char *argv[])
+{
+#if defined(USE_LFH) && defined(_WIN32)
+  // Activate 'Low Fragmentation Heap'.
+  ULONG info = 2;
+  HeapSetInformation (GetProcessHeap(),
+                      HeapCompatibilityInformation,
+                      &info,
+                      sizeof(info));
+#endif
+#if 0 // defined(__SVR4)
+  {
+    psinfo_t ps;
+    int pid = getpid();
+    char fname[255];
+    sprintf (fname, "/proc/%d/psinfo", pid);
+    // sprintf (fname, "/proc/self/ps");
+    FILE * f = fopen (fname, "rb");
+    printf ("opening %s\n", fname);
+    if (f) {
+      fread (&ps, sizeof(ps), 1, f);
+      printf ("resident set size = %dK\n", ps.pr_rssize);
+      fclose (f);
+    }
+  }
+#endif
+
+#if defined(_MT) || defined(_REENTRANT)
+  int min_threads, max_threads ;
+  int num_rounds ;
+  int chperthread ;
+#endif
+  unsigned seed=12345 ;
+  int num_chunks=10000;
+  long sleep_cnt;
+
+  if (argc > 7) {
+    sleep_cnt = atoi(argv[1]);
+    min_size = atoi(argv[2]);
+    max_size = atoi(argv[3]);
+    chperthread = atoi(argv[4]);
+    num_rounds = atoi(argv[5]);
+    seed = atoi(argv[6]);
+    max_threads = atoi(argv[7]);
+    min_threads = max_threads;
+    // The threads use max_threads*chperthread entries of the global block
+    // tables. Without this assignment num_chunks kept its default and the
+    // MAX_BLOCKS check below could not catch an oversized request.
+    num_chunks = max_threads*chperthread ;
+    printf ("sleep = %ld, min = %d, max = %d, per thread = %d, num rounds = %d, seed = %u, max_threads = %d, min_threads = %d\n",
+            sleep_cnt, min_size, max_size, chperthread, num_rounds, seed, max_threads, min_threads);
+    goto DoneWithInput;
+  }
+
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  printf( "\nMulti-threaded test driver \n") ;
+#else
+  printf( "\nSingle-threaded test driver \n") ;
+#endif
+#ifdef CPP
+  printf("C++ version (new and delete)\n") ;
+#else
+  printf("C version (malloc and free)\n") ;
+#endif
+  printf("runtime (sec): ") ;
+  scanf ("%ld", &sleep_cnt);
+
+  printf("chunk size (min,max): ") ;
+  scanf("%d %d", &min_size, &max_size ) ;
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  printf("threads (min, max): ") ;
+  scanf("%d %d", &min_threads, &max_threads) ;
+  printf("chunks/thread: ") ; scanf("%d", &chperthread ) ;
+  printf("no of rounds: ") ; scanf("%d", &num_rounds ) ;
+  num_chunks = max_threads*chperthread ;
+#else
+  printf("no of chunks: ") ; scanf("%d", &num_chunks ) ;
+#endif
+  // seed is unsigned, so it has to be read with %u.
+  printf("random seed: ") ; scanf("%u", &seed) ;
+
+ DoneWithInput:
+
+  if( num_chunks > MAX_BLOCKS ){
+    printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ;
+    return(1) ;
+  }
+
+#ifndef __WIN32__
+#ifdef __SVR4
+  pthread_setconcurrency (max_threads);
+#endif
+#endif
+
+  lran2_init(&rgen, seed) ;
+  // init_space = CountReservedSpace() ;
+
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ;
+#else
+  runloops(sleep_cnt, num_chunks ) ;
+#endif
+
+#ifdef _DEBUG
+  _cputs("Hit any key to exit...") ; (void)_getch() ;
+#endif
+
+  return 0;
+
+} /* main */
+
+// Single-threaded benchmark loop.
+//
+// Fills the first num_chunks entries of the global block table with blocks
+// of random size, then keeps replacing randomly chosen blocks until at least
+// sleep_cnt seconds have passed, and prints the elapsed time, the
+// allocations per second and two space figures.
+void runloops(long sleep_cnt, int num_chunks )
+{
+  int cblks ;
+  int victim ;
+  int blk_size ;
+#ifdef __WIN32__
+  _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
+#else
+  long ticks_per_sec ;
+  long start_cnt, end_cnt ;
+#endif
+  _int64 ticks ;
+  double duration ;
+  double reqd_space ;
+  // The measurement that would fill this in is disabled below, so start
+  // from 0 (as runthreads() does) instead of printing an indeterminate value.
+  ULONG used_space = 0 ;
+  int sum_allocs=0 ;
+
+  QueryPerformanceFrequency( &ticks_per_sec ) ;
+  QueryPerformanceCounter( &start_cnt) ;
+
+  for( cblks=0; cblks<num_chunks; cblks++){
+    if (max_size == min_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+    }
+#ifdef CPP
+    blkp[cblks] = new char[blk_size] ;
+#else
+    blkp[cblks] = (char *) malloc(blk_size) ;
+#endif
+    blksize[cblks] = blk_size ;
+    assert(blkp[cblks] != NULL) ;
+  }
+
+  while(TRUE){
+    for( cblks=0; cblks<num_chunks; cblks++){
+      victim = lran2(&rgen)%num_chunks ;
+#ifdef CPP
+      // Blocks come from new char[...], so the array form is required.
+      delete [] blkp[victim] ;
+#else
+      free(blkp[victim]) ;
+#endif
+
+      if (max_size == min_size) {
+        blk_size = min_size;
+      } else {
+        blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+      }
+#ifdef CPP
+      blkp[victim] = new char[blk_size] ;
+#else
+      blkp[victim] = (char *) malloc(blk_size) ;
+#endif
+      blksize[victim] = blk_size ;
+      assert(blkp[victim] != NULL) ;
+    }
+    sum_allocs += num_chunks ;
+
+    QueryPerformanceCounter( &end_cnt) ;
+#ifdef __WIN32__
+    ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
+    duration = (double)ticks/ticks_per_sec.QuadPart ;
+#else
+    ticks = end_cnt - start_cnt ;
+    duration = (double)ticks/ticks_per_sec ;
+#endif
+
+    if( duration >= sleep_cnt) break ;
+  }
+  reqd_space = (0.5*(min_size+max_size)*num_chunks) ;
+  // used_space = CountReservedSpace() - init_space;
+
+  printf("%6.3f", duration ) ;
+  printf("%8.0f", sum_allocs/duration ) ;
+  printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
+  printf("\n") ;
+
+}
+
+
+#if defined(_MT) || defined(_REENTRANT)
+//#ifdef _MT
+// Multi-threaded benchmark driver.
+//
+// For every thread count from min_threads to max_threads: warms up the part
+// of the global block table that is new for this count, starts that many
+// exercise_heap() threads with chperthread blocks each, lets them run for
+// sleep_cnt seconds, raises the stop flag, waits until every thread has
+// reported that it finished and prints the allocation throughput.
+void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds)
+{
+  thread_data *de_area = new thread_data[max_threads] ;
+  int nperthread ;
+  int sum_threads ;
+  unsigned long sum_allocs ;
+  unsigned long sum_frees ;
+  double duration ;
+#ifdef __WIN32__
+  _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
+#else
+  long ticks_per_sec ;
+  long start_cnt, end_cnt ;
+#endif
+  _int64 ticks ;
+  double rate_1=0, rate_n ;
+  double reqd_space ;
+  ULONG used_space ;
+  int prevthreads ;
+  int i ;
+
+  QueryPerformanceFrequency( &ticks_per_sec ) ;
+
+  // Clear every record. Only the first one used to be zeroed, which left
+  // fields that the start-up loop does not assign indeterminate in the rest.
+  memset(de_area, 0, sizeof(thread_data) * max_threads) ;
+
+  prevthreads = 0 ;
+  for(num_threads=min_threads; num_threads <= max_threads; num_threads++ )
+    {
+
+      warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread );
+
+      nperthread = chperthread ;
+      stopflag = FALSE ;
+
+      for(i=0; i< num_threads; i++){
+        de_area[i].threadno = i+1 ;
+        de_area[i].NumBlocks = num_rounds*nperthread;
+        de_area[i].array = &blkp[i*nperthread] ;
+        de_area[i].blksize = &blksize[i*nperthread] ;
+        de_area[i].asize = nperthread ;
+        de_area[i].min_size = min_size ;
+        de_area[i].max_size = max_size ;
+        de_area[i].seed = lran2(&rgen) ;
+        de_area[i].cAllocs = 0 ;
+        de_area[i].cFrees = 0 ;
+        de_area[i].cThreads = 0 ;
+        de_area[i].finished = FALSE ;
+        lran2_init(&de_area[i].rgen, de_area[i].seed) ;
+
+#ifdef __WIN32__
+        _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, &de_area[i]) ;
+#else
+        _beginthread(exercise_heap, 0, &de_area[i]) ;
+#endif
+
+      }
+
+      QueryPerformanceCounter( &start_cnt) ;
+
+      // printf ("Sleeping for %ld seconds.\n", sleep_cnt);
+      Sleep(sleep_cnt * 1000L) ;
+
+      stopflag = TRUE ;
+
+      // Wait until every worker has noticed the stop flag.
+      for(i=0; i<num_threads; i++){
+        while( !de_area[i].finished ){
+#ifdef __WIN32__
+          Sleep(1);
+#elif defined(__SVR4)
+          thr_yield();
+#else
+          sched_yield();
+#endif
+        }
+      }
+
+
+      QueryPerformanceCounter( &end_cnt) ;
+
+      sum_frees = sum_allocs =0 ;
+      sum_threads = 0 ;
+      for(i=0;i< num_threads; i++){
+        sum_allocs += de_area[i].cAllocs ;
+        sum_frees += de_area[i].cFrees ;
+        sum_threads += de_area[i].cThreads ;
+        de_area[i].cAllocs = de_area[i].cFrees = 0;
+      }
+
+
+#ifdef __WIN32__
+      ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
+      duration = (double)ticks/ticks_per_sec.QuadPart ;
+#else
+      ticks = end_cnt - start_cnt ;
+      duration = (double)ticks/ticks_per_sec ;
+#endif
+
+      for( i=0; i<num_threads; i++){
+        if( !de_area[i].finished )
+          printf("Thread at %d not finished\n", i) ;
+      }
+
+
+      rate_n = sum_allocs/duration ;
+      if( rate_1 == 0){
+        rate_1 = rate_n ;
+      }
+
+      reqd_space = (0.5*(min_size+max_size)*num_threads*chperthread) ;
+      // used_space = CountReservedSpace() - init_space;
+      used_space = 0;
+
+      printf ("Throughput = %8.0f operations per second.\n", sum_allocs / duration);
+
+#if 0
+      printf("%2d ", num_threads ) ;
+      printf("%6.3f", duration ) ;
+      printf("%6.3f", rate_n/rate_1 ) ;
+      printf("%8.0f", sum_allocs/duration ) ;
+      printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
+      printf("\n") ;
+#endif
+
+      Sleep(5000L) ; // wait 5 sec for old threads to die
+
+      prevthreads = num_threads ;
+
+      printf ("Done sleeping...\n");
+
+    }
+  delete [] de_area;
+}
+
+
+// Worker thread body. pinput points to the thread_data record of the worker.
+//
+// Performs up to NumBlocks steps of "free a randomly chosen block of this
+// thread's slice, allocate a replacement of random size and touch it",
+// counting allocations and frees in the record. When the global stop flag is
+// not set at the end, a successor thread is started on the same record.
+// The record's `finished` field is TRUE whenever this function has returned.
+static void * exercise_heap( void *pinput)
+{
+  thread_data *pdea = (thread_data *)pinput ;
+  int cblks=0 ;
+  int victim ;
+  long blk_size ;
+  int range ;
+
+  if( stopflag ){
+    // runthreads() polls `finished` after raising the stop flag. A thread
+    // that sees the flag before doing any work must report in as well, or
+    // the poll loop never ends.
+    pdea->finished = TRUE ;
+    return 0;
+  }
+
+  pdea->finished = FALSE ;
+  pdea->cThreads++ ;
+  range = pdea->max_size - pdea->min_size ;
+
+  /* allocate NumBlocks chunks of random size */
+  for( cblks=0; cblks<pdea->NumBlocks; cblks++){
+    victim = lran2(&pdea->rgen)%pdea->asize ;
+#ifdef CPP
+    // Blocks come from new char[...], so the array form is required.
+    delete [] pdea->array[victim] ;
+#else
+    free(pdea->array[victim]) ;
+#endif
+    pdea->cFrees++ ;
+
+    if (range == 0) {
+      blk_size = pdea->min_size;
+    } else {
+      blk_size = pdea->min_size+lran2(&pdea->rgen)%range ;
+    }
+#ifdef CPP
+    pdea->array[victim] = new char[blk_size] ;
+#else
+    pdea->array[victim] = (char *) malloc(blk_size) ;
+#endif
+
+    pdea->blksize[victim] = blk_size ;
+    assert(pdea->array[victim] != NULL) ;
+
+    pdea->cAllocs++ ;
+
+    /* Write something! */
+
+    // Touch the first two bytes, but only those the block really has: the
+    // unconditional second write overflowed one-byte blocks.
+    volatile char * chptr = ((char *) pdea->array[victim]);
+    if (blk_size > 0) {
+      *chptr = 'a';
+      volatile char ch = *((char *) pdea->array[victim]);
+      (void) ch;
+    }
+    if (blk_size > 1) {
+      chptr[1] = 'b';
+    }
+
+
+    if( stopflag ) break ;
+  }
+
+  // printf("Thread %u terminating: %d allocs, %d frees\n",
+  // pdea->threadno, pdea->cAllocs, pdea->cFrees) ;
+  pdea->finished = TRUE ;
+
+  if( !stopflag ){
+#ifdef __WIN32__
+    _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, pdea) ;
+#else
+    _beginthread(exercise_heap, 0, pdea) ;
+#endif
+  } else {
+    printf ("thread stopping.\n");
+  }
+#ifndef _WIN32
+  pthread_exit (NULL);
+#endif
+  return 0;
+}
+
+// Prepares num_chunks entries of the block table starting at blkp.
+//
+// Allocates a block of random size for every entry, shuffles the entries
+// and then performs 4*num_chunks random free/allocate replacements.
+// blkp is expected to point into the global block table, as it does at the
+// only call site visible here; the matching part of the global size table
+// is kept in step with it.
+static void warmup(char **blkp, int num_chunks )
+{
+  int cblks ;
+  int victim ;
+  int blk_size ;
+  LPVOID tmp ;
+  // Size entries that belong to blkp[0..num_chunks). The sizes used to be
+  // stored from the start of the global table regardless of where blkp
+  // points, overwriting the sizes of other threads' blocks.
+  int * sizes = blksize + (blkp - ::blkp) ;
+
+
+  for( cblks=0; cblks<num_chunks; cblks++){
+    if (min_size == max_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size-min_size) ;
+    }
+#ifdef CPP
+    blkp[cblks] = new char[blk_size] ;
+#else
+    blkp[cblks] = (char *) malloc(blk_size) ;
+#endif
+    sizes[cblks] = blk_size ;
+    assert(blkp[cblks] != NULL) ;
+  }
+
+  /* generate a random permutation of the chunks */
+  for( cblks=num_chunks; cblks > 0 ; cblks--){
+    victim = lran2(&rgen)%cblks ;
+    tmp = blkp[victim] ;
+    blkp[victim] = blkp[cblks-1] ;
+    blkp[cblks-1] = (char *) tmp ;
+    // Move the recorded size together with its block.
+    blk_size = sizes[victim] ;
+    sizes[victim] = sizes[cblks-1] ;
+    sizes[cblks-1] = blk_size ;
+  }
+
+  for( cblks=0; cblks<4*num_chunks; cblks++){
+    victim = lran2(&rgen)%num_chunks ;
+#ifdef CPP
+    // Blocks come from new char[...], so the array form is required.
+    delete [] blkp[victim] ;
+#else
+    free(blkp[victim]) ;
+#endif
+
+    if (max_size == min_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+    }
+#ifdef CPP
+    blkp[victim] = new char[blk_size] ;
+#else
+    blkp[victim] = (char *) malloc(blk_size) ;
+#endif
+    sizes[victim] = blk_size ;
+    assert(blkp[victim] != NULL) ;
+  }
+}
+#endif // _MT
+
+#ifdef __WIN32__
+// Walks the address space from address 0 up to 0x80000000 region by region
+// and returns the summed size of all regions whose state is MEM_COMMIT.
+ULONG CountReservedSpace()
+{
+  MEMORY_BASIC_INFORMATION region;
+  char *cursor=NULL ;
+  ULONG committed=0 ;
+
+  do {
+    VirtualQuery(cursor, &region, sizeof(region));
+    // Free and reserved regions are skipped; only committed ones count.
+    if( region.State == MEM_COMMIT ){
+      committed += region.RegionSize ;
+    }
+    cursor += region.RegionSize ;
+  } while( cursor < (char *)0x80000000UL ) ;
+
+  return committed ;
+
+}
+#endif
+
+// =======================================================
+
+/* lran2.h
+ * by Wolfram Gloger 1996.
+ *
+ * A small, portable pseudo-random number generator.
+ */
+
+#ifndef _LRAN2_H
+#define _LRAN2_H
+
+#define LRAN2_MAX 714025l /* constants for portable */
+#define IA 1366l /* random number generator */
+#define IC 150889l /* (see e.g. `Numerical Recipes') */
+
+//struct lran2_st {
+// long x, y, v[97];
+//};
+
+// Initialises the generator state *d from seed: derives a non-negative
+// start value, fills the 97-entry table with consecutive values of the
+// recurrence and sets x and y to the value that follows the last entry.
+static void
+lran2_init(struct lran2_st* d, long seed)
+{
+  long state = (IC - seed) % LRAN2_MAX;
+  if (state < 0) {
+    state = -state;
+  }
+
+  for (int slot = 0; slot < 97; ++slot) {
+    state = (IA*state + IC) % LRAN2_MAX;
+    d->v[slot] = state;
+  }
+
+  const long next = (IA*state + IC) % LRAN2_MAX;
+  d->x = next;
+  d->y = next;
+}
+
+// Returns the next pseudo-random number of generator *d: the previous
+// result selects a table slot, the slot's value becomes the result, and the
+// slot is refilled with the next value of the recurrence.
+static
+long lran2(struct lran2_st* d)
+{
+  const int slot = (d->y % 97);
+  const long result = d->v[slot];
+  const long refill = (IA*d->x + IC) % LRAN2_MAX;
+
+  d->x = refill;
+  d->v[slot] = refill;
+  d->y = result;
+  return result;
+}
+
+#undef IA
+#undef IC
+
+#endif
+
+
diff --git a/src/benchmarks/timer.h b/src/benchmarks/timer.h
new file mode 100644
index 0000000..d4d42c7
--- /dev/null
+++ b/src/benchmarks/timer.h
@@ -0,0 +1,372 @@
+/* -*- C++ -*- */
+
+/*
+
+ Heap Layers: An Extensible Memory Allocation Infrastructure
+
+ Copyright (C) 2000-2003 by Emery Berger
+ http://www.cs.umass.edu/~emery
+ emery@cs.umass.edu
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+*/
+
+#include <cassert>
+#include <stdio.h>
+
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+/**
+ * @class Timer
+ * @brief A portable class for high-resolution timing.
+ *
+ * This class simplifies timing measurements across a number of platforms.
+ *
+ * @code
+ * Timer t;
+ * t.start();
+ * // do some work
+ * t.stop();
+ * cout << "That took " << (double) t << " seconds." << endl;
+ * @endcode
+ *
+ */
+
+#ifdef __APPLE__
+#include <sys/time.h>
+#endif
+
+#if defined(__linux__) && defined(__GNUG__) && defined(__i386__)
+
+#include <stdio.h>
+#include <limits.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+
+// Executes the `rdtsc` instruction and stores what it leaves in the A
+// register into tlo and what it leaves in the D register into thi (see the
+// "=a" / "=d" output constraints).  Timer below combines the two as
+// thi * 2^32 + tlo.
+static void getTime (unsigned long& tlo, unsigned long& thi) {
+ asm volatile ("rdtsc"
+ : "=a"(tlo),
+ "=d" (thi));
+}
+
+
+// Returns the divisor Timer applies to raw time-stamp-counter deltas.
+//
+// The value is computed once and cached in function-local statics.  At the
+// moment it is the hard-coded constant 2600000.0; the calibration code that
+// would measure it (sleep for one second and count ticks) is compiled out
+// with #if 0.
+// NOTE(review): a fixed constant only matches one particular CPU clock --
+// confirm that this is intended before trusting absolute timings.
+static double getFrequency (void) {
+  static bool initialized = false;
+  static double freq = 0.0;
+  // Only referenced by the disabled calibration code below.
+  unsigned long LTime0, LTime1, HTime0, HTime1;
+
+  if (initialized) {
+    // printf ("wha?\n");
+    return freq;
+  }
+
+  freq = 2600000.0;
+
+#if 0
+  // Compute MHz directly.
+  // Wait for approximately one second.
+
+  getTime (LTime0, HTime0);
+  // printf ("waiting...\n");
+  struct timespec rqtp, rmtp;
+  rqtp.tv_sec = 1;
+  rqtp.tv_nsec = 0;
+  nanosleep (&rqtp, &rmtp);
+  // printf ("done.\n");
+  getTime (LTime1, HTime1);
+
+  freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0);
+  if (LTime1 < LTime0) {
+    freq -= (double)UINT_MAX;
+  }
+#endif
+
+  initialized = true;
+  return freq;
+}
+
+
+namespace HL {
+
+/**
+ * Stopwatch built on the CPU time-stamp counter (see getTime()).
+ *
+ * start() records the counter, stop() reads it again and stores
+ * (now - start) / getFrequency() as the elapsed time.  Unlike an
+ * accumulating timer, each stop() overwrites the previous result.
+ * Converting the object to double yields that stored value; its unit is
+ * whatever getFrequency() implies.
+ */
+class Timer {
+public:
+  // All members are initialised so that calling stop() without a prior
+  // start() reads zeros rather than indeterminate counter values.
+  Timer (void)
+    : timeElapsed (0.0),
+      currentLo (0),
+      currentHi (0),
+      _frequency (getFrequency())
+  {
+    // printf ("wooo!\n");
+    // printf ("freq = %lf\n", frequency);
+  }
+
+  /// Remember the current counter value as the start point.
+  void start (void) {
+    getTime (currentLo, currentHi);
+  }
+
+  /// Read the counter again and store the time since start().
+  void stop (void) {
+    unsigned long lo, hi;
+    getTime (lo, hi);
+    // Recombine the two 32-bit halves: value = hi * 2^32 + lo.
+    double now = (double) hi * 4294967296.0 + lo;
+    double prev = (double) currentHi * 4294967296.0 + currentLo;
+    timeElapsed = (now - prev) / _frequency;
+  }
+
+  /// Elapsed time computed by the most recent stop().
+  operator double (void) {
+    return timeElapsed;
+  }
+
+private:
+  double timeElapsed;
+  unsigned long currentLo, currentHi;
+  double _frequency;
+};
+
+};
+
+#else
+
+
+#ifdef __SVR4 // Solaris
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#include <stdio.h>
+#endif // __SVR4
+
+#include <time.h>
+
+#if defined(unix) || defined(__linux)
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+
+#ifdef __sgi
+#include <sys/types.h>
+#include <sys/times.h>
+#include <limits.h>
+#endif
+
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+
+#if defined(__BEOS__)
+#include <OS.h>
+#endif
+
+
+namespace HL {
+
+/**
+ * Accumulating wall-clock stopwatch.
+ *
+ * The time source and the representation of a time stamp are chosen by the
+ * preprocessor: times() on SGI, gethrtime() on Solaris,
+ * get_Mac_microseconds() on classic Mac OS, QueryPerformanceCounter() or
+ * GetTickCount() on Windows, system_time() on BeOS, and gettimeofday()
+ * everywhere else.  Each branch supplies _starttime, _elapsedtime, _time()
+ * and _timetosec().
+ *
+ * start()/stop() may be called repeatedly; every stop() adds the interval
+ * since the last start() to the running total, which the conversion to
+ * double reports in seconds.
+ */
+class Timer {
+
+public:
+
+  /// Initializes the timer.
+  Timer (void)
+#if !defined(_WIN32) || defined(__GNUC__)
+    // The MSVC TimeType class zeroes itself in its own constructor.  Every
+    // other representation -- including the plain DWORD used with GCC on
+    // Windows -- has to be cleared here, otherwise stop() accumulates onto
+    // an indeterminate value.
+    : _starttime (0),
+      _elapsedtime (0)
+#endif
+  {
+  }
+
+  /// Start the timer.
+  void start (void) { _starttime = _time(); }
+
+  /// Stop the timer.
+  void stop (void) { _elapsedtime += _time() - _starttime; }
+
+  /// Reset the timer.
+  // NOTE(review): this copies the accumulated total into _starttime and
+  // leaves _elapsedtime untouched -- confirm that this is the intended
+  // meaning of "reset".
+  void reset (void) { _starttime = _elapsedtime; }
+
+#if 0
+  // Set the timer.
+  void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);}
+#endif
+
+  /// Return the number of seconds elapsed.
+  operator double (void) { return _timetosec (_elapsedtime); }
+
+  /// Current reading of the underlying time source, converted to seconds.
+  static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); }
+
+
+private:
+
+  // The _timer variable will be different depending on the OS.
+  // We try to use the best timer available.
+
+#ifdef __sgi
+#define TIMER_FOUND
+
+  long _starttime, _elapsedtime;
+
+  long _time (void) {
+    struct tms t;
+    long ticks = times (&t);
+    return ticks;
+  }
+
+  static double _timetosec (long t) {
+    return ((double) (t) / CLK_TCK);
+  }
+
+  static long _sectotime (double sec) {
+    // Multiply before truncating; casting `sec` first would drop the
+    // fractional seconds.
+    return (long) (sec * CLK_TCK);
+  }
+#endif
+
+#ifdef __SVR4 // Solaris
+#define TIMER_FOUND
+  typedef hrtime_t TimeType;
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    return gethrtime();
+  }
+
+  // Time stamps are divided by 1e9 to obtain seconds.
+  static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); }
+
+  static double _timetosec (TimeType& t) {
+    return ((double) (t) / 1.0e9);
+  }
+#endif // __SVR4
+
+#if defined(MAC) || defined(macintosh)
+#define TIMER_FOUND
+  double _starttime, _elapsedtime;
+
+  double _time (void) {
+    return get_Mac_microseconds();
+  }
+
+  double _timetosec (hrtime_t& t) {
+    return t;
+  }
+#endif // MAC
+
+#ifdef _WIN32
+#define TIMER_FOUND
+
+#ifndef __GNUC__
+  // Wrapper around LARGE_INTEGER holding a QueryPerformanceCounter value.
+  class TimeType {
+  public:
+    TimeType (void)
+    {
+      largeInt.QuadPart = 0;
+    }
+    operator double& (void) { return (double&) largeInt.QuadPart; }
+    operator LARGE_INTEGER& (void) { return largeInt; }
+    double timeToSec (void) {
+      return (double) largeInt.QuadPart / getFreq();
+    }
+  private:
+    double getFreq (void) {
+      QueryPerformanceFrequency (&freq);
+      return (double) freq.QuadPart;
+    }
+
+    LARGE_INTEGER largeInt;
+    LARGE_INTEGER freq;
+  };
+
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    TimeType t;
+    int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t));
+    assert (r);
+    return t;
+  }
+
+  static double _timetosec (TimeType& t) {
+    return t.timeToSec();
+  }
+#else
+  typedef DWORD TimeType;
+  DWORD _starttime, _elapsedtime;
+  static DWORD _time (void) {
+    return GetTickCount();
+  }
+
+  // GetTickCount() counts milliseconds, so 1000 ticks make one second.
+  static double _timetosec (DWORD& t) {
+    return (double) t / 1000.0;
+  }
+  static unsigned long _sectotime (double sec) {
+    return (unsigned long)(sec * 1000.0);
+  }
+#endif
+#endif // _WIN32
+
+
+#ifdef __BEOS__
+#define TIMER_FOUND
+  bigtime_t _starttime, _elapsedtime;
+  bigtime_t _time(void) {
+    return system_time();
+  }
+  double _timetosec (bigtime_t& t) {
+    return (double) t / 1000000.0;
+  }
+
+  bigtime_t _sectotime (double sec) {
+    return (bigtime_t)(sec * 1000000.0);
+  }
+#endif // __BEOS__
+
+#ifndef TIMER_FOUND
+
+  // Fallback: microseconds since the epoch from gettimeofday().
+  typedef long TimeType;
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    struct timeval t;
+    gettimeofday (&t, NULL);
+    return t.tv_sec * 1000000 + t.tv_usec;
+  }
+
+  static double _timetosec (TimeType t) {
+    return ((double) (t) / 1000000.0);
+  }
+
+  static TimeType _sectotime (double sec) {
+    return (TimeType) (sec * 1000000.0);
+  }
+
+#endif // TIMER_FOUND
+
+#undef TIMER_FOUND
+
+};
+
+
+#ifdef __SVR4 // Solaris
+// Solaris-only Timer variant whose _time() returns gethrvtime() instead of
+// gethrtime().
+// NOTE(review): Timer::_time() is a private, non-virtual static member, and
+// the inherited start()/stop() call it from inside Timer.  The _time()
+// declared here therefore does not appear to be used by them -- confirm
+// whether this class actually measures anything different from Timer.
+class VirtualTimer : public Timer {
+public:
+ hrtime_t _time (void) {
+ return gethrvtime();
+ }
+};
+#endif
+
+}
+
+#endif
+
+#endif
diff --git a/src/chattymalloc.c b/src/chattymalloc.c
new file mode 100644
index 0000000..54708d6
--- /dev/null
+++ b/src/chattymalloc.c
@@ -0,0 +1,161 @@
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static char tmpbuff[1024];
+static unsigned long tmppos = 0;
+static unsigned long tmpallocs = 0;
+
+static int out = -1;
+static int prevent_recursion = 0;
+
+/*=========================================================
+ * * interception points
+ * */
+
+static void * (*myfn_malloc)(size_t size);
+static void (*myfn_free)(void* ptr);
+static void * (*myfn_calloc)(size_t nmemb, size_t size);
+static void * (*myfn_realloc)(void* ptr, size_t size);
+static void * (*myfn_memalign)(size_t alignment, size_t size);
+
+/*
+ * printf-style writer for the trace file descriptor `out`.
+ *
+ * The prevent_recursion flag is set for the duration of the write; a call
+ * that arrives while the flag is set returns without writing anything.
+ */
+static void write_output(const char* fmt, ...)
+{
+    va_list ap;
+
+    if (prevent_recursion)
+        return;
+
+    prevent_recursion = 1;
+
+    /* lockf(out, F_LOCK, 0); */
+
+    va_start(ap, fmt);
+    vdprintf(out, fmt, ap);
+    va_end(ap);
+
+    /* lockf(out, F_ULOCK, 0); */
+
+    prevent_recursion = 0;
+}
+
+/*
+ * One-time setup: create/truncate "chattymalloc.data" in the current
+ * directory (mode rw-r--r--) and look up the next definitions of malloc,
+ * free, calloc, realloc and memalign with dlsym(RTLD_NEXT, ...).
+ *
+ * Any failure is reported on stderr and terminates the process with
+ * status 1.
+ */
+static void init()
+{
+    out = open("chattymalloc.data", O_WRONLY | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    if (out == -1)
+    {
+        fprintf(stderr, "failed to open output file with %d\n", errno);
+        exit(1);
+    }
+
+    myfn_malloc = dlsym(RTLD_NEXT, "malloc");
+    myfn_free = dlsym(RTLD_NEXT, "free");
+    myfn_calloc = dlsym(RTLD_NEXT, "calloc");
+    myfn_realloc = dlsym(RTLD_NEXT, "realloc");
+    myfn_memalign = dlsym(RTLD_NEXT, "memalign");
+
+    /* Everything resolved: nothing more to do. */
+    if (myfn_malloc && myfn_free && myfn_calloc && myfn_realloc && myfn_memalign)
+        return;
+
+    fprintf(stderr, "Error in `dlsym`: %s\n", dlerror());
+    exit(1);
+}
+
+/*
+ * Interposed malloc.
+ *
+ * The first call runs init().  Allocations requested while init() is still
+ * running (the real malloc is not known yet) are served from the static
+ * bump buffer tmpbuff; those blocks are never returned to the real
+ * allocator.  All later calls are forwarded to the real malloc and logged
+ * as "m <size> <ptr>".
+ */
+void *malloc(size_t size)
+{
+    static int initializing = 0;
+
+    if (myfn_malloc == NULL)
+    {
+        if (!initializing)
+        {
+            initializing = 1;
+            init();
+            initializing = 0;
+        }
+        else
+        {
+            /* Keep successive offsets a multiple of the usual malloc
+             * alignment (assumes tmpbuff itself starts suitably aligned). */
+            const size_t align = 2 * sizeof(size_t);
+            /* Written as a subtraction so a huge `size` cannot wrap. */
+            const size_t avail = sizeof(tmpbuff) - tmppos;
+
+            if (size < avail)
+            {
+                void *retptr = tmpbuff + tmppos;
+                size_t padded = (size + align - 1) & ~(align - 1);
+                if (padded > avail)
+                    padded = avail;
+                tmppos += padded;
+                ++tmpallocs;
+                return retptr;
+            }
+            else
+            {
+                /* tmppos/tmpallocs are unsigned long, hence %lu. */
+                fprintf(stderr, "%lu in %lu allocs\n", tmppos, tmpallocs);
+                fprintf(stderr, "jcheck: too much memory requested during initialisation - increase tmpbuff size\n");
+                exit(1);
+            }
+        }
+    }
+
+    void *ptr = myfn_malloc(size);
+    write_output("m %zu %p\n", size, ptr);
+    return ptr;
+}
+
+/*
+ * Interposed free.
+ *
+ * Runs init() if the real allocator has not been resolved yet.  Pointers
+ * that fall inside the used part of tmpbuff were handed out by the
+ * bootstrap allocator in malloc() and are ignored; everything else is
+ * logged as "f <ptr>" and passed to the real free.
+ */
+void free(void *ptr)
+{
+    char *p = ptr;
+
+    // something wrong if we call free before one of the allocators!
+    if (myfn_malloc == NULL)
+        init();
+
+    if (p >= tmpbuff && p <= tmpbuff + tmppos)
+        return;
+
+    write_output("f %p\n", ptr);
+    myfn_free(ptr);
+}
+
+/*
+ * Interposed realloc.
+ *
+ * Normal case: forward to the real realloc and log
+ * "r <old ptr> <size> <new ptr>".
+ *
+ * Two cases are emulated with malloc + copy + free instead:
+ *   - the real realloc has not been resolved yet, and
+ *   - `ptr` points into tmpbuff, i.e. it was handed out by the bootstrap
+ *     allocator in malloc().  Such a pointer must never reach the real
+ *     realloc because the real allocator does not own it.
+ * The emulated path shows up in the trace as an "m" record (written by
+ * malloc) rather than an "r" record.
+ */
+void* realloc(void *ptr, size_t size)
+{
+    char *old = ptr;
+    int from_tmpbuff = old != NULL && old >= tmpbuff && old < tmpbuff + sizeof(tmpbuff);
+
+    if (myfn_realloc == NULL || from_tmpbuff)
+    {
+        void *nptr = malloc(size);
+        if (nptr && ptr)
+        {
+            size_t ncopy = size;
+            if (from_tmpbuff)
+            {
+                /* The old block's size is unknown; at least never read
+                 * beyond the end of tmpbuff. */
+                size_t avail = (size_t)(tmpbuff + sizeof(tmpbuff) - old);
+                if (ncopy > avail)
+                    ncopy = avail;
+            }
+            memmove(nptr, ptr, ncopy);
+            free(ptr);
+        }
+        return nptr;
+    }
+
+    void* nptr = myfn_realloc(ptr, size);
+    write_output("r %p %zu %p\n", ptr, size, nptr);
+    return nptr;
+}
+
+/*
+ * Interposed calloc.
+ *
+ * Before the real calloc has been resolved the request is emulated with
+ * malloc + memset; afterwards it is forwarded to the real calloc and logged
+ * as "c <nmemb> <size> <ptr>".
+ *
+ * In the emulated path a request whose byte count nmemb * size does not fit
+ * in size_t fails with NULL and errno set to ENOMEM instead of silently
+ * allocating a too-small block.
+ */
+void* calloc(size_t nmemb, size_t size)
+{
+    if (myfn_calloc == NULL)
+    {
+        size_t total = nmemb * size;
+        void *ptr;
+
+        /* Detect wrap-around of the multiplication. */
+        if (size != 0 && total / size != nmemb)
+        {
+            errno = ENOMEM;
+            return NULL;
+        }
+
+        ptr = malloc(total);
+        if (ptr)
+            memset(ptr, 0, total);
+        return ptr;
+    }
+
+    void* ptr = myfn_calloc(nmemb, size);
+    write_output("c %zu %zu %p\n", nmemb, size, ptr);
+    return ptr;
+}
+
+/*
+ * Interposed memalign: forwards to the real memalign and logs
+ * "mm <alignment> <size> <ptr>".
+ *
+ * There is no bootstrap fallback: if the real memalign has not been
+ * resolved yet, the process is terminated with status 1.
+ */
+void* memalign(size_t alignment, size_t size)
+{
+    void *ptr;
+
+    if (!myfn_memalign)
+    {
+        fprintf(stderr, "called memalign before or during init");
+        exit(1);
+    }
+
+    ptr = myfn_memalign(alignment, size);
+    write_output("mm %zu %zu %p\n", alignment, size, ptr);
+    return ptr;
+}
diff --git a/src/chattyparser.py b/src/chattyparser.py
new file mode 100644
index 0000000..3406b44
--- /dev/null
+++ b/src/chattyparser.py
@@ -0,0 +1,158 @@
+import re
+import matplotlib.pyplot as plt
+import numpy as np
+
+ptr = "(?:0x)?(?P<ptr>(?:\w+)|(?:\(nil\)))"
+size = "(?P<size>\d+)"
+
+malloc_re = re.compile("^m {} {}$".format(size, ptr))
+free_re = re.compile("^f {}$".format(ptr))
+calloc_re = re.compile("^c (?P<nmemb>\d+) {} {}$".format(size, ptr))
+realloc_re = re.compile("^r {} {} {}$".format(ptr, size, ptr.replace("ptr", "nptr")))
+memalign_re = re.compile("^mm (?P<alignment>\d+) {} {}$".format(size, ptr))
+
+def record_allocation(hist, total_size, allocations, ptr, size, coll_size, req_size, nohist, optr=None, add=True):
+ size = int(size)
+ if add:
+ if optr and optr in allocations:
+ size -= allocations[optr]
+ del(allocations[optr])
+
+ allocations[ptr] = size
+ if not nohist:
+ hist[size] = hist.get(size, 0) + 1
+
+ if type(total_size[-1]) != int or type(size) != int:
+ print("invalid type", type(total_size[-1]), type(size))
+ return
+
+ if coll_size:
+ if not req_size or size == req_size:
+ total_size.append(total_size[-1] + size)
+ elif req_size:
+ total_size.append(total_size[-1])
+
+ elif ptr != "(nil)" and ptr in allocations:
+ size = allocations[ptr]
+ if coll_size:
+ if not req_size or size == req_size:
+ total_size.append(total_size[-1] - size)
+ elif req_size:
+ total_size.append(total_size[-1])
+
+ del(allocations[ptr])
+ elif coll_size:
+ total_size.append(total_size[-1])
+
+def parse(path="chattymalloc.data", coll_size=True, req_size=None, nohist=False):
+ tmalloc, tcalloc, trealloc, tfree, tmemalign= 0, 0, 0, 0, 0
+ allocations = {}
+ requested_size = [0]
+ hist = {}
+ ln = 0
+
+ with open(path, "r") as f:
+ for i, l in enumerate(f.readlines()):
+ ln += 1
+ res = malloc_re.match(l)
+ if res != None:
+ res = res.groupdict()
+ record_allocation(hist, requested_size, allocations, res["ptr"],
+ res["size"], coll_size, req_size, nohist)
+ tmalloc += 1
+ continue
+
+ res = free_re.match(l)
+ if res != None:
+ res = res.groupdict()
+ record_allocation(hist, requested_size, allocations, res["ptr"],
+ 0, coll_size, req_size, nohist, add=False)
+ tfree +=1
+ continue
+
+ res = calloc_re.match(l)
+ if res != None:
+ res = res.groupdict()
+ size = int(res["nmemb"]) * int(res["size"])
+ record_allocation(hist, requested_size, allocations, res["ptr"],
+ size, coll_size, req_size, nohist)
+ tcalloc += 1
+ continue
+
+ res = realloc_re.match(l)
+ if res != None:
+ res = res.groupdict()
+ record_allocation(hist, requested_size, allocations, res["nptr"],
+ res["size"], coll_size, req_size, nohist, optr=res["ptr"])
+ trealloc += 1
+ continue
+
+ res = memalign_re.match(l)
+ if res != None:
+ res = res.groupdict()
+ record_allocation(hist, requested_size, allocations, res["ptr"],
+ res["size"], coll_size, req_size, nohist)
+ tmemalign += 1
+ continue
+
+ print("\ninvalid line at", ln, ":", l)
+ calls = {"malloc": tmalloc, "free": tfree, "calloc": tcalloc, "realloc": trealloc, "memalign": tmemalign}
+ return hist, calls, requested_size
+
+def plot(path):
+ hist, calls, _ = parse(req_size=None)
+ plot_hist_ascii(path+".hist", hist, calls)
+ top5 = [t[1] for t in sorted([(n, s) for s, n in hist.items()])[-5:]]
+
+ del(hist)
+ del(calls)
+ plot_profile(path+".profile.png", top5)
+
+
+def plot_profile(path, top5):
+ _, calls, total_size = parse(nohist=True)
+ x_vals = range(0, sum(calls.values()) + 1)
+
+ plt.plot(x_vals, total_size, marker='', linestyle='-', label="Total requested")
+
+ for s in top5:
+ _, calls, total_size = parse(nohist=True, req_size=s)
+ plt.plot(x_vals, total_size, label=s)
+
+ plt.legend()
+ plt.xlabel("Allocations")
+ plt.ylabel("mem in kb")
+ plt.title("Memusage profile")
+ plt.savefig(path)
+ plt.clf()
+
+def plot_hist_ascii(path, hist, calls):
+ bins = {}
+ for size in sorted(hist):
+ bin = int(size / 16)
+ bins[bin] = bins.get(bin, 0) + hist[size]
+
+ total = sum(calls.values()) - calls["free"]
+ with open(path, "w") as f:
+ print("Total function calls:", total, file=f)
+ print("malloc:", calls["malloc"], file=f)
+ print("calloc:", calls["calloc"], file=f)
+ print("realloc:", calls["realloc"], file=f)
+ print("free:", calls["free"], file=f)
+ print("memalign:", calls["memalign"], file=f)
+ print(file=f)
+
+ print("< 1024", sum([n for s,n in hist.items() if s < 1024]), file=f)
+ print("< 4096", sum([n for s,n in hist.items() if s < 4096]), file=f)
+ print(file=f)
+
+ print("Histogram of sizes:", file=f)
+ sbins = sorted(bins)
+ binmaxlength = str(len(str(sbins[-1])) + 1)
+ amountmaxlength = str(len(str(sorted(bins.values())[-1])))
+ for b in sbins:
+ perc = bins[b]/total*100
+ binsize = "{:<" + binmaxlength + "} - {:>" + binmaxlength + "}"
+ print(binsize.format((b)*16, (b+1)*16-1), end=" ", file=f)
+ amount = "{:<" + amountmaxlength + "} {:.2f}% {}"
+ print(amount.format(bins[b], perc, '*'*int(perc/2)), file=f)
diff --git a/src/larson.py b/src/larson.py
new file mode 100644
index 0000000..713818b
--- /dev/null
+++ b/src/larson.py
@@ -0,0 +1,54 @@
+import multiprocessing
+import re
+
+from src.benchmark import Benchmark
+
+throughput_re = re.compile("^Throughput =\s*(?P<throughput>\d+) operations per second.$")
+
+class Benchmark_Larson( Benchmark ):
+ def __init__(self):
+ self.name = "larson"
+ self.descrition = """This benchmark is courtesy of Paul Larson at Microsoft
+ Research. It simulates a server: each thread allocates
+ and deallocates objects, and then transfers some objects
+ (randomly selected) to other threads to be freed."""
+
+ self.cmd = "build/larson{binary_suffix} 1 8 {maxsize} 1000 50000 1 {threads}"
+
+ self.args = {
+ "maxsize" : [8, 32, 64, 128, 256, 512, 1024],
+ "threads" : range(1, multiprocessing.cpu_count() * 2 + 1)
+ }
+
+ self.requirements = ["build/larson"]
+ super().__init__()
+
+ def process_output(self, result, stdout, stderr, target, perm, verbose):
+ for l in stdout.splitlines():
+ res = throughput_re.match(l)
+ if res:
+ result["throughput"] = int(res.group("throughput"))
+ return
+
+ def summary(self, sumdir):
+ # Plot threads->throughput and maxsize->throughput
+ self.plot_fixed_arg("{throughput}/1000000",
+ ylabel="'MOPS/s'",
+ title = "'Larson: ' + arg + ' ' + str(arg_value)",
+ filepostfix = "throughput",
+ sumdir=sumdir)
+
+ self.plot_fixed_arg("({L1-dcache-load-misses}/{L1-dcache-loads})*100",
+ ylabel="'l1 cache misses in %'",
+ title = "'Larson cache misses: ' + arg + ' ' + str(arg_value)",
+ filepostfix = "cachemisses",
+ sumdir=sumdir)
+
+ # Memusage
+ self.plot_fixed_arg("int({VmHWM})",
+ ylabel='"VmHWM in kB"',
+ title= '"Loop Memusage: " + arg + " " + str(arg_value)',
+ filepostfix="memusage",
+ sumdir=sumdir)
+
+larson = Benchmark_Larson()
diff --git a/src/print_status_on_exit.c b/src/print_status_on_exit.c
new file mode 100644
index 0000000..1d72384
--- /dev/null
+++ b/src/print_status_on_exit.c
@@ -0,0 +1,37 @@
+#define _GNU_SOURCE
+#include <dlfcn.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * atexit handler: append the contents of /proc/self/status to the file
+ * "status" in the current working directory.
+ *
+ * This is best effort.  If either file cannot be opened a message is
+ * printed with perror() and the handler simply returns; it must not call
+ * exit() again because it is itself running as part of exit().
+ */
+static void print_status(void)
+{
+    char buf[4096];
+    FILE* status;
+    FILE* output;
+
+    status = fopen("/proc/self/status", "r");
+    if (status == NULL)
+    {
+        perror("fopen status");
+        return;
+    }
+
+    output = fopen("status", "a");
+    if (output == NULL)
+    {
+        perror("fopen output file");
+        fclose(status);
+        return;
+    }
+
+    /* Looping on fgets itself avoids re-printing the last line, which a
+     * `while (!feof())` loop does after the final, failing read. */
+    while (fgets(buf, sizeof buf, status) != NULL)
+        fputs(buf, output);
+
+    fclose(status);
+    if (fclose(output) != 0)
+        perror("fclose output file");
+}
+
+/* Library constructor: arranges for print_status() to run at process exit. */
+static void __attribute__((constructor)) init()
+{
+    void (*const handler)(void) = print_status;
+
+    atexit(handler);
+}
+
diff --git a/src/trace_run.c b/src/trace_run.c
new file mode 100644
index 0000000..604d01e
--- /dev/null
+++ b/src/trace_run.c
@@ -0,0 +1,750 @@
+#define _LARGEFILE64_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #include "malloc.h"
+#include <malloc.h>
+
+// #include "mtrace.h"
+/* Codes for the simulator/workload programs. Copied from mtrace.h. */
+#define C_NOP 0
+#define C_DONE 1
+#define C_MALLOC 2
+#define C_CALLOC 3
+#define C_REALLOC 4
+#define C_FREE 5
+#define C_SYNC_W 6
+#define C_SYNC_R 7
+#define C_ALLOC_PTRS 8
+#define C_ALLOC_SYNCS 9
+#define C_NTHREADS 10
+#define C_START_THREAD 11
+#define C_MEMALIGN 12
+#define C_VALLOC 13
+#define C_PVALLOC 14
+#define C_POSIX_MEMALIGN 15
+
+#if UINTPTR_MAX == 0xffffffffffffffff
+
+#define ticks_t int64_t
+/* Setting quick_run to 1 allows the simulator to model
+ only the allocation and deallocation accounting via
+ atomic_rss. The actual allocations are skipped. This
+ mode is useful to verify the workload file. */
+#define quick_run 0
+
+/*
+ * Read the time-stamp counter at the START of a timed region: `cpuid` is
+ * executed first, then `rdtscp`, and the EDX:EAX pair is combined into one
+ * 64-bit tick count.
+ *
+ * rdtscp also overwrites ECX (see the Intel SDM entry for RDTSCP), so RCX
+ * is listed as clobbered; without that the compiler may keep a live value
+ * in the register across the asm statement.
+ */
+static __inline__ ticks_t rdtsc_s(void)
+{
+  unsigned a, d;
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  asm volatile("rdtscp" : "=a" (a), "=d" (d) : : "%rcx");
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+/*
+ * Read the time-stamp counter at the END of a timed region: `rdtscp` is
+ * executed first, then `cpuid`, and the EDX:EAX pair is combined into one
+ * 64-bit tick count.
+ *
+ * rdtscp also overwrites ECX (see the Intel SDM entry for RDTSCP), so RCX
+ * is listed as clobbered; without that the compiler may keep a live value
+ * in the register across the asm statement.
+ */
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d) : : "%rcx");
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+#else
+
+#define ticks_t int32_t
+
+/* 32-bit build: read the time-stamp counter at the start of a timed region
+   (`cpuid` followed by `rdtsc`).
+   NOTE(review): the high word is shifted by 16, not 32, so it overlaps the
+   upper half of the low word, and ticks_t is only int32_t in this branch.
+   The returned value does not look like a usable counter -- a proper fix
+   needs a wider ticks_t, which is defined outside this function. */
+static __inline__ ticks_t rdtsc_s(void)
+{
+ unsigned a, d;
+ asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+ asm volatile("rdtsc" : "=a" (a), "=d" (d));
+ return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+/*
+ * 32-bit build: read the time-stamp counter at the end of a timed region
+ * (`rdtscp` followed by `cpuid`).
+ *
+ * rdtscp also overwrites ECX (see the Intel SDM entry for RDTSCP), so the
+ * register is listed as clobbered.
+ *
+ * NOTE(review): as in the 32-bit rdtsc_s, the high word is shifted by 16
+ * rather than 32 and ticks_t is only int32_t here; the combined value is
+ * kept as-is because widening ticks_t has to happen outside this function.
+ */
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d) : : "%cx");
+  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+  return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+#endif
+
+/*
+ * Return e - s in microseconds.
+ *
+ * Both fields of struct timeval are signed, so the "borrow" case
+ * (e.tv_usec < s.tv_usec) needs no special handling: adding 1000000 to the
+ * microseconds while taking one second away yields the same sum.
+ */
+static ticks_t diff_timeval (struct timeval e, struct timeval s)
+{
+  ticks_t usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000;
+
+  return usec;
+}
+
+#if 1
+#define Q1
+#define Q2
+#else
+pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER;
+#define Q1 pthread_mutex_lock(&genmutex)
+#define Q2 pthread_mutex_unlock(&genmutex)
+#endif
+
+pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER;
+#define NCBUF 10
+static char cbuf[NCBUF][30];
+static int ci = 0;
+
+/*
+ * Format x in decimal with a ',' between every group of three digits
+ * (e.g. 1234567 -> "1,234,567", -123 -> "-123").
+ *
+ * The result lives in one of NCBUF static buffers that are handed out
+ * round-robin (slot selection is protected by cmutex), so up to NCBUF
+ * results may be in use at once, e.g. as arguments of a single printf.
+ * The caller must not free the returned pointer.
+ */
+char *comma(ticks_t x)
+{
+  char buf[30], *bs, *bd;
+  int i, idx;
+
+  pthread_mutex_lock(&cmutex);
+  ci = (ci + 1) % NCBUF;
+  idx = ci;
+  pthread_mutex_unlock(&cmutex);
+  bs = buf;
+  bd = cbuf[idx];
+
+  snprintf(buf, sizeof buf, "%lld", (long long int)x);
+
+  /* Copy a leading minus sign first so that it is not counted as a digit;
+     otherwise -123 would come out as "-,123". */
+  if (*bs == '-')
+    *bd++ = *bs++;
+
+  /* i = number of digits still to be copied. */
+  i = strlen(bs);
+  while (*bs)
+    {
+      *bd++ = *bs++;
+      i--;
+      if (i % 3 == 0 && *bs)
+        *bd++ = ',';
+    }
+  *bd = 0;
+  return cbuf[idx];
+}
+
+static volatile void **ptrs;
+static volatile size_t *sizes;
+static size_t n_ptrs;
+static volatile char *syncs;
+static pthread_mutex_t *mutexes;
+static pthread_cond_t *conds;
+static size_t n_syncs;
+
+static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
+ticks_t malloc_time = 0, malloc_count = 0;
+ticks_t calloc_time = 0, calloc_count = 0;
+ticks_t realloc_time = 0, realloc_count = 0;
+ticks_t free_time = 0, free_count = 0;
+
+size_t ideal_rss = 0;
+size_t max_ideal_rss = 0;
+static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Add delta (which may be negative) to ideal_rss and raise max_ideal_rss
+ * if a new maximum was reached.  Both updates happen under rss_mutex.
+ */
+void atomic_rss (ssize_t delta)
+{
+  pthread_mutex_lock (&rss_mutex);
+  ideal_rss = ideal_rss + delta;
+  max_ideal_rss = (ideal_rss > max_ideal_rss) ? ideal_rss : max_ideal_rss;
+  pthread_mutex_unlock (&rss_mutex);
+}
+
+pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;
+int threads_done = 0;
+
+//#define dprintf printf
+#define dprintf(...) (void)1
+
+//#define mprintf printf
+//#define MDEBUG 1
+#define mprintf(...) (void)1
+
+#define myabort() my_abort_2(thrc, __LINE__)
+/* Report the aborting thread and source line on stderr, then abort().
+   Normally reached through the myabort() macro. */
+void
+my_abort_2 (pthread_t thrc, int line)
+{
+  void *thread_tag = (void *)thrc;
+
+  fprintf(stderr, "Abort thread %p at line %d\n", thread_tag, line);
+  abort();
+}
+
+/*------------------------------------------------------------*/
+/* Wrapper around I/O routines */
+
+int io_fd;
+
+#define IOSIZE 65536
+#define IOMIN 4096
+
+static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+typedef struct {
+ unsigned char buf[IOSIZE];
+ size_t incr;
+ size_t max_incr;
+ size_t buf_base;
+ size_t buf_idx;
+ int saw_eof;
+} IOPerThreadType;
+
+IOPerThreadType main_io;
+IOPerThreadType *thread_io;
+
+/*
+ * (Re)fill io's buffer from io_fd, starting at file_offset.
+ *
+ * incr is the number of bytes to request; it is clamped to
+ * [IOMIN, IOSIZE].  The seek and the read share io_fd with every other
+ * thread and are therefore done under io_mutex.
+ *
+ * io->saw_eof is set when no data could be obtained at all: the seek
+ * failed, the read failed, or the read returned 0 bytes (end of file).  A
+ * short but non-empty read is accepted.
+ */
+void
+io_init (IOPerThreadType *io, size_t file_offset, int incr)
+{
+  ssize_t got = -1;
+
+  if (incr > IOSIZE)
+    incr = IOSIZE;
+  if (incr < IOMIN)
+    incr = IOMIN;
+
+  io->buf_base = file_offset;
+  io->buf_idx = 0;
+  io->incr = incr;
+
+  pthread_mutex_lock (&io_mutex);
+  if (lseek64 (io_fd, io->buf_base, SEEK_SET) >= 0)
+    got = read (io_fd, io->buf, incr);
+  // short read OK, the eof is just to prevent runaways from bad data.
+  // A read of 0 bytes leaves the old buffer contents in place, so it must
+  // count as EOF too or the stale data would be replayed.
+  io->saw_eof = (got <= 0) ? 1 : 0;
+  pthread_mutex_unlock (&io_mutex);
+}
+
+/* Return the next byte of the stream and advance past it, refilling the
+   buffer first if it has been used up.  Returns 0xff once saw_eof is set. */
+unsigned char
+io_read (IOPerThreadType *io)
+{
+  unsigned char byte;
+
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+
+  if (io->saw_eof)
+    return 0xff;
+
+  byte = io->buf [io->buf_idx];
+  io->buf_idx++;
+  return byte;
+}
+
+/* Like io_read, but leaves the read position where it is: returns the next
+   byte of the stream (refilling the buffer first if it has been used up),
+   or 0xff once saw_eof is set. */
+unsigned char
+io_peek (IOPerThreadType *io)
+{
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+
+  return io->saw_eof ? 0xff : io->buf [io->buf_idx];
+}
+
+/* File offset of the next byte io_read would return: the offset at which
+   the buffer was loaded plus the index within it. */
+size_t
+io_pos (IOPerThreadType *io)
+{
+  const size_t within_buffer = io->buf_idx;
+
+  return io->buf_base + within_buffer;
+}
+
+/*------------------------------------------------------------*/
+
+/* Fill the first `count` bytes at ptr with 0x11.  Does nothing when ptr is
+   NULL or count is not positive. */
+static void
+wmem (volatile void *ptr, int count)
+{
+  char *p = (char *)ptr;
+
+  if (p == NULL || count <= 0)
+    return;
+
+  memset (p, 0x11, (size_t) count);
+}
+#define xwmem(a,b)
+
+/*
+ * Decode one variable-length unsigned integer from the stream.
+ *
+ * Each byte contributes its low 7 bits, most significant group first; a
+ * set high bit (0x80) means another byte follows.
+ *
+ * Once the stream has hit EOF, io_read returns 0xff forever, which would
+ * look like an endless chain of continuation bytes.  Decoding therefore
+ * stops as soon as io->saw_eof is set and returns what has been gathered so
+ * far; callers detect the condition through io->saw_eof.
+ */
+static size_t get_int (IOPerThreadType *io)
+{
+  size_t rv = 0;
+  while (1)
+    {
+      unsigned char c = io_read (io);
+      if (io->saw_eof)
+        return rv;
+      rv |= (c & 0x7f);
+      if (c & 0x80)
+        rv <<= 7;
+      else
+        return rv;
+    }
+}
+
+/*
+ * Overwrite the block in slot idx (all sizes[idx] bytes of it) with a
+ * recognisable pattern: 0x22 everywhere, except that the byte at offset
+ * 8*k + 1 holds k.  Does nothing if the slot is empty.
+ */
+static void free_wipe (size_t idx)
+{
+  char *block = (char *)ptrs[idx];
+  size_t len, off;
+
+  if (!block)
+    return;
+
+  len = sizes[idx];
+  for (off = 0; off < len; off++)
+    block[off] = (off % 8 == 1) ? off / 8 : 0x22;
+}
+
+/* Body of every workload thread.  my_data_v is the thread's own
+   IOPerThreadType; the thread reads opcodes from it and executes them
+   (malloc/calloc/realloc/free/memalign on the shared ptrs[] table, and the
+   SYNC_W / SYNC_R handshakes) until it reads C_DONE.  Tick counts spent in
+   each allocator call are collected in local counters and added to the
+   global totals when the thread finishes.  Always returns NULL; aborts the
+   process on EOF, out-of-range indices or unknown opcodes.  */
+static void *
+thread_common (void *my_data_v)
+{
+ pthread_t thrc = pthread_self ();
+ size_t p1, p2, sz, sz2;
+ IOPerThreadType *io = (IOPerThreadType *)my_data_v;
+ ticks_t my_malloc_time = 0, my_malloc_count = 0;
+ ticks_t my_calloc_time = 0, my_calloc_count = 0;
+ ticks_t my_realloc_time = 0, my_realloc_count = 0;
+ ticks_t my_free_time = 0, my_free_count = 0;
+ ticks_t stime, etime;
+ /* Position of this thread's io block within the thread_io array; only
+ used in the "unsupported op" message. */
+ int thread_idx = io - thread_io;
+#ifdef MDEBUG
+ volatile void *tmp;
+#endif
+
+ while (1)
+ {
+ /* Peek first so that the opcode can still be reported by the default
+ case after io_read has consumed it. */
+ unsigned char this_op = io_peek (io);
+ if (io->saw_eof)
+ myabort();
+ dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io), io_peek (io));
+ switch (io_read (io))
+ {
+ case C_NOP:
+ break;
+
+ case C_DONE:
+ dprintf("op %p:%ld DONE\n", (void *)thrc, io_pos (io));
+ /* Publish this thread's counters and announce completion. */
+ pthread_mutex_lock (&stat_mutex);
+ malloc_time += my_malloc_time;
+ calloc_time += my_calloc_time;
+ realloc_time += my_realloc_time;
+ free_time += my_free_time;
+ malloc_count += my_malloc_count;
+ calloc_count += my_calloc_count;
+ realloc_count += my_realloc_count;
+ free_count += my_free_count;
+ threads_done ++;
+ pthread_mutex_unlock (&stat_mutex);
+ /* main() takes stop_mutex before it starts any thread, so this
+ lock/unlock pair blocks until main() lets go of it (presumably
+ after it has collected its measurements -- that code is not in
+ this function). */
+ pthread_mutex_lock(&stop_mutex);
+ pthread_mutex_unlock(&stop_mutex);
+ return NULL;
+
+ case C_MEMALIGN:
+ /* Operands: result slot, alignment, size. */
+ p2 = get_int (io);
+ sz2 = get_int (io);
+ sz = get_int (io);
+ dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz);
+ /* we can't force memalign to return NULL (fail), so just skip it. */
+ if (p2 == 0)
+ break;
+ /* NOTE(review): an index equal to n_ptrs passes this check; confirm
+ whether the table really has n_ptrs + 1 usable slots. */
+ if (p2 > n_ptrs)
+ myabort();
+ stime = rdtsc_s();
+ Q1;
+ /* A slot that is still occupied is released first; this free is
+ inside the timed region. */
+ if (ptrs[p2])
+ {
+ if (!quick_run)
+ free ((void *)ptrs[p2]);
+ atomic_rss (-sizes[p2]);
+ }
+ if (!quick_run)
+ ptrs[p2] = memalign (sz2, sz);
+ else
+ ptrs[p2] = (void *)p2;
+ /* Verify the alignment matches what is expected. */
+ if (((size_t)ptrs[p2] & (sz2 - 1)) != 0)
+ myabort ();
+ sizes[p2] = sz;
+ mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz);
+ Q2;
+ etime = rdtsc_e();
+ if (ptrs[p2] != NULL)
+ atomic_rss (sz);
+ if (etime < stime)
+ {
+ printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+ }
+ /* memalign is accounted together with malloc. */
+ my_malloc_time += etime - stime;
+ my_malloc_count ++;
+ /* Touch the new block (outside the timed region). */
+ if (!quick_run)
+ wmem(ptrs[p2], sz);
+ break;
+
+ case C_MALLOC:
+ /* Operands: result slot, size. */
+ p2 = get_int (io);
+ sz = get_int (io);
+ dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+ /* we can't force malloc to return NULL (fail), so just skip it. */
+ if (p2 == 0)
+ break;
+ if (p2 > n_ptrs)
+ myabort();
+ stime = rdtsc_s();
+ Q1;
+ if (ptrs[p2])
+ {
+ if (!quick_run)
+ free ((void *)ptrs[p2]);
+ atomic_rss (-sizes[p2]);
+ }
+ if (!quick_run)
+ ptrs[p2] = malloc (sz);
+ else
+ ptrs[p2] = (void *)p2;
+ sizes[p2] = sz;
+ mprintf("%p = malloc(%lx)\n", ptrs[p2], sz);
+ Q2;
+ etime = rdtsc_e();
+ if (ptrs[p2] != NULL)
+ atomic_rss (sz);
+ if (etime < stime)
+ {
+ printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+ }
+ my_malloc_time += etime - stime;
+ my_malloc_count ++;
+ if (!quick_run)
+ wmem(ptrs[p2], sz);
+ break;
+
+ case C_CALLOC:
+ /* Operands: result slot, size. */
+ p2 = get_int (io);
+ sz = get_int (io);
+ dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+ /* we can't force calloc to return NULL (fail), so just skip it. */
+ if (p2 == 0)
+ break;
+ if (p2 > n_ptrs)
+ myabort();
+ /* Unlike the malloc/memalign cases, releasing an occupied slot
+ happens before the timer is started. */
+ if (ptrs[p2])
+ {
+ if (!quick_run)
+ free ((void *)ptrs[p2]);
+ atomic_rss (-sizes[p2]);
+ }
+ stime = rdtsc_s();
+ Q1;
+ if (!quick_run)
+ ptrs[p2] = calloc (sz, 1);
+ else
+ ptrs[p2] = (void *)p2;
+ sizes[p2] = sz;
+ mprintf("%p = calloc(%lx)\n", ptrs[p2], sz);
+ Q2;
+ if (ptrs[p2])
+ atomic_rss (sz);
+ my_calloc_time += rdtsc_e() - stime;
+ my_calloc_count ++;
+ if (!quick_run)
+ wmem(ptrs[p2], sz);
+ break;
+
+ case C_REALLOC:
+ /* Operands: result slot, source slot, new size. */
+ p2 = get_int (io);
+ p1 = get_int (io);
+ sz = get_int (io);
+ dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz);
+ if (p1 > n_ptrs)
+ myabort();
+ if (p2 > n_ptrs)
+ myabort();
+ /* we can't force realloc to return NULL (fail), so just skip it. */
+ if (p2 == 0)
+ break;
+
+ if (ptrs[p1])
+ atomic_rss (-sizes[p1]);
+ /* The old block is overwritten with the wipe pattern before it is
+ handed to realloc. */
+ if (!quick_run)
+ free_wipe(p1);
+ stime = rdtsc_s();
+ Q1;
+#ifdef MDEBUG
+ tmp = ptrs[p1];
+#endif
+ if (!quick_run)
+ ptrs[p2] = realloc ((void *)ptrs[p1], sz);
+ else
+ ptrs[p2] = (void *)p2;
+ sizes[p2] = sz;
+ mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz);
+ Q2;
+ my_realloc_time += rdtsc_e() - stime;
+ my_realloc_count ++;
+ if (!quick_run)
+ wmem(ptrs[p2], sz);
+ /* The source slot no longer owns a block once the result lives in a
+ different slot. */
+ if (p1 != p2)
+ ptrs[p1] = 0;
+ if (ptrs[p2])
+ atomic_rss (sizes[p2]);
+ break;
+
+ case C_FREE:
+ /* Operand: slot to free. */
+ p1 = get_int (io);
+ if (p1 > n_ptrs)
+ myabort();
+ dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1);
+ if (!quick_run)
+ free_wipe (p1);
+ if (ptrs[p1])
+ atomic_rss (-sizes[p1]);
+ stime = rdtsc_s();
+ Q1;
+ mprintf("free(%p)\n", ptrs[p1]);
+ if (!quick_run)
+ free ((void *)ptrs[p1]);
+ Q2;
+ my_free_time += rdtsc_e() - stime;
+ my_free_count ++;
+ ptrs[p1] = 0;
+ break;
+
+ case C_SYNC_W:
+ /* Operand: sync index.  Set the flag and wake one waiter. */
+ p1 = get_int(io);
+ dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1);
+ if (p1 > n_syncs)
+ myabort();
+ pthread_mutex_lock (&mutexes[p1]);
+ syncs[p1] = 1;
+ pthread_cond_signal (&conds[p1]);
+ __sync_synchronize ();
+ pthread_mutex_unlock (&mutexes[p1]);
+ break;
+
+ case C_SYNC_R:
+ /* Operand: sync index.  Wait until the matching SYNC_W has set the
+ flag. */
+ p1 = get_int(io);
+ dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1);
+ if (p1 > n_syncs)
+ myabort();
+ pthread_mutex_lock (&mutexes[p1]);
+ while (syncs[p1] != 1)
+ {
+ pthread_cond_wait (&conds[p1], &mutexes[p1]);
+ __sync_synchronize ();
+ }
+ pthread_mutex_unlock (&mutexes[p1]);
+ break;
+
+ default:
+ printf("op %d - unsupported, thread %d addr %lu\n",
+ this_op, thread_idx, (long unsigned int)io_pos (io));
+ myabort();
+ }
+ }
+}
+
+/*
+ * Allocate `amt` bytes of zeroed, anonymous memory directly from the
+ * kernel with mmap, so that the tables used by the simulator do not go
+ * through the allocator being measured.  mlock is attempted on the mapping;
+ * its result is ignored.
+ *
+ * A request for 0 bytes is rounded up to 1 byte because mmap rejects
+ * zero-length mappings; callers asking for an empty table still get a
+ * valid pointer.
+ *
+ * Returns NULL if the mapping cannot be created.  mmap reports failure as
+ * MAP_FAILED, not NULL, and that value must not be passed on to memset.
+ */
+static void *alloc_mem (size_t amt)
+{
+  void *rv;
+
+  if (amt == 0)
+    amt = 1;
+
+  rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (rv == MAP_FAILED)
+    return NULL;
+
+  mlock (rv, amt);
+  memset (rv, 0, amt);
+  return rv;
+}
+
+static pthread_t *thread_ids;
+
+/*
+ * Allocate a table of `count` elements of `size` bytes each with
+ * alloc_mem.
+ *
+ * If psz is non-NULL the element count is read from the workload stream
+ * `io` instead, and also stored in *psz; the `count` argument is ignored in
+ * that case.  msg names the table for debug output.
+ *
+ * Exits with status 1 if alloc_mem returns NULL.
+ */
+void *
+my_malloc (const char *msg, int size, IOPerThreadType *io, size_t *psz, size_t count)
+{
+  void *rv;
+  if (psz)
+    count = *psz = get_int (io);
+  dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count);
+  rv = alloc_mem(size * count);
+  if (!rv)
+    {
+      /* Report `count`, not *psz: psz is NULL for MY_ALLOCN callers. */
+      fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)count);
+      exit(1);
+    }
+  mlock (rv, size * count);
+  return rv;
+}
+
+/* Printable names for the `type` codes passed to malloc_scan_callback,
+   indexed by the code. */
+static const char * const scan_names[] = {
+  "UNUSED",
+  "ARENA",
+  "HEAP",
+  "CHUNK_USED",
+  "CHUNK_FREE",
+  "FASTBIN_FREE",
+  "UNSORTED",
+  "TOP",
+  "TCACHE",
+  "USED"
+};
+
+/*
+ * Print one line describing a scanned region: the name of its type, its
+ * address and its length in hex.  A type code outside the scan_names table
+ * is printed as "UNKNOWN" instead of indexing past the end of the table.
+ */
+void
+malloc_scan_callback (void *ptr, size_t length, int type)
+{
+  const size_t n_names = sizeof (scan_names) / sizeof (scan_names[0]);
+  const char *name = "UNKNOWN";
+
+  if (type >= 0 && (size_t) type < n_names)
+    name = scan_names[type];
+
+  printf("%s: ptr %p length %llx\n", name, ptr, (long long)length);
+}
+
+#define MY_ALLOC(T, psz) \
+ (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0)
+#define MY_ALLOCN(T, count) \
+ (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count)
+
+/* Replay the workload file named by argv[1] (the usage text calls it a
+   trace2dat output file).
+
+   The main script is interpreted here: it sizes the shared pointer and
+   sync tables and starts one thread per C_START_THREAD record, each
+   running thread_common on its own IO state.  At C_DONE, main waits until
+   threads_done has reached the number of threads started, prints timing
+   and usage statistics (unless quick_run is set), releases stop_mutex and
+   joins the threads.
+
+   Returns 0 on success.  Exits with status 1 on a usage error, if the
+   file cannot be opened or stat'ed, if the script starts more threads
+   than it declared, or if a thread cannot be created.  */
+int
+main(int argc, char **argv)
+{
+  ticks_t start=0;
+  ticks_t end;
+  ticks_t usec;
+  struct timeval tv_s, tv_e;
+  int thread_idx = 0;
+  int i;
+  int rc;
+  size_t n_threads = 0;
+  size_t idx;
+  struct rusage res_start, res_end;
+  int done;
+  size_t guessed_io_size = 4096;
+  struct stat statb;
+
+  if (argc < 2)
+    {
+      fprintf(stderr, "Usage: %s <trace2dat.outfile>\n", argv[0]);
+      exit(1);
+    }
+  io_fd = open(argv[1], O_RDONLY);
+  if (io_fd < 0)
+    {
+      fprintf(stderr, "Unable to open %s for reading\n", argv[1]);
+      perror("The error was");
+      exit(1);
+    }
+  /* statb.st_size is used below to choose the per-thread buffer size, so
+     do not carry on with an uninitialized struct if fstat fails.  */
+  if (fstat (io_fd, &statb) != 0)
+    {
+      fprintf(stderr, "Unable to stat %s\n", argv[1]);
+      perror("The error was");
+      exit(1);
+    }
+
+  io_init (&main_io, 0, IOMIN);
+
+  /* Held until the statistics have been printed; see the unlock below.  */
+  pthread_mutex_lock(&stop_mutex);
+
+  done = 0;
+  while (!done)
+    {
+      switch (io_read (&main_io))
+        {
+        case C_NOP:
+          break;
+        case C_ALLOC_PTRS:
+          ptrs = MY_ALLOC (ptrs, &n_ptrs);
+          sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs);
+          ptrs[0] = 0;
+          break;
+        case C_ALLOC_SYNCS:
+          n_syncs = get_int(&main_io);
+          syncs = MY_ALLOCN (syncs, n_syncs);
+          conds = MY_ALLOCN (conds, n_syncs);
+          mutexes = MY_ALLOCN (mutexes, n_syncs);
+          for (idx=0; idx<n_syncs; idx++)
+            {
+              pthread_mutex_init (&mutexes[idx], NULL);
+              pthread_cond_init (&conds[idx], NULL);
+            }
+          break;
+        case C_NTHREADS:
+          thread_ids = MY_ALLOC (thread_ids, &n_threads);
+          thread_io = MY_ALLOCN (thread_io, n_threads);
+          /* A script declaring zero threads would divide by zero here;
+             keep the default buffer size in that case.  */
+          if (n_threads != 0)
+            guessed_io_size = ((statb.st_size / n_threads) < (1024*1024)) ? 65536 : 4096;
+          /* The next thing in the workscript is thread creation */
+          getrusage (RUSAGE_SELF, &res_start);
+          gettimeofday (&tv_s, NULL);
+          start = rdtsc_s();
+          break;
+        case C_START_THREAD:
+          idx = get_int (&main_io);
+          /* thread_ids and thread_io were sized for n_threads entries;
+             refuse to write past them.  */
+          if ((size_t) thread_idx >= n_threads)
+            {
+              fprintf(stderr, "Workload starts more than the %lu declared threads\n",
+                      (long unsigned)n_threads);
+              exit(1);
+            }
+          io_init (& thread_io[thread_idx], idx, guessed_io_size);
+          rc = pthread_create (&thread_ids[thread_idx], NULL, thread_common, thread_io + thread_idx);
+          if (rc != 0)
+            {
+              /* Counting a thread that never started would leave the
+                 C_DONE loop below waiting for a completion that
+                 presumably never arrives.  */
+              fprintf(stderr, "Unable to create thread %d: %s\n", thread_idx, strerror (rc));
+              exit(1);
+            }
+          dprintf("Starting thread %lld at offset %lu %lx\n", (long long)thread_ids[thread_idx], (unsigned long)idx, (unsigned long)idx);
+          thread_idx ++;
+          break;
+        case C_DONE:
+          /* Poll threads_done under stat_mutex until every started
+             thread has been counted.  */
+          do
+            {
+              pthread_mutex_lock (&stat_mutex);
+              i = threads_done;
+              pthread_mutex_unlock (&stat_mutex);
+            } while (i < thread_idx);
+          done = 1;
+          break;
+        default:
+          /* Any other opcode in the main script is skipped.  */
+          break;
+        }
+    }
+  if (!quick_run)
+    {
+      end = rdtsc_e();
+      gettimeofday (&tv_e, NULL);
+      getrusage (RUSAGE_SELF, &res_end);
+
+      printf("%s cycles\n", comma(end - start));
+      usec = diff_timeval (tv_e, tv_s);
+      printf("%s usec wall time\n", comma(usec));
+
+      usec = diff_timeval (res_end.ru_utime, res_start.ru_utime);
+      printf("%s usec across %d thread%s\n",
+             comma(usec), (int)n_threads, n_threads == 1 ? "" : "s");
+      printf("%s Kb Max RSS (%s -> %s)\n",
+             comma(res_end.ru_maxrss - res_start.ru_maxrss),
+             comma(res_start.ru_maxrss), comma(res_end.ru_maxrss));
+    }
+  printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024));
+
+  /* Avoid dividing by zero in the averages below.  */
+  if (malloc_count == 0) malloc_count ++;
+  if (calloc_count == 0) calloc_count ++;
+  if (realloc_count == 0) realloc_count ++;
+  if (free_count == 0) free_count ++;
+
+  if (!quick_run)
+    {
+      printf("\n");
+      /* sizeof yields a size_t; cast it to match %lu.  */
+      printf("sizeof ticks_t is %lu\n", (unsigned long)sizeof(ticks_t));
+      printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count));
+      printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count));
+      printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count));
+      printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count));
+      printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time));
+      printf("\n");
+    }
+
+#if 0
+  /* Free any still-held chunks of memory. */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+        free((void *)ptrs[idx]);
+        ptrs[idx] = 0;
+      }
+#endif
+
+#if 0
+  /* This will fail (crash) for system glibc but that's OK. */
+  __malloc_scan_chunks(malloc_scan_callback);
+
+  malloc_info (0, stdout);
+#endif
+
+#if 0
+  /* ...or report them as used. */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+        char *p = (char *)ptrs[idx] - 2*sizeof(size_t);
+        size_t *sp = (size_t *)p;
+        size_t size = sp[1] & ~7;
+        malloc_scan_callback (sp, size, 9);
+      }
+#endif
+
+  /* Now that we've scanned all the per-thread caches, it's safe to
+     let them exit and clean up. */
+  pthread_mutex_unlock(&stop_mutex);
+
+  for (i=0; i<thread_idx; i++)
+    pthread_join (thread_ids[i], NULL);
+
+  return 0;
+}