From 130765de719a3ddc475284e13749d09ff371a8e1 Mon Sep 17 00:00:00 2001
From: Florian Fischer <florian.fl.fischer@fau.de>
Date: Fri, 1 Feb 2019 16:35:20 +0100
Subject: rework build system #1

Each benchmark has its own Makefile, which must put its binaries into
OBJDIR, which is added to the PATH during execution.
---
 src/Makefile                                 |  27 +
 src/benchmark.py                             |   2 +
 src/benchmarks/bench_loop.c                  |  87 ----
 src/benchmarks/cache-scratch.cc              | 147 ------
 src/benchmarks/cache-thrash.cc               | 134 -----
 src/benchmarks/cpuinfo.h                     | 202 --------
 src/benchmarks/dj_trace/Makefile             |  24 +
 src/benchmarks/dj_trace/trace_run.c          | 750 +++++++++++++++++++++++++++
 src/benchmarks/falsesharing/Makefile         |  29 ++
 src/benchmarks/falsesharing/cache-scratch.cc | 147 ++++++
 src/benchmarks/falsesharing/cache-thrash.cc  | 134 +++++
 src/benchmarks/falsesharing/cpuinfo.h        | 202 ++++++++
 src/benchmarks/falsesharing/fred.h           |  97 ++++
 src/benchmarks/falsesharing/timer.h          | 372 +++++++++++++
 src/benchmarks/fred.h                        |  97 ----
 src/benchmarks/larson.cc                     | 744 --------------------------
 src/benchmarks/larson/Makefile               |  24 +
 src/benchmarks/larson/larson.cc              | 744 ++++++++++++++++++++++++++
 src/benchmarks/loop/Makefile                 |  24 +
 src/benchmarks/loop/loop.c                   |  87 ++++
 src/benchmarks/timer.h                       | 372 -------------
 src/dj_trace.py                              |   4 +-
 src/falsesharing.py                          |   4 +-
 src/larson.py                                |   4 +-
 src/loop.py                                  |   4 +-
 src/trace_run.c                              | 750 ---------------------------
 26 files changed, 2671 insertions(+), 2541 deletions(-)
 create mode 100644 src/Makefile
 delete mode 100644 src/benchmarks/bench_loop.c
 delete mode 100644 src/benchmarks/cache-scratch.cc
 delete mode 100644 src/benchmarks/cache-thrash.cc
 delete mode 100644 src/benchmarks/cpuinfo.h
 create mode 100644 src/benchmarks/dj_trace/Makefile
 create mode 100644 src/benchmarks/dj_trace/trace_run.c
 create mode 100644 src/benchmarks/falsesharing/Makefile
 create mode 100644 src/benchmarks/falsesharing/cache-scratch.cc
 create mode 100644 src/benchmarks/falsesharing/cache-thrash.cc
 create mode 100644 src/benchmarks/falsesharing/cpuinfo.h
 create mode 100644 src/benchmarks/falsesharing/fred.h
 create mode 100644 src/benchmarks/falsesharing/timer.h
 delete mode 100644 src/benchmarks/fred.h
 delete mode 100644 src/benchmarks/larson.cc
 create mode 100644 src/benchmarks/larson/Makefile
 create mode 100644 src/benchmarks/larson/larson.cc
 create mode 100644 src/benchmarks/loop/Makefile
 create mode 100644 src/benchmarks/loop/loop.c
 delete mode 100644 src/benchmarks/timer.h
 delete mode 100644 src/trace_run.c

(limited to 'src')

diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..6b7b704
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,27 @@
+OBJDIR ?= obj
+# Toolchain and flag defaults; each uses ?= so an already-defined value wins.
+CC ?= gcc
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread
+OPTFLAGS ?= -O3 -DNDEBUG
+
+CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDFLAGS ?= -pthread -static-libgcc
+
+.PHONY: all clean
+
+all: $(OBJDIR)/print_status_on_exit.so $(OBJDIR)/chattymalloc.so
+
+$(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c | $(OBJDIR)
+	$(CC) $(LDFLAGS) -shared $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/chattymalloc.so: chattymalloc.c | $(OBJDIR)
+	$(CC) $(LDFLAGS) -shared $(CFLAGS) -o $@ $<
+# OBJDIR is an order-only prerequisite above; -p lets it be a nested path.
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmark.py b/src/benchmark.py
index 4de05e3..e4dbef2 100644
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -81,6 +81,7 @@ class Benchmark (object):
             self.results[target] = d
 
     def prepare(self, verbose=False):
+        os.environ["PATH"] += ":build/" + self.name
         def is_exe(fpath):
             return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
 
@@ -256,6 +257,7 @@ class Benchmark (object):
                     if self.posttarget_hook((tname, t), run, verbose):
                         return False
             print()
+        os.environ["PATH"] = os.environ["PATH"].replace(":build/"+self.name, "")
         return True
 
     def plot_single_arg(self, yval, ylabel="'y-label'", xlabel="'x-label'", autoticks=True,
diff --git a/src/benchmarks/bench_loop.c b/src/benchmarks/bench_loop.c
deleted file mode 100644
index bc15808..0000000
--- a/src/benchmarks/bench_loop.c
+++ /dev/null
@@ -1,87 +0,0 @@
-#include <assert.h>
-#include <malloc.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-
-static size_t _rand() {
-	static __thread size_t seed = 123456789;
-	size_t a = 1103515245;
-	size_t c = 12345;
-	size_t m = 1 << 31;
-	seed = (a * seed + c) % m;
-		return seed;
-}
-
-typedef struct ThreadArgs {
-	double benchmark;
-	int allocations;
-	int max_size;
-} ThreadArgs;
-
-static void* malloc_then_write(size_t size) {
-	void* ptr = malloc(size);
-	// Write to ptr
-	/* *((char*)ptr) = '!'; */
-	return ptr;
-}
-
-static void read_then_free(void* ptr) {
-	// Read before free
-	/* char s __attribute__((unused)) = *((char*)ptr); */
-	free(ptr);
-}
-static void* test_thread_func(void* arg) {
-	ThreadArgs* args = (ThreadArgs*)arg;
-
-	for(int i = 0; i < args->allocations; i++) {
-		void* ptr = malloc_then_write((_rand() % args->max_size) + 1);
-		read_then_free(ptr);
-	}
-	return NULL;
-}
-
-int main(int argc, char* argv[]) {
-	pthread_t* threads;
-	int num_threads;
-	struct ThreadArgs thread_args;
-
-	if (argc < 4) {
-		fprintf(stderr, "Usage: %s <num threads> <num allocations> <max size>\n", argv[0]);
-		return 1;
-	}
-
-	num_threads = atoi(argv[1]);
-	thread_args.allocations = atoi(argv[2]);
-	thread_args.max_size = atoi(argv[3]);
-
-	threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t));
-
-	for (int i = 0; i < num_threads; i++) {
-		if (0 != pthread_create(&threads[i], NULL, test_thread_func, &thread_args)) {
-			perror("pthread_create");
-			return 1;
-		}
-	}
-
-	for(int i = 0; i < num_threads; i++) {
-		if (0 != pthread_join(threads[i], NULL)) {
-			perror("pthread_join");
-			return 1;
-		}
-	}
-
-	if (argc == 5)
-	{
-		FILE* f = stdout;
-		if (strcmp(argv[4],"stdout") != 0)
-			f = fopen(argv[4], "w");
-		malloc_info(0, f);
-		if (strcmp(argv[4],"stdout") != 0)
-			fclose(f);
-	}
-
-	return 0;
-}
diff --git a/src/benchmarks/cache-scratch.cc b/src/benchmarks/cache-scratch.cc
deleted file mode 100644
index 2cb9b28..0000000
--- a/src/benchmarks/cache-scratch.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-///-*-C++-*-//////////////////////////////////////////////////////////////////
-//
-// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
-//        for Shared-Memory Multiprocessors
-// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
-//
-// This library is free software; you can redistribute it and/or modify
-// it under the terms of the GNU Library General Public License as
-// published by the Free Software Foundation, http://www.fsf.org.
-//
-// This library is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-// Library General Public License for more details.
-//
-//////////////////////////////////////////////////////////////////////////////
-
-/**
- * @file cache-scratch.cpp
- *
- * cache-scratch is a benchmark that exercises a heap's cache locality.
- * An allocator that allows multiple threads to re-use the same small
- * object (possibly all in one cache-line) will scale poorly, while
- * an allocator like Hoard will exhibit near-linear scaling.
- *
- * Try the following (on a P-processor machine):
- *
- *  cache-scratch 1 1000 1 1000000
- *  cache-scratch P 1000 1 1000000
- *
- *  cache-scratch-hoard 1 1000 1 1000000
- *  cache-scratch-hoard P 1000 1 1000000
- *
- *  The ideal is a P-fold speedup.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "fred.h"
-#include "cpuinfo.h"
-#include "timer.h"
-
-// This class just holds arguments to each thread.
-class workerArg {
-public:
-
-  workerArg() {}
-
-  workerArg (char * obj, int objSize, int repetitions, int iterations)
-    : _object (obj),
-      _objSize (objSize),
-      _iterations (iterations),
-      _repetitions (repetitions)
-  {}
-
-  char * _object;
-  int _objSize;
-  int _iterations;
-  int _repetitions;
-};
-
-
-#if defined(_WIN32)
-extern "C" void worker (void * arg)
-#else
-extern "C" void * worker (void * arg)
-#endif
-{
-  // free the object we were given.
-  // Then, repeatedly do the following:
-  //   malloc a given-sized object,
-  //   repeatedly write on it,
-  //   then free it.
-  workerArg * w = (workerArg *) arg;
-  delete w->_object;
-  workerArg w1 = *w;
-  for (int i = 0; i < w1._iterations; i++) {
-    // Allocate the object.
-    char * obj = new char[w1._objSize];
-    // Write into it a bunch of times.
-    for (int j = 0; j < w1._repetitions; j++) {
-      for (int k = 0; k < w1._objSize; k++) {
-	obj[k] = (char) k;
-	volatile char ch = obj[k];
-	ch++;
-      }
-    }
-    // Free the object.
-    delete [] obj;
-  }
-
-#if !defined(_WIN32)
-  return NULL;
-#endif
-}
-
-
-int main (int argc, char * argv[])
-{
-  int nthreads;
-  int iterations;
-  int objSize;
-  int repetitions;
-
-  if (argc > 4) {
-    nthreads = atoi(argv[1]);
-    iterations = atoi(argv[2]);
-    objSize = atoi(argv[3]);
-    repetitions = atoi(argv[4]);
-  } else {
-    fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]);
-    return 1;
-  }
-
-  HL::Fred * threads = new HL::Fred[nthreads];
-  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
-
-  workerArg * w = new workerArg[nthreads];
-
-  int i;
-
-  // Allocate nthreads objects and distribute them among the threads.
-  char ** objs = new char * [nthreads];
-  for (i = 0; i < nthreads; i++) {
-    objs[i] = new char[objSize];
-  }
-
-  HL::Timer t;
-  t.start();
-
-  for (i = 0; i < nthreads; i++) {
-    w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations);
-    threads[i].create (&worker, (void *) &w[i]);
-  }
-  for (i = 0; i < nthreads; i++) {
-    threads[i].join();
-  }
-  t.stop();
-
-  delete [] threads;
-  delete [] objs;
-  delete [] w;
-
-  printf ("Time elapsed = %f seconds.\n", (double) t);
-  return 0;
-}
diff --git a/src/benchmarks/cache-thrash.cc b/src/benchmarks/cache-thrash.cc
deleted file mode 100644
index 79242eb..0000000
--- a/src/benchmarks/cache-thrash.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-///-*-C++-*-//////////////////////////////////////////////////////////////////
-//
-// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
-//        for Shared-Memory Multiprocessors
-// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
-//
-// Copyright (c) 1998-2003, The University of Texas at Austin.
-//
-// This library is free software; you can redistribute it and/or modify
-// it under the terms of the GNU Library General Public License as
-// published by the Free Software Foundation, http://www.fsf.org.
-//
-// This library is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-// Library General Public License for more details.
-//
-//////////////////////////////////////////////////////////////////////////////
-
-/**
- * @file  cache-thrash.cpp
- * @brief cache-thrash is a benchmark that exercises a heap's cache-locality.
- *
- * Try the following (on a P-processor machine):
- *
- *  cache-thrash 1 1000 1 1000000
- *  cache-thrash P 1000 1 1000000
- *
- *  cache-thrash-hoard 1 1000 1 1000000
- *  cache-thrash-hoard P 1000 1 1000000
- *
- *  The ideal is a P-fold speedup.
-*/
-
-
-#include <iostream>
-#include <stdlib.h>
-
-using namespace std;
-
-#include "cpuinfo.h"
-#include "fred.h"
-#include "timer.h"
-
-// This class just holds arguments to each thread.
-class workerArg {
-public:
-	workerArg() {}
-	workerArg (int objSize, int repetitions, int iterations)
-	: _objSize (objSize),
-	 _iterations (iterations),
-	 _repetitions (repetitions)
-	{}
-
-	int _objSize;
-	int _iterations;
-	int _repetitions;
-};
-
-
-#if defined(_WIN32)
-extern "C" void worker (void * arg)
-#else
-extern "C" void * worker (void * arg)
-#endif
-{
-	// Repeatedly do the following:
-	//   malloc a given-sized object,
-	//   repeatedly write on it,
-	//   then free it.
-	workerArg * w = (workerArg *) arg;
-	workerArg w1 = *w;
-	for (int i = 0; i < w1._iterations; i++) {
-	// Allocate the object.
-		char * obj = new char[w1._objSize];
-		//    printf ("obj = %p\n", obj);
-		// Write into it a bunch of times.
-		for (int j = 0; j < w1._repetitions; j++) {
-			for (int k = 0; k < w1._objSize; k++) {
-				obj[k] = (char) k;
-				volatile char ch = obj[k];
-				ch++;
-			}
-		}
-		// Free the object.
-		delete [] obj;
-	}
-#if !defined(_WIN32)
-	return NULL;
-#endif
-}
-
-
-int main (int argc, char * argv[])
-{
-	int nthreads;
-	int iterations;
-	int objSize;
-	int repetitions;
-	
-	if (argc > 4) {
-		nthreads = atoi(argv[1]);
-		iterations = atoi(argv[2]);
-		objSize = atoi(argv[3]);
-		repetitions = atoi(argv[4]);
-	} else {
-	cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl;
-	exit(1);
-	}
-
-	HL::Fred * threads = new HL::Fred[nthreads];
-	HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
-    
-	int i;
-  
-	HL::Timer t;
-	t.start();
-  
-	workerArg * w = new workerArg[nthreads];
-    
-	for (i = 0; i < nthreads; i++) {
-		w[i] = workerArg (objSize, repetitions / nthreads, iterations);
-		threads[i].create (&worker, (void *) &w[i]);
-	}
-	for (i = 0; i < nthreads; i++) {
-		threads[i].join();
-	}
-	t.stop();
-
-	delete [] threads;
-	delete [] w;
-
-	cout << "Time elapsed = " << (double) t << " seconds." << endl;
-}
diff --git a/src/benchmarks/cpuinfo.h b/src/benchmarks/cpuinfo.h
deleted file mode 100644
index 1ed1f36..0000000
--- a/src/benchmarks/cpuinfo.h
+++ /dev/null
@@ -1,202 +0,0 @@
-// -*- C++ -*-
-
-/*
-
-  Heap Layers: An Extensible Memory Allocation Infrastructure
-  
-  Copyright (C) 2000-2003 by Emery Berger
-  http://www.cs.umass.edu/~emery
-  emery@cs.umass.edu
-  
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-  
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-*/
-
-
-
-#ifndef HL_CPUINFO_H
-#define HL_CPUINFO_H
-
-#if defined(_WIN32)
-#include <windows.h>
-#include <process.h>
-#else
-#include <unistd.h>
-#endif
-
-
-#if !defined(_WIN32)
-#include <pthread.h>
-#endif
-
-#if defined(__SVR4) // Solaris
-#include <sys/lwp.h>
-extern "C" unsigned int lwp_self(void);
-#include <thread.h>
-extern "C" int _thr_self(void);
-#endif
-
-#if defined(__linux)
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#endif
-
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-#if defined(__sgi)
-#include <sys/types.h>
-#include <sys/sysmp.h>
-#include <sys/sysinfo.h>
-#endif
-
-#if defined(hpux)
-#include <sys/mpctl.h>
-#endif
-
-#if defined(_WIN32)
-extern __declspec(thread) int localThreadId;
-#endif
-
-#if defined(__SVR4) && defined(MAP_ALIGN)
-extern volatile int anyThreadStackCreated;
-#endif
-
-namespace HL {
-
-/**
- * @class CPUInfo
- * @author Emery Berger <http://www.cs.umass.edu/~emery>
- *
- * @brief Architecture-independent wrapper to get number of CPUs. 
- */
-
-class CPUInfo {
-public:
-  CPUInfo (void)
-  {}
-
-  inline static int getNumProcessors (void) {
-    static int _numProcessors = computeNumProcessors();
-    return _numProcessors;
-  }
-
-  static inline unsigned long getThreadId (void);
-  inline static int computeNumProcessors (void);
-
-};
-
-
-int CPUInfo::computeNumProcessors (void)
-{
-  static int np = 0;
-  if (!np) {
-#if defined(__linux) || defined(__APPLE__)
-    np = (int) sysconf(_SC_NPROCESSORS_ONLN);
-#elif defined(_WIN32)
-    SYSTEM_INFO infoReturn[1];
-    GetSystemInfo (infoReturn);
-    np = (int) (infoReturn->dwNumberOfProcessors);
-#elif defined(__sgi)
-    np = (int) sysmp(MP_NAPROCS);
-#elif defined(hpux)
-    np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
-#elif defined(_SC_NPROCESSORS_ONLN)
-    np = (int) (sysconf(_SC_NPROCESSORS_ONLN));
-#else
-    np = 2;
-    // Unsupported platform.
-    // Pretend we have at least two processors. This approach avoids the risk of assuming
-    // we're on a uniprocessor, which might lead clever allocators to avoid using atomic
-    // operations for all locks.
-#endif
-    return np;
-  } else {
-    return np;
-  }
-}
-
-  // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris],
-// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch.
-// pthread_attr_getstacksize
-// pthread_attr_setstackaddr
-// pthread_attr_getstackaddr
-// PTHREAD_STACK_SIZE is minimum.
-// or should we just assume we have __declspec(thread) or __thread?
-
-#if defined(USE_THREAD_KEYWORD)
-  extern __thread int localThreadId;
-#endif
-
-  // FIX ME FIXME
-  //#include <stdio.h>
-
-unsigned long CPUInfo::getThreadId (void) {
-#if defined(__SVR4)
-  size_t THREAD_STACK_SIZE;
-  if (sizeof(size_t) <= 4) {
-    THREAD_STACK_SIZE = 1048576;
-  } else {
-    // 64-bits.
-    THREAD_STACK_SIZE = 1048576 * 2;
-  }
-  if (0) { // !anyThreadStackCreated) {
-    // We know a priori that all stack variables
-    // are on different stacks. Since no one has created
-    // a special one, we are in control, and thus all stacks
-    // are 1 MB in size and on 1 MB boundaries.
-    // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.)
-    char buf;
-    return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20;
-  } else {
-    return (int) pthread_self();
-  }
-#elif defined(_WIN32)
-  // It looks like thread id's are always multiples of 4, so...
-  return GetCurrentThreadId() >> 2;
-#elif defined(__APPLE__)
-  // Consecutive thread id's in Mac OS are 4096 apart;
-  // dividing off the 4096 gives us an appropriate thread id.
-  int tid = (int) ((unsigned long) pthread_self()) >> 12;
-  return tid;
-#elif defined(__BEOS__)
-  return find_thread(0);
-#elif defined(USE_THREAD_KEYWORD)
-  return localThreadId;
-#elif defined(__linux) || defined(PTHREAD_KEYS_MAX)
-  // Consecutive thread id's in Linux are 1024 apart;
-  // dividing off the 1024 gives us an appropriate thread id.
-  return (unsigned long) pthread_self() >> 10;
-#elif defined(POSIX)
-  return (unsigned long) pthread_self();
-#elif USE_SPROC
-  // This hairiness has the same effect as calling getpid(),
-  // but it's MUCH faster since it avoids making a system call
-  // and just accesses the sproc-local data directly.
-  unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid;
-  return pid;
-#else
-  return 0;
-#endif
-}
-
-}
-
-#endif
diff --git a/src/benchmarks/dj_trace/Makefile b/src/benchmarks/dj_trace/Makefile
new file mode 100644
index 0000000..14eca91
--- /dev/null
+++ b/src/benchmarks/dj_trace/Makefile
@@ -0,0 +1,24 @@
+OBJDIR ?= obj
+# Toolchain and flag defaults; each uses ?= so an already-defined value wins.
+CC ?= gcc
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread
+OPTFLAGS ?= -O3 -DNDEBUG
+
+CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDFLAGS ?= -pthread -static-libgcc
+
+.PHONY: all clean
+
+all: $(OBJDIR)/trace_run
+
+$(OBJDIR)/trace_run: trace_run.c | $(OBJDIR)
+	$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $<
+# OBJDIR is an order-only prerequisite above; -p lets it be a nested path.
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/dj_trace/trace_run.c b/src/benchmarks/dj_trace/trace_run.c
new file mode 100644
index 0000000..604d01e
--- /dev/null
+++ b/src/benchmarks/dj_trace/trace_run.c
@@ -0,0 +1,750 @@
+#define _LARGEFILE64_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+// #include "malloc.h"
+#include <malloc.h>
+
+// #include "mtrace.h"
+/* Codes for the simulator/workload programs. Copied from mtrace.h. */
+#define C_NOP 0
+#define C_DONE 1
+#define C_MALLOC 2
+#define C_CALLOC 3
+#define C_REALLOC 4
+#define C_FREE 5
+#define C_SYNC_W 6
+#define C_SYNC_R 7
+#define C_ALLOC_PTRS 8
+#define C_ALLOC_SYNCS 9
+#define C_NTHREADS 10
+#define C_START_THREAD 11
+#define C_MEMALIGN 12
+#define C_VALLOC 13
+#define C_PVALLOC 14
+#define C_POSIX_MEMALIGN 15
+
+#if UINTPTR_MAX == 0xffffffffffffffff
+
+#define ticks_t int64_t
+/* Setting quick_run to 1 allows the simulator to model
+   only the allocation and deallocation accounting via
+   atomic_rss. The actual allocations are skipped.  This
+   mode is useful to verify the workload file.  */
+#define quick_run 0
+
+static __inline__ ticks_t rdtsc_s(void)
+{
+  unsigned a, d;
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
+  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
+}
+
+#else
+
+#define ticks_t int32_t
+
+static __inline__ ticks_t rdtsc_s(void)
+{
+  unsigned a, d;
+  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+  asm volatile("rdtsc" : "=a" (a), "=d" (d));
+  return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+static __inline__ ticks_t rdtsc_e(void)
+{
+  unsigned a, d;
+  asm volatile("rdtscp" : "=a" (a), "=d" (d));
+  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
+  return ((unsigned long)a) | (((unsigned long)d) << 16);
+}
+
+#endif
+
+static ticks_t diff_timeval (struct timeval e, struct timeval s)
+{
+  ticks_t usec;
+  if (e.tv_usec < s.tv_usec)
+    usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000;
+  else
+    usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000;
+  return usec;
+}
+
+#if 1
+#define Q1
+#define Q2
+#else
+pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER;
+#define Q1   pthread_mutex_lock(&genmutex)
+#define Q2   pthread_mutex_unlock(&genmutex)
+#endif
+
+pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER;
+#define NCBUF 10
+static char cbuf[NCBUF][30];
+static int ci = 0;
+/* Format X in decimal with ',' between 3-digit groups (e.g. "-1,234").
+   Returns one of NCBUF static buffers, picked round-robin on each call.  */
+char *comma(ticks_t x)
+{
+  char buf[30], *bs, *bd;
+  int i, idx;
+
+  pthread_mutex_lock(&cmutex);
+  ci = (ci + 1) % NCBUF;
+  idx = ci;
+  pthread_mutex_unlock(&cmutex);
+  bs = buf;
+  bd = cbuf[idx];
+  /* Emit ',' before each remaining 3-digit group, but never after the sign.  */
+  snprintf(buf, sizeof buf, "%lld", (long long int)x);
+  i = strlen(buf);
+  while (*bs)
+    {
+      *bd++ = *bs++;
+      i--;
+      if (i % 3 == 0 && *bs && bs[-1] != '-')
+	*bd++ = ',';
+    }
+  *bd = 0;
+  return cbuf[idx];
+}
+
+static volatile void **ptrs;
+static volatile size_t *sizes;
+static size_t n_ptrs;
+static volatile char *syncs;
+static pthread_mutex_t *mutexes;
+static pthread_cond_t *conds;
+static size_t n_syncs;
+
+static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
+ticks_t malloc_time = 0, malloc_count = 0;
+ticks_t calloc_time = 0, calloc_count = 0;
+ticks_t realloc_time = 0, realloc_count = 0;
+ticks_t free_time = 0, free_count = 0;
+
+size_t ideal_rss = 0;
+size_t max_ideal_rss = 0;
+static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void atomic_rss (ssize_t delta)
+{
+  pthread_mutex_lock (&rss_mutex);
+  ideal_rss += delta;
+  if (max_ideal_rss < ideal_rss)
+    max_ideal_rss = ideal_rss;
+  pthread_mutex_unlock (&rss_mutex);
+}
+
+pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;
+int threads_done = 0;
+
+//#define dprintf printf
+#define dprintf(...) (void)1
+
+//#define mprintf printf
+//#define MDEBUG 1
+#define mprintf(...) (void)1
+
+#define myabort() my_abort_2(thrc, __LINE__)
+void
+my_abort_2 (pthread_t thrc, int line)
+{
+  fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line);
+  abort();
+}
+
+/*------------------------------------------------------------*/
+/* Wrapper around I/O routines */
+
+int io_fd;
+
+#define IOSIZE 65536
+#define IOMIN 4096
+
+static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+typedef struct {
+  unsigned char buf[IOSIZE];
+  size_t incr;
+  size_t max_incr;
+  size_t buf_base;
+  size_t buf_idx;
+  int saw_eof;
+} IOPerThreadType;
+
+IOPerThreadType main_io;
+IOPerThreadType *thread_io;
+
+void
+io_init (IOPerThreadType *io, size_t file_offset, int incr)
+{
+  if (incr > IOSIZE)
+    incr = IOSIZE;
+  if (incr < IOMIN)
+    incr = IOMIN;
+
+  io->buf_base = file_offset;
+  io->buf_idx = 0;
+  io->incr = incr;
+
+  pthread_mutex_lock (&io_mutex);
+  lseek64 (io_fd, io->buf_base, SEEK_SET);
+  // short read OK, the eof is just to prevent runaways from bad data.
+  if (read (io_fd, io->buf, incr) < 0)
+    io->saw_eof = 1;
+  else
+    io->saw_eof = 0;
+  pthread_mutex_unlock (&io_mutex);
+}
+
+unsigned char
+io_read (IOPerThreadType *io)
+{
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+  if (io->saw_eof)
+    return 0xff;
+  return io->buf [io->buf_idx++];
+}
+
+unsigned char
+io_peek (IOPerThreadType *io)
+{
+  if (io->buf_idx >= io->incr)
+    io_init (io, io->buf_base + io->buf_idx, io->incr);
+  if (io->saw_eof)
+    return 0xff;
+  return io->buf [io->buf_idx];
+}
+
+size_t
+io_pos (IOPerThreadType *io)
+{
+  return io->buf_base + io->buf_idx;
+}
+
+/*------------------------------------------------------------*/
+
+static void
+wmem (volatile void *ptr, int count)
+{
+  char *p = (char *)ptr;
+  int i;
+
+  if (!p)
+    return;
+
+  for (i=0; i<count; i++)
+    p[i] = 0x11;
+}
+#define xwmem(a,b)
+
+static size_t get_int (IOPerThreadType *io)
+{
+  size_t rv = 0;
+  while (1)
+  {
+    unsigned char c = io_read (io);
+    rv |= (c & 0x7f);
+    if (c & 0x80)
+      rv <<= 7;
+    else
+      return rv;
+  }
+}
+
+static void free_wipe (size_t idx)
+{
+  char *cp = (char *)ptrs[idx];
+  if (cp == NULL)
+    return;
+  size_t sz = sizes[idx];
+  size_t i;
+  for (i=0; i<sz; i++)
+    {
+      if (i % 8 == 1)
+	cp[i] = i / 8;
+      else
+	cp[i] = 0x22;
+    }
+}
+
+static void *
+thread_common (void *my_data_v)
+{
+  pthread_t thrc = pthread_self ();
+  size_t p1, p2, sz, sz2;
+  IOPerThreadType *io = (IOPerThreadType *)my_data_v;
+  ticks_t my_malloc_time = 0, my_malloc_count = 0;
+  ticks_t my_calloc_time = 0, my_calloc_count = 0;
+  ticks_t my_realloc_time = 0, my_realloc_count = 0;
+  ticks_t my_free_time = 0, my_free_count = 0;
+  ticks_t stime, etime;
+  int thread_idx = io - thread_io;
+#ifdef MDEBUG
+  volatile void *tmp;
+#endif
+
+  while (1)
+    {
+      unsigned char this_op = io_peek (io);
+      if (io->saw_eof)
+	myabort();
+      dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io),  io_peek (io));
+      switch (io_read (io))
+	{
+	case C_NOP:
+	  break;
+
+	case C_DONE:
+	  dprintf("op %p:%ld DONE\n", (void *)thrc, io_pos (io));
+	  pthread_mutex_lock (&stat_mutex);
+	  malloc_time += my_malloc_time;
+	  calloc_time += my_calloc_time;
+	  realloc_time += my_realloc_time;
+	  free_time += my_free_time;
+	  malloc_count += my_malloc_count;
+	  calloc_count += my_calloc_count;
+	  realloc_count += my_realloc_count;
+	  free_count += my_free_count;
+	  threads_done ++;
+	  pthread_mutex_unlock (&stat_mutex);
+	  pthread_mutex_lock(&stop_mutex);
+	  pthread_mutex_unlock(&stop_mutex);
+	  return NULL;
+
+	case C_MEMALIGN:
+	  p2 = get_int (io);
+	  sz2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz);
+	  /* we can't force memalign to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  stime = rdtsc_s();
+	  Q1;
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  if (!quick_run)
+	    ptrs[p2] = memalign (sz2, sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  /* Verify the alignment matches what is expected.  */
+	  if (((size_t)ptrs[p2] & (sz2 - 1)) != 0)
+	    myabort ();
+	  sizes[p2] = sz;
+	  mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz);
+	  Q2;
+	  etime = rdtsc_e();
+	  if (ptrs[p2] != NULL)
+	    atomic_rss (sz);
+	  if (etime < stime)
+	    {
+	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+	    }
+	  my_malloc_time += etime - stime;
+	  my_malloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_MALLOC:
+	  p2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+	  /* we can't force malloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  stime = rdtsc_s();
+	  Q1;
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  if (!quick_run)
+	    ptrs[p2] = malloc (sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = malloc(%lx)\n", ptrs[p2], sz);
+	  Q2;
+	  etime = rdtsc_e();
+	  if (ptrs[p2] != NULL)
+	    atomic_rss (sz);
+	  if (etime < stime)
+	    {
+	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
+	    }
+	  my_malloc_time += etime - stime;
+	  my_malloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_CALLOC:
+	  p2 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
+	  /* we can't force calloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+	  if (p2 > n_ptrs)
+	    myabort();
+	  if (ptrs[p2])
+	    {
+	      if (!quick_run)
+		free ((void *)ptrs[p2]);
+	      atomic_rss (-sizes[p2]);
+	    }
+	  stime = rdtsc_s();
+	  Q1;
+	  if (!quick_run)
+	    ptrs[p2] = calloc (sz, 1);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = calloc(%lx)\n", ptrs[p2], sz);
+	  Q2;
+	  if (ptrs[p2])
+	    atomic_rss (sz);
+	  my_calloc_time += rdtsc_e() - stime;
+	  my_calloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  break;
+
+	case C_REALLOC:
+	  p2 = get_int (io);
+	  p1 = get_int (io);
+	  sz = get_int (io);
+	  dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz);
+	  if (p1 > n_ptrs)
+	    myabort();
+	  if (p2 > n_ptrs)
+	    myabort();
+	  /* we can't force realloc to return NULL (fail), so just skip it.  */
+	  if (p2 == 0)
+	    break;
+
+	  if (ptrs[p1])
+	    atomic_rss (-sizes[p1]);
+	  if (!quick_run)
+	    free_wipe(p1);
+	  stime = rdtsc_s();
+	  Q1;
+#ifdef MDEBUG
+	  tmp = ptrs[p1];
+#endif
+	  if (!quick_run)
+	    ptrs[p2] = realloc ((void *)ptrs[p1], sz);
+	  else
+	    ptrs[p2] = (void *)p2;
+	  sizes[p2] = sz;
+	  mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz);
+	  Q2;
+	  my_realloc_time += rdtsc_e() - stime;
+	  my_realloc_count ++;
+	  if (!quick_run)
+	    wmem(ptrs[p2], sz);
+	  if (p1 != p2)
+	    ptrs[p1] = 0;
+	  if (ptrs[p2])
+	    atomic_rss (sizes[p2]);
+	  break;
+
+	case C_FREE:
+	  p1 = get_int (io);
+	  if (p1 > n_ptrs)
+	    myabort();
+	  dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (!quick_run)
+	    free_wipe (p1);
+	  if (ptrs[p1])
+	    atomic_rss (-sizes[p1]);
+	  stime = rdtsc_s();
+	  Q1;
+	  mprintf("free(%p)\n", ptrs[p1]);
+	  if (!quick_run)
+	    free ((void *)ptrs[p1]);
+	  Q2;
+	  my_free_time += rdtsc_e() - stime;
+	  my_free_count ++;
+	  ptrs[p1] = 0;
+	  break;
+
+	case C_SYNC_W:
+	  p1 = get_int(io);
+	  dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (p1 > n_syncs)
+	    myabort();
+	  pthread_mutex_lock (&mutexes[p1]);
+	  syncs[p1] = 1;
+	  pthread_cond_signal (&conds[p1]);
+	  __sync_synchronize ();
+	  pthread_mutex_unlock (&mutexes[p1]);
+	  break;
+
+	case C_SYNC_R:
+	  p1 = get_int(io);
+	  dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1);
+	  if (p1 > n_syncs)
+	    myabort();
+	  pthread_mutex_lock (&mutexes[p1]);
+	  while (syncs[p1] != 1)
+	    {
+	      pthread_cond_wait (&conds[p1], &mutexes[p1]);
+	      __sync_synchronize ();
+	    }
+	  pthread_mutex_unlock (&mutexes[p1]);
+	  break;
+
+	default:
+	  printf("op %d - unsupported, thread %d addr %lu\n",
+		 this_op, thread_idx, (long unsigned int)io_pos (io));
+	  myabort();
+	}
+    }
+}
+
+static void *alloc_mem (size_t amt)	/* AMT zeroed, mlock'd anonymous bytes; NULL on failure */
+{
+  void *rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (rv == MAP_FAILED) return NULL;	/* mmap never returns NULL on error; make callers' !rv tests work */
+  mlock (rv, amt);			/* best effort: the result is deliberately not checked */
+  return memset (rv, 0, amt);		/* memset returns RV; the write also touches every page */
+}
+
+static pthread_t *thread_ids;	/* one handle per thread started by main, in start order; joined at exit */
+
+/* Allocate COUNT items of SIZE bytes via alloc_mem (COUNT is read from IO into *PSZ when PSZ is non-NULL); exits on failure.  */
+void *
+my_malloc (const char *msg, int size, IOPerThreadType *io, size_t *psz, size_t count)
+{
+  void *rv;
+  if (psz)
+    count = *psz = get_int (io);
+  dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count);
+  rv = alloc_mem(size * count);
+  if (!rv)
+    {
+      fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)count); /* PSZ may be NULL here */
+      exit(1);
+    }
+  return rv;
+}
+
+/* Printable names for the region types passed to malloc_scan_callback,
+   indexed by its TYPE argument.  */
+static const char * const scan_names[] = {
+  "UNUSED", "ARENA", "HEAP", "CHUNK_USED", "CHUNK_FREE",
+  "FASTBIN_FREE", "UNSORTED", "TOP", "TCACHE", "USED"
+};
+
+/* Print one line describing the region at PTR of LENGTH bytes.  TYPE values
+   outside scan_names[] are printed as "UNKNOWN" instead of indexing past
+   the end of the table.  */
+void
+malloc_scan_callback (void *ptr, size_t length, int type)
+{
+  size_t n_names = sizeof (scan_names) / sizeof (scan_names[0]);
+  const char *name = (type >= 0 && (size_t) type < n_names) ? scan_names[type] : "UNKNOWN";
+
+  printf("%s: ptr %p length %llx\n", name, ptr, (unsigned long long)length);
+}
+
+#define MY_ALLOC(T, psz)	/* array for T; element count is read from main_io into *psz */ \
+  ((typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0))
+#define MY_ALLOCN(T, count)	/* array for T with an explicit element count */ \
+  ((typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count))
+
+/* Replay the trace2dat workload file named by argv[1]: interpret the
+   main-thread opcodes, start the worker threads they request, print timings.  */
+int
+main(int argc, char **argv)
+{
+  ticks_t start=0;
+  ticks_t end, usec;
+  struct timeval tv_s, tv_e;
+  int thread_idx = 0, i;
+  size_t n_threads = 0, idx;
+  struct rusage res_start, res_end;
+  int done = 0;
+  size_t guessed_io_size = 4096;
+  struct stat statb;
+
+  if (argc < 2)
+    {
+      fprintf(stderr, "Usage: %s <trace2dat.outfile>\n", argv[0]);
+      exit(1);
+    }
+  io_fd = open(argv[1], O_RDONLY);
+  if (io_fd < 0)
+    {
+      fprintf(stderr, "Unable to open %s for reading\n", argv[1]);
+      perror("The error was");
+      exit(1);
+    }
+  if (fstat (io_fd, &statb) != 0)
+    statb.st_size = 0;	/* size unknown; it is only used to guess the I/O buffer size */
+
+  io_init (&main_io, 0, IOMIN);
+  pthread_mutex_lock(&stop_mutex);	/* released near the end of main, see below */
+
+  while (!done)
+    {
+      switch (io_read (&main_io))
+	{
+	case C_NOP:
+	  break;
+	case C_ALLOC_PTRS:
+	  ptrs = MY_ALLOC (ptrs, &n_ptrs);
+	  sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs);
+	  ptrs[0] = 0;
+	  break;
+	case C_ALLOC_SYNCS:
+	  n_syncs = get_int(&main_io);
+	  syncs = MY_ALLOCN (syncs, n_syncs);
+	  conds = MY_ALLOCN (conds, n_syncs);
+	  mutexes = MY_ALLOCN (mutexes, n_syncs);
+	  for (idx=0; idx<n_syncs; idx++)
+	    {
+	      pthread_mutex_init (&mutexes[idx], NULL);
+	      pthread_cond_init (&conds[idx], NULL);
+	    }
+	  break;
+	case C_NTHREADS:
+	  thread_ids = MY_ALLOC (thread_ids, &n_threads);
+	  thread_io = MY_ALLOCN (thread_io, n_threads);
+	  /* A trace may declare zero threads; do not divide by that count.  */
+	  guessed_io_size = (n_threads == 0 || (statb.st_size / n_threads) < (1024*1024)) ? 65536 : 4096;
+	  getrusage (RUSAGE_SELF, &res_start);
+	  gettimeofday (&tv_s, NULL);
+	  start = rdtsc_s();
+	  break;
+	case C_START_THREAD:
+	  idx = get_int (&main_io);
+	  if ((size_t) thread_idx >= n_threads)
+	    myabort();	/* trace starts more threads than C_NTHREADS sized the arrays for */
+	  io_init (& thread_io[thread_idx], idx, guessed_io_size);
+	  pthread_create (&thread_ids[thread_idx], NULL, thread_common, thread_io + thread_idx);
+	  dprintf("Starting thread %lld at offset %lu %lx\n", (long long)thread_ids[thread_idx], (unsigned long)idx, (unsigned long)idx);
+	  thread_idx ++;
+	  break;
+	case C_DONE:
+	  do
+	    {
+	      pthread_mutex_lock (&stat_mutex);
+	      i = threads_done;
+	      pthread_mutex_unlock (&stat_mutex);
+	    } while (i < thread_idx);
+	  done = 1;
+	  break;
+	}
+    }
+  if (!quick_run)
+    {
+      end = rdtsc_e();
+      gettimeofday (&tv_e, NULL);
+      getrusage (RUSAGE_SELF, &res_end);
+
+      printf("%s cycles\n", comma(end - start));
+      usec = diff_timeval (tv_e, tv_s);
+      printf("%s usec wall time\n", comma(usec));
+
+      usec = diff_timeval (res_end.ru_utime, res_start.ru_utime);
+      printf("%s usec across %d thread%s\n",
+	     comma(usec), (int)n_threads, n_threads == 1 ? "" : "s");
+      printf("%s Kb Max RSS (%s -> %s)\n",
+	     comma(res_end.ru_maxrss - res_start.ru_maxrss),
+	     comma(res_start.ru_maxrss), comma(res_end.ru_maxrss));
+    }
+  printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024));
+
+  /* Avoid dividing by zero in the averages below.  */
+  if (malloc_count == 0) malloc_count ++;
+  if (calloc_count == 0) calloc_count ++;
+  if (realloc_count == 0) realloc_count ++;
+  if (free_count == 0) free_count ++;
+  if (!quick_run)
+    {
+      printf("\n");
+      printf("sizeof ticks_t is %zu\n", sizeof(ticks_t));
+      printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count));
+      printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count));
+      printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count));
+      printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count));
+      printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time));
+      printf("\n");
+    }
+
+#if 0
+  /* Free any still-held chunks of memory.  */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+	free((void *)ptrs[idx]);
+	ptrs[idx] = 0;
+      }
+#endif
+
+#if 0
+  /* This will fail (crash) for system glibc but that's OK.  */
+  __malloc_scan_chunks(malloc_scan_callback);
+
+  malloc_info (0, stdout);
+#endif
+
+#if 0
+  /* ...or report them as used.  */
+  for (idx=0; idx<n_ptrs; idx++)
+    if (ptrs[idx])
+      {
+	char *p = (char *)ptrs[idx] - 2*sizeof(size_t);
+	size_t *sp = (size_t *)p;
+	size_t size = sp[1] & ~7;
+	malloc_scan_callback (sp, size, 9);
+      }
+#endif
+
+  /* Now that we've scanned all the per-thread caches, it's safe to
+     let them exit and clean up.  */
+  pthread_mutex_unlock(&stop_mutex);
+
+  for (i=0; i<thread_idx; i++)
+    pthread_join (thread_ids[i], NULL);
+
+  return 0;
+}
diff --git a/src/benchmarks/falsesharing/Makefile b/src/benchmarks/falsesharing/Makefile
new file mode 100644
index 0000000..7dec230
--- /dev/null
+++ b/src/benchmarks/falsesharing/Makefile
@@ -0,0 +1,29 @@
+OBJDIR ?= obj
+# Toolchain and flags; "?=" keeps any value already set by the caller.
+CXX ?= g++
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread
+OPTFLAGS ?= -O3 -DNDEBUG
+
+CXXFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDXXFLAGS ?= -pthread -static-libgcc -static-libstdc++
+# Both binaries are rebuilt when any of the shared headers changes.
+HEADER = cpuinfo.h fred.h timer.h
+
+.PHONY: all clean
+
+all: $(OBJDIR)/cache-thrash $(OBJDIR)/cache-scratch
+
+$(OBJDIR)/cache-thrash: cache-thrash.cc $(HEADER) | $(OBJDIR)
+	$(CXX) $(LDXXFLAGS) $(CXXFLAGS) -o $@ $<
+
+$(OBJDIR)/cache-scratch: cache-scratch.cc $(HEADER) | $(OBJDIR)
+	$(CXX) $(LDXXFLAGS) $(CXXFLAGS) -o $@ $<
+# Order-only prerequisite of both binaries; -p tolerates an existing or nested OBJDIR.
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/falsesharing/cache-scratch.cc b/src/benchmarks/falsesharing/cache-scratch.cc
new file mode 100644
index 0000000..2cb9b28
--- /dev/null
+++ b/src/benchmarks/falsesharing/cache-scratch.cc
@@ -0,0 +1,147 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+//        for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file cache-scratch.cc
+ *
+ * cache-scratch is a benchmark that exercises a heap's cache locality.
+ * An allocator that allows multiple threads to re-use the same small
+ * object (possibly all in one cache-line) will scale poorly, while
+ * an allocator like Hoard will exhibit near-linear scaling.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ *  cache-scratch 1 1000 1 1000000
+ *  cache-scratch P 1000 1 1000000
+ *
+ *  cache-scratch-hoard 1 1000 1 1000000
+ *  cache-scratch-hoard P 1000 1 1000000
+ *
+ *  The ideal is a P-fold speedup.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "fred.h"
+#include "cpuinfo.h"
+#include "timer.h"
+
+// Argument bundle for one worker thread.  main() fills one of these per
+// thread and passes its address to worker() as the void* argument.
+struct workerArg {
+  // Leaves every field uninitialized.
+  workerArg() {}
+
+  // Note the parameter order: repetitions comes before iterations.
+  workerArg (char * obj, int objSize, int repetitions, int iterations)
+    : _object (obj),
+      _objSize (objSize),
+      _iterations (iterations), _repetitions (repetitions)
+  {}
+
+  char * _object;     // buffer the worker frees before it starts its loop
+  int _objSize;       // bytes per allocation
+  int _iterations;    // allocate/free cycles per worker
+  int _repetitions;   // write passes over each allocation
+};
+
+// Thread entry point: ARG is a workerArg*; frees its _object, then runs the allocate/write/free loop.
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+  // free the object we were given.
+  // Then, repeatedly do the following:
+  //   malloc a given-sized object,
+  //   repeatedly write on it,
+  //   then free it.
+  workerArg * w = (workerArg *) arg;
+  delete [] w->_object;  // main() allocates it with new char[], so the array form is required
+  workerArg w1 = *w;
+  for (int i = 0; i < w1._iterations; i++) {
+    // Allocate the object.
+    char * obj = new char[w1._objSize];
+    // Write into it a bunch of times.
+    for (int j = 0; j < w1._repetitions; j++) {
+      for (int k = 0; k < w1._objSize; k++) {
+	obj[k] = (char) k;
+	volatile char ch = obj[k];
+	ch++;
+      }
+    }
+    // Free the object.
+    delete [] obj;
+  }
+
+#if !defined(_WIN32)
+  return NULL;
+#endif
+}
+
+
+// Usage: cache-scratch nthreads iterations objSize repetitions
+int main (int argc, char * argv[])
+{
+  if (argc <= 4) {
+    fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]);
+    return 1;
+  }
+  const int nthreads = atoi(argv[1]);
+  const int iterations = atoi(argv[2]);
+  const int objSize = atoi(argv[3]);
+  const int repetitions = atoi(argv[4]);
+  // nthreads sizes the arrays below and divides repetitions; objSize sizes new char[].
+  if (nthreads < 1 || objSize < 0) {
+    fprintf (stderr, "%s: nthreads must be >= 1 and objSize >= 0\n", argv[0]);
+    return 1;
+  }
+
+  HL::Fred * threads = new HL::Fred[nthreads];
+  HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+  workerArg * w = new workerArg[nthreads];
+
+  int i;
+
+  // Allocate nthreads objects and distribute them among the threads.
+  char ** objs = new char * [nthreads];
+  for (i = 0; i < nthreads; i++) {
+    objs[i] = new char[objSize];
+  }
+
+  HL::Timer t;
+  t.start();
+
+  for (i = 0; i < nthreads; i++) {
+    w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations);
+    threads[i].create (&worker, (void *) &w[i]);
+  }
+  for (i = 0; i < nthreads; i++) {
+    threads[i].join();
+  }
+  t.stop();
+
+  delete [] threads;
+  delete [] objs;  // only the pointer array; each buffer was handed to a worker
+  delete [] w;
+
+  printf ("Time elapsed = %f seconds.\n", (double) t);
+  return 0;
+}
diff --git a/src/benchmarks/falsesharing/cache-thrash.cc b/src/benchmarks/falsesharing/cache-thrash.cc
new file mode 100644
index 0000000..79242eb
--- /dev/null
+++ b/src/benchmarks/falsesharing/cache-thrash.cc
@@ -0,0 +1,134 @@
+///-*-C++-*-//////////////////////////////////////////////////////////////////
+//
+// Hoard: A Fast, Scalable, and Memory-Efficient Allocator
+//        for Shared-Memory Multiprocessors
+// Contact author: Emery Berger, http://www.cs.umass.edu/~emery
+//
+// Copyright (c) 1998-2003, The University of Texas at Austin.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Library General Public License as
+// published by the Free Software Foundation, http://www.fsf.org.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Library General Public License for more details.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * @file  cache-thrash.cc
+ * @brief cache-thrash is a benchmark that exercises a heap's cache-locality.
+ *
+ * Try the following (on a P-processor machine):
+ *
+ *  cache-thrash 1 1000 1 1000000
+ *  cache-thrash P 1000 1 1000000
+ *
+ *  cache-thrash-hoard 1 1000 1 1000000
+ *  cache-thrash-hoard P 1000 1 1000000
+ *
+ *  The ideal is a P-fold speedup.
+*/
+
+
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+#include "cpuinfo.h"
+#include "fred.h"
+#include "timer.h"
+
+// Argument bundle for one worker thread, passed to worker() as its void* argument.
+struct workerArg {
+	// Leaves every field uninitialized.
+	workerArg() {}
+
+	// Note the parameter order: repetitions comes before iterations.
+	workerArg (int objSize, int repetitions, int iterations)
+		: _objSize (objSize), _iterations (iterations), _repetitions (repetitions)
+	{}
+
+	int _objSize;      // bytes per allocation
+	int _iterations;   // allocate/free cycles per worker
+	int _repetitions;  // write passes over each allocation
+};
+
+
+#if defined(_WIN32)
+extern "C" void worker (void * arg)
+#else
+extern "C" void * worker (void * arg)
+#endif
+{
+	// Thread body.  ARG points at a workerArg, which is copied into a local
+	// before the loops start.  Each outer pass allocates a buffer, writes it
+	// _repetitions times, then frees it.
+	const workerArg params = *static_cast<workerArg *>(arg);
+	const int size = params._objSize;
+
+	for (int iter = 0; iter < params._iterations; ++iter) {
+		// Allocate the object.
+		char * buf = new char[size];
+		for (int rep = 0; rep < params._repetitions; ++rep) {
+			for (int pos = 0; pos < size; ++pos) {
+				buf[pos] = (char) pos;
+				// Read the byte back through a volatile and bump the copy.
+				volatile char sink = buf[pos];
+				sink++;
+			}
+		}
+		// Free the object.
+		delete [] buf;
+	}
+
+#if !defined(_WIN32)
+	return NULL;
+#endif
+}
+
+
+// Usage: cache-thrash nthreads iterations objSize repetitions
+int main (int argc, char * argv[])
+{
+	if (argc <= 4) {
+		cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl;
+		exit(1);
+	}
+	const int nthreads = atoi(argv[1]);
+	const int iterations = atoi(argv[2]);
+	const int objSize = atoi(argv[3]);
+	const int repetitions = atoi(argv[4]);
+	// nthreads sizes the arrays below and divides repetitions; objSize sizes new char[].
+	if (nthreads < 1 || objSize < 0) {
+		cerr << argv[0] << ": nthreads must be >= 1 and objSize >= 0" << endl;
+		exit(1);
+	}
+
+	HL::Fred * threads = new HL::Fred[nthreads];
+	HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors());
+
+	int i;
+
+	HL::Timer t;
+	t.start();
+
+	workerArg * w = new workerArg[nthreads];
+
+	for (i = 0; i < nthreads; i++) {
+		w[i] = workerArg (objSize, repetitions / nthreads, iterations);
+		threads[i].create (&worker, (void *) &w[i]);
+	}
+	for (i = 0; i < nthreads; i++) {
+		threads[i].join();
+	}
+	t.stop();
+
+	delete [] threads;
+	delete [] w;
+
+	cout << "Time elapsed = " << (double) t << " seconds." << endl;
+}
diff --git a/src/benchmarks/falsesharing/cpuinfo.h b/src/benchmarks/falsesharing/cpuinfo.h
new file mode 100644
index 0000000..1ed1f36
--- /dev/null
+++ b/src/benchmarks/falsesharing/cpuinfo.h
@@ -0,0 +1,202 @@
+// -*- C++ -*-
+
+/*
+
+  Heap Layers: An Extensible Memory Allocation Infrastructure
+  
+  Copyright (C) 2000-2003 by Emery Berger
+  http://www.cs.umass.edu/~emery
+  emery@cs.umass.edu
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+  
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+
+
+
+#ifndef HL_CPUINFO_H
+#define HL_CPUINFO_H
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+
+#if !defined(_WIN32)
+#include <pthread.h>
+#endif
+
+#if defined(__SVR4) // Solaris
+#include <sys/lwp.h>
+extern "C" unsigned int lwp_self(void);
+#include <thread.h>
+extern "C" int _thr_self(void);
+#endif
+
+#if defined(__linux)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(__sgi)
+#include <sys/types.h>
+#include <sys/sysmp.h>
+#include <sys/sysinfo.h>
+#endif
+
+#if defined(hpux)
+#include <sys/mpctl.h>
+#endif
+
+#if defined(_WIN32)
+extern __declspec(thread) int localThreadId;
+#endif
+
+#if defined(__SVR4) && defined(MAP_ALIGN)
+extern volatile int anyThreadStackCreated;
+#endif
+
+namespace HL {
+
+/**
+ * @class CPUInfo
+ * @author Emery Berger <http://www.cs.umass.edu/~emery>
+ *
+ * @brief Architecture-independent wrapper to get number of CPUs.
+ */
+
+class CPUInfo {
+public:
+  CPUInfo (void)
+  {}
+  /// Processor count from computeNumProcessors(), evaluated once and kept in a function-local static.
+  inline static int getNumProcessors (void) {
+    static int _numProcessors = computeNumProcessors();
+    return _numProcessors;
+  }
+  /// Both are defined after the class; their bodies pick a per-platform implementation with #if.
+  static inline unsigned long getThreadId (void);
+  inline static int computeNumProcessors (void);
+
+};
+
+
+// Number of online processors, computed on the first call and cached in a static.
+// Falls back to 2 when the platform is unsupported or the query reports less than 1.
+int CPUInfo::computeNumProcessors (void)
+{
+  static int np = 0;
+  if (np) return np;
+#if defined(__linux) || defined(__APPLE__)
+  np = (int) sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_WIN32)
+  SYSTEM_INFO infoReturn[1];
+  GetSystemInfo (infoReturn);
+  np = (int) (infoReturn->dwNumberOfProcessors);
+#elif defined(__sgi)
+  np = (int) sysmp(MP_NAPROCS);
+#elif defined(hpux)
+  np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()?
+#elif defined(_SC_NPROCESSORS_ONLN)
+  np = (int) (sysconf(_SC_NPROCESSORS_ONLN));
+#else
+  np = 2;
+  // Unsupported platform.
+  // Pretend we have at least two processors. This approach avoids the risk of assuming
+  // we're on a uniprocessor, which might lead clever allocators to avoid using atomic
+  // operations for all locks.
+#endif
+  if (np < 1) np = 2;  // e.g. sysconf() returns -1 on error; use the same fallback as above
+  return np;
+}
+
+  // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris],
+// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch.
+// pthread_attr_getstacksize
+// pthread_attr_setstackaddr
+// pthread_attr_getstackaddr
+// PTHREAD_STACK_SIZE is minimum.
+// or should we just assume we have __declspec(thread) or __thread?
+
+#if defined(USE_THREAD_KEYWORD)
+  extern __thread int localThreadId;
+#endif
+
+  // FIX ME FIXME
+  //#include <stdio.h>
+// Returns an integer id for the calling thread; the first matching platform branch decides how it is derived.
+unsigned long CPUInfo::getThreadId (void) {
+#if defined(__SVR4)
+  size_t THREAD_STACK_SIZE;
+  if (sizeof(size_t) <= 4) {
+    THREAD_STACK_SIZE = 1048576;
+  } else {
+    // 64-bits.
+    THREAD_STACK_SIZE = 1048576 * 2;
+  }
+  if (0) { // !anyThreadStackCreated) {  -- disabled: the stack-address branch below never runs
+    // We know a priori that all stack variables
+    // are on different stacks. Since no one has created
+    // a special one, we are in control, and thus all stacks
+    // are 1 MB in size and on 1 MB boundaries.
+    // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.)
+    char buf;
+    return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20;
+  } else {
+    return (int) pthread_self();
+  }
+#elif defined(_WIN32)
+  // It looks like thread id's are always multiples of 4, so...
+  return GetCurrentThreadId() >> 2;
+#elif defined(__APPLE__)
+  // Consecutive thread id's in Mac OS are 4096 apart;
+  // dividing off the 4096 gives us an appropriate thread id.
+  int tid = (int) ((unsigned long) pthread_self()) >> 12;
+  return tid;
+#elif defined(__BEOS__)
+  return find_thread(0);
+#elif defined(USE_THREAD_KEYWORD)
+  return localThreadId;
+#elif defined(__linux) || defined(PTHREAD_KEYS_MAX)
+  // Consecutive thread id's in Linux are 1024 apart;
+  // dividing off the 1024 gives us an appropriate thread id.
+  return (unsigned long) pthread_self() >> 10;
+#elif defined(POSIX)
+  return (unsigned long) pthread_self();
+#elif USE_SPROC
+  // This hairiness has the same effect as calling getpid(),
+  // but it's MUCH faster since it avoids making a system call
+  // and just accesses the sproc-local data directly.
+  unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid;
+  return pid;
+#else
+  return 0; // no branch matched: every thread reports the same id
+#endif
+}
+
+}
+
+#endif
diff --git a/src/benchmarks/falsesharing/fred.h b/src/benchmarks/falsesharing/fred.h
new file mode 100644
index 0000000..b0198a7
--- /dev/null
+++ b/src/benchmarks/falsesharing/fred.h
@@ -0,0 +1,97 @@
+// -*- C++ -*-
+
+#ifndef HL_FRED_H
+#define HL_FRED_H
+
+/// A thread-wrapper of childlike simplicity :).
+
+#if defined(_WIN32)
+
+  #include <windows.h>
+  #include <process.h>
+
+#elif defined(__SVR4)
+
+  #include <thread.h>
+  #include <pthread.h>
+  #include <unistd.h>
+
+#else
+
+  #include <pthread.h>
+  #include <unistd.h>
+
+#endif
+
+typedef void * (*ThreadFunctionType) (void *);
+
+namespace HL {
+
+class Fred {
+public:
+  /// Non-Windows: initializes the attribute object used by create() and requests system contention scope.
+  Fred() {
+#if !defined(_WIN32)
+    pthread_attr_init (&attr);
+    pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
+#endif
+  }
+  /// Non-Windows: destroys the attribute object.  The thread itself is not joined here.
+  ~Fred() {
+#if !defined(_WIN32)
+    pthread_attr_destroy (&attr);
+#endif
+  }
+  /// Starts FUNCTION(ARG) on a new thread and stores its handle in t; the creation result is not checked.
+  void create (ThreadFunctionType function, void * arg) {
+#if defined(_WIN32)
+    t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0);
+#else
+    pthread_create (&t, &attr, function, arg);
+#endif
+  }
+  /// Blocks until the thread whose handle is stored in t has finished.
+  void join (void) {
+#if defined(_WIN32)
+    WaitForSingleObject (t, INFINITE);
+#else
+    pthread_join (t, NULL);
+#endif
+  }
+  /// Lets the calling thread give up the processor.
+  static void yield (void) {
+#if defined(_WIN32)
+    Sleep (0);
+#elif defined(__SVR4)
+    thr_yield();
+#else
+    sched_yield();
+#endif
+  }
+
+  /// Passes the concurrency level N to the threading library; does nothing on Windows.
+  static void setConcurrency (int n) {
+#if defined(_WIN32)
+#elif defined(__SVR4)
+    thr_setconcurrency (n);
+#else
+    pthread_setconcurrency (n);
+#endif
+  }
+
+
+private:
+#if defined(_WIN32)
+  typedef HANDLE FredType;
+#else
+  typedef pthread_t FredType;
+  pthread_attr_t attr;
+#endif
+  /// Native handle written by create() and read by join().
+  FredType t;
+};
+
+}
+
+
+#endif
diff --git a/src/benchmarks/falsesharing/timer.h b/src/benchmarks/falsesharing/timer.h
new file mode 100644
index 0000000..d4d42c7
--- /dev/null
+++ b/src/benchmarks/falsesharing/timer.h
@@ -0,0 +1,372 @@
+/* -*- C++ -*- */
+
+/*
+
+  Heap Layers: An Extensible Memory Allocation Infrastructure
+  
+  Copyright (C) 2000-2003 by Emery Berger
+  http://www.cs.umass.edu/~emery
+  emery@cs.umass.edu
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+  
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+*/
+
+#include <cassert>
+#include <stdio.h>
+
+
+#ifndef _TIMER_H_
+#define _TIMER_H_
+
+/**
+ * @class Timer
+ * @brief A portable class for high-resolution timing.
+ *
+ * This class simplifies timing measurements across a number of platforms.
+ * 
+ * @code
+ *  Timer t;
+ *  t.start();
+ *  // do some work
+ *  t.stop();
+ *  cout << "That took " << (double) t << " seconds." << endl;
+ * @endcode
+ *
+ */
+
+#ifdef __APPLE__
+#include <sys/time.h>
+#endif
+
+#if defined(__linux__) && defined(__GNUG__) && defined(__i386__)
+
+#include <stdio.h>
+#include <limits.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+// Executes rdtsc and stores EAX in TLO and EDX in THI (the "=a" and "=d" output constraints).
+static void getTime (unsigned long& tlo, unsigned long& thi) {
+  asm volatile ("rdtsc"
+		: "=a"(tlo),
+		"=d" (thi));
+}
+
+// Returns the divisor Timer applies to a cycle count: the constant 2600000.0 (the calibration code is under #if 0).
+static double getFrequency (void) {
+  static double freq = 0.0;
+  static bool initialized = false;
+  unsigned long LTime0, LTime1, HTime0, HTime1; // only referenced by the disabled calibration code
+  if (!initialized) { 
+
+    freq = 2600000.0;
+
+#if 0
+    // Compute MHz directly.
+    // Wait for approximately one second.
+    
+    getTime (LTime0, HTime0);
+    //    printf ("waiting...\n");
+    struct timespec rqtp, rmtp;
+    rqtp.tv_sec = 1;
+    rqtp.tv_nsec = 0;
+    nanosleep (&rqtp, &rmtp);
+    // printf ("done.\n");
+    getTime (LTime1, HTime1);
+
+    freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0);
+    if (LTime1 < LTime0) {
+      freq -= (double)UINT_MAX;
+    }
+#endif
+    initialized = true;
+
+  } else {
+    // printf ("wha?\n");
+  }
+  return freq;
+}
+
+
+namespace HL {
+// rdtsc-based timer, compiled only when __linux__, __GNUG__ and __i386__ are all defined.
+class Timer {
+public:
+  Timer (void)
+    : timeElapsed (0.0)
+  {
+    _frequency = getFrequency();
+    //    printf ("wooo!\n");
+    //  printf ("freq = %lf\n", frequency);
+  }
+  void start (void) {
+    getTime (currentLo, currentHi); // remember the counter halves for stop()
+  }
+  void stop (void) {
+    unsigned long lo, hi;
+    getTime (lo, hi);
+    double now = (double) hi * 4294967296.0 + lo; // hi * 2^32 + lo
+    double prev = (double) currentHi * 4294967296.0 + currentLo;
+    timeElapsed = (now - prev) / _frequency; // overwritten on every stop(), not accumulated
+  }
+
+  operator double (void) {
+    return timeElapsed;
+  }
+
+private:
+  double timeElapsed;
+  unsigned long currentLo, currentHi;
+  double _frequency;
+};
+
+};
+
+#else
+
+
+#ifdef __SVR4 // Solaris
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#include <stdio.h>
+#endif // __SVR4
+
+#include <time.h>
+
+#if defined(unix) || defined(__linux)
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+
+#ifdef __sgi
+#include <sys/types.h>
+#include <sys/times.h>
+#include <limits.h>
+#endif
+
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+
+#if defined(__BEOS__)
+#include <OS.h>
+#endif
+
+
+namespace HL {
+/// Stopwatch whose tick source (_time) and tick-to-seconds conversion are selected per platform below.
+class Timer {
+
+public:
+
+  /// Initializes the timer.
+  Timer (void)
+#if !defined(_WIN32)
+    : _starttime (0),
+      _elapsedtime (0)
+#endif
+  {
+  }
+
+  /// Start the timer.
+  void start (void) { _starttime = _time(); }
+
+  /// Stop the timer, adding the ticks since start() to the accumulated total.
+  void stop (void) { _elapsedtime += _time() - _starttime; }
+
+  /// Reset the timer.  NOTE(review): this copies _elapsedtime into _starttime and leaves _elapsedtime unchanged; confirm intent.
+  void reset (void) { _starttime = _elapsedtime; }
+
+#if 0
+  // Set the timer.
+  void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);}
+#endif
+
+  /// Return the number of seconds elapsed.
+  operator double (void) { return _timetosec (_elapsedtime); }
+  /// Return the current reading of the platform clock, converted to seconds.
+  static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); }
+
+
+private:
+
+  // The _timer variable will be different depending on the OS.
+  // We try to use the best timer available.
+
+#ifdef __sgi
+#define TIMER_FOUND
+
+  long _starttime, _elapsedtime;
+
+  long _time (void) {
+    struct tms t;
+    long ticks = times (&t);
+    return ticks;
+  }
+
+  static double _timetosec (long t) {
+    return ((double) (t) / CLK_TCK);
+  }
+
+  static long _sectotime (double sec) {
+    return (long) sec * CLK_TCK; // NOTE(review): the cast applies to sec alone, truncating before the multiply
+  }
+#endif
+
+#ifdef __SVR4 // Solaris
+#define TIMER_FOUND
+  typedef hrtime_t TimeType;
+  TimeType	_starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    return gethrtime();
+  }
+
+  static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); }
+
+  static double _timetosec (TimeType& t) {
+    return ((double) (t) / 1.0e9);
+  }
+#endif // __SVR4
+
+#if defined(MAC) || defined(macintosh)
+#define TIMER_FOUND
+  double		_starttime, _elapsedtime;
+
+  double _time (void) {
+    return get_Mac_microseconds();
+  }
+
+  double _timetosec (hrtime_t& t) {
+    return t;
+  }
+#endif // MAC
+
+#ifdef _WIN32
+#define TIMER_FOUND
+
+#ifndef __GNUC__
+  class TimeType {
+  public:
+    TimeType (void)
+    {
+      largeInt.QuadPart = 0;
+    }
+    operator double& (void) { return (double&) largeInt.QuadPart; }
+    operator LARGE_INTEGER& (void) { return largeInt; }
+    double timeToSec (void) {
+      return (double) largeInt.QuadPart / getFreq();
+    }
+  private:
+    double getFreq (void) {
+      QueryPerformanceFrequency (&freq);
+      return (double) freq.QuadPart;
+    }
+
+    LARGE_INTEGER largeInt;
+    LARGE_INTEGER freq;
+  };
+
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    TimeType t;
+    int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t));
+    assert (r);
+    return t;
+  }
+
+  static double _timetosec (TimeType& t) {
+    return t.timeToSec();
+  }
+#else
+  typedef DWORD TimeType;
+  DWORD _starttime, _elapsedtime;
+  static DWORD _time (void) {
+    return GetTickCount();
+  }
+
+  static double _timetosec (DWORD& t) {
+    return (double) t / 100000.0; // NOTE(review): GetTickCount() is documented in milliseconds; confirm this divisor
+  }
+  static unsigned long _sectotime (double sec) {
+    return (unsigned long)(sec);
+  }
+#endif
+#endif // _WIN32
+
+
+#ifdef __BEOS__
+#define TIMER_FOUND
+  bigtime_t _starttime, _elapsedtime;
+  bigtime_t _time(void) {
+    return system_time();
+  }
+  double _timetosec (bigtime_t& t) {
+    return (double) t / 1000000.0;
+  }
+  
+  bigtime_t _sectotime (double sec) {
+    return (bigtime_t)(sec * 1000000.0);
+  }
+#endif // __BEOS__
+
+#ifndef TIMER_FOUND
+  // Fallback when no branch above defined TIMER_FOUND: gettimeofday(), counted in microseconds.
+  typedef long TimeType;
+  TimeType _starttime, _elapsedtime;
+
+  static TimeType _time (void) {
+    struct timeval t;
+    gettimeofday (&t, NULL);
+    return t.tv_sec * 1000000 + t.tv_usec;
+  }
+
+  static double _timetosec (TimeType t) {
+    return ((double) (t) / 1000000.0);
+  }
+
+  static TimeType _sectotime (double sec) {
+    return (TimeType) (sec * 1000000.0);
+  }
+
+#endif // TIMER_FOUND
+
+#undef TIMER_FOUND
+
+};
+
+
+#ifdef __SVR4 // Solaris
+class VirtualTimer : public Timer {
+public:
+  hrtime_t _time (void) {
+    return gethrvtime();
+  }
+};  
+#endif
+
+}
+
+#endif
+
+#endif
diff --git a/src/benchmarks/fred.h b/src/benchmarks/fred.h
deleted file mode 100644
index b0198a7..0000000
--- a/src/benchmarks/fred.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// -*- C++ -*-
-
-#ifndef HL_FRED_H
-#define HL_FRED_H
-
-/// A thread-wrapper of childlike simplicity :).
-
-#if defined(_WIN32)
-
-  #include <windows.h>
-  #include <process.h>
-
-#elif defined(__SVR4)
-
-  #include <thread.h>
-  #include <pthread.h>
-  #include <unistd.h>
-
-#else
-
-  #include <pthread.h>
-  #include <unistd.h>
-
-#endif
-
-typedef void * (*ThreadFunctionType) (void *);
-
-namespace HL {
-
-class Fred {
-public:
-
-  Fred() {
-#if !defined(_WIN32)
-    pthread_attr_init (&attr);
-    pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM);
-#endif
-  }
-
-  ~Fred() {
-#if !defined(_WIN32)
-    pthread_attr_destroy (&attr);
-#endif
-  }
-
-  void create (ThreadFunctionType function, void * arg) {
-#if defined(_WIN32)
-    t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0);
-#else
-    pthread_create (&t, &attr, function, arg);
-#endif
-  }
-
-  void join (void) {
-#if defined(_WIN32)
-    WaitForSingleObject (t, INFINITE);
-#else
-    pthread_join (t, NULL);
-#endif
-  }
-
-  static void yield (void) {
-#if defined(_WIN32)
-    Sleep (0);
-#elif defined(__SVR4)
-    thr_yield();
-#else
-    sched_yield();
-#endif
-  }
-
-
-  static void setConcurrency (int n) {
-#if defined(_WIN32)
-#elif defined(__SVR4)
-    thr_setconcurrency (n);
-#else
-    pthread_setconcurrency (n);
-#endif
-  }
-
-
-private:
-#if defined(_WIN32)
-  typedef HANDLE FredType;
-#else
-  typedef pthread_t FredType;
-  pthread_attr_t attr;
-#endif
-
-  FredType t;
-};
-
-}
-
-
-#endif
diff --git a/src/benchmarks/larson.cc b/src/benchmarks/larson.cc
deleted file mode 100644
index be8038f..0000000
--- a/src/benchmarks/larson.cc
+++ /dev/null
@@ -1,744 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-
-#if defined(_WIN32)
-#define __WIN32__
-#endif
-
-#ifdef __WIN32__
-#include  <windows.h>
-#include  <conio.h>
-#include  <process.h>
-
-#else
-#include <unistd.h>
-#include <sys/resource.h>
-#include <sys/time.h>
-
-#ifndef __SVR4
-//extern "C" int pthread_setconcurrency (int) throw();
-#include <pthread.h>
-#endif
-
-
-typedef void * LPVOID;
-typedef long long LONGLONG;
-typedef long DWORD;
-typedef long LONG;
-typedef unsigned long ULONG;
-typedef union _LARGE_INTEGER {
-  struct {
-    DWORD LowPart;
-    LONG  HighPart;
-  } foo;
-  LONGLONG QuadPart;    // In Visual C++, a typedef to _ _int64} LARGE_INTEGER;
-} LARGE_INTEGER;
-typedef long long _int64;
-#ifndef TRUE
-enum { TRUE = 1, FALSE = 0 };
-#endif
-#include <assert.h>
-#define _ASSERTE(x) assert(x)
-#define _inline inline
-void Sleep (long x) 
-{
-  //  printf ("sleeping for %ld seconds.\n", x/1000);
-  sleep(x/1000);
-}
-
-void QueryPerformanceCounter (long * x)
-{
-  struct timezone tz;
-  struct timeval tv;
-  gettimeofday (&tv, &tz);
-  *x = tv.tv_sec * 1000000L + tv.tv_usec;
-}
-
-void QueryPerformanceFrequency(long * x)
-{
-  *x = 1000000L;
-}
-
-
-#include  <stdio.h>
-#include  <stdlib.h>
-#include  <stddef.h>
-#include  <string.h>
-#include  <ctype.h>
-#include  <time.h>
-#include  <assert.h>
-
-#define _REENTRANT 1
-#include <pthread.h>
-#ifdef __sun
-#include <thread.h>
-#endif
-typedef void * VoidFunction (void *);
-void _beginthread (VoidFunction x, int, void * z)
-{
-  pthread_t pt;
-  pthread_attr_t pa;
-  pthread_attr_init (&pa);
-
-#if 1//defined(__SVR4)
-  pthread_attr_setscope (&pa, PTHREAD_SCOPE_SYSTEM); /* bound behavior */
-#endif
-
-  //  printf ("creating a thread.\n");
-  int v = pthread_create(&pt, &pa, x, z);
-  //  printf ("v = %d\n", v);
-}
-#endif
-
-
-#if 0
-static char buf[65536];
-
-#define malloc(v) &buf
-#define free(p) 
-#endif
-
-#undef CPP
-//#define CPP
-//#include "arch-specific.h"
-
-#if USE_ROCKALL
-//#include "FastHeap.hpp"
-//FAST_HEAP theFastHeap (1024 * 1024, true, true, true);
-
-typedef int SBIT32;
-
-#include "SmpHeap.hpp"
-SMP_HEAP theFastHeap (1024 * 1024, true, true, true);
-
-void * operator new( unsigned int cb )
-{
-  void *pRet = theFastHeap.New ((size_t)cb) ;
-  return pRet;
-}
-
-void operator delete(void *pUserData )
-{
-  theFastHeap.Delete (pUserData) ;
-}
-#endif
-
-#if 0
-extern "C" void * hdmalloc (size_t sz) ;
-extern "C" void hdfree (void * ptr) ;
-extern "C" void hdmalloc_stats (void) ;
-void * operator new( unsigned int cb )
-{
-  void *pRet = hdmalloc((size_t)cb) ;
-  return pRet;
-}
-
-void operator delete(void *pUserData )
-{
-  hdfree(pUserData) ;
-}
-#endif
-
-
-
-/* Test driver for memory allocators           */
-/* Author: Paul Larson, palarson@microsoft.com */
-#define MAX_THREADS     100
-#define MAX_BLOCKS  20000000
-
-int volatile  stopflag=FALSE ;       
-
-struct lran2_st {
-  long x, y, v[97];
-};
-
-int     TotalAllocs=0 ;
-
-typedef struct thr_data {
-
-  int    threadno ;
-  int    NumBlocks ;
-  int    seed ;
-
-  int    min_size ;
-  int    max_size ;
-
-  char * *array ;
-  int    *blksize ;
-  int     asize ;
-
-  unsigned long    cAllocs ;
-  unsigned long    cFrees ;
-  int    cThreads ;
-  unsigned long    cBytesAlloced ;
-
-  volatile int finished ;
-  struct lran2_st rgen ;
-
-} thread_data;
-
-void runthreads(long sleep_cnt, int min_threads, int max_threads, 
-		int chperthread, int num_rounds) ;
-void runloops(long sleep_cnt, int num_chunks ) ;
-static void warmup(char **blkp, int num_chunks );
-static void * exercise_heap( void *pinput) ;
-static void lran2_init(struct lran2_st* d, long seed) ;
-static long lran2(struct lran2_st* d) ;
-ULONG CountReservedSpace() ;
- 
-char **          blkp = new char *[MAX_BLOCKS] ;
-int *           blksize = new int[MAX_BLOCKS] ;
-long            seqlock=0 ;
-struct lran2_st rgen ;
-int             min_size=10, max_size=500 ;
-int             num_threads ;
-ULONG           init_space ;
-
-extern  int   cLockSleeps ;
-extern  int   cAllocedChunks ;
-extern  int   cAllocedSpace ;
-extern  int   cUsedSpace ;
-extern  int   cFreeChunks ;
-extern  int   cFreeSpace ;
-
-int cChecked=0 ;
-
-#if defined(_WIN32)
-extern "C" {
-  extern HANDLE crtheap;
-};
-#endif
-
-int main (int argc, char *argv[])
-{
-#if defined(USE_LFH) && defined(_WIN32)
-  // Activate 'Low Fragmentation Heap'.
-  ULONG info = 2;
-  HeapSetInformation (GetProcessHeap(),
-		      HeapCompatibilityInformation,
-		      &info,
-		      sizeof(info));
-#endif
-#if 0 // defined(__SVR4)
- {
-   psinfo_t ps;
-   int pid = getpid();
-   char fname[255];
-   sprintf (fname, "/proc/%d/psinfo", pid);
-   // sprintf (fname, "/proc/self/ps");
-   FILE * f = fopen (fname, "rb");
-   printf ("opening %s\n", fname);
-   if (f) {
-     fread (&ps, sizeof(ps), 1, f);
-     printf ("resident set size = %dK\n", ps.pr_rssize);
-     fclose (f);
-   }
- }
-#endif
-
-#if defined(_MT) || defined(_REENTRANT)
-  int          min_threads, max_threads ;
-  int          num_rounds ;
-  int          chperthread ;
-#endif
-  unsigned     seed=12345 ;
-  int          num_chunks=10000;
-  long sleep_cnt;
-
-  if (argc > 7) {
-    sleep_cnt = atoi(argv[1]);
-    min_size = atoi(argv[2]);
-    max_size = atoi(argv[3]);
-    chperthread = atoi(argv[4]);
-    num_rounds = atoi(argv[5]);
-    seed = atoi(argv[6]);
-    max_threads = atoi(argv[7]);
-    min_threads = max_threads;
-    printf ("sleep = %ld, min = %d, max = %d, per thread = %d, num rounds = %d, seed = %d, max_threads = %d, min_threads = %d\n",
-	    sleep_cnt, min_size, max_size, chperthread, num_rounds, seed, max_threads, min_threads);
-    goto DoneWithInput;
-  }
-
-#if defined(_MT) || defined(_REENTRANT)
-  //#ifdef _MT
-  printf( "\nMulti-threaded test driver \n") ;
-#else
-  printf( "\nSingle-threaded test driver \n") ;
-#endif
-#ifdef CPP
-  printf("C++ version (new and delete)\n") ;
-#else
-  printf("C version (malloc and free)\n") ;
-#endif
-  printf("runtime (sec): ") ;
-  scanf ("%ld", &sleep_cnt);
-
-  printf("chunk size (min,max): ") ;
-  scanf("%d %d", &min_size, &max_size ) ;
-#if defined(_MT) || defined(_REENTRANT)
-  //#ifdef _MT
-  printf("threads (min, max):   ") ; 
-  scanf("%d %d", &min_threads, &max_threads) ;
-  printf("chunks/thread:  ") ; scanf("%d", &chperthread ) ;
-  printf("no of rounds:   ") ; scanf("%d", &num_rounds ) ;
-  num_chunks = max_threads*chperthread ;
-#else 
-  printf("no of chunks:  ") ; scanf("%d", &num_chunks ) ;
-#endif
-  printf("random seed:    ") ; scanf("%d", &seed) ;
-
- DoneWithInput:
-
-  if( num_chunks > MAX_BLOCKS ){
-    printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ;
-    return(1) ;
-  }
-
-#ifndef __WIN32__
-#ifdef __SVR4
-  pthread_setconcurrency (max_threads);
-#endif
-#endif
-
-  lran2_init(&rgen, seed) ;
-  // init_space = CountReservedSpace() ;
-
-#if defined(_MT) || defined(_REENTRANT)
-  //#ifdef _MT
-  runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ;
-#else
-  runloops(sleep_cnt, num_chunks ) ;
-#endif
-
-#ifdef _DEBUG
-  _cputs("Hit any key to exit...") ;	(void)_getch() ;
-#endif
-
-  return 0;
-
-} /* main */
-
-void runloops(long sleep_cnt, int num_chunks )
-{
-  int     cblks ;
-  int     victim ;
-  int     blk_size ;
-#ifdef __WIN32__
-	_LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
-#else
-  long ticks_per_sec ;
-  long start_cnt, end_cnt ;
-#endif
-  _int64        ticks ;
-  double        duration ;
-  double        reqd_space ;
-  ULONG         used_space ;
-  int           sum_allocs=0 ;
-
-  QueryPerformanceFrequency( &ticks_per_sec ) ;
-  QueryPerformanceCounter( &start_cnt) ;
-
-  for( cblks=0; cblks<num_chunks; cblks++){
-    if (max_size == min_size) {
-      blk_size = min_size;
-    } else {
-      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
-    }
-#ifdef CPP
-    blkp[cblks] = new char[blk_size] ;
-#else
-    blkp[cblks] = (char *) malloc(blk_size) ;
-#endif
-    blksize[cblks] = blk_size ;
-    assert(blkp[cblks] != NULL) ;
-  }
-
-  while(TRUE){
-    for( cblks=0; cblks<num_chunks; cblks++){
-      victim = lran2(&rgen)%num_chunks ;
-#ifdef CPP
-      delete blkp[victim] ;
-#else
-      free(blkp[victim]) ;
-#endif
-
-      if (max_size == min_size) {
-	blk_size = min_size;
-      } else {
-	blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
-      }
-#ifdef CPP
-      blkp[victim] = new char[blk_size] ;
-#else
-      blkp[victim] = (char *) malloc(blk_size) ;
-#endif
-      blksize[victim] = blk_size ;
-      assert(blkp[victim] != NULL) ;
-    }
-    sum_allocs += num_chunks ;
-
-    QueryPerformanceCounter( &end_cnt) ;
-#ifdef __WIN32__
-		ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
-		duration = (double)ticks/ticks_per_sec.QuadPart ;
-#else
-    ticks = end_cnt - start_cnt ;
-    duration = (double)ticks/ticks_per_sec ;
-#endif
-
-    if( duration >= sleep_cnt) break ;
-  }
-  reqd_space = (0.5*(min_size+max_size)*num_chunks) ;
-  // used_space = CountReservedSpace() - init_space;
-
-  printf("%6.3f", duration  ) ;
-  printf("%8.0f", sum_allocs/duration ) ;
-  printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
-  printf("\n") ;
-
-}
-
-
-#if defined(_MT) || defined(_REENTRANT)
-//#ifdef _MT
-void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds)
-{
-  thread_data *de_area = new thread_data[max_threads] ;
-  thread_data *pdea;
-  int           nperthread ;
-  int           sum_threads ;
-  unsigned long  sum_allocs ;
-  unsigned long  sum_frees ;
-  double        duration ;
-#ifdef __WIN32__
-	_LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
-#else
-	long ticks_per_sec ;
-  long start_cnt, end_cnt ;
-#endif
-	_int64        ticks ;
-  double        rate_1=0, rate_n ;
-  double        reqd_space ;
-  ULONG         used_space ;
-  int           prevthreads ;
-  int           i ;
-
-  QueryPerformanceFrequency( &ticks_per_sec ) ;
-
-  pdea = &de_area[0] ;
-  memset(&de_area[0], 0, sizeof(thread_data)) ;
-
-  prevthreads = 0 ;
-  for(num_threads=min_threads; num_threads <= max_threads; num_threads++ )
-    {
-
-      warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread );
-
-      nperthread = chperthread ;
-      stopflag   = FALSE ;
-		
-      for(i=0; i< num_threads; i++){
-	de_area[i].threadno    = i+1 ;
-	de_area[i].NumBlocks   = num_rounds*nperthread;
-	de_area[i].array       = &blkp[i*nperthread] ;
-	de_area[i].blksize     = &blksize[i*nperthread] ;
-	de_area[i].asize       = nperthread ;
-	de_area[i].min_size    = min_size ;
-	de_area[i].max_size    = max_size ;
-	de_area[i].seed        = lran2(&rgen) ; ;
-	de_area[i].finished    = 0 ;
-	de_area[i].cAllocs     = 0 ;
-	de_area[i].cFrees      = 0 ;
-	de_area[i].cThreads    = 0 ;
-	de_area[i].finished    = FALSE ;
-	lran2_init(&de_area[i].rgen, de_area[i].seed) ;
-
-#ifdef __WIN32__
-	_beginthread((void (__cdecl*)(void *)) exercise_heap, 0, &de_area[i]) ;  
-#else
-	_beginthread(exercise_heap, 0, &de_area[i]) ;  
-#endif
-
-	}
-
-      QueryPerformanceCounter( &start_cnt) ;
-
-      // printf ("Sleeping for %ld seconds.\n", sleep_cnt);
-      Sleep(sleep_cnt * 1000L) ;
-
-      stopflag = TRUE ;
-
-      for(i=0; i<num_threads; i++){
-	while( !de_area[i].finished ){
-#ifdef __WIN32__
-		Sleep(1);
-#elif defined(__SVR4)
-		thr_yield();
-#else
-		sched_yield();
-#endif
-	}
-      }
-
-
-      QueryPerformanceCounter( &end_cnt) ;
-
-      sum_frees = sum_allocs =0  ;
-      sum_threads = 0 ;
-      for(i=0;i< num_threads; i++){
-	sum_allocs    += de_area[i].cAllocs ;
-	sum_frees     += de_area[i].cFrees ;
-	sum_threads   += de_area[i].cThreads ;
-	de_area[i].cAllocs = de_area[i].cFrees = 0;
-      }
-
- 
-#ifdef __WIN32__
-      ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
-     duration = (double)ticks/ticks_per_sec.QuadPart ;
-#else
-      ticks = end_cnt - start_cnt ;
-     duration = (double)ticks/ticks_per_sec ;
-#endif
-
-      for( i=0; i<num_threads; i++){
-	if( !de_area[i].finished )
-	  printf("Thread at %d not finished\n", i) ;
-      }
-
-
-      rate_n = sum_allocs/duration ;
-      if( rate_1 == 0){
-	rate_1 = rate_n ;
-      }
-		
-      reqd_space = (0.5*(min_size+max_size)*num_threads*chperthread) ;
-      // used_space = CountReservedSpace() - init_space;
-      used_space = 0;
-      
-      printf ("Throughput = %8.0f operations per second.\n", sum_allocs / duration);
-
-#if 0
-      printf("%2d ", num_threads ) ;
-      printf("%6.3f", duration  ) ;
-      printf("%6.3f", rate_n/rate_1 ) ;
-      printf("%8.0f", sum_allocs/duration ) ;
-      printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
-      printf("\n") ;
-#endif
-
-      Sleep(5000L) ; // wait 5 sec for old threads to die
-
-      prevthreads = num_threads ;
-
-      printf ("Done sleeping...\n");
-
-    }
-  delete [] de_area;
-}
-
-
-static void * exercise_heap( void *pinput)
-{
-  thread_data  *pdea;
-  int           cblks=0 ;
-  int           victim ;
-  long          blk_size ;
-  int           range ;
-
-  if( stopflag ) return 0;
-
-  pdea = (thread_data *)pinput ;
-  pdea->finished = FALSE ;
-  pdea->cThreads++ ;
-  range = pdea->max_size - pdea->min_size ;
-
-  /* allocate NumBlocks chunks of random size */
-  for( cblks=0; cblks<pdea->NumBlocks; cblks++){
-    victim = lran2(&pdea->rgen)%pdea->asize ;
-#ifdef CPP
-    delete pdea->array[victim] ;
-#else
-    free(pdea->array[victim]) ;
-#endif
-    pdea->cFrees++ ;
-
-    if (range == 0) {
-      blk_size = pdea->min_size;
-    } else {
-      blk_size = pdea->min_size+lran2(&pdea->rgen)%range ;
-    }
-#ifdef CPP
-    pdea->array[victim] = new char[blk_size] ;
-#else
-    pdea->array[victim] = (char *) malloc(blk_size) ;
-#endif
-
-    pdea->blksize[victim] = blk_size ;
-    assert(pdea->array[victim] != NULL) ;
-
-    pdea->cAllocs++ ;
-
-		/* Write something! */
-
-		volatile char * chptr = ((char *) pdea->array[victim]);
-		*chptr++ = 'a';
-		volatile char ch = *((char *) pdea->array[victim]);
-		*chptr = 'b';
-
-    
-		if( stopflag ) break ;
-  }
-
-  //  	printf("Thread %u terminating: %d allocs, %d frees\n",
-  //		      pdea->threadno, pdea->cAllocs, pdea->cFrees) ;
-  pdea->finished = TRUE ;
-
-  if( !stopflag ){
-#ifdef __WIN32__
-	_beginthread((void (__cdecl*)(void *)) exercise_heap, 0, pdea) ;  
-#else
-    _beginthread(exercise_heap, 0, pdea) ;
-#endif
-  } else {
-    printf ("thread stopping.\n");
-  }
-#ifndef _WIN32
-  pthread_exit (NULL);
-#endif
-  return 0;
-}
-
-static void warmup(char **blkp, int num_chunks )
-{
-  int     cblks ;
-  int     victim ;
-  int     blk_size ;
-  LPVOID  tmp ;
-
-
-  for( cblks=0; cblks<num_chunks; cblks++){
-    if (min_size == max_size) {
-      blk_size = min_size;
-    } else {
-      blk_size = min_size+lran2(&rgen)%(max_size-min_size) ;
-    }
-#ifdef CPP
-    blkp[cblks] = new char[blk_size] ;
-#else
-    blkp[cblks] = (char *) malloc(blk_size) ;
-#endif
-    blksize[cblks] = blk_size ;
-    assert(blkp[cblks] != NULL) ;
-  }
-
-  /* generate a random permutation of the chunks */
-  for( cblks=num_chunks; cblks > 0 ; cblks--){
-    victim = lran2(&rgen)%cblks ;
-    tmp = blkp[victim] ;
-    blkp[victim]  = blkp[cblks-1] ;
-    blkp[cblks-1] = (char *) tmp ;
-  }
-
-  for( cblks=0; cblks<4*num_chunks; cblks++){
-    victim = lran2(&rgen)%num_chunks ;
-#ifdef CPP
-    delete blkp[victim] ;
-#else
-    free(blkp[victim]) ;
-#endif
-
-    if (max_size == min_size) {
-      blk_size = min_size;
-    } else {
-      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
-    }
-#ifdef CPP
-    blkp[victim] = new char[blk_size] ;
-#else
-    blkp[victim] = (char *) malloc(blk_size) ;
-#endif
-    blksize[victim] = blk_size ;
-    assert(blkp[victim] != NULL) ;
-  }
-}
-#endif // _MT
-
-#ifdef __WIN32__
-ULONG CountReservedSpace()
-{
-  MEMORY_BASIC_INFORMATION info;
-  char                     *addr=NULL ;
-  ULONG                     size=0 ;
-
-  while( true){
-    VirtualQuery(addr, &info, sizeof(info));
-    switch( info.State){
-    case MEM_FREE:
-    case MEM_RESERVE:
-      break ;
-    case MEM_COMMIT:
-      size += info.RegionSize ;
-      break ;
-    }
-    addr += info.RegionSize ;
-    if( addr >= (char *)0x80000000UL ) break ;
-  }
-
-  return size ;
-
-}
-#endif
-
-// =======================================================
-
-/* lran2.h
- * by Wolfram Gloger 1996.
- *
- * A small, portable pseudo-random number generator.
- */
-
-#ifndef _LRAN2_H
-#define _LRAN2_H
-
-#define LRAN2_MAX 714025l /* constants for portable */
-#define IA	  1366l	  /* random number generator */
-#define IC	  150889l /* (see e.g. `Numerical Recipes') */
-
-//struct lran2_st {
-//    long x, y, v[97];
-//};
-
-static void
-lran2_init(struct lran2_st* d, long seed)
-{
-  long x;
-  int j;
-
-  x = (IC - seed) % LRAN2_MAX;
-  if(x < 0) x = -x;
-  for(j=0; j<97; j++) {
-    x = (IA*x + IC) % LRAN2_MAX;
-    d->v[j] = x;
-  }
-  d->x = (IA*x + IC) % LRAN2_MAX;
-  d->y = d->x;
-}
-
-static 
-long lran2(struct lran2_st* d)
-{
-  int j = (d->y % 97);
-
-  d->y = d->v[j];
-  d->x = (IA*d->x + IC) % LRAN2_MAX;
-  d->v[j] = d->x;
-  return d->y;
-}
-
-#undef IA
-#undef IC
-
-#endif
-
-
diff --git a/src/benchmarks/larson/Makefile b/src/benchmarks/larson/Makefile
new file mode 100644
index 0000000..9ccce9f
--- /dev/null
+++ b/src/benchmarks/larson/Makefile
@@ -0,0 +1,24 @@
+OBJDIR ?= obj
+# OBJDIR may be overridden by the caller; the larson binary is placed directly inside it.
+CXX ?= g++
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread
+OPTFLAGS ?= -O3 -DNDEBUG
+
+CXXFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDXXFLAGS ?= -pthread -static-libgcc -static-libstdc++
+
+.PHONY: all clean
+
+all: $(OBJDIR)/larson
+# Order-only prerequisite: the directory must exist, but its timestamp never forces a relink.
+$(OBJDIR)/larson: larson.cc | $(OBJDIR)
+	$(CXX) $(LDXXFLAGS) $(CXXFLAGS) -o $@ $<
+# -p also creates missing parent directories, so a nested OBJDIR path works.
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/larson/larson.cc b/src/benchmarks/larson/larson.cc
new file mode 100644
index 0000000..be8038f
--- /dev/null
+++ b/src/benchmarks/larson/larson.cc
@@ -0,0 +1,744 @@
+#include <assert.h>
+#include <stdio.h>
+
+#if defined(_WIN32)
+#define __WIN32__
+#endif
+
+#ifdef __WIN32__
+#include  <windows.h>
+#include  <conio.h>
+#include  <process.h>
+
+#else
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+
+#ifndef __SVR4
+//extern "C" int pthread_setconcurrency (int) throw();
+#include <pthread.h>
+#endif
+
+
+typedef void * LPVOID;
+typedef long long LONGLONG;
+typedef long DWORD;
+typedef long LONG;
+typedef unsigned long ULONG;
+typedef union _LARGE_INTEGER {   // non-Windows stand-in for the Win32 type of the same name
+  struct {
+    DWORD LowPart;
+    LONG  HighPart;
+  } foo;
+  LONGLONG QuadPart;    // the whole value as one long long (a typedef to __int64 in Visual C++)
+} LARGE_INTEGER;
+typedef long long _int64;
+#ifndef TRUE
+enum { TRUE = 1, FALSE = 0 };
+#endif
+#include <assert.h>
+#define _ASSERTE(x) assert(x)
+#define _inline inline
+void Sleep (long x)   // non-Windows stand-in for Win32 Sleep(): x is divided by 1000 and handed to sleep()
+{
+  //  printf ("sleeping for %ld seconds.\n", x/1000);
+  sleep(x/1000);   // NOTE(review): integer division drops any sub-second part, so x < 1000 does not sleep at all
+}
+
+void QueryPerformanceCounter (long * x)
+{
+  // Win32 stand-in: stores wall-clock microseconds in *x. The obsolete timezone argument stays NULL.
+  struct timeval tv;
+  gettimeofday (&tv, NULL);
+  *x = tv.tv_sec * 1000000L + tv.tv_usec;
+}
+
+void QueryPerformanceFrequency(long * x)   // non-Windows stand-in: reports the tick rate of the emulated counter
+{
+  *x = 1000000L;   // the emulated QueryPerformanceCounter counts microseconds
+}
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  <stddef.h>
+#include  <string.h>
+#include  <ctype.h>
+#include  <time.h>
+#include  <assert.h>
+
+#define _REENTRANT 1
+#include <pthread.h>
+#ifdef __sun
+#include <thread.h>
+#endif
+typedef void * VoidFunction (void *);
+void _beginthread (VoidFunction x, int, void * z)
+{
+  // POSIX stand-in for the Win32 call: start x(z) on a new thread that nobody joins.
+  pthread_t pt;
+  pthread_attr_t pa;
+  pthread_attr_init (&pa);
+  pthread_attr_setscope (&pa, PTHREAD_SCOPE_SYSTEM); /* bound behavior */
+  // Workers are never joined, so create them detached; a joinable thread
+  // would keep its resources allocated after exit on every respawn.
+  pthread_attr_setdetachstate (&pa, PTHREAD_CREATE_DETACHED);
+  int v = pthread_create(&pt, &pa, x, z);
+  pthread_attr_destroy (&pa);   // the attribute object is not needed once the thread exists
+  if (v != 0) fprintf (stderr, "pthread_create failed: %d\n", v);
+}
+#endif
+
+
+#if 0
+static char buf[65536];
+
+#define malloc(v) &buf
+#define free(p) 
+#endif
+
+#undef CPP
+//#define CPP
+//#include "arch-specific.h"
+
+#if USE_ROCKALL
+//#include "FastHeap.hpp"
+//FAST_HEAP theFastHeap (1024 * 1024, true, true, true);
+
+typedef int SBIT32;
+
+#include "SmpHeap.hpp"
+SMP_HEAP theFastHeap (1024 * 1024, true, true, true);
+
+void * operator new( size_t cb )   // a replacement operator new must take size_t, not unsigned int
+{
+  void *pRet = theFastHeap.New (cb) ;   // route every C++ allocation through the Rockall heap
+  return pRet;
+}
+
+void operator delete(void *pUserData )   // counterpart of the replacement operator new above
+{
+  theFastHeap.Delete (pUserData) ;   // hand the block back to the same heap object
+}
+#endif
+
+#if 0
+extern "C" void * hdmalloc (size_t sz) ;
+extern "C" void hdfree (void * ptr) ;
+extern "C" void hdmalloc_stats (void) ;
+void * operator new( size_t cb )   // a replacement operator new must take size_t, not unsigned int
+{
+  void *pRet = hdmalloc(cb) ;   // route every C++ allocation through hdmalloc
+  return pRet;
+}
+
+void operator delete(void *pUserData )   // counterpart of the hdmalloc-backed operator new above
+{
+  hdfree(pUserData) ;   // release through the matching hdmalloc deallocator
+}
+#endif
+
+
+
+/* Test driver for memory allocators           */
+/* Author: Paul Larson, palarson@microsoft.com */
+#define MAX_THREADS     100
+#define MAX_BLOCKS  20000000
+
+int volatile  stopflag=FALSE ;       
+
+struct lran2_st {   // state of one lran2 pseudo-random generator instance
+  long x, y, v[97];   // x: linear-congruential state, y: last value returned, v: 97-entry shuffle table
+};
+
+int     TotalAllocs=0 ;
+
+typedef struct thr_data {
+  // Per-worker parameters and counters; runthreads() fills one entry per thread.
+  int    threadno ;    // 1-based worker number
+  int    NumBlocks ;   // free/allocate iterations one exercise_heap() run performs before it respawns
+  int    seed ;        // value used to seed rgen
+
+  int    min_size ;    // lower bound for requested block sizes
+  int    max_size ;    // upper bound; sizes are min_size + rand % (max_size - min_size)
+
+  char * *array ;      // this worker's slice of the global blkp slot table
+  int    *blksize ;    // matching slice of the global blksize table
+  int     asize ;      // number of slots in the slice
+
+  unsigned long    cAllocs ;   // allocations performed
+  unsigned long    cFrees ;    // frees performed
+  int    cThreads ;            // incremented each time exercise_heap() starts on this entry
+  unsigned long    cBytesAlloced ;   // NOTE(review): not referenced by the code visible here
+
+  volatile int finished ;   // set TRUE by the worker when a run ends; polled by runthreads()
+  struct lran2_st rgen ;    // private random generator, so workers do not share RNG state
+
+} thread_data;
+
+void runthreads(long sleep_cnt, int min_threads, int max_threads, 
+		int chperthread, int num_rounds) ;
+void runloops(long sleep_cnt, int num_chunks ) ;
+static void warmup(char **blkp, int num_chunks );
+static void * exercise_heap( void *pinput) ;
+static void lran2_init(struct lran2_st* d, long seed) ;
+static long lran2(struct lran2_st* d) ;
+ULONG CountReservedSpace() ;
+ 
+char **          blkp = new char *[MAX_BLOCKS] ;   // global slot table; each slot holds one live block
+int *           blksize = new int[MAX_BLOCKS] ;    // requested size recorded alongside each slot
+long            seqlock=0 ;                        // NOTE(review): not referenced by the code visible here
+struct lran2_st rgen ;                             // generator used by the main thread (seeded in main)
+int             min_size=10, max_size=500 ;        // default block-size bounds, overridden by the inputs
+int             num_threads ;                      // thread count of the round currently running
+ULONG           init_space ;                       // only referenced from commented-out CountReservedSpace() calls
+
+extern  int   cLockSleeps ;
+extern  int   cAllocedChunks ;
+extern  int   cAllocedSpace ;
+extern  int   cUsedSpace ;
+extern  int   cFreeChunks ;
+extern  int   cFreeSpace ;
+
+int cChecked=0 ;
+
+#if defined(_WIN32)
+extern "C" {
+  extern HANDLE crtheap;
+};
+#endif
+
+int main (int argc, char *argv[])
+{
+#if defined(USE_LFH) && defined(_WIN32)
+  // Activate 'Low Fragmentation Heap'.
+  ULONG info = 2;
+  HeapSetInformation (GetProcessHeap(),
+		      HeapCompatibilityInformation,
+		      &info,
+		      sizeof(info));
+#endif
+#if 0 // defined(__SVR4)
+ {
+   psinfo_t ps;
+   int pid = getpid();
+   char fname[255];
+   sprintf (fname, "/proc/%d/psinfo", pid);
+   // sprintf (fname, "/proc/self/ps");
+   FILE * f = fopen (fname, "rb");
+   printf ("opening %s\n", fname);
+   if (f) {
+     fread (&ps, sizeof(ps), 1, f);
+     printf ("resident set size = %dK\n", ps.pr_rssize);
+     fclose (f);
+   }
+ }
+#endif
+  // Parameters come from seven command-line arguments or, failing that, from prompts on stdin.
+#if defined(_MT) || defined(_REENTRANT)
+  int          min_threads, max_threads ;
+  int          num_rounds ;
+  int          chperthread ;
+#endif
+  unsigned     seed=12345 ;
+  int          num_chunks=10000;
+  long sleep_cnt;
+  // Non-interactive form: <sleep> <min_size> <max_size> <chunks/thread> <rounds> <seed> <threads>
+  if (argc > 7) {
+    sleep_cnt = atoi(argv[1]);
+    min_size = atoi(argv[2]);
+    max_size = atoi(argv[3]);
+    chperthread = atoi(argv[4]);
+    num_rounds = atoi(argv[5]);
+    seed = atoi(argv[6]);
+    min_threads = max_threads = atoi(argv[7]);
+    num_chunks = max_threads*chperthread ;  // so the MAX_BLOCKS check below also guards this path
+    printf ("sleep = %ld, min = %d, max = %d, per thread = %d, num rounds = %d, seed = %u, max_threads = %d, min_threads = %d\n",
+	    sleep_cnt, min_size, max_size, chperthread, num_rounds, seed, max_threads, min_threads);
+    goto DoneWithInput;
+  }
+
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  printf( "\nMulti-threaded test driver \n") ;
+#else
+  printf( "\nSingle-threaded test driver \n") ;
+#endif
+#ifdef CPP
+  printf("C++ version (new and delete)\n") ;
+#else
+  printf("C version (malloc and free)\n") ;
+#endif
+  printf("runtime (sec): ") ;
+  scanf ("%ld", &sleep_cnt);
+
+  printf("chunk size (min,max): ") ;
+  scanf("%d %d", &min_size, &max_size ) ;
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  printf("threads (min, max):   ") ; 
+  scanf("%d %d", &min_threads, &max_threads) ;
+  printf("chunks/thread:  ") ; scanf("%d", &chperthread ) ;
+  printf("no of rounds:   ") ; scanf("%d", &num_rounds ) ;
+  num_chunks = max_threads*chperthread ;
+#else 
+  printf("no of chunks:  ") ; scanf("%d", &num_chunks ) ;
+#endif
+  printf("random seed:    ") ; scanf("%u", &seed) ;  // seed is unsigned, so read it with %u
+
+ DoneWithInput:
+  // blkp and blksize hold MAX_BLOCKS slots; refuse anything that would index past them.
+  if( num_chunks > MAX_BLOCKS ){
+    printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ;
+    return(1) ;
+  }
+
+#ifndef __WIN32__
+#ifdef __SVR4
+  pthread_setconcurrency (max_threads);
+#endif
+#endif
+
+  lran2_init(&rgen, seed) ;
+  // init_space = CountReservedSpace() ;
+
+#if defined(_MT) || defined(_REENTRANT)
+  //#ifdef _MT
+  runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ;
+#else
+  runloops(sleep_cnt, num_chunks ) ;
+#endif
+
+#ifdef _DEBUG
+  _cputs("Hit any key to exit...") ;	(void)_getch() ;
+#endif
+
+  return 0;
+
+} /* main */
+
+void runloops(long sleep_cnt, int num_chunks )
+{
+  int     cblks ;
+  int     victim ;
+  int     blk_size ;
+#ifdef __WIN32__
+  _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
+#else
+  long ticks_per_sec ;
+  long start_cnt, end_cnt ;
+#endif
+  _int64        ticks ;
+  double        duration ;
+  double        reqd_space ;
+  ULONG         used_space = 0 ;  // no measurement available (see the commented-out call below); was read uninitialised
+  int           sum_allocs=0 ;
+  // Single-threaded driver: fill num_chunks slots, then replace random slots until sleep_cnt seconds passed.
+  QueryPerformanceFrequency( &ticks_per_sec ) ;
+  QueryPerformanceCounter( &start_cnt) ;
+
+  for( cblks=0; cblks<num_chunks; cblks++){
+    if (max_size == min_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+    }
+#ifdef CPP
+    blkp[cblks] = new char[blk_size] ;
+#else
+    blkp[cblks] = (char *) malloc(blk_size) ;
+#endif
+    blksize[cblks] = blk_size ;
+    assert(blkp[cblks] != NULL) ;
+  }
+  // One pass = num_chunks free/allocate pairs on random victims; the clock is checked once per pass.
+  while(TRUE){
+    for( cblks=0; cblks<num_chunks; cblks++){
+      victim = lran2(&rgen)%num_chunks ;
+#ifdef CPP
+      delete [] blkp[victim] ;  // blocks come from new char[], so the array form is required
+#else
+      free(blkp[victim]) ;
+#endif
+
+      if (max_size == min_size) {
+        blk_size = min_size;
+      } else {
+        blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+      }
+#ifdef CPP
+      blkp[victim] = new char[blk_size] ;
+#else
+      blkp[victim] = (char *) malloc(blk_size) ;
+#endif
+      blksize[victim] = blk_size ;
+      assert(blkp[victim] != NULL) ;
+    }
+    sum_allocs += num_chunks ;
+
+    QueryPerformanceCounter( &end_cnt) ;
+#ifdef __WIN32__
+    ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
+    duration = (double)ticks/ticks_per_sec.QuadPart ;
+#else
+    ticks = end_cnt - start_cnt ;
+    duration = (double)ticks/ticks_per_sec ;
+#endif
+
+    if( duration >= sleep_cnt) break ;
+  }
+  reqd_space = (0.5*(min_size+max_size)*num_chunks) ;
+  // used_space = CountReservedSpace() - init_space;
+  // Output columns: elapsed seconds, allocations per second, used MiB, used/required ratio.
+  printf("%6.3f", duration  ) ;
+  printf("%8.0f", sum_allocs/duration ) ;
+  printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
+  printf("\n") ;
+
+}
+
+
+#if defined(_MT) || defined(_REENTRANT)
+//#ifdef _MT
+void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds)
+{
+  thread_data *de_area = new thread_data[max_threads] ;
+  thread_data *pdea;
+  int           nperthread ;
+  int           sum_threads ;
+  unsigned long  sum_allocs ;
+  unsigned long  sum_frees ;
+  double        duration ;
+#ifdef __WIN32__
+	_LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt;
+#else
+	long ticks_per_sec ;
+  long start_cnt, end_cnt ;
+#endif
+	_int64        ticks ;
+  double        rate_1=0, rate_n ;
+  double        reqd_space ;
+  ULONG         used_space ;
+  int           prevthreads ;
+  int           i ;
+  // Multi-threaded driver: one timed round per thread count from min_threads up to max_threads.
+  QueryPerformanceFrequency( &ticks_per_sec ) ;
+
+  pdea = &de_area[0] ;   // NOTE(review): pdea is never read afterwards in this function
+  memset(&de_area[0], 0, sizeof(thread_data)) ;   // NOTE(review): clears only the first element of de_area
+
+  prevthreads = 0 ;
+  for(num_threads=min_threads; num_threads <= max_threads; num_threads++ )
+    {
+      // Pre-populate the slots of the workers added since the previous round.
+      warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread );
+
+      nperthread = chperthread ;
+      stopflag   = FALSE ;
+		
+      for(i=0; i< num_threads; i++){
+	de_area[i].threadno    = i+1 ;
+	de_area[i].NumBlocks   = num_rounds*nperthread;
+	de_area[i].array       = &blkp[i*nperthread] ;
+	de_area[i].blksize     = &blksize[i*nperthread] ;
+	de_area[i].asize       = nperthread ;
+	de_area[i].min_size    = min_size ;
+	de_area[i].max_size    = max_size ;
+	de_area[i].seed        = lran2(&rgen) ; ;
+	de_area[i].finished    = 0 ;
+	de_area[i].cAllocs     = 0 ;
+	de_area[i].cFrees      = 0 ;
+	de_area[i].cThreads    = 0 ;
+	de_area[i].finished    = FALSE ;
+	lran2_init(&de_area[i].rgen, de_area[i].seed) ;
+
+#ifdef __WIN32__
+	_beginthread((void (__cdecl*)(void *)) exercise_heap, 0, &de_area[i]) ;  
+#else
+	_beginthread(exercise_heap, 0, &de_area[i]) ;  
+#endif
+
+	}
+      // The clock starts only after every worker of this round has been launched.
+      QueryPerformanceCounter( &start_cnt) ;
+
+      // printf ("Sleeping for %ld seconds.\n", sleep_cnt);
+      Sleep(sleep_cnt * 1000L) ;
+
+      stopflag = TRUE ;   // ask the workers to stop; they poll this flag once per iteration
+
+      for(i=0; i<num_threads; i++){
+	while( !de_area[i].finished ){
+#ifdef __WIN32__
+		Sleep(1);
+#elif defined(__SVR4)
+		thr_yield();
+#else
+		sched_yield();
+#endif
+	}
+      }
+
+      // Every worker has reported finished; stop the clock and total the per-thread counters.
+      QueryPerformanceCounter( &end_cnt) ;
+
+      sum_frees = sum_allocs =0  ;
+      sum_threads = 0 ;
+      for(i=0;i< num_threads; i++){
+	sum_allocs    += de_area[i].cAllocs ;
+	sum_frees     += de_area[i].cFrees ;
+	sum_threads   += de_area[i].cThreads ;
+	de_area[i].cAllocs = de_area[i].cFrees = 0;
+      }
+
+ 
+#ifdef __WIN32__
+      ticks = end_cnt.QuadPart - start_cnt.QuadPart ;
+     duration = (double)ticks/ticks_per_sec.QuadPart ;
+#else
+      ticks = end_cnt - start_cnt ;
+     duration = (double)ticks/ticks_per_sec ;
+#endif
+
+      for( i=0; i<num_threads; i++){
+	if( !de_area[i].finished )
+	  printf("Thread at %d not finished\n", i) ;
+      }
+
+      // rate_n, rate_1 and reqd_space only feed the disabled (#if 0) report further down.
+      rate_n = sum_allocs/duration ;
+      if( rate_1 == 0){
+	rate_1 = rate_n ;
+      }
+		
+      reqd_space = (0.5*(min_size+max_size)*num_threads*chperthread) ;
+      // used_space = CountReservedSpace() - init_space;
+      used_space = 0;
+      
+      printf ("Throughput = %8.0f operations per second.\n", sum_allocs / duration);
+
+#if 0
+      printf("%2d ", num_threads ) ;
+      printf("%6.3f", duration  ) ;
+      printf("%6.3f", rate_n/rate_1 ) ;
+      printf("%8.0f", sum_allocs/duration ) ;
+      printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ;
+      printf("\n") ;
+#endif
+
+      Sleep(5000L) ; // wait 5 sec for old threads to die
+
+      prevthreads = num_threads ;
+
+      printf ("Done sleeping...\n");
+
+    }
+  delete [] de_area;
+}
+
+
+/* Thread body: churn this thread's slice of the block array.  pinput is the
+ * thread_data for this thread.  Each round frees a randomly chosen slot and
+ * refills it with a block of min_size plus a random offset below
+ * (max_size - min_size).  The loop ends after NumBlocks rounds or once
+ * stopflag is set; if stopflag is still clear afterwards, a successor thread
+ * is started on the same thread_data. */
+static void * exercise_heap( void *pinput)
+{
+  thread_data  *pdea;
+  int           cblks=0 ;
+  int           victim ;
+  long          blk_size ;
+  int           range ;
+
+  if( stopflag ) return 0;
+
+  pdea = (thread_data *)pinput ;
+  pdea->finished = FALSE ;
+  pdea->cThreads++ ;
+  range = pdea->max_size - pdea->min_size ;
+
+  /* replace one randomly chosen block per round */
+  for( cblks=0; cblks<pdea->NumBlocks; cblks++){
+    victim = lran2(&pdea->rgen)%pdea->asize ;
+#ifdef CPP
+    /* blocks come from new char[], so they must be released with delete [] */
+    delete [] pdea->array[victim] ;
+#else
+    free(pdea->array[victim]) ;
+#endif
+    pdea->cFrees++ ;
+
+    if (range == 0) {
+      blk_size = pdea->min_size;
+    } else {
+      blk_size = pdea->min_size+lran2(&pdea->rgen)%range ;
+    }
+#ifdef CPP
+    pdea->array[victim] = new char[blk_size] ;
+#else
+    pdea->array[victim] = (char *) malloc(blk_size) ;
+#endif
+    pdea->blksize[victim] = blk_size ;
+    assert(pdea->array[victim] != NULL) ;
+    pdea->cAllocs++ ;
+
+    /* Write something!  Byte 1 is touched only if the block has one. */
+    volatile char * chptr = ((char *) pdea->array[victim]);
+    *chptr++ = 'a';
+    volatile char ch = *((char *) pdea->array[victim]);
+    if( blk_size > 1 ) *chptr = 'b';
+    if( stopflag ) break ;
+  }
+
+  pdea->finished = TRUE ;
+
+  if( !stopflag ){
+#ifdef __WIN32__
+    _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, pdea) ;
+#else
+    _beginthread(exercise_heap, 0, pdea) ;
+#endif
+  } else {
+    printf ("thread stopping.\n");
+  }
+#ifndef _WIN32
+  pthread_exit (NULL);
+#endif
+  return 0;
+}
+
+/* Fill blkp[0..num_chunks) with randomly sized blocks, shuffle the pointers,
+ * then free and re-allocate randomly chosen slots 4*num_chunks times. */
+static void warmup(char **blkp, int num_chunks )
+{
+  int     cblks ;
+  int     victim ;
+  int     blk_size ;
+  LPVOID  tmp ;
+  /* initial population: one block per slot, size recorded in blksize[] */
+  for( cblks=0; cblks<num_chunks; cblks++){
+    if (min_size == max_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size-min_size) ;
+    }
+#ifdef CPP
+    blkp[cblks] = new char[blk_size] ;
+#else
+    blkp[cblks] = (char *) malloc(blk_size) ;
+#endif
+    blksize[cblks] = blk_size ;
+    assert(blkp[cblks] != NULL) ;
+  }
+
+  /* generate a random permutation of the chunks */
+  for( cblks=num_chunks; cblks > 0 ; cblks--){
+    victim = lran2(&rgen)%cblks ;
+    tmp = blkp[victim] ;
+    blkp[victim]  = blkp[cblks-1] ;
+    blkp[cblks-1] = (char *) tmp ;
+  }
+  /* churn: blocks come from new char[] under CPP, hence delete [] below */
+  for( cblks=0; cblks<4*num_chunks; cblks++){
+    victim = lran2(&rgen)%num_chunks ;
+#ifdef CPP
+    delete [] blkp[victim] ;
+#else
+    free(blkp[victim]) ;
+#endif
+    if (max_size == min_size) {
+      blk_size = min_size;
+    } else {
+      blk_size = min_size+lran2(&rgen)%(max_size - min_size) ;
+    }
+#ifdef CPP
+    blkp[victim] = new char[blk_size] ;
+#else
+    blkp[victim] = (char *) malloc(blk_size) ;
+#endif
+    blksize[victim] = blk_size ;
+    assert(blkp[victim] != NULL) ;
+  }
+}
+#endif // _MT
+
+#ifdef __WIN32__
+/* Walk the address space below 0x80000000 with VirtualQuery and return the
+ * summed RegionSize of all MEM_COMMIT regions (free/reserved are skipped). */
+ULONG CountReservedSpace()
+{
+  MEMORY_BASIC_INFORMATION info;
+  char                     *addr=NULL ;
+  ULONG                     size=0 ;
+  while( true){
+    /* a zero return means info was not filled in; stop instead of looping */
+    if( VirtualQuery(addr, &info, sizeof(info)) == 0 ) break ;
+    switch( info.State){
+    case MEM_FREE:
+    case MEM_RESERVE:
+      break ;
+    case MEM_COMMIT:
+      size += info.RegionSize ;
+      break ;
+    }
+    addr += info.RegionSize ;
+    if( addr >= (char *)0x80000000UL ) break ;
+  }
+  return size ;
+}
+#endif
+
+// =======================================================
+
+/* lran2.h
+ * by Wolfram Gloger 1996.
+ *
+ * A small, portable pseudo-random number generator.
+ */
+
+#ifndef _LRAN2_H
+#define _LRAN2_H
+
+#define LRAN2_MAX 714025l /* constants for portable */
+#define IA	  1366l	  /* random number generator */
+#define IC	  150889l /* (see e.g. `Numerical Recipes') */
+
+//struct lran2_st {
+//    long x, y, v[97];
+//};
+
+/* Seed the generator state *d from seed: fill the 97-entry table v[] with
+ * successive outputs of the recurrence, then set x and y to the next one. */
+static void lran2_init(struct lran2_st* d, long seed)
+{
+  long state = (IC - seed) % LRAN2_MAX;
+  int slot = 0;
+
+  if(state < 0) state = -state;
+  while(slot < 97) {
+    state = (IA*state + IC) % LRAN2_MAX;
+    d->v[slot++] = state;
+  }
+  d->x = (IA*state + IC) % LRAN2_MAX;
+  d->y = d->x;
+}
+
+/* Return the next pseudo-random value: take the table entry selected by the
+ * previous output, then refill that entry from the recurrence on x. */
+static long lran2(struct lran2_st* d)
+{
+  const int slot = (d->y % 97);
+  d->y = d->v[slot];
+  d->x = (IA*d->x + IC) % LRAN2_MAX;
+  d->v[slot] = d->x;
+  return d->y;
+}
+
+#undef IA
+#undef IC
+
+#endif
+
+
diff --git a/src/benchmarks/loop/Makefile b/src/benchmarks/loop/Makefile
new file mode 100644
index 0000000..89914b2
--- /dev/null
+++ b/src/benchmarks/loop/Makefile
@@ -0,0 +1,29 @@
+# Builds the loop benchmark into $(OBJDIR).
+# Every variable is assigned with ?= so the caller can override it.
+OBJDIR ?= obj
+
+CC ?= gcc
+
+WARNFLAGS ?= -Wall -Wextra
+COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread
+OPTFLAGS ?= -O3 -DNDEBUG
+
+CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS)
+
+LDFLAGS ?= -pthread -static-libgcc
+
+# .PHONY is a special target, not a variable: it needs ':' to take effect.
+.PHONY: all clean
+
+all: $(OBJDIR)/loop
+
+# $(OBJDIR) is order-only so its timestamp never forces a rebuild.
+$(OBJDIR)/loop: loop.c | $(OBJDIR)
+	$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $<
+
+# -p: tolerate an existing directory and create missing parents.
+$(OBJDIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(OBJDIR)
diff --git a/src/benchmarks/loop/loop.c b/src/benchmarks/loop/loop.c
new file mode 100644
index 0000000..bc15808
--- /dev/null
+++ b/src/benchmarks/loop/loop.c
@@ -0,0 +1,118 @@
+#include <assert.h>
+#include <malloc.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* Per-thread linear congruential generator; returns a value below 2^31. */
+static size_t _rand(void) {
+	static __thread size_t seed = 123456789;
+	const size_t a = 1103515245;
+	const size_t c = 12345;
+	/* Shift in size_t: "1 << 31" overflows a 32-bit int before widening. */
+	const size_t m = (size_t)1 << 31;
+	seed = (a * seed + c) % m;
+	return seed;
+}
+
+/* Parameters handed to every worker thread; the workers only read them. */
+typedef struct ThreadArgs {
+	double benchmark;
+	int allocations; /* malloc/free pairs performed by each thread */
+	int max_size;    /* largest request size in bytes; must be >= 1 */
+} ThreadArgs;
+
+/* Allocate size bytes; the write to the block is currently disabled. */
+static void* malloc_then_write(size_t size) {
+	void* ptr = malloc(size);
+	// Write to ptr
+	/* *((char*)ptr) = '!'; */
+	return ptr;
+}
+
+/* Free ptr; the read before the free is currently disabled. */
+static void read_then_free(void* ptr) {
+	// Read before free
+	/* char s __attribute__((unused)) = *((char*)ptr); */
+	free(ptr);
+}
+
+/* Thread entry point: args->allocations rounds of allocating a block of
+ * 1..args->max_size bytes and freeing it again immediately. */
+static void* test_thread_func(void* arg) {
+	ThreadArgs* args = (ThreadArgs*)arg;
+
+	for(int i = 0; i < args->allocations; i++) {
+		void* ptr = malloc_then_write((_rand() % args->max_size) + 1);
+		read_then_free(ptr);
+	}
+	return NULL;
+}
+
+/* Usage: loop <num threads> <num allocations> <max size> [<file>|stdout]
+ * Runs the allocation loop in every thread, then, if a fifth argument is
+ * given, writes malloc_info() output to that file or to stdout. */
+int main(int argc, char* argv[]) {
+	pthread_t* threads;
+	int num_threads;
+	struct ThreadArgs thread_args;
+	int err;
+
+	if (argc < 4) {
+		fprintf(stderr, "Usage: %s <num threads> <num allocations> <max size>\n", argv[0]);
+		return 1;
+	}
+
+	num_threads = atoi(argv[1]);
+	thread_args.allocations = atoi(argv[2]);
+	thread_args.max_size = atoi(argv[3]);
+
+	/* max_size is a modulus in the worker loop, so zero would divide by zero. */
+	if (num_threads < 1 || thread_args.max_size < 1) {
+		fprintf(stderr, "%s: <num threads> and <max size> must be at least 1\n", argv[0]);
+		return 1;
+	}
+
+	threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t));
+	if (threads == NULL) {
+		perror("malloc");
+		return 1;
+	}
+
+	for (int i = 0; i < num_threads; i++) {
+		/* pthread functions return the error code instead of setting errno. */
+		err = pthread_create(&threads[i], NULL, test_thread_func, &thread_args);
+		if (err != 0) {
+			fprintf(stderr, "pthread_create: %s\n", strerror(err));
+			return 1;
+		}
+	}
+
+	for(int i = 0; i < num_threads; i++) {
+		err = pthread_join(threads[i], NULL);
+		if (err != 0) {
+			fprintf(stderr, "pthread_join: %s\n", strerror(err));
+			return 1;
+		}
+	}
+	free(threads);
+
+	if (argc == 5)
+	{
+		FILE* f = stdout;
+		if (strcmp(argv[4],"stdout") != 0) {
+			f = fopen(argv[4], "w");
+			if (f == NULL) {
+				perror(argv[4]);
+				return 1;
+			}
+		}
+		malloc_info(0, f);
+		if (f != stdout)
+			fclose(f);
+	}
+
+	return 0;
+}
diff --git a/src/benchmarks/timer.h b/src/benchmarks/timer.h
deleted file mode 100644
index d4d42c7..0000000
--- a/src/benchmarks/timer.h
+++ /dev/null
@@ -1,372 +0,0 @@
-/* -*- C++ -*- */
-
-/*
-
-  Heap Layers: An Extensible Memory Allocation Infrastructure
-  
-  Copyright (C) 2000-2003 by Emery Berger
-  http://www.cs.umass.edu/~emery
-  emery@cs.umass.edu
-  
-  This program is free software; you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation; either version 2 of the License, or
-  (at your option) any later version.
-  
-  This program is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program; if not, write to the Free Software
-  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-*/
-
-#include <cassert>
-#include <stdio.h>
-
-
-#ifndef _TIMER_H_
-#define _TIMER_H_
-
-/**
- * @class Timer
- * @brief A portable class for high-resolution timing.
- *
- * This class simplifies timing measurements across a number of platforms.
- * 
- * @code
- *  Timer t;
- *  t.start();
- *  // do some work
- *  t.stop();
- *  cout << "That took " << (double) t << " seconds." << endl;
- * @endcode
- *
- */
-
-#ifdef __APPLE__
-#include <sys/time.h>
-#endif
-
-#if defined(__linux__) && defined(__GNUG__) && defined(__i386__)
-
-#include <stdio.h>
-#include <limits.h>
-#include <time.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <string.h>
-
-static void getTime (unsigned long& tlo, unsigned long& thi) {
-  asm volatile ("rdtsc"
-		: "=a"(tlo),
-		"=d" (thi));
-}
-
-
-static double getFrequency (void) {
-  static double freq = 0.0;
-  static bool initialized = false;
-  unsigned long LTime0, LTime1, HTime0, HTime1;
-  if (!initialized) { 
-
-    freq = 2600000.0;
-
-#if 0
-    // Compute MHz directly.
-    // Wait for approximately one second.
-    
-    getTime (LTime0, HTime0);
-    //    printf ("waiting...\n");
-    struct timespec rqtp, rmtp;
-    rqtp.tv_sec = 1;
-    rqtp.tv_nsec = 0;
-    nanosleep (&rqtp, &rmtp);
-    // printf ("done.\n");
-    getTime (LTime1, HTime1);
-
-    freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0);
-    if (LTime1 < LTime0) {
-      freq -= (double)UINT_MAX;
-    }
-#endif
-    initialized = true;
-
-  } else {
-    // printf ("wha?\n");
-  }
-  return freq;
-}
-
-
-namespace HL {
-
-class Timer {
-public:
-  Timer (void)
-    : timeElapsed (0.0)
-  {
-    _frequency = getFrequency();
-    //    printf ("wooo!\n");
-    //  printf ("freq = %lf\n", frequency);
-  }
-  void start (void) {
-    getTime (currentLo, currentHi);
-  }
-  void stop (void) {
-    unsigned long lo, hi;
-    getTime (lo, hi);
-    double now = (double) hi * 4294967296.0 + lo;
-    double prev = (double) currentHi * 4294967296.0 + currentLo;
-    timeElapsed = (now - prev) / _frequency;
-  }
-
-  operator double (void) {
-    return timeElapsed;
-  }
-
-private:
-  double timeElapsed;
-  unsigned long currentLo, currentHi;
-  double _frequency;
-};
-
-};
-
-#else
-
-
-#ifdef __SVR4 // Solaris
-#include <sys/time.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/procfs.h>
-#include <stdio.h>
-#endif // __SVR4
-
-#include <time.h>
-
-#if defined(unix) || defined(__linux)
-#include <sys/time.h>
-#include <unistd.h>
-#endif
-
-
-#ifdef __sgi
-#include <sys/types.h>
-#include <sys/times.h>
-#include <limits.h>
-#endif
-
-
-#if defined(_WIN32)
-#include <windows.h>
-#endif
-
-
-#if defined(__BEOS__)
-#include <OS.h>
-#endif
-
-
-namespace HL {
-
-class Timer {
-
-public:
-
-  /// Initializes the timer.
-  Timer (void)
-#if !defined(_WIN32)
-    : _starttime (0),
-      _elapsedtime (0)
-#endif
-  {
-  }
-
-  /// Start the timer.
-  void start (void) { _starttime = _time(); }
-
-  /// Stop the timer.
-  void stop (void) { _elapsedtime += _time() - _starttime; }
-
-  /// Reset the timer.
-  void reset (void) { _starttime = _elapsedtime; }
-
-#if 0
-  // Set the timer.
-  void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);}
-#endif
-
-  /// Return the number of seconds elapsed.
-  operator double (void) { return _timetosec (_elapsedtime); }
-
-  static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); }
-
-
-private:
-
-  // The _timer variable will be different depending on the OS.
-  // We try to use the best timer available.
-
-#ifdef __sgi
-#define TIMER_FOUND
-
-  long _starttime, _elapsedtime;
-
-  long _time (void) {
-    struct tms t;
-    long ticks = times (&t);
-    return ticks;
-  }
-
-  static double _timetosec (long t) {
-    return ((double) (t) / CLK_TCK);
-  }
-
-  static long _sectotime (double sec) {
-    return (long) sec * CLK_TCK;
-  }
-#endif
-
-#ifdef __SVR4 // Solaris
-#define TIMER_FOUND
-  typedef hrtime_t TimeType;
-  TimeType	_starttime, _elapsedtime;
-
-  static TimeType _time (void) {
-    return gethrtime();
-  }
-
-  static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); }
-
-  static double _timetosec (TimeType& t) {
-    return ((double) (t) / 1.0e9);
-  }
-#endif // __SVR4
-
-#if defined(MAC) || defined(macintosh)
-#define TIMER_FOUND
-  double		_starttime, _elapsedtime;
-
-  double _time (void) {
-    return get_Mac_microseconds();
-  }
-
-  double _timetosec (hrtime_t& t) {
-    return t;
-  }
-#endif // MAC
-
-#ifdef _WIN32
-#define TIMER_FOUND
-
-#ifndef __GNUC__
-  class TimeType {
-  public:
-    TimeType (void)
-    {
-      largeInt.QuadPart = 0;
-    }
-    operator double& (void) { return (double&) largeInt.QuadPart; }
-    operator LARGE_INTEGER& (void) { return largeInt; }
-    double timeToSec (void) {
-      return (double) largeInt.QuadPart / getFreq();
-    }
-  private:
-    double getFreq (void) {
-      QueryPerformanceFrequency (&freq);
-      return (double) freq.QuadPart;
-    }
-
-    LARGE_INTEGER largeInt;
-    LARGE_INTEGER freq;
-  };
-
-  TimeType _starttime, _elapsedtime;
-
-  static TimeType _time (void) {
-    TimeType t;
-    int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t));
-    assert (r);
-    return t;
-  }
-
-  static double _timetosec (TimeType& t) {
-    return t.timeToSec();
-  }
-#else
-  typedef DWORD TimeType;
-  DWORD _starttime, _elapsedtime;
-  static DWORD _time (void) {
-    return GetTickCount();
-  }
-
-  static double _timetosec (DWORD& t) {
-    return (double) t / 100000.0;
-  }
-  static unsigned long _sectotime (double sec) {
-    return (unsigned long)(sec);
-  }
-#endif
-#endif // _WIN32
-
-
-#ifdef __BEOS__
-#define TIMER_FOUND
-  bigtime_t _starttime, _elapsedtime;
-  bigtime_t _time(void) {
-    return system_time();
-  }
-  double _timetosec (bigtime_t& t) {
-    return (double) t / 1000000.0;
-  }
-  
-  bigtime_t _sectotime (double sec) {
-    return (bigtime_t)(sec * 1000000.0);
-  }
-#endif // __BEOS__
-
-#ifndef TIMER_FOUND
-
-  typedef long TimeType;
-  TimeType _starttime, _elapsedtime;
-
-  static TimeType _time (void) {
-    struct timeval t;
-    gettimeofday (&t, NULL);
-    return t.tv_sec * 1000000 + t.tv_usec;
-  }
-
-  static double _timetosec (TimeType t) {
-    return ((double) (t) / 1000000.0);
-  }
-
-  static TimeType _sectotime (double sec) {
-    return (TimeType) (sec * 1000000.0);
-  }
-
-#endif // TIMER_FOUND
-
-#undef TIMER_FOUND
-
-};
-
-
-#ifdef __SVR4 // Solaris
-class VirtualTimer : public Timer {
-public:
-  hrtime_t _time (void) {
-    return gethrvtime();
-  }
-};  
-#endif
-
-}
-
-#endif
-
-#endif
diff --git a/src/dj_trace.py b/src/dj_trace.py
index 21b9ddd..f4265ea 100644
--- a/src/dj_trace.py
+++ b/src/dj_trace.py
@@ -34,7 +34,7 @@ class Benchmark_DJ_Trace( Benchmark ):
                              also used by delorie to measure improvements in the
                              glibc allocator.""",
 
-        self.cmd = "build/trace_run{binary_suffix} dj_workloads/{workload}.wl"
+        self.cmd = "trace_run{binary_suffix} dj_workloads/{workload}.wl"
         self.measure_cmd = ""
 
         self.args = {
@@ -80,7 +80,7 @@ class Benchmark_DJ_Trace( Benchmark ):
                             "realloc":117, "free":10099261, "threads": 19},
                         }
 
-        self.requirements = ["build/trace_run"]
+        self.requirements = ["trace_run"]
         super().__init__()
 
     def prepare(self, verbose=False):
diff --git a/src/falsesharing.py b/src/falsesharing.py
index 57acf06..6c4ddc0 100644
--- a/src/falsesharing.py
+++ b/src/falsesharing.py
@@ -16,14 +16,14 @@ class Benchmark_Falsesharing( Benchmark ):
                             on the same cache line the writes will be expensive because
                             of cache thrashing."""
 
-        self.cmd = "build/cache-{bench}{binary_suffix} {threads} 100 8 1000000"
+        self.cmd = "cache-{bench}{binary_suffix} {threads} 100 8 1000000"
 
         self.args = {
                         "bench" : ["thrash", "scratch"],
                         "threads" : range(1, multiprocessing.cpu_count() * 2 + 1)
                     }
 
-        self.requirements = ["build/cache-thrash", "build/cache-scratch"]
+        self.requirements = ["cache-thrash", "cache-scratch"]
         super().__init__()
 
     def process_output(self, result, stdout, stderr, target, perm, verbose):
diff --git a/src/larson.py b/src/larson.py
index 0a4a237..a035de8 100644
--- a/src/larson.py
+++ b/src/larson.py
@@ -13,14 +13,14 @@ class Benchmark_Larson( Benchmark ):
                              and deallocates objects, and then transfers some objects
                              (randomly selected) to other threads to be freed."""
 
-        self.cmd = "build/larson{binary_suffix} 1 8 {maxsize} 1000 50000 1 {threads}"
+        self.cmd = "larson{binary_suffix} 1 8 {maxsize} 1000 50000 1 {threads}"
 
         self.args = {
                         "maxsize" : [8, 32, 64, 128, 256, 512, 1024],
                         "threads" : range(1, multiprocessing.cpu_count() * 2 + 1)
                     }
 
-        self.requirements = ["build/larson"]
+        self.requirements = ["larson"]
         super().__init__()
 
     def process_output(self, result, stdout, stderr, target, perm, verbose):
diff --git a/src/loop.py b/src/loop.py
index d58b4e2..81ddf19 100644
--- a/src/loop.py
+++ b/src/loop.py
@@ -9,14 +9,14 @@ class Benchmark_Loop( Benchmark ):
                             How allocations are freed can be changed with the benchmark
                             version""",
 
-        self.cmd = "build/bench_loop{binary_suffix} {nthreads} 1000000 {maxsize}"
+        self.cmd = "loop{binary_suffix} {nthreads} 1000000 {maxsize}"
 
         self.args = {
                         "maxsize" : [2 ** x for x in range(6, 16)],
                         "nthreads" : range(1, multiprocessing.cpu_count() * 2 + 1)
                     }
 
-        self.requirements = ["build/bench_loop"]
+        self.requirements = ["loop"]
         super().__init__()
 
     def summary(self):
diff --git a/src/trace_run.c b/src/trace_run.c
deleted file mode 100644
index 604d01e..0000000
--- a/src/trace_run.c
+++ /dev/null
@@ -1,750 +0,0 @@
-#define _LARGEFILE64_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <pthread.h>
-#include <sys/time.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/resource.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-// #include "malloc.h"
-#include <malloc.h>
-
-// #include "mtrace.h"
-/* Codes for the simulator/workload programs. Copied from mtrace.h. */
-#define C_NOP 0
-#define C_DONE 1
-#define C_MALLOC 2
-#define C_CALLOC 3
-#define C_REALLOC 4
-#define C_FREE 5
-#define C_SYNC_W 6
-#define C_SYNC_R 7
-#define C_ALLOC_PTRS 8
-#define C_ALLOC_SYNCS 9
-#define C_NTHREADS 10
-#define C_START_THREAD 11
-#define C_MEMALIGN 12
-#define C_VALLOC 13
-#define C_PVALLOC 14
-#define C_POSIX_MEMALIGN 15
-
-#if UINTPTR_MAX == 0xffffffffffffffff
-
-#define ticks_t int64_t
-/* Setting quick_run to 1 allows the simulator to model
-   only the allocation and deallocation accounting via
-   atomic_rss. The actual allocations are skipped.  This
-   mode is useful to verify the workload file.  */
-#define quick_run 0
-
-static __inline__ ticks_t rdtsc_s(void)
-{
-  unsigned a, d;
-  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
-  asm volatile("rdtscp" : "=a" (a), "=d" (d));
-  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
-}
-
-static __inline__ ticks_t rdtsc_e(void)
-{
-  unsigned a, d;
-  asm volatile("rdtscp" : "=a" (a), "=d" (d));
-  asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
-  return ((unsigned long long)a) | (((unsigned long long)d) << 32);
-}
-
-#else
-
-#define ticks_t int32_t
-
-static __inline__ ticks_t rdtsc_s(void)
-{
-  unsigned a, d;
-  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
-  asm volatile("rdtsc" : "=a" (a), "=d" (d));
-  return ((unsigned long)a) | (((unsigned long)d) << 16);
-}
-
-static __inline__ ticks_t rdtsc_e(void)
-{
-  unsigned a, d;
-  asm volatile("rdtscp" : "=a" (a), "=d" (d));
-  asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx");
-  return ((unsigned long)a) | (((unsigned long)d) << 16);
-}
-
-#endif
-
-static ticks_t diff_timeval (struct timeval e, struct timeval s)
-{
-  ticks_t usec;
-  if (e.tv_usec < s.tv_usec)
-    usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000;
-  else
-    usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000;
-  return usec;
-}
-
-#if 1
-#define Q1
-#define Q2
-#else
-pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER;
-#define Q1   pthread_mutex_lock(&genmutex)
-#define Q2   pthread_mutex_unlock(&genmutex)
-#endif
-
-pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER;
-#define NCBUF 10
-static char cbuf[NCBUF][30];
-static int ci = 0;
-
-char *comma(ticks_t x)
-{
-  char buf[30], *bs, *bd;
-  int l, i, idx;
-
-  pthread_mutex_lock(&cmutex);
-  ci = (ci + 1) % NCBUF;
-  idx = ci;
-  pthread_mutex_unlock(&cmutex);
-  bs = buf;
-  bd = cbuf[idx];
-
-  sprintf(buf, "%lld", (long long int)x);
-  l = strlen(buf);
-  i = l;
-  while (*bs)
-    {
-      *bd++ = *bs++;
-      i--;
-      if (i % 3 == 0 && *bs)
-	*bd++ = ',';
-    }
-  *bd = 0;
-  return cbuf[idx];
-}
-
-static volatile void **ptrs;
-static volatile size_t *sizes;
-static size_t n_ptrs;
-static volatile char *syncs;
-static pthread_mutex_t *mutexes;
-static pthread_cond_t *conds;
-static size_t n_syncs;
-
-static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
-ticks_t malloc_time = 0, malloc_count = 0;
-ticks_t calloc_time = 0, calloc_count = 0;
-ticks_t realloc_time = 0, realloc_count = 0;
-ticks_t free_time = 0, free_count = 0;
-
-size_t ideal_rss = 0;
-size_t max_ideal_rss = 0;
-static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-void atomic_rss (ssize_t delta)
-{
-  pthread_mutex_lock (&rss_mutex);
-  ideal_rss += delta;
-  if (max_ideal_rss < ideal_rss)
-    max_ideal_rss = ideal_rss;
-  pthread_mutex_unlock (&rss_mutex);
-}
-
-pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER;
-int threads_done = 0;
-
-//#define dprintf printf
-#define dprintf(...) (void)1
-
-//#define mprintf printf
-//#define MDEBUG 1
-#define mprintf(...) (void)1
-
-#define myabort() my_abort_2(thrc, __LINE__)
-void
-my_abort_2 (pthread_t thrc, int line)
-{
-  fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line);
-  abort();
-}
-
-/*------------------------------------------------------------*/
-/* Wrapper around I/O routines */
-
-int io_fd;
-
-#define IOSIZE 65536
-#define IOMIN 4096
-
-static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-typedef struct {
-  unsigned char buf[IOSIZE];
-  size_t incr;
-  size_t max_incr;
-  size_t buf_base;
-  size_t buf_idx;
-  int saw_eof;
-} IOPerThreadType;
-
-IOPerThreadType main_io;
-IOPerThreadType *thread_io;
-
-void
-io_init (IOPerThreadType *io, size_t file_offset, int incr)
-{
-  if (incr > IOSIZE)
-    incr = IOSIZE;
-  if (incr < IOMIN)
-    incr = IOMIN;
-
-  io->buf_base = file_offset;
-  io->buf_idx = 0;
-  io->incr = incr;
-
-  pthread_mutex_lock (&io_mutex);
-  lseek64 (io_fd, io->buf_base, SEEK_SET);
-  // short read OK, the eof is just to prevent runaways from bad data.
-  if (read (io_fd, io->buf, incr) < 0)
-    io->saw_eof = 1;
-  else
-    io->saw_eof = 0;
-  pthread_mutex_unlock (&io_mutex);
-}
-
-unsigned char
-io_read (IOPerThreadType *io)
-{
-  if (io->buf_idx >= io->incr)
-    io_init (io, io->buf_base + io->buf_idx, io->incr);
-  if (io->saw_eof)
-    return 0xff;
-  return io->buf [io->buf_idx++];
-}
-
-unsigned char
-io_peek (IOPerThreadType *io)
-{
-  if (io->buf_idx >= io->incr)
-    io_init (io, io->buf_base + io->buf_idx, io->incr);
-  if (io->saw_eof)
-    return 0xff;
-  return io->buf [io->buf_idx];
-}
-
-size_t
-io_pos (IOPerThreadType *io)
-{
-  return io->buf_base + io->buf_idx;
-}
-
-/*------------------------------------------------------------*/
-
-static void
-wmem (volatile void *ptr, int count)
-{
-  char *p = (char *)ptr;
-  int i;
-
-  if (!p)
-    return;
-
-  for (i=0; i<count; i++)
-    p[i] = 0x11;
-}
-#define xwmem(a,b)
-
-static size_t get_int (IOPerThreadType *io)
-{
-  size_t rv = 0;
-  while (1)
-  {
-    unsigned char c = io_read (io);
-    rv |= (c & 0x7f);
-    if (c & 0x80)
-      rv <<= 7;
-    else
-      return rv;
-  }
-}
-
-static void free_wipe (size_t idx)
-{
-  char *cp = (char *)ptrs[idx];
-  if (cp == NULL)
-    return;
-  size_t sz = sizes[idx];
-  size_t i;
-  for (i=0; i<sz; i++)
-    {
-      if (i % 8 == 1)
-	cp[i] = i / 8;
-      else
-	cp[i] = 0x22;
-    }
-}
-
-static void *
-thread_common (void *my_data_v)
-{
-  pthread_t thrc = pthread_self ();
-  size_t p1, p2, sz, sz2;
-  IOPerThreadType *io = (IOPerThreadType *)my_data_v;
-  ticks_t my_malloc_time = 0, my_malloc_count = 0;
-  ticks_t my_calloc_time = 0, my_calloc_count = 0;
-  ticks_t my_realloc_time = 0, my_realloc_count = 0;
-  ticks_t my_free_time = 0, my_free_count = 0;
-  ticks_t stime, etime;
-  int thread_idx = io - thread_io;
-#ifdef MDEBUG
-  volatile void *tmp;
-#endif
-
-  while (1)
-    {
-      unsigned char this_op = io_peek (io);
-      if (io->saw_eof)
-	myabort();
-      dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io),  io_peek (io));
-      switch (io_read (io))
-	{
-	case C_NOP:
-	  break;
-
-	case C_DONE:
-	  dprintf("op %p:%ld DONE\n", (void *)thrc, io_pos (io));
-	  pthread_mutex_lock (&stat_mutex);
-	  malloc_time += my_malloc_time;
-	  calloc_time += my_calloc_time;
-	  realloc_time += my_realloc_time;
-	  free_time += my_free_time;
-	  malloc_count += my_malloc_count;
-	  calloc_count += my_calloc_count;
-	  realloc_count += my_realloc_count;
-	  free_count += my_free_count;
-	  threads_done ++;
-	  pthread_mutex_unlock (&stat_mutex);
-	  pthread_mutex_lock(&stop_mutex);
-	  pthread_mutex_unlock(&stop_mutex);
-	  return NULL;
-
-	case C_MEMALIGN:
-	  p2 = get_int (io);
-	  sz2 = get_int (io);
-	  sz = get_int (io);
-	  dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz);
-	  /* we can't force memalign to return NULL (fail), so just skip it.  */
-	  if (p2 == 0)
-	    break;
-	  if (p2 > n_ptrs)
-	    myabort();
-	  stime = rdtsc_s();
-	  Q1;
-	  if (ptrs[p2])
-	    {
-	      if (!quick_run)
-		free ((void *)ptrs[p2]);
-	      atomic_rss (-sizes[p2]);
-	    }
-	  if (!quick_run)
-	    ptrs[p2] = memalign (sz2, sz);
-	  else
-	    ptrs[p2] = (void *)p2;
-	  /* Verify the alignment matches what is expected.  */
-	  if (((size_t)ptrs[p2] & (sz2 - 1)) != 0)
-	    myabort ();
-	  sizes[p2] = sz;
-	  mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz);
-	  Q2;
-	  etime = rdtsc_e();
-	  if (ptrs[p2] != NULL)
-	    atomic_rss (sz);
-	  if (etime < stime)
-	    {
-	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
-	    }
-	  my_malloc_time += etime - stime;
-	  my_malloc_count ++;
-	  if (!quick_run)
-	    wmem(ptrs[p2], sz);
-	  break;
-
-	case C_MALLOC:
-	  p2 = get_int (io);
-	  sz = get_int (io);
-	  dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
-	  /* we can't force malloc to return NULL (fail), so just skip it.  */
-	  if (p2 == 0)
-	    break;
-	  if (p2 > n_ptrs)
-	    myabort();
-	  stime = rdtsc_s();
-	  Q1;
-	  if (ptrs[p2])
-	    {
-	      if (!quick_run)
-		free ((void *)ptrs[p2]);
-	      atomic_rss (-sizes[p2]);
-	    }
-	  if (!quick_run)
-	    ptrs[p2] = malloc (sz);
-	  else
-	    ptrs[p2] = (void *)p2;
-	  sizes[p2] = sz;
-	  mprintf("%p = malloc(%lx)\n", ptrs[p2], sz);
-	  Q2;
-	  etime = rdtsc_e();
-	  if (ptrs[p2] != NULL)
-	    atomic_rss (sz);
-	  if (etime < stime)
-	    {
-	      printf("s: %llx e:%llx  d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime));
-	    }
-	  my_malloc_time += etime - stime;
-	  my_malloc_count ++;
-	  if (!quick_run)
-	    wmem(ptrs[p2], sz);
-	  break;
-
-	case C_CALLOC:
-	  p2 = get_int (io);
-	  sz = get_int (io);
-	  dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz);
-	  /* we can't force calloc to return NULL (fail), so just skip it.  */
-	  if (p2 == 0)
-	    break;
-	  if (p2 > n_ptrs)
-	    myabort();
-	  if (ptrs[p2])
-	    {
-	      if (!quick_run)
-		free ((void *)ptrs[p2]);
-	      atomic_rss (-sizes[p2]);
-	    }
-	  stime = rdtsc_s();
-	  Q1;
-	  if (!quick_run)
-	    ptrs[p2] = calloc (sz, 1);
-	  else
-	    ptrs[p2] = (void *)p2;
-	  sizes[p2] = sz;
-	  mprintf("%p = calloc(%lx)\n", ptrs[p2], sz);
-	  Q2;
-	  if (ptrs[p2])
-	    atomic_rss (sz);
-	  my_calloc_time += rdtsc_e() - stime;
-	  my_calloc_count ++;
-	  if (!quick_run)
-	    wmem(ptrs[p2], sz);
-	  break;
-
-	case C_REALLOC:
-	  p2 = get_int (io);
-	  p1 = get_int (io);
-	  sz = get_int (io);
-	  dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz);
-	  if (p1 > n_ptrs)
-	    myabort();
-	  if (p2 > n_ptrs)
-	    myabort();
-	  /* we can't force realloc to return NULL (fail), so just skip it.  */
-	  if (p2 == 0)
-	    break;
-
-	  if (ptrs[p1])
-	    atomic_rss (-sizes[p1]);
-	  if (!quick_run)
-	    free_wipe(p1);
-	  stime = rdtsc_s();
-	  Q1;
-#ifdef MDEBUG
-	  tmp = ptrs[p1];
-#endif
-	  if (!quick_run)
-	    ptrs[p2] = realloc ((void *)ptrs[p1], sz);
-	  else
-	    ptrs[p2] = (void *)p2;
-	  sizes[p2] = sz;
-	  mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz);
-	  Q2;
-	  my_realloc_time += rdtsc_e() - stime;
-	  my_realloc_count ++;
-	  if (!quick_run)
-	    wmem(ptrs[p2], sz);
-	  if (p1 != p2)
-	    ptrs[p1] = 0;
-	  if (ptrs[p2])
-	    atomic_rss (sizes[p2]);
-	  break;
-
-	case C_FREE:
-	  p1 = get_int (io);
-	  if (p1 > n_ptrs)
-	    myabort();
-	  dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1);
-	  if (!quick_run)
-	    free_wipe (p1);
-	  if (ptrs[p1])
-	    atomic_rss (-sizes[p1]);
-	  stime = rdtsc_s();
-	  Q1;
-	  mprintf("free(%p)\n", ptrs[p1]);
-	  if (!quick_run)
-	    free ((void *)ptrs[p1]);
-	  Q2;
-	  my_free_time += rdtsc_e() - stime;
-	  my_free_count ++;
-	  ptrs[p1] = 0;
-	  break;
-
-	case C_SYNC_W:
-	  p1 = get_int(io);
-	  dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1);
-	  if (p1 > n_syncs)
-	    myabort();
-	  pthread_mutex_lock (&mutexes[p1]);
-	  syncs[p1] = 1;
-	  pthread_cond_signal (&conds[p1]);
-	  __sync_synchronize ();
-	  pthread_mutex_unlock (&mutexes[p1]);
-	  break;
-
-	case C_SYNC_R:
-	  p1 = get_int(io);
-	  dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1);
-	  if (p1 > n_syncs)
-	    myabort();
-	  pthread_mutex_lock (&mutexes[p1]);
-	  while (syncs[p1] != 1)
-	    {
-	      pthread_cond_wait (&conds[p1], &mutexes[p1]);
-	      __sync_synchronize ();
-	    }
-	  pthread_mutex_unlock (&mutexes[p1]);
-	  break;
-
-	default:
-	  printf("op %d - unsupported, thread %d addr %lu\n",
-		 this_op, thread_idx, (long unsigned int)io_pos (io));
-	  myabort();
-	}
-    }
-}
-
-static void *alloc_mem (size_t amt)
-{
-  void *rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-  mlock (rv, amt);
-  memset (rv, 0, amt);
-  return rv;
-}
-
-static pthread_t *thread_ids;
-
-void *
-my_malloc (const char *msg, int size, IOPerThreadType *io, size_t *psz, size_t count)
-{
-  void *rv;
-  if (psz)
-    count = *psz = get_int (io);
-  dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count);
-  rv = alloc_mem(size * count);
-  if (!rv)
-    {
-      fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)*psz);
-      exit(1);
-    }
-  mlock (rv, size * count);
-  return rv;
-}
-
-static const char * const scan_names[] = {
-  "UNUSED",
-  "ARENA",
-  "HEAP",
-  "CHUNK_USED",
-  "CHUNK_FREE",
-  "FASTBIN_FREE",
-  "UNSORTED",
-  "TOP",
-  "TCACHE",
-  "USED"
-};
-
-void
-malloc_scan_callback (void *ptr, size_t length, int type)
-{
-  printf("%s: ptr %p length %llx\n", scan_names[type], ptr, (long long)length);
-}
-
-#define MY_ALLOC(T, psz)				\
-  (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0)
-#define MY_ALLOCN(T, count)				\
-  (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count)
-
-int
-main(int argc, char **argv)
-{
-  ticks_t start=0;
-  ticks_t end;
-  ticks_t usec;
-  struct timeval tv_s, tv_e;
-  int thread_idx = 0;
-  int i;
-  size_t n_threads = 0;
-  size_t idx;
-  struct rusage res_start, res_end;
-  int done;
-  size_t guessed_io_size = 4096;
-  struct stat statb;
-
-  if (argc < 2)
-    {
-      fprintf(stderr, "Usage: %s <trace2dat.outfile>\n", argv[0]);
-      exit(1);
-    }
-  io_fd = open(argv[1], O_RDONLY);
-  if (io_fd < 0)
-    {
-      fprintf(stderr, "Unable to open %s for reading\n", argv[1]);
-      perror("The error was");
-      exit(1);
-    }
-  fstat (io_fd, &statb);
-
-  io_init (&main_io, 0, IOMIN);
-
-  pthread_mutex_lock(&stop_mutex);
-
-  done = 0;
-  while (!done)
-    {
-      switch (io_read (&main_io))
-	{
-	case C_NOP:
-	  break;
-	case C_ALLOC_PTRS:
-	  ptrs = MY_ALLOC (ptrs, &n_ptrs);
-	  sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs);
-	  ptrs[0] = 0;
-	  break;
-	case C_ALLOC_SYNCS:
-	  n_syncs = get_int(&main_io);
-	  syncs = MY_ALLOCN (syncs, n_syncs);
-	  conds = MY_ALLOCN (conds, n_syncs);
-	  mutexes = MY_ALLOCN (mutexes, n_syncs);
-	  for (idx=0; idx<n_syncs; idx++)
-	    {
-	      pthread_mutex_init (&mutexes[idx], NULL);
-	      pthread_cond_init (&conds[idx], NULL);
-	    }
-	  break;
-	case C_NTHREADS:
-	  thread_ids = MY_ALLOC (thread_ids, &n_threads);
-	  thread_io = MY_ALLOCN (thread_io, n_threads);
-	  guessed_io_size = ((statb.st_size / n_threads) < (1024*1024)) ? 65536 : 4096;
-	  /* The next thing in the workscript is thread creation */
-	  getrusage (RUSAGE_SELF, &res_start);
-	  gettimeofday (&tv_s, NULL);
-	  start = rdtsc_s();
-	  break;
-	case C_START_THREAD:
-	  idx = get_int (&main_io);
-	  io_init (& thread_io[thread_idx], idx, guessed_io_size);
-	  pthread_create (&thread_ids[thread_idx], NULL, thread_common, thread_io + thread_idx);
-	  dprintf("Starting thread %lld at offset %lu %lx\n", (long long)thread_ids[thread_idx], (unsigned long)idx, (unsigned long)idx);
-	  thread_idx ++;
-	  break;
-	case C_DONE:
-	  do
-	    {
-	      pthread_mutex_lock (&stat_mutex);
-	      i = threads_done;
-	      pthread_mutex_unlock (&stat_mutex);
-	    } while (i < thread_idx);
-	  done = 1;
-	  break;
-	}
-    }
-  if (!quick_run)
-    {
-      end = rdtsc_e();
-      gettimeofday (&tv_e, NULL);
-      getrusage (RUSAGE_SELF, &res_end);
-
-      printf("%s cycles\n", comma(end - start));
-      usec = diff_timeval (tv_e, tv_s);
-      printf("%s usec wall time\n", comma(usec));
-
-      usec = diff_timeval (res_end.ru_utime, res_start.ru_utime);
-      printf("%s usec across %d thread%s\n",
-	     comma(usec), (int)n_threads, n_threads == 1 ? "" : "s");
-      printf("%s Kb Max RSS (%s -> %s)\n",
-	     comma(res_end.ru_maxrss - res_start.ru_maxrss),
-	     comma(res_start.ru_maxrss), comma(res_end.ru_maxrss));
-    }
-  printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024));
-
-  if (malloc_count == 0) malloc_count ++;
-  if (calloc_count == 0) calloc_count ++;
-  if (realloc_count == 0) realloc_count ++;
-  if (free_count == 0) free_count ++;
-
-  if (!quick_run)
-    {
-      printf("\n");
-      printf("sizeof ticks_t is %lu\n", sizeof(ticks_t));
-      printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count));
-      printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count));
-      printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count));
-      printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count));
-      printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time));
-      printf("\n");
-    }
-
-#if 0
-  /* Free any still-held chunks of memory.  */
-  for (idx=0; idx<n_ptrs; idx++)
-    if (ptrs[idx])
-      {
-	free((void *)ptrs[idx]);
-	ptrs[idx] = 0;
-      }
-#endif
-
-#if 0
-  /* This will fail (crash) for system glibc but that's OK.  */
-  __malloc_scan_chunks(malloc_scan_callback);
-
-  malloc_info (0, stdout);
-#endif
-
-#if 0
-  /* ...or report them as used.  */
-  for (idx=0; idx<n_ptrs; idx++)
-    if (ptrs[idx])
-      {
-	char *p = (char *)ptrs[idx] - 2*sizeof(size_t);
-	size_t *sp = (size_t *)p;
-	size_t size = sp[1] & ~7;
-	malloc_scan_callback (sp, size, 9);
-      }
-#endif
-
-  /* Now that we've scanned all the per-thread caches, it's safe to
-     let them exit and clean up.  */
-  pthread_mutex_unlock(&stop_mutex);
-
-  for (i=0; i<thread_idx; i++)
-    pthread_join (thread_ids[i], NULL);
-
-  return 0;
-}
-- 
cgit v1.2.3