From 130765de719a3ddc475284e13749d09ff371a8e1 Mon Sep 17 00:00:00 2001 From: Florian Fischer Date: Fri, 1 Feb 2019 16:35:20 +0100 Subject: rework build system #1 each benchmark has its own Makefile which must put its binaries into OBJDIR which is added to the PATH during execution. --- src/Makefile | 27 + src/benchmark.py | 2 + src/benchmarks/bench_loop.c | 87 ---- src/benchmarks/cache-scratch.cc | 147 ------ src/benchmarks/cache-thrash.cc | 134 ----- src/benchmarks/cpuinfo.h | 202 -------- src/benchmarks/dj_trace/Makefile | 24 + src/benchmarks/dj_trace/trace_run.c | 750 +++++++++++++++++++++++++++ src/benchmarks/falsesharing/Makefile | 29 ++ src/benchmarks/falsesharing/cache-scratch.cc | 147 ++++++ src/benchmarks/falsesharing/cache-thrash.cc | 134 +++++ src/benchmarks/falsesharing/cpuinfo.h | 202 ++++++++ src/benchmarks/falsesharing/fred.h | 97 ++++ src/benchmarks/falsesharing/timer.h | 372 +++++++++++++ src/benchmarks/fred.h | 97 ---- src/benchmarks/larson.cc | 744 -------------------------- src/benchmarks/larson/Makefile | 24 + src/benchmarks/larson/larson.cc | 744 ++++++++++++++++++++++++++ src/benchmarks/loop/Makefile | 24 + src/benchmarks/loop/loop.c | 87 ++++ src/benchmarks/timer.h | 372 ------------- src/dj_trace.py | 4 +- src/falsesharing.py | 4 +- src/larson.py | 4 +- src/loop.py | 4 +- src/trace_run.c | 750 --------------------------- 26 files changed, 2671 insertions(+), 2541 deletions(-) create mode 100644 src/Makefile delete mode 100644 src/benchmarks/bench_loop.c delete mode 100644 src/benchmarks/cache-scratch.cc delete mode 100644 src/benchmarks/cache-thrash.cc delete mode 100644 src/benchmarks/cpuinfo.h create mode 100644 src/benchmarks/dj_trace/Makefile create mode 100644 src/benchmarks/dj_trace/trace_run.c create mode 100644 src/benchmarks/falsesharing/Makefile create mode 100644 src/benchmarks/falsesharing/cache-scratch.cc create mode 100644 src/benchmarks/falsesharing/cache-thrash.cc create mode 100644 src/benchmarks/falsesharing/cpuinfo.h 
create mode 100644 src/benchmarks/falsesharing/fred.h create mode 100644 src/benchmarks/falsesharing/timer.h delete mode 100644 src/benchmarks/fred.h delete mode 100644 src/benchmarks/larson.cc create mode 100644 src/benchmarks/larson/Makefile create mode 100644 src/benchmarks/larson/larson.cc create mode 100644 src/benchmarks/loop/Makefile create mode 100644 src/benchmarks/loop/loop.c delete mode 100644 src/benchmarks/timer.h delete mode 100644 src/trace_run.c (limited to 'src') diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..6b7b704 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,27 @@ +OBJDIR ?= obj + +CC ?= gcc + +WARNFLAGS ?= -Wall -Wextra +COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread +OPTFLAGS ?= -O3 -DNDEBUG + +CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS) + +LDFLAGS ?= -pthread -static-libgcc + +.PHONY: all clean + +all: $(OBJDIR)/print_status_on_exit.so $(OBJDIR)/chattymalloc.so + +$(OBJDIR)/print_status_on_exit.so: print_status_on_exit.c | $(OBJDIR) + $(CC) $(LDFLAGS) -shared $(CFLAGS) -o $@ $< + +$(OBJDIR)/chattymalloc.so: chattymalloc.c | $(OBJDIR) + $(CC) $(LDFLAGS) -shared $(CFLAGS) -o $@ $< + +$(OBJDIR): + mkdir $@ + +clean: + rm -rf $(OBJDIR) diff --git a/src/benchmark.py b/src/benchmark.py index 4de05e3..e4dbef2 100644 --- a/src/benchmark.py +++ b/src/benchmark.py @@ -81,6 +81,7 @@ class Benchmark (object): self.results[target] = d def prepare(self, verbose=False): + os.environ["PATH"] += ":build/" + self.name def is_exe(fpath): return os.path.isfile(fpath) and os.access(fpath, os.X_OK) @@ -256,6 +257,7 @@ class Benchmark (object): if self.posttarget_hook((tname, t), run, verbose): return False print() + os.environ["PATH"] = os.environ["PATH"].replace(":build/"+self.name, "") return True def plot_single_arg(self, yval, ylabel="'y-label'", xlabel="'x-label'", autoticks=True, diff --git a/src/benchmarks/bench_loop.c b/src/benchmarks/bench_loop.c deleted file mode 100644 index bc15808..0000000 --- 
a/src/benchmarks/bench_loop.c +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -static size_t _rand() { - static __thread size_t seed = 123456789; - size_t a = 1103515245; - size_t c = 12345; - size_t m = 1 << 31; - seed = (a * seed + c) % m; - return seed; -} - -typedef struct ThreadArgs { - double benchmark; - int allocations; - int max_size; -} ThreadArgs; - -static void* malloc_then_write(size_t size) { - void* ptr = malloc(size); - // Write to ptr - /* *((char*)ptr) = '!'; */ - return ptr; -} - -static void read_then_free(void* ptr) { - // Read before free - /* char s __attribute__((unused)) = *((char*)ptr); */ - free(ptr); -} -static void* test_thread_func(void* arg) { - ThreadArgs* args = (ThreadArgs*)arg; - - for(int i = 0; i < args->allocations; i++) { - void* ptr = malloc_then_write((_rand() % args->max_size) + 1); - read_then_free(ptr); - } - return NULL; -} - -int main(int argc, char* argv[]) { - pthread_t* threads; - int num_threads; - struct ThreadArgs thread_args; - - if (argc < 4) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - num_threads = atoi(argv[1]); - thread_args.allocations = atoi(argv[2]); - thread_args.max_size = atoi(argv[3]); - - threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t)); - - for (int i = 0; i < num_threads; i++) { - if (0 != pthread_create(&threads[i], NULL, test_thread_func, &thread_args)) { - perror("pthread_create"); - return 1; - } - } - - for(int i = 0; i < num_threads; i++) { - if (0 != pthread_join(threads[i], NULL)) { - perror("pthread_join"); - return 1; - } - } - - if (argc == 5) - { - FILE* f = stdout; - if (strcmp(argv[4],"stdout") != 0) - f = fopen(argv[4], "w"); - malloc_info(0, f); - if (strcmp(argv[4],"stdout") != 0) - fclose(f); - } - - return 0; -} diff --git a/src/benchmarks/cache-scratch.cc b/src/benchmarks/cache-scratch.cc deleted file mode 100644 index 2cb9b28..0000000 --- a/src/benchmarks/cache-scratch.cc +++ /dev/null @@ -1,147 
+0,0 @@ -///-*-C++-*-////////////////////////////////////////////////////////////////// -// -// Hoard: A Fast, Scalable, and Memory-Efficient Allocator -// for Shared-Memory Multiprocessors -// Contact author: Emery Berger, http://www.cs.umass.edu/~emery -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Library General Public License as -// published by the Free Software Foundation, http://www.fsf.org. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Library General Public License for more details. -// -////////////////////////////////////////////////////////////////////////////// - -/** - * @file cache-scratch.cpp - * - * cache-scratch is a benchmark that exercises a heap's cache locality. - * An allocator that allows multiple threads to re-use the same small - * object (possibly all in one cache-line) will scale poorly, while - * an allocator like Hoard will exhibit near-linear scaling. - * - * Try the following (on a P-processor machine): - * - * cache-scratch 1 1000 1 1000000 - * cache-scratch P 1000 1 1000000 - * - * cache-scratch-hoard 1 1000 1 1000000 - * cache-scratch-hoard P 1000 1 1000000 - * - * The ideal is a P-fold speedup. -*/ - -#include -#include - -#include "fred.h" -#include "cpuinfo.h" -#include "timer.h" - -// This class just holds arguments to each thread. -class workerArg { -public: - - workerArg() {} - - workerArg (char * obj, int objSize, int repetitions, int iterations) - : _object (obj), - _objSize (objSize), - _iterations (iterations), - _repetitions (repetitions) - {} - - char * _object; - int _objSize; - int _iterations; - int _repetitions; -}; - - -#if defined(_WIN32) -extern "C" void worker (void * arg) -#else -extern "C" void * worker (void * arg) -#endif -{ - // free the object we were given. 
- // Then, repeatedly do the following: - // malloc a given-sized object, - // repeatedly write on it, - // then free it. - workerArg * w = (workerArg *) arg; - delete w->_object; - workerArg w1 = *w; - for (int i = 0; i < w1._iterations; i++) { - // Allocate the object. - char * obj = new char[w1._objSize]; - // Write into it a bunch of times. - for (int j = 0; j < w1._repetitions; j++) { - for (int k = 0; k < w1._objSize; k++) { - obj[k] = (char) k; - volatile char ch = obj[k]; - ch++; - } - } - // Free the object. - delete [] obj; - } - -#if !defined(_WIN32) - return NULL; -#endif -} - - -int main (int argc, char * argv[]) -{ - int nthreads; - int iterations; - int objSize; - int repetitions; - - if (argc > 4) { - nthreads = atoi(argv[1]); - iterations = atoi(argv[2]); - objSize = atoi(argv[3]); - repetitions = atoi(argv[4]); - } else { - fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]); - return 1; - } - - HL::Fred * threads = new HL::Fred[nthreads]; - HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors()); - - workerArg * w = new workerArg[nthreads]; - - int i; - - // Allocate nthreads objects and distribute them among the threads. 
- char ** objs = new char * [nthreads]; - for (i = 0; i < nthreads; i++) { - objs[i] = new char[objSize]; - } - - HL::Timer t; - t.start(); - - for (i = 0; i < nthreads; i++) { - w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations); - threads[i].create (&worker, (void *) &w[i]); - } - for (i = 0; i < nthreads; i++) { - threads[i].join(); - } - t.stop(); - - delete [] threads; - delete [] objs; - delete [] w; - - printf ("Time elapsed = %f seconds.\n", (double) t); - return 0; -} diff --git a/src/benchmarks/cache-thrash.cc b/src/benchmarks/cache-thrash.cc deleted file mode 100644 index 79242eb..0000000 --- a/src/benchmarks/cache-thrash.cc +++ /dev/null @@ -1,134 +0,0 @@ -///-*-C++-*-////////////////////////////////////////////////////////////////// -// -// Hoard: A Fast, Scalable, and Memory-Efficient Allocator -// for Shared-Memory Multiprocessors -// Contact author: Emery Berger, http://www.cs.umass.edu/~emery -// -// Copyright (c) 1998-2003, The University of Texas at Austin. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Library General Public License as -// published by the Free Software Foundation, http://www.fsf.org. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Library General Public License for more details. -// -////////////////////////////////////////////////////////////////////////////// - -/** - * @file cache-thrash.cpp - * @brief cache-thrash is a benchmark that exercises a heap's cache-locality. - * - * Try the following (on a P-processor machine): - * - * cache-thrash 1 1000 1 1000000 - * cache-thrash P 1000 1 1000000 - * - * cache-thrash-hoard 1 1000 1 1000000 - * cache-thrash-hoard P 1000 1 1000000 - * - * The ideal is a P-fold speedup. 
-*/ - - -#include -#include - -using namespace std; - -#include "cpuinfo.h" -#include "fred.h" -#include "timer.h" - -// This class just holds arguments to each thread. -class workerArg { -public: - workerArg() {} - workerArg (int objSize, int repetitions, int iterations) - : _objSize (objSize), - _iterations (iterations), - _repetitions (repetitions) - {} - - int _objSize; - int _iterations; - int _repetitions; -}; - - -#if defined(_WIN32) -extern "C" void worker (void * arg) -#else -extern "C" void * worker (void * arg) -#endif -{ - // Repeatedly do the following: - // malloc a given-sized object, - // repeatedly write on it, - // then free it. - workerArg * w = (workerArg *) arg; - workerArg w1 = *w; - for (int i = 0; i < w1._iterations; i++) { - // Allocate the object. - char * obj = new char[w1._objSize]; - // printf ("obj = %p\n", obj); - // Write into it a bunch of times. - for (int j = 0; j < w1._repetitions; j++) { - for (int k = 0; k < w1._objSize; k++) { - obj[k] = (char) k; - volatile char ch = obj[k]; - ch++; - } - } - // Free the object. 
- delete [] obj; - } -#if !defined(_WIN32) - return NULL; -#endif -} - - -int main (int argc, char * argv[]) -{ - int nthreads; - int iterations; - int objSize; - int repetitions; - - if (argc > 4) { - nthreads = atoi(argv[1]); - iterations = atoi(argv[2]); - objSize = atoi(argv[3]); - repetitions = atoi(argv[4]); - } else { - cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl; - exit(1); - } - - HL::Fred * threads = new HL::Fred[nthreads]; - HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors()); - - int i; - - HL::Timer t; - t.start(); - - workerArg * w = new workerArg[nthreads]; - - for (i = 0; i < nthreads; i++) { - w[i] = workerArg (objSize, repetitions / nthreads, iterations); - threads[i].create (&worker, (void *) &w[i]); - } - for (i = 0; i < nthreads; i++) { - threads[i].join(); - } - t.stop(); - - delete [] threads; - delete [] w; - - cout << "Time elapsed = " << (double) t << " seconds." << endl; -} diff --git a/src/benchmarks/cpuinfo.h b/src/benchmarks/cpuinfo.h deleted file mode 100644 index 1ed1f36..0000000 --- a/src/benchmarks/cpuinfo.h +++ /dev/null @@ -1,202 +0,0 @@ -// -*- C++ -*- - -/* - - Heap Layers: An Extensible Memory Allocation Infrastructure - - Copyright (C) 2000-2003 by Emery Berger - http://www.cs.umass.edu/~emery - emery@cs.umass.edu - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -*/ - - - -#ifndef HL_CPUINFO_H -#define HL_CPUINFO_H - -#if defined(_WIN32) -#include -#include -#else -#include -#endif - - -#if !defined(_WIN32) -#include -#endif - -#if defined(__SVR4) // Solaris -#include -extern "C" unsigned int lwp_self(void); -#include -extern "C" int _thr_self(void); -#endif - -#if defined(__linux) -#include -#include -#include -#include -#include -#endif - -#if defined(__APPLE__) -#include -#include -#endif - -#if defined(__sgi) -#include -#include -#include -#endif - -#if defined(hpux) -#include -#endif - -#if defined(_WIN32) -extern __declspec(thread) int localThreadId; -#endif - -#if defined(__SVR4) && defined(MAP_ALIGN) -extern volatile int anyThreadStackCreated; -#endif - -namespace HL { - -/** - * @class CPUInfo - * @author Emery Berger - * - * @brief Architecture-independent wrapper to get number of CPUs. - */ - -class CPUInfo { -public: - CPUInfo (void) - {} - - inline static int getNumProcessors (void) { - static int _numProcessors = computeNumProcessors(); - return _numProcessors; - } - - static inline unsigned long getThreadId (void); - inline static int computeNumProcessors (void); - -}; - - -int CPUInfo::computeNumProcessors (void) -{ - static int np = 0; - if (!np) { -#if defined(__linux) || defined(__APPLE__) - np = (int) sysconf(_SC_NPROCESSORS_ONLN); -#elif defined(_WIN32) - SYSTEM_INFO infoReturn[1]; - GetSystemInfo (infoReturn); - np = (int) (infoReturn->dwNumberOfProcessors); -#elif defined(__sgi) - np = (int) sysmp(MP_NAPROCS); -#elif defined(hpux) - np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()? -#elif defined(_SC_NPROCESSORS_ONLN) - np = (int) (sysconf(_SC_NPROCESSORS_ONLN)); -#else - np = 2; - // Unsupported platform. - // Pretend we have at least two processors. 
This approach avoids the risk of assuming - // we're on a uniprocessor, which might lead clever allocators to avoid using atomic - // operations for all locks. -#endif - return np; - } else { - return np; - } -} - - // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris], -// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch. -// pthread_attr_getstacksize -// pthread_attr_setstackaddr -// pthread_attr_getstackaddr -// PTHREAD_STACK_SIZE is minimum. -// or should we just assume we have __declspec(thread) or __thread? - -#if defined(USE_THREAD_KEYWORD) - extern __thread int localThreadId; -#endif - - // FIX ME FIXME - //#include - -unsigned long CPUInfo::getThreadId (void) { -#if defined(__SVR4) - size_t THREAD_STACK_SIZE; - if (sizeof(size_t) <= 4) { - THREAD_STACK_SIZE = 1048576; - } else { - // 64-bits. - THREAD_STACK_SIZE = 1048576 * 2; - } - if (0) { // !anyThreadStackCreated) { - // We know a priori that all stack variables - // are on different stacks. Since no one has created - // a special one, we are in control, and thus all stacks - // are 1 MB in size and on 1 MB boundaries. - // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.) - char buf; - return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20; - } else { - return (int) pthread_self(); - } -#elif defined(_WIN32) - // It looks like thread id's are always multiples of 4, so... - return GetCurrentThreadId() >> 2; -#elif defined(__APPLE__) - // Consecutive thread id's in Mac OS are 4096 apart; - // dividing off the 4096 gives us an appropriate thread id. - int tid = (int) ((unsigned long) pthread_self()) >> 12; - return tid; -#elif defined(__BEOS__) - return find_thread(0); -#elif defined(USE_THREAD_KEYWORD) - return localThreadId; -#elif defined(__linux) || defined(PTHREAD_KEYS_MAX) - // Consecutive thread id's in Linux are 1024 apart; - // dividing off the 1024 gives us an appropriate thread id. 
- return (unsigned long) pthread_self() >> 10; -#elif defined(POSIX) - return (unsigned long) pthread_self(); -#elif USE_SPROC - // This hairiness has the same effect as calling getpid(), - // but it's MUCH faster since it avoids making a system call - // and just accesses the sproc-local data directly. - unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid; - return pid; -#else - return 0; -#endif -} - -} - -#endif diff --git a/src/benchmarks/dj_trace/Makefile b/src/benchmarks/dj_trace/Makefile new file mode 100644 index 0000000..14eca91 --- /dev/null +++ b/src/benchmarks/dj_trace/Makefile @@ -0,0 +1,24 @@ +OBJDIR ?= obj + +CC ?= gcc + +WARNFLAGS ?= -Wall -Wextra +COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread +OPTFLAGS ?= -O3 -DNDEBUG + +CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS) + +LDFLAGS ?= -pthread -static-libgcc + +.PHONY = all clean + +all: $(OBJDIR)/trace_run + +$(OBJDIR)/trace_run: trace_run.c | $(OBJDIR) + $(CC) $(LDFLAGS) $(CFLAGS) -o $@ $< + +$(OBJDIR): + mkdir $@ + +clean: + rm -rf $(OBJDIR) diff --git a/src/benchmarks/dj_trace/trace_run.c b/src/benchmarks/dj_trace/trace_run.c new file mode 100644 index 0000000..604d01e --- /dev/null +++ b/src/benchmarks/dj_trace/trace_run.c @@ -0,0 +1,750 @@ +#define _LARGEFILE64_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// #include "malloc.h" +#include + +// #include "mtrace.h" +/* Codes for the simulator/workload programs. Copied from mtrace.h. 
*/ +#define C_NOP 0 +#define C_DONE 1 +#define C_MALLOC 2 +#define C_CALLOC 3 +#define C_REALLOC 4 +#define C_FREE 5 +#define C_SYNC_W 6 +#define C_SYNC_R 7 +#define C_ALLOC_PTRS 8 +#define C_ALLOC_SYNCS 9 +#define C_NTHREADS 10 +#define C_START_THREAD 11 +#define C_MEMALIGN 12 +#define C_VALLOC 13 +#define C_PVALLOC 14 +#define C_POSIX_MEMALIGN 15 + +#if UINTPTR_MAX == 0xffffffffffffffff + +#define ticks_t int64_t +/* Setting quick_run to 1 allows the simulator to model + only the allocation and deallocation accounting via + atomic_rss. The actual allocations are skipped. This + mode is useful to verify the workload file. */ +#define quick_run 0 + +static __inline__ ticks_t rdtsc_s(void) +{ + unsigned a, d; + asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx"); + asm volatile("rdtscp" : "=a" (a), "=d" (d)); + return ((unsigned long long)a) | (((unsigned long long)d) << 32); +} + +static __inline__ ticks_t rdtsc_e(void) +{ + unsigned a, d; + asm volatile("rdtscp" : "=a" (a), "=d" (d)); + asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx"); + return ((unsigned long long)a) | (((unsigned long long)d) << 32); +} + +#else + +#define ticks_t int32_t + +static __inline__ ticks_t rdtsc_s(void) +{ + unsigned a, d; + asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx"); + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((unsigned long)a) | (((unsigned long)d) << 16); +} + +static __inline__ ticks_t rdtsc_e(void) +{ + unsigned a, d; + asm volatile("rdtscp" : "=a" (a), "=d" (d)); + asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx"); + return ((unsigned long)a) | (((unsigned long)d) << 16); +} + +#endif + +static ticks_t diff_timeval (struct timeval e, struct timeval s) +{ + ticks_t usec; + if (e.tv_usec < s.tv_usec) + usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000; + else + usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000; + return usec; +} + +#if 1 +#define Q1 +#define Q2 +#else +pthread_mutex_t genmutex = 
PTHREAD_MUTEX_INITIALIZER; +#define Q1 pthread_mutex_lock(&genmutex) +#define Q2 pthread_mutex_unlock(&genmutex) +#endif + +pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER; +#define NCBUF 10 +static char cbuf[NCBUF][30]; +static int ci = 0; + +char *comma(ticks_t x) +{ + char buf[30], *bs, *bd; + int l, i, idx; + + pthread_mutex_lock(&cmutex); + ci = (ci + 1) % NCBUF; + idx = ci; + pthread_mutex_unlock(&cmutex); + bs = buf; + bd = cbuf[idx]; + + sprintf(buf, "%lld", (long long int)x); + l = strlen(buf); + i = l; + while (*bs) + { + *bd++ = *bs++; + i--; + if (i % 3 == 0 && *bs) + *bd++ = ','; + } + *bd = 0; + return cbuf[idx]; +} + +static volatile void **ptrs; +static volatile size_t *sizes; +static size_t n_ptrs; +static volatile char *syncs; +static pthread_mutex_t *mutexes; +static pthread_cond_t *conds; +static size_t n_syncs; + +static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER; +ticks_t malloc_time = 0, malloc_count = 0; +ticks_t calloc_time = 0, calloc_count = 0; +ticks_t realloc_time = 0, realloc_count = 0; +ticks_t free_time = 0, free_count = 0; + +size_t ideal_rss = 0; +size_t max_ideal_rss = 0; +static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER; + +void atomic_rss (ssize_t delta) +{ + pthread_mutex_lock (&rss_mutex); + ideal_rss += delta; + if (max_ideal_rss < ideal_rss) + max_ideal_rss = ideal_rss; + pthread_mutex_unlock (&rss_mutex); +} + +pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER; +int threads_done = 0; + +//#define dprintf printf +#define dprintf(...) (void)1 + +//#define mprintf printf +//#define MDEBUG 1 +#define mprintf(...) 
(void)1 + +#define myabort() my_abort_2(thrc, __LINE__) +void +my_abort_2 (pthread_t thrc, int line) +{ + fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line); + abort(); +} + +/*------------------------------------------------------------*/ +/* Wrapper around I/O routines */ + +int io_fd; + +#define IOSIZE 65536 +#define IOMIN 4096 + +static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER; + +typedef struct { + unsigned char buf[IOSIZE]; + size_t incr; + size_t max_incr; + size_t buf_base; + size_t buf_idx; + int saw_eof; +} IOPerThreadType; + +IOPerThreadType main_io; +IOPerThreadType *thread_io; + +void +io_init (IOPerThreadType *io, size_t file_offset, int incr) +{ + if (incr > IOSIZE) + incr = IOSIZE; + if (incr < IOMIN) + incr = IOMIN; + + io->buf_base = file_offset; + io->buf_idx = 0; + io->incr = incr; + + pthread_mutex_lock (&io_mutex); + lseek64 (io_fd, io->buf_base, SEEK_SET); + // short read OK, the eof is just to prevent runaways from bad data. + if (read (io_fd, io->buf, incr) < 0) + io->saw_eof = 1; + else + io->saw_eof = 0; + pthread_mutex_unlock (&io_mutex); +} + +unsigned char +io_read (IOPerThreadType *io) +{ + if (io->buf_idx >= io->incr) + io_init (io, io->buf_base + io->buf_idx, io->incr); + if (io->saw_eof) + return 0xff; + return io->buf [io->buf_idx++]; +} + +unsigned char +io_peek (IOPerThreadType *io) +{ + if (io->buf_idx >= io->incr) + io_init (io, io->buf_base + io->buf_idx, io->incr); + if (io->saw_eof) + return 0xff; + return io->buf [io->buf_idx]; +} + +size_t +io_pos (IOPerThreadType *io) +{ + return io->buf_base + io->buf_idx; +} + +/*------------------------------------------------------------*/ + +static void +wmem (volatile void *ptr, int count) +{ + char *p = (char *)ptr; + int i; + + if (!p) + return; + + for (i=0; isaw_eof) + myabort(); + dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io), io_peek (io)); + switch (io_read (io)) + { + case C_NOP: + break; + + case C_DONE: + dprintf("op %p:%ld DONE\n", 
(void *)thrc, io_pos (io)); + pthread_mutex_lock (&stat_mutex); + malloc_time += my_malloc_time; + calloc_time += my_calloc_time; + realloc_time += my_realloc_time; + free_time += my_free_time; + malloc_count += my_malloc_count; + calloc_count += my_calloc_count; + realloc_count += my_realloc_count; + free_count += my_free_count; + threads_done ++; + pthread_mutex_unlock (&stat_mutex); + pthread_mutex_lock(&stop_mutex); + pthread_mutex_unlock(&stop_mutex); + return NULL; + + case C_MEMALIGN: + p2 = get_int (io); + sz2 = get_int (io); + sz = get_int (io); + dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz); + /* we can't force memalign to return NULL (fail), so just skip it. */ + if (p2 == 0) + break; + if (p2 > n_ptrs) + myabort(); + stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + { + if (!quick_run) + free ((void *)ptrs[p2]); + atomic_rss (-sizes[p2]); + } + if (!quick_run) + ptrs[p2] = memalign (sz2, sz); + else + ptrs[p2] = (void *)p2; + /* Verify the alignment matches what is expected. */ + if (((size_t)ptrs[p2] & (sz2 - 1)) != 0) + myabort (); + sizes[p2] = sz; + mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz); + Q2; + etime = rdtsc_e(); + if (ptrs[p2] != NULL) + atomic_rss (sz); + if (etime < stime) + { + printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime)); + } + my_malloc_time += etime - stime; + my_malloc_count ++; + if (!quick_run) + wmem(ptrs[p2], sz); + break; + + case C_MALLOC: + p2 = get_int (io); + sz = get_int (io); + dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz); + /* we can't force malloc to return NULL (fail), so just skip it. 
*/ + if (p2 == 0) + break; + if (p2 > n_ptrs) + myabort(); + stime = rdtsc_s(); + Q1; + if (ptrs[p2]) + { + if (!quick_run) + free ((void *)ptrs[p2]); + atomic_rss (-sizes[p2]); + } + if (!quick_run) + ptrs[p2] = malloc (sz); + else + ptrs[p2] = (void *)p2; + sizes[p2] = sz; + mprintf("%p = malloc(%lx)\n", ptrs[p2], sz); + Q2; + etime = rdtsc_e(); + if (ptrs[p2] != NULL) + atomic_rss (sz); + if (etime < stime) + { + printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime)); + } + my_malloc_time += etime - stime; + my_malloc_count ++; + if (!quick_run) + wmem(ptrs[p2], sz); + break; + + case C_CALLOC: + p2 = get_int (io); + sz = get_int (io); + dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz); + /* we can't force calloc to return NULL (fail), so just skip it. */ + if (p2 == 0) + break; + if (p2 > n_ptrs) + myabort(); + if (ptrs[p2]) + { + if (!quick_run) + free ((void *)ptrs[p2]); + atomic_rss (-sizes[p2]); + } + stime = rdtsc_s(); + Q1; + if (!quick_run) + ptrs[p2] = calloc (sz, 1); + else + ptrs[p2] = (void *)p2; + sizes[p2] = sz; + mprintf("%p = calloc(%lx)\n", ptrs[p2], sz); + Q2; + if (ptrs[p2]) + atomic_rss (sz); + my_calloc_time += rdtsc_e() - stime; + my_calloc_count ++; + if (!quick_run) + wmem(ptrs[p2], sz); + break; + + case C_REALLOC: + p2 = get_int (io); + p1 = get_int (io); + sz = get_int (io); + dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz); + if (p1 > n_ptrs) + myabort(); + if (p2 > n_ptrs) + myabort(); + /* we can't force realloc to return NULL (fail), so just skip it. 
*/ + if (p2 == 0) + break; + + if (ptrs[p1]) + atomic_rss (-sizes[p1]); + if (!quick_run) + free_wipe(p1); + stime = rdtsc_s(); + Q1; +#ifdef MDEBUG + tmp = ptrs[p1]; +#endif + if (!quick_run) + ptrs[p2] = realloc ((void *)ptrs[p1], sz); + else + ptrs[p2] = (void *)p2; + sizes[p2] = sz; + mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz); + Q2; + my_realloc_time += rdtsc_e() - stime; + my_realloc_count ++; + if (!quick_run) + wmem(ptrs[p2], sz); + if (p1 != p2) + ptrs[p1] = 0; + if (ptrs[p2]) + atomic_rss (sizes[p2]); + break; + + case C_FREE: + p1 = get_int (io); + if (p1 > n_ptrs) + myabort(); + dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1); + if (!quick_run) + free_wipe (p1); + if (ptrs[p1]) + atomic_rss (-sizes[p1]); + stime = rdtsc_s(); + Q1; + mprintf("free(%p)\n", ptrs[p1]); + if (!quick_run) + free ((void *)ptrs[p1]); + Q2; + my_free_time += rdtsc_e() - stime; + my_free_count ++; + ptrs[p1] = 0; + break; + + case C_SYNC_W: + p1 = get_int(io); + dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1); + if (p1 > n_syncs) + myabort(); + pthread_mutex_lock (&mutexes[p1]); + syncs[p1] = 1; + pthread_cond_signal (&conds[p1]); + __sync_synchronize (); + pthread_mutex_unlock (&mutexes[p1]); + break; + + case C_SYNC_R: + p1 = get_int(io); + dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1); + if (p1 > n_syncs) + myabort(); + pthread_mutex_lock (&mutexes[p1]); + while (syncs[p1] != 1) + { + pthread_cond_wait (&conds[p1], &mutexes[p1]); + __sync_synchronize (); + } + pthread_mutex_unlock (&mutexes[p1]); + break; + + default: + printf("op %d - unsupported, thread %d addr %lu\n", + this_op, thread_idx, (long unsigned int)io_pos (io)); + myabort(); + } + } +} + +static void *alloc_mem (size_t amt) +{ + void *rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + mlock (rv, amt); + memset (rv, 0, amt); + return rv; +} + +static pthread_t *thread_ids; + +void * +my_malloc (const char *msg, int size, 
IOPerThreadType *io, size_t *psz, size_t count) +{ + void *rv; + if (psz) + count = *psz = get_int (io); + dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count); + rv = alloc_mem(size * count); + if (!rv) + { + fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)*psz); + exit(1); + } + mlock (rv, size * count); + return rv; +} + +static const char * const scan_names[] = { + "UNUSED", + "ARENA", + "HEAP", + "CHUNK_USED", + "CHUNK_FREE", + "FASTBIN_FREE", + "UNSORTED", + "TOP", + "TCACHE", + "USED" +}; + +void +malloc_scan_callback (void *ptr, size_t length, int type) +{ + printf("%s: ptr %p length %llx\n", scan_names[type], ptr, (long long)length); +} + +#define MY_ALLOC(T, psz) \ + (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0) +#define MY_ALLOCN(T, count) \ + (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count) + +int +main(int argc, char **argv) +{ + ticks_t start=0; + ticks_t end; + ticks_t usec; + struct timeval tv_s, tv_e; + int thread_idx = 0; + int i; + size_t n_threads = 0; + size_t idx; + struct rusage res_start, res_end; + int done; + size_t guessed_io_size = 4096; + struct stat statb; + + if (argc < 2) + { + fprintf(stderr, "Usage: %s \n", argv[0]); + exit(1); + } + io_fd = open(argv[1], O_RDONLY); + if (io_fd < 0) + { + fprintf(stderr, "Unable to open %s for reading\n", argv[1]); + perror("The error was"); + exit(1); + } + fstat (io_fd, &statb); + + io_init (&main_io, 0, IOMIN); + + pthread_mutex_lock(&stop_mutex); + + done = 0; + while (!done) + { + switch (io_read (&main_io)) + { + case C_NOP: + break; + case C_ALLOC_PTRS: + ptrs = MY_ALLOC (ptrs, &n_ptrs); + sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs); + ptrs[0] = 0; + break; + case C_ALLOC_SYNCS: + n_syncs = get_int(&main_io); + syncs = MY_ALLOCN (syncs, n_syncs); + conds = MY_ALLOCN (conds, n_syncs); + mutexes = MY_ALLOCN (mutexes, n_syncs); + for (idx=0; idx %s)\n", + comma(res_end.ru_maxrss - res_start.ru_maxrss), + 
comma(res_start.ru_maxrss), comma(res_end.ru_maxrss)); + } + printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024)); + + if (malloc_count == 0) malloc_count ++; + if (calloc_count == 0) calloc_count ++; + if (realloc_count == 0) realloc_count ++; + if (free_count == 0) free_count ++; + + if (!quick_run) + { + printf("\n"); + printf("sizeof ticks_t is %lu\n", sizeof(ticks_t)); + printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count)); + printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count)); + printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count)); + printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count)); + printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time)); + printf("\n"); + } + +#if 0 + /* Free any still-held chunks of memory. */ + for (idx=0; idx +#include + +#include "fred.h" +#include "cpuinfo.h" +#include "timer.h" + +// This class just holds arguments to each thread. +class workerArg { +public: + + workerArg() {} + + workerArg (char * obj, int objSize, int repetitions, int iterations) + : _object (obj), + _objSize (objSize), + _iterations (iterations), + _repetitions (repetitions) + {} + + char * _object; + int _objSize; + int _iterations; + int _repetitions; +}; + + +#if defined(_WIN32) +extern "C" void worker (void * arg) +#else +extern "C" void * worker (void * arg) +#endif +{ + // free the object we were given. + // Then, repeatedly do the following: + // malloc a given-sized object, + // repeatedly write on it, + // then free it. + workerArg * w = (workerArg *) arg; + delete w->_object; + workerArg w1 = *w; + for (int i = 0; i < w1._iterations; i++) { + // Allocate the object. + char * obj = new char[w1._objSize]; + // Write into it a bunch of times. 
+ for (int j = 0; j < w1._repetitions; j++) { + for (int k = 0; k < w1._objSize; k++) { + obj[k] = (char) k; + volatile char ch = obj[k]; + ch++; + } + } + // Free the object. + delete [] obj; + } + +#if !defined(_WIN32) + return NULL; +#endif +} + + +int main (int argc, char * argv[]) +{ + int nthreads; + int iterations; + int objSize; + int repetitions; + + if (argc > 4) { + nthreads = atoi(argv[1]); + iterations = atoi(argv[2]); + objSize = atoi(argv[3]); + repetitions = atoi(argv[4]); + } else { + fprintf (stderr, "Usage: %s nthreads iterations objSize repetitions\n", argv[0]); + return 1; + } + + HL::Fred * threads = new HL::Fred[nthreads]; + HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors()); + + workerArg * w = new workerArg[nthreads]; + + int i; + + // Allocate nthreads objects and distribute them among the threads. + char ** objs = new char * [nthreads]; + for (i = 0; i < nthreads; i++) { + objs[i] = new char[objSize]; + } + + HL::Timer t; + t.start(); + + for (i = 0; i < nthreads; i++) { + w[i] = workerArg (objs[i], objSize, repetitions / nthreads, iterations); + threads[i].create (&worker, (void *) &w[i]); + } + for (i = 0; i < nthreads; i++) { + threads[i].join(); + } + t.stop(); + + delete [] threads; + delete [] objs; + delete [] w; + + printf ("Time elapsed = %f seconds.\n", (double) t); + return 0; +} diff --git a/src/benchmarks/falsesharing/cache-thrash.cc b/src/benchmarks/falsesharing/cache-thrash.cc new file mode 100644 index 0000000..79242eb --- /dev/null +++ b/src/benchmarks/falsesharing/cache-thrash.cc @@ -0,0 +1,134 @@ +///-*-C++-*-////////////////////////////////////////////////////////////////// +// +// Hoard: A Fast, Scalable, and Memory-Efficient Allocator +// for Shared-Memory Multiprocessors +// Contact author: Emery Berger, http://www.cs.umass.edu/~emery +// +// Copyright (c) 1998-2003, The University of Texas at Austin. 
+// +// This library is free software; you can redistribute it and/or modify +// it under the terms of the GNU Library General Public License as +// published by the Free Software Foundation, http://www.fsf.org. +// +// This library is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Library General Public License for more details. +// +////////////////////////////////////////////////////////////////////////////// + +/** + * @file cache-thrash.cpp + * @brief cache-thrash is a benchmark that exercises a heap's cache-locality. + * + * Try the following (on a P-processor machine): + * + * cache-thrash 1 1000 1 1000000 + * cache-thrash P 1000 1 1000000 + * + * cache-thrash-hoard 1 1000 1 1000000 + * cache-thrash-hoard P 1000 1 1000000 + * + * The ideal is a P-fold speedup. +*/ + + +#include +#include + +using namespace std; + +#include "cpuinfo.h" +#include "fred.h" +#include "timer.h" + +// This class just holds arguments to each thread. +class workerArg { +public: + workerArg() {} + workerArg (int objSize, int repetitions, int iterations) + : _objSize (objSize), + _iterations (iterations), + _repetitions (repetitions) + {} + + int _objSize; + int _iterations; + int _repetitions; +}; + + +#if defined(_WIN32) +extern "C" void worker (void * arg) +#else +extern "C" void * worker (void * arg) +#endif +{ + // Repeatedly do the following: + // malloc a given-sized object, + // repeatedly write on it, + // then free it. + workerArg * w = (workerArg *) arg; + workerArg w1 = *w; + for (int i = 0; i < w1._iterations; i++) { + // Allocate the object. + char * obj = new char[w1._objSize]; + // printf ("obj = %p\n", obj); + // Write into it a bunch of times. + for (int j = 0; j < w1._repetitions; j++) { + for (int k = 0; k < w1._objSize; k++) { + obj[k] = (char) k; + volatile char ch = obj[k]; + ch++; + } + } + // Free the object. 
+ delete [] obj; + } +#if !defined(_WIN32) + return NULL; +#endif +} + + +int main (int argc, char * argv[]) +{ + int nthreads; + int iterations; + int objSize; + int repetitions; + + if (argc > 4) { + nthreads = atoi(argv[1]); + iterations = atoi(argv[2]); + objSize = atoi(argv[3]); + repetitions = atoi(argv[4]); + } else { + cerr << "Usage: " << argv[0] << " nthreads iterations objSize repetitions" << endl; + exit(1); + } + + HL::Fred * threads = new HL::Fred[nthreads]; + HL::Fred::setConcurrency (HL::CPUInfo::getNumProcessors()); + + int i; + + HL::Timer t; + t.start(); + + workerArg * w = new workerArg[nthreads]; + + for (i = 0; i < nthreads; i++) { + w[i] = workerArg (objSize, repetitions / nthreads, iterations); + threads[i].create (&worker, (void *) &w[i]); + } + for (i = 0; i < nthreads; i++) { + threads[i].join(); + } + t.stop(); + + delete [] threads; + delete [] w; + + cout << "Time elapsed = " << (double) t << " seconds." << endl; +} diff --git a/src/benchmarks/falsesharing/cpuinfo.h b/src/benchmarks/falsesharing/cpuinfo.h new file mode 100644 index 0000000..1ed1f36 --- /dev/null +++ b/src/benchmarks/falsesharing/cpuinfo.h @@ -0,0 +1,202 @@ +// -*- C++ -*- + +/* + + Heap Layers: An Extensible Memory Allocation Infrastructure + + Copyright (C) 2000-2003 by Emery Berger + http://www.cs.umass.edu/~emery + emery@cs.umass.edu + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + + + +#ifndef HL_CPUINFO_H +#define HL_CPUINFO_H + +#if defined(_WIN32) +#include +#include +#else +#include +#endif + + +#if !defined(_WIN32) +#include +#endif + +#if defined(__SVR4) // Solaris +#include +extern "C" unsigned int lwp_self(void); +#include +extern "C" int _thr_self(void); +#endif + +#if defined(__linux) +#include +#include +#include +#include +#include +#endif + +#if defined(__APPLE__) +#include +#include +#endif + +#if defined(__sgi) +#include +#include +#include +#endif + +#if defined(hpux) +#include +#endif + +#if defined(_WIN32) +extern __declspec(thread) int localThreadId; +#endif + +#if defined(__SVR4) && defined(MAP_ALIGN) +extern volatile int anyThreadStackCreated; +#endif + +namespace HL { + +/** + * @class CPUInfo + * @author Emery Berger + * + * @brief Architecture-independent wrapper to get number of CPUs. + */ + +class CPUInfo { +public: + CPUInfo (void) + {} + + inline static int getNumProcessors (void) { + static int _numProcessors = computeNumProcessors(); + return _numProcessors; + } + + static inline unsigned long getThreadId (void); + inline static int computeNumProcessors (void); + +}; + + +int CPUInfo::computeNumProcessors (void) +{ + static int np = 0; + if (!np) { +#if defined(__linux) || defined(__APPLE__) + np = (int) sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_WIN32) + SYSTEM_INFO infoReturn[1]; + GetSystemInfo (infoReturn); + np = (int) (infoReturn->dwNumberOfProcessors); +#elif defined(__sgi) + np = (int) sysmp(MP_NAPROCS); +#elif defined(hpux) + np = mpctl(MPC_GETNUMSPUS, NULL, NULL); // or pthread_num_processors_np()? +#elif defined(_SC_NPROCESSORS_ONLN) + np = (int) (sysconf(_SC_NPROCESSORS_ONLN)); +#else + np = 2; + // Unsupported platform. + // Pretend we have at least two processors. 
This approach avoids the risk of assuming + // we're on a uniprocessor, which might lead clever allocators to avoid using atomic + // operations for all locks. +#endif + return np; + } else { + return np; + } +} + + // Note: when stacksize arg is NULL for pthread_attr_setstacksize [Solaris], +// stack size is 1 MB for 32-bit arch, 2 MB for 64-bit arch. +// pthread_attr_getstacksize +// pthread_attr_setstackaddr +// pthread_attr_getstackaddr +// PTHREAD_STACK_SIZE is minimum. +// or should we just assume we have __declspec(thread) or __thread? + +#if defined(USE_THREAD_KEYWORD) + extern __thread int localThreadId; +#endif + + // FIX ME FIXME + //#include + +unsigned long CPUInfo::getThreadId (void) { +#if defined(__SVR4) + size_t THREAD_STACK_SIZE; + if (sizeof(size_t) <= 4) { + THREAD_STACK_SIZE = 1048576; + } else { + // 64-bits. + THREAD_STACK_SIZE = 1048576 * 2; + } + if (0) { // !anyThreadStackCreated) { + // We know a priori that all stack variables + // are on different stacks. Since no one has created + // a special one, we are in control, and thus all stacks + // are 1 MB in size and on 1 MB boundaries. + // (Actually: 1 MB for 32-bits, 2 MB for 64-bits.) + char buf; + return (((size_t) &buf) & ~(THREAD_STACK_SIZE-1)) >> 20; + } else { + return (int) pthread_self(); + } +#elif defined(_WIN32) + // It looks like thread id's are always multiples of 4, so... + return GetCurrentThreadId() >> 2; +#elif defined(__APPLE__) + // Consecutive thread id's in Mac OS are 4096 apart; + // dividing off the 4096 gives us an appropriate thread id. + int tid = (int) ((unsigned long) pthread_self()) >> 12; + return tid; +#elif defined(__BEOS__) + return find_thread(0); +#elif defined(USE_THREAD_KEYWORD) + return localThreadId; +#elif defined(__linux) || defined(PTHREAD_KEYS_MAX) + // Consecutive thread id's in Linux are 1024 apart; + // dividing off the 1024 gives us an appropriate thread id. 
+ return (unsigned long) pthread_self() >> 10; +#elif defined(POSIX) + return (unsigned long) pthread_self(); +#elif USE_SPROC + // This hairiness has the same effect as calling getpid(), + // but it's MUCH faster since it avoids making a system call + // and just accesses the sproc-local data directly. + unsigned long pid = (unsigned long) PRDA->sys_prda.prda_sys.t_pid; + return pid; +#else + return 0; +#endif +} + +} + +#endif diff --git a/src/benchmarks/falsesharing/fred.h b/src/benchmarks/falsesharing/fred.h new file mode 100644 index 0000000..b0198a7 --- /dev/null +++ b/src/benchmarks/falsesharing/fred.h @@ -0,0 +1,97 @@ +// -*- C++ -*- + +#ifndef HL_FRED_H +#define HL_FRED_H + +/// A thread-wrapper of childlike simplicity :). + +#if defined(_WIN32) + + #include + #include + +#elif defined(__SVR4) + + #include + #include + #include + +#else + + #include + #include + +#endif + +typedef void * (*ThreadFunctionType) (void *); + +namespace HL { + +class Fred { +public: + + Fred() { +#if !defined(_WIN32) + pthread_attr_init (&attr); + pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM); +#endif + } + + ~Fred() { +#if !defined(_WIN32) + pthread_attr_destroy (&attr); +#endif + } + + void create (ThreadFunctionType function, void * arg) { +#if defined(_WIN32) + t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0); +#else + pthread_create (&t, &attr, function, arg); +#endif + } + + void join (void) { +#if defined(_WIN32) + WaitForSingleObject (t, INFINITE); +#else + pthread_join (t, NULL); +#endif + } + + static void yield (void) { +#if defined(_WIN32) + Sleep (0); +#elif defined(__SVR4) + thr_yield(); +#else + sched_yield(); +#endif + } + + + static void setConcurrency (int n) { +#if defined(_WIN32) +#elif defined(__SVR4) + thr_setconcurrency (n); +#else + pthread_setconcurrency (n); +#endif + } + + +private: +#if defined(_WIN32) + typedef HANDLE FredType; +#else + typedef pthread_t FredType; + pthread_attr_t attr; +#endif + + FredType t; 
+}; + +} + + +#endif diff --git a/src/benchmarks/falsesharing/timer.h b/src/benchmarks/falsesharing/timer.h new file mode 100644 index 0000000..d4d42c7 --- /dev/null +++ b/src/benchmarks/falsesharing/timer.h @@ -0,0 +1,372 @@ +/* -*- C++ -*- */ + +/* + + Heap Layers: An Extensible Memory Allocation Infrastructure + + Copyright (C) 2000-2003 by Emery Berger + http://www.cs.umass.edu/~emery + emery@cs.umass.edu + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +*/ + +#include +#include + + +#ifndef _TIMER_H_ +#define _TIMER_H_ + +/** + * @class Timer + * @brief A portable class for high-resolution timing. + * + * This class simplifies timing measurements across a number of platforms. + * + * @code + * Timer t; + * t.start(); + * // do some work + * t.stop(); + * cout << "That took " << (double) t << " seconds." 
<< endl; + * @endcode + * + */ + +#ifdef __APPLE__ +#include +#endif + +#if defined(__linux__) && defined(__GNUG__) && defined(__i386__) + +#include +#include +#include +#include +#include +#include + +static void getTime (unsigned long& tlo, unsigned long& thi) { + asm volatile ("rdtsc" + : "=a"(tlo), + "=d" (thi)); +} + + +static double getFrequency (void) { + static double freq = 0.0; + static bool initialized = false; + unsigned long LTime0, LTime1, HTime0, HTime1; + if (!initialized) { + + freq = 2600000.0; + +#if 0 + // Compute MHz directly. + // Wait for approximately one second. + + getTime (LTime0, HTime0); + // printf ("waiting...\n"); + struct timespec rqtp, rmtp; + rqtp.tv_sec = 1; + rqtp.tv_nsec = 0; + nanosleep (&rqtp, &rmtp); + // printf ("done.\n"); + getTime (LTime1, HTime1); + + freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0); + if (LTime1 < LTime0) { + freq -= (double)UINT_MAX; + } +#endif + initialized = true; + + } else { + // printf ("wha?\n"); + } + return freq; +} + + +namespace HL { + +class Timer { +public: + Timer (void) + : timeElapsed (0.0) + { + _frequency = getFrequency(); + // printf ("wooo!\n"); + // printf ("freq = %lf\n", frequency); + } + void start (void) { + getTime (currentLo, currentHi); + } + void stop (void) { + unsigned long lo, hi; + getTime (lo, hi); + double now = (double) hi * 4294967296.0 + lo; + double prev = (double) currentHi * 4294967296.0 + currentLo; + timeElapsed = (now - prev) / _frequency; + } + + operator double (void) { + return timeElapsed; + } + +private: + double timeElapsed; + unsigned long currentLo, currentHi; + double _frequency; +}; + +}; + +#else + + +#ifdef __SVR4 // Solaris +#include +#include +#include +#include +#include +#endif // __SVR4 + +#include + +#if defined(unix) || defined(__linux) +#include +#include +#endif + + +#ifdef __sgi +#include +#include +#include +#endif + + +#if defined(_WIN32) +#include +#endif + + +#if defined(__BEOS__) +#include +#endif + + 
+namespace HL { + +class Timer { + +public: + + /// Initializes the timer. + Timer (void) +#if !defined(_WIN32) + : _starttime (0), + _elapsedtime (0) +#endif + { + } + + /// Start the timer. + void start (void) { _starttime = _time(); } + + /// Stop the timer. + void stop (void) { _elapsedtime += _time() - _starttime; } + + /// Reset the timer. + void reset (void) { _starttime = _elapsedtime; } + +#if 0 + // Set the timer. + void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);} +#endif + + /// Return the number of seconds elapsed. + operator double (void) { return _timetosec (_elapsedtime); } + + static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); } + + +private: + + // The _timer variable will be different depending on the OS. + // We try to use the best timer available. + +#ifdef __sgi +#define TIMER_FOUND + + long _starttime, _elapsedtime; + + long _time (void) { + struct tms t; + long ticks = times (&t); + return ticks; + } + + static double _timetosec (long t) { + return ((double) (t) / CLK_TCK); + } + + static long _sectotime (double sec) { + return (long) sec * CLK_TCK; + } +#endif + +#ifdef __SVR4 // Solaris +#define TIMER_FOUND + typedef hrtime_t TimeType; + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + return gethrtime(); + } + + static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); } + + static double _timetosec (TimeType& t) { + return ((double) (t) / 1.0e9); + } +#endif // __SVR4 + +#if defined(MAC) || defined(macintosh) +#define TIMER_FOUND + double _starttime, _elapsedtime; + + double _time (void) { + return get_Mac_microseconds(); + } + + double _timetosec (hrtime_t& t) { + return t; + } +#endif // MAC + +#ifdef _WIN32 +#define TIMER_FOUND + +#ifndef __GNUC__ + class TimeType { + public: + TimeType (void) + { + largeInt.QuadPart = 0; + } + operator double& (void) { return (double&) largeInt.QuadPart; } + operator LARGE_INTEGER& (void) { return 
largeInt; } + double timeToSec (void) { + return (double) largeInt.QuadPart / getFreq(); + } + private: + double getFreq (void) { + QueryPerformanceFrequency (&freq); + return (double) freq.QuadPart; + } + + LARGE_INTEGER largeInt; + LARGE_INTEGER freq; + }; + + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + TimeType t; + int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t)); + assert (r); + return t; + } + + static double _timetosec (TimeType& t) { + return t.timeToSec(); + } +#else + typedef DWORD TimeType; + DWORD _starttime, _elapsedtime; + static DWORD _time (void) { + return GetTickCount(); + } + + static double _timetosec (DWORD& t) { + return (double) t / 100000.0; + } + static unsigned long _sectotime (double sec) { + return (unsigned long)(sec); + } +#endif +#endif // _WIN32 + + +#ifdef __BEOS__ +#define TIMER_FOUND + bigtime_t _starttime, _elapsedtime; + bigtime_t _time(void) { + return system_time(); + } + double _timetosec (bigtime_t& t) { + return (double) t / 1000000.0; + } + + bigtime_t _sectotime (double sec) { + return (bigtime_t)(sec * 1000000.0); + } +#endif // __BEOS__ + +#ifndef TIMER_FOUND + + typedef long TimeType; + TimeType _starttime, _elapsedtime; + + static TimeType _time (void) { + struct timeval t; + gettimeofday (&t, NULL); + return t.tv_sec * 1000000 + t.tv_usec; + } + + static double _timetosec (TimeType t) { + return ((double) (t) / 1000000.0); + } + + static TimeType _sectotime (double sec) { + return (TimeType) (sec * 1000000.0); + } + +#endif // TIMER_FOUND + +#undef TIMER_FOUND + +}; + + +#ifdef __SVR4 // Solaris +class VirtualTimer : public Timer { +public: + hrtime_t _time (void) { + return gethrvtime(); + } +}; +#endif + +} + +#endif + +#endif diff --git a/src/benchmarks/fred.h b/src/benchmarks/fred.h deleted file mode 100644 index b0198a7..0000000 --- a/src/benchmarks/fred.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- C++ -*- - -#ifndef HL_FRED_H -#define HL_FRED_H - -/// A thread-wrapper of childlike 
simplicity :). - -#if defined(_WIN32) - - #include - #include - -#elif defined(__SVR4) - - #include - #include - #include - -#else - - #include - #include - -#endif - -typedef void * (*ThreadFunctionType) (void *); - -namespace HL { - -class Fred { -public: - - Fred() { -#if !defined(_WIN32) - pthread_attr_init (&attr); - pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM); -#endif - } - - ~Fred() { -#if !defined(_WIN32) - pthread_attr_destroy (&attr); -#endif - } - - void create (ThreadFunctionType function, void * arg) { -#if defined(_WIN32) - t = CreateThread (0, 0, (LPTHREAD_START_ROUTINE) *function, (LPVOID) arg, 0, 0); -#else - pthread_create (&t, &attr, function, arg); -#endif - } - - void join (void) { -#if defined(_WIN32) - WaitForSingleObject (t, INFINITE); -#else - pthread_join (t, NULL); -#endif - } - - static void yield (void) { -#if defined(_WIN32) - Sleep (0); -#elif defined(__SVR4) - thr_yield(); -#else - sched_yield(); -#endif - } - - - static void setConcurrency (int n) { -#if defined(_WIN32) -#elif defined(__SVR4) - thr_setconcurrency (n); -#else - pthread_setconcurrency (n); -#endif - } - - -private: -#if defined(_WIN32) - typedef HANDLE FredType; -#else - typedef pthread_t FredType; - pthread_attr_t attr; -#endif - - FredType t; -}; - -} - - -#endif diff --git a/src/benchmarks/larson.cc b/src/benchmarks/larson.cc deleted file mode 100644 index be8038f..0000000 --- a/src/benchmarks/larson.cc +++ /dev/null @@ -1,744 +0,0 @@ -#include -#include - -#if defined(_WIN32) -#define __WIN32__ -#endif - -#ifdef __WIN32__ -#include -#include -#include - -#else -#include -#include -#include - -#ifndef __SVR4 -//extern "C" int pthread_setconcurrency (int) throw(); -#include -#endif - - -typedef void * LPVOID; -typedef long long LONGLONG; -typedef long DWORD; -typedef long LONG; -typedef unsigned long ULONG; -typedef union _LARGE_INTEGER { - struct { - DWORD LowPart; - LONG HighPart; - } foo; - LONGLONG QuadPart; // In Visual C++, a typedef to _ _int64} 
LARGE_INTEGER; -} LARGE_INTEGER; -typedef long long _int64; -#ifndef TRUE -enum { TRUE = 1, FALSE = 0 }; -#endif -#include -#define _ASSERTE(x) assert(x) -#define _inline inline -void Sleep (long x) -{ - // printf ("sleeping for %ld seconds.\n", x/1000); - sleep(x/1000); -} - -void QueryPerformanceCounter (long * x) -{ - struct timezone tz; - struct timeval tv; - gettimeofday (&tv, &tz); - *x = tv.tv_sec * 1000000L + tv.tv_usec; -} - -void QueryPerformanceFrequency(long * x) -{ - *x = 1000000L; -} - - -#include -#include -#include -#include -#include -#include -#include - -#define _REENTRANT 1 -#include -#ifdef __sun -#include -#endif -typedef void * VoidFunction (void *); -void _beginthread (VoidFunction x, int, void * z) -{ - pthread_t pt; - pthread_attr_t pa; - pthread_attr_init (&pa); - -#if 1//defined(__SVR4) - pthread_attr_setscope (&pa, PTHREAD_SCOPE_SYSTEM); /* bound behavior */ -#endif - - // printf ("creating a thread.\n"); - int v = pthread_create(&pt, &pa, x, z); - // printf ("v = %d\n", v); -} -#endif - - -#if 0 -static char buf[65536]; - -#define malloc(v) &buf -#define free(p) -#endif - -#undef CPP -//#define CPP -//#include "arch-specific.h" - -#if USE_ROCKALL -//#include "FastHeap.hpp" -//FAST_HEAP theFastHeap (1024 * 1024, true, true, true); - -typedef int SBIT32; - -#include "SmpHeap.hpp" -SMP_HEAP theFastHeap (1024 * 1024, true, true, true); - -void * operator new( unsigned int cb ) -{ - void *pRet = theFastHeap.New ((size_t)cb) ; - return pRet; -} - -void operator delete(void *pUserData ) -{ - theFastHeap.Delete (pUserData) ; -} -#endif - -#if 0 -extern "C" void * hdmalloc (size_t sz) ; -extern "C" void hdfree (void * ptr) ; -extern "C" void hdmalloc_stats (void) ; -void * operator new( unsigned int cb ) -{ - void *pRet = hdmalloc((size_t)cb) ; - return pRet; -} - -void operator delete(void *pUserData ) -{ - hdfree(pUserData) ; -} -#endif - - - -/* Test driver for memory allocators */ -/* Author: Paul Larson, palarson@microsoft.com */ -#define 
MAX_THREADS 100 -#define MAX_BLOCKS 20000000 - -int volatile stopflag=FALSE ; - -struct lran2_st { - long x, y, v[97]; -}; - -int TotalAllocs=0 ; - -typedef struct thr_data { - - int threadno ; - int NumBlocks ; - int seed ; - - int min_size ; - int max_size ; - - char * *array ; - int *blksize ; - int asize ; - - unsigned long cAllocs ; - unsigned long cFrees ; - int cThreads ; - unsigned long cBytesAlloced ; - - volatile int finished ; - struct lran2_st rgen ; - -} thread_data; - -void runthreads(long sleep_cnt, int min_threads, int max_threads, - int chperthread, int num_rounds) ; -void runloops(long sleep_cnt, int num_chunks ) ; -static void warmup(char **blkp, int num_chunks ); -static void * exercise_heap( void *pinput) ; -static void lran2_init(struct lran2_st* d, long seed) ; -static long lran2(struct lran2_st* d) ; -ULONG CountReservedSpace() ; - -char ** blkp = new char *[MAX_BLOCKS] ; -int * blksize = new int[MAX_BLOCKS] ; -long seqlock=0 ; -struct lran2_st rgen ; -int min_size=10, max_size=500 ; -int num_threads ; -ULONG init_space ; - -extern int cLockSleeps ; -extern int cAllocedChunks ; -extern int cAllocedSpace ; -extern int cUsedSpace ; -extern int cFreeChunks ; -extern int cFreeSpace ; - -int cChecked=0 ; - -#if defined(_WIN32) -extern "C" { - extern HANDLE crtheap; -}; -#endif - -int main (int argc, char *argv[]) -{ -#if defined(USE_LFH) && defined(_WIN32) - // Activate 'Low Fragmentation Heap'. 
- ULONG info = 2; - HeapSetInformation (GetProcessHeap(), - HeapCompatibilityInformation, - &info, - sizeof(info)); -#endif -#if 0 // defined(__SVR4) - { - psinfo_t ps; - int pid = getpid(); - char fname[255]; - sprintf (fname, "/proc/%d/psinfo", pid); - // sprintf (fname, "/proc/self/ps"); - FILE * f = fopen (fname, "rb"); - printf ("opening %s\n", fname); - if (f) { - fread (&ps, sizeof(ps), 1, f); - printf ("resident set size = %dK\n", ps.pr_rssize); - fclose (f); - } - } -#endif - -#if defined(_MT) || defined(_REENTRANT) - int min_threads, max_threads ; - int num_rounds ; - int chperthread ; -#endif - unsigned seed=12345 ; - int num_chunks=10000; - long sleep_cnt; - - if (argc > 7) { - sleep_cnt = atoi(argv[1]); - min_size = atoi(argv[2]); - max_size = atoi(argv[3]); - chperthread = atoi(argv[4]); - num_rounds = atoi(argv[5]); - seed = atoi(argv[6]); - max_threads = atoi(argv[7]); - min_threads = max_threads; - printf ("sleep = %ld, min = %d, max = %d, per thread = %d, num rounds = %d, seed = %d, max_threads = %d, min_threads = %d\n", - sleep_cnt, min_size, max_size, chperthread, num_rounds, seed, max_threads, min_threads); - goto DoneWithInput; - } - -#if defined(_MT) || defined(_REENTRANT) - //#ifdef _MT - printf( "\nMulti-threaded test driver \n") ; -#else - printf( "\nSingle-threaded test driver \n") ; -#endif -#ifdef CPP - printf("C++ version (new and delete)\n") ; -#else - printf("C version (malloc and free)\n") ; -#endif - printf("runtime (sec): ") ; - scanf ("%ld", &sleep_cnt); - - printf("chunk size (min,max): ") ; - scanf("%d %d", &min_size, &max_size ) ; -#if defined(_MT) || defined(_REENTRANT) - //#ifdef _MT - printf("threads (min, max): ") ; - scanf("%d %d", &min_threads, &max_threads) ; - printf("chunks/thread: ") ; scanf("%d", &chperthread ) ; - printf("no of rounds: ") ; scanf("%d", &num_rounds ) ; - num_chunks = max_threads*chperthread ; -#else - printf("no of chunks: ") ; scanf("%d", &num_chunks ) ; -#endif - printf("random seed: ") ; 
scanf("%d", &seed) ; - - DoneWithInput: - - if( num_chunks > MAX_BLOCKS ){ - printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ; - return(1) ; - } - -#ifndef __WIN32__ -#ifdef __SVR4 - pthread_setconcurrency (max_threads); -#endif -#endif - - lran2_init(&rgen, seed) ; - // init_space = CountReservedSpace() ; - -#if defined(_MT) || defined(_REENTRANT) - //#ifdef _MT - runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ; -#else - runloops(sleep_cnt, num_chunks ) ; -#endif - -#ifdef _DEBUG - _cputs("Hit any key to exit...") ; (void)_getch() ; -#endif - - return 0; - -} /* main */ - -void runloops(long sleep_cnt, int num_chunks ) -{ - int cblks ; - int victim ; - int blk_size ; -#ifdef __WIN32__ - _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt; -#else - long ticks_per_sec ; - long start_cnt, end_cnt ; -#endif - _int64 ticks ; - double duration ; - double reqd_space ; - ULONG used_space ; - int sum_allocs=0 ; - - QueryPerformanceFrequency( &ticks_per_sec ) ; - QueryPerformanceCounter( &start_cnt) ; - - for( cblks=0; cblks= sleep_cnt) break ; - } - reqd_space = (0.5*(min_size+max_size)*num_chunks) ; - // used_space = CountReservedSpace() - init_space; - - printf("%6.3f", duration ) ; - printf("%8.0f", sum_allocs/duration ) ; - printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ; - printf("\n") ; - -} - - -#if defined(_MT) || defined(_REENTRANT) -//#ifdef _MT -void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds) -{ - thread_data *de_area = new thread_data[max_threads] ; - thread_data *pdea; - int nperthread ; - int sum_threads ; - unsigned long sum_allocs ; - unsigned long sum_frees ; - double duration ; -#ifdef __WIN32__ - _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt; -#else - long ticks_per_sec ; - long start_cnt, end_cnt ; -#endif - _int64 ticks ; - double rate_1=0, rate_n ; - double reqd_space ; - ULONG used_space ; - int prevthreads ; - int i ; - - 
QueryPerformanceFrequency( &ticks_per_sec ) ; - - pdea = &de_area[0] ; - memset(&de_area[0], 0, sizeof(thread_data)) ; - - prevthreads = 0 ; - for(num_threads=min_threads; num_threads <= max_threads; num_threads++ ) - { - - warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread ); - - nperthread = chperthread ; - stopflag = FALSE ; - - for(i=0; i< num_threads; i++){ - de_area[i].threadno = i+1 ; - de_area[i].NumBlocks = num_rounds*nperthread; - de_area[i].array = &blkp[i*nperthread] ; - de_area[i].blksize = &blksize[i*nperthread] ; - de_area[i].asize = nperthread ; - de_area[i].min_size = min_size ; - de_area[i].max_size = max_size ; - de_area[i].seed = lran2(&rgen) ; ; - de_area[i].finished = 0 ; - de_area[i].cAllocs = 0 ; - de_area[i].cFrees = 0 ; - de_area[i].cThreads = 0 ; - de_area[i].finished = FALSE ; - lran2_init(&de_area[i].rgen, de_area[i].seed) ; - -#ifdef __WIN32__ - _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, &de_area[i]) ; -#else - _beginthread(exercise_heap, 0, &de_area[i]) ; -#endif - - } - - QueryPerformanceCounter( &start_cnt) ; - - // printf ("Sleeping for %ld seconds.\n", sleep_cnt); - Sleep(sleep_cnt * 1000L) ; - - stopflag = TRUE ; - - for(i=0; ifinished = FALSE ; - pdea->cThreads++ ; - range = pdea->max_size - pdea->min_size ; - - /* allocate NumBlocks chunks of random size */ - for( cblks=0; cblksNumBlocks; cblks++){ - victim = lran2(&pdea->rgen)%pdea->asize ; -#ifdef CPP - delete pdea->array[victim] ; -#else - free(pdea->array[victim]) ; -#endif - pdea->cFrees++ ; - - if (range == 0) { - blk_size = pdea->min_size; - } else { - blk_size = pdea->min_size+lran2(&pdea->rgen)%range ; - } -#ifdef CPP - pdea->array[victim] = new char[blk_size] ; -#else - pdea->array[victim] = (char *) malloc(blk_size) ; -#endif - - pdea->blksize[victim] = blk_size ; - assert(pdea->array[victim] != NULL) ; - - pdea->cAllocs++ ; - - /* Write something! 
*/ - - volatile char * chptr = ((char *) pdea->array[victim]); - *chptr++ = 'a'; - volatile char ch = *((char *) pdea->array[victim]); - *chptr = 'b'; - - - if( stopflag ) break ; - } - - // printf("Thread %u terminating: %d allocs, %d frees\n", - // pdea->threadno, pdea->cAllocs, pdea->cFrees) ; - pdea->finished = TRUE ; - - if( !stopflag ){ -#ifdef __WIN32__ - _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, pdea) ; -#else - _beginthread(exercise_heap, 0, pdea) ; -#endif - } else { - printf ("thread stopping.\n"); - } -#ifndef _WIN32 - pthread_exit (NULL); -#endif - return 0; -} - -static void warmup(char **blkp, int num_chunks ) -{ - int cblks ; - int victim ; - int blk_size ; - LPVOID tmp ; - - - for( cblks=0; cblks 0 ; cblks--){ - victim = lran2(&rgen)%cblks ; - tmp = blkp[victim] ; - blkp[victim] = blkp[cblks-1] ; - blkp[cblks-1] = (char *) tmp ; - } - - for( cblks=0; cblks<4*num_chunks; cblks++){ - victim = lran2(&rgen)%num_chunks ; -#ifdef CPP - delete blkp[victim] ; -#else - free(blkp[victim]) ; -#endif - - if (max_size == min_size) { - blk_size = min_size; - } else { - blk_size = min_size+lran2(&rgen)%(max_size - min_size) ; - } -#ifdef CPP - blkp[victim] = new char[blk_size] ; -#else - blkp[victim] = (char *) malloc(blk_size) ; -#endif - blksize[victim] = blk_size ; - assert(blkp[victim] != NULL) ; - } -} -#endif // _MT - -#ifdef __WIN32__ -ULONG CountReservedSpace() -{ - MEMORY_BASIC_INFORMATION info; - char *addr=NULL ; - ULONG size=0 ; - - while( true){ - VirtualQuery(addr, &info, sizeof(info)); - switch( info.State){ - case MEM_FREE: - case MEM_RESERVE: - break ; - case MEM_COMMIT: - size += info.RegionSize ; - break ; - } - addr += info.RegionSize ; - if( addr >= (char *)0x80000000UL ) break ; - } - - return size ; - -} -#endif - -// ======================================================= - -/* lran2.h - * by Wolfram Gloger 1996. - * - * A small, portable pseudo-random number generator. 
- */ - -#ifndef _LRAN2_H -#define _LRAN2_H - -#define LRAN2_MAX 714025l /* constants for portable */ -#define IA 1366l /* random number generator */ -#define IC 150889l /* (see e.g. `Numerical Recipes') */ - -//struct lran2_st { -// long x, y, v[97]; -//}; - -static void -lran2_init(struct lran2_st* d, long seed) -{ - long x; - int j; - - x = (IC - seed) % LRAN2_MAX; - if(x < 0) x = -x; - for(j=0; j<97; j++) { - x = (IA*x + IC) % LRAN2_MAX; - d->v[j] = x; - } - d->x = (IA*x + IC) % LRAN2_MAX; - d->y = d->x; -} - -static -long lran2(struct lran2_st* d) -{ - int j = (d->y % 97); - - d->y = d->v[j]; - d->x = (IA*d->x + IC) % LRAN2_MAX; - d->v[j] = d->x; - return d->y; -} - -#undef IA -#undef IC - -#endif - - diff --git a/src/benchmarks/larson/Makefile b/src/benchmarks/larson/Makefile new file mode 100644 index 0000000..9ccce9f --- /dev/null +++ b/src/benchmarks/larson/Makefile @@ -0,0 +1,24 @@ +OBJDIR ?= obj + +CXX ?= g++ + +WARNFLAGS ?= -Wall -Wextra +COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread +OPTFLAGS ?= -O3 -DNDEBUG + +CXXFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS) + +LDXXFLAGS ?= -pthread -static-libgcc -static-libstdc++ + +.PHONY: all clean + +all: $(OBJDIR)/larson + +$(OBJDIR)/larson: larson.cc | $(OBJDIR) + $(CXX) $(LDXXFLAGS) $(CXXFLAGS) -o $@ $< + +$(OBJDIR): + mkdir $@ + +clean: + rm -rf $(OBJDIR) diff --git a/src/benchmarks/larson/larson.cc b/src/benchmarks/larson/larson.cc new file mode 100644 index 0000000..be8038f --- /dev/null +++ b/src/benchmarks/larson/larson.cc @@ -0,0 +1,744 @@ +#include +#include + +#if defined(_WIN32) +#define __WIN32__ +#endif + +#ifdef __WIN32__ +#include +#include +#include + +#else +#include +#include +#include + +#ifndef __SVR4 +//extern "C" int pthread_setconcurrency (int) throw(); +#include +#endif + + +typedef void * LPVOID; +typedef long long LONGLONG; +typedef long DWORD; +typedef long LONG; +typedef unsigned long ULONG; +typedef union _LARGE_INTEGER { + struct { + DWORD LowPart; + LONG HighPart; + } foo; + 
LONGLONG QuadPart; // In Visual C++, a typedef to _ _int64} LARGE_INTEGER; +} LARGE_INTEGER; +typedef long long _int64; +#ifndef TRUE +enum { TRUE = 1, FALSE = 0 }; +#endif +#include +#define _ASSERTE(x) assert(x) +#define _inline inline +void Sleep (long x) +{ + // printf ("sleeping for %ld seconds.\n", x/1000); + sleep(x/1000); +} + +void QueryPerformanceCounter (long * x) +{ + struct timezone tz; + struct timeval tv; + gettimeofday (&tv, &tz); + *x = tv.tv_sec * 1000000L + tv.tv_usec; +} + +void QueryPerformanceFrequency(long * x) +{ + *x = 1000000L; +} + + +#include +#include +#include +#include +#include +#include +#include + +#define _REENTRANT 1 +#include +#ifdef __sun +#include +#endif +typedef void * VoidFunction (void *); +void _beginthread (VoidFunction x, int, void * z) +{ + pthread_t pt; + pthread_attr_t pa; + pthread_attr_init (&pa); + +#if 1//defined(__SVR4) + pthread_attr_setscope (&pa, PTHREAD_SCOPE_SYSTEM); /* bound behavior */ +#endif + + // printf ("creating a thread.\n"); + int v = pthread_create(&pt, &pa, x, z); + // printf ("v = %d\n", v); +} +#endif + + +#if 0 +static char buf[65536]; + +#define malloc(v) &buf +#define free(p) +#endif + +#undef CPP +//#define CPP +//#include "arch-specific.h" + +#if USE_ROCKALL +//#include "FastHeap.hpp" +//FAST_HEAP theFastHeap (1024 * 1024, true, true, true); + +typedef int SBIT32; + +#include "SmpHeap.hpp" +SMP_HEAP theFastHeap (1024 * 1024, true, true, true); + +void * operator new( unsigned int cb ) +{ + void *pRet = theFastHeap.New ((size_t)cb) ; + return pRet; +} + +void operator delete(void *pUserData ) +{ + theFastHeap.Delete (pUserData) ; +} +#endif + +#if 0 +extern "C" void * hdmalloc (size_t sz) ; +extern "C" void hdfree (void * ptr) ; +extern "C" void hdmalloc_stats (void) ; +void * operator new( unsigned int cb ) +{ + void *pRet = hdmalloc((size_t)cb) ; + return pRet; +} + +void operator delete(void *pUserData ) +{ + hdfree(pUserData) ; +} +#endif + + + +/* Test driver for memory allocators */ 
+/* Author: Paul Larson, palarson@microsoft.com */ +#define MAX_THREADS 100 +#define MAX_BLOCKS 20000000 + +int volatile stopflag=FALSE ; + +struct lran2_st { + long x, y, v[97]; +}; + +int TotalAllocs=0 ; + +typedef struct thr_data { + + int threadno ; + int NumBlocks ; + int seed ; + + int min_size ; + int max_size ; + + char * *array ; + int *blksize ; + int asize ; + + unsigned long cAllocs ; + unsigned long cFrees ; + int cThreads ; + unsigned long cBytesAlloced ; + + volatile int finished ; + struct lran2_st rgen ; + +} thread_data; + +void runthreads(long sleep_cnt, int min_threads, int max_threads, + int chperthread, int num_rounds) ; +void runloops(long sleep_cnt, int num_chunks ) ; +static void warmup(char **blkp, int num_chunks ); +static void * exercise_heap( void *pinput) ; +static void lran2_init(struct lran2_st* d, long seed) ; +static long lran2(struct lran2_st* d) ; +ULONG CountReservedSpace() ; + +char ** blkp = new char *[MAX_BLOCKS] ; +int * blksize = new int[MAX_BLOCKS] ; +long seqlock=0 ; +struct lran2_st rgen ; +int min_size=10, max_size=500 ; +int num_threads ; +ULONG init_space ; + +extern int cLockSleeps ; +extern int cAllocedChunks ; +extern int cAllocedSpace ; +extern int cUsedSpace ; +extern int cFreeChunks ; +extern int cFreeSpace ; + +int cChecked=0 ; + +#if defined(_WIN32) +extern "C" { + extern HANDLE crtheap; +}; +#endif + +int main (int argc, char *argv[]) +{ +#if defined(USE_LFH) && defined(_WIN32) + // Activate 'Low Fragmentation Heap'. 
+ ULONG info = 2; + HeapSetInformation (GetProcessHeap(), + HeapCompatibilityInformation, + &info, + sizeof(info)); +#endif +#if 0 // defined(__SVR4) + { + psinfo_t ps; + int pid = getpid(); + char fname[255]; + sprintf (fname, "/proc/%d/psinfo", pid); + // sprintf (fname, "/proc/self/ps"); + FILE * f = fopen (fname, "rb"); + printf ("opening %s\n", fname); + if (f) { + fread (&ps, sizeof(ps), 1, f); + printf ("resident set size = %dK\n", ps.pr_rssize); + fclose (f); + } + } +#endif + +#if defined(_MT) || defined(_REENTRANT) + int min_threads, max_threads ; + int num_rounds ; + int chperthread ; +#endif + unsigned seed=12345 ; + int num_chunks=10000; + long sleep_cnt; + + if (argc > 7) { + sleep_cnt = atoi(argv[1]); + min_size = atoi(argv[2]); + max_size = atoi(argv[3]); + chperthread = atoi(argv[4]); + num_rounds = atoi(argv[5]); + seed = atoi(argv[6]); + max_threads = atoi(argv[7]); + min_threads = max_threads; + printf ("sleep = %ld, min = %d, max = %d, per thread = %d, num rounds = %d, seed = %d, max_threads = %d, min_threads = %d\n", + sleep_cnt, min_size, max_size, chperthread, num_rounds, seed, max_threads, min_threads); + goto DoneWithInput; + } + +#if defined(_MT) || defined(_REENTRANT) + //#ifdef _MT + printf( "\nMulti-threaded test driver \n") ; +#else + printf( "\nSingle-threaded test driver \n") ; +#endif +#ifdef CPP + printf("C++ version (new and delete)\n") ; +#else + printf("C version (malloc and free)\n") ; +#endif + printf("runtime (sec): ") ; + scanf ("%ld", &sleep_cnt); + + printf("chunk size (min,max): ") ; + scanf("%d %d", &min_size, &max_size ) ; +#if defined(_MT) || defined(_REENTRANT) + //#ifdef _MT + printf("threads (min, max): ") ; + scanf("%d %d", &min_threads, &max_threads) ; + printf("chunks/thread: ") ; scanf("%d", &chperthread ) ; + printf("no of rounds: ") ; scanf("%d", &num_rounds ) ; + num_chunks = max_threads*chperthread ; +#else + printf("no of chunks: ") ; scanf("%d", &num_chunks ) ; +#endif + printf("random seed: ") ; 
scanf("%d", &seed) ; + + DoneWithInput: + + if( num_chunks > MAX_BLOCKS ){ + printf("Max %d chunks - exiting\n", MAX_BLOCKS ) ; + return(1) ; + } + +#ifndef __WIN32__ +#ifdef __SVR4 + pthread_setconcurrency (max_threads); +#endif +#endif + + lran2_init(&rgen, seed) ; + // init_space = CountReservedSpace() ; + +#if defined(_MT) || defined(_REENTRANT) + //#ifdef _MT + runthreads(sleep_cnt, min_threads, max_threads, chperthread, num_rounds) ; +#else + runloops(sleep_cnt, num_chunks ) ; +#endif + +#ifdef _DEBUG + _cputs("Hit any key to exit...") ; (void)_getch() ; +#endif + + return 0; + +} /* main */ + +void runloops(long sleep_cnt, int num_chunks ) +{ + int cblks ; + int victim ; + int blk_size ; +#ifdef __WIN32__ + _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt; +#else + long ticks_per_sec ; + long start_cnt, end_cnt ; +#endif + _int64 ticks ; + double duration ; + double reqd_space ; + ULONG used_space ; + int sum_allocs=0 ; + + QueryPerformanceFrequency( &ticks_per_sec ) ; + QueryPerformanceCounter( &start_cnt) ; + + for( cblks=0; cblks= sleep_cnt) break ; + } + reqd_space = (0.5*(min_size+max_size)*num_chunks) ; + // used_space = CountReservedSpace() - init_space; + + printf("%6.3f", duration ) ; + printf("%8.0f", sum_allocs/duration ) ; + printf(" %6.3f %.3f", (double)used_space/(1024*1024), used_space/reqd_space) ; + printf("\n") ; + +} + + +#if defined(_MT) || defined(_REENTRANT) +//#ifdef _MT +void runthreads(long sleep_cnt, int min_threads, int max_threads, int chperthread, int num_rounds) +{ + thread_data *de_area = new thread_data[max_threads] ; + thread_data *pdea; + int nperthread ; + int sum_threads ; + unsigned long sum_allocs ; + unsigned long sum_frees ; + double duration ; +#ifdef __WIN32__ + _LARGE_INTEGER ticks_per_sec, start_cnt, end_cnt; +#else + long ticks_per_sec ; + long start_cnt, end_cnt ; +#endif + _int64 ticks ; + double rate_1=0, rate_n ; + double reqd_space ; + ULONG used_space ; + int prevthreads ; + int i ; + + 
QueryPerformanceFrequency( &ticks_per_sec ) ; + + pdea = &de_area[0] ; + memset(&de_area[0], 0, sizeof(thread_data)) ; + + prevthreads = 0 ; + for(num_threads=min_threads; num_threads <= max_threads; num_threads++ ) + { + + warmup(&blkp[prevthreads*chperthread], (num_threads-prevthreads)*chperthread ); + + nperthread = chperthread ; + stopflag = FALSE ; + + for(i=0; i< num_threads; i++){ + de_area[i].threadno = i+1 ; + de_area[i].NumBlocks = num_rounds*nperthread; + de_area[i].array = &blkp[i*nperthread] ; + de_area[i].blksize = &blksize[i*nperthread] ; + de_area[i].asize = nperthread ; + de_area[i].min_size = min_size ; + de_area[i].max_size = max_size ; + de_area[i].seed = lran2(&rgen) ; ; + de_area[i].finished = 0 ; + de_area[i].cAllocs = 0 ; + de_area[i].cFrees = 0 ; + de_area[i].cThreads = 0 ; + de_area[i].finished = FALSE ; + lran2_init(&de_area[i].rgen, de_area[i].seed) ; + +#ifdef __WIN32__ + _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, &de_area[i]) ; +#else + _beginthread(exercise_heap, 0, &de_area[i]) ; +#endif + + } + + QueryPerformanceCounter( &start_cnt) ; + + // printf ("Sleeping for %ld seconds.\n", sleep_cnt); + Sleep(sleep_cnt * 1000L) ; + + stopflag = TRUE ; + + for(i=0; ifinished = FALSE ; + pdea->cThreads++ ; + range = pdea->max_size - pdea->min_size ; + + /* allocate NumBlocks chunks of random size */ + for( cblks=0; cblksNumBlocks; cblks++){ + victim = lran2(&pdea->rgen)%pdea->asize ; +#ifdef CPP + delete pdea->array[victim] ; +#else + free(pdea->array[victim]) ; +#endif + pdea->cFrees++ ; + + if (range == 0) { + blk_size = pdea->min_size; + } else { + blk_size = pdea->min_size+lran2(&pdea->rgen)%range ; + } +#ifdef CPP + pdea->array[victim] = new char[blk_size] ; +#else + pdea->array[victim] = (char *) malloc(blk_size) ; +#endif + + pdea->blksize[victim] = blk_size ; + assert(pdea->array[victim] != NULL) ; + + pdea->cAllocs++ ; + + /* Write something! 
*/ + + volatile char * chptr = ((char *) pdea->array[victim]); + *chptr++ = 'a'; + volatile char ch = *((char *) pdea->array[victim]); + *chptr = 'b'; + + + if( stopflag ) break ; + } + + // printf("Thread %u terminating: %d allocs, %d frees\n", + // pdea->threadno, pdea->cAllocs, pdea->cFrees) ; + pdea->finished = TRUE ; + + if( !stopflag ){ +#ifdef __WIN32__ + _beginthread((void (__cdecl*)(void *)) exercise_heap, 0, pdea) ; +#else + _beginthread(exercise_heap, 0, pdea) ; +#endif + } else { + printf ("thread stopping.\n"); + } +#ifndef _WIN32 + pthread_exit (NULL); +#endif + return 0; +} + +static void warmup(char **blkp, int num_chunks ) +{ + int cblks ; + int victim ; + int blk_size ; + LPVOID tmp ; + + + for( cblks=0; cblks 0 ; cblks--){ + victim = lran2(&rgen)%cblks ; + tmp = blkp[victim] ; + blkp[victim] = blkp[cblks-1] ; + blkp[cblks-1] = (char *) tmp ; + } + + for( cblks=0; cblks<4*num_chunks; cblks++){ + victim = lran2(&rgen)%num_chunks ; +#ifdef CPP + delete blkp[victim] ; +#else + free(blkp[victim]) ; +#endif + + if (max_size == min_size) { + blk_size = min_size; + } else { + blk_size = min_size+lran2(&rgen)%(max_size - min_size) ; + } +#ifdef CPP + blkp[victim] = new char[blk_size] ; +#else + blkp[victim] = (char *) malloc(blk_size) ; +#endif + blksize[victim] = blk_size ; + assert(blkp[victim] != NULL) ; + } +} +#endif // _MT + +#ifdef __WIN32__ +ULONG CountReservedSpace() +{ + MEMORY_BASIC_INFORMATION info; + char *addr=NULL ; + ULONG size=0 ; + + while( true){ + VirtualQuery(addr, &info, sizeof(info)); + switch( info.State){ + case MEM_FREE: + case MEM_RESERVE: + break ; + case MEM_COMMIT: + size += info.RegionSize ; + break ; + } + addr += info.RegionSize ; + if( addr >= (char *)0x80000000UL ) break ; + } + + return size ; + +} +#endif + +// ======================================================= + +/* lran2.h + * by Wolfram Gloger 1996. + * + * A small, portable pseudo-random number generator. 
+ */ + +#ifndef _LRAN2_H +#define _LRAN2_H + +#define LRAN2_MAX 714025l /* constants for portable */ +#define IA 1366l /* random number generator */ +#define IC 150889l /* (see e.g. `Numerical Recipes') */ + +//struct lran2_st { +// long x, y, v[97]; +//}; + +static void +lran2_init(struct lran2_st* d, long seed) +{ + long x; + int j; + + x = (IC - seed) % LRAN2_MAX; + if(x < 0) x = -x; + for(j=0; j<97; j++) { + x = (IA*x + IC) % LRAN2_MAX; + d->v[j] = x; + } + d->x = (IA*x + IC) % LRAN2_MAX; + d->y = d->x; +} + +static +long lran2(struct lran2_st* d) +{ + int j = (d->y % 97); + + d->y = d->v[j]; + d->x = (IA*d->x + IC) % LRAN2_MAX; + d->v[j] = d->x; + return d->y; +} + +#undef IA +#undef IC + +#endif + + diff --git a/src/benchmarks/loop/Makefile b/src/benchmarks/loop/Makefile new file mode 100644 index 0000000..89914b2 --- /dev/null +++ b/src/benchmarks/loop/Makefile @@ -0,0 +1,24 @@ +OBJDIR ?= obj + +CC ?= gcc + +WARNFLAGS ?= -Wall -Wextra +COMMONFLAGS ?= -fno-builtin -fPIC -DPIC -pthread +OPTFLAGS ?= -O3 -DNDEBUG + +CFLAGS ?= $(OPTFLAGS) $(WARNFLAGS) $(COMMONFLAGS) + +LDFLAGS ?= -pthread -static-libgcc + +.PHONY = all clean + +all: $(OBJDIR)/loop + +$(OBJDIR)/loop: loop.c | $(OBJDIR) + $(CC) $(LDFLAGS) $(CFLAGS) -o $@ $< + +$(OBJDIR): + mkdir $@ + +clean: + rm -rf $(OBJDIR) diff --git a/src/benchmarks/loop/loop.c b/src/benchmarks/loop/loop.c new file mode 100644 index 0000000..bc15808 --- /dev/null +++ b/src/benchmarks/loop/loop.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + + +static size_t _rand() { + static __thread size_t seed = 123456789; + size_t a = 1103515245; + size_t c = 12345; + size_t m = 1 << 31; + seed = (a * seed + c) % m; + return seed; +} + +typedef struct ThreadArgs { + double benchmark; + int allocations; + int max_size; +} ThreadArgs; + +static void* malloc_then_write(size_t size) { + void* ptr = malloc(size); + // Write to ptr + /* *((char*)ptr) = '!'; */ + return ptr; +} + +static void read_then_free(void* 
ptr) { + // Read before free + /* char s __attribute__((unused)) = *((char*)ptr); */ + free(ptr); +} +static void* test_thread_func(void* arg) { + ThreadArgs* args = (ThreadArgs*)arg; + + for(int i = 0; i < args->allocations; i++) { + void* ptr = malloc_then_write((_rand() % args->max_size) + 1); + read_then_free(ptr); + } + return NULL; +} + +int main(int argc, char* argv[]) { + pthread_t* threads; + int num_threads; + struct ThreadArgs thread_args; + + if (argc < 4) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + num_threads = atoi(argv[1]); + thread_args.allocations = atoi(argv[2]); + thread_args.max_size = atoi(argv[3]); + + threads = (pthread_t*)malloc(num_threads * sizeof(pthread_t)); + + for (int i = 0; i < num_threads; i++) { + if (0 != pthread_create(&threads[i], NULL, test_thread_func, &thread_args)) { + perror("pthread_create"); + return 1; + } + } + + for(int i = 0; i < num_threads; i++) { + if (0 != pthread_join(threads[i], NULL)) { + perror("pthread_join"); + return 1; + } + } + + if (argc == 5) + { + FILE* f = stdout; + if (strcmp(argv[4],"stdout") != 0) + f = fopen(argv[4], "w"); + malloc_info(0, f); + if (strcmp(argv[4],"stdout") != 0) + fclose(f); + } + + return 0; +} diff --git a/src/benchmarks/timer.h b/src/benchmarks/timer.h deleted file mode 100644 index d4d42c7..0000000 --- a/src/benchmarks/timer.h +++ /dev/null @@ -1,372 +0,0 @@ -/* -*- C++ -*- */ - -/* - - Heap Layers: An Extensible Memory Allocation Infrastructure - - Copyright (C) 2000-2003 by Emery Berger - http://www.cs.umass.edu/~emery - emery@cs.umass.edu - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -*/ - -#include -#include - - -#ifndef _TIMER_H_ -#define _TIMER_H_ - -/** - * @class Timer - * @brief A portable class for high-resolution timing. - * - * This class simplifies timing measurements across a number of platforms. - * - * @code - * Timer t; - * t.start(); - * // do some work - * t.stop(); - * cout << "That took " << (double) t << " seconds." << endl; - * @endcode - * - */ - -#ifdef __APPLE__ -#include -#endif - -#if defined(__linux__) && defined(__GNUG__) && defined(__i386__) - -#include -#include -#include -#include -#include -#include - -static void getTime (unsigned long& tlo, unsigned long& thi) { - asm volatile ("rdtsc" - : "=a"(tlo), - "=d" (thi)); -} - - -static double getFrequency (void) { - static double freq = 0.0; - static bool initialized = false; - unsigned long LTime0, LTime1, HTime0, HTime1; - if (!initialized) { - - freq = 2600000.0; - -#if 0 - // Compute MHz directly. - // Wait for approximately one second. 
- - getTime (LTime0, HTime0); - // printf ("waiting...\n"); - struct timespec rqtp, rmtp; - rqtp.tv_sec = 1; - rqtp.tv_nsec = 0; - nanosleep (&rqtp, &rmtp); - // printf ("done.\n"); - getTime (LTime1, HTime1); - - freq = (double)(LTime1 - LTime0) + (double)(UINT_MAX)*(double)(HTime1 - HTime0); - if (LTime1 < LTime0) { - freq -= (double)UINT_MAX; - } -#endif - initialized = true; - - } else { - // printf ("wha?\n"); - } - return freq; -} - - -namespace HL { - -class Timer { -public: - Timer (void) - : timeElapsed (0.0) - { - _frequency = getFrequency(); - // printf ("wooo!\n"); - // printf ("freq = %lf\n", frequency); - } - void start (void) { - getTime (currentLo, currentHi); - } - void stop (void) { - unsigned long lo, hi; - getTime (lo, hi); - double now = (double) hi * 4294967296.0 + lo; - double prev = (double) currentHi * 4294967296.0 + currentLo; - timeElapsed = (now - prev) / _frequency; - } - - operator double (void) { - return timeElapsed; - } - -private: - double timeElapsed; - unsigned long currentLo, currentHi; - double _frequency; -}; - -}; - -#else - - -#ifdef __SVR4 // Solaris -#include -#include -#include -#include -#include -#endif // __SVR4 - -#include - -#if defined(unix) || defined(__linux) -#include -#include -#endif - - -#ifdef __sgi -#include -#include -#include -#endif - - -#if defined(_WIN32) -#include -#endif - - -#if defined(__BEOS__) -#include -#endif - - -namespace HL { - -class Timer { - -public: - - /// Initializes the timer. - Timer (void) -#if !defined(_WIN32) - : _starttime (0), - _elapsedtime (0) -#endif - { - } - - /// Start the timer. - void start (void) { _starttime = _time(); } - - /// Stop the timer. - void stop (void) { _elapsedtime += _time() - _starttime; } - - /// Reset the timer. - void reset (void) { _starttime = _elapsedtime; } - -#if 0 - // Set the timer. - void set (double secs) { _starttime = 0; _elapsedtime = _sectotime (secs);} -#endif - - /// Return the number of seconds elapsed. 
- operator double (void) { return _timetosec (_elapsedtime); } - - static double currentTime (void) { TimeType t; t = _time(); return _timetosec (t); } - - -private: - - // The _timer variable will be different depending on the OS. - // We try to use the best timer available. - -#ifdef __sgi -#define TIMER_FOUND - - long _starttime, _elapsedtime; - - long _time (void) { - struct tms t; - long ticks = times (&t); - return ticks; - } - - static double _timetosec (long t) { - return ((double) (t) / CLK_TCK); - } - - static long _sectotime (double sec) { - return (long) sec * CLK_TCK; - } -#endif - -#ifdef __SVR4 // Solaris -#define TIMER_FOUND - typedef hrtime_t TimeType; - TimeType _starttime, _elapsedtime; - - static TimeType _time (void) { - return gethrtime(); - } - - static TimeType _sectotime (double sec) { return (hrtime_t) (sec * 1.0e9); } - - static double _timetosec (TimeType& t) { - return ((double) (t) / 1.0e9); - } -#endif // __SVR4 - -#if defined(MAC) || defined(macintosh) -#define TIMER_FOUND - double _starttime, _elapsedtime; - - double _time (void) { - return get_Mac_microseconds(); - } - - double _timetosec (hrtime_t& t) { - return t; - } -#endif // MAC - -#ifdef _WIN32 -#define TIMER_FOUND - -#ifndef __GNUC__ - class TimeType { - public: - TimeType (void) - { - largeInt.QuadPart = 0; - } - operator double& (void) { return (double&) largeInt.QuadPart; } - operator LARGE_INTEGER& (void) { return largeInt; } - double timeToSec (void) { - return (double) largeInt.QuadPart / getFreq(); - } - private: - double getFreq (void) { - QueryPerformanceFrequency (&freq); - return (double) freq.QuadPart; - } - - LARGE_INTEGER largeInt; - LARGE_INTEGER freq; - }; - - TimeType _starttime, _elapsedtime; - - static TimeType _time (void) { - TimeType t; - int r = QueryPerformanceCounter (&((LARGE_INTEGER&) t)); - assert (r); - return t; - } - - static double _timetosec (TimeType& t) { - return t.timeToSec(); - } -#else - typedef DWORD TimeType; - DWORD _starttime, 
_elapsedtime; - static DWORD _time (void) { - return GetTickCount(); - } - - static double _timetosec (DWORD& t) { - return (double) t / 100000.0; - } - static unsigned long _sectotime (double sec) { - return (unsigned long)(sec); - } -#endif -#endif // _WIN32 - - -#ifdef __BEOS__ -#define TIMER_FOUND - bigtime_t _starttime, _elapsedtime; - bigtime_t _time(void) { - return system_time(); - } - double _timetosec (bigtime_t& t) { - return (double) t / 1000000.0; - } - - bigtime_t _sectotime (double sec) { - return (bigtime_t)(sec * 1000000.0); - } -#endif // __BEOS__ - -#ifndef TIMER_FOUND - - typedef long TimeType; - TimeType _starttime, _elapsedtime; - - static TimeType _time (void) { - struct timeval t; - gettimeofday (&t, NULL); - return t.tv_sec * 1000000 + t.tv_usec; - } - - static double _timetosec (TimeType t) { - return ((double) (t) / 1000000.0); - } - - static TimeType _sectotime (double sec) { - return (TimeType) (sec * 1000000.0); - } - -#endif // TIMER_FOUND - -#undef TIMER_FOUND - -}; - - -#ifdef __SVR4 // Solaris -class VirtualTimer : public Timer { -public: - hrtime_t _time (void) { - return gethrvtime(); - } -}; -#endif - -} - -#endif - -#endif diff --git a/src/dj_trace.py b/src/dj_trace.py index 21b9ddd..f4265ea 100644 --- a/src/dj_trace.py +++ b/src/dj_trace.py @@ -34,7 +34,7 @@ class Benchmark_DJ_Trace( Benchmark ): also used by delorie to measure improvements in the glibc allocator.""", - self.cmd = "build/trace_run{binary_suffix} dj_workloads/{workload}.wl" + self.cmd = "trace_run{binary_suffix} dj_workloads/{workload}.wl" self.measure_cmd = "" self.args = { @@ -80,7 +80,7 @@ class Benchmark_DJ_Trace( Benchmark ): "realloc":117, "free":10099261, "threads": 19}, } - self.requirements = ["build/trace_run"] + self.requirements = ["trace_run"] super().__init__() def prepare(self, verbose=False): diff --git a/src/falsesharing.py b/src/falsesharing.py index 57acf06..6c4ddc0 100644 --- a/src/falsesharing.py +++ b/src/falsesharing.py @@ -16,14 +16,14 
@@ class Benchmark_Falsesharing( Benchmark ): on the same cache line the writes will be expensive because of cache thrashing.""" - self.cmd = "build/cache-{bench}{binary_suffix} {threads} 100 8 1000000" + self.cmd = "cache-{bench}{binary_suffix} {threads} 100 8 1000000" self.args = { "bench" : ["thrash", "scratch"], "threads" : range(1, multiprocessing.cpu_count() * 2 + 1) } - self.requirements = ["build/cache-thrash", "build/cache-scratch"] + self.requirements = ["cache-thrash", "cache-scratch"] super().__init__() def process_output(self, result, stdout, stderr, target, perm, verbose): diff --git a/src/larson.py b/src/larson.py index 0a4a237..a035de8 100644 --- a/src/larson.py +++ b/src/larson.py @@ -13,14 +13,14 @@ class Benchmark_Larson( Benchmark ): and deallocates objects, and then transfers some objects (randomly selected) to other threads to be freed.""" - self.cmd = "build/larson{binary_suffix} 1 8 {maxsize} 1000 50000 1 {threads}" + self.cmd = "larson{binary_suffix} 1 8 {maxsize} 1000 50000 1 {threads}" self.args = { "maxsize" : [8, 32, 64, 128, 256, 512, 1024], "threads" : range(1, multiprocessing.cpu_count() * 2 + 1) } - self.requirements = ["build/larson"] + self.requirements = ["larson"] super().__init__() def process_output(self, result, stdout, stderr, target, perm, verbose): diff --git a/src/loop.py b/src/loop.py index d58b4e2..81ddf19 100644 --- a/src/loop.py +++ b/src/loop.py @@ -9,14 +9,14 @@ class Benchmark_Loop( Benchmark ): How allocations are freed can be changed with the benchmark version""", - self.cmd = "build/bench_loop{binary_suffix} {nthreads} 1000000 {maxsize}" + self.cmd = "loop{binary_suffix} {nthreads} 1000000 {maxsize}" self.args = { "maxsize" : [2 ** x for x in range(6, 16)], "nthreads" : range(1, multiprocessing.cpu_count() * 2 + 1) } - self.requirements = ["build/bench_loop"] + self.requirements = ["loop"] super().__init__() def summary(self): diff --git a/src/trace_run.c b/src/trace_run.c deleted file mode 100644 index 
604d01e..0000000 --- a/src/trace_run.c +++ /dev/null @@ -1,750 +0,0 @@ -#define _LARGEFILE64_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// #include "malloc.h" -#include - -// #include "mtrace.h" -/* Codes for the simulator/workload programs. Copied from mtrace.h. */ -#define C_NOP 0 -#define C_DONE 1 -#define C_MALLOC 2 -#define C_CALLOC 3 -#define C_REALLOC 4 -#define C_FREE 5 -#define C_SYNC_W 6 -#define C_SYNC_R 7 -#define C_ALLOC_PTRS 8 -#define C_ALLOC_SYNCS 9 -#define C_NTHREADS 10 -#define C_START_THREAD 11 -#define C_MEMALIGN 12 -#define C_VALLOC 13 -#define C_PVALLOC 14 -#define C_POSIX_MEMALIGN 15 - -#if UINTPTR_MAX == 0xffffffffffffffff - -#define ticks_t int64_t -/* Setting quick_run to 1 allows the simulator to model - only the allocation and deallocation accounting via - atomic_rss. The actual allocations are skipped. This - mode is useful to verify the workload file. */ -#define quick_run 0 - -static __inline__ ticks_t rdtsc_s(void) -{ - unsigned a, d; - asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx"); - asm volatile("rdtscp" : "=a" (a), "=d" (d)); - return ((unsigned long long)a) | (((unsigned long long)d) << 32); -} - -static __inline__ ticks_t rdtsc_e(void) -{ - unsigned a, d; - asm volatile("rdtscp" : "=a" (a), "=d" (d)); - asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx"); - return ((unsigned long long)a) | (((unsigned long long)d) << 32); -} - -#else - -#define ticks_t int32_t - -static __inline__ ticks_t rdtsc_s(void) -{ - unsigned a, d; - asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx"); - asm volatile("rdtsc" : "=a" (a), "=d" (d)); - return ((unsigned long)a) | (((unsigned long)d) << 16); -} - -static __inline__ ticks_t rdtsc_e(void) -{ - unsigned a, d; - asm volatile("rdtscp" : "=a" (a), "=d" (d)); - asm volatile("cpuid" ::: "%ax", "%bx", "%cx", "%dx"); - return ((unsigned long)a) | (((unsigned long)d) << 16); -} - -#endif - -static 
ticks_t diff_timeval (struct timeval e, struct timeval s) -{ - ticks_t usec; - if (e.tv_usec < s.tv_usec) - usec = (e.tv_usec + 1000000 - s.tv_usec) + (e.tv_sec-1 - s.tv_sec)*1000000; - else - usec = (e.tv_usec - s.tv_usec) + (e.tv_sec - s.tv_sec)*1000000; - return usec; -} - -#if 1 -#define Q1 -#define Q2 -#else -pthread_mutex_t genmutex = PTHREAD_MUTEX_INITIALIZER; -#define Q1 pthread_mutex_lock(&genmutex) -#define Q2 pthread_mutex_unlock(&genmutex) -#endif - -pthread_mutex_t cmutex = PTHREAD_MUTEX_INITIALIZER; -#define NCBUF 10 -static char cbuf[NCBUF][30]; -static int ci = 0; - -char *comma(ticks_t x) -{ - char buf[30], *bs, *bd; - int l, i, idx; - - pthread_mutex_lock(&cmutex); - ci = (ci + 1) % NCBUF; - idx = ci; - pthread_mutex_unlock(&cmutex); - bs = buf; - bd = cbuf[idx]; - - sprintf(buf, "%lld", (long long int)x); - l = strlen(buf); - i = l; - while (*bs) - { - *bd++ = *bs++; - i--; - if (i % 3 == 0 && *bs) - *bd++ = ','; - } - *bd = 0; - return cbuf[idx]; -} - -static volatile void **ptrs; -static volatile size_t *sizes; -static size_t n_ptrs; -static volatile char *syncs; -static pthread_mutex_t *mutexes; -static pthread_cond_t *conds; -static size_t n_syncs; - -static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER; -ticks_t malloc_time = 0, malloc_count = 0; -ticks_t calloc_time = 0, calloc_count = 0; -ticks_t realloc_time = 0, realloc_count = 0; -ticks_t free_time = 0, free_count = 0; - -size_t ideal_rss = 0; -size_t max_ideal_rss = 0; -static pthread_mutex_t rss_mutex = PTHREAD_MUTEX_INITIALIZER; - -void atomic_rss (ssize_t delta) -{ - pthread_mutex_lock (&rss_mutex); - ideal_rss += delta; - if (max_ideal_rss < ideal_rss) - max_ideal_rss = ideal_rss; - pthread_mutex_unlock (&rss_mutex); -} - -pthread_mutex_t stop_mutex = PTHREAD_MUTEX_INITIALIZER; -int threads_done = 0; - -//#define dprintf printf -#define dprintf(...) (void)1 - -//#define mprintf printf -//#define MDEBUG 1 -#define mprintf(...) 
(void)1 - -#define myabort() my_abort_2(thrc, __LINE__) -void -my_abort_2 (pthread_t thrc, int line) -{ - fprintf(stderr, "Abort thread %p at line %d\n", (void *)thrc, line); - abort(); -} - -/*------------------------------------------------------------*/ -/* Wrapper around I/O routines */ - -int io_fd; - -#define IOSIZE 65536 -#define IOMIN 4096 - -static pthread_mutex_t io_mutex = PTHREAD_MUTEX_INITIALIZER; - -typedef struct { - unsigned char buf[IOSIZE]; - size_t incr; - size_t max_incr; - size_t buf_base; - size_t buf_idx; - int saw_eof; -} IOPerThreadType; - -IOPerThreadType main_io; -IOPerThreadType *thread_io; - -void -io_init (IOPerThreadType *io, size_t file_offset, int incr) -{ - if (incr > IOSIZE) - incr = IOSIZE; - if (incr < IOMIN) - incr = IOMIN; - - io->buf_base = file_offset; - io->buf_idx = 0; - io->incr = incr; - - pthread_mutex_lock (&io_mutex); - lseek64 (io_fd, io->buf_base, SEEK_SET); - // short read OK, the eof is just to prevent runaways from bad data. - if (read (io_fd, io->buf, incr) < 0) - io->saw_eof = 1; - else - io->saw_eof = 0; - pthread_mutex_unlock (&io_mutex); -} - -unsigned char -io_read (IOPerThreadType *io) -{ - if (io->buf_idx >= io->incr) - io_init (io, io->buf_base + io->buf_idx, io->incr); - if (io->saw_eof) - return 0xff; - return io->buf [io->buf_idx++]; -} - -unsigned char -io_peek (IOPerThreadType *io) -{ - if (io->buf_idx >= io->incr) - io_init (io, io->buf_base + io->buf_idx, io->incr); - if (io->saw_eof) - return 0xff; - return io->buf [io->buf_idx]; -} - -size_t -io_pos (IOPerThreadType *io) -{ - return io->buf_base + io->buf_idx; -} - -/*------------------------------------------------------------*/ - -static void -wmem (volatile void *ptr, int count) -{ - char *p = (char *)ptr; - int i; - - if (!p) - return; - - for (i=0; isaw_eof) - myabort(); - dprintf("op %p:%ld is %d\n", (void *)thrc, io_pos (io), io_peek (io)); - switch (io_read (io)) - { - case C_NOP: - break; - - case C_DONE: - dprintf("op %p:%ld DONE\n", 
(void *)thrc, io_pos (io)); - pthread_mutex_lock (&stat_mutex); - malloc_time += my_malloc_time; - calloc_time += my_calloc_time; - realloc_time += my_realloc_time; - free_time += my_free_time; - malloc_count += my_malloc_count; - calloc_count += my_calloc_count; - realloc_count += my_realloc_count; - free_count += my_free_count; - threads_done ++; - pthread_mutex_unlock (&stat_mutex); - pthread_mutex_lock(&stop_mutex); - pthread_mutex_unlock(&stop_mutex); - return NULL; - - case C_MEMALIGN: - p2 = get_int (io); - sz2 = get_int (io); - sz = get_int (io); - dprintf("op %p:%ld %ld = MEMALIGN %ld %ld\n", (void *)thrc, io_pos (io), p2, sz2, sz); - /* we can't force memalign to return NULL (fail), so just skip it. */ - if (p2 == 0) - break; - if (p2 > n_ptrs) - myabort(); - stime = rdtsc_s(); - Q1; - if (ptrs[p2]) - { - if (!quick_run) - free ((void *)ptrs[p2]); - atomic_rss (-sizes[p2]); - } - if (!quick_run) - ptrs[p2] = memalign (sz2, sz); - else - ptrs[p2] = (void *)p2; - /* Verify the alignment matches what is expected. */ - if (((size_t)ptrs[p2] & (sz2 - 1)) != 0) - myabort (); - sizes[p2] = sz; - mprintf("%p = memalign(%lx, %lx)\n", ptrs[p2], sz2, sz); - Q2; - etime = rdtsc_e(); - if (ptrs[p2] != NULL) - atomic_rss (sz); - if (etime < stime) - { - printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime)); - } - my_malloc_time += etime - stime; - my_malloc_count ++; - if (!quick_run) - wmem(ptrs[p2], sz); - break; - - case C_MALLOC: - p2 = get_int (io); - sz = get_int (io); - dprintf("op %p:%ld %ld = MALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz); - /* we can't force malloc to return NULL (fail), so just skip it. 
*/ - if (p2 == 0) - break; - if (p2 > n_ptrs) - myabort(); - stime = rdtsc_s(); - Q1; - if (ptrs[p2]) - { - if (!quick_run) - free ((void *)ptrs[p2]); - atomic_rss (-sizes[p2]); - } - if (!quick_run) - ptrs[p2] = malloc (sz); - else - ptrs[p2] = (void *)p2; - sizes[p2] = sz; - mprintf("%p = malloc(%lx)\n", ptrs[p2], sz); - Q2; - etime = rdtsc_e(); - if (ptrs[p2] != NULL) - atomic_rss (sz); - if (etime < stime) - { - printf("s: %llx e:%llx d:%llx\n", (long long)stime, (long long)etime, (long long)(etime-stime)); - } - my_malloc_time += etime - stime; - my_malloc_count ++; - if (!quick_run) - wmem(ptrs[p2], sz); - break; - - case C_CALLOC: - p2 = get_int (io); - sz = get_int (io); - dprintf("op %p:%ld %ld = CALLOC %ld\n", (void *)thrc, io_pos (io), p2, sz); - /* we can't force calloc to return NULL (fail), so just skip it. */ - if (p2 == 0) - break; - if (p2 > n_ptrs) - myabort(); - if (ptrs[p2]) - { - if (!quick_run) - free ((void *)ptrs[p2]); - atomic_rss (-sizes[p2]); - } - stime = rdtsc_s(); - Q1; - if (!quick_run) - ptrs[p2] = calloc (sz, 1); - else - ptrs[p2] = (void *)p2; - sizes[p2] = sz; - mprintf("%p = calloc(%lx)\n", ptrs[p2], sz); - Q2; - if (ptrs[p2]) - atomic_rss (sz); - my_calloc_time += rdtsc_e() - stime; - my_calloc_count ++; - if (!quick_run) - wmem(ptrs[p2], sz); - break; - - case C_REALLOC: - p2 = get_int (io); - p1 = get_int (io); - sz = get_int (io); - dprintf("op %p:%ld %ld = REALLOC %ld %ld\n", (void *)thrc, io_pos (io), p2, p1, sz); - if (p1 > n_ptrs) - myabort(); - if (p2 > n_ptrs) - myabort(); - /* we can't force realloc to return NULL (fail), so just skip it. 
*/ - if (p2 == 0) - break; - - if (ptrs[p1]) - atomic_rss (-sizes[p1]); - if (!quick_run) - free_wipe(p1); - stime = rdtsc_s(); - Q1; -#ifdef MDEBUG - tmp = ptrs[p1]; -#endif - if (!quick_run) - ptrs[p2] = realloc ((void *)ptrs[p1], sz); - else - ptrs[p2] = (void *)p2; - sizes[p2] = sz; - mprintf("%p = relloc(%p,%lx)\n", ptrs[p2], tmp,sz); - Q2; - my_realloc_time += rdtsc_e() - stime; - my_realloc_count ++; - if (!quick_run) - wmem(ptrs[p2], sz); - if (p1 != p2) - ptrs[p1] = 0; - if (ptrs[p2]) - atomic_rss (sizes[p2]); - break; - - case C_FREE: - p1 = get_int (io); - if (p1 > n_ptrs) - myabort(); - dprintf("op %p:%ld FREE %ld\n", (void *)thrc, io_pos (io), p1); - if (!quick_run) - free_wipe (p1); - if (ptrs[p1]) - atomic_rss (-sizes[p1]); - stime = rdtsc_s(); - Q1; - mprintf("free(%p)\n", ptrs[p1]); - if (!quick_run) - free ((void *)ptrs[p1]); - Q2; - my_free_time += rdtsc_e() - stime; - my_free_count ++; - ptrs[p1] = 0; - break; - - case C_SYNC_W: - p1 = get_int(io); - dprintf("op %p:%ld SYNC_W %ld\n", (void *)thrc, io_pos (io), p1); - if (p1 > n_syncs) - myabort(); - pthread_mutex_lock (&mutexes[p1]); - syncs[p1] = 1; - pthread_cond_signal (&conds[p1]); - __sync_synchronize (); - pthread_mutex_unlock (&mutexes[p1]); - break; - - case C_SYNC_R: - p1 = get_int(io); - dprintf("op %p:%ld SYNC_R %ld\n", (void *)thrc, io_pos (io), p1); - if (p1 > n_syncs) - myabort(); - pthread_mutex_lock (&mutexes[p1]); - while (syncs[p1] != 1) - { - pthread_cond_wait (&conds[p1], &mutexes[p1]); - __sync_synchronize (); - } - pthread_mutex_unlock (&mutexes[p1]); - break; - - default: - printf("op %d - unsupported, thread %d addr %lu\n", - this_op, thread_idx, (long unsigned int)io_pos (io)); - myabort(); - } - } -} - -static void *alloc_mem (size_t amt) -{ - void *rv = mmap (NULL, amt, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); - mlock (rv, amt); - memset (rv, 0, amt); - return rv; -} - -static pthread_t *thread_ids; - -void * -my_malloc (const char *msg, int size, 
IOPerThreadType *io, size_t *psz, size_t count) -{ - void *rv; - if (psz) - count = *psz = get_int (io); - dprintf ("my_malloc for %s size %d * %ld\n", msg, size, count); - rv = alloc_mem(size * count); - if (!rv) - { - fprintf(stderr, "calloc(%lu,%lu) failed\n", (long unsigned)size, (long unsigned)*psz); - exit(1); - } - mlock (rv, size * count); - return rv; -} - -static const char * const scan_names[] = { - "UNUSED", - "ARENA", - "HEAP", - "CHUNK_USED", - "CHUNK_FREE", - "FASTBIN_FREE", - "UNSORTED", - "TOP", - "TCACHE", - "USED" -}; - -void -malloc_scan_callback (void *ptr, size_t length, int type) -{ - printf("%s: ptr %p length %llx\n", scan_names[type], ptr, (long long)length); -} - -#define MY_ALLOC(T, psz) \ - (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, psz, 0) -#define MY_ALLOCN(T, count) \ - (typeof (T)) my_malloc (#T, sizeof(*T), &main_io, NULL, count) - -int -main(int argc, char **argv) -{ - ticks_t start=0; - ticks_t end; - ticks_t usec; - struct timeval tv_s, tv_e; - int thread_idx = 0; - int i; - size_t n_threads = 0; - size_t idx; - struct rusage res_start, res_end; - int done; - size_t guessed_io_size = 4096; - struct stat statb; - - if (argc < 2) - { - fprintf(stderr, "Usage: %s \n", argv[0]); - exit(1); - } - io_fd = open(argv[1], O_RDONLY); - if (io_fd < 0) - { - fprintf(stderr, "Unable to open %s for reading\n", argv[1]); - perror("The error was"); - exit(1); - } - fstat (io_fd, &statb); - - io_init (&main_io, 0, IOMIN); - - pthread_mutex_lock(&stop_mutex); - - done = 0; - while (!done) - { - switch (io_read (&main_io)) - { - case C_NOP: - break; - case C_ALLOC_PTRS: - ptrs = MY_ALLOC (ptrs, &n_ptrs); - sizes = alloc_mem(sizeof(sizes[0]) * n_ptrs); - ptrs[0] = 0; - break; - case C_ALLOC_SYNCS: - n_syncs = get_int(&main_io); - syncs = MY_ALLOCN (syncs, n_syncs); - conds = MY_ALLOCN (conds, n_syncs); - mutexes = MY_ALLOCN (mutexes, n_syncs); - for (idx=0; idx %s)\n", - comma(res_end.ru_maxrss - res_start.ru_maxrss), - 
comma(res_start.ru_maxrss), comma(res_end.ru_maxrss)); - } - printf("%s Kb Max Ideal RSS\n", comma (max_ideal_rss / 1024)); - - if (malloc_count == 0) malloc_count ++; - if (calloc_count == 0) calloc_count ++; - if (realloc_count == 0) realloc_count ++; - if (free_count == 0) free_count ++; - - if (!quick_run) - { - printf("\n"); - printf("sizeof ticks_t is %lu\n", sizeof(ticks_t)); - printf("Avg malloc time: %6s in %10s calls\n", comma(malloc_time/malloc_count), comma(malloc_count)); - printf("Avg calloc time: %6s in %10s calls\n", comma(calloc_time/calloc_count), comma(calloc_count)); - printf("Avg realloc time: %5s in %10s calls\n", comma(realloc_time/realloc_count), comma(realloc_count)); - printf("Avg free time: %8s in %10s calls\n", comma(free_time/free_count), comma(free_count)); - printf("Total call time: %s cycles\n", comma(malloc_time+calloc_time+realloc_time+free_time)); - printf("\n"); - } - -#if 0 - /* Free any still-held chunks of memory. */ - for (idx=0; idx