add documentation for loop, cfrac, espresso

author: Florian Fischer <florian.fl.fischer@fau.de> 2019-10-14 20:44:08 +0200
committer: Florian Fischer <florian.fl.fischer@fau.de> 2019-10-14 20:44:08 +0200
commit: ea81dfa7929ba83bf4d24e1be34b8384716a637e (patch)
tree: 52ce6b7061f9f9c2e1c8d0ca5ca1343d1f0354e4
parent: 8ca8135f8302a1caafecccdba0671732bb5a3077 (diff)
download: allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.tar.gz
allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.zip
3 files changed, 75 insertions, 6 deletions
diff --git a/src/benchmarks/cfrac.py b/src/benchmarks/cfrac.py
index 9438501..2829b77 100644
--- a/src/benchmarks/cfrac.py
+++ b/src/benchmarks/cfrac.py
@@ -15,12 +15,38 @@
 # You should have received a copy of the GNU General Public License
 # along with allocbench.  If not, see <http://www.gnu.org/licenses/>.
 
-""" Definition of the cfrac benchmark"""
+"""cfrac is a single threaded implementation of the continued fraction factorization algorithm.
+It uses many small short-lived allocations. Factorizing 175451865205073170563711388363274837927895
+results in 43044885 allocator calls (malloc: 21522444, free: 21522441).
+
+Top 10 allocation sizes (about 100% of all allocations):
+1. 18 B occurred 8172763 times
+2. 28 B occurred 3781894 times
+3. 10 B occurred 2989673 times
+4. 26 B occurred 2566937 times
+5. 20 B occurred 2420915 times
+6. 16 B occurred 1168569 times
+7. 12 B occurred 203177 times
+8. 14 B occurred 170914 times
+9. 30 B occurred 21149 times
+10. 44 B occurred 15922 times
+allocations <= 64 21522432
+allocations <= 1024 21522436
+allocations <= 4096 21522443
+
+Histogram of sizes:
+0     -    15 3363764  15.63% *******
+16    -    31 18132778 84.25% ******************************************
+32    -    47 25888    0.12%
+
+The relevant non-functional allocator properties are the raw speed of the
+API functions as well as memory placement strategies with good data locality.
+"""
 
 from src.benchmark import Benchmark
 
 class BenchmarkCfrac(Benchmark):
-    """TODO"""
+    """Definition of the cfrac benchmark"""
     def __init__(self):
         name = "cfrac"
 
diff --git a/src/benchmarks/espresso.py b/src/benchmarks/espresso.py
index a565132..4ffd843 100644
--- a/src/benchmarks/espresso.py
+++ b/src/benchmarks/espresso.py
@@ -15,7 +15,32 @@
 # You should have received a copy of the GNU General Public License
 # along with allocbench.  If not, see <http://www.gnu.org/licenses/>.
 
-"""Definition of the espresso benchmark"""
+"""espresso is a single threaded programmable logic array analyzer, described by Grunwald, Zorn,
+and Henderson in their paper "Improving the cache locality of memory allocation".
+
+The file "largest.espresso" shipped with mimalloc-bench and allocbench generates
+a workload with 3367364 allocator calls (malloc: 1659385, free: 1691851, realloc: 16128).
+About 87% of all allocations are at most 64 bytes, the common cache line size.
+
+Top 10 allocation sizes (about 91% of all allocations):
+1. 48 B occurred 615622 times
+2. 16 B occurred 533267 times
+3. 56 B occurred 235944 times
+4. 72 B occurred 27318 times
+5. 88 B occurred 23640 times
+6. 64 B occurred 22498 times
+7. 80 B occurred 17779 times
+8. 8 B occurred 16336 times
+9. 272 B occurred 14644 times
+10. 96 B occurred 13175 times
+
+allocations <= 64 1442648
+allocations <= 1024 1657509
+allocations <= 4096 1667112
+
+The relevant non-functional allocator properties are the raw speed of the
+API functions as well as memory placement strategies with good data locality.
+"""
 
 import os
 
@@ -23,7 +48,7 @@ from src.benchmark import Benchmark
 import src.globalvars
 
 class BenchmarkEspresso(Benchmark):
-    """TODO"""
+    """Definition of the espresso benchmark for allocbench"""
     def __init__(self):
         name = "espresso"
 
diff --git a/src/benchmarks/loop.py b/src/benchmarks/loop.py
index 5f016d8..5a957b9 100644
--- a/src/benchmarks/loop.py
+++ b/src/benchmarks/loop.py
@@ -15,7 +15,25 @@
 # You should have received a copy of the GNU General Public License
 # along with allocbench.  If not, see <http://www.gnu.org/licenses/>.
 
-"""Definition of the loop micro benchmark"""
+"""Definition of the loop micro benchmark
+
+This benchmark allocates and immediately deallocates a pseudo-randomly sized allocation
+N times in T threads. The acquired memory is neither read nor written. Not using the
+allocations at all may seem odd, but this micro benchmark should only measure
+the allocator's fast paths, scalability and management overhead.
+Using the allocations would add cache effects to our results, which are
+measured for example in the false sharing or larson benchmarks.
+
+Observations:
+* Glibc is a factor of two faster for allocations <= 1024B
+* TCMalloc suffers when allocating only small chunks
+
+Interpretation:
+* A significantly higher cache miss rate than other allocators could mean that
+  internals suffer from false sharing (TCMalloc).
+* Speed changes with constant threads but changing sizes may show performance
+  differences in differing strategies for separate sizes (glibc thread caches < 1032B)
+"""
 
 from src.benchmark import Benchmark
 
@@ -29,7 +47,7 @@ class BenchmarkLoop(Benchmark):
     def __init__(self):
         name = "loop"
 
-        self.cmd = "loop{binary_suffix} {nthreads} 1000000 {maxsize}"
+        self.cmd = "loop{binary_suffix} {nthreads} 1000001 {maxsize}"
 
         self.args = {"maxsize":  [2 ** x for x in range(6, 16)],
                      "nthreads": Benchmark.scale_threads_for_cpus(2)}
author	Florian Fischer <florian.fl.fischer@fau.de>	2019-10-14 20:44:08 +0200
committer	Florian Fischer <florian.fl.fischer@fau.de>	2019-10-14 20:44:08 +0200
commit	ea81dfa7929ba83bf4d24e1be34b8384716a637e (patch)
tree	52ce6b7061f9f9c2e1c8d0ca5ca1343d1f0354e4
parent	8ca8135f8302a1caafecccdba0671732bb5a3077 (diff)
download	allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.tar.gz allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.zip