aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/benchmarks/cfrac.py30
-rw-r--r--src/benchmarks/espresso.py29
-rw-r--r--src/benchmarks/loop.py22
3 files changed, 75 insertions, 6 deletions
diff --git a/src/benchmarks/cfrac.py b/src/benchmarks/cfrac.py
index 9438501..2829b77 100644
--- a/src/benchmarks/cfrac.py
+++ b/src/benchmarks/cfrac.py
@@ -15,12 +15,38 @@
# You should have received a copy of the GNU General Public License
# along with allocbench. If not, see <http://www.gnu.org/licenses/>.
-""" Definition of the cfrac benchmark"""
+"""cfrac is a single threaded implementation of the continued fraction factorization algorithm.
+It uses many small short-lived allocations. Factorizing 175451865205073170563711388363274837927895
+results in 43044885 allocator calls (malloc: 21522444, free: 21522441).
+
+Top 10 allocation sizes 1.00% of all allocations
+1. 18 B occurred 8172763 times
+2. 28 B occurred 3781894 times
+3. 10 B occurred 2989673 times
+4. 26 B occurred 2566937 times
+5. 20 B occurred 2420915 times
+6. 16 B occurred 1168569 times
+7. 12 B occurred 203177 times
+8. 14 B occurred 170914 times
+9. 30 B occurred 21149 times
+10. 44 B occurred 15922 times
+allocations <= 64 21522432
+allocations <= 1024 21522436
+allocations <= 4096 21522443
+
+Histogram of sizes:
+0 - 15 3363764 15.63% *******
+16 - 31 18132778 84.25% ******************************************
+32 - 47 25888 0.12%
+
+The relevant non-functional allocator properties are the raw speed of the
+API functions as well as memory placement strategies with good data locality.
+"""
from src.benchmark import Benchmark
class BenchmarkCfrac(Benchmark):
- """TODO"""
+ """Definition of the cfrac benchmark"""
def __init__(self):
name = "cfrac"
diff --git a/src/benchmarks/espresso.py b/src/benchmarks/espresso.py
index a565132..4ffd843 100644
--- a/src/benchmarks/espresso.py
+++ b/src/benchmarks/espresso.py
@@ -15,7 +15,32 @@
# You should have received a copy of the GNU General Public License
# along with allocbench. If not, see <http://www.gnu.org/licenses/>.
-"""Definition of the espresso benchmark"""
+"""espresso is a single threaded programmable logic array analyzer, described by Grunwald, Zorn,
+and Henderson in their paper "Improving the cache locality of memory allocation".
+
+The file "largest.espresso" shipped with mimalloc-bench and allocbench generates
+a workload with 3367364 allocator calls (malloc: 1659385, free: 1691851, realloc: 16128).
+About 87% of all allocations are smaller than 64 bytes, the common cache line size.
+
+Top 10 allocation sizes 0.91% of all allocations
+1. 48 B occurred 615622 times
+2. 16 B occurred 533267 times
+3. 56 B occurred 235944 times
+4. 72 B occurred 27318 times
+5. 88 B occurred 23640 times
+6. 64 B occurred 22498 times
+7. 80 B occurred 17779 times
+8. 8 B occurred 16336 times
+9. 272 B occurred 14644 times
+10. 96 B occurred 13175 times
+
+allocations <= 64 1442648
+allocations <= 1024 1657509
+allocations <= 4096 1667112
+
+The relevant non-functional allocator properties are the raw speed of the
+API functions as well as memory placement strategies with good data locality.
+"""
import os
@@ -23,7 +48,7 @@ from src.benchmark import Benchmark
import src.globalvars
class BenchmarkEspresso(Benchmark):
- """TODO"""
+ """Definition of the espresso benchmark for allocbench"""
def __init__(self):
name = "espresso"
diff --git a/src/benchmarks/loop.py b/src/benchmarks/loop.py
index 5f016d8..5a957b9 100644
--- a/src/benchmarks/loop.py
+++ b/src/benchmarks/loop.py
@@ -15,7 +15,25 @@
# You should have received a copy of the GNU General Public License
# along with allocbench. If not, see <http://www.gnu.org/licenses/>.
-"""Definition of the loop micro benchmark"""
+"""Definition of the loop micro benchmark
+
+This benchmark allocates and immediately deallocates a pseudo random sized allocation
+N times in T threads. The acquired memory is neither read nor written. Not using the
+allocations at all may seem odd but this micro benchmark should only measure
+the allocator's fast paths, scalability and management overhead.
+Using the allocations will add cache effects to our results which are
+measured for example in the false sharing or larson benchmarks.
+
+Observations:
+* Glibc is a factor of two faster for allocations <= 1024B
+* TCMalloc suffers when allocating only small chunks
+
+Interpretation:
+* A significantly higher cache miss rate than other allocators could mean that
+ internals suffer from false sharing (TCMalloc).
+* Speed changes with constant threads but changing sizes may show performance
+  differences in differing strategies for separate sizes (glibc thread caches < 1032B)
+"""
from src.benchmark import Benchmark
@@ -29,7 +47,7 @@ class BenchmarkLoop(Benchmark):
def __init__(self):
name = "loop"
- self.cmd = "loop{binary_suffix} {nthreads} 1000000 {maxsize}"
+ self.cmd = "loop{binary_suffix} {nthreads} 1000001 {maxsize}"
self.args = {"maxsize": [2 ** x for x in range(6, 16)],
"nthreads": Benchmark.scale_threads_for_cpus(2)}