diff options
| author | Florian Fischer <florian.fl.fischer@fau.de> | 2019-10-14 20:44:08 +0200 |
|---|---|---|
| committer | Florian Fischer <florian.fl.fischer@fau.de> | 2019-10-14 20:44:08 +0200 |
| commit | ea81dfa7929ba83bf4d24e1be34b8384716a637e (patch) | |
| tree | 52ce6b7061f9f9c2e1c8d0ca5ca1343d1f0354e4 | |
| parent | 8ca8135f8302a1caafecccdba0671732bb5a3077 (diff) | |
| download | allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.tar.gz allocbench-ea81dfa7929ba83bf4d24e1be34b8384716a637e.zip | |
add documentation for loop, cfrac, espresso
| -rw-r--r-- | src/benchmarks/cfrac.py | 30 | ||||
| -rw-r--r-- | src/benchmarks/espresso.py | 29 | ||||
| -rw-r--r-- | src/benchmarks/loop.py | 22 |
3 files changed, 75 insertions, 6 deletions
diff --git a/src/benchmarks/cfrac.py b/src/benchmarks/cfrac.py index 9438501..2829b77 100644 --- a/src/benchmarks/cfrac.py +++ b/src/benchmarks/cfrac.py @@ -15,12 +15,38 @@ # You should have received a copy of the GNU General Public License # along with allocbench. If not, see <http://www.gnu.org/licenses/>. -""" Definition of the cfrac benchmark""" +"""cfrac is a single threaded implementation of the continued fraction factorization algorithm. +It uses many small short-lived allocations. Factorizing 175451865205073170563711388363274837927895 +results in 43044885 allocator calls (malloc: 21522444, free: 21522441). + +Top 10 allocation sizes 1.00% of all allocations +1. 18 B occurred 8172763 times +2. 28 B occurred 3781894 times +3. 10 B occurred 2989673 times +4. 26 B occurred 2566937 times +5. 20 B occurred 2420915 times +6. 16 B occurred 1168569 times +7. 12 B occurred 203177 times +8. 14 B occurred 170914 times +9. 30 B occurred 21149 times +10. 44 B occurred 15922 times +allocations <= 64 21522432 +allocations <= 1024 21522436 +allocations <= 4096 21522443 + +Histogram of sizes: +0 - 15 3363764 15.63% ******* +16 - 31 18132778 84.25% ****************************************** +32 - 47 25888 0.12% + +The relevant non functional allocator properties are the raw speed of the +API function as well as memory placement strategies with good data locality. +""" from src.benchmark import Benchmark class BenchmarkCfrac(Benchmark): - """TODO""" + """Definition of the cfrac benchmark""" def __init__(self): name = "cfrac" diff --git a/src/benchmarks/espresso.py b/src/benchmarks/espresso.py index a565132..4ffd843 100644 --- a/src/benchmarks/espresso.py +++ b/src/benchmarks/espresso.py @@ -15,7 +15,32 @@ # You should have received a copy of the GNU General Public License # along with allocbench. If not, see <http://www.gnu.org/licenses/>. 
-"""Definition of the espresso benchmark""" +"""espresso is a single threaded programmable logic array analyzer, described by Grunwald, Zorn, +and Henderson in their paper "Improving the cache locality of memory allocation". + +The file "largest.espresso" shipped with mimalloc-bench and allocbench generates +a workload with 3367364 allocator calls (malloc: 1659385, free: 1691851, realloc: 16128). +About 87% of all allocations are smaller than 64 Byte, the common cache line size. + +Top 10 allocation sizes 0.91% of all allocations +1. 48 B occurred 615622 times +2. 16 B occurred 533267 times +3. 56 B occurred 235944 times +4. 72 B occurred 27318 times +5. 88 B occurred 23640 times +6. 64 B occurred 22498 times +7. 80 B occurred 17779 times +8. 8 B occurred 16336 times +9. 272 B occurred 14644 times +10. 96 B occurred 13175 times + +allocations <= 64 1442648 +allocations <= 1024 1657509 +allocations <= 4096 1667112 + +The relevant non functional allocator properties are the raw speed of the +API function as well as memory placement strategies with good data locality. +""" import os @@ -23,7 +48,7 @@ from src.benchmark import Benchmark import src.globalvars class BenchmarkEspresso(Benchmark): - """TODO""" + """Definition of the espresso benchmark for allocbench""" def __init__(self): name = "espresso" diff --git a/src/benchmarks/loop.py b/src/benchmarks/loop.py index 5f016d8..5a957b9 100644 --- a/src/benchmarks/loop.py +++ b/src/benchmarks/loop.py @@ -15,7 +15,25 @@ # You should have received a copy of the GNU General Public License # along with allocbench. If not, see <http://www.gnu.org/licenses/>. -"""Definition of the loop micro benchmark""" +"""Definition of the loop micro benchmark + +This benchmark allocates and immediately deallocates a pseudo random sized allocation +N times in T threads. The acquired memory is neither read nor written. 
Not using the +allocations at all may seem odd but this micro benchmark should only measure +the allocator's fast paths, scalability and management overhead. +Using the allocations will add cache effects to our results which are +measured for example in the false sharing or larson benchmarks. + +Observations: +* Glibc is a factor of two faster for allocations <= 1024B +* TCMalloc suffers when allocating only small chunks + +Interpretation: +* A significantly higher cache miss rate than other allocators could mean that + internals suffer from false sharing (TCMalloc). +* Speed changes with constant threads but changing sizes may show performance + differences in differing strategies for separate sizes (glibc thread caches < 1032B) +""" from src.benchmark import Benchmark @@ -29,7 +47,7 @@ class BenchmarkLoop(Benchmark): def __init__(self): name = "loop" - self.cmd = "loop{binary_suffix} {nthreads} 1000000 {maxsize}" + self.cmd = "loop{binary_suffix} {nthreads} 1000001 {maxsize}" self.args = {"maxsize": [2 ** x for x in range(6, 16)], "nthreads": Benchmark.scale_threads_for_cpus(2)} |
