src/benchmarks/larson.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

# Copyright 2018-2019 Florian Fischer <florian.fl.fischer@fau.de>
#
# This file is part of allocbench.
#
# allocbench is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# allocbench is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with allocbench.  If not, see <http://www.gnu.org/licenses/>.
"""Larson server benchmark

This benchmark was built by Paul Larson at Microsoft Research. It
simulates a server: each thread has a set of allocations, from which it selects
a random slot. The allocation in this slot is freed, and a new one with a random
size is allocated, written to, and stored in the selected slot. When a thread has
finished its allocations, it passes its objects to a new thread.

Larson benchmark usage: ./larson sleep min-size max-size chunks malloc_frees seed threads

In the paper "Memory Allocation for Long-Running Server Applications" the authors
use 1000 chunks per thread and 50000 malloc and free pairs per thread which
correspond to a "bleeding rate" of 2% which they observed in real world systems.
The allocations are uniformly distributed between min-size and max-size.

allocbench runs larson with different distributions and thread counts.
The other arguments are the same as the original authors used in their paper.

mimalloc-bench uses 1000 chunks per thread and 10000 malloc and free pairs
simulating 10% bleeding. I don't know why they use different values than the
original paper.


Interpretation:

This benchmark is intended to model a real world server workload,
but its use of a uniform distribution of allocation sizes clearly differs from
real applications. Nevertheless, the results can serve as a metric of scalability
and false sharing because it uses multiple threads which pass memory around.
"""

import re

from src.benchmark import Benchmark
import src.plots as plt

# Matches the stdout line that reports the measured throughput and captures the
# integer operation count in the named group "throughput". The trailing period
# is escaped so that only a literal "." terminates the line.
THROUGHPUT_RE = re.compile(
    r"^Throughput =\s*(?P<throughput>\d+) operations per second\.$")


class BenchmarkLarson(Benchmark):
    """Larson benchmark run over several maximum sizes and thread counts.

    Sets up the larson command line, extracts the throughput reported on the
    program's stdout and requests plots of the throughput and of the L1 data
    cache miss ratio.
    """

    def __init__(self):
        # The fixed arguments follow the paper "Memory Allocation for
        # Long-Running Server Applications" by Larson and Krishnan; only the
        # maximum allocation size and the thread count are substituted per run.
        self.cmd = "larson{binary_suffix} 5 8 {maxsize} 1000 50000 1 {threads}"

        max_sizes = [64, 512, 1024]
        thread_counts = Benchmark.scale_threads_for_cpus(2)
        self.args = {"maxsize": max_sizes, "threads": thread_counts}

        self.requirements = ["larson"]

        # Keep the assignments above ahead of the base-class initializer;
        # presumably Benchmark.__init__ consumes them - verify before reordering.
        super().__init__("larson")

    @staticmethod
    def process_output(result, stdout, stderr, target, perm):
        """Store the throughput reported in stdout as result["throughput"].

        Only the first line matching THROUGHPUT_RE is used and its value is
        stored as an int. If no line matches, result is left untouched.
        stderr, target and perm are not used.
        """
        candidates = (THROUGHPUT_RE.match(text) for text in stdout.splitlines())
        hit = next((found for found in candidates if found), None)
        if hit is not None:
            result["throughput"] = int(hit.group("throughput"))

    def summary(self):
        """Request the throughput and L1 cache miss plots for this benchmark."""
        # One row per plot: (y expression, ylabel, title, file postfix).
        # The label and title strings are passed through verbatim; they look
        # like expressions evaluated by plot_fixed_arg - TODO confirm.
        plot_specs = (
            # threads->throughput and maxsize->throughput
            ("{throughput}/1000000",
             "'MOPS/s'",
             "'Larson: ' + arg + ' ' + str(arg_value)",
             "throughput"),
            ("({L1-dcache-load-misses}/{L1-dcache-loads})*100",
             "'l1 cache misses in %'",
             "'Larson cache misses: ' + arg + ' ' + str(arg_value)",
             "cachemisses"),
        )

        for y_expression, label, heading, postfix in plot_specs:
            plt.plot_fixed_arg(self,
                               y_expression,
                               ylabel=label,
                               title=heading,
                               filepostfix=postfix)


# Module-level benchmark instance, created on import.
# NOTE(review): presumably looked up by name by the allocbench runner - confirm.
larson = BenchmarkLarson()