% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Entry Bhme:229075 -- doctoral thesis by David Böhme, published as volume 23
% of the IAS series of Forschungszentrum Jülich.
% The school field is derived from the thesis statement in the note field
% ("Aachen, Techn. Hochsch., Diss., 2013") and the host of the record URL.
% Fields such as othercontributors, reportid, cin, cid, typ and urn are not
% standard BibTeX fields; standard styles ignore them.
@PHDTHESIS{Bhme:229075,
  author            = {Böhme, David},
  othercontributors = {Wolf, Felix Gerd Eugen},
  title             = {Characterizing load and communication imbalance in
                       parallel applications},
  school            = {RWTH Aachen University},
  volume            = {23},
  address           = {Jülich},
  publisher         = {Forschungszentrum Jülich, Zentralbibliothek},
  reportid          = {RWTH-CONV-144050},
  series            = {Schriften des Forschungszentrums Jülich : IAS series},
  pages             = {XV, 111 S. : Ill., graph. Darst.},
  year              = {2014},
  note              = {Zsfassung in dt. und engl. Sprache; Zugl.: Aachen, Techn.
                       Hochsch., Diss., 2013},
  abstract          = {The amount of parallelism in modern supercomputers
                       currently grows from generation to generation, and is
                       expected to reach orders of millions of processor cores in a
                       single system in the near future. Further application
                       performance improvements therefore depend to a large extent
                       on software-managed parallelism: in particular, the software
                       must organize data exchange between processing elements
                       efficiently and optimally distribute the workload between
                       them. Performance analysis tools help developers of parallel
                       applications to evaluate and optimize the parallel
                       efficiency of their programs by pinpointing specific
                       performance bottlenecks. However, existing tools are often
                       incapable of identifying complex imbalance patterns and
                       determining their performance impact reliably. This
                       dissertation presents two novel methods to automatically
                       extract imbalance-related performance problems from event
                       traces generated by MPI programs and intuitively guide the
                       performance analyst to inefficiencies whose optimization
                       promises the highest benefit. The first method, the delay
                       analysis, identifies the root causes of wait states. A delay
                       occurs when a program activity needs more time on one
                       process than on another, which leads to the formation of
                       wait states at a subsequent synchronization point. Wait
                       states, which are intervals through which a process is idle
                       while waiting for the delayed process, are the primary
                       symptom of load imbalance in parallel programs. While wait
                       states themselves are easy to detect, the potentially large
                       temporal and spatial distance between wait states and the
                       delays causing them complicates the identification of
                       wait-state root causes. The delay analysis closes this gap,
                       accounting for both short-term and long-term effects. To
                       this end, the delay analysis comprises two contributions of
                       this dissertation: (1) a cost model and terminology to
                       describe the severity of a delay in terms of the overall
                       waiting time it causes; and (2) a scalable algorithm to
                       identify the locations of delays and determine their cost.
                       The second new analysis method is based on the detection of
                       the critical path. In contrast to the delay analysis, which
                       characterizes the formation of wait states, this
                       critical-path analysis determines the effect of imbalance on
                       program runtime. The critical path is the longest execution
                       path in a parallel program without wait states: optimizing
                       an activity on the critical path will reduce the program’s
                       run time. Comparing the duration of activities on the
                       critical path with their duration on each process yields a
                       set of novel, compact performance indicators. These
                       indicators allow users to evaluate load balance, identify
                       performance bottlenecks, and determine the performance
                       impact of load imbalance at first glance by providing an
                       intuitive understanding of complex performance phenomena.
                       Unlike existing statistics-based load balance metrics, these
                       indicators are applicable to both SPMD and MPMD-style
                       programs. Both analysis methods leverage the scalable
                       event-trace analysis technique employed by the Scalasca
                       toolset: by replaying event traces in parallel, the
                       bottleneck search algorithms can harness the distributed
                       memory and computational resources of the target system for
                       the analysis, allowing them to process even large-scale
                       program runs. The scalability and performance insight that
                       the novel analysis approaches provide are demonstrated by
                       evaluating a variety of real-world HPC codes in
                       configurations with up to 262,144 processor cores.},
  keywords          = {Leistungsbewertung (SWD) / Parallelverarbeitung (SWD) /
                       Supercomputer (SWD)},
  cin               = {120000 / 124010},
  ddc               = {004},
  cid               = {$I:(DE-82)120000_20140620$ / $I:(DE-82)124010_20140620$},
  typ               = {PUB:(DE-HGF)11 / PUB:(DE-HGF)3},
  urn               = {urn:nbn:de:hbz:82-opus-49861},
  url               = {https://publications.rwth-aachen.de/record/229075},
}