% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Doctoral thesis by Nitin Malapally (RWTH Aachen University, 2026).
% The fields othercontributors, reportid, cin, ddc, cid and typ are not
% standard BibTeX fields; standard styles silently ignore unknown fields.
% They look like repository export metadata and are kept verbatim.
% NOTE(review): "pages" holds a physical description rather than a page
% range -- confirm whether it should live in another field (e.g. note).
@phdthesis{Malapally:1026256,
  author            = {Malapally, Nitin},
  othercontributors = {Carloni, Paolo and Fyta, Maria},
  title             = {{Exascale}-ready molecular dynamics simulations with
                       efficient algorithms for extreme core counts},
  school            = {RWTH Aachen University},
  type              = {Dissertation},
  address           = {Aachen},
  publisher         = {RWTH Aachen University},
  reportid          = {RWTH-2026-00768},
  pages             = {1 Online-Ressource : Illustrationen},
  year              = {2026},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University 2026; Dissertation, RWTH Aachen University, 2026},
  abstract          = {Biomolecular simulations, realized by molecular dynamics
                       (MD) and enhanced-sampling approaches, are very powerful
                       tools for studying the structural dynamics, kinetics, and
                       energetics of biological systems. In combination with
                       high-performance computing (HPC), increasingly larger
                       systems and longer timescales can be simulated. However, the
                       sequential nature of MD’s time evolution imposes a hard
                       parallel limit, resulting in reduced scalability and hence
                       under-utilization of HPC systems. As a result, standard MD
                       does not reach the typical timescale (millisecond and
                       beyond) required to study many biological processes.
                       Enhanced-sampling techniques, such as umbrella sampling,
                       metadynamics, and replica-exchange MD, do simulate these
                       long processes but often require various techniques to
                       retrieve kinetic and thermodynamic properties and do not
                       scale well. The arrival of exascale computers has made the
                       need for highly scalable algorithms for MD simulations even
                       more urgent. This doctoral thesis reports on my efforts to
                       address these important issues via algorithmic optimization,
                       design and development. The first was an attempt to speed up
                       MD simulations by means of an alternative parallel 3D
                       discrete Fourier transform (3D DFT) algorithm, which was
                       implemented and benchmarked on the JUWELS Cluster, showing
                       comparable scaling performance to the state-of-the-art. In
                       the second, the software apparatus required for a massively
                       parallel MD strategy was constructed within the highly
                       popular GROMACS code and the PLUMED library. The
                       implementation is capable of both multi-CPU and multi-GPU
                       parallelism and was optimized and benchmarked on the JUWELS
                       Booster. The results revealed its multi-modal scalability in
                       that simulations using it can be both efficiently sped up
                       and also greatly extended for a small increase in runtime.
                       The implementation was shown to scale up to $94\%$ of the
                       JUWELS Booster (3,500 GPUs and 42,000 CPUs) with excellent
                       parallel efficiency. Moreover, a plateauing of parallel
                       efficiency was observed at $50\%$ of the JUWELS Booster,
                       which hints at its ability to scale to even higher node
                       counts. This software has the potential to accelerate MD and
                       thereby enable the study of more complex biological
                       processes than was previously practicable.},
  cin               = {137810 / 130000},
  ddc               = {530},
  cid               = {$I:(DE-82)137810_20140620$ / $I:(DE-82)130000_20140620$},
  typ               = {PUB:(DE-HGF)11},
  doi               = {10.18154/RWTH-2026-00768},
  url               = {https://publications.rwth-aachen.de/record/1026256},
}