% Conference paper in the ICA3PP'17 proceedings (Lecture Notes in Computer Science, vol. 10393).
% Accented letters are written as BibTeX special characters ({\~a}, {\c{c}}) so that
% sorting and label generation treat them as single letters; proper nouns in the
% title are braced so sentence-casing styles keep their capitalisation.
@inproceedings{teixeira_et_al_ICA3PP_17,
  author    = {Teixeira, Andreia Sofia and Monteiro, Pedro T. and Carri{\c{c}}o, Jo{\~a}o A. and Santos, Francisco C. and Francisco, Alexandre P.},
  title     = {Using {Spark} and {GraphX} to Parallelize Large-Scale Simulations of Bacterial Populations over Host Contact Networks},
  booktitle = {Proceedings of the 17th International Conference on Algorithms and Architectures for Parallel Processing {ICA3PP}'17 (Helsinki, Finland)},
  editor    = {Ibrahim, Shadi and Choo, Kim-Kwang Raymond and Yan, Zheng and Pedrycz, Witold},
  series    = {Lecture Notes in Computer Science},
  volume    = {10393},
  pages     = {591--600},
  publisher = {Springer Verlag},
  year      = {2017},
  abstract  = {Large-scale population genetics studies are fundamental for phylogenetic and epidemiology analysis of pathogens. And the validation of both evolutionary models and methods used in such studies depend on large data analysis. It is, however, unrealistic to work with large datasets as only rather small samples of the real pathogen population are available. On the other hand, given model complexity and required population sizes, large-scale simulations are the only way to address this issue. In this paper we study how to efficiently parallelize such extensive simulations on top of Apache Spark, making use of both the MapReduce programming model and the GraphX API. We propose a simulation framework for large bacterial populations, over host contact networks, implementing the Wright-Fisher model. The experimental evaluation shows that we can effectively speedup simulations. We also evaluate inherent parallelism limits, drawing conclusions on the relation between cluster computing power and simulations speedup.},
}