This is the accompanying website for the following paper:
@inproceedings{ZalkowSKHPD25_LowResourceGenerativePostprocessing_ICASSP,
  author      = {Frank Zalkow and Paolo Sani and Kishor Kayyar Lakshminarayana and Emanu{\"e}l A. P. Habets and Nicola Pia and Christian Dittmar},
  title       = {Bridging the Training--Inference Gap in {TTS}: {Training} Strategies for Robust Generative Postprocessing for Low-Resource Speakers},
  booktitle   = {Proceedings of the Conference of the International Speech Communication Association ({INTERSPEECH})},
  address     = {Rotterdam, The Netherlands},
  year        = {2025},
  pages       = {},
  doi         = {},
  url-pdf     = {},
  url-details = {https://www.audiolabs-erlangen.de/resources/NLUI/2025-Interspeech-LowResGen},
}
Modern text-to-speech synthesis systems usually consist of an acoustic model generating speech features, e.g., mel spectrograms, and a vocoder converting them into speech waveforms. The vocoder is typically trained with ground-truth features but receives features from the acoustic model during inference, leading to a mismatch between training and inference. To address this issue, previous work proposed employing generative postprocessing models to make the synthetic features appear more natural. While such systems can produce speech nearly indistinguishable from real speech when sufficient training data is available, their performance degrades with limited data. To mitigate this limitation, we propose a training data generation procedure using a subsampling strategy and multiple acoustic models. We evaluate it through listening tests, demonstrating consistent improvements in the naturalness of the synthetic speech across different postprocessing models and low-resource target speakers.
The following samples have been used in the listening tests reported in the paper.
mic_F05_si1232
mic_F05_si1247
mic_F05_si1282
mic_F05_si1288
mic_F05_si1303
mic_F05_si1325
mic_F05_si1337
mic_F05_si1344
mic_F05_si1362
mic_F05_si1380
mic_F05_sx191
mic_F05_sx215
mic_M03_si842
mic_M03_si844
mic_M03_si858
mic_M03_si879
mic_M03_si955
mic_M03_si958
mic_M03_si962
mic_M03_si965
mic_M03_si977
mic_M03_sx112
mic_M03_sx113
mic_M03_sx118
The International Audio Laboratories Erlangen are a joint institution of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) and Fraunhofer Institute for Integrated Circuits IIS. This research was partially supported by the Free State of Bavaria in the DSAI project and by the Fraunhofer-Zukunftsstiftung. The authors gratefully acknowledge the scientific support and HPC resources provided by the Erlangen National High Performance Computing Center (NHR@FAU) of the Friedrich-Alexander-Universität Erlangen-Nürnberg (FAU) under the NHR project b215dc. NHR funding is provided by federal and Bavarian state authorities. NHR@FAU hardware is partially funded by the German Research Foundation (DFG) – 440719683.
@inproceedings{RenEtAl22_Revisiting_ACL,
  author    = {Yi Ren and Xu Tan and Tao Qin and Zhou Zhao and Tie{-}Yan Liu},
  title     = {Revisiting Over-Smoothness in Text to Speech},
  booktitle = {Proceedings of the Annual Meeting of the Association for Computational Linguistics},
  address   = {Dublin, Ireland},
  pages     = {8197--8213},
  year      = {2022},
}
@inproceedings{SaniEtAl23_PostProcessingGAN_ITG,
  author    = {Paolo Sani and Judith Bauer and Frank Zalkow and Emanu{\"e}l A. P. Habets and Christian Dittmar},
  title     = {Improving the Naturalness of Synthesized Spectrograms for {TTS} Using {GAN}-Based Post-Processing},
  booktitle = {Proceedings of the {ITG} Conference on Speech Communication},
  address   = {Aachen, Germany},
  pages     = {270--274},
  year      = {2023},
}
@inproceedings{ZalkowEtAl23_AudioLabs_Blizzard,
  author    = {Frank Zalkow and Paolo Sani and Michael Fast and Judith Bauer and Mohammad Joshaghani and Kishor Kayyar and Emanu{\"e}l A. P. Habets and Christian Dittmar},
  title     = {The {AudioLabs} System for the {Blizzard} {Challenge} 2023},
  booktitle = {Proceedings of the Blizzard Challenge Workshop},
  address   = {Grenoble, France},
  pages     = {63--68},
  year      = {2023},
}
@inproceedings{PiaEtAl25_FlowMAC_ICASSP,
  author    = {Nicola Pia and Martin Strauss and Markus Multrus and Bernd Edler},
  title     = {{FlowMAC}: Conditional Flow Matching for Audio Coding at Low Bit Rates},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Hyderabad, India},
  year      = {2025},
}