This is the accompanying website for the following paper:
@inproceedings{KalitaDSZHP24_PAD-VC_IWAENC,
  author      = {Arunava Kr. Kalita and Christian Dittmar and Paolo Sani and Frank Zalkow and Emanu{\"e}l A. P. Habets and Rusha Patra},
  title       = {{PAD-VC}: {A} Prosody-Aware Decoder for Any-to-Few Voice Conversion},
  booktitle   = {Proceedings of the International Workshop on Acoustic Signal Enhancement ({IWAENC})},
  address     = {Aalborg, Denmark},
  year        = {2024},
  pages       = {389--393},
  doi         = {10.1109/IWAENC61483.2024.10694576},
  url-pdf     = {https://ieeexplore.ieee.org/document/10694576},
  url-details = {https://www.audiolabs-erlangen.de/resources/NLUI/2024-PAD-VC},
}
Voice conversion (VC) is the process of synthetically generating speech based on some source speaker recording, aiming to preserve its linguistic information while using a specified target speaker's timbral characteristics. In this paper, we propose PAD-VC, a prosody-aware VC model based on the decoder part of the ForwardTacotron (FT) architecture. We train PAD-VC with prosody features such as pitch, energy, and voicing confidence and augment those with linguistic features derived from a phoneme posteriorgram (PPG) representation of the source utterance. This way, we can handle both phonemic information and frame-wise supra-segmental features. During inference time, the source speaker's prosody features are modified to match the prosody statistics of the target speaker. We show that our proposed PAD-VC surpasses the prosody-cloning performance of FT on unseen source speakers in terms of similarity and naturalness.
Here, we provide audio samples used in our test on speech naturalness, i.e., two synthetic versions (FT-VC and PAD-VC) and a reference utterance obtained by copy synthesis (REF) for each of eight different text prompts. In addition, we provide samples for PAD-VC + PostProGAN and PAD-VC + PostProCFM, where GAN-based post-processing [5] and CFM-based post-processing [6], respectively, have been applied to the mel spectrograms predicted by PAD-VC. Neither of these two conditions was included in the listening tests. Please note that the synthetic versions correspond to a different speaker identity than the reference in this test on speech naturalness.
Here, we provide audio samples used in our test on speaker similarity, i.e., two synthetic versions (FT-VC and PAD-VC) and a target speaker sample obtained by copy synthesis (TGT). In addition, we provide samples for PAD-VC + PostProGAN and PAD-VC + PostProCFM, where GAN-based post-processing [5] and CFM-based post-processing [6], respectively, have been applied to the mel spectrograms predicted by PAD-VC. Neither of these two conditions was included in the listening tests. Finally, we also provide the reference utterance (REF) that was converted to the target speaker's voice; it was likewise not included in the listening test.
The author conducted this work during his stay at Fraunhofer IIS, Erlangen, with the TTS group. This internship was made possible through the generous support and funding provided by IGSTC. This research was partially supported by the Free State of Bavaria in the DSAI project, and it was also supported by the Fraunhofer-Zukunftsstiftung.
@inproceedings{ChurchwellEtAl24_NeuralPPG_ICASSP,
  author    = {Cameron Churchwell and Max Morrison and Bryan Pardo},
  title     = {High-Fidelity Neural Phonetic Posteriorgrams},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Seoul, Korea},
  year      = {2024},
  pages     = {4287--4291},
}
@misc{Schaefer20_ForwardTacotron_Github,
  author       = {Christian Sch{\"a}fer and Ollie McCarthy and contributors},
  title        = {{ForwardTacotron}},
  howpublished = {\url{https://github.com/as-ideas/ForwardTacotron}},
  journal      = {GitHub repository},
  publisher    = {GitHub},
  year         = {2020},
}
@article{SismanEtAl20_VoiceConversionverview_TASLP,
  author  = {Berrak Sisman and Junichi Yamagishi and Simon King and Haizhou Li},
  title   = {An overview of voice conversion and its challenges: {F}rom statistical modeling to deep learning},
  journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
  volume  = {29},
  pages   = {132--157},
  year    = {2020},
}
@inproceedings{SunEtAl16_PPG-VoiceConversion_ICME,
  author    = {Lifa Sun and Kun Li and Hao Wang and Shiyin Kang and Helen Meng},
  title     = {Phonetic posteriorgrams for many-to-one voice conversion without parallel data training},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia and Expo ({ICME})},
  address   = {Seattle, WA, USA},
  year      = {2016},
}
@inproceedings{SaniBZHD23_Postprocessing_ITG,
  author      = {Paolo Sani and Judith Bauer and Frank Zalkow and Emanu{\"e}l A.\ P.\ Habets and Christian Dittmar},
  title       = {Improving the Naturalness of Synthesized Spectrograms for {TTS} Using {GAN}-Based Post-Processing},
  booktitle   = {Proceedings of the {ITG} Conference on Speech Communication},
  address     = {Aachen, Germany},
  year        = {2023},
  pages       = {270--274},
  doi         = {10.30420/456164053},
  url-pdf     = {https://ieeexplore.ieee.org/document/10363041},
  url-details = {https://www.audiolabs-erlangen.de/resources/NLUI/2023-ITG-postprocessing},
}
@inproceedings{ZalkowSKHPD25_LowResourceGenerativePostprocessing_INTERSPEECH,
  author      = {Frank Zalkow and Paolo Sani and Kishor Kayyar Lakshminarayana and Emanu{\"e}l A.\ P.\ Habets and Nicola Pia and Christian Dittmar},
  title       = {Bridging the Training--Inference Gap in {TTS}: {T}raining Strategies for Robust Generative Postprocessing for Low-Resource Speakers},
  booktitle   = {Proceedings of the Conference of the International Speech Communication Association ({INTERSPEECH})},
  address     = {Rotterdam, The Netherlands},
  year        = {2025},
  pages       = {2470--2474},
  doi         = {10.21437/Interspeech.2025-854},
  url-pdf     = {https://www.isca-archive.org/interspeech_2025/zalkow25_interspeech.html},
  url-details = {https://www.audiolabs-erlangen.de/resources/NLUI/2025-Interspeech-LowResGen},
}