This is the accompanying website for the paper "A lightweight neural TTS system for high-quality speech synthesis in German" by Prachi Govalkar, Ahmed Mustafa, Nicola Pia, Judith Bauer, Metehan Yurt, Yiğitcan Özer, and Christian Dittmar, presented at the ITG Conference on Speech Communication 2021.
This paper describes a lightweight neural text-to-speech system for the German language. The system is composed of a non-autoregressive spectrogram predictor, followed by a novel vocoder model called StyleMelGAN. Our complete system has a very tiny footprint of 61 MB and is able to synthesize high-quality speech output faster than real-time both on CPU (2.26x) and GPU (44.71x). We additionally propose a modified version of the vocoder called Multi-band StyleMelGAN, which offers a significant improvement in inference speed with a small trade-off in speech quality. In a perceptual listening test with the complete TTS pipeline, the best configuration achieves a mean opinion score of 3.84 using StyleMelGAN, compared to 4.23 for professional speech recordings.
Below, we provide all speech items that were presented to the human raters in our perceptual listening test. The naming of the different conditions is explained in the following table.
| Name | Description |
|---|---|
| FT + PGHI | ForwardTacotron + Phase Gradient Heap Integration |
| FT + WGLO | ForwardTacotron + WaveGlow |
| FT + MBSMG | ForwardTacotron + Multi-band StyleMelGAN |
| FT + SMG | ForwardTacotron + StyleMelGAN |
| REF | Reference speech recordings |




















% WaveGlow: flow-based neural vocoder (ICASSP 2019).
% The booktitle spells out the proceedings name instead of the bare acronym,
% and the month uses the standard BibTeX macro so the style formats it.
@inproceedings{PrengerVC19_WaveGlow_ICASSP,
  author    = {Ryan Prenger and Rafael Valle and Bryan Catanzaro},
  title     = {{W}ave{G}low: {A} {F}low-based {G}enerative {N}etwork for {S}peech {S}ynthesis},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Brighton, UK},
  month     = may,
  year      = {2019},
  pages     = {3617--3621},
  doi       = {10.1109/ICASSP.2019.8683143},
}
% Comparison of waveform generation and acoustic modeling methods for
% neural-network-based speech synthesis (ICASSP 2018).
% The month is given as the standard BibTeX macro, not a literal string.
@inproceedings{WangLTJY18_NeuralVocodersComparison_ICASSP,
  author    = {Xin Wang and Jaime Lorenzo{-}Trueba and Shinji Takaki and Lauri Juvela and Junichi Yamagishi},
  title     = {{A} {C}omparison of {R}ecent {W}aveform {G}eneration and {A}coustic {M}odeling {M}ethods for {N}eural-{N}etwork-{B}ased {S}peech {S}ynthesis},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, Canada},
  month     = apr,
  year      = {2018},
  pages     = {4804--4808},
}
% Journal article comparing STRAIGHT, glottal, and sinusoidal vocoding
% in statistical parametric speech synthesis (IEEE/ACM TASLP, 2018).
@article{AiraksinenJBYA18_VocoderComparison_IEEE-TASLP,
  author  = {Airaksinen, Manu and Juvela, Lauri and Bollepalli, Bajibabu and Yamagishi, Junichi and Alku, Paavo},
  title   = {A {C}omparison {B}etween {STRAIGHT}, {G}lottal, and {S}inusoidal {V}ocoding in {S}tatistical {P}arametric {S}peech {S}ynthesis},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  year    = {2018},
  volume  = {26},
  number  = {9},
  pages   = {1658--1670},
}
% Comparison of neural vocoders for speech signal reconstruction
% (ISCA Speech Synthesis Workshop, 2019).
% Capital letters are protected individually, as in the neighbouring entries,
% instead of double-bracing the whole title. The booktitle matches the naming
% used for the other Speech Synthesis Workshop entry in this file.
@inproceedings{Govalkar2019_NeuralVocoders_SSW,
  author    = {Prachi Govalkar and Johannes Fischer and Frank Zalkow and Christian Dittmar},
  title     = {A {C}omparison of {R}ecent {N}eural {V}ocoders for {S}peech {S}ignal {R}econstruction},
  booktitle = {Proceedings of the {ISCA} Speech Synthesis Workshop},
  address   = {Vienna, Austria},
  month     = sep,
  year      = {2019},
  pages     = {7--12},
  doi       = {10.21437/SSW.2019-2},
}
% Griffin and Lim: signal estimation from the modified short-time
% Fourier transform (IEEE TASSP, 1984).
@article{GriffinL84_SpecgramInversion_TASSP,
  author  = {Griffin, Daniel W. and Lim, Jae S.},
  title   = {Signal {E}stimation from {M}odified {S}hort-{T}ime {F}ourier {T}ransform},
  journal = {{IEEE} Transactions on Acoustics, Speech, and Signal Processing},
  volume  = {32},
  number  = {2},
  pages   = {236--243},
  year    = {1984},
}
% WaveNet: generative model for raw audio (ISCA Speech Synthesis Workshop, 2016).
% The reference is a single page, so pages holds one number rather than a
% degenerate range; the month uses the standard BibTeX macro.
@inproceedings{OordDZSVGKSK16_WaveNet_SSW,
  author    = {A{\"{a}}ron van den Oord and Sander Dieleman and Heiga Zen and Karen Simonyan and Oriol Vinyals and Alex Graves and Nal Kalchbrenner and Andrew Senior and Koray Kavukcuoglu},
  title     = {{W}ave{N}et: {A} {G}enerative {M}odel for {R}aw {A}udio},
  booktitle = {Proceedings of the {ISCA} Speech Synthesis Workshop},
  address   = {Sunnyvale, CA, USA},
  month     = sep,
  year      = {2016},
  pages     = {125},
}
% MelGAN: GAN-based conditional waveform synthesis (NIPS 2019).
% Title capitals are protected letter by letter, as in the neighbouring
% entries, so sentence-casing styles render all of them consistently.
@inproceedings{KumarKBGTSBBC19_MelGAN_NIPS,
  author    = {Kundan Kumar and Rithesh Kumar and Thibault de Boissiere and Lucas Gestin and Wei Zhen Teoh and Jose Sotelo and Alexandre de Br{\'{e}}bisson and Yoshua Bengio and Aaron C. Courville},
  title     = {Mel{GAN}: {G}enerative {A}dversarial {N}etworks for {C}onditional {W}aveform {S}ynthesis},
  booktitle = {Proceedings of the Annual Conference on Neural Information Processing Systems ({NIPS})},
  address   = {Vancouver, BC, Canada},
  month     = dec,
  year      = {2019},
  pages     = {14881--14892},
}
% Glow: generative flow with invertible 1x1 convolutions (NIPS 2018).
% The middle initial carries its period; the month uses the BibTeX macro.
@inproceedings{KingmaD18_Glow_NIPS,
  author    = {Kingma, Durk P. and Dhariwal, Prafulla},
  title     = {{G}low: {G}enerative {F}low with {I}nvertible 1x1 {C}onvolutions},
  booktitle = {Proceedings of the Annual Conference on Neural Information Processing Systems ({NIPS})},
  address   = {Montr{\'{e}}al, Canada},
  month     = dec,
  year      = {2018},
  pages     = {10215--10224},
}
% ITU-T Recommendation P.800 (subjective determination of transmission quality).
% Stored as a technical report with a corporate author: the extra braces keep
% the author from being parsed as a person with a first and a last name.
% The recommendation number goes in number, and the issuing body in institution.
% NOTE(review): the previous volume value (22) looked like a citation-export
% artifact and was dropped -- confirm.
@techreport{Rec_P800_ITU,
  author      = {{ITU-T}},
  title       = {Methods for Subjective Determination of Transmission Quality},
  type        = {Recommendation},
  number      = {P.800},
  institution = {International Telecommunication Union},
  address     = {Geneva},
  year        = {1996},
}
% ITU-T Recommendation P.808 (crowdsourced subjective evaluation of speech quality).
% Stored as a technical report with a corporate author: the extra braces keep
% the author from being parsed as a person with a first and a last name.
% The recommendation number goes in number, and the issuing body in institution.
@techreport{Rec_P808_ITU,
  author      = {{ITU-T}},
  title       = {Subjective Evaluation of Speech Quality with a Crowdsourcing Approach},
  type        = {Recommendation},
  number      = {P.808},
  institution = {International Telecommunication Union},
  address     = {Geneva},
  year        = {2018},
}
% Web reference for Amazon Mechanical Turk; the title field holds the
% copyright line. The year range uses an en dash (double hyphen).
@misc{MTurk,
  title        = {2005--2018, {A}mazon {M}echanical {T}urk, {I}nc.},
  howpublished = {\url{https://www.mturk.com/}},
  note         = {Accessed: 2020-09-15},
}
% Web reference for Amazon Web Services; the title field holds the
% copyright line and the note records the access date.
@misc{Aws,
  title        = {2020, {A}mazon {W}eb {S}ervices, {I}nc.},
  note         = {Accessed: 2020-08-19},
  howpublished = {\url{https://aws.amazon.com/}},
}
% webMUSHRA case study on web-based listening experiments (Web Audio Conference, 2015).
% Uses the canonical inproceedings type instead of the conference alias.
% Umlauts are written as brace groups that start with the accent command, so
% BibTeX treats each as a single letter for sorting and labels.
% The redundant outer title braces are gone; every capital that must survive
% sentence-casing is still protected individually.
@inproceedings{SchoefflerSEH15_WebMUSHRA_WAC,
  author    = {Michael Schoeffler and Fabian-Robert St{\"o}ter and Bernd Edler and J{\"u}rgen Herre},
  title     = {{T}owards the {N}ext {G}eneration of {W}eb-based {E}xperiments: {A} {C}ase {S}tudy {A}ssessing {B}asic {A}udio {Q}uality {F}ollowing the {ITU}-{R} {R}ecommendation {BS}.1534 ({MUSHRA})},
  booktitle = {Web Audio Conference},
  address   = {Paris, France},
  month     = jan,
  year      = {2015},
}
% Natural TTS synthesis by conditioning WaveNet on mel-spectrogram
% predictions (ICASSP 2018).
% "WaveNet" is spelled with its internal capital, matching the WaveNet entry
% in this file. Capitals are protected individually rather than by
% double-bracing the whole title, and the booktitle spells out the
% proceedings name instead of the bare acronym.
@inproceedings{Shen2018_Natural_ICASSP,
  author    = {Jonathan Shen and Ruoming Pang and Ron J. Weiss and Mike Schuster and Navdeep Jaitly and Zongheng Yang and Zhifeng Chen and Yu Zhang and Yuxuan Wang and R. J. Skerry{-}Ryan and Rif A. Saurous and Yannis Agiomyrgiannakis and Yonghui Wu},
  title     = {Natural {TTS} {S}ynthesis by {C}onditioning {W}ave{N}et on {M}el-{S}pectrogram {P}redictions},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  address   = {Calgary, AB, Canada},
  month     = apr,
  year      = {2018},
  pages     = {4779--4783},
}
% Non-iterative phase reconstruction from STFT magnitude, i.e. Phase Gradient
% Heap Integration (IEEE/ACM TASLP, 2017).
% Accented letters are written as brace groups that start with the accent
% command, so BibTeX sorts and labels them as single letters.
% NOTE(review): the first author's surname is written with a ring accent on
% the u, which the previous value lacked -- confirm against the paper.
@article{Prusa2017_PGHI_IEEE,
  author  = {Zden{\v{e}}k Pr{\r{u}}{\v{s}}a and P{\'{e}}ter Bal{\'{a}}zs and Peter L. S{\o}ndergaard},
  title   = {{A} {N}oniterative {M}ethod for {R}econstruction of {P}hase {F}rom {STFT} {M}agnitude},
  journal = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume  = {25},
  number  = {5},
  pages   = {1154--1164},
  year    = {2017},
}
% AlignTTS: feed-forward text-to-speech without explicit alignment (ICASSP 2020).
% The page range uses a double hyphen, the empty volume and number fields are
% removed, and the system name is brace-protected so sentence-casing styles
% keep its capitals.
@inproceedings{zeng2020aligntts,
  author    = {Zeng, Zhen and Wang, Jianzong and Cheng, Ning and Xia, Tian and Xiao, Jing},
  title     = {{AlignTTS}: Efficient Feed-Forward Text-to-Speech System Without Explicit Alignment},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  year      = {2020},
  pages     = {6714--6718},
  doi       = {10.1109/ICASSP40776.2020.9054119},
}
% End-to-end adversarial text-to-speech (arXiv preprint 2006.03575, via dblp).
% NOTE(review): the third author's name is restored with Polish diacritics
% (l with stroke, n with acute) in BibTeX special-character form -- confirm
% the spelling against the paper.
@article{donahue2020end,
  author        = {Jeff Donahue and Sander Dieleman and Miko{\l}aj Bi{\'n}kowski and Erich Elsen and Karen Simonyan},
  title         = {End-to-End Adversarial Text-to-Speech},
  journal       = {CoRR},
  volume        = {abs/2006.03575},
  year          = {2020},
  url           = {https://arxiv.org/abs/2006.03575},
  archivePrefix = {arXiv},
  eprint        = {2006.03575},
  timestamp     = {Tue, 09 Jun 2020 16:38:02 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2006-03575.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}
% EfficientTTS text-to-speech architecture (arXiv preprint 2012.03500).
% The volume follows the abs/ID form used by the other CoRR entries in this
% file, and the arXiv identifier is also given in eprint so that
% archivePrefix and primaryClass have something to attach to.
% The system name is brace-protected against sentence-casing.
@article{miao2020efficienttts,
  author        = {Chenfeng Miao and Shuang Liang and Zhencheng Liu and Minchuan Chen and Jun Ma and Shaojun Wang and Jing Xiao},
  title         = {{EfficientTTS}: An Efficient and High-Quality Text-to-Speech Architecture},
  journal       = {CoRR},
  volume        = {abs/2012.03500},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2012.03500},
  primaryClass  = {eess.AS},
}
% JDI-T: jointly trained duration-informed Transformer for text-to-speech
% (arXiv preprint 2005.07799).
% The volume follows the abs/ID form used by the other CoRR entries in this
% file, and the arXiv identifier is also recorded in eprint/archivePrefix.
% The acronym is brace-protected against sentence-casing.
@article{lim2020jdi,
  author        = {Dan Lim and Won Jang and Gyeonghwan O and Heayoung Park and Bongwan Kim and Jaesam Yoon},
  title         = {{JDI-T}: Jointly trained Duration Informed Transformer for Text-To-Speech without Explicit Alignment},
  journal       = {CoRR},
  volume        = {abs/2005.07799},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2005.07799},
}
% FastSpeech: fast, robust, and controllable text-to-speech
% (arXiv preprint 1905.09263, via dblp).
% The system name is brace-protected so sentence-casing styles keep the
% internal capital; the arXiv link uses https.
@article{ren2019fastspeech,
  author        = {Yi Ren and Yangjun Ruan and Xu Tan and Tao Qin and Sheng Zhao and Zhou Zhao and Tie{-}Yan Liu},
  title         = {{FastSpeech}: Fast, Robust and Controllable Text to Speech},
  journal       = {CoRR},
  volume        = {abs/1905.09263},
  year          = {2019},
  url           = {https://arxiv.org/abs/1905.09263},
  archivePrefix = {arXiv},
  eprint        = {1905.09263},
  timestamp     = {Wed, 11 Nov 2020 08:48:07 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1905-09263.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}
% Semantic image synthesis with spatially-adaptive normalization (CVPR 2019).
@inproceedings{park2019semantic,
  author    = {Park, Taesung and Liu, Ming-Yu and Wang, Ting-Chun and Zhu, Jun-Yan},
  title     = {Semantic image synthesis with spatially-adaptive normalization},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2019},
  pages     = {2337--2346},
}
% High-fidelity speech synthesis with adversarial networks
% (arXiv preprint 1909.11646, via dblp).
% NOTE(review): the first author's name is restored with Polish diacritics
% (l with stroke, n with acute) in BibTeX special-character form -- confirm
% the spelling against the paper. The arXiv link uses https.
@article{binkowski2019high,
  author        = {Miko{\l}aj Bi{\'n}kowski and Jeff Donahue and Sander Dieleman and Aidan Clark and Erich Elsen and Norman Casagrande and Luis C. Cobo and Karen Simonyan},
  title         = {High Fidelity Speech Synthesis with Adversarial Networks},
  journal       = {CoRR},
  volume        = {abs/1909.11646},
  year          = {2019},
  url           = {https://arxiv.org/abs/1909.11646},
  archivePrefix = {arXiv},
  eprint        = {1909.11646},
  timestamp     = {Fri, 27 Sep 2019 13:04:21 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1909-11646.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}
% Soft-DTW: differentiable loss function for time series (ICML 2017).
% The acronym is upper-cased and brace-protected. The rest of the title is
% entered in title case so styles can downcase it if they need to.
@inproceedings{cuturi2017soft,
  author       = {Cuturi, Marco and Blondel, Mathieu},
  title        = {Soft-{DTW}: A Differentiable Loss Function for Time-Series},
  booktitle    = {International Conference on Machine Learning},
  year         = {2017},
  pages        = {894--903},
  organization = {PMLR},
}
% Tacotron: end-to-end speech synthesis (Interspeech 2017).
% The initials of the second author are separated by a space so BibTeX parses
% them as two given-name tokens. The link uses the current https doi.org
% resolver rather than the legacy dx.doi.org host.
@inproceedings{wang2017tacotron,
  author    = {Yuxuan Wang and R. J. Skerry-Ryan and Daisy Stanton and Yonghui Wu and Ron J. Weiss and Navdeep Jaitly and Zongheng Yang and Ying Xiao and Zhifeng Chen and Samy Bengio and Quoc Le and Yannis Agiomyrgiannakis and Rob Clark and Rif A. Saurous},
  title     = {Tacotron: Towards End-to-End Speech Synthesis},
  booktitle = {Proc. Interspeech 2017},
  year      = {2017},
  pages     = {4006--4010},
  doi       = {10.21437/Interspeech.2017-1452},
  url       = {https://doi.org/10.21437/Interspeech.2017-1452},
}
% ForwardTacotron source repository on GitHub; the note records the access month.
@misc{Schaefer2020,
  author       = {Schaefer, C.},
  title        = {ForwardTacotron},
  howpublished = {\url{https://github.com/as-ideas/ForwardTacotron}},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  year         = {2020},
  note         = {Accessed: 2020-06},
}
% Instance normalization for fast stylization (arXiv preprint 1607.08022, via dblp).
% The entry ends at its own closing brace; no extra brace follows it, which
% keeps the file brace-balanced for strict parsers.
@article{ulyanov2016instance,
  author        = {Dmitry Ulyanov and Andrea Vedaldi and Victor S. Lempitsky},
  title         = {Instance Normalization: The Missing Ingredient for Fast Stylization},
  journal       = {CoRR},
  volume        = {abs/1607.08022},
  year          = {2016},
  url           = {http://arxiv.org/abs/1607.08022},
  archivePrefix = {arXiv},
  eprint        = {1607.08022},
  timestamp     = {Mon, 13 Aug 2018 16:47:58 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/UlyanovVL16.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}
% StyleMelGAN: adversarial vocoder with temporal adaptive normalization (ICASSP 2021).
% The page range uses a double hyphen, the empty volume and number fields are
% removed, and the model name is brace-protected so sentence-casing styles
% keep its capitals.
@inproceedings{mustafa2020stylemelgan,
  author    = {Mustafa, Ahmed and Pia, Nicola and Fuchs, Guillaume},
  title     = {{StyleMelGAN}: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  year      = {2021},
  pages     = {6034--6038},
  doi       = {10.1109/ICASSP39728.2021.9413605},
}
% Multi-band MelGAN: faster waveform generation for text-to-speech (IEEE SLT 2021).
% The model name is brace-protected so sentence-casing styles keep its
% capitals; the title is entered in title case so styles can downcase it.
@inproceedings{yang2021multi,
  author       = {Yang, Geng and Yang, Shan and Liu, Kai and Fang, Peng and Chen, Wei and Xie, Lei},
  title        = {Multi-Band {MelGAN}: Faster Waveform Generation for High-Quality Text-to-Speech},
  booktitle    = {2021 {IEEE} Spoken Language Technology Workshop ({SLT})},
  year         = {2021},
  pages        = {492--498},
  organization = {IEEE},
}
% Parallel WaveGAN: GAN-based waveform generation with multi-resolution
% spectrogram (ICASSP 2020).
% The model name is brace-protected against sentence-casing, and the
% export-style booktitle is replaced by the proceedings name.
@inproceedings{yamamoto2020parallel,
  author       = {Yamamoto, Ryuichi and Song, Eunwoo and Kim, Jae-Min},
  title        = {Parallel {WaveGAN}: A Fast Waveform Generation Model Based on Generative Adversarial Networks with Multi-Resolution Spectrogram},
  booktitle    = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing ({ICASSP})},
  year         = {2020},
  pages        = {6199--6203},
  organization = {IEEE},
}
% Near-perfect-reconstruction pseudo-QMF filter banks
% (IEEE Transactions on Signal Processing, 1994).
% The acronym is brace-protected, the journal name is properly capitalised,
% and the author's middle initial carries its period.
@article{nguyen1994near,
  author    = {Nguyen, Truong Q.},
  title     = {Near-Perfect-Reconstruction Pseudo-{QMF} Banks},
  journal   = {{IEEE} Transactions on Signal Processing},
  volume    = {42},
  number    = {1},
  pages     = {65--76},
  year      = {1994},
  publisher = {IEEE},
}
% Spectral energy distance for parallel speech synthesis (arXiv preprint 2008.01160).
% The arXiv identifier is also recorded in eprint/archivePrefix, like the
% other CoRR entries in this file.
% NOTE(review): the first author's given name is expanded from the bare
% initial -- confirm against the paper.
@article{gritsenko2020spectral,
  author        = {Alexey A. Gritsenko and Tim Salimans and Rianne van den Berg and Jasper Snoek and Nal Kalchbrenner},
  title         = {A Spectral Energy Distance for Parallel Speech Synthesis},
  journal       = {CoRR},
  volume        = {abs/2008.01160},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2008.01160},
}