1+ from transformers import SpeechT5Processor ,SpeechT5ForTextToSpeech ,SpeechT5HifiGan
2+ from datasets import load_dataset
3+ import torch
4+ import random
5+ import string
6+ import soundfile as sf
7+
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor that turns raw text into the tensors consumed by the model.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved onto the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# HiFi-GAN vocoder, moved onto the same device; it is handed to
# model.generate_speech() further down when audio is synthesized.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset of pre-computed speaker embeddings; rows are looked up by index
# and their "xvector" field is used as the voice description.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
17+
# Speaker code -> row index in embeddings_dataset whose "xvector" entry is
# used as that speaker's voice embedding.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)
28+
def _filename_stem(text, max_words=6):
    """Return a filesystem-safe stem made of the first *max_words* words of *text*."""
    # Path separators and characters rejected by common filesystems are
    # replaced with "_" so the resulting name can neither point outside the
    # current directory nor make the write fail.
    unsafe = {ord(ch): "_" for ch in '\\/:*?"<>|'}
    unsafe.update({code: "_" for code in range(32)})
    return "-".join(text.split()[:max_words]).translate(unsafe)


def save_text_to_speech(text, speaker=None):
    """Synthesize *text* to speech and write it to an audio file.

    Args:
        text: The text to speak. Its first six whitespace-separated words
            (sanitized) become part of the output filename.
        speaker: Row index into ``embeddings_dataset`` whose ``"xvector"``
            entry is used as the voice embedding. When ``None``, a random
            ``(1, 512)`` embedding is drawn instead, i.e. a random voice.

    Returns:
        The name of the file that was written, of the form
        ``"<prefix>-<first-words>.mp3"`` where ``<prefix>`` is the speaker
        index, or a random 5-character string when no speaker is given.
    """
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)

    if speaker is not None:
        # load the x-vector holding the speaker's voice characteristics and
        # add a leading batch dimension
        xvector = torch.tensor(embeddings_dataset[speaker]["xvector"])
        speaker_embeddings = xvector.unsqueeze(0).to(device)
        # with a known speaker, the speaker index identifies the file
        prefix = str(speaker)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
        # without a speaker, a random string keeps filenames distinct
        prefix = "".join(random.sample(string.ascii_letters + string.digits, k=5))

    # generate speech with the models
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    output_filename = f"{prefix}-{_filename_stem(text)}.mp3"
    # save the generated speech with a 16 kHz sampling rate
    # NOTE(review): writing ".mp3" depends on the installed soundfile /
    # libsndfile build supporting MP3 - confirm for the target environment.
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename
51+
# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")

# a challenging text with all speakers
text = """In his miracle year, he published four groundbreaking papers.
These outlined the theory of the photoelectric effect, explained Brownian motion,
introduced special relativity, and demonstrated mass-energy equivalence."""

# only the dataset index is needed for synthesis, so iterate over the values
for speaker in speakers.values():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")