SandraCLV committed on
Commit
1a4e740
1 Parent(s): dfe885e

Create audio_model.py

Browse files
Files changed (1) hide show
  1. audio_model.py +55 -0
audio_model.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer,SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+
6
# CONSTANTS
# Speaker code -> relative path of the saved speaker-embedding array (.npy)
# that text_to_speech loads with np.load. The file names suggest these come
# from the CMU ARCTIC voices -- TODO confirm.
speaker_embeddings = dict(
    BDL="spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    CLB="spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    RMS="spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SLT="spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)
13
+
14
# Load the SpeechT5 text-to-speech model, its processor and the HiFi-GAN
# vocoder from their pretrained checkpoints (runs once, at import time).
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Preferred torch device name.
# NOTE(review): nothing in the visible code moves the model, vocoder or inputs
# to this device -- confirm whether it is used elsewhere.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
21
+
22
+ ### TEXT TO AUDIO SPEECH MODEL 2
23
+ # Define the function that converts text to speech
24
def _load_speaker_embedding(speaker):
    """Return the speaker embedding to condition synthesis on, as a NumPy array.

    ``"Surprise Me!"`` yields a randomised voice derived from one of the stored
    embeddings. Any other value must be a key of ``speaker_embeddings`` or a
    label that starts with one (for example ``"BDL (male)"``).

    Raises:
        ValueError: if ``speaker`` does not identify a stored embedding.
    """
    if speaker == "Surprise Me!":
        # Start from one of the stored embeddings, picked at random.
        names = list(speaker_embeddings)
        chosen = names[np.random.randint(len(names))]
        embedding = np.load(speaker_embeddings[chosen])

        # Scramble it: shuffle the elements in place ...
        np.random.shuffle(embedding)

        # ... then flip the sign of roughly half of them. The mask is sized
        # from the embedding itself rather than a hard-coded length.
        signs = np.where(np.random.rand(*embedding.shape) >= 0.5, 1.0, -1.0)
        embedding *= signs.astype(embedding.dtype)
        return embedding

    # Named speaker: accept the exact code, or a longer label beginning with it.
    key = speaker if speaker in speaker_embeddings else str(speaker)[:3]
    if key not in speaker_embeddings:
        valid = ", ".join(speaker_embeddings)
        raise ValueError(
            f"Unknown speaker {speaker!r}; expected 'Surprise Me!' or one of: {valid}"
        )
    return np.load(speaker_embeddings[key])


def text_to_speech(text, speaker):
    """Synthesise ``text`` as speech in the voice selected by ``speaker``.

    Args:
        text: The text to speak. ``None``, empty or whitespace-only text
            produces an empty clip without touching the model.
        speaker: ``"Surprise Me!"`` for a randomised voice, otherwise a
            speaker code from ``speaker_embeddings`` (or a label starting
            with one).

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a NumPy ``int16``
        array; 16000 is presumably the sample rate in Hz of the generated
        audio.

    Raises:
        ValueError: if ``speaker`` does not identify a stored embedding.
    """
    sample_rate = 16000

    if not text or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit the input length to what the model's configuration allows.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Add a leading dimension of size 1 before handing the embedding to the model.
    speaker_embedding = torch.tensor(_load_speaker_embedding(speaker)).unsqueeze(0)

    # Generate the audio with the model and vocoder.
    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Clip before scaling so out-of-range samples saturate instead of wrapping
    # around when cast to int16.
    waveform = np.clip(speech.numpy(), -1.0, 1.0)
    return (sample_rate, (waveform * 32767).astype(np.int16))
55
+ ### END TEXT TO AUDIO SPEECH MODEL 2