SandraCLV committed
Commit 30bd11b
1 Parent(s): 8a0e1ef

Update app.py

Files changed (1):
app.py +84 -100
app.py CHANGED
@@ -1,102 +1,86 @@
  import gradio as gr
- from transformers import pipeline,WhisperProcessor, WhisperForConditionalGeneration
  import torch
- import librosa
- import datasets
- from transformers.pipelines.pt_utils import KeyDataset
- from tqdm.auto import tqdm
- import logging
- import time
- import uuid
- import soundfile as sf
- from PIL import Image
-
- # model.py apache license 2.0 Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
- from model import get_pretrained_model, language_to_models
- # demo for a input given image transform into text interpretation, and those text put a speech text to be played
-
- #text to speech code from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
- image_to_text_model = pipeline("image-classification",model="microsoft/beit-base-patch16-224-pt22k-ft22k")
-
- def build_html_output(s: str, style: str = "result_item_success"):
-     return f"""
-     <div class='result'>
-         <div class='result_item {style}'>
-           {s}
-         </div>
-     </div>
-     """
- def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
-     logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
-     sid = int(sid)
-     tts = get_pretrained_model(repo_id, speed)
-
-     start = time.time()
-     audio = tts.generate(text, sid=sid)
-     end = time.time()
-
-     if len(audio.samples) == 0:
-         raise ValueError(
-             "Error in generating audios. Please read previous error messages."
-         )
-
-     duration = len(audio.samples) / audio.sample_rate
-
-     elapsed_seconds = end - start
-     rtf = elapsed_seconds / duration
-
-     info = f"""
-     Wave duration : {duration:.3f} s <br/>
-     Processing time: {elapsed_seconds:.3f} s <br/>
-     RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
-     """
-
-     logging.info(info)
-     logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")
-
-     filename = str(uuid.uuid4())
-     filename = f"{filename}.wav"
-     sf.write(
-         filename,
-         audio.samples,
-         samplerate=audio.sample_rate,
-         subtype="PCM_16",
-     )
-
-     return filename, build_html_output(info)
-
-
- with gr.Blocks() as demo:
-     language_choices = list(language_to_models.keys())
-     inputsImg=gr.Image()
-     idx=0
-     text_output = image_to_text_model(inputsImg)[0]
-     print(text_output)
-     for txt in text_output:
-         output_txt[idx] = gr.Textbox(label=text_output,lines=1,max_lines=1,value=text_output,placeholder="Interpretation")
-         input_sid = gr.Textbox(
-             label="Speaker ID",
-             info="Speaker ID",
-             lines=1,
-             max_lines=1,
-             value="0",
-             placeholder="Speaker ID. Valid only for mult-speaker model")
-
-         input_speed = gr.Slider(
-             minimum=0.1,
-             maximum=10,
-             value=1,
-             step=0.1,
-             label="Speed (larger->faster; smaller->slower)")
-         text_to_speech(language_choices[0],language_to_models[language_choices[0]][0],text_output,input_sid,input_speed)
-         output_audio[idx] = gr.Audio(label="Output")
-         output_info[idx] = gr.HTML(label="Info")
-         idx=idx+1
- demo=gr.Interface(fn=text_to_speech,
-     title="Image to Text Interpretation",
-     inputs=inputsImg,
-     outputs=[output_txt,output_audio,input_sid,input_speed],
-     description="image to audio demo",
-     article = ""
-     )
- demo.launch()
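Note on the removed version above: the classifier is invoked at UI-construction time on the gr.Image component itself (image_to_text_model(inputsImg)), text_to_speech(...) is called directly instead of being bound to an event, and output_txt, output_audio and output_info are indexed before they are ever defined, so the app cannot run as written. The sketch below is not part of this commit; it shows one way the same intent (a BEiT label read aloud via the k2-fsa TTS helpers from model.py) could be wired through a Gradio callback. The helper names and the file-based audio return are carried over from the removed code; everything else is an assumption.

import gradio as gr
import soundfile as sf
from transformers import pipeline
from model import get_pretrained_model, language_to_models  # helpers used by the removed version

classifier = pipeline("image-classification",
                      model="microsoft/beit-base-patch16-224-pt22k-ft22k")

def classify_and_speak(image, sid, speed):
    # Run inference on the uploaded PIL image, not on the gr.Image component.
    label = classifier(image)[0]["label"]
    language = list(language_to_models.keys())[0]
    repo_id = language_to_models[language][0]
    tts = get_pretrained_model(repo_id, speed)
    audio = tts.generate(label, sid=int(sid))
    # Write the generated samples to a wav file, as the removed code did.
    sf.write("output.wav", audio.samples, samplerate=audio.sample_rate, subtype="PCM_16")
    return label, "output.wav"

with gr.Blocks() as demo:
    image_in = gr.Image(type="pil")
    sid_in = gr.Textbox(label="Speaker ID", value="0")
    speed_in = gr.Slider(minimum=0.1, maximum=10, value=1, step=0.1, label="Speed")
    label_out = gr.Textbox(label="Interpretation")
    audio_out = gr.Audio(label="Output")
    gr.Button("Run").click(classify_and_speak,
                           inputs=[image_in, sid_in, speed_in],
                           outputs=[label_out, audio_out])

demo.launch()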
  import gradio as gr
+ from transformers import AutoProcessor, BlipForConditionalGeneration, AutoModelForCausalLM, AutoImageProcessor, VisionEncoderDecoderModel, AutoTokenizer
+ import io
+ import base64
+
+ # from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, Blip2ForConditionalGeneration, VisionEncoderDecoderModel
  import torch
+ import open_clip
+ import openai
+
+ from huggingface_hub import hf_hub_download
+
+ # Load the image-to-text classification model
+ blip_processor_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+ blip_model_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+
+ # Load the text-to-speech model
+ openai.api_key = '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'
+ model_id = "base"
+ #model_version = "2022-01-01"
+ whisper = openai.Model(model_id=model_id)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ blip_model_large.to(device)
+
+ def generate_caption(processor, model, image, tokenizer=None, use_float_16=False):
+     inputs = processor(images=image, return_tensors="pt").to(device)
+
+     if use_float_16:
+         inputs = inputs.to(torch.float16)
+
+     generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
+
+     if tokenizer is not None:
+         generated_caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+     else:
+         generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+     return generated_caption
+
+
+ def generate_caption_coca(model, transform, image):
+     im = transform(image).unsqueeze(0).to(device)
+     with torch.no_grad(), torch.cuda.amp.autocast():
+         generated = model.generate(im, seq_len=20)
+     return open_clip.decode(generated[0].detach()).split("<end_of_text>")[0].replace("<start_of_text>", "")
+
+
+ def generate_captions(image):
+
+     caption_blip_large = generate_caption(blip_processor_large, blip_model_large, image)
+     print(caption_blip_large)
+     return caption_blip_large
+
+
+ # Define the function that converts text to speech
+ def text_to_speech(text):
+     # Generate the audio using the Whisper model
+     response = whisper.generate(prompt=text)
+     print(response)
+     # Extract the audio from the result
+     audio = response.choices[0].audio
+
+     # Encode the audio in base64
+     audio_base64 = base64.b64encode(audio).decode("utf-8")
+
+     # Return the audio as an MP3 file
+     return BytesIO(base64.b64decode(audio_base64))
+
+ # Define the user interface using Gradio
+ inputsImg = [
+     gr.Image(type="pil", label="Imagen"),
+ ]
+
+ outputs = [ gr.Textbox(label="Caption generated by BLIP-large") ]
+ title = "Clasificación de imagen a texto y conversión de texto a voz"
+ description = "Carga una imagen y obtén una descripción de texto de lo que contiene la imagen, así como un archivo de audio que lee el texto en voz alta."
+ examples = []
+
+ interface = gr.Interface(fn=generate_captions,
+                          inputs=inputsImg,
+                          outputs=outputs,
+                          examples=examples,
+                          title=title,
+                          description=description)
+ interface.launch()
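On the new version: only generate_captions is wired into gr.Interface, so the app returns a caption but no audio; text_to_speech relies on openai.Model(model_id=...).generate(prompt=...), which the openai package does not expose (Whisper is a speech-recognition model, not a text-to-speech one); BytesIO is referenced without being imported (only io is); and the API key is committed in plain text instead of being read from a Space secret. A minimal sketch of one way to get audible output from the caption, using gTTS as an assumed stand-in TTS backend (not something this commit uses) on top of the commit's own generate_captions, title and description:

from gtts import gTTS  # assumed stand-in TTS backend, pip install gTTS

def caption_and_speak(image):
    caption = generate_captions(image)                   # BLIP-large caption from above
    gTTS(text=caption, lang="en").save("caption.mp3")    # BLIP captions are English; the lang choice is an assumption
    return caption, "caption.mp3"

interface = gr.Interface(fn=caption_and_speak,
                         inputs=gr.Image(type="pil", label="Imagen"),
                         outputs=[gr.Textbox(label="Caption generated by BLIP-large"),
                                  gr.Audio(label="Audio")],
                         title=title,
                         description=description)
interface.launch()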