LiquidoNoNewtoniano committed on
Commit 26fe3ab · 1 Parent(s): 46af2dd

Update app.py

Files changed (1): app.py (+18, -5)
app.py CHANGED
@@ -1,14 +1,20 @@
 import gradio as gr
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, DiffusionPipeline, DPMSolverMultistepScheduler
 import torch
 from PIL import Image
+from diffusers.utils import export_to_video

 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

+pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+pipe = pipe.to(device)

 max_length = 16
 num_beams = 4
@@ -25,15 +31,22 @@ def image_to_text(image_paths):
     preds = [pred.strip() for pred in preds]
     return preds[0]

+def text_to_video(image_paths):
+    prompt = image_to_text(image_paths)
+    video_frames = pipe(prompt, num_inference_steps=25).frames
+    video_path = export_to_video(video_frames)
+    return video_frames
+
+
 title = ""
 description = ""

 interface = gr.Interface(
-    fn=image_to_text,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(),
+    fn=text_to_video,
+    inputs=gr.inputs.Image(type="pil"),
+    outputs=gr.Video(),
     title=title,
-    description=description
+    description=description,

 )
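
A few things are worth flagging in the committed version. `DiffusionPipeline` and `DPMSolverMultistepScheduler` are provided by `diffusers`, not `transformers`, so the updated import line raises an `ImportError`. `text_to_video` writes the frames to disk via `export_to_video` (which returns a file path) but then returns the raw frames, while the `gr.Video()` output component expects a path. Smaller points: `gr.inputs.Image` is the deprecated namespace (the previous revision already used `gr.Image`), and calling `pipe.to(device)` after `enable_model_cpu_offload()` is redundant because offloading already manages device placement. Below is a minimal corrected sketch; the body of `image_to_text` is elided by this diff, so the captioning steps shown are the standard ViT-GPT2 recipe and, like the final `launch()` call, should be read as assumptions rather than the Space's actual code.

import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler  # diffusers, not transformers
from diffusers.utils import export_to_video

# Image-captioning model (unchanged from the previous revision).
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Text-to-video pipeline introduced by this commit.
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # handles device placement itself; no extra pipe.to(device) needed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_length = 16
num_beams = 4

def image_to_text(image):
    # Assumed body: the diff elides it, so this is the standard ViT-GPT2 captioning recipe.
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds[0]

def text_to_video(image):
    # Caption the uploaded image, then use the caption as the text-to-video prompt.
    prompt = image_to_text(image)
    # Depending on the diffusers version, .frames may be a flat list of frames or a batched list.
    video_frames = pipe(prompt, num_inference_steps=25).frames
    # export_to_video writes an .mp4 and returns its path, which is what gr.Video() expects.
    video_path = export_to_video(video_frames)
    return video_path

title = ""
description = ""

interface = gr.Interface(
    fn=text_to_video,
    inputs=gr.Image(type="pil"),  # gr.inputs.Image is the deprecated pre-3.x namespace
    outputs=gr.Video(),
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch()  # assumed; the launch call is not shown in this diff

Keeping the captioner on `device` while the video pipeline relies on model CPU offload keeps peak GPU memory low, which appears to be the intent behind `enable_model_cpu_offload()` here.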