LiquidoNoNewtoniano committed on
Commit 26fe3ab · 1 Parent(s): 46af2dd

Update app.py

Files changed (1): app.py (+18, -5)
app.py CHANGED
@@ -1,14 +1,20 @@
 import gradio as gr
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, DiffusionPipeline, DPMSolverMultistepScheduler
 import torch
 from PIL import Image
+from diffusers.utils import export_to_video

 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
 tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

+pipe = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+pipe = pipe.to(device)

 max_length = 16
 num_beams = 4
@@ -25,15 +31,22 @@ def image_to_text(image_paths):
     preds = [pred.strip() for pred in preds]
     return preds[0]

+def text_to_video(image_paths):
+    prompt = image_to_text(image_paths)
+    video_frames = pipe(prompt, num_inference_steps=25).frames
+    video_path = export_to_video(video_frames)
+    return video_frames
+
+
 title = ""
 description = ""

 interface = gr.Interface(
-    fn=image_to_text,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Textbox(),
+    fn=text_to_video,
+    inputs=gr.inputs.Image(type="pil"),
+    outputs=gr.Video(),
     title=title,
-    description=description
+    description=description,

 )
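
A few things are worth flagging in the committed version. `DiffusionPipeline` and `DPMSolverMultistepScheduler` are provided by `diffusers`, not `transformers`, so the updated import line raises an `ImportError`. `text_to_video` writes the frames to disk via `export_to_video` (which returns a file path) but then returns the raw frames, while the `gr.Video()` output component expects a path. Smaller points: `gr.inputs.Image` is the deprecated namespace (the previous revision already used `gr.Image`), and calling `pipe.to(device)` after `enable_model_cpu_offload()` is redundant because offloading already manages device placement. Below is a minimal corrected sketch; the body of `image_to_text` is elided by this diff, so the captioning steps shown are the standard ViT-GPT2 recipe and, like the final `launch()` call, should be read as assumptions rather than the Space's actual code.

import gradio as gr
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler  # diffusers, not transformers
from diffusers.utils import export_to_video

# Image-captioning model (unchanged from the previous revision).
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Text-to-video pipeline introduced by this commit.
pipe = DiffusionPipeline.from_pretrained(
    "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()  # handles device placement itself; no extra pipe.to(device) needed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_length = 16
num_beams = 4

def image_to_text(image):
    # Assumed body: the diff elides it, so this is the standard ViT-GPT2 captioning recipe.
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, max_length=max_length, num_beams=num_beams)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds[0]

def text_to_video(image):
    # Caption the uploaded image, then use the caption as the text-to-video prompt.
    prompt = image_to_text(image)
    # Depending on the diffusers version, .frames may be a flat list of frames or a batched list.
    video_frames = pipe(prompt, num_inference_steps=25).frames
    # export_to_video writes an .mp4 and returns its path, which is what gr.Video() expects.
    video_path = export_to_video(video_frames)
    return video_path

title = ""
description = ""

interface = gr.Interface(
    fn=text_to_video,
    inputs=gr.Image(type="pil"),  # gr.inputs.Image is the deprecated pre-3.x namespace
    outputs=gr.Video(),
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch()  # assumed; the launch call is not shown in this diff

Keeping the captioner on `device` while the video pipeline relies on model CPU offload keeps peak GPU memory low, which appears to be the intent behind `enable_model_cpu_offload()` here.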