import spaces  # HF Spaces ZeroGPU; keep as the first import, before anything that touches CUDA

import os
import re
import sys
import traceback

import gradio as gr
import numpy as np
import torch
from decord import cpu, VideoReader
from huggingface_hub import snapshot_download

from longvu.builder import load_pretrained_model
from longvu.constants import (
    DEFAULT_IMAGE_TOKEN,
    IMAGE_TOKEN_INDEX,
)
from longvu.conversation import conv_templates, SeparatorStyle
from longvu.mm_datautils import (
    KeywordsStoppingCriteria,
    process_images,
    tokenizer_image_token,
)

title_markdown = """

LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding

""" block_css = """ #buttons button { min-width: min(120px,100%); color: #9C276A } """ plum_color = gr.themes.colors.Color( name='plum', c50='#F8E4EF', c100='#E9D0DE', c200='#DABCCD', c300='#CBA8BC', c400='#BC94AB', c500='#AD809A', c600='#9E6C89', c700='#8F5878', c800='#804467', c900='#713056', c950='#662647', ) class Chat: def __init__(self): self.version = "qwen" model_name = "cambrian_qwen" model_path = snapshot_download("Vision-CAIR/LongVU_Qwen2_7B", repo_type="model") device = "cuda" self.tokenizer, self.model, self.processor, _ = load_pretrained_model(model_path, None, model_name, device=device) self.model.eval() def remove_after_last_dot(self, s): last_dot_index = s.rfind('.') if last_dot_index == -1: return s return s[:last_dot_index + 1] @spaces.GPU(duration=120) @torch.inference_mode() def generate(self, data: list, message, temperature, top_p, max_output_tokens): # TODO: support multiple turns of conversation. assert len(data) == 1 tensor, image_sizes, modal = data[0] conv = conv_templates[self.version].copy() if isinstance(message, str): conv.append_message("user", DEFAULT_IMAGE_TOKEN + '\n' + message) elif isinstance(message, list): if DEFAULT_IMAGE_TOKEN not in message[0]['content']: message[0]['content'] = DEFAULT_IMAGE_TOKEN + '\n' + message[0]['content'] for mes in message: conv.append_message(mes["role"], mes["content"]) conv.append_message("assistant", None) prompt = conv.get_prompt() input_ids = ( tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") .unsqueeze(0) .to(self.model.device) ) if "llama3" in self.version: input_ids = input_ids[0][1:].unsqueeze(0) # remove bos stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 keywords = [stop_str] stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) with torch.inference_mode(): output_ids = self.model.generate( input_ids, images=tensor, image_sizes=image_sizes, do_sample=True, temperature=temperature, max_new_tokens=max_output_tokens, use_cache=True, top_p=top_p, stopping_criteria=[stopping_criteria], ) pred = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() return self.remove_after_last_dot(pred) @spaces.GPU(duration=120) def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16): if textbox_in is None: raise gr.Error("Chat messages cannot be empty") return ( gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot, None, ) data = [] processor = handler.processor try: if image is not None: data.append((processor['image'](image).to(handler.model.device, dtype=dtype), None, '')) elif video is not None: vr = VideoReader(video, ctx=cpu(0), num_threads=1) fps = float(vr.get_avg_fps()) frame_indices = np.array( [ i for i in range( 0, len(vr), round(fps), ) ] ) video_tensor = [] for frame_index in frame_indices: img = vr[frame_index].asnumpy() video_tensor.append(img) video_tensor = np.stack(video_tensor) image_sizes = [video_tensor[0].shape[:2]] video_tensor = process_images(video_tensor, processor, handler.model.config) video_tensor = [item.unsqueeze(0).to(handler.model.device, dtype=dtype) for item in video_tensor] data.append((video_tensor, image_sizes, '