import requests
import torch
from PIL import Image
from omegaconf import OmegaConf
from lavis.models import load_model, load_preprocess
from lavis.common.registry import registry

from generate import generate

# Download an example image to query.
url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Load the model; `checkpoint` should point at the downloaded PG-VLM weights.
vlm = load_model(
    name='blip2_t5_instruct',
    model_type='flant5xxl',
    checkpoint='pgvlm_weights.bin',  # replace with location of downloaded weights
    is_eval=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

vlm.qformer_text_input = False  # optionally disable Q-Former text input

# Build the image preprocessor from the model's default config.
model_cls = registry.get_model_class('blip2_t5_instruct')
model_type = 'flant5xxl'
preprocess_cfg = OmegaConf.load(model_cls.default_config_path(model_type)).preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
processor = vis_processors["eval"]

# Batch the single preprocessed image and pair it with the question prompt.
question_samples = {
    'prompt': 'Question: Classify this object as transparent, translucent, or opaque? Respond unknown if you are not sure. Short answer:',
    'image': torch.stack([processor(example_image)], dim=0).to(vlm.device)
}

# Generate the top three candidate answers along with their scores.
answers, scores = generate(vlm, question_samples, length_penalty=0,
                           repetition_penalty=1, num_captions=3)
print(answers, scores)
# ['opaque', 'translucent', 'transparent'] tensor([-0.0373, -4.2404, -4.4436], device='cuda:0')
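
# The returned scores behave like log-likelihoods of each candidate answer
# (more negative = less likely). As a minimal follow-up sketch, not part of
# the original example, a softmax over the candidates turns them into a
# normalized probability distribution for easier reading:
probabilities = torch.softmax(scores, dim=0)
for answer, prob in zip(answers, probabilities.tolist()):
    print(f"{answer}: {prob:.3f}")
# With the scores shown above, 'opaque' receives roughly 0.97 of the mass.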