# File size: 1,391 Bytes

import torch
from PIL import Image
from omegaconf import OmegaConf

from lavis.models import load_model, load_preprocess
from lavis.common.registry import registry

import requests

from generate import generate

# Download the demo image and decode it into an RGB PIL image.
url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
# The context manager releases the streamed connection once the image has been
# read; the timeout keeps the script from hanging on an unresponsive server.
with requests.get(url, stream=True, timeout=30) as response:
    # Surface HTTP errors (404, 500, ...) here instead of letting PIL fail
    # later while trying to parse an error page as an image.
    response.raise_for_status()
    # The raw stream is not content-decoded (gzip/deflate) unless asked to.
    response.raw.decode_content = True
    # convert() forces the full image to be read before the response closes.
    example_image = Image.open(response.raw).convert("RGB")

# Model identifiers are defined once so that the loaded weights and the
# preprocessing config below always refer to the same model name and type.
model_name = 'blip2_t5_instruct'
model_type = 'flant5xxl'

# Load the model in eval mode, on the GPU when one is available.
vlm = load_model(
    name=model_name,
    model_type=model_type,
    checkpoint='pgvlm_weights.bin',  # replace with location of downloaded weights
    is_eval=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

vlm.qformer_text_input = False  # Optionally disable qformer text

# Build the preprocessors from the model class's default config for this
# model type; only the "eval" visual processor is used by this script.
model_cls = registry.get_model_class(model_name)
preprocess_cfg = OmegaConf.load(model_cls.default_config_path(model_type)).preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
processor = vis_processors["eval"]

# Text prompt sent to the model alongside the image.
prompt_text = 'Question: Classify this object as transparent, translucent, or opaque? Respond unknown if you are not sure. Short answer:'

# Preprocess the example image, wrap it in a batch of one (new leading
# dimension), and move it to the device the model is on.
image_batch = torch.stack([processor(example_image)], dim=0)
image_batch = image_batch.to(vlm.device)

question_samples = {'prompt': prompt_text, 'image': image_batch}

# Query the model and print the returned answers with their scores.
answers, scores = generate(
    vlm,
    question_samples,
    length_penalty=0,
    repetition_penalty=1,
    num_captions=3,
)
print(answers, scores)
# Example output:
# ['opaque', 'translucent', 'transparent'] tensor([-0.0373, -4.2404, -4.4436], device='cuda:0')