"""Copy of compose_glide.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F |
|
""" |
|
|
|
|
|
|
|
|
|

import torch as th
import numpy as np
from PIL import Image
from IPython.display import display

from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (
    create_model_and_diffusion,
    model_and_diffusion_defaults,
    model_and_diffusion_defaults_upsampler,
)

from composable_diffusion.download import download_model
from composable_diffusion.model_creation import create_model_and_diffusion as create_model_and_diffusion_for_clevr
from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr

has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')
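
# Base GLIDE text2im model (64x64 output). Sampling is respaced to 100
# diffusion steps (instead of the full 1000) for speed.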
timestep_respacing = 100
options = model_and_diffusion_defaults()
options['use_fp16'] = has_cuda
options['timestep_respacing'] = str(timestep_respacing)
model, diffusion = create_model_and_diffusion(**options)
model.eval()
if has_cuda:
    model.convert_to_fp16()
model.to(device)
model.load_state_dict(load_checkpoint('base', device))
print('total base parameters', sum(x.numel() for x in model.parameters()))
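
# GLIDE upsampler, which lifts 64x64 samples to 256x256 using the 'fast27'
# 27-step respacing preset.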
options_up = model_and_diffusion_defaults_upsampler()
options_up['use_fp16'] = has_cuda
options_up['timestep_respacing'] = 'fast27'
model_up, diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()
if has_cuda:
    model_up.convert_to_fp16()
model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample', device))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))
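
# Utility for previewing a batch of samples inline (not used by the demo UI).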
def show_images(batch: th.Tensor):
    """Display a batch of images inline."""
    # Map from [-1, 1] to [0, 255], then tile the batch side by side.
    scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
    reshaped = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
    display(Image.fromarray(reshaped.numpy()))
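
# Compose natural-language descriptions: each '|'-separated prompt contributes
# its own conditional noise prediction, and the predictions are combined with
# composed classifier-free guidance in model_fn below.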
def compose_language_descriptions(prompt):
    prompts = [x.strip() for x in prompt.split('|')]

    batch_size = 1
    guidance_scale = 10
    upsample_temp = 0.980

    # One True per conditional prompt, plus a final False for the
    # unconditional (empty-prompt) pass.
    masks = [True] * len(prompts) + [False]
    masks = th.tensor(masks, dtype=th.bool, device=device)

    def model_fn(x_t, ts, **kwargs):
        # Denoise one sample under every prompt at once: replicate x_t[:1]
        # across the batch, one copy per prompt plus one unconditional.
        half = x_t[:1]
        combined = th.cat([half] * x_t.size(0), dim=0)
        model_out = model(combined, ts, **kwargs)
        eps, rest = model_out[:, :3], model_out[:, 3:]
        # Composed classifier-free guidance:
        #   eps = uncond + scale * (mean_i(cond_i) - uncond)
        cond_eps = eps[masks].mean(dim=0, keepdim=True)
        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
        eps = th.cat([half_eps] * x_t.size(0), dim=0)
        return th.cat([eps, rest], dim=1)
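
    # Sample at 64x64 with p_sample_loop, feeding all prompts plus the
    # unconditional entry through model_fn, and keep only the first image.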
    def sample_64(prompts):
        # Tokenize each prompt and pad to the model's text context length.
        tokens_list = [model.tokenizer.encode(prompt) for prompt in prompts]
        outputs = [model.tokenizer.padded_tokens_and_mask(
            tokens, options['text_ctx']
        ) for tokens in tokens_list]

        cond_tokens, cond_masks = zip(*outputs)
        cond_tokens, cond_masks = list(cond_tokens), list(cond_masks)

        # One extra batch element for the unconditional (empty) prompt.
        full_batch_size = batch_size * (len(prompts) + 1)
        uncond_tokens, uncond_mask = model.tokenizer.padded_tokens_and_mask(
            [], options['text_ctx']
        )

        model_kwargs = dict(
            tokens=th.tensor(
                cond_tokens + [uncond_tokens], device=device
            ),
            mask=th.tensor(
                cond_masks + [uncond_mask],
                dtype=th.bool,
                device=device,
            ),
        )

        model.del_cache()
        samples = diffusion.p_sample_loop(
            model_fn,
            (full_batch_size, 3, options["image_size"], options["image_size"]),
            device=device,
            clip_denoised=True,
            progress=True,
            model_kwargs=model_kwargs,
            cond_fn=None,
        )[:batch_size]
        model.del_cache()

        return samples
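
    # Upsample the 64x64 result to 256x256 with the GLIDE upsampler, using
    # DDIM sampling and reduced-temperature noise.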
    def upsampling_256(prompts, samples):
        # Condition the upsampler on the full prompt, joined with spaces.
        tokens = model_up.tokenizer.encode(" ".join(prompts))
        tokens, mask = model_up.tokenizer.padded_tokens_and_mask(
            tokens, options_up['text_ctx']
        )

        model_kwargs = dict(
            # Round the low-res samples to 8-bit values and back, matching the
            # quantization the upsampler saw during training.
            low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
            tokens=th.tensor(
                [tokens] * batch_size, device=device
            ),
            mask=th.tensor(
                [mask] * batch_size,
                dtype=th.bool,
                device=device,
            ),
        )

        model_up.del_cache()
        up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
        up_samples = diffusion_up.ddim_sample_loop(
            model_up,
            up_shape,
            noise=th.randn(up_shape, device=device) * upsample_temp,
            device=device,
            clip_denoised=True,
            progress=True,
            model_kwargs=model_kwargs,
            cond_fn=None,
        )[:batch_size]
        model_up.del_cache()

        return up_samples

    samples = sample_64(prompts)
    upsamples = upsampling_256(prompts, samples)

    # CHW in [-1, 1] -> HWC in [0, 1] for display.
    out_img = upsamples[0].permute(1, 2, 0)
    out_img = (out_img + 1) / 2
    out_img = np.array(out_img.data.to('cpu'))
    return out_img
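
# Composable diffusion model trained on CLEVR object positions: the condition
# is a 2-D (x, y) coordinate, and (-1, -1) acts as the unconditional label.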
timestep_respacing = 100
clevr_options = model_and_diffusion_defaults_for_clevr()

flags = {
    "image_size": 128,
    "num_channels": 192,
    "num_res_blocks": 2,
    "learn_sigma": True,
    "use_scale_shift_norm": False,
    "raw_unet": True,
    "noise_schedule": "squaredcos_cap_v2",
    "rescale_learned_sigmas": False,
    "rescale_timesteps": False,
    "num_classes": '2',
    "dataset": "clevr_pos",
    "use_fp16": has_cuda,
    "timestep_respacing": str(timestep_respacing),
}

for key, val in flags.items():
    clevr_options[key] = val

clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
clevr_model.eval()
if has_cuda:
    clevr_model.convert_to_fp16()

clevr_model.to(device)
clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
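
# Compose object positions given as 'x, y | x, y | ...'.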
def compose_clevr_objects(prompt):
    print(prompt)
    # Parse the prompt into coordinate pairs, appending (-1, -1) as the
    # unconditional label.
    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
                   for x in prompt.split('|')]
    coordinates += [[-1, -1]]
    batch_size = 1
    guidance_scale = 10
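
    # Same composed classifier-free guidance as for the text prompts, but each
    # condition is an (x, y) object coordinate.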
    def model_fn(x_t, ts, **kwargs):
        half = x_t[:1]
        combined = th.cat([half] * kwargs['y'].size(0), dim=0)
        model_out = clevr_model(combined, ts, **kwargs)
        eps, rest = model_out[:, :3], model_out[:, 3:]
        masks = kwargs.get('masks')
        cond_eps = eps[masks].mean(dim=0, keepdim=True)
        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
        eps = th.cat([half_eps] * x_t.size(0), dim=0)
        return th.cat([eps, rest], dim=1)

    masks = [True] * (len(coordinates) - 1) + [False]
    model_kwargs = dict(
        y=th.tensor(coordinates, dtype=th.float, device=device),
        masks=th.tensor(masks, dtype=th.bool, device=device),
    )

    def sample(coordinates):
        # One batch element per coordinate (including the unconditional one);
        # keep only the first sample, as in sample_64 above.
        samples = clevr_diffusion.p_sample_loop(
            model_fn,
            (len(coordinates), 3, clevr_options["image_size"], clevr_options["image_size"]),
            device=device,
            clip_denoised=True,
            progress=True,
            model_kwargs=model_kwargs,
            cond_fn=None,
        )[:batch_size]

        return samples

    samples = sample(coordinates)
    # CHW in [-1, 1] -> HWC in [0, 1] for display.
    out_img = samples[0].permute(1, 2, 0)
    out_img = (out_img + 1) / 2
    out_img = np.array(out_img.data.to('cpu'))
    return out_img
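
# Gradio front end: route the prompt to the selected composition model.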
def compose(prompt, ver):
    if ver == 'GLIDE':
        return compose_language_descriptions(prompt)
    else:
        return compose_clevr_objects(prompt)

examples_1 = ['a camel | a forest',
              'A cloudy blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain']
examples_2 = ['0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5']
# gradio expects one row per example: [prompt, version].
examples = [[e, 'GLIDE'] for e in examples_1] + [[e, 'CLEVR Objects'] for e in examples_2]

import gradio as gr

gr.Interface(
    title='Compositional Visual Generation with Composable Diffusion Models',
    description='<p>Demo for Composable Diffusion (~20s per example)</p>'
                '<p>See more information on our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">project page</a>.</p>'
                '<ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> for composing natural-language descriptions.</li>'
                '<li>The other is based on our pre-trained CLEVR object model for composing objects. '
                '<br>(<b>Note</b>: we recommend keeping <b><i>x</i></b> in <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in <b><i>[0.25, 0.7]</i></b>, since the training labels lie in those ranges.)</li></ul>'
                '<p>When composing multiple sentences, use `|` as the delimiter; see the examples below.</p>',
    fn=compose,
    inputs=['text', gr.inputs.Radio(['GLIDE', 'CLEVR Objects'], type="value", default='GLIDE', label='version')],
    outputs='image',
    examples=examples,
).launch()