Diffusers 🤗
Reference: the official Diffusers documentation
Whether you are looking for a simple inference solution or want to train your own diffusion model, Diffusers is a go-to library of state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules.
Quick Start
Environment Setup
# Install PyTorch first, then run the command below.
# Accelerate speeds up model loading during both inference and training.
pip install "diffusers[torch]" accelerate transformers
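A quick way to confirm the environment works is to import the packages and check the versions (a minimal sketch; your version numbers and CUDA availability will differ):
import torch
import diffusers
import transformers
# print the installed versions and whether a GPU is visible
print(diffusers.__version__, transformers.__version__)
print(torch.cuda.is_available())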
Code Example
The code is actually very simple; there are only two core components: the model and the scheduler.
import torch
from PIL import Image
from diffusers import DDPMScheduler, UNet2DModel
# (1) Load the model and the scheduler
scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
model = UNet2DModel.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")
# (2) Set the number of denoising steps on the scheduler
scheduler.set_timesteps(50)
scheduler.timesteps  # inspect the timestep schedule, e.g. tensor([980, 960, ..., 20, 0])
# (3) Sample random noise as the initial input
sample_size = model.config.sample_size
noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
input = noise
# (4) Iterative denoising
for t in scheduler.timesteps:
    with torch.no_grad():
        noisy_residual = model(input, t).sample
    previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
    input = previous_noisy_sample
# (5) Convert the denoised sample back into an image
# map pixel values from [-1, 1] to [0, 1]
image = (input / 2 + 0.5).clamp(0, 1).squeeze()
# move channels last so the shape becomes (height, width, channels)
image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
# create a PIL image from the NumPy array
image = Image.fromarray(image)
image
The same result in four lines of code
from diffusers import DDPMPipeline
ddpm = DDPMPipeline.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")
image = ddpm(num_inference_steps=25).images[0]
image
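Pipelines also make it easy to swap in a different scheduler without reloading the model. A minimal sketch, reusing the ddpm pipeline from above with DDIMScheduler as one compatible choice (any scheduler that supports this checkpoint would work):
from diffusers import DDIMScheduler
# replace the pipeline's scheduler, reusing the existing scheduler's config
ddpm.scheduler = DDIMScheduler.from_config(ddpm.scheduler.config)
image = ddpm(num_inference_steps=25).images[0]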
Detailed Walkthrough
Loading the model and scheduler
This section goes into more detail. The model actually breaks down into four components: vae, tokenizer, text_encoder, and unet. Of the vae we only use the decoder; its encoder is not needed for generation.
from PIL import Image
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, UniPCMultistepScheduler
from tqdm.auto import tqdm
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
)
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
)
scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
torch_device = "cuda"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)
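Alternatively, you can load everything at once with StableDiffusionPipeline and read the same four components off the pipeline object (a sketch; note that pipe.scheduler is whatever the checkpoint ships with, not necessarily UniPCMultistepScheduler):
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_safetensors=True)
# the individual building blocks are exposed as attributes
vae, tokenizer, text_encoder, unet = pipe.vae, pipe.tokenizer, pipe.text_encoder, pipe.unet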
Creating the conditioning
prompt = ["a photograph of an astronaut riding a horse"]
# encode the text prompt into embeddings
text_input = tokenizer(
    prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
)
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
# encode an empty prompt as the unconditional embedding (same shape as the text embedding)
max_length = text_input.input_ids.shape[-1]
batch_size = len(prompt)
uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
# concatenate the unconditional and text embeddings for classifier-free guidance
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
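A quick sanity check on the shapes; the numbers assume the CLIP text encoder used by Stable Diffusion v1-4 (77-token context, 768-dimensional embeddings):
# batch of 2 = [unconditional, conditional]
print(text_embeddings.shape)  # expected: torch.Size([2, 77, 768])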
Initializing the latent noise
height = 512  # default height of Stable Diffusion
width = 512   # default width of Stable Diffusion
generator = torch.manual_seed(0)  # fixed seed for reproducibility
latents = torch.randn(
    (batch_size, unet.config.in_channels, height // 8, width // 8),  # B C H W; the vae downsamples by a factor of 8
    generator=generator,
    device=torch_device,
)
latents = latents * scheduler.init_noise_sigma  # initial scaling required by this scheduler
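The factor of 8 is not arbitrary: it follows from the vae architecture. The snippet below mirrors how the diffusers pipelines derive it (vae_scale_factor is a name introduced here for illustration):
# each vae down-block after the first halves the resolution,
# so the total downsampling factor is 2^(num_blocks - 1)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
print(vae_scale_factor)  # 8 for Stable Diffusion v1-4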
Iterative denoising
num_inference_steps = 25
guidance_scale = 7.5
scheduler.set_timesteps(num_inference_steps)
for t in tqdm(scheduler.timesteps):
    # duplicate the latents so the unconditional and conditional predictions
    # for classifier-free guidance run in a single forward pass
    latent_model_input = torch.cat([latents] * 2)
    latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
    # predict the noise residual
    with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
    # classifier-free guidance
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    # update the latents: x_t -> x_{t-1}
    latents = scheduler.step(noise_pred, t, latents).prev_sample
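For reference, the guidance step inside the loop implements the standard classifier-free guidance combination, where $s$ is the guidance_scale, $c$ the text condition, and $\varnothing$ the empty prompt:

$\hat{\epsilon} = \epsilon_\theta(x_t, \varnothing) + s \cdot \left( \epsilon_\theta(x_t, c) - \epsilon_\theta(x_t, \varnothing) \right)$

A larger $s$ pushes samples to follow the prompt more closely, at the cost of diversity.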
Decoding the image
# scale and decode the image latents with the vae
latents = 1 / 0.18215 * latents  # 0.18215 is the vae scaling factor used by Stable Diffusion v1
with torch.no_grad():
    image = vae.decode(latents).sample
image = (image / 2 + 0.5).clamp(0, 1).squeeze()  # map pixel values from [-1, 1] to [0, 1]
image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
image = Image.fromarray(image)
image
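Outside a notebook, the resulting PIL image can simply be written to disk (the filename here is just an example):
image.save("astronaut_riding_horse.png")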