Diffusers 🤗

Reference: Diffusers official documentation

Whether you are looking for a simple inference solution or want to train your own diffusion model, Diffusers is a go-to library of state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules.

Quick Start

Installation

# Install PyTorch first, then run the command below.
# Accelerate speeds up model loading during inference and training.
pip install "diffusers[torch]" accelerate transformers

Code Example

The code is quite simple; there are only two core components: the model and the scheduler.

from diffusers import DDPMScheduler, UNet2DModel
import torch
from PIL import Image
import numpy as np

# (1) Load the model and the scheduler
scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
model = UNet2DModel.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")

# (2) Set the number of denoising steps on the scheduler
scheduler.set_timesteps(50)
scheduler.timesteps  # inspect the timestep sequence

# (3) Sample random noise as the starting input
sample_size = model.config.sample_size
noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
input = noise

# (4) Denoise iteratively
for t in scheduler.timesteps:
    with torch.no_grad():
        noisy_residual = model(input, t).sample  # predict the noise residual
    previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample
    input = previous_noisy_sample

# (5) Convert the output back to an image
image = (input / 2 + 0.5).clamp(0, 1).squeeze()
# map pixel values from [-1, 1] to [0, 1]
image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
# reorder to (height, width, channels)
image = Image.fromarray(image)
# build a PIL image from the NumPy array

image
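
Because the model and the scheduler are decoupled, the scheduler can be swapped without touching the UNet. A minimal sketch, using DDIMScheduler as one interchangeable choice:

from diffusers import DDIMScheduler

# reuse the DDPM checkpoint's scheduler config, but sample with DDIM instead
scheduler = DDIMScheduler.from_config(scheduler.config)
scheduler.set_timesteps(50)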

Done in Four Lines

from diffusers import DDPMPipeline
ddpm = DDPMPipeline.from_pretrained("google/ddpm-cat-256", use_safetensors=True).to("cuda")
image = ddpm(num_inference_steps=25).images[0]
image
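
To keep the result, the PIL image can be written straight to disk (the filename here is arbitrary):

image.save("ddpm_cat_sample.png")  # format inferred from the extension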

Detailed Walkthrough

Loading the Models and the Scheduler

Here we go a level deeper: the "model" actually consists of four parts: the vae, the tokenizer, the text_encoder, and the unet. Of the vae we only use the decoder; its encoder is not needed for text-to-image generation.

from PIL import Image
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, UniPCMultistepScheduler
from tqdm.auto import tqdm

vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
)
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
)
scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")

torch_device = "cuda"
vae.to(torch_device)
text_encoder.to(torch_device)
unet.to(torch_device)

Creating the Conditioning

prompt = ["a photograph of an astronaut riding a horse"]
generator = torch.manual_seed(0)  

# encode the text prompt into embeddings
text_input = tokenizer(
    prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
)
with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]

# embeddings for the empty prompt (same shape as the text embeddings),
# used as the unconditional branch of classifier-free guidance
max_length = text_input.input_ids.shape[-1]
batch_size = len(prompt)
uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

# concatenate [unconditional, conditional] so both branches run in one batched forward pass
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
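
A quick shape check confirms the batching; the numbers below assume SD v1's CLIP text encoder (sequence length 77, hidden size 768):

print(text_embeddings.shape)  # expected: torch.Size([2, 77, 768]) for a single prompt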

Initializing the Latent Noise

height = 512  # default height of Stable Diffusion
width = 512  # default width of Stable Diffusion
generator = torch.manual_seed(0)
latents = torch.randn(
    (batch_size, unet.config.in_channels, height // 8, width // 8),  # B C H W; the VAE downsamples by 8x
    generator=generator,  # fixed generator for reproducibility
    device=torch_device,
)
latents = latents * scheduler.init_noise_sigma  # scale the initial noise as the scheduler requires

Iterative Denoising

num_inference_steps = 25
guidance_scale = 7.5  
scheduler.set_timesteps(num_inference_steps)

for t in tqdm(scheduler.timesteps):

    # duplicate the latents so the conditional and unconditional passes
    # of classifier-free guidance share a single forward pass
    latent_model_input = torch.cat([latents] * 2)
    latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

    # predict the noise residual
    with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

    # combine the two branches: classifier-free guidance
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

    # step the scheduler: x_t -> x_{t-1}
    latents = scheduler.step(noise_pred, t, latents).prev_sample
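
The guidance line inside the loop is the standard classifier-free guidance combination, with guidance_scale playing the role of $s$:

$$\hat{\epsilon} = \epsilon_\theta(x_t, \varnothing) + s \cdot \big(\epsilon_\theta(x_t, c) - \epsilon_\theta(x_t, \varnothing)\big)$$

where $c$ is the text condition and $\varnothing$ the empty prompt; $s > 1$ pushes the sample toward the prompt.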

Decoding the Image

# un-scale and decode the image latents with the vae
# (0.18215 is the latent scaling factor of the SD v1 VAE)
latents = 1 / 0.18215 * latents
with torch.no_grad():
    image = vae.decode(latents).sample

image = (image / 2 + 0.5).clamp(0, 1).squeeze()
image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
image = Image.fromarray(image)
image
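
Rather than hard-coding 0.18215, the factor can also be read from the VAE's config; a sketch of the equivalent un-scaling (the value is the same for SD v1 checkpoints):

latents = latents / vae.config.scaling_factor  # 0.18215 for stable-diffusion-v1-4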

Training a Diffusion Model

Reference: Train a diffusion model
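
The linked tutorial walks through the full recipe; the core objective is compact enough to sketch here. A minimal sketch of one training step (random tensors stand in for a real image batch, and the tiny UNet config is illustrative only):

import torch
import torch.nn.functional as F
from diffusers import DDPMScheduler, UNet2DModel

# a small UNet and scheduler purely for illustration
model = UNet2DModel(sample_size=64, in_channels=3, out_channels=3)
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

clean_images = torch.randn(8, 3, 64, 64)  # stand-in for one batch of real training images
noise = torch.randn_like(clean_images)
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (clean_images.shape[0],))

# forward diffusion: corrupt the clean images with noise at the sampled timesteps
noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

# the UNet is trained to predict the noise that was added (the epsilon objective)
noise_pred = model(noisy_images, timesteps).sample
loss = F.mse_loss(noise_pred, noise)

loss.backward()
optimizer.step()
optimizer.zero_grad()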