Model weights: https://huggingface.co/CompVis/stable-diffusion-v1-4/tree/main/vae
Main reference: Using-Stable-Diffusion-VAE-to-encode-satellite-images
SD 1.4 VAE
Download it to a local directory first.
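One way to fetch the weights is with huggingface_hub's snapshot_download; a minimal sketch, assuming the local target directory matches the load path used in the script below:

# Sketch: download only the vae/ subfolder of the repo.
# local_dir is an assumption chosen to match the load path used below.
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="CompVis/stable-diffusion-v1-4",
    allow_patterns=["vae/*"],
    local_dir="./huggingface/stable-diffusion-v1-4",
)

The full encode/decode script: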
from diffusers import AutoencoderKL
from PIL import Image
import torch
import torchvision.transforms as T
# Replace ./huggingface/stable-diffusion-v1-4/vae with your own local path
vae = AutoencoderKL.from_pretrained("./huggingface/stable-diffusion-v1-4/vae", variant='fp16')
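# variant='fp16' selects the half-precision copy of the weights stored in the repo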
def encode_img(input_img):
    # Single image -> single latent in a batch: shape [1, 4, H/8, W/8],
    # e.g. [1, 4, 64, 64] for a 512x512 input
    # Transform the image to a tensor in [0, 1]
    transform = T.Compose([
        # T.Resize((256, 256)),
        T.ToTensor()
    ])
    input_img = transform(input_img)
    if len(input_img.shape) < 4:
        input_img = input_img.unsqueeze(0)  # add a batch dimension
    with torch.no_grad():
        latent = vae.encode(input_img * 2 - 1)  # scale [0, 1] -> [-1, 1]
    return 0.18215 * latent.latent_dist.sample()  # SD latent scaling factor
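    # Note: latent_dist is a diagonal Gaussian and .sample() draws from it,
    # so encodings are stochastic; diffusers also exposes .mode() for a
    # deterministic alternative: return 0.18215 * latent.latent_dist.mode()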
def decode_img(latents):
    # Batch of latents -> PIL image (assumes batch size 1)
    latents = (1 / 0.18215) * latents  # undo the scaling factor
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)  # scale [-1, 1] -> [0, 1]
    image = image.detach().cpu()
    # image = T.Resize(original_size)(image.squeeze())
    return T.ToPILImage()(image.squeeze())
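# Note: .squeeze() assumes a batch of one; for larger batches, convert each
# [3, H, W] slice of the decoded tensor to a PIL image separately.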
if __name__ == '__main__':
    # Load an example image; convert to RGB so the tensor has 3 channels
    input_img = Image.open("huge.jpg").convert('RGB')
    original_size = input_img.size
    print('original_size', original_size)
    # Encode and decode the image (width/height should be multiples of 8,
    # since the VAE downsamples by a factor of 8)
    latents = encode_img(input_img)
    reconstructed_img = decode_img(latents)
    # Save the reconstructed image
    reconstructed_img.save("reconstructed_example2.jpg")
    # Paste the original and reconstructed images side by side
    concatenated_img = Image.new('RGB', (original_size[0] * 2, original_size[1]))
    concatenated_img.paste(input_img, (0, 0))
    concatenated_img.paste(reconstructed_img, (original_size[0], 0))
    # Save the concatenated image
    concatenated_img.save("concatenated_example2.jpg")
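With a GPU available, the same round trip can run in half precision; a minimal sketch, assuming a CUDA device is present and using the same local path as above:

# Sketch: load the VAE in fp16 on the GPU (assumes CUDA is available).
vae = AutoencoderKL.from_pretrained(
    "./huggingface/stable-diffusion-v1-4/vae",
    variant='fp16',
    torch_dtype=torch.float16,
).to("cuda")
# Inputs must then match the model's device and dtype, e.g. inside encode_img:
# latent = vae.encode((input_img * 2 - 1).to("cuda", torch.float16))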