Skip to content

Instantly share code, notes, and snippets.

@recoilme
Created July 29, 2024 10:03
Show Gist options
  • Save recoilme/7417060775a8051652b0e20c066b7cfb to your computer and use it in GitHub Desktop.
Save recoilme/7417060775a8051652b0e20c066b7cfb to your computer and use it in GitHub Desktop.
embeds.py
from diffusers import DiffusionPipeline,EulerAncestralDiscreteScheduler
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
import torch
# Checkpoint directory handed to every from_pretrained() call in this script.
MODEL_PATH = "models/colorfulxl"

# Build the pipeline with both text encoders and both tokenizers left out
# (passed as None); prompt embeddings are computed separately and handed to
# the pipeline as tensors.
pipe = DiffusionPipeline.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
)
pipe = pipe.to("cuda")

# Swap in the Euler-ancestral scheduler, built from the config of the
# scheduler that shipped with the checkpoint.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

# Candidate prompts. Every assignment overwrites the previous one, so only
# the LAST `prompt` below is actually used; the others are kept as ready-made
# alternatives.
prompt = "柴犬、カラフルアート"
prompt = "The image captures a vibrant scene from a bustling street. The perspective is from a pedestrian's viewpoint on the sidewalk, immersing the viewer in the city's daily life. The street is lined with a variety of buildings, their architecture hinting at the rich history of the city. Among these structures, a church with a tall bell tower stands out, its presence adding a sense of grandeur to the scene.People are seen walking on the sidewalk, going about their day, adding a dynamic element to the otherwise static urban landscape.The colors in the image are predominantly blue, yellow, and green, reflecting the lively atmosphere of the city. The sky above is a clear blue, suggesting a sunny day, which further enhances the overall vibrancy of the scene."
prompt = "The image captures a serene scene on a river. Dominating the foreground is a wooden boat, its brown hue contrasting with the greenish-blue of the water. The boat is not alone in the frame. In the background, a red-roofed building can be seen, its white walls standing out against the greenery. The building is partially obscured by trees, adding an element of mystery to the scene. The sky above is overcast, casting a soft light over the entire scene. Despite this, there's a sense of tranquility that pervades the image, as if inviting the viewer to take a moment and appreciate the peacefulness of the scene. There's no text or discernible action in the image, just a snapshot of a moment, frozen in time."
prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. This imaginative creature features the distinctive, bulky body of a hippo, but with a texture and appearance resembling a golden-brown, crispy waffle. The creature might have elements like waffle squares across its skin and a syrup-like sheen. It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, possibly including oversized utensils or plates in the background. The image should evoke a sense of playful absurdity and culinary fantasy."

# An empty negative prompt is still encoded further down.
negative_prompt = ""
def tokenize_prompt(tokenizer, prompt):
    """Tokenize ``prompt`` and return the resulting token ids.

    The tokenizer is asked to pad to ``tokenizer.model_max_length`` and to
    truncate to that same length, so every returned sequence has a fixed
    length and prompts longer than the limit are cut rather than rejected.

    Args:
        tokenizer: Callable tokenizer exposing a ``model_max_length``
            attribute. It is invoked with the ``padding``, ``max_length``,
            ``truncation`` and ``return_tensors`` keyword arguments.
        prompt: Text handed to the tokenizer unchanged.

    Returns:
        The ``input_ids`` attribute of the tokenizer output (requested as
        PyTorch tensors via ``return_tensors="pt"``).
    """
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    return text_inputs.input_ids
def encode_prompt(
    text_encoders,
    tokenizers,
    prompt,
    hidden_size,
    model_max_length=77,
    device="cuda",
):
    """Encode ``prompt`` with each text encoder and concatenate the results.

    Each entry of ``text_encoders`` is one slot. A real encoder is run on the
    tokenized prompt; a ``None`` slot contributes all-zero placeholders
    instead, so the concatenated feature dimension keeps its layout even
    when an encoder is left out.

    Args:
        text_encoders: Non-empty sequence of text encoders or ``None``.
        tokenizers: Sequence indexed in parallel with ``text_encoders``;
            only the entries matching a non-``None`` encoder are read.
        prompt: A single string or a sequence of strings.
        hidden_size: Feature width of the zero placeholders used for
            ``None`` slots.
        model_max_length: Sequence length of the zero placeholders.
        device: Device both returned tensors are moved to. Defaults to
            ``"cuda"``, which was previously hard-coded.

    Returns:
        A ``(prompt_embeds, pooled_prompt_embeds)`` tuple. ``prompt_embeds``
        is the concatenation, along the last dimension, of every slot's
        per-token embeddings. ``pooled_prompt_embeds`` comes from the LAST
        slot only and is flattened to two dimensions.

    Raises:
        ValueError: If ``text_encoders`` is empty.
    """
    if not text_encoders:
        raise ValueError("encode_prompt requires at least one text encoder slot")

    # Placeholders must share the batch dimension of the real encodings,
    # otherwise the concatenation below fails for a list of prompts.
    batch_size = 1 if isinstance(prompt, str) else len(prompt)

    prompt_embeds_list = []
    for i, text_encoder in enumerate(text_encoders):
        if text_encoder is not None:
            text_input_ids = tokenize_prompt(tokenizers[i], prompt)
            encoder_output = text_encoder(
                text_input_ids.to(text_encoder.device),
                output_hidden_states=True,
                return_dict=False,
            )
            # First tuple element is used as the pooled embedding; it is
            # overwritten on every iteration, so the last slot wins.
            pooled_prompt_embeds = encoder_output[0]
            # Second-to-last entry of the final tuple element (presumably the
            # penultimate layer's hidden states, given output_hidden_states).
            prompt_embeds = encoder_output[-1][-2]
        else:
            prompt_embeds = torch.zeros((batch_size, model_max_length, hidden_size))
            pooled_prompt_embeds = torch.zeros((batch_size, hidden_size))
        prompt_embeds_list.append(prompt_embeds.to(device))

    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
    # Keep the pooled tensor on the same device as the sequence embeddings
    # and flatten everything after the batch dimension.
    pooled_prompt_embeds = pooled_prompt_embeds.to(device)
    pooled_prompt_embeds = pooled_prompt_embeds.reshape(prompt_embeds.shape[0], -1)
    return prompt_embeds, pooled_prompt_embeds
# Save GPU memory: the second text encoder is loaded on its own and kept on
# the CPU instead of living inside the CUDA pipeline.
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    MODEL_PATH, subfolder="text_encoder_2"
).to("cpu")
tokenizer_2 = CLIPTokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer_2")

# Inference only: disable autograd while encoding so no graph or intermediate
# activations are retained for the embeddings.
# The first encoder slot is None, so encode_prompt fills it with zeros of
# width 768; only text_encoder_2 actually sees the prompt.
with torch.no_grad():
    prompt_embeds, pooled_prompt_embeds = encode_prompt(
        [None, text_encoder_2], [None, tokenizer_2], prompt, 768
    )
    negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt(
        [None, text_encoder_2], [None, tokenizer_2], negative_prompt, 768
    )

# Optional cleanup once the embeddings exist (flush() is not defined in this
# file and would have to be provided first):
#del text_encoder_2, tokenizer_2
#flush()
# Alternative: use encoders attached to the pipeline instead of the
# separately loaded ones (requires loading the pipeline with them):
#prompt_embeds, pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], prompt, 768)
#negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], negative_prompt, 768)

# Fixed seed so repeated runs start from the same generator state.
generator = torch.Generator(device="cuda").manual_seed(42)
pipe.enable_vae_slicing()
#pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

# The pipeline receives precomputed embeddings instead of text prompts.
image = pipe(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
    num_inference_steps=24,
    guidance_scale=2,
    generator=generator,
    #num_images_per_prompt=2
).images[0]

# NOTE: `display` is not imported or defined in this file; it is presumably
# the IPython/Jupyter builtin, so this line needs a notebook-style session.
display(image)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment