recoilme · July 29, 2024 10:03
diff --git a/gistfile1.py b/gistfile1.py
 from diffusers import DiffusionPipeline,EulerAncestralDiscreteScheduler
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
 import torch

 # Directory (or model id) that every from_pretrained call in this script loads from.
 MODEL_PATH = "models/colorfulxl"
 # Build the pipeline without any text encoder or tokenizer (all four are
 # passed as None), request the fp16 safetensors weights, and move it to CUDA.
 pipe = DiffusionPipeline.from_pretrained(
    MODEL_PATH,
    text_encoder=None, tokenizer=None,
    text_encoder_2=None, tokenizer_2=None,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True
 ).to('cuda')

 # Swap the pipeline's scheduler for an Euler-ancestral scheduler that is
 # constructed from the configuration of the scheduler it replaces.
 pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
    pipe.scheduler.config,
 )
 #pipe.to('cuda')

 # Candidate prompts. Each assignment overwrites the previous one, so only the
 # LAST `prompt` below is actually used; reorder the lines to pick another.
 prompt = "柴犬、カラフルアート"
 prompt = "The image captures a vibrant scene from a bustling street. The perspective is from a pedestrian's viewpoint on the sidewalk, immersing the viewer in the city's daily life. The street is lined with a variety of buildings, their architecture hinting at the rich history of the city. Among these structures, a church with a tall bell tower stands out, its presence adding a sense of grandeur to the scene.People are seen walking on the sidewalk, going about their day, adding a dynamic element to the otherwise static urban landscape.The colors in the image are predominantly blue, yellow, and green, reflecting the lively atmosphere of the city. The sky above is a clear blue, suggesting a sunny day, which further enhances the overall vibrancy of the scene."
 prompt = "The image captures a serene scene on a river. Dominating the foreground is a wooden boat, its brown hue contrasting with the greenish-blue of the water. The boat is not alone in the frame. In the background, a red-roofed building can be seen, its white walls standing out against the greenery. The building is partially obscured by trees, adding an element of mystery to the scene. The sky above is overcast, casting a soft light over the entire scene. Despite this, there's a sense of tranquility that pervades the image, as if inviting the viewer to take a moment and appreciate the peacefulness of the scene. There's no text or discernible action in the image, just a snapshot of a moment, frozen in time."
 prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. This imaginative creature features the distinctive, bulky body of a hippo, but with a texture and appearance resembling a golden-brown, crispy waffle. The creature might have elements like waffle squares across its skin and a syrup-like sheen. It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, possibly including oversized utensils or plates in the background. The image should evoke a sense of playful absurdity and culinary fantasy."
 # Empty negative prompt; it is encoded the same way as `prompt` further down.
 negative_prompt = ""

 def tokenize_prompt(tokenizer, prompt):
     """Turn *prompt* into token ids using *tokenizer*.

     The tokenizer is asked to pad and truncate to its own
     ``model_max_length`` and to return PyTorch tensors
     (``return_tensors="pt"``).

     Args:
         tokenizer: Callable tokenizer that exposes ``model_max_length``.
         prompt: Text passed to the tokenizer unchanged.

     Returns:
         The ``input_ids`` attribute of the tokenizer's output.
     """
     encoding_options = {
         "padding": "max_length",
         "max_length": tokenizer.model_max_length,
         "truncation": True,
         "return_tensors": "pt",
     }
     return tokenizer(prompt, **encoding_options).input_ids

 def encode_prompt(
     text_encoders,
     tokenizers,
     prompt,
     hidden_size,
     model_max_length=77,
     device="cuda",
 ):
     """Encode *prompt* with every text encoder and concatenate the results.

     Each entry of ``text_encoders`` contributes one block of per-token
     embeddings; the blocks are concatenated along the last dimension. An
     entry that is ``None`` contributes an all-zero placeholder instead, so
     a pipeline can be driven with only some of its encoders loaded.

     Args:
         text_encoders: Sequence of text encoder modules or ``None`` entries.
         tokenizers: Sequence aligned with ``text_encoders``; the entry is
             only read when the matching encoder is not ``None``.
         prompt: A string, or a list of strings encoded as one batch.
         hidden_size: Feature width of the zero placeholder (both the
             per-token and the pooled one) used for a missing encoder.
         model_max_length: Sequence length of the zero placeholder.
         device: Device both returned tensors are moved to. Defaults to
             ``"cuda"``, which is what this function previously hard-coded.

     Returns:
         ``(prompt_embeds, pooled_prompt_embeds)``: the concatenated
         per-token embeddings and the pooled embedding taken from the LAST
         entry of ``text_encoders`` (zeros if that entry is ``None``).

     Note:
         The encoders are called with autograd enabled; wrap the call in
         ``torch.no_grad()`` when gradients are not needed.
     """
     # The placeholder must match the batch produced by the real encoders,
     # otherwise the concatenation below fails for a list of several prompts.
     batch_size = 1 if isinstance(prompt, str) else len(prompt)
     prompt_embeds_list = []

     for i, text_encoder in enumerate(text_encoders):
         if text_encoder is not None:
             text_input_ids = tokenize_prompt(tokenizers[i], prompt)
             encoder_output = text_encoder(
                 text_input_ids.to(text_encoder.device),
                 output_hidden_states=True,
                 return_dict=False,
             )
             # Element 0 of the tuple output is used as the pooled embedding.
             # It is overwritten on every iteration, so only the value from
             # the final text encoder survives the loop.
             pooled_prompt_embeds = encoder_output[0]
             # Second-to-last entry of the last tuple element; with
             # output_hidden_states=True this is presumably the penultimate
             # hidden layer -- confirm against the encoder's output layout.
             prompt_embeds = encoder_output[-1][-2]
         else:
             prompt_embeds = torch.zeros((batch_size, model_max_length, hidden_size))
             pooled_prompt_embeds = torch.zeros((batch_size, hidden_size))

         prompt_embeds = prompt_embeds.to(device)
         bs_embed, seq_len, _ = prompt_embeds.shape
         prompt_embeds_list.append(prompt_embeds.view(bs_embed, seq_len, -1))

     prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
     # Keep the pooled embedding on the same device as the per-token ones.
     pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1).to(device)
     return prompt_embeds, pooled_prompt_embeds


 # Save GPU memory: the second text encoder is loaded on its own and kept on
 # the CPU instead of being part of the CUDA pipeline.
 text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(MODEL_PATH, subfolder="text_encoder_2").to("cpu")
 tokenizer_2 = CLIPTokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer_2")
 # The first encoder slot is None, so encode_prompt fills it with zeros of
 # width 768; only text_encoder_2 actually encodes the text.
 prompt_embeds, pooled_prompt_embeds = encode_prompt([None, text_encoder_2],[None, tokenizer_2], prompt, 768)
 negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt([None, text_encoder_2],[None, tokenizer_2], negative_prompt, 768)

 # Disabled cleanup. NOTE(review): `flush` is not defined anywhere in the
 # visible code, so it must be provided before re-enabling these lines.
 #del text_encoder_2, tokenizer_2
 #flush()

 # Disabled alternative that would use the pipeline's own second encoder and
 # tokenizer (both are None in the pipeline as constructed above).
 #prompt_embeds, pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], prompt, 768)
 #negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], negative_prompt, 768)

 # Fixed seed (42) on a CUDA generator, passed to the pipeline call below.
 generator = torch.Generator(device="cuda").manual_seed(42)
 pipe.enable_vae_slicing()
 #pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
 # Run the pipeline from the precomputed embeddings (no raw prompt is passed,
 # since the pipeline was built without text encoders) and keep the first image.
 image = pipe(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
    num_inference_steps=24,
    guidance_scale=2,
    generator=generator,
    #num_images_per_prompt=2
 ).images[0]

 # NOTE(review): `display` is neither defined nor imported in the visible code;
 # presumably this runs in a notebook environment that provides it -- confirm.
 display(image)
	from diffusers import DiffusionPipeline,EulerAncestralDiscreteScheduler
	from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
	import torch

	# Directory (or model id) that every from_pretrained call in this script loads from.
	MODEL_PATH = "models/colorfulxl"
	# Build the pipeline without any text encoder or tokenizer (all four are
	# passed as None), request the fp16 safetensors weights, and move it to CUDA.
	pipe = DiffusionPipeline.from_pretrained(
	MODEL_PATH,
	text_encoder=None, tokenizer=None,
	text_encoder_2=None, tokenizer_2=None,
	torch_dtype=torch.float16,
	variant="fp16",
	use_safetensors=True
	).to('cuda')

	# Swap the pipeline's scheduler for an Euler-ancestral scheduler that is
	# constructed from the configuration of the scheduler it replaces.
	pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
	pipe.scheduler.config,
	)
	#pipe.to('cuda')

	# Candidate prompts. Each assignment overwrites the previous one, so only the
	# LAST `prompt` below is actually used; reorder the lines to pick another.
	prompt = "柴犬、カラフルアート"
	prompt = "The image captures a vibrant scene from a bustling street. The perspective is from a pedestrian's viewpoint on the sidewalk, immersing the viewer in the city's daily life. The street is lined with a variety of buildings, their architecture hinting at the rich history of the city. Among these structures, a church with a tall bell tower stands out, its presence adding a sense of grandeur to the scene.People are seen walking on the sidewalk, going about their day, adding a dynamic element to the otherwise static urban landscape.The colors in the image are predominantly blue, yellow, and green, reflecting the lively atmosphere of the city. The sky above is a clear blue, suggesting a sunny day, which further enhances the overall vibrancy of the scene."
	prompt = "The image captures a serene scene on a river. Dominating the foreground is a wooden boat, its brown hue contrasting with the greenish-blue of the water. The boat is not alone in the frame. In the background, a red-roofed building can be seen, its white walls standing out against the greenery. The building is partially obscured by trees, adding an element of mystery to the scene. The sky above is overcast, casting a soft light over the entire scene. Despite this, there's a sense of tranquility that pervades the image, as if inviting the viewer to take a moment and appreciate the peacefulness of the scene. There's no text or discernible action in the image, just a snapshot of a moment, frozen in time."
	prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus. This imaginative creature features the distinctive, bulky body of a hippo, but with a texture and appearance resembling a golden-brown, crispy waffle. The creature might have elements like waffle squares across its skin and a syrup-like sheen. It's set in a surreal environment that playfully combines a natural water habitat of a hippo with elements of a breakfast table setting, possibly including oversized utensils or plates in the background. The image should evoke a sense of playful absurdity and culinary fantasy."
	# Empty negative prompt; it is encoded the same way as `prompt` further down.
	negative_prompt = ""

	def tokenize_prompt(tokenizer, prompt):
	    """Turn *prompt* into token ids using *tokenizer*.

	    The tokenizer is asked to pad and truncate to its own
	    ``model_max_length`` and to return PyTorch tensors
	    (``return_tensors="pt"``).

	    Args:
	        tokenizer: Callable tokenizer that exposes ``model_max_length``.
	        prompt: Text passed to the tokenizer unchanged.

	    Returns:
	        The ``input_ids`` attribute of the tokenizer's output.
	    """
	    text_inputs = tokenizer(
	        prompt,
	        padding="max_length",
	        max_length=tokenizer.model_max_length,
	        truncation=True,
	        return_tensors="pt",
	    )
	    return text_inputs.input_ids

	def encode_prompt(
	    text_encoders,
	    tokenizers,
	    prompt,
	    hidden_size,
	    model_max_length=77,
	    device="cuda",
	):
	    """Encode *prompt* with every text encoder and concatenate the results.

	    Each entry of ``text_encoders`` contributes one block of per-token
	    embeddings; the blocks are concatenated along the last dimension. An
	    entry that is ``None`` contributes an all-zero placeholder instead, so
	    a pipeline can be driven with only some of its encoders loaded.

	    Args:
	        text_encoders: Sequence of text encoder modules or ``None`` entries.
	        tokenizers: Sequence aligned with ``text_encoders``; the entry is
	            only read when the matching encoder is not ``None``.
	        prompt: A string, or a list of strings encoded as one batch.
	        hidden_size: Feature width of the zero placeholder (both the
	            per-token and the pooled one) used for a missing encoder.
	        model_max_length: Sequence length of the zero placeholder.
	        device: Device both returned tensors are moved to. Defaults to
	            ``"cuda"``, which is what this function previously hard-coded.

	    Returns:
	        ``(prompt_embeds, pooled_prompt_embeds)``: the concatenated
	        per-token embeddings and the pooled embedding taken from the LAST
	        entry of ``text_encoders`` (zeros if that entry is ``None``).

	    Note:
	        The encoders are called with autograd enabled; wrap the call in
	        ``torch.no_grad()`` when gradients are not needed.
	    """
	    # The placeholder must match the batch produced by the real encoders,
	    # otherwise the concatenation below fails for a list of several prompts.
	    batch_size = 1 if isinstance(prompt, str) else len(prompt)
	    prompt_embeds_list = []

	    for i, text_encoder in enumerate(text_encoders):
	        if text_encoder is not None:
	            text_input_ids = tokenize_prompt(tokenizers[i], prompt)
	            encoder_output = text_encoder(
	                text_input_ids.to(text_encoder.device),
	                output_hidden_states=True,
	                return_dict=False,
	            )
	            # Element 0 of the tuple output is used as the pooled embedding.
	            # It is overwritten on every iteration, so only the value from
	            # the final text encoder survives the loop.
	            pooled_prompt_embeds = encoder_output[0]
	            # Second-to-last entry of the last tuple element; with
	            # output_hidden_states=True this is presumably the penultimate
	            # hidden layer -- confirm against the encoder's output layout.
	            prompt_embeds = encoder_output[-1][-2]
	        else:
	            prompt_embeds = torch.zeros((batch_size, model_max_length, hidden_size))
	            pooled_prompt_embeds = torch.zeros((batch_size, hidden_size))

	        prompt_embeds = prompt_embeds.to(device)
	        bs_embed, seq_len, _ = prompt_embeds.shape
	        prompt_embeds_list.append(prompt_embeds.view(bs_embed, seq_len, -1))

	    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
	    # Keep the pooled embedding on the same device as the per-token ones.
	    pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1).to(device)
	    return prompt_embeds, pooled_prompt_embeds


	# Save GPU memory: the second text encoder is loaded on its own and kept on
	# the CPU instead of being part of the CUDA pipeline.
	text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(MODEL_PATH, subfolder="text_encoder_2").to("cpu")
	tokenizer_2 = CLIPTokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer_2")
	# The first encoder slot is None, so encode_prompt fills it with zeros of
	# width 768; only text_encoder_2 actually encodes the text.
	prompt_embeds, pooled_prompt_embeds = encode_prompt([None, text_encoder_2],[None, tokenizer_2], prompt, 768)
	negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt([None, text_encoder_2],[None, tokenizer_2], negative_prompt, 768)

	# Disabled cleanup. NOTE(review): `flush` is not defined anywhere in the
	# visible code, so it must be provided before re-enabling these lines.
	#del text_encoder_2, tokenizer_2
	#flush()

	# Disabled alternative that would use the pipeline's own second encoder and
	# tokenizer (both are None in the pipeline as constructed above).
	#prompt_embeds, pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], prompt, 768)
	#negative_prompt_embeds, negative_pooled_prompt_embeds = encode_prompt([None, pipe.text_encoder_2],[None, pipe.tokenizer_2], negative_prompt, 768)

	# Fixed seed (42) on a CUDA generator, passed to the pipeline call below.
	generator = torch.Generator(device="cuda").manual_seed(42)
	pipe.enable_vae_slicing()
	#pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
	# Run the pipeline from the precomputed embeddings (no raw prompt is passed,
	# since the pipeline was built without text encoders) and keep the first image.
	image = pipe(
	prompt_embeds=prompt_embeds,
	pooled_prompt_embeds=pooled_prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
	num_inference_steps=24,
	guidance_scale=2,
	generator=generator,
	#num_images_per_prompt=2
	).images[0]

	# NOTE(review): `display` is neither defined nor imported in the visible code;
	# presumably this runs in a notebook environment that provides it -- confirm.
	display(image)