from diffusers import FluxPipeline, AutoencoderKL
from diffusers.image_processor import VaeImageProcessor
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import gc


def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


flush()

ckpt_id = "black-forest-labs/FLUX.1-schnell"
prompt = "a photo of a dog with cat-like look"

# Stage 1: load only the text encoders and tokenizers, then encode the prompt.
text_encoder = CLIPTextModel.from_pretrained(
    ckpt_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
)
text_encoder_2 = T5EncoderModel.from_pretrained(
    ckpt_id, subfolder="text_encoder_2", torch_dtype=torch.bfloat16
)
tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")

pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=text_encoder,
    text_encoder_2=text_encoder_2,
    tokenizer=tokenizer,
    tokenizer_2=tokenizer_2,
    transformer=None,
    vae=None,
).to("cuda")

with torch.no_grad():
    print("Encoding prompts.")
    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
        prompt=prompt, prompt_2=None, max_sequence_length=256
    )

# Free the text encoders before loading the transformer.
del text_encoder
del text_encoder_2
del tokenizer
del tokenizer_2
del pipeline
flush()

# Stage 2: load only the transformer and denoise to latents.
pipeline = FluxPipeline.from_pretrained(
    ckpt_id,
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    vae=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

print("Running denoising.")
height, width = 768, 1360
# No need to wrap this in `torch.no_grad()` since the pipeline's call method
# is already decorated with it.
latents = pipeline(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=4,
    guidance_scale=0.0,
    height=height,
    width=width,
    output_type="latent",
).images
print(f"{latents.shape=}")

# Free the transformer before loading the VAE.
del pipeline.transformer
del pipeline
flush()

# Stage 3: load only the VAE and decode the latents into an image.
vae = AutoencoderKL.from_pretrained(
    ckpt_id, revision="refs/pr/1", subfolder="vae", torch_dtype=torch.bfloat16
).to("cuda")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

with torch.no_grad():
    print("Running decoding.")
    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
    image = vae.decode(latents, return_dict=False)[0]
    image = image_processor.postprocess(image, output_type="pil")
    image[0].save("image.png")
Thanks! This is a lot cleaner.
Simpler version that works well on an RTX 4090. Averages 4.5-5 s for the text encoder and 1-1.2 it/s for the denoising model.
from transformers import T5EncoderModel
import time
import gc
import torch
import diffusers


def flush():
    gc.collect()
    torch.cuda.empty_cache()


t5_encoder = T5EncoderModel.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",
    subfolder="text_encoder_2",
    revision="refs/pr/7",
    torch_dtype=torch.bfloat16,
)
text_encoder = diffusers.DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",
    text_encoder_2=t5_encoder,
    transformer=None,
    vae=None,
    revision="refs/pr/7",
)
pipeline = diffusers.DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell",
    torch_dtype=torch.bfloat16,
    revision="refs/pr/1",
    text_encoder_2=None,
    text_encoder=None,
)
pipeline.enable_model_cpu_offload()


@torch.inference_mode()
def inference(self, prompt, num_inference_steps=4, guidance_scale=0.0, width=1024, height=1024):
    self.text_encoder.to("cuda")
    start = time.time()
    (
        prompt_embeds,
        pooled_prompt_embeds,
        _,
    ) = self.text_encoder.encode_prompt(prompt=prompt, prompt_2=None, max_sequence_length=256)
    self.text_encoder.to("cpu")
    flush()
    print(f"Prompt encoding time: {time.time() - start}")
    output = self.pipeline(
        prompt_embeds=prompt_embeds.bfloat16(),
        pooled_prompt_embeds=pooled_prompt_embeds.bfloat16(),
        width=width,
        height=height,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
    )
    image = output.images[0]
    return image
@toilaluan Can you make it work under 16GB VRAM so that it can run on RTX A4000?
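One untested idea for squeezing under 16 GB: on the denoising pipeline, swap the model-level offload for sequential (per-module) CPU offload. It is much slower, but the peak VRAM drops considerably:

# Instead of pipeline.enable_model_cpu_offload():
# submodules are moved to the GPU only while they execute.
pipeline.enable_sequential_cpu_offload()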
@bil-ash Why does inference take a self parameter?
@toilaluan I've been trying to run your example, without success. I keep getting errors such as:
AttributeError: 'NoneType' object has no attribute 'to'
I'm guessing that's because self.text_encoder is None.
Removing the self portions of the code does work, but on my 4090 it pegs the GPU at 100% for both memory and compute, and what takes you seconds takes minutes on my end. Any clue what I'm doing wrong?
Oh and I just added this at the bottom:
if __name__ == "__main__":
    prompt = "A cat holding a sign that says hello world"
    generated_image = inference(pipeline, prompt)
    generated_image.save("output.png")
How can I use a LoRA with the code above? Thanks!
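Not something I've verified, but assuming a diffusers version with Flux LoRA support, one way would be to load the LoRA onto the denoising pipeline (the one that holds the transformer) before calling it; the repo id and file name below are placeholders:

# Placeholder identifiers: substitute your actual Flux LoRA repo/file.
pipeline.load_lora_weights("your-username/your-flux-lora", weight_name="lora.safetensors")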
@toilaluan Your refactor works great. Does anybody know how to get image-to-image working on a 4090? I haven't been able to modify this script successfully.
Same AttributeError ('NoneType' object has no attribute 'to') on my side (Windows 11).