LTX2 + single-file transformer + distilled LoRA + stage-2 upscale/decode
import os
import torch
import gc
import numpy as np
from diffusers import (
    LTX2Pipeline,
    LTX2VideoTransformer3DModel,
    GGUFQuantizationConfig,
    LTX2LatentUpsamplePipeline,
    FlowMatchEulerDiscreteScheduler,
)
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.utils import export_to_video
from diffusers.pipelines.ltx2.export_utils import encode_video
# -----------------------------
# CONFIGURATION
# -----------------------------
# Unsloth's Q4_K_M version (~11.5GB)
# This fits easily in 32GB RAM and 24GB VRAM
GGUF_URL = "https://huggingface.co/unsloth/LTX-2-GGUF/blob/main/ltx-2-19b-dev-Q4_K_M.gguf"
OFFICIAL_REPO = "Lightricks/LTX-2"
DEVICE = "cuda"
DTYPE = torch.bfloat16

PROMPT = "A beautiful sunset over the ocean, cinematic, 4k, highly detailed, photorealistic"
NEGATIVE_PROMPT = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."

# Base Resolution
WIDTH = 768
HEIGHT = 512
FRAME_RATE = 24.0
NUM_FRAMES = 121
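# Note (assumption about LTX-family frame constraints): the VAE compresses
# time 8x, so num_frames should satisfy num_frames % 8 == 1.
# 121 = 15 * 8 + 1, i.e. roughly 5 seconds at 24 fps.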
# -----------------------------
# 1. LOAD GGUF TRANSFORMER
# -----------------------------
print("--- Step 1: Loading Unsloth GGUF Transformer ---")

# Configure GGUF to load in bfloat16 to match the pipeline
quantization_config = GGUFQuantizationConfig(compute_dtype=DTYPE)

# Load the single-file transformer (auto-downloads if needed)
transformer = LTX2VideoTransformer3DModel.from_single_file(
    GGUF_URL,
    config=OFFICIAL_REPO,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=DTYPE,
)
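
# Optional sanity check (a minimal sketch, not part of the original gist):
# confirm a CUDA device is visible and report its total VRAM before the
# full pipeline is built.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}, {props.total_memory / 1024**3:.1f} GB VRAM")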
# -----------------------------
# 2. SETUP PIPELINE
# -----------------------------
print("--- Step 2: Setting up Base Pipeline ---")

pipe = LTX2Pipeline.from_pretrained(
    OFFICIAL_REPO,
    transformer=transformer,  # Inject the GGUF transformer
    torch_dtype=DTYPE,
)
pipe.enable_model_cpu_offload(device=DEVICE)
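# Note: enable_model_cpu_offload() keeps only the currently active sub-model
# on the GPU and parks the rest in system RAM. If this still OOMs on smaller
# cards, pipe.enable_sequential_cpu_offload() is a slower but more aggressive
# fallback that offloads at the submodule level.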
# -----------------------------
# 3. STAGE 1: BASE LATENTS
# -----------------------------
print("--- Step 3: Generating Base Latents (Stage 1) ---")

video_latent, audio_latent = pipe(
    prompt=PROMPT,
    negative_prompt=NEGATIVE_PROMPT,
    width=WIDTH,
    height=HEIGHT,
    num_frames=NUM_FRAMES,
    frame_rate=FRAME_RATE,
    num_inference_steps=40,  # Standard step count for the base pass
    guidance_scale=4.0,
    output_type="latent",
    return_dict=False,
)

# Clear VRAM for the upscaler
torch.cuda.empty_cache()
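# Optional extra cleanup (mirrors the pattern used after the upscaler below):
# collecting Python garbage drops dangling references to intermediate tensors
# so their memory can actually be reclaimed.
gc.collect()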
# -----------------------------
# 4. UPSCALING
# -----------------------------
print("--- Step 4: Upscaling Latents ---")

# Load the latent upsampler
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    OFFICIAL_REPO,
    subfolder="latent_upsampler",
    torch_dtype=DTYPE,
)
upsample_pipe = LTX2LatentUpsamplePipeline(
    vae=pipe.vae,
    latent_upsampler=latent_upsampler,
)
upsample_pipe.enable_model_cpu_offload(device=DEVICE)

# Run the upscale
upscaled_video_latent = upsample_pipe(
    latents=video_latent,
    output_type="latent",
    return_dict=False,
)[0]

# Clean up the upscaler to free memory for refinement
del upsample_pipe
del latent_upsampler
gc.collect()
torch.cuda.empty_cache()
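# The latent upsampler doubles the spatial resolution of the video latents,
# which is why the Stage 2 refinement below renders at WIDTH * 2 x HEIGHT * 2
# (1536 x 1024).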
# -----------------------------
# 5. STAGE 2: DISTILLED REFINEMENT
# -----------------------------
print("--- Step 5: Loading Distilled LoRA (Stage 2) ---")

# Load the distilled LoRA weights onto the main pipe
pipe.load_lora_weights(
    OFFICIAL_REPO,
    adapter_name="stage_2_distilled",
    weight_name="ltx-2-19b-distilled-lora-384.safetensors",
)
pipe.set_adapters("stage_2_distilled", 1.0)

# ⚠️ CRITICAL FOR 24GB VRAM: enable tiling.
# Without this, decoding the high-res video will OOM the GPU.
pipe.vae.enable_tiling()
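# Tiled decoding splits the video into overlapping spatial tiles, decodes them
# one at a time, and blends the seams: a small speed cost for a large drop in
# peak VRAM.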
# Swap the scheduler for the distilled sigma schedule
new_scheduler = FlowMatchEulerDiscreteScheduler.from_config(
    pipe.scheduler.config,
    use_dynamic_shifting=False,
    shift_terminal=None,
)
pipe.scheduler = new_scheduler
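# With dynamic shifting and terminal shift disabled, the scheduler uses the
# fixed STAGE_2_DISTILLED_SIGMA_VALUES passed below as-is instead of reshaping
# the noise schedule at runtime.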
| print("--- Step 6: Running Refinement ---") | |
| video, audio = pipe( | |
| latents=upscaled_video_latent, | |
| audio_latents=audio_latent, | |
| prompt=PROMPT, | |
| negative_prompt=NEGATIVE_PROMPT, | |
| width=WIDTH * 2, # 1536 | |
| height=HEIGHT * 2, # 1024 | |
| num_inference_steps=3, # Distilled needs very few steps | |
| sigmas=STAGE_2_DISTILLED_SIGMA_VALUES, | |
| guidance_scale=1.0, # Guidance 1.0 is mandatory for distilled | |
| output_type="np", | |
| return_dict=False, | |
| ) | |
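# With output_type="np" the video is assumed to come back as a float array in
# [0, 1] with layout (batch, frames, height, width, channels); the export
# block below scales it to uint8 and drops the batch axis.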
# -----------------------------
# 6. EXPORT
# -----------------------------
print("--- Saving Video ---")

video = (video * 255).round().astype("uint8")
video_tensor = torch.from_numpy(video)[0]

# Write to C:\tmp to guarantee write permissions (or point this at the
# current .blend directory when running inside Blender)
output_filename = r"C:\tmp\ltx2_unsloth_distilled.mp4"
os.makedirs(r"C:\tmp", exist_ok=True)
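# Hypothetical cross-platform alternative (not part of the original script;
# requires "import tempfile" at the top of the file):
# output_filename = os.path.join(tempfile.gettempdir(), "ltx2_unsloth_distilled.mp4")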
# Mux in the audio track if one was generated
if audio is not None and len(audio) > 0:
    encode_video(
        video_tensor,
        fps=FRAME_RATE,
        audio=audio[0].float().cpu(),
        audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
        output_path=output_filename,
    )
else:
    export_to_video(video_tensor, output_filename, fps=FRAME_RATE)

print(f"DONE! Saved to {os.path.abspath(output_filename)}")