LTX2 + single-file transformer + distilled LoRA + stage-2 upscale/decode
import os
import torch
import gc
import numpy as np
from diffusers import (
    LTX2Pipeline,
    LTX2VideoTransformer3DModel,
    GGUFQuantizationConfig,
    LTX2LatentUpsamplePipeline,
    FlowMatchEulerDiscreteScheduler,
)
from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.utils import STAGE_2_DISTILLED_SIGMA_VALUES
from diffusers.utils import export_to_video
from diffusers.pipelines.ltx2.export_utils import encode_video
# -----------------------------
# CONFIGURATION
# -----------------------------
# Unsloth's Q4_K_M version (~11.5GB)
# This fits easily in 32GB RAM and 24GB VRAM
GGUF_URL = "https://huggingface.co/unsloth/LTX-2-GGUF/blob/main/ltx-2-19b-dev-Q4_K_M.gguf"
OFFICIAL_REPO = "Lightricks/LTX-2"
DEVICE = "cuda"
DTYPE = torch.bfloat16

PROMPT = "A beautiful sunset over the ocean, cinematic, 4k, highly detailed, photorealistic"
NEGATIVE_PROMPT = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."

# Base Resolution
WIDTH = 768
HEIGHT = 512
FRAME_RATE = 24.0
NUM_FRAMES = 121
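# Note (assumption about LTX-family frame constraints): the VAE compresses
# time 8x, so num_frames should satisfy num_frames % 8 == 1.
# 121 = 15 * 8 + 1, i.e. roughly 5 seconds at 24 fps.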
# -----------------------------
# 1. LOAD GGUF TRANSFORMER
# -----------------------------
print("--- Step 1: Loading Unsloth GGUF Transformer ---")

# Configure GGUF to load in bfloat16 to match the pipeline
quantization_config = GGUFQuantizationConfig(compute_dtype=DTYPE)

# Load the single-file transformer (auto-downloads if needed)
transformer = LTX2VideoTransformer3DModel.from_single_file(
    GGUF_URL,
    config=OFFICIAL_REPO,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=DTYPE,
)
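
# Optional sanity check (a minimal sketch, not part of the original gist):
# confirm a CUDA device is visible and report its total VRAM before the
# full pipeline is built.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}, {props.total_memory / 1024**3:.1f} GB VRAM")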
# -----------------------------
# 2. SETUP PIPELINE
# -----------------------------
print("--- Step 2: Setting up Base Pipeline ---")

pipe = LTX2Pipeline.from_pretrained(
    OFFICIAL_REPO,
    transformer=transformer,  # Inject the GGUF transformer
    torch_dtype=DTYPE,
)
pipe.enable_model_cpu_offload(device=DEVICE)
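# Note: enable_model_cpu_offload() keeps only the currently active sub-model
# on the GPU and parks the rest in system RAM. If this still OOMs on smaller
# cards, pipe.enable_sequential_cpu_offload() is a slower but more aggressive
# fallback that offloads at the submodule level.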
# -----------------------------
# 3. STAGE 1: BASE LATENTS
# -----------------------------
print("--- Step 3: Generating Base Latents (Stage 1) ---")

video_latent, audio_latent = pipe(
    prompt=PROMPT,
    negative_prompt=NEGATIVE_PROMPT,
    width=WIDTH,
    height=HEIGHT,
    num_frames=NUM_FRAMES,
    frame_rate=FRAME_RATE,
    num_inference_steps=40,  # Standard step count for the base pass
    guidance_scale=4.0,
    output_type="latent",
    return_dict=False,
)

# Clear VRAM for the upscaler
torch.cuda.empty_cache()
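# Optional extra cleanup (mirrors the pattern used after the upscaler below):
# collecting Python garbage drops dangling references to intermediate tensors
# so their memory can actually be reclaimed.
gc.collect()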
# -----------------------------
# 4. UPSCALING
# -----------------------------
print("--- Step 4: Upscaling Latents ---")

# Load the latent upsampler
latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
    OFFICIAL_REPO,
    subfolder="latent_upsampler",
    torch_dtype=DTYPE,
)
upsample_pipe = LTX2LatentUpsamplePipeline(
    vae=pipe.vae,
    latent_upsampler=latent_upsampler,
)
upsample_pipe.enable_model_cpu_offload(device=DEVICE)

# Run the upscale
upscaled_video_latent = upsample_pipe(
    latents=video_latent,
    output_type="latent",
    return_dict=False,
)[0]

# Clean up the upscaler to free memory for refinement
del upsample_pipe
del latent_upsampler
gc.collect()
torch.cuda.empty_cache()
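# The latent upsampler doubles the spatial resolution of the video latents,
# which is why the Stage 2 refinement below renders at WIDTH * 2 x HEIGHT * 2
# (1536 x 1024).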
# -----------------------------
# 5. STAGE 2: DISTILLED REFINEMENT
# -----------------------------
print("--- Step 5: Loading Distilled LoRA (Stage 2) ---")

# Load the distilled LoRA weights onto the main pipe
pipe.load_lora_weights(
    OFFICIAL_REPO,
    adapter_name="stage_2_distilled",
    weight_name="ltx-2-19b-distilled-lora-384.safetensors",
)
pipe.set_adapters("stage_2_distilled", 1.0)

# ⚠️ CRITICAL FOR 24GB VRAM: enable tiling.
# Without this, decoding the high-res video will OOM the GPU.
pipe.vae.enable_tiling()
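# Tiled decoding splits the video into overlapping spatial tiles, decodes them
# one at a time, and blends the seams: a small speed cost for a large drop in
# peak VRAM.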
# Swap the scheduler for the distilled sigma schedule
new_scheduler = FlowMatchEulerDiscreteScheduler.from_config(
    pipe.scheduler.config,
    use_dynamic_shifting=False,
    shift_terminal=None,
)
pipe.scheduler = new_scheduler
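# With dynamic shifting and terminal shift disabled, the scheduler uses the
# fixed STAGE_2_DISTILLED_SIGMA_VALUES passed below as-is instead of reshaping
# the noise schedule at runtime.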
| print("--- Step 6: Running Refinement ---") | |
| video, audio = pipe( | |
| latents=upscaled_video_latent, | |
| audio_latents=audio_latent, | |
| prompt=PROMPT, | |
| negative_prompt=NEGATIVE_PROMPT, | |
| width=WIDTH * 2, # 1536 | |
| height=HEIGHT * 2, # 1024 | |
| num_inference_steps=3, # Distilled needs very few steps | |
| sigmas=STAGE_2_DISTILLED_SIGMA_VALUES, | |
| guidance_scale=1.0, # Guidance 1.0 is mandatory for distilled | |
| output_type="np", | |
| return_dict=False, | |
| ) | |
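# With output_type="np" the video is assumed to come back as a float array in
# [0, 1] with layout (batch, frames, height, width, channels); the export
# block below scales it to uint8 and drops the batch axis.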
# -----------------------------
# 6. EXPORT
# -----------------------------
print("--- Saving Video ---")

video = (video * 255).round().astype("uint8")
video_tensor = torch.from_numpy(video)[0]

# Write to C:\tmp to guarantee write permissions (or point this at the
# current .blend directory when running inside Blender)
output_filename = r"C:\tmp\ltx2_unsloth_distilled.mp4"
os.makedirs(r"C:\tmp", exist_ok=True)
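# Hypothetical cross-platform alternative (not part of the original script;
# requires "import tempfile" at the top of the file):
# output_filename = os.path.join(tempfile.gettempdir(), "ltx2_unsloth_distilled.mp4")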
# Mux in the audio track if one was generated
if audio is not None and len(audio) > 0:
    encode_video(
        video_tensor,
        fps=FRAME_RATE,
        audio=audio[0].float().cpu(),
        audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
        output_path=output_filename,
    )
else:
    export_to_video(video_tensor, output_filename, fps=FRAME_RATE)

print(f"DONE! Saved to {os.path.abspath(output_filename)}")