Spaces:

legacymiles
/

halocine-demoz

Paused

App Files Files Community

halocine-demoz / HoloCine_inference_sparse_attention.py

GPTVer8

release

70b136d about 2 months ago

raw

history blame contribute delete

14.6 kB

	import torch
	import math
	from diffsynth import save_video
	from diffsynth.pipelines.wan_video_holocine import WanVideoHoloCinePipeline, ModelConfig

	# ---------------------------------------------------
	# Helper Functions
	# ---------------------------------------------------

	def enforce_4t_plus_1(n: int) -> int:
	    """Snap an integer to the nearest value of the form ``4*t + 1``.

	    This file uses it for the total frame count and for every shot-cut
	    position before they are handed to the pipeline.

	    Values of the form ``4*t + 3`` sit exactly halfway between two
	    candidates; they are always rounded *up* (3 -> 5, 7 -> 9, 11 -> 13).
	    The computation is pure integer arithmetic, so it does not inherit the
	    round-half-to-even behaviour of the built-in ``round()``, which would
	    send 3 down to 1 but 7 up to 9.

	    Args:
	        n: The requested frame number / frame count.

	    Returns:
	        The closest integer ``4*t + 1`` (ties resolved upwards).
	    """
	    # floor((n - 1) / 4 + 1/2) == (n + 1) // 4  -> round-half-up without floats.
	    t = (n + 1) // 4
	    return 4 * t + 1

	def prepare_multishot_inputs(
	    global_caption: str,
	    shot_captions: list[str],
	    total_frames: int,
	    custom_shot_cut_frames: list[int] = None
	) -> dict:
	    """
	    (Helper for Mode 1)
	    Prepares the inference parameters from user-friendly segmented inputs.

	    Args:
	        global_caption: Scene-level description. If it does not already
	            contain the text "This scene contains", a sentence stating the
	            number of shots is appended.
	        shot_captions: One caption per shot, in playback order.
	        total_frames: Requested clip length; snapped to the 4t+1 form.
	        custom_shot_cut_frames: Optional cut positions. When omitted (or
	            empty) the cuts are spread evenly across the clip.

	    Returns:
	        A dict with the keys ``"prompt"``, ``"shot_cut_frames"`` (sorted,
	        unique, strictly between 0 and the frame count) and ``"num_frames"``.
	    """

	    num_shots = len(shot_captions)

	    # 1. Prepare 'prompt'
	    if "This scene contains" not in global_caption:
	        global_caption = global_caption.strip() + f" This scene contains {num_shots} shots."
	    per_shot_string = " [shot cut] ".join(shot_captions)
	    prompt = f"[global caption] {global_caption} [per shot caption] {per_shot_string}"

	    # 2. Prepare 'num_frames'
	    processed_total_frames = enforce_4t_plus_1(total_frames)

	    # 3. Prepare 'shot_cut_frames'
	    # N shots are separated by N-1 cuts; never report a negative count.
	    num_cuts = max(num_shots - 1, 0)

	    if custom_shot_cut_frames:
	        # User provided custom cuts
	        print(f"Using {len(custom_shot_cut_frames)} user-defined shot cuts (enforcing 4t+1).")
	        raw_cuts = [enforce_4t_plus_1(frame) for frame in custom_shot_cut_frames]
	    else:
	        # Auto-calculate cuts
	        print(f"Auto-calculating {num_cuts} shot cuts.")
	        if num_cuts > 0:
	            ideal_step = processed_total_frames / num_shots
	            raw_cuts = [
	                enforce_4t_plus_1(round(i * ideal_step))
	                for i in range(1, num_shots)
	            ]
	        else:
	            raw_cuts = []

	    # Snapping can map neighbouring cuts onto the same frame or push them out
	    # of range, so de-duplicate, order, and keep only interior positions.
	    processed_shot_cuts = [
	        f for f in sorted(set(raw_cuts)) if 0 < f < processed_total_frames
	    ]

	    # Make a cut/caption mismatch visible instead of silently passing it on.
	    if len(processed_shot_cuts) != num_cuts:
	        print(
	            f"Warning: {num_shots} shot captions need {num_cuts} cuts, but "
	            f"{len(processed_shot_cuts)} valid cuts remain: {processed_shot_cuts}"
	        )

	    return {
	        "prompt": prompt,
	        "shot_cut_frames": processed_shot_cuts,
	        "num_frames": processed_total_frames
	    }

	# ---------------------------------------------------
	#
	# ✨ Main Inference Wrapper ✨
	#
	# ---------------------------------------------------

	def run_inference(
	    pipe: WanVideoHoloCinePipeline,
	    output_path: str,

	    # --- Prompting Options (Auto-detect) ---
	    global_caption: str = None,
	    shot_captions: list[str] = None,
	    prompt: str = None,
	    negative_prompt: str = None,

	    # --- Core Generation Parameters (All Optional) ---
	    num_frames: int = None,
	    shot_cut_frames: list[int] = None,

	    # --- Other Generation Parameters ---
	    seed: int = 0,
	    tiled: bool = True,
	    height: int = 480,
	    width: int = 832,
	    num_inference_steps: int = 50,

	    # --- Output Parameters ---
	    fps: int = 15,
	    quality: int = 5
	):
	    """
	    Runs the inference pipeline, auto-detecting the input mode
	    and honoring pipeline defaults for optional parameters.

	    Mode 1 (Structured): Provide 'global_caption', 'shot_captions', 'num_frames'.
	    'shot_cut_frames' is optional (auto-calculated).
	    Mode 2 (Raw): Provide 'prompt'.
	    'num_frames' and 'shot_cut_frames' are optional.

	    If both a structured caption pair and a raw 'prompt' are supplied,
	    Mode 1 wins and 'prompt' is not used.

	    Args:
	        pipe: Callable pipeline; invoked as ``pipe(**kwargs)``.
	        output_path: Destination passed to ``save_video``.
	        global_caption, shot_captions: Mode 1 inputs.
	        prompt: Mode 2 input (already formatted prompt string).
	        negative_prompt: Forwarded to the pipeline when not None.
	        num_frames: Requested clip length; snapped to the 4t+1 form.
	        shot_cut_frames: Cut positions; snapped to 4t+1, then sorted,
	            de-duplicated and restricted to positive frames below
	            'num_frames' (upper bound only applied when 'num_frames' is given).
	        seed, tiled, height, width, num_inference_steps: Forwarded to the
	            pipeline unchanged.
	        fps, quality: Forwarded to ``save_video``.

	    Raises:
	        ValueError: If neither input mode is satisfied, or Mode 1 is used
	            without 'num_frames'.
	    """

	    # --- 1. Prepare 'pipe_kwargs' dictionary ---
	    pipe_kwargs = {
	        "negative_prompt": negative_prompt,
	        "seed": seed,
	        "tiled": tiled,
	        "height": height,
	        "width": width,
	        "num_inference_steps": num_inference_steps
	    }

	    # --- 2. Auto-Detection Logic ---
	    if global_caption and shot_captions:
	        # --- Mode 1: Structured Input ---
	        print("--- Detected Structured Input (Mode 1) ---")
	        if num_frames is None:
	            raise ValueError("Must provide 'num_frames' for structured input (Mode 1).")

	        # The helper returns 'prompt', 'shot_cut_frames' and 'num_frames'.
	        pipe_kwargs.update(
	            prepare_multishot_inputs(
	                global_caption=global_caption,
	                shot_captions=shot_captions,
	                total_frames=num_frames,
	                custom_shot_cut_frames=shot_cut_frames
	            )
	        )

	    elif prompt:
	        # --- Mode 2: Raw String Input ---
	        print("--- Detected Raw String Input (Mode 2) ---")
	        pipe_kwargs["prompt"] = prompt

	        # Process num_frames ONLY if provided
	        processed_frames = None
	        if num_frames is not None:
	            processed_frames = enforce_4t_plus_1(num_frames)
	            if num_frames != processed_frames:
	                print(f"Corrected 'num_frames': {num_frames} -> {processed_frames}")
	        else:
	            print("Using default 'num_frames' (if any).")
	        pipe_kwargs["num_frames"] = processed_frames

	        # Process shot_cut_frames ONLY if provided
	        processed_cuts = None
	        if shot_cut_frames is not None:
	            # Same clean-up Mode 1 applies: snap, de-duplicate, sort, and
	            # drop cuts outside the clip. Without an explicit 'num_frames'
	            # the pipeline default is unknown here, so only the lower
	            # bound can be enforced.
	            snapped = sorted({enforce_4t_plus_1(f) for f in shot_cut_frames})
	            processed_cuts = [
	                f for f in snapped
	                if f > 0 and (processed_frames is None or f < processed_frames)
	            ]
	            # list(...) so that a tuple of already-valid cuts is not
	            # reported as "corrected".
	            if list(shot_cut_frames) != processed_cuts:
	                print(f"Corrected 'shot_cut_frames': {shot_cut_frames} -> {processed_cuts}")
	        else:
	            print("Using default 'shot_cut_frames' (if any).")
	        pipe_kwargs["shot_cut_frames"] = processed_cuts

	    else:
	        raise ValueError("Invalid inputs. Provide either (global_caption, shot_captions, num_frames) OR (prompt).")

	    # --- 3. Filter out None values before calling pipe ---
	    # This ensures we don't pass 'num_frames=None' and override a
	    # default value (e.g., num_frames=25) inside the pipeline.
	    # ('prompt' is always present here: both accepted modes set it and the
	    # remaining case has already raised above.)
	    final_pipe_kwargs = {k: v for k, v in pipe_kwargs.items() if v is not None}

	    # --- 4. Run Generation ---
	    print("Running inference...")
	    if "num_frames" in final_pipe_kwargs:
	        print(f" Total frames: {final_pipe_kwargs['num_frames']}")
	    if "shot_cut_frames" in final_pipe_kwargs:
	        print(f" Cuts: {final_pipe_kwargs['shot_cut_frames']}")

	    video = pipe(**final_pipe_kwargs)

	    save_video(video, output_path, fps=fps, quality=quality)
	    print(f"Video saved successfully to {output_path}")


	# ---------------------------------------------------
	#
	# Script Execution
	#
	# ---------------------------------------------------

	# --- 1. Load Model (Done once) ---
	# Build the pipeline once; every checkpoint listed below is registered with
	# CPU offloading, in this exact order.
	device = "cuda"
	pipe = WanVideoHoloCinePipeline.from_pretrained(
	    torch_dtype=torch.bfloat16,
	    device=device,
	    model_configs=[
	        ModelConfig(path=checkpoint_path, offload_device="cpu")
	        for checkpoint_path in (
	            "./checkpoints/Wan2.2-T2V-A14B//Wan2.2-T2V-A14B/models_t5_umt5-xxl-enc-bf16.pth",
	            "./checkpoints/HoloCine_dit/sparse/sparse_high_noise.safetensors",
	            "./checkpoints/HoloCine_dit/sparse/sparse_low_noise.safetensors",
	            "./checkpoints/Wan2.2-T2V-A14B/Wan2.2-T2V-A14B/Wan2.1_VAE.pth",
	        )
	    ],
	)

	# Switch both DiT models to the sparse self-attention path.
	pipe.dit.use_sparse_self_attn = True
	pipe.dit2.use_sparse_self_attn = True

	pipe.enable_vram_management()
	pipe.to(device)

	# --- 2. Define Common Parameters ---
	# Negative prompt shared by every example call in this script.
	# Rough English gloss of the Chinese text (the literal itself is what is
	# passed on, so it must stay as-is): garish colours, overexposed, static,
	# blurry details, subtitles, style, artwork, painting, picture, still,
	# overall greyish, worst quality, low quality, JPEG compression artifacts,
	# ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn faces,
	# deformed, disfigured, malformed limbs, fused fingers, motionless frame,
	# cluttered background, three legs, crowded background, walking backwards.
	scene_negative_prompt = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"


	# ===================================================================
	# ✨ How to Use ✨
	# ===================================================================

	# --- Example 1: Call using Structured Input (Choice 1) ---
	# (Auto-calculates shot cuts)
	# Mode 1 demo: run_inference assembles the prompt string from the global
	# caption plus the five per-shot captions listed below.
	print("\n--- Running Example 1 (Structured Input) ---")
	run_inference(
	pipe=pipe,
	negative_prompt=scene_negative_prompt,
	output_path="video1.mp4",

	# Choice 1 inputs
	# The caption already contains "This scene contains", so
	# prepare_multishot_inputs does not append its own shot-count sentence.
	global_caption="The scene is set in a lavish, 1920s Art Deco ballroom during a masquerade party. [character1] is a mysterious woman with a sleek bob, wearing a sequined silver dress and an ornate feather mask. [character2] is a dapper gentleman in a black tuxedo, his face half-hidden by a simple black domino mask. The environment is filled with champagne fountains, a live jazz band, and dancing couples in extravagant costumes. This scene contains 5 shots.",
	shot_captions=[
	"Medium shot of [character1] standing by a pillar, observing the crowd, a champagne flute in her hand.",
	"Close-up of [character2] watching her from across the room, a look of intrigue on his visible features.",
	"Medium shot as [character2] navigates the crowd and approaches [character1], offering a polite bow.",
	"Close-up on [character1]'s eyes through her mask, as they crinkle in a subtle, amused smile.",
	"A stylish medium two-shot of them standing together, the swirling party out of focus behind them, as they begin to converse."

	],
	# 241 = 4*60 + 1, i.e. already of the 4t+1 form. No shot_cut_frames are
	# passed, so the cut positions are auto-calculated from the shot count.
	num_frames=241
	)


	# --- Example 2: Call using Raw String Input (Choice 2) ---
	# (Uses your original prompt format)
	# Mode 2 demo: the fully formatted prompt is supplied directly, using the
	# "[global caption] ... [per shot caption] ... [shot cut] ..." layout.
	print("\n--- Running Example 2 (Raw String Input) ---")

	run_inference(
	pipe=pipe,
	negative_prompt=scene_negative_prompt,
	output_path="video2.mp4",

	# Choice 2 inputs
	# The prompt describes 6 shots, separated by 5 "[shot cut]" markers.
	prompt="[global caption] The scene features a young painter, [character1], with paint-smudged cheeks and intense, focused eyes. Her hair is tied up messily. The setting is a bright, sun-drenched art studio with large windows, canvases, and the smell of oil paint. This scene contains 6 shots. [per shot caption] Medium shot of [character1] standing back from a large canvas, brush in hand, critically observing her work. [shot cut] Close-up of her hand holding the brush, dabbing it thoughtfully onto a palette of vibrant colors. [shot cut] Extreme close-up of her eyes, narrowed in concentration as she studies the canvas. [shot cut] Close-up on the canvas, showing a detailed, textured brushstroke being slowly applied. [shot cut] Medium close-up of [character1]'s face, a small, satisfied smile appears as she finds the right color. [shot cut] Over-the-shoulder shot showing her add a final, delicate highlight to the painting.",
	# 241 and all five cut frames are already of the 4t+1 form, so
	# run_inference prints no "Corrected ..." message for them.
	num_frames=241,
	shot_cut_frames=[37, 73, 113, 169, 205]
	)



	# # We provide more samples for testing; uncomment any of them to try it out.

	# run_inference(
	# pipe=pipe,
	# negative_prompt=scene_negative_prompt,
	# output_path="video3.mp4",

	# # Choice 2 inputs
	# prompt="[global caption] The scene is a magical encounter in a misty, ancient Celtic ruin at dawn. [character1] is a modern-day historian, a skeptical woman with practical hiking gear and a camera. [character2] is the spectral figure of an ancient Celtic queen, translucent and ethereal, with long, flowing red hair and a silver circlet. The environment is comprised of mossy standing stones and rolling green hills shrouded in morning mist. This scene contains 5 shots. [per shot caption] Medium shot of [character1] carefully touching a moss-covered standing stone, a look of academic interest on her face. [shot cut] Close-up of her face, her expression changing to one of utter shock as she sees something off-camera. [shot cut] A soft-focus shot of [character2] slowly materializing from the mist between two stones. [shot cut] Medium shot of [character1] stumbling backward, lowering her camera, her skepticism completely shattered. [shot cut] Close-up of [character2]'s spectral face, her expression sad and timeless as she looks at the historian.",
	# num_frames=241,
	# shot_cut_frames=[49, 93, 137, 189]
	# )

	# run_inference(
	# pipe=pipe,
	# negative_prompt=scene_negative_prompt,
	# output_path="video4.mp4",

	# # Choice 2 inputs
	# prompt="[global caption] The scene is a magical encounter in a misty, ancient Celtic ruin at dawn. [character1] is a modern-day historian, a skeptical woman with practical hiking gear and a camera. [character2] is the spectral figure of an ancient Celtic queen, translucent and ethereal, with long, flowing red hair and a silver circlet. The environment is comprised of mossy standing stones and rolling green hills shrouded in morning mist. This scene contains 5 shots. [per shot caption] Medium shot of [character1] carefully touching a moss-covered standing stone, a look of academic interest on her face. [shot cut] Close-up of her face, her expression changing to one of utter shock as she sees something off-camera. [shot cut] A soft-focus shot of [character2] slowly materializing from the mist between two stones. [shot cut] Medium shot of [character1] stumbling backward, lowering her camera, her skepticism completely shattered. [shot cut] Close-up of [character2]'s spectral face, her expression sad and timeless as she looks at the historian.",
	# num_frames=241,
	# shot_cut_frames=[49, 93, 137, 189],
	# )


	# run_inference(
	# pipe=pipe,
	# negative_prompt=scene_negative_prompt,
	# output_path="video5.mp4",

	# # Choice 2 inputs
	# prompt="[global caption] The scene is set in an enchanted, bioluminescent forest at twilight. [character1] is an ancient elf with long, silver hair braided with glowing flowers, wearing ethereal white robes. [character2] is a lost human child with short, messy brown hair and wide, fearful eyes, clutching a wooden toy. The environment is filled with giant, glowing mushrooms, sparkling flora, and shafts of moonlight breaking through a thick canopy. This scene contains 5 shots. [per shot caption] Medium shot of [character2] hiding behind a large, glowing mushroom, peering out nervously. [shot cut] Close-up of [character1]'s hand, fingers adorned with delicate rings, gently touching a luminous plant, causing it to glow brighter. [shot cut] Medium shot of [character1] turning their head, their pointed ears catching the faint sound of the child's whimper. [shot cut] Close-up of [character2]'s face, a tear rolling down their cheek, illuminated by the blue light of the forest. [shot cut] A soft-focus shot from the child's perspective, showing [character1] approaching slowly with a kind, reassuring smile, their form haloed by the forest's light.",
	# num_frames=241,
	# shot_cut_frames=[49, 93, 137, 189],
	# )