Spaces:
Paused
Paused
File size: 14,625 Bytes
13f6c48 70b136d 13f6c48 563db04 13f6c48 18afc7f 13f6c48 f5bf6fb 13f6c48 f5bf6fb 13f6c48 f5bf6fb 13f6c48 f5bf6fb 13f6c48 f5bf6fb 13f6c48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
import torch
import math
from diffsynth import save_video
from diffsynth.pipelines.wan_video_holocine import WanVideoHoloCinePipeline, ModelConfig
# ---------------------------------------------------
# Helper Functions
# ---------------------------------------------------
def enforce_4t_plus_1(n: int) -> int:
    """Snap an integer to the nearest value of the form 4*t + 1.

    The video pipeline requires frame counts and cut indices of the form
    4t + 1 (a constraint of the temporal VAE's 4x compression), so arbitrary
    user-supplied values are rounded to the closest valid one.

    Args:
        n: Desired frame count or frame index.

    Returns:
        The closest integer of the form 4*t + 1 with t >= 0 (i.e. always
        >= 1), so non-positive inputs can never produce an invalid
        zero/negative frame count.

    >>> enforce_4t_plus_1(241)
    241
    >>> enforce_4t_plus_1(240)
    241
    >>> enforce_4t_plus_1(0)
    1
    """
    # round() uses banker's rounding for exact .5 ties, which is fine here:
    # any nearby 4t+1 value is acceptable. max(..., 0) clamps n <= 0.
    t = max(round((n - 1) / 4), 0)
    return 4 * t + 1
def prepare_multishot_inputs(
    global_caption: str,
    shot_captions: list[str],
    total_frames: int,
    custom_shot_cut_frames: list[int] | None = None
) -> dict:
    """
    (Helper for Mode 1)
    Prepares the inference parameters from user-friendly segmented inputs.

    Args:
        global_caption: Scene-level description. A "This scene contains
            N shots." sentence is appended automatically if missing.
        shot_captions: One caption per shot; joined with " [shot cut] ".
        total_frames: Requested total frame count (snapped to 4t+1 form).
        custom_shot_cut_frames: Optional explicit cut positions. When
            omitted or empty, cuts are spaced evenly across the video.

    Returns:
        dict with keys "prompt", "shot_cut_frames" and "num_frames",
        ready to be merged into the pipeline's keyword arguments.
    """
    num_shots = len(shot_captions)

    # 1. Build the combined prompt string in the format the pipeline expects.
    if "This scene contains" not in global_caption:
        global_caption = global_caption.strip() + f" This scene contains {num_shots} shots."
    per_shot_string = " [shot cut] ".join(shot_captions)
    prompt = f"[global caption] {global_caption} [per shot caption] {per_shot_string}"

    # 2. Snap the total frame count to the required 4t+1 form.
    processed_total_frames = enforce_4t_plus_1(total_frames)

    # 3. Determine shot-cut frame indices.
    num_cuts = num_shots - 1
    processed_shot_cuts = []
    if custom_shot_cut_frames:
        # User provided custom cuts
        print(f"Using {len(custom_shot_cut_frames)} user-defined shot cuts (enforcing 4t+1).")
        if len(custom_shot_cut_frames) != num_cuts:
            # N shots need exactly N-1 cuts; warn but proceed with what was given.
            print(f"Warning: expected {num_cuts} shot cuts for {num_shots} shots, "
                  f"got {len(custom_shot_cut_frames)}.")
        processed_shot_cuts = [enforce_4t_plus_1(frame) for frame in custom_shot_cut_frames]
    else:
        # Auto-calculate cuts
        print(f"Auto-calculating {num_cuts} shot cuts.")
        if num_cuts > 0:
            ideal_step = processed_total_frames / num_shots
            for i in range(1, num_shots):
                processed_shot_cuts.append(enforce_4t_plus_1(round(i * ideal_step)))

    # De-duplicate, sort, and drop any cut outside the open interval
    # (0, total_frames) — a cut at frame 0 or at/after the end is meaningless.
    processed_shot_cuts = sorted(set(processed_shot_cuts))
    processed_shot_cuts = [f for f in processed_shot_cuts if 0 < f < processed_total_frames]

    return {
        "prompt": prompt,
        "shot_cut_frames": processed_shot_cuts,
        "num_frames": processed_total_frames
    }
# ---------------------------------------------------
#
# ✨ Main Inference Wrapper ✨
#
# ---------------------------------------------------
def run_inference(
    pipe: WanVideoHoloCinePipeline,
    output_path: str,
    # --- Prompting Options (Auto-detect) ---
    global_caption: str = None,
    shot_captions: list[str] = None,
    prompt: str = None,
    negative_prompt: str = None,
    # --- Core Generation Parameters (All Optional) ---
    num_frames: int = None,
    shot_cut_frames: list[int] = None,
    # --- Other Generation Parameters ---
    seed: int = 0,
    tiled: bool = True,
    height: int = 480,
    width: int = 832,
    num_inference_steps: int = 50,
    # --- Output Parameters ---
    fps: int = 15,
    quality: int = 5
):
    """
    Run the inference pipeline, auto-detecting the input mode and honoring
    pipeline defaults for any optional parameter left as None.

    Mode 1 (Structured): provide 'global_caption', 'shot_captions' and
        'num_frames'. 'shot_cut_frames' is optional (auto-calculated).
    Mode 2 (Raw): provide a pre-formatted 'prompt'. 'num_frames' and
        'shot_cut_frames' are optional; omitted values fall back to the
        pipeline's own defaults.

    Args:
        pipe: Loaded WanVideoHoloCinePipeline instance.
        output_path: Path of the video file to write (e.g. "out.mp4").
        global_caption: Scene-level description (Mode 1).
        shot_captions: Per-shot descriptions, one per shot (Mode 1).
        prompt: Fully formatted "[global caption] ... [per shot caption] ..."
            string (Mode 2).
        negative_prompt: Text describing artifacts to steer away from.
        num_frames: Total frame count; snapped to the required 4t+1 form.
        shot_cut_frames: Frame indices of shot transitions; each snapped
            to 4t+1 form.
        seed: RNG seed for reproducible generation.
        tiled: Whether to use tiled decoding (lower peak VRAM).
        height: Output height in pixels.
        width: Output width in pixels.
        num_inference_steps: Number of diffusion sampling steps.
        fps: Frames per second of the saved video.
        quality: Quality setting forwarded to save_video.

    Raises:
        ValueError: if neither a valid Mode 1 nor Mode 2 input set is given.
    """
    # --- 1. Collect generation kwargs. None values are filtered out in
    # step 3 so that an absent option never overrides a pipeline default. ---
    pipe_kwargs = {
        "negative_prompt": negative_prompt,
        "seed": seed,
        "tiled": tiled,
        "height": height,
        "width": width,
        "num_inference_steps": num_inference_steps
    }

    # --- 2. Auto-Detection Logic ---
    if global_caption and shot_captions:
        # --- Mode 1: Structured Input ---
        print("--- Detected Structured Input (Mode 1) ---")
        if num_frames is None:
            raise ValueError("Must provide 'num_frames' for structured input (Mode 1).")
        # Delegate prompt assembly and frame/cut normalization to the helper.
        pipe_kwargs.update(prepare_multishot_inputs(
            global_caption=global_caption,
            shot_captions=shot_captions,
            total_frames=num_frames,
            custom_shot_cut_frames=shot_cut_frames
        ))
    elif prompt:
        # --- Mode 2: Raw String Input ---
        print("--- Detected Raw String Input (Mode 2) ---")
        pipe_kwargs["prompt"] = prompt
        # Normalize num_frames ONLY if provided; otherwise leave the key
        # unset so the pipeline default applies.
        if num_frames is not None:
            processed_frames = enforce_4t_plus_1(num_frames)
            if num_frames != processed_frames:
                print(f"Corrected 'num_frames': {num_frames} -> {processed_frames}")
            pipe_kwargs["num_frames"] = processed_frames
        else:
            print("Using default 'num_frames' (if any).")
        # Normalize shot_cut_frames ONLY if provided.
        if shot_cut_frames is not None:
            processed_cuts = [enforce_4t_plus_1(f) for f in shot_cut_frames]
            if shot_cut_frames != processed_cuts:
                print(f"Corrected 'shot_cut_frames': {shot_cut_frames} -> {processed_cuts}")
            pipe_kwargs["shot_cut_frames"] = processed_cuts
        else:
            print("Using default 'shot_cut_frames' (if any).")
    else:
        raise ValueError("Invalid inputs. Provide either (global_caption, shot_captions, num_frames) OR (prompt).")

    # --- 3. Filter out None values before calling pipe ---
    # This ensures we don't pass e.g. 'negative_prompt=None' and override
    # a default value inside the pipeline.
    final_pipe_kwargs = {k: v for k, v in pipe_kwargs.items() if v is not None}
    if "prompt" not in final_pipe_kwargs:
        raise ValueError("A 'prompt' or ('global_caption' + 'shot_captions') is required.")

    # --- 4. Run Generation ---
    print("Running inference...")
    if "num_frames" in final_pipe_kwargs:
        print(f" Total frames: {final_pipe_kwargs['num_frames']}")
    if "shot_cut_frames" in final_pipe_kwargs:
        print(f" Cuts: {final_pipe_kwargs['shot_cut_frames']}")
    video = pipe(**final_pipe_kwargs)
    save_video(video, output_path, fps=fps, quality=quality)
    print(f"Video saved successfully to {output_path}")
# ---------------------------------------------------
#
# Script Execution
#
# ---------------------------------------------------
# --- 1. Load Model (Done once) ---
device = 'cuda'
# Assemble the HoloCine pipeline from its four checkpoints: the UMT5 text
# encoder, the sparse high-/low-noise DiT weights, and the Wan2.1 VAE.
# Each component is offloaded to CPU when idle to reduce peak VRAM usage.
pipe = WanVideoHoloCinePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device=device,
    model_configs=[
        # NOTE(review): the doubled '//' resolves to the same path as a
        # single '/', but is inconsistent with the VAE path below — worth
        # normalizing at some point.
        ModelConfig(path="./checkpoints/Wan2.2-T2V-A14B//Wan2.2-T2V-A14B/models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
        ModelConfig(path="./checkpoints/HoloCine_dit/sparse/sparse_high_noise.safetensors", offload_device="cpu"),
        ModelConfig(path="./checkpoints/HoloCine_dit/sparse/sparse_low_noise.safetensors", offload_device="cpu"),
        ModelConfig(path="./checkpoints/Wan2.2-T2V-A14B/Wan2.2-T2V-A14B/Wan2.1_VAE.pth", offload_device="cpu"),
    ],
)
# Enable sparse self-attention on both DiT stages (matches the "sparse"
# checkpoint variants loaded above).
pipe.dit.use_sparse_self_attn=True
pipe.dit2.use_sparse_self_attn=True
pipe.enable_vram_management()
pipe.to(device)

# --- 2. Define Common Parameters ---
# Shared negative prompt (Chinese). Roughly: over-saturated colors,
# over-exposure, static/blurry detail, subtitles/watermark-like text,
# gray overall tone, worst/low quality, JPEG artifacts, deformed or
# malformed anatomy (extra fingers, bad hands/faces, fused fingers,
# three legs), frozen frames, cluttered backgrounds, crowds, walking
# backwards. Do NOT edit this string for translation — it is consumed
# verbatim by the model.
scene_negative_prompt = "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
# ===================================================================
# ✨ How to Use ✨
# ===================================================================

# --- Example 1: Call using Structured Input (Choice 1) ---
# (Auto-calculates shot cuts; the helper builds the combined prompt
# from the global caption plus the per-shot captions.)
print("\n--- Running Example 1 (Structured Input) ---")
run_inference(
    pipe=pipe,
    negative_prompt=scene_negative_prompt,
    output_path="video1.mp4",
    # Choice 1 inputs
    global_caption="The scene is set in a lavish, 1920s Art Deco ballroom during a masquerade party. [character1] is a mysterious woman with a sleek bob, wearing a sequined silver dress and an ornate feather mask. [character2] is a dapper gentleman in a black tuxedo, his face half-hidden by a simple black domino mask. The environment is filled with champagne fountains, a live jazz band, and dancing couples in extravagant costumes. This scene contains 5 shots.",
    shot_captions=[
        "Medium shot of [character1] standing by a pillar, observing the crowd, a champagne flute in her hand.",
        "Close-up of [character2] watching her from across the room, a look of intrigue on his visible features.",
        "Medium shot as [character2] navigates the crowd and approaches [character1], offering a polite bow.",
        "Close-up on [character1]'s eyes through her mask, as they crinkle in a subtle, amused smile.",
        "A stylish medium two-shot of them standing together, the swirling party out of focus behind them, as they begin to converse."
    ],
    # 241 = 4*60 + 1, already in the required 4t+1 form.
    num_frames=241
)

# --- Example 2: Call using Raw String Input (Choice 2) ---
# (Uses the raw prompt format: "[global caption] ... [per shot caption]
# ... [shot cut] ..." with explicit cut positions.)
print("\n--- Running Example 2 (Raw String Input) ---")
run_inference(
    pipe=pipe,
    negative_prompt=scene_negative_prompt,
    output_path="video2.mp4",
    # Choice 2 inputs
    prompt="[global caption] The scene features a young painter, [character1], with paint-smudged cheeks and intense, focused eyes. Her hair is tied up messily. The setting is a bright, sun-drenched art studio with large windows, canvases, and the smell of oil paint. This scene contains 6 shots. [per shot caption] Medium shot of [character1] standing back from a large canvas, brush in hand, critically observing her work. [shot cut] Close-up of her hand holding the brush, dabbing it thoughtfully onto a palette of vibrant colors. [shot cut] Extreme close-up of her eyes, narrowed in concentration as she studies the canvas. [shot cut] Close-up on the canvas, showing a detailed, textured brushstroke being slowly applied. [shot cut] Medium close-up of [character1]'s face, a small, satisfied smile appears as she finds the right color. [shot cut] Over-the-shoulder shot showing her add a final, delicate highlight to the painting.",
    num_frames=241,
    # 6 shots -> 5 cuts; each index is already in 4t+1 form.
    shot_cut_frames=[37, 73, 113, 169, 205]
)

# # we provide more samples for test, you can uncomment them and have a try.
# run_inference(
#     pipe=pipe,
#     negative_prompt=scene_negative_prompt,
#     output_path="video3.mp4",
#     # Choice 2 inputs
#     prompt="[global caption] The scene is a magical encounter in a misty, ancient Celtic ruin at dawn. [character1] is a modern-day historian, a skeptical woman with practical hiking gear and a camera. [character2] is the spectral figure of an ancient Celtic queen, translucent and ethereal, with long, flowing red hair and a silver circlet. The environment is comprised of mossy standing stones and rolling green hills shrouded in morning mist. This scene contains 5 shots. [per shot caption] Medium shot of [character1] carefully touching a moss-covered standing stone, a look of academic interest on her face. [shot cut] Close-up of her face, her expression changing to one of utter shock as she sees something off-camera. [shot cut] A soft-focus shot of [character2] slowly materializing from the mist between two stones. [shot cut] Medium shot of [character1] stumbling backward, lowering her camera, her skepticism completely shattered. [shot cut] Close-up of [character2]'s spectral face, her expression sad and timeless as she looks at the historian.",
#     num_frames=241,
#     shot_cut_frames=[49, 93, 137, 189]
# )
# run_inference(
#     pipe=pipe,
#     negative_prompt=scene_negative_prompt,
#     output_path="video4.mp4",
#     # Choice 2 inputs
#     prompt="[global caption] The scene is a magical encounter in a misty, ancient Celtic ruin at dawn. [character1] is a modern-day historian, a skeptical woman with practical hiking gear and a camera. [character2] is the spectral figure of an ancient Celtic queen, translucent and ethereal, with long, flowing red hair and a silver circlet. The environment is comprised of mossy standing stones and rolling green hills shrouded in morning mist. This scene contains 5 shots. [per shot caption] Medium shot of [character1] carefully touching a moss-covered standing stone, a look of academic interest on her face. [shot cut] Close-up of her face, her expression changing to one of utter shock as she sees something off-camera. [shot cut] A soft-focus shot of [character2] slowly materializing from the mist between two stones. [shot cut] Medium shot of [character1] stumbling backward, lowering her camera, her skepticism completely shattered. [shot cut] Close-up of [character2]'s spectral face, her expression sad and timeless as she looks at the historian.",
#     num_frames=241,
#     shot_cut_frames=[49, 93, 137, 189],
# )
# run_inference(
#     pipe=pipe,
#     negative_prompt=scene_negative_prompt,
#     output_path="video5.mp4",
#     # Choice 2 inputs
#     prompt="[global caption] The scene is set in an enchanted, bioluminescent forest at twilight. [character1] is an ancient elf with long, silver hair braided with glowing flowers, wearing ethereal white robes. [character2] is a lost human child with short, messy brown hair and wide, fearful eyes, clutching a wooden toy. The environment is filled with giant, glowing mushrooms, sparkling flora, and shafts of moonlight breaking through a thick canopy. This scene contains 5 shots. [per shot caption] Medium shot of [character2] hiding behind a large, glowing mushroom, peering out nervously. [shot cut] Close-up of [character1]'s hand, fingers adorned with delicate rings, gently touching a luminous plant, causing it to glow brighter. [shot cut] Medium shot of [character1] turning their head, their pointed ears catching the faint sound of the child's whimper. [shot cut] Close-up of [character2]'s face, a tear rolling down their cheek, illuminated by the blue light of the forest. [shot cut] A soft-focus shot from the child's perspective, showing [character1] approaching slowly with a kind, reassuring smile, their form haloed by the forest's light.",
#     num_frames=241,
#     shot_cut_frames=[49, 93, 137, 189],
# )
|