weiyuyeh committed on
Commit
60293a7
·
1 Parent(s): a1f0652

new app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -74
app.py CHANGED
@@ -1,13 +1,14 @@
 
 
1
  import subprocess
2
 
3
- subprocess.run(
4
- "pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True
5
- )
6
 
7
  import gradio as gr
8
  import spaces
9
  import torch
10
- from diffusers import Cosmos2TextToImagePipeline, EDMEulerScheduler
 
11
  from transformers import AutoModelForCausalLM, SiglipProcessor
12
  import random
13
 
@@ -26,133 +27,105 @@ try:
26
  except Exception as e:
27
  print(f"Authentication failed: {e}")
28
 
29
- #Add flash_attention_2 to the safeguard model
 
30
  def patch_from_pretrained(cls):
31
  orig_method = cls.from_pretrained
32
 
33
  def new_from_pretrained(*args, **kwargs):
34
  kwargs.setdefault("attn_implementation", "flash_attention_2")
35
- kwargs.setdefault("torch_dtype", torch.bfloat16)
36
  return orig_method(*args, **kwargs)
37
 
38
  cls.from_pretrained = new_from_pretrained
39
 
 
40
  patch_from_pretrained(AutoModelForCausalLM)
41
 
42
- #Add a `use_fast` to the safeguard image processor
 
43
  def patch_processor_fast(cls):
44
  orig_method = cls.from_pretrained
 
45
  def new_from_pretrained(*args, **kwargs):
46
  kwargs.setdefault("use_fast", True)
47
  return orig_method(*args, **kwargs)
 
48
  cls.from_pretrained = new_from_pretrained
49
 
50
- patch_processor_fast(SiglipProcessor)
51
 
52
- model_14b_id = "nvidia/Cosmos-Predict2-14B-Text2Image"
53
 
54
- pipe_14b = Cosmos2TextToImagePipeline.from_pretrained(
55
- model_14b_id,
56
- torch_dtype=torch.bfloat16,
57
- use_auth_token=True
58
- )
59
  pipe_14b.to("cuda")
60
 
 
61
  @spaces.GPU(duration=140)
62
- def generate_image(
 
63
  prompt,
64
  negative_prompt="The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.",
65
  seed=42,
66
  randomize_seed=False,
67
  model_choice="14B",
68
- progress=gr.Progress(track_tqdm=True)
69
- ):
70
 
71
  if randomize_seed:
72
  actual_seed = random.randint(0, 1000000)
73
  else:
74
  actual_seed = seed
75
-
76
  generator = torch.Generator().manual_seed(actual_seed)
77
-
78
- output = pipe_14b(
79
- prompt=prompt,
80
- negative_prompt=negative_prompt,
81
- generator=generator
82
- ).images[0]
83
-
84
- return output, actual_seed
85
-
86
- example_prompts = [
87
- "A well-worn broom sweeps across a dusty wooden floor, its bristles gathering crumbs and flecks of debris in swift, rhythmic strokes. Dust motes dance in the sunbeams filtering through the window, glowing momentarily before settling. The quiet swish of straw brushing wood is interrupted only by the occasional creak of old floorboards. With each pass, the floor grows cleaner, restoring a sense of quiet order to the humble room.",
88
- "A laundry machine whirs to life, tumbling colorful clothes behind the foggy glass door. Suds begin to form in a frothy dance, clinging to fabric as the drum spins. The gentle thud of shifting clothes creates a steady rhythm, like a heartbeat of the home. Outside the machine, a quiet calm fills the room, anticipation building for the softness and warmth of freshly laundered garments.",
89
- "A robotic arm tightens a bolt beneath the hood of a car, its tool head rotating with practiced torque. The metal-on-metal sound clicks into place, and the arm pauses briefly before retracting with a soft hydraulic hiss. Overhead lights reflect off the glossy vehicle surface, while scattered tools and screens blink in the background—a garage scene reimagined through the lens of precision engineering.",
90
- "A nighttime city bus terminal gradually shifts from stillness to subtle movement. At first, multiple double-decker buses are parked under the glow of overhead lights, with a central bus labeled '87D' facing forward and stationary. As the video progresses, the bus in the middle moves ahead slowly, its headlights brightening the surrounding area and casting reflections onto adjacent vehicles. The motion creates space in the lineup, signaling activity within the otherwise quiet station. It then comes to a smooth stop, resuming its position in line. Overhead signage in Chinese characters remains illuminated, enhancing the vibrant, urban night scene.",
91
- "As the red light shifts to green, the red bus at the intersection begins to move forward, its headlights cutting through the falling snow. The snowy tire tracks deepen as the vehicle inches ahead, casting fresh lines onto the slushy road. Around it, streetlights glow warmer, illuminating the drifting flakes and wet reflections on the asphalt. Other cars behind start to edge forward, their beams joining the scene. The stillness of the urban street transitions into motion as the quiet snowfall is punctuated by the slow advance of traffic through the frosty city corridor.",
92
- "In the later moments of the video, the female worker in the front, dressed in a white coat and hairnet, performs a repetitive yet precise task. She scoops golden granular material from a wide jar and steadily pours it into the next empty glass bottle on the conveyor belt. Her hand moves with practiced control as she aligns the scoop over each container, ensuring an even fill. The sequence highlights her focused attention and consistent motion, capturing the shift from preparation to active material handling as the production line advances bottle by bottle.",
93
- "A wide-angle shot captures a sunny suburban street intersection, where the bright sunlight casts sharp shadows across the road. The scene is framed by a row of houses with beige and brown roofs, and lush green lawns. Autumn-colored trees add vibrant red and orange hues to the landscape. Overhead power lines stretch across the sky, and a fire hydrant is visible on the right side of the frame near the curb. A silver sedan is parked on the driveway of a house on the left, while a silver SUV is parked on the street in front of the house at the center of the camera view. The ego vehicle waits to turn right at the t-intersection, yielding to two other vehicles traveling in opposite directions. A black car enters the frame from the right, driving across the intersection and continuing straight ahead. The car's movement is smooth and steady, and it exits the frame to the left. The final frame shows the intersection with a vehicle moving from the left to the right side, the silver sedan and the SUV still parked in their initial positions, and the black car having moved out of view."
94
- ]
95
 
96
  # Define the Gradio Blocks interface
97
  with gr.Blocks() as demo:
98
  gr.Markdown(
99
  """
100
- # Cosmos-Predict2 14B Text2Image
101
- [[Model]](https://huggingface.co/nvidia/Cosmos-Predict2-14B-Text2Image), [[Code]](https://github.com/nvidia-cosmos/cosmos-predict2)
102
  """
103
  )
104
  with gr.Row():
105
  with gr.Column():
 
 
106
  prompt_input = gr.Textbox(
107
  label="Prompt",
108
  lines=5,
109
  value="A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess.",
110
- placeholder="Enter your descriptive prompt here..."
111
  )
112
-
113
  negative_prompt_input = gr.Textbox(
114
  label="Negative Prompt",
115
  lines=3,
116
  value="The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.",
117
- placeholder="Enter what you DON'T want to see in the image..."
118
  )
119
-
120
  with gr.Row():
121
- randomize_seed_checkbox = gr.Checkbox(
122
- label="Randomize Seed",
123
- value=True
124
- )
125
- seed_input = gr.Slider(
126
- minimum=0,
127
- maximum=1000000,
128
- value=1,
129
- step=1,
130
- label="Seed"
131
- )
132
-
133
- model_radio = gr.Radio(
134
- choices=["14B", "2B"],
135
- value="14B",
136
- label="Model Selection",
137
- visible=False
138
- )
139
  generate_button = gr.Button("Generate Image")
140
-
141
  with gr.Column():
142
- output_image = gr.Image(label="Generated Image", type="pil")
143
-
144
- gr.Examples(
145
- examples=example_prompts,
146
- inputs=[prompt_input],
147
- outputs=[output_image, seed_input],
148
- fn=generate_image,
149
- cache_examples="lazy"
150
- )
151
  generate_button.click(
152
- fn=generate_image,
153
- inputs=[prompt_input, negative_prompt_input, seed_input, randomize_seed_checkbox, model_radio],
154
- outputs=[output_image, seed_input]
155
  )
156
 
157
  if __name__ == "__main__":
158
- demo.launch()
 
1
+ # flake8: noqa
2
+
3
  import subprocess
4
 
5
+ subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
 
 
6
 
7
  import gradio as gr
8
  import spaces
9
  import torch
10
+ from diffusers import Cosmos2VideoToWorldPipeline
11
+ from diffusers.utils import export_to_video
12
  from transformers import AutoModelForCausalLM, SiglipProcessor
13
  import random
14
 
 
27
  except Exception as e:
28
  print(f"Authentication failed: {e}")
29
 
30
+
31
+ # Add flash_attention_2 to the safeguard model
32
  def patch_from_pretrained(cls):
33
  orig_method = cls.from_pretrained
34
 
35
  def new_from_pretrained(*args, **kwargs):
36
  kwargs.setdefault("attn_implementation", "flash_attention_2")
37
+ kwargs.setdefault("torch_dtype", torch.bfloat16)
38
  return orig_method(*args, **kwargs)
39
 
40
  cls.from_pretrained = new_from_pretrained
41
 
42
+
43
  patch_from_pretrained(AutoModelForCausalLM)
44
 
45
+
46
+ # Add a `use_fast` to the safeguard image processor
47
  def patch_processor_fast(cls):
48
  orig_method = cls.from_pretrained
49
+
50
  def new_from_pretrained(*args, **kwargs):
51
  kwargs.setdefault("use_fast", True)
52
  return orig_method(*args, **kwargs)
53
+
54
  cls.from_pretrained = new_from_pretrained
55
 
 
56
 
57
+ patch_processor_fast(SiglipProcessor)
58
 
59
+ model_14b_id = "nvidia/Cosmos-Predict2-14B-Video2World"
60
+ pipe_14b = Cosmos2VideoToWorldPipeline.from_pretrained(model_14b_id, torch_dtype=torch.bfloat16, use_auth_token=True)
 
 
 
61
  pipe_14b.to("cuda")
62
 
63
+
64
  @spaces.GPU(duration=140)
65
+ def generate_video(
66
+ image,
67
  prompt,
68
  negative_prompt="The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.",
69
  seed=42,
70
  randomize_seed=False,
71
  model_choice="14B",
72
+ progress=gr.Progress(track_tqdm=True),
73
+ ):
74
 
75
  if randomize_seed:
76
  actual_seed = random.randint(0, 1000000)
77
  else:
78
  actual_seed = seed
79
+
80
  generator = torch.Generator().manual_seed(actual_seed)
81
+
82
+ video = pipe_14b(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator).frames[0]
83
+
84
+ output = export_to_video(video, "output.mp4", fps=16)
85
+
86
+ return output, output, actual_seed
87
+
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Define the Gradio Blocks interface
90
  with gr.Blocks() as demo:
91
  gr.Markdown(
92
  """
93
+ # Cosmos-Predict2 14B Video2World
 
94
  """
95
  )
96
  with gr.Row():
97
  with gr.Column():
98
+ image_input = gr.Image(label="Input Image", type="pil")
99
+
100
  prompt_input = gr.Textbox(
101
  label="Prompt",
102
  lines=5,
103
  value="A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess.",
104
+ placeholder="Enter your descriptive prompt here...",
105
  )
106
+
107
  negative_prompt_input = gr.Textbox(
108
  label="Negative Prompt",
109
  lines=3,
110
  value="The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.",
111
+ placeholder="Enter what you DON'T want to see in the image...",
112
  )
113
+
114
  with gr.Row():
115
+ randomize_seed_checkbox = gr.Checkbox(label="Randomize Seed", value=True)
116
+ seed_input = gr.Slider(minimum=0, maximum=1000000, value=1, step=1, label="Seed")
117
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  generate_button = gr.Button("Generate Image")
119
+
120
  with gr.Column():
121
+ output_video = gr.Video(label="Generated Video", format="mp4")
122
+ output_file = gr.File(label="Download Video")
123
+
 
 
 
 
 
 
124
  generate_button.click(
125
+ fn=generate_video,
126
+ inputs=[image_input, prompt_input, negative_prompt_input, seed_input, randomize_seed_checkbox],
127
+ outputs=[output_video, output_file, seed_input],
128
  )
129
 
130
  if __name__ == "__main__":
131
+ demo.launch()