wangrongsheng committed (verified)
Commit 10dbb25 · 1 Parent(s): c876723
Files changed (1):
  1. app.py +59 -34
app.py CHANGED
@@ -1,14 +1,11 @@
+ # app.py
+ # ===================== Imports =====================
  from qwen_vl_utils import fetch_image
  from transformers import (
-     Qwen2_5_VLForConditionalGeneration,
-     Qwen2_5_VLProcessor,
-     AutoTokenizer,
-     set_seed,
      AutoModelForCausalLM,
      AutoProcessor,
-     TextIteratorStreamer
+     TextIteratorStreamer,
  )
- 
  import gradio as gr
  import librosa
  import torch
@@ -16,25 +13,33 @@ import numpy as np
  import soundfile as sf
  from threading import Thread
  from copy import deepcopy
+ import os
+ 
+ # NEW: ZeroGPU requirement
+ import spaces
  
  
- # ===================== Load Model =====================
- model_path = 'FreedomIntelligence/ShizhenGPT-7B-Omni'
+ # ===================== Load Model (lazy move to GPU) =====================
+ model_path = "FreedomIntelligence/ShizhenGPT-7B-Omni"
  
+ # Load the weights on the CPU first; move them to the GPU only when inference actually runs (managed by @spaces.GPU)
  model = AutoModelForCausalLM.from_pretrained(
      model_path,
-     torch_dtype=torch.bfloat16,
-     # device_map="cuda:0",
-     trust_remote_code=True
+     torch_dtype=torch.bfloat16,  # switch to torch.float16 if bfloat16 is not supported
+     trust_remote_code=True,
  )
- 
  processor = AutoProcessor.from_pretrained(
      model_path,
      trust_remote_code=True
  )
  
  model.eval()
- processor.chat_template = processor.tokenizer.chat_template
+ # Some checkpoints keep the chat_template on the tokenizer; copy it over for compatibility
+ if hasattr(processor, "tokenizer") and hasattr(processor.tokenizer, "chat_template"):
+     processor.chat_template = processor.tokenizer.chat_template
+ 
+ # Flag: move the model to the GPU only on the first inference call
+ _MODEL_ON_CUDA = False
  
  
  # ===================== Streaming Generation =====================
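A side note on the dtype comment in the hunk above: the bfloat16-vs-float16 choice can also be made automatically at load time. A minimal sketch (not part of this commit; pick_dtype is a hypothetical helper):

import torch

def pick_dtype() -> torch.dtype:
    # bf16 needs an Ampere-or-newer GPU; fall back to fp16 on older GPUs, fp32 on CPU
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    if torch.cuda.is_available():
        return torch.float16
    return torch.float32

# usage sketch: AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=pick_dtype(), trust_remote_code=True)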
@@ -42,49 +47,55 @@ def generate_with_streaming(model, processor, text, images=None, audios=None, history=None):
      # Process images
      processed_images = None
      if images:
-         text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text
-         processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360 * 420})
-                             for img in images if img is not None]
+         text = "".join(["<|vision_start|><|image_pad|><|vision_end|>"] * len(images)) + text
+         processed_images = [
+             fetch_image({"type": "image", "image": img, "max_pixels": 360 * 420})
+             for img in images
+             if img is not None
+         ]
  
      # Process audios
      processed_audios = None
      if audios:
-         text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(audios)) + text
+         text = "".join(["<|audio_bos|><|AUDIO|><|audio_eos|>"] * len(audios)) + text
          processed_audios = [audio for audio in audios if audio is not None]
  
      # Build conversation history
      messages = []
      if history:
          for user_msg, assistant_msg in history:
-             messages.append({'role': 'user', 'content': user_msg})
+             messages.append({"role": "user", "content": user_msg})
              if assistant_msg:
-                 messages.append({'role': 'assistant', 'content': assistant_msg})
+                 messages.append({"role": "assistant", "content": assistant_msg})
  
      # Clean multimodal tokens from previous history
      for m in messages:
-         m['content'] = m['content'].replace('<|audio_bos|><|AUDIO|><|audio_eos|>', '').replace(
-             '<|vision_start|><|image_pad|><|vision_end|>', '')
+         m["content"] = m["content"].replace("<|audio_bos|><|AUDIO|><|audio_eos|>", "").replace(
+             "<|vision_start|><|image_pad|><|vision_end|>", ""
+         )
  
      # Add current user input
-     messages.append({'role': 'user', 'content': text})
+     messages.append({"role": "user", "content": text})
  
      # Prepare model input
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) or [""]
+     templated = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) or ""
      input_data = processor(
-         text=[text],
+         text=[templated],
          audios=processed_audios,
          images=processed_images,
          return_tensors="pt",
-         padding=True
+         padding=True,
      )
  
-     # Move tensors to GPU
+     # Move tensors to the current model device
      for k, v in input_data.items():
          if hasattr(v, "to"):
              input_data[k] = v.to(model.device)
  
      # Start streaming generation
-     streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
+     streamer = TextIteratorStreamer(
+         processor.tokenizer, skip_special_tokens=True, skip_prompt=True
+     )
      generation_kwargs = dict(
          **input_data,
          streamer=streamer,
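This hunk stops just after generation_kwargs is assembled; the code that launches generation and drains the streamer is unchanged and outside the hunk. For context, the standard transformers pattern this relies on looks roughly like the following sketch (not the exact app.py lines; variable names match the function above):

from threading import Thread  # already imported at the top of app.py

# model.generate runs in a background thread while the streamer is drained incrementally
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:   # TextIteratorStreamer yields decoded text fragments as tokens arrive
    yield new_text
thread.join()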
@@ -115,20 +126,30 @@ def process_audio(audio):
          save_path = "./temp.wav"
          sf.write(save_path, y, sr)
  
-         y_resampled = librosa.load(save_path, sr=processor.feature_extractor.sampling_rate)[0]
+         # Some processors have no feature_extractor; fall back to a default sampling rate
+         target_sr = getattr(
+             getattr(processor, "feature_extractor", None), "sampling_rate", 16000
+         )
+         y_resampled, _ = librosa.load(save_path, sr=target_sr, mono=True)
          return y_resampled
      except Exception as e:
          print(f"Error processing audio: {e}")
          return None
  
  
- # ===================== Prediction Function for Gradio =====================
+ # ===================== Prediction Function for Gradio (ZeroGPU) =====================
+ @spaces.GPU(duration=600)  # key: lets ZeroGPU detect the GPU function and extends the per-call GPU budget
  def predict(message, image, audio, chatbox):
-     """Main function for chat: process input, call model, and update chat history."""
-     chat_history = deepcopy(chatbox)
+     global _MODEL_ON_CUDA, model
  
-     processed_audio = [process_audio(audio)] if audio else None
-     processed_image = [image] if image else None
+     # ZeroGPU only allocates a GPU when the decorated function is first called; move the model at that point
+     if not _MODEL_ON_CUDA:
+         model.to("cuda")
+         _MODEL_ON_CUDA = True
+ 
+     chat_history = deepcopy(chatbox)
+     processed_audio = [process_audio(audio)] if audio is not None else None
+     processed_image = [image] if image is not None else None
  
      chatbox.append([message, ""])
      response = ""
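The audio path above resamples whatever Gradio records to the rate the processor expects, with 16 kHz assumed as the fallback. A small, self-contained sketch of that resampling step (the file name and silent input are illustrative only):

import numpy as np
import soundfile as sf
import librosa

sr_in = 44100
sf.write("temp.wav", np.zeros(sr_in, dtype=np.float32), sr_in)  # 1 s of silence at 44.1 kHz

y, sr = librosa.load("temp.wav", sr=16000, mono=True)           # librosa resamples while loading
print(sr, y.shape)                                              # 16000 (16000,)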
@@ -137,6 +158,7 @@ def predict(message, image, audio, chatbox):
      for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
          response += chunk
          chatbox[-1][1] = response
+         # Yield as a generator so the Gradio Chatbot refreshes in real time
          yield chatbox
  
      print("\n=== Complete Model Response ===")
@@ -211,4 +233,7 @@ with gr.Blocks(css=css) as demo:
  
  # ===================== Run App =====================
  if __name__ == "__main__":
-     demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
+     # Under ZeroGPU, limit concurrency so the app does not request the GPU repeatedly
+     demo.queue(concurrency_count=1, max_size=8).launch(
+         server_name="0.0.0.0", server_port=7860, share=True
+     )
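One caveat on the launch hunk: concurrency_count is the Gradio 3.x name for this setting and was removed from queue() in Gradio 4.x. If the Space runs Gradio 4.x, the equivalent call would look roughly like this sketch (assuming current Gradio parameter names; demo is the gr.Blocks defined above):

# Gradio 4.x: default_concurrency_limit replaces concurrency_count
demo.queue(default_concurrency_limit=1, max_size=8).launch(
    server_name="0.0.0.0", server_port=7860
)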
 
 
 
 