update
app.py
CHANGED
@@ -1,14 +1,11 @@
+# app.py
+# ===================== Imports =====================
 from qwen_vl_utils import fetch_image
 from transformers import (
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2_5_VLProcessor,
-    AutoTokenizer,
-    set_seed,
     AutoModelForCausalLM,
     AutoProcessor,
-    TextIteratorStreamer
+    TextIteratorStreamer,
 )
-
 import gradio as gr
 import librosa
 import torch
@@ -16,25 +13,33 @@ import numpy as np
 import soundfile as sf
 from threading import Thread
 from copy import deepcopy
+import os
+
+# NEW: ZeroGPU requirement
+import spaces
 
 
-# ===================== Load Model =====================
-model_path = …
+# ===================== Load Model (lazy move to GPU) =====================
+model_path = "FreedomIntelligence/ShizhenGPT-7B-Omni"
 
+# Load the weights on CPU first; move them to the GPU only when inference actually runs (managed by @spaces.GPU)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    torch_dtype=torch.bfloat16,
-    …
-    trust_remote_code=True
+    torch_dtype=torch.bfloat16,  # switch to torch.float16 if bfloat16 is not supported
+    trust_remote_code=True,
 )
-
 processor = AutoProcessor.from_pretrained(
     model_path,
     trust_remote_code=True
 )
 
 model.eval()
-…
+# Some checkpoints keep the chat_template on the tokenizer; copy it over for compatibility
+if hasattr(processor, "tokenizer") and hasattr(processor.tokenizer, "chat_template"):
+    processor.chat_template = processor.tokenizer.chat_template
+
+# Flag: move the model to the GPU only on the first inference call
+_MODEL_ON_CUDA = False
 
 
 # ===================== Streaming Generation =====================
@@ -42,49 +47,55 @@ def generate_with_streaming(model, processor, text, images=None, audios=None, hi
     # Process images
     processed_images = None
     if images:
-        text = …
-        processed_images = [
-            …
+        text = "".join(["<|vision_start|><|image_pad|><|vision_end|>"] * len(images)) + text
+        processed_images = [
+            fetch_image({"type": "image", "image": img, "max_pixels": 360 * 420})
+            for img in images
+            if img is not None
+        ]
 
     # Process audios
     processed_audios = None
     if audios:
-        text = …
+        text = "".join(["<|audio_bos|><|AUDIO|><|audio_eos|>"] * len(audios)) + text
         processed_audios = [audio for audio in audios if audio is not None]
 
     # Build conversation history
     messages = []
     if history:
         for user_msg, assistant_msg in history:
-            messages.append({…
+            messages.append({"role": "user", "content": user_msg})
             if assistant_msg:
-                messages.append({…
+                messages.append({"role": "assistant", "content": assistant_msg})
 
     # Clean multimodal tokens from previous history
     for m in messages:
-        m[…
-        …
+        m["content"] = m["content"].replace("<|audio_bos|><|AUDIO|><|audio_eos|>", "").replace(
+            "<|vision_start|><|image_pad|><|vision_end|>", ""
+        )
 
     # Add current user input
-    messages.append({…
+    messages.append({"role": "user", "content": text})
 
     # Prepare model input
-    …
+    templated = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) or ""
     input_data = processor(
-        text=[…
+        text=[templated],
         audios=processed_audios,
         images=processed_images,
         return_tensors="pt",
-        padding=True
+        padding=True,
     )
 
-    # Move tensors to …
+    # Move tensors to the current model device
     for k, v in input_data.items():
         if hasattr(v, "to"):
             input_data[k] = v.to(model.device)
 
     # Start streaming generation
-    streamer = TextIteratorStreamer(…
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, skip_special_tokens=True, skip_prompt=True
+    )
     generation_kwargs = dict(
         **input_data,
         streamer=streamer,
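The hunk above ends inside the `generation_kwargs` call, so the rest of the streaming path is not visible in this diff. A minimal sketch of the usual `TextIteratorStreamer` pattern such a function typically follows (the helper name and the `max_new_tokens` value are assumptions, not taken from the commit):

```python
from threading import Thread


def stream_generate(model, input_data, streamer):
    """Run model.generate() in a background thread and yield decoded text chunks."""
    generation_kwargs = dict(
        **input_data,
        streamer=streamer,
        max_new_tokens=1024,  # assumed value, not from the commit
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        # Each iteration yields newly decoded text as soon as tokens arrive
        yield new_text
    thread.join()
```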
@@ -115,20 +126,30 @@ def process_audio(audio):
         save_path = "./temp.wav"
         sf.write(save_path, y, sr)
 
-        …
+        # Some processors have no feature_extractor; fall back to 16 kHz
+        target_sr = getattr(
+            getattr(processor, "feature_extractor", None), "sampling_rate", 16000
+        )
+        y_resampled, _ = librosa.load(save_path, sr=target_sr, mono=True)
         return y_resampled
     except Exception as e:
         print(f"Error processing audio: {e}")
         return None
 
 
-# ===================== Prediction Function for Gradio =====================
+# ===================== Prediction Function for Gradio (ZeroGPU) =====================
+@spaces.GPU(duration=600)  # Key for ZeroGPU: expose a GPU-decorated function and extend the per-call GPU budget
 def predict(message, image, audio, chatbox):
-    …
-    chat_history = deepcopy(chatbox)
+    global _MODEL_ON_CUDA, model
 
-    …
-    …
+    # ZeroGPU only allocates a GPU when the decorated function is first called; move the model then
+    if not _MODEL_ON_CUDA:
+        model.to("cuda")
+        _MODEL_ON_CUDA = True
+
+    chat_history = deepcopy(chatbox)
+    processed_audio = [process_audio(audio)] if audio is not None else None
+    processed_image = [image] if image is not None else None
 
     chatbox.append([message, ""])
     response = ""
@@ -137,6 +158,7 @@ def predict(message, image, audio, chatbox):
     for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
         response += chunk
         chatbox[-1][1] = response
+        # Returned as a generator, so the Gradio Chatbot refreshes in real time
        yield chatbox
 
    print("\n=== Complete Model Response ===")
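The Gradio UI section itself is unchanged and therefore not shown in this diff. A hypothetical wiring sketch for how a generator like `predict` is typically connected to a `gr.Chatbot` (component names and types below are assumptions, not taken from the commit):

```python
with gr.Blocks() as demo_sketch:  # sketch only; the Space's actual layout lives in the unchanged section
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask something...")
    image = gr.Image(type="pil")
    audio = gr.Audio()
    send = gr.Button("Send")

    # Because predict() yields, Gradio re-renders the Chatbot on every yield,
    # which produces the streaming effect in the browser.
    send.click(predict, inputs=[msg, image, audio, chatbot], outputs=chatbot)
```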
@@ -211,4 +233,7 @@ with gr.Blocks(css=css) as demo:
 
 # ===================== Run App =====================
 if __name__ == "__main__":
-    …
+    # Under ZeroGPU, limit concurrency to avoid repeatedly requesting the GPU
+    demo.queue(concurrency_count=1, max_size=8).launch(
+        server_name="0.0.0.0", server_port=7860, share=True
+    )
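One caveat on the final hunk: `concurrency_count` is a Gradio 3.x argument, while ZeroGPU Spaces run on Gradio 4.x, where the same intent would be spelled roughly as follows (a sketch, not part of the commit):

```python
# Gradio 4.x spelling of the queue settings above: one worker at a time, small waiting queue.
# share=True is ignored on Hugging Face Spaces, so it can be dropped.
demo.queue(default_concurrency_limit=1, max_size=8).launch(
    server_name="0.0.0.0", server_port=7860
)
```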