update
app.py
CHANGED
@@ -1,14 +1,11 @@
+# app.py
+# ===================== Imports =====================
 from qwen_vl_utils import fetch_image
 from transformers import (
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2_5_VLProcessor,
-    AutoTokenizer,
-    set_seed,
     AutoModelForCausalLM,
     AutoProcessor,
-    TextIteratorStreamer
+    TextIteratorStreamer,
 )
-
 import gradio as gr
 import librosa
 import torch
@@ -16,25 +13,33 @@ import numpy as np
 import soundfile as sf
 from threading import Thread
 from copy import deepcopy
+import os
+
+# NEW: ZeroGPU requirement
+import spaces
 
 
-# ===================== Load Model =====================
-model_path = …
+# ===================== Load Model (lazy move to GPU) =====================
+model_path = "FreedomIntelligence/ShizhenGPT-7B-Omni"
 
+# Load the weights on CPU first; move them to the GPU only when inference actually runs (managed by @spaces.GPU)
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
-    torch_dtype=torch.bfloat16,
-    …
-    trust_remote_code=True
+    torch_dtype=torch.bfloat16,  # switch to torch.float16 if bfloat16 is not supported
+    trust_remote_code=True,
 )
-
 processor = AutoProcessor.from_pretrained(
     model_path,
     trust_remote_code=True
 )
 
 model.eval()
-…
+# Some checkpoints keep the chat_template on the tokenizer; copy it over for compatibility
+if hasattr(processor, "tokenizer") and hasattr(processor.tokenizer, "chat_template"):
+    processor.chat_template = processor.tokenizer.chat_template
+
+# Flag: move the model to the GPU only on the first inference call
+_MODEL_ON_CUDA = False
 
 
 # ===================== Streaming Generation =====================
@@ -42,49 +47,55 @@ def generate_with_streaming(model, processor, text, images=None, audios=None, hi
     # Process images
     processed_images = None
     if images:
-        text = …
-        processed_images = [
-            …
+        text = "".join(["<|vision_start|><|image_pad|><|vision_end|>"] * len(images)) + text
+        processed_images = [
+            fetch_image({"type": "image", "image": img, "max_pixels": 360 * 420})
+            for img in images
+            if img is not None
+        ]
 
     # Process audios
     processed_audios = None
     if audios:
-        text = …
+        text = "".join(["<|audio_bos|><|AUDIO|><|audio_eos|>"] * len(audios)) + text
         processed_audios = [audio for audio in audios if audio is not None]
 
     # Build conversation history
     messages = []
     if history:
         for user_msg, assistant_msg in history:
-            messages.append({…
+            messages.append({"role": "user", "content": user_msg})
             if assistant_msg:
-                messages.append({…
+                messages.append({"role": "assistant", "content": assistant_msg})
 
     # Clean multimodal tokens from previous history
     for m in messages:
-        m[…
-        …
+        m["content"] = m["content"].replace("<|audio_bos|><|AUDIO|><|audio_eos|>", "").replace(
+            "<|vision_start|><|image_pad|><|vision_end|>", ""
+        )
 
     # Add current user input
-    messages.append({…
+    messages.append({"role": "user", "content": text})
 
     # Prepare model input
-    …
+    templated = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) or ""
     input_data = processor(
-        text=[…
+        text=[templated],
         audios=processed_audios,
         images=processed_images,
         return_tensors="pt",
-        padding=True
+        padding=True,
     )
 
-    # Move tensors to …
+    # Move tensors to the current model device
     for k, v in input_data.items():
         if hasattr(v, "to"):
             input_data[k] = v.to(model.device)
 
     # Start streaming generation
-    streamer = TextIteratorStreamer(…
+    streamer = TextIteratorStreamer(
+        processor.tokenizer, skip_special_tokens=True, skip_prompt=True
+    )
     generation_kwargs = dict(
         **input_data,
         streamer=streamer,
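The hunk above ends inside the `generation_kwargs` call, so the rest of the streaming path is not visible in this diff. A minimal sketch of the usual `TextIteratorStreamer` pattern such a function typically follows (the helper name and the `max_new_tokens` value are assumptions, not taken from the commit):

```python
from threading import Thread


def stream_generate(model, input_data, streamer):
    """Run model.generate() in a background thread and yield decoded text chunks."""
    generation_kwargs = dict(
        **input_data,
        streamer=streamer,
        max_new_tokens=1024,  # assumed value, not from the commit
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_text in streamer:
        # Each iteration yields newly decoded text as soon as tokens arrive
        yield new_text
    thread.join()
```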
@@ -115,20 +126,30 @@ def process_audio(audio):
         save_path = "./temp.wav"
         sf.write(save_path, y, sr)
 
-        …
+        # Some processors have no feature_extractor; fall back to 16 kHz
+        target_sr = getattr(
+            getattr(processor, "feature_extractor", None), "sampling_rate", 16000
+        )
+        y_resampled, _ = librosa.load(save_path, sr=target_sr, mono=True)
         return y_resampled
     except Exception as e:
         print(f"Error processing audio: {e}")
         return None
 
 
-# ===================== Prediction Function for Gradio =====================
+# ===================== Prediction Function for Gradio (ZeroGPU) =====================
+@spaces.GPU(duration=600)  # Key for ZeroGPU: expose a GPU-decorated function and extend the per-call GPU budget
 def predict(message, image, audio, chatbox):
-    …
-    chat_history = deepcopy(chatbox)
+    global _MODEL_ON_CUDA, model
 
-    …
-    …
+    # ZeroGPU only allocates a GPU when the decorated function is first called; move the model then
+    if not _MODEL_ON_CUDA:
+        model.to("cuda")
+        _MODEL_ON_CUDA = True
+
+    chat_history = deepcopy(chatbox)
+    processed_audio = [process_audio(audio)] if audio is not None else None
+    processed_image = [image] if image is not None else None
 
     chatbox.append([message, ""])
     response = ""
@@ -137,6 +158,7 @@ def predict(message, image, audio, chatbox):
     for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
         response += chunk
         chatbox[-1][1] = response
+        # Returned as a generator, so the Gradio Chatbot refreshes in real time
        yield chatbox
 
    print("\n=== Complete Model Response ===")
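The Gradio UI section itself is unchanged and therefore not shown in this diff. A hypothetical wiring sketch for how a generator like `predict` is typically connected to a `gr.Chatbot` (component names and types below are assumptions, not taken from the commit):

```python
with gr.Blocks() as demo_sketch:  # sketch only; the Space's actual layout lives in the unchanged section
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask something...")
    image = gr.Image(type="pil")
    audio = gr.Audio()
    send = gr.Button("Send")

    # Because predict() yields, Gradio re-renders the Chatbot on every yield,
    # which produces the streaming effect in the browser.
    send.click(predict, inputs=[msg, image, audio, chatbot], outputs=chatbot)
```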
@@ -211,4 +233,7 @@ with gr.Blocks(css=css) as demo:
 
 # ===================== Run App =====================
 if __name__ == "__main__":
-    …
+    # Under ZeroGPU, limit concurrency to avoid repeatedly requesting the GPU
+    demo.queue(concurrency_count=1, max_size=8).launch(
+        server_name="0.0.0.0", server_port=7860, share=True
+    )
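One caveat on the final hunk: `concurrency_count` is a Gradio 3.x argument, while ZeroGPU Spaces run on Gradio 4.x, where the same intent would be spelled roughly as follows (a sketch, not part of the commit):

```python
# Gradio 4.x spelling of the queue settings above: one worker at a time, small waiting queue.
# share=True is ignored on Hugging Face Spaces, so it can be dropped.
demo.queue(default_concurrency_limit=1, max_size=8).launch(
    server_name="0.0.0.0", server_port=7860
)
```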