Refactor: Implement streaming response and simplify architecture

- Replace LangGraph with a direct Gradio implementation for simplicity and performance.
- Implement streaming responses using `TextIteratorStreamer` for a better user experience (the core pattern is sketched below).
- Use `tokenizer.apply_chat_template` for robust prompt formatting.
- Remove the obsolete `graph.py`.
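
The heart of the refactor is the generator-based streaming pattern adopted in `comp.py`: `model.generate` runs in a background thread while a `TextIteratorStreamer` yields decoded text incrementally. A minimal sketch of that pattern, condensed from the `comp.py` diff below (the helper name `stream_generate` and the `max_new_tokens` value here are illustrative, not part of the commit):

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, prompt):
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    # skip_prompt drops the echoed input; skip_special_tokens cleans the output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # model.generate blocks until completion, so it runs in a background thread
    # while this thread consumes decoded chunks as they become available
    thread = Thread(target=model.generate,
                    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512))
    thread.start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio expects the cumulative text on each yield
    thread.join()
```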
GEMINI.md
CHANGED

```diff
@@ -19,21 +19,20 @@
 
 # Sub-goals
 ## Unfinished
-- [ ] **(In progress)**
-- [ ] (Paused) Implement an automated deployment and verification workflow.
+- [ ] **(In progress)** Implement an automated deployment and verification workflow.
 
 ## Completed
+- [x] Fix the deployment failure caused by the oversized model.
 - [x] Use LangGraph to build a chat web app that routes between two models.
 
 ---
 
 # Todolist
 ## Unfinished
-- [ ] **Current task**: Edit `app.py` to remove the `Ling-flash-2.0` model and keep only `Ring-mini-2.0`.
-- [ ] (Pending) Update the model path in `app.py` based on the quantized model the user finds.
 - [ ] (Paused) Search the `huggingface_hub` docs to confirm whether an API exists for restarting a Space.
 
 ## Completed
+- [x] Edit `app.py` to remove the `Ling-flash-2.0` model and keep only `Ring-mini-2.0`.
 - [x] **(User decision)** Confirm that `Ling-flash-2.0` is too large; remove it for now and use only `Ring-mini-2.0`.
 - [x] Set up the LangGraph scaffolding and refactor `app.py`.
 - [x] Implement model-routing logic based on user input.
@@ -65,4 +64,4 @@
 - **Platform:** HuggingFace Spaces
 - **Subscription:** HuggingFace Pro
 - **Inference resources:** ZeroGPU is available
-- **Documentation:** When necessary, proactively search the online API docs for HuggingFace and Gradio.
+- **Documentation:** When necessary, proactively search the online API docs for HuggingFace and Gradio.
```
app.py
CHANGED

```diff
@@ -1,51 +1,28 @@
 import gradio as gr
-import
-from langchain_core.messages import AIMessage, SystemMessage, HumanMessage
+from comp import generate_response
 
-#
-from graph import app
+# --- Gradio UI ---
 
-
-
-
-
-def respond(message, history, system_message):
-    messages = []
-    if system_message:
-        messages.append(SystemMessage(content=system_message))
-
-    for chat_message in history:
-        if chat_message['role'] == "user":
-            messages.append(HumanMessage(content=chat_message['content']))
-        elif chat_message['role'] == "assistant":
-            messages.append(AIMessage(content=chat_message['content']))
-
-    messages.append(HumanMessage(content=message))
-
-    # Make a single one-shot call with the invoke method
-    inputs = {"messages": messages}
-    final_state = app.invoke(inputs)
-
-    # Extract the last message from the final state
-    final_response = final_state["messages"][-1].content
-
-    return final_response
+with gr.Blocks() as demo:
+    gr.Markdown("# Ling Playground")
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox()
+    clear = gr.ClearButton([msg, chatbot])
 
-
-demo = gr.ChatInterface(
-    respond,
-    type="messages",  # use the messages type to better match LangChain
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-    ],
-)
+    def user(user_message, history):
+        return "", history + [[user_message, None]]
 
-
-
-
-
-
+    def bot(history):
+        user_message = history[-1][0]
+        history[-1][1] = ""
+        for response in generate_response(user_message, history[:-1]):
+            history[-1][1] = response
+            yield history
 
+    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, chatbot, chatbot
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
     demo.launch()
```
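Because `bot` is a generator, each `yield history` pushes an updated transcript to the `Chatbot` component, which is what produces the token-by-token typing effect; `queue=False` on the `user` step lets the textbox clear immediately. A rough headless walk-through of the same handler chain (a sketch for illustration only; in practice Gradio drives these callbacks itself):

```python
# Simulate one turn the way the msg.submit(...).then(...) chain does:
history = []
_, history = user("Hello!", history)  # appends [message, None], clears the textbox value
for snapshot in bot(history):         # each yield is a fresh transcript for the Chatbot
    pass
print(snapshot[-1][1])                # the fully streamed assistant reply
```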
comp.py
CHANGED

```diff
@@ -1,12 +1,7 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from
-
-import operator
-
-# Define the subset of the graph state this component operates on
-class GraphState(TypedDict):
-    messages: Annotated[List[AIMessage], operator.add]
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+import spaces
 
 # --- Model loading ---
 # Load the model and tokenizer in "auto" mode; Hugging Face Accelerate handles device placement and precision automatically
@@ -20,44 +15,49 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 )
 
-
-def completion_node(state):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-        attention_mask=inputs.attention_mask.to(model.device),
-        max_new_tokens=512,  # hardcoded for now
+@spaces.GPU(duration=120)
+def generate_response(message, history):
+    # Convert history to messages format
+    messages = [
+        {"role": "system", "content": "You are Ring, an assistant created by inclusionAI"}
+    ]
+
+    # Add conversation history
+    for human, assistant in history:
+        messages.append({"role": "user", "content": human})
+        messages.append({"role": "assistant", "content": assistant})
+
+    # Add current message
+    messages.append({"role": "user", "content": message})
+
+    # Apply chat template
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    # Tokenize input
+    model_inputs = tokenizer([text], return_tensors="pt", return_token_type_ids=False).to(model.device)
+
+    # Generate response with streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
+
+    generation_kwargs = dict(
+        **model_inputs,
+        max_new_tokens=8192,
+        temperature=0.7,
         do_sample=True,
-
+        streamer=streamer,
     )
-    output = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
 
-
-
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the response
+    response = ""
+    for new_text in streamer:
+        response += new_text
+        yield response
+
+    thread.join()
```
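Since `generate_response` yields the cumulative reply, it can be exercised from a plain Python shell without the UI. A minimal smoke test, assuming the module loads the model successfully (the `@spaces.GPU` decorator is documented to be a no-op outside a ZeroGPU Space):

```python
from comp import generate_response  # loads the model at import time

# history is a list of (user, assistant) pairs; empty for the first turn
for partial in generate_response("Briefly introduce yourself.", history=[]):
    print(partial[-40:], end="\r")  # watch the tail of the reply grow
print()
```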
graph.py
DELETED

```diff
@@ -1,42 +0,0 @@
-import operator
-from typing import Annotated, List
-from typing_extensions import TypedDict
-
-from langchain_core.messages import AnyMessage
-from langgraph.graph import StateGraph, END
-
-# Import the model completion node from our component file
-from comp import completion_node
-
-# --- Graph state definition ---
-# The state is the memory or context of our graph. It is a dictionary
-# that holds all messages exchanged over the course of the conversation.
-class GraphState(TypedDict):
-    """
-    Represents the state of our graph.
-
-    Attributes:
-        messages: A list of messages that accumulates automatically over time.
-            The `operator.add` annotation tells LangGraph to append new messages to this list
-            instead of overwriting it. This is how the graph maintains conversation history (context).
-    """
-    messages: Annotated[List[AnyMessage], operator.add]
-
-
-# --- Graph workflow construction ---
-# Create a new state graph using the state we defined
-workflow = StateGraph(GraphState)
-
-# Add the completion node to the graph and name it "llm".
-# This node is responsible for calling the language model.
-workflow.add_node("llm", completion_node)
-
-# Set the graph's entry point. The first node to be called is "llm".
-workflow.set_entry_point("llm")
-
-# Add a simple edge from the "llm" node to END.
-# This means graph execution finishes after the LLM is called.
-workflow.add_edge("llm", END)
-
-# Compile the workflow into a runnable app.
-app = workflow.compile()
```
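
For reference, the old `app.py` drove this graph with a single blocking `invoke` call, which is why streaming required replacing it. Condensed from the removed code above:

```python
from langchain_core.messages import HumanMessage, SystemMessage
from graph import app  # the compiled StateGraph, now deleted

inputs = {"messages": [SystemMessage(content="You are a friendly Chatbot."),
                       HumanMessage(content="Hello!")]}
final_state = app.invoke(inputs)            # one-shot call: no partial output
print(final_state["messages"][-1].content)  # the model's reply
```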