PromptEnhancer_32B-test

Running

App Files Community

rahul7star committed on Oct 16

Commit

8c48eed

verified ·

1 Parent(s): 2c51b55

Update app_low.py

Browse files

Files changed (1) hide show

app_low.py +14 -10

app_low.py CHANGED Viewed

@@ -1,18 +1,19 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, snapshot_download
 import os
 # ============================================================
-# 1️⃣ Download model efficiently
 # ============================================================
 MODEL_ID = "Qwen/Qwen2.5-1.5B"
-# Download to /tmp to avoid HF Space quota overflow
 model_dir = snapshot_download(repo_id=MODEL_ID, cache_dir="/tmp/qwen_model")
 # ============================================================
-# 2️⃣ Load model with CPU/offload optimizations
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -22,10 +23,11 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto" if torch.cuda.is_available() else None,
     low_cpu_mem_usage=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 # ============================================================
-# 3️⃣ Define Chat Function
 # ============================================================
 def chat_with_qwen(message, history):
     history = history or []
@@ -35,17 +37,20 @@ def chat_with_qwen(message, history):
         messages.append({"role": "assistant", "content": bot})
     messages.append({"role": "user", "content": message})
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt"
-    ).to(device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=300,
             temperature=0.8,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id
@@ -55,14 +60,13 @@ def chat_with_qwen(message, history):
     history.append((message, response))
     return history, history
 # ============================================================
 # 4️⃣ Gradio UI
 # ============================================================
 with gr.Blocks(theme="soft", title="Qwen 2.5 Chatbot") as demo:
-    gr.Markdown("## 🧠 Qwen 2.5 — Lightweight Chatbot (Optimized for CPU & GPU Offload)")
     chatbot = gr.Chatbot(height=480, label="Chat with Qwen 2.5", type="messages")
-    msg = gr.Textbox(placeholder="Ask me anything...", label="Your message")
     clear = gr.Button("🧹 Clear Chat")
     msg.submit(chat_with_qwen, [msg, chatbot], [chatbot, chatbot])

 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import snapshot_download
 import os
 # ============================================================
+# 1️⃣ Download model efficiently (avoid exceeding space limits)
 # ============================================================
 MODEL_ID = "Qwen/Qwen2.5-1.5B"
+# Store in /tmp to reduce Space storage pressure
 model_dir = snapshot_download(repo_id=MODEL_ID, cache_dir="/tmp/qwen_model")
 # ============================================================
+# 2️⃣ Load model with CPU or GPU offload
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
     device_map="auto" if torch.cuda.is_available() else None,
     low_cpu_mem_usage=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 # ============================================================
+# 3️⃣ Define chat function
 # ============================================================
 def chat_with_qwen(message, history):
     history = history or []
         messages.append({"role": "assistant", "content": bot})
     messages.append({"role": "user", "content": message})
+    # Tokenize input messages
     inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt"
+    )
+    inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=256,
             temperature=0.8,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id
     history.append((message, response))
     return history, history
 # ============================================================
 # 4️⃣ Gradio UI
 # ============================================================
 with gr.Blocks(theme="soft", title="Qwen 2.5 Chatbot") as demo:
+    gr.Markdown("## 🤖 Qwen 2.5 Chatbot — Optimized for CPU/GPU Offload")
     chatbot = gr.Chatbot(height=480, label="Chat with Qwen 2.5", type="messages")
+    msg = gr.Textbox(placeholder="Type your question here...", label="Your Message")
     clear = gr.Button("🧹 Clear Chat")
     msg.submit(chat_with_qwen, [msg, chatbot], [chatbot, chatbot])