rahul7star committed
Commit 8c48eed · verified · 1 Parent(s): 2c51b55

Update app_low.py

Files changed (1):
  1. app_low.py +14 -10
app_low.py CHANGED
@@ -1,18 +1,19 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, snapshot_download
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import snapshot_download
 import os
 
 # ============================================================
-# 1️⃣ Download model efficiently
+# 1️⃣ Download model efficiently (avoid exceeding space limits)
 # ============================================================
 MODEL_ID = "Qwen/Qwen2.5-1.5B"
 
-# Download to /tmp to avoid HF Space quota overflow
+# Store in /tmp to reduce Space storage pressure
 model_dir = snapshot_download(repo_id=MODEL_ID, cache_dir="/tmp/qwen_model")
 
 # ============================================================
-# 2️⃣ Load model with CPU/offload optimizations
+# 2️⃣ Load model with CPU or GPU offload
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -22,10 +23,11 @@ model = AutoModelForCausalLM.from_pretrained(
     device_map="auto" if torch.cuda.is_available() else None,
     low_cpu_mem_usage=True,
 )
+
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 # ============================================================
-# 3️⃣ Define Chat Function
+# 3️⃣ Define chat function
 # ============================================================
 def chat_with_qwen(message, history):
     history = history or []
@@ -35,17 +37,20 @@ def chat_with_qwen(message, history):
         messages.append({"role": "assistant", "content": bot})
     messages.append({"role": "user", "content": message})
 
+    # Tokenize input messages
    inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt"
-    ).to(device)
+    )
+
+    inputs = {k: v.to(device) for k, v in inputs.items()}
 
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=300,
+            max_new_tokens=256,
             temperature=0.8,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id
@@ -55,14 +60,13 @@ def chat_with_qwen(message, history):
     history.append((message, response))
     return history, history
 
-
 # ============================================================
 # 4️⃣ Gradio UI
 # ============================================================
 with gr.Blocks(theme="soft", title="Qwen 2.5 Chatbot") as demo:
-    gr.Markdown("## 🧠 Qwen 2.5 — Lightweight Chatbot (Optimized for CPU & GPU Offload)")
+    gr.Markdown("## 🤖 Qwen 2.5 Chatbot — Optimized for CPU/GPU Offload")
     chatbot = gr.Chatbot(height=480, label="Chat with Qwen 2.5", type="messages")
-    msg = gr.Textbox(placeholder="Ask me anything...", label="Your message")
+    msg = gr.Textbox(placeholder="Type your question here...", label="Your Message")
     clear = gr.Button("🧹 Clear Chat")
 
     msg.submit(chat_with_qwen, [msg, chatbot], [chatbot, chatbot])
 
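The substantive fix in this commit is the import: snapshot_download is exported by huggingface_hub, not by transformers, so the old top-level import fails with an ImportError on current transformers releases. A minimal standalone sketch of the download step the new code performs:

from huggingface_hub import snapshot_download

# Fetch the full repo snapshot once; cache_dir points at /tmp so the
# weights do not count against the Space's persistent storage quota.
model_dir = snapshot_download(
    repo_id="Qwen/Qwen2.5-1.5B",
    cache_dir="/tmp/qwen_model",
)
print(model_dir)  # local directory that from_pretrained() can load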
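The second hunk shows only the tail of the from_pretrained call; its opening arguments sit outside the diff context. A sketch of a complete load consistent with the visible lines, where the dtype choice is an assumption (it is not shown in the diff); note that the device_map="auto" path requires the accelerate package:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_dir comes from the snapshot_download() call above.
# device_map="auto" needs `pip install accelerate` when a GPU is present.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # assumed, not in the diff
    device_map="auto" if torch.cuda.is_available() else None,
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)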
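One caveat in the new tokenization block: by default, apply_chat_template(tokenize=True, return_tensors="pt") returns a bare input_ids tensor, so the added dict comprehension's .items() call only works if the template call also passes return_dict=True, which yields a BatchEncoding holding input_ids and attention_mask. A sketch of the dict-returning form:

# return_dict=True makes apply_chat_template return a BatchEncoding
# (input_ids + attention_mask) rather than a bare tensor of token IDs.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = {k: v.to(device) for k, v in inputs.items()}

Passing the attention mask through to generate() also avoids the usual "attention mask is not set" warning.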
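The line that actually produces response falls between the hunks and is not part of the diff. A common decoding pattern consistent with the surrounding code, assuming the reply is decoded from the newly generated tokens only (the exact line in app_low.py may differ):

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.8,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Drop the prompt tokens so only the model's reply is decoded.
new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
response = tokenizer.decode(new_tokens, skip_special_tokens=True)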
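Two UI details are worth flagging. gr.Chatbot(type="messages") expects history entries as role/content dicts, while chat_with_qwen still appends (message, response) tuples, so recent Gradio releases will warn about or reject that return value; and the clear button's handler lies outside the hunks. A sketch of a messages-format return plus a typical clear wiring, both assumptions rather than code shown in the diff:

def chat_with_qwen(message, history):
    history = history or []
    response = "..."  # placeholder for the text generated above
    # Dict entries match gr.Chatbot(type="messages"):
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": response})
    return history, history

# Typical wiring for the clear button (not shown in the hunks):
clear.click(lambda: [], None, chatbot)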