qa_python_k2_think_api_UI

Runtime error

rabiyulfahim committed on Oct 6

Commit

6412a86

verified ·

1 Parent(s): ef9e4f5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,25 +7,26 @@ from fastapi.staticfiles import StaticFiles
 import os
 import torch
-# ✅ Hugging Face cache directory
-os.environ["HF_HOME"] = "/tmp"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 # -----------------------
 # Model Setup
 # -----------------------
 model_id = "LLM360/K2-Think"
-# Load tokenizer and model
 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    cache_dir="/tmp",
-    device_map="auto",       # Automatically select GPU/CPU
-    torch_dtype=torch.float16
 )
-print("Model loaded successfully!")
 # -----------------------
 # FastAPI Setup
@@ -40,7 +41,6 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Mount static folder
 app.mount("/static", StaticFiles(directory="static"), name="static")
 # -----------------------

 import os
 import torch
+# -----------------------
+# Hugging Face cache
+# -----------------------
+os.environ["HF_HOME"] = "/tmp"  # writable cache
+os.environ["TRANSFORMERS_CACHE"] = "/tmp"  # optional
 # -----------------------
 # Model Setup
 # -----------------------
 model_id = "LLM360/K2-Think"
 print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
+    device_map="auto",        # auto assign to GPU/CPU
+    load_in_8bit=True,        # 8-bit quantization for low memory
+    cache_dir="/tmp"
 )
+print("Model loaded!")
 # -----------------------
 # FastAPI Setup
     allow_headers=["*"],
 )
 app.mount("/static", StaticFiles(directory="static"), name="static")
 # -----------------------