rabiyulfahim committed on
Commit ff74138 · verified · 1 Parent(s): 247f998

Update app.py

Files changed (1)
  1. app.py +65 -70
app.py CHANGED
@@ -1,25 +1,36 @@
-from fastapi import FastAPI,Query
+from fastapi import FastAPI, Query, HTTPException
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import torch
-import os
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
+import os
+import torch
 
-
-# ✅ Force Hugging Face cache to /tmp (writable in Spaces)
+# ✅ Hugging Face cache directory
 os.environ["HF_HOME"] = "/tmp"
 os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 
+# -----------------------
+# Model Setup
+# -----------------------
+model_id = "LLM360/K2-Think"
 
-model_id = "rabiyulfahim/qa_python_gpt2"
-
+# Load tokenizer and model
+print("Loading tokenizer and model...")
 tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="/tmp")
-model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir="/tmp")
-
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    cache_dir="/tmp",
+    device_map="auto",  # Automatically select GPU/CPU
+    torch_dtype=torch.float16
+)
+print("Model loaded successfully!")
 
-app = FastAPI(title="QA GPT2 API UI", description="Serving HuggingFace model with FastAPI")
+# -----------------------
+# FastAPI Setup
+# -----------------------
+app = FastAPI(title="K2-Think QA API", description="Serving K2-Think Hugging Face model with FastAPI")
 
 app.add_middleware(
     CORSMiddleware,
@@ -28,28 +39,25 @@ app.add_middleware(
     allow_methods=["*"],
     allow_headers=["*"],
 )
-# Request schema
+
+# Mount static folder
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+# -----------------------
+# Request Schema
+# -----------------------
 class QueryRequest(BaseModel):
     question: str
     max_new_tokens: int = 50
     temperature: float = 0.7
     top_p: float = 0.9
 
-
+# -----------------------
+# Endpoints
+# -----------------------
 @app.get("/")
 def home():
-    return {"message": "Welcome to QA GPT2 API 🚀"}
-
-@app.get("/ask")
-def ask(question: str, max_new_tokens: int = 50):
-    inputs = tokenizer(question, return_tensors="pt")
-    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return {"question": question, "answer": answer}
-
-
-# Mount static folder
-app.mount("/static", StaticFiles(directory="static"), name="static")
+    return {"message": "Welcome to K2-Think QA API 🚀"}
 
 @app.get("/ui", response_class=HTMLResponse)
 def serve_ui():
@@ -57,55 +65,42 @@ def serve_ui():
     with open(html_path, "r", encoding="utf-8") as f:
         return HTMLResponse(f.read())
 
-
-# Health check endpoint
 @app.get("/health")
 def health():
     return {"status": "ok"}
 
-# Inference endpoint
+@app.get("/ask")
+def ask(question: str = Query(...), max_new_tokens: int = Query(50)):
+    try:
+        inputs = tokenizer(question, return_tensors="pt")
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id,
+            return_dict_in_generate=True
+        )
+        answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+        return {"question": question, "answer": answer}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
 @app.post("/predict")
 def predict(request: QueryRequest):
-    inputs = tokenizer(request.question, return_tensors="pt")
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=request.max_new_tokens,
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        pad_token_id=tokenizer.eos_token_id,
-        return_dict_in_generate=True
-    )
-
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-    return {
-        "question": request.question,
-        "answer": answer
-    }
-
-
-
-
-@app.get("/answers")
-def predict(question: str = Query(..., description="The question to ask"), max_new_tokens: int = Query(50, description="Max new tokens to generate")):
-    # Tokenize the input question
-    inputs = tokenizer(question, return_tensors="pt")
-
-    # Generate output from model
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9,
-        pad_token_id=tokenizer.eos_token_id,
-        return_dict_in_generate=True
-    )
-
-    # Decode output
-    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
-
-    return {
-        "question": question,
-        "answer": answer
-    }
+    try:
+        inputs = tokenizer(request.question, return_tensors="pt")
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=request.max_new_tokens,
+            do_sample=True,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            pad_token_id=tokenizer.eos_token_id,
+            return_dict_in_generate=True
+        )
+        answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
+        return {"question": request.question, "answer": answer}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
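
For reference, a minimal client-side sketch of calling the two inference endpoints added in this commit. It is not part of the commit itself: the base URL and the sample question are placeholders, but the request shapes follow the /ask query parameters and the QueryRequest schema used by /predict.

import requests

BASE_URL = "http://localhost:7860"  # placeholder; replace with the running Space's URL

# GET /ask passes the question and generation length as query parameters
resp = requests.get(
    f"{BASE_URL}/ask",
    params={"question": "What is a Python decorator?", "max_new_tokens": 50},
)
print(resp.json())

# POST /predict sends a JSON body matching the QueryRequest schema
payload = {
    "question": "What is a Python decorator?",
    "max_new_tokens": 50,
    "temperature": 0.7,
    "top_p": 0.9,
}
resp = requests.post(f"{BASE_URL}/predict", json=payload)
print(resp.json())

Both calls return a JSON object with "question" and "answer" fields; errors surface as HTTP 500 responses via the HTTPException handlers above.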