"""FastAPI service exposing the LLM360/K2-Think model for question answering.

The tokenizer and the 8-bit quantized model are loaded once, at import time,
and shared by every request. Two endpoints run generation:

* ``GET /ask``      - question and token budget passed as query parameters.
* ``POST /predict`` - JSON body that also carries the sampling parameters.
"""

from fastapi import FastAPI, Query, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from pydantic import BaseModel, Field
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
import os
import torch

# -----------------------
# Set cache dirs (avoid Docker errors)
# -----------------------
# Torch caches are redirected under /tmp; the Hugging Face download cache is
# passed explicitly through ``cache_dir`` on the ``from_pretrained`` calls.
CACHE_DIR = "/tmp"
TORCH_HOME_DIR = "/tmp/torch_home"
TORCHINDUCTOR_CACHE_DIR = "/tmp/torch_inductor_cache"

os.environ["TORCH_HOME"] = TORCH_HOME_DIR
os.environ["TORCHINDUCTOR_CACHE_DIR"] = TORCHINDUCTOR_CACHE_DIR
os.makedirs(TORCH_HOME_DIR, exist_ok=True)
os.makedirs(TORCHINDUCTOR_CACHE_DIR, exist_ok=True)

# -----------------------
# Model Setup
# -----------------------
model_id = "LLM360/K2-Think"

# Sampling defaults shared by the GET and POST endpoints.
DEFAULT_MAX_NEW_TOKENS = 50
DEFAULT_TEMPERATURE = 0.7
DEFAULT_TOP_P = 0.9

print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True  # 8-bit quantization
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=CACHE_DIR,
)
print("Model loaded!")

# -----------------------
# FastAPI Setup
# -----------------------
app = FastAPI(
    title="K2-Think QA API",
    description="Serving K2-Think Hugging Face model with FastAPI",
)

# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site make credentialed requests; the FastAPI docs say origins must be listed
# explicitly when credentials are allowed. Left unchanged so existing browser
# clients keep working - tighten once the set of real front-end origins is known.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.mount("/static", StaticFiles(directory="static"), name="static")


# -----------------------
# Request Schema
# -----------------------
class QueryRequest(BaseModel):
    """JSON body accepted by ``POST /predict``.

    The numeric constraints reject values that sampling-based generation
    cannot use, so they fail request validation instead of surfacing as a
    500 from inside ``model.generate``.
    """

    # Prompt text handed to the tokenizer as-is.
    question: str
    # Number of tokens to generate beyond the prompt.
    max_new_tokens: int = Field(DEFAULT_MAX_NEW_TOKENS, ge=1)
    # Softmax temperature; must be strictly positive because do_sample=True.
    temperature: float = Field(DEFAULT_TEMPERATURE, gt=0.0)
    # Nucleus-sampling probability mass.
    top_p: float = Field(DEFAULT_TOP_P, ge=0.0, le=1.0)


# -----------------------
# Generation helper
# -----------------------
def _generate_answer(
    question: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Sample a completion for *question* and return the decoded text.

    The decoded string is the whole sequence returned by ``generate``; for a
    decoder-only model that is expected to be the prompt followed by the newly
    generated tokens (kept as-is to preserve the existing response format).
    """
    # The model is placed by device_map="auto", so the tokenized inputs have
    # to be moved onto the model's device before calling generate().
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        # Reuse EOS as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
    )
    return tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)


# -----------------------
# Endpoints
# -----------------------
@app.get("/")
def home():
    """Return a static welcome message."""
    return {"message": "Welcome to K2-Think QA API 🚀"}


@app.get("/health")
def health():
    """Liveness probe; does not touch the model."""
    return {"status": "ok"}


@app.get("/ask")
def ask(
    question: str = Query(...),
    max_new_tokens: int = Query(DEFAULT_MAX_NEW_TOKENS, ge=1),
):
    """Answer *question* using the default temperature and top-p.

    Returns a dict with the original ``question`` and the generated ``answer``.

    Raises:
        HTTPException: status 500 carrying the error text if tokenization,
            generation or decoding fails.
    """
    try:
        answer = _generate_answer(
            question,
            max_new_tokens=max_new_tokens,
            temperature=DEFAULT_TEMPERATURE,
            top_p=DEFAULT_TOP_P,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"question": question, "answer": answer}


@app.post("/predict")
def predict(request: QueryRequest):
    """Answer ``request.question`` using the sampling settings in the body.

    Returns a dict with the original ``question`` and the generated ``answer``.

    Raises:
        HTTPException: status 500 carrying the error text if tokenization,
            generation or decoding fails.
    """
    try:
        answer = _generate_answer(
            request.question,
            max_new_tokens=request.max_new_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"question": request.question, "answer": answer}