"""
Docker Model Runner - CPU-Optimized FastAPI application

Optimized for: 2 vCPU, 16GB RAM
"""

import os
from contextlib import asynccontextmanager
from datetime import datetime
from typing import List, Optional

import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModel, AutoTokenizer, pipeline

# Model names, overridable via environment variables
MODEL_NAME = os.getenv("MODEL_NAME", "distilbert-base-uncased-finetuned-sst-2-english")
GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Match the PyTorch thread pool to the 2 vCPUs available
torch.set_num_threads(2)

# Global model registry, populated at startup by load_models()
models = {}


def load_models():
    """Pre-load models at startup for faster inference (downloads weights on first run)."""
    global models
    print("Loading models for CPU inference...")

    # device=-1 keeps the pipelines on CPU
    models["classifier"] = pipeline(
        "text-classification",
        model=MODEL_NAME,
        device=-1,
        torch_dtype=torch.float32,
    )

    models["generator"] = pipeline(
        "text-generation",
        model=GENERATOR_MODEL,
        device=-1,
        torch_dtype=torch.float32,
    )

    # Embedding model is used directly (tokenizer + encoder) rather than via a pipeline
    models["tokenizer"] = AutoTokenizer.from_pretrained(EMBED_MODEL)
    models["embedder"] = AutoModel.from_pretrained(EMBED_MODEL)
    models["embedder"].eval()

    print("✅ All models loaded successfully!")


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load models before the app starts serving; release them on shutdown
    load_models()
    yield
    models.clear()


app = FastAPI(
    title="Docker Model Runner",
    description="CPU-Optimized HuggingFace Space with named endpoints",
    version="1.0.0",
    lifespan=lifespan,
)


class PredictRequest(BaseModel):
    text: str
    top_k: Optional[int] = 1


class PredictResponse(BaseModel):
    predictions: List[dict]
    model: str
    latency_ms: float


class GenerateRequest(BaseModel):
    prompt: str
    max_length: Optional[int] = 50
    num_return_sequences: Optional[int] = 1
    temperature: Optional[float] = 0.7


class GenerateResponse(BaseModel):
    generated_text: List[str]
    model: str
    latency_ms: float


class EmbedRequest(BaseModel):
    texts: List[str]


class EmbedResponse(BaseModel):
    embeddings: List[List[float]]
    model: str
    dimensions: int
    latency_ms: float


class HealthResponse(BaseModel):
    status: str
    timestamp: str
    hardware: str
    models_loaded: bool


class InfoResponse(BaseModel):
    name: str
    version: str
    hardware: str
    models: dict
    endpoints: List[str]


@app.get("/") |
|
|
async def root(): |
|
|
"""Welcome endpoint""" |
|
|
return { |
|
|
"message": "Docker Model Runner API (CPU Optimized)", |
|
|
"hardware": "CPU Basic: 2 vCPU · 16 GB RAM", |
|
|
"docs": "/docs", |
|
|
"endpoints": ["/health", "/info", "/predict", "/generate", "/embed"] |
|
|
} |
|
|
|
|
|
|
|
|
@app.get("/health", response_model=HealthResponse) |
|
|
async def health(): |
|
|
"""Health check endpoint""" |
|
|
return HealthResponse( |
|
|
status="healthy", |
|
|
timestamp=datetime.utcnow().isoformat(), |
|
|
hardware="CPU Basic: 2 vCPU · 16 GB RAM", |
|
|
models_loaded=len(models) > 0 |
|
|
) |
|
|
|
|
|
|
|
|
@app.get("/info", response_model=InfoResponse) |
|
|
async def info(): |
|
|
"""Model and API information""" |
|
|
return InfoResponse( |
|
|
name="Docker Model Runner", |
|
|
version="1.0.0", |
|
|
hardware="CPU Basic: 2 vCPU · 16 GB RAM", |
|
|
models={ |
|
|
"classifier": MODEL_NAME, |
|
|
"generator": GENERATOR_MODEL, |
|
|
"embedder": EMBED_MODEL |
|
|
}, |
|
|
endpoints=["/", "/health", "/info", "/predict", "/generate", "/embed"] |
|
|
) |
|
|
|
|
|
|
|
|
@app.post("/predict", response_model=PredictResponse) |
|
|
async def predict(request: PredictRequest): |
|
|
""" |
|
|
Run text classification (sentiment analysis) |
|
|
|
|
|
- **text**: Input text to classify |
|
|
- **top_k**: Number of top predictions to return |
|
|
""" |
|
|
try: |
|
|
start_time = datetime.now() |
|
|
results = models["classifier"](request.text, top_k=request.top_k) |
|
|
latency = (datetime.now() - start_time).total_seconds() * 1000 |
|
|
|
|
|
return PredictResponse( |
|
|
predictions=results, |
|
|
model=MODEL_NAME, |
|
|
latency_ms=round(latency, 2) |
|
|
) |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=str(e)) |
|
|
|
|
|
|
|
|
@app.post("/generate", response_model=GenerateResponse) |
|
|
async def generate(request: GenerateRequest): |
|
|
""" |
|
|
Generate text from a prompt |
|
|
|
|
|
- **prompt**: Input prompt for generation |
|
|
- **max_length**: Maximum length of generated text (default: 50) |
|
|
- **temperature**: Sampling temperature (default: 0.7) |
|
|
""" |
|
|
try: |
|
|
start_time = datetime.now() |
|
|
results = models["generator"]( |
|
|
request.prompt, |
|
|
max_length=request.max_length, |
|
|
num_return_sequences=request.num_return_sequences, |
|
|
temperature=request.temperature, |
|
|
do_sample=True, |
|
|
pad_token_id=50256 |
|
|
) |
|
|
latency = (datetime.now() - start_time).total_seconds() * 1000 |
|
|
|
|
|
generated_texts = [r["generated_text"] for r in results] |
|
|
|
|
|
return GenerateResponse( |
|
|
generated_text=generated_texts, |
|
|
model=GENERATOR_MODEL, |
|
|
latency_ms=round(latency, 2) |
|
|
) |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=str(e)) |
|
|
|
|
|
|
|
|
@app.post("/embed", response_model=EmbedResponse) |
|
|
async def embed(request: EmbedRequest): |
|
|
""" |
|
|
Get text embeddings using MiniLM (384 dimensions) |
|
|
|
|
|
- **texts**: List of texts to embed |
|
|
""" |
|
|
try: |
|
|
start_time = datetime.now() |
|
|
|
|
|
|
|
|
inputs = models["tokenizer"]( |
|
|
request.texts, |
|
|
padding=True, |
|
|
truncation=True, |
|
|
max_length=256, |
|
|
return_tensors="pt" |
|
|
) |
|
|
|
|
|
|
|
|
with torch.no_grad(): |
|
|
outputs = models["embedder"](**inputs) |
|
|
|
|
|
attention_mask = inputs["attention_mask"] |
|
|
token_embeddings = outputs.last_hidden_state |
|
|
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() |
|
|
embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) |
|
|
|
|
|
latency = (datetime.now() - start_time).total_seconds() * 1000 |
|
|
|
|
|
return EmbedResponse( |
|
|
embeddings=embeddings.tolist(), |
|
|
model=EMBED_MODEL, |
|
|
dimensions=embeddings.shape[1], |
|
|
latency_ms=round(latency, 2) |
|
|
) |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=str(e)) |
|
|
|
|
|
|
|
|
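# Illustrative sketch only, not used by the API: the /embed vectors are
# mean-pooled but not L2-normalized, so cosine similarity is the natural way to
# compare them. The helper name below is hypothetical and exists purely as an
# example of consuming the endpoint's output.
def example_cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine similarity between two embedding vectors returned by /embed."""
    va, vb = torch.tensor(a), torch.tensor(b)
    return float(torch.dot(va, vb) / (va.norm() * vb.norm()))

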
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
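
# Illustrative client calls, assuming the server is running locally on the
# port 7860 configured above (payload fields follow the request models defined
# in this file):
#
#   curl -X POST http://localhost:7860/predict \
#        -H "Content-Type: application/json" \
#        -d '{"text": "I love this product!", "top_k": 2}'
#
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Docker is", "max_length": 40}'
#
#   curl -X POST http://localhost:7860/embed \
#        -H "Content-Type: application/json" \
#        -d '{"texts": ["hello world", "containers everywhere"]}'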