Spaces:

likhonsheikhdev
/

docker-model-runner

Sleeping

App Files Community

likhonsheikhdev committed 8 days ago

Commit

7222b60

verified ·

1 Parent(s): 1ea9642

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

main.py +128 -57

main.py CHANGED Viewed

@@ -1,11 +1,13 @@
 """
 Docker Model Runner - Anthropic API Compatible
 Full compatibility with Anthropic Messages API + Interleaved Thinking
 Optimized for: 2 vCPU, 16GB RAM
 """
 from fastapi import FastAPI, HTTPException, Header, Request
 from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field
 from typing import Optional, List, Union, Literal, Any, Dict
 import torch
@@ -17,7 +19,6 @@ import uuid
 import time
 import json
 import asyncio
-import re
 # CPU-optimized lightweight models
 GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
@@ -31,17 +32,13 @@ models = {}
 def load_models():
-    """Pre-load models for faster inference"""
     global models
     print("Loading models for CPU inference...")
     models["tokenizer"] = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
     models["model"] = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL)
     models["model"].eval()
     if models["tokenizer"].pad_token is None:
         models["tokenizer"].pad_token = models["tokenizer"].eos_token
     print("✅ All models loaded successfully!")
@@ -54,13 +51,22 @@ async def lifespan(app: FastAPI):
 app = FastAPI(
     title="Model Runner",
-    description="Anthropic API Compatible with Interleaved Thinking",
-    version="1.0.0",
     lifespan=lifespan,
     docs_url="/api/docs",
     redoc_url="/api/redoc"
 )
 # ============== Anthropic API Models ==============
@@ -143,7 +149,7 @@ class Metadata(BaseModel):
 class AnthropicRequest(BaseModel):
     model: str = "MiniMax-M2"
     messages: List[MessageParam]
-    max_tokens: int = 1024
     temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
     top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
     top_k: Optional[int] = None
@@ -153,7 +159,7 @@ class AnthropicRequest(BaseModel):
     tools: Optional[List[Tool]] = None
     tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
     metadata: Optional[Metadata] = None
-    thinking: Optional[ThinkingConfig] = None
     service_tier: Optional[str] = None
@@ -219,10 +225,13 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Uni
                     if block_type == 'thinking' and include_thinking:
                         prompt_parts.append(f"<thinking>{block.get('thinking', '')}</thinking>\n")
                     elif block_type == 'text':
                         if role == "user":
-                            prompt_parts.append(f"Human: {block.get('text', '')}\n\n")
                         else:
-                            prompt_parts.append(f"Assistant: {block.get('text', '')}\n\n")
                 elif hasattr(block, 'type'):
                     if block.type == 'thinking' and include_thinking:
                         prompt_parts.append(f"<thinking>{block.thinking}</thinking>\n")
@@ -244,12 +253,12 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Uni
 def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
     tokenizer = models["tokenizer"]
     model = models["model"]
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
     input_tokens = inputs["input_ids"].shape[1]
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=min(max_tokens, 256),
             temperature=temperature if temperature > 0 else 1.0,
             top_p=top_p,
             do_sample=temperature > 0,
@@ -271,7 +280,7 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=min(budget_tokens, 128),
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
@@ -287,7 +296,7 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
 async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
     tokenizer = models["tokenizer"]
     model = models["model"]
-    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
     input_tokens = inputs["input_ids"].shape[1]
     total_output_tokens = 0
@@ -314,7 +323,7 @@ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperatur
     yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
     with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 256), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
     generated_tokens = outputs[0][input_tokens:]
     generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
@@ -322,7 +331,7 @@ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperatur
     for i in range(0, len(generated_text), 5):
         yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
-        await asyncio.sleep(0.01)
     yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
     yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
@@ -338,45 +347,19 @@ def handle_tool_call(tools: List[Tool], messages: List[MessageParam], generated_
     return None
-# ============== Frontend ==============
-@app.get("/", response_class=HTMLResponse)
-async def home():
-    """Serve the minimal centered frontend"""
-    try:
-        with open("/app/static/index.html", "r") as f:
-            return HTMLResponse(content=f.read())
-    except:
-        return HTMLResponse(content="""
-<!DOCTYPE html>
-<html><head><meta charset="UTF-8"><title>Model Runner</title>
-<style>*{margin:0;padding:0}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center}
-.logo{width:200px;height:200px;animation:float 3s ease-in-out infinite}
-@keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}</style></head>
-<body><div class="logo"><svg viewBox="0 0 200 200" fill="none">
-<defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
-<stop offset="0%" stop-color="#ff0080"/><stop offset="25%" stop-color="#ff4d00"/>
-<stop offset="50%" stop-color="#ffcc00"/><stop offset="75%" stop-color="#00ff88"/>
-<stop offset="100%" stop-color="#00ccff"/></linearGradient></defs>
-<path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" fill="none"/>
-<path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" fill="none"/>
-<line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
-</svg></div></body></html>
-        """)
-# ============== Anthropic API Endpoints ==============
-@app.post("/v1/messages")
-async def create_message(request: AnthropicRequest):
     try:
         message_id = f"msg_{uuid.uuid4().hex[:24]}"
         thinking_enabled = False
         thinking_budget = 100
         if request.thinking:
             if isinstance(request.thinking, dict):
                 thinking_enabled = request.thinking.get('type') == 'enabled'
-                thinking_budget = request.thinking.get('budget_tokens', 100)
             else:
                 thinking_enabled = request.thinking.type == 'enabled'
                 thinking_budget = request.thinking.budget_tokens or 100
@@ -416,17 +399,66 @@ async def create_message(request: AnthropicRequest):
         raise HTTPException(status_code=500, detail=str(e))
 # ============== OpenAI Compatible ==============
 class ChatMessage(BaseModel):
     role: str
-    content: str
 class ChatCompletionRequest(BaseModel):
-    model: str = "distilgpt2"
     messages: List[ChatMessage]
-    max_tokens: Optional[int] = 1024
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
     stream: Optional[bool] = False
@@ -435,19 +467,51 @@ class ChatCompletionRequest(BaseModel):
 @app.post("/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
     try:
-        anthropic_messages = [MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user", content=msg.content) for msg in request.messages if msg.role in ["user", "assistant"]]
-        prompt = format_messages_to_prompt(anthropic_messages)
-        generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 1024, request.temperature or 0.7, request.top_p or 1.0)
-        return {"id": f"chatcmpl-{uuid.uuid4().hex[:24]}", "object": "chat.completion", "created": int(time.time()), "model": request.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}], "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/v1/models")
 async def list_models():
-    return {"object": "list", "data": [{"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}]}
 @app.get("/health")
 async def health():
     return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
@@ -455,7 +519,14 @@ async def health():
 @app.get("/info")
 async def info():
-    return {"name": "Model Runner", "version": "1.1.0", "api_compatibility": ["anthropic", "openai"], "interleaved_thinking": True}
 if __name__ == "__main__":

 """
 Docker Model Runner - Anthropic API Compatible
 Full compatibility with Anthropic Messages API + Interleaved Thinking
+Supports: /v1/messages, /anthropic/v1/messages, /api/v1/messages
 Optimized for: 2 vCPU, 16GB RAM
 """
 from fastapi import FastAPI, HTTPException, Header, Request
 from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
 from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 from typing import Optional, List, Union, Literal, Any, Dict
 import torch
 import time
 import json
 import asyncio
 # CPU-optimized lightweight models
 GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
 def load_models():
     global models
     print("Loading models for CPU inference...")
     models["tokenizer"] = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
     models["model"] = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL)
     models["model"].eval()
     if models["tokenizer"].pad_token is None:
         models["tokenizer"].pad_token = models["tokenizer"].eos_token
     print("✅ All models loaded successfully!")
 app = FastAPI(
     title="Model Runner",
+    description="Anthropic API Compatible - Works with Claude Code & Agentic Tools",
+    version="1.1.0",
     lifespan=lifespan,
     docs_url="/api/docs",
     redoc_url="/api/redoc"
 )
+# CORS for agentic tools
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 # ============== Anthropic API Models ==============
 class AnthropicRequest(BaseModel):
     model: str = "MiniMax-M2"
     messages: List[MessageParam]
+    max_tokens: int = 4096
     temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
     top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
     top_k: Optional[int] = None
     tools: Optional[List[Tool]] = None
     tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
     metadata: Optional[Metadata] = None
+    thinking: Optional[Union[ThinkingConfig, Dict[str, Any]]] = None
     service_tier: Optional[str] = None
                     if block_type == 'thinking' and include_thinking:
                         prompt_parts.append(f"<thinking>{block.get('thinking', '')}</thinking>\n")
                     elif block_type == 'text':
+                        text_content = block.get('text', '')
                         if role == "user":
+                            prompt_parts.append(f"Human: {text_content}\n\n")
                         else:
+                            prompt_parts.append(f"Assistant: {text_content}\n\n")
+                    elif block_type == 'tool_result':
+                        prompt_parts.append(f"Tool Result: {block.get('content', '')}\n\n")
                 elif hasattr(block, 'type'):
                     if block.type == 'thinking' and include_thinking:
                         prompt_parts.append(f"<thinking>{block.thinking}</thinking>\n")
 def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
     tokenizer = models["tokenizer"]
     model = models["model"]
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
     input_tokens = inputs["input_ids"].shape[1]
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=min(max_tokens, 512),
             temperature=temperature if temperature > 0 else 1.0,
             top_p=top_p,
             do_sample=temperature > 0,
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=min(budget_tokens, 256),
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
 async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
     tokenizer = models["tokenizer"]
     model = models["model"]
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
     input_tokens = inputs["input_ids"].shape[1]
     total_output_tokens = 0
     yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
     with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 512), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
     generated_tokens = outputs[0][input_tokens:]
     generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
     for i in range(0, len(generated_text), 5):
         yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
+        await asyncio.sleep(0.005)
     yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
     yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
     return None
+# ============== Core Messages Handler ==============
+async def handle_messages(request: AnthropicRequest):
+    """Core handler for Anthropic Messages API"""
     try:
         message_id = f"msg_{uuid.uuid4().hex[:24]}"
         thinking_enabled = False
         thinking_budget = 100
         if request.thinking:
             if isinstance(request.thinking, dict):
                 thinking_enabled = request.thinking.get('type') == 'enabled'
+                thinking_budget = request.thinking.get('budget_tokens', 100) or 100
             else:
                 thinking_enabled = request.thinking.type == 'enabled'
                 thinking_budget = request.thinking.budget_tokens or 100
         raise HTTPException(status_code=500, detail=str(e))
+# ============== Frontend ==============
+@app.get("/", response_class=HTMLResponse)
+async def home():
+    return HTMLResponse(content="""<!DOCTYPE html>
+<html><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1"><title>Model Runner</title>
+<style>*{margin:0;padding:0;box-sizing:border-box}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center;font-family:system-ui,sans-serif}
+.container{display:flex;flex-direction:column;align-items:center;gap:2rem}
+.logo{width:200px;height:200px;animation:float 3s ease-in-out infinite;filter:drop-shadow(0 0 30px rgba(255,100,100,0.3))}
+.status{display:flex;align-items:center;gap:0.5rem;color:rgba(255,255,255,0.6);font-size:0.875rem}
+.dot{width:8px;height:8px;background:#22c55e;border-radius:50%;animation:pulse 2s ease-in-out infinite}
+.sparkle{position:fixed;bottom:2rem;right:2rem;opacity:0.4}
+@keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}
+@keyframes pulse{0%,100%{opacity:1}50%{opacity:0.5}}</style></head>
+<body><div class="container"><div class="logo"><svg viewBox="0 0 200 200" fill="none">
+<defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
+<stop offset="0%" stop-color="#ff0080"/><stop offset="20%" stop-color="#ff4d00"/>
+<stop offset="40%" stop-color="#ffcc00"/><stop offset="60%" stop-color="#00ff88"/>
+<stop offset="80%" stop-color="#00ccff"/><stop offset="100%" stop-color="#6644ff"/></linearGradient></defs>
+<path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+<path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+<line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
+</svg></div><div class="status"><span class="dot"></span><span>Ready</span></div></div>
+<svg class="sparkle" width="24" height="24" viewBox="0 0 24 24" fill="none">
+<path d="M12 2L13.5 8.5L20 10L13.5 11.5L12 18L10.5 11.5L4 10L10.5 8.5L12 2Z" fill="rgba(255,255,255,0.6)"/></svg>
+</body></html>""")
+# ============== Anthropic API Routes ==============
+# Support multiple base paths for compatibility
+@app.post("/v1/messages")
+async def messages_v1(request: AnthropicRequest):
+    """Standard Anthropic API endpoint"""
+    return await handle_messages(request)
+@app.post("/anthropic/v1/messages")
+async def messages_anthropic(request: AnthropicRequest):
+    """Anthropic base path - for Claude Code compatibility"""
+    return await handle_messages(request)
+@app.post("/api/v1/messages")
+async def messages_api(request: AnthropicRequest):
+    """API base path variant"""
+    return await handle_messages(request)
 # ============== OpenAI Compatible ==============
 class ChatMessage(BaseModel):
     role: str
+    content: Union[str, List[Dict[str, Any]]]
 class ChatCompletionRequest(BaseModel):
+    model: str = "gpt-4"
     messages: List[ChatMessage]
+    max_tokens: Optional[int] = 4096
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
     stream: Optional[bool] = False
 @app.post("/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
     try:
+        # Extract text from messages
+        formatted_messages = []
+        for msg in request.messages:
+            if msg.role in ["user", "assistant"]:
+                content = msg.content
+                if isinstance(content, list):
+                    text_parts = [c.get('text', '') for c in content if isinstance(c, dict) and c.get('type') == 'text']
+                    content = ' '.join(text_parts)
+                formatted_messages.append(MessageParam(role=msg.role, content=content))
+        prompt = format_messages_to_prompt(formatted_messages)
+        generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 4096, request.temperature or 0.7, request.top_p or 1.0)
+        return {
+            "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": request.model,
+            "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}],
+            "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}
+        }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+# ============== Models Endpoints ==============
 @app.get("/v1/models")
+@app.get("/anthropic/v1/models")
+@app.get("/api/v1/models")
 async def list_models():
+    return {
+        "object": "list",
+        "data": [
+            {"id": "claude-sonnet-4-20250514", "object": "model", "created": int(time.time()), "owned_by": "anthropic"},
+            {"id": "claude-3-5-sonnet-20241022", "object": "model", "created": int(time.time()), "owned_by": "anthropic"},
+            {"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"},
+            {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"},
+            {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}
+        ]
+    }
+# ============== Utility Endpoints ==============
 @app.get("/health")
 async def health():
     return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
 @app.get("/info")
 async def info():
+    return {
+        "name": "Model Runner",
+        "version": "1.1.0",
+        "api_compatibility": ["anthropic", "openai"],
+        "base_paths": ["/v1/messages", "/anthropic/v1/messages", "/api/v1/messages"],
+        "interleaved_thinking": True,
+        "agentic_tools": True
+    }
 if __name__ == "__main__":