likhonsheikhdev committed
Commit 1ea9642 · verified · 1 Parent(s): 7270816

Upload folder using huggingface_hub

Files changed (3):
  1. Dockerfile +4 -0
  2. main.py +62 -268
  3. static/index.html +115 -0
Dockerfile CHANGED
@@ -25,6 +25,10 @@ RUN pip install --no-cache-dir fastapi==0.115.0 uvicorn[standard]==0.30.6 \
  # Copy application code
  COPY . .
 
+ # Create static directory
+ RUN mkdir -p /app/static
+ COPY static/ /app/static/
+
  # Create non-root user for security
  RUN useradd -m -u 1000 user
  USER user
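
The added lines bake the static/ assets into the image next to the application code, so the new `/` route in main.py can read /app/static/index.html at runtime. A minimal smoke-test sketch follows; the host and port are assumptions (7860 is the usual Hugging Face Spaces port), not something this commit pins down.

```python
# Hypothetical smoke test: fetch the root page that main.py now serves from
# /app/static/index.html and confirm it looks like the bundled splash page.
# BASE_URL is an assumption; point it at wherever the container is running.
import urllib.request

BASE_URL = "http://localhost:7860"

with urllib.request.urlopen(f"{BASE_URL}/") as resp:
    body = resp.read().decode("utf-8")
    assert resp.status == 200
    assert "<title>Model Runner</title>" in body  # title set in static/index.html
print("static frontend served OK")
```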
main.py CHANGED
@@ -4,7 +4,8 @@ Full compatibility with Anthropic Messages API + Interleaved Thinking
  Optimized for: 2 vCPU, 16GB RAM
  """
  from fastapi import FastAPI, HTTPException, Header, Request
- from fastapi.responses import StreamingResponse
+ from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
+ from fastapi.staticfiles import StaticFiles
  from pydantic import BaseModel, Field
  from typing import Optional, List, Union, Literal, Any, Dict
  import torch
@@ -52,10 +53,12 @@ async def lifespan(app: FastAPI):
 
 
  app = FastAPI(
- title="Docker Model Runner",
+ title="Model Runner",
  description="Anthropic API Compatible with Interleaved Thinking",
  version="1.0.0",
- lifespan=lifespan
+ lifespan=lifespan,
+ docs_url="/api/docs",
+ redoc_url="/api/redoc"
  )
 
 
@@ -143,15 +146,15 @@ class AnthropicRequest(BaseModel):
  max_tokens: int = 1024
  temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
  top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
- top_k: Optional[int] = None # Ignored
- stop_sequences: Optional[List[str]] = None # Ignored
+ top_k: Optional[int] = None
+ stop_sequences: Optional[List[str]] = None
  stream: Optional[bool] = False
  system: Optional[Union[str, List[TextBlock]]] = None
  tools: Optional[List[Tool]] = None
  tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
  metadata: Optional[Metadata] = None
  thinking: Optional[ThinkingConfig] = None
- service_tier: Optional[str] = None # Ignored
+ service_tier: Optional[str] = None
 
 
  class Usage(BaseModel):
@@ -175,10 +178,8 @@ class AnthropicResponse(BaseModel):
  # ============== Helper Functions ==============
 
  def extract_text_from_content(content: Union[str, List[ContentBlock]]) -> str:
- """Extract text from content which may be string or list of blocks"""
  if isinstance(content, str):
  return content
-
  texts = []
  for block in content:
  if isinstance(block, str):
@@ -196,7 +197,6 @@ def extract_text_from_content(content: Union[str, List[ContentBlock]]) -> str:
 
 
  def format_system_prompt(system: Optional[Union[str, List[TextBlock]]]) -> str:
- """Format system prompt from string or list of blocks"""
  if system is None:
  return ""
  if isinstance(system, str):
@@ -205,18 +205,13 @@ def format_system_prompt(system: Optional[Union[str, List[TextBlock]]]) -> str:
 
 
  def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Union[str, List[TextBlock]]] = None, include_thinking: bool = False) -> str:
- """Convert chat messages to a single prompt string"""
  prompt_parts = []
-
  system_text = format_system_prompt(system)
  if system_text:
  prompt_parts.append(f"System: {system_text}\n\n")
-
  for msg in messages:
  role = msg.role
  content = msg.content
-
- # Handle interleaved thinking in message history
  if isinstance(content, list):
  for block in content:
  if isinstance(block, dict):
@@ -242,19 +237,15 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Union[str, List[TextBlock]]] = None, include_thinking: bool = False) -> str:
  prompt_parts.append(f"Human: {content_text}\n\n")
  elif role == "assistant":
  prompt_parts.append(f"Assistant: {content_text}\n\n")
-
  prompt_parts.append("Assistant:")
  return "".join(prompt_parts)
 
 
  def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
- """Generate text and return (text, input_tokens, output_tokens)"""
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
-
  with torch.no_grad():
  outputs = model.generate(
  **inputs,
@@ -265,24 +256,18 @@ def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
  pad_token_id=tokenizer.pad_token_id,
  eos_token_id=tokenizer.eos_token_id
  )
-
  generated_tokens = outputs[0][input_tokens:]
  output_tokens = len(generated_tokens)
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
  return generated_text.strip(), input_tokens, output_tokens
 
 
  def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
- """Generate thinking/reasoning content"""
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  thinking_prompt = f"{prompt}\n\nLet me think through this step by step:\n"
-
  inputs = tokenizer(thinking_prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
-
  with torch.no_grad():
  outputs = model.generate(
  **inputs,
@@ -293,173 +278,99 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
  pad_token_id=tokenizer.pad_token_id,
  eos_token_id=tokenizer.eos_token_id
  )
-
  generated_tokens = outputs[0][input_tokens:]
  thinking_tokens = len(generated_tokens)
  thinking_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
  return thinking_text.strip(), thinking_tokens
 
 
- async def generate_stream_with_thinking(
- prompt: str,
- max_tokens: int,
- temperature: float,
- top_p: float,
- message_id: str,
- model_name: str,
- thinking_enabled: bool = False,
- thinking_budget: int = 100
- ):
- """Generate streaming response with interleaved thinking in Anthropic SSE format"""
+ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
  total_output_tokens = 0
 
- # Send message_start event
  message_start = {
  "type": "message_start",
- "message": {
- "id": message_id,
- "type": "message",
- "role": "assistant",
- "content": [],
- "model": model_name,
- "stop_reason": None,
- "stop_sequence": None,
- "usage": {"input_tokens": input_tokens, "output_tokens": 0}
- }
+ "message": {"id": message_id, "type": "message", "role": "assistant", "content": [], "model": model_name, "stop_reason": None, "stop_sequence": None, "usage": {"input_tokens": input_tokens, "output_tokens": 0}}
  }
  yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"
 
  content_index = 0
 
- # Generate thinking block if enabled
  if thinking_enabled:
- # Send thinking content_block_start
- thinking_block_start = {
- "type": "content_block_start",
- "index": content_index,
- "content_block": {"type": "thinking", "thinking": ""}
- }
+ thinking_block_start = {"type": "content_block_start", "index": content_index, "content_block": {"type": "thinking", "thinking": ""}}
  yield f"event: content_block_start\ndata: {json.dumps(thinking_block_start)}\n\n"
-
- # Generate thinking content
  thinking_text, thinking_tokens = generate_thinking(prompt, thinking_budget)
  total_output_tokens += thinking_tokens
-
- # Stream thinking in chunks
- chunk_size = 10
- for i in range(0, len(thinking_text), chunk_size):
- chunk = thinking_text[i:i+chunk_size]
- thinking_delta = {
- "type": "content_block_delta",
- "index": content_index,
- "delta": {"type": "thinking_delta", "thinking": chunk}
- }
- yield f"event: content_block_delta\ndata: {json.dumps(thinking_delta)}\n\n"
+ for i in range(0, len(thinking_text), 10):
+ chunk = thinking_text[i:i+10]
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'thinking_delta', 'thinking': chunk}})}\n\n"
  await asyncio.sleep(0.01)
-
- # Send thinking content_block_stop
- thinking_block_stop = {"type": "content_block_stop", "index": content_index}
- yield f"event: content_block_stop\ndata: {json.dumps(thinking_block_stop)}\n\n"
-
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
  content_index += 1
 
- # Send text content_block_start
- text_block_start = {
- "type": "content_block_start",
- "index": content_index,
- "content_block": {"type": "text", "text": ""}
- }
- yield f"event: content_block_start\ndata: {json.dumps(text_block_start)}\n\n"
+ yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
 
- # Generate main response
  with torch.no_grad():
- outputs = model.generate(
- **inputs,
- max_new_tokens=min(max_tokens, 256),
- temperature=temperature if temperature > 0 else 1.0,
- top_p=top_p,
- do_sample=temperature > 0,
- pad_token_id=tokenizer.pad_token_id,
- eos_token_id=tokenizer.eos_token_id
- )
+ outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 256), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
 
  generated_tokens = outputs[0][input_tokens:]
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
  total_output_tokens += len(generated_tokens)
 
- # Stream text in chunks
- chunk_size = 5
- for i in range(0, len(generated_text), chunk_size):
- chunk = generated_text[i:i+chunk_size]
- text_delta = {
- "type": "content_block_delta",
- "index": content_index,
- "delta": {"type": "text_delta", "text": chunk}
- }
- yield f"event: content_block_delta\ndata: {json.dumps(text_delta)}\n\n"
+ for i in range(0, len(generated_text), 5):
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
  await asyncio.sleep(0.01)
 
- # Send text content_block_stop
- text_block_stop = {"type": "content_block_stop", "index": content_index}
- yield f"event: content_block_stop\ndata: {json.dumps(text_block_stop)}\n\n"
-
- # Send message_delta event
- message_delta = {
- "type": "message_delta",
- "delta": {"stop_reason": "end_turn", "stop_sequence": None},
- "usage": {"output_tokens": total_output_tokens}
- }
- yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"
-
- # Send message_stop event
- message_stop = {"type": "message_stop"}
- yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
+ yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
+ yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
 
 
  def handle_tool_call(tools: List[Tool], messages: List[MessageParam], generated_text: str) -> Optional[ToolUseBlock]:
- """Check if the response should trigger a tool call"""
  if not tools:
  return None
-
  for tool in tools:
  if tool.name.lower() in generated_text.lower():
- return ToolUseBlock(
- type="tool_use",
- id=f"toolu_{uuid.uuid4().hex[:24]}",
- name=tool.name,
- input={}
- )
+ return ToolUseBlock(type="tool_use", id=f"toolu_{uuid.uuid4().hex[:24]}", name=tool.name, input={})
  return None
 
 
+ # ============== Frontend ==============
+
+ @app.get("/", response_class=HTMLResponse)
+ async def home():
+ """Serve the minimal centered frontend"""
+ try:
+ with open("/app/static/index.html", "r") as f:
+ return HTMLResponse(content=f.read())
+ except:
+ return HTMLResponse(content="""
+ <!DOCTYPE html>
+ <html><head><meta charset="UTF-8"><title>Model Runner</title>
+ <style>*{margin:0;padding:0}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center}
+ .logo{width:200px;height:200px;animation:float 3s ease-in-out infinite}
+ @keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}</style></head>
+ <body><div class="logo"><svg viewBox="0 0 200 200" fill="none">
+ <defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
+ <stop offset="0%" stop-color="#ff0080"/><stop offset="25%" stop-color="#ff4d00"/>
+ <stop offset="50%" stop-color="#ffcc00"/><stop offset="75%" stop-color="#00ff88"/>
+ <stop offset="100%" stop-color="#00ccff"/></linearGradient></defs>
+ <path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" fill="none"/>
+ <path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" fill="none"/>
+ <line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
+ </svg></div></body></html>
+ """)
+
+
  # ============== Anthropic API Endpoints ==============
 
  @app.post("/v1/messages")
  async def create_message(request: AnthropicRequest):
- """
- Anthropic Messages API compatible endpoint with Interleaved Thinking
-
- POST /v1/messages
-
- Supports:
- - Text messages
- - System prompts
- - Streaming responses
- - Tool/function calling
- - Interleaved thinking blocks
- - Thinking budget tokens
- - Metadata
- """
  try:
  message_id = f"msg_{uuid.uuid4().hex[:24]}"
-
- # Check if thinking is enabled
  thinking_enabled = False
  thinking_budget = 100
  if request.thinking:
@@ -470,50 +381,26 @@ async def create_message(request: AnthropicRequest):
  thinking_enabled = request.thinking.type == 'enabled'
  thinking_budget = request.thinking.budget_tokens or 100
 
- # Format messages to prompt (include thinking from history if enabled)
  prompt = format_messages_to_prompt(request.messages, request.system, include_thinking=thinking_enabled)
 
- # Handle streaming
  if request.stream:
  return StreamingResponse(
- generate_stream_with_thinking(
- prompt=prompt,
- max_tokens=request.max_tokens,
- temperature=request.temperature or 1.0,
- top_p=request.top_p or 1.0,
- message_id=message_id,
- model_name=request.model,
- thinking_enabled=thinking_enabled,
- thinking_budget=thinking_budget
- ),
+ generate_stream_with_thinking(prompt, request.max_tokens, request.temperature or 1.0, request.top_p or 1.0, message_id, request.model, thinking_enabled, thinking_budget),
  media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "X-Accel-Buffering": "no"
- }
+ headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
  )
 
- # Non-streaming response
  content_blocks = []
  total_output_tokens = 0
 
- # Generate thinking block if enabled
  if thinking_enabled:
  thinking_text, thinking_tokens = generate_thinking(prompt, thinking_budget)
  total_output_tokens += thinking_tokens
  content_blocks.append(ThinkingBlock(type="thinking", thinking=thinking_text))
 
- # Generate main response
- generated_text, input_tokens, output_tokens = generate_text(
- prompt=prompt,
- max_tokens=request.max_tokens,
- temperature=request.temperature or 1.0,
- top_p=request.top_p or 1.0
- )
+ generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens, request.temperature or 1.0, request.top_p or 1.0)
  total_output_tokens += output_tokens
 
- # Check for tool calls
  tool_use = handle_tool_call(request.tools, request.messages, generated_text) if request.tools else None
 
  if tool_use:
@@ -524,18 +411,12 @@ async def create_message(request: AnthropicRequest):
  content_blocks.append(TextBlock(type="text", text=generated_text))
  stop_reason = "end_turn"
 
- return AnthropicResponse(
- id=message_id,
- content=content_blocks,
- model=request.model,
- stop_reason=stop_reason,
- usage=Usage(input_tokens=input_tokens, output_tokens=total_output_tokens)
- )
+ return AnthropicResponse(id=message_id, content=content_blocks, model=request.model, stop_reason=stop_reason, usage=Usage(input_tokens=input_tokens, output_tokens=total_output_tokens))
  except Exception as e:
  raise HTTPException(status_code=500, detail=str(e))
 
 
- # ============== OpenAI Compatible Endpoints ==============
+ # ============== OpenAI Compatible ==============
 
  class ChatMessage(BaseModel):
  role: str
@@ -553,115 +434,28 @@ class ChatCompletionRequest(BaseModel):
 
 
  @app.post("/v1/chat/completions")
  async def chat_completions(request: ChatCompletionRequest):
- """OpenAI Chat Completions API compatible endpoint"""
  try:
- anthropic_messages = [
- MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user",
- content=msg.content)
- for msg in request.messages
- if msg.role in ["user", "assistant"]
- ]
-
+ anthropic_messages = [MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user", content=msg.content) for msg in request.messages if msg.role in ["user", "assistant"]]
  prompt = format_messages_to_prompt(anthropic_messages)
- generated_text, input_tokens, output_tokens = generate_text(
- prompt=prompt,
- max_tokens=request.max_tokens or 1024,
- temperature=request.temperature or 0.7,
- top_p=request.top_p or 1.0
- )
-
- return {
- "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
- "object": "chat.completion",
- "created": int(time.time()),
- "model": request.model,
- "choices": [{
- "index": 0,
- "message": {"role": "assistant", "content": generated_text},
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": input_tokens,
- "completion_tokens": output_tokens,
- "total_tokens": input_tokens + output_tokens
- }
- }
+ generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 1024, request.temperature or 0.7, request.top_p or 1.0)
+ return {"id": f"chatcmpl-{uuid.uuid4().hex[:24]}", "object": "chat.completion", "created": int(time.time()), "model": request.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}], "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}}
  except Exception as e:
  raise HTTPException(status_code=500, detail=str(e))
 
 
  @app.get("/v1/models")
  async def list_models():
- """List available models"""
- return {
- "object": "list",
- "data": [
- {"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"},
- {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"},
- {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}
- ]
- }
-
-
- # ============== Utility Endpoints ==============
-
- @app.get("/")
- async def root():
- """Welcome endpoint"""
- return {
- "message": "Docker Model Runner API (Anthropic Compatible + Interleaved Thinking)",
- "hardware": "CPU Basic: 2 vCPU · 16 GB RAM",
- "docs": "/docs",
- "api_endpoints": {
- "anthropic_messages": "POST /v1/messages",
- "openai_chat": "POST /v1/chat/completions",
- "models": "GET /v1/models"
- },
- "supported_features": [
- "text messages",
- "system prompts",
- "streaming responses",
- "tool/function calling",
- "interleaved thinking blocks",
- "thinking budget tokens",
- "metadata"
- ]
- }
+ return {"object": "list", "data": [{"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}]}
 
 
  @app.get("/health")
  async def health():
- """Health check endpoint"""
- return {
- "status": "healthy",
- "timestamp": datetime.utcnow().isoformat(),
- "hardware": "CPU Basic: 2 vCPU · 16 GB RAM",
- "models_loaded": len(models) > 0
- }
+ return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
 
 
  @app.get("/info")
  async def info():
- """API information"""
- return {
- "name": "Docker Model Runner",
- "version": "1.1.0",
- "api_compatibility": ["anthropic", "openai"],
- "supported_models": ["MiniMax-M2", "MiniMax-M2-Stable"],
- "interleaved_thinking": {
- "supported": True,
- "streaming": True,
- "budget_tokens": True
- },
- "supported_parameters": {
- "fully_supported": ["model", "messages", "max_tokens", "stream", "system", "temperature", "top_p", "tools", "tool_choice", "metadata", "thinking"],
- "ignored": ["top_k", "stop_sequences", "service_tier"]
- },
- "message_types": {
- "supported": ["text", "tool_use", "tool_result", "thinking"],
- "not_supported": ["image", "document"]
- }
- }
+ return {"name": "Model Runner", "version": "1.1.0", "api_compatibility": ["anthropic", "openai"], "interleaved_thinking": True}
 
 
  if __name__ == "__main__":
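
Taken together, the main.py changes keep the Anthropic-style Messages API but compress the streaming generator and replace the JSON welcome endpoint with the static frontend. A minimal client sketch for the streaming path is below; the base URL is an assumption, and the payload only uses fields the AnthropicRequest model in this diff actually declares (model, max_tokens, stream, thinking, messages).

```python
# Hypothetical streaming client for the /v1/messages endpoint in this commit.
# It reads the SSE events emitted by generate_stream_with_thinking and prints
# thinking_delta / text_delta chunks as they arrive. BASE_URL is an assumption.
import json
import urllib.request

BASE_URL = "http://localhost:7860"

payload = {
    "model": "MiniMax-M2",
    "max_tokens": 128,
    "stream": True,
    "thinking": {"type": "enabled", "budget_tokens": 100},
    "messages": [{"role": "user", "content": "Explain interleaved thinking in one sentence."}],
}

req = urllib.request.Request(
    f"{BASE_URL}/v1/messages",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    for raw in resp:
        line = raw.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue  # skip "event: ..." lines and blank separators
        event = json.loads(line[len("data: "):])
        if event.get("type") == "content_block_delta":
            delta = event["delta"]
            # thinking chunks and visible text arrive on separate content blocks
            print(delta.get("thinking") or delta.get("text") or "", end="", flush=True)
print()
```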
static/index.html ADDED
@@ -0,0 +1,115 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>Model Runner</title>
+ <style>
+ * {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+ }
+
+ body {
+ min-height: 100vh;
+ background: #000000;
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ overflow: hidden;
+ }
+
+ .container {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ gap: 2rem;
+ }
+
+ .logo {
+ width: 200px;
+ height: 200px;
+ position: relative;
+ animation: float 3s ease-in-out infinite;
+ }
+
+ .logo svg {
+ width: 100%;
+ height: 100%;
+ filter: drop-shadow(0 0 30px rgba(255, 100, 100, 0.3));
+ }
+
+ .status {
+ display: flex;
+ align-items: center;
+ gap: 0.5rem;
+ color: rgba(255, 255, 255, 0.6);
+ font-size: 0.875rem;
+ }
+
+ .status-dot {
+ width: 8px;
+ height: 8px;
+ background: #22c55e;
+ border-radius: 50%;
+ animation: pulse 2s ease-in-out infinite;
+ }
+
+ .sparkle {
+ position: fixed;
+ bottom: 2rem;
+ right: 2rem;
+ opacity: 0.4;
+ }
+
+ @keyframes float {
+ 0%, 100% { transform: translateY(0); }
+ 50% { transform: translateY(-10px); }
+ }
+
+ @keyframes pulse {
+ 0%, 100% { opacity: 1; transform: scale(1); }
+ 50% { opacity: 0.5; transform: scale(1.2); }
+ }
+
+ @keyframes spin {
+ from { transform: rotate(0deg); }
+ to { transform: rotate(360deg); }
+ }
+ </style>
+ </head>
+ <body>
+ <div class="container">
+ <div class="logo">
+ <svg viewBox="0 0 200 200" fill="none" xmlns="http://www.w3.org/2000/svg">
+ <defs>
+ <linearGradient id="rainbow" x1="0%" y1="100%" x2="100%" y2="0%">
+ <stop offset="0%" stop-color="#ff0080"/>
+ <stop offset="20%" stop-color="#ff4d00"/>
+ <stop offset="40%" stop-color="#ffcc00"/>
+ <stop offset="60%" stop-color="#00ff88"/>
+ <stop offset="80%" stop-color="#00ccff"/>
+ <stop offset="100%" stop-color="#6644ff"/>
+ </linearGradient>
+ </defs>
+ <!-- Outer triangle -->
+ <path d="M100 20 L180 160 L20 160 Z" stroke="url(#rainbow)" stroke-width="12" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+ <!-- Inner A shape -->
+ <path d="M100 70 L130 130 L70 130 Z" stroke="url(#rainbow)" stroke-width="8" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+ <!-- Horizontal bar -->
+ <line x1="80" y1="115" x2="120" y2="115" stroke="url(#rainbow)" stroke-width="6" stroke-linecap="round"/>
+ </svg>
+ </div>
+ <div class="status">
+ <span class="status-dot"></span>
+ <span>Ready</span>
+ </div>
+ </div>
+
+ <svg class="sparkle" width="24" height="24" viewBox="0 0 24 24" fill="none">
+ <path d="M12 2L13.5 8.5L20 10L13.5 11.5L12 18L10.5 11.5L4 10L10.5 8.5L12 2Z" fill="rgba(255,255,255,0.6)"/>
+ </svg>
+ </body>
+ </html>