likhonsheikhdev committed on
Commit
7222b60
·
verified ·
1 Parent(s): 1ea9642

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. main.py +128 -57
main.py CHANGED
@@ -1,11 +1,13 @@
1
  """
2
  Docker Model Runner - Anthropic API Compatible
3
  Full compatibility with Anthropic Messages API + Interleaved Thinking
 
4
  Optimized for: 2 vCPU, 16GB RAM
5
  """
6
  from fastapi import FastAPI, HTTPException, Header, Request
7
  from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
8
  from fastapi.staticfiles import StaticFiles
 
9
  from pydantic import BaseModel, Field
10
  from typing import Optional, List, Union, Literal, Any, Dict
11
  import torch
@@ -17,7 +19,6 @@ import uuid
17
  import time
18
  import json
19
  import asyncio
20
- import re
21
 
22
  # CPU-optimized lightweight models
23
  GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
@@ -31,17 +32,13 @@ models = {}
31
 
32
 
33
  def load_models():
34
- """Pre-load models for faster inference"""
35
  global models
36
  print("Loading models for CPU inference...")
37
-
38
  models["tokenizer"] = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
39
  models["model"] = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL)
40
  models["model"].eval()
41
-
42
  if models["tokenizer"].pad_token is None:
43
  models["tokenizer"].pad_token = models["tokenizer"].eos_token
44
-
45
  print("✅ All models loaded successfully!")
46
 
47
 
@@ -54,13 +51,22 @@ async def lifespan(app: FastAPI):
54
 
55
  app = FastAPI(
56
  title="Model Runner",
57
- description="Anthropic API Compatible with Interleaved Thinking",
58
- version="1.0.0",
59
  lifespan=lifespan,
60
  docs_url="/api/docs",
61
  redoc_url="/api/redoc"
62
  )
63
 
 
 
 
 
 
 
 
 
 
64
 
65
  # ============== Anthropic API Models ==============
66
 
@@ -143,7 +149,7 @@ class Metadata(BaseModel):
143
  class AnthropicRequest(BaseModel):
144
  model: str = "MiniMax-M2"
145
  messages: List[MessageParam]
146
- max_tokens: int = 1024
147
  temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
148
  top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
149
  top_k: Optional[int] = None
@@ -153,7 +159,7 @@ class AnthropicRequest(BaseModel):
153
  tools: Optional[List[Tool]] = None
154
  tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
155
  metadata: Optional[Metadata] = None
156
- thinking: Optional[ThinkingConfig] = None
157
  service_tier: Optional[str] = None
158
 
159
 
@@ -219,10 +225,13 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Uni
219
  if block_type == 'thinking' and include_thinking:
220
  prompt_parts.append(f"<thinking>{block.get('thinking', '')}</thinking>\n")
221
  elif block_type == 'text':
 
222
  if role == "user":
223
- prompt_parts.append(f"Human: {block.get('text', '')}\n\n")
224
  else:
225
- prompt_parts.append(f"Assistant: {block.get('text', '')}\n\n")
 
 
226
  elif hasattr(block, 'type'):
227
  if block.type == 'thinking' and include_thinking:
228
  prompt_parts.append(f"<thinking>{block.thinking}</thinking>\n")
@@ -244,12 +253,12 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Uni
244
  def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
245
  tokenizer = models["tokenizer"]
246
  model = models["model"]
247
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
248
  input_tokens = inputs["input_ids"].shape[1]
249
  with torch.no_grad():
250
  outputs = model.generate(
251
  **inputs,
252
- max_new_tokens=min(max_tokens, 256),
253
  temperature=temperature if temperature > 0 else 1.0,
254
  top_p=top_p,
255
  do_sample=temperature > 0,
@@ -271,7 +280,7 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
271
  with torch.no_grad():
272
  outputs = model.generate(
273
  **inputs,
274
- max_new_tokens=min(budget_tokens, 128),
275
  temperature=0.7,
276
  top_p=0.9,
277
  do_sample=True,
@@ -287,7 +296,7 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
287
  async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
288
  tokenizer = models["tokenizer"]
289
  model = models["model"]
290
- inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
291
  input_tokens = inputs["input_ids"].shape[1]
292
  total_output_tokens = 0
293
 
@@ -314,7 +323,7 @@ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperatur
314
  yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
315
 
316
  with torch.no_grad():
317
- outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 256), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
318
 
319
  generated_tokens = outputs[0][input_tokens:]
320
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
@@ -322,7 +331,7 @@ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperatur
322
 
323
  for i in range(0, len(generated_text), 5):
324
  yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
325
- await asyncio.sleep(0.01)
326
 
327
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
328
  yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
@@ -338,45 +347,19 @@ def handle_tool_call(tools: List[Tool], messages: List[MessageParam], generated_
338
  return None
339
 
340
 
341
- # ============== Frontend ==============
342
-
343
- @app.get("/", response_class=HTMLResponse)
344
- async def home():
345
- """Serve the minimal centered frontend"""
346
- try:
347
- with open("/app/static/index.html", "r") as f:
348
- return HTMLResponse(content=f.read())
349
- except:
350
- return HTMLResponse(content="""
351
- <!DOCTYPE html>
352
- <html><head><meta charset="UTF-8"><title>Model Runner</title>
353
- <style>*{margin:0;padding:0}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center}
354
- .logo{width:200px;height:200px;animation:float 3s ease-in-out infinite}
355
- @keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}</style></head>
356
- <body><div class="logo"><svg viewBox="0 0 200 200" fill="none">
357
- <defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
358
- <stop offset="0%" stop-color="#ff0080"/><stop offset="25%" stop-color="#ff4d00"/>
359
- <stop offset="50%" stop-color="#ffcc00"/><stop offset="75%" stop-color="#00ff88"/>
360
- <stop offset="100%" stop-color="#00ccff"/></linearGradient></defs>
361
- <path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" fill="none"/>
362
- <path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" fill="none"/>
363
- <line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
364
- </svg></div></body></html>
365
- """)
366
-
367
-
368
- # ============== Anthropic API Endpoints ==============
369
 
370
- @app.post("/v1/messages")
371
- async def create_message(request: AnthropicRequest):
372
  try:
373
  message_id = f"msg_{uuid.uuid4().hex[:24]}"
374
  thinking_enabled = False
375
  thinking_budget = 100
 
376
  if request.thinking:
377
  if isinstance(request.thinking, dict):
378
  thinking_enabled = request.thinking.get('type') == 'enabled'
379
- thinking_budget = request.thinking.get('budget_tokens', 100)
380
  else:
381
  thinking_enabled = request.thinking.type == 'enabled'
382
  thinking_budget = request.thinking.budget_tokens or 100
@@ -416,17 +399,66 @@ async def create_message(request: AnthropicRequest):
416
  raise HTTPException(status_code=500, detail=str(e))
417
 
418
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  # ============== OpenAI Compatible ==============
420
 
421
  class ChatMessage(BaseModel):
422
  role: str
423
- content: str
424
 
425
 
426
  class ChatCompletionRequest(BaseModel):
427
- model: str = "distilgpt2"
428
  messages: List[ChatMessage]
429
- max_tokens: Optional[int] = 1024
430
  temperature: Optional[float] = 0.7
431
  top_p: Optional[float] = 1.0
432
  stream: Optional[bool] = False
@@ -435,19 +467,51 @@ class ChatCompletionRequest(BaseModel):
435
  @app.post("/v1/chat/completions")
436
  async def chat_completions(request: ChatCompletionRequest):
437
  try:
438
- anthropic_messages = [MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user", content=msg.content) for msg in request.messages if msg.role in ["user", "assistant"]]
439
- prompt = format_messages_to_prompt(anthropic_messages)
440
- generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 1024, request.temperature or 0.7, request.top_p or 1.0)
441
- return {"id": f"chatcmpl-{uuid.uuid4().hex[:24]}", "object": "chat.completion", "created": int(time.time()), "model": request.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}], "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
  except Exception as e:
443
  raise HTTPException(status_code=500, detail=str(e))
444
 
445
 
 
 
446
  @app.get("/v1/models")
 
 
447
  async def list_models():
448
- return {"object": "list", "data": [{"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}]}
 
 
 
 
 
 
 
 
 
449
 
450
 
 
 
451
  @app.get("/health")
452
  async def health():
453
  return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
@@ -455,7 +519,14 @@ async def health():
455
 
456
  @app.get("/info")
457
  async def info():
458
- return {"name": "Model Runner", "version": "1.1.0", "api_compatibility": ["anthropic", "openai"], "interleaved_thinking": True}
 
 
 
 
 
 
 
459
 
460
 
461
  if __name__ == "__main__":
 
1
  """
2
  Docker Model Runner - Anthropic API Compatible
3
  Full compatibility with Anthropic Messages API + Interleaved Thinking
4
+ Supports: /v1/messages, /anthropic/v1/messages, /api/v1/messages
5
  Optimized for: 2 vCPU, 16GB RAM
6
  """
7
  from fastapi import FastAPI, HTTPException, Header, Request
8
  from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
9
  from fastapi.staticfiles import StaticFiles
10
+ from fastapi.middleware.cors import CORSMiddleware
11
  from pydantic import BaseModel, Field
12
  from typing import Optional, List, Union, Literal, Any, Dict
13
  import torch
 
19
  import time
20
  import json
21
  import asyncio
 
22
 
23
  # CPU-optimized lightweight models
24
  GENERATOR_MODEL = os.getenv("GENERATOR_MODEL", "distilgpt2")
 
32
 
33
 
34
  def load_models():
 
35
  global models
36
  print("Loading models for CPU inference...")
 
37
  models["tokenizer"] = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
38
  models["model"] = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL)
39
  models["model"].eval()
 
40
  if models["tokenizer"].pad_token is None:
41
  models["tokenizer"].pad_token = models["tokenizer"].eos_token
 
42
  print("✅ All models loaded successfully!")
43
 
44
 
 
51
 
52
  app = FastAPI(
53
  title="Model Runner",
54
+ description="Anthropic API Compatible - Works with Claude Code & Agentic Tools",
55
+ version="1.1.0",
56
  lifespan=lifespan,
57
  docs_url="/api/docs",
58
  redoc_url="/api/redoc"
59
  )
60
 
61
+ # CORS for agentic tools
62
+ app.add_middleware(
63
+ CORSMiddleware,
64
+ allow_origins=["*"],
65
+ allow_credentials=True,
66
+ allow_methods=["*"],
67
+ allow_headers=["*"],
68
+ )
69
+
70
 
71
  # ============== Anthropic API Models ==============
72
 
 
149
  class AnthropicRequest(BaseModel):
150
  model: str = "MiniMax-M2"
151
  messages: List[MessageParam]
152
+ max_tokens: int = 4096
153
  temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
154
  top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
155
  top_k: Optional[int] = None
 
159
  tools: Optional[List[Tool]] = None
160
  tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
161
  metadata: Optional[Metadata] = None
162
+ thinking: Optional[Union[ThinkingConfig, Dict[str, Any]]] = None
163
  service_tier: Optional[str] = None
164
 
165
 
 
225
  if block_type == 'thinking' and include_thinking:
226
  prompt_parts.append(f"<thinking>{block.get('thinking', '')}</thinking>\n")
227
  elif block_type == 'text':
228
+ text_content = block.get('text', '')
229
  if role == "user":
230
+ prompt_parts.append(f"Human: {text_content}\n\n")
231
  else:
232
+ prompt_parts.append(f"Assistant: {text_content}\n\n")
233
+ elif block_type == 'tool_result':
234
+ prompt_parts.append(f"Tool Result: {block.get('content', '')}\n\n")
235
  elif hasattr(block, 'type'):
236
  if block.type == 'thinking' and include_thinking:
237
  prompt_parts.append(f"<thinking>{block.thinking}</thinking>\n")
 
253
  def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
254
  tokenizer = models["tokenizer"]
255
  model = models["model"]
256
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
257
  input_tokens = inputs["input_ids"].shape[1]
258
  with torch.no_grad():
259
  outputs = model.generate(
260
  **inputs,
261
+ max_new_tokens=min(max_tokens, 512),
262
  temperature=temperature if temperature > 0 else 1.0,
263
  top_p=top_p,
264
  do_sample=temperature > 0,
 
280
  with torch.no_grad():
281
  outputs = model.generate(
282
  **inputs,
283
+ max_new_tokens=min(budget_tokens, 256),
284
  temperature=0.7,
285
  top_p=0.9,
286
  do_sample=True,
 
296
  async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
297
  tokenizer = models["tokenizer"]
298
  model = models["model"]
299
+ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
300
  input_tokens = inputs["input_ids"].shape[1]
301
  total_output_tokens = 0
302
 
 
323
  yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
324
 
325
  with torch.no_grad():
326
+ outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 512), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
327
 
328
  generated_tokens = outputs[0][input_tokens:]
329
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
 
331
 
332
  for i in range(0, len(generated_text), 5):
333
  yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
334
+ await asyncio.sleep(0.005)
335
 
336
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
337
  yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
 
347
  return None
348
 
349
 
350
+ # ============== Core Messages Handler ==============
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
+ async def handle_messages(request: AnthropicRequest):
353
+ """Core handler for Anthropic Messages API"""
354
  try:
355
  message_id = f"msg_{uuid.uuid4().hex[:24]}"
356
  thinking_enabled = False
357
  thinking_budget = 100
358
+
359
  if request.thinking:
360
  if isinstance(request.thinking, dict):
361
  thinking_enabled = request.thinking.get('type') == 'enabled'
362
+ thinking_budget = request.thinking.get('budget_tokens', 100) or 100
363
  else:
364
  thinking_enabled = request.thinking.type == 'enabled'
365
  thinking_budget = request.thinking.budget_tokens or 100
 
399
  raise HTTPException(status_code=500, detail=str(e))
400
 
401
 
402
+ # ============== Frontend ==============
403
+
404
+ @app.get("/", response_class=HTMLResponse)
405
+ async def home():
406
+ return HTMLResponse(content="""<!DOCTYPE html>
407
+ <html><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width,initial-scale=1"><title>Model Runner</title>
408
+ <style>*{margin:0;padding:0;box-sizing:border-box}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center;font-family:system-ui,sans-serif}
409
+ .container{display:flex;flex-direction:column;align-items:center;gap:2rem}
410
+ .logo{width:200px;height:200px;animation:float 3s ease-in-out infinite;filter:drop-shadow(0 0 30px rgba(255,100,100,0.3))}
411
+ .status{display:flex;align-items:center;gap:0.5rem;color:rgba(255,255,255,0.6);font-size:0.875rem}
412
+ .dot{width:8px;height:8px;background:#22c55e;border-radius:50%;animation:pulse 2s ease-in-out infinite}
413
+ .sparkle{position:fixed;bottom:2rem;right:2rem;opacity:0.4}
414
+ @keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}
415
+ @keyframes pulse{0%,100%{opacity:1}50%{opacity:0.5}}</style></head>
416
+ <body><div class="container"><div class="logo"><svg viewBox="0 0 200 200" fill="none">
417
+ <defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
418
+ <stop offset="0%" stop-color="#ff0080"/><stop offset="20%" stop-color="#ff4d00"/>
419
+ <stop offset="40%" stop-color="#ffcc00"/><stop offset="60%" stop-color="#00ff88"/>
420
+ <stop offset="80%" stop-color="#00ccff"/><stop offset="100%" stop-color="#6644ff"/></linearGradient></defs>
421
+ <path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
422
+ <path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
423
+ <line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
424
+ </svg></div><div class="status"><span class="dot"></span><span>Ready</span></div></div>
425
+ <svg class="sparkle" width="24" height="24" viewBox="0 0 24 24" fill="none">
426
+ <path d="M12 2L13.5 8.5L20 10L13.5 11.5L12 18L10.5 11.5L4 10L10.5 8.5L12 2Z" fill="rgba(255,255,255,0.6)"/></svg>
427
+ </body></html>""")
428
+
429
+
430
+ # ============== Anthropic API Routes ==============
431
+ # Support multiple base paths for compatibility
432
+
433
+ @app.post("/v1/messages")
434
+ async def messages_v1(request: AnthropicRequest):
435
+ """Standard Anthropic API endpoint"""
436
+ return await handle_messages(request)
437
+
438
+
439
+ @app.post("/anthropic/v1/messages")
440
+ async def messages_anthropic(request: AnthropicRequest):
441
+ """Anthropic base path - for Claude Code compatibility"""
442
+ return await handle_messages(request)
443
+
444
+
445
+ @app.post("/api/v1/messages")
446
+ async def messages_api(request: AnthropicRequest):
447
+ """API base path variant"""
448
+ return await handle_messages(request)
449
+
450
+
451
  # ============== OpenAI Compatible ==============
452
 
453
  class ChatMessage(BaseModel):
454
  role: str
455
+ content: Union[str, List[Dict[str, Any]]]
456
 
457
 
458
  class ChatCompletionRequest(BaseModel):
459
+ model: str = "gpt-4"
460
  messages: List[ChatMessage]
461
+ max_tokens: Optional[int] = 4096
462
  temperature: Optional[float] = 0.7
463
  top_p: Optional[float] = 1.0
464
  stream: Optional[bool] = False
 
467
  @app.post("/v1/chat/completions")
468
  async def chat_completions(request: ChatCompletionRequest):
469
  try:
470
+ # Extract text from messages
471
+ formatted_messages = []
472
+ for msg in request.messages:
473
+ if msg.role in ["user", "assistant"]:
474
+ content = msg.content
475
+ if isinstance(content, list):
476
+ text_parts = [c.get('text', '') for c in content if isinstance(c, dict) and c.get('type') == 'text']
477
+ content = ' '.join(text_parts)
478
+ formatted_messages.append(MessageParam(role=msg.role, content=content))
479
+
480
+ prompt = format_messages_to_prompt(formatted_messages)
481
+ generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 4096, request.temperature or 0.7, request.top_p or 1.0)
482
+
483
+ return {
484
+ "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
485
+ "object": "chat.completion",
486
+ "created": int(time.time()),
487
+ "model": request.model,
488
+ "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}],
489
+ "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}
490
+ }
491
  except Exception as e:
492
  raise HTTPException(status_code=500, detail=str(e))
493
 
494
 
495
+ # ============== Models Endpoints ==============
496
+
497
  @app.get("/v1/models")
498
+ @app.get("/anthropic/v1/models")
499
+ @app.get("/api/v1/models")
500
  async def list_models():
501
+ return {
502
+ "object": "list",
503
+ "data": [
504
+ {"id": "claude-sonnet-4-20250514", "object": "model", "created": int(time.time()), "owned_by": "anthropic"},
505
+ {"id": "claude-3-5-sonnet-20241022", "object": "model", "created": int(time.time()), "owned_by": "anthropic"},
506
+ {"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"},
507
+ {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"},
508
+ {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}
509
+ ]
510
+ }
511
 
512
 
513
+ # ============== Utility Endpoints ==============
514
+
515
  @app.get("/health")
516
  async def health():
517
  return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
 
519
 
520
  @app.get("/info")
521
  async def info():
522
+ return {
523
+ "name": "Model Runner",
524
+ "version": "1.1.0",
525
+ "api_compatibility": ["anthropic", "openai"],
526
+ "base_paths": ["/v1/messages", "/anthropic/v1/messages", "/api/v1/messages"],
527
+ "interleaved_thinking": True,
528
+ "agentic_tools": True
529
+ }
530
 
531
 
532
  if __name__ == "__main__":