likhonsheikhdev committed
Commit 1ea9642 · verified · 1 Parent(s): 7270816

Upload folder using huggingface_hub

Files changed (3):
  1. Dockerfile +4 -0
  2. main.py +62 -268
  3. static/index.html +115 -0
Dockerfile CHANGED
@@ -25,6 +25,10 @@ RUN pip install --no-cache-dir fastapi==0.115.0 uvicorn[standard]==0.30.6 \
  # Copy application code
  COPY . .
 
+ # Create static directory
+ RUN mkdir -p /app/static
+ COPY static/ /app/static/
+
  # Create non-root user for security
  RUN useradd -m -u 1000 user
  USER user
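
The added lines bake the static/ assets into the image next to the application code, so the new `/` route in main.py can read /app/static/index.html at runtime. A minimal smoke-test sketch follows; the host and port are assumptions (7860 is the usual Hugging Face Spaces port), not something this commit pins down.

```python
# Hypothetical smoke test: fetch the root page that main.py now serves from
# /app/static/index.html and confirm it looks like the bundled splash page.
# BASE_URL is an assumption; point it at wherever the container is running.
import urllib.request

BASE_URL = "http://localhost:7860"

with urllib.request.urlopen(f"{BASE_URL}/") as resp:
    body = resp.read().decode("utf-8")
    assert resp.status == 200
    assert "<title>Model Runner</title>" in body  # title set in static/index.html
print("static frontend served OK")
```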
main.py CHANGED
@@ -4,7 +4,8 @@ Full compatibility with Anthropic Messages API + Interleaved Thinking
  Optimized for: 2 vCPU, 16GB RAM
  """
  from fastapi import FastAPI, HTTPException, Header, Request
- from fastapi.responses import StreamingResponse
+ from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
+ from fastapi.staticfiles import StaticFiles
  from pydantic import BaseModel, Field
  from typing import Optional, List, Union, Literal, Any, Dict
  import torch
@@ -52,10 +53,12 @@ async def lifespan(app: FastAPI):
 
 
  app = FastAPI(
- title="Docker Model Runner",
+ title="Model Runner",
  description="Anthropic API Compatible with Interleaved Thinking",
  version="1.0.0",
- lifespan=lifespan
+ lifespan=lifespan,
+ docs_url="/api/docs",
+ redoc_url="/api/redoc"
  )
 
 
@@ -143,15 +146,15 @@ class AnthropicRequest(BaseModel):
  max_tokens: int = 1024
  temperature: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
  top_p: Optional[float] = Field(default=1.0, gt=0.0, le=1.0)
- top_k: Optional[int] = None # Ignored
- stop_sequences: Optional[List[str]] = None # Ignored
+ top_k: Optional[int] = None
+ stop_sequences: Optional[List[str]] = None
  stream: Optional[bool] = False
  system: Optional[Union[str, List[TextBlock]]] = None
  tools: Optional[List[Tool]] = None
  tool_choice: Optional[Union[ToolChoice, Dict[str, Any]]] = None
  metadata: Optional[Metadata] = None
  thinking: Optional[ThinkingConfig] = None
- service_tier: Optional[str] = None # Ignored
+ service_tier: Optional[str] = None
 
 
  class Usage(BaseModel):
@@ -175,10 +178,8 @@ class AnthropicResponse(BaseModel):
  # ============== Helper Functions ==============
 
  def extract_text_from_content(content: Union[str, List[ContentBlock]]) -> str:
- """Extract text from content which may be string or list of blocks"""
  if isinstance(content, str):
  return content
-
  texts = []
  for block in content:
  if isinstance(block, str):
@@ -196,7 +197,6 @@ def extract_text_from_content(content: Union[str, List[ContentBlock]]) -> str:
 
 
  def format_system_prompt(system: Optional[Union[str, List[TextBlock]]]) -> str:
- """Format system prompt from string or list of blocks"""
  if system is None:
  return ""
  if isinstance(system, str):
@@ -205,18 +205,13 @@ def format_system_prompt(system: Optional[Union[str, List[TextBlock]]]) -> str:
 
 
  def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Union[str, List[TextBlock]]] = None, include_thinking: bool = False) -> str:
- """Convert chat messages to a single prompt string"""
  prompt_parts = []
-
  system_text = format_system_prompt(system)
  if system_text:
  prompt_parts.append(f"System: {system_text}\n\n")
-
  for msg in messages:
  role = msg.role
  content = msg.content
-
- # Handle interleaved thinking in message history
  if isinstance(content, list):
  for block in content:
  if isinstance(block, dict):
@@ -242,19 +237,15 @@ def format_messages_to_prompt(messages: List[MessageParam], system: Optional[Union[str, List[TextBlock]]] = None, include_thinking: bool = False) -> str:
  prompt_parts.append(f"Human: {content_text}\n\n")
  elif role == "assistant":
  prompt_parts.append(f"Assistant: {content_text}\n\n")
-
  prompt_parts.append("Assistant:")
  return "".join(prompt_parts)
 
 
  def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
- """Generate text and return (text, input_tokens, output_tokens)"""
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
-
  with torch.no_grad():
  outputs = model.generate(
  **inputs,
@@ -265,24 +256,18 @@ def generate_text(prompt: str, max_tokens: int, temperature: float, top_p: float) -> tuple:
  pad_token_id=tokenizer.pad_token_id,
  eos_token_id=tokenizer.eos_token_id
  )
-
  generated_tokens = outputs[0][input_tokens:]
  output_tokens = len(generated_tokens)
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
  return generated_text.strip(), input_tokens, output_tokens
 
 
  def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
- """Generate thinking/reasoning content"""
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  thinking_prompt = f"{prompt}\n\nLet me think through this step by step:\n"
-
  inputs = tokenizer(thinking_prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
-
  with torch.no_grad():
  outputs = model.generate(
  **inputs,
@@ -293,173 +278,99 @@ def generate_thinking(prompt: str, budget_tokens: int = 100) -> tuple:
  pad_token_id=tokenizer.pad_token_id,
  eos_token_id=tokenizer.eos_token_id
  )
-
  generated_tokens = outputs[0][input_tokens:]
  thinking_tokens = len(generated_tokens)
  thinking_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
-
  return thinking_text.strip(), thinking_tokens
 
 
- async def generate_stream_with_thinking(
- prompt: str,
- max_tokens: int,
- temperature: float,
- top_p: float,
- message_id: str,
- model_name: str,
- thinking_enabled: bool = False,
- thinking_budget: int = 100
- ):
- """Generate streaming response with interleaved thinking in Anthropic SSE format"""
+ async def generate_stream_with_thinking(prompt: str, max_tokens: int, temperature: float, top_p: float, message_id: str, model_name: str, thinking_enabled: bool = False, thinking_budget: int = 100):
  tokenizer = models["tokenizer"]
  model = models["model"]
-
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
  input_tokens = inputs["input_ids"].shape[1]
  total_output_tokens = 0
 
- # Send message_start event
  message_start = {
  "type": "message_start",
- "message": {
- "id": message_id,
- "type": "message",
- "role": "assistant",
- "content": [],
- "model": model_name,
- "stop_reason": None,
- "stop_sequence": None,
- "usage": {"input_tokens": input_tokens, "output_tokens": 0}
- }
+ "message": {"id": message_id, "type": "message", "role": "assistant", "content": [], "model": model_name, "stop_reason": None, "stop_sequence": None, "usage": {"input_tokens": input_tokens, "output_tokens": 0}}
  }
  yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"
 
  content_index = 0
 
- # Generate thinking block if enabled
  if thinking_enabled:
- # Send thinking content_block_start
- thinking_block_start = {
- "type": "content_block_start",
- "index": content_index,
- "content_block": {"type": "thinking", "thinking": ""}
- }
+ thinking_block_start = {"type": "content_block_start", "index": content_index, "content_block": {"type": "thinking", "thinking": ""}}
  yield f"event: content_block_start\ndata: {json.dumps(thinking_block_start)}\n\n"
-
- # Generate thinking content
  thinking_text, thinking_tokens = generate_thinking(prompt, thinking_budget)
  total_output_tokens += thinking_tokens
-
- # Stream thinking in chunks
- chunk_size = 10
- for i in range(0, len(thinking_text), chunk_size):
- chunk = thinking_text[i:i+chunk_size]
- thinking_delta = {
- "type": "content_block_delta",
- "index": content_index,
- "delta": {"type": "thinking_delta", "thinking": chunk}
- }
- yield f"event: content_block_delta\ndata: {json.dumps(thinking_delta)}\n\n"
+ for i in range(0, len(thinking_text), 10):
+ chunk = thinking_text[i:i+10]
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'thinking_delta', 'thinking': chunk}})}\n\n"
  await asyncio.sleep(0.01)
-
- # Send thinking content_block_stop
- thinking_block_stop = {"type": "content_block_stop", "index": content_index}
- yield f"event: content_block_stop\ndata: {json.dumps(thinking_block_stop)}\n\n"
-
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
  content_index += 1
 
- # Send text content_block_start
- text_block_start = {
- "type": "content_block_start",
- "index": content_index,
- "content_block": {"type": "text", "text": ""}
- }
- yield f"event: content_block_start\ndata: {json.dumps(text_block_start)}\n\n"
+ yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': content_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
 
- # Generate main response
  with torch.no_grad():
- outputs = model.generate(
- **inputs,
- max_new_tokens=min(max_tokens, 256),
- temperature=temperature if temperature > 0 else 1.0,
- top_p=top_p,
- do_sample=temperature > 0,
- pad_token_id=tokenizer.pad_token_id,
- eos_token_id=tokenizer.eos_token_id
- )
+ outputs = model.generate(**inputs, max_new_tokens=min(max_tokens, 256), temperature=temperature if temperature > 0 else 1.0, top_p=top_p, do_sample=temperature > 0, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
 
  generated_tokens = outputs[0][input_tokens:]
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
  total_output_tokens += len(generated_tokens)
 
- # Stream text in chunks
- chunk_size = 5
- for i in range(0, len(generated_text), chunk_size):
- chunk = generated_text[i:i+chunk_size]
- text_delta = {
- "type": "content_block_delta",
- "index": content_index,
- "delta": {"type": "text_delta", "text": chunk}
- }
- yield f"event: content_block_delta\ndata: {json.dumps(text_delta)}\n\n"
+ for i in range(0, len(generated_text), 5):
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': content_index, 'delta': {'type': 'text_delta', 'text': generated_text[i:i+5]}})}\n\n"
  await asyncio.sleep(0.01)
 
- # Send text content_block_stop
- text_block_stop = {"type": "content_block_stop", "index": content_index}
- yield f"event: content_block_stop\ndata: {json.dumps(text_block_stop)}\n\n"
-
- # Send message_delta event
- message_delta = {
- "type": "message_delta",
- "delta": {"stop_reason": "end_turn", "stop_sequence": None},
- "usage": {"output_tokens": total_output_tokens}
- }
- yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"
-
- # Send message_stop event
- message_stop = {"type": "message_stop"}
- yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': content_index})}\n\n"
+ yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn', 'stop_sequence': None}, 'usage': {'output_tokens': total_output_tokens}})}\n\n"
+ yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
 
 
  def handle_tool_call(tools: List[Tool], messages: List[MessageParam], generated_text: str) -> Optional[ToolUseBlock]:
- """Check if the response should trigger a tool call"""
  if not tools:
  return None
-
  for tool in tools:
  if tool.name.lower() in generated_text.lower():
- return ToolUseBlock(
- type="tool_use",
- id=f"toolu_{uuid.uuid4().hex[:24]}",
- name=tool.name,
- input={}
- )
+ return ToolUseBlock(type="tool_use", id=f"toolu_{uuid.uuid4().hex[:24]}", name=tool.name, input={})
  return None
 
 
+ # ============== Frontend ==============
+
+ @app.get("/", response_class=HTMLResponse)
+ async def home():
+ """Serve the minimal centered frontend"""
+ try:
+ with open("/app/static/index.html", "r") as f:
+ return HTMLResponse(content=f.read())
+ except:
+ return HTMLResponse(content="""
+ <!DOCTYPE html>
+ <html><head><meta charset="UTF-8"><title>Model Runner</title>
+ <style>*{margin:0;padding:0}body{min-height:100vh;background:#000;display:flex;justify-content:center;align-items:center}
+ .logo{width:200px;height:200px;animation:float 3s ease-in-out infinite}
+ @keyframes float{0%,100%{transform:translateY(0)}50%{transform:translateY(-10px)}}</style></head>
+ <body><div class="logo"><svg viewBox="0 0 200 200" fill="none">
+ <defs><linearGradient id="r" x1="0%" y1="100%" x2="100%" y2="0%">
+ <stop offset="0%" stop-color="#ff0080"/><stop offset="25%" stop-color="#ff4d00"/>
+ <stop offset="50%" stop-color="#ffcc00"/><stop offset="75%" stop-color="#00ff88"/>
+ <stop offset="100%" stop-color="#00ccff"/></linearGradient></defs>
+ <path d="M100 20 L180 160 L20 160 Z" stroke="url(#r)" stroke-width="12" stroke-linecap="round" fill="none"/>
+ <path d="M100 70 L130 130 L70 130 Z" stroke="url(#r)" stroke-width="8" stroke-linecap="round" fill="none"/>
+ <line x1="80" y1="115" x2="120" y2="115" stroke="url(#r)" stroke-width="6" stroke-linecap="round"/>
+ </svg></div></body></html>
+ """)
+
+
  # ============== Anthropic API Endpoints ==============
 
  @app.post("/v1/messages")
  async def create_message(request: AnthropicRequest):
- """
- Anthropic Messages API compatible endpoint with Interleaved Thinking
-
- POST /v1/messages
-
- Supports:
- - Text messages
- - System prompts
- - Streaming responses
- - Tool/function calling
- - Interleaved thinking blocks
- - Thinking budget tokens
- - Metadata
- """
  try:
  message_id = f"msg_{uuid.uuid4().hex[:24]}"
-
- # Check if thinking is enabled
  thinking_enabled = False
  thinking_budget = 100
  if request.thinking:
@@ -470,50 +381,26 @@ async def create_message(request: AnthropicRequest):
  thinking_enabled = request.thinking.type == 'enabled'
  thinking_budget = request.thinking.budget_tokens or 100
 
- # Format messages to prompt (include thinking from history if enabled)
  prompt = format_messages_to_prompt(request.messages, request.system, include_thinking=thinking_enabled)
 
- # Handle streaming
  if request.stream:
  return StreamingResponse(
- generate_stream_with_thinking(
- prompt=prompt,
- max_tokens=request.max_tokens,
- temperature=request.temperature or 1.0,
- top_p=request.top_p or 1.0,
- message_id=message_id,
- model_name=request.model,
- thinking_enabled=thinking_enabled,
- thinking_budget=thinking_budget
- ),
+ generate_stream_with_thinking(prompt, request.max_tokens, request.temperature or 1.0, request.top_p or 1.0, message_id, request.model, thinking_enabled, thinking_budget),
  media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "X-Accel-Buffering": "no"
- }
+ headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
  )
 
- # Non-streaming response
  content_blocks = []
  total_output_tokens = 0
 
- # Generate thinking block if enabled
  if thinking_enabled:
  thinking_text, thinking_tokens = generate_thinking(prompt, thinking_budget)
  total_output_tokens += thinking_tokens
  content_blocks.append(ThinkingBlock(type="thinking", thinking=thinking_text))
 
- # Generate main response
- generated_text, input_tokens, output_tokens = generate_text(
- prompt=prompt,
- max_tokens=request.max_tokens,
- temperature=request.temperature or 1.0,
- top_p=request.top_p or 1.0
- )
+ generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens, request.temperature or 1.0, request.top_p or 1.0)
  total_output_tokens += output_tokens
 
- # Check for tool calls
  tool_use = handle_tool_call(request.tools, request.messages, generated_text) if request.tools else None
 
  if tool_use:
@@ -524,18 +411,12 @@ async def create_message(request: AnthropicRequest):
  content_blocks.append(TextBlock(type="text", text=generated_text))
  stop_reason = "end_turn"
 
- return AnthropicResponse(
- id=message_id,
- content=content_blocks,
- model=request.model,
- stop_reason=stop_reason,
- usage=Usage(input_tokens=input_tokens, output_tokens=total_output_tokens)
- )
+ return AnthropicResponse(id=message_id, content=content_blocks, model=request.model, stop_reason=stop_reason, usage=Usage(input_tokens=input_tokens, output_tokens=total_output_tokens))
  except Exception as e:
  raise HTTPException(status_code=500, detail=str(e))
 
 
- # ============== OpenAI Compatible Endpoints ==============
+ # ============== OpenAI Compatible ==============
 
  class ChatMessage(BaseModel):
  role: str
@@ -553,115 +434,28 @@ class ChatCompletionRequest(BaseModel):
 
 
  @app.post("/v1/chat/completions")
  async def chat_completions(request: ChatCompletionRequest):
- """OpenAI Chat Completions API compatible endpoint"""
  try:
- anthropic_messages = [
- MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user",
- content=msg.content)
- for msg in request.messages
- if msg.role in ["user", "assistant"]
- ]
-
+ anthropic_messages = [MessageParam(role=msg.role if msg.role in ["user", "assistant"] else "user", content=msg.content) for msg in request.messages if msg.role in ["user", "assistant"]]
  prompt = format_messages_to_prompt(anthropic_messages)
- generated_text, input_tokens, output_tokens = generate_text(
- prompt=prompt,
- max_tokens=request.max_tokens or 1024,
- temperature=request.temperature or 0.7,
- top_p=request.top_p or 1.0
- )
-
- return {
- "id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
- "object": "chat.completion",
- "created": int(time.time()),
- "model": request.model,
- "choices": [{
- "index": 0,
- "message": {"role": "assistant", "content": generated_text},
- "finish_reason": "stop"
- }],
- "usage": {
- "prompt_tokens": input_tokens,
- "completion_tokens": output_tokens,
- "total_tokens": input_tokens + output_tokens
- }
- }
+ generated_text, input_tokens, output_tokens = generate_text(prompt, request.max_tokens or 1024, request.temperature or 0.7, request.top_p or 1.0)
+ return {"id": f"chatcmpl-{uuid.uuid4().hex[:24]}", "object": "chat.completion", "created": int(time.time()), "model": request.model, "choices": [{"index": 0, "message": {"role": "assistant", "content": generated_text}, "finish_reason": "stop"}], "usage": {"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}}
  except Exception as e:
  raise HTTPException(status_code=500, detail=str(e))
 
 
  @app.get("/v1/models")
  async def list_models():
- """List available models"""
- return {
- "object": "list",
- "data": [
- {"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"},
- {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"},
- {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}
- ]
- }
-
-
- # ============== Utility Endpoints ==============
-
- @app.get("/")
- async def root():
- """Welcome endpoint"""
- return {
- "message": "Docker Model Runner API (Anthropic Compatible + Interleaved Thinking)",
- "hardware": "CPU Basic: 2 vCPU · 16 GB RAM",
- "docs": "/docs",
- "api_endpoints": {
- "anthropic_messages": "POST /v1/messages",
- "openai_chat": "POST /v1/chat/completions",
- "models": "GET /v1/models"
- },
- "supported_features": [
- "text messages",
- "system prompts",
- "streaming responses",
- "tool/function calling",
- "interleaved thinking blocks",
- "thinking budget tokens",
- "metadata"
- ]
- }
+ return {"object": "list", "data": [{"id": "MiniMax-M2", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": "MiniMax-M2-Stable", "object": "model", "created": int(time.time()), "owned_by": "local"}, {"id": GENERATOR_MODEL, "object": "model", "created": int(time.time()), "owned_by": "local"}]}
 
 
  @app.get("/health")
  async def health():
- """Health check endpoint"""
- return {
- "status": "healthy",
- "timestamp": datetime.utcnow().isoformat(),
- "hardware": "CPU Basic: 2 vCPU · 16 GB RAM",
- "models_loaded": len(models) > 0
- }
+ return {"status": "healthy", "timestamp": datetime.utcnow().isoformat(), "models_loaded": len(models) > 0}
 
 
  @app.get("/info")
  async def info():
- """API information"""
- return {
- "name": "Docker Model Runner",
- "version": "1.1.0",
- "api_compatibility": ["anthropic", "openai"],
- "supported_models": ["MiniMax-M2", "MiniMax-M2-Stable"],
- "interleaved_thinking": {
- "supported": True,
- "streaming": True,
- "budget_tokens": True
- },
- "supported_parameters": {
- "fully_supported": ["model", "messages", "max_tokens", "stream", "system", "temperature", "top_p", "tools", "tool_choice", "metadata", "thinking"],
- "ignored": ["top_k", "stop_sequences", "service_tier"]
- },
- "message_types": {
- "supported": ["text", "tool_use", "tool_result", "thinking"],
- "not_supported": ["image", "document"]
- }
- }
+ return {"name": "Model Runner", "version": "1.1.0", "api_compatibility": ["anthropic", "openai"], "interleaved_thinking": True}
 
 
  if __name__ == "__main__":
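
Taken together, the main.py changes keep the Anthropic-style Messages API but compress the streaming generator and replace the JSON welcome endpoint with the static frontend. A minimal client sketch for the streaming path is below; the base URL is an assumption, and the payload only uses fields the AnthropicRequest model in this diff actually declares (model, max_tokens, stream, thinking, messages).

```python
# Hypothetical streaming client for the /v1/messages endpoint in this commit.
# It reads the SSE events emitted by generate_stream_with_thinking and prints
# thinking_delta / text_delta chunks as they arrive. BASE_URL is an assumption.
import json
import urllib.request

BASE_URL = "http://localhost:7860"

payload = {
    "model": "MiniMax-M2",
    "max_tokens": 128,
    "stream": True,
    "thinking": {"type": "enabled", "budget_tokens": 100},
    "messages": [{"role": "user", "content": "Explain interleaved thinking in one sentence."}],
}

req = urllib.request.Request(
    f"{BASE_URL}/v1/messages",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    for raw in resp:
        line = raw.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue  # skip "event: ..." lines and blank separators
        event = json.loads(line[len("data: "):])
        if event.get("type") == "content_block_delta":
            delta = event["delta"]
            # thinking chunks and visible text arrive on separate content blocks
            print(delta.get("thinking") or delta.get("text") or "", end="", flush=True)
print()
```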
static/index.html ADDED
@@ -0,0 +1,115 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+ <meta charset="UTF-8">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+ <title>Model Runner</title>
+ <style>
+ * {
+ margin: 0;
+ padding: 0;
+ box-sizing: border-box;
+ }
+
+ body {
+ min-height: 100vh;
+ background: #000000;
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+ overflow: hidden;
+ }
+
+ .container {
+ display: flex;
+ flex-direction: column;
+ align-items: center;
+ gap: 2rem;
+ }
+
+ .logo {
+ width: 200px;
+ height: 200px;
+ position: relative;
+ animation: float 3s ease-in-out infinite;
+ }
+
+ .logo svg {
+ width: 100%;
+ height: 100%;
+ filter: drop-shadow(0 0 30px rgba(255, 100, 100, 0.3));
+ }
+
+ .status {
+ display: flex;
+ align-items: center;
+ gap: 0.5rem;
+ color: rgba(255, 255, 255, 0.6);
+ font-size: 0.875rem;
+ }
+
+ .status-dot {
+ width: 8px;
+ height: 8px;
+ background: #22c55e;
+ border-radius: 50%;
+ animation: pulse 2s ease-in-out infinite;
+ }
+
+ .sparkle {
+ position: fixed;
+ bottom: 2rem;
+ right: 2rem;
+ opacity: 0.4;
+ }
+
+ @keyframes float {
+ 0%, 100% { transform: translateY(0); }
+ 50% { transform: translateY(-10px); }
+ }
+
+ @keyframes pulse {
+ 0%, 100% { opacity: 1; transform: scale(1); }
+ 50% { opacity: 0.5; transform: scale(1.2); }
+ }
+
+ @keyframes spin {
+ from { transform: rotate(0deg); }
+ to { transform: rotate(360deg); }
+ }
+ </style>
+ </head>
+ <body>
+ <div class="container">
+ <div class="logo">
+ <svg viewBox="0 0 200 200" fill="none" xmlns="http://www.w3.org/2000/svg">
+ <defs>
+ <linearGradient id="rainbow" x1="0%" y1="100%" x2="100%" y2="0%">
+ <stop offset="0%" stop-color="#ff0080"/>
+ <stop offset="20%" stop-color="#ff4d00"/>
+ <stop offset="40%" stop-color="#ffcc00"/>
+ <stop offset="60%" stop-color="#00ff88"/>
+ <stop offset="80%" stop-color="#00ccff"/>
+ <stop offset="100%" stop-color="#6644ff"/>
+ </linearGradient>
+ </defs>
+ <!-- Outer triangle -->
+ <path d="M100 20 L180 160 L20 160 Z" stroke="url(#rainbow)" stroke-width="12" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+ <!-- Inner A shape -->
+ <path d="M100 70 L130 130 L70 130 Z" stroke="url(#rainbow)" stroke-width="8" stroke-linecap="round" stroke-linejoin="round" fill="none"/>
+ <!-- Horizontal bar -->
+ <line x1="80" y1="115" x2="120" y2="115" stroke="url(#rainbow)" stroke-width="6" stroke-linecap="round"/>
+ </svg>
+ </div>
+ <div class="status">
+ <span class="status-dot"></span>
+ <span>Ready</span>
+ </div>
+ </div>
+
+ <svg class="sparkle" width="24" height="24" viewBox="0 0 24 24" fill="none">
+ <path d="M12 2L13.5 8.5L20 10L13.5 11.5L12 18L10.5 11.5L4 10L10.5 8.5L12 2Z" fill="rgba(255,255,255,0.6)"/>
+ </svg>
+ </body>
+ </html>