#!/usr/bin/env python3
"""
Mock LLM function calls for testing CLI topic extraction without API costs.

This module patches requests.post to intercept HTTP calls to inference servers
and return mock responses instead.
"""
import json
import os

# Stores the original requests.post so it can be restored after patching
_original_requests = None


def _generate_mock_response(prompt: str, system_prompt: str) -> str:
    """
    Generate a mock response that satisfies validation requirements.

    The response must:
    - Be longer than 120 characters
    - Contain a markdown table (with | characters)

    Args:
        prompt: The user prompt
        system_prompt: The system prompt

    Returns:
        A mock markdown table response
    """
    # Generate a simple markdown table that satisfies the validation.
    # This mimics a topic extraction table response.
    mock_table = """| Reference | General Topic | Sub-topic | Sentiment |
|-----------|---------------|-----------|-----------|
| 1 | Test Topic | Test Subtopic | Positive |
| 2 | Another Topic | Another Subtopic | Neutral |
| 3 | Third Topic | Third Subtopic | Negative |
This is a mock response from the test inference server. The actual content would be generated by a real LLM model, but for testing purposes, this dummy response allows us to verify that the CLI commands work correctly without incurring API costs."""
    return mock_table
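# The mock above satisfies the two checks listed in the docstring, e.g.:
#
#     resp = _generate_mock_response("some prompt", "some system prompt")
#     assert len(resp) > 120 and "|" in resp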


def _estimate_tokens(text: str) -> int:
    """Estimate token count (rough approximation: ~4 characters per token)."""
    return max(1, len(text) // 4)
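# For example: _estimate_tokens("hello world") -> max(1, 11 // 4) == 2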


def mock_requests_post(url, **kwargs):
    """
    Mock version of requests.post that intercepts inference-server calls.

    Returns a mock response object that mimics the real requests.Response.
    """
    # Only mock inference-server URLs
    if "/v1/chat/completions" not in url:
        # For non-inference-server URLs, fall through to the real requests.post.
        # Use the saved original when requests.post has been patched, otherwise
        # calling requests.post here would recurse back into this mock.
        import requests

        original_post = _original_requests or requests.post
        return original_post(url, **kwargs)

    # Extract payload
    payload = kwargs.get("json", {})
    messages = payload.get("messages", [])

    # Extract prompts
    system_prompt = ""
    user_prompt = ""
    for msg in messages:
        role = msg.get("role", "")
        content = msg.get("content", "")
        if role == "system":
            system_prompt = content
        elif role == "user":
            user_prompt = content

    # Generate mock response
    response_text = _generate_mock_response(user_prompt, system_prompt)

    # Estimate tokens
    input_tokens = _estimate_tokens(system_prompt + "\n" + user_prompt)
    output_tokens = _estimate_tokens(response_text)

    # Check if streaming is requested
    stream = payload.get("stream", False)

    if stream:
        # For streaming, create a mock response with iter_lines
        class MockStreamResponse:
            def __init__(self, text):
                self.text = text
                self.status_code = 200
                self.lines = []
                # Simulate streaming chunks in OpenAI-style SSE format
                chunk_size = 20
                for i in range(0, len(text), chunk_size):
                    chunk = text[i : i + chunk_size]
                    chunk_data = {
                        "choices": [
                            {
                                "delta": {"content": chunk},
                                "index": 0,
                                "finish_reason": None,
                            }
                        ]
                    }
                    self.lines.append(f"data: {json.dumps(chunk_data)}\n\n".encode())
                self.lines.append(b"data: [DONE]\n\n")
                self._line_index = 0

            def raise_for_status(self):
                pass

            def iter_lines(self):
                for line in self.lines:
                    yield line

        return MockStreamResponse(response_text)
    else:
        # For non-streaming, create a simple mock response
        class MockResponse:
            def __init__(self, text, input_tokens, output_tokens):
                self._json_data = {
                    "choices": [
                        {
                            "index": 0,
                            "finish_reason": "stop",
                            "message": {
                                "role": "assistant",
                                "content": text,
                            },
                        }
                    ],
                    "usage": {
                        "prompt_tokens": input_tokens,
                        "completion_tokens": output_tokens,
                        "total_tokens": input_tokens + output_tokens,
                    },
                }
                self.status_code = 200

            def raise_for_status(self):
                pass

            def json(self):
                return self._json_data

        return MockResponse(response_text, input_tokens, output_tokens)
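# A quick illustration of the non-streaming path (URL and payload are made up):
#
#     resp = mock_requests_post(
#         "http://localhost:8000/v1/chat/completions",
#         json={"messages": [{"role": "user", "content": "Extract topics"}]},
#     )
#     resp.json()["choices"][0]["message"]["content"]  # -> the mock table text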


def apply_mock_patches():
    """
    Apply patches to mock HTTP requests.

    This should be called before importing modules that use requests.
    """
    global _original_requests
    try:
        import requests

        _original_requests = requests.post
        requests.post = mock_requests_post
        print("[Mock] Patched requests.post for inference-server calls")
    except ImportError:
        # requests is not installed, so there is nothing to patch
        pass


def restore_original():
    """Restore original requests.post if it was patched."""
    global _original_requests
    if _original_requests:
        try:
            import requests

            requests.post = _original_requests
            _original_requests = None
            print("[Mock] Restored original requests.post")
        except ImportError:
            pass


# Auto-apply patches if TEST_MODE environment variable is set
if os.environ.get("TEST_MODE") == "1" or os.environ.get("USE_MOCK_LLM") == "1":
    apply_mock_patches()
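

if __name__ == "__main__":
    # Minimal self-check sketch (assumes the requests package is installed):
    # patch, issue a fake inference-server call, print the mocked reply, restore.
    # The URL below is hypothetical; any URL containing /v1/chat/completions
    # is intercepted by the mock and never hits the network.
    import requests

    apply_mock_patches()
    reply = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={"messages": [{"role": "user", "content": "Extract topics"}]},
    )
    print(reply.json()["choices"][0]["message"]["content"][:80], "...")
    restore_original()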