"""
Monitoring utilities specifically for Render production environment.
"""
import json
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, TypedDict
from .memory_utils import (
clean_memory,
force_garbage_collection,
get_memory_usage,
log_memory_checkpoint,
memory_summary,
)
class MemorySample(TypedDict):
"""Type definition for memory sample records."""
timestamp: float
memory_mb: float
context: str
class MemoryStatus(TypedDict):
"""Type definition for memory status results."""
timestamp: str
memory_mb: float
peak_memory_mb: float
context: str
status: str
action_taken: Optional[str]
memory_limit_mb: float
logger = logging.getLogger(__name__)
# Configure these thresholds based on your Render free tier limits
RENDER_MEMORY_LIMIT_MB = 512
RENDER_WARNING_THRESHOLD_MB = 400 # 78% of limit
RENDER_CRITICAL_THRESHOLD_MB = 450 # 88% of limit
RENDER_EMERGENCY_THRESHOLD_MB = 480 # 94% of limit
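# If the service runs on a larger Render plan, scale these thresholds with the
# new limit (the values above sit at roughly 78%, 88%, and 94% of 512MB).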
# Memory metrics tracking
_memory_samples: List[MemorySample] = []
_memory_peak: float = 0.0
_memory_history_limit: int = 1000 # Keep last N samples to avoid unbounded growth
_memory_last_dump_time: float = 0.0
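# NOTE: this tracking state is per-process. Under a multi-worker server (for
# example gunicorn with several workers), each worker keeps its own samples and
# peak; nothing here aggregates across processes.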
def init_render_monitoring(log_interval: int = 10) -> None:
"""
    Initialize Render-specific memory monitoring with a short logging interval.
Args:
log_interval: Seconds between memory log entries
"""
# Set environment variables for memory monitoring
os.environ["MEMORY_DEBUG"] = "1"
os.environ["MEMORY_LOG_INTERVAL"] = str(log_interval)
logger.info(
"Initialized Render monitoring with %ds intervals (memory limit: %dMB)",
log_interval,
RENDER_MEMORY_LIMIT_MB,
)
# Perform initial memory check
memory_mb = get_memory_usage()
logger.info("Initial memory: %.1fMB", memory_mb)
# Record startup metrics
_record_memory_sample("startup", memory_mb)
def check_render_memory_thresholds(context: str = "periodic") -> MemoryStatus:
"""
Check current memory against Render thresholds and take action if needed.
Args:
context: Label for the check (e.g., "request", "background")
Returns:
        MemoryStatus dict with current usage, peak, status, and any cleanup action taken
"""
memory_mb = get_memory_usage()
_record_memory_sample(context, memory_mb)
global _memory_peak
if memory_mb > _memory_peak:
_memory_peak = memory_mb
log_memory_checkpoint(f"new_peak_memory_{context}", force=True)
status = "normal"
action_taken: Optional[str] = None
# Progressive response based on severity
if memory_mb > RENDER_EMERGENCY_THRESHOLD_MB:
logger.critical(
"EMERGENCY: Memory usage at %.1fMB - critically close to %.1fMB limit",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "emergency"
action_taken = "emergency_cleanup"
# Take emergency action
clean_memory("emergency")
force_garbage_collection()
elif memory_mb > RENDER_CRITICAL_THRESHOLD_MB:
logger.warning(
"CRITICAL: Memory usage at %.1fMB - approaching %.1fMB limit",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "critical"
action_taken = "aggressive_cleanup"
clean_memory("critical")
elif memory_mb > RENDER_WARNING_THRESHOLD_MB:
logger.warning(
"WARNING: Memory usage at %.1fMB - monitor closely (limit: %.1fMB)",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "warning"
action_taken = "light_cleanup"
clean_memory("warning")
result: MemoryStatus = {
"timestamp": datetime.now(timezone.utc).isoformat(), # Timestamp of the check
"memory_mb": memory_mb, # Current memory usage
"peak_memory_mb": _memory_peak, # Peak memory usage recorded
"context": context, # Context of the memory check
"status": status, # Current status based on memory usage
"action_taken": action_taken, # Action taken if any
"memory_limit_mb": RENDER_MEMORY_LIMIT_MB, # Memory limit defined
}
# Periodically dump memory metrics to a file in /tmp
_maybe_dump_memory_metrics()
return result
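
# Example (sketch): a health-check endpoint could surface this status directly.
# The /healthz route and jsonify usage are illustrative, not part of this module.
#
#     from flask import jsonify
#
#     @app.route("/healthz")
#     def healthz():
#         status = check_render_memory_thresholds("healthz")
#         code = 200 if status["status"] in ("normal", "warning") else 503
#         return jsonify(status), code
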
def _record_memory_sample(context: str, memory_mb: float) -> None:
"""Record a memory sample with timestamp for trend analysis."""
global _memory_samples
sample: MemorySample = {
"timestamp": time.time(),
"memory_mb": memory_mb,
"context": context,
}
_memory_samples.append(sample)
# Prevent unbounded growth by limiting history
if len(_memory_samples) > _memory_history_limit:
_memory_samples = _memory_samples[-_memory_history_limit:]
def _maybe_dump_memory_metrics() -> None:
"""Periodically save memory metrics to file for later analysis."""
global _memory_last_dump_time
# Only dump once every 5 minutes
now = time.time()
if now - _memory_last_dump_time < 300: # 5 minutes
return
try:
_memory_last_dump_time = now
# Create directory if it doesn't exist
dump_dir = "/tmp/render_metrics"
os.makedirs(dump_dir, exist_ok=True)
# Generate filename with timestamp
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"{dump_dir}/memory_metrics_{timestamp}.json"
# Dump the samples to a file
with open(filename, "w") as f:
json.dump(
{
"samples": _memory_samples,
"peak_memory_mb": _memory_peak,
"memory_limit_mb": RENDER_MEMORY_LIMIT_MB,
"summary": memory_summary(),
},
f,
indent=2,
)
logger.info("Memory metrics dumped to %s", filename)
except Exception as e:
logger.error("Failed to dump memory metrics: %s", e)
def get_memory_trends() -> Dict[str, Any]:
"""
Get memory usage trends from collected samples.
Returns:
Dictionary with memory trends and statistics
"""
if not _memory_samples:
return {"status": "no_data"}
    # Basic statistics; the guard above guarantees at least one sample exists
    current = _memory_samples[-1]["memory_mb"]
# Calculate 5-minute and 1-hour trends if we have enough data
trends: Dict[str, Any] = {
"current_mb": current,
"peak_mb": _memory_peak,
"samples_count": len(_memory_samples),
}
    now = time.time()

    # Trend over the last 5 minutes
    recent_samples: List[MemorySample] = [
        s for s in _memory_samples if now - s["timestamp"] < 300
    ]
    if len(recent_samples) >= 2:
        trends["trend_5min_mb"] = (
            recent_samples[-1]["memory_mb"] - recent_samples[0]["memory_mb"]
        )

    # Trend over the last hour
    hour_samples: List[MemorySample] = [
        s for s in _memory_samples if now - s["timestamp"] < 3600
    ]
    if len(hour_samples) >= 2:
        trends["trend_1hour_mb"] = (
            hour_samples[-1]["memory_mb"] - hour_samples[0]["memory_mb"]
        )
return trends
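
# Example (sketch) of the shape returned once enough samples have accumulated;
# the numbers are illustrative only:
#
#     {
#         "current_mb": 312.4,
#         "peak_mb": 401.7,
#         "samples_count": 118,
#         "trend_5min_mb": 4.2,
#         "trend_1hour_mb": -12.8,
#     }
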
def add_memory_middleware(app) -> None:
"""
Add middleware to Flask app for request-level memory monitoring.
Args:
app: Flask application instance
"""
try:
@app.before_request
def check_memory_before_request():
"""Check memory before processing each request."""
try:
from flask import request
try:
memory_status = check_render_memory_thresholds(f"request_{request.endpoint}")
# If we're in emergency state, reject new requests
if memory_status["status"] == "emergency":
logger.critical(
"Rejecting request due to critical memory usage: %s %.1fMB",
request.path,
memory_status["memory_mb"],
)
return {
"status": "error",
"message": ("Service temporarily unavailable due to " "resource constraints"),
"retry_after": 30, # Suggest retry after 30 seconds
}, 503
except Exception as e:
# Don't let memory monitoring failures affect requests
logger.debug(f"Memory status check failed: {e}")
except Exception as e:
# Catch all other errors to prevent middleware from breaking the app
logger.debug(f"Memory middleware error: {e}")
@app.after_request
def log_memory_after_request(response):
"""Log memory usage after request processing."""
try:
memory_mb = get_memory_usage()
logger.debug("Memory after request: %.1fMB", memory_mb)
except Exception as e:
logger.debug(f"After request memory logging failed: {e}")
return response
except Exception as e:
# If we can't even add the middleware, log it but don't crash
logger.warning(f"Failed to add memory middleware: {e}")
# Define empty placeholder to avoid errors
@app.before_request
def memory_middleware_failed():
pass
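

# Minimal local smoke test (a sketch, not part of the deployed service).
# Because this module uses a relative import, run it as a module from the
# project root, e.g. `python -m src.utils.render_monitoring`, assuming `src`
# and `src.utils` are packages.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    init_render_monitoring(log_interval=5)
    # Simulate a few periodic checks, then print the collected trend data.
    for _ in range(3):
        check_render_memory_thresholds("smoke_test")
        time.sleep(1)
    print(json.dumps(get_memory_trends(), indent=2))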