"""
Monitoring utilities specifically for Render production environment.
"""
import json
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, TypedDict
from .memory_utils import (
clean_memory,
force_garbage_collection,
get_memory_usage,
log_memory_checkpoint,
memory_summary,
)
class MemorySample(TypedDict):
"""Type definition for memory sample records."""
timestamp: float
memory_mb: float
context: str
class MemoryStatus(TypedDict):
"""Type definition for memory status results."""
timestamp: str
memory_mb: float
peak_memory_mb: float
context: str
status: str
action_taken: Optional[str]
memory_limit_mb: float
logger = logging.getLogger(__name__)
# Configure these thresholds based on your Render free tier limits
RENDER_MEMORY_LIMIT_MB = 512
RENDER_WARNING_THRESHOLD_MB = 400 # 78% of limit
RENDER_CRITICAL_THRESHOLD_MB = 450 # 88% of limit
RENDER_EMERGENCY_THRESHOLD_MB = 480 # 94% of limit
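# If the service runs on a larger Render plan, scale these thresholds with the
# new limit (the values above sit at roughly 78%, 88%, and 94% of 512MB).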
# Memory metrics tracking
_memory_samples: List[MemorySample] = []
_memory_peak: float = 0.0
_memory_history_limit: int = 1000 # Keep last N samples to avoid unbounded growth
_memory_last_dump_time: float = 0.0
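# NOTE: this tracking state is per-process. Under a multi-worker server (for
# example gunicorn with several workers), each worker keeps its own samples and
# peak; nothing here aggregates across processes.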
def init_render_monitoring(log_interval: int = 10) -> None:
"""
    Initialize Render-specific memory monitoring with a short logging interval.
Args:
log_interval: Seconds between memory log entries
"""
# Set environment variables for memory monitoring
os.environ["MEMORY_DEBUG"] = "1"
os.environ["MEMORY_LOG_INTERVAL"] = str(log_interval)
logger.info(
"Initialized Render monitoring with %ds intervals (memory limit: %dMB)",
log_interval,
RENDER_MEMORY_LIMIT_MB,
)
# Perform initial memory check
memory_mb = get_memory_usage()
logger.info("Initial memory: %.1fMB", memory_mb)
# Record startup metrics
_record_memory_sample("startup", memory_mb)
def check_render_memory_thresholds(context: str = "periodic") -> MemoryStatus:
"""
Check current memory against Render thresholds and take action if needed.
Args:
context: Label for the check (e.g., "request", "background")
Returns:
        MemoryStatus dict with current usage, peak, status, and any cleanup action taken
"""
memory_mb = get_memory_usage()
_record_memory_sample(context, memory_mb)
global _memory_peak
if memory_mb > _memory_peak:
_memory_peak = memory_mb
log_memory_checkpoint(f"new_peak_memory_{context}", force=True)
status = "normal"
action_taken: Optional[str] = None
# Progressive response based on severity
if memory_mb > RENDER_EMERGENCY_THRESHOLD_MB:
logger.critical(
"EMERGENCY: Memory usage at %.1fMB - critically close to %.1fMB limit",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "emergency"
action_taken = "emergency_cleanup"
# Take emergency action
clean_memory("emergency")
force_garbage_collection()
elif memory_mb > RENDER_CRITICAL_THRESHOLD_MB:
logger.warning(
"CRITICAL: Memory usage at %.1fMB - approaching %.1fMB limit",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "critical"
action_taken = "aggressive_cleanup"
clean_memory("critical")
elif memory_mb > RENDER_WARNING_THRESHOLD_MB:
logger.warning(
"WARNING: Memory usage at %.1fMB - monitor closely (limit: %.1fMB)",
memory_mb,
RENDER_MEMORY_LIMIT_MB,
)
status = "warning"
action_taken = "light_cleanup"
clean_memory("warning")
result: MemoryStatus = {
"timestamp": datetime.now(timezone.utc).isoformat(), # Timestamp of the check
"memory_mb": memory_mb, # Current memory usage
"peak_memory_mb": _memory_peak, # Peak memory usage recorded
"context": context, # Context of the memory check
"status": status, # Current status based on memory usage
"action_taken": action_taken, # Action taken if any
"memory_limit_mb": RENDER_MEMORY_LIMIT_MB, # Memory limit defined
}
# Periodically dump memory metrics to a file in /tmp
_maybe_dump_memory_metrics()
return result
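
# Example (sketch): a health-check endpoint could surface this status directly.
# The /healthz route and jsonify usage are illustrative, not part of this module.
#
#     from flask import jsonify
#
#     @app.route("/healthz")
#     def healthz():
#         status = check_render_memory_thresholds("healthz")
#         code = 200 if status["status"] in ("normal", "warning") else 503
#         return jsonify(status), code
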
def _record_memory_sample(context: str, memory_mb: float) -> None:
"""Record a memory sample with timestamp for trend analysis."""
global _memory_samples
sample: MemorySample = {
"timestamp": time.time(),
"memory_mb": memory_mb,
"context": context,
}
_memory_samples.append(sample)
# Prevent unbounded growth by limiting history
if len(_memory_samples) > _memory_history_limit:
_memory_samples = _memory_samples[-_memory_history_limit:]
def _maybe_dump_memory_metrics() -> None:
"""Periodically save memory metrics to file for later analysis."""
global _memory_last_dump_time
# Only dump once every 5 minutes
now = time.time()
if now - _memory_last_dump_time < 300: # 5 minutes
return
try:
_memory_last_dump_time = now
# Create directory if it doesn't exist
dump_dir = "/tmp/render_metrics"
os.makedirs(dump_dir, exist_ok=True)
# Generate filename with timestamp
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"{dump_dir}/memory_metrics_{timestamp}.json"
# Dump the samples to a file
with open(filename, "w") as f:
json.dump(
{
"samples": _memory_samples,
"peak_memory_mb": _memory_peak,
"memory_limit_mb": RENDER_MEMORY_LIMIT_MB,
"summary": memory_summary(),
},
f,
indent=2,
)
logger.info("Memory metrics dumped to %s", filename)
except Exception as e:
logger.error("Failed to dump memory metrics: %s", e)
def get_memory_trends() -> Dict[str, Any]:
"""
Get memory usage trends from collected samples.
Returns:
Dictionary with memory trends and statistics
"""
if not _memory_samples:
return {"status": "no_data"}
    # Basic statistics; the guard above guarantees at least one sample exists
    current = _memory_samples[-1]["memory_mb"]
# Calculate 5-minute and 1-hour trends if we have enough data
trends: Dict[str, Any] = {
"current_mb": current,
"peak_mb": _memory_peak,
"samples_count": len(_memory_samples),
}
    now = time.time()

    # Trend over the last 5 minutes
    recent_samples: List[MemorySample] = [
        s for s in _memory_samples if now - s["timestamp"] < 300
    ]
    if len(recent_samples) >= 2:
        trends["trend_5min_mb"] = (
            recent_samples[-1]["memory_mb"] - recent_samples[0]["memory_mb"]
        )

    # Trend over the last hour
    hour_samples: List[MemorySample] = [
        s for s in _memory_samples if now - s["timestamp"] < 3600
    ]
    if len(hour_samples) >= 2:
        trends["trend_1hour_mb"] = (
            hour_samples[-1]["memory_mb"] - hour_samples[0]["memory_mb"]
        )
return trends
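
# Example (sketch) of the shape returned once enough samples have accumulated;
# the numbers are illustrative only:
#
#     {
#         "current_mb": 312.4,
#         "peak_mb": 401.7,
#         "samples_count": 118,
#         "trend_5min_mb": 4.2,
#         "trend_1hour_mb": -12.8,
#     }
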
def add_memory_middleware(app) -> None:
"""
Add middleware to Flask app for request-level memory monitoring.
Args:
app: Flask application instance
"""
try:
@app.before_request
def check_memory_before_request():
"""Check memory before processing each request."""
try:
from flask import request
try:
memory_status = check_render_memory_thresholds(f"request_{request.endpoint}")
# If we're in emergency state, reject new requests
if memory_status["status"] == "emergency":
logger.critical(
"Rejecting request due to critical memory usage: %s %.1fMB",
request.path,
memory_status["memory_mb"],
)
return {
"status": "error",
"message": ("Service temporarily unavailable due to " "resource constraints"),
"retry_after": 30, # Suggest retry after 30 seconds
}, 503
except Exception as e:
# Don't let memory monitoring failures affect requests
logger.debug(f"Memory status check failed: {e}")
except Exception as e:
# Catch all other errors to prevent middleware from breaking the app
logger.debug(f"Memory middleware error: {e}")
@app.after_request
def log_memory_after_request(response):
"""Log memory usage after request processing."""
try:
memory_mb = get_memory_usage()
logger.debug("Memory after request: %.1fMB", memory_mb)
except Exception as e:
logger.debug(f"After request memory logging failed: {e}")
return response
except Exception as e:
# If we can't even add the middleware, log it but don't crash
logger.warning(f"Failed to add memory middleware: {e}")
# Define empty placeholder to avoid errors
@app.before_request
def memory_middleware_failed():
pass
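

# Minimal local smoke test (a sketch, not part of the deployed service).
# Because this module uses a relative import, run it as a module from the
# project root, e.g. `python -m src.utils.render_monitoring`, assuming `src`
# and `src.utils` are packages.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    init_render_monitoring(log_interval=5)
    # Simulate a few periodic checks, then print the collected trend data.
    for _ in range(3):
        check_render_memory_thresholds("smoke_test")
        time.sleep(1)
    print(json.dumps(get_memory_trends(), indent=2))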