zhimin-z committed
Commit 5998589 · 1 Parent(s): 68ab628
Files changed (7)
  1. .gitignore +2 -1
  2. Dockerfile +6 -18
  3. README.md +0 -1
  4. app.py +239 -1322
  5. docker-compose.yml +23 -0
  6. msr.py +573 -660
  7. requirements.txt +3 -5
.gitignore CHANGED
@@ -2,4 +2,5 @@
2
  *.env
3
  *.venv
4
  *.ipynb
5
- *.pyc
 
 
2
  *.env
3
  *.venv
4
  *.ipynb
5
+ *.pyc
6
+ *.duckdb
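The new `*.duckdb` ignore rule suggests the rewritten miner now caches data in a local DuckDB file. A minimal sketch of what that might look like follows; the file name and table schema are assumptions for illustration (they do not appear in this diff), though the columns mirror the issue-metadata fields used elsewhere in the commit.

```python
# Hypothetical sketch: a local DuckDB cache matching the new *.duckdb ignore rule.
# The file name "issues.duckdb" and the schema below are illustrative assumptions.
import duckdb

con = duckdb.connect("issues.duckdb")
con.execute(
    """
    CREATE TABLE IF NOT EXISTS issue_metadata (
        agent_identifier VARCHAR,
        url              VARCHAR,
        created_at       TIMESTAMP,
        closed_at        TIMESTAMP,
        state_reason     VARCHAR
    )
    """
)
con.close()
```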
Dockerfile CHANGED
@@ -1,34 +1,22 @@
1
- # Use official Python runtime as base image
2
  FROM python:3.12-slim
3
 
4
  # Set working directory
5
  WORKDIR /app
6
 
7
- # Install system dependencies (if needed)
8
  RUN apt-get update && apt-get install -y \
9
- git \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Copy requirements.txt
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application files
19
- COPY .env .
20
- COPY msr.py .
21
-
22
- # Create a non-root user for security (optional but recommended)
23
- RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
24
- USER appuser
25
-
26
- # Expose port for Gradio web interface (default is 7860)
27
- EXPOSE 7860
28
-
29
  # Set environment variables
30
- ENV GRADIO_SERVER_NAME=0.0.0.0
31
- ENV GRADIO_SERVER_PORT=7860
32
 
33
- # Run the Gradio app
34
  CMD ["python", "msr.py"]
 
 
1
  FROM python:3.12-slim
2
 
3
  # Set working directory
4
  WORKDIR /app
5
 
6
+ # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
8
+ gcc \
9
+ g++ \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy requirements file
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
 
18
  # Set environment variables
19
+ ENV PYTHONUNBUFFERED=1
 
20
 
21
+ # Run the mining script with scheduler
22
  CMD ["python", "msr.py"]
README.md CHANGED
@@ -52,7 +52,6 @@ Behind the scenes, we're doing a few things:
52
 
53
  **Data Collection**
54
  We search GitHub using multiple query patterns to catch all issues associated with an agent:
55
- - Issues authored by the agent (`author:agent-name`)
56
  - Issues assigned to the agent (`assignee:agent-name`)
57
 
58
  **Regular Updates**
 
52
 
53
  **Data Collection**
54
  We search GitHub using multiple query patterns to catch all issues associated with an agent:
 
55
  - Issues assigned to the agent (`assignee:agent-name`)
56
 
57
  **Regular Updates**
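After this change the README documents a single query pattern, `assignee:agent-name`. For reference, a hedged example of issuing that query against GitHub's REST search API with `requests` (already imported in app.py) is shown below; the agent name is a placeholder, not an identifier from this repository.

```python
# Example of the remaining query pattern from the README: issues assigned to an agent.
import requests


def search_assigned_issues(agent_name, token=None):
    """Return the first page of issues assigned to the given GitHub identifier."""
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    params = {"q": f"assignee:{agent_name} is:issue", "per_page": 100}
    resp = requests.get("https://api.github.com/search/issues",
                        headers=headers, params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()["items"]


# issues = search_assigned_issues("example-agent[bot]")  # placeholder agent name
```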
app.py CHANGED
@@ -3,12 +3,10 @@ from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import json
4
  import os
5
  import time
6
- import tempfile
7
  import requests
8
- from datetime import datetime, timezone, timedelta
9
- from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
11
  from huggingface_hub.errors import HfHubHTTPError
 
12
  from dotenv import load_dotenv
13
  import pandas as pd
14
  import random
@@ -16,8 +14,6 @@ import plotly.graph_objects as go
16
  from plotly.subplots import make_subplots
17
  from apscheduler.schedulers.background import BackgroundScheduler
18
  from apscheduler.triggers.cron import CronTrigger
19
- from google.cloud import bigquery
20
- import backoff
21
 
22
  # Load environment variables
23
  load_dotenv()
@@ -27,10 +23,8 @@ load_dotenv()
27
  # =============================================================================
28
 
29
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
30
- ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata" # HuggingFace dataset for issue metadata
31
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard metadata
32
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
33
- UPDATE_TIME_FRAME_DAYS = 30 # How often to re-mine data via BigQuery
34
 
35
  LEADERBOARD_COLUMNS = [
36
  ("Agent Name", "string"),
@@ -45,1005 +39,57 @@ LEADERBOARD_COLUMNS = [
45
  # =============================================================================
46
 
47
  def is_rate_limit_error(e):
48
- """Check if the exception is a rate limit error (429)."""
49
- return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
 
 
50
 
51
 
52
  @backoff.on_exception(
53
  backoff.expo,
54
  HfHubHTTPError,
55
- giveup=lambda e: not is_rate_limit_error(e),
56
- max_tries=8,
57
  base=300,
58
  max_value=3600,
59
- jitter=backoff.full_jitter,
60
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
 
 
61
  )
62
  def list_repo_files_with_backoff(api, **kwargs):
63
- """List repo files with exponential backoff on rate limit errors."""
64
  return api.list_repo_files(**kwargs)
65
 
66
- @backoff.on_exception(
67
- backoff.expo,
68
- HfHubHTTPError,
69
- giveup=lambda e: not is_rate_limit_error(e),
70
- max_tries=8,
71
- base=300,
72
- max_value=3600,
73
- jitter=backoff.full_jitter,
74
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
75
- )
76
- def hf_hub_download_with_backoff(**kwargs):
77
- """Download from HF Hub with exponential backoff on rate limit errors."""
78
- return hf_hub_download(**kwargs)
79
 
80
  @backoff.on_exception(
81
  backoff.expo,
82
  HfHubHTTPError,
83
- giveup=lambda e: not is_rate_limit_error(e),
84
- max_tries=8,
85
  base=300,
86
  max_value=3600,
87
- jitter=backoff.full_jitter,
88
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
89
- )
90
- def upload_file_with_backoff(api, **kwargs):
91
- """Upload file with exponential backoff on rate limit errors."""
92
- return api.upload_file(**kwargs)
93
-
94
- @backoff.on_exception(
95
- backoff.expo,
96
- HfHubHTTPError,
97
  giveup=lambda e: not is_rate_limit_error(e),
98
- max_tries=8,
99
- base=300,
100
- max_value=3600,
101
- jitter=backoff.full_jitter,
102
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
103
- )
104
- def upload_folder_with_backoff(api, **kwargs):
105
- """Upload folder with exponential backoff on rate limit errors."""
106
- return api.upload_folder(**kwargs)
107
-
108
- # =============================================================================
109
- # JSONL FILE OPERATIONS
110
- # =============================================================================
111
-
112
- def load_jsonl(filename):
113
- """Load JSONL file and return list of dictionaries."""
114
- if not os.path.exists(filename):
115
- return []
116
-
117
- data = []
118
- with open(filename, 'r', encoding='utf-8') as f:
119
- for line in f:
120
- line = line.strip()
121
- if line:
122
- try:
123
- entry = json.loads(line)
124
- data.append(entry)
125
- except json.JSONDecodeError as e:
126
- print(f"Warning: Skipping invalid JSON line: {e}")
127
- return data
128
-
129
-
130
- def save_jsonl(filename, data):
131
- """Save list of dictionaries to JSONL file."""
132
- with open(filename, 'w', encoding='utf-8') as f:
133
- for item in data:
134
- f.write(json.dumps(item) + '\n')
135
-
136
-
137
- def cache_to_dict(cache_list):
138
- """Convert list of cache entries to dictionary by identifier."""
139
- return {entry['github_identifier']: entry for entry in cache_list}
140
-
141
-
142
- def dict_to_cache(cache_dict):
143
- """Convert dictionary back to list of values."""
144
- return list(cache_dict.values())
145
-
146
-
147
- def normalize_date_format(date_string):
148
- """
149
- Convert date strings to standardized ISO 8601 format with Z suffix.
150
- Handles both old format (2025-10-15T23:23:47.983068) and new format (2025-10-15T23:23:47Z).
151
- Also handles space separator (2025-06-23 07:18:28) and incomplete timezone offsets (+00).
152
- """
153
- if not date_string or date_string == 'N/A':
154
- return 'N/A'
155
-
156
- try:
157
- # Replace space with 'T' for ISO format compatibility
158
- date_string = date_string.replace(' ', 'T')
159
-
160
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
161
- if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
162
- date_string = date_string + ':00'
163
-
164
- # Parse the date string (handles both with and without microseconds)
165
- dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
166
-
167
- # Convert to standardized format
168
- return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
169
- except Exception as e:
170
- print(f"Warning: Could not parse date '{date_string}': {e}")
171
- return date_string
172
-
173
-
174
- # =============================================================================
175
- # BIGQUERY OPERATIONS
176
- # =============================================================================
177
-
178
- def get_bigquery_client():
179
- """
180
- Initialize BigQuery client using credentials from environment variable.
181
-
182
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
183
- the service account JSON credentials as a string.
184
- """
185
- # Get the JSON content from environment variable
186
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
187
-
188
- if creds_json:
189
- # Create a temporary file to store credentials
190
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
191
- temp_file.write(creds_json)
192
- temp_path = temp_file.name
193
-
194
- # Set environment variable to point to temp file
195
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
196
-
197
- # Initialize BigQuery client
198
- client = bigquery.Client()
199
-
200
- # Clean up temp file
201
- os.unlink(temp_path)
202
-
203
- return client
204
- else:
205
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
206
-
207
-
208
- def generate_table_union_statements(start_date, end_date):
209
- """
210
- Generate UNION ALL statements for githubarchive.month tables in date range.
211
-
212
- Args:
213
- start_date: Start datetime
214
- end_date: End datetime
215
-
216
- Returns:
217
- String with UNION ALL SELECT statements for all monthly tables in range
218
- """
219
- table_names = []
220
-
221
- # Start from the beginning of start_date's month
222
- current_date = start_date.replace(day=1)
223
- end_month = end_date.replace(day=1)
224
-
225
- while current_date <= end_month:
226
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
227
- table_names.append(table_name)
228
-
229
- # Move to next month
230
- if current_date.month == 12:
231
- current_date = current_date.replace(year=current_date.year + 1, month=1)
232
- else:
233
- current_date = current_date.replace(month=current_date.month + 1)
234
-
235
- # Create UNION ALL chain
236
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
237
- return " UNION ALL ".join(union_parts)
238
-
239
-
240
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
241
- """
242
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
243
-
244
- Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
245
- and correlated subqueries. Each batch query runs much faster than one massive query.
246
-
247
- Args:
248
- client: BigQuery client instance
249
- identifiers: List of GitHub usernames/bot identifiers
250
- start_date: Start datetime (timezone-aware)
251
- end_date: End datetime (timezone-aware)
252
- batch_size: Number of agents per batch (default: 100)
253
- upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
254
-
255
- Returns:
256
- Dictionary mapping agent identifier to list of issue metadata
257
- """
258
- print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
259
- print(f" Batch size: {batch_size} agents per query")
260
- print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
261
-
262
- # Split identifiers into batches
263
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
264
- print(f" Total batches: {len(batches)}")
265
-
266
- # Collect results from all batches
267
- all_metadata = {}
268
-
269
- for batch_num, batch_identifiers in enumerate(batches, 1):
270
- print(f"\n{'─'*80}")
271
- print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
272
- print(f"{'─'*80}")
273
-
274
- try:
275
- batch_results = fetch_all_issue_metadata_single_query(
276
- client, batch_identifiers, start_date, end_date
277
- )
278
-
279
- # Merge results
280
- for identifier, metadata_list in batch_results.items():
281
- if identifier in all_metadata:
282
- all_metadata[identifier].extend(metadata_list)
283
- else:
284
- all_metadata[identifier] = metadata_list
285
-
286
- print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
287
-
288
- # Upload immediately after this batch if enabled
289
- if upload_immediately and batch_results:
290
- print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
291
- upload_success = 0
292
- upload_errors = 0
293
-
294
- for identifier, metadata_list in batch_results.items():
295
- if metadata_list:
296
- if save_issue_metadata_to_hf(metadata_list, identifier):
297
- upload_success += 1
298
- else:
299
- upload_errors += 1
300
-
301
- print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
302
-
303
- except Exception as e:
304
- print(f" ✗ Batch {batch_num} failed: {str(e)}")
305
- print(f" Continuing with remaining batches...")
306
- import traceback
307
- traceback.print_exc()
308
- continue
309
-
310
- print(f"\n{'='*80}")
311
- print(f"✅ All batches completed!")
312
- print(f" Total agents with data: {len(all_metadata)}")
313
- total_issues = sum(len(issues) for issues in all_metadata.values())
314
- print(f" Total issues found: {total_issues}")
315
- print(f"{'='*80}\n")
316
-
317
- return all_metadata
318
-
319
-
320
- def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
321
- """
322
- Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
323
-
324
- This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
325
- deduplicates to get the latest state of each issue. Filters by issue author,
326
- commenter, or assignee.
327
-
328
- NOTE: This function is designed for smaller batches (~100 agents). For large
329
- numbers of agents, use fetch_issue_metadata_batched() instead.
330
-
331
- Args:
332
- client: BigQuery client instance
333
- identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
334
- start_date: Start datetime (timezone-aware)
335
- end_date: End datetime (timezone-aware)
336
-
337
- Returns:
338
- Dictionary mapping agent identifier to list of issue metadata:
339
- {
340
- 'agent-identifier': [
341
- {
342
- 'url': Issue URL,
343
- 'created_at': Issue creation timestamp,
344
- 'closed_at': Close timestamp (if closed, else None),
345
- 'state_reason': Reason for closure (completed/not_planned/etc.)
346
- },
347
- ...
348
- ],
349
- ...
350
- }
351
- """
352
- print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
353
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
354
-
355
- # Generate table UNION statements for issue events
356
- issue_tables = generate_table_union_statements(start_date, end_date)
357
-
358
- # Build identifier list for IN clause (handle both bot and non-bot versions)
359
- identifier_set = set()
360
- for id in identifiers:
361
- identifier_set.add(id)
362
- # Also add stripped version without [bot] suffix
363
- stripped = id.replace('[bot]', '')
364
- if stripped != id:
365
- identifier_set.add(stripped)
366
-
367
- # Create array format for UNNEST (avoids 256KB query size limit)
368
- identifier_array = '[' + ', '.join([f'"{id}"' for id in identifier_set]) + ']'
369
-
370
- print(f" Total identifiers (including bot/non-bot variants): {len(identifier_set)}")
371
-
372
- # Build comprehensive query with CTEs
373
- query = f"""
374
- WITH agent_identifiers AS (
375
- -- Create a table of all agent identifiers using UNNEST
376
- -- This avoids hitting BigQuery's 256KB query size limit with large IN clauses
377
- SELECT identifier
378
- FROM UNNEST({identifier_array}) AS identifier
379
- ),
380
-
381
- issue_events AS (
382
- -- Get all issue events and comment events for ALL agents
383
- SELECT
384
- JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
385
- JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
386
- JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
387
- JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
388
- JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
389
- JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
390
- JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
391
- JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
392
- repo.name as repo_name,
393
- created_at as event_time
394
- FROM (
395
- {issue_tables}
396
- )
397
- WHERE
398
- type IN ('IssuesEvent', 'IssueCommentEvent')
399
- -- Exclude pull requests (they have pull_request field)
400
- AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
401
- AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
402
- -- Filter by author OR commenter OR assignee
403
- AND (
404
- JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
405
- OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
406
- OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
407
- )
408
- ),
409
-
410
- latest_states AS (
411
- -- Deduplicate to get latest state for each issue
412
- SELECT
413
- url,
414
- created_at,
415
- closed_at,
416
- state_reason,
417
- author,
418
- assignee,
419
- commenter
420
- FROM issue_events
421
- QUALIFY ROW_NUMBER() OVER (
422
- PARTITION BY repo_name, issue_number
423
- ORDER BY event_time DESC
424
- ) = 1
425
- ),
426
-
427
- agent_issues AS (
428
- -- Map each issue to its relevant agent(s)
429
- SELECT DISTINCT
430
- CASE
431
- WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
432
- WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
433
- WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
434
- ELSE NULL
435
- END as agent_identifier,
436
- url,
437
- created_at,
438
- closed_at,
439
- state_reason
440
- FROM latest_states
441
- WHERE
442
- author IN (SELECT identifier FROM agent_identifiers)
443
- OR commenter IN (SELECT identifier FROM agent_identifiers)
444
- OR assignee IN (SELECT identifier FROM agent_identifiers)
445
  )
446
-
447
- SELECT
448
- agent_identifier,
449
- url,
450
- created_at,
451
- closed_at,
452
- state_reason
453
- FROM agent_issues
454
- WHERE agent_identifier IS NOT NULL
455
- ORDER BY agent_identifier, created_at DESC
456
- """
457
-
458
- # Calculate number of days for reporting
459
- query_days = (end_date - start_date).days
460
-
461
- print(f" Querying {query_days} days for issue and comment events...")
462
- print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
463
-
464
- try:
465
- query_job = client.query(query)
466
- results = list(query_job.result())
467
-
468
- print(f" ✓ Found {len(results)} total issue records across all agents")
469
-
470
- # Group results by agent
471
- metadata_by_agent = defaultdict(list)
472
-
473
- for row in results:
474
- agent_id = row.agent_identifier
475
-
476
- # Convert datetime objects to ISO strings
477
- created_at = row.created_at
478
- if hasattr(created_at, 'isoformat'):
479
- created_at = created_at.isoformat()
480
-
481
- closed_at = row.closed_at
482
- if hasattr(closed_at, 'isoformat'):
483
- closed_at = closed_at.isoformat()
484
-
485
- metadata_by_agent[agent_id].append({
486
- 'url': row.url,
487
- 'created_at': created_at,
488
- 'closed_at': closed_at,
489
- 'state_reason': row.state_reason,
490
- })
491
-
492
- # Print breakdown by agent
493
- print(f"\n 📊 Results breakdown by agent:")
494
- for identifier in identifiers:
495
- # Check both original and stripped versions
496
- count = len(metadata_by_agent.get(identifier, []))
497
- stripped = identifier.replace('[bot]', '')
498
- if stripped != identifier:
499
- count += len(metadata_by_agent.get(stripped, []))
500
-
501
- if count > 0:
502
- # Merge both versions if needed
503
- all_metadata = metadata_by_agent.get(identifier, []) + metadata_by_agent.get(stripped, [])
504
- completed_count = sum(1 for m in all_metadata if m['state_reason'] == 'completed')
505
- closed_count = sum(1 for m in all_metadata if m['closed_at'] is not None)
506
- open_count = count - closed_count
507
- print(f" {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")
508
-
509
- # Convert defaultdict to regular dict and merge bot/non-bot versions
510
- final_metadata = {}
511
- for identifier in identifiers:
512
- combined = metadata_by_agent.get(identifier, [])
513
- stripped = identifier.replace('[bot]', '')
514
- if stripped != identifier and stripped in metadata_by_agent:
515
- combined.extend(metadata_by_agent[stripped])
516
-
517
- if combined:
518
- final_metadata[identifier] = combined
519
-
520
- return final_metadata
521
-
522
- except Exception as e:
523
- print(f" ✗ BigQuery error: {str(e)}")
524
- import traceback
525
- traceback.print_exc()
526
- return {}
527
 
528
 
529
  # =============================================================================
530
- # GITHUB API OPERATIONS (Minimal - for validation only)
531
  # =============================================================================
532
 
533
- def get_github_token():
534
- """Get GitHub token from environment variables for validation purposes."""
535
- token = os.getenv('GITHUB_TOKEN')
536
- if not token:
537
- print("Warning: GITHUB_TOKEN not found for validation")
538
- return token
539
-
540
-
541
  def validate_github_username(identifier):
542
- """Verify that a GitHub identifier exists (simple validation for submission)."""
543
  try:
544
- token = get_github_token()
545
- headers = {'Authorization': f'token {token}'} if token else {}
546
- url = f'https://api.github.com/users/{identifier}'
547
- response = requests.get(url, headers=headers, timeout=10)
548
-
549
- if response.status_code == 200:
550
- return True, "Username is valid"
551
- elif response.status_code == 404:
552
- return False, "GitHub identifier not found"
553
- else:
554
- return False, f"Validation error: HTTP {response.status_code}"
555
  except Exception as e:
556
  return False, f"Validation error: {str(e)}"
557
 
558
 
559
- # =============================================================================
560
- # ISSUE METADATA OPERATIONS
561
- # =============================================================================
562
-
563
-
564
- def extract_issue_metadata(issue):
565
- """
566
- Extract minimal issue metadata for efficient storage.
567
- Only keeps essential fields: url, created_at, closed_at, state_reason.
568
- Note: agent_name is not stored as it's inferred from the folder structure.
569
-
570
- Issue states:
571
- - state: "open" or "closed"
572
- - state_reason: "completed" (resolved), "not_planned" (closed as not planned), or None (still open)
573
- """
574
- # Extract dates and state
575
- created_at = issue.get('created_at')
576
- closed_at = issue.get('closed_at')
577
- state = issue.get('state')
578
- state_reason = issue.get('state_reason')
579
-
580
- return {
581
- 'url': issue.get('url'),
582
- 'created_at': created_at,
583
- 'closed_at': closed_at,
584
- 'state': state,
585
- 'state_reason': state_reason
586
- }
587
-
588
-
589
-
590
-
591
- def calculate_issue_stats_from_metadata(metadata_list):
592
- """
593
- Calculate statistics from a list of issue metadata (lightweight objects).
594
- Works with minimal metadata: url, created_at, closed_at, state, state_reason.
595
-
596
- Returns a dictionary with comprehensive issue metrics.
597
-
598
- Resolved Rate is calculated as:
599
- completed issues / closed issues * 100
600
-
601
- Completed Issues = issues closed as completed (state_reason="completed")
602
- Closed Issues = all issues that have been closed (closed_at is not None)
603
- We do NOT count issues closed as not planned (state_reason="not_planned") as resolved,
604
- but they ARE counted in the denominator as closed issues.
605
- """
606
- total_issues = len(metadata_list)
607
-
608
- # Count closed issues (those with closed_at timestamp)
609
- closed_issues = sum(1 for issue_meta in metadata_list
610
- if issue_meta.get('closed_at') is not None)
611
-
612
- # Count completed issues (subset of closed issues with state_reason="completed")
613
- completed = sum(1 for issue_meta in metadata_list
614
- if issue_meta.get('state_reason') == 'completed')
615
-
616
- # Calculate resolved rate as: completed / closed (not completed / total)
617
- resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0
618
-
619
- return {
620
- 'total_issues': total_issues,
621
- 'closed_issues': closed_issues,
622
- 'resolved_issues': completed,
623
- 'resolved_rate': round(resolved_rate, 2),
624
- }
625
-
626
-
627
- def calculate_monthly_metrics_by_agent(top_n=None):
628
- """
629
- Calculate monthly metrics for all agents (or top N agents) for visualization.
630
- Loads data directly from SWE-Arena/issue_metadata dataset.
631
-
632
- Args:
633
- top_n: If specified, only return metrics for the top N agents by total issues.
634
- Agents are ranked by their total issue count across all months.
635
-
636
- Returns:
637
- dict: {
638
- 'agents': list of agent names,
639
- 'months': list of month labels (e.g., '2025-01'),
640
- 'data': {
641
- agent_name: {
642
- 'resolved_rates': list of resolved rates by month,
643
- 'total_issues': list of issue counts by month,
644
- 'resolved_issues': list of resolved issue counts by month
645
- }
646
- }
647
- }
648
- """
649
- # Load ALL agents from HuggingFace agents repo
650
- agents = load_agents_from_hf()
651
-
652
- # Create mapping from agent_identifier to agent_name
653
- identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
654
-
655
- # Load all issue metadata from issue_metadata dataset
656
- all_metadata = load_issue_metadata()
657
-
658
- if not all_metadata:
659
- return {'agents': [], 'months': [], 'data': {}}
660
-
661
- # Group by agent and month
662
- agent_month_data = defaultdict(lambda: defaultdict(list))
663
-
664
- for issue_meta in all_metadata:
665
- agent_identifier = issue_meta.get('agent_identifier')
666
- created_at = issue_meta.get('created_at')
667
-
668
- if not agent_identifier or not created_at:
669
- continue
670
-
671
- # Get agent_name from identifier
672
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
673
-
674
- try:
675
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
676
- month_key = f"{dt.year}-{dt.month:02d}"
677
- agent_month_data[agent_name][month_key].append(issue_meta)
678
- except Exception as e:
679
- print(f"Warning: Could not parse date '{created_at}': {e}")
680
- continue
681
-
682
- # Get all unique months and sort them
683
- all_months = set()
684
- for agent_data in agent_month_data.values():
685
- all_months.update(agent_data.keys())
686
- months = sorted(list(all_months))
687
-
688
- # Calculate metrics for each agent and month
689
- result_data = {}
690
- for agent_name, month_dict in agent_month_data.items():
691
- resolved_rates = []
692
- total_issues_list = []
693
- resolved_issues_list = []
694
-
695
- for month in months:
696
- issues_in_month = month_dict.get(month, [])
697
-
698
- # Count completed issues (those with state_reason="completed")
699
- completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
700
-
701
- # Count closed issues (those with closed_at timestamp)
702
- closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
703
-
704
- # Total issues created in this month
705
- total_count = len(issues_in_month)
706
-
707
- # Calculate resolved rate as: completed / closed (not completed / total)
708
- resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
709
-
710
- resolved_rates.append(resolved_rate)
711
- total_issues_list.append(total_count)
712
- resolved_issues_list.append(completed_count)
713
-
714
- result_data[agent_name] = {
715
- 'resolved_rates': resolved_rates,
716
- 'total_issues': total_issues_list,
717
- 'resolved_issues': resolved_issues_list
718
- }
719
-
720
- # Filter to top N agents if specified
721
- agents_list = sorted(list(agent_month_data.keys()))
722
- if top_n is not None and top_n > 0:
723
- # Calculate total issues for each agent across all months
724
- agent_totals = []
725
- for agent_name in agents_list:
726
- total_issues = sum(result_data[agent_name]['total_issues'])
727
- agent_totals.append((agent_name, total_issues))
728
-
729
- # Sort by total issues (descending) and take top N
730
- agent_totals.sort(key=lambda x: x[1], reverse=True)
731
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
732
-
733
- # Filter result_data to only include top agents
734
- result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
735
- agents_list = top_agents
736
-
737
- return {
738
- 'agents': agents_list,
739
- 'months': months,
740
- 'data': result_data
741
- }
742
-
743
-
744
- # =============================================================================
745
- # ISSUE METADATA STORAGE & RETRIEVAL
746
- # =============================================================================
747
-
748
- def group_metadata_by_date(metadata_list):
749
- """
750
- Group issue metadata by exact date (year.month.day) for efficient daily storage.
751
- Returns dict: {(year, month, day): [metadata_list]}
752
- """
753
- grouped = defaultdict(list)
754
-
755
- for issue_meta in metadata_list:
756
- created_at = issue_meta.get('created_at')
757
- if not created_at:
758
- continue
759
-
760
- try:
761
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
762
- key = (dt.year, dt.month, dt.day)
763
- grouped[key].append(issue_meta)
764
- except Exception as e:
765
- print(f"Warning: Could not parse date '{created_at}': {e}")
766
-
767
- return dict(grouped)
768
-
769
-
770
- def save_issue_metadata_to_hf(metadata_list, agent_identifier):
771
- """
772
- Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
773
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
774
-
775
- This function uses COMPLETE OVERWRITE strategy (not append/deduplicate).
776
- Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
777
-
778
- Args:
779
- metadata_list: List of issue metadata dictionaries
780
- agent_identifier: GitHub identifier of the agent (used as folder name)
781
- """
782
- import tempfile
783
- import shutil
784
-
785
- temp_dir = None
786
- try:
787
- token = get_hf_token()
788
- if not token:
789
- raise Exception("No HuggingFace token found")
790
-
791
- api = HfApi(token=token)
792
-
793
- # Group by exact date (year, month, day)
794
- grouped = group_metadata_by_date(metadata_list)
795
-
796
- if not grouped:
797
- print(f" No valid metadata to save for {agent_identifier}")
798
- return False
799
-
800
- # Create temporary directory for batch upload
801
- temp_dir = tempfile.mkdtemp()
802
- agent_folder = os.path.join(temp_dir, agent_identifier)
803
- os.makedirs(agent_folder, exist_ok=True)
804
-
805
- print(f"📦 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
806
-
807
- # Process each daily file
808
- for (issue_year, month, day), day_metadata in grouped.items():
809
- filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
810
- local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
811
-
812
- # Sort by created_at for better organization
813
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
814
-
815
- # Save to temp directory (complete overwrite, no merging)
816
- save_jsonl(local_filename, day_metadata)
817
- print(f" Prepared {len(day_metadata)} issues for {filename}")
818
-
819
- # Upload entire folder using upload_folder (single commit per agent)
820
- print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
821
- upload_folder_with_backoff(
822
- api,
823
- folder_path=temp_dir,
824
- repo_id=ISSUE_METADATA_REPO,
825
- repo_type="dataset",
826
- commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
827
- )
828
- print(f" ✓ Batch upload complete for {agent_identifier}")
829
-
830
- return True
831
-
832
- except Exception as e:
833
- print(f"✗ Error saving issue metadata: {str(e)}")
834
- return False
835
- finally:
836
- # Always clean up temporary directory
837
- if temp_dir and os.path.exists(temp_dir):
838
- shutil.rmtree(temp_dir)
839
-
840
-
841
- def load_issue_metadata():
842
- """
843
- Load issue metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
844
-
845
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
846
-
847
- Returns:
848
- List of dictionaries with 'agent_identifier' added to each issue metadata.
849
- Only includes issues within the last LEADERBOARD_TIME_FRAME_DAYS.
850
- """
851
- # Calculate cutoff date based on LEADERBOARD_TIME_FRAME_DAYS
852
- current_time = datetime.now(timezone.utc)
853
- cutoff_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
854
-
855
- try:
856
- api = HfApi()
857
- token = get_hf_token()
858
-
859
- # List all files in the repository
860
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
861
-
862
- # Filter for files within the time frame: [agent_identifier]/YYYY.MM.DD.jsonl
863
- # Parse date from filename and only include files within LEADERBOARD_TIME_FRAME_DAYS
864
- time_frame_files = []
865
- for f in files:
866
- if f.endswith('.jsonl'):
867
- parts = f.split('/')
868
- if len(parts) == 2: # [agent_identifier]/YYYY.MM.DD.jsonl
869
- filename = parts[1]
870
- try:
871
- # Extract date from filename: YYYY.MM.DD.jsonl
872
- date_part = filename.replace('.jsonl', '') # Get YYYY.MM.DD
873
- date_components = date_part.split('.')
874
- if len(date_components) == 3:
875
- file_year, file_month, file_day = map(int, date_components)
876
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
877
-
878
- # Only include files within the time frame
879
- if file_date >= cutoff_date:
880
- time_frame_files.append(f)
881
- except Exception:
882
- # Skip files with unparseable dates
883
- continue
884
-
885
- print(f"📥 [LOAD] Reading cached issue metadata from HuggingFace ({len(time_frame_files)} files, last {LEADERBOARD_TIME_FRAME_DAYS} days)...")
886
-
887
- all_metadata = []
888
- for filename in time_frame_files:
889
- try:
890
- # Extract agent_identifier from path (first part)
891
- # Format: agent_identifier/YYYY.MM.DD.jsonl
892
- parts = filename.split('/')
893
- if len(parts) != 2:
894
- print(f" Warning: Unexpected filename format: {filename}")
895
- continue
896
-
897
- agent_identifier = parts[0]
898
-
899
- file_path = hf_hub_download_with_backoff(
900
- repo_id=ISSUE_METADATA_REPO,
901
- filename=filename,
902
- repo_type="dataset",
903
- token=token
904
- )
905
- day_metadata = load_jsonl(file_path)
906
-
907
- # Add agent_identifier and filter by date as a double-check
908
- for issue_meta in day_metadata:
909
- # Validate issue date against cutoff
910
- created_at = issue_meta.get('created_at')
911
- if created_at:
912
- try:
913
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
914
- if dt < cutoff_date:
915
- continue # Skip issues outside time frame
916
- except Exception:
917
- pass # Keep issues with unparseable dates
918
-
919
- issue_meta['agent_identifier'] = agent_identifier
920
- all_metadata.append(issue_meta)
921
-
922
- print(f" ✓ Loaded {len(day_metadata)} issues from {filename}")
923
- except Exception as e:
924
- print(f" Warning: Could not load {filename}: {str(e)}")
925
-
926
- print(f"✓ Loaded {len(all_metadata)} total issues from last {LEADERBOARD_TIME_FRAME_DAYS} days")
927
- return all_metadata
928
-
929
- except Exception as e:
930
- print(f"✗ Error loading issue metadata from last {LEADERBOARD_TIME_FRAME_DAYS} days: {str(e)}")
931
- return []
932
-
933
-
934
- def get_latest_issue_date_for_agent(agent_identifier):
935
- """
936
- Get the latest issue creation date for an agent from stored metadata.
937
- Used for incremental updates - only fetch issues newer than this date.
938
-
939
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
940
-
941
- Args:
942
- agent_identifier: GitHub identifier of the agent
943
-
944
- Returns:
945
- datetime or None if no existing issues found.
946
- """
947
- try:
948
- api = HfApi()
949
- token = get_hf_token()
950
-
951
- # List all files in the repository
952
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
953
-
954
- # Filter for files in this agent's folder
955
- # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
956
- agent_pattern = f"{agent_identifier}/"
957
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
958
-
959
- if not agent_files:
960
- return None
961
-
962
- # Find latest created_at across all files
963
- latest_date = None
964
- for filename in agent_files:
965
- try:
966
- file_path = hf_hub_download_with_backoff(
967
- repo_id=ISSUE_METADATA_REPO,
968
- filename=filename,
969
- repo_type="dataset",
970
- token=token
971
- )
972
- metadata = load_jsonl(file_path)
973
-
974
- for issue in metadata:
975
- created_at = issue.get('created_at')
976
- if created_at:
977
- try:
978
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
979
- if latest_date is None or dt > latest_date:
980
- latest_date = dt
981
- except Exception:
982
- continue
983
- except Exception:
984
- continue
985
-
986
- return latest_date
987
-
988
- except Exception:
989
- return None
990
-
991
-
992
- def get_daily_files_last_time_frame(agent_identifier):
993
- """
994
- Get list of daily file paths for an agent from the configured time frame.
995
-
996
- Args:
997
- agent_identifier: GitHub identifier of the agent
998
-
999
- Returns:
1000
- List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
1001
- """
1002
- try:
1003
- api = HfApi()
1004
- token = get_hf_token()
1005
-
1006
- # Calculate date range using configured time frame
1007
- today = datetime.now(timezone.utc)
1008
- cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1009
-
1010
- # List all files in the repository
1011
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
1012
-
1013
- # Filter for files in this agent's folder
1014
- agent_pattern = f"{agent_identifier}/"
1015
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
1016
-
1017
- # Filter by date range (extract date from filename)
1018
- recent_files = []
1019
- for filename in agent_files:
1020
- try:
1021
- # Extract date from filename: YYYY.MM.DD.jsonl
1022
- parts = filename.split('/')
1023
- if len(parts) != 2:
1024
- continue
1025
-
1026
- date_part = parts[1].replace('.jsonl', '') # Get YYYY.MM.DD
1027
- date_components = date_part.split('.')
1028
- if len(date_components) != 3:
1029
- continue
1030
-
1031
- file_year, file_month, file_day = map(int, date_components)
1032
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
1033
-
1034
- # Include if within configured time frame
1035
- if cutoff_date <= file_date <= today:
1036
- recent_files.append(filename)
1037
- except Exception:
1038
- continue
1039
-
1040
- return recent_files
1041
-
1042
- except Exception as e:
1043
- print(f"Error getting daily files: {str(e)}")
1044
- return []
1045
-
1046
-
1047
  # =============================================================================
1048
  # HUGGINGFACE DATASET OPERATIONS
1049
  # =============================================================================
@@ -1055,7 +101,7 @@ def load_agents_from_hf():
1055
  agents = []
1056
 
1057
  # List all files in the repository
1058
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
1059
 
1060
  # Filter for JSON files only
1061
  json_files = [f for f in files if f.endswith('.json')]
@@ -1082,19 +128,13 @@ def load_agents_from_hf():
1082
  # Add or override github_identifier to match filename
1083
  agent_data['github_identifier'] = filename_identifier
1084
 
1085
- # Normalize name field: use 'name' if exists, otherwise use identifier
1086
- if 'name' in agent_data:
1087
- agent_data['name'] = agent_data['name']
1088
- elif 'name' not in agent_data:
1089
- agent_data['name'] = filename_identifier
1090
-
1091
  agents.append(agent_data)
1092
 
1093
  except Exception as e:
1094
  print(f"Warning: Could not load {json_file}: {str(e)}")
1095
  continue
1096
 
1097
- print(f"Loaded {len(agents)} agents from HuggingFace")
1098
  return agents
1099
 
1100
  except Exception as e:
@@ -1102,8 +142,6 @@ def load_agents_from_hf():
1102
  return None
1103
 
1104
 
1105
-
1106
-
1107
  def get_hf_token():
1108
  """Get HuggingFace token from environment variables."""
1109
  token = os.getenv('HF_TOKEN')
@@ -1112,48 +150,6 @@ def get_hf_token():
1112
  return token
1113
 
1114
 
1115
- def load_cached_leaderboard_and_metrics():
1116
- """
1117
- Load cached leaderboard and monthly metrics data from HuggingFace.
1118
- This is much faster than constructing from scratch on every app launch.
1119
-
1120
- Returns:
1121
- dict: {
1122
- 'leaderboard': dict of agent stats,
1123
- 'monthly_metrics': dict with agents, months, and data,
1124
- 'metadata': dict with last_updated, time_frame_days, total_agents
1125
- }
1126
- Returns None if cache doesn't exist or fails to load.
1127
- """
1128
- try:
1129
- token = get_hf_token()
1130
-
1131
- print("📥 Loading cached leaderboard and metrics from HuggingFace...")
1132
-
1133
- # Download cached file
1134
- cached_path = hf_hub_download_with_backoff(
1135
- repo_id=LEADERBOARD_REPO,
1136
- filename="swe-issue.json",
1137
- repo_type="dataset",
1138
- token=token
1139
- )
1140
-
1141
- # Load JSON data
1142
- with open(cached_path, 'r', encoding='utf-8') as f:
1143
- data = json.load(f)
1144
-
1145
- print(f" ✓ Loaded cached data (last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')})")
1146
- print(f" ✓ Leaderboard entries: {len(data.get('leaderboard', {}))}")
1147
- print(f" ✓ Monthly metrics for: {len(data.get('monthly_metrics', {}).get('agents', []))} agents")
1148
-
1149
- return data
1150
-
1151
- except Exception as e:
1152
- print(f"⚠️ Could not load cached data: {str(e)}")
1153
- print(f" Falling back to constructing from issue metadata...")
1154
- return None
1155
-
1156
-
1157
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
1158
  """
1159
  Upload file to HuggingFace with exponential backoff retry logic.
@@ -1182,18 +178,18 @@ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, to
1182
  token=token
1183
  )
1184
  if attempt > 0:
1185
- print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
1186
  return True
1187
 
1188
  except Exception as e:
1189
  if attempt < max_retries - 1:
1190
  wait_time = delay + random.uniform(0, 1.0)
1191
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
1192
- print(f" Retrying in {wait_time:.1f} seconds...")
1193
  time.sleep(wait_time)
1194
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
1195
  else:
1196
- print(f" Upload failed after {max_retries} attempts: {str(e)}")
1197
  raise
1198
 
1199
 
@@ -1223,7 +219,7 @@ def save_agent_to_hf(data):
1223
  repo_type="dataset",
1224
  token=token
1225
  )
1226
- print(f"Saved agent to HuggingFace: {filename}")
1227
  return True
1228
  finally:
1229
  # Always clean up local file, even if upload fails
@@ -1231,207 +227,48 @@ def save_agent_to_hf(data):
1231
  os.remove(filename)
1232
 
1233
  except Exception as e:
1234
- print(f"Error saving agent: {str(e)}")
1235
  return False
1236
 
1237
 
1238
-
1239
-
1240
- # =============================================================================
1241
- # DATA MANAGEMENT
1242
- # =============================================================================
1243
-
1244
- def save_leaderboard_and_metrics_to_hf():
1245
  """
1246
- Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
1247
- If the file exists, it will be overwritten.
1248
 
1249
  Returns:
1250
- bool: True if successful, False otherwise
 
1251
  """
1252
- import io
1253
-
1254
  try:
1255
  token = get_hf_token()
1256
- if not token:
1257
- raise Exception("No HuggingFace token found")
1258
-
1259
- api = HfApi(token=token)
1260
-
1261
- print(f"\n{'='*80}")
1262
- print(f"📊 Preparing leaderboard and metrics data for upload...")
1263
- print(f"{'='*80}\n")
1264
-
1265
- # Get leaderboard data
1266
- print(" Constructing leaderboard data...")
1267
- leaderboard_data = construct_leaderboard_from_metadata()
1268
-
1269
- # Get monthly metrics data (all agents, not just top N)
1270
- print(" Calculating monthly metrics...")
1271
- monthly_metrics = calculate_monthly_metrics_by_agent(top_n=None)
1272
-
1273
- # Combine into a single structure
1274
- combined_data = {
1275
- "leaderboard": leaderboard_data,
1276
- "monthly_metrics": monthly_metrics,
1277
- "metadata": {
1278
- "last_updated": datetime.now(timezone.utc).isoformat(),
1279
- "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
1280
- "total_agents": len(leaderboard_data)
1281
- }
1282
- }
1283
-
1284
- print(f" Leaderboard entries: {len(leaderboard_data)}")
1285
- print(f" Monthly metrics for: {len(monthly_metrics['agents'])} agents")
1286
- print(f" Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
1287
 
1288
- # Convert to JSON and create file-like object
1289
- json_content = json.dumps(combined_data, indent=2)
1290
- file_like_object = io.BytesIO(json_content.encode('utf-8'))
1291
-
1292
- # Upload to HuggingFace (will overwrite if exists)
1293
- print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
1294
- upload_file_with_backoff(
1295
- api,
1296
- path_or_fileobj=file_like_object,
1297
- path_in_repo="swe-issue.json",
1298
  repo_id=LEADERBOARD_REPO,
 
1299
  repo_type="dataset",
1300
- token=token,
1301
- commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
1302
  )
1303
 
1304
- print(f" ✓ Successfully uploaded swe-issue.json")
1305
- print(f"{'='*80}\n")
1306
-
1307
- return True
1308
-
1309
- except Exception as e:
1310
- print(f"✗ Error saving leaderboard and metrics: {str(e)}")
1311
- import traceback
1312
- traceback.print_exc()
1313
- return False
1314
-
1315
-
1316
- def mine_all_agents():
1317
- """
1318
- Mine issue metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1319
- Uses BATCHED BigQuery queries for all agents (efficient approach).
1320
- """
1321
- # Load agent metadata from HuggingFace
1322
- agents = load_agents_from_hf()
1323
- if not agents:
1324
- print("No agents found in HuggingFace dataset")
1325
- return
1326
-
1327
- # Extract all identifiers
1328
- identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1329
- if not identifiers:
1330
- print("No valid agent identifiers found")
1331
- return
1332
-
1333
- print(f"\n{'='*80}")
1334
- print(f"⛏️ [MINE] Starting BigQuery data mining for {len(identifiers)} agents")
1335
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
1336
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
1337
- print(f"⚠️ This will query BigQuery and may take several minutes")
1338
- print(f"{'='*80}\n")
1339
-
1340
- # Initialize BigQuery client
1341
- try:
1342
- client = get_bigquery_client()
1343
- except Exception as e:
1344
- print(f"✗ Failed to initialize BigQuery client: {str(e)}")
1345
- return
1346
-
1347
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
1348
- current_time = datetime.now(timezone.utc)
1349
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1350
- start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1351
-
1352
- try:
1353
- # Use batched approach for better performance
1354
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1355
- all_metadata = fetch_issue_metadata_batched(
1356
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
1357
- )
1358
 
1359
- # Calculate summary statistics
1360
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1361
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1362
 
1363
- print(f"\n{'='*80}")
1364
- print(f"✅ BigQuery mining and upload complete!")
1365
- print(f" Total agents: {len(agents)}")
1366
- print(f" Agents with data: {agents_with_data}")
1367
- print(f" Total PRs found: {total_prs}")
1368
- print(f"{'='*80}\n")
1369
 
1370
  except Exception as e:
1371
- print(f" Error during BigQuery fetch: {str(e)}")
1372
- import traceback
1373
- traceback.print_exc()
1374
- return
1375
-
1376
- # After mining is complete, save leaderboard and metrics to HuggingFace
1377
- print(f"📤 Uploading leaderboard and metrics data...")
1378
- if save_leaderboard_and_metrics_to_hf():
1379
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
1380
- else:
1381
- print(f"⚠️ Failed to upload leaderboard and metrics data")
1382
-
1383
-
1384
- def construct_leaderboard_from_metadata():
1385
- """
1386
- Construct leaderboard from stored issue metadata instead of fetching all issues.
1387
- Much more memory-efficient and faster.
1388
-
1389
- Returns dictionary of agent stats.
1390
- """
1391
- print("📊 Constructing leaderboard from issue metadata...")
1392
- # Load agents
1393
- agents = load_agents_from_hf()
1394
- if not agents:
1395
- print("No agents found")
1396
- return {}
1397
-
1398
- # Load all issue metadata
1399
- all_metadata = load_issue_metadata()
1400
-
1401
- cache_dict = {}
1402
-
1403
- for agent in agents:
1404
- identifier = agent.get('github_identifier')
1405
- agent_name = agent.get('name', 'Unknown')
1406
-
1407
- # Filter metadata for this agent
1408
- bot_metadata = [issue for issue in all_metadata if issue.get('agent_identifier') == identifier]
1409
-
1410
- # Calculate stats
1411
- stats = calculate_issue_stats_from_metadata(bot_metadata)
1412
-
1413
- cache_dict[identifier] = {
1414
- 'name': agent_name,
1415
- 'website': agent.get('website', 'N/A'),
1416
- 'github_identifier': identifier,
1417
- **stats
1418
- }
1419
-
1420
- return cache_dict
1421
 
1422
 
1423
  # =============================================================================
1424
  # UI FUNCTIONS
1425
  # =============================================================================
1426
 
1427
- def generate_color(index, total):
1428
- """Generate distinct colors using HSL color space for better distribution"""
1429
- hue = (index * 360 / total) % 360
1430
- saturation = 70 + (index % 3) * 10 # Vary saturation slightly
1431
- lightness = 45 + (index % 2) * 10 # Vary lightness slightly
1432
- return f'hsl({hue}, {saturation}%, {lightness}%)'
1433
-
1434
-
1435
  def create_monthly_metrics_plot(top_n=5):
1436
  """
1437
  Create a Plotly figure with dual y-axes showing:
@@ -1443,37 +280,47 @@ def create_monthly_metrics_plot(top_n=5):
1443
  Args:
1444
  top_n: Number of top agents to show (default: 5)
1445
  """
1446
- # Try to load from cache first
1447
- cached_data = load_cached_leaderboard_and_metrics()
1448
-
1449
- if cached_data and 'monthly_metrics' in cached_data:
1450
- # Use cached monthly metrics
1451
- all_metrics = cached_data['monthly_metrics']
1452
-
1453
- # Filter to top_n agents by total issue count
1454
- if all_metrics.get('agents') and all_metrics.get('data'):
1455
- # Calculate total issues for each agent
1456
- agent_totals = []
1457
- for agent_name in all_metrics['agents']:
1458
- total_issues = sum(all_metrics['data'][agent_name]['total_issues'])
1459
- agent_totals.append((agent_name, total_issues))
1460
-
1461
- # Sort and take top_n agents
1462
- agent_totals.sort(key=lambda x: x[1], reverse=True)
1463
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
1464
-
1465
- # Filter metrics to only include top agents
1466
- metrics = {
1467
- 'agents': top_agents,
1468
- 'months': all_metrics['months'],
1469
- 'data': {agent: all_metrics['data'][agent] for agent in top_agents if agent in all_metrics['data']}
1470
- }
1471
- else:
1472
- metrics = all_metrics
1473
- else:
1474
- # Fallback: Calculate from issue metadata
1475
- print(" Calculating monthly metrics from issue metadata...")
1476
- metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
 
1477
 
1478
  if not metrics['agents'] or not metrics['months']:
1479
  # Return an empty figure with a message
@@ -1494,15 +341,23 @@ def create_monthly_metrics_plot(top_n=5):
1494
  # Create figure with secondary y-axis
1495
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1496
 
 
1497
  agents = metrics['agents']
1498
  months = metrics['months']
1499
  data = metrics['data']
1500
 
1501
- # Generate unique colors for many agents using HSL color space
1502
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
1503
 
1504
  # Add traces for each agent
1505
- for agent_name in agents:
1506
  color = agent_colors[agent_name]
1507
  agent_data = data[agent_name]
1508
 
@@ -1520,10 +375,11 @@ def create_monthly_metrics_plot(top_n=5):
1520
  name=agent_name,
1521
  mode='lines+markers',
1522
  line=dict(color=color, width=2),
1523
- marker=dict(size=6),
1524
  legendgroup=agent_name,
1525
- showlegend=True,
1526
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1527
  'Resolved Rate: %{y:.2f}%<br>' +
1528
  '<extra></extra>'
1529
  ),
@@ -1547,8 +403,9 @@ def create_monthly_metrics_plot(top_n=5):
1547
  name=agent_name,
1548
  marker=dict(color=color, opacity=0.6),
1549
  legendgroup=agent_name,
1550
- showlegend=False, # Don't show in legend (already shown for line)
1551
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1552
  'Total Issues: %{y}<br>' +
1553
  '<extra></extra>',
1554
  offsetgroup=agent_name # Group bars by agent for proper spacing
@@ -1558,23 +415,26 @@ def create_monthly_metrics_plot(top_n=5):
1558
 
1559
  # Update axes labels
1560
  fig.update_xaxes(title_text=None)
1561
- fig.update_yaxes(title_text="<b>Resolved Rate (%)</b>", secondary_y=False)
 
 
1562
  fig.update_yaxes(title_text="<b>Total Issues</b>", secondary_y=True)
1563
 
1564
  # Update layout
 
1565
  fig.update_layout(
1566
  title=None,
1567
- hovermode='closest',
1568
  barmode='group',
1569
  height=600,
1570
- legend=dict(
1571
- orientation="h",
1572
- yanchor="bottom",
1573
- y=1.02,
1574
- xanchor="right",
1575
- x=1
1576
- ),
1577
- margin=dict(l=50, r=50, t=100, b=50)
1578
  )
1579
 
1580
  return fig
@@ -1582,39 +442,52 @@ def create_monthly_metrics_plot(top_n=5):
1582
 
1583
  def get_leaderboard_dataframe():
1584
  """
1585
- Load leaderboard from cached data and convert to pandas DataFrame for display.
1586
- Falls back to constructing from issue metadata if cache is unavailable.
1587
  Returns formatted DataFrame sorted by total issues.
1588
  """
1589
- # Try to load from cache first
1590
- cached_data = load_cached_leaderboard_and_metrics()
1591
 
1592
- if cached_data and 'leaderboard' in cached_data:
1593
- cache_dict = cached_data['leaderboard']
1594
- else:
1595
- # Fallback: Construct leaderboard from metadata
1596
- print(" Constructing leaderboard from issue metadata...")
1597
- cache_dict = construct_leaderboard_from_metadata()
 
 
 
 
1598
 
1599
  if not cache_dict:
 
1600
  # Return empty DataFrame with correct columns if no data
1601
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1602
  return pd.DataFrame(columns=column_names)
1603
 
1604
  rows = []
1605
- for data in cache_dict.values():
 
 
 
 
1606
  # Filter out agents with zero total issues
1607
- if data.get('total_issues', 0) == 0:
 
1608
  continue
 
1609
  # Only include display-relevant fields
1610
  rows.append([
1611
  data.get('name', 'Unknown'),
1612
  data.get('website', 'N/A'),
1613
- data.get('total_issues', 0),
1614
  data.get('resolved_issues', 0),
1615
  data.get('resolved_rate', 0.0),
1616
  ])
1617
 
 
 
 
1618
  # Create DataFrame
1619
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1620
  df = pd.DataFrame(rows, columns=column_names)
@@ -1629,95 +502,125 @@ def get_leaderboard_dataframe():
1629
  if "Total Issues" in df.columns and not df.empty:
1630
  df = df.sort_values(by="Total Issues", ascending=False).reset_index(drop=True)
1631
 
 
 
 
1632
  return df
1633
 
1634
 
1635
- def submit_agent(identifier, agent_name, developer, website):
1636
  """
1637
  Submit a new agent to the leaderboard.
1638
  Validates input and saves submission.
1639
- Issue data will be populated by the monthly mining task.
1640
  """
1641
  # Validate required fields
1642
  if not identifier or not identifier.strip():
1643
- return " GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1644
  if not agent_name or not agent_name.strip():
1645
- return " Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1646
- if not developer or not developer.strip():
1647
- return " Developer name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1648
  if not website or not website.strip():
1649
- return " Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1650
 
1651
  # Clean inputs
1652
  identifier = identifier.strip()
1653
  agent_name = agent_name.strip()
1654
- developer = developer.strip()
1655
  website = website.strip()
1656
 
1657
  # Validate GitHub identifier
1658
  is_valid, message = validate_github_username(identifier)
1659
  if not is_valid:
1660
- return f" {message}", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1661
 
1662
  # Check for duplicates by loading agents from HuggingFace
1663
  agents = load_agents_from_hf()
1664
  if agents:
1665
  existing_names = {agent['github_identifier'] for agent in agents}
1666
  if identifier in existing_names:
1667
- return f"⚠️ Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1668
 
1669
  # Create submission
1670
  submission = {
1671
  'name': agent_name,
1672
- 'developer': developer,
1673
  'github_identifier': identifier,
1674
  'website': website,
 
1675
  }
1676
 
1677
  # Save to HuggingFace
1678
  if not save_agent_to_hf(submission):
1679
- return " Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1680
 
1681
- return f"✅ Successfully submitted {agent_name}! Issue data will be populated by daily incremental updates.", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1682
 
1683
 
1684
  # =============================================================================
1685
  # GRADIO APPLICATION
1686
  # =============================================================================
1687
 
1688
- print(f"\n🚀 Starting SWE Agent PR Leaderboard")
1689
- print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
1690
- print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
1691
 
1692
- # Start APScheduler for monthly PR mining at 12:00 AM UTC every 1st of the month
1693
  scheduler = BackgroundScheduler(timezone="UTC")
1694
  scheduler.add_job(
1695
- mine_all_agents,
1696
- trigger=CronTrigger(day=1, hour=0, minute=0), # 12:00 AM UTC every 1st of the month
1697
- id='monthly_issue_mining',
1698
- name='Monthly Issue Mining',
1699
  replace_existing=True
1700
  )
1701
  scheduler.start()
1702
  print(f"\n{'='*80}")
1703
- print(f"Scheduler initialized successfully")
1704
- print(f"⛏️ Mining schedule: Every 1st of the month at 12:00 AM UTC")
1705
- print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1706
  print(f"{'='*80}\n")
1707
 
1708
  # Create Gradio interface
1709
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
1710
-
1711
- gr.Markdown("# 🏆 SWE Agent Issue Leaderboard")
1712
  gr.Markdown(f"Track and compare GitHub issue resolution statistics for SWE agents")
1713
 
1714
  with gr.Tabs():
1715
 
1716
  # Leaderboard Tab
1717
- with gr.Tab("📊 Leaderboard"):
1718
- gr.Markdown(f"*All statistics are based on issues from the last {LEADERBOARD_TIME_FRAME_DAYS // 30} months*")
1719
  leaderboard_table = Leaderboard(
1720
- value=get_leaderboard_dataframe(),
1721
  datatype=LEADERBOARD_COLUMNS,
1722
  search_columns=["Agent Name", "Website"],
1723
  filter_columns=[
@@ -1732,41 +635,55 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
1732
  ]
1733
  )
1734
 
1735
- gr.Markdown("### Monthly Metrics")
1736
- gr.Markdown("Track resolution rates and issue activity over time")
 
 
 
 
 
 
 
 
 
 
 
1737
 
1738
- monthly_plot = gr.Plot(
1739
- value=create_monthly_metrics_plot(),
1740
- label="Monthly Issue Metrics"
 
 
1741
  )
1742
 
 
1743
  # Submit Agent Tab
1744
- with gr.Tab("Submit Agent"):
1745
-
1746
  gr.Markdown("### Submit Your Agent")
1747
- gr.Markdown("Fill in the details below to add your agent to the leaderboard. Make sure you're logged in to HuggingFace CLI on your machine.")
1748
-
1749
  with gr.Row():
1750
  with gr.Column():
1751
  github_input = gr.Textbox(
1752
  label="GitHub Identifier*",
1753
- placeholder="Your agent username (e.g., my-agent-bot)"
1754
  )
1755
  name_input = gr.Textbox(
1756
  label="Agent Name*",
1757
  placeholder="Your agent's display name"
1758
  )
1759
-
1760
  with gr.Column():
1761
- developer_input = gr.Textbox(
1762
- label="Developer*",
1763
- placeholder="Your developer or team name"
1764
  )
1765
  website_input = gr.Textbox(
1766
- label="Website",
1767
  placeholder="https://your-agent-website.com"
1768
  )
1769
-
1770
  submit_button = gr.Button(
1771
  "Submit Agent",
1772
  variant="primary"
@@ -1775,15 +692,15 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
1775
  label="Submission Status",
1776
  interactive=False
1777
  )
1778
-
1779
  # Event handler
1780
  submit_button.click(
1781
  fn=submit_agent,
1782
- inputs=[github_input, name_input, developer_input, website_input],
1783
- outputs=[submission_status, leaderboard_table, monthly_plot]
1784
  )
1785
 
1786
 
1787
  # Launch application
1788
  if __name__ == "__main__":
1789
- app.launch()
 
  import json
  import os
  import time
  import requests
  from huggingface_hub import HfApi, hf_hub_download
  from huggingface_hub.errors import HfHubHTTPError
+ import backoff
  from dotenv import load_dotenv
  import pandas as pd
  import random
  from plotly.subplots import make_subplots
  from apscheduler.schedulers.background import BackgroundScheduler
  from apscheduler.triggers.cron import CronTrigger
 
  # Load environment variables
  load_dotenv()
 
  # =============================================================================
 
  AGENTS_REPO = "SWE-Arena/bot_metadata"  # HuggingFace dataset for agent metadata
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"  # HuggingFace dataset for leaderboard data
+ MAX_RETRIES = 5
 
  LEADERBOARD_COLUMNS = [
      ("Agent Name", "string"),
 
  # =============================================================================
 
  def is_rate_limit_error(e):
+     """Check if exception is a HuggingFace rate limit error (429)."""
+     if isinstance(e, HfHubHTTPError):
+         return e.response.status_code == 429
+     return False
 
 
  @backoff.on_exception(
      backoff.expo,
      HfHubHTTPError,
+     max_tries=MAX_RETRIES,
      base=300,
      max_value=3600,
+     giveup=lambda e: not is_rate_limit_error(e),
+     on_backoff=lambda details: print(
+         f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
+     )
  )
  def list_repo_files_with_backoff(api, **kwargs):
+     """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
      return api.list_repo_files(**kwargs)
 
 
  @backoff.on_exception(
      backoff.expo,
      HfHubHTTPError,
+     max_tries=MAX_RETRIES,
      base=300,
      max_value=3600,
      giveup=lambda e: not is_rate_limit_error(e),
+     on_backoff=lambda details: print(
+         f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
      )
+ )
+ def hf_hub_download_with_backoff(**kwargs):
+     """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
+     return hf_hub_download(**kwargs)
 
 
  # =============================================================================
+ # GITHUB USERNAME VALIDATION
  # =============================================================================
 
  def validate_github_username(identifier):
+     """Verify that a GitHub identifier exists."""
      try:
+         response = requests.get(f'https://api.github.com/users/{identifier}', timeout=10)
+         return (True, "Username is valid") if response.status_code == 200 else (False, "GitHub identifier not found" if response.status_code == 404 else f"Validation error: HTTP {response.status_code}")
      except Exception as e:
          return False, f"Validation error: {str(e)}"
 
 
  # =============================================================================
  # HUGGINGFACE DATASET OPERATIONS
  # =============================================================================
          agents = []
 
          # List all files in the repository
+         files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
 
          # Filter for JSON files only
          json_files = [f for f in files if f.endswith('.json')]
                  # Add or override github_identifier to match filename
                  agent_data['github_identifier'] = filename_identifier
                  agents.append(agent_data)
 
              except Exception as e:
                  print(f"Warning: Could not load {json_file}: {str(e)}")
                  continue
 
+         print(f"Loaded {len(agents)} agents from HuggingFace")
          return agents
 
      except Exception as e:
          return None
 
 
  def get_hf_token():
      """Get HuggingFace token from environment variables."""
      token = os.getenv('HF_TOKEN')
      return token
 
 
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
      """
      Upload file to HuggingFace with exponential backoff retry logic.
                  token=token
              )
              if attempt > 0:
+                 print(f"  Upload succeeded on attempt {attempt + 1}/{max_retries}")
              return True
 
          except Exception as e:
              if attempt < max_retries - 1:
                  wait_time = delay + random.uniform(0, 1.0)
+                 print(f"  Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                 print(f"  Retrying in {wait_time:.1f} seconds...")
                  time.sleep(wait_time)
                  delay = min(delay * 2, 60.0)  # Exponential backoff, max 60s
              else:
+                 print(f"  Upload failed after {max_retries} attempts: {str(e)}")
                  raise
 
              repo_type="dataset",
              token=token
          )
+         print(f"Saved agent to HuggingFace: {filename}")
          return True
      finally:
          # Always clean up local file, even if upload fails
              os.remove(filename)
 
      except Exception as e:
+         print(f"Error saving agent: {str(e)}")
          return False
 
 
+ def load_leaderboard_data_from_hf():
      """
+     Load leaderboard data and monthly metrics from HuggingFace dataset.
 
      Returns:
+         dict: Dictionary with 'leaderboard', 'monthly_metrics', and 'metadata' keys
+               Returns None if file doesn't exist or error occurs
      """
      try:
          token = get_hf_token()
+         filename = "swe-issue.json"
 
+         # Download file
+         file_path = hf_hub_download_with_backoff(
              repo_id=LEADERBOARD_REPO,
+             filename=filename,
              repo_type="dataset",
+             token=token
          )
 
+         # Load JSON data
+         with open(file_path, 'r') as f:
+             data = json.load(f)
 
+         last_updated = data.get('metadata', {}).get('last_updated', 'Unknown')
+         print(f"Loaded leaderboard data from HuggingFace (last updated: {last_updated})")
 
+         return data
 
      except Exception as e:
+         print(f"Could not load leaderboard data from HuggingFace: {str(e)}")
+         return None
 
 
  # =============================================================================
  # UI FUNCTIONS
  # =============================================================================
 
  def create_monthly_metrics_plot(top_n=5):
      """
      Create a Plotly figure with dual y-axes showing:
      Args:
          top_n: Number of top agents to show (default: 5)
      """
+     # Load from saved dataset
+     saved_data = load_leaderboard_data_from_hf()
+
+     if not saved_data or 'monthly_metrics' not in saved_data:
+         # Return an empty figure with a message
+         fig = go.Figure()
+         fig.add_annotation(
+             text="No data available for visualization",
+             xref="paper", yref="paper",
+             x=0.5, y=0.5, showarrow=False,
+             font=dict(size=16)
+         )
+         fig.update_layout(
+             title=None,
+             xaxis_title=None,
+             height=500
+         )
+         return fig
+
+     metrics = saved_data['monthly_metrics']
+     print(f"Loaded monthly metrics from saved dataset")
+
+     # Apply top_n filter if specified
+     if top_n is not None and top_n > 0 and metrics.get('agents'):
+         # Calculate total issues for each agent
+         agent_totals = []
+         for agent_name in metrics['agents']:
+             agent_data = metrics['data'].get(agent_name, {})
+             total_issues = sum(agent_data.get('total_issues', []))
+             agent_totals.append((agent_name, total_issues))
+
+         # Sort by total issues and take top N
+         agent_totals.sort(key=lambda x: x[1], reverse=True)
+         top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
+
+         # Filter metrics to only include top agents
+         metrics = {
+             'agents': top_agents,
+             'months': metrics['months'],
+             'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
+         }
 
      if not metrics['agents'] or not metrics['months']:
          # Return an empty figure with a message
 
      # Create figure with secondary y-axis
      fig = make_subplots(specs=[[{"secondary_y": True}]])
 
+     # Generate unique colors for many agents using HSL color space
+     def generate_color(index, total):
+         """Generate distinct colors using HSL color space for better distribution"""
+         hue = (index * 360 / total) % 360
+         saturation = 70 + (index % 3) * 10  # Vary saturation slightly
+         lightness = 45 + (index % 2) * 10  # Vary lightness slightly
+         return f'hsl({hue}, {saturation}%, {lightness}%)'
+
      agents = metrics['agents']
      months = metrics['months']
      data = metrics['data']
 
+     # Generate colors for all agents
      agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
 
      # Add traces for each agent
+     for idx, agent_name in enumerate(agents):
          color = agent_colors[agent_name]
          agent_data = data[agent_name]
 
                  name=agent_name,
                  mode='lines+markers',
                  line=dict(color=color, width=2),
+                 marker=dict(size=8),
                  legendgroup=agent_name,
+                 showlegend=(top_n is not None and top_n <= 10),  # Show legend for top N agents
+                 hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
+                               'Month: %{x}<br>' +
                                'Resolved Rate: %{y:.2f}%<br>' +
                                '<extra></extra>'
              ),
 
                  name=agent_name,
                  marker=dict(color=color, opacity=0.6),
                  legendgroup=agent_name,
+                 showlegend=False,  # Hide duplicate legend entry (already shown in Scatter)
+                 hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
+                               'Month: %{x}<br>' +
                                'Total Issues: %{y}<br>' +
                                '<extra></extra>',
                  offsetgroup=agent_name  # Group bars by agent for proper spacing
 
      # Update axes labels
      fig.update_xaxes(title_text=None)
+     fig.update_yaxes(
+         title_text="<b>Resolved Rate (%)</b>",
+         range=[0, 100],
+         secondary_y=False,
+         showticklabels=True,
+         tickmode='linear',
+         dtick=10,
+         showgrid=True
+     )
      fig.update_yaxes(title_text="<b>Total Issues</b>", secondary_y=True)
 
      # Update layout
+     show_legend = (top_n is not None and top_n <= 10)
      fig.update_layout(
          title=None,
+         hovermode='closest',  # Show individual agent info on hover
          barmode='group',
          height=600,
+         showlegend=show_legend,
+         margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50)  # More right margin when legend is shown
      )
 
      return fig
 
 
  def get_leaderboard_dataframe():
      """
+     Load leaderboard from saved dataset and convert to pandas DataFrame for display.
      Returns formatted DataFrame sorted by total issues.
      """
+     # Load from saved dataset
+     saved_data = load_leaderboard_data_from_hf()
 
+     if not saved_data or 'leaderboard' not in saved_data:
+         print(f"No leaderboard data available")
+         # Return empty DataFrame with correct columns if no data
+         column_names = [col[0] for col in LEADERBOARD_COLUMNS]
+         return pd.DataFrame(columns=column_names)
+
+     cache_dict = saved_data['leaderboard']
+     last_updated = saved_data.get('metadata', {}).get('last_updated', 'Unknown')
+     print(f"Loaded leaderboard from saved dataset (last updated: {last_updated})")
+     print(f"Cache dict size: {len(cache_dict)}")
 
      if not cache_dict:
+         print("WARNING: cache_dict is empty!")
          # Return empty DataFrame with correct columns if no data
          column_names = [col[0] for col in LEADERBOARD_COLUMNS]
          return pd.DataFrame(columns=column_names)
 
      rows = []
+     filtered_count = 0
+     for identifier, data in cache_dict.items():
+         total_issues = data.get('total_issues', 0)
+         print(f"  Agent '{identifier}': {total_issues} issues")
+
          # Filter out agents with zero total issues
+         if total_issues == 0:
+             filtered_count += 1
              continue
+
          # Only include display-relevant fields
          rows.append([
              data.get('name', 'Unknown'),
              data.get('website', 'N/A'),
+             total_issues,
              data.get('resolved_issues', 0),
              data.get('resolved_rate', 0.0),
          ])
 
+     print(f"Filtered out {filtered_count} agents with 0 issues")
+     print(f"Leaderboard will show {len(rows)} agents")
+
      # Create DataFrame
      column_names = [col[0] for col in LEADERBOARD_COLUMNS]
      df = pd.DataFrame(rows, columns=column_names)
 
      if "Total Issues" in df.columns and not df.empty:
          df = df.sort_values(by="Total Issues", ascending=False).reset_index(drop=True)
 
+     print(f"Final DataFrame shape: {df.shape}")
+     print("="*60 + "\n")
+
      return df
 
 
+ def submit_agent(identifier, agent_name, organization, website):
      """
      Submit a new agent to the leaderboard.
      Validates input and saves submission.
      """
      # Validate required fields
      if not identifier or not identifier.strip():
+         return "ERROR: GitHub identifier is required", gr.update()
      if not agent_name or not agent_name.strip():
+         return "ERROR: Agent name is required", gr.update()
+     if not organization or not organization.strip():
+         return "ERROR: Organization name is required", gr.update()
      if not website or not website.strip():
+         return "ERROR: Website URL is required", gr.update()
 
      # Clean inputs
      identifier = identifier.strip()
      agent_name = agent_name.strip()
+     organization = organization.strip()
      website = website.strip()
 
      # Validate GitHub identifier
      is_valid, message = validate_github_username(identifier)
      if not is_valid:
+         return f"ERROR: {message}", gr.update()
 
      # Check for duplicates by loading agents from HuggingFace
      agents = load_agents_from_hf()
      if agents:
          existing_names = {agent['github_identifier'] for agent in agents}
          if identifier in existing_names:
+             return f"WARNING: Agent with identifier '{identifier}' already exists", gr.update()
 
      # Create submission
      submission = {
          'name': agent_name,
+         'organization': organization,
          'github_identifier': identifier,
          'website': website,
+         'status': 'public'
      }
 
      # Save to HuggingFace
      if not save_agent_to_hf(submission):
+         return "ERROR: Failed to save submission", gr.update()
 
+     # Return success message - data will be populated by backend updates
+     return f"SUCCESS: Successfully submitted {agent_name}! Issue data will be automatically populated by the backend system via the maintainers.", gr.update()
+
+
+ # =============================================================================
+ # DATA RELOAD FUNCTION
+ # =============================================================================
+
+ def reload_leaderboard_data():
+     """
+     Reload leaderboard data from HuggingFace.
+     This function is called by the scheduler on a daily basis.
+     """
+     print(f"\n{'='*80}")
+     print(f"Reloading leaderboard data from HuggingFace...")
+     print(f"{'='*80}\n")
+
+     try:
+         data = load_leaderboard_data_from_hf()
+         if data:
+             print(f"Successfully reloaded leaderboard data")
+             print(f"  Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
+             print(f"  Agents: {len(data.get('leaderboard', {}))}")
+         else:
+             print(f"No data available")
+     except Exception as e:
+         print(f"Error reloading leaderboard data: {str(e)}")
+
+     print(f"{'='*80}\n")
 
 
  # =============================================================================
  # GRADIO APPLICATION
  # =============================================================================
 
+ print(f"\nStarting SWE Agent Issue Leaderboard")
+ print(f"  Data source: {LEADERBOARD_REPO}")
+ print(f"  Reload frequency: Daily at 12:00 AM UTC\n")
 
+ # Start APScheduler for daily data reload at 12:00 AM UTC
  scheduler = BackgroundScheduler(timezone="UTC")
  scheduler.add_job(
+     reload_leaderboard_data,
+     trigger=CronTrigger(hour=0, minute=0),  # 12:00 AM UTC daily
+     id='daily_data_reload',
+     name='Daily Data Reload',
      replace_existing=True
  )
  scheduler.start()
  print(f"\n{'='*80}")
+ print(f"Scheduler initialized successfully")
+ print(f"Reload schedule: Daily at 12:00 AM UTC")
+ print(f"On startup: Loads cached data from HuggingFace on demand")
  print(f"{'='*80}\n")
 
  # Create Gradio interface
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
+     gr.Markdown("# SWE Agent Issue Leaderboard")
      gr.Markdown(f"Track and compare GitHub issue resolution statistics for SWE agents")
 
      with gr.Tabs():
 
          # Leaderboard Tab
+         with gr.Tab("Leaderboard"):
+             gr.Markdown("*Statistics are based on agent issue resolution activity tracked by the system*")
              leaderboard_table = Leaderboard(
+                 value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]),  # Empty initially
                  datatype=LEADERBOARD_COLUMNS,
                  search_columns=["Agent Name", "Website"],
                  filter_columns=[
                  ]
              )
 
+             # Load leaderboard data when app starts
+             app.load(
+                 fn=get_leaderboard_dataframe,
+                 inputs=[],
+                 outputs=[leaderboard_table]
+             )
+
+             # Monthly Metrics Section
+             gr.Markdown("---")  # Divider
+             gr.Markdown("### Monthly Performance - Top 5 Agents")
+             gr.Markdown("*Shows resolved rate trends and issue volumes for the most active agents*")
+
+             monthly_metrics_plot = gr.Plot(label="Monthly Metrics")
 
+             # Load monthly metrics when app starts
+             app.load(
+                 fn=lambda: create_monthly_metrics_plot(),
+                 inputs=[],
+                 outputs=[monthly_metrics_plot]
              )
 
+
          # Submit Agent Tab
+         with gr.Tab("Submit Agent"):
+
              gr.Markdown("### Submit Your Agent")
+             gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
+
              with gr.Row():
                  with gr.Column():
                      github_input = gr.Textbox(
                          label="GitHub Identifier*",
+                         placeholder="Your agent username (e.g., my-agent[bot])"
                      )
                      name_input = gr.Textbox(
                          label="Agent Name*",
                          placeholder="Your agent's display name"
                      )
+
                  with gr.Column():
+                     organization_input = gr.Textbox(
+                         label="Organization*",
+                         placeholder="Your organization or team name"
                      )
                      website_input = gr.Textbox(
+                         label="Website*",
                          placeholder="https://your-agent-website.com"
                      )
+
              submit_button = gr.Button(
                  "Submit Agent",
                  variant="primary"
                  label="Submission Status",
                  interactive=False
              )
+
              # Event handler
              submit_button.click(
                  fn=submit_agent,
+                 inputs=[github_input, name_input, organization_input, website_input],
+                 outputs=[submission_status, leaderboard_table]
              )
 
 
  # Launch application
  if __name__ == "__main__":
+     app.launch()
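After this commit, app.py computes nothing itself: it only renders whatever `swe-issue.json` in `SWE-Arena/leaderboard_metadata` already contains, and msr.py (further below) is the producer of that file. The sketch that follows is a minimal, hypothetical illustration of the payload shape implied by the diff (`leaderboard`, `monthly_metrics`, and `metadata` keys); the agent entry, the numbers, and the column labels are invented for the example, not taken from the dataset.

```python
import pandas as pd

# Hypothetical swe-issue.json payload, shaped like the one app.py consumes.
payload = {
    "leaderboard": {
        "example-agent[bot]": {           # keyed by github_identifier
            "name": "Example Agent",
            "website": "https://example.com",
            "total_issues": 120,
            "resolved_issues": 80,
            "resolved_rate": 66.67,       # percent, as written by the miner
        }
    },
    "monthly_metrics": {
        "agents": ["Example Agent"],
        "months": ["2025-01", "2025-02"],
        "data": {"Example Agent": {"resolved_rates": [60.0, 72.5],
                                   "total_issues": [50, 70]}},
    },
    "metadata": {"last_updated": "2025-02-01T00:00:00+00:00"},
}

# Same transformation get_leaderboard_dataframe() applies: drop zero-issue
# agents, keep the display fields, and sort by total issues descending.
rows = [
    [v["name"], v["website"], v["total_issues"], v["resolved_issues"], v["resolved_rate"]]
    for v in payload["leaderboard"].values()
    if v.get("total_issues", 0) > 0
]
columns = ["Agent Name", "Website", "Total Issues", "Resolved Issues", "Resolved Rate"]
print(pd.DataFrame(rows, columns=columns).sort_values("Total Issues", ascending=False))
```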
docker-compose.yml ADDED
@@ -0,0 +1,23 @@
+ services:
+   msr-miner:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     container_name: gharchive-miner
+     restart: unless-stopped
+     env_file:
+       - .env
+     volumes:
+       # Mount entire workspace for live code updates
+       - .:/app
+       # Mount gharchive workspace for data storage
+       - ../gharchive:/gharchive:ro
+       # Mount bot data for agent repository storage
+       - ../bot_data:/bot_data:ro
+     environment:
+       - PYTHONUNBUFFERED=1
+     logging:
+       driver: "json-file"
+       options:
+         max-size: "10m"
+         max-file: "3"
msr.py CHANGED
@@ -1,18 +1,19 @@
- """
- Minimalist Issue Metadata Mining Script
- Mines issue metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
- """
-
  import json
  import os
- import tempfile
  from datetime import datetime, timezone, timedelta
  from collections import defaultdict
  from huggingface_hub import HfApi, hf_hub_download
  from huggingface_hub.errors import HfHubHTTPError
  from dotenv import load_dotenv
- from google.cloud import bigquery
  import backoff
 
  # Load environment variables
  load_dotenv()
@@ -21,75 +22,39 @@ load_dotenv()
  # CONFIGURATION
  # =============================================================================
 
- AGENTS_REPO = "SWE-Arena/bot_metadata"
- ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
- LEADERBOARD_TIME_FRAME_DAYS = 180  # Time frame for leaderboard
 
- # =============================================================================
- # HUGGINGFACE API WRAPPERS WITH BACKOFF
- # =============================================================================
 
- def is_rate_limit_error(e):
-     """Check if the exception is a rate limit error (429)."""
-     return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def list_repo_files_with_backoff(api, **kwargs):
-     """List repo files with exponential backoff on rate limit errors."""
-     return api.list_repo_files(**kwargs)
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def hf_hub_download_with_backoff(**kwargs):
-     """Download from HF Hub with exponential backoff on rate limit errors."""
-     return hf_hub_download(**kwargs)
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def upload_file_with_backoff(api, **kwargs):
-     """Upload file with exponential backoff on rate limit errors."""
-     return api.upload_file(**kwargs)
-
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def upload_folder_with_backoff(api, **kwargs):
-     """Upload folder with exponential backoff on rate limit errors."""
-     return api.upload_folder(**kwargs)
 
  # =============================================================================
  # UTILITY FUNCTIONS
@@ -116,7 +81,32 @@ def save_jsonl(filename, data):
      """Save list of dictionaries to JSONL file."""
      with open(filename, 'w', encoding='utf-8') as f:
          for item in data:
-             f.write(json.dumps(item) + '\n')
 
 
  def get_hf_token():
@@ -127,581 +117,498 @@ def get_hf_token():
      return token
 
 
- def get_bigquery_client():
-     """
-     Initialize BigQuery client using credentials from environment variable.
 
-     Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
-     the service account JSON credentials as a string.
-     """
-     # Get the JSON content from environment variable
-     creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
 
-     if creds_json:
-         # Create a temporary file to store credentials
-         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
-             temp_file.write(creds_json)
-             temp_path = temp_file.name
 
-         # Set environment variable to point to temp file
-         os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
 
-         # Initialize BigQuery client
-         client = bigquery.Client()
 
-         # Clean up temp file
-         os.unlink(temp_path)
 
-         return client
-     else:
-         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
- def generate_table_union_statements(start_date, end_date):
-     """
-     Generate UNION ALL statements for githubarchive.month tables in date range.
 
-     Args:
-         start_date: Start datetime
-         end_date: End datetime
 
-     Returns:
-         String with UNION ALL SELECT statements for all monthly tables in range
-     """
-     table_names = []
 
-     # Start from the beginning of start_date's month
-     current_date = start_date.replace(day=1)
-     end_month = end_date.replace(day=1)
 
-     while current_date <= end_month:
-         table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
-         table_names.append(table_name)
 
-         # Move to next month
-         if current_date.month == 12:
-             current_date = current_date.replace(year=current_date.year + 1, month=1)
-         else:
-             current_date = current_date.replace(month=current_date.month + 1)
 
-     # Create UNION ALL chain
-     union_parts = [f"SELECT * FROM {table}" for table in table_names]
-     return " UNION ALL ".join(union_parts)
 
 
  # =============================================================================
- # BIGQUERY FUNCTIONS
  # =============================================================================
 
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
-     """
-     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
 
-     Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
-     and correlated subqueries. Each batch query runs much faster than one massive query.
 
-     Args:
-         client: BigQuery client instance
-         identifiers: List of GitHub usernames/bot identifiers
-         start_date: Start datetime (timezone-aware)
-         end_date: End datetime (timezone-aware)
-         batch_size: Number of agents per batch (default: 100)
-         upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
 
-     Returns:
-         Dictionary mapping agent identifier to list of issue metadata
-     """
-     print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
-     print(f"   Batch size: {batch_size} agents per query")
-     print(f"   Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
 
-     # Split identifiers into batches
-     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
-     print(f"   Total batches: {len(batches)}")
 
-     # Collect results from all batches
-     all_metadata = {}
 
-     for batch_num, batch_identifiers in enumerate(batches, 1):
-         print(f"\n{'─'*80}")
-         print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
-         print(f"{'─'*80}")
 
-         try:
-             batch_results = fetch_all_issue_metadata_single_query(
-                 client, batch_identifiers, start_date, end_date
-             )
 
-             # Merge results
-             for identifier, metadata_list in batch_results.items():
-                 if identifier in all_metadata:
-                     all_metadata[identifier].extend(metadata_list)
-                 else:
-                     all_metadata[identifier] = metadata_list
 
-             print(f"   ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
 
-             # Upload immediately after this batch if enabled
-             if upload_immediately and batch_results:
-                 print(f"\n   🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
-                 upload_success = 0
-                 upload_errors = 0
 
-                 for identifier, metadata_list in batch_results.items():
-                     if metadata_list:
-                         if save_issue_metadata_to_hf(metadata_list, identifier):
-                             upload_success += 1
-                         else:
-                             upload_errors += 1
 
-                 print(f"   ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
 
-         except Exception as e:
-             print(f"   ✗ Batch {batch_num} failed: {str(e)}")
-             print(f"   Continuing with remaining batches...")
-             import traceback
-             traceback.print_exc()
-             continue
 
-     print(f"\n{'='*80}")
-     print(f"  All batches completed!")
-     print(f"   Total agents with data: {len(all_metadata)}")
-     total_issues = sum(len(issues) for issues in all_metadata.values())
-     print(f"   Total issues found: {total_issues}")
-     print(f"{'='*80}\n")
 
-     return all_metadata
 
 
- def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
      """
-     Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
 
-     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
-     deduplicates to get the latest state of each issue. Filters by issue author,
-     commenter, or assignee.
 
-     NOTE: This function is designed for smaller batches (~100 agents). For large
-     numbers of agents, use fetch_issue_metadata_batched() instead.
 
      Args:
-         client: BigQuery client instance
-         identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
          start_date: Start datetime (timezone-aware)
          end_date: End datetime (timezone-aware)
 
      Returns:
-         Dictionary mapping agent identifier to list of issue metadata:
-         {
-             'agent-identifier': [
-                 {
-                     'url': Issue URL,
-                     'created_at': Issue creation timestamp,
-                     'closed_at': Close timestamp (if closed, else None),
-                     'state_reason': Reason for closure (completed/not_planned/etc.)
-                 },
-                 ...
-             ],
-             ...
-         }
-     """
-     print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
-     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
-
-     # Generate table UNION statements for issue events
-     issue_tables = generate_table_union_statements(start_date, end_date)
-
-     # Build identifier list (handle both bot and non-bot versions)
-     identifier_set = set()
-     for id in identifiers:
-         identifier_set.add(id)
-         # Also add stripped version without [bot] suffix
-         stripped = id.replace('[bot]', '')
-         if stripped != id:
-             identifier_set.add(stripped)
-
-     # Convert to array literal for UNNEST (avoids query size limits from large IN clauses)
-     identifier_array = '[' + ', '.join([f'"{id}"' for id in identifier_set]) + ']'
-
-     print(f"   Total identifiers (including bot/non-bot variants): {len(identifier_set)}")
-
-     # Build comprehensive query with CTEs using UNNEST instead of large IN clauses
-     query = f"""
-     WITH agent_identifiers AS (
-         -- Create a table from the identifier array to avoid massive IN clauses
-         SELECT identifier
-         FROM UNNEST({identifier_array}) AS identifier
-     ),
-
-     issue_events AS (
-         -- Get all issue events and comment events for ALL agents
-         SELECT
-             JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
-             JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
-             repo.name as repo_name,
-             created_at as event_time
-         FROM (
-             {issue_tables}
-         )
-         WHERE
-             type IN ('IssuesEvent', 'IssueCommentEvent')
-             -- Exclude pull requests (they have pull_request field)
-             AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
-             AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
-             -- Filter by author OR commenter OR assignee using JOIN instead of IN
-             AND (
-                 JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
-                 OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
-                 OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
-             )
-     ),
-
-     latest_states AS (
-         -- Deduplicate to get latest state for each issue
-         SELECT
-             url,
-             created_at,
-             closed_at,
-             state_reason,
-             author,
-             assignee,
-             commenter
-         FROM issue_events
-         QUALIFY ROW_NUMBER() OVER (
-             PARTITION BY repo_name, issue_number
-             ORDER BY event_time DESC
-         ) = 1
-     ),
-
-     agent_issues AS (
-         -- Map each issue to its relevant agent(s)
-         SELECT DISTINCT
-             CASE
-                 WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
-                 WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
-                 WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
-                 ELSE NULL
-             END as agent_identifier,
-             url,
-             created_at,
-             closed_at,
-             state_reason
-         FROM latest_states
-         WHERE
-             author IN (SELECT identifier FROM agent_identifiers)
-             OR commenter IN (SELECT identifier FROM agent_identifiers)
-             OR assignee IN (SELECT identifier FROM agent_identifiers)
-     )
-
-     SELECT
-         agent_identifier,
-         url,
-         created_at,
-         closed_at,
-         state_reason
-     FROM agent_issues
-     WHERE agent_identifier IS NOT NULL
-     ORDER BY agent_identifier, created_at DESC
      """
 
-     # Calculate number of days for reporting
-     query_days = (end_date - start_date).days
-
-     print(f"   Querying {query_days} days for issue and comment events...")
-     print(f"   Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
-
-     try:
-         query_job = client.query(query)
-         results = list(query_job.result())
-
-         print(f"   ✓ Found {len(results)} total issue records across all agents")
-
-         # Group results by agent
-         metadata_by_agent = defaultdict(list)
-
-         for row in results:
-             agent_id = row.agent_identifier
-
-             # Convert datetime objects to ISO strings
-             created_at = row.created_at
-             if hasattr(created_at, 'isoformat'):
-                 created_at = created_at.isoformat()
-
-             closed_at = row.closed_at
-             if hasattr(closed_at, 'isoformat'):
-                 closed_at = closed_at.isoformat()
-
-             metadata_by_agent[agent_id].append({
-                 'url': row.url,
-                 'created_at': created_at,
-                 'closed_at': closed_at,
-                 'state_reason': row.state_reason,
-             })
-
-         # Print breakdown by agent
-         print(f"\n   📊 Results breakdown by agent:")
-         for identifier in identifiers:
-             # Check both original and stripped versions
-             count = len(metadata_by_agent.get(identifier, []))
-             stripped = identifier.replace('[bot]', '')
-             if stripped != identifier:
-                 count += len(metadata_by_agent.get(stripped, []))
-
-             if count > 0:
-                 # Merge both versions if needed
-                 all_metadata = metadata_by_agent.get(identifier, []) + metadata_by_agent.get(stripped, [])
-                 completed_count = sum(1 for m in all_metadata if m['state_reason'] == 'completed')
-                 closed_count = sum(1 for m in all_metadata if m['closed_at'] is not None)
-                 open_count = count - closed_count
-                 print(f"      {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")
-
-         # Convert defaultdict to regular dict and merge bot/non-bot versions
-         final_metadata = {}
-         for identifier in identifiers:
-             combined = metadata_by_agent.get(identifier, [])
-             stripped = identifier.replace('[bot]', '')
-             if stripped != identifier and stripped in metadata_by_agent:
-                 combined.extend(metadata_by_agent[stripped])
-
-             if combined:
-                 final_metadata[identifier] = combined
-
-         return final_metadata
 
-     except Exception as e:
-         print(f"   ✗ BigQuery error: {str(e)}")
-         import traceback
-         traceback.print_exc()
-         return {}
 
- # =============================================================================
- # HUGGINGFACE STORAGE FUNCTIONS
- # =============================================================================
 
- def group_metadata_by_date(metadata_list):
-     """
-     Group issue metadata by exact date (year.month.day) for efficient daily storage.
-     Returns dict: {(year, month, day): [metadata_list]}
-     """
-     grouped = defaultdict(list)
 
-     for issue_meta in metadata_list:
-         created_at = issue_meta.get('created_at')
-         if not created_at:
              continue
 
-         try:
-             dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
-             key = (dt.year, dt.month, dt.day)
-             grouped[key].append(issue_meta)
-         except Exception as e:
-             print(f"Warning: Could not parse date '{created_at}': {e}")
 
-     return dict(grouped)
 
 
- def save_issue_metadata_to_hf(metadata_list, agent_identifier):
-     """
-     Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
-     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
-     This function OVERWRITES existing files completely with fresh data from BigQuery.
-     Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
-     Args:
-         metadata_list: List of issue metadata dictionaries
-         agent_identifier: GitHub identifier of the agent (used as folder name)
-     """
-     import shutil
 
-     try:
-         token = get_hf_token()
-         if not token:
-             raise Exception("No HuggingFace token found")
 
-         api = HfApi(token=token)
 
-         # Group by date (year, month, day)
-         grouped = group_metadata_by_date(metadata_list)
 
-         if not grouped:
-             print(f"  No valid metadata to save for {agent_identifier}")
-             return False
 
-         # Create a temporary directory for batch upload
-         temp_dir = tempfile.mkdtemp()
-         agent_folder = os.path.join(temp_dir, agent_identifier)
-         os.makedirs(agent_folder, exist_ok=True)
 
-         try:
-             print(f"  📦 Preparing batch upload for {len(grouped)} daily files...")
-
-             # Process each daily file
-             for (issue_year, month, day), day_metadata in grouped.items():
-                 filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
-                 local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
-
-                 # Sort by created_at for better organization
-                 day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
-
-                 # Save to temp directory (complete overwrite, no merging)
-                 save_jsonl(local_filename, day_metadata)
-                 print(f"    Prepared {len(day_metadata)} issues for {filename}")
-
-             # Upload entire folder using upload_folder (single commit per agent)
-             print(f"  🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-             upload_folder_with_backoff(
-                 api,
-                 folder_path=temp_dir,
-                 repo_id=ISSUE_METADATA_REPO,
-                 repo_type="dataset",
-                 commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
-             )
-             print(f"  ✓ Batch upload complete for {agent_identifier}")
 
              return True
-
-         finally:
-             # Always clean up temp directory
-             if os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-
      except Exception as e:
-         print(f"Error saving issue metadata: {str(e)}")
-         import traceback
-         traceback.print_exc()
-         return False
 
 
  def load_agents_from_hf():
      """
-     Load all agent metadata JSON files from HuggingFace dataset.
-
-     The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
      """
-     try:
-         api = HfApi()
-         agents = []
 
-         # List all files in the repository
-         files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
 
-         # Filter for JSON files only
-         json_files = [f for f in files if f.endswith('.json')]
 
-         print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
 
-         # Download and parse each JSON file
-         for json_file in json_files:
-             try:
-                 file_path = hf_hub_download_with_backoff(
-                     repo_id=AGENTS_REPO,
-                     filename=json_file,
-                     repo_type="dataset"
-                 )
 
-                 with open(file_path, 'r') as f:
                      agent_data = json.load(f)
 
-                 # Only process agents with status == "public"
-                 if agent_data.get('status') != 'public':
-                     continue
 
-                 # Extract github_identifier from filename (remove .json extension)
-                 github_identifier = json_file.replace('.json', '')
-                 agent_data['github_identifier'] = github_identifier
 
-                 agents.append(agent_data)
 
              except Exception as e:
-                 print(f"Warning: Could not load {json_file}: {str(e)}")
                  continue
 
-     print(f"✓ Loaded {len(agents)} agents from HuggingFace")
-     return agents
-
-     except Exception as e:
-         print(f"Could not load agents from HuggingFace: {str(e)}")
-         return []
-
 
- # =============================================================================
- # LEADERBOARD CALCULATION FUNCTIONS
- # =============================================================================
 
  def calculate_issue_stats_from_metadata(metadata_list):
-     """
-     Calculate statistics from a list of issue metadata.
-
-     Returns:
-         dict: Issue statistics including total, closed, resolved counts and rate
-     """
      total_issues = len(metadata_list)
-
-     # Count closed issues (those with closed_at timestamp)
-     closed_issues = sum(1 for issue_meta in metadata_list
-                         if issue_meta.get('closed_at') is not None)
-
-     # Count completed issues (subset of closed issues with state_reason="completed")
-     completed = sum(1 for issue_meta in metadata_list
                      if issue_meta.get('state_reason') == 'completed')
 
-     # Calculate resolved rate as: completed / closed (not completed / total)
-     resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0
 
      return {
          'total_issues': total_issues,
-         'closed_issues': closed_issues,
-         'resolved_issues': completed,
          'resolved_rate': round(resolved_rate, 2),
      }
 
 
- def calculate_monthly_metrics(all_metadata, agents):
-     """
-     Calculate monthly metrics for all agents for visualization.
-
-     Args:
-         all_metadata: Dictionary mapping agent_identifier to list of issue metadata
-         agents: List of agent dictionaries with metadata
 
-     Returns:
-         dict: {
-             'agents': list of agent names,
-             'months': list of month labels (e.g., '2025-01'),
-             'data': {
-                 agent_name: {
-                     'resolved_rates': list of resolved rates by month,
-                     'total_issues': list of issue counts by month,
-                     'resolved_issues': list of resolved issue counts by month
-                 }
-             }
-         }
-     """
-     # Create mapping from agent_identifier to agent_name
-     identifier_to_name = {
-         agent.get('github_identifier'): agent.get('name', agent.get('name', agent.get('github_identifier')))
-         for agent in agents if agent.get('github_identifier')
-     }
 
-     # Group by agent and month
      agent_month_data = defaultdict(lambda: defaultdict(list))
 
-     for identifier, metadata_list in all_metadata.items():
-         agent_name = identifier_to_name.get(identifier, identifier)
-
          for issue_meta in metadata_list:
              created_at = issue_meta.get('created_at')
              if not created_at:
                  continue
 
              try:
                  dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                  month_key = f"{dt.year}-{dt.month:02d}"
@@ -710,42 +617,38 @@ def calculate_monthly_metrics(all_metadata, agents):
                  print(f"Warning: Could not parse date '{created_at}': {e}")
                  continue
 
-     # Get all unique months and sort them
      all_months = set()
      for agent_data in agent_month_data.values():
          all_months.update(agent_data.keys())
      months = sorted(list(all_months))
 
-     # Calculate metrics for each agent and month
      result_data = {}
      for agent_name, month_dict in agent_month_data.items():
          resolved_rates = []
          total_issues_list = []
          resolved_issues_list = []
 
          for month in months:
              issues_in_month = month_dict.get(month, [])
 
-             # Count completed issues (those with state_reason="completed")
-             completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
-
-             # Count closed issues (those with closed_at timestamp)
-             closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
-
-             # Total issues created in this month
              total_count = len(issues_in_month)
 
-             # Calculate resolved rate as: completed / closed (not completed / total)
-             resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
 
              resolved_rates.append(resolved_rate)
              total_issues_list.append(total_count)
-             resolved_issues_list.append(completed_count)
 
          result_data[agent_name] = {
              'resolved_rates': resolved_rates,
              'total_issues': total_issues_list,
-             'resolved_issues': resolved_issues_list
          }
 
      agents_list = sorted(list(agent_month_data.keys()))
@@ -757,168 +660,175 @@ def calculate_monthly_metrics(all_metadata, agents):
      }
 
 
- def save_leaderboard_and_metrics_to_hf(all_metadata, agents):
-     """
-     Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
-     If the file exists, it will be overwritten.
 
-     Args:
-         all_metadata: Dictionary mapping agent_identifier to list of issue metadata
-         agents: List of agent dictionaries with metadata
 
-     Returns:
-         bool: True if successful, False otherwise
-     """
-     import io
 
      try:
          token = get_hf_token()
          if not token:
              raise Exception("No HuggingFace token found")
 
          api = HfApi(token=token)
 
-         print(f"\n{'='*80}")
-         print(f"📊 Preparing leaderboard and metrics data for upload...")
-         print(f"{'='*80}\n")
-
-         # Build leaderboard data
-         print("  Constructing leaderboard data...")
-         leaderboard_data = {}
-
-         for agent in agents:
-             identifier = agent.get('github_identifier')
-             agent_name = agent.get('name', 'Unknown')
-
-             if not identifier:
-                 continue
-
-             metadata = all_metadata.get(identifier, [])
-             stats = calculate_issue_stats_from_metadata(metadata)
-
-             leaderboard_data[identifier] = {
-                 'name': agent_name,
-                 'website': agent.get('website', 'N/A'),
-                 'github_identifier': identifier,
-                 **stats
-             }
-
-         # Get monthly metrics data
-         print("  Calculating monthly metrics...")
-         monthly_metrics = calculate_monthly_metrics(all_metadata, agents)
-
-         # Combine into a single structure
          combined_data = {
-             "leaderboard": leaderboard_data,
-             "monthly_metrics": monthly_metrics,
-             "metadata": {
-                 "last_updated": datetime.now(timezone.utc).isoformat(),
-                 "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
-                 "total_agents": len(leaderboard_data)
              }
          }
 
-         print(f"   Leaderboard entries: {len(leaderboard_data)}")
-         print(f"   Monthly metrics for: {len(monthly_metrics['agents'])} agents")
-         print(f"   Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
-
-         # Convert to JSON and create file-like object
-         json_content = json.dumps(combined_data, indent=2)
-         file_like_object = io.BytesIO(json_content.encode('utf-8'))
-
-         # Upload to HuggingFace (will overwrite if exists)
-         print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
-         upload_file_with_backoff(
-             api,
-             path_or_fileobj=file_like_object,
-             path_in_repo="swe-issue.json",
-             repo_id=LEADERBOARD_REPO,
-             repo_type="dataset",
-             token=token,
-             commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
-         )
-
-         print(f"   ✓ Successfully uploaded swe-issue.json")
-         print(f"{'='*80}\n")
 
-         return True
 
      except Exception as e:
-         print(f"Error saving leaderboard and metrics: {str(e)}")
          import traceback
          traceback.print_exc()
          return False
 
 
  # =============================================================================
- # MAIN MINING FUNCTION
  # =============================================================================
 
  def mine_all_agents():
      """
-     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
-     Uses ONE BigQuery query for ALL agents (most efficient approach).
      """
-     # Load agent metadata from HuggingFace
      agents = load_agents_from_hf()
      if not agents:
-         print("No agents found in HuggingFace dataset")
          return
 
-     # Extract all identifiers
      identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
870
  if not identifiers:
871
- print("No valid agent identifiers found")
872
  return
873
 
874
- print(f"\n{'='*80}")
875
- print(f"Starting issue metadata mining for {len(identifiers)} agents")
876
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
877
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
878
- print(f"{'='*80}\n")
879
 
880
- # Initialize BigQuery client
881
  try:
882
- client = get_bigquery_client()
883
  except Exception as e:
884
- print(f"Failed to initialize BigQuery client: {str(e)}")
885
  return
886
 
887
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
888
  current_time = datetime.now(timezone.utc)
889
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
890
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
891
 
892
  try:
893
- # Use batched approach for better performance
894
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
895
- all_metadata = fetch_issue_metadata_batched(
896
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
897
  )
898
 
899
- # Calculate summary statistics
900
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
901
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
 
 
 
 
 
 
902
 
903
- print(f"\n{'='*80}")
904
- print(f"✅ BigQuery mining and upload complete!")
905
- print(f" Total agents: {len(agents)}")
906
- print(f" Agents with data: {agents_with_data}")
907
- print(f" Total PRs found: {total_prs}")
908
- print(f"{'='*80}\n")
909
 
910
  except Exception as e:
911
- print(f"Error during BigQuery fetch: {str(e)}")
912
  import traceback
913
  traceback.print_exc()
914
- return
915
 
916
- # After mining is complete, save leaderboard and metrics to HuggingFace
917
- print(f"📤 Uploading leaderboard and metrics data...")
918
- if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
919
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
920
- else:
921
- print(f"⚠️ Failed to upload leaderboard and metrics data")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
 
923
 
924
  # =============================================================================
@@ -926,4 +836,7 @@ def mine_all_agents():
926
  # =============================================================================
927
 
928
  if __name__ == "__main__":
929
- mine_all_agents()
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
+ import time
4
  from datetime import datetime, timezone, timedelta
5
  from collections import defaultdict
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.errors import HfHubHTTPError
9
  from dotenv import load_dotenv
10
+ import duckdb
11
  import backoff
12
+ import requests
13
+ import requests.exceptions
14
+ from apscheduler.schedulers.blocking import BlockingScheduler
15
+ from apscheduler.triggers.cron import CronTrigger
16
+ import logging
17
 
18
  # Load environment variables
19
  load_dotenv()
 
22
  # CONFIGURATION
23
  # =============================================================================
24
 
25
+ AGENTS_REPO = "SWE-Arena/bot_data"
26
+ AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_data") # Local git clone path
27
+ DUCKDB_CACHE_FILE = "cache.duckdb"
28
+ GHARCHIVE_DATA_LOCAL_PATH = os.path.expanduser("~/gharchive/data")
29
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"
30
+ LEADERBOARD_TIME_FRAME_DAYS = 180
31
 
32
+ # Git sync configuration (mandatory to get latest bot data)
33
+ GIT_SYNC_TIMEOUT = 300 # 5-minute timeout for git pull
 
34
 
35
+ # OPTIMIZED DUCKDB CONFIGURATION
36
+ DUCKDB_THREADS = 8
37
+ DUCKDB_MEMORY_LIMIT = "64GB"
38
 
39
+ # Streaming batch configuration
40
+ BATCH_SIZE_DAYS = 7 # Process 1 week at a time (~168 hourly files)
41
+ # At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
42
 
43
+ # Download configuration
44
+ DOWNLOAD_WORKERS = 4
45
+ DOWNLOAD_RETRY_DELAY = 2
46
+ MAX_RETRIES = 5
 
 
 
 
 
 
 
 
 
47
 
48
+ # Upload configuration
49
+ UPLOAD_DELAY_SECONDS = 5
50
+ UPLOAD_MAX_BACKOFF = 3600
 
 
 
 
 
 
 
 
 
 
51
 
52
+ # Scheduler configuration
53
+ SCHEDULE_ENABLED = True
54
+ SCHEDULE_DAY_OF_WEEK = 'sun' # Sunday
55
+ SCHEDULE_HOUR = 0
56
+ SCHEDULE_MINUTE = 0
57
+ SCHEDULE_TIMEZONE = 'UTC'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # =============================================================================
60
  # UTILITY FUNCTIONS
 
81
  """Save list of dictionaries to JSONL file."""
82
  with open(filename, 'w', encoding='utf-8') as f:
83
  for item in data:
84
+ f.write(json.dumps(item) + '\n')
85
+
86
+
87
+ def normalize_date_format(date_string):
88
+ """Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
89
+ if not date_string or date_string == 'N/A':
90
+ return 'N/A'
91
+
92
+ try:
93
+ import re
94
+
95
+ if isinstance(date_string, datetime):
96
+ return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
97
+
98
+ date_string = re.sub(r'\s+', ' ', date_string.strip())
99
+ date_string = date_string.replace(' ', 'T')
100
+
101
+ if len(date_string) >= 3:
102
+ if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
103
+ date_string = date_string + ':00'
104
+
105
+ dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
106
+ return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
107
+ except Exception as e:
108
+ print(f"Warning: Could not parse date '{date_string}': {e}")
109
+ return date_string
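# Illustrative sketch of the normalizer above (sample values are hypothetical, not taken
# from GHArchive): ISO strings with a trailing 'Z', space-separated timestamps, and
# datetime objects should all collapse to the same '%Y-%m-%dT%H:%M:%SZ' form.
_normalize_samples = [
    "2025-01-15T08:30:00Z",           # ISO 8601 with Z suffix
    "2025-01-15 08:30:00",            # space-separated, no timezone
    datetime(2025, 1, 15, 8, 30, 0),  # plain datetime object
]
assert {normalize_date_format(s) for s in _normalize_samples} == {"2025-01-15T08:30:00Z"}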
110
 
111
 
112
  def get_hf_token():
 
117
  return token
118
 
119
 
120
+ # =============================================================================
121
+ # GHARCHIVE DOWNLOAD FUNCTIONS
122
+ # =============================================================================
123
 
124
+ def download_file(url):
125
+ """Download a GHArchive file with retry logic."""
126
+ filename = url.split("/")[-1]
127
+ filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
 
128
 
129
+ if os.path.exists(filepath):
130
+ return True
 
 
 
131
 
132
+ for attempt in range(MAX_RETRIES):
133
+ try:
134
+ response = requests.get(url, timeout=30)
135
+ response.raise_for_status()
136
+ with open(filepath, "wb") as f:
137
+ f.write(response.content)
138
+ return True
139
 
140
+ except requests.exceptions.HTTPError as e:
141
+ # 404 means the file doesn't exist in GHArchive - skip without retry
142
+ if e.response.status_code == 404:
143
+ if attempt == 0: # Only log once, not for each retry
144
+ print(f" ⚠ {filename}: Not available (404) - skipping")
145
+ return False
146
 
147
+ # Other HTTP errors (5xx, etc.) should be retried
148
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
149
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
150
+ time.sleep(wait_time)
151
 
152
+ except Exception as e:
153
+ # Network errors, timeouts, etc. should be retried
154
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
155
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
156
+ time.sleep(wait_time)
157
 
158
+ return False
159
 
 
 
 
160
 
161
+ def download_all_gharchive_data():
162
+ """Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
163
+ os.makedirs(GHARCHIVE_DATA_LOCAL_PATH, exist_ok=True)
164
 
165
+ end_date = datetime.now()
166
+ start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
 
167
 
168
+ urls = []
169
+ current_date = start_date
170
+ while current_date <= end_date:
171
+ date_str = current_date.strftime("%Y-%m-%d")
172
+ for hour in range(24):
173
+ url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
174
+ urls.append(url)
175
+ current_date += timedelta(days=1)
176
 
177
+ downloads_processed = 0
 
 
178
 
179
+ try:
180
+ with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
181
+ futures = [executor.submit(download_file, url) for url in urls]
182
+ for future in as_completed(futures):
183
+ downloads_processed += 1
184
 
185
+ print(f" Download complete: {downloads_processed} files processed")
186
+ return True
187
+
188
+ except Exception as e:
189
+ print(f"Error during download: {str(e)}")
190
+ import traceback
191
+ traceback.print_exc()
192
+ return False
193
 
194
 
195
  # =============================================================================
196
+ # HUGGINGFACE API WRAPPERS
197
  # =============================================================================
198
 
199
+ def is_retryable_error(e):
200
+ """Check if exception is retryable (rate limit or timeout error)."""
201
+ if isinstance(e, HfHubHTTPError):
202
+ if e.response.status_code == 429:
203
+ return True
204
 
205
+ if isinstance(e, (requests.exceptions.Timeout,
206
+ requests.exceptions.ReadTimeout,
207
+ requests.exceptions.ConnectTimeout)):
208
+ return True
209
 
210
+ if isinstance(e, Exception):
211
+ error_str = str(e).lower()
212
+ if 'timeout' in error_str or 'timed out' in error_str:
213
+ return True
 
 
 
214
 
215
+ return False
216
+
217
+
218
+ @backoff.on_exception(
219
+ backoff.expo,
220
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
221
+ max_tries=MAX_RETRIES,
222
+ base=300,
223
+ max_value=3600,
224
+ giveup=lambda e: not is_retryable_error(e),
225
+ on_backoff=lambda details: print(
226
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
227
+ )
228
+ )
229
+ def list_repo_files_with_backoff(api, **kwargs):
230
+ """Wrapper for api.list_repo_files() with exponential backoff."""
231
+ return api.list_repo_files(**kwargs)
232
 
 
 
 
233
 
234
+ @backoff.on_exception(
235
+ backoff.expo,
236
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
237
+ max_tries=MAX_RETRIES,
238
+ base=300,
239
+ max_value=3600,
240
+ giveup=lambda e: not is_retryable_error(e),
241
+ on_backoff=lambda details: print(
242
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
243
+ )
244
+ )
245
+ def hf_hub_download_with_backoff(**kwargs):
246
+ """Wrapper for hf_hub_download() with exponential backoff."""
247
+ return hf_hub_download(**kwargs)
248
 
 
 
 
 
249
 
250
+ @backoff.on_exception(
251
+ backoff.expo,
252
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
253
+ max_tries=MAX_RETRIES,
254
+ base=300,
255
+ max_value=3600,
256
+ giveup=lambda e: not is_retryable_error(e),
257
+ on_backoff=lambda details: print(
258
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
259
+ )
260
+ )
261
+ def upload_file_with_backoff(api, **kwargs):
262
+ """Wrapper for api.upload_file() with exponential backoff."""
263
+ return api.upload_file(**kwargs)
264
 
 
 
 
 
 
 
265
 
266
+ @backoff.on_exception(
267
+ backoff.expo,
268
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
269
+ max_tries=MAX_RETRIES,
270
+ base=300,
271
+ max_value=3600,
272
+ giveup=lambda e: not is_retryable_error(e),
273
+ on_backoff=lambda details: print(
274
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
275
+ )
276
+ )
277
+ def upload_folder_with_backoff(api, **kwargs):
278
+ """Wrapper for api.upload_folder() with exponential backoff."""
279
+ return api.upload_folder(**kwargs)
280
 
 
 
 
 
 
281
 
282
+ def get_duckdb_connection():
283
+ """
284
+ Initialize DuckDB connection with OPTIMIZED memory settings.
285
+ Uses persistent database and reduced memory footprint.
286
+ """
287
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
288
 
289
+ # OPTIMIZED SETTINGS
290
+ conn.execute(f"SET threads TO {DUCKDB_THREADS};")
291
+ conn.execute("SET preserve_insertion_order = false;")
292
+ conn.execute("SET enable_object_cache = true;")
293
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
294
+ conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';") # Per-query limit
295
+ conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';") # Hard cap
296
 
297
+ return conn
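# Minimal usage sketch for the connection above: peek at one downloaded hour of GHArchive
# data. The file name below is hypothetical; any hour present under
# GHARCHIVE_DATA_LOCAL_PATH works. The read_json options mirror the main query further down.
_conn = get_duckdb_connection()
_sample_hour = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, "2025-01-15-0.json.gz")
_n = _conn.execute(
    f"SELECT count(*) FROM read_json('{_sample_hour}', format='newline_delimited', "
    f"compression='gzip', ignore_errors=true) WHERE type = 'IssuesEvent'"
).fetchone()[0]
print(f"IssuesEvent rows in {_sample_hour}: {_n}")
_conn.close()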
298
+
299
+
300
+ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LOCAL_PATH):
301
+ """Generate file path patterns for GHArchive data in date range (only existing files)."""
302
+ file_patterns = []
303
+ missing_dates = set()
304
+
305
+ current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
306
+ end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
307
+
308
+ while current_date <= end_day:
309
+ date_has_files = False
310
+ for hour in range(24):
311
+ pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
312
+ if os.path.exists(pattern):
313
+ file_patterns.append(pattern)
314
+ date_has_files = True
315
+
316
+ if not date_has_files:
317
+ missing_dates.add(current_date.strftime('%Y-%m-%d'))
318
+
319
+ current_date += timedelta(days=1)
320
 
321
+ if missing_dates:
322
+ print(f" Skipping {len(missing_dates)} date(s) with no data")
 
 
 
 
323
 
324
+ return file_patterns
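# Worked sketch (hypothetical date): a single day expands to at most 24 hourly paths,
# e.g. ~/gharchive/data/2025-01-15-0.json.gz through 2025-01-15-23.json.gz; hours that
# were never downloaded (e.g. GHArchive 404s) are simply left out of the returned list.
_day = datetime(2025, 1, 15)
_day_paths = generate_file_path_patterns(_day, _day)
print(f"{len(_day_paths)} of 24 hourly files present for {_day.date()}")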
325
 
326
 
327
+ # =============================================================================
328
+ # STREAMING BATCH PROCESSING FOR ISSUES
329
+ # =============================================================================
330
+
331
+ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
332
  """
333
+ OPTIMIZED: Fetch issue metadata using streaming batch processing.
334
+
335
+ Only tracks issues assigned to the agents.
336
 
337
+ Processes GHArchive files in BATCH_SIZE_DAYS chunks to limit memory usage.
338
+ Instead of loading 180 days (4,344 files) at once, processes 7 days at a time.
 
339
 
340
+ This prevents OOM errors by:
341
+ 1. Only keeping ~168 hourly files in memory per batch (vs 4,344)
342
+ 2. Incrementally building the results dictionary
343
+ 3. Allowing DuckDB to garbage collect after each batch
344
 
345
  Args:
346
+ conn: DuckDB connection instance
347
+ identifiers: List of GitHub usernames/bot identifiers (~1500)
348
  start_date: Start datetime (timezone-aware)
349
  end_date: End datetime (timezone-aware)
350
 
351
  Returns:
352
+ Dictionary mapping agent identifier to list of issue metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  """
354
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
355
+ metadata_by_agent = defaultdict(list)
356
 
357
+ # Calculate total batches
358
+ total_days = (end_date - start_date).days
359
+ total_batches = (total_days // BATCH_SIZE_DAYS) + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
+ # Process in configurable batches
362
+ current_date = start_date
363
+ batch_num = 0
364
+ total_issues = 0
 
365
 
366
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
367
 
368
+ while current_date <= end_date:
369
+ batch_num += 1
370
+ batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
371
 
372
+ # Get file patterns for THIS BATCH ONLY (not all 180 days)
373
+ file_patterns = generate_file_path_patterns(current_date, batch_end)
 
 
 
 
374
 
375
+ if not file_patterns:
376
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
377
+ current_date = batch_end + timedelta(days=1)
378
  continue
379
 
380
+ # Progress indicator
381
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
382
+
383
+ # Build file patterns SQL for THIS BATCH
384
+ file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
385
+
386
+ # Query for this batch - IssuesEvent filtered by assignee only
387
+ query = f"""
388
+ WITH issue_events AS (
389
+ SELECT
390
+ CONCAT(
391
+ REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
392
+ '/issues/',
393
+ CAST(payload.issue.number AS VARCHAR)
394
+ ) as url,
395
+ payload.issue.assignee.login as assignee,
396
+ created_at as event_time,
397
+ payload.issue.created_at as issue_created_at,
398
+ payload.issue.closed_at as issue_closed_at,
399
+ payload.issue.state_reason as state_reason
400
+ FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
401
+ WHERE
402
+ type = 'IssuesEvent'
403
+ AND payload.issue.number IS NOT NULL
404
+ AND payload.issue.pull_request IS NULL
405
+ AND payload.issue.assignee.login IN ({identifier_list})
406
+ ),
407
+ issue_timeline AS (
408
+ SELECT
409
+ url,
410
+ assignee as agent_identifier,
411
+ MIN(issue_created_at) as created_at,
412
+ MAX(issue_closed_at) as closed_at,
413
+ MAX(state_reason) as state_reason
414
+ FROM issue_events
415
+ GROUP BY url, assignee
416
+ )
417
+ SELECT url, agent_identifier, created_at, closed_at, state_reason
418
+ FROM issue_timeline
419
+ WHERE agent_identifier IS NOT NULL AND created_at IS NOT NULL
420
+ """
421
 
422
+ try:
423
+ results = conn.execute(query).fetchall()
424
+ batch_issues = 0
425
+
426
+ # Add results to accumulating dictionary
427
+ for row in results:
428
+ url = row[0]
429
+ agent_identifier = row[1]
430
+ created_at = normalize_date_format(row[2]) if row[2] else None
431
+ closed_at = normalize_date_format(row[3]) if row[3] else None
432
+ state_reason = row[4]
433
+
434
+ if not url or not agent_identifier:
435
+ continue
436
+
437
+ issue_metadata = {
438
+ 'url': url,
439
+ 'created_at': created_at,
440
+ 'closed_at': closed_at,
441
+ 'state_reason': state_reason,
442
+ }
443
 
444
+ metadata_by_agent[agent_identifier].append(issue_metadata)
445
+ batch_issues += 1
446
+ total_issues += 1
447
 
448
+ print(f"✓ {batch_issues} issues found")
 
 
 
449
 
450
+ except Exception as e:
451
+ print(f"\n ✗ Batch {batch_num} error: {str(e)}")
452
+ import traceback
453
+ traceback.print_exc()
454
 
455
+ # Move to next batch
456
+ current_date = batch_end + timedelta(days=1)
 
 
 
457
 
458
+ # Final summary
459
+ agents_with_data = sum(1 for issues in metadata_by_agent.values() if issues)
460
+ print(f"\n ✓ Complete: {total_issues} issues found for {agents_with_data}/{len(identifiers)} agents")
 
461
 
462
+ return dict(metadata_by_agent)
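# Back-of-the-envelope sketch for the defaults above: 180 days in 7-day steps gives
# (180 // 7) + 1 = 26 batches of at most 7 * 24 = 168 hourly files each. The returned
# mapping is keyed by assignee login; the record below uses hypothetical values.
_batches = (LEADERBOARD_TIME_FRAME_DAYS // BATCH_SIZE_DAYS) + 1   # 26
_files_per_batch = BATCH_SIZE_DAYS * 24                           # 168
_example_result = {
    "example-bot[bot]": [{
        "url": "https://github.com/octo-org/octo-repo/issues/42",
        "created_at": "2025-01-15T08:30:00Z",
        "closed_at": None,          # still open
        "state_reason": None,       # becomes "completed" once resolved
    }],
}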
463
 
 
 
464
 
465
+ def sync_agents_repo():
466
+ """
467
+ Sync local bot_data repository with remote using git pull.
468
+ This is MANDATORY to ensure we have the latest bot data.
469
+ Raises exception if sync fails.
470
+ """
471
+ if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
472
+ error_msg = f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}"
473
+ print(f" ✗ {error_msg}")
474
+ print(f" Please clone it first: git clone https://huggingface.co/datasets/{AGENTS_REPO}")
475
+ raise FileNotFoundError(error_msg)
476
 
477
+ if not os.path.exists(os.path.join(AGENTS_REPO_LOCAL_PATH, '.git')):
478
+ error_msg = f"{AGENTS_REPO_LOCAL_PATH} exists but is not a git repository"
479
+ print(f" ✗ {error_msg}")
480
+ raise ValueError(error_msg)
481
 
482
+ try:
483
+ import subprocess
484
+
485
+ # Run git pull with extended timeout due to large repository
486
+ result = subprocess.run(
487
+ ['git', 'pull'],
488
+ cwd=AGENTS_REPO_LOCAL_PATH,
489
+ capture_output=True,
490
+ text=True,
491
+ timeout=GIT_SYNC_TIMEOUT
492
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
494
+ if result.returncode == 0:
495
+ output = result.stdout.strip()
496
+ if "Already up to date" in output or "Already up-to-date" in output:
497
+ print(f" ✓ Repository is up to date")
498
+ else:
499
+ print(f" ✓ Repository synced successfully")
500
+ if output:
501
+ # Print first few lines of output
502
+ lines = output.split('\n')[:5]
503
+ for line in lines:
504
+ print(f" {line}")
505
  return True
506
+ else:
507
+ error_msg = f"Git pull failed: {result.stderr.strip()}"
508
+ print(f" ✗ {error_msg}")
509
+ raise RuntimeError(error_msg)
510
+
511
+ except subprocess.TimeoutExpired:
512
+ error_msg = f"Git pull timed out after {GIT_SYNC_TIMEOUT} seconds"
513
+ print(f" ✗ {error_msg}")
514
+ raise TimeoutError(error_msg)
515
+ except (FileNotFoundError, ValueError, RuntimeError, TimeoutError):
516
+ raise # Re-raise expected exceptions
517
  except Exception as e:
518
+ error_msg = f"Error syncing repository: {str(e)}"
519
+ print(f" ✗ {error_msg}")
520
+ raise RuntimeError(error_msg) from e
 
521
 
522
 
523
  def load_agents_from_hf():
524
  """
525
+ Load all agent metadata JSON files from local git repository.
526
+ ALWAYS syncs with remote first to ensure we have the latest bot data.
 
527
  """
528
+ # MANDATORY: Sync with remote first to get latest bot data
529
+ print(f" Syncing bot_data repository to get latest agents...")
530
+ sync_agents_repo() # Will raise exception if sync fails
531
 
532
+ agents = []
 
533
 
534
+ # Scan local directory for JSON files
535
+ if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
536
+ raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
537
 
538
+ # Walk through the directory to find all JSON files
539
+ files_processed = 0
540
+ print(f" Loading agent metadata from {AGENTS_REPO_LOCAL_PATH}...")
541
 
542
+ for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
543
+ # Skip .git directory
544
+ if '.git' in root:
545
+ continue
546
+
547
+ for filename in files:
548
+ if not filename.endswith('.json'):
549
+ continue
550
 
551
+ files_processed += 1
552
+ file_path = os.path.join(root, filename)
553
+
554
+ try:
555
+ with open(file_path, 'r', encoding='utf-8') as f:
556
  agent_data = json.load(f)
557
 
558
+ # Only include public agents
559
+ if agent_data.get('status') != 'public':
560
+ continue
561
 
562
+ # Extract github_identifier from filename
563
+ github_identifier = filename.replace('.json', '')
564
+ agent_data['github_identifier'] = github_identifier
565
 
566
+ agents.append(agent_data)
567
 
568
  except Exception as e:
569
+ print(f" Error loading {filename}: {str(e)}")
570
  continue
571
 
572
+ print(f" ✓ Loaded {len(agents)} public agents (from {files_processed} total files)")
573
+ return agents
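# Hypothetical example of a single agent record as assembled above (field names are
# inferred from the reads in this function; real files in the bot_data repo may carry
# additional keys). A file named 'example-bot[bot].json' would yield this entry:
_example_agent = {
    "name": "Example Bot",
    "website": "https://example.com",
    "status": "public",                        # non-public files are skipped
    "github_identifier": "example-bot[bot]",   # injected from the filename
}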
 
 
 
 
 
574
 
 
 
 
575
 
576
  def calculate_issue_stats_from_metadata(metadata_list):
577
+ """Calculate statistics from a list of issue metadata."""
 
 
 
 
 
578
  total_issues = len(metadata_list)
579
+ closed = sum(1 for issue_meta in metadata_list if issue_meta.get('closed_at'))
580
+ resolved = sum(1 for issue_meta in metadata_list
 
 
 
 
 
581
  if issue_meta.get('state_reason') == 'completed')
582
 
583
+ # Resolved rate = resolved / closed (not resolved / total)
584
+ resolved_rate = (resolved / closed * 100) if closed > 0 else 0
585
 
586
  return {
587
  'total_issues': total_issues,
588
+ 'closed_issues': closed,
589
+ 'resolved_issues': resolved,
590
  'resolved_rate': round(resolved_rate, 2),
591
  }
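# Worked sketch with three hypothetical issues: 2 closed, 1 of them resolved, 1 still
# open. The open issue is excluded from the denominator, so resolved_rate = 1/2 * 100.
_example_issues = [
    {'closed_at': '2025-01-20T11:05:00Z', 'state_reason': 'completed'},
    {'closed_at': '2025-01-21T09:00:00Z', 'state_reason': 'not_planned'},
    {'closed_at': None, 'state_reason': None},
]
assert calculate_issue_stats_from_metadata(_example_issues) == {
    'total_issues': 3, 'closed_issues': 2, 'resolved_issues': 1, 'resolved_rate': 50.0,
}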
592
 
593
 
594
+ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
595
+ """Calculate monthly metrics for all agents for visualization."""
596
+ identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
 
 
 
 
597
 
598
+ if not all_metadata_dict:
599
+ return {'agents': [], 'months': [], 'data': {}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
 
 
601
  agent_month_data = defaultdict(lambda: defaultdict(list))
602
 
603
+ for agent_identifier, metadata_list in all_metadata_dict.items():
 
 
604
  for issue_meta in metadata_list:
605
  created_at = issue_meta.get('created_at')
606
+
607
  if not created_at:
608
  continue
609
 
610
+ agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
611
+
612
  try:
613
  dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
614
  month_key = f"{dt.year}-{dt.month:02d}"
 
617
  print(f"Warning: Could not parse date '{created_at}': {e}")
618
  continue
619
 
 
620
  all_months = set()
621
  for agent_data in agent_month_data.values():
622
  all_months.update(agent_data.keys())
623
  months = sorted(list(all_months))
624
 
 
625
  result_data = {}
626
  for agent_name, month_dict in agent_month_data.items():
627
  resolved_rates = []
628
  total_issues_list = []
629
  resolved_issues_list = []
630
+ closed_issues_list = []
631
 
632
  for month in months:
633
  issues_in_month = month_dict.get(month, [])
634
 
635
+ resolved_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
636
+ closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at'))
 
 
 
 
 
637
  total_count = len(issues_in_month)
638
 
639
+ # Resolved rate = resolved / closed (not resolved / total)
640
+ resolved_rate = (resolved_count / closed_count * 100) if closed_count > 0 else None
641
 
642
  resolved_rates.append(resolved_rate)
643
  total_issues_list.append(total_count)
644
+ resolved_issues_list.append(resolved_count)
645
+ closed_issues_list.append(closed_count)
646
 
647
  result_data[agent_name] = {
648
  'resolved_rates': resolved_rates,
649
  'total_issues': total_issues_list,
650
+ 'resolved_issues': resolved_issues_list,
651
+ 'closed_issues': closed_issues_list
652
  }
653
 
654
  agents_list = sorted(list(agent_month_data.keys()))
 
660
  }
661
 
662
 
663
+ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
664
+ """Construct leaderboard from in-memory issue metadata."""
665
+ if not agents:
666
+ print("Error: No agents found")
667
+ return {}
668
 
669
+ cache_dict = {}
 
 
670
 
671
+ for agent in agents:
672
+ identifier = agent.get('github_identifier')
673
+ agent_name = agent.get('name', 'Unknown')
674
+
675
+ bot_metadata = all_metadata_dict.get(identifier, [])
676
+ stats = calculate_issue_stats_from_metadata(bot_metadata)
677
+
678
+ cache_dict[identifier] = {
679
+ 'name': agent_name,
680
+ 'website': agent.get('website', 'N/A'),
681
+ 'github_identifier': identifier,
682
+ **stats
683
+ }
684
+
685
+ return cache_dict
686
 
687
+
688
+ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
689
+ """Save leaderboard data and monthly metrics to HuggingFace dataset."""
690
  try:
691
  token = get_hf_token()
692
  if not token:
693
  raise Exception("No HuggingFace token found")
694
 
695
  api = HfApi(token=token)
696
+ filename = "swe-issue.json"
697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  combined_data = {
699
+ 'last_updated': datetime.now(timezone.utc).isoformat(),
700
+ 'leaderboard': leaderboard_dict,
701
+ 'monthly_metrics': monthly_metrics,
702
+ 'metadata': {
703
+ 'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS
 
704
  }
705
  }
706
 
707
+ with open(filename, 'w') as f:
708
+ json.dump(combined_data, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
 
710
+ try:
711
+ upload_file_with_backoff(
712
+ api=api,
713
+ path_or_fileobj=filename,
714
+ path_in_repo=filename,
715
+ repo_id=LEADERBOARD_REPO,
716
+ repo_type="dataset"
717
+ )
718
+ return True
719
+ finally:
720
+ if os.path.exists(filename):
721
+ os.remove(filename)
722
 
723
  except Exception as e:
724
+ print(f"Error saving leaderboard data: {str(e)}")
725
  import traceback
726
  traceback.print_exc()
727
  return False
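# Shape of the swe-issue.json payload written above (all values hypothetical):
_example_payload = {
    "last_updated": "2025-01-19T00:00:00+00:00",
    "leaderboard": {
        "example-bot[bot]": {
            "name": "Example Bot", "website": "https://example.com",
            "github_identifier": "example-bot[bot]",
            "total_issues": 3, "closed_issues": 2,
            "resolved_issues": 1, "resolved_rate": 50.0,
        },
    },
    "monthly_metrics": {"agents": ["Example Bot"], "months": ["2025-01"], "data": {}},
    "metadata": {"leaderboard_time_frame_days": 180},
}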
728
 
729
 
730
  # =============================================================================
731
+ # MINING FUNCTION
732
  # =============================================================================
733
 
734
  def mine_all_agents():
735
  """
736
+ Mine issue metadata for all agents using STREAMING batch processing.
737
+ Downloads GHArchive data, then uses BATCH-based DuckDB queries.
738
  """
739
+ print(f"\n[1/4] Downloading GHArchive data...")
740
+
741
+ if not download_all_gharchive_data():
742
+ print("Warning: Download had errors, continuing with available data...")
743
+
744
+ print(f"\n[2/4] Loading agent metadata...")
745
+
746
  agents = load_agents_from_hf()
747
  if not agents:
748
+ print("Error: No agents found")
749
  return
750
 
 
751
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
752
  if not identifiers:
753
+ print("Error: No valid agent identifiers found")
754
  return
755
 
756
+ print(f"\n[3/4] Mining issue metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
 
 
 
757
 
 
758
  try:
759
+ conn = get_duckdb_connection()
760
  except Exception as e:
761
+ print(f"Failed to initialize DuckDB connection: {str(e)}")
762
  return
763
 
 
764
  current_time = datetime.now(timezone.utc)
765
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
766
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
767
 
768
  try:
769
+ # USE STREAMING FUNCTION FOR ISSUES
770
+ all_metadata = fetch_all_issue_metadata_streaming(
771
+ conn, identifiers, start_date, end_date
 
772
  )
773
 
774
+ except Exception as e:
775
+ print(f"Error during DuckDB fetch: {str(e)}")
776
+ import traceback
777
+ traceback.print_exc()
778
+ return
779
+ finally:
780
+ conn.close()
781
+
782
+ print(f"\n[4/4] Saving leaderboard...")
783
 
784
+ try:
785
+ leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
786
+ monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
787
+ save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
 
 
788
 
789
  except Exception as e:
790
+ print(f"Error saving leaderboard: {str(e)}")
791
  import traceback
792
  traceback.print_exc()
 
793
 
794
+
795
+ # =============================================================================
796
+ # SCHEDULER SETUP
797
+ # =============================================================================
798
+
799
+ def setup_scheduler():
800
+ """Set up APScheduler to run mining jobs periodically."""
801
+ logging.basicConfig(
802
+ level=logging.INFO,
803
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
804
+ )
805
+
806
+ logging.getLogger('httpx').setLevel(logging.WARNING)
807
+
808
+ scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
809
+
810
+ trigger = CronTrigger(
811
+ day_of_week=SCHEDULE_DAY_OF_WEEK,
812
+ hour=SCHEDULE_HOUR,
813
+ minute=SCHEDULE_MINUTE,
814
+ timezone=SCHEDULE_TIMEZONE
815
+ )
816
+
817
+ scheduler.add_job(
818
+ mine_all_agents,
819
+ trigger=trigger,
820
+ id='mine_all_agents',
821
+ name='Mine GHArchive data for all agents',
822
+ replace_existing=True
823
+ )
824
+
825
+ from datetime import datetime
826
+ next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
827
+ print(f"Scheduler: Weekly on {SCHEDULE_DAY_OF_WEEK} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
828
+ print(f"Next run: {next_run}\n")
829
+
830
+ print(f"\nScheduler started")
831
+ scheduler.start()
832
 
833
 
834
  # =============================================================================
 
836
  # =============================================================================
837
 
838
  if __name__ == "__main__":
839
+ if SCHEDULE_ENABLED:
840
+ setup_scheduler()
841
+ else:
842
+ mine_all_agents()
requirements.txt CHANGED
@@ -1,12 +1,10 @@
1
  APScheduler
2
  backoff
3
- datasets
4
- db-dtypes
5
- google-cloud-bigquery
6
  gradio
7
  gradio_leaderboard
8
  huggingface_hub
9
  pandas
10
  plotly
11
- PyGithub
12
- python-dotenv
 
1
  APScheduler
2
  backoff
3
+ duckdb[all]
 
 
4
  gradio
5
  gradio_leaderboard
6
  huggingface_hub
7
  pandas
8
  plotly
9
+ python-dotenv
10
+ requests