zhimin-z committed
Commit 5998589 · 1 Parent(s): 68ab628
Files changed (7)
  1. .gitignore +2 -1
  2. Dockerfile +6 -18
  3. README.md +0 -1
  4. app.py +239 -1322
  5. docker-compose.yml +23 -0
  6. msr.py +573 -660
  7. requirements.txt +3 -5
.gitignore CHANGED
@@ -2,4 +2,5 @@
2
  *.env
3
  *.venv
4
  *.ipynb
5
- *.pyc
 
 
2
  *.env
3
  *.venv
4
  *.ipynb
5
+ *.pyc
6
+ *.duckdb
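The new `*.duckdb` ignore rule suggests the rewritten miner now caches data in a local DuckDB file. A minimal sketch of what that might look like follows; the file name and table schema are assumptions for illustration (they do not appear in this diff), though the columns mirror the issue-metadata fields used elsewhere in the commit.

```python
# Hypothetical sketch: a local DuckDB cache matching the new *.duckdb ignore rule.
# The file name "issues.duckdb" and the schema below are illustrative assumptions.
import duckdb

con = duckdb.connect("issues.duckdb")
con.execute(
    """
    CREATE TABLE IF NOT EXISTS issue_metadata (
        agent_identifier VARCHAR,
        url              VARCHAR,
        created_at       TIMESTAMP,
        closed_at        TIMESTAMP,
        state_reason     VARCHAR
    )
    """
)
con.close()
```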
Dockerfile CHANGED
@@ -1,34 +1,22 @@
1
- # Use official Python runtime as base image
2
  FROM python:3.12-slim
3
 
4
  # Set working directory
5
  WORKDIR /app
6
 
7
- # Install system dependencies (if needed)
8
  RUN apt-get update && apt-get install -y \
9
- git \
 
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
- # Copy requirements.txt
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # Copy application files
19
- COPY .env .
20
- COPY msr.py .
21
-
22
- # Create a non-root user for security (optional but recommended)
23
- RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
24
- USER appuser
25
-
26
- # Expose port for Gradio web interface (default is 7860)
27
- EXPOSE 7860
28
-
29
  # Set environment variables
30
- ENV GRADIO_SERVER_NAME=0.0.0.0
31
- ENV GRADIO_SERVER_PORT=7860
32
 
33
- # Run the Gradio app
34
  CMD ["python", "msr.py"]
 
 
1
  FROM python:3.12-slim
2
 
3
  # Set working directory
4
  WORKDIR /app
5
 
6
+ # Install system dependencies
7
  RUN apt-get update && apt-get install -y \
8
+ gcc \
9
+ g++ \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy requirements file
13
  COPY requirements.txt .
14
 
15
  # Install Python dependencies
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
 
18
  # Set environment variables
19
+ ENV PYTHONUNBUFFERED=1
 
20
 
21
+ # Run the mining script with scheduler
22
  CMD ["python", "msr.py"]
README.md CHANGED
@@ -52,7 +52,6 @@ Behind the scenes, we're doing a few things:
52
 
53
  **Data Collection**
54
  We search GitHub using multiple query patterns to catch all issues associated with an agent:
55
- - Issues authored by the agent (`author:agent-name`)
56
  - Issues assigned to the agent (`assignee:agent-name`)
57
 
58
  **Regular Updates**
 
52
 
53
  **Data Collection**
54
  We search GitHub using multiple query patterns to catch all issues associated with an agent:
 
55
  - Issues assigned to the agent (`assignee:agent-name`)
56
 
57
  **Regular Updates**
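After this change the README documents a single query pattern, `assignee:agent-name`. For reference, a hedged example of issuing that query against GitHub's REST search API with `requests` (already imported in app.py) is shown below; the agent name is a placeholder, not an identifier from this repository.

```python
# Example of the remaining query pattern from the README: issues assigned to an agent.
import requests


def search_assigned_issues(agent_name, token=None):
    """Return the first page of issues assigned to the given GitHub identifier."""
    headers = {"Accept": "application/vnd.github+json"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    params = {"q": f"assignee:{agent_name} is:issue", "per_page": 100}
    resp = requests.get("https://api.github.com/search/issues",
                        headers=headers, params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()["items"]


# issues = search_assigned_issues("example-agent[bot]")  # placeholder agent name
```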
app.py CHANGED
@@ -3,12 +3,10 @@ from gradio_leaderboard import Leaderboard, ColumnFilter
3
  import json
4
  import os
5
  import time
6
- import tempfile
7
  import requests
8
- from datetime import datetime, timezone, timedelta
9
- from collections import defaultdict
10
  from huggingface_hub import HfApi, hf_hub_download
11
  from huggingface_hub.errors import HfHubHTTPError
 
12
  from dotenv import load_dotenv
13
  import pandas as pd
14
  import random
@@ -16,8 +14,6 @@ import plotly.graph_objects as go
16
  from plotly.subplots import make_subplots
17
  from apscheduler.schedulers.background import BackgroundScheduler
18
  from apscheduler.triggers.cron import CronTrigger
19
- from google.cloud import bigquery
20
- import backoff
21
 
22
  # Load environment variables
23
  load_dotenv()
@@ -27,10 +23,8 @@ load_dotenv()
27
  # =============================================================================
28
 
29
  AGENTS_REPO = "SWE-Arena/bot_metadata" # HuggingFace dataset for agent metadata
30
- ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata" # HuggingFace dataset for issue metadata
31
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata" # HuggingFace dataset for leaderboard metadata
32
- LEADERBOARD_TIME_FRAME_DAYS = 180 # Time frame for leaderboard
33
- UPDATE_TIME_FRAME_DAYS = 30 # How often to re-mine data via BigQuery
34
 
35
  LEADERBOARD_COLUMNS = [
36
  ("Agent Name", "string"),
@@ -45,1005 +39,57 @@ LEADERBOARD_COLUMNS = [
45
  # =============================================================================
46
 
47
  def is_rate_limit_error(e):
48
- """Check if the exception is a rate limit error (429)."""
49
- return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
 
 
50
 
51
 
52
  @backoff.on_exception(
53
  backoff.expo,
54
  HfHubHTTPError,
55
- giveup=lambda e: not is_rate_limit_error(e),
56
- max_tries=8,
57
  base=300,
58
  max_value=3600,
59
- jitter=backoff.full_jitter,
60
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
 
 
61
  )
62
  def list_repo_files_with_backoff(api, **kwargs):
63
- """List repo files with exponential backoff on rate limit errors."""
64
  return api.list_repo_files(**kwargs)
65
 
66
- @backoff.on_exception(
67
- backoff.expo,
68
- HfHubHTTPError,
69
- giveup=lambda e: not is_rate_limit_error(e),
70
- max_tries=8,
71
- base=300,
72
- max_value=3600,
73
- jitter=backoff.full_jitter,
74
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
75
- )
76
- def hf_hub_download_with_backoff(**kwargs):
77
- """Download from HF Hub with exponential backoff on rate limit errors."""
78
- return hf_hub_download(**kwargs)
79
 
80
  @backoff.on_exception(
81
  backoff.expo,
82
  HfHubHTTPError,
83
- giveup=lambda e: not is_rate_limit_error(e),
84
- max_tries=8,
85
  base=300,
86
  max_value=3600,
87
- jitter=backoff.full_jitter,
88
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
89
- )
90
- def upload_file_with_backoff(api, **kwargs):
91
- """Upload file with exponential backoff on rate limit errors."""
92
- return api.upload_file(**kwargs)
93
-
94
- @backoff.on_exception(
95
- backoff.expo,
96
- HfHubHTTPError,
97
  giveup=lambda e: not is_rate_limit_error(e),
98
- max_tries=8,
99
- base=300,
100
- max_value=3600,
101
- jitter=backoff.full_jitter,
102
- on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
103
- )
104
- def upload_folder_with_backoff(api, **kwargs):
105
- """Upload folder with exponential backoff on rate limit errors."""
106
- return api.upload_folder(**kwargs)
107
-
108
- # =============================================================================
109
- # JSONL FILE OPERATIONS
110
- # =============================================================================
111
-
112
- def load_jsonl(filename):
113
- """Load JSONL file and return list of dictionaries."""
114
- if not os.path.exists(filename):
115
- return []
116
-
117
- data = []
118
- with open(filename, 'r', encoding='utf-8') as f:
119
- for line in f:
120
- line = line.strip()
121
- if line:
122
- try:
123
- entry = json.loads(line)
124
- data.append(entry)
125
- except json.JSONDecodeError as e:
126
- print(f"Warning: Skipping invalid JSON line: {e}")
127
- return data
128
-
129
-
130
- def save_jsonl(filename, data):
131
- """Save list of dictionaries to JSONL file."""
132
- with open(filename, 'w', encoding='utf-8') as f:
133
- for item in data:
134
- f.write(json.dumps(item) + '\n')
135
-
136
-
137
- def cache_to_dict(cache_list):
138
- """Convert list of cache entries to dictionary by identifier."""
139
- return {entry['github_identifier']: entry for entry in cache_list}
140
-
141
-
142
- def dict_to_cache(cache_dict):
143
- """Convert dictionary back to list of values."""
144
- return list(cache_dict.values())
145
-
146
-
147
- def normalize_date_format(date_string):
148
- """
149
- Convert date strings to standardized ISO 8601 format with Z suffix.
150
- Handles both old format (2025-10-15T23:23:47.983068) and new format (2025-10-15T23:23:47Z).
151
- Also handles space separator (2025-06-23 07:18:28) and incomplete timezone offsets (+00).
152
- """
153
- if not date_string or date_string == 'N/A':
154
- return 'N/A'
155
-
156
- try:
157
- # Replace space with 'T' for ISO format compatibility
158
- date_string = date_string.replace(' ', 'T')
159
-
160
- # Fix incomplete timezone offset (+00 or -00 -> +00:00 or -00:00)
161
- if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
162
- date_string = date_string + ':00'
163
-
164
- # Parse the date string (handles both with and without microseconds)
165
- dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
166
-
167
- # Convert to standardized format
168
- return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
169
- except Exception as e:
170
- print(f"Warning: Could not parse date '{date_string}': {e}")
171
- return date_string
172
-
173
-
174
- # =============================================================================
175
- # BIGQUERY OPERATIONS
176
- # =============================================================================
177
-
178
- def get_bigquery_client():
179
- """
180
- Initialize BigQuery client using credentials from environment variable.
181
-
182
- Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
183
- the service account JSON credentials as a string.
184
- """
185
- # Get the JSON content from environment variable
186
- creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
187
-
188
- if creds_json:
189
- # Create a temporary file to store credentials
190
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
191
- temp_file.write(creds_json)
192
- temp_path = temp_file.name
193
-
194
- # Set environment variable to point to temp file
195
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
196
-
197
- # Initialize BigQuery client
198
- client = bigquery.Client()
199
-
200
- # Clean up temp file
201
- os.unlink(temp_path)
202
-
203
- return client
204
- else:
205
- raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
206
-
207
-
208
- def generate_table_union_statements(start_date, end_date):
209
- """
210
- Generate UNION ALL statements for githubarchive.month tables in date range.
211
-
212
- Args:
213
- start_date: Start datetime
214
- end_date: End datetime
215
-
216
- Returns:
217
- String with UNION ALL SELECT statements for all monthly tables in range
218
- """
219
- table_names = []
220
-
221
- # Start from the beginning of start_date's month
222
- current_date = start_date.replace(day=1)
223
- end_month = end_date.replace(day=1)
224
-
225
- while current_date <= end_month:
226
- table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
227
- table_names.append(table_name)
228
-
229
- # Move to next month
230
- if current_date.month == 12:
231
- current_date = current_date.replace(year=current_date.year + 1, month=1)
232
- else:
233
- current_date = current_date.replace(month=current_date.month + 1)
234
-
235
- # Create UNION ALL chain
236
- union_parts = [f"SELECT * FROM {table}" for table in table_names]
237
- return " UNION ALL ".join(union_parts)
238
-
239
-
240
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
241
- """
242
- Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
243
-
244
- Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
245
- and correlated subqueries. Each batch query runs much faster than one massive query.
246
-
247
- Args:
248
- client: BigQuery client instance
249
- identifiers: List of GitHub usernames/bot identifiers
250
- start_date: Start datetime (timezone-aware)
251
- end_date: End datetime (timezone-aware)
252
- batch_size: Number of agents per batch (default: 100)
253
- upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
254
-
255
- Returns:
256
- Dictionary mapping agent identifier to list of issue metadata
257
- """
258
- print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
259
- print(f" Batch size: {batch_size} agents per query")
260
- print(f" Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
261
-
262
- # Split identifiers into batches
263
- batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
264
- print(f" Total batches: {len(batches)}")
265
-
266
- # Collect results from all batches
267
- all_metadata = {}
268
-
269
- for batch_num, batch_identifiers in enumerate(batches, 1):
270
- print(f"\n{'─'*80}")
271
- print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
272
- print(f"{'─'*80}")
273
-
274
- try:
275
- batch_results = fetch_all_issue_metadata_single_query(
276
- client, batch_identifiers, start_date, end_date
277
- )
278
-
279
- # Merge results
280
- for identifier, metadata_list in batch_results.items():
281
- if identifier in all_metadata:
282
- all_metadata[identifier].extend(metadata_list)
283
- else:
284
- all_metadata[identifier] = metadata_list
285
-
286
- print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
287
-
288
- # Upload immediately after this batch if enabled
289
- if upload_immediately and batch_results:
290
- print(f"\n 🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
291
- upload_success = 0
292
- upload_errors = 0
293
-
294
- for identifier, metadata_list in batch_results.items():
295
- if metadata_list:
296
- if save_issue_metadata_to_hf(metadata_list, identifier):
297
- upload_success += 1
298
- else:
299
- upload_errors += 1
300
-
301
- print(f" ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
302
-
303
- except Exception as e:
304
- print(f" ✗ Batch {batch_num} failed: {str(e)}")
305
- print(f" Continuing with remaining batches...")
306
- import traceback
307
- traceback.print_exc()
308
- continue
309
-
310
- print(f"\n{'='*80}")
311
- print(f"✅ All batches completed!")
312
- print(f" Total agents with data: {len(all_metadata)}")
313
- total_issues = sum(len(issues) for issues in all_metadata.values())
314
- print(f" Total issues found: {total_issues}")
315
- print(f"{'='*80}\n")
316
-
317
- return all_metadata
318
-
319
-
320
- def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
321
- """
322
- Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
323
-
324
- This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
325
- deduplicates to get the latest state of each issue. Filters by issue author,
326
- commenter, or assignee.
327
-
328
- NOTE: This function is designed for smaller batches (~100 agents). For large
329
- numbers of agents, use fetch_issue_metadata_batched() instead.
330
-
331
- Args:
332
- client: BigQuery client instance
333
- identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
334
- start_date: Start datetime (timezone-aware)
335
- end_date: End datetime (timezone-aware)
336
-
337
- Returns:
338
- Dictionary mapping agent identifier to list of issue metadata:
339
- {
340
- 'agent-identifier': [
341
- {
342
- 'url': Issue URL,
343
- 'created_at': Issue creation timestamp,
344
- 'closed_at': Close timestamp (if closed, else None),
345
- 'state_reason': Reason for closure (completed/not_planned/etc.)
346
- },
347
- ...
348
- ],
349
- ...
350
- }
351
- """
352
- print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
353
- print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
354
-
355
- # Generate table UNION statements for issue events
356
- issue_tables = generate_table_union_statements(start_date, end_date)
357
-
358
- # Build identifier list for IN clause (handle both bot and non-bot versions)
359
- identifier_set = set()
360
- for id in identifiers:
361
- identifier_set.add(id)
362
- # Also add stripped version without [bot] suffix
363
- stripped = id.replace('[bot]', '')
364
- if stripped != id:
365
- identifier_set.add(stripped)
366
-
367
- # Create array format for UNNEST (avoids 256KB query size limit)
368
- identifier_array = '[' + ', '.join([f'"{id}"' for id in identifier_set]) + ']'
369
-
370
- print(f" Total identifiers (including bot/non-bot variants): {len(identifier_set)}")
371
-
372
- # Build comprehensive query with CTEs
373
- query = f"""
374
- WITH agent_identifiers AS (
375
- -- Create a table of all agent identifiers using UNNEST
376
- -- This avoids hitting BigQuery's 256KB query size limit with large IN clauses
377
- SELECT identifier
378
- FROM UNNEST({identifier_array}) AS identifier
379
- ),
380
-
381
- issue_events AS (
382
- -- Get all issue events and comment events for ALL agents
383
- SELECT
384
- JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
385
- JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
386
- JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
387
- JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
388
- JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
389
- JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
390
- JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
391
- JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
392
- repo.name as repo_name,
393
- created_at as event_time
394
- FROM (
395
- {issue_tables}
396
- )
397
- WHERE
398
- type IN ('IssuesEvent', 'IssueCommentEvent')
399
- -- Exclude pull requests (they have pull_request field)
400
- AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
401
- AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
402
- -- Filter by author OR commenter OR assignee
403
- AND (
404
- JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
405
- OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
406
- OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
407
- )
408
- ),
409
-
410
- latest_states AS (
411
- -- Deduplicate to get latest state for each issue
412
- SELECT
413
- url,
414
- created_at,
415
- closed_at,
416
- state_reason,
417
- author,
418
- assignee,
419
- commenter
420
- FROM issue_events
421
- QUALIFY ROW_NUMBER() OVER (
422
- PARTITION BY repo_name, issue_number
423
- ORDER BY event_time DESC
424
- ) = 1
425
- ),
426
-
427
- agent_issues AS (
428
- -- Map each issue to its relevant agent(s)
429
- SELECT DISTINCT
430
- CASE
431
- WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
432
- WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
433
- WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
434
- ELSE NULL
435
- END as agent_identifier,
436
- url,
437
- created_at,
438
- closed_at,
439
- state_reason
440
- FROM latest_states
441
- WHERE
442
- author IN (SELECT identifier FROM agent_identifiers)
443
- OR commenter IN (SELECT identifier FROM agent_identifiers)
444
- OR assignee IN (SELECT identifier FROM agent_identifiers)
445
  )
446
-
447
- SELECT
448
- agent_identifier,
449
- url,
450
- created_at,
451
- closed_at,
452
- state_reason
453
- FROM agent_issues
454
- WHERE agent_identifier IS NOT NULL
455
- ORDER BY agent_identifier, created_at DESC
456
- """
457
-
458
- # Calculate number of days for reporting
459
- query_days = (end_date - start_date).days
460
-
461
- print(f" Querying {query_days} days for issue and comment events...")
462
- print(f" Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
463
-
464
- try:
465
- query_job = client.query(query)
466
- results = list(query_job.result())
467
-
468
- print(f" ✓ Found {len(results)} total issue records across all agents")
469
-
470
- # Group results by agent
471
- metadata_by_agent = defaultdict(list)
472
-
473
- for row in results:
474
- agent_id = row.agent_identifier
475
-
476
- # Convert datetime objects to ISO strings
477
- created_at = row.created_at
478
- if hasattr(created_at, 'isoformat'):
479
- created_at = created_at.isoformat()
480
-
481
- closed_at = row.closed_at
482
- if hasattr(closed_at, 'isoformat'):
483
- closed_at = closed_at.isoformat()
484
-
485
- metadata_by_agent[agent_id].append({
486
- 'url': row.url,
487
- 'created_at': created_at,
488
- 'closed_at': closed_at,
489
- 'state_reason': row.state_reason,
490
- })
491
-
492
- # Print breakdown by agent
493
- print(f"\n 📊 Results breakdown by agent:")
494
- for identifier in identifiers:
495
- # Check both original and stripped versions
496
- count = len(metadata_by_agent.get(identifier, []))
497
- stripped = identifier.replace('[bot]', '')
498
- if stripped != identifier:
499
- count += len(metadata_by_agent.get(stripped, []))
500
-
501
- if count > 0:
502
- # Merge both versions if needed
503
- all_metadata = metadata_by_agent.get(identifier, []) + metadata_by_agent.get(stripped, [])
504
- completed_count = sum(1 for m in all_metadata if m['state_reason'] == 'completed')
505
- closed_count = sum(1 for m in all_metadata if m['closed_at'] is not None)
506
- open_count = count - closed_count
507
- print(f" {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")
508
-
509
- # Convert defaultdict to regular dict and merge bot/non-bot versions
510
- final_metadata = {}
511
- for identifier in identifiers:
512
- combined = metadata_by_agent.get(identifier, [])
513
- stripped = identifier.replace('[bot]', '')
514
- if stripped != identifier and stripped in metadata_by_agent:
515
- combined.extend(metadata_by_agent[stripped])
516
-
517
- if combined:
518
- final_metadata[identifier] = combined
519
-
520
- return final_metadata
521
-
522
- except Exception as e:
523
- print(f" ✗ BigQuery error: {str(e)}")
524
- import traceback
525
- traceback.print_exc()
526
- return {}
527
 
528
 
529
  # =============================================================================
530
- # GITHUB API OPERATIONS (Minimal - for validation only)
531
  # =============================================================================
532
 
533
- def get_github_token():
534
- """Get GitHub token from environment variables for validation purposes."""
535
- token = os.getenv('GITHUB_TOKEN')
536
- if not token:
537
- print("Warning: GITHUB_TOKEN not found for validation")
538
- return token
539
-
540
-
541
  def validate_github_username(identifier):
542
- """Verify that a GitHub identifier exists (simple validation for submission)."""
543
  try:
544
- token = get_github_token()
545
- headers = {'Authorization': f'token {token}'} if token else {}
546
- url = f'https://api.github.com/users/{identifier}'
547
- response = requests.get(url, headers=headers, timeout=10)
548
-
549
- if response.status_code == 200:
550
- return True, "Username is valid"
551
- elif response.status_code == 404:
552
- return False, "GitHub identifier not found"
553
- else:
554
- return False, f"Validation error: HTTP {response.status_code}"
555
  except Exception as e:
556
  return False, f"Validation error: {str(e)}"
557
 
558
 
559
- # =============================================================================
560
- # ISSUE METADATA OPERATIONS
561
- # =============================================================================
562
-
563
-
564
- def extract_issue_metadata(issue):
565
- """
566
- Extract minimal issue metadata for efficient storage.
567
- Only keeps essential fields: url, created_at, closed_at, state_reason.
568
- Note: agent_name is not stored as it's inferred from the folder structure.
569
-
570
- Issue states:
571
- - state: "open" or "closed"
572
- - state_reason: "completed" (resolved), "not_planned" (closed as not planned), or None (still open)
573
- """
574
- # Extract dates and state
575
- created_at = issue.get('created_at')
576
- closed_at = issue.get('closed_at')
577
- state = issue.get('state')
578
- state_reason = issue.get('state_reason')
579
-
580
- return {
581
- 'url': issue.get('url'),
582
- 'created_at': created_at,
583
- 'closed_at': closed_at,
584
- 'state': state,
585
- 'state_reason': state_reason
586
- }
587
-
588
-
589
-
590
-
591
- def calculate_issue_stats_from_metadata(metadata_list):
592
- """
593
- Calculate statistics from a list of issue metadata (lightweight objects).
594
- Works with minimal metadata: url, created_at, closed_at, state, state_reason.
595
-
596
- Returns a dictionary with comprehensive issue metrics.
597
-
598
- Resolved Rate is calculated as:
599
- completed issues / closed issues * 100
600
-
601
- Completed Issues = issues closed as completed (state_reason="completed")
602
- Closed Issues = all issues that have been closed (closed_at is not None)
603
- We do NOT count issues closed as not planned (state_reason="not_planned") as resolved,
604
- but they ARE counted in the denominator as closed issues.
605
- """
606
- total_issues = len(metadata_list)
607
-
608
- # Count closed issues (those with closed_at timestamp)
609
- closed_issues = sum(1 for issue_meta in metadata_list
610
- if issue_meta.get('closed_at') is not None)
611
-
612
- # Count completed issues (subset of closed issues with state_reason="completed")
613
- completed = sum(1 for issue_meta in metadata_list
614
- if issue_meta.get('state_reason') == 'completed')
615
-
616
- # Calculate resolved rate as: completed / closed (not completed / total)
617
- resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0
618
-
619
- return {
620
- 'total_issues': total_issues,
621
- 'closed_issues': closed_issues,
622
- 'resolved_issues': completed,
623
- 'resolved_rate': round(resolved_rate, 2),
624
- }
625
-
626
-
627
- def calculate_monthly_metrics_by_agent(top_n=None):
628
- """
629
- Calculate monthly metrics for all agents (or top N agents) for visualization.
630
- Loads data directly from SWE-Arena/issue_metadata dataset.
631
-
632
- Args:
633
- top_n: If specified, only return metrics for the top N agents by total issues.
634
- Agents are ranked by their total issue count across all months.
635
-
636
- Returns:
637
- dict: {
638
- 'agents': list of agent names,
639
- 'months': list of month labels (e.g., '2025-01'),
640
- 'data': {
641
- agent_name: {
642
- 'resolved_rates': list of resolved rates by month,
643
- 'total_issues': list of issue counts by month,
644
- 'resolved_issues': list of resolved issue counts by month
645
- }
646
- }
647
- }
648
- """
649
- # Load ALL agents from HuggingFace agents repo
650
- agents = load_agents_from_hf()
651
-
652
- # Create mapping from agent_identifier to agent_name
653
- identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
654
-
655
- # Load all issue metadata from issue_metadata dataset
656
- all_metadata = load_issue_metadata()
657
-
658
- if not all_metadata:
659
- return {'agents': [], 'months': [], 'data': {}}
660
-
661
- # Group by agent and month
662
- agent_month_data = defaultdict(lambda: defaultdict(list))
663
-
664
- for issue_meta in all_metadata:
665
- agent_identifier = issue_meta.get('agent_identifier')
666
- created_at = issue_meta.get('created_at')
667
-
668
- if not agent_identifier or not created_at:
669
- continue
670
-
671
- # Get agent_name from identifier
672
- agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
673
-
674
- try:
675
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
676
- month_key = f"{dt.year}-{dt.month:02d}"
677
- agent_month_data[agent_name][month_key].append(issue_meta)
678
- except Exception as e:
679
- print(f"Warning: Could not parse date '{created_at}': {e}")
680
- continue
681
-
682
- # Get all unique months and sort them
683
- all_months = set()
684
- for agent_data in agent_month_data.values():
685
- all_months.update(agent_data.keys())
686
- months = sorted(list(all_months))
687
-
688
- # Calculate metrics for each agent and month
689
- result_data = {}
690
- for agent_name, month_dict in agent_month_data.items():
691
- resolved_rates = []
692
- total_issues_list = []
693
- resolved_issues_list = []
694
-
695
- for month in months:
696
- issues_in_month = month_dict.get(month, [])
697
-
698
- # Count completed issues (those with state_reason="completed")
699
- completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
700
-
701
- # Count closed issues (those with closed_at timestamp)
702
- closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
703
-
704
- # Total issues created in this month
705
- total_count = len(issues_in_month)
706
-
707
- # Calculate resolved rate as: completed / closed (not completed / total)
708
- resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
709
-
710
- resolved_rates.append(resolved_rate)
711
- total_issues_list.append(total_count)
712
- resolved_issues_list.append(completed_count)
713
-
714
- result_data[agent_name] = {
715
- 'resolved_rates': resolved_rates,
716
- 'total_issues': total_issues_list,
717
- 'resolved_issues': resolved_issues_list
718
- }
719
-
720
- # Filter to top N agents if specified
721
- agents_list = sorted(list(agent_month_data.keys()))
722
- if top_n is not None and top_n > 0:
723
- # Calculate total issues for each agent across all months
724
- agent_totals = []
725
- for agent_name in agents_list:
726
- total_issues = sum(result_data[agent_name]['total_issues'])
727
- agent_totals.append((agent_name, total_issues))
728
-
729
- # Sort by total issues (descending) and take top N
730
- agent_totals.sort(key=lambda x: x[1], reverse=True)
731
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
732
-
733
- # Filter result_data to only include top agents
734
- result_data = {agent: result_data[agent] for agent in top_agents if agent in result_data}
735
- agents_list = top_agents
736
-
737
- return {
738
- 'agents': agents_list,
739
- 'months': months,
740
- 'data': result_data
741
- }
742
-
743
-
744
- # =============================================================================
745
- # ISSUE METADATA STORAGE & RETRIEVAL
746
- # =============================================================================
747
-
748
- def group_metadata_by_date(metadata_list):
749
- """
750
- Group issue metadata by exact date (year.month.day) for efficient daily storage.
751
- Returns dict: {(year, month, day): [metadata_list]}
752
- """
753
- grouped = defaultdict(list)
754
-
755
- for issue_meta in metadata_list:
756
- created_at = issue_meta.get('created_at')
757
- if not created_at:
758
- continue
759
-
760
- try:
761
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
762
- key = (dt.year, dt.month, dt.day)
763
- grouped[key].append(issue_meta)
764
- except Exception as e:
765
- print(f"Warning: Could not parse date '{created_at}': {e}")
766
-
767
- return dict(grouped)
768
-
769
-
770
- def save_issue_metadata_to_hf(metadata_list, agent_identifier):
771
- """
772
- Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
773
- Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
774
-
775
- This function uses COMPLETE OVERWRITE strategy (not append/deduplicate).
776
- Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
777
-
778
- Args:
779
- metadata_list: List of issue metadata dictionaries
780
- agent_identifier: GitHub identifier of the agent (used as folder name)
781
- """
782
- import tempfile
783
- import shutil
784
-
785
- temp_dir = None
786
- try:
787
- token = get_hf_token()
788
- if not token:
789
- raise Exception("No HuggingFace token found")
790
-
791
- api = HfApi(token=token)
792
-
793
- # Group by exact date (year, month, day)
794
- grouped = group_metadata_by_date(metadata_list)
795
-
796
- if not grouped:
797
- print(f" No valid metadata to save for {agent_identifier}")
798
- return False
799
-
800
- # Create temporary directory for batch upload
801
- temp_dir = tempfile.mkdtemp()
802
- agent_folder = os.path.join(temp_dir, agent_identifier)
803
- os.makedirs(agent_folder, exist_ok=True)
804
-
805
- print(f"📦 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
806
-
807
- # Process each daily file
808
- for (issue_year, month, day), day_metadata in grouped.items():
809
- filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
810
- local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
811
-
812
- # Sort by created_at for better organization
813
- day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
814
-
815
- # Save to temp directory (complete overwrite, no merging)
816
- save_jsonl(local_filename, day_metadata)
817
- print(f" Prepared {len(day_metadata)} issues for {filename}")
818
-
819
- # Upload entire folder using upload_folder (single commit per agent)
820
- print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
821
- upload_folder_with_backoff(
822
- api,
823
- folder_path=temp_dir,
824
- repo_id=ISSUE_METADATA_REPO,
825
- repo_type="dataset",
826
- commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
827
- )
828
- print(f" ✓ Batch upload complete for {agent_identifier}")
829
-
830
- return True
831
-
832
- except Exception as e:
833
- print(f"✗ Error saving issue metadata: {str(e)}")
834
- return False
835
- finally:
836
- # Always clean up temporary directory
837
- if temp_dir and os.path.exists(temp_dir):
838
- shutil.rmtree(temp_dir)
839
-
840
-
841
- def load_issue_metadata():
842
- """
843
- Load issue metadata from the last LEADERBOARD_TIME_FRAME_DAYS only.
844
-
845
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
846
-
847
- Returns:
848
- List of dictionaries with 'agent_identifier' added to each issue metadata.
849
- Only includes issues within the last LEADERBOARD_TIME_FRAME_DAYS.
850
- """
851
- # Calculate cutoff date based on LEADERBOARD_TIME_FRAME_DAYS
852
- current_time = datetime.now(timezone.utc)
853
- cutoff_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
854
-
855
- try:
856
- api = HfApi()
857
- token = get_hf_token()
858
-
859
- # List all files in the repository
860
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
861
-
862
- # Filter for files within the time frame: [agent_identifier]/YYYY.MM.DD.jsonl
863
- # Parse date from filename and only include files within LEADERBOARD_TIME_FRAME_DAYS
864
- time_frame_files = []
865
- for f in files:
866
- if f.endswith('.jsonl'):
867
- parts = f.split('/')
868
- if len(parts) == 2: # [agent_identifier]/YYYY.MM.DD.jsonl
869
- filename = parts[1]
870
- try:
871
- # Extract date from filename: YYYY.MM.DD.jsonl
872
- date_part = filename.replace('.jsonl', '') # Get YYYY.MM.DD
873
- date_components = date_part.split('.')
874
- if len(date_components) == 3:
875
- file_year, file_month, file_day = map(int, date_components)
876
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
877
-
878
- # Only include files within the time frame
879
- if file_date >= cutoff_date:
880
- time_frame_files.append(f)
881
- except Exception:
882
- # Skip files with unparseable dates
883
- continue
884
-
885
- print(f"📥 [LOAD] Reading cached issue metadata from HuggingFace ({len(time_frame_files)} files, last {LEADERBOARD_TIME_FRAME_DAYS} days)...")
886
-
887
- all_metadata = []
888
- for filename in time_frame_files:
889
- try:
890
- # Extract agent_identifier from path (first part)
891
- # Format: agent_identifier/YYYY.MM.DD.jsonl
892
- parts = filename.split('/')
893
- if len(parts) != 2:
894
- print(f" Warning: Unexpected filename format: {filename}")
895
- continue
896
-
897
- agent_identifier = parts[0]
898
-
899
- file_path = hf_hub_download_with_backoff(
900
- repo_id=ISSUE_METADATA_REPO,
901
- filename=filename,
902
- repo_type="dataset",
903
- token=token
904
- )
905
- day_metadata = load_jsonl(file_path)
906
-
907
- # Add agent_identifier and filter by date as a double-check
908
- for issue_meta in day_metadata:
909
- # Validate issue date against cutoff
910
- created_at = issue_meta.get('created_at')
911
- if created_at:
912
- try:
913
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
914
- if dt < cutoff_date:
915
- continue # Skip issues outside time frame
916
- except Exception:
917
- pass # Keep issues with unparseable dates
918
-
919
- issue_meta['agent_identifier'] = agent_identifier
920
- all_metadata.append(issue_meta)
921
-
922
- print(f" ✓ Loaded {len(day_metadata)} issues from {filename}")
923
- except Exception as e:
924
- print(f" Warning: Could not load {filename}: {str(e)}")
925
-
926
- print(f"✓ Loaded {len(all_metadata)} total issues from last {LEADERBOARD_TIME_FRAME_DAYS} days")
927
- return all_metadata
928
-
929
- except Exception as e:
930
- print(f"✗ Error loading issue metadata from last {LEADERBOARD_TIME_FRAME_DAYS} days: {str(e)}")
931
- return []
932
-
933
-
934
- def get_latest_issue_date_for_agent(agent_identifier):
935
- """
936
- Get the latest issue creation date for an agent from stored metadata.
937
- Used for incremental updates - only fetch issues newer than this date.
938
-
939
- Structure: [agent_identifier]/YYYY.MM.DD.jsonl
940
-
941
- Args:
942
- agent_identifier: GitHub identifier of the agent
943
-
944
- Returns:
945
- datetime or None if no existing issues found.
946
- """
947
- try:
948
- api = HfApi()
949
- token = get_hf_token()
950
-
951
- # List all files in the repository
952
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
953
-
954
- # Filter for files in this agent's folder
955
- # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
956
- agent_pattern = f"{agent_identifier}/"
957
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
958
-
959
- if not agent_files:
960
- return None
961
-
962
- # Find latest created_at across all files
963
- latest_date = None
964
- for filename in agent_files:
965
- try:
966
- file_path = hf_hub_download_with_backoff(
967
- repo_id=ISSUE_METADATA_REPO,
968
- filename=filename,
969
- repo_type="dataset",
970
- token=token
971
- )
972
- metadata = load_jsonl(file_path)
973
-
974
- for issue in metadata:
975
- created_at = issue.get('created_at')
976
- if created_at:
977
- try:
978
- dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
979
- if latest_date is None or dt > latest_date:
980
- latest_date = dt
981
- except Exception:
982
- continue
983
- except Exception:
984
- continue
985
-
986
- return latest_date
987
-
988
- except Exception:
989
- return None
990
-
991
-
992
- def get_daily_files_last_time_frame(agent_identifier):
993
- """
994
- Get list of daily file paths for an agent from the configured time frame.
995
-
996
- Args:
997
- agent_identifier: GitHub identifier of the agent
998
-
999
- Returns:
1000
- List of file paths in format: [agent_identifier]/YYYY.MM.DD.jsonl
1001
- """
1002
- try:
1003
- api = HfApi()
1004
- token = get_hf_token()
1005
-
1006
- # Calculate date range using configured time frame
1007
- today = datetime.now(timezone.utc)
1008
- cutoff_date = today - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1009
-
1010
- # List all files in the repository
1011
- files = list_repo_files_with_backoff(api, repo_id=ISSUE_METADATA_REPO, repo_type="dataset")
1012
-
1013
- # Filter for files in this agent's folder
1014
- agent_pattern = f"{agent_identifier}/"
1015
- agent_files = [f for f in files if f.startswith(agent_pattern) and f.endswith('.jsonl')]
1016
-
1017
- # Filter by date range (extract date from filename)
1018
- recent_files = []
1019
- for filename in agent_files:
1020
- try:
1021
- # Extract date from filename: YYYY.MM.DD.jsonl
1022
- parts = filename.split('/')
1023
- if len(parts) != 2:
1024
- continue
1025
-
1026
- date_part = parts[1].replace('.jsonl', '') # Get YYYY.MM.DD
1027
- date_components = date_part.split('.')
1028
- if len(date_components) != 3:
1029
- continue
1030
-
1031
- file_year, file_month, file_day = map(int, date_components)
1032
- file_date = datetime(file_year, file_month, file_day, tzinfo=timezone.utc)
1033
-
1034
- # Include if within configured time frame
1035
- if cutoff_date <= file_date <= today:
1036
- recent_files.append(filename)
1037
- except Exception:
1038
- continue
1039
-
1040
- return recent_files
1041
-
1042
- except Exception as e:
1043
- print(f"Error getting daily files: {str(e)}")
1044
- return []
1045
-
1046
-
1047
  # =============================================================================
1048
  # HUGGINGFACE DATASET OPERATIONS
1049
  # =============================================================================
@@ -1055,7 +101,7 @@ def load_agents_from_hf():
1055
  agents = []
1056
 
1057
  # List all files in the repository
1058
- files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
1059
 
1060
  # Filter for JSON files only
1061
  json_files = [f for f in files if f.endswith('.json')]
@@ -1082,19 +128,13 @@ def load_agents_from_hf():
1082
  # Add or override github_identifier to match filename
1083
  agent_data['github_identifier'] = filename_identifier
1084
 
1085
- # Normalize name field: use 'name' if exists, otherwise use identifier
1086
- if 'name' in agent_data:
1087
- agent_data['name'] = agent_data['name']
1088
- elif 'name' not in agent_data:
1089
- agent_data['name'] = filename_identifier
1090
-
1091
  agents.append(agent_data)
1092
 
1093
  except Exception as e:
1094
  print(f"Warning: Could not load {json_file}: {str(e)}")
1095
  continue
1096
 
1097
- print(f"Loaded {len(agents)} agents from HuggingFace")
1098
  return agents
1099
 
1100
  except Exception as e:
@@ -1102,8 +142,6 @@ def load_agents_from_hf():
1102
  return None
1103
 
1104
 
1105
-
1106
-
1107
  def get_hf_token():
1108
  """Get HuggingFace token from environment variables."""
1109
  token = os.getenv('HF_TOKEN')
@@ -1112,48 +150,6 @@ def get_hf_token():
1112
  return token
1113
 
1114
 
1115
- def load_cached_leaderboard_and_metrics():
1116
- """
1117
- Load cached leaderboard and monthly metrics data from HuggingFace.
1118
- This is much faster than constructing from scratch on every app launch.
1119
-
1120
- Returns:
1121
- dict: {
1122
- 'leaderboard': dict of agent stats,
1123
- 'monthly_metrics': dict with agents, months, and data,
1124
- 'metadata': dict with last_updated, time_frame_days, total_agents
1125
- }
1126
- Returns None if cache doesn't exist or fails to load.
1127
- """
1128
- try:
1129
- token = get_hf_token()
1130
-
1131
- print("📥 Loading cached leaderboard and metrics from HuggingFace...")
1132
-
1133
- # Download cached file
1134
- cached_path = hf_hub_download_with_backoff(
1135
- repo_id=LEADERBOARD_REPO,
1136
- filename="swe-issue.json",
1137
- repo_type="dataset",
1138
- token=token
1139
- )
1140
-
1141
- # Load JSON data
1142
- with open(cached_path, 'r', encoding='utf-8') as f:
1143
- data = json.load(f)
1144
-
1145
- print(f" ✓ Loaded cached data (last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')})")
1146
- print(f" ✓ Leaderboard entries: {len(data.get('leaderboard', {}))}")
1147
- print(f" ✓ Monthly metrics for: {len(data.get('monthly_metrics', {}).get('agents', []))} agents")
1148
-
1149
- return data
1150
-
1151
- except Exception as e:
1152
- print(f"⚠️ Could not load cached data: {str(e)}")
1153
- print(f" Falling back to constructing from issue metadata...")
1154
- return None
1155
-
1156
-
1157
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
1158
  """
1159
  Upload file to HuggingFace with exponential backoff retry logic.
@@ -1182,18 +178,18 @@ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, to
1182
  token=token
1183
  )
1184
  if attempt > 0:
1185
- print(f" Upload succeeded on attempt {attempt + 1}/{max_retries}")
1186
  return True
1187
 
1188
  except Exception as e:
1189
  if attempt < max_retries - 1:
1190
  wait_time = delay + random.uniform(0, 1.0)
1191
- print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
1192
- print(f" Retrying in {wait_time:.1f} seconds...")
1193
  time.sleep(wait_time)
1194
  delay = min(delay * 2, 60.0) # Exponential backoff, max 60s
1195
  else:
1196
- print(f" Upload failed after {max_retries} attempts: {str(e)}")
1197
  raise
1198
 
1199
 
@@ -1223,7 +219,7 @@ def save_agent_to_hf(data):
1223
  repo_type="dataset",
1224
  token=token
1225
  )
1226
- print(f"Saved agent to HuggingFace: {filename}")
1227
  return True
1228
  finally:
1229
  # Always clean up local file, even if upload fails
@@ -1231,207 +227,48 @@ def save_agent_to_hf(data):
1231
  os.remove(filename)
1232
 
1233
  except Exception as e:
1234
- print(f"Error saving agent: {str(e)}")
1235
  return False
1236
 
1237
 
1238
-
1239
-
1240
- # =============================================================================
1241
- # DATA MANAGEMENT
1242
- # =============================================================================
1243
-
1244
- def save_leaderboard_and_metrics_to_hf():
1245
  """
1246
- Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
1247
- If the file exists, it will be overwritten.
1248
 
1249
  Returns:
1250
- bool: True if successful, False otherwise
 
1251
  """
1252
- import io
1253
-
1254
  try:
1255
  token = get_hf_token()
1256
- if not token:
1257
- raise Exception("No HuggingFace token found")
1258
-
1259
- api = HfApi(token=token)
1260
-
1261
- print(f"\n{'='*80}")
1262
- print(f"📊 Preparing leaderboard and metrics data for upload...")
1263
- print(f"{'='*80}\n")
1264
-
1265
- # Get leaderboard data
1266
- print(" Constructing leaderboard data...")
1267
- leaderboard_data = construct_leaderboard_from_metadata()
1268
-
1269
- # Get monthly metrics data (all agents, not just top N)
1270
- print(" Calculating monthly metrics...")
1271
- monthly_metrics = calculate_monthly_metrics_by_agent(top_n=None)
1272
-
1273
- # Combine into a single structure
1274
- combined_data = {
1275
- "leaderboard": leaderboard_data,
1276
- "monthly_metrics": monthly_metrics,
1277
- "metadata": {
1278
- "last_updated": datetime.now(timezone.utc).isoformat(),
1279
- "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
1280
- "total_agents": len(leaderboard_data)
1281
- }
1282
- }
1283
-
1284
- print(f" Leaderboard entries: {len(leaderboard_data)}")
1285
- print(f" Monthly metrics for: {len(monthly_metrics['agents'])} agents")
1286
- print(f" Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
1287
 
1288
- # Convert to JSON and create file-like object
1289
- json_content = json.dumps(combined_data, indent=2)
1290
- file_like_object = io.BytesIO(json_content.encode('utf-8'))
1291
-
1292
- # Upload to HuggingFace (will overwrite if exists)
1293
- print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
1294
- upload_file_with_backoff(
1295
- api,
1296
- path_or_fileobj=file_like_object,
1297
- path_in_repo="swe-issue.json",
1298
  repo_id=LEADERBOARD_REPO,
 
1299
  repo_type="dataset",
1300
- token=token,
1301
- commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
1302
  )
1303
 
1304
- print(f" ✓ Successfully uploaded swe-issue.json")
1305
- print(f"{'='*80}\n")
1306
-
1307
- return True
1308
-
1309
- except Exception as e:
1310
- print(f"✗ Error saving leaderboard and metrics: {str(e)}")
1311
- import traceback
1312
- traceback.print_exc()
1313
- return False
1314
-
1315
-
1316
- def mine_all_agents():
1317
- """
1318
- Mine issue metadata for all agents within UPDATE_TIME_FRAME_DAYS and save to HuggingFace.
1319
- Uses BATCHED BigQuery queries for all agents (efficient approach).
1320
- """
1321
- # Load agent metadata from HuggingFace
1322
- agents = load_agents_from_hf()
1323
- if not agents:
1324
- print("No agents found in HuggingFace dataset")
1325
- return
1326
-
1327
- # Extract all identifiers
1328
- identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
1329
- if not identifiers:
1330
- print("No valid agent identifiers found")
1331
- return
1332
-
1333
- print(f"\n{'='*80}")
1334
- print(f"⛏️ [MINE] Starting BigQuery data mining for {len(identifiers)} agents")
1335
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
1336
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
1337
- print(f"⚠️ This will query BigQuery and may take several minutes")
1338
- print(f"{'='*80}\n")
1339
-
1340
- # Initialize BigQuery client
1341
- try:
1342
- client = get_bigquery_client()
1343
- except Exception as e:
1344
- print(f"✗ Failed to initialize BigQuery client: {str(e)}")
1345
- return
1346
-
1347
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
1348
- current_time = datetime.now(timezone.utc)
1349
- end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
1350
- start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
1351
-
1352
- try:
1353
- # Use batched approach for better performance
1354
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
1355
- all_metadata = fetch_issue_metadata_batched(
1356
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
1357
- )
1358
 
1359
- # Calculate summary statistics
1360
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
1361
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
1362
 
1363
- print(f"\n{'='*80}")
1364
- print(f"✅ BigQuery mining and upload complete!")
1365
- print(f" Total agents: {len(agents)}")
1366
- print(f" Agents with data: {agents_with_data}")
1367
- print(f" Total PRs found: {total_prs}")
1368
- print(f"{'='*80}\n")
1369
 
1370
  except Exception as e:
1371
- print(f" Error during BigQuery fetch: {str(e)}")
1372
- import traceback
1373
- traceback.print_exc()
1374
- return
1375
-
1376
- # After mining is complete, save leaderboard and metrics to HuggingFace
1377
- print(f"📤 Uploading leaderboard and metrics data...")
1378
- if save_leaderboard_and_metrics_to_hf():
1379
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
1380
- else:
1381
- print(f"⚠️ Failed to upload leaderboard and metrics data")
1382
-
1383
-
1384
- def construct_leaderboard_from_metadata():
1385
- """
1386
- Construct leaderboard from stored issue metadata instead of fetching all issues.
1387
- Much more memory-efficient and faster.
1388
-
1389
- Returns dictionary of agent stats.
1390
- """
1391
- print("📊 Constructing leaderboard from issue metadata...")
1392
- # Load agents
1393
- agents = load_agents_from_hf()
1394
- if not agents:
1395
- print("No agents found")
1396
- return {}
1397
-
1398
- # Load all issue metadata
1399
- all_metadata = load_issue_metadata()
1400
-
1401
- cache_dict = {}
1402
-
1403
- for agent in agents:
1404
- identifier = agent.get('github_identifier')
1405
- agent_name = agent.get('name', 'Unknown')
1406
-
1407
- # Filter metadata for this agent
1408
- bot_metadata = [issue for issue in all_metadata if issue.get('agent_identifier') == identifier]
1409
-
1410
- # Calculate stats
1411
- stats = calculate_issue_stats_from_metadata(bot_metadata)
1412
-
1413
- cache_dict[identifier] = {
1414
- 'name': agent_name,
1415
- 'website': agent.get('website', 'N/A'),
1416
- 'github_identifier': identifier,
1417
- **stats
1418
- }
1419
-
1420
- return cache_dict
1421
 
1422
 
1423
  # =============================================================================
1424
  # UI FUNCTIONS
1425
  # =============================================================================
1426
 
1427
- def generate_color(index, total):
1428
- """Generate distinct colors using HSL color space for better distribution"""
1429
- hue = (index * 360 / total) % 360
1430
- saturation = 70 + (index % 3) * 10 # Vary saturation slightly
1431
- lightness = 45 + (index % 2) * 10 # Vary lightness slightly
1432
- return f'hsl({hue}, {saturation}%, {lightness}%)'
1433
-
1434
-
1435
  def create_monthly_metrics_plot(top_n=5):
1436
  """
1437
  Create a Plotly figure with dual y-axes showing:
@@ -1443,37 +280,47 @@ def create_monthly_metrics_plot(top_n=5):
1443
  Args:
1444
  top_n: Number of top agents to show (default: 5)
1445
  """
1446
- # Try to load from cache first
1447
- cached_data = load_cached_leaderboard_and_metrics()
1448
-
1449
- if cached_data and 'monthly_metrics' in cached_data:
1450
- # Use cached monthly metrics
1451
- all_metrics = cached_data['monthly_metrics']
1452
-
1453
- # Filter to top_n agents by total issue count
1454
- if all_metrics.get('agents') and all_metrics.get('data'):
1455
- # Calculate total issues for each agent
1456
- agent_totals = []
1457
- for agent_name in all_metrics['agents']:
1458
- total_issues = sum(all_metrics['data'][agent_name]['total_issues'])
1459
- agent_totals.append((agent_name, total_issues))
1460
-
1461
- # Sort and take top_n agents
1462
- agent_totals.sort(key=lambda x: x[1], reverse=True)
1463
- top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
1464
-
1465
- # Filter metrics to only include top agents
1466
- metrics = {
1467
- 'agents': top_agents,
1468
- 'months': all_metrics['months'],
1469
- 'data': {agent: all_metrics['data'][agent] for agent in top_agents if agent in all_metrics['data']}
1470
- }
1471
- else:
1472
- metrics = all_metrics
1473
- else:
1474
- # Fallback: Calculate from issue metadata
1475
- print(" Calculating monthly metrics from issue metadata...")
1476
- metrics = calculate_monthly_metrics_by_agent(top_n=top_n)
 
1477
 
1478
  if not metrics['agents'] or not metrics['months']:
1479
  # Return an empty figure with a message
@@ -1494,15 +341,23 @@ def create_monthly_metrics_plot(top_n=5):
1494
  # Create figure with secondary y-axis
1495
  fig = make_subplots(specs=[[{"secondary_y": True}]])
1496
 
 
1497
  agents = metrics['agents']
1498
  months = metrics['months']
1499
  data = metrics['data']
1500
 
1501
- # Generate unique colors for many agents using HSL color space
1502
  agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
1503
 
1504
  # Add traces for each agent
1505
- for agent_name in agents:
1506
  color = agent_colors[agent_name]
1507
  agent_data = data[agent_name]
1508
 
@@ -1520,10 +375,11 @@ def create_monthly_metrics_plot(top_n=5):
1520
  name=agent_name,
1521
  mode='lines+markers',
1522
  line=dict(color=color, width=2),
1523
- marker=dict(size=6),
1524
  legendgroup=agent_name,
1525
- showlegend=True,
1526
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1527
  'Resolved Rate: %{y:.2f}%<br>' +
1528
  '<extra></extra>'
1529
  ),
@@ -1547,8 +403,9 @@ def create_monthly_metrics_plot(top_n=5):
1547
  name=agent_name,
1548
  marker=dict(color=color, opacity=0.6),
1549
  legendgroup=agent_name,
1550
- showlegend=False, # Don't show in legend (already shown for line)
1551
- hovertemplate='<b>%{fullData.name}</b><br>' +
 
1552
  'Total Issues: %{y}<br>' +
1553
  '<extra></extra>',
1554
  offsetgroup=agent_name # Group bars by agent for proper spacing
@@ -1558,23 +415,26 @@ def create_monthly_metrics_plot(top_n=5):
1558
 
1559
  # Update axes labels
1560
  fig.update_xaxes(title_text=None)
1561
- fig.update_yaxes(title_text="<b>Resolved Rate (%)</b>", secondary_y=False)
 
 
1562
  fig.update_yaxes(title_text="<b>Total Issues</b>", secondary_y=True)
1563
 
1564
  # Update layout
 
1565
  fig.update_layout(
1566
  title=None,
1567
- hovermode='closest',
1568
  barmode='group',
1569
  height=600,
1570
- legend=dict(
1571
- orientation="h",
1572
- yanchor="bottom",
1573
- y=1.02,
1574
- xanchor="right",
1575
- x=1
1576
- ),
1577
- margin=dict(l=50, r=50, t=100, b=50)
1578
  )
1579
 
1580
  return fig
@@ -1582,39 +442,52 @@ def create_monthly_metrics_plot(top_n=5):
1582
 
1583
  def get_leaderboard_dataframe():
1584
  """
1585
- Load leaderboard from cached data and convert to pandas DataFrame for display.
1586
- Falls back to constructing from issue metadata if cache is unavailable.
1587
  Returns formatted DataFrame sorted by total issues.
1588
  """
1589
- # Try to load from cache first
1590
- cached_data = load_cached_leaderboard_and_metrics()
1591
 
1592
- if cached_data and 'leaderboard' in cached_data:
1593
- cache_dict = cached_data['leaderboard']
1594
- else:
1595
- # Fallback: Construct leaderboard from metadata
1596
- print(" Constructing leaderboard from issue metadata...")
1597
- cache_dict = construct_leaderboard_from_metadata()
 
 
 
 
1598
 
1599
  if not cache_dict:
 
1600
  # Return empty DataFrame with correct columns if no data
1601
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1602
  return pd.DataFrame(columns=column_names)
1603
 
1604
  rows = []
1605
- for data in cache_dict.values():
 
 
 
 
1606
  # Filter out agents with zero total issues
1607
- if data.get('total_issues', 0) == 0:
 
1608
  continue
 
1609
  # Only include display-relevant fields
1610
  rows.append([
1611
  data.get('name', 'Unknown'),
1612
  data.get('website', 'N/A'),
1613
- data.get('total_issues', 0),
1614
  data.get('resolved_issues', 0),
1615
  data.get('resolved_rate', 0.0),
1616
  ])
1617
 
 
 
 
1618
  # Create DataFrame
1619
  column_names = [col[0] for col in LEADERBOARD_COLUMNS]
1620
  df = pd.DataFrame(rows, columns=column_names)
@@ -1629,95 +502,125 @@ def get_leaderboard_dataframe():
1629
  if "Total Issues" in df.columns and not df.empty:
1630
  df = df.sort_values(by="Total Issues", ascending=False).reset_index(drop=True)
1631
 
 
 
 
1632
  return df
1633
 
1634
 
1635
- def submit_agent(identifier, agent_name, developer, website):
1636
  """
1637
  Submit a new agent to the leaderboard.
1638
  Validates input and saves submission.
1639
- Issue data will be populated by the monthly mining task.
1640
  """
1641
  # Validate required fields
1642
  if not identifier or not identifier.strip():
1643
- return " GitHub identifier is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1644
  if not agent_name or not agent_name.strip():
1645
- return " Agent name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1646
- if not developer or not developer.strip():
1647
- return " Developer name is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1648
  if not website or not website.strip():
1649
- return " Website URL is required", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1650
 
1651
  # Clean inputs
1652
  identifier = identifier.strip()
1653
  agent_name = agent_name.strip()
1654
- developer = developer.strip()
1655
  website = website.strip()
1656
 
1657
  # Validate GitHub identifier
1658
  is_valid, message = validate_github_username(identifier)
1659
  if not is_valid:
1660
- return f" {message}", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1661
 
1662
  # Check for duplicates by loading agents from HuggingFace
1663
  agents = load_agents_from_hf()
1664
  if agents:
1665
  existing_names = {agent['github_identifier'] for agent in agents}
1666
  if identifier in existing_names:
1667
- return f"⚠️ Agent with identifier '{identifier}' already exists", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1668
 
1669
  # Create submission
1670
  submission = {
1671
  'name': agent_name,
1672
- 'developer': developer,
1673
  'github_identifier': identifier,
1674
  'website': website,
 
1675
  }
1676
 
1677
  # Save to HuggingFace
1678
  if not save_agent_to_hf(submission):
1679
- return " Failed to save submission", get_leaderboard_dataframe(), create_monthly_metrics_plot()
1680
 
1681
- return f"✅ Successfully submitted {agent_name}! Issue data will be populated by daily incremental updates.", get_leaderboard_dataframe(), create_monthly_metrics_plot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1682
 
1683
 
1684
  # =============================================================================
1685
  # GRADIO APPLICATION
1686
  # =============================================================================
1687
 
1688
- print(f"\n🚀 Starting SWE Agent PR Leaderboard")
1689
- print(f" Leaderboard time frame: {LEADERBOARD_TIME_FRAME_DAYS} days ({LEADERBOARD_TIME_FRAME_DAYS // 30} months)")
1690
- print(f" Mining update frequency: Every {UPDATE_TIME_FRAME_DAYS} days\n")
1691
 
1692
- # Start APScheduler for monthly PR mining at 12:00 AM UTC every 1st of the month
1693
  scheduler = BackgroundScheduler(timezone="UTC")
1694
  scheduler.add_job(
1695
- mine_all_agents,
1696
- trigger=CronTrigger(day=1, hour=0, minute=0), # 12:00 AM UTC every 1st of the month
1697
- id='monthly_issue_mining',
1698
- name='Monthly Issue Mining',
1699
  replace_existing=True
1700
  )
1701
  scheduler.start()
1702
  print(f"\n{'='*80}")
1703
- print(f"Scheduler initialized successfully")
1704
- print(f"⛏️ Mining schedule: Every 1st of the month at 12:00 AM UTC")
1705
- print(f"📥 On startup: Only loads cached data from HuggingFace (no mining)")
1706
  print(f"{'='*80}\n")
1707
 
1708
  # Create Gradio interface
1709
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
1710
-
1711
- gr.Markdown("# 🏆 SWE Agent Issue Leaderboard")
1712
  gr.Markdown(f"Track and compare GitHub issue resolution statistics for SWE agents")
1713
 
1714
  with gr.Tabs():
1715
 
1716
  # Leaderboard Tab
1717
- with gr.Tab("📊 Leaderboard"):
1718
- gr.Markdown(f"*All statistics are based on issues from the last {LEADERBOARD_TIME_FRAME_DAYS // 30} months*")
1719
  leaderboard_table = Leaderboard(
1720
- value=get_leaderboard_dataframe(),
1721
  datatype=LEADERBOARD_COLUMNS,
1722
  search_columns=["Agent Name", "Website"],
1723
  filter_columns=[
@@ -1732,41 +635,55 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
1732
  ]
1733
  )
1734
 
1735
- gr.Markdown("### Monthly Metrics")
1736
- gr.Markdown("Track resolution rates and issue activity over time")
 
 
 
 
 
 
 
 
 
 
 
1737
 
1738
- monthly_plot = gr.Plot(
1739
- value=create_monthly_metrics_plot(),
1740
- label="Monthly Issue Metrics"
 
 
1741
  )
1742
 
 
1743
  # Submit Agent Tab
1744
- with gr.Tab("Submit Agent"):
1745
-
1746
  gr.Markdown("### Submit Your Agent")
1747
- gr.Markdown("Fill in the details below to add your agent to the leaderboard. Make sure you're logged in to HuggingFace CLI on your machine.")
1748
-
1749
  with gr.Row():
1750
  with gr.Column():
1751
  github_input = gr.Textbox(
1752
  label="GitHub Identifier*",
1753
- placeholder="Your agent username (e.g., my-agent-bot)"
1754
  )
1755
  name_input = gr.Textbox(
1756
  label="Agent Name*",
1757
  placeholder="Your agent's display name"
1758
  )
1759
-
1760
  with gr.Column():
1761
- developer_input = gr.Textbox(
1762
- label="Developer*",
1763
- placeholder="Your developer or team name"
1764
  )
1765
  website_input = gr.Textbox(
1766
- label="Website",
1767
  placeholder="https://your-agent-website.com"
1768
  )
1769
-
1770
  submit_button = gr.Button(
1771
  "Submit Agent",
1772
  variant="primary"
@@ -1775,15 +692,15 @@ with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as a
1775
  label="Submission Status",
1776
  interactive=False
1777
  )
1778
-
1779
  # Event handler
1780
  submit_button.click(
1781
  fn=submit_agent,
1782
- inputs=[github_input, name_input, developer_input, website_input],
1783
- outputs=[submission_status, leaderboard_table, monthly_plot]
1784
  )
1785
 
1786
 
1787
  # Launch application
1788
  if __name__ == "__main__":
1789
- app.launch()
 
  import json
  import os
  import time
  import requests
  from huggingface_hub import HfApi, hf_hub_download
  from huggingface_hub.errors import HfHubHTTPError
+ import backoff
  from dotenv import load_dotenv
  import pandas as pd
  import random
  from plotly.subplots import make_subplots
  from apscheduler.schedulers.background import BackgroundScheduler
  from apscheduler.triggers.cron import CronTrigger
 
  # Load environment variables
  load_dotenv()
 
  # =============================================================================
 
  AGENTS_REPO = "SWE-Arena/bot_metadata"  # HuggingFace dataset for agent metadata
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"  # HuggingFace dataset for leaderboard data
+ MAX_RETRIES = 5
 
  LEADERBOARD_COLUMNS = [
      ("Agent Name", "string"),
 
  # =============================================================================
 
  def is_rate_limit_error(e):
+     """Check if exception is a HuggingFace rate limit error (429)."""
+     if isinstance(e, HfHubHTTPError):
+         return e.response.status_code == 429
+     return False
 
 
  @backoff.on_exception(
      backoff.expo,
      HfHubHTTPError,
+     max_tries=MAX_RETRIES,
      base=300,
      max_value=3600,
+     giveup=lambda e: not is_rate_limit_error(e),
+     on_backoff=lambda details: print(
+         f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
+     )
  )
  def list_repo_files_with_backoff(api, **kwargs):
+     """Wrapper for api.list_repo_files() with exponential backoff for rate limits."""
      return api.list_repo_files(**kwargs)
 
 
  @backoff.on_exception(
      backoff.expo,
      HfHubHTTPError,
+     max_tries=MAX_RETRIES,
      base=300,
      max_value=3600,
      giveup=lambda e: not is_rate_limit_error(e),
+     on_backoff=lambda details: print(
+         f"Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
      )
+ )
+ def hf_hub_download_with_backoff(**kwargs):
+     """Wrapper for hf_hub_download() with exponential backoff for rate limits."""
+     return hf_hub_download(**kwargs)
 
 
  # =============================================================================
+ # GITHUB USERNAME VALIDATION
  # =============================================================================
 
  def validate_github_username(identifier):
+     """Verify that a GitHub identifier exists."""
      try:
+         response = requests.get(f'https://api.github.com/users/{identifier}', timeout=10)
+         return (True, "Username is valid") if response.status_code == 200 else (False, "GitHub identifier not found" if response.status_code == 404 else f"Validation error: HTTP {response.status_code}")
      except Exception as e:
          return False, f"Validation error: {str(e)}"
 
 
  # =============================================================================
  # HUGGINGFACE DATASET OPERATIONS
  # =============================================================================
          agents = []
 
          # List all files in the repository
+         files = list_repo_files_with_backoff(api=api, repo_id=AGENTS_REPO, repo_type="dataset")
 
          # Filter for JSON files only
          json_files = [f for f in files if f.endswith('.json')]
                  # Add or override github_identifier to match filename
                  agent_data['github_identifier'] = filename_identifier
                  agents.append(agent_data)
 
              except Exception as e:
                  print(f"Warning: Could not load {json_file}: {str(e)}")
                  continue
 
+         print(f"Loaded {len(agents)} agents from HuggingFace")
          return agents
 
      except Exception as e:
          return None
 
 
  def get_hf_token():
      """Get HuggingFace token from environment variables."""
      token = os.getenv('HF_TOKEN')
      return token
 
 
  def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
      """
      Upload file to HuggingFace with exponential backoff retry logic.
                  token=token
              )
              if attempt > 0:
+                 print(f"  Upload succeeded on attempt {attempt + 1}/{max_retries}")
              return True
 
          except Exception as e:
              if attempt < max_retries - 1:
                  wait_time = delay + random.uniform(0, 1.0)
+                 print(f"  Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                 print(f"  Retrying in {wait_time:.1f} seconds...")
                  time.sleep(wait_time)
                  delay = min(delay * 2, 60.0)  # Exponential backoff, max 60s
              else:
+                 print(f"  Upload failed after {max_retries} attempts: {str(e)}")
                  raise
 
              repo_type="dataset",
              token=token
          )
+         print(f"Saved agent to HuggingFace: {filename}")
          return True
      finally:
          # Always clean up local file, even if upload fails
              os.remove(filename)
 
      except Exception as e:
+         print(f"Error saving agent: {str(e)}")
          return False
 
 
+ def load_leaderboard_data_from_hf():
      """
+     Load leaderboard data and monthly metrics from HuggingFace dataset.
 
      Returns:
+         dict: Dictionary with 'leaderboard', 'monthly_metrics', and 'metadata' keys
+               Returns None if file doesn't exist or error occurs
      """
      try:
          token = get_hf_token()
+         filename = "swe-issue.json"
 
+         # Download file
+         file_path = hf_hub_download_with_backoff(
              repo_id=LEADERBOARD_REPO,
+             filename=filename,
              repo_type="dataset",
+             token=token
          )
 
+         # Load JSON data
+         with open(file_path, 'r') as f:
+             data = json.load(f)
 
+         last_updated = data.get('metadata', {}).get('last_updated', 'Unknown')
+         print(f"Loaded leaderboard data from HuggingFace (last updated: {last_updated})")
 
+         return data
 
      except Exception as e:
+         print(f"Could not load leaderboard data from HuggingFace: {str(e)}")
+         return None
 
 
  # =============================================================================
  # UI FUNCTIONS
  # =============================================================================
 
  def create_monthly_metrics_plot(top_n=5):
      """
      Create a Plotly figure with dual y-axes showing:
      Args:
          top_n: Number of top agents to show (default: 5)
      """
+     # Load from saved dataset
+     saved_data = load_leaderboard_data_from_hf()
+
+     if not saved_data or 'monthly_metrics' not in saved_data:
+         # Return an empty figure with a message
+         fig = go.Figure()
+         fig.add_annotation(
+             text="No data available for visualization",
+             xref="paper", yref="paper",
+             x=0.5, y=0.5, showarrow=False,
+             font=dict(size=16)
+         )
+         fig.update_layout(
+             title=None,
+             xaxis_title=None,
+             height=500
+         )
+         return fig
+
+     metrics = saved_data['monthly_metrics']
+     print(f"Loaded monthly metrics from saved dataset")
+
+     # Apply top_n filter if specified
+     if top_n is not None and top_n > 0 and metrics.get('agents'):
+         # Calculate total issues for each agent
+         agent_totals = []
+         for agent_name in metrics['agents']:
+             agent_data = metrics['data'].get(agent_name, {})
+             total_issues = sum(agent_data.get('total_issues', []))
+             agent_totals.append((agent_name, total_issues))
+
+         # Sort by total issues and take top N
+         agent_totals.sort(key=lambda x: x[1], reverse=True)
+         top_agents = [agent_name for agent_name, _ in agent_totals[:top_n]]
+
+         # Filter metrics to only include top agents
+         metrics = {
+             'agents': top_agents,
+             'months': metrics['months'],
+             'data': {agent: metrics['data'][agent] for agent in top_agents if agent in metrics['data']}
+         }
 
      if not metrics['agents'] or not metrics['months']:
          # Return an empty figure with a message
 
      # Create figure with secondary y-axis
      fig = make_subplots(specs=[[{"secondary_y": True}]])
 
+     # Generate unique colors for many agents using HSL color space
+     def generate_color(index, total):
+         """Generate distinct colors using HSL color space for better distribution"""
+         hue = (index * 360 / total) % 360
+         saturation = 70 + (index % 3) * 10  # Vary saturation slightly
+         lightness = 45 + (index % 2) * 10  # Vary lightness slightly
+         return f'hsl({hue}, {saturation}%, {lightness}%)'
+
      agents = metrics['agents']
      months = metrics['months']
      data = metrics['data']
 
+     # Generate colors for all agents
      agent_colors = {agent: generate_color(idx, len(agents)) for idx, agent in enumerate(agents)}
 
      # Add traces for each agent
+     for idx, agent_name in enumerate(agents):
          color = agent_colors[agent_name]
          agent_data = data[agent_name]
 
                  name=agent_name,
                  mode='lines+markers',
                  line=dict(color=color, width=2),
+                 marker=dict(size=8),
                  legendgroup=agent_name,
+                 showlegend=(top_n is not None and top_n <= 10),  # Show legend for top N agents
+                 hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
+                               'Month: %{x}<br>' +
                                'Resolved Rate: %{y:.2f}%<br>' +
                                '<extra></extra>'
              ),
 
                  name=agent_name,
                  marker=dict(color=color, opacity=0.6),
                  legendgroup=agent_name,
+                 showlegend=False,  # Hide duplicate legend entry (already shown in Scatter)
+                 hovertemplate='<b>Agent: %{fullData.name}</b><br>' +
+                               'Month: %{x}<br>' +
                                'Total Issues: %{y}<br>' +
                                '<extra></extra>',
                  offsetgroup=agent_name  # Group bars by agent for proper spacing
 
      # Update axes labels
      fig.update_xaxes(title_text=None)
+     fig.update_yaxes(
+         title_text="<b>Resolved Rate (%)</b>",
+         range=[0, 100],
+         secondary_y=False,
+         showticklabels=True,
+         tickmode='linear',
+         dtick=10,
+         showgrid=True
+     )
      fig.update_yaxes(title_text="<b>Total Issues</b>", secondary_y=True)
 
      # Update layout
+     show_legend = (top_n is not None and top_n <= 10)
      fig.update_layout(
          title=None,
+         hovermode='closest',  # Show individual agent info on hover
          barmode='group',
          height=600,
+         showlegend=show_legend,
+         margin=dict(l=50, r=150 if show_legend else 50, t=50, b=50)  # More right margin when legend is shown
      )
 
      return fig
 
 
  def get_leaderboard_dataframe():
      """
+     Load leaderboard from saved dataset and convert to pandas DataFrame for display.
      Returns formatted DataFrame sorted by total issues.
      """
+     # Load from saved dataset
+     saved_data = load_leaderboard_data_from_hf()
 
+     if not saved_data or 'leaderboard' not in saved_data:
+         print(f"No leaderboard data available")
+         # Return empty DataFrame with correct columns if no data
+         column_names = [col[0] for col in LEADERBOARD_COLUMNS]
+         return pd.DataFrame(columns=column_names)
+
+     cache_dict = saved_data['leaderboard']
+     last_updated = saved_data.get('metadata', {}).get('last_updated', 'Unknown')
+     print(f"Loaded leaderboard from saved dataset (last updated: {last_updated})")
+     print(f"Cache dict size: {len(cache_dict)}")
 
      if not cache_dict:
+         print("WARNING: cache_dict is empty!")
          # Return empty DataFrame with correct columns if no data
          column_names = [col[0] for col in LEADERBOARD_COLUMNS]
          return pd.DataFrame(columns=column_names)
 
      rows = []
+     filtered_count = 0
+     for identifier, data in cache_dict.items():
+         total_issues = data.get('total_issues', 0)
+         print(f"  Agent '{identifier}': {total_issues} issues")
+
          # Filter out agents with zero total issues
+         if total_issues == 0:
+             filtered_count += 1
              continue
+
          # Only include display-relevant fields
          rows.append([
              data.get('name', 'Unknown'),
              data.get('website', 'N/A'),
+             total_issues,
              data.get('resolved_issues', 0),
              data.get('resolved_rate', 0.0),
          ])
 
+     print(f"Filtered out {filtered_count} agents with 0 issues")
+     print(f"Leaderboard will show {len(rows)} agents")
+
      # Create DataFrame
      column_names = [col[0] for col in LEADERBOARD_COLUMNS]
      df = pd.DataFrame(rows, columns=column_names)
 
      if "Total Issues" in df.columns and not df.empty:
          df = df.sort_values(by="Total Issues", ascending=False).reset_index(drop=True)
 
+     print(f"Final DataFrame shape: {df.shape}")
+     print("="*60 + "\n")
+
      return df
 
 
+ def submit_agent(identifier, agent_name, organization, website):
      """
      Submit a new agent to the leaderboard.
      Validates input and saves submission.
      """
      # Validate required fields
      if not identifier or not identifier.strip():
+         return "ERROR: GitHub identifier is required", gr.update()
      if not agent_name or not agent_name.strip():
+         return "ERROR: Agent name is required", gr.update()
+     if not organization or not organization.strip():
+         return "ERROR: Organization name is required", gr.update()
      if not website or not website.strip():
+         return "ERROR: Website URL is required", gr.update()
 
      # Clean inputs
      identifier = identifier.strip()
      agent_name = agent_name.strip()
+     organization = organization.strip()
      website = website.strip()
 
      # Validate GitHub identifier
      is_valid, message = validate_github_username(identifier)
      if not is_valid:
+         return f"ERROR: {message}", gr.update()
 
      # Check for duplicates by loading agents from HuggingFace
      agents = load_agents_from_hf()
      if agents:
          existing_names = {agent['github_identifier'] for agent in agents}
          if identifier in existing_names:
+             return f"WARNING: Agent with identifier '{identifier}' already exists", gr.update()
 
      # Create submission
      submission = {
          'name': agent_name,
+         'organization': organization,
          'github_identifier': identifier,
          'website': website,
+         'status': 'public'
      }
 
      # Save to HuggingFace
      if not save_agent_to_hf(submission):
+         return "ERROR: Failed to save submission", gr.update()
 
+     # Return success message - data will be populated by backend updates
+     return f"SUCCESS: Successfully submitted {agent_name}! Issue data will be automatically populated by the backend system via the maintainers.", gr.update()
+
+
+ # =============================================================================
+ # DATA RELOAD FUNCTION
+ # =============================================================================
+
+ def reload_leaderboard_data():
+     """
+     Reload leaderboard data from HuggingFace.
+     This function is called by the scheduler on a daily basis.
+     """
+     print(f"\n{'='*80}")
+     print(f"Reloading leaderboard data from HuggingFace...")
+     print(f"{'='*80}\n")
+
+     try:
+         data = load_leaderboard_data_from_hf()
+         if data:
+             print(f"Successfully reloaded leaderboard data")
+             print(f"  Last updated: {data.get('metadata', {}).get('last_updated', 'Unknown')}")
+             print(f"  Agents: {len(data.get('leaderboard', {}))}")
+         else:
+             print(f"No data available")
+     except Exception as e:
+         print(f"Error reloading leaderboard data: {str(e)}")
+
+     print(f"{'='*80}\n")
 
 
  # =============================================================================
  # GRADIO APPLICATION
  # =============================================================================
 
+ print(f"\nStarting SWE Agent Issue Leaderboard")
+ print(f"  Data source: {LEADERBOARD_REPO}")
+ print(f"  Reload frequency: Daily at 12:00 AM UTC\n")
 
+ # Start APScheduler for daily data reload at 12:00 AM UTC
  scheduler = BackgroundScheduler(timezone="UTC")
  scheduler.add_job(
+     reload_leaderboard_data,
+     trigger=CronTrigger(hour=0, minute=0),  # 12:00 AM UTC daily
+     id='daily_data_reload',
+     name='Daily Data Reload',
      replace_existing=True
  )
  scheduler.start()
  print(f"\n{'='*80}")
+ print(f"Scheduler initialized successfully")
+ print(f"Reload schedule: Daily at 12:00 AM UTC")
+ print(f"On startup: Loads cached data from HuggingFace on demand")
  print(f"{'='*80}\n")
 
  # Create Gradio interface
  with gr.Blocks(title="SWE Agent Issue Leaderboard", theme=gr.themes.Soft()) as app:
+     gr.Markdown("# SWE Agent Issue Leaderboard")
      gr.Markdown(f"Track and compare GitHub issue resolution statistics for SWE agents")
 
      with gr.Tabs():
 
          # Leaderboard Tab
+         with gr.Tab("Leaderboard"):
+             gr.Markdown("*Statistics are based on agent issue resolution activity tracked by the system*")
              leaderboard_table = Leaderboard(
+                 value=pd.DataFrame(columns=[col[0] for col in LEADERBOARD_COLUMNS]),  # Empty initially
                  datatype=LEADERBOARD_COLUMNS,
                  search_columns=["Agent Name", "Website"],
                  filter_columns=[
                  ]
              )
 
+             # Load leaderboard data when app starts
+             app.load(
+                 fn=get_leaderboard_dataframe,
+                 inputs=[],
+                 outputs=[leaderboard_table]
+             )
+
+             # Monthly Metrics Section
+             gr.Markdown("---")  # Divider
+             gr.Markdown("### Monthly Performance - Top 5 Agents")
+             gr.Markdown("*Shows resolved rate trends and issue volumes for the most active agents*")
+
+             monthly_metrics_plot = gr.Plot(label="Monthly Metrics")
 
+             # Load monthly metrics when app starts
+             app.load(
+                 fn=lambda: create_monthly_metrics_plot(),
+                 inputs=[],
+                 outputs=[monthly_metrics_plot]
              )
 
+
          # Submit Agent Tab
+         with gr.Tab("Submit Agent"):
+
              gr.Markdown("### Submit Your Agent")
+             gr.Markdown("Fill in the details below to add your agent to the leaderboard.")
+
              with gr.Row():
                  with gr.Column():
                      github_input = gr.Textbox(
                          label="GitHub Identifier*",
+                         placeholder="Your agent username (e.g., my-agent[bot])"
                      )
                      name_input = gr.Textbox(
                          label="Agent Name*",
                          placeholder="Your agent's display name"
                      )
+
                  with gr.Column():
+                     organization_input = gr.Textbox(
+                         label="Organization*",
+                         placeholder="Your organization or team name"
                      )
                      website_input = gr.Textbox(
+                         label="Website*",
                          placeholder="https://your-agent-website.com"
                      )
+
              submit_button = gr.Button(
                  "Submit Agent",
                  variant="primary"
                  label="Submission Status",
                  interactive=False
              )
+
              # Event handler
              submit_button.click(
                  fn=submit_agent,
+                 inputs=[github_input, name_input, organization_input, website_input],
+                 outputs=[submission_status, leaderboard_table]
              )
 
 
  # Launch application
  if __name__ == "__main__":
+     app.launch()
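After this commit, app.py computes nothing itself: it only renders whatever `swe-issue.json` in `SWE-Arena/leaderboard_metadata` already contains, and msr.py (further below) is the producer of that file. The sketch that follows is a minimal, hypothetical illustration of the payload shape implied by the diff (`leaderboard`, `monthly_metrics`, and `metadata` keys); the agent entry, the numbers, and the column labels are invented for the example, not taken from the dataset.

```python
import pandas as pd

# Hypothetical swe-issue.json payload, shaped like the one app.py consumes.
payload = {
    "leaderboard": {
        "example-agent[bot]": {           # keyed by github_identifier
            "name": "Example Agent",
            "website": "https://example.com",
            "total_issues": 120,
            "resolved_issues": 80,
            "resolved_rate": 66.67,       # percent, as written by the miner
        }
    },
    "monthly_metrics": {
        "agents": ["Example Agent"],
        "months": ["2025-01", "2025-02"],
        "data": {"Example Agent": {"resolved_rates": [60.0, 72.5],
                                   "total_issues": [50, 70]}},
    },
    "metadata": {"last_updated": "2025-02-01T00:00:00+00:00"},
}

# Same transformation get_leaderboard_dataframe() applies: drop zero-issue
# agents, keep the display fields, and sort by total issues descending.
rows = [
    [v["name"], v["website"], v["total_issues"], v["resolved_issues"], v["resolved_rate"]]
    for v in payload["leaderboard"].values()
    if v.get("total_issues", 0) > 0
]
columns = ["Agent Name", "Website", "Total Issues", "Resolved Issues", "Resolved Rate"]
print(pd.DataFrame(rows, columns=columns).sort_values("Total Issues", ascending=False))
```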
docker-compose.yml ADDED
@@ -0,0 +1,23 @@
+ services:
+   msr-miner:
+     build:
+       context: .
+       dockerfile: Dockerfile
+     container_name: gharchive-miner
+     restart: unless-stopped
+     env_file:
+       - .env
+     volumes:
+       # Mount entire workspace for live code updates
+       - .:/app
+       # Mount gharchive workspace for data storage
+       - ../gharchive:/gharchive:ro
+       # Mount bot data for agent repository storage
+       - ../bot_data:/bot_data:ro
+     environment:
+       - PYTHONUNBUFFERED=1
+     logging:
+       driver: "json-file"
+       options:
+         max-size: "10m"
+         max-file: "3"
msr.py CHANGED
@@ -1,18 +1,19 @@
- """
- Minimalist Issue Metadata Mining Script
- Mines issue metadata from GitHub Archive via BigQuery and saves to HuggingFace dataset.
- """
-
  import json
  import os
- import tempfile
  from datetime import datetime, timezone, timedelta
  from collections import defaultdict
  from huggingface_hub import HfApi, hf_hub_download
  from huggingface_hub.errors import HfHubHTTPError
  from dotenv import load_dotenv
- from google.cloud import bigquery
  import backoff
 
  # Load environment variables
  load_dotenv()
@@ -21,75 +22,39 @@ load_dotenv()
  # CONFIGURATION
  # =============================================================================
 
- AGENTS_REPO = "SWE-Arena/bot_metadata"
- ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
- LEADERBOARD_REPO = "SWE-Arena/leaderboard_metadata"
- LEADERBOARD_TIME_FRAME_DAYS = 180  # Time frame for leaderboard
 
- # =============================================================================
- # HUGGINGFACE API WRAPPERS WITH BACKOFF
- # =============================================================================
 
- def is_rate_limit_error(e):
-     """Check if the exception is a rate limit error (429)."""
-     return isinstance(e, HfHubHTTPError) and e.response.status_code == 429
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def list_repo_files_with_backoff(api, **kwargs):
-     """List repo files with exponential backoff on rate limit errors."""
-     return api.list_repo_files(**kwargs)
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def hf_hub_download_with_backoff(**kwargs):
-     """Download from HF Hub with exponential backoff on rate limit errors."""
-     return hf_hub_download(**kwargs)
 
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def upload_file_with_backoff(api, **kwargs):
-     """Upload file with exponential backoff on rate limit errors."""
-     return api.upload_file(**kwargs)
-
- @backoff.on_exception(
-     backoff.expo,
-     HfHubHTTPError,
-     giveup=lambda e: not is_rate_limit_error(e),
-     max_tries=8,
-     base=300,
-     max_value=3600,
-     jitter=backoff.full_jitter,
-     on_backoff=lambda details: print(f"  ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
- )
- def upload_folder_with_backoff(api, **kwargs):
-     """Upload folder with exponential backoff on rate limit errors."""
-     return api.upload_folder(**kwargs)
 
  # =============================================================================
  # UTILITY FUNCTIONS
@@ -116,7 +81,32 @@ def save_jsonl(filename, data):
      """Save list of dictionaries to JSONL file."""
      with open(filename, 'w', encoding='utf-8') as f:
          for item in data:
-             f.write(json.dumps(item) + '\n')
 
 
  def get_hf_token():
@@ -127,581 +117,498 @@ def get_hf_token():
      return token
 
 
- def get_bigquery_client():
-     """
-     Initialize BigQuery client using credentials from environment variable.
 
-     Expects GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable containing
-     the service account JSON credentials as a string.
-     """
-     # Get the JSON content from environment variable
-     creds_json = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS_JSON')
 
-     if creds_json:
-         # Create a temporary file to store credentials
-         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
-             temp_file.write(creds_json)
-             temp_path = temp_file.name
 
-         # Set environment variable to point to temp file
-         os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = temp_path
 
-         # Initialize BigQuery client
-         client = bigquery.Client()
 
-         # Clean up temp file
-         os.unlink(temp_path)
 
-         return client
-     else:
-         raise ValueError("GOOGLE_APPLICATION_CREDENTIALS_JSON not found in environment")
 
 
- def generate_table_union_statements(start_date, end_date):
-     """
-     Generate UNION ALL statements for githubarchive.month tables in date range.
 
-     Args:
-         start_date: Start datetime
-         end_date: End datetime
 
-     Returns:
-         String with UNION ALL SELECT statements for all monthly tables in range
-     """
-     table_names = []
 
-     # Start from the beginning of start_date's month
-     current_date = start_date.replace(day=1)
-     end_month = end_date.replace(day=1)
 
-     while current_date <= end_month:
-         table_name = f"`githubarchive.month.{current_date.strftime('%Y%m')}`"
-         table_names.append(table_name)
 
-         # Move to next month
-         if current_date.month == 12:
-             current_date = current_date.replace(year=current_date.year + 1, month=1)
-         else:
-             current_date = current_date.replace(month=current_date.month + 1)
 
-     # Create UNION ALL chain
-     union_parts = [f"SELECT * FROM {table}" for table in table_names]
-     return " UNION ALL ".join(union_parts)
 
 
  # =============================================================================
- # BIGQUERY FUNCTIONS
  # =============================================================================
 
- def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True):
-     """
-     Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
 
-     Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
-     and correlated subqueries. Each batch query runs much faster than one massive query.
 
-     Args:
-         client: BigQuery client instance
-         identifiers: List of GitHub usernames/bot identifiers
-         start_date: Start datetime (timezone-aware)
-         end_date: End datetime (timezone-aware)
-         batch_size: Number of agents per batch (default: 100)
-         upload_immediately: Upload results to HuggingFace immediately after each batch (default: True)
 
-     Returns:
-         Dictionary mapping agent identifier to list of issue metadata
-     """
-     print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
-     print(f"   Batch size: {batch_size} agents per query")
-     print(f"   Upload mode: {'Immediate (after each batch)' if upload_immediately else 'Deferred (after all batches)'}")
 
-     # Split identifiers into batches
-     batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
-     print(f"   Total batches: {len(batches)}")
 
-     # Collect results from all batches
-     all_metadata = {}
 
-     for batch_num, batch_identifiers in enumerate(batches, 1):
-         print(f"\n{'─'*80}")
-         print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
-         print(f"{'─'*80}")
 
-         try:
-             batch_results = fetch_all_issue_metadata_single_query(
-                 client, batch_identifiers, start_date, end_date
-             )
 
-             # Merge results
-             for identifier, metadata_list in batch_results.items():
-                 if identifier in all_metadata:
-                     all_metadata[identifier].extend(metadata_list)
-                 else:
-                     all_metadata[identifier] = metadata_list
 
-             print(f"   ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
 
-             # Upload immediately after this batch if enabled
-             if upload_immediately and batch_results:
-                 print(f"\n   🤗 Uploading batch {batch_num}/{len(batches)} results to HuggingFace...")
-                 upload_success = 0
-                 upload_errors = 0
 
-                 for identifier, metadata_list in batch_results.items():
-                     if metadata_list:
-                         if save_issue_metadata_to_hf(metadata_list, identifier):
-                             upload_success += 1
-                         else:
-                             upload_errors += 1
 
-                 print(f"   ✓ Batch {batch_num}/{len(batches)} upload complete ({upload_success} agents uploaded, {upload_errors} errors)")
 
-         except Exception as e:
-             print(f"   ✗ Batch {batch_num} failed: {str(e)}")
-             print(f"   Continuing with remaining batches...")
-             import traceback
-             traceback.print_exc()
-             continue
 
-     print(f"\n{'='*80}")
-     print(f"  All batches completed!")
-     print(f"   Total agents with data: {len(all_metadata)}")
-     total_issues = sum(len(issues) for issues in all_metadata.values())
-     print(f"   Total issues found: {total_issues}")
-     print(f"{'='*80}\n")
 
-     return all_metadata
 
 
- def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
      """
-     Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
 
-     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
-     deduplicates to get the latest state of each issue. Filters by issue author,
-     commenter, or assignee.
 
-     NOTE: This function is designed for smaller batches (~100 agents). For large
-     numbers of agents, use fetch_issue_metadata_batched() instead.
 
      Args:
-         client: BigQuery client instance
-         identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
          start_date: Start datetime (timezone-aware)
          end_date: End datetime (timezone-aware)
 
      Returns:
-         Dictionary mapping agent identifier to list of issue metadata:
-         {
-             'agent-identifier': [
-                 {
-                     'url': Issue URL,
-                     'created_at': Issue creation timestamp,
-                     'closed_at': Close timestamp (if closed, else None),
-                     'state_reason': Reason for closure (completed/not_planned/etc.)
-                 },
-                 ...
-             ],
-             ...
-         }
-     """
-     print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
-     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
-
-     # Generate table UNION statements for issue events
-     issue_tables = generate_table_union_statements(start_date, end_date)
-
-     # Build identifier list (handle both bot and non-bot versions)
-     identifier_set = set()
-     for id in identifiers:
-         identifier_set.add(id)
-         # Also add stripped version without [bot] suffix
-         stripped = id.replace('[bot]', '')
-         if stripped != id:
-             identifier_set.add(stripped)
-
-     # Convert to array literal for UNNEST (avoids query size limits from large IN clauses)
-     identifier_array = '[' + ', '.join([f'"{id}"' for id in identifier_set]) + ']'
-
-     print(f"   Total identifiers (including bot/non-bot variants): {len(identifier_set)}")
-
-     # Build comprehensive query with CTEs using UNNEST instead of large IN clauses
-     query = f"""
-     WITH agent_identifiers AS (
-         -- Create a table from the identifier array to avoid massive IN clauses
-         SELECT identifier
-         FROM UNNEST({identifier_array}) AS identifier
-     ),
-
-     issue_events AS (
-         -- Get all issue events and comment events for ALL agents
-         SELECT
-             JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') as url,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.created_at') as created_at,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.closed_at') as closed_at,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.state_reason') as state_reason,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') as author,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') as assignee,
-             JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') as commenter,
-             JSON_EXTRACT_SCALAR(payload, '$.issue.number') as issue_number,
-             repo.name as repo_name,
-             created_at as event_time
-         FROM (
-             {issue_tables}
-         )
-         WHERE
-             type IN ('IssuesEvent', 'IssueCommentEvent')
-             -- Exclude pull requests (they have pull_request field)
-             AND JSON_EXTRACT(payload, '$.issue.pull_request') IS NULL
-             AND JSON_EXTRACT_SCALAR(payload, '$.issue.html_url') IS NOT NULL
-             -- Filter by author OR commenter OR assignee using JOIN instead of IN
-             AND (
-                 JSON_EXTRACT_SCALAR(payload, '$.issue.user.login') IN (SELECT identifier FROM agent_identifiers)
-                 OR JSON_EXTRACT_SCALAR(payload, '$.comment.user.login') IN (SELECT identifier FROM agent_identifiers)
-                 OR JSON_EXTRACT_SCALAR(payload, '$.issue.assignee.login') IN (SELECT identifier FROM agent_identifiers)
-             )
-     ),
-
-     latest_states AS (
-         -- Deduplicate to get latest state for each issue
-         SELECT
-             url,
-             created_at,
-             closed_at,
-             state_reason,
-             author,
-             assignee,
-             commenter
-         FROM issue_events
-         QUALIFY ROW_NUMBER() OVER (
-             PARTITION BY repo_name, issue_number
-             ORDER BY event_time DESC
-         ) = 1
-     ),
-
-     agent_issues AS (
-         -- Map each issue to its relevant agent(s)
-         SELECT DISTINCT
-             CASE
-                 WHEN author IN (SELECT identifier FROM agent_identifiers) THEN author
-                 WHEN commenter IN (SELECT identifier FROM agent_identifiers) THEN commenter
-                 WHEN assignee IN (SELECT identifier FROM agent_identifiers) THEN assignee
-                 ELSE NULL
-             END as agent_identifier,
-             url,
-             created_at,
-             closed_at,
-             state_reason
-         FROM latest_states
-         WHERE
-             author IN (SELECT identifier FROM agent_identifiers)
-             OR commenter IN (SELECT identifier FROM agent_identifiers)
-             OR assignee IN (SELECT identifier FROM agent_identifiers)
-     )
-
-     SELECT
-         agent_identifier,
-         url,
-         created_at,
-         closed_at,
-         state_reason
-     FROM agent_issues
-     WHERE agent_identifier IS NOT NULL
-     ORDER BY agent_identifier, created_at DESC
      """
 
-     # Calculate number of days for reporting
-     query_days = (end_date - start_date).days
-
-     print(f"   Querying {query_days} days for issue and comment events...")
-     print(f"   Agents: {', '.join(identifiers[:5])}{'...' if len(identifiers) > 5 else ''}")
-
-     try:
-         query_job = client.query(query)
-         results = list(query_job.result())
-
-         print(f"   ✓ Found {len(results)} total issue records across all agents")
-
-         # Group results by agent
-         metadata_by_agent = defaultdict(list)
-
-         for row in results:
-             agent_id = row.agent_identifier
-
-             # Convert datetime objects to ISO strings
-             created_at = row.created_at
-             if hasattr(created_at, 'isoformat'):
-                 created_at = created_at.isoformat()
-
-             closed_at = row.closed_at
-             if hasattr(closed_at, 'isoformat'):
-                 closed_at = closed_at.isoformat()
-
-             metadata_by_agent[agent_id].append({
-                 'url': row.url,
-                 'created_at': created_at,
-                 'closed_at': closed_at,
-                 'state_reason': row.state_reason,
-             })
-
-         # Print breakdown by agent
-         print(f"\n   📊 Results breakdown by agent:")
-         for identifier in identifiers:
-             # Check both original and stripped versions
-             count = len(metadata_by_agent.get(identifier, []))
-             stripped = identifier.replace('[bot]', '')
-             if stripped != identifier:
-                 count += len(metadata_by_agent.get(stripped, []))
-
-             if count > 0:
-                 # Merge both versions if needed
-                 all_metadata = metadata_by_agent.get(identifier, []) + metadata_by_agent.get(stripped, [])
-                 completed_count = sum(1 for m in all_metadata if m['state_reason'] == 'completed')
-                 closed_count = sum(1 for m in all_metadata if m['closed_at'] is not None)
-                 open_count = count - closed_count
-                 print(f"      {identifier}: {count} issues ({completed_count} completed, {closed_count} closed, {open_count} open)")
-
-         # Convert defaultdict to regular dict and merge bot/non-bot versions
-         final_metadata = {}
-         for identifier in identifiers:
-             combined = metadata_by_agent.get(identifier, [])
-             stripped = identifier.replace('[bot]', '')
-             if stripped != identifier and stripped in metadata_by_agent:
-                 combined.extend(metadata_by_agent[stripped])
-
-             if combined:
-                 final_metadata[identifier] = combined
-
-         return final_metadata
 
-     except Exception as e:
-         print(f"   ✗ BigQuery error: {str(e)}")
-         import traceback
-         traceback.print_exc()
-         return {}
 
- # =============================================================================
- # HUGGINGFACE STORAGE FUNCTIONS
- # =============================================================================
 
- def group_metadata_by_date(metadata_list):
-     """
-     Group issue metadata by exact date (year.month.day) for efficient daily storage.
-     Returns dict: {(year, month, day): [metadata_list]}
-     """
-     grouped = defaultdict(list)
 
-     for issue_meta in metadata_list:
-         created_at = issue_meta.get('created_at')
-         if not created_at:
              continue
 
-         try:
-             dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
-             key = (dt.year, dt.month, dt.day)
-             grouped[key].append(issue_meta)
-         except Exception as e:
-             print(f"Warning: Could not parse date '{created_at}': {e}")
 
-     return dict(grouped)
 
 
- def save_issue_metadata_to_hf(metadata_list, agent_identifier):
-     """
-     Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
-     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
-     This function OVERWRITES existing files completely with fresh data from BigQuery.
-     Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
-     Args:
-         metadata_list: List of issue metadata dictionaries
-         agent_identifier: GitHub identifier of the agent (used as folder name)
-     """
-     import shutil
 
-     try:
-         token = get_hf_token()
-         if not token:
-             raise Exception("No HuggingFace token found")
 
-         api = HfApi(token=token)
 
-         # Group by date (year, month, day)
-         grouped = group_metadata_by_date(metadata_list)
 
-         if not grouped:
-             print(f"  No valid metadata to save for {agent_identifier}")
-             return False
 
-         # Create a temporary directory for batch upload
-         temp_dir = tempfile.mkdtemp()
-         agent_folder = os.path.join(temp_dir, agent_identifier)
-         os.makedirs(agent_folder, exist_ok=True)
 
-         try:
-             print(f"  📦 Preparing batch upload for {len(grouped)} daily files...")
-
-             # Process each daily file
-             for (issue_year, month, day), day_metadata in grouped.items():
-                 filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
-                 local_filename = os.path.join(agent_folder, f"{issue_year}.{month:02d}.{day:02d}.jsonl")
-
-                 # Sort by created_at for better organization
-                 day_metadata.sort(key=lambda x: x.get('created_at', ''), reverse=True)
-
-                 # Save to temp directory (complete overwrite, no merging)
-                 save_jsonl(local_filename, day_metadata)
-                 print(f"    Prepared {len(day_metadata)} issues for {filename}")
-
-             # Upload entire folder using upload_folder (single commit per agent)
-             print(f"  🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-             upload_folder_with_backoff(
-                 api,
-                 folder_path=temp_dir,
-                 repo_id=ISSUE_METADATA_REPO,
-                 repo_type="dataset",
-                 commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
-             )
-             print(f"  ✓ Batch upload complete for {agent_identifier}")
 
              return True
-
-         finally:
-             # Always clean up temp directory
-             if os.path.exists(temp_dir):
-                 shutil.rmtree(temp_dir)
-
      except Exception as e:
-         print(f"Error saving issue metadata: {str(e)}")
-         import traceback
-         traceback.print_exc()
-         return False
 
 
  def load_agents_from_hf():
      """
-     Load all agent metadata JSON files from HuggingFace dataset.
-
-     The github_identifier is extracted from the filename (e.g., 'agent-name[bot].json' -> 'agent-name[bot]')
      """
-     try:
-         api = HfApi()
-         agents = []
 
-         # List all files in the repository
-         files = list_repo_files_with_backoff(api, repo_id=AGENTS_REPO, repo_type="dataset")
 
-         # Filter for JSON files only
-         json_files = [f for f in files if f.endswith('.json')]
 
-         print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
 
-         # Download and parse each JSON file
-         for json_file in json_files:
-             try:
-                 file_path = hf_hub_download_with_backoff(
-                     repo_id=AGENTS_REPO,
-                     filename=json_file,
-                     repo_type="dataset"
-                 )
 
-                 with open(file_path, 'r') as f:
                      agent_data = json.load(f)
 
-                 # Only process agents with status == "public"
-                 if agent_data.get('status') != 'public':
-                     continue
 
-                 # Extract github_identifier from filename (remove .json extension)
-                 github_identifier = json_file.replace('.json', '')
-                 agent_data['github_identifier'] = github_identifier
 
-                 agents.append(agent_data)
 
              except Exception as e:
-                 print(f"Warning: Could not load {json_file}: {str(e)}")
                  continue
 
-     print(f"✓ Loaded {len(agents)} agents from HuggingFace")
-     return agents
-
-     except Exception as e:
-         print(f"Could not load agents from HuggingFace: {str(e)}")
-         return []
-
 
- # =============================================================================
- # LEADERBOARD CALCULATION FUNCTIONS
- # =============================================================================
 
  def calculate_issue_stats_from_metadata(metadata_list):
-     """
-     Calculate statistics from a list of issue metadata.
-
-     Returns:
-         dict: Issue statistics including total, closed, resolved counts and rate
-     """
      total_issues = len(metadata_list)
-
-     # Count closed issues (those with closed_at timestamp)
-     closed_issues = sum(1 for issue_meta in metadata_list
-                         if issue_meta.get('closed_at') is not None)
-
-     # Count completed issues (subset of closed issues with state_reason="completed")
-     completed = sum(1 for issue_meta in metadata_list
                      if issue_meta.get('state_reason') == 'completed')
 
-     # Calculate resolved rate as: completed / closed (not completed / total)
-     resolved_rate = (completed / closed_issues * 100) if closed_issues > 0 else 0
 
      return {
          'total_issues': total_issues,
-         'closed_issues': closed_issues,
-         'resolved_issues': completed,
          'resolved_rate': round(resolved_rate, 2),
      }
 
 
- def calculate_monthly_metrics(all_metadata, agents):
-     """
-     Calculate monthly metrics for all agents for visualization.
-
-     Args:
-         all_metadata: Dictionary mapping agent_identifier to list of issue metadata
-         agents: List of agent dictionaries with metadata
 
-     Returns:
-         dict: {
-             'agents': list of agent names,
-             'months': list of month labels (e.g., '2025-01'),
-             'data': {
-                 agent_name: {
-                     'resolved_rates': list of resolved rates by month,
-                     'total_issues': list of issue counts by month,
-                     'resolved_issues': list of resolved issue counts by month
-                 }
-             }
-         }
-     """
-     # Create mapping from agent_identifier to agent_name
-     identifier_to_name = {
-         agent.get('github_identifier'): agent.get('name', agent.get('name', agent.get('github_identifier')))
-         for agent in agents if agent.get('github_identifier')
-     }
 
-     # Group by agent and month
      agent_month_data = defaultdict(lambda: defaultdict(list))
 
-     for identifier, metadata_list in all_metadata.items():
-         agent_name = identifier_to_name.get(identifier, identifier)
-
          for issue_meta in metadata_list:
              created_at = issue_meta.get('created_at')
              if not created_at:
                  continue
 
              try:
                  dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
                  month_key = f"{dt.year}-{dt.month:02d}"
@@ -710,42 +617,38 @@ def calculate_monthly_metrics(all_metadata, agents):
                  print(f"Warning: Could not parse date '{created_at}': {e}")
                  continue
 
-     # Get all unique months and sort them
      all_months = set()
      for agent_data in agent_month_data.values():
          all_months.update(agent_data.keys())
      months = sorted(list(all_months))
 
-     # Calculate metrics for each agent and month
      result_data = {}
      for agent_name, month_dict in agent_month_data.items():
          resolved_rates = []
          total_issues_list = []
          resolved_issues_list = []
 
          for month in months:
              issues_in_month = month_dict.get(month, [])
 
-             # Count completed issues (those with state_reason="completed")
-             completed_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
-
-             # Count closed issues (those with closed_at timestamp)
-             closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at') is not None)
-
-             # Total issues created in this month
              total_count = len(issues_in_month)
 
-             # Calculate resolved rate as: completed / closed (not completed / total)
-             resolved_rate = (completed_count / closed_count * 100) if closed_count > 0 else None
 
              resolved_rates.append(resolved_rate)
              total_issues_list.append(total_count)
-             resolved_issues_list.append(completed_count)
 
          result_data[agent_name] = {
              'resolved_rates': resolved_rates,
              'total_issues': total_issues_list,
-             'resolved_issues': resolved_issues_list
          }
 
      agents_list = sorted(list(agent_month_data.keys()))
@@ -757,168 +660,175 @@ def calculate_monthly_metrics(all_metadata, agents):
      }
 
 
- def save_leaderboard_and_metrics_to_hf(all_metadata, agents):
-     """
-     Creates a comprehensive JSON file with both leaderboard stats and monthly metrics.
-     If the file exists, it will be overwritten.
 
-     Args:
-         all_metadata: Dictionary mapping agent_identifier to list of issue metadata
-         agents: List of agent dictionaries with metadata
 
-     Returns:
-         bool: True if successful, False otherwise
-     """
-     import io
 
      try:
          token = get_hf_token()
          if not token:
              raise Exception("No HuggingFace token found")
 
          api = HfApi(token=token)
 
-         print(f"\n{'='*80}")
-         print(f"📊 Preparing leaderboard and metrics data for upload...")
-         print(f"{'='*80}\n")
-
-         # Build leaderboard data
-         print("  Constructing leaderboard data...")
-         leaderboard_data = {}
-
-         for agent in agents:
-             identifier = agent.get('github_identifier')
-             agent_name = agent.get('name', 'Unknown')
-
-             if not identifier:
-                 continue
-
-             metadata = all_metadata.get(identifier, [])
-             stats = calculate_issue_stats_from_metadata(metadata)
-
-             leaderboard_data[identifier] = {
-                 'name': agent_name,
-                 'website': agent.get('website', 'N/A'),
-                 'github_identifier': identifier,
-                 **stats
-             }
-
-         # Get monthly metrics data
-         print("  Calculating monthly metrics...")
-         monthly_metrics = calculate_monthly_metrics(all_metadata, agents)
-
-         # Combine into a single structure
          combined_data = {
-             "leaderboard": leaderboard_data,
-             "monthly_metrics": monthly_metrics,
-             "metadata": {
-                 "last_updated": datetime.now(timezone.utc).isoformat(),
-                 "time_frame_days": LEADERBOARD_TIME_FRAME_DAYS,
-                 "total_agents": len(leaderboard_data)
              }
          }
 
-         print(f"   Leaderboard entries: {len(leaderboard_data)}")
-         print(f"   Monthly metrics for: {len(monthly_metrics['agents'])} agents")
-         print(f"   Time frame: {LEADERBOARD_TIME_FRAME_DAYS} days")
-
-         # Convert to JSON and create file-like object
-         json_content = json.dumps(combined_data, indent=2)
-         file_like_object = io.BytesIO(json_content.encode('utf-8'))
-
-         # Upload to HuggingFace (will overwrite if exists)
-         print(f"\n🤗 Uploading to {LEADERBOARD_REPO}...")
-         upload_file_with_backoff(
-             api,
-             path_or_fileobj=file_like_object,
-             path_in_repo="swe-issue.json",
-             repo_id=LEADERBOARD_REPO,
-             repo_type="dataset",
-             token=token,
-             commit_message=f"Update leaderboard data - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
-         )
-
-         print(f"   ✓ Successfully uploaded swe-issue.json")
-         print(f"{'='*80}\n")
 
-         return True
 
      except Exception as e:
-         print(f"Error saving leaderboard and metrics: {str(e)}")
          import traceback
          traceback.print_exc()
          return False
 
 
  # =============================================================================
- # MAIN MINING FUNCTION
  # =============================================================================
 
  def mine_all_agents():
      """
-     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
-     Uses ONE BigQuery query for ALL agents (most efficient approach).
      """
-     # Load agent metadata from HuggingFace
      agents = load_agents_from_hf()
      if not agents:
-         print("No agents found in HuggingFace dataset")
          return
 
-     # Extract all identifiers
      identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
870
  if not identifiers:
871
- print("No valid agent identifiers found")
872
  return
873
 
874
- print(f"\n{'='*80}")
875
- print(f"Starting issue metadata mining for {len(identifiers)} agents")
876
- print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
877
- print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
878
- print(f"{'='*80}\n")
879
 
880
- # Initialize BigQuery client
881
  try:
882
- client = get_bigquery_client()
883
  except Exception as e:
884
- print(f"Failed to initialize BigQuery client: {str(e)}")
885
  return
886
 
887
- # Define time range: past LEADERBOARD_TIME_FRAME_DAYS (excluding today)
888
  current_time = datetime.now(timezone.utc)
889
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
890
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
891
 
892
  try:
893
- # Use batched approach for better performance
894
- # upload_immediately=True means each batch uploads to HuggingFace right after BigQuery completes
895
- all_metadata = fetch_issue_metadata_batched(
896
- client, identifiers, start_date, end_date, batch_size=100, upload_immediately=True
897
  )
898
 
899
- # Calculate summary statistics
900
- total_prs = sum(len(metadata_list) for metadata_list in all_metadata.values())
901
- agents_with_data = sum(1 for metadata_list in all_metadata.values() if metadata_list)
 
 
 
 
 
 
902
 
903
- print(f"\n{'='*80}")
904
- print(f"✅ BigQuery mining and upload complete!")
905
- print(f" Total agents: {len(agents)}")
906
- print(f" Agents with data: {agents_with_data}")
907
- print(f" Total PRs found: {total_prs}")
908
- print(f"{'='*80}\n")
909
 
910
  except Exception as e:
911
- print(f"Error during BigQuery fetch: {str(e)}")
912
  import traceback
913
  traceback.print_exc()
914
- return
915
 
916
- # After mining is complete, save leaderboard and metrics to HuggingFace
917
- print(f"📤 Uploading leaderboard and metrics data...")
918
- if save_leaderboard_and_metrics_to_hf(all_metadata, agents):
919
- print(f"✓ Leaderboard and metrics successfully uploaded to {LEADERBOARD_REPO}")
920
- else:
921
- print(f"⚠️ Failed to upload leaderboard and metrics data")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
 
923
 
924
  # =============================================================================
@@ -926,4 +836,7 @@ def mine_all_agents():
926
  # =============================================================================
927
 
928
  if __name__ == "__main__":
929
- mine_all_agents()
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os
3
+ import time
4
  from datetime import datetime, timezone, timedelta
5
  from collections import defaultdict
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.errors import HfHubHTTPError
9
  from dotenv import load_dotenv
10
+ import duckdb
11
  import backoff
12
+ import requests
13
+ import requests.exceptions
14
+ from apscheduler.schedulers.blocking import BlockingScheduler
15
+ from apscheduler.triggers.cron import CronTrigger
16
+ import logging
17
 
18
  # Load environment variables
19
  load_dotenv()
 
22
  # CONFIGURATION
23
  # =============================================================================
24
 
25
+ AGENTS_REPO = "SWE-Arena/bot_data"
26
+ AGENTS_REPO_LOCAL_PATH = os.path.expanduser("~/bot_data") # Local git clone path
27
+ DUCKDB_CACHE_FILE = "cache.duckdb"
28
+ GHARCHIVE_DATA_LOCAL_PATH = os.path.expanduser("~/gharchive/data")
29
+ LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"
30
+ LEADERBOARD_TIME_FRAME_DAYS = 180
31
 
32
+ # Git sync configuration (mandatory to get latest bot data)
33
+ GIT_SYNC_TIMEOUT = 300 # 5-minute timeout for git pull
 
34
 
35
+ # OPTIMIZED DUCKDB CONFIGURATION
36
+ DUCKDB_THREADS = 8
37
+ DUCKDB_MEMORY_LIMIT = "64GB"
38
 
39
+ # Streaming batch configuration
40
+ BATCH_SIZE_DAYS = 7 # Process 1 week at a time (~168 hourly files)
41
+ # At this size: ~7 days × 24 files × ~100MB per file = ~16GB uncompressed per batch
42
 
43
+ # Download configuration
44
+ DOWNLOAD_WORKERS = 4
45
+ DOWNLOAD_RETRY_DELAY = 2
46
+ MAX_RETRIES = 5
 
 
 
 
 
 
 
 
 
47
 
48
+ # Upload configuration
49
+ UPLOAD_DELAY_SECONDS = 5
50
+ UPLOAD_MAX_BACKOFF = 3600
 
 
 
 
 
 
 
 
 
 
51
 
52
+ # Scheduler configuration
53
+ SCHEDULE_ENABLED = True
54
+ SCHEDULE_DAY_OF_WEEK = 'sun' # Sunday
55
+ SCHEDULE_HOUR = 0
56
+ SCHEDULE_MINUTE = 0
57
+ SCHEDULE_TIMEZONE = 'UTC'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  # =============================================================================
60
  # UTILITY FUNCTIONS
 
81
  """Save list of dictionaries to JSONL file."""
82
  with open(filename, 'w', encoding='utf-8') as f:
83
  for item in data:
84
+ f.write(json.dumps(item) + '\n')
85
+
86
+
87
+ def normalize_date_format(date_string):
88
+ """Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
89
+ if not date_string or date_string == 'N/A':
90
+ return 'N/A'
91
+
92
+ try:
93
+ import re
94
+
95
+ if isinstance(date_string, datetime):
96
+ return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
97
+
98
+ date_string = re.sub(r'\s+', ' ', date_string.strip())
99
+ date_string = date_string.replace(' ', 'T')
100
+
101
+ if len(date_string) >= 3:
102
+ if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
103
+ date_string = date_string + ':00'
104
+
105
+ dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
106
+ return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
107
+ except Exception as e:
108
+ print(f"Warning: Could not parse date '{date_string}': {e}")
109
+ return date_string
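# Illustrative sketch of the normalizer above (sample values are hypothetical, not taken
# from GHArchive): ISO strings with a trailing 'Z', space-separated timestamps, and
# datetime objects should all collapse to the same '%Y-%m-%dT%H:%M:%SZ' form.
_normalize_samples = [
    "2025-01-15T08:30:00Z",           # ISO 8601 with Z suffix
    "2025-01-15 08:30:00",            # space-separated, no timezone
    datetime(2025, 1, 15, 8, 30, 0),  # plain datetime object
]
assert {normalize_date_format(s) for s in _normalize_samples} == {"2025-01-15T08:30:00Z"}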
110
 
111
 
112
  def get_hf_token():
 
117
  return token
118
 
119
 
120
+ # =============================================================================
121
+ # GHARCHIVE DOWNLOAD FUNCTIONS
122
+ # =============================================================================
123
 
124
+ def download_file(url):
125
+ """Download a GHArchive file with retry logic."""
126
+ filename = url.split("/")[-1]
127
+ filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
 
128
 
129
+ if os.path.exists(filepath):
130
+ return True
 
 
 
131
 
132
+ for attempt in range(MAX_RETRIES):
133
+ try:
134
+ response = requests.get(url, timeout=30)
135
+ response.raise_for_status()
136
+ with open(filepath, "wb") as f:
137
+ f.write(response.content)
138
+ return True
139
 
140
+ except requests.exceptions.HTTPError as e:
141
+ # 404 means the file doesn't exist in GHArchive - skip without retry
142
+ if e.response.status_code == 404:
143
+ if attempt == 0: # Only log once, not for each retry
144
+ print(f" ⚠ {filename}: Not available (404) - skipping")
145
+ return False
146
 
147
+ # Other HTTP errors (5xx, etc.) should be retried
148
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
149
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
150
+ time.sleep(wait_time)
151
 
152
+ except Exception as e:
153
+ # Network errors, timeouts, etc. should be retried
154
+ wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
155
+ print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
156
+ time.sleep(wait_time)
157
 
158
+ return False
159
 
 
 
 
160
 
161
+ def download_all_gharchive_data():
162
+ """Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
163
+ os.makedirs(GHARCHIVE_DATA_LOCAL_PATH, exist_ok=True)
164
 
165
+ end_date = datetime.now()
166
+ start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
 
167
 
168
+ urls = []
169
+ current_date = start_date
170
+ while current_date <= end_date:
171
+ date_str = current_date.strftime("%Y-%m-%d")
172
+ for hour in range(24):
173
+ url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
174
+ urls.append(url)
175
+ current_date += timedelta(days=1)
176
 
177
+ downloads_processed = 0
 
 
178
 
179
+ try:
180
+ with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
181
+ futures = [executor.submit(download_file, url) for url in urls]
182
+ for future in as_completed(futures):
183
+ downloads_processed += 1
184
 
185
+ print(f" Download complete: {downloads_processed} files processed")
186
+ return True
187
+
188
+ except Exception as e:
189
+ print(f"Error during download: {str(e)}")
190
+ import traceback
191
+ traceback.print_exc()
192
+ return False
193
 
194
 
195
  # =============================================================================
196
+ # HUGGINGFACE API WRAPPERS
197
  # =============================================================================
198
 
199
+ def is_retryable_error(e):
200
+ """Check if exception is retryable (rate limit or timeout error)."""
201
+ if isinstance(e, HfHubHTTPError):
202
+ if e.response.status_code == 429:
203
+ return True
204
 
205
+ if isinstance(e, (requests.exceptions.Timeout,
206
+ requests.exceptions.ReadTimeout,
207
+ requests.exceptions.ConnectTimeout)):
208
+ return True
209
 
210
+ if isinstance(e, Exception):
211
+ error_str = str(e).lower()
212
+ if 'timeout' in error_str or 'timed out' in error_str:
213
+ return True
 
 
 
214
 
215
+ return False
216
+
217
+
218
+ @backoff.on_exception(
219
+ backoff.expo,
220
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
221
+ max_tries=MAX_RETRIES,
222
+ base=300,
223
+ max_value=3600,
224
+ giveup=lambda e: not is_retryable_error(e),
225
+ on_backoff=lambda details: print(
226
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
227
+ )
228
+ )
229
+ def list_repo_files_with_backoff(api, **kwargs):
230
+ """Wrapper for api.list_repo_files() with exponential backoff."""
231
+ return api.list_repo_files(**kwargs)
232
 
 
 
 
233
 
234
+ @backoff.on_exception(
235
+ backoff.expo,
236
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
237
+ max_tries=MAX_RETRIES,
238
+ base=300,
239
+ max_value=3600,
240
+ giveup=lambda e: not is_retryable_error(e),
241
+ on_backoff=lambda details: print(
242
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
243
+ )
244
+ )
245
+ def hf_hub_download_with_backoff(**kwargs):
246
+ """Wrapper for hf_hub_download() with exponential backoff."""
247
+ return hf_hub_download(**kwargs)
248
 
 
 
 
 
249
 
250
+ @backoff.on_exception(
251
+ backoff.expo,
252
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
253
+ max_tries=MAX_RETRIES,
254
+ base=300,
255
+ max_value=3600,
256
+ giveup=lambda e: not is_retryable_error(e),
257
+ on_backoff=lambda details: print(
258
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
259
+ )
260
+ )
261
+ def upload_file_with_backoff(api, **kwargs):
262
+ """Wrapper for api.upload_file() with exponential backoff."""
263
+ return api.upload_file(**kwargs)
264
 
 
 
 
 
 
 
265
 
266
+ @backoff.on_exception(
267
+ backoff.expo,
268
+ (HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
269
+ max_tries=MAX_RETRIES,
270
+ base=300,
271
+ max_value=3600,
272
+ giveup=lambda e: not is_retryable_error(e),
273
+ on_backoff=lambda details: print(
274
+ f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
275
+ )
276
+ )
277
+ def upload_folder_with_backoff(api, **kwargs):
278
+ """Wrapper for api.upload_folder() with exponential backoff."""
279
+ return api.upload_folder(**kwargs)
280
 
 
 
 
 
 
281
 
282
+ def get_duckdb_connection():
283
+ """
284
+ Initialize DuckDB connection with OPTIMIZED memory settings.
285
+ Uses persistent database and reduced memory footprint.
286
+ """
287
+ conn = duckdb.connect(DUCKDB_CACHE_FILE)
288
 
289
+ # OPTIMIZED SETTINGS
290
+ conn.execute(f"SET threads TO {DUCKDB_THREADS};")
291
+ conn.execute("SET preserve_insertion_order = false;")
292
+ conn.execute("SET enable_object_cache = true;")
293
+ conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
294
+ conn.execute(f"SET memory_limit = '{DUCKDB_MEMORY_LIMIT}';") # Per-query limit
295
+ conn.execute(f"SET max_memory = '{DUCKDB_MEMORY_LIMIT}';") # Hard cap
296
 
297
+ return conn
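# Minimal usage sketch for the connection above: peek at one downloaded hour of GHArchive
# data. The file name below is hypothetical; any hour present under
# GHARCHIVE_DATA_LOCAL_PATH works. The read_json options mirror the main query further down.
_conn = get_duckdb_connection()
_sample_hour = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, "2025-01-15-0.json.gz")
_n = _conn.execute(
    f"SELECT count(*) FROM read_json('{_sample_hour}', format='newline_delimited', "
    f"compression='gzip', ignore_errors=true) WHERE type = 'IssuesEvent'"
).fetchone()[0]
print(f"IssuesEvent rows in {_sample_hour}: {_n}")
_conn.close()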
298
+
299
+
300
+ def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LOCAL_PATH):
301
+ """Generate file path patterns for GHArchive data in date range (only existing files)."""
302
+ file_patterns = []
303
+ missing_dates = set()
304
+
305
+ current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
306
+ end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
307
+
308
+ while current_date <= end_day:
309
+ date_has_files = False
310
+ for hour in range(24):
311
+ pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
312
+ if os.path.exists(pattern):
313
+ file_patterns.append(pattern)
314
+ date_has_files = True
315
+
316
+ if not date_has_files:
317
+ missing_dates.add(current_date.strftime('%Y-%m-%d'))
318
+
319
+ current_date += timedelta(days=1)
320
 
321
+ if missing_dates:
322
+ print(f" Skipping {len(missing_dates)} date(s) with no data")
 
 
 
 
323
 
324
+ return file_patterns
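# Worked sketch (hypothetical date): a single day expands to at most 24 hourly paths,
# e.g. ~/gharchive/data/2025-01-15-0.json.gz through 2025-01-15-23.json.gz; hours that
# were never downloaded (e.g. GHArchive 404s) are simply left out of the returned list.
_day = datetime(2025, 1, 15)
_day_paths = generate_file_path_patterns(_day, _day)
print(f"{len(_day_paths)} of 24 hourly files present for {_day.date()}")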
325
 
326
 
327
+ # =============================================================================
328
+ # STREAMING BATCH PROCESSING FOR ISSUES
329
+ # =============================================================================
330
+
331
+ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
332
  """
333
+ OPTIMIZED: Fetch issue metadata using streaming batch processing.
334
+
335
+ Only tracks issues assigned to the agents.
336
 
337
+ Processes GHArchive files in BATCH_SIZE_DAYS chunks to limit memory usage.
338
+ Instead of loading 180 days (4,344 files) at once, processes 7 days at a time.
 
339
 
340
+ This prevents OOM errors by:
341
+ 1. Only keeping ~168 hourly files in memory per batch (vs 4,344)
342
+ 2. Incrementally building the results dictionary
343
+ 3. Allowing DuckDB to garbage collect after each batch
344
 
345
  Args:
346
+ conn: DuckDB connection instance
347
+ identifiers: List of GitHub usernames/bot identifiers (~1500)
348
  start_date: Start datetime (timezone-aware)
349
  end_date: End datetime (timezone-aware)
350
 
351
  Returns:
352
+ Dictionary mapping agent identifier to list of issue metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  """
354
+ identifier_list = ', '.join([f"'{id}'" for id in identifiers])
355
+ metadata_by_agent = defaultdict(list)
356
 
357
+ # Calculate total batches
358
+ total_days = (end_date - start_date).days
359
+ total_batches = (total_days // BATCH_SIZE_DAYS) + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
+ # Process in configurable batches
362
+ current_date = start_date
363
+ batch_num = 0
364
+ total_issues = 0
 
365
 
366
+ print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
367
 
368
+ while current_date <= end_date:
369
+ batch_num += 1
370
+ batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
371
 
372
+ # Get file patterns for THIS BATCH ONLY (not all 180 days)
373
+ file_patterns = generate_file_path_patterns(current_date, batch_end)
 
 
 
 
374
 
375
+ if not file_patterns:
376
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
377
+ current_date = batch_end + timedelta(days=1)
378
  continue
379
 
380
+ # Progress indicator
381
+ print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
382
+
383
+ # Build file patterns SQL for THIS BATCH
384
+ file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
385
+
386
+ # Query for this batch - IssuesEvent filtered by assignee only
387
+ query = f"""
388
+ WITH issue_events AS (
389
+ SELECT
390
+ CONCAT(
391
+ REPLACE(repo.url, 'api.github.com/repos/', 'github.com/'),
392
+ '/issues/',
393
+ CAST(payload.issue.number AS VARCHAR)
394
+ ) as url,
395
+ payload.issue.assignee.login as assignee,
396
+ created_at as event_time,
397
+ payload.issue.created_at as issue_created_at,
398
+ payload.issue.closed_at as issue_closed_at,
399
+ payload.issue.state_reason as state_reason
400
+ FROM read_json({file_patterns_sql}, union_by_name=true, filename=true, compression='gzip', format='newline_delimited', ignore_errors=true, maximum_object_size=2147483648)
401
+ WHERE
402
+ type = 'IssuesEvent'
403
+ AND payload.issue.number IS NOT NULL
404
+ AND payload.issue.pull_request IS NULL
405
+ AND payload.issue.assignee.login IN ({identifier_list})
406
+ ),
407
+ issue_timeline AS (
408
+ SELECT
409
+ url,
410
+ assignee as agent_identifier,
411
+ MIN(issue_created_at) as created_at,
412
+ MAX(issue_closed_at) as closed_at,
413
+ MAX(state_reason) as state_reason
414
+ FROM issue_events
415
+ GROUP BY url, assignee
416
+ )
417
+ SELECT url, agent_identifier, created_at, closed_at, state_reason
418
+ FROM issue_timeline
419
+ WHERE agent_identifier IS NOT NULL AND created_at IS NOT NULL
420
+ """
421
 
422
+ try:
423
+ results = conn.execute(query).fetchall()
424
+ batch_issues = 0
425
+
426
+ # Add results to accumulating dictionary
427
+ for row in results:
428
+ url = row[0]
429
+ agent_identifier = row[1]
430
+ created_at = normalize_date_format(row[2]) if row[2] else None
431
+ closed_at = normalize_date_format(row[3]) if row[3] else None
432
+ state_reason = row[4]
433
+
434
+ if not url or not agent_identifier:
435
+ continue
436
+
437
+ issue_metadata = {
438
+ 'url': url,
439
+ 'created_at': created_at,
440
+ 'closed_at': closed_at,
441
+ 'state_reason': state_reason,
442
+ }
443
 
444
+ metadata_by_agent[agent_identifier].append(issue_metadata)
445
+ batch_issues += 1
446
+ total_issues += 1
447
 
448
+ print(f"✓ {batch_issues} issues found")
 
 
 
449
 
450
+ except Exception as e:
451
+ print(f"\n ✗ Batch {batch_num} error: {str(e)}")
452
+ import traceback
453
+ traceback.print_exc()
454
 
455
+ # Move to next batch
456
+ current_date = batch_end + timedelta(days=1)
 
 
 
457
 
458
+ # Final summary
459
+ agents_with_data = sum(1 for issues in metadata_by_agent.values() if issues)
460
+ print(f"\n ✓ Complete: {total_issues} issues found for {agents_with_data}/{len(identifiers)} agents")
 
461
 
462
+ return dict(metadata_by_agent)
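# Back-of-the-envelope sketch for the defaults above: 180 days in 7-day steps gives
# (180 // 7) + 1 = 26 batches of at most 7 * 24 = 168 hourly files each. The returned
# mapping is keyed by assignee login; the record below uses hypothetical values.
_batches = (LEADERBOARD_TIME_FRAME_DAYS // BATCH_SIZE_DAYS) + 1   # 26
_files_per_batch = BATCH_SIZE_DAYS * 24                           # 168
_example_result = {
    "example-bot[bot]": [{
        "url": "https://github.com/octo-org/octo-repo/issues/42",
        "created_at": "2025-01-15T08:30:00Z",
        "closed_at": None,          # still open
        "state_reason": None,       # becomes "completed" once resolved
    }],
}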
463
 
 
 
464
 
465
+ def sync_agents_repo():
466
+ """
467
+ Sync local bot_data repository with remote using git pull.
468
+ This is MANDATORY to ensure we have the latest bot data.
469
+ Raises exception if sync fails.
470
+ """
471
+ if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
472
+ error_msg = f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}"
473
+ print(f" ✗ {error_msg}")
474
+ print(f" Please clone it first: git clone https://huggingface.co/datasets/{AGENTS_REPO}")
475
+ raise FileNotFoundError(error_msg)
476
 
477
+ if not os.path.exists(os.path.join(AGENTS_REPO_LOCAL_PATH, '.git')):
478
+ error_msg = f"{AGENTS_REPO_LOCAL_PATH} exists but is not a git repository"
479
+ print(f" ✗ {error_msg}")
480
+ raise ValueError(error_msg)
481
 
482
+ try:
483
+ import subprocess
484
+
485
+ # Run git pull with extended timeout due to large repository
486
+ result = subprocess.run(
487
+ ['git', 'pull'],
488
+ cwd=AGENTS_REPO_LOCAL_PATH,
489
+ capture_output=True,
490
+ text=True,
491
+ timeout=GIT_SYNC_TIMEOUT
492
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
 
494
+ if result.returncode == 0:
495
+ output = result.stdout.strip()
496
+ if "Already up to date" in output or "Already up-to-date" in output:
497
+ print(f" ✓ Repository is up to date")
498
+ else:
499
+ print(f" ✓ Repository synced successfully")
500
+ if output:
501
+ # Print first few lines of output
502
+ lines = output.split('\n')[:5]
503
+ for line in lines:
504
+ print(f" {line}")
505
  return True
506
+ else:
507
+ error_msg = f"Git pull failed: {result.stderr.strip()}"
508
+ print(f" ✗ {error_msg}")
509
+ raise RuntimeError(error_msg)
510
+
511
+ except subprocess.TimeoutExpired:
512
+ error_msg = f"Git pull timed out after {GIT_SYNC_TIMEOUT} seconds"
513
+ print(f" ✗ {error_msg}")
514
+ raise TimeoutError(error_msg)
515
+ except (FileNotFoundError, ValueError, RuntimeError, TimeoutError):
516
+ raise # Re-raise expected exceptions
517
  except Exception as e:
518
+ error_msg = f"Error syncing repository: {str(e)}"
519
+ print(f" ✗ {error_msg}")
520
+ raise RuntimeError(error_msg) from e
 
521
 
522
 
523
  def load_agents_from_hf():
524
  """
525
+ Load all agent metadata JSON files from local git repository.
526
+ ALWAYS syncs with remote first to ensure we have the latest bot data.
 
527
  """
528
+ # MANDATORY: Sync with remote first to get latest bot data
529
+ print(f" Syncing bot_data repository to get latest agents...")
530
+ sync_agents_repo() # Will raise exception if sync fails
531
 
532
+ agents = []
 
533
 
534
+ # Scan local directory for JSON files
535
+ if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
536
+ raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
537
 
538
+ # Walk through the directory to find all JSON files
539
+ files_processed = 0
540
+ print(f" Loading agent metadata from {AGENTS_REPO_LOCAL_PATH}...")
541
 
542
+ for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
543
+ # Skip .git directory
544
+ if '.git' in root:
545
+ continue
546
+
547
+ for filename in files:
548
+ if not filename.endswith('.json'):
549
+ continue
550
 
551
+ files_processed += 1
552
+ file_path = os.path.join(root, filename)
553
+
554
+ try:
555
+ with open(file_path, 'r', encoding='utf-8') as f:
556
  agent_data = json.load(f)
557
 
558
+ # Only include public agents
559
+ if agent_data.get('status') != 'public':
560
+ continue
561
 
562
+ # Extract github_identifier from filename
563
+ github_identifier = filename.replace('.json', '')
564
+ agent_data['github_identifier'] = github_identifier
565
 
566
+ agents.append(agent_data)
567
 
568
  except Exception as e:
569
+ print(f" Error loading {filename}: {str(e)}")
570
  continue
571
 
572
+ print(f" ✓ Loaded {len(agents)} public agents (from {files_processed} total files)")
573
+ return agents
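# Hypothetical example of a single agent record as assembled above (field names are
# inferred from the reads in this function; real files in the bot_data repo may carry
# additional keys). A file named 'example-bot[bot].json' would yield this entry:
_example_agent = {
    "name": "Example Bot",
    "website": "https://example.com",
    "status": "public",                        # non-public files are skipped
    "github_identifier": "example-bot[bot]",   # injected from the filename
}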
 
 
 
 
 
574
 
 
 
 
575
 
576
  def calculate_issue_stats_from_metadata(metadata_list):
577
+ """Calculate statistics from a list of issue metadata."""
 
 
 
 
 
578
  total_issues = len(metadata_list)
579
+ closed = sum(1 for issue_meta in metadata_list if issue_meta.get('closed_at'))
580
+ resolved = sum(1 for issue_meta in metadata_list
 
 
 
 
 
581
  if issue_meta.get('state_reason') == 'completed')
582
 
583
+ # Resolved rate = resolved / closed (not resolved / total)
584
+ resolved_rate = (resolved / closed * 100) if closed > 0 else 0
585
 
586
  return {
587
  'total_issues': total_issues,
588
+ 'closed_issues': closed,
589
+ 'resolved_issues': resolved,
590
  'resolved_rate': round(resolved_rate, 2),
591
  }
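# Worked sketch with three hypothetical issues: 2 closed, 1 of them resolved, 1 still
# open. The open issue is excluded from the denominator, so resolved_rate = 1/2 * 100.
_example_issues = [
    {'closed_at': '2025-01-20T11:05:00Z', 'state_reason': 'completed'},
    {'closed_at': '2025-01-21T09:00:00Z', 'state_reason': 'not_planned'},
    {'closed_at': None, 'state_reason': None},
]
assert calculate_issue_stats_from_metadata(_example_issues) == {
    'total_issues': 3, 'closed_issues': 2, 'resolved_issues': 1, 'resolved_rate': 50.0,
}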
592
 
593
 
594
+ def calculate_monthly_metrics_by_agent(all_metadata_dict, agents):
595
+ """Calculate monthly metrics for all agents for visualization."""
596
+ identifier_to_name = {agent.get('github_identifier'): agent.get('name') for agent in agents if agent.get('github_identifier')}
 
 
 
 
597
 
598
+ if not all_metadata_dict:
599
+ return {'agents': [], 'months': [], 'data': {}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
600
 
 
601
  agent_month_data = defaultdict(lambda: defaultdict(list))
602
 
603
+ for agent_identifier, metadata_list in all_metadata_dict.items():
 
 
604
  for issue_meta in metadata_list:
605
  created_at = issue_meta.get('created_at')
606
+
607
  if not created_at:
608
  continue
609
 
610
+ agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
611
+
612
  try:
613
  dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
614
  month_key = f"{dt.year}-{dt.month:02d}"
 
617
  print(f"Warning: Could not parse date '{created_at}': {e}")
618
  continue
619
 
 
620
  all_months = set()
621
  for agent_data in agent_month_data.values():
622
  all_months.update(agent_data.keys())
623
  months = sorted(list(all_months))
624
 
 
625
  result_data = {}
626
  for agent_name, month_dict in agent_month_data.items():
627
  resolved_rates = []
628
  total_issues_list = []
629
  resolved_issues_list = []
630
+ closed_issues_list = []
631
 
632
  for month in months:
633
  issues_in_month = month_dict.get(month, [])
634
 
635
+ resolved_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
636
+ closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at'))
 
 
 
 
 
637
  total_count = len(issues_in_month)
638
 
639
+ # Resolved rate = resolved / closed (not resolved / total)
640
+ resolved_rate = (resolved_count / closed_count * 100) if closed_count > 0 else None
641
 
642
  resolved_rates.append(resolved_rate)
643
  total_issues_list.append(total_count)
644
+ resolved_issues_list.append(resolved_count)
645
+ closed_issues_list.append(closed_count)
646
 
647
  result_data[agent_name] = {
648
  'resolved_rates': resolved_rates,
649
  'total_issues': total_issues_list,
650
+ 'resolved_issues': resolved_issues_list,
651
+ 'closed_issues': closed_issues_list
652
  }
653
 
654
  agents_list = sorted(list(agent_month_data.keys()))
 
660
  }
661
 
662
 
663
+ def construct_leaderboard_from_metadata(all_metadata_dict, agents):
664
+ """Construct leaderboard from in-memory issue metadata."""
665
+ if not agents:
666
+ print("Error: No agents found")
667
+ return {}
668
 
669
+ cache_dict = {}
 
 
670
 
671
+ for agent in agents:
672
+ identifier = agent.get('github_identifier')
673
+ agent_name = agent.get('name', 'Unknown')
674
+
675
+ bot_metadata = all_metadata_dict.get(identifier, [])
676
+ stats = calculate_issue_stats_from_metadata(bot_metadata)
677
+
678
+ cache_dict[identifier] = {
679
+ 'name': agent_name,
680
+ 'website': agent.get('website', 'N/A'),
681
+ 'github_identifier': identifier,
682
+ **stats
683
+ }
684
+
685
+ return cache_dict
686
 
687
+
688
+ def save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics):
689
+ """Save leaderboard data and monthly metrics to HuggingFace dataset."""
690
  try:
691
  token = get_hf_token()
692
  if not token:
693
  raise Exception("No HuggingFace token found")
694
 
695
  api = HfApi(token=token)
696
+ filename = "swe-issue.json"
697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698
  combined_data = {
699
+ 'last_updated': datetime.now(timezone.utc).isoformat(),
700
+ 'leaderboard': leaderboard_dict,
701
+ 'monthly_metrics': monthly_metrics,
702
+ 'metadata': {
703
+ 'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS
 
704
  }
705
  }
706
 
707
+ with open(filename, 'w') as f:
708
+ json.dump(combined_data, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
 
710
+ try:
711
+ upload_file_with_backoff(
712
+ api=api,
713
+ path_or_fileobj=filename,
714
+ path_in_repo=filename,
715
+ repo_id=LEADERBOARD_REPO,
716
+ repo_type="dataset"
717
+ )
718
+ return True
719
+ finally:
720
+ if os.path.exists(filename):
721
+ os.remove(filename)
722
 
723
  except Exception as e:
724
+ print(f"Error saving leaderboard data: {str(e)}")
725
  import traceback
726
  traceback.print_exc()
727
  return False
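# Shape of the swe-issue.json payload written above (all values hypothetical):
_example_payload = {
    "last_updated": "2025-01-19T00:00:00+00:00",
    "leaderboard": {
        "example-bot[bot]": {
            "name": "Example Bot", "website": "https://example.com",
            "github_identifier": "example-bot[bot]",
            "total_issues": 3, "closed_issues": 2,
            "resolved_issues": 1, "resolved_rate": 50.0,
        },
    },
    "monthly_metrics": {"agents": ["Example Bot"], "months": ["2025-01"], "data": {}},
    "metadata": {"leaderboard_time_frame_days": 180},
}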
728
 
729
 
730
  # =============================================================================
731
+ # MINING FUNCTION
732
  # =============================================================================
733
 
734
  def mine_all_agents():
735
  """
736
+ Mine issue metadata for all agents using STREAMING batch processing.
737
+ Downloads GHArchive data, then uses BATCH-based DuckDB queries.
738
  """
739
+ print(f"\n[1/4] Downloading GHArchive data...")
740
+
741
+ if not download_all_gharchive_data():
742
+ print("Warning: Download had errors, continuing with available data...")
743
+
744
+ print(f"\n[2/4] Loading agent metadata...")
745
+
746
  agents = load_agents_from_hf()
747
  if not agents:
748
+ print("Error: No agents found")
749
  return
750
 
 
751
  identifiers = [agent['github_identifier'] for agent in agents if agent.get('github_identifier')]
752
  if not identifiers:
753
+ print("Error: No valid agent identifiers found")
754
  return
755
 
756
+ print(f"\n[3/4] Mining issue metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
 
 
 
757
 
 
758
  try:
759
+ conn = get_duckdb_connection()
760
  except Exception as e:
761
+ print(f"Failed to initialize DuckDB connection: {str(e)}")
762
  return
763
 
 
764
  current_time = datetime.now(timezone.utc)
765
  end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
766
  start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
767
 
768
  try:
769
+ # USE STREAMING FUNCTION FOR ISSUES
770
+ all_metadata = fetch_all_issue_metadata_streaming(
771
+ conn, identifiers, start_date, end_date
 
772
  )
773
 
774
+ except Exception as e:
775
+ print(f"Error during DuckDB fetch: {str(e)}")
776
+ import traceback
777
+ traceback.print_exc()
778
+ return
779
+ finally:
780
+ conn.close()
781
+
782
+ print(f"\n[4/4] Saving leaderboard...")
783
 
784
+ try:
785
+ leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
786
+ monthly_metrics = calculate_monthly_metrics_by_agent(all_metadata, agents)
787
+ save_leaderboard_data_to_hf(leaderboard_dict, monthly_metrics)
 
 
788
 
789
  except Exception as e:
790
+ print(f"Error saving leaderboard: {str(e)}")
791
  import traceback
792
  traceback.print_exc()
 
793
 
794
+
795
+ # =============================================================================
796
+ # SCHEDULER SETUP
797
+ # =============================================================================
798
+
799
+ def setup_scheduler():
800
+ """Set up APScheduler to run mining jobs periodically."""
801
+ logging.basicConfig(
802
+ level=logging.INFO,
803
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
804
+ )
805
+
806
+ logging.getLogger('httpx').setLevel(logging.WARNING)
807
+
808
+ scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
809
+
810
+ trigger = CronTrigger(
811
+ day_of_week=SCHEDULE_DAY_OF_WEEK,
812
+ hour=SCHEDULE_HOUR,
813
+ minute=SCHEDULE_MINUTE,
814
+ timezone=SCHEDULE_TIMEZONE
815
+ )
816
+
817
+ scheduler.add_job(
818
+ mine_all_agents,
819
+ trigger=trigger,
820
+ id='mine_all_agents',
821
+ name='Mine GHArchive data for all agents',
822
+ replace_existing=True
823
+ )
824
+
825
+ from datetime import datetime
826
+ next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
827
+ print(f"Scheduler: Weekly on {SCHEDULE_DAY_OF_WEEK} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
828
+ print(f"Next run: {next_run}\n")
829
+
830
+ print(f"\nScheduler started")
831
+ scheduler.start()
832
 
833
 
834
  # =============================================================================
 
836
  # =============================================================================
837
 
838
  if __name__ == "__main__":
839
+ if SCHEDULE_ENABLED:
840
+ setup_scheduler()
841
+ else:
842
+ mine_all_agents()
requirements.txt CHANGED
@@ -1,12 +1,10 @@
1
  APScheduler
2
  backoff
3
- datasets
4
- db-dtypes
5
- google-cloud-bigquery
6
  gradio
7
  gradio_leaderboard
8
  huggingface_hub
9
  pandas
10
  plotly
11
- PyGithub
12
- python-dotenv
 
1
  APScheduler
2
  backoff
3
+ duckdb[all]
 
 
4
  gradio
5
  gradio_leaderboard
6
  huggingface_hub
7
  pandas
8
  plotly
9
+ python-dotenv
10
+ requests