zhiminy committed on
Commit dbebb52 · 1 Parent(s): d8e431a
Files changed (1)
  1. msr.py +689 -0
msr.py ADDED
@@ -0,0 +1,689 @@
+ """
+ Minimalist Issue Metadata Mining Script
+ Mines issue metadata from GitHub and saves it to a HuggingFace dataset.
+ """
+
+ import json
+ import os
+ import time
+ import requests
+ from datetime import datetime, timezone, timedelta
+ from collections import defaultdict
+ from huggingface_hub import HfApi, hf_hub_download
+ from dotenv import load_dotenv
+ import random
+
+ # Load environment variables
+ load_dotenv()
+
+ # =============================================================================
+ # CONFIGURATION
+ # =============================================================================
+
+ AGENTS_REPO = "SWE-Arena/swe_agents"
+ ISSUE_METADATA_REPO = "SWE-Arena/issue_metadata"
+ LEADERBOARD_TIME_FRAME_DAYS = 180  # 6 months
+
+ # =============================================================================
+ # UTILITY FUNCTIONS
+ # =============================================================================
+
+ def load_jsonl(filename):
+     """Load JSONL file and return list of dictionaries."""
+     if not os.path.exists(filename):
+         return []
+
+     data = []
+     with open(filename, 'r', encoding='utf-8') as f:
+         for line in f:
+             line = line.strip()
+             if line:
+                 try:
+                     data.append(json.loads(line))
+                 except json.JSONDecodeError as e:
+                     print(f"Warning: Skipping invalid JSON line: {e}")
+     return data
+
+
+ def save_jsonl(filename, data):
+     """Save list of dictionaries to JSONL file."""
+     with open(filename, 'w', encoding='utf-8') as f:
+         for item in data:
+             f.write(json.dumps(item) + '\n')
+
+
+ def get_github_token():
+     """Get GitHub token from environment variables."""
+     token = os.getenv('GITHUB_TOKEN')
+     if not token:
+         print("Warning: GITHUB_TOKEN not found. Unauthenticated API rate limit is 60 requests/hour (authenticated: 5000/hour)")
+     return token
+
+
+ def get_hf_token():
+     """Get HuggingFace token from environment variables."""
+     token = os.getenv('HF_TOKEN')
+     if not token:
+         print("Warning: HF_TOKEN not found in environment variables")
+     return token
+
+
+ # =============================================================================
+ # GITHUB API FUNCTIONS
+ # =============================================================================
+
+ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None, data=None, max_retries=10, timeout=30):
+     """
+     Perform an HTTP request with exponential backoff and jitter for the GitHub API.
+     Retries on 403/429 (rate limits), 5xx server errors, and transient network exceptions.
+     """
+     delay = 1.0
+     for attempt in range(max_retries):
+         try:
+             resp = requests.request(
+                 method,
+                 url,
+                 headers=headers or {},
+                 params=params,
+                 json=json_body,
+                 data=data,
+                 timeout=timeout
+             )
+
+             status = resp.status_code
+
+             # Success
+             if 200 <= status < 300:
+                 return resp
+
+             # Rate limits or server errors -> retry with backoff
+             if status in (403, 429) or 500 <= status < 600:
+                 wait = None
+
+                 # Prefer Retry-After when present
+                 retry_after = resp.headers.get('Retry-After') or resp.headers.get('retry-after')
+                 if retry_after:
+                     try:
+                         wait = float(retry_after)
+                     except Exception:
+                         wait = None
+
+                 # Fall back to X-RateLimit-Reset on 403/429
+                 if wait is None and status in (403, 429):
+                     reset_hdr = resp.headers.get('X-RateLimit-Reset') or resp.headers.get('x-ratelimit-reset')
+                     if reset_hdr:
+                         try:
+                             reset_ts = int(float(reset_hdr))
+                             wait = max(reset_ts - time.time() + 2, 1)
+                         except Exception:
+                             wait = None
+
+                 # Final fallback: exponential backoff with jitter
+                 if wait is None:
+                     wait = delay + random.uniform(0, 0.5)
+
+                 # Cap individual wait to avoid extreme sleeps
+                 wait = max(1.0, min(wait, 120.0))
+                 print(f"GitHub API {status}. Backing off {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
+                 time.sleep(wait)
+                 delay = min(delay * 2, 60.0)
+                 continue
+
+             # Non-retryable error; return response for caller to handle
+             return resp
+
+         except requests.RequestException as e:
+             # Network error -> retry with backoff
+             wait = delay + random.uniform(0, 0.5)
+             wait = max(1.0, min(wait, 60.0))
+             print(f"Request error: {e}. Retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
+             time.sleep(wait)
+             delay = min(delay * 2, 60.0)
+
+     print(f"Exceeded max retries for {url}")
+     return None
+
+
+ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, depth=0):
+     """
+     Fetch issues within a specific time range using time-based partitioning.
+     Recursively splits the time range if hitting the 1000-result limit.
+     Supports splitting by day, hour, minute, and second as needed.
+
+     Returns the number of issues found in this time partition.
+     """
+     # Calculate time difference
+     time_diff = end_date - start_date
+     total_seconds = time_diff.total_seconds()
+
+     # Determine granularity and format dates accordingly
+     if total_seconds >= 86400:  # >= 1 day
+         # Use day granularity (YYYY-MM-DD)
+         start_str = start_date.strftime('%Y-%m-%d')
+         end_str = end_date.strftime('%Y-%m-%d')
+     elif total_seconds >= 3600:  # >= 1 hour but < 1 day
+         # Use hour granularity (YYYY-MM-DDTHH:MM:SSZ)
+         start_str = start_date.strftime('%Y-%m-%dT%H:00:00Z')
+         end_str = end_date.strftime('%Y-%m-%dT%H:59:59Z')
+     elif total_seconds >= 60:  # >= 1 minute but < 1 hour
+         # Use minute granularity (YYYY-MM-DDTHH:MM:SSZ)
+         start_str = start_date.strftime('%Y-%m-%dT%H:%M:00Z')
+         end_str = end_date.strftime('%Y-%m-%dT%H:%M:59Z')
+     else:  # < 1 minute
+         # Use second granularity (YYYY-MM-DDTHH:MM:SSZ)
+         start_str = start_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+         end_str = end_date.strftime('%Y-%m-%dT%H:%M:%SZ')
+
+     # Add date range to query
+     query = f'{base_query} created:{start_str}..{end_str}'
+
+     indent = " " + " " * depth
+     print(f"{indent}Searching range {start_str} to {end_str}...")
+
+     page = 1
+     per_page = 100
+     total_in_partition = 0
+
+     while True:
+         url = 'https://api.github.com/search/issues'
+         params = {
+             'q': query,
+             'per_page': per_page,
+             'page': page,
+             'sort': 'created',
+             'order': 'asc'
+         }
+
+         try:
+             response = request_with_backoff('GET', url, headers=headers, params=params)
+             if response is None:
+                 print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
+                 return total_in_partition
+
+             if response.status_code != 200:
+                 print(f"{indent} Error: HTTP {response.status_code} for range {start_str} to {end_str}")
+                 return total_in_partition
+
+             data = response.json()
+             total_count = data.get('total_count', 0)
+             items = data.get('items', [])
+
+             if not items:
+                 break
+
+             # Add issues to global dict
+             for issue in items:
+                 issue_id = issue.get('id')
+                 if issue_id and issue_id not in issues_by_id:
+                     issues_by_id[issue_id] = issue
+                     total_in_partition += 1
+
+             # Check if we hit the 1000-result limit
+             if total_count > 1000 and page == 10:
+                 print(f"{indent} ⚠️ Hit 1000-result limit ({total_count} total). Splitting time range...")
+
+                 # Determine how to split based on time range duration
+                 if total_seconds < 2:  # Less than 2 seconds - can't split further
+                     print(f"{indent} ⚠️ Cannot split further (range < 2 seconds). Some results may be missing.")
+                     break
+
+                 elif total_seconds < 120:  # Less than 2 minutes - split by seconds
+                     # Split into 2-4 parts depending on range
+                     num_splits = min(4, max(2, int(total_seconds / 30)))
+                     split_duration = time_diff / num_splits
+                     split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                     total_from_splits = 0
+                     for i in range(num_splits):
+                         split_start = split_dates[i]
+                         split_end = split_dates[i + 1]
+                         # Avoid overlapping ranges (add 1 second to start)
+                         if i > 0:
+                             split_start = split_start + timedelta(seconds=1)
+
+                         count = fetch_issues_with_time_partition(
+                             base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                         )
+                         total_from_splits += count
+
+                     return total_from_splits
+
+                 elif total_seconds < 7200:  # Less than 2 hours - split by minutes
+                     # Split into 2-4 parts
+                     num_splits = min(4, max(2, int(total_seconds / 1800)))
+                     split_duration = time_diff / num_splits
+                     split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                     total_from_splits = 0
+                     for i in range(num_splits):
+                         split_start = split_dates[i]
+                         split_end = split_dates[i + 1]
+                         # Avoid overlapping ranges (add 1 minute to start)
+                         if i > 0:
+                             split_start = split_start + timedelta(minutes=1)
+
+                         count = fetch_issues_with_time_partition(
+                             base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                         )
+                         total_from_splits += count
+
+                     return total_from_splits
+
+                 elif total_seconds < 172800:  # Less than 2 days - split by hours
+                     # Split into 2-4 parts
+                     num_splits = min(4, max(2, int(total_seconds / 43200)))
+                     split_duration = time_diff / num_splits
+                     split_dates = [start_date + split_duration * i for i in range(num_splits + 1)]
+
+                     total_from_splits = 0
+                     for i in range(num_splits):
+                         split_start = split_dates[i]
+                         split_end = split_dates[i + 1]
+                         # Avoid overlapping ranges (add 1 hour to start)
+                         if i > 0:
+                             split_start = split_start + timedelta(hours=1)
+
+                         count = fetch_issues_with_time_partition(
+                             base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                         )
+                         total_from_splits += count
+
+                     return total_from_splits
+
+                 else:  # 2+ days - split by days
+                     days_diff = time_diff.days
+
+                     # Use aggressive splitting for large ranges or deep recursion
+                     # Split into 4 parts if range is > 30 days, otherwise split in half
+                     if days_diff > 30 or depth > 5:
+                         # Split into 4 parts for more aggressive partitioning
+                         quarter_diff = time_diff / 4
+                         split_dates = [
+                             start_date,
+                             start_date + quarter_diff,
+                             start_date + quarter_diff * 2,
+                             start_date + quarter_diff * 3,
+                             end_date
+                         ]
+
+                         total_from_splits = 0
+                         for i in range(4):
+                             split_start = split_dates[i]
+                             split_end = split_dates[i + 1]
+                             # Avoid overlapping ranges
+                             if i > 0:
+                                 split_start = split_start + timedelta(days=1)
+
+                             count = fetch_issues_with_time_partition(
+                                 base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                             )
+                             total_from_splits += count
+
+                         return total_from_splits
+                     else:
+                         # Binary split for smaller ranges
+                         mid_date = start_date + time_diff / 2
+
+                         # Recursively fetch both halves
+                         count1 = fetch_issues_with_time_partition(
+                             base_query, start_date, mid_date, headers, issues_by_id, depth + 1
+                         )
+                         count2 = fetch_issues_with_time_partition(
+                             base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, depth + 1
+                         )
+
+                         return count1 + count2
+
+             # Normal pagination: check if there are more pages
+             if len(items) < per_page or page >= 10:
+                 break
+
+             page += 1
+             time.sleep(0.5)  # Courtesy delay between pages
+
+         except Exception as e:
+             print(f"{indent} Error fetching range {start_str} to {end_str}: {str(e)}")
+             return total_in_partition
+
+     if total_in_partition > 0:
+         print(f"{indent} ✓ Found {total_in_partition} issues in range {start_str} to {end_str}")
+
+     return total_in_partition
+
+
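
For reference, a minimal sketch of the `created:` qualifiers the partitioning above builds at two of its granularities; the bot identifier and dates are hypothetical, chosen only for illustration:

# Illustrative only -- not part of msr.py. Shows the query strings the function
# above would build for a day-granularity and an hour-granularity partition.
from datetime import datetime, timezone, timedelta

base_query = 'is:issue author:example-bot'   # hypothetical identifier
start = datetime(2025, 1, 1, tzinfo=timezone.utc)

end = start + timedelta(days=90)             # >= 1 day -> day granularity
print(f"{base_query} created:{start.strftime('%Y-%m-%d')}..{end.strftime('%Y-%m-%d')}")
# is:issue author:example-bot created:2025-01-01..2025-04-01

end = start + timedelta(hours=6)             # >= 1 hour, < 1 day -> hour granularity
print(f"{base_query} created:{start.strftime('%Y-%m-%dT%H:00:00Z')}..{end.strftime('%Y-%m-%dT%H:59:59Z')}")
# is:issue author:example-bot created:2025-01-01T00:00:00Z..2025-01-01T06:59:59Z
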
+ def extract_issue_metadata(issue):
+     """
+     Extract minimal issue metadata for efficient storage.
+     Only keeps essential fields: html_url, created_at, closed_at, state, state_reason.
+
+     Issue states:
+     - state: "open" or "closed"
+     - state_reason: "completed" (resolved), "not_planned" (closed as not planned), or None (still open)
+     """
+     created_at = issue.get('created_at')
+     closed_at = issue.get('closed_at')
+     state = issue.get('state')
+     state_reason = issue.get('state_reason')
+
+     return {
+         'html_url': issue.get('html_url'),
+         'created_at': created_at,
+         'closed_at': closed_at,
+         'state': state,
+         'state_reason': state_reason
+     }
+
+
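
For reference, extract_issue_metadata reduces each full GitHub issue object to the five fields above; one stored record would look roughly like this (all values are illustrative):

# Illustrative only -- not part of msr.py. Example of one stored record;
# the URL and timestamps are made up.
example_record = {
    'html_url': 'https://github.com/example-org/example-repo/issues/42',
    'created_at': '2025-01-01T10:15:00Z',
    'closed_at': '2025-01-03T08:00:00Z',
    'state': 'closed',
    'state_reason': 'completed',
}
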
+ def fetch_all_issues_metadata(identifier, agent_name, token=None):
+     """
+     Fetch issues associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
+     Returns lightweight metadata instead of full issue objects.
+
+     This function employs time-based partitioning to navigate GitHub's 1000-result limit per query.
+     It searches using multiple query patterns:
+     - is:issue author:{identifier} (issues authored by the bot)
+     - is:issue assignee:{identifier} (issues assigned to the bot)
+     - is:issue assignee:{identifier without the [bot] suffix}
+
+     Args:
+         identifier: GitHub username or bot identifier
+         agent_name: Human-readable name of the agent for metadata purposes
+         token: GitHub API token for authentication
+
+     Returns:
+         List of dictionaries containing minimal issue metadata
+     """
+     headers = {'Authorization': f'token {token}'} if token else {}
+
+     # Define query patterns for issues:
+     # 1) author pattern: issues authored by the identifier
+     # 2) assignee pattern: issues assigned to the identifier (with and without the [bot] suffix)
+     stripped_id = identifier.replace('[bot]', '')
+     query_patterns = []
+
+     # Always add the author and assignee patterns
+     query_patterns.append(f'is:issue author:{identifier}')
+     query_patterns.append(f'is:issue assignee:{identifier}')
+     query_patterns.append(f'is:issue assignee:{stripped_id}')
+
+     # Use a dict to deduplicate issues by ID
+     issues_by_id = {}
+
+     # Define time range: past LEADERBOARD_TIME_FRAME_DAYS
+     current_time = datetime.now(timezone.utc)
+     start_date = current_time - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
+     end_date = current_time
+
+     for query_pattern in query_patterns:
+         print(f"\n🔍 Searching with query: {query_pattern}")
+         print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
+
+         pattern_start_time = time.time()
+         initial_count = len(issues_by_id)
+
+         # Fetch with time partitioning
+         issues_found = fetch_issues_with_time_partition(
+             query_pattern,
+             start_date,
+             end_date,
+             headers,
+             issues_by_id
+         )
+
+         pattern_duration = time.time() - pattern_start_time
+         new_issues = len(issues_by_id) - initial_count
+
+         print(f" ✓ Pattern complete: {new_issues} new unique issues added ({issues_found} fetched by this pattern)")
+         print(f" ⏱️ Time taken: {pattern_duration:.1f} seconds")
+
+         time.sleep(1.0)
+
+     all_issues = list(issues_by_id.values())
+
+     print(f"\n✅ COMPLETE: Found {len(all_issues)} unique issues for {identifier}")
+     print(f"📦 Extracting minimal metadata...")
+
+     metadata_list = [extract_issue_metadata(issue) for issue in all_issues]
+
+     # Calculate memory savings
+     import sys
+     original_size = sys.getsizeof(str(all_issues))
+     metadata_size = sys.getsizeof(str(metadata_list))
+     savings_pct = ((original_size - metadata_size) / original_size * 100) if original_size > 0 else 0
+
+     print(f"💾 Memory efficiency: {original_size // 1024}KB → {metadata_size // 1024}KB (saved {savings_pct:.1f}%)")
+
+     return metadata_list
+
+
+ # =============================================================================
+ # HUGGINGFACE STORAGE FUNCTIONS
+ # =============================================================================
+
+ def group_metadata_by_date(metadata_list):
+     """
+     Group issue metadata by exact date (year.month.day) for efficient daily storage.
+     Returns dict: {(year, month, day): [metadata_list]}
+     """
+     grouped = defaultdict(list)
+
+     for issue_meta in metadata_list:
+         created_at = issue_meta.get('created_at')
+         if not created_at:
+             continue
+
+         try:
+             dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
+             key = (dt.year, dt.month, dt.day)
+             grouped[key].append(issue_meta)
+         except Exception as e:
+             print(f"Warning: Could not parse date '{created_at}': {e}")
+
+     return dict(grouped)
+
+
+ def upload_with_retry(api, path_or_fileobj, path_in_repo, repo_id, repo_type, token, max_retries=5):
+     """
+     Upload file to HuggingFace with exponential backoff retry logic.
+     """
+     delay = 2.0
+
+     for attempt in range(max_retries):
+         try:
+             api.upload_file(
+                 path_or_fileobj=path_or_fileobj,
+                 path_in_repo=path_in_repo,
+                 repo_id=repo_id,
+                 repo_type=repo_type,
+                 token=token
+             )
+             if attempt > 0:
+                 print(f" ✓ Upload succeeded on attempt {attempt + 1}/{max_retries}")
+             return True
+
+         except Exception as e:
+             if attempt < max_retries - 1:
+                 wait_time = delay + random.uniform(0, 1.0)
+                 print(f" ⚠️ Upload failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
+                 print(f" ⏳ Retrying in {wait_time:.1f} seconds...")
+                 time.sleep(wait_time)
+                 delay = min(delay * 2, 60.0)
+             else:
+                 print(f" ✗ Upload failed after {max_retries} attempts: {str(e)}")
+                 raise
+
+
+ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
+     """
+     Save issue metadata to HuggingFace dataset, organized by [agent_identifier]/YYYY.MM.DD.jsonl.
+     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
+
+     This function APPENDS new metadata and DEDUPLICATES by html_url.
+
+     Args:
+         metadata_list: List of issue metadata dictionaries
+         agent_identifier: GitHub identifier of the agent (used as folder name)
+     """
+     try:
+         token = get_hf_token()
+         if not token:
+             raise Exception("No HuggingFace token found")
+
+         api = HfApi()
+
+         # Group by exact date (year, month, day)
+         grouped = group_metadata_by_date(metadata_list)
+
+         for (issue_year, month, day), day_metadata in grouped.items():
+             filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
+             local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
+             print(f"📤 Uploading {len(day_metadata)} issues to {filename}...")
+
+             # Download existing file if it exists
+             existing_metadata = []
+             try:
+                 file_path = hf_hub_download(
+                     repo_id=ISSUE_METADATA_REPO,
+                     filename=filename,
+                     repo_type="dataset",
+                     token=token
+                 )
+                 existing_metadata = load_jsonl(file_path)
+                 print(f" Found {len(existing_metadata)} existing issues in {filename}")
+             except Exception:
+                 print(f" No existing file found for {filename}, creating new")
+
+             # Merge and deduplicate by html_url
+             existing_by_url = {meta['html_url']: meta for meta in existing_metadata if meta.get('html_url')}
+             new_by_url = {meta['html_url']: meta for meta in day_metadata if meta.get('html_url')}
+
+             # Update with new data (new data overwrites old)
+             existing_by_url.update(new_by_url)
+             merged_metadata = list(existing_by_url.values())
+
+             # Save locally
+             save_jsonl(local_filename, merged_metadata)
+
+             try:
+                 # Upload to HuggingFace with folder path
+                 upload_with_retry(
+                     api=api,
+                     path_or_fileobj=local_filename,
+                     path_in_repo=filename,
+                     repo_id=ISSUE_METADATA_REPO,
+                     repo_type="dataset",
+                     token=token
+                 )
+                 print(f" ✓ Saved {len(merged_metadata)} total issues to {filename}")
+             finally:
+                 # Always clean up local file, even if upload fails
+                 if os.path.exists(local_filename):
+                     os.remove(local_filename)
+
+         return True
+
+     except Exception as e:
+         print(f"✗ Error saving issue metadata: {str(e)}")
+         return False
+
+
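
A minimal sketch of reading one day's records back from the dataset layout written above, assuming a hypothetical agent folder and date:

# Illustrative only -- not part of msr.py. Path components are hypothetical.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="SWE-Arena/issue_metadata",
    filename="example-bot[bot]/2025.01.01.jsonl",   # hypothetical agent folder and date
    repo_type="dataset",
)
with open(path, 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]
print(f"Loaded {len(records)} issue records")
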
+ def load_agents_from_hf():
+     """Load all agent metadata JSON files from HuggingFace dataset."""
+     try:
+         api = HfApi()
+         agents = []
+
+         # List all files in the repository
+         files = api.list_repo_files(repo_id=AGENTS_REPO, repo_type="dataset")
+
+         # Filter for JSON files only
+         json_files = [f for f in files if f.endswith('.json')]
+
+         print(f"Found {len(json_files)} agent files in {AGENTS_REPO}")
+
+         # Download and parse each JSON file
+         for json_file in json_files:
+             try:
+                 file_path = hf_hub_download(
+                     repo_id=AGENTS_REPO,
+                     filename=json_file,
+                     repo_type="dataset"
+                 )
+
+                 with open(file_path, 'r') as f:
+                     agent_data = json.load(f)
+                     agents.append(agent_data)
+
+             except Exception as e:
+                 print(f"Warning: Could not load {json_file}: {str(e)}")
+                 continue
+
+         print(f"✓ Loaded {len(agents)} agents from HuggingFace")
+         return agents
+
+     except Exception as e:
+         print(f"Could not load agents from HuggingFace: {str(e)}")
+         return []
+
+
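
For reference, mine_all_agents() below relies on only two keys from each agent JSON file in SWE-Arena/swe_agents; a sketch of such a record with hypothetical values (any other fields in those files are ignored by this script):

# Illustrative only -- not part of msr.py. Hypothetical agent entry.
example_agent = {
    "github_identifier": "example-bot[bot]",   # GitHub login the searches run against
    "agent_name": "Example Bot",               # human-readable name used in logs
}
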
+ # =============================================================================
+ # MAIN MINING FUNCTION
+ # =============================================================================
+
+ def mine_all_agents():
+     """
+     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
+     """
+     token = get_github_token()
+
+     # Load agent metadata from HuggingFace
+     agents = load_agents_from_hf()
+     if not agents:
+         print("No agents found in HuggingFace dataset")
+         return
+
+     print(f"\n{'='*80}")
+     print(f"Starting issue metadata mining for {len(agents)} agents")
+     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
+     print(f"{'='*80}\n")
+
+     # Mine each agent
+     for agent in agents:
+         identifier = agent.get('github_identifier')
+         agent_name = agent.get('agent_name', 'Unknown')
+
+         if not identifier:
+             print(f"Warning: Skipping agent without identifier: {agent}")
+             continue
+
+         try:
+             print(f"\n{'='*80}")
+             print(f"Processing: {agent_name} ({identifier})")
+             print(f"{'='*80}")
+
+             # Fetch issue metadata
+             metadata = fetch_all_issues_metadata(identifier, agent_name, token)
+
+             if metadata:
+                 print(f"💾 Saving {len(metadata)} issue records...")
+                 save_issue_metadata_to_hf(metadata, identifier)
+                 print(f"✓ Successfully processed {agent_name}")
+             else:
+                 print(f" No issues found for {agent_name}")
+
+         except Exception as e:
+             print(f"✗ Error processing {identifier}: {str(e)}")
+             import traceback
+             traceback.print_exc()
+             continue
+
+     print(f"\n{'='*80}")
+     print(f"✅ Mining complete for all agents")
+     print(f"{'='*80}\n")
+
+
+ # =============================================================================
+ # ENTRY POINT
+ # =============================================================================
+
+ if __name__ == "__main__":
+     mine_all_agents()
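
Finally, a hedged usage sketch: with GITHUB_TOKEN and HF_TOKEN available in the environment (or a local .env file), the script runs end to end via `python msr.py`; its functions can also be driven for a single agent, as below (the identifier is hypothetical):

# Illustrative only -- not part of msr.py. Mines one hypothetical agent instead of
# the full roster loaded from SWE-Arena/swe_agents.
from msr import fetch_all_issues_metadata, save_issue_metadata_to_hf, get_github_token

token = get_github_token()
metadata = fetch_all_issues_metadata("example-bot[bot]", "Example Bot", token)
if metadata:
    save_issue_metadata_to_hf(metadata, "example-bot[bot]")
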