zhiminy committed
Commit 42f17bd · 1 Parent(s): b5cbf74

refine msr

Files changed (2)
  1. app.py +62 -16
  2. msr.py +62 -14
app.py CHANGED
@@ -198,14 +198,52 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=No
     print(f"Exceeded max retries for {url}")
     return None

+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token


+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def validate_github_username(identifier):
     """Verify that a GitHub identifier exists with backoff-aware requests."""
     try:
@@ -225,13 +263,18 @@ def validate_github_username(identifier):
         return False, f"Validation error: {str(e)}"


-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, debug_limit=None, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, debug_limit=None, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.

     Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
         debug_limit: If set, stops fetching after this many issues (for testing)
         depth: Current recursion depth (for tracking)

@@ -284,6 +327,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }

     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -331,7 +375,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(seconds=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -352,7 +396,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(minutes=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -373,7 +417,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(hours=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -404,7 +448,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(days=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -415,10 +459,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,

     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, debug_limit, depth + 1
     )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, debug_limit, depth + 1
     )

     return count1 + count2
@@ -1235,13 +1279,13 @@ def save_agent_to_hf(data):
 # DATA MANAGEMENT
 # =============================================================================

-def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
+def fetch_new_issues_for_agent(agent_identifier, token_pool, query_patterns=None):
     """
     Fetch and save new issues for an agent from yesterday 12am UTC to today 12am UTC.

     Args:
         agent_identifier: GitHub identifier of the agent
-        token: GitHub API token
+        token_pool: TokenPool instance for rotating tokens
         query_patterns: List of query patterns to search (if None, uses default)

     Returns:
@@ -1260,8 +1304,6 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
     today_midnight = now_utc.replace(hour=0, minute=0, second=0, microsecond=0)
     yesterday_midnight = today_midnight - timedelta(days=1)

-    headers = {'Authorization': f'token {token}'} if token else {}
-
     print(f"\n 📥 Fetching new issues for {agent_identifier}...")
     print(f" Time range: {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}")
@@ -1276,7 +1318,7 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
         base_query,
         yesterday_midnight,
         today_midnight,
-        headers,
+        token_pool,
         issues_by_id,
         debug_limit=10 if DEBUG_MODE else None,
         depth=0
@@ -1316,8 +1358,12 @@ def update_all_agents_incremental():
     print(f"{'='*80}")

     try:
-        # Get GitHub token
-        token = get_github_token()
+        # Load all GitHub tokens and create token pool
+        tokens = get_github_tokens()
+        token_pool = TokenPool(tokens)
+
+        # Get first token for functions that still need single token
+        token = tokens[0] if tokens else None

         # Load agent metadata from HuggingFace
         agents = load_agents_from_hf()
@@ -1358,7 +1404,7 @@ def update_all_agents_incremental():
             continue

         try:
-            new_count = fetch_new_issues_for_agent(identifier, token)
+            new_count = fetch_new_issues_for_agent(identifier, token_pool)
             total_new_issues += new_count
         except Exception as e:
             print(f" ⚠️ Error fetching new issues for {identifier}: {str(e)}")
msr.py CHANGED
@@ -52,14 +52,52 @@ def save_jsonl(filename, data):
         f.write(json.dumps(item) + '\n')


+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token


+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def get_hf_token():
     """Get HuggingFace token from environment variables."""
     token = os.getenv('HF_TOKEN')
@@ -144,12 +182,20 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=No
     return None


-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.

+    Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
+        depth: Current recursion depth
+
     Returns the number of issues found in this time partition.
     """
     # Calculate time difference
@@ -195,6 +241,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }

     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -242,7 +289,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(seconds=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -263,7 +310,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(minutes=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -284,7 +331,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(hours=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -315,7 +362,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(days=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -326,10 +373,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,

     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, depth + 1
     )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, depth + 1
     )

     return count1 + count2
@@ -374,7 +421,7 @@ def extract_issue_metadata(issue):
     }


-def fetch_all_issues_metadata(identifier, agent_name, token=None):
+def fetch_all_issues_metadata(identifier, agent_name, token_pool):
     """
     Fetch issues associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
     Returns lightweight metadata instead of full issue objects.
@@ -387,12 +434,11 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
-        token: GitHub API token for authentication
+        token_pool: TokenPool instance for rotating tokens

     Returns:
         List of dictionaries containing minimal issue metadata
     """
-    headers = {'Authorization': f'token {token}'} if token else {}

     # Define query patterns for issues:
     # 1) author pattern: issues authored by the identifier
@@ -425,7 +471,7 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
             query_pattern,
             start_date,
             end_date,
-            headers,
+            token_pool,
             issues_by_id
         )

@@ -633,7 +679,9 @@ def mine_all_agents():
     """
     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
     """
-    token = get_github_token()
+    # Load all GitHub tokens and create token pool
+    tokens = get_github_tokens()
+    token_pool = TokenPool(tokens)

     # Load agent metadata from HuggingFace
     agents = load_agents_from_hf()
@@ -661,7 +709,7 @@ def mine_all_agents():
     print(f"{'='*80}")

     # Fetch issue metadata
-    metadata = fetch_all_issues_metadata(identifier, agent_name, token)
+    metadata = fetch_all_issues_metadata(identifier, agent_name, token_pool)

     if metadata:
         print(f"💾 Saving {len(metadata)} issue records...")