refine msr
app.py
CHANGED
@@ -198,14 +198,52 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None
     print(f"Exceeded max retries for {url}")
     return None
 
+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token
 
 
+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def validate_github_username(identifier):
     """Verify that a GitHub identifier exists with backoff-aware requests."""
     try:
@@ -225,13 +263,18 @@ def validate_github_username(identifier):
         return False, f"Validation error: {str(e)}"
 
 
-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, debug_limit=None, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, debug_limit=None, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.
 
     Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
         debug_limit: If set, stops fetching after this many issues (for testing)
         depth: Current recursion depth (for tracking)
 
@@ -284,6 +327,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }
 
     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -331,7 +375,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(seconds=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count
 
@@ -352,7 +396,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(minutes=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count
 
@@ -373,7 +417,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(hours=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count
 
@@ -404,7 +448,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(days=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count
 
@@ -415,10 +459,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
 
     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, debug_limit, depth + 1
     )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, debug_limit, depth + 1
     )
 
     return count1 + count2
@@ -1235,13 +1279,13 @@ def save_agent_to_hf(data):
 # DATA MANAGEMENT
 # =============================================================================
 
-def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
+def fetch_new_issues_for_agent(agent_identifier, token_pool, query_patterns=None):
     """
     Fetch and save new issues for an agent from yesterday 12am UTC to today 12am UTC.
 
     Args:
         agent_identifier: GitHub identifier of the agent
-        token: GitHub token for API authentication
+        token_pool: TokenPool instance for rotating tokens
         query_patterns: List of query patterns to search (if None, uses default)
 
     Returns:
@@ -1260,8 +1304,6 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
     today_midnight = now_utc.replace(hour=0, minute=0, second=0, microsecond=0)
     yesterday_midnight = today_midnight - timedelta(days=1)
 
-    headers = {'Authorization': f'token {token}'} if token else {}
-
     print(f"\n 📥 Fetching new issues for {agent_identifier}...")
     print(f" Time range: {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}")
 
@@ -1276,7 +1318,7 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
         base_query,
         yesterday_midnight,
         today_midnight,
-        headers,
+        token_pool,
         issues_by_id,
         debug_limit=10 if DEBUG_MODE else None,
         depth=0
@@ -1316,8 +1358,12 @@ def update_all_agents_incremental():
     print(f"{'='*80}")
 
     try:
-        # Get GitHub token
-        token = get_github_token()
+        # Load all GitHub tokens and create token pool
+        tokens = get_github_tokens()
+        token_pool = TokenPool(tokens)
+
+        # Get first token for functions that still need single token
+        token = tokens[0] if tokens else None
 
         # Load agent metadata from HuggingFace
         agents = load_agents_from_hf()
@@ -1358,7 +1404,7 @@ def update_all_agents_incremental():
                 continue
 
             try:
-                new_count = fetch_new_issues_for_agent(identifier, token)
+                new_count = fetch_new_issues_for_agent(identifier, token_pool)
                 total_new_issues += new_count
             except Exception as e:
                 print(f" ⚠️ Error fetching new issues for {identifier}: {str(e)}")
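A minimal usage sketch of the rotation helpers added above (the environment values and the loop are hypothetical, and get_github_tokens, TokenPool, and request_with_backoff are assumed to be imported from app.py):

import os

# Hypothetical setup: get_github_tokens() picks up every env var whose
# name starts with GITHUB_TOKEN, e.g. GITHUB_TOKEN, GITHUB_TOKEN_2, ...
os.environ.setdefault('GITHUB_TOKEN', 'ghp_example_primary')
os.environ.setdefault('GITHUB_TOKEN_2', 'ghp_example_secondary')

pool = TokenPool(get_github_tokens())

# Each get_headers() call advances the round-robin index, so consecutive
# search requests alternate between the tokens' separate rate limits.
for _ in range(4):
    headers = pool.get_headers()  # {'Authorization': 'token ghp_example_...'}
    # response = request_with_backoff('GET', url, headers=headers, params=params)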
msr.py
CHANGED
@@ -52,14 +52,52 @@ def save_jsonl(filename, data):
         f.write(json.dumps(item) + '\n')
 
 
+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token
 
 
+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def get_hf_token():
     """Get HuggingFace token from environment variables."""
     token = os.getenv('HF_TOKEN')
@@ -144,12 +182,20 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=None
     return None
 
 
-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.
 
+    Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
+        depth: Current recursion depth
+
     Returns the number of issues found in this time partition.
     """
     # Calculate time difference
@@ -195,6 +241,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }
 
     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -242,7 +289,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(seconds=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count
 
@@ -263,7 +310,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(minutes=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count
 
@@ -284,7 +331,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(hours=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count
 
@@ -315,7 +362,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
                 split_start = split_start + timedelta(days=1)
 
             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count
 
@@ -326,10 +373,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
 
     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, depth + 1
    )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, depth + 1
     )
 
     return count1 + count2
@@ -374,7 +421,7 @@ def extract_issue_metadata(issue):
     }
 
 
-def fetch_all_issues_metadata(identifier, agent_name, token=None):
+def fetch_all_issues_metadata(identifier, agent_name, token_pool):
     """
     Fetch issues associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
     Returns lightweight metadata instead of full issue objects.
@@ -387,12 +434,11 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
-        token: GitHub token for API authentication
+        token_pool: TokenPool instance for rotating tokens
 
     Returns:
         List of dictionaries containing minimal issue metadata
     """
-    headers = {'Authorization': f'token {token}'} if token else {}
 
     # Define query patterns for issues:
     # 1) author pattern: issues authored by the identifier
@@ -425,7 +471,7 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
             query_pattern,
             start_date,
             end_date,
-            headers,
+            token_pool,
             issues_by_id
         )
 
@@ -633,7 +679,9 @@ def mine_all_agents():
     """
     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
     """
-    token = get_github_token()
+    # Load all GitHub tokens and create token pool
+    tokens = get_github_tokens()
+    token_pool = TokenPool(tokens)
 
     # Load agent metadata from HuggingFace
     agents = load_agents_from_hf()
@@ -661,7 +709,7 @@ def mine_all_agents():
         print(f"{'='*80}")
 
         # Fetch issue metadata
-        metadata = fetch_all_issues_metadata(identifier, agent_name, token)
+        metadata = fetch_all_issues_metadata(identifier, agent_name, token_pool)
 
         if metadata:
             print(f"💾 Saving {len(metadata)} issue records...")
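As background for the partitioning that token_pool now threads through in both files: the GitHub Search API returns at most 1000 results per query, so fetch_issues_with_time_partition keeps narrowing the created: window until every slice fits under that cap. A standalone sketch of the core idea, reduced to binary halving only (the real function also steps day/hour/minute/second boundaries and records results in issues_by_id; count_results is a hypothetical stand-in for one search call returning total_count):

from datetime import datetime, timedelta

MAX_RESULTS = 1000  # GitHub Search API cap per query

def partition_windows(start, end, count_results):
    # Yield (start, end) windows whose result counts stay under the cap.
    if count_results(start, end) < MAX_RESULTS or start >= end:
        yield (start, end)
        return
    mid = start + (end - start) / 2
    yield from partition_windows(start, mid, count_results)
    yield from partition_windows(mid + timedelta(seconds=1), end, count_results)

# Toy run: at a uniform 100 issues/day, a 30-day window (~3000 results)
# splits into four slices of roughly 7.5 days, each under 1000 results.
density = lambda s, e: int((e - s).total_seconds() / 86400 * 100)
windows = list(partition_windows(datetime(2024, 1, 1), datetime(2024, 1, 31), density))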