zhiminy committed
Commit 42f17bd · 1 Parent(s): b5cbf74

refine msr

Files changed (2)
  1. app.py +62 -16
  2. msr.py +62 -14
app.py CHANGED
@@ -198,14 +198,52 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=No
     print(f"Exceeded max retries for {url}")
     return None

+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token


+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def validate_github_username(identifier):
     """Verify that a GitHub identifier exists with backoff-aware requests."""
     try:
@@ -225,13 +263,18 @@ def validate_github_username(identifier):
         return False, f"Validation error: {str(e)}"


-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, debug_limit=None, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, debug_limit=None, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.

     Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
         debug_limit: If set, stops fetching after this many issues (for testing)
         depth: Current recursion depth (for tracking)

@@ -284,6 +327,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }

     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -331,7 +375,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(seconds=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -352,7 +396,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(minutes=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -373,7 +417,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(hours=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -404,7 +448,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(days=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, debug_limit, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, debug_limit, depth + 1
             )
             total_from_splits += count

@@ -415,10 +459,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,

     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, debug_limit, depth + 1
     )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, debug_limit, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, debug_limit, depth + 1
     )

     return count1 + count2
@@ -1235,13 +1279,13 @@ def save_agent_to_hf(data):
 # DATA MANAGEMENT
 # =============================================================================

-def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
+def fetch_new_issues_for_agent(agent_identifier, token_pool, query_patterns=None):
     """
     Fetch and save new issues for an agent from yesterday 12am UTC to today 12am UTC.

     Args:
         agent_identifier: GitHub identifier of the agent
-        token: GitHub API token
+        token_pool: TokenPool instance for rotating tokens
         query_patterns: List of query patterns to search (if None, uses default)

     Returns:
@@ -1260,8 +1304,6 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
     today_midnight = now_utc.replace(hour=0, minute=0, second=0, microsecond=0)
     yesterday_midnight = today_midnight - timedelta(days=1)

-    headers = {'Authorization': f'token {token}'} if token else {}
-
     print(f"\n 📥 Fetching new issues for {agent_identifier}...")
     print(f" Time range: {yesterday_midnight.isoformat()} to {today_midnight.isoformat()}")
@@ -1276,7 +1318,7 @@ def fetch_new_issues_for_agent(agent_identifier, token, query_patterns=None):
         base_query,
         yesterday_midnight,
         today_midnight,
-        headers,
+        token_pool,
         issues_by_id,
         debug_limit=10 if DEBUG_MODE else None,
         depth=0
@@ -1316,8 +1358,12 @@ def update_all_agents_incremental():
     print(f"{'='*80}")

     try:
-        # Get GitHub token
-        token = get_github_token()
+        # Load all GitHub tokens and create token pool
+        tokens = get_github_tokens()
+        token_pool = TokenPool(tokens)
+
+        # Get first token for functions that still need single token
+        token = tokens[0] if tokens else None

         # Load agent metadata from HuggingFace
         agents = load_agents_from_hf()
@@ -1358,7 +1404,7 @@ def update_all_agents_incremental():
             continue

         try:
-            new_count = fetch_new_issues_for_agent(identifier, token)
+            new_count = fetch_new_issues_for_agent(identifier, token_pool)
             total_new_issues += new_count
         except Exception as e:
             print(f" ⚠️ Error fetching new issues for {identifier}: {str(e)}")
msr.py CHANGED
@@ -52,14 +52,52 @@ def save_jsonl(filename, data):
         f.write(json.dumps(item) + '\n')


+def get_github_tokens():
+    """Get all GitHub tokens from environment variables (all keys starting with GITHUB_TOKEN)."""
+    tokens = []
+    for key, value in os.environ.items():
+        if key.startswith('GITHUB_TOKEN') and value:
+            tokens.append(value)
+
+    if not tokens:
+        print("Warning: No GITHUB_TOKEN found. API rate limits: 60/hour (authenticated: 5000/hour)")
+    else:
+        print(f"✓ Loaded {len(tokens)} GitHub token(s) for rotation")
+
+    return tokens
+
+
 def get_github_token():
-    """Get GitHub token from environment variables."""
+    """Get primary GitHub token from environment variables (backward compatibility)."""
     token = os.getenv('GITHUB_TOKEN')
     if not token:
         print("Warning: GITHUB_TOKEN not found. API rate limits: 60/hour (authenticated: 5000/hour)")
     return token


+class TokenPool:
+    """
+    Manages a pool of GitHub tokens for load balancing across rate limits.
+    Rotates through tokens in round-robin fashion to distribute API calls.
+    """
+    def __init__(self, tokens):
+        self.tokens = tokens if tokens else [None]
+        self.current_index = 0
+
+    def get_next_token(self):
+        """Get the next token in round-robin order."""
+        if not self.tokens:
+            return None
+        token = self.tokens[self.current_index]
+        self.current_index = (self.current_index + 1) % len(self.tokens)
+        return token
+
+    def get_headers(self):
+        """Get headers with the next token in rotation."""
+        token = self.get_next_token()
+        return {'Authorization': f'token {token}'} if token else {}
+
+
 def get_hf_token():
     """Get HuggingFace token from environment variables."""
     token = os.getenv('HF_TOKEN')
@@ -144,12 +182,20 @@ def request_with_backoff(method, url, *, headers=None, params=None, json_body=No
     return None


-def fetch_issues_with_time_partition(base_query, start_date, end_date, headers, issues_by_id, depth=0):
+def fetch_issues_with_time_partition(base_query, start_date, end_date, token_pool, issues_by_id, depth=0):
     """
     Fetch issues within a specific time range using time-based partitioning.
     Recursively splits the time range if hitting the 1000-result limit.
     Supports splitting by day, hour, minute, and second as needed.

+    Args:
+        base_query: Base GitHub search query
+        start_date: Start date for time range
+        end_date: End date for time range
+        token_pool: TokenPool instance for rotating tokens
+        issues_by_id: Dictionary to store issues (deduplicated by ID)
+        depth: Current recursion depth
+
     Returns the number of issues found in this time partition.
     """
     # Calculate time difference
@@ -195,6 +241,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
     }

     try:
+        headers = token_pool.get_headers()
         response = request_with_backoff('GET', url, headers=headers, params=params)
         if response is None:
             print(f"{indent} Error: retries exhausted for range {start_str} to {end_str}")
@@ -242,7 +289,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(seconds=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -263,7 +310,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(minutes=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -284,7 +331,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(hours=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -315,7 +362,7 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,
             split_start = split_start + timedelta(days=1)

             count = fetch_issues_with_time_partition(
-                base_query, split_start, split_end, headers, issues_by_id, depth + 1
+                base_query, split_start, split_end, token_pool, issues_by_id, depth + 1
             )
             total_from_splits += count

@@ -326,10 +373,10 @@ def fetch_issues_with_time_partition(base_query, start_date, end_date, headers,

     # Recursively fetch both halves
     count1 = fetch_issues_with_time_partition(
-        base_query, start_date, mid_date, headers, issues_by_id, depth + 1
+        base_query, start_date, mid_date, token_pool, issues_by_id, depth + 1
     )
     count2 = fetch_issues_with_time_partition(
-        base_query, mid_date + timedelta(days=1), end_date, headers, issues_by_id, depth + 1
+        base_query, mid_date + timedelta(days=1), end_date, token_pool, issues_by_id, depth + 1
     )

     return count1 + count2
@@ -374,7 +421,7 @@ def extract_issue_metadata(issue):
     }


-def fetch_all_issues_metadata(identifier, agent_name, token=None):
+def fetch_all_issues_metadata(identifier, agent_name, token_pool):
     """
     Fetch issues associated with a GitHub user or bot for the past LEADERBOARD_TIME_FRAME_DAYS.
     Returns lightweight metadata instead of full issue objects.
@@ -387,12 +434,11 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
     Args:
         identifier: GitHub username or bot identifier
         agent_name: Human-readable name of the agent for metadata purposes
-        token: GitHub API token for authentication
+        token_pool: TokenPool instance for rotating tokens

     Returns:
         List of dictionaries containing minimal issue metadata
     """
-    headers = {'Authorization': f'token {token}'} if token else {}

     # Define query patterns for issues:
     # 1) author pattern: issues authored by the identifier
@@ -425,7 +471,7 @@ def fetch_all_issues_metadata(identifier, agent_name, token=None):
             query_pattern,
             start_date,
             end_date,
-            headers,
+            token_pool,
             issues_by_id
         )

@@ -633,7 +679,9 @@ def mine_all_agents():
     """
     Mine issue metadata for all agents within LEADERBOARD_TIME_FRAME_DAYS and save to HuggingFace.
     """
-    token = get_github_token()
+    # Load all GitHub tokens and create token pool
+    tokens = get_github_tokens()
+    token_pool = TokenPool(tokens)

     # Load agent metadata from HuggingFace
     agents = load_agents_from_hf()
@@ -661,7 +709,7 @@ def mine_all_agents():
     print(f"{'='*80}")

     # Fetch issue metadata
-    metadata = fetch_all_issues_metadata(identifier, agent_name, token)
+    metadata = fetch_all_issues_metadata(identifier, agent_name, token_pool)

     if metadata:
         print(f"💾 Saving {len(metadata)} issue records...")