refine
app.py
CHANGED
@@ -162,17 +162,83 @@ def generate_table_union_statements(start_date, end_date):
     return " UNION ALL ".join(union_parts)


+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
+
+    Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
+    and correlated subqueries. Each batch query runs much faster than one massive query.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of issue metadata
+    """
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
+    print(f" Batch size: {batch_size} agents per query")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    print(f" Total batches: {len(batches)}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n{'─'*80}")
+        print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
+        print(f"{'─'*80}")
+
+        try:
+            batch_results = fetch_all_issue_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
+
+        except Exception as e:
+            print(f" ✗ Batch {batch_num} failed: {str(e)}")
+            print(f" Continuing with remaining batches...")
+            import traceback
+            traceback.print_exc()
+            continue
+
+    print(f"\n{'='*80}")
+    print(f"✅ All batches completed!")
+    print(f" Total agents with data: {len(all_metadata)}")
+    total_issues = sum(len(issues) for issues in all_metadata.values())
+    print(f" Total issues found: {total_issues}")
+    print(f"{'='*80}\n")
+
+    return all_metadata
+
+
 def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch issue metadata for
+    Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.

     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
     deduplicates to get the latest state of each issue. Filters by issue author,
     commenter, or assignee.

+    NOTE: This function is designed for smaller batches (~100 agents). For large
+    numbers of agents, use fetch_issue_metadata_batched() instead.
+
     Args:
         client: BigQuery client instance
-        identifiers: List of GitHub usernames/bot identifiers
+        identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)

@@ -191,7 +257,7 @@ def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
         ...
     }
     """
-    print(f"\n🔍 Querying BigQuery for
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

     # Generate table UNION statements for issue events
@@ -1171,7 +1237,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"⛏️ [MINE] Starting BigQuery data mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"⚠️ This will query BigQuery and may take several minutes")
     print(f"{'='*80}\n")

@@ -1188,8 +1254,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

     try:
-
-
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
        )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -1237,13 +1304,17 @@ def mine_all_agents():
            error_count += 1
            continue

+    # Calculate number of batches executed
+    batch_size = 100
+    num_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f" Total agents: {len(agents)}")
     print(f" Successfully saved: {success_count}")
     print(f" No data (skipped): {no_data_count}")
     print(f" Errors: {error_count}")
-    print(f" BigQuery
+    print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")

     # After mining is complete, save leaderboard and metrics to HuggingFace
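The new entry point can be exercised on its own. The snippet below is a minimal usage sketch, not code from this commit: the import path (app), the placeholder identifiers, and the 30-day window are assumptions for illustration, and it presumes google-cloud-bigquery is installed with default credentials available.

# Hypothetical usage sketch of the batched fetch added in this commit.
from datetime import datetime, timedelta, timezone
from google.cloud import bigquery
from app import fetch_issue_metadata_batched  # assumed import path

client = bigquery.Client()
identifiers = ["example-agent[bot]", "another-agent"]  # placeholder identifiers
end_date = datetime.now(timezone.utc)
start_date = end_date - timedelta(days=30)  # illustrative window

# A smaller batch_size trades more (cheaper) queries for lower per-query load.
all_metadata = fetch_issue_metadata_batched(
    client, identifiers, start_date, end_date, batch_size=50
)
for identifier, issues in all_metadata.items():
    print(f"{identifier}: {len(issues)} issues")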
msr.py
CHANGED
@@ -118,17 +118,83 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================

+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
+
+    Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
+    and correlated subqueries. Each batch query runs much faster than one massive query.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of issue metadata
+    """
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
+    print(f" Batch size: {batch_size} agents per query")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    print(f" Total batches: {len(batches)}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n{'─'*80}")
+        print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
+        print(f"{'─'*80}")
+
+        try:
+            batch_results = fetch_all_issue_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f" ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
+
+        except Exception as e:
+            print(f" ✗ Batch {batch_num} failed: {str(e)}")
+            print(f" Continuing with remaining batches...")
+            import traceback
+            traceback.print_exc()
+            continue
+
+    print(f"\n{'='*80}")
+    print(f"✅ All batches completed!")
+    print(f" Total agents with data: {len(all_metadata)}")
+    total_issues = sum(len(issues) for issues in all_metadata.values())
+    print(f" Total issues found: {total_issues}")
+    print(f"{'='*80}\n")
+
+    return all_metadata
+
+
 def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch issue metadata for
+    Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.

     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
     deduplicates to get the latest state of each issue. Filters by issue author,
     commenter, or assignee.

+    NOTE: This function is designed for smaller batches (~100 agents). For large
+    numbers of agents, use fetch_issue_metadata_batched() instead.
+
     Args:
         client: BigQuery client instance
-        identifiers: List of GitHub usernames/bot identifiers
+        identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)

@@ -147,7 +213,7 @@ def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
         ...
     }
     """
-    print(f"\n🔍 Querying BigQuery for
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
     print(f" Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

     # Generate table UNION statements for issue events
@@ -715,7 +781,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"Starting issue metadata mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"{'='*80}\n")

     # Initialize BigQuery client
@@ -731,8 +797,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)

     try:
-
-
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
        )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -780,13 +847,17 @@ def mine_all_agents():
            error_count += 1
            continue

+    # Calculate number of batches executed
+    batch_size = 100
+    num_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f" Total agents: {len(agents)}")
     print(f" Successfully saved: {success_count}")
     print(f" No data (skipped): {no_data_count}")
     print(f" Errors: {error_count}")
-    print(f" BigQuery
+    print(f" BigQuery batches executed: {num_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")

     # After mining is complete, save leaderboard and metrics to HuggingFace
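The rationale in both docstrings (a huge UNNEST array feeding correlated subqueries makes one giant query slow) is easiest to see in the shape of a per-batch call. The snippet below is only a sketch of that idea: the actual SQL built by generate_table_union_statements and fetch_all_issue_metadata_single_query is not part of this diff, so the query text, column names, and parameter name here are assumptions for illustration, not the project's real query.

from google.cloud import bigquery

def run_batch(client, batch_identifiers, tables_sql):
    # Hypothetical query shape: filter GitHub Archive events to one batch of agents.
    # Keeping @identifiers to roughly 100 entries is what the batched wrapper enforces,
    # so the IN UNNEST(...) filter stays small for every query BigQuery has to plan.
    sql = f"""
        SELECT actor.login AS identifier, payload
        FROM ({tables_sql})
        WHERE actor.login IN UNNEST(@identifiers)
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("identifiers", "STRING", batch_identifiers),
        ]
    )
    return list(client.query(sql, job_config=job_config).result())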