zhiminy committed
Commit adbc63a · 1 Parent(s): 9a5f3b6

Files changed (2):
  1. app.py +78 -7
  2. msr.py +78 -7
app.py CHANGED

@@ -162,17 +162,83 @@ def generate_table_union_statements(start_date, end_date):
     return " UNION ALL ".join(union_parts)
 
 
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
+
+    Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
+    and correlated subqueries. Each batch query runs much faster than one massive query.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of issue metadata
+    """
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
+    print(f"   Batch size: {batch_size} agents per query")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    print(f"   Total batches: {len(batches)}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n{'─'*80}")
+        print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
+        print(f"{'─'*80}")
+
+        try:
+            batch_results = fetch_all_issue_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f"   ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
+
+        except Exception as e:
+            print(f"   ✗ Batch {batch_num} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            import traceback
+            traceback.print_exc()
+            continue
+
+    print(f"\n{'='*80}")
+    print(f"✅ All batches completed!")
+    print(f"   Total agents with data: {len(all_metadata)}")
+    total_issues = sum(len(issues) for issues in all_metadata.values())
+    print(f"   Total issues found: {total_issues}")
+    print(f"{'='*80}\n")
+
+    return all_metadata
+
+
 def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch issue metadata for ALL agents using ONE comprehensive BigQuery query.
+    Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
 
     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
     deduplicates to get the latest state of each issue. Filters by issue author,
     commenter, or assignee.
 
+    NOTE: This function is designed for smaller batches (~100 agents). For large
+    numbers of agents, use fetch_issue_metadata_batched() instead.
+
     Args:
         client: BigQuery client instance
-        identifiers: List of GitHub usernames/bot identifiers
+        identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
 
@@ -191,7 +257,7 @@ def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
         ...
         }
     """
-    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
 
     # Generate table UNION statements for issue events
@@ -1171,7 +1237,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"⛏️ [MINE] Starting BigQuery data mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"⚠️ This will query BigQuery and may take several minutes")
     print(f"{'='*80}\n")
 
@@ -1188,8 +1254,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
     try:
-        all_metadata = fetch_all_issue_metadata_single_query(
-            client, identifiers, start_date, end_date
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
         )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -1237,13 +1304,17 @@ def mine_all_agents():
             error_count += 1
             continue
 
+    # Calculate number of batches executed
+    batch_size = 100
+    num_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f"   Total agents: {len(agents)}")
     print(f"   Successfully saved: {success_count}")
     print(f"   No data (skipped): {no_data_count}")
     print(f"   Errors: {error_count}")
-    print(f"   BigQuery queries executed: 1")
+    print(f"   BigQuery batches executed: {num_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")
 
     # After mining is complete, save leaderboard and metrics to HuggingFace
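The heart of the change is a single split-and-merge pattern: slice the identifier list into fixed-size chunks, run one query per chunk, and fold each chunk's result dictionary into one combined dictionary. Below is a minimal, self-contained sketch of that pattern; fake_fetch is a hypothetical stand-in for fetch_all_issue_metadata_single_query so the sketch runs without BigQuery, and setdefault() is used as an equivalent of the commit's if/else merge.

# Sketch of the commit's batching pattern (illustrative only).
def fake_fetch(batch):
    # Stand-in for fetch_all_issue_metadata_single_query: pretend
    # each identifier in the batch has exactly one issue record.
    return {ident: [{"issue": f"{ident}#1"}] for ident in batch}

def fetch_batched(identifiers, batch_size=100):
    # Fixed-size slices; the last batch may be smaller.
    batches = [identifiers[i:i + batch_size]
               for i in range(0, len(identifiers), batch_size)]
    all_metadata = {}
    for batch in batches:
        for ident, rows in fake_fetch(batch).items():
            # Equivalent to the commit's if/else merge: extend the
            # list if the identifier appeared in an earlier batch.
            all_metadata.setdefault(ident, []).extend(rows)
    return all_metadata

agents = [f"agent-{n}" for n in range(250)]
result = fetch_batched(agents, batch_size=100)
assert len(result) == 250  # 250 agents across 3 batches (100 + 100 + 50)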
msr.py CHANGED

@@ -118,17 +118,83 @@ def generate_table_union_statements(start_date, end_date):
 # BIGQUERY FUNCTIONS
 # =============================================================================
 
+def fetch_issue_metadata_batched(client, identifiers, start_date, end_date, batch_size=100):
+    """
+    Fetch issue metadata for ALL agents using BATCHED BigQuery queries.
+
+    Splits agents into smaller batches to avoid performance issues with large UNNEST arrays
+    and correlated subqueries. Each batch query runs much faster than one massive query.
+
+    Args:
+        client: BigQuery client instance
+        identifiers: List of GitHub usernames/bot identifiers
+        start_date: Start datetime (timezone-aware)
+        end_date: End datetime (timezone-aware)
+        batch_size: Number of agents per batch (default: 100)
+
+    Returns:
+        Dictionary mapping agent identifier to list of issue metadata
+    """
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents using BATCHED approach")
+    print(f"   Batch size: {batch_size} agents per query")
+
+    # Split identifiers into batches
+    batches = [identifiers[i:i + batch_size] for i in range(0, len(identifiers), batch_size)]
+    print(f"   Total batches: {len(batches)}")
+
+    # Collect results from all batches
+    all_metadata = {}
+
+    for batch_num, batch_identifiers in enumerate(batches, 1):
+        print(f"\n{'─'*80}")
+        print(f"📦 Processing Batch {batch_num}/{len(batches)} ({len(batch_identifiers)} agents)")
+        print(f"{'─'*80}")
+
+        try:
+            batch_results = fetch_all_issue_metadata_single_query(
+                client, batch_identifiers, start_date, end_date
+            )
+
+            # Merge results
+            for identifier, metadata_list in batch_results.items():
+                if identifier in all_metadata:
+                    all_metadata[identifier].extend(metadata_list)
+                else:
+                    all_metadata[identifier] = metadata_list
+
+            print(f"   ✓ Batch {batch_num} completed: {len(batch_results)} agents with data")
+
+        except Exception as e:
+            print(f"   ✗ Batch {batch_num} failed: {str(e)}")
+            print(f"   Continuing with remaining batches...")
+            import traceback
+            traceback.print_exc()
+            continue
+
+    print(f"\n{'='*80}")
+    print(f"✅ All batches completed!")
+    print(f"   Total agents with data: {len(all_metadata)}")
+    total_issues = sum(len(issues) for issues in all_metadata.values())
+    print(f"   Total issues found: {total_issues}")
+    print(f"{'='*80}\n")
+
+    return all_metadata
+
+
 def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
     """
-    Fetch issue metadata for ALL agents using ONE comprehensive BigQuery query.
+    Fetch issue metadata for a batch of agents using ONE comprehensive BigQuery query.
 
     This query fetches IssuesEvent and IssueCommentEvent from GitHub Archive and
     deduplicates to get the latest state of each issue. Filters by issue author,
     commenter, or assignee.
 
+    NOTE: This function is designed for smaller batches (~100 agents). For large
+    numbers of agents, use fetch_issue_metadata_batched() instead.
+
     Args:
         client: BigQuery client instance
-        identifiers: List of GitHub usernames/bot identifiers
+        identifiers: List of GitHub usernames/bot identifiers (recommended: <100)
         start_date: Start datetime (timezone-aware)
         end_date: End datetime (timezone-aware)
 
@@ -147,7 +213,7 @@ def fetch_all_issue_metadata_single_query(client, identifiers, start_date, end_date):
         ...
         }
     """
-    print(f"\n🔍 Querying BigQuery for ALL {len(identifiers)} agents in ONE QUERY")
+    print(f"\n🔍 Querying BigQuery for {len(identifiers)} agents in SINGLE QUERY")
     print(f"   Time range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
 
     # Generate table UNION statements for issue events
@@ -715,7 +781,7 @@ def mine_all_agents():
     print(f"\n{'='*80}")
     print(f"Starting issue metadata mining for {len(identifiers)} agents")
     print(f"Time frame: Last {LEADERBOARD_TIME_FRAME_DAYS} days")
-    print(f"Data source: BigQuery + GitHub Archive (ONE QUERY FOR ALL AGENTS)")
+    print(f"Data source: BigQuery + GitHub Archive (BATCHED QUERIES)")
     print(f"{'='*80}\n")
 
     # Initialize BigQuery client
@@ -731,8 +797,9 @@ def mine_all_agents():
     start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
 
     try:
-        all_metadata = fetch_all_issue_metadata_single_query(
-            client, identifiers, start_date, end_date
+        # Use batched approach for better performance
+        all_metadata = fetch_issue_metadata_batched(
+            client, identifiers, start_date, end_date, batch_size=100
         )
     except Exception as e:
         print(f"✗ Error during BigQuery fetch: {str(e)}")
@@ -780,13 +847,17 @@ def mine_all_agents():
             error_count += 1
             continue
 
+    # Calculate number of batches executed
+    batch_size = 100
+    num_batches = (len(identifiers) + batch_size - 1) // batch_size
+
     print(f"\n{'='*80}")
     print(f"✅ Mining complete!")
     print(f"   Total agents: {len(agents)}")
     print(f"   Successfully saved: {success_count}")
     print(f"   No data (skipped): {no_data_count}")
     print(f"   Errors: {error_count}")
-    print(f"   BigQuery queries executed: 1")
+    print(f"   BigQuery batches executed: {num_batches} (batch size: {batch_size})")
     print(f"{'='*80}\n")
 
     # After mining is complete, save leaderboard and metrics to HuggingFace
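For completeness, a hedged sketch of how the new entry point is driven, mirroring mine_all_agents(): the client construction, the identifier list, and the 180-day window below are illustrative assumptions, not values taken from this commit (LEADERBOARD_TIME_FRAME_DAYS is defined elsewhere in the repo).

# Illustrative driver for the batched fetch; assumes BigQuery
# credentials are configured in the environment.
from datetime import datetime, timedelta, timezone

from google.cloud import bigquery  # pip install google-cloud-bigquery

from msr import fetch_issue_metadata_batched

client = bigquery.Client()
identifiers = ["example-bot[bot]", "another-agent"]  # hypothetical agents

end_date = datetime.now(timezone.utc)        # timezone-aware, as the docstring requires
start_date = end_date - timedelta(days=180)  # stand-in for LEADERBOARD_TIME_FRAME_DAYS

all_metadata = fetch_issue_metadata_batched(
    client, identifiers, start_date, end_date, batch_size=100
)

# Same ceiling division the commit uses for its summary line,
# e.g. (250 + 100 - 1) // 100 == 3.
num_batches = (len(identifiers) + 100 - 1) // 100
print(f"{num_batches} batch(es) executed, {len(all_metadata)} agents with data")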