import json
import os
import time
from datetime import datetime, timezone, timedelta
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.errors import HfHubHTTPError
from dotenv import load_dotenv
import duckdb
import backoff
import requests
import requests.exceptions
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import logging
import traceback
import subprocess
import re
# Load environment variables
load_dotenv()
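# The environment file is expected to provide at least the two variables read in this
# module (HF_TOKEN for HuggingFace uploads, COMPOSE_PROJECT_NAME for the leaderboard
# filename). A minimal illustrative .env might look like (values are hypothetical):
#
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxx
#   COMPOSE_PROJECT_NAME=swe-arena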
# =============================================================================
# CONFIGURATION
# =============================================================================
# Get script directory for relative paths
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(SCRIPT_DIR) # Parent directory
AGENTS_REPO = "SWE-Arena/bot_data"
AGENTS_REPO_LOCAL_PATH = os.path.join(BASE_DIR, "bot_data") # Local git clone path
DUCKDB_CACHE_FILE = os.path.join(SCRIPT_DIR, "cache.duckdb")
GHARCHIVE_DATA_LOCAL_PATH = os.path.join(BASE_DIR, "gharchive/data")
LEADERBOARD_FILENAME = f"{os.getenv('COMPOSE_PROJECT_NAME')}.json"
LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"
LEADERBOARD_TIME_FRAME_DAYS = 180
LONGSTANDING_GAP_DAYS = 30 # Minimum days for an issue to be considered long-standing
# GitHub organizations and repositories to track for wanted issues
TRACKED_ORGS = [
"apache",
"github",
"huggingface",
]
# Labels that indicate "patch wanted" status
PATCH_WANTED_LABELS = [
"bug",
"enhancement",
]
# Git sync configuration (mandatory to get latest bot data)
GIT_SYNC_TIMEOUT = 300 # 5 minutes timeout for git pull
# Streaming batch configuration
BATCH_SIZE_DAYS = 1 # Process 1 day at a time (~24 hourly files)
# Download configuration
DOWNLOAD_WORKERS = 4
DOWNLOAD_RETRY_DELAY = 2
MAX_RETRIES = 5
# Upload configuration
UPLOAD_DELAY_SECONDS = 5
UPLOAD_MAX_BACKOFF = 3600
# Scheduler configuration
SCHEDULE_ENABLED = False
SCHEDULE_DAY_OF_WEEK = 'fri' # Friday
SCHEDULE_HOUR = 0
SCHEDULE_MINUTE = 0
SCHEDULE_TIMEZONE = 'UTC'
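# With the defaults above, the script runs the mining job once and exits; setting
# SCHEDULE_ENABLED = True would instead run it weekly on Friday at 00:00 UTC
# (see setup_scheduler() and the entry point at the bottom of this file).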
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def load_jsonl(filename):
"""Load JSONL file and return list of dictionaries."""
if not os.path.exists(filename):
return []
data = []
with open(filename, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line:
try:
data.append(json.loads(line))
except json.JSONDecodeError as e:
print(f"Warning: Skipping invalid JSON line: {e}")
return data
def save_jsonl(filename, data):
"""Save list of dictionaries to JSONL file."""
with open(filename, 'w', encoding='utf-8') as f:
for item in data:
f.write(json.dumps(item) + '\n')
def normalize_date_format(date_string):
"""Convert date strings or datetime objects to standardized ISO 8601 format with Z suffix."""
if not date_string or date_string == 'N/A':
return 'N/A'
try:
if isinstance(date_string, datetime):
return date_string.strftime('%Y-%m-%dT%H:%M:%SZ')
        date_string = re.sub(r'\s+', ' ', date_string.strip())  # collapse runs of whitespace
date_string = date_string.replace(' ', 'T')
if len(date_string) >= 3:
if date_string[-3:-2] in ('+', '-') and ':' not in date_string[-3:]:
date_string = date_string + ':00'
dt = datetime.fromisoformat(date_string.replace('Z', '+00:00'))
return dt.strftime('%Y-%m-%dT%H:%M:%SZ')
except Exception as e:
print(f"Warning: Could not parse date '{date_string}': {e}")
return date_string
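# Illustrative conversions (hypothetical inputs):
#   normalize_date_format('2024-05-01 12:30:45+00')  -> '2024-05-01T12:30:45Z'
#   normalize_date_format('2024-05-01T12:30:45Z')    -> '2024-05-01T12:30:45Z'
#   normalize_date_format('N/A')                     -> 'N/A'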
def get_hf_token():
"""Get HuggingFace token from environment variables."""
token = os.getenv('HF_TOKEN')
if not token:
print("Warning: HF_TOKEN not found in environment variables")
return token
# =============================================================================
# GHARCHIVE DOWNLOAD FUNCTIONS
# =============================================================================
def download_file(url):
"""Download a GHArchive file with retry logic."""
filename = url.split("/")[-1]
filepath = os.path.join(GHARCHIVE_DATA_LOCAL_PATH, filename)
if os.path.exists(filepath):
return True
for attempt in range(MAX_RETRIES):
try:
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(filepath, "wb") as f:
f.write(response.content)
return True
except requests.exceptions.HTTPError as e:
# 404 means the file doesn't exist in GHArchive - skip without retry
if e.response.status_code == 404:
if attempt == 0: # Only log once, not for each retry
print(f" ⚠ {filename}: Not available (404) - skipping")
return False
# Other HTTP errors (5xx, etc.) should be retried
wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
time.sleep(wait_time)
except Exception as e:
# Network errors, timeouts, etc. should be retried
wait_time = DOWNLOAD_RETRY_DELAY * (2 ** attempt)
print(f" ⚠ {filename}: {e}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
time.sleep(wait_time)
return False
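# GHArchive publishes one gzipped JSON file per hour, named <YYYY-MM-DD>-<H>.json.gz
# with a non-zero-padded hour, as constructed in download_all_gharchive_data() below.
# For example, the 15:00 UTC file for a hypothetical date 2024-05-01 would come from
# https://data.gharchive.org/2024-05-01-15.json.gz and be stored under
# GHARCHIVE_DATA_LOCAL_PATH with the same filename.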
def download_all_gharchive_data():
"""Download all GHArchive data files for the last LEADERBOARD_TIME_FRAME_DAYS."""
os.makedirs(GHARCHIVE_DATA_LOCAL_PATH, exist_ok=True)
end_date = datetime.now(timezone.utc)
start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
urls = []
current_date = start_date
while current_date <= end_date:
date_str = current_date.strftime("%Y-%m-%d")
for hour in range(24):
url = f"https://data.gharchive.org/{date_str}-{hour}.json.gz"
urls.append(url)
current_date += timedelta(days=1)
downloads_processed = 0
try:
with ThreadPoolExecutor(max_workers=DOWNLOAD_WORKERS) as executor:
futures = [executor.submit(download_file, url) for url in urls]
for future in as_completed(futures):
downloads_processed += 1
print(f" Download complete: {downloads_processed} files")
return True
except Exception as e:
print(f"Error during download: {str(e)}")
traceback.print_exc()
return False
# =============================================================================
# HUGGINGFACE API WRAPPERS
# =============================================================================
def is_retryable_error(e):
"""Check if exception is retryable (rate limit or timeout error)."""
if isinstance(e, HfHubHTTPError):
if e.response.status_code == 429:
return True
if isinstance(e, (requests.exceptions.Timeout,
requests.exceptions.ReadTimeout,
requests.exceptions.ConnectTimeout)):
return True
if isinstance(e, Exception):
error_str = str(e).lower()
if 'timeout' in error_str or 'timed out' in error_str:
return True
return False
@backoff.on_exception(
backoff.expo,
(HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
max_tries=MAX_RETRIES,
base=300,
max_value=3600,
giveup=lambda e: not is_retryable_error(e),
on_backoff=lambda details: print(
f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
)
)
def list_repo_files_with_backoff(api, **kwargs):
"""Wrapper for api.list_repo_files() with exponential backoff."""
return api.list_repo_files(**kwargs)
@backoff.on_exception(
backoff.expo,
(HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
max_tries=MAX_RETRIES,
base=300,
max_value=3600,
giveup=lambda e: not is_retryable_error(e),
on_backoff=lambda details: print(
f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
)
)
def hf_hub_download_with_backoff(**kwargs):
"""Wrapper for hf_hub_download() with exponential backoff."""
return hf_hub_download(**kwargs)
@backoff.on_exception(
backoff.expo,
(HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
max_tries=MAX_RETRIES,
base=300,
max_value=3600,
giveup=lambda e: not is_retryable_error(e),
on_backoff=lambda details: print(
f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
)
)
def upload_file_with_backoff(api, **kwargs):
"""Wrapper for api.upload_file() with exponential backoff."""
return api.upload_file(**kwargs)
@backoff.on_exception(
backoff.expo,
(HfHubHTTPError, requests.exceptions.Timeout, requests.exceptions.RequestException, Exception),
max_tries=MAX_RETRIES,
base=300,
max_value=3600,
giveup=lambda e: not is_retryable_error(e),
on_backoff=lambda details: print(
f" {details['exception']} error. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/5..."
)
)
def upload_folder_with_backoff(api, **kwargs):
"""Wrapper for api.upload_folder() with exponential backoff."""
return api.upload_folder(**kwargs)
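# Each wrapper above retries up to MAX_RETRIES times with exponential backoff
# (roughly a 300s base, capped at 3600s) and gives up immediately when
# is_retryable_error() decides the failure is not a rate limit or timeout.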
def get_duckdb_connection():
"""
Initialize DuckDB connection with OPTIMIZED memory settings.
Uses persistent database and reduced memory footprint.
Automatically removes cache file if lock conflict is detected.
"""
try:
conn = duckdb.connect(DUCKDB_CACHE_FILE)
except Exception as e:
# Check if it's a locking error
error_msg = str(e)
if "lock" in error_msg.lower() or "conflicting" in error_msg.lower():
print(f" ⚠ Lock conflict detected, removing {DUCKDB_CACHE_FILE}...")
if os.path.exists(DUCKDB_CACHE_FILE):
os.remove(DUCKDB_CACHE_FILE)
print(f" ✓ Cache file removed, retrying connection...")
# Retry connection after removing cache
conn = duckdb.connect(DUCKDB_CACHE_FILE)
else:
# Re-raise if it's not a locking error
raise
# CORE MEMORY & THREADING SETTINGS
conn.execute(f"SET threads TO 6;")
conn.execute(f"SET max_memory = '50GB';")
conn.execute("SET temp_directory = '/tmp/duckdb_temp';")
# PERFORMANCE OPTIMIZATIONS
conn.execute("SET preserve_insertion_order = false;") # Disable expensive ordering
conn.execute("SET enable_object_cache = true;") # Cache repeatedly read files
return conn
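# Typical usage in this script (mirrors mine_all_agents() below):
#   conn = get_duckdb_connection()
#   try:
#       ...  # run the batched read_json queries
#   finally:
#       conn.close()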
def generate_file_path_patterns(start_date, end_date, data_dir=GHARCHIVE_DATA_LOCAL_PATH):
"""Generate file path patterns for GHArchive data in date range (only existing files)."""
file_patterns = []
missing_dates = set()
current_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)
end_day = end_date.replace(hour=0, minute=0, second=0, microsecond=0)
while current_date <= end_day:
date_has_files = False
for hour in range(24):
pattern = os.path.join(data_dir, f"{current_date.strftime('%Y-%m-%d')}-{hour}.json.gz")
if os.path.exists(pattern):
file_patterns.append(pattern)
date_has_files = True
if not date_has_files:
missing_dates.add(current_date.strftime('%Y-%m-%d'))
current_date += timedelta(days=1)
if missing_dates:
print(f" ⚠ Skipping {len(missing_dates)} date(s) with no data")
return file_patterns
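# Example (hypothetical date): for 2024-05-01 this yields paths such as
#   <GHARCHIVE_DATA_LOCAL_PATH>/2024-05-01-0.json.gz ... 2024-05-01-23.json.gz
# but only for the hourly files that actually exist on disk.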
# =============================================================================
# STREAMING BATCH PROCESSING - UNIFIED QUERY FOR ALL METADATA
# =============================================================================
def fetch_all_metadata_streaming(conn, identifiers, start_date, end_date):
"""
QUERY: Fetch both issue and discussion metadata using streaming batch processing:
- IssuesEvent, IssueCommentEvent (for assistant-assigned issues AND wanted issues)
- PullRequestEvent (for wanted issue tracking)
- DiscussionEvent (for discussion tracking)
Args:
conn: DuckDB connection instance
identifiers: List of GitHub usernames/bot identifiers
start_date: Start datetime (timezone-aware)
end_date: End datetime (timezone-aware)
Returns:
Dictionary with four keys:
- 'agent_issues': {agent_id: [issue_metadata]} for assistant-assigned issues
- 'wanted_open': [open_wanted_issues] for long-standing open issues
- 'wanted_resolved': {agent_id: [resolved_wanted]} for resolved wanted issues
- 'agent_discussions': {agent_id: [discussion_metadata]} for assistant discussions
"""
identifier_set = set(identifiers)
identifier_list = ', '.join([f"'{id}'" for id in identifiers])
tracked_orgs_list = ', '.join([f"'{org}'" for org in TRACKED_ORGS])
# Storage for assistant-assigned issues
agent_issues = defaultdict(list) # agent_id -> [issue_metadata]
agent_issue_urls = defaultdict(set) # agent_id -> set of issue URLs (for deduplication)
# Storage for wanted issues
all_issues = {} # issue_url -> issue_metadata
issue_to_prs = defaultdict(set) # issue_url -> set of PR URLs
pr_creators = {} # pr_url -> creator login
pr_merged_at = {} # pr_url -> merged_at timestamp
# Storage for discussions
discussions_by_agent = defaultdict(list)
# Calculate total batches
total_days = (end_date - start_date).days
total_batches = (total_days // BATCH_SIZE_DAYS) + 1
# Process in batches
current_date = start_date
batch_num = 0
print(f" Streaming {total_batches} batches of {BATCH_SIZE_DAYS}-day intervals...")
while current_date <= end_date:
batch_num += 1
batch_end = min(current_date + timedelta(days=BATCH_SIZE_DAYS - 1), end_date)
# Get file patterns for THIS BATCH ONLY
file_patterns = generate_file_path_patterns(current_date, batch_end)
if not file_patterns:
print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} - NO DATA")
current_date = batch_end + timedelta(days=1)
continue
# Progress indicator
print(f" Batch {batch_num}/{total_batches}: {current_date.date()} to {batch_end.date()} ({len(file_patterns)} files)... ", end="", flush=True)
# Build file patterns SQL for THIS BATCH
file_patterns_sql = '[' + ', '.join([f"'{fp}'" for fp in file_patterns]) + ']'
try:
# UNIFIED QUERY: Optimized for gzip decompression
unified_query = f"""
SELECT
type,
json_extract_string(repo, '$.name') as repo_name,
json_extract_string(repo, '$.url') as repo_url,
-- Issue fields
json_extract_string(payload, '$.issue.html_url') as issue_url,
json_extract_string(payload, '$.issue.title') as issue_title,
json_extract_string(payload, '$.issue.number') as issue_number,
json_extract_string(payload, '$.issue.created_at') as issue_created_at,
json_extract_string(payload, '$.issue.closed_at') as issue_closed_at,
json_extract(payload, '$.issue.labels') as issue_labels,
json_extract_string(payload, '$.issue.pull_request') as is_pull_request,
json_extract_string(payload, '$.issue.state_reason') as issue_state_reason,
-- Actor/assignee fields for assistant assignment
json_extract_string(payload, '$.issue.user.login') as issue_creator,
json_extract_string(payload, '$.issue.assignee.login') as issue_assignee,
json_extract(payload, '$.issue.assignees') as issue_assignees,
json_extract_string(payload, '$.comment.user.login') as commenter,
-- PR fields - simplified with COALESCE
COALESCE(
json_extract_string(payload, '$.issue.html_url'),
json_extract_string(payload, '$.pull_request.html_url')
) as pr_url,
COALESCE(
json_extract_string(payload, '$.issue.user.login'),
json_extract_string(payload, '$.pull_request.user.login')
) as pr_creator,
COALESCE(
json_extract_string(payload, '$.issue.pull_request.merged_at'),
json_extract_string(payload, '$.pull_request.merged_at')
) as pr_merged_at,
COALESCE(
json_extract_string(payload, '$.issue.body'),
json_extract_string(payload, '$.pull_request.body')
) as pr_body,
-- Discussion fields
json_extract_string(payload, '$.discussion.html_url') as discussion_url,
json_extract_string(payload, '$.discussion.user.login') as discussion_creator,
json_extract_string(payload, '$.discussion.created_at') as discussion_created_at,
json_extract_string(payload, '$.discussion.answer_chosen_at') as discussion_closed_at,
json_extract_string(payload, '$.discussion.state_reason') as discussion_state_reason,
json_extract_string(payload, '$.action') as action
FROM read_json(
{file_patterns_sql},
union_by_name=true,
filename=true,
compression='gzip',
format='newline_delimited',
ignore_errors=true
)
WHERE
type IN ('IssuesEvent', 'IssueCommentEvent', 'PullRequestEvent', 'DiscussionEvent')
AND (
-- Assistant-assigned issues: assistant is creator, assignee, or commenter
(type = 'IssuesEvent' AND (
json_extract_string(payload, '$.issue.user.login') IN ({identifier_list})
OR json_extract_string(payload, '$.issue.assignee.login') IN ({identifier_list})
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
))
-- Issue comments: assistant is commenter OR tracked org
OR (type = 'IssueCommentEvent' AND (
json_extract_string(payload, '$.comment.user.login') IN ({identifier_list})
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
))
-- PRs: assistant is creator OR tracked org (for wanted issue tracking)
OR (type = 'PullRequestEvent' AND (
json_extract_string(payload, '$.pull_request.user.login') IN ({identifier_list})
OR SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
))
-- Discussions: assistant is creator AND tracked org
OR (type = 'DiscussionEvent'
AND json_extract_string(payload, '$.discussion.user.login') IN ({identifier_list})
AND SPLIT_PART(json_extract_string(repo, '$.name'), '/', 1) IN ({tracked_orgs_list})
)
)
"""
all_results = conn.execute(unified_query).fetchall()
# Post-process results to separate into different categories
# Row structure: [type, repo_name, repo_url, issue_url, issue_title, issue_number,
# issue_created_at, issue_closed_at, issue_labels, is_pull_request,
# issue_state_reason, issue_creator, issue_assignee, issue_assignees,
# commenter, pr_url, pr_creator, pr_merged_at, pr_body,
# discussion_url, discussion_creator, discussion_created_at,
# discussion_closed_at, discussion_state_reason, action]
issue_events = [] # For wanted tracking
pr_events = [] # For wanted tracking
discussion_events = [] # For discussion tracking
agent_issue_events = [] # For assistant-assigned issues
for row in all_results:
event_type = row[0]
is_pr = row[9] # is_pull_request field
if event_type in ('IssuesEvent', 'IssueCommentEvent'):
if not is_pr: # It's an issue, not a PR
# Check if this is an assistant-assigned issue
issue_creator = row[11]
issue_assignee = row[12]
issue_assignees_json = row[13]
commenter = row[14]
agent_identifier = None
if event_type == 'IssuesEvent':
# Check if issue creator, assignee, or any assignees match our identifiers
if issue_creator in identifier_set:
agent_identifier = issue_creator
elif issue_assignee in identifier_set:
agent_identifier = issue_assignee
else:
# Check assignees array
try:
if issue_assignees_json:
if isinstance(issue_assignees_json, str):
assignees_data = json.loads(issue_assignees_json)
else:
assignees_data = issue_assignees_json
if isinstance(assignees_data, list):
for assignee_obj in assignees_data:
if isinstance(assignee_obj, dict):
assignee_login = assignee_obj.get('login')
if assignee_login in identifier_set:
agent_identifier = assignee_login
break
except (json.JSONDecodeError, TypeError):
pass
elif event_type == 'IssueCommentEvent':
# Check if commenter is an assistant
if commenter in identifier_set:
agent_identifier = commenter
# Add to appropriate list
if agent_identifier:
agent_issue_events.append((row, agent_identifier))
# Always add to issue_events for wanted tracking (if from tracked orgs)
issue_events.append(row)
else:
# It's a PR
pr_events.append(row)
elif event_type == 'PullRequestEvent':
pr_events.append(row)
elif event_type == 'DiscussionEvent':
discussion_events.append(row)
# Process assistant-assigned issues
for row, agent_identifier in agent_issue_events:
# Row indices: repo_url=2, issue_url=3, issue_created_at=6, issue_closed_at=7, issue_state_reason=10
repo_url = row[2]
issue_url = row[3]
created_at = row[6]
closed_at = row[7]
state_reason = row[10]
if not issue_url or not agent_identifier:
continue
# Build full URL from repo_url if needed
if repo_url and '/issues/' not in issue_url:
issue_number = row[5]
full_url = f"{repo_url.replace('api.github.com/repos/', 'github.com/')}/issues/{issue_number}"
else:
full_url = issue_url
# Only include issues created within timeframe
if created_at:
try:
created_dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
if created_dt < start_date:
continue
except:
continue
# Deduplicate: only add if we haven't seen this issue for this assistant
if full_url in agent_issue_urls[agent_identifier]:
continue
agent_issue_urls[agent_identifier].add(full_url)
issue_metadata = {
'url': full_url,
'created_at': normalize_date_format(created_at),
'closed_at': normalize_date_format(closed_at) if closed_at else None,
'state_reason': state_reason,
}
agent_issues[agent_identifier].append(issue_metadata)
# Process issues for wanted tracking
for row in issue_events:
# Row indices: repo_name=1, issue_url=3, issue_title=4, issue_number=5,
# issue_created_at=6, issue_closed_at=7, issue_labels=8
repo_name = row[1]
issue_url = row[3]
title = row[4]
issue_number = row[5]
created_at = row[6]
closed_at = row[7]
labels_json = row[8]
if not issue_url or not repo_name:
continue
# Extract org from repo_name
parts = repo_name.split('/')
if len(parts) != 2:
continue
org = parts[0]
# Filter by tracked orgs
if org not in TRACKED_ORGS:
continue
# Parse labels
try:
if isinstance(labels_json, str):
labels_data = json.loads(labels_json)
else:
labels_data = labels_json
if not isinstance(labels_data, list):
label_names = []
else:
label_names = [label.get('name', '').lower() for label in labels_data if isinstance(label, dict)]
except (json.JSONDecodeError, TypeError):
label_names = []
# Determine state
normalized_closed_at = normalize_date_format(closed_at) if closed_at else None
state = 'closed' if (normalized_closed_at and normalized_closed_at != 'N/A') else 'open'
# Store issue metadata
all_issues[issue_url] = {
'url': issue_url,
'repo': repo_name,
'title': title,
'number': issue_number,
'state': state,
'created_at': normalize_date_format(created_at),
'closed_at': normalized_closed_at,
'labels': label_names
}
# Process PRs for wanted tracking
for row in pr_events:
# Row indices: pr_url=15, pr_creator=16, pr_merged_at=17, pr_body=18
pr_url = row[15]
pr_creator = row[16]
merged_at = row[17]
pr_body = row[18]
if not pr_url or not pr_creator:
continue
pr_creators[pr_url] = pr_creator
pr_merged_at[pr_url] = merged_at
# Extract linked issues from PR body
if pr_body:
# Match issue URLs or #number references
issue_refs = re.findall(r'(?:https?://github\.com/[\w-]+/[\w-]+/issues/\d+)|(?:#\d+)', pr_body, re.IGNORECASE)
for ref in issue_refs:
# Convert #number to full URL if needed
if ref.startswith('#'):
# Extract org/repo from PR URL
pr_parts = pr_url.split('/')
if len(pr_parts) >= 5:
org = pr_parts[-4]
repo = pr_parts[-3]
issue_num = ref[1:]
issue_url = f"https://github.com/{org}/{repo}/issues/{issue_num}"
issue_to_prs[issue_url].add(pr_url)
else:
issue_to_prs[ref].add(pr_url)
# Process discussions
for row in discussion_events:
# Row indices: repo_name=1, discussion_url=19, discussion_creator=20,
# discussion_created_at=21, discussion_closed_at=22,
# discussion_state_reason=23, action=24
repo_name = row[1]
discussion_url = row[19]
discussion_creator = row[20]
discussion_created_at = row[21]
discussion_closed_at = row[22]
discussion_state_reason = row[23]
action = row[24]
if not discussion_url or not repo_name:
continue
# Extract org from repo_name
parts = repo_name.split('/')
if len(parts) != 2:
continue
org = parts[0]
# Filter by tracked orgs
if org not in TRACKED_ORGS:
continue
# Parse discussion creation date to filter by time window
created_dt = None
if discussion_created_at:
try:
created_dt = datetime.fromisoformat(discussion_created_at.replace('Z', '+00:00'))
# Only track discussions created on or after start_date
if created_dt < start_date:
continue
except:
continue
# Group by creator (assistant identifier)
# Only track discussions from our assistant identifiers
if discussion_creator not in identifier_set:
continue
# Determine discussion state
# A discussion is "resolved" if it has an answer chosen OR is marked answered
is_resolved = False
if discussion_closed_at:
is_resolved = True
elif discussion_state_reason and 'answered' in discussion_state_reason.lower():
is_resolved = True
# Store discussion metadata
discussion_meta = {
'url': discussion_url,
'repo': repo_name,
'created_at': normalize_date_format(discussion_created_at),
'closed_at': normalize_date_format(discussion_closed_at) if discussion_closed_at else None,
'state': 'resolved' if is_resolved else 'open',
'state_reason': discussion_state_reason
}
# Group by assistant
if discussion_creator not in discussions_by_agent:
discussions_by_agent[discussion_creator] = []
discussions_by_agent[discussion_creator].append(discussion_meta)
print(f"✓ {len(agent_issue_events)} assistant issues, {len(issue_events)} wanted issues, {len(pr_events)} PRs, {len(discussion_events)} discussions")
except Exception as e:
print(f"\n ✗ Batch {batch_num} error: {str(e)}")
traceback.print_exc()
# Move to next batch
current_date = batch_end + timedelta(days=1)
# Post-processing: Filter issues and assign to assistants
print(f"\n Post-processing {len(all_issues)} wanted issues...")
wanted_open = []
wanted_resolved = defaultdict(list)
current_time = datetime.now(timezone.utc)
for issue_url, issue_meta in all_issues.items():
# Check if issue has linked PRs
linked_prs = issue_to_prs.get(issue_url, set())
if not linked_prs:
continue
# Check if any linked PR was merged AND created by an assistant
resolved_by = None
for pr_url in linked_prs:
merged_at = pr_merged_at.get(pr_url)
if merged_at: # PR was merged
pr_creator = pr_creators.get(pr_url)
if pr_creator in identifier_set:
resolved_by = pr_creator
break
if not resolved_by:
continue
# Process based on issue state
if issue_meta['state'] == 'open':
# For open issues: check if labels match PATCH_WANTED_LABELS
issue_labels = issue_meta.get('labels', [])
has_patch_label = False
for issue_label in issue_labels:
for wanted_label in PATCH_WANTED_LABELS:
if wanted_label.lower() in issue_label:
has_patch_label = True
break
if has_patch_label:
break
if not has_patch_label:
continue
# Check if long-standing
created_at_str = issue_meta.get('created_at')
if created_at_str and created_at_str != 'N/A':
try:
created_dt = datetime.fromisoformat(created_at_str.replace('Z', '+00:00'))
days_open = (current_time - created_dt).days
if days_open >= LONGSTANDING_GAP_DAYS:
wanted_open.append(issue_meta)
except:
pass
elif issue_meta['state'] == 'closed':
# For closed issues: must be closed within time frame AND open 30+ days
closed_at_str = issue_meta.get('closed_at')
created_at_str = issue_meta.get('created_at')
if closed_at_str and closed_at_str != 'N/A' and created_at_str and created_at_str != 'N/A':
try:
closed_dt = datetime.fromisoformat(closed_at_str.replace('Z', '+00:00'))
created_dt = datetime.fromisoformat(created_at_str.replace('Z', '+00:00'))
# Calculate how long the issue was open
days_open = (closed_dt - created_dt).days
# Only include if closed within timeframe AND was open 30+ days
if start_date <= closed_dt <= end_date and days_open >= LONGSTANDING_GAP_DAYS:
wanted_resolved[resolved_by].append(issue_meta)
except:
pass
print(f" ✓ Found {sum(len(issues) for issues in agent_issues.values())} assistant-assigned issues across {len(agent_issues)} assistants")
print(f" ✓ Found {len(wanted_open)} long-standing open wanted issues")
print(f" ✓ Found {sum(len(issues) for issues in wanted_resolved.values())} resolved wanted issues across {len(wanted_resolved)} assistants")
print(f" ✓ Found {sum(len(discussions) for discussions in discussions_by_agent.values())} discussions across {len(discussions_by_agent)} assistants")
return {
'agent_issues': dict(agent_issues),
'wanted_open': wanted_open,
'wanted_resolved': dict(wanted_resolved),
'agent_discussions': dict(discussions_by_agent)
}
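# Sketch of the returned structure (shapes only; 'some-bot' and the URL are
# hypothetical placeholders, not real data):
#   {
#     'agent_issues':      {'some-bot': [{'url': 'https://github.com/org/repo/issues/1',
#                                         'created_at': '2024-05-01T12:00:00Z',
#                                         'closed_at': None, 'state_reason': None}]},
#     'wanted_open':       [issue dicts with url/repo/title/number/state/created_at/closed_at/labels],
#     'wanted_resolved':   {'some-bot': [same issue dict shape as wanted_open]},
#     'agent_discussions': {'some-bot': [dicts with url/repo/created_at/closed_at/state/state_reason]},
#   }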
def sync_agents_repo():
"""
Sync local bot_data repository with remote using git pull.
This is MANDATORY to ensure we have the latest bot data.
Raises exception if sync fails.
"""
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
error_msg = f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}"
print(f" ✗ {error_msg}")
print(f" Please clone it first: git clone https://huggingface.co/datasets/{AGENTS_REPO}")
raise FileNotFoundError(error_msg)
if not os.path.exists(os.path.join(AGENTS_REPO_LOCAL_PATH, '.git')):
error_msg = f"{AGENTS_REPO_LOCAL_PATH} exists but is not a git repository"
print(f" ✗ {error_msg}")
raise ValueError(error_msg)
try:
# Run git pull with extended timeout due to large repository
result = subprocess.run(
['git', 'pull'],
cwd=AGENTS_REPO_LOCAL_PATH,
capture_output=True,
text=True,
timeout=GIT_SYNC_TIMEOUT
)
if result.returncode == 0:
output = result.stdout.strip()
if "Already up to date" in output or "Already up-to-date" in output:
print(f" ✓ Repository is up to date")
else:
print(f" ✓ Repository synced successfully")
if output:
# Print first few lines of output
lines = output.split('\n')[:5]
for line in lines:
print(f" {line}")
return True
else:
error_msg = f"Git pull failed: {result.stderr.strip()}"
print(f" ✗ {error_msg}")
raise RuntimeError(error_msg)
except subprocess.TimeoutExpired:
error_msg = f"Git pull timed out after {GIT_SYNC_TIMEOUT} seconds"
print(f" ✗ {error_msg}")
raise TimeoutError(error_msg)
except (FileNotFoundError, ValueError, RuntimeError, TimeoutError):
raise # Re-raise expected exceptions
except Exception as e:
error_msg = f"Error syncing repository: {str(e)}"
print(f" ✗ {error_msg}")
raise RuntimeError(error_msg) from e
def load_agents_from_hf():
"""
Load all assistant metadata JSON files from local git repository.
ALWAYS syncs with remote first to ensure we have the latest bot data.
"""
# MANDATORY: Sync with remote first to get latest bot data
print(f" Syncing bot_data repository to get latest assistants...")
sync_agents_repo() # Will raise exception if sync fails
assistants = []
# Scan local directory for JSON files
if not os.path.exists(AGENTS_REPO_LOCAL_PATH):
raise FileNotFoundError(f"Local repository not found at {AGENTS_REPO_LOCAL_PATH}")
# Walk through the directory to find all JSON files
files_processed = 0
print(f" Loading assistant metadata from {AGENTS_REPO_LOCAL_PATH}...")
for root, dirs, files in os.walk(AGENTS_REPO_LOCAL_PATH):
# Skip .git directory
if '.git' in root:
continue
for filename in files:
if not filename.endswith('.json'):
continue
files_processed += 1
file_path = os.path.join(root, filename)
try:
with open(file_path, 'r', encoding='utf-8') as f:
agent_data = json.load(f)
# Only include active assistants
if agent_data.get('status') != 'active':
continue
# Extract github_identifier from filename
github_identifier = filename.replace('.json', '')
agent_data['github_identifier'] = github_identifier
assistants.append(agent_data)
except Exception as e:
print(f" ⚠ Error loading {filename}: {str(e)}")
continue
print(f" ✓ Loaded {len(assistants)} active assistants (from {files_processed} total files)")
return assistants
def calculate_issue_stats_from_metadata(metadata_list):
"""Calculate statistics from a list of issue metadata."""
total_issues = len(metadata_list)
closed = sum(1 for issue_meta in metadata_list if issue_meta.get('closed_at'))
resolved = sum(1 for issue_meta in metadata_list
if issue_meta.get('state_reason') == 'completed')
# Resolved rate = resolved / closed (not resolved / total)
resolved_rate = (resolved / closed * 100) if closed > 0 else 0
return {
'total_issues': total_issues,
'closed_issues': closed,
'resolved_issues': resolved,
'resolved_rate': round(resolved_rate, 2),
}
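# Worked example (hypothetical numbers): for 10 issues of which 4 have closed_at set
# and 3 have state_reason == 'completed', this returns total_issues=10, closed_issues=4,
# resolved_issues=3 and resolved_rate=75.0 (3/4, not 3/10).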
def calculate_monthly_metrics_by_agent(all_metadata_dict, assistants):
"""Calculate monthly metrics for all assistants for visualization."""
identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}
if not all_metadata_dict:
return {'assistants': [], 'months': [], 'data': {}}
agent_month_data = defaultdict(lambda: defaultdict(list))
for agent_identifier, metadata_list in all_metadata_dict.items():
for issue_meta in metadata_list:
created_at = issue_meta.get('created_at')
if not created_at:
continue
agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
month_key = f"{dt.year}-{dt.month:02d}"
agent_month_data[agent_name][month_key].append(issue_meta)
except Exception as e:
print(f"Warning: Could not parse date '{created_at}': {e}")
continue
all_months = set()
for agent_data in agent_month_data.values():
all_months.update(agent_data.keys())
months = sorted(list(all_months))
result_data = {}
for agent_name, month_dict in agent_month_data.items():
resolved_rates = []
total_issues_list = []
resolved_issues_list = []
closed_issues_list = []
for month in months:
issues_in_month = month_dict.get(month, [])
resolved_count = sum(1 for issue in issues_in_month if issue.get('state_reason') == 'completed')
closed_count = sum(1 for issue in issues_in_month if issue.get('closed_at'))
total_count = len(issues_in_month)
# Resolved rate = resolved / closed (not resolved / total)
resolved_rate = (resolved_count / closed_count * 100) if closed_count > 0 else None
resolved_rates.append(resolved_rate)
total_issues_list.append(total_count)
resolved_issues_list.append(resolved_count)
closed_issues_list.append(closed_count)
result_data[agent_name] = {
'resolved_rates': resolved_rates,
'total_issues': total_issues_list,
'resolved_issues': resolved_issues_list,
'closed_issues': closed_issues_list
}
agents_list = sorted(list(agent_month_data.keys()))
return {
'assistants': agents_list,
'months': months,
'data': result_data
}
def calculate_discussion_stats_from_metadata(metadata_list):
"""Calculate statistics from a list of discussion metadata."""
total_discussions = len(metadata_list)
resolved = sum(1 for discussion_meta in metadata_list if discussion_meta.get('state') == 'resolved')
# Resolved rate = resolved / total * 100
resolved_rate = (resolved / total_discussions * 100) if total_discussions > 0 else 0
return {
'total_discussions': total_discussions,
'resolved_discussions': resolved,
'discussion_resolved_rate': round(resolved_rate, 2),
}
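# Worked example (hypothetical numbers): for 5 discussions of which 2 have
# state == 'resolved', this returns total_discussions=5, resolved_discussions=2 and
# discussion_resolved_rate=40.0. Unlike the issue stats, the denominator here is the
# total count rather than the closed count.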
def calculate_monthly_metrics_by_agent_discussions(all_discussions_dict, assistants):
"""Calculate monthly metrics for discussions for all assistants for visualization."""
identifier_to_name = {assistant.get('github_identifier'): assistant.get('name') for assistant in assistants if assistant.get('github_identifier')}
if not all_discussions_dict:
return {'assistants': [], 'months': [], 'data': {}}
agent_month_data = defaultdict(lambda: defaultdict(list))
for agent_identifier, metadata_list in all_discussions_dict.items():
for discussion_meta in metadata_list:
created_at = discussion_meta.get('created_at')
if not created_at:
continue
agent_name = identifier_to_name.get(agent_identifier, agent_identifier)
try:
dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
month_key = f"{dt.year}-{dt.month:02d}"
agent_month_data[agent_name][month_key].append(discussion_meta)
except Exception as e:
print(f"Warning: Could not parse discussion date '{created_at}': {e}")
continue
all_months = set()
for agent_data in agent_month_data.values():
all_months.update(agent_data.keys())
months = sorted(list(all_months))
result_data = {}
for agent_name, month_dict in agent_month_data.items():
resolved_rates = []
total_discussions_list = []
resolved_discussions_list = []
for month in months:
discussions_in_month = month_dict.get(month, [])
resolved_count = sum(1 for discussion in discussions_in_month if discussion.get('state') == 'resolved')
total_count = len(discussions_in_month)
# Resolved rate = resolved / total * 100
resolved_rate = (resolved_count / total_count * 100) if total_count > 0 else None
resolved_rates.append(resolved_rate)
total_discussions_list.append(total_count)
resolved_discussions_list.append(resolved_count)
result_data[agent_name] = {
'resolved_rates': resolved_rates,
'total_discussions': total_discussions_list,
'resolved_discussions': resolved_discussions_list
}
agents_list = sorted(list(agent_month_data.keys()))
return {
'assistants': agents_list,
'months': months,
'data': result_data
}
def construct_leaderboard_from_metadata(all_metadata_dict, assistants, wanted_resolved_dict=None, discussions_dict=None):
"""Construct leaderboard from in-memory issue metadata and discussion metadata.
Args:
all_metadata_dict: Dictionary mapping assistant ID to list of issue metadata (assistant-assigned issues)
assistants: List of assistant metadata
wanted_resolved_dict: Optional dictionary mapping assistant ID to list of resolved wanted issues
discussions_dict: Optional dictionary mapping assistant ID to list of discussion metadata
"""
if not assistants:
print("Error: No assistants found")
return {}
if wanted_resolved_dict is None:
wanted_resolved_dict = {}
if discussions_dict is None:
discussions_dict = {}
cache_dict = {}
for assistant in assistants:
identifier = assistant.get('github_identifier')
agent_name = assistant.get('name', 'Unknown')
bot_data = all_metadata_dict.get(identifier, [])
stats = calculate_issue_stats_from_metadata(bot_data)
# Add wanted issues count
resolved_wanted = len(wanted_resolved_dict.get(identifier, []))
# Add discussion stats
discussion_metadata = discussions_dict.get(identifier, [])
discussion_stats = calculate_discussion_stats_from_metadata(discussion_metadata)
cache_dict[identifier] = {
'name': agent_name,
'website': assistant.get('website', 'N/A'),
'github_identifier': identifier,
**stats,
'resolved_wanted_issues': resolved_wanted,
**discussion_stats
}
return cache_dict
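# Each leaderboard entry merges the issue stats, the resolved wanted-issue count, and
# the discussion stats; an illustrative entry (hypothetical values) looks like:
#   {'name': 'Some Assistant', 'website': 'N/A', 'github_identifier': 'some-bot',
#    'total_issues': 10, 'closed_issues': 4, 'resolved_issues': 3, 'resolved_rate': 75.0,
#    'resolved_wanted_issues': 1,
#    'total_discussions': 5, 'resolved_discussions': 2, 'discussion_resolved_rate': 40.0}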
def save_leaderboard_data_to_hf(leaderboard_dict, issue_monthly_metrics, wanted_issues=None, discussion_monthly_metrics=None):
"""Save leaderboard data, monthly metrics, wanted issues, and discussion metrics to HuggingFace dataset."""
try:
token = get_hf_token()
if not token:
raise Exception("No HuggingFace token found")
api = HfApi(token=token)
if wanted_issues is None:
wanted_issues = []
combined_data = {
'metadata': {
'last_updated': datetime.now(timezone.utc).isoformat(),
'leaderboard_time_frame_days': LEADERBOARD_TIME_FRAME_DAYS,
'longstanding_gap_days': LONGSTANDING_GAP_DAYS,
'tracked_orgs': TRACKED_ORGS,
'patch_wanted_labels': PATCH_WANTED_LABELS
},
'leaderboard': leaderboard_dict,
'issue_monthly_metrics': issue_monthly_metrics,
'wanted_issues': wanted_issues,
'discussion_monthly_metrics': discussion_monthly_metrics
}
with open(LEADERBOARD_FILENAME, 'w') as f:
json.dump(combined_data, f, indent=2)
try:
upload_file_with_backoff(
api=api,
path_or_fileobj=LEADERBOARD_FILENAME,
path_in_repo=LEADERBOARD_FILENAME,
repo_id=LEADERBOARD_REPO,
repo_type="dataset"
)
return True
finally:
if os.path.exists(LEADERBOARD_FILENAME):
os.remove(LEADERBOARD_FILENAME)
except Exception as e:
print(f"Error saving leaderboard data: {str(e)}")
traceback.print_exc()
return False
# =============================================================================
# MINING FUNCTION
# =============================================================================
def mine_all_agents():
"""
Mine issue metadata for all assistants using STREAMING batch processing.
Downloads GHArchive data, then uses BATCH-based DuckDB queries.
"""
print(f"\n[1/4] Downloading GHArchive data...")
if not download_all_gharchive_data():
print("Warning: Download had errors, continuing with available data...")
print(f"\n[2/4] Loading assistant metadata...")
assistants = load_agents_from_hf()
if not assistants:
print("Error: No assistants found")
return
identifiers = [assistant['github_identifier'] for assistant in assistants if assistant.get('github_identifier')]
if not identifiers:
print("Error: No valid assistant identifiers found")
return
print(f"\n[3/4] Mining issue metadata ({len(identifiers)} assistants, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
try:
conn = get_duckdb_connection()
except Exception as e:
print(f"Failed to initialize DuckDB connection: {str(e)}")
return
current_time = datetime.now(timezone.utc)
end_date = current_time.replace(hour=0, minute=0, second=0, microsecond=0)
start_date = end_date - timedelta(days=LEADERBOARD_TIME_FRAME_DAYS)
try:
# USE UNIFIED STREAMING FUNCTION FOR ISSUES, WANTED, AND DISCUSSIONS
results = fetch_all_metadata_streaming(
conn, identifiers, start_date, end_date
)
agent_issues = results['agent_issues']
wanted_open = results['wanted_open']
wanted_resolved = results['wanted_resolved']
agent_discussions = results['agent_discussions']
except Exception as e:
print(f"Error during DuckDB fetch: {str(e)}")
traceback.print_exc()
return
finally:
conn.close()
print(f"\n[4/4] Saving leaderboard...")
try:
leaderboard_dict = construct_leaderboard_from_metadata(
agent_issues, assistants, wanted_resolved, agent_discussions
)
issue_monthly_metrics = calculate_monthly_metrics_by_agent(agent_issues, assistants)
discussion_monthly_metrics = calculate_monthly_metrics_by_agent_discussions(
agent_discussions, assistants
)
save_leaderboard_data_to_hf(
leaderboard_dict, issue_monthly_metrics, wanted_open, discussion_monthly_metrics
)
except Exception as e:
print(f"Error saving leaderboard: {str(e)}")
traceback.print_exc()
finally:
# Clean up DuckDB cache file to save storage
if os.path.exists(DUCKDB_CACHE_FILE):
try:
os.remove(DUCKDB_CACHE_FILE)
print(f" ✓ Cache file removed: {DUCKDB_CACHE_FILE}")
except Exception as e:
print(f" ⚠ Failed to remove cache file: {str(e)}")
# =============================================================================
# SCHEDULER SETUP
# =============================================================================
def setup_scheduler():
"""Set up APScheduler to run mining jobs periodically."""
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logging.getLogger('httpx').setLevel(logging.WARNING)
scheduler = BlockingScheduler(timezone=SCHEDULE_TIMEZONE)
trigger = CronTrigger(
day_of_week=SCHEDULE_DAY_OF_WEEK,
hour=SCHEDULE_HOUR,
minute=SCHEDULE_MINUTE,
timezone=SCHEDULE_TIMEZONE
)
scheduler.add_job(
mine_all_agents,
trigger=trigger,
id='mine_all_agents',
name='Mine GHArchive data for all assistants',
replace_existing=True
)
next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
print(f"Scheduler: Weekly on {SCHEDULE_DAY_OF_WEEK} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
print(f"Next run: {next_run}\n")
print(f"\nScheduler started")
scheduler.start()
# =============================================================================
# ENTRY POINT
# =============================================================================
if __name__ == "__main__":
if SCHEDULE_ENABLED:
setup_scheduler()
else:
mine_all_agents()