zhimin-z committed
Commit · 3da62f9
1 Parent(s): 5998589
refine
msr.py CHANGED
@@ -50,7 +50,7 @@ UPLOAD_DELAY_SECONDS = 5
 UPLOAD_MAX_BACKOFF = 3600
 
 # Scheduler configuration
-SCHEDULE_ENABLED =
+SCHEDULE_ENABLED = False
 SCHEDULE_DAY_OF_WEEK = 'sun'  # Sunday
 SCHEDULE_HOUR = 0
 SCHEDULE_MINUTE = 0
@@ -81,7 +81,7 @@ def save_jsonl(filename, data):
     """Save list of dictionaries to JSONL file."""
     with open(filename, 'w', encoding='utf-8') as f:
         for item in data:
-            f.write(json.dumps(item) + '
+            f.write(json.dumps(item) + '\n')
 
 
 def normalize_date_format(date_string):
@@ -448,7 +448,7 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
             print(f"✓ {batch_issues} issues found")
 
         except Exception as e:
-            print(f"
+            print(f"\n ✗ Batch {batch_num} error: {str(e)}")
             import traceback
             traceback.print_exc()
 
@@ -457,7 +457,7 @@ def fetch_all_issue_metadata_streaming(conn, identifiers, start_date, end_date):
 
     # Final summary
     agents_with_data = sum(1 for issues in metadata_by_agent.values() if issues)
-    print(f"
+    print(f"\n ✓ Complete: {total_issues} issues found for {agents_with_data}/{len(identifiers)} agents")
 
     return dict(metadata_by_agent)
 
@@ -499,7 +499,7 @@ def sync_agents_repo():
         print(f" ✓ Repository synced successfully")
         if output:
             # Print first few lines of output
-            lines = output.split('
+            lines = output.split('\n')[:5]
             for line in lines:
                 print(f" {line}")
         return True
@@ -736,12 +736,12 @@ def mine_all_agents():
     Mine issue metadata for all agents using STREAMING batch processing.
     Downloads GHArchive data, then uses BATCH-based DuckDB queries.
     """
-    print(f"
+    print(f"\n[1/4] Downloading GHArchive data...")
 
     if not download_all_gharchive_data():
         print("Warning: Download had errors, continuing with available data...")
 
-    print(f"
+    print(f"\n[2/4] Loading agent metadata...")
 
     agents = load_agents_from_hf()
     if not agents:
@@ -753,7 +753,7 @@ def mine_all_agents():
         print("Error: No valid agent identifiers found")
         return
 
-    print(f"
+    print(f"\n[3/4] Mining issue metadata ({len(identifiers)} agents, {LEADERBOARD_TIME_FRAME_DAYS} days)...")
 
     try:
         conn = get_duckdb_connection()
@@ -779,7 +779,7 @@ def mine_all_agents():
     finally:
         conn.close()
 
-    print(f"
+    print(f"\n[4/4] Saving leaderboard...")
 
     try:
         leaderboard_dict = construct_leaderboard_from_metadata(all_metadata, agents)
@@ -825,9 +825,9 @@ def setup_scheduler():
     from datetime import datetime
     next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
     print(f"Scheduler: Weekly on {SCHEDULE_DAY_OF_WEEK} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
-    print(f"Next run: {next_run}
+    print(f"Next run: {next_run}\n")
 
-    print(f"
+    print(f"\nScheduler started")
     scheduler.start()
 
 
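Context for the save_jsonl hunk: JSON Lines stores exactly one JSON object per line, so the '\n' appended after each json.dumps call is what keeps records line-delimited. A minimal round-trip sketch under that assumption (load_jsonl below is a hypothetical helper, not part of msr.py):

import json

def save_jsonl(filename, data):
    """Save list of dictionaries to JSONL file (one object per line)."""
    with open(filename, 'w', encoding='utf-8') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')  # newline terminates each JSONL record

def load_jsonl(filename):
    """Hypothetical inverse of save_jsonl: parse one dict per non-empty line."""
    with open(filename, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

records = [{"agent": "example", "issues": 3}]
save_jsonl("demo.jsonl", records)
assert load_jsonl("demo.jsonl") == records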
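Context for the setup_scheduler hunk: the calls visible in the diff (cron-style day_of_week/hour/minute/timezone settings, trigger.get_next_fire_time(None, now), scheduler.start()) match APScheduler's CronTrigger API, but the full setup is outside this diff. The following is only a sketch assuming APScheduler's BlockingScheduler, with a stub standing in for the real mining job:

# Sketch only: assumes APScheduler; msr.py's actual scheduler wiring is not shown here.
from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger

# Constants from the diff; SCHEDULE_TIMEZONE's value is not visible, 'UTC' is an assumption.
SCHEDULE_ENABLED = False
SCHEDULE_DAY_OF_WEEK = 'sun'  # Sunday
SCHEDULE_HOUR = 0
SCHEDULE_MINUTE = 0
SCHEDULE_TIMEZONE = 'UTC'

def mine_all_agents():
    """Stub for the real mining job defined in msr.py."""
    print("mining...")

def setup_scheduler():
    trigger = CronTrigger(day_of_week=SCHEDULE_DAY_OF_WEEK, hour=SCHEDULE_HOUR,
                          minute=SCHEDULE_MINUTE, timezone=SCHEDULE_TIMEZONE)
    scheduler = BlockingScheduler()
    scheduler.add_job(mine_all_agents, trigger)
    # Same next-run computation the diff shows: previous_fire_time=None, now in trigger tz.
    next_run = trigger.get_next_fire_time(None, datetime.now(trigger.timezone))
    print(f"Scheduler: Weekly on {SCHEDULE_DAY_OF_WEEK} at {SCHEDULE_HOUR:02d}:{SCHEDULE_MINUTE:02d} {SCHEDULE_TIMEZONE}")
    print(f"Next run: {next_run}\n")
    print(f"\nScheduler started")
    scheduler.start()  # blocks with BlockingScheduler; a BackgroundScheduler would return

if SCHEDULE_ENABLED:  # per this commit, the weekly run is switched off by default
    setup_scheduler()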