batch upload
app.py
CHANGED
@@ -666,6 +666,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     In debug mode, saves to in-memory cache only.
 
     This function APPENDS new metadata and DEDUPLICATES by html_url.
+    Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -682,6 +683,10 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(metadata_list)} issues) - NOT saved to HuggingFace")
         return True
 
+    import tempfile
+    import shutil
+
+    temp_dir = None
     try:
         token = get_hf_token()
         if not token:
@@ -689,14 +694,23 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
 
         api = HfApi()
 
+        # Create temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
+        print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
+
         for (issue_year, month, day), day_metadata in grouped.items():
             # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
             filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
             local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
-
+            local_path = os.path.join(agent_folder, local_filename)
+
+            print(f"  Preparing {len(day_metadata)} issues for {filename}...")
 
             # Download existing file if it exists
             existing_metadata = []
@@ -720,30 +734,31 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
             existing_by_url.update(new_by_url)
             merged_metadata = list(existing_by_url.values())
 
-            # Save
-            save_jsonl(
-            …
-            # Always clean up local file, even if upload fails
-            if os.path.exists(local_filename):
-                os.remove(local_filename)
+            # Save to temporary folder
+            save_jsonl(local_path, merged_metadata)
+            print(f"  ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
+
+        # Upload entire folder in a single commit
+        print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
+        api.upload_folder(
+            folder_path=agent_folder,
+            path_in_repo=agent_identifier,
+            repo_id=ISSUE_METADATA_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update metadata for {agent_identifier}"
+        )
+        print(f"  ✓ Successfully uploaded {len(grouped)} files in 1 commit")
 
         return True
 
     except Exception as e:
         print(f"✗ Error saving issue metadata: {str(e)}")
         return False
+    finally:
+        # Always clean up temporary directory
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
 
 
 def load_issue_metadata():
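For reference, the core of this change is one pattern: stage all of an agent's daily JSONL files in a temporary folder, push the folder to the dataset repo with a single upload_folder call (one commit per agent), and remove the staging folder in a finally block. Below is a minimal, self-contained sketch of that pattern; the function name upload_daily_files, its parameters, and the inline JSONL writing are placeholders standing in for the app's own helpers (save_jsonl, group_metadata_by_date, ISSUE_METADATA_REPO), not the actual code.

# Sketch of the batch-upload pattern, assuming a dict keyed by (year, month, day).
import json
import os
import shutil
import tempfile

from huggingface_hub import HfApi


def upload_daily_files(records_by_day, agent_identifier, repo_id, token):
    """Stage one JSONL file per day in a temp dir, then push them in a single commit."""
    temp_dir = tempfile.mkdtemp()
    try:
        agent_folder = os.path.join(temp_dir, agent_identifier)
        os.makedirs(agent_folder, exist_ok=True)

        # One file per day, mirroring the [agent_identifier]/YYYY.MM.DD.jsonl layout in the repo
        for (year, month, day), records in records_by_day.items():
            path = os.path.join(agent_folder, f"{year}.{month:02d}.{day:02d}.jsonl")
            with open(path, "w", encoding="utf-8") as f:
                for record in records:
                    f.write(json.dumps(record) + "\n")

        # upload_folder produces ONE commit for the whole folder,
        # instead of one commit per upload_file call
        HfApi().upload_folder(
            folder_path=agent_folder,
            path_in_repo=agent_identifier,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update metadata for {agent_identifier}",
        )
    finally:
        # Remove the staging directory whether or not the upload succeeded
        shutil.rmtree(temp_dir, ignore_errors=True)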
msr.py
CHANGED
@@ -564,11 +564,16 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function APPENDS new metadata and DEDUPLICATES by html_url.
+    Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
 
     Args:
         metadata_list: List of issue metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
+    temp_dir = None
     try:
         token = get_hf_token()
         if not token:
@@ -576,13 +581,22 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
 
         api = HfApi()
 
+        # Create temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
+        print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
+
         for (issue_year, month, day), day_metadata in grouped.items():
             filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
             local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
-
+            local_path = os.path.join(agent_folder, local_filename)
+
+            print(f"  Preparing {len(day_metadata)} issues for {filename}...")
 
             # Download existing file if it exists
             existing_metadata = []
@@ -606,30 +620,31 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
             existing_by_url.update(new_by_url)
             merged_metadata = list(existing_by_url.values())
 
-            # Save
-            save_jsonl(
-            …
-            # Always clean up local file, even if upload fails
-            if os.path.exists(local_filename):
-                os.remove(local_filename)
+            # Save to temporary folder
+            save_jsonl(local_path, merged_metadata)
+            print(f"  ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
+
+        # Upload entire folder in a single commit
+        print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
+        api.upload_folder(
+            folder_path=agent_folder,
+            path_in_repo=agent_identifier,
+            repo_id=ISSUE_METADATA_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update metadata for {agent_identifier}"
+        )
+        print(f"  ✓ Successfully uploaded {len(grouped)} files in 1 commit")
 
         return True
 
     except Exception as e:
         print(f"✗ Error saving issue metadata: {str(e)}")
        return False
+    finally:
+        # Always clean up temporary directory
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
 
 
 def load_agents_from_hf():
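Both files keep the dedupe step the diff context shows (existing_by_url.update(new_by_url) followed by list(existing_by_url.values())): records are keyed by html_url so that a re-run overwrites earlier copies of the same issue instead of appending duplicates. A small illustration of that merge follows; the sample URLs and the state field are made up, and building the two lookup dicts with comprehensions is an assumption about the helper lines not visible in the hunk.

# Sketch of the dedupe-by-html_url merge performed before writing each daily file.
existing_metadata = [
    {"html_url": "https://github.com/org/repo/issues/1", "state": "open"},
    {"html_url": "https://github.com/org/repo/issues/2", "state": "open"},
]
new_metadata = [
    {"html_url": "https://github.com/org/repo/issues/2", "state": "closed"},  # updated copy of an existing issue
    {"html_url": "https://github.com/org/repo/issues/3", "state": "open"},    # brand-new issue
]

existing_by_url = {m["html_url"]: m for m in existing_metadata}
new_by_url = {m["html_url"]: m for m in new_metadata}

# Newer records win on collision; insertion order is preserved (Python 3.7+ dicts)
existing_by_url.update(new_by_url)
merged_metadata = list(existing_by_url.values())

assert len(merged_metadata) == 3
assert merged_metadata[1]["state"] == "closed"  # issue 2 was overwritten, not duplicated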