zhiminy committed
Commit de8d643 · 1 Parent(s): 42f17bd

batch upload
Files changed (2):
  1. app.py +34 -19
  2. msr.py +34 -19
app.py CHANGED

@@ -666,6 +666,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     In debug mode, saves to in-memory cache only.
 
     This function APPENDS new metadata and DEDUPLICATES by html_url.
+    Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -682,6 +683,10 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         print(f"🐛 DEBUG MODE: Saved to in-memory cache only ({len(metadata_list)} issues) - NOT saved to HuggingFace")
         return True
 
+    import tempfile
+    import shutil
+
+    temp_dir = None
     try:
         token = get_hf_token()
         if not token:
@@ -689,14 +694,23 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
 
         api = HfApi()
 
+        # Create temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
+        print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
+
         for (issue_year, month, day), day_metadata in grouped.items():
             # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
             filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
             local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} issues to {filename}...")
+            local_path = os.path.join(agent_folder, local_filename)
+
+            print(f"  Preparing {len(day_metadata)} issues for {filename}...")
 
             # Download existing file if it exists
             existing_metadata = []
@@ -720,30 +734,31 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
             existing_by_url.update(new_by_url)
             merged_metadata = list(existing_by_url.values())
 
-            # Save locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=ISSUE_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f"  ✓ Saved {len(merged_metadata)} total issues to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
+            # Save to temporary folder
+            save_jsonl(local_path, merged_metadata)
+            print(f"  ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
+
+        # Upload entire folder in a single commit
+        print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
+        api.upload_folder(
+            folder_path=agent_folder,
+            path_in_repo=agent_identifier,
+            repo_id=ISSUE_METADATA_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update metadata for {agent_identifier}"
+        )
+        print(f"  ✓ Successfully uploaded {len(grouped)} files in 1 commit")
 
         return True
 
     except Exception as e:
         print(f"✗ Error saving issue metadata: {str(e)}")
         return False
+    finally:
+        # Always clean up temporary directory
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
 
 
 def load_issue_metadata():
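For context, the pattern this commit switches to is huggingface_hub's upload_folder, which packs every file under a local directory into a single commit. A minimal standalone sketch of that pattern follows; the repo id, agent name, and write_jsonl helper are hypothetical stand-ins for the app's ISSUE_METADATA_REPO, agent_identifier, and save_jsonl:

import json
import os
import shutil
import tempfile

from huggingface_hub import HfApi

REPO_ID = "your-org/issue-metadata"  # hypothetical; stands in for ISSUE_METADATA_REPO
AGENT = "example-agent"              # hypothetical; stands in for agent_identifier


def write_jsonl(path, records):
    # Hypothetical stand-in for save_jsonl: one JSON object per line.
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")


def batch_upload(files_by_name, token):
    """Write several JSONL payloads into a temp folder, then push them
    to the Hub as one commit instead of one commit per file."""
    temp_dir = tempfile.mkdtemp()
    try:
        agent_folder = os.path.join(temp_dir, AGENT)
        os.makedirs(agent_folder, exist_ok=True)
        for name, records in files_by_name.items():
            write_jsonl(os.path.join(agent_folder, name), records)
        HfApi().upload_folder(
            folder_path=agent_folder,
            path_in_repo=AGENT,  # files land under AGENT/ in the dataset repo
            repo_id=REPO_ID,
            repo_type="dataset",
            token=token,
            commit_message=f"Update metadata for {AGENT}",
        )
    finally:
        shutil.rmtree(temp_dir)  # clean up even if the upload raises

The finally block in the diff mirrors the same idea: the temp directory is removed whether or not upload_folder succeeds.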
msr.py CHANGED

@@ -564,11 +564,16 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function APPENDS new metadata and DEDUPLICATES by html_url.
+    Uses batch folder upload to minimize commits (1 commit per agent instead of 1 per file).
 
     Args:
         metadata_list: List of issue metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
+    temp_dir = None
     try:
         token = get_hf_token()
         if not token:
@@ -576,13 +581,22 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
 
         api = HfApi()
 
+        # Create temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
+        print(f"📤 Preparing batch upload for {agent_identifier} ({len(grouped)} daily files)...")
+
         for (issue_year, month, day), day_metadata in grouped.items():
             filename = f"{agent_identifier}/{issue_year}.{month:02d}.{day:02d}.jsonl"
             local_filename = f"{issue_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} issues to {filename}...")
+            local_path = os.path.join(agent_folder, local_filename)
+
+            print(f"  Preparing {len(day_metadata)} issues for {filename}...")
 
             # Download existing file if it exists
             existing_metadata = []
@@ -606,30 +620,31 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
             existing_by_url.update(new_by_url)
             merged_metadata = list(existing_by_url.values())
 
-            # Save locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=ISSUE_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f"  ✓ Saved {len(merged_metadata)} total issues to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
+            # Save to temporary folder
+            save_jsonl(local_path, merged_metadata)
+            print(f"  ✓ Prepared {len(merged_metadata)} total issues for {local_filename}")
+
+        # Upload entire folder in a single commit
+        print(f"📤 Uploading folder {agent_identifier} to HuggingFace (1 commit)...")
+        api.upload_folder(
+            folder_path=agent_folder,
+            path_in_repo=agent_identifier,
+            repo_id=ISSUE_METADATA_REPO,
+            repo_type="dataset",
+            token=token,
+            commit_message=f"Update metadata for {agent_identifier}"
+        )
+        print(f"  ✓ Successfully uploaded {len(grouped)} files in 1 commit")
 
         return True
 
     except Exception as e:
         print(f"✗ Error saving issue metadata: {str(e)}")
         return False
+    finally:
+        # Always clean up temporary directory
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
 
 
 def load_agents_from_hf():
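The merge step both hunks keep (existing_by_url.update(new_by_url)) is the append-and-deduplicate behavior named in the docstrings. A minimal sketch of that step, with merge_by_html_url as a hypothetical name for logic the real code writes inline:

def merge_by_html_url(existing_metadata, new_metadata):
    # Key both lists by html_url; dict.update lets the newer entry win
    # whenever the two lists contain the same issue.
    existing_by_url = {item["html_url"]: item for item in existing_metadata}
    new_by_url = {item["html_url"]: item for item in new_metadata}
    existing_by_url.update(new_by_url)
    return list(existing_by_url.values())


# The re-fetched issue replaces the stale copy instead of duplicating it:
old = [{"html_url": "https://github.com/o/r/issues/1", "state": "open"}]
new = [{"html_url": "https://github.com/o/r/issues/1", "state": "closed"}]
assert merge_by_html_url(old, new)[0]["state"] == "closed"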