zhiminy committed
Commit ee5d1f1 · 1 Parent(s): d786442

upload_large_folder

Files changed (2)
  1. app.py +19 -5
  2. msr.py +19 -5
app.py CHANGED
@@ -104,6 +104,20 @@ def upload_file_with_backoff(api, **kwargs):
     """Upload file with exponential backoff on rate limit errors."""
     return api.upload_file(**kwargs)
 
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    base=300,
+    max_value=3600,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
+)
+def upload_folder_with_backoff(api, **kwargs):
+    """Upload folder with exponential backoff on rate limit errors."""
+    return api.upload_folder(**kwargs)
+
 # =============================================================================
 # JSONL FILE OPERATIONS
 # =============================================================================
@@ -772,7 +786,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function uses COMPLETE OVERWRITE strategy (not append/deduplicate).
-    Uses upload_large_folder for optimized batch uploads.
+    Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -815,14 +829,14 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         save_jsonl(local_filename, day_metadata)
         print(f" Prepared {len(day_metadata)} issues for {filename}")
 
-    # Upload entire folder using upload_large_folder (optimized for large files)
-    # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
+    # Upload entire folder using upload_folder (single commit per agent)
     print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    upload_large_folder_with_backoff(
+    upload_folder_with_backoff(
         api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
-        repo_type="dataset"
+        repo_type="dataset",
+        commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
     )
     print(f" ✓ Batch upload complete for {agent_identifier}")
 
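Note that the `giveup=` lambda in the new decorator calls `is_rate_limit_error`, a helper defined earlier in both files and not shown in these hunks. A minimal sketch of what such a predicate might look like, assuming a rate limit surfaces as an HTTP 429 status on the `HfHubHTTPError`'s underlying response:

```python
# Hypothetical sketch of the is_rate_limit_error predicate referenced by
# the giveup= lambda. The real helper lives elsewhere in app.py/msr.py;
# this version assumes a rate limit means an HTTP 429 response (with a
# message check as a fallback).
from huggingface_hub.utils import HfHubHTTPError


def is_rate_limit_error(e: Exception) -> bool:
    """Return True if the exception looks like a Hub rate-limit error."""
    if not isinstance(e, HfHubHTTPError):
        return False
    response = getattr(e, "response", None)  # underlying requests.Response, if any
    if response is not None and response.status_code == 429:
        return True
    # Fallback when no response object is attached to the error.
    return "rate limit" in str(e).lower()
```

With a predicate like this, `giveup=lambda e: not is_rate_limit_error(e)` makes the decorator retry only rate-limit failures and re-raise everything else immediately.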
msr.py CHANGED
@@ -90,6 +90,20 @@ def upload_file_with_backoff(api, **kwargs):
     """Upload file with exponential backoff on rate limit errors."""
     return api.upload_file(**kwargs)
 
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    base=300,
+    max_value=3600,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
+)
+def upload_folder_with_backoff(api, **kwargs):
+    """Upload folder with exponential backoff on rate limit errors."""
+    return api.upload_folder(**kwargs)
+
 # =============================================================================
 # UTILITY FUNCTIONS
 # =============================================================================
@@ -512,7 +526,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function OVERWRITES existing files completely with fresh data from BigQuery.
-    Uses batch upload to avoid rate limit (uploads entire folder in single commit).
+    Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -554,14 +568,14 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         save_jsonl(local_filename, day_metadata)
         print(f" Prepared {len(day_metadata)} issues for {filename}")
 
-    # Upload entire folder using upload_large_folder (optimized for large files)
-    # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
+    # Upload entire folder using upload_folder (single commit per agent)
    print(f" 🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    upload_large_folder_with_backoff(
+    upload_folder_with_backoff(
         api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
-        repo_type="dataset"
+        repo_type="dataset",
+        commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
     )
     print(f" ✓ Batch upload complete for {agent_identifier}")
 
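For context on the decorator parameters: `base=300` and `max_value=3600` set the pre-jitter wait ceilings, and `full_jitter` then sleeps for a uniform random fraction of each ceiling. A quick sanity check of the schedule (a sketch; it assumes `backoff.expo` yields `factor * base**n` for n = 0, 1, 2, ... capped at `max_value`, per the backoff library's documented behavior):

```python
# Sanity-check the retry ceilings implied by the decorator parameters.
# Assumption: backoff.expo is a generator yielding factor * base**n
# (n = 0, 1, 2, ...) capped at max_value.
import backoff

waits = backoff.expo(base=300, max_value=3600)
print([next(waits) for _ in range(7)])
# Expected: [1, 300, 3600, 3600, 3600, 3600, 3600]
# With max_tries=8 there are at most seven sleeps, so the worst case is
# roughly 1 + 300 + 5 * 3600 seconds (~5 hours) before giving up.
```

This is why the `on_backoff` message reports the wait in minutes: after the second retry, every backoff window is capped at one hour.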