upload_large_folder
app.py
CHANGED

@@ -104,6 +104,20 @@ def upload_file_with_backoff(api, **kwargs):
     """Upload file with exponential backoff on rate limit errors."""
     return api.upload_file(**kwargs)
 
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    base=300,
+    max_value=3600,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
+)
+def upload_folder_with_backoff(api, **kwargs):
+    """Upload folder with exponential backoff on rate limit errors."""
+    return api.upload_folder(**kwargs)
+
 # =============================================================================
 # JSONL FILE OPERATIONS
 # =============================================================================
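Both wrappers gate their retries on an is_rate_limit_error helper that this diff does not show; it is defined elsewhere in the file. A minimal sketch of what such a predicate might look like, assuming the conventional HTTP 429 check (an illustration, not the file's actual implementation):

    from huggingface_hub.errors import HfHubHTTPError

    def is_rate_limit_error(e):
        # Hypothetical sketch: treat an HTTP 429 response as a rate limit.
        return (
            isinstance(e, HfHubHTTPError)
            and e.response is not None
            and e.response.status_code == 429
        )

With giveup=lambda e: not is_rate_limit_error(e), any HfHubHTTPError that is not a rate limit is re-raised immediately rather than retried.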
@@ -772,7 +786,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function uses COMPLETE OVERWRITE strategy (not append/deduplicate).
-    Uses
+    Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -815,14 +829,14 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         save_jsonl(local_filename, day_metadata)
         print(f" Prepared {len(day_metadata)} issues for {filename}")
 
-    # Upload entire folder using
-    # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
+    # Upload entire folder using upload_folder (single commit per agent)
     print(f"🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    upload_large_folder_with_backoff(
+    upload_folder_with_backoff(
         api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
-        repo_type="dataset"
+        repo_type="dataset",
+        commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
     )
     print(f" ✓ Batch upload complete for {agent_identifier}")
 
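The removed note captures the motivation for the switch: upload_large_folder splits its work across multiple automatic commits and rejects a custom commit_message, while upload_folder makes exactly one commit and accepts one. For a hypothetical agent identifier, the new commit_message f-string resolves to something like this (the identifier value is illustrative):

    from datetime import datetime, timezone

    agent_identifier = "example-agent"  # illustrative value only
    commit_message = (
        f"Update issue metadata for {agent_identifier} - "
        f"{datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
    )
    print(commit_message)  # e.g. Update issue metadata for example-agent - 2025-06-01 12:00:00 UTC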
msr.py
CHANGED

@@ -90,6 +90,20 @@ def upload_file_with_backoff(api, **kwargs):
     """Upload file with exponential backoff on rate limit errors."""
     return api.upload_file(**kwargs)
 
+@backoff.on_exception(
+    backoff.expo,
+    HfHubHTTPError,
+    giveup=lambda e: not is_rate_limit_error(e),
+    max_tries=8,
+    base=300,
+    max_value=3600,
+    jitter=backoff.full_jitter,
+    on_backoff=lambda details: print(f" ⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/{8}...")
+)
+def upload_folder_with_backoff(api, **kwargs):
+    """Upload folder with exponential backoff on rate limit errors."""
+    return api.upload_folder(**kwargs)
+
 # =============================================================================
 # UTILITY FUNCTIONS
 # =============================================================================
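The retry knobs are easier to judge as a concrete schedule. backoff.expo yields base**n for n = 0, 1, 2, ... clamped at max_value, and full_jitter then draws the actual sleep uniformly from [0, wait). Assuming those documented semantics of the backoff package, the pre-jitter waits for base=300, max_value=3600, max_tries=8 come out as:

    import backoff

    # Pre-jitter schedule for the decorator above: backoff.expo(base=300)
    # yields 300**0, 300**1, 300**2, ... clamped at max_value=3600.
    gen = backoff.expo(base=300, max_value=3600)
    waits = [next(gen) for _ in range(7)]  # at most 7 waits between 8 tries
    print(waits)  # [1, 300, 3600, 3600, 3600, 3600, 3600]

So the first retry fires after at most a second, the second after at most five minutes, and every later one is capped at an hour, bounding total wait time at roughly five hours before the decorator gives up.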
@@ -512,7 +526,7 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's issues.
 
     This function OVERWRITES existing files completely with fresh data from BigQuery.
-    Uses
+    Uses upload_folder for single-commit batch uploads (avoids rate limit issues).
 
     Args:
         metadata_list: List of issue metadata dictionaries
@@ -554,14 +568,14 @@ def save_issue_metadata_to_hf(metadata_list, agent_identifier):
         save_jsonl(local_filename, day_metadata)
         print(f" Prepared {len(day_metadata)} issues for {filename}")
 
-    # Upload entire folder using
-    # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
+    # Upload entire folder using upload_folder (single commit per agent)
    print(f" 🤗 Uploading {len(grouped)} files ({len(metadata_list)} total issues)...")
-    upload_large_folder_with_backoff(
+    upload_folder_with_backoff(
         api,
         folder_path=temp_dir,
         repo_id=ISSUE_METADATA_REPO,
-        repo_type="dataset"
+        repo_type="dataset",
+        commit_message=f"Update issue metadata for {agent_identifier} - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC"
     )
     print(f" ✓ Batch upload complete for {agent_identifier}")
 
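Like upload_file_with_backoff, the new wrapper takes the HfApi client positionally and forwards every keyword straight to api.upload_folder, so it can be exercised outside save_issue_metadata_to_hf as well. A standalone usage sketch, where the folder path and repo id are placeholders rather than values from these scripts:

    from huggingface_hub import HfApi

    api = HfApi()  # token resolved from HF_TOKEN or the cached login
    upload_folder_with_backoff(
        api,
        folder_path="/tmp/issue-metadata",     # placeholder staging dir
        repo_id="example-org/issue-metadata",  # placeholder for ISSUE_METADATA_REPO
        repo_type="dataset",
        commit_message="Update issue metadata - manual run",
    )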