batch upload
app.py
CHANGED
@@ -793,12 +793,16 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's reviews.
     In debug mode, saves to in-memory cache only.

-    This function APPENDS new metadata and DEDUPLICATES by review_id.
+    This function APPENDS new metadata and DEDUPLICATES by review_id.
+    Uses batch upload to avoid rate limit (uploads entire folder in single commit).

     Args:
         metadata_list: List of review metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
     # Skip saving to HF in debug mode - use in-memory cache instead
     if DEBUG_MODE:
         global DEBUG_REVIEW_METADATA_CACHE

@@ -820,57 +824,67 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)

-        # Process each daily file
-        for (review_year, month, day), day_metadata in grouped.items():
-            filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
-            local_filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} reviews to {filename}...")
-
-            # Download existing file if it exists
-            existing_metadata = []
-            try:
-                file_path = hf_hub_download(
-                    repo_id=REVIEW_METADATA_REPO,
-                    filename=filename,
-                    repo_type="dataset",
-                    token=token
-                )
-                existing_metadata = load_jsonl(file_path)
-                print(f"   Found {len(existing_metadata)} existing reviews in {filename}")
-            except Exception:
-                print(f"   No existing file found for {filename}, creating new")
-
-            # Merge and deduplicate by review_id
-            existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
-            new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
-
-            # Update with new data (new data overwrites old)
-            existing_by_id.update(new_by_id)
-            merged_metadata = list(existing_by_id.values())
-
-            # Save merged metadata locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=REVIEW_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f"   ✅ Saved {len(merged_metadata)} total reviews to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
-
-        return True
-
+        # Create a temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
+        try:
+            print(f"📦 Preparing batch upload for {len(grouped)} daily files...")
+
+            # Process each daily file
+            for (review_year, month, day), day_metadata in grouped.items():
+                filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
+                local_filename = os.path.join(agent_folder, f"{review_year}.{month:02d}.{day:02d}.jsonl")
+
+                # Download existing file if it exists
+                existing_metadata = []
+                try:
+                    file_path = hf_hub_download(
+                        repo_id=REVIEW_METADATA_REPO,
+                        filename=filename,
+                        repo_type="dataset",
+                        token=token
+                    )
+                    existing_metadata = load_jsonl(file_path)
+                    print(f"   Found {len(existing_metadata)} existing reviews in {filename}")
+                except Exception:
+                    print(f"   Creating new file: {filename}")
+
+                # Merge and deduplicate by review_id
+                existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
+                new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
+
+                # Update with new data (new data overwrites old)
+                existing_by_id.update(new_by_id)
+                merged_metadata = list(existing_by_id.values())
+
+                # Save to temp directory
+                save_jsonl(local_filename, merged_metadata)
+                print(f"   Prepared {len(merged_metadata)} reviews for {filename}")
+
+            # Upload entire folder in a single commit
+            print(f"📤 Uploading {len(grouped)} files in single batch commit...")
+            api.upload_folder(
+                folder_path=temp_dir,
+                repo_id=REVIEW_METADATA_REPO,
+                repo_type="dataset",
+                token=token,
+                commit_message=f"Batch update: {agent_identifier} ({len(grouped)} daily files)"
+            )
+            print(f"   ✅ Batch upload complete for {agent_identifier}")
+
+            return True
+
+        finally:
+            # Always clean up temp directory
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+
     except Exception as e:
         print(f"❌ Error saving review metadata: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return False
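The substance of this change is replacing one Hub commit per daily file with a single upload_folder commit. Below is a minimal standalone sketch of that pattern; the repo id and the files_by_name helper are hypothetical stand-ins for the Space's real repo and grouped metadata, and only the public huggingface_hub API (HfApi.upload_folder, which exists with these parameters) is used:

    import json
    import os
    import shutil
    import tempfile

    from huggingface_hub import HfApi

    REVIEW_METADATA_REPO = "your-org/review-metadata"  # hypothetical repo id

    def batch_upload_daily_files(files_by_name, agent_identifier, token=None):
        """files_by_name maps 'YYYY.MM.DD.jsonl' -> list of metadata dicts."""
        api = HfApi()
        temp_dir = tempfile.mkdtemp()
        try:
            # Stage every daily file under the agent's folder locally
            agent_folder = os.path.join(temp_dir, agent_identifier)
            os.makedirs(agent_folder, exist_ok=True)
            for name, records in files_by_name.items():
                with open(os.path.join(agent_folder, name), "w") as f:
                    for record in records:
                        f.write(json.dumps(record) + "\n")
            # One commit for the whole folder instead of one commit per file
            api.upload_folder(
                folder_path=temp_dir,
                repo_id=REVIEW_METADATA_REPO,
                repo_type="dataset",
                token=token,
                commit_message=f"Batch update: {agent_identifier}",
            )
        finally:
            shutil.rmtree(temp_dir)

Because upload_folder pushes the contents of folder_path, the staged agent_identifier/YYYY.MM.DD.jsonl layout maps onto the same repo paths the per-file version used, while the number of commits per call drops from N files to 1.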
msr.py
CHANGED
@@ -639,11 +639,15 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's reviews.

     This function APPENDS new metadata and DEDUPLICATES by review_id.
+    Uses batch upload to avoid rate limit (uploads entire folder in single commit).

     Args:
         metadata_list: List of review metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
     try:
         token = get_hf_token()
         if not token:

@@ -654,56 +658,67 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)

-        # Process each daily file
-        for (review_year, month, day), day_metadata in grouped.items():
-            filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
-            local_filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} reviews to {filename}...")
-            # Download existing file if it exists
-            existing_metadata = []
-            try:
-                file_path = hf_hub_download(
-                    repo_id=REVIEW_METADATA_REPO,
-                    filename=filename,
-                    repo_type="dataset",
-                    token=token
-                )
-                existing_metadata = load_jsonl(file_path)
-                print(f"   Found {len(existing_metadata)} existing reviews in {filename}")
-            except Exception:
-                print(f"   No existing file found for {filename}, creating new")
-
-            # Merge and deduplicate by review_id
-            existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
-            new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
-
-            # Update with new data (new data overwrites old)
-            existing_by_id.update(new_by_id)
-            merged_metadata = list(existing_by_id.values())
-
-            # Save merged metadata locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=REVIEW_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f"   ✅ Saved {len(merged_metadata)} total reviews to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
-
-        return True
-
+        # Create a temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
+        try:
+            print(f"📦 Preparing batch upload for {len(grouped)} daily files...")
+
+            # Process each daily file
+            for (review_year, month, day), day_metadata in grouped.items():
+                filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
+                local_filename = os.path.join(agent_folder, f"{review_year}.{month:02d}.{day:02d}.jsonl")
+
+                # Download existing file if it exists
+                existing_metadata = []
+                try:
+                    file_path = hf_hub_download(
+                        repo_id=REVIEW_METADATA_REPO,
+                        filename=filename,
+                        repo_type="dataset",
+                        token=token
+                    )
+                    existing_metadata = load_jsonl(file_path)
+                    print(f"   Found {len(existing_metadata)} existing reviews in {filename}")
+                except Exception:
+                    print(f"   Creating new file: {filename}")
+
+                # Merge and deduplicate by review_id
+                existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
+                new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
+
+                # Update with new data (new data overwrites old)
+                existing_by_id.update(new_by_id)
+                merged_metadata = list(existing_by_id.values())
+
+                # Save to temp directory
+                save_jsonl(local_filename, merged_metadata)
+                print(f"   Prepared {len(merged_metadata)} reviews for {filename}")
+
+            # Upload entire folder in a single commit
+            print(f"📤 Uploading {len(grouped)} files in single batch commit...")
+            api.upload_folder(
+                folder_path=temp_dir,
+                repo_id=REVIEW_METADATA_REPO,
+                repo_type="dataset",
+                token=token,
+                commit_message=f"Batch update: {agent_identifier} ({len(grouped)} daily files)"
+            )
+            print(f"   ✅ Batch upload complete for {agent_identifier}")
+
+            return True
+
+        finally:
+            # Always clean up temp directory
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
+
     except Exception as e:
         print(f"❌ Error saving review metadata: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return False
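Both files rely on the same append-and-deduplicate merge before each daily file is written. A minimal sketch of that merge with illustrative records (every field except review_id is made up):

    def merge_by_review_id(existing_metadata, new_metadata):
        """New metadata overwrites existing rows with the same review_id;
        rows missing a review_id are dropped."""
        by_id = {m["review_id"]: m for m in existing_metadata if m.get("review_id")}
        by_id.update({m["review_id"]: m for m in new_metadata if m.get("review_id")})
        return list(by_id.values())

    # The second record replaces the first; the third is appended.
    old = [{"review_id": "r1", "state": "open"}]
    new = [{"review_id": "r1", "state": "merged"}, {"review_id": "r2", "state": "open"}]
    assert merge_by_review_id(old, new) == [
        {"review_id": "r1", "state": "merged"},
        {"review_id": "r2", "state": "open"},
    ]

Because a dict preserves insertion order and update() overwrites values in place, re-running the save for the same day is idempotent: duplicates collapse onto the newest copy instead of accumulating across retries.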