zhiminy committed on
Commit 8f68ab4 · 1 Parent(s): 3bf98ae

batch upload

Files changed (2)
  1. app.py +58 -44
  2. msr.py +57 -42
app.py CHANGED
@@ -793,12 +793,16 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's reviews.
     In debug mode, saves to in-memory cache only.
 
-    This function APPENDS new metadata and DEDUPLICATES by sha.
+    This function APPENDS new metadata and DEDUPLICATES by review_id.
+    Uses batch upload to avoid rate limit (uploads entire folder in single commit).
 
     Args:
         metadata_list: List of review metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
     # Skip saving to HF in debug mode - use in-memory cache instead
     if DEBUG_MODE:
         global DEBUG_REVIEW_METADATA_CACHE
@@ -820,57 +824,67 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
-        for (review_year, month, day), day_metadata in grouped.items():
-            # New structure: [agent_identifier]/YYYY.MM.DD.jsonl
-            filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
-            local_filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} reviews to {filename}...")
-
-            # Download existing file if it exists
-            existing_metadata = []
-            try:
-                file_path = hf_hub_download(
-                    repo_id=REVIEW_METADATA_REPO,
-                    filename=filename,
-                    repo_type="dataset",
-                    token=token
-                )
-                existing_metadata = load_jsonl(file_path)
-                print(f" Found {len(existing_metadata)} existing reviews in {filename}")
-            except Exception:
-                print(f" No existing file found for {filename}, creating new")
-
-            # Merge and deduplicate by review_id
-            existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
-            new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
-
-            # Update with new data (new data overwrites old)
-            existing_by_id.update(new_by_id)
-            merged_metadata = list(existing_by_id.values())
-
-            # Save locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=REVIEW_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f" ✓ Saved {len(merged_metadata)} total reviews to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
-
-            return True
+        # Create a temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
+        try:
+            print(f"📦 Preparing batch upload for {len(grouped)} daily files...")
+
+            # Process each daily file
+            for (review_year, month, day), day_metadata in grouped.items():
+                filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
+                local_filename = os.path.join(agent_folder, f"{review_year}.{month:02d}.{day:02d}.jsonl")
+
+                # Download existing file if it exists
+                existing_metadata = []
+                try:
+                    file_path = hf_hub_download(
+                        repo_id=REVIEW_METADATA_REPO,
+                        filename=filename,
+                        repo_type="dataset",
+                        token=token
+                    )
+                    existing_metadata = load_jsonl(file_path)
+                    print(f" Found {len(existing_metadata)} existing reviews in {filename}")
+                except Exception:
+                    print(f" Creating new file: {filename}")
+
+                # Merge and deduplicate by review_id
+                existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
+                new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
+
+                # Update with new data (new data overwrites old)
+                existing_by_id.update(new_by_id)
+                merged_metadata = list(existing_by_id.values())
+
+                # Save to temp directory
+                save_jsonl(local_filename, merged_metadata)
+                print(f" Prepared {len(merged_metadata)} reviews for {filename}")
+
+            # Upload entire folder in a single commit
+            print(f"📤 Uploading {len(grouped)} files in single batch commit...")
+            api.upload_folder(
+                folder_path=temp_dir,
+                repo_id=REVIEW_METADATA_REPO,
+                repo_type="dataset",
+                token=token,
+                commit_message=f"Batch update: {agent_identifier} ({len(grouped)} daily files)"
+            )
+            print(f" ✓ Batch upload complete for {agent_identifier}")
+
+            return True
+
+        finally:
+            # Always clean up temp directory
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
 
     except Exception as e:
         print(f"✗ Error saving review metadata: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return False
 
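Below is a minimal, self-contained sketch of the single-commit batch-upload pattern this change adopts: stage every per-day JSONL file in a temporary folder, then push the folder with one HfApi.upload_folder call so the Hub records a single commit instead of one commit per file. The repo id, agent name, and records are made-up placeholders, not values from this project.

import json
import os
import shutil
import tempfile

from huggingface_hub import HfApi

# Placeholder values for illustration only.
REPO_ID = "your-org/review-metadata"   # hypothetical dataset repo
AGENT = "example-agent"                # hypothetical agent folder
DAILY_RECORDS = {
    "2025.01.01.jsonl": [{"review_id": "r1"}, {"review_id": "r2"}],
    "2025.01.02.jsonl": [{"review_id": "r3"}],
}

api = HfApi()
temp_dir = tempfile.mkdtemp()
try:
    agent_folder = os.path.join(temp_dir, AGENT)
    os.makedirs(agent_folder, exist_ok=True)

    # Stage each daily file inside the agent's folder.
    for name, records in DAILY_RECORDS.items():
        with open(os.path.join(agent_folder, name), "w", encoding="utf-8") as f:
            for record in records:
                f.write(json.dumps(record) + "\n")

    # One commit covers all staged files.
    api.upload_folder(
        folder_path=temp_dir,
        repo_id=REPO_ID,
        repo_type="dataset",
        commit_message=f"Batch update: {AGENT} ({len(DAILY_RECORDS)} daily files)",
    )
finally:
    shutil.rmtree(temp_dir)

Compared with the removed per-file upload_with_retry loop, the number of Hub commits per save drops from one per daily file to one in total, which is the point of the "avoid rate limit" note in the docstring.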
msr.py CHANGED
@@ -639,11 +639,15 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
     Each file is stored in the agent's folder and named YYYY.MM.DD.jsonl for that day's reviews.
 
     This function APPENDS new metadata and DEDUPLICATES by review_id.
+    Uses batch upload to avoid rate limit (uploads entire folder in single commit).
 
     Args:
         metadata_list: List of review metadata dictionaries
         agent_identifier: GitHub identifier of the agent (used as folder name)
     """
+    import tempfile
+    import shutil
+
     try:
         token = get_hf_token()
         if not token:
@@ -654,56 +658,67 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
         # Group by exact date (year, month, day)
         grouped = group_metadata_by_date(metadata_list)
 
-        for (review_year, month, day), day_metadata in grouped.items():
-            filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
-            local_filename = f"{review_year}.{month:02d}.{day:02d}.jsonl"
-            print(f"📤 Uploading {len(day_metadata)} reviews to {filename}...")
-
-            # Download existing file if it exists
-            existing_metadata = []
-            try:
-                file_path = hf_hub_download(
-                    repo_id=REVIEW_METADATA_REPO,
-                    filename=filename,
-                    repo_type="dataset",
-                    token=token
-                )
-                existing_metadata = load_jsonl(file_path)
-                print(f" Found {len(existing_metadata)} existing reviews in {filename}")
-            except Exception:
-                print(f" No existing file found for {filename}, creating new")
-
-            # Merge and deduplicate by review_id
-            existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
-            new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
-
-            # Update with new data (new data overwrites old)
-            existing_by_id.update(new_by_id)
-            merged_metadata = list(existing_by_id.values())
-
-            # Save locally
-            save_jsonl(local_filename, merged_metadata)
-
-            try:
-                # Upload to HuggingFace with folder path
-                upload_with_retry(
-                    api=api,
-                    path_or_fileobj=local_filename,
-                    path_in_repo=filename,
-                    repo_id=REVIEW_METADATA_REPO,
-                    repo_type="dataset",
-                    token=token
-                )
-                print(f" ✓ Saved {len(merged_metadata)} total reviews to {filename}")
-            finally:
-                # Always clean up local file, even if upload fails
-                if os.path.exists(local_filename):
-                    os.remove(local_filename)
-
-            return True
+        # Create a temporary directory for batch upload
+        temp_dir = tempfile.mkdtemp()
+        agent_folder = os.path.join(temp_dir, agent_identifier)
+        os.makedirs(agent_folder, exist_ok=True)
+
+        try:
+            print(f"📦 Preparing batch upload for {len(grouped)} daily files...")
+
+            # Process each daily file
+            for (review_year, month, day), day_metadata in grouped.items():
+                filename = f"{agent_identifier}/{review_year}.{month:02d}.{day:02d}.jsonl"
+                local_filename = os.path.join(agent_folder, f"{review_year}.{month:02d}.{day:02d}.jsonl")
+
+                # Download existing file if it exists
+                existing_metadata = []
+                try:
+                    file_path = hf_hub_download(
+                        repo_id=REVIEW_METADATA_REPO,
+                        filename=filename,
+                        repo_type="dataset",
+                        token=token
+                    )
+                    existing_metadata = load_jsonl(file_path)
+                    print(f" Found {len(existing_metadata)} existing reviews in {filename}")
+                except Exception:
+                    print(f" Creating new file: {filename}")
+
+                # Merge and deduplicate by review_id
+                existing_by_id = {meta['review_id']: meta for meta in existing_metadata if meta.get('review_id')}
+                new_by_id = {meta['review_id']: meta for meta in day_metadata if meta.get('review_id')}
+
+                # Update with new data (new data overwrites old)
+                existing_by_id.update(new_by_id)
+                merged_metadata = list(existing_by_id.values())
+
+                # Save to temp directory
+                save_jsonl(local_filename, merged_metadata)
+                print(f" Prepared {len(merged_metadata)} reviews for {filename}")
+
+            # Upload entire folder in a single commit
+            print(f"📤 Uploading {len(grouped)} files in single batch commit...")
+            api.upload_folder(
+                folder_path=temp_dir,
+                repo_id=REVIEW_METADATA_REPO,
+                repo_type="dataset",
+                token=token,
+                commit_message=f"Batch update: {agent_identifier} ({len(grouped)} daily files)"
+            )
+            print(f" ✓ Batch upload complete for {agent_identifier}")
+
+            return True
+
+        finally:
+            # Always clean up temp directory
+            if os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir)
 
     except Exception as e:
         print(f"✗ Error saving review metadata: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return False
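For reference, a small sketch of the merge-and-deduplicate step both files share: records are keyed by review_id and the incoming dict is applied last, so re-running a save overwrites earlier entries for the same review instead of appending duplicates. The records here are invented for illustration.

# Hypothetical existing and incoming records for one daily file.
existing_metadata = [
    {"review_id": "r1", "state": "pending"},
    {"review_id": "r2", "state": "pending"},
]
day_metadata = [
    {"review_id": "r2", "state": "approved"},  # updates r2 in place
    {"review_id": "r3", "state": "pending"},   # brand-new review
]

# Key both sides by review_id, dropping records without one.
existing_by_id = {m["review_id"]: m for m in existing_metadata if m.get("review_id")}
new_by_id = {m["review_id"]: m for m in day_metadata if m.get("review_id")}

# New data overwrites old on conflict; the merged file holds no duplicates.
existing_by_id.update(new_by_id)
merged_metadata = list(existing_by_id.values())

assert len(merged_metadata) == 3
assert existing_by_id["r2"]["state"] == "approved"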