Spaces:

yazoniak
/

twitteremo-pl-classifier

Running on Zero

App Files Community

yazoniak committed on 1 day ago

Commit

b02839d

verified ·

1 Parent(s): 061ca04

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -46

app.py CHANGED Viewed

@@ -28,9 +28,10 @@ import os
 import re
 import spaces
 from datetime import datetime
-from datasets import Dataset, load_dataset
-from huggingface_hub import HfApi
 import pandas as pd
 # Model configuration
@@ -64,6 +65,9 @@ class HFDatasetLogger:
     This provides persistent storage across space restarts by storing data
     directly to a HuggingFace dataset repository.
     """
     def __init__(self, dataset_name: str, hf_token: str, private: bool = True):
@@ -80,14 +84,66 @@ class HFDatasetLogger:
         self.private = private
         self.api = HfApi()
         self.dataset_exists = False
-        # Check if dataset exists
         try:
-            load_dataset(dataset_name, split="train", token=hf_token, streaming=True)
-            self.dataset_exists = True
-        except Exception:
             self.dataset_exists = False
     def log(
         self,
         text: str,
@@ -100,8 +156,8 @@ class HFDatasetLogger:
         """
         Log a prediction to the HuggingFace dataset.
-        Uses pandas DataFrame as intermediate format to ensure proper
-        parquet compatibility when appending to existing datasets.
         Args:
             text: Input text
@@ -124,46 +180,31 @@ class HFDatasetLogger:
             }])
             if self.dataset_exists:
-                # Append to existing dataset
-                try:
-                    # Download existing dataset and convert to pandas
-                    existing_dataset = load_dataset(
-                        self.dataset_name,
-                        split="train",
-                        token=self.hf_token,
-                        download_mode="force_redownload",
-                    )
-                    existing_df = existing_dataset.to_pandas()
                     # Concatenate DataFrames
                     combined_df = pd.concat([existing_df, new_row], ignore_index=True)
-                    # Convert back to Dataset and push
-                    combined_dataset = Dataset.from_pandas(combined_df)
-                    combined_dataset.push_to_hub(
-                        self.dataset_name,
-                        token=self.hf_token,
-                        private=self.private,
-                        commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
-                    )
-                    print(f"✓ Appended prediction (total rows: {len(combined_df)})")
-                except FileNotFoundError:
-                    # Dataset doesn't exist yet despite our check - create it
-                    print("⚠ Dataset not found, creating new dataset")
-                    new_dataset = Dataset.from_pandas(new_row)
-                    new_dataset.push_to_hub(
-                        self.dataset_name,
-                        token=self.hf_token,
-                        private=self.private,
-                    )
-                    self.dataset_exists = True
-                except Exception as e:
-                    # For any other error, DO NOT fall back to push_to_hub
-                    # as that would REPLACE the entire dataset with just the new entry!
-                    print(f"⚠ Error appending to dataset (data not saved): {e}")
-                    import traceback
-                    traceback.print_exc()
             else:
                 # Create new dataset
                 new_dataset = Dataset.from_pandas(new_row)
@@ -173,6 +214,7 @@ class HFDatasetLogger:
                     private=self.private,
                 )
                 self.dataset_exists = True
                 print("✓ Created new dataset with first prediction")
         except Exception as e:

 import re
 import spaces
 from datetime import datetime
+from datasets import Dataset
+from huggingface_hub import HfApi, hf_hub_download, list_repo_files
 import pandas as pd
+import tempfile
 # Model configuration
     This provides persistent storage across space restarts by storing data
     directly to a HuggingFace dataset repository.
+    Uses direct parquet file download via hf_hub_download to bypass
+    any caching issues with load_dataset.
     """
     def __init__(self, dataset_name: str, hf_token: str, private: bool = True):
         self.private = private
         self.api = HfApi()
         self.dataset_exists = False
+        self.parquet_filename = None
+        # Check if dataset exists by listing files in the repo
         try:
+            files = list_repo_files(
+                dataset_name,
+                repo_type="dataset",
+                token=hf_token,
+            )
+            files_list = list(files)  # Convert to list to allow multiple iterations
+            print(f"  Files in repo: {files_list}")
+            # Find the parquet file(s)
+            parquet_files = [f for f in files_list if f.endswith(".parquet")]
+            if parquet_files:
+                # Use the first parquet file (could be at root or in data/ folder)
+                self.parquet_filename = parquet_files[0]
+                self.dataset_exists = True
+                print(f"  ✓ Found existing parquet file: {self.parquet_filename}")
+            else:
+                print(f"  No parquet files found in dataset repo (files: {files_list})")
+        except Exception as e:
+            print(f"  Dataset repo not found or error: {type(e).__name__}: {e}")
             self.dataset_exists = False
+    def _download_existing_data(self) -> pd.DataFrame | None:
+        """
+        Download existing parquet data directly using hf_hub_download.
+        Uses force_download=True to bypass all caching.
+        Returns:
+            DataFrame with existing data, or None if download fails
+        """
+        if not self.parquet_filename:
+            print("  No parquet filename set, cannot download")
+            return None
+        try:
+            print(f"  Downloading parquet file: {self.parquet_filename}")
+            # Create a unique temp directory for each download to avoid caching
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                local_path = hf_hub_download(
+                    repo_id=self.dataset_name,
+                    filename=self.parquet_filename,
+                    repo_type="dataset",
+                    token=self.hf_token,
+                    force_download=True,  # Force fresh download, bypass cache
+                    local_dir=tmp_dir,
+                )
+                print(f"  Downloaded to: {local_path}")
+                df = pd.read_parquet(local_path)
+                print(f"  ✓ Loaded existing data: {len(df)} rows")
+                return df
+        except Exception as e:
+            print(f"  ✗ Error downloading existing data: {type(e).__name__}: {e}")
+            import traceback
+            traceback.print_exc()
+            return None
     def log(
         self,
         text: str,
         """
         Log a prediction to the HuggingFace dataset.
+        Downloads existing parquet directly (bypassing load_dataset cache),
+        appends new row, and pushes combined data back to Hub.
         Args:
             text: Input text
             }])
             if self.dataset_exists:
+                # Download existing data directly from parquet file
+                existing_df = self._download_existing_data()
+                if existing_df is not None and len(existing_df) > 0:
                     # Concatenate DataFrames
                     combined_df = pd.concat([existing_df, new_row], ignore_index=True)
+                    print(f"  Combining {len(existing_df)} existing + 1 new = {len(combined_df)} rows")
+                else:
+                    # No existing data or download failed, use just the new row
+                    combined_df = new_row
+                    print("  No existing data found, starting fresh")
+                # Convert to Dataset and push
+                combined_dataset = Dataset.from_pandas(combined_df)
+                combined_dataset.push_to_hub(
+                    self.dataset_name,
+                    token=self.hf_token,
+                    private=self.private,
+                    commit_message=f"Add prediction at {datetime.utcnow().isoformat()}",
+                )
+                print(f"✓ Pushed dataset with {len(combined_df)} total rows")
+                # Update parquet filename if this was the first push
+                if not self.parquet_filename:
+                    self.parquet_filename = "data/train-00000-of-00001.parquet"
             else:
                 # Create new dataset
                 new_dataset = Dataset.from_pandas(new_row)
                     private=self.private,
                 )
                 self.dataset_exists = True
+                self.parquet_filename = "data/train-00000-of-00001.parquet"
                 print("✓ Created new dataset with first prediction")
         except Exception as e: