Fred808 committed on
Commit
fdfa721
·
verified ·
1 Parent(s): 6d0fc85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -13
app.py CHANGED
@@ -14,7 +14,8 @@ from transformers import (
14
  )
15
  from sentence_transformers import SentenceTransformer
16
  from bertopic import BERTopic
17
- from datasets import load_dataset
 
18
 
19
  # Preprocessing function
20
  def preprocess_text(text):
@@ -139,18 +140,48 @@ topic_extractor = TopicExtractor()
139
  chatbot = Chatbot()
140
 
141
  # Load the yt-commons dataset
142
- from datasets import load_dataset
143
-
144
- # Load specific .parquet files
145
- dataset = load_dataset("PleIAs/YouTube-Commons", data_files=["cctube_0.parquet", "cctube_1.parquet"], streaming=True)
146
-
147
- # Extract specific columns
148
- for example in dataset["train"]:
149
- title = example["title"] # Replace 'title' with the correct column name
150
- description = example["description"] # Replace 'description' with the correct column name
151
- print(f"Title: {title}")
152
- print(f"Description: {description}")
153
- break # Stop after the first example for demonstration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
 
156
  # API Endpoints
 
14
  )
15
  from sentence_transformers import SentenceTransformer
16
  from bertopic import BERTopic
17
+ from datasets import load_dataset, Features, Value
18
+ import pyarrow.parquet as pq
19
 
20
  # Preprocessing function
21
  def preprocess_text(text):
 
140
  chatbot = Chatbot()
141
 
# Load the yt-commons dataset

# Define the schema we care about. Casting to these `Features` also
# normalizes column types across the individual .parquet shards.
features = Features({
    "video_id": Value("string"),
    "video_link": Value("string"),
    "title": Value("string"),
    "text": Value("string"),
    "channel": Value("string"),
    "channel_id": Value("string"),
    "date": Value("string"),
    "license": Value("string"),
    "original_language": Value("string"),
    "source_language": Value("string"),
    "transcription_language": Value("string"),
    "word_count": Value("int64"),
    "character_count": Value("int64"),
})

# Load the dataset with the correct schema.
# NOTE: `load_dataset` has no `ignore_extra_columns` keyword — passing it
# raises a TypeError before anything is read. The parquet builder's
# supported way to drop extra columns is the `columns=` parameter.
try:
    dataset = load_dataset(
        "parquet",
        data_files="PleIAs/YouTube-Commons/cctube_*.parquet",  # Replace with the actual path
        features=features,
        columns=list(features),  # keep only the declared columns
        streaming=True,  # Use streaming mode for large datasets
    )

    # Process the dataset (streaming: examples arrive lazily, one at a time)
    for example in dataset["train"]:
        print(example)  # Process each example
        break  # Stop after the first example for demonstration
except Exception as e:
    print(f"Error loading dataset: {e}")

# Debugging: inspect a single .parquet file.
# Guarded so a missing/remote path (this is a Hub repo path, not necessarily
# a local file) logs an error instead of crashing the app at import time.
file_path = "PleIAs/YouTube-Commons/cctube_0.parquet"
try:
    table = pq.read_table(file_path)
except (FileNotFoundError, OSError) as e:
    print(f"Could not read {file_path}: {e}")
else:
    print("Schema of the .parquet file:")
    print(table.schema)
    print("First few rows:")
    print(table.to_pandas().head())
185
 
186
 
187
  # API Endpoints