Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -14,7 +14,8 @@ from transformers import (
 )
 from sentence_transformers import SentenceTransformer
 from bertopic import BERTopic
-from datasets import load_dataset
+from datasets import load_dataset, Features, Value
+import pyarrow.parquet as pq
 
 # Preprocessing function
 def preprocess_text(text):
@@ -139,18 +140,48 @@ topic_extractor = TopicExtractor()
 chatbot = Chatbot()
 
 # Load the yt-commons dataset
-
-
-
-
-
-
-
-
-
-
-
-
+
+# Define the schema
+features = Features({
+    "video_id": Value("string"),
+    "video_link": Value("string"),
+    "title": Value("string"),
+    "text": Value("string"),
+    "channel": Value("string"),
+    "channel_id": Value("string"),
+    "date": Value("string"),
+    "license": Value("string"),
+    "original_language": Value("string"),
+    "source_language": Value("string"),
+    "transcription_language": Value("string"),
+    "word_count": Value("int64"),
+    "character_count": Value("int64"),
+})
+
+# Load the dataset with the correct schema
+try:
+    dataset = load_dataset(
+        "parquet",
+        data_files="PleIAs/YouTube-Commons/cctube_*.parquet",  # Replace with the actual path
+        features=features,
+        ignore_extra_columns=True,  # Ignore extra columns
+        streaming=True,  # Use streaming mode for large datasets
+    )
+
+    # Process the dataset
+    for example in dataset["train"]:
+        print(example)  # Process each example
+        break  # Stop after the first example for demonstration
+except Exception as e:
+    print(f"Error loading dataset: {e}")
+
+# Debugging: Load a single .parquet file
+file_path = "PleIAs/YouTube-Commons/cctube_0.parquet"
+table = pq.read_table(file_path)
+print("Schema of the .parquet file:")
+print(table.schema)
+print("First few rows:")
+print(table.to_pandas().head())
 
 
 # API Endpoints
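A likely cause of the runtime error: `ignore_extra_columns` does not appear to be an argument accepted by `datasets.load_dataset`, and `data_files="PleIAs/YouTube-Commons/cctube_*.parquet"` is a repo-relative pattern that the `parquet` builder will not resolve against the Hub. A minimal sketch of streaming the dataset by its repo id instead, assuming network access in the Space; the column names are taken from the schema in this commit, and unused columns are dropped by hand rather than via an unsupported flag:

# Sketch: stream YouTube-Commons directly from the Hub by repo id.
from datasets import load_dataset

streamed = load_dataset("PleIAs/YouTube-Commons", split="train", streaming=True)

# Keep only the columns the app actually uses; extra columns are simply not copied.
wanted = ["video_id", "video_link", "title", "text", "channel", "date"]
for example in streamed:
    row = {k: example.get(k) for k in wanted}
    print(row)
    break  # stop after the first example, as in the commit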
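For the single-file debugging block, `pq.read_table("PleIAs/YouTube-Commons/cctube_0.parquet")` expects a local path or URL, not a repo-relative name. A sketch that downloads one shard to the local cache first, assuming the shard name `cctube_0.parquet` used in the commit actually exists in the dataset repo:

# Sketch: fetch one parquet shard before inspecting it, since pq.read_table()
# needs a real path. The shard name below is the one from the commit and is
# assumed, not verified, to exist in PleIAs/YouTube-Commons.
import pyarrow.parquet as pq
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="PleIAs/YouTube-Commons",
    filename="cctube_0.parquet",
    repo_type="dataset",
)

table = pq.read_table(local_path)
print("Schema of the .parquet file:")
print(table.schema)
print("First few rows:")
print(table.to_pandas().head())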