Fred808 committed on
Commit
fdfa721
·
verified ·
1 Parent(s): 6d0fc85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -13
app.py CHANGED
@@ -14,7 +14,8 @@ from transformers import (
14
  )
15
  from sentence_transformers import SentenceTransformer
16
  from bertopic import BERTopic
17
- from datasets import load_dataset
 
18
 
19
  # Preprocessing function
20
  def preprocess_text(text):
@@ -139,18 +140,48 @@ topic_extractor = TopicExtractor()
139
  chatbot = Chatbot()
140
 
141
  # Load the yt-commons dataset
142
- from datasets import load_dataset
143
-
144
- # Load specific .parquet files
145
- dataset = load_dataset("PleIAs/YouTube-Commons", data_files=["cctube_0.parquet", "cctube_1.parquet"], streaming=True)
146
-
147
- # Extract specific columns
148
- for example in dataset["train"]:
149
- title = example["title"] # Replace 'title' with the correct column name
150
- description = example["description"] # Replace 'description' with the correct column name
151
- print(f"Title: {title}")
152
- print(f"Description: {description}")
153
- break # Stop after the first example for demonstration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
 
156
  # API Endpoints
 
14
  )
15
  from sentence_transformers import SentenceTransformer
16
  from bertopic import BERTopic
17
+ from datasets import load_dataset, Features, Value
18
+ import pyarrow.parquet as pq
19
 
20
  # Preprocessing function
21
  def preprocess_text(text):
 
140
  chatbot = Chatbot()
141
 
# Load the yt-commons dataset

# Define the schema we care about. Casting to these `Features` also
# normalizes column types across the individual .parquet shards.
features = Features({
    "video_id": Value("string"),
    "video_link": Value("string"),
    "title": Value("string"),
    "text": Value("string"),
    "channel": Value("string"),
    "channel_id": Value("string"),
    "date": Value("string"),
    "license": Value("string"),
    "original_language": Value("string"),
    "source_language": Value("string"),
    "transcription_language": Value("string"),
    "word_count": Value("int64"),
    "character_count": Value("int64"),
})

# Load the dataset with the correct schema.
# NOTE: `load_dataset` has no `ignore_extra_columns` keyword — passing it
# raises a TypeError before anything is read. The parquet builder's
# supported way to drop extra columns is the `columns=` parameter.
try:
    dataset = load_dataset(
        "parquet",
        data_files="PleIAs/YouTube-Commons/cctube_*.parquet",  # Replace with the actual path
        features=features,
        columns=list(features),  # keep only the declared columns
        streaming=True,  # Use streaming mode for large datasets
    )

    # Process the dataset (streaming: examples arrive lazily, one at a time)
    for example in dataset["train"]:
        print(example)  # Process each example
        break  # Stop after the first example for demonstration
except Exception as e:
    print(f"Error loading dataset: {e}")

# Debugging: inspect a single .parquet file.
# Guarded so a missing/remote path (this is a Hub repo path, not necessarily
# a local file) logs an error instead of crashing the app at import time.
file_path = "PleIAs/YouTube-Commons/cctube_0.parquet"
try:
    table = pq.read_table(file_path)
except (FileNotFoundError, OSError) as e:
    print(f"Could not read {file_path}: {e}")
else:
    print("Schema of the .parquet file:")
    print(table.schema)
    print("First few rows:")
    print(table.to_pandas().head())
185
 
186
 
187
  # API Endpoints