Spaces:
Paused
Paused
initialize dataset script
Browse files- initialize_dataset.py +24 -0
initialize_dataset.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import Dataset
|
2 |
+
from config import DATASET_NAME
|
3 |
+
import huggingface_hub
|
4 |
+
|
5 |
+
# Column names the freshly created dataset carries, in schema order.
_COLUMN_NAMES = (
    "id",
    "title",
    "authors",
    "published",
    "updated",
    "pdf_url",
    "entry_id",
    "summary",
    "categories",
    "primary_category",
    "html_url",
)

# Give every column its own empty list so the dataset starts with zero rows.
# NOTE(review): no value types are declared for these columns here — confirm
# that whatever appends rows later agrees on a schema.
initial_data = {column: [] for column in _COLUMN_NAMES}

# Build the (empty) dataset from the column mapping.
dataset = Dataset.from_dict(initial_data)

# Publish it to the Hub under DATASET_NAME as the "train" split.
dataset.push_to_hub(DATASET_NAME, split="train")
|