Spaces:
Sleeping
Sleeping
# Short category codes; a code's position in this list is its integer class id.
labels = ["CV", "AI", "ML", "NE", "CL"]

# Lookup tables between integer class ids and category codes, both derived
# from the order of `labels`.
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Human-readable name for each category code.
categorie2human = {
    "CV": "Computer Vision",
    "AI": "Artificial Intelligence",
    "ML": "Machine Learning",
    "NE": "Neural and Evolutionary Computing",
    "CL": "Computation and Language",
}
def load_arxiv_dataset():
    """Download the arXiv abstracts CSV from Kaggle and load it as a dataset.

    Fetches the ``spsayakpaul/arxiv-paper-abstracts`` Kaggle dataset with
    ``kagglehub``, loads its ``arxiv_data.csv`` file as the ``train`` split
    via ``datasets.load_dataset``, and replaces each row's ``terms`` value
    with the Python literal parsed from it.

    Returns:
        The loaded dataset with the ``terms`` column parsed.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a
            ``terms`` cell is not a valid Python literal.
    """
    # Imports are kept function-local, as in the original, so the heavy
    # third-party packages are only required when the dataset is loaded.
    import ast
    import os

    import kagglehub
    from datasets import load_dataset

    # Download latest version
    path = kagglehub.dataset_download("spsayakpaul/arxiv-paper-abstracts")
    dataset = load_dataset(
        "csv",
        data_files=os.path.join(path, "arxiv_data.csv"),
        encoding="utf-8",
        split="train",
    )

    def parse_terms(example):
        # The CSV stores `terms` as text (presumably a stringified list of
        # category tags -- confirm against the data); literal_eval parses it
        # without executing arbitrary code.
        example["terms"] = ast.literal_eval(example["terms"])
        return example

    # Convert the string-encoded `terms` column into Python objects.
    dataset = dataset.map(parse_terms)
    return dataset