File size: 996 Bytes
5ccdf55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from huggingface_hub import snapshot_download
import os

def download_dataset(
    local_dir="./downloaded_data",
    token=None,
    *,
    repo_id="Just-Curieous/arxiv-cs-paper-metadata-embedding",
):
    """
    Download a dataset snapshot from Hugging Face.

    With the default ``repo_id`` this fetches the
    arxiv-cs-paper-metadata-embedding dataset.

    Args:
        local_dir (str): Local directory to save the downloaded dataset.
            Created (including parents) if it does not exist.
        token (str): Hugging Face token (optional, will use the HF_TOKEN
            env var if not provided). A blank string is treated as
            "no token".
        repo_id (str): Dataset repository to download (keyword-only).
            Defaults to "Just-Curieous/arxiv-cs-paper-metadata-embedding".

    Returns:
        str: Path to the downloaded dataset
    """
    if token is None:
        token = os.getenv("HF_TOKEN")

    # An HF_TOKEN that is exported but empty (common in CI when a secret is
    # unset) is not a usable credential. Hand None to snapshot_download in
    # that case instead of an explicit empty-string token. Only strings are
    # normalised so that any non-string token value is passed through as-is.
    if isinstance(token, str) and not token.strip():
        token = None

    # Create local directory if it doesn't exist
    os.makedirs(local_dir, exist_ok=True)

    # Download the dataset
    downloaded_path = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        local_dir=local_dir,
        token=token,
    )

    print(f"Dataset downloaded to: {downloaded_path}")
    return downloaded_path

if __name__ == "__main__":
    # Run as a script: fetch the dataset using the function's default
    # arguments (default target directory, token taken from the environment).
    download_dataset()