Spaces:
Paused
Paused
from arxiv_fetcher import fetch_arxiv_metadata | |
from datasets import load_dataset, Dataset | |
from huggingface_hub import HfApi | |
from config import DATASET_NAME | |
import logging | |
from typing import List, Dict, Any | |
# Configure the root logger once at import time: INFO level and above, with
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class ArxivMetadataService:
    """Fetch arXiv metadata and merge it into the ``DATASET_NAME`` hub dataset.

    The dataset is handled in column-oriented form (``{column: [values]}``).
    Rows are keyed by the short arXiv identifier, i.e. the last ``/``-separated
    segment of a paper's ``entry_id``.
    """

    def __init__(self):
        self.hf_api = HfApi()

    def extract_and_update(self, query: str, max_results: int = 10) -> str:
        """Fetch metadata for *query* and merge it into the dataset.

        Args:
            query: Search query passed to ``fetch_arxiv_metadata``.
            max_results: Maximum number of results passed to the fetcher.

        Returns:
            A human-readable status message.
        """
        metadata_list = fetch_arxiv_metadata(query, max_results)
        if not metadata_list:
            return "No metadata found for the given query."
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
        """Merge *metadata_list* into the hub dataset and push if it changed.

        Args:
            metadata_list: One dict per paper; each must contain ``'entry_id'``.

        Returns:
            A human-readable status message. Failures are logged and reported
            in the returned message rather than raised.
        """
        # Nothing to merge; avoid touching the hub at all.
        if not metadata_list:
            return "No new data to update."

        try:
            current_data = self._load_current_data()
            if not self._merge_papers(current_data, metadata_list):
                return "No new data to update."

            updated_dataset = Dataset.from_dict(current_data)
            updated_dataset.push_to_hub(DATASET_NAME, split="train")
            return f"Successfully updated dataset with {len(metadata_list)} papers"
        except Exception as e:
            # Top-level boundary: keep the traceback in the log, return a message.
            logging.exception("Failed to update dataset: %s", e)
            return f"Failed to update dataset: {str(e)}"

    @staticmethod
    def _load_current_data() -> Dict[str, List[Any]]:
        """Return the existing dataset's columns, or ``{}`` if there is none yet.

        Only a "not found" failure is treated as an empty dataset. Any other
        failure (network, authentication, ...) propagates to the caller so that
        a transient error can never lead to the remote dataset being
        overwritten with just the new rows.
        """
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
        except FileNotFoundError:
            # NOTE(review): relies on `datasets` raising FileNotFoundError
            # subclasses for missing/empty repositories - confirm against the
            # installed `datasets` version.
            logging.info("Dataset %s not found; starting from an empty dataset", DATASET_NAME)
            return {}
        return dataset.to_dict()

    @staticmethod
    def _short_id(entry_id: str) -> str:
        """Return the last ``/``-separated segment of *entry_id*."""
        return entry_id.split('/')[-1]

    @staticmethod
    def _merge_papers(current_data: Dict[str, List[Any]],
                      metadata_list: List[Dict[str, Any]]) -> bool:
        """Merge papers into column-oriented *current_data* in place.

        New papers are appended as rows; papers whose short id is already
        present have their differing fields overwritten. Every column is kept
        the same length by padding missing values with ``None``.

        Returns:
            ``True`` if any stored value was added or changed.
        """
        short_id = ArxivMetadataService._short_id
        row_count = max((len(column) for column in current_data.values()), default=0)

        # Map short id -> row index once, instead of scanning the id column
        # for every paper. setdefault keeps the first occurrence, matching
        # list.index(). Stored ids are normalised too, so rows written with a
        # full URL are still recognised.
        index_by_id: Dict[str, int] = {}
        for position, stored in enumerate(current_data.get('entry_id', [])):
            if isinstance(stored, str):
                index_by_id.setdefault(short_id(stored), position)

        changed = False
        for paper in metadata_list:
            entry_id = short_id(paper['entry_id'])
            # Store the same normalised id that is used for lookups.
            row = {**paper, 'entry_id': entry_id}

            # Columns first seen in this paper are back-filled with None.
            for key in row:
                if key not in current_data:
                    current_data[key] = [None] * row_count

            index = index_by_id.get(entry_id)
            if index is None:
                # New paper: extend every column so lengths stay aligned.
                for key, column in current_data.items():
                    column.append(row.get(key))
                index_by_id[entry_id] = row_count
                row_count += 1
                changed = True
            else:
                # Existing paper: overwrite only the fields that differ.
                for key, value in row.items():
                    column = current_data[key]
                    if column[index] != value:
                        column[index] = value
                        changed = True
        return changed