donb-hf committed on
Commit 19ab6fa
1 Parent(s): 778b735

update dataset info

Files changed (4)
  1. app.py +45 -11
  2. arxiv_metadata_service.py +36 -14
  3. initialize_dataset.py +30 -19
  4. requirements.txt +2 -0
app.py CHANGED
@@ -1,26 +1,60 @@
 import gradio as gr
 from arxiv_metadata_service import ArxivMetadataService
 import traceback
+import logging
+from config import DATASET_NAME
+from datasets import load_dataset
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 arxiv_service = ArxivMetadataService()
 
 def extract_metadata(query: str, max_results: int):
     try:
-        return arxiv_service.extract_and_update(query, max_results)
+        result = arxiv_service.extract_and_update(query, max_results)
+        logging.info(f"Extraction result: {result}")
+        return result
     except Exception as e:
         error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        logging.error(error_msg)
         return error_msg
 
-demo = gr.Interface(
-    fn=extract_metadata,
-    inputs=[
-        gr.Textbox(label="ArXiv Query"),
-        gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
-    ],
-    outputs="text",
-    title="ArXiv Metadata Extractor",
-    description="Extract metadata from ArXiv papers and update the dataset."
-)
+def load_dataset_info():
+    try:
+        dataset = load_dataset(DATASET_NAME, split="train")
+        return f"Dataset contains {len(dataset)} records."
+    except Exception as e:
+        return f"Error loading dataset: {str(e)}"
+
+with gr.Blocks() as demo:
+    gr.Markdown(
+        f"""Extract metadata from ArXiv papers and update the dataset.
+        \n\nCurrently leverages the following datasets:
+        \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset.
+        """
+    )
+
+    with gr.Tab("Extract Metadata"):
+        query_input = gr.Textbox(label="ArXiv Query")
+        max_results = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
+        submit_button = gr.Button("Extract Metadata")
+        output = gr.Textbox(label="Result")
+
+        submit_button.click(
+            fn=extract_metadata,
+            inputs=[query_input, max_results],
+            outputs=output
+        )
+
+    with gr.Tab("View Dataset"):
+        refresh_button = gr.Button("Refresh Dataset Info")
+        dataset_info = gr.Textbox(label="Dataset Info")
+
+        refresh_button.click(
+            fn=load_dataset_info,
+            inputs=[],
+            outputs=dataset_info
+        )
 
 if __name__ == "__main__":
     demo.launch()
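Note: the updated app.py (and the service below) imports DATASET_NAME from a config module that is not part of this commit. A minimal sketch of what it presumably contains; the repo id shown is a placeholder, not the actual dataset name:

# config.py -- hypothetical sketch; this file is not included in the diff.
# DATASET_NAME is assumed to be a Hugging Face Hub dataset repo id.
DATASET_NAME = "your-username/arxiv-metadata"  # placeholder value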
arxiv_metadata_service.py CHANGED
@@ -1,5 +1,6 @@
 from arxiv_fetcher import fetch_arxiv_metadata
 from datasets import load_dataset, Dataset
+from huggingface_hub import HfApi
 from config import DATASET_NAME
 import logging
 from typing import List, Dict, Any
@@ -7,30 +8,51 @@ from typing import List, Dict, Any
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 class ArxivMetadataService:
+    def __init__(self):
+        self.hf_api = HfApi()
+
     def extract_and_update(self, query: str, max_results: int = 10) -> str:
         metadata_list = fetch_arxiv_metadata(query, max_results)
+        if not metadata_list:
+            return "No metadata found for the given query."
         return self.update_dataset(metadata_list)
 
     def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
         try:
-            dataset = load_dataset(DATASET_NAME, split="train")
-            current_data = dataset.to_dict()
-
+            # Load the existing dataset
+            try:
+                dataset = load_dataset(DATASET_NAME, split="train")
+                current_data = dataset.to_dict()
+            except Exception:
+                # If loading fails, start with an empty dictionary
+                current_data = {}
+
+            # If the dataset is empty, initialize it with the structure from metadata_list
+            if not current_data:
+                current_data = {key: [] for key in metadata_list[0].keys()}
+
+            updated = False
             for paper in metadata_list:
-                if paper['id'] not in current_data.get('id', []):
+                entry_id = paper['entry_id'].split('/')[-1]
+                if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
+                    # Add new paper
                     for key, value in paper.items():
-                        if key not in current_data:
-                            current_data[key] = []
-                        current_data[key].append(value)
+                        current_data.setdefault(key, []).append(value)
+                    updated = True
                 else:
-                    index = current_data['id'].index(paper['id'])
+                    # Update existing paper
+                    index = current_data['entry_id'].index(entry_id)
                     for key, value in paper.items():
-                        current_data[key][index] = value
-
-            updated_dataset = Dataset.from_dict(current_data)
-            updated_dataset.push_to_hub(DATASET_NAME, split="train")
-
-            return f"Successfully updated dataset with {len(metadata_list)} papers"
+                        if current_data[key][index] != value:
+                            current_data[key][index] = value
+                            updated = True
+
+            if updated:
+                updated_dataset = Dataset.from_dict(current_data)
+                updated_dataset.push_to_hub(DATASET_NAME, split="train")
+                return f"Successfully updated dataset with {len(metadata_list)} papers"
+            else:
+                return "No new data to update."
        except Exception as e:
             logging.error(f"Failed to update dataset: {str(e)}")
             return f"Failed to update dataset: {str(e)}"
initialize_dataset.py CHANGED
@@ -1,24 +1,35 @@
 from datasets import Dataset
+from huggingface_hub import HfApi
 from config import DATASET_NAME
-import huggingface_hub
+import logging
 
-# Initialize an empty dataset with the expected structure
-initial_data = {
-    "id": [],
-    "title": [],
-    "authors": [],
-    "published": [],
-    "updated": [],
-    "pdf_url": [],
-    "entry_id": [],
-    "summary": [],
-    "categories": [],
-    "primary_category": [],
-    "html_url": []
-}
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-# Create the dataset
-dataset = Dataset.from_dict(initial_data)
+def initialize_dataset():
+    # Initialize an empty dataset with the expected structure
+    initial_data = {
+        "entry_id": [],
+        "title": [],
+        "authors": [],
+        "published": [],
+        "updated": [],
+        "pdf_url": [],
+        "summary": [],
+        "categories": [],
+        "primary_category": [],
+        "html_url": []
+    }
 
-# Push the initial dataset to the Hub
-dataset.push_to_hub(DATASET_NAME, split="train")
+    # Create the dataset
+    dataset = Dataset.from_dict(initial_data)
+
+    try:
+        # Push the initial dataset to the Hub
+        dataset.push_to_hub(DATASET_NAME, split="train")
+        logging.info(f"Dataset {DATASET_NAME} initialized successfully with 'train' split.")
+    except Exception as e:
+        logging.error(f"Failed to initialize dataset: {str(e)}")
+        raise
+
+if __name__ == "__main__":
+    initialize_dataset()
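A quick sanity check after running the script, assuming the push succeeded and the machine is authenticated with the Hub:

from datasets import load_dataset
from config import DATASET_NAME

ds = load_dataset(DATASET_NAME, split="train")
print(ds.num_rows)       # 0 for a freshly initialized dataset
print(ds.column_names)   # the ten columns defined in initial_data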
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 arxiv
 datasets
 gradio
+huggingface_hub
+python-dotenv
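Note: push_to_hub (and load_dataset against a private repo) requires a Hugging Face token, and the new python-dotenv dependency suggests it is read from a .env file. A minimal sketch of that pattern; the HF_TOKEN variable name is an assumption:

import os

from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv()                          # reads .env into os.environ
login(token=os.environ["HF_TOKEN"])    # authenticates subsequent Hub calls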