donb-hf committed on
Commit
97e8d87
1 Parent(s): a37fd25

adding components

Files changed (3)
  1. app.py +18 -114
  2. arxiv_retrieval_service.py +35 -0
  3. dataset_management_service.py +46 -0
app.py CHANGED
@@ -1,133 +1,38 @@
  import gradio as gr
- import arxiv
- import traceback
- import logging
  from typing import List, Dict, Any
- from datasets import load_dataset, Dataset
- from huggingface_hub import HfApi
  from config import DATASET_NAME
+ from arxiv_retrieval_service import ArxivRetrievalService
+ from dataset_management_service import DatasetManagementService

- # Logging setup
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- # Arxiv Fetcher logic
- def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
-     logging.info(f"Fetching arXiv metadata for query: {query}")
-     if not query.strip():
-         logging.warning("Empty or whitespace-only query provided")
-         return []
-
-     client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
-     search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
-
-     results = []
+ # Initialize services
+ arxiv_service = ArxivRetrievalService()
+ dataset_service = DatasetManagementService(DATASET_NAME)
+
+ def handle_metadata_extraction(query: str, max_results: int) -> str:
      try:
-         for result in client.results(search):
-             metadata = {
-                 "title": result.title,
-                 "authors": [author.name for author in result.authors],
-                 "published": result.published.isoformat(),
-                 "updated": result.updated.isoformat(),
-                 "pdf_url": result.pdf_url,
-                 "entry_id": result.entry_id,
-                 "summary": result.summary,
-                 "categories": result.categories,
-                 "primary_category": result.primary_category,
-                 "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
-             }
-             results.append(metadata)
-         logging.info(f"Fetched metadata for {len(results)} papers")
-     except Exception as e:
-         logging.error(f"Error fetching metadata: {str(e)}")
-
-     return results
-
- # Arxiv Metadata Service logic
- class ArxivMetadataService:
-     def __init__(self):
-         self.hf_api = HfApi()
-
-     def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
-         metadata_list = fetch_metadata(query, max_results)
+         # Fetch metadata from ArXiv
+         metadata_list = arxiv_service.fetch_metadata(query, max_results)
          if not metadata_list:
              return "No metadata found for the given query."
-         return self.update_dataset(metadata_list)
-
-     def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
-         try:
-             # Load the existing dataset
-             try:
-                 dataset = load_dataset(DATASET_NAME, split="train")
-                 current_data = dataset.to_dict()
-             except Exception:
-                 # If loading fails, start with an empty dictionary
-                 current_data = {}
-
-             # If the dataset is empty, initialize it with the structure from metadata_list
-             if not current_data:
-                 current_data = {key: [] for key in metadata_list[0].keys()}
-
-             updated = False
-             for paper in metadata_list:
-                 entry_id = paper['entry_id'].split('/')[-1]
-                 if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
-                     # Add new paper
-                     for key, value in paper.items():
-                         current_data.setdefault(key, []).append(value)
-                     updated = True
-                 else:
-                     # Update existing paper
-                     index = current_data['entry_id'].index(entry_id)
-                     for key, value in paper.items():
-                         if current_data[key][index] != value:
-                             current_data[key][index] = value
-                             updated = True
-
-             if updated:
-                 updated_dataset = Dataset.from_dict(current_data)
-                 updated_dataset.push_to_hub(DATASET_NAME, split="train")
-                 return f"Successfully updated dataset with {len(metadata_list)} papers"
-             else:
-                 return "No new data to update."
-         except Exception as e:
-             logging.error(f"Failed to update dataset: {str(e)}")
-             return f"Failed to update dataset: {str(e)}"
-
-     def get_dataset_records(self):
-         try:
-             dataset = load_dataset(DATASET_NAME, split="train")
-             records = dataset.to_pandas().to_dict(orient="records")
-             return records
-         except Exception as e:
-             return f"Error loading dataset: {str(e)}"
-
- # Initialize Arxiv Metadata Service
- arxiv_service = ArxivMetadataService()
-
- # Define Gradio functions
- def handle_metadata_extraction(query: str, max_results: int):
-     try:
-         result = arxiv_service.extract_metadata_and_update_dataset(query, max_results)
-         logging.info(f"Extraction result: {result}")
+
+         # Update the dataset with new metadata
+         result = dataset_service.update_dataset(metadata_list)
          return result
      except Exception as e:
-         error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-         logging.error(error_msg)
-         return error_msg
+         return f"An error occurred: {str(e)}"

- def handle_dataset_view():
+ def handle_dataset_view() -> List[Dict[str, Any]]:
      try:
-         records = arxiv_service.get_dataset_records()
-         return records
+         return dataset_service.get_dataset_records()
      except Exception as e:
-         return f"Error loading dataset: {str(e)}"
+         return [{"error": f"Error loading dataset: {str(e)}"}]

  # Define Gradio interface
  with gr.Blocks() as demo:
      gr.Markdown(
          f"""Extract metadata from ArXiv papers and update the dataset.
-         \n\nCurrently leverages the following datasets:
-         \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset.
+         \n\nCurrently leverages the following dataset:
+         \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer)
          """
      )

@@ -154,5 +59,4 @@ with gr.Blocks() as demo:
      )

  if __name__ == "__main__":
-     demo.queue()
-     demo.launch()
+     demo.launch()
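Note: the hunk above jumps from line 38 to line 59 of the new app.py, so the Gradio component wiring between gr.Markdown(...) and demo.launch() is not shown in this commit view. For orientation, here is a minimal sketch of how the two handlers could be wired in gr.Blocks; the stub handlers, tab layout, component names, and labels are illustrative assumptions, not taken from the diff:

import gradio as gr

# Stubs standing in for the handlers defined in app.py; the real wiring
# occupies the elided lines 39-58 of the new file.
def handle_metadata_extraction(query: str, max_results: int) -> str:
    return f"would fetch {max_results} results for {query!r}"

def handle_dataset_view():
    return [{"title": "example"}]

with gr.Blocks() as demo:
    with gr.Tab("Extract Metadata"):
        query_input = gr.Textbox(label="ArXiv query")  # label is an assumption
        max_results = gr.Slider(1, 50, value=10, step=1, label="Max results")
        extract_btn = gr.Button("Extract and update dataset")
        extract_output = gr.Textbox(label="Result")
        # Route clicks through the extraction handler
        extract_btn.click(
            fn=handle_metadata_extraction,
            inputs=[query_input, max_results],
            outputs=extract_output,
        )
    with gr.Tab("View Dataset"):
        view_btn = gr.Button("Load records")
        records_output = gr.JSON(label="Dataset records")
        view_btn.click(fn=handle_dataset_view, inputs=[], outputs=records_output)

demo.launch()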
 
arxiv_retrieval_service.py ADDED
@@ -0,0 +1,35 @@
+ import arxiv
+ from typing import List, Dict, Any
+
+ class ArxivRetrievalService:
+     def __init__(self):
+         self.client = arxiv.Client(delay_seconds=3, num_retries=3)
+
+     def fetch_metadata(self, query: str, max_results: int = 10) -> List[Dict[str, Any]]:
+         search = arxiv.Search(
+             query=query,
+             max_results=max_results,
+             sort_by=arxiv.SortCriterion.SubmittedDate
+         )
+
+         results = []
+         for result in self.client.results(search):
+             metadata = {
+                 "title": result.title,
+                 "authors": [author.name for author in result.authors],
+                 "published": result.published.isoformat(),
+                 "updated": result.updated.isoformat(),
+                 "pdf_url": result.pdf_url,
+                 "entry_id": result.entry_id,
+                 "summary": result.summary,
+                 "categories": result.categories,
+                 "primary_category": result.primary_category,
+                 "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
+             }
+             results.append(metadata)
+
+         return results
+
+ # Usage:
+ # arxiv_service = ArxivRetrievalService()
+ # metadata = arxiv_service.fetch_metadata("quantum computing", max_results=5)
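The trailing usage comment mirrors how app.py calls this service. Expanded into a runnable snippet — the query string and printed fields are illustrative, but the keys match the metadata dict built above:

from arxiv_retrieval_service import ArxivRetrievalService

service = ArxivRetrievalService()
# Most recently submitted papers first, per SortCriterion.SubmittedDate
papers = service.fetch_metadata("quantum computing", max_results=5)

for paper in papers:
    # Each record is a plain dict, so it maps directly onto a dataset row
    print(paper["published"], paper["title"])
    print("   ", paper["html_url"])

Holding a single arxiv.Client(delay_seconds=3, num_retries=3) in __init__ means the delay and retry policy is shared across calls, rather than a fresh client being built per query.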
dataset_management_service.py ADDED
@@ -0,0 +1,46 @@
+ from typing import List, Dict, Any
+ from datasets import load_dataset, Dataset
+
+ class DatasetManagementService:
+     def __init__(self, dataset_name: str):
+         self.dataset_name = dataset_name
+
+     def update_dataset(self, new_metadata: List[Dict[str, Any]]) -> str:
+         try:
+             dataset = load_dataset(self.dataset_name, split="train")
+             current_data = dataset.to_dict()
+
+             if not current_data:
+                 current_data = {key: [] for key in new_metadata[0].keys()}
+
+             updated = False
+             for paper in new_metadata:
+                 entry_id = paper['entry_id'].split('/')[-1]
+                 if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
+                     for key, value in paper.items():
+                         current_data.setdefault(key, []).append(value)
+                     updated = True
+                 else:
+                     index = current_data['entry_id'].index(entry_id)
+                     for key, value in paper.items():
+                         if current_data[key][index] != value:
+                             current_data[key][index] = value
+                             updated = True
+
+             if updated:
+                 updated_dataset = Dataset.from_dict(current_data)
+                 updated_dataset.push_to_hub(self.dataset_name, split="train")
+                 return f"Successfully updated dataset with {len(new_metadata)} papers"
+             else:
+                 return "No new data to update."
+         except Exception as e:
+             return f"Failed to update dataset: {str(e)}"
+
+     def get_dataset_records(self) -> List[Dict[str, Any]]:
+         dataset = load_dataset(self.dataset_name, split="train")
+         return dataset.to_pandas().to_dict(orient="records")
+
+ # Usage:
+ # dataset_service = DatasetManagementService("your_dataset_name")
+ # result = dataset_service.update_dataset(new_metadata)
+ # records = dataset_service.get_dataset_records()
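Taken together with the retrieval service, the end-to-end flow matches handle_metadata_extraction in the new app.py. A sketch, using the same placeholder dataset name as the usage comment above (app.py passes config.DATASET_NAME instead):

from arxiv_retrieval_service import ArxivRetrievalService
from dataset_management_service import DatasetManagementService

arxiv_service = ArxivRetrievalService()
dataset_service = DatasetManagementService("your_dataset_name")  # placeholder name

metadata = arxiv_service.fetch_metadata("retrieval augmented generation", max_results=10)
if metadata:
    # Dedupes on the short arXiv entry id and pushes to the Hub only when
    # a row was added or changed
    print(dataset_service.update_dataset(metadata))
else:
    print("No metadata found for the given query.")

One design note: update_dataset materializes the whole train split as a dict and rebuilds the Dataset on every call, which is simple and works for a small metadata collection, but each update is linear in the size of the dataset.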