donb-hf committed on
Commit
79cf287
1 Parent(s): 660b29c

Update get_dataset_records to support pagination; add get_dataset_size and logging

Browse files
Files changed (2) hide show
  1. app.py +34 -13
  2. dataset_management_service.py +23 -4
app.py CHANGED
@@ -3,36 +3,56 @@ from typing import List, Dict, Any
3
  from config import DATASET_NAME
4
  from arxiv_retrieval_service import ArxivRetrievalService
5
  from dataset_management_service import DatasetManagementService
 
 
 
6
 
7
- # Initialize services
8
  arxiv_service = ArxivRetrievalService()
9
  dataset_service = DatasetManagementService(DATASET_NAME)
10
 
11
  def handle_metadata_extraction(query: str, max_results: int) -> str:
12
  try:
13
- # Fetch metadata from ArXiv
14
  metadata_list = arxiv_service.fetch_metadata(query, max_results)
15
  if not metadata_list:
16
  return "No metadata found for the given query."
17
 
18
- # Update the dataset with new metadata
19
  result = dataset_service.update_dataset(metadata_list)
 
20
  return result
21
  except Exception as e:
22
- return f"An error occurred: {str(e)}"
 
 
23
 
24
- def handle_dataset_view() -> List[Dict[str, Any]]:
 
25
  try:
26
- return dataset_service.get_dataset_records()
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  except Exception as e:
28
- return [{"error": f"Error loading dataset: {str(e)}"}]
 
 
29
 
30
- # Define Gradio interface
31
  with gr.Blocks() as demo:
32
  gr.Markdown(
33
- f"""Extract metadata from ArXiv papers and update the dataset.
34
- \n\nCurrently leverages the following dataset:
35
- \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer)
 
36
  """
37
  )
38
 
@@ -49,12 +69,13 @@ with gr.Blocks() as demo:
49
  )
50
 
51
  with gr.Tab("View Dataset"):
52
- refresh_button = gr.Button("Refresh Dataset Info")
 
53
  dataset_info = gr.JSON(label="Dataset Info")
54
 
55
  refresh_button.click(
56
  fn=handle_dataset_view,
57
- inputs=[],
58
  outputs=dataset_info
59
  )
60
 
 
3
  from config import DATASET_NAME
4
  from arxiv_retrieval_service import ArxivRetrievalService
5
  from dataset_management_service import DatasetManagementService
6
+ import logging
7
+
8
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
9
 
 
10
  arxiv_service = ArxivRetrievalService()
11
  dataset_service = DatasetManagementService(DATASET_NAME)
12
 
13
  def handle_metadata_extraction(query: str, max_results: int) -> str:
14
  try:
15
+ logging.info(f"Fetching metadata for query: {query}, max_results: {max_results}")
16
  metadata_list = arxiv_service.fetch_metadata(query, max_results)
17
  if not metadata_list:
18
  return "No metadata found for the given query."
19
 
 
20
  result = dataset_service.update_dataset(metadata_list)
21
+ logging.info(f"Dataset update result: {result}")
22
  return result
23
  except Exception as e:
24
+ error_msg = f"An error occurred during metadata extraction: {str(e)}"
25
+ logging.error(error_msg)
26
+ return error_msg
27
 
28
+ def handle_dataset_view(page: int = 1, page_size: int = 10) -> Dict[str, Any]:
29
+ logging.info(f"handle_dataset_view called with page={page}, page_size={page_size}")
30
  try:
31
+ total_records = dataset_service.get_dataset_size()
32
+ logging.info(f"Total records: {total_records}")
33
+
34
+ records = dataset_service.get_dataset_records(page, page_size)
35
+ logging.info(f"Records type: {type(records)}")
36
+ logging.info(f"Number of records returned: {len(records)}")
37
+
38
+ result = {
39
+ "total_records": total_records,
40
+ "current_page": page,
41
+ "records": records
42
+ }
43
+ logging.info(f"Returning result: {result}")
44
+ return result
45
  except Exception as e:
46
+ error_msg = f"Error loading dataset: {str(e)}"
47
+ logging.error(error_msg)
48
+ return {"error": error_msg}
49
 
 
50
  with gr.Blocks() as demo:
51
  gr.Markdown(
52
+ f"""# ArXiv Metadata Extraction and Dataset Management
53
+
54
+ This application extracts metadata from ArXiv papers and manages the dataset:
55
+ [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer)
56
  """
57
  )
58
 
 
69
  )
70
 
71
  with gr.Tab("View Dataset"):
72
+ page_number = gr.Number(value=1, label="Page Number", precision=0)
73
+ refresh_button = gr.Button("Refresh Dataset View")
74
  dataset_info = gr.JSON(label="Dataset Info")
75
 
76
  refresh_button.click(
77
  fn=handle_dataset_view,
78
+ inputs=[page_number],
79
  outputs=dataset_info
80
  )
81
 
dataset_management_service.py CHANGED
@@ -1,5 +1,6 @@
1
  from typing import List, Dict, Any
2
  from datasets import load_dataset, Dataset
 
3
 
4
  class DatasetManagementService:
5
  def __init__(self, dataset_name: str):
@@ -47,13 +48,31 @@ class DatasetManagementService:
47
  except Exception as e:
48
  return f"Failed to update dataset: {str(e)}"
49
 
50
- def get_dataset_records(self) -> List[Dict[str, Any]]:
51
  try:
52
  dataset = load_dataset(self.dataset_name, split="train")
53
- if len(dataset) == 0:
54
- return []
55
- return dataset.to_pandas().to_dict(orient="records")
56
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  return [{"error": f"Error loading dataset: {str(e)}"}]
58
 
59
  # Usage:
 
1
  from typing import List, Dict, Any
2
  from datasets import load_dataset, Dataset
3
+ import logging
4
 
5
  class DatasetManagementService:
6
  def __init__(self, dataset_name: str):
 
48
  except Exception as e:
49
  return f"Failed to update dataset: {str(e)}"
50
 
51
+ def get_dataset_size(self) -> int:
52
  try:
53
  dataset = load_dataset(self.dataset_name, split="train")
54
+ size = len(dataset)
55
+ logging.info(f"Dataset size: {size}")
56
+ return size
57
  except Exception as e:
58
+ logging.error(f"Error getting dataset size: {str(e)}")
59
+ return 0
60
+
61
+ def get_dataset_records(self, page: int, page_size: int) -> List[Dict[str, Any]]:
62
+ try:
63
+ dataset = load_dataset(self.dataset_name, split="train")
64
+ start_idx = (page - 1) * page_size
65
+ end_idx = start_idx + page_size
66
+ records = dataset[start_idx:end_idx]
67
+
68
+ # Convert to list of dictionaries
69
+ records_list = [dict(zip(records.keys(), values)) for values in zip(*records.values())]
70
+
71
+ logging.info(f"Records type: {type(records_list)}")
72
+ logging.info(f"Number of records: {len(records_list)}")
73
+ return records_list
74
+ except Exception as e:
75
+ logging.error(f"Error loading dataset records: {str(e)}")
76
  return [{"error": f"Error loading dataset: {str(e)}"}]
77
 
78
  # Usage: