Zhiming666 committed
Commit
a06773e
1 Parent(s): 80354f9

Update app.py

Files changed (1): app.py +30 -18
app.py CHANGED
@@ -1,3 +1,4 @@
+import fs.memoryfs
 import os
 import requests
 import xml.etree.ElementTree as ET
@@ -7,6 +8,9 @@ from datetime import datetime
 import pandas as pd
 from fpdf import FPDF
 import gradio as gr
+import io
+import shutil
+from zipfile import ZipFile
 
 def get_arxiv_data(search_query, number):
     url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
@@ -16,10 +20,7 @@ def get_arxiv_data(search_query, number):
     entries = root.findall('{http://www.w3.org/2005/Atom}entry')
     results = []
 
-    # Create folder for current date and time
-    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
-    folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
-    os.makedirs(folder_path, exist_ok=True)
+    mem_fs = fs.memoryfs.MemoryFS()
 
     for entry in entries:
         title = entry.find('{http://www.w3.org/2005/Atom}title').text
@@ -37,14 +38,19 @@ def get_arxiv_data(search_query, number):
         # Download PDF file
         pdf_link = link.replace('abs', 'pdf') + '.pdf'
         filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
-        filepath = os.path.join(folder_path, filename)
+        mem_fs.makedirs("pdfs", recreate=True)
+        filepath = mem_fs.openbin("pdfs/" + filename, "w")
 
         try:
-            urllib.request.urlretrieve(pdf_link, filepath)
+            response = requests.get(pdf_link, stream=True)
+            shutil.copyfileobj(response.raw, filepath)
         except Exception as e:
             continue
+        finally:
+            filepath.close()
 
     # Save search query and results to PDF
+    pdf_file = mem_fs.openbin("1_Search_query_AND_results.pdf", "w")
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font('Arial', 'B', 16)
@@ -53,36 +59,42 @@ def get_arxiv_data(search_query, number):
     for i, result in enumerate(results):
         pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
         pdf.ln(5) # Add newline after each result
-    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))
+    pdf.output(pdf_file)
+    pdf_file.close()
 
     # Save search query, results, and current time to Excel file
     current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
     df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
-    folder_path = os.path.join(os.path.dirname(__file__), 'data')
-    os.makedirs(folder_path, exist_ok=True)
-    excel_filepath = os.path.join(folder_path, 'information.xlsx')
-    if os.path.exists(excel_filepath):
-        existing_df = pd.read_excel(excel_filepath)
-        df = pd.concat([existing_df, df], ignore_index=True)
-    df.to_excel(excel_filepath, index=False)
+    excel_file = mem_fs.openbin("information.xlsx", "w")
+    df.to_excel(excel_file)
+    excel_file.close()
 
-    return results
+    return results, mem_fs
 
 def search_arxiv(search_query, max_results):
     start_time = datetime.now()
-    results = get_arxiv_data(search_query, max_results)
+    results, mem_fs = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
     elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
 
-    return '\n'.join(results), elapsed_time_str
+    # Create a Zip file
+    zip_file = io.BytesIO()
+    with ZipFile(zip_file, 'w') as zip:
+        for path in mem_fs.walk.files():
+            file_data = mem_fs.getbytes(path)
+            zip.writestr(path, file_data)
+    zip_file.seek(0)  # Rewind the file. Essential for reading!
+
+    return '\n'.join(results), elapsed_time_str, zip_file
 
 search_query_input = gr.inputs.Textbox(label="Search Query")
 max_results_input = gr.inputs.Textbox(label="Max Results")
 
 output_text = gr.outputs.Textbox(label="Results")
 output_time = gr.outputs.Textbox(label="Elapsed Time")
+output_file = gr.outputs.File(label="Download")
 
 title = "ArXiv Search"
 description = "Search for articles on ArXiv"
 
-gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()
+gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time, output_file], title=title, description=description).launch()
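
The heart of the commit is swapping the on-disk data/ folder for an in-memory filesystem from pyfilesystem2 (the fs package), so the app no longer writes to local disk. A minimal sketch of the MemoryFS calls the new code relies on, assuming pyfilesystem2 is installed (pip install fs):

import fs.memoryfs

mem_fs = fs.memoryfs.MemoryFS()              # RAM-backed filesystem; nothing touches disk
mem_fs.makedirs("pdfs", recreate=True)       # recreate=True avoids an error if the dir exists

with mem_fs.openbin("pdfs/example.pdf", "w") as f:   # binary handle, like open(..., "wb")
    f.write(b"%PDF-1.4 stub")

for path in mem_fs.walk.files():             # yields absolute paths such as "/pdfs/example.pdf"
    print(path, len(mem_fs.getbytes(path)))  # getbytes() returns the whole file as bytes

mem_fs.close()                               # releases the in-memory contents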
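The download path replaces urllib.request.urlretrieve with requests plus shutil.copyfileobj, which copies the response in chunks instead of buffering the whole PDF in one string. response.raw only streams usefully when the request is made with stream=True, as the diff does. A self-contained sketch; the URL is a placeholder, not from the commit:

import io
import shutil
import requests

dest = io.BytesIO()  # stands in for mem_fs.openbin("pdfs/...", "w")
response = requests.get("https://example.org/paper.pdf", stream=True)  # placeholder URL
response.raise_for_status()             # optional hardening; the app silently skips failures
shutil.copyfileobj(response.raw, dest)  # chunked copy from the socket to the file object

One caveat: response.raw is the undecoded transport stream, so a gzip-encoded response lands compressed; setting response.raw.decode_content = True before the copy is the usual remedy.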
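A spot worth watching is pdf.output(pdf_file): classic PyFPDF's output(name='', dest='') takes a path string plus a dest flag, so some builds may not accept an open file object. If that bites, dest='S' returns the document as a string that can be encoded and written manually; a fallback sketch under that assumption:

# Fallback assuming classic PyFPDF 1.x; fpdf2 builds may accept other output targets.
pdf_bytes = pdf.output(dest="S").encode("latin-1")   # 'S' = return the document as a string
with mem_fs.openbin("1_Search_query_AND_results.pdf", "w") as pdf_file:
    pdf_file.write(pdf_bytes)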
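Since MemoryFS contents evaporate with the process, search_arxiv now packs everything into a zip built on io.BytesIO. The easy-to-forget step is the final seek(0): writing leaves the buffer positioned at its end, so a reader would otherwise see zero bytes. The pattern in isolation:

import io
from zipfile import ZipFile

zip_buffer = io.BytesIO()
with ZipFile(zip_buffer, "w") as archive:
    for path in mem_fs.walk.files():             # paths come back like "/pdfs/x.pdf"
        archive.writestr(path.lstrip("/"), mem_fs.getbytes(path))
zip_buffer.seek(0)                               # rewind so readers start at byte 0

The lstrip("/") is my tweak, not the commit's: writestr records the leading slash as-is, and some unzip tools dislike absolute-looking entry names.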
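Finally, the interface gains gr.outputs.File so the zip is downloadable. Depending on the Gradio version, File outputs may expect a filesystem path rather than a raw BytesIO; if the download comes back empty, spooling the buffer to a named temporary file and returning its path is a common workaround. The helper below is my sketch, not part of the commit:

import tempfile

def buffer_to_path(zip_buffer):
    # Persist the in-memory zip to a real temp file and hand Gradio the path.
    tmp = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
    tmp.write(zip_buffer.getvalue())
    tmp.close()
    return tmp.name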