Spaces:
Build error
Build error
Commit
•
a06773e
1
Parent(s):
80354f9
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
import requests
|
3 |
import xml.etree.ElementTree as ET
|
@@ -7,6 +8,9 @@ from datetime import datetime
|
|
7 |
import pandas as pd
|
8 |
from fpdf import FPDF
|
9 |
import gradio as gr
|
|
|
|
|
|
|
10 |
|
11 |
def get_arxiv_data(search_query, number):
|
12 |
url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
|
@@ -16,10 +20,7 @@ def get_arxiv_data(search_query, number):
|
|
16 |
entries = root.findall('{http://www.w3.org/2005/Atom}entry')
|
17 |
results = []
|
18 |
|
19 |
-
|
20 |
-
current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
|
21 |
-
folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
|
22 |
-
os.makedirs(folder_path, exist_ok=True)
|
23 |
|
24 |
for entry in entries:
|
25 |
title = entry.find('{http://www.w3.org/2005/Atom}title').text
|
@@ -37,14 +38,19 @@ def get_arxiv_data(search_query, number):
|
|
37 |
# Download PDF file
|
38 |
pdf_link = link.replace('abs', 'pdf') + '.pdf'
|
39 |
filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
|
40 |
-
|
|
|
41 |
|
42 |
try:
|
43 |
-
|
|
|
44 |
except Exception as e:
|
45 |
continue
|
|
|
|
|
46 |
|
47 |
# Save search query and results to PDF
|
|
|
48 |
pdf = FPDF()
|
49 |
pdf.add_page()
|
50 |
pdf.set_font('Arial', 'B', 16)
|
@@ -53,36 +59,42 @@ def get_arxiv_data(search_query, number):
|
|
53 |
for i, result in enumerate(results):
|
54 |
pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
|
55 |
pdf.ln(5) # Add newline after each result
|
56 |
-
pdf.output(
|
|
|
57 |
|
58 |
# Save search query, results, and current time to Excel file
|
59 |
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
60 |
df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
if os.path.exists(excel_filepath):
|
65 |
-
existing_df = pd.read_excel(excel_filepath)
|
66 |
-
df = pd.concat([existing_df, df], ignore_index=True)
|
67 |
-
df.to_excel(excel_filepath, index=False)
|
68 |
|
69 |
-
return results
|
70 |
|
71 |
def search_arxiv(search_query, max_results):
|
72 |
start_time = datetime.now()
|
73 |
-
results = get_arxiv_data(search_query, max_results)
|
74 |
elapsed_time = datetime.now() - start_time
|
75 |
elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
search_query_input = gr.inputs.Textbox(label="Search Query")
|
80 |
max_results_input = gr.inputs.Textbox(label="Max Results")
|
81 |
|
82 |
output_text = gr.outputs.Textbox(label="Results")
|
83 |
output_time = gr.outputs.Textbox(label="Elapsed Time")
|
|
|
84 |
|
85 |
title = "ArXiv Search"
|
86 |
description = "Search for articles on ArXiv"
|
87 |
|
88 |
-
gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()
|
|
|
1 |
+
import fs.memoryfs
|
2 |
import os
|
3 |
import requests
|
4 |
import xml.etree.ElementTree as ET
|
|
|
8 |
import pandas as pd
|
9 |
from fpdf import FPDF
|
10 |
import gradio as gr
|
11 |
+
import io
|
12 |
+
import shutil
|
13 |
+
from zipfile import ZipFile
|
14 |
|
15 |
def get_arxiv_data(search_query, number):
|
16 |
url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
|
|
|
20 |
entries = root.findall('{http://www.w3.org/2005/Atom}entry')
|
21 |
results = []
|
22 |
|
23 |
+
mem_fs = fs.memoryfs.MemoryFS()
|
|
|
|
|
|
|
24 |
|
25 |
for entry in entries:
|
26 |
title = entry.find('{http://www.w3.org/2005/Atom}title').text
|
|
|
38 |
# Download PDF file
|
39 |
pdf_link = link.replace('abs', 'pdf') + '.pdf'
|
40 |
filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
|
41 |
+
mem_fs.makedirs("pdfs", recreate=True)
|
42 |
+
filepath = mem_fs.openbin("pdfs/" + filename, "w")
|
43 |
|
44 |
try:
|
45 |
+
response = requests.get(pdf_link, stream=True)
|
46 |
+
shutil.copyfileobj(response.raw, filepath)
|
47 |
except Exception as e:
|
48 |
continue
|
49 |
+
finally:
|
50 |
+
filepath.close()
|
51 |
|
52 |
# Save search query and results to PDF
|
53 |
+
pdf_file = mem_fs.openbin("1_Search_query_AND_results.pdf", "w")
|
54 |
pdf = FPDF()
|
55 |
pdf.add_page()
|
56 |
pdf.set_font('Arial', 'B', 16)
|
|
|
59 |
for i, result in enumerate(results):
|
60 |
pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
|
61 |
pdf.ln(5) # Add newline after each result
|
62 |
+
pdf.output(pdf_file)
|
63 |
+
pdf_file.close()
|
64 |
|
65 |
# Save search query, results, and current time to Excel file
|
66 |
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
67 |
df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
|
68 |
+
excel_file = mem_fs.openbin("information.xlsx", "w")
|
69 |
+
df.to_excel(excel_file)
|
70 |
+
excel_file.close()
|
|
|
|
|
|
|
|
|
71 |
|
72 |
+
return results, mem_fs
|
73 |
|
74 |
def search_arxiv(search_query, max_results):
|
75 |
start_time = datetime.now()
|
76 |
+
results, mem_fs = get_arxiv_data(search_query, max_results)
|
77 |
elapsed_time = datetime.now() - start_time
|
78 |
elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
|
79 |
|
80 |
+
# Create a Zip file
|
81 |
+
zip_file = io.BytesIO()
|
82 |
+
with ZipFile(zip_file, 'w') as zip:
|
83 |
+
for path in mem_fs.walk.files():
|
84 |
+
file_data = mem_fs.getbytes(path)
|
85 |
+
zip.writestr(path, file_data)
|
86 |
+
zip_file.seek(0) # Rewind the file. Essential for reading!
|
87 |
+
|
88 |
+
return '\n'.join(results), elapsed_time_str, zip_file
|
89 |
|
90 |
search_query_input = gr.inputs.Textbox(label="Search Query")
|
91 |
max_results_input = gr.inputs.Textbox(label="Max Results")
|
92 |
|
93 |
output_text = gr.outputs.Textbox(label="Results")
|
94 |
output_time = gr.outputs.Textbox(label="Elapsed Time")
|
95 |
+
output_file = gr.outputs.File(label="Download")
|
96 |
|
97 |
title = "ArXiv Search"
|
98 |
description = "Search for articles on ArXiv"
|
99 |
|
100 |
+
gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time, output_file], title=title, description=description).launch()
|