# Crawl_Arxiv / app.py
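# Gradio app: query the arXiv export API for a search term, download each matching
# paper's PDF into an in-memory filesystem, write a summary PDF and an Excel sheet
# of the results, and return everything to the user as a single zip archive.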
import io
import re
import shutil
import xml.etree.ElementTree as ET
from datetime import datetime
from zipfile import ZipFile

import fs.memoryfs
import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF
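# Assumed runtime dependencies (not pinned in this file): requests, pandas,
# openpyxl (for DataFrame.to_excel), fpdf, fs (PyFilesystem2), gradio.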
def get_arxiv_data(search_query, number):
    """Query the arXiv export API and collect results and PDFs in a MemoryFS."""
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
    response = requests.get(url)
    xml_data = response.text

    root = ET.fromstring(xml_data)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')

    results = []
    mem_fs = fs.memoryfs.MemoryFS()  # in-memory filesystem holding all generated files

    for entry in entries:
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
        published = entry.find('{http://www.w3.org/2005/Atom}published').text
        author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        results.append(result_string)

        # Download the paper's PDF into the in-memory filesystem
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        mem_fs.makedirs("pdfs", recreate=True)
        filepath = mem_fs.openbin("pdfs/" + filename, "w")
        try:
            response = requests.get(pdf_link, stream=True)
            shutil.copyfileobj(response.raw, filepath)
        except Exception:
            continue  # skip papers whose PDF cannot be downloaded
        finally:
            filepath.close()

    # Save the search query and results to a summary PDF
    pdf_file = mem_fs.openbin("1_Search_query_AND_results.pdf", "w")
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # blank line after each result
    # NOTE: FPDF.output() is given a writable stream here; this assumes an fpdf
    # build that accepts file-like objects (classic PyFPDF expects a file path).
    pdf.output(pdf_file)
    pdf_file.close()

    # Save the search query, results, and current time to an Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query],
                       'Results': ['\n'.join(results)],
                       'Timestamp': [current_time]})
    excel_file = mem_fs.openbin("information.xlsx", "w")
    df.to_excel(excel_file)
    excel_file.close()

    return results, mem_fs
def search_arxiv(search_query, max_results):
    """Run the search and package every generated file into a zip held in memory."""
    start_time = datetime.now()
    results, mem_fs = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"

    # Bundle every file from the in-memory filesystem into a single zip archive
    zip_file = io.BytesIO()
    with ZipFile(zip_file, 'w') as zip_archive:
        for path in mem_fs.walk.files():
            file_data = mem_fs.getbytes(path)
            zip_archive.writestr(path, file_data)
    zip_file.seek(0)  # Rewind the buffer; essential so it can be read from the start

    return '\n'.join(results), elapsed_time_str, zip_file
# Gradio UI (uses the legacy gr.inputs/gr.outputs namespaces, removed in Gradio 4)
search_query_input = gr.inputs.Textbox(label="Search Query")
max_results_input = gr.inputs.Textbox(label="Max Results")
output_text = gr.outputs.Textbox(label="Results")
output_time = gr.outputs.Textbox(label="Elapsed Time")
output_file = gr.outputs.File(label="Download")

title = "ArXiv Search"
description = "Search for articles on ArXiv"

gr.Interface(fn=search_arxiv,
             inputs=[search_query_input, max_results_input],
             outputs=[output_text, output_time, output_file],
             title=title, description=description).launch()