# Spaces:
# Build error
# Build error
import io
import os
import re
import shutil
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime
from zipfile import ZipFile

import fs.memoryfs
import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF
def get_arxiv_data(search_query, number):
    """Query the arXiv Atom API and collect metadata plus PDFs in memory.

    Args:
        search_query: arXiv API search expression (e.g. "all:electron").
        number: maximum number of results to request.

    Returns:
        (results, mem_fs): a list of formatted result strings and a
        fs.memoryfs.MemoryFS containing the downloaded PDFs under "pdfs/",
        a summary PDF, and an Excel sheet with the query metadata.
    """
    ns = '{http://www.w3.org/2005/Atom}'  # Atom namespace used by the arXiv API
    url = (f'http://export.arxiv.org/api/query?'
           f'search_query={search_query}&start=0&max_results={number}')
    response = requests.get(url, timeout=30)
    root = ET.fromstring(response.text)
    entries = root.findall(f'{ns}entry')

    results = []
    mem_fs = fs.memoryfs.MemoryFS()
    # Create the target directory once, not once per entry.
    mem_fs.makedirs("pdfs", recreate=True)

    for entry in entries:
        title = entry.find(f'{ns}title').text
        link = entry.find(f'{ns}link').attrib['href']
        published = entry.find(f'{ns}published').text
        author = entry.find(f'{ns}author/{ns}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        results.append(
            f'Title: {title}\nLink: {link}\nPublished: {published}\n'
            f'Author: {author}\n'
        )

        # Download the PDF. Replace only the "/abs/" path segment — the old
        # .replace('abs', 'pdf') rewrote EVERY occurrence of "abs" anywhere
        # in the URL.
        pdf_link = link.replace('/abs/', '/pdf/') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        pdf_path = "pdfs/" + filename
        try:
            pdf_response = requests.get(pdf_link, stream=True, timeout=30)
            pdf_response.raise_for_status()  # don't save an HTML error page
            with mem_fs.openbin(pdf_path, "w") as pdf_out:
                shutil.copyfileobj(pdf_response.raw, pdf_out)
        except Exception:
            # Best effort: a failed download must not abort the whole search.
            # Drop any partial file so the archive stays clean (the original
            # opened the file before the request and left an empty one behind).
            if mem_fs.exists(pdf_path):
                mem_fs.remove(pdf_path)
            continue

    # Save search query and results to a summary PDF.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # blank line between results
    with mem_fs.openbin("1_Search_query_AND_results.pdf", "w") as pdf_file:
        pdf.output(pdf_file)

    # Save search query, results, and current time to an Excel file.
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({
        'Search Query': [search_query],
        'Results': ['\n'.join(results)],
        'Timestamp': [current_time],
    })
    with mem_fs.openbin("information.xlsx", "w") as excel_file:
        df.to_excel(excel_file)

    return results, mem_fs
def search_arxiv(search_query, max_results):
    """Run an arXiv search and bundle every generated file into a ZIP.

    Returns a tuple of (newline-joined result strings, elapsed-time message,
    rewound in-memory ZIP archive ready for download).
    """
    started = datetime.now()
    results, mem_fs = get_arxiv_data(search_query, max_results)
    elapsed = datetime.now() - started
    elapsed_msg = f"Elapsed Time: {elapsed.total_seconds()} seconds"

    # Pack everything in the in-memory filesystem into a single ZIP archive.
    archive = io.BytesIO()
    with ZipFile(archive, 'w') as bundle:
        for path in mem_fs.walk.files():
            bundle.writestr(path, mem_fs.getbytes(path))
    archive.seek(0)  # Rewind the file. Essential for reading!

    return '\n'.join(results), elapsed_msg, archive
# Gradio UI wiring. The `gr.inputs` / `gr.outputs` namespaces were deprecated
# in Gradio 3 and removed in Gradio 4 — using them fails at import time on
# current Spaces builds. The top-level component classes are the supported,
# backward-compatible replacements.
search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")
output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")
output_file = gr.File(label="Download")

title = "ArXiv Search"
description = "Search for articles on ArXiv"

gr.Interface(
    fn=search_arxiv,
    inputs=[search_query_input, max_results_input],
    outputs=[output_text, output_time, output_file],
    title=title,
    description=description,
).launch()