import io
import re
import shutil
import tempfile
import xml.etree.ElementTree as ET
from datetime import datetime
from zipfile import ZipFile

import fs.memoryfs
import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF

# Atom XML namespace used by the arXiv API responses
ATOM = '{http://www.w3.org/2005/Atom}'


def get_arxiv_data(search_query, number):
    """Query the arXiv API, download the matching PDFs into an in-memory
    filesystem, and add a summary PDF and an Excel sheet alongside them."""
    # Let requests build the query string so spaces and special characters
    # in the search query are URL-encoded correctly.
    response = requests.get(
        'http://export.arxiv.org/api/query',
        params={'search_query': search_query, 'start': 0, 'max_results': number},
    )
    root = ET.fromstring(response.text)
    entries = root.findall(f'{ATOM}entry')

    results = []
    mem_fs = fs.memoryfs.MemoryFS()
    mem_fs.makedirs("pdfs", recreate=True)

    for entry in entries:
        title = entry.find(f'{ATOM}title').text
        link = entry.find(f'{ATOM}link').attrib['href']
        published = entry.find(f'{ATOM}published').text
        author = entry.find(f'{ATOM}author/{ATOM}name').text

        # Skip entries whose link does not point back to arXiv
        if not link.startswith('http://arxiv.org/'):
            continue

        results.append(
            f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        )

        # Download the paper's PDF into the in-memory filesystem
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        with mem_fs.openbin("pdfs/" + filename, "w") as pdf_out:
            try:
                pdf_response = requests.get(pdf_link, stream=True)
                pdf_response.raw.decode_content = True  # undo any transfer encoding
                shutil.copyfileobj(pdf_response.raw, pdf_out)
            except Exception:
                continue  # skip papers whose PDF cannot be downloaded

    # Save the search query and results to a summary PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # blank line after each result
    # Legacy PyFPDF returns a latin-1 str for dest='S' while fpdf2 returns a
    # bytearray; normalise to bytes before writing into the memory filesystem.
    pdf_data = pdf.output(dest='S')
    if isinstance(pdf_data, str):
        pdf_data = pdf_data.encode('latin-1')
    with mem_fs.openbin("1_Search_query_AND_results.pdf", "w") as pdf_file:
        pdf_file.write(bytes(pdf_data))

    # Save the search query, results, and timestamp to an Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({
        'Search Query': [search_query],
        'Results': ['\n'.join(results)],
        'Timestamp': [current_time],
    })
    with mem_fs.openbin("information.xlsx", "w") as excel_file:
        df.to_excel(excel_file)

    return results, mem_fs


def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    results, mem_fs = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"

    # Bundle everything from the in-memory filesystem into a single zip archive
    zip_buffer = io.BytesIO()
    with ZipFile(zip_buffer, 'w') as zip_archive:
        for path in mem_fs.walk.files():
            # Strip the leading '/' so the archive stores relative paths
            zip_archive.writestr(path.lstrip('/'), mem_fs.readbytes(path))

    # Gradio's File component expects a filesystem path, so spill the archive
    # to a named temporary file and return its path for download.
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        tmp.write(zip_buffer.getvalue())
        zip_path = tmp.name

    return '\n'.join(results), elapsed_time_str, zip_path


search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")

output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")
output_file = gr.File(label="Download")

title = "ArXiv Search"
description = "Search for articles on ArXiv"

gr.Interface(
    fn=search_arxiv,
    inputs=[search_query_input, max_results_input],
    outputs=[output_text, output_time, output_file],
    title=title,
    description=description,
).launch()
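
# Example usage (hypothetical values, not part of the original script): running
# this file starts the Gradio app locally. Entering an arXiv API query such as
# "all:electron" with max results "5" returns the matching entries as text, the
# elapsed time, and a downloadable zip containing the per-paper PDFs, the
# summary PDF, and information.xlsx. The query string follows arXiv's search
# syntax, with field prefixes such as "ti:", "au:", "abs:", and "all:".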