import os
import re
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime

import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF

# Atom namespace used by the arXiv API responses
ATOM = {'atom': 'http://www.w3.org/2005/Atom'}


def get_arxiv_data(search_query, number):
    # Let requests handle URL encoding of the query parameters
    url = 'http://export.arxiv.org/api/query'
    params = {'search_query': search_query, 'start': 0, 'max_results': number}
    response = requests.get(url, params=params)
    root = ET.fromstring(response.text)
    entries = root.findall('atom:entry', ATOM)

    results = []

    # Create a folder named after the current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
    os.makedirs(folder_path, exist_ok=True)

    for entry in entries:
        # findtext with a default avoids AttributeError on missing elements
        title = entry.findtext('atom:title', default='', namespaces=ATOM)
        published = entry.findtext('atom:published', default='', namespaces=ATOM)
        author = entry.findtext('atom:author/atom:name', default='', namespaces=ATOM)
        link_el = entry.find('atom:link', ATOM)
        link = link_el.attrib.get('href', '') if link_el is not None else ''

        # Skip entries whose first link is not an arXiv abstract page
        if not link.startswith(('http://arxiv.org/', 'https://arxiv.org/')):
            continue

        results.append(
            f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        )

        # Download the PDF; arxiv.org/pdf/<id> mirrors arxiv.org/abs/<id>
        pdf_link = link.replace('/abs/', '/pdf/') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)
        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            continue  # Skip papers whose PDF cannot be fetched

    # Save the search query and results to a PDF summary.
    # Classic FPDF only supports latin-1, so replace unsupported characters
    # to avoid a UnicodeEncodeError on non-ASCII titles or author names.
    def to_latin1(text):
        return text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, to_latin1(f'Search Query: {search_query}'), ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, to_latin1(f'Result {i + 1}:\n{result}\n'))
        pdf.ln(5)  # Blank space after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))

    # Append the search query, results, and timestamp to a running Excel log
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({
        'Search Query': [search_query],
        'Results': ['\n'.join(results)],
        'Timestamp': [timestamp],
    })
    data_path = os.path.join(os.path.dirname(__file__), 'data')
    os.makedirs(data_path, exist_ok=True)
    excel_filepath = os.path.join(data_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results


def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    # Cast to int: Gradio may deliver the count as a float or string
    results = get_arxiv_data(search_query, int(max_results))
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f'Elapsed Time: {elapsed_time.total_seconds():.2f} seconds'
    return '\n'.join(results), elapsed_time_str


# gr.inputs / gr.outputs were removed in Gradio 3+; use the top-level components.
# A Number component keeps "Max Results" numeric instead of free-form text.
search_query_input = gr.Textbox(label='Search Query')
max_results_input = gr.Number(label='Max Results', value=5, precision=0)
output_text = gr.Textbox(label='Results')
output_time = gr.Textbox(label='Elapsed Time')

demo = gr.Interface(
    fn=search_arxiv,
    inputs=[search_query_input, max_results_input],
    outputs=[output_text, output_time],
    title='ArXiv Search',
    description='Search for articles on ArXiv',
)

if __name__ == '__main__':
    demo.launch()
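
# A minimal sketch of exercising the search pipeline without the web UI,
# e.g. from a REPL or a test script. The 'cat:cs.CL' query string is just an
# illustrative arXiv category filter, not something the app itself uses:
#
#     results_text, timing = search_arxiv('cat:cs.CL', 3)
#     print(timing)
#     print(results_text)
#
# Note that this still writes the per-run folder, PDF summary, and Excel log
# as side effects, exactly as the Gradio callback does.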