# ArXiv search + download Gradio app: queries the arXiv API, saves paper PDFs,
# and logs each search to a summary PDF and a running Excel sheet.
# Standard library
import os
import re
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime

# Third-party
import gradio as gr
import pandas as pd
import requests
from fpdf import FPDF
def get_arxiv_data(search_query, number):
    """Search arXiv and archive the results.

    Queries the arXiv Atom API for ``search_query`` (up to ``number`` hits),
    downloads each paper's PDF into a timestamped folder under ``data/``,
    writes a summary PDF of the search, and appends a row to a running Excel
    log at ``data/information.xlsx``.

    Parameters:
        search_query: arXiv query string, passed straight into the API URL.
        number: maximum number of results to request (int or numeric string).

    Returns:
        list[str]: one human-readable summary string per kept entry.

    Raises:
        requests.HTTPError: if the arXiv API returns an HTTP error status.
    """
    ATOM = '{http://www.w3.org/2005/Atom}'

    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    root = ET.fromstring(response.text)
    entries = root.findall(ATOM + 'entry')

    results = []

    # One folder per run, stamped with the launch time.
    run_stamp = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join('data', run_stamp)
    os.makedirs(folder_path, exist_ok=True)

    for entry in entries:
        # findtext tolerates missing elements (returns the default) where the
        # original .find(...).text would raise AttributeError on None.
        title = entry.findtext(ATOM + 'title', default='')
        published = entry.findtext(ATOM + 'published', default='')
        author = entry.findtext(ATOM + 'author/' + ATOM + 'name', default='')
        link_elem = entry.find(ATOM + 'link')
        link = link_elem.attrib.get('href', '') if link_elem is not None else ''

        # Skip entries whose link does not point at arXiv.
        if not link.startswith('http://arxiv.org/'):
            continue

        results.append(
            f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        )

        # Download the PDF: arXiv /abs/ pages map to /pdf/ downloads.
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)
        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            # Best-effort download: keep the entry's metadata even if the
            # PDF fetch fails.
            continue

    # Save the search query and all result summaries to a report PDF.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # spacing between results
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))

    # Append query, results, and timestamp to the running Excel log.
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({
        'Search Query': [search_query],
        'Results': ['\n'.join(results)],
        'Timestamp': [timestamp],
    })
    os.makedirs('data', exist_ok=True)
    excel_filepath = os.path.join('data', 'information.xlsx')
    if os.path.exists(excel_filepath):
        # Merge with prior runs so the log accumulates across sessions.
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results
def search_arxiv(search_query, max_results):
    """Gradio handler: run the arXiv search and report the wall-clock time.

    Returns a (results_text, elapsed_time_text) pair of strings suitable
    for the two output Textbox components.
    """
    started = datetime.now()
    results = get_arxiv_data(search_query, max_results)
    seconds = (datetime.now() - started).total_seconds()
    return '\n'.join(results), f"Elapsed Time: {seconds} seconds"
# Gradio UI wiring. The gr.inputs / gr.outputs namespaces were removed in
# Gradio 3.x (the likely cause of the recorded build error); the unified
# gr.Textbox component works for both inputs and outputs.
search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")
output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")

title = "ArXiv Search"
description = "Crawling Papers on Arxiv"

gr.Interface(
    fn=search_arxiv,
    inputs=[search_query_input, max_results_input],
    outputs=[output_text, output_time],
    title=title,
    description=description,
).launch()