# arXiv search tool: queries the arXiv API, downloads matching PDFs,
# and logs each search to PDF and Excel via a Gradio interface.
import os
import requests
import xml.etree.ElementTree as ET
import urllib.request
import re
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import gradio as gr
def get_arxiv_data(search_query, number):
    """Query the arXiv Atom API, download matching PDFs, and log the search.

    Args:
        search_query: arXiv API query string (e.g. 'all:electron').
        number: maximum number of results to request (int or numeric string).

    Returns:
        A list of formatted strings, one per arXiv entry, each containing
        title, abstract link, publication date, and first author.

    Side effects:
        - Creates data/<timestamp>/ next to this file and saves each paper's
          PDF there, plus a summary PDF of the query and results.
        - Appends the query, joined results, and a timestamp to
          data/information.xlsx (creating it if absent).
    """
    ns = '{http://www.w3.org/2005/Atom}'  # Atom namespace used by the arXiv feed
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
    response = requests.get(url, timeout=30)  # timeout: don't hang forever on a dead connection
    response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
    root = ET.fromstring(response.text)
    entries = root.findall(f'{ns}entry')
    results = []
    # Create folder for current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
    os.makedirs(folder_path, exist_ok=True)
    for entry in entries:
        title = entry.find(f'{ns}title').text
        link = entry.find(f'{ns}link').attrib['href']
        published = entry.find(f'{ns}published').text
        author = entry.find(f'{ns}author/{ns}name').text
        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue
        results.append(f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n')
        # Download the PDF. Replace only the '/abs/' path segment so an 'abs'
        # substring occurring inside the paper id can never be corrupted.
        pdf_link = link.replace('/abs/', '/pdf/') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)
        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            # Best-effort download: one unreachable PDF must not abort the search.
            continue
    # Save search query and results to PDF. FPDF's core fonts are latin-1
    # only, so sanitize text to avoid a UnicodeEncodeError on unicode titles.
    def _latin1(text):
        return text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, _latin1(f"Search Query: {search_query}"), ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, _latin1(f"Result {i + 1}:\n{result}\n"))
        pdf.ln(5)  # Add newline after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))
    # Save search query, results, and current time to Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
    folder_path = os.path.join(os.path.dirname(__file__), 'data')
    os.makedirs(folder_path, exist_ok=True)
    excel_filepath = os.path.join(folder_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        # Append to the history rather than overwriting previous searches.
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)
    return results
def search_arxiv(search_query, max_results):
    """Run an arXiv search and report how long it took.

    Args:
        search_query: arXiv API query string, forwarded to get_arxiv_data.
        max_results: maximum number of results to fetch.

    Returns:
        A tuple of (newline-joined result strings, elapsed-time message).
    """
    started = datetime.now()
    found = get_arxiv_data(search_query, max_results)
    delta = datetime.now() - started
    timing = f"Elapsed Time: {delta.total_seconds()} seconds"
    return '\n'.join(found), timing
# Gradio UI wiring. The gr.inputs / gr.outputs namespaces were deprecated
# and removed in Gradio 3.x; the top-level component classes (gr.Textbox)
# are the supported equivalent and take the same label= keyword.
search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")
output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")
title = "ArXiv Search"
description = "Search for articles on ArXiv"
# launch() starts the local web server and blocks until it is shut down.
gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()