# Crawl_Arxiv / app.py
import os
import requests
import xml.etree.ElementTree as ET
import urllib.request
import re
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import gradio as gr


def get_arxiv_data(search_query, number):
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
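    # The arXiv export API accepts a search_query built from field prefixes such
    # as all:, ti: (title), au: (author) and cat: (category), combinable with
    # AND/OR, e.g. 'all:electron+AND+cat:cs.LG' (illustrative example query).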
    response = requests.get(url)
    xml_data = response.text

    root = ET.fromstring(xml_data)
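    # The response is an Atom XML feed, so every tag lookup below has to be
    # qualified with the Atom namespace URI '{http://www.w3.org/2005/Atom}'.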
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')

    results = []

    # Create folder for current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join('data', current_time)
    os.makedirs(folder_path, exist_ok=True)
    for entry in entries:
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
        published = entry.find('{http://www.w3.org/2005/Atom}published').text
        author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        results.append(result_string)

        # Download PDF file
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)
        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            continue
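        # arXiv's API guidelines ask automated clients to pace their requests
        # (roughly one every few seconds), so a short time.sleep() between
        # downloads is worth considering when crawling many results.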
    # Save search query and results to PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # Add newline after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))
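    # The built-in Arial font used above only covers latin-1, so titles with
    # other Unicode characters can make FPDF raise an encoding error; registering
    # a Unicode TTF via pdf.add_font() is one way around that if it becomes an issue.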
    # Save search query, results, and current time to Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})

    folder_path = 'data'
    os.makedirs(folder_path, exist_ok=True)
    excel_filepath = os.path.join(folder_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results
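
# A minimal sketch of calling the crawler directly, without the Gradio UI
# (assumes the working directory is writable so the 'data' folder can be created):
#
#     results = get_arxiv_data('all:electron', 5)
#     print('\n'.join(results))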


def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    results = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
    return '\n'.join(results), elapsed_time_str


search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")  # kept as a string; it is only interpolated into the API URL

output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")

title = "ArXiv Search"
description = "Crawling papers on arXiv"

gr.Interface(fn=search_arxiv,
             inputs=[search_query_input, max_results_input],
             outputs=[output_text, output_time],
             title=title,
             description=description).launch()
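
# Running `python app.py` starts the Gradio server (http://127.0.0.1:7860 by
# default) with this search form; results are also written under ./data/ as the
# downloaded PDFs, a summary PDF, and an Excel log, as implemented above.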