# Crawl_Arxiv / app.py
import os
import requests
import xml.etree.ElementTree as ET
import urllib.request
import re
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import gradio as gr


def get_arxiv_data(search_query, number):
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
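    # The arXiv export API accepts a search_query built from field prefixes such
    # as all:, ti: (title), au: (author) and cat: (category), combinable with
    # AND/OR, e.g. 'all:electron+AND+cat:cs.LG' (illustrative example query).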
    response = requests.get(url)
    xml_data = response.text

    root = ET.fromstring(xml_data)
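    # The response is an Atom XML feed, so every tag lookup below has to be
    # qualified with the Atom namespace URI '{http://www.w3.org/2005/Atom}'.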
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')

    results = []

    # Create folder for current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join('data', current_time)
    os.makedirs(folder_path, exist_ok=True)
    for entry in entries:
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
        published = entry.find('{http://www.w3.org/2005/Atom}published').text
        author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        results.append(result_string)

        # Download PDF file
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)
        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            continue
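        # arXiv's API guidelines ask automated clients to pace their requests
        # (roughly one every few seconds), so a short time.sleep() between
        # downloads is worth considering when crawling many results.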
    # Save search query and results to PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # Add newline after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))
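    # The built-in Arial font used above only covers latin-1, so titles with
    # other Unicode characters can make FPDF raise an encoding error; registering
    # a Unicode TTF via pdf.add_font() is one way around that if it becomes an issue.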
    # Save search query, results, and current time to Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})

    folder_path = 'data'
    os.makedirs(folder_path, exist_ok=True)
    excel_filepath = os.path.join(folder_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results
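
# A minimal sketch of calling the crawler directly, without the Gradio UI
# (assumes the working directory is writable so the 'data' folder can be created):
#
#     results = get_arxiv_data('all:electron', 5)
#     print('\n'.join(results))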


def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    results = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"
    return '\n'.join(results), elapsed_time_str


search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")  # kept as a string; it is only interpolated into the API URL

output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")

title = "ArXiv Search"
description = "Crawling papers on arXiv"

gr.Interface(fn=search_arxiv,
             inputs=[search_query_input, max_results_input],
             outputs=[output_text, output_time],
             title=title,
             description=description).launch()
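
# Running `python app.py` starts the Gradio server (http://127.0.0.1:7860 by
# default) with this search form; results are also written under ./data/ as the
# downloaded PDFs, a summary PDF, and an Excel log, as implemented above.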