import os
import requests
import xml.etree.ElementTree as ET
import urllib.request
import re
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import gradio as gr

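# get_arxiv_data: query the arXiv Atom API, download each result's PDF into a
# timestamped folder under ./data, and record the query and results in a
# summary PDF plus a cumulative Excel sheet (data/information.xlsx).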
def get_arxiv_data(search_query, number):
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
    response = requests.get(url)
    xml_data = response.text
    root = ET.fromstring(xml_data)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    results = []

    # Create folder for current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
    os.makedirs(folder_path, exist_ok=True)

    for entry in entries:
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
        published = entry.find('{http://www.w3.org/2005/Atom}published').text
        author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        results.append(result_string)

        # Download PDF file
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)

        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            # Skip entries whose PDF could not be downloaded
            continue

    # Save search query and results to PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # Add newline after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))

    # Save search query, results, and current time to Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
    folder_path = os.path.join(os.path.dirname(__file__), 'data')
    os.makedirs(folder_path, exist_ok=True)
    excel_filepath = os.path.join(folder_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results

def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    results = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"

    return '\n'.join(results), elapsed_time_str

# Note: the gr.inputs / gr.outputs namespaces were removed in Gradio 3+;
# plain gr.Textbox components work as both inputs and outputs.
search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")

output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")

title = "ArXiv Search"
description = "Search for articles on ArXiv"

gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()
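
# --- Optional usage sketch (not part of the original app) -------------------
# The Gradio interface above launches as soon as the script runs. If you only
# want the scraping/reporting logic without the UI, a minimal sketch could be:
#
#   results = get_arxiv_data("all:electron", 5)
#   for line in results:
#       print(line)
#
# The search query follows the standard arXiv API syntax: field prefixes such
# as all:, ti:, au:, and cat: can be combined with AND/OR, for example
# "cat:cs.LG AND ti:transformer". Results and PDFs land in data/<timestamp>/.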