import os
import requests
import xml.etree.ElementTree as ET
import urllib.request
import re
from datetime import datetime
import pandas as pd
from fpdf import FPDF
import gradio as gr

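# get_arxiv_data: query the arXiv Atom API, download each result's PDF into a
# timestamped folder under ./data, and record the query and results in a
# summary PDF plus a cumulative Excel sheet (data/information.xlsx).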
def get_arxiv_data(search_query, number):
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start=0&max_results={number}'
    response = requests.get(url)
    xml_data = response.text
    root = ET.fromstring(xml_data)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    results = []

    # Create folder for current date and time
    current_time = datetime.now().strftime('%Y_%m_%d__%H_%M')
    folder_path = os.path.join(os.path.dirname(__file__), 'data', current_time)
    os.makedirs(folder_path, exist_ok=True)

    for entry in entries:
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        link = entry.find('{http://www.w3.org/2005/Atom}link').attrib['href']
        published = entry.find('{http://www.w3.org/2005/Atom}published').text
        author = entry.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text

        # Skip non-arXiv links
        if not link.startswith('http://arxiv.org/'):
            continue

        result_string = f'Title: {title}\nLink: {link}\nPublished: {published}\nAuthor: {author}\n'
        results.append(result_string)

        # Download PDF file
        pdf_link = link.replace('abs', 'pdf') + '.pdf'
        filename = re.sub(r'[^a-zA-Z0-9_]', '_', title) + '.pdf'
        filepath = os.path.join(folder_path, filename)

        try:
            urllib.request.urlretrieve(pdf_link, filepath)
        except Exception:
            # Skip entries whose PDF could not be downloaded
            continue

    # Save search query and results to PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, f"Search Query: {search_query}", ln=True)
    pdf.set_font('Arial', '', 12)
    for i, result in enumerate(results):
        pdf.multi_cell(0, 10, f"Result {i + 1}:\n{result}\n")
        pdf.ln(5)  # Add newline after each result
    pdf.output(os.path.join(folder_path, '1_Search_query_AND_results.pdf'))

    # Save search query, results, and current time to Excel file
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    df = pd.DataFrame({'Search Query': [search_query], 'Results': ['\n'.join(results)], 'Timestamp': [current_time]})
    folder_path = os.path.join(os.path.dirname(__file__), 'data')
    os.makedirs(folder_path, exist_ok=True)
    excel_filepath = os.path.join(folder_path, 'information.xlsx')
    if os.path.exists(excel_filepath):
        existing_df = pd.read_excel(excel_filepath)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_excel(excel_filepath, index=False)

    return results

def search_arxiv(search_query, max_results):
    start_time = datetime.now()
    results = get_arxiv_data(search_query, max_results)
    elapsed_time = datetime.now() - start_time
    elapsed_time_str = f"Elapsed Time: {elapsed_time.total_seconds()} seconds"

    return '\n'.join(results), elapsed_time_str

# Note: the gr.inputs / gr.outputs namespaces were removed in Gradio 3+;
# plain gr.Textbox components work as both inputs and outputs.
search_query_input = gr.Textbox(label="Search Query")
max_results_input = gr.Textbox(label="Max Results")

output_text = gr.Textbox(label="Results")
output_time = gr.Textbox(label="Elapsed Time")

title = "ArXiv Search"
description = "Search for articles on ArXiv"

gr.Interface(fn=search_arxiv, inputs=[search_query_input, max_results_input], outputs=[output_text, output_time], title=title, description=description).launch()
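
# --- Optional usage sketch (not part of the original app) -------------------
# The Gradio interface above launches as soon as the script runs. If you only
# want the scraping/reporting logic without the UI, a minimal sketch could be:
#
#   results = get_arxiv_data("all:electron", 5)
#   for line in results:
#       print(line)
#
# The search query follows the standard arXiv API syntax: field prefixes such
# as all:, ti:, au:, and cat: can be combined with AND/OR, for example
# "cat:cs.LG AND ti:transformer". Results and PDFs land in data/<timestamp>/.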