oceansweep
committed
Commit • 8619cce
1 Parent(s): 1b24d60
Upload 2 files
App_Function_Libraries/Third_Party/Arxiv.py
ADDED
@@ -0,0 +1,166 @@
# Arxiv.py
# Description: This file contains the functions for searching and ingesting arXiv papers.
import time
from datetime import datetime
from urllib.parse import quote_plus

import arxiv
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3 import Retry

#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
#
#####################################################################################################
#
# Functions:

# Number of results per page
ARXIV_PAGE_SIZE = 10


def fetch_arxiv_pdf_url(paper_id):
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"

    # Configure retry strategy
    retry_strategy = Retry(
        total=3,  # Maximum number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        backoff_factor=1  # Exponential backoff factor
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)

    try:
        response = http.get(base_url)
        response.raise_for_status()
        # Delay between requests to avoid rate limiting
        time.sleep(2)
        soup = BeautifulSoup(response.text, 'xml')
        # Guard against a missing link tag so a malformed response returns None
        # instead of raising a TypeError that the except clause below would not catch
        pdf_link = soup.find('link', title='pdf')
        if pdf_link is None:
            print(f"**Error:** No PDF link found for paper {paper_id}")
            return None
        return pdf_link['href']
    except requests.exceptions.RequestException as e:
        print(f"**Error:** {e}")
        return None


def search_arxiv(query):
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=ARXIV_PAGE_SIZE,  # Number of results per page
        sort_by=arxiv.SortCriterion.Relevance
    )

    results = []
    for result in client.results(search):
        results.append([
            result.title,
            result.entry_id.split('/')[-1],  # Extract the ID from the entry_id
            ', '.join(author.name for author in result.authors),
            result.summary
        ])

    return results


def fetch_arxiv_xml(paper_id):
    base_url = "http://export.arxiv.org/api/query?id_list="
    response = requests.get(base_url + paper_id)
    response.raise_for_status()
    return response.text


def parse_arxiv_feed(xml_content):
    soup = BeautifulSoup(xml_content, 'xml')
    entries = []
    for entry in soup.find_all('entry'):
        title = entry.title.text.strip()
        paper_id = entry.id.text.strip().split('/abs/')[-1]
        authors = ', '.join(author.find('name').text.strip() for author in entry.find_all('author'))
        published = entry.published.text.strip().split('T')[0]
        abstract = entry.summary.text.strip()
        entries.append({
            'id': paper_id,
            'title': title,
            'authors': authors,
            'published': published,
            'abstract': abstract
        })
    return entries


def build_query_url(query, author, year, start):
    # The arXiv export API is served over plain HTTP; HTTPS also works if preferred
    base_url = "http://export.arxiv.org/api/query?"
    search_params = []

    # Build search query, URL-encoding user-supplied terms
    if query:
        search_params.append(f"all:{quote_plus(query)}")
    if author:
        # Quote the author name so multi-word names match as a phrase
        search_params.append("au:" + quote_plus(f'"{author}"'))
    if year:
        search_params.append(f"submittedDate:[{year}01010000+TO+{year}12312359]")

    search_query = "+AND+".join(search_params) if search_params else "all:*"

    url = f"{base_url}search_query={search_query}&start={start}&max_results={ARXIV_PAGE_SIZE}"
    return url


def convert_xml_to_markdown(xml_content):
    soup = BeautifulSoup(xml_content, 'xml')

    # Extract title, authors, abstract, and other relevant information from the specific paper entry
    entry = soup.find('entry')
    title = entry.find('title').text.strip()
    authors = [author.find('name').text.strip() for author in entry.find_all('author')]
    abstract = entry.find('summary').text.strip()
    published = entry.find('published').text.strip()

    categories = [category['term'] for category in entry.find_all('category')]

    # Construct a markdown representation of the paper
    markdown = f"# {title}\n\n"
    markdown += f"**Authors:** {', '.join(authors)}\n\n"
    markdown += f"**Published Date:** {published}\n\n"
    markdown += f"**Abstract:**\n\n{abstract}\n\n"
    markdown += f"**Categories:** {', '.join(categories)}\n\n"

    return markdown, title, authors, categories


def process_and_ingest_arxiv_paper(paper_id, additional_keywords):
    try:
        xml_content = fetch_arxiv_xml(paper_id)
        markdown, title, authors, categories = convert_xml_to_markdown(xml_content)

        keywords = f"arxiv,{','.join(categories)}"
        if additional_keywords:
            keywords += f",{additional_keywords}"

        add_media_with_keywords(
            url=f"https://arxiv.org/abs/{paper_id}",
            title=title,
            media_type='document',
            content=markdown,
            keywords=keywords,
            prompt='No prompt for arXiv papers',
            summary='arXiv paper ingested from XML',
            transcription_model='None',
            author=', '.join(authors),
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"arXiv paper '{title}' ingested successfully."
    except Exception as e:
        return f"Error processing arXiv paper: {str(e)}"

#
# End of Arxiv.py
####################################################################################################
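
A minimal usage sketch for the new module, assuming the repository root is on PYTHONPATH and the DB_Manager database has been initialized; the driver script, query string, and arXiv ID below are illustrative only and are not part of this commit:

# Hypothetical driver script exercising the new module
from App_Function_Libraries.Third_Party.Arxiv import (
    fetch_arxiv_pdf_url,
    process_and_ingest_arxiv_paper,
    search_arxiv,
)

# search_arxiv returns rows of [title, paper_id, authors, summary]
for title, paper_id, authors, _summary in search_arxiv("retrieval augmented generation"):
    print(f"{paper_id}: {title} ({authors})")

# Ingest a single paper into the media database, tagging it with extra keywords
print(process_and_ingest_arxiv_paper("2005.11401", "rag,nlp"))

# Resolve the direct PDF link (returns None on failure)
print(fetch_arxiv_pdf_url("2005.11401"))
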
App_Function_Libraries/Third_Party/__init__.py
ADDED
File without changes (an empty __init__.py, which marks Third_Party as an importable package)