# Hugging Face Spaces app — runs on ZeroGPU hardware.
# Standard library
import base64
import os
import tempfile
from datetime import datetime, timezone

# Third party
import pymupdf
import requests
from tqdm.auto import tqdm

# Hugging Face daily-papers listing endpoint (https://huggingface.co/papers).
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score, and
    extract full text of the top entries from their arXiv PDFs.

    Typical use: ``PaperManager().get_top_content()``.
    """

    def __init__(self, papers_per_page=30):
        # Stored for interface compatibility; pagination is not implemented
        # in this class yet (previously the argument was silently dropped).
        self.papers_per_page = papers_per_page
        self.papers = []       # ranked top subset, set by filter_top_papers()
        self.raw_papers = []   # unfiltered API payload, set by fetch_papers()

    def calculate_rising_score(self, paper):
        """Return the paper's rising score: upvotes / (age_in_hours + 1).

        Emphasizes recent upvotes and the rate of upvote accumulation
        (linear decay with age). Falls back to "now" — i.e. a score of
        roughly the raw upvote count — when the publication timestamp is
        missing or malformed.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            # API timestamps may end in 'Z'; fromisoformat (pre-3.11)
            # only accepts an explicit numeric offset.
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)

        age_hours = (datetime.now(timezone.utc) - published_time).total_seconds() / 3600
        # +1 avoids division by zero and caps brand-new papers at ~upvotes.
        return upvotes / (age_hours + 1)

    def fetch_papers(self):
        """Populate ``self.raw_papers`` from the daily-papers API.

        Returns True on success, False on any failure (error printed).
        """
        try:
            # timeout prevents an unresponsive API from hanging the app.
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False
        if not data:
            print("No data received from API.")
            return False
        self.raw_papers = data
        return True

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank ``self.raw_papers`` by rising score and keep the top 2.

        Titles containing 'agent' get a 3x score boost. The threshold
        parameters are currently unused (minimum-score filtering is
        disabled) but kept for interface compatibility.
        """
        def boosted_score(paper):
            boost = 3 if 'agent' in paper['title'].lower() else 1
            return self.calculate_rising_score(paper) * boost

        self.papers = sorted(self.raw_papers, key=boosted_score, reverse=True)[:2]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for ``paper_id`` and return its plain text.

        Raises:
            Exception: if the download returns a non-200 status.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")
        # Parse directly from memory instead of writing a shared "temp.pdf"
        # in the CWD (race-prone between instances and never cleaned up).
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    def get_top_content(self):
        """Fetch, rank, and return ``{title: full_text}`` for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            meta = paper["paper"]
            contents[meta['title']] = self.get_paper_text(meta['id'])
        return contents