import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
import pymupdf

DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"


class PaperManager:
    """Fetch Hugging Face daily papers, rank them by a "rising" score,
    and extract the full text of the top entries from their arXiv PDFs.
    """

    def __init__(self, papers_per_page=30):
        # NOTE(review): papers_per_page is not read by any method here;
        # stored and kept in the signature for backward compatibility.
        self.papers_per_page = papers_per_page
        self.papers = []      # ranked subset, filled by filter_top_papers()
        self.raw_papers = []  # raw API payload, filled by fetch_papers()

    def calculate_rising_score(self, paper):
        """Return a recency-weighted score: upvotes / (age_in_hours + 1).

        Papers gaining upvotes quickly after publication score highest;
        the +1 avoids division by zero for brand-new papers. A missing or
        malformed timestamp falls back to "now" (score ~= upvotes).
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get(
            'publishedAt', datetime.now(timezone.utc).isoformat()
        )
        try:
            # The API emits a trailing 'Z'; fromisoformat() needs an
            # explicit offset on older Python versions.
            published_time = datetime.fromisoformat(
                published_at_str.replace('Z', '+00:00')
            )
        except ValueError:
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600
        # Linear decay over time: favors papers accumulating upvotes fast.
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Populate self.raw_papers from the daily-papers API.

        Returns True on success, False on any network/parse failure
        (errors are printed, never raised, so callers can degrade
        gracefully).
        """
        try:
            # timeout added so a stalled connection cannot hang forever
            response = requests.get(
                f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30
            )
            response.raise_for_status()
            data = response.json()
            if not data:
                print("No data received from API.")
                return False
            self.raw_papers = data  # Store raw data
            return True
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        """Rank self.raw_papers and keep the top 2 in self.papers.

        Titles containing 'agent' get a 3x boost in the ranking. The
        score thresholds are currently disabled (the filter below is
        commented out), so every fetched paper competes; the threshold
        parameters are retained for backward compatibility.
        """
        self.papers = list(self.raw_papers)
        # Previously filtered by rising score before ranking; re-enable
        # if the thresholds should apply again:
        # self.papers = [
        #     p for p in self.raw_papers
        #     if self.calculate_rising_score(p) >= threshold_general
        #     or ('agent' in p['title'].lower()
        #         and self.calculate_rising_score(p) >= threshold_agent)
        # ]
        self.papers = sorted(
            self.papers,
            key=lambda x: self.calculate_rising_score(x)
            * (3 if 'agent' in x['title'].lower() else 1),
            reverse=True,
        )[:2]
        return self.papers

    def get_paper_text(self, paper_id):
        """Download the arXiv PDF for *paper_id* and return its full text.

        Raises Exception when the download does not return HTTP 200.
        The PDF goes through a temporary file (removed afterwards)
        because pymupdf needs a seekable source.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Use a real temp file instead of clobbering ./temp.pdf in the
        # working directory (also avoids leftover files and races
        # between concurrent runs). delete=False so the file can be
        # reopened by name on all platforms, then removed explicitly.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            f.write(response.content)
            tmp_path = f.name
        try:
            with pymupdf.open(tmp_path) as doc:
                # join() instead of repeated += (quadratic on big PDFs)
                return "".join(page.get_text() for page in doc)
        finally:
            os.remove(tmp_path)

    def get_top_content(self):
        """Fetch, rank, and return {title: full_text} for the top papers."""
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents