open-notebooklm / papers.py
m-ric's picture
m-ric HF Staff
More fixes
a3761cd
import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
import pymupdf
# Hugging Face "daily papers" API endpoint; PaperManager.fetch_papers queries it.
DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
class PaperManager:
    """Fetch the Hugging Face daily papers, rank them and extract their text.

    Typical use is ``PaperManager().get_top_content()``, which chains
    ``fetch_papers`` -> ``filter_top_papers`` -> ``get_paper_text``.

    The code reads API items as dicts with the keys ``'paper'`` (a dict
    holding ``'id'``, ``'title'`` and ``'upvotes'``), ``'publishedAt'`` (an
    ISO-8601 string) and ``'title'``.
    NOTE(review): that shape is inferred from the keys accessed below --
    confirm against the API if it matters.
    """

    # Ranking multiplier for papers whose title contains "agent".
    AGENT_TITLE_BOOST = 3
    # Seconds of network inactivity tolerated before a request is aborted;
    # without a timeout ``requests`` may block forever on a stalled server.
    REQUEST_TIMEOUT = 30

    def __init__(self, papers_per_page=30):
        """Create an empty manager.

        Args:
            papers_per_page: Kept for backward compatibility. It is stored on
                the instance but no method of this class reads it.
        """
        self.papers_per_page = papers_per_page
        self.papers = []  # Ranked selection produced by filter_top_papers
        self.raw_papers = []  # To store fetched data

    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.

        This emphasizes recent upvotes and the rate of upvote accumulation:
        ``score = upvotes / (age_in_hours + 1)``.

        Missing, null or unparsable ``publishedAt`` values are treated as
        "published right now"; timestamps without a UTC offset are assumed to
        be UTC; timestamps in the future are clamped to an age of zero so the
        score can never be negative or divide by zero.

        Args:
            paper: One API item (dict).

        Returns:
            The score as a float (``0.0`` when the paper has no upvotes).
        """
        # ``or`` guards cover keys that are present but null.
        upvotes = (paper.get('paper') or {}).get('upvotes') or 0

        now = datetime.now(timezone.utc)
        published_time = now
        published_at_str = paper.get('publishedAt')
        if isinstance(published_at_str, str):
            try:
                # fromisoformat() rejects a trailing "Z" before Python 3.11.
                published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
            except ValueError:
                published_time = now
        if published_time.tzinfo is None:
            # Aware - naive subtraction raises TypeError; assume UTC instead.
            published_time = published_time.replace(tzinfo=timezone.utc)

        # Convert time difference to hours, never letting it go below zero.
        time_diff_hours = max((now - published_time).total_seconds() / 3600, 0.0)

        # Rising score favors papers that are gaining upvotes quickly
        # Adjusted to have a linear decay over time
        return upvotes / (time_diff_hours + 1)

    def fetch_papers(self):
        """Download up to 100 daily papers into ``self.raw_papers``.

        This is deliberately best-effort: every failure is printed and
        reported through the return value instead of being raised.

        Returns:
            True when a non-empty list was received and stored, else False
            (``self.raw_papers`` is then left untouched).
        """
        try:
            response = requests.get(
                DAILY_PAPERS_API_URL,
                params={"limit": 100},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
            if not data:
                print("No data received from API.")
                return False
            if not isinstance(data, list):
                # The ranking code iterates over paper dicts; anything else
                # would only fail later with a confusing error.
                print("Unexpected response format from API.")
                return False
            self.raw_papers = data  # Store raw data
            return True
        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            # Kept broad on purpose: this method promises never to raise.
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7, max_papers=2):
        """Rank ``self.raw_papers`` and keep the best ones in ``self.papers``.

        Papers are ordered by rising score, with the score multiplied by
        ``AGENT_TITLE_BOOST`` when the title contains "agent"
        (case-insensitive). The title is read from the top-level ``'title'``
        key, falling back to ``paper['paper']['title']``.

        Args:
            threshold_general: Currently unused -- see the NOTE below.
            threshold_agent: Currently unused -- see the NOTE below.
            max_papers: How many papers to keep (default 2, the previously
                hard-coded value). ``None`` keeps all of them.

        Returns:
            The selected papers, best first (also stored in ``self.papers``).
        """
        # NOTE: score-threshold filtering is disabled; every fetched paper is
        # a candidate. The original condition is kept for reference:
        # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):

        def ranking_key(paper):
            title = paper.get('title') or (paper.get('paper') or {}).get('title') or ''
            boost = self.AGENT_TITLE_BOOST if 'agent' in title.lower() else 1
            return self.calculate_rising_score(paper) * boost

        ranked = sorted(self.raw_papers, key=ranking_key, reverse=True)
        self.papers = ranked[:max_papers]
        return self.papers

    # Disabled alternative that returned the raw PDF as base64 instead of text
    # (it relies on ``httpx``):
    # def get_paper_content(self, paper_id):
    #     pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    #     print("Processing paper:", pdf_url)
    #     client = httpx.Client(follow_redirects=True)
    #     response = client.get(pdf_url)
    #     # First verification - check if we got a valid PDF response
    #     if response.status_code != 200:
    #         raise Exception(f"Failed to fetch PDF: {response.status_code}")
    #     if not response.headers.get('content-type', '').startswith('application/pdf'):
    #         raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
    #     # Second verification - check the first few bytes of the content
    #     if not response.content.startswith(b'%PDF'):
    #         raise Exception("Content doesn't appear to be a valid PDF")
    #     pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
    #     return {"pdf": pdf_data, "url": pdf_url}

    def get_paper_text(self, paper_id):
        """Download a paper's PDF from arXiv and return its plain text.

        Args:
            paper_id: arXiv identifier, inserted into
                ``https://arxiv.org/pdf/{paper_id}.pdf``.

        Returns:
            The text of all pages, concatenated in page order.

        Raises:
            Exception: If the HTTP status is not 200.
            requests.RequestException: On network failure or timeout.
        """
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Parse straight from memory. Writing to a fixed "temp.pdf" in the
        # working directory left a stray file behind, broke in read-only
        # directories and let concurrent runs overwrite each other's PDF.
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    def get_top_content(self):
        """Fetch, rank and extract the text of the top papers.

        If fetching fails, ``fetch_papers`` prints the reason and ranking
        proceeds on whatever ``self.raw_papers`` already holds (nothing on a
        fresh instance, so the result is then an empty dict).

        Returns:
            A dict mapping each selected paper's title to its full text.

        Raises:
            Exception: Propagated from ``get_paper_text`` when a PDF cannot
                be downloaded; papers processed so far are not returned.
        """
        self.fetch_papers()
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]['id']
            contents[paper["paper"]['title']] = self.get_paper_text(paper_id)
        return contents