import os
import requests
import tempfile
from datetime import datetime, timezone
import base64
from tqdm.auto import tqdm
import pymupdf

DAILY_PAPERS_API_URL = "https://huggingface.co/api/daily_papers"
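
# Each item returned by the daily_papers endpoint is expected to carry at
# least the fields this class reads: top-level 'title' and 'publishedAt',
# plus a nested 'paper' dict with 'id', 'title', and 'upvotes'.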

class PaperManager:
    def __init__(self, papers_per_page=30):
        self.papers_per_page = papers_per_page  # Accepted but not yet used below
        self.papers = []
        self.raw_papers = []  # Raw items as returned by the daily_papers API

    def calculate_rising_score(self, paper):
        """
        Calculate the rising score of a paper.
        This emphasizes recent upvotes and the rate of upvote accumulation.
        """
        upvotes = paper.get('paper', {}).get('upvotes', 0)
        published_at_str = paper.get('publishedAt', datetime.now(timezone.utc).isoformat())
        try:
            published_time = datetime.fromisoformat(published_at_str.replace('Z', '+00:00'))
        except ValueError:
            published_time = datetime.now(timezone.utc)

        time_diff = datetime.now(timezone.utc) - published_time
        time_diff_hours = time_diff.total_seconds() / 3600  # Convert time difference to hours

        # Rising score favors papers that are gaining upvotes quickly:
        # upvotes per hour since publication, with +1 to avoid division by
        # zero for brand-new papers (a hyperbolic, not linear, decay).
        score = upvotes / (time_diff_hours + 1)
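        # Example: a paper with 10 upvotes published 4 hours ago scores
        # 10 / (4 + 1) = 2.0; the same paper after 24 hours scores
        # 10 / 25 = 0.4.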
        return score

    def fetch_papers(self):
        try:
            response = requests.get(f"{DAILY_PAPERS_API_URL}?limit=100", timeout=30)
            response.raise_for_status()
            data = response.json()

            if not data:
                print("No data received from API.")
                return False

            self.raw_papers = data  # Store raw data

            return True

        except requests.RequestException as e:
            print(f"Error fetching papers: {e}")
            return False
        except Exception as e:
            print(f"Unexpected error: {e}")
            return False

    def filter_top_papers(self, threshold_general=2.0, threshold_agent=0.7):
        self.papers = []
        for paper in self.raw_papers:
            paper_score = self.calculate_rising_score(paper)
            # Threshold filtering is currently disabled; every paper passes
            # through, and the ranking below decides the final selection.
            # if paper_score >= threshold_general or ('agent' in paper['title'].lower() and paper_score >= threshold_agent):
            self.papers.append(paper)

        # Rank by rising score, boosting papers with 'agent' in the title 3x,
        # and keep only the top two.
        self.papers = sorted(
            self.papers,
            key=lambda x: self.calculate_rising_score(x) * (3 if 'agent' in x['title'].lower() else 1),
            reverse=True
        )[:2]
        return self.papers

    # Alternative (currently unused) fetcher that returns base64-encoded PDF
    # bytes; re-enabling it would require `import httpx` above.
    # def get_paper_content(self, paper_id):
    #     pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
    #     print("Processing paper:", pdf_url)
    #     client = httpx.Client(follow_redirects=True)
    #     response = client.get(pdf_url)

    #     # First verification - check if we got a valid PDF response
    #     if response.status_code != 200:
    #         raise Exception(f"Failed to fetch PDF: {response.status_code}")
        
    #     if not response.headers.get('content-type', '').startswith('application/pdf'):
    #         raise Exception(f"Unexpected content type: {response.headers.get('content-type')}")
        
    #     # Second verification - check the first few bytes of the content
    #     if not response.content.startswith(b'%PDF'):
    #         raise Exception("Content doesn't appear to be a valid PDF")
    
    #     pdf_data = base64.standard_b64encode(response.content).decode("utf-8")
    #     return {"pdf": pdf_data, "url": pdf_url}

    def get_paper_text(self, paper_id):
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        response = requests.get(url, timeout=60)

        if response.status_code != 200:
            raise Exception(f"Failed to download PDF: {response.status_code}")

        # Write to a proper temporary file instead of a fixed "temp.pdf" in
        # the working directory, and clean it up afterwards.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            f.write(response.content)
            pdf_path = f.name

        try:
            with pymupdf.open(pdf_path) as doc:
                text = "".join(page.get_text() for page in doc)
        finally:
            os.remove(pdf_path)
        return text
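
    # Note: pymupdf can also parse a PDF from memory via
    # pymupdf.open(stream=response.content, filetype="pdf"), which would
    # avoid touching the filesystem at all.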


    def get_top_content(self):
        # Bail out early if the API call failed; self.papers would otherwise
        # be built from stale or empty data.
        if not self.fetch_papers():
            return {}
        self.filter_top_papers()
        contents = {}
        print(f"Processing {len(self.papers)} papers:")
        for paper in tqdm(self.papers):
            paper_id = paper["paper"]["id"]
            contents[paper["paper"]["title"]] = self.get_paper_text(paper_id)
        return contents
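

# Minimal usage sketch (assumes network access to the Hugging Face and arXiv
# endpoints above; titles and text lengths depend on the live daily feed):
if __name__ == "__main__":
    manager = PaperManager()
    top_contents = manager.get_top_content()
    for title, text in top_contents.items():
        print(f"{title}: {len(text):,} characters extracted")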