smokxy committed on
Commit
43ec9c8
·
1 Parent(s): f4c99e4

repository structuring

Browse files
.env CHANGED
@@ -1,14 +1,5 @@
1
- # Hugging Face
2
- HF_API_KEY=your_huggingface_key
3
-
4
  # Gemini
5
- GEMINI_API_KEY=your_gemini_key
6
-
7
- # Redis
8
- REDIS_HOST=localhost
9
- REDIS_PORT=6379
10
- REDIS_DB=0
11
 
12
- # App Config
13
- DAILY_PAPER_LIMIT=
14
- CACHE_TTL=86400
 
 
 
 
1
  # Gemini
2
+ GEMINI_API_KEY=YOUR_API_KEY
 
 
 
 
 
3
 
4
+ # MongoDB
5
+ MONGO_URI=your_mongo_uri
 
paperflux/main.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import signal
2
+ import sys
3
+ from src.scheduler.jobs import PaperProcessingScheduler
4
+ from src.web.app import PaperFluxUI
5
+ import threading
6
+
7
+ def signal_handler(signum, frame):
8
+ print("\nShutting down gracefully...")
9
+ scheduler.stop()
10
+ sys.exit(0)
11
+
12
+ def main():
13
+ global scheduler
14
+
15
+ # Set up signal handlers
16
+ signal.signal(signal.SIGINT, signal_handler)
17
+ signal.signal(signal.SIGTERM, signal_handler)
18
+
19
+ # Start the scheduler in a background thread
20
+ scheduler = PaperProcessingScheduler()
21
+ scheduler_thread = threading.Thread(target=scheduler.start, daemon=True)
22
+ scheduler_thread.start()
23
+
24
+ # Create and launch the Gradio interface
25
+ ui = PaperFluxUI()
26
+ interface = ui.create_interface()
27
+ interface.launch(server_name="0.0.0.0", share=True)
28
+
29
+ if __name__ == "__main__":
30
+ main()
paperflux/src/__pycache__/scheduler.cpython-311.pyc DELETED
Binary file (1.61 kB)
 
paperflux/src/__pycache__/tasks.cpython-311.pyc DELETED
Binary file (5.11 kB)
 
paperflux/src/app.py DELETED
@@ -1,18 +0,0 @@
1
- from threading import Thread
2
- from scheduler import start_scheduler
3
- from flask import Flask
4
-
5
- app = Flask(__name__)
6
-
7
-
8
- @app.route("/")
9
- def home():
10
- return "Welcome!"
11
-
12
-
13
- if __name__ == "__main__":
14
- # Run scheduler to fetch papers in a separate thread
15
- scheduler_thread = Thread(target=start_scheduler, daemon=True)
16
- scheduler_thread.start()
17
-
18
- app.run(debug=True, use_reloader=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/config/settings.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ MONGODB_URI = "mongodb+srv:"
4
+ DB_NAME = "papers_summary_database"
5
+ COLLECTION_NAME = "papers"
6
+ HF_API_URL = "https://huggingface.co/api/daily_papers"
7
+ PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
8
+ TEMP_DIR = "temp_papers"
9
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
paperflux/{__init__.py → src/models/__init__.py} RENAMED
File without changes
paperflux/src/models/paper.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from typing import List, Dict, Optional
3
+
4
+ class Paper:
5
+ def __init__(
6
+ self,
7
+ paper_id: str,
8
+ title: str,
9
+ authors: List[Dict],
10
+ summary: str,
11
+ published_at: str,
12
+ explanation: Optional[str] = None,
13
+ pdf_url: Optional[str] = None,
14
+ ):
15
+ self.paper_id = paper_id
16
+ self.title = title
17
+ self.authors = authors
18
+ self.summary = summary
19
+ self.published_at = published_at
20
+ self.explanation = explanation
21
+ self.pdf_url = pdf_url
22
+ self.processed_at = datetime.utcnow()
23
+
24
+ def to_dict(self) -> Dict:
25
+ return {
26
+ "paper_id": self.paper_id,
27
+ "title": self.title,
28
+ "authors": self.authors,
29
+ "summary": self.summary,
30
+ "published_at": self.published_at,
31
+ "explanation": self.explanation,
32
+ "pdf_url": self.pdf_url,
33
+ "processed_at": self.processed_at,
34
+ }
paperflux/src/scheduler.py DELETED
@@ -1,24 +0,0 @@
1
- import time
2
- import asyncio
3
- from apscheduler.schedulers.background import BackgroundScheduler
4
- from tasks import run_paper_fetch_job
5
- from datetime import datetime
6
-
7
-
8
- def job():
9
- print("Fetching papers 📝")
10
- asyncio.run(run_paper_fetch_job())
11
-
12
-
13
- def start_scheduler():
14
- scheduler = BackgroundScheduler()
15
- scheduler.add_job(job, "interval", hours=24, next_run_time=datetime.now())
16
- scheduler.start()
17
- print("Scheduler started. Running in background.")
18
-
19
- try:
20
- while True:
21
- time.sleep(60)
22
- except (KeyboardInterrupt, SystemExit):
23
- scheduler.shutdown()
24
- print("Scheduler stopped.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/{__init__.py → scheduler/__init__.py} RENAMED
File without changes
paperflux/src/scheduler/jobs.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from apscheduler.schedulers.background import BackgroundScheduler
3
+ from datetime import datetime
4
+ import os
5
+ from src.services.paper_fetcher import PaperFetcher
6
+ from src.services.paper_analyzer import PaperAnalyzer
7
+ from src.services.database import DatabaseService
8
+
9
+ class PaperProcessingScheduler:
10
+ def __init__(self):
11
+ self.scheduler = BackgroundScheduler()
12
+ self.fetcher = PaperFetcher()
13
+ self.analyzer = PaperAnalyzer()
14
+ self.db = DatabaseService()
15
+ self._running = False
16
+
17
+ async def process_papers(self):
18
+ if self._running:
19
+ print("Previous processing still running, skipping...")
20
+ return
21
+
22
+ self._running = True
23
+ print("Starting daily paper processing...")
24
+
25
+ try:
26
+ self.db.clear_collection()
27
+ papers = await self.fetcher.fetch_papers()
28
+
29
+ for paper in papers:
30
+ if not self._running: # Check if we should stop
31
+ break
32
+
33
+ pdf_path = await self.fetcher.download_paper(paper)
34
+ if pdf_path:
35
+ try:
36
+ explanation = self.analyzer.analyze_paper(pdf_path)
37
+ paper_obj = self.fetcher.parse_paper_data(paper)
38
+ paper_obj.explanation = explanation
39
+ self.db.insert_paper(paper_obj)
40
+ finally:
41
+ if os.path.exists(pdf_path):
42
+ os.remove(pdf_path)
43
+
44
+ except Exception as e:
45
+ print(f"Error in paper processing: {str(e)}")
46
+ finally:
47
+ self._running = False
48
+
49
+ def start(self):
50
+ self.scheduler.add_job(
51
+ lambda: asyncio.run(self.process_papers()),
52
+ 'cron',
53
+ hour=0,
54
+ minute=0,
55
+ next_run_time=datetime.now()
56
+ )
57
+ self.scheduler.start()
58
+
59
+ def stop(self):
60
+ self._running = False
61
+ self.scheduler.shutdown()
paperflux/src/services/database.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ from src.config.settings import MONGODB_URI, DB_NAME, COLLECTION_NAME
3
+ from src.models.paper import Paper
4
+
5
+ class DatabaseService:
6
+ def __init__(self):
7
+ self.client = MongoClient(MONGODB_URI)
8
+ self.db = self.client[DB_NAME]
9
+ self.collection = self.db[COLLECTION_NAME]
10
+
11
+ def clear_collection(self):
12
+ self.collection.delete_many({})
13
+
14
+ def insert_paper(self, paper: Paper):
15
+ return self.collection.insert_one(paper.to_dict())
16
+
17
+ def get_all_papers(self):
18
+ return list(self.collection.find())
19
+
20
+ def get_paper_by_id(self, paper_id: str):
21
+ return self.collection.find_one({"paper_id": paper_id})
paperflux/src/services/paper_analyzer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
3
+ from src.config.settings import GEMINI_API_KEY
4
+
5
+ class PaperAnalyzer:
6
+ def __init__(self):
7
+ genai.configure(api_key=GEMINI_API_KEY)
8
+ self.model = genai.GenerativeModel("gemini-1.5-pro-latest")
9
+ self.safety_settings = {
10
+ HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
11
+ HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
12
+ HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
13
+ HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
14
+ }
15
+
16
+ def analyze_paper(self, pdf_path: str) -> str:
17
+ uploaded_file = genai.upload_file(pdf_path)
18
+ prompt = """Analyze this research paper thoroughly and provide:
19
+
20
+ # Paper Title
21
+ ## Core Contribution
22
+ ## Technical Breakdown
23
+ - Detailed mathematical concepts and intuition with in depth explanation
24
+ - Key algorithms and methodologies
25
+ ## Visual Analysis
26
+ ## Critical Assessment
27
+ ## Potential Applications
28
+
29
+ Include detailed mathematical expressions and thorough explanations."""
30
+
31
+ response = self.model.generate_content(
32
+ [prompt, uploaded_file],
33
+ safety_settings=self.safety_settings,
34
+ generation_config={"temperature": 0.2},
35
+ )
36
+
37
+ genai.delete_file(uploaded_file.name)
38
+ return response.text
paperflux/src/services/paper_fetcher.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiohttp
3
+ import asyncio
4
+ from datetime import datetime
5
+ from typing import List, Tuple, Optional
6
+ from src.config.settings import HF_API_URL, PDF_BASE_URL, TEMP_DIR
7
+ from src.models.paper import Paper
8
+
9
+ class PaperFetcher:
10
+ def __init__(self):
11
+ os.makedirs(TEMP_DIR, exist_ok=True)
12
+
13
+ async def fetch_papers(self) -> List[dict]:
14
+ """Fetch daily papers from the Hugging Face API."""
15
+ async with aiohttp.ClientSession() as session:
16
+ async with session.get(HF_API_URL) as response:
17
+ if response.status == 200:
18
+ papers = await response.json()
19
+ print(f"Found {len(papers)} papers")
20
+ return papers
21
+ raise Exception(f"API request failed: {response.status}")
22
+
23
+ async def download_paper(self, paper_entry: dict) -> Optional[str]:
24
+ """
25
+ Download a single paper's PDF.
26
+ Returns the path to the downloaded PDF or None if download failed.
27
+ """
28
+ try:
29
+ paper_id = paper_entry["paper"]["id"]
30
+ pdf_url = PDF_BASE_URL.format(id=paper_id)
31
+ clean_id = paper_id.replace("/", "_")
32
+ filename = f"{datetime.now().date()}_{clean_id}.pdf"
33
+ filepath = os.path.join(TEMP_DIR, filename)
34
+
35
+ async with aiohttp.ClientSession() as session:
36
+ async with session.get(pdf_url) as response:
37
+ if response.status == 200:
38
+ content = await response.read()
39
+ with open(filepath, "wb") as f:
40
+ f.write(content)
41
+ print(f"Successfully downloaded: {paper_id}")
42
+ return filepath
43
+ print(f"Failed to download {paper_id}: HTTP {response.status}")
44
+ return None
45
+
46
+ except Exception as e:
47
+ print(f"Error downloading {paper_id}: {str(e)}")
48
+ return None
49
+
50
+ async def download_all_papers(self, papers: List[dict]) -> List[Tuple[str, bool]]:
51
+ """Download all papers in parallel."""
52
+ async with aiohttp.ClientSession() as session:
53
+ tasks = []
54
+ for paper in papers:
55
+ paper_id = paper["paper"]["id"]
56
+ pdf_url = PDF_BASE_URL.format(id=paper_id)
57
+ clean_id = paper_id.replace("/", "_")
58
+ filename = f"{datetime.now().date()}_{clean_id}.pdf"
59
+ filepath = os.path.join(TEMP_DIR, filename)
60
+
61
+ tasks.append(self.download_single_paper(session, paper_id, pdf_url, filepath))
62
+
63
+ results = await asyncio.gather(*tasks)
64
+ successful = sum(1 for status in results if status[1])
65
+ print(f"Downloaded {successful}/{len(papers)} papers successfully")
66
+ return results
67
+
68
+ async def download_single_paper(
69
+ self,
70
+ session: aiohttp.ClientSession,
71
+ paper_id: str,
72
+ pdf_url: str,
73
+ filepath: str
74
+ ) -> Tuple[str, bool]:
75
+ """Download a single paper with the given session."""
76
+ try:
77
+ async with session.get(pdf_url) as response:
78
+ if response.status == 200:
79
+ content = await response.read()
80
+ with open(filepath, "wb") as f:
81
+ f.write(content)
82
+ return (paper_id, True)
83
+ return (paper_id, False)
84
+ except Exception as e:
85
+ print(f"Error downloading {paper_id}: {str(e)}")
86
+ return (paper_id, False)
87
+
88
+ def parse_paper_data(self, paper_entry: dict) -> Paper:
89
+ """Convert raw paper data to Paper model."""
90
+ paper_data = paper_entry["paper"]
91
+ return Paper(
92
+ paper_id=paper_data["id"],
93
+ title=paper_data["title"],
94
+ authors=paper_data["authors"],
95
+ summary=paper_data["summary"],
96
+ published_at=paper_data["publishedAt"],
97
+ pdf_url=PDF_BASE_URL.format(id=paper_data["id"])
98
+ )
paperflux/src/tasks.py DELETED
@@ -1,56 +0,0 @@
1
- import asyncio
2
- import aiohttp
3
- from pymongo import MongoClient
4
- from datetime import datetime, timezone
5
- import os
6
-
7
- API_URL = "https://huggingface.co/api/daily_papers"
8
- PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
9
-
10
- client = MongoClient("mongodb://localhost:27017/")
11
- db = client["papers_summary_database"]
12
- collection = db["papers"]
13
-
14
-
15
- async def fetch_papers(session):
16
- async with session.get(API_URL) as response:
17
- if response.status == 200:
18
- return await response.json()
19
- raise Exception(f"API request failed: {response.status}")
20
-
21
-
22
- async def download_pdf(session, paper_entry):
23
- try:
24
- paper_id = paper_entry["paper"]["id"]
25
- pdf_url = PDF_BASE_URL.format(id=paper_id)
26
-
27
- async with session.get(pdf_url) as response:
28
- if response.status == 200:
29
- content = await response.read()
30
- os.makedirs("pdfs", exist_ok=True)
31
- with open(f"pdfs/{paper_id}", "wb") as f:
32
- f.write(content)
33
-
34
- return (paper_id, True)
35
-
36
- return (paper_id, False)
37
- except Exception as e:
38
- print(f"Error downloading {paper_id}: {str(e)}")
39
- return (paper_id, False)
40
-
41
-
42
- async def run_paper_fetch_job():
43
- async with aiohttp.ClientSession() as session:
44
- papers = await fetch_papers(session)
45
- tasks = []
46
-
47
- for paper in papers:
48
- paper_data = paper["paper"]
49
- paper_data["fetchedAt"] = datetime.now(timezone.utc).isoformat()
50
- collection.insert_one(paper_data)
51
-
52
- tasks = [download_pdf(session, paper) for paper in papers]
53
- results = await asyncio.gather(*tasks)
54
-
55
- successful = sum(1 for _, status in results if status)
56
- print(f"Downloaded {successful}/{len(papers)} papers successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/web/__init__.py ADDED
File without changes
paperflux/src/web/app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.services.database import DatabaseService
3
+
4
+ class PaperFluxUI:
5
+ def __init__(self):
6
+ self.db = DatabaseService()
7
+ self.papers = self.db.get_all_papers()
8
+ self.current_index = 0
9
+
10
+ def get_current_paper(self):
11
+ if not self.papers:
12
+ return {
13
+ "title": "No papers available",
14
+ "explanation": "Please wait for papers to be processed.",
15
+ "pdf_url": ""
16
+ }
17
+ paper = self.papers[self.current_index]
18
+ authors = ", ".join([author["name"] for author in paper["authors"]])
19
+ title = f"# {paper['title']}\n\nAuthors: {authors}"
20
+ return {
21
+ "title": title,
22
+ "explanation": paper["explanation"],
23
+ "pdf_url": paper["pdf_url"]
24
+ }
25
+
26
+ def next_paper(self):
27
+ if self.current_index < len(self.papers) - 1:
28
+ self.current_index += 1
29
+ return self.get_current_paper()
30
+
31
+ def previous_paper(self):
32
+ if self.current_index > 0:
33
+ self.current_index -= 1
34
+ return self.get_current_paper()
35
+
36
+ def create_interface(self):
37
+ with gr.Blocks(theme=gr.themes.Base()) as interface:
38
+ title = gr.Markdown()
39
+ explanation = gr.Markdown()
40
+
41
+ # Create an HTML component for the download link
42
+ download_html = gr.HTML()
43
+
44
+ with gr.Row():
45
+ prev_btn = gr.Button("Previous Paper")
46
+ next_btn = gr.Button("Next Paper")
47
+
48
+ def update_ui(paper_data):
49
+ download_link = f"""
50
+ <div style="text-align: center; margin-top: 10px;">
51
+ <a href="{paper_data['pdf_url']}" target="_blank"
52
+ style="text-decoration: none;">
53
+ <button style="padding: 10px 20px; background-color: #4CAF50;
54
+ color: white; border: none; border-radius: 5px;
55
+ cursor: pointer;">
56
+ Download Paper
57
+ </button>
58
+ </a>
59
+ </div>
60
+ """
61
+ return (
62
+ paper_data["title"],
63
+ paper_data["explanation"],
64
+ download_link
65
+ )
66
+
67
+ next_btn.click(
68
+ fn=lambda: update_ui(self.next_paper()),
69
+ outputs=[title, explanation, download_html]
70
+ )
71
+
72
+ prev_btn.click(
73
+ fn=lambda: update_ui(self.previous_paper()),
74
+ outputs=[title, explanation, download_html]
75
+ )
76
+
77
+ # Initialize with first paper
78
+ paper_data = self.get_current_paper()
79
+ init_download_link = f"""
80
+ <div style="text-align: center; margin-top: 10px;">
81
+ <a href="{paper_data['pdf_url']}" target="_blank"
82
+ style="text-decoration: none;">
83
+ <button style="padding: 10px 20px; background-color: #4CAF50;
84
+ color: white; border: none; border-radius: 5px;
85
+ cursor: pointer;">
86
+ Download Paper
87
+ </button>
88
+ </a>
89
+ </div>
90
+ """
91
+ title.value = paper_data["title"]
92
+ explanation.value = paper_data["explanation"]
93
+ download_html.value = init_download_link
94
+
95
+ return interface
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -3,7 +3,8 @@ name = "paperflux"
3
  version = "0.1.0"
4
  description = ""
5
  authors = [
6
- {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"}
 
7
  ]
8
  license = {text = "MIT"}
9
  readme = "README.md"
@@ -21,7 +22,8 @@ dependencies = [
21
  "markdown (>=3.7,<4.0)",
22
  "pymongo (>=4.11.1,<5.0.0)",
23
  "flask (>=3.1.0,<4.0.0)",
24
- "tqdm (>=4.67.1,<5.0.0)"
 
25
  ]
26
 
27
 
 
3
  version = "0.1.0"
4
  description = ""
5
  authors = [
6
+ {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"},
7
+ {name = "Vector73",email = "v.shm.kunal@gmail.com"}
8
  ]
9
  license = {text = "MIT"}
10
  readme = "README.md"
 
22
  "markdown (>=3.7,<4.0)",
23
  "pymongo (>=4.11.1,<5.0.0)",
24
  "flask (>=3.1.0,<4.0.0)",
25
+ "tqdm (>=4.67.1,<5.0.0)",
26
+ "aiohttp (>=3.11.12,<4.0.0)"
27
  ]
28
 
29