repository structuring

Files changed:
- .env +3 -12
- paperflux/main.py +30 -0
- paperflux/src/__pycache__/scheduler.cpython-311.pyc +0 -0
- paperflux/src/__pycache__/tasks.cpython-311.pyc +0 -0
- paperflux/src/app.py +0 -18
- paperflux/src/config/settings.py +9 -0
- paperflux/{__init__.py → src/models/__init__.py} +0 -0
- paperflux/src/models/paper.py +34 -0
- paperflux/src/scheduler.py +0 -24
- paperflux/src/{__init__.py → scheduler/__init__.py} +0 -0
- paperflux/src/scheduler/jobs.py +61 -0
- paperflux/src/services/database.py +21 -0
- paperflux/src/services/paper_analyzer.py +38 -0
- paperflux/src/services/paper_fetcher.py +98 -0
- paperflux/src/tasks.py +0 -56
- paperflux/src/web/__init__.py +0 -0
- paperflux/src/web/app.py +95 -0
- poetry.lock +0 -0
- pyproject.toml +4 -2
.env
CHANGED
@@ -1,14 +1,5 @@
-# Hugging Face
-HF_API_KEY=your_huggingface_key
-
 # Gemini
-GEMINI_API_KEY=
-
-# Redis
-REDIS_HOST=localhost
-REDIS_PORT=6379
-REDIS_DB=0
+GEMINI_API_KEY=YOUR_API_KEY
 
-#
-
-CACHE_TTL=86400
+
+# MongoDB
+MONGO_URI=your_mongo_uri

paperflux/main.py
ADDED
@@ -0,0 +1,30 @@
+import signal
+import sys
+from src.scheduler.jobs import PaperProcessingScheduler
+from src.web.app import PaperFluxUI
+import threading
+
+def signal_handler(signum, frame):
+    print("\nShutting down gracefully...")
+    scheduler.stop()
+    sys.exit(0)
+
+def main():
+    global scheduler
+
+    # Set up signal handlers
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    # Start the scheduler in a background thread
+    scheduler = PaperProcessingScheduler()
+    scheduler_thread = threading.Thread(target=scheduler.start, daemon=True)
+    scheduler_thread.start()
+
+    # Create and launch the Gradio interface
+    ui = PaperFluxUI()
+    interface = ui.create_interface()
+    interface.launch(server_name="0.0.0.0", share=True)
+
+if __name__ == "__main__":
+    main()

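A note on the wiring above: `signal_handler` reads a module-level `scheduler` that is only assigned inside `main()`, so a signal arriving before that assignment would raise a `NameError`. A sketch of the same shutdown without the global, using a closure (illustrative only, not part of the commit):

import signal

def make_signal_handler(sched):
    # Capture the scheduler in a closure instead of relying on a global
    def handler(signum, frame):
        print("\nShutting down gracefully...")
        sched.stop()
        raise SystemExit(0)
    return handler

# inside main(), after scheduler = PaperProcessingScheduler():
# signal.signal(signal.SIGINT, make_signal_handler(scheduler))
# signal.signal(signal.SIGTERM, make_signal_handler(scheduler))

Also worth noting: APScheduler's `BackgroundScheduler.start()` returns immediately and runs jobs in its own thread pool, so the extra `threading.Thread` wrapper here is harmless but not strictly required.
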
paperflux/src/__pycache__/scheduler.cpython-311.pyc
DELETED
Binary file (1.61 kB)

paperflux/src/__pycache__/tasks.cpython-311.pyc
DELETED
Binary file (5.11 kB)

paperflux/src/app.py
DELETED
@@ -1,18 +0,0 @@
-from threading import Thread
-from scheduler import start_scheduler
-from flask import Flask
-
-app = Flask(__name__)
-
-
-@app.route("/")
-def home():
-    return "Welcome!"
-
-
-if __name__ == "__main__":
-    # Run scheduler to fetch papers in a separate thread
-    scheduler_thread = Thread(target=start_scheduler, daemon=True)
-    scheduler_thread.start()
-
-    app.run(debug=True, use_reloader=False)

paperflux/src/config/settings.py
ADDED
@@ -0,0 +1,9 @@
+import os
+
+MONGODB_URI = "mongodb+srv:"
+DB_NAME = "papers_summary_database"
+COLLECTION_NAME = "papers"
+HF_API_URL = "https://huggingface.co/api/daily_papers"
+PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
+TEMP_DIR = "temp_papers"
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

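`GEMINI_API_KEY` is read from the environment, so the values in `.env` need to be loaded before `src.config.settings` is imported. A minimal sketch using python-dotenv (an assumption; python-dotenv is not among the dependencies in the `pyproject.toml` diff below). The hard-coded `"mongodb+srv:"` stub looks like a redacted placeholder, with the real connection string presumably supplied via the `MONGO_URI` entry added to `.env` above.

# Sketch: populate os.environ from .env before settings is imported
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into os.environ

from src.config import settings

assert settings.GEMINI_API_KEY is not None
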
paperflux/{__init__.py → src/models/__init__.py}
RENAMED
File without changes

paperflux/src/models/paper.py
ADDED
@@ -0,0 +1,34 @@
+from datetime import datetime
+from typing import List, Dict, Optional
+
+class Paper:
+    def __init__(
+        self,
+        paper_id: str,
+        title: str,
+        authors: List[Dict],
+        summary: str,
+        published_at: str,
+        explanation: Optional[str] = None,
+        pdf_url: Optional[str] = None,
+    ):
+        self.paper_id = paper_id
+        self.title = title
+        self.authors = authors
+        self.summary = summary
+        self.published_at = published_at
+        self.explanation = explanation
+        self.pdf_url = pdf_url
+        self.processed_at = datetime.utcnow()
+
+    def to_dict(self) -> Dict:
+        return {
+            "paper_id": self.paper_id,
+            "title": self.title,
+            "authors": self.authors,
+            "summary": self.summary,
+            "published_at": self.published_at,
+            "explanation": self.explanation,
+            "pdf_url": self.pdf_url,
+            "processed_at": self.processed_at,
+        }

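A quick round-trip sketch of the `Paper` model with made-up values (the id, title, and author below are illustrative):

from src.models.paper import Paper

paper = Paper(
    paper_id="2502.00000",  # hypothetical arXiv-style id
    title="An Example Paper",
    authors=[{"name": "Ada Lovelace"}],
    summary="One-sentence abstract.",
    published_at="2025-02-17T00:00:00.000Z",
)
doc = paper.to_dict()  # plain dict, ready for collection.insert_one()
assert doc["paper_id"] == "2502.00000"

One caveat: `datetime.utcnow()` is deprecated as of Python 3.12 in favor of `datetime.now(timezone.utc)`, which also yields a timezone-aware timestamp.
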
paperflux/src/scheduler.py
DELETED
@@ -1,24 +0,0 @@
-import time
-import asyncio
-from apscheduler.schedulers.background import BackgroundScheduler
-from tasks import run_paper_fetch_job
-from datetime import datetime
-
-
-def job():
-    print("Fetching papers 📝")
-    asyncio.run(run_paper_fetch_job())
-
-
-def start_scheduler():
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(job, "interval", hours=24, next_run_time=datetime.now())
-    scheduler.start()
-    print("Scheduler started. Running in background.")
-
-    try:
-        while True:
-            time.sleep(60)
-    except (KeyboardInterrupt, SystemExit):
-        scheduler.shutdown()
-        print("Scheduler stopped.")

paperflux/src/{__init__.py → scheduler/__init__.py}
RENAMED
File without changes

paperflux/src/scheduler/jobs.py
ADDED
@@ -0,0 +1,61 @@
+import asyncio
+from apscheduler.schedulers.background import BackgroundScheduler
+from datetime import datetime
+import os
+from src.services.paper_fetcher import PaperFetcher
+from src.services.paper_analyzer import PaperAnalyzer
+from src.services.database import DatabaseService
+
+class PaperProcessingScheduler:
+    def __init__(self):
+        self.scheduler = BackgroundScheduler()
+        self.fetcher = PaperFetcher()
+        self.analyzer = PaperAnalyzer()
+        self.db = DatabaseService()
+        self._running = False
+
+    async def process_papers(self):
+        if self._running:
+            print("Previous processing still running, skipping...")
+            return
+
+        self._running = True
+        print("Starting daily paper processing...")
+
+        try:
+            self.db.clear_collection()
+            papers = await self.fetcher.fetch_papers()
+
+            for paper in papers:
+                if not self._running:  # Check if we should stop
+                    break
+
+                pdf_path = await self.fetcher.download_paper(paper)
+                if pdf_path:
+                    try:
+                        explanation = self.analyzer.analyze_paper(pdf_path)
+                        paper_obj = self.fetcher.parse_paper_data(paper)
+                        paper_obj.explanation = explanation
+                        self.db.insert_paper(paper_obj)
+                    finally:
+                        if os.path.exists(pdf_path):
+                            os.remove(pdf_path)
+
+        except Exception as e:
+            print(f"Error in paper processing: {str(e)}")
+        finally:
+            self._running = False
+
+    def start(self):
+        self.scheduler.add_job(
+            lambda: asyncio.run(self.process_papers()),
+            'cron',
+            hour=0,
+            minute=0,
+            next_run_time=datetime.now()
+        )
+        self.scheduler.start()
+
+    def stop(self):
+        self._running = False
+        self.scheduler.shutdown()

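Because `process_papers` is a coroutine while `BackgroundScheduler` is synchronous, the job wraps it in `asyncio.run`, which spins up a fresh event loop on each firing; `next_run_time=datetime.now()` additionally fires the job once at startup, on top of the daily midnight cron run. The same method can be driven by hand for a one-off pass, e.g. when testing (a sketch, assuming the services can reach their backends):

import asyncio
from src.scheduler.jobs import PaperProcessingScheduler

scheduler = PaperProcessingScheduler()
asyncio.run(scheduler.process_papers())  # single pass, no cron trigger
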
paperflux/src/services/database.py
ADDED
@@ -0,0 +1,21 @@
+from pymongo import MongoClient
+from src.config.settings import MONGODB_URI, DB_NAME, COLLECTION_NAME
+from src.models.paper import Paper
+
+class DatabaseService:
+    def __init__(self):
+        self.client = MongoClient(MONGODB_URI)
+        self.db = self.client[DB_NAME]
+        self.collection = self.db[COLLECTION_NAME]
+
+    def clear_collection(self):
+        self.collection.delete_many({})
+
+    def insert_paper(self, paper: Paper):
+        return self.collection.insert_one(paper.to_dict())
+
+    def get_all_papers(self):
+        return list(self.collection.find())
+
+    def get_paper_by_id(self, paper_id: str):
+        return self.collection.find_one({"paper_id": paper_id})

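Usage is straightforward; a short sketch (the id is hypothetical):

from src.services.database import DatabaseService

db = DatabaseService()
print(len(db.get_all_papers()))           # raw MongoDB documents
paper = db.get_paper_by_id("2502.00000")  # hypothetical id; None if absent

Note that `clear_collection` wipes the entire collection; the scheduler calls it at the top of each daily run, so the store only ever holds the latest batch of papers.
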
paperflux/src/services/paper_analyzer.py
ADDED
@@ -0,0 +1,38 @@
+import google.generativeai as genai
+from google.generativeai.types import HarmCategory, HarmBlockThreshold
+from src.config.settings import GEMINI_API_KEY
+
+class PaperAnalyzer:
+    def __init__(self):
+        genai.configure(api_key=GEMINI_API_KEY)
+        self.model = genai.GenerativeModel("gemini-1.5-pro-latest")
+        self.safety_settings = {
+            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+        }
+
+    def analyze_paper(self, pdf_path: str) -> str:
+        uploaded_file = genai.upload_file(pdf_path)
+        prompt = """Analyze this research paper thoroughly and provide:
+
+# Paper Title
+## Core Contribution
+## Technical Breakdown
+- Detailed mathematical concepts and intuition with in depth explanation
+- Key algorithms and methodologies
+## Visual Analysis
+## Critical Assessment
+## Potential Applications
+
+Include detailed mathematical expressions and thorough explanations."""
+
+        response = self.model.generate_content(
+            [prompt, uploaded_file],
+            safety_settings=self.safety_settings,
+            generation_config={"temperature": 0.2},
+        )
+
+        genai.delete_file(uploaded_file.name)
+        return response.text

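The analyzer uploads the PDF through the Gemini File API, prompts for a structured markdown report, and deletes the upload afterwards. A usage sketch (the path is hypothetical, and a valid `GEMINI_API_KEY` must be set):

from src.services.paper_analyzer import PaperAnalyzer

analyzer = PaperAnalyzer()
report = analyzer.analyze_paper("temp_papers/example.pdf")  # hypothetical file
print(report[:300])  # markdown following the headings in the prompt

One thing to watch: if `generate_content` raises, the `delete_file` cleanup is never reached and the uploaded file is left behind.
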
paperflux/src/services/paper_fetcher.py
ADDED
@@ -0,0 +1,98 @@
+import os
+import aiohttp
+import asyncio
+from datetime import datetime
+from typing import List, Tuple, Optional
+from src.config.settings import HF_API_URL, PDF_BASE_URL, TEMP_DIR
+from src.models.paper import Paper
+
+class PaperFetcher:
+    def __init__(self):
+        os.makedirs(TEMP_DIR, exist_ok=True)
+
+    async def fetch_papers(self) -> List[dict]:
+        """Fetch daily papers from the Hugging Face API."""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(HF_API_URL) as response:
+                if response.status == 200:
+                    papers = await response.json()
+                    print(f"Found {len(papers)} papers")
+                    return papers
+                raise Exception(f"API request failed: {response.status}")
+
+    async def download_paper(self, paper_entry: dict) -> Optional[str]:
+        """
+        Download a single paper's PDF.
+        Returns the path to the downloaded PDF or None if download failed.
+        """
+        try:
+            paper_id = paper_entry["paper"]["id"]
+            pdf_url = PDF_BASE_URL.format(id=paper_id)
+            clean_id = paper_id.replace("/", "_")
+            filename = f"{datetime.now().date()}_{clean_id}.pdf"
+            filepath = os.path.join(TEMP_DIR, filename)
+
+            async with aiohttp.ClientSession() as session:
+                async with session.get(pdf_url) as response:
+                    if response.status == 200:
+                        content = await response.read()
+                        with open(filepath, "wb") as f:
+                            f.write(content)
+                        print(f"Successfully downloaded: {paper_id}")
+                        return filepath
+                    print(f"Failed to download {paper_id}: HTTP {response.status}")
+                    return None
+
+        except Exception as e:
+            print(f"Error downloading {paper_id}: {str(e)}")
+            return None
+
+    async def download_all_papers(self, papers: List[dict]) -> List[Tuple[str, bool]]:
+        """Download all papers in parallel."""
+        async with aiohttp.ClientSession() as session:
+            tasks = []
+            for paper in papers:
+                paper_id = paper["paper"]["id"]
+                pdf_url = PDF_BASE_URL.format(id=paper_id)
+                clean_id = paper_id.replace("/", "_")
+                filename = f"{datetime.now().date()}_{clean_id}.pdf"
+                filepath = os.path.join(TEMP_DIR, filename)
+
+                tasks.append(self.download_single_paper(session, paper_id, pdf_url, filepath))
+
+            results = await asyncio.gather(*tasks)
+            successful = sum(1 for status in results if status[1])
+            print(f"Downloaded {successful}/{len(papers)} papers successfully")
+            return results
+
+    async def download_single_paper(
+        self,
+        session: aiohttp.ClientSession,
+        paper_id: str,
+        pdf_url: str,
+        filepath: str
+    ) -> Tuple[str, bool]:
+        """Download a single paper with the given session."""
+        try:
+            async with session.get(pdf_url) as response:
+                if response.status == 200:
+                    content = await response.read()
+                    with open(filepath, "wb") as f:
+                        f.write(content)
+                    return (paper_id, True)
+                return (paper_id, False)
+        except Exception as e:
+            print(f"Error downloading {paper_id}: {str(e)}")
+            return (paper_id, False)
+
+    def parse_paper_data(self, paper_entry: dict) -> Paper:
+        """Convert raw paper data to Paper model."""
+        paper_data = paper_entry["paper"]
+        return Paper(
+            paper_id=paper_data["id"],
+            title=paper_data["title"],
+            authors=paper_data["authors"],
+            summary=paper_data["summary"],
+            published_at=paper_data["publishedAt"],
+            pdf_url=PDF_BASE_URL.format(id=paper_data["id"])
+        )

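A minimal async driver for the fetcher (a sketch; it hits the live Hugging Face API and downloads real PDFs into TEMP_DIR):

import asyncio
from src.services.paper_fetcher import PaperFetcher

async def demo():
    fetcher = PaperFetcher()
    papers = await fetcher.fetch_papers()
    results = await fetcher.download_all_papers(papers)  # [(paper_id, ok), ...]
    return [pid for pid, ok in results if ok]

downloaded = asyncio.run(demo())

Design note: `download_all_papers` shares one `ClientSession` across all requests and gathers them concurrently, whereas `download_paper` (the method the scheduler actually uses) opens a fresh session per paper and runs one download at a time.
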
paperflux/src/tasks.py
DELETED
@@ -1,56 +0,0 @@
-import asyncio
-import aiohttp
-from pymongo import MongoClient
-from datetime import datetime, timezone
-import os
-
-API_URL = "https://huggingface.co/api/daily_papers"
-PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
-
-client = MongoClient("mongodb://localhost:27017/")
-db = client["papers_summary_database"]
-collection = db["papers"]
-
-
-async def fetch_papers(session):
-    async with session.get(API_URL) as response:
-        if response.status == 200:
-            return await response.json()
-        raise Exception(f"API request failed: {response.status}")
-
-
-async def download_pdf(session, paper_entry):
-    try:
-        paper_id = paper_entry["paper"]["id"]
-        pdf_url = PDF_BASE_URL.format(id=paper_id)
-
-        async with session.get(pdf_url) as response:
-            if response.status == 200:
-                content = await response.read()
-                os.makedirs("pdfs", exist_ok=True)
-                with open(f"pdfs/{paper_id}", "wb") as f:
-                    f.write(content)
-
-                return (paper_id, True)
-
-            return (paper_id, False)
-    except Exception as e:
-        print(f"Error downloading {paper_id}: {str(e)}")
-        return (paper_id, False)
-
-
-async def run_paper_fetch_job():
-    async with aiohttp.ClientSession() as session:
-        papers = await fetch_papers(session)
-        tasks = []
-
-        for paper in papers:
-            paper_data = paper["paper"]
-            paper_data["fetchedAt"] = datetime.now(timezone.utc).isoformat()
-            collection.insert_one(paper_data)
-
-        tasks = [download_pdf(session, paper) for paper in papers]
-        results = await asyncio.gather(*tasks)
-
-        successful = sum(1 for _, status in results if status)
-        print(f"Downloaded {successful}/{len(papers)} papers successfully")

paperflux/src/web/__init__.py
ADDED
File without changes

paperflux/src/web/app.py
ADDED
@@ -0,0 +1,95 @@
+import gradio as gr
+from src.services.database import DatabaseService
+
+class PaperFluxUI:
+    def __init__(self):
+        self.db = DatabaseService()
+        self.papers = self.db.get_all_papers()
+        self.current_index = 0
+
+    def get_current_paper(self):
+        if not self.papers:
+            return {
+                "title": "No papers available",
+                "explanation": "Please wait for papers to be processed.",
+                "pdf_url": ""
+            }
+        paper = self.papers[self.current_index]
+        authors = ", ".join([author["name"] for author in paper["authors"]])
+        title = f"# {paper['title']}\n\nAuthors: {authors}"
+        return {
+            "title": title,
+            "explanation": paper["explanation"],
+            "pdf_url": paper["pdf_url"]
+        }
+
+    def next_paper(self):
+        if self.current_index < len(self.papers) - 1:
+            self.current_index += 1
+        return self.get_current_paper()
+
+    def previous_paper(self):
+        if self.current_index > 0:
+            self.current_index -= 1
+        return self.get_current_paper()
+
+    def create_interface(self):
+        with gr.Blocks(theme=gr.themes.Base()) as interface:
+            title = gr.Markdown()
+            explanation = gr.Markdown()
+
+            # Create an HTML component for the download link
+            download_html = gr.HTML()
+
+            with gr.Row():
+                prev_btn = gr.Button("Previous Paper")
+                next_btn = gr.Button("Next Paper")
+
+            def update_ui(paper_data):
+                download_link = f"""
+                <div style="text-align: center; margin-top: 10px;">
+                    <a href="{paper_data['pdf_url']}" target="_blank"
+                       style="text-decoration: none;">
+                        <button style="padding: 10px 20px; background-color: #4CAF50;
+                                color: white; border: none; border-radius: 5px;
+                                cursor: pointer;">
+                            Download Paper
+                        </button>
+                    </a>
+                </div>
+                """
+                return (
+                    paper_data["title"],
+                    paper_data["explanation"],
+                    download_link
+                )
+
+            next_btn.click(
+                fn=lambda: update_ui(self.next_paper()),
+                outputs=[title, explanation, download_html]
+            )
+
+            prev_btn.click(
+                fn=lambda: update_ui(self.previous_paper()),
+                outputs=[title, explanation, download_html]
+            )
+
+            # Initialize with first paper
+            paper_data = self.get_current_paper()
+            init_download_link = f"""
+            <div style="text-align: center; margin-top: 10px;">
+                <a href="{paper_data['pdf_url']}" target="_blank"
+                   style="text-decoration: none;">
+                    <button style="padding: 10px 20px; background-color: #4CAF50;
+                            color: white; border: none; border-radius: 5px;
+                            cursor: pointer;">
+                        Download Paper
+                    </button>
+                </a>
+            </div>
+            """
+            title.value = paper_data["title"]
+            explanation.value = paper_data["explanation"]
+            download_html.value = init_download_link
+
+            return interface

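The UI can also be launched on its own, without the scheduler, much as `main.py` does (a sketch):

from src.web.app import PaperFluxUI

ui = PaperFluxUI()
interface = ui.create_interface()
interface.launch()  # local Gradio server; main.py adds server_name and share

Since `PaperFluxUI.__init__` calls `get_all_papers()` once, papers processed after launch are not picked up until the process restarts.
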
poetry.lock
CHANGED
The diff for this file is too large to render.

pyproject.toml
CHANGED
@@ -3,7 +3,8 @@ name = "paperflux"
 version = "0.1.0"
 description = ""
 authors = [
-    {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"}
+    {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"},
+    {name = "Vector73",email = "v.shm.kunal@gmail.com"}
 ]
 license = {text = "MIT"}
 readme = "README.md"
@@ -21,7 +22,8 @@ dependencies = [
     "markdown (>=3.7,<4.0)",
     "pymongo (>=4.11.1,<5.0.0)",
     "flask (>=3.1.0,<4.0.0)",
-    "tqdm (>=4.67.1,<5.0.0)"
+    "tqdm (>=4.67.1,<5.0.0)",
+    "aiohttp (>=3.11.12,<4.0.0)"
 ]
 
 