smokxy committed on
Commit
43ec9c8
·
1 Parent(s): f4c99e4

repository structuring

Browse files
.env CHANGED
@@ -1,14 +1,5 @@
1
- # Hugging Face
2
- HF_API_KEY=your_huggingface_key
3
-
4
  # Gemini
5
- GEMINI_API_KEY=your_gemini_key
6
-
7
- # Redis
8
- REDIS_HOST=localhost
9
- REDIS_PORT=6379
10
- REDIS_DB=0
11
 
12
- # App Config
13
- DAILY_PAPER_LIMIT=
14
- CACHE_TTL=86400
 
 
 
 
1
  # Gemini
2
+ GEMINI_API_KEY=YOUR_API_KEY
 
 
 
 
 
3
 
4
+ # MongoDB
5
+ MONGO_URI=your_mongo_uri
 
paperflux/main.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import signal
2
+ import sys
3
+ from src.scheduler.jobs import PaperProcessingScheduler
4
+ from src.web.app import PaperFluxUI
5
+ import threading
6
+
7
+ def signal_handler(signum, frame):
8
+ print("\nShutting down gracefully...")
9
+ scheduler.stop()
10
+ sys.exit(0)
11
+
12
+ def main():
13
+ global scheduler
14
+
15
+ # Set up signal handlers
16
+ signal.signal(signal.SIGINT, signal_handler)
17
+ signal.signal(signal.SIGTERM, signal_handler)
18
+
19
+ # Start the scheduler in a background thread
20
+ scheduler = PaperProcessingScheduler()
21
+ scheduler_thread = threading.Thread(target=scheduler.start, daemon=True)
22
+ scheduler_thread.start()
23
+
24
+ # Create and launch the Gradio interface
25
+ ui = PaperFluxUI()
26
+ interface = ui.create_interface()
27
+ interface.launch(server_name="0.0.0.0", share=True)
28
+
29
+ if __name__ == "__main__":
30
+ main()
paperflux/src/__pycache__/scheduler.cpython-311.pyc DELETED
Binary file (1.61 kB)
 
paperflux/src/__pycache__/tasks.cpython-311.pyc DELETED
Binary file (5.11 kB)
 
paperflux/src/app.py DELETED
@@ -1,18 +0,0 @@
1
- from threading import Thread
2
- from scheduler import start_scheduler
3
- from flask import Flask
4
-
5
- app = Flask(__name__)
6
-
7
-
8
- @app.route("/")
9
- def home():
10
- return "Welcome!"
11
-
12
-
13
- if __name__ == "__main__":
14
- # Run scheduler to fetch papers in a separate thread
15
- scheduler_thread = Thread(target=start_scheduler, daemon=True)
16
- scheduler_thread.start()
17
-
18
- app.run(debug=True, use_reloader=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/config/settings.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ MONGODB_URI = "mongodb+srv:"
4
+ DB_NAME = "papers_summary_database"
5
+ COLLECTION_NAME = "papers"
6
+ HF_API_URL = "https://huggingface.co/api/daily_papers"
7
+ PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
8
+ TEMP_DIR = "temp_papers"
9
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
paperflux/{__init__.py → src/models/__init__.py} RENAMED
File without changes
paperflux/src/models/paper.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ from typing import List, Dict, Optional
3
+
4
+ class Paper:
5
+ def __init__(
6
+ self,
7
+ paper_id: str,
8
+ title: str,
9
+ authors: List[Dict],
10
+ summary: str,
11
+ published_at: str,
12
+ explanation: Optional[str] = None,
13
+ pdf_url: Optional[str] = None,
14
+ ):
15
+ self.paper_id = paper_id
16
+ self.title = title
17
+ self.authors = authors
18
+ self.summary = summary
19
+ self.published_at = published_at
20
+ self.explanation = explanation
21
+ self.pdf_url = pdf_url
22
+ self.processed_at = datetime.utcnow()
23
+
24
+ def to_dict(self) -> Dict:
25
+ return {
26
+ "paper_id": self.paper_id,
27
+ "title": self.title,
28
+ "authors": self.authors,
29
+ "summary": self.summary,
30
+ "published_at": self.published_at,
31
+ "explanation": self.explanation,
32
+ "pdf_url": self.pdf_url,
33
+ "processed_at": self.processed_at,
34
+ }
paperflux/src/scheduler.py DELETED
@@ -1,24 +0,0 @@
1
- import time
2
- import asyncio
3
- from apscheduler.schedulers.background import BackgroundScheduler
4
- from tasks import run_paper_fetch_job
5
- from datetime import datetime
6
-
7
-
8
- def job():
9
- print("Fetching papers 📝")
10
- asyncio.run(run_paper_fetch_job())
11
-
12
-
13
- def start_scheduler():
14
- scheduler = BackgroundScheduler()
15
- scheduler.add_job(job, "interval", hours=24, next_run_time=datetime.now())
16
- scheduler.start()
17
- print("Scheduler started. Running in background.")
18
-
19
- try:
20
- while True:
21
- time.sleep(60)
22
- except (KeyboardInterrupt, SystemExit):
23
- scheduler.shutdown()
24
- print("Scheduler stopped.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/{__init__.py → scheduler/__init__.py} RENAMED
File without changes
paperflux/src/scheduler/jobs.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from apscheduler.schedulers.background import BackgroundScheduler
3
+ from datetime import datetime
4
+ import os
5
+ from src.services.paper_fetcher import PaperFetcher
6
+ from src.services.paper_analyzer import PaperAnalyzer
7
+ from src.services.database import DatabaseService
8
+
9
+ class PaperProcessingScheduler:
10
+ def __init__(self):
11
+ self.scheduler = BackgroundScheduler()
12
+ self.fetcher = PaperFetcher()
13
+ self.analyzer = PaperAnalyzer()
14
+ self.db = DatabaseService()
15
+ self._running = False
16
+
17
+ async def process_papers(self):
18
+ if self._running:
19
+ print("Previous processing still running, skipping...")
20
+ return
21
+
22
+ self._running = True
23
+ print("Starting daily paper processing...")
24
+
25
+ try:
26
+ self.db.clear_collection()
27
+ papers = await self.fetcher.fetch_papers()
28
+
29
+ for paper in papers:
30
+ if not self._running: # Check if we should stop
31
+ break
32
+
33
+ pdf_path = await self.fetcher.download_paper(paper)
34
+ if pdf_path:
35
+ try:
36
+ explanation = self.analyzer.analyze_paper(pdf_path)
37
+ paper_obj = self.fetcher.parse_paper_data(paper)
38
+ paper_obj.explanation = explanation
39
+ self.db.insert_paper(paper_obj)
40
+ finally:
41
+ if os.path.exists(pdf_path):
42
+ os.remove(pdf_path)
43
+
44
+ except Exception as e:
45
+ print(f"Error in paper processing: {str(e)}")
46
+ finally:
47
+ self._running = False
48
+
49
+ def start(self):
50
+ self.scheduler.add_job(
51
+ lambda: asyncio.run(self.process_papers()),
52
+ 'cron',
53
+ hour=0,
54
+ minute=0,
55
+ next_run_time=datetime.now()
56
+ )
57
+ self.scheduler.start()
58
+
59
+ def stop(self):
60
+ self._running = False
61
+ self.scheduler.shutdown()
paperflux/src/services/database.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ from src.config.settings import MONGODB_URI, DB_NAME, COLLECTION_NAME
3
+ from src.models.paper import Paper
4
+
5
+ class DatabaseService:
6
+ def __init__(self):
7
+ self.client = MongoClient(MONGODB_URI)
8
+ self.db = self.client[DB_NAME]
9
+ self.collection = self.db[COLLECTION_NAME]
10
+
11
+ def clear_collection(self):
12
+ self.collection.delete_many({})
13
+
14
+ def insert_paper(self, paper: Paper):
15
+ return self.collection.insert_one(paper.to_dict())
16
+
17
+ def get_all_papers(self):
18
+ return list(self.collection.find())
19
+
20
+ def get_paper_by_id(self, paper_id: str):
21
+ return self.collection.find_one({"paper_id": paper_id})
paperflux/src/services/paper_analyzer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import google.generativeai as genai
2
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
3
+ from src.config.settings import GEMINI_API_KEY
4
+
5
+ class PaperAnalyzer:
6
+ def __init__(self):
7
+ genai.configure(api_key=GEMINI_API_KEY)
8
+ self.model = genai.GenerativeModel("gemini-1.5-pro-latest")
9
+ self.safety_settings = {
10
+ HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
11
+ HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
12
+ HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
13
+ HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
14
+ }
15
+
16
+ def analyze_paper(self, pdf_path: str) -> str:
17
+ uploaded_file = genai.upload_file(pdf_path)
18
+ prompt = """Analyze this research paper thoroughly and provide:
19
+
20
+ # Paper Title
21
+ ## Core Contribution
22
+ ## Technical Breakdown
23
+ - Detailed mathematical concepts and intuition with in depth explanation
24
+ - Key algorithms and methodologies
25
+ ## Visual Analysis
26
+ ## Critical Assessment
27
+ ## Potential Applications
28
+
29
+ Include detailed mathematical expressions and thorough explanations."""
30
+
31
+ response = self.model.generate_content(
32
+ [prompt, uploaded_file],
33
+ safety_settings=self.safety_settings,
34
+ generation_config={"temperature": 0.2},
35
+ )
36
+
37
+ genai.delete_file(uploaded_file.name)
38
+ return response.text
paperflux/src/services/paper_fetcher.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiohttp
3
+ import asyncio
4
+ from datetime import datetime
5
+ from typing import List, Tuple, Optional
6
+ from src.config.settings import HF_API_URL, PDF_BASE_URL, TEMP_DIR
7
+ from src.models.paper import Paper
8
+
9
+ class PaperFetcher:
10
+ def __init__(self):
11
+ os.makedirs(TEMP_DIR, exist_ok=True)
12
+
13
+ async def fetch_papers(self) -> List[dict]:
14
+ """Fetch daily papers from the Hugging Face API."""
15
+ async with aiohttp.ClientSession() as session:
16
+ async with session.get(HF_API_URL) as response:
17
+ if response.status == 200:
18
+ papers = await response.json()
19
+ print(f"Found {len(papers)} papers")
20
+ return papers
21
+ raise Exception(f"API request failed: {response.status}")
22
+
23
+ async def download_paper(self, paper_entry: dict) -> Optional[str]:
24
+ """
25
+ Download a single paper's PDF.
26
+ Returns the path to the downloaded PDF or None if download failed.
27
+ """
28
+ try:
29
+ paper_id = paper_entry["paper"]["id"]
30
+ pdf_url = PDF_BASE_URL.format(id=paper_id)
31
+ clean_id = paper_id.replace("/", "_")
32
+ filename = f"{datetime.now().date()}_{clean_id}.pdf"
33
+ filepath = os.path.join(TEMP_DIR, filename)
34
+
35
+ async with aiohttp.ClientSession() as session:
36
+ async with session.get(pdf_url) as response:
37
+ if response.status == 200:
38
+ content = await response.read()
39
+ with open(filepath, "wb") as f:
40
+ f.write(content)
41
+ print(f"Successfully downloaded: {paper_id}")
42
+ return filepath
43
+ print(f"Failed to download {paper_id}: HTTP {response.status}")
44
+ return None
45
+
46
+ except Exception as e:
47
+ print(f"Error downloading {paper_id}: {str(e)}")
48
+ return None
49
+
50
+ async def download_all_papers(self, papers: List[dict]) -> List[Tuple[str, bool]]:
51
+ """Download all papers in parallel."""
52
+ async with aiohttp.ClientSession() as session:
53
+ tasks = []
54
+ for paper in papers:
55
+ paper_id = paper["paper"]["id"]
56
+ pdf_url = PDF_BASE_URL.format(id=paper_id)
57
+ clean_id = paper_id.replace("/", "_")
58
+ filename = f"{datetime.now().date()}_{clean_id}.pdf"
59
+ filepath = os.path.join(TEMP_DIR, filename)
60
+
61
+ tasks.append(self.download_single_paper(session, paper_id, pdf_url, filepath))
62
+
63
+ results = await asyncio.gather(*tasks)
64
+ successful = sum(1 for status in results if status[1])
65
+ print(f"Downloaded {successful}/{len(papers)} papers successfully")
66
+ return results
67
+
68
+ async def download_single_paper(
69
+ self,
70
+ session: aiohttp.ClientSession,
71
+ paper_id: str,
72
+ pdf_url: str,
73
+ filepath: str
74
+ ) -> Tuple[str, bool]:
75
+ """Download a single paper with the given session."""
76
+ try:
77
+ async with session.get(pdf_url) as response:
78
+ if response.status == 200:
79
+ content = await response.read()
80
+ with open(filepath, "wb") as f:
81
+ f.write(content)
82
+ return (paper_id, True)
83
+ return (paper_id, False)
84
+ except Exception as e:
85
+ print(f"Error downloading {paper_id}: {str(e)}")
86
+ return (paper_id, False)
87
+
88
+ def parse_paper_data(self, paper_entry: dict) -> Paper:
89
+ """Convert raw paper data to Paper model."""
90
+ paper_data = paper_entry["paper"]
91
+ return Paper(
92
+ paper_id=paper_data["id"],
93
+ title=paper_data["title"],
94
+ authors=paper_data["authors"],
95
+ summary=paper_data["summary"],
96
+ published_at=paper_data["publishedAt"],
97
+ pdf_url=PDF_BASE_URL.format(id=paper_data["id"])
98
+ )
paperflux/src/tasks.py DELETED
@@ -1,56 +0,0 @@
1
- import asyncio
2
- import aiohttp
3
- from pymongo import MongoClient
4
- from datetime import datetime, timezone
5
- import os
6
-
7
- API_URL = "https://huggingface.co/api/daily_papers"
8
- PDF_BASE_URL = "https://arxiv.org/pdf/{id}.pdf"
9
-
10
- client = MongoClient("mongodb://localhost:27017/")
11
- db = client["papers_summary_database"]
12
- collection = db["papers"]
13
-
14
-
15
- async def fetch_papers(session):
16
- async with session.get(API_URL) as response:
17
- if response.status == 200:
18
- return await response.json()
19
- raise Exception(f"API request failed: {response.status}")
20
-
21
-
22
- async def download_pdf(session, paper_entry):
23
- try:
24
- paper_id = paper_entry["paper"]["id"]
25
- pdf_url = PDF_BASE_URL.format(id=paper_id)
26
-
27
- async with session.get(pdf_url) as response:
28
- if response.status == 200:
29
- content = await response.read()
30
- os.makedirs("pdfs", exist_ok=True)
31
- with open(f"pdfs/{paper_id}", "wb") as f:
32
- f.write(content)
33
-
34
- return (paper_id, True)
35
-
36
- return (paper_id, False)
37
- except Exception as e:
38
- print(f"Error downloading {paper_id}: {str(e)}")
39
- return (paper_id, False)
40
-
41
-
42
- async def run_paper_fetch_job():
43
- async with aiohttp.ClientSession() as session:
44
- papers = await fetch_papers(session)
45
- tasks = []
46
-
47
- for paper in papers:
48
- paper_data = paper["paper"]
49
- paper_data["fetchedAt"] = datetime.now(timezone.utc).isoformat()
50
- collection.insert_one(paper_data)
51
-
52
- tasks = [download_pdf(session, paper) for paper in papers]
53
- results = await asyncio.gather(*tasks)
54
-
55
- successful = sum(1 for _, status in results if status)
56
- print(f"Downloaded {successful}/{len(papers)} papers successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
paperflux/src/web/__init__.py ADDED
File without changes
paperflux/src/web/app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from src.services.database import DatabaseService
3
+
4
+ class PaperFluxUI:
5
+ def __init__(self):
6
+ self.db = DatabaseService()
7
+ self.papers = self.db.get_all_papers()
8
+ self.current_index = 0
9
+
10
+ def get_current_paper(self):
11
+ if not self.papers:
12
+ return {
13
+ "title": "No papers available",
14
+ "explanation": "Please wait for papers to be processed.",
15
+ "pdf_url": ""
16
+ }
17
+ paper = self.papers[self.current_index]
18
+ authors = ", ".join([author["name"] for author in paper["authors"]])
19
+ title = f"# {paper['title']}\n\nAuthors: {authors}"
20
+ return {
21
+ "title": title,
22
+ "explanation": paper["explanation"],
23
+ "pdf_url": paper["pdf_url"]
24
+ }
25
+
26
+ def next_paper(self):
27
+ if self.current_index < len(self.papers) - 1:
28
+ self.current_index += 1
29
+ return self.get_current_paper()
30
+
31
+ def previous_paper(self):
32
+ if self.current_index > 0:
33
+ self.current_index -= 1
34
+ return self.get_current_paper()
35
+
36
+ def create_interface(self):
37
+ with gr.Blocks(theme=gr.themes.Base()) as interface:
38
+ title = gr.Markdown()
39
+ explanation = gr.Markdown()
40
+
41
+ # Create an HTML component for the download link
42
+ download_html = gr.HTML()
43
+
44
+ with gr.Row():
45
+ prev_btn = gr.Button("Previous Paper")
46
+ next_btn = gr.Button("Next Paper")
47
+
48
+ def update_ui(paper_data):
49
+ download_link = f"""
50
+ <div style="text-align: center; margin-top: 10px;">
51
+ <a href="{paper_data['pdf_url']}" target="_blank"
52
+ style="text-decoration: none;">
53
+ <button style="padding: 10px 20px; background-color: #4CAF50;
54
+ color: white; border: none; border-radius: 5px;
55
+ cursor: pointer;">
56
+ Download Paper
57
+ </button>
58
+ </a>
59
+ </div>
60
+ """
61
+ return (
62
+ paper_data["title"],
63
+ paper_data["explanation"],
64
+ download_link
65
+ )
66
+
67
+ next_btn.click(
68
+ fn=lambda: update_ui(self.next_paper()),
69
+ outputs=[title, explanation, download_html]
70
+ )
71
+
72
+ prev_btn.click(
73
+ fn=lambda: update_ui(self.previous_paper()),
74
+ outputs=[title, explanation, download_html]
75
+ )
76
+
77
+ # Initialize with first paper
78
+ paper_data = self.get_current_paper()
79
+ init_download_link = f"""
80
+ <div style="text-align: center; margin-top: 10px;">
81
+ <a href="{paper_data['pdf_url']}" target="_blank"
82
+ style="text-decoration: none;">
83
+ <button style="padding: 10px 20px; background-color: #4CAF50;
84
+ color: white; border: none; border-radius: 5px;
85
+ cursor: pointer;">
86
+ Download Paper
87
+ </button>
88
+ </a>
89
+ </div>
90
+ """
91
+ title.value = paper_data["title"]
92
+ explanation.value = paper_data["explanation"]
93
+ download_html.value = init_download_link
94
+
95
+ return interface
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -3,7 +3,8 @@ name = "paperflux"
3
  version = "0.1.0"
4
  description = ""
5
  authors = [
6
- {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"}
 
7
  ]
8
  license = {text = "MIT"}
9
  readme = "README.md"
@@ -21,7 +22,8 @@ dependencies = [
21
  "markdown (>=3.7,<4.0)",
22
  "pymongo (>=4.11.1,<5.0.0)",
23
  "flask (>=3.1.0,<4.0.0)",
24
- "tqdm (>=4.67.1,<5.0.0)"
 
25
  ]
26
 
27
 
 
3
  version = "0.1.0"
4
  description = ""
5
  authors = [
6
+ {name = "kartikbhtt7",email = "kartikbhtt7@gmail.com"},
7
+ {name = "Vector73",email = "v.shm.kunal@gmail.com"}
8
  ]
9
  license = {text = "MIT"}
10
  readme = "README.md"
 
22
  "markdown (>=3.7,<4.0)",
23
  "pymongo (>=4.11.1,<5.0.0)",
24
  "flask (>=3.1.0,<4.0.0)",
25
+ "tqdm (>=4.67.1,<5.0.0)",
26
+ "aiohttp (>=3.11.12,<4.0.0)"
27
  ]
28
 
29