| |
| |
| |
| import os, re, time, json, pickle, threading |
| import requests |
| import xml.etree.ElementTree as ET |
| from datetime import datetime, timedelta |
| from collections import Counter |
|
|
| import numpy as np |
| import faiss |
| import pandas as pd |
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| import gradio as gr |
| from sentence_transformers import SentenceTransformer |
| from groq import Groq |
| from gtts import gTTS |
| from langdetect import detect, DetectorFactory |
| from reportlab.lib.pagesizes import A4 |
| from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
| from reportlab.lib.units import cm |
| from reportlab.lib import colors |
| from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable |
|
|
# Pin langdetect's seed; its README documents this as the way to get
# reproducible results from an otherwise randomised detector.
DetectorFactory.seed = 0


# API credentials come from the environment; both fall back to "".
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
S2_API_KEY = os.environ.get("S2_API_KEY", "")
groq_client = Groq(api_key=GROQ_API_KEY)


# The embedding model is loaded at import time and exercised once with a
# throw-away input before "Embedder ready!" is printed.
print("Loading embedder...")
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
_ = embedder.encode(["warmup"])
print("Embedder ready!")


# Mutable module-level state shared by the functions below.
PAPERS = []            # papers behind the current FAISS index
ACTIVE_PAPERS = []     # papers currently shown (after fetching / filtering)
FAISS_INDEX = None     # faiss index over PAPERS, or None when nothing is indexed
AUTO_RUNNING = False   # flag polled by the auto-fetch worker thread
AUTO_LOG = []          # messages appended by the auto-fetch worker
CURRENT_YEAR = datetime.now().year  # evaluated once, at import time


# Files written by this module live under PERSIST_DIR.
PERSIST_DIR = "/tmp"
FAVORITES_PATH = PERSIST_DIR + "/favorites.pkl"
SEEN_IDS_PATH = PERSIST_DIR + "/seen_ids.json"
os.makedirs(PERSIST_DIR, exist_ok=True)


# UI category label -> arXiv category prefix ("" means no category filter).
CATEGORIES = {
    "🌐 All": "",
    "📊 Economics": "econ",
    "💰 Quant Finance": "q-fin",
    "🤖 AI": "cs.AI",
    "🧠 Machine Learning":"cs.LG",
    "💬 NLP": "cs.CL",
    "📈 Statistics": "stat",
    "🔬 Biology": "q-bio",
    "⚛️ Physics": "physics",
    "📐 Mathematics": "math",
    "💻 Computer Science":"cs",
}
# UI category label -> free-text subject appended to CrossRef queries.
CROSSREF_SUBJECTS = {
    "🌐 All": "",
    "📊 Economics": "economics",
    "💰 Quant Finance": "finance",
    "🤖 AI": "artificial intelligence",
    "🧠 Machine Learning":"machine learning",
    "💬 NLP": "natural language processing",
    "📈 Statistics": "statistics",
    "🔬 Biology": "biology",
    "⚛️ Physics": "physics",
    "📐 Mathematics": "mathematics",
    "💻 Computer Science":"computer science",
}
LANG_CHOICES = ["Arabic", "English"]
SORT_CHOICES = ["Newest", "Oldest", "Most Cited", "Least Cited"]
# Formatting rules (in Arabic) appended to the Arabic LLM prompts below.
AR_RULES = """
- ابدأ كل قسم بـ ## مع سطر فارغ قبله وبعده
- اكتب كل قسم في فقرة 3-4 جمل بالعربية الفصحى
- لا تكرر عنوان القسم داخل النص
"""
|
|
| |
| |
| |
def detect_lang(text):
    """Classify *text* as Arabic (``"ar"``) or not (``"en"``).

    Only the first 300 characters of ``str(text)`` are given to langdetect.
    Every non-Arabic result, and every detection failure, maps to ``"en"``.
    """
    try:
        return "ar" if detect(str(text)[:300]).startswith("ar") else "en"
    except Exception:
        # Deliberate best-effort fallback, but no longer a bare ``except`` so
        # KeyboardInterrupt / SystemExit are not swallowed.
        return "en"
|
|
def clean_md(text):
    """Flatten Markdown-ish *text* into one plain line of at most 2500 chars.

    Markdown punctuation is deleted, every run of newlines becomes a single
    space, the result is stripped and then truncated.
    """
    without_markup = re.sub(r"[#*`>\[\]!_~]", "", text)
    single_line = re.sub(r"\n+", " ", without_markup)
    trimmed = single_line.strip()
    return trimmed[:2500]
|
|
def fix_ar_format(text):
    """Normalise blank lines around ``##`` headings in LLM output.

    Applied in order: put an extra newline before every ``##`` that follows
    a newline, put a blank line between a ``## `` heading and the text
    directly under it, then collapse runs of three or more newlines to two.
    The result is stripped.
    """
    rewrites = (
        (r"\n(##)", r"\n\n\1"),
        (r"(## [^\n]+)\n([^\n#])", r"\1\n\n\2"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.strip()
|
|
def cit_badge(n):
    """Return a short badge string for a citation count.

    ``None``, ``""`` and anything that cannot be converted with ``int()``
    (for example the ``"N/A"`` placeholder used elsewhere in this file)
    give ``"—"`` instead of raising. Counts of zero or below give ``"·"``.
    """
    if n is None or n == "":
        return "—"
    try:
        count = int(n)
    except (TypeError, ValueError):
        return "—"
    # Highest tier first; the first threshold reached wins.
    for threshold, icon in ((1000, "🥇"), (100, "🏆"), (10, "⭐")):
        if count >= threshold:
            return icon + " " + "{:,}".format(count)
    if count > 0:
        return "📄 " + str(count)
    return "·"
|
|
def build_table(papers_list):
    """Render papers as a Markdown table plus a list of dropdown choices.

    Returns ``(table_markdown, choices)``; each choice is ``"<n>. <title>"``
    with the title left untouched. Inside the table, cell text has its
    whitespace runs collapsed and ``|`` escaped, so a title or author name
    containing a pipe or newline cannot break the row.
    """
    def cell(value):
        return re.sub(r"\s+", " ", str(value)).strip().replace("|", "\\|")

    lines = [
        "| # | Title | Author | Date | Citations | Source |\n",
        "|---|---|---|---|---|---|\n",
    ]
    choices = []
    for number, p in enumerate(papers_list, 1):
        first = p["authors"][0] if p["authors"] else "N/A"
        badge = "NEW" if p.get("recent") else "📄"
        lines.append("| {} | {} {} | {} | {} | {} | {} |\n".format(
            number, badge, cell(p["title"]), cell(first),
            cell(p["published"]), cit_badge(p.get("citations")),
            cell(p.get("source", "arXiv"))))
        choices.append("{}. {}".format(number, p["title"]))
    return "".join(lines), choices
|
|
def s2_headers():
    """Build HTTP headers for Semantic Scholar requests.

    The ``x-api-key`` header is included only when ``S2_API_KEY`` is set.
    """
    api_key_header = {"x-api-key": S2_API_KEY} if S2_API_KEY else {}
    return {"User-Agent": "ScientificPaperBot/7.4", **api_key_header}
|
|
def cr_headers():
    """Build HTTP headers for CrossRef requests (User-Agent with a mailto)."""
    user_agent = "ScientificPaperBot/7.4 (mailto:researcher@example.com)"
    return {"User-Agent": user_agent}
|
|
| |
| |
| |
def parse_crossref_date(item):
    """Extract a ``YYYY-MM-DD`` date string from a CrossRef work item.

    The date fields are tried in priority order and the first usable one
    wins. Missing month/day default to 1. The year must lie between 1900
    and next year (evaluated at call time), the month is clamped to 1-12
    and the day to the real length of that month, so the result is always
    a valid calendar date. Returns ``"N/A"`` when no field is usable.
    """
    import calendar  # local import keeps this block self-contained

    max_year = datetime.now().year + 1
    for field in ("issued", "published", "published-print",
                  "published-online", "created"):
        date_parts = (item.get(field) or {}).get("date-parts", [[]])
        if not date_parts or not date_parts[0]:
            continue
        parts = date_parts[0]
        try:
            year = int(parts[0])
            if not 1900 <= year <= max_year:
                continue
            month = max(1, min(12, int(parts[1]) if len(parts) >= 2 else 1))
            # Clamp to the month's real length (previously always 31, which
            # could yield impossible dates such as 2023-02-31).
            last_day = calendar.monthrange(year, month)[1]
            day = max(1, min(last_day, int(parts[2]) if len(parts) >= 3 else 1))
        except (ValueError, TypeError, IndexError):
            # e.g. date-parts of [[None]]: fall through to the next field.
            continue
        return "{:04d}-{:02d}-{:02d}".format(year, month, day)
    return "N/A"
|
|
| |
| |
| |
def load_seen_ids():
    """Load the set of already-seen paper ids from ``SEEN_IDS_PATH``.

    Returns an empty set when the file is missing or unreadable, is not
    valid JSON, or does not hold an iterable of hashable values.
    """
    try:
        with open(SEEN_IDS_PATH, encoding="utf-8") as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError):
        # OSError: missing/unreadable file. ValueError: bad JSON or bad
        # encoding. TypeError: payload is not an iterable of hashables.
        return set()
|
|
def save_seen_ids(ids):
    """Overwrite ``SEEN_IDS_PATH`` with *ids* serialised as a JSON list."""
    with open(SEEN_IDS_PATH, "w") as out:
        out.write(json.dumps(list(ids)))
|
|
def load_favorites():
    """Load the saved favourites from ``FAVORITES_PATH``.

    Returns ``[]`` when the file is missing or cannot be unpickled.
    """
    # SECURITY NOTE(review): pickle.load executes code embedded in the file.
    # This is only safe while FAVORITES_PATH is written solely by this app;
    # consider JSON if the directory can be written by anyone else.
    try:
        with open(FAVORITES_PATH, "rb") as f:
            return pickle.load(f)
    except Exception:
        # Unpickling can fail with many exception types (EOFError,
        # UnpicklingError, AttributeError, ...), so stay broad, but do not
        # swallow KeyboardInterrupt / SystemExit as a bare except would.
        return []
|
|
def save_favorite(paper):
    """Add *paper* to the pickled favourites unless its id is already there.

    Returns a status message: ``"Saved: <title>"`` or ``"Already saved."``.
    """
    favorites = load_favorites()
    known_ids = {fav["id"] for fav in favorites}
    if paper["id"] in known_ids:
        return "Already saved."
    favorites.append(paper)
    with open(FAVORITES_PATH, "wb") as handle:
        pickle.dump(favorites, handle)
    return "Saved: " + paper["title"]
|
|
def export_favorites_csv():
    """Write the favourites to ``PERSIST_DIR/favorites.csv``.

    Returns the CSV path, or ``None`` when there are no favourites. Only
    the first three authors of each paper are exported.
    """
    favorites = load_favorites()
    if not favorites:
        return None
    records = []
    for fav in favorites:
        records.append({
            "Title": fav["title"],
            "Authors": ", ".join(fav["authors"][:3]),
            "Date": fav["published"],
            "Citations": fav.get("citations", "N/A"),
            "URL": fav["url"],
            "Source": fav.get("source", "arXiv"),
        })
    csv_path = PERSIST_DIR + "/favorites.csv"
    # "utf-8-sig" prepends a BOM to the file.
    pd.DataFrame(records).to_csv(csv_path, index=False, encoding="utf-8-sig")
    return csv_path
|
|
def gr_export_fav():
    """Gradio-facing wrapper: return the favourites CSV path (or ``None``)."""
    csv_path = export_favorites_csv()
    return csv_path
|
|
| |
| |
| |
def export_explanation_pdf(explanation_text, paper_title="paper"):
    """Render a Markdown-ish explanation into a PDF under ``PERSIST_DIR``.

    Lines starting with ``# `` / ``## `` become headings preceded by a rule,
    lines starting with ``>`` become indented quotes, everything else is
    body text. Markdown markers and the decorative emoji are removed.

    Returns the PDF path, or ``None`` when the text is shorter than 30
    characters or the PDF cannot be produced.
    """
    from xml.sax.saxutils import escape  # local import: block stays self-contained

    if not explanation_text or len(explanation_text) < 30:
        return None
    safe = re.sub(r"[^\w\s-]", "", str(paper_title or "paper"))[:50].strip().replace(" ", "_")
    path = PERSIST_DIR + "/explanation_" + safe + ".pdf"
    doc = SimpleDocTemplate(path, pagesize=A4,
                            rightMargin=2*cm, leftMargin=2*cm,
                            topMargin=2*cm, bottomMargin=2*cm)
    styles = getSampleStyleSheet()
    h2_style = ParagraphStyle("H2", parent=styles["Heading2"],
                              fontSize=11, textColor=colors.HexColor("#2563eb"),
                              spaceBefore=14, spaceAfter=6)
    bd_style = ParagraphStyle("BD", parent=styles["Normal"],
                              fontSize=10, leading=16, spaceAfter=8)
    mt_style = ParagraphStyle("MT", parent=styles["Normal"],
                              fontSize=9, textColor=colors.HexColor("#64748b"))
    # Built once instead of once per quoted line.
    q_style = ParagraphStyle("Q", parent=styles["Normal"],
                             fontSize=9, leftIndent=20,
                             textColor=colors.HexColor("#475569"), leading=14)
    decorations = re.compile(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]")

    # Paragraph treats its text as XML-like markup, so the text is escaped to
    # keep "<" and "&" literal. Paragraph construction is inside the try so
    # that a markup failure yields None instead of propagating.
    try:
        story = []
        for raw_line in explanation_text.split("\n"):
            line = raw_line.strip()
            if not line:
                story.append(Spacer(1, 6))
                continue
            clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line)
            clean = re.sub(r"\*(.+?)\*", r"\1", clean)
            clean = re.sub(r"`(.+?)`", r"\1", clean)
            clean = re.sub(r"^#{1,6}\s*", "", clean)
            clean = decorations.sub("", clean).strip()
            if not clean:
                continue
            if line.startswith("## ") or line.startswith("# "):
                story.append(HRFlowable(width="100%", thickness=0.5,
                                        color=colors.HexColor("#e2e8f0"), spaceAfter=4))
                story.append(Paragraph(escape(clean), h2_style))
            elif line.startswith(">"):
                quoted = decorations.sub("", line.lstrip(">").strip())
                story.append(Paragraph(escape(quoted), q_style))
            else:
                story.append(Paragraph(escape(clean), bd_style))
        story += [
            Spacer(1, 20),
            HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")),
            Paragraph("Generated by Paper Discovery v7.4 — " +
                      datetime.now().strftime("%Y-%m-%d %H:%M"), mt_style),
        ]
        doc.build(story)
        return path
    except Exception as e:
        print("PDF error: " + str(e))
        return None
|
|
def gr_export_pdf(explanation_text, choice):
    """Gradio handler: export the current explanation as a PDF.

    *choice* is a dropdown entry of the form ``"<n>. <title>"``; the part
    after the first ``". "`` is used as the title. Returns
    ``(path_or_None, status_message)``.
    """
    if not explanation_text or len(explanation_text) < 50:
        return None, "Explain a paper first."
    if choice:
        title = choice.split(". ", 1)[-1]
    else:
        title = "paper"
    pdf_path = export_explanation_pdf(explanation_text, title)
    if pdf_path:
        return pdf_path, "PDF ready!"
    return None, "PDF failed."
|
|
| |
| |
| |
| |
| |
| |
def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
                       sort_by="submittedDate"):
    """Query the arXiv API and return a list of normalised paper dicts.

    Args:
        query: Free-text query. With ``sort_by="relevance"`` and three or
            more words it is sent as an exact title phrase, otherwise as
            an ``all:`` search.
        category: arXiv category prefix (``""`` for none).
        max_results: Maximum number of entries requested.
        days_back: Only affects the ``"recent"`` flag; older papers are
            still returned.
        sort_by: Value for the API's ``sortBy`` parameter.

    Returns:
        A list of paper dicts (``citations`` is always ``None`` here), or
        ``[]`` when the request fails or the response is not valid XML.
    """
    query = query.strip()
    category = category.strip()
    parts = []
    if len(query.split()) >= 3 and sort_by == "relevance":
        parts.append('ti:"' + query + '"')
    elif query:
        parts.append("all:" + query)
    if category:
        parts.append("cat:" + category)
    params = {
        "search_query": " AND ".join(parts) if parts else "all:machine learning",
        "start": 0,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": "descending",
    }
    try:
        resp = requests.get("http://export.arxiv.org/api/query", params=params, timeout=30)
        resp.raise_for_status()
        # Parsed inside the try: a malformed body used to raise ParseError
        # out of this function.
        root = ET.fromstring(resp.content)
    except Exception as e:
        print("arXiv error: " + str(e))
        return []

    atom = "{http://www.w3.org/2005/Atom}"
    arxiv = "{http://arxiv.org/schemas/atom}"
    cutoff = datetime.now() - timedelta(days=days_back)
    papers = []
    for entry in root.findall(atom + "entry"):
        try:
            pid = entry.find(atom + "id").text.split("/abs/")[-1].strip()
            title = entry.find(atom + "title").text.strip().replace("\n", " ")
            abstract = entry.find(atom + "summary").text.strip().replace("\n", " ")
            published = entry.find(atom + "published").text[:10]
            authors = [a.find(atom + "name").text
                       for a in entry.findall(atom + "author")]
            # <arxiv:primary_category> is in the arXiv namespace, but the
            # plain <category> elements are in the Atom namespace; looking
            # them up under the arXiv namespace never matched. Primary
            # category first, order preserved, duplicates dropped.
            primary = entry.find(arxiv + "primary_category")
            tagged = ([primary] if primary is not None else []) \
                + entry.findall(atom + "category")
            cats = []
            for node in tagged:
                term = node.get("term", "")
                if term and term not in cats:
                    cats.append(term)
            papers.append({
                "id": pid,
                "title": title,
                "authors": authors[:6],
                "abstract": abstract[:1200],
                "published": published,
                "categories": cats[:4],
                "citations": None,
                "url": "https://arxiv.org/abs/" + pid,
                "pdf_url": "https://arxiv.org/pdf/" + pid,
                "recent": datetime.strptime(published, "%Y-%m-%d") >= cutoff,
                "source": "arXiv",
            })
        except Exception as e:
            # A malformed entry is skipped; the rest of the feed is kept.
            print("arXiv parse: " + str(e))
    return papers
|
|
| |
| |
| |
def fetch_crossref_papers(query, category_label="", max_results=20,
                          days_back=365, use_title=False):
    """Query the CrossRef works API and return normalised paper dicts.

    Args:
        query: Search text.
        category_label: UI category label; its ``CROSSREF_SUBJECTS`` entry
            (if any) is appended to the query text.
        max_results: Maximum number of papers returned.
        days_back: Only affects the ``"recent"`` flag.
        use_title: Search with ``query.title`` instead of ``query``.

    Returns:
        Up to *max_results* papers sorted by citation count (descending).
        Items without a title or a usable date are skipped. ``[]`` on a
        non-200/429 response or when all three attempts fail.
    """
    subject = CROSSREF_SUBJECTS.get(category_label, "")
    full_query = (query + " " + subject).strip() if subject else query
    params = {
        ("query.title" if use_title else "query"): full_query,
        # Over-fetch: many items are dropped by the filters below.
        "rows": min(max_results * 3, 200),
        "sort": "relevance",
        "select": ("title,author,abstract,published,published-print,"
                   "published-online,issued,created,DOI,"
                   "is-referenced-by-count,link,subject"),
    }
    items = []
    for attempt in range(3):
        try:
            r = requests.get("https://api.crossref.org/works",
                             params=params, headers=cr_headers(), timeout=30)
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                break
            if r.status_code == 429:
                time.sleep(2 ** attempt)  # exponential back-off when rate limited
                continue
            print("CrossRef " + str(r.status_code))
            return []
        except Exception as e:
            print("CrossRef attempt " + str(attempt) + ": " + str(e))
            time.sleep(1)

    cutoff = datetime.now() - timedelta(days=days_back)
    papers, seen_ids = [], set()
    for item in items:
        if len(papers) >= max_results:
            break
        title_list = item.get("title", [])
        if not title_list:
            continue
        title = title_list[0].strip()
        if not title or title.lower().startswith("title pending"):
            continue
        pub = parse_crossref_date(item)
        if pub == "N/A":
            continue
        cit = int(item.get("is-referenced-by-count", 0) or 0)
        # "or" guards: a key that is present but None no longer raises.
        authors = [
            ((a.get("given") or "") + " " + (a.get("family") or "")).strip()
            for a in (item.get("author") or [])[:6]
        ]
        authors = [a for a in authors if a] or ["Unknown"]
        abstract = re.sub(r"<[^>]+>", "",
                          item.get("abstract") or "No abstract.").strip()[:1200]
        doi = item.get("DOI", "")
        url = "https://doi.org/" + doi if doi else "#"
        pid = doi or re.sub(r"\W", "", title)[:40]
        if pid in seen_ids:
            continue
        seen_ids.add(pid)
        pdf_url = next((link.get("URL", "") for link in item.get("link", [])
                        if "pdf" in link.get("content-type", "").lower()), "")
        try:
            recent = datetime.strptime(pub[:10], "%Y-%m-%d") >= cutoff
        except ValueError:
            # The date string is not a real calendar date.
            recent = False
        papers.append({
            "id": pid,
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "published": pub[:10],
            "categories": item.get("subject", [])[:3],
            "citations": cit,
            "url": url,
            "pdf_url": pdf_url,
            "recent": recent,
            "source": "CrossRef",
        })
    papers.sort(key=lambda x: x["citations"], reverse=True)
    return papers
|
|
| |
| |
| |
def global_paper_search(query, source_choice, max_results=10):
    """Search arXiv and/or CrossRef by title/keywords; return Markdown.

    *source_choice* is ``"arXiv"``, ``"CrossRef"`` or ``"Both"``. Results
    are de-duplicated on a normalised title prefix (first hit wins) and
    listed by citation count, highest first. Returns a short message when
    the query is blank or nothing is found.
    """
    if not query or not query.strip():
        return "Enter a title or keywords."
    q = query.strip()

    found = []
    if source_choice in ("arXiv", "Both"):
        found.extend(fetch_arxiv_papers(q, "", int(max_results), 3650,
                                        sort_by="relevance"))
    if source_choice in ("CrossRef", "Both"):
        found.extend(fetch_crossref_papers(q, "", int(max_results), 3650,
                                           use_title=True))
    if not found:
        return "No results for: " + q

    # dicts keep insertion order and setdefault keeps the first paper per key.
    by_title = {}
    for paper in found:
        by_title.setdefault(re.sub(r"\W", "", paper["title"].lower())[:60], paper)
    unique = sorted(by_title.values(),
                    key=lambda x: x.get("citations") or 0, reverse=True)

    NL = "\n"
    chunks = [
        "## Search Results: " + q + NL + NL,
        "**" + str(len(unique)) + " papers found**" + NL + NL + "---" + NL + NL,
    ]
    for rank, paper in enumerate(unique, 1):
        citations = paper.get("citations")
        cit = (" | " + cit_badge(citations)) if citations else ""
        cats = " | ".join(paper.get("categories", [])[:2])
        authors = ", ".join(paper["authors"][:3])
        excerpt = paper["abstract"][:450]
        view_link = "[View](" + paper["url"] + ")"
        pdf_link = (" [PDF](" + paper["pdf_url"] + ")") if paper.get("pdf_url") else ""
        source = paper.get("source", "")
        heading = "### " + str(rank) + ". " + paper["title"]
        meta = (authors + " | " + paper["published"] + cit + " | " + source +
                (" | " + cats if cats else ""))
        chunks.append(heading + NL + NL +
                      meta + NL + NL +
                      "> " + excerpt + "..." + NL + NL +
                      view_link + pdf_link + NL + NL + "---" + NL + NL)
    return "".join(chunks)
|
|
| |
| |
| |
def enrich_citations(papers):
    """Fill in citation counts for arXiv papers, in place.

    Only papers with ``source == "arXiv"`` and no (or zero) citations are
    looked up, in three passes:

    1. Semantic Scholar batch endpoint, 500 ids per request.
    2. Semantic Scholar single-paper lookup for up to 15 papers still at 0.
    3. CrossRef title search for whatever is still at 0; the top hit is
       accepted when at least two of the paper's first five title words
       appear among the hit's first ten.

    Every network failure is treated as "0 citations". On return each
    paper has an integer ``citations`` value. Returns the same list object.
    """
    def base_id(pid):
        # Drop any ".../abs/" prefix and the trailing version suffix, but
        # keep the archive prefix of old-style ids ("hep-th/9901001");
        # taking only the text after the last "/" sent the wrong id.
        return re.sub(r"v\d+$", "", pid.split("/abs/")[-1].strip())

    def fill_missing():
        for p in papers:
            if p.get("citations") is None:
                p["citations"] = 0

    arxiv_papers = [p for p in papers
                    if p.get("source") == "arXiv"
                    and (p.get("citations") is None or p.get("citations") == 0)]
    if not arxiv_papers:
        fill_missing()
        return papers

    # Pass 1: Semantic Scholar batch lookup.
    id_map, batch_ids = {}, []
    for p in arxiv_papers:
        clean = base_id(p["id"])
        id_map[clean] = p
        batch_ids.append("arXiv:" + clean)
    for start in range(0, len(batch_ids), 500):
        try:
            r = requests.post(
                "https://api.semanticscholar.org/graph/v1/paper/batch",
                json={"ids": batch_ids[start:start + 500]},
                params={"fields": "citationCount,externalIds"},
                headers=s2_headers(), timeout=30)
            if r.status_code == 200:
                for item in r.json():
                    if not item:
                        continue  # falsy entries are skipped
                    ext = item.get("externalIds") or {}
                    # "or": a None ArXiv id must not abort the whole batch.
                    clean = base_id(ext.get("ArXiv") or "")
                    if clean and clean in id_map:
                        c = item.get("citationCount")
                        if c is not None:
                            id_map[clean]["citations"] = int(c)
            elif r.status_code == 429:
                time.sleep(4)
        except Exception as e:
            print("S2 batch: " + str(e))

    # Pass 2: single lookups, capped at 15 papers.
    for p in [x for x in arxiv_papers if (x.get("citations") or 0) == 0][:15]:
        clean = base_id(p["id"])
        for attempt in range(2):
            try:
                r = requests.get(
                    "https://api.semanticscholar.org/graph/v1/paper/arXiv:" + clean,
                    params={"fields": "citationCount"},
                    headers=s2_headers(), timeout=10)
                if r.status_code == 200:
                    c = r.json().get("citationCount")
                    p["citations"] = int(c) if c else 0
                    break
                if r.status_code == 429:
                    time.sleep(2 ** attempt)
                    continue
                p["citations"] = 0
                break
            except Exception as e:
                print("S2 single: " + str(e))
                p["citations"] = 0
                break
        time.sleep(0.12)

    # Pass 3: CrossRef title search with a loose title-overlap check.
    for p in [x for x in arxiv_papers if (x.get("citations") or 0) == 0]:
        try:
            r = requests.get("https://api.crossref.org/works",
                             params={"query.title": p["title"], "rows": 1,
                                     "select": "is-referenced-by-count,title"},
                             headers=cr_headers(), timeout=8)
            count = 0
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                if items:
                    found = (items[0].get("title") or [""])[0].lower()
                    query_words = set(p["title"].lower().split()[:5])
                    found_words = set(found.split()[:10])
                    if len(query_words & found_words) >= 2:
                        count = int(items[0].get("is-referenced-by-count", 0) or 0)
            p["citations"] = count
            time.sleep(0.12)
        except Exception as e:
            print("CrossRef citations: " + str(e))
            p["citations"] = 0

    fill_missing()
    return papers
|
|
| |
| |
| |
def build_papers_index(papers):
    """Replace the global ``PAPERS`` list and rebuild ``FAISS_INDEX``.

    Each paper is embedded as ``title + " " + abstract`` with normalised
    float32 vectors and stored in an inner-product index. An empty
    *papers* clears the index.
    """
    global FAISS_INDEX, PAPERS
    PAPERS = papers
    if not papers:
        FAISS_INDEX = None
        return
    corpus = []
    for paper in papers:
        corpus.append(paper["title"] + " " + paper["abstract"])
    vectors = embedder.encode(corpus, convert_to_numpy=True,
                              normalize_embeddings=True)
    vectors = vectors.astype("float32")
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    FAISS_INDEX = index
|
|
def search_papers(query, top_k=5):
    """Semantic search over the indexed papers.

    Returns up to *top_k* dicts ``{"paper": ..., "score": float}`` in the
    order FAISS returns them, keeping only hits with a non-negative index
    and a score above 0.1. Returns ``[]`` when nothing is indexed.
    """
    if FAISS_INDEX is None or not PAPERS:
        return []
    query_vec = embedder.encode([query], convert_to_numpy=True,
                                normalize_embeddings=True)
    query_vec = query_vec.astype("float32")
    limit = min(top_k, len(PAPERS))
    scores, ids = FAISS_INDEX.search(query_vec, limit)
    hits = []
    for score, idx in zip(scores[0], ids[0]):
        if idx >= 0 and float(score) > 0.1:
            hits.append({"paper": PAPERS[idx], "score": float(score)})
    return hits
|
|
| |
| |
| |
def auto_fetch_worker(query, category, interval):
    """Background loop: poll arXiv every *interval* seconds for new papers.

    Runs until ``AUTO_RUNNING`` becomes false. Each cycle fetches up to 30
    papers, compares their ids with the persisted seen-set and, when there
    are new ones, persists the enlarged set and appends a line to
    ``AUTO_LOG`` (trimmed to its 20 most recent entries).

    A failing cycle is reported and skipped. Previously any exception ended
    the thread while ``AUTO_RUNNING`` stayed true, so auto-fetch could not
    be started again.
    """
    global AUTO_RUNNING
    while AUTO_RUNNING:
        time.sleep(interval)
        if not AUTO_RUNNING:  # stopped while sleeping
            break
        try:
            papers = fetch_arxiv_papers(query, category, 30, 1)
            seen = load_seen_ids()
            new_ps = [p for p in papers if p["id"] not in seen]
            if new_ps:
                save_seen_ids(seen | {p["id"] for p in papers})
                AUTO_LOG.append(
                    "[" + datetime.now().strftime("%H:%M") + "] NEW " +
                    str(len(new_ps)) + " — " + query)
                if len(AUTO_LOG) > 20:
                    AUTO_LOG.pop(0)
        except Exception as e:
            print("Auto-fetch error: " + str(e))
|
|
def start_auto_fetch(query, cat_label, interval_min):
    """Start the auto-fetch daemon thread unless one is already running.

    *interval_min* is the polling interval in minutes. Returns a status
    message. Raises whatever ``int(interval_min)`` raises for an invalid
    interval; in that case the running flag is left untouched.
    """
    global AUTO_RUNNING
    if AUTO_RUNNING:
        return "Already running."
    # Build the thread (and validate the interval) BEFORE setting the flag.
    # Previously a bad interval raised after the flag was set, leaving the
    # app stuck on "Already running." with no worker.
    worker = threading.Thread(
        target=auto_fetch_worker,
        args=(query, CATEGORIES.get(cat_label, ""), int(interval_min) * 60),
        daemon=True)
    AUTO_RUNNING = True
    try:
        worker.start()
    except RuntimeError:
        AUTO_RUNNING = False
        raise
    return "Auto-fetch started every " + str(interval_min) + " min for: " + query
|
|
def stop_auto_fetch():
    """Clear the ``AUTO_RUNNING`` flag and return a status message."""
    global AUTO_RUNNING
    AUTO_RUNNING = False
    return "Stopped."
|
|
def get_auto_log():
    """Return the ten most recent auto-fetch log lines, newest first."""
    if not AUTO_LOG:
        return "No log."
    newest_first = AUTO_LOG[-10:][::-1]
    return "\n\n".join(newest_first)
|
|
| |
| |
| |
def analyze_trends(papers):
    """Plot a 2x3 dashboard of trends and return ``(png_path, stats_md)``.

    Panels: papers per month, top title keywords, source distribution,
    top-10 cited papers, citation histogram, top authors. The image is
    written to ``PERSIST_DIR/trends.png``. Returns ``(None, "No papers.")``
    for an empty input.
    """
    if not papers:
        return None, "No papers."

    # ---- aggregate ---------------------------------------------------
    date_counts = Counter(p["published"][:7] for p in papers if p["published"] != "N/A")
    stopwords = {"the","a","an","of","in","for","on","with","and","or","to","using",
                 "based","via","from","by","is","are","our","we","this","that","which",
                 "towards","approach","method","new","into","over","learning","deep",
                 "model","models","data","neural","large","language","paper","study",
                 "analysis","results","show","also","can","used","two","its","their"}
    all_words = [w.lower() for p in papers
                 for w in re.findall(r"[a-zA-Z]{4,}", p["title"])
                 if w.lower() not in stopwords]
    top_words = Counter(all_words).most_common(15)
    sources = Counter(p.get("source", "arXiv") for p in papers)
    cit_papers = [p for p in papers if (p.get("citations") or 0) > 0]
    top_cited = sorted(cit_papers, key=lambda x: x["citations"], reverse=True)[:10]
    all_auth = [a for p in papers for a in p["authors"][:3]]
    top_authors = Counter(all_auth).most_common(10)
    cvals = [p["citations"] for p in cit_papers]
    # The last bucket is open-ended. It used to stop at 10000, so papers
    # with >= 10000 citations were missing from the "500+" bar.
    # NOTE(review): cvals only holds positive counts, so the "0" bar is
    # always empty; confirm whether uncited papers should be counted there.
    buckets = [0, 1, 5, 10, 50, 100, 500, float("inf")]
    blabels = ["0", "1-4", "5-9", "10-49", "50-99", "100-499", "500+"]
    bcounts = [sum(1 for c in cvals if lo <= c < hi)
               for lo, hi in zip(buckets, buckets[1:])]
    avg_cit = round(sum(cvals) / len(cvals), 1) if cvals else 0
    total_cit = sum(p.get("citations") or 0 for p in papers)

    # ---- plot --------------------------------------------------------
    C = ["#3b82f6","#8b5cf6","#10b981","#f59e0b","#ef4444","#06b6d4",
         "#ec4899","#14b8a6","#f97316","#a855f7","#22d3ee","#84cc16",
         "#fbbf24","#34d399","#f87171"]
    BG, PNL, BR, W = "#0f172a", "#1e293b", "#334155", "white"

    def style(ax):
        ax.set_facecolor(PNL)
        for sp in ax.spines.values():
            sp.set_edgecolor(BR)
        ax.tick_params(colors=W, labelsize=8)

    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    try:
        fig.patch.set_facecolor(BG)
        fig.suptitle("Research Trends", color=W, fontsize=16, fontweight="bold", y=1.01)

        # Papers per month, with a linear trend line once there are > 2 months.
        ax = axes[0, 0]; style(ax)
        if date_counts:
            ms, cs = zip(*sorted(date_counts.items()))
            ms, cs = list(ms), list(cs)
            bars = ax.bar(ms, cs, color=C[0], edgecolor="#60a5fa", lw=0.8)
            for b, c in zip(bars, cs):
                ax.text(b.get_x() + b.get_width()/2, b.get_height() + .05, str(c),
                        ha="center", va="bottom", color=W, fontsize=8)
            if len(cs) > 2:
                z = np.polyfit(range(len(cs)), cs, 1)
                ax.plot(ms, np.poly1d(z)(range(len(cs))), "--",
                        color="#f59e0b", lw=1.5, alpha=.8, label="Trend")
                ax.legend(fontsize=8, facecolor=PNL, labelcolor=W)
        ax.set_title("Papers per Month", color=W, fontsize=12, fontweight="bold", pad=10)
        ax.set_ylabel("Count", color=W, fontsize=9)
        ax.tick_params(rotation=45)

        # Top keywords from titles.
        ax = axes[0, 1]; style(ax)
        if top_words:
            wds, wcts = zip(*top_words)
            ax.barh(list(wds), list(wcts), color=C[:len(wds)], edgecolor="#475569", lw=.6)
            for b, c in zip(ax.patches, wcts):
                ax.text(b.get_width() + .1, b.get_y() + b.get_height()/2, str(c),
                        va="center", color=W, fontsize=8)
        ax.set_title("Top Keywords", color=W, fontsize=12, fontweight="bold", pad=10)
        ax.set_xlabel("Frequency", color=W, fontsize=9)

        # Source distribution pie.
        ax = axes[0, 2]; ax.set_facecolor(PNL)
        if sources:
            sl, sv = zip(*sources.items())
            _, txts, ats = ax.pie(sv, labels=sl, autopct="%1.0f%%",
                                  colors=C[:len(sl)], startangle=90,
                                  textprops={"color": W, "fontsize": 10},
                                  wedgeprops={"edgecolor": BR, "linewidth": 1.5})
            for at in ats:
                at.set_color(W); at.set_fontsize(9)
        ax.set_title("Source Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

        # Top 10 cited papers (titles shortened to 35 characters).
        ax = axes[1, 0]; style(ax)
        if top_cited:
            lbls = [(p["title"][:35] + "..." if len(p["title"]) > 35 else p["title"])
                    for p in top_cited]
            cv = [p["citations"] for p in top_cited]
            ax.barh(lbls[::-1], cv[::-1], color=C[1], edgecolor="#475569", lw=.6)
            mx = max(cv) if cv else 1
            for b, c in zip(ax.patches, cv[::-1]):
                ax.text(b.get_width() + mx*.01, b.get_y() + b.get_height()/2,
                        "{:,}".format(c), va="center", color=W, fontsize=8)
            ax.set_xlabel("Citations", color=W, fontsize=9)
        else:
            ax.text(.5, .5, "No citation data", ha="center", va="center",
                    color="#94a3b8", fontsize=11, transform=ax.transAxes)
        ax.set_title("Top 10 Cited", color=W, fontsize=12, fontweight="bold", pad=10)

        # Citation histogram.
        ax = axes[1, 1]; style(ax)
        if any(bcounts):
            ax.bar(blabels, bcounts, color=C[2], edgecolor="#475569", lw=.8)
            for b, c in zip(ax.patches, bcounts):
                if c > 0:
                    ax.text(b.get_x() + b.get_width()/2, b.get_height() + .1, str(c),
                            ha="center", va="bottom", color=W, fontsize=9)
            ax.set_xlabel("Citation Range", color=W, fontsize=9)
            ax.set_ylabel("Papers", color=W, fontsize=9)
            ax.annotate("Avg " + str(avg_cit) + " | Total " + "{:,}".format(total_cit),
                        xy=(.98, .96), xycoords="axes fraction",
                        ha="right", va="top", color="#94a3b8", fontsize=8)
        else:
            ax.text(.5, .5, "No citation data", ha="center", va="center",
                    color="#94a3b8", fontsize=11, transform=ax.transAxes)
        ax.set_title("Citation Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

        # Top authors (first three authors of each paper are counted).
        ax = axes[1, 2]; style(ax)
        if top_authors:
            an, ac = zip(*top_authors)
            ax.barh(list(an)[::-1], list(ac)[::-1], color=C[3], edgecolor="#475569", lw=.6)
            for b, c in zip(ax.patches, list(ac)[::-1]):
                ax.text(b.get_width() + .05, b.get_y() + b.get_height()/2, str(c),
                        va="center", color=W, fontsize=8)
            ax.set_xlabel("Papers", color=W, fontsize=9)
        ax.set_title("Top Authors", color=W, fontsize=12, fontweight="bold", pad=10)

        fig.tight_layout(pad=3)
        path = PERSIST_DIR + "/trends.png"
        fig.savefig(path, bbox_inches="tight", dpi=150, facecolor=BG)
    finally:
        # Close this specific figure even when plotting fails, rather than
        # relying on pyplot's notion of the "current" figure.
        plt.close(fig)

    # ---- stats markdown ------------------------------------------------
    stats = ("### Stats\n\n| Metric | Value |\n|---|---|\n" +
             "| Total | **" + str(len(papers)) + "** |\n" +
             "| New | **" + str(sum(1 for p in papers if p.get("recent"))) + "** |\n" +
             "| Citations | **" + "{:,}".format(total_cit) + "** |\n" +
             "| Average | **" + str(avg_cit) + "** |\n\n")
    top5 = top_cited[:5]
    if top5:
        stats += "### Top Cited\n\n"
        for i, p in enumerate(top5, 1):
            stats += (str(i) + ". [" + p["title"] + "](" + p["url"] + ")" +
                      " — **" + "{:,}".format(p["citations"]) + "**\n\n")
    return path, stats
|
|
| |
| |
| |
def _llm(messages, max_tokens=1200):
    """Send *messages* to the Groq chat model and return the stripped reply.

    Never raises: any failure is returned as ``"LLM Error: <details>"``.
    """
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.3,
            max_tokens=max_tokens,
        )
        reply = response.choices[0].message.content
        return reply.strip()
    except Exception as err:
        return "LLM Error: " + str(err)
|
|
def explain_paper(paper, lang="ar"):
    """Ask the LLM for a sectioned explanation of *paper*.

    With ``lang == "ar"`` the prompt is Arabic (system message carries
    ``AR_RULES``) and the reply is passed through ``fix_ar_format``; any
    other value uses the English prompt and returns the reply as-is.
    """
    cit = paper.get("citations", "N/A")

    if lang != "ar":
        english_prompt = (
            "Explain:\nTitle: " + paper["title"] + "\nAuthors: " +
            ", ".join(paper["authors"][:3]) + "\nDate: " + paper["published"] +
            " | Citations: " + str(cit) + "\nAbstract: " + paper["abstract"] + "\n\n" +
            "## Topic\n## Problem\n## Methodology\n## Findings\n## Contribution\n## Applications"
        )
        return _llm([{"role": "user", "content": english_prompt}])

    system_prompt = "أنت خبير أكاديمي يشرح الأبحاث بالعربية الفصحى.\n" + AR_RULES
    arabic_prompt = (
        "اشرح الورقة:\nالعنوان: " + paper["title"] + "\n" +
        "المؤلفون: " + ", ".join(paper["authors"][:3]) + "\n" +
        "التاريخ: " + paper["published"] + " | الاقتباسات: " + str(cit) + "\n" +
        "الملخص: " + paper["abstract"] + "\n\n" +
        "## موضوع الورقة\n\n## المشكلة\n\n## المنهجية\n\n" +
        "## النتائج\n\n## الأهمية\n\n## التطبيقات"
    )
    reply = _llm([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": arabic_prompt},
    ])
    return fix_ar_format(reply)
|
|
def compare_papers(pa, pb, lang="ar"):
    """Ask the LLM to compare two papers section by section.

    Each paper contributes its title, citation count and the first 500
    characters of its abstract. Arabic replies go through ``fix_ar_format``.
    """
    def describe(label, paper):
        return ("Paper " + label + ": " + paper["title"] +
                " | Citations: " + str(paper.get("citations", "N/A")) +
                "\n" + paper["abstract"][:500])

    body = describe("A", pa) + "\n\n" + describe("B", pb)

    if lang != "ar":
        english_prompt = (
            "Compare:\n" + body + "\n\n" +
            "## Topic\n## Methodology\n## Results\n## Strengths\n## Limits\n## Verdict"
        )
        return _llm([{"role": "user", "content": english_prompt}], 1400)

    arabic_prompt = (
        "قارن بين الورقتين.\n" + AR_RULES + "\n\n" + body + "\n\n" +
        "## الهدف\n\n## المنهجية\n\n## النتائج\n\n" +
        "## القوة\n\n## القيود\n\n## الخلاصة"
    )
    return fix_ar_format(_llm([{"role": "user", "content": arabic_prompt}], 1400))
|
|
def summarize_papers(papers, topic, lang="ar"):
    """Ask the LLM for an academic overview of *topic*.

    Only the first eight papers are sent, each as a numbered line with its
    date and the first 300 characters of its abstract. Arabic replies go
    through ``fix_ar_format``.
    """
    digest_lines = []
    for number, paper in enumerate(papers[:8], 1):
        digest_lines.append(
            str(number) + ". " + paper["title"] + " (" + paper["published"] + "): " +
            paper["abstract"][:300] + "...\n\n")
    text = "".join(digest_lines)

    if lang != "ar":
        english_prompt = (
            "Academic overview of \"" + topic + "\":\n" + text + "\n\n" +
            "## Trends\n## Key Papers\n## Themes\n## Gaps"
        )
        return _llm([{"role": "user", "content": english_prompt}], 900)

    arabic_prompt = (
        "نظرة عامة أكاديمية حول \"" + topic + "\".\n" + AR_RULES +
        "\n\n" + text + "\n\n" +
        "## الاتجاهات\n\n## أبرز الأوراق\n\n" +
        "## المواضيع المشتركة\n\n## الفجوات"
    )
    return fix_ar_format(_llm([{"role": "user", "content": arabic_prompt}], 900))
|
|
def generate_bibliography(papers, style="APA"):
    """Format *papers* as a bibliography and save it as a text file.

    *style* is ``"APA"``, ``"IEEE"`` or ``"Chicago"``; any other value
    produces BibTeX ``@article`` entries. Returns ``(text, file_path)``.

    BibTeX author lists are joined with " and " (BibTeX's name separator)
    rather than commas, and "et al." becomes "and others". Non-word
    characters in *style* are dropped from the file name so it stays
    inside ``PERSIST_DIR``.
    """
    entries = []
    for i, p in enumerate(papers, 1):
        authors = p["authors"]
        auth = ", ".join(authors[:6]) + (" et al." if len(authors) > 6 else "")
        year = p["published"][:4] if p["published"] not in ("N/A", "") else "n.d."
        t, u = p["title"], p["url"]
        if style == "APA":
            entries.append(str(i) + ". " + auth + " (" + year + "). *" + t + "*. " + u)
        elif style == "IEEE":
            ae = " and ".join(authors[:3]) + (" et al." if len(authors) > 3 else "")
            entries.append("[" + str(i) + "] " + ae + ', "' + t + '," ' + year + ". [Online]: " + u)
        elif style == "Chicago":
            entries.append(str(i) + ". " + auth + '. "' + t + '." (' + year + "). " + u)
        else:
            # Citation key: last word of the first author's name + year + index.
            key = re.sub(r"\W", "", (authors[0].split()[-1]
                                     if authors else "Auth")) + year
            bib_auth = " and ".join(authors[:6]) + (" and others" if len(authors) > 6 else "")
            entries.append("@article{" + key + str(i) + ",\n title={" + t +
                           "},\n author={" + bib_auth + "},\n year={" + year +
                           "},\n url={" + u + "}\n}")
    bib = "\n\n".join(entries)
    path = PERSIST_DIR + "/bibliography_" + re.sub(r"\W", "", str(style)) + ".txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(bib)
    return bib, path
|
|
def chat_about_papers(question, history):
    """Answer *question* using the most relevant indexed papers as context.

    The reply language follows the question's detected language. Up to four
    papers from ``search_papers`` are added as context and the last four
    history turns (dicts with ``role`` / ``content``) are replayed.
    *history* may be ``None`` or empty.
    """
    lang = detect_lang(question)  # detected once and reused
    if not PAPERS:
        return "يرجى جلب الأوراق أولاً." if lang == "ar" else "Fetch papers first."

    relevant = search_papers(question, top_k=4)
    context = ""
    if relevant:
        context = ("الأوراق ذات الصلة:\n\n" if lang == "ar" else "Relevant papers:\n\n")
        for r in relevant:
            p = r["paper"]
            cit = (" | " + str(p["citations"]) + " citations") if p.get("citations") else ""
            context += ("**" + p["title"] + "** (" + p["published"] + ")" + cit +
                        "\n" + p["abstract"][:400] + "\n🔗 " + p["url"] + "\n\n")

    sys_msg = (("أنت مساعد بحثي. أجب بالعربية الفصحى.\n" + AR_RULES) if lang == "ar"
               else "You are an academic assistant. Answer in English.")
    msgs = [{"role": "system", "content": sys_msg}]
    for turn in (history or [])[-4:]:
        msgs.append({"role": turn["role"], "content": turn["content"]})

    # The question label matches the prompt language; the Arabic label used
    # to be sent with English prompts as well.
    label = "\nسؤال: " if lang == "ar" else "\nQuestion: "
    msgs.append({"role": "user",
                 "content": (context + label + question) if context else question})
    out = _llm(msgs, 800)
    return fix_ar_format(out) if lang == "ar" else out
|
|
def text_to_audio(text, lang="ar"):
    """Synthesize speech for *text* with gTTS; return the MP3 path.

    The text is first flattened with ``clean_md``. Returns ``None`` when
    nothing is left to speak or synthesis fails. The file name depends only
    on *lang*, so each call overwrites the previous file for that language.
    """
    spoken = clean_md(text)
    if not spoken:
        return None
    try:
        speech = gTTS(text=spoken, lang=lang, slow=False)
        mp3_path = PERSIST_DIR + "/audio_" + lang + ".mp3"
        speech.save(mp3_path)
    except Exception as e:
        print("TTS: " + str(e))
        return None
    return mp3_path
|
|
| |
| |
| |
def gr_fetch(query, category_label, max_results, days_back, source_choice,
             progress=gr.Progress()):
    """Gradio handler: fetch, de-duplicate, enrich and index papers.

    Returns six values: the result Markdown, four identical dropdown
    updates, and a short status string. Updates the global
    ``ACTIVE_PAPERS`` and, via ``build_papers_index``, the search index.
    """
    global ACTIVE_PAPERS
    progress(0.05, desc="Connecting...")
    collected = []
    warn = ""
    if source_choice in ("arXiv", "Both"):
        progress(0.15, desc="Fetching arXiv...")
        collected.extend(fetch_arxiv_papers(
            query, CATEGORIES.get(category_label, ""),
            int(max_results), int(days_back),
            sort_by="submittedDate"))
    if source_choice in ("CrossRef", "Both"):
        progress(0.35, desc="Fetching CrossRef...")
        crossref_hits = fetch_crossref_papers(
            query, category_label, int(max_results), int(days_back))
        if not crossref_hits:
            warn = "\n\n> CrossRef: no results."
        collected.extend(crossref_hits)

    # De-duplicate on a normalised title prefix; the first occurrence wins.
    by_title = {}
    for paper in collected:
        by_title.setdefault(re.sub(r"\W", "", paper["title"].lower())[:60], paper)
    papers = list(by_title.values())

    if not papers:
        def cleared():
            return gr.update(choices=[], value=None)
        return ("No results." + warn,
                cleared(), cleared(), cleared(), cleared(),
                "0 papers")

    progress(0.60, desc="Fetching citations...")
    papers = enrich_citations(papers)
    progress(0.85, desc="FAISS indexing...")
    build_papers_index(papers)
    ACTIVE_PAPERS = list(papers)

    table, choices = build_table(papers)
    recent_count = 0
    total_citations = 0
    uncited_count = 0
    for paper in papers:
        if paper.get("recent"):
            recent_count += 1
        count = paper.get("citations") or 0
        total_citations += count
        if count == 0:
            uncited_count += 1

    if uncited_count:
        note = "\n\n> " + str(uncited_count) + " papers with 0 citations (new/unindexed)."
    else:
        note = ""
    md = ("## Fetched **" + str(len(papers)) + "** papers\n\n" +
          "New: **" + str(recent_count) + "** | Citations: **" +
          "{:,}".format(total_citations) + "**" + warn + note +
          "\n\n---\n\n" + table)
    dropdown = gr.update(choices=choices, value=choices[0] if choices else None)
    progress(1.0)
    status = str(len(papers)) + " papers | " + "{:,}".format(total_citations) + " cit."
    return md, dropdown, dropdown, dropdown, dropdown, status
|
|
def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
    """Filter the fetched papers by year and citation range, then sort them.

    Reads the module-level ``PAPERS`` and stores the surviving papers in
    ``ACTIVE_PAPERS`` (an empty list when nothing matches).

    Args:
        year_from, year_to: inclusive publication-year bounds.
        cit_min, cit_max: inclusive citation-count bounds.
        sort_by: one of "Newest", "Oldest", "Most Cited", "Least Cited";
            any other value leaves the order unchanged.

    Returns:
        A 3-tuple: results markdown, a dropdown update, and a status string.
    """
    global ACTIVE_PAPERS
    if not PAPERS:
        return "Fetch papers first.", gr.update(), "0"

    # The bounds do not change per paper, so convert them once.
    first_year, last_year = int(year_from), int(year_to)
    fewest, most = int(cit_min), int(cit_max)

    def _year_ok(paper):
        # Papers whose date is missing or unparsable are kept (best effort);
        # only the parsing errors themselves are tolerated, nothing broader.
        try:
            year = int(paper["published"][:4])
        except (KeyError, TypeError, ValueError):
            return True
        return first_year <= year <= last_year

    def _citations(paper):
        return paper.get("citations") or 0

    def _published(paper):
        # Undated papers were let through above, so sorting must not index
        # "published" directly; they sort as the empty string.
        return paper.get("published") or ""

    # NOTE(review): cit_max is a hard upper bound, so papers cited more often
    # than the caller's maximum are excluded even at the UI default; confirm
    # that this is intended.
    filtered = [paper for paper in PAPERS
                if _year_ok(paper) and fewest <= int(_citations(paper)) <= most]

    orderings = {
        "Newest": (_published, True),
        "Oldest": (_published, False),
        "Most Cited": (_citations, True),
        "Least Cited": (_citations, False),
    }
    if sort_by in orderings:
        sort_key, descending = orderings[sort_by]
        filtered.sort(key=sort_key, reverse=descending)

    if not filtered:
        ACTIVE_PAPERS = []
        return "No matching papers.", gr.update(choices=[], value=None), "0"

    ACTIVE_PAPERS = list(filtered)
    tbl, choices = build_table(filtered)
    tot = sum(_citations(paper) for paper in filtered)
    ratio = str(len(filtered)) + "/" + str(len(PAPERS))
    md = ("## " + ratio + " papers" +
          " | " + str(year_from) + "-" + str(year_to) +
          " | cit " + str(cit_min) + "-" + str(cit_max) +
          " | total " + "{:,}".format(tot) + "\n\n---\n\n" + tbl)
    selector = gr.update(choices=choices, value=choices[0] if choices else None)
    return md, selector, ratio
|
|
def gr_search_fetched(query):
    """Run a semantic search over the loaded papers and render it as markdown.

    Returns a prompt string when *query* is empty or blank, when no papers
    are loaded, or when ``search_papers`` returns nothing. Otherwise returns
    one markdown section per hit (up to 8 are requested), showing the score
    as a percentage, up to two authors, date, citation badge, source, an
    abstract excerpt and links.
    """
    if not query or not query.strip():
        return "Enter a query."
    if not PAPERS:
        return "Fetch papers first."
    results = search_papers(query.strip(), top_k=8)
    if not results:
        return "No results for: " + query

    NL = "\n"
    sections = ["## Search: " + query + " — " + str(len(results)) +
                " results" + NL + NL]
    for hit in results:
        paper, score = hit["paper"], hit["score"]
        citations = paper.get("citations")
        cit = (" | " + cit_badge(citations)) if citations else ""
        link = "[View](" + paper["url"] + ")"
        pdf = (" [PDF](" + paper["pdf_url"] + ")") if paper.get("pdf_url") else ""
        abstract = paper["abstract"]
        # Add the ellipsis only when the abstract was actually cut short.
        excerpt = abstract[:350] + ("..." if len(abstract) > 350 else "")
        sections.append(
            "### " + "{:.0f}".format(score * 100) + "% — " + paper["title"] + NL + NL +
            ", ".join(paper["authors"][:2]) + " | " + paper["published"] + cit +
            " | " + paper.get("source", "") + NL + NL +
            "> " + excerpt + NL + NL +
            link + pdf + NL + NL + "---" + NL + NL)
    return "".join(sections)
|
|
def _get_paper(choice):
    """Resolve a dropdown label such as ``"3. Title"`` to its paper dict.

    The number before the first ``.`` is a 1-based position in
    ``ACTIVE_PAPERS`` (or in ``PAPERS`` when no filtered list is active).

    Returns ``None`` when the label is not a string, has no leading integer,
    or the position is outside the pool.
    """
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    try:
        position = int(choice.split(".")[0]) - 1
    except (AttributeError, TypeError, ValueError):
        return None
    # Explicit bounds check: a label numbered 0 or below must not wrap around
    # to the end of the list through negative indexing.
    if not 0 <= position < len(pool):
        return None
    return pool[position]
|
|
def gr_explain(choice, lang_choice):
    """Build the Explain-tab markdown for the selected paper.

    The output is a header (title, authors, date, citation badge, source,
    links and the abstract as a quote) followed by the text returned by
    ``explain_paper``. The language is ``"ar"`` when *lang_choice* contains
    "Arabic", otherwise ``"en"``.
    """
    if not choice:
        return "Fetch papers and select one."
    paper = _get_paper(choice)
    if not paper:
        return "Selection error."

    lang = "en"
    if "Arabic" in lang_choice:
        lang = "ar"

    pdf_link = ""
    if paper.get("pdf_url"):
        pdf_link = " [PDF](" + paper["pdf_url"] + ")"

    # Header sections, separated (and terminated) by a blank line.
    sections = [
        "# " + paper["title"],
        "**Authors:** " + ", ".join(paper["authors"]),
        ("**Date:** " + paper["published"] +
         " | **Citations:** " + cit_badge(paper.get("citations")) +
         " | **Source:** " + paper.get("source", "arXiv")),
        "[View Paper](" + paper["url"] + ")" + pdf_link,
        "---",
        "> " + paper["abstract"],
        "---",
        "## Explanation (Llama 3.3 70B)",
    ]
    header = "\n\n".join(sections) + "\n\n"
    return header + explain_paper(paper, lang)
|
|
def gr_audio(txt, lang_choice):
    """Turn the explanation text into speech via ``text_to_audio``.

    Returns ``None`` without synthesizing when *txt* is empty or shorter
    than 50 characters. Arabic is used when *lang_choice* contains "Arabic",
    English otherwise.
    """
    too_short = (not txt) or len(txt) < 50
    if too_short:
        return None
    voice = "ar" if "Arabic" in lang_choice else "en"
    return text_to_audio(txt, voice)
|
|
def gr_save_fav(choice):
    """Save the selected paper via ``save_favorite`` and return its status.

    Returns a prompt when nothing is selected and ``"Error."`` when the
    selection cannot be resolved to a paper.
    """
    if not choice:
        return "Select a paper first."
    paper = _get_paper(choice)
    if not paper:
        return "Error."
    return save_favorite(paper)
|
|
def gr_show_favs():
    """Render the papers returned by ``load_favorites`` as markdown.

    Each entry shows the bold title, then the first author (or "N/A"), date,
    source, citation badge and a link. Entries are separated by horizontal
    rules. Returns ``"No saved papers."`` when there are none.
    """
    favs = load_favorites()
    if not favs:
        return "No saved papers."

    entries = []
    for paper in favs:
        heading = "**" + paper["title"] + "**"
        lead_author = paper["authors"][0] if paper["authors"] else "N/A"
        details = " | ".join([
            lead_author,
            paper["published"],
            paper.get("source", ""),
            cit_badge(paper.get("citations")),
        ])
        entries.append(heading + "\n" + details +
                       " | [Link](" + paper["url"] + ")")

    title_line = "### Favorites — " + str(len(favs)) + " papers"
    return title_line + "\n\n" + "\n\n---\n\n".join(entries)
|
|
def gr_compare(ca, cb, lc):
    """Compare the two selected papers via ``compare_papers``.

    *ca* and *cb* are dropdown labels; *lc* is the language choice ("ar" when
    it contains "Arabic", otherwise "en"). Returns a prompt or error string
    when a selection is missing, unresolvable, or both labels resolve to the
    same paper id.
    """
    if not (ca and cb):
        return "Select two papers."
    first, second = _get_paper(ca), _get_paper(cb)
    if not (first and second):
        return "Selection error."
    if first["id"] == second["id"]:
        return "Select two different papers."
    lang = "ar" if "Arabic" in lc else "en"
    return compare_papers(first, second, lang)
|
|
def gr_overview(query, lc):
    """Return an "Overview" markdown report for the current paper pool.

    The pool is ``ACTIVE_PAPERS`` when non-empty, otherwise ``PAPERS``. An
    empty *query* is replaced by "research" before calling
    ``summarize_papers``.
    """
    if not PAPERS:
        return "Fetch papers first."
    pool = ACTIVE_PAPERS or PAPERS
    topic = query or "research"
    lang = "ar" if "Arabic" in lc else "en"
    return "## Overview\n\n" + summarize_papers(pool, topic, lang)
|
|
def gr_trends():
    """Run ``analyze_trends`` on the current pool, or prompt to fetch first.

    Returns whatever ``analyze_trends`` returns; with no papers loaded it
    returns ``(None, "Fetch papers first.")``.
    """
    if PAPERS:
        return analyze_trends(ACTIVE_PAPERS or PAPERS)
    return None, "Fetch papers first."
|
|
def gr_bib(style, progress=gr.Progress()):
    """Generate a bibliography for the current pool in the given *style*.

    Returns ``(markdown, path)``: the markdown is a fenced code block holding
    at most the first 3000 characters of the bibliography (with "..." when
    cut), and *path* is the second value from ``generate_bibliography``.
    """
    if not PAPERS:
        return "Fetch papers first.", None
    progress(0.5, desc="Generating...")
    text, path = generate_bibliography(ACTIVE_PAPERS or PAPERS, style)
    progress(1.0)
    preview = text if len(text) <= 3000 else text[:3000] + "..."
    return "```\n" + preview + "\n```", path
|
|
def gr_chat_fn(message, history):
    """Answer *message* with ``chat_about_papers`` and extend the chat history.

    Args:
        message: the user's question; ``None`` or blank text is ignored.
        history: prior ``(user, assistant)`` pairs; ``None`` is treated as an
            empty history.

    Returns:
        ``(history, "")``: the updated history (a new list, so the caller's
        list is not mutated) and an empty string that clears the input box.
    """
    turns = list(history or [])
    if not message or not message.strip():
        return turns, ""

    # Flatten the pairs into role/content messages, skipping empty sides.
    context = []
    for pair in turns:
        if pair[0]:
            context.append({"role": "user", "content": pair[0]})
        if pair[1]:
            context.append({"role": "assistant", "content": pair[1]})

    turns.append((message, chat_about_papers(message, context)))
    return turns, ""
|
|
| |
| |
| |
# Custom stylesheet handed to gr.Blocks(css=CSS): hides the footer, centers
# h1, and styles the status bar, citation legend, filter box and the
# global-search box (classes referenced through elem_classes in the UI).
CSS = """
footer{display:none!important}
h1{text-align:center}
.status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
.legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;
border-radius:8px;padding:6px 14px;margin-bottom:6px}
.filter-box{background:#1e293b;border-radius:10px;
padding:12px 16px;margin-top:8px}
.gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;
margin-bottom:10px;border:1px solid #334155}
"""
|
|
# Gradio UI: builds every tab, then wires the buttons to the gr_* handlers.
# Event listeners must be registered inside the Blocks context.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
    title="Scientific Paper Discovery v7.4", css=CSS
) as demo:

    gr.Markdown("# Scientific Paper Discovery v7.4\narXiv · CrossRef · Llama-3.3-70B · FAISS")
    gr.Markdown("Citations: 🥇 >=1000 | 🏆 >=100 | ⭐ >=10 | 📄 <10 | · = 0",
                elem_classes="legend")
    status_bar = gr.Markdown("No papers loaded yet.", elem_classes="status-bar")

    with gr.Tabs():

        # Tab: fetch, filter/sort and semantically search loaded papers.
        with gr.Tab("Browse / Search"):
            with gr.Row():
                with gr.Column(scale=3):
                    t_query = gr.Textbox(label="Topic",
                                         placeholder="ARIMA, inflation, LLM...",
                                         value="economic forecasting")
                    t_category = gr.Dropdown(label="Category",
                                             choices=list(CATEGORIES.keys()),
                                             value="📊 Economics")
                    t_source = gr.Radio(label="Source",
                                        choices=["arXiv", "CrossRef", "Both"],
                                        value="arXiv")
                with gr.Column(scale=1):
                    t_max = gr.Slider(5, 50, value=15, step=5, label="Max papers")
                    t_days = gr.Slider(1, 1500, value=365, step=30, label="Last N days")
                    btn_fetch = gr.Button("Fetch Papers", variant="primary", size="lg")
            papers_table_md = gr.Markdown("Results appear here.")
            paper_selector = gr.Dropdown(label="Select paper", choices=[], interactive=True)
            with gr.Group(elem_classes="filter-box"):
                gr.Markdown("### Filter & Sort")
                # Keep the year range open-ended as time passes instead of
                # freezing it at 2026; never lower than the original ceiling.
                year_cap = max(2026, CURRENT_YEAR)
                with gr.Row():
                    f_year_from = gr.Slider(2000, year_cap, value=2020, step=1, label="Year from")
                    f_year_to = gr.Slider(2000, year_cap, value=year_cap, step=1, label="Year to")
                with gr.Row():
                    f_cit_min = gr.Slider(0, 5000, value=0, step=5, label="Citations min")
                    f_cit_max = gr.Slider(0, 5000, value=5000, step=5, label="Citations max")
                with gr.Row():
                    f_sort = gr.Dropdown(choices=SORT_CHOICES,
                                         value="Most Cited", label="Sort", scale=3)
                    btn_filter = gr.Button("Apply", variant="primary", scale=1)
            gr.Markdown("---\n### Semantic Search (FAISS — in loaded papers)")
            with gr.Row():
                search_in_box = gr.Textbox(label="Search in loaded papers",
                                           placeholder="ARIMA, transformer...", scale=5)
                btn_search_in = gr.Button("Search", scale=1)
            search_in_out = gr.Markdown()

        # Tab: one-off search by title or keywords.
        with gr.Tab("Global Search"):
            gr.Markdown(
                "### Search any paper by title or keywords\n\n"
                "> Uses arXiv **relevance** sort + CrossRef **title** search.\n"
                "> Example: `Attention is All You Need`"
            )
            with gr.Group(elem_classes="gs-box"):
                with gr.Row():
                    gs_query = gr.Textbox(
                        label="Title or keywords",
                        placeholder="Attention is All You Need | ARIMA forecasting ...",
                        scale=4)
                    gs_source = gr.Radio(label="Source",
                                         choices=["arXiv", "CrossRef", "Both"],
                                         value="Both", scale=2)
                    gs_max = gr.Slider(5, 30, value=10, step=5, label="Max results", scale=1)
                btn_gs = gr.Button("Search Now", variant="primary", size="lg")
            gs_out = gr.Markdown("Enter a title or keywords...")

        # Tab: explanation, audio, favorites and PDF export for one paper.
        with gr.Tab("Explain"):
            with gr.Row():
                paper_sel2 = gr.Dropdown(label="Select paper",
                                         choices=[], interactive=True, scale=4)
                lang_exp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            with gr.Row():
                btn_explain = gr.Button("Explain", variant="primary")
                btn_fav = gr.Button("Save Fav")
                btn_audio = gr.Button("Listen")
                btn_export_pdf = gr.Button("Export PDF", variant="secondary")
            with gr.Row():
                fav_status = gr.Markdown()
                pdf_status = gr.Markdown()
            explanation_out = gr.Markdown("Fetch papers and select one.")
            audio_out = gr.Audio(label="Audio", type="filepath")
            pdf_out = gr.File(label="Download PDF")

        # Tab: comparison of two selected papers.
        with gr.Tab("Compare"):
            with gr.Row():
                cmp_a = gr.Dropdown(label="Paper A", choices=[], interactive=True)
                cmp_b = gr.Dropdown(label="Paper B", choices=[], interactive=True)
                lang_cmp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            btn_compare = gr.Button("Compare", variant="primary")
            compare_out = gr.Markdown("Select two papers.")

        # Tab: chat about the loaded papers.
        with gr.Tab("Chat"):
            chatbot_ui = gr.Chatbot(label="Research Assistant",
                                    height=480, bubble_full_width=False)
            with gr.Row():
                chat_in = gr.Textbox(label="Question", scale=5,
                                     placeholder="Key findings? | ما أبرز النتائج؟")
                btn_send = gr.Button("Send", variant="primary", scale=1)
            btn_clear = gr.Button("Clear", size="sm")

        # Tab: summary report over the current pool.
        with gr.Tab("Overview"):
            with gr.Row():
                lang_ov = gr.Radio(LANG_CHOICES, value="Arabic",
                                   label="Language", scale=1)
                btn_overview = gr.Button("Generate Report", variant="primary", scale=3)
            overview_out = gr.Markdown("Fetch papers first.")

        # Tab: trend chart and statistics.
        with gr.Tab("Trends"):
            btn_trends = gr.Button("Analyze Trends", variant="primary", size="lg")
            trend_chart = gr.Image(label="Trends Dashboard", type="filepath")
            trend_stats = gr.Markdown("Fetch papers first.")

        # Tab: bibliography export.
        with gr.Tab("Bibliography"):
            bib_style = gr.Radio(["APA", "IEEE", "Chicago", "BibTeX"],
                                 value="APA", label="Style")
            btn_bib = gr.Button("Generate Bibliography", variant="primary")
            bib_out = gr.Markdown()
            bib_file = gr.File(label="Download")

        # Tab: saved papers and CSV export.
        with gr.Tab("Favorites"):
            btn_show_fav = gr.Button("Show Favorites")
            favs_md = gr.Markdown("Press to show.")
            btn_export_fav = gr.Button("Export CSV", variant="secondary")
            fav_csv_file = gr.File(label="CSV File")

        # Tab: periodic background fetching.
        with gr.Tab("Auto-Fetch"):
            with gr.Row():
                auto_q = gr.Textbox(label="Topic",
                                    value="economic forecasting", scale=3)
                auto_cat = gr.Dropdown(label="Category",
                                       choices=list(CATEGORIES.keys()),
                                       value="📊 Economics", scale=2)
                auto_interval = gr.Slider(5, 120, value=60, step=5,
                                          label="Every (min)", scale=1)
            with gr.Row():
                btn_start_auto = gr.Button("Start", variant="primary")
                btn_stop_auto = gr.Button("Stop", variant="stop")
                btn_refresh_log = gr.Button("Refresh Log")
            auto_status = gr.Markdown()
            auto_log_md = gr.Markdown("No log.")

        # Tab: static description of the tool.
        with gr.Tab("About"):
            gr.Markdown("""
# 🔬 Scientific Paper Discovery
### Version 7.4 — Intelligent Research Assistant

---

## 🧠 About This Tool

**Scientific Paper Discovery** is an AI-powered academic research assistant that enables researchers, students, and scientists to **discover, understand, and organize** scientific literature with unprecedented ease. It combines state-of-the-art language models with multi-source academic APIs to deliver a seamless research experience.

---

## ⚙️ Core Technologies

| Component | Technology | Role |
|---|---|---|
| 🤖 Language Model | **Llama 3.3 70B** via Groq API | Paper explanation, comparison & chat |
| 🔍 Semantic Search | **FAISS** + MiniLM-L12-v2 | Vector similarity search |
| 📡 Source 1 | **arXiv API** | Preprints across all sciences |
| 📚 Source 2 | **CrossRef API** | Peer-reviewed journal articles |
| 📊 Citations | **Semantic Scholar** (3-layer) | Real citation counts |
| 🎙️ Text-to-Speech | **gTTS** | Audio playback of explanations |
| 📄 PDF Export | **ReportLab** | Professional PDF generation |

---

## 🗂️ Feature Overview

| Tab | Feature | Description |
|---|---|---|
| 🔍 Browse | Paper Fetching | Fetch latest papers by topic & category |
| 🌐 Global Search | Title Search | Find any paper by exact title (relevance-sorted) |
| 📖 Explain | AI Explanation | Full structured explanation in Arabic or English |
| ⚖️ Compare | Paper Comparison | Side-by-side AI comparison of two papers |
| 💬 Chat | Research Chat | Ask questions about loaded papers |
| 🌐 Overview | Batch Summary | Academic overview of all loaded papers |
| 📊 Trends | Analytics | Citation, keyword & author trend charts |
| 📚 Bibliography | Citation Export | APA, IEEE, Chicago, BibTeX formats |
| ⭐ Favorites | Saved Papers | Bookmark & export favorite papers |
| 🔔 Auto-Fetch | Monitoring | Automatic periodic paper discovery |

---

## 🔎 Search Mode Guide

| Mode | Algorithm | Best For |
|---|---|---|
| Browse | `sortBy=submittedDate` | Discovering latest papers on a topic |
| 🌐 Global Search | `sortBy=relevance` + `ti:"..."` | Finding a specific paper by title |
| FAISS (internal) | Cosine similarity | Semantic search within loaded papers |

---

## 📌 Citation Badges

| Badge | Meaning |
|---|---|
| 🥇 | ≥ 1,000 citations — Highly influential |
| 🏆 | ≥ 100 citations — Well-cited |
| ⭐ | ≥ 10 citations — Notable |
| 📄 | < 10 citations — Recent or niche |
| · | 0 citations — New or unindexed |

---

*Built with ❤️ for the research community — v7.4*
""")

    # Outputs refreshed whenever the paper list changes: the results table,
    # the four paper dropdowns and the status bar.
    FETCH_OUT = [papers_table_md, paper_selector, paper_sel2, cmp_a, cmp_b, status_bar]

    def _filter_and_sync(year_from, year_to, cit_min, cit_max, sort_by):
        """Run gr_filter_papers and send its dropdown update to every selector.

        Dropdown labels are numbered by position and resolved against the
        filtered list, so all four dropdowns must show the same renumbered
        choices after a filter; otherwise a stale label picks another paper.
        """
        table_md, selector_update, status = gr_filter_papers(
            year_from, year_to, cit_min, cit_max, sort_by)
        return (table_md, selector_update, selector_update,
                selector_update, selector_update, status)

    # Browse tab: fetch, filter and keep the other dropdowns in step.
    btn_fetch.click(gr_fetch,
                    inputs=[t_query, t_category, t_max, t_days, t_source],
                    outputs=FETCH_OUT)
    btn_filter.click(_filter_and_sync,
                     inputs=[f_year_from, f_year_to, f_cit_min, f_cit_max, f_sort],
                     outputs=FETCH_OUT)
    paper_selector.change(lambda x: [gr.update(value=x)] * 3,
                          inputs=[paper_selector],
                          outputs=[paper_sel2, cmp_a, cmp_b])

    # Semantic search inside the loaded papers (button and Enter key).
    btn_search_in.click(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])
    search_in_box.submit(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])

    # Global search (button and Enter key).
    btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
    gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])

    # Explain tab actions.
    btn_explain.click(gr_explain, inputs=[paper_sel2, lang_exp], outputs=[explanation_out])
    btn_fav.click(gr_save_fav, inputs=[paper_sel2], outputs=[fav_status])
    btn_audio.click(gr_audio, inputs=[explanation_out, lang_exp], outputs=[audio_out])
    btn_export_pdf.click(gr_export_pdf,
                         inputs=[explanation_out, paper_sel2],
                         outputs=[pdf_out, pdf_status])

    # Compare, Overview, Trends and Bibliography tabs.
    btn_compare.click(gr_compare, inputs=[cmp_a, cmp_b, lang_cmp], outputs=[compare_out])
    btn_overview.click(gr_overview, inputs=[t_query, lang_ov], outputs=[overview_out])
    btn_trends.click(gr_trends, outputs=[trend_chart, trend_stats])
    btn_bib.click(gr_bib, inputs=[bib_style], outputs=[bib_out, bib_file])

    # Favorites tab.
    btn_show_fav.click(gr_show_favs, outputs=[favs_md])
    btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])

    # Auto-fetch controls.
    btn_start_auto.click(start_auto_fetch,
                         inputs=[auto_q, auto_cat, auto_interval],
                         outputs=[auto_status])
    btn_stop_auto.click(stop_auto_fetch, outputs=[auto_status])
    btn_refresh_log.click(get_auto_log, outputs=[auto_log_md])

    # Chat: send via button or Enter; Clear resets history and input box.
    btn_send.click(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    chat_in.submit(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    btn_clear.click(lambda: ([], ""), outputs=[chatbot_ui, chat_in])
|
|
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()