Ani14 committed
Commit f33b3b6 · verified · 1 Parent(s): 65eddf9

Update app.py

Files changed (1):
  1. app.py +103 -25

app.py CHANGED
@@ -11,6 +11,7 @@ from PIL import Image
 from io import BytesIO
 from fpdf import FPDF
 import base64
+import re

 # Load environment variables
 load_dotenv()
@@ -19,6 +20,9 @@ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 tavily = TavilyClient(api_key=TAVILY_API_KEY)

 # --- Helper Functions ---
+def remove_invalid_unicode(text):
+    return re.sub(r'[\ud800-\udfff]', '', text)
+
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=3500, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
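A quick illustration of what the new remove_invalid_unicode helper does (my sketch, not part of the commit; the sample input is hypothetical): lone UTF-16 surrogates (U+D800-U+DFFF) are invalid as standalone code points and break most text encoders, so the helper strips them before export.

    import re

    def remove_invalid_unicode(text):
        # Strip lone surrogate code points, which no codec can encode.
        return re.sub(r'[\ud800-\udfff]', '', text)

    # Hypothetical input: a lone surrogate smuggled in via bad decoding.
    broken = "report" + "\ud800" + " text"
    print(remove_invalid_unicode(broken))  # -> "report text"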
@@ -74,10 +78,6 @@ def get_semantic_papers(query):
         "url": p.get("url")
     } for p in papers]

-def get_images(topic):
-    response = tavily.image_search(query=topic, max_results=5)
-    return response.get("images", [])
-
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic):
@@ -103,26 +103,18 @@ def merge_duplicates(entries):
     return unique

 def generate_pdf(text):
+    text = remove_invalid_unicode(text)
     pdf = FPDF()
     pdf.add_page()
     pdf.set_auto_page_break(auto=True, margin=15)
-    lines = text.split('\n')
-    for line in lines:
-        if line.startswith("# "):
-            pdf.set_font("Arial", style="B", size=16)
-            pdf.multi_cell(0, 10, line[2:])
-        elif line.startswith("## "):
-            pdf.set_font("Arial", style="B", size=14)
-            pdf.multi_cell(0, 10, line[3:])
-        else:
-            pdf.set_font("Arial", size=12)
-            pdf.multi_cell(0, 8, line)
-    pdf_bytes = pdf.output(dest='S').encode('latin-1')
-    pdf_output = BytesIO(pdf_bytes)
-    pdf_output.seek(0)
-    return pdf_output
+    pdf.set_font("Arial", size=12)
+    for line in text.split('\n'):
+        pdf.multi_cell(0, 10, line)
+    pdf_bytes = pdf.output(dest='S').encode('latin1')
+    return BytesIO(pdf_bytes)

 def generate_latex(text):
+    text = remove_invalid_unicode(text)
     latex = "\\documentclass{article}\n\\usepackage{hyperref}\n\\begin{document}\n"
     for line in text.split('\n'):
         latex += line.replace('_', '\\_') + "\\\\\n"
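Worth noting alongside this hunk: classic fpdf (1.x) is latin-1 only, so characters like '…' or '—' survive remove_invalid_unicode yet would still raise at the .encode('latin1') step. A minimal defensive variant (my sketch; generate_pdf_safe is a hypothetical name, not in app.py):

    from io import BytesIO
    from fpdf import FPDF

    def generate_pdf_safe(text):
        # Coerce the text into latin-1 up front, replacing anything
        # (emoji, smart quotes, surrogates) the codec cannot represent.
        text = text.encode('latin-1', errors='replace').decode('latin-1')
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        for line in text.split('\n'):
            pdf.multi_cell(0, 10, line)
        return BytesIO(pdf.output(dest='S').encode('latin-1'))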
@@ -132,11 +124,97 @@ def generate_latex(text):
 def generate_download_button(file, label, mime_type):
     b64 = base64.b64encode(file.read()).decode()
     return f"""
-    <a href=\"data:{mime_type};base64,{b64}\" download=\"{label}\">Download {label}</a>
+    <a href="data:{mime_type};base64,{b64}" download="{label}">
+        📥 Download {label}
+    </a>
     """

-def download_image_as_bytes(url):
-    response = requests.get(url)
-    if response.status_code == 200:
-        return BytesIO(response.content)
-    return None
+# --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+
+with st.sidebar:
+    st.title("🧠 Deep Research Assistant")
+    topic = st.text_input("💡 Topic to research")
+    report_type = st.selectbox("📄 Type of report", [
+        "Summary - Short and fast (~2 min)",
+        "Detailed Report (~5 min)",
+        "Thorough Academic Research (~10 min)"
+    ])
+    tone = st.selectbox("🎯 Tone of the report", [
+        "Objective - Impartial and unbiased presentation of facts and findings",
+        "Persuasive - Advocating a specific point of view",
+        "Narrative - Storytelling tone for layperson readers"
+    ])
+    source_type = st.selectbox("🌐 Sources to include", ["Web Only", "Academic Only", "Hybrid"])
+    custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+    research_button = st.button("Research")
+
+st.title("📑 Research Output")
+
+if research_button and topic:
+    try:
+        with st.status("🔍 Gathering data..."):
+            st.info("Fetching from sources...")
+
+            all_sources = []
+            citations = []
+
+            if source_type in ["Web Only", "Hybrid"]:
+                web_data = get_sources(topic, custom_domains)
+                for item in web_data:
+                    all_sources.append(item | {"source": "web"})
+
+            if source_type in ["Academic Only", "Hybrid"]:
+                arxiv_data = get_arxiv_papers(topic)
+                for item in arxiv_data:
+                    all_sources.append(item | {"source": "arxiv"})
+                semantic_data = get_semantic_papers(topic)
+                for item in semantic_data:
+                    all_sources.append(item | {"source": "semantic"})
+
+            merged = merge_duplicates(all_sources)
+            combined_text = ""
+            for m in merged:
+                combined_text += f"- [{m['title']}]({m['url']})\n> {m.get('snippet', m.get('summary', ''))[:300]}...\n\n"
+                citations.append(generate_apa_citation(m['title'], m['url'], m['source']))
+
+        with st.spinner("✍️ Synthesizing report..."):
+            prompt = f"""
+# Research Topic: {topic}
+Tone: {tone}
+Type: {report_type}
+Sources:
+{combined_text}
+Write the report in academic markdown with paragraphs (use bullet points only when necessary). Include:
+1. Introduction
+2. Research Gap
+3. Novel Insight
+4. Application
+5. Full Academic Writeup if Thorough Report
+"""
+            final_output = call_llm([{"role": "user", "content": prompt}])
+
+        st.markdown(f"### 📄 {report_type}")
+        st.markdown(final_output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 Citations (APA Format)")
+        for cite in citations:
+            st.markdown(f"- {cite}")
+
+        if report_type == "Thorough Academic Research (~10 min)":
+            with st.spinner("📦 Preparing PDF and LaTeX..."):
+                pdf_file = generate_pdf(final_output)
+                latex_file = generate_latex(final_output)
+                st.markdown(generate_download_button(pdf_file, "Research_Report.pdf", "application/pdf"), unsafe_allow_html=True)
+                st.markdown(generate_download_button(latex_file, "Research_Report.tex", "application/x-latex"), unsafe_allow_html=True)
+
+        overlaps = check_plagiarism(final_output, topic)
+        if overlaps:
+            st.warning("⚠️ Potential overlaps detected:")
+            for hit in overlaps:
+                st.markdown(f"- [{hit['title']}]({hit['url']})")
+        else:
+            st.success("✅ No major overlaps found.")
+
+    except Exception as e:
+        st.error(f"Error: {e}")
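One compatibility footnote on the UI block above (my note, not from the commit): the item | {"source": "web"} pattern uses the dict union operator from PEP 584, which requires Python 3.9+. On older runtimes the equivalent spelling is:

    item = {"title": "Example", "url": "https://example.com"}  # stand-in record
    merged_item = {**item, "source": "web"}  # same result as item | {"source": "web"}
    print(merged_item)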