Ani14 committed
Commit f33b3b6 · verified · 1 Parent(s): 65eddf9

Update app.py

Files changed (1):
  1. app.py +103 -25

app.py CHANGED
@@ -11,6 +11,7 @@ from PIL import Image
 from io import BytesIO
 from fpdf import FPDF
 import base64
+import re

 # Load environment variables
 load_dotenv()
@@ -19,6 +20,9 @@ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 tavily = TavilyClient(api_key=TAVILY_API_KEY)

 # --- Helper Functions ---
+def remove_invalid_unicode(text):
+    return re.sub(r'[\ud800-\udfff]', '', text)
+
 def call_llm(messages, model="deepseek/deepseek-chat-v3-0324:free", max_tokens=3500, temperature=0.7):
     url = "https://openrouter.ai/api/v1/chat/completions"
     headers = {
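A quick illustration of what the new remove_invalid_unicode helper does (my sketch, not part of the commit; the sample input is hypothetical): lone UTF-16 surrogates (U+D800-U+DFFF) are invalid as standalone code points and break most text encoders, so the helper strips them before export.

    import re

    def remove_invalid_unicode(text):
        # Strip lone surrogate code points, which no codec can encode.
        return re.sub(r'[\ud800-\udfff]', '', text)

    # Hypothetical input: a lone surrogate smuggled in via bad decoding.
    broken = "report" + "\ud800" + " text"
    print(remove_invalid_unicode(broken))  # -> "report text"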
@@ -74,10 +78,6 @@ def get_semantic_papers(query):
         "url": p.get("url")
     } for p in papers]

-def get_images(topic):
-    response = tavily.image_search(query=topic, max_results=5)
-    return response.get("images", [])
-
 def check_plagiarism(text, topic):
     hits = []
     for r in get_sources(topic):
@@ -103,26 +103,18 @@ def merge_duplicates(entries):
     return unique

 def generate_pdf(text):
+    text = remove_invalid_unicode(text)
     pdf = FPDF()
     pdf.add_page()
     pdf.set_auto_page_break(auto=True, margin=15)
-    lines = text.split('\n')
-    for line in lines:
-        if line.startswith("# "):
-            pdf.set_font("Arial", style="B", size=16)
-            pdf.multi_cell(0, 10, line[2:])
-        elif line.startswith("## "):
-            pdf.set_font("Arial", style="B", size=14)
-            pdf.multi_cell(0, 10, line[3:])
-        else:
-            pdf.set_font("Arial", size=12)
-            pdf.multi_cell(0, 8, line)
-    pdf_bytes = pdf.output(dest='S').encode('latin-1')
-    pdf_output = BytesIO(pdf_bytes)
-    pdf_output.seek(0)
-    return pdf_output
+    pdf.set_font("Arial", size=12)
+    for line in text.split('\n'):
+        pdf.multi_cell(0, 10, line)
+    pdf_bytes = pdf.output(dest='S').encode('latin1')
+    return BytesIO(pdf_bytes)

 def generate_latex(text):
+    text = remove_invalid_unicode(text)
     latex = "\\documentclass{article}\n\\usepackage{hyperref}\n\\begin{document}\n"
     for line in text.split('\n'):
         latex += line.replace('_', '\\_') + "\\\\\n"
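Worth noting alongside this hunk: classic fpdf (1.x) is latin-1 only, so characters like '…' or '—' survive remove_invalid_unicode yet would still raise at the .encode('latin1') step. A minimal defensive variant (my sketch; generate_pdf_safe is a hypothetical name, not in app.py):

    from io import BytesIO
    from fpdf import FPDF

    def generate_pdf_safe(text):
        # Coerce the text into latin-1 up front, replacing anything
        # (emoji, smart quotes, surrogates) the codec cannot represent.
        text = text.encode('latin-1', errors='replace').decode('latin-1')
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        for line in text.split('\n'):
            pdf.multi_cell(0, 10, line)
        return BytesIO(pdf.output(dest='S').encode('latin-1'))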
@@ -132,11 +124,97 @@ def generate_latex(text):
 def generate_download_button(file, label, mime_type):
     b64 = base64.b64encode(file.read()).decode()
     return f"""
-    <a href=\"data:{mime_type};base64,{b64}\" download=\"{label}\">Download {label}</a>
+    <a href="data:{mime_type};base64,{b64}" download="{label}">
+        📥 Download {label}
+    </a>
     """

-def download_image_as_bytes(url):
-    response = requests.get(url)
-    if response.status_code == 200:
-        return BytesIO(response.content)
-    return None
+# --- Streamlit UI ---
+st.set_page_config("Deep Research Bot", layout="wide")
+
+with st.sidebar:
+    st.title("🧠 Deep Research Assistant")
+    topic = st.text_input("💡 Topic to research")
+    report_type = st.selectbox("📄 Type of report", [
+        "Summary - Short and fast (~2 min)",
+        "Detailed Report (~5 min)",
+        "Thorough Academic Research (~10 min)"
+    ])
+    tone = st.selectbox("🎯 Tone of the report", [
+        "Objective - Impartial and unbiased presentation of facts and findings",
+        "Persuasive - Advocating a specific point of view",
+        "Narrative - Storytelling tone for layperson readers"
+    ])
+    source_type = st.selectbox("🌐 Sources to include", ["Web Only", "Academic Only", "Hybrid"])
+    custom_domains = st.text_input("🔍 Query Domains (Optional)", placeholder="techcrunch.com, forbes.com")
+    research_button = st.button("Research")
+
+st.title("📑 Research Output")
+
+if research_button and topic:
+    try:
+        with st.status("🔍 Gathering data..."):
+            st.info("Fetching from sources...")
+
+            all_sources = []
+            citations = []
+
+            if source_type in ["Web Only", "Hybrid"]:
+                web_data = get_sources(topic, custom_domains)
+                for item in web_data:
+                    all_sources.append(item | {"source": "web"})
+
+            if source_type in ["Academic Only", "Hybrid"]:
+                arxiv_data = get_arxiv_papers(topic)
+                for item in arxiv_data:
+                    all_sources.append(item | {"source": "arxiv"})
+                semantic_data = get_semantic_papers(topic)
+                for item in semantic_data:
+                    all_sources.append(item | {"source": "semantic"})
+
+            merged = merge_duplicates(all_sources)
+            combined_text = ""
+            for m in merged:
+                combined_text += f"- [{m['title']}]({m['url']})\n> {m.get('snippet', m.get('summary', ''))[:300]}...\n\n"
+                citations.append(generate_apa_citation(m['title'], m['url'], m['source']))
+
+        with st.spinner("✍️ Synthesizing report..."):
+            prompt = f"""
+# Research Topic: {topic}
+Tone: {tone}
+Type: {report_type}
+Sources:
+{combined_text}
+Write the report in academic markdown with paragraphs (use bullet points only when necessary). Include:
+1. Introduction
+2. Research Gap
+3. Novel Insight
+4. Application
+5. Full Academic Writeup if Thorough Report
+"""
+            final_output = call_llm([{"role": "user", "content": prompt}])
+
+        st.markdown(f"### 📄 {report_type}")
+        st.markdown(final_output, unsafe_allow_html=True)
+
+        st.markdown("### 📚 Citations (APA Format)")
+        for cite in citations:
+            st.markdown(f"- {cite}")
+
+        if report_type == "Thorough Academic Research (~10 min)":
+            with st.spinner("📦 Preparing PDF and LaTeX..."):
+                pdf_file = generate_pdf(final_output)
+                latex_file = generate_latex(final_output)
+                st.markdown(generate_download_button(pdf_file, "Research_Report.pdf", "application/pdf"), unsafe_allow_html=True)
+                st.markdown(generate_download_button(latex_file, "Research_Report.tex", "application/x-latex"), unsafe_allow_html=True)
+
+        overlaps = check_plagiarism(final_output, topic)
+        if overlaps:
+            st.warning("⚠️ Potential overlaps detected:")
+            for hit in overlaps:
+                st.markdown(f"- [{hit['title']}]({hit['url']})")
+        else:
+            st.success("✅ No major overlaps found.")
+
+    except Exception as e:
+        st.error(f"Error: {e}")
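One compatibility footnote on the UI block above (my note, not from the commit): the item | {"source": "web"} pattern uses the dict union operator from PEP 584, which requires Python 3.9+. On older runtimes the equivalent spelling is:

    item = {"title": "Example", "url": "https://example.com"}  # stand-in record
    merged_item = {**item, "source": "web"}  # same result as item | {"source": "web"}
    print(merged_item)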