rdune71 committed
Commit 742b2a5 · 1 Parent(s): 0272eed
app.py CHANGED
@@ -1,1039 +1,81 @@
  import gradio as gr
- import requests
- import json
  import os
- import re
- import time
- import io
- from datetime import datetime
- from functools import lru_cache
- from requests.adapters import HTTPAdapter
- from urllib3.util.retry import Retry
-
- # Configuration
- BASE_URL = "https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/"
- HF_TOKEN = os.environ.get("HF_TOKEN")
- TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
-
- # Validate required environment variables
- if not HF_TOKEN:
-     raise ValueError("HF_TOKEN environment variable is required")
-
- # Get current date and time information
- CURRENT_DATE = datetime.now()
- DATE_INFO = CURRENT_DATE.strftime("%A, %B %d, %Y")
- TIME_INFO = CURRENT_DATE.strftime("%I:%M %p")
- FORMATTED_DATE_TIME = f"Current Date: {DATE_INFO}\nCurrent Time: {TIME_INFO}"
-
- # Initialize session with retry strategy
- session = requests.Session()
- retry_strategy = Retry(
-     total=3,
-     backoff_factor=1,
-     status_forcelist=[429, 500, 502, 503, 504],
- )
- adapter = HTTPAdapter(max_retries=retry_strategy)
- session.mount("http://", adapter)
- session.mount("https://", adapter)
-
- # Initialize Tavily client
- try:
-     from tavily import TavilyClient
-     tavily_client = TavilyClient(api_key=TAVILY_API_KEY) if TAVILY_API_KEY else None
-     TAVILY_AVAILABLE = True
- except ImportError:
-     tavily_client = None
-     TAVILY_AVAILABLE = False
-     print("Tavily not available: Please install tavily-python")
-
- # Import additional libraries for advanced features
- try:
-     import PyPDF2
-     PDF_SUPPORT = True
- except ImportError:
-     PDF_SUPPORT = False
-     print("PyPDF2 not available: Install for PDF processing support")
-
- try:
-     from bs4 import BeautifulSoup
-     WEB_SCRAPING = True
- except ImportError:
-     WEB_SCRAPING = False
-     print("BeautifulSoup not available: Install for web scraping support")
-
- try:
-     import feedparser
-     ACADEMIC_SEARCH = True
- except ImportError:
-     ACADEMIC_SEARCH = False
-     print("feedparser not available: Install for academic search support")
-
- # Rate limiter class
- class RateLimiter:
-     def __init__(self, max_calls=10, time_window=60):
-         self.max_calls = max_calls
-         self.time_window = time_window
-         self.calls = []
-
-     def is_allowed(self):
-         now = time.time()
-         self.calls = [call for call in self.calls if now - call < self.time_window]
-         if len(self.calls) < self.max_calls:
-             self.calls.append(now)
-             return True
-         return False
-
- rate_limiter = RateLimiter(max_calls=20, time_window=60)
-
- # Feedback storage
- feedback_data = []
-
- def get_preloaded_context():
-     """Get preloaded context information"""
-     context = f"""{FORMATTED_DATE_TIME} System Information: You are an AI assistant with access to current information through web search and academic research tools. Always provide sources for factual information. Available APIs: - Web Search (Tavily) - Academic Research (arXiv, Semantic Scholar) - PDF Document Analysis - Web Page Content Extraction Specialized Features: - Research-focused queries automatically processed - Academic paper analysis and summarization - Literature review generation - Citation management and bibliography creation Response Guidelines: 1. After completing your analysis, ALWAYS end with either: '[ANALYSIS COMPLETE]' - when you've fully addressed the query '[FURTHER RESEARCH NEEDED]' - when additional investigation would be beneficial 2. For search results, provide clear synthesis rather than just listing findings 3. Include specific citations and sources where applicable 4. Structure complex answers with clear sections when appropriate"""
-     return context
-
- def clean_query_for_current_info(query):
-     """Clean query to focus on current/fresh information"""
-     # Remove old dates
-     query = re.sub(r'\d{4}-\d{2}-\d{2}', '', query)
-     query = re.sub(r'\d{4}/\d{2}/\d{2}', '', query)
-     query = re.sub(r'\d{2}/\d{2}/\d{4}', '', query)
-
-     return query.strip()
-
- def determine_research_content_type(query):
-     """Determine if query requires research-focused search"""
-     research_keywords = [
-         'research', 'study', 'paper', 'academic', 'scientific',
-         'experiment', 'findings', 'discovery', 'theory',
-         'hypothesis', 'methodology', 'conclusion', 'literature',
-         'peer reviewed', 'scholarly', 'journal', 'publication',
-         'analyze', 'investigate', 'examine', 'review'
-     ]
-     return any(keyword in query.lower() for keyword in research_keywords)
-
- def is_news_related_query(query):
-     """Check if query is related to news"""
-     news_keywords = ['news', 'headline', 'breaking', 'latest', 'today', 'current event', 'update', 'report']
-     query_lower = query.lower()
-     return any(word in query_lower for word in news_keywords)
-
- def is_search_results_content(content):
-     """Check if content appears to be search results that need analysis"""
-     search_indicators = [
-         "[SEARCH RESULTS FOR",
-         "Source: Web Search",
-         "Tavily search error",
-         "arXiv Paper:",
-         "Semantic Scholar Paper:"
-     ]
-     return any(indicator in content for indicator in search_indicators) and len(content) > 200
-
- def is_looping_content(content):
-     """Detect if content is stuck in a loop"""
-     if len(content) > 2000:  # Too long, likely looping
-         return True
-     if content.count("let's do") > 15:  # Repeated phrases
-         return True
-     if content.count("search") > 40:  # Excessive repetition
-         return True
-     return False
-
- def validate_history(chat_history):
-     """Ensure proper alternation in chat_history"""
-     if not chat_history:
-         return []
-
-     validated = []
-     expected_role = "user"
-
-     for message in chat_history:
-         role = message.get("role")
-         content = message.get("content", "")
-
-         # Skip empty messages
-         if not content:
-             continue
-
-         # Only add messages that follow proper alternation
-         if role == expected_role:
-             validated.append(message)
-             expected_role = "assistant" if expected_role == "user" else "user"
-         elif role == "system" and len(validated) == 0:
-             # Allow system message at start
-             validated.append(message)
-
-     return validated
-
- def convert_history_format(internal_history):
-     """Convert internal dict format to Gradio chatbot format"""
-     gradio_history = []
-     for msg in internal_history:
-         if isinstance(msg, dict):
-             gradio_history.append([msg.get("role", "unknown"), msg.get("content", "")])
-         else:
-             gradio_history.append(msg)
-     return gradio_history
-
- def truncate_history(messages, max_tokens=4000):
-     """Truncate conversation history to prevent context overflow"""
-     if not messages:
-         return []
-
-     # Simplified token estimation (4 chars ~ 1 token)
-     estimated_tokens = sum(len(msg.get("content", "")) for msg in messages) // 4
-
-     if estimated_tokens <= max_tokens:
-         return messages
-
-     # Truncate older messages
-     truncated = []
-     current_tokens = 0
-
-     # Keep system message if present
-     if messages and messages[0].get("role") == "system":
-         truncated.append(messages[0])
-         messages = messages[1:]
-
-     # Add recent messages up to token limit
-     for message in reversed(messages):
-         content = message.get("content", "")
-         message_tokens = len(content) // 4
-
-         if current_tokens + message_tokens > max_tokens:
-             break
-
-         truncated.insert(0, message)
-         current_tokens += message_tokens
-
-     return truncated
-
- def manage_conversation_memory(messages, max_turns=10):
-     """Keep conversation focused and prevent context overflow"""
-     if len(messages) > max_turns * 2:  # *2 for user/assistant pairs
-         # Keep system message + last N turns
-         system_msg = [msg for msg in messages if msg.get("role") == "system"]
-         recent_messages = messages[-(max_turns * 2):]
-         return system_msg + recent_messages if system_msg else recent_messages
-     return messages
-
- def tavily_search(query):
-     """Perform search using Tavily"""
-     if not TAVILY_AVAILABLE or not tavily_client:
-         return "Web search not available."
-
-     try:
-         # Clean query for current info
-         clean_query = clean_query_for_current_info(query)
-
-         if not clean_query:
-             return "No valid search query provided."
-
-         response = tavily_client.search(
-             clean_query,
-             search_depth="advanced",
-             topic="general",
-             max_results=5
-         )
-
-         results = []
-         for result in response.get("results", [])[:5]:
-             title = result.get("title", "")
-             content = result.get("content", "")
-             url = result.get("url", "")
-             if title and content:
-                 results.append(f"{title}: {content} (Source: {url})")
-             elif content:
-                 results.append(f"{content} (Source: {url})")
-
-         if results:
-             return "\n\n".join(results)
-         else:
-             return "No relevant information found."
-
-     except Exception as e:
-         return f"Tavily search error: {str(e)}"
-
- def download_and_extract_pdf(url):
-     """Download PDF and extract text content"""
-     if not PDF_SUPPORT:
-         return "PDF processing not available. Please install PyPDF2."
-
-     try:
-         # Download PDF
-         response = session.get(url, timeout=30)
-         response.raise_for_status()
-
-         # Extract text from PDF
-         pdf_file = io.BytesIO(response.content)
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
-
-         text_content = []
-         for page_num, page in enumerate(pdf_reader.pages):
-             if page_num < 15:  # Limit to first 15 pages
-                 text_content.append(page.extract_text())
-             else:
-                 break
-
-         full_text = "\n".join(text_content)
-         return f"PDF CONTENT EXTRACTED FROM {url}:\n{full_text[:4000]}..."  # Limit size
-
-     except Exception as e:
-         return f"PDF extraction error: {str(e)}"
-
- def scrape_web_page(url):
-     """Scrape and process full web pages"""
-     if not WEB_SCRAPING:
-         return "Web scraping not available. Please install beautifulsoup4."
-
-     try:
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         }
-
-         response = session.get(url, headers=headers, timeout=15)
-         soup = BeautifulSoup(response.content, 'html.parser')
-
-         # Remove script and style elements
-         for script in soup(["script", "style", "nav", "footer", "aside"]):
-             script.decompose()
-
-         # Extract main content
-         title = soup.find('title').get_text().strip() if soup.find('title') else "No Title"
-
-         # Try to find main content area
-         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-         if main_content:
-             paragraphs = main_content.find_all(['p', 'h1', 'h2', 'h3'])
-         else:
-             paragraphs = soup.find_all('p')
-
-         content = ' '.join([p.get_text().strip() for p in paragraphs[:30] if p.get_text().strip()])
-
-         return f"WEB PAGE CONTENT FROM {url}:\nTitle: {title}\nContent: {content[:3000]}..."  # Limit content size
-
-     except Exception as e:
-         return f"Error scraping page: {str(e)}"
-
- def arxiv_search(query):
-     """Search and process academic papers from arXiv"""
-     if not ACADEMIC_SEARCH:
-         return "Academic search not available. Please install feedparser."
-
-     try:
-         # Search arXiv
-         search_url = f"http://export.arxiv.org/api/query?search_query=all:{requests.utils.quote(query)}&max_results=3&sortBy=relevance&sortOrder=descending"
-         feed = feedparser.parse(search_url)
-
-         results = []
-         for entry in feed.entries[:3]:
-             title = entry.title
-             summary = entry.summary
-             authors = ", ".join([author.name for author in entry.authors[:3]]) if entry.authors else "Unknown Authors"
-             published = entry.published if hasattr(entry, 'published') else "Unknown Date"
-             pdf_url = entry.links[1].href if len(entry.links) > 1 and entry.links[1].type == 'application/pdf' else "No PDF link"
-
-             result = f"arXiv Paper:\nTitle: {title}\nAuthors: {authors}\nPublished: {published}\nAbstract: {summary}"
-             if pdf_url and pdf_url != "No PDF link":
-                 result += f"\nPDF URL: {pdf_url}"
-             results.append(result)
-
-         if results:
-             return "\n\n---\n\n".join(results)
-         else:
-             return "No arXiv papers found for this query."
-
-     except Exception as e:
-         return f"arXiv search error: {str(e)}"
-
- def semantic_scholar_search(query):
-     """Search academic papers using Semantic Scholar API"""
      try:
-         api_url = "https://api.semanticscholar.org/graph/v1/paper/search"
-         params = {
-             "query": query,
-             "limit": 3,
-             "fields": "title,abstract,authors,year,venue,url,citationCount,referenceCount"
-         }

-         response = session.get(api_url, params=params, timeout=15)
-         if response.status_code == 200:
-             data = response.json()
-         else:
-             return f"Semantic Scholar API error: {response.status_code}"

-         results = []
-         for paper in data.get("data", [])[:3]:
-             title = paper.get("title", "")
-             abstract = paper.get("abstract", "")
-             authors = ", ".join([author.get("name", "") for author in paper.get("authors", [])[:3]])
-             year = paper.get("year", "")
-             venue = paper.get("venue", "")
-             url = paper.get("url", "")
-             citations = paper.get("citationCount", 0)
-             references = paper.get("referenceCount", 0)
-
-             result = f"Semantic Scholar Paper:\nTitle: {title}\nAuthors: {authors}\nYear: {year}\nVenue: {venue}\nCitations: {citations}\nReferences: {references}\nAbstract: {abstract[:500]}..."
-             if url:
-                 result += f"\nURL: {url}"
-             results.append(result)

-         if results:
-             return "\n\n---\n\n".join(results)
-         else:
-             return "No Semantic Scholar papers found for this query."
-
-     except Exception as e:
-         return f"Semantic Scholar search error: {str(e)}"
-
- def comprehensive_research(query):
-     """Aggregate research from multiple academic sources"""
-     results = []
-
-     # Add search header
-     results.append(f"COMPREHENSIVE RESEARCH RESULTS FOR: '{query}'\n" + "="*50)
-
-     # Academic databases
-     if TAVILY_AVAILABLE and tavily_client:
-         tavily_result = tavily_search(query)
-         results.append(f"TAVILY ACADEMIC SEARCH RESULTS:\n{tavily_result}")
-
-     # arXiv for academic papers
-     arxiv_result = arxiv_search(query)
-     if "error" not in arxiv_result.lower():
-         results.append(f"ARXIV ACADEMIC PAPERS:\n{arxiv_result}")
-
-     # Semantic Scholar
-     semantic_result = semantic_scholar_search(query)
-     if "error" not in semantic_result.lower():
-         results.append(f"SEMANTIC SCHOLAR RESULTS:\n{semantic_result}")
-
-     # Generate bibliography
-     combined_results = "\n\n---\n\n".join(results)
-     bibliography = generate_bibliography(combined_results)
-     results.append(f"BIBLIOGRAPHY:\n{bibliography}")
-
-     return "\n\n---\n\n".join(results)
-
- def analyze_search_results(query, search_results):
-     """Create a prompt for the model to analyze search results"""
-     analysis_prompt = f"""Based on the search results below, please answer the original question: "{query}" Search Results: {search_results} Please provide a clear, concise answer based on these sources. Include specific names, facts, and cite the sources where possible. Do not mention that you are analyzing search results - just provide the answer directly. Structure your response thoughtfully and when you complete your analysis, please explicitly state '[ANALYSIS COMPLETE]' at the end if you have fully addressed the query and have no further input. If additional research or clarification would be beneficial, please state '[FURTHER RESEARCH NEEDED]'."""
-
-     return analysis_prompt
-
- def generate_bibliography(search_results):
-     """Generate proper bibliography from research results"""
-     # Simple bibliography generation (can be enhanced)
-     citations = []
-     lines = search_results.split('\n')
-
-     current_citation = {}
-     for line in lines:
-         if line.startswith("Title:"):
-             if current_citation:
-                 citations.append(current_citation)
-             current_citation = {"title": line[7:].strip()}
-         elif line.startswith("Authors:") and current_citation:
-             current_citation["authors"] = line[9:].strip()
-         elif line.startswith("Year:") and current_citation:
-             current_citation["year"] = line[6:].strip()
-         elif line.startswith("URL:") and current_citation:
-             current_citation["url"] = line[5:].strip()
-
-     if current_citation:
-         citations.append(current_citation)
-
-     # Format citations in APA style
-     formatted_citations = []
-     for i, citation in enumerate(citations, 1):
-         authors = citation.get("authors", "Unknown Author")
-         title = citation.get("title", "Unknown Title")
-         year = citation.get("year", "N.d.")
-         url = citation.get("url", "")

-         formatted = f"{i}. {authors} ({year}). {title}. Retrieved from {url}"
-         formatted_citations.append(formatted)
-
-     return "\n".join(formatted_citations) if formatted_citations else "No citations found."
-
- def generate_literature_review(topic, search_results):
-     """Generate structured literature review from search results"""
-     prompt = f"""Based on the following research on '{topic}', create a structured literature review: {search_results} Please organize your response as follows: 1. INTRODUCTION: Brief overview of the topic 2. KEY FINDINGS: Major discoveries and insights from the research 3. METHODOLOGIES: Common research approaches used 4. LIMITATIONS: Identified gaps or limitations in current research 5. FUTURE DIRECTIONS: Suggested areas for future investigation 6. CONCLUSION: Summary of the current state of research Format your response clearly with these section headings. When you complete your analysis, please explicitly state '[ANALYSIS COMPLETE]' at the end."""
-
-     return prompt
-
- def generate_follow_up_questions(last_response):
-     """Generate 3-5 relevant follow-up questions"""
-     if not last_response:
-         return []
-
-     # Simple heuristic-based questions
-     question_words = ["What", "How", "Why", "When", "Where", "Who"]
-     topics = ["related", "similar", "detailed", "practical"]
-
-     # Extract key topics from response (simplified)
-     words = last_response.split()[:20]  # First 20 words
-     key_topics = [word for word in words if len(word) > 4][:3]  # Simple filtering
-
-     questions = []
-     for word in question_words[:3]:  # Limit to 3
-         if key_topics:
-             topic = key_topics[0] if key_topics else "this"
-             questions.append(f"{word} about {topic}?")
-
-     return questions[:3]  # Return max 3 questions
-
- def check_analysis_status(content):
-     """Check if the AI has indicated completion status"""
-     # Check for explicit completion markers first
-     if "[ANALYSIS COMPLETE]" in content:
-         return "✅ Analysis complete - AI has finished reviewing and has no further input."
-     elif "[FURTHER RESEARCH NEEDED]" in content:
-         return "🔍 Further research needed - AI suggests additional investigation would be beneficial."
-
-     # Check if this is search results being displayed (and needs analysis)
-     elif ("[SEARCH RESULTS FOR" in content or "Source: Web Search" in content) and len(content) > 200:
-         return "📊 Search results retrieved - AUTO-ANALYSIS TRIGGERED"
-
-     # Check for comprehensive research results
-     elif "COMPREHENSIVE RESEARCH RESULTS FOR" in content:
-         return "📚 Comprehensive research completed - detailed findings provided."
-
-     # Check if this appears to be a final answer/response
-     elif any(phrase in content.lower() for phrase in [
-         "in conclusion", "to summarize", "in summary",
-         "overall", "therefore", "thus", "in closing"
-     ]):
-         return "✅ AI appears to be concluding its response."
-
-     # Check if this is bibliographic content
-     elif "BIBLIOGRAPHY:" in content or "REFERENCES:" in content:
-         return "📖 Bibliography generated - research sources compiled."
-
-     # Check if this is URL analysis results
-     elif "PDF CONTENT EXTRACTED" in content or "WEB PAGE CONTENT" in content:
-         return "📄 Document analysis complete - content extracted and ready for review."
-
-     # Check for literature review generation
-     elif "LITERATURE REVIEW" in content and any(header in content for header in [
-         "INTRODUCTION", "KEY FINDINGS", "METHODOLOGIES",
-         "LIMITATIONS", "FUTURE DIRECTIONS", "CONCLUSION"
-     ]):
-         return "📑 Literature review structured - comprehensive analysis provided."
-
-     # Default status for ongoing processing
-     else:
-         # If content is substantial and appears analytical, assume it's progressing toward completion
-         if len(content) > 200 and not content.startswith("[SEARCH RESULTS"):
-             return "🧠 Analysis in progress - AI is formulating detailed response."
-         elif content.startswith("[SEARCH RESULTS"):
-             return "📊 Search results displayed - awaiting analysis"
-         else:
-             return "⏳ Processing - AI is working on your request."
-
- def generate_with_streaming(messages, model, max_tokens=8192, temperature=0.7, top_p=0.9):
-     """Generate text with streaming"""
-     headers = {
-         "Authorization": f"Bearer {HF_TOKEN}",
-         "Content-Type": "application/json"
-     }
-
-     # Validate history to prevent errors
-     validated_messages = validate_history(messages)
-
-     payload = {
-         "model": model,
-         "messages": validated_messages,
-         "max_tokens": max_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "stream": True
-     }
-
-     start_time = time.time()
-     try:
-         response = session.post(
-             f"{BASE_URL}chat/completions",
-             headers=headers,
-             json=payload,
-             timeout=300,
-             stream=True
-         )

-         if response.status_code == 200:
-             full_response = ""
-             for line in response.iter_lines():
-                 if line:
-                     decoded_line = line.decode('utf-8')
-                     if decoded_line.startswith('data: '):
-                         data = decoded_line[6:]
-                         if data != '[DONE]':
-                             try:
-                                 json_data = json.loads(data)
-                                 if 'choices' in json_data and len(json_data['choices']) > 0:
-                                     delta = json_data['choices'][0].get('delta', {})
-                                     content = delta.get('content', '')
-                                     if content:
-                                         full_response += content
-                                         yield full_response
-                             except:
-                                 continue
-         else:
-             yield f"Error: {response.status_code} - {response.text}"

      except Exception as e:
-         yield f"Connection error: {str(e)}"
-     finally:
-         end_time = time.time()
-         # Track usage (simplified)
-         track_usage("user123", str(messages[-1]) if messages else "",
-                     end_time - start_time, len(str(messages)))
-
- def format_code_blocks(text):
-     """Detect and format code blocks with syntax highlighting"""
-     import re
-     # Simple pattern to detect code blocks
-     pattern = r'```(\w+)?\n(.*?)```'
-     # Replace with HTML formatted code (simplified)
-     formatted = re.sub(pattern, r'<pre><code class="language-\1">\2</code></pre>', text, flags=re.DOTALL)
-     return formatted
-
- def extract_and_format_citations(search_results):
-     """Extract sources and create clickable citations"""
-     # Simple citation extraction (can be enhanced)
-     citations = []
-     if "Source:" in search_results:
-         lines = search_results.split('\n')
-         for line in lines:
-             if "http" in line:
-                 citations.append(line.strip())
-     return citations
-
- def track_usage(user_id, query, response_time, tokens_used):
-     """Track usage metrics for improvement"""
-     metrics = {
-         "timestamp": datetime.now().isoformat(),
-         "user_id": user_id or "anonymous",
-         "query_length": len(query),
-         "response_time": response_time,
-         "tokens_used": tokens_used
-     }
-     # In a real app, you'd store this in a database
-     print(f"Usage tracked: {metrics}")
-     return metrics
-
- def collect_feedback(feedback, query, response):
-     """Collect user feedback for model improvement"""
-     feedback_entry = {
-         "timestamp": datetime.now().isoformat(),
-         "feedback": feedback,
-         "query": query,
-         "response": response[:100] + "..." if len(response) > 100 else response
-     }
-     feedback_data.append(feedback_entry)
-     print(f"Feedback collected: {feedback_entry}")
-     return f"Thank you for your feedback: {feedback}"
-
- @lru_cache(maxsize=100)
- def cached_search(query):
-     """Cache frequent searches"""
-     return tavily_search(query)
-
- def handle_api_failure(error_type, fallback_strategy="retry"):
-     """Handle different types of API failures gracefully"""
-     # Simplified error handling
-     return f"API Error: {error_type}. Strategy: {fallback_strategy}"
-
- def export_conversation(chat_history, export_format):
-     """Export conversation in various formats"""
-     if not chat_history:
-         return "No conversation to export"
-
-     if export_format == "JSON":
-         # Filter out system messages for export
-         exportable_history = [msg for msg in chat_history if msg[0] != "system"]
-         return json.dumps(exportable_history, indent=2, ensure_ascii=False)
-     elif export_format == "Text":
-         lines = []
-         for msg in chat_history:
-             if msg[0] != "system":  # Skip system messages
-                 lines.append(f"{msg[0].upper()}: {msg[1]}")
-         return "\n".join(lines)
-     return "Invalid format"
-
- def process_url_content(url):
-     """Intelligent URL content processing"""
-     if not url:
-         return "Please enter a URL"
-
-     if not url.startswith(('http://', 'https://')):
-         return "Invalid URL format. Please include http:// or https://"
-
-     # Determine content type and process accordingly
-     if url.lower().endswith('.pdf'):
-         return download_and_extract_pdf(url)
-     elif any(domain in url.lower() for domain in ['arxiv.org']):
-         # Extract arXiv ID and search
-         import re
-         arxiv_match = re.search(r'arxiv\.org/abs/(\d+\.\d+)', url)
-         if arxiv_match:
-             arxiv_id = arxiv_match.group(1)
-             return arxiv_search(arxiv_id)
-         else:
-             return scrape_web_page(url)
-     else:
-         return scrape_web_page(url)
-
- def respond(message, chat_history, model_choice, max_tokens, temperature, top_p,
-             creativity, precision, system_prompt, use_web_search, research_mode, theme):
-     """Main response handler with conversation history"""
-     if not message:
-         yield "", chat_history, "", gr.update(choices=[], visible=False), "", "💬 Ready for your query"
-         return
-
-     # Rate limiting check
-     if not rate_limiter.is_allowed():
-         yield "", chat_history + [["assistant", "Rate limit exceeded. Please wait a moment before sending another message."]], "", "", "", "⏰ Rate limit active"
-         return
-
-     # Convert Gradio format to internal format
-     internal_history = []
-     for msg in chat_history:
-         if len(msg) >= 2:
-             internal_history.append({"role": msg[0], "content": msg[1]})
-
-     # Add custom system prompt or preloaded context
-     if not internal_history:
-         if system_prompt:
-             system_message = {"role": "system", "content": system_prompt}
-         else:
-             preloaded_context = get_preloaded_context()
-             system_message = {"role": "system", "content": preloaded_context}
-         internal_history = [system_message] + internal_history
-
-     # Check if the message contains search results that need analysis
-     if is_search_results_content(message):
-         # This is search results that need analysis
-         analysis_status = "🧠 Auto-analyzing search results..."
-
-         # Extract the original query and search results
-         lines = message.split('\n')
-         if len(lines) > 2:
-             # Get the query from the first line
-             first_line = lines[0]
-             if "'" in first_line:
-                 query = first_line.split("'")[1]
-             else:
-                 query = message[:100]  # Fallback
-         else:
-             query = "summary request"
-
-         # Perform analysis
-         analysis_prompt = analyze_search_results(query, message)
-
-         # Create history with analysis prompt
-         analysis_history = internal_history + [{"role": "user", "content": analysis_prompt}]
-
-         # Generate analyzed response
-         full_response = ""
-         bibliography = generate_bibliography(message) if "COMPREHENSIVE RESEARCH" in message else ""
-
-         for chunk in generate_with_streaming(analysis_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-             if isinstance(chunk, str):
-                 full_response = chunk
-                 analysis_status = check_analysis_status(full_response)
-                 # Generate follow-up questions
-                 follow_ups = generate_follow_up_questions(full_response)
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [{"role": "user", "content": message}, {"role": "assistant", "content": full_response}])
-                 yield "", gradio_history, message, gr.update(choices=follow_ups, visible=True if follow_ups else False), bibliography, analysis_status
-         return
-
-     # Check if we should perform a search
-     user_message = {"role": "user", "content": message}
-
-     # Always perform search if web search is enabled
-     if use_web_search:
-         analysis_status = "🔍 Performing search..."
-         # Use enhanced research search for research queries or when research mode is enabled
-         if research_mode or determine_research_content_type(message):
-             search_result = comprehensive_research(message)
-             bibliography = generate_bibliography(search_result)
-             analysis_status = "📚 Comprehensive research completed"
-         else:
-             search_result = tavily_search(message)
-             bibliography = ""
-             analysis_status = "📊 Search results retrieved"
-
-         # AUTOMATICALLY analyze search results for ANY search
-         analysis_status = "🧠 Auto-analyzing search results..."
-         # Extract the original query for analysis
-         lines = search_result.split('\n')
-         if len(lines) > 2:
-             first_line = lines[0]
-             if "'" in first_line:
-                 query = first_line.split("'")[1]
-             else:
-                 query = message
-         else:
-             query = message
-
-         # Perform analysis of the search results
-         analysis_prompt = analyze_search_results(query, search_result)
-
-         # Create history with analysis prompt
-         analysis_history = internal_history + [user_message, {"role": "assistant", "content": search_result}, {"role": "user", "content": analysis_prompt}]
-
-         # Generate analyzed response
-         full_response = ""
-         search_results_output = search_result  # Store raw search results
-
-         for chunk in generate_with_streaming(analysis_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-             if isinstance(chunk, str):
-                 full_response = chunk
-                 analysis_status = check_analysis_status(full_response)
-                 # Generate follow-up questions
-                 follow_ups = generate_follow_up_questions(full_response)
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": search_result}, {"role": "assistant", "content": full_response}])
-                 # Stream both the analysis and raw search results
-                 yield "", gradio_history, search_results_output, gr.update(choices=follow_ups, visible=True if follow_ups else False), bibliography, analysis_status
-         return
-
-     # Normal flow - generate response
-     current_history = internal_history + [user_message]
-     full_response = ""
-     analysis_status = "💭 Generating response..."
-
-     for chunk in generate_with_streaming(current_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-         if isinstance(chunk, str):
-             full_response = chunk
-             analysis_status = check_analysis_status(full_response)
-             # Break infinite loops
-             if is_looping_content(full_response):
-                 # Force search instead of looping
-                 search_result = tavily_search(message)
-                 follow_ups = generate_follow_up_questions(search_result)
-                 analysis_status = "⚠️ Loop detected - performing search instead"
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": f"[LOOP DETECTED - PERFORMING SEARCH]\n{search_result}"}])
-                 yield "", gradio_history, search_result, gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-                 return
-             # Stream the response
-             follow_ups = generate_follow_up_questions(full_response)
-             # Convert back to Gradio format
-             gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": full_response}])
-             yield "", gradio_history, "", gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-
-     # Check for tool calls after completion or break loops
-     if is_looping_content(full_response):
-         # Force search for looping content
-         search_result = tavily_search(message)
-         follow_ups = generate_follow_up_questions(search_result)
-         analysis_status = "⚠️ Loop detected - performing search instead"
-         # Convert back to Gradio format
-         gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": f"[LOOP DETECTED - PERFORMING SEARCH]\n{search_result}"}])
-         yield "", gradio_history, search_result, gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-         return
-
-     # Normal completion
-     follow_ups = generate_follow_up_questions(full_response)
-     analysis_status = check_analysis_status(full_response)
-     # Convert back to Gradio format
-     gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": full_response}])
-     yield "", gradio_history, "", gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-
- def apply_theme(theme):
-     """Apply theme-specific CSS"""
-     if theme == "Dark":
-         return """
-         <style>
-         body { background-color: #1a1a1a; color: #ffffff; }
-         .message { background-color: #2d2d2d; }
-         .dark-mode { background-color: #1a1a1a; color: #ffffff; }
-         .analysis-complete { color: #4CAF50; font-weight: bold; }
-         .further-research { color: #FF9800; font-weight: bold; }
-         .in-progress { color: #2196F3; font-weight: bold; }
-         .search-results { color: #9C27B0; font-weight: bold; }
-         .processing { color: #00BCD4; font-weight: bold; }
-         .ready { color: #8BC34A; font-weight: bold; }
-         .warning { color: #FF5722; font-weight: bold; }
-         .document-analysis { color: #009688; font-weight: bold; }
-         .literature-review { color: #795548; font-weight: bold; }
-         .bibliography { color: #607D8B; font-weight: bold; }
-         </style>
-         """
-     else:
-         return """
-         <style>
-         body { background-color: #ffffff; color: #000000; }
-         .message { background-color: #f0f0f0; }
-         .light-mode { background-color: #ffffff; color: #000000; }
-         .analysis-complete { color: #2E7D32; font-weight: bold; }
-         .further-research { color: #EF6C00; font-weight: bold; }
-         .in-progress { color: #1565C0; font-weight: bold; }
-         .search-results { color: #7B1FA2; font-weight: bold; }
-         .processing { color: #006064; font-weight: bold; }
-         .ready { color: #558B2F; font-weight: bold; }
-         .warning { color: #D84315; font-weight: bold; }
-         .document-analysis { color: #00796B; font-weight: bold; }
-         .literature-review { color: #5D4037; font-weight: bold; }
-         .bibliography { color: #455A64; font-weight: bold; }
-         </style>
-         """
-
- # Gradio Interface
- with gr.Blocks(title="GPT-OSS Research Assistant") as demo:
-     gr.Markdown("# 🎓 GPT-OSS Research Assistant")
-     gr.Markdown(f"Advanced AI assistant with academic research capabilities\n\n**Current Date/Time**: {FORMATTED_DATE_TIME}")
-
-     # Theme CSS
-     theme_css = gr.HTML()
-
-     with gr.Tab("Chat"):
-         with gr.Row():
-             chatbot = gr.Chatbot(height=500, label="Conversation")
-
-         with gr.Row():
-             msg = gr.Textbox(label="Message", placeholder="Ask anything...", scale=9)
-             submit = gr.Button("Send", scale=1)
-
-         with gr.Row():
-             clear = gr.Button("Clear")
-             theme_toggle = gr.Radio(choices=["Light", "Dark"], value="Light", label="Theme")
-             feedback_radio = gr.Radio(
-                 choices=["👍 Helpful", "👎 Not Helpful", "🔄 Needs Improvement"],
-                 label="Rate Last Response"
-             )
-
-         with gr.Row():
-             with gr.Column():
-                 follow_up_questions = gr.Radio(
-                     choices=[],
-                     label="Suggested Follow-up Questions",
-                     visible=False
-                 )
-             with gr.Column():
-                 with gr.Row():
-                     export_format = gr.Radio(choices=["JSON", "Text"], value="JSON", label="Export Format")
-                     export_btn = gr.Button("Export Conversation")
-                 export_output = gr.File(label="Download")
-
-         with gr.Accordion("Search Results", open=False):
-             search_results = gr.Textbox(label="Raw Search Data", interactive=False, max_lines=10)
-
-         with gr.Accordion("Bibliography", open=False):
-             bibliography_output = gr.Textbox(label="Generated Bibliography", interactive=False, max_lines=10)
-
-         with gr.Accordion("Analysis Status", open=False):
-             analysis_status_output = gr.Textbox(label="AI Analysis Status", interactive=False, max_lines=3)
-
-     with gr.Tab("Research Tools"):
-         gr.Markdown("## 🔍 Advanced Research Tools")
-
-         with gr.Row():
-             url_input = gr.Textbox(label="Process URL Content", placeholder="Enter URL to analyze (web page or PDF)...")
-             url_button = gr.Button("Analyze URL")
-
-         url_output = gr.Textbox(label="URL Analysis Results", interactive=False, max_lines=20)
-
-         with gr.Row():
-             literature_topic = gr.Textbox(label="Generate Literature Review", placeholder="Enter research topic...")
-             lit_review_button = gr.Button("Generate Review")
-
-         lit_review_output = gr.Textbox(label="Literature Review", interactive=False, max_lines=20)
-
-     with gr.Accordion("Settings", open=False):
-         with gr.Row():
-             model_choice = gr.Dropdown(
-                 choices=[
-                     "DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
-                     "other-model-variants"
-                 ],
-                 value="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
-                 label="Model"
              )
-
-         with gr.Row():
-             max_tokens = gr.Slider(50, 8192, value=8192, label="Max Tokens")
-             temperature = gr.Slider(0.1, 1.0, value=0.7, label="Base Temperature")
-             top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top P")
-
-         with gr.Row():
-             creativity = gr.Slider(0.1, 1.0, value=0.7, label="Creativity")
-             precision = gr.Slider(0.1, 1.0, value=0.9, label="Precision")
-
-         system_prompt = gr.Textbox(
-             label="System Prompt",
-             value="",
-             placeholder="Enter custom system prompt...",
-             max_lines=3
-         )
-
-         with gr.Row():
-             use_web_search = gr.Checkbox(label="Enable Web Search", value=True)
-             research_mode = gr.Checkbox(label="Enable Research Mode (for academic queries)", value=False)
-
-     # Event handling
-     submit_event = submit.click(
-         respond,
-         [msg, chatbot, model_choice, max_tokens, temperature, top_p, creativity, precision, system_prompt, use_web_search, research_mode, theme_toggle],
-         [msg, chatbot, search_results, follow_up_questions, bibliography_output, analysis_status_output],
-         queue=True
-     )
-
-     msg_event = msg.submit(
-         respond,
-         [msg, chatbot, model_choice, max_tokens, temperature, top_p, creativity, precision, system_prompt, use_web_search, research_mode, theme_toggle],
-         [msg, chatbot, search_results, follow_up_questions, bibliography_output, analysis_status_output],
-         queue=True
-     )
-
-     clear.click(lambda: None, None, chatbot, queue=False)
-
-     theme_toggle.change(
-         apply_theme,
-         [theme_toggle],
-         [theme_css]
-     )
-
-     feedback_radio.change(
-         collect_feedback,
-         [feedback_radio, msg, chatbot],
-         []
-     )
-
-     follow_up_questions.change(
-         lambda x: x,
-         [follow_up_questions],
-         [msg]
-     )
-
-     export_btn.click(
-         export_conversation,
-         [chatbot, export_format],
-         [export_output]
      )

-     # URL processing events
-     url_button.click(
-         process_url_content,
-         [url_input],
-         [url_output]
      )

-     # Literature review generation
-     def generate_lit_review_wrapper(topic):
-         if not topic:
-             return "Please enter a research topic"
-         research_results = comprehensive_research(topic)
-         return generate_literature_review(topic, research_results)
-
-     lit_review_button.click(
-         generate_lit_review_wrapper,
-         [literature_topic],
-         [lit_review_output]
      )

  if __name__ == "__main__":
 
+ # app.py
  import gradio as gr
+ from modules.input_handler import InputHandler
+ from modules.retriever import Retriever
+ from modules.analyzer import Analyzer
+ from modules.citation import CitationManager
+ from modules.formatter import OutputFormatter
  import os

+ # Initialize modules
+ input_handler = InputHandler()
+ retriever = Retriever(api_key=os.getenv("TAVILY_API_KEY"))
+ analyzer = Analyzer(base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+                     api_key=os.getenv("HF_TOKEN"))
+ citation_manager = CitationManager()
+ formatter = OutputFormatter()
+
+ def research_assistant(query):
+     """
+     Main orchestrator function that coordinates all modules
+     """
      try:
+         # Step 1: Process input
+         processed_query = input_handler.process_query(query)

+         # Step 2: Retrieve data
+         search_results = retriever.search(processed_query)

+         # Step 3: Analyze content
+         analysis = analyzer.analyze(query, search_results)

+         # Step 4: Manage citations
+         cited_analysis = citation_manager.add_citations(analysis, search_results)

+         # Step 5: Format output
+         formatted_output = formatter.format_response(cited_analysis, search_results)

+         return formatted_output

      except Exception as e:
+         return f"An error occurred: {str(e)}"
+
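For reference, a minimal sketch of exercising this five-step pipeline outside Gradio (hypothetical usage, not part of the commit; assumes TAVILY_API_KEY and HF_TOKEN are set in the environment):

    # Hypothetical smoke test of research_assistant(); query text is illustrative.
    answer = research_assistant("Impact of climate change on global agriculture")
    print(answer)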
+ # Create Gradio interface
+ with gr.Blocks(title="Research Assistant") as demo:
+     gr.Markdown("# 🧠 AI Research Assistant")
+     gr.Markdown("Enter a research topic to get a structured analysis with sources")
+
+     with gr.Row():
+         with gr.Column():
+             query_input = gr.Textbox(
+                 label="Research Query",
+                 placeholder="Enter your research question...",
+                 lines=3
              )
+             submit_btn = gr.Button("Research", variant="primary")
+
+         with gr.Column():
+             output = gr.Markdown(label="Analysis Results")
+
+     examples = gr.Examples(
+         examples=[
+             "Latest advancements in quantum computing",
+             "Impact of climate change on global agriculture",
+             "Recent developments in Alzheimer's treatment research"
+         ],
+         inputs=query_input
      )

+     submit_btn.click(
+         fn=research_assistant,
+         inputs=query_input,
+         outputs=output
      )

+     query_input.submit(
+         fn=research_assistant,
+         inputs=query_input,
+         outputs=output
      )

  if __name__ == "__main__":
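The hunk ends at the `__main__` guard without showing its body. A conventional Gradio entry point would look like the following (an assumption; the launch line is outside the shown diff):

    if __name__ == "__main__":
        demo.launch()  # assumed standard Gradio launch; not shown in the hunk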
modules/analyzer.py ADDED
@@ -0,0 +1,55 @@
+ from openai import OpenAI
+ import json
+
+ class Analyzer:
+     def __init__(self, base_url, api_key):
+         self.client = OpenAI(
+             base_url=base_url,
+             api_key=api_key
+         )
+
+     def analyze(self, query, search_results):
+         """
+         Analyze search results using the custom LLM
+         """
+         # Prepare context from search results
+         context = "\n\n".join([
+             f"Source: {result.get('url', 'N/A')}\nContent: {result.get('content', '')}"
+             for result in search_results[:3]  # Limit to top 3 for context
+         ])
+
+         prompt = f"""
+         You are an expert research analyst. Analyze the following query and information to provide a comprehensive summary.
+
+         Query: {query}
+
+         Information:
+         {context}
+
+         Please provide:
+         1. A brief overview of the topic
+         2. Key findings or developments
+         3. Different perspectives or approaches
+         4. Potential implications or future directions
+         5. Any controversies or conflicting viewpoints
+
+         Structure your response clearly with these sections.
+         """
+
+         try:
+             response = self.client.chat.completions.create(
+                 model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
+                 messages=[
+                     {"role": "system", "content": "You are a helpful research assistant that provides structured, analytical responses."},
+                     {"role": "user", "content": prompt}
+                 ],
+                 temperature=0.7,
+                 max_tokens=1500,
+                 stream=False
+             )
+
+             return response.choices[0].message.content
+
+         except Exception as e:
+             return f"Analysis failed: {str(e)}"
+
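A minimal usage sketch for Analyzer, assuming Tavily-style result dicts with `url` and `content` keys; the sample values are hypothetical (note that the module's `import json` is unused):

    import os
    # Hypothetical call; base_url and token mirror the commit's app.py wiring.
    analyzer = Analyzer(
        base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
        api_key=os.getenv("HF_TOKEN"),
    )
    results = [{"url": "https://example.com/paper", "content": "Sample finding..."}]
    summary = analyzer.analyze("quantum computing", results)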
modules/citation.py ADDED
@@ -0,0 +1,34 @@
+ class CitationManager:
+     def add_citations(self, analysis, search_results):
+         """
+         Add citations to the analysis based on source URLs
+         """
+         if not search_results:
+             return analysis
+
+         # Create a simple citation mapping
+         citations = {}
+         for i, result in enumerate(search_results):
+             citation_id = f"[{i+1}]"
+             citations[citation_id] = {
+                 'url': result.get('url', ''),
+                 'title': result.get('title', 'Untitled'),
+                 'source': result.get('source', 'Unknown')
+             }
+
+         # Add citation references to analysis
+         cited_analysis = analysis
+         # In a more sophisticated implementation, we would match claims to sources
+         # For now, we'll just append the citation list
+
+         return cited_analysis, citations
+
+     def format_bibliography(self, citations):
+         """
+         Format citations into a bibliography
+         """
+         bib_items = []
+         for cite_id, info in citations.items():
+             bib_item = f"{cite_id} {info['title']}. {info['source']}. Retrieved from: {info['url']}"
+             bib_items.append(bib_item)
+         return "\n".join(bib_items)
modules/formatter.py ADDED
@@ -0,0 +1,27 @@
+ class OutputFormatter:
+     def format_response(self, analysis_result, search_results):
+         """
+         Format the final response with proper structure
+         """
+         if isinstance(analysis_result, tuple):
+             analysis, citations = analysis_result
+         else:
+             analysis = analysis_result
+             citations = {}
+
+         # Format the response
+         formatted_output = f"## Research Analysis\n\n{analysis}\n\n"
+
+         # Add sources section
+         if search_results:
+             formatted_output += "## Sources\n"
+             for i, result in enumerate(search_results):
+                 formatted_output += f"{i+1}. [{result.get('title', 'Untitled')}]({result.get('url', '')})\n"
+
+         # Add citation details if available
+         if citations:
+             formatted_output += "\n## Detailed Citations\n"
+             for cite_id, info in citations.items():
+                 formatted_output += f"- {cite_id} **{info['title']}** - {info['source']}: {info['url']}\n"
+
+         return formatted_output
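For a single source, `format_response` yields Markdown along these lines (illustrative output, assuming one hypothetical result dict without a title key):

    ## Research Analysis

    <analysis text>

    ## Sources
    1. [Untitled](https://example.com/paper)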
modules/input_handler.py ADDED
@@ -0,0 +1,23 @@
+ class InputHandler:
+     def process_query(self, query):
+         """
+         Process and validate user input
+         """
+         # Clean and normalize query
+         cleaned_query = query.strip()
+
+         # Add context if needed
+         if len(cleaned_query) < 5:
+             raise ValueError("Query too short. Please provide more details.")
+
+         return cleaned_query
+
+     def extract_keywords(self, query):
+         """
+         Extract important keywords from query
+         """
+         # Simple keyword extraction (could be enhanced with NLP)
+         stop_words = {'the', 'is', 'at', 'which', 'on', 'in', 'for', 'of', 'with', 'by'}
+         words = query.lower().split()
+         keywords = [word for word in words if word not in stop_words]
+         return keywords
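A brief usage sketch (hypothetical input; note that `extract_keywords` is defined here but never called from app.py's pipeline):

    handler = InputHandler()
    query = handler.process_query("  impact of climate change on agriculture  ")
    keywords = handler.extract_keywords(query)
    # -> ['impact', 'climate', 'change', 'agriculture']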
modules/retriever.py ADDED
@@ -0,0 +1,35 @@
+ from tavily import TavilyClient
+ import logging
+
+ class Retriever:
+     def __init__(self, api_key):
+         self.client = TavilyClient(api_key=api_key)
+
+     def search(self, query, max_results=5):
+         """
+         Search for relevant content using Tavily API
+         """
+         try:
+             response = self.client.search(
+                 query=query,
+                 search_depth="advanced",
+                 max_results=max_results,
+                 include_answer=False,
+                 include_raw_content=False
+             )
+             return response.get('results', [])
+         except Exception as e:
+             logging.error(f"Search failed: {str(e)}")
+             return []
+
+     def get_related_queries(self, query):
+         """
+         Generate related search queries
+         """
+         # This could be enhanced with LLM-based query expansion
+         return [
+             f"{query} research paper",
+             f"{query} latest developments",
+             f"{query} pros and cons"
+         ]
+
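A usage sketch (hypothetical query; assumes TAVILY_API_KEY is set). Because `search` returns `[]` on failure after logging, the pipeline degrades to an analysis without sources rather than raising:

    import os
    retriever = Retriever(api_key=os.getenv("TAVILY_API_KEY"))
    hits = retriever.search("quantum error correction", max_results=3)
    variants = retriever.get_related_queries("quantum error correction")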