rdune71 committed
Commit 742b2a5 · 1 Parent(s): 0272eed
app.py CHANGED
@@ -1,1039 +1,81 @@
  import gradio as gr
- import requests
- import json
  import os
- import re
- import time
- import io
- from datetime import datetime
- from functools import lru_cache
- from requests.adapters import HTTPAdapter
- from urllib3.util.retry import Retry
-
- # Configuration
- BASE_URL = "https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/"
- HF_TOKEN = os.environ.get("HF_TOKEN")
- TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
-
- # Validate required environment variables
- if not HF_TOKEN:
-     raise ValueError("HF_TOKEN environment variable is required")
-
- # Get current date and time information
- CURRENT_DATE = datetime.now()
- DATE_INFO = CURRENT_DATE.strftime("%A, %B %d, %Y")
- TIME_INFO = CURRENT_DATE.strftime("%I:%M %p")
- FORMATTED_DATE_TIME = f"Current Date: {DATE_INFO}\nCurrent Time: {TIME_INFO}"
-
- # Initialize session with retry strategy
- session = requests.Session()
- retry_strategy = Retry(
-     total=3,
-     backoff_factor=1,
-     status_forcelist=[429, 500, 502, 503, 504],
- )
- adapter = HTTPAdapter(max_retries=retry_strategy)
- session.mount("http://", adapter)
- session.mount("https://", adapter)
-
- # Initialize Tavily client
- try:
-     from tavily import TavilyClient
-     tavily_client = TavilyClient(api_key=TAVILY_API_KEY) if TAVILY_API_KEY else None
-     TAVILY_AVAILABLE = True
- except ImportError:
-     tavily_client = None
-     TAVILY_AVAILABLE = False
-     print("Tavily not available: Please install tavily-python")
-
- # Import additional libraries for advanced features
- try:
-     import PyPDF2
-     PDF_SUPPORT = True
- except ImportError:
-     PDF_SUPPORT = False
-     print("PyPDF2 not available: Install for PDF processing support")
-
- try:
-     from bs4 import BeautifulSoup
-     WEB_SCRAPING = True
- except ImportError:
-     WEB_SCRAPING = False
-     print("BeautifulSoup not available: Install for web scraping support")
-
- try:
-     import feedparser
-     ACADEMIC_SEARCH = True
- except ImportError:
-     ACADEMIC_SEARCH = False
-     print("feedparser not available: Install for academic search support")
-
- # Rate limiter class
- class RateLimiter:
-     def __init__(self, max_calls=10, time_window=60):
-         self.max_calls = max_calls
-         self.time_window = time_window
-         self.calls = []
-
-     def is_allowed(self):
-         now = time.time()
-         self.calls = [call for call in self.calls if now - call < self.time_window]
-         if len(self.calls) < self.max_calls:
-             self.calls.append(now)
-             return True
-         return False
-
- rate_limiter = RateLimiter(max_calls=20, time_window=60)
-
- # Feedback storage
- feedback_data = []
-
- def get_preloaded_context():
-     """Get preloaded context information"""
-     context = f"""{FORMATTED_DATE_TIME} System Information: You are an AI assistant with access to current information through web search and academic research tools. Always provide sources for factual information. Available APIs: - Web Search (Tavily) - Academic Research (arXiv, Semantic Scholar) - PDF Document Analysis - Web Page Content Extraction Specialized Features: - Research-focused queries automatically processed - Academic paper analysis and summarization - Literature review generation - Citation management and bibliography creation Response Guidelines: 1. After completing your analysis, ALWAYS end with either: '[ANALYSIS COMPLETE]' - when you've fully addressed the query '[FURTHER RESEARCH NEEDED]' - when additional investigation would be beneficial 2. For search results, provide clear synthesis rather than just listing findings 3. Include specific citations and sources where applicable 4. Structure complex answers with clear sections when appropriate"""
-     return context
-
- def clean_query_for_current_info(query):
-     """Clean query to focus on current/fresh information"""
-     # Remove old dates
-     query = re.sub(r'\d{4}-\d{2}-\d{2}', '', query)
-     query = re.sub(r'\d{4}/\d{2}/\d{2}', '', query)
-     query = re.sub(r'\d{2}/\d{2}/\d{4}', '', query)
-
-     return query.strip()
-
- def determine_research_content_type(query):
-     """Determine if query requires research-focused search"""
-     research_keywords = [
-         'research', 'study', 'paper', 'academic', 'scientific',
-         'experiment', 'findings', 'discovery', 'theory',
-         'hypothesis', 'methodology', 'conclusion', 'literature',
-         'peer reviewed', 'scholarly', 'journal', 'publication',
-         'analyze', 'investigate', 'examine', 'review'
-     ]
-     return any(keyword in query.lower() for keyword in research_keywords)
-
- def is_news_related_query(query):
-     """Check if query is related to news"""
-     news_keywords = ['news', 'headline', 'breaking', 'latest', 'today', 'current event', 'update', 'report']
-     query_lower = query.lower()
-     return any(word in query_lower for word in news_keywords)
-
- def is_search_results_content(content):
-     """Check if content appears to be search results that need analysis"""
-     search_indicators = [
-         "[SEARCH RESULTS FOR",
-         "Source: Web Search",
-         "Tavily search error",
-         "arXiv Paper:",
-         "Semantic Scholar Paper:"
-     ]
-     return any(indicator in content for indicator in search_indicators) and len(content) > 200
-
- def is_looping_content(content):
-     """Detect if content is stuck in a loop"""
-     if len(content) > 2000:  # Too long, likely looping
-         return True
-     if content.count("let's do") > 15:  # Repeated phrases
-         return True
-     if content.count("search") > 40:  # Excessive repetition
-         return True
-     return False
-
- def validate_history(chat_history):
-     """Ensure proper alternation in chat_history"""
-     if not chat_history:
-         return []
-
-     validated = []
-     expected_role = "user"
-
-     for message in chat_history:
-         role = message.get("role")
-         content = message.get("content", "")
-
-         # Skip empty messages
-         if not content:
-             continue
-
-         # Only add messages that follow proper alternation
-         if role == expected_role:
-             validated.append(message)
-             expected_role = "assistant" if expected_role == "user" else "user"
-         elif role == "system" and len(validated) == 0:
-             # Allow system message at start
-             validated.append(message)
-
-     return validated
-
- def convert_history_format(internal_history):
-     """Convert internal dict format to Gradio chatbot format"""
-     gradio_history = []
-     for msg in internal_history:
-         if isinstance(msg, dict):
-             gradio_history.append([msg.get("role", "unknown"), msg.get("content", "")])
-         else:
-             gradio_history.append(msg)
-     return gradio_history
-
- def truncate_history(messages, max_tokens=4000):
-     """Truncate conversation history to prevent context overflow"""
-     if not messages:
-         return []
-
-     # Simplified token estimation (4 chars ~ 1 token)
-     estimated_tokens = sum(len(msg.get("content", "")) for msg in messages) // 4
-
-     if estimated_tokens <= max_tokens:
-         return messages
-
-     # Truncate older messages
-     truncated = []
-     current_tokens = 0
-
-     # Keep system message if present
-     if messages and messages[0].get("role") == "system":
-         truncated.append(messages[0])
-         messages = messages[1:]
-
-     # Add recent messages up to token limit
-     for message in reversed(messages):
-         content = message.get("content", "")
-         message_tokens = len(content) // 4
-
-         if current_tokens + message_tokens > max_tokens:
-             break
-
-         truncated.insert(0, message)
-         current_tokens += message_tokens
-
-     return truncated
-
- def manage_conversation_memory(messages, max_turns=10):
-     """Keep conversation focused and prevent context overflow"""
-     if len(messages) > max_turns * 2:  # *2 for user/assistant pairs
-         # Keep system message + last N turns
-         system_msg = [msg for msg in messages if msg.get("role") == "system"]
-         recent_messages = messages[-(max_turns * 2):]
-         return system_msg + recent_messages if system_msg else recent_messages
-     return messages
-
- def tavily_search(query):
-     """Perform search using Tavily"""
-     if not TAVILY_AVAILABLE or not tavily_client:
-         return "Web search not available."
-
-     try:
-         # Clean query for current info
-         clean_query = clean_query_for_current_info(query)
-
-         if not clean_query:
-             return "No valid search query provided."
-
-         response = tavily_client.search(
-             clean_query,
-             search_depth="advanced",
-             topic="general",
-             max_results=5
-         )
-
-         results = []
-         for result in response.get("results", [])[:5]:
-             title = result.get("title", "")
-             content = result.get("content", "")
-             url = result.get("url", "")
-             if title and content:
-                 results.append(f"{title}: {content} (Source: {url})")
-             elif content:
-                 results.append(f"{content} (Source: {url})")
-
-         if results:
-             return "\n\n".join(results)
-         else:
-             return "No relevant information found."
-
-     except Exception as e:
-         return f"Tavily search error: {str(e)}"
-
- def download_and_extract_pdf(url):
-     """Download PDF and extract text content"""
-     if not PDF_SUPPORT:
-         return "PDF processing not available. Please install PyPDF2."
-
-     try:
-         # Download PDF
-         response = session.get(url, timeout=30)
-         response.raise_for_status()
-
-         # Extract text from PDF
-         pdf_file = io.BytesIO(response.content)
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
-
-         text_content = []
-         for page_num, page in enumerate(pdf_reader.pages):
-             if page_num < 15:  # Limit to first 15 pages
-                 text_content.append(page.extract_text())
-             else:
-                 break
-
-         full_text = "\n".join(text_content)
-         return f"PDF CONTENT EXTRACTED FROM {url}:\n{full_text[:4000]}..."  # Limit size
-
-     except Exception as e:
-         return f"PDF extraction error: {str(e)}"
-
- def scrape_web_page(url):
-     """Scrape and process full web pages"""
-     if not WEB_SCRAPING:
-         return "Web scraping not available. Please install beautifulsoup4."
-
-     try:
-         headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         }
-
-         response = session.get(url, headers=headers, timeout=15)
-         soup = BeautifulSoup(response.content, 'html.parser')
-
-         # Remove script and style elements
-         for script in soup(["script", "style", "nav", "footer", "aside"]):
-             script.decompose()
-
-         # Extract main content
-         title = soup.find('title').get_text().strip() if soup.find('title') else "No Title"
-
-         # Try to find main content area
-         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
-         if main_content:
-             paragraphs = main_content.find_all(['p', 'h1', 'h2', 'h3'])
-         else:
-             paragraphs = soup.find_all('p')
-
-         content = ' '.join([p.get_text().strip() for p in paragraphs[:30] if p.get_text().strip()])
-
-         return f"WEB PAGE CONTENT FROM {url}:\nTitle: {title}\nContent: {content[:3000]}..."  # Limit content size
-
-     except Exception as e:
-         return f"Error scraping page: {str(e)}"
-
- def arxiv_search(query):
-     """Search and process academic papers from arXiv"""
-     if not ACADEMIC_SEARCH:
-         return "Academic search not available. Please install feedparser."
-
-     try:
-         # Search arXiv
-         search_url = f"http://export.arxiv.org/api/query?search_query=all:{requests.utils.quote(query)}&max_results=3&sortBy=relevance&sortOrder=descending"
-         feed = feedparser.parse(search_url)
-
-         results = []
-         for entry in feed.entries[:3]:
-             title = entry.title
-             summary = entry.summary
-             authors = ", ".join([author.name for author in entry.authors[:3]]) if entry.authors else "Unknown Authors"
-             published = entry.published if hasattr(entry, 'published') else "Unknown Date"
-             pdf_url = entry.links[1].href if len(entry.links) > 1 and entry.links[1].type == 'application/pdf' else "No PDF link"
-
-             result = f"arXiv Paper:\nTitle: {title}\nAuthors: {authors}\nPublished: {published}\nAbstract: {summary}"
-             if pdf_url and pdf_url != "No PDF link":
-                 result += f"\nPDF URL: {pdf_url}"
-             results.append(result)
-
-         if results:
-             return "\n\n---\n\n".join(results)
-         else:
-             return "No arXiv papers found for this query."
-
-     except Exception as e:
-         return f"arXiv search error: {str(e)}"
-
- def semantic_scholar_search(query):
-     """Search academic papers using Semantic Scholar API"""
      try:
-         api_url = "https://api.semanticscholar.org/graph/v1/paper/search"
-         params = {
-             "query": query,
-             "limit": 3,
-             "fields": "title,abstract,authors,year,venue,url,citationCount,referenceCount"
-         }

-         response = session.get(api_url, params=params, timeout=15)
-         if response.status_code == 200:
-             data = response.json()
-         else:
-             return f"Semantic Scholar API error: {response.status_code}"

-         results = []
-         for paper in data.get("data", [])[:3]:
-             title = paper.get("title", "")
-             abstract = paper.get("abstract", "")
-             authors = ", ".join([author.get("name", "") for author in paper.get("authors", [])[:3]])
-             year = paper.get("year", "")
-             venue = paper.get("venue", "")
-             url = paper.get("url", "")
-             citations = paper.get("citationCount", 0)
-             references = paper.get("referenceCount", 0)
-
-             result = f"Semantic Scholar Paper:\nTitle: {title}\nAuthors: {authors}\nYear: {year}\nVenue: {venue}\nCitations: {citations}\nReferences: {references}\nAbstract: {abstract[:500]}..."
-             if url:
-                 result += f"\nURL: {url}"
-             results.append(result)

-         if results:
-             return "\n\n---\n\n".join(results)
-         else:
-             return "No Semantic Scholar papers found for this query."
-
-     except Exception as e:
-         return f"Semantic Scholar search error: {str(e)}"
-
- def comprehensive_research(query):
-     """Aggregate research from multiple academic sources"""
-     results = []
-
-     # Add search header
-     results.append(f"COMPREHENSIVE RESEARCH RESULTS FOR: '{query}'\n" + "="*50)
-
-     # Academic databases
-     if TAVILY_AVAILABLE and tavily_client:
-         tavily_result = tavily_search(query)
-         results.append(f"TAVILY ACADEMIC SEARCH RESULTS:\n{tavily_result}")
-
-     # arXiv for academic papers
-     arxiv_result = arxiv_search(query)
-     if "error" not in arxiv_result.lower():
-         results.append(f"ARXIV ACADEMIC PAPERS:\n{arxiv_result}")
-
-     # Semantic Scholar
-     semantic_result = semantic_scholar_search(query)
-     if "error" not in semantic_result.lower():
-         results.append(f"SEMANTIC SCHOLAR RESULTS:\n{semantic_result}")
-
-     # Generate bibliography
-     combined_results = "\n\n---\n\n".join(results)
-     bibliography = generate_bibliography(combined_results)
-     results.append(f"BIBLIOGRAPHY:\n{bibliography}")
-
-     return "\n\n---\n\n".join(results)
-
- def analyze_search_results(query, search_results):
-     """Create a prompt for the model to analyze search results"""
-     analysis_prompt = f"""Based on the search results below, please answer the original question: "{query}" Search Results: {search_results} Please provide a clear, concise answer based on these sources. Include specific names, facts, and cite the sources where possible. Do not mention that you are analyzing search results - just provide the answer directly. Structure your response thoughtfully and when you complete your analysis, please explicitly state '[ANALYSIS COMPLETE]' at the end if you have fully addressed the query and have no further input. If additional research or clarification would be beneficial, please state '[FURTHER RESEARCH NEEDED]'."""
-
-     return analysis_prompt
-
- def generate_bibliography(search_results):
-     """Generate proper bibliography from research results"""
-     # Simple bibliography generation (can be enhanced)
-     citations = []
-     lines = search_results.split('\n')
-
-     current_citation = {}
-     for line in lines:
-         if line.startswith("Title:"):
-             if current_citation:
-                 citations.append(current_citation)
-             current_citation = {"title": line[7:].strip()}
-         elif line.startswith("Authors:") and current_citation:
-             current_citation["authors"] = line[9:].strip()
-         elif line.startswith("Year:") and current_citation:
-             current_citation["year"] = line[6:].strip()
-         elif line.startswith("URL:") and current_citation:
-             current_citation["url"] = line[5:].strip()
-
-     if current_citation:
-         citations.append(current_citation)
-
-     # Format citations in APA style
-     formatted_citations = []
-     for i, citation in enumerate(citations, 1):
-         authors = citation.get("authors", "Unknown Author")
-         title = citation.get("title", "Unknown Title")
-         year = citation.get("year", "N.d.")
-         url = citation.get("url", "")

-         formatted = f"{i}. {authors} ({year}). {title}. Retrieved from {url}"
-         formatted_citations.append(formatted)
-
-     return "\n".join(formatted_citations) if formatted_citations else "No citations found."
-
- def generate_literature_review(topic, search_results):
-     """Generate structured literature review from search results"""
-     prompt = f"""Based on the following research on '{topic}', create a structured literature review: {search_results} Please organize your response as follows: 1. INTRODUCTION: Brief overview of the topic 2. KEY FINDINGS: Major discoveries and insights from the research 3. METHODOLOGIES: Common research approaches used 4. LIMITATIONS: Identified gaps or limitations in current research 5. FUTURE DIRECTIONS: Suggested areas for future investigation 6. CONCLUSION: Summary of the current state of research Format your response clearly with these section headings. When you complete your analysis, please explicitly state '[ANALYSIS COMPLETE]' at the end."""
-
-     return prompt
-
- def generate_follow_up_questions(last_response):
-     """Generate 3-5 relevant follow-up questions"""
-     if not last_response:
-         return []
-
-     # Simple heuristic-based questions
-     question_words = ["What", "How", "Why", "When", "Where", "Who"]
-     topics = ["related", "similar", "detailed", "practical"]
-
-     # Extract key topics from response (simplified)
-     words = last_response.split()[:20]  # First 20 words
-     key_topics = [word for word in words if len(word) > 4][:3]  # Simple filtering
-
-     questions = []
-     for word in question_words[:3]:  # Limit to 3
-         if key_topics:
-             topic = key_topics[0] if key_topics else "this"
-             questions.append(f"{word} about {topic}?")
-
-     return questions[:3]  # Return max 3 questions
-
- def check_analysis_status(content):
-     """Check if the AI has indicated completion status"""
-     # Check for explicit completion markers first
-     if "[ANALYSIS COMPLETE]" in content:
-         return "✅ Analysis complete - AI has finished reviewing and has no further input."
-     elif "[FURTHER RESEARCH NEEDED]" in content:
-         return "🔍 Further research needed - AI suggests additional investigation would be beneficial."
-
-     # Check if this is search results being displayed (and needs analysis)
-     elif ("[SEARCH RESULTS FOR" in content or "Source: Web Search" in content) and len(content) > 200:
-         return "📊 Search results retrieved - AUTO-ANALYSIS TRIGGERED"
-
-     # Check for comprehensive research results
-     elif "COMPREHENSIVE RESEARCH RESULTS FOR" in content:
-         return "📚 Comprehensive research completed - detailed findings provided."
-
-     # Check if this appears to be a final answer/response
-     elif any(phrase in content.lower() for phrase in [
-         "in conclusion", "to summarize", "in summary",
-         "overall", "therefore", "thus", "in closing"
-     ]):
-         return "✅ AI appears to be concluding its response."
-
-     # Check if this is bibliographic content
-     elif "BIBLIOGRAPHY:" in content or "REFERENCES:" in content:
-         return "📖 Bibliography generated - research sources compiled."
-
-     # Check if this is URL analysis results
-     elif "PDF CONTENT EXTRACTED" in content or "WEB PAGE CONTENT" in content:
-         return "📄 Document analysis complete - content extracted and ready for review."
-
-     # Check for literature review generation
-     elif "LITERATURE REVIEW" in content and any(header in content for header in [
-         "INTRODUCTION", "KEY FINDINGS", "METHODOLOGIES",
-         "LIMITATIONS", "FUTURE DIRECTIONS", "CONCLUSION"
-     ]):
-         return "📑 Literature review structured - comprehensive analysis provided."
-
-     # Default status for ongoing processing
-     else:
-         # If content is substantial and appears analytical, assume it's progressing toward completion
-         if len(content) > 200 and not content.startswith("[SEARCH RESULTS"):
-             return "🧠 Analysis in progress - AI is formulating detailed response."
-         elif content.startswith("[SEARCH RESULTS"):
-             return "📊 Search results displayed - awaiting analysis"
-         else:
-             return "⏳ Processing - AI is working on your request."
-
- def generate_with_streaming(messages, model, max_tokens=8192, temperature=0.7, top_p=0.9):
-     """Generate text with streaming"""
-     headers = {
-         "Authorization": f"Bearer {HF_TOKEN}",
-         "Content-Type": "application/json"
-     }
-
-     # Validate history to prevent errors
-     validated_messages = validate_history(messages)
-
-     payload = {
-         "model": model,
-         "messages": validated_messages,
-         "max_tokens": max_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "stream": True
-     }
-
-     start_time = time.time()
-     try:
-         response = session.post(
-             f"{BASE_URL}chat/completions",
-             headers=headers,
-             json=payload,
-             timeout=300,
-             stream=True
-         )

-         if response.status_code == 200:
-             full_response = ""
-             for line in response.iter_lines():
-                 if line:
-                     decoded_line = line.decode('utf-8')
-                     if decoded_line.startswith('data: '):
-                         data = decoded_line[6:]
-                         if data != '[DONE]':
-                             try:
-                                 json_data = json.loads(data)
-                                 if 'choices' in json_data and len(json_data['choices']) > 0:
-                                     delta = json_data['choices'][0].get('delta', {})
-                                     content = delta.get('content', '')
-                                     if content:
-                                         full_response += content
-                                         yield full_response
-                             except:
-                                 continue
-         else:
-             yield f"Error: {response.status_code} - {response.text}"

      except Exception as e:
-         yield f"Connection error: {str(e)}"
-     finally:
-         end_time = time.time()
-         # Track usage (simplified)
-         track_usage("user123", str(messages[-1]) if messages else "",
-                     end_time - start_time, len(str(messages)))
-
- def format_code_blocks(text):
-     """Detect and format code blocks with syntax highlighting"""
-     import re
-     # Simple pattern to detect code blocks
-     pattern = r'```(\w+)?\n(.*?)```'
-     # Replace with HTML formatted code (simplified)
-     formatted = re.sub(pattern, r'<pre><code class="language-\1">\2</code></pre>', text, flags=re.DOTALL)
-     return formatted
-
- def extract_and_format_citations(search_results):
-     """Extract sources and create clickable citations"""
-     # Simple citation extraction (can be enhanced)
-     citations = []
-     if "Source:" in search_results:
-         lines = search_results.split('\n')
-         for line in lines:
-             if "http" in line:
-                 citations.append(line.strip())
-     return citations
-
- def track_usage(user_id, query, response_time, tokens_used):
-     """Track usage metrics for improvement"""
-     metrics = {
-         "timestamp": datetime.now().isoformat(),
-         "user_id": user_id or "anonymous",
-         "query_length": len(query),
-         "response_time": response_time,
-         "tokens_used": tokens_used
-     }
-     # In a real app, you'd store this in a database
-     print(f"Usage tracked: {metrics}")
-     return metrics
-
- def collect_feedback(feedback, query, response):
-     """Collect user feedback for model improvement"""
-     feedback_entry = {
-         "timestamp": datetime.now().isoformat(),
-         "feedback": feedback,
-         "query": query,
-         "response": response[:100] + "..." if len(response) > 100 else response
-     }
-     feedback_data.append(feedback_entry)
-     print(f"Feedback collected: {feedback_entry}")
-     return f"Thank you for your feedback: {feedback}"
-
- @lru_cache(maxsize=100)
- def cached_search(query):
-     """Cache frequent searches"""
-     return tavily_search(query)
-
- def handle_api_failure(error_type, fallback_strategy="retry"):
-     """Handle different types of API failures gracefully"""
-     # Simplified error handling
-     return f"API Error: {error_type}. Strategy: {fallback_strategy}"
-
- def export_conversation(chat_history, export_format):
-     """Export conversation in various formats"""
-     if not chat_history:
-         return "No conversation to export"
-
-     if export_format == "JSON":
-         # Filter out system messages for export
-         exportable_history = [msg for msg in chat_history if msg[0] != "system"]
-         return json.dumps(exportable_history, indent=2, ensure_ascii=False)
-     elif export_format == "Text":
-         lines = []
-         for msg in chat_history:
-             if msg[0] != "system":  # Skip system messages
-                 lines.append(f"{msg[0].upper()}: {msg[1]}")
-         return "\n".join(lines)
-     return "Invalid format"
-
- def process_url_content(url):
-     """Intelligent URL content processing"""
-     if not url:
-         return "Please enter a URL"
-
-     if not url.startswith(('http://', 'https://')):
-         return "Invalid URL format. Please include http:// or https://"
-
-     # Determine content type and process accordingly
-     if url.lower().endswith('.pdf'):
-         return download_and_extract_pdf(url)
-     elif any(domain in url.lower() for domain in ['arxiv.org']):
-         # Extract arXiv ID and search
-         import re
-         arxiv_match = re.search(r'arxiv\.org/abs/(\d+\.\d+)', url)
-         if arxiv_match:
-             arxiv_id = arxiv_match.group(1)
-             return arxiv_search(arxiv_id)
-         else:
-             return scrape_web_page(url)
-     else:
-         return scrape_web_page(url)
-
- def respond(message, chat_history, model_choice, max_tokens, temperature, top_p,
-             creativity, precision, system_prompt, use_web_search, research_mode, theme):
-     """Main response handler with conversation history"""
-     if not message:
-         yield "", chat_history, "", gr.update(choices=[], visible=False), "", "💬 Ready for your query"
-         return
-
-     # Rate limiting check
-     if not rate_limiter.is_allowed():
-         yield "", chat_history + [["assistant", "Rate limit exceeded. Please wait a moment before sending another message."]], "", "", "", "⏰ Rate limit active"
-         return
-
-     # Convert Gradio format to internal format
-     internal_history = []
-     for msg in chat_history:
-         if len(msg) >= 2:
-             internal_history.append({"role": msg[0], "content": msg[1]})
-
-     # Add custom system prompt or preloaded context
-     if not internal_history:
-         if system_prompt:
-             system_message = {"role": "system", "content": system_prompt}
-         else:
-             preloaded_context = get_preloaded_context()
-             system_message = {"role": "system", "content": preloaded_context}
-         internal_history = [system_message] + internal_history
-
-     # Check if the message contains search results that need analysis
-     if is_search_results_content(message):
-         # This is search results that need analysis
-         analysis_status = "🧠 Auto-analyzing search results..."
-
-         # Extract the original query and search results
-         lines = message.split('\n')
-         if len(lines) > 2:
-             # Get the query from the first line
-             first_line = lines[0]
-             if "'" in first_line:
-                 query = first_line.split("'")[1]
-             else:
-                 query = message[:100]  # Fallback
-         else:
-             query = "summary request"
-
-         # Perform analysis
-         analysis_prompt = analyze_search_results(query, message)
-
-         # Create history with analysis prompt
-         analysis_history = internal_history + [{"role": "user", "content": analysis_prompt}]
-
-         # Generate analyzed response
-         full_response = ""
-         bibliography = generate_bibliography(message) if "COMPREHENSIVE RESEARCH" in message else ""
-
-         for chunk in generate_with_streaming(analysis_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-             if isinstance(chunk, str):
-                 full_response = chunk
-                 analysis_status = check_analysis_status(full_response)
-                 # Generate follow-up questions
-                 follow_ups = generate_follow_up_questions(full_response)
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [{"role": "user", "content": message}, {"role": "assistant", "content": full_response}])
-                 yield "", gradio_history, message, gr.update(choices=follow_ups, visible=True if follow_ups else False), bibliography, analysis_status
-         return
-
-     # Check if we should perform a search
-     user_message = {"role": "user", "content": message}
-
-     # Always perform search if web search is enabled
-     if use_web_search:
-         analysis_status = "🔍 Performing search..."
-         # Use enhanced research search for research queries or when research mode is enabled
-         if research_mode or determine_research_content_type(message):
-             search_result = comprehensive_research(message)
-             bibliography = generate_bibliography(search_result)
-             analysis_status = "📚 Comprehensive research completed"
-         else:
-             search_result = tavily_search(message)
-             bibliography = ""
-             analysis_status = "📊 Search results retrieved"
-
-         # AUTOMATICALLY analyze search results for ANY search
-         analysis_status = "🧠 Auto-analyzing search results..."
-         # Extract the original query for analysis
-         lines = search_result.split('\n')
-         if len(lines) > 2:
-             first_line = lines[0]
-             if "'" in first_line:
-                 query = first_line.split("'")[1]
-             else:
-                 query = message
-         else:
-             query = message
-
-         # Perform analysis of the search results
-         analysis_prompt = analyze_search_results(query, search_result)
-
-         # Create history with analysis prompt
-         analysis_history = internal_history + [user_message, {"role": "assistant", "content": search_result}, {"role": "user", "content": analysis_prompt}]
-
-         # Generate analyzed response
-         full_response = ""
-         search_results_output = search_result  # Store raw search results
-
-         for chunk in generate_with_streaming(analysis_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-             if isinstance(chunk, str):
-                 full_response = chunk
-                 analysis_status = check_analysis_status(full_response)
-                 # Generate follow-up questions
-                 follow_ups = generate_follow_up_questions(full_response)
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": search_result}, {"role": "assistant", "content": full_response}])
-                 # Stream both the analysis and raw search results
-                 yield "", gradio_history, search_results_output, gr.update(choices=follow_ups, visible=True if follow_ups else False), bibliography, analysis_status
-         return
-
-     # Normal flow - generate response
-     current_history = internal_history + [user_message]
-     full_response = ""
-     analysis_status = "💭 Generating response..."
-
-     for chunk in generate_with_streaming(current_history, model_choice, max_tokens, temperature * creativity, top_p * precision):
-         if isinstance(chunk, str):
-             full_response = chunk
-             analysis_status = check_analysis_status(full_response)
-             # Break infinite loops
-             if is_looping_content(full_response):
-                 # Force search instead of looping
-                 search_result = tavily_search(message)
-                 follow_ups = generate_follow_up_questions(search_result)
-                 analysis_status = "⚠️ Loop detected - performing search instead"
-                 # Convert back to Gradio format
-                 gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": f"[LOOP DETECTED - PERFORMING SEARCH]\n{search_result}"}])
-                 yield "", gradio_history, search_result, gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-                 return
-             # Stream the response
-             follow_ups = generate_follow_up_questions(full_response)
-             # Convert back to Gradio format
-             gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": full_response}])
-             yield "", gradio_history, "", gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-
-     # Check for tool calls after completion or break loops
-     if is_looping_content(full_response):
-         # Force search for looping content
-         search_result = tavily_search(message)
-         follow_ups = generate_follow_up_questions(search_result)
-         analysis_status = "⚠️ Loop detected - performing search instead"
-         # Convert back to Gradio format
-         gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": f"[LOOP DETECTED - PERFORMING SEARCH]\n{search_result}"}])
-         yield "", gradio_history, search_result, gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-         return
-
-     # Normal completion
-     follow_ups = generate_follow_up_questions(full_response)
-     analysis_status = check_analysis_status(full_response)
-     # Convert back to Gradio format
-     gradio_history = convert_history_format(internal_history + [user_message, {"role": "assistant", "content": full_response}])
-     yield "", gradio_history, "", gr.update(choices=follow_ups, visible=True if follow_ups else False), "", analysis_status
-
- def apply_theme(theme):
-     """Apply theme-specific CSS"""
-     if theme == "Dark":
-         return """
-         <style>
-         body { background-color: #1a1a1a; color: #ffffff; }
-         .message { background-color: #2d2d2d; }
-         .dark-mode { background-color: #1a1a1a; color: #ffffff; }
-         .analysis-complete { color: #4CAF50; font-weight: bold; }
-         .further-research { color: #FF9800; font-weight: bold; }
-         .in-progress { color: #2196F3; font-weight: bold; }
-         .search-results { color: #9C27B0; font-weight: bold; }
-         .processing { color: #00BCD4; font-weight: bold; }
-         .ready { color: #8BC34A; font-weight: bold; }
-         .warning { color: #FF5722; font-weight: bold; }
-         .document-analysis { color: #009688; font-weight: bold; }
-         .literature-review { color: #795548; font-weight: bold; }
-         .bibliography { color: #607D8B; font-weight: bold; }
-         </style>
-         """
-     else:
-         return """
-         <style>
-         body { background-color: #ffffff; color: #000000; }
-         .message { background-color: #f0f0f0; }
-         .light-mode { background-color: #ffffff; color: #000000; }
-         .analysis-complete { color: #2E7D32; font-weight: bold; }
-         .further-research { color: #EF6C00; font-weight: bold; }
-         .in-progress { color: #1565C0; font-weight: bold; }
-         .search-results { color: #7B1FA2; font-weight: bold; }
-         .processing { color: #006064; font-weight: bold; }
-         .ready { color: #558B2F; font-weight: bold; }
-         .warning { color: #D84315; font-weight: bold; }
-         .document-analysis { color: #00796B; font-weight: bold; }
-         .literature-review { color: #5D4037; font-weight: bold; }
-         .bibliography { color: #455A64; font-weight: bold; }
-         </style>
-         """
-
- # Gradio Interface
- with gr.Blocks(title="GPT-OSS Research Assistant") as demo:
-     gr.Markdown("# 🎓 GPT-OSS Research Assistant")
-     gr.Markdown(f"Advanced AI assistant with academic research capabilities\n\n**Current Date/Time**: {FORMATTED_DATE_TIME}")
-
-     # Theme CSS
-     theme_css = gr.HTML()
-
-     with gr.Tab("Chat"):
-         with gr.Row():
-             chatbot = gr.Chatbot(height=500, label="Conversation")
-
-         with gr.Row():
-             msg = gr.Textbox(label="Message", placeholder="Ask anything...", scale=9)
-             submit = gr.Button("Send", scale=1)
-
-         with gr.Row():
-             clear = gr.Button("Clear")
-             theme_toggle = gr.Radio(choices=["Light", "Dark"], value="Light", label="Theme")
-             feedback_radio = gr.Radio(
-                 choices=["👍 Helpful", "👎 Not Helpful", "🔄 Needs Improvement"],
-                 label="Rate Last Response"
-             )
-
-         with gr.Row():
-             with gr.Column():
-                 follow_up_questions = gr.Radio(
-                     choices=[],
-                     label="Suggested Follow-up Questions",
-                     visible=False
-                 )
-             with gr.Column():
-                 with gr.Row():
-                     export_format = gr.Radio(choices=["JSON", "Text"], value="JSON", label="Export Format")
-                     export_btn = gr.Button("Export Conversation")
-                 export_output = gr.File(label="Download")
-
-         with gr.Accordion("Search Results", open=False):
-             search_results = gr.Textbox(label="Raw Search Data", interactive=False, max_lines=10)
-
-         with gr.Accordion("Bibliography", open=False):
-             bibliography_output = gr.Textbox(label="Generated Bibliography", interactive=False, max_lines=10)
-
-         with gr.Accordion("Analysis Status", open=False):
-             analysis_status_output = gr.Textbox(label="AI Analysis Status", interactive=False, max_lines=3)
-
-     with gr.Tab("Research Tools"):
-         gr.Markdown("## 🔍 Advanced Research Tools")
-
-         with gr.Row():
-             url_input = gr.Textbox(label="Process URL Content", placeholder="Enter URL to analyze (web page or PDF)...")
-             url_button = gr.Button("Analyze URL")
-
-         url_output = gr.Textbox(label="URL Analysis Results", interactive=False, max_lines=20)
-
-         with gr.Row():
-             literature_topic = gr.Textbox(label="Generate Literature Review", placeholder="Enter research topic...")
-             lit_review_button = gr.Button("Generate Review")
-
-         lit_review_output = gr.Textbox(label="Literature Review", interactive=False, max_lines=20)
-
-     with gr.Accordion("Settings", open=False):
-         with gr.Row():
-             model_choice = gr.Dropdown(
-                 choices=[
-                     "DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
-                     "other-model-variants"
-                 ],
-                 value="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
-                 label="Model"
              )
-
-         with gr.Row():
-             max_tokens = gr.Slider(50, 8192, value=8192, label="Max Tokens")
-             temperature = gr.Slider(0.1, 1.0, value=0.7, label="Base Temperature")
-             top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top P")
-
-         with gr.Row():
-             creativity = gr.Slider(0.1, 1.0, value=0.7, label="Creativity")
-             precision = gr.Slider(0.1, 1.0, value=0.9, label="Precision")
-
-         system_prompt = gr.Textbox(
-             label="System Prompt",
-             value="",
-             placeholder="Enter custom system prompt...",
-             max_lines=3
-         )
-
-         with gr.Row():
-             use_web_search = gr.Checkbox(label="Enable Web Search", value=True)
-             research_mode = gr.Checkbox(label="Enable Research Mode (for academic queries)", value=False)
-
-     # Event handling
-     submit_event = submit.click(
-         respond,
-         [msg, chatbot, model_choice, max_tokens, temperature, top_p, creativity, precision, system_prompt, use_web_search, research_mode, theme_toggle],
-         [msg, chatbot, search_results, follow_up_questions, bibliography_output, analysis_status_output],
-         queue=True
-     )
-
-     msg_event = msg.submit(
-         respond,
-         [msg, chatbot, model_choice, max_tokens, temperature, top_p, creativity, precision, system_prompt, use_web_search, research_mode, theme_toggle],
-         [msg, chatbot, search_results, follow_up_questions, bibliography_output, analysis_status_output],
-         queue=True
-     )
-
-     clear.click(lambda: None, None, chatbot, queue=False)
-
-     theme_toggle.change(
-         apply_theme,
-         [theme_toggle],
-         [theme_css]
-     )
-
-     feedback_radio.change(
-         collect_feedback,
-         [feedback_radio, msg, chatbot],
-         []
-     )
-
-     follow_up_questions.change(
-         lambda x: x,
-         [follow_up_questions],
-         [msg]
-     )
-
-     export_btn.click(
-         export_conversation,
-         [chatbot, export_format],
-         [export_output]
      )

-     # URL processing events
-     url_button.click(
-         process_url_content,
-         [url_input],
-         [url_output]
      )

-     # Literature review generation
-     def generate_lit_review_wrapper(topic):
-         if not topic:
-             return "Please enter a research topic"
-         research_results = comprehensive_research(topic)
-         return generate_literature_review(topic, research_results)
-
-     lit_review_button.click(
-         generate_lit_review_wrapper,
-         [literature_topic],
-         [lit_review_output]
      )

  if __name__ == "__main__":
 
+ # app.py
  import gradio as gr
+ from modules.input_handler import InputHandler
+ from modules.retriever import Retriever
+ from modules.analyzer import Analyzer
+ from modules.citation import CitationManager
+ from modules.formatter import OutputFormatter
  import os

+ # Initialize modules
+ input_handler = InputHandler()
+ retriever = Retriever(api_key=os.getenv("TAVILY_API_KEY"))
+ analyzer = Analyzer(base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
+                     api_key=os.getenv("HF_TOKEN"))
+ citation_manager = CitationManager()
+ formatter = OutputFormatter()
+
+ def research_assistant(query):
+     """
+     Main orchestrator function that coordinates all modules
+     """
      try:
+         # Step 1: Process input
+         processed_query = input_handler.process_query(query)

+         # Step 2: Retrieve data
+         search_results = retriever.search(processed_query)

+         # Step 3: Analyze content
+         analysis = analyzer.analyze(query, search_results)

+         # Step 4: Manage citations
+         cited_analysis = citation_manager.add_citations(analysis, search_results)

+         # Step 5: Format output
+         formatted_output = formatter.format_response(cited_analysis, search_results)

+         return formatted_output

      except Exception as e:
+         return f"An error occurred: {str(e)}"
+
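For reference, a minimal sketch of exercising this five-step pipeline outside Gradio (hypothetical usage, not part of the commit; assumes TAVILY_API_KEY and HF_TOKEN are set in the environment):

    # Hypothetical smoke test of research_assistant(); query text is illustrative.
    answer = research_assistant("Impact of climate change on global agriculture")
    print(answer)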
+ # Create Gradio interface
+ with gr.Blocks(title="Research Assistant") as demo:
+     gr.Markdown("# 🧠 AI Research Assistant")
+     gr.Markdown("Enter a research topic to get a structured analysis with sources")
+
+     with gr.Row():
+         with gr.Column():
+             query_input = gr.Textbox(
+                 label="Research Query",
+                 placeholder="Enter your research question...",
+                 lines=3
              )
+             submit_btn = gr.Button("Research", variant="primary")
+
+         with gr.Column():
+             output = gr.Markdown(label="Analysis Results")
+
+     examples = gr.Examples(
+         examples=[
+             "Latest advancements in quantum computing",
+             "Impact of climate change on global agriculture",
+             "Recent developments in Alzheimer's treatment research"
+         ],
+         inputs=query_input
      )

+     submit_btn.click(
+         fn=research_assistant,
+         inputs=query_input,
+         outputs=output
      )

+     query_input.submit(
+         fn=research_assistant,
+         inputs=query_input,
+         outputs=output
      )

  if __name__ == "__main__":
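The hunk ends at the `__main__` guard without showing its body. A conventional Gradio entry point would look like the following (an assumption; the launch line is outside the shown diff):

    if __name__ == "__main__":
        demo.launch()  # assumed standard Gradio launch; not shown in the hunk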
modules/analyzer.py ADDED
@@ -0,0 +1,55 @@
+ from openai import OpenAI
+ import json
+
+ class Analyzer:
+     def __init__(self, base_url, api_key):
+         self.client = OpenAI(
+             base_url=base_url,
+             api_key=api_key
+         )
+
+     def analyze(self, query, search_results):
+         """
+         Analyze search results using the custom LLM
+         """
+         # Prepare context from search results
+         context = "\n\n".join([
+             f"Source: {result.get('url', 'N/A')}\nContent: {result.get('content', '')}"
+             for result in search_results[:3]  # Limit to top 3 for context
+         ])
+
+         prompt = f"""
+         You are an expert research analyst. Analyze the following query and information to provide a comprehensive summary.
+
+         Query: {query}
+
+         Information:
+         {context}
+
+         Please provide:
+         1. A brief overview of the topic
+         2. Key findings or developments
+         3. Different perspectives or approaches
+         4. Potential implications or future directions
+         5. Any controversies or conflicting viewpoints
+
+         Structure your response clearly with these sections.
+         """
+
+         try:
+             response = self.client.chat.completions.create(
+                 model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
+                 messages=[
+                     {"role": "system", "content": "You are a helpful research assistant that provides structured, analytical responses."},
+                     {"role": "user", "content": prompt}
+                 ],
+                 temperature=0.7,
+                 max_tokens=1500,
+                 stream=False
+             )
+
+             return response.choices[0].message.content
+
+         except Exception as e:
+             return f"Analysis failed: {str(e)}"
+
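A minimal usage sketch for Analyzer, assuming Tavily-style result dicts with `url` and `content` keys; the sample values are hypothetical (note that the module's `import json` is unused):

    import os
    # Hypothetical call; base_url and token mirror the commit's app.py wiring.
    analyzer = Analyzer(
        base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
        api_key=os.getenv("HF_TOKEN"),
    )
    results = [{"url": "https://example.com/paper", "content": "Sample finding..."}]
    summary = analyzer.analyze("quantum computing", results)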
modules/citation.py ADDED
@@ -0,0 +1,34 @@
+ class CitationManager:
+     def add_citations(self, analysis, search_results):
+         """
+         Add citations to the analysis based on source URLs
+         """
+         if not search_results:
+             return analysis
+
+         # Create a simple citation mapping
+         citations = {}
+         for i, result in enumerate(search_results):
+             citation_id = f"[{i+1}]"
+             citations[citation_id] = {
+                 'url': result.get('url', ''),
+                 'title': result.get('title', 'Untitled'),
+                 'source': result.get('source', 'Unknown')
+             }
+
+         # Add citation references to analysis
+         cited_analysis = analysis
+         # In a more sophisticated implementation, we would match claims to sources
+         # For now, we'll just append the citation list
+
+         return cited_analysis, citations
+
+     def format_bibliography(self, citations):
+         """
+         Format citations into a bibliography
+         """
+         bib_items = []
+         for cite_id, info in citations.items():
+             bib_item = f"{cite_id} {info['title']}. {info['source']}. Retrieved from: {info['url']}"
+             bib_items.append(bib_item)
+         return "\n".join(bib_items)
modules/formatter.py ADDED
@@ -0,0 +1,27 @@
+ class OutputFormatter:
+     def format_response(self, analysis_result, search_results):
+         """
+         Format the final response with proper structure
+         """
+         if isinstance(analysis_result, tuple):
+             analysis, citations = analysis_result
+         else:
+             analysis = analysis_result
+             citations = {}
+
+         # Format the response
+         formatted_output = f"## Research Analysis\n\n{analysis}\n\n"
+
+         # Add sources section
+         if search_results:
+             formatted_output += "## Sources\n"
+             for i, result in enumerate(search_results):
+                 formatted_output += f"{i+1}. [{result.get('title', 'Untitled')}]({result.get('url', '')})\n"
+
+         # Add citation details if available
+         if citations:
+             formatted_output += "\n## Detailed Citations\n"
+             for cite_id, info in citations.items():
+                 formatted_output += f"- {cite_id} **{info['title']}** - {info['source']}: {info['url']}\n"
+
+         return formatted_output
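For a single source, `format_response` yields Markdown along these lines (illustrative output, assuming one hypothetical result dict without a title key):

    ## Research Analysis

    <analysis text>

    ## Sources
    1. [Untitled](https://example.com/paper)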
modules/input_handler.py ADDED
@@ -0,0 +1,23 @@
+ class InputHandler:
+     def process_query(self, query):
+         """
+         Process and validate user input
+         """
+         # Clean and normalize query
+         cleaned_query = query.strip()
+
+         # Add context if needed
+         if len(cleaned_query) < 5:
+             raise ValueError("Query too short. Please provide more details.")
+
+         return cleaned_query
+
+     def extract_keywords(self, query):
+         """
+         Extract important keywords from query
+         """
+         # Simple keyword extraction (could be enhanced with NLP)
+         stop_words = {'the', 'is', 'at', 'which', 'on', 'in', 'for', 'of', 'with', 'by'}
+         words = query.lower().split()
+         keywords = [word for word in words if word not in stop_words]
+         return keywords
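A brief usage sketch (hypothetical input; note that `extract_keywords` is defined here but never called from app.py's pipeline):

    handler = InputHandler()
    query = handler.process_query("  impact of climate change on agriculture  ")
    keywords = handler.extract_keywords(query)
    # -> ['impact', 'climate', 'change', 'agriculture']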
modules/retriever.py ADDED
@@ -0,0 +1,35 @@
+ from tavily import TavilyClient
+ import logging
+
+ class Retriever:
+     def __init__(self, api_key):
+         self.client = TavilyClient(api_key=api_key)
+
+     def search(self, query, max_results=5):
+         """
+         Search for relevant content using Tavily API
+         """
+         try:
+             response = self.client.search(
+                 query=query,
+                 search_depth="advanced",
+                 max_results=max_results,
+                 include_answer=False,
+                 include_raw_content=False
+             )
+             return response.get('results', [])
+         except Exception as e:
+             logging.error(f"Search failed: {str(e)}")
+             return []
+
+     def get_related_queries(self, query):
+         """
+         Generate related search queries
+         """
+         # This could be enhanced with LLM-based query expansion
+         return [
+             f"{query} research paper",
+             f"{query} latest developments",
+             f"{query} pros and cons"
+         ]
+
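A usage sketch (hypothetical query; assumes TAVILY_API_KEY is set). Because `search` returns `[]` on failure after logging, the pipeline degrades to an analysis without sources rather than raising:

    import os
    retriever = Retriever(api_key=os.getenv("TAVILY_API_KEY"))
    hits = retriever.search("quantum error correction", max_results=3)
    variants = retriever.get_related_queries("quantum error correction")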