File size: 3,636 Bytes
d4598ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3640e57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from langchain_core.tools import tool
from langchain_community.document_loaders import WebBaseLoader, WikipediaLoader, ArxivLoader

@tool
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for information
    Args:
        query: The query to search for
    Returns:
        The search results (first paragraph of up to 5 matching articles),
        or an explicit no-results message when nothing matched
    """
    docs_found = WikipediaLoader(query=query, load_max_docs=5).load()

    # An empty string gives the consuming agent no signal that the search
    # actually ran; return an explicit message instead.
    if not docs_found:
        return f"No Wikipedia results found for query: {query!r}"

    formatted_results = []
    for i, doc in enumerate(docs_found, 1):
        source = doc.metadata.get('source', 'Unknown source')
        title = doc.metadata.get('title', 'Untitled')

        # Keep just the first paragraph (split on blank line) to bound
        # the amount of text fed back into the prompt.
        content = doc.page_content.strip()
        first_paragraph = content.split('\n\n')[0] if content else "No content available"

        formatted_results.append(f"""--- DOCUMENT {i} START ---
Source: {source}
Title: {title}
Content: {first_paragraph}
--- DOCUMENT {i} END ---""")

    return "\n\n".join(formatted_results)

@tool
def arxiv_search(query: str) -> str:
    """
    Search ArXiv for research papers
    Args:
        query: The query to search for
    Returns:
        The search results with abstracts (up to 3 papers), or an explicit
        no-results message when nothing matched
    """
    docs_found = ArxivLoader(query=query, load_max_docs=3).load()

    # An empty string gives the consuming agent no signal that the search
    # actually ran; return an explicit message instead.
    if not docs_found:
        return f"No ArXiv results found for query: {query!r}"

    formatted_results = []
    for i, doc in enumerate(docs_found, 1):
        source = doc.metadata.get('source', 'Unknown source')
        title = doc.metadata.get('title', 'Untitled')

        # For ArXiv, the abstract is typically the page_content
        abstract = doc.page_content.strip() if doc.page_content else "No abstract available"

        formatted_results.append(f"""--- DOCUMENT {i} START ---
Source: {source}
Title: {title}
Abstract: {abstract}
--- DOCUMENT {i} END ---""")

    return "\n\n".join(formatted_results)

@tool
def web_search(query: str) -> str:
    """
    Search the web for information
    Args:
        query: The query to search for (a single URL, a string of
            whitespace-separated URLs, or a list of URLs)
    Returns:
        The search results with first 1000 characters of each page
    """
    # Note: WebBaseLoader requires URLs, so this assumes query contains URLs
    # For a more general web search, you'd need a different approach like SerpAPI
    try:
        if isinstance(query, str):
            # Accept one URL or several whitespace-separated URLs in one
            # string; anything not starting with http(s) is dropped.
            urls = [token for token in query.split() if token.startswith('http')]
        else:
            urls = list(query)

        if not urls:
            return "No valid URLs provided for web search."

        # Limit to 4 URLs maximum to bound fetch time and output size
        urls = urls[:4]
        docs_found = WebBaseLoader(urls).load()

        # Explicit message instead of an empty string when nothing loaded
        if not docs_found:
            return "No content could be loaded from the provided URLs."

        formatted_results = []
        for i, doc in enumerate(docs_found, 1):
            source = doc.metadata.get('source', 'Unknown source')
            title = doc.metadata.get('title', 'Untitled')

            # Truncate to the first 1000 characters, marking the cut
            content = doc.page_content.strip()
            first_1000_chars = content[:1000] if content else "No content available"
            if len(content) > 1000:
                first_1000_chars += "..."

            formatted_results.append(f"""--- DOCUMENT {i} START ---
Source: {source}
Title: {title}
Content: {first_1000_chars}
--- DOCUMENT {i} END ---""")

        return "\n\n".join(formatted_results)

    except Exception as e:
        # Deliberate best-effort boundary: report the failure as tool
        # output rather than crashing the agent loop.
        return f"Error during web search: {str(e)}"