import gradio as gr
import os
import bs4
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain import hub
from bs4 import BeautifulSoup
import requests
from langchain_core.prompts import ChatPromptTemplate


# The OpenAI credential must be supplied through the environment. A key that
# is hard-coded in source ends up in version control and has to be treated as
# leaked, so the app refuses to start without an externally provided one.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before "
        "starting the app."
    )

# Chat model used to generate the final answer for every request.
llm = ChatOpenAI(model="gpt-4o-mini")

# Prompt template for the answer step; it expects the input keys "question"
# and "context".
# NOTE(review): both placeholders are interpolated into the system message AND
# the user message, so the retrieved context is sent twice per request.
system_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
            If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
            Question: {question}
            Context: {context}
            Answer:"""),
    ("user", "{question}, {context}")
])

def read_url(url, timeout=10):
    """Download a web page and split the text of its ``<p>`` elements into chunks.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of documents produced by ``RecursiveCharacterTextSplitter``
        (chunk size 1000, overlap 200, start index recorded in the metadata).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would block the request forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of indexing the text of an error page.
    response.raise_for_status()

    paragraphs = BeautifulSoup(response.text, 'html.parser').find_all('p')

    # Join with a newline so the last word of one paragraph is not fused with
    # the first word of the next one.
    full_content = "\n".join(p.get_text() for p in paragraphs)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
    return text_splitter.create_documents([full_content])

def read_file(file):
    """Load an uploaded PDF, TXT or MD file and return it as document chunks.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        A list of documents, or ``None`` when no file was given or its
        extension is not one of ``.pdf``, ``.txt``, ``.md``.
    """
    if file is None:
        return None

    # Accept both file-like wrappers (path in ``.name``) and bare path strings.
    path = str(getattr(file, "name", file))
    # Compare case-insensitively so that e.g. "REPORT.PDF" is recognised.
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        loader = PyPDFLoader(path)
        return loader.load_and_split()

    if lowered.endswith(('.txt', '.md')):
        loader = TextLoader(path)
        pages_no_split = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20, add_start_index=True)
        return text_splitter.split_documents(pages_no_split)

    # Unsupported extension.
    return None

def output_format_docs(docs):
    """Render ``docs`` for display, each one under a numbered banner line.

    Snippets are numbered from 1 in the order given; the rendered sections
    are joined with a single newline. An empty input yields an empty string.
    """
    sections = []
    for number, doc in enumerate(docs, start=1):
        banner = f"\n ========== THE {number} KNOWLEDGE SNIPPET ========== \n"
        sections.append(f"{banner}{doc.page_content}")
    return "\n".join(sections)

def format_docs(docs):
    """Concatenate the ``page_content`` of ``docs``, separated by a blank line."""
    separator = "\n\n"
    texts = [doc.page_content for doc in docs]
    return separator.join(texts)

# ==================== GRADIO START ====================
def greet(prompt, file, url):
    """Answer ``prompt`` with retrieval over an uploaded file or a web page.

    Args:
        prompt: The user's question.
        file: Value of the Gradio ``File`` input, or ``None`` when nothing
            was uploaded.
        url: Web page address. When non-empty it takes precedence over
            ``file``.

    Returns:
        A ``(snippets, answer)`` pair of strings for the two output boxes.
        When the request cannot be served, ``snippets`` carries an explanatory
        message and ``answer`` is the empty string.
    """
    if not prompt or not prompt.strip():
        return "You haven't enter the question yet!", ''

    # Treat a missing or whitespace-only URL as "no URL given".
    url = (url or '').strip()

    if url:
        try:
            all_splits = read_url(url)
        except requests.RequestException as err:
            return f"Could not fetch the URL: {err}", ''
    elif file is not None:
        all_splits = read_file(file)
    else:
        return "Please upload a file or provide a URL first!", ''

    # read_file returns None for unsupported types; a page or file may also
    # simply contain no text. Indexing nothing would fail further down.
    if not all_splits:
        return "No readable text was found in the provided source.", ''

    # No persist_directory is passed, so nothing is saved to disk by this call.
    vectorstore = Chroma(
        collection_name="example_collection",
        embedding_function=OpenAIEmbeddings(),
    )
    try:
        vectorstore.add_documents(documents=all_splits)

        retriever = vectorstore.as_retriever()
        retrieved_docs = retriever.invoke(prompt)
        formatted_doc = format_docs(retrieved_docs)

        chain = system_prompt | llm | StrOutputParser()
        complete_sentence = chain.invoke({"question": prompt, "context": formatted_doc})
    finally:
        # Always drop the collection, even when embedding or the LLM call
        # fails; otherwise this request's documents would remain in the
        # fixed-name collection and pollute the next request's retrieval.
        vectorstore.delete_collection()

    return output_format_docs(retrieved_docs), complete_sentence


# Gradio UI: three inputs (question, optional file, optional URL) wired to
# ``greet``, and two text outputs (retrieved snippets, model answer).
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Textbox(
            label='PROMPT',
            info='Feel free to ask the Bot your questions here!',
            lines=5,
            placeholder="""Examples:
"What are the key findings of the latest financial report?"
"Can you summarize the main legal requirements for data privacy in this document?"
"What are the recommended treatment options for [specific medical condition] mentioned in the report?"
""",
        ),
        gr.File(
            file_types=['.pdf', '.txt', '.md'],
            label='Support PDF、TXT、MD',
        ),
        gr.Textbox(label='URL', info='Please paste your URL and ask question about the web page!'),
    ],
    outputs=[
        gr.Textbox(
            label='Knowledge Snippets',
            info='These are the knowledge snippets detected by the system. Do you think they are accurate?',
        ),
        # This box shows the generated answer, so its hint must not repeat the
        # snippets box text.
        gr.Textbox(
            label='BOT OUTPUT （gpt-4o-mini）',
            info="This is the answer the model generated from the knowledge snippets above.",
        ),
    ],
    title="Enhancing LLM Accuracy with Retrieval-Augmented Generation (RAG)",
    description="""\n
          Large language models (LLM) today often fall short in providing accurate specialized information. Inquiries related to fields such as medicine, law, or finance may result in inaccurate responses.\n 
          Retrieval-Augmented Generation (RAG) is a widely adopted solution to this challenge. By storing specialized knowledge in a database, RAG enables Bots to search the knowledge base and generate precise, expert-level responses.\n 
          This methodology not only allows businesses to develop Bots tailored to their specific operations by incorporating proprietary data and knowledge but also ensures enhanced security by hosting the knowledge base on their own servers, thereby reducing the risk of data breaches.\n 
          Try to upload your own documents or URLs below:""",
)

# Start the server only when executed as a script, so importing this module
# (tests, tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch(debug=True)