initial commit
- app.py +202 -0
- backend_utils/file_handlers.py +51 -0
- backend_utils/text_processor.py +45 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,202 @@
import sys
import os

sys.path.append(os.path.abspath('.'))

import streamlit as st
import time
import openai
from typing import List, Optional, Tuple, IO

from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from backend_utils.file_handlers import FileHandlerFactory
from backend_utils.text_processor import DefaultTextProcessor


MODELS = {
    'gpt-3.5': 'openai',
    'gpt-4': 'openai',
}

openai.api_key = ""
os.environ["OPENAI_API_KEY"] = ""


def set_api_key(api_provider, api_key):
    """Set the API key in the respective environment variable."""
    if api_provider == 'openai':
        os.environ["OPENAI_API_KEY"] = api_key
        openai.api_key = os.environ["OPENAI_API_KEY"]
    else:
        raise ValueError(f"Unknown API provider: {api_provider}")


def load_chain(selected_model):
    """Logic for loading the chain you want to use should go here."""
    if selected_model == 'gpt-4':
        llm = ChatOpenAI(temperature=0, model="gpt-4")
    else:
        llm = ChatOpenAI(temperature=0)
    return llm


def answer_question(knowledge_base, user_question, llm):
    """Retrieve the most relevant chunks and run a 'stuff' QA chain over them."""
    try:
        retrieved_docs = knowledge_base.similarity_search(
            user_question,
            k=10
        )
    except Exception as e:
        print(f"Error finding relevant chunks: {e}")
        return ""  # caller calls .split() on the result, so return a string
    print(retrieved_docs)
    try:
        chain = load_qa_chain(
            llm,
            chain_type="stuff"
        )
        with get_openai_callback() as callback:
            # Note: chain.run only accepts the chain's input keys
            # (input_documents, question); passing max_tokens here would raise.
            response = chain.run(
                input_documents=retrieved_docs,
                question=user_question
            )
        print(callback)
        return response
    except Exception as e:
        print(f"Error running QA chain: {e}")
        return ""


def read_files(files: List[IO]) -> Optional[str]:
    """Reads the uploaded files and returns the combined text."""
    combined_text = ""
    file_factory = FileHandlerFactory()
    for file in files:
        if file is None:
            continue
        handler = file_factory.get_file_handler(file.type)
        text = handler.read_file(file)
        if not text:
            print(f"No text could be extracted from {file.name}. "
                  "Please ensure the file is not encrypted or corrupted.")
            return None
        combined_text += text
    return combined_text


def chunk_text(combined_text: str) -> Optional[Tuple[List[str], DefaultTextProcessor]]:
    """Split the combined text into chunks; returns (chunks, processor) or None."""
    processor = DefaultTextProcessor(chunk_size=500, chunk_overlap=0)
    chunks = processor.split_text(combined_text)
    if not chunks:
        print("Couldn't split the text into chunks. Please try again with different text.")
        return None
    return chunks, processor


def create_embeddings(chunks: List[str], processor: DefaultTextProcessor):
    """Takes chunks and creates embeddings in a knowledge base (a FAISS store)."""
    knowledge_base = processor.create_embeddings(chunks)
    if not knowledge_base:
        print("Couldn't create embeddings from the text. Please try again.")
        return None
    return knowledge_base


def load_documents(files):
    """Read, chunk, and embed the uploaded files into a knowledge base."""
    print(files)
    combined_text = read_files(files)
    if combined_text is None:
        return None
    result = chunk_text(combined_text)
    if result is None:
        return None
    chunks, processor = result
    knowledge_base = create_embeddings(chunks, processor)
    print("ALL DONE")
    return knowledge_base


def get_text():
    # Note: currently unused; input is taken via st.chat_input below.
    input_text = st.text_input("You: ", "Hello, how are you?", key="input")
    return input_text


if __name__ == "__main__":
    st.set_page_config(
        page_title="Chat with your documents demo:",
        page_icon="📖",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Dropdown to select model
    selected_model = st.sidebar.selectbox("Select a model", list(MODELS.keys()))

    # Input box to enter API key
    api_key = st.sidebar.text_input(f"Enter API key for {MODELS[selected_model]}", type="password")

    # Set the API key for the selected model
    if api_key:
        set_api_key(MODELS[selected_model], api_key)

    llm = load_chain(selected_model)

    if "loaded" not in st.session_state:
        st.session_state["loaded"] = False
    if "knowledge_base" not in st.session_state:
        st.session_state["knowledge_base"] = None

    ResumePDF = st.sidebar.file_uploader(
        "Upload your documents", type=['pdf'], help="Help message goes here",
        key="uploaded_file", accept_multiple_files=True
    )
    if ResumePDF:
        print("ResumePDF", ResumePDF)
        if not st.session_state["loaded"]:
            with st.spinner('Loading files 📖'):
                st.session_state["knowledge_base"] = load_documents(ResumePDF)
                st.session_state["loaded"] = True

    st.header("📖 Chat with your documents demo:")

    if "messages" not in st.session_state:
        st.session_state["messages"] = [
            {"role": "assistant", "content": "How can I help you?"}]

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if user_input := st.chat_input("What is your question?"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": user_input})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(user_input)

        with st.chat_message("assistant"):
            message_placeholder = st.empty()
            full_response = ""

            with st.spinner('Thinking ...'):
                ai_message = answer_question(st.session_state["knowledge_base"], user_input, llm)
                # ai_message = llm.predict_messages([HumanMessage(content=user_input)])
            # Simulate a stream of the response with a small delay
            print(ai_message)
            for chunk in ai_message.split():
                full_response += chunk + " "
                time.sleep(0.05)
                # Add a blinking cursor to simulate typing
                message_placeholder.markdown(full_response + "▌")
            message_placeholder.markdown(full_response)
            st.session_state.messages.append({"role": "assistant", "content": full_response})
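
For reference, the same retrieve-then-answer flow can be exercised without the Streamlit UI. This is a minimal sketch, not part of the commit: it assumes a valid OpenAI key and uses placeholder text in place of an uploaded PDF.

# sketch_qa.py -- hypothetical smoke test, not part of this commit
import os
from backend_utils.text_processor import DefaultTextProcessor
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

os.environ["OPENAI_API_KEY"] = "sk-..."  # assumption: a valid key

processor = DefaultTextProcessor(500, 0)
chunks = processor.split_text("Some long placeholder document text. " * 100)
kb = processor.create_embeddings(chunks)      # FAISS index, or None on failure

question = "What is the document about?"
docs = kb.similarity_search(question, k=10)   # same k as answer_question above
chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff")
print(chain.run(input_documents=docs, question=question))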
backend_utils/file_handlers.py
ADDED
@@ -0,0 +1,51 @@
from abc import ABC, abstractmethod
from PyPDF2 import PdfReader


class FileHandler(ABC):
    """Abstract base class for file handlers."""

    @abstractmethod
    def read_file(self, file):
        """Read the file and extract the text.

        Parameters:
            file (UploadedFile): The file to read.

        Returns:
            str: The extracted text.
        """
        pass


class PDFHandler(FileHandler):
    """Reads PDF files with PyPDF2, page by page."""

    def read_file(self, file):
        try:
            pdf_reader = PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
            return text
        except Exception as e:
            print(f"Error reading file: {e}")
            return ""  # return an empty string if an error occurs


class FileHandlerFactory:
    """Maps MIME types to the matching FileHandler."""

    @staticmethod
    def get_file_handler(file_type):
        if file_type == "application/pdf":
            return PDFHandler()
        else:
            raise ValueError(f"Invalid file type: {file_type}")
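
As a quick check, PdfReader accepts any binary file-like object, so a local file opened in "rb" mode can stand in for Streamlit's UploadedFile here (the path below is hypothetical):

from backend_utils.file_handlers import FileHandlerFactory

handler = FileHandlerFactory.get_file_handler("application/pdf")
with open("example.pdf", "rb") as f:   # hypothetical local PDF
    text = handler.read_file(f)
print(text[:200])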
backend_utils/text_processor.py
ADDED
@@ -0,0 +1,45 @@
from abc import ABC, abstractmethod
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS


class TextProcessor(ABC):
    """Abstract base class for splitting text and building embeddings."""

    @abstractmethod
    def split_text(self, text):
        pass

    @abstractmethod
    def create_embeddings(self, chunks):
        pass


class DefaultTextProcessor(TextProcessor):
    """Splits text into overlapping chunks and indexes them in a FAISS store."""

    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=[" ", ",", "\n"],
            length_function=len,
        )
        chunks = text_splitter.split_text(text)
        return chunks

    def create_embeddings(self, chunks):
        if not chunks:
            return None
        embeddings = OpenAIEmbeddings()
        try:
            return FAISS.from_texts(chunks, embeddings)
        except Exception as e:
            print(f"Error creating embeddings: {e}")
            return None
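
Splitting alone needs no API key (only create_embeddings does), so the chunking parameters can be sanity-checked in isolation. A minimal sketch with placeholder input:

from backend_utils.text_processor import DefaultTextProcessor

processor = DefaultTextProcessor(chunk_size=500, chunk_overlap=0)
chunks = processor.split_text("word " * 1000)
print(len(chunks), max(len(c) for c in chunks))   # every chunk is <= 500 chars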
requirements.txt
ADDED
@@ -0,0 +1,8 @@
openai==0.27.8
transformers==4.30.2
langchain==0.0.250
streamlit==1.25.0
torch==2.0.1
faiss-cpu==1.7.4
tiktoken==0.4.0
PyPDF2