"""Gradio chatbot that answers recruitment questions from local documents.

Every PDF, Word and plain-text file found in ``./multiple_docs`` is loaded,
split into chunks, embedded with ``OpenAIEmbeddings`` and stored in a Chroma
vector store persisted under ``./data``. A ``ConversationalRetrievalChain``
built on ``gpt-3.5-turbo`` answers questions over that store, and a Gradio
Blocks UI exposes it as a chat window.

The OpenAI key is read from the ``OPENAPIKEY`` environment variable and
exported as ``OPENAI_API_KEY``.
"""

import os
import sys

import openai
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    DirectoryLoader,
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter

# Replace the stdlib ``sqlite3`` module with ``pysqlite3``. This MUST run
# before Chroma is imported, which is why the import below is not at the top.
# NOTE(review): presumably needed because Chroma requires a newer SQLite than
# the host provides -- confirm against the deployment environment.
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

from langchain.vectorstores import Chroma  # noqa: E402
import gradio as gr  # noqa: E402

# Directory holding the source documents (and the chat avatar images).
DOCS_DIR = "./multiple_docs"

# Where the Chroma vector store is persisted.
PERSIST_DIR = "./data"

# File-name suffix -> loader class. Matching is case-sensitive and checked in
# this order; ``.doc`` files go through the same loader as ``.docx``.
LOADERS_BY_SUFFIX = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".doc": Docx2txtLoader,
    ".txt": TextLoader,
}

# First message shown in the chat window. The original literal contained a
# line break after "much more! "; it is kept here as an explicit "\n".
GREETING = (
    "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment "
    "related questions such as my previous or most recent experience, "
    "where I'm eligible to work, when I can start work, what NLP skills "
    "I have, and much more! \n"
    "you can chat with me directly in multiple languages"
)

# Fail early with a readable message instead of the TypeError that
# ``os.environ[...] = None`` raises when the variable is missing.
api_key = os.getenv("OPENAPIKEY")
if api_key:
    os.environ["OPENAI_API_KEY"] = api_key
elif not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OpenAI API key not found: set the OPENAPIKEY environment variable."
    )

# Load every supported document; files with any other suffix (for example the
# avatar images stored in the same directory) are skipped.
docs = []
for file_name in os.listdir(DOCS_DIR):
    for suffix, loader_cls in LOADERS_BY_SUFFIX.items():
        if file_name.endswith(suffix):
            docs.extend(loader_cls(f"{DOCS_DIR}/{file_name}").load())
            break

splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = splitter.split_documents(docs)

# Convert the document chunks to embeddings and save them to the vector store.
# NOTE(review): this runs on every start against the same persist directory,
# so chunks may accumulate as duplicates across restarts -- verify, and
# consider reusing the persisted store when it already exists.
vectorstore = Chroma.from_documents(
    docs,
    embedding=OpenAIEmbeddings(),
    persist_directory=PERSIST_DIR,
)
vectorstore.persist()

chain = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo'),
    retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False,
)


def user(query, chat_history):
    """Answer ``query`` with the retrieval chain and extend the transcript.

    Args:
        query: Text submitted from the textbox.
        chat_history: Current value of the Chatbot component. Each entry is
            read as ``entry[0]`` (user text) and ``entry[1]`` (bot reply).
            An empty or ``None`` value is treated as no history.

    Returns:
        A ``(textbox_update, chat_history)`` pair: an update that empties the
        textbox, and the history with the new ``(query, answer)`` turn
        appended.
    """
    chat_history = chat_history or []

    # The chain is given the history as a list of (question, answer) tuples.
    history_pairs = [(turn[0], turn[1]) for turn in chat_history]

    result = chain({"question": query, "chat_history": history_pairs})

    chat_history.append((query, result["answer"]))
    return gr.update(value=""), chat_history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        [("", GREETING)],
        avatar_images=[
            "./multiple_docs/Guest.jpg",
            "./multiple_docs/Thierry Picture.jpg",
        ],
    )
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # Submitting the textbox runs the chain and refreshes both components.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    # The clear button resets the chat window by setting its value to None.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True)