chatbot / app.py
tdecae's picture
Update app.py
804125a verified
raw
history blame
No virus
6.55 kB
# import os
# import sys
# import openai
# from langchain.chains import ConversationalRetrievalChain, RetrievalQA
# from langchain.chat_models import ChatOpenAI
# from langchain.document_loaders import DirectoryLoader, TextLoader
# from langchain.embeddings import OpenAIEmbeddings
# from langchain.indexes import VectorstoreIndexCreator
# from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# from langchain.llms import OpenAI
# from langchain.text_splitter import CharacterTextSplitter
# __import__('pysqlite3')
# import sys
# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# from langchain.vectorstores import Chroma
# import gradio as gr
# os.environ["OPENAI_API_KEY"] = os.getenv("OPENAPIKEY")
# docs = []
# for f in os.listdir("multiple_docs"):
# if f.endswith(".pdf"):
# pdf_path = "./multiple_docs/" + f
# loader = PyPDFLoader(pdf_path)
# docs.extend(loader.load())
# elif f.endswith('.docx') or f.endswith('.doc'):
# doc_path = "./multiple_docs/" + f
# loader = Docx2txtLoader(doc_path)
# docs.extend(loader.load())
# elif f.endswith('.txt'):
# text_path = "./multiple_docs/" + f
# loader = TextLoader(text_path)
# docs.extend(loader.load())
# splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = splitter.split_documents(docs)
# # Convert the document chunks to embedding and save them to the vector store
# vectorstore = Chroma.from_documents(docs, embedding=OpenAIEmbeddings(), persist_directory="./data")
# vectorstore.persist()
# chain = ConversationalRetrievalChain.from_llm(
# ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo'),
# retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
# return_source_documents=True,
# verbose=False
# )
# chat_history = []
# with gr.Blocks() as demo:
# chatbot = gr.Chatbot([("", "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment related questions such as my previous or most recent experience, where I'm eligible to work, when I can start work, what NLP skills I have, and much more! you can chat with me directly in multiple languages")],avatar_images=["./multiple_docs/Guest.jpg","./multiple_docs/Thierry Picture.jpg"])
# msg = gr.Textbox()
# clear = gr.Button("Clear")
# chat_history = []
# def user(query, chat_history):
# # print("User query:", query)
# # print("Chat history:", chat_history)
# # Convert chat history to list of tuples
# chat_history_tuples = []
# for message in chat_history:
# chat_history_tuples.append((message[0], message[1]))
# # Get result from QA chain
# result = chain({"question": query, "chat_history": chat_history_tuples})
# # Append user message and response to chat history
# chat_history.append((query, result["answer"]))
# # print("Updated chat history:", chat_history)
# return gr.update(value=""), chat_history
# msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
# clear.click(lambda: None, None, chatbot, queue=False)
# demo.launch(debug=True)
import os
import sys

import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Register pysqlite3 under the name ``sqlite3`` so that later imports of
# ``sqlite3`` resolve to it instead of the interpreter's bundled module.
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
# Collect every supported document found in the ``multiple_docs`` folder.
# Each entry pairs the accepted filename suffixes with the loader class used
# for them; entries are tried in order (PDF, Word, plain text) and the first
# match wins. Files with any other suffix are skipped.
_LOADERS_BY_SUFFIX = (
    ((".pdf",), PyPDFLoader),
    ((".docx", ".doc"), Docx2txtLoader),
    ((".txt",), TextLoader),
)

docs = []
for filename in os.listdir("multiple_docs"):
    for suffixes, loader_cls in _LOADERS_BY_SUFFIX:
        if filename.endswith(suffixes):
            docs.extend(loader_cls("./multiple_docs/" + filename).load())
            break

# Re-chunk the loaded documents with the splitter configured for a chunk size
# of 1000 and an overlap of 10; ``docs`` is rebound to the resulting chunks.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
docs = splitter.split_documents(docs)
# Build the Chroma vector store from the document chunks.
#
# The store is created WITH an embedding function so that the same
# sentence-transformers model embeds the indexed chunks and, later, the
# retriever's queries. LangChain's ``Chroma.add_texts`` derives vectors from
# the store's own embedding function; it has no ``embeddings=`` parameter, so
# vectors computed separately and passed that way are not used.
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
texts = [doc.page_content for doc in docs]
# Keep whatever metadata the loaders attached to each chunk and add the
# chunk's position under "id", so returned source documents stay traceable.
# NOTE(review): assumes loader metadata values are plain str/int/float, which
# is what Chroma accepts - confirm for the loaders in use.
metadatas = [{**doc.metadata, "id": i} for i, doc in enumerate(docs)]

vectorstore = Chroma(
    persist_directory="./db",
    embedding_function=embedding_function,
)
# Skip the insert when nothing was loaded instead of sending an empty batch.
if texts:
    vectorstore.add_texts(texts=texts, metadatas=metadatas)
# NOTE(review): "./db" is a persistent directory and the chunks are added on
# every start; presumably entries accumulate across restarts - confirm and
# clear or de-duplicate if so.
vectorstore.persist()
# Load the Hugging Face model for text generation.
#
# ``max_new_tokens`` limits only the generated continuation. A ``max_length``
# limit counts the prompt tokens as well, so a long retrieval prompt can use
# up the whole budget before any answer is produced. When both are supplied,
# transformers gives ``max_new_tokens`` precedence.
# 256 is a tunable default, not a value derived from the model.
generator = pipeline(
    "text-generation",
    model="EleutherAI/gpt-neo-2.7B",
    max_new_tokens=256,
)
class HuggingFaceLLMWrapper:
    """Callable adapter exposing a text-generation callable as ``prompt -> str``."""

    def __init__(self, generator):
        """Remember the callable that produces generations.

        Args:
            generator: Callable invoked as
                ``generator(prompt, max_length=..., num_return_sequences=1)``
                and returning an indexable of dicts with a
                ``'generated_text'`` key.
        """
        self.generator = generator

    def __call__(self, prompt, max_length=512):
        """Generate a single sequence for *prompt* and return its text.

        Args:
            prompt: Text handed to the generator as-is.
            max_length: Forwarded unchanged as the generator's ``max_length``
                keyword.

        Returns:
            The ``'generated_text'`` value of the first returned item.
        """
        outputs = self.generator(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
        )
        first_output = outputs[0]
        return first_output['generated_text']
# ``ConversationalRetrievalChain.from_llm`` expects a LangChain language
# model, not an arbitrary Python callable. ``HuggingFacePipeline`` is
# LangChain's adapter around a transformers pipeline, so the pipeline is
# wrapped with it before being handed to the chain.
llm = HuggingFacePipeline(pipeline=generator)

# Retrieval-augmented conversational chain: the retriever is asked for 6
# results per question (``k``), and the retrieved documents are returned
# next to the answer.
chain = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=vectorstore.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False,
)
# Module-level placeholder kept for backward compatibility. The handler below
# does not read it: the conversation arrives as the handler's ``chat_history``
# argument, taken from the Chatbot component on every submit.
chat_history = []

with gr.Blocks() as demo:
    chatbot = gr.Chatbot([("", "Hello, I'm Thierry Decae's chatbot, you can ask me any recruitment related questions such as my previous or most recent experience, where I'm eligible to work, when I can start work, what NLP skills I have, and much more! you can chat with me directly in multiple languages")], avatar_images=["./multiple_docs/Guest.jpg","./multiple_docs/Thierry Picture.jpg"])
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(query, chat_history):
        """Answer *query* through the QA chain and extend the conversation.

        Args:
            query: Text submitted from the textbox.
            chat_history: Current Chatbot value, a sequence of
                (user message, bot message) pairs; ``None`` is treated as an
                empty conversation.

        Returns:
            A pair ``(textbox update, history)``: the update clears the
            textbox, and ``history`` is a new list of tuples with the latest
            exchange appended. The incoming ``chat_history`` is not mutated.
        """
        # The chain is given the history as a list of 2-tuples.
        history = [(turn[0], turn[1]) for turn in (chat_history or [])]

        # A blank submission has nothing to answer; skip retrieval and
        # generation and leave the conversation as it is.
        if not query or not query.strip():
            return gr.update(value=""), history

        # Get result from QA chain
        result = chain({"question": query, "chat_history": history})
        return gr.update(value=""), history + [(query, result["answer"])]

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
    # Clearing resets the Chatbot component by sending it ``None``.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch(debug=True)