vidhiparikh
committed
Commit 51a81da
1 Parent(s): d1418c2
Upload app.py
The code implements a conversational AI chatbot that combines Langchain's document processing, text splitting, and retrieval capabilities with the LlamaCpp language model. It uses PyPDF2 to parse PDF documents and HuggingFace's SentenceTransformer embeddings to index them for retrieval, and it integrates with Gradio for user interaction, so natural-language queries are answered with content sourced from the documents.
app.py
ADDED
@@ -0,0 +1,146 @@
import PyPDF2
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Customized file paths
pdf_files = ["C:/Users/vidhi/OneDrive/Desktop/CVs/final/CV_Vidhi_Parikh.pdf"]

# Function to extract documents from PDF files
def extract_documents_from_pdf(pdf_files):
    documents = []
    metadata = []
    content = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for index, page in enumerate(pdf_reader.pages):
            document_page = {'title': pdf + " page " + str(index + 1), 'content': page.extract_text()}
            documents.append(document_page)
    for doc in documents:
        content.append(doc["content"])
        metadata.append({
            "title": doc["title"]
        })
    print("Documents extracted from PDF files.")
    return content, metadata

# Function to split documents into text chunks
def split_documents_into_chunks(content, metadata):
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,
        chunk_overlap=256,
    )
    split_documents = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents split into {len(split_documents)} passages.")
    return split_documents

# Function to ingest split documents into the vector database
def ingest_into_vector_database(split_documents):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    database = FAISS.from_documents(split_documents, embeddings)
    DB_PATH = 'vectorstore/vector_database'
    database.save_local(DB_PATH)
    return database

# Customized conversation template
template = """[INST]
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
- Answer the question based on the provided documents.
- Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no, etc.
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
- Do not fabricate information or include questions in your responses.
- Do not prompt to select answers. Do not ask additional questions.
- Cite the source of where exactly the information in the document is found and mention it in your responses.
{question}
[/INST]
"""

# Callback manager for handling callbacks
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Function to create a conversational chain
def create_conversational_chain(database):
    llama_llm = LlamaCpp(
        model_path="llama-2-7b-chat.Q8_0.gguf",
        temperature=0.75,
        max_tokens=200,
        top_p=1,
        callback_manager=callback_manager,
        n_ctx=3000)

    retriever = database.as_retriever()
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True, output_key='answer')

    conversation_chain = (ConversationalRetrievalChain.from_llm
                          (llm=llama_llm,
                           retriever=retriever,
                           #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                           memory=memory,
                           return_source_documents=True))
    print("Conversational Chain created.")
    return conversation_chain

# Function to validate the answer against source documents
def validate_answer(response_answer, source_documents):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    similarity_threshold = 0.5
    source_texts = [doc.page_content for doc in source_documents]

    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)

    if any(score.item() > similarity_threshold for score in cosine_scores[0]):
        return True

    return False

# Extract documents from PDF files
content, metadata = extract_documents_from_pdf(pdf_files)

# Split documents into text chunks
split_documents = split_documents_into_chunks(content, metadata)

# Ingest split documents into the vector database
vector_database = ingest_into_vector_database(split_documents)
print("Vector database created.")

# Create the conversation chain
conversation_chain = create_conversational_chain(vector_database)

# Function for the chatbot
def chat_with_bot(input_text):
    user_query = input_text
    response = conversation_chain({"question": user_query})
    print("Response:", response)
    print("Answer:", response['answer'])
    return response['answer']

# Create Gradio interface
iface = gr.Interface(
    fn=chat_with_bot,
    inputs=gr.inputs.Textbox(lines=2, label="User Input"),
    outputs="text",
    layout="vertical",
    title="Simple Chatbot",
    description="Enter your message and the chatbot will respond."
)

# Launch the interface
iface.launch()
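Note: validate_answer is defined in app.py but never called. Because the chain is built with return_source_documents=True, each response already carries the retrieved passages, so the check could be wired into the chatbot roughly as in the sketch below. This is a minimal illustration, not part of the uploaded file; the fallback message is an assumption chosen to echo the refusal wording in the prompt template.

# Hedged sketch (not part of the uploaded app.py): gate answers with validate_answer,
# using the source documents that the chain returns alongside each answer.
def chat_with_bot_validated(input_text):
    response = conversation_chain({"question": input_text})
    answer = response['answer']
    sources = response['source_documents']
    # Accept the answer only if it is semantically close to at least one retrieved passage.
    if validate_answer(answer, sources):
        return answer
    # Fallback wording is an assumption, mirroring the refusal phrase in the prompt template.
    return "I cannot provide an answer based on the provided document."

Hooking this up would only require passing chat_with_bot_validated instead of chat_with_bot as fn when building the Gradio interface.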