Spaces:
Sleeping
Sleeping
File size: 5,757 Bytes
51a81da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
import PyPDF2
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
# PDF sources to index; each entry is a path to one PDF file.
pdf_files = [
    "C:/Users/vidhi/OneDrive/Desktop/CVs/final/CV_Vidhi_Parikh.pdf",
]
# Function to extract documents from PDF files
def extract_documents_from_pdf(pdf_files):
    """Extract the text of every page from the given PDF files.

    Args:
        pdf_files: Iterable of PDF paths (strings or path-like objects),
            each opened with ``PyPDF2.PdfReader``.

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists: ``content``
        holds one text string per page and ``metadata`` holds the matching
        ``{"title": "<pdf> page <n>"}`` dict, with pages numbered from 1.
    """
    content = []
    metadata = []
    for pdf in pdf_files:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Normalise a missing extraction result to "" so every entry in
            # ``content`` is a string.
            content.append(page.extract_text() or "")
            # An f-string (rather than ``+``) also works for path-like entries.
            metadata.append({"title": f"{pdf} page {page_number}"})
    print("Documents extracted from PDF files.")
    return content, metadata
# Function to split documents into text chunks
def split_documents_into_chunks(
    content,
    metadata,
    chunk_size=512,
    chunk_overlap=256,
):
    """Split texts into overlapping passages, carrying their metadata along.

    Args:
        content: List of text strings to split.
        metadata: List of metadata dicts, one per entry in ``content``;
            passed to the splitter as ``metadatas``.
        chunk_size: Maximum passage size given to the tiktoken-based
            splitter. Defaults to 512.
        chunk_overlap: Overlap between consecutive passages, in the same
            unit as ``chunk_size``. Defaults to 256.

    Returns:
        The list of documents returned by the splitter's
        ``create_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_documents = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents split into {len(split_documents)} passages.")
    return split_documents
# Function to ingest split documents into the vector database
def ingest_into_vector_database(split_documents):
    """Embed the passages, index them with FAISS and save the index locally.

    Args:
        split_documents: Documents to embed and index.

    Returns:
        The FAISS vector store built from ``split_documents``. The same
        store is also written to ``vectorstore/vector_database``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
    store = FAISS.from_documents(split_documents, embedder)
    store.save_local('vectorstore/vector_database')
    return store
# Instruction prompt for the assistant, wrapped in [INST] ... [/INST] markers.
# Its only placeholder is {question}.
# NOTE(review): this template is not currently passed to the retrieval chain,
# so it does not influence answers - confirm whether it should be wired in.
template = """[INST]
As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
- Answer the question based on the provided documents.
- Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no, etc.
- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
- Do not fabricate information or include questions in your responses.
- Do not prompt to select answers. Do not ask additional questions.
- Cite the source of where exactly the information in the document is found and mention it in your responses.
{question}
[/INST]
"""
# Callback manager holding a single StreamingStdOutCallbackHandler; it is
# handed to the LlamaCpp model in create_conversational_chain.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Function to create a conversational chain
def create_conversational_chain(database):
    """Build a conversational retrieval chain on top of ``database``.

    Args:
        database: Vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` backed by a local LlamaCpp model,
        with buffer memory stored under ``chat_history`` and source
        documents included in each response.
    """
    llama_llm = LlamaCpp(
        model_path="llama-2-7b-chat.Q8_0.gguf",
        temperature=0.75,
        max_tokens=200,
        top_p=1,
        callback_manager=callback_manager,
        n_ctx=3000,
    )
    # The chain returns source documents alongside the answer, so the memory
    # is told explicitly which output key ('answer') to record.
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )
    # NOTE(review): no custom prompt is supplied, so the chain uses the
    # library's default prompts. The previously built-but-unused prompt
    # object and its commented-out argument were removed as dead code.
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llama_llm,
        retriever=database.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    print("Conversational Chain created.")
    return conversation_chain
# Cache of loaded sentence-embedding models, keyed by model name, so that
# repeated validations do not reload the model from disk each time.
_SIMILARITY_MODELS = {}


def _get_similarity_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it once."""
    if model_name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)
    return _SIMILARITY_MODELS[model_name]


# Function to validate the answer against source documents
def validate_answer(response_answer, source_documents, similarity_threshold=0.5):
    """Check whether an answer is semantically close to any source document.

    Args:
        response_answer: The answer text to validate.
        source_documents: Iterable of objects with a ``page_content``
            attribute holding the source text.
        similarity_threshold: Cosine-similarity score a source must exceed
            for the answer to count as supported. Defaults to 0.5.

    Returns:
        ``True`` if at least one source scores above the threshold,
        otherwise ``False``. With no source documents the result is
        ``False``.
    """
    source_texts = [doc.page_content for doc in source_documents]
    # Nothing to compare against: the answer cannot be supported, and there
    # is no need to load the model or encode an empty batch.
    if not source_texts:
        return False
    model = _get_similarity_model('all-MiniLM-L6-v2')
    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
    # Row 0 holds the answer's score against each source text.
    return any(score.item() > similarity_threshold for score in cosine_scores[0])
# One-time start-up pipeline: these statements run when the module is loaded.
# Read every page of the configured PDFs into parallel text/metadata lists.
content, metadata = extract_documents_from_pdf(pdf_files)
# Break the page texts into passages, keeping each page's metadata attached.
split_documents = split_documents_into_chunks(content, metadata)
# Embed the passages, build the vector index and save it.
vector_database = ingest_into_vector_database(split_documents)
print("Vector database created.")
# Build the module-level chain that chat_with_bot calls for each message.
conversation_chain = create_conversational_chain(vector_database)
# Function for the chatbot
def chat_with_bot(input_text):
    """Answer a user message using the module-level conversation chain.

    Args:
        input_text: The user's message.

    Returns:
        The ``answer`` value from the chain's response, or a short prompt
        asking for a question when the input is empty or whitespace-only.
    """
    # Avoid sending an empty query to the chain.
    if not input_text or not input_text.strip():
        return "Please enter a question."
    response = conversation_chain({"question": input_text})
    print("Response:", response)
    print("Answer:", response['answer'])
    return response['answer']
# Create Gradio interface
# ``gr.Textbox`` replaces the legacy ``gr.inputs.Textbox`` namespace, and the
# legacy ``layout`` argument is omitted; neither is accepted by current
# Gradio releases.
iface = gr.Interface(
    fn=chat_with_bot,
    inputs=gr.Textbox(lines=2, label="User Input"),
    outputs="text",
    title="Simple Chatbot",
    description="Enter your message and the chatbot will respond.",
)
# Launch the interface
iface.launch()
|