Spaces:

vidhiparikh
/

About-Me

Sleeping

App Files Files Community

About-Me / app.py

vidhiparikh

Upload app.py

51a81da verified 8 months ago

raw

history blame

5.76 kB

	import PyPDF2
	import gradio as gr
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.llms import LlamaCpp

	from langchain.embeddings import HuggingFaceEmbeddings
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import ConversationalRetrievalChain
	from langchain_community.vectorstores import FAISS

	from langchain.prompts import PromptTemplate
	from sentence_transformers import SentenceTransformer, util
	from langchain.callbacks.manager import CallbackManager
	from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

	# Customized file paths: the PDFs that are read and indexed when this
	# module runs (passed to extract_documents_from_pdf below).
	# NOTE(review): this is an absolute Windows path on a personal machine; it
	# presumably does not resolve in a hosted environment — confirm, and prefer
	# a path relative to the app if the PDF ships alongside it.
	pdf_files = ["C:/Users/vidhi/OneDrive/Desktop/CVs/final/CV_Vidhi_Parikh.pdf"]

	# Function to extract documents from PDF files
	def extract_documents_from_pdf(pdf_files):
	    """Extract the text of every page from the given PDF files.

	    Args:
	        pdf_files: Iterable of PDF locations; each item is handed to
	            ``PyPDF2.PdfReader`` and interpolated into the page title.

	    Returns:
	        A ``(content, metadata)`` tuple of parallel lists with one entry per
	        page, in file order then page order: ``content[i]`` is the page text
	        and ``metadata[i]`` is ``{"title": "<pdf> page <n>"}`` with ``n``
	        counted from 1.
	    """
	    content = []
	    metadata = []
	    for pdf in pdf_files:
	        pdf_reader = PyPDF2.PdfReader(pdf)
	        for page_number, page in enumerate(pdf_reader.pages, start=1):
	            # A page may yield no text (e.g. no text layer); store "" so
	            # downstream code always receives strings.
	            content.append(page.extract_text() or "")
	            # f-string instead of "+" so non-str path objects also work.
	            metadata.append({"title": f"{pdf} page {page_number}"})
	    print("Documents extracted from PDF files.")
	    return content, metadata

	# Function to split documents into text chunks
	def split_documents_into_chunks(content, metadata):
	    """Split the extracted texts into overlapping passages.

	    Args:
	        content: List of texts to split.
	        metadata: List of metadata dicts, passed as ``metadatas`` alongside
	            ``content``.

	    Returns:
	        The documents produced by the splitter's ``create_documents`` call,
	        using a tiktoken-based splitter with ``chunk_size=512`` and
	        ``chunk_overlap=256``.
	    """
	    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
	        chunk_size=512, chunk_overlap=256
	    )
	    passages = splitter.create_documents(content, metadatas=metadata)
	    print(f"Documents split into {len(passages)} passages.")
	    return passages

	# Function to ingest split documents into the vector database
	def ingest_into_vector_database(split_documents, db_path='vectorstore/vector_database'):
	    """Embed the passages, build a FAISS index from them and save it locally.

	    Args:
	        split_documents: Non-empty sequence of documents to index.
	        db_path: Directory the index is saved to; defaults to the location
	            this function has always used.

	    Returns:
	        The FAISS vector store built from ``split_documents``.

	    Raises:
	        ValueError: If ``split_documents`` is empty, since an index cannot
	            be built from zero passages.
	    """
	    if not split_documents:
	        # Fail early with a clear message instead of an opaque error from
	        # deep inside index construction (and before loading any model).
	        raise ValueError(
	            "No passages to index: the source documents produced no text."
	        )
	    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
	    database = FAISS.from_documents(split_documents, embeddings)
	    database.save_local(db_path)
	    return database

	# Customized conversation template: an instruction prompt wrapped in
	# [INST] ... [/INST] markers with a single {question} placeholder.
	# NOTE(review): no visible code passes this template to the conversational
	# chain, so these guidelines appear not to reach the model — confirm.
	template = """[INST]
	As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
	- Answer the question based on the provided documents.
	- Be concise and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no, etc.
	- Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
	- If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
	- Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
	- Do not fabricate information or include questions in your responses.
	- Do not prompt to select answers. Do not ask additional questions.
	- Cite the source of where exactly the information in the document is found and mention it in your responses.
	{question}
	[/INST]
	"""

	# Callback manager for handling callbacks: wraps a single
	# StreamingStdOutCallbackHandler (presumably echoing generated tokens to
	# stdout) and is handed to LlamaCpp in create_conversational_chain.
	callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

	# Function to create a conversational chain
	def create_conversational_chain(database):
	    """Build the conversational retrieval chain on top of a vector store.

	    Args:
	        database: Vector store exposing ``as_retriever()``.

	    Returns:
	        A ``ConversationalRetrievalChain`` built with a local LlamaCpp model,
	        a ``ConversationBufferMemory`` keyed on ``chat_history`` and
	        ``return_source_documents=True``.
	    """
	    llama_llm = LlamaCpp(
	        model_path="llama-2-7b-chat.Q8_0.gguf",
	        temperature=0.75,
	        max_tokens=200,
	        top_p=1,
	        callback_manager=callback_manager,
	        n_ctx=3000,
	    )

	    # output_key names the chain output the memory records; presumably
	    # required because returning source documents adds a second output key.
	    memory = ConversationBufferMemory(
	        memory_key='chat_history',
	        return_messages=True,
	        output_key='answer',
	    )

	    # NOTE(review): the module-level ``template`` is not applied here. It was
	    # previously compiled into a prompt that was never passed to the chain,
	    # so that dead local has been removed; the chain uses its default prompts.
	    conversation_chain = ConversationalRetrievalChain.from_llm(
	        llm=llama_llm,
	        retriever=database.as_retriever(),
	        memory=memory,
	        return_source_documents=True,
	    )
	    print("Conversational Chain created.")
	    return conversation_chain

	# Function to validate the answer against source documents
	def validate_answer(response_answer, source_documents, similarity_threshold=0.5):
	    """Check whether an answer is semantically close to any source document.

	    Args:
	        response_answer: Answer text to validate.
	        source_documents: Iterable of documents exposing ``page_content``.
	        similarity_threshold: Cosine similarity that at least one source
	            must exceed; defaults to the previously hard-coded 0.5.

	    Returns:
	        True if the cosine similarity between the answer embedding and at
	        least one source embedding is greater than the threshold; False
	        otherwise, including when there are no source documents.
	    """
	    source_texts = [doc.page_content for doc in source_documents]
	    if not source_texts:
	        # Nothing to compare against: skip loading the model and encoding
	        # an empty batch.
	        return False

	    model = SentenceTransformer('all-MiniLM-L6-v2')
	    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
	    source_embeddings = model.encode(source_texts, convert_to_tensor=True)

	    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)

	    # Only the first row is inspected, as before: one score per source text.
	    return any(
	        score.item() > similarity_threshold for score in cosine_scores[0]
	    )

	# --- Startup pipeline: runs once at module load, each step feeding the next ---
	# Extract documents from PDF files (one text + one metadata entry per page)
	content, metadata = extract_documents_from_pdf(pdf_files)

	# Split documents into text chunks
	split_documents = split_documents_into_chunks(content, metadata)

	# Ingest split documents into the vector database
	vector_database = ingest_into_vector_database(split_documents)
	print("Vector database created.")

	# Create the conversation chain; chat_with_bot reads this module-level name
	conversation_chain = create_conversational_chain(vector_database)

	# Function for the chatbot
	def chat_with_bot(input_text):
	    """Send one user message through the conversation chain.

	    Args:
	        input_text: The user's message, forwarded as the chain's
	            ``question`` input.

	    Returns:
	        The ``answer`` entry of the chain's response. The full response and
	        the answer are also printed to stdout.
	    """
	    result = conversation_chain({"question": input_text})
	    print("Response:", result)
	    reply = result['answer']
	    print("Answer:", reply)
	    return reply

	# Create Gradio interface
	# Uses the top-level gr.Textbox component rather than the legacy
	# gr.inputs.Textbox namespace, and omits the legacy `layout` argument;
	# neither legacy form is available in current Gradio releases.
	iface = gr.Interface(
	    fn=chat_with_bot,
	    inputs=gr.Textbox(lines=2, label="User Input"),
	    outputs="text",
	    title="Simple Chatbot",
	    description="Enter your message and the chatbot will respond.",
	)

	# Launch the interface
	iface.launch()