Spaces: Sleeping

vidhiparikh committed
Commit f8ac855 · 1 Parent(s): 074e0cc

Update app.py
app.py CHANGED
@@ -1,6 +1,5 @@
 import PyPDF2
 import gradio as gr
-import os
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.llms import LlamaCpp
 
@@ -14,31 +13,26 @@ from sentence_transformers import SentenceTransformer, util
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
-from ctransformers import AutoModelForCausalLM
-
-# Customized file paths
 pdf_files = ["CV_Vidhi_Parikh.pdf"]
 
-
-def extract_documents_from_pdf(pdf_files):
+def extract_documents(pdf_files):
     documents = []
     metadata = []
     content = []
     for pdf in pdf_files:
         pdf_reader = PyPDF2.PdfReader(pdf)
-        for index,
-        document_page = {'title': pdf + " page " + str(index + 1),'content':
+        for index, text in enumerate(pdf_reader.pages):
+            document_page = {'title': pdf + " page " + str(index + 1),'content': pdf_reader.pages[index].extract_text()}
             documents.append(document_page)
     for doc in documents:
         content.append(doc["content"])
         metadata.append({
             "title": doc["title"]
         })
-    print("
+    print("Content and metadata extracted from the documents.")
     return content, metadata
 
-
-def split_documents_into_chunks(content, metadata):
+def split_text_chunks(content, metadata):
     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
         chunk_size=512,
         chunk_overlap=256,
@@ -47,34 +41,30 @@ def split_documents_into_chunks(content, metadata):
     print(f"Documents split into {len(split_documents)} passages.")
     return split_documents
 
-
-def ingest_into_vector_database(split_documents):
+def ingest_into_database(split_documents):
     embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
     database = FAISS.from_documents(split_documents, embeddings)
-    DB_PATH = 'vectorstore/
+    DB_PATH = 'vectorstore/db_faiss'
     database.save_local(DB_PATH)
     return database
 
-# Customized conversation template
 template = """[INST]
 As an AI, provide accurate and relevant information based on the provided document. Your responses should adhere to the following guidelines:
 - Answer the question based on the provided documents.
-- Be
+- Be direct and factual, limited to 50 words and 2-3 sentences. Begin your response without using introductory phrases like yes, no etc.
 - Maintain an ethical and unbiased tone, avoiding harmful or offensive content.
 - If the document does not contain relevant information, state "I cannot provide an answer based on the provided document."
 - Avoid using confirmatory phrases like "Yes, you are correct" or any similar validation in your responses.
 - Do not fabricate information or include questions in your responses.
 - Do not prompt to select answers. Do not ask additional questions.
-- Cite the source of where exactly the information in the document
+- Cite the source of where exactly is the information in the document and mention it in your responses.
 {question}
 [/INST]
 """
 
-# Callback manager for handling callbacks
 callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
 
-
-def create_conversational_chain(vectordb):
+def create_conversation_chain(database):
     llama_llm = LlamaCpp(
         model_path="llama-2-7b-chat.Q8_0.gguf",
         temperature=0.75,
@@ -83,7 +73,7 @@ def create_conversational_chain(vectordb):
         callback_manager=callback_manager,
         n_ctx=3000)
 
-    retriever =
+    retriever = database.as_retriever()
     CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(template)
 
     memory = ConversationBufferMemory(
@@ -95,7 +85,7 @@ def create_conversational_chain(vectordb):
         #condense_question_prompt=CONDENSE_QUESTION_PROMPT,
         memory=memory,
         return_source_documents=True))
-    print("Conversational Chain created for the LLM using the vector store")
+    print("Conversational Chain created for the LLM using the vector store.")
     return conversation_chain
 
 def validate_answer(response_answer, source_documents):
@@ -113,30 +103,21 @@ def validate_answer(response_answer, source_documents):
 
     return False
 
-
-content, metadata
-
-# Split documents into text chunks
-split_documents = split_documents_into_chunks(content, metadata)
-
-# Ingest split documents into the vector database
-vector_database = ingest_into_vector_database(split_documents)
+content, metadata = extract_documents(pdf_files)
+split_documents = split_text_chunks(content, metadata)
+database = ingest_into_database(split_documents)
 print("Vector database created.")
+conversation_chain = create_conversation_chain(database)
 
-
-conversation_chain = create_conversational_chain(vector_database)
-
-# Function for the chatbot
-def chat_with_bot(input_text):
+def chat(input_text):
     user_query = input_text
     response = conversation_chain({"question": user_query})
-    print("
-    print("
+    print("Answer: ", response)
+    print(" Only answer:", response['answer'])
     return response['answer']
 
-# Create Gradio interface
 iface = gr.Interface(
-    fn=
+    fn=chat,
     inputs=gr.inputs.Textbox(lines=2, label="User Input"),
     outputs="text",
     layout="vertical",
@@ -144,6 +125,4 @@ iface = gr.Interface(
     description="Enter your message and the chatbot will respond."
 )
 
-# Launch the interface
 iface.launch()
-#
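For reference, a minimal sketch (not part of this commit) of how the FAISS index that ingest_into_database() now saves under 'vectorstore/db_faiss' could be reloaded in a later session. It assumes the same sentence-transformers embedding model and that FAISS and HuggingFaceEmbeddings come from langchain_community; the allow_dangerous_deserialization flag exists only in newer langchain_community releases, and the query text is illustrative only.

# Hedged sketch: reload the index saved by ingest_into_database() in app.py.
# Assumes the 'vectorstore/db_faiss' folder exists and langchain_community is installed.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
database = FAISS.load_local(
    'vectorstore/db_faiss',
    embeddings,
    allow_dangerous_deserialization=True,  # required/available only on newer releases
)
retriever = database.as_retriever()

# Illustrative query against the ingested CV pages.
docs = retriever.get_relevant_documents("What is the candidate's most recent role?")
print(docs[0].page_content if docs else "No matching passages found.")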