|
|
|
"""Said Lfagrouche: RAG_QA_App_base_on_BROOKLYN_COLLEGE STUDENT_HANDBOOK_2023-2024_app |
|
""" |
|
|
|
import os
import pprint

import gradio as gr
import weaviate
from gradio.themes.base import Base
from pymongo import MongoClient
from weaviate.embedded import EmbeddedOptions

from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain_community.vectorstores import Weaviate

from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
|
|
|
|
|
|
|
# Credentials are read from the environment; never hard-code them.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MONGO_URI = os.getenv("MONGO_URI")
|
|
|
|
|
directory_path = "Data" |
|
|
|
sys.path.append(directory_path) |
|
|
|
|
|
print("sys.path =", sys.path) |
|
|
|
|
|
my_pdfs = os.listdir(directory_path) |
|
my_pdfs |
|
|
|
|
|
# MongoDB Atlas connection. A collection and a vector search index name are
# configured here, but this app ultimately indexes its chunks in embedded
# Weaviate (see below); the Atlas pieces are kept for reference.
cluster = MongoClient(MONGO_URI)

DB_NAME = "pdfs"
COLLECTION_NAME = "pdfs_collection"

MONGODB_COLLECTION = cluster[DB_NAME][COLLECTION_NAME]
|
|
|
vector_search_index = "vector_index" |
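# If MongoDB Atlas were used as the vector store instead of Weaviate, a minimal
# sketch (assuming an Atlas Vector Search index named "vector_index" exists on
# the collection) would be:
#
#     from langchain_community.vectorstores import MongoDBAtlasVectorSearch
#     vector_search = MongoDBAtlasVectorSearch.from_documents(
#         documents=merged_documents,
#         embedding=OpenAIEmbeddings(),
#         collection=MONGODB_COLLECTION,
#         index_name=vector_search_index,
#     )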
|
|
|
|
|
# One PyPDFLoader per file in the Data directory.
loaders = []
for my_pdf in my_pdfs:
    my_pdf_path = os.path.join(directory_path, my_pdf)
    loaders.append(PyPDFLoader(my_pdf_path))

print("len(loaders) =", len(loaders))
|
|
|
|
|
|
|
|
|
# Each loader.load() returns a list of Documents, one per PDF page.
data = []
for loader in loaders:
    data.append(loader.load())

print("len(data) =", len(data), "\n")
|
|
|
|
|
|
|
# Split each page into ~200-character chunks with 20 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
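# Quick, illustrative check of the splitter's behavior on a throwaway string
# (safe to delete):
_sample = "Brooklyn College is in the Midwood section of Brooklyn. " * 8
print("sample chunk lengths:", [len(c) for c in text_splitter.split_text(_sample)])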
|
|
|
|
|
|
|
# Split every page of every PDF; docs[i] holds the chunks for PDF i.
docs = []
for doc in data:
    chunks = text_splitter.split_documents(doc)
    docs.append(chunks)

# Print the per-PDF chunk counts as a sum, e.g. "120 + 95 = 215 total documents".
total = 0
for i in range(len(docs)):
    if i == len(docs) - 1:
        print(len(docs[i]), end="")
    else:
        print(len(docs[i]), "+ ", end="")
    total += len(docs[i])
print(" =", total, " total documents\n")
|
|
|
|
|
print("docs[0] =", docs[0], "\n\n\n")
print("len(docs) =", len(docs))
|
|
|
|
|
# Flatten the per-PDF chunk lists into one list of Documents for indexing.
merged_documents = []
for doc in docs:
    merged_documents.extend(doc)

print("len(merged_documents) =", len(merged_documents))
print(merged_documents)
|
|
|
|
|
model_name = "sentence-transformers/all-MiniLM-L6-v2" |
|
model_kwargs = {'device': 'cpu'} |
|
embeddings = HuggingFaceEmbeddings( |
|
model_name=model_name, |
|
model_kwargs=model_kwargs, |
|
) |
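# Illustrative sanity check: all-MiniLM-L6-v2 returns 384-dimensional vectors.
print("embedding dimension:", len(embeddings.embed_query("test sentence")))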
|
|
|
# Start an embedded (in-process) Weaviate instance and index every chunk in it.
client = weaviate.Client(
    embedded_options=EmbeddedOptions()
)

vector_search = Weaviate.from_documents(
    client=client,
    documents=merged_documents,
    embedding=OpenAIEmbeddings(),
    by_text=False,
)
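# To avoid the OpenAI API entirely, the HuggingFace `embeddings` built above
# could be swapped in here (untested sketch):
#
#     vector_search = Weaviate.from_documents(
#         client=client,
#         documents=merged_documents,
#         embedding=embeddings,
#         by_text=False,
#     )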
|
|
|
|
|
|
|
|
|
query = "Who is Prof. Langsam" |
|
results = vector_search.similarity_search(query=query, k=10) |
|
|
|
print("\n") |
|
pprint.pprint(results) |
|
|
|
|
|
|
|
|
|
query = "Is there a pool in campus?" |
|
results = vector_search.similarity_search_with_score( |
|
query = query, k = 10 |
|
) |
|
|
|
pprint.pprint(results) |
|
|
|
|
|
|
|
|
|
query = "Where is Data tools and algorithm exam taken?" |
|
|
|
results = vector_search.similarity_search_with_score( |
|
query = query, |
|
k = 10, |
|
pre_filter = { "page": { "$eq": 1 } } |
|
) |
|
|
|
pprint.pprint(results) |
|
|
|
|
|
|
|
# Expose the vector store as a retriever. Note that `score_threshold` is only
# honored with search_type="similarity_score_threshold"; with plain
# "similarity" search the top k chunks are returned unconditionally.
retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
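# If score filtering is actually wanted, a thresholded retriever would look
# roughly like the following sketch; note it relies on the vector store
# exposing a relevance score function:
#
#     retriever = vector_search.as_retriever(
#         search_type="similarity_score_threshold",
#         search_kwargs={"k": 5, "score_threshold": 0.89},
#     )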
|
|
|
|
|
|
|
|
|
template = """ |
|
Use the following pieces of context to answer the question at the end. |
|
If you do not know the answer, just say that you do not know, do not try to make up an answer. |
|
|
|
{context} |
|
|
|
Question: {question} |
|
""" |
|
|
|
custom_rag_prompt = PromptTemplate.from_template(template) |
|
|
|
# Low temperature keeps the answers close to the retrieved context.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2)
|
|
|
|
|
|
|
def format_docs(docs):
    """Join the retrieved chunks into one context string for the prompt."""
    return "\n\n".join(doc.page_content for doc in docs)
|
|
|
|
|
|
|
# LCEL pipeline: the question is sent both to the retriever (whose chunks are
# formatted into {context}) and passed through unchanged as {question}; the
# filled prompt then goes to the LLM, whose reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)
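# For debugging, LangChain's ConsoleCallbackHandler can be attached to trace
# each chain step (sketch, commented out to keep output clean):
#
#     from langchain.callbacks.tracers import ConsoleCallbackHandler
#     rag_chain.invoke(query, config={"callbacks": [ConsoleCallbackHandler()]})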
|
|
|
|
|
query = "Where is Student Center located at?" |
|
answer = rag_chain.invoke(query) |
|
|
|
print("\nQuestion: " + query) |
|
print("Answer: " + answer) |
|
|
|
|
|
# Show which chunks the retriever surfaced for the same question.
documents = retriever.get_relevant_documents(query)

print("\nSource documents:")
pprint.pprint(documents)
|
|
|
|
|
|
|
def get_response(query):
    """Run the RAG chain for the Gradio UI and return the answer string."""
    return rag_chain.invoke(query)
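# A possible extension (sketch, not wired into the UI below): return the source
# chunks alongside the answer so they can be shown in a second textbox.
#
#     def get_response_with_sources(query):
#         answer = rag_chain.invoke(query)
#         sources = retriever.get_relevant_documents(query)
#         return answer, "\n\n".join(d.page_content for d in sources)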
|
|
|
|
|
with gr.Blocks(theme=Base(), title="RAG QA App based on the BROOKLYN COLLEGE STUDENT HANDBOOK 2023-2024, with Weaviate as the Vector Database and Gradio") as demo:
    gr.Markdown(
        """
        # RAG QA App based on the BROOKLYN COLLEGE STUDENT HANDBOOK 2023-2024, with Weaviate as the Vector Database and Gradio
        """)
|
textbox = gr.Textbox(label="Question:") |
|
with gr.Row(): |
|
button = gr.Button("Submit", variant="primary") |
|
with gr.Column(): |
|
output1 = gr.Textbox(lines=1, max_lines=10, label="Answer:") |
|
|
|
|
|
|
|
button.click(get_response, textbox, outputs=[output1]) |
|
|
|
# share=True serves a temporary public URL in addition to the local server.
demo.launch(share=True)
|
|
|
|
|
|
|
|