0504ankitsharma committed on
Commit
52794ee
•
1 Parent(s): 36d2f7b

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. config.json +1 -0
  3. data/paper1.pdf +3 -0
  4. main.py +85 -0
  5. requirements.txt +9 -0
  6. vectorize_documents.py +26 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/paper1.pdf filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GROQ_API_KEY": "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"}
data/paper1.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3be008fcf307c7d5840ce6d40db144928149d43eea722cbbf9cb32ae1d3a4c87
3
+ size 1667360
main.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import streamlit as st
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from langchain_chroma import Chroma
7
+ from langchain_groq import ChatGroq
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+
11
+ from vectorize_documents import embeddings
12
+
13
# Directory that contains this script; config.json and the vector DB folder
# are resolved against it.
working_dir = os.path.dirname(os.path.abspath(__file__))

# NOTE(review): config.json is committed to the repository with a live Groq
# API key. Rotate that key and provide it through a secret store instead of
# version control.
# Read the config inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open(f"{working_dir}/config.json") as config_file:
    config_data = json.load(config_file)

GROQ_API_KEY = config_data["GROQ_API_KEY"]
# ChatGroq is constructed without an explicit key in this file, so it
# presumably reads the key from this environment variable.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
17
+
18
+
19
def setup_vectorstore():
    """Open the persisted Chroma vector store stored next to this script.

    Returns:
        A ``Chroma`` instance backed by ``<working_dir>/vector_db_dir`` that
        uses the module-level ``embeddings`` object imported from
        ``vectorize_documents`` as its embedding function.
    """
    persist_directory = f"{working_dir}/vector_db_dir"
    # Reuse the shared ``embeddings`` model instead of building another
    # HuggingFaceEmbeddings here: the previous misspelled local variable was
    # never passed to Chroma, so it only cost an extra model load per call.
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
    )
25
+
26
+
27
def chat_chain(vectorstore):
    """Build the conversational retrieval chain that answers user questions.

    Args:
        vectorstore: Object exposing ``as_retriever()``; the retriever it
            returns is handed to the chain.

    Returns:
        The ``ConversationalRetrievalChain`` created by ``from_llm`` with a
        Groq chat model, a buffer memory keyed on ``chat_history``, and
        ``return_source_documents=True``.
    """
    groq_llm = ChatGroq(temperature=0, model="llama-3.1-70b-versatile")
    doc_retriever = vectorstore.as_retriever()

    # output_key tells the memory which of the chain's outputs to record,
    # since the chain also returns the source documents.
    history = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
        llm=groq_llm,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=groq_llm,
        retriever=doc_retriever,
        memory=history,
        chain_type="stuff",
        return_source_documents=True,
        verbose=True,
    )
47
+
48
+
49
# Page chrome. The icon and title use the books emoji; the earlier literals
# were the same character with its UTF-8 bytes mis-decoded.
st.set_page_config(
    page_title="Multi Doc Chat",
    page_icon="📚",
    layout="centered",
)

st.title("📚 Multi Documents Chatbot")

# Streamlit re-runs this script on every interaction, so anything that must
# survive between turns is stored in st.session_state and created only once.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = setup_vectorstore()

if "conversationsal_chain" not in st.session_state:
    st.session_state.conversationsal_chain = chat_chain(st.session_state.vectorstore)

# Replay the stored transcript so earlier turns stay visible after a re-run.
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

user_input = st.chat_input("Ask AI...")

if user_input:
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    with st.chat_message("user"):
        st.markdown(user_input)

    with st.chat_message("assistant"):
        # Use invoke(): calling the chain object directly is deprecated in
        # LangChain and returns the same result dict.
        response = st.session_state.conversationsal_chain.invoke({"question": user_input})
        assistant_response = response["answer"]
        st.markdown(assistant_response)
        st.session_state.chat_history.append({"role": "assistant", "content": assistant_response})
85
+
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.38.0
2
+ langchain-community==0.2.16
3
+ langchain-text-splitters==0.2.4
4
+ langchain-chroma==0.1.3
5
+ langchain-huggingface==0.0.3
6
+ langchain-groq==0.1.9
7
+ unstructured==0.15.0
8
+ unstructured[pdf]==0.15.0
9
+ nltk==3.8.1
vectorize_documents.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import UnstructuredFileLoader
2
+ from langchain_community.document_loaders import DirectoryLoader
3
+ from langchain_text_splitters import CharacterTextSplitter
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_chroma import Chroma
6
+
7
import os

# Resolve the input and output folders against this file's directory rather
# than the current working directory, so the index is written to the same
# ``vector_db_dir`` that main.py opens regardless of where the process starts.
base_dir = os.path.dirname(os.path.abspath(__file__))

# Load the embedding model (main.py imports this object as ``embeddings``).
embeddings = HuggingFaceEmbeddings()

# Load the PDFs in the data folder that match the glob.
loader = DirectoryLoader(
    path=os.path.join(base_dir, "data"),
    glob="./*.pdf",
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Split the documents into overlapping chunks before embedding them.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
text_chunks = text_splitter.split_documents(documents)

# Embed the chunks and persist the resulting index to disk.
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory=os.path.join(base_dir, "vector_db_dir"),
)

print("Documents Vectorized")