valeriylo committed
Commit 0af2865
1 Parent(s): 9c7ac65

Create app.py

Files changed (1)
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
+ import streamlit as st
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from htmlTemplates import css, bot_template, user_template
+ # LlamaCpp replaces the raw `from llama_cpp import Llama` import so the local
+ # model can be passed to LangChain chains (see get_conversation_chain below)
+ from langchain.llms import HuggingFaceHub, LlamaCpp
+ from huggingface_hub import snapshot_download
+
15
+ def get_pdf_text(pdf_docs):
16
+ text = ""
17
+ for pdf in pdf_docs:
18
+ pdf_reader = PdfReader(pdf)
19
+ for page in pdf_reader.pages:
20
+ text += page.extract_text()
21
+ return text
22
+
23
+
24
+ def get_text_chunks(text):
25
+ text_splitter = CharacterTextSplitter(
26
+ separator="\n",
27
+ chunk_size=1000,
28
+ chunk_overlap=200,
29
+ length_function=len
30
+ )
31
+ chunks = text_splitter.split_text(text)
32
+ return chunks
33
+
34
+
35
+ def get_vectorstore(text_chunks):
36
+ embeddings = OpenAIEmbeddings()
37
+ # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
38
+ vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
39
+ return vectorstore
40
+
41
+
42
+ def get_conversation_chain(vectorstore):
43
+ #llm = ChatOpenAI()
44
+ #llm = HuggingFaceHub(repo_id="tinkoff-ai/ruDialoGPT-medium", model_kwargs={"temperature": 0.1,
45
+ #"top_k": 10,
46
+ # "top_p": 0.95,})
47
+ #"max_length": 512})
48
+
49
+ repo_name = "IlyaGusev/saiga2_7b_gguf"
50
+ model_name = "model-q2_K.gguf"
51
+ snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name)
52
+
53
+ llm = Llama(model_path=model_name, n_ctx=2000, n_parts=1)
54
+
55
+ memory = ConversationBufferMemory(
56
+ memory_key='chat_history', return_messages=True)
57
+ conversation_chain = ConversationalRetrievalChain.from_llm(
58
+ llm=llm,
59
+ retriever=vectorstore.as_retriever(),
60
+ memory=memory
61
+ )
62
+ return conversation_chain
63
+
64
+
65
+ def handle_userinput(user_question):
66
+ response = st.session_state.conversation({'question': user_question})
67
+ st.session_state.chat_history = response['chat_history']
68
+
69
+ for i, message in enumerate(st.session_state.chat_history):
70
+ if i % 2 == 0:
71
+ st.write(user_template.replace(
72
+ "{{MSG}}", message.content), unsafe_allow_html=True)
73
+ else:
74
+ st.write(bot_template.replace(
75
+ "{{MSG}}", message.content), unsafe_allow_html=True)
76
+
77
+
78
+ def main():
79
+ load_dotenv()
80
+ st.set_page_config(page_title="Chat with multiple PDFs",
81
+ page_icon=":books:")
82
+ st.write(css, unsafe_allow_html=True)
83
+
84
+ if "conversation" not in st.session_state:
85
+ st.session_state.conversation = None
86
+ if "chat_history" not in st.session_state:
87
+ st.session_state.chat_history = None
88
+
89
+ st.header("Chat with multiple PDFs :books:")
90
+ user_question = st.text_input("Ask a question about your documents:")
91
+ if user_question:
92
+ handle_userinput(user_question)
93
+
94
+ with st.sidebar:
95
+ st.subheader("Your documents")
96
+ pdf_docs = st.file_uploader(
97
+ "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
98
+ if st.button("Process"):
99
+ with st.spinner("Processing"):
100
+ # get pdf text
101
+ raw_text = get_pdf_text(pdf_docs)
102
+
103
+ # get the text chunks
104
+ text_chunks = get_text_chunks(raw_text)
105
+
106
+ # create vector store
107
+ vectorstore = get_vectorstore(text_chunks)
108
+
109
+ # create conversation chain
110
+ st.session_state.conversation = get_conversation_chain(
111
+ vectorstore)
112
+
113
+
114
+ if __name__ == '__main__':
115
+ main()
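
A minimal usage sketch (not part of the commit): the pipeline functions can also be composed outside the Streamlit UI, e.g. as a quick smoke test. It assumes OPENAI_API_KEY is set in the environment for OpenAIEmbeddings, and the file name example.pdf is purely illustrative.

from app import get_pdf_text, get_text_chunks, get_vectorstore, get_conversation_chain

with open("example.pdf", "rb") as f:       # illustrative path; PdfReader accepts file-like objects
    raw_text = get_pdf_text([f])           # extract text from one document
chunks = get_text_chunks(raw_text)         # ~1000-char chunks with 200-char overlap
store = get_vectorstore(chunks)            # FAISS index over OpenAI embeddings
chain = get_conversation_chain(store)      # downloads the GGUF model on first run
print(chain({"question": "What are these documents about?"})["answer"])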