Cazimir Roman committed on
Commit
4171f86
1 Parent(s): 77be341

initial commit

Browse files
Files changed (2) hide show
  1. app.py +111 -0
  2. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ import pickle
4
+ from PyPDF2 import PdfReader
5
+ from streamlit_extras.add_vertical_space import add_vertical_space
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+
8
+ from langchain import HuggingFaceHub
9
+ from langchain.embeddings import HuggingFaceEmbeddings
10
+
11
+ from langchain.vectorstores import FAISS
12
+ from langchain.chains.question_answering import load_qa_chain
13
+
14
+ import os
15
+
16
# Sidebar: static description of the app. The text names the services the code
# in this file actually calls (HuggingFaceHub LLM and HuggingFaceEmbeddings);
# the earlier wording credited OpenAI, which this file never uses.
# The markdown body is kept at column 0 so it renders the same whether or not
# Streamlit dedents the string.
with st.sidebar:
    st.title('PDF Chat App')
    st.markdown('''
## About
This app is an LLM-powered PDF chatbot built using:
- [Streamlit](https://streamlit.io/)
- [LangChain](https://python.langchain.com/)
- [Hugging Face Hub](https://huggingface.co/models) LLM model

## How it works
- Load up a PDF file
- Extract the text from the PDF file
- Split the text into chunks
- Create embeddings using a Hugging Face model, which are vectors of floating-point numbers that measure the relatedness of text strings
- Save these embeddings as vectors in a vector store, such as FAISS
- Use a similarity search to ask a question
- Get the answer from the Hugging Face Hub LLM

''')
    st.write('Made with 🤖 by [Cazimir Roman](https://cazimir.dev)')
36
+
37
def load_app():
    """Render the PDF upload / question UI and answer questions about the PDF.

    Flow:
      1. Build the Hugging Face Hub LLM client.
      2. Wait for a PDF upload; do nothing further until one is present.
      3. Load a pickled FAISS vector store for this PDF from the working
         directory, or extract the text, split it into chunks, embed the
         chunks and write a new pickle.
      4. Read a question, retrieve similar chunks and run a "stuff" QA chain
         over them, writing the response to the page.

    Returns:
        None. All output goes through Streamlit widgets.
    """
    # Function-scope import so the file's import block does not need to change.
    import hashlib

    llm = HuggingFaceHub(
        repo_id="declare-lab/flan-alpaca-large",
        model_kwargs={"temperature": 0, "max_length": 512},
    )

    # upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    # Key the cache on the file stem AND a digest of the bytes: keying on the
    # name alone would serve stale embeddings for a different PDF that happens
    # to share a filename. splitext avoids assuming a 4-character extension.
    stem = os.path.splitext(os.path.basename(pdf.name))[0]
    digest = hashlib.sha256(pdf.getvalue()).hexdigest()[:16]
    store_path = f"{stem}-{digest}.pkl"

    # check if vector store exists. if not, create one
    if os.path.exists(store_path):
        # SECURITY NOTE: pickle.load can execute arbitrary code. This is only
        # acceptable while the working directory is writable solely by this
        # app; consider the vector store's own save/load API instead.
        with open(store_path, "rb") as f:
            vector_store = pickle.load(f)
        st.success('Text embeddings loaded from disk')
    else:
        # Extraction and chunking are only needed on a cache miss.
        pdf_reader = PdfReader(pdf)
        # `or ""` is defensive in case a page yields no text object.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = text_splitter.split_text(text=text)

        if not chunks:
            # e.g. a scanned/image-only PDF: nothing to embed, and building an
            # index from zero texts would fail further down.
            st.error("No extractable text was found in this PDF")
            return

        embeddings = HuggingFaceEmbeddings()
        with st.spinner("Creating vector store embeddings..."):
            vector_store = FAISS.from_texts(chunks, embeddings)
        with open(store_path, "wb") as f:
            pickle.dump(vector_store, f)
        st.success('Embeddings computation completed')

    # Accept user question/query
    query = st.text_input("Ask a question about your PDF file")
    if not query:
        return

    st.write(f"You asked: {query}")
    with st.spinner("Thinking..."):
        # Chunks most similar to the query; no `k` is passed, so the number
        # returned is whatever the vector store defaults to.
        docs = vector_store.similarity_search(query)
        chain = load_qa_chain(llm=llm, chain_type="stuff")
        response = chain.run(input_documents=docs, question=query)
    st.write(response)
85
+
86
def main():
    """Streamlit entry point: collect the Hugging Face token, then show the app.

    The token field is pre-filled from the HUGGINGFACEHUB_API_TOKEN environment
    variable when it is set. Pressing Submit with a 37-character value stores
    it in that environment variable for this process. The main app is rendered
    exactly once per script run, whenever a token is available.
    """
    print("Main called")
    st.header("Chat with your PDF")

    container = st.container()

    with container:
        hugging_face_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
        api_key = container.text_input(
            "Enter your HuggingFace API token",
            type="password",
            value=hugging_face_token or "",
        )
        # You can find it here: https://huggingface.co/settings/tokens
        submit = container.button("Submit")

        # A token already in the environment is enough to show the app.
        token_available = bool(hugging_face_token)

        # submit button is pressed
        if submit:
            # check if api key length correct
            # NOTE(review): 37 presumably matches "hf_" + 34 characters — confirm.
            if len(api_key) == 37:
                os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
                token_available = True
            else:
                st.error("Api key is not correct")

        # Call load_app() at most once per run: calling it from both the
        # "env token present" path and the "submit" path would create the same
        # widgets twice in a single script run.
        if token_available:
            load_app()


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.137
2
+ PyPDF2
3
+ python-dotenv
4
+ streamlit==1.22.0
5
+ faiss-cpu
6
+ streamlit-extras
7
+ openai
8
+ altair<5
9
+ tiktoken
10
+ huggingface_hub
11
+ sentence_transformers