raseel-zymr committed on
Commit
8c5d334
1 Parent(s): 952eb35

Add support for PDF files

Browse files
Files changed (1) hide show
  1. app.py +62 -64
app.py CHANGED
@@ -15,96 +15,94 @@ from langchain.vectorstores import FAISS
15
  #facebook vectorization
16
  from langchain.chains.question_answering import load_qa_chain
17
  #load pdf
 
 
 
18
  from langchain.document_loaders import UnstructuredPDFLoader
19
 
20
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
21
 
22
- def pdf_file(filename):
23
- st.subheader('Uploaded PDF File:')
24
- st.write(filename)
25
 
26
- def text_file(filename):
27
- st.subheader('Uploaded Text File:')
28
- st.write(filename)
 
 
 
29
 
30
- # loader = TextLoader(filename)
31
- # documents = loader.load()
 
32
 
33
- # # Text Splitter
34
- # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
35
- # docs = text_splitter.split_documents(documents)
36
-
37
- # db = FAISS.from_documents(docs, embeddings)
38
 
39
- # chain = load_qa_chain(llm2, chain_type="stuff")
 
 
 
 
 
 
 
 
40
 
41
- st.title('Document Q&A - Ask anything in your Document')
42
- st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')
43
- st.sidebar.subheader('Upload document')
44
- uploaded_file = st.sidebar.file_uploader("Upload File",type=['txt','pdf'])
45
 
46
- if Path(uploaded_file.name).suffix == '.txt':
47
- text_file(uploaded_file.name)
48
 
49
- if Path(uploaded_file.name).suffix == '.pdf':
50
- pdf_file(uploaded_file.name)
51
 
52
- with st.sidebar.expander('File'):
53
- if (uploaded_file):
54
- st.info(uploaded_file.name)
55
- if os.path.exists('/content/'):
56
- st.info(os.listdir('/content/'))
57
 
 
 
 
 
 
 
58
 
 
 
 
59
 
 
60
 
61
- # url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
62
- # res = requests.get(url2)
63
- # with open("KS-all-info_rev1.txt", "w") as f:
64
- # f.write(res.text)
65
- if (uploaded_file):
66
  st.subheader('Enter query')
67
  query = st.text_input('Ask anything about the Document you uploaded')
68
- stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
69
- with open(uploaded_file.name, "w") as f:
70
- f.write(stringio.read())
71
 
72
- if(uploaded_file):
73
- loader = TextLoader(uploaded_file.name)
74
- documents = loader.load()
75
 
76
- # import textwrap
77
- # def wrap_text_preserve_newlines(text, width=110):
78
- # # Split the input text into lines based on newline characters
79
- # lines = text.split('\n')
80
- # # Wrap each line individually
81
- # wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
82
- # # Join the wrapped lines back together using newline characters
83
- # wrapped_text = '\n'.join(wrapped_lines)
84
- # return wrapped_text
85
 
86
- # Text Splitter
87
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
88
- docs = text_splitter.split_documents(documents)
89
-
90
- # Embeddings
91
- embeddings = HuggingFaceEmbeddings()
92
 
93
- #Create the vectorized db
94
- db = FAISS.from_documents(docs, embeddings)
95
 
96
- #llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0, "max_length":512})
97
- llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
98
- chain = load_qa_chain(llm2, chain_type="stuff")
99
 
100
- # Sample question
101
- #query = "What the actual issues and drawbacks ?"
 
102
 
103
- docs = db.similarity_search(query)
104
- answer = chain.run(input_documents=docs, question=query)
105
 
106
- st.subheader('Answer')
107
- st.write(answer)
 
 
 
108
 
109
 
110
  # # PDFs
 
15
  #facebook vectorization
16
  from langchain.chains.question_answering import load_qa_chain
17
  #load pdf
18
+ #vectorize db index with chromadb
19
+ from langchain.indexes import VectorstoreIndexCreator
20
+ from langchain.chains import RetrievalQA
21
  from langchain.document_loaders import UnstructuredPDFLoader
22
 
23
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
24
 
 
 
 
25
 
26
+ def init():
27
+ global embeddings, llm, llm2, chain
28
+ # Embeddings
29
+ embeddings = HuggingFaceEmbeddings()
30
+ llm=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
31
+ chain = load_qa_chain(llm, chain_type="stuff")
32
 
33
+ def pdf_file(txtFileObj):
34
+ st.subheader('Uploaded PDF File:')
35
+ st.write(txtFileObj.name)
36
 
37
+ with open(txtFileObj.name, "wb") as f:
38
+ f.write(txtFileObj.getbuffer())
 
 
 
39
 
40
+ loaders = [UnstructuredPDFLoader(txtFileObj.name)]
41
+ index = VectorstoreIndexCreator(
42
+ embedding=embeddings,
43
+ text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
44
+
45
+ chain = RetrievalQA.from_chain_type(llm=llm,
46
+ chain_type="stuff",
47
+ retriever=index.vectorstore.as_retriever(),
48
+ input_key="question")
49
 
50
+ st.subheader('Enter query')
51
+ query = st.text_input('Ask anything about the Document you uploaded')
 
 
52
 
53
+ if (query):
54
+ answer = chain.run(question=query)
55
 
56
+ st.subheader('Answer')
57
+ st.write(answer)
58
 
59
+ def text_file(txtFileObj):
60
+ st.subheader('Uploaded Text File:')
61
+ st.write(txtFileObj.name)
 
 
62
 
63
+ #stringio = StringIO(txtFileObj.getvalue().decode("utf-8"))
64
+ with open(txtFileObj.name, "wb") as f:
65
+ f.write(txtFileObj.getbuffer())
66
+
67
+ loader = TextLoader(txtFileObj.name)
68
+ documents = loader.load()
69
 
70
+ # Text Splitter
71
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
72
+ docs = text_splitter.split_documents(documents)
73
 
74
+ db = FAISS.from_documents(docs, embeddings)
75
 
 
 
 
 
 
76
  st.subheader('Enter query')
77
  query = st.text_input('Ask anything about the Document you uploaded')
 
 
 
78
 
79
+ if (query):
80
+ docs = db.similarity_search(query)
81
+ answer = chain.run(input_documents=docs, question=query)
82
 
83
+ st.subheader('Answer')
84
+ st.write(answer)
 
 
 
 
 
 
 
85
 
86
+ st.title('Document Q&A - Ask anything in your Document')
87
+ st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')
 
 
 
 
88
 
89
+ init()
 
90
 
91
+ st.sidebar.subheader('Upload document')
92
+ uploaded_file = st.sidebar.file_uploader("Upload File",type=['txt','pdf'])
 
93
 
94
+ if uploaded_file and Path(uploaded_file.name).suffix == '.txt':
95
+ st.sidebar.info(Path(uploaded_file.name))
96
+ text_file(uploaded_file)
97
 
98
+ if uploaded_file and Path(uploaded_file.name).suffix == '.pdf':
99
+ pdf_file(uploaded_file)
100
 
101
+ with st.sidebar.expander('File'):
102
+ if (uploaded_file):
103
+ st.info(uploaded_file.name)
104
+ if os.path.exists('/content/'):
105
+ st.info(os.listdir('/content/'))
106
 
107
 
108
  # # PDFs