raseel-zymr committed on
Commit
eaf0e00
1 Parent(s): dd2ca7e

Initial commit with streamlit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +15 -1
  3. app.py +93 -0
  4. requirements.txt +161 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -10,4 +10,18 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  license: mit
11
  ---
12
 
13
+ # Document Question & Answer
14
+ A LangChain-based application to upload any text or PDF document, ask relevant questions about it, and receive summarised answers.
15
+
16
+
17
+ ### Pre-requisites
18
+
19
+ $ pip install langchain huggingface_hub sentence_transformers faiss-cpu unstructured chromadb Cython tiktoken unstructured[local-inference]
20
+
21
+ Or
22
+
23
+ $ pip install -r requirements.txt
24
+
25
+ * Either of the commands above installs the required Python packages
26
+ ### Reference:
27
+ * Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end for a LangChain document Q&A app.

Lets the user upload a .txt or .pdf document and type a question about
it.  The retrieval / question-answering pipeline itself is still
commented out below (this is the initial commit), so the page currently
collects the inputs and shows a placeholder answer.
"""
import os

import streamlit as st

# Loaders for plain-text and PDF documents.
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredPDFLoader
# Splits long documents into embedding-sized chunks.
from langchain.text_splitter import CharacterTextSplitter
# HuggingFace-hosted models and embeddings.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain

# The HuggingFace Hub token comes from Streamlit secrets so the (future)
# LLM calls can authenticate.  NOTE(review): st.secrets raises if
# "hf_api_key" is not configured for the Space — confirm that is the
# desired startup behavior.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

# ----- UI ------------------------------------------------------------
st.title('Document Q&A - Ask anything in your Document')

st.sidebar.subheader('Upload document')
uploaded_file = st.file_uploader("Upload File", type=['txt', 'pdf'])

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')

st.subheader('Answer')
# Placeholder: the QA chain below is not wired up yet, so the shown
# "answer" is a static string regardless of the uploaded file or query.
st.write('Answer from document')

# ---------------------------------------------------------------------
# Reference pipeline (commented out — not yet connected to the UI).
# Kept from the original notebook experiments for when the app is wired
# up end-to-end.
# ---------------------------------------------------------------------
# Sample data download:
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)
#
# Document loading:
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
#
# Pretty-printing helper:
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text
#
# Chunking, embeddings, and the FAISS vector store:
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)
# embeddings = HuggingFaceEmbeddings()
# db = FAISS.from_documents(docs, embeddings)
#
# Candidate LLMs and the QA chain:
# llm = HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0, "max_length": 512})
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# chain = load_qa_chain(llm2, chain_type="stuff")
#
# Sample question:
# query = "What the actual issues and drawbacks ?"
# docs = db.similarity_search(query)
# chain.run(input_documents=docs, question=query)
#
# PDF variant:
# !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# !mkdir pdfs
# !cp *pdf '/content/pdfs'
# pdf_folder_path = '/content/pdfs'
# os.listdir(pdf_folder_path)
# loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
#
# NOTE(review): VectorstoreIndexCreator is never imported in this file —
# add `from langchain.indexes import VectorstoreIndexCreator` before
# enabling this section.
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
#
# Retrieval QA pipeline over the index:
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# chain.run('What is the difference between a PLC and a PC?')
requirements.txt ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.4
2
+ aiosignal==1.3.1
3
+ altair==5.0.1
4
+ antlr4-python3-runtime==4.9.3
5
+ anyio==3.7.0
6
+ argilla==1.9.0
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ backoff==2.2.1
10
+ blinker==1.6.2
11
+ cachetools==5.3.1
12
+ certifi==2023.5.7
13
+ cffi==1.15.1
14
+ chardet==5.1.0
15
+ charset-normalizer==3.1.0
16
+ chromadb==0.3.26
17
+ click==8.1.3
18
+ clickhouse-connect==0.6.2
19
+ coloredlogs==15.0.1
20
+ commonmark==0.9.1
21
+ contourpy==1.0.7
22
+ cryptography==41.0.1
23
+ cycler==0.11.0
24
+ Cython==0.29.35
25
+ dataclasses-json==0.5.8
26
+ decorator==5.1.1
27
+ Deprecated==1.2.14
28
+ duckdb==0.8.1
29
+ effdet==0.4.1
30
+ et-xmlfile==1.1.0
31
+ exceptiongroup==1.1.1
32
+ faiss-cpu==1.7.4
33
+ fastapi==0.97.0
34
+ filelock==3.12.2
35
+ filetype==1.2.0
36
+ flatbuffers==23.5.26
37
+ fonttools==4.40.0
38
+ frozenlist==1.3.3
39
+ fsspec==2023.6.0
40
+ gitdb==4.0.10
41
+ GitPython==3.1.31
42
+ greenlet==2.0.2
43
+ h11==0.14.0
44
+ hnswlib==0.7.0
45
+ httpcore==0.16.3
46
+ httptools==0.5.0
47
+ httpx==0.23.3
48
+ huggingface-hub==0.15.1
49
+ humanfriendly==10.0
50
+ idna==3.4
51
+ importlib-metadata==6.6.0
52
+ iopath==0.1.10
53
+ Jinja2==3.1.2
54
+ joblib==1.2.0
55
+ jsonschema==4.17.3
56
+ kiwisolver==1.4.4
57
+ langchain==0.0.198
58
+ langchainplus-sdk==0.0.9
59
+ layoutparser==0.3.4
60
+ lxml==4.9.2
61
+ lz4==4.3.2
62
+ Markdown==3.4.3
63
+ MarkupSafe==2.1.3
64
+ marshmallow==3.19.0
65
+ marshmallow-enum==1.5.1
66
+ matplotlib==3.7.1
67
+ monotonic==1.6
68
+ mpmath==1.3.0
69
+ msg-parser==1.2.0
70
+ multidict==6.0.4
71
+ mypy-extensions==1.0.0
72
+ networkx==3.1
73
+ nltk==3.8.1
74
+ numexpr==2.8.4
75
+ numpy==1.23.5
76
+ olefile==0.46
77
+ omegaconf==2.3.0
78
+ onnxruntime==1.15.0
79
+ openapi-schema-pydantic==1.2.4
80
+ opencv-python==4.7.0.72
81
+ openpyxl==3.1.2
82
+ overrides==7.3.1
83
+ packaging==23.1
84
+ pandas==1.5.3
85
+ pdf2image==1.16.3
86
+ pdfminer.six==20221105
87
+ pdfplumber==0.9.0
88
+ Pillow==9.5.0
89
+ portalocker==2.7.0
90
+ posthog==3.0.1
91
+ protobuf==4.23.2
92
+ pulsar-client==3.2.0
93
+ pyarrow==12.0.1
94
+ pycocotools==2.0.6
95
+ pycparser==2.21
96
+ pydantic==1.10.9
97
+ pydeck==0.8.1b0
98
+ Pygments==2.15.1
99
+ Pympler==1.0.1
100
+ pypandoc==1.11
101
+ pyparsing==3.0.9
102
+ pyrsistent==0.19.3
103
+ pytesseract==0.3.10
104
+ python-dateutil==2.8.2
105
+ python-docx==0.8.11
106
+ python-dotenv==1.0.0
107
+ python-magic==0.4.27
108
+ python-multipart==0.0.6
109
+ python-pptx==0.6.21
110
+ pytz==2023.3
111
+ pytz-deprecation-shim==0.1.0.post0
112
+ PyYAML==6.0
113
+ regex==2023.6.3
114
+ requests==2.31.0
115
+ rfc3986==1.5.0
116
+ rich==13.0.1
117
+ safetensors==0.3.1
118
+ scikit-learn==1.2.2
119
+ scipy==1.10.1
120
+ sentence-transformers==2.2.2
121
+ sentencepiece==0.1.99
122
+ six==1.16.0
123
+ smmap==5.0.0
124
+ sniffio==1.3.0
125
+ SQLAlchemy==2.0.16
126
+ starlette==0.27.0
127
+ streamlit==1.23.1
128
+ sympy==1.12
129
+ tabulate==0.9.0
130
+ tenacity==8.2.2
131
+ threadpoolctl==3.1.0
132
+ tiktoken==0.4.0
133
+ timm==0.9.2
134
+ tokenizers==0.13.3
135
+ toml==0.10.2
136
+ toolz==0.12.0
137
+ torch==2.0.1
138
+ torchvision==0.15.2
139
+ tornado==6.3.2
140
+ tqdm==4.65.0
141
+ transformers==4.30.1
142
+ typer==0.9.0
143
+ typing-inspect==0.9.0
144
+ typing_extensions==4.6.3
145
+ tzdata==2023.3
146
+ tzlocal==4.3
147
+ unstructured==0.7.4
148
+ unstructured-inference==0.5.1
149
+ urllib3==2.0.3
150
+ uvicorn==0.22.0
151
+ uvloop==0.17.0
152
+ validators==0.20.0
153
+ Wand==0.6.11
154
+ watchfiles==0.19.0
155
+ websockets==11.0.3
156
+ wrapt==1.14.1
157
+ xlrd==2.0.1
158
+ XlsxWriter==3.1.2
159
+ yarl==1.9.2
160
+ zipp==3.15.0
161
+ zstandard==0.21.0