raseel-zymr committed on
Commit
eaf0e00
1 Parent(s): dd2ca7e

Initial commit with streamlit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +15 -1
  3. app.py +93 -0
  4. requirements.txt +161 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -10,4 +10,18 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  license: mit
11
  ---
12
 
13
+ # Document Question & Answer
14
+ A LangChain-based application to upload any text or PDF document, ask relevant questions about it, and receive summarised answers.
15
+
16
+
17
+ ### Pre-requisites
18
+
19
+ $ pip install langchain huggingface_hub sentence_transformers faiss-cpu unstructured chromadb Cython tiktoken unstructured[local-inference]
20
+
21
+ Or
22
+
23
+ $ pip install -r requirements.txt
24
+
25
+ * Either of the commands above installs the required Python packages
26
+ ### Reference:
27
+ * Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front-end for a LangChain document Q&A app.

Lets the user upload a .txt or .pdf document and type a question about
it.  The retrieval / question-answering pipeline itself is still
commented out below (this is the initial commit), so the page currently
collects the inputs and shows a placeholder answer.
"""
import os

import streamlit as st

# Loaders for plain-text and PDF documents.
from langchain.document_loaders import TextLoader
from langchain.document_loaders import UnstructuredPDFLoader
# Splits long documents into embedding-sized chunks.
from langchain.text_splitter import CharacterTextSplitter
# HuggingFace-hosted models and embeddings.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain

# The HuggingFace Hub token comes from Streamlit secrets so the (future)
# LLM calls can authenticate.  NOTE(review): st.secrets raises if
# "hf_api_key" is not configured for the Space — confirm that is the
# desired startup behavior.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

# ----- UI ------------------------------------------------------------
st.title('Document Q&A - Ask anything in your Document')

st.sidebar.subheader('Upload document')
uploaded_file = st.file_uploader("Upload File", type=['txt', 'pdf'])

st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')

st.subheader('Answer')
# Placeholder: the QA chain below is not wired up yet, so the shown
# "answer" is a static string regardless of the uploaded file or query.
st.write('Answer from document')

# ---------------------------------------------------------------------
# Reference pipeline (commented out — not yet connected to the UI).
# Kept from the original notebook experiments for when the app is wired
# up end-to-end.
# ---------------------------------------------------------------------
# Sample data download:
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)
#
# Document loading:
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
#
# Pretty-printing helper:
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text
#
# Chunking, embeddings, and the FAISS vector store:
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)
# embeddings = HuggingFaceEmbeddings()
# db = FAISS.from_documents(docs, embeddings)
#
# Candidate LLMs and the QA chain:
# llm = HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature": 0, "max_length": 512})
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# chain = load_qa_chain(llm2, chain_type="stuff")
#
# Sample question:
# query = "What the actual issues and drawbacks ?"
# docs = db.similarity_search(query)
# chain.run(input_documents=docs, question=query)
#
# PDF variant:
# !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# !mkdir pdfs
# !cp *pdf '/content/pdfs'
# pdf_folder_path = '/content/pdfs'
# os.listdir(pdf_folder_path)
# loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
#
# NOTE(review): VectorstoreIndexCreator is never imported in this file —
# add `from langchain.indexes import VectorstoreIndexCreator` before
# enabling this section.
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
#
# Retrieval QA pipeline over the index:
# llm2 = HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature": 0, "max_length": 512})
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# chain.run('What is the difference between a PLC and a PC?')
requirements.txt ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.4
2
+ aiosignal==1.3.1
3
+ altair==5.0.1
4
+ antlr4-python3-runtime==4.9.3
5
+ anyio==3.7.0
6
+ argilla==1.9.0
7
+ async-timeout==4.0.2
8
+ attrs==23.1.0
9
+ backoff==2.2.1
10
+ blinker==1.6.2
11
+ cachetools==5.3.1
12
+ certifi==2023.5.7
13
+ cffi==1.15.1
14
+ chardet==5.1.0
15
+ charset-normalizer==3.1.0
16
+ chromadb==0.3.26
17
+ click==8.1.3
18
+ clickhouse-connect==0.6.2
19
+ coloredlogs==15.0.1
20
+ commonmark==0.9.1
21
+ contourpy==1.0.7
22
+ cryptography==41.0.1
23
+ cycler==0.11.0
24
+ Cython==0.29.35
25
+ dataclasses-json==0.5.8
26
+ decorator==5.1.1
27
+ Deprecated==1.2.14
28
+ duckdb==0.8.1
29
+ effdet==0.4.1
30
+ et-xmlfile==1.1.0
31
+ exceptiongroup==1.1.1
32
+ faiss-cpu==1.7.4
33
+ fastapi==0.97.0
34
+ filelock==3.12.2
35
+ filetype==1.2.0
36
+ flatbuffers==23.5.26
37
+ fonttools==4.40.0
38
+ frozenlist==1.3.3
39
+ fsspec==2023.6.0
40
+ gitdb==4.0.10
41
+ GitPython==3.1.31
42
+ greenlet==2.0.2
43
+ h11==0.14.0
44
+ hnswlib==0.7.0
45
+ httpcore==0.16.3
46
+ httptools==0.5.0
47
+ httpx==0.23.3
48
+ huggingface-hub==0.15.1
49
+ humanfriendly==10.0
50
+ idna==3.4
51
+ importlib-metadata==6.6.0
52
+ iopath==0.1.10
53
+ Jinja2==3.1.2
54
+ joblib==1.2.0
55
+ jsonschema==4.17.3
56
+ kiwisolver==1.4.4
57
+ langchain==0.0.198
58
+ langchainplus-sdk==0.0.9
59
+ layoutparser==0.3.4
60
+ lxml==4.9.2
61
+ lz4==4.3.2
62
+ Markdown==3.4.3
63
+ MarkupSafe==2.1.3
64
+ marshmallow==3.19.0
65
+ marshmallow-enum==1.5.1
66
+ matplotlib==3.7.1
67
+ monotonic==1.6
68
+ mpmath==1.3.0
69
+ msg-parser==1.2.0
70
+ multidict==6.0.4
71
+ mypy-extensions==1.0.0
72
+ networkx==3.1
73
+ nltk==3.8.1
74
+ numexpr==2.8.4
75
+ numpy==1.23.5
76
+ olefile==0.46
77
+ omegaconf==2.3.0
78
+ onnxruntime==1.15.0
79
+ openapi-schema-pydantic==1.2.4
80
+ opencv-python==4.7.0.72
81
+ openpyxl==3.1.2
82
+ overrides==7.3.1
83
+ packaging==23.1
84
+ pandas==1.5.3
85
+ pdf2image==1.16.3
86
+ pdfminer.six==20221105
87
+ pdfplumber==0.9.0
88
+ Pillow==9.5.0
89
+ portalocker==2.7.0
90
+ posthog==3.0.1
91
+ protobuf==4.23.2
92
+ pulsar-client==3.2.0
93
+ pyarrow==12.0.1
94
+ pycocotools==2.0.6
95
+ pycparser==2.21
96
+ pydantic==1.10.9
97
+ pydeck==0.8.1b0
98
+ Pygments==2.15.1
99
+ Pympler==1.0.1
100
+ pypandoc==1.11
101
+ pyparsing==3.0.9
102
+ pyrsistent==0.19.3
103
+ pytesseract==0.3.10
104
+ python-dateutil==2.8.2
105
+ python-docx==0.8.11
106
+ python-dotenv==1.0.0
107
+ python-magic==0.4.27
108
+ python-multipart==0.0.6
109
+ python-pptx==0.6.21
110
+ pytz==2023.3
111
+ pytz-deprecation-shim==0.1.0.post0
112
+ PyYAML==6.0
113
+ regex==2023.6.3
114
+ requests==2.31.0
115
+ rfc3986==1.5.0
116
+ rich==13.0.1
117
+ safetensors==0.3.1
118
+ scikit-learn==1.2.2
119
+ scipy==1.10.1
120
+ sentence-transformers==2.2.2
121
+ sentencepiece==0.1.99
122
+ six==1.16.0
123
+ smmap==5.0.0
124
+ sniffio==1.3.0
125
+ SQLAlchemy==2.0.16
126
+ starlette==0.27.0
127
+ streamlit==1.23.1
128
+ sympy==1.12
129
+ tabulate==0.9.0
130
+ tenacity==8.2.2
131
+ threadpoolctl==3.1.0
132
+ tiktoken==0.4.0
133
+ timm==0.9.2
134
+ tokenizers==0.13.3
135
+ toml==0.10.2
136
+ toolz==0.12.0
137
+ torch==2.0.1
138
+ torchvision==0.15.2
139
+ tornado==6.3.2
140
+ tqdm==4.65.0
141
+ transformers==4.30.1
142
+ typer==0.9.0
143
+ typing-inspect==0.9.0
144
+ typing_extensions==4.6.3
145
+ tzdata==2023.3
146
+ tzlocal==4.3
147
+ unstructured==0.7.4
148
+ unstructured-inference==0.5.1
149
+ urllib3==2.0.3
150
+ uvicorn==0.22.0
151
+ uvloop==0.17.0
152
+ validators==0.20.0
153
+ Wand==0.6.11
154
+ watchfiles==0.19.0
155
+ websockets==11.0.3
156
+ wrapt==1.14.1
157
+ xlrd==2.0.1
158
+ XlsxWriter==3.1.2
159
+ yarl==1.9.2
160
+ zipp==3.15.0
161
+ zstandard==0.21.0