Upload 5 files
- app.py            +88 -0
- cli_app.py        +63 -0
- download_data.py  +44 -0
- ingest_data.py    +44 -0
- requirements.txt  +160 -0
app.py
ADDED
@@ -0,0 +1,88 @@
import os
from typing import List, Optional, Tuple
import gradio as gr
from cli_app import get_chain
from threading import Lock
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


def grab_vector_connection():
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    vectorstore = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    qa_chain = get_chain(vectorstore)
    return qa_chain


class ChatWrapper:
    def __init__(self):
        self.lock = Lock()

    def __call__(self, inp: str, history: Optional[List[Tuple[str, str]]], chain):
        """Execute the chat functionality."""
        # The context manager guarantees the lock is released even if
        # grab_vector_connection or the chain call raises.
        with self.lock:
            if not chain:
                chain = grab_vector_connection()
            history = history or []
            # Run the chain and append the new exchange to the history.
            output = chain({"question": inp, "chat_history": history})["answer"]
            history.append((inp, output))
        return history, history


chat = ChatWrapper()


block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>Chat-IRS-Manuals</center></h3>")

    chatbot = gr.Chatbot()

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Ask questions about the IRS Manuals",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is the definition of a taxpayer?",
            "What kinds of factors affect how much I owe in taxes?",
            "What if I don't pay my taxes?",
        ],
        inputs=message,
    )

    gr.HTML("Demo application of a LangChain chain.")

    gr.HTML(
        """<center>
        Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a>
        and <a href='https://github.com/unstructured-io/unstructured'>Unstructured.IO</a>
        </center>"""
    )

    state = gr.State()
    agent_state = gr.State()

    submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
    message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])

block.launch(debug=True)
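The chat handler can also be exercised without the Gradio UI, which is useful for smoke-testing a Space's secrets. A minimal sketch, assuming the four environment variables above are set and the Pinecone index already exists (the question string is just an illustration):

# Hypothetical smoke test; run this in place of block.launch(debug=True).
history, _ = chat("What is the definition of a taxpayer?", None, None)
print(history[-1][1])  # the answer to the question above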
cli_app.py
ADDED
@@ -0,0 +1,63 @@
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, ChatVectorDBChain
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question is about the Internal Revenue Manuals.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

template = """You are an AI assistant for answering questions about the Internal Revenue Manuals. You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about the Internal Revenue Manuals, politely inform them that you are tuned to only answer questions about the Internal Revenue Manuals.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""


QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])


def get_chain(vector):
    llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    qa_chain = ChatVectorDBChain.from_llm(
        llm,
        vector,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
    return qa_chain


if __name__ == "__main__":
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    vectorstore = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    qa_chain = get_chain(vectorstore)
    chat_history = []
    print("Chat with your docs!")
    while True:
        print("Human:")
        question = input()
        result = qa_chain({"question": question, "chat_history": chat_history})
        chat_history.append((question, result["answer"]))
        print("AI:")
        print(result["answer"])
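ChatVectorDBChain is the older of the two chains imported above; ConversationalRetrievalChain is imported but never used. A hypothetical, untested sketch of the equivalent construction with the newer chain, assuming the langchain==0.0.136 pin below already supports the combine_docs_chain_kwargs argument:

def get_chain_retrieval(vector):
    # Sketch only: swaps ChatVectorDBChain for ConversationalRetrievalChain.
    # The QA prompt is passed through combine_docs_chain_kwargs instead of qa_prompt.
    llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    return ConversationalRetrievalChain.from_llm(
        llm,
        vector.as_retriever(),
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    )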
download_data.py
ADDED
@@ -0,0 +1,44 @@
import sys
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
import zipfile


def get_zip_urls(base="https://www.irs.gov/downloads/irm", start_page=1, max_page=74):
    urls = []
    for page_num in range(start_page, max_page + 1):
        url = f"{base}?page={page_num}"
        response = requests.get(url)
        html_content = response.text
        soup = BeautifulSoup(html_content, "html.parser")
        for link in soup.find_all("a", href=re.compile(r"\.zip$")):
            urls.append(link.get("href"))
    return urls


def download_and_unzip(urls, unzip_dir):
    # Note: only the first 10 archives are processed.
    for zip_url in urls[:10]:
        filename = zip_url.split("/")[-1]
        urllib.request.urlretrieve(zip_url, filename)
        with zipfile.ZipFile(filename, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                # Check whether the file has a PDF extension.
                if file_info.filename.lower().endswith(".pdf"):
                    # Extract the file to the PDF directory.
                    zip_ref.extract(file_info, unzip_dir)


if __name__ == "__main__":
    base_url = sys.argv[1]
    page_start = int(sys.argv[2])
    page_max = int(sys.argv[3])
    pdf_dir = sys.argv[4]
    print(f"Grabbing zip urls from {base_url}")
    zip_urls = get_zip_urls(base_url, page_start, page_max)
    print(
        f"Found {len(zip_urls)} zip urls, downloading and unzipping pdfs into {pdf_dir}"
    )
    download_and_unzip(zip_urls, pdf_dir)
    print("Finished unzipping")
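The entry point takes four positional arguments: the listing URL, the first and last page to scrape, and the output directory. A programmatic equivalent using the functions directly (the directory name here is illustrative):

# Equivalent to: python download_data.py https://www.irs.gov/downloads/irm 1 74 pdfs
from download_data import get_zip_urls, download_and_unzip

zip_urls = get_zip_urls("https://www.irs.gov/downloads/irm", start_page=1, max_page=74)
download_and_unzip(zip_urls, "pdfs")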
ingest_data.py
ADDED
@@ -0,0 +1,44 @@
import sys
import os
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


def load_documents(path_to_files):
    # DirectoryLoader uses UnstructuredFileLoader under the hood.
    loader = DirectoryLoader(path=path_to_files, glob="*.json")
    raw_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter()
    documents = text_splitter.split_documents(raw_documents)
    return documents


def send_docs_to_pinecone(documents):
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        print(
            f"Index {PINECONE_INDEX_NAME} already exists, deleting and recreating to avoid duplicates"
        )
        pinecone.delete_index(name=PINECONE_INDEX_NAME)

    # 1536 is the dimensionality of OpenAI's text-embedding-ada-002 embeddings.
    pinecone.create_index(name=PINECONE_INDEX_NAME, dimension=1536)
    Pinecone.from_documents(documents, embeddings, index_name=PINECONE_INDEX_NAME)


if __name__ == "__main__":
    path_to_files = sys.argv[1]
    print(f"Grabbing json files from {path_to_files}")
    docs = load_documents(path_to_files)
    print(f"Found {len(docs)} documents, sending to pinecone")
    send_docs_to_pinecone(docs)
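A minimal post-ingestion sanity check, assuming the same four environment variables (the query string is illustrative):

# Hypothetical check that the index answers similarity queries after ingestion.
import os
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_API_ENV"]
)
store = Pinecone.from_existing_index(
    os.environ["PINECONE_INDEX_NAME"],
    OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),
)
for doc in store.similarity_search("What is a taxpayer?", k=2):
    print(doc.page_content[:200])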
requirements.txt
ADDED
@@ -0,0 +1,160 @@
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
antlr4-python3-runtime==4.9.3
anyio==3.6.2
appnope==0.1.3
argilla==1.6.0
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
backoff==2.2.1
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
coloredlogs==15.0.1
commonmark==0.9.1
contourpy==1.0.7
cryptography==40.0.1
cycler==0.11.0
dataclasses-json==0.5.7
decorator==5.1.1
Deprecated==1.2.13
dnspython==2.3.0
effdet==0.3.0
entrypoints==0.4
et-xmlfile==1.1.0
executing==1.2.0
faiss-cpu==1.7.3
fastapi==0.95.0
ffmpy==0.3.0
filelock==3.11.0
flatbuffers==23.3.3
fonttools==4.39.3
frozenlist==1.3.3
fsspec==2023.4.0
gradio==3.25.0
gradio_client==0.0.10
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.13.4
humanfriendly==10.0
idna==3.4
importlib-metadata==6.3.0
importlib-resources==5.12.0
iopath==0.1.10
ipython==8.12.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
kiwisolver==1.4.4
langchain==0.0.136
layoutparser==0.3.4
linkify-it-py==2.0.0
loguru==0.7.0
lxml==4.9.2
Markdown==3.4.3
markdown-it-py==2.2.0
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
matplotlib==3.7.1
matplotlib-inline==0.1.6
mdit-py-plugins==0.3.3
mdurl==0.1.2
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
omegaconf==2.3.0
onnxruntime==1.14.1
openai==0.27.4
openapi-schema-pydantic==1.2.4
opencv-python==4.6.0.66
openpyxl==3.1.2
orjson==3.8.10
packaging==23.0
pandas==1.5.3
parso==0.8.3
pdf2image==1.16.3
pdfminer.six==20221105
pdfplumber==0.8.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
pinecone-client==2.2.1
pkgutil_resolve_name==1.3.10
portalocker==2.7.0
prompt-toolkit==3.0.38
protobuf==4.22.1
ptyprocess==0.7.0
pure-eval==0.2.2
pycocotools==2.0.6
pycparser==2.21
pydantic==1.10.7
pydub==0.25.1
Pygments==2.15.0
pypandoc==1.11
pyparsing==3.0.9
pyrsistent==0.19.3
pytesseract==0.3.10
python-dateutil==2.8.2
python-docx==0.8.11
python-magic==0.4.27
python-multipart==0.0.6
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
semantic-version==2.10.0
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
soupsieve==2.4
SQLAlchemy==1.4.47
stack-data==0.6.2
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
timm==0.6.13
tokenizers==0.13.3
toolz==0.12.0
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
traitlets==5.9.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
uc-micro-py==1.0.1
unstructured==0.5.11
unstructured-inference==0.3.2
urllib3==1.26.15
uvicorn==0.21.1
Wand==0.6.11
wcwidth==0.2.6
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zipp==3.15.0