Upload 5 files
- app.py            +88 -0
- cli_app.py        +63 -0
- download_data.py  +44 -0
- ingest_data.py    +44 -0
- requirements.txt  +160 -0
app.py
ADDED
@@ -0,0 +1,88 @@
import os
from typing import List, Optional, Tuple
import gradio as gr
from cli_app import get_chain
from threading import Lock
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


def grab_vector_connection():
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    vectorstore = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    qa_chain = get_chain(vectorstore)
    return qa_chain


class ChatWrapper:
    def __init__(self):
        self.lock = Lock()

    def __call__(self, inp: str, history: Optional[List[Tuple[str, str]]], chain):
        """Execute the chat functionality."""
        # The context manager guarantees the lock is released even if
        # grab_vector_connection or the chain call raises.
        with self.lock:
            if not chain:
                chain = grab_vector_connection()
            history = history or []
            # Run the chain and append the new exchange to the history.
            output = chain({"question": inp, "chat_history": history})["answer"]
            history.append((inp, output))
        return history, history


chat = ChatWrapper()


block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>Chat-IRS-Manuals</center></h3>")

    chatbot = gr.Chatbot()

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Ask questions about the IRS Manuals",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)

    gr.Examples(
        examples=[
            "What is the definition of a taxpayer?",
            "What kinds of factors affect how much I owe in taxes?",
            "What if I don't pay my taxes?",
        ],
        inputs=message,
    )

    gr.HTML("Demo application of a LangChain chain.")

    gr.HTML(
        """<center>
        Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a>
        and <a href='https://github.com/unstructured-io/unstructured'>Unstructured.IO</a>
        </center>"""
    )

    state = gr.State()
    agent_state = gr.State()

    submit.click(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])
    message.submit(chat, inputs=[message, state, agent_state], outputs=[chatbot, state])

block.launch(debug=True)
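The chat handler can also be exercised without the Gradio UI, which is useful for smoke-testing a Space's secrets. A minimal sketch, assuming the four environment variables above are set and the Pinecone index already exists (the question string is just an illustration):

# Hypothetical smoke test; run this in place of block.launch(debug=True).
history, _ = chat("What is the definition of a taxpayer?", None, None)
print(history[-1][1])  # the answer to the question above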
cli_app.py
ADDED
@@ -0,0 +1,63 @@
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain, ChatVectorDBChain
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question is about the Internal Revenue Manuals.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

template = """You are an AI assistant for answering questions about the Internal Revenue Manuals. You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about the Internal Revenue Manuals, politely inform them that you are tuned to only answer questions about the Internal Revenue Manuals.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""


QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])


def get_chain(vector):
    llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    qa_chain = ChatVectorDBChain.from_llm(
        llm,
        vector,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
    return qa_chain


if __name__ == "__main__":
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
    vectorstore = Pinecone.from_existing_index(PINECONE_INDEX_NAME, embeddings)
    qa_chain = get_chain(vectorstore)
    chat_history = []
    print("Chat with your docs!")
    while True:
        print("Human:")
        question = input()
        result = qa_chain({"question": question, "chat_history": chat_history})
        chat_history.append((question, result["answer"]))
        print("AI:")
        print(result["answer"])
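ChatVectorDBChain is the older of the two chains imported above; ConversationalRetrievalChain is imported but never used. A hypothetical, untested sketch of the equivalent construction with the newer chain, assuming the langchain==0.0.136 pin below already supports the combine_docs_chain_kwargs argument:

def get_chain_retrieval(vector):
    # Sketch only: swaps ChatVectorDBChain for ConversationalRetrievalChain.
    # The QA prompt is passed through combine_docs_chain_kwargs instead of qa_prompt.
    llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
    return ConversationalRetrievalChain.from_llm(
        llm,
        vector.as_retriever(),
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        combine_docs_chain_kwargs={"prompt": QA_PROMPT},
    )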
download_data.py
ADDED
@@ -0,0 +1,44 @@
import sys
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
import zipfile


def get_zip_urls(base="https://www.irs.gov/downloads/irm", start_page=1, max_page=74):
    urls = []
    for page_num in range(start_page, max_page + 1):
        url = f"{base}?page={page_num}"
        response = requests.get(url)
        html_content = response.text
        soup = BeautifulSoup(html_content, "html.parser")
        for link in soup.find_all("a", href=re.compile(r"\.zip$")):
            urls.append(link.get("href"))
    return urls


def download_and_unzip(urls, unzip_dir):
    # Note: only the first 10 archives are processed.
    for zip_url in urls[:10]:
        filename = zip_url.split("/")[-1]
        urllib.request.urlretrieve(zip_url, filename)
        with zipfile.ZipFile(filename, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                # Check whether the file has a PDF extension.
                if file_info.filename.lower().endswith(".pdf"):
                    # Extract the file to the PDF directory.
                    zip_ref.extract(file_info, unzip_dir)


if __name__ == "__main__":
    base_url = sys.argv[1]
    page_start = int(sys.argv[2])
    page_max = int(sys.argv[3])
    pdf_dir = sys.argv[4]
    print(f"Grabbing zip urls from {base_url}")
    zip_urls = get_zip_urls(base_url, page_start, page_max)
    print(
        f"Found {len(zip_urls)} zip urls, downloading and unzipping pdfs into {pdf_dir}"
    )
    download_and_unzip(zip_urls, pdf_dir)
    print("Finished unzipping")
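The entry point takes four positional arguments: the listing URL, the first and last page to scrape, and the output directory. A programmatic equivalent using the functions directly (the directory name here is illustrative):

# Equivalent to: python download_data.py https://www.irs.gov/downloads/irm 1 74 pdfs
from download_data import get_zip_urls, download_and_unzip

zip_urls = get_zip_urls("https://www.irs.gov/downloads/irm", start_page=1, max_page=74)
download_and_unzip(zip_urls, "pdfs")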
ingest_data.py
ADDED
@@ -0,0 +1,44 @@
import sys
import os
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")


def load_documents(path_to_files):
    # DirectoryLoader uses UnstructuredFileLoader under the hood.
    loader = DirectoryLoader(path=path_to_files, glob="*.json")
    raw_documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter()
    documents = text_splitter.split_documents(raw_documents)
    return documents


def send_docs_to_pinecone(documents):
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

    if PINECONE_INDEX_NAME in pinecone.list_indexes():
        print(
            f"Index {PINECONE_INDEX_NAME} already exists, deleting and recreating to avoid duplicates"
        )
        pinecone.delete_index(name=PINECONE_INDEX_NAME)

    # 1536 is the dimensionality of OpenAI's text-embedding-ada-002 embeddings.
    pinecone.create_index(name=PINECONE_INDEX_NAME, dimension=1536)
    Pinecone.from_documents(documents, embeddings, index_name=PINECONE_INDEX_NAME)


if __name__ == "__main__":
    path_to_files = sys.argv[1]
    print(f"Grabbing json files from {path_to_files}")
    docs = load_documents(path_to_files)
    print(f"Found {len(docs)} documents, sending to pinecone")
    send_docs_to_pinecone(docs)
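A minimal post-ingestion sanity check, assuming the same four environment variables (the query string is illustrative):

# Hypothetical check that the index answers similarity queries after ingestion.
import os
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_API_ENV"]
)
store = Pinecone.from_existing_index(
    os.environ["PINECONE_INDEX_NAME"],
    OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),
)
for doc in store.similarity_search("What is a taxpayer?", k=2):
    print(doc.page_content[:200])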
requirements.txt
ADDED
@@ -0,0 +1,160 @@
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
antlr4-python3-runtime==4.9.3
anyio==3.6.2
appnope==0.1.3
argilla==1.6.0
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
backoff==2.2.1
beautifulsoup4==4.12.2
bs4==0.0.1
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
coloredlogs==15.0.1
commonmark==0.9.1
contourpy==1.0.7
cryptography==40.0.1
cycler==0.11.0
dataclasses-json==0.5.7
decorator==5.1.1
Deprecated==1.2.13
dnspython==2.3.0
effdet==0.3.0
entrypoints==0.4
et-xmlfile==1.1.0
executing==1.2.0
faiss-cpu==1.7.3
fastapi==0.95.0
ffmpy==0.3.0
filelock==3.11.0
flatbuffers==23.3.3
fonttools==4.39.3
frozenlist==1.3.3
fsspec==2023.4.0
gradio==3.25.0
gradio_client==0.0.10
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.13.4
humanfriendly==10.0
idna==3.4
importlib-metadata==6.3.0
importlib-resources==5.12.0
iopath==0.1.10
ipython==8.12.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
kiwisolver==1.4.4
langchain==0.0.136
layoutparser==0.3.4
linkify-it-py==2.0.0
loguru==0.7.0
lxml==4.9.2
Markdown==3.4.3
markdown-it-py==2.2.0
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
matplotlib==3.7.1
matplotlib-inline==0.1.6
mdit-py-plugins==0.3.3
mdurl==0.1.2
monotonic==1.6
mpmath==1.3.0
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numpy==1.23.5
olefile==0.46
omegaconf==2.3.0
onnxruntime==1.14.1
openai==0.27.4
openapi-schema-pydantic==1.2.4
opencv-python==4.6.0.66
openpyxl==3.1.2
orjson==3.8.10
packaging==23.0
pandas==1.5.3
parso==0.8.3
pdf2image==1.16.3
pdfminer.six==20221105
pdfplumber==0.8.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
pinecone-client==2.2.1
pkgutil_resolve_name==1.3.10
portalocker==2.7.0
prompt-toolkit==3.0.38
protobuf==4.22.1
ptyprocess==0.7.0
pure-eval==0.2.2
pycocotools==2.0.6
pycparser==2.21
pydantic==1.10.7
pydub==0.25.1
Pygments==2.15.0
pypandoc==1.11
pyparsing==3.0.9
pyrsistent==0.19.3
pytesseract==0.3.10
python-dateutil==2.8.2
python-docx==0.8.11
python-magic==0.4.27
python-multipart==0.0.6
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
scikit-learn==1.2.2
scipy==1.10.1
semantic-version==2.10.0
sentence-transformers==2.2.2
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
soupsieve==2.4
SQLAlchemy==1.4.47
stack-data==0.6.2
starlette==0.26.1
sympy==1.11.1
tenacity==8.2.2
threadpoolctl==3.1.0
tiktoken==0.3.3
timm==0.6.13
tokenizers==0.13.3
toolz==0.12.0
torch==2.0.0
torchvision==0.15.1
tqdm==4.65.0
traitlets==5.9.0
transformers==4.27.4
typing-inspect==0.8.0
typing_extensions==4.5.0
uc-micro-py==1.0.1
unstructured==0.5.11
unstructured-inference==0.3.2
urllib3==1.26.15
uvicorn==0.21.1
Wand==0.6.11
wcwidth==0.2.6
websockets==11.0.1
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
zipp==3.15.0