ngmitam commited on
Commit
7d1995c
0 Parent(s):

Duplicate from ngmitam/trade

Browse files
Files changed (6) hide show
  1. .env +1 -0
  2. .gitattributes +35 -0
  3. .gitignore +2 -0
  4. README.md +13 -0
  5. app.py +107 -0
  6. requirements.txt +206 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ HUGGINGFACEHUB_API_TOKEN=<REDACTED: revoke the committed token and supply a new one via Space secrets>
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ .vscode
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Trade
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.25.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: ngmitam/trade
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import os
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from PyPDF2 import PdfReader
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.vectorstores import FAISS
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
10
+ from langchain.memory import ConversationBufferMemory
11
+ from langchain.chains import ConversationalRetrievalChain
12
+ from langchain.llms import GPT4All
13
+ from streamlit_chat import message
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
17
+
18
+
19
def get_pdf_text(pdfs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdfs: Iterable of PDF sources, each passed to ``PdfReader``.

    Returns:
        One string holding the text of all pages, in the order the PDFs and
        their pages are iterated. The empty string when ``pdfs`` is empty.
    """
    # Join once instead of growing a string with ``+=`` page by page.
    # ``or ""`` keeps the join safe should extract_text() yield None for a
    # page with no extractable text.
    return "".join(
        page.extract_text() or ""
        for pdf in pdfs
        for page in PdfReader(pdf).pages
    )
26
+
27
+
28
def get_text_chunks(text):
    """Split ``text`` into overlapping chunks.

    Uses ``CharacterTextSplitter`` with a newline separator, a chunk size of
    1000 and an overlap of 200, both measured with ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
33
+
34
+
35
def get_vectorstore(text_chunks):
    """Embed the chunks and index them in a FAISS vector store.

    Embeddings come from ``HuggingFaceEmbeddings`` with the
    ``all-MiniLM-L6-v2`` model.

    Args:
        text_chunks: Non-empty sequence of text chunks to index.

    Returns:
        The vector store produced by ``FAISS.from_texts``.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail early with a clear message: there is nothing to index, and
    # checking first avoids loading the embedding model for no reason.
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; the documents contain no extractable text."
        )
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
41
+
42
+
43
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over ``vectorstore``.

    The chain combines a ``GPT4All`` model loaded from
    ``/tmp/ggml-gpt4all-j-v1.3-groovy.bin`` (with a stdout streaming
    callback), the store's retriever, and a ``ConversationBufferMemory``
    keyed on ``chat_history``.

    Args:
        vectorstore: Object exposing ``as_retriever()``.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    local_llm = GPT4All(
        model="/tmp/ggml-gpt4all-j-v1.3-groovy.bin",
        max_tokens=1000,
        backend='gptj',
        callbacks=[StreamingStdOutCallbackHandler()],
        n_batch=8,
        verbose=False,
    )
    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=local_llm,
        retriever=vectorstore.as_retriever(),
        memory=history,
    )
57
+
58
+
59
def user_input(user_question):
    """Ask the conversation chain a question and render the chat history.

    Reads the chain from ``st.session_state.conversation``, stores the
    returned history in ``st.session_state.chat_history``, and prints the
    question and the answer to stdout with timestamps.

    Args:
        user_question: The question text entered by the user.
    """
    # log user question with timestamp
    print(f"[{datetime.now()}]:{user_question}")
    with st.spinner("Thinking ..."):
        response = st.session_state.conversation({'question': user_question})
    # log bot answer with timestamp
    print(f"\n[{datetime.now()}]:{response['answer']}")
    st.session_state.chat_history = response['chat_history']
    for i, chat_message in enumerate(st.session_state.chat_history):
        # Entries at even indices are rendered as the user's, odd ones as the
        # bot's. The explicit per-index key stops two messages with identical
        # content from colliding on the auto-generated widget id.
        message(
            chat_message.content,
            is_user=(i % 2 == 0),
            key=f"chat_message_{i}",
        )
72
+
73
+
74
def main():
    """Run the Streamlit app.

    Loads environment variables, downloads the GPT4All model into ``/tmp``
    if it is not already there, then renders the question box and a sidebar
    for uploading PDFs. Pressing "Study" builds the conversation chain from
    the uploaded documents and stores it in ``st.session_state``.
    """
    load_dotenv()
    # Check the exact path instead of scanning the whole /tmp listing.
    if not os.path.isfile("/tmp/ggml-gpt4all-j-v1.3-groovy.bin"):
        hf_hub_download(repo_id="dnato/ggml-gpt4all-j-v1.3-groovy.bin",
                        filename="ggml-gpt4all-j-v1.3-groovy.bin",
                        local_dir="/tmp")
    st.set_page_config(page_title="Trade Document Chatbot")
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Query your trade documents")
    user_question = st.text_input("Ask a question about your documents...")
    # Questions are answered only once documents have been studied.
    if user_question and st.session_state.conversation:
        user_input(user_question)
    with st.sidebar:
        st.subheader("Your trade documents")
        pdfs = st.file_uploader(
            "Upload here", accept_multiple_files=True, type=["pdf"],)
        if st.button("Study"):
            if not pdfs:
                # Nothing uploaded: building a vector store would fail.
                st.warning("Please upload at least one PDF before studying.")
            else:
                with st.spinner("Studying ..."):
                    raw_text = get_pdf_text(pdfs)
                    chunks = get_text_chunks(raw_text)
                    # Only build the chain when there is text to index.
                    if chunks:
                        vectorstore = get_vectorstore(chunks)
                        st.session_state.conversation = get_conversation_chain(
                            vectorstore)
                if chunks:
                    st.success("Done!")
                else:
                    st.error(
                        "No extractable text was found in the uploaded PDFs.")


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.5
2
+ aiosignal==1.3.1
3
+ altair==5.0.1
4
+ anyio==3.7.1
5
+ appnope==0.1.3
6
+ argilla==1.1.1
7
+ astroid==2.15.5
8
+ asttokens==2.2.1
9
+ async-timeout==4.0.2
10
+ attrs==23.1.0
11
+ backcall==0.2.0
12
+ backoff==2.2.1
13
+ beautifulsoup4==4.12.2
14
+ blinker==1.6.2
15
+ cachetools==5.3.1
16
+ certifi==2023.5.7
17
+ cffi==1.15.1
18
+ chardet==5.2.0
19
+ charset-normalizer==3.1.0
20
+ chromadb==0.3.26
21
+ click==8.1.6
22
+ clickhouse-connect==0.6.8
23
+ colorclass==2.2.2
24
+ coloredlogs==15.0.1
25
+ comm==0.1.3
26
+ compressed-rtf==1.0.6
27
+ contourpy==1.1.0
28
+ cryptography==41.0.3
29
+ cycler==0.11.0
30
+ Cython==0.29.35
31
+ dataclasses-json==0.5.14
32
+ debugpy==1.6.7
33
+ decorator==5.1.1
34
+ Deprecated==1.2.14
35
+ dill==0.3.6
36
+ diskcache==5.6.1
37
+ duckdb==0.8.1
38
+ easygui==0.98.3
39
+ ebcdic==1.1.1
40
+ et-xmlfile==1.1.0
41
+ executing==1.2.0
42
+ extract-msg==0.41.5
43
+ faiss-cpu==1.7.4
44
+ fastapi==0.101.0
45
+ filelock==3.12.2
46
+ filetype==1.2.0
47
+ flatbuffers==23.5.26
48
+ fonttools==4.40.0
49
+ frozenlist==1.4.0
50
+ fsspec==2023.6.0
51
+ gitdb==4.0.10
52
+ GitPython==3.1.32
53
+ gpt4all==1.0.3
54
+ h11==0.9.0
55
+ hnswlib==0.7.0
56
+ httpcore==0.11.1
57
+ httptools==0.6.0
58
+ httpx==0.15.5
59
+ huggingface-hub==0.16.4
60
+ humanfriendly==10.0
61
+ idna==3.4
62
+ IMAPClient==2.3.1
63
+ importlib-metadata==6.8.0
64
+ InstructorEmbedding==1.0.1
65
+ ipykernel==6.23.3
66
+ ipython==8.14.0
67
+ isort==5.12.0
68
+ jedi==0.18.2
69
+ Jinja2==3.1.2
70
+ joblib==1.3.1
71
+ jsonschema==4.19.0
72
+ jsonschema-specifications==2023.7.1
73
+ jupyter_client==8.3.0
74
+ jupyter_core==5.3.1
75
+ kiwisolver==1.4.4
76
+ langchain==0.0.228
77
+ langchainplus-sdk==0.0.20
78
+ lark-parser==0.12.0
79
+ lazy-object-proxy==1.9.0
80
+ llama-cpp-python==0.1.68
81
+ lxml==4.9.3
82
+ lz4==4.3.2
83
+ Markdown==3.4.4
84
+ markdown-it-py==3.0.0
85
+ MarkupSafe==2.1.3
86
+ marshmallow==3.20.1
87
+ matplotlib==3.7.1
88
+ matplotlib-inline==0.1.6
89
+ mccabe==0.7.0
90
+ mdurl==0.1.2
91
+ monotonic==1.6
92
+ mpmath==1.3.0
93
+ msg-parser==1.2.0
94
+ msoffcrypto-tool==5.1.1
95
+ multidict==6.0.4
96
+ mypy-extensions==1.0.0
97
+ nest-asyncio==1.5.6
98
+ networkx==3.1
99
+ nltk==3.8.1
100
+ numexpr==2.8.5
101
+ numpy==1.25.0
102
+ olefile==0.46
103
+ oletools==0.60.1
104
+ onnxruntime==1.15.1
105
+ openai==0.27.8
106
+ openapi-schema-pydantic==1.2.4
107
+ openpyxl==3.1.2
108
+ overrides==7.4.0
109
+ packaging==23.1
110
+ pandas==1.5.3
111
+ pandoc==2.3
112
+ parso==0.8.3
113
+ pcodedmp==1.2.6
114
+ pdf2image==1.16.3
115
+ pdfminer.six==20221105
116
+ pexpect==4.8.0
117
+ pickleshare==0.7.5
118
+ Pillow==9.5.0
119
+ platformdirs==3.8.0
120
+ plumbum==1.8.2
121
+ ply==3.11
122
+ posthog==3.0.1
123
+ prompt-toolkit==3.0.38
124
+ protobuf==4.23.4
125
+ psutil==5.9.5
126
+ ptyprocess==0.7.0
127
+ pulsar-client==3.2.0
128
+ pure-eval==0.2.2
129
+ pyarrow==12.0.1
130
+ pycocotools @ git+https://github.com/leimao/cocoapi.git@8c9bcc3cf640524c4c20a9c40e89cb6a2f2fa0e9#subdirectory=PythonAPI
131
+ pycparser==2.21
132
+ pycryptodome==3.15.0
133
+ pydantic==1.10.12
134
+ pydeck==0.8.0
135
+ Pygments==2.15.1
136
+ pylint==2.17.4
137
+ Pympler==1.0.1
138
+ PyMuPDF==1.22.5
139
+ pypandoc==1.11
140
+ pyparsing==2.4.7
141
+ PyPDF2==3.0.1
142
+ python-dateutil==2.8.2
143
+ python-docx==0.8.11
144
+ python-dotenv==1.0.0
145
+ python-magic==0.4.27
146
+ python-pptx==0.6.21
147
+ pytz==2023.3
148
+ pytz-deprecation-shim==0.1.0.post0
149
+ pywatchman==1.4.1
150
+ PyYAML==6.0.1
151
+ pyzmq==25.1.0
152
+ red-black-tree-mod==1.20
153
+ referencing==0.30.2
154
+ regex==2023.6.3
155
+ requests==2.31.0
156
+ rfc3986==1.5.0
157
+ rich==13.5.2
158
+ rpds-py==0.9.2
159
+ RTFDE==0.0.2
160
+ safetensors==0.3.1
161
+ scikit-learn==1.3.0
162
+ scipy==1.11.1
163
+ sentence-transformers==2.2.2
164
+ sentencepiece==0.1.99
165
+ six==1.16.0
166
+ smmap==5.0.0
167
+ sniffio==1.3.0
168
+ soupsieve==2.4.1
169
+ SQLAlchemy==2.0.19
170
+ stack-data==0.6.2
171
+ starlette==0.27.0
172
+ streamlit==1.24.0
173
+ streamlit-chat==0.1.1
174
+ sympy==1.12
175
+ tabulate==0.9.0
176
+ tenacity==8.2.2
177
+ threadpoolctl==3.2.0
178
+ tiktoken==0.4.0
179
+ tokenizers==0.13.3
180
+ toml==0.10.2
181
+ tomlkit==0.11.8
182
+ toolz==0.12.0
183
+ torch==2.0.1
184
+ torchvision==0.15.2
185
+ tornado==6.3.2
186
+ tqdm==4.65.0
187
+ traitlets==5.9.0
188
+ transformers==4.31.0
189
+ typing-inspect==0.9.0
190
+ typing_extensions==4.7.1
191
+ tzdata==2023.3
192
+ tzlocal==4.3.1
193
+ unstructured==0.8.0
194
+ urllib3==2.0.3
195
+ uvicorn==0.23.2
196
+ uvloop==0.17.0
197
+ validators==0.21.2
198
+ watchfiles==0.19.0
199
+ wcwidth==0.2.6
200
+ websockets==11.0.3
201
+ wrapt==1.13.3
202
+ xlrd==2.0.1
203
+ XlsxWriter==3.1.2
204
+ yarl==1.9.2
205
+ zipp==3.16.2
206
+ zstandard==0.21.0