Jorge Aguirregomezcorta committed on
Commit
ea08e05
1 Parent(s): 704765c

Clone other repo

Browse files
Files changed (6) hide show
  1. .env.example +1 -0
  2. .gitignore +160 -0
  3. README.md +2 -12
  4. app.py +56 -0
  5. environment.yml +159 -0
  6. logic.py +58 -0
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
README.md CHANGED
@@ -1,12 +1,2 @@
1
- ---
2
- title: Work Assistant App
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.21.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Introduction
2
+ This document outlines the Onboarding case plan for the development of a Work Assistant application using ChatGPT API 4.0. The plan spans two weeks, from June 19th to June 30th, 2023.
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load libraries and dependencies
2
+ from dotenv import load_dotenv
3
+ from logic import get_pdf_text, get_text_chunks, init_embeddings, get_conversation_chain
4
+ import streamlit as st
5
+
6
+ # Initialization function
7
def init():
    """Prepare the environment, the page and the session state.

    Loads variables from a ``.env`` file through ``load_dotenv``, sets the
    page title/icon and the header, and makes sure the ``conversation_chain``
    and ``chat_history`` session keys exist (set to ``None`` when absent).
    """
    # Pull the API secrets from the .env file into the environment
    load_dotenv()

    # Page configuration followed by the visible header
    st.set_page_config(page_title="Work-Assistant App", page_icon="🐮")
    st.header("Work-Assistant App 🐮")

    # Seed only the keys that are missing; existing values are left untouched
    for state_key in ("conversation_chain", "chat_history"):
        if state_key not in st.session_state:
            st.session_state[state_key] = None
20
+
21
+ # Main function
22
def main():
    """Render the chat area and the document-upload sidebar.

    Main area: reads a question from a text input, sends it to the
    conversation chain stored in the session state, stores the returned
    ``chat_history`` and writes it out, labelling even-indexed messages
    "User:" and odd-indexed messages "Bot:".

    Sidebar: lets the user upload PDFs and, on "Process files", extracts
    their text, splits it into chunks, creates the embeddings and stores a
    new conversation chain in the session state.

    A warning is shown instead of crashing when a question is asked before
    any chain exists, when no file was uploaded, or when the uploaded files
    yield no text chunks.
    """
    init()

    # UI main layout
    user_input = st.text_input("Ask a question:")
    if user_input:
        chain = st.session_state.conversation_chain
        if chain is None:
            # init() seeds the chain with None; calling it would raise
            # "TypeError: 'NoneType' object is not callable".
            st.warning("Please upload and process your PDFs before asking a question.")
        else:
            # Obtain response from conversation chain
            response = chain({'question': user_input})
            # Update chat history via response history
            st.session_state.chat_history = response['chat_history']
            # Show conversation: messages alternate user / bot
            for i, message in enumerate(st.session_state.chat_history):
                speaker = "User:" if i % 2 == 0 else "Bot:"
                st.write(speaker + message.content)

    # UI sidebar layout
    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader("Upload your PDFs", accept_multiple_files=True)
        if st.button("Process files"):
            if not pdf_docs:
                # Nothing uploaded: there is no text to index
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    # Extract text content from pdf
                    text = get_pdf_text(pdf_docs)
                    # Split text into chunks
                    chunks = get_text_chunks(text)
                    if not chunks:
                        # e.g. PDFs without extractable text
                        st.warning("No text could be extracted from the uploaded PDFs.")
                    else:
                        # Obtain embeddings
                        embeddings = init_embeddings()
                        # Start conversation chain
                        st.session_state.conversation_chain = get_conversation_chain(
                            chunks=chunks, embeddings=embeddings
                        )
53
+
54
+ # Main program of the application
55
# Call main() only when this module is executed as the main script,
# not when it is imported by another module.
if __name__ == '__main__':
    main()
environment.yml ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: py311
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - aiohttp=3.8.3=py311h2bbff1b_0
7
+ - aiosignal=1.2.0=pyhd3eb1b0_0
8
+ - appdirs=1.4.4=pyhd3eb1b0_0
9
+ - async-timeout=4.0.2=py311haa95532_0
10
+ - attrs=22.1.0=py311haa95532_0
11
+ - blas=1.0=mkl
12
+ - bottleneck=1.3.5=py311h5bb9823_0
13
+ - brotli=1.0.9=h2bbff1b_7
14
+ - brotli-bin=1.0.9=h2bbff1b_7
15
+ - brotlipy=0.7.0=py311h2bbff1b_1002
16
+ - bzip2=1.0.8=he774522_0
17
+ - ca-certificates=2023.05.30=haa95532_0
18
+ - certifi=2023.5.7=py311haa95532_0
19
+ - cffi=1.15.1=py311h2bbff1b_3
20
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
21
+ - click=8.0.4=py311haa95532_0
22
+ - colorama=0.4.6=py311haa95532_0
23
+ - contourpy=1.0.5=py311h59b6b97_0
24
+ - cryptography=39.0.1=py311h21b164f_2
25
+ - cycler=0.11.0=pyhd3eb1b0_0
26
+ - dataclasses=0.8=pyh6d0b6a4_7
27
+ - dataclasses-json=0.5.7=py311haa95532_0
28
+ - et_xmlfile=1.1.0=py311haa95532_0
29
+ - faiss=1.7.4=py311h8355858_0_cpu
30
+ - fonttools=4.25.0=pyhd3eb1b0_0
31
+ - freetype=2.12.1=ha860e81_0
32
+ - frozenlist=1.3.3=py311h2bbff1b_0
33
+ - giflib=5.2.1=h8cc25b3_3
34
+ - greenlet=2.0.1=py311hd77b12b_0
35
+ - icc_rt=2022.1.0=h6049295_2
36
+ - idna=3.4=py311haa95532_0
37
+ - intel-openmp=2023.1.0=h59b6b97_46319
38
+ - joblib=1.2.0=py311haa95532_0
39
+ - jpeg=9e=h2bbff1b_1
40
+ - kiwisolver=1.4.4=py311hd77b12b_0
41
+ - langchain=0.0.190=pyhd8ed1ab_0
42
+ - lerc=3.0=hd77b12b_0
43
+ - libblas=3.9.0=1_h8933c1f_netlib
44
+ - libbrotlicommon=1.0.9=h2bbff1b_7
45
+ - libbrotlidec=1.0.9=h2bbff1b_7
46
+ - libbrotlienc=1.0.9=h2bbff1b_7
47
+ - libdeflate=1.17=h2bbff1b_0
48
+ - libfaiss=1.7.4=hba6d9cf_0_cpu
49
+ - libfaiss-avx2=1.7.4=h1234567_0_cpu
50
+ - libffi=3.4.4=hd77b12b_0
51
+ - liblapack=3.9.0=5_hd5c7e75_netlib
52
+ - libpng=1.6.39=h8cc25b3_0
53
+ - libtiff=4.5.0=h6c2663c_2
54
+ - libwebp=1.2.4=hbc33d0d_1
55
+ - libwebp-base=1.2.4=h2bbff1b_1
56
+ - lz4-c=1.9.4=h2bbff1b_0
57
+ - m2w64-gcc-libgfortran=5.3.0=6
58
+ - m2w64-gcc-libs=5.3.0=7
59
+ - m2w64-gcc-libs-core=5.3.0=7
60
+ - m2w64-gmp=6.1.0=2
61
+ - m2w64-libwinpthread-git=5.0.0.4634.697f757=2
62
+ - marshmallow=3.19.0=py311haa95532_0
63
+ - marshmallow-enum=1.5.1=py311haa95532_0
64
+ - matplotlib-base=3.7.1=py311hf62ec03_1
65
+ - mkl=2023.1.0=h8bd8f75_46356
66
+ - mkl-service=2.4.0=py311h2bbff1b_1
67
+ - mkl_fft=1.3.6=py311hf62ec03_1
68
+ - mkl_random=1.2.2=py311hf62ec03_1
69
+ - msys2-conda-epoch=20160418=1
70
+ - multidict=6.0.2=py311h2bbff1b_0
71
+ - munkres=1.1.4=py_0
72
+ - mypy_extensions=0.4.3=py311haa95532_1
73
+ - numexpr=2.8.4=py311h1fcbade_1
74
+ - numpy=1.25.0=py311hdab7c0b_0
75
+ - numpy-base=1.25.0=py311hd01c5d8_0
76
+ - openai=0.27.4=py311haa95532_0
77
+ - openapi-schema-pydantic=1.2.4=py311haa95532_0
78
+ - openpyxl=3.0.10=py311h2bbff1b_0
79
+ - openssl=3.0.9=h2bbff1b_0
80
+ - packaging=23.0=py311haa95532_0
81
+ - pandas=1.5.3=py311heda8569_0
82
+ - pandas-stubs=1.5.3.230203=py311haa95532_0
83
+ - pillow=9.4.0=py311hd77b12b_0
84
+ - pip=23.1.2=py311haa95532_0
85
+ - plotly=5.9.0=py311haa95532_0
86
+ - pooch=1.4.0=pyhd3eb1b0_0
87
+ - pycparser=2.21=pyhd3eb1b0_0
88
+ - pydantic=1.10.8=py311h2bbff1b_0
89
+ - pyopenssl=23.0.0=py311haa95532_0
90
+ - pyparsing=3.0.9=py311haa95532_0
91
+ - pypdf=3.11.0=pyhd8ed1ab_0
92
+ - pysocks=1.7.1=py311haa95532_0
93
+ - python=3.11.3=he1021f5_1
94
+ - python-dateutil=2.8.2=pyhd3eb1b0_0
95
+ - python-dotenv=0.21.0=py311haa95532_0
96
+ - python_abi=3.11=2_cp311
97
+ - pytz=2022.7=py311haa95532_0
98
+ - pyyaml=6.0=py311h2bbff1b_1
99
+ - requests=2.29.0=py311haa95532_0
100
+ - scikit-learn=1.2.2=py311hd77b12b_1
101
+ - scipy=1.10.1=py311hc1ccb85_1
102
+ - setuptools=67.8.0=py311haa95532_0
103
+ - six=1.16.0=pyhd3eb1b0_1
104
+ - sqlalchemy=1.4.39=py311h2bbff1b_0
105
+ - sqlite=3.41.2=h2bbff1b_0
106
+ - tbb=2021.8.0=h59b6b97_0
107
+ - tenacity=8.2.2=py311haa95532_0
108
+ - threadpoolctl=2.2.0=pyh0d69192_0
109
+ - tk=8.6.12=h2bbff1b_0
110
+ - tqdm=4.65.0=py311h746a85d_0
111
+ - types-pytz=2022.4.0.0=py311haa95532_1
112
+ - typing-extensions=4.6.3=py311haa95532_0
113
+ - typing_extensions=4.6.3=py311haa95532_0
114
+ - typing_inspect=0.7.1=pyhd3eb1b0_0
115
+ - ucrt=10.0.20348.0=haa95532_0
116
+ - urllib3=1.26.16=py311haa95532_0
117
+ - vc=14.2=h21ff451_1
118
+ - vc14_runtime=14.36.32532=hfdfe4a8_16
119
+ - vs2015_runtime=14.36.32532=h05e6639_16
120
+ - wheel=0.38.4=py311haa95532_0
121
+ - win_inet_pton=1.1.0=py311haa95532_0
122
+ - xz=5.4.2=h8cc25b3_0
123
+ - yaml=0.2.5=he774522_0
124
+ - yarl=1.8.1=py311h2bbff1b_0
125
+ - zlib=1.2.13=h8cc25b3_0
126
+ - zstd=1.5.5=hd43e919_0
127
+ - pip:
128
+ - altair==5.0.1
129
+ - blinker==1.6.2
130
+ - cachetools==5.3.1
131
+ - decorator==5.1.1
132
+ - gitdb==4.0.10
133
+ - gitpython==3.1.31
134
+ - importlib-metadata==6.7.0
135
+ - jinja2==3.1.2
136
+ - jsonschema==4.17.3
137
+ - markdown-it-py==3.0.0
138
+ - markupsafe==2.1.3
139
+ - mdurl==0.1.2
140
+ - protobuf==4.23.3
141
+ - pyarrow==12.0.1
142
+ - pydeck==0.8.1b0
143
+ - pygments==2.15.1
144
+ - pympler==1.0.1
145
+ - pyrsistent==0.19.3
146
+ - pytz-deprecation-shim==0.1.0.post0
147
+ - regex==2023.6.3
148
+ - rich==13.4.2
149
+ - smmap==5.0.0
150
+ - streamlit==1.23.1
151
+ - tiktoken==0.4.0
152
+ - toml==0.10.2
153
+ - toolz==0.12.0
154
+ - tornado==6.3.2
155
+ - tzdata==2023.3
156
+ - tzlocal==4.3.1
157
+ - validators==0.20.0
158
+ - watchdog==3.0.0
159
+ - zipp==3.15.0
logic.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Load libraries and dependencies
2
+ from pypdf import PdfReader
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+
11
+ # Transform a list of PDFs into a single string
12
def get_pdf_text(pdf_documents):
    """Concatenate the text of every page of every PDF into one string.

    Args:
        pdf_documents: Iterable of objects accepted by ``PdfReader``.
            ``None`` or an empty iterable is treated as "no documents".

    Returns:
        The page texts joined in document/page order with no separator
        added between them; ``""`` when there are no documents.

    Raises:
        Whatever ``PdfReader`` raises for a document it cannot read (the
        reader is opened with ``strict=True``).
    """
    if not pdf_documents:
        return ""

    # Collect the pieces and join once instead of growing a string with +=
    page_texts = []
    for pdf in pdf_documents:
        pdf_reader = PdfReader(pdf, strict=True)
        # "or ''" keeps the join safe should a page yield a falsy result
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)
    return "".join(page_texts)
21
+
22
+ # Transform a single line of text into an array of text chunks
23
def get_text_chunks(raw_text, separator="\n", chunk_size=1000, chunk_overlap=200, lenght_function=len):
    """Split a single string into a list of text chunks.

    All arguments after ``raw_text`` are forwarded unchanged to
    ``CharacterTextSplitter``; ``lenght_function`` (spelling kept for
    existing callers) is passed as the splitter's ``length_function``.

    Returns:
        The result of ``split_text(raw_text)`` on the configured splitter.
    """
    # Gather the splitter configuration in one place
    splitter_options = {
        "separator": separator,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": lenght_function,
    }
    # Build the splitter and produce the chunks in one step
    return CharacterTextSplitter(**splitter_options).split_text(raw_text)
33
+
34
+ # Initialize embeddings
35
def init_embeddings(type=1):
    """Return the embeddings object selected by ``type``.

    Args:
        type: ``1`` (the default) selects ``OpenAIEmbeddings()``; any other
            value selects ``HuggingFaceInstructEmbeddings`` with the
            ``hkunlp/instructor-xl`` model.
    """
    # Anything other than 1 falls through to the Instructor embeddings
    if type != 1:
        return HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    # Default choice: OpenAI embeddings
    return OpenAIEmbeddings()
43
+
44
+ # Initialize Conversation Chain
45
def get_conversation_chain(chunks, embeddings):
    """Build a conversational retrieval chain over the given text chunks.

    Args:
        chunks: Non-empty list of text chunks to index.
        embeddings: Embeddings object handed to ``FAISS.from_texts``.

    Returns:
        A ``ConversationalRetrievalChain`` created with ``from_llm`` from a
        ``ChatOpenAI()`` model, a retriever over a FAISS store built from
        ``chunks``, and a ``ConversationBufferMemory`` keyed
        ``"chat_history"`` that returns message objects.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message. NOTE(review): an empty list otherwise
    # fails inside the vector-store construction with an opaque error
    # (presumably IndexError) - confirm against the installed langchain.
    if not chunks:
        raise ValueError("Cannot build a conversation chain without text chunks.")

    # Create Vector Database from text chunks and embeddings
    knowledge_base = FAISS.from_texts(chunks, embeddings).as_retriever()
    # Create buffer to store the conversation memory
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    # Create conversation chain around a default ChatOpenAI model
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(),
        retriever=knowledge_base,
        memory=memory,
    )