nouman-10 commited on
Commit
ccba2d5
1 Parent(s): b9d2b52

Upload 35 files

Browse files
app/app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m streamlit run app.py
2
+
3
+ import streamlit as st
4
+ from PIL import Image
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import shutil
8
+ import sys
9
+ sys.path.insert(1, "src/models")
10
+ from extractive_qa import QA
11
+ from visual_qa import VisualQA
12
+ from search_engine import IR
13
+ # from src.models.extractive_qa import QA
14
+ # from src.models.search_engine import IR
15
+
16
+
17
@st.cache_resource
def load_visual_qa_module():
    """Create the Visual QA module once and reuse it across reruns.

    Cached with st.cache_resource so the (expensive) model is only
    instantiated on the first Streamlit run.
    """
    return VisualQA()
24
+
25
@st.cache_resource
def load_qa_module():
    """Create the extractive QA module once and reuse it across reruns.

    Cached with st.cache_resource so the BERT model is only loaded on
    the first Streamlit run.
    """
    return QA()
32
+
33
@st.cache_resource
def load_search_engine():
    """Loads the document search engine (information retrieval module).

    Note: the original docstring said "extractive QA module" — a
    copy-paste error; this builds the IR search engine. Cached with
    st.cache_resource so the index is only built on the first run.
    """
    search_engine = IR()
    return search_engine
40
+
41
def get_metadata_from_question(question):
    """Infer which metadata field a question is asking about.

    Checks, in priority order, whether the question mentions
    'artist', 'style' or 'genre'. Matching is case-insensitive
    (generalizes the original, which missed e.g. 'Artist').

    Args:
        question: The user's free-text question.

    Returns:
        'artist', 'style' or 'genre' when detected, otherwise None.
    """
    lowered = question.lower()
    for keyword in ('artist', 'style', 'genre'):
        if keyword in lowered:
            return keyword
    # Explicit fall-through (the original returned None implicitly).
    return None
48
+
49
# ---- Session state ----
# extractive_qa: False until a first VQA answer is produced; afterwards
# follow-up questions are routed to the extractive QA pipeline.
if 'extractive_qa' not in st.session_state:
    st.session_state.extractive_qa = False

# vqa_prediction: the last VQA answer, prepended to follow-up questions
# as retrieval context.
if 'vqa_prediction' not in st.session_state:
    st.session_state.vqa_prediction = None

# Remove stale results from a previous run.
dirpath = Path.cwd() / 'results'
model_path = Path.cwd() / 'models'
if dirpath.exists() and dirpath.is_dir():
    shutil.rmtree(dirpath)

# Modules are cached via st.cache_resource, so these are cheap on rerun.
vqa_module = load_visual_qa_module()
qa_module = load_qa_module()
search_engine = load_search_engine()

st.title("VQArt")

st.markdown("""Hello, please take a picture of the painting and ask a question about it. \
I can answer questions about the style, artist and genre of the painting, \
and then questions about these topics. \
""")

# Take a picture
imgbuffer = st.camera_input('')

# Upload a file
uploaded_file = st.file_uploader('Upload a photo of a painting')

# Prompt for a question
question = st.text_input(label="What is your question (e.g. Who's the artist of this painting?)")

if question:
    print(f'Received question: {question}')

    if st.session_state.extractive_qa:
        # Follow-up question: retrieve wiki passages and run extractive QA.
        # The previous VQA answer is prepended as retrieval context.
        full_question = f'[{st.session_state.vqa_prediction}] {question}'

        articles, scores = search_engine.retrieve_documents(full_question, 5)
        print(f'Found {len(articles)} search results')

        if len(articles) == 0:
            st.markdown("Sorry, I don't know the answer to that question :(")
        else:
            best_result = articles[0]
            answer = qa_module.answer_question(full_question, best_result)
            st.markdown(f'Answer: {answer}')
    else:
        # First question: Visual QA needs an image from either source.
        img = None
        if imgbuffer:
            # Camera
            img = Image.open(imgbuffer)
        elif uploaded_file:
            # Uploaded file
            img = Image.open(uploaded_file)

        if img is None:
            # BUG FIX: the original raised NameError on `img` when a
            # question was asked before any image was provided.
            st.markdown("Please take or upload a photo of the painting first.")
        else:
            result = vqa_module.answer_question(question, img)
            meta_data = get_metadata_from_question(question)
            st.markdown(f"Answer: The {meta_data} of this painting is {result}")

            # Switching to extractive QA for follow-up questions
            st.session_state.extractive_qa = True

            # Saving the predicted VQA answer
            st.session_state.vqa_prediction = result
docs/Makefile ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile for Sphinx documentation
2
+ #
3
+
4
+ # You can set these variables from the command line.
5
+ SPHINXOPTS =
6
+ SPHINXBUILD = sphinx-build
7
+ PAPER =
8
+ BUILDDIR = _build
9
+
10
+ # Internal variables.
11
+ PAPEROPT_a4 = -D latex_paper_size=a4
12
+ PAPEROPT_letter = -D latex_paper_size=letter
13
+ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14
+ # the i18n builder cannot share the environment and doctrees with the others
15
+ I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16
+
17
+ .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18
+
19
+ help:
20
+ @echo "Please use \`make <target>' where <target> is one of"
21
+ @echo " html to make standalone HTML files"
22
+ @echo " dirhtml to make HTML files named index.html in directories"
23
+ @echo " singlehtml to make a single large HTML file"
24
+ @echo " pickle to make pickle files"
25
+ @echo " json to make JSON files"
26
+ @echo " htmlhelp to make HTML files and a HTML help project"
27
+ @echo " qthelp to make HTML files and a qthelp project"
28
+ @echo " devhelp to make HTML files and a Devhelp project"
29
+ @echo " epub to make an epub"
30
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
31
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
32
+ @echo " text to make text files"
33
+ @echo " man to make manual pages"
34
+ @echo " texinfo to make Texinfo files"
35
+ @echo " info to make Texinfo files and run them through makeinfo"
36
+ @echo " gettext to make PO message catalogs"
37
+ @echo " changes to make an overview of all changed/added/deprecated items"
38
+ @echo " linkcheck to check all external links for integrity"
39
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
40
+
41
+ clean:
42
+ -rm -rf $(BUILDDIR)/*
43
+
44
+ html:
45
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46
+ @echo
47
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48
+
49
+ dirhtml:
50
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51
+ @echo
52
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53
+
54
+ singlehtml:
55
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56
+ @echo
57
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58
+
59
+ pickle:
60
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61
+ @echo
62
+ @echo "Build finished; now you can process the pickle files."
63
+
64
+ json:
65
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66
+ @echo
67
+ @echo "Build finished; now you can process the JSON files."
68
+
69
+ htmlhelp:
70
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71
+ @echo
72
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
73
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
74
+
75
+ qthelp:
76
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77
+ @echo
78
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
79
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/art_chatbot.qhcp"
81
+ @echo "To view the help file:"
82
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/art_chatbot.qhc"
83
+
84
+ devhelp:
85
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86
+ @echo
87
+ @echo "Build finished."
88
+ @echo "To view the help file:"
89
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/art_chatbot"
90
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/art_chatbot"
91
+ @echo "# devhelp"
92
+
93
+ epub:
94
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95
+ @echo
96
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97
+
98
+ latex:
99
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100
+ @echo
101
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
103
+ "(use \`make latexpdf' here to do that automatically)."
104
+
105
+ latexpdf:
106
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107
+ @echo "Running LaTeX files through pdflatex..."
108
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
109
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110
+
111
+ text:
112
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113
+ @echo
114
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
115
+
116
+ man:
117
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118
+ @echo
119
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120
+
121
+ texinfo:
122
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123
+ @echo
124
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125
+ @echo "Run \`make' in that directory to run these through makeinfo" \
126
+ "(use \`make info' here to do that automatically)."
127
+
128
+ info:
129
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
130
+ @echo "Running Texinfo files through makeinfo..."
131
+ make -C $(BUILDDIR)/texinfo info
132
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133
+
134
+ gettext:
135
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136
+ @echo
137
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138
+
139
+ changes:
140
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141
+ @echo
142
+ @echo "The overview file is in $(BUILDDIR)/changes."
143
+
144
+ linkcheck:
145
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146
+ @echo
147
+ @echo "Link check complete; look for any errors in the above output " \
148
+ "or in $(BUILDDIR)/linkcheck/output.txt."
149
+
150
+ doctest:
151
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152
+ @echo "Testing of doctests in the sources finished, look at the " \
153
+ "results in $(BUILDDIR)/doctest/output.txt."
docs/commands.rst ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Commands
2
+ ========
3
+
4
+ The Makefile contains the central entry points for common tasks related to this project.
5
+
6
+ Syncing data to S3
7
+ ^^^^^^^^^^^^^^^^^^
8
+
9
+ * `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://<your-bucket>/data/` (replace `<your-bucket>` with your bucket name, without the `s3://` prefix).
10
+ * `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://<your-bucket>/data/` down to `data/`.
docs/conf.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # art_chatbot documentation build configuration file, created by
4
+ # sphinx-quickstart.
5
+ #
6
+ # This file is execfile()d with the current directory set to its containing dir.
7
+ #
8
+ # Note that not all possible configuration values are present in this
9
+ # autogenerated file.
10
+ #
11
+ # All configuration values have a default; values that are commented out
12
+ # serve to show the default.
13
+
14
+ import os
15
+ import sys
16
+
17
+ # If extensions (or modules to document with autodoc) are in another directory,
18
+ # add these directories to sys.path here. If the directory is relative to the
19
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
20
+ # sys.path.insert(0, os.path.abspath('.'))
21
+
22
+ # -- General configuration -----------------------------------------------------
23
+
24
+ # If your documentation needs a minimal Sphinx version, state it here.
25
+ # needs_sphinx = '1.0'
26
+
27
+ # Add any Sphinx extension module names here, as strings. They can be extensions
28
+ # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
29
+ extensions = []
30
+
31
+ # Add any paths that contain templates here, relative to this directory.
32
+ templates_path = ['_templates']
33
+
34
+ # The suffix of source filenames.
35
+ source_suffix = '.rst'
36
+
37
+ # The encoding of source files.
38
+ # source_encoding = 'utf-8-sig'
39
+
40
+ # The master toctree document.
41
+ master_doc = 'index'
42
+
43
+ # General information about the project.
44
+ project = u'art_chatbot'
45
+
46
+ # The version info for the project you're documenting, acts as replacement for
47
+ # |version| and |release|, also used in various other places throughout the
48
+ # built documents.
49
+ #
50
+ # The short X.Y version.
51
+ version = '0.1'
52
+ # The full version, including alpha/beta/rc tags.
53
+ release = '0.1'
54
+
55
+ # The language for content autogenerated by Sphinx. Refer to documentation
56
+ # for a list of supported languages.
57
+ # language = None
58
+
59
+ # There are two options for replacing |today|: either, you set today to some
60
+ # non-false value, then it is used:
61
+ # today = ''
62
+ # Else, today_fmt is used as the format for a strftime call.
63
+ # today_fmt = '%B %d, %Y'
64
+
65
+ # List of patterns, relative to source directory, that match files and
66
+ # directories to ignore when looking for source files.
67
+ exclude_patterns = ['_build']
68
+
69
+ # The reST default role (used for this markup: `text`) to use for all documents.
70
+ # default_role = None
71
+
72
+ # If true, '()' will be appended to :func: etc. cross-reference text.
73
+ # add_function_parentheses = True
74
+
75
+ # If true, the current module name will be prepended to all description
76
+ # unit titles (such as .. function::).
77
+ # add_module_names = True
78
+
79
+ # If true, sectionauthor and moduleauthor directives will be shown in the
80
+ # output. They are ignored by default.
81
+ # show_authors = False
82
+
83
+ # The name of the Pygments (syntax highlighting) style to use.
84
+ pygments_style = 'sphinx'
85
+
86
+ # A list of ignored prefixes for module index sorting.
87
+ # modindex_common_prefix = []
88
+
89
+
90
+ # -- Options for HTML output ---------------------------------------------------
91
+
92
+ # The theme to use for HTML and HTML Help pages. See the documentation for
93
+ # a list of builtin themes.
94
+ html_theme = 'default'
95
+
96
+ # Theme options are theme-specific and customize the look and feel of a theme
97
+ # further. For a list of options available for each theme, see the
98
+ # documentation.
99
+ # html_theme_options = {}
100
+
101
+ # Add any paths that contain custom themes here, relative to this directory.
102
+ # html_theme_path = []
103
+
104
+ # The name for this set of Sphinx documents. If None, it defaults to
105
+ # "<project> v<release> documentation".
106
+ # html_title = None
107
+
108
+ # A shorter title for the navigation bar. Default is the same as html_title.
109
+ # html_short_title = None
110
+
111
+ # The name of an image file (relative to this directory) to place at the top
112
+ # of the sidebar.
113
+ # html_logo = None
114
+
115
+ # The name of an image file (within the static path) to use as favicon of the
116
+ # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
117
+ # pixels large.
118
+ # html_favicon = None
119
+
120
+ # Add any paths that contain custom static files (such as style sheets) here,
121
+ # relative to this directory. They are copied after the builtin static files,
122
+ # so a file named "default.css" will overwrite the builtin "default.css".
123
+ html_static_path = ['_static']
124
+
125
+ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126
+ # using the given strftime format.
127
+ # html_last_updated_fmt = '%b %d, %Y'
128
+
129
+ # If true, SmartyPants will be used to convert quotes and dashes to
130
+ # typographically correct entities.
131
+ # html_use_smartypants = True
132
+
133
+ # Custom sidebar templates, maps document names to template names.
134
+ # html_sidebars = {}
135
+
136
+ # Additional templates that should be rendered to pages, maps page names to
137
+ # template names.
138
+ # html_additional_pages = {}
139
+
140
+ # If false, no module index is generated.
141
+ # html_domain_indices = True
142
+
143
+ # If false, no index is generated.
144
+ # html_use_index = True
145
+
146
+ # If true, the index is split into individual pages for each letter.
147
+ # html_split_index = False
148
+
149
+ # If true, links to the reST sources are added to the pages.
150
+ # html_show_sourcelink = True
151
+
152
+ # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153
+ # html_show_sphinx = True
154
+
155
+ # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156
+ # html_show_copyright = True
157
+
158
+ # If true, an OpenSearch description file will be output, and all pages will
159
+ # contain a <link> tag referring to it. The value of this option must be the
160
+ # base URL from which the finished HTML is served.
161
+ # html_use_opensearch = ''
162
+
163
+ # This is the file name suffix for HTML files (e.g. ".xhtml").
164
+ # html_file_suffix = None
165
+
166
+ # Output file base name for HTML help builder.
167
+ htmlhelp_basename = 'art_chatbotdoc'
168
+
169
+
170
+ # -- Options for LaTeX output --------------------------------------------------
171
+
172
+ latex_elements = {
173
+ # The paper size ('letterpaper' or 'a4paper').
174
+ # 'papersize': 'letterpaper',
175
+
176
+ # The font size ('10pt', '11pt' or '12pt').
177
+ # 'pointsize': '10pt',
178
+
179
+ # Additional stuff for the LaTeX preamble.
180
+ # 'preamble': '',
181
+ }
182
+
183
+ # Grouping the document tree into LaTeX files. List of tuples
184
+ # (source start file, target name, title, author, documentclass [howto/manual]).
185
+ latex_documents = [
186
+ ('index',
187
+ 'art_chatbot.tex',
188
+ u'art_chatbot Documentation',
189
+ u"Your name (or your organization/company/team)", 'manual'),
190
+ ]
191
+
192
+ # The name of an image file (relative to this directory) to place at the top of
193
+ # the title page.
194
+ # latex_logo = None
195
+
196
+ # For "manual" documents, if this is true, then toplevel headings are parts,
197
+ # not chapters.
198
+ # latex_use_parts = False
199
+
200
+ # If true, show page references after internal links.
201
+ # latex_show_pagerefs = False
202
+
203
+ # If true, show URL addresses after external links.
204
+ # latex_show_urls = False
205
+
206
+ # Documents to append as an appendix to all manuals.
207
+ # latex_appendices = []
208
+
209
+ # If false, no module index is generated.
210
+ # latex_domain_indices = True
211
+
212
+
213
+ # -- Options for manual page output --------------------------------------------
214
+
215
+ # One entry per manual page. List of tuples
216
+ # (source start file, name, description, authors, manual section).
217
+ man_pages = [
218
+ ('index', 'art_chatbot', u'art_chatbot Documentation',
219
+ [u"Your name (or your organization/company/team)"], 1)
220
+ ]
221
+
222
+ # If true, show URL addresses after external links.
223
+ # man_show_urls = False
224
+
225
+
226
+ # -- Options for Texinfo output ------------------------------------------------
227
+
228
+ # Grouping the document tree into Texinfo files. List of tuples
229
+ # (source start file, target name, title, author,
230
+ # dir menu entry, description, category)
231
+ texinfo_documents = [
232
+ ('index', 'art_chatbot', u'art_chatbot Documentation',
233
+ u"Your name (or your organization/company/team)", 'art_chatbot',
234
+ 'A short description of the project.', 'Miscellaneous'),
235
+ ]
236
+
237
+ # Documents to append as an appendix to all manuals.
238
+ # texinfo_appendices = []
239
+
240
+ # If false, no module index is generated.
241
+ # texinfo_domain_indices = True
242
+
243
+ # How to display URL addresses: 'footnote', 'no', or 'inline'.
244
+ # texinfo_show_urls = 'footnote'
docs/getting-started.rst ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Getting started
2
+ ===============
3
+
4
+ This is where you describe how to get set up on a clean install, including the
5
+ commands necessary to get the raw data (using the `sync_data_from_s3` command,
6
+ for example), and then how to make the cleaned, final data sets.
docs/index.rst ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. art_chatbot documentation master file, created by
2
+ sphinx-quickstart.
3
+ You can adapt this file completely to your liking, but it should at least
4
+ contain the root `toctree` directive.
5
+
6
+ art_chatbot documentation!
7
+ ==============================================
8
+
9
+ Contents:
10
+
11
+ .. toctree::
12
+ :maxdepth: 2
13
+
14
+ getting-started
15
+ commands
16
+
17
+
18
+
19
+ Indices and tables
20
+ ==================
21
+
22
+ * :ref:`genindex`
23
+ * :ref:`modindex`
24
+ * :ref:`search`
docs/make.bat ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @ECHO OFF
2
+
3
+ REM Command file for Sphinx documentation
4
+
5
+ if "%SPHINXBUILD%" == "" (
6
+ set SPHINXBUILD=sphinx-build
7
+ )
8
+ set BUILDDIR=_build
9
+ set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10
+ set I18NSPHINXOPTS=%SPHINXOPTS% .
11
+ if NOT "%PAPER%" == "" (
12
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13
+ set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14
+ )
15
+
16
+ if "%1" == "" goto help
17
+
18
+ if "%1" == "help" (
19
+ :help
20
+ echo.Please use `make ^<target^>` where ^<target^> is one of
21
+ echo. html to make standalone HTML files
22
+ echo. dirhtml to make HTML files named index.html in directories
23
+ echo. singlehtml to make a single large HTML file
24
+ echo. pickle to make pickle files
25
+ echo. json to make JSON files
26
+ echo. htmlhelp to make HTML files and a HTML help project
27
+ echo. qthelp to make HTML files and a qthelp project
28
+ echo. devhelp to make HTML files and a Devhelp project
29
+ echo. epub to make an epub
30
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31
+ echo. text to make text files
32
+ echo. man to make manual pages
33
+ echo. texinfo to make Texinfo files
34
+ echo. gettext to make PO message catalogs
35
+ echo. changes to make an overview over all changed/added/deprecated items
36
+ echo. linkcheck to check all external links for integrity
37
+ echo. doctest to run all doctests embedded in the documentation if enabled
38
+ goto end
39
+ )
40
+
41
+ if "%1" == "clean" (
42
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
43
+ del /q /s %BUILDDIR%\*
44
+ goto end
45
+ )
46
+
47
+ if "%1" == "html" (
48
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
49
+ if errorlevel 1 exit /b 1
50
+ echo.
51
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
52
+ goto end
53
+ )
54
+
55
+ if "%1" == "dirhtml" (
56
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
57
+ if errorlevel 1 exit /b 1
58
+ echo.
59
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
60
+ goto end
61
+ )
62
+
63
+ if "%1" == "singlehtml" (
64
+ %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
65
+ if errorlevel 1 exit /b 1
66
+ echo.
67
+ echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
68
+ goto end
69
+ )
70
+
71
+ if "%1" == "pickle" (
72
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
73
+ if errorlevel 1 exit /b 1
74
+ echo.
75
+ echo.Build finished; now you can process the pickle files.
76
+ goto end
77
+ )
78
+
79
+ if "%1" == "json" (
80
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
81
+ if errorlevel 1 exit /b 1
82
+ echo.
83
+ echo.Build finished; now you can process the JSON files.
84
+ goto end
85
+ )
86
+
87
+ if "%1" == "htmlhelp" (
88
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
89
+ if errorlevel 1 exit /b 1
90
+ echo.
91
+ echo.Build finished; now you can run HTML Help Workshop with the ^
92
+ .hhp project file in %BUILDDIR%/htmlhelp.
93
+ goto end
94
+ )
95
+
96
+ if "%1" == "qthelp" (
97
+ %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
98
+ if errorlevel 1 exit /b 1
99
+ echo.
100
+ echo.Build finished; now you can run "qcollectiongenerator" with the ^
101
+ .qhcp project file in %BUILDDIR%/qthelp, like this:
102
+ echo.^> qcollectiongenerator %BUILDDIR%\qthelp\art_chatbot.qhcp
103
+ echo.To view the help file:
104
+ echo.^> assistant -collectionFile %BUILDDIR%\qthelp\art_chatbot.ghc
105
+ goto end
106
+ )
107
+
108
+ if "%1" == "devhelp" (
109
+ %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110
+ if errorlevel 1 exit /b 1
111
+ echo.
112
+ echo.Build finished.
113
+ goto end
114
+ )
115
+
116
+ if "%1" == "epub" (
117
+ %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118
+ if errorlevel 1 exit /b 1
119
+ echo.
120
+ echo.Build finished. The epub file is in %BUILDDIR%/epub.
121
+ goto end
122
+ )
123
+
124
+ if "%1" == "latex" (
125
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126
+ if errorlevel 1 exit /b 1
127
+ echo.
128
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129
+ goto end
130
+ )
131
+
132
+ if "%1" == "text" (
133
+ %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134
+ if errorlevel 1 exit /b 1
135
+ echo.
136
+ echo.Build finished. The text files are in %BUILDDIR%/text.
137
+ goto end
138
+ )
139
+
140
+ if "%1" == "man" (
141
+ %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142
+ if errorlevel 1 exit /b 1
143
+ echo.
144
+ echo.Build finished. The manual pages are in %BUILDDIR%/man.
145
+ goto end
146
+ )
147
+
148
+ if "%1" == "texinfo" (
149
+ %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150
+ if errorlevel 1 exit /b 1
151
+ echo.
152
+ echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153
+ goto end
154
+ )
155
+
156
+ if "%1" == "gettext" (
157
+ %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158
+ if errorlevel 1 exit /b 1
159
+ echo.
160
+ echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161
+ goto end
162
+ )
163
+
164
+ if "%1" == "changes" (
165
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166
+ if errorlevel 1 exit /b 1
167
+ echo.
168
+ echo.The overview file is in %BUILDDIR%/changes.
169
+ goto end
170
+ )
171
+
172
+ if "%1" == "linkcheck" (
173
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174
+ if errorlevel 1 exit /b 1
175
+ echo.
176
+ echo.Link check complete; look for any errors in the above output ^
177
+ or in %BUILDDIR%/linkcheck/output.txt.
178
+ goto end
179
+ )
180
+
181
+ if "%1" == "doctest" (
182
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183
+ if errorlevel 1 exit /b 1
184
+ echo.
185
+ echo.Testing of doctests in the sources finished, look at the ^
186
+ results in %BUILDDIR%/doctest/output.txt.
187
+ goto end
188
+ )
189
+
190
+ :end
models/.gitkeep ADDED
File without changes
notebooks/.gitkeep ADDED
File without changes
references/.gitkeep ADDED
File without changes
reports/.gitkeep ADDED
File without changes
reports/figures/.gitkeep ADDED
File without changes
src/__init__.py ADDED
File without changes
src/data/.gitkeep ADDED
File without changes
src/data/__init__.py ADDED
File without changes
src/data/__pycache__/wiki_scrape.cpython-311.pyc ADDED
Binary file (3.78 kB). View file
 
src/data/entities.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Realism
2
+ portrait
3
+ Romanticism
4
+ landscape
5
+ Surrealism
6
+ Impressionism
7
+ genre painting
8
+ religious painting
9
+ Neoclassicism
10
+ Symbolist painting
11
+ Ivan Aivazovsky
12
+ Marc Chagall
13
+ John Singer Sargent
14
+ Gustave Dore
15
+ Zdzisław Beksiński
src/data/make_dataset.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import click
3
+ import logging
4
+ from pathlib import Path
5
+ from dotenv import find_dotenv, load_dotenv
6
+
7
+
8
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """Turn raw data from ../raw into cleaned data saved in ../processed.

    Currently a stub: it only logs that processing has started.
    """
    log = logging.getLogger(__name__)
    log.info('making final data set from raw data')
17
+
18
+
19
if __name__ == '__main__':
    # Configure root logging before the click command runs.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # Walk up the directory tree until a .env file is found and load
    # its entries as environment variables.
    load_dotenv(find_dotenv())

    main()
src/data/wiki_scrape.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wikipedia
2
+ import os
3
+
4
def get_raw_wikipedia_article(entity):
    """Fetch the raw text of the best-matching Wikipedia article.

    Searches Wikipedia for *entity*, takes the top hit and returns its
    page content. On a disambiguation page, retries once with a more
    specific ' (arts)' suffix.

    Args:
        entity: Name to search for (artist, style, genre, ...).

    Returns:
        The article content as a string, or None if nothing was found.
    """
    try:
        results = wikipedia.search(entity)
        if not results:
            # BUG FIX: the original indexed results[0] unconditionally,
            # raising IndexError when the search returned no hits.
            return None
        page = wikipedia.page(results[0], auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError:
        # Search term can't be disambiguated, so retry with a more
        # specific ' (arts)' suffix. Guard against unbounded recursion
        # if the suffixed term is itself still ambiguous.
        if entity.endswith(' (arts)'):
            return None
        return get_raw_wikipedia_article(entity + ' (arts)')
    except wikipedia.exceptions.PageError:
        # If the page doesn't exist, handle the PageError here.
        print("The requested page does not exist on Wikipedia.")
        return None
19
+
20
def clean_article(raw_article):
    """Strip section headers and blank lines from a raw Wikipedia article.

    Processing stops entirely at the 'See also' or 'References' section;
    header lines (starting with '=') and empty lines are dropped, and
    each remaining line is whitespace-trimmed.

    Args:
        raw_article: Full article text as returned by the Wikipedia API.

    Returns:
        The cleaned article body, lines joined with newlines.
    """
    kept = []
    for line in raw_article.split('\n'):
        if line.startswith('== See also') or line.startswith('== References'):
            break
        if line.startswith('='):
            continue
        trimmed = line.strip()
        if not trimmed:
            continue
        kept.append(trimmed)
    return '\n'.join(kept)
35
+
36
def save_article(content, path):
    """Write *content* to *path* as UTF-8 text, overwriting any existing file."""
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)
39
+
40
+ def load_entities(entities_path):
41
+ with open(entities_path, 'r', encoding='utf-8') as f:
42
+ return [l.strip() for l in f.readlines()]
43
+
44
def scrape(entities_path, save_path):
    """Scrape, clean and save a Wikipedia article for every listed entity.

    Args:
        entities_path: Text file with one entity name per line.
        save_path: Existing directory where '<entity>.txt' files are written.
    """
    entities = load_entities(entities_path)
    for entity in entities:
        raw_article = get_raw_wikipedia_article(entity)
        # Idiom fix: identity comparison with None, not '== None'.
        if raw_article is None:
            print(f'Article on Wikipedia not found for entity {entity} :(')
            continue

        cleaned_article = clean_article(raw_article)
        save_article(cleaned_article, os.path.join(save_path, f'{entity}.txt'))
54
+
55
# Script entry point: scrape Wikipedia articles for the listed entities.
if __name__ == '__main__':
    scrape('src/data/entities.txt', 'data/wiki_articles')
src/features/.gitkeep ADDED
File without changes
src/features/__init__.py ADDED
File without changes
src/features/build_features.py ADDED
File without changes
src/models/.gitkeep ADDED
File without changes
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/extractive_qa.cpython-311.pyc ADDED
Binary file (2.74 kB). View file
 
src/models/__pycache__/search_engine.cpython-311.pyc ADDED
Binary file (6.58 kB). View file
 
src/models/__pycache__/visual_qa.cpython-311.pyc ADDED
Binary file (1.41 kB). View file
 
src/models/extractive_qa.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BertForQuestionAnswering
2
+ import torch
3
+
4
class QA(object):
    """Extractive question answering with a BERT model fine-tuned on SQuAD.

    Given a question and a passage, predicts the most likely answer span
    inside the passage and returns the decoded text.
    """

    def __init__(self,
                 model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'):
        # HuggingFace hub id of the QA checkpoint to load.
        self.model_name = model_name

        self.__load_model_and_tokenizer()

    def __load_model_and_tokenizer(self):
        """Load the model and its tokenizer from the HuggingFace hub."""
        self.model = BertForQuestionAnswering.from_pretrained(self.model_name)
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        # Inference only: switch off dropout for deterministic predictions.
        self.model.eval()

    def __get_segment_ids(self, input_ids):
        """Build token_type_ids: 0 for question tokens, 1 for passage tokens."""
        # Position of the first [SEP] token separating question from passage.
        sep_index = input_ids.index(self.tokenizer.sep_token_id)

        # Segment A covers the question and includes the [SEP] token itself.
        num_seg_a = sep_index + 1

        # The remainder (the passage) is segment B.
        num_seg_b = len(input_ids) - num_seg_a

        # Construct the list of 0s and 1s.
        segment_ids = [0] * num_seg_a + [1] * num_seg_b

        # Internal invariant: one segment id per input token.
        assert len(segment_ids) == len(input_ids)

        return segment_ids

    def answer_question(self, query, passage):
        """Return the answer to *query* extracted from *passage*.

        NOTE(review): inputs are not truncated to BERT's 512-token limit
        here — confirm callers keep query+passage short enough.
        """
        input_ids = self.tokenizer.encode(query, passage)
        segment_ids = self.__get_segment_ids(input_ids)

        # Inference only — no_grad avoids building the autograd graph,
        # saving memory and time per query.
        with torch.no_grad():
            outputs = self.model(torch.tensor([input_ids]),  # token ids of the input text
                                 token_type_ids=torch.tensor([segment_ids]),  # question vs passage segments
                                 return_dict=True)

        # Tokens with the highest `start` and `end` scores delimit the span.
        answer_start = torch.argmax(outputs.start_logits)
        answer_end = torch.argmax(outputs.end_logits)

        return self.tokenizer.decode(input_ids[answer_start:answer_end + 1])
src/models/predict_model.py ADDED
File without changes
src/models/search_engine.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm
3
+ from whoosh.index import * # whoosh: full-text indexing and searching
4
+ from whoosh.fields import *
5
+ from whoosh import qparser
6
+ import sys
7
+ sys.path.insert(1, "src/data")
8
+ #import src.data.wiki_scrape as wiki_scrape
9
+ import wiki_scrape as wiki_scrape
10
+
11
class IR(object):
    """BM25 passage retrieval over scraped Wikipedia articles using Whoosh.

    Articles are split into overlapping fixed-length character passages,
    indexed once on disk, and searched with an OR-grouped query parser.
    """

    def __init__(self,
                 max_passage_length = 800,
                 overlap = 0.4,
                 passages_limit = 10000,
                 data_path = 'data/wiki_articles',
                 index_path = 'index'):
        # Length (in characters) of each indexed passage window.
        self.max_passage_length = max_passage_length
        # Fraction of overlap between consecutive passages (0..1).
        self.overlap = overlap
        # NOTE(review): despite the name, this caps the number of article
        # *files* processed, not the number of passages — confirm intent.
        self.passages_limit = passages_limit
        self.data_path = data_path
        self.index_path = index_path
        self.ix = None

        # Cache the passages so retrieve_documents() does not re-read every
        # article from disk on each query (previously __load_passages() was
        # called again per query).
        self.passages = self.__load_passages()

        if os.path.exists(self.index_path):
            print(f'Index already exists in the directory {self.index_path}')
            print('Skipping building the index...')
            self.ix = open_dir(self.index_path)
        else:
            self.__create_index(self.passages)

    def __create_passages_from_article(self, content):
        """Split article text into overlapping character windows."""
        passages = []
        # Step between passage starts; smaller than the window by `overlap`.
        passage_diff = int(self.max_passage_length * (1 - self.overlap))

        for i in range(0, len(content), passage_diff):
            passages.append(content[i: i + self.max_passage_length])
        return passages

    def __scrape_wiki_if_not_exists(self):
        """Scrape Wikipedia articles if the data directory is missing or empty."""
        if not os.path.exists(self.data_path):
            os.makedirs(self.data_path)

        if len(os.listdir(self.data_path)) == 0:
            print('No Wiki articles. Scraping...')
            wiki_scrape.scrape('src/data/entities.txt', 'data/wiki_articles')

    def __load_passages(self):
        """Read every .txt article under data_path and split into passages."""
        self.__scrape_wiki_if_not_exists()

        passages = []
        count = 0

        directory = os.fsencode(self.data_path)

        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            if not filename.endswith(".txt"):
                continue

            with open(os.path.join(self.data_path, filename), 'r', encoding='utf-8') as f:
                content = f.read()
                article_passages = self.__create_passages_from_article(content)
                passages.extend(article_passages)

            count += 1
            if count == self.passages_limit:
                break
        return passages

    def __create_index(self, passages):
        """Build the on-disk Whoosh index from the passage list."""
        # Create the index directory.
        os.mkdir(self.index_path)

        # Schema definition:
        # - id: type ID, unique, stored; passage position in the passages list
        # - text: type TEXT processed by StemmingAnalyzer; not stored (the
        #   text is looked up again by id at retrieval time)
        schema = Schema(id = ID(stored=True, unique=True),
                        text = TEXT(analyzer=analysis.StemmingAnalyzer())
                        )

        # BUG FIX: the index was created in the hard-coded "index" directory
        # instead of the configured self.index_path.
        self.ix = create_in(self.index_path, schema)
        writer = self.ix.writer()  # run once! or restart runtime

        # Add each passage to the index, keyed by its list position.
        for doc_id, passage_text in enumerate(tqdm(passages, desc='Building index')):
            writer.add_document(id=str(doc_id), text=passage_text)

        # Persist the added documents.
        writer.commit()
        print("Index successfully created")

    def retrieve_documents(self, query, topk):
        """Return (texts, scores) of the top-k BM25 passages matching *query*."""
        scores = []
        text = []
        # BUG FIX: the original re-opened a second searcher inside the `with`
        # block (`searcher = self.ix.searcher()`), leaking an unclosed
        # searcher on every query. The context manager closes this one.
        with self.ix.searcher() as searcher:
            # 'text' is the default search field; OR-group the query terms.
            q = qparser.QueryParser("text", self.ix.schema,
                                    group=qparser.OrGroup).parse(query)

            # Highest-scoring documents first, limited to topk hits.
            results = searcher.search(q, limit=topk)

            # Each hit's stored fields contain the passage id; map it back
            # to the cached passage text.
            for hit in results:
                scores.append(hit.score)
                text.append(self.passages[int(hit['id'])])
        return text, scores
src/models/train_model.py ADDED
File without changes
src/models/visual_qa.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
+
4
class VisualQA(object):
    """Visual question answering via a HuggingFace 'vqa' pipeline.

    Wraps a fine-tuned VQA model and exposes a single-best-answer interface
    for a (question, image) pair.
    """

    def __init__(self, model_name='nflechas/VQArt', tokenizer_name='dandelin/vilt-b32-finetuned-vqa'):
        # Hub ids for the model checkpoint and its tokenizer.
        self.model_name = model_name
        self.tokenizer_name = tokenizer_name
        self.__load_model()

    def __load_model(self):
        """Instantiate the 'vqa' pipeline with the configured model/tokenizer."""
        self.model = pipeline('vqa', model=self.model_name, tokenizer=self.tokenizer_name)

    def answer_question(self, query, image):
        """Return the single most likely answer string for *query* about *image*."""
        predictions = self.model(question=query, image=image, top_k=1)
        best = predictions[0]
        return best['answer']
src/visualization/.gitkeep ADDED
File without changes
src/visualization/__init__.py ADDED
File without changes
src/visualization/visualize.py ADDED
File without changes