Madhuri committed on
Commit 7a69915 • 1 Parent(s): 7eba131

Add chatbot and audiobot pages to the application.


Use Streamlit to create chatbot and audiobot pages for visual
question answering.

Files changed (7)
  1. .gitignore +10 -0
  2. README.md +24 -1
  3. app.py +39 -0
  4. audiobot.py +60 -0
  5. chatbot.py +57 -0
  6. model/predictor.py +48 -0
  7. requirements.txt +116 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+.Python
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
README.md CHANGED
@@ -4,10 +4,33 @@ emoji: 📈
 colorFrom: indigo
 colorTo: purple
 sdk: streamlit
+python_version: 3.9.0
 sdk_version: 1.10.0
 app_file: app.py
+models: ['Madhuri/t5_small_vqa_fs', 'dandelin/vilt-b32-finetuned-vqa']
 pinned: false
 license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## Visual Question Answering - Bot
+
+VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+Here, we merge a vision transformer and a language generator with an audio transformer.
+We pretrained and fine-tuned our model on the language and audio transformers to get the desired result.
+Please use the radio buttons below to navigate.
+
+
+## References
+
+> ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision
+>
+> Authors: Wonjae Kim, Bokyung Son and Ildoo Kim
+>
+> Year: 2021
+>
+> eprint: 2102.03334
+>
+> archivePrefix: arXiv
+>
+> primaryClass: stat.ML
+
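For reference, the pipeline described in the README works in two stages: ViLT predicts a short answer (a word or phrase) for the image/question pair, and the fine-tuned T5 model rewrites the question plus the short answer as a full sentence. A minimal sketch of that flow, mirroring model/predictor.py added below and assuming both checkpoints are downloadable (Madhuri/t5_small_vqa_fs may require an auth token); the image path and question are illustrative only:

```python
# Minimal sketch of the ViLT -> T5 answer pipeline (mirrors model/predictor.py in this commit).
from PIL import Image
import torch
from transformers import (ViltProcessor, ViltForQuestionAnswering,
                          AutoTokenizer, AutoModelForSeq2SeqLM)

processor = ViltProcessor.from_pretrained('dandelin/vilt-b32-finetuned-vqa')
vqa_model = ViltForQuestionAnswering.from_pretrained('dandelin/vilt-b32-finetuned-vqa')
tokenizer = AutoTokenizer.from_pretrained('Madhuri/t5_small_vqa_fs')
generator = AutoModelForSeq2SeqLM.from_pretrained('Madhuri/t5_small_vqa_fs')

image = Image.open('example.jpg')        # hypothetical local image
question = 'What color is the car?'      # hypothetical question

# Stage 1: ViLT predicts a short answer for the image/question pair.
encoding = processor(image, question, return_tensors='pt')
with torch.no_grad():
    logits = vqa_model(**encoding).logits
short_answer = vqa_model.config.id2label[logits.argmax(-1).item()]

# Stage 2: the fine-tuned T5 turns "question. short_answer" into a full sentence.
input_ids = tokenizer(question + '. ' + short_answer, return_tensors='pt').input_ids
with torch.no_grad():
    output_ids = generator.generate(input_ids)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
```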
app.py ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+
+from model import predictor
+import audiobot
+import chatbot
+import os
+
+
+def run():
+    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+    st.set_page_config(
+        page_title='Visual Question Answering - Bot',
+        page_icon=':robot:',
+        layout='wide'
+    )
+
+    st.session_state['predictor'] = predictor.Predictor()
+
+    st.sidebar.title('VQA Bot')
+    st.sidebar.write('''
+    VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+    Here, we merge a vision transformer and a language generator with an audio transformer.
+    We pretrained and fine-tuned our model on the language and audio transformers to get the desired result.
+    Please use the radio buttons below to navigate.
+    ''')
+
+    selected_page = st.sidebar.radio('', ('VQA Chatbot', 'VQA Audiobot'))
+    if selected_page == 'VQA Chatbot':
+        chatbot.show()
+    elif selected_page == 'VQA Audiobot':
+        audiobot.show()
+
+    st.sidebar.write(
+        'Madhuri Sakhare: [Hugging Face](https://huggingface.co/Madhuri)',
+        '[Github](https://github.com/msak1612/vqa_chatbot)'
+    )
+
+
+run()
audiobot.py ADDED
@@ -0,0 +1,60 @@
+import streamlit as st
+from PIL import Image
+
+from bokeh.models.widgets import Button
+from bokeh.models import CustomJS
+from streamlit_bokeh_events import streamlit_bokeh_events
+import subprocess
+
+
+def show():
+    st.title('Visual Question Answering - Audiobot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+    upload_pic = st.file_uploader('Choose an image...', type=[
+        'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+    if upload_pic is not None:
+        st.session_state.image = Image.open(upload_pic)
+        st.image(upload_pic, use_column_width='auto')
+    else:
+        st.session_state.image = None
+
+    # Client-side speech recognition in Streamlit, based on
+    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
+    stt_button = Button(label='Ask', width=100)
+    stt_button.js_on_event('button_click', CustomJS(code='''
+        var recognition = new webkitSpeechRecognition();
+        recognition.continuous = false;
+        recognition.interimResults = false;
+
+        recognition.onresult = function (e) {
+            var value = '';
+            for (var i = e.resultIndex; i < e.results.length; ++i) {
+                if (e.results[i].isFinal) {
+                    value += e.results[i][0].transcript;
+                }
+            }
+            if (value != '') {
+                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
+            }
+        }
+        recognition.start();
+        '''))
+
+    result = streamlit_bokeh_events(
+        stt_button,
+        events='GET_TEXT',
+        key='listen',
+        refresh_on_update=False,
+        override_height=75,
+        debounce_time=0)
+
+    if result:
+        if 'GET_TEXT' in result:
+            answer = st.session_state.predictor.predict_answer_from_text(
+                st.session_state.image, result.get('GET_TEXT'))
+            subprocess.check_output(['say', answer])  # speak the answer with the macOS 'say' command
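A note on the last line: `subprocess.check_output(['say', answer])` speaks the answer on the server using the macOS `say` command, so it only works when the app runs on macOS with audio output. A minimal, hedged sketch of a more portable approach, assuming the `gTTS` package (not listed in requirements.txt), would synthesize the answer and play it in the client browser instead:

```python
# Portable text-to-speech sketch (assumes gTTS, which is not in requirements.txt).
import streamlit as st
from gtts import gTTS


def speak(answer: str) -> None:
    # Synthesize the answer to an mp3 file and stream it to the client browser.
    gTTS(answer, lang='en').save('answer.mp3')
    with open('answer.mp3', 'rb') as f:
        st.audio(f.read(), format='audio/mp3')
```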
chatbot.py ADDED
@@ -0,0 +1,58 @@
+import streamlit as st
+from streamlit_chat import message
+from PIL import Image
+
+
+def init_chat_history():
+    if 'question' not in st.session_state:
+        st.session_state['question'] = []
+
+    if 'answer' not in st.session_state:
+        st.session_state['answer'] = []
+
+
+def update_chat_messages():
+    if st.session_state['answer']:
+        for i in range(len(st.session_state['answer'])-1, -1, -1):
+            message(st.session_state['answer'][i], key=str(i))
+            message(st.session_state['question'][i],
+                    is_user=True, key=str(i) + '_user')
+
+
+def predict(image, input):
+    if image is None or not input:
+        return
+
+    answer = st.session_state.predictor.predict_answer_from_text(image, input)
+    st.session_state.question.append(input)
+    st.session_state.answer.append(answer)
+
+
+def show():
+    init_chat_history()
+
+    st.title('Visual Question Answering - Chatbot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Chatbot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+
+    image_col, text_col = st.columns(2)
+    with image_col:
+        image = None  # default until an image is uploaded, so predict() is skipped
+        upload_pic = st.file_uploader('Choose an image...', type=[
+            'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+        if upload_pic is not None:
+            image = Image.open(upload_pic)
+            st.image(upload_pic, use_column_width='auto')
+        else:
+            st.session_state.question.clear()
+            st.session_state.answer.clear()
+            st.session_state.input = ''
+    with text_col:
+        input = st.text_input('', '', key='input')
+        if input:
+            predict(image, input)
+    update_chat_messages()
model/predictor.py ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+from transformers import ViltProcessor
+from transformers import ViltForQuestionAnswering
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+
+import os
+import torch
+
+
+'''
+Visual Question Answering model that generates an answer statement for a
+question about an image.
+'''
+
+
+@st.experimental_singleton
+class Predictor:
+    def __init__(self):
+        auth_token = os.environ.get('TOKEN') or True
+        self.vqa_processor = ViltProcessor.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.vqa_model = ViltForQuestionAnswering.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.qa_model = AutoModelForSeq2SeqLM.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+        self.qa_tokenizer = AutoTokenizer.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+
+    def predict_answer_from_text(self, image, question):
+        if not question or image is None:
+            return ''
+
+        # Process the image and question with the ViLT model to get a short answer.
+        encoding = self.vqa_processor(image, question, return_tensors='pt')
+        with torch.no_grad():
+            outputs = self.vqa_model(**encoding)
+        short_answer = self.vqa_model.config.id2label[outputs.logits.argmax(
+            -1).item()]
+
+        # Generate a full answer statement with the sentence generator model.
+        prompt = question + '. ' + short_answer
+        input_ids = self.qa_tokenizer(prompt, return_tensors='pt').input_ids
+        with torch.no_grad():
+            output_ids = self.qa_model.generate(input_ids)
+        answers = self.qa_tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True)
+        return answers[0]
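Because `@st.experimental_singleton` wraps the class, `Predictor()` returns one cached instance per server process, so the ViLT and T5 weights are loaded once rather than on every Streamlit rerun. The decorator is more commonly applied to a loader function; a hedged sketch of that equivalent pattern (names and import path here are illustrative, not part of this commit, and assume an undecorated Predictor class):

```python
# Sketch: the more common function-based form of the same caching idea.
import streamlit as st
from model.predictor import Predictor  # hypothetical import path for this sketch


@st.experimental_singleton
def load_predictor():
    # Heavy model loading happens once per server process; later calls reuse the instance.
    return Predictor()

# app.py would then store the shared instance: st.session_state['predictor'] = load_predictor()
```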
requirements.txt ADDED
@@ -0,0 +1,116 @@
+altair==4.2.0
+ansicolors==1.1.8
+ansiwrap==0.8.4
+appnope==0.1.3
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+attrs==21.4.0
+backcall==0.2.0
+beautifulsoup4==4.11.1
+bleach==5.0.0
+blinker==1.4
+bokeh==2.4.3
+cachetools==5.2.0
+certifi==2022.6.15
+cffi==1.15.0
+chainmap==1.0.3
+charset-normalizer==2.0.12
+click==8.1.3
+combomethod==1.0.12
+commonmark==0.9.1
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+executing==0.8.3
+fastjsonschema==2.15.3
+filelock==3.7.1
+gitdb==4.0.9
+GitPython==3.1.27
+huggingface-hub==0.7.0
+idna==3.3
+importlib-metadata==4.11.4
+ipykernel==6.15.0
+ipython==8.4.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.0
+jedi==0.18.1
+Jinja2==3.1.2
+jsonschema==4.6.0
+jupyter-client==7.3.4
+jupyter-core==4.10.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==1.1.0
+MarkupSafe==2.1.1
+matplotlib-inline==0.1.3
+mementos==1.3.1
+mistune==0.8.4
+nbclient==0.6.4
+nbconvert==6.5.0
+nbformat==5.4.0
+nest-asyncio==1.5.5
+notebook==6.4.12
+nulltype==2.3.1
+numpy==1.22.4
+options==1.4.10
+packaging==21.3
+pandas==1.4.2
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.1.1
+prometheus-client==0.14.1
+prompt-toolkit==3.0.29
+protobuf==3.20.1
+psutil==5.9.1
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==8.0.0
+pycparser==2.21
+pydeck==0.7.1
+Pygments==2.12.0
+Pympler==1.0.1
+pyparsing==3.0.9
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2022.1
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+pyzmq==23.1.0
+regex==2022.6.2
+requests==2.28.0
+rich==12.4.4
+say==1.6.6
+semver==2.13.0
+Send2Trash==1.8.0
+simplere==1.2.13
+six==1.12.0
+smmap==5.0.0
+soupsieve==2.3.2.post1
+stack-data==0.3.0
+streamlit==1.10.0
+streamlit-bokeh-events==0.1.2
+streamlit-chat==0.0.2.1
+subprocess.run==0.0.8
+terminado==0.15.0
+textwrap3==0.9.2
+tinycss2==1.1.1
+tokenizers==0.12.1
+toml==0.10.2
+toolz==0.11.2
+torch==1.11.0
+tornado==6.1
+tqdm==4.64.0
+traitlets==5.3.0
+transformers==4.20.0
+typing_extensions==4.2.0
+tzdata==2022.1
+tzlocal==4.2
+urllib3==1.26.9
+validators==0.20.0
+wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.6.0
+zipp==3.8.0