Add chatbot and audiobot pages to application.
Use streamlit to create chatbot and audiobot pages for visual question answering.
- .gitignore +10 -0
- README.md +24 -1
- app.py +39 -0
- audiobot.py +60 -0
- chatbot.py +57 -0
- model/predictor.py +48 -0
- requirements.txt +116 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+.Python
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
README.md
CHANGED
@@ -4,10 +4,33 @@ emoji: π
 colorFrom: indigo
 colorTo: purple
 sdk: streamlit
+python_version: 3.9.0
 sdk_version: 1.10.0
 app_file: app.py
+models: ['Madhuri/t5_small_vqa_fs', 'dandelin/vilt-b32-finetuned-vqa']
 pinned: false
 license: mit
 ---

-
+## Visual Question Answering - Bot
+
+VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+It merges a vision transformer and a language generator with an audio transformer.
+We pretrained and finetuned our model on the language and audio transformers to get the desired result.
+Please use the radio buttons below to navigate.
+
+
+## References
+
+> ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision
+>
+> Author: Wonjae Kim and Bokyung Son and Ildoo Kim
+>
+> Year: 2021
+>
+> eprint: 2102.03334
+>
+> archivePrefix: arXiv
+>
+> primaryClass: stat.ML
+
app.py
ADDED
@@ -0,0 +1,39 @@
+import streamlit as st
+
+from model import predictor
+import audiobot
+import chatbot
+import os
+
+
+def run():
+    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
+    st.set_page_config(
+        page_title='Visual Question Answering - Bot',
+        page_icon=':robot:',
+        layout='wide'
+    )
+
+    st.session_state['predictor'] = predictor.Predictor()
+
+    st.sidebar.title('VQA Bot')
+    st.sidebar.write('''
+    VQA Bot addresses the challenge of visual question answering with chat and voice assistance.
+    It merges a vision transformer and a language generator with an audio transformer.
+    We pretrained and finetuned our model on the language and audio transformers to get the desired result.
+    Please use the radio buttons below to navigate.
+    ''')
+
+    selected_page = st.sidebar.radio('', ('VQA Chatbot', 'VQA Audiobot'))
+    if selected_page == 'VQA Chatbot':
+        chatbot.show()
+    elif selected_page == 'VQA Audiobot':
+        audiobot.show()
+
+    st.sidebar.write(
+        'Madhuri Sakhare: [Hugging Face](https://huggingface.co/Madhuri)',
+        '[Github](https://github.com/msak1612/vqa_chatbot)'
+    )
+
+
+run()
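
app.py treats each page as a module that exposes a parameterless `show()` function and dispatches to it from the sidebar radio. As a sketch of that convention, a hypothetical third page (`aboutbot` is invented here purely for illustration) would only need:

```python
# aboutbot.py -- hypothetical page module, invented for illustration only.
# To wire it up, app.py would add 'VQA Aboutbot' to the radio tuple and
# call aboutbot.show() when that option is selected.
import streamlit as st


def show():
    st.title('About VQA Bot')
    st.write('Each page is a plain module with a show() entry point.')
```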
audiobot.py
ADDED
@@ -0,0 +1,60 @@
+import streamlit as st
+from PIL import Image
+
+from bokeh.models.widgets import Button
+from bokeh.models import CustomJS
+from streamlit_bokeh_events import streamlit_bokeh_events
+import subprocess
+
+
+def show():
+    st.title('Visual Question Answering - Audiobot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Audiobot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+    upload_pic = st.file_uploader('Choose an image...', type=[
+        'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+    if upload_pic is not None:
+        st.session_state.image = Image.open(upload_pic)
+        st.image(upload_pic, use_column_width='auto')
+    else:
+        st.session_state.image = None
+
+    # Speech recognition in Streamlit, based on
+    # https://discuss.streamlit.io/t/speech-to-text-on-client-side-using-html5-and-streamlit-bokeh-events/7888
+    stt_button = Button(label='Ask', width=100)
+    stt_button.js_on_event('button_click', CustomJS(code='''
+        var recognition = new webkitSpeechRecognition();
+        recognition.continuous = false;
+        recognition.interimResults = false;
+
+        recognition.onresult = function (e) {
+            var value = '';
+            for (var i = e.resultIndex; i < e.results.length; ++i) {
+                if (e.results[i].isFinal) {
+                    value += e.results[i][0].transcript;
+                }
+            }
+            if (value != '') {
+                document.dispatchEvent(new CustomEvent('GET_TEXT', {detail: value}));
+            }
+        }
+        recognition.start();
+    '''))
+
+    result = streamlit_bokeh_events(
+        stt_button,
+        events='GET_TEXT',
+        key='listen',
+        refresh_on_update=False,
+        override_height=75,
+        debounce_time=0)
+
+    if result:
+        if 'GET_TEXT' in result:
+            answer = st.session_state.predictor.predict_answer_from_text(
+                st.session_state.image, result.get('GET_TEXT'))
+            subprocess.check_output(['say', answer])
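
Note that `subprocess.check_output(['say', answer])` depends on the macOS-only `say` command, so the spoken reply is platform-specific. A minimal cross-platform fallback sketch, assuming pyttsx3 (which is not in requirements.txt) were added as a dependency:

```python
# Hedged sketch of a cross-platform replacement for the 'say' call;
# pyttsx3 is an assumed extra dependency, not part of this commit.
import subprocess
import sys


def speak(answer):
    if sys.platform == 'darwin':
        subprocess.check_output(['say', answer])  # macOS built-in TTS
    else:
        import pyttsx3  # offline TTS engine for Windows/Linux
        engine = pyttsx3.init()
        engine.say(answer)
        engine.runAndWait()
```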
chatbot.py
ADDED
@@ -0,0 +1,57 @@
+import streamlit as st
+from streamlit_chat import message
+from PIL import Image
+
+
+def init_chat_history():
+    if 'question' not in st.session_state:
+        st.session_state['question'] = []
+
+    if 'answer' not in st.session_state:
+        st.session_state['answer'] = []
+
+
+def update_chat_messages():
+    if st.session_state['answer']:
+        for i in range(len(st.session_state['answer'])-1, -1, -1):
+            message(st.session_state['answer'][i], key=str(i))
+            message(st.session_state['question'][i],
+                    is_user=True, key=str(i) + '_user')
+
+
+def predict(image, input):
+    if image is None or not input:
+        return
+
+    answer = st.session_state.predictor.predict_answer_from_text(image, input)
+    st.session_state.question.append(input)
+    st.session_state.answer.append(answer)
+
+
+def show():
+    init_chat_history()
+
+    st.title('Visual Question Answering - Chatbot')
+    st.markdown('''
+    <h4 style='text-align: center; color: #B2BEB5;'>
+    <i>Hi, I am a Visual Chatbot, capable of answering a sequence of questions about images.
+    Please upload an image and fire away!
+    </i></h4>
+    ''', unsafe_allow_html=True)
+
+    image_col, text_col = st.columns(2)
+    with image_col:
+        upload_pic = st.file_uploader('Choose an image...', type=[
+            'jpg', 'png', 'jpeg'], accept_multiple_files=False)
+        if upload_pic is not None:
+            image = Image.open(upload_pic)
+            st.image(upload_pic, use_column_width='auto')
+        else:
+            st.session_state.question.clear()
+            st.session_state.answer.clear()
+            st.session_state.input = ''
+    with text_col:
+        input = st.text_input('', '', key='input')
+        if input:
+            predict(image, input)
+    update_chat_messages()
model/predictor.py
ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+from transformers import ViltProcessor
+from transformers import ViltForQuestionAnswering
+from transformers import AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM
+
+import os
+import torch
+
+
+'''
+Visual Question Answering model that generates an answer statement for a
+question about an image.
+'''
+
+
+@st.experimental_singleton
+class Predictor:
+    def __init__(self):
+        auth_token = os.environ.get('TOKEN') or True
+        self.vqa_processor = ViltProcessor.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.vqa_model = ViltForQuestionAnswering.from_pretrained(
+            'dandelin/vilt-b32-finetuned-vqa')
+        self.qa_model = AutoModelForSeq2SeqLM.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+        self.qa_tokenizer = AutoTokenizer.from_pretrained(
+            'Madhuri/t5_small_vqa_fs', use_auth_token=auth_token)
+
+    def predict_answer_from_text(self, image, question):
+        if not question or image is None:
+            return ''
+
+        # process the question and image with the vision model
+        encoding = self.vqa_processor(image, question, return_tensors='pt')
+        with torch.no_grad():
+            outputs = self.vqa_model(**encoding)
+        short_answer = self.vqa_model.config.id2label[outputs.logits.argmax(
+            -1).item()]
+
+        # generate a full statement with the sentence generator model
+        prompt = question + '. ' + short_answer
+        input_ids = self.qa_tokenizer(prompt, return_tensors='pt').input_ids
+        with torch.no_grad():
+            output_ids = self.qa_model.generate(input_ids)
+        answers = self.qa_tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True)
+        return answers[0]
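
Predictor chains two models: ViLT maps the image and question to a short answer label, and the finetuned T5 model rewrites `'<question>. <short answer>'` into a full sentence. A minimal usage sketch (the image path is a placeholder; since Predictor is cached with `st.experimental_singleton`, it is meant to be constructed inside a running Streamlit app):

```python
# Usage sketch only; 'cat.jpg' is a placeholder path.
from PIL import Image
from model.predictor import Predictor

predictor = Predictor()  # loads ViLT + T5 once; cached by the singleton decorator
image = Image.open('cat.jpg')
print(predictor.predict_answer_from_text(image, 'What animal is this?'))
```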
requirements.txt
ADDED
@@ -0,0 +1,116 @@
+altair==4.2.0
+ansicolors==1.1.8
+ansiwrap==0.8.4
+appnope==0.1.3
+argon2-cffi==21.3.0
+argon2-cffi-bindings==21.2.0
+asttokens==2.0.5
+attrs==21.4.0
+backcall==0.2.0
+beautifulsoup4==4.11.1
+bleach==5.0.0
+blinker==1.4
+bokeh==2.4.3
+cachetools==5.2.0
+certifi==2022.6.15
+cffi==1.15.0
+chainmap==1.0.3
+charset-normalizer==2.0.12
+click==8.1.3
+combomethod==1.0.12
+commonmark==0.9.1
+debugpy==1.6.0
+decorator==5.1.1
+defusedxml==0.7.1
+entrypoints==0.4
+executing==0.8.3
+fastjsonschema==2.15.3
+filelock==3.7.1
+gitdb==4.0.9
+GitPython==3.1.27
+huggingface-hub==0.7.0
+idna==3.3
+importlib-metadata==4.11.4
+ipykernel==6.15.0
+ipython==8.4.0
+ipython-genutils==0.2.0
+ipywidgets==7.7.0
+jedi==0.18.1
+Jinja2==3.1.2
+jsonschema==4.6.0
+jupyter-client==7.3.4
+jupyter-core==4.10.0
+jupyterlab-pygments==0.2.2
+jupyterlab-widgets==1.1.0
+MarkupSafe==2.1.1
+matplotlib-inline==0.1.3
+mementos==1.3.1
+mistune==0.8.4
+nbclient==0.6.4
+nbconvert==6.5.0
+nbformat==5.4.0
+nest-asyncio==1.5.5
+notebook==6.4.12
+nulltype==2.3.1
+numpy==1.22.4
+options==1.4.10
+packaging==21.3
+pandas==1.4.2
+pandocfilters==1.5.0
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.1.1
+prometheus-client==0.14.1
+prompt-toolkit==3.0.29
+protobuf==3.20.1
+psutil==5.9.1
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==8.0.0
+pycparser==2.21
+pydeck==0.7.1
+Pygments==2.12.0
+Pympler==1.0.1
+pyparsing==3.0.9
+pyrsistent==0.18.1
+python-dateutil==2.8.2
+pytz==2022.1
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+pyzmq==23.1.0
+regex==2022.6.2
+requests==2.28.0
+rich==12.4.4
+say==1.6.6
+semver==2.13.0
+Send2Trash==1.8.0
+simplere==1.2.13
+six==1.12.0
+smmap==5.0.0
+soupsieve==2.3.2.post1
+stack-data==0.3.0
+streamlit==1.10.0
+streamlit-bokeh-events==0.1.2
+streamlit-chat==0.0.2.1
+subprocess.run==0.0.8
+terminado==0.15.0
+textwrap3==0.9.2
+tinycss2==1.1.1
+tokenizers==0.12.1
+toml==0.10.2
+toolz==0.11.2
+torch==1.11.0
+tornado==6.1
+tqdm==4.64.0
+traitlets==5.3.0
+transformers==4.20.0
+typing_extensions==4.2.0
+tzdata==2022.1
+tzlocal==4.2
+urllib3==1.26.9
+validators==0.20.0
+wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.6.0
+zipp==3.8.0