Irpan committed
Commit 13e41f6 · 1 Parent(s): 97db7c3
Files changed (1):
  1. app.py +55 -3
app.py CHANGED
@@ -3,15 +3,59 @@ from transformers import ViltProcessor, ViltForQuestionAnswering
 import torch
 from googletrans import Translator
 from googletrans import LANGCODES
+import re
 
 torch.hub.download_url_to_file('http://images.cocodataset.org/val2017/000000039769.jpg', 'cats.jpg')
 
 processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
 
-def answer_question(image, text):
-    encoding = processor(image, text, return_tensors="pt")
+# Set of acceptable language names (first word of each googletrans language name)
+acceptable_languages = set(L.split()[0] for L in LANGCODES)
+acceptable_languages.add("mandarin")
+acceptable_languages.add("cantonese")
+
+# Translate with googletrans; returns (translated text, detected source language)
+def google_translate(question, dest):
+    translator = Translator()
+    translation = translator.translate(question, dest=dest)
+    print("Translation text: " + translation.text)
+    print("Translation src: " + translation.src)
+    return (translation.text, translation.src)
+
+# Map a language name to a googletrans language code
+def lang_code_match(acceptable_lang):
+    # Exceptions for the Chinese languages
+    if acceptable_lang == 'mandarin':
+        return 'zh-cn'
+    elif acceptable_lang == 'cantonese' or acceptable_lang == 'chinese':
+        return 'zh-tw'
+    # Default
+    else:
+        return LANGCODES[acceptable_lang]
 
+# Find the destination language named in the question
+def find_dest_language(sentence, src_lang):
+    pattern = r'\b(' + '|'.join(acceptable_languages) + r')\b'
+    match = re.search(pattern, sentence, flags=re.IGNORECASE)
+    if match:
+        lang_code = lang_code_match(match.group(0).lower())
+        print("Destination lang: " + lang_code)
+        return lang_code
+    else:
+        print("Destination lang: " + src_lang)
+        return src_lang
+
+# Remove the destination-language phrase from the question
+def remove_language_phrase(sentence):
+    # Remove "in [acceptable_language]" or "[acceptable_language]" plus any trailing punctuation
+    pattern = r'(\b(in\s)?(' + '|'.join(acceptable_languages) + r')\b)[\s,;:.!?]*'
+    cleaned_sentence = re.sub(pattern, '', sentence, flags=re.IGNORECASE).strip()
+    print("Language Phrase Removed: " + cleaned_sentence)
+    return cleaned_sentence
+
+def vqa(image, text):
+    encoding = processor(image, text, return_tensors="pt")
     # forward pass
     with torch.no_grad():
         outputs = model(**encoding)
@@ -21,6 +65,14 @@ def answer_question(image, text):
     predicted_answer = model.config.id2label[idx]
 
     return predicted_answer
+
+def main(image, text):
+    en_question, question_src_lang = google_translate(text, dest='en')
+    dest_lang = find_dest_language(en_question, question_src_lang)
+    cleaned_sentence = remove_language_phrase(en_question)
+    vqa_answer = vqa(image, cleaned_sentence)
+    return vqa_answer
+
 
 image = gr.inputs.Image(type="pil")
 question = gr.inputs.Textbox(label="Question")
@@ -30,7 +82,7 @@ examples = [["cats.jpg", "How many cats are there, in French?"]]
 title = "Cross-lingual VQA"
 description = "ViLT (Vision and Language Transformer), fine-tuned on VQAv2 "
 
-interface = gr.Interface(fn=answer_question,
+interface = gr.Interface(fn=main,
                          inputs=[image, question],
                          outputs=answer,
                          examples=examples,
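
A note on the elided hunk context: the three lines of vqa() between the forward pass and the id2label lookup (old lines 18-20, new lines 62-64) fall outside the context shown above. For dandelin/vilt-b32-finetuned-vqa, the usual decoding per the model card is an argmax over the answer logits; the following is a sketch of that step, not the file's literal contents:

    # Standard ViLT-VQA decoding (sketch; not the literal elided lines)
    logits = outputs.logits
    idx = logits.argmax(-1).item()
    predicted_answer = model.config.id2label[idx]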
 
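For the shipped example question, the two parsing helpers behave as follows. This is a quick offline check (no Google Translate call, so no network needed), assuming the definitions from app.py are in scope; the expected values are worked out from the regexes, not taken from the commit:

    # Offline check of the language-parsing helpers defined above.
    question = "How many cats are there, in French?"
    dest = find_dest_language(question, src_lang="en")
    assert dest == "fr"  # "French" matches, and LANGCODES['french'] == 'fr'
    cleaned = remove_language_phrase(question)
    # The pattern strips "in French" plus trailing punctuation only, so the
    # comma before the phrase survives:
    assert cleaned == "How many cats are there,"

Note the leftover comma: the regex removes punctuation after the language phrase but not before it, so the shipped example reaches vqa() with a trailing comma.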
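
One loose end in this commit: main() computes dest_lang but never uses it, so the answer comes back in English even when the question asks for French. A minimal sketch of the presumable follow-up, translating the answer into dest_lang before returning it (hypothetical, not part of this commit; it reuses only helpers defined above):

    # Hypothetical extension (not in this commit): return the VQA answer in
    # the language the user asked for.
    def main(image, text):
        en_question, question_src_lang = google_translate(text, dest='en')
        dest_lang = find_dest_language(en_question, question_src_lang)
        cleaned_sentence = remove_language_phrase(en_question)
        vqa_answer = vqa(image, cleaned_sentence)
        translated_answer, _ = google_translate(vqa_answer, dest=dest_lang)
        return translated_answer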