themanas021 committed on
Commit f699624
1 Parent(s): 458ffd5

Update app.py

Files changed (1)
  1. app.py +23 -18
app.py CHANGED
@@ -2,8 +2,10 @@ import streamlit as st
 from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
 from PIL import Image
 from gtts import gTTS
-from googletrans import Translator
-import torch
+from translate import Translator as TextTranslator
+import io
+import tempfile
+import os
 
 # Load the models and tokenizer
 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
@@ -35,26 +37,29 @@ def main():
     pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
     output_ids = model.generate(pixel_values, **gen_kwargs)
 
-    # Check if output_ids has only one value
-    if len(torch.unique(output_ids)) == 1:
-        # Decode the caption
-        caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        caption = caption[0].strip()
-    else:
-        caption = ""
+    # Decode and display the caption
+    caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    caption = caption[0].strip()
+    st.write(f"Caption (English): {caption}")
 
     # Translate the caption to Hindi
-    translator = Translator()
-    translated_caption = translator.translate(caption, src='en', dest='hi').text
+    translator = TextTranslator(to_lang="hi")
+    translated_caption = translator.translate(caption)
+    st.write(f"Caption (Hindi): {translated_caption}")
 
-    # Display the caption in English and its translation in Hindi
-    st.write(f"English Caption: {caption}")
-    st.write(f"Hindi Translation: {translated_caption}")
-
-    # Convert the caption to speech and play it
+    # Convert the translated caption to speech and save it as an MP3 file
     tts = gTTS(translated_caption, lang='hi')
-    st.audio(tts.get_urls()[0], format='audio/wav')
+
+    # Save the MP3 file to a temporary location
+    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmpfile:
+        mp3_filepath = tmpfile.name
+        tts.save(mp3_filepath)
+
+    # Display the audio player using Streamlit's audio widget
+    st.audio(mp3_filepath, format="audio/mp3", start_time=0)
+
+    # Clean up the temporary MP3 file
+    os.unlink(mp3_filepath)
 
 if __name__ == "__main__":
-    st.set_option('deprecation.showfileUploaderEncoding', False)  # Disable file uploader encoding warning
     main()
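
For orientation, here is a minimal end-to-end sketch of how the revised caption → translate → text-to-speech flow fits together. The parts of main() that this diff does not show are assumptions: the st.file_uploader call, the uploaded_file name, and the gen_kwargs values below are illustrative, not taken from the commit.

import os
import tempfile

import streamlit as st
from PIL import Image
from gtts import gTTS
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from translate import Translator as TextTranslator

# Models and tokenizer, as in app.py
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Illustrative generation settings; the real values live in the unchanged part of app.py
gen_kwargs = {"max_length": 16, "num_beams": 4}


def main():
    # Hypothetical uploader; the diff does not show how the image is obtained
    uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_file is None:
        return
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image)

    # Caption the image with the ViT-GPT2 model
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, **gen_kwargs)
    caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    st.write(f"Caption (English): {caption}")

    # Translate the caption to Hindi with the `translate` package
    translated_caption = TextTranslator(to_lang="hi").translate(caption)
    st.write(f"Caption (Hindi): {translated_caption}")

    # Synthesize Hindi speech, play it, then delete the temporary MP3
    tts = gTTS(translated_caption, lang="hi")
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmpfile:
        mp3_filepath = tmpfile.name
    tts.save(mp3_filepath)
    st.audio(mp3_filepath, format="audio/mp3", start_time=0)
    os.unlink(mp3_filepath)


if __name__ == "__main__":
    main()

The NamedTemporaryFile is created with delete=False so the path outlives the with block: gTTS can write the MP3 to it and st.audio can read it back, after which os.unlink removes the file explicitly.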