Spaces:

Devarsh24
/

Image_Captioning_Advanced

Sleeping

App Files Files Community

Devarsh24 committed on Jul 22, 2024

Commit

1db872f

verified ·

1 Parent(s): 989da0b

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -13

app.py CHANGED Viewed

@@ -1,30 +1,26 @@
 import torch
 import gradio as gr
 from PIL  import Image
 import scipy.io.wavfile as wavfile
 # Use a pipeline as a high-level helper
 from transformers import pipeline
-# from phonemizer.backend.espeak.wrapper import EspeakWrapper
-# _ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib'  #use the Path to the library.
-# EspeakWrapper.set_library(_ESPEAK_LIBRARY)
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
-# tts_model_path = "../Models/models--kakao-enterprise--vits-ljs/snapshots/3bcb8321394f671bd948ebf0d086d694dda95464"
-# narrator = pipeline("text-to-speech", model=tts_model_path)
 # Load the pretrained weights
 caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
-# model_path = "../Models/models--Salesforce--blip-image-captioning-large/snapshots/2227ac38c9f16105cb0412e7cab4759978a8fd90"
-# Load the pretrained weights
-# caption_image = pipeline("image-to-text", model=model_path, device=device)
 # Define the function to generate audio from text
 def generate_audio(text):
@@ -36,13 +32,13 @@ def generate_audio(text):
                   data=narrated_text["audio"][0])
     # Return the path to the saved output WAV file
-    return "output.wav"
 def caption_my_image(pil_image):
     semantics = caption_image(images=pil_image)[0]['generated_text']
     audio = generate_audio(semantics)
-    return semantics,audio
 gr.close_all()

+# to create neural networks
 import torch
+# for interface
 import gradio as gr
+# to open images
 from PIL  import Image
+# used for audio
 import scipy.io.wavfile as wavfile
 # Use a pipeline as a high-level helper
 from transformers import pipeline
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 # Load the pretrained weights
 caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
 # Define the function to generate audio from text
 def generate_audio(text):
                   data=narrated_text["audio"][0])
     # Return the path to the saved output WAV file
+    return "output.wav" # return audio
 def caption_my_image(pil_image):
     semantics = caption_image(images=pil_image)[0]['generated_text']
     audio = generate_audio(semantics)
+    return semantics,audio  # returns both text and audio output
 gr.close_all()