Devarsh24 committed (verified)
Commit 1db872f
1 Parent(s): 989da0b

Update app.py

Files changed (1)
1. app.py (+9, -13)
app.py CHANGED
@@ -1,30 +1,26 @@
+# to create neural network
 import torch
+
+# for interface
 import gradio as gr
+
+# to open images
 from PIL import Image
+
+# used for audio
 import scipy.io.wavfile as wavfile
 
 # Use a pipeline as a high-level helper
 from transformers import pipeline
 
-# from phonemizer.backend.espeak.wrapper import EspeakWrapper
-# _ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib' # use the path to the library
-# EspeakWrapper.set_library(_ESPEAK_LIBRARY)
 
 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
 
 narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
 
-# tts_model_path = "../Models/models--kakao-enterprise--vits-ljs/snapshots/3bcb8321394f671bd948ebf0d086d694dda95464"
-
-# narrator = pipeline("text-to-speech", model=tts_model_path)
-
 # Load the pretrained weights
 caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
 
-# model_path = "../Models/models--Salesforce--blip-image-captioning-large/snapshots/2227ac38c9f16105cb0412e7cab4759978a8fd90"
-
-# Load the pretrained weights
-# caption_image = pipeline("image-to-text", model=model_path, device=device)
 
 # Define the function to generate audio from text
 def generate_audio(text):
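Note: the diff jumps from line 26 to line 32 of the new file here, eliding the body of generate_audio. Judging from the scipy.io.wavfile import and the data=narrated_text["audio"][0]) continuation visible at the top of the next hunk, the elided lines presumably read like the sketch below; the comment wording and call layout are assumptions, not the commit's contents.

    def generate_audio(text):
        # Run the VITS text-to-speech pipeline; per the transformers TTS
        # pipeline convention it returns a dict with "audio" (a (1, n)
        # float waveform) and "sampling_rate" (an int)
        narrated_text = narrator(text)

        # Write the waveform to disk at the pipeline's sampling rate;
        # the data= line is the one visible at the top of the next hunk
        wavfile.write("output.wav",
                      rate=narrated_text["sampling_rate"],
                      data=narrated_text["audio"][0])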
@@ -36,13 +32,13 @@ def generate_audio(text):
                   data=narrated_text["audio"][0])
 
     # Return the path to the saved output WAV file
-    return "output.wav"
+    return "output.wav" # return audio
 
 def caption_my_image(pil_image):
 
     semantics = caption_image(images=pil_image)[0]['generated_text']
     audio = generate_audio(semantics)
-    return semantics,audio
+    return semantics,audio # returns both text and audio output
 
 
 gr.close_all()
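For reference on the call shapes used above: an image-to-text pipeline returns a list of dicts, one per generated sequence, which is why caption_my_image indexes [0]['generated_text'], while the text-to-speech pipeline returns a single dict. A short illustration; the filename and caption text below are made up:

    from PIL import Image

    img = Image.open("example.jpg")      # hypothetical input image
    result = caption_image(images=img)   # -> [{'generated_text': 'a dog on a couch'}]
    caption = result[0]['generated_text']
    speech = narrator(caption)           # -> {'audio': ndarray, 'sampling_rate': int}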
 
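The diff ends at gr.close_all(); the Gradio wiring that follows it lies outside both hunks. For context, a minimal sketch of how caption_my_image is typically exposed through gr.Interface; the demo variable name, component labels, and title are assumptions, not taken from the commit.

    # Assumed continuation after gr.close_all(); not shown in this diff
    demo = gr.Interface(
        fn=caption_my_image,  # PIL image in -> (caption text, WAV path) out
        inputs=[gr.Image(label="Select Image", type="pil")],
        outputs=[gr.Textbox(label="Image Caption"),   # BLIP caption
                 gr.Audio(label="Caption Narration")  # generated output.wav
                 ],
        title="Image Captioning with Narration")
    demo.launch()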