cahya committed
Commit cac3bb3 (parent: 2f6b046)

Move to Speech to Speech

Files changed (4)
  1. README.md +3 -4
  2. app.py +63 -59
  3. packages.txt +2 -0
  4. requirements.txt +3 -1
README.md CHANGED
@@ -1,15 +1,14 @@
 ---
 title: Indonesian Whisperer
 emoji: 🇮🇩
-colorFrom: indigo
+colorFrom: purple
 colorTo: red
 sdk: gradio
-sdk_version: 3.9.1
+sdk_version: 3.15.0
 app_file: app.py
-pinned: false
+pinned: true
 tags:
 - whisper-event
-duplicated_from: whisper-event/whisper-demo
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
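
Taken together, the front-matter changes bump the Gradio SDK from 3.9.1 to 3.15.0, switch the theme colour from indigo to purple, pin the Space, and drop the duplicated_from reference to whisper-event/whisper-demo. After this hunk, the full front matter reads:

---
title: Indonesian Whisperer
emoji: 🇮🇩
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 3.15.0
app_file: app.py
pinned: true
tags:
- whisper-event
---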
app.py CHANGED
@@ -1,12 +1,20 @@
 import torch
-
 import gradio as gr
-import pytube as pt
 from transformers import pipeline
-from huggingface_hub import model_info
+import tempfile
+from neon_tts_plugin_coqui import CoquiTTS
+from datetime import datetime
+import time
+import psutil
+from mtranslate import translate
+
 
 MODEL_NAME = "cahya/whisper-medium-id" #this always needs to stay in line 8 :D sorry for the hackiness
 lang = "id"
+title = "indonesian Whisperer"
+description = "Cross Language Speech to Speech using OpenAI Whisper and Coqui TTS"
+info = "more info at [indonesian Whisperer](https://github.com/cahya-wirawan/indonesian-whisperer)"
+badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
@@ -37,62 +45,58 @@ def transcribe(microphone, file_upload):
     return warn_output + text
 
 
-def _return_yt_html_embed(yt_url):
-    video_id = yt_url.split("?v=")[-1]
-    HTML_str = (
-        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
-        " </center>"
+LANGUAGES = list(CoquiTTS.langs.keys())
+default_lang = "en"
+
+coquiTTS = CoquiTTS()
+
+
+def tts(language: str, audio_file: str):
+    print(f"### {datetime.now()} TTS", language, audio_file)
+    transcribed = transcribe(None, audio_file)
+    print(f"### {datetime.now()} transcribed:", transcribed)
+    translation = translate(transcribed, language, "id")
+    # return output
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        coquiTTS.get_tts(translation, fp, speaker={"language": language})
+        print(f"### {datetime.now()} fp.name:", fp.name)
+    return fp.name
+
+
+with gr.Blocks() as blocks:
+    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
+                + title
+                + "</h1>")
+    gr.Markdown(description)
+    with gr.Row():  # equal_height=False
+        with gr.Column():  # variant="panel"
+            upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
+            print("upload:", upload)
+            radio = gr.Radio(
+                label="Language",
+                choices=LANGUAGES,
+                value=default_lang
+            )
+            with gr.Row():  # mobile_collapse=False
+                submit = gr.Button("Submit", variant="primary")
+        audio = gr.Audio(label="Output", interactive=False)
+    memory = psutil.virtual_memory()
+    gr.Markdown(info)
+    system_status = info = f"""
+    *Memory: {memory.total/(1024*1024*1024):.2f}GB, used: {memory.percent}%, available: {memory.available/(1024*1024*1024):.2f}GB*
+    """
+    gr.Markdown(system_status)
+    gr.Markdown("<center>"
+                + f'<img src={badge} alt="visitors badge"/>'
+                + "</center>")
+
+    # actions
+    submit.click(
+        tts,
+        [radio, upload],
+        [audio],
     )
-    return HTML_str
-
-
-def yt_transcribe(yt_url):
-    yt = pt.YouTube(yt_url)
-    html_embed_str = _return_yt_html_embed(yt_url)
-    stream = yt.streams.filter(only_audio=True)[0]
-    stream.download(filename="audio.mp3")
-
-    text = pipe("audio.mp3")["text"]
-
-    return html_embed_str, text
-
-
-demo = gr.Blocks()
-
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
-        gr.inputs.Audio(source="upload", type="filepath", optional=True),
-    ],
-    outputs="text",
-    layout="horizontal",
-    theme="huggingface",
-    title="Whisper Demo: Transcribe Audio",
-    description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
-    ),
-    allow_flagging="never",
-)
-
-yt_transcribe = gr.Interface(
-    fn=yt_transcribe,
-    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
-    outputs=["html", "text"],
-    layout="horizontal",
-    theme="huggingface",
-    title="Whisper Demo: Transcribe YouTube",
-    description=(
-        "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
-        " arbitrary length."
-    ),
-    allow_flagging="never",
-)
+    radio.change(lambda lang: CoquiTTS.langs[lang]["sentence"], radio)
 
-with demo:
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
+blocks.launch()
 
-demo.launch(enable_queue=True)
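The hunks above leave the shared ASR code between them untouched, so the `pipe` object and the body of transcribe() that the new tts() helper calls are not shown in this diff. Based on the hunk header and the surviving context lines, the unchanged portion of app.py presumably resembles the following sketch; the pipeline arguments and warning text are assumptions in the style of the upstream whisper-demo Space, not part of this commit:

# Hypothetical sketch of the unchanged ASR section of app.py (not shown in this diff).
# MODEL_NAME, lang and device are the module-level values visible in the first hunk.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")


def transcribe(microphone, file_upload):
    warn_output = ""
    if (microphone is not None) and (file_upload is not None):
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload
    text = pipe(file)["text"]
    return warn_output + text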
packages.txt CHANGED
@@ -1 +1,3 @@
 ffmpeg
+libsndfile1
+espeak-ng
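
The two new apt packages are system-level dependencies of the Coqui TTS stack: libsndfile1 is the shared library loaded by the Python soundfile module, which the audio pipeline typically uses to write WAV output, and espeak-ng provides the phonemizer backend that many Coqui models need for text-to-phoneme conversion. A quick, hypothetical sanity check (not part of the Space) that both are visible at runtime:

import shutil

# espeak-ng ships a CLI binary; libsndfile1 is loaded when the soundfile module is imported.
if shutil.which("espeak-ng") is None:
    raise SystemExit("espeak-ng not found on PATH")

try:
    import soundfile
except OSError as err:  # raised when libsndfile cannot be loaded
    raise SystemExit(f"libsndfile1 missing: {err}")

print("libsndfile version:", soundfile.__libsndfile_version__)
print("espeak-ng found at:", shutil.which("espeak-ng"))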
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 git+https://github.com/huggingface/transformers
 torch
-pytube
+neon-tts-plugin-coqui==0.6.0
+psutil
+mtranslate
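
The three added Python packages map directly onto the new speech-to-speech path in app.py: neon-tts-plugin-coqui provides CoquiTTS.get_tts(), mtranslate supplies the translate() call that bridges the Indonesian transcript into the target language, and psutil feeds the memory line shown in the UI. A minimal, hypothetical smoke test of that chain outside Gradio (the sample text and language codes are made-up inputs):

import tempfile

import psutil
from mtranslate import translate
from neon_tts_plugin_coqui import CoquiTTS

# Hypothetical local check of the new dependencies; mirrors the call pattern used in app.py.
coqui_tts = CoquiTTS()

indonesian_text = "Halo dunia"  # in the Space this would be the Whisper transcript
english_text = translate(indonesian_text, "en", "id")  # same positional arguments as in tts()
print("translated:", english_text)

with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
    coqui_tts.get_tts(english_text, fp, speaker={"language": "en"})
    print("synthesised speech written to:", fp.name)

print(f"memory in use: {psutil.virtual_memory().percent}%")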