zongxiao committed on
Commit cca84a3
1 Parent(s): 4df22b0

Update app.py

Files changed (1)
  1. app.py +106 -103
app.py CHANGED
@@ -1,87 +1,3 @@
-# import torch
-# import numpy as np
-# import soundfile as sf
-# from transformers import pipeline
-# from transformers import BarkModel
-# from transformers import AutoProcessor
-
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# pipe = pipeline(
-#     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
-# )
-# label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
-# processor = AutoProcessor.from_pretrained("suno/bark")
-# model = BarkModel.from_pretrained("suno/bark")
-# model = model.to(device)
-# synthesised_rate = model.generation_config.sample_rate
-
-# def translate(audio_file):
-#     audio, sampling_rate = sf.read(audio_file)
-#     outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-#     language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-#     label_outputs = {}
-#     for pred in language_prediction:
-#         label_outputs[pred["label"]] = pred["score"]
-#     return outputs["text"],label_outputs
-# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
-#     inputs = processor(text_prompt, voice_preset=voice_preset)
-#     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
-#     return speech_output
-# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-#     translated_text, label_outputs= translate(audio)
-#     synthesised_speech = synthesise(translated_text,voice_preset)
-#     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-#     return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
-
-# title = "Foreign Speech to Chinese Speech"
-# description = """
-# As the capstone project for the [Hugging Face Audio course](https://huggingface.co/learn/audio-course/chapter0/introduction), this demo chains three large NLP models: one translates foreign speech into Chinese, one identifies which language is being spoken, and one converts the Chinese text to speech. Both audio upload and microphone input are supported; conversion is slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the pre-cached Examples is recommended. Feel free to add my WeChat ID: ESGGTP to chat with my parallel self.
-
-# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
-# """
-
-# examples = [
-#     ["./en.mp3", None],
-#     ["./de.mp3", None],
-#     ["./fr.mp3", None],
-#     ["./it.mp3", None],
-
-# ]
-# import gradio as gr
-
-# demo = gr.Blocks()
-# file_transcribe = gr.Interface(
-#     fn=speech_to_speech_translation,
-#     inputs=gr.Audio(source="upload", type="filepath"),
-#     outputs=[
-#         gr.Audio(label="Generated Speech", type="numpy"),
-#         gr.Text(label="Transcription"),
-#         gr.Label(label="Language prediction"),
-#     ],
-#     title=title,
-#     description=description,
-#     examples=examples,
-# )
-# mic_transcribe = gr.Interface(
-#     fn=speech_to_speech_translation,
-#     inputs=gr.Audio(source="microphone", type="filepath"),
-#     outputs=[
-#         gr.Audio(label="Generated Speech", type="numpy"),
-#         gr.Text(label="Transcription"),
-#         gr.Label(label="Language prediction"),
-#     ],
-#     title=title,
-#     description=description,
-# )
-# with demo:
-#     gr.TabbedInterface(
-#         [file_transcribe, mic_transcribe],
-#         ["Transcribe Audio File", "Transcribe Microphone"],
-#     )
-
-# demo.launch(share=True)
-###########################################################################################################################
 import torch
 import numpy as np
 import soundfile as sf
@@ -94,47 +10,46 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
 )
-#label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
 processor = AutoProcessor.from_pretrained("suno/bark")
 model = BarkModel.from_pretrained("suno/bark")
 model = model.to(device)
 synthesised_rate = model.generation_config.sample_rate
 
 def translate(audio_file):
-    # audio, sampling_rate = sf.read(audio_file)
-    outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
-    # language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
-    # label_outputs = {}
-    # for pred in language_prediction:
-    #     label_outputs[pred["label"]] = pred["score"]
-    return outputs["text"]#,label_outputs
+    audio, sampling_rate = sf.read(audio_file)
+    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
+    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+    label_outputs = {}
+    for pred in language_prediction:
+        label_outputs[pred["label"]] = pred["score"]
+    return outputs["text"],label_outputs
 def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
     inputs = processor(text_prompt, voice_preset=voice_preset)
     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
     return speech_output
 def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
-    #translated_text, label_outputs= translate(audio)
-    translated_text = translate(audio)
+    translated_text, label_outputs= translate(audio)
     synthesised_speech = synthesise(translated_text,voice_preset)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
-    return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs
+    return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs
 
 title = "Foreign Speech to Chinese Speech"
 description = """
-As the capstone project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large NLP models: one translates foreign speech into Chinese, one identifies which language is being spoken (temporarily disabled because the CPU demo is too slow), and one converts the Chinese text to speech. Both audio upload and microphone input are supported; conversion is slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the pre-cached Examples is recommended. Feel free to add my WeChat ID: ESGGTP to chat with my parallel self.
+As the capstone project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large NLP models: one translates foreign speech into Chinese, one identifies which language is being spoken, and one converts the Chinese text to speech. Both audio upload and microphone input are supported; conversion is slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the pre-cached Examples is recommended. Feel free to add my WeChat ID: ESGGTP to chat with my parallel self.
 
 ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 """
 
 examples = [
-    ["./en.mp3", None],
-    ["./de.mp3", None],
+    # ["./en.mp3", None],
+    # ["./de.mp3", None],
     ["./fr.mp3", None],
     ["./it.mp3", None],
     ["./nl.mp3", None],
     ["./fi.mp3", None],
-    ["./cs.mp3", None],
-    ["./pl.mp3", None],
+    # ["./cs.mp3", None],
+    # ["./pl.mp3", None],
 ]
 import gradio as gr
 
@@ -145,7 +60,7 @@ file_transcribe = gr.Interface(
     outputs=[
         gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-        # gr.Label(label="Language prediction"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
@@ -157,7 +72,7 @@ mic_transcribe = gr.Interface(
     outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
         gr.Text(label="Transcription"),
-        # gr.Label(label="Language prediction"),
+        gr.Label(label="Language prediction"),
     ],
     title=title,
     description=description,
@@ -168,4 +83,92 @@ with demo:
         ["Transcribe Audio File", "Transcribe Microphone"],
     )
 
-demo.launch(share=True)
+demo.launch()
+###########################################################################################################################
+# import torch
+# import numpy as np
+# import soundfile as sf
+# from transformers import pipeline
+# from transformers import BarkModel
+# from transformers import AutoProcessor
+
+# device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+# pipe = pipeline(
+#     "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
+# )
+# #label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
+# processor = AutoProcessor.from_pretrained("suno/bark")
+# model = BarkModel.from_pretrained("suno/bark")
+# model = model.to(device)
+# synthesised_rate = model.generation_config.sample_rate
+
+# def translate(audio_file):
+#     # audio, sampling_rate = sf.read(audio_file)
+#     outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
+#     # language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
+#     # label_outputs = {}
+#     # for pred in language_prediction:
+#     #     label_outputs[pred["label"]] = pred["score"]
+#     return outputs["text"]#,label_outputs
+# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
+#     inputs = processor(text_prompt, voice_preset=voice_preset)
+#     speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
+#     return speech_output
+# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
+#     #translated_text, label_outputs= translate(audio)
+#     translated_text = translate(audio)
+#     synthesised_speech = synthesise(translated_text,voice_preset)
+#     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
+#     return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs
+
+# title = "Foreign Speech to Chinese Speech"
+# description = """
+# As the capstone project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large NLP models: one translates foreign speech into Chinese, one identifies which language is being spoken (temporarily disabled because the CPU demo is too slow), and one converts the Chinese text to speech. Both audio upload and microphone input are supported; conversion is slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so trying the pre-cached Examples is recommended. Feel free to add my WeChat ID: ESGGTP to chat with my parallel self.
+
+# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
+# """
+
+# examples = [
+#     ["./en.mp3", None],
+#     ["./de.mp3", None],
+#     ["./fr.mp3", None],
+#     ["./it.mp3", None],
+#     ["./nl.mp3", None],
+#     ["./fi.mp3", None],
+#     ["./cs.mp3", None],
+#     ["./pl.mp3", None],
+# ]
+# import gradio as gr
+
+# demo = gr.Blocks()
+# file_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="upload", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+#         # gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+#     examples=examples,
+# )
+# mic_transcribe = gr.Interface(
+#     fn=speech_to_speech_translation,
+#     inputs=gr.Audio(source="microphone", type="filepath"),
+#     outputs=[
+#         gr.Audio(label="Generated Speech", type="numpy"),
+#         gr.Text(label="Transcription"),
+#         # gr.Label(label="Language prediction"),
+#     ],
+#     title=title,
+#     description=description,
+# )
+# with demo:
+#     gr.TabbedInterface(
+#         [file_transcribe, mic_transcribe],
+#         ["Transcribe Audio File", "Transcribe Microphone"],
+#     )
+
+# demo.launch()
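A note on the restored translate(): the transformers audio-classification pipeline returns a ranked list of {"score": ..., "label": ...} dicts, while gr.Label consumes a {label: score} mapping, which is why the function folds the predictions into label_outputs. A minimal sketch of that fold (the prediction values below are invented for illustration):

# Shape of a typical MMS-LID prediction list (values invented for illustration):
language_prediction = [
    {"score": 0.97, "label": "fra"},
    {"score": 0.02, "label": "ita"},
    {"score": 0.01, "label": "deu"},
]
# Fold into the {label: score} mapping that gr.Label expects,
# mirroring the loop restored in translate():
label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}
print(label_outputs)  # {'fra': 0.97, 'ita': 0.02, 'deu': 0.01}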
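For reference, the restored cascade can be exercised without the UI. This is a sketch under stated assumptions, not part of the commit: the definitions from the new app.py are assumed to be in scope (importing app.py directly would also run demo.launch()), and a sample clip such as ./fr.mp3 is assumed to exist. Bark returns float32 audio in [-1, 1], hence the * 32767 scaling to int16 for Gradio's numpy audio output; note that on a CUDA device the tensor would also need .cpu() before .numpy().

# Usage sketch under the assumptions stated above.
import soundfile as sf

(rate, waveform), text, langs = speech_to_speech_translation("./fr.mp3")
sf.write("translated_zh.wav", waveform, rate)  # 16-bit PCM Chinese speech
print(text)                                    # Chinese transcription
print(max(langs, key=langs.get))               # top predicted source language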