JunzhaoSun committed on
Commit
89f022a
1 Parent(s): 21147ce

设置默认输出中文

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +40 -23
  3. examples/zhichu.wav +0 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Whisper Large V2
3
  emoji: 📚
4
  colorFrom: gray
5
  colorTo: indigo
@@ -10,4 +10,4 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Whisper Base
3
  emoji: 📚
4
  colorFrom: gray
5
  colorTo: indigo
 
10
  license: mit
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -3,12 +3,13 @@
3
  import gradio as gr
4
  import librosa
5
  import torch
6
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
7
-
8
- checkpoint = "openai/whisper-large-v2"
9
- processor = AutoProcessor.from_pretrained(checkpoint)
10
- model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)
11
 
 
 
 
 
 
12
 
13
  def process_audio(sampling_rate, waveform):
14
  # convert from int16 to floating point
@@ -30,7 +31,7 @@ def process_audio(sampling_rate, waveform):
30
  return waveform
31
 
32
 
33
- def predict(audio, mic_audio=None):
34
  # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
35
  if mic_audio is not None:
36
  sampling_rate, waveform = mic_audio
@@ -39,54 +40,70 @@ def predict(audio, mic_audio=None):
39
  else:
40
  return "(please provide audio)"
41
 
 
 
42
  waveform = process_audio(sampling_rate, waveform)
43
  inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
44
- predicted_ids = model.generate(**inputs, max_length=400)
45
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
46
  return transcription[0]
47
 
 
48
 
49
- title = "OpenAI Whisper Large v2"
50
 
51
  description = """
52
- 本例用于演示 <b>openai/whisper-large-v2</b> 模型的语音识别(ASR)能力。目前没有对模型做微调,基于原始模型开发。 Whisper原始模型主要支持英语语音的识别。英语的效果最好,中文语音识别后只会输出汉语拼音。
53
 
54
- <b>更多的信息请参考:</b> <a href="https://huggingface.co/openai/whisper-large-v2">openai/whisper-large-v2</a>。
55
 
56
  <b>使用方法:</b> 上传一个音频文件或直接在页面中录制音频。音频会在传递到模型之前转换为单声道并重新采样为16 kHz。
57
  """
58
 
59
  article = """
60
  <div style='margin:20px auto;'>
61
-
62
- <p>
63
- 参考:
64
- <a href="https://huggingface.co/openai/whisper-large-v2">OpenAI Whisper Large v2</a> |
65
- <a href="https://github.com/innev">Innev GitHub</a>
66
- </p>
67
-
68
  <p>音频案例:</p>
69
  <ul>
70
- <li>"春日阳光普照大地,正是踏春好时节" 来源: 知琪(Zhiqi)
 
71
  <li>"Hmm, I don't know" 来源: <a href="https://freesound.org/people/InspectorJ/sounds/519189/">InspectorJ</a> (CC BY 4.0 license)
72
  <li>"Henry V" excerpt 来源: <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)
73
  <li>"You can see it in the eyes" 来源: <a href="https://freesound.org/people/JoyOhJoy/sounds/165348/">JoyOhJoy</a> (CC0 license)
74
  <li>"We yearn for time" 来源: <a href="https://freesound.org/people/Sample_Me/sounds/610529/">Sample_Me</a> (CC0 license)
75
  </ul>
 
 
 
 
 
 
76
  </div>
 
 
 
 
 
 
 
 
 
 
 
77
  """
78
 
79
  examples = [
80
- ["examples/zhiqi.wav", None],
81
- ["examples/hmm_i_dont_know.wav", None],
82
- ["examples/henry5.mp3", None],
83
- ["examples/yearn_for_time.mp3", None],
84
- ["examples/see_in_eyes.wav", None],
 
85
  ]
86
 
87
  gr.Interface(
88
  fn=predict,
89
  inputs=[
 
90
  gr.Audio(label="上传语音", source="upload", type="numpy"),
91
  gr.Audio(label="录制语音", source="microphone", type="numpy"),
92
  ],
 
3
  import gradio as gr
4
  import librosa
5
  import torch
6
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
 
 
 
7
 
8
+ checkpoint = "openai/whisper-base"
9
+ # checkpoint = "/innev/open-ai/huggingface/models/openai/whisper-medium"
10
+ # checkpoint = "/innev/open-ai/huggingface/models/openai/whisper-base"
11
+ processor = WhisperProcessor.from_pretrained(checkpoint)
12
+ model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
13
 
14
  def process_audio(sampling_rate, waveform):
15
  # convert from int16 to floating point
 
31
  return waveform
32
 
33
 
34
+ def predict(language, audio, mic_audio=None):
35
  # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
36
  if mic_audio is not None:
37
  sampling_rate, waveform = mic_audio
 
40
  else:
41
  return "(please provide audio)"
42
 
43
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
44
+
45
  waveform = process_audio(sampling_rate, waveform)
46
  inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
47
+ predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
48
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
49
  return transcription[0]
50
 
51
+ supportLangs = ['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese']
52
 
53
+ title = "OpenAI Whisper Base"
54
 
55
  description = """
56
+ 本例用于演示 <b>openai/whisper-base</b> 模型的语音识别(ASR)能力。基于原始模型开发,没有对模型做微调。 本例默认输出为中文,Whisper识别出的是繁体中文。
57
 
58
+ Whisper包含多个不同大小的版本,理论来讲模型越大识别效果越好,模型越小速度越快。
59
 
60
  <b>使用方法:</b> 上传一个音频文件或直接在页面中录制音频。音频会在传递到模型之前转换为单声道并重新采样为16 kHz。
61
  """
62
 
63
  article = """
64
  <div style='margin:20px auto;'>
 
 
 
 
 
 
 
65
  <p>音频案例:</p>
66
  <ul>
67
+ <li>"春日阳光普照大地,正是踏春好时节" 来源: 知琪(Zhiqi)
68
+ <li>"这是一年中最美味的团聚,也注定是一顿百感交集的晚餐。" 来源: 知厨(zhichu)
69
  <li>"Hmm, I don't know" 来源: <a href="https://freesound.org/people/InspectorJ/sounds/519189/">InspectorJ</a> (CC BY 4.0 license)
70
  <li>"Henry V" excerpt 来源: <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)
71
  <li>"You can see it in the eyes" 来源: <a href="https://freesound.org/people/JoyOhJoy/sounds/165348/">JoyOhJoy</a> (CC0 license)
72
  <li>"We yearn for time" 来源: <a href="https://freesound.org/people/Sample_Me/sounds/610529/">Sample_Me</a> (CC0 license)
73
  </ul>
74
+
75
+ <p>
76
+ 参考:
77
+ <a href="https://huggingface.co/openai/whisper-base">OpenAI Whisper Base</a> |
78
+ <a href="https://github.com/innev">Innev GitHub</a>
79
+ </p>
80
  </div>
81
+
82
+ <b>多语言支持: </b> english, chinese, german, spanish, russian, korean, french, japanese, portuguese, turkish, polish, catalan, dutch, arabic, swedish, italian, indonesian, hindi, finnish, vietnamese, hebrew, ukrainian, greek, malay, czech, romanian, danish, hungarian, tamil, norwegian, thai, urdu, croatian, bulgarian, lithuanian, latin, maori, malayalam, welsh, slovak, telugu, persian, latvian, bengali, serbian, azerbaijani, slovenian, kannada, estonian, macedonian, breton, basque, icelandic, armenian, nepali, mongolian, bosnian, kazakh, albanian, swahili, galician, marathi, punjabi, sinhala, khmer, shona, yoruba, somali, afrikaans, occitan, georgian, belarusian, tajik, sindhi, gujarati, amharic, yiddish, lao, uzbek, faroese, haitian creole, pashto, turkmen, nynorsk, maltese, sanskrit, luxembourgish, myanmar, tibetan, tagalog, malagasy, assamese, tatar, hawaiian, lingala, hausa, bashkir, javanese, sundanese, burmese, valencian, flemish, haitian, letzeburgesch, pushto, panjabi, moldavian, moldovan, sinhalese, castilian
83
+
84
+ | 模型版本 | 大小 | 仅英语 | 多语言 |
85
+ |----------|------------|------------------------------------------------------|-----------------------------------------------------|
86
+ | tiny | 39 M | [✓](https://huggingface.co/openai/whisper-tiny.en) | [✓](https://huggingface.co/openai/whisper-tiny) |
87
+ | base | 74 M | [✓](https://huggingface.co/openai/whisper-base.en) | [✓](https://huggingface.co/openai/whisper-base) |
88
+ | small | 244 M | [✓](https://huggingface.co/openai/whisper-small.en) | [✓](https://huggingface.co/openai/whisper-small) |
89
+ | medium | 769 M | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) |
90
+ | large | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large) |
91
+ | large-v2 | 1550 M | x | [✓](https://huggingface.co/openai/whisper-large-v2) |
92
  """
93
 
94
  examples = [
95
+ [None, "examples/zhiqi.wav", None],
96
+ [None, "examples/zhichu.wav", None],
97
+ [None, "examples/hmm_i_dont_know.wav", None],
98
+ [None, "examples/henry5.mp3", None],
99
+ [None, "examples/yearn_for_time.mp3", None],
100
+ [None, "examples/see_in_eyes.wav", None],
101
  ]
102
 
103
  gr.Interface(
104
  fn=predict,
105
  inputs=[
106
+ gr.Radio(label="目标语言", choices=supportLangs, value="chinese"),
107
  gr.Audio(label="上传语音", source="upload", type="numpy"),
108
  gr.Audio(label="录制语音", source="microphone", type="numpy"),
109
  ],
examples/zhichu.wav ADDED
Binary file (500 kB). View file