ericckfeng committed on
Commit
441dbf2
1 Parent(s): 21220b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -41
app.py CHANGED
@@ -6,68 +6,68 @@ import torch
6
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
7
 
8
  checkpoint = "openai/whisper-base"
9
- # checkpoint = "/innev/open-ai/huggingface/openai/whisper-base"
10
- processor = WhisperProcessor.from_pretrained(checkpoint)
11
- model = WhisperForConditionalGeneration.from_pretrained(checkpoint)
12
 
13
- def process_audio(sampling_rate, waveform):
 
 
 
14
  # convert from int16 to floating point
15
- waveform = waveform / 32678.0
16
 
17
  # convert to mono if stereo
18
- if len(waveform.shape) > 1:
19
- waveform = librosa.to_mono(waveform.T)
20
 
21
  # resample to 16 kHz if necessary
22
- if sampling_rate != 16000:
23
- waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
24
 
25
  # limit to 30 seconds
26
- waveform = waveform[:16000*30]
27
 
28
  # make PyTorch tensor
29
- waveform = torch.tensor(waveform)
30
  return waveform
31
 
32
 
33
- def predict(language, audio, mic_audio=None):
34
- if mic_audio is not None:
35
- sampling_rate, waveform = mic_audio
36
- elif audio is not None:
37
- sampling_rate, waveform = audio
38
  else:
39
- return "(please provide audio)"
40
 
41
- forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
42
 
43
- waveform = process_audio(sampling_rate, waveform)
44
- inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
45
- predicted_ids = model.generate(**inputs, max_length=400, forced_decoder_ids=forced_decoder_ids)
46
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
47
- return transcription[0]
48
 
49
  supportLangs = ['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese']
50
 
51
- title = "OpenAI Whisper Base"
52
 
53
  description = """
54
- 本例用于演示 <b>openai/whisper-base</b> 模型的语音识别(ASR)能力。基于原始模型开发,没有对模型做微调。 本例默认输出为中文,Whisper识别出的是繁体中文。
55
 
56
- Whisper包含多个不同大小的版本,理论来讲模型越大识别效果越好,模型越小速度越快
57
 
58
- <b>使用方法:</b> 上传一个音频文件或直接在页面中录制音频。音频会在传递到模型之前转换为单声道并重新采样为16 kHz
59
  """
60
 
61
  article = """
62
 
63
- ## 音频案例:
64
 
65
- - "春日阳光普照大地,正是踏春好时节" 来源: 知琪(Zhiqi)
66
- - "这是一年中最美味的团聚,也注定是一顿白感交集的晚餐。" 来源: 知厨(zhichu)
67
- - "Hmm, I don't know" 来源: [InspectorJ](https://freesound.org/people/InspectorJ/sounds/519189) (CC BY 4.0 license)
68
- - "Henry V" excerpt 来源: [acclivity](https://freesound.org/people/acclivity/sounds/24096) (CC BY-NC 4.0 license)
69
- - "You can see it in the eyes" 来源: [JoyOhJoy](https://freesound.org/people/JoyOhJoy/sounds/165348) (CC0 license)
70
- - "We yearn for time" 来源: [Sample_Me](https://freesound.org/people/Sample_Me/sounds/610529) (CC0 license)
71
 
72
  ## 参考
73
 
@@ -75,13 +75,13 @@ article = """
75
  - [Innev GitHub](https://github.com/innev)
76
 
77
 
78
- ## 多语言支持
79
 
80
  english, chinese, german, spanish, russian, korean, french, japanese, portuguese, turkish, polish, catalan, dutch, arabic, swedish, italian, indonesian, hindi, finnish, vietnamese, hebrew, ukrainian, greek, malay, czech, romanian, danish, hungarian, tamil, norwegian, thai, urdu, croatian, bulgarian, lithuanian, latin, maori, malayalam, welsh, slovak, telugu, persian, latvian, bengali, serbian, azerbaijani, slovenian, kannada, estonian, macedonian, breton, basque, icelandic, armenian, nepali, mongolian, bosnian, kazakh, albanian, swahili, galician, marathi, punjabi, sinhala, khmer, shona, yoruba, somali, afrikaans, occitan, georgian, belarusian, tajik, sindhi, gujarati, amharic, yiddish, lao, uzbek, faroese, haitian creole, pashto, turkmen, nynorsk, maltese, sanskrit, luxembourgish, myanmar, tibetan, tagalog, malagasy, assamese, tatar, hawaiian, lingala, hausa, bashkir, javanese, sundanese, burmese, valencian, flemish, haitian, letzeburgesch, pushto, panjabi, moldavian, moldovan, sinhalese, castilian
81
 
82
  ## 模型版本
83
 
84
- | 模型版本 | 参数大小 | 仅英语 | 多语言 |
85
  |----------|------------|------------------------------------------------------|-----------------------------------------------------|
86
  | tiny | 39 M | [✓](https://huggingface.co/openai/whisper-tiny.en) | [✓](https://huggingface.co/openai/whisper-tiny) |
87
  | base | 74 M | [✓](https://huggingface.co/openai/whisper-base.en) | [✓](https://huggingface.co/openai/whisper-base) |
@@ -100,15 +100,15 @@ examples = [
100
  [None, "examples/see_in_eyes.wav", None],
101
  ]
102
 
103
- gr.Interface(
104
  fn=predict,
105
  inputs=[
106
- gr.Radio(label="目标语言", choices=supportLangs, value="chinese"),
107
- gr.Audio(label="上传语音", source="upload", type="numpy"),
108
- gr.Audio(label="录制语音", source="microphone", type="numpy"),
109
  ],
110
  outputs=[
111
- gr.Text(label="识别出的文字"),
112
  ],
113
  title=title,
114
  description=description,
 
6
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
7
 
8
  checkpoint = "openai/whisper-base"
 
 
 
9
 
10
+ processor = WhisperProcessor.from_pretrained(checkpoint) # 加载tokenizer
11
+ model = WhisperForConditionalGeneration.from_pretrained(checkpoint) # 加载模型
12
+
13
def process_audio(sampling_rate, waveform):
    """Normalize raw audio into the form the Whisper processor is fed.

    Steps, in order: scale int16 PCM to floating point, down-mix to mono,
    resample to 16 kHz, truncate to 30 seconds, wrap in a PyTorch tensor.

    Args:
        sampling_rate: Sample rate of ``waveform`` in Hz.
        waveform: NumPy array of audio samples. Assumed to be int16 PCM,
            shaped ``(samples,)`` or ``(samples, channels)`` -- the
            transpose before ``librosa.to_mono`` suggests channels-last;
            confirm against the caller.

    Returns:
        A 1-D ``torch.Tensor`` holding at most 30 s of mono audio at 16 kHz.
    """
    target_rate = 16000
    max_seconds = 30

    # Convert from int16 to floating point. The int16 full-scale magnitude
    # is 2**15 = 32768 (the previous 32678 was a digit transposition that
    # pushed full-scale samples slightly outside [-1, 1]).
    waveform = waveform / 32768.0

    # Convert to mono if the array has a channel dimension.
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # Resample to 16 kHz if necessary.
    if sampling_rate != target_rate:
        waveform = librosa.resample(
            waveform, orig_sr=sampling_rate, target_sr=target_rate
        )

    # Limit to 30 seconds of audio.
    waveform = waveform[:target_rate * max_seconds]

    # Make a PyTorch tensor.
    waveform = torch.tensor(waveform)
    return waveform
31
 
32
 
33
def predict(language, audio, mic_audio=None):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        language: Language name passed to the processor to build the
            decoder prompt.
        audio: ``(sampling_rate, waveform)`` pair from the upload widget,
            or ``None``.
        mic_audio: ``(sampling_rate, waveform)`` pair from the microphone
            widget, or ``None``. Takes precedence over ``audio``.

    Returns:
        The transcription of the clip, or a placeholder message when
        neither input was provided.
    """
    # The microphone recording wins over the uploaded file when both exist.
    clip = mic_audio if mic_audio is not None else audio
    if clip is None:
        return "(please provide audio)"
    sampling_rate, waveform = clip

    # Decoder prompt ids pin the output language and the transcribe task.
    forced_decoder_ids = processor.get_decoder_prompt_ids(
        language=language, task="transcribe"
    )

    # Preprocess the clip, then turn it into model inputs.
    prepared = process_audio(sampling_rate, waveform)
    inputs = processor(audio=prepared, sampling_rate=16000, return_tensors="pt")

    # Generate token ids and decode them back to text.
    predicted_ids = model.generate(
        **inputs, max_length=400, forced_decoder_ids=forced_decoder_ids
    )
    texts = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return texts[0]
48
 
49
  supportLangs = ['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese']
50
 
51
+ title = "OpenAI Whisper Base 原創為 https://huggingface.co/spaces/innev/whisper-Base 作者: Inne Villumsen 馮建凱加註複製至此,僅供自己學習使用。 "
52
 
53
  description = """
54
+ 本例用於演示 <b>openai/whisper-base</b> 模型的語音識別(ASR)能力。基于原始模型開發,没有對模型做微調。 本例預設為中文輸出,Whisper識別出的是繁体中文。
55
 
56
+ Whisper包含多個大小不同的版本,理論上來說模型越大效果越好,模型越小速度越快
57
 
58
+ <b>使用方法:</b> 上傳一個音檔或是直接在頁面中錄制音頻。音頻會在送到模型前先轉成單音並重新採樣為16Khz
59
  """
60
 
61
  article = """
62
 
63
+ ## 音檔範例:
64
 
65
+ - "春日陽光普照大地,正是踏春好時節" 來源: 知琪(Zhiqi)
66
+ - "這是一年中最美味的團聚,也注定是一頓白感交集的晚餐。" 來源: 知廚(zhichu)
67
+ - "Hmm, I don't know" 來源: [InspectorJ](https://freesound.org/people/InspectorJ/sounds/519189) (CC BY 4.0 license)
68
+ - "Henry V" excerpt 來源: [acclivity](https://freesound.org/people/acclivity/sounds/24096) (CC BY-NC 4.0 license)
69
+ - "You can see it in the eyes" 來源: [JoyOhJoy](https://freesound.org/people/JoyOhJoy/sounds/165348) (CC0 license)
70
+ - "We yearn for time" 來源: [Sample_Me](https://freesound.org/people/Sample_Me/sounds/610529) (CC0 license)
71
 
72
  ## 参考
73
 
 
75
  - [Innev GitHub](https://github.com/innev)
76
 
77
 
78
+ ## 多語音
79
 
80
  english, chinese, german, spanish, russian, korean, french, japanese, portuguese, turkish, polish, catalan, dutch, arabic, swedish, italian, indonesian, hindi, finnish, vietnamese, hebrew, ukrainian, greek, malay, czech, romanian, danish, hungarian, tamil, norwegian, thai, urdu, croatian, bulgarian, lithuanian, latin, maori, malayalam, welsh, slovak, telugu, persian, latvian, bengali, serbian, azerbaijani, slovenian, kannada, estonian, macedonian, breton, basque, icelandic, armenian, nepali, mongolian, bosnian, kazakh, albanian, swahili, galician, marathi, punjabi, sinhala, khmer, shona, yoruba, somali, afrikaans, occitan, georgian, belarusian, tajik, sindhi, gujarati, amharic, yiddish, lao, uzbek, faroese, haitian creole, pashto, turkmen, nynorsk, maltese, sanskrit, luxembourgish, myanmar, tibetan, tagalog, malagasy, assamese, tatar, hawaiian, lingala, hausa, bashkir, javanese, sundanese, burmese, valencian, flemish, haitian, letzeburgesch, pushto, panjabi, moldavian, moldovan, sinhalese, castilian
81
 
82
  ## 模型版本
83
 
84
+ | 模型版本 | 参數大小 | 只有英文 | 多語言 |
85
  |----------|------------|------------------------------------------------------|-----------------------------------------------------|
86
  | tiny | 39 M | [✓](https://huggingface.co/openai/whisper-tiny.en) | [✓](https://huggingface.co/openai/whisper-tiny) |
87
  | base | 74 M | [✓](https://huggingface.co/openai/whisper-base.en) | [✓](https://huggingface.co/openai/whisper-base) |
 
100
  [None, "examples/see_in_eyes.wav", None],
101
  ]
102
 
103
+ gr.Interface(
104
  fn=predict,
105
  inputs=[
106
+ gr.Radio(label="目標語言", choices=supportLangs, value="chinese"),
107
+ gr.Audio(label="上傳語音", source="upload", type="numpy"),
108
+ gr.Audio(label="錄製語音", source="microphone", type="numpy"),
109
  ],
110
  outputs=[
111
+ gr.Text(label="識別結果"),
112
  ],
113
  title=title,
114
  description=description,