ReneeYe committed on
Commit
9f7d061
•
1 Parent(s): d8aa18e
Files changed (2) hide show
  1. app.py +23 -25
  2. requirements.txt +1 -1
app.py CHANGED
@@ -12,7 +12,7 @@ import shutil
12
  import yaml
13
  import torchaudio
14
  import gradio as gr
15
- from huggingface_hub import snapshot_download, hf_hub_url
16
 
17
 
18
  LANGUAGE_CODES = {
@@ -38,10 +38,12 @@ LANG_GEN_SETUPS = {
38
  }
39
 
40
  os.system("git clone https://github.com/ReneeYe/ConST")
41
- os.system('mv ConST/* ./')
42
- os.system("rm -rf ConST")
43
- os.system("sudo python3 setup.py install")
44
- os.system("sudo python3 setup.py build_ext --inplace")
 
 
45
  os.system("mkdir -p data checkpoint")
46
 
47
 
@@ -52,7 +54,7 @@ def convert_audio_to_16k_wav(audio_input):
52
  num_frames = torchaudio.info(audio_input.name).num_frames
53
  filename = audio_input.name.split("/")[-1]
54
  shutil.copy(audio_input.name, f'data/{filename}')
55
- return f'data/{filename}', num_frames
56
 
57
 
58
  def prepare_tsv(file_name, n_frame, language, task="ST"):
@@ -90,7 +92,7 @@ def get_model(language):
90
 
91
 
92
  def generate(model_path):
93
- os.system(f"fairseq-generate data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
94
  --max-tokens 4000000 --max-source-positions 4000000 \
95
  --config-yaml config.yaml --path {model_path} | tee temp.txt")
96
  output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
@@ -103,22 +105,24 @@ def remove_temp_files():
103
 
104
 
105
  def run(audio_file, language):
106
- converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
107
- prepare_tsv(converted_audio_file, n_frame, language)
108
- get_vocab_and_yaml(language)
109
- model_path = get_model(language)
110
- generated_output = generate(model_path)
111
- remove_temp_files()
112
- return generated_output
 
 
 
113
 
114
 
115
- def greet(audio_file, language):
116
- print(audio_file.name)
117
- return f"Hello {language}!!"
118
 
119
 
120
  inputs = [
121
- gr.inputs.Audio(source="microphone", type="file", label="Record something (in English)..."),
122
  gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="German", label="From English to Languages X..."),
123
  ]
124
 
@@ -133,11 +137,5 @@ iface = gr.Interface(
133
  "Its motivation is to use contrastive learning method to learn similar representations for semantically similar speech and text.",
134
  theme="seafoam",
135
  layout='vertical',
136
- # analytics_enabled=False,
137
- # flagging_dir='results/flagged/',
138
- # allow_flagging=True,
139
- # flagging_options=['Interesting!', 'Error: Claim Phrase Parsing', 'Error: Local Premise',
140
- # 'Error: Require Commonsense', 'Error: Evidence Retrieval'],
141
- enable_queue=True
142
  )
143
- iface.launch(inline=False)
 
12
  import yaml
13
  import torchaudio
14
  import gradio as gr
15
+ from huggingface_hub import snapshot_download
16
 
17
 
18
  LANGUAGE_CODES = {
 
38
  }
39
 
40
  os.system("git clone https://github.com/ReneeYe/ConST")
41
+ os.system("mv ConST ConST_git")
42
+ os.system('mv -n ConST_git/* ./')
43
+ os.system("rm -rf ConST_git")
44
+ # os.system("python3 setup.py install")
45
+ # os.system("python3 setup.py build_ext --inplace")
46
+ os.system("pip3 install --editable ./")
47
  os.system("mkdir -p data checkpoint")
48
 
49
 
 
54
  num_frames = torchaudio.info(audio_input.name).num_frames
55
  filename = audio_input.name.split("/")[-1]
56
  shutil.copy(audio_input.name, f'data/{filename}')
57
+ return filename, num_frames
58
 
59
 
60
  def prepare_tsv(file_name, n_frame, language, task="ST"):
 
92
 
93
 
94
  def generate(model_path):
95
+ os.system(f"python3 fairseq_cli/generate.py data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
96
  --max-tokens 4000000 --max-source-positions 4000000 \
97
  --config-yaml config.yaml --path {model_path} | tee temp.txt")
98
  output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
 
105
 
106
 
107
  def run(audio_file, language):
108
+ try:
109
+ converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
110
+ prepare_tsv(converted_audio_file, n_frame, language)
111
+ get_vocab_and_yaml(language)
112
+ model_path = get_model(language)
113
+ generated_output = generate(model_path)
114
+ remove_temp_files()
115
+ return generated_output
116
+ except:
117
+ return error_output(language)
118
 
119
 
120
+ def error_output(language):
121
+ return f"Fail to translate the audio into {language}, you may use the examples I provide."
 
122
 
123
 
124
  inputs = [
125
+ gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)..."),
126
  gr.inputs.Dropdown(list(LANGUAGE_CODES.keys()), default="German", label="From English to Languages X..."),
127
  ]
128
 
 
137
  "Its motivation is to use contrastive learning method to learn similar representations for semantically similar speech and text.",
138
  theme="seafoam",
139
  layout='vertical',
 
 
 
 
 
 
140
  )
141
+ iface.launch()
requirements.txt CHANGED
@@ -20,5 +20,5 @@ sacrebleu==1.5.1
20
  omegaconf==2.0.5
21
  hydra-core==1.0.0
22
  huggingface_hub
23
- gradio
24
  torch==1.10.0
 
20
  omegaconf==2.0.5
21
  hydra-core==1.0.0
22
  huggingface_hub
23
+ gradio==2.7.5
24
  torch==1.10.0