asahi417 committed on
Commit
bda6501
1 Parent(s): 9151f3b

add stability ts

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import time
3
  import tempfile
4
- from copy import deepcopy
5
  from math import floor
6
  from typing import Optional, List, Dict, Any
7
 
@@ -162,11 +161,11 @@ def get_prediction(inputs, prompt: Optional[str], punctuate_text: bool = True, s
162
  generate_kwargs = {"language": "japanese", "task": "transcribe"}
163
  if prompt:
164
  generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
165
- prediction = pipe(deepcopy(inputs), return_timestamps=True, generate_kwargs=generate_kwargs)
 
 
166
  if stabilize_timestamp:
167
- prediction['chunks'] = fix_timestamp(pipeline_output=prediction['chunks'],
168
- audio=inputs["array"],
169
- sample_rate=inputs["sampling_rate"])
170
  if punctuate_text:
171
  prediction['chunks'] = PUNCTUATOR.punctuate(prediction['chunks'])
172
  text = "".join([c['text'] for c in prediction['chunks']])
@@ -176,9 +175,11 @@ def get_prediction(inputs, prompt: Optional[str], punctuate_text: bool = True, s
176
  return text, text_timestamped
177
 
178
 
179
- def transcribe(inputs, prompt, punctuate_text, stabilize_timestamp):
180
  if inputs is None:
181
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
 
182
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
183
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
184
  return get_prediction(inputs, prompt, punctuate_text, stabilize_timestamp)
 
1
  import os
2
  import time
3
  import tempfile
 
4
  from math import floor
5
  from typing import Optional, List, Dict, Any
6
 
 
161
  generate_kwargs = {"language": "japanese", "task": "transcribe"}
162
  if prompt:
163
  generate_kwargs['prompt_ids'] = pipe.tokenizer.get_prompt_ids(prompt, return_tensors='pt').to(device)
164
+ array = inputs["array"]
165
+ sr = inputs["sampling_rate"]
166
+ prediction = pipe(inputs, return_timestamps=True, generate_kwargs=generate_kwargs)
167
  if stabilize_timestamp:
168
+ prediction['chunks'] = fix_timestamp(pipeline_output=prediction['chunks'], audio=array, sample_rate=sr)
 
 
169
  if punctuate_text:
170
  prediction['chunks'] = PUNCTUATOR.punctuate(prediction['chunks'])
171
  text = "".join([c['text'] for c in prediction['chunks']])
 
175
  return text, text_timestamped
176
 
177
 
178
+ def transcribe(inputs: str, prompt, punctuate_text, stabilize_timestamp):
179
  if inputs is None:
180
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
181
+ with open(inputs, "rb") as f:
182
+ inputs = f.read()
183
  inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
184
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
185
  return get_prediction(inputs, prompt, punctuate_text, stabilize_timestamp)