Halving precision to speed up inference
app.py CHANGED
@@ -24,7 +24,7 @@ def format_time(seconds):
 
 #Convert Video/Audio into 16K wav file
 def preprocessAudio(audioFile):
-    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audio.wav")
+    os.system(f"ffmpeg -y -i {audioFile.name} -ar 16000 ./audioToConvert.wav")
 
 #Transcribe!!!
 def Transcribe(file):
@@ -33,18 +33,18 @@ def Transcribe(file):
     model.load_adapter("amh")
 
     preprocessAudio(file)
-    #os.system(f"ffmpeg -y -i ./July3_2023_Sermon.mov -ar 16000 ./audio.wav")
     block_size = 30 #30 second chunks of audio
 
     transcripts = []
     stream = librosa.stream(
-        "./audio.wav",
+        "./audioToConvert.wav",
         block_length=block_size,
         frame_length=16000,
         hop_length=16000
     )
 
     model.to(device)
+    model.half()
     print(f"Model loaded to {device}: Entering transcription phase")
 
     #Code for timestamping
@@ -55,12 +55,11 @@ def Transcribe(file):
         if len(speech_segment.shape) > 1:
             speech_segment = speech_segment[:,0] + speech_segment[:,1]
         input_values = processor(speech_segment, sampling_rate=16_000, return_tensors="pt").input_values.to(device)
+        input_values = input_values.half()
         with torch.no_grad():
             logits = model(input_values).logits
         if len(logits.shape) == 1:
-            print("test")
             logits = logits.unsqueeze(0)
-        #predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(logits.cpu().numpy()).text
         transcripts.append(transcription[0])
 
@@ -77,7 +76,6 @@ def Transcribe(file):
         # Freeing up memory
         del input_values
         del logits
-        #del predicted_ids
         del transcription
         torch.cuda.empty_cache()
         gc.collect()
@@ -92,11 +90,6 @@ def Transcribe(file):
     return("./subtitle.sbv")
 
 demo = gr.Interface(fn=Transcribe, inputs=gr.File(), outputs="file")
-#with gr.Blocks() as demo:
-    #file_output = gr.Textbox()
-    #upload_button = gr.UploadButton("Click to Upload a sermon",
-    #                 file_types=["video", "audio"], file_count="multiple")
-    #upload_button.upload(Transcribe, upload_button, file_output)
 demo.launch()
 
 
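For reference, the pattern this commit applies is pulled out below as a self-contained sketch: cast the model weights to float16 with model.half(), then cast each input tensor to the same dtype before the forward pass. The checkpoint name "facebook/mms-1b-all" and the file "sample.wav" are assumptions (the diff only shows load_adapter("amh"), which matches Meta's MMS CTC checkpoints), and the sketch uses plain greedy CTC decoding rather than the app's processor.batch_decode(logits) path.

# Minimal fp16 inference sketch, assuming an MMS-style CTC checkpoint.
# "facebook/mms-1b-all" and "sample.wav" are illustrative, not taken from the diff.
import torch
import librosa
from transformers import AutoProcessor, Wav2Vec2ForCTC

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("facebook/mms-1b-all")
model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")
processor.tokenizer.set_target_lang("amh")  # Amharic, as in the app
model.load_adapter("amh")                   # swap in the Amharic adapter weights
model.to(device)
if device == "cuda":
    model.half()  # fp16 weights: roughly half the memory, faster GPU matmuls

speech, _ = librosa.load("sample.wav", sr=16_000)  # load at the model's 16 kHz rate
input_values = processor(
    speech, sampling_rate=16_000, return_tensors="pt"
).input_values.to(device)
if device == "cuda":
    input_values = input_values.half()  # input dtype must match the weights

with torch.no_grad():
    logits = model(input_values).logits  # (batch, frames, vocab)

predicted_ids = torch.argmax(logits, dim=-1)  # greedy CTC decode
print(processor.batch_decode(predicted_ids)[0])

Note that the diff casts unconditionally, which is fine when the Space runs on a GPU; many float16 kernels are slow or unimplemented on CPU in PyTorch, so guarding both casts on the device, as above, is the safer variant.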