|
import os |
|
import streamlit as st |
|
from transformers import WhisperForConditionalGeneration, WhisperProcessor |
|
import torch |
|
import librosa |
|
import srt |
|
from datetime import timedelta |
|
|
|
|
|
def split_audio(audio, sr, segment_duration=5):
    """Chop an audio signal into consecutive fixed-length chunks.

    Args:
        audio: 1-D sequence of samples (e.g. numpy array from librosa.load).
        sr: sample rate of *audio* in Hz.
        segment_duration: target chunk length in seconds (default 5).

    Returns:
        List of chunks in order; the final chunk may be shorter than
        ``segment_duration`` when the signal length is not a multiple of it.
    """
    # Hoist the chunk size so the conversion happens once, not per iteration.
    step = int(segment_duration * sr)
    return [audio[start:start + step] for start in range(0, len(audio), step)]
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the fine-tuned Whisper model and its processor from the Hub.

    Decorated with ``st.cache_resource`` so the (expensive) download/load
    happens once per Streamlit server process, not on every UI rerun.

    Returns:
        (model, processor) tuple ready for generation/decoding.
    """
    whisper_model = WhisperForConditionalGeneration.from_pretrained("lcjln/AIME_Project_The_Final")
    whisper_processor = WhisperProcessor.from_pretrained("lcjln/AIME_The_Final")
    return whisper_model, whisper_processor
|
|
|
# Load the model/processor once (cached by st.cache_resource) so UI reruns
# do not reload the weights.
model, processor = load_model()


# Page title. NOTE(review): the Korean UI strings throughout this file look
# mojibake-encoded — re-save the source as UTF-8 to restore readable text.
st.title("Whisper ์๋ง ์์ฑ๊ธฐ")


# Multi-file WAV uploader; yields a list of UploadedFile objects, or None
# until the user drops at least one file.
uploaded_files = st.file_uploader("์ฌ๊ธฐ์ WAV ํ์ผ๋ค์ ๋๋๊ทธ ์ค ๋๋กญ ํ์ธ์", type=["wav"], accept_multiple_files=True)
|
|
|
|
|
if uploaded_files:
    # NOTE(review): the Korean UI strings appear mojibake-encoded in this file;
    # they are kept as-is (with broken mid-string line breaks rejoined) — restore
    # them from the original UTF-8 source when possible.
    st.write("์๋ก๋๋ ํ์ผ ๋ชฉ๋ก:")
    for uploaded_file in uploaded_files:
        st.write(uploaded_file.name)

    if st.button("์คํ"):
        combined_subs = []            # srt.Subtitle entries accumulated across all files
        last_end_time = timedelta(0)  # running cursor on the combined timeline
        subtitle_index = 1            # SRT cues are 1-based

        for uploaded_file in uploaded_files:
            st.write(f"์ฒ๋ฆฌ ์ค: {uploaded_file.name}")

            progress_bar = st.progress(0)

            # Decode the upload at 16 kHz (the rate Whisper models expect);
            # librosa accepts the file-like UploadedFile directly.
            st.write("์ค๋์ค ํ์ผ์ ์ฒ๋ฆฌํ๋ ์ค์๋๋ค...")
            audio, sr = librosa.load(uploaded_file, sr=16000)
            progress_bar.progress(50)

            st.write("๋ชจ๋ธ์ ํตํด ์๋ง์ ์์ฑํ๋ ์ค์๋๋ค...")
            segments = split_audio(audio, sr, segment_duration=5)

            for i, segment in enumerate(segments):
                inputs = processor(segment, return_tensors="pt", sampling_rate=16000)
                with torch.no_grad():
                    outputs = model.generate(inputs["input_features"], max_length=2048, return_dict_in_generate=True, output_scores=True)

                transcription = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0].strip()

                # Crude confidence proxy: mean logit of the final decoding step.
                # NOTE(review): not a calibrated probability — consider
                # sequence-level scores if filtering quality matters.
                avg_logit_score = torch.mean(outputs.scores[-1]).item()

                # BUGFIX: always advance the timeline by the segment's real
                # duration. The original advanced only on *accepted* segments,
                # so every subtitle after a rejected/silent segment drifted
                # earlier than the audio it belongs to.
                seg_start = last_end_time
                seg_end = seg_start + timedelta(seconds=librosa.get_duration(y=segment, sr=sr))

                # Keep only non-empty transcriptions above the confidence floor.
                if transcription and avg_logit_score > -5.0:
                    combined_subs.append(
                        srt.Subtitle(
                            index=subtitle_index,
                            start=seg_start,
                            end=seg_end,
                            content=transcription
                        )
                    )
                    subtitle_index += 1

                last_end_time = seg_end

                # Smooth 50% -> 100% across segments instead of one jump at the end.
                progress_bar.progress(50 + int(50 * (i + 1) / len(segments)))

            progress_bar.progress(100)
            st.success(f"{uploaded_file.name}์ ์๋ง์ด ์ฑ๊ณต์ ์ผ๋ก ์์ฑ๋์์ต๋๋ค!")

        st.write("์ต์ข SRT ํ์ผ์ ์์ฑํ๋ ์ค์๋๋ค...")
        srt_content = srt.compose(combined_subs)

        final_srt_file_path = "combined_output.srt"
        with open(final_srt_file_path, "w", encoding="utf-8") as f:
            f.write(srt_content)

        st.success("์ต์ข SRT ํ์ผ์ด ์ฑ๊ณต์ ์ผ๋ก ์์ฑ๋์์ต๋๋ค!")

        with open(final_srt_file_path, "rb") as srt_file:
            # "text/srt" is not a registered MIME type; application/x-subrip is
            # the conventional type for SubRip files.
            st.download_button(label="SRT ํ์ผ ๋ค์ด๋ก๋", data=srt_file, file_name=final_srt_file_path, mime="application/x-subrip")