|
from faster_whisper import WhisperModel |
|
import math |
|
import gradio as gr |
|
from moviepy import VideoFileClip |
|
import requests |
|
|
|
|
|
|
|
def extract_audio(input_video_name):
    """Extract the audio track of a video and write it to ``audio.mp3``.

    Args:
        input_video_name: Path of the video file to open with moviepy.

    Returns:
        A ``(mp3_file, duration)`` tuple: the path of the MP3 file that was
        written and the ``duration`` attribute of the audio clip.

    Raises:
        ValueError: If the video has no audio track.
    """
    mp3_file = "audio.mp3"

    video_clip = VideoFileClip(input_video_name)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                f"No audio track found in video: {input_video_name!r}"
            )
        try:
            duration = audio_clip.duration
            print(f"Audio duration: {duration}")
            audio_clip.write_audiofile(mp3_file)
        finally:
            audio_clip.close()
    finally:
        # Always release the underlying reader, even if the export failed.
        video_clip.close()

    print("Audio extraction successful!")
    return mp3_file, duration
|
|
|
def download_video(url, timeout=30):
    """Download a video over HTTP and save it as ``video.mp4``.

    Args:
        url: Address of the video to fetch.
        timeout: Seconds to wait for the server to connect / send data before
            giving up. Passed straight to ``requests.get``; it limits each
            wait on the socket, not the total download time.

    Returns:
        The path of the file that was written (``"video.mp4"``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    video_file = "video.mp4"

    # The context manager returns the streamed connection to the pool even
    # when the status check or the file write fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(video_file, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)

    print("Video downloaded successfully!")
    return video_file
|
|
|
def word_level_transcribe(audio, max_segment_duration=2.0):
    """Transcribe an audio file and return one entry per recognised word.

    Loads the "tiny" Whisper model on the CPU, runs it with VAD filtering and
    word timestamps enabled, prints every word with its time range, and
    collects the words into a flat list.

    Args:
        audio: Audio input handed to ``WhisperModel.transcribe``.
        max_segment_duration: Accepted for interface compatibility; the body
            does not use it.

    Returns:
        A list of dicts with the keys ``'word'``, ``'start'`` and ``'end'``,
        in the order the words were produced.
    """
    whisper = WhisperModel("tiny", device="cpu")
    vad_options = {"min_silence_duration_ms": 1500}
    result, _info = whisper.transcribe(
        audio,
        vad_filter=True,
        vad_parameters=vad_options,
        word_timestamps=True,
        log_progress=True,
    )

    # Collect every segment up front, before any word is printed.
    decoded_segments = list(result)

    words = []
    for seg in decoded_segments:
        for w in seg.words:
            print("[%.2fs -> %.2fs] %s" % (w.start, w.end, w.word))
            entry = {'word': w.word, 'start': w.start, 'end': w.end}
            words.append(entry)
    return words
|
|
|
# Characters that end a subtitle line when they are the last character of a word.
_LINE_BREAK_PUNCTUATION = frozenset(".!?,;:—-。")


def _build_subtitle(line):
    """Collapse a non-empty run of word entries into one subtitle entry."""
    # Word tokens may carry surrounding whitespace (Whisper prefixes words
    # with a space); strip it so the joined text has single spaces only.
    texts = (item["word"].strip() for item in line)
    return {
        "word": " ".join(text for text in texts if text),
        "start": line[0]["start"],
        "end": line[-1]["end"],
        "textcontents": list(line),
    }


def create_subtitles(wordlevel_info, max_words=5):
    """Group word-level timestamps into subtitle lines.

    A line is closed as soon as a word ends with a punctuation mark or the
    line holds ``max_words`` words. Afterwards each subtitle's end time is
    moved to the start of the following subtitle so consecutive entries are
    contiguous.

    Args:
        wordlevel_info: Iterable of dicts with the keys ``'word'``,
            ``'start'`` and ``'end'``.
        max_words: Maximum number of words per subtitle line.

    Returns:
        A list of dicts with the keys ``'word'`` (joined line text),
        ``'start'``, ``'end'`` and ``'textcontents'`` (the word entries that
        make up the line).

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    subtitles = []
    line = []
    for word_data in wordlevel_info:
        line.append(word_data)
        text = word_data["word"].strip()
        ends_with_punct = bool(text) and text[-1] in _LINE_BREAK_PUNCTUATION
        if ends_with_punct or len(line) >= max_words:
            subtitles.append(_build_subtitle(line))
            line = []

    # Flush the words left over after the last break.
    if line:
        subtitles.append(_build_subtitle(line))

    # Stretch every subtitle until the next one begins.
    for previous, current in zip(subtitles, subtitles[1:]):
        previous["end"] = current["start"]

    return subtitles
|
|
|
def format_time(seconds):
    """Format a time offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:05,250"`` for ``65.25``.
    """
    # Round once to whole milliseconds, then split with integer arithmetic.
    # Rounding the fractional part separately could yield ",1000" (for
    # example at 1.9996 s) instead of carrying into the seconds field.
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
|
|
def generate_subtitle_file(language, segments, input_video_name):
    """Write subtitle entries to an SRT file and return its path.

    Args:
        language: Language tag used in the file name.
        segments: Iterable of dicts with the keys ``'start'``, ``'end'`` and
            ``'word'`` (the subtitle text).
        input_video_name: Base name used in the file name.

    Returns:
        The path of the written file,
        ``"sub-<input_video_name>.<language>.srt"``.
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"

    entries = []
    for index, segment in enumerate(segments, start=1):
        segment_start = format_time(segment['start'])
        segment_end = format_time(segment['end'])
        # The space before each newline is kept so the file contents stay
        # exactly as they were.
        entries.append(
            f"{index} \n"
            f"{segment_start} --> {segment_end} \n"
            f"{segment['word']} \n"
            "\n"
        )

    # "with" closes the handle even if the write fails.
    with open(subtitle_file, "w", encoding='utf8') as f:
        f.write("".join(entries))

    return subtitle_file
|
|
|
def transcribe(video):
    """Run the full pipeline: extract audio, transcribe it, write an SRT file.

    Args:
        video: The value delivered by the Gradio file input; it is passed
            unchanged to ``extract_audio``.
            NOTE(review): presumably a file path — confirm against the
            Gradio version in use.

    Returns:
        A ``(subtitle_file, video, mp3_file)`` tuple: the SRT path, the input
        video value unchanged, and the extracted MP3 path.

    Raises:
        gr.Error: If no video was provided.
    """
    if not video:
        # Show a readable message in the UI instead of failing inside moviepy.
        raise gr.Error("Please upload a video file first.")

    # The duration is reported by extract_audio but not needed here.
    mp3_file, _duration = extract_audio(video)
    print("transcribe")
    wordlevel_info = word_level_transcribe(mp3_file)
    subtitles = create_subtitles(wordlevel_info)
    subtitle_file = generate_subtitle_file('fa', subtitles, 'video_subtitled')
    return subtitle_file, video, mp3_file
|
|
|
# Gradio front end: upload a video, click "Create", and receive the subtitle
# file, the video, and the extracted audio.
with gr.Blocks() as demo:
    # The instructions name the controls that actually exist in this UI
    # (a file upload and a "Create" button).
    gr.Markdown("Upload a video below and then click **Create** to generate its subtitle file.")
    with gr.Column():
        # Input: the uploaded video file (name kept for compatibility).
        url = gr.File(label="Input video")
        # Outputs, filled from the tuple returned by transcribe().
        srt_file = gr.File(label="Subtitle file (.srt)")
        btn = gr.Button("Create")
        video_file_output = gr.Video(label="Result Video")
        mp3_file = gr.Audio(type="filepath", label="Extracted audio")

    btn.click(
        fn=transcribe,
        inputs=url,
        outputs=[srt_file, video_file_output, mp3_file],
    )

demo.launch(debug=True)