Raphael committed
Improve translation and subtitles sync
Signed-off-by: Raphael <oOraph@users.noreply.github.com>

app.py CHANGED
@@ -10,6 +10,7 @@ import gradio as gr
 import moviepy.editor as mp
 import numpy as np
 import pysrt
+import re
 import torch
 from transformers import pipeline
 import yt_dlp
@@ -22,9 +23,10 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
 LOG = logging.getLogger(__name__)
 CLIP_SECONDS = 20
 SLICES = 4
-SLICE_DURATION = CLIP_SECONDS / SLICES
+# SLICE_DURATION = CLIP_SECONDS / SLICES
 # At most 6 mins
 MAX_CHUNKS = 45
+SENTENCE_SPLIT = re.compile(r'([^.?!]*[.?!]+)([^.?!].*|$)')
 
 asr_kwargs = {
     "task": "automatic-speech-recognition",
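
For reference, the new SENTENCE_SPLIT pattern captures the first sentence of a chunk (including its closing punctuation) in group 1 and whatever follows in group 2; text with no sentence terminator at all does not match, which is what the warning branch further down handles. A quick sketch of its behaviour, separate from the commit itself:

import re

SENTENCE_SPLIT = re.compile(r'([^.?!]*[.?!]+)([^.?!].*|$)')

m = SENTENCE_SPLIT.match("was sitting on the mat. The dog barked. Loudly.")
print(m.group(1))  # 'was sitting on the mat.' -- first sentence, punctuation kept
print(m.group(2))  # ' The dog barked. Loudly.' -- the remainder

print(SENTENCE_SPLIT.match("no terminator here"))  # None -> caller merges segments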
@@ -118,7 +120,7 @@ def process_video(basedir: str, duration, translate: bool):
     subs = translation(transcriptions, translate)
     srt_file = build_srt_clips(subs, basedir)
     summary = summarize(transcriptions, translate)
-    return srt_file, ' '.join(subs).strip(), summary
+    return srt_file, ' '.join([s['text'].strip() for s in subs]).strip(), summary
 
 
 def transcription(audio_dir: str, duration):
@@ -141,74 +143,131 @@ def transcription(audio_dir: str, duration):
         t = asr(d, max_new_tokens=10000)
         transcriptions.extend(t)
 
-    transcriptions = [
+    transcriptions = [
+        {
+            'text': t['text'].strip(),
+            'start': i * CLIP_SECONDS * 1000,
+            'end': (i + 1) * CLIP_SECONDS * 1000
+        } for i, t in enumerate(transcriptions)
+    ]
+
+    if transcriptions:
+        transcriptions[0]['start'] += 2500
+
+    # Will improve the translation
+    segments = segments_on_sentence_boundaries(transcriptions)
+
     elapsed = time.time() - start
     LOG.info("Transcription done, elapsed %.2f seconds", elapsed)
-    return
+    return segments
+
+
+def segments_on_sentence_boundaries(segments):
+
+    LOG.info("Segmenting along sentence boundaries for better translations")
+
+    new_segments = []
+    i = 0
+    while i < len(segments):
+        s = segments[i]
+        text = s['text'].strip()
+        if not text:
+            i += 1
+            continue
+
+        if i == len(segments)-1:
+            new_segments.append(s)
+            break
+
+        next_s = segments[i+1]
+
+        next_text = next_s['text'].strip()
+        if not next_text or (text[-1] in ['.', '?', '!']):
+            new_segments.append(s)
+            i += 1
+            continue
+
+        m = SENTENCE_SPLIT.match(next_s['text'].strip())
+        if not m:
+            LOG.warning("Bad pattern matching on segment [%s], "
+                        "this should not be possible", next_s['text'])
+            s['end'] = next_s['end']
+            s['text'] = '{} {}'.format(s['text'].strip(), next_s['text'].strip())
+            new_segments.append(s)
+            i += 2
+        else:
+            before = m.group(1)
+            after = m.group(2)
+            next_segment_duration = next_s['end'] - next_s['start']
+            ratio = len(before) / len(next_text)
+            add_time = int(next_segment_duration * ratio)
+            s['end'] = s['end'] + add_time
+            s['text'] = '{} {}'.format(text, before)
+            next_s['start'] = next_s['start'] + add_time
+            next_s['text'] = after.strip()
+            new_segments.append(s)
+            i += 1
+
+    return new_segments
 
 
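
A worked example of what segments_on_sentence_boundaries does (made-up text, times in ms): when a clip ends mid-sentence, the head of the next clip is pulled back into it, and the boundary shifts forward in proportion to the characters moved:

segments = [
    {'text': 'The cat sat on', 'start': 0, 'end': 20000},
    {'text': 'the mat. Then it slept.', 'start': 20000, 'end': 40000},
]
merged = segments_on_sentence_boundaries(segments)
# 'the mat.' is 8 of 23 characters of the second clip, so
# int(20000 * 8 / 23) == 6956 ms move to the first segment:
# [{'text': 'The cat sat on the mat.', 'start': 0, 'end': 26956},
#  {'text': 'Then it slept.', 'start': 26956, 'end': 40000}]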
 def translation(transcriptions, translate):
+    translations_d = []
     if translate:
         LOG.info("Performing translation")
         start = time.time()
-        translations = translator(transcriptions)
-
+        translations = translator([t['text'] for t in transcriptions])
+        for i, t in enumerate(transcriptions):
+            tsl = t.copy()
+            tsl['text'] = translations[i]['translation_text'].strip()
+            translations_d.append(tsl)
         elapsed = time.time() - start
         LOG.info("Translation done, elapsed %.2f seconds", elapsed)
+        LOG.info('Translations %s', translations_d)
     else:
-
-    return
+        translations_d = transcriptions
+    return translations_d
 
 
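
The diff doesn't show how translator is constructed; assuming a standard transformers translation pipeline, the call and return shapes the new code relies on look like this (model name purely illustrative):

from transformers import pipeline

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")

segments = [{'text': 'Bonjour le monde.', 'start': 0, 'end': 20000}]
out = translator([t['text'] for t in segments])
# out == [{'translation_text': 'Hello world.'}]
# translation() copies each segment dict and swaps in 'translation_text',
# so the start/end timings computed above survive translation unchanged.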
 def summarize(transcriptions, translate):
     LOG.info("Generating video summary")
-    whole_text = ' '.join(
-    word_count = len(whole_text.split())
+    whole_text = ' '.join([t['text'].strip() for t in transcriptions])
+    # word_count = len(whole_text.split())
     summary = summarizer(whole_text)
     # min_length=word_count // 4 + 1,
     # max_length=word_count // 2 + 1)
-    summary = translation([summary[0]['summary_text']], translate)[0]
-    return summary
+    summary = translation([{'text': summary[0]['summary_text']}], translate)[0]
+    return summary['text']
 
 
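
Shape note: translation() now takes and returns segment dicts rather than bare strings, so summarize() wraps the summarizer output before translating and unwraps it afterwards. Roughly, given the summarization pipeline's output shape (which the committed code itself indexes):

summarizer_out = [{'summary_text': 'A short recap of the video.'}]
as_segments = [{'text': summarizer_out[0]['summary_text']}]
# translation(as_segments, translate)[0]['text'] -> final summary string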
-def
-    LOG.info("Building srt segments")
-
+def segment_slices(subtitles: list[str]):
+    LOG.info("Building srt segments slices")
+    slices = []
     for sub in subtitles:
-        chunks = np.array_split(sub.split(' '), SLICES)
-
-
-
-
-
-
-
-
-
-
-            'text': c.strip(),
-            'start': i * SLICE_DURATION,
-            'end': (i + 1) * SLICE_DURATION
-        })
-
-    return segments
+        chunks = np.array_split(sub['text'].split(' '), SLICES)
+        start = sub['start']
+        duration = sub['end'] - start
+        for i in range(0, SLICES):
+            s = {
+                'text': ' '.join(chunks[i]),
+                'start': start + i * duration / SLICES,
+                'end': start + (i+1) * duration / SLICES
+            }
+            slices.append(s)
+    return slices
 
 
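
To illustrate the new segment_slices: each (possibly re-timed) segment is cut into SLICES equal sub-intervals, and its words are spread across them with np.array_split. A small sketch with made-up input:

import numpy as np

SLICES = 4
sub = {'text': 'one two three four five six seven eight', 'start': 0, 'end': 20000}
chunks = np.array_split(sub['text'].split(' '), SLICES)
# 4 chunks of 2 words each; the resulting slices are 5000 ms apiece:
# {'text': 'one two',     'start': 0.0,     'end': 5000.0}
# {'text': 'three four',  'start': 5000.0,  'end': 10000.0}
# {'text': 'five six',    'start': 10000.0, 'end': 15000.0}
# {'text': 'seven eight', 'start': 15000.0, 'end': 20000.0}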
-def build_srt_clips(
+def build_srt_clips(segments, basedir):
 
     LOG.info("Generating subtitles")
-    segments =
+    segments = segment_slices(segments)
 
     LOG.info("Building srt clips")
-    max_text_len =
+    max_text_len = 45
     subtitles = pysrt.SubRipFile()
-    first = True
     for segment in segments:
-        start = segment['start']
-
-        start += 3000
-        first = False
-        end = segment['end'] * 1000
+        start = segment['start']
+        end = segment['end']
         text = segment['text']
         text = text.strip()
         if len(text) < max_text_len:
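
The rest of build_srt_clips is outside this diff; the visible part collects slices with millisecond start/end values and a pysrt.SubRipFile. Presumably each slice then becomes a SubRipItem, along these lines (a sketch under that assumption, not the committed code):

import pysrt

subs = pysrt.SubRipFile()
subs.append(pysrt.SubRipItem(
    index=1,
    start=pysrt.SubRipTime(milliseconds=0),
    end=pysrt.SubRipTime(milliseconds=5000),
    text='The cat sat on',
))
subs.save('clips.srt', encoding='utf-8')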
@@ -250,4 +309,5 @@ iface = gr.Interface(
         gr.Text(label="Full transcription")
     ])
 
+# iface.launch(server_name="0.0.0.0", server_port=6443)
 iface.launch()