NHLOCAL committed
Commit e6ed4d0 · 1 parent: 4ad247f

Dynamic detection of seconds/milliseconds

Files changed (1):
  1. main.py (+81 -88)
main.py CHANGED
@@ -9,6 +9,7 @@ import yaml
 import json
 import io
 import os
+import re
 from datetime import timedelta
 import logging
 import asyncio
@@ -48,27 +49,60 @@ def load_system_prompt():
         logging.error(f"Error loading instruct.yml: {e}")
         raise HTTPException(status_code=500, detail="Server configuration error.")
 
+# ---
+# *** The final, smart time-parsing function ***
+# Adaptively distinguishes between the HH:MM:SS and MM:SS:mmm formats
+# ---
 def parse_time_str_to_ms(time_str: str) -> int:
+    """
+    Parses a timestamp string into milliseconds with adaptive format detection.
+    Correctly interprets HH:MM:SS,mmm and MM:SS:mmm formats, even with inconsistent separators.
+    """
     if not isinstance(time_str, str):
-        raise TypeError("Time string must be a string.")
+        raise TypeError(f"Time string must be a string, got {type(time_str)}")
+
+    # Normalize decimal separator to period
     time_str = time_str.replace(',', '.')
+
+    # Find the last separator to distinguish format
+    last_colon_pos = time_str.rfind(':')
+    last_period_pos = time_str.rfind('.')
+
+    h, m, s, ms = 0, 0, 0, 0
+
     try:
-        if '.' in time_str:
-            parts = time_str.split('.')
-            hms_part, ms_part = parts[0], parts[1]
+        # Case 1: Format includes milliseconds (e.g., "MM:SS.mmm" or "HH:MM:SS.mmm")
+        if last_period_pos > last_colon_pos:
+            hms_part = time_str[:last_period_pos]
+            ms_part = time_str[last_period_pos+1:]
             ms = int(ms_part.ljust(3, '0')[:3])
+
+            time_components = list(map(int, hms_part.split(':')))
+            if len(time_components) == 3: h, m, s = time_components
+            elif len(time_components) == 2: m, s = time_components
+            elif len(time_components) == 1: s = time_components[0]
+
+        # Case 2: Format uses colon for milliseconds (e.g., "MM:SS:mmm")
+        # Or it's a standard HH:MM:SS format.
         else:
-            hms_part, ms = time_str, 0
-
-        time_components = list(map(int, hms_part.split(':')))
-        h, m, s = 0, 0, 0
-        if len(time_components) == 3: h, m, s = time_components
-        elif len(time_components) == 2: m, s = time_components
-        elif len(time_components) == 1: s = time_components[0]
-        else: raise ValueError("Too many ':' separators.")
+            components = list(map(int, time_str.split(':')))
+            # If the last component is > 59, it must be milliseconds
+            if len(components) >= 2 and components[-1] > 59:
+                ms = components[-1]
+                s = components[-2]
+                if len(components) == 3: m = components[0]
+                elif len(components) > 3: h, m = components[0], components[1]  # For very long times
+            # Otherwise, it's a standard HH:MM:SS format
+            else:
+                if len(components) == 3: h, m, s = components
+                elif len(components) == 2: m, s = components
+                elif len(components) == 1: s = components[0]
 
         return (h * 3600000) + (m * 60000) + (s * 1000) + ms
+
     except (ValueError, IndexError) as e:
-        raise ValueError(f"Could not parse time string: '{time_str}'. Error: {e}")
+        raise ValueError(f"Could not parse adaptive time string: '{time_str}'. Error: {e}")
+
 
 def format_ms_to_srt_time(ms: int) -> str:
     td = timedelta(milliseconds=ms)
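To sanity-check the adaptive detection above: the new parser reads a trailing component greater than 59 as milliseconds and otherwise falls back to plain H/M/S. A minimal sketch, assuming parse_time_str_to_ms from this commit is in scope:

cases = {
    "00:01:23,450": 83450,    # HH:MM:SS,mmm -> comma normalized, period branch
    "01:23.5": 83500,         # MM:SS.mmm -> short ms part padded to "500"
    "01:23:456": 83456,       # MM:SS:mmm -> last component > 59, read as ms
    "01:02:03": 3723000,      # plain HH:MM:SS -> every component <= 59
}
for text, expected in cases.items():
    assert parse_time_str_to_ms(text) == expected, text

One ambiguity remains by construction: a string whose components are all under 60, like "01:02:03", is always read as HH:MM:SS, so a genuine MM:SS:mmm timestamp with ms <= 59 would still be misread; the bounds check in validate_and_correct_segments below is the backstop for such cases.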
@@ -78,114 +112,82 @@ def format_ms_to_srt_time(ms: int) -> str:
     milliseconds = td.microseconds // 1000
     return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
 
+
 def find_silence_points_webrtcvad(audio_segment: AudioSegment, min_silence_len_ms: int, vad_aggressiveness: int):
-    # This function remains unchanged
     if audio_segment.frame_rate not in [8000, 16000, 32000, 48000]:
         audio_segment = audio_segment.set_frame_rate(16000)
-    if audio_segment.channels > 1:
-        audio_segment = audio_segment.set_channels(1)
-    if audio_segment.sample_width != 2:
-        audio_segment = audio_segment.set_sample_width(2)
+    if audio_segment.channels > 1: audio_segment = audio_segment.set_channels(1)
+    if audio_segment.sample_width != 2: audio_segment = audio_segment.set_sample_width(2)
     vad = webrtcvad.Vad(vad_aggressiveness)
     frame_duration_ms = 30
-    frame_size_bytes = int(audio_segment.frame_rate * (frame_duration_ms / 1000.0) * audio_segment.sample_width)
+    frame_size_bytes = int(audio_segment.frame_rate * (frame_duration_ms / 1000.0) * 2)
     silence_points_ms, silence_start_ms = [], None
-    raw_data = audio_segment.raw_data
-    num_frames = len(raw_data) // frame_size_bytes
+    raw_data, num_frames = audio_segment.raw_data, len(audio_segment.raw_data) // frame_size_bytes
     for i in range(num_frames):
-        start_byte, end_byte = i * frame_size_bytes, i * frame_size_bytes + frame_size_bytes
-        frame = raw_data[start_byte:end_byte]
+        frame = raw_data[i*frame_size_bytes:(i+1)*frame_size_bytes]
         if len(frame) < frame_size_bytes: break
-        is_speech = vad.is_speech(frame, audio_segment.frame_rate)
-        current_time_ms = i * frame_duration_ms
+        is_speech, current_time_ms = vad.is_speech(frame, audio_segment.frame_rate), i * frame_duration_ms
         if not is_speech:
            if silence_start_ms is None: silence_start_ms = current_time_ms
-        else:
-            if silence_start_ms is not None:
-                if current_time_ms - silence_start_ms >= min_silence_len_ms:
-                    silence_points_ms.append(silence_start_ms + (current_time_ms - silence_start_ms) // 2)
-                silence_start_ms = None
+        elif silence_start_ms is not None:
+            if current_time_ms - silence_start_ms >= min_silence_len_ms:
+                silence_points_ms.append(silence_start_ms + (current_time_ms - silence_start_ms) // 2)
+            silence_start_ms = None
     if silence_start_ms is not None and len(audio_segment) - silence_start_ms >= min_silence_len_ms:
         silence_points_ms.append(silence_start_ms + (len(audio_segment) - silence_start_ms) // 2)
     return silence_points_ms
 
+
 def split_audio_webrtcvad(audio_segment, min_silence_len):
-    # This function remains unchanged
     logging.info(f"Splitting with WebRTCVAD: Target Chunk {TARGET_CHUNK_DURATION_MIN}m, VAD Aggressiveness {VAD_AGGRESSIVENESS}")
     silence_points = find_silence_points_webrtcvad(audio_segment, min_silence_len, VAD_AGGRESSIVENESS)
     if not silence_points:
         logging.warning("WebRTCVAD found no significant silences. Splitting into fixed chunks.")
         return [audio_segment[i:i + TARGET_CHUNK_DURATION_MS] for i in range(0, len(audio_segment), TARGET_CHUNK_DURATION_MS)]
-    final_chunks, current_offset = [], 0
-    total_length = len(audio_segment)
+    final_chunks, current_offset, total_length = [], 0, len(audio_segment)
     while current_offset < total_length:
-        remaining_length = total_length - current_offset
-        if remaining_length <= MAX_SPLIT_SEARCH_END_MS:
+        if total_length - current_offset <= MAX_SPLIT_SEARCH_END_MS:
             final_chunks.append(audio_segment[current_offset:])
             break
         ideal_split_point = current_offset + TARGET_CHUNK_DURATION_MS
         candidate_points = [p for p in silence_points if (current_offset + MIN_SPLIT_SEARCH_START_MS) <= p < (current_offset + MAX_SPLIT_SEARCH_END_MS)]
         best_split_point = min(candidate_points, key=lambda p: abs(p - ideal_split_point)) if candidate_points else -1
-        split_at = best_split_point if best_split_point != -1 else current_offset + TARGET_CHUNK_DURATION_MS
+        split_at = best_split_point if best_split_point != -1 else ideal_split_point
         final_chunks.append(audio_segment[current_offset:int(split_at)])
         current_offset = int(split_at)
     logging.info(f"File successfully split into {len(final_chunks)} chunks using WebRTCVAD.")
     logging.info(f"Chunk durations (seconds): {[round(len(c) / 1000) for c in final_chunks]}")
     return final_chunks
 
-# ---
-# *** The correction function with the new, critical logic ***
-# ---
+
 def validate_and_correct_segments(segments_from_api, chunk_duration_ms):
-    corrected_segments = []
-    last_corrected_end_ms = 0
-
+    corrected_segments, last_corrected_end_ms = [], 0
     for seg in segments_from_api:
         try:
             start_ms = parse_time_str_to_ms(seg.get('start_time'))
             end_ms = parse_time_str_to_ms(seg.get('end_time'))
 
-            # --- NEW CRITICAL LOGIC TO PREVENT AVALANCHE ---
-            # If the start time is a hallucination (outside the chunk bounds),
-            # skip this segment entirely. This prevents it from wrecking
-            # last_corrected_end_ms and every segment after it.
+            # The improved parser makes this check much more reliable.
+            # We still keep it for true hallucinations.
             if start_ms >= chunk_duration_ms:
-                logging.warning(f"Skipping segment with hallucinatory start_time ({format_ms_to_srt_time(start_ms)}) outside of chunk duration ({format_ms_to_srt_time(chunk_duration_ms)}).")
+                logging.warning(f"Skipping segment with true hallucinatory start_time ({format_ms_to_srt_time(start_ms)}) outside of chunk duration ({format_ms_to_srt_time(chunk_duration_ms)}).")
                 continue
-            # --- END OF NEW LOGIC ---
-
-            # Trim the end time if it overruns (less destructive than a hallucinated start time)
-            if end_ms > chunk_duration_ms:
-                end_ms = chunk_duration_ms
-
-            # Fix inverted times
-            if start_ms >= end_ms:
-                end_ms = start_ms + 3000
-                end_ms = min(end_ms, chunk_duration_ms)
-
-            # Fix overlaps
-            if start_ms < last_corrected_end_ms:
-                start_ms = last_corrected_end_ms
-
-            # Final check
-            if start_ms >= end_ms:
-                logging.warning(f"Skipping segment after corrections resulted in zero/negative duration: {seg}")
-                continue
+
+            if end_ms > chunk_duration_ms: end_ms = chunk_duration_ms
+            if start_ms >= end_ms: end_ms = min(start_ms + 3000, chunk_duration_ms)
+            if start_ms < last_corrected_end_ms: start_ms = last_corrected_end_ms
+            if start_ms >= end_ms: continue
 
-            seg['start_time_relative'] = start_ms
-            seg['end_time_relative'] = end_ms
+            seg['start_time_relative'], seg['end_time_relative'] = start_ms, end_ms
             corrected_segments.append(seg)
-
             last_corrected_end_ms = end_ms
-
         except (ValueError, TypeError, KeyError) as e:
             logging.warning(f"Skipping segment due to invalid format or value: {seg}. Error: {e}")
             continue
-
     return corrected_segments
 
+
 def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
-    # This function remains unchanged
     try:
         client = genai.Client(api_key=api_key)
         buffer = io.BytesIO()
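For reference, the VAD loop above consumes 16-bit mono audio in 30 ms frames, so at 16 kHz each frame is 16000 * 0.030 * 2 = 960 bytes. The correction rules are easiest to confirm on fabricated input; below is a minimal sketch with hypothetical segment dicts (the field names match what this code reads), assuming validate_and_correct_segments from this commit is in scope:

chunk_ms = 60000  # a 60-second chunk
raw = [
    {"start_time": "00:05", "end_time": "00:12", "text": "kept as-is"},
    {"start_time": "00:10", "end_time": "00:20", "text": "overlap: start pushed to 12s"},
    {"start_time": "02:30", "end_time": "02:40", "text": "hallucinated start: skipped"},
    {"start_time": "00:55", "end_time": "01:10", "text": "end clamped to 60s"},
]
fixed = validate_and_correct_segments(raw, chunk_ms)
print([(s['start_time_relative'], s['end_time_relative']) for s in fixed])
# [(5000, 12000), (12000, 20000), (55000, 60000)]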
@@ -202,22 +204,15 @@ def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model
 def generate_srt_content(segments):
     lines = []
     for i, seg in enumerate(segments, 1):
-        lines.append(str(i))
-        start = format_ms_to_srt_time(seg['start_time_abs'])
-        end = format_ms_to_srt_time(seg['end_time_abs'])
-        lines.append(f"{start} --> {end}")
-        lines.append(seg['text'])
-        lines.append("")
+        lines.extend([str(i), f"{format_ms_to_srt_time(seg['start_time_abs'])} --> {format_ms_to_srt_time(seg['end_time_abs'])}", seg['text'], ""])
     return "\n".join(lines)
 
+
 async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name: str):
-    # This function's core logic remains unchanged, but it now benefits from the improved validation
     def send_event(type: str, message: str = "", percent: int = 0, data: str = ""):
         return json.dumps({"type": type, "message": message, "percent": percent, "data": data}) + "\n\n"
-
     try:
-        system_prompt = load_system_prompt()
-        pydantic_schema = TranscriptionSegment
+        system_prompt, pydantic_schema = load_system_prompt(), TranscriptionSegment
         yield send_event("progress", "מעבד את קובץ השמע...", 5)
         audio = AudioSegment.from_file(io.BytesIO(file_content))
         yield send_event("progress", f"אורך הקובץ {len(audio) / 60000:.1f} דקות. מבצע חלוקה...", 15)
@@ -228,28 +223,26 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
         yield send_event("progress", f"הקובץ חולק ל-{len(chunks)} מקטעים. מתחיל תמלול...", 20)
 
         all_segs, offset = [], 0
-        total_chunks = len(chunks)
         for i, ch in enumerate(chunks):
-            progress_percent = 20 + int((i / total_chunks) * 75)
-            yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
+            progress_percent = 20 + int(((i + 1) / len(chunks)) * 75)
+            yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {len(chunks)}...", progress_percent)
             data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, pydantic_schema, model_name)
             if error_msg: raise ValueError(f"שגיאה בעיבוד מקטע {i+1}: {error_msg}")
             if data and isinstance(data, list):
                 corrected_segments = validate_and_correct_segments(data, len(ch))
                 for seg in corrected_segments:
                     seg['start_time_abs'] = seg['start_time_relative'] + offset
-                    seg['end_time_abs'] = seg['end_time_relative'] + offset
+                    seg['end_time_abs'] = seg['end_time_relative'] + offset
                     all_segs.append(seg)
             offset += len(ch)
 
         if not all_segs: raise ValueError("התמלול נכשל. לא נוצר תוכן תקני.")
-        yield send_event("progress", "התמלול הושלם! יוצר קובץ SRT...", 98)
-        srt_content = generate_srt_content(all_segs)
-        yield send_event("result", "התהליך הושלם בהצלחה!", 100, data=srt_content)
+        yield send_event("result", "התהליך הושלם בהצלחה!", 100, data=generate_srt_content(all_segs))
     except Exception as e:
         logging.error(f"Streaming transcription failed: {e}", exc_info=True)
         yield send_event("error", f"אירעה שגיאה: {e}", 100)
 
+
 @app.get("/", response_class=HTMLResponse)
 async def read_root(request: Request):
     return templates.TemplateResponse("index.html", {"request": request})
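One behavioral detail the revised progress formula fixes: with the old (i / total_chunks) mapping, the first chunk transcribed at a reported 20% and the bar topped out below 95%. The new ((i + 1) / len(chunks)) mapping spreads evenly up to 95%; for a hypothetical four-chunk file:

for i in range(4):
    print(f"chunk {i + 1}: {20 + int(((i + 1) / 4) * 75)}%")
# chunk 1: 38%
# chunk 2: 57%
# chunk 3: 76%
# chunk 4: 95%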
 
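Finally, the condensed generate_srt_content still emits standard SRT blocks. A minimal sketch with two already-corrected segments (absolute times in ms, placeholder text), assuming generate_srt_content and format_ms_to_srt_time from this file are in scope:

segs = [
    {"start_time_abs": 0, "end_time_abs": 2500, "text": "first line"},
    {"start_time_abs": 2500, "end_time_abs": 6000, "text": "second line"},
]
print(generate_srt_content(segs))
# 1
# 00:00:00,000 --> 00:00:02,500
# first line
#
# 2
# 00:00:02,500 --> 00:00:06,000
# second line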