Improved splitting and its display to the user

- main.py +111 -47
- templates/index.html +34 -4
main.py
CHANGED
```diff
@@ -4,7 +4,7 @@ from fastapi.templating import Jinja2Templates
 from google import genai
 from google.genai import types
 from pydub import AudioSegment
-from pydub.silence import split_on_silence
+from pydub.silence import split_on_silence, detect_silence
 import yaml
 import json
 import io
@@ -12,7 +12,7 @@ import os
 from datetime import timedelta
 import logging
 import asyncio
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field
 
 # --- Setup and Constants ---
 logging.basicConfig(level=logging.INFO)
@@ -20,14 +20,22 @@ app = FastAPI()
 templates = Jinja2Templates(directory="templates")
 
 # --- Audio Splitting Constants ---
+# Target length for an audio chunk (e.g., 10 minutes)
+TARGET_CHUNK_DURATION_MIN = 10
+TARGET_CHUNK_DURATION_MS = TARGET_CHUNK_DURATION_MIN * 60 * 1000
+
+# Minimum chunk length before we start looking for an alternative split point (e.g., 7 minutes)
+MIN_SPLIT_SEARCH_START_MIN = 7
+MIN_SPLIT_SEARCH_START_MS = MIN_SPLIT_SEARCH_START_MIN * 60 * 1000
+
+# Maximum chunk length within which we look for silence before forcing a cut (e.g., 14 minutes)
+MAX_SPLIT_SEARCH_END_MIN = 14
+MAX_SPLIT_SEARCH_END_MS = MAX_SPLIT_SEARCH_END_MIN * 60 * 1000
+
 SILENCE_THRESH_DB = -30
 MIN_SILENCE_LEN_MS = 500
-NO_SPLIT_DURATION_MIN = 14
 
-# ---
-# This class replaces schema.json. It's type-safe and recommended by Google.
+# --- Pydantic Schema Definition ---
 class TranscriptionSegment(BaseModel):
     id: int = Field(description="מספר סידורי של הכתובית", ge=1)
     start_time: str = Field(description="שעת התחלה בפורמט HH:MM:SS,mmm")
```
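
Since `TranscriptionSegment` replaces the old `schema.json`, the response shape can be sanity-checked locally before it is wired into the SDK. A minimal sketch, assuming pydantic v2 and declaring only the two fields visible in the hunk (the real class presumably carries more, e.g. an end time and the subtitle text):

```python
from pydantic import BaseModel, Field, TypeAdapter

# Hypothetical stand-in for TranscriptionSegment with just the visible fields.
class Segment(BaseModel):
    id: int = Field(ge=1)
    start_time: str

# With response_schema=list[...], Gemini returns a JSON array, so the same
# shape validates with a TypeAdapter over list[Segment]:
raw = '[{"id": 1, "start_time": "00:00:01,000"}]'
segments = TypeAdapter(list[Segment]).validate_json(raw)
print(segments[0].start_time)  # 00:00:01,000
```
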
```diff
@@ -36,7 +44,6 @@ class TranscriptionSegment(BaseModel):
 
 # --- Helper functions ---
 
-# <-- CHANGE: This function now only loads the system prompt.
 def load_system_prompt():
     """Loads the system_prompt from an external file."""
     try:
@@ -50,35 +57,83 @@ def load_system_prompt():
         logging.error(f"Error loading configuration: {e}")
         raise HTTPException(status_code=500, detail=f"שגיאת שרת: בעיה בטעינת ההגדרות: {e}")
 
+# NEW: Function to format milliseconds to HH:MM:SS
+def format_ms_to_hms(ms):
+    td = timedelta(milliseconds=ms)
+    minutes, seconds = divmod(td.seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    hours += td.days * 24  # Handle durations > 24 hours
+    return f"{hours:02}:{minutes:02}:{seconds:02}"
+
+def split_audio_smart(audio_segment, silence_thresh, min_silence_len):
+    """
+    Splits audio into chunks, preferring silence points, within defined cut boundaries.
+    - Tries to keep chunks around TARGET_CHUNK_DURATION_MS (10 minutes).
+    - Searches for a silence point in this order:
+      1. The first silence starting between MIN_SPLIT_SEARCH_START_MS (7 minutes) and TARGET_CHUNK_DURATION_MS (10 minutes).
+      2. If none is found, the first silence starting between TARGET_CHUNK_DURATION_MS (10 minutes) and MAX_SPLIT_SEARCH_END_MS (14 minutes).
+    - If no suitable silence is found in either range (7-14 minutes), a forced cut is made at MIN_SPLIT_SEARCH_START_MS (7 minutes).
+    """
+    logging.info(f"Smart splitting: Target Chunk {TARGET_CHUNK_DURATION_MIN}m, Min Split Search Start {MIN_SPLIT_SEARCH_START_MIN}m, Max Split Search End {MAX_SPLIT_SEARCH_END_MIN}m")
+
     final_chunks = []
+    current_offset = 0
+    total_length = len(audio_segment)
+
+    while current_offset < total_length:
+        remaining_audio = audio_segment[current_offset:]
+
+        # If the remaining audio is shorter than or equal to MAX_SPLIT_SEARCH_END_MS (14 minutes), take it as the last chunk and finish.
+        # This also handles files shorter than 14 minutes, which are never split in the first place.
+        if len(remaining_audio) <= MAX_SPLIT_SEARCH_END_MS:
+            final_chunks.append(remaining_audio)
+            break
+
+        # Define the audio slice to scan for silence. We search up to MAX_SPLIT_SEARCH_END_MS from the current start.
+        segment_for_silence_detection = remaining_audio[:MAX_SPLIT_SEARCH_END_MS]
+
+        # Detect silences within the current slice. Positions are relative to the start of segment_for_silence_detection.
+        silences = detect_silence(
+            segment_for_silence_detection,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_thresh
+        )
+
+        # The chosen cut point, relative to the start of the current chunk.
+        split_point_relative_to_chunk_start = -1
+
+        # 1. Search for silence in the preferred range (7 minutes up to 10 minutes)
+        for s_start, s_end in silences:
+            if MIN_SPLIT_SEARCH_START_MS <= s_start < TARGET_CHUNK_DURATION_MS:
+                # A preferred split point was found. Use the end of the silence as the cut point.
+                split_point_relative_to_chunk_start = s_end
+                break
+
+        # 2. If no preferred silence was found, search the extended range (10 minutes up to 14 minutes)
+        if split_point_relative_to_chunk_start == -1:
+            for s_start, s_end in silences:
+                if TARGET_CHUNK_DURATION_MS <= s_start < MAX_SPLIT_SEARCH_END_MS:
+                    # A split point was found in the extended range. Use the end of the silence as the cut point.
+                    split_point_relative_to_chunk_start = s_end
+                    break
+
+        # 3. If no suitable silence was found in either range (7-14 minutes)
+        if split_point_relative_to_chunk_start == -1:
+            # Per the user's request: "cut after exactly 7 minutes"
+            logging.warning(f"No suitable silence found between {MIN_SPLIT_SEARCH_START_MIN}m and {MAX_SPLIT_SEARCH_END_MIN}m. Performing hard cut at {MIN_SPLIT_SEARCH_START_MIN}m.")
+            split_point_relative_to_chunk_start = MIN_SPLIT_SEARCH_START_MS
+
+        # Make sure the split point does not exceed the length of the remaining audio (as a safety measure).
+        split_point_relative_to_chunk_start = min(split_point_relative_to_chunk_start, len(remaining_audio))
+
+        # Append the selected chunk and advance the offset.
+        final_chunks.append(remaining_audio[:split_point_relative_to_chunk_start])
+        current_offset += split_point_relative_to_chunk_start
+
     logging.info(f"File successfully split into {len(final_chunks)} chunks.")
     logging.info(f"Chunk durations (seconds): {[round(len(c) / 1000) for c in final_chunks]}")
     return final_chunks
 
-# <-- CHANGE: Function now accepts a Pydantic model instead of a JSON schema object.
 def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
     """Sends a single audio chunk to Gemini and receives JSON conforming to the Pydantic schema."""
     try:
```
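
The three-step search order in `split_audio_smart` is easy to verify in isolation. A self-contained sketch (the `choose_split_point` helper and its inlined constants are illustrative, not part of this commit) that mirrors steps 1-3 on synthetic `(start_ms, end_ms)` silence windows:

```python
# Inlined equivalents of MIN_SPLIT_SEARCH_START_MS, TARGET_CHUNK_DURATION_MS,
# and MAX_SPLIT_SEARCH_END_MS.
MIN_MS, TARGET_MS, MAX_MS = 7 * 60_000, 10 * 60_000, 14 * 60_000

def choose_split_point(silences):
    # 1. First silence starting in the preferred window [7 min, 10 min).
    for s_start, s_end in silences:
        if MIN_MS <= s_start < TARGET_MS:
            return s_end
    # 2. Otherwise, first silence starting in the extended window [10 min, 14 min).
    for s_start, s_end in silences:
        if TARGET_MS <= s_start < MAX_MS:
            return s_end
    # 3. Otherwise, force the cut at 7 minutes.
    return MIN_MS

print(choose_split_point([(8 * 60_000, 8 * 60_000 + 800)]))    # 480800 (preferred window)
print(choose_split_point([(11 * 60_000, 11 * 60_000 + 600)]))  # 660600 (extended window)
print(choose_split_point([(2 * 60_000, 2 * 60_000 + 900)]))    # 420000 (forced cut)
```

These boundaries are what `format_ms_to_hms` later renders for the user; `format_ms_to_hms(480800)`, for example, yields `00:08:00`, since sub-second remainders are truncated.
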
```diff
@@ -95,8 +150,6 @@ def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model
             config=types.GenerateContentConfig(
                 system_instruction=system_prompt,
                 response_mime_type="application/json",
-                # <-- CHANGE: Pass the Pydantic model directly to the SDK.
-                # The `list[]` indicates we expect a list of these objects.
                 response_schema=list[pydantic_schema]
             )
         )
```
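
The hunk shows only the request config; for orientation, here is a hedged sketch of what the surrounding `transcribe_chunk` call plausibly looks like with the google-genai SDK. The client setup, the audio encoding, and the use of `response.parsed` are assumptions, not code from this commit:

```python
import io
from google import genai
from google.genai import types

def transcribe_chunk_sketch(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
    client = genai.Client(api_key=api_key)

    # pydub AudioSegment -> encoded bytes the API can accept.
    buf = io.BytesIO()
    chunk_audio.export(buf, format="mp3")

    response = client.models.generate_content(
        model=model_name,
        contents=[types.Part.from_bytes(data=buf.getvalue(), mime_type="audio/mp3")],
        config=types.GenerateContentConfig(
            system_instruction=system_prompt,
            response_mime_type="application/json",
            response_schema=list[pydantic_schema],
        ),
    )
    # When a response_schema is set, the SDK exposes the validated objects directly.
    return response.parsed
```
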
```diff
@@ -149,7 +202,6 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
         return json.dumps(event_data) + "\n\n"
 
     try:
-        # <-- CHANGE: Load only the system prompt and use the Pydantic class.
         system_prompt = load_system_prompt()
         pydantic_schema = TranscriptionSegment
 
@@ -157,20 +209,33 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
 
         audio = AudioSegment.from_file(io.BytesIO(file_content))
         duration_min = len(audio) / (1000 * 60)
-        else:
-            yield send_event("progress", f"אורך הקובץ ({duration_min:.1f} דקות) ארוך, מבצע חלוקה חכמה...", 15)
-            chunks = await asyncio.to_thread(
-                split_audio_smart, audio, MAX_CHUNK_DURATION_MS, SILENCE_THRESH_DB, MIN_SILENCE_LEN_MS
-            )
+
+        yield send_event("progress", f"אורך הקובץ {duration_min:.1f} דקות. מבצע חלוקה חכמה...", 15)
+        chunks = await asyncio.to_thread(
+            split_audio_smart, audio, SILENCE_THRESH_DB, MIN_SILENCE_LEN_MS
+        )
 
         if not chunks:
            raise ValueError("לא נוצרו מקטעי שמע לעיבוד.")
 
+        # NEW: Calculate and send chunk timestamps
+        chunk_info_messages = []
+        current_cumulative_offset = 0
+        for i, ch in enumerate(chunks):
+            chunk_start_ms = current_cumulative_offset
+            chunk_end_ms = current_cumulative_offset + len(ch)
+            chunk_info_messages.append(
+                f"{i+1}. {format_ms_to_hms(chunk_start_ms)} - {format_ms_to_hms(chunk_end_ms)}"
+            )
+            current_cumulative_offset += len(ch)
+
+        yield send_event(
+            "chunk_timestamps",  # New event type
+            message="השמע חולק למקטעים בנקודות הבאות:",
+            data="\n".join(chunk_info_messages)
+        )
+        # End NEW
+
         yield send_event("progress", f"הקובץ חולק ל-{len(chunks)} מקטעים. מתחיל תמלול עם מודל {model_name}...", 20)
 
         all_segs, offset = [], 0
```
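
A note on the wire format: `send_event` (its body sits mostly outside this diff) evidently serializes one JSON object per event, terminated by a blank line, and the whole generator is served as `text/event-stream`. A sketch of a plausible `chunk_timestamps` payload, with keys inferred from the frontend handler (`event.type`, `event.message`, `event.data`):

```python
import json

# Hypothetical event dict; the real send_event signature is not shown in the diff.
event_data = {
    "type": "chunk_timestamps",
    "message": "השמע חולק למקטעים בנקודות הבאות:",
    "data": "1. 00:00:00 - 00:09:45\n2. 00:09:45 - 00:19:30",
}
payload = json.dumps(event_data) + "\n\n"  # exactly what the generator yields
```
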
```diff
@@ -180,7 +245,6 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
             progress_percent = 20 + int((i / total_chunks) * 75)
             yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
 
-            # <-- CHANGE: Pass the Pydantic schema class to the transcription function.
             data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, pydantic_schema, model_name)
 
             if error_msg:
@@ -228,4 +292,4 @@ async def handle_transcription_stream(
 
     file_content = await audio_file.read()
 
-    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
+    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
```
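
Because the endpoint streams newline-delimited JSON rather than strict `data:`-framed SSE, it can be consumed from Python as easily as from the browser. A hedged sketch, assuming a `/transcribe` route and form field names matching the handler signature (only `audio_file` is visible in this diff; the route and the other field names are assumptions):

```python
import json
import httpx

with httpx.stream(
    "POST",
    "http://localhost:8000/transcribe",            # hypothetical route
    data={"api_key": "...", "model_name": "..."},  # assumed form fields
    files={"audio_file": open("lecture.mp3", "rb")},
    timeout=None,
) as response:
    for line in response.iter_lines():
        if not line.strip():
            continue  # skip the blank separators between events
        event = json.loads(line)
        if event["type"] == "chunk_timestamps":
            print(event["message"])
            print(event["data"])
        elif event["type"] == "progress":
            print(f'{event.get("percent", "")}% {event["message"]}')
```
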
templates/index.html
CHANGED
```diff
@@ -57,7 +57,7 @@
         #status-container { margin-top: 1.5rem; display: none; }
         #status-message { text-align: center; padding: 1rem; font-weight: 600; border-radius: var(--border-radius-small) var(--border-radius-small) 0 0; }
         #status-message.loading { background-color: var(--md-sys-color-primary-container); color: var(--md-sys-color-on-primary-container); }
-        #status-message.error { background-color: var(--md-sys-color-error-container); color: var(--md-sys-color-on-error-container); }
+        #status-message.error { background-color: var(--md-sys-color-error-container); color: var(--md-sys-color-on-error-container); }
         #progress-bar-container { width: 100%; background-color: var(--md-sys-color-surface-variant); border-radius: 0 0 var(--border-radius-small) var(--border-radius-small); overflow: hidden; height: 8px; }
         #progress-bar { width: 0%; height: 100%; background-color: var(--md-sys-color-primary); transition: width 0.3s ease-in-out; }
         #progress-bar.error { background-color: var(--md-sys-color-error); }
@@ -81,7 +81,7 @@
                 </small>
             </div>
 
-            <!--
+            <!-- Model Selection -->
             <div class="input-group">
                 <label for="model-select">בחר מודל</label>
                 <select id="model-select">
@@ -116,6 +116,25 @@
             </div>
         </div>
 
+        <!-- NEW: Section for displaying chunk cut times -->
+        <div id="chunk-info-section" class="card" style="margin-top: 1.5rem; display: none;">
+            <h2>חלוקת קובץ השמע למקטעים</h2>
+            <p id="chunk-info-message" style="margin-bottom: 1rem;"></p>
+            <pre id="chunk-timestamps-output" style="
+                background-color: var(--md-sys-color-surface);
+                padding: 1rem;
+                border-radius: var(--border-radius-small);
+                border: 1px solid var(--md-sys-color-outline);
+                font-family: monospace;
+                font-size: 0.9rem;
+                direction: ltr; /* Ensure LTR for timestamps */
+                text-align: left;
+                max-height: 200px;
+                overflow-y: auto;
+            "></pre>
+        </div>
+        <!-- END NEW -->
+
         <section id="results-section" class="card">
             <h2>תוצאות התמלול (SRT)</h2>
             <textarea id="srt-output" readonly></textarea>
@@ -145,6 +164,10 @@
         const resultsSection = document.getElementById('results-section');
         const srtOutput = document.getElementById('srt-output');
         const downloadButton = document.getElementById('download-button');
+        // NEW: Element selections for chunk info
+        const chunkInfoSection = document.getElementById('chunk-info-section');
+        const chunkInfoMessage = document.getElementById('chunk-info-message');
+        const chunkTimestampsOutput = document.getElementById('chunk-timestamps-output');
 
         let audioFile = null;
 
@@ -165,10 +188,13 @@
             submitButton.disabled = false;
             statusContainer.style.display = 'none';
             resultsSection.style.display = 'none';
+            chunkInfoSection.style.display = 'none'; // NEW: Hide chunk info section
+            chunkTimestampsOutput.textContent = ''; // NEW: Clear chunk info
+            chunkInfoMessage.textContent = ''; // NEW: Clear chunk info message
             updateStatus("", 0);
         }
 
-        // ---
+        // --- API Key Persistence ---
         function loadApiKey() {
             const savedKey = localStorage.getItem('geminiApiKey');
             if (savedKey) {
@@ -194,7 +220,7 @@
             checkInputs();
         });
 
-        //
+        // Model selection logic
         modelSelect.addEventListener('change', () => {
             modelCustomInput.style.display = (modelSelect.value === 'custom') ? 'block' : 'none';
         });
@@ -273,6 +299,10 @@
 
                 if (event.type === 'progress') {
                     updateStatus(event.message, event.percent);
+                } else if (event.type === 'chunk_timestamps') { // NEW: Handle chunk timestamps
+                    chunkInfoSection.style.display = 'block';
+                    chunkInfoMessage.textContent = event.message;
+                    chunkTimestampsOutput.textContent = event.data;
                 } else if (event.type === 'result') {
                     updateStatus(event.message, event.percent);
                     srtOutput.value = event.data;
```