NHLOCAL committed on
Commit f3d614b · 1 Parent(s): d2b9a14

Fixed schema

Files changed (3)
  1. main.py +33 -38
  2. requirements.txt +1 -0
  3. schema.json +9 -6
main.py CHANGED
@@ -12,28 +12,37 @@ import os
 from datetime import timedelta
 import logging
 import asyncio
+from pydantic import BaseModel, Field  # <-- CHANGE: Import BaseModel and Field from pydantic
 
 # --- Setup and Constants ---
 logging.basicConfig(level=logging.INFO)
 app = FastAPI()
 templates = Jinja2Templates(directory="templates")
 
-# --- CHANGE 1: Updated constants as per user request ---
+# --- Audio Splitting Constants ---
 MAX_CHUNK_DURATION_MIN = 10
 MAX_CHUNK_DURATION_MS = MAX_CHUNK_DURATION_MIN * 60 * 1000
 SILENCE_THRESH_DB = -30
 MIN_SILENCE_LEN_MS = 600
-NO_SPLIT_DURATION_MIN = 11  # Files shorter than this won't be split
+NO_SPLIT_DURATION_MIN = 11
 
-# --- Helper functions (Only split_audio_smart is changed) ---
-def load_config():
-    """Loads the system_prompt and JSON schema from external files."""
+# --- CHANGE: Pydantic Schema Definition ---
+# This class replaces schema.json. It's type-safe and recommended by Google.
+class TranscriptionSegment(BaseModel):
+    id: int = Field(description="מספר סידורי של הכתובית", ge=1)
+    start_time: str = Field(description="שעת התחלה בפורמט HH:MM:SS,mmm")
+    end_time: str = Field(description="שעת סיום בפורמט HH:MM:SS,mmm")
+    text: str = Field(description="תוכן הכתובית")
+
+# --- Helper functions ---
+
+# <-- CHANGE: This function now only loads the system prompt.
+def load_system_prompt():
+    """Loads the system_prompt from an external file."""
     try:
         with open("instruct.yml", 'r', encoding='utf-8') as f:
             instruct_config = yaml.safe_load(f)
-        with open("schema.json", 'r', encoding='utf-8') as f:
-            json_schema_config = json.load(f)
-        return instruct_config['system_prompt'], json_schema_config
+        return instruct_config['system_prompt']
     except FileNotFoundError as e:
         logging.error(f"Configuration file not found: {e.filename}")
         raise HTTPException(status_code=500, detail=f"שגיאת שרת: הקובץ {e.filename} לא נמצא.")
@@ -41,55 +50,37 @@ def load_config():
     logging.error(f"Error loading configuration: {e}")
     raise HTTPException(status_code=500, detail=f"שגיאת שרת: בעיה בטעינת ההגדרות: {e}")
 
-# --- CHANGE 2: Completely rewritten split_audio_smart function for robust splitting ---
 def split_audio_smart(audio_segment, max_duration_ms, silence_thresh, min_silence_len):
-    """
-    Splits audio into chunks as close as possible to max_duration_ms.
-    The function breaks up chunks that are too long and merges ones that are too short.
-    """
+    """Splits audio into chunks as close as possible to max_duration_ms."""
     logging.info(f"Splitting audio smartly. Target duration: ~{max_duration_ms / 60000} mins. Silence threshold: {silence_thresh}dB.")
-
-    # Step 1: initial split on silent pauses
     raw_chunks = split_on_silence(
         audio_segment,
         min_silence_len=min_silence_len,
         silence_thresh=silence_thresh,
-        keep_silence=500  # keep half a second of silence for a natural transition
+        keep_silence=500
    )
-
    if not raw_chunks:
        logging.warning("No silence detected for smart splitting. Using fixed-size chunks.")
        return [audio_segment[i:i + max_duration_ms] for i in range(0, len(audio_segment), max_duration_ms)]
-
-    # Step 2: recombine and re-split to respect the length limit
    final_chunks = []
    current_recombined_chunk = AudioSegment.empty()
-
    for chunk in raw_chunks:
-        # If adding the next chunk would make this one too long, finalize the current chunk
        if len(current_recombined_chunk) + len(chunk) > max_duration_ms and len(current_recombined_chunk) > 0:
            final_chunks.append(current_recombined_chunk)
            current_recombined_chunk = AudioSegment.empty()
-
-        # Add the new chunk to the one currently being built
        current_recombined_chunk += chunk
-
-        # If the chunk being built exceeds the allowed length, break it up
        while len(current_recombined_chunk) > max_duration_ms:
            final_chunks.append(current_recombined_chunk[:max_duration_ms])
            current_recombined_chunk = current_recombined_chunk[max_duration_ms:]
-
-    # Append the last remainder, if any
    if len(current_recombined_chunk) > 0:
        final_chunks.append(current_recombined_chunk)
-
    logging.info(f"File successfully split into {len(final_chunks)} chunks.")
    logging.info(f"Chunk durations (seconds): {[round(len(c) / 1000) for c in final_chunks]}")
    return final_chunks
 
-
-def transcribe_chunk(chunk_audio, api_key, system_prompt, json_schema, model_name):
-    """Sends one audio chunk to Gemini and receives JSON, with improved error handling."""
+# <-- CHANGE: Function now accepts a Pydantic model instead of a JSON schema object.
+def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
+    """Sends one audio chunk to Gemini and receives JSON conforming to the Pydantic schema."""
     try:
         client = genai.Client(api_key=api_key)
         buffer = io.BytesIO()
@@ -104,7 +95,9 @@ def transcribe_chunk(chunk_audio, api_key, system_prompt, json_schema, model_nam
         config=types.GenerateContentConfig(
             system_instruction=system_prompt,
             response_mime_type="application/json",
-            response_schema=json_schema
+            # <-- CHANGE: Pass the Pydantic model directly to the SDK.
+            # The `list[]` indicates we expect a list of these objects.
+            response_schema=list[pydantic_schema]
         )
     )
 
@@ -150,15 +143,16 @@ def generate_srt_content(segments):
 
 # --- Streaming Transcription Logic ---
 async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name: str):
-    """
-    An async generator that performs transcription and yields progress updates.
-    """
+    """An async generator that performs transcription and yields progress updates."""
     def send_event(type: str, message: str = "", percent: int = 0, data: str = ""):
         event_data = {"type": type, "message": message, "percent": percent, "data": data}
         return json.dumps(event_data) + "\n\n"
 
     try:
-        system_prompt, json_schema = load_config()
+        # <-- CHANGE: Load only the system prompt and use the Pydantic class.
+        system_prompt = load_system_prompt()
+        pydantic_schema = TranscriptionSegment
+
         yield send_event("progress", "מעבד את קובץ השמע...", 5)
 
         audio = AudioSegment.from_file(io.BytesIO(file_content))
@@ -186,7 +180,8 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
             progress_percent = 20 + int((i / total_chunks) * 75)
             yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
 
-            data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, json_schema, model_name)
+            # <-- CHANGE: Pass the Pydantic schema class to the transcription function.
+            data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, pydantic_schema, model_name)
 
             if error_msg:
                 raise ValueError(f"שגיאה בעיבוד מקטע {i+1}: {error_msg}")
@@ -233,4 +228,4 @@ async def handle_transcription_stream(
 
     file_content = await audio_file.read()
 
-    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
+    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
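
For reference, a minimal, self-contained sketch of the structured-output flow this commit adopts: pass the Pydantic class straight to the google-genai SDK as the response schema. This assumes a valid API key; the model name and prompt here are illustrative, not taken from this repo.

    from google import genai
    from google.genai import types
    from pydantic import BaseModel, Field

    class TranscriptionSegment(BaseModel):
        id: int = Field(ge=1)
        start_time: str
        end_time: str
        text: str

    client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # placeholder key
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # illustrative model name
        contents="Return two example subtitle segments as JSON.",
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            # list[...] requests a JSON array of segments, as in transcribe_chunk
            response_schema=list[TranscriptionSegment],
        ),
    )
    segments = response.parsed  # SDK parses the JSON into TranscriptionSegment instances
    print(segments[0].start_time)

Passing the class rather than a JSON schema dict lets the SDK both generate the schema and parse the response back into typed objects via response.parsed.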
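Note that send_event yields newline-delimited JSON objects ({"type", "message", "percent", "data"}) terminated by a blank line, rather than fully framed "data: ..." SSE events, so a client can simply read lines. A minimal consumer sketch, assuming the endpoint is mounted at POST /transcribe (the route path and the names of non-progress events are not shown in this diff):

    import json
    import requests

    with requests.post(
        "http://localhost:8000/transcribe",            # assumed route path
        data={"api_key": "YOUR_GEMINI_API_KEY", "model_name": "gemini-2.0-flash"},
        files={"audio_file": open("clip.mp3", "rb")},  # field name matches handle_transcription_stream
        stream=True,
    ) as resp:
        for raw_line in resp.iter_lines():
            if not raw_line:
                continue  # skip the blank separators between events
            event = json.loads(raw_line)
            if event["type"] == "progress":
                print(f'{event["percent"]}% {event["message"]}')
            else:
                print(event["type"], event.get("data", ""))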
requirements.txt CHANGED
@@ -5,3 +5,4 @@ google-genai
 pydub
 PyYAML
 Jinja2
+pydantic
schema.json CHANGED
@@ -10,18 +10,15 @@
       },
       "start_time": {
         "type": "string",
-        "description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים",
-        "pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
+        "description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים"
       },
       "end_time": {
         "type": "string",
-        "description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים",
-        "pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
+        "description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים"
       },
       "text": {
         "type": "string",
-        "description": "תוכן הכתובית",
-        "minLength": 1
+        "description": "תוכן הכתובית"
       }
     },
     "required": [
@@ -29,6 +26,12 @@
       "start_time",
       "end_time",
       "text"
+    ],
+    "propertyOrdering": [
+      "id",
+      "start_time",
+      "end_time",
+      "text"
     ]
   }
 }
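
The dropped pattern and minLength keywords are JSON Schema features that Gemini's structured-output schema subset has historically rejected (presumably why this commit removes them), while propertyOrdering is a Gemini-specific key that fixes the order of fields in the generated JSON. If the HH:MM:SS,mmm format still needs enforcing, one option is to re-check it client-side after parsing. A minimal sketch with a hypothetical stricter variant of the Pydantic model (not part of this commit):

    import re
    from pydantic import BaseModel, Field, field_validator

    TIMESTAMP_RE = re.compile(r"^\d{2}:\d{2}:\d{2},\d{3}$")

    # Hypothetical validated variant of TranscriptionSegment.
    class ValidatedSegment(BaseModel):
        id: int = Field(ge=1)
        start_time: str
        end_time: str
        text: str

        @field_validator("start_time", "end_time")
        @classmethod
        def _check_timestamp(cls, value: str) -> str:
            # Re-impose the regex that was dropped from schema.json.
            if not TIMESTAMP_RE.fullmatch(value):
                raise ValueError(f"expected HH:MM:SS,mmm, got {value!r}")
            return value

    seg = ValidatedSegment(id=1, start_time="00:00:01,000",
                           end_time="00:00:04,500", text="שלום")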