Spaces:
Running
Running
סכמה מתוקנת
Browse files- main.py +33 -38
- requirements.txt +1 -0
- schema.json +9 -6
main.py
CHANGED
|
@@ -12,28 +12,37 @@ import os
|
|
| 12 |
from datetime import timedelta
|
| 13 |
import logging
|
| 14 |
import asyncio
|
|
|
|
| 15 |
|
| 16 |
# --- Setup and Constants ---
|
| 17 |
logging.basicConfig(level=logging.INFO)
|
| 18 |
app = FastAPI()
|
| 19 |
templates = Jinja2Templates(directory="templates")
|
| 20 |
|
| 21 |
-
# ---
|
| 22 |
MAX_CHUNK_DURATION_MIN = 10
|
| 23 |
MAX_CHUNK_DURATION_MS = MAX_CHUNK_DURATION_MIN * 60 * 1000
|
| 24 |
SILENCE_THRESH_DB = -30
|
| 25 |
MIN_SILENCE_LEN_MS = 600
|
| 26 |
-
NO_SPLIT_DURATION_MIN = 11
|
| 27 |
|
| 28 |
-
# ---
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
try:
|
| 32 |
with open("instruct.yml", 'r', encoding='utf-8') as f:
|
| 33 |
instruct_config = yaml.safe_load(f)
|
| 34 |
-
|
| 35 |
-
json_schema_config = json.load(f)
|
| 36 |
-
return instruct_config['system_prompt'], json_schema_config
|
| 37 |
except FileNotFoundError as e:
|
| 38 |
logging.error(f"Configuration file not found: {e.filename}")
|
| 39 |
raise HTTPException(status_code=500, detail=f"שגיאת שרת: הקובץ {e.filename} לא נמצא.")
|
|
@@ -41,55 +50,37 @@ def load_config():
|
|
| 41 |
logging.error(f"Error loading configuration: {e}")
|
| 42 |
raise HTTPException(status_code=500, detail=f"שגיאת שרת: בעיה בטעינת ההגדרות: {e}")
|
| 43 |
|
| 44 |
-
# --- CHANGE 2: Completely rewritten split_audio_smart function for robust splitting ---
|
| 45 |
def split_audio_smart(audio_segment, max_duration_ms, silence_thresh, min_silence_len):
|
| 46 |
-
"""
|
| 47 |
-
מפצל אודיו למקטעים הקרובים ככל האפשר ל-max_duration_ms.
|
| 48 |
-
הפונקציה מפרקת מקטעים ארוכים מדי ומאחדת קצרים מדי.
|
| 49 |
-
"""
|
| 50 |
logging.info(f"Splitting audio smartly. Target duration: ~{max_duration_ms / 60000} mins. Silence threshold: {silence_thresh}dB.")
|
| 51 |
-
|
| 52 |
-
# שלב 1: פיצול ראשוני לפי הפסקות שקט
|
| 53 |
raw_chunks = split_on_silence(
|
| 54 |
audio_segment,
|
| 55 |
min_silence_len=min_silence_len,
|
| 56 |
silence_thresh=silence_thresh,
|
| 57 |
-
keep_silence=500
|
| 58 |
)
|
| 59 |
-
|
| 60 |
if not raw_chunks:
|
| 61 |
logging.warning("No silence detected for smart splitting. Using fixed-size chunks.")
|
| 62 |
return [audio_segment[i:i + max_duration_ms] for i in range(0, len(audio_segment), max_duration_ms)]
|
| 63 |
-
|
| 64 |
-
# שלב 2: איחוד ופירוק מחדש כדי לעמוד במגבלת האורך
|
| 65 |
final_chunks = []
|
| 66 |
current_recombined_chunk = AudioSegment.empty()
|
| 67 |
-
|
| 68 |
for chunk in raw_chunks:
|
| 69 |
-
# אם הוספת המקטע הבא תיצור מקטע ארוך מדי, יש לסיים את הנוכחי
|
| 70 |
if len(current_recombined_chunk) + len(chunk) > max_duration_ms and len(current_recombined_chunk) > 0:
|
| 71 |
final_chunks.append(current_recombined_chunk)
|
| 72 |
current_recombined_chunk = AudioSegment.empty()
|
| 73 |
-
|
| 74 |
-
# הוסף את המקטע החדש למקטע הנבנה הנוכחי
|
| 75 |
current_recombined_chunk += chunk
|
| 76 |
-
|
| 77 |
-
# אם המקטע הנבנה חרג מהאורך המותר, פרק אותו
|
| 78 |
while len(current_recombined_chunk) > max_duration_ms:
|
| 79 |
final_chunks.append(current_recombined_chunk[:max_duration_ms])
|
| 80 |
current_recombined_chunk = current_recombined_chunk[max_duration_ms:]
|
| 81 |
-
|
| 82 |
-
# הוסף את השארית האחרונה אם קיימת
|
| 83 |
if len(current_recombined_chunk) > 0:
|
| 84 |
final_chunks.append(current_recombined_chunk)
|
| 85 |
-
|
| 86 |
logging.info(f"File successfully split into {len(final_chunks)} chunks.")
|
| 87 |
logging.info(f"Chunk durations (seconds): {[round(len(c) / 1000) for c in final_chunks]}")
|
| 88 |
return final_chunks
|
| 89 |
|
| 90 |
-
|
| 91 |
-
def transcribe_chunk(chunk_audio, api_key, system_prompt,
|
| 92 |
-
"""שולח מקטע שמע אחד ל‑Gemini ומקבל JSON,
|
| 93 |
try:
|
| 94 |
client = genai.Client(api_key=api_key)
|
| 95 |
buffer = io.BytesIO()
|
|
@@ -104,7 +95,9 @@ def transcribe_chunk(chunk_audio, api_key, system_prompt, json_schema, model_nam
|
|
| 104 |
config=types.GenerateContentConfig(
|
| 105 |
system_instruction=system_prompt,
|
| 106 |
response_mime_type="application/json",
|
| 107 |
-
|
|
|
|
|
|
|
| 108 |
)
|
| 109 |
)
|
| 110 |
|
|
@@ -150,15 +143,16 @@ def generate_srt_content(segments):
|
|
| 150 |
|
| 151 |
# --- Streaming Transcription Logic ---
|
| 152 |
async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name: str):
|
| 153 |
-
"""
|
| 154 |
-
An async generator that performs transcription and yields progress updates.
|
| 155 |
-
"""
|
| 156 |
def send_event(type: str, message: str = "", percent: int = 0, data: str = ""):
|
| 157 |
event_data = {"type": type, "message": message, "percent": percent, "data": data}
|
| 158 |
return json.dumps(event_data) + "\n\n"
|
| 159 |
|
| 160 |
try:
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
| 162 |
yield send_event("progress", "מעבד את קובץ השמע...", 5)
|
| 163 |
|
| 164 |
audio = AudioSegment.from_file(io.BytesIO(file_content))
|
|
@@ -186,7 +180,8 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
|
|
| 186 |
progress_percent = 20 + int((i / total_chunks) * 75)
|
| 187 |
yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
|
| 188 |
|
| 189 |
-
|
|
|
|
| 190 |
|
| 191 |
if error_msg:
|
| 192 |
raise ValueError(f"שגיאה בעיבוד מקטע {i+1}: {error_msg}")
|
|
@@ -233,4 +228,4 @@ async def handle_transcription_stream(
|
|
| 233 |
|
| 234 |
file_content = await audio_file.read()
|
| 235 |
|
| 236 |
-
return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
|
|
|
|
| 12 |
from datetime import timedelta
|
| 13 |
import logging
|
| 14 |
import asyncio
|
| 15 |
+
from pydantic import BaseModel, Field # <-- CHANGE: Import BaseModel and Field from pydantic
|
| 16 |
|
| 17 |
# --- Application setup ---
# Configure the root logger once at import time so that the module's
# logging.info / logging.warning / logging.error calls are emitted.
logging.basicConfig(level=logging.INFO)

# The FastAPI application object and the Jinja2 template loader, which reads
# templates from the local "templates" directory.
app = FastAPI()
templates = Jinja2Templates(directory="templates")
|
| 21 |
|
| 22 |
+
# --- Audio splitting constants ---
# Upper bound for a single chunk, expressed in minutes and in milliseconds.
MAX_CHUNK_DURATION_MIN = 10
MAX_CHUNK_DURATION_MS = MAX_CHUNK_DURATION_MIN * 60_000

# Silence-detection tuning: loudness threshold (dB) and minimum pause length (ms).
# NOTE(review): presumably passed as silence_thresh / min_silence_len to
# split_audio_smart -- the call site is not visible here; confirm.
SILENCE_THRESH_DB = -30
MIN_SILENCE_LEN_MS = 600

# NOTE(review): presumably the duration (minutes) under which a recording is
# sent without splitting -- usage is not visible here; confirm.
NO_SPLIT_DURATION_MIN = 11
|
| 28 |
|
| 29 |
+
# --- CHANGE: Pydantic Schema Definition ---
|
| 30 |
+
# This class replaces schema.json. It's type-safe and recommended by Google.
|
| 31 |
+
class TranscriptionSegment(BaseModel):
    """One subtitle entry of the structured transcription response.

    The class is handed to ``transcribe_chunk`` as ``pydantic_schema`` and
    used there as ``response_schema=list[pydantic_schema]``, i.e. the model
    is asked to return a list of these objects.

    The ``description`` strings are Hebrew runtime text that becomes part of
    the schema; they are intentionally left exactly as written.
    """

    # Sequential subtitle number, constrained to be >= 1.
    id: int = Field(
        description="מספר סידורי של הכתובית",
        ge=1,
    )
    # Start timestamp, described to the model as HH:MM:SS,mmm.
    start_time: str = Field(
        description="שעת התחלה בפורמט HH:MM:SS,mmm",
    )
    # End timestamp, described to the model as HH:MM:SS,mmm.
    end_time: str = Field(
        description="שעת סיום בפורמט HH:MM:SS,mmm",
    )
    # Subtitle text.
    text: str = Field(
        description="תוכן הכתובית",
    )
|
| 36 |
+
|
| 37 |
+
# --- Helper functions ---
|
| 38 |
+
|
| 39 |
+
# <-- CHANGE: This function now only loads the system prompt.
|
| 40 |
+
def load_system_prompt():
|
| 41 |
+
"""טוען system_prompt מקובץ חיצוני."""
|
| 42 |
try:
|
| 43 |
with open("instruct.yml", 'r', encoding='utf-8') as f:
|
| 44 |
instruct_config = yaml.safe_load(f)
|
| 45 |
+
return instruct_config['system_prompt']
|
|
|
|
|
|
|
| 46 |
except FileNotFoundError as e:
|
| 47 |
logging.error(f"Configuration file not found: {e.filename}")
|
| 48 |
raise HTTPException(status_code=500, detail=f"שגיאת שרת: הקובץ {e.filename} לא נמצא.")
|
|
|
|
| 50 |
logging.error(f"Error loading configuration: {e}")
|
| 51 |
raise HTTPException(status_code=500, detail=f"שגיאת שרת: בעיה בטעינת ההגדרות: {e}")
|
| 52 |
|
|
|
|
| 53 |
def split_audio_smart(audio_segment, max_duration_ms, silence_thresh, min_silence_len):
    """Split audio at silent pauses into chunks of at most ``max_duration_ms``.

    The audio is first cut with ``split_on_silence``. Consecutive pieces are
    then merged greedily: a merged chunk is closed as soon as adding the next
    piece would exceed the limit. A merged chunk that is still too long
    (a single piece longer than the limit) is cut at fixed
    ``max_duration_ms`` boundaries.

    Args:
        audio_segment: Audio to split; it is sliced, measured with ``len``
            (milliseconds) and passed to ``split_on_silence``.
        max_duration_ms: Maximum chunk length in milliseconds; must be
            positive.
        silence_thresh: Forwarded to ``split_on_silence`` as
            ``silence_thresh`` (logged as dB).
        min_silence_len: Forwarded to ``split_on_silence`` as
            ``min_silence_len``.

    Returns:
        A list of audio chunks in their original order, each no longer than
        ``max_duration_ms``.

    Raises:
        ValueError: If ``max_duration_ms`` is not positive.
    """
    if max_duration_ms <= 0:
        # A zero or negative limit would make the trimming loop below spin
        # forever (the remainder never gets shorter than the limit).
        raise ValueError(f"max_duration_ms must be positive, got {max_duration_ms}")

    logging.info(
        "Splitting audio smartly. Target duration: ~%s mins. Silence threshold: %sdB.",
        max_duration_ms / 60000,
        silence_thresh,
    )

    # NOTE(review): with keep_silence=500 only part of each pause is kept, so
    # chunk-relative timestamps may drift from the original recording --
    # confirm how downstream code offsets segment times.
    raw_chunks = split_on_silence(
        audio_segment,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=500,
    )

    if not raw_chunks:
        # split_on_silence yields nothing when it finds no non-silent range
        # (not when silence is missing), so the message says exactly that.
        logging.warning("No non-silent ranges detected for smart splitting. Using fixed-size chunks.")
        return [
            audio_segment[start:start + max_duration_ms]
            for start in range(0, len(audio_segment), max_duration_ms)
        ]

    final_chunks = []
    pending = AudioSegment.empty()
    for piece in raw_chunks:
        # Close the chunk being built if the next piece would push it over
        # the limit.
        if len(pending) > 0 and len(pending) + len(piece) > max_duration_ms:
            final_chunks.append(pending)
            pending = AudioSegment.empty()

        pending += piece

        # A single piece may itself exceed the limit: cut it at fixed
        # boundaries and keep the remainder for further merging.
        while len(pending) > max_duration_ms:
            final_chunks.append(pending[:max_duration_ms])
            pending = pending[max_duration_ms:]

    # Flush whatever is left after the last piece.
    if len(pending) > 0:
        final_chunks.append(pending)

    logging.info("File successfully split into %d chunks.", len(final_chunks))
    logging.info(
        "Chunk durations (seconds): %s",
        [round(len(c) / 1000) for c in final_chunks],
    )
    return final_chunks
|
| 80 |
|
| 81 |
+
# <-- CHANGE: Function now accepts a Pydantic model instead of a JSON schema object.
|
| 82 |
+
def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
|
| 83 |
+
"""שולח מקטע שמע אחד ל‑Gemini ומקבל JSON, בהתאם לסכמת Pydantic."""
|
| 84 |
try:
|
| 85 |
client = genai.Client(api_key=api_key)
|
| 86 |
buffer = io.BytesIO()
|
|
|
|
| 95 |
config=types.GenerateContentConfig(
|
| 96 |
system_instruction=system_prompt,
|
| 97 |
response_mime_type="application/json",
|
| 98 |
+
# <-- CHANGE: Pass the Pydantic model directly to the SDK.
|
| 99 |
+
# The `list[]` indicates we expect a list of these objects.
|
| 100 |
+
response_schema=list[pydantic_schema]
|
| 101 |
)
|
| 102 |
)
|
| 103 |
|
|
|
|
| 143 |
|
| 144 |
# --- Streaming Transcription Logic ---
|
| 145 |
async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name: str):
|
| 146 |
+
"""An async generator that performs transcription and yields progress updates."""
|
|
|
|
|
|
|
| 147 |
def send_event(type: str, message: str = "", percent: int = 0, data: str = ""):
    """Serialize one stream event as a JSON object followed by a blank line.

    The object always carries the four keys ``type``, ``message``,
    ``percent`` and ``data``, in that order.
    """
    # `type` shadows the builtin, but it is both the caller-visible parameter
    # name and the JSON key, so it is kept as is.
    payload = dict(type=type, message=message, percent=percent, data=data)
    return f"{json.dumps(payload)}\n\n"
|
| 150 |
|
| 151 |
try:
|
| 152 |
+
# <-- CHANGE: Load only the system prompt and use the Pydantic class.
|
| 153 |
+
system_prompt = load_system_prompt()
|
| 154 |
+
pydantic_schema = TranscriptionSegment
|
| 155 |
+
|
| 156 |
yield send_event("progress", "מעבד את קובץ השמע...", 5)
|
| 157 |
|
| 158 |
audio = AudioSegment.from_file(io.BytesIO(file_content))
|
|
|
|
| 180 |
progress_percent = 20 + int((i / total_chunks) * 75)
|
| 181 |
yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
|
| 182 |
|
| 183 |
+
# <-- CHANGE: Pass the Pydantic schema class to the transcription function.
|
| 184 |
+
data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, pydantic_schema, model_name)
|
| 185 |
|
| 186 |
if error_msg:
|
| 187 |
raise ValueError(f"שגיאה בעיבוד מקטע {i+1}: {error_msg}")
|
|
|
|
| 228 |
|
| 229 |
file_content = await audio_file.read()
|
| 230 |
|
| 231 |
+
return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ google-genai
|
|
| 5 |
pydub
|
| 6 |
PyYAML
|
| 7 |
Jinja2
|
|
|
|
|
|
| 5 |
pydub
|
| 6 |
PyYAML
|
| 7 |
Jinja2
|
| 8 |
+
pydantic
|
schema.json
CHANGED
|
@@ -10,18 +10,15 @@
|
|
| 10 |
},
|
| 11 |
"start_time": {
|
| 12 |
"type": "string",
|
| 13 |
-
"description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים"
|
| 14 |
-
"pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
|
| 15 |
},
|
| 16 |
"end_time": {
|
| 17 |
"type": "string",
|
| 18 |
-
"description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים"
|
| 19 |
-
"pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
|
| 20 |
},
|
| 21 |
"text": {
|
| 22 |
"type": "string",
|
| 23 |
-
"description": "תוכן הכתובית"
|
| 24 |
-
"minLength": 1
|
| 25 |
}
|
| 26 |
},
|
| 27 |
"required": [
|
|
@@ -29,6 +26,12 @@
|
|
| 29 |
"start_time",
|
| 30 |
"end_time",
|
| 31 |
"text"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
]
|
| 33 |
}
|
| 34 |
}
|
|
|
|
| 10 |
},
|
| 11 |
"start_time": {
|
| 12 |
"type": "string",
|
| 13 |
+
"description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים"
|
|
|
|
| 14 |
},
|
| 15 |
"end_time": {
|
| 16 |
"type": "string",
|
| 17 |
+
"description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים"
|
|
|
|
| 18 |
},
|
| 19 |
"text": {
|
| 20 |
"type": "string",
|
| 21 |
+
"description": "תוכן הכתובית"
|
|
|
|
| 22 |
}
|
| 23 |
},
|
| 24 |
"required": [
|
|
|
|
| 26 |
"start_time",
|
| 27 |
"end_time",
|
| 28 |
"text"
|
| 29 |
+
],
|
| 30 |
+
"propertyOrdering": [
|
| 31 |
+
"id",
|
| 32 |
+
"start_time",
|
| 33 |
+
"end_time",
|
| 34 |
+
"text"
|
| 35 |
]
|
| 36 |
}
|
| 37 |
}
|