NHLOCAL committed on
Commit f3d614b · 1 Parent(s): d2b9a14

Fixed schema

Files changed (3)
  1. main.py +33 -38
  2. requirements.txt +1 -0
  3. schema.json +9 -6
main.py CHANGED
@@ -12,28 +12,37 @@ import os
 from datetime import timedelta
 import logging
 import asyncio
+from pydantic import BaseModel, Field  # <-- CHANGE: Import BaseModel and Field from pydantic
 
 # --- Setup and Constants ---
 logging.basicConfig(level=logging.INFO)
 app = FastAPI()
 templates = Jinja2Templates(directory="templates")
 
-# --- CHANGE 1: Updated constants as per user request ---
+# --- Audio Splitting Constants ---
 MAX_CHUNK_DURATION_MIN = 10
 MAX_CHUNK_DURATION_MS = MAX_CHUNK_DURATION_MIN * 60 * 1000
 SILENCE_THRESH_DB = -30
 MIN_SILENCE_LEN_MS = 600
-NO_SPLIT_DURATION_MIN = 11  # Files shorter than this won't be split
+NO_SPLIT_DURATION_MIN = 11
 
-# --- Helper functions (Only split_audio_smart is changed) ---
-def load_config():
-    """Loads the system_prompt and JSON schema from external files."""
+# --- CHANGE: Pydantic Schema Definition ---
+# This class replaces schema.json. It's type-safe and recommended by Google.
+class TranscriptionSegment(BaseModel):
+    id: int = Field(description="מספר סידורי של הכתובית", ge=1)
+    start_time: str = Field(description="שעת התחלה בפורמט HH:MM:SS,mmm")
+    end_time: str = Field(description="שעת סיום בפורמט HH:MM:SS,mmm")
+    text: str = Field(description="תוכן הכתובית")
+
+# --- Helper functions ---
+
+# <-- CHANGE: This function now only loads the system prompt.
+def load_system_prompt():
+    """Loads the system_prompt from an external file."""
     try:
         with open("instruct.yml", 'r', encoding='utf-8') as f:
             instruct_config = yaml.safe_load(f)
-        with open("schema.json", 'r', encoding='utf-8') as f:
-            json_schema_config = json.load(f)
-        return instruct_config['system_prompt'], json_schema_config
+        return instruct_config['system_prompt']
     except FileNotFoundError as e:
         logging.error(f"Configuration file not found: {e.filename}")
         raise HTTPException(status_code=500, detail=f"שגיאת שרת: הקובץ {e.filename} לא נמצא.")
@@ -41,55 +50,37 @@ def load_config():
     logging.error(f"Error loading configuration: {e}")
     raise HTTPException(status_code=500, detail=f"שגיאת שרת: בעיה בטעינת ההגדרות: {e}")
 
-# --- CHANGE 2: Completely rewritten split_audio_smart function for robust splitting ---
 def split_audio_smart(audio_segment, max_duration_ms, silence_thresh, min_silence_len):
-    """
-    Splits audio into chunks as close as possible to max_duration_ms.
-    The function breaks up chunks that are too long and merges ones that are too short.
-    """
+    """Splits audio into chunks as close as possible to max_duration_ms."""
     logging.info(f"Splitting audio smartly. Target duration: ~{max_duration_ms / 60000} mins. Silence threshold: {silence_thresh}dB.")
-
-    # Step 1: initial split on silent pauses
     raw_chunks = split_on_silence(
         audio_segment,
         min_silence_len=min_silence_len,
         silence_thresh=silence_thresh,
-        keep_silence=500  # keep half a second of silence for a natural transition
+        keep_silence=500
    )
-
    if not raw_chunks:
        logging.warning("No silence detected for smart splitting. Using fixed-size chunks.")
        return [audio_segment[i:i + max_duration_ms] for i in range(0, len(audio_segment), max_duration_ms)]
-
-    # Step 2: recombine and re-split to respect the length limit
    final_chunks = []
    current_recombined_chunk = AudioSegment.empty()
-
    for chunk in raw_chunks:
-        # If adding the next chunk would make this one too long, finalize the current chunk
        if len(current_recombined_chunk) + len(chunk) > max_duration_ms and len(current_recombined_chunk) > 0:
            final_chunks.append(current_recombined_chunk)
            current_recombined_chunk = AudioSegment.empty()
-
-        # Add the new chunk to the one currently being built
        current_recombined_chunk += chunk
-
-        # If the chunk being built exceeds the allowed length, break it up
        while len(current_recombined_chunk) > max_duration_ms:
            final_chunks.append(current_recombined_chunk[:max_duration_ms])
            current_recombined_chunk = current_recombined_chunk[max_duration_ms:]
-
-    # Append the last remainder, if any
    if len(current_recombined_chunk) > 0:
        final_chunks.append(current_recombined_chunk)
-
    logging.info(f"File successfully split into {len(final_chunks)} chunks.")
    logging.info(f"Chunk durations (seconds): {[round(len(c) / 1000) for c in final_chunks]}")
    return final_chunks
 
-
-def transcribe_chunk(chunk_audio, api_key, system_prompt, json_schema, model_name):
-    """Sends one audio chunk to Gemini and receives JSON, with improved error handling."""
+# <-- CHANGE: Function now accepts a Pydantic model instead of a JSON schema object.
+def transcribe_chunk(chunk_audio, api_key, system_prompt, pydantic_schema, model_name):
+    """Sends one audio chunk to Gemini and receives JSON conforming to the Pydantic schema."""
     try:
         client = genai.Client(api_key=api_key)
         buffer = io.BytesIO()
@@ -104,7 +95,9 @@ def transcribe_chunk(chunk_audio, api_key, system_prompt, json_schema, model_nam
         config=types.GenerateContentConfig(
             system_instruction=system_prompt,
             response_mime_type="application/json",
-            response_schema=json_schema
+            # <-- CHANGE: Pass the Pydantic model directly to the SDK.
+            # The `list[]` indicates we expect a list of these objects.
+            response_schema=list[pydantic_schema]
         )
     )
 
@@ -150,15 +143,16 @@ def generate_srt_content(segments):
 
 # --- Streaming Transcription Logic ---
 async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name: str):
-    """
-    An async generator that performs transcription and yields progress updates.
-    """
+    """An async generator that performs transcription and yields progress updates."""
     def send_event(type: str, message: str = "", percent: int = 0, data: str = ""):
         event_data = {"type": type, "message": message, "percent": percent, "data": data}
         return json.dumps(event_data) + "\n\n"
 
     try:
-        system_prompt, json_schema = load_config()
+        # <-- CHANGE: Load only the system prompt and use the Pydantic class.
+        system_prompt = load_system_prompt()
+        pydantic_schema = TranscriptionSegment
+
         yield send_event("progress", "מעבד את קובץ השמע...", 5)
 
         audio = AudioSegment.from_file(io.BytesIO(file_content))
@@ -186,7 +180,8 @@ async def _transcribe_and_stream(api_key: str, file_content: bytes, model_name:
             progress_percent = 20 + int((i / total_chunks) * 75)
             yield send_event("progress", f"מתמלל מקטע {i+1} מתוך {total_chunks}...", progress_percent)
 
-            data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, json_schema, model_name)
+            # <-- CHANGE: Pass the Pydantic schema class to the transcription function.
+            data, error_msg = await asyncio.to_thread(transcribe_chunk, ch, api_key, system_prompt, pydantic_schema, model_name)
 
             if error_msg:
                 raise ValueError(f"שגיאה בעיבוד מקטע {i+1}: {error_msg}")
@@ -233,4 +228,4 @@ async def handle_transcription_stream(
 
     file_content = await audio_file.read()
 
-    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
+    return StreamingResponse(_transcribe_and_stream(api_key, file_content, model_name), media_type="text/event-stream")
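
For reference, a minimal, self-contained sketch of the structured-output flow this commit adopts: pass the Pydantic class straight to the google-genai SDK as the response schema. This assumes a valid API key; the model name and prompt here are illustrative, not taken from this repo.

    from google import genai
    from google.genai import types
    from pydantic import BaseModel, Field

    class TranscriptionSegment(BaseModel):
        id: int = Field(ge=1)
        start_time: str
        end_time: str
        text: str

    client = genai.Client(api_key="YOUR_GEMINI_API_KEY")  # placeholder key
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # illustrative model name
        contents="Return two example subtitle segments as JSON.",
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            # list[...] requests a JSON array of segments, as in transcribe_chunk
            response_schema=list[TranscriptionSegment],
        ),
    )
    segments = response.parsed  # SDK parses the JSON into TranscriptionSegment instances
    print(segments[0].start_time)

Passing the class rather than a JSON schema dict lets the SDK both generate the schema and parse the response back into typed objects via response.parsed.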
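Note that send_event yields newline-delimited JSON objects ({"type", "message", "percent", "data"}) terminated by a blank line, rather than fully framed "data: ..." SSE events, so a client can simply read lines. A minimal consumer sketch, assuming the endpoint is mounted at POST /transcribe (the route path and the names of non-progress events are not shown in this diff):

    import json
    import requests

    with requests.post(
        "http://localhost:8000/transcribe",            # assumed route path
        data={"api_key": "YOUR_GEMINI_API_KEY", "model_name": "gemini-2.0-flash"},
        files={"audio_file": open("clip.mp3", "rb")},  # field name matches handle_transcription_stream
        stream=True,
    ) as resp:
        for raw_line in resp.iter_lines():
            if not raw_line:
                continue  # skip the blank separators between events
            event = json.loads(raw_line)
            if event["type"] == "progress":
                print(f'{event["percent"]}% {event["message"]}')
            else:
                print(event["type"], event.get("data", ""))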
requirements.txt CHANGED
@@ -5,3 +5,4 @@ google-genai
 pydub
 PyYAML
 Jinja2
+pydantic
schema.json CHANGED
@@ -10,18 +10,15 @@
       },
       "start_time": {
         "type": "string",
-        "description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים",
-        "pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
+        "description": "שעת התחלה בפורמט HH:MM:SS,mmm עם אפסים מובילים"
       },
       "end_time": {
         "type": "string",
-        "description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים",
-        "pattern": "^\\d{2}:\\d{2}:\\d{2},\\d{3}$"
+        "description": "שעת סיום בפורמט HH:MM:SS,mmm עם אפסים מובילים"
       },
       "text": {
         "type": "string",
-        "description": "תוכן הכתובית",
-        "minLength": 1
+        "description": "תוכן הכתובית"
       }
     },
     "required": [
@@ -29,6 +26,12 @@
       "start_time",
       "end_time",
       "text"
+    ],
+    "propertyOrdering": [
+      "id",
+      "start_time",
+      "end_time",
+      "text"
     ]
   }
 }
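
The dropped pattern and minLength keywords are JSON Schema features that Gemini's structured-output schema subset has historically rejected (presumably why this commit removes them), while propertyOrdering is a Gemini-specific key that fixes the order of fields in the generated JSON. If the HH:MM:SS,mmm format still needs enforcing, one option is to re-check it client-side after parsing. A minimal sketch with a hypothetical stricter variant of the Pydantic model (not part of this commit):

    import re
    from pydantic import BaseModel, Field, field_validator

    TIMESTAMP_RE = re.compile(r"^\d{2}:\d{2}:\d{2},\d{3}$")

    # Hypothetical validated variant of TranscriptionSegment.
    class ValidatedSegment(BaseModel):
        id: int = Field(ge=1)
        start_time: str
        end_time: str
        text: str

        @field_validator("start_time", "end_time")
        @classmethod
        def _check_timestamp(cls, value: str) -> str:
            # Re-impose the regex that was dropped from schema.json.
            if not TIMESTAMP_RE.fullmatch(value):
                raise ValueError(f"expected HH:MM:SS,mmm, got {value!r}")
            return value

    seg = ValidatedSegment(id=1, start_time="00:00:01,000",
                           end_time="00:00:04,500", text="שלום")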