Samuelblue committed (verified)
Commit 38f2532 · Parent: a10b5f3

Update app.py

Files changed (1): app.py +162 -127
app.py CHANGED
@@ -3,38 +3,42 @@ import json
  from difflib import Differ
  import ffmpeg
  import os
  from pathlib import Path
  import time
  import aiohttp
  import asyncio
  import base64
  from dotenv import load_dotenv

  # --- Configuration ---
  # Set to True if you're using the Hugging Face Inference API https://huggingface.co/inference-api
  API_BACKEND = True
  MODEL = "facebook/wav2vec2-base-960h"
- # MODEL = "facebook/wav2vec2-large-960h"
- # MODEL = "facebook/wav2vec2-large-960h-lv60-self"
- # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram" # Example of different model
  API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
- RETRY_ATTEMPTS = 5 # Increased retry attempts for API calls
- RETRY_DELAY = 5 # Base delay in seconds before retrying API calls

  # --- Initialization ---
  if API_BACKEND:
      load_dotenv(Path(".env"))
      HF_TOKEN = os.environ.get("HF_TOKEN")
      if not HF_TOKEN:
          raise ValueError("HF_TOKEN environment variable not set.")
      headers = {"Authorization": f"Bearer {HF_TOKEN}"}
  else:
      import torch
      from transformers import pipeline

-     # is cuda available?
      device = 0 if torch.cuda.is_available() else -1
      try:
          speech_recognizer = pipeline(
              task="automatic-speech-recognition",
              model=MODEL,
@@ -42,24 +46,31 @@ else:
              framework="pt",
              device=device,
          )
      except Exception as e:
          raise RuntimeError(f"Error initializing local model {MODEL}: {e}")

  videos_out_path = Path("./videos_out")
  videos_out_path.mkdir(parents=True, exist_ok=True)

- # Load samples data
  samples_data_files = sorted(Path('examples').glob('*.json'))
  SAMPLES = []
  for file in samples_data_files:
      try:
          with open(file, 'r') as f:
              sample = json.load(f)
-             SAMPLES.append(sample)
      except (json.JSONDecodeError, FileNotFoundError) as e:
-         print(f"Error loading sample file {file}: {e}")

- VIDEOS = [[sample['video']] for sample in SAMPLES if 'video' in sample]

  # --- Helper Functions ---
  async def query_api(audio_bytes: bytes):
@@ -74,15 +85,15 @@ async def query_api(audio_bytes: bytes):
              "chunk_length_s": 10,
              "stride_length_s": [4, 2]
          },
-         "options": {"use_gpu": False} # Set to True if you have a GPU and want to use it
      }).encode("utf-8")

      async with aiohttp.ClientSession() as session:
          for attempt in range(RETRY_ATTEMPTS):
-             print(f'Transcribing from API attempt {attempt + 1}/{RETRY_ATTEMPTS}')
              try:
                  async with session.post(API_URL, headers=headers, data=payload) as response:
-                     print("API Response Status:", response.status)
                      content_type = response.headers.get('Content-Type', '')

                      if response.status == 200 and 'application/json' in content_type:
@@ -91,8 +102,8 @@ async def query_api(audio_bytes: bytes):
                          error_response = await response.json()
                          if 'error' in error_response and 'estimated_time' in error_response:
                              wait_time = error_response['estimated_time']
-                             print(f"Model loading, waiting for {wait_time} seconds...")
-                             await asyncio.sleep(wait_time + RETRY_DELAY) # Wait time + buffer
                          elif 'error' in error_response:
                              raise RuntimeError(f"API Error: {error_response['error']}")
                          else:
@@ -102,13 +113,13 @@ async def query_api(audio_bytes: bytes):
                              raise RuntimeError(f"Unexpected API response format (Status: {response.status}, Content-Type: {content_type}): {response_text}")

              except aiohttp.ClientError as e:
-                 print(f"AIOHTTP Client Error during API call: {e}")
              except RuntimeError as e:
-                 print(f"Runtime error during API call: {e}")

              if attempt < RETRY_ATTEMPTS - 1:
-                 wait_time = RETRY_DELAY * (2 ** attempt) # Exponential backoff
-                 print(f"Retrying in {wait_time} seconds...")
                  await asyncio.sleep(wait_time)

      raise RuntimeError(f"Failed to get transcription after {RETRY_ATTEMPTS} attempts.")
@@ -120,210 +131,257 @@ def ping_telemetry(name: str):
      This is fire-and-forget and doesn't affect the main process flow.
      """
      url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
-     print(f"Pinging telemetry: {url}")

      async def send_ping():
          try:
              async with aiohttp.ClientSession() as session:
                  async with session.get(url) as response:
-                     print(f"Telemetry pong: {response.status}")
          except aiohttp.ClientError as e:
-             print(f"Failed to send telemetry ping: {e}")
-     # Using asyncio.run_coroutine_threadsafe might be safer in a threaded Gradio environment,
-     # but requires managing an event loop in a separate thread.
-     # For simplicity here, we'll use create_task assuming an event loop is running (Gradio handles this).
      asyncio.create_task(send_ping())


  # --- Main Gradio Functions ---
- async def speech_to_text(video_file_path):
      """
      Takes a video path to convert to audio, transcribe audio channel to text and char timestamps.
      """
      if video_file_path is None:
          raise gr.Error("Error: No video input provided.")

      video_path = Path(video_file_path)
      if not video_path.exists():
-         raise gr.Error(f"Error: Video file not found at {video_file_path}")

      try:
-         # convert video to audio 16k using PIPE to audio_memory
-         # Use asyncio-compatible way or run in a separate thread if ffmpeg-python is blocking
          loop = asyncio.get_running_loop()
-         audio_memory, _ = await loop.run_in_executor(
              None, lambda: ffmpeg.input(video_path).output(
-                 '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
          )

      except ffmpeg.Error as e:
          raise gr.Error(f"Error converting video to audio: {e.stderr.decode()}")
      except Exception as e:
          raise gr.Error(f"An unexpected error occurred during audio conversion: {e}")


      ping_telemetry("speech_to_text")

      if API_BACKEND:
          try:
              inference_response = await query_api(audio_memory)
-             print("Inference Response:", inference_response)
              if not isinstance(inference_response, dict) or 'text' not in inference_response or 'chunks' not in inference_response:
                  raise RuntimeError(f"Unexpected API response structure: {inference_response}")

              transcription = inference_response["text"].lower()
-             # Ensure timestamps have the correct structure and handle potential None values
              timestamps = [[chunk.get("text", "").lower(), chunk.get("timestamp", [None, None])[0], chunk.get("timestamp", [None, None])[1]]
                            for chunk in inference_response.get('chunks', []) if isinstance(chunk, dict)]

-             # Filter out timestamps with None values if necessary, or handle them downstream
              timestamps = [ts for ts in timestamps if ts[1] is not None and ts[2] is not None]

-
              return (transcription, transcription, timestamps)

          except Exception as e:
              raise gr.Error(f"Error fetching transcription from API: {e}")

      else:
          try:
-             print(f'Transcribing via local model {MODEL}')
-             # Run blocking model inference in an executor
              loop = asyncio.get_running_loop()
              output = await loop.run_in_executor(
                  None, lambda: speech_recognizer(
                      audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))
              )

              if not isinstance(output, dict) or 'text' not in output or 'chunks' not in output:
                  raise RuntimeError(f"Unexpected model output structure: {output}")

              transcription = output["text"].lower()
-             # Ensure timestamps have the correct structure and handle potential None/list values
              timestamps = [[chunk.get("text", "").lower(),
                             chunk.get("timestamp", [None, None])[0] if not isinstance(chunk.get("timestamp", [None, None])[0], list) else chunk.get("timestamp", [None, None])[0][0],
                             chunk.get("timestamp", [None, None])[1] if not isinstance(chunk.get("timestamp", [None, None])[1], list) else chunk.get("timestamp", [None, None])[1][0]
                             ]
                            for chunk in output.get('chunks', []) if isinstance(chunk, dict)]

-             # Filter out timestamps with None values if necessary, or handle them downstream
              timestamps = [ts for ts in timestamps if ts[1] is not None and ts[2] is not None]

-
              return (transcription, transcription, timestamps)

          except Exception as e:
              raise gr.Error(f"Error running inference with local model: {e}")


- async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
      """
      Given original video input, text transcript + timestamps,
-     and edited text cuts video segments into a single video
      """
      if video_in is None or text_in is None or transcription is None or timestamps is None:
          raise gr.Error("Inputs undefined. Please provide video, transcription, and edited text.")

-     if not Path(video_in).exists():
-         raise gr.Error(f"Error: Video file not found at {video_in}")
-
      d = Differ()
-     # compare original transcription with edit text
      diff_chars = list(d.compare(transcription, text_in))

-     # Map filtered characters back to original timestamps
-     # This requires careful indexing or alignment
-     # A more robust approach might involve aligning the diff output with the original timestamps
-     # based on character positions. For simplicity here, we'll assume a direct mapping after filtering
-     # which might not be accurate if additions/deletions significantly alter the text structure.
-     # A better approach would be to process the diff and the original timestamps in parallel.
-
-     # Let's refine the logic to align diff with timestamps more accurately.
-     # We'll iterate through the diff and the timestamps simultaneously.
-     filtered_timestamps = []
      timestamp_idx = 0
-     for diff_line in diff_chars:
-         # Lines starting with '-' are deletions, '+' are additions, '?' are changes (we ignore), ' ' are unchanged.
-         if diff_line.startswith('-') or diff_line.startswith(' '):
-             # If it's a deletion or unchanged, it corresponds to an original timestamp
-             if timestamp_idx < len(timestamps):
-                 filtered_timestamps.append((diff_line, timestamps[timestamp_idx]))
                  timestamp_idx += 1
-         # Additions ('+') do not correspond to original timestamps, so we skip incrementing timestamp_idx

-     # filter timestamps to be removed (those marked with '-')
-     timestamps_to_keep = [ts_info for diff_line, ts_info in filtered_timestamps if not diff_line.startswith('-')]

-     # groupping character timestamps to keep into continuous segments
      grouped_segments = []
      if timestamps_to_keep:
          current_segment = [timestamps_to_keep[0]]
          for i in range(1, len(timestamps_to_keep)):
-             # Check if the current timestamp's start time is close to the previous timestamp's end time
-             # This threshold might need adjustment based on the granularity of timestamps
-             if timestamps_to_keep[i][1] - current_segment[-1][2] < 0.1: # 0.1 seconds threshold
                  current_segment.append(timestamps_to_keep[i])
              else:
                  grouped_segments.append(current_segment)
                  current_segment = [timestamps_to_keep[i]]
-         grouped_segments.append(current_segment) # Add the last segment

-     # after grouping, gets the lower start and upper end time for each group
-     cut_intervals = [[segment[0][1], segment[-1][2]] for segment in grouped_segments]

-     video_path = Path(video_in)
      video_file_name = video_path.stem
-     output_video_path = videos_out_path / f"{video_file_name}_cut.mp4" # Use _cut suffix to avoid overwriting original

      if cut_intervals:
          input_video_stream = ffmpeg.input(video_in)

-         # Create select filters for video and audio based on cut intervals
-         video_filters = []
-         audio_filters = []
          for i, interval in enumerate(cut_intervals):
-             video_filters.append(f'select=\'between(t,{interval[0]},{interval[1]})\'')
-             audio_filters.append(f'aselect=\'between(t,{interval[0]},{interval[1]})\'')

-         # Join filters with commas and add setpts
-         video_filter_str = ','.join(video_filters) + ',setpts=N/FRAME_RATE/TB'
-         audio_filter_str = ','.join(audio_filters) + ',asetpts=N/SR/TB'

-         video_stream = input_video_stream.video.filter_complex(video_filter_str)
-         audio_stream = input_video_stream.audio.filter_complex(audio_filter_str)

          try:
-             # Use asyncio-compatible way or run in a separate thread
              loop = asyncio.get_running_loop()
              await loop.run_in_executor(
-                 None, lambda: ffmpeg.concat(video_stream, audio_stream, v=1, a=1).output(
-                     str(output_video_path), preset='fast', crf=23 # Use reasonable encoding settings
                  ).overwrite_output().global_args('-loglevel', 'quiet').run()
              )

          except ffmpeg.Error as e:
              raise gr.Error(f"Error cutting video: {e.stderr.decode()}")
          except Exception as e:
              raise gr.Error(f"An unexpected error occurred during video cutting: {e}")

      else:
-         # If no intervals to keep, output an empty video or handle as an error
-         # For now, let's return the original video path and indicate no cuts were made.
-         # Depending on requirements, creating an empty video might be better.
-         output_video_path = Path(video_in) # No cuts, so output is the original video
-         print("No text was kept, returning original video.")


-     # Generate diff output for display
-     # The diff_chars list already contains the diff with markers ('-', '+', ' ')
-     # We can directly use this for the highlighted text output
      diff_output_tokens = [(token[2:], token[0] if token[0] != ' ' else None)
                            for token in diff_chars]

      ping_telemetry("video_cuts")

      return (diff_output_tokens, str(output_video_path))

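(Aside, not part of the commit: for two hypothetical intervals the removed branch above would assemble `select`/`aselect` strings like the ones below. Because comma-chained `select` filters apply in sequence, the per-interval conditions intersect; the revised code shown later in this diff switches to per-interval `trim`/`atrim` plus `concat` instead.)

```python
# Illustrative only: what the removed select/aselect construction yields
# for two hypothetical intervals [0.5, 2.0] and [3.1, 4.0].
video_filter_str = (
    "select='between(t,0.5,2.0)',select='between(t,3.1,4.0)',"
    "setpts=N/FRAME_RATE/TB"
)
audio_filter_str = (
    "aselect='between(t,0.5,2.0)',aselect='between(t,3.1,4.0)',"
    "asetpts=N/SR/TB"
)
```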
@@ -336,29 +394,30 @@ def load_example(id):
          transcription = sample.get('transcription', '').lower()
          timestamps = sample.get('timestamps', [])
          if video is None:
              raise gr.Error(f"Example at index {id} is missing video path.")
          return (video, transcription, transcription, timestamps)
      else:
          raise gr.Error(f"Invalid example index: {id}")


  # --- Gradio Layout ---
  css = """
  #cut_btn, #reset_btn { align-self:stretch; }
- #\\31 3 { max-width: 540px; } /* Consider making this more general or dynamic */
  .output-markdown {max-width: 65ch !important;}
  #video-container{
      max-width: 40rem;
  }
  """
  with gr.Blocks(css=css) as demo:
-     # Using States to hold transcription and timestamps across interactions
      transcription_var = gr.State(value="")
      timestamps_var = gr.State(value=[])
      video_in = gr.Video(label="Video file", elem_id="video-container")
      text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
-     video_out = gr.Video(label="Video Out", interactive=False) # Output video should not be edited directly
-     diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True, show_legend=True) # Added legend

      gr.Markdown("""
      # Edit Video By Editing Text
@@ -370,24 +429,22 @@ with gr.Blocks(css=css) as demo:
      """)

      with gr.Row():
-         # Examples section
          examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index", label="Examples")
          examples.click(
              load_example,
              inputs=[examples],
              outputs=[video_in, text_in, transcription_var, timestamps_var],
-             queue=False # Set to False if you want immediate loading without waiting in queue
          )

      with gr.Row():
          with gr.Column():
-             video_in.render()
              transcribe_btn = gr.Button("Transcribe Audio")
              transcribe_btn.click(
                  speech_to_text,
                  inputs=[video_in],
                  outputs=[text_in, transcription_var, timestamps_var]
-                 # No queue=False here as transcription can take time
              )

              gr.Markdown("""
@@ -396,37 +453,15 @@ with gr.Blocks(css=css) as demo:

      with gr.Row():
          with gr.Column():
-             text_in.render()
              with gr.Row():
                  cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
                  cut_btn.click(
                      cut_timestamps_to_video,
                      inputs=[video_in, transcription_var, text_in, timestamps_var],
                      outputs=[diff_out, video_out]
-                     # No queue=False here as video cutting can take time
                  )

                  reset_transcription = gr.Button(
                      "Reset to last transcription", elem_id="reset_btn")
-                 reset_transcription.click(
-                     lambda x: x, # Simple lambda to return the input state
-                     inputs=[transcription_var],
-                     outputs=[text_in],
-                     queue=False # Immediate reset
-                 )
-         with gr.Column():
-             video_out.render()
-             diff_out.render()
-
-     gr.Markdown("""
-     #### Video Credits
-     1. [Cooking](https://vimeo.com/573792389)
-     2. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
-     3. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
-     """)
-
- demo.queue() # Enable queuing for handling multiple users
- if __name__ == "__main__":
-     # debug=True is useful during development
-     # share=True to create a public link (use cautiously)
-     demo.launch(debug=True)
 
@@ -3,38 +3,42 @@ import json
  from difflib import Differ
  import ffmpeg
  import os
+ import tempfile
  from pathlib import Path
  import time
  import aiohttp
  import asyncio
  import base64
  from dotenv import load_dotenv
+ import logging

  # --- Configuration ---
  # Set to True if you're using the Hugging Face Inference API https://huggingface.co/inference-api
  API_BACKEND = True
  MODEL = "facebook/wav2vec2-base-960h"
  API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
+ RETRY_ATTEMPTS = 5
+ RETRY_DELAY = 5
+ TIMESTAMP_GROUPING_THRESHOLD = 0.1
+
+ # --- Logging Configuration ---
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s - %(funcName)s')

  # --- Initialization ---
  if API_BACKEND:
      load_dotenv(Path(".env"))
      HF_TOKEN = os.environ.get("HF_TOKEN")
      if not HF_TOKEN:
+         logging.error("HF_TOKEN environment variable not set. Please set it in a .env file.")
          raise ValueError("HF_TOKEN environment variable not set.")
      headers = {"Authorization": f"Bearer {HF_TOKEN}"}
  else:
      import torch
      from transformers import pipeline

      device = 0 if torch.cuda.is_available() else -1
      try:
+         logging.info(f"Initializing local model: {MODEL} on device: {device}")
          speech_recognizer = pipeline(
              task="automatic-speech-recognition",
              model=MODEL,
@@ -42,24 +46,31 @@ else:
              framework="pt",
              device=device,
          )
+         logging.info("Local model initialized successfully.")
      except Exception as e:
+         logging.error(f"Error initializing local model {MODEL}: {e}")
          raise RuntimeError(f"Error initializing local model {MODEL}: {e}")

  videos_out_path = Path("./videos_out")
  videos_out_path.mkdir(parents=True, exist_ok=True)
+ logging.info(f"Output directory created: {videos_out_path}")

  samples_data_files = sorted(Path('examples').glob('*.json'))
  SAMPLES = []
  for file in samples_data_files:
      try:
          with open(file, 'r') as f:
              sample = json.load(f)
+         if 'video' in sample and 'transcription' in sample and 'timestamps' in sample:
+             SAMPLES.append(sample)
+         else:
+             logging.warning(f"Skipping sample file {file} due to missing keys (video, transcription, or timestamps).")
      except (json.JSONDecodeError, FileNotFoundError) as e:
+         logging.error(f"Error loading sample file {file}: {e}")
+
+ VIDEOS = [[sample['video']] for sample in SAMPLES]
+ logging.info(f"Loaded {len(SAMPLES)} example samples.")


  # --- Helper Functions ---
  async def query_api(audio_bytes: bytes):
@@ -74,15 +85,15 @@ async def query_api(audio_bytes: bytes):
              "chunk_length_s": 10,
              "stride_length_s": [4, 2]
          },
+         "options": {"use_gpu": False}
      }).encode("utf-8")

      async with aiohttp.ClientSession() as session:
          for attempt in range(RETRY_ATTEMPTS):
+             logging.info(f'Transcribing from API attempt {attempt + 1}/{RETRY_ATTEMPTS}')
              try:
                  async with session.post(API_URL, headers=headers, data=payload) as response:
+                     logging.info(f"API Response Status: {response.status}")
                      content_type = response.headers.get('Content-Type', '')

                      if response.status == 200 and 'application/json' in content_type:
@@ -91,8 +102,8 @@ async def query_api(audio_bytes: bytes):
                          error_response = await response.json()
                          if 'error' in error_response and 'estimated_time' in error_response:
                              wait_time = error_response['estimated_time']
+                             logging.warning(f"Model loading, waiting for {wait_time} seconds...")
+                             await asyncio.sleep(wait_time + RETRY_DELAY)
                          elif 'error' in error_response:
                              raise RuntimeError(f"API Error: {error_response['error']}")
                          else:
@@ -102,13 +113,13 @@ async def query_api(audio_bytes: bytes):
                              raise RuntimeError(f"Unexpected API response format (Status: {response.status}, Content-Type: {content_type}): {response_text}")

              except aiohttp.ClientError as e:
+                 logging.error(f"AIOHTTP Client Error during API call (Attempt {attempt + 1}): {e}")
              except RuntimeError as e:
+                 logging.error(f"Runtime error during API call (Attempt {attempt + 1}): {e}")

              if attempt < RETRY_ATTEMPTS - 1:
+                 wait_time = RETRY_DELAY * (2 ** attempt)
+                 logging.info(f"Retrying in {wait_time} seconds...")
                  await asyncio.sleep(wait_time)

      raise RuntimeError(f"Failed to get transcription after {RETRY_ATTEMPTS} attempts.")
@@ -120,210 +131,257 @@ def ping_telemetry(name: str):
      This is fire-and-forget and doesn't affect the main process flow.
      """
      url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
+     logging.info(f"Pinging telemetry: {url}")

      async def send_ping():
          try:
              async with aiohttp.ClientSession() as session:
                  async with session.get(url) as response:
+                     logging.info(f"Telemetry pong: {response.status}")
          except aiohttp.ClientError as e:
+             logging.warning(f"Failed to send telemetry ping: {e}")
      asyncio.create_task(send_ping())


  # --- Main Gradio Functions ---
+ async def speech_to_text(video_file_path, progress=gr.Progress()):
      """
      Takes a video path to convert to audio, transcribe audio channel to text and char timestamps.
+     Includes progress reporting.
      """
      if video_file_path is None:
          raise gr.Error("Error: No video input provided.")

      video_path = Path(video_file_path)
      if not video_path.exists():
+         raise gr.Error(f"Error: Video file not found at {video_path}")

+     temp_audio_file = None
      try:
+         progress(0, desc="Converting video to audio...")
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
+             temp_audio_file = Path(tmpfile.name)
+
          loop = asyncio.get_running_loop()
+         await loop.run_in_executor(
              None, lambda: ffmpeg.input(video_path).output(
+                 str(temp_audio_file), format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run()
          )
+         logging.info(f"Video converted to temporary audio file: {temp_audio_file}")
+
+         with open(temp_audio_file, 'rb') as f:
+             audio_memory = f.read()

      except ffmpeg.Error as e:
+         logging.error(f"Error converting video to audio: {e.stderr.decode()}")
          raise gr.Error(f"Error converting video to audio: {e.stderr.decode()}")
      except Exception as e:
+         logging.error(f"An unexpected error occurred during audio conversion: {e}")
          raise gr.Error(f"An unexpected error occurred during audio conversion: {e}")
+     finally:
+         if temp_audio_file and temp_audio_file.exists():
+             os.remove(temp_audio_file)
+             logging.info(f"Cleaned up temporary audio file: {temp_audio_file}")


      ping_telemetry("speech_to_text")
+     progress(0.5, desc="Transcribing audio...")

      if API_BACKEND:
          try:
              inference_response = await query_api(audio_memory)
+             logging.info("Inference Response received from API.")
              if not isinstance(inference_response, dict) or 'text' not in inference_response or 'chunks' not in inference_response:
                  raise RuntimeError(f"Unexpected API response structure: {inference_response}")

              transcription = inference_response["text"].lower()
              timestamps = [[chunk.get("text", "").lower(), chunk.get("timestamp", [None, None])[0], chunk.get("timestamp", [None, None])[1]]
                            for chunk in inference_response.get('chunks', []) if isinstance(chunk, dict)]

              timestamps = [ts for ts in timestamps if ts[1] is not None and ts[2] is not None]

+             progress(1.0, desc="Transcription complete.")
              return (transcription, transcription, timestamps)

          except Exception as e:
+             logging.error(f"Error fetching transcription from API: {e}")
              raise gr.Error(f"Error fetching transcription from API: {e}")

      else:
          try:
+             logging.info(f'Transcribing via local model {MODEL}')
              loop = asyncio.get_running_loop()
              output = await loop.run_in_executor(
                  None, lambda: speech_recognizer(
                      audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))
              )
+             logging.info("Inference complete with local model.")

              if not isinstance(output, dict) or 'text' not in output or 'chunks' not in output:
                  raise RuntimeError(f"Unexpected model output structure: {output}")

              transcription = output["text"].lower()
              timestamps = [[chunk.get("text", "").lower(),
                             chunk.get("timestamp", [None, None])[0] if not isinstance(chunk.get("timestamp", [None, None])[0], list) else chunk.get("timestamp", [None, None])[0][0],
                             chunk.get("timestamp", [None, None])[1] if not isinstance(chunk.get("timestamp", [None, None])[1], list) else chunk.get("timestamp", [None, None])[1][0]
                             ]
                            for chunk in output.get('chunks', []) if isinstance(chunk, dict)]

              timestamps = [ts for ts in timestamps if ts[1] is not None and ts[2] is not None]

+             progress(1.0, desc="Transcription complete.")
              return (transcription, transcription, timestamps)

          except Exception as e:
+             logging.error(f"Error running inference with local model: {e}")
              raise gr.Error(f"Error running inference with local model: {e}")

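(Aside, not part of the commit: the `timestamps` value returned above is a list of `[character, start_seconds, end_seconds]` entries, which is what the cutting function below consumes; illustrative values only:)

```python
# Illustrative only: shape of the speech_to_text return value,
# with hypothetical timings for a clip that starts with "hi".
transcription = "hi there"
timestamps = [
    ["h", 0.00, 0.06],  # [character, start_seconds, end_seconds]
    ["i", 0.06, 0.12],
    [" ", 0.12, 0.14],
    ["t", 0.14, 0.20],
]
```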
+ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps, progress=gr.Progress()):
      """
      Given original video input, text transcript + timestamps,
+     and edited text cuts video segments into a single video.
+     Includes progress reporting and improved timestamp alignment.
      """
      if video_in is None or text_in is None or transcription is None or timestamps is None:
          raise gr.Error("Inputs undefined. Please provide video, transcription, and edited text.")

+     video_path = Path(video_in)
+     if not video_path.exists():
+         raise gr.Error(f"Error: Video file not found at {video_path}")

+     progress(0, desc="Analyzing text differences...")
      d = Differ()
      diff_chars = list(d.compare(transcription, text_in))

+     # --- Improved Timestamp Alignment ---
+     timestamps_to_keep = []
      timestamp_idx = 0
+     diff_idx = 0
+
+     while diff_idx < len(diff_chars) and timestamp_idx < len(timestamps):
+         diff_line = diff_chars[diff_idx]
+         ts_info = timestamps[timestamp_idx]
+         ts_char = ts_info[0]
+
+         if diff_line.startswith(' '):
+             if diff_line[2:].lower() == ts_char.lower():
+                 timestamps_to_keep.append(ts_info)
+                 timestamp_idx += 1
+                 diff_idx += 1
+             else:
+                 logging.warning(f"Timestamp alignment mismatch: Diff char '{diff_line[2:]}' vs Timestamp char '{ts_char}'. Skipping timestamp.")
+                 diff_idx += 1
+
+         elif diff_line.startswith('-'):
+             if diff_line[2:].lower() == ts_char.lower():
                  timestamp_idx += 1
+                 diff_idx += 1
+             else:
+                 logging.warning(f"Timestamp alignment mismatch for deletion: Diff char '{diff_line[2:]}' vs Timestamp char '{ts_char}'. Skipping diff char.")
+                 diff_idx += 1
+
+         elif diff_line.startswith('+'):
+             diff_idx += 1
+
+         elif diff_line.startswith('?'):
+             diff_idx += 1
+
+         else:
+             logging.warning(f"Unexpected diff line format: {diff_line}. Skipping.")
+             diff_idx += 1

+     logging.info(f"Identified {len(timestamps_to_keep)} timestamps to keep after diff alignment.")

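(Aside, not part of the commit: character-level `difflib.Differ.compare` output is what the alignment loop above walks. Each entry is a two-character prefix plus the character, which is why the loop dispatches on the first character and reads the payload from `diff_line[2:]`:)

```python
# Illustrative only: the Differ output format the alignment loop relies on.
from difflib import Differ

d = Differ()
print(list(d.compare("cat", "cut")))
# ['  c', '- a', '+ u', '  t']
```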
+     progress(0.2, desc="Grouping timestamps...")
      grouped_segments = []
      if timestamps_to_keep:
          current_segment = [timestamps_to_keep[0]]
          for i in range(1, len(timestamps_to_keep)):
+             if timestamps_to_keep[i][1] - current_segment[-1][2] < TIMESTAMP_GROUPING_THRESHOLD:
                  current_segment.append(timestamps_to_keep[i])
              else:
                  grouped_segments.append(current_segment)
                  current_segment = [timestamps_to_keep[i]]
+         grouped_segments.append(current_segment)

+     logging.info(f"Grouped timestamps into {len(grouped_segments)} segments.")

+     cut_intervals = [[segment[0][1], segment[-1][2]] for segment in grouped_segments]

      video_file_name = video_path.stem
+     output_video_path = videos_out_path / f"{video_file_name}_cut.mp4"

      if cut_intervals:
+         progress(0.4, desc="Cutting video segments...")
          input_video_stream = ffmpeg.input(video_in)

+         filter_complex_parts = []
+         input_streams = []
+
          for i, interval in enumerate(cut_intervals):
+             start, end = interval
+             filter_complex_parts.append(f"[0:v]trim=start={start},end={end},setpts=PTS-STARTPTS[v{i}]")
+             filter_complex_parts.append(f"[0:a]atrim=start={start},end={end},asetpts=PTS-STARTPTS[a{i}]")
+             input_streams.append(f"[v{i}][a{i}]")

+         concat_input_str = ''.join(input_streams)
+         concat_filter = f"{concat_input_str}concat=n={len(cut_intervals)}:v=1:a=1[outv][outa]"
+         filter_complex_parts.append(concat_filter)

+         filter_complex_str = ';'.join(filter_complex_parts)

          try:
              loop = asyncio.get_running_loop()
              await loop.run_in_executor(
+                 None, lambda: ffmpeg.output(
+                     input_video_stream,
+                     str(output_video_path),
+                     filter_complex=filter_complex_str,
+                     map=['[outv]', '[outa]'],
+                     preset='fast',
+                     crf=23
                  ).overwrite_output().global_args('-loglevel', 'quiet').run()
              )
+             logging.info(f"Video segments cut and concatenated to: {output_video_path}")

          except ffmpeg.Error as e:
+             logging.error(f"Error cutting video: {e.stderr.decode()}")
              raise gr.Error(f"Error cutting video: {e.stderr.decode()}")
          except Exception as e:
+             logging.error(f"An unexpected error occurred during video cutting: {e}")
              raise gr.Error(f"An unexpected error occurred during video cutting: {e}")

      else:
+         logging.warning("No text was kept, creating a short empty video.")
+         try:
+             loop = asyncio.get_running_loop()
+             await loop.run_in_executor(
+                 None, lambda: ffmpeg.input('color=c=black:s=1280x720:d=0.1', f='lavfi').output(
+                     str(output_video_path),
+                     format='mp4',
+                     vcodec='libx264',
+                     pix_fmt='yuv420p',
+                     t='0.1'
+                 ).overwrite_output().global_args('-loglevel', 'quiet').run()
+             )
+             logging.info(f"Created short empty video at: {output_video_path}")
+         except ffmpeg.Error as e:
+             logging.error(f"Error creating empty video: {e.stderr.decode()}")
+             output_video_path = Path(video_in)
+             logging.warning("Failed to create empty video, returning original video path as fallback.")
+         except Exception as e:
+             logging.error(f"An unexpected error occurred during empty video creation: {e}")
+             output_video_path = Path(video_in)
+             logging.warning("Failed to create empty video, returning original video path as fallback.")


      diff_output_tokens = [(token[2:], token[0] if token[0] != ' ' else None)
                            for token in diff_chars]

      ping_telemetry("video_cuts")
+     progress(1.0, desc="Video cutting complete.")

      return (diff_output_tokens, str(output_video_path))

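(Aside, not part of the commit: for two hypothetical kept intervals, the loop above assembles a filter graph string of the following shape, trimming video and audio per interval and concatenating the pieces:)

```python
# Illustrative only: filter_complex string produced for two hypothetical
# intervals [0.5, 2.0] and [3.1, 4.0].
filter_complex_str = (
    "[0:v]trim=start=0.5,end=2.0,setpts=PTS-STARTPTS[v0];"
    "[0:a]atrim=start=0.5,end=2.0,asetpts=PTS-STARTPTS[a0];"
    "[0:v]trim=start=3.1,end=4.0,setpts=PTS-STARTPTS[v1];"
    "[0:a]atrim=start=3.1,end=4.0,asetpts=PTS-STARTPTS[a1];"
    "[v0][a0][v1][a1]concat=n=2:v=1:a=1[outv][outa]"
)
```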
@@ -336,29 +394,30 @@ def load_example(id):
          transcription = sample.get('transcription', '').lower()
          timestamps = sample.get('timestamps', [])
          if video is None:
+             logging.error(f"Example at index {id} is missing video path.")
              raise gr.Error(f"Example at index {id} is missing video path.")
          return (video, transcription, transcription, timestamps)
      else:
+         logging.error(f"Invalid example index: {id}")
          raise gr.Error(f"Invalid example index: {id}")


  # --- Gradio Layout ---
  css = """
  #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
  .output-markdown {max-width: 65ch !important;}
  #video-container{
      max-width: 40rem;
  }
  """
  with gr.Blocks(css=css) as demo:
      transcription_var = gr.State(value="")
      timestamps_var = gr.State(value=[])
      video_in = gr.Video(label="Video file", elem_id="video-container")
      text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
+     video_out = gr.Video(label="Video Out", interactive=False)
+     diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True, show_legend=True)

      gr.Markdown("""
      # Edit Video By Editing Text
@@ -370,24 +429,22 @@ with gr.Blocks(css=css) as demo:
      """)

      with gr.Row():
          examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index", label="Examples")
          examples.click(
              load_example,
              inputs=[examples],
              outputs=[video_in, text_in, transcription_var, timestamps_var],
+             queue=False
          )

      with gr.Row():
          with gr.Column():
+             # video_in is rendered when defined within gr.Blocks
              transcribe_btn = gr.Button("Transcribe Audio")
              transcribe_btn.click(
                  speech_to_text,
                  inputs=[video_in],
                  outputs=[text_in, transcription_var, timestamps_var]
              )

              gr.Markdown("""
@@ -396,37 +453,15 @@ with gr.Blocks(css=css) as demo:

      with gr.Row():
          with gr.Column():
+             # text_in is rendered when defined within gr.Blocks
              with gr.Row():
                  cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
                  cut_btn.click(
                      cut_timestamps_to_video,
                      inputs=[video_in, transcription_var, text_in, timestamps_var],
                      outputs=[diff_out, video_out]
                  )

                  reset_transcription = gr.Button(
                      "Reset to last transcription", elem_id="reset_btn")
+ reset_tran