youngtsai committed on
Commit
998d544
1 Parent(s): 10f0966
Files changed (1) hide show
  1. app.py +7 -0
app.py CHANGED
@@ -369,6 +369,10 @@ def generate_transcription_by_whisper(video_id):
369
 
370
  return transcription
371
 
 
 
 
 
372
  def process_transcript_and_screenshots_on_gcs(video_id):
373
  print("====process_transcript_and_screenshots_on_gcs====")
374
  # GCS
@@ -380,6 +384,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
380
  # 检查逐字稿是否存在
381
  is_new_transcript = False
382
  is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
 
383
  if not is_transcript_exists:
384
  print("逐字稿文件不存在于GCS中,重新建立")
385
  # 从YouTube获取逐字稿并上传
@@ -396,6 +401,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
396
  print("沒有找到字幕")
397
  transcript = generate_transcription_by_whisper(video_id)
398
 
 
399
  transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
400
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
401
 
@@ -405,6 +411,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
405
  print("逐字稿已存在于GCS中")
406
  transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
407
  transcript = json.loads(transcript_text)
 
408
 
409
  # print("===確認其他衍生文件===")
410
  # source = "gcs"
 
369
 
370
  return transcription
371
 
372
def get_video_duration(video_id):
    """Return the ``length`` attribute of the YouTube video with this id.

    Builds the standard watch URL from *video_id*, wraps it in a
    ``YouTube`` object and returns that object's ``length``.

    NOTE(review): ``YouTube`` is presumably pytube's class, whose
    ``length`` is the duration in seconds - confirm against the
    file's imports.
    """
    watch_url = f'https://www.youtube.com/watch?v={video_id}'
    return YouTube(watch_url).length
375
+
376
  def process_transcript_and_screenshots_on_gcs(video_id):
377
  print("====process_transcript_and_screenshots_on_gcs====")
378
  # GCS
 
384
  # 检查逐字稿是否存在
385
  is_new_transcript = False
386
  is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
387
+ video_length = get_video_duration(video_id)
388
  if not is_transcript_exists:
389
  print("逐字稿文件不存在于GCS中,重新建立")
390
  # 从YouTube获取逐字稿并上传
 
401
  print("沒有找到字幕")
402
  transcript = generate_transcription_by_whisper(video_id)
403
 
404
+ transcript = [entry for entry in transcript if entry['start'] <= video_length]
405
  transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
406
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
407
 
 
411
  print("逐字稿已存在于GCS中")
412
  transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
413
  transcript = json.loads(transcript_text)
414
+ transcript = [entry for entry in transcript if entry['start'] <= video_length]
415
 
416
  # print("===確認其他衍生文件===")
417
  # source = "gcs"