video_bot_999

Sleeping

youngtsai committed on Apr 22

Commit

998d544

•

1 Parent(s): 10f0966

update

Files changed (1) hide show

app.py CHANGED Viewed

@@ -369,6 +369,10 @@ def generate_transcription_by_whisper(video_id):
     return transcription
 def process_transcript_and_screenshots_on_gcs(video_id):
     print("====process_transcript_and_screenshots_on_gcs====")
     # GCS
@@ -380,6 +384,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
     # 检查逐字稿是否存在
     is_new_transcript = False
     is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
     if not is_transcript_exists:
         print("逐字稿文件不存在于GCS中，重新建立")
         # 从YouTube获取逐字稿并上传
@@ -396,6 +401,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
             print("沒有找到字幕")
             transcript = generate_transcription_by_whisper(video_id)
         transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
         GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
@@ -405,6 +411,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
         print("逐字稿已存在于GCS中")
         transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
         transcript = json.loads(transcript_text)
     # print("===確認其他衍生文件===")
     # source = "gcs"

     return transcription
+def get_video_duration(video_id):  # Return the `length` reported for the YouTube video with this ID.
+    yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')  # Builds the watch URL from video_id; `YouTube` is defined outside this view (presumably pytube's class -- TODO confirm).
+    return yt.length  # Presumably seconds: the caller compares this against transcript entry['start'] -- TODO confirm units.
 def process_transcript_and_screenshots_on_gcs(video_id):
     print("====process_transcript_and_screenshots_on_gcs====")
     # GCS
     # 检查逐字稿是否存在
     is_new_transcript = False
     is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
+    video_length = get_video_duration(video_id)
     if not is_transcript_exists:
         print("逐字稿文件不存在于GCS中，重新建立")
         # 从YouTube获取逐字稿并上传
             print("沒有找到字幕")
             transcript = generate_transcription_by_whisper(video_id)
+        transcript = [entry for entry in transcript if entry['start'] <= video_length]
         transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
         GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
         print("逐字稿已存在于GCS中")
         transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
         transcript = json.loads(transcript_text)
+        transcript = [entry for entry in transcript if entry['start'] <= video_length]
     # print("===確認其他衍生文件===")
     # source = "gcs"