Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -369,6 +369,10 @@ def generate_transcription_by_whisper(video_id):
|
|
369 |
|
370 |
return transcription
|
371 |
|
|
|
|
|
|
|
|
|
372 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
373 |
print("====process_transcript_and_screenshots_on_gcs====")
|
374 |
# GCS
|
@@ -380,6 +384,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
380 |
# 检查逐字稿是否存在
|
381 |
is_new_transcript = False
|
382 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
|
|
383 |
if not is_transcript_exists:
|
384 |
print("逐字稿文件不存在于GCS中,重新建立")
|
385 |
# 从YouTube获取逐字稿并上传
|
@@ -396,6 +401,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
396 |
print("沒有找到字幕")
|
397 |
transcript = generate_transcription_by_whisper(video_id)
|
398 |
|
|
|
399 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
400 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
401 |
|
@@ -405,6 +411,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
405 |
print("逐字稿已存在于GCS中")
|
406 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
407 |
transcript = json.loads(transcript_text)
|
|
|
408 |
|
409 |
# print("===確認其他衍生文件===")
|
410 |
# source = "gcs"
|
|
|
369 |
|
370 |
return transcription
|
371 |
|
372 |
+
def get_video_duration(video_id):
|
373 |
+
yt = YouTube(f'https://www.youtube.com/watch?v={video_id}')
|
374 |
+
return yt.length
|
375 |
+
|
376 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
377 |
print("====process_transcript_and_screenshots_on_gcs====")
|
378 |
# GCS
|
|
|
384 |
# 检查逐字稿是否存在
|
385 |
is_new_transcript = False
|
386 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
387 |
+
video_length = get_video_duration(video_id)
|
388 |
if not is_transcript_exists:
|
389 |
print("逐字稿文件不存在于GCS中,重新建立")
|
390 |
# 从YouTube获取逐字稿并上传
|
|
|
401 |
print("沒有找到字幕")
|
402 |
transcript = generate_transcription_by_whisper(video_id)
|
403 |
|
404 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_length]
|
405 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
406 |
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
407 |
|
|
|
411 |
print("逐字稿已存在于GCS中")
|
412 |
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
413 |
transcript = json.loads(transcript_text)
|
414 |
+
transcript = [entry for entry in transcript if entry['start'] <= video_length]
|
415 |
|
416 |
# print("===確認其他衍生文件===")
|
417 |
# source = "gcs"
|