youngtsai committed on
Commit
1491bd4
1 Parent(s): f0d8f54

transcript = process_transcript_and_screenshots_on_gcs(video_id)

Browse files
Files changed (1) hide show
  1. app.py +65 -2
app.py CHANGED
@@ -59,6 +59,7 @@ client = OpenAI(api_key=OPEN_AI_KEY)
59
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
60
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
61
 
 
62
  def init_gcs_client(service_account_key_string):
63
  """使用服务账号密钥文件创建 GCS 客户端"""
64
  credentials_json_string = service_account_key_string
@@ -112,6 +113,23 @@ def make_blob_public(gcs_client, bucket_name, blob_name):
112
  blob.make_public()
113
  print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
116
  # Get all files from the folder
117
  query = f"'{drive_folder_id}' in parents and trashed = false"
@@ -141,7 +159,7 @@ def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name,
141
  blob.upload_from_string(file_content)
142
  print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
143
 
144
- # # ====drive====初始化Google Drive服务
145
  def init_drive_service():
146
  credentials_json_string = DRIVE_KEY
147
  credentials_dict = json.loads(credentials_json_string)
@@ -391,6 +409,50 @@ def process_transcript_and_screenshots(video_id):
391
 
392
  return transcript
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  def process_youtube_link(link):
395
  # 使用 YouTube API 获取逐字稿
396
  # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
@@ -400,7 +462,8 @@ def process_youtube_link(link):
400
  download_youtube_video(video_id, output_path=OUTPUT_PATH)
401
 
402
  try:
403
- transcript = process_transcript_and_screenshots(video_id)
 
404
  except Exception as e:
405
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
406
  print("===process_youtube_link error===")
 
59
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
60
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
61
 
62
+ # ====gcs====
63
  def init_gcs_client(service_account_key_string):
64
  """使用服务账号密钥文件创建 GCS 客户端"""
65
  credentials_json_string = service_account_key_string
 
113
  blob.make_public()
114
  print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
115
 
116
def get_blob_public_url(gcs_client, bucket_name, blob_name):
    """Return the public URL of an object in a GCS bucket.

    This only builds a blob handle through ``gcs_client.bucket(...).blob(...)``
    and reads its ``public_url`` attribute; it does not itself change the
    object's access settings.

    Args:
        gcs_client: Client object exposing ``bucket(name)``.
        bucket_name: Name of the bucket that holds the object.
        blob_name: Name (path) of the object inside the bucket.

    Returns:
        The value of the blob handle's ``public_url`` attribute.
    """
    return gcs_client.bucket(bucket_name).blob(blob_name).public_url
121
+
122
def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
    """Upload an image to GCS, make it public, and return its public URL.

    Args:
        gcs_client: GCS client passed through to the helper functions.
        bucket_name: Name of the destination bucket.
        file_name: Object name to store the image under in the bucket.
        file_path: Source image, forwarded unchanged to ``upload_file_to_gcs``.

    Returns:
        The public URL reported by ``get_blob_public_url`` for the object.
    """
    # Step 1: push the image into the bucket.
    upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)

    # Step 2: open the uploaded object up for public access.
    make_blob_public(gcs_client, bucket_name, file_name)

    # Step 3: look up the resulting URL, log it, and hand it back.
    image_url = get_blob_public_url(gcs_client, bucket_name, file_name)
    print(f"Public URL for the uploaded image: {image_url}")
    return image_url
132
+
133
  def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
134
  # Get all files from the folder
135
  query = f"'{drive_folder_id}' in parents and trashed = false"
 
159
  blob.upload_from_string(file_content)
160
  print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
161
 
162
+ # # ====drive====初始化
163
  def init_drive_service():
164
  credentials_json_string = DRIVE_KEY
165
  credentials_dict = json.loads(credentials_json_string)
 
409
 
410
  return transcript
411
 
412
def process_transcript_and_screenshots_on_gcs(video_id):
    """Fetch or load a video's transcript and attach a screenshot URL to each entry.

    The transcript is stored in the ``video_ai_assistant`` bucket as
    ``{video_id}_transcript.json``. If that object already exists it is
    loaded from GCS; otherwise the transcript is fetched with
    ``get_transcript``. Every entry lacking an ``img_file_id`` key gets a
    screenshot taken at ``entry['start']``, uploaded to GCS and made public,
    and the resulting public URL stored under ``img_file_id``. The transcript
    is then written back to GCS whenever it is new or was modified.

    Args:
        video_id: YouTube video identifier; also used as the GCS folder name
            and as the prefix of the transcript and screenshot object names.

    Returns:
        The list of transcript entries (each including ``img_file_id``), or
        an empty list when no transcript could be obtained.
    """
    print("====process_transcript_and_screenshots_on_gcs====")
    gcs_client = init_gcs_client(GCS_KEY)
    bucket_name = 'video_ai_assistant'

    # Make sure the per-video folder exists in the bucket.
    if gcs_check_folder_exists(gcs_client, bucket_name, video_id):
        print(f"GCS folder:{video_id} 已存在")
    else:
        gcs_create_bucket_folder_if_not_exists(gcs_client, bucket_name, video_id)
        print(f"GCS folder:{video_id} 已创建")

    file_name = f'{video_id}_transcript.json'
    transcript_cached = gcs_check_file_exists(gcs_client, bucket_name, file_name)

    if transcript_cached:
        # Reuse the previously stored transcript instead of leaving
        # `transcript` unbound on a cache hit.
        # NOTE(review): assumes the stored object is the JSON text written by
        # the upload at the end of this function - confirm.
        cached_blob = gcs_client.bucket(bucket_name).blob(file_name)
        transcript = json.loads(cached_blob.download_as_text())
    else:
        transcript = get_transcript(video_id)
        if transcript:
            print("成功獲取字幕")
        else:
            print("沒有找到字幕")
            # Nothing to screenshot or store; avoid iterating a falsy
            # (possibly None) transcript and persisting an empty file.
            return []

    # A freshly fetched transcript always needs to be persisted.
    needs_upload = not transcript_cached

    # Add a public screenshot URL to every entry that does not have one yet.
    for entry in transcript:
        if 'img_file_id' in entry:
            continue
        screenshot_path = screenshot_youtube_video(video_id, entry['start'])
        img_file_id = upload_img_and_get_public_url(
            gcs_client,
            bucket_name,
            f"{video_id}_{entry['start']}.jpg",
            screenshot_path,
        )
        entry['img_file_id'] = img_file_id
        needs_upload = True
        print(f"截图已上传到GCS: {img_file_id}")

    if needs_upload:
        updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        # NOTE(review): upload_img_and_get_public_url passes a local file path
        # as the fourth argument of upload_file_to_gcs, whereas JSON text is
        # passed here - confirm the helper accepts raw content.
        upload_file_to_gcs(gcs_client, bucket_name, file_name, updated_transcript_text)
        print("逐字稿已更新,包括截图链接")

    return transcript
452
+
453
+
454
+
455
+
456
  def process_youtube_link(link):
457
  # 使用 YouTube API 获取逐字稿
458
  # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
 
462
  download_youtube_video(video_id, output_path=OUTPUT_PATH)
463
 
464
  try:
465
+ # transcript = process_transcript_and_screenshots(video_id)
466
+ transcript = process_transcript_and_screenshots_on_gcs(video_id)
467
  except Exception as e:
468
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
469
  print("===process_youtube_link error===")