youngtsai committed on
Commit
5324cd6
·
1 Parent(s): f4feb7d
Files changed (1) hide show
  1. app.py +69 -69
app.py CHANGED
@@ -49,6 +49,7 @@ print(gr.__version__)
49
  if is_env_local:
50
  with open("local_config.json") as f:
51
  config = json.load(f)
 
52
  PASSWORD = config["PASSWORD"]
53
  GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
54
  DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
@@ -64,7 +65,9 @@ if is_env_local:
64
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
65
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
66
  OUTPUT_PATH = config["OUTPUT_PATH"]
 
67
  else:
 
68
  PASSWORD = os.getenv("PASSWORD")
69
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
70
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
@@ -426,54 +429,14 @@ def get_video_duration(video_id):
426
 
427
  def process_transcript_and_screenshots_on_gcs(video_id):
428
  print("====process_transcript_and_screenshots_on_gcs====")
429
- # GCS
430
- bucket_name = 'video_ai_assistant'
431
- # 逐字稿文件名
432
- transcript_file_name = f'{video_id}_transcript.json'
433
- transcript_blob_name = f"{video_id}/{transcript_file_name}"
434
- # 检查逐字稿是否存在
435
- is_new_transcript = False
436
- is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
437
- video_duration = get_video_duration(video_id)
438
- if not is_transcript_exists:
439
- print("逐字稿文件不存在于GCS中,重新建立")
440
- # 从YouTube获取逐字稿并上传
441
- try:
442
- transcript = get_transcript_by_yt_api(video_id)
443
- except:
444
- # call open ai whisper
445
- print("===call open ai whisper===")
446
- transcript = generate_transcription_by_whisper(video_id)
447
-
448
- if transcript:
449
- print("成功獲取字幕")
450
- else:
451
- print("沒有找到字幕")
452
- transcript = generate_transcription_by_whisper(video_id)
453
- if video_duration:
454
- transcript = [entry for entry in transcript if entry['start'] <= video_duration]
455
-
456
- transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
457
- GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
458
-
459
- is_new_transcript = True
460
- else:
461
- # 逐字稿已存在,下载逐字稿内容
462
- print("逐字稿已存在于GCS中")
463
- transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
464
- transcript = json.loads(transcript_text)
465
- if video_duration:
466
- transcript = [entry for entry in transcript if entry['start'] <= video_duration]
467
-
468
- # print("===確認其他衍生文件===")
469
- # source = "gcs"
470
- # get_questions(video_id, transcript_text, source)
471
- # get_video_id_summary(video_id, transcript_text, source)
472
- # get_mind_map(video_id, transcript_text, source)
473
- # print("===確認其他衍生文件 end ===")
474
-
475
 
476
  # 處理截圖
 
477
  for entry in transcript:
478
  if 'img_file_id' not in entry:
479
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
@@ -488,38 +451,67 @@ def process_transcript_and_screenshots_on_gcs(video_id):
488
  if i == 4:
489
  raise gr.Error(f"下载视频失败: {str(e)}")
490
  time.sleep(5)
491
- # 截图
492
- screenshot_path = screenshot_youtube_video(video_id, entry['start'])
493
- screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
494
- img_file_id = GCS_SERVICE.upload_image_and_get_public_url(bucket_name, screenshot_blob_name, screenshot_path)
495
- entry['img_file_id'] = img_file_id
496
- print(f"截图已上传到GCS: {img_file_id}")
497
- is_new_transcript = True
 
 
498
 
499
- # 確認是否更新逐字稿文件
500
  if is_new_transcript:
501
- # 更新逐字稿文件
502
- print("===更新逐字稿文件===")
503
- print(transcript)
504
  print("===更新逐字稿文件===")
505
- updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
506
- GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, updated_transcript_text)
507
- print("逐字稿已更新,包括截图链接")
508
- updated_transcript_json = json.loads(updated_transcript_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  else:
510
- updated_transcript_json = transcript
 
511
 
512
- return updated_transcript_json
 
 
 
 
 
 
 
513
 
514
  def process_youtube_link(password, link):
515
  verify_password(password)
516
-
517
- # 使用 YouTube API 获取逐字稿
518
- # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
519
  video_id = extract_youtube_id(link)
520
-
521
  try:
522
- transcript = process_transcript_and_screenshots_on_gcs(video_id)
 
 
 
523
  except Exception as e:
524
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
525
  print("===process_youtube_link error===")
@@ -2615,6 +2607,8 @@ def init_params(text, request: gr.Request):
2615
  chatbot_ai = gr.update(visible=False)
2616
  ai_chatbot_params = gr.update(visible=True)
2617
 
 
 
2618
  # if youtube_link in query_params
2619
  if "youtube_id" in request.query_params:
2620
  youtube_id = request.query_params["youtube_id"]
@@ -2633,11 +2627,15 @@ def init_params(text, request: gr.Request):
2633
  lesson_plan_accordion = gr.update(visible=False)
2634
  exit_ticket_accordion = gr.update(visible=False)
2635
  ai_chatbot_params = gr.update(visible=False)
 
 
 
2636
 
2637
  return admin, reading_passage_admin, summary_admin, see_detail, \
2638
  worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
2639
  password_text, youtube_link, \
2640
- chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params
 
2641
 
2642
  def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
2643
  # inputs=[content_subject, content_grade, df_string_output],
@@ -2699,6 +2697,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
2699
  # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2700
  user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2701
  youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
 
2702
  with gr.Row() as data_state:
2703
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2704
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
@@ -3567,6 +3566,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3567
  chatbot_open_ai_streaming,
3568
  chatbot_ai,
3569
  ai_chatbot_params,
 
3570
  ]
3571
  demo.load(
3572
  init_params,
 
49
  if is_env_local:
50
  with open("local_config.json") as f:
51
  config = json.load(f)
52
+ IS_ENV_PROD = "False"
53
  PASSWORD = config["PASSWORD"]
54
  GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
55
  DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
 
65
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
66
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
67
  OUTPUT_PATH = config["OUTPUT_PATH"]
68
+
69
  else:
70
+ IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
71
  PASSWORD = os.getenv("PASSWORD")
72
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
73
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
 
429
 
430
  def process_transcript_and_screenshots_on_gcs(video_id):
431
  print("====process_transcript_and_screenshots_on_gcs====")
432
+ transcript, exists = get_transcript_from_gcs(video_id)
433
+ if not exists:
434
+ print("Transcript file does not exist, creating new transcript...")
435
+ transcript = generate_transcription_by_whisper(video_id)
436
+ upload_transcript_to_gcs(video_id, transcript)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
  # 處理截圖
439
+ is_new_transcript = False
440
  for entry in transcript:
441
  if 'img_file_id' not in entry:
442
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
 
451
  if i == 4:
452
  raise gr.Error(f"下载视频失败: {str(e)}")
453
  time.sleep(5)
454
+ try:
455
+ screenshot_path = screenshot_youtube_video(video_id, entry['start'])
456
+ screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
457
+ img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
458
+ entry['img_file_id'] = img_file_id
459
+ print(f"截图已上传到GCS: {img_file_id}")
460
+ is_new_transcript = True
461
+ except Exception as e:
462
+ print(f"Error processing screenshot: {str(e)}")
463
 
 
464
  if is_new_transcript:
 
 
 
465
  print("===更新逐字稿文件===")
466
+ upload_transcript_to_gcs(video_id, transcript)
467
+
468
+ return transcript
469
+
470
def get_transcript(video_id):
    """Return the stored transcript for ``video_id`` after validating it.

    The transcript is looked up through ``get_transcript_from_gcs``; this
    function only reads and checks it.

    Args:
        video_id: Identifier passed through to ``get_transcript_from_gcs``.

    Returns:
        The transcript object returned by ``get_transcript_from_gcs``.

    Raises:
        gr.Error: If no transcript was found, or if at least one transcript
            entry lacks an ``img_file_id`` key.
    """
    print("====get_transcript====")
    stored_transcript, found = get_transcript_from_gcs(video_id)

    # Guard clause: a missing transcript is reported to the caller.
    if not found:
        raise gr.Error("逐字稿文件不存在於GCS中。")

    # Every entry must already carry its screenshot reference.
    for item in stored_transcript:
        if 'img_file_id' not in item:
            raise gr.Error("Some entries in the transcript do not have an associated img_file_id.")

    print("Transcript is verified with all necessary images.")
    return stored_transcript
481
+
482
def get_transcript_from_gcs(video_id):
    """Fetch the transcript JSON for ``video_id`` through ``GCS_SERVICE``.

    The blob is looked up in the ``video_ai_assistant`` bucket under
    ``{video_id}/{video_id}_transcript.json``.

    Args:
        video_id: Identifier used to build the blob path.

    Returns:
        A ``(transcript, exists)`` tuple: the parsed JSON content and ``True``
        when the existence check succeeds, otherwise ``(None, False)``.
    """
    print("Checking for transcript in GCS...")
    bucket = 'video_ai_assistant'
    blob_path = f"{video_id}/{video_id}_transcript.json"

    # Early exit when the existence check reports no blob.
    if not GCS_SERVICE.check_file_exists(bucket, blob_path):
        print("No transcript found for video ID:", video_id)
        return None, False

    # The blob exists: download it and decode the JSON payload.
    raw_text = GCS_SERVICE.download_as_string(bucket, blob_path)
    return json.loads(raw_text), True
496
 
497
def upload_transcript_to_gcs(video_id, transcript):
    """Serialize ``transcript`` to JSON and upload it through ``GCS_SERVICE``.

    The payload is written to the ``video_ai_assistant`` bucket under
    ``{video_id}/{video_id}_transcript.json``.

    Args:
        video_id: Identifier used to build the blob path.
        transcript: JSON-serializable transcript data.
    """
    print("Uploading updated transcript to GCS...")
    bucket = 'video_ai_assistant'
    blob_path = f"{video_id}/{video_id}_transcript.json"
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    payload = json.dumps(transcript, ensure_ascii=False, indent=2)
    GCS_SERVICE.upload_json_string(bucket, blob_path, payload)
    print("Transcript uploaded successfully.")
505
 
506
  def process_youtube_link(password, link):
507
  verify_password(password)
 
 
 
508
  video_id = extract_youtube_id(link)
509
+
510
  try:
511
+ if IS_ENV_PROD == "True":
512
+ transcript = get_transcript(video_id)
513
+ else:
514
+ transcript = process_transcript_and_screenshots_on_gcs(video_id)
515
  except Exception as e:
516
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
517
  print("===process_youtube_link error===")
 
2607
  chatbot_ai = gr.update(visible=False)
2608
  ai_chatbot_params = gr.update(visible=True)
2609
 
2610
+ is_env_prod = gr.update(value=False)
2611
+
2612
  # if youtube_link in query_params
2613
  if "youtube_id" in request.query_params:
2614
  youtube_id = request.query_params["youtube_id"]
 
2627
  lesson_plan_accordion = gr.update(visible=False)
2628
  exit_ticket_accordion = gr.update(visible=False)
2629
  ai_chatbot_params = gr.update(visible=False)
2630
+
2631
+ if IS_ENV_PROD == "True":
2632
+ is_env_prod = gr.update(value=True)
2633
 
2634
  return admin, reading_passage_admin, summary_admin, see_detail, \
2635
  worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
2636
  password_text, youtube_link, \
2637
+ chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params, \
2638
+ is_env_prod
2639
 
2640
  def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
2641
  # inputs=[content_subject, content_grade, df_string_output],
 
2697
  # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2698
  user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2699
  youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
2700
+ is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
2701
  with gr.Row() as data_state:
2702
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2703
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
 
3566
  chatbot_open_ai_streaming,
3567
  chatbot_ai,
3568
  ai_chatbot_params,
3569
+ is_env_prod,
3570
  ]
3571
  demo.load(
3572
  init_params,