youngtsai commited on
Commit
1d575bc
1 Parent(s): c205271
Files changed (1) hide show
  1. app.py +188 -222
app.py CHANGED
@@ -49,6 +49,7 @@ print(gr.__version__)
49
  if is_env_local:
50
  with open("local_config.json") as f:
51
  config = json.load(f)
 
52
  PASSWORD = config["PASSWORD"]
53
  GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
54
  DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
@@ -64,7 +65,9 @@ if is_env_local:
64
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
65
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
66
  OUTPUT_PATH = config["OUTPUT_PATH"]
 
67
  else:
 
68
  PASSWORD = os.getenv("PASSWORD")
69
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
70
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
@@ -426,54 +429,14 @@ def get_video_duration(video_id):
426
 
427
  def process_transcript_and_screenshots_on_gcs(video_id):
428
  print("====process_transcript_and_screenshots_on_gcs====")
429
- # GCS
430
- bucket_name = 'video_ai_assistant'
431
- # 逐字稿文件名
432
- transcript_file_name = f'{video_id}_transcript.json'
433
- transcript_blob_name = f"{video_id}/{transcript_file_name}"
434
- # 检查逐字稿是否存在
435
- is_new_transcript = False
436
- is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
437
- video_duration = get_video_duration(video_id)
438
- if not is_transcript_exists:
439
- print("逐字稿文件不存在于GCS中,重新建立")
440
- # 从YouTube获取逐字稿并上传
441
- try:
442
- transcript = get_transcript_by_yt_api(video_id)
443
- except:
444
- # call open ai whisper
445
- print("===call open ai whisper===")
446
- transcript = generate_transcription_by_whisper(video_id)
447
-
448
- if transcript:
449
- print("成功獲取字幕")
450
- else:
451
- print("沒有找到字幕")
452
- transcript = generate_transcription_by_whisper(video_id)
453
- if video_duration:
454
- transcript = [entry for entry in transcript if entry['start'] <= video_duration]
455
-
456
- transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
457
- GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
458
-
459
- is_new_transcript = True
460
- else:
461
- # 逐字稿已存在,下载逐字稿内容
462
- print("逐字稿已存在于GCS中")
463
- transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
464
- transcript = json.loads(transcript_text)
465
- if video_duration:
466
- transcript = [entry for entry in transcript if entry['start'] <= video_duration]
467
-
468
- # print("===確認其他衍生文件===")
469
- # source = "gcs"
470
- # get_questions(video_id, transcript_text, source)
471
- # get_video_id_summary(video_id, transcript_text, source)
472
- # get_mind_map(video_id, transcript_text, source)
473
- # print("===確認其他衍生文件 end ===")
474
-
475
 
476
  # 處理截圖
 
477
  for entry in transcript:
478
  if 'img_file_id' not in entry:
479
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
@@ -488,38 +451,67 @@ def process_transcript_and_screenshots_on_gcs(video_id):
488
  if i == 4:
489
  raise gr.Error(f"下载视频失败: {str(e)}")
490
  time.sleep(5)
491
- # 截图
492
- screenshot_path = screenshot_youtube_video(video_id, entry['start'])
493
- screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
494
- img_file_id = GCS_SERVICE.upload_image_and_get_public_url(bucket_name, screenshot_blob_name, screenshot_path)
495
- entry['img_file_id'] = img_file_id
496
- print(f"截图已上传到GCS: {img_file_id}")
497
- is_new_transcript = True
 
 
498
 
499
- # 確認是否更新逐字稿文件
500
  if is_new_transcript:
501
- # 更新逐字稿文件
502
- print("===更新逐字稿文件===")
503
- print(transcript)
504
  print("===更新逐字稿文件===")
505
- updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
506
- GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, updated_transcript_text)
507
- print("逐字稿已更新,包括截图链接")
508
- updated_transcript_json = json.loads(updated_transcript_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  else:
510
- updated_transcript_json = transcript
 
511
 
512
- return updated_transcript_json
 
 
 
 
 
 
 
513
 
514
- def process_youtube_link(password, link):
515
  verify_password(password)
516
-
517
- # 使用 YouTube API 获取逐字稿
518
- # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
519
  video_id = extract_youtube_id(link)
520
-
521
  try:
522
- transcript = process_transcript_and_screenshots_on_gcs(video_id)
 
 
 
523
  except Exception as e:
524
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
525
  print("===process_youtube_link error===")
@@ -553,21 +545,21 @@ def process_youtube_link(password, link):
553
 
554
  # 基于逐字稿生成其他所需的输出
555
  source = "gcs"
556
- questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
557
  questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
558
- summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
559
  summary_text = summary_json["summary"]
560
  summary = summary_json["summary"]
561
- key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
562
  key_moments = key_moments_json["key_moments"]
563
  key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
564
  key_moments_html = get_key_moments_html(key_moments)
565
  html_content = format_transcript_to_html(formatted_transcript)
566
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
567
- mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
568
  mind_map = mind_map_json["mind_map"]
569
  mind_map_html = get_mind_map_html(mind_map)
570
- reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source)
571
  reading_passage_text = reading_passage_json["reading_passage"]
572
  reading_passage = reading_passage_json["reading_passage"]
573
  meta_data = get_meta_data(video_id)
@@ -711,70 +703,75 @@ def split_data(df_string, word_base=100000):
711
 
712
  return segments
713
 
714
- def generate_content_by_LLM(sys_content, user_content, response_format=None):
715
- # 使用 OpenAI 生成基于上传数据的问题
 
 
 
 
 
 
 
 
 
 
716
 
717
- try:
718
- model = "gpt-4-turbo"
719
- # 使用 OPEN AI 生成 Reading Passage
720
- messages = [
721
- {"role": "system", "content": sys_content},
722
- {"role": "user", "content": user_content}
723
- ]
724
 
725
- request_payload = {
726
- "model": model,
727
- "messages": messages,
728
- "max_tokens": 4000,
729
- "response_format": response_format
730
- }
731
 
732
- if response_format is not None:
733
- request_payload["response_format"] = response_format
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
 
735
- response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
736
- content = response.choices[0].message.content.strip()
737
- except Exception as e:
738
- print(f"Error generating reading passage: {str(e)}")
739
- print("using REDROCK")
740
- # 使用 REDROCK 生成 Reading Passage
741
- messages = [
742
- {"role": "user", "content": user_content}
743
- ]
744
- model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
745
- # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
746
- kwargs = {
747
- "modelId": model_id,
748
- "contentType": "application/json",
749
- "accept": "application/json",
750
- "body": json.dumps({
751
- "anthropic_version": "bedrock-2023-05-31",
752
- "max_tokens": 4000,
753
- "system": sys_content,
754
- "messages": messages
755
- })
756
- }
757
- response = BEDROCK_CLIENT.invoke_model(**kwargs)
758
- response_body = json.loads(response.get('body').read())
759
- content = response_body.get('content')[0].get('text')
760
 
 
 
 
 
 
 
 
761
  print("=====content=====")
762
  print(content)
763
  print("=====content=====")
764
 
765
  return content
766
 
767
- def get_reading_passage(video_id, df_string, source):
768
  if source == "gcs":
769
  print("===get_reading_passage on gcs===")
770
- gcs_client = GCS_CLIENT
771
  bucket_name = 'video_ai_assistant'
772
  file_name = f'{video_id}_reading_passage_latex.json'
773
  blob_name = f"{video_id}/{file_name}"
774
  # 检查 reading_passage 是否存在
775
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
776
  if not is_file_exists:
777
- reading_passage = generate_reading_passage(df_string)
778
  reading_passage_json = {"reading_passage": str(reading_passage)}
779
  reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
780
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
@@ -807,7 +804,7 @@ def get_reading_passage(video_id, df_string, source):
807
 
808
  return reading_passage_json
809
 
810
- def generate_reading_passage(df_string):
811
  print("===generate_reading_passage===")
812
  segments = split_data(df_string, word_base=100000)
813
  all_content = []
@@ -826,7 +823,7 @@ def generate_reading_passage(df_string):
826
  加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
827
  請直接給出文章,不用介紹怎麼處理的或是文章字數等等
828
  """
829
- content = generate_content_by_LLM(sys_content, user_content)
830
  all_content.append(content + "\n")
831
 
832
  # 將所有生成的閱讀理解段落合併成一個完整的文章
@@ -839,7 +836,7 @@ def text_to_speech(video_id, text):
839
  tts.save(filename)
840
  return filename
841
 
842
- def get_mind_map(video_id, df_string, source):
843
  if source == "gcs":
844
  print("===get_mind_map on gcs===")
845
  gcs_client = GCS_CLIENT
@@ -849,7 +846,7 @@ def get_mind_map(video_id, df_string, source):
849
  # 检查檔案是否存在
850
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
851
  if not is_file_exists:
852
- mind_map = generate_mind_map(df_string)
853
  mind_map_json = {"mind_map": str(mind_map)}
854
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
855
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
@@ -870,7 +867,7 @@ def get_mind_map(video_id, df_string, source):
870
  # 检查檔案是否存在
871
  exists, file_id = check_file_exists(service, folder_id, file_name)
872
  if not exists:
873
- mind_map = generate_mind_map(df_string)
874
  mind_map_json = {"mind_map": str(mind_map)}
875
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
876
  upload_content_directly(service, file_name, folder_id, mind_map_text)
@@ -883,7 +880,7 @@ def get_mind_map(video_id, df_string, source):
883
 
884
  return mind_map_json
885
 
886
- def generate_mind_map(df_string):
887
  print("===generate_mind_map===")
888
  segments = split_data(df_string, word_base=100000)
889
  all_content = []
@@ -895,7 +892,7 @@ def generate_mind_map(df_string):
895
  注意:不需要前後文敘述,直接給出 markdown 文本即可
896
  這對我很重要
897
  """
898
- content = generate_content_by_LLM(sys_content, user_content)
899
  all_content.append(content + "\n")
900
 
901
  # 將所有生成的閱讀理解段落合併成一個完整的文章
@@ -914,10 +911,9 @@ def get_mind_map_html(mind_map):
914
  """
915
  return mind_map_html
916
 
917
- def get_video_id_summary(video_id, df_string, source):
918
  if source == "gcs":
919
  print("===get_video_id_summary on gcs===")
920
- gcs_client = GCS_CLIENT
921
  bucket_name = 'video_ai_assistant'
922
  file_name = f'{video_id}_summary_markdown.json'
923
  summary_file_blob_name = f"{video_id}/{file_name}"
@@ -925,7 +921,7 @@ def get_video_id_summary(video_id, df_string, source):
925
  is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
926
  if not is_summary_file_exists:
927
  meta_data = get_meta_data(video_id)
928
- summary = generate_summarise(df_string, meta_data)
929
  summary_json = {"summary": str(summary)}
930
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
931
  GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
@@ -947,7 +943,7 @@ def get_video_id_summary(video_id, df_string, source):
947
  exists, file_id = check_file_exists(service, folder_id, file_name)
948
  if not exists:
949
  meta_data = get_meta_data(video_id)
950
- summary = generate_summarise(df_string, meta_data)
951
  summary_json = {"summary": str(summary)}
952
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
953
 
@@ -968,7 +964,7 @@ def get_video_id_summary(video_id, df_string, source):
968
 
969
  return summary_json
970
 
971
- def generate_summarise(df_string, metadata=None):
972
  print("===generate_summarise===")
973
  # 使用 OpenAI 生成基于上传数据的问题
974
  if metadata:
@@ -1016,7 +1012,7 @@ def generate_summarise(df_string, metadata=None):
1016
  ## ❓ 延伸小問題
1017
  - (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1018
  """
1019
- content = generate_content_by_LLM(sys_content, user_content)
1020
  all_content.append(content + "\n")
1021
 
1022
  if len(all_content) > 1:
@@ -1055,13 +1051,13 @@ def generate_summarise(df_string, metadata=None):
1055
  ## ❓ 延伸小問題
1056
  - ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1057
  """
1058
- final_content = generate_content_by_LLM(sys_content, user_content)
1059
  else:
1060
  final_content = all_content[0]
1061
 
1062
  return final_content
1063
 
1064
- def get_questions(video_id, df_string, source="gcs"):
1065
  if source == "gcs":
1066
  # 去 gcs 確認是有有 video_id_questions.json
1067
  print("===get_questions on gcs===")
@@ -1072,7 +1068,7 @@ def get_questions(video_id, df_string, source="gcs"):
1072
  # 检查檔案是否存在
1073
  is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1074
  if not is_questions_exists:
1075
- questions = generate_questions(df_string)
1076
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1077
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
1078
  print("questions已上傳到GCS")
@@ -1093,7 +1089,7 @@ def get_questions(video_id, df_string, source="gcs"):
1093
  # 检查檔案是否存在
1094
  exists, file_id = check_file_exists(service, folder_id, file_name)
1095
  if not exists:
1096
- questions = generate_questions(df_string)
1097
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1098
  upload_content_directly(service, file_name, folder_id, questions_text)
1099
  print("questions已上傳到Google Drive")
@@ -1113,7 +1109,7 @@ def get_questions(video_id, df_string, source="gcs"):
1113
  print("=====get_questions=====")
1114
  return q1, q2, q3
1115
 
1116
- def generate_questions(df_string):
1117
  print("===generate_questions===")
1118
  # 使用 OpenAI 生成基于上传数据的问题
1119
  if isinstance(df_string, str):
@@ -1136,69 +1132,26 @@ def generate_questions(df_string):
1136
  [q1的敘述text, q2的敘述text, q3的敘述text]
1137
  }}
1138
  """
1139
-
1140
- try:
1141
- model = "gpt-4-turbo"
1142
- messages = [
1143
- {"role": "system", "content": sys_content},
1144
- {"role": "user", "content": user_content}
1145
- ]
1146
- response_format = { "type": "json_object" }
1147
-
1148
- print("=====messages=====")
1149
- print(messages)
1150
- print("=====messages=====")
1151
-
1152
-
1153
- request_payload = {
1154
- "model": model,
1155
- "messages": messages,
1156
- "max_tokens": 4000,
1157
- "response_format": response_format
1158
- }
1159
-
1160
- response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
1161
- questions = json.loads(response.choices[0].message.content)["questions"]
1162
- except:
1163
- messages = [
1164
- {"role": "user", "content": user_content}
1165
- ]
1166
- model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
1167
- # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
1168
- kwargs = {
1169
- "modelId": model_id,
1170
- "contentType": "application/json",
1171
- "accept": "application/json",
1172
- "body": json.dumps({
1173
- "anthropic_version": "bedrock-2023-05-31",
1174
- "max_tokens": 4000,
1175
- "system": sys_content,
1176
- "messages": messages
1177
- })
1178
- }
1179
- response = BEDROCK_CLIENT.invoke_model(**kwargs)
1180
- response_body = json.loads(response.get('body').read())
1181
- response_completion = response_body.get('content')[0].get('text')
1182
- questions = json.loads(response_completion)["questions"]
1183
-
1184
  print("=====json_response=====")
1185
- print(questions)
1186
  print("=====json_response=====")
1187
 
1188
- return questions
1189
 
1190
- def get_questions_answers(video_id, df_string, source="gcs"):
1191
  if source == "gcs":
1192
  try:
1193
  print("===get_questions_answers on gcs===")
1194
- gcs_client = GCS_CLIENT
1195
  bucket_name = 'video_ai_assistant'
1196
  file_name = f'{video_id}_questions_answers.json'
1197
  blob_name = f"{video_id}/{file_name}"
1198
  # 检查檔案是否存在
1199
  is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1200
  if not is_questions_answers_exists:
1201
- questions_answers = generate_questions_answers(df_string)
1202
  questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
1203
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
1204
  print("questions_answers已上傳到GCS")
@@ -1209,12 +1162,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
1209
  questions_answers = json.loads(questions_answers_text)
1210
  except Exception as e:
1211
  print(f"Error getting questions_answers: {str(e)}")
1212
- questions = get_questions(video_id, df_string, source)
1213
- questions_answers = [{"question": q, "answer": ""} for q in questions]
1214
 
1215
  return questions_answers
1216
 
1217
- def generate_questions_answers(df_string):
1218
  print("===generate_questions_answers===")
1219
  segments = split_data(df_string, word_base=100000)
1220
  all_content = []
@@ -1240,7 +1193,7 @@ def generate_questions_answers(df_string):
1240
  }}
1241
  """
1242
  response_format = { "type": "json_object" }
1243
- content = generate_content_by_LLM(sys_content, user_content, response_format)
1244
  content_json = json.loads(content)["questions_answers"]
1245
  all_content += content_json
1246
 
@@ -1264,7 +1217,7 @@ def change_questions(password, df_string):
1264
  print("=====get_questions=====")
1265
  return q1, q2, q3
1266
 
1267
- def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source):
1268
  if source == "gcs":
1269
  print("===get_key_moments on gcs===")
1270
  gcs_client = GCS_CLIENT
@@ -1274,7 +1227,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1274
  # 检查檔案是否存在
1275
  is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1276
  if not is_key_moments_exists:
1277
- key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1278
  key_moments_json = {"key_moments": key_moments}
1279
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1280
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
@@ -1290,7 +1243,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1290
  for key_moment in key_moments_json["key_moments"]:
1291
  if "keywords" not in key_moment:
1292
  transcript = key_moment["transcript"]
1293
- key_moment["keywords"] = generate_key_moments_keywords(transcript)
1294
  print("===keywords===")
1295
  print(key_moment["keywords"])
1296
  print("===keywords===")
@@ -1311,7 +1264,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1311
  # 检查檔案是否存在
1312
  exists, file_id = check_file_exists(service, folder_id, file_name)
1313
  if not exists:
1314
- key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1315
  key_moments_json = {"key_moments": key_moments}
1316
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1317
  upload_content_directly(service, file_name, folder_id, key_moments_text)
@@ -1324,7 +1277,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1324
 
1325
  return key_moments_json
1326
 
1327
- def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1328
  print("===generate_key_moments===")
1329
  segments = split_data(formatted_simple_transcript, word_base=100000)
1330
  all_content = []
@@ -1351,7 +1304,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1351
  }}
1352
  """
1353
  response_format = { "type": "json_object" }
1354
- content = generate_content_by_LLM(sys_content, user_content, response_format)
1355
  key_moments = json.loads(content)["key_moments"]
1356
 
1357
  # "transcript": get text from formatted_simple_transcript
@@ -1379,7 +1332,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1379
 
1380
  return all_content
1381
 
1382
- def generate_key_moments_keywords(transcript):
1383
  print("===generate_key_moments_keywords===")
1384
  segments = split_data(transcript, word_base=100000)
1385
  all_content = []
@@ -1392,7 +1345,7 @@ def generate_key_moments_keywords(transcript):
1392
  不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
1393
  transcript:{segment}
1394
  """
1395
- content = generate_content_by_LLM(sys_content, user_content)
1396
  keywords = content.strip().split(",")
1397
  all_content += keywords
1398
 
@@ -1673,7 +1626,6 @@ def delete_LLM_content(video_id, kind):
1673
 
1674
  def update_LLM_content(video_id, new_content, kind):
1675
  print(f"===upfdate kind on gcs===")
1676
- gcs_client = GCS_CLIENT
1677
  bucket_name = 'video_ai_assistant'
1678
  file_name = f'{video_id}_{kind}.json'
1679
  blob_name = f"{video_id}/{file_name}"
@@ -1747,16 +1699,16 @@ def update_LLM_content(video_id, new_content, kind):
1747
  print(f"{kind} 已更新到GCS")
1748
  return gr.update(value=updated_content, interactive=False)
1749
 
1750
- def create_LLM_content(video_id, df_string, kind):
1751
  print(f"===create_{kind}===")
1752
  print(f"video_id: {video_id}")
1753
 
1754
  if kind == "reading_passage_latex":
1755
- content = generate_reading_passage(df_string)
1756
  update_LLM_content(video_id, content, kind)
1757
  elif kind == "summary_markdown":
1758
  meta_data = get_meta_data(video_id)
1759
- content = generate_summarise(df_string, meta_data)
1760
  update_LLM_content(video_id, content, kind)
1761
  elif kind == "mind_map":
1762
  content = generate_mind_map(df_string)
@@ -1768,7 +1720,7 @@ def create_LLM_content(video_id, df_string, kind):
1768
  transcript = df_string
1769
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1770
  formatted_transcript = create_formatted_transcript(video_id, transcript)
1771
- gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1772
  update_LLM_content(video_id, gen_content, kind)
1773
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1774
  elif kind == "transcript":
@@ -1776,7 +1728,7 @@ def create_LLM_content(video_id, df_string, kind):
1776
  update_LLM_content(video_id, gen_content, kind)
1777
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1778
  elif kind == "questions":
1779
- gen_content = generate_questions(df_string)
1780
  update_LLM_content(video_id, gen_content, kind)
1781
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1782
  elif kind == "questions_answers":
@@ -1785,7 +1737,7 @@ def create_LLM_content(video_id, df_string, kind):
1785
  else:
1786
  transcript = df_string
1787
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1788
- gen_content = generate_questions_answers(formatted_simple_transcript)
1789
  update_LLM_content(video_id, gen_content, kind)
1790
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1791
 
@@ -2615,6 +2567,8 @@ def init_params(text, request: gr.Request):
2615
  chatbot_ai = gr.update(visible=False)
2616
  ai_chatbot_params = gr.update(visible=True)
2617
 
 
 
2618
  # if youtube_link in query_params
2619
  if "youtube_id" in request.query_params:
2620
  youtube_id = request.query_params["youtube_id"]
@@ -2633,11 +2587,15 @@ def init_params(text, request: gr.Request):
2633
  lesson_plan_accordion = gr.update(visible=False)
2634
  exit_ticket_accordion = gr.update(visible=False)
2635
  ai_chatbot_params = gr.update(visible=False)
 
 
 
2636
 
2637
  return admin, reading_passage_admin, summary_admin, see_detail, \
2638
  worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
2639
  password_text, youtube_link, \
2640
- chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params
 
2641
 
2642
  def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
2643
  # inputs=[content_subject, content_grade, df_string_output],
@@ -2692,13 +2650,20 @@ HEAD = """
2692
 
2693
  with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
2694
  with gr.Row() as admin:
2695
- password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
2696
- youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
2697
- video_id = gr.Textbox(label="video_id", visible=True)
2698
- # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
2699
- # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2700
- user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2701
- youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
 
 
 
 
 
 
 
2702
  with gr.Row() as data_state:
2703
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2704
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
@@ -3171,7 +3136,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3171
  )
3172
 
3173
  # 当输入 YouTube 链接时触发
3174
- process_youtube_link_inputs = [password, youtube_link]
3175
  process_youtube_link_outputs = [
3176
  video_id,
3177
  questions_answers_json,
@@ -3252,7 +3217,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3252
  {
3253
  'button': transcript_create_button,
3254
  'action': create_LLM_content,
3255
- 'inputs': [video_id, df_string_output, transcript_kind],
3256
  'outputs': [df_string_output]
3257
  },
3258
  {
@@ -3283,7 +3248,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3283
  {
3284
  'button': reading_passage_create_button,
3285
  'action': create_LLM_content,
3286
- 'inputs': [video_id, df_string_output, reading_passage_kind],
3287
  'outputs': [reading_passage_text]
3288
  },
3289
  {
@@ -3314,7 +3279,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3314
  {
3315
  'button': summary_create_button,
3316
  'action': create_LLM_content,
3317
- 'inputs': [video_id, df_string_output, summary_kind],
3318
  'outputs': [summary_text]
3319
  },
3320
  {
@@ -3345,7 +3310,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3345
  {
3346
  'button': key_moments_create_button,
3347
  'action': create_LLM_content,
3348
- 'inputs': [video_id, df_string_output, key_moments_kind],
3349
  'outputs': [key_moments]
3350
  },
3351
  {
@@ -3376,7 +3341,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3376
  {
3377
  'button': questions_create_button,
3378
  'action': create_LLM_content,
3379
- 'inputs': [video_id, df_string_output, questions_kind],
3380
  'outputs': [questions_json]
3381
  },
3382
  {
@@ -3407,7 +3372,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3407
  {
3408
  'button': questions_answers_create_button,
3409
  'action': create_LLM_content,
3410
- 'inputs': [video_id, df_string_output, questions_answers_kind],
3411
  'outputs': [questions_answers_json]
3412
  },
3413
  {
@@ -3438,7 +3403,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3438
  {
3439
  'button': worksheet_create_button,
3440
  'action': create_LLM_content,
3441
- 'inputs': [video_id, df_string_output, worksheet_kind],
3442
  'outputs': [worksheet_json]
3443
  },
3444
  {
@@ -3567,6 +3532,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3567
  chatbot_open_ai_streaming,
3568
  chatbot_ai,
3569
  ai_chatbot_params,
 
3570
  ]
3571
  demo.load(
3572
  init_params,
 
49
  if is_env_local:
50
  with open("local_config.json") as f:
51
  config = json.load(f)
52
+ IS_ENV_PROD = "False"
53
  PASSWORD = config["PASSWORD"]
54
  GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
55
  DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
 
65
  AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
66
  AWS_REGION_NAME = config["AWS_REGION_NAME"]
67
  OUTPUT_PATH = config["OUTPUT_PATH"]
68
+
69
  else:
70
+ IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
71
  PASSWORD = os.getenv("PASSWORD")
72
  GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
73
  DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
 
429
 
430
  def process_transcript_and_screenshots_on_gcs(video_id):
431
  print("====process_transcript_and_screenshots_on_gcs====")
432
+ transcript, exists = get_transcript_from_gcs(video_id)
433
+ if not exists:
434
+ print("Transcript file does not exist, creating new transcript...")
435
+ transcript = generate_transcription_by_whisper(video_id)
436
+ upload_transcript_to_gcs(video_id, transcript)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
  # 處理截圖
439
+ is_new_transcript = False
440
  for entry in transcript:
441
  if 'img_file_id' not in entry:
442
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
 
451
  if i == 4:
452
  raise gr.Error(f"下载视频失败: {str(e)}")
453
  time.sleep(5)
454
+ try:
455
+ screenshot_path = screenshot_youtube_video(video_id, entry['start'])
456
+ screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
457
+ img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
458
+ entry['img_file_id'] = img_file_id
459
+ print(f"截图已上传到GCS: {img_file_id}")
460
+ is_new_transcript = True
461
+ except Exception as e:
462
+ print(f"Error processing screenshot: {str(e)}")
463
 
 
464
  if is_new_transcript:
 
 
 
465
  print("===更新逐字稿文件===")
466
+ upload_transcript_to_gcs(video_id, transcript)
467
+
468
+ return transcript
469
+
470
def get_transcript(video_id):
    """Fetch the fully-prepared transcript for *video_id* from GCS.

    A transcript is considered ready only when every entry already carries
    an ``img_file_id`` screenshot reference.

    Raises:
        gr.Error: if the transcript JSON is absent from GCS, or if any
            entry is missing its ``img_file_id``.
    """
    print("====get_transcript====")
    transcript, exists = get_transcript_from_gcs(video_id)
    if not exists:
        raise gr.Error("逐字稿文件不存在於GCS中。")

    # Reject transcripts whose screenshot processing never finished.
    for entry in transcript:
        if "img_file_id" not in entry:
            raise gr.Error("Some entries in the transcript do not have an associated img_file_id.")

    print("Transcript is verified with all necessary images.")
    return transcript
481
+
482
def get_transcript_from_gcs(video_id):
    """Look up the cached transcript JSON for *video_id* in GCS.

    Returns:
        tuple: ``(transcript, True)`` when the blob exists and parses,
        ``(None, False)`` when no transcript has been stored yet.
    """
    print("Checking for transcript in GCS...")
    bucket = 'video_ai_assistant'
    # Blob layout: <video_id>/<video_id>_transcript.json
    blob = f"{video_id}/{video_id}_transcript.json"

    if not GCS_SERVICE.check_file_exists(bucket, blob):
        print("No transcript found for video ID:", video_id)
        return None, False

    raw_json = GCS_SERVICE.download_as_string(bucket, blob)
    return json.loads(raw_json), True
496
 
497
def upload_transcript_to_gcs(video_id, transcript):
    """Serialize *transcript* and store it in GCS under the video's folder.

    The blob path mirrors the one read by ``get_transcript_from_gcs``:
    ``<video_id>/<video_id>_transcript.json``.
    """
    print("Uploading updated transcript to GCS...")
    bucket = 'video_ai_assistant'
    blob = f"{video_id}/{video_id}_transcript.json"
    payload = json.dumps(transcript, ensure_ascii=False, indent=2)
    GCS_SERVICE.upload_json_string(bucket, blob, payload)
    print("Transcript uploaded successfully.")
505
 
506
+ def process_youtube_link(password, link, LLM_model=None):
507
  verify_password(password)
 
 
 
508
  video_id = extract_youtube_id(link)
509
+
510
  try:
511
+ if IS_ENV_PROD == "True":
512
+ transcript = get_transcript(video_id)
513
+ else:
514
+ transcript = process_transcript_and_screenshots_on_gcs(video_id)
515
  except Exception as e:
516
  error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
517
  print("===process_youtube_link error===")
 
545
 
546
  # 基于逐字稿生成其他所需的输出
547
  source = "gcs"
548
+ questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source, LLM_model)
549
  questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
550
+ summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source, LLM_model)
551
  summary_text = summary_json["summary"]
552
  summary = summary_json["summary"]
553
+ key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model)
554
  key_moments = key_moments_json["key_moments"]
555
  key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
556
  key_moments_html = get_key_moments_html(key_moments)
557
  html_content = format_transcript_to_html(formatted_transcript)
558
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
559
+ mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source, LLM_model)
560
  mind_map = mind_map_json["mind_map"]
561
  mind_map_html = get_mind_map_html(mind_map)
562
+ reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source, LLM_model)
563
  reading_passage_text = reading_passage_json["reading_passage"]
564
  reading_passage = reading_passage_json["reading_passage"]
565
  meta_data = get_meta_data(video_id)
 
703
 
704
  return segments
705
 
706
def generate_content_by_open_ai(sys_content, user_content, response_format=None):
    """Generate a chat completion through the OpenAI client.

    Args:
        sys_content: system-role prompt text.
        user_content: user-role prompt text.
        response_format: optional dict (e.g. ``{"type": "json_object"}``)
            forwarded to the API only when provided.

    Returns:
        str: the stripped text of the first completion choice.
    """
    print("LLM using OPEN AI")
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {"role": "system", "content": sys_content},
            {"role": "user", "content": user_content},
        ],
        "max_tokens": 4000,
    }
    if response_format is not None:
        payload["response_format"] = response_format

    completion = OPEN_AI_CLIENT.chat.completions.create(**payload)
    return completion.choices[0].message.content.strip()
 
 
 
725
 
726
def generate_content_by_bedrock(sys_content, user_content):
    """Generate a completion with Anthropic Claude 3 via AWS Bedrock.

    Args:
        sys_content: system prompt, passed in the Anthropic ``system`` field.
        user_content: user prompt; a quoting hint is appended to reduce
            JSON-decoding failures in downstream parsers.

    Returns:
        str: the text of the first content block in the model response.
    """
    # Fixed typo in the log line: was "LLM using REDROCK".
    print("LLM using BEDROCK")
    messages = [
        {"role": "user", "content": user_content + "(如果是 JSON 格式,value 的引號,請用單引號,或是用反斜線+雙引號,避免 JSON Decoder error )"}
    ]
    model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
    # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
    kwargs = {
        "modelId": model_id,
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4000,
            "system": sys_content,
            "messages": messages
        })
    }
    response = BEDROCK_CLIENT.invoke_model(**kwargs)
    # Bedrock returns a StreamingBody; read and parse it once.
    response_body = json.loads(response.get('body').read())
    content = response_body.get('content')[0].get('text')
    return content
748
 
749
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None):
    """Dispatch content generation to the backend selected by *LLM_model*.

    ``"anthropic-claude-3-sonnet"`` routes to Bedrock; anything else
    (including ``None``) routes to OpenAI.

    NOTE(review): *response_format* is forwarded only on the OpenAI path;
    the Bedrock path silently ignores it.
    """
    # The model name was printed identically in both branches; hoisted here.
    print(f"LLM: {LLM_model}")
    if LLM_model == "anthropic-claude-3-sonnet":
        content = generate_content_by_bedrock(sys_content, user_content)
    else:
        content = generate_content_by_open_ai(sys_content, user_content, response_format)

    print("=====content=====")
    print(content)
    print("=====content=====")

    return content
764
 
765
+ def get_reading_passage(video_id, df_string, source, LLM_model=None):
766
  if source == "gcs":
767
  print("===get_reading_passage on gcs===")
 
768
  bucket_name = 'video_ai_assistant'
769
  file_name = f'{video_id}_reading_passage_latex.json'
770
  blob_name = f"{video_id}/{file_name}"
771
  # 检查 reading_passage 是否存在
772
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
773
  if not is_file_exists:
774
+ reading_passage = generate_reading_passage(df_string, LLM_model)
775
  reading_passage_json = {"reading_passage": str(reading_passage)}
776
  reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
777
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
 
804
 
805
  return reading_passage_json
806
 
807
+ def generate_reading_passage(df_string, LLM_model=None):
808
  print("===generate_reading_passage===")
809
  segments = split_data(df_string, word_base=100000)
810
  all_content = []
 
823
  加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
824
  請直接給出文章,不用介紹怎麼處理的或是文章字數等等
825
  """
826
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
827
  all_content.append(content + "\n")
828
 
829
  # 將所有生成的閱讀理解段落合併成一個完整的文章
 
836
  tts.save(filename)
837
  return filename
838
 
839
+ def get_mind_map(video_id, df_string, source, LLM_model=None):
840
  if source == "gcs":
841
  print("===get_mind_map on gcs===")
842
  gcs_client = GCS_CLIENT
 
846
  # 检查檔案是否存在
847
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
848
  if not is_file_exists:
849
+ mind_map = generate_mind_map(df_string, LLM_model)
850
  mind_map_json = {"mind_map": str(mind_map)}
851
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
852
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
 
867
  # 检查檔案是否存在
868
  exists, file_id = check_file_exists(service, folder_id, file_name)
869
  if not exists:
870
+ mind_map = generate_mind_map(df_string, LLM_model)
871
  mind_map_json = {"mind_map": str(mind_map)}
872
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
873
  upload_content_directly(service, file_name, folder_id, mind_map_text)
 
880
 
881
  return mind_map_json
882
 
883
+ def generate_mind_map(df_string, LLM_model=None):
884
  print("===generate_mind_map===")
885
  segments = split_data(df_string, word_base=100000)
886
  all_content = []
 
892
  注意:不需要前後文敘述,直接給出 markdown 文本即可
893
  這對我很重要
894
  """
895
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
896
  all_content.append(content + "\n")
897
 
898
  # 將所有生成的閱讀理解段落合併成一個完整的文章
 
911
  """
912
  return mind_map_html
913
 
914
+ def get_video_id_summary(video_id, df_string, source, LLM_model=None):
915
  if source == "gcs":
916
  print("===get_video_id_summary on gcs===")
 
917
  bucket_name = 'video_ai_assistant'
918
  file_name = f'{video_id}_summary_markdown.json'
919
  summary_file_blob_name = f"{video_id}/{file_name}"
 
921
  is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
922
  if not is_summary_file_exists:
923
  meta_data = get_meta_data(video_id)
924
+ summary = generate_summarise(df_string, meta_data, LLM_model)
925
  summary_json = {"summary": str(summary)}
926
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
927
  GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
 
943
  exists, file_id = check_file_exists(service, folder_id, file_name)
944
  if not exists:
945
  meta_data = get_meta_data(video_id)
946
+ summary = generate_summarise(df_string, meta_data, LLM_model)
947
  summary_json = {"summary": str(summary)}
948
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
949
 
 
964
 
965
  return summary_json
966
 
967
+ def generate_summarise(df_string, metadata=None, LLM_model=None):
968
  print("===generate_summarise===")
969
  # 使用 OpenAI 生成基于上传数据的问题
970
  if metadata:
 
1012
  ## ❓ 延伸小問題
1013
  - (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1014
  """
1015
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1016
  all_content.append(content + "\n")
1017
 
1018
  if len(all_content) > 1:
 
1051
  ## ❓ 延伸小問題
1052
  - ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1053
  """
1054
+ final_content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1055
  else:
1056
  final_content = all_content[0]
1057
 
1058
  return final_content
1059
 
1060
+ def get_questions(video_id, df_string, source="gcs", LLM_model=None):
1061
  if source == "gcs":
1062
  # 去 gcs 確認是有有 video_id_questions.json
1063
  print("===get_questions on gcs===")
 
1068
  # 检查檔案是否存在
1069
  is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1070
  if not is_questions_exists:
1071
+ questions = generate_questions(df_string, LLM_model)
1072
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1073
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
1074
  print("questions已上傳到GCS")
 
1089
  # 检查檔案是否存在
1090
  exists, file_id = check_file_exists(service, folder_id, file_name)
1091
  if not exists:
1092
+ questions = generate_questions(df_string, LLM_model)
1093
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1094
  upload_content_directly(service, file_name, folder_id, questions_text)
1095
  print("questions已上傳到Google Drive")
 
1109
  print("=====get_questions=====")
1110
  return q1, q2, q3
1111
 
1112
+ def generate_questions(df_string, LLM_model=None):
1113
  print("===generate_questions===")
1114
  # 使用 OpenAI 生成基于上传数据的问题
1115
  if isinstance(df_string, str):
 
1132
  [q1的敘述text, q2的敘述text, q3的敘述text]
1133
  }}
1134
  """
1135
+ response_format = { "type": "json_object" }
1136
+ questions = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1137
+ questions_list = json.loads(questions)["questions"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
  print("=====json_response=====")
1139
+ print(questions_list)
1140
  print("=====json_response=====")
1141
 
1142
+ return questions_list
1143
 
1144
+ def get_questions_answers(video_id, df_string, source="gcs", LLM_model=None):
1145
  if source == "gcs":
1146
  try:
1147
  print("===get_questions_answers on gcs===")
 
1148
  bucket_name = 'video_ai_assistant'
1149
  file_name = f'{video_id}_questions_answers.json'
1150
  blob_name = f"{video_id}/{file_name}"
1151
  # 检查檔案是否存在
1152
  is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1153
  if not is_questions_answers_exists:
1154
+ questions_answers = generate_questions_answers(df_string, LLM_model)
1155
  questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
1156
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
1157
  print("questions_answers已上傳到GCS")
 
1162
  questions_answers = json.loads(questions_answers_text)
1163
  except Exception as e:
1164
  print(f"Error getting questions_answers: {str(e)}")
1165
+ questions_list = get_questions(video_id, df_string, source, LLM_model)
1166
+ questions_answers = [{"question": q, "answer": ""} for q in questions_list]
1167
 
1168
  return questions_answers
1169
 
1170
+ def generate_questions_answers(df_string, LLM_model=None):
1171
  print("===generate_questions_answers===")
1172
  segments = split_data(df_string, word_base=100000)
1173
  all_content = []
 
1193
  }}
1194
  """
1195
  response_format = { "type": "json_object" }
1196
+ content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1197
  content_json = json.loads(content)["questions_answers"]
1198
  all_content += content_json
1199
 
 
1217
  print("=====get_questions=====")
1218
  return q1, q2, q3
1219
 
1220
+ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model=None):
1221
  if source == "gcs":
1222
  print("===get_key_moments on gcs===")
1223
  gcs_client = GCS_CLIENT
 
1227
  # 检查檔案是否存在
1228
  is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1229
  if not is_key_moments_exists:
1230
+ key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1231
  key_moments_json = {"key_moments": key_moments}
1232
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1233
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
 
1243
  for key_moment in key_moments_json["key_moments"]:
1244
  if "keywords" not in key_moment:
1245
  transcript = key_moment["transcript"]
1246
+ key_moment["keywords"] = generate_key_moments_keywords(transcript, LLM_model)
1247
  print("===keywords===")
1248
  print(key_moment["keywords"])
1249
  print("===keywords===")
 
1264
  # 检查檔案是否存在
1265
  exists, file_id = check_file_exists(service, folder_id, file_name)
1266
  if not exists:
1267
+ key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1268
  key_moments_json = {"key_moments": key_moments}
1269
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1270
  upload_content_directly(service, file_name, folder_id, key_moments_text)
 
1277
 
1278
  return key_moments_json
1279
 
1280
+ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model=None):
1281
  print("===generate_key_moments===")
1282
  segments = split_data(formatted_simple_transcript, word_base=100000)
1283
  all_content = []
 
1304
  }}
1305
  """
1306
  response_format = { "type": "json_object" }
1307
+ content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1308
  key_moments = json.loads(content)["key_moments"]
1309
 
1310
  # "transcript": get text from formatted_simple_transcript
 
1332
 
1333
  return all_content
1334
 
1335
+ def generate_key_moments_keywords(transcript, LLM_model=None):
1336
  print("===generate_key_moments_keywords===")
1337
  segments = split_data(transcript, word_base=100000)
1338
  all_content = []
 
1345
  不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
1346
  transcript:{segment}
1347
  """
1348
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1349
  keywords = content.strip().split(",")
1350
  all_content += keywords
1351
 
 
1626
 
1627
  def update_LLM_content(video_id, new_content, kind):
1628
  print(f"===upfdate kind on gcs===")
 
1629
  bucket_name = 'video_ai_assistant'
1630
  file_name = f'{video_id}_{kind}.json'
1631
  blob_name = f"{video_id}/{file_name}"
 
1699
  print(f"{kind} 已更新到GCS")
1700
  return gr.update(value=updated_content, interactive=False)
1701
 
1702
+ def create_LLM_content(video_id, df_string, kind, LLM_model=None):
1703
  print(f"===create_{kind}===")
1704
  print(f"video_id: {video_id}")
1705
 
1706
  if kind == "reading_passage_latex":
1707
+ content = generate_reading_passage(df_string, LLM_model)
1708
  update_LLM_content(video_id, content, kind)
1709
  elif kind == "summary_markdown":
1710
  meta_data = get_meta_data(video_id)
1711
+ content = generate_summarise(df_string, meta_data, LLM_model)
1712
  update_LLM_content(video_id, content, kind)
1713
  elif kind == "mind_map":
1714
  content = generate_mind_map(df_string)
 
1720
  transcript = df_string
1721
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1722
  formatted_transcript = create_formatted_transcript(video_id, transcript)
1723
+ gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1724
  update_LLM_content(video_id, gen_content, kind)
1725
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1726
  elif kind == "transcript":
 
1728
  update_LLM_content(video_id, gen_content, kind)
1729
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1730
  elif kind == "questions":
1731
+ gen_content = generate_questions(df_string, LLM_model)
1732
  update_LLM_content(video_id, gen_content, kind)
1733
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1734
  elif kind == "questions_answers":
 
1737
  else:
1738
  transcript = df_string
1739
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1740
+ gen_content = generate_questions_answers(formatted_simple_transcript, LLM_model)
1741
  update_LLM_content(video_id, gen_content, kind)
1742
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1743
 
 
2567
  chatbot_ai = gr.update(visible=False)
2568
  ai_chatbot_params = gr.update(visible=True)
2569
 
2570
+ is_env_prod = gr.update(value=False)
2571
+
2572
  # if youtube_link in query_params
2573
  if "youtube_id" in request.query_params:
2574
  youtube_id = request.query_params["youtube_id"]
 
2587
  lesson_plan_accordion = gr.update(visible=False)
2588
  exit_ticket_accordion = gr.update(visible=False)
2589
  ai_chatbot_params = gr.update(visible=False)
2590
+
2591
+ if IS_ENV_PROD == "True":
2592
+ is_env_prod = gr.update(value=True)
2593
 
2594
  return admin, reading_passage_admin, summary_admin, see_detail, \
2595
  worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
2596
  password_text, youtube_link, \
2597
+ chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params, \
2598
+ is_env_prod
2599
 
2600
  def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
2601
  # inputs=[content_subject, content_grade, df_string_output],
 
2650
 
2651
  with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
2652
  with gr.Row() as admin:
2653
+ with gr.Column(scale=4):
2654
+ with gr.Row():
2655
+ password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
2656
+ youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
2657
+ video_id = gr.Textbox(label="video_id", visible=True)
2658
+ # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
2659
+ # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2660
+ user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2661
+ with gr.Row():
2662
+ is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
2663
+ LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4", visible=True, interactive=True)
2664
+ with gr.Column(scale=1):
2665
+ with gr.Row():
2666
+ youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
2667
  with gr.Row() as data_state:
2668
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2669
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
 
3136
  )
3137
 
3138
  # 当输入 YouTube 链接时触发
3139
+ process_youtube_link_inputs = [password, youtube_link, LLM_model]
3140
  process_youtube_link_outputs = [
3141
  video_id,
3142
  questions_answers_json,
 
3217
  {
3218
  'button': transcript_create_button,
3219
  'action': create_LLM_content,
3220
+ 'inputs': [video_id, df_string_output, transcript_kind, LLM_model],
3221
  'outputs': [df_string_output]
3222
  },
3223
  {
 
3248
  {
3249
  'button': reading_passage_create_button,
3250
  'action': create_LLM_content,
3251
+ 'inputs': [video_id, df_string_output, reading_passage_kind, LLM_model],
3252
  'outputs': [reading_passage_text]
3253
  },
3254
  {
 
3279
  {
3280
  'button': summary_create_button,
3281
  'action': create_LLM_content,
3282
+ 'inputs': [video_id, df_string_output, summary_kind, LLM_model],
3283
  'outputs': [summary_text]
3284
  },
3285
  {
 
3310
  {
3311
  'button': key_moments_create_button,
3312
  'action': create_LLM_content,
3313
+ 'inputs': [video_id, df_string_output, key_moments_kind, LLM_model],
3314
  'outputs': [key_moments]
3315
  },
3316
  {
 
3341
  {
3342
  'button': questions_create_button,
3343
  'action': create_LLM_content,
3344
+ 'inputs': [video_id, df_string_output, questions_kind, LLM_model],
3345
  'outputs': [questions_json]
3346
  },
3347
  {
 
3372
  {
3373
  'button': questions_answers_create_button,
3374
  'action': create_LLM_content,
3375
+ 'inputs': [video_id, df_string_output, questions_answers_kind, LLM_model],
3376
  'outputs': [questions_answers_json]
3377
  },
3378
  {
 
3403
  {
3404
  'button': worksheet_create_button,
3405
  'action': create_LLM_content,
3406
+ 'inputs': [video_id, df_string_output, worksheet_kind, LLM_model],
3407
  'outputs': [worksheet_json]
3408
  },
3409
  {
 
3532
  chatbot_open_ai_streaming,
3533
  chatbot_ai,
3534
  ai_chatbot_params,
3535
+ is_env_prod,
3536
  ]
3537
  demo.load(
3538
  init_params,