Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -49,6 +49,7 @@ print(gr.__version__)
|
|
49 |
if is_env_local:
|
50 |
with open("local_config.json") as f:
|
51 |
config = json.load(f)
|
|
|
52 |
PASSWORD = config["PASSWORD"]
|
53 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
54 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
@@ -64,7 +65,9 @@ if is_env_local:
|
|
64 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
65 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
66 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
|
|
67 |
else:
|
|
|
68 |
PASSWORD = os.getenv("PASSWORD")
|
69 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
70 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
@@ -426,54 +429,14 @@ def get_video_duration(video_id):
|
|
426 |
|
427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
# 检查逐字稿是否存在
|
435 |
-
is_new_transcript = False
|
436 |
-
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
437 |
-
video_duration = get_video_duration(video_id)
|
438 |
-
if not is_transcript_exists:
|
439 |
-
print("逐字稿文件不存在于GCS中,重新建立")
|
440 |
-
# 从YouTube获取逐字稿并上传
|
441 |
-
try:
|
442 |
-
transcript = get_transcript_by_yt_api(video_id)
|
443 |
-
except:
|
444 |
-
# call open ai whisper
|
445 |
-
print("===call open ai whisper===")
|
446 |
-
transcript = generate_transcription_by_whisper(video_id)
|
447 |
-
|
448 |
-
if transcript:
|
449 |
-
print("成功獲取字幕")
|
450 |
-
else:
|
451 |
-
print("沒有找到字幕")
|
452 |
-
transcript = generate_transcription_by_whisper(video_id)
|
453 |
-
if video_duration:
|
454 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
455 |
-
|
456 |
-
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
457 |
-
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
458 |
-
|
459 |
-
is_new_transcript = True
|
460 |
-
else:
|
461 |
-
# 逐字稿已存在,下载逐字稿内容
|
462 |
-
print("逐字稿已存在于GCS中")
|
463 |
-
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
464 |
-
transcript = json.loads(transcript_text)
|
465 |
-
if video_duration:
|
466 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
467 |
-
|
468 |
-
# print("===確認其他衍生文件===")
|
469 |
-
# source = "gcs"
|
470 |
-
# get_questions(video_id, transcript_text, source)
|
471 |
-
# get_video_id_summary(video_id, transcript_text, source)
|
472 |
-
# get_mind_map(video_id, transcript_text, source)
|
473 |
-
# print("===確認其他衍生文件 end ===")
|
474 |
-
|
475 |
|
476 |
# 處理截圖
|
|
|
477 |
for entry in transcript:
|
478 |
if 'img_file_id' not in entry:
|
479 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
@@ -488,38 +451,67 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
488 |
if i == 4:
|
489 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
490 |
time.sleep(5)
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
|
|
|
|
498 |
|
499 |
-
# 確認是否更新逐字稿文件
|
500 |
if is_new_transcript:
|
501 |
-
# 更新逐字稿文件
|
502 |
-
print("===更新逐字稿文件===")
|
503 |
-
print(transcript)
|
504 |
print("===更新逐字稿文件===")
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
else:
|
510 |
-
|
|
|
511 |
|
512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
|
514 |
-
def process_youtube_link(password, link):
|
515 |
verify_password(password)
|
516 |
-
|
517 |
-
# 使用 YouTube API 获取逐字稿
|
518 |
-
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
519 |
video_id = extract_youtube_id(link)
|
520 |
-
|
521 |
try:
|
522 |
-
|
|
|
|
|
|
|
523 |
except Exception as e:
|
524 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
525 |
print("===process_youtube_link error===")
|
@@ -553,21 +545,21 @@ def process_youtube_link(password, link):
|
|
553 |
|
554 |
# 基于逐字稿生成其他所需的输出
|
555 |
source = "gcs"
|
556 |
-
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
|
557 |
questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
558 |
-
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
|
559 |
summary_text = summary_json["summary"]
|
560 |
summary = summary_json["summary"]
|
561 |
-
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
|
562 |
key_moments = key_moments_json["key_moments"]
|
563 |
key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
|
564 |
key_moments_html = get_key_moments_html(key_moments)
|
565 |
html_content = format_transcript_to_html(formatted_transcript)
|
566 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
567 |
-
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
|
568 |
mind_map = mind_map_json["mind_map"]
|
569 |
mind_map_html = get_mind_map_html(mind_map)
|
570 |
-
reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source)
|
571 |
reading_passage_text = reading_passage_json["reading_passage"]
|
572 |
reading_passage = reading_passage_json["reading_passage"]
|
573 |
meta_data = get_meta_data(video_id)
|
@@ -711,70 +703,75 @@ def split_data(df_string, word_base=100000):
|
|
711 |
|
712 |
return segments
|
713 |
|
714 |
-
def
|
715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
716 |
|
717 |
-
|
718 |
-
|
719 |
-
# 使用 OPEN AI 生成 Reading Passage
|
720 |
-
messages = [
|
721 |
-
{"role": "system", "content": sys_content},
|
722 |
-
{"role": "user", "content": user_content}
|
723 |
-
]
|
724 |
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
"max_tokens": 4000,
|
729 |
-
"response_format": response_format
|
730 |
-
}
|
731 |
|
732 |
-
|
733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
734 |
|
735 |
-
|
736 |
-
|
737 |
-
except Exception as e:
|
738 |
-
print(f"Error generating reading passage: {str(e)}")
|
739 |
-
print("using REDROCK")
|
740 |
-
# 使用 REDROCK 生成 Reading Passage
|
741 |
-
messages = [
|
742 |
-
{"role": "user", "content": user_content}
|
743 |
-
]
|
744 |
-
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
|
745 |
-
# model_id = "anthropic.claude-3-haiku-20240307-v1:0"
|
746 |
-
kwargs = {
|
747 |
-
"modelId": model_id,
|
748 |
-
"contentType": "application/json",
|
749 |
-
"accept": "application/json",
|
750 |
-
"body": json.dumps({
|
751 |
-
"anthropic_version": "bedrock-2023-05-31",
|
752 |
-
"max_tokens": 4000,
|
753 |
-
"system": sys_content,
|
754 |
-
"messages": messages
|
755 |
-
})
|
756 |
-
}
|
757 |
-
response = BEDROCK_CLIENT.invoke_model(**kwargs)
|
758 |
-
response_body = json.loads(response.get('body').read())
|
759 |
-
content = response_body.get('content')[0].get('text')
|
760 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
761 |
print("=====content=====")
|
762 |
print(content)
|
763 |
print("=====content=====")
|
764 |
|
765 |
return content
|
766 |
|
767 |
-
def get_reading_passage(video_id, df_string, source):
|
768 |
if source == "gcs":
|
769 |
print("===get_reading_passage on gcs===")
|
770 |
-
gcs_client = GCS_CLIENT
|
771 |
bucket_name = 'video_ai_assistant'
|
772 |
file_name = f'{video_id}_reading_passage_latex.json'
|
773 |
blob_name = f"{video_id}/{file_name}"
|
774 |
# 检查 reading_passage 是否存在
|
775 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
776 |
if not is_file_exists:
|
777 |
-
reading_passage = generate_reading_passage(df_string)
|
778 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
779 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
780 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
@@ -807,7 +804,7 @@ def get_reading_passage(video_id, df_string, source):
|
|
807 |
|
808 |
return reading_passage_json
|
809 |
|
810 |
-
def generate_reading_passage(df_string):
|
811 |
print("===generate_reading_passage===")
|
812 |
segments = split_data(df_string, word_base=100000)
|
813 |
all_content = []
|
@@ -826,7 +823,7 @@ def generate_reading_passage(df_string):
|
|
826 |
加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
|
827 |
請直接給出文章,不用介紹怎麼處理的或是文章字數等等
|
828 |
"""
|
829 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
830 |
all_content.append(content + "\n")
|
831 |
|
832 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
@@ -839,7 +836,7 @@ def text_to_speech(video_id, text):
|
|
839 |
tts.save(filename)
|
840 |
return filename
|
841 |
|
842 |
-
def get_mind_map(video_id, df_string, source):
|
843 |
if source == "gcs":
|
844 |
print("===get_mind_map on gcs===")
|
845 |
gcs_client = GCS_CLIENT
|
@@ -849,7 +846,7 @@ def get_mind_map(video_id, df_string, source):
|
|
849 |
# 检查檔案是否存在
|
850 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
851 |
if not is_file_exists:
|
852 |
-
mind_map = generate_mind_map(df_string)
|
853 |
mind_map_json = {"mind_map": str(mind_map)}
|
854 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
855 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
@@ -870,7 +867,7 @@ def get_mind_map(video_id, df_string, source):
|
|
870 |
# 检查檔案是否存在
|
871 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
872 |
if not exists:
|
873 |
-
mind_map = generate_mind_map(df_string)
|
874 |
mind_map_json = {"mind_map": str(mind_map)}
|
875 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
876 |
upload_content_directly(service, file_name, folder_id, mind_map_text)
|
@@ -883,7 +880,7 @@ def get_mind_map(video_id, df_string, source):
|
|
883 |
|
884 |
return mind_map_json
|
885 |
|
886 |
-
def generate_mind_map(df_string):
|
887 |
print("===generate_mind_map===")
|
888 |
segments = split_data(df_string, word_base=100000)
|
889 |
all_content = []
|
@@ -895,7 +892,7 @@ def generate_mind_map(df_string):
|
|
895 |
注意:不需要前後文敘述,直接給出 markdown 文本即可
|
896 |
這對我很重要
|
897 |
"""
|
898 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
899 |
all_content.append(content + "\n")
|
900 |
|
901 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
@@ -914,10 +911,9 @@ def get_mind_map_html(mind_map):
|
|
914 |
"""
|
915 |
return mind_map_html
|
916 |
|
917 |
-
def get_video_id_summary(video_id, df_string, source):
|
918 |
if source == "gcs":
|
919 |
print("===get_video_id_summary on gcs===")
|
920 |
-
gcs_client = GCS_CLIENT
|
921 |
bucket_name = 'video_ai_assistant'
|
922 |
file_name = f'{video_id}_summary_markdown.json'
|
923 |
summary_file_blob_name = f"{video_id}/{file_name}"
|
@@ -925,7 +921,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
925 |
is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
|
926 |
if not is_summary_file_exists:
|
927 |
meta_data = get_meta_data(video_id)
|
928 |
-
summary = generate_summarise(df_string, meta_data)
|
929 |
summary_json = {"summary": str(summary)}
|
930 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
931 |
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
@@ -947,7 +943,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
947 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
948 |
if not exists:
|
949 |
meta_data = get_meta_data(video_id)
|
950 |
-
summary = generate_summarise(df_string, meta_data)
|
951 |
summary_json = {"summary": str(summary)}
|
952 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
953 |
|
@@ -968,7 +964,7 @@ def get_video_id_summary(video_id, df_string, source):
|
|
968 |
|
969 |
return summary_json
|
970 |
|
971 |
-
def generate_summarise(df_string, metadata=None):
|
972 |
print("===generate_summarise===")
|
973 |
# 使用 OpenAI 生成基于上传数据的问题
|
974 |
if metadata:
|
@@ -1016,7 +1012,7 @@ def generate_summarise(df_string, metadata=None):
|
|
1016 |
## ❓ 延伸小問題
|
1017 |
- (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
1018 |
"""
|
1019 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
1020 |
all_content.append(content + "\n")
|
1021 |
|
1022 |
if len(all_content) > 1:
|
@@ -1055,13 +1051,13 @@ def generate_summarise(df_string, metadata=None):
|
|
1055 |
## ❓ 延伸小問題
|
1056 |
- ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
1057 |
"""
|
1058 |
-
final_content = generate_content_by_LLM(sys_content, user_content)
|
1059 |
else:
|
1060 |
final_content = all_content[0]
|
1061 |
|
1062 |
return final_content
|
1063 |
|
1064 |
-
def get_questions(video_id, df_string, source="gcs"):
|
1065 |
if source == "gcs":
|
1066 |
# 去 gcs 確認是有有 video_id_questions.json
|
1067 |
print("===get_questions on gcs===")
|
@@ -1072,7 +1068,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
1072 |
# 检查檔案是否存在
|
1073 |
is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1074 |
if not is_questions_exists:
|
1075 |
-
questions = generate_questions(df_string)
|
1076 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
1077 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
1078 |
print("questions已上傳到GCS")
|
@@ -1093,7 +1089,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
1093 |
# 检查檔案是否存在
|
1094 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
1095 |
if not exists:
|
1096 |
-
questions = generate_questions(df_string)
|
1097 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
1098 |
upload_content_directly(service, file_name, folder_id, questions_text)
|
1099 |
print("questions已上傳到Google Drive")
|
@@ -1113,7 +1109,7 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
1113 |
print("=====get_questions=====")
|
1114 |
return q1, q2, q3
|
1115 |
|
1116 |
-
def generate_questions(df_string):
|
1117 |
print("===generate_questions===")
|
1118 |
# 使用 OpenAI 生成基于上传数据的问题
|
1119 |
if isinstance(df_string, str):
|
@@ -1136,69 +1132,26 @@ def generate_questions(df_string):
|
|
1136 |
[q1的敘述text, q2的敘述text, q3的敘述text]
|
1137 |
}}
|
1138 |
"""
|
1139 |
-
|
1140 |
-
|
1141 |
-
|
1142 |
-
messages = [
|
1143 |
-
{"role": "system", "content": sys_content},
|
1144 |
-
{"role": "user", "content": user_content}
|
1145 |
-
]
|
1146 |
-
response_format = { "type": "json_object" }
|
1147 |
-
|
1148 |
-
print("=====messages=====")
|
1149 |
-
print(messages)
|
1150 |
-
print("=====messages=====")
|
1151 |
-
|
1152 |
-
|
1153 |
-
request_payload = {
|
1154 |
-
"model": model,
|
1155 |
-
"messages": messages,
|
1156 |
-
"max_tokens": 4000,
|
1157 |
-
"response_format": response_format
|
1158 |
-
}
|
1159 |
-
|
1160 |
-
response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
|
1161 |
-
questions = json.loads(response.choices[0].message.content)["questions"]
|
1162 |
-
except:
|
1163 |
-
messages = [
|
1164 |
-
{"role": "user", "content": user_content}
|
1165 |
-
]
|
1166 |
-
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
|
1167 |
-
# model_id = "anthropic.claude-3-haiku-20240307-v1:0"
|
1168 |
-
kwargs = {
|
1169 |
-
"modelId": model_id,
|
1170 |
-
"contentType": "application/json",
|
1171 |
-
"accept": "application/json",
|
1172 |
-
"body": json.dumps({
|
1173 |
-
"anthropic_version": "bedrock-2023-05-31",
|
1174 |
-
"max_tokens": 4000,
|
1175 |
-
"system": sys_content,
|
1176 |
-
"messages": messages
|
1177 |
-
})
|
1178 |
-
}
|
1179 |
-
response = BEDROCK_CLIENT.invoke_model(**kwargs)
|
1180 |
-
response_body = json.loads(response.get('body').read())
|
1181 |
-
response_completion = response_body.get('content')[0].get('text')
|
1182 |
-
questions = json.loads(response_completion)["questions"]
|
1183 |
-
|
1184 |
print("=====json_response=====")
|
1185 |
-
print(
|
1186 |
print("=====json_response=====")
|
1187 |
|
1188 |
-
return
|
1189 |
|
1190 |
-
def get_questions_answers(video_id, df_string, source="gcs"):
|
1191 |
if source == "gcs":
|
1192 |
try:
|
1193 |
print("===get_questions_answers on gcs===")
|
1194 |
-
gcs_client = GCS_CLIENT
|
1195 |
bucket_name = 'video_ai_assistant'
|
1196 |
file_name = f'{video_id}_questions_answers.json'
|
1197 |
blob_name = f"{video_id}/{file_name}"
|
1198 |
# 检查檔案是否存在
|
1199 |
is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1200 |
if not is_questions_answers_exists:
|
1201 |
-
questions_answers = generate_questions_answers(df_string)
|
1202 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
1203 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
1204 |
print("questions_answers已上傳到GCS")
|
@@ -1209,12 +1162,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
|
|
1209 |
questions_answers = json.loads(questions_answers_text)
|
1210 |
except Exception as e:
|
1211 |
print(f"Error getting questions_answers: {str(e)}")
|
1212 |
-
|
1213 |
-
questions_answers = [{"question": q, "answer": ""} for q in
|
1214 |
|
1215 |
return questions_answers
|
1216 |
|
1217 |
-
def generate_questions_answers(df_string):
|
1218 |
print("===generate_questions_answers===")
|
1219 |
segments = split_data(df_string, word_base=100000)
|
1220 |
all_content = []
|
@@ -1240,7 +1193,7 @@ def generate_questions_answers(df_string):
|
|
1240 |
}}
|
1241 |
"""
|
1242 |
response_format = { "type": "json_object" }
|
1243 |
-
content = generate_content_by_LLM(sys_content, user_content, response_format)
|
1244 |
content_json = json.loads(content)["questions_answers"]
|
1245 |
all_content += content_json
|
1246 |
|
@@ -1264,7 +1217,7 @@ def change_questions(password, df_string):
|
|
1264 |
print("=====get_questions=====")
|
1265 |
return q1, q2, q3
|
1266 |
|
1267 |
-
def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source):
|
1268 |
if source == "gcs":
|
1269 |
print("===get_key_moments on gcs===")
|
1270 |
gcs_client = GCS_CLIENT
|
@@ -1274,7 +1227,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1274 |
# 检查檔案是否存在
|
1275 |
is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1276 |
if not is_key_moments_exists:
|
1277 |
-
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
1278 |
key_moments_json = {"key_moments": key_moments}
|
1279 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1280 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
@@ -1290,7 +1243,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1290 |
for key_moment in key_moments_json["key_moments"]:
|
1291 |
if "keywords" not in key_moment:
|
1292 |
transcript = key_moment["transcript"]
|
1293 |
-
key_moment["keywords"] = generate_key_moments_keywords(transcript)
|
1294 |
print("===keywords===")
|
1295 |
print(key_moment["keywords"])
|
1296 |
print("===keywords===")
|
@@ -1311,7 +1264,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1311 |
# 检查檔案是否存在
|
1312 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
1313 |
if not exists:
|
1314 |
-
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
1315 |
key_moments_json = {"key_moments": key_moments}
|
1316 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1317 |
upload_content_directly(service, file_name, folder_id, key_moments_text)
|
@@ -1324,7 +1277,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1324 |
|
1325 |
return key_moments_json
|
1326 |
|
1327 |
-
def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
1328 |
print("===generate_key_moments===")
|
1329 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
1330 |
all_content = []
|
@@ -1351,7 +1304,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
1351 |
}}
|
1352 |
"""
|
1353 |
response_format = { "type": "json_object" }
|
1354 |
-
content = generate_content_by_LLM(sys_content, user_content, response_format)
|
1355 |
key_moments = json.loads(content)["key_moments"]
|
1356 |
|
1357 |
# "transcript": get text from formatted_simple_transcript
|
@@ -1379,7 +1332,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
1379 |
|
1380 |
return all_content
|
1381 |
|
1382 |
-
def generate_key_moments_keywords(transcript):
|
1383 |
print("===generate_key_moments_keywords===")
|
1384 |
segments = split_data(transcript, word_base=100000)
|
1385 |
all_content = []
|
@@ -1392,7 +1345,7 @@ def generate_key_moments_keywords(transcript):
|
|
1392 |
不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
|
1393 |
transcript:{segment}
|
1394 |
"""
|
1395 |
-
content = generate_content_by_LLM(sys_content, user_content)
|
1396 |
keywords = content.strip().split(",")
|
1397 |
all_content += keywords
|
1398 |
|
@@ -1673,7 +1626,6 @@ def delete_LLM_content(video_id, kind):
|
|
1673 |
|
1674 |
def update_LLM_content(video_id, new_content, kind):
|
1675 |
print(f"===upfdate kind on gcs===")
|
1676 |
-
gcs_client = GCS_CLIENT
|
1677 |
bucket_name = 'video_ai_assistant'
|
1678 |
file_name = f'{video_id}_{kind}.json'
|
1679 |
blob_name = f"{video_id}/{file_name}"
|
@@ -1747,16 +1699,16 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1747 |
print(f"{kind} 已更新到GCS")
|
1748 |
return gr.update(value=updated_content, interactive=False)
|
1749 |
|
1750 |
-
def create_LLM_content(video_id, df_string, kind):
|
1751 |
print(f"===create_{kind}===")
|
1752 |
print(f"video_id: {video_id}")
|
1753 |
|
1754 |
if kind == "reading_passage_latex":
|
1755 |
-
content = generate_reading_passage(df_string)
|
1756 |
update_LLM_content(video_id, content, kind)
|
1757 |
elif kind == "summary_markdown":
|
1758 |
meta_data = get_meta_data(video_id)
|
1759 |
-
content = generate_summarise(df_string, meta_data)
|
1760 |
update_LLM_content(video_id, content, kind)
|
1761 |
elif kind == "mind_map":
|
1762 |
content = generate_mind_map(df_string)
|
@@ -1768,7 +1720,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
1768 |
transcript = df_string
|
1769 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
1770 |
formatted_transcript = create_formatted_transcript(video_id, transcript)
|
1771 |
-
gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
1772 |
update_LLM_content(video_id, gen_content, kind)
|
1773 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1774 |
elif kind == "transcript":
|
@@ -1776,7 +1728,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
1776 |
update_LLM_content(video_id, gen_content, kind)
|
1777 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1778 |
elif kind == "questions":
|
1779 |
-
gen_content = generate_questions(df_string)
|
1780 |
update_LLM_content(video_id, gen_content, kind)
|
1781 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1782 |
elif kind == "questions_answers":
|
@@ -1785,7 +1737,7 @@ def create_LLM_content(video_id, df_string, kind):
|
|
1785 |
else:
|
1786 |
transcript = df_string
|
1787 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
1788 |
-
gen_content = generate_questions_answers(formatted_simple_transcript)
|
1789 |
update_LLM_content(video_id, gen_content, kind)
|
1790 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1791 |
|
@@ -2615,6 +2567,8 @@ def init_params(text, request: gr.Request):
|
|
2615 |
chatbot_ai = gr.update(visible=False)
|
2616 |
ai_chatbot_params = gr.update(visible=True)
|
2617 |
|
|
|
|
|
2618 |
# if youtube_link in query_params
|
2619 |
if "youtube_id" in request.query_params:
|
2620 |
youtube_id = request.query_params["youtube_id"]
|
@@ -2633,11 +2587,15 @@ def init_params(text, request: gr.Request):
|
|
2633 |
lesson_plan_accordion = gr.update(visible=False)
|
2634 |
exit_ticket_accordion = gr.update(visible=False)
|
2635 |
ai_chatbot_params = gr.update(visible=False)
|
|
|
|
|
|
|
2636 |
|
2637 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
2638 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
2639 |
password_text, youtube_link, \
|
2640 |
-
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params
|
|
|
2641 |
|
2642 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
2643 |
# inputs=[content_subject, content_grade, df_string_output],
|
@@ -2692,13 +2650,20 @@ HEAD = """
|
|
2692 |
|
2693 |
with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
|
2694 |
with gr.Row() as admin:
|
2695 |
-
|
2696 |
-
|
2697 |
-
|
2698 |
-
|
2699 |
-
|
2700 |
-
|
2701 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2702 |
with gr.Row() as data_state:
|
2703 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2704 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
@@ -3171,7 +3136,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3171 |
)
|
3172 |
|
3173 |
# 当输入 YouTube 链接时触发
|
3174 |
-
process_youtube_link_inputs = [password, youtube_link]
|
3175 |
process_youtube_link_outputs = [
|
3176 |
video_id,
|
3177 |
questions_answers_json,
|
@@ -3252,7 +3217,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3252 |
{
|
3253 |
'button': transcript_create_button,
|
3254 |
'action': create_LLM_content,
|
3255 |
-
'inputs': [video_id, df_string_output, transcript_kind],
|
3256 |
'outputs': [df_string_output]
|
3257 |
},
|
3258 |
{
|
@@ -3283,7 +3248,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3283 |
{
|
3284 |
'button': reading_passage_create_button,
|
3285 |
'action': create_LLM_content,
|
3286 |
-
'inputs': [video_id, df_string_output, reading_passage_kind],
|
3287 |
'outputs': [reading_passage_text]
|
3288 |
},
|
3289 |
{
|
@@ -3314,7 +3279,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3314 |
{
|
3315 |
'button': summary_create_button,
|
3316 |
'action': create_LLM_content,
|
3317 |
-
'inputs': [video_id, df_string_output, summary_kind],
|
3318 |
'outputs': [summary_text]
|
3319 |
},
|
3320 |
{
|
@@ -3345,7 +3310,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3345 |
{
|
3346 |
'button': key_moments_create_button,
|
3347 |
'action': create_LLM_content,
|
3348 |
-
'inputs': [video_id, df_string_output, key_moments_kind],
|
3349 |
'outputs': [key_moments]
|
3350 |
},
|
3351 |
{
|
@@ -3376,7 +3341,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3376 |
{
|
3377 |
'button': questions_create_button,
|
3378 |
'action': create_LLM_content,
|
3379 |
-
'inputs': [video_id, df_string_output, questions_kind],
|
3380 |
'outputs': [questions_json]
|
3381 |
},
|
3382 |
{
|
@@ -3407,7 +3372,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3407 |
{
|
3408 |
'button': questions_answers_create_button,
|
3409 |
'action': create_LLM_content,
|
3410 |
-
'inputs': [video_id, df_string_output, questions_answers_kind],
|
3411 |
'outputs': [questions_answers_json]
|
3412 |
},
|
3413 |
{
|
@@ -3438,7 +3403,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3438 |
{
|
3439 |
'button': worksheet_create_button,
|
3440 |
'action': create_LLM_content,
|
3441 |
-
'inputs': [video_id, df_string_output, worksheet_kind],
|
3442 |
'outputs': [worksheet_json]
|
3443 |
},
|
3444 |
{
|
@@ -3567,6 +3532,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3567 |
chatbot_open_ai_streaming,
|
3568 |
chatbot_ai,
|
3569 |
ai_chatbot_params,
|
|
|
3570 |
]
|
3571 |
demo.load(
|
3572 |
init_params,
|
|
|
49 |
if is_env_local:
|
50 |
with open("local_config.json") as f:
|
51 |
config = json.load(f)
|
52 |
+
IS_ENV_PROD = "False"
|
53 |
PASSWORD = config["PASSWORD"]
|
54 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
55 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
|
|
65 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
66 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
67 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
68 |
+
|
69 |
else:
|
70 |
+
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
71 |
PASSWORD = os.getenv("PASSWORD")
|
72 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
73 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
|
|
429 |
|
430 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
431 |
print("====process_transcript_and_screenshots_on_gcs====")
|
432 |
+
transcript, exists = get_transcript_from_gcs(video_id)
|
433 |
+
if not exists:
|
434 |
+
print("Transcript file does not exist, creating new transcript...")
|
435 |
+
transcript = generate_transcription_by_whisper(video_id)
|
436 |
+
upload_transcript_to_gcs(video_id, transcript)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
|
438 |
# 處理截圖
|
439 |
+
is_new_transcript = False
|
440 |
for entry in transcript:
|
441 |
if 'img_file_id' not in entry:
|
442 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
|
|
451 |
if i == 4:
|
452 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
453 |
time.sleep(5)
|
454 |
+
try:
|
455 |
+
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
456 |
+
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
457 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
|
458 |
+
entry['img_file_id'] = img_file_id
|
459 |
+
print(f"截图已上传到GCS: {img_file_id}")
|
460 |
+
is_new_transcript = True
|
461 |
+
except Exception as e:
|
462 |
+
print(f"Error processing screenshot: {str(e)}")
|
463 |
|
|
|
464 |
if is_new_transcript:
|
|
|
|
|
|
|
465 |
print("===更新逐字稿文件===")
|
466 |
+
upload_transcript_to_gcs(video_id, transcript)
|
467 |
+
|
468 |
+
return transcript
|
469 |
+
|
470 |
+
def get_transcript(video_id):
|
471 |
+
print("====get_transcript====")
|
472 |
+
transcript, exists = get_transcript_from_gcs(video_id)
|
473 |
+
if not exists:
|
474 |
+
raise gr.Error("逐字稿文件不存在於GCS中。")
|
475 |
+
|
476 |
+
if any('img_file_id' not in entry for entry in transcript):
|
477 |
+
raise gr.Error("Some entries in the transcript do not have an associated img_file_id.")
|
478 |
+
|
479 |
+
print("Transcript is verified with all necessary images.")
|
480 |
+
return transcript
|
481 |
+
|
482 |
+
def get_transcript_from_gcs(video_id):
|
483 |
+
print("Checking for transcript in GCS...")
|
484 |
+
bucket_name = 'video_ai_assistant'
|
485 |
+
transcript_file_name = f'{video_id}_transcript.json'
|
486 |
+
transcript_blob_name = f"{video_id}/{transcript_file_name}"
|
487 |
+
# Check if the transcript exists in GCS
|
488 |
+
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
489 |
+
if is_transcript_exists:
|
490 |
+
# Download the transcript if it exists
|
491 |
+
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
492 |
+
return json.loads(transcript_text), True
|
493 |
else:
|
494 |
+
print("No transcript found for video ID:", video_id)
|
495 |
+
return None, False
|
496 |
|
497 |
+
def upload_transcript_to_gcs(video_id, transcript):
    """Serialize ``transcript`` to JSON and upload it to the GCS bucket.

    The JSON text (non-ASCII characters kept as-is, 2-space indent) is written
    to ``<video_id>/<video_id>_transcript.json`` in the ``video_ai_assistant``
    bucket via ``GCS_SERVICE.upload_json_string``.
    """
    print("Uploading updated transcript to GCS...")
    blob_path = f"{video_id}/{video_id}_transcript.json"
    payload = json.dumps(transcript, ensure_ascii=False, indent=2)
    GCS_SERVICE.upload_json_string('video_ai_assistant', blob_path, payload)
    print("Transcript uploaded successfully.")
|
505 |
|
506 |
+
def process_youtube_link(password, link, LLM_model=None):
|
507 |
verify_password(password)
|
|
|
|
|
|
|
508 |
video_id = extract_youtube_id(link)
|
509 |
+
|
510 |
try:
|
511 |
+
if IS_ENV_PROD == "True":
|
512 |
+
transcript = get_transcript(video_id)
|
513 |
+
else:
|
514 |
+
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
515 |
except Exception as e:
|
516 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
517 |
print("===process_youtube_link error===")
|
|
|
545 |
|
546 |
# 基于逐字稿生成其他所需的输出
|
547 |
source = "gcs"
|
548 |
+
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source, LLM_model)
|
549 |
questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
550 |
+
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source, LLM_model)
|
551 |
summary_text = summary_json["summary"]
|
552 |
summary = summary_json["summary"]
|
553 |
+
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model)
|
554 |
key_moments = key_moments_json["key_moments"]
|
555 |
key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
|
556 |
key_moments_html = get_key_moments_html(key_moments)
|
557 |
html_content = format_transcript_to_html(formatted_transcript)
|
558 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
559 |
+
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source, LLM_model)
|
560 |
mind_map = mind_map_json["mind_map"]
|
561 |
mind_map_html = get_mind_map_html(mind_map)
|
562 |
+
reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source, LLM_model)
|
563 |
reading_passage_text = reading_passage_json["reading_passage"]
|
564 |
reading_passage = reading_passage_json["reading_passage"]
|
565 |
meta_data = get_meta_data(video_id)
|
|
|
703 |
|
704 |
return segments
|
705 |
|
706 |
+
def generate_content_by_open_ai(sys_content, user_content, response_format=None):
    """Run one chat completion through ``OPEN_AI_CLIENT`` and return its text.

    Args:
        sys_content: Text sent as the ``system`` message.
        user_content: Text sent as the ``user`` message.
        response_format: Optional value forwarded as the request's
            ``response_format``; left out of the request when ``None``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped.
    """
    print("LLM using OPEN AI")
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {"role": "system", "content": sys_content},
            {"role": "user", "content": user_content},
        ],
        "max_tokens": 4000,
    }

    # The key is only added when the caller supplied a format.
    if response_format is not None:
        payload["response_format"] = response_format

    completion = OPEN_AI_CLIENT.chat.completions.create(**payload)
    return completion.choices[0].message.content.strip()
|
|
|
|
|
|
|
725 |
|
726 |
+
def generate_content_by_bedrock(sys_content, user_content):
    """Generate text with the Claude 3 Sonnet model through ``BEDROCK_CLIENT``.

    Args:
        sys_content: Text sent as the request's ``system`` field.
        user_content: Text sent as the single ``user`` message. A fixed
            instruction is appended to it (see below).

    Returns:
        The ``text`` of the first item in the response body's ``content``
        list.
    """
    # Fixed: the log line previously misspelled the backend as "REDROCK".
    print("LLM using BEDROCK")

    # The appended suffix tells the model, when it answers in JSON, to quote
    # values with single quotes or backslash-escaped double quotes so the
    # result does not trigger a JSON decoder error.
    messages = [
        {"role": "user", "content": user_content +"(如果是 JSON 格式,value 的引號,請用單引號,或是用反斜線+雙引號,避免 JSON Decoder error )"}
    ]
    model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
    # Alternative model ID kept for reference:
    # "anthropic.claude-3-haiku-20240307-v1:0"
    kwargs = {
        "modelId": model_id,
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4000,
            "system": sys_content,
            "messages": messages
        })
    }
    response = BEDROCK_CLIENT.invoke_model(**kwargs)
    # The response body is read and parsed as JSON before the text is extracted.
    response_body = json.loads(response.get('body').read())
    content = response_body.get('content')[0].get('text')
    return content
|
748 |
|
749 |
+
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None):
    """Dispatch a generation request to the backend selected by ``LLM_model``.

    ``"anthropic-claude-3-sonnet"`` routes to ``generate_content_by_bedrock``
    (which is not given ``response_format``); any other value, including
    ``None``, routes to ``generate_content_by_open_ai``. The generated text is
    printed between marker lines and then returned.
    """
    use_bedrock = LLM_model == "anthropic-claude-3-sonnet"
    print(f"LLM: {LLM_model}")

    if use_bedrock:
        generated = generate_content_by_bedrock(sys_content, user_content)
    else:
        generated = generate_content_by_open_ai(sys_content, user_content, response_format)

    # Echo the raw model output to the log, framed by marker lines.
    marker = "=====content====="
    print(marker)
    print(generated)
    print(marker)

    return generated
|
764 |
|
765 |
+
def get_reading_passage(video_id, df_string, source, LLM_model=None):
|
766 |
if source == "gcs":
|
767 |
print("===get_reading_passage on gcs===")
|
|
|
768 |
bucket_name = 'video_ai_assistant'
|
769 |
file_name = f'{video_id}_reading_passage_latex.json'
|
770 |
blob_name = f"{video_id}/{file_name}"
|
771 |
# 检查 reading_passage 是否存在
|
772 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
773 |
if not is_file_exists:
|
774 |
+
reading_passage = generate_reading_passage(df_string, LLM_model)
|
775 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
776 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
777 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
|
|
804 |
|
805 |
return reading_passage_json
|
806 |
|
807 |
+
def generate_reading_passage(df_string, LLM_model=None):
|
808 |
print("===generate_reading_passage===")
|
809 |
segments = split_data(df_string, word_base=100000)
|
810 |
all_content = []
|
|
|
823 |
加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
|
824 |
請直接給出文章,不用介紹怎麼處理的或是文章字數等等
|
825 |
"""
|
826 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
827 |
all_content.append(content + "\n")
|
828 |
|
829 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
|
836 |
tts.save(filename)
|
837 |
return filename
|
838 |
|
839 |
+
def get_mind_map(video_id, df_string, source, LLM_model=None):
|
840 |
if source == "gcs":
|
841 |
print("===get_mind_map on gcs===")
|
842 |
gcs_client = GCS_CLIENT
|
|
|
846 |
# 检查檔案是否存在
|
847 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
848 |
if not is_file_exists:
|
849 |
+
mind_map = generate_mind_map(df_string, LLM_model)
|
850 |
mind_map_json = {"mind_map": str(mind_map)}
|
851 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
852 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
|
|
867 |
# 检查檔案是否存在
|
868 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
869 |
if not exists:
|
870 |
+
mind_map = generate_mind_map(df_string, LLM_model)
|
871 |
mind_map_json = {"mind_map": str(mind_map)}
|
872 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
873 |
upload_content_directly(service, file_name, folder_id, mind_map_text)
|
|
|
880 |
|
881 |
return mind_map_json
|
882 |
|
883 |
+
def generate_mind_map(df_string, LLM_model=None):
|
884 |
print("===generate_mind_map===")
|
885 |
segments = split_data(df_string, word_base=100000)
|
886 |
all_content = []
|
|
|
892 |
注意:不需要前後文敘述,直接給出 markdown 文本即可
|
893 |
這對我很重要
|
894 |
"""
|
895 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
896 |
all_content.append(content + "\n")
|
897 |
|
898 |
# 將所有生成的閱讀理解段落合併成一個完整的文章
|
|
|
911 |
"""
|
912 |
return mind_map_html
|
913 |
|
914 |
+
def get_video_id_summary(video_id, df_string, source, LLM_model=None):
|
915 |
if source == "gcs":
|
916 |
print("===get_video_id_summary on gcs===")
|
|
|
917 |
bucket_name = 'video_ai_assistant'
|
918 |
file_name = f'{video_id}_summary_markdown.json'
|
919 |
summary_file_blob_name = f"{video_id}/{file_name}"
|
|
|
921 |
is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
|
922 |
if not is_summary_file_exists:
|
923 |
meta_data = get_meta_data(video_id)
|
924 |
+
summary = generate_summarise(df_string, meta_data, LLM_model)
|
925 |
summary_json = {"summary": str(summary)}
|
926 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
927 |
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
|
|
943 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
944 |
if not exists:
|
945 |
meta_data = get_meta_data(video_id)
|
946 |
+
summary = generate_summarise(df_string, meta_data, LLM_model)
|
947 |
summary_json = {"summary": str(summary)}
|
948 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
949 |
|
|
|
964 |
|
965 |
return summary_json
|
966 |
|
967 |
+
def generate_summarise(df_string, metadata=None, LLM_model=None):
|
968 |
print("===generate_summarise===")
|
969 |
# 使用 OpenAI 生成基于上传数据的问题
|
970 |
if metadata:
|
|
|
1012 |
## ❓ 延伸小問題
|
1013 |
- (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
1014 |
"""
|
1015 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
1016 |
all_content.append(content + "\n")
|
1017 |
|
1018 |
if len(all_content) > 1:
|
|
|
1051 |
## ❓ 延伸小問題
|
1052 |
- ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
|
1053 |
"""
|
1054 |
+
final_content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
1055 |
else:
|
1056 |
final_content = all_content[0]
|
1057 |
|
1058 |
return final_content
|
1059 |
|
1060 |
+
def get_questions(video_id, df_string, source="gcs", LLM_model=None):
|
1061 |
if source == "gcs":
|
1062 |
# 去 gcs 確認是有有 video_id_questions.json
|
1063 |
print("===get_questions on gcs===")
|
|
|
1068 |
# 检查檔案是否存在
|
1069 |
is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1070 |
if not is_questions_exists:
|
1071 |
+
questions = generate_questions(df_string, LLM_model)
|
1072 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
1073 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
1074 |
print("questions已上傳到GCS")
|
|
|
1089 |
# 检查檔案是否存在
|
1090 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
1091 |
if not exists:
|
1092 |
+
questions = generate_questions(df_string, LLM_model)
|
1093 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
1094 |
upload_content_directly(service, file_name, folder_id, questions_text)
|
1095 |
print("questions已上傳到Google Drive")
|
|
|
1109 |
print("=====get_questions=====")
|
1110 |
return q1, q2, q3
|
1111 |
|
1112 |
+
def generate_questions(df_string, LLM_model=None):
|
1113 |
print("===generate_questions===")
|
1114 |
# 使用 OpenAI 生成基于上传数据的问题
|
1115 |
if isinstance(df_string, str):
|
|
|
1132 |
[q1的敘述text, q2的敘述text, q3的敘述text]
|
1133 |
}}
|
1134 |
"""
|
1135 |
+
response_format = { "type": "json_object" }
|
1136 |
+
questions = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
1137 |
+
questions_list = json.loads(questions)["questions"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1138 |
print("=====json_response=====")
|
1139 |
+
print(questions_list)
|
1140 |
print("=====json_response=====")
|
1141 |
|
1142 |
+
return questions_list
|
1143 |
|
1144 |
+
def get_questions_answers(video_id, df_string, source="gcs", LLM_model=None):
|
1145 |
if source == "gcs":
|
1146 |
try:
|
1147 |
print("===get_questions_answers on gcs===")
|
|
|
1148 |
bucket_name = 'video_ai_assistant'
|
1149 |
file_name = f'{video_id}_questions_answers.json'
|
1150 |
blob_name = f"{video_id}/{file_name}"
|
1151 |
# 检查檔案是否存在
|
1152 |
is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1153 |
if not is_questions_answers_exists:
|
1154 |
+
questions_answers = generate_questions_answers(df_string, LLM_model)
|
1155 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
1156 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
1157 |
print("questions_answers已上傳到GCS")
|
|
|
1162 |
questions_answers = json.loads(questions_answers_text)
|
1163 |
except Exception as e:
|
1164 |
print(f"Error getting questions_answers: {str(e)}")
|
1165 |
+
questions_list = get_questions(video_id, df_string, source, LLM_model)
|
1166 |
+
questions_answers = [{"question": q, "answer": ""} for q in questions_list]
|
1167 |
|
1168 |
return questions_answers
|
1169 |
|
1170 |
+
def generate_questions_answers(df_string, LLM_model=None):
|
1171 |
print("===generate_questions_answers===")
|
1172 |
segments = split_data(df_string, word_base=100000)
|
1173 |
all_content = []
|
|
|
1193 |
}}
|
1194 |
"""
|
1195 |
response_format = { "type": "json_object" }
|
1196 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
1197 |
content_json = json.loads(content)["questions_answers"]
|
1198 |
all_content += content_json
|
1199 |
|
|
|
1217 |
print("=====get_questions=====")
|
1218 |
return q1, q2, q3
|
1219 |
|
1220 |
+
def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model=None):
|
1221 |
if source == "gcs":
|
1222 |
print("===get_key_moments on gcs===")
|
1223 |
gcs_client = GCS_CLIENT
|
|
|
1227 |
# 检查檔案是否存在
|
1228 |
is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1229 |
if not is_key_moments_exists:
|
1230 |
+
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
1231 |
key_moments_json = {"key_moments": key_moments}
|
1232 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1233 |
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
|
|
1243 |
for key_moment in key_moments_json["key_moments"]:
|
1244 |
if "keywords" not in key_moment:
|
1245 |
transcript = key_moment["transcript"]
|
1246 |
+
key_moment["keywords"] = generate_key_moments_keywords(transcript, LLM_model)
|
1247 |
print("===keywords===")
|
1248 |
print(key_moment["keywords"])
|
1249 |
print("===keywords===")
|
|
|
1264 |
# 检查檔案是否存在
|
1265 |
exists, file_id = check_file_exists(service, folder_id, file_name)
|
1266 |
if not exists:
|
1267 |
+
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
1268 |
key_moments_json = {"key_moments": key_moments}
|
1269 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1270 |
upload_content_directly(service, file_name, folder_id, key_moments_text)
|
|
|
1277 |
|
1278 |
return key_moments_json
|
1279 |
|
1280 |
+
def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model=None):
|
1281 |
print("===generate_key_moments===")
|
1282 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
1283 |
all_content = []
|
|
|
1304 |
}}
|
1305 |
"""
|
1306 |
response_format = { "type": "json_object" }
|
1307 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
|
1308 |
key_moments = json.loads(content)["key_moments"]
|
1309 |
|
1310 |
# "transcript": get text from formatted_simple_transcript
|
|
|
1332 |
|
1333 |
return all_content
|
1334 |
|
1335 |
+
def generate_key_moments_keywords(transcript, LLM_model=None):
|
1336 |
print("===generate_key_moments_keywords===")
|
1337 |
segments = split_data(transcript, word_base=100000)
|
1338 |
all_content = []
|
|
|
1345 |
不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
|
1346 |
transcript:{segment}
|
1347 |
"""
|
1348 |
+
content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
|
1349 |
keywords = content.strip().split(",")
|
1350 |
all_content += keywords
|
1351 |
|
|
|
1626 |
|
1627 |
def update_LLM_content(video_id, new_content, kind):
|
1628 |
print(f"===upfdate kind on gcs===")
|
|
|
1629 |
bucket_name = 'video_ai_assistant'
|
1630 |
file_name = f'{video_id}_{kind}.json'
|
1631 |
blob_name = f"{video_id}/{file_name}"
|
|
|
1699 |
print(f"{kind} 已更新到GCS")
|
1700 |
return gr.update(value=updated_content, interactive=False)
|
1701 |
|
1702 |
+
def create_LLM_content(video_id, df_string, kind, LLM_model=None):
|
1703 |
print(f"===create_{kind}===")
|
1704 |
print(f"video_id: {video_id}")
|
1705 |
|
1706 |
if kind == "reading_passage_latex":
|
1707 |
+
content = generate_reading_passage(df_string, LLM_model)
|
1708 |
update_LLM_content(video_id, content, kind)
|
1709 |
elif kind == "summary_markdown":
|
1710 |
meta_data = get_meta_data(video_id)
|
1711 |
+
content = generate_summarise(df_string, meta_data, LLM_model)
|
1712 |
update_LLM_content(video_id, content, kind)
|
1713 |
elif kind == "mind_map":
|
1714 |
content = generate_mind_map(df_string)
|
|
|
1720 |
transcript = df_string
|
1721 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
1722 |
formatted_transcript = create_formatted_transcript(video_id, transcript)
|
1723 |
+
gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
|
1724 |
update_LLM_content(video_id, gen_content, kind)
|
1725 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1726 |
elif kind == "transcript":
|
|
|
1728 |
update_LLM_content(video_id, gen_content, kind)
|
1729 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1730 |
elif kind == "questions":
|
1731 |
+
gen_content = generate_questions(df_string, LLM_model)
|
1732 |
update_LLM_content(video_id, gen_content, kind)
|
1733 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1734 |
elif kind == "questions_answers":
|
|
|
1737 |
else:
|
1738 |
transcript = df_string
|
1739 |
formatted_simple_transcript = create_formatted_simple_transcript(transcript)
|
1740 |
+
gen_content = generate_questions_answers(formatted_simple_transcript, LLM_model)
|
1741 |
update_LLM_content(video_id, gen_content, kind)
|
1742 |
content = json.dumps(gen_content, ensure_ascii=False, indent=2)
|
1743 |
|
|
|
2567 |
chatbot_ai = gr.update(visible=False)
|
2568 |
ai_chatbot_params = gr.update(visible=True)
|
2569 |
|
2570 |
+
is_env_prod = gr.update(value=False)
|
2571 |
+
|
2572 |
# if youtube_link in query_params
|
2573 |
if "youtube_id" in request.query_params:
|
2574 |
youtube_id = request.query_params["youtube_id"]
|
|
|
2587 |
lesson_plan_accordion = gr.update(visible=False)
|
2588 |
exit_ticket_accordion = gr.update(visible=False)
|
2589 |
ai_chatbot_params = gr.update(visible=False)
|
2590 |
+
|
2591 |
+
if IS_ENV_PROD == "True":
|
2592 |
+
is_env_prod = gr.update(value=True)
|
2593 |
|
2594 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
2595 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
2596 |
password_text, youtube_link, \
|
2597 |
+
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params, \
|
2598 |
+
is_env_prod
|
2599 |
|
2600 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
2601 |
# inputs=[content_subject, content_grade, df_string_output],
|
|
|
2650 |
|
2651 |
with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
|
2652 |
with gr.Row() as admin:
|
2653 |
+
with gr.Column(scale=4):
|
2654 |
+
with gr.Row():
|
2655 |
+
password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
|
2656 |
+
youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
|
2657 |
+
video_id = gr.Textbox(label="video_id", visible=True)
|
2658 |
+
# file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
2659 |
+
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
2660 |
+
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
2661 |
+
with gr.Row():
|
2662 |
+
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
2663 |
+
LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4", visible=True, interactive=True)
|
2664 |
+
with gr.Column(scale=1):
|
2665 |
+
with gr.Row():
|
2666 |
+
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
2667 |
with gr.Row() as data_state:
|
2668 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2669 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
|
3136 |
)
|
3137 |
|
3138 |
# 当输入 YouTube 链接时触发
|
3139 |
+
process_youtube_link_inputs = [password, youtube_link, LLM_model]
|
3140 |
process_youtube_link_outputs = [
|
3141 |
video_id,
|
3142 |
questions_answers_json,
|
|
|
3217 |
{
|
3218 |
'button': transcript_create_button,
|
3219 |
'action': create_LLM_content,
|
3220 |
+
'inputs': [video_id, df_string_output, transcript_kind, LLM_model],
|
3221 |
'outputs': [df_string_output]
|
3222 |
},
|
3223 |
{
|
|
|
3248 |
{
|
3249 |
'button': reading_passage_create_button,
|
3250 |
'action': create_LLM_content,
|
3251 |
+
'inputs': [video_id, df_string_output, reading_passage_kind, LLM_model],
|
3252 |
'outputs': [reading_passage_text]
|
3253 |
},
|
3254 |
{
|
|
|
3279 |
{
|
3280 |
'button': summary_create_button,
|
3281 |
'action': create_LLM_content,
|
3282 |
+
'inputs': [video_id, df_string_output, summary_kind, LLM_model],
|
3283 |
'outputs': [summary_text]
|
3284 |
},
|
3285 |
{
|
|
|
3310 |
{
|
3311 |
'button': key_moments_create_button,
|
3312 |
'action': create_LLM_content,
|
3313 |
+
'inputs': [video_id, df_string_output, key_moments_kind, LLM_model],
|
3314 |
'outputs': [key_moments]
|
3315 |
},
|
3316 |
{
|
|
|
3341 |
{
|
3342 |
'button': questions_create_button,
|
3343 |
'action': create_LLM_content,
|
3344 |
+
'inputs': [video_id, df_string_output, questions_kind, LLM_model],
|
3345 |
'outputs': [questions_json]
|
3346 |
},
|
3347 |
{
|
|
|
3372 |
{
|
3373 |
'button': questions_answers_create_button,
|
3374 |
'action': create_LLM_content,
|
3375 |
+
'inputs': [video_id, df_string_output, questions_answers_kind, LLM_model],
|
3376 |
'outputs': [questions_answers_json]
|
3377 |
},
|
3378 |
{
|
|
|
3403 |
{
|
3404 |
'button': worksheet_create_button,
|
3405 |
'action': create_LLM_content,
|
3406 |
+
'inputs': [video_id, df_string_output, worksheet_kind, LLM_model],
|
3407 |
'outputs': [worksheet_json]
|
3408 |
},
|
3409 |
{
|
|
|
3532 |
chatbot_open_ai_streaming,
|
3533 |
chatbot_ai,
|
3534 |
ai_chatbot_params,
|
3535 |
+
is_env_prod,
|
3536 |
]
|
3537 |
demo.load(
|
3538 |
init_params,
|