Spaces:
Sleeping
Sleeping
update
Browse files- app.py +383 -179
- chatbot.py +27 -44
app.py
CHANGED
@@ -72,7 +72,6 @@ else:
|
|
72 |
|
73 |
TRANSCRIPTS = []
|
74 |
CURRENT_INDEX = 0
|
75 |
-
VIDEO_ID = ""
|
76 |
|
77 |
OPEN_AI_CLIENT = OpenAI(api_key=OPEN_AI_KEY)
|
78 |
GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
|
@@ -373,6 +372,9 @@ def get_transcript(video_id):
|
|
373 |
for language in languages:
|
374 |
try:
|
375 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
|
|
|
|
|
|
376 |
return transcript # 成功獲取字幕,直接返回結果
|
377 |
except NoTranscriptFound:
|
378 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
@@ -413,73 +415,33 @@ def generate_transcription(video_id):
|
|
413 |
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
414 |
chunk.export(chunk_path, format=codec_name)
|
415 |
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
|
|
|
|
|
|
434 |
|
435 |
# Remove temporary chunk files after processing
|
436 |
os.remove(chunk_path)
|
437 |
|
438 |
return transcription
|
439 |
|
440 |
-
def process_transcript_and_screenshots(video_id):
|
441 |
-
print("====process_transcript_and_screenshots====")
|
442 |
-
|
443 |
-
# Drive
|
444 |
-
service = init_drive_service()
|
445 |
-
parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
|
446 |
-
folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
|
447 |
-
|
448 |
-
# 逐字稿文件名
|
449 |
-
file_name = f'{video_id}_transcript.json'
|
450 |
-
# 检查逐字稿是否存在
|
451 |
-
exists, file_id = check_file_exists(service, folder_id, file_name)
|
452 |
-
if not exists:
|
453 |
-
# 从YouTube获取逐字稿并上传
|
454 |
-
transcript = get_transcript(video_id)
|
455 |
-
if transcript:
|
456 |
-
print("成功獲取字幕")
|
457 |
-
else:
|
458 |
-
print("沒有找到字幕")
|
459 |
-
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
460 |
-
file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
|
461 |
-
print("逐字稿已上传到Google Drive")
|
462 |
-
else:
|
463 |
-
# 逐字稿已存在,下载逐字稿内容
|
464 |
-
print("逐字稿已存在于Google Drive中")
|
465 |
-
transcript_text = download_file_as_string(service, file_id)
|
466 |
-
transcript = json.loads(transcript_text)
|
467 |
-
|
468 |
-
# 处理逐字稿中的每个条目,检查并上传截图
|
469 |
-
for entry in transcript:
|
470 |
-
if 'img_file_id' not in entry:
|
471 |
-
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
472 |
-
img_file_id = upload_img_directly(service, f"{video_id}_{entry['start']}.jpg", folder_id, screenshot_path)
|
473 |
-
set_public_permission(service, img_file_id)
|
474 |
-
entry['img_file_id'] = img_file_id
|
475 |
-
print(f"截图已上传到Google Drive: {img_file_id}")
|
476 |
-
|
477 |
-
# 更新逐字稿文件
|
478 |
-
updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
479 |
-
update_file_on_drive(service, file_id, updated_transcript_text)
|
480 |
-
print("逐字稿已更新,包括截图链接")
|
481 |
-
return transcript
|
482 |
-
|
483 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
484 |
print("====process_transcript_and_screenshots_on_gcs====")
|
485 |
# GCS
|
@@ -492,6 +454,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
492 |
is_new_transcript = False
|
493 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
494 |
if not is_transcript_exists:
|
|
|
495 |
# 从YouTube获取逐字稿并上传
|
496 |
try:
|
497 |
transcript = get_transcript(video_id)
|
@@ -567,11 +530,8 @@ def process_youtube_link(password, link):
|
|
567 |
# 使用 YouTube API 获取逐字稿
|
568 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
569 |
video_id = extract_youtube_id(link)
|
570 |
-
global VIDEO_ID
|
571 |
-
VIDEO_ID = video_id
|
572 |
|
573 |
try:
|
574 |
-
# transcript = process_transcript_and_screenshots(video_id)
|
575 |
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
576 |
except Exception as e:
|
577 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
@@ -579,17 +539,14 @@ def process_youtube_link(password, link):
|
|
579 |
print(error_msg)
|
580 |
raise gr.Error(error_msg)
|
581 |
|
|
|
582 |
formatted_transcript = []
|
583 |
formatted_simple_transcript =[]
|
584 |
-
screenshot_paths = []
|
585 |
for entry in transcript:
|
586 |
start_time = format_seconds_to_time(entry['start'])
|
587 |
end_time = format_seconds_to_time(entry['start'] + entry['duration'])
|
588 |
embed_url = get_embedded_youtube_link(video_id, entry['start'])
|
589 |
img_file_id = entry['img_file_id']
|
590 |
-
# img_file_id =""
|
591 |
-
# 先取消 Google Drive 的图片
|
592 |
-
# screenshot_path = f"https://lh3.googleusercontent.com/d/{img_file_id}=s4000"
|
593 |
screenshot_path = img_file_id
|
594 |
line = {
|
595 |
"start_time": start_time,
|
@@ -606,7 +563,6 @@ def process_youtube_link(password, link):
|
|
606 |
"text": entry['text']
|
607 |
}
|
608 |
formatted_simple_transcript.append(simple_line)
|
609 |
-
screenshot_paths.append(screenshot_path)
|
610 |
|
611 |
global TRANSCRIPTS
|
612 |
TRANSCRIPTS = formatted_transcript
|
@@ -614,11 +570,12 @@ def process_youtube_link(password, link):
|
|
614 |
# 基于逐字稿生成其他所需的输出
|
615 |
source = "gcs"
|
616 |
questions = get_questions(video_id, formatted_simple_transcript, source)
|
617 |
-
|
618 |
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
|
619 |
summary = summary_json["summary"]
|
620 |
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
|
621 |
key_moments = key_moments_json["key_moments"]
|
|
|
622 |
key_moments_html = get_key_moments_html(key_moments)
|
623 |
html_content = format_transcript_to_html(formatted_transcript)
|
624 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
@@ -636,11 +593,13 @@ def process_youtube_link(password, link):
|
|
636 |
|
637 |
# 确保返回与 UI 组件预期匹配的输出
|
638 |
return video_id, \
|
|
|
639 |
questions[0] if len(questions) > 0 else "", \
|
640 |
questions[1] if len(questions) > 1 else "", \
|
641 |
questions[2] if len(questions) > 2 else "", \
|
642 |
-
|
643 |
summary, \
|
|
|
644 |
key_moments_html, \
|
645 |
mind_map, \
|
646 |
mind_map_html, \
|
@@ -652,6 +611,37 @@ def process_youtube_link(password, link):
|
|
652 |
subject, \
|
653 |
grade
|
654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
655 |
def format_transcript_to_html(formatted_transcript):
|
656 |
html_content = ""
|
657 |
for entry in formatted_transcript:
|
@@ -1088,6 +1078,22 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1088 |
print("key_moments已存在于GCS中")
|
1089 |
key_moments_text = download_blob_to_string(gcs_client, bucket_name, blob_name)
|
1090 |
key_moments_json = json.loads(key_moments_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1091 |
|
1092 |
elif source == "drive":
|
1093 |
print("===get_key_moments on drive===")
|
@@ -1117,20 +1123,21 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
1117 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
1118 |
user_content = f"""
|
1119 |
請根據 {formatted_simple_transcript} 文本,提取出重點摘要,並給出對應的時間軸
|
1120 |
-
重點摘要的「關鍵時刻」加上截圖資訊
|
1121 |
1. 小範圍切出不同段落的相對應時間軸的重點摘要,
|
1122 |
2. 每一小段最多不超過 1/5 的總內容,也就是大約 3~5段的重點(例如五~十分鐘的影片就一段大約1~2分鐘,最多三分鐘,但如果是超過十分鐘的影片,那一小段大約 2~3分鐘,以此類推)
|
1123 |
3. 注意不要遺漏任何一段時間軸的內容 從零秒開始
|
1124 |
4. 如果頭尾的情節不是重點,就併入到附近的段落,特別是打招呼或是介紹人物就是不重要的情節
|
1125 |
5. transcript 逐字稿的集合(要有合理的標點符號),要完整跟原來的一樣,不要省略
|
1126 |
以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
|
|
|
|
|
1127 |
|
1128 |
並用 JSON 格式返回 key_moments:[{{
|
1129 |
"start": "00:00",
|
1130 |
-
"end": "
|
1131 |
"text": "逐字稿的重點摘要",
|
1132 |
"transcript": "逐字稿的集合(要有合理的標點符號),要完整跟原來的一樣,不要省略",
|
1133 |
-
"
|
1134 |
}}]
|
1135 |
"""
|
1136 |
messages = [
|
@@ -1148,6 +1155,8 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
1148 |
|
1149 |
try:
|
1150 |
response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
|
|
|
|
|
1151 |
key_moments = json.loads(response.choices[0].message.content)["key_moments"]
|
1152 |
except Exception as e:
|
1153 |
error_msg = f" {video_id} 關鍵時刻錯誤: {str(e)}"
|
@@ -1168,6 +1177,28 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
|
1168 |
|
1169 |
return key_moments
|
1170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1171 |
def get_key_moments_html(key_moments):
|
1172 |
css = """
|
1173 |
<style>
|
@@ -1380,11 +1411,11 @@ def delete_LLM_content(video_id, kind):
|
|
1380 |
bucket_name = 'video_ai_assistant'
|
1381 |
file_name = f'{video_id}_{kind}.json'
|
1382 |
blob_name = f"{video_id}/{file_name}"
|
1383 |
-
# 检查
|
1384 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1385 |
if is_file_exists:
|
1386 |
delete_blob(gcs_client, bucket_name, blob_name)
|
1387 |
-
print("
|
1388 |
return gr.update(value="", interactive=False)
|
1389 |
|
1390 |
def update_LLM_content(video_id, new_content, kind):
|
@@ -1398,28 +1429,82 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1398 |
reading_passage_json = {"reading_passage": str(new_content)}
|
1399 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
1400 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, reading_passage_text)
|
|
|
1401 |
elif kind == "summary":
|
1402 |
summary_json = {"summary": str(new_content)}
|
1403 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
1404 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, summary_text)
|
|
|
1405 |
elif kind == "mind_map":
|
1406 |
mind_map_json = {"mind_map": str(new_content)}
|
1407 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
1408 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, mind_map_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1409 |
|
1410 |
print(f"{kind} 已更新到GCS")
|
1411 |
-
return gr.update(value=
|
1412 |
|
1413 |
def create_LLM_content(video_id, df_string, kind):
|
1414 |
print(f"===create_{kind}===")
|
|
|
|
|
1415 |
if kind == "reading_passage":
|
1416 |
content = generate_reading_passage(df_string)
|
|
|
1417 |
elif kind == "summary":
|
1418 |
content = generate_summarise(df_string)
|
|
|
1419 |
elif kind == "mind_map":
|
1420 |
content = generate_mind_map(df_string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1421 |
|
1422 |
-
update_LLM_content(video_id, content, kind)
|
1423 |
return gr.update(value=content, interactive=False)
|
1424 |
|
1425 |
|
@@ -1565,7 +1650,27 @@ def download_exam_result(content):
|
|
1565 |
return word_path
|
1566 |
|
1567 |
# ---- Chatbot ----
|
1568 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1569 |
verify_password(password)
|
1570 |
|
1571 |
if chat_history is not None and len(chat_history) > 10:
|
@@ -1578,18 +1683,42 @@ def chat_with_ai(ai_name, password, video_id, trascript, user_message, chat_hist
|
|
1578 |
ai_client = BEDROCK_CLIENT
|
1579 |
elif ai_name == "groq":
|
1580 |
ai_client = GROQ_CLIENT
|
|
|
|
|
|
|
|
|
1581 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1582 |
chatbot_config = {
|
1583 |
"video_id": video_id,
|
1584 |
-
"
|
|
|
1585 |
"content_subject": content_subject,
|
1586 |
"content_grade": content_grade,
|
1587 |
"jutor_chat_key": JUTOR_CHAT_KEY,
|
1588 |
"ai_name": ai_name,
|
1589 |
-
"ai_client": ai_client
|
|
|
1590 |
}
|
1591 |
-
|
1592 |
-
|
|
|
|
|
|
|
|
|
|
|
1593 |
|
1594 |
try:
|
1595 |
# 更新聊天历史
|
@@ -1606,7 +1735,7 @@ def chat_with_ai(ai_name, password, video_id, trascript, user_message, chat_hist
|
|
1606 |
print(f"Error: {e}")
|
1607 |
return "请求失败,请稍后再试!", chat_history
|
1608 |
|
1609 |
-
def chat_with_opan_ai_assistant(password, youtube_id, thread_id,
|
1610 |
verify_password(password)
|
1611 |
|
1612 |
# 先計算 user_message 是否超過 500 個字
|
@@ -1620,43 +1749,34 @@ def chat_with_opan_ai_assistant(password, youtube_id, thread_id, trascript, user
|
|
1620 |
raise gr.Error(error_msg)
|
1621 |
|
1622 |
try:
|
1623 |
-
assistant_id = "asst_kmvZLNkDUYaNkMNtZEAYxyPq"
|
|
|
|
|
1624 |
client = OPEN_AI_CLIENT
|
1625 |
# 直接安排逐字稿資料 in instructions
|
1626 |
-
|
|
|
|
|
|
|
1627 |
# 移除 embed_url, screenshot_path
|
1628 |
for entry in trascript_json:
|
1629 |
-
entry.pop('
|
1630 |
-
|
1631 |
-
|
1632 |
-
|
1633 |
-
|
1634 |
-
|
1635 |
-
|
1636 |
-
|
1637 |
-
|
1638 |
-
|
1639 |
-
|
1640 |
-
|
1641 |
-
|
1642 |
-
|
1643 |
-
|
1644 |
-
|
1645 |
-
|
1646 |
-
if socratic_mode is False,
|
1647 |
-
- 直接回答學生問題,字數在100字以內
|
1648 |
-
|
1649 |
-
rule:
|
1650 |
-
- 請一定要用繁體中文回答 zh-TW,並用台灣人的口語表達,回答時不用特別說明這是台灣人的語氣,也不用說這是「台語的說法」
|
1651 |
-
- 不用提到「逐字稿」這個詞,用「內容」代替
|
1652 |
-
- 如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
|
1653 |
-
- 或者你可以反問學生一些問題,幫助學生更好的理解資料,字數在100字以內
|
1654 |
-
- 如果學生的問題與資料文本無關,請告訴學生你「無法回答超出影片範圍的問題」,並告訴他可以怎麼問什麼樣的問題(一個就好)
|
1655 |
-
- 只要是參考逐字稿資料,請在回答的最後標註【參考資料:(分):(秒)】
|
1656 |
-
- 回答範圍一定要在逐字稿資料內,不要引用其他資料,請嚴格執行
|
1657 |
-
- 並在重複問句後給予學生鼓勵,讓學生有學習的動力
|
1658 |
-
- 請用 {content_grade} 的學生能懂的方式回答
|
1659 |
-
"""
|
1660 |
|
1661 |
# 创建线程
|
1662 |
if not thread_id:
|
@@ -1665,11 +1785,18 @@ def chat_with_opan_ai_assistant(password, youtube_id, thread_id, trascript, user
|
|
1665 |
else:
|
1666 |
thread = client.beta.threads.retrieve(thread_id)
|
1667 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1668 |
# 向线程添加用户的消息
|
1669 |
client.beta.threads.messages.create(
|
1670 |
thread_id=thread.id,
|
1671 |
role="user",
|
1672 |
-
content=user_message +
|
1673 |
)
|
1674 |
|
1675 |
# 运行助手,生成响应
|
@@ -1773,7 +1900,7 @@ def poll_run_status(run_id, thread_id, timeout=600, poll_interval=5):
|
|
1773 |
|
1774 |
return run.status
|
1775 |
|
1776 |
-
def streaming_chat_with_open_ai(user_message, chat_history, password, thread_id, trascript, content_subject, content_grade):
|
1777 |
verify_password(password)
|
1778 |
|
1779 |
print("===streaming_chat_with_open_ai===")
|
@@ -1789,50 +1916,29 @@ def streaming_chat_with_open_ai(user_message, chat_history, password, thread_id,
|
|
1789 |
error_msg = "此次對話超過上限"
|
1790 |
raise gr.Error(error_msg)
|
1791 |
|
1792 |
-
# fake data
|
1793 |
-
socratic_mode = True
|
1794 |
-
|
1795 |
try:
|
1796 |
-
assistant_id = "asst_kmvZLNkDUYaNkMNtZEAYxyPq"
|
|
|
1797 |
client = OPEN_AI_CLIENT
|
1798 |
# 直接安排逐字稿資料 in instructions
|
1799 |
-
|
1800 |
-
|
1801 |
-
|
1802 |
-
|
1803 |
-
|
1804 |
-
trascript_text = json.dumps(trascript_json, ensure_ascii=False, indent=2)
|
1805 |
# trascript_text 移除 \n, 空白
|
1806 |
trascript_text = trascript_text.replace("\n", "").replace(" ", "")
|
1807 |
|
1808 |
-
|
1809 |
-
|
1810 |
-
|
1811 |
-
|
1812 |
-
|
1813 |
-
|
1814 |
-
|
1815 |
-
|
1816 |
-
- 請用蘇格拉底式的提問方式,引導學生思考,並且給予學生一些提示
|
1817 |
-
- 一次只問一個問題,字數在100字以內
|
1818 |
-
- 不要直接給予答案,讓學生自己思考
|
1819 |
-
- 但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生自己去找答案
|
1820 |
-
|
1821 |
-
if socratic_mode is False,
|
1822 |
-
- 直接回答學生問題,字數在100字以內
|
1823 |
-
|
1824 |
-
rule:
|
1825 |
-
- 請一定要用繁體中文回答 zh-TW,並用台灣人的口語表達,回答時不用特別說明這是台灣人的語氣,也不用說這是「台語的說法」
|
1826 |
-
- 不用提到「逐字稿」這個詞,用「內容」代替
|
1827 |
-
- 如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
|
1828 |
-
- 或者你可以反問學生一些問題,幫助學生更好的理解資料,字數在100字以內
|
1829 |
-
- 如果學生的問題與資料文本無關,請告訴學生你「無法回答超出影片範圍的問題」,並告訴他可以怎麼問什麼樣的問題(一個就好)
|
1830 |
-
- 只要是參考逐字稿資料,請在回答的最後標註【參考資料:(分):(秒)】
|
1831 |
-
- 回答範圍一定要在逐字稿資料內,不要引用其他資料,請嚴格執行
|
1832 |
-
- 並在重複問句後給予學生鼓勵,讓學生有學習的動力
|
1833 |
-
- 請用 {content_grade} 的學生能懂的方式回答
|
1834 |
-
"""
|
1835 |
|
|
|
1836 |
# 创建线程
|
1837 |
if not thread_id:
|
1838 |
thread = client.beta.threads.create()
|
@@ -1933,18 +2039,22 @@ def init_params(text, request: gr.Request):
|
|
1933 |
|
1934 |
return admin, reading_passage_admin, summary_admin, see_detail, password_text, youtube_link
|
1935 |
|
1936 |
-
def update_state(content_subject, content_grade, trascript, question_1, question_2, question_3):
|
1937 |
# inputs=[content_subject, content_grade, df_string_output],
|
1938 |
# outputs=[content_subject_state, content_grade_state, trascript_state]
|
1939 |
content_subject_state = content_subject
|
1940 |
content_grade_state = content_grade
|
1941 |
-
|
|
|
|
|
|
|
1942 |
streaming_chat_thread_id_state = create_thread_id()
|
1943 |
ai_chatbot_question_1 = question_1
|
1944 |
ai_chatbot_question_2 = question_2
|
1945 |
ai_chatbot_question_3 = question_3
|
1946 |
|
1947 |
-
return content_subject_state, content_grade_state, trascript_state,
|
|
|
1948 |
ai_chatbot_question_1, ai_chatbot_question_2, ai_chatbot_question_3
|
1949 |
|
1950 |
|
@@ -2016,7 +2126,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2016 |
with gr.Row() as admin:
|
2017 |
password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
|
2018 |
youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
|
2019 |
-
video_id = gr.Textbox(label="video_id", visible=
|
2020 |
# file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
2021 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
2022 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
@@ -2025,6 +2135,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2025 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2026 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
2027 |
trascript_state = gr.State() # 使用 gr.State 存储 trascript
|
|
|
2028 |
streaming_chat_thread_id_state = gr.State() # 使用 gr.State 存储 streaming_chat_thread_id
|
2029 |
with gr.Tab("AI小精靈"):
|
2030 |
with gr.Row():
|
@@ -2042,12 +2153,12 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2042 |
btn_3 = gr.Button("問題一")
|
2043 |
gr.Markdown("### 重新生成問題")
|
2044 |
btn_create_question = gr.Button("生成其他問題", variant="primary")
|
2045 |
-
openai_chatbot_audio_input = gr.Audio(sources=["microphone"], type="filepath")
|
2046 |
with gr.Row():
|
2047 |
msg = gr.Textbox(label="訊息",scale=3)
|
2048 |
send_button = gr.Button("送出", variant="primary", scale=1)
|
2049 |
with gr.Tab("飛特音速"):
|
2050 |
-
additional_inputs = [password, streaming_chat_thread_id_state, trascript_state, content_subject_state, content_grade_state]
|
2051 |
streaming_chat = gr.ChatInterface(
|
2052 |
fn=streaming_chat_with_open_ai,
|
2053 |
additional_inputs=additional_inputs,
|
@@ -2066,11 +2177,10 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2066 |
ai_chatbot_question_1 = gr.Button("問題一")
|
2067 |
ai_chatbot_question_2 = gr.Button("問題一")
|
2068 |
ai_chatbot_question_3 = gr.Button("問題一")
|
2069 |
-
ai_chatbot_audio_input = gr.Audio(sources=["microphone"], type="filepath")
|
2070 |
with gr.Row():
|
2071 |
-
ai_msg = gr.Textbox(label="
|
2072 |
-
ai_send_button = gr.Button("
|
2073 |
-
|
2074 |
with gr.Tab("文章模式"):
|
2075 |
with gr.Row() as reading_passage_admin:
|
2076 |
reading_passage_kind = gr.Textbox(value="reading_passage", show_label=False)
|
@@ -2094,7 +2204,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2094 |
with gr.Tab("關鍵時刻"):
|
2095 |
with gr.Row():
|
2096 |
key_moments_html = gr.HTML(value="")
|
2097 |
-
|
2098 |
with gr.Tab("教學備課"):
|
2099 |
with gr.Row():
|
2100 |
content_subject = gr.Dropdown(label="選擇主題", choices=["數學", "自然", "國文", "英文", "社會","物理", "化學", "生物", "地理", "歷史", "公民"], value="", visible=False)
|
@@ -2182,8 +2291,33 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2182 |
# metacognition_content_btn = gr.Button("生成後設認知問題")
|
2183 |
|
2184 |
with gr.Accordion("See Details", open=False) as see_details:
|
2185 |
-
with gr.Tab("
|
2186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2187 |
with gr.Tab("逐字稿"):
|
2188 |
simple_html_content = gr.HTML(label="Simple Transcript")
|
2189 |
with gr.Tab("圖文"):
|
@@ -2206,7 +2340,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2206 |
# OPENAI ASSISTANT CHATBOT 模式
|
2207 |
send_button.click(
|
2208 |
chat_with_opan_ai_assistant,
|
2209 |
-
inputs=[password, video_id, thread_id,
|
2210 |
outputs=[msg, chatbot, thread_id]
|
2211 |
)
|
2212 |
openai_chatbot_audio_input.change(
|
@@ -2215,9 +2349,9 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2215 |
outputs=[msg]
|
2216 |
)
|
2217 |
# OPENAI ASSISTANT CHATBOT 連接按鈕點擊事件
|
2218 |
-
btn_1_chat_with_opan_ai_assistant_input =[password, video_id, thread_id,
|
2219 |
-
btn_2_chat_with_opan_ai_assistant_input =[password, video_id, thread_id,
|
2220 |
-
btn_3_chat_with_opan_ai_assistant_input =[password, video_id, thread_id,
|
2221 |
btn_1.click(
|
2222 |
chat_with_opan_ai_assistant,
|
2223 |
inputs=btn_1_chat_with_opan_ai_assistant_input,
|
@@ -2242,13 +2376,13 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2242 |
# ai_chatbot 模式
|
2243 |
ai_send_button.click(
|
2244 |
chat_with_ai,
|
2245 |
-
inputs=[ai_name, password, video_id,
|
2246 |
outputs=[ai_msg, ai_chatbot]
|
2247 |
)
|
2248 |
# ai_chatbot 连接按钮点击事件
|
2249 |
-
ai_chatbot_question_1_chat_with_ai_input =[ai_name, password, video_id,
|
2250 |
-
ai_chatbot_question_2_chat_with_ai_input =[ai_name, password, video_id,
|
2251 |
-
ai_chatbot_question_3_chat_with_ai_input =[ai_name, password, video_id,
|
2252 |
ai_chatbot_question_1.click(
|
2253 |
chat_with_ai,
|
2254 |
inputs=ai_chatbot_question_1_chat_with_ai_input,
|
@@ -2272,11 +2406,13 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2272 |
process_youtube_link_inputs = [password, youtube_link]
|
2273 |
process_youtube_link_outputs = [
|
2274 |
video_id,
|
|
|
2275 |
btn_1,
|
2276 |
btn_2,
|
2277 |
btn_3,
|
2278 |
df_string_output,
|
2279 |
df_summarise,
|
|
|
2280 |
key_moments_html,
|
2281 |
mind_map,
|
2282 |
mind_map_html,
|
@@ -2292,6 +2428,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2292 |
content_subject,
|
2293 |
content_grade,
|
2294 |
df_string_output,
|
|
|
2295 |
btn_1,
|
2296 |
btn_2,
|
2297 |
btn_3
|
@@ -2299,7 +2436,8 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2299 |
update_state_outputs = [
|
2300 |
content_subject_state,
|
2301 |
content_grade_state,
|
2302 |
-
trascript_state,
|
|
|
2303 |
streaming_chat_thread_id_state,
|
2304 |
ai_chatbot_question_1,
|
2305 |
ai_chatbot_question_2,
|
@@ -2374,6 +2512,72 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2374 |
outputs=[df_summarise]
|
2375 |
)
|
2376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2377 |
# 教師版
|
2378 |
worksheet_content_btn.click(
|
2379 |
get_ai_content,
|
|
|
72 |
|
73 |
TRANSCRIPTS = []
|
74 |
CURRENT_INDEX = 0
|
|
|
75 |
|
76 |
OPEN_AI_CLIENT = OpenAI(api_key=OPEN_AI_KEY)
|
77 |
GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
|
|
|
372 |
for language in languages:
|
373 |
try:
|
374 |
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
|
375 |
+
print("===transcript===")
|
376 |
+
print(transcript)
|
377 |
+
print("===transcript===")
|
378 |
return transcript # 成功獲取字幕,直接返回結果
|
379 |
except NoTranscriptFound:
|
380 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
|
|
415 |
chunk_path = f"{OUTPUT_PATH}/{video_id}_part_{i}.{codec_name}"
|
416 |
chunk.export(chunk_path, format=codec_name)
|
417 |
|
418 |
+
try:
|
419 |
+
with open(chunk_path, "rb") as chunk_file:
|
420 |
+
response = OPEN_AI_CLIENT.audio.transcriptions.create(
|
421 |
+
model="whisper-1",
|
422 |
+
file=chunk_file,
|
423 |
+
response_format="verbose_json",
|
424 |
+
timestamp_granularities=["segment"],
|
425 |
+
prompt="Transcribe the following audio file. if content is chinese, please using 'language: zh-TW' ",
|
426 |
+
)
|
427 |
+
|
428 |
+
# Adjusting the timestamps for the chunk based on its position in the full audio
|
429 |
+
adjusted_segments = [{
|
430 |
+
'text': segment['text'],
|
431 |
+
'start': math.ceil(segment['start'] + start_time / 1000.0), # Converting milliseconds to seconds
|
432 |
+
'end': math.ceil(segment['end'] + start_time / 1000.0),
|
433 |
+
'duration': math.ceil(segment['end'] - segment['start'])
|
434 |
+
} for segment in response.segments]
|
435 |
+
|
436 |
+
transcription.extend(adjusted_segments)
|
437 |
+
except Exception as e:
|
438 |
+
print(f"Error processing chunk {i}: {str(e)}")
|
439 |
|
440 |
# Remove temporary chunk files after processing
|
441 |
os.remove(chunk_path)
|
442 |
|
443 |
return transcription
|
444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
446 |
print("====process_transcript_and_screenshots_on_gcs====")
|
447 |
# GCS
|
|
|
454 |
is_new_transcript = False
|
455 |
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
456 |
if not is_transcript_exists:
|
457 |
+
print("逐字稿文件不存在于GCS中,重新建立")
|
458 |
# 从YouTube获取逐字稿并上传
|
459 |
try:
|
460 |
transcript = get_transcript(video_id)
|
|
|
530 |
# 使用 YouTube API 获取逐字稿
|
531 |
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
532 |
video_id = extract_youtube_id(link)
|
|
|
|
|
533 |
|
534 |
try:
|
|
|
535 |
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
536 |
except Exception as e:
|
537 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
|
|
539 |
print(error_msg)
|
540 |
raise gr.Error(error_msg)
|
541 |
|
542 |
+
original_transcript = json.dumps(transcript, ensure_ascii=False, indent=2)
|
543 |
formatted_transcript = []
|
544 |
formatted_simple_transcript =[]
|
|
|
545 |
for entry in transcript:
|
546 |
start_time = format_seconds_to_time(entry['start'])
|
547 |
end_time = format_seconds_to_time(entry['start'] + entry['duration'])
|
548 |
embed_url = get_embedded_youtube_link(video_id, entry['start'])
|
549 |
img_file_id = entry['img_file_id']
|
|
|
|
|
|
|
550 |
screenshot_path = img_file_id
|
551 |
line = {
|
552 |
"start_time": start_time,
|
|
|
563 |
"text": entry['text']
|
564 |
}
|
565 |
formatted_simple_transcript.append(simple_line)
|
|
|
566 |
|
567 |
global TRANSCRIPTS
|
568 |
TRANSCRIPTS = formatted_transcript
|
|
|
570 |
# 基于逐字稿生成其他所需的输出
|
571 |
source = "gcs"
|
572 |
questions = get_questions(video_id, formatted_simple_transcript, source)
|
573 |
+
questions_json = json.dumps(questions, ensure_ascii=False, indent=2)
|
574 |
summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
|
575 |
summary = summary_json["summary"]
|
576 |
key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
|
577 |
key_moments = key_moments_json["key_moments"]
|
578 |
+
key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
|
579 |
key_moments_html = get_key_moments_html(key_moments)
|
580 |
html_content = format_transcript_to_html(formatted_transcript)
|
581 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
|
|
593 |
|
594 |
# 确保返回与 UI 组件预期匹配的输出
|
595 |
return video_id, \
|
596 |
+
questions_json, \
|
597 |
questions[0] if len(questions) > 0 else "", \
|
598 |
questions[1] if len(questions) > 1 else "", \
|
599 |
questions[2] if len(questions) > 2 else "", \
|
600 |
+
original_transcript, \
|
601 |
summary, \
|
602 |
+
key_moments_text, \
|
603 |
key_moments_html, \
|
604 |
mind_map, \
|
605 |
mind_map_html, \
|
|
|
611 |
subject, \
|
612 |
grade
|
613 |
|
614 |
+
def create_formatted_simple_transcript(transcript):
    """Convert raw transcript entries into the simplified display format.

    Args:
        transcript: Iterable of entries, each indexed by ``'start'``,
            ``'duration'`` and ``'text'``. ``'start'`` and ``'duration'``
            are added together, so they must be numeric.

    Returns:
        A list with one dict per entry, holding ``"start_time"``,
        ``"end_time"`` (both produced by ``format_seconds_to_time``) and
        the entry's ``"text"`` unchanged.
    """
    # The end time is derived as start + duration; the entries themselves
    # are only read via 'start', 'duration' and 'text' here.
    return [
        {
            "start_time": format_seconds_to_time(item['start']),
            "end_time": format_seconds_to_time(item['start'] + item['duration']),
            "text": item['text'],
        }
        for item in transcript
    ]
|
626 |
+
|
627 |
+
def create_formatted_transcript(video_id, transcript):
    """Convert raw transcript entries into the full display format.

    Args:
        video_id: Video identifier passed through to
            ``get_embedded_youtube_link`` for every entry.
        transcript: Iterable of entries, each indexed by ``'start'``,
            ``'duration'``, ``'text'`` and ``'img_file_id'``.

    Returns:
        A list with one dict per entry, holding ``"start_time"``,
        ``"end_time"``, ``"text"``, ``"embed_url"`` and
        ``"screenshot_path"``.
    """

    def _to_line(item):
        # Keep the lookups in this order so a malformed entry fails on the
        # same key as it always has.
        begin = format_seconds_to_time(item['start'])
        finish = format_seconds_to_time(item['start'] + item['duration'])
        link = get_embedded_youtube_link(video_id, item['start'])
        # The stored image id is used directly as the screenshot path.
        shot = item['img_file_id']
        return {
            "start_time": begin,
            "end_time": finish,
            "text": item['text'],
            "embed_url": link,
            "screenshot_path": shot,
        }

    return [_to_line(item) for item in transcript]
|
644 |
+
|
645 |
def format_transcript_to_html(formatted_transcript):
|
646 |
html_content = ""
|
647 |
for entry in formatted_transcript:
|
|
|
1078 |
print("key_moments已存在于GCS中")
|
1079 |
key_moments_text = download_blob_to_string(gcs_client, bucket_name, blob_name)
|
1080 |
key_moments_json = json.loads(key_moments_text)
|
1081 |
+
# 檢查 key_moments 是否有 keywords
|
1082 |
+
print("===檢查 key_moments 是否有 keywords===")
|
1083 |
+
has_keywords_added = False
|
1084 |
+
for key_moment in key_moments_json["key_moments"]:
|
1085 |
+
if "keywords" not in key_moment:
|
1086 |
+
transcript = key_moment["transcript"]
|
1087 |
+
key_moment["keywords"] = generate_key_moments_keywords(transcript)
|
1088 |
+
print("===keywords===")
|
1089 |
+
print(key_moment["keywords"])
|
1090 |
+
print("===keywords===")
|
1091 |
+
has_keywords_added = True
|
1092 |
+
if has_keywords_added:
|
1093 |
+
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1094 |
+
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, key_moments_text)
|
1095 |
+
key_moments_text = download_blob_to_string(gcs_client, bucket_name, blob_name)
|
1096 |
+
key_moments_json = json.loads(key_moments_text)
|
1097 |
|
1098 |
elif source == "drive":
|
1099 |
print("===get_key_moments on drive===")
|
|
|
1123 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
1124 |
user_content = f"""
|
1125 |
請根據 {formatted_simple_transcript} 文本,提取出重點摘要,並給出對應的時間軸
|
|
|
1126 |
1. 小範圍切出不同段落的相對應時間軸的重點摘要,
|
1127 |
2. 每一小段最多不超過 1/5 的總內容,也就是大約 3~5段的重點(例如五~十分鐘的影片就一段大約1~2分鐘,最多三分鐘,但如果是超過十分鐘的影片,那一小段大約 2~3分鐘,以此類推)
|
1128 |
3. 注意不要遺漏任何一段時間軸的內容 從零秒開始
|
1129 |
4. 如果頭尾的情節不是重點,就併入到附近的段落,特別是打招呼或是介紹人物就是不重要的情節
|
1130 |
5. transcript 逐字稿的集合(要有合理的標點符號),要完整跟原來的一樣,不要省略
|
1131 |
以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
|
1132 |
+
6. 關鍵字從transcript extract to keyword,保留專家名字、專業術語、年份、數字、期刊名稱、地名、數學公式
|
1133 |
+
7. text, transcript, keywords please use or transfer zh-TW, it's very important
|
1134 |
|
1135 |
並用 JSON 格式返回 key_moments:[{{
|
1136 |
"start": "00:00",
|
1137 |
+
"end": "01:00",
|
1138 |
"text": "逐字稿的重點摘要",
|
1139 |
"transcript": "逐字稿的集合(要有合理的標點符號),要完整跟原來的一樣,不要省略",
|
1140 |
+
"keywords": ["關鍵字", "關鍵字"]
|
1141 |
}}]
|
1142 |
"""
|
1143 |
messages = [
|
|
|
1155 |
|
1156 |
try:
|
1157 |
response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
|
1158 |
+
print("===response===")
|
1159 |
+
print(dict(response))
|
1160 |
key_moments = json.loads(response.choices[0].message.content)["key_moments"]
|
1161 |
except Exception as e:
|
1162 |
error_msg = f" {video_id} 關鍵時刻錯誤: {str(e)}"
|
|
|
1177 |
|
1178 |
return key_moments
|
1179 |
|
1180 |
+
def generate_key_moments_keywords(transcript):
    """Ask the OpenAI chat model to extract keywords from a transcript segment.

    Args:
        transcript: Text of the key-moment transcript; it is interpolated
            into the user prompt as-is.

    Returns:
        A list of keyword strings with surrounding whitespace removed and
        empty entries dropped. The list is empty when the model returns no
        text.
    """
    system_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請根據以下文本提取關鍵字"
    user_content = f"""transcript extract to keyword
保留專家名字、專業術語、年份、數字、期刊名稱、地名、數學公式、數學表示式、物理化學符號,
不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
transcript:{transcript}
"""
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 100,
    }

    response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
    # The message content can be missing; treat that as "no keywords".
    raw_keywords = response.choices[0].message.content or ""

    # The prompt asks for ", " separators, but zh-TW replies frequently use the
    # full-width comma or the enumeration comma instead. Normalise those to the
    # requested separator so they do not collapse into one oversized keyword.
    # A bare ASCII comma without a following space is deliberately left alone
    # so values such as "1,000" stay intact.
    normalized = raw_keywords.replace(",", ", ").replace("、", ", ")
    keywords = [part.strip() for part in normalized.split(", ") if part.strip()]

    return keywords
|
1201 |
+
|
1202 |
def get_key_moments_html(key_moments):
|
1203 |
css = """
|
1204 |
<style>
|
|
|
1411 |
bucket_name = 'video_ai_assistant'
|
1412 |
file_name = f'{video_id}_{kind}.json'
|
1413 |
blob_name = f"{video_id}/{file_name}"
|
1414 |
+
# 检查 file 是否存在
|
1415 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1416 |
if is_file_exists:
|
1417 |
delete_blob(gcs_client, bucket_name, blob_name)
|
1418 |
+
print(f"{file_name}已从GCS中删除")
|
1419 |
return gr.update(value="", interactive=False)
|
1420 |
|
1421 |
def update_LLM_content(video_id, new_content, kind):
|
|
|
1429 |
reading_passage_json = {"reading_passage": str(new_content)}
|
1430 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
1431 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, reading_passage_text)
|
1432 |
+
updated_content = reading_passage_text
|
1433 |
elif kind == "summary":
|
1434 |
summary_json = {"summary": str(new_content)}
|
1435 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
1436 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, summary_text)
|
1437 |
+
updated_content = summary_text
|
1438 |
elif kind == "mind_map":
|
1439 |
mind_map_json = {"mind_map": str(new_content)}
|
1440 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
1441 |
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, mind_map_text)
|
1442 |
+
updated_content = mind_map_text
|
1443 |
+
elif kind == "key_moments":
|
1444 |
+
# from update_LLM_btn -> new_content is a string
|
1445 |
+
# create_LLM_content -> new_content is a list
|
1446 |
+
if isinstance(new_content, str):
|
1447 |
+
key_moments_list = json.loads(new_content)
|
1448 |
+
else:
|
1449 |
+
key_moments_list = new_content
|
1450 |
+
key_moments_json = {"key_moments": key_moments_list}
|
1451 |
+
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1452 |
+
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, key_moments_text)
|
1453 |
+
updated_content = key_moments_text
|
1454 |
+
elif kind == "transcript":
|
1455 |
+
if isinstance(new_content, str):
|
1456 |
+
transcript_json = json.loads(new_content)
|
1457 |
+
else:
|
1458 |
+
transcript_json = new_content
|
1459 |
+
transcript_text = json.dumps(transcript_json, ensure_ascii=False, indent=2)
|
1460 |
+
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, transcript_text)
|
1461 |
+
updated_content = transcript_text
|
1462 |
+
elif kind == "questions":
|
1463 |
+
# from update_LLM_btn -> new_content is a string
|
1464 |
+
# create_LLM_content -> new_content is a list
|
1465 |
+
if isinstance(new_content, str):
|
1466 |
+
questions_json = json.loads(new_content)
|
1467 |
+
else:
|
1468 |
+
questions_json = new_content
|
1469 |
+
questions_text = json.dumps(questions_json, ensure_ascii=False, indent=2)
|
1470 |
+
upload_file_to_gcs_with_json_string(gcs_client, bucket_name, blob_name, questions_text)
|
1471 |
+
updated_content = questions_text
|
1472 |
|
1473 |
print(f"{kind} 已更新到GCS")
|
1474 |
+
return gr.update(value=updated_content, interactive=False)
|
1475 |
|
1476 |
def create_LLM_content(video_id, df_string, kind):
    """Generate one kind of LLM content, persist it, and return a UI update.

    Args:
        video_id: Identifier of the video the content belongs to.
        df_string: Transcript data used as generator input. For
            ``"key_moments"`` it may be a JSON string or already-decoded data.
            It is not used for ``"transcript"``.
        kind: One of ``"reading_passage"``, ``"summary"``, ``"mind_map"``,
            ``"key_moments"``, ``"transcript"`` or ``"questions"``.

    Returns:
        ``gr.update`` carrying the generated content with editing disabled.
        Structured kinds (key moments, transcript, questions) are shown as
        pretty-printed JSON text.

    Raises:
        gr.Error: If ``kind`` is not one of the supported values. Previously
            this case failed with ``UnboundLocalError`` at the return
            statement.
    """
    print(f"===create_{kind}===")
    print(f"video_id: {video_id}")

    if kind == "reading_passage":
        content = generate_reading_passage(df_string)
    elif kind == "summary":
        content = generate_summarise(df_string)
    elif kind == "mind_map":
        content = generate_mind_map(df_string)
    elif kind == "key_moments":
        if isinstance(df_string, str):
            transcript = json.loads(df_string)
        else:
            transcript = df_string
        formatted_simple_transcript = create_formatted_simple_transcript(transcript)
        formatted_transcript = create_formatted_transcript(video_id, transcript)
        content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
    elif kind == "transcript":
        content = process_transcript_and_screenshots_on_gcs(video_id)
    elif kind == "questions":
        content = generate_questions(df_string)
    else:
        # `kind` comes from an editable textbox, so an unexpected value is
        # possible; fail with a readable message instead of a NameError-style
        # crash further down.
        raise gr.Error(f"不支援的內容類型: {kind}")

    # Persist the raw generator result first; only the value shown in the UI
    # is serialised afterwards.
    update_LLM_content(video_id, content, kind)

    if kind in ("key_moments", "transcript", "questions"):
        content = json.dumps(content, ensure_ascii=False, indent=2)

    return gr.update(value=content, interactive=False)
|
1509 |
|
1510 |
|
|
|
1650 |
return word_path
|
1651 |
|
1652 |
# ---- Chatbot ----
|
1653 |
+
def get_instructions(content_subject, content_grade, key_moments):
    """Build the system instructions shared by the chatbot back ends.

    Args:
        content_subject: Subject name inserted into the teacher role.
        content_grade: Grade inserted into the student role and language level.
        key_moments: Text used as the ``context`` of the prompt.

    Returns:
        The instruction text, starting and ending with a newline.
    """
    prompt_lines = [
        f"subject: {content_subject}",
        f"grade: {content_grade}",
        f"context: {key_moments}",
        f"Assistant Role: you are a {content_subject} teacher",
        f"User Role: {content_grade} th-grade student.",
        "Method: Socratic style, guide thinking, no direct answers. this is very important, please be seriously following.",
        f"Language: Traditional Chinese ZH-TW (it's very important), suitable for {content_grade} th-grade level.",
        "Response:",
        "- Single question, under 100 characters",
        "- include math symbols (use LaTeX $ to cover before and after)",
        "- hint with video timestamp which format 【參考:00:00:00】.",
        "- Sometimes encourage user by Taiwanese style with relaxing atmosphere.",
        "- if user ask questions not include in context,",
        "- just tell them to ask the question in context and give them example question.",
        "Restrictions: Answer within video content, no external references",
    ]
    # Leading and trailing newlines mirror the triple-quoted layout of the
    # original template.
    return "\n" + "\n".join(prompt_lines) + "\n"
|
1672 |
+
|
1673 |
+
def chat_with_ai(ai_name, password, video_id, trascript_state, key_moments, user_message, chat_history, content_subject, content_grade, socratic_mode=False):
|
1674 |
verify_password(password)
|
1675 |
|
1676 |
if chat_history is not None and len(chat_history) > 10:
|
|
|
1683 |
ai_client = BEDROCK_CLIENT
|
1684 |
elif ai_name == "groq":
|
1685 |
ai_client = GROQ_CLIENT
|
1686 |
+
if isinstance(trascript_state, str):
|
1687 |
+
simple_transcript = json.loads(trascript_state)
|
1688 |
+
else:
|
1689 |
+
simple_transcript = trascript_state
|
1690 |
|
1691 |
+
if isinstance(key_moments, str):
|
1692 |
+
key_moments_json = json.loads(key_moments)
|
1693 |
+
else:
|
1694 |
+
key_moments_json = key_moments
|
1695 |
+
# key_moments_json remove images
|
1696 |
+
for moment in key_moments_json:
|
1697 |
+
moment.pop('images', None)
|
1698 |
+
moment.pop('end', None)
|
1699 |
+
moment.pop('text', None)
|
1700 |
+
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False)
|
1701 |
+
|
1702 |
+
instructions = get_instructions(content_subject, content_grade, key_moments_text)
|
1703 |
+
|
1704 |
chatbot_config = {
|
1705 |
"video_id": video_id,
|
1706 |
+
"transcript": simple_transcript,
|
1707 |
+
"key_moments": key_moments,
|
1708 |
"content_subject": content_subject,
|
1709 |
"content_grade": content_grade,
|
1710 |
"jutor_chat_key": JUTOR_CHAT_KEY,
|
1711 |
"ai_name": ai_name,
|
1712 |
+
"ai_client": ai_client,
|
1713 |
+
"instructions": instructions
|
1714 |
}
|
1715 |
+
|
1716 |
+
try:
|
1717 |
+
chatbot = Chatbot(chatbot_config)
|
1718 |
+
response_completion = chatbot.chat(user_message, chat_history, socratic_mode, ai_name)
|
1719 |
+
except Exception as e:
|
1720 |
+
print(f"Error: {e}")
|
1721 |
+
response_completion = "學習精靈有點累,請稍後再試!"
|
1722 |
|
1723 |
try:
|
1724 |
# 更新聊天历史
|
|
|
1735 |
print(f"Error: {e}")
|
1736 |
return "请求失败,请稍后再试!", chat_history
|
1737 |
|
1738 |
+
def chat_with_opan_ai_assistant(password, youtube_id, thread_id, trascript_state, key_moments, user_message, chat_history, content_subject, content_grade, socratic_mode=False):
|
1739 |
verify_password(password)
|
1740 |
|
1741 |
# 先計算 user_message 是否超過 500 個字
|
|
|
1749 |
raise gr.Error(error_msg)
|
1750 |
|
1751 |
try:
|
1752 |
+
assistant_id = "asst_kmvZLNkDUYaNkMNtZEAYxyPq" #GPT 4 turbo
|
1753 |
+
# assistant_id = "asst_5SaUElqvL3U0ybSi9PRM8x3P" #GPT 3.5 turbo
|
1754 |
+
|
1755 |
client = OPEN_AI_CLIENT
|
1756 |
# 直接安排逐字稿資料 in instructions
|
1757 |
+
if isinstance(trascript_state, str):
|
1758 |
+
trascript_json = json.loads(trascript_state)
|
1759 |
+
else:
|
1760 |
+
trascript_json = trascript_state
|
1761 |
# 移除 embed_url, screenshot_path
|
1762 |
for entry in trascript_json:
|
1763 |
+
entry.pop('end_time', None)
|
1764 |
+
trascript_text = json.dumps(trascript_json, ensure_ascii=False)
|
1765 |
+
|
1766 |
+
if isinstance(key_moments, str):
|
1767 |
+
key_moments_json = json.loads(key_moments)
|
1768 |
+
else:
|
1769 |
+
key_moments_json = key_moments
|
1770 |
+
# key_moments_json remove images
|
1771 |
+
for moment in key_moments_json:
|
1772 |
+
moment.pop('images', None)
|
1773 |
+
moment.pop('end', None)
|
1774 |
+
moment.pop('text', None)
|
1775 |
+
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False)
|
1776 |
+
|
1777 |
+
instructions = get_instructions(content_subject, content_grade, key_moments_text)
|
1778 |
+
print("=== instructions ===")
|
1779 |
+
print(instructions)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1780 |
|
1781 |
# 创建线程
|
1782 |
if not thread_id:
|
|
|
1785 |
else:
|
1786 |
thread = client.beta.threads.retrieve(thread_id)
|
1787 |
|
1788 |
+
user_msg_note = """\n (請一定要用繁體中文回答 zh-TW,
|
1789 |
+
請嚴格遵循instructions,擔任一位蘇格拉底家教,
|
1790 |
+
並用台灣人的禮貌口語表達,回答時不要特別說明這是台灣人的語氣,
|
1791 |
+
不用提到「逐字稿」這個詞,用「內容」代替),
|
1792 |
+
回答時請用數學符號代替文字(Latex 用 $ 字號 render)
|
1793 |
+
"""
|
1794 |
+
user_msg_note = user_msg_note.replace(" ","").replace("\n","")
|
1795 |
# 向线程添加用户的消息
|
1796 |
client.beta.threads.messages.create(
|
1797 |
thread_id=thread.id,
|
1798 |
role="user",
|
1799 |
+
content=user_message + user_msg_note
|
1800 |
)
|
1801 |
|
1802 |
# 运行助手,生成响应
|
|
|
1900 |
|
1901 |
return run.status
|
1902 |
|
1903 |
+
def streaming_chat_with_open_ai(user_message, chat_history, password, thread_id, trascript, key_moments, content_subject, content_grade):
|
1904 |
verify_password(password)
|
1905 |
|
1906 |
print("===streaming_chat_with_open_ai===")
|
|
|
1916 |
error_msg = "此次對話超過上限"
|
1917 |
raise gr.Error(error_msg)
|
1918 |
|
|
|
|
|
|
|
1919 |
try:
|
1920 |
+
assistant_id = "asst_kmvZLNkDUYaNkMNtZEAYxyPq" #GPT 4 turbo
|
1921 |
+
# assistant_id = "asst_5SaUElqvL3U0ybSi9PRM8x3P" #GPT 3.5 turbo
|
1922 |
client = OPEN_AI_CLIENT
|
1923 |
# 直接安排逐字稿資料 in instructions
|
1924 |
+
if isinstance(trascript, str):
|
1925 |
+
trascript_json = json.loads(trascript)
|
1926 |
+
else:
|
1927 |
+
trascript_json = trascript
|
1928 |
+
trascript_text = json.dumps(trascript_json, ensure_ascii=False)
|
|
|
1929 |
# trascript_text 移除 \n, 空白
|
1930 |
trascript_text = trascript_text.replace("\n", "").replace(" ", "")
|
1931 |
|
1932 |
+
if isinstance(key_moments, str):
|
1933 |
+
key_moments_json = json.loads(key_moments)
|
1934 |
+
else:
|
1935 |
+
key_moments_json = key_moments
|
1936 |
+
# key_moments_json remove images
|
1937 |
+
for moment in key_moments_json:
|
1938 |
+
moment.pop('images', None)
|
1939 |
+
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1940 |
|
1941 |
+
instructions = get_instructions(content_subject, content_grade, key_moments_text)
|
1942 |
# 创建线程
|
1943 |
if not thread_id:
|
1944 |
thread = client.beta.threads.create()
|
|
|
2039 |
|
2040 |
return admin, reading_passage_admin, summary_admin, see_detail, password_text, youtube_link
|
2041 |
|
2042 |
+
def update_state(content_subject, content_grade, trascript, key_moments, question_1, question_2, question_3):
    """Compute the values for the chatbot state and question-button outputs.

    Args:
        content_subject: Selected subject, passed through unchanged.
        content_grade: Selected grade, passed through unchanged.
        trascript: Transcript as a JSON string or as already-decoded data.
        key_moments: Key-moments value, passed through unchanged.
        question_1: Label for the first suggested-question button.
        question_2: Label for the second suggested-question button.
        question_3: Label for the third suggested-question button.

    Returns:
        An 8-tuple: subject, grade, the simplified transcript, key moments,
        a freshly created streaming-chat thread id, and the three question
        labels, in that order.
    """
    # Accept both JSON text and decoded data, matching how the other handlers
    # in this file treat transcript input; json.loads on a non-string raises.
    if isinstance(trascript, str):
        trascript_json = json.loads(trascript)
    else:
        trascript_json = trascript

    trascript_state = create_formatted_simple_transcript(trascript_json)
    streaming_chat_thread_id_state = create_thread_id()

    return (
        content_subject,
        content_grade,
        trascript_state,
        key_moments,
        streaming_chat_thread_id_state,
        question_1,
        question_2,
        question_3,
    )
|
2059 |
|
2060 |
|
|
|
2126 |
with gr.Row() as admin:
|
2127 |
password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
|
2128 |
youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
|
2129 |
+
video_id = gr.Textbox(label="video_id", visible=True)
|
2130 |
# file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
|
2131 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
2132 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
|
|
2135 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2136 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
2137 |
trascript_state = gr.State() # 使用 gr.State 存储 trascript
|
2138 |
+
key_moments_state = gr.State() # 使用 gr.State 存储 key_moments
|
2139 |
streaming_chat_thread_id_state = gr.State() # 使用 gr.State 存储 streaming_chat_thread_id
|
2140 |
with gr.Tab("AI小精靈"):
|
2141 |
with gr.Row():
|
|
|
2153 |
btn_3 = gr.Button("問題一")
|
2154 |
gr.Markdown("### 重新生成問題")
|
2155 |
btn_create_question = gr.Button("生成其他問題", variant="primary")
|
2156 |
+
openai_chatbot_audio_input = gr.Audio(sources=["microphone"], type="filepath", max_length=60)
|
2157 |
with gr.Row():
|
2158 |
msg = gr.Textbox(label="訊息",scale=3)
|
2159 |
send_button = gr.Button("送出", variant="primary", scale=1)
|
2160 |
with gr.Tab("飛特音速"):
|
2161 |
+
additional_inputs = [password, streaming_chat_thread_id_state, trascript_state, key_moments_state, content_subject_state, content_grade_state]
|
2162 |
streaming_chat = gr.ChatInterface(
|
2163 |
fn=streaming_chat_with_open_ai,
|
2164 |
additional_inputs=additional_inputs,
|
|
|
2177 |
ai_chatbot_question_1 = gr.Button("問題一")
|
2178 |
ai_chatbot_question_2 = gr.Button("問題一")
|
2179 |
ai_chatbot_question_3 = gr.Button("問題一")
|
2180 |
+
ai_chatbot_audio_input = gr.Audio(sources=["microphone"], type="filepath", max_length=60)
|
2181 |
with gr.Row():
|
2182 |
+
ai_msg = gr.Textbox(label="訊息輸入",scale=3)
|
2183 |
+
ai_send_button = gr.Button("送出", variant="primary",scale=1)
|
|
|
2184 |
with gr.Tab("文章模式"):
|
2185 |
with gr.Row() as reading_passage_admin:
|
2186 |
reading_passage_kind = gr.Textbox(value="reading_passage", show_label=False)
|
|
|
2204 |
with gr.Tab("關鍵時刻"):
|
2205 |
with gr.Row():
|
2206 |
key_moments_html = gr.HTML(value="")
|
|
|
2207 |
with gr.Tab("教學備課"):
|
2208 |
with gr.Row():
|
2209 |
content_subject = gr.Dropdown(label="選擇主題", choices=["數學", "自然", "國文", "英文", "社會","物理", "化學", "生物", "地理", "歷史", "公民"], value="", visible=False)
|
|
|
2291 |
# metacognition_content_btn = gr.Button("生成後設認知問題")
|
2292 |
|
2293 |
with gr.Accordion("See Details", open=False) as see_details:
|
2294 |
+
with gr.Tab("逐字稿本文"):
|
2295 |
+
with gr.Row() as transcript_admmin:
|
2296 |
+
transcript_kind = gr.Textbox(value="transcript", show_label=False)
|
2297 |
+
transcript_edit_button = gr.Button("編輯", size="sm", variant="primary")
|
2298 |
+
transcript_update_button = gr.Button("更新", size="sm", variant="primary")
|
2299 |
+
transcript_delete_button = gr.Button("刪除", size="sm", variant="primary")
|
2300 |
+
transcript_create_button = gr.Button("建立", size="sm", variant="primary")
|
2301 |
+
with gr.Row():
|
2302 |
+
df_string_output = gr.Textbox(lines=40, label="Data Text", interactive=False, show_copy_button=True)
|
2303 |
+
with gr.Tab("關鍵時刻本文"):
|
2304 |
+
with gr.Row() as key_moments_admin:
|
2305 |
+
key_moments_kind = gr.Textbox(value="key_moments", show_label=False)
|
2306 |
+
key_moments_edit_button = gr.Button("編輯", size="sm", variant="primary")
|
2307 |
+
key_moments_update_button = gr.Button("更新", size="sm", variant="primary")
|
2308 |
+
key_moments_delete_button = gr.Button("刪除", size="sm", variant="primary")
|
2309 |
+
key_moments_create_button = gr.Button("建立", size="sm", variant="primary")
|
2310 |
+
with gr.Row():
|
2311 |
+
key_moments = gr.Textbox(label="Key Moments", lines=40, interactive=False, show_copy_button=True)
|
2312 |
+
with gr.Tab("問題本文"):
|
2313 |
+
with gr.Row() as question_list_admin:
|
2314 |
+
questions_kind = gr.Textbox(value="questions", show_label=False)
|
2315 |
+
questions_edit_button = gr.Button("編輯", size="sm", variant="primary")
|
2316 |
+
questions_update_button = gr.Button("更新", size="sm", variant="primary")
|
2317 |
+
questions_delete_button = gr.Button("刪除", size="sm", variant="primary")
|
2318 |
+
questions_create_button = gr.Button("建立", size="sm", variant="primary")
|
2319 |
+
with gr.Row():
|
2320 |
+
questions_json = gr.Textbox(label="Questions", lines=40, interactive=False, show_copy_button=True)
|
2321 |
with gr.Tab("逐字稿"):
|
2322 |
simple_html_content = gr.HTML(label="Simple Transcript")
|
2323 |
with gr.Tab("圖文"):
|
|
|
2340 |
# OPENAI ASSISTANT CHATBOT 模式
|
2341 |
send_button.click(
|
2342 |
chat_with_opan_ai_assistant,
|
2343 |
+
inputs=[password, video_id, thread_id, trascript_state, key_moments, msg, chatbot, content_subject, content_grade, socratic_mode_btn],
|
2344 |
outputs=[msg, chatbot, thread_id]
|
2345 |
)
|
2346 |
openai_chatbot_audio_input.change(
|
|
|
2349 |
outputs=[msg]
|
2350 |
)
|
2351 |
# OPENAI ASSISTANT CHATBOT 連接按鈕點擊事件
|
2352 |
+
btn_1_chat_with_opan_ai_assistant_input =[password, video_id, thread_id, trascript_state, key_moments, btn_1, chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2353 |
+
btn_2_chat_with_opan_ai_assistant_input =[password, video_id, thread_id, trascript_state, key_moments, btn_2, chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2354 |
+
btn_3_chat_with_opan_ai_assistant_input =[password, video_id, thread_id, trascript_state, key_moments, btn_3, chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2355 |
btn_1.click(
|
2356 |
chat_with_opan_ai_assistant,
|
2357 |
inputs=btn_1_chat_with_opan_ai_assistant_input,
|
|
|
2376 |
# ai_chatbot 模式
|
2377 |
ai_send_button.click(
|
2378 |
chat_with_ai,
|
2379 |
+
inputs=[ai_name, password, video_id, trascript_state, key_moments, ai_msg, ai_chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn],
|
2380 |
outputs=[ai_msg, ai_chatbot]
|
2381 |
)
|
2382 |
# ai_chatbot 连接按钮点击事件
|
2383 |
+
ai_chatbot_question_1_chat_with_ai_input =[ai_name, password, video_id, trascript_state, key_moments, ai_chatbot_question_1, ai_chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2384 |
+
ai_chatbot_question_2_chat_with_ai_input =[ai_name, password, video_id, trascript_state, key_moments, ai_chatbot_question_2, ai_chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2385 |
+
ai_chatbot_question_3_chat_with_ai_input =[ai_name, password, video_id, trascript_state, key_moments, ai_chatbot_question_3, ai_chatbot, content_subject, content_grade, ai_chatbot_socratic_mode_btn]
|
2386 |
ai_chatbot_question_1.click(
|
2387 |
chat_with_ai,
|
2388 |
inputs=ai_chatbot_question_1_chat_with_ai_input,
|
|
|
2406 |
process_youtube_link_inputs = [password, youtube_link]
|
2407 |
process_youtube_link_outputs = [
|
2408 |
video_id,
|
2409 |
+
questions_json,
|
2410 |
btn_1,
|
2411 |
btn_2,
|
2412 |
btn_3,
|
2413 |
df_string_output,
|
2414 |
df_summarise,
|
2415 |
+
key_moments,
|
2416 |
key_moments_html,
|
2417 |
mind_map,
|
2418 |
mind_map_html,
|
|
|
2428 |
content_subject,
|
2429 |
content_grade,
|
2430 |
df_string_output,
|
2431 |
+
key_moments,
|
2432 |
btn_1,
|
2433 |
btn_2,
|
2434 |
btn_3
|
|
|
2436 |
update_state_outputs = [
|
2437 |
content_subject_state,
|
2438 |
content_grade_state,
|
2439 |
+
trascript_state,
|
2440 |
+
key_moments_state,
|
2441 |
streaming_chat_thread_id_state,
|
2442 |
ai_chatbot_question_1,
|
2443 |
ai_chatbot_question_2,
|
|
|
2512 |
outputs=[df_summarise]
|
2513 |
)
|
2514 |
|
2515 |
+
# transcript event
|
2516 |
+
transcript_create_button.click(
|
2517 |
+
create_LLM_content,
|
2518 |
+
inputs=[video_id, df_string_output, transcript_kind],
|
2519 |
+
outputs=[df_string_output]
|
2520 |
+
)
|
2521 |
+
transcript_delete_button.click(
|
2522 |
+
delete_LLM_content,
|
2523 |
+
inputs=[video_id, transcript_kind],
|
2524 |
+
outputs=[df_string_output]
|
2525 |
+
)
|
2526 |
+
transcript_edit_button.click(
|
2527 |
+
enable_edit_mode,
|
2528 |
+
inputs=[],
|
2529 |
+
outputs=[df_string_output]
|
2530 |
+
)
|
2531 |
+
transcript_update_button.click(
|
2532 |
+
update_LLM_content,
|
2533 |
+
inputs=[video_id, df_string_output, transcript_kind],
|
2534 |
+
outputs=[df_string_output]
|
2535 |
+
)
|
2536 |
+
|
2537 |
+
# key_moments event
|
2538 |
+
key_moments_create_button.click(
|
2539 |
+
create_LLM_content,
|
2540 |
+
inputs=[video_id, df_string_output, key_moments_kind],
|
2541 |
+
outputs=[key_moments]
|
2542 |
+
)
|
2543 |
+
key_moments_delete_button.click(
|
2544 |
+
delete_LLM_content,
|
2545 |
+
inputs=[video_id, key_moments_kind],
|
2546 |
+
outputs=[key_moments]
|
2547 |
+
)
|
2548 |
+
key_moments_edit_button.click(
|
2549 |
+
enable_edit_mode,
|
2550 |
+
inputs=[],
|
2551 |
+
outputs=[key_moments]
|
2552 |
+
)
|
2553 |
+
key_moments_update_button.click(
|
2554 |
+
update_LLM_content,
|
2555 |
+
inputs=[video_id, key_moments, key_moments_kind],
|
2556 |
+
outputs=[key_moments]
|
2557 |
+
)
|
2558 |
+
|
2559 |
+
# question_list event
|
2560 |
+
questions_create_button.click(
|
2561 |
+
create_LLM_content,
|
2562 |
+
inputs=[video_id, df_string_output, questions_kind],
|
2563 |
+
outputs=[questions_json]
|
2564 |
+
)
|
2565 |
+
questions_delete_button.click(
|
2566 |
+
delete_LLM_content,
|
2567 |
+
inputs=[video_id, questions_kind],
|
2568 |
+
outputs=[questions_json]
|
2569 |
+
)
|
2570 |
+
questions_edit_button.click(
|
2571 |
+
enable_edit_mode,
|
2572 |
+
inputs=[],
|
2573 |
+
outputs=[questions_json]
|
2574 |
+
)
|
2575 |
+
questions_update_button.click(
|
2576 |
+
update_LLM_content,
|
2577 |
+
inputs=[video_id, questions_json, questions_kind],
|
2578 |
+
outputs=[questions_json]
|
2579 |
+
)
|
2580 |
+
|
2581 |
# 教師版
|
2582 |
worksheet_content_btn.click(
|
2583 |
get_ai_content,
|
chatbot.py
CHANGED
@@ -8,65 +8,46 @@ class Chatbot:
|
|
8 |
self.content_subject = config.get('content_subject')
|
9 |
self.content_grade = config.get('content_grade')
|
10 |
self.jutor_chat_key = config.get('jutor_chat_key')
|
11 |
-
self.transcript_text = self.get_transcript_text(config.get('
|
|
|
12 |
self.ai_name = config.get('ai_name')
|
13 |
self.ai_client = config.get('ai_client')
|
|
|
14 |
|
15 |
def get_transcript_text(self, transcript_data):
|
16 |
-
|
|
|
|
|
|
|
17 |
for entry in transcript_json:
|
18 |
-
entry.pop('
|
19 |
-
entry.pop('screenshot_path', None)
|
20 |
transcript_text = json.dumps(transcript_json, ensure_ascii=False)
|
21 |
return transcript_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def chat(self, user_message, chat_history, socratic_mode=False, service_type='jutor'):
|
24 |
messages = self.prepare_messages(chat_history, user_message)
|
25 |
-
system_prompt = self.
|
26 |
if service_type in ['jutor', 'groq', 'claude3']:
|
27 |
response_text = self.chat_with_service(service_type, system_prompt, messages)
|
28 |
return response_text
|
29 |
else:
|
30 |
raise gr.Error("不支持此服務")
|
31 |
|
32 |
-
def prepare_system_prompt(self, socratic_mode):
|
33 |
-
content_subject = self.content_subject
|
34 |
-
content_grade = self.content_grade
|
35 |
-
video_id = self.video_id
|
36 |
-
trascript_text = self.transcript_text
|
37 |
-
socratic_mode = str(socratic_mode)
|
38 |
-
ai_name = self.ai_name
|
39 |
-
system_prompt = f"""
|
40 |
-
科目:{content_subject}
|
41 |
-
年級:{content_grade}
|
42 |
-
逐字稿資料:{trascript_text}
|
43 |
-
-------------------------------------
|
44 |
-
你是一個專業的{content_subject}老師, user 為{content_grade}的學生
|
45 |
-
socratic_mode = {socratic_mode}
|
46 |
-
if socratic_mode is True,
|
47 |
-
- 請用蘇格拉底式的提問方式,引導學生思考,並且給予學生一些提示
|
48 |
-
- 一次只問一個問題,字數在100字以內
|
49 |
-
- 不要直接給予答案,讓學生自己思考
|
50 |
-
- 但可以給予一些提示跟引導,例如給予影片的時間軸,讓學生自己去找答案
|
51 |
-
|
52 |
-
if socratic_mode is False,
|
53 |
-
- 直接回答學生問題,字數在100字以內
|
54 |
-
|
55 |
-
rule:
|
56 |
-
- 請一定要用繁體中文回答 zh-TW,並用台灣人的口語表達,回答時不用特別說明這是台灣人的語氣,也不用說這是「台語的說法」
|
57 |
-
- 不用提到「逐字稿」這個詞
|
58 |
-
- 如果學生問了一些問題你無法判斷,請告訴學生你無法判斷,並建議學生可以問其他問題
|
59 |
-
- 或者你可以反問學生一些問題,幫助學生更好的理解資料,字數在100字以內
|
60 |
-
- 如果學生的問題與資料文本無關,請告訴學生你「無法回答超出影片範圍的問題」,並告訴他可以怎麼問什麼樣的問題(一個就好)
|
61 |
-
- 只要是參考逐字稿資料,請在回答的最後標註【參考資料:(分):(秒)】
|
62 |
-
- 回答範圍一定要在逐字稿資料內,不要引用其他資料,請嚴格執行
|
63 |
-
- 並在重複問句後給予學生鼓勵,讓學生有學習的動力
|
64 |
-
- 請用 {content_grade} 的學生能懂的方式回答
|
65 |
-
- 回答時數學式請用數學符號代替文字(Latex 用 $ 字號 render)
|
66 |
-
"""
|
67 |
-
|
68 |
-
return system_prompt
|
69 |
-
|
70 |
def prepare_messages(self, chat_history, user_message):
|
71 |
messages = []
|
72 |
if chat_history is not None:
|
@@ -101,12 +82,14 @@ class Chatbot:
|
|
101 |
"Content-Type": "application/json",
|
102 |
"x-api-key": self.jutor_chat_key,
|
103 |
}
|
|
|
|
|
104 |
data = {
|
105 |
"data": {
|
106 |
"messages": messages,
|
107 |
"max_tokens": 512,
|
108 |
"temperature": 0.9,
|
109 |
-
"model":
|
110 |
"stream": False,
|
111 |
}
|
112 |
}
|
|
|
8 |
self.content_subject = config.get('content_subject')
|
9 |
self.content_grade = config.get('content_grade')
|
10 |
self.jutor_chat_key = config.get('jutor_chat_key')
|
11 |
+
self.transcript_text = self.get_transcript_text(config.get('transcript'))
|
12 |
+
self.key_moments_text = self.get_key_moments_text(config.get('key_moments'))
|
13 |
self.ai_name = config.get('ai_name')
|
14 |
self.ai_client = config.get('ai_client')
|
15 |
+
self.instructions = config.get('instructions')
|
16 |
|
17 |
def get_transcript_text(self, transcript_data):
    """Serialise transcript entries to JSON text without their ``end_time``.

    Args:
        transcript_data: A JSON string or an iterable of entry dicts. ``None``
            or an empty value is treated as an empty transcript.

    Returns:
        JSON text (non-ASCII characters kept as-is) of the entries with the
        ``end_time`` key removed.
    """
    if not transcript_data:
        # Missing config value or empty input: nothing to serialise.
        return json.dumps([], ensure_ascii=False)

    if isinstance(transcript_data, str):
        transcript_json = json.loads(transcript_data)
    else:
        transcript_json = transcript_data

    # Build new dicts instead of popping keys in place, so a list supplied by
    # the caller is not modified as a side effect.
    trimmed_entries = [
        {key: value for key, value in entry.items() if key != 'end_time'}
        for entry in transcript_json
    ]
    transcript_text = json.dumps(trimmed_entries, ensure_ascii=False)
    return transcript_text
|
26 |
+
|
27 |
+
def get_key_moments_text(self, key_moments_data):
    """Serialise key moments to JSON text without bulky fields.

    Args:
        key_moments_data: A JSON string or an iterable of key-moment dicts.
            ``None`` or an empty value is treated as no key moments.

    Returns:
        JSON text (non-ASCII characters kept as-is) of the key moments with
        the ``images``, ``end`` and ``transcript`` keys removed.
    """
    if not key_moments_data:
        # Missing config value or empty input: nothing to serialise.
        return json.dumps([], ensure_ascii=False)

    if isinstance(key_moments_data, str):
        key_moments_json = json.loads(key_moments_data)
    else:
        key_moments_json = key_moments_data

    dropped_keys = ('images', 'end', 'transcript')
    # Copy each moment rather than popping keys in place, so a list supplied
    # by the caller is not modified as a side effect.
    trimmed_moments = [
        {key: value for key, value in moment.items() if key not in dropped_keys}
        for moment in key_moments_json
    ]

    key_moments_text = json.dumps(trimmed_moments, ensure_ascii=False)
    return key_moments_text
|
40 |
+
|
41 |
|
42 |
def chat(self, user_message, chat_history, socratic_mode=False, service_type='jutor'):
    """Send one user message to the selected chat service.

    Args:
        user_message: The new message from the user.
        chat_history: Earlier conversation passed to ``prepare_messages``.
        socratic_mode: Accepted for interface compatibility; not read here.
        service_type: ``'jutor'``, ``'groq'`` or ``'claude3'``.

    Returns:
        Whatever ``chat_with_service`` returns for the request.

    Raises:
        gr.Error: If ``service_type`` is not one of the supported services.
    """
    supported_services = ['jutor', 'groq', 'claude3']
    prepared = self.prepare_messages(chat_history, user_message)
    # The prebuilt instructions act as the system prompt.
    prompt = self.instructions
    if service_type not in supported_services:
        raise gr.Error("不支持此服務")
    return self.chat_with_service(service_type, prompt, prepared)
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def prepare_messages(self, chat_history, user_message):
|
52 |
messages = []
|
53 |
if chat_history is not None:
|
|
|
82 |
"Content-Type": "application/json",
|
83 |
"x-api-key": self.jutor_chat_key,
|
84 |
}
|
85 |
+
model = "gpt-4-1106-preview"
|
86 |
+
# model = "gpt-3.5-turbo-0125"
|
87 |
data = {
|
88 |
"data": {
|
89 |
"messages": messages,
|
90 |
"max_tokens": 512,
|
91 |
"temperature": 0.9,
|
92 |
+
"model": model,
|
93 |
"stream": False,
|
94 |
}
|
95 |
}
|