Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -427,7 +427,6 @@ def get_video_duration(video_id):
|
|
427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
429 |
# GCS
|
430 |
-
gcs_client = GCS_CLIENT
|
431 |
bucket_name = 'video_ai_assistant'
|
432 |
# 逐字稿文件名
|
433 |
transcript_file_name = f'{video_id}_transcript.json'
|
@@ -552,9 +551,6 @@ def process_youtube_link(password, link):
|
|
552 |
}
|
553 |
formatted_simple_transcript.append(simple_line)
|
554 |
|
555 |
-
global TRANSCRIPTS
|
556 |
-
TRANSCRIPTS = formatted_transcript
|
557 |
-
|
558 |
# 基于逐字稿生成其他所需的输出
|
559 |
source = "gcs"
|
560 |
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
|
@@ -568,9 +564,6 @@ def process_youtube_link(password, link):
|
|
568 |
key_moments_html = get_key_moments_html(key_moments)
|
569 |
html_content = format_transcript_to_html(formatted_transcript)
|
570 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
571 |
-
first_image = formatted_transcript[0]['screenshot_path']
|
572 |
-
# first_image = "https://www.nameslook.com/names/dfsadf-nameslook.png"
|
573 |
-
first_text = formatted_transcript[0]['text']
|
574 |
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
|
575 |
mind_map = mind_map_json["mind_map"]
|
576 |
mind_map_html = get_mind_map_html(mind_map)
|
@@ -593,8 +586,6 @@ def process_youtube_link(password, link):
|
|
593 |
mind_map_html, \
|
594 |
html_content, \
|
595 |
simple_html_content, \
|
596 |
-
first_image, \
|
597 |
-
first_text, \
|
598 |
reading_passage_text, \
|
599 |
reading_passage, \
|
600 |
subject, \
|
@@ -1335,28 +1326,6 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1335 |
|
1336 |
def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
1337 |
print("===generate_key_moments===")
|
1338 |
-
# 使用 OpenAI 生成基于上传数据的问题
|
1339 |
-
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
1340 |
-
user_content = f"""
|
1341 |
-
請根據 {formatted_simple_transcript} 文本,提取出重點摘要,並給出對應的時間軸
|
1342 |
-
1. 小範圍切出不同段落的相對應時間軸的重點摘要,
|
1343 |
-
2. 每一小段最多不超過 1/5 的總內容,也就是大約 3~5段的重點(例如五~十分鐘的影片就一段大約1~2分鐘,最多三分鐘,但如果是超過十分鐘的影片,那一小段大約 2~3分鐘,以此類推)
|
1344 |
-
3. 注意不要遺漏任何一段時間軸的內容 從零秒開始
|
1345 |
-
4. 如果頭尾的情節不是重點,特別是打招呼或是介紹人物、或是say goodbye 就是不重要的情節,就不用擷取
|
1346 |
-
5. 以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
|
1347 |
-
6. 關鍵字從transcript extract to keyword,保留專家名字、專業術語、年份、數字、期刊名稱、地名、數學公式
|
1348 |
-
7. text, keywords please use or transfer to zh-TW, it's very important
|
1349 |
-
|
1350 |
-
Example: retrun JSON
|
1351 |
-
{{key_moments:[{{
|
1352 |
-
"start": "00:00",
|
1353 |
-
"end": "01:00",
|
1354 |
-
"text": "逐字稿的重點摘要",
|
1355 |
-
"keywords": ["關鍵字", "關鍵字"]
|
1356 |
-
}}]
|
1357 |
-
}}
|
1358 |
-
"""
|
1359 |
-
|
1360 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
1361 |
all_content = []
|
1362 |
|
@@ -1562,6 +1531,29 @@ def get_key_moments_html(key_moments):
|
|
1562 |
position: absolute;
|
1563 |
width: 1px;
|
1564 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1565 |
|
1566 |
@media (max-width: 768px) {
|
1567 |
#gallery-main {
|
@@ -1614,6 +1606,9 @@ def get_key_moments_html(key_moments):
|
|
1614 |
</div>
|
1615 |
"""
|
1616 |
|
|
|
|
|
|
|
1617 |
key_moments_html += f"""
|
1618 |
<div class="gallery-container" id="gallery-main">
|
1619 |
<div id="gallery"><!-- gallery start -->
|
@@ -1624,7 +1619,11 @@ def get_key_moments_html(key_moments):
|
|
1624 |
<div id="text-content">
|
1625 |
<h3>{moment['start']} - {moment['end']}</h3>
|
1626 |
<p><strong>摘要: {moment['text']} </strong></p>
|
1627 |
-
<
|
|
|
|
|
|
|
|
|
1628 |
</div>
|
1629 |
</div>
|
1630 |
"""
|
@@ -1647,6 +1646,9 @@ def get_LLM_content(video_id, kind):
|
|
1647 |
content_text = content_json["reading_passage"]
|
1648 |
elif kind == "summary_markdown":
|
1649 |
content_text = content_json["summary"]
|
|
|
|
|
|
|
1650 |
else:
|
1651 |
content_text = json.dumps(content_json, ensure_ascii=False, indent=2)
|
1652 |
else:
|
@@ -1701,8 +1703,9 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1701 |
else:
|
1702 |
key_moments_list = new_content
|
1703 |
key_moments_json = {"key_moments": key_moments_list}
|
1704 |
-
|
1705 |
-
GCS_SERVICE.upload_json_string(bucket_name, blob_name,
|
|
|
1706 |
updated_content = key_moments_text
|
1707 |
elif kind == "transcript":
|
1708 |
if isinstance(new_content, str):
|
@@ -2588,34 +2591,6 @@ def show_all_chatbot_accordion():
|
|
2588 |
all_chatbot_select_btn_visible = gr.update(visible=False)
|
2589 |
return chatbot_select_accordion_visible, all_chatbot_select_btn_visible
|
2590 |
|
2591 |
-
# --- Slide mode ---
|
2592 |
-
def update_slide(direction):
|
2593 |
-
global TRANSCRIPTS
|
2594 |
-
global CURRENT_INDEX
|
2595 |
-
|
2596 |
-
print("=== 更新投影片 ===")
|
2597 |
-
print(f"CURRENT_INDEX: {CURRENT_INDEX}")
|
2598 |
-
# print(f"TRANSCRIPTS: {TRANSCRIPTS}")
|
2599 |
-
|
2600 |
-
CURRENT_INDEX += direction
|
2601 |
-
if CURRENT_INDEX < 0:
|
2602 |
-
CURRENT_INDEX = 0 # 防止索引小于0
|
2603 |
-
elif CURRENT_INDEX >= len(TRANSCRIPTS):
|
2604 |
-
CURRENT_INDEX = len(TRANSCRIPTS) - 1 # 防止索引超出范围
|
2605 |
-
|
2606 |
-
# 获取当前条目的文本和截图 URL
|
2607 |
-
current_transcript = TRANSCRIPTS[CURRENT_INDEX]
|
2608 |
-
slide_image = current_transcript["screenshot_path"]
|
2609 |
-
slide_text = current_transcript["text"]
|
2610 |
-
|
2611 |
-
return slide_image, slide_text
|
2612 |
-
|
2613 |
-
def prev_slide():
|
2614 |
-
return update_slide(-1)
|
2615 |
-
|
2616 |
-
def next_slide():
|
2617 |
-
return update_slide(1)
|
2618 |
-
|
2619 |
|
2620 |
# --- Init params ---
|
2621 |
def init_params(text, request: gr.Request):
|
@@ -2649,7 +2624,7 @@ def init_params(text, request: gr.Request):
|
|
2649 |
# check if origin is from junyiacademy
|
2650 |
origin = request.headers.get("origin", "")
|
2651 |
if "junyiacademy" in origin:
|
2652 |
-
password_text =
|
2653 |
admin = gr.update(visible=False)
|
2654 |
reading_passage_admin = gr.update(visible=False)
|
2655 |
summary_admin = gr.update(visible=False)
|
@@ -3059,14 +3034,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3059 |
simple_html_content = gr.HTML(label="Simple Transcript")
|
3060 |
with gr.Tab("圖文"):
|
3061 |
transcript_html = gr.HTML(label="YouTube Transcript and Video")
|
3062 |
-
with gr.Tab("投影片"):
|
3063 |
-
slide_image = gr.Image()
|
3064 |
-
slide_text = gr.Textbox()
|
3065 |
-
with gr.Row():
|
3066 |
-
prev_button = gr.Button("Previous")
|
3067 |
-
next_button = gr.Button("Next")
|
3068 |
-
prev_button.click(fn=prev_slide, inputs=[], outputs=[slide_image, slide_text])
|
3069 |
-
next_button.click(fn=next_slide, inputs=[], outputs=[slide_image, slide_text])
|
3070 |
with gr.Tab("markdown"):
|
3071 |
gr.Markdown("## 請複製以下 markdown 並貼到你的心智圖工具中,建議使用:https://markmap.js.org/repl")
|
3072 |
mind_map = gr.Textbox(container=True, show_copy_button=True, lines=40, elem_id="mind_map_markdown")
|
@@ -3217,8 +3184,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3217 |
mind_map_html,
|
3218 |
transcript_html,
|
3219 |
simple_html_content,
|
3220 |
-
slide_image,
|
3221 |
-
slide_text,
|
3222 |
reading_passage_text,
|
3223 |
reading_passage,
|
3224 |
content_subject,
|
|
|
427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
429 |
# GCS
|
|
|
430 |
bucket_name = 'video_ai_assistant'
|
431 |
# 逐字稿文件名
|
432 |
transcript_file_name = f'{video_id}_transcript.json'
|
|
|
551 |
}
|
552 |
formatted_simple_transcript.append(simple_line)
|
553 |
|
|
|
|
|
|
|
554 |
# 基于逐字稿生成其他所需的输出
|
555 |
source = "gcs"
|
556 |
questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
|
|
|
564 |
key_moments_html = get_key_moments_html(key_moments)
|
565 |
html_content = format_transcript_to_html(formatted_transcript)
|
566 |
simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
|
|
|
|
|
|
|
567 |
mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
|
568 |
mind_map = mind_map_json["mind_map"]
|
569 |
mind_map_html = get_mind_map_html(mind_map)
|
|
|
586 |
mind_map_html, \
|
587 |
html_content, \
|
588 |
simple_html_content, \
|
|
|
|
|
589 |
reading_passage_text, \
|
590 |
reading_passage, \
|
591 |
subject, \
|
|
|
1326 |
|
1327 |
def generate_key_moments(formatted_simple_transcript, formatted_transcript):
|
1328 |
print("===generate_key_moments===")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1329 |
segments = split_data(formatted_simple_transcript, word_base=100000)
|
1330 |
all_content = []
|
1331 |
|
|
|
1531 |
position: absolute;
|
1532 |
width: 1px;
|
1533 |
}
|
1534 |
+
.keyword-label {
|
1535 |
+
display: inline-block;
|
1536 |
+
padding: 5px 10px;
|
1537 |
+
margin: 2px;
|
1538 |
+
border: 2px solid black;
|
1539 |
+
border-radius: 5px;
|
1540 |
+
font-size: 0.9em;
|
1541 |
+
}
|
1542 |
+
details {
|
1543 |
+
border-radius: 5px;
|
1544 |
+
padding: 10px;
|
1545 |
+
border: 2px solid black;
|
1546 |
+
}
|
1547 |
+
|
1548 |
+
summary {
|
1549 |
+
font-weight: bold;
|
1550 |
+
cursor: pointer;
|
1551 |
+
outline: none;
|
1552 |
+
}
|
1553 |
+
|
1554 |
+
summary::-webkit-details-marker {
|
1555 |
+
display: none;
|
1556 |
+
}
|
1557 |
|
1558 |
@media (max-width: 768px) {
|
1559 |
#gallery-main {
|
|
|
1606 |
</div>
|
1607 |
"""
|
1608 |
|
1609 |
+
keywords_html = ' '.join([f'<span class="keyword-label">{keyword}</span>' for keyword in moment['keywords']])
|
1610 |
+
|
1611 |
+
|
1612 |
key_moments_html += f"""
|
1613 |
<div class="gallery-container" id="gallery-main">
|
1614 |
<div id="gallery"><!-- gallery start -->
|
|
|
1619 |
<div id="text-content">
|
1620 |
<h3>{moment['start']} - {moment['end']}</h3>
|
1621 |
<p><strong>摘要: {moment['text']} </strong></p>
|
1622 |
+
<details>
|
1623 |
+
<summary>逐字稿</summary>
|
1624 |
+
<p><strong>內容: </strong> {moment['transcript']} </p>
|
1625 |
+
</details>
|
1626 |
+
<p><strong>關鍵字:</strong> {keywords_html}</p>
|
1627 |
</div>
|
1628 |
</div>
|
1629 |
"""
|
|
|
1646 |
content_text = content_json["reading_passage"]
|
1647 |
elif kind == "summary_markdown":
|
1648 |
content_text = content_json["summary"]
|
1649 |
+
elif kind == "key_moments":
|
1650 |
+
content_text = content_json["key_moments"]
|
1651 |
+
content_text = json.dumps(content_text, ensure_ascii=False, indent=2)
|
1652 |
else:
|
1653 |
content_text = json.dumps(content_json, ensure_ascii=False, indent=2)
|
1654 |
else:
|
|
|
1703 |
else:
|
1704 |
key_moments_list = new_content
|
1705 |
key_moments_json = {"key_moments": key_moments_list}
|
1706 |
+
key_moments_json_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1707 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_json_text)
|
1708 |
+
key_moments_text = json.dumps(key_moments_list, ensure_ascii=False, indent=2)
|
1709 |
updated_content = key_moments_text
|
1710 |
elif kind == "transcript":
|
1711 |
if isinstance(new_content, str):
|
|
|
2591 |
all_chatbot_select_btn_visible = gr.update(visible=False)
|
2592 |
return chatbot_select_accordion_visible, all_chatbot_select_btn_visible
|
2593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2594 |
|
2595 |
# --- Init params ---
|
2596 |
def init_params(text, request: gr.Request):
|
|
|
2624 |
# check if origin is from junyiacademy
|
2625 |
origin = request.headers.get("origin", "")
|
2626 |
if "junyiacademy" in origin:
|
2627 |
+
password_text = PASSWORD
|
2628 |
admin = gr.update(visible=False)
|
2629 |
reading_passage_admin = gr.update(visible=False)
|
2630 |
summary_admin = gr.update(visible=False)
|
|
|
3034 |
simple_html_content = gr.HTML(label="Simple Transcript")
|
3035 |
with gr.Tab("圖文"):
|
3036 |
transcript_html = gr.HTML(label="YouTube Transcript and Video")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3037 |
with gr.Tab("markdown"):
|
3038 |
gr.Markdown("## 請複製以下 markdown 並貼到你的心智圖工具中,建議使用:https://markmap.js.org/repl")
|
3039 |
mind_map = gr.Textbox(container=True, show_copy_button=True, lines=40, elem_id="mind_map_markdown")
|
|
|
3184 |
mind_map_html,
|
3185 |
transcript_html,
|
3186 |
simple_html_content,
|
|
|
|
|
3187 |
reading_passage_text,
|
3188 |
reading_passage,
|
3189 |
content_subject,
|