youngtsai committed on
Commit
c205271
1 Parent(s): aaa2911
Files changed (1) hide show
  1. app.py +38 -73
app.py CHANGED
@@ -427,7 +427,6 @@ def get_video_duration(video_id):
427
  def process_transcript_and_screenshots_on_gcs(video_id):
428
  print("====process_transcript_and_screenshots_on_gcs====")
429
  # GCS
430
- gcs_client = GCS_CLIENT
431
  bucket_name = 'video_ai_assistant'
432
  # 逐字稿文件名
433
  transcript_file_name = f'{video_id}_transcript.json'
@@ -552,9 +551,6 @@ def process_youtube_link(password, link):
552
  }
553
  formatted_simple_transcript.append(simple_line)
554
 
555
- global TRANSCRIPTS
556
- TRANSCRIPTS = formatted_transcript
557
-
558
  # 基于逐字稿生成其他所需的输出
559
  source = "gcs"
560
  questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
@@ -568,9 +564,6 @@ def process_youtube_link(password, link):
568
  key_moments_html = get_key_moments_html(key_moments)
569
  html_content = format_transcript_to_html(formatted_transcript)
570
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
571
- first_image = formatted_transcript[0]['screenshot_path']
572
- # first_image = "https://www.nameslook.com/names/dfsadf-nameslook.png"
573
- first_text = formatted_transcript[0]['text']
574
  mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
575
  mind_map = mind_map_json["mind_map"]
576
  mind_map_html = get_mind_map_html(mind_map)
@@ -593,8 +586,6 @@ def process_youtube_link(password, link):
593
  mind_map_html, \
594
  html_content, \
595
  simple_html_content, \
596
- first_image, \
597
- first_text, \
598
  reading_passage_text, \
599
  reading_passage, \
600
  subject, \
@@ -1335,28 +1326,6 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1335
 
1336
  def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1337
  print("===generate_key_moments===")
1338
- # 使用 OpenAI 生成基于上传数据的问题
1339
- sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
1340
- user_content = f"""
1341
- 請根據 {formatted_simple_transcript} 文本,提取出重點摘要,並給出對應的時間軸
1342
- 1. 小範圍切出不同段落的相對應時間軸的重點摘要,
1343
- 2. 每一小段最多不超過 1/5 的總內容,也就是大約 3~5段的重點(例如五~十分鐘的影片就一段大約1~2分鐘,最多三分鐘,但如果是超過十分鐘的影片,那一小段大約 2~3分鐘,以此類推)
1344
- 3. 注意不要遺漏任何一段時間軸的內容 從零秒開始
1345
- 4. 如果頭尾的情節不是重點,特別是打招呼或是介紹人物、或是say goodbye 就是不重要的情節,就不用擷取
1346
- 5. 以這種方式分析整個文本,從零秒開始分析,直到結束。這很重要
1347
- 6. 關鍵字從transcript extract to keyword,保留專家名字、專業術語、年份、數字、期刊名稱、地名、數學公式
1348
- 7. text, keywords please use or transfer to zh-TW, it's very important
1349
-
1350
- Example: retrun JSON
1351
- {{key_moments:[{{
1352
- "start": "00:00",
1353
- "end": "01:00",
1354
- "text": "逐字稿的重點摘要",
1355
- "keywords": ["關鍵字", "關鍵字"]
1356
- }}]
1357
- }}
1358
- """
1359
-
1360
  segments = split_data(formatted_simple_transcript, word_base=100000)
1361
  all_content = []
1362
 
@@ -1562,6 +1531,29 @@ def get_key_moments_html(key_moments):
1562
  position: absolute;
1563
  width: 1px;
1564
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1565
 
1566
  @media (max-width: 768px) {
1567
  #gallery-main {
@@ -1614,6 +1606,9 @@ def get_key_moments_html(key_moments):
1614
  </div>
1615
  """
1616
 
 
 
 
1617
  key_moments_html += f"""
1618
  <div class="gallery-container" id="gallery-main">
1619
  <div id="gallery"><!-- gallery start -->
@@ -1624,7 +1619,11 @@ def get_key_moments_html(key_moments):
1624
  <div id="text-content">
1625
  <h3>{moment['start']} - {moment['end']}</h3>
1626
  <p><strong>摘要: {moment['text']} </strong></p>
1627
- <p>內容: {moment['transcript']}</p>
 
 
 
 
1628
  </div>
1629
  </div>
1630
  """
@@ -1647,6 +1646,9 @@ def get_LLM_content(video_id, kind):
1647
  content_text = content_json["reading_passage"]
1648
  elif kind == "summary_markdown":
1649
  content_text = content_json["summary"]
 
 
 
1650
  else:
1651
  content_text = json.dumps(content_json, ensure_ascii=False, indent=2)
1652
  else:
@@ -1701,8 +1703,9 @@ def update_LLM_content(video_id, new_content, kind):
1701
  else:
1702
  key_moments_list = new_content
1703
  key_moments_json = {"key_moments": key_moments_list}
1704
- key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1705
- GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
 
1706
  updated_content = key_moments_text
1707
  elif kind == "transcript":
1708
  if isinstance(new_content, str):
@@ -2588,34 +2591,6 @@ def show_all_chatbot_accordion():
2588
  all_chatbot_select_btn_visible = gr.update(visible=False)
2589
  return chatbot_select_accordion_visible, all_chatbot_select_btn_visible
2590
 
2591
- # --- Slide mode ---
2592
- def update_slide(direction):
2593
- global TRANSCRIPTS
2594
- global CURRENT_INDEX
2595
-
2596
- print("=== 更新投影片 ===")
2597
- print(f"CURRENT_INDEX: {CURRENT_INDEX}")
2598
- # print(f"TRANSCRIPTS: {TRANSCRIPTS}")
2599
-
2600
- CURRENT_INDEX += direction
2601
- if CURRENT_INDEX < 0:
2602
- CURRENT_INDEX = 0 # 防止索引小于0
2603
- elif CURRENT_INDEX >= len(TRANSCRIPTS):
2604
- CURRENT_INDEX = len(TRANSCRIPTS) - 1 # 防止索引超出范围
2605
-
2606
- # 获取当前条目的文本和截图 URL
2607
- current_transcript = TRANSCRIPTS[CURRENT_INDEX]
2608
- slide_image = current_transcript["screenshot_path"]
2609
- slide_text = current_transcript["text"]
2610
-
2611
- return slide_image, slide_text
2612
-
2613
- def prev_slide():
2614
- return update_slide(-1)
2615
-
2616
- def next_slide():
2617
- return update_slide(1)
2618
-
2619
 
2620
  # --- Init params ---
2621
  def init_params(text, request: gr.Request):
@@ -2649,7 +2624,7 @@ def init_params(text, request: gr.Request):
2649
  # check if origin is from junyiacademy
2650
  origin = request.headers.get("origin", "")
2651
  if "junyiacademy" in origin:
2652
- password_text = "6161"
2653
  admin = gr.update(visible=False)
2654
  reading_passage_admin = gr.update(visible=False)
2655
  summary_admin = gr.update(visible=False)
@@ -3059,14 +3034,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3059
  simple_html_content = gr.HTML(label="Simple Transcript")
3060
  with gr.Tab("圖文"):
3061
  transcript_html = gr.HTML(label="YouTube Transcript and Video")
3062
- with gr.Tab("投影片"):
3063
- slide_image = gr.Image()
3064
- slide_text = gr.Textbox()
3065
- with gr.Row():
3066
- prev_button = gr.Button("Previous")
3067
- next_button = gr.Button("Next")
3068
- prev_button.click(fn=prev_slide, inputs=[], outputs=[slide_image, slide_text])
3069
- next_button.click(fn=next_slide, inputs=[], outputs=[slide_image, slide_text])
3070
  with gr.Tab("markdown"):
3071
  gr.Markdown("## 請複製以下 markdown 並貼到你的心智圖工具中,建議使用:https://markmap.js.org/repl")
3072
  mind_map = gr.Textbox(container=True, show_copy_button=True, lines=40, elem_id="mind_map_markdown")
@@ -3217,8 +3184,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3217
  mind_map_html,
3218
  transcript_html,
3219
  simple_html_content,
3220
- slide_image,
3221
- slide_text,
3222
  reading_passage_text,
3223
  reading_passage,
3224
  content_subject,
 
427
  def process_transcript_and_screenshots_on_gcs(video_id):
428
  print("====process_transcript_and_screenshots_on_gcs====")
429
  # GCS
 
430
  bucket_name = 'video_ai_assistant'
431
  # 逐字稿文件名
432
  transcript_file_name = f'{video_id}_transcript.json'
 
551
  }
552
  formatted_simple_transcript.append(simple_line)
553
 
 
 
 
554
  # 基于逐字稿生成其他所需的输出
555
  source = "gcs"
556
  questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
 
564
  key_moments_html = get_key_moments_html(key_moments)
565
  html_content = format_transcript_to_html(formatted_transcript)
566
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
 
 
 
567
  mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
568
  mind_map = mind_map_json["mind_map"]
569
  mind_map_html = get_mind_map_html(mind_map)
 
586
  mind_map_html, \
587
  html_content, \
588
  simple_html_content, \
 
 
589
  reading_passage_text, \
590
  reading_passage, \
591
  subject, \
 
1326
 
1327
  def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1328
  print("===generate_key_moments===")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1329
  segments = split_data(formatted_simple_transcript, word_base=100000)
1330
  all_content = []
1331
 
 
1531
  position: absolute;
1532
  width: 1px;
1533
  }
1534
+ .keyword-label {
1535
+ display: inline-block;
1536
+ padding: 5px 10px;
1537
+ margin: 2px;
1538
+ border: 2px solid black;
1539
+ border-radius: 5px;
1540
+ font-size: 0.9em;
1541
+ }
1542
+ details {
1543
+ border-radius: 5px;
1544
+ padding: 10px;
1545
+ border: 2px solid black;
1546
+ }
1547
+
1548
+ summary {
1549
+ font-weight: bold;
1550
+ cursor: pointer;
1551
+ outline: none;
1552
+ }
1553
+
1554
+ summary::-webkit-details-marker {
1555
+ display: none;
1556
+ }
1557
 
1558
  @media (max-width: 768px) {
1559
  #gallery-main {
 
1606
  </div>
1607
  """
1608
 
1609
+ keywords_html = ' '.join([f'<span class="keyword-label">{keyword}</span>' for keyword in moment['keywords']])
1610
+
1611
+
1612
  key_moments_html += f"""
1613
  <div class="gallery-container" id="gallery-main">
1614
  <div id="gallery"><!-- gallery start -->
 
1619
  <div id="text-content">
1620
  <h3>{moment['start']} - {moment['end']}</h3>
1621
  <p><strong>摘要: {moment['text']} </strong></p>
1622
+ <details>
1623
+ <summary>逐字稿</summary>
1624
+ <p><strong>內容: </strong> {moment['transcript']} </p>
1625
+ </details>
1626
+ <p><strong>關鍵字:</strong> {keywords_html}</p>
1627
  </div>
1628
  </div>
1629
  """
 
1646
  content_text = content_json["reading_passage"]
1647
  elif kind == "summary_markdown":
1648
  content_text = content_json["summary"]
1649
+ elif kind == "key_moments":
1650
+ content_text = content_json["key_moments"]
1651
+ content_text = json.dumps(content_text, ensure_ascii=False, indent=2)
1652
  else:
1653
  content_text = json.dumps(content_json, ensure_ascii=False, indent=2)
1654
  else:
 
1703
  else:
1704
  key_moments_list = new_content
1705
  key_moments_json = {"key_moments": key_moments_list}
1706
+ key_moments_json_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1707
+ GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_json_text)
1708
+ key_moments_text = json.dumps(key_moments_list, ensure_ascii=False, indent=2)
1709
  updated_content = key_moments_text
1710
  elif kind == "transcript":
1711
  if isinstance(new_content, str):
 
2591
  all_chatbot_select_btn_visible = gr.update(visible=False)
2592
  return chatbot_select_accordion_visible, all_chatbot_select_btn_visible
2593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2594
 
2595
  # --- Init params ---
2596
  def init_params(text, request: gr.Request):
 
2624
  # check if origin is from junyiacademy
2625
  origin = request.headers.get("origin", "")
2626
  if "junyiacademy" in origin:
2627
+ password_text = PASSWORD
2628
  admin = gr.update(visible=False)
2629
  reading_passage_admin = gr.update(visible=False)
2630
  summary_admin = gr.update(visible=False)
 
3034
  simple_html_content = gr.HTML(label="Simple Transcript")
3035
  with gr.Tab("圖文"):
3036
  transcript_html = gr.HTML(label="YouTube Transcript and Video")
 
 
 
 
 
 
 
 
3037
  with gr.Tab("markdown"):
3038
  gr.Markdown("## 請複製以下 markdown 並貼到你的心智圖工具中,建議使用:https://markmap.js.org/repl")
3039
  mind_map = gr.Textbox(container=True, show_copy_button=True, lines=40, elem_id="mind_map_markdown")
 
3184
  mind_map_html,
3185
  transcript_html,
3186
  simple_html_content,
 
 
3187
  reading_passage_text,
3188
  reading_passage,
3189
  content_subject,