Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -49,6 +49,7 @@ print(gr.__version__)
|
|
49 |
if is_env_local:
|
50 |
with open("local_config.json") as f:
|
51 |
config = json.load(f)
|
|
|
52 |
PASSWORD = config["PASSWORD"]
|
53 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
54 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
@@ -64,7 +65,9 @@ if is_env_local:
|
|
64 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
65 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
66 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
|
|
67 |
else:
|
|
|
68 |
PASSWORD = os.getenv("PASSWORD")
|
69 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
70 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
@@ -426,54 +429,14 @@ def get_video_duration(video_id):
|
|
426 |
|
427 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
428 |
print("====process_transcript_and_screenshots_on_gcs====")
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
# 检查逐字稿是否存在
|
435 |
-
is_new_transcript = False
|
436 |
-
is_transcript_exists = GCS_SERVICE.check_file_exists(bucket_name, transcript_blob_name)
|
437 |
-
video_duration = get_video_duration(video_id)
|
438 |
-
if not is_transcript_exists:
|
439 |
-
print("逐字稿文件不存在于GCS中,重新建立")
|
440 |
-
# 从YouTube获取逐字稿并上传
|
441 |
-
try:
|
442 |
-
transcript = get_transcript_by_yt_api(video_id)
|
443 |
-
except:
|
444 |
-
# call open ai whisper
|
445 |
-
print("===call open ai whisper===")
|
446 |
-
transcript = generate_transcription_by_whisper(video_id)
|
447 |
-
|
448 |
-
if transcript:
|
449 |
-
print("成功獲取字幕")
|
450 |
-
else:
|
451 |
-
print("沒有找到字幕")
|
452 |
-
transcript = generate_transcription_by_whisper(video_id)
|
453 |
-
if video_duration:
|
454 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
455 |
-
|
456 |
-
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
457 |
-
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
458 |
-
|
459 |
-
is_new_transcript = True
|
460 |
-
else:
|
461 |
-
# 逐字稿已存在,下载逐字稿内容
|
462 |
-
print("逐字稿已存在于GCS中")
|
463 |
-
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
464 |
-
transcript = json.loads(transcript_text)
|
465 |
-
if video_duration:
|
466 |
-
transcript = [entry for entry in transcript if entry['start'] <= video_duration]
|
467 |
-
|
468 |
-
# print("===確認其他衍生文件===")
|
469 |
-
# source = "gcs"
|
470 |
-
# get_questions(video_id, transcript_text, source)
|
471 |
-
# get_video_id_summary(video_id, transcript_text, source)
|
472 |
-
# get_mind_map(video_id, transcript_text, source)
|
473 |
-
# print("===確認其他衍生文件 end ===")
|
474 |
-
|
475 |
|
476 |
# 處理截圖
|
|
|
477 |
for entry in transcript:
|
478 |
if 'img_file_id' not in entry:
|
479 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
@@ -488,38 +451,67 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
488 |
if i == 4:
|
489 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
490 |
time.sleep(5)
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
|
|
|
|
498 |
|
499 |
-
# 確認是否更新逐字稿文件
|
500 |
if is_new_transcript:
|
501 |
-
# 更新逐字稿文件
|
502 |
-
print("===更新逐字稿文件===")
|
503 |
-
print(transcript)
|
504 |
print("===更新逐字稿文件===")
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
else:
|
510 |
-
|
|
|
511 |
|
512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
513 |
|
514 |
def process_youtube_link(password, link):
|
515 |
verify_password(password)
|
516 |
-
|
517 |
-
# 使用 YouTube API 获取逐字稿
|
518 |
-
# 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
|
519 |
video_id = extract_youtube_id(link)
|
520 |
-
|
521 |
try:
|
522 |
-
|
|
|
|
|
|
|
523 |
except Exception as e:
|
524 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
525 |
print("===process_youtube_link error===")
|
@@ -2615,6 +2607,8 @@ def init_params(text, request: gr.Request):
|
|
2615 |
chatbot_ai = gr.update(visible=False)
|
2616 |
ai_chatbot_params = gr.update(visible=True)
|
2617 |
|
|
|
|
|
2618 |
# if youtube_link in query_params
|
2619 |
if "youtube_id" in request.query_params:
|
2620 |
youtube_id = request.query_params["youtube_id"]
|
@@ -2633,11 +2627,15 @@ def init_params(text, request: gr.Request):
|
|
2633 |
lesson_plan_accordion = gr.update(visible=False)
|
2634 |
exit_ticket_accordion = gr.update(visible=False)
|
2635 |
ai_chatbot_params = gr.update(visible=False)
|
|
|
|
|
|
|
2636 |
|
2637 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
2638 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
2639 |
password_text, youtube_link, \
|
2640 |
-
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params
|
|
|
2641 |
|
2642 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
2643 |
# inputs=[content_subject, content_grade, df_string_output],
|
@@ -2699,6 +2697,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
2699 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
2700 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
2701 |
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
|
|
2702 |
with gr.Row() as data_state:
|
2703 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2704 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
@@ -3567,6 +3566,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3567 |
chatbot_open_ai_streaming,
|
3568 |
chatbot_ai,
|
3569 |
ai_chatbot_params,
|
|
|
3570 |
]
|
3571 |
demo.load(
|
3572 |
init_params,
|
|
|
49 |
if is_env_local:
|
50 |
with open("local_config.json") as f:
|
51 |
config = json.load(f)
|
52 |
+
IS_ENV_PROD = "False"
|
53 |
PASSWORD = config["PASSWORD"]
|
54 |
GCS_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
55 |
DRIVE_KEY = json.dumps(config["GOOGLE_APPLICATION_CREDENTIALS_JSON"])
|
|
|
65 |
AWS_SECRET_KEY = config["AWS_SECRET_KEY"]
|
66 |
AWS_REGION_NAME = config["AWS_REGION_NAME"]
|
67 |
OUTPUT_PATH = config["OUTPUT_PATH"]
|
68 |
+
|
69 |
else:
|
70 |
+
IS_ENV_PROD = os.getenv("IS_ENV_PROD", "False")
|
71 |
PASSWORD = os.getenv("PASSWORD")
|
72 |
GCS_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
73 |
DRIVE_KEY = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
|
|
|
429 |
|
430 |
def process_transcript_and_screenshots_on_gcs(video_id):
|
431 |
print("====process_transcript_and_screenshots_on_gcs====")
|
432 |
+
transcript, exists = get_transcript_from_gcs(video_id)
|
433 |
+
if not exists:
|
434 |
+
print("Transcript file does not exist, creating new transcript...")
|
435 |
+
transcript = generate_transcription_by_whisper(video_id)
|
436 |
+
upload_transcript_to_gcs(video_id, transcript)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
|
438 |
# 處理截圖
|
439 |
+
is_new_transcript = False
|
440 |
for entry in transcript:
|
441 |
if 'img_file_id' not in entry:
|
442 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
|
|
451 |
if i == 4:
|
452 |
raise gr.Error(f"下载视频失败: {str(e)}")
|
453 |
time.sleep(5)
|
454 |
+
try:
|
455 |
+
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
456 |
+
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
457 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
|
458 |
+
entry['img_file_id'] = img_file_id
|
459 |
+
print(f"截图已上传到GCS: {img_file_id}")
|
460 |
+
is_new_transcript = True
|
461 |
+
except Exception as e:
|
462 |
+
print(f"Error processing screenshot: {str(e)}")
|
463 |
|
|
|
464 |
if is_new_transcript:
|
|
|
|
|
|
|
465 |
print("===更新逐字稿文件===")
|
466 |
+
upload_transcript_to_gcs(video_id, transcript)
|
467 |
+
|
468 |
+
return transcript
|
469 |
+
|
470 |
+
def get_transcript(video_id):
    """Return the stored transcript for ``video_id`` after validating it.

    The transcript is looked up through ``get_transcript_from_gcs``; nothing
    is generated here.

    Raises:
        gr.Error: if no transcript exists for the video, or if any transcript
            entry lacks an ``img_file_id`` key.
    """
    print("====get_transcript====")
    entries, found = get_transcript_from_gcs(video_id)
    if not found:
        raise gr.Error("逐字稿文件不存在於GCS中。")

    # Every entry must already carry its screenshot reference.
    for item in entries:
        if 'img_file_id' not in item:
            raise gr.Error("Some entries in the transcript do not have an associated img_file_id.")

    print("Transcript is verified with all necessary images.")
    return entries
|
481 |
+
|
482 |
+
def get_transcript_from_gcs(video_id):
    """Look up the transcript JSON for ``video_id`` in the GCS bucket.

    Returns:
        A ``(transcript, exists)`` tuple: the JSON-decoded transcript and
        ``True`` when the blob is present, otherwise ``(None, False)``.
    """
    print("Checking for transcript in GCS...")
    bucket = 'video_ai_assistant'
    # Blob layout: <video_id>/<video_id>_transcript.json
    blob_path = f"{video_id}/{video_id}_transcript.json"

    if not GCS_SERVICE.check_file_exists(bucket, blob_path):
        print("No transcript found for video ID:", video_id)
        return None, False

    raw_text = GCS_SERVICE.download_as_string(bucket, blob_path)
    return json.loads(raw_text), True
|
496 |
|
497 |
+
def upload_transcript_to_gcs(video_id, transcript):
    """Serialize ``transcript`` to JSON and upload it to the GCS bucket.

    The blob is written at ``<video_id>/<video_id>_transcript.json``; the JSON
    is pretty-printed and keeps non-ASCII characters unescaped.
    """
    print("Uploading updated transcript to GCS...")
    blob_path = f"{video_id}/{video_id}_transcript.json"
    payload = json.dumps(transcript, ensure_ascii=False, indent=2)
    GCS_SERVICE.upload_json_string('video_ai_assistant', blob_path, payload)
    print("Transcript uploaded successfully.")
|
505 |
|
506 |
def process_youtube_link(password, link):
|
507 |
verify_password(password)
|
|
|
|
|
|
|
508 |
video_id = extract_youtube_id(link)
|
509 |
+
|
510 |
try:
|
511 |
+
if IS_ENV_PROD == "True":
|
512 |
+
transcript = get_transcript(video_id)
|
513 |
+
else:
|
514 |
+
transcript = process_transcript_and_screenshots_on_gcs(video_id)
|
515 |
except Exception as e:
|
516 |
error_msg = f" {video_id} 逐字稿錯誤: {str(e)}"
|
517 |
print("===process_youtube_link error===")
|
|
|
2607 |
chatbot_ai = gr.update(visible=False)
|
2608 |
ai_chatbot_params = gr.update(visible=True)
|
2609 |
|
2610 |
+
is_env_prod = gr.update(value=False)
|
2611 |
+
|
2612 |
# if youtube_link in query_params
|
2613 |
if "youtube_id" in request.query_params:
|
2614 |
youtube_id = request.query_params["youtube_id"]
|
|
|
2627 |
lesson_plan_accordion = gr.update(visible=False)
|
2628 |
exit_ticket_accordion = gr.update(visible=False)
|
2629 |
ai_chatbot_params = gr.update(visible=False)
|
2630 |
+
|
2631 |
+
if IS_ENV_PROD == "True":
|
2632 |
+
is_env_prod = gr.update(value=True)
|
2633 |
|
2634 |
return admin, reading_passage_admin, summary_admin, see_detail, \
|
2635 |
worksheet_accordion, lesson_plan_accordion, exit_ticket_accordion, \
|
2636 |
password_text, youtube_link, \
|
2637 |
+
chatbot_open_ai_streaming, chatbot_ai, ai_chatbot_params, \
|
2638 |
+
is_env_prod
|
2639 |
|
2640 |
def update_state(content_subject, content_grade, trascript, key_moments, questions_answers):
|
2641 |
# inputs=[content_subject, content_grade, df_string_output],
|
|
|
2697 |
# web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
|
2698 |
user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
|
2699 |
youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
|
2700 |
+
is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
|
2701 |
with gr.Row() as data_state:
|
2702 |
content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
|
2703 |
content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
|
|
|
3566 |
chatbot_open_ai_streaming,
|
3567 |
chatbot_ai,
|
3568 |
ai_chatbot_params,
|
3569 |
+
is_env_prod,
|
3570 |
]
|
3571 |
demo.load(
|
3572 |
init_params,
|