import gradio as gr | |
import pandas as pd | |
import requests | |
from bs4 import BeautifulSoup | |
from docx import Document | |
import os | |
from openai import OpenAI | |
import json | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from moviepy.editor import VideoFileClip | |
from pytube import YouTube | |
import os | |
# Directory where downloaded videos and their screenshots are written.
OUTPUT_PATH = 'videos'

# API key taken from the environment; None when OPEN_AI_KEY is not set.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

# Module-level OpenAI client built from that key.
client = OpenAI(api_key=OPEN_AI_KEY)
def process_file(file):
    """Turn an uploaded CSV, Excel or Word file into text and analyse it.

    Args:
        file: The uploaded file as delivered by the Gradio ``File`` component.
            Its ``name`` attribute (or the value itself, when it is a plain
            path string) is used to detect the file type.

    Returns:
        A 5-tuple ``(question_1, question_2, question_3, summary, text)``.
        Questions that were not generated are returned as empty strings.
        When ``file`` is ``None`` (upload cleared) all five values are empty
        strings.

    Raises:
        ValueError: If the file extension is not .csv, .xlsx or .docx.
    """
    if file is None:
        return "", "", "", "", ""

    # Dispatch on the (case-insensitive) file extension.
    file_name = str(getattr(file, "name", file)).lower()
    if file_name.endswith('.csv'):
        text = df_to_text(pd.read_csv(file))
    elif file_name.endswith('.xlsx'):
        text = df_to_text(pd.read_excel(file))
    elif file_name.endswith('.docx'):
        # Word documents never produce a DataFrame, so everything below must
        # work on the extracted text rather than on a ``df`` variable.
        text = docx_to_text(file)
    else:
        raise ValueError("Unsupported file type")

    # Yilan dataset: replace the "@XX@" marker with "|".
    df_string = text.replace("@XX@", "|")

    # Generate suggested questions and a summary from the file content.
    questions = generate_questions(df_string)
    df_summarise = generate_df_summarise(df_string)

    # Always hand exactly three button labels back to the UI.
    labels = [questions[i] if len(questions) > i else "" for i in range(3)]
    return labels[0], labels[1], labels[2], df_summarise, df_string
def df_to_text(df):
    """Render a DataFrame as plain text via its ``to_string`` method."""
    rendered = df.to_string()
    return rendered
def docx_to_text(file):
    """Extract the text of a Word document, one paragraph per line."""
    paragraphs = Document(file).paragraphs
    lines = (paragraph.text for paragraph in paragraphs)
    return "\n".join(lines)
def format_seconds_to_time(seconds):
    """Format a number of seconds as ``HH:MM:SS``.

    Fractional seconds are truncated; hours are not wrapped at 24.
    """
    parts = (
        int(seconds // 3600),       # whole hours
        int(seconds % 3600 // 60),  # whole minutes within the hour
        int(seconds % 60),          # whole seconds within the minute
    )
    return "{:02}:{:02}:{:02}".format(*parts)
def _extract_youtube_video_id(link):
    """Return the video id contained in a YouTube URL (or a bare id)."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(link.strip())

    # Standard watch URLs carry the id in the ``v`` query parameter; reading
    # it by name keeps trailing parameters such as ``&t=30s`` from leaking in.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]
    if host.endswith("youtu.be") and path_parts:
        return path_parts[0]
    if len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
        return path_parts[1]

    # Fall back to the previous heuristic (also covers a bare video id).
    return link.strip().split("=")[-1]


def process_youtube_link(link):
    """Download a YouTube video, fetch its transcript and analyse it.

    Args:
        link: YouTube URL typed into the UI. Blank values are ignored.

    Returns:
        A 5-tuple ``(question_1, question_2, question_3, summary, html)``
        where ``html`` is the transcript rendered with one screenshot per
        transcript entry. Questions that were not generated are returned as
        empty strings. A blank ``link`` yields five empty strings.
    """
    # Nothing to do for an empty/cleared textbox.
    if not link or not link.strip():
        return "", "", "", "", ""

    video_id = _extract_youtube_video_id(link)

    # Download the video first; the screenshots below are cut from that file.
    download_youtube_video(video_id, output_path=OUTPUT_PATH)

    # Then fetch the zh-TW transcript.
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])

    # Generate the suggested questions and the summary from the transcript.
    questions = generate_questions(transcript)
    df_summarise = generate_df_summarise(transcript)

    formatted_transcript = []
    for entry in transcript:
        start_sec = entry['start']
        formatted_transcript.append({
            "start_time": format_seconds_to_time(start_sec),
            "end_time": format_seconds_to_time(start_sec + entry['duration']),
            "text": entry['text'],
            "embed_url": get_embedded_youtube_link(video_id, start_sec),
            "time_sec": start_sec,
            # Screenshot taken at the moment the entry starts.
            "screenshot_path": screenshot_youtube_video(video_id, start_sec),
        })

    html_content = format_transcript_to_html(formatted_transcript)
    print("=====html_content=====")
    print(html_content)
    print("=====html_content=====")

    # Return exactly the values the UI components expect.
    labels = [questions[i] if len(questions) > i else "" for i in range(3)]
    return labels[0], labels[1], labels[2], df_summarise, html_content
def format_transcript_to_html(formatted_transcript):
    """Render transcript entries as an HTML fragment.

    Args:
        formatted_transcript: Iterable of dicts with the keys ``start_time``,
            ``end_time``, ``text`` and ``screenshot_path``.

    Returns:
        One string containing, per entry, an ``<h3>`` time range, a ``<p>``
        with the text and an ``<img>`` for the screenshot. Empty input gives
        an empty string.
    """
    from html import escape

    # Entry values are escaped before being placed into markup so characters
    # such as "<", "&" or quotes in the transcript cannot break the HTML.
    fragments = (
        f"<h3>{escape(str(entry['start_time']))} - {escape(str(entry['end_time']))}</h3>"
        f"<p>{escape(str(entry['text']))}</p>"
        f"<img src='{escape(str(entry['screenshot_path']), quote=True)}' width='500px' />"
        for entry in formatted_transcript
    )
    return "".join(fragments)
def get_embedded_youtube_link(video_id, start_time):
    """Build an autoplaying YouTube embed URL that starts at ``start_time``.

    Args:
        video_id: YouTube video id.
        start_time: Start offset in seconds; fractional values are truncated
            because the embed player's ``start`` parameter takes whole seconds.

    Returns:
        The embed URL as a string.
    """
    start_seconds = int(start_time)
    return f"https://www.youtube.com/embed/{video_id}?start={start_seconds}&autoplay=1"
def download_youtube_video(youtube_id, output_path=OUTPUT_PATH):
    """Download a YouTube video to ``<output_path>/<youtube_id>.mp4``.

    The highest-resolution progressive MP4 stream is chosen.

    Args:
        youtube_id: YouTube video id.
        output_path: Target directory; created when it does not exist.

    Raises:
        ValueError: If the video offers no progressive MP4 stream.
    """
    # Construct the full YouTube watch URL from the id.
    youtube_url = f'https://www.youtube.com/watch?v={youtube_id}'

    # Create the output directory if it doesn't exist.
    os.makedirs(output_path, exist_ok=True)

    # Pick the best progressive MP4 stream and download it.
    yt = YouTube(youtube_url)
    video_stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError(f"No progressive mp4 stream available for video {youtube_id}")

    video_stream.download(output_path=output_path, filename=youtube_id + ".mp4")
    print(f"Video downloaded successfully: {output_path}/{youtube_id}.mp4")
def screenshot_youtube_video(youtube_id, snapshot_sec):
    """Save one frame of an already-downloaded video as a JPEG.

    Args:
        youtube_id: Id of the video; the file is expected at
            ``<OUTPUT_PATH>/<youtube_id>.mp4``.
        snapshot_sec: Position in the video, in seconds, of the frame to save.

    Returns:
        Path of the written screenshot,
        ``<OUTPUT_PATH>/screenshots/<youtube_id>_<snapshot_sec>.jpg``.
    """
    # NOTE: downloading YouTube videos directly may not be possible inside
    # Hugging Face Spaces; this part may need adjusting, e.g. by letting the
    # user upload the video. The video is assumed to be in place already.
    video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
    screenshot_path = f'{OUTPUT_PATH}/screenshots/{youtube_id}_{snapshot_sec}.jpg'

    # Make sure the screenshots directory exists before writing the frame.
    os.makedirs(os.path.dirname(screenshot_path), exist_ok=True)

    # Load the video and take a screenshot.
    with VideoFileClip(video_path) as video:
        video.save_frame(screenshot_path, snapshot_sec)
    return screenshot_path
def process_video(youtube_id):
    """Download a video and take one screenshot per second of its start.

    Screenshots are taken at seconds 1, 2, ... up to (but excluding) the
    smaller of the video's duration and a 10-second preview cap.

    Args:
        youtube_id: YouTube video id.

    Returns:
        List of screenshot paths, in chronological order.
    """
    download_youtube_video(youtube_id)
    video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'

    # Open the clip only long enough to read its length, then release it.
    with VideoFileClip(video_path) as video:
        duration = int(video.duration)

    # Creating the per-video directory also creates the parent "screenshots"
    # directory that the individual frames are written into.
    output_path = f'{OUTPUT_PATH}/screenshots/{youtube_id}'
    os.makedirs(output_path, exist_ok=True)

    # Preview cap ("fake duration"): never go beyond the first 10 seconds,
    # and never past the end of a shorter video.
    duration = min(duration, 10)

    return [screenshot_youtube_video(youtube_id, second) for second in range(1, duration)]
def get_screenshot_from_video(video_link, start_time):
    """Placeholder for extracting a frame from a video at ``start_time``.

    A real implementation would need server-side work: downloading the video
    and extracting the frame with ffmpeg or a similar tool. For now only a
    descriptive placeholder string is returned.
    """
    return "[逻辑以提取视频 {} 在 {} 秒时的截图]".format(video_link, start_time)
def process_web_link(link):
    """Fetch a web page and return its text content.

    Args:
        link: URL of the page. Blank values are ignored.

    Returns:
        The text extracted from the page's HTML, or an empty string when
        ``link`` is blank.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an error status.
    """
    # Nothing to fetch for an empty/cleared textbox.
    if not link or not link.strip():
        return ""

    # A timeout keeps a stalled server from blocking the UI forever.
    response = requests.get(link.strip(), timeout=30)
    # Do not treat an HTTP error page as the page's content.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
def generate_df_summarise(df_string):
    """Ask the OpenAI chat model for a summary of the uploaded data.

    Args:
        df_string: The data to describe; it is interpolated into the prompt.

    Returns:
        The model's answer with surrounding whitespace stripped (an empty
        string when the model returns no content).
    """
    sys_content = "你是一個資料分析師,服務對象為老師,請精讀資料,使用 zh-TW"
    user_content = f"請根據 {df_string},大概描述這張表的欄位敘述、資料樣態與資料分析,告訴老師這張表的意義,以及可能的結論與對應方式"
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content}
    ]
    print("=====messages=====")
    print(messages)
    print("=====messages=====")

    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000,
    }
    response = client.chat.completions.create(**request_payload)

    # ``content`` may be None; fall back to an empty string before stripping.
    df_summarise = (response.choices[0].message.content or "").strip()
    print("=====df_summarise=====")
    print(df_summarise)
    print("=====df_summarise=====")
    return df_summarise
def generate_questions(df_string):
    """Ask the OpenAI chat model for three questions a user might ask.

    The model is requested to answer as a JSON object of the form
    ``{"questions": [q1, q2, q3]}``.

    Args:
        df_string: The data to base the questions on; it is interpolated
            into the prompt.

    Returns:
        The value of the ``questions`` key of the model's JSON answer, or an
        empty list when that key is missing.

    Raises:
        json.JSONDecodeError: If the model's answer is not valid JSON.
    """
    sys_content = "你是一個資料分析師,user為老師,請精讀資料,並用既有資料為本質猜測用戶可能會問的問題,使用 zh-TW"
    user_content = f"請根據 {df_string} 生成三個問題,並用 JSON 格式返回 questions:[q1, q2, q3]"
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_content}
    ]
    response_format = {"type": "json_object"}
    print("=====messages=====")
    print(messages)
    print("=====messages=====")

    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000,
        "response_format": response_format
    }
    response = client.chat.completions.create(**request_payload)

    # Callers already cope with fewer than three questions, so a missing
    # "questions" key degrades to an empty list instead of a KeyError.
    questions = json.loads(response.choices[0].message.content).get("questions", [])
    print("=====json_response=====")
    print(questions)
    print("=====json_response=====")
    return questions
def send_question(question, df_string_output, chat_history):
    """Handle a click on a suggested-question button.

    The button's text is forwarded to ``respond`` as if the user had typed
    it, and ``respond``'s result is returned unchanged.
    """
    reply = respond(question, df_string_output, chat_history)
    return reply
def respond(user_message, df_string_output, chat_history):
    """Answer a user message about the loaded data and update the chat.

    Args:
        user_message: The question to send to the model.
        df_string_output: The data text; it is embedded in the system prompt.
        chat_history: List of ``(user_message, answer)`` pairs shown in the
            chatbot, or ``None``. An existing list is extended in place.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the input
        box and the list is the updated history.
    """
    print("=== 變數:user_message ===")
    print(user_message)
    print("=== 變數:chat_history ===")
    print(chat_history)

    sys_content = f"你是一個資料分析師,請用 {df_string_output} 為資料進行對話,使用 zh-TW"
    messages = [
        {"role": "system", "content": sys_content},
        {"role": "user", "content": user_message}
    ]
    print("=====messages=====")
    print(messages)
    print("=====messages=====")

    request_payload = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
        "max_tokens": 4000  # deliberately generous; adjust as needed
    }
    response = client.chat.completions.create(**request_payload)
    print(response)

    # ``content`` may be None; fall back to an empty string before stripping.
    response_text = (response.choices[0].message.content or "").strip()

    # Update the chat history.
    new_chat_history = (user_message, response_text)
    if chat_history is None:
        chat_history = [new_chat_history]
    else:
        chat_history.append(new_chat_history)

    # Return an empty string (clears the input box) and the chat history.
    return "", chat_history
# Gradio UI: data sources and chat on the left, result tabs on the right.
with gr.Blocks() as demo:
    with gr.Row():
        # Left column: inputs and the chat interface.
        with gr.Column():
            file_upload = gr.File(label="Upload your CSV or Word file")
            youtube_link = gr.Textbox(label="Enter YouTube Link")
            web_link = gr.Textbox(label="Enter Web Page Link")
            chatbot = gr.Chatbot()
            msg = gr.Textbox(label="Message")
            send_button = gr.Button("Send")

        # Right column: transcript, raw text, summary and suggested questions.
        with gr.Column():
            with gr.Tab("YouTube Transcript and Video"):
                transcript_html = gr.HTML(label="YouTube Transcript and Video")
            with gr.Tab("資料本文"):
                df_string_output = gr.Textbox()
            with gr.Tab("資料摘要"):
                gr.Markdown("## 這是什麼樣的資料?")
                df_summarise = gr.Textbox(container=True, show_copy_button=True, label="資料本文", lines=40)
            with gr.Tab("常用問題"):
                gr.Markdown("## 常用問題")
                btn_1 = gr.Button()
                btn_2 = gr.Button()
                btn_3 = gr.Button()

    # Send the typed message to the model.
    send_button.click(
        respond,
        inputs=[msg, df_string_output, chatbot],
        outputs=[msg, chatbot]
    )

    # Suggested-question buttons: the button text is sent as the question.
    btn_1.click(send_question, inputs=[btn_1, df_string_output, chatbot], outputs=[msg, chatbot])
    btn_2.click(send_question, inputs=[btn_2, df_string_output, chatbot], outputs=[msg, chatbot])
    btn_3.click(send_question, inputs=[btn_3, df_string_output, chatbot], outputs=[msg, chatbot])

    # A new upload refreshes the questions, the summary and the raw text.
    file_upload.change(process_file, inputs=file_upload, outputs=[btn_1, btn_2, btn_3, df_summarise, df_string_output])

    # Triggered when a YouTube link is entered.
    youtube_link.change(process_youtube_link, inputs=youtube_link, outputs=[btn_1, btn_2, btn_3, df_summarise, transcript_html])

    # Triggered when a web page link is entered. process_web_link returns a
    # single string (the page text), so it is wired to the raw-text box only;
    # mapping it onto five outputs makes Gradio reject the return value.
    web_link.change(process_web_link, inputs=web_link, outputs=df_string_output)

demo.launch()