youngtsai committed on
Commit
f99c291
1 Parent(s): b300db2

transcript = process_transcript_and_screenshots(video_id)

Browse files
Files changed (1) hide show
  1. app.py +54 -15
app.py CHANGED
@@ -136,6 +136,27 @@ def set_public_permission(service, file_id):
136
  fields='id',
137
  ).execute()
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def process_file(file):
140
  # 读取文件
141
  if file.name.endswith('.csv'):
@@ -193,33 +214,48 @@ def extract_youtube_id(url):
193
  else:
194
  return None
195
 
196
- def process_youtube_link(link):
197
- # 使用 YouTube API 获取逐字稿
198
- # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
199
- video_id = extract_youtube_id(link)
200
  service = init_drive_service()
201
- parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL' # youtube逐字稿圖檔的ID
202
-
203
- # 检查/创建视频ID命名的子文件夹
204
  folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
205
- file_name = f"{video_id}_transcript.txt"
206
 
207
  # 检查逐字稿是否存在
208
- transcript = None
209
  exists, file_id = check_file_exists(service, folder_id, file_name)
210
  if not exists:
 
211
  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
212
  transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
213
- upload_content_directly(service, file_name, folder_id, transcript_text)
214
  print("逐字稿已上传到Google Drive")
215
  else:
 
216
  print("逐字稿已存在于Google Drive中")
217
  transcript_text = download_file_as_string(service, file_id)
218
  transcript = json.loads(transcript_text)
219
 
220
- # 基于逐字稿生成其他所需的输出
221
- questions = generate_questions(transcript)
222
- df_summarise = generate_df_summarise(transcript)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  formatted_transcript = []
225
  screenshot_paths = []
@@ -228,8 +264,7 @@ def process_youtube_link(link):
228
  start_time = format_seconds_to_time(entry['start'])
229
  end_time = format_seconds_to_time(entry['start'] + entry['duration'])
230
  embed_url = get_embedded_youtube_link(video_id, entry['start'])
231
- # 截圖
232
- screenshot_path = screenshot_youtube_video(video_id, entry['start'])
233
  line = {
234
  "start_time": start_time,
235
  "end_time": end_time,
@@ -245,6 +280,10 @@ def process_youtube_link(link):
245
  print(html_content)
246
  print("=====html_content=====")
247
 
 
 
 
 
248
  # 确保返回与 UI 组件预期匹配的输出
249
  return questions[0] if len(questions) > 0 else "", \
250
  questions[1] if len(questions) > 1 else "", \
 
136
  fields='id',
137
  ).execute()
138
 
139
def update_file_on_drive(service, file_id, file_content):
    """Overwrite the content of an existing Google Drive file.

    Args:
        service: Google Drive API service instance; only
            ``service.files().update(...)`` is used here.
        file_id: ID of the Drive file whose content is replaced.
        file_content: New content as a string. It is UTF-8 encoded and
            uploaded with the ``application/json`` MIME type.

    Returns:
        None. The ID reported by the API response is printed.
    """
    # Wrap the encoded text in an in-memory stream for the media upload.
    payload = io.BytesIO(file_content.encode('utf-8'))
    upload = MediaIoBaseUpload(
        payload,
        mimetype='application/json',
        resumable=True,
    )

    # Build the update request, then execute it.
    request = service.files().update(fileId=file_id, media_body=upload)
    response = request.execute()

    print(f"文件已更新,文件ID: {response['id']}")
159
+
160
  def process_file(file):
161
  # 读取文件
162
  if file.name.endswith('.csv'):
 
214
  else:
215
  return None
216
 
217
def process_transcript_and_screenshots(video_id):
    """Load a video's transcript and make sure every entry has a screenshot link.

    The transcript is stored on Google Drive as ``<video_id>_transcript.json``
    inside a per-video folder. If the file is missing, the transcript is
    fetched from YouTube (``zh-TW``) and uploaded; otherwise it is downloaded
    and parsed. Each entry lacking an ``img_src`` key then gets a screenshot
    taken at ``entry['start']``, uploaded to the same folder, and linked.

    Args:
        video_id: YouTube video ID; also used as the Drive sub-folder name
            and as the prefix of the transcript and screenshot file names.

    Returns:
        The transcript as a list of entries, each carrying an ``img_src`` URL.

    Notes:
        * The stored transcript is rewritten only when at least one entry
          gained an ``img_src`` during this call.
        * The local screenshot file is removed even if its upload fails.
    """
    service = init_drive_service()
    # Hard-coded parent folder on Drive.
    # NOTE(review): presumably the folder for YouTube transcript images - confirm.
    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
    folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
    file_name = f'{video_id}_transcript.json'

    # Check whether the transcript already exists on Drive.
    exists, file_id = check_file_exists(service, folder_id, file_name)
    if not exists:
        # Fetch the transcript from YouTube and upload it.
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
        transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        # NOTE(review): relies on upload_content_directly returning the new
        # file's ID, which is needed for the update below - confirm.
        file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
        print("逐字稿已上传到Google Drive")
    else:
        # Transcript already stored: download and parse it.
        print("逐字稿已存在于Google Drive中")
        transcript_text = download_file_as_string(service, file_id)
        transcript = json.loads(transcript_text)

    # Attach a screenshot link to every entry that does not have one yet.
    transcript_changed = False
    for entry in transcript:
        if 'img_src' in entry:
            continue
        screenshot_path = screenshot_youtube_video(video_id, entry['start'])
        try:
            img_file_id = upload_img_directly(
                service,
                f"{video_id}_{entry['start']}.jpg",
                folder_id,
                screenshot_path,
            )
        finally:
            # Always delete the local screenshot, even when the upload raised,
            # so failed runs do not leave temporary files behind.
            os.remove(screenshot_path)
        entry['img_src'] = f"https://drive.google.com/uc?export=view&id={img_file_id}"
        transcript_changed = True

    # Rewrite the stored transcript only when new links were added.
    if transcript_changed:
        updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
        update_file_on_drive(service, file_id, updated_transcript_text)
        print("逐字稿已更新,包括截图链接")

    return transcript
253
+
254
+ def process_youtube_link(link):
255
+ # 使用 YouTube API 获取逐字稿
256
+ # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
257
+ video_id = extract_youtube_id(link)
258
+ transcript = process_transcript_and_screenshots(video_id)
259
 
260
  formatted_transcript = []
261
  screenshot_paths = []
 
264
  start_time = format_seconds_to_time(entry['start'])
265
  end_time = format_seconds_to_time(entry['start'] + entry['duration'])
266
  embed_url = get_embedded_youtube_link(video_id, entry['start'])
267
+ screenshot_path = entry['img_src']
 
268
  line = {
269
  "start_time": start_time,
270
  "end_time": end_time,
 
280
  print(html_content)
281
  print("=====html_content=====")
282
 
283
+ # 基于逐字稿生成其他所需的输出
284
+ questions = generate_questions(transcript)
285
+ df_summarise = generate_df_summarise(transcript)
286
+
287
  # 确保返回与 UI 组件预期匹配的输出
288
  return questions[0] if len(questions) > 0 else "", \
289
  questions[1] if len(questions) > 1 else "", \