youngtsai committed on
Commit
a931b41
1 Parent(s): cf25313

check_file_exists

Browse files
Files changed (1) hide show
  1. app.py +63 -37
app.py CHANGED
@@ -19,23 +19,23 @@ from urllib.parse import urlparse, parse_qs
19
 
20
 
21
  # 假设您的环境变量或Secret的名称是GOOGLE_APPLICATION_CREDENTIALS_JSON
22
- credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
23
- credentials_dict = json.loads(credentials_json_string)
24
- SCOPES = ['https://www.googleapis.com/auth/drive']
25
- credentials = service_account.Credentials.from_service_account_info(
26
- credentials_dict, scopes=SCOPES)
27
- service = build('drive', 'v3', credentials=credentials)
28
- # 列出 Google Drive 上的前10個文件
29
- results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
30
- items = results.get('files', [])
31
-
32
- if not items:
33
- print('No files found.')
34
- else:
35
- print("=====Google Drive 上的前10個文件=====")
36
- print('Files:')
37
- for item in items:
38
- print(u'{0} ({1})'.format(item['name'], item['id']))
39
 
40
 
41
 
@@ -45,6 +45,35 @@ OUTPUT_PATH = 'videos'
45
  OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
46
  client = OpenAI(api_key=OPEN_AI_KEY)
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def process_file(file):
49
  # 读取文件
50
  if file.name.endswith('.csv'):
@@ -106,9 +135,22 @@ def process_youtube_link(link):
106
  # 使用 YouTube API 获取逐字稿
107
  # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
108
  video_id = extract_youtube_id(link)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- # 先下載 video
111
- download_youtube_video(video_id, output_path=OUTPUT_PATH)
112
  # 再取得 transcript
113
  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
114
  # 基于逐字稿生成其他所需的输出
@@ -175,6 +217,8 @@ def download_youtube_video(youtube_id, output_path=OUTPUT_PATH):
175
 
176
 
177
  def screenshot_youtube_video(youtube_id, snapshot_sec):
 
 
178
  # 这里假设视频已经在适当的位置
179
  video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
180
 
@@ -185,24 +229,6 @@ def screenshot_youtube_video(youtube_id, snapshot_sec):
185
 
186
  return screenshot_path
187
 
188
- def process_video(youtube_id):
189
- download_youtube_video(youtube_id)
190
- video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
191
- video = VideoFileClip(video_path)
192
- duration = int(video.duration)
193
- output_path = f'{OUTPUT_PATH}/screenshots/{youtube_id}'
194
- os.makedirs(output_path, exist_ok=True)
195
-
196
- # fake duration
197
- duration = 10
198
-
199
- screenshot_paths = []
200
- for i in range(1, duration):
201
- screenshot_path = screenshot_youtube_video(youtube_id, i)
202
- screenshot_paths.append(screenshot_path)
203
-
204
- return screenshot_paths
205
-
206
  def get_screenshot_from_video(video_link, start_time):
207
  # 实现从视频中提取帧的逻辑
208
  # 由于这需要服务器端处理,你可能需要一种方法来下载视频,
 
19
 
20
 
21
  # 假设您的环境变量或Secret的名称是GOOGLE_APPLICATION_CREDENTIALS_JSON
22
+ # credentials_json_string = os.getenv("GOOGLE_APPLICATION_CREDENTIALS_JSON")
23
+ # credentials_dict = json.loads(credentials_json_string)
24
+ # SCOPES = ['https://www.googleapis.com/auth/drive']
25
+ # credentials = service_account.Credentials.from_service_account_info(
26
+ # credentials_dict, scopes=SCOPES)
27
+ # service = build('drive', 'v3', credentials=credentials)
28
+ # # 列出 Google Drive 上的前10個文件
29
+ # results = service.files().list(pageSize=10, fields="nextPageToken, files(id, name)").execute()
30
+ # items = results.get('files', [])
31
+
32
+ # if not items:
33
+ # print('No files found.')
34
+ # else:
35
+ # print("=====Google Drive 上的前10個文件=====")
36
+ # print('Files:')
37
+ # for item in items:
38
+ # print(u'{0} ({1})'.format(item['name'], item['id']))
39
 
40
 
41
 
 
45
  OPEN_AI_KEY = os.getenv("OPEN_AI_KEY")
46
  client = OpenAI(api_key=OPEN_AI_KEY)
47
 
48
+ # 初始化Google Drive服务
49
def init_drive_service():
    """Build a Google Drive v3 client from service-account credentials.

    The service-account key is read as a JSON string from the
    ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` environment variable and the
    credentials are requested with the
    ``https://www.googleapis.com/auth/drive`` scope.

    Returns:
        The object returned by ``build('drive', 'v3', credentials=...)``.

    Raises:
        RuntimeError: If ``GOOGLE_APPLICATION_CREDENTIALS_JSON`` is unset
            or empty.
        json.JSONDecodeError: If the variable does not contain valid JSON.
    """
    env_var = "GOOGLE_APPLICATION_CREDENTIALS_JSON"
    credentials_json_string = os.getenv(env_var)
    if not credentials_json_string:
        # os.getenv returns None for a missing variable; passing that to
        # json.loads fails with an opaque TypeError, so fail early with a
        # message that names the missing setting instead.
        raise RuntimeError(
            f"Environment variable {env_var} is not set; "
            "cannot initialise the Google Drive service."
        )

    credentials_dict = json.loads(credentials_json_string)
    scopes = ['https://www.googleapis.com/auth/drive']
    credentials = service_account.Credentials.from_service_account_info(
        credentials_dict, scopes=scopes)
    return build('drive', 'v3', credentials=credentials)
57
+
58
+ # 检查Google Drive上是否存在文件
59
def check_file_exists(service, folder_name, file_name):
    """Check whether a non-trashed file with a given name exists in a folder.

    Args:
        service: Drive client object; ``service.files().list(q=...)`` is
            called on it and the result's ``execute()`` is expected to
            return a mapping.
        folder_name: Value placed in the ``'<value>' in parents`` clause of
            the search query.
            NOTE(review): the Drive ``parents`` field presumably expects a
            folder ID rather than a display name or path -- confirm what
            callers pass here.
        file_name: Exact file name matched by the ``name = '<value>'``
            clause.

    Returns:
        A ``(exists, file_id)`` tuple. ``exists`` is True when at least one
        file matched; ``file_id`` is the ``'id'`` of the first match, or
        None when nothing matched.
    """
    # Backslashes and single quotes must be escaped inside a quoted query
    # value; otherwise a name such as "it's.txt" yields a malformed (or
    # unintentionally altered) query string.
    safe_file = str(file_name).replace("\\", "\\\\").replace("'", "\\'")
    safe_folder = str(folder_name).replace("\\", "\\\\").replace("'", "\\'")

    query = (
        f"name = '{safe_file}' and '{safe_folder}' in parents "
        "and trashed = false"
    )
    response = service.files().list(q=query).execute()
    files = response.get('files', [])
    if not files:
        return False, None
    return True, files[0]['id']
64
+
65
+ # 上传文件到Google Drive
66
def upload_to_drive(service, file_name, folder_id, content):
    """Create a plain-text file on Google Drive and return its ID.

    Args:
        service: Drive client object; ``service.files().create(...)`` is
            called on it.
        file_name: Name given to the created file (``'name'`` metadata).
        folder_id: Single entry placed in the file's ``'parents'`` list.
        content: Text to upload; it is encoded with ``str.encode()`` and
            sent with MIME type ``text/plain``.

    Returns:
        The ``'id'`` field of the create response (``None`` if absent).
    """
    metadata = {'name': file_name, 'parents': [folder_id]}

    # Wrap the encoded text in an in-memory stream for the upload helper.
    payload = io.BytesIO(content.encode())
    upload = MediaIoBaseUpload(payload, mimetype='text/plain')

    request = service.files().create(
        body=metadata,
        media_body=upload,
        fields='id',
    )
    created = request.execute()
    return created.get('id')
74
+
75
+
76
+
77
  def process_file(file):
78
  # 读取文件
79
  if file.name.endswith('.csv'):
 
135
  # 使用 YouTube API 获取逐字稿
136
  # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
137
  video_id = extract_youtube_id(link)
138
+ service = init_drive_service()
139
+ folder_name = 'youtube逐字稿圖檔/{video_id}' # Google Drive上的文件夹ID
140
+ file_name = f"{video_id}_transcript.txt"
141
+
142
+ # 检查逐字稿是否存在
143
+ exists, file_id = check_file_exists(service, folder_name, file_name)
144
+ if not exists:
145
+ # 获取逐字稿
146
+ transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
147
+ transcript_text = "\n".join([f"{item['start']}: {item['text']}" for item in transcript])
148
+ # 上传到Google Drive
149
+ upload_to_drive(service, file_name, folder_name, transcript_text)
150
+ print("逐字稿已上传到Google Drive")
151
+ else:
152
+ print("逐字稿已存在于Google Drive中")
153
 
 
 
154
  # 再取得 transcript
155
  transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
156
  # 基于逐字稿生成其他所需的输出
 
217
 
218
 
219
  def screenshot_youtube_video(youtube_id, snapshot_sec):
220
+ # 先下載 video
221
+ download_youtube_video(youtube_id, output_path=OUTPUT_PATH)
222
  # 这里假设视频已经在适当的位置
223
  video_path = f'{OUTPUT_PATH}/{youtube_id}.mp4'
224
 
 
229
 
230
  return screenshot_path
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  def get_screenshot_from_video(video_link, start_time):
233
  # 实现从视频中提取帧的逻辑
234
  # 由于这需要服务器端处理,你可能需要一种方法来下载视频,