eusholli committed · Commit 2127ae4 · 1 Parent(s): 47681bb

Fixed getting the download clip

Files changed (2):
  1. ttv_web_scraper.py +200 -237
  2. video_utils.py +6 -4
ttv_web_scraper.py CHANGED
@@ -5,33 +5,52 @@ import os
 import traceback
 from pyppeteer import launch
 from bs4 import BeautifulSoup, NavigableString
-import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips
+from typing import Dict, List, Set, Optional
+from dataclasses import dataclass, asdict
+import logging
 
+# Set the TOKENIZERS_PARALLELISM environment variable
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-CACHE_DIR = "cache/"
-if not os.path.exists(CACHE_DIR):
-    os.makedirs(CACHE_DIR)
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
+CACHE_DIR = "cache/"
 DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
 SUBJECTS = [
-    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
-    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
+    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
+    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
 ]
 
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+
+@dataclass
+class TranscriptSegment:
+    metadata: Dict[str, Optional[str]]
+    text: str
+
+
+@dataclass
+class VideoInfo:
+    metadata: Dict[str, Optional[str]]
+    transcript: List[TranscriptSegment]
 
-async def get_client_rendered_content(url):
+
+async def get_client_rendered_content(url: str) -> str:
     browser = None
     try:
         browser = await launch()
         page = await browser.newPage()
         await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
         await asyncio.sleep(5)
-        content = await page.content()
-        return content
+        return await page.content()
     except Exception as e:
-        raise Exception(f"Error fetching content: {str(e)}")
+        logger.error(f"Error fetching content for {url}: {str(e)}")
+        raise
     finally:
         if browser:
             await browser.close()
@@ -47,7 +66,7 @@ def extract_text_with_br(element):
     return ''.join(result).strip()
 
 
-def extract_info(html_content):
+def extract_info(html_content: str) -> VideoInfo:
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
@@ -55,289 +74,233 @@ def extract_info(html_content):
         date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
         youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
-        youtube_id = None
-        if youtube_url:
-            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
-            if match:
-                youtube_id = match.group(1)
+        youtube_id = re.search(r'youtube\.com/embed/([^?]+)', youtube_url).group(1) if youtube_url else None
         transcript_elem = soup.find(id='transcript0')
         transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
-        return {
-            'metadata': {
-                'title': title,
-                'date': date,
-                'youtube_id': youtube_id,
-            },
-            'transcript': transcript
-        }
+
+        return VideoInfo(
+            metadata={'title': title, 'date': date, 'youtube_id': youtube_id},
+            transcript=parse_transcript(transcript) if transcript else []
+        )
     except Exception as e:
-        raise Exception(f"Error extracting information: {str(e)}")
+        logger.error(f"Error extracting information: {str(e)}")
+        raise
 
 
-def read_html_from_file(filename):
+def read_file(filename: str) -> Optional[str]:
     try:
         if os.path.exists(filename):
             with open(filename, 'r', encoding='utf-8') as f:
                 return f.read()
         return None
     except Exception as e:
-        raise Exception(f"Error reading file {filename}: {str(e)}")
-
+        logger.error(f"Error reading file {filename}: {str(e)}")
+        raise
+
+
+def extract_subject_info(text: str) -> List[str]:
+    return [subject for subject in SUBJECTS if subject.lower() in text.lower()]
+
+
+def extract_speaker_info(segment: str) -> Optional[Dict[str, Optional[str]]]:
+    pattern = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+    match = re.match(pattern, segment)
+    return {key: value.strip() if value else None for key, value in match.groupdict().items()} if match else None
+
+
+def parse_transcript(content: str) -> List[TranscriptSegment]:
+    parsed_segments = []
+    saved_info = None
+
+    segments = [segment.strip() for segment in re.split(r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)', content) if segment.strip()]
+
+    for i, segment in enumerate(segments):
+        speaker_info = extract_speaker_info(segment)
+        if speaker_info:
+            if speaker_info['speaker']:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                saved_info = speaker_info
+                if not saved_info['company']:
+                    saved_info['company'] = "Unknown"
+            else:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                saved_info['timestamp'] = speaker_info['timestamp']
+        elif saved_info:
+            continue
 
-def read_json_from_file(filename):
-    try:
-        if os.path.exists(filename):
-            with open(filename, 'r', encoding='utf-8') as f:
-                return json.load(f)
-        return None
-    except json.JSONDecodeError as e:
-        raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
-    except Exception as e:
-        raise Exception(f"Error reading file {filename}: {str(e)}")
+    if saved_info:
+        text = segments[-1]
+        parsed_segments.append(TranscriptSegment(
+            metadata={
+                'speaker': saved_info['speaker'],
+                'company': saved_info['company'],
+                'start_timestamp': saved_info['timestamp'],
+                'end_timestamp': "00:00:00",
+                'subjects': extract_subject_info(text)
+            },
+            text=text
+        ))
 
+    return parsed_segments
 
-def extract_subject_info(text):
-    # Convert text to lowercase for case-insensitive matching
-    lower_text = text.lower()
 
-    # Find all subjects present in the text
-    found_subjects = [
-        subject for subject in SUBJECTS if subject.lower() in lower_text]
+def get_cached_filename(url: str) -> str:
+    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}"
 
-    return found_subjects
 
+async def process_url(url: str) -> Optional[VideoInfo]:
+    try:
+        cached_filename = get_cached_filename(url)
+        html_filename = f"{cached_filename}.html"
+        json_filename = f"{cached_filename}.json"
 
-PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+        if os.path.exists(json_filename):
+            logger.info(f"Using cached JSON for {url}")
+            with open(json_filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            return VideoInfo(
+                metadata=data['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in data['transcript']]
+            )
+
+        if os.path.exists(html_filename):
+            logger.info(f"Using cached HTML for {url}")
+            content = read_file(html_filename)
+        else:
+            logger.info(f"Fetching content from web for {url}")
+            content = await get_client_rendered_content(url)
+            with open(html_filename, 'w', encoding='utf-8') as f:
+                f.write(content)
 
+        info = extract_info(content)
 
-def extract_speaker_info(segment):
-    try:
-        match = re.match(PATTERN, segment)
-        if match:
-            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
-        else:
-            return None
-
-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
+        if info.transcript:
+            logger.info(f"Generating clips for {url}")
+            info_dict = asdict(info)
+            info_dict['transcript'] = generate_clips(CACHE_DIR, info_dict)
+            info = VideoInfo(
+                metadata=info_dict['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in info_dict['transcript']]
+            )
 
+            with open(json_filename, 'w', encoding='utf-8') as f:
+                json.dump(asdict(info), f, ensure_ascii=False, indent=4)
 
-def parse_transcript(content):
-    try:
-        parsed_segments = []
-        saved_info = None
-
-        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
-        segments = re.split(pattern, content)
-        segments = [segment.strip() for segment in segments if segment.strip()]
-
-        for i, segment in enumerate(segments):
-            speaker_info = extract_speaker_info(segment)
-            if speaker_info:
-                if speaker_info['speaker']:
-                    # Full speaker, company, timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                    saved_info = speaker_info
-                else:
-                    # Standalone timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                    saved_info['timestamp'] = speaker_info['timestamp']
-            elif saved_info:
-                # Text segment
-                continue
-
-        # Add final entry
-        if saved_info:
-            text = segments[-1]
-            subjects = extract_subject_info(text)
-            parsed_segments.append({
-                'metadata': {
-                    'speaker': saved_info['speaker'],
-                    'company': saved_info['company'],
-                    'start_timestamp': saved_info['timestamp'],
-                    'end_timestamp': "00:00:00",
-                    'subjects': subjects
-                },
-                'text': text
-            })
-
-        return parsed_segments
-    except Exception as e:
-        raise Exception(f"Error parsing transcript: {str(e)}")
+            logger.info(f"Information extracted and saved to {json_filename}")
+        else:
+            logger.warning(f"No transcript found for {url}")
 
+        return info
 
-def get_cached_filename(url):
-    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"
+    except Exception:
+        logger.error(f"Error processing URL {url}:\n{traceback.format_exc()}")
+        return None
 
 
-async def process_url(url):
-    try:
-        cached_filename = get_cached_filename(url)
-        json_filename = f"{cached_filename}.json"
-        info = read_json_from_file(json_filename)
+async def process_urls(urls: List[str]) -> List[Optional[VideoInfo]]:
+    return await asyncio.gather(*[process_url(url) for url in urls])
 
-        if info:
-            return info
 
-        content = read_html_from_file(cached_filename)
+def save_metadata_sets(processed_urls: Set[str], speakers: Set[str], companies: Dict[str, Set[str]], sentiments: Set[str], subjects: Set[str]):
+    metadata = {
+        'processed_urls': list(processed_urls),
+        'speakers': list(speakers),
+        'companies': {company: list(speakers) for company, speakers in companies.items()},
+        'sentiments': list(sentiments),
+        'subjects': list(subjects)
+    }
 
-        if content is None:
-            print(f"Fetching content from web for {url}...")
-            content = await get_client_rendered_content(url)
-            with open(cached_filename, 'w', encoding='utf-8') as f:
-                f.write(content)
-        else:
-            print(f"Using cached content from file for {url}...")
+    with open(DB_METADATA_FILE, 'w') as f:
+        json.dump(metadata, f, indent=2)
 
-        info = extract_info(content)
-        transcript = info['transcript']
-        if (transcript):
-            info['transcript'] = parse_transcript(transcript)
-            generate_clips(CACHE_DIR, info)
-            with open(json_filename, 'w', encoding='utf-8') as f:
-                json.dump(info, f, ensure_ascii=False, indent=4)
-            print(f"Information extracted and saved to {json_filename}")
-        else:
-            print(f"No transcript found for {url}")
-        return info
 
-    except Exception as e:
-        print(f"Error processing URL {url}:")
-        print(traceback.format_exc())
-        print(f"Detailed error: {str(e)}")
-        return None
+def db_load_metadata_sets() -> tuple:
+    if os.path.exists(DB_METADATA_FILE):
+        with open(DB_METADATA_FILE, 'r') as f:
+            metadata = json.load(f)
 
+        return (
+            set(metadata.get('processed_urls', [])),
+            set(metadata.get('speakers', [])),
+            {company: set(speakers) for company, speakers in metadata.get('companies', {}).items()},
+            set(metadata.get('sentiments', [])),
+            set(metadata.get('subjects', SUBJECTS))
+        )
 
-async def process_urls(urls):
-    tasks = [process_url(url) for url in urls]
-    return await asyncio.gather(*tasks)
+    return set(), set(), {}, set(), set(SUBJECTS)
 
 
-def main():
-    global assistant
+async def main():
     assistant = get_ai_assistant()
-
-    url_file = "dsp-urls-one.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"
 
     if not os.path.exists(url_file):
-        print(f"Error: {url_file} not found.")
+        logger.error(f"Error: {url_file} not found.")
         return
 
-    content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
-
-    # Convert companies to a dictionary of speaker sets if it's not already
-    if not isinstance(companies, dict):
-        companies = {company: set() for company in companies}
+    processed_urls, speakers, companies, sentiments, subjects = db_load_metadata_sets()
 
     with open(url_file, 'r') as f:
         urls = [line.strip() for line in f if line.strip()]
 
-    for url in urls:
-        # Generate a hash of the url
-        filename_hash = hashlib.md5(url.encode()).hexdigest()
-        # Check if this content has already been added
-        if filename_hash in content_hashes:
-            print(f"{url} already added")
+    total_urls = len(urls)
+    for i, url in enumerate(urls, 1):
+        if url in processed_urls:
+            logger.info(f"[{i}/{total_urls}] {url} already processed")
            continue
 
-        info = asyncio.run(process_url(url))
+        logger.info(f"[{i}/{total_urls}] Processing {url}")
+        info = await process_url(url)
         if info is None:
+            logger.warning(f"[{i}/{total_urls}] Failed to process {url}")
             continue
 
-        metadata = info['metadata']
-        transcript = info['transcript']
-
-        if transcript is None:
-            continue
+        for entry in info.transcript:
+            metadata = {**info.metadata, **entry.metadata}
+            company = metadata.get('company')
+            speaker = metadata.get('speaker')
+            entry_subjects = metadata.get('subjects', [])
 
-        for entry in transcript:
-            metadata.update(entry['metadata'])
-            company = metadata['company']
-            speaker = metadata['speaker']
-            entry_subjects = metadata['subjects']
-
-            speakers.add(speaker)
-            # Add new subjects to the master set
+            if speaker:
+                speakers.add(speaker)
             subjects.update(entry_subjects)
 
-            text = entry['text']
-
-            assistant.add_to_knowledge_base(
-                text, data_type='text', metadata=metadata.copy())
+            assistant.add_to_knowledge_base(entry.text, data_type='text', metadata=metadata.copy())
 
-            if company not in companies:
-                companies[company] = set()
-            companies[company].add(speaker)
+            if company and speaker:
+                companies.setdefault(company, set()).add(speaker)
 
-        content_hashes.add(filename_hash)
-        print(f"Added new url: {url}")
-
-        # Save updated hashes and metadata
-        save_metadata_sets(content_hashes, speakers,
-                           companies, sentiments, subjects)
+        processed_urls.add(url)
+        logger.info(f"[{i}/{total_urls}] Added new url: {url}")
 
+    save_metadata_sets(processed_urls, speakers, companies, sentiments, subjects)
     assistant.save()
 
-    print("Processing complete. Check individual URL outputs for any errors.")
-
-
-def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
-    metadata = {
-        'content_hashes': list(content_hashes),
-        'speakers': list(speakers),
-        'companies': {company: list(speakers) for company, speakers in companies.items()},
-        'sentiments': list(sentiments),
-        'subjects': list(subjects)
-    }
-
-    with open(DB_METADATA_FILE, 'w') as f:
-        json.dump(metadata, f, indent=2)
-
-
-def db_load_metadata_sets():
-    content_hashes = set()
-    speakers = set()
-    companies = {}
-    sentiments = set()
-    subjects = set()
-
-    if os.path.exists(DB_METADATA_FILE):
-        with open(DB_METADATA_FILE, 'r') as f:
-            metadata = json.load(f)
-
-        content_hashes = set(metadata.get('content_hashes', []))
-        speakers = set(metadata.get('speakers', []))
-        companies = {company: set(speakers) for company, speakers in metadata.get(
-            'companies', {}).items()}
-        sentiments = set(metadata.get('sentiments', []))
-        subjects = set(metadata.get('subjects', SUBJECTS))
-
-    return content_hashes, speakers, companies, sentiments, subjects
+    logger.info("Processing complete. Check logs for any errors.")
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
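
For reference, a minimal usage sketch of the refactored parser above. This is illustrative only and not part of the commit: it assumes ttv_web_scraper is importable with its dependencies installed, and the speaker, company, and transcript snippet are made up; the snippet just follows the `<br><br>Speaker, Company (HH:MM:SS):<br>` convention that extract_speaker_info's regex expects.

# Illustrative only: run a tiny hand-written transcript snippet through the
# new parse_transcript() and inspect the resulting TranscriptSegment objects.
from ttv_web_scraper import parse_transcript  # imports the module shown above

snippet = (
    "<br><br>Jane Doe, ExampleCo (00:00:05):<br>"   # hypothetical speaker/company
    "We are investing in Open RAN and AI this year."
    "<br><br>(00:00:42):<br>"                       # standalone timestamp segment
    "Closing remarks."
)

for seg in parse_transcript(snippet):
    m = seg.metadata
    # First segment prints: Jane Doe ExampleCo 00:00:05 00:00:42 [' AI ', ' Open RAN ']
    print(m['speaker'], m['company'], m['start_timestamp'],
          m['end_timestamp'], m['subjects'])
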
video_utils.py CHANGED
@@ -111,9 +111,9 @@ def main():
 def generate_clips(cache_dir, info):
     yt_id = info['metadata']['youtube_id']
     download_file = get_youtube_video(cache_dir, yt_id)
+    transcript = info['transcript']
 
     if download_file:
-        transcript = info['transcript']
         video = VideoFileClip(download_file)
 
         for entry in transcript:
@@ -127,9 +127,6 @@ def generate_clips(cache_dir, info):
             end_time = min(video.duration, end_time +
                            1) if end_time != 0 else video.duration
 
-            # Create clip
-            clip = video.subclip(start_time, end_time)
-
             # Generate output filename
             output_filename = (
                 f"{CLIP_DIR}{yt_id}-"
@@ -140,6 +137,9 @@ def generate_clips(cache_dir, info):
 
             if os.path.exists(output_filename):
                 continue
+
+            # Create clip
+            clip = video.subclip(start_time, end_time)
 
             # Write the clip to a file
             clip.write_videofile(
@@ -151,6 +151,8 @@ def generate_clips(cache_dir, info):
         video.close()
     else:
         print(f"Failed to download video for YouTube ID: {yt_id}")
+
+    return transcript
 
 
 if __name__ == "__main__":
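
Taken together, the two files now cooperate as follows: generate_clips(cache_dir, info) returns the transcript list it was given after writing any missing clips, and process_url wraps that list back into TranscriptSegment/VideoInfo dataclasses and caches the result as JSON. Below is a minimal, self-contained sketch of that serialization round-trip; the data is a stand-in, the dataclasses are copied from ttv_web_scraper.py above, and no video is downloaded.

# Stand-in data; mirrors the asdict()/json.dump()/TranscriptSegment(**seg)
# round-trip that process_url performs on generate_clips' return value.
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional
import json


@dataclass
class TranscriptSegment:          # copied from ttv_web_scraper.py
    metadata: Dict[str, Optional[str]]
    text: str


@dataclass
class VideoInfo:                  # copied from ttv_web_scraper.py
    metadata: Dict[str, Optional[str]]
    transcript: List[TranscriptSegment]


info = VideoInfo(
    metadata={'title': 'Example talk', 'date': None, 'youtube_id': 'abc123'},  # hypothetical values
    transcript=[TranscriptSegment(
        metadata={'speaker': 'Jane Doe', 'company': 'ExampleCo',
                  'start_timestamp': '00:00:05', 'end_timestamp': '00:00:42',
                  'subjects': [' AI ']},
        text='We are investing in AI this year.')],
)

blob = json.dumps(asdict(info), ensure_ascii=False, indent=4)   # what lands in the .json cache
restored_data = json.loads(blob)
restored = VideoInfo(
    metadata=restored_data['metadata'],
    transcript=[TranscriptSegment(**seg) for seg in restored_data['transcript']],
)
assert restored == info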