Joshua Lochner committed on
Commit
3fd1e1e
1 Parent(s): 17feb06

Add requirements.txt and update model

added_tokens.json CHANGED
@@ -1 +1 @@
-{"URL_TOKEN": 30523, "SHORT_HYPHENATED_TOKEN": 30527, "START_SPONSOR_TOKEN": 30534, "END_INTERACTION_TOKEN": 30539, "END_SELFPROMO_TOKEN": 30537, "[Laughter]": 30531, "BETWEEN_SEGMENTS_TOKEN": 30540, "NUMBER_PERCENTAGE_TOKEN": 30525, "NUMBER_TOKEN": 30526, "PROFANITY_TOKEN": 30532, "NO_SEGMENT_TOKEN": 30533, "HYPHENATED_URL_TOKEN": 30524, "START_SELFPROMO_TOKEN": 30536, "[Music]": 30529, "[Applause]": 30530, "START_INTERACTION_TOKEN": 30538, "LONG_WORD_TOKEN": 30528, "END_SPONSOR_TOKEN": 30535, "EXTRACT_SEGMENTS: ": 30522}
+{"END_SELFPROMO_TOKEN": 30537, "START_SPONSOR_TOKEN": 30534, "START_INTERACTION_TOKEN": 30538, "[Applause]": 30530, "NUMBER_PERCENTAGE_TOKEN": 30525, "END_INTERACTION_TOKEN": 30539, "EXTRACT_SEGMENTS: ": 30522, "START_SELFPROMO_TOKEN": 30536, "LONG_WORD_TOKEN": 30528, "HYPHENATED_URL_TOKEN": 30524, "[Laughter]": 30531, "[Music]": 30529, "PROFANITY_TOKEN": 30532, "SHORT_HYPHENATED_TOKEN": 30527, "NUMBER_TOKEN": 30526, "URL_TOKEN": 30523, "BETWEEN_SEGMENTS_TOKEN": 30540, "END_SPONSOR_TOKEN": 30535, "NO_SEGMENT_TOKEN": 30533}
config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "bert-based-uncased",
+  "_name_or_path": "models/sponsorblock-classifier-v2/",
   "architectures": [
     "BertForSequenceClassification"
   ],
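
The architecture block is unchanged; only _name_or_path now records the v2 training directory instead of the old base-model name (note "bert-based-uncased" appears to be a misspelling of the canonical "bert-base-uncased" id). Loading the checkpoint stays standard transformers usage; a minimal sketch using the path from the config:

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Path taken from "_name_or_path" above; substitute the hub repo id if
# loading from the Hugging Face Hub instead of a local directory.
path = 'models/sponsorblock-classifier-v2/'
model = AutoModelForSequenceClassification.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)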
pipeline.py CHANGED
@@ -1,6 +1,11 @@
 import json
 from functools import lru_cache
-import youtube_transcript_api2
+from youtube_transcript_api import (
+    YouTubeTranscriptApi,
+    TooManyRequests,
+    YouTubeRequestFailed,
+    CouldNotRetrieveTranscript
+)
 import json
 import re
 import requests
@@ -165,7 +170,7 @@ def parse_transcript_json(json_data, granularity):
 
 def list_transcripts(video_id):
     try:
-        return youtube_transcript_api2.YouTubeTranscriptApi.list_transcripts(video_id)
+        return YouTubeTranscriptApi.list_transcripts(video_id)
     except json.decoder.JSONDecodeError:
         return None
 
@@ -198,14 +203,13 @@ def get_words(video_id, transcript_type='auto', fallback='manual', filter_words_
                 f'{ts._url}&fmt=json3').content
             if raw_transcript:
                 raw_transcript_json = json.loads(raw_transcript)
-
-    except (youtube_transcript_api2.TooManyRequests, youtube_transcript_api2.YouTubeRequestFailed):
+    except (TooManyRequests, YouTubeRequestFailed):
         raise # Cannot recover from these errors and do not mark as empty transcript
 
     except requests.exceptions.RequestException: # Can recover
        return get_words(video_id, transcript_type, fallback, granularity)
 
-    except youtube_transcript_api2.CouldNotRetrieveTranscript: # Retrying won't solve
+    except CouldNotRetrieveTranscript: # Retrying won't solve
        pass # Mark as empty transcript
 
    except json.decoder.JSONDecodeError:
@@ -285,10 +289,11 @@ class PreTrainedPipeline():
         self.tokenizer2 = AutoTokenizer.from_pretrained(path)
         self.pipeline2 = SponsorBlockClassificationPipeline(
             model=self.model2, tokenizer=self.tokenizer2)
-
-    def __call__(self, inputs: str)-> List[Dict[str, Any]]:
 
-        if ' ' not in inputs and inputs.count(',') >= 2: # Automated call (compressed string)
+    def __call__(self, inputs: str) -> List[Dict[str, Any]]:
+
+        # Automated call (compressed string)
+        if ' ' not in inputs and inputs.count(',') >= 2:
             split_info = inputs.split(',', 1)
             times = np.reshape(np.array(split_info[1].split(',')), (-1, 2))
             data = []
@@ -304,15 +309,14 @@ class PreTrainedPipeline():
             return self.pipeline2(data)
 
 
-
 class SponsorBlockClassificationPipeline(TextClassificationPipeline):
     def __init__(self, model, tokenizer):
         super().__init__(model=model, tokenizer=tokenizer, return_all_scores=True)
 
     def preprocess(self, data, **tokenizer_kwargs):
-        if isinstance(data, str): # If string, assume this is what user wants to classify
-            text = data
-        else: # Otherwise, get data from transcript
+        if isinstance(data, str): # If string, assume this is what user wants to classify
+            text = data
+        else: # Otherwise, get data from transcript
             words = get_words(data['video_id'])
             segment_words = extract_segment(words, data['start'], data['end'])
             text = ' '.join(x['text'] for x in segment_words)
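
Taken together, these hunks swap the vendored youtube_transcript_api2 copy (deleted at the bottom of this commit) for the published youtube_transcript_api package now listed in requirements.txt. A minimal sketch of the resulting call pattern; the video id is a placeholder:

import json

from youtube_transcript_api import (
    YouTubeTranscriptApi,
    CouldNotRetrieveTranscript,
)

def list_transcripts(video_id):
    # Same contract as the updated helper above: a TranscriptList,
    # or None when YouTube's response cannot be parsed.
    try:
        return YouTubeTranscriptApi.list_transcripts(video_id)
    except json.decoder.JSONDecodeError:
        return None

try:
    transcripts = list_transcripts('dQw4w9WgXcQ')  # placeholder video id
    if transcripts is not None:
        for transcript in transcripts:
            print(transcript.language_code, transcript.is_generated)
except CouldNotRetrieveTranscript:
    pass  # disabled/unavailable transcripts, rate limits, etc.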
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:59a2338b4e0278a49bf75ae1a771de425811ffdbba69108a4b92526497cd0dae
+oid sha256:012162a219c071b5208c4179f56b8e6263bad1b532f7fc4fc13f0311de5f1729
 size 438084653
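
Only the Git LFS pointer changes here: identical byte size, new content hash, i.e. retrained weights for the same architecture. A downloaded checkpoint can be verified against the pointer; a small sketch assuming the file sits in the working directory:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so a ~438 MB checkpoint never sits fully in memory.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

expected = '012162a219c071b5208c4179f56b8e6263bad1b532f7fc4fc13f0311de5f1729'
assert sha256_of('pytorch_model.bin') == expected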
requirements.txt ADDED
@@ -0,0 +1 @@
+youtube_transcript_api
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:156876b214e338523f93bae5052b71d5b3872e79cd8fe551742463c8af0b6821
-size 14439
+oid sha256:a203bf3393a6882ed6527945f185bfa78a49e0b8709ba172146c735dcfc1aa6c
+size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54284b17a87ffcefec12bdfce42ddab7e6d490e5c99efdd0a5cf1220abf9fd71
+oid sha256:28e1eb9d44a2bee663e6731a6c6d574d4d710f519e640406cfff15e0f37ab9f9
 size 623
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
 {
   "version": "1.0",
-  "truncation": null,
-  "padding": null,
+  "truncation": {
+    "direction": "Right",
+    "max_length": 512,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
+  "padding": {
+    "strategy": {
+      "Fixed": 512
+    },
+    "direction": "Right",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "[PAD]"
+  },
   "added_tokens": [
     {
       "id": 0,
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "models/classifier-85000", "tokenizer_class": "BertTokenizer"}
+{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "models/sponsorblock-classifier-v2/", "tokenizer_class": "BertTokenizer"}
trainer_state.json CHANGED
@@ -1,658 +1,229 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 3.2687460617517328,
-  "global_step": 415000,
+  "epoch": 1.1027095148078134,
+  "global_step": 140000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.04,
-      "learning_rate": 1.921235034656585e-05,
-      "loss": 0.3334,
+      "learning_rate": 1.9921235034656587e-06,
+      "loss": 0.2621,
       "step": 5000
     },
     {
       "epoch": 0.08,
-      "learning_rate": 1.8424700693131696e-05,
-      "loss": 0.3387,
+      "learning_rate": 1.984247006931317e-06,
+      "loss": 0.2548,
       "step": 10000
     },
     {
       "epoch": 0.12,
-      "learning_rate": 1.7637051039697544e-05,
-      "loss": 0.3327,
+      "learning_rate": 1.9763705103969753e-06,
+      "loss": 0.2451,
       "step": 15000
     },
     {
       "epoch": 0.16,
-      "learning_rate": 1.684940138626339e-05,
-      "loss": 0.3492,
+      "learning_rate": 1.9684940138626337e-06,
+      "loss": 0.2479,
       "step": 20000
     },
     {
       "epoch": 0.2,
-      "learning_rate": 1.606175173282924e-05,
-      "loss": 0.3349,
+      "learning_rate": 1.9606175173282924e-06,
+      "loss": 0.2477,
       "step": 25000
     },
     {
       "epoch": 0.2,
-      "eval_accuracy": 0.9155995845794678,
-      "eval_loss": 0.389266699552536,
-      "eval_runtime": 551.1932,
-      "eval_samples_per_second": 51.289,
-      "eval_steps_per_second": 12.823,
+      "eval_accuracy": 0.9227449297904968,
+      "eval_loss": 0.42459121346473694,
+      "eval_runtime": 532.8547,
+      "eval_samples_per_second": 53.054,
+      "eval_steps_per_second": 13.264,
       "step": 25000
     },
     {
       "epoch": 0.24,
-      "learning_rate": 1.5274102079395087e-05,
-      "loss": 0.3279,
+      "learning_rate": 1.9527410207939508e-06,
+      "loss": 0.253,
       "step": 30000
     },
     {
       "epoch": 0.28,
-      "learning_rate": 1.4486452425960932e-05,
-      "loss": 0.3301,
+      "learning_rate": 1.9448645242596095e-06,
+      "loss": 0.2466,
       "step": 35000
     },
     {
       "epoch": 0.32,
-      "learning_rate": 1.369880277252678e-05,
-      "loss": 0.3243,
+      "learning_rate": 1.936988027725268e-06,
+      "loss": 0.25,
       "step": 40000
     },
     {
       "epoch": 0.35,
-      "learning_rate": 1.2911153119092628e-05,
-      "loss": 0.293,
+      "learning_rate": 1.929111531190926e-06,
+      "loss": 0.2402,
       "step": 45000
     },
     {
       "epoch": 0.39,
-      "learning_rate": 1.2123503465658477e-05,
-      "loss": 0.3053,
+      "learning_rate": 1.9212350346565845e-06,
+      "loss": 0.2515,
       "step": 50000
     },
     {
       "epoch": 0.39,
-      "eval_accuracy": 0.9235231876373291,
-      "eval_loss": 0.3810465931892395,
-      "eval_runtime": 542.4272,
-      "eval_samples_per_second": 52.118,
-      "eval_steps_per_second": 13.03,
+      "eval_accuracy": 0.926034688949585,
+      "eval_loss": 0.3926495909690857,
+      "eval_runtime": 506.3742,
+      "eval_samples_per_second": 55.828,
+      "eval_steps_per_second": 13.958,
       "step": 50000
     },
     {
       "epoch": 0.43,
-      "learning_rate": 1.1335853812224324e-05,
-      "loss": 0.3126,
+      "learning_rate": 1.9133585381222433e-06,
+      "loss": 0.2383,
       "step": 55000
     },
     {
       "epoch": 0.47,
-      "learning_rate": 1.0548204158790173e-05,
-      "loss": 0.3072,
+      "learning_rate": 1.9054820415879016e-06,
+      "loss": 0.2523,
       "step": 60000
     },
     {
       "epoch": 0.51,
-      "learning_rate": 9.760554505356018e-06,
-      "loss": 0.2957,
+      "learning_rate": 1.8976055450535602e-06,
+      "loss": 0.2372,
       "step": 65000
     },
     {
       "epoch": 0.55,
-      "learning_rate": 8.972904851921865e-06,
-      "loss": 0.2968,
+      "learning_rate": 1.8897290485192185e-06,
+      "loss": 0.2395,
       "step": 70000
     },
     {
       "epoch": 0.59,
-      "learning_rate": 8.185255198487714e-06,
-      "loss": 0.2882,
+      "learning_rate": 1.881852551984877e-06,
+      "loss": 0.2376,
       "step": 75000
     },
     {
       "epoch": 0.59,
-      "eval_accuracy": 0.9224973320960999,
-      "eval_loss": 0.37537074089050293,
-      "eval_runtime": 521.9317,
-      "eval_samples_per_second": 54.164,
-      "eval_steps_per_second": 13.542,
+      "eval_accuracy": 0.9263883829116821,
+      "eval_loss": 0.3989144265651703,
+      "eval_runtime": 505.8843,
+      "eval_samples_per_second": 55.882,
+      "eval_steps_per_second": 13.972,
       "step": 75000
     },
     {
       "epoch": 0.63,
-      "learning_rate": 7.3976055450535615e-06,
-      "loss": 0.2754,
+      "learning_rate": 1.8739760554505356e-06,
+      "loss": 0.2331,
       "step": 80000
     },
     {
       "epoch": 0.67,
-      "learning_rate": 6.6099558916194085e-06,
-      "loss": 0.2607,
+      "learning_rate": 1.8660995589161941e-06,
+      "loss": 0.2426,
       "step": 85000
     },
     {
       "epoch": 0.71,
-      "learning_rate": 5.8223062381852555e-06,
-      "loss": 0.2818,
+      "learning_rate": 1.8582230623818525e-06,
+      "loss": 0.2493,
       "step": 90000
     },
     {
       "epoch": 0.75,
-      "learning_rate": 5.034656584751103e-06,
-      "loss": 0.2736,
+      "learning_rate": 1.850346565847511e-06,
+      "loss": 0.2379,
       "step": 95000
     },
     {
       "epoch": 0.79,
-      "learning_rate": 4.24700693131695e-06,
-      "loss": 0.2644,
+      "learning_rate": 1.8424700693131694e-06,
+      "loss": 0.2428,
       "step": 100000
     },
     {
       "epoch": 0.79,
-      "eval_accuracy": 0.9297842383384705,
-      "eval_loss": 0.3645715117454529,
-      "eval_runtime": 521.9055,
-      "eval_samples_per_second": 54.167,
-      "eval_steps_per_second": 13.543,
+      "eval_accuracy": 0.9267421364784241,
+      "eval_loss": 0.3985295295715332,
+      "eval_runtime": 549.265,
+      "eval_samples_per_second": 51.469,
+      "eval_steps_per_second": 12.868,
       "step": 100000
     },
     {
       "epoch": 0.83,
-      "learning_rate": 3.459357277882798e-06,
-      "loss": 0.2552,
+      "learning_rate": 1.834593572778828e-06,
+      "loss": 0.2429,
       "step": 105000
     },
     {
       "epoch": 0.87,
-      "learning_rate": 2.6717076244486457e-06,
-      "loss": 0.266,
+      "learning_rate": 1.8267170762444864e-06,
+      "loss": 0.238,
       "step": 110000
     },
     {
       "epoch": 0.91,
-      "learning_rate": 1.884057971014493e-06,
-      "loss": 0.2684,
+      "learning_rate": 1.818840579710145e-06,
+      "loss": 0.2322,
       "step": 115000
     },
     {
       "epoch": 0.95,
-      "learning_rate": 1.0964083175803404e-06,
-      "loss": 0.2501,
+      "learning_rate": 1.8109640831758033e-06,
+      "loss": 0.2371,
       "step": 120000
     },
     {
       "epoch": 0.98,
-      "learning_rate": 3.087586641461878e-07,
-      "loss": 0.273,
+      "learning_rate": 1.8030875866414619e-06,
+      "loss": 0.2303,
       "step": 125000
     },
     {
       "epoch": 0.98,
-      "eval_accuracy": 0.9299964904785156,
-      "eval_loss": 0.3369257152080536,
-      "eval_runtime": 522.7551,
-      "eval_samples_per_second": 54.079,
-      "eval_steps_per_second": 13.521,
+      "eval_accuracy": 0.9282631874084473,
+      "eval_loss": 0.40024659037590027,
+      "eval_runtime": 548.3799,
+      "eval_samples_per_second": 51.552,
+      "eval_steps_per_second": 12.889,
       "step": 125000
     },
     {
       "epoch": 1.02,
-      "learning_rate": 1.7952110901071204e-05,
-      "loss": 0.2834,
+      "learning_rate": 1.7952110901071202e-06,
+      "loss": 0.221,
       "step": 130000
     },
     {
       "epoch": 1.06,
-      "learning_rate": 1.787334593572779e-05,
-      "loss": 0.3047,
+      "learning_rate": 1.7873345935727788e-06,
+      "loss": 0.2199,
       "step": 135000
     },
     {
       "epoch": 1.1,
-      "learning_rate": 1.7794580970384373e-05,
-      "loss": 0.2963,
+      "learning_rate": 1.779458097038437e-06,
+      "loss": 0.2097,
       "step": 140000
-    },
-    {
-      "epoch": 1.14,
-      "learning_rate": 1.771581600504096e-05,
-      "loss": 0.3031,
-      "step": 145000
-    },
-    {
-      "epoch": 1.18,
-      "learning_rate": 1.7637051039697544e-05,
-      "loss": 0.3033,
-      "step": 150000
-    },
-    {
-      "epoch": 1.18,
-      "eval_accuracy": 0.9257162809371948,
-      "eval_loss": 0.4006378650665283,
-      "eval_runtime": 519.4649,
-      "eval_samples_per_second": 54.421,
-      "eval_steps_per_second": 13.606,
-      "step": 150000
-    },
-    {
-      "epoch": 1.22,
-      "learning_rate": 1.755828607435413e-05,
-      "loss": 0.3024,
-      "step": 155000
-    },
-    {
-      "epoch": 1.26,
-      "learning_rate": 1.7479521109010713e-05,
-      "loss": 0.3135,
-      "step": 160000
-    },
-    {
-      "epoch": 1.3,
-      "learning_rate": 1.74007561436673e-05,
-      "loss": 0.3137,
-      "step": 165000
-    },
-    {
-      "epoch": 1.34,
-      "learning_rate": 1.732199117832388e-05,
-      "loss": 0.3227,
-      "step": 170000
-    },
-    {
-      "epoch": 1.38,
-      "learning_rate": 1.7243226212980467e-05,
-      "loss": 0.3246,
-      "step": 175000
-    },
-    {
-      "epoch": 1.38,
-      "eval_accuracy": 0.924018383026123,
-      "eval_loss": 0.3924681842327118,
-      "eval_runtime": 518.8244,
-      "eval_samples_per_second": 54.489,
-      "eval_steps_per_second": 13.623,
-      "step": 175000
-    },
-    {
-      "epoch": 1.42,
-      "learning_rate": 1.7164461247637053e-05,
-      "loss": 0.3281,
-      "step": 180000
-    },
-    {
-      "epoch": 1.46,
-      "learning_rate": 1.708569628229364e-05,
-      "loss": 0.3256,
-      "step": 185000
-    },
-    {
-      "epoch": 1.5,
-      "learning_rate": 1.700693131695022e-05,
-      "loss": 0.313,
-      "step": 190000
-    },
-    {
-      "epoch": 1.54,
-      "learning_rate": 1.6928166351606807e-05,
-      "loss": 0.3313,
-      "step": 195000
-    },
-    {
-      "epoch": 1.58,
-      "learning_rate": 1.684940138626339e-05,
-      "loss": 0.2953,
-      "step": 200000
-    },
-    {
-      "epoch": 1.58,
-      "eval_accuracy": 0.9212592840194702,
-      "eval_loss": 0.3895967900753021,
-      "eval_runtime": 526.2623,
-      "eval_samples_per_second": 53.718,
-      "eval_steps_per_second": 13.431,
-      "step": 200000
-    },
-    {
-      "epoch": 1.61,
-      "learning_rate": 1.6770636420919976e-05,
-      "loss": 0.3103,
-      "step": 205000
-    },
-    {
-      "epoch": 1.65,
-      "learning_rate": 1.669187145557656e-05,
-      "loss": 0.3089,
-      "step": 210000
-    },
-    {
-      "epoch": 1.69,
-      "learning_rate": 1.6613106490233147e-05,
-      "loss": 0.3095,
-      "step": 215000
-    },
-    {
-      "epoch": 1.73,
-      "learning_rate": 1.653434152488973e-05,
-      "loss": 0.3288,
-      "step": 220000
-    },
-    {
-      "epoch": 1.77,
-      "learning_rate": 1.6455576559546316e-05,
-      "loss": 0.3199,
-      "step": 225000
-    },
-    {
-      "epoch": 1.77,
-      "eval_accuracy": 0.9203749299049377,
-      "eval_loss": 0.3942428529262543,
-      "eval_runtime": 520.6801,
-      "eval_samples_per_second": 54.294,
-      "eval_steps_per_second": 13.575,
-      "step": 225000
-    },
-    {
-      "epoch": 1.81,
-      "learning_rate": 1.6376811594202898e-05,
-      "loss": 0.306,
-      "step": 230000
-    },
-    {
-      "epoch": 1.85,
-      "learning_rate": 1.6298046628859484e-05,
-      "loss": 0.3104,
-      "step": 235000
-    },
-    {
-      "epoch": 1.89,
-      "learning_rate": 1.621928166351607e-05,
-      "loss": 0.3139,
-      "step": 240000
-    },
-    {
-      "epoch": 1.93,
-      "learning_rate": 1.6140516698172656e-05,
-      "loss": 0.3179,
-      "step": 245000
-    },
-    {
-      "epoch": 1.97,
-      "learning_rate": 1.606175173282924e-05,
-      "loss": 0.3226,
-      "step": 250000
-    },
-    {
-      "epoch": 1.97,
-      "eval_accuracy": 0.9243367314338684,
-      "eval_loss": 0.4058537185192108,
-      "eval_runtime": 552.8946,
-      "eval_samples_per_second": 51.131,
-      "eval_steps_per_second": 12.784,
-      "step": 250000
-    },
-    {
-      "epoch": 2.01,
-      "learning_rate": 1.5982986767485824e-05,
-      "loss": 0.3167,
-      "step": 255000
-    },
-    {
-      "epoch": 2.05,
-      "learning_rate": 1.5904221802142407e-05,
-      "loss": 0.3034,
-      "step": 260000
-    },
-    {
-      "epoch": 2.09,
-      "learning_rate": 1.5825456836798993e-05,
-      "loss": 0.2976,
-      "step": 265000
-    },
-    {
-      "epoch": 2.13,
-      "learning_rate": 1.574669187145558e-05,
-      "loss": 0.3039,
-      "step": 270000
-    },
-    {
-      "epoch": 2.17,
-      "learning_rate": 1.5667926906112164e-05,
-      "loss": 0.2889,
-      "step": 275000
-    },
-    {
-      "epoch": 2.17,
-      "eval_accuracy": 0.9221436381340027,
-      "eval_loss": 0.39818692207336426,
-      "eval_runtime": 508.9512,
-      "eval_samples_per_second": 55.546,
-      "eval_steps_per_second": 13.887,
-      "step": 275000
-    },
-    {
-      "epoch": 2.21,
-      "learning_rate": 1.5589161940768747e-05,
-      "loss": 0.3079,
-      "step": 280000
-    },
-    {
-      "epoch": 2.24,
-      "learning_rate": 1.5510396975425333e-05,
-      "loss": 0.3148,
-      "step": 285000
-    },
-    {
-      "epoch": 2.28,
-      "learning_rate": 1.5431632010081915e-05,
-      "loss": 0.2829,
-      "step": 290000
-    },
-    {
-      "epoch": 2.32,
-      "learning_rate": 1.53528670447385e-05,
-      "loss": 0.2978,
-      "step": 295000
-    },
-    {
-      "epoch": 2.36,
-      "learning_rate": 1.5274102079395087e-05,
-      "loss": 0.2963,
-      "step": 300000
-    },
-    {
-      "epoch": 2.36,
-      "eval_accuracy": 0.922957181930542,
-      "eval_loss": 0.43214836716651917,
-      "eval_runtime": 528.9348,
-      "eval_samples_per_second": 53.447,
-      "eval_steps_per_second": 13.363,
-      "step": 300000
-    },
-    {
-      "epoch": 2.4,
-      "learning_rate": 1.5195337114051671e-05,
-      "loss": 0.3007,
-      "step": 305000
-    },
-    {
-      "epoch": 2.44,
-      "learning_rate": 1.5116572148708256e-05,
-      "loss": 0.2901,
-      "step": 310000
-    },
-    {
-      "epoch": 2.48,
-      "learning_rate": 1.5037807183364841e-05,
-      "loss": 0.2905,
-      "step": 315000
-    },
-    {
-      "epoch": 2.52,
-      "learning_rate": 1.4959042218021424e-05,
-      "loss": 0.3082,
-      "step": 320000
-    },
-    {
-      "epoch": 2.56,
-      "learning_rate": 1.488027725267801e-05,
-      "loss": 0.2899,
-      "step": 325000
-    },
-    {
-      "epoch": 2.56,
-      "eval_accuracy": 0.924124538898468,
-      "eval_loss": 0.40452826023101807,
-      "eval_runtime": 520.2943,
-      "eval_samples_per_second": 54.335,
-      "eval_steps_per_second": 13.585,
-      "step": 325000
-    },
-    {
-      "epoch": 2.6,
-      "learning_rate": 1.4801512287334594e-05,
-      "loss": 0.301,
-      "step": 330000
-    },
-    {
-      "epoch": 2.64,
-      "learning_rate": 1.472274732199118e-05,
-      "loss": 0.3014,
-      "step": 335000
-    },
-    {
-      "epoch": 2.68,
-      "learning_rate": 1.4643982356647764e-05,
-      "loss": 0.2974,
-      "step": 340000
-    },
-    {
-      "epoch": 2.72,
-      "learning_rate": 1.456521739130435e-05,
-      "loss": 0.3215,
-      "step": 345000
-    },
-    {
-      "epoch": 2.76,
-      "learning_rate": 1.4486452425960932e-05,
-      "loss": 0.2986,
-      "step": 350000
-    },
-    {
-      "epoch": 2.76,
-      "eval_accuracy": 0.9239122867584229,
-      "eval_loss": 0.4114295542240143,
-      "eval_runtime": 565.9722,
-      "eval_samples_per_second": 49.949,
-      "eval_steps_per_second": 12.488,
-      "step": 350000
-    },
-    {
-      "epoch": 2.8,
-      "learning_rate": 1.4407687460617518e-05,
-      "loss": 0.2881,
-      "step": 355000
-    },
-    {
-      "epoch": 2.84,
-      "learning_rate": 1.4328922495274103e-05,
-      "loss": 0.2937,
-      "step": 360000
-    },
-    {
-      "epoch": 2.87,
-      "learning_rate": 1.4250157529930688e-05,
-      "loss": 0.2987,
-      "step": 365000
-    },
-    {
-      "epoch": 2.91,
-      "learning_rate": 1.4171392564587273e-05,
-      "loss": 0.3355,
-      "step": 370000
-    },
-    {
-      "epoch": 2.95,
-      "learning_rate": 1.4092627599243858e-05,
-      "loss": 0.3086,
-      "step": 375000
-    },
-    {
-      "epoch": 2.95,
-      "eval_accuracy": 0.9162362813949585,
-      "eval_loss": 0.4699816107749939,
-      "eval_runtime": 523.819,
-      "eval_samples_per_second": 53.969,
-      "eval_steps_per_second": 13.493,
-      "step": 375000
-    },
-    {
-      "epoch": 2.99,
-      "learning_rate": 1.4013862633900441e-05,
-      "loss": 0.3187,
-      "step": 380000
-    },
-    {
-      "epoch": 3.03,
-      "learning_rate": 1.3935097668557027e-05,
-      "loss": 0.304,
-      "step": 385000
-    },
-    {
-      "epoch": 3.07,
-      "learning_rate": 1.3856332703213611e-05,
-      "loss": 0.2856,
-      "step": 390000
-    },
-    {
-      "epoch": 3.11,
-      "learning_rate": 1.3777567737870197e-05,
-      "loss": 0.2877,
-      "step": 395000
-    },
-    {
-      "epoch": 3.15,
-      "learning_rate": 1.369880277252678e-05,
-      "loss": 0.2896,
-      "step": 400000
-    },
-    {
-      "epoch": 3.15,
-      "eval_accuracy": 0.9198089838027954,
-      "eval_loss": 0.43677982687950134,
-      "eval_runtime": 526.7389,
-      "eval_samples_per_second": 53.67,
-      "eval_steps_per_second": 13.418,
-      "step": 400000
-    },
-    {
-      "epoch": 3.19,
-      "learning_rate": 1.3620037807183365e-05,
-      "loss": 0.2946,
-      "step": 405000
-    },
-    {
-      "epoch": 3.23,
-      "learning_rate": 1.354127284183995e-05,
-      "loss": 0.2899,
-      "step": 410000
-    },
-    {
-      "epoch": 3.27,
-      "learning_rate": 1.3462507876496535e-05,
-      "loss": 0.2907,
-      "step": 415000
     }
   ],
   "max_steps": 1269600,
   "num_train_epochs": 10,
-  "total_flos": 4.367706162646794e+17,
+  "total_flos": 1.473443106221998e+17,
   "trial_name": null,
   "trial_params": null
 }
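
The replaced state reflects a fresh fine-tuning run: max_steps is unchanged at 1,269,600 (10 epochs of 126,960 steps, consistent with epoch 1.1027 at step 140,000), but the peak learning rate drops from the 2e-5 range to about 2e-6, and losses start around 0.26 rather than 0.33. The logged rates match a linear decay to zero over max_steps; a quick check (the 2e-6 peak is inferred from the logged values, not stated in the file):

MAX_STEPS = 1269600  # "max_steps" from trainer_state.json

def linear_lr(step, peak=2e-6, max_steps=MAX_STEPS):
    # Linear decay from the peak learning rate to zero at max_steps.
    return peak * (1 - step / max_steps)

print(linear_lr(5000))    # ~1.99212e-06, matching the logged 1.9921235034656587e-06
print(linear_lr(140000))  # ~1.77946e-06, matching the logged 1.779458097038437e-06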
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:644215cf0b9aba1d6b87848f34f5414c4363298f97f869acd603d3745f7e08d1
+oid sha256:03c6251eac46fa18822e93d9f2cc88f4b61044f93b32dd36da4e4f50a6315f9c
 size 2991
youtube_transcript_api2/__init__.py DELETED
@@ -1,16 +0,0 @@
-from ._api import YouTubeTranscriptApi
-from ._transcripts import TranscriptList, Transcript
-from ._errors import (
-    TranscriptsDisabled,
-    NoTranscriptFound,
-    CouldNotRetrieveTranscript,
-    VideoUnavailable,
-    TooManyRequests,
-    NotTranslatable,
-    TranslationLanguageNotAvailable,
-    NoTranscriptAvailable,
-    CookiePathInvalid,
-    CookiesInvalid,
-    FailedToCreateConsentCookie,
-    YouTubeRequestFailed,
-)
youtube_transcript_api2/__main__.py DELETED
@@ -1,15 +0,0 @@
-import sys
-
-import logging
-
-from ._cli import YouTubeTranscriptCli
-
-
-def main():
-    logging.basicConfig()
-
-    print(YouTubeTranscriptCli(sys.argv[1:]).run())
-
-
-if __name__ == '__main__':
-    main()
youtube_transcript_api2/_api.py DELETED
@@ -1,140 +0,0 @@
-import requests
-try: # pragma: no cover
-    import http.cookiejar as cookiejar
-    CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
-except ImportError: # pragma: no cover
-    import cookielib as cookiejar
-    CookieLoadError = IOError
-
-from ._transcripts import TranscriptListFetcher
-
-from ._errors import (
-    CookiePathInvalid,
-    CookiesInvalid
-)
-
-
-class YouTubeTranscriptApi(object):
-    @classmethod
-    def list_transcripts(cls, video_id, proxies=None, cookies=None):
-        """
-        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
-        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
-        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
-        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
-        `transcript.translate('en')`. Example::
-
-            # retrieve the available transcripts
-            transcript_list = YouTubeTranscriptApi.get('video_id')
-
-            # iterate over all available transcripts
-            for transcript in transcript_list:
-                # the Transcript object provides metadata properties
-                print(
-                    transcript.video_id,
-                    transcript.language,
-                    transcript.language_code,
-                    # whether it has been manually created or generated by YouTube
-                    transcript.is_generated,
-                    # a list of languages the transcript can be translated to
-                    transcript.translation_languages,
-                )
-
-                # fetch the actual transcript data
-                print(transcript.fetch())
-
-                # translating the transcript will return another transcript object
-                print(transcript.translate('en').fetch())
-
-            # you can also directly filter for the language you are looking for, using the transcript list
-            transcript = transcript_list.find_transcript(['de', 'en'])
-
-            # or just filter for manually created transcripts
-            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])
-
-            # or automatically generated ones
-            transcript = transcript_list.find_generated_transcript(['de', 'en'])
-
-        :param video_id: the youtube video id
-        :type video_id: str
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: the list of available transcripts
-        :rtype TranscriptList:
-        """
-        with requests.Session() as http_client:
-            if cookies:
-                http_client.cookies = cls._load_cookies(cookies, video_id)
-            http_client.proxies = proxies if proxies else {}
-            return TranscriptListFetcher(http_client).fetch(video_id)
-
-    @classmethod
-    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
-        """
-        Retrieves the transcripts for a list of videos.
-
-        :param video_ids: a list of youtube video ids
-        :type video_ids: list[str]
-        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
-        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so.
-        :type languages: list[str]
-        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
-        one of the video transcripts
-        :type continue_after_error: bool
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
-        video ids, which could not be retrieved
-        :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
-        """
-        data = {}
-        unretrievable_videos = []
-
-        for video_id in video_ids:
-            try:
-                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
-            except Exception as exception:
-                if not continue_after_error:
-                    raise exception
-
-                unretrievable_videos.append(video_id)
-
-        return data, unretrievable_videos
-
-    @classmethod
-    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
-        """
-        Retrieves the transcript for a single video. This is just a shortcut for calling::
-
-            YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
-
-        :param video_id: the youtube video id
-        :type video_id: str
-        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
-        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
-        do so.
-        :type languages: list[str]
-        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
-        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
-        :param cookies: a string of the path to a text file containing youtube authorization cookies
-        :type cookies: str
-        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-        :rtype [{'text': str, 'start': float, 'end': float}]:
-        """
-        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
-
-    @classmethod
-    def _load_cookies(cls, cookies, video_id):
-        try:
-            cookie_jar = cookiejar.MozillaCookieJar()
-            cookie_jar.load(cookies)
-            if not cookie_jar:
-                raise CookiesInvalid(video_id)
-            return cookie_jar
-        except CookieLoadError:
-            raise CookiePathInvalid(video_id)
youtube_transcript_api2/_cli.py DELETED
@@ -1,135 +0,0 @@
-import argparse
-
-from ._api import YouTubeTranscriptApi
-
-from .formatters import FormatterLoader
-
-
-class YouTubeTranscriptCli(object):
-    def __init__(self, args):
-        self._args = args
-
-    def run(self):
-        parsed_args = self._parse_args()
-
-        if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
-            return ''
-
-        proxies = None
-        if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
-            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
-
-        cookies = parsed_args.cookies
-
-        transcripts = []
-        exceptions = []
-
-        for video_id in parsed_args.video_ids:
-            try:
-                transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
-            except Exception as exception:
-                exceptions.append(exception)
-
-        return '\n\n'.join(
-            [str(exception) for exception in exceptions]
-            + ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else [])
-        )
-
-    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
-        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
-
-        if parsed_args.list_transcripts:
-            return str(transcript_list)
-
-        if parsed_args.exclude_manually_created:
-            transcript = transcript_list.find_generated_transcript(parsed_args.languages)
-        elif parsed_args.exclude_generated:
-            transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
-        else:
-            transcript = transcript_list.find_transcript(parsed_args.languages)
-
-        if parsed_args.translate:
-            transcript = transcript.translate(parsed_args.translate)
-
-        return transcript.fetch()
-
-    def _parse_args(self):
-        parser = argparse.ArgumentParser(
-            description=(
-                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
-                'It also works for automatically generated subtitles and it does not require a headless browser, like '
-                'other selenium based solutions do!'
-            )
-        )
-        parser.add_argument(
-            '--list-transcripts',
-            action='store_const',
-            const=True,
-            default=False,
-            help='This will list the languages in which the given videos are available in.',
-        )
-        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
-        parser.add_argument(
-            '--languages',
-            nargs='*',
-            default=['en',],
-            type=str,
-            help=(
-                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
-                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
-                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
-                'may have to play around with the language codes a bit, to find the one which is working for you!'
-            ),
-        )
-        parser.add_argument(
-            '--exclude-generated',
-            action='store_const',
-            const=True,
-            default=False,
-            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
-        )
-        parser.add_argument(
-            '--exclude-manually-created',
-            action='store_const',
-            const=True,
-            default=False,
-            help='If this flag is set transcripts which have been manually created will not be retrieved.',
-        )
-        parser.add_argument(
-            '--format',
-            type=str,
-            default='pretty',
-            choices=tuple(FormatterLoader.TYPES.keys()),
-        )
-        parser.add_argument(
-            '--translate',
-            default='',
-            help=(
-                'The language code for the language you want this transcript to be translated to. Use the '
-                '--list-transcripts feature to find out which languages are translatable and which translation '
-                'languages are available.'
-            )
-        )
-        parser.add_argument(
-            '--http-proxy',
-            default='',
-            metavar='URL',
-            help='Use the specified HTTP proxy.'
-        )
-        parser.add_argument(
-            '--https-proxy',
-            default='',
-            metavar='URL',
-            help='Use the specified HTTPS proxy.'
-        )
-        parser.add_argument(
-            '--cookies',
-            default=None,
-            help='The cookie file that will be used for authorization with youtube.'
-        )
-
-        return self._sanitize_video_ids(parser.parse_args(self._args))
-
-    def _sanitize_video_ids(self, args):
-        args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids]
-        return args
youtube_transcript_api2/_errors.py DELETED
@@ -1,112 +0,0 @@
-from ._settings import WATCH_URL
-
-
-class CouldNotRetrieveTranscript(Exception):
-    """
-    Raised if a transcript could not be retrieved.
-    """
-    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
-    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
-    CAUSE_MESSAGE = ''
-    GITHUB_REFERRAL = (
-        '\n\nIf you are sure that the described cause is not responsible for this error '
-        'and that a transcript should be retrievable, please create an issue at '
-        'https://github.com/jdepoix/youtube-transcript-api/issues. '
-        'Please add which version of youtube_transcript_api you are using '
-        'and provide the information needed to replicate the error. '
-        'Also make sure that there are no open issues which already describe your problem!'
-    )
-
-    def __init__(self, video_id):
-        self.video_id = video_id
-        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())
-
-    def _build_error_message(self):
-        cause = self.cause
-        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
-
-        if cause:
-            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
-
-        return error_message
-
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE
-
-
-class YouTubeRequestFailed(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'
-
-    def __init__(self, video_id, http_error):
-        self.reason = str(http_error)
-        super(YouTubeRequestFailed, self).__init__(video_id)
-
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(
-            reason=self.reason,
-        )
-
-
-class VideoUnavailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The video is no longer available'
-
-
-class TooManyRequests(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = (
-        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
-        'One of the following things can be done to work around this:\n\
-- Manually solve the captcha in a browser and export the cookie. '
-        'Read here how to use that cookie with '
-        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
-- Use a different IP address\n\
-- Wait until the ban on your IP has been lifted'
-    )
-
-
-class TranscriptsDisabled(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
-
-
-class NoTranscriptAvailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'No transcripts are available for this video'
-
-
-class NotTranslatable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The requested language is not translatable'
-
-
-class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The requested translation language is not available'
-
-
-class CookiePathInvalid(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
-
-
-class CookiesInvalid(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
-
-
-class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
-
-
-class NoTranscriptFound(CouldNotRetrieveTranscript):
-    CAUSE_MESSAGE = (
-        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
-        '{transcript_data}'
-    )
-
-    def __init__(self, video_id, requested_language_codes, transcript_data):
-        self._requested_language_codes = requested_language_codes
-        self._transcript_data = transcript_data
-        super(NoTranscriptFound, self).__init__(video_id)
-
-    @property
-    def cause(self):
-        return self.CAUSE_MESSAGE.format(
-            requested_language_codes=self._requested_language_codes,
-            transcript_data=str(self._transcript_data),
-        )
youtube_transcript_api2/_html_unescaping.py DELETED
@@ -1,21 +0,0 @@
-import sys
-
-
-# This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
-    # Python 3.4+
-    from html import unescape
-else: # pragma: no cover
-    if sys.version_info.major <= 2:
-        # Python 2
-        import HTMLParser
-
-        html_parser = HTMLParser.HTMLParser()
-    else:
-        # Python 3.0 - 3.3
-        import html.parser
-
-        html_parser = html.parser.HTMLParser()
-
-    def unescape(string):
-        return html_parser.unescape(string)
youtube_transcript_api2/_settings.py DELETED
@@ -1 +0,0 @@
-WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
youtube_transcript_api2/_transcripts.py DELETED
@@ -1,332 +0,0 @@
1
- import sys
2
-
3
- # This can only be tested by using different python versions, therefore it is not covered by coverage.py
4
- if sys.version_info.major == 2: # pragma: no cover
5
- reload(sys)
6
- sys.setdefaultencoding('utf-8')
7
-
8
- import json
9
-
10
- from xml.etree import ElementTree
11
-
12
- import re
13
-
14
- from requests import HTTPError
15
-
16
- from ._html_unescaping import unescape
17
- from ._errors import (
18
- VideoUnavailable,
19
- TooManyRequests,
20
- YouTubeRequestFailed,
21
- NoTranscriptFound,
22
- TranscriptsDisabled,
23
- NotTranslatable,
24
- TranslationLanguageNotAvailable,
25
- NoTranscriptAvailable,
26
- FailedToCreateConsentCookie,
27
- )
28
- from ._settings import WATCH_URL
29
-
30
-
31
- def _raise_http_errors(response, video_id):
32
- try:
33
- response.raise_for_status()
34
- return response
35
- except HTTPError as error:
36
- raise YouTubeRequestFailed(error, video_id)
37
-
38
-
39
- class TranscriptListFetcher(object):
40
- def __init__(self, http_client):
41
- self._http_client = http_client
42
-
43
- def fetch(self, video_id):
44
- return TranscriptList.build(
45
- self._http_client,
46
- video_id,
47
- self._extract_captions_json(self._fetch_video_html(video_id), video_id)
48
- )
49
-
50
- def _extract_captions_json(self, html, video_id):
51
- splitted_html = html.split('"captions":')
52
-
53
- if len(splitted_html) <= 1:
54
- if 'class="g-recaptcha"' in html:
55
- raise TooManyRequests(video_id)
56
- if '"playabilityStatus":' not in html:
57
- raise VideoUnavailable(video_id)
58
-
59
- raise TranscriptsDisabled(video_id)
60
-
61
- captions_json = json.loads(
62
- splitted_html[1].split(',"videoDetails')[0].replace('\n', '')
63
- ).get('playerCaptionsTracklistRenderer')
64
- if captions_json is None:
65
- raise TranscriptsDisabled(video_id)
66
-
67
- if 'captionTracks' not in captions_json:
68
- raise NoTranscriptAvailable(video_id)
69
-
70
- return captions_json
71
-
72
- def _create_consent_cookie(self, html, video_id):
73
- match = re.search('name="v" value="(.*?)"', html)
74
- if match is None:
75
- raise FailedToCreateConsentCookie(video_id)
76
- self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
77
-
78
- def _fetch_video_html(self, video_id):
79
- html = self._fetch_html(video_id)
80
- if 'action="https://consent.youtube.com/s"' in html:
81
- self._create_consent_cookie(html, video_id)
82
- html = self._fetch_html(video_id)
83
- if 'action="https://consent.youtube.com/s"' in html:
84
- raise FailedToCreateConsentCookie(video_id)
85
- return html
86
-
87
- def _fetch_html(self, video_id):
88
- response = self._http_client.get(WATCH_URL.format(video_id=video_id))
89
- return unescape(_raise_http_errors(response, video_id).text)
90
-
91
-
92
- class TranscriptList(object):
93
- """
94
- This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
95
- for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
96
- """
97
- def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
98
- """
99
- The constructor is only for internal use. Use the static build method instead.
100
-
101
- :param video_id: the id of the video this TranscriptList is for
102
- :type video_id: str
103
- :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
104
- :type manually_created_transcripts: dict[str, Transcript]
105
- :param generated_transcripts: dict mapping language codes to the generated transcripts
106
- :type generated_transcripts: dict[str, Transcript]
107
- :param translation_languages: list of languages which can be used for translatable languages
108
- :type translation_languages: list[dict[str, str]]
109
- """
110
- self.video_id = video_id
111
- self._manually_created_transcripts = manually_created_transcripts
112
- self._generated_transcripts = generated_transcripts
113
- self._translation_languages = translation_languages
114
-
115
- @staticmethod
116
- def build(http_client, video_id, captions_json):
117
- """
118
- Factory method for TranscriptList.
119
-
120
- :param http_client: http client which is used to make the transcript retrieving http calls
121
- :type http_client: requests.Session
122
- :param video_id: the id of the video this TranscriptList is for
123
- :type video_id: str
124
- :param captions_json: the JSON parsed from the YouTube pages static HTML
125
- :type captions_json: dict
126
- :return: the created TranscriptList
127
- :rtype TranscriptList:
128
- """
129
- translation_languages = [
130
- {
131
- 'language': translation_language['languageName']['simpleText'],
132
- 'language_code': translation_language['languageCode'],
133
- } for translation_language in captions_json['translationLanguages']
134
- ]
135
-
136
- manually_created_transcripts = {}
137
- generated_transcripts = {}
138
-
139
- for caption in captions_json['captionTracks']:
140
- if caption.get('kind', '') == 'asr':
141
- transcript_dict = generated_transcripts
142
- else:
143
- transcript_dict = manually_created_transcripts
144
-
145
- transcript_dict[caption['languageCode']] = Transcript(
146
- http_client,
147
- video_id,
148
- caption['baseUrl'],
149
- caption['name']['simpleText'],
150
- caption['languageCode'],
151
- caption.get('kind', '') == 'asr',
152
- translation_languages if caption.get('isTranslatable', False) else []
153
- )
154
-
155
- return TranscriptList(
156
- video_id,
157
- manually_created_transcripts,
158
- generated_transcripts,
159
- translation_languages,
160
- )
161
-
162
- def __iter__(self):
163
- return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
164
-
165
- def find_transcript(self, language_codes):
166
- """
167
- Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
168
- are found, generated transcripts are used. If you only want generated transcripts use
169
- `find_manually_created_transcript` instead.
170
-
171
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
172
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
173
- it fails to do so.
174
- :type languages: list[str]
175
- :return: the found Transcript
176
- :rtype Transcript:
177
- :raises: NoTranscriptFound
178
- """
179
- return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
180
-
181
- def find_generated_transcript(self, language_codes):
182
- """
183
- Finds a automatically generated transcript for a given language code.
184
-
185
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
186
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
187
- it fails to do so.
188
- :type languages: list[str]
189
- :return: the found Transcript
190
- :rtype Transcript:
191
- :raises: NoTranscriptFound
192
- """
193
- return self._find_transcript(language_codes, [self._generated_transcripts,])
194
-
195
- def find_manually_created_transcript(self, language_codes):
196
- """
197
- Finds a manually created transcript for a given language code.
198
-
199
- :param language_codes: A list of language codes in a descending priority. For example, if this is set to
200
- ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
201
- it fails to do so.
202
- :type languages: list[str]
203
- :return: the found Transcript
204
- :rtype Transcript:
205
- :raises: NoTranscriptFound
206
- """
207
- return self._find_transcript(language_codes, [self._manually_created_transcripts,])
208
-
209
- def _find_transcript(self, language_codes, transcript_dicts):
210
- for language_code in language_codes:
211
- for transcript_dict in transcript_dicts:
212
- if language_code in transcript_dict:
213
- return transcript_dict[language_code]
214
-
215
- raise NoTranscriptFound(
216
- self.video_id,
217
- language_codes,
218
- self
219
- )
220
-
221
-     def __str__(self):
-         return (
-             'For this video ({video_id}) transcripts are available in the following languages:\n\n'
-             '(MANUALLY CREATED)\n'
-             '{available_manually_created_transcript_languages}\n\n'
-             '(GENERATED)\n'
-             '{available_generated_transcripts}\n\n'
-             '(TRANSLATION LANGUAGES)\n'
-             '{available_translation_languages}'
-         ).format(
-             video_id=self.video_id,
-             available_manually_created_transcript_languages=self._get_language_description(
-                 str(transcript) for transcript in self._manually_created_transcripts.values()
-             ),
-             available_generated_transcripts=self._get_language_description(
-                 str(transcript) for transcript in self._generated_transcripts.values()
-             ),
-             available_translation_languages=self._get_language_description(
-                 '{language_code} ("{language}")'.format(
-                     language=translation_language['language'],
-                     language_code=translation_language['language_code'],
-                 ) for translation_language in self._translation_languages
-             )
-         )
-
-     def _get_language_description(self, transcript_strings):
-         description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings)
-         return description if description else 'None'
-
-
- class Transcript(object):
-     def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
-         """
-         You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
-         TranscriptList.
-
-         :param http_client: http client which is used to make the http calls that retrieve the transcript
-         :type http_client: requests.Session
-         :param video_id: the id of the video this transcript is for
-         :type video_id: str
-         :param url: the url which needs to be called to fetch the transcript
-         :param language: the name of the language this transcript uses
-         :param language_code: the language code of this transcript
-         :param is_generated: whether this transcript was automatically generated
-         :param translation_languages: the languages this transcript can be translated to
-         """
-         self._http_client = http_client
-         self.video_id = video_id
-         self._url = url
-         self.language = language
-         self.language_code = language_code
-         self.is_generated = is_generated
-         self.translation_languages = translation_languages
-         self._translation_languages_dict = {
-             translation_language['language_code']: translation_language['language']
-             for translation_language in translation_languages
-         }
-
-     def fetch(self):
-         """
-         Loads the actual transcript data.
-
-         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
-         :rtype: [{'text': str, 'start': float, 'duration': float}]
-         """
-         response = self._http_client.get(self._url)
-         return _TranscriptParser().parse(
-             _raise_http_errors(response, self.video_id).text,
-         )
-
-     def __str__(self):
-         return '{language_code} ("{language}"){translation_description}'.format(
-             language=self.language,
-             language_code=self.language_code,
-             translation_description='[TRANSLATABLE]' if self.is_translatable else ''
-         )
-
-     @property
-     def is_translatable(self):
-         return len(self.translation_languages) > 0
-
-     def translate(self, language_code):
-         if not self.is_translatable:
-             raise NotTranslatable(self.video_id)
-
-         if language_code not in self._translation_languages_dict:
-             raise TranslationLanguageNotAvailable(self.video_id)
-
-         return Transcript(
-             self._http_client,
-             self.video_id,
-             '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code),
-             self._translation_languages_dict[language_code],
-             language_code,
-             True,
-             [],
-         )
-
-
- class _TranscriptParser(object):
-     HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
-
-     def parse(self, plain_data):
-         return [
-             {
-                 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
-                 'start': float(xml_element.attrib['start']),
-                 'duration': float(xml_element.attrib.get('dur', '0.0')),
-             }
-             for xml_element in ElementTree.fromstring(plain_data)
-             if xml_element.text is not None
-         ]
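
For reference, here is a minimal usage sketch of the TranscriptList and Transcript classes deleted above. It assumes the package exports YouTubeTranscriptApi at the top level (as the upstream youtube_transcript_api does); the video id is a placeholder, not taken from this repo.

    from youtube_transcript_api2 import YouTubeTranscriptApi

    # List every transcript available for a video.
    transcript_list = YouTubeTranscriptApi.list_transcripts('VIDEO_ID')

    # Language priority: try German first, then English. Manually created
    # transcripts win over generated ones; raises NoTranscriptFound if
    # neither language is available.
    transcript = transcript_list.find_transcript(['de', 'en'])

    # fetch() returns [{'text': str, 'start': float, 'duration': float}, ...]
    cues = transcript.fetch()

    # Translatable transcripts can be re-fetched in another language, which
    # simply appends &tlang=... to the transcript url.
    if transcript.is_translatable:
        english_cues = transcript.translate('en').fetch()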
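
_TranscriptParser consumes YouTube's timedtext XML, strips HTML tags, and unescapes entities a second time (presumably because the API double-escapes them). A tiny sketch with a hand-made snippet:

    # ElementTree already decodes '&amp;amp;' to '&amp;', and unescape()
    # then yields a literal '&' in the cue text.
    xml = '<transcript><text start="0.0" dur="1.2">hello &amp;amp; world</text></transcript>'
    cues = _TranscriptParser().parse(xml)
    # [{'text': 'hello & world', 'start': 0.0, 'duration': 1.2}]
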
youtube_transcript_api2/formatters.py DELETED
@@ -1,165 +0,0 @@
- import json
-
- import pprint
-
-
- class Formatter(object):
-     """Formatter should be used as an abstract base class.
-
-     Formatter classes should inherit from this class and implement
-     their own .format_transcript() method, which should return a string. A
-     transcript is represented by a list of dictionaries.
-     """
-
-     def format_transcript(self, transcript, **kwargs):
-         raise NotImplementedError('A subclass of Formatter must implement '
-                                   'their own .format_transcript() method.')
-
-     def format_transcripts(self, transcripts, **kwargs):
-         raise NotImplementedError('A subclass of Formatter must implement '
-                                   'their own .format_transcripts() method.')
-
-
- class PrettyPrintFormatter(Formatter):
-     def format_transcript(self, transcript, **kwargs):
-         """Pretty prints a transcript.
-
-         :param transcript:
-         :return: A pretty printed string representation of the transcript.
-         :rtype: str
-         """
-         return pprint.pformat(transcript, **kwargs)
-
-     def format_transcripts(self, transcripts, **kwargs):
-         """Pretty prints a list of transcripts.
-
-         :param transcripts:
-         :return: A pretty printed string representation of the transcripts.
-         :rtype: str
-         """
-         return self.format_transcript(transcripts, **kwargs)
-
-
- class JSONFormatter(Formatter):
-     def format_transcript(self, transcript, **kwargs):
-         """Converts a transcript into a JSON string.
-
-         :param transcript:
-         :return: A JSON string representation of the transcript.
-         :rtype: str
-         """
-         return json.dumps(transcript, **kwargs)
-
-     def format_transcripts(self, transcripts, **kwargs):
-         """Converts a list of transcripts into a JSON string.
-
-         :param transcripts:
-         :return: A JSON string representation of the transcripts.
-         :rtype: str
-         """
-         return self.format_transcript(transcripts, **kwargs)
-
-
- class TextFormatter(Formatter):
-     def format_transcript(self, transcript, **kwargs):
-         """Converts a transcript into plain text with no timestamps.
-
-         :param transcript:
-         :return: all transcript text lines separated by newline breaks.
-         :rtype: str
-         """
-         return '\n'.join(line['text'] for line in transcript)
-
-     def format_transcripts(self, transcripts, **kwargs):
-         """Converts a list of transcripts into plain text with no timestamps.
-
-         :param transcripts:
-         :return: all transcript text lines separated by newline breaks.
-         :rtype: str
-         """
-         return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
-
-
- class WebVTTFormatter(Formatter):
-     def _seconds_to_timestamp(self, time):
-         """Helper that converts `time` into a WebVTT cue timestamp.
-
-         :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp
-
-         :param time: a float representing time in seconds.
-         :type time: float
-         :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
-         :rtype: str
-         :example:
-         >>> self._seconds_to_timestamp(6.93)
-         '00:00:06.930'
-         """
-         time = float(time)
-         hours, remainder = divmod(time, 3600)
-         mins, secs = divmod(remainder, 60)
-         ms = int(round((time - int(time)) * 1000, 2))
-         # Truncate (rather than round) the seconds, since the fractional
-         # part is already carried by the milliseconds field.
-         return "{:02d}:{:02d}:{:02d}.{:03d}".format(int(hours), int(mins), int(secs), ms)
-
-     def format_transcript(self, transcript, **kwargs):
-         """A basic implementation of WebVTT formatting.
-
-         :param transcript:
-         :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
-         """
-         lines = []
-         for i, line in enumerate(transcript):
-             if i < len(transcript) - 1:
-                 # Look ahead and use the next cue's start time, since using
-                 # the duration value could make consecutive cues overlap.
-                 time_text = "{} --> {}".format(
-                     self._seconds_to_timestamp(line['start']),
-                     self._seconds_to_timestamp(transcript[i + 1]['start'])
-                 )
-             else:
-                 # Reached the end; cannot look ahead, so use the duration.
-                 duration = line['start'] + line['duration']
-                 time_text = "{} --> {}".format(
-                     self._seconds_to_timestamp(line['start']),
-                     self._seconds_to_timestamp(duration)
-                 )
-             lines.append("{}\n{}".format(time_text, line['text']))
-
-         return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
-
-     def format_transcripts(self, transcripts, **kwargs):
-         """A basic implementation of WebVTT formatting for a list of transcripts.
-
-         :param transcripts:
-         :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
-         """
-         return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts])
-
-
- class FormatterLoader(object):
-     TYPES = {
-         'json': JSONFormatter,
-         'pretty': PrettyPrintFormatter,
-         'text': TextFormatter,
-         'webvtt': WebVTTFormatter,
-     }
-
-     class UnknownFormatterType(Exception):
-         def __init__(self, formatter_type):
-             super(FormatterLoader.UnknownFormatterType, self).__init__(
-                 'The format \'{formatter_type}\' is not supported. '
-                 'Choose one of the following formats: {supported_formatter_types}'.format(
-                     formatter_type=formatter_type,
-                     supported_formatter_types=', '.join(FormatterLoader.TYPES.keys()),
-                 )
-             )
-
-     def load(self, formatter_type='pretty'):
-         """
-         Loads the Formatter for the given formatter type.
-
-         :param formatter_type: one of 'json', 'pretty', 'text' or 'webvtt'
-         :return: Formatter object
-         """
-         if formatter_type not in FormatterLoader.TYPES:
-             raise FormatterLoader.UnknownFormatterType(formatter_type)
-         return FormatterLoader.TYPES[formatter_type]()
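
To round this off, a minimal sketch of the deleted formatters (the two cues are made up for illustration). FormatterLoader maps a name to a Formatter subclass, and WebVTTFormatter ends each cue at the next cue's start time, falling back to start + duration for the last one:

    transcript = [
        {'text': 'hello', 'start': 0.0, 'duration': 1.2},
        {'text': 'world', 'start': 1.2, 'duration': 2.1},
    ]

    # Load a formatter by name and render the cues as WebVTT.
    formatter = FormatterLoader().load('webvtt')  # or 'json', 'pretty', 'text'
    print(formatter.format_transcript(transcript))
    # WEBVTT
    #
    # 00:00:00.000 --> 00:00:01.200
    # hello
    #
    # 00:00:01.200 --> 00:00:03.300
    # world

For the timestamps, _seconds_to_timestamp(6.93) splits 6.93 s via divmod into hours=0, mins=0, secs=6 and ms=930, giving '00:00:06.930' as in its docstring example.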