Jonathan Li committed on
Commit 37466ca
1 Parent(s): c988ca1

initial commit

app.py ADDED
@@ -0,0 +1,145 @@
+ import gradio as gr
+ import requests
+ from transformers import AutoTokenizer, pipeline
+ from youtube_transcript_api._transcripts import TranscriptListFetcher
+
+ # Token-classification pipeline over the fine-tuned checkpoint; "first"
+ # aggregation merges sub-word predictions into whole-word spans.
+ tagger = pipeline(
+     "token-classification",
+     "./checkpoint-6000",
+     aggregation_strategy="first",
+ )
+ tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
+ max_size = 512  # the model's maximum sequence length in tokens
+ classes = [False, True]
+
+ def process(obj):
+     """Flatten YouTube json3 caption events into {w, s, e} word timings."""
+     events = obj["events"]
+     new_l = []
+     start_dur = None
+     for line in events:
+         if "segs" in line:
+             # A lone "\n" segment closes a caption line; emit the word
+             # still pending from that line.
+             if len(line["segs"]) == 1 and line["segs"][0]["utf8"] == "\n":
+                 if start_dur is not None:
+                     new_l.append(
+                         {
+                             "w": prev["utf8"],
+                             "s": start_dur + prev["tOffsetMs"],
+                             "e": line["tStartMs"],
+                         }
+                     )
+                 continue
+
+             start_dur = line["tStartMs"]
+             prev = line["segs"][0]
+             prev["tOffsetMs"] = 0
+             for word in line["segs"][1:]:
+                 try:
+                     new_l.append(
+                         {
+                             "w": prev["utf8"],
+                             "s": start_dur + prev["tOffsetMs"],
+                             "e": start_dur + word["tOffsetMs"],
+                         }
+                     )
+                     prev = word
+                 except KeyError:
+                     pass
+
+     return new_l
+
+ def get_transcript(video_id, session):
+     fetcher = TranscriptListFetcher(session)
+     _json = fetcher._extract_captions_json(
+         fetcher._fetch_video_html(video_id), video_id
+     )
+     caption_tracks = _json["captionTracks"]
+     transcript_track_url = ""
+     for track in caption_tracks:
+         if track["languageCode"] == "en":
+             transcript_track_url = track["baseUrl"] + "&fmt=json3"
+
+     if not transcript_track_url:
+         return None
+
+     obj = session.get(transcript_track_url)
+     return process(obj.json())
+
+ def transcript(video_id):
+     words = get_transcript(video_id, requests.Session())
+     if words is None:
+         return ""  # no English caption track was found
+     return " ".join(l["w"].strip() for l in words)
+
+ def inference(transcript):
+     # Tokenize word-by-word so the text can be split into batches on word
+     # boundaries without exceeding the model's 512-token limit.
+     tokens = tokenizer(transcript.split(" "))["input_ids"]
+     current_length = 0
+     current_word_length = 0
+     batches = []
+     for i, w in enumerate(tokens):
+         # Drop the BOS/EOS specials each per-word encoding carries.
+         word = w[:-1] if i == 0 else w[1:] if i == (len(tokens) - 1) else w[1:-1]
+         if (current_length + len(word)) > max_size:
+             batch = " ".join(
+                 tokenizer.batch_decode(
+                     [
+                         tok[1:-1]
+                         for tok in tokens[max(0, i - current_word_length - 1) : i]
+                     ]
+                 )
+             )
+             batches.append(batch)
+             current_word_length = 0
+             current_length = 0
+         # The current word starts (or continues) a batch.
+         current_length += len(word)
+         current_word_length += 1
+     if current_length > 0:
+         batches.append(
+             " ".join(
+                 tokenizer.batch_decode(
+                     [tok[1:-1] for tok in tokens[len(tokens) - current_word_length :]]
+                 )
+             )
+         )
+
+     results = []
+     for split in batches:
+         values = tagger(split)
+         results.extend(
+             {
+                 "sponsor": v["entity_group"] == "LABEL_1",
+                 "phrase": v["word"],
+             }
+             for v in values
+         )
+
+     return results
+
+ def predict(transcript):
+     # HighlightedText takes (text, label-or-None) pairs.
+     return [
+         (span["phrase"], "Sponsor" if span["sponsor"] else None)
+         for span in inference(transcript)
+     ]
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             inp = gr.Textbox(label="Video ID or URL", placeholder="Video id", lines=1, max_lines=1)
+             btn = gr.Button("Fetch Transcript")
+             gr.Examples(["xsLJZyih3Ac"], [inp])
+             text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
+             btn.click(fn=transcript, inputs=inp, outputs=text)
+         with gr.Column():
+             p = gr.Button("Predict Sponsors")
+             highlight = gr.HighlightedText()
+             p.click(fn=predict, inputs=text, outputs=highlight)
+
+ demo.launch()
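
A minimal way to exercise the two functions the UI wires up, outside Gradio (a sketch, assuming ./checkpoint-6000 is available locally and the video has an English caption track; "xsLJZyih3Ac" is the example ID from the UI):

    text = transcript("xsLJZyih3Ac")  # fetch captions, flatten to plain text
    spans = predict(text)             # [(phrase, "Sponsor" | None), ...]
    print(" ".join(p for p, label in spans if label == "Sponsor"))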
checkpoint-6000/config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "_name_or_path": "roberta-base",
+   "architectures": [
+     "RobertaForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.14.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
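
Note: the config defines no id2label mapping, so transformers falls back to the generic two-class names LABEL_0/LABEL_1; this is why app.py tests v["entity_group"] == "LABEL_1" to mark sponsor spans.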
checkpoint-6000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-6000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a25ec5e11fbd8fd9a3190d9ab50a545329cf88d1516627e59fa0755d8079879d
+ size 992568037
checkpoint-6000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a68c8e91ef466cffc171dca01c569a2e7d2fdca14b8ec6f2077225d9f3599c91
+ size 496311025
checkpoint-6000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0925b52aba488abe77b1d5e4ba92d66e652da1713bc029cf45fd5a009047e84
+ size 17563
checkpoint-6000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98aaa98d2036a0b027d755b1e9c68a82eea8d4f594070a80372a01e9478efbda
+ size 623
checkpoint-6000/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
checkpoint-6000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-6000/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}
checkpoint-6000/trainer_state.json ADDED
@@ -0,0 +1,96 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.5710919088766693,
+   "global_step": 6000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.13,
+       "learning_rate": 1.973815134852056e-05,
+       "loss": 0.0851,
+       "step": 500
+     },
+     {
+       "epoch": 0.26,
+       "learning_rate": 1.9476302697041113e-05,
+       "loss": 0.0627,
+       "step": 1000
+     },
+     {
+       "epoch": 0.39,
+       "learning_rate": 1.9214454045561666e-05,
+       "loss": 0.0588,
+       "step": 1500
+     },
+     {
+       "epoch": 0.52,
+       "learning_rate": 1.8952605394082224e-05,
+       "loss": 0.0576,
+       "step": 2000
+     },
+     {
+       "epoch": 0.65,
+       "learning_rate": 1.8690756742602777e-05,
+       "loss": 0.0545,
+       "step": 2500
+     },
+     {
+       "epoch": 0.79,
+       "learning_rate": 1.842890809112333e-05,
+       "loss": 0.055,
+       "step": 3000
+     },
+     {
+       "epoch": 0.92,
+       "learning_rate": 1.8167059439643888e-05,
+       "loss": 0.0545,
+       "step": 3500
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 0.04616822674870491,
+       "eval_runtime": 92.6422,
+       "eval_samples_per_second": 116.265,
+       "eval_steps_per_second": 2.245,
+       "step": 3819
+     },
+     {
+       "epoch": 1.05,
+       "learning_rate": 1.7905210788164442e-05,
+       "loss": 0.0484,
+       "step": 4000
+     },
+     {
+       "epoch": 1.18,
+       "learning_rate": 1.7643362136685e-05,
+       "loss": 0.0452,
+       "step": 4500
+     },
+     {
+       "epoch": 1.31,
+       "learning_rate": 1.7381513485205553e-05,
+       "loss": 0.0434,
+       "step": 5000
+     },
+     {
+       "epoch": 1.44,
+       "learning_rate": 1.711966483372611e-05,
+       "loss": 0.0442,
+       "step": 5500
+     },
+     {
+       "epoch": 1.57,
+       "learning_rate": 1.6857816182246664e-05,
+       "loss": 0.043,
+       "step": 6000
+     }
+   ],
+   "max_steps": 38190,
+   "num_train_epochs": 10,
+   "total_flos": 8.152119124379443e+16,
+   "trial_name": null,
+   "trial_params": null
+ }
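
An observation on the log (not part of the commit): the learning rates are consistent with linear decay from 2e-5 to zero over the full 38,190 steps, i.e. lr(step) = 2e-5 * (1 - step / 38190); at step 6000 that gives 2e-5 * 32190 / 38190 ≈ 1.6858e-05, matching the last logged value.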
checkpoint-6000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:789dce56778df249a9b1f337ca4c887c92633bfb5ce316ecc3291d0cd81231e6
+ size 2927
checkpoint-6000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ youtube_transcript_api
+ torch
+ pandas
+ numpy
+ gradio