Jonathan Li committed
Commit 37466ca
1 parent: c988ca1

initial commit
Files changed:
- app.py +133 -0
- checkpoint-6000/config.json +27 -0
- checkpoint-6000/merges.txt +0 -0
- checkpoint-6000/optimizer.pt +3 -0
- checkpoint-6000/pytorch_model.bin +3 -0
- checkpoint-6000/rng_state.pth +3 -0
- checkpoint-6000/scheduler.pt +3 -0
- checkpoint-6000/special_tokens_map.json +1 -0
- checkpoint-6000/tokenizer.json +0 -0
- checkpoint-6000/tokenizer_config.json +1 -0
- checkpoint-6000/trainer_state.json +96 -0
- checkpoint-6000/training_args.bin +3 -0
- checkpoint-6000/vocab.json +0 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,133 @@
import gradio as gr
import requests
from transformers import AutoTokenizer, pipeline
from youtube_transcript_api._transcripts import TranscriptListFetcher

# Token-classification model fine-tuned to tag sponsor segments (LABEL_1).
tagger = pipeline(
    "token-classification",
    "./checkpoint-6000",
    aggregation_strategy="first",
)
tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
max_size = 512
classes = [False, True]

def process(obj):
    # Flatten YouTube's json3 caption events into word spans with
    # absolute start ("s") and end ("e") times in milliseconds.
    o = obj["events"]
    new_l = []
    start_dur = None
    for line in o:
        if "segs" in line:
            # A lone "\n" segment closes the previous caption line; emit the
            # last pending word, using the new event's start as its end time.
            if len(line["segs"]) == 1 and line["segs"][0]["utf8"] == "\n":
                if start_dur is not None:
                    new_l.append(
                        {
                            "w": prev["utf8"],
                            "s": start_dur + prev["tOffsetMs"],
                            "e": line["tStartMs"],
                        }
                    )
                continue

            start_dur = line["tStartMs"]
            prev = line["segs"][0]
            prev["tOffsetMs"] = 0
            for word in line["segs"][1:]:
                # Segments missing "tOffsetMs" are skipped via the KeyError.
                try:
                    new_l.append(
                        {
                            "w": prev["utf8"],
                            "s": start_dur + prev["tOffsetMs"],
                            "e": start_dur + word["tOffsetMs"],
                        }
                    )
                    prev = word
                except KeyError:
                    pass

    return new_l

def get_transcript(video_id, session):
    # Use youtube_transcript_api's internals to locate the English caption
    # track and request it in json3 format.
    fetcher = TranscriptListFetcher(session)
    _json = fetcher._extract_captions_json(
        fetcher._fetch_video_html(video_id), video_id
    )
    captionTracks = _json["captionTracks"]
    transcript_track_url = ""
    for track in captionTracks:
        if track["languageCode"] == "en":
            transcript_track_url = track["baseUrl"] + "&fmt=json3"

    if not transcript_track_url:
        return None

    obj = session.get(transcript_track_url)
    p = process(obj.json())
    return p

def transcript(video_id):
    return " ".join(l["w"].strip() for l in get_transcript(video_id, requests.Session()))

def inference(transcript):
    # Tokenize word-by-word, then group words into batches whose subword
    # count stays under the model's 512-token limit.
    tokens = tokenizer(transcript.split(" "))["input_ids"]
    current_length = 0
    current_word_length = 0
    batches = []
    for i, w in enumerate(tokens):
        # Strip the <s>/</s> specials so only the word's own subwords count.
        word = w[:-1] if i == 0 else w[1:] if i == (len(tokens) - 1) else w[1:-1]
        if (current_length + len(word)) > max_size:
            batch = " ".join(
                tokenizer.batch_decode(
                    [
                        tok[1:-1]
                        for tok in tokens[max(0, i - current_word_length - 1) : i]
                    ]
                )
            )
            batches.append(batch)
            current_word_length = 0
            current_length = 0
            continue
        current_length += len(word)
        current_word_length += 1
    if current_length > 0:
        # Flush the trailing partial batch.
        batches.append(
            " ".join(
                tokenizer.batch_decode(
                    [tok[1:-1] for tok in tokens[i - current_word_length :]]
                )
            )
        )

    results = []
    for split in batches:
        values = tagger(split)
        results.extend(
            {
                "sponsor": v["entity_group"] == "LABEL_1",
                "phrase": v["word"],
            }
            for v in values
        )

    return results

def predict(transcript):
    # Shape spans for gr.HighlightedText: (text, label-or-None) pairs.
    return [(span["phrase"], "Sponsor" if span["sponsor"] else None) for span in inference(transcript)]


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            inp = gr.Textbox(label="Video ID or URL", placeholder="Video id", lines=1, max_lines=1)
            btn = gr.Button("Fetch Transcript")
            gr.Examples(["xsLJZyih3Ac"], [inp])
            text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
            btn.click(fn=transcript, inputs=inp, outputs=text)
        with gr.Column():
            p = gr.Button("Predict Sponsors")
            highlight = gr.HighlightedText()
            p.click(fn=predict, inputs=text, outputs=highlight)


demo.launch()
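For reference, here is a minimal constructed sample (an assumption about the json3 shape, not captured YouTube output) of the caption events that process() consumes, and the word spans it would produce:

# Hypothetical json3 payload: each event has an absolute start time and word
# segments whose offsets are relative to the event start.
sample = {
    "events": [
        {
            "tStartMs": 0,
            "segs": [
                {"utf8": "hello"},
                {"utf8": " world", "tOffsetMs": 400},
            ],
        },
        {"tStartMs": 900, "segs": [{"utf8": "\n"}]},
    ]
}

# process(sample) yields words with absolute start/end times in milliseconds:
# [{"w": "hello", "s": 0, "e": 400}, {"w": " world", "s": 400, "e": 900}]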
checkpoint-6000/config.json
ADDED
@@ -0,0 +1,27 @@
{
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}
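Note that this config carries no id2label mapping, so transformers falls back to its two-label default; that is why app.py compares entity_group against the generic name "LABEL_1". A quick check, assuming it is run from the repository root:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./checkpoint-6000")
print(config.num_labels)  # 2 -- the library default when id2label is absent
print(config.id2label)    # {0: 'LABEL_0', 1: 'LABEL_1'}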
checkpoint-6000/merges.txt
ADDED
The diff for this file is too large to render.
checkpoint-6000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a25ec5e11fbd8fd9a3190d9ab50a545329cf88d1516627e59fa0755d8079879d
size 992568037
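The three lines above are a Git LFS pointer, not the tensor data itself; cloning without LFS leaves exactly this text file on disk, and `git lfs pull` fetches the real blob. A minimal sketch (read_lfs_pointer is a hypothetical helper, not part of this repo) for reading the pointer metadata:

def read_lfs_pointer(path):
    # Each pointer line is "key value"; collect the pairs into a dict.
    meta = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            meta[key] = value
    return meta

# read_lfs_pointer("checkpoint-6000/optimizer.pt")
# -> {"version": "https://git-lfs.github.com/spec/v1",
#     "oid": "sha256:a25ec5e1...", "size": "992568037"}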
checkpoint-6000/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a68c8e91ef466cffc171dca01c569a2e7d2fdca14b8ec6f2077225d9f3599c91
size 496311025
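The reported size is consistent with roberta-base stored in float32: roughly 124 million parameters at 4 bytes each. A back-of-the-envelope check:

size_bytes = 496311025       # from the LFS pointer above
print(size_bytes / 4 / 1e6)  # ~124.1 million parameters, roberta-base scale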
checkpoint-6000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a0925b52aba488abe77b1d5e4ba92d66e652da1713bc029cf45fd5a009047e84
size 17563
checkpoint-6000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98aaa98d2036a0b027d755b1e9c68a82eea8d4f594070a80372a01e9478efbda
size 623
checkpoint-6000/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
checkpoint-6000/tokenizer.json
ADDED
The diff for this file is too large to render.
checkpoint-6000/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "roberta-base", "tokenizer_class": "RobertaTokenizer"}
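model_max_length is 512 here, which is the limit the max_size = 512 batching threshold in app.py stays under. A quick check, assuming it is run from the repository root:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint-6000")
print(tok.model_max_length)  # 512, matching max_size in app.py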
checkpoint-6000/trainer_state.json
ADDED
@@ -0,0 +1,96 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.5710919088766693,
  "global_step": 6000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13,
      "learning_rate": 1.973815134852056e-05,
      "loss": 0.0851,
      "step": 500
    },
    {
      "epoch": 0.26,
      "learning_rate": 1.9476302697041113e-05,
      "loss": 0.0627,
      "step": 1000
    },
    {
      "epoch": 0.39,
      "learning_rate": 1.9214454045561666e-05,
      "loss": 0.0588,
      "step": 1500
    },
    {
      "epoch": 0.52,
      "learning_rate": 1.8952605394082224e-05,
      "loss": 0.0576,
      "step": 2000
    },
    {
      "epoch": 0.65,
      "learning_rate": 1.8690756742602777e-05,
      "loss": 0.0545,
      "step": 2500
    },
    {
      "epoch": 0.79,
      "learning_rate": 1.842890809112333e-05,
      "loss": 0.055,
      "step": 3000
    },
    {
      "epoch": 0.92,
      "learning_rate": 1.8167059439643888e-05,
      "loss": 0.0545,
      "step": 3500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.04616822674870491,
      "eval_runtime": 92.6422,
      "eval_samples_per_second": 116.265,
      "eval_steps_per_second": 2.245,
      "step": 3819
    },
    {
      "epoch": 1.05,
      "learning_rate": 1.7905210788164442e-05,
      "loss": 0.0484,
      "step": 4000
    },
    {
      "epoch": 1.18,
      "learning_rate": 1.7643362136685e-05,
      "loss": 0.0452,
      "step": 4500
    },
    {
      "epoch": 1.31,
      "learning_rate": 1.7381513485205553e-05,
      "loss": 0.0434,
      "step": 5000
    },
    {
      "epoch": 1.44,
      "learning_rate": 1.711966483372611e-05,
      "loss": 0.0442,
      "step": 5500
    },
    {
      "epoch": 1.57,
      "learning_rate": 1.6857816182246664e-05,
      "loss": 0.043,
      "step": 6000
    }
  ],
  "max_steps": 38190,
  "num_train_epochs": 10,
  "total_flos": 8.152119124379443e+16,
  "trial_name": null,
  "trial_params": null
}
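The trainer state is internally consistent: max_steps 38190 over num_train_epochs 10 gives 3819 steps per epoch, matching both the eval entry at step 3819 and the fractional epoch reported at step 6000:

steps_per_epoch = 38190 // 10  # max_steps / num_train_epochs = 3819
print(6000 / steps_per_epoch)  # ~1.5711, matching the "epoch" field above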
checkpoint-6000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:789dce56778df249a9b1f337ca4c887c92633bfb5ce316ecc3291d0cd81231e6
size 2927
checkpoint-6000/vocab.json
ADDED
The diff for this file is too large to render.
requirements.txt
ADDED
@@ -0,0 +1,6 @@
transformers
youtube_transcript_api
torch
pandas
numpy
gradio