Spaces:

jonathanli
/

youtube-sponsor-detection

Runtime error

App Files Files Community

Jonathan Li committed on Sep 5, 2022

Commit

1ebc0dd

1 Parent(s): 2bceb77

Revert "Add broken streamlit (no way to mark sponsors?)"

Browse files

This reverts commit 2bceb77e414dd0e5ef1400dec9e5731109481697.

Files changed (2) hide show

app.py +65 -119
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,26 +1,17 @@
-import re
-import streamlit as st
 import requests
 from transformers import AutoTokenizer, pipeline
 from youtube_transcript_api._transcripts import TranscriptListFetcher
 tagger = pipeline(
-    "token-classification", "./checkpoint-6000", aggregation_strategy="first",
 )
 tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
 max_size = 512
 classes = [False, True]
-pattern = re.compile(
-    r"(?:https?:\/\/)?(?:[0-9A-Z-]+\.)?(?:youtube|youtu|youtube-nocookie)\.(?:com|be)\/(?:watch\?v=|watch\?.+&v=|embed\/|v\/|.+\?v=)?([^&=\n%\?]{11})"
-)
-def video_id(url):
-    p = pattern.match(url)
-    return p.group(1)
 def process(obj):
     o = obj["events"]
     new_l = []
@@ -56,7 +47,6 @@ def process(obj):
     return new_l
 def get_transcript(video_id, session):
     fetcher = TranscriptListFetcher(session)
     _json = fetcher._extract_captions_json(
@@ -75,113 +65,69 @@ def get_transcript(video_id, session):
     p = process(obj.json())
     return p
 def transcript(video_id):
-    return " ".join(
-        l["w"].strip() for l in get_transcript(video_id, requests.Session())
-    )
 def inference(transcript):
-    tokens = tokenizer(transcript.split(" "))["input_ids"]
-    current_length = 0
-    current_word_length = 0
-    batches = []
-    for i, w in enumerate(tokens):
-        word = w[:-1] if i == 0 else w[1:] if i == (len(tokens) - 1) else w[1:-1]
-        if (current_length + len(word)) > max_size:
-            batch = " ".join(
-                tokenizer.batch_decode(
-                    [
-                        tok[1:-1]
-                        for tok in tokens[max(0, i - current_word_length - 1) : i]
-                    ]
-                )
-            )
-            batches.append(batch)
-            current_word_length = 0
-            current_length = 0
-            continue
-        current_length += len(word)
-        current_word_length += 1
-    if current_length > 0:
-        batches.append(
-            " ".join(
-                tokenizer.batch_decode(
-                    [tok[1:-1] for tok in tokens[i - current_word_length :]]
-                )
-            )
-        )
-    results = []
-    for split in batches:
-        values = tagger(split)
-        results.extend(
-            {"sponsor": v["entity_group"] == "LABEL_1", "phrase": v["word"],}
-            for v in values
-        )
-    return results
 def predict(transcript):
-    return [
-        (span["phrase"], "Sponsor" if span["sponsor"] else None)
-        for span in inference(transcript)
-    ]
-st.title("reBlock (AI Sponsor Detector)")
-load_data, run_ai = st.container(), st.container()
-load_data.subheader("Load transcript:")
-run_ai.subheader("Predict sponsors:")
-if "transcript" not in st.session_state:
-    st.session_state["transcript"] = ""
-if "url" not in st.session_state:
-    st.session_state["url"] = ""
-def submit(url):
-    if url:
-        ts = transcript(video_id(url))
-        st.session_state.transcript = ts
-        st.session_state.url = url
-    else:
-        st.error(
-            "Invalid youtube url. Take a look at the examples for a supported format"
-        )
-with load_data:
-    with st.form(key="load_transcript"):
-        url = st.text_input("Youtube Video URL", key="url")
-        submitted = st.form_submit_button("Get Transcript", on_click=lambda: submit(url))
-        transcript_text_area = st.text_area("Scraped Transcript", key="transcript")
-    st.caption("Or, try an example:")
-    examples = ["youtu.be/xsLJZyih3Ac"]
-    col = st.columns(len(examples))
-    for i, example in enumerate(examples):
-        col[i] = st.button(example, on_click=lambda: submit(example))
-with run_ai:
-    with st.form(key="run_ai"):
-        submitted = st.form_submit_button("Predict Sponsors!")
-# read_transcript = st.text("Reading...")
-# with gr.Blocks() as demo:
-#     with gr.Row():
-#       with gr.Column():
-#         inp = gr.Textbox(label="Video URL", placeholder="Video url", lines=1, max_lines=1)
-#         btn = gr.Button("Fetch Transcript")
-#         gr.Examples(["youtu.be/xsLJZyih3Ac"], [inp])
-#         text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
-#         btn.click(fn=transcript, inputs=inp, outputs=text)
-#       with gr.Column():
-#         p = gr.Button("Predict Sponsors")
-#         highlight = gr.HighlightedText()
-#         p.click(fn=predict, inputs=text, outputs=highlight)
-# demo.launch()

+import gradio as gr
 import requests
 from transformers import AutoTokenizer, pipeline
 from youtube_transcript_api._transcripts import TranscriptListFetcher
 tagger = pipeline(
+    "token-classification",
+    "./checkpoint-6000",
+    aggregation_strategy="first",
 )
 tokenizer = AutoTokenizer.from_pretrained("./checkpoint-6000")
 max_size = 512
 classes = [False, True]
 def process(obj):
     o = obj["events"]
     new_l = []
     return new_l
 def get_transcript(video_id, session):
     fetcher = TranscriptListFetcher(session)
     _json = fetcher._extract_captions_json(
     p = process(obj.json())
     return p
 def transcript(video_id):
+  return " ".join(l["w"].strip() for l in get_transcript(video_id, requests.Session()))
 def inference(transcript):
+  tokens = tokenizer(transcript.split(" "))["input_ids"]
+  current_length = 0
+  current_word_length = 0
+  batches = []
+  for i, w in enumerate(tokens):
+      word = w[:-1] if i == 0 else w[1:] if i == (len(tokens) - 1) else w[1:-1]
+      if (current_length + len(word)) > max_size:
+          batch = " ".join(
+              tokenizer.batch_decode(
+                  [
+                      tok[1:-1]
+                      for tok in tokens[max(0, i - current_word_length - 1) : i]
+                  ]
+              )
+          )
+          batches.append(batch)
+          current_word_length = 0
+          current_length = 0
+          continue
+      current_length += len(word)
+      current_word_length += 1
+  if current_length > 0:
+      batches.append(
+          " ".join(
+              tokenizer.batch_decode(
+                  [tok[1:-1] for tok in tokens[i - current_word_length :]]
+              )
+          )
+      )
+  results = []
+  for split in batches:
+      values = tagger(split)
+      results.extend(
+          {
+              "sponsor": v["entity_group"] == "LABEL_1",
+              "phrase": v["word"],
+          }
+          for v in values
+      )
+  return results
 def predict(transcript):
+  return [(span["phrase"], "Sponsor" if span["sponsor"] else None) for span in inference(transcript)]
+with gr.Blocks() as demo:
+    with gr.Row():
+      with gr.Column():
+        inp = gr.Textbox(label="Video ID or URL", placeholder="Video id", lines=1, max_lines=1)
+        btn = gr.Button("Fetch Transcript")
+        gr.Examples(["xsLJZyih3Ac"], [inp])
+        text = gr.Textbox(label="Transcript", placeholder="<generated transcript>")
+        btn.click(fn=transcript, inputs=inp, outputs=text)
+      with gr.Column():
+        p = gr.Button("Predict Sponsors")
+        highlight = gr.HighlightedText()
+        p.click(fn=predict, inputs=text, outputs=highlight)
+demo.launch()

requirements.txt CHANGED Viewed

@@ -3,4 +3,4 @@ youtube_transcript_api
 torch
 pandas
 numpy
-streamlit

 torch
 pandas
 numpy
+gradio