wldmr committed on
Commit
42a2568
1 Parent(s): 9b150b6
Files changed (6)
  1. app.py +94 -4
  2. myrpunct/__init__.py +2 -0
  3. myrpunct/punctuate.py +174 -0
  4. myrpunct/utils.py +34 -0
  5. requirements.txt +5 -0
  6. sample.srt +20 -0
app.py CHANGED
@@ -1,7 +1,97 @@
+from myrpunct import RestorePuncts
+from youtube_transcript_api import YouTubeTranscriptApi
 import gradio as gr
+import re
 
-def greet(name):
-    return "Hello " + name + "!!"
-
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()
+def get_srt(input_link):
+    if "v=" in input_link:
+        video_id = input_link.split("v=")[1]
+    else:
+        return "Error: Invalid link, it does not contain the pattern 'v='."
+    print("video_id: ", video_id)
+    transcript_raw = YouTubeTranscriptApi.get_transcript(video_id)
+    transcript_text = '\n'.join([i['text'] for i in transcript_raw])
+    return transcript_text
+
+def predict(input_text, input_file, input_link, input_checkbox):
+
+    if input_checkbox == "File" and input_file is not None:
+        print("Input File ...")
+        with open(input_file.name) as file:
+            input_file_read = file.read()
+        return run_predict(input_file_read)
+    elif input_checkbox == "Text" and len(input_text) > 0:
+        print("Input Text ...")
+        return run_predict(input_text)
+    elif input_checkbox == "Link" and len(input_link) > 0:
+        print("Input Link ...", input_link)
+        input_link_text = get_srt(input_link)
+        if "Error" in input_link_text:
+            return input_link_text
+        else:
+            return run_predict(input_link_text)
+    else:
+        return "Error: Please provide either an input text or file and select an option accordingly."
+
+def run_predict(input_text):
+    rpunct = RestorePuncts()
+    output_text = rpunct.punctuate(input_text)
+    print("Punctuation finished...")
+
+    # restore the carriage returns
+    srt_file = input_text
+    punctuated = output_text
+
+    srt_file_strip = srt_file.strip()
+    srt_file_sub = re.sub(r'\s*\n\s*', '# ', srt_file_strip)
+    srt_file_array = srt_file_sub.split(' ')
+    pcnt_file_array = punctuated.split(' ')
+
+    # goal: restore the break points, i.e. the same number of lines as the srt file
+    # this is necessary because each line in the srt file corresponds to a frame from the video
+    if len(srt_file_array) != len(pcnt_file_array):
+        return f"AssertError: The transcript and the punctuated text should have the same length: {len(srt_file_array)}, {len(pcnt_file_array)}"
+    pcnt_file_array_hash = []
+    for idx, item in enumerate(srt_file_array):
+        if item.endswith('#'):
+            pcnt_file_array_hash.append(pcnt_file_array[idx] + '#')
+        else:
+            pcnt_file_array_hash.append(pcnt_file_array[idx])
+
+    # assemble the array back into a string
+    pcnt_file_cr = ' '.join(pcnt_file_array_hash).replace('#', '\n')
+
+    return pcnt_file_cr
+
+if __name__ == "__main__":
+
+    title = "Rpunct Gradio App"
+    description = """
+    <b>Description</b>: <br>
+    The model restores punctuation, i.e. the marks [! ? . , - : ; '], as well as the upper-casing of words. <br>
+    <b>Usage</b>: <br>
+    There are three input types: free text, an uploaded file, or a YouTube video link. <br>
+    Because all three inputs can be provided at the same time, <br>
+    the user has to decide which input type should be processed.
+    """
+    article = "Model by [felflare](https://huggingface.co/felflare/bert-restore-punctuation)"
+
+    sample_link = "https://www.youtube.com/watch?v=6MI0f6YjJIk"
+
+    examples = [["my name is clara and i live in berkeley california", "sample.srt", sample_link, "Text"]]
+
+    interface = gr.Interface(fn=predict,
+                             inputs=["text", "file", "text", gr.Radio(["Text", "File", "Link"], type="value", label='Input Type')],
+                             outputs=["text"],
+                             title=title,
+                             description=description,
+                             article=article,
+                             examples=examples,
+                             allow_flagging="never")
+
+    interface.launch()
+
+    # save flagging to a hf dataset
+    # https://github.com/gradio-app/gradio/issues/914
+    # the best option here is to use a Hugging Face dataset as the storage for flagged data; check out the HuggingFaceDatasetSaver() flagging handler, which allows you to do that easily.
+    # Here is an example Space that uses this: https://huggingface.co/spaces/abidlabs/crowd-speech
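To illustrate the line-break restoration in run_predict above, here is a minimal, self-contained sketch of the '#' placeholder trick; the two sample strings are invented for the demonstration:

import re

srt_text = "in 2018 cornell researchers built a\nhigh-powered detector"
punctuated = "In 2018, Cornell researchers built a high-powered detector."

# mark the original line breaks with '#', then split both texts into words
marked = re.sub(r'\s*\n\s*', '# ', srt_text.strip()).split(' ')
punct_words = punctuated.split(' ')
assert len(marked) == len(punct_words)  # the word counts must line up

# copy each '#' marker onto the corresponding punctuated word, then restore the newlines
restored = ' '.join(
    w + '#' if m.endswith('#') else w
    for m, w in zip(marked, punct_words)
).replace('#', '\n')
print(restored)  # the punctuated text, broken at the original line positions

This only works because the model changes punctuation and casing but never the number of whitespace-separated tokens, which is exactly what the length check in run_predict guards.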
myrpunct/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .punctuate import RestorePuncts
+print("init executed ...")
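Because __init__.py re-exports RestorePuncts, callers can import it from the package root, as app.py does. A minimal usage sketch (the sample sentence mirrors the app's example; the exact output depends on the model):

from myrpunct import RestorePuncts

rpunct = RestorePuncts()
print(rpunct.punctuate("my name is clara and i live in berkeley california"))
# expected output along the lines of: "My name is Clara and I live in Berkeley, California."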
myrpunct/punctuate.py ADDED
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+# 💾⚙️🔮
+
+__author__ = "Daulet N."
+__email__ = "daulet.nurmanbetov@gmail.com"
+
+import logging
+from langdetect import detect
+from simpletransformers.ner import NERModel, NERArgs
+
+
+class RestorePuncts:
+    def __init__(self, wrds_per_pred=250, use_cuda=False):
+        self.wrds_per_pred = wrds_per_pred
+        self.overlap_wrds = 30
+        self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
+        self.model_hf = "wldmr/felflare-bert-restore-punctuation"
+        self.model_args = NERArgs()
+        self.model_args.silent = True
+        self.model_args.max_seq_length = 512
+        #self.model_args.use_multiprocessing = False
+        self.model = NERModel("bert", self.model_hf, labels=self.valid_labels, use_cuda=use_cuda, args=self.model_args)
+        #self.model = NERModel("bert", self.model_hf, labels=self.valid_labels, use_cuda=use_cuda, args={"silent": True, "max_seq_length": 512, "use_multiprocessing": False})
+        print("class init ...")
+        print("use_multiprocessing: ", self.model_args.use_multiprocessing)
+
+    def status(self):
+        print("function called")
+
+    def punctuate(self, text: str, lang: str = ''):
+        """
+        Performs punctuation restoration on arbitrarily large text.
+        Detects if the input is not English; if non-English text is detected, terminates predictions.
+        Override the detection by supplying `lang='en'`.
+
+        Args:
+        - text (str): Text to punctuate; can be a few words or arbitrarily large.
+        - lang (str): Explicit language of the input text.
+        """
+        if not lang and len(text) > 10:
+            lang = detect(text)
+        if lang != 'en':
+            raise Exception(f"""Non-English text detected. Restore Punctuation works only for English.
+            If you are certain the input is English, pass the argument lang='en' to this function.
+            Punctuate received: {text}""")
+
+        # split up large text into BERT-digestible chunks
+        splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
+        # predict slices
+        # full_preds_lst contains tuples of labels and logits
+        full_preds_lst = [self.predict(i['text']) for i in splits]
+        # extract predictions, and discard logits
+        preds_lst = [i[0][0] for i in full_preds_lst]
+        # join text slices
+        combined_preds = self.combine_results(text, preds_lst)
+        # create punctuated prediction
+        punct_text = self.punctuate_texts(combined_preds)
+        return punct_text
+
+    def predict(self, input_slice):
+        """
+        Passes the unpunctuated text to the model for punctuation.
+        """
+        predictions, raw_outputs = self.model.predict([input_slice])
+        return predictions, raw_outputs
+
+    @staticmethod
+    def split_on_toks(text, length, overlap):
+        """
+        Splits text into predefined slices of overlapping text with indexes (offsets)
+        that tie back to the original text.
+        This is done to bypass the 512-token limit of transformer models by sequentially
+        feeding chunks of < 512 tokens.
+        Example output:
+        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
+        """
+        wrds = text.replace('\n', ' ').split(" ")
+        resp = []
+        lst_chunk_idx = 0
+        i = 0
+
+        while True:
+            # words in the chunk and the overlapping portion
+            wrds_len = wrds[(length * i):(length * (i + 1))]
+            wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
+            wrds_split = wrds_len + wrds_ovlp
+
+            # break loop if no more words
+            if not wrds_split:
+                break
+
+            wrds_str = " ".join(wrds_split)
+            nxt_chunk_start_idx = len(" ".join(wrds_len))
+            lst_char_idx = len(" ".join(wrds_split))
+
+            resp_obj = {
+                "text": wrds_str,
+                "start_idx": lst_chunk_idx,
+                "end_idx": lst_char_idx + lst_chunk_idx,
+            }
+
+            resp.append(resp_obj)
+            lst_chunk_idx += nxt_chunk_start_idx + 1
+            i += 1
+        logging.info(f"Sliced transcript into {len(resp)} slices.")
+        return resp
+
+    @staticmethod
+    def combine_results(full_text: str, text_slices):
+        """
+        Given the full text and the predictions for each slice, combines the predictions into a single text again.
+        Performs validation of whether the text was combined correctly.
+        """
+        split_full_text = full_text.replace('\n', ' ').split(" ")
+        split_full_text = [i for i in split_full_text if i]
+        split_full_text_len = len(split_full_text)
+        output_text = []
+        index = 0
+
+        if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
+            text_slices = text_slices[:-1]
+
+        for _slice in text_slices:
+            slice_wrds = len(_slice)
+            for ix, wrd in enumerate(_slice):
+                # print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
+                if index == split_full_text_len:
+                    break
+
+                if split_full_text[index] == str(list(wrd.keys())[0]) and \
+                        ix <= slice_wrds - 3 and text_slices[-1] != _slice:
+                    index += 1
+                    pred_item_tuple = list(wrd.items())[0]
+                    output_text.append(pred_item_tuple)
+                elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
+                    index += 1
+                    pred_item_tuple = list(wrd.items())[0]
+                    output_text.append(pred_item_tuple)
+        assert [i[0] for i in output_text] == split_full_text
+        return output_text
+
+    @staticmethod
+    def punctuate_texts(full_pred: list):
+        """
+        Given a list of predictions from the model, applies the predictions to the text,
+        thus punctuating it.
+        """
+        punct_resp = ""
+        for i in full_pred:
+            word, label = i
+            if label[-1] == "U":
+                punct_wrd = word.capitalize()
+            else:
+                punct_wrd = word
+
+            if label[0] != "O":
+                punct_wrd += label[0]
+
+            punct_resp += punct_wrd + " "
+        punct_resp = punct_resp.strip()
+        # append a trailing period if one doesn't exist
+        if punct_resp[-1].isalnum():
+            punct_resp += "."
+        return punct_resp
+
+
+if __name__ == "__main__":
+    punct_model = RestorePuncts()
+    # read test file
+    with open('../tests/sample_text.txt', 'r') as fp:
+        test_sample = fp.read()
+    # predict text and print
+    punctuated = punct_model.punctuate(test_sample)
+    print(punctuated)
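To make the label scheme in punctuate_texts concrete: each word receives a two-character label whose last character ('U' or 'O') says whether to capitalize the word, and whose first character (a punctuation mark or 'O') says what, if anything, to append. A tiny sketch with made-up predictions, not actual model output:

preds = [("my", "OU"), ("name", "OO"), ("is", "OO"), ("clara", ".U")]

text = ""
for word, label in preds:
    w = word.capitalize() if label[-1] == "U" else word  # 'U' means upper-case the word
    if label[0] != "O":                                  # a non-'O' first char is punctuation to append
        w += label[0]
    text += w + " "
print(text.strip())  # My name is Clara.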
myrpunct/utils.py ADDED
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# 💾⚙️🔮
+
+__author__ = "Daulet N."
+__email__ = "daulet.nurmanbetov@gmail.com"
+
+def prepare_unpunct_text(text):
+    """
+    Given a text, normalizes it to subsequently restore punctuation
+    """
+    formatted_txt = text.replace('\n', '').strip()
+    formatted_txt = formatted_txt.lower()
+    formatted_txt_lst = formatted_txt.split(" ")
+    punct_strp_txt = [strip_punct(i) for i in formatted_txt_lst]
+    normalized_txt = " ".join([i for i in punct_strp_txt if i])
+    return normalized_txt
+
+def strip_punct(wrd):
+    """
+    Given a word, strips non-alphanumeric characters that precede and follow it
+    """
+    if not wrd:
+        return wrd
+
+    while not wrd[-1:].isalnum():
+        if not wrd:
+            break
+        wrd = wrd[:-1]
+
+    while not wrd[:1].isalnum():
+        if not wrd:
+            break
+        wrd = wrd[1:]
+    return wrd
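A quick sketch of what these helpers do, with invented inputs (note that utils.py is not wired into the app in this commit):

from myrpunct.utils import prepare_unpunct_text, strip_punct

print(strip_punct("...hello!!"))                            # hello
print(prepare_unpunct_text("Hello, World! How are you?"))   # hello world how are you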
requirements.txt ADDED
@@ -0,0 +1,5 @@
+transformers
+torch
+langdetect
+simpletransformers
+youtube_transcript_api
sample.srt ADDED
@@ -0,0 +1,20 @@
+in 2018 cornell researchers built a
+high-powered detector that in combination
+with an algorithm-driven process called
+ptychography set a world record by tripling
+the resolution of a state-of-the-art electron
+microscope as successful as it was that approach
+had a weakness it only worked with ultrathin
+samples that were a few atoms thick anything
+thicker would cause the electrons to scatter
+in ways that could not be disentangled now a
+team again led by
+david muller
+the samuel beckert professor of engineering
+has bested its own
+record by a factor of two with an electron
+microscope pixel array detector empad that
+incorporates even more sophisticated 3d
+reconstruction algorithms the resolution is so
+fine-tuned the only blurring that remains is
+the thermal jiggling of the atoms themselves