YoutubeToSummary-awsome

Runtime error

App Files Files Community

JUNGU

DiegoLigtenberg committed on Nov 25, 2022

Commit

cada0f5

0 Parent(s):

Duplicate from DiegoLigtenberg/YoutubeToSummary

Browse files

Co-authored-by: Diego Ligtenberg <DiegoLigtenberg@users.noreply.huggingface.co>

Files changed (14) hide show

.gitattributes +34 -0
.gitignore +155 -0
README.md +14 -0
app.py +94 -0
instructions.md +15 -0
models.py +140 -0
parsarg.py +26 -0
requirements.txt +6 -0
settings.py +4 -0
utils/Dockerfile.txt +20 -0
utils/model_names.txt +7 -0
utils/model_names.yaml +42 -0
utils/models.yaml +29 -0
utils/oldmodel.py +47 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,155 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+output/
+.audio

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: YoutubeToSummary
+emoji: 🚀
+colorFrom: green
+colorTo: gray
+sdk: streamlit
+sdk_version: 1.10.0
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: DiegoLigtenberg/YoutubeToSummary
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import streamlit as st
+from models import BagOfModels, SoundToText, TextToSummary
+from settings import MODEL_PARSER
+# Parsed model configuration namespace, built once in settings.py.
+args = MODEL_PARSER
+# Page-wide Streamlit settings: browser tab title, wide layout and the text of the "About" menu entry.
+st.set_page_config(
+    page_title="TTS Applications | Incore Solutions",
+    layout="wide",
+    menu_items={
+        "About": """This is a simple GUI for OpenAI's Whisper.""",
+    },
+)
+def open_instructions():
+    with open("instructions.md", "r") as f:
+        st.write(f.read())
+# Render input type selection on the sidebar & the form
+input_type = st.sidebar.selectbox("Input Type", ["YouTube", "File"])
+with st.sidebar.form("input_form"):
+    if input_type == "YouTube":
+        youtube_url = st.text_input("Youtube URL")
+    elif input_type == "File":
+        input_file = st.file_uploader("File", type=["mp3", "wav"])
+    whisper_model = st.selectbox("Whisper model", options = [whisper for whisper in BagOfModels.get_model_names() if "whisper" in whisper] , index=1)
+    summary = st.checkbox("summarize")
+    if summary:
+        min_sum = st.number_input("Minimum words in the summary", min_value=1, step=1)
+        max_sum = min(min_sum,st.number_input("Maximum words in the summary", min_value=2, step=1))
+    st.form_submit_button(label="Save settings")
+with st.sidebar.form("save settings"):
+    transcribe = st.form_submit_button(label="Transcribe!")
+if transcribe:
+    if input_type == "YouTube":
+        if youtube_url and youtube_url.startswith("http"):
+            model = BagOfModels.load_model(whisper_model,**vars(args))
+            st.session_state.transcription = model.predict_stt(source=youtube_url,source_type=input_type,model_task="stt")
+        else:
+            st.error("Please enter a valid YouTube URL")
+            open_instructions()
+    elif input_type == "File":
+        if input_file:
+            model = BagOfModels.load_model(whisper_model,**vars(args))
+            st.session_state.transcription = model.predict_stt(source=input_file,source_type=input_type,model_task="stt")
+        else:
+            st.error("Please upload a file")
+if "transcription" in st.session_state:
+    # st.session_state.transcription.whisper()
+    # create two columns to separate page and youtube video
+    transcription_col, media_col = st.columns(2)
+    with transcription_col:
+        st.markdown("#### Audio")
+        with open(st.session_state.transcription.audio_path, "rb") as f:
+            st.audio(f.read())
+        st.markdown("---")
+        st.markdown(f"#### Transcription (whisper model - `{whisper_model}`)")
+        st.markdown(f"##### Language: `{st.session_state.transcription.language}`")
+        # Trim raw transcribed output off tokens to simplify
+        raw_output = st.expander("Raw output")
+        raw_output.markdown(st.session_state.transcription.raw_output["text"])
+        if summary:
+            summarized_output = st.expander("summarized output")
+            # CURRENTLY ONLY SUPPORTS 1024 WORD TOKENS -> TODO: FIND METHOD TO INCREASE SUMMARY FOR LONGER VIDS -> 1024 * 4 = aprox 800 words within 1024 range
+            text_summary = TextToSummary(str(st.session_state.transcription.text[:1024*4]),min_sum,max_sum).get_summary()
+            summarized_output.markdown(text_summary[0]["summary_text"])
+        # Show transcription in format with timers added to text
+        time_annotated_output = st.expander("time_annotated_output")
+        for segment in st.session_state.transcription.segments:
+            time_annotated_output.markdown(
+                f"""[{round(segment["start"], 1)} - {round(segment["end"], 1)}] - {segment["text"]}"""
+            )
+    # Show input youtube video
+    with media_col:
+        if input_type == "YouTube":
+            st.markdown("---")
+            st.markdown("#### Original YouTube Video")
+            st.video(st.session_state.transcription.source)
+else:
+    pass

instructions.md ADDED Viewed

	@@ -0,0 +1,15 @@

+## Whisper UI - Transcriptions, Summaries & Analytics
+---
+#### Run Whisper
+- Add a YouTube URL or select a local file on the left
+- Select the right Whisper model supported by your machine (extra configs have other whisper params if you want to play around with them)
+- Select whether you want to summarize the video. If so, enter a minimum and maximum length for the summary (usually between 50 and 100 words). Note that only the first 8 minutes of the video can be summarized in the current version.
+- Click Save settings.
+- Click "Transcribe"
+Once a transcription is created, it will be retained as a session variable so you can navigate around raw, summarized and time-annotated output.
+However, if you refresh or add a new video, the old transcription will be replaced.
+---

models.py ADDED Viewed

	@@ -0,0 +1,140 @@

+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+from pydub import AudioSegment
+import whisper
+from settings import MODEL_PARSER
+from pytube import YouTube
+class BagOfModels:
+    '''model            ->  is a model from hugging face
+       model_names      ->  modelnames that can be chosen from in streamlit
+       model_settinsg   ->  settings of model that can be customized by user
+    '''
+    args = MODEL_PARSER
+    barfs = 5
+    def __init__(self,model,model_names,model_settings,model_tasks, **kwargs):
+        self.model = model
+        self.model_names = model_names
+        self.model_settings = model_settings
+        self.model_tasks = model_tasks
+        self.kwargs = kwargs
+    @classmethod
+    def get_model_settings(cls):
+        bag_of_models = BagOfModels(**vars(cls.args))
+        return bag_of_models.model_settings
+    @classmethod
+    def get_model_names(cls):
+        bag_of_models = BagOfModels(**vars(cls.args))
+        return bag_of_models.model_names
+    @classmethod
+    def get_model(cls):
+        bag_of_models = BagOfModels(**vars(cls.args))
+        return bag_of_models.model
+    @classmethod
+    def get_model_tasks(cls):
+        bag_of_models = BagOfModels(**vars(cls.args))
+        return bag_of_models.model_tasks
+    @classmethod
+    def load_model(cls,model_name,**kwargs):
+        bag_of_models = BagOfModels(**vars(cls.args))
+        cls.model = bag_of_models.model
+        assert model_name in bag_of_models.model_names, f"please pick one of the available models: {bag_of_models.model_names}"
+        return Model(model_name,**cls.model[model_name])
+class Model:
+    def __init__(self,model_name,task,url,**kwargs):
+        self.url = url
+        self.model_name = model_name
+        self.name = self.url.split("https://huggingface.co/")[1]
+        self.task = task
+        self.kwargs = kwargs
+        self.init_optional_args(**self.kwargs)
+    def init_optional_args(self,year=None,description=None):
+        self._year = year
+        self._description = description
+    def predict_stt(self,source,source_type,model_task):
+        model = whisper.load_model(self.model_name.split("_")[1]) #tiny - base - medium
+        stt = SoundToText(source,source_type,model_task,model=model,tokenizer=None)
+        stt.whisper()
+        return stt
+    def predict_summary(self):
+        tokenizer = Wav2Vec2Processor.from_pretrained(self.name)
+        model = Wav2Vec2ForCTC.from_pretrained(self.name) # Note: PyTorch Model
+class Transcription():
+    '''Placeholder class; the constructor accepts its arguments and stores nothing.'''
+    def __init__(self,model,source,source_type) -> None:
+        pass
+class SoundToText():
+    def __init__(self,source,source_type,model_task,model,tokenizer=None):
+        self.source = source
+        self.source_type = source_type
+        self.model = model
+        self.model_task = model_task
+        self.tokenizer = tokenizer
+    def wav2vec(self,size):
+        pass
+    def wav2vec2(self,size):
+        pass
+    def whisper(self):
+        # download youtube url
+        if self.source_type == "YouTube":
+            self.audio_path = YouTube(self.source).streams.get_by_itag(140).download("output/", filename="audio")
+        # if self.source_type == "File":
+        #     audio = None
+        #     if self.source.name.endswith('.wav'): audio = AudioSegment.from_wav(self.source)
+        #     elif self.source.name.endswith('.mp3'): audio = AudioSegment.from_mp3(self.source)
+        #     audio.export('output/audio.wav', format='wav')
+        #     self.audio_path = "output/audio.wav"
+        model = whisper.load_model("base")
+        self.raw_output = model.transcribe(self.audio_path,verbose=True)
+        self.text = self.raw_output["text"]
+        self.language = self.raw_output["language"]
+        self.segments = self.raw_output["segments"]
+        # Remove token ids from the output
+        for segment in self.segments:
+            del segment["tokens"]
+        self.transcribed = True
+class TextToSummary():
+    def __init__(self,input_text,min_length,max_length):
+        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        self.summary_input = input_text
+        self.summary_output = (self.summarizer(self.summary_input, min_length=min_length, max_length=max_length, do_sample=False))
+    def get_summary(self):
+        return self.summary_output
+    def wav2vec(self):
+        pass
+def record(model_name):
+    '''Load the configured model ``model_name`` and call ``predict()`` on it.
+    NOTE(review): ``Model`` as defined in this file only has ``predict_stt`` and
+    ``predict_summary``; the ``predict()`` call below looks like it raises
+    AttributeError - confirm which method was intended.
+    '''
+    args = MODEL_PARSER
+    models = BagOfModels.get_model_names()  # not used below
+    tasks = BagOfModels.get_model_tasks()  # not used below
+    whisper_base = BagOfModels.load_model(model_name,**vars(args))
+    whisper_base.predict()
+# Manual smoke run: load the "whisper_base" entry and start a transcription.
+# NOTE(review): predict_stt() is declared with three required parameters
+# (source, source_type, model_task); called without arguments as below it looks
+# like it raises TypeError - confirm the intended arguments.
+if __name__== "__main__":
+    args = MODEL_PARSER
+    models = BagOfModels.get_model_names()  # not used below
+    tasks = BagOfModels.get_model_tasks()  # not used below
+    whisper_base = BagOfModels.load_model("whisper_base",**vars(args))
+    whisper_base.predict_stt()

parsarg.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import argparse
+import yaml
+def model_parser_args():
+    with open(r'utils/models.yaml') as f:
+        settings = yaml.full_load(f)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", help="see model_settings.yaml",default=settings)
+    parser.add_argument("--model_names", help="see model_settings.yaml",default=list(settings))
+    setting_list = []
+    task_list = []
+    for i in range(len(settings)):
+        setting_list.append(list(settings[list(settings.keys())[i]].keys()))
+    for model in (list(settings.keys())):
+        task = (settings[model]["task"])
+        if task not in task_list:task_list.append(task)
+    setting_list = ([setting for sublist in setting_list for setting in sublist]) # generate all sublists
+    setting_list = [x for i, x in enumerate(setting_list) if x not in setting_list[:i]] # remain order of sublists
+    parser.add_argument("--model_settings",help="see model_settings.yaml",default=setting_list)
+    parser.add_argument("--model_tasks",help="see model_settings.yaml",default=task_list)
+    parser=parser.parse_args()
+    return parser
+# Running this module directly just builds the namespace once; the result is discarded.
+if __name__ == "__main__":
+    model_parser_args()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pydub==0.25.1
+pytube==12.1.0
+PyYAML==6.0
+streamlit==1.13.0
+transformers==4.23.1
+git+https://github.com/openai/whisper.git

settings.py ADDED Viewed

	@@ -0,0 +1,4 @@


1	+ from parsarg import model_parser_args
2	+
3	+ MODEL_PARSER = model_parser_args()  # built once at import time; imported by app.py and models.py
4	+

utils/Dockerfile.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+FROM python:3.9
+WORKDIR /app
+COPY requirements.txt ./requirements.txt
+# ffmpeg is required by Whisper (and by pydub for mp3 input) to decode audio.
+RUN apt-get update \
+        && apt-get install ffmpeg libportaudio2 libportaudiocpp0 portaudio19-dev libsndfile1-dev -y \
+        && pip3 install pyaudio
+RUN pip install -r requirements.txt
+EXPOSE 8501
+WORKDIR /src
+COPY . /src
+ENTRYPOINT ["streamlit", "run"]
+# The path is resolved relative to WORKDIR /src; the Streamlit entry script of this project is app.py.
+CMD ["app.py"]

utils/model_names.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+INSERT Hugging face models
+1) Insert tokenizer model name
+2) Insert space
+3) Insert huggingface link to model name
+speech_to_text
+facebook/wav2vec2-base-960h https://huggingface.co/facebook/wav2vec2-base-960h

utils/model_names.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+# models that generate text from audio data.
+model_task: # model task
+  speech_to_text:
+    model_name: # model name
+      wav2vec:
+        model_size: # model size
+          base:
+            name: facebook/wav2vec2-base-960h
+            url: https://huggingface.co/facebook/wav2vec2-base-960h
+            year: 2020
+      whisper:
+        model_size:
+          tiny:
+            name: openai/whisper-tiny
+            url: https://huggingface.co/openai/whisper-tiny
+            year: 2022
+          base:
+            name: openai/whisper-base
+            url: https://huggingface.co/openai/whisper-base
+            year: 2022
+          medium:
+            name: openai/whisper-medium
+            url: https://huggingface.co/openai/whisper-medium
+            year: 2022
+  # models that generate summaries from text data.
+  text_to_summary:
+    model_name:
+      bert:
+        model_size:
+          large:
+            name: facebook/bart-large-cnn
+            url: https://huggingface.co/facebook/bart-large-cnn
+            year: 2019
+            fbs: 31231

utils/models.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+# models that generate text from audio data.
+wav2vec:
+  task: speech_to_text
+  url: https://huggingface.co/facebook/wav2vec2-base-960h
+wav2vec2:
+  task: speech_to_text
+  url: https://huggingface.co/yongjian/wav2vec2-large-a
+whisper_tiny:
+  task: speech_to_text
+  url: https://huggingface.co/openai/whisper-tiny
+  description: "this is the smallest whisper model that will be used for cloud deployment"
+  year: 2022
+whisper_base:
+  task: speech_to_text
+  url: https://huggingface.co/openai/whisper-base
+  year: 2022
+whisper_medium:
+  task: speech_to_text
+  url: https://huggingface.co/openai/whisper-medium
+  year: 2022
+# models that generate summaries from text data.
+bart_large:
+  task: text_to_summary
+  url: https://huggingface.co/facebook/bart-large-cnn
+  year: 2022

utils/oldmodel.py ADDED Viewed

	@@ -0,0 +1,47 @@

+'''
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import speech_recognition as sr
+import io
+from pydub import AudioSegment
+import librosa
+import whisper
+from scipy.io import wavfile
+from test import record_voice
+model = Wav2Vec2ForCTC.from_pretrained(r'yongjian/wav2vec2-large-a') # Note: PyTorch Model
+tokenizer = Wav2Vec2Processor.from_pretrained(r'yongjian/wav2vec2-large-a')
+r = sr.Recognizer()
+from transformers import pipeline
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+with sr.Microphone(sample_rate=16000) as source:
+    print("You can start speaking now")
+    record_voice()
+    x,_ = librosa.load("output.wav")
+    model_inputs = tokenizer(x, sampling_rate=16000, return_tensors="pt", padding=True)
+    logits = model(model_inputs.input_values, attention_mask=model_inputs.attention_mask).logits.cuda() # use .cuda() for GPU acceleration
+    pred_ids = torch.argmax(logits, dim=-1).cpu()
+    pred_text = tokenizer.batch_decode(pred_ids)
+    print(x[:10],x.shape)
+    print('Transcription:', pred_text)
+    model = whisper.load_model("base")
+    result = model.transcribe("output.wav")
+    print(result["text"])
+    summary_input = result["text"]
+    summary_output = (summarizer(summary_input, max_length=30, min_length=20, do_sample=False))
+    print(summary_output)
+    with open("raw_text.txt",'w',encoding = 'utf-8') as f:
+        f.write(summary_input)
+        f.close()
+    with open("summary_text.txt",'w',encoding = 'utf-8') as f:
+        f.write(summary_output[0]["summary_text"])
+        f.close()
+'''