JUNGU DiegoLigtenberg committed

Commit cada0f5 · 0 Parent(s)

Duplicate from DiegoLigtenberg/YoutubeToSummary

Co-authored-by: Diego Ligtenberg <DiegoLigtenberg@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,155 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ output/
+ .audio
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: YoutubeToSummary
+ emoji: 🚀
+ colorFrom: green
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: DiegoLigtenberg/YoutubeToSummary
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,94 @@
+ import streamlit as st
+ from models import BagOfModels, SoundToText, TextToSummary
+ from settings import MODEL_PARSER
+
+ args = MODEL_PARSER
+
+ st.set_page_config(
+     page_title="TTS Applications | Incore Solutions",
+     layout="wide",
+     menu_items={
+         "About": """This is a simple GUI for OpenAI's Whisper.""",
+     },
+ )
+
+ def open_instructions():
+     with open("instructions.md", "r") as f:
+         st.write(f.read())
+
+ # Render the input-type selection and the settings form on the sidebar
+ input_type = st.sidebar.selectbox("Input Type", ["YouTube", "File"])
+
+ with st.sidebar.form("input_form"):
+     if input_type == "YouTube":
+         youtube_url = st.text_input("YouTube URL")
+     elif input_type == "File":
+         input_file = st.file_uploader("File", type=["mp3", "wav"])
+
+     whisper_model = st.selectbox(
+         "Whisper model",
+         options=[name for name in BagOfModels.get_model_names() if "whisper" in name],
+         index=1,
+     )
+
+     summary = st.checkbox("Summarize")
+     if summary:
+         min_sum = st.number_input("Minimum words in the summary", min_value=1, step=1)
+         # Clamp so the maximum can never drop below the minimum
+         max_sum = max(min_sum, st.number_input("Maximum words in the summary", min_value=2, step=1))
+     st.form_submit_button(label="Save settings")
+
+ with st.sidebar.form("transcribe_form"):
+     transcribe = st.form_submit_button(label="Transcribe!")
+
+ if transcribe:
+     if input_type == "YouTube":
+         if youtube_url and youtube_url.startswith("http"):
+             model = BagOfModels.load_model(whisper_model, **vars(args))
+             st.session_state.transcription = model.predict_stt(source=youtube_url, source_type=input_type, model_task="stt")
+         else:
+             st.error("Please enter a valid YouTube URL")
+             open_instructions()
+     elif input_type == "File":
+         if input_file:
+             model = BagOfModels.load_model(whisper_model, **vars(args))
+             st.session_state.transcription = model.predict_stt(source=input_file, source_type=input_type, model_task="stt")
+         else:
+             st.error("Please upload a file")
+
+ if "transcription" in st.session_state:
+     # Two columns: transcription output on the left, the source video on the right
+     transcription_col, media_col = st.columns(2)
+
+     with transcription_col:
+         st.markdown("#### Audio")
+         with open(st.session_state.transcription.audio_path, "rb") as f:
+             st.audio(f.read())
+         st.markdown("---")
+         st.markdown(f"#### Transcription (whisper model - `{whisper_model}`)")
+         st.markdown(f"##### Language: `{st.session_state.transcription.language}`")
+
+         # Raw transcribed output, with token ids already stripped
+         raw_output = st.expander("Raw output")
+         raw_output.markdown(st.session_state.transcription.raw_output["text"])
+
+         if summary:
+             summarized_output = st.expander("Summarized output")
+             # The summarizer currently accepts at most 1024 input tokens, so only the
+             # first 1024 * 4 characters (roughly 800 words) are summarized.
+             # TODO: find a method to summarize longer videos.
+             text_summary = TextToSummary(str(st.session_state.transcription.text[:1024 * 4]), min_sum, max_sum).get_summary()
+             summarized_output.markdown(text_summary[0]["summary_text"])
+
+         # Transcription with start/end timestamps per segment
+         time_annotated_output = st.expander("Time-annotated output")
+         for segment in st.session_state.transcription.segments:
+             time_annotated_output.markdown(
+                 f"""[{round(segment["start"], 1)} - {round(segment["end"], 1)}] - {segment["text"]}"""
+             )
+
+     # Show the input YouTube video
+     with media_col:
+         if input_type == "YouTube":
+             st.markdown("---")
+             st.markdown("#### Original YouTube Video")
+             st.video(st.session_state.transcription.source)
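The TODO above caps summarization at the first `1024 * 4` characters because `facebook/bart-large-cnn` accepts at most 1024 input tokens. A minimal sketch of one way to lift that cap by summarizing fixed-size chunks; `summarize_long` and its `chunk_chars` default are illustrative and not part of this Space:

```python
from transformers import pipeline

def summarize_long(text: str, chunk_chars: int = 1024 * 4,
                   min_length: int = 50, max_length: int = 100) -> str:
    """Summarize an arbitrarily long transcript chunk by chunk.

    chunk_chars ~ 4096 characters stays safely under BART's 1024-token input limit.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partial = [
        summarizer(c, min_length=min_length, max_length=max_length, do_sample=False)[0]["summary_text"]
        for c in chunks
    ]
    # Join the per-chunk summaries; for very long videos the joined text could
    # itself be summarized again in a second pass.
    return " ".join(partial)
```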
instructions.md ADDED
@@ -0,0 +1,15 @@
+ ## Whisper UI - Transcriptions, Summaries & Analytics
+
+ ---
+
+ #### Run Whisper
+ - Add a YouTube URL or select a local file on the left.
+ - Select a Whisper model that your machine can run (the extra configs expose other Whisper parameters if you want to experiment with them).
+ - Choose whether to summarize the video. If so, enter a minimum and maximum length for the summary (usually between 50 and 100 words). Note that only the first 8 minutes of the video can be summarized in the current version.
+ - Click "Save settings".
+ - Click "Transcribe!"
+
+ Once a transcription is created, it is retained as a session variable, so you can navigate between the raw, summarized and time-annotated output.
+ However, if you refresh the page or add a new video, the old transcription is replaced.
+
+ ---
models.py ADDED
@@ -0,0 +1,140 @@
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
+ from pydub import AudioSegment
+ import whisper
+ from settings import MODEL_PARSER
+ from pytube import YouTube
+
+ class BagOfModels:
+     '''model -> a model from Hugging Face
+     model_names -> model names that can be chosen from in Streamlit
+     model_settings -> settings of the model that can be customized by the user
+     '''
+     args = MODEL_PARSER
+
+     def __init__(self, model, model_names, model_settings, model_tasks, **kwargs):
+         self.model = model
+         self.model_names = model_names
+         self.model_settings = model_settings
+         self.model_tasks = model_tasks
+         self.kwargs = kwargs
+
+     @classmethod
+     def get_model_settings(cls):
+         bag_of_models = BagOfModels(**vars(cls.args))
+         return bag_of_models.model_settings
+
+     @classmethod
+     def get_model_names(cls):
+         bag_of_models = BagOfModels(**vars(cls.args))
+         return bag_of_models.model_names
+
+     @classmethod
+     def get_model(cls):
+         bag_of_models = BagOfModels(**vars(cls.args))
+         return bag_of_models.model
+
+     @classmethod
+     def get_model_tasks(cls):
+         bag_of_models = BagOfModels(**vars(cls.args))
+         return bag_of_models.model_tasks
+
+     @classmethod
+     def load_model(cls, model_name, **kwargs):
+         bag_of_models = BagOfModels(**vars(cls.args))
+         cls.model = bag_of_models.model
+         assert model_name in bag_of_models.model_names, f"please pick one of the available models: {bag_of_models.model_names}"
+         return Model(model_name, **cls.model[model_name])
+
+
+ class Model:
+     def __init__(self, model_name, task, url, **kwargs):
+         self.url = url
+         self.model_name = model_name
+         self.name = self.url.split("https://huggingface.co/")[1]
+         self.task = task
+         self.kwargs = kwargs
+         self.init_optional_args(**self.kwargs)
+
+     def init_optional_args(self, year=None, description=None):
+         self._year = year
+         self._description = description
+
+     def predict_stt(self, source, source_type, model_task):
+         # model_name looks like "whisper_base" -> load that checkpoint (tiny / base / medium)
+         model = whisper.load_model(self.model_name.split("_")[1])
+         stt = SoundToText(source, source_type, model_task, model=model, tokenizer=None)
+         stt.whisper()
+         return stt
+
+     def predict_summary(self):
+         # Unfinished stub: loads a Wav2Vec2 PyTorch model/processor but does not summarize yet.
+         tokenizer = Wav2Vec2Processor.from_pretrained(self.name)
+         model = Wav2Vec2ForCTC.from_pretrained(self.name)
+
+ class Transcription:
+     def __init__(self, model, source, source_type) -> None:
+         pass
+
+ class SoundToText:
+     def __init__(self, source, source_type, model_task, model, tokenizer=None):
+         self.source = source
+         self.source_type = source_type
+         self.model = model
+         self.model_task = model_task
+         self.tokenizer = tokenizer
+
+     def wav2vec(self, size):
+         pass
+
+     def wav2vec2(self, size):
+         pass
+
+     def whisper(self):
+         # Download the audio stream of the YouTube video (itag 140 = m4a audio)
+         if self.source_type == "YouTube":
+             self.audio_path = YouTube(self.source).streams.get_by_itag(140).download("output/", filename="audio")
+
+         # Convert an uploaded file to wav so Whisper can read it
+         if self.source_type == "File":
+             audio = None
+             if self.source.name.endswith('.wav'): audio = AudioSegment.from_wav(self.source)
+             elif self.source.name.endswith('.mp3'): audio = AudioSegment.from_mp3(self.source)
+             audio.export('output/audio.wav', format='wav')
+             self.audio_path = "output/audio.wav"
+
+         # Use the model selected in predict_stt instead of reloading "base" here
+         self.raw_output = self.model.transcribe(self.audio_path, verbose=True)
+
+         self.text = self.raw_output["text"]
+         self.language = self.raw_output["language"]
+         self.segments = self.raw_output["segments"]
+
+         # Remove token ids from the output
+         for segment in self.segments:
+             del segment["tokens"]
+
+         self.transcribed = True
+
+ class TextToSummary:
+     def __init__(self, input_text, min_length, max_length):
+         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+         self.summary_input = input_text
+         self.summary_output = self.summarizer(self.summary_input, min_length=min_length, max_length=max_length, do_sample=False)
+
+     def get_summary(self):
+         return self.summary_output
+
+     def wav2vec(self):
+         pass
+
+ def record(model_name):
+     args = MODEL_PARSER
+     models = BagOfModels.get_model_names()
+     tasks = BagOfModels.get_model_tasks()
+     whisper_base = BagOfModels.load_model(model_name, **vars(args))
+     # predict_stt needs a source, e.g.:
+     # whisper_base.predict_stt(source=<youtube_url>, source_type="YouTube", model_task="stt")
+
+ if __name__ == "__main__":
+     args = MODEL_PARSER
+     models = BagOfModels.get_model_names()
+     tasks = BagOfModels.get_model_tasks()
+     whisper_base = BagOfModels.load_model("whisper_base", **vars(args))
+     # predict_stt needs a source, e.g.:
+     # whisper_base.predict_stt(source=<youtube_url>, source_type="YouTube", model_task="stt")
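The same pipeline can also be driven headlessly, using only the classes above. A minimal sketch; the YouTube URL is a placeholder to substitute, not a value from this repo:

```python
from models import BagOfModels, TextToSummary
from settings import MODEL_PARSER

args = MODEL_PARSER
model = BagOfModels.load_model("whisper_base", **vars(args))

# <video-id> is a placeholder for a real YouTube video id
stt = model.predict_stt(source="https://www.youtube.com/watch?v=<video-id>",
                        source_type="YouTube", model_task="stt")
print(stt.language)
print(stt.text)

# Summarize the first ~1024 tokens of the transcript, as app.py does
summary = TextToSummary(stt.text[:1024 * 4], min_length=50, max_length=100).get_summary()
print(summary[0]["summary_text"])
```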
parsarg.py ADDED
@@ -0,0 +1,26 @@
+ import argparse
+ import yaml
+
+ def model_parser_args():
+     with open(r'utils/models.yaml') as f:
+         settings = yaml.full_load(f)
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model", help="see utils/models.yaml", default=settings)
+     parser.add_argument("--model_names", help="see utils/models.yaml", default=list(settings))
+     setting_list = []
+     task_list = []
+     # Collect the setting keys of every model (e.g. task, url, description, year)
+     for model_settings in settings.values():
+         setting_list.append(list(model_settings.keys()))
+     # Collect the distinct task labels
+     for model in settings:
+         task = settings[model]["task"]
+         if task not in task_list:
+             task_list.append(task)
+     setting_list = [setting for sublist in setting_list for setting in sublist]  # flatten the per-model lists
+     setting_list = [x for i, x in enumerate(setting_list) if x not in setting_list[:i]]  # deduplicate, preserving order
+     parser.add_argument("--model_settings", help="see utils/models.yaml", default=setting_list)
+     parser.add_argument("--model_tasks", help="see utils/models.yaml", default=task_list)
+     return parser.parse_args()
+
+ if __name__ == "__main__":
+     model_parser_args()
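`model_parser_args()` reads `utils/models.yaml` (shown later in this diff) and exposes its contents as argparse defaults rather than real CLI flags. Assuming the six model entries from that file, the resulting namespace looks roughly like this:

```python
from parsarg import model_parser_args

args = model_parser_args()
print(args.model_names)     # ['wav2vec', 'wav2vec2', 'whisper_tiny', 'whisper_base', 'whisper_medium', 'bart_large']
print(args.model_tasks)     # the distinct task labels, e.g. ['speech_to_text', 'text_to_summary']
print(args.model_settings)  # union of per-model keys, order preserved: ['task', 'url', 'description', 'year']
print(args.model["whisper_base"]["url"])  # https://huggingface.co/openai/whisper-base
```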
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ pydub==0.25.1
+ pytube==12.1.0
+ PyYAML==6.0
+ streamlit==1.13.0
+ transformers==4.23.1
+ git+https://github.com/openai/whisper.git
settings.py ADDED
@@ -0,0 +1,4 @@
+ from parsarg import model_parser_args
+
+ MODEL_PARSER = model_parser_args()
+
utils/Dockerfile.txt ADDED
@@ -0,0 +1,20 @@
+ FROM python:3.9
+
+ WORKDIR /app
+
+ COPY requirements.txt ./requirements.txt
+
+ RUN apt-get update \
+     && apt-get install libportaudio2 libportaudiocpp0 portaudio19-dev libsndfile1-dev -y \
+     && pip3 install pyaudio
+
+ RUN pip install -r requirements.txt
+
+ EXPOSE 8501
+
+ WORKDIR /src
+ COPY . /src
+
+ ENTRYPOINT ["streamlit", "run"]
+
+ # The Streamlit entry point of this Space is app.py (see app_file in README.md)
+ CMD ["app.py"]
utils/model_names.txt ADDED
@@ -0,0 +1,7 @@
+ INSERT Hugging Face models:
+ 1) Insert the tokenizer model name
+ 2) Insert a space
+ 3) Insert the Hugging Face link to the model
+
+ speech_to_text
+ facebook/wav2vec2-base-960h https://huggingface.co/facebook/wav2vec2-base-960h
utils/model_names.yaml ADDED
@@ -0,0 +1,42 @@
+ # models that generate text from audio data.
+ model_task:
+   speech_to_text:
+     model_name:
+       wav2vec:
+         model_size:
+           base:
+             name: facebook/wav2vec2-base-960h
+             url: https://huggingface.co/facebook/wav2vec2-base-960h
+             year: 2020
+       whisper:
+         model_size:
+           tiny:
+             name: openai/whisper-tiny
+             url: https://huggingface.co/openai/whisper-tiny
+             year: 2022
+           base:
+             name: openai/whisper-base
+             url: https://huggingface.co/openai/whisper-base
+             year: 2022
+           medium:
+             name: openai/whisper-medium
+             url: https://huggingface.co/openai/whisper-medium
+             year: 2022
+
+   # models that generate summaries from text data.
+   text_to_summary:
+     model_name:
+       bart:
+         model_size:
+           large:
+             name: facebook/bart-large-cnn
+             url: https://huggingface.co/facebook/bart-large-cnn
+             year: 2019
utils/models.yaml ADDED
@@ -0,0 +1,29 @@
+ # models that generate text from audio data.
+ wav2vec:
+   task: speech_to_text
+   url: https://huggingface.co/facebook/wav2vec2-base-960h
+
+ wav2vec2:
+   task: speech_to_text
+   url: https://huggingface.co/yongjian/wav2vec2-large-a
+
+ whisper_tiny:
+   task: speech_to_text
+   url: https://huggingface.co/openai/whisper-tiny
+   description: "this is the smallest whisper model that will be used for cloud deployment"
+   year: 2022
+
+ whisper_base:
+   task: speech_to_text
+   url: https://huggingface.co/openai/whisper-base
+   year: 2022
+
+ whisper_medium:
+   task: speech_to_text
+   url: https://huggingface.co/openai/whisper-medium
+   year: 2022
+
+ # models that generate summaries from text data.
+ bart_large:
+   task: text_to_summary
+   url: https://huggingface.co/facebook/bart-large-cnn
+   year: 2022
utils/oldmodel.py ADDED
@@ -0,0 +1,47 @@
+ '''
+ import torch
+ import torchaudio
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import speech_recognition as sr
+ import io
+ from pydub import AudioSegment
+ import librosa
+ import whisper
+ from scipy.io import wavfile
+ from test import record_voice
+
+ model = Wav2Vec2ForCTC.from_pretrained(r'yongjian/wav2vec2-large-a')  # Note: PyTorch model
+ tokenizer = Wav2Vec2Processor.from_pretrained(r'yongjian/wav2vec2-large-a')
+
+ r = sr.Recognizer()
+
+ from transformers import pipeline
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ with sr.Microphone(sample_rate=16000) as source:
+     print("You can start speaking now")
+     record_voice()
+     x, _ = librosa.load("output.wav")
+     model_inputs = tokenizer(x, sampling_rate=16000, return_tensors="pt", padding=True)
+     logits = model(model_inputs.input_values, attention_mask=model_inputs.attention_mask).logits.cuda()  # use .cuda() for GPU acceleration
+     pred_ids = torch.argmax(logits, dim=-1).cpu()
+     pred_text = tokenizer.batch_decode(pred_ids)
+     print(x[:10], x.shape)
+     print('Transcription:', pred_text)
+
+ model = whisper.load_model("base")
+ result = model.transcribe("output.wav")
+ print(result["text"])
+ summary_input = result["text"]
+
+ summary_output = summarizer(summary_input, max_length=30, min_length=20, do_sample=False)
+ print(summary_output)
+ with open("raw_text.txt", 'w', encoding='utf-8') as f:
+     f.write(summary_input)
+ with open("summary_text.txt", 'w', encoding='utf-8') as f:
+     f.write(summary_output[0]["summary_text"])
+ '''