AndresAlejandro storresbusquets committed on
Commit
f96037b
0 Parent(s):

Duplicate from storresbusquets/demo1

Browse files

Co-authored-by: Santiago Torres Busquets <storresbusquets@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ TED_ed.wav filter=lfs diff=lfs merge=lfs -text
37
+ audios/TED_platon.wav filter=lfs diff=lfs merge=lfs -text
38
+ audios/TED_lagrange_point.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: demo1
3
+ emoji: 🎓
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.42.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: storresbusquets/demo1
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Imports
2
+ import gradio as gr
3
+ import whisper
4
+ from pytube import YouTube
5
+ from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
6
+ import torch
7
+ from wordcloud import WordCloud
8
+ import re
9
+ import os
10
+
11
class GradioInference:
    """Transcribe speech with Whisper and derive text insights from it.

    The two public entry points, ``__call__`` (YouTube link) and
    ``from_audio_input`` (audio file path), both return the same 5-tuple:
    ``(transcription, summary, keywords, sentiment, wordcloud_image)``.
    """

    # Display strings for the labels produced by the sentiment classifier.
    # Labels that are not listed here are shown unchanged.
    SENTIMENT_EMOJIS = {
        "positive": "Positive 👍🏼",
        "negative": "Negative 👎🏼",
        "neutral": "Neutral 😶",
    }

    # Matches any run of whitespace (newlines included).
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        """Load every model once so that requests only pay for inference."""
        # OpenAI's Whisper model sizes
        self.sizes = list(whisper._MODELS.keys())

        # Whisper's available languages for ASR ("none" = let Whisper detect)
        self.langs = ["none"] + sorted(whisper.tokenizer.LANGUAGES.values())

        # Size of the Whisper model that is currently loaded
        self.current_size = "base"
        self.loaded_model = whisper.load_model(self.current_size)

        # Last pytube object created by populate_metadata (kept for
        # backward compatibility; the analysis no longer relies on it).
        self.yt = None

        # Summary model used when the selected language is English
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

        # VoiceLabT5 keyword extraction model and tokenizer
        self.keyword_model = T5ForConditionalGeneration.from_pretrained(
            "Voicelab/vlt5-base-keywords"
        )
        self.keyword_tokenizer = T5Tokenizer.from_pretrained(
            "Voicelab/vlt5-base-keywords"
        )

        # Multilingual sentiment classifier
        self.classifier = pipeline(
            "text-classification",
            model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
            return_all_scores=False,
        )

        # Multilingual summary model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")

    def __call__(self, link, lang, size, progress=gr.Progress()):
        """Download the audio track of a YouTube video and analyse it.

        params:
        - link: a YouTube URL.
        - lang: a Whisper language name, or "none" to let Whisper detect it.
        - size: name of the Whisper model size to use.
        - progress: Gradio progress tracker.

        Returns the tuple (transcription, summary, keywords, sentiment,
        wordcloud image). Raises gr.Error when no link is given.
        """
        import tempfile

        progress(0, desc="Starting analysis")

        if not link:
            raise gr.Error("Please enter a YouTube link.")

        # Always build the pytube object from the submitted link. This
        # instance is shared by every request, so a cached ``self.yt`` may
        # belong to a different link than the one being analysed.
        video = YouTube(link)

        # Download into a private temporary directory so that the file is
        # removed afterwards and requests cannot overwrite each other.
        with tempfile.TemporaryDirectory() as workdir:
            path = video.streams.filter(only_audio=True)[0].download(
                output_path=workdir, filename="tmp.mp4"
            )
            return self._analyze(path, lang, size, progress)

    def populate_metadata(self, link):
        """
        Access to the YouTube video title and thumbnail image to further display it
        params:
        - link: a YouTube URL.

        Returns (thumbnail_url, title), or (None, None) when the link is
        empty or pytube rejects it.
        """
        from pytube.exceptions import PytubeError

        if not link:
            return None, None

        try:
            self.yt = YouTube(link)
            return self.yt.thumbnail_url, self.yt.title
        except PytubeError:
            # Called on every change of the link textbox, so incomplete or
            # invalid links are expected here: show no metadata instead of
            # surfacing an error.
            return None, None

    def from_audio_input(self, lang, size, audio_file, progress=gr.Progress()):
        """Analyse an audio file directly, without going through YouTube.

        params:
        - lang: a Whisper language name, or "none" to let Whisper detect it.
        - size: name of the Whisper model size to use.
        - audio_file: path of the audio file to transcribe.
        - progress: Gradio progress tracker.

        Returns the tuple (transcription, summary, keywords, sentiment,
        wordcloud image). Raises gr.Error when no audio file is given.
        """
        progress(0, desc="Starting analysis")

        if not audio_file:
            raise gr.Error("Please upload or record an audio file.")

        return self._analyze(audio_file, lang, size, progress)

    def _analyze(self, audio_path, lang, size, progress):
        """Transcribe ``audio_path`` and compute every insight shown in the UI."""
        if lang == "none":
            lang = None

        # Swap the Whisper model only when a different size is requested
        if size != self.current_size:
            self.loaded_model = whisper.load_model(size)
            self.current_size = size

        progress(0.20, desc="Transcribing")
        results = self.loaded_model.transcribe(audio_path, language=lang)
        text = results["text"]

        progress(0.40, desc="Summarizing")
        # The multilingual summary is always needed: it is the input of the
        # sentiment classifier and the summary shown for non-English audio.
        multilingual_summary = self._summarize_multilingual(text)
        if lang == "english":
            # BART is only run when its output is actually displayed.
            # truncation=True keeps long transcripts within the model's
            # input limit.
            displayed_summary = self.summarizer(
                text, max_length=150, min_length=30, do_sample=False, truncation=True
            )[0]["summary_text"]
        else:
            displayed_summary = multilingual_summary

        progress(0.50, desc="Extracting Keywords")
        formatted_keywords = self._extract_keywords(text)

        progress(0.80, desc="Extracting Sentiment")
        label = self.classifier(multilingual_summary)[0]["label"]
        formatted_sentiment = self.SENTIMENT_EMOJIS.get(label, label)

        progress(0.90, desc="Generating Wordcloud")
        wordcloud_image = self._render_wordcloud(text)

        return (
            text,
            displayed_summary,
            formatted_keywords,
            formatted_sentiment,
            wordcloud_image,
        )

    def _summarize_multilingual(self, text):
        """Summarise ``text`` with the mT5 multilingual XLSum model."""
        normalized = self._WHITESPACE_RE.sub(" ", text.strip())

        input_ids = self.tokenizer(
            [normalized],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=512,
        )["input_ids"]

        output_ids = self.model.generate(
            input_ids=input_ids,
            max_length=130,
            no_repeat_ngram_size=2,
            num_beams=4,
        )[0]

        return self.tokenizer.decode(
            output_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    def _extract_keywords(self, text):
        """Return the VoiceLabT5 keywords of ``text`` as a bulleted list, one per line."""
        input_ids = self.keyword_tokenizer(
            "Keywords: " + text, return_tensors="pt", truncation=False
        ).input_ids
        output = self.keyword_model.generate(
            input_ids, no_repeat_ngram_size=3, num_beams=4
        )
        predicted = self.keyword_tokenizer.decode(output[0], skip_special_tokens=True)
        keywords = [x.strip() for x in predicted.split(",") if x.strip()]
        return "\n".join(f"• {keyword}" for keyword in keywords)

    @staticmethod
    def _render_wordcloud(text):
        """Return a word cloud image for ``text``, or None when it has no words."""
        try:
            return WordCloud(colormap="Oranges").generate(text).to_image()
        except ValueError:
            # wordcloud raises ValueError when there is nothing to plot
            # (e.g. an empty transcription); show no image in that case.
            return None
321
+
322
+
323
# Build the Gradio interface: one tab for YouTube links, one for audio files,
# followed by examples and an "About" section, then start the app.
# Layout options are passed to the component constructors because the
# `.style()` method is deprecated in Gradio 3.x.
gio = GradioInference()
title = "YouTube Insights"
description = "Your AI-powered video analytics tool"

block = gr.Blocks()

with block as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
        <div>
        <h1>YouTube <span style="color: #FFA500;">Insights</span> 💡</h1>
        </div>
        <h4 style="margin-bottom: 10px; font-size: 95%">
        Your AI-powered video analytics tool ✨
        </h4>
        </div>
        """
    )
    with gr.Group():
        with gr.Tab("From YouTube 📹"):
            with gr.Box():

                with gr.Row(equal_height=True):
                    size = gr.Dropdown(
                        label="Speech-to-text Model Size", choices=gio.sizes, value="base"
                    )
                    lang = gr.Dropdown(
                        label="Language (Optional)", choices=gio.langs, value="none"
                    )
                    link = gr.Textbox(
                        label="YouTube Link", placeholder="Enter YouTube link..."
                    )
                    # Named video_title so it does not shadow the `title` constant
                    video_title = gr.Label(label="Video Title")

                with gr.Row(equal_height=True):
                    img = gr.Image(label="Thumbnail")
                    text = gr.Textbox(
                        label="Transcription",
                        placeholder="Transcription Output...",
                        lines=10,
                        show_copy_button=True,
                        container=True,
                    )

                with gr.Row(equal_height=True):
                    summary = gr.Textbox(
                        label="Summary",
                        placeholder="Summary Output...",
                        lines=5,
                        show_copy_button=True,
                        container=True,
                    )
                    keywords = gr.Textbox(
                        label="Keywords",
                        placeholder="Keywords Output...",
                        lines=5,
                        show_copy_button=True,
                        container=True,
                    )
                    label = gr.Label(label="Sentiment Analysis")
                    wordcloud_image = gr.Image(label="WordCloud")

                with gr.Row(equal_height=True):
                    clear = gr.ClearButton(
                        [link, video_title, img, text, summary, keywords, label, wordcloud_image],
                        scale=1,
                        value="Clear 🗑️",
                    )
                    btn = gr.Button("Get video insights 🔎", variant="primary", scale=1)

                btn.click(
                    gio,
                    inputs=[link, lang, size],
                    outputs=[text, summary, keywords, label, wordcloud_image],
                )
                # Refresh thumbnail and title whenever the link changes
                link.change(gio.populate_metadata, inputs=[link], outputs=[img, video_title])

        with gr.Tab("From Audio file 🎙️"):
            with gr.Box():

                with gr.Row(equal_height=True):
                    audio_size = gr.Dropdown(
                        label="Model Size", choices=gio.sizes, value="base"
                    )
                    audio_lang = gr.Dropdown(
                        label="Language (Optional)", choices=gio.langs, value="none"
                    )
                    audio_file = gr.Audio(type="filepath")

                with gr.Row(equal_height=True):
                    audio_text = gr.Textbox(
                        label="Transcription",
                        placeholder="Transcription Output...",
                        lines=10,
                        show_copy_button=True,
                        container=False,
                    )

                with gr.Row(equal_height=True):
                    audio_summary = gr.Textbox(
                        label="Summary", placeholder="Summary Output", lines=5
                    )
                    audio_keywords = gr.Textbox(
                        label="Keywords", placeholder="Keywords Output", lines=5
                    )
                    audio_label = gr.Label(label="Sentiment Analysis")
                    audio_wordcloud_image = gr.Image(label="WordCloud")

                with gr.Row(equal_height=True):
                    audio_clear = gr.ClearButton(
                        [
                            audio_file,
                            audio_text,
                            audio_summary,
                            audio_keywords,
                            audio_label,
                            audio_wordcloud_image,
                        ],
                        scale=1,
                        value="Clear 🗑️",
                    )
                    audio_btn = gr.Button(
                        "Get audio insights 🔎", variant="primary", scale=1
                    )

                audio_btn.click(
                    gio.from_audio_input,
                    inputs=[audio_lang, audio_size, audio_file],
                    outputs=[
                        audio_text,
                        audio_summary,
                        audio_keywords,
                        audio_label,
                        audio_wordcloud_image,
                    ],
                )


with block:
    gr.Markdown("### Video Examples")
    gr.Examples(["https://www.youtube.com/shorts/xDNzz8yAH7I","https://www.youtube.com/watch?v=kib6uXQsxBA&pp=ygURc3RldmUgam9icyBzcGVlY2g%3D"], inputs=link)

    gr.Markdown("### Audio Examples")
    gr.Examples(
        [[os.path.join(os.path.dirname(__file__),"audios/TED_lagrange_point.wav")],[os.path.join(os.path.dirname(__file__),"audios/TED_platon.wav")]],
        inputs=audio_file)

    gr.Markdown("### About the app:")

    with gr.Accordion("What is YouTube Insights?", open=False):
        gr.Markdown(
            "YouTube Insights is a tool developed for academic purposes that allows you to analyze YouTube videos or audio files. It provides features like transcription, summarization, keyword extraction, sentiment analysis, and word cloud generation for multimedia content."
        )

    with gr.Accordion("How does YouTube Insights work?", open=False):
        gr.Markdown(
            "YouTube Insights leverages several powerful AI models and libraries. It uses OpenAI's Whisper for Automatic Speech Recognition (ASR) to transcribe audio content. It summarizes the transcribed text using Facebook's BART model, extracts keywords with VoiceLabT5, performs sentiment analysis with DistilBERT, and generates word clouds."
        )

    with gr.Accordion("What languages are supported for the analysis?", open=False):
        gr.Markdown(
            "YouTube Insights supports multiple languages for transcription and analysis. You can select your preferred language from the available options when using the app."
        )

    with gr.Accordion("Can I analyze audio files instead of YouTube videos?", open=False):
        gr.Markdown(
            "Yes, you can analyze audio files directly. Simply upload your audio file to the app, and it will provide the same transcription, summarization, keyword extraction, sentiment analysis, and word cloud generation features."
        )

    with gr.Accordion("What are the different model sizes available for transcription?", open=False):
        gr.Markdown(
            "The app uses a Speech-to-text model that has different training sizes, from tiny to large. Hence, the bigger the model the accurate the transcription."
        )

    with gr.Accordion("How long does it take to analyze a video or audio file?", open=False):
        gr.Markdown(
            "The time taken for analysis may vary based on the duration of the video or audio file and the selected model size. Shorter content will be processed more quickly."
        )

    with gr.Accordion("Who developed YouTube Insights?" ,open=False):
        gr.Markdown(
            "YouTube Insights was developed by students as part of the 2022/23 Master's in Big Data & Data Science program at Universidad Complutense de Madrid for academic purposes (Trabajo de Fin de Master)."
        )

    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
        <p style="margin-bottom: 10px; font-size: 96%">
        Trabajo de Fin de Máster - Grupo 3
        </p>
        <p style="margin-bottom: 10px; font-size: 90%">
        2023 Master in Big Data & Data Science - Universidad Complutense de Madrid
        </p>
        </div>
        """
    )

demo.launch()
audios/TED_lagrange_point.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e31b25028419adf17eaaa6bbda2a9729e13fda6ee09fd658ce2b7d70251d7144
3
+ size 4911545
audios/TED_platon.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85f5db77fd43b701110175a707920b98fe40357bfd4b79d1851a97ab455441fa
3
+ size 1685981
audios/tmp.md ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ openai-whisper
2
+ transformers
3
+ torch
4
+ wordcloud
5
+ pytube
6
+ sentencepiece
7
+ langchain
8
+ einops
9
+ bitsandbytes