Spaces: Runtime error
Commit 0f467e3 (parent: dc20c68): Update app.py
app.py CHANGED
@@ -2,8 +2,9 @@
 import gradio as gr
 import whisper
 from pytube import YouTube
-from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
+from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
 from wordcloud import WordCloud
+import re

 class GradioInference:
     def __init__(self):
@@ -37,6 +38,12 @@ class GradioInference:
         # Sentiment Classifier
         self.classifier = pipeline("text-classification")

+        # Multilingual summarizer: mT5 fine-tuned on XLSum (same checkpoint for tokenizer and model)
+        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
+        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
+
+
+
     def __call__(self, link, lang, size):
         """
         Call the Gradio Inference python class.
@@ -69,6 +76,32 @@ class GradioInference:
             results["text"], max_length=512, min_length=30, do_sample=False
         )

+        #### Test: summarize the transcription with the mT5 XLSum model
+        WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))
+
+        input_ids = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
+
+        output_ids = self.model.generate(
+            input_ids=input_ids,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
+
+        summary = self.tokenizer.decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        #### End test
+
+
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
@@ -92,7 +125,7 @@ class GradioInference:

         return (
             results["text"],
-
+            summary,
             keywords,
             label,
             wordcloud_image,
@@ -131,6 +164,31 @@ class GradioInference:
             results["text"], max_length=512, min_length=30, do_sample=False
         )

+        #### Test: summarize the transcription with the mT5 XLSum model
+        WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))
+
+        input_ids = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
+
+        output_ids = self.model.generate(
+            input_ids=input_ids,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
+
+        summary = self.tokenizer.decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        #### End test
+
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
@@ -154,7 +212,7 @@ class GradioInference:

         return (
             results["text"],
-
+            summary,
             keywords,
             label,
             wordcloud_image,
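
For reference, the summarization step this commit wires into GradioInference can be exercised on its own. Below is a minimal sketch, assuming the same csebuetnlp/mT5_multilingual_XLSum checkpoint and generation settings as the diff; the summarize() helper and the sample text are illustrative and not part of app.py.

# Minimal standalone sketch of the summarization step added in this commit.
# The summarize() helper and the sample text are illustrative, not code from app.py.
import re

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

CHECKPOINT = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)

def summarize(text: str) -> str:
    # Collapse newlines and repeated whitespace, mirroring WHITESPACE_HANDLER in the diff
    cleaned = re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))
    input_ids = tokenizer(
        [cleaned],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]
    # Beam search with the diff's settings: short summary, no repeated bigrams
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=130,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]
    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

if __name__ == "__main__":
    print(summarize("Paste a Whisper transcription here to get a short multilingual summary."))

Note that the same tokenizer instance both encodes the input text and decodes the generated ids, which is why the decode call in the class goes through self.tokenizer.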
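
Both return tuples now yield five values (transcript, summary, keywords, sentiment label, word-cloud image), so the Gradio components that consume them need a matching fifth output. A hypothetical wiring sketch follows; the rest of app.py is not shown in this diff, so the interface style, component types, labels, and dropdown choices below are all assumptions, and only the five-output shape mirrors the return tuples above.

# Hypothetical wiring sketch: component types, labels, and choices are assumptions.
# This would live in app.py below the GradioInference class definition.
import gradio as gr

gio = GradioInference()

demo = gr.Interface(
    fn=gio,  # dispatches to __call__(self, link, lang, size)
    inputs=[
        gr.Textbox(label="YouTube link"),
        gr.Dropdown(choices=["english", "spanish"], label="Language"),
        gr.Dropdown(choices=["base", "small", "medium"], label="Whisper model size"),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Summary"),    # new output for the mT5 XLSum summary
        gr.Textbox(label="Keywords"),
        gr.Textbox(label="Sentiment"),
        gr.Image(label="Word cloud"),
    ],
)

if __name__ == "__main__":
    demo.launch()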