storresbusquets committed on
Commit 0f467e3 · 1 Parent(s): dc20c68

Update app.py

Files changed (1)
  1. app.py +61 -3
app.py CHANGED
@@ -2,8 +2,9 @@
 import gradio as gr
 import whisper
 from pytube import YouTube
-from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
+from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
 from wordcloud import WordCloud
+import re
 
 class GradioInference:
     def __init__(self):
@@ -37,6 +38,12 @@ class GradioInference:
         # Sentiment Classifier
         self.classifier = pipeline("text-classification")
 
+        # Multilingual summarization model (mT5 fine-tuned on XLSum)
+        self.tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
+        self.model = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
+
+
+
     def __call__(self, link, lang, size):
         """
         Call the Gradio Inference python class.
@@ -69,6 +76,32 @@ class GradioInference:
             results["text"], max_length=512, min_length=30, do_sample=False
         )
 
+        #### Test: summarize the transcript with mT5_multilingual_XLSum
+        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+
+        input_ids = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
+
+        output_ids = self.model.generate(
+            input_ids=input_ids,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
+
+        summary = self.tokenizer.decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        #### End test
+
+
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
@@ -92,7 +125,7 @@ class GradioInference:
 
         return (
             results["text"],
-            transcription_summary[0]["summary_text"],
+            summary,
             keywords,
             label,
             wordcloud_image,
@@ -131,6 +164,31 @@ class GradioInference:
             results["text"], max_length=512, min_length=30, do_sample=False
         )
 
+        #### Test: summarize the transcript with mT5_multilingual_XLSum
+        WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
+
+        input_ids = self.tokenizer(
+            [WHITESPACE_HANDLER(results["text"])],
+            return_tensors="pt",
+            padding="max_length",
+            truncation=True,
+            max_length=512
+        )["input_ids"]
+
+        output_ids = self.model.generate(
+            input_ids=input_ids,
+            max_length=130,
+            no_repeat_ngram_size=2,
+            num_beams=4
+        )[0]
+
+        summary = self.tokenizer.decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        #### End test
+
         # Extract keywords using VoiceLabT5
         task_prefix = "Keywords: "
         input_sequence = task_prefix + results["text"]
@@ -154,7 +212,7 @@ class GradioInference:
 
         return (
             results["text"],
-            transcription_summary[0]["summary_text"],
+            summary,
             keywords,
             label,
             wordcloud_image,
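
For context, a minimal standalone sketch of the summarization path this commit wires into GradioInference: it loads the csebuetnlp/mT5_multilingual_XLSum checkpoint named in the diff, collapses whitespace the same way the WHITESPACE_HANDLER lambda does, and decodes a beam-search summary. The summarize_text helper and the sample text are illustrative only and are not part of app.py.

# Minimal sketch of the summarization step added in this commit, assuming the
# csebuetnlp/mT5_multilingual_XLSum checkpoint named in the diff. The helper
# name summarize_text and the sample text are illustrative only.
import re

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


def summarize_text(text: str) -> str:
    # Collapse newlines and runs of whitespace, mirroring WHITESPACE_HANDLER in app.py.
    cleaned = re.sub(r"\s+", " ", re.sub(r"\n+", " ", text.strip()))

    # Tokenize to a fixed 512-token window, as the commit does for the Whisper transcript.
    input_ids = tokenizer(
        [cleaned],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    # Beam-search generation with the same settings used in the diff.
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=130,
        no_repeat_ngram_size=2,
        num_beams=4,
    )[0]

    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )


if __name__ == "__main__":
    print(summarize_text("Paste a long transcript here to get a short summary."))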