Update app.py
app.py
CHANGED
@@ -0,0 +1,178 @@
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_name = "IlyaGusev/rut5_base_sum_gazeta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_sum = T5ForConditionalGeneration.from_pretrained(model_name)


def summ_mT5_G(text):
    # Tokenize the input, truncating/padding to 600 tokens.
    input_ids = tokenizer(
        [text],
        max_length=600,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    # Generate an abstractive summary; no_repeat_ngram_size curbs repetition.
    output_ids = model_sum.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4,
    )[0]
    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    return summary
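
# Usage sketch for the summarizer (kept commented out so it does not run at
# import time; the sample text and output are illustrative, not recorded):
#   text = "Длинный текст новости на русском языке ..."
#   summ_mT5_G(text)  # -> короткая аннотация в несколько предложений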

import torch

# Punctuation and capitalization restoration (Silero text-enhancement model).
# torch.hub.load returns (model, example_texts, languages, punct, apply_te);
# the fourth element is unpacked as `_punct` so it does not shadow the
# function defined below.
model_punc, example_texts, languages, _punct, apply_te = torch.hub.load(
    repo_or_dir='snakers4/silero-models', model='silero_te')


def punct(text):
    return apply_te(text.lower(), lan='ru')
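
# Usage sketch (hypothetical input/output): the model lower-cases the input,
# then restores punctuation and capitalization:
#   punct("привет как дела")  # -> "Привет, как дела?"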

from pyannote.audio import Pipeline
import os

# Speaker-diarization pipeline. The gated model needs a Hugging Face access
# token, read here from a Space secret (the secret appears to be named "s1").
pipeline_a = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("s1"))

# Speaker diarization: return [start, end, speaker] turns, merging
# consecutive turns of the same speaker.
def speakers_list(audio_f: str):
    # send the pipeline to GPU when available:
    # pipeline_a.to(torch.device("cuda"))

    # apply the pretrained pipeline
    diarization = pipeline_a(audio_f)
    turns = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        turns.append([turn.start, turn.end, speaker])

    # Merge adjacent turns of the same speaker; the extra condition keeps a
    # turn that lies inside the previous one from triggering a merge.
    i = 0
    while i < len(turns) - 1:
        if turns[i][-1] == turns[i + 1][-1] and (i == 0 or turns[i - 1][1] < turns[i][1]):
            turns[i][1] = turns[i + 1][1]
            turns.pop(i + 1)
        else:
            i += 1
    return turns
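
# Usage sketch (timestamps and labels are made up for illustration):
#   speakers_list("meeting.mp3")
#   # -> [[0.0, 12.3, 'SPEAKER_00'], [12.3, 20.1, 'SPEAKER_01'], ...]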


from faster_whisper import WhisperModel

model_size = "large-v3"
# Runs on CPU by default; on GPU use e.g. device="cuda", compute_type="float16".
model_tts = WhisperModel(model_size)


def speech_to_text(file_name):
    segments, info = model_tts.transcribe(file_name, beam_size=5)

    # Concatenate the text of all transcribed segments.
    text_of_seg = ""
    for segment in segments:
        text_of_seg += segment.text
    return text_of_seg
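
# Note: faster-whisper returns `segments` as a lazy generator, so the actual
# transcription work happens inside the loop above. Usage sketch
# (hypothetical file name and output):
#   speech_to_text("seg0.mp3")  # -> " распознанный текст сегмента"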


class Segment_text:
    text: str           # segment text with punctuation restored
    start: float        # segment start time, seconds
    end: float          # segment end time, seconds
    speaker: str        # diarized speaker label
    summarization: str  # abstract of the original segment text

    def __init__(self, text: str, start: float, end: float, speaker: str):
        self.text = punct(text)
        self.start = start
        self.end = end
        self.speaker = speaker
        self.value = len(text)
        # Summarize only segments longer than 200 characters; shorter ones
        # are kept verbatim.
        self.summarization = summ_mT5_G(text) if self.value > 200 else text

    def get_text(self):
        return self.text

    def get_time(self):
        return (self.start, self.end)

    def get_summarization(self):
        return self.summarization

    def get_speaker(self):
        return self.speaker
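
# Usage sketch (hypothetical values):
#   s = Segment_text(text="пример текста", start=0.0, end=2.5, speaker="SPEAKER_00")
#   s.get_time()           # -> (0.0, 2.5)
#   s.get_summarization()  # short text (< 200 chars) is returned verbatim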

from pydub import AudioSegment


def init_segments(speakers, name_of_file):
    # Cut the source audio at the diarization boundaries, transcribe each
    # slice, and wrap the result in a Segment_text.
    list_of_segments = []
    audio = AudioSegment.from_file(name_of_file)
    for ind, seg in enumerate(speakers):
        # pydub slices in milliseconds; diarization times are seconds
        temp_seg = audio[seg[0] * 10**3 : seg[1] * 10**3]
        name_of_seg = "seg" + str(ind) + ".mp3"
        temp_seg.export(name_of_seg, format="mp3")

        temp_text = speech_to_text(name_of_seg)
        segment_text = Segment_text(text=temp_text, start=seg[0], end=seg[1], speaker=seg[2])
        list_of_segments.append(segment_text)
    return list_of_segments


def get_text_to_out(list_of_segments: list):
    # Build two outputs: the full transcript and the per-speaker summary.
    res_text, res_sum = "", ""
    for seg in list_of_segments:
        res_text += f"{seg.get_speaker()} : {seg.get_text()}\n"
        res_sum += f"{seg.get_speaker()}: {seg.get_summarization()}\n"
    return res_text, res_sum

from random import randint


def do_smth(file):
    # Re-encode the uploaded audio to mp3 under a random name so concurrent
    # requests do not clobber each other's files.
    audio = AudioSegment.from_file(file)
    name_of_file = "f" + str(randint(1, 10**8)) + ".mp3"
    audio.export(name_of_file, format="mp3")

    speakers = speakers_list(name_of_file)
    list_of_segments = init_segments(speakers, name_of_file)
    out_text, out_sum = get_text_to_out(list_of_segments)
    return out_text, out_sum
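
# End-to-end flow: re-encode the upload (do_smth), diarize it into speaker
# turns (speakers_list), slice and transcribe each turn (init_segments,
# speech_to_text), restore punctuation and summarize long turns
# (Segment_text), then format both outputs (get_text_to_out).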

import gradio as gr

# Gradio UI: one audio input, two text outputs (full transcript and summary).
# Labels are in Russian: "Исходный текст" = source text,
# "Сокращенный текст" = condensed text.
demo = gr.Interface(
    do_smth,
    gr.Audio(type="filepath"),
    [
        gr.Textbox(label="Исходный текст"),
        gr.Textbox(label="Сокращенный текст"),
    ],
)
demo.launch()