kanslor821 committed on
Commit d03e31e · verified · 1 Parent(s): 3022aac

Update app.py

Files changed (1):
  1. app.py +178 -0
app.py CHANGED
@@ -0,0 +1,178 @@
from transformers import AutoTokenizer, T5ForConditionalGeneration


# Russian abstractive summarization model (ruT5 fine-tuned on the Gazeta dataset).
model_name = "IlyaGusev/rut5_base_sum_gazeta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_sum = T5ForConditionalGeneration.from_pretrained(model_name)


def summ_mT5_G(text):
    """Summarize Russian text with rut5_base_sum_gazeta."""
    input_ids = tokenizer(
        [text],
        max_length=600,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    output_ids = model_sum.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4,
    )[0]

    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    return summary

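
# Minimal smoke test for the summarizer; the sample string below is
# illustrative only, not part of the app:
# print(summ_mT5_G("Длинный текст новости на русском языке ..."))
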
import torch


# Silero text-enhancement model: restores punctuation and capitalization.
# torch.hub.load returns (model, example_texts, languages, punct, apply_te);
# the fourth element is renamed so it does not shadow the punct() helper below.
model_punc, example_texts, languages, _punct, apply_te = torch.hub.load(
    repo_or_dir="snakers4/silero-models", model="silero_te"
)


def punct(text):
    """Restore punctuation and case in Russian text (model expects lowercase input)."""
    return apply_te(text.lower(), lan="ru")

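
# Illustration of what the punctuation step does (output is approximate):
# punct("привет как дела")  # -> "Привет, как дела?"
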
from pyannote.audio import Pipeline
import os


# Pretrained speaker-diarization pipeline; the gated model requires an HF access
# token, read here from the environment/Space secret "s1".
pipeline_a = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=str(os.getenv("s1")),
)

# Speaker diarization: determine who speaks when.
def speakers_list(audio_f: str):
    # # send pipeline to GPU (when available)
    # pipeline_a.to(torch.device("cuda"))

    # Apply the pretrained pipeline and collect [start, end, speaker] turns.
    diarization = pipeline_a(audio_f)
    speakers = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speakers.append([turn.start, turn.end, speaker])

    # Merge consecutive turns by the same speaker into a single segment.
    i = 0
    while i < len(speakers) - 1:
        if speakers[i][-1] == speakers[i + 1][-1] and (
            i == 0 or speakers[i - 1][1] < speakers[i][1]
        ):
            speakers[i][1] = speakers[i + 1][1]
            speakers.pop(i + 1)
        else:
            i += 1
    return speakers

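
# Illustrative (hypothetical) example of the merge step above:
#   [[0.0, 1.5, "SPEAKER_00"], [1.5, 3.0, "SPEAKER_00"], [3.0, 4.0, "SPEAKER_01"]]
# becomes
#   [[0.0, 3.0, "SPEAKER_00"], [3.0, 4.0, "SPEAKER_01"]]
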
from faster_whisper import WhisperModel


# Speech-to-text model; runs on CPU by default.
# For GPU: WhisperModel(model_size, device="cuda", compute_type="float16")
model_size = "large-v3"
model_stt = WhisperModel(model_size)


def speech_to_text(file_name):
    """Transcribe an audio file and return the concatenated segment text."""
    segments, info = model_stt.transcribe(file_name, beam_size=5)

    text_of_seg = ""
    for segment in segments:
        text_of_seg += segment.text
    return text_of_seg

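
# The transcriber also reports the detected language if needed:
# _, info = model_stt.transcribe("seg0.mp3", beam_size=5)
# print(f"Detected language '{info.language}' with probability {info.language_probability}")
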
class SegmentText:
    text: str           # segment text, with punctuation restored
    start: float        # segment start time, seconds
    end: float          # segment end time, seconds
    speaker: str        # diarized speaker label
    summarization: str  # abstract of the original segment text

    def __init__(self, text: str, start: float, end: float, speaker: str):
        self.text = punct(text)
        self.start = start
        self.end = end
        self.speaker = speaker
        self.value = len(text)
        # Only summarize segments longer than 200 characters; shorter ones are kept verbatim.
        self.summarization = summ_mT5_G(text) if self.value > 200 else text

    def get_text(self):
        return self.text

    def get_time(self):
        return (self.start, self.end)

    def get_summarization(self):
        return self.summarization

    def get_speaker(self):
        return self.speaker

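
# Hypothetical construction, assuming the models above are loaded:
# seg = SegmentText(text="привет как дела", start=0.0, end=2.5, speaker="SPEAKER_00")
# seg.get_time()  # -> (0.0, 2.5)
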
from pydub import AudioSegment


def init_segments(speakers, name_of_file):
    """Cut the audio into per-speaker segments, then transcribe and wrap each one."""
    list_of_segments = []
    audio = AudioSegment.from_file(name_of_file)
    for ind, seg in enumerate(speakers):
        # pydub slices in milliseconds.
        temp_seg = audio[int(seg[0] * 1000):int(seg[1] * 1000)]
        name_of_seg = "seg" + str(ind) + ".mp3"
        temp_seg.export(name_of_seg, format="mp3")

        temp_text = speech_to_text(name_of_seg)
        segment_text = SegmentText(text=temp_text, start=seg[0], end=seg[1], speaker=seg[2])
        list_of_segments.append(segment_text)
    return list_of_segments

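
# Note: the per-segment files seg0.mp3, seg1.mp3, ... are written to the
# current working directory and are not cleaned up afterwards.
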
def get_text_to_out(list_of_segments: list):
    """Build the full transcript and the per-segment summary, one line per segment."""
    res_text, res_sum = "", ""
    for seg in list_of_segments:
        res_text += f"{seg.get_speaker()} : {seg.get_text()}\n"
    for seg in list_of_segments:
        res_sum += f"{seg.get_speaker()}: {seg.get_summarization()}\n"
    return res_text, res_sum

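
# Shape of the returned transcript (speaker labels and text are illustrative):
# SPEAKER_00 : Добрый день, коллеги.
# SPEAKER_01 : Здравствуйте.
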
from random import randint


def do_smth(file):
    """Full pipeline: re-encode the upload, diarize, transcribe, punctuate, summarize."""
    # gr.Audio(type="filepath") passes a file path; from_file handles wav and other formats.
    audio = AudioSegment.from_file(file)
    name_of_file = "f" + str(randint(1, 10**8)) + ".mp3"
    audio.export(name_of_file, format="mp3")

    speakers = speakers_list(name_of_file)
    list_of_segments = init_segments(speakers, name_of_file)
    out_text, out_sum = get_text_to_out(list_of_segments)

    return out_text, out_sum

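
# Direct call without the UI (the file name is hypothetical):
# transcript, summary = do_smth("meeting.wav")
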
import gradio as gr


demo = gr.Interface(
    do_smth,
    gr.Audio(type="filepath"),
    [
        gr.Textbox(label="Исходный текст"),      # "Original text"
        gr.Textbox(label="Сокращенный текст"),   # "Summarized text"
    ],
)
demo.launch()