Sabbah13 committed on
Commit
59f6126
1 Parent(s): 8f58ee5

changed speechlib to whisperx

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -3,43 +3,52 @@ import base64
3
  import os
4
  import json
5
  import streamlit as st
6
- from speechlib import Transcriptor
 
7
 
8
- def transcribe_audio(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization):
9
- transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
10
- return transcriptor.whisper()
11
-
12
- def transform_transcript(transcript):
13
  result = []
14
- for segment in transcript:
15
- start_time, end_time, text, speaker = segment
16
- result.append(f"{speaker} ({start_time:.1f} : {end_time:.1f}) : {text}")
 
 
 
 
17
  return '\n'.join(result)
18
 
19
  st.title('Audio Transcription App')
 
 
 
 
 
20
 
21
  ACCESS_TOKEN = st.secrets["HF_TOKEN"]
22
 
23
  uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])
24
 
25
  if uploaded_file is not None:
 
26
  file_extension = uploaded_file.name.split(".")[-1] # Получаем расширение файла
27
  temp_file_path = f"temp_file.{file_extension}" # Создаем временное имя файла с правильным расширением
28
 
29
  with open(temp_file_path, "wb") as f:
30
  f.write(uploaded_file.getbuffer())
31
-
32
- log_folder = "logs"
33
- language = "ru"
34
- modelSize = os.getenv('WHISPER_MODEL_SIZE')
35
- voices_folder = ""
36
- quantization = False
37
 
38
  with st.spinner('Транскрибируем...'):
39
- result = transcribe_audio(temp_file_path, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
 
 
 
 
 
 
 
 
40
 
41
  st.write("Результат транскрибации:")
42
- transcript = transform_transcript(result)
43
  st.text(transcript)
44
 
45
  with st.spinner('Резюмируем...'):
 
3
  import os
4
  import json
5
  import streamlit as st
6
+ import whisperx
7
+ import torch
8
 
9
def convert_segments_to_text(segments):
    """Format diarized whisperx segments as human-readable lines.

    Accepts either the raw list of segment dicts or the full whisperx
    result dict (which stores the list under its "segments" key) — the
    caller passes the whole ``assign_word_speakers`` result, which is a
    dict, so iterating it directly would yield string keys and crash.

    Each output line has the shape ``SPEAKER (start : end) : text``.

    Args:
        segments: list of segment dicts with 'start', 'end', 'text' and
            (optionally) 'speaker' keys, or a dict containing that list
            under 'segments'.

    Returns:
        All formatted lines joined with newlines (empty string for no
        segments).
    """
    if isinstance(segments, dict):
        segments = segments.get('segments', [])
    result = []
    for segment in segments:
        # Diarization may fail to assign a speaker to a segment;
        # fall back to a placeholder instead of raising KeyError.
        speaker = segment.get('speaker', 'UNKNOWN')
        start = segment['start']
        end = segment['end']
        text = segment['text']
        result.append(f'{speaker} ({start} : {end}) : {text}')
    return '\n'.join(result)
19
 
20
st.title('Audio Transcription App')
st.sidebar.title("Settings")

# Sidebar inputs controlling the whisperx pipeline.
device = st.sidebar.selectbox("Device", ["cpu", "cuda"], index=1)
batch_size = st.sidebar.number_input("Batch Size", min_value=1, value=16)
compute_type = st.sidebar.selectbox("Compute Type", ["float16", "int8"], index=0)

# HF token from Streamlit secrets, used to authenticate the diarization model.
ACCESS_TOKEN = st.secrets["HF_TOKEN"]

uploaded_file = st.file_uploader("Загрузите аудиофайл", type=["mp4", "wav", "m4a"])

if uploaded_file is not None:
    st.audio(uploaded_file)
    file_extension = uploaded_file.name.split(".")[-1]  # extension of the uploaded file
    temp_file_path = f"temp_file.{file_extension}"  # temp name with the correct extension

    # whisperx.load_audio needs a path on disk, so spill the upload to a file.
    with open(temp_file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    with st.spinner('Транскрибируем...'):
        # Load ASR model.
        model = whisperx.load_model("medium", device, compute_type=compute_type)
        # Load and transcribe audio. NOTE: whisperx expects an ISO-639-1
        # language code ("ru"); the name "Russian" is not a valid code.
        audio = whisperx.load_audio(temp_file_path)
        result = model.transcribe(audio, batch_size=batch_size, language="ru")
        # Speaker diarization, authenticated with the HF token read above.
        diarize_model = whisperx.DiarizationPipeline(use_auth_token=ACCESS_TOKEN, device=device)
        diarize_segments = diarize_model(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)

    st.write("Результат транскрибации:")
    # assign_word_speakers returns a dict; the segment list lives under "segments".
    transcript = convert_segments_to_text(result["segments"])
    st.text(transcript)
53
 
54
  with st.spinner('Резюмируем...'):