camparchimedes committed
Update app.py
app.py
CHANGED
@@ -3,57 +3,84 @@
 import gradio as gr
 import warnings
 import torch
-
-
 import soundfile as sf
-import
 import os
 from fpdf import FPDF
 import time
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-import re
-
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-import pandas as pd
-
-warnings.filterwarnings("ignore")

 nltk.download('punkt')
-nltk.download('stopwords')
-
-#tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
-#model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
-#processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
-
-generation_config = {
-    "temperature": 0.8,
-    "top_p": 0.9,
-    "top_k": 0.5,
-    "max_output_tokens": 2048
-}

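The deleted generation_config mixes APIs: in Hugging Face transformers, top_k is an integer number of candidate tokens (0.5 is not valid) and the length cap is called max_new_tokens, not max_output_tokens; the sampling knobs also only apply when do_sample=True. A hedged sketch of an equivalent, valid set of generate kwargs (values are illustrative, not taken from the commit):

# Hypothetical transformers-style equivalent of the removed generation_config.
generation_kwargs = {
    "do_sample": True,       # temperature/top_p/top_k only take effect when sampling
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,             # must be an integer in transformers
    "max_new_tokens": 2048,  # transformers' name for the output-length cap
}
# e.g. model.generate(inputs.input_features, **generation_kwargs)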

 processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
-

 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32

-

-def
-
-
-

 def transcribe_audio(audio_file, batch_size=4):
     start_time = time.time()
-
-
-
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

     transcription = ""
@@ -63,12 +90,14 @@ def transcribe_audio(audio_file, batch_size=4):
         inputs = inputs.to(device)
         attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
         with torch.no_grad():
-            output =
                 inputs.input_features,
-                max_length=2048,
                 num_beams=7,
                 task="transcribe",
-                attention_mask=attention_mask
             )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

@@ -80,59 +109,68 @@ def transcribe_audio(audio_file, batch_size=4):

     return transcription.strip(), result

-def
-
-
-
-
-
-    pdf.output(pdf_output_path)
-    return pdf_output_path

-
-    sentences = transcription.split(". ")
-    vectorizer = TfidfVectorizer(stop_words='norwegian')
-    X = vectorizer.fit_transform(sentences)
-
-    kmeans = KMeans(n_clusters=1)
-    kmeans.fit(X)
-    avg = X.mean(axis=0)
-    summary = [sentences[i] for i in kmeans.predict(avg)]
-
-    return ". ".join(summary) + "."
-
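The removed extractive summarizer had two real problems: scikit-learn's TfidfVectorizer only ships an 'english' built-in stop-word list (stop_words='norwegian' raises a ValueError), and kmeans.predict(avg) returns a cluster label rather than a sentence index, so sentences[i] picks an essentially arbitrary sentence. A hedged sketch of the same TF-IDF + KMeans idea done more carefully (the NLTK Norwegian stop-word list stands in for the invalid option; it requires nltk.download('stopwords')):

# Hypothetical corrected version of the removed TF-IDF/KMeans extractive summary.
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def tfidf_summary(transcription, n_sentences=3):
    sentences = transcription.split(". ")
    vectorizer = TfidfVectorizer(stop_words=stopwords.words("norwegian"))
    X = vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=1).fit(X)
    # Rank sentences by distance to the single centroid and keep the closest ones, in original order.
    distances = kmeans.transform(X)[:, 0]
    keep = sorted(sorted(range(len(sentences)), key=lambda i: distances[i])[:n_sentences])
    return ". ".join(sentences[i] for i in keep) + "."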
-# HTML
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
 </div>
-"""
-image_html = """
 <div style="text-align: center; margin-top: 20px;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/
 </div>
 """

-# Gradio interface
 iface = gr.Blocks()

 with iface:
     gr.HTML(banner_html)
-    gr.Markdown("#
     audio_input = gr.Audio(type="filepath")
-    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
-    transcription_output = gr.Textbox(
-
-
-
-
-
-        transcription, result = transcribe_audio(audio_file, batch_size)
-        pdf_path = save_to_pdf(transcription)
         summary = summarize_text(transcription)
-        return

-    transcribe_button.click(fn=

-
 iface.launch(share=True, debug=True)
 import gradio as gr
 import warnings
 import torch
+from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
+
+warnings.filterwarnings("ignore")
+
+# Load tokenizer and model
+tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+
+# Set up the device
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+torch_dtype = torch.float32
+
+# Initialize pipeline
+asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype)
+
+def transcribe_audio(audio_file):
+    # Perform transcription
+    with torch.no_grad():
+        output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
+    return output["text"]
+
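Because the pipeline is created with chunk_length_s=28, it splits long recordings into roughly 28-second windows internally, so this first version needs no manual chunking. A minimal usage sketch (the file name is a placeholder, not from the repository):

# Hypothetical local test of the pipeline-based function; "sample.wav" is a placeholder path.
if __name__ == "__main__":
    text = transcribe_audio("sample.wav")
    print(text)
    # The pipeline can also return timestamps, e.g. asr(path, return_timestamps=True).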
+# Create Gradio interface
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs="text",
+    title="Audio Transcription App",
+    description="Upload an audio file to get the transcription",
+    theme="default",
+    layout="vertical",
+    live=False
+)
+import gradio as gr
+import warnings
+import torch
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
+from pydub import AudioSegment
 import soundfile as sf
+import numpy as np
 import os
+import nltk
 from fpdf import FPDF
 import time

 nltk.download('punkt')

+warnings.filterwarnings("ignore")

+# Load processor and model for transcription
 processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+
+# Load tokenizer and model for summarization
+summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")

+# Set up the device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32

+# Move the models to the device
+transcription_model.to(device)
+summarization_model.to(device)

+def convert_to_wav(audio_file):
+    audio = AudioSegment.from_file(audio_file, format="m4a")
+    wav_file = "temp.wav"
+    audio.export(wav_file, format="wav")
+    return wav_file
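convert_to_wav relies on pydub, which shells out to ffmpeg to decode .m4a, so ffmpeg must be available in the Space (on Hugging Face Spaces typically via a packages.txt entry). A hedged variant that lets pydub infer the container instead of hardcoding format="m4a" (not the committed code):

# Hypothetical variant: let pydub/ffmpeg sniff the container instead of assuming .m4a.
def convert_to_wav_any(audio_file, wav_file="temp.wav"):
    audio = AudioSegment.from_file(audio_file)  # format inferred by ffmpeg
    audio.export(wav_file, format="wav")
    return wav_file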

 def transcribe_audio(audio_file, batch_size=4):
     start_time = time.time()
+    # Convert .m4a to .wav
+    if audio_file.endswith(".m4a"):
+        audio_file = convert_to_wav(audio_file)
+
+    audio_input, sample_rate = sf.read(audio_file)
+    chunk_size = 16000 * 30
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

     transcription = ""
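The diff omits the unchanged lines between transcription = "" and the inputs handling shown next, which presumably iterate over the chunks and featurize each one with the processor. A hedged reconstruction of that elided loop, only so the generate() call below reads in context:

# Hypothetical reconstruction of the elided per-chunk loop (not shown in this commit's context lines).
for chunk in chunks:
    inputs = processor(chunk, sampling_rate=sample_rate, return_tensors="pt")
    # the shown lines below then move `inputs` to the device and call generate()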
         inputs = inputs.to(device)
         attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
         with torch.no_grad():
+            output = transcription_model.generate(
                 inputs.input_features,
+                max_length=2048,  # Increase max_length for longer outputs
                 num_beams=7,
                 task="transcribe",
+                attention_mask=attention_mask,
+                # forced_decoder_ids=None,  # NB: forced_decoder_ids must not be set; left commented out just in case.
+                language="no"
             )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "


     return transcription.strip(), result

+def summarize_text(text):
+    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+    inputs = inputs.to(device)
+    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
+    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    return summary
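Note that summarize_text truncates its input to 1024 tokens, so only the start of a long transcription actually reaches the summarizer. A hedged map-reduce style workaround (split, summarize each piece, then summarize the joined partial summaries); the helper name and the 3000-character chunk size are hypothetical:

# Hypothetical chunked summarization to avoid losing text beyond the 1024-token truncation.
def summarize_long_text(text, chunk_chars=3000):
    pieces = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partial = [summarize_text(p) for p in pieces]
    combined = " ".join(partial)
    return summarize_text(combined) if len(pieces) > 1 else combined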

+# HTML for banner and additional image
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
 </div>
 <div style="text-align: center; margin-top: 20px;">
+    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
 </div>
 """

+# Create Gradio interface
 iface = gr.Blocks()

 with iface:
     gr.HTML(banner_html)
+    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
     audio_input = gr.Audio(type="filepath")
+    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, default=4, label="Batch Size")
+    transcription_output = gr.Textbox()
+    summary_output = gr.Textbox()
+    transcribe_button = gr.Button("Transcribe and Summarize")
+
+    def transcribe_and_summarize(audio_file, batch_size):
+        transcription = transcribe_audio(audio_file, batch_size)
         summary = summarize_text(transcription)
+        return transcription, summary

+    transcribe_button.click(fn=transcribe_and_summarize, inputs=[audio_input, batch_size_input], outputs=[transcription_output, summary_output])
+
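Two small caveats about the wiring above: transcribe_audio in this file returns a (transcription, result) tuple, so the wrapper hands a tuple to summarize_text and to the textbox, and depending on the installed Gradio version the slider's initial value is set with value= rather than default=. A hedged sketch of an unpacking wrapper (not the committed code):

    # Hypothetical replacement for the wrapper above; it would sit inside the `with iface:` block.
    def transcribe_and_summarize_unpacked(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return transcription, summary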

+def save_to_pdf(transcription, summary):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+
+    # Add transcription
+    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
+
+    # Add a space between transcription and summary
+    pdf.ln(10)
+
+    # Add summary
+    pdf.multi_cell(0, 10, "Summary:\n" + summary)
+
+    pdf_output_path = "transcription_summary.pdf"
+    pdf.output(pdf_output_path)
+    return pdf_output_path
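save_to_pdf is defined here but never connected to the interface in this commit. A hedged sketch of how it could be exposed from inside the `with iface:` block above; the pdf_output and pdf_button names are hypothetical:

    # Hypothetical wiring, placed inside the `with iface:` block (not part of the commit).
    pdf_output = gr.File(label="Download PDF")
    pdf_button = gr.Button("Export to PDF")
    pdf_button.click(fn=save_to_pdf, inputs=[transcription_output, summary_output], outputs=pdf_output)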
+
+
+
+# Launch the interface
 iface.launch(share=True, debug=True)
+
+# Launch the interface
+iface.launch(share=True, debug=True)
+
+
+