camparchimedes committed
Commit 440d6b7 · verified · 1 Parent(s): 7ef83d4

Update app.py

Files changed (1): app.py +113 -75

app.py CHANGED
@@ -3,57 +3,84 @@
  import gradio as gr
  import warnings
  import torch
- #from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
  import soundfile as sf
- import ffmpeg
  import os
  from fpdf import FPDF
  import time
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.cluster import KMeans
- import re
-
- import nltk
- from nltk.tokenize import word_tokenize
- from nltk.corpus import stopwords
- import pandas as pd
-
- warnings.filterwarnings("ignore")

  nltk.download('punkt')
- nltk.download('stopwords')
-
- #tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
- #model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
- #processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
-
- generation_config = {
-     "temperature": 0.8,
-     "top_p": 0.9,
-     "top_k": 0.5,
-     "max_output_tokens": 2048
- }


  processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
- model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  torch_dtype = torch.float32

- model.to(device)

- def convert_audio_format(audio_path):
-     output_path = "converted_audio.wav"
-     ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
-     return output_path

  def transcribe_audio(audio_file, batch_size=4):
      start_time = time.time()
-     audio_path = convert_audio_format(audio_file)
-     audio_input, sample_rate = sf.read(audio_path)
-     chunk_size = 16000 * 30
      chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

      transcription = ""
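The chunking kept as context in this hunk splits the decoded audio into 30-second windows at a 16 kHz sample rate before transcription. A minimal illustration of what that list comprehension produces (not part of the commit; the 75-second silent signal is made up for the example):

import numpy as np

sample_rate = 16000
chunk_size = sample_rate * 30                  # 30 s of samples per chunk
audio_input = np.zeros(sample_rate * 75)       # pretend 75 s of mono audio
chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
print([len(c) / sample_rate for c in chunks])  # [30.0, 30.0, 15.0]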
@@ -63,12 +90,14 @@ def transcribe_audio(audio_file, batch_size=4):
          inputs = inputs.to(device)
          attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
          with torch.no_grad():
-             output = model.generate(
                  inputs.input_features,
-                 max_length=2048,
                  num_beams=7,
                  task="transcribe",
-                 attention_mask=attention_mask
              )
          transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

@@ -80,59 +109,68 @@

      return transcription.strip(), result

- def save_to_pdf(transcription):
-     pdf = FPDF()
-     pdf.add_page()
-     pdf.set_font("Arial", size=12)
-     pdf.multi_cell(0, 10, transcription)
-     pdf_output_path = "transcription.pdf"
-     pdf.output(pdf_output_path)
-     return pdf_output_path

- def summarize_text(transcription):
-     sentences = transcription.split(". ")
-     vectorizer = TfidfVectorizer(stop_words='norwegian')
-     X = vectorizer.fit_transform(sentences)
-
-     kmeans = KMeans(n_clusters=1)
-     kmeans.fit(X)
-     avg = X.mean(axis=0)
-     summary = [sentences[i] for i in kmeans.predict(avg)]
-
-     return ". ".join(summary) + "."
-
- # HTML
  banner_html = """
  <div style="text-align: center;">
      <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
  </div>
- """
- image_html = """
  <div style="text-align: center; margin-top: 20px;">
-     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
  </div>
  """

- # Gradio interface
  iface = gr.Blocks()

  with iface:
      gr.HTML(banner_html)
-     gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 𝐀𝟏𝟎𝟎 👋🏼👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file: ")
      audio_input = gr.Audio(type="filepath")
-     batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
-     transcription_output = gr.Textbox(label="Transcription")
-     pdf_output = gr.File(label="Download Transcription as PDF")
-     summary_output = gr.Textbox(label="Summary")
-     transcribe_button = gr.Button("Transcribe")
-
-     def process_audio(audio_file, batch_size):
-         transcription, result = transcribe_audio(audio_file, batch_size)
-         pdf_path = save_to_pdf(transcription)
          summary = summarize_text(transcription)
-         return result, pdf_path, summary

-     transcribe_button.click(fn=process_audio, inputs=[audio_input, batch_size_input], outputs=[transcription_output, pdf_output, summary_output])

- # Launch interface
  iface.launch(share=True, debug=True)
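The summarize_text removed above clustered TF-IDF sentence vectors with KMeans, but scikit-learn has no built-in 'norwegian' stop-word list, and with n_clusters=1 the predict call on the mean vector always returns index 0, so the "summary" was just the first sentence. A minimal corrected sketch of that centroid idea (not part of this commit; the helper name and stop-word handling are illustrative assumptions):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def centroid_summary(transcription, stop_words=None):
    # Score each sentence by its similarity to the mean TF-IDF vector and keep the best one.
    sentences = [s for s in transcription.split(". ") if s.strip()]
    if len(sentences) <= 1:
        return transcription
    X = TfidfVectorizer(stop_words=stop_words).fit_transform(sentences)
    centroid = np.asarray(X.mean(axis=0))        # 1 x vocab mean vector
    scores = np.asarray(X @ centroid.T).ravel()  # similarity of each sentence to the centroid
    return sentences[int(scores.argmax())] + "."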
  import gradio as gr
  import warnings
  import torch
+ from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
+
+ warnings.filterwarnings("ignore")
+
+ # Load tokenizer and model
+ tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+ model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+ processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+
+ # Set up the device
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ torch_dtype = torch.float32
+
+ # Initialize pipeline
+ asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype)
+
+ def transcribe_audio(audio_file):
+     # Perform transcription
+     with torch.no_grad():
+         output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
+     return output["text"]
+
+ # Create Gradio interface
+ iface = gr.Interface(
+     fn=transcribe_audio,
+     inputs=gr.Audio(source="upload", type="filepath"),
+     outputs="text",
+     title="Audio Transcription App",
+     description="Upload an audio file to get the transcription",
+     theme="default",
+     layout="vertical",
+     live=False
+ )import gradio as gr
+ import warnings
+ import torch
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
+ from pydub import AudioSegment
  import soundfile as sf
+ import numpy as np
  import os
+ import nltk
  from fpdf import FPDF
  import time

  nltk.download('punkt')

+ warnings.filterwarnings("ignore")

+ # Load processor and model for transcription
  processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+ transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+
+ # Load tokenizer and model for summarization
+ summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
+ summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")

+ # Set up the device
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  torch_dtype = torch.float32

+ # Move the models to the device
+ transcription_model.to(device)
+ summarization_model.to(device)

+ def convert_to_wav(audio_file):
+     audio = AudioSegment.from_file(audio_file, format="m4a")
+     wav_file = "temp.wav"
+     audio.export(wav_file, format="wav")
+     return wav_file

  def transcribe_audio(audio_file, batch_size=4):
      start_time = time.time()
+     # Convert .m4a to .wav
+     if audio_file.endswith(".m4a"):
+         audio_file = convert_to_wav(audio_file)
+
+     audio_input, sample_rate = sf.read(audio_file)
+     chunk_size = 16000 * 30
      chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

      transcription = ""
          inputs = inputs.to(device)
          attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
          with torch.no_grad():
+             output = transcription_model.generate(
                  inputs.input_features,
+                 max_length=2048,  # increase max_length for longer outputs
                  num_beams=7,
                  task="transcribe",
+                 attention_mask=attention_mask,
+                 # forced_decoder_ids=None,  # NB: forced_decoder_ids must not be set; left commented out just in case
+                 language="no"
              )
          transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "


      return transcription.strip(), result

+ def summarize_text(text):
+     inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+     inputs = inputs.to(device)
+     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
+     summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return summary

+ # HTML for banner and additional image
  banner_html = """
  <div style="text-align: center;">
      <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
  </div>
  <div style="text-align: center; margin-top: 20px;">
+     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
  </div>
  """

+ # Create Gradio interface
  iface = gr.Blocks()

  with iface:
      gr.HTML(banner_html)
+     gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
      audio_input = gr.Audio(type="filepath")
+     batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, default=4, label="Batch Size")
+     transcription_output = gr.Textbox()
+     summary_output = gr.Textbox()
+     transcribe_button = gr.Button("Transcribe and Summarize")
+
+     def transcribe_and_summarize(audio_file, batch_size):
+         transcription = transcribe_audio(audio_file, batch_size)
          summary = summarize_text(transcription)
+         return transcription, summary

+     transcribe_button.click(fn=transcribe_and_summarize, inputs=[audio_input, batch_size_input], outputs=[transcription_output, summary_output])
+

+ def save_to_pdf(transcription, summary):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     # Add transcription
+     pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
+
+     # Add a space between transcription and summary
+     pdf.ln(10)
+
+     # Add summary
+     pdf.multi_cell(0, 10, "Summary:\n" + summary)
+
+     pdf_output_path = "transcription_summary.pdf"
+     pdf.output(pdf_output_path)
+     return pdf_output_path
+
+
+ # Launch the interface
  iface.launch(share=True, debug=True)
+
+ # Launch the interface
+ iface.launch(share=True, debug=True)
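As committed, transcribe_audio still returns a (transcription, result) tuple, the Blocks callback passes that whole tuple to summarize_text, and the new save_to_pdf is defined but never wired to an output. A minimal sketch of how the callback could be adjusted (not part of the commit; the gr.File output is an illustrative assumption, reusing the function names from the new app.py):

def transcribe_and_summarize(audio_file, batch_size):
    transcription, result = transcribe_audio(audio_file, batch_size)  # unpack the tuple
    summary = summarize_text(transcription)
    pdf_path = save_to_pdf(transcription, summary)                    # reuse the new save_to_pdf
    return transcription, summary, pdf_path

# Inside `with iface:` a file component would then be declared and passed as a third output:
# pdf_output = gr.File(label="Download PDF")
# transcribe_button.click(fn=transcribe_and_summarize,
#                         inputs=[audio_input, batch_size_input],
#                         outputs=[transcription_output, summary_output, pdf_output])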