# app.py
import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
warnings.filterwarnings("ignore")
# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# Initialize pipeline
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch_dtype,
)
def transcribe_audio(audio_file):
    # Perform transcription
    with torch.no_grad():
        output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
    return output["text"]
# Create Gradio interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False
)
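# NB: the simple Interface above is superseded further down; `iface` is
# reassigned to a gr.Blocks UI (with summarization and a batch-size slider),
# and that is the interface that launch() actually serves.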
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import os
import nltk
from fpdf import FPDF
import time
nltk.download('punkt')
HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
# transcription
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
# summarization
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")
# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# move models to the selected device
transcription_model.to(device)
summarization_model.to(device) # PS. model needs to be told to use graph-based summary method (Lexname?)
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
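# Note: the chunking in transcribe_audio below assumes 16 kHz audio (chunk_size = 16000 * 30)
# and the processor is called with sampling_rate=16000, while convert_to_wav keeps the source
# sample rate. If inputs are not already 16 kHz mono, a resampling step would keep that
# assumption valid, e.g. a sketch using pydub (not part of the original code):
#
#     audio = audio.set_frame_rate(16000).set_channels(1)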
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = transcription_model.generate(
                inputs.input_features,
                max_length=2048,  # Increase max_length for longer outputs
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                # forced_decoder_ids=None,  # NB: forced_decoder_ids must not be set; kept commented out just in case
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
def summarize_text(text):
    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
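# The nltk import and punkt download above are otherwise unused; they match the note
# on summarization_model about switching to an extractive/graph-based method. Below is
# a minimal sketch of a frequency-based extractive fallback (not a true graph-based
# summarizer); the function name and scoring are illustrative, not part of the original app.
def extractive_summary(text, num_sentences=3):
    from collections import Counter
    sentences = nltk.sent_tokenize(text, language="norwegian")
    words = [w for w in nltk.word_tokenize(text.lower(), language="norwegian") if w.isalpha()]
    freqs = Counter(words)
    # Score each sentence by the summed frequency of its words
    scores = {
        s: sum(freqs[w] for w in nltk.word_tokenize(s.lower(), language="norwegian"))
        for s in sentences
    }
    top = set(sorted(sentences, key=scores.get, reverse=True)[:num_sentences])
    # Return the selected sentences in their original order
    return " ".join(s for s in sentences if s in top)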
# HTML syntax for imagery
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""
# Gradio UI
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")  # 'value' (not 'default') in Gradio 3+
    transcription_output = gr.Textbox()
    summary_output = gr.Textbox()
    transcribe_button = gr.Button("Transcribe and Summarize")

    def transcribe_and_summarize(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return result, summary

    transcribe_button.click(
        fn=transcribe_and_summarize,
        inputs=[audio_input, batch_size_input],
        outputs=[transcription_output, summary_output],
    )
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # paragraph space
    pdf.ln(10)
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
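# A possible wiring of save_to_pdf into the Blocks UI above (a sketch, not part of
# the original app; the component names below are hypothetical). Gradio Blocks can
# be re-entered before launch() to add components:
#
#     with iface:
#         pdf_output = gr.File(label="Download PDF")
#         save_pdf_button = gr.Button("Save to PDF")
#         save_pdf_button.click(
#             fn=save_to_pdf,
#             inputs=[transcription_output, summary_output],
#             outputs=pdf_output,
#         )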
# run
iface.launch(share=True, debug=True)