# ----------------- COPY OF NEW EDITION [app.py] -----------------
# TODO: check whether this transformers warning still appears:
# "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
import time
import os
import spaces
import contextlib
import warnings
warnings.filterwarnings("ignore")
from pydub import AudioSegment

# If the audio is m4a, convert it to wav first
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
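# Note: pydub relies on ffmpeg for m4a decoding, so ffmpeg must be available in the environment.
# Hypothetical usage: wav_path = convert_to_wav("recording.m4a")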
import torch
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
# Initialize processor and pipeline
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
language = "no"
task = "transcribe"
@spaces.GPU(queue=True)
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

    # build forced_decoder_ids in the correct context
    forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)

    with torch.no_grad():
        # run on CUDA when available, otherwise fall back to a no-op context
        with torch.cuda.device(device) if torch.cuda.is_available() else contextlib.nullcontext():
            output = pipe(audio_file, chunk_length_s=30, generate_kwargs={"forced_decoder_ids": forced_decoder_ids})
            text = output["text"]

    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(text.split())
    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
    return text, result
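# Quick local smoke test (hypothetical file path, not part of the app flow):
#   text, stats = transcribe_audio("sample.wav")
#   print(stats)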
# [VERSION 3: full-on w/ 3 styles for summarization]
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re
nltk.download('punkt')
nltk.download('stopwords')
WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))  # raw strings avoid invalid-escape warnings; currently unused below
def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', str(text))
    text = re.sub(r'&amp;', '', str(text))
    text = re.sub(r'\(\s+', '(', str(text))  # collapse whitespace after an opening parenthesis (presumably \s was intended)
    text = re.sub(r'\s+\)', ')', str(text))  # collapse whitespace before a closing parenthesis
    text = re.sub(r'\(\)', '', str(text))
    text = re.sub(r'\s+', ' ', str(text))
    text = re.sub(r'[_"\-;%|+&=*%!?:#$@\[\]]', ' ', str(text))
    text = re.sub(r'<br />', ' ', str(text))
    text = re.sub(r'\'', '', str(text))
    text = re.sub(r'«', '', str(text))
    text = re.sub(r'»', '', str(text))
    text = re.sub(r'–', '-', str(text))
    text = re.sub(r'…', '.', str(text))
    text = re.sub(r'[^\x00-\x7F]+', ' ', str(text))  # note: this also strips Norwegian characters (å, ø, æ)
    return text
def preprocess_text(text):
    try:
        words = word_tokenize(text)
        stop_words = set(stopwords.words('norwegian'))
        words_without_stopwords = [word for word in words if word.lower() not in stop_words]
        processed_text = ' '.join(words_without_stopwords)
        return processed_text
    except Exception as e:
        # st.error is a Streamlit call; this app uses Gradio, so just log the error
        print(f"Error during text preprocessing: {e}")
        return None
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model.to(device)

@spaces.GPU(queue=True)
def summarize_text(text):
    preprocessed_text = preprocess_text(text)
    if preprocessed_text is None:
        return None
    # T5 expects a task prefix; use the stopword-filtered text rather than the raw input
    inputs = summarization_tokenizer(["summarize: " + preprocessed_text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
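# Note: t5-base is trained mainly on English text, so abstractive quality on Norwegian
# transcripts may be limited. Hypothetical usage: summary = summarize_text(transcribed_text)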
def build_similarity_matrix(sentences, stop_words):
    # stop_words is unused here; the caller filters tokens beforehand
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix
def graph_based_summary(text, num_paragraphs=3):
    sentences = text.strip().split(".")
    if len(sentences) < num_paragraphs:
        return text
    sentence_tokens = [word_tokenize(sent) for sent in sentences]
    stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in sentence_tokens]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)
    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
    summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
    return ' '.join(summary)  # return a single string so it displays cleanly in the Textbox
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return text
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    # zero out similarities below the threshold
    for i in range(len(similarity_matrix)):
        for j in range(len(similarity_matrix[i])):
            if similarity_matrix[i][j] < threshold:
                similarity_matrix[i][j] = 0.0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
    return ' '.join(summary)  # return a single string so it displays cleanly in the Textbox
def text_rank_summary(text, num_paragraphs=3):
    sentences = nltk.sent_tokenize(text)
    if len(sentences) < num_paragraphs:
        return text
    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)
    nx_graph = nx.from_numpy_array(similarity_matrix)  # graph where nodes are sentences and edge weights are similarity scores
    scores = nx.pagerank(nx_graph)  # PageRank scores each sentence
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)  # rank by PageRank score
    summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]  # top sentences form the summary
    return ' '.join(summary)
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/Olas%20AudioSwitch%20Shop.png" alt="Banner Image" width="100%" height="auto">
</div>
"""
import gradio as gr
from fpdf import FPDF
from PIL import Image
def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    if text:
        pdf.multi_cell(0, 10, "Text:\n" + text)
    # paragraph space
    pdf.ln(10)
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
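# Hypothetical usage: pdf_path = save_to_pdf(transcribed_text, summary_text)
# Note: with classic FPDF core fonts the text must be latin-1 encodable; Norwegian
# characters (å, ø, æ) fit, but other Unicode symbols may raise an encoding error.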
# Simple single-tab Interface (not launched; superseded by the Blocks UI below)
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(label="Transcription"),
    title="SW Transcription App",
    description="Upload an audio file to get the text",
    theme="default",
    live=False
)
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    # ("We now have the ability to turn audio files into Norwegian text.")
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            transcribe_button.click(
                fn=transcribe_audio,
                inputs=[audio_input],
                outputs=[text_output, result_output]
            )
with gr.TabItem("Summary_t1"):
summary_output = gr.Textbox(label="Summary | Graph-based")
summarize_button = gr.Button("Summarize")
def summarize(text):
if not text:
return "Warning: a text must be available."
summary = graph_based_summary(text)
return summary
summarize_button.click(
fn=summarize,
inputs=[text_output],
outputs=summary_output
)
with gr.TabItem("LexRank"):
summary_output = gr.Textbox(label="Summary | LexRank")
summarize_button = gr.Button("Summarize")
def summarize(text):
if not text:
return "Warning: a text must be available."
summary = lex_rank_summary(text)
return summary
summarize_button.click(
fn=summarize,
inputs=[text_output],
outputs=summary_output
)
with gr.TabItem("TextRank"):
summary_output = gr.Textbox(label="Summary | TextRank")
summarize_button = gr.Button("Summarize")
def summarize(text):
if not text:
return "Warning: a text must be available."
summary = text_rank_summary(text)
return summary
summarize_button.click(
fn=summarize,
inputs=[text_output],
outputs=summary_output
)
with gr.TabItem("Download PDF"):
pdf_text_only = gr.Button("Download PDF with text Only")
pdf_summary_only = gr.Button("Download PDF with Summary Only")
pdf_both = gr.Button("Download PDF with Both")
pdf_output_text_only = gr.File(label="Download PDF")
pdf_output_summary_only = gr.File(label="Download PDF")
pdf_output_both = gr.File(label="Download PDF")
def generate_pdf_text_only(text):
return save_to_pdf(text, "")
def generate_pdf_summary_only(summary):
return save_to_pdf("", summary)
def generate_pdf_both(text, summary):
return save_to_pdf(text, summary)
pdf_text_only.click(
fn=generate_pdf_text_only,
inputs=[text_output],
outputs=[pdf_output_text_only]
)
pdf_summary_only.click(
fn=generate_pdf_summary_only,
inputs=[summary_output],
outputs=[pdf_output_summary_only]
)
pdf_both.click(
fn=generate_pdf_both,
inputs=[text_output, summary_output],
outputs=[pdf_output_both]
)
iface.launch(share=True, debug=True)