Spaces:
Runtime error
Runtime error
File size: 6,082 Bytes
d41c42c 27999b6 cb23395 27999b6 d41c42c cb23395 27999b6 cb23395 d41c42c 27999b6 d41c42c 27999b6 cb23395 27999b6 d41c42c cb23395 d41c42c 27999b6 d41c42c 27999b6 d41c42c cb23395 d41c42c 27999b6 cb23395 27999b6 cb23395 27999b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import streamlit as st
import soundfile as sf
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor , pipeline , Wav2Vec2ForCTC , Wav2Vec2Tokenizer
import torch
import spacy
from spacy import displacy
import en_core_web_sm
import spacy.cli
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk import tokenize
nltk.download('punkt')
import spacy_streamlit
from datasets import load_dataset
from transformers import pipeline
# --- Streamlit UI: page title, audio upload widget, and task prompt ---
st.title('Audio-to-Text')
# Uploaded file handle; consumed later by librosa.load in each task branch.
# NOTE(review): m4a support depends on the audio backend available to
# librosa/soundfile — confirm in the deployment environment.
audio_file = st.file_uploader('Upload Audio' , type=['wav' , 'mp3','m4a'])
st.subheader( 'Please select any of the NLP tasks')
# Task 1: plain speech-to-text transcription of the uploaded audio.
if st.button('Audio Transcription'):
    if audio_file is not None:
        # Feature extractor + CTC vocabulary come from the Wav2Vec2 processor;
        # the acoustic model itself is HuBERT fine-tuned on LibriSpeech 960h.
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the acoustic model expects.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        # Greedy CTC decoding: argmax over the vocabulary at each frame.
        predicted_ids = torch.argmax(model(input_values).logits, dim=-1)
        decoded = processor.batch_decode(predicted_ids)
        result = ' '.join(str(piece) for piece in decoded)
        st.markdown(result)
    else:
        st.error('please upload the audio file')
# Task 2: transcribe the uploaded audio, then summarize the transcript.
if st.button('Summarize'):
    # FIX: guard against a missing upload — librosa.load(None) raises.
    # The Audio Transcription branch already performs this check; this
    # makes the branches consistent.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to the 16 kHz rate the acoustic model was trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decode of the frame-wise argmax ids.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        summary_list = [str(sentence) for sentence in text]
        result = ' '.join(summary_list)
        # Abstractive summarization of the transcript with BART-large-CNN.
        summarize = pipeline("summarization" , model='facebook/bart-large-cnn')
        st.markdown(summarize(result)[0]['summary_text'])
    else:
        st.error('please upload the audio file')
# Task 3: transcribe the uploaded audio, then run sentiment analysis on it.
if st.button('Sentiment Analysis'):
    # FIX: guard against a missing upload — librosa.load(None) raises.
    # Mirrors the check in the Audio Transcription branch.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to the 16 kHz rate the acoustic model was trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decode of the frame-wise argmax ids.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        summary_list = [str(sentence) for sentence in text]
        result = ' '.join(summary_list)
        # Default sentiment-analysis pipeline (distilbert SST-2 checkpoint).
        nlp_sa = pipeline("sentiment-analysis")
        st.markdown(nlp_sa(result))
    else:
        st.error('please upload the audio file')
# Task 4: emotion classification of the uploaded audio.
if st.button('Audio Classification'):
    # FIX 1: guard against a missing upload — librosa.load(None) raises.
    if audio_file is not None:
        # FIX 2: classify the *uploaded* audio. The original code ran a full
        # ASR pass whose result was never used, then classified
        # dataset[0]["file"] from the anton-l/superb_demo dataset — i.e. it
        # always labeled a demo clip, ignoring the user's file. The dead
        # ASR pass and the dataset download are removed.
        speech, rate = librosa.load(audio_file, sr=16000)  # 16 kHz, as the model expects
        classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
        # Top-5 emotion labels with scores for the uploaded waveform.
        labels = classifier(speech, top_k=5)
        st.markdown(labels)
    else:
        st.error('please upload the audio file')
# Task 5: transcribe the uploaded audio, then visualize named entities.
if st.button('Name Entity Recognition'):
    # FIX: guard against a missing upload — librosa.load(None) raises.
    # Mirrors the check in the Audio Transcription branch.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to the 16 kHz rate the acoustic model was trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decode of the frame-wise argmax ids.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        summary_list = [str(sentence) for sentence in text]
        result = ' '.join(summary_list)
        # spaCy NER over the transcript, rendered with spacy-streamlit.
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(result)
        spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe("ner").labels, title= "List of Entities")
    else:
        st.error('please upload the audio file')
# Translation setup: T5 tokenizer, cached model loader, and language pickers.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

@st.cache(allow_output_mutation=True)
def load_model():
    """Load the t5-base seq2seq model once per session (cached by Streamlit)."""
    return AutoModelForSeq2SeqLM.from_pretrained("t5-base")

model1 = load_model()

st.subheader('Select your source and target language below.')
source_lang = st.selectbox("Source language", ['English'])
target_lang = st.selectbox("Target language", ['German', 'French'])
# Task 6: transcribe the uploaded audio, then translate it sentence by
# sentence with T5.
if st.button('Translate'):
    # FIX 1: guard against a missing upload — librosa.load(None) raises.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to the 16 kHz rate the acoustic model was trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decode of the frame-wise argmax ids.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        summary_list = [str(sentence) for sentence in text]
        result = ' '.join(summary_list)
        # FIX 2: T5's translation prompt format is
        # "translate <src> to <tgt>: <sentence>" — the original omitted the
        # ": " separator, fusing the target language name directly onto the
        # first word of the sentence.
        prefix = 'translate ' + str(source_lang) + ' to ' + str(target_lang) + ': '
        # Translate per sentence so each stays within the model's context.
        sentence_token = tokenize.sent_tokenize(result)
        output = tokenizer([prefix + sentence for sentence in sentence_token], padding=True, return_tensors="pt")
        translated_id = model1.generate(output["input_ids"], attention_mask=output['attention_mask'], max_length=10000)
        translated_word = tokenizer.batch_decode(translated_id, skip_special_tokens=True)
        st.subheader('Translated Text')
        st.write(' '.join(translated_word))
    else:
        st.error('please upload the audio file')
|