"""Streamlit app: transcribe an uploaded audio file and run NLP tasks on it.

The page offers six actions, each triggered by its own button:
transcription, summarization, sentiment analysis, audio (emotion)
classification, named-entity recognition and English -> German/French
translation.  Every text-based action first transcribes the uploaded audio
with a CTC speech-recognition model and then feeds the transcript to the
task-specific model.
"""

import en_core_web_sm
import librosa
import nltk
import soundfile as sf
import spacy
import spacy.cli
import spacy_streamlit
import streamlit as st
import torch
from datasets import load_dataset
from nltk import tokenize
from spacy import displacy
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    HubertForCTC,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2Tokenizer,
    pipeline,
)

# Sentence tokenizer data used by ``tokenize.sent_tokenize`` in the
# translation task.
nltk.download('punkt')

# NOTE(review): the processor checkpoint differs from the acoustic-model
# checkpoint.  Kept exactly as in the original script; confirm whether the
# processor should instead be loaded from ASR_MODEL_NAME.
ASR_PROCESSOR_NAME = "facebook/wav2vec2-base-960h"
ASR_MODEL_NAME = "facebook/hubert-large-ls960-ft"

# Sampling rate the audio is resampled to before it is given to the models.
TARGET_SAMPLE_RATE = 16000


@st.cache(allow_output_mutation=True)
def _load_asr():
    """Load the speech-recognition processor and model once and reuse them."""
    processor = Wav2Vec2Processor.from_pretrained(ASR_PROCESSOR_NAME)
    model = HubertForCTC.from_pretrained(ASR_MODEL_NAME)
    return processor, model


@st.cache(allow_output_mutation=True)
def _load_pipeline(task, model_name=None):
    """Build a transformers pipeline once per (task, model_name) pair.

    When ``model_name`` is None the pipeline's default model for ``task``
    is used.
    """
    if model_name is None:
        return pipeline(task)
    return pipeline(task, model=model_name)


@st.cache(allow_output_mutation=True)
def _load_spacy():
    """Load the spaCy English pipeline once and reuse it."""
    return spacy.load('en_core_web_sm')


@st.cache(allow_output_mutation=True)
def load_model():
    """Load the T5 sequence-to-sequence model used for translation."""
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    return model


def _load_waveform(uploaded_file):
    """Decode ``uploaded_file`` to a mono waveform at TARGET_SAMPLE_RATE."""
    # The same upload may be decoded by several tasks; rewind so a previous
    # read does not leave the stream at end-of-file.
    uploaded_file.seek(0)
    speech, _ = librosa.load(uploaded_file, sr=TARGET_SAMPLE_RATE)
    return speech


def _transcribe(uploaded_file):
    """Return the transcript of ``uploaded_file`` as a single string."""
    processor, model = _load_asr()
    speech = _load_waveform(uploaded_file)
    input_values = processor(
        speech,
        return_tensors="pt",
        padding="longest",
        sampling_rate=TARGET_SAMPLE_RATE,
    ).input_values
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(predicted_ids)
    return ' '.join(str(sentence) for sentence in decoded)


def _has_audio(uploaded_file):
    """Report an error in the UI and return False when nothing is uploaded."""
    if uploaded_file is None:
        st.error('please upload the audio file')
        return False
    return True


def _transcript_or_none(uploaded_file):
    """Transcribe the upload, or return None after informing the user.

    None is returned when no file has been uploaded or when the transcript
    is blank, so that downstream models are never called without text.
    """
    if not _has_audio(uploaded_file):
        return None
    transcript = _transcribe(uploaded_file)
    if not transcript.strip():
        st.warning('no speech could be transcribed from the audio file')
        return None
    return transcript


st.title('Audio-to-Text')

audio_file = st.file_uploader('Upload Audio', type=['wav', 'mp3', 'm4a'])

st.subheader('Please select any of the NLP tasks')

if st.button('Audio Transcription'):
    result = _transcript_or_none(audio_file)
    if result is not None:
        st.markdown(result)

if st.button('Summarize'):
    result = _transcript_or_none(audio_file)
    if result is not None:
        summarize = _load_pipeline("summarization", 'facebook/bart-large-cnn')
        st.markdown(summarize(result)[0]['summary_text'])

if st.button('Sentiment Analysis'):
    result = _transcript_or_none(audio_file)
    if result is not None:
        nlp_sa = _load_pipeline("sentiment-analysis")
        st.markdown(nlp_sa(result))

if st.button('Audio Classification'):
    if _has_audio(audio_file):
        # Classify the uploaded recording itself.  The pipeline is given the
        # raw waveform, already resampled to TARGET_SAMPLE_RATE.
        speech = _load_waveform(audio_file)
        classifier = _load_pipeline(
            "audio-classification", "superb/wav2vec2-base-superb-er"
        )
        labels = classifier(speech, top_k=5)
        st.markdown(labels)

if st.button('Name Entity Recognition'):
    result = _transcript_or_none(audio_file)
    if result is not None:
        nlp = _load_spacy()
        doc = nlp(result)
        spacy_streamlit.visualize_ner(
            doc,
            labels=nlp.get_pipe("ner").labels,
            title="List of Entities",
        )

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model1 = load_model()

st.subheader('Select your source and target language below.')
source_lang = st.selectbox("Source language", ['English'])
target_lang = st.selectbox("Target language", ['German', 'French'])

if st.button('Translate'):
    result = _transcript_or_none(audio_file)
    if result is not None:
        # T5 task prefix; the trailing ": " separates the instruction from
        # the text to translate.
        prefix = f'translate {source_lang} to {target_lang}: '
        sentence_token = tokenize.sent_tokenize(result)
        output = tokenizer(
            [prefix + sentence for sentence in sentence_token],
            padding=True,
            return_tensors="pt",
        )
        translated_id = model1.generate(
            output["input_ids"],
            attention_mask=output['attention_mask'],
            max_length=10000,
        )
        translated_word = tokenizer.batch_decode(
            translated_id, skip_special_tokens=True
        )
        st.subheader('Translated Text')
        st.write(' '.join(translated_word))