NLP / app.py
krishnasai99's picture
Update app.py
d675209
raw
history blame
2.82 kB
import streamlit as st
import soundfile as sf
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor , pipeline , Wav2Vec2ForCTC , Wav2Vec2Tokenizer
import torch
import spacy
from spacy import displacy
# Streamlit app: upload an audio file, transcribe it with a CTC speech model,
# then optionally summarize it, run sentiment analysis on it, or highlight
# named entities in the transcript.

# NOTE(review): the processor and the acoustic model are loaded from two
# different checkpoints. The original code did this, so it is preserved here;
# confirm whether the processor should come from the HuBERT checkpoint too.
PROCESSOR_CHECKPOINT = "facebook/wav2vec2-base-960h"
MODEL_CHECKPOINT = "facebook/hubert-large-ls960-ft"
# Sample rate requested from librosa when decoding the upload.
TARGET_SAMPLE_RATE = 16000


def _transcribe(uploaded_audio):
    """Return the decoded transcription of *uploaded_audio*.

    Args:
        uploaded_audio: The object returned by ``st.file_uploader``; it is
            handed to ``librosa.load`` unchanged.

    Returns:
        The result of ``processor.batch_decode`` on the arg-max token ids
        (the callers below treat it as a list of strings).
    """
    # NOTE(review): both models are re-loaded on every call, i.e. on every
    # button click. Consider Streamlit's resource caching if the deployed
    # Streamlit version provides it.
    processor = Wav2Vec2Processor.from_pretrained(PROCESSOR_CHECKPOINT)
    model = HubertForCTC.from_pretrained(MODEL_CHECKPOINT)

    speech, rate = librosa.load(uploaded_audio, sr=TARGET_SAMPLE_RATE)
    input_values = processor(
        speech,
        return_tensors="pt",
        padding="longest",
        sampling_rate=rate,
    ).input_values

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)


def _audio_is_uploaded(uploaded_audio):
    """Return True if a file was uploaded; otherwise show an error and return False."""
    if uploaded_audio is None:
        st.error('please upload the audio file')
        return False
    return True


st.title('Audio-to-Text')
audio_file = st.file_uploader('Upload Audio', type=['wav', 'mp3', 'm4a'])

if st.button('Trascribe Audio'):
    if _audio_is_uploaded(audio_file):
        st.write(_transcribe(audio_file))

if st.button('Summarize'):
    # Guard added: previously this crashed when no file had been uploaded.
    if _audio_is_uploaded(audio_file):
        text = _transcribe(audio_file)
        summarize = pipeline("summarization")
        st.write(summarize(text))

if st.button('sentiment-analysis'):
    if _audio_is_uploaded(audio_file):
        text = _transcribe(audio_file)
        nlp_sa = pipeline("sentiment-analysis")
        st.write(nlp_sa(text))

if st.button('Name'):
    if _audio_is_uploaded(audio_file):
        # Join the decoded pieces into one string for spaCy (the original
        # bound this to the name ``str``, shadowing the builtin).
        transcript = ''.join(_transcribe(audio_file))
        trf = spacy.load('en_core_web_trf')
        doc = trf(transcript)
        # Ask displaCy for the markup as a string and show it in the page;
        # the original printed it to stdout, where the user never saw it.
        entity_html = displacy.render(doc, style='ent', jupyter=False)
        st.markdown(entity_html, unsafe_allow_html=True)