File size: 6,082 Bytes
d41c42c
 
 
 
 
 
 
27999b6
 
 
 
 
 
 
cb23395
 
27999b6
d41c42c
 
 
 
 
cb23395
27999b6
 
cb23395
d41c42c
 
 
 
 
 
 
 
27999b6
 
 
d41c42c
 
 
 
 
 
 
 
 
 
 
 
27999b6
 
cb23395
27999b6
d41c42c
cb23395
d41c42c
 
 
 
 
 
 
27999b6
 
d41c42c
27999b6
d41c42c
cb23395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d41c42c
 
 
 
 
 
 
27999b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb23395
27999b6
 
 
 
 
 
 
 
 
 
 
 
cb23395
27999b6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import streamlit as st 
import soundfile as sf
import librosa
from transformers import HubertForCTC, Wav2Vec2Processor , pipeline , Wav2Vec2ForCTC , Wav2Vec2Tokenizer
import torch
import spacy 
from spacy import displacy
import en_core_web_sm
import spacy.cli
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk import tokenize
nltk.download('punkt')
import spacy_streamlit
from datasets import load_dataset
from transformers import pipeline


# Page header: app title shown at the top of the Streamlit page.
st.title('Audio-to-Text')

# Single uploaded audio clip, shared by every task button below.
# NOTE(review): value is None until the user uploads a file — each task
# button is expected to check for that before processing.
audio_file = st.file_uploader('Upload Audio' , type=['wav' , 'mp3','m4a'])

st.subheader( 'Please select any of the NLP tasks')


if st.button('Audio Transcription'):
    # Transcribe the uploaded clip with a HuBERT CTC head and display the text.
    if audio_file is None:
        st.error('please upload the audio file')
    else:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the ASR models were trained on.
        waveform, sample_rate = librosa.load(audio_file, sr=16000)
        encoded = processor(waveform, return_tensors="pt", padding="longest", sampling_rate=sample_rate)
        # Greedy CTC decoding: take the argmax token at every frame.
        pred_ids = torch.argmax(model(encoded.input_values).logits, dim=-1)
        decoded = processor.batch_decode(pred_ids)
        transcription = ' '.join(str(piece) for piece in decoded)
        st.markdown(transcription)


if st.button('Summarize'):
    # Transcribe the uploaded audio, then summarize the transcript with BART.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the ASR models were trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decoding: argmax token per frame, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        summarize = pipeline("summarization" , model='facebook/bart-large-cnn')
        st.markdown(summarize(result)[0]['summary_text'])
    else:
        # Fix: guard against clicking the button before uploading a file —
        # previously this crashed inside librosa.load(None).
        st.error('please upload the audio file')

if st.button('Sentiment Analysis'):
    # Transcribe the uploaded audio, then run sentiment analysis on the text.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the ASR models were trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decoding: argmax token per frame, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        nlp_sa = pipeline("sentiment-analysis")
        st.markdown(nlp_sa(result))
    else:
        # Fix: guard against clicking the button before uploading a file —
        # previously this crashed inside librosa.load(None).
        st.error('please upload the audio file')


if st.button('Audio Classification'):
    # Classify the uploaded audio with a SUPERB emotion-recognition model.
    if audio_file is not None:
        # Resample to 16 kHz, the rate the wav2vec2 classifier expects.
        speech, rate = librosa.load(audio_file, sr=16000)
        classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
        # Fix: previously a fixed sample from the anton-l/superb_demo dataset
        # was classified (the uploaded file and its transcription were computed
        # and ignored); classify the uploaded waveform itself instead.
        labels = classifier(speech, top_k=5)
        st.markdown(labels)
    else:
        # Guard against clicking the button before uploading a file —
        # previously this crashed inside librosa.load(None).
        st.error('please upload the audio file')



if st.button('Name Entity Recognition'):
    # Transcribe the uploaded audio, then visualize spaCy entities in the text.
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the ASR models were trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decoding: argmax token per frame, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(result)
        spacy_streamlit.visualize_ner(doc, labels=nlp.get_pipe("ner").labels, title="List of Entities")
    else:
        # Fix: guard against clicking the button before uploading a file —
        # previously this crashed inside librosa.load(None).
        st.error('please upload the audio file')


# Shared T5 translation resources for the Translate button below.
# Both loads are cached so Streamlit reruns don't re-download/rebuild them.

@st.cache(allow_output_mutation=True)
def load_tokenizer():
    """Return the t5-base tokenizer (cached across Streamlit reruns)."""
    return AutoTokenizer.from_pretrained("t5-base")

# Fix: the tokenizer was previously loaded uncached at module level on every
# rerun, while the model was cached — cache both for consistency.
tokenizer = load_tokenizer()

@st.cache(allow_output_mutation=True)
def load_model():
    """Return the t5-base seq2seq model (cached across Streamlit reruns)."""
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    return model

model1 = load_model()

st.subheader('Select your source and target language below.')
source_lang = st.selectbox("Source language",['English'])
target_lang = st.selectbox("Target language",['German','French'])


if st.button('Translate'):
    # Transcribe the uploaded audio, then translate it sentence-by-sentence
    # with T5 using the task prefix "translate <src> to <tgt>".
    if audio_file is not None:
        processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
        # Resample to 16 kHz, the rate the ASR models were trained on.
        speech, rate = librosa.load(audio_file, sr=16000)
        input_values = processor(speech, return_tensors="pt", padding="longest", sampling_rate=rate).input_values
        logits = model(input_values).logits
        # Greedy CTC decoding: argmax token per frame, then detokenize.
        predicted_ids = torch.argmax(logits, dim=-1)
        text = processor.batch_decode(predicted_ids)
        result = ' '.join(str(sentence) for sentence in text)
        # T5 task prefix, e.g. "translate English to German".
        prefix = 'translate '+str(source_lang)+' to '+str(target_lang)
        # Translate per sentence so each stays within the model's context.
        sentence_token = tokenize.sent_tokenize(result)
        output = tokenizer([prefix+sentence for sentence in sentence_token], padding=True, return_tensors="pt")
        translated_id = model1.generate(output["input_ids"], attention_mask=output['attention_mask'], max_length=10000)
        translated_word = tokenizer.batch_decode(translated_id, skip_special_tokens=True)
        st.subheader('Translated Text')
        st.write(' '.join(translated_word))
    else:
        # Fix: guard against clicking the button before uploading a file —
        # previously this crashed inside librosa.load(None).
        st.error('please upload the audio file')