gngpostalsrvc committed
Commit f2ba1f6
1 parent: bccf3ee

updated application file

Files changed (1): app.py +8 -2
app.py CHANGED
@@ -10,6 +10,7 @@ asr = pipeline('automatic-speech-recognition', model='facebook/wav2vec2-large-96
 tokenizer = RobertaTokenizerFast.from_pretrained("arpanghoshal/EmoRoBERTa")
 model = TFRobertaForSequenceClassification.from_pretrained("arpanghoshal/EmoRoBERTa")
 emo = pipeline('sentiment-analysis', model='arpanghoshal/EmoRoBERTa')
+pos = pipeline("token-classification", model="vblagoje/bert-english-uncased-finetuned-pos")
 
 def transcribe_and_describe(audio):
 
@@ -17,9 +18,13 @@ def transcribe_and_describe(audio):
 
     text = asr(audio)['text']
 
+    tagged_text = pos(text)
+    filler_words = [entry['word'] for entry in tagged_text if entry['entity'] == 'INTJ']
+    filler_word_pr = len(filler_words) / len(tagged_text)
+
     flatness = pd.DataFrame(librosa.feature.spectral_flatness(y=audio).T).describe().T
     loudness = pd.DataFrame(librosa.feature.rms(audio).T).describe().T
-    time, frequency, confidence, activation = crepe.predict(audio, sr)
+    time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
     frequency = pd.DataFrame(frequency.T).describe().T
 
     mean_spectral_flatness = flatness.loc[0, 'mean']
@@ -33,13 +38,14 @@ def transcribe_and_describe(audio):
 
     emotion = emo(text)[0]['label']
 
-    return (text, words_per_minute, mean_pitch, pitch_std, mean_volume, volume_std, mean_spectral_flatness, spectral_flatness_std, emotion)
+    return (text, f"{filler_word_pr:.2f}", f"{words_per_minute:.2f}", f"{mean_pitch:.2f}", f"{pitch_std:.2f}", f"{mean_volume:.2f}", f"{volume_std:.2f}", f"{mean_spectral_flatness:.2f}", f"{spectral_flatness_std:.2f}", emotion)
 
 gr.Interface(
     fn=transcribe_and_describe,
     inputs=gr.Audio(source="microphone", type="filepath"),
     outputs=[
         gr.Text(label="Transcription"),
+        gr.Text(label="Filler Word Percent"),
         gr.Text(label="Rate of Speech (WPM)"),
         gr.Text(label="Mean Pitch (Hz)"),
         gr.Text(label="Pitch Variation (Hz)"),