birgermoell committed on
Commit
e6a9b5c
1 Parent(s): 993f0db

Updated pataka working with syllables

Browse files
Files changed (2) hide show
  1. app.py +24 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
  import numpy as np
5
  import soundfile as sf
6
  import io
 
7
 
8
  st.title("Syllables per Second Calculator")
9
  st.write("Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second.")
@@ -12,8 +13,17 @@ def get_syllables_per_second(audio_file):
12
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
13
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
14
 
15
- audio_input, sample_rate = sf.read(io.BytesIO(audio_file.read()))
 
16
 
 
 
 
 
 
 
 
 
17
  if audio_input.ndim > 1 and audio_input.shape[1] == 2:
18
  audio_input = np.mean(audio_input, axis=1)
19
 
@@ -24,19 +34,29 @@ def get_syllables_per_second(audio_file):
24
  predicted_ids = torch.argmax(logits, dim=-1)
25
  transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
26
  offsets = transcription['char_offsets']
 
27
 
28
  # Find the start and end time offsets of the syllables
 
29
  syllable_offsets = [item for item in offsets[0] if item['char'] in ['p', 't', 'k']]
30
 
31
  if syllable_offsets: # if any syllable is found
32
- first_syllable_offset = syllable_offsets[0]['start_offset'] / sample_rate
33
- last_syllable_offset = syllable_offsets[-1]['end_offset'] / sample_rate
 
 
 
34
  # Duration from the first to the last syllable
35
  syllable_duration = last_syllable_offset - first_syllable_offset
 
36
  else:
37
  syllable_duration = 0
38
 
39
  syllable_count = len(syllable_offsets)
 
 
 
 
40
  syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0
41
 
42
  return syllables_per_second
@@ -46,4 +66,4 @@ uploaded_file = st.file_uploader("Choose an audio file", type=["wav"])
46
  if uploaded_file is not None:
47
  with st.spinner("Processing the audio file..."):
48
  result = get_syllables_per_second(uploaded_file)
49
- st.write("Syllables per second: ", result)
 
4
  import numpy as np
5
  import soundfile as sf
6
  import io
7
+ import librosa
8
 
9
  st.title("Syllables per Second Calculator")
10
  st.write("Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second.")
 
13
  processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
14
  model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
15
 
16
+ audio_input, original_sample_rate = sf.read(io.BytesIO(audio_file.read()))
17
+ target_sample_rate = processor.feature_extractor.sampling_rate
18
 
19
+ # resample the audio if its sample rate differs from the rate the model expects (16 kHz)
20
+ if original_sample_rate != target_sample_rate:
21
+ if audio_input.ndim > 1:
22
+ audio_input = np.asarray([librosa.resample(channel, orig_sr=original_sample_rate, target_sr=target_sample_rate) for channel in audio_input.T]).T
23
+ else:
24
+ audio_input = librosa.resample(audio_input, orig_sr=original_sample_rate, target_sr=target_sample_rate)
25
+
26
+ # make the audio mono if it is stereo
27
  if audio_input.ndim > 1 and audio_input.shape[1] == 2:
28
  audio_input = np.mean(audio_input, axis=1)
29
 
 
34
  predicted_ids = torch.argmax(logits, dim=-1)
35
  transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
36
  offsets = transcription['char_offsets']
37
+ print("the offets are: ", offsets)
38
 
39
  # Find the start and end time offsets of the syllables
40
+
41
  syllable_offsets = [item for item in offsets[0] if item['char'] in ['p', 't', 'k']]
42
 
43
  if syllable_offsets: # if any syllable is found
44
+ first_syllable_offset = syllable_offsets[0]['start_offset'] * 0.02
45
+ last_syllable_offset = syllable_offsets[-1]['end_offset'] * 0.02
46
+
47
+ print("the first syllable offset is: ", first_syllable_offset)
48
+ print("the last syllable offset is: ", last_syllable_offset)
49
  # Duration from the first to the last syllable
50
  syllable_duration = last_syllable_offset - first_syllable_offset
51
+ print("the syllable duration is: ", syllable_duration)
52
  else:
53
  syllable_duration = 0
54
 
55
  syllable_count = len(syllable_offsets)
56
+ audio_duration = len(audio_input) / target_sample_rate
57
+ print("the audio duration is: ", audio_duration)
58
+ print("the syllable count is: ", syllable_count)
59
+ #print("the syllables per second is: ", syllable_count / audio_duration)
60
  syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0
61
 
62
  return syllables_per_second
 
66
  if uploaded_file is not None:
67
  with st.spinner("Processing the audio file..."):
68
  result = get_syllables_per_second(uploaded_file)
69
+ st.write("Syllables per second: ", result)
requirements.txt CHANGED
@@ -2,4 +2,5 @@ torch
2
  numpy
3
  transformers
4
  soundfile
5
- phonemizer
 
 
2
  numpy
3
  transformers
4
  soundfile
5
+ phonemizer
6
+ librosa