Spaces:

KarolinskaInstitutet
/

pataka

Build error

App Files Community

birgermoell committed on May 25, 2023

Commit

e6a9b5c

•

1 Parent(s): 993f0db

Updated pataka working with syllables

Browse files

Files changed (2) hide show

app.py +24 -4
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 import numpy as np
 import soundfile as sf
 import io
 st.title("Syllables per Second Calculator")
 st.write("Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second.")
@@ -12,8 +13,17 @@ def get_syllables_per_second(audio_file):
     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
     model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
-    audio_input, sample_rate = sf.read(io.BytesIO(audio_file.read()))
     if audio_input.ndim > 1 and audio_input.shape[1] == 2:
         audio_input = np.mean(audio_input, axis=1)
@@ -24,19 +34,29 @@ def get_syllables_per_second(audio_file):
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
         offsets = transcription['char_offsets']
     # Find the start and end time offsets of the syllables
     syllable_offsets = [item for item in offsets[0] if item['char'] in ['p', 't', 'k']]
     if syllable_offsets:  # if any syllable is found
-        first_syllable_offset = syllable_offsets[0]['start_offset'] / sample_rate
-        last_syllable_offset = syllable_offsets[-1]['end_offset'] / sample_rate
         # Duration from the first to the last syllable
         syllable_duration = last_syllable_offset - first_syllable_offset
     else:
         syllable_duration = 0
     syllable_count = len(syllable_offsets)
     syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0
     return syllables_per_second
@@ -46,4 +66,4 @@ uploaded_file = st.file_uploader("Choose an audio file", type=["wav"])
 if uploaded_file is not None:
     with st.spinner("Processing the audio file..."):
         result = get_syllables_per_second(uploaded_file)
-        st.write("Syllables per second: ", result)

 import numpy as np
 import soundfile as sf
 import io
+import librosa
 st.title("Syllables per Second Calculator")
 st.write("Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second.")
     processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
     model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
+    audio_input, original_sample_rate = sf.read(io.BytesIO(audio_file.read()))
+    target_sample_rate = processor.feature_extractor.sampling_rate
+    # resample the sample rate if not 16 k
+    if original_sample_rate != target_sample_rate:
+        if audio_input.ndim > 1:
+            audio_input = np.asarray([librosa.resample(channel, orig_sr=original_sample_rate, target_sr=target_sample_rate) for channel in audio_input.T]).T
+        else:
+            audio_input = librosa.resample(audio_input, orig_sr=original_sample_rate, target_sr=target_sample_rate)
+    # make the audio mono if it is stereo
     if audio_input.ndim > 1 and audio_input.shape[1] == 2:
         audio_input = np.mean(audio_input, axis=1)
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
         offsets = transcription['char_offsets']
+        print("the offets are: ", offsets)
     # Find the start and end time offsets of the syllables
     syllable_offsets = [item for item in offsets[0] if item['char'] in ['p', 't', 'k']]
     if syllable_offsets:  # if any syllable is found
+        first_syllable_offset = syllable_offsets[0]['start_offset'] * 0.02
+        last_syllable_offset = syllable_offsets[-1]['end_offset'] * 0.02
+        print("the first syllable offset is: ", first_syllable_offset)
+        print("the last syllable offset is: ", last_syllable_offset)
         # Duration from the first to the last syllable
         syllable_duration = last_syllable_offset - first_syllable_offset
+        print("the syllable duration is: ", syllable_duration)
     else:
         syllable_duration = 0
     syllable_count = len(syllable_offsets)
+    audio_duration = len(audio_input) / target_sample_rate
+    print("the audio duration is: ", audio_duration)
+    print("the syllable count is: ", syllable_count)
+    #print("the syllabels per second is: ", syllable_count / audio_duration)
     syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0
     return syllables_per_second
 if uploaded_file is not None:
     with st.spinner("Processing the audio file..."):
         result = get_syllables_per_second(uploaded_file)
+        st.write("Syllables per second: ", result)

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ torch
 numpy
 transformers
 soundfile
-phonemizer

 numpy
 transformers
 soundfile
+phonemizer
+librosa