import os
import subprocess

import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import matplotlib.pyplot as plt
from pydub import AudioSegment


def readaud(sound_path):
    # read audio, downmix to mono, and resample to 16 kHz for the aligner
    aud, sr = sf.read(sound_path, dtype=np.float32)
    if aud.ndim == 2:
        aud = aud.mean(1)
    if sr != 16000:
        alen = int(aud.shape[0] / sr * 16000)
        aud = signal.resample(aud, alen)
    return aud


def normalise_transcript(xcp):
    # lowercase and collapse repeated spaces before alignment
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
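
# example: normalise_transcript('Góðan  dagin') -> 'góðan dagin'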



def get_pitch_tracks(sound_path):
    # REAPER expects a wav file; convert any other format to a temporary wav
    orig_ftype = sound_path.split('.')[-1]

    if orig_ftype == 'wav':
        wav_path = sound_path
    else:
        aud_data = AudioSegment.from_file(sound_path, orig_ftype)
        fname = sound_path.split('/')[-1].rsplit('.', 1)[0]
        tmp_path = os.path.join(os.getcwd(), f'{fname}_tmp.wav')
        aud_data.export(tmp_path, format="wav")
        wav_path = tmp_path

    # run REAPER and read its ASCII pitch output from stdout
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],
        capture_output=True).stdout
    f0_data = f0_data.decode()
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data]
    # trailing lines hold other info in a different format; keep only 3-field rows
    f0_data = [l for l in f0_data if len(l) == 3]
    # keep only voiced frames (voicing flag == '1') as [time, f0] pairs
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']

    if orig_ftype != 'wav':
        os.remove(tmp_path)

    return f0_data
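
# Minimal usage sketch (assumes REAPER is compiled at REAPER/build/reaper and
# that the audio path below is a placeholder):
#
#   f0 = get_pitch_tracks('samples/example.wav')
#   # f0 is a list of [time_seconds, f0_hz] pairs for voiced frames only, e.g.
#   # [[0.010, 121.7], [0.015, 122.3], ...]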


    

# the transcript could come from a corpus alongside the wav file,
# be input by the user,
# or come from a previous speech recognition pass
def align_and_graph(sound_path, transcript, aligner_function):

    plt.close('all')

    # fetch data
    speech = readaud(sound_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    
    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]

    f0_data = get_pitch_tracks(sound_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400  # fallback y-axis height when no voiced frames are found

    fig, axes1 = plt.subplots(figsize=(15, 3))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)
    
    # draw word boundaries
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='dimgrey')
        plt.text((s + e) / 2, f_max + 15, w, fontsize=15, ha="center")
        
    # draw phone / char boundaries
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        # shift the label left slightly so longer labels appear centred
        plt.text((s + e) / 2 - (len(p) * .01), -1 * f_max / 10, p, fontsize=11, color='teal')
    
    
    f0c = "blue"
    axes1.scatter([t for t,f0 in f0_data], [f0 for t,f0 in f0_data], color=f0c)

    
    
    # compute RMS energy
    w, sr = librosa.load(sound_path)
    fr_l = 2048  # librosa default frame length
    h_l = 512    # librosa default hop length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)[0]


    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    
    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")

    # return the figure so the caller can display or save it
    return fig


# sample Faroese test words: uppboðssøla bussleiðini viðmerkingar upprunaligur
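
# A minimal end-to-end sketch, assuming a hypothetical aligner with the
# interface this module expects: it takes 16 kHz mono audio plus a normalised
# transcript and returns (word_alignment, segment_alignment), each a list of
# (label, start_seconds, end_seconds) tuples. Paths and names are placeholders.
#
#   def my_aligner(speech, transcript):
#       ...  # e.g. wrap a forced aligner here
#       return w_align, seg_align
#
#   fig = align_and_graph('samples/example.wav',
#                         'uppboðssøla bussleiðini viðmerkingar',
#                         my_aligner)
#   fig.savefig('example_alignment.png')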