import os
import subprocess

import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import matplotlib.pyplot as plt
from pydub import AudioSegment


def readaud(sound_path):
    # read audio, downmix to mono, and resample to 16 kHz for the aligner
    aud, sr = sf.read(sound_path, dtype=np.float32)
    if aud.ndim == 2:
        aud = aud.mean(1)
    if sr != 16000:
        alen = int(aud.shape[0] / sr * 16000)
        aud = signal.resample(aud, alen)
    return aud


def normalise_transcript(xcp):
    # lowercase and collapse repeated spaces before alignment
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
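
# example: normalise_transcript('Góðan  dagin') -> 'góðan dagin'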



def get_pitch_tracks(sound_path):
    # REAPER expects a wav file; convert any other format to a temporary wav
    orig_ftype = sound_path.split('.')[-1]

    if orig_ftype == 'wav':
        wav_path = sound_path
    else:
        aud_data = AudioSegment.from_file(sound_path, orig_ftype)
        fname = sound_path.split('/')[-1].rsplit('.', 1)[0]
        tmp_path = os.path.join(os.getcwd(), f'{fname}_tmp.wav')
        aud_data.export(tmp_path, format="wav")
        wav_path = tmp_path

    # run REAPER and read its ASCII pitch output from stdout
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],
        capture_output=True).stdout
    f0_data = f0_data.decode()
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data]
    # trailing lines hold other info in a different format; keep only 3-field rows
    f0_data = [l for l in f0_data if len(l) == 3]
    # keep only voiced frames (voicing flag == '1') as [time, f0] pairs
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']

    if orig_ftype != 'wav':
        os.remove(tmp_path)

    return f0_data
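
# Minimal usage sketch (assumes REAPER is compiled at REAPER/build/reaper and
# that the audio path below is a placeholder):
#
#   f0 = get_pitch_tracks('samples/example.wav')
#   # f0 is a list of [time_seconds, f0_hz] pairs for voiced frames only, e.g.
#   # [[0.010, 121.7], [0.015, 122.3], ...]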


    

# the transcript could come from a corpus alongside the wav file,
# be input by the user,
# or come from a previous speech recognition pass
def align_and_graph(sound_path, transcript, aligner_function):

    plt.close('all')

    # fetch data
    speech = readaud(sound_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    
    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]

    f0_data = get_pitch_tracks(sound_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400  # fallback y-axis height when no voiced frames are found

    fig, axes1 = plt.subplots(figsize=(15, 3))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)
    
    # draw word boundaries
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='dimgrey')
        plt.text((s + e) / 2, f_max + 15, w, fontsize=15, ha="center")
        
    # draw phone / char boundaries
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        # shift the label left slightly so longer labels appear centred
        plt.text((s + e) / 2 - (len(p) * .01), -1 * f_max / 10, p, fontsize=11, color='teal')
    
    
    f0c = "blue"
    axes1.scatter([t for t,f0 in f0_data], [f0 for t,f0 in f0_data], color=f0c)

    
    
    # compute RMS energy
    w, sr = librosa.load(sound_path)
    fr_l = 2048  # librosa default frame length
    h_l = 512    # librosa default hop length
    rmse = librosa.feature.rms(y=w, frame_length=fr_l, hop_length=h_l)[0]


    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    
    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")

    # return the figure so the caller can display or save it
    return fig


# sample Faroese test words: uppboðssøla bussleiðini viðmerkingar upprunaligur
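
# A minimal end-to-end sketch, assuming a hypothetical aligner with the
# interface this module expects: it takes 16 kHz mono audio plus a normalised
# transcript and returns (word_alignment, segment_alignment), each a list of
# (label, start_seconds, end_seconds) tuples. Paths and names are placeholders.
#
#   def my_aligner(speech, transcript):
#       ...  # e.g. wrap a forced aligner here
#       return w_align, seg_align
#
#   fig = align_and_graph('samples/example.wav',
#                         'uppboðssøla bussleiðini viðmerkingar',
#                         my_aligner)
#   fig.savefig('example_alignment.png')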