# prosalign / graph.py
import os
import subprocess

import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import matplotlib.pyplot as plt
from pydub import AudioSegment
def readaud(sound_path):
    # Load audio as float32, mix stereo down to mono,
    # and resample everything to 16 kHz for the aligner.
    aud, sr = sf.read(sound_path, dtype=np.float32)
    if len(aud.shape) == 2:
        aud = aud.mean(1)
    if sr != 16000:
        alen = int(aud.shape[0] / sr * 16000)
        aud = signal.resample(aud, alen)
    return aud
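
# Illustrative usage of readaud (the path is a hypothetical example):
# >>> speech = readaud('data/sample.flac')
# >>> speech.ndim   # always 1 - stereo input has been mixed down
# 1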
def normalise_transcript(xcp):
    # Lowercase and collapse runs of spaces to a single space.
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
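
# Illustrative behaviour of normalise_transcript: casing is lowered and
# runs of spaces collapse, but punctuation is left untouched.
# >>> normalise_transcript('Hello   there,  WORLD')
# 'hello there, world'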
def get_pitch_tracks(sound_path):
    # REAPER only reads wav, so convert any other format
    # to a temporary wav file first.
    orig_ftype = sound_path.split('.')[-1]
    if orig_ftype == 'wav':
        wav_path = sound_path
    else:
        aud_data = AudioSegment.from_file(sound_path, orig_ftype)
        fname = sound_path.split('/')[-1].replace(f'.{orig_ftype}', '')
        tmp_path = f'{os.getcwd()}/{fname}_tmp.wav'
        aud_data.export(tmp_path, format="wav")
        wav_path = tmp_path
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],
        capture_output=True).stdout
    f0_data = f0_data.decode()
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data]
    # the last line or two hold other info in a different format
    f0_data = [l for l in f0_data if len(l) == 3]
    # keep only voiced frames, as [time_sec, f0_hz] pairs
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']
    if orig_ftype != 'wav':
        os.remove(tmp_path)
    return f0_data
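
# For reference, REAPER's ASCII (-a) output is an EST track: a short header
# ending in 'EST_Header_End', then one frame per line of the form
# '<time_sec> <voiced_flag> <f0_hz>', e.g. (values illustrative):
#   0.005000 0 -1.000000
#   0.010000 1 118.300003
# which is why the parser above keeps only three-field lines whose
# voiced flag is '1'.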
# The transcript may come from a corpus alongside the wav file,
# be typed in by the user,
# or come from a previous speech recognition pass.
def align_and_graph(sound_path, transcript, aligner_function):
    plt.close('all')

    # fetch data
    speech = readaud(sound_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]

    f0_data = get_pitch_tracks(sound_path)
    if f0_data:
        f_max = max(f0 for t, f0 in f0_data) + 50
    else:
        f_max = 400

    fig, axes1 = plt.subplots(figsize=(15, 3))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)

    # draw word boundaries, with the word centred above each span
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='dimgrey')
        plt.text((s + e) / 2, f_max + 15, w, fontsize=15, ha="center")

    # draw phone / char boundaries, with labels below the axis
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.text((s + e) / 2 - (len(p) * .01), -1 * f_max / 10, p, fontsize=11, color='teal')

    # plot the pitch track
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color="blue")

    # RMS energy from librosa; frame times below use the same
    # sample rate librosa loaded with, so no resampling is needed here
    y, sr = librosa.load(sound_path)
    fr_l = 2048  # librosa default frame length
    h_l = 512    # librosa default hop length
    rmse = librosa.feature.rms(y=y, frame_length=fr_l, hop_length=h_l)
    rmse = rmse[0]

    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")

    return fig
# sample Faroese words for testing: uppboðssøla bussleiðini viðmerkingar upprunaligur
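
# Minimal demo sketch. The aligner below is a hypothetical stand-in: any
# callable returning (word_alignments, segment_alignments), each a list of
# (label, start_sec, end_sec) tuples, fits align_and_graph. The wav path
# is assumed to exist locally, and REAPER must be built as above.
if __name__ == '__main__':
    def dummy_aligner(speech, transcript):
        # pretend every word takes half a second; reuse words as segments
        words = transcript.split()
        w_align = [(w, i * 0.5, (i + 1) * 0.5) for i, w in enumerate(words)]
        return w_align, w_align

    fig = align_and_graph('recording.wav', 'uppboðssøla bussleiðini', dummy_aligner)
    fig.savefig('alignment_demo.png')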