Spaces:

clr
/

prosalign

Sleeping

App Files Files Community

prosalign / app.py

clr

Update app.py

34e8585 over 1 year ago

raw

history blame

7.32 kB

	import gradio as gr
	import subprocess,os
	from datasets import load_dataset, Audio
	import datas,ctcalign,graph
	from numpy import random


	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt


	def _run_logged(cmd, cwd=None):
	    """Run *cmd* with captured output and report a non-zero exit status.

	    Args:
	        cmd: Argument list passed to ``subprocess.run`` (no shell involved).
	        cwd: Optional working directory for the child process.

	    Returns:
	        The ``subprocess.CompletedProcess`` for the command. A failing exit
	        code is printed together with the captured stderr; it is not raised,
	        so the remaining setup steps still run.
	    """
	    result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
	    if result.returncode != 0:
	        print(f"FAILED ({result.returncode}):: {' '.join(cmd)}")
	        print(result.stderr)
	    return result


	def setup():
	    """Download the REAPER sources and build them in ``./REAPER/build``.

	    The archive is fetched with ``wget``, unpacked with ``unzip``, renamed to
	    ``REAPER`` and then configured and compiled with ``cmake`` and ``make``.
	    Command output is printed so that it shows up in the process log.

	    The process working directory is never changed: build commands receive
	    their directory through ``cwd=`` instead.

	    Raises:
	        FileNotFoundError: if the ``REAPER`` directory does not exist after
	            the download/unpack steps.
	    """
	    reaper_dir = "REAPER"
	    build_dir = os.path.join(reaper_dir, "build")

	    r0 = _run_logged(["pwd"])
	    print('PWD::', r0.stdout)

	    # Only fetch when there is no checkout yet: running `mv` while REAPER/
	    # already exists would nest REAPER-master inside it.
	    if not os.path.isdir(reaper_dir):
	        r1 = _run_logged(["wget", "https://github.com/google/REAPER/archive/refs/heads/master.zip"])
	        print(r1.stdout)
	        print(_run_logged(["unzip", "./master.zip"]).stdout)
	        _run_logged(["mv", "REAPER-master", reaper_dir])
	        _run_logged(["rm", "./master.zip"])

	    if not os.path.isdir(reaper_dir):
	        raise FileNotFoundError(
	            f"{reaper_dir!r} is missing: downloading or unpacking REAPER failed"
	        )

	    os.makedirs(build_dir, exist_ok=True)
	    r2 = _run_logged(["cmake", ".."], cwd=build_dir)
	    print(r2.stdout)
	    r3 = _run_logged(["make"], cwd=build_dir)
	    print(r3.stdout)

	    r9 = _run_logged(["ls", "-la"])
	    print('LS::', r9.stdout)


	# Download and build REAPER at import time, before the interface below is defined.
	setup()

	# return the whole corpus as a state
	# display some of it
	# (because gradio pagination is currently broken)
	# and reset all filter menus
	# return [ds,databrowser,gmenu,amenu,dmenu]
	def pick_lang(langname):
	    """Select the corpus for *langname* and reset the browser and filter menus.

	    Args:
	        langname: ``"Icelandic"`` or ``"Faroese"``.

	    Returns:
	        A 5-tuple matching the outputs wired to the language menu:
	        the full corpus (stored as state), its first 15 rows without the
	        ``audio``, ``speaker_id`` and ``duration`` columns, the gender menu
	        value ``"all"``, an update that installs the age choices for the
	        language, and an update that shows the dialect menu only for Faroese.

	    Raises:
	        ValueError: if *langname* is not one of the supported languages.
	    """
	    if langname == "Icelandic":
	        df = datas.ds_i
	        ages = ["all", '18-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90']
	        dia_visible = False
	    elif langname == "Faroese":
	        df = datas.ds_f
	        ages = ["all", '15-35', '36-60', '61+']
	        dia_visible = True
	    else:
	        # Without this branch the names above stay unbound and the function
	        # fails later with an unhelpful UnboundLocalError.
	        raise ValueError(f"Unsupported language: {langname!r}")

	    preview = df.drop(columns=['audio', 'speaker_id', 'duration'])[:15]
	    return (
	        df,
	        preview,
	        "all",
	        gr.update(choices=ages, value="all"),
	        gr.update(visible=dia_visible, value="all"),
	    )



	def apply_filters(df, langname, gender, age, dia):
	    """Filter the corpus by dialect, gender and age.

	    For a recognised language the filters are applied to that language's
	    full corpus rather than to *df*, so repeated clicks do not accumulate
	    earlier filters. For any other *langname* the passed *df* is filtered.

	    Args:
	        df: Current corpus state; may be ``None`` when no language was chosen.
	        langname: ``"Icelandic"`` or ``"Faroese"``.
	        gender: ``"all"`` or a prefix matched against the ``gender`` column.
	        age: ``"all"`` or a value compared for equality with ``age``.
	        dia: ``"all"`` or a dialect name, compared case-insensitively.

	    Returns:
	        ``(filtered, preview)`` where *preview* is at most the first 15 rows
	        of *filtered* without the ``audio``, ``speaker_id`` and ``duration``
	        columns. ``(None, None)`` when there is no corpus to filter.
	    """
	    if langname == "Icelandic":
	        df = datas.ds_i
	    elif langname == "Faroese":
	        df = datas.ds_f
	        # NOTE(review): dialect filtering is placed under the Faroese branch,
	        # reconstructed from the statement grouping — confirm.
	        if dia != "all":
	            df = df[df.dialect.str.lower() == dia.lower()]

	    # "Apply filters" can be clicked before any language was selected, in
	    # which case the state is still empty.
	    if df is None:
	        return (None, None)

	    if gender != "all":
	        df = df[df.gender.str.startswith(gender)]

	    if age != "all":
	        df = df[df.age == age]

	    preview = df.drop(columns=['audio', 'speaker_id', 'duration'])[:15]
	    return (df, preview)


	def f1(langname, ds):
	    """Pick one random recording from *ds* and graph its aligned prosody.

	    Args:
	        langname: ``"Icelandic"`` or ``"Faroese"``; selects the aligner.
	        ds: Current (possibly filtered) corpus; may be ``None`` or empty.

	    Returns:
	        ``(plot, sound_path, rec_info)``: the result of
	        ``graph.align_and_graph``, the path of the sampled audio file, and a
	        one-line description (audio id, gender, age, plus dialect for
	        Faroese). When nothing can be sampled, the plot and the path are
	        ``None`` and *rec_info* explains why.
	    """
	    # A DataFrame has no unambiguous truth value, so test the length; sampling
	    # from an empty frame would raise.
	    if ds is None or len(ds) == 0:
	        return (None, None, "No recordings to choose from: select a language, or relax the filters.")

	    if langname == "Icelandic":
	        lang_aligner = datas.a_i
	    elif langname == "Faroese":
	        lang_aligner = datas.a_f
	    else:
	        return (None, None, "Please select a language first.")

	    ex = ds.sample()
	    sound_path = ex['audio'].iloc[0]['path']
	    transcript = ex['normalized_text'].iloc[0]

	    rec_info = f"{ex['audio_id'].iloc[0]}, {ex['gender'].iloc[0]}, {ex['age'].iloc[0]}"
	    if langname == "Faroese":
	        rec_info += f", {ex['dialect'].iloc[0]}"
	    return (graph.align_and_graph(sound_path, transcript, lang_aligner), sound_path, rec_info)



	# Interface definition.
	# NOTE(review): the nesting of rows/columns below was reconstructed from the
	# grouping of the statements — confirm against the intended layout.
	with gr.Blocks() as bl:
	    gr.Markdown(
	"""
	# Demo under construction
	### 1. Choose a language to load
	### 2. See a small sample of the selected corpus
	### 3. Click the button below to view time-aligned prosody information for a random example
	"""
	    )

	    # Language choice next to a short legend for the graph.
	    with gr.Row():
	        langmenu = gr.Dropdown(["Faroese", "Icelandic"], label="Language")
	        gr.Markdown(
	"""
	Pitch is shown in dark blue and loudness is the light orange line. The pitch estimation, and the time-alignment of words to audio, are completely automated and there will be some inaccuracy.
	The random example can be from the whole corpus, not necessarily one of the visible rows. More information below.
	"""
	        )

	    # Holds the currently selected (and possibly filtered) corpus.
	    ds = gr.State()

	    # Filter menus; the dialect menu starts hidden.
	    with gr.Row():
	        gmenu = gr.Dropdown(["all", "f", "m"], label="Gender", value="all")
	        amenu = gr.Dropdown(["all"], label="Age", value="all")
	        dmenu = gr.Dropdown(
	            [
	                "all",
	                "Norðuroyggjar (inklusive of Eiði, Gjógv og Funningur)",
	                'Norðurstreymoy/Eysturoy (exclusive of Eiði, Gjógv og Funningur)',
	                'Vágar',
	                'Sandoy',
	                'Suðuroy',
	                'Suðurstreymoy',
	            ],
	            label="Dialect",
	            value="all",
	            visible=False,
	        )
	        btn0 = gr.Button(value="Apply filters")

	    # Read-only view of a few corpus rows.
	    with gr.Row():
	        databrowser = gr.DataFrame(
	            wrap=True,
	            max_rows=50,
	            interactive=False,
	            overflow_row_behaviour='paginate',
	        )

	    # Trigger button on the left, audio player and recording info on the right.
	    with gr.Row():
	        with gr.Column(scale=1):
	            btn1 = gr.Button(value="CLICK HERE")
	            btn1.style(size="lg", full_width=True)
	        with gr.Column(scale=4):
	            audio1 = gr.Audio(interactive=False)
	            ainfo = gr.Markdown("""
	Audio file info
	""")

	    pl1 = gr.Plot()

	    # Choosing a language replaces the stored corpus, refreshes the visible
	    # rows and resets every filter menu.
	    langmenu.change(
	        pick_lang,
	        langmenu,
	        [ds, databrowser, gmenu, amenu, dmenu],
	    )

	    # Applying filters updates both the stored corpus and the visible rows.
	    btn0.click(
	        apply_filters,
	        [ds, langmenu, gmenu, amenu, dmenu],
	        [ds, databrowser],
	    )

	    # Graph a random example from the stored corpus.
	    btn1.click(
	        f1,
	        [langmenu, ds],
	        [pl1, audio1, ainfo],
	    )

	    gr.Markdown(
	"""
	# ABOUT

	The Icelandic corpus is [samromur-asr](https://huggingface.co/datasets/language-and-voice-lab/samromur_asr), and Faroese uses [ravnursson-asr](https://huggingface.co/datasets/carlosdanielhernandezmena/ravnursson_asr).

	### Forced alignment
	The prosody graphs are marked with time-alignments for the words found by [CTC decoding](https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html). This uses wav2vec-2.0 based models ([Faroese](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h), [Icelandic](https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h)) and tends to be more robust than Montreal Forced Aligner.
	However, this aligner does not contain any phoneme representation, and therefore, segment alignments are for orthographic characters rather than phonemes. Especially in languages with shallow orthography, these letter alignments probably indicate something about the timing of sounds in a word, but the exact durations should not be taken too seriously especially in cases like doubled or silent letters.

	### Pitch tracking (F0 estimation)
	Estimated pitch is shown in blue on the graphs, as tracked by [REAPER](https://github.com/google/REAPER).

	### Intensity
	The orange line is root mean squared energy, which reflects loudness and is also a good indication of syllable placement, as it should correspond to vowels and similar sounds.

	This is a work-in-progress basic demo for automatic prosodic annotation in Faroese and Icelandic.
	So far, you cannot select or upload your own choice of sentence for analysis, nor search the corpora. Also, it does not display well when the sentence is too long. In that case, or if there are serious errors in the automated analyses, try another random sentence.
	Contact caitlinr@ru.is / https://github.com/catiR/ when things break, or with ideas/suggestions about how to apply this. Unfortunately I am not a web/interface designer so this is not going to look nice or be user friendly, I only do speech processing.
	The source code is available under the Files tab at the top of the Space.
	"""
	    )


	# Start the web server only when run as a script.
	if __name__ == "__main__":
	    bl.launch()