ylacombe committed
Commit db36668
1 Parent(s): e1c31aa

Upload 17 files
analyze.py ADDED
@@ -0,0 +1,148 @@
+ from itertools import count, islice
+ from typing import Any, Iterable, Literal, Optional, TypeVar, Union, overload, Dict, List, Tuple
+ from collections import defaultdict
+ import json
+
+ import torch
+
+ from datasets import Dataset, Audio
+
+ from dataspeech import rate_apply, pitch_apply, snr_apply
+ from metadata_to_text import bins_to_text, speaker_level_relative_to_gender
+
+ Row = Dict[str, Any]
+ T = TypeVar("T")
+ BATCH_SIZE = 20
+
+
+ @overload
+ def batched(it: Iterable[T], n: int) -> Iterable[List[T]]:
+     ...
+
+
+ @overload
+ def batched(it: Iterable[T], n: int, with_indices: Literal[False]) -> Iterable[List[T]]:
+     ...
+
+ @overload
+ def batched(it: Iterable[T], n: int, with_indices: Literal[True]) -> Iterable[Tuple[List[int], List[T]]]:
+     ...
+
+
+ def batched(
+     it: Iterable[T], n: int, with_indices: bool = False
+ ) -> Union[Iterable[List[T]], Iterable[Tuple[List[int], List[T]]]]:
+     it, indices = iter(it), count()
+     while batch := list(islice(it, n)):
+         yield (list(islice(indices, len(batch))), batch) if with_indices else batch
+
+
+ def analyze(
+     batch: List[Dict[str, Any]],
+     cache: Optional[Dict[str, List[Any]]] = None,
+ ) -> List[List[Any]]:
+     cache = {} if cache is None else cache
+     return batch
+
+
+ def run_dataspeech(
+     rows: Iterable[Row], audio_column_name: str, text_column_name: str
+ ) -> Iterable[Any]:
+     cache: Dict[str, List[Any]] = {}
+
+     # TODO: add speaker and gender to app
+     speaker_id_column_name = "speaker_id"
+     gender_column_name = "gender"
+
+     for batch in batched(rows, BATCH_SIZE):
+         tmp_dict = defaultdict(list)
+         for sample in batch:
+             for key in sample:
+                 if key in [audio_column_name, text_column_name, speaker_id_column_name, gender_column_name]:
+                     tmp_dict[key].append(sample[key]) if key != audio_column_name else tmp_dict[key].append(sample[key][0]["src"])
+
+         tmp_dataset = Dataset.from_dict(tmp_dict).cast_column(audio_column_name, Audio())
+
+
+         ## 1. Extract continuous tags
+         pitch_dataset = tmp_dataset.map(
+             pitch_apply,
+             batched=True,
+             batch_size=BATCH_SIZE,
+             with_rank=True if torch.cuda.device_count() > 0 else False,
+             num_proc=torch.cuda.device_count() or None,
+             remove_columns=[audio_column_name],  # trick to avoid rewriting audio
+             fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": 4096},
+         )
+
+         snr_dataset = tmp_dataset.map(
+             snr_apply,
+             batched=True,
+             batch_size=BATCH_SIZE,
+             with_rank=True if torch.cuda.device_count() > 0 else False,
+             num_proc=torch.cuda.device_count() or None,
+             remove_columns=[audio_column_name],  # trick to avoid rewriting audio
+             fn_kwargs={"audio_column_name": audio_column_name},
+         )
+
+         rate_dataset = tmp_dataset.map(
+             rate_apply,
+             with_rank=False,
+             num_proc=1,
+             remove_columns=[audio_column_name],  # trick to avoid rewriting audio
+             fn_kwargs={"audio_column_name": audio_column_name, "text_column_name": text_column_name},
+         )
+
+         enriched_dataset = pitch_dataset.add_column("snr", snr_dataset["snr"]).add_column("c50", snr_dataset["c50"])
+         enriched_dataset = enriched_dataset.add_column("speaking_rate", rate_dataset["speaking_rate"]).add_column("phonemes", rate_dataset["phonemes"])
+
+
+         ## 2. Map continuous tags to text tags
+
+         text_bins_dict = {}
+         with open("./create_dataset_app/v01_text_bins.json") as json_file:
+             text_bins_dict = json.load(json_file)
+
+         bin_edges_dict = {}
+         with open("./create_dataset_app/v01_bin_edges.json") as json_file:
+             bin_edges_dict = json.load(json_file)
+
+         speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins")
+         speaker_rate_bins = text_bins_dict.get("speaker_rate_bins")
+         snr_bins = text_bins_dict.get("snr_bins")
+         reverberation_bins = text_bins_dict.get("reverberation_bins")
+         utterance_level_std = text_bins_dict.get("utterance_level_std")
+
+         enriched_dataset = [enriched_dataset]
+         if "gender" in batch[0] and "speaker_id" in batch[0]:
+             bin_edges = None
+             if "pitch_bins_male" in bin_edges_dict and "pitch_bins_female" in bin_edges_dict:
+                 bin_edges = {"male": bin_edges_dict["pitch_bins_male"], "female": bin_edges_dict["pitch_bins_female"]}
+
+             enriched_dataset, _ = speaker_level_relative_to_gender(enriched_dataset, speaker_level_pitch_bins, "speaker_id", "gender", "utterance_pitch_mean", "pitch", batch_size=20, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges)
+
+         enriched_dataset, _ = bins_to_text(enriched_dataset, speaker_rate_bins, "speaking_rate", "speaking_rate", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("speaking_rate", None))
+         enriched_dataset, _ = bins_to_text(enriched_dataset, snr_bins, "snr", "noise", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("noise", None), lower_range=None)
+         enriched_dataset, _ = bins_to_text(enriched_dataset, reverberation_bins, "c50", "reverberation", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("reverberation", None))
+         enriched_dataset, _ = bins_to_text(enriched_dataset, utterance_level_std, "utterance_pitch_std", "speech_monotony", batch_size=20, num_workers=1, leading_split_for_bins=None, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=bin_edges_dict.get("speech_monotony", None))
+
+
+         enriched_dataset = enriched_dataset[0]
+
+         for i, sample in enumerate(batch):
+             new_sample = {}
+             new_sample[audio_column_name] = f"<audio src='{sample[audio_column_name][0]['src']}' controls></audio>"
+             for col in ["speaking_rate", "reverberation", "noise", "speech_monotony", "c50", "snr"]:  # phonemes, speaking_rate, utterance_pitch_std, utterance_pitch_mean
+                 new_sample[col] = enriched_dataset[col][i]
+             if "gender" in batch[0] and "speaker_id" in batch[0]:
+                 new_sample["pitch"] = enriched_dataset["pitch"][i]
+                 new_sample[gender_column_name] = sample[gender_column_name]
+                 new_sample[speaker_id_column_name] = sample[speaker_id_column_name]
+
+             new_sample[text_column_name] = sample[text_column_name]
+             batch[i] = new_sample
+
+         yield analyze(
+             batch=batch,
+             cache=cache,
+         )
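
For context on the `batched` helper above: it yields fixed-size lists of rows (the last one possibly shorter) and can optionally pair each list with the global indices of its items. A minimal, standalone sketch (not part of the commit) showing both modes:

    from itertools import count, islice

    def batched(it, n, with_indices=False):
        # yields successive lists of up to n items; optionally pairs each list
        # with the running indices of its items
        it, indices = iter(it), count()
        while batch := list(islice(it, n)):
            yield (list(islice(indices, len(batch))), batch) if with_indices else batch

    print(list(batched(range(7), 3)))
    # [[0, 1, 2], [3, 4, 5], [6]]
    print(list(batched("abcde", 2, with_indices=True)))
    # [([0, 1], ['a', 'b']), ([2, 3], ['c', 'd']), ([4], ['e'])]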
app.py ADDED
@@ -0,0 +1,147 @@
+ from collections import Counter
+ from itertools import count, groupby, islice
+ from operator import itemgetter
+ from typing import Any, Iterable, Iterator, TypeVar, List, Dict, Tuple, Optional
+
+ import gradio as gr
+ import requests
+ import pandas as pd
+ from datasets import Features
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
+ from requests.adapters import HTTPAdapter, Retry
+
+ from analyze import run_dataspeech
+
+ MAX_ROWS = 100
+ T = TypeVar("T")
+ session = requests.Session()
+ retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
+ session.mount('http://', HTTPAdapter(max_retries=retries))
+
+
+ def stream_rows(dataset: str, config: str, split: str) -> Iterable[Dict[str, Any]]:
+     batch_size = 100
+     for i in count():
+         rows_resp = session.get(f"https://datasets-server.huggingface.co/rows?dataset={dataset}&config={config}&split={split}&offset={i * batch_size}&length={batch_size}", timeout=10).json()
+         if "error" in rows_resp:
+             raise RuntimeError(rows_resp["error"])
+         if not rows_resp["rows"]:
+             break
+         for row_item in rows_resp["rows"]:
+             yield row_item["row"]
+
+ class track_iter:
+
+     def __init__(self, it: Iterable[T]):
+         self.it = it
+         self.next_idx = 0
+
+     def __iter__(self) -> Iterator[T]:
+         for item in self.it:
+             self.next_idx += 1
+             yield item
+
+
+ def report(next_row_idx: int, num_rows: int) -> str:
+     if num_rows == next_row_idx:
+         return f"Scan finished: {num_rows} samples analyzed"
+     else:
+         return f"Tagging in progress - {next_row_idx/num_rows*100:.1f}% of rows analyzed..."
+
+
+ def analyze_dataset(dataset: str, audio_column_name: str, text_column_name: str, configuration_name: Optional[str] = None, split_name: Optional[str] = None) -> Iterable[Tuple[str, Any]]:
+     info_resp = session.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
+     if "error" in info_resp:
+         yield "❌ " + info_resp["error"], pd.DataFrame()
+         return
+
+     if configuration_name in info_resp["dataset_info"]:
+         config = configuration_name
+     elif configuration_name != "" and configuration_name is not None:
+         yield "❌ " + f"The configuration you've passed `{configuration_name}` was not found in the dataset configs: {', '.join(info_resp['dataset_info'].keys())}. Try again with the right config name.", gr.DataFrame()
+         return
+     else:
+         config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
+
+     features = Features.from_dict(info_resp["dataset_info"][config]["features"])
+     if split_name in info_resp["dataset_info"][config]["splits"]:
+         split = split_name
+     elif split_name != "" and split_name is not None:
+         yield "❌ " + f"The split you've passed `{split_name}` was not found in the dataset splits: {', '.join(info_resp['dataset_info'][config]['splits'])}. Try again with the right split name.", gr.DataFrame()
+         return
+     else:
+         split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
+     num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
+     rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
+
+     if audio_column_name not in features:
+         yield "❌ " + f"The audio column name you've passed `{audio_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
+         return
+     if text_column_name not in features:
+         yield "❌ " + f"The text column name you've passed `{text_column_name}` was not found in the dataset columns: {', '.join(features.keys())}. Try again with the right column name.", gr.DataFrame()
+         return
+     if "gender" in features:
+         yield "Gender has been detected. We'll compute pitch.", pd.DataFrame()
+
+     dataframe = []
+     for batch in run_dataspeech(
+         rows, audio_column_name, text_column_name
+     ):
+         headers = list(batch[0].keys())
+         batch = [list(sample.values()) for sample in batch]
+         dataframe.extend(batch)
+         datatype = ["str" if col != audio_column_name else "markdown" for col in headers]
+         yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))
+     yield (report(next_row_idx=rows.next_idx, num_rows=num_rows), gr.DataFrame(dataframe, headers=headers, datatype=datatype, wrap=True))
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Analyze speech dataset using Data-Speech")
+     gr.Markdown("The space takes an HF dataset name as input, as well as the audio column name to analyze, and returns the speaking rate, noise level, reverberation level, monotony level and pitch. Note that pitch is only computed if a `speaker_id` column and a `gender` column are found.")
+     hub_search = HuggingfaceHubSearch(
+         label="Hub Dataset ID",
+         placeholder="Search for dataset id on Huggingface",
+         search_type="dataset",
+     )
+     audio_column_name = gr.Textbox(
+         value="audio",
+         label="Audio column name.",
+     )
+     text_column_name = gr.Textbox(
+         value="text",
+         label="Transcription column name.",
+     )
+
+     with gr.Accordion("(Optional) specify configuration and split of the dataset to be analysed", open=False):
+         configuration_name = gr.Textbox(
+             value=None,
+             label="Configuration name.",
+         )
+         split_name = gr.Textbox(
+             value=None,
+             label="Split name.",
+         )
+
+     button = gr.Button("Run Data-Speech Scan")
+     outputs = [
+         gr.Label(show_label=False),
+         gr.DataFrame(),
+     ]
+
+     button.click(analyze_dataset, [hub_search, audio_column_name, text_column_name, configuration_name, split_name], outputs)
+     gr.Examples(
+         [
+             ["blabble-io/libritts_r", "audio", "text_normalized", "clean"],
+             ["blabble-io/libritts_r", "audio", "text_normalized", "other"],
+             ["espnet/yodas", "audio", "text", "en000"],
+             ["ylacombe/english_dialects", "audio", "text"]
+         ],
+         [hub_search, audio_column_name, text_column_name, configuration_name],
+         outputs,
+         fn=analyze_dataset,
+         run_on_click=True,
+         cache_examples=False,
+     )
+
+ demo.launch(debug=False)
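
A note on the pattern app.py relies on: `analyze_dataset` is a generator, so Gradio re-renders the outputs each time it yields, which is what makes the status label and the table update while the scan runs. Below is a minimal, self-contained sketch of that streaming pattern; the component names and the fake scan are illustrative stand-ins, not part of this Space.

    import time
    import gradio as gr
    import pandas as pd

    def fake_scan(n_rows):
        # stand-in for analyze_dataset: yield (status, dataframe) as results accumulate
        rows = []
        for i in range(int(n_rows)):
            rows.append({"row": i, "value": i * i})
            time.sleep(0.1)  # placeholder for the per-batch enrichment work
            yield f"Tagging in progress - {(i + 1) / int(n_rows) * 100:.0f}% of rows analyzed...", pd.DataFrame(rows)
        yield f"Scan finished: {int(n_rows)} samples analyzed", pd.DataFrame(rows)

    with gr.Blocks() as demo:
        n = gr.Number(value=5, label="Rows to scan")
        status = gr.Label(show_label=False)
        table = gr.DataFrame()
        gr.Button("Run").click(fake_scan, [n], [status, table])

    if __name__ == "__main__":
        demo.launch()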
dataspeech/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .cpu_enrichments import rate_apply
+ from .gpu_enrichments import pitch_apply, snr_apply
dataspeech/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (276 Bytes).
 
dataspeech/cpu_enrichments/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .rate import rate_apply
+
dataspeech/cpu_enrichments/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (210 Bytes).
 
dataspeech/cpu_enrichments/__pycache__/rate.cpython-38.pyc ADDED
Binary file (860 Bytes).
 
dataspeech/cpu_enrichments/rate.py ADDED
@@ -0,0 +1,34 @@
+ from g2p import make_g2p
+
+ transducer = make_g2p('eng', 'eng-ipa')
+
+ def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
+     if isinstance(batch[audio_column_name], list):
+         speaking_rates = []
+         phonemes_list = []
+         for text, audio in zip(batch[text_column_name], batch[audio_column_name]):
+             phonemes = transducer(text).output_string
+
+             sample_rate = audio["sampling_rate"]
+             audio_length = len(audio["array"].squeeze()) / sample_rate
+
+             speaking_rate = len(phonemes) / audio_length
+
+
+             speaking_rates.append(speaking_rate)
+             phonemes_list.append(phonemes)
+
+         batch["speaking_rate"] = speaking_rates
+         batch["phonemes"] = phonemes_list
+     else:
+         phonemes = transducer(batch[text_column_name]).output_string
+
+         sample_rate = batch[audio_column_name]["sampling_rate"]
+         audio_length = len(batch[audio_column_name]["array"].squeeze()) / sample_rate
+
+         speaking_rate = len(phonemes) / audio_length
+
+         batch["speaking_rate"] = speaking_rate
+         batch["phonemes"] = phonemes
+
+     return batch
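
Speaking rate in `rate_apply` is simply the phoneme count divided by the audio duration in seconds. A quick sketch of that arithmetic, with the g2p transducer replaced by a hard-coded phoneme string so it runs without extra dependencies:

    import numpy as np

    sampling_rate = 16_000
    audio = {"array": np.zeros(3 * sampling_rate), "sampling_rate": sampling_rate}  # 3 s of (silent) audio
    phonemes = "ðɪsɪzətɛst"  # stand-in for transducer(text).output_string

    audio_length = len(audio["array"].squeeze()) / audio["sampling_rate"]
    speaking_rate = len(phonemes) / audio_length
    print(f"{speaking_rate:.2f} phonemes per second")  # 3.33 for this toy input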
dataspeech/gpu_enrichments/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .pitch import pitch_apply
+ from .snr_and_reverb import snr_apply
dataspeech/gpu_enrichments/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (260 Bytes).
 
dataspeech/gpu_enrichments/__pycache__/pitch.cpython-38.pyc ADDED
Binary file (1.25 kB).
 
dataspeech/gpu_enrichments/__pycache__/snr_and_reverb.cpython-38.pyc ADDED
Binary file (1.3 kB).
 
dataspeech/gpu_enrichments/pitch.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ import penn
+
+
+ # Here we'll use a 10 millisecond hopsize
+ hopsize = .01
+
+ # Provide a sensible frequency range given your domain and model
+ fmin = 30.
+ fmax = 1000.
+
+ # Select a checkpoint to use for inference. Selecting None will
+ # download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
+ checkpoint = None
+
+ # Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
+ center = 'half-hop'
+
+ # (Optional) Linearly interpolate unvoiced regions below periodicity threshold
+ interp_unvoiced_at = .065
+
+
+ def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
+     if isinstance(batch[audio_column_name], list):
+         utterance_pitch_mean = []
+         utterance_pitch_std = []
+         for sample in batch[audio_column_name]:
+             # Infer pitch and periodicity
+             pitch, periodicity = penn.from_audio(
+                 torch.tensor(sample["array"][None, :]).float(),
+                 sample["sampling_rate"],
+                 hopsize=hopsize,
+                 fmin=fmin,
+                 fmax=fmax,
+                 checkpoint=checkpoint,
+                 batch_size=penn_batch_size,
+                 center=center,
+                 interp_unvoiced_at=interp_unvoiced_at,
+                 gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
+             )
+
+             utterance_pitch_mean.append(pitch.mean().cpu())
+             utterance_pitch_std.append(pitch.std().cpu())
+
+         batch[f"{output_column_name}_mean"] = utterance_pitch_mean
+         batch[f"{output_column_name}_std"] = utterance_pitch_std
+     else:
+         sample = batch[audio_column_name]
+         pitch, periodicity = penn.from_audio(
+             torch.tensor(sample["array"][None, :]).float(),
+             sample["sampling_rate"],
+             hopsize=hopsize,
+             fmin=fmin,
+             fmax=fmax,
+             checkpoint=checkpoint,
+             batch_size=penn_batch_size,
+             center=center,
+             interp_unvoiced_at=interp_unvoiced_at,
+             gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
+         )
+         batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
+         batch[f"{output_column_name}_std"] = pitch.std().cpu()
+
+     return batch
dataspeech/gpu_enrichments/snr_and_reverb.py ADDED
@@ -0,0 +1,47 @@
+ from pyannote.audio import Model
+ from pathlib import Path
+ from brouhaha.pipeline import RegressiveActivityDetectionPipeline
+ import torch
+ from huggingface_hub import hf_hub_download
+
+ model = None
+
+ def snr_apply(batch, rank=None, audio_column_name="audio"):
+     global model
+     if model is None:
+         model = Model.from_pretrained(
+             Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
+             strict=False,
+         )
+     if rank is not None:
+         # move the model to the right GPU if not there already
+         device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
+         # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
+         model.to(device)
+
+     pipeline = RegressiveActivityDetectionPipeline(segmentation=model)
+     if rank:
+         pipeline.to(torch.device(device))
+
+     device = pipeline._models["segmentation"].device
+
+     if isinstance(batch[audio_column_name], list):
+         snr = []
+         c50 = []
+         for sample in batch[audio_column_name]:
+             res = pipeline({"sample_rate": sample["sampling_rate"],
+                             "waveform": torch.tensor(sample["array"][None, :]).to(device).float()})
+
+             snr.append(res["snr"].mean())
+             c50.append(res["c50"].mean())
+
+         batch["snr"] = snr
+         batch["c50"] = c50
+     else:
+         res = pipeline({"sample_rate": batch[audio_column_name]["sampling_rate"],
+                         "waveform": torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float()})
+
+         batch["snr"] = res["snr"].mean()
+         batch["c50"] = res["c50"].mean()
+
+     return batch
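
The module-level `model = None` plus the `global model` check above is a lazy-initialization pattern: when `datasets.map(..., num_proc=N, with_rank=True)` forks worker processes, each worker loads the checkpoint once and uses its rank to pick a GPU. A hedged, dependency-free sketch of that pattern follows; `load_model` and the `score` column are hypothetical stand-ins, not this file's API.

    import torch

    model = None  # module-level: initialized once per map worker process

    def load_model():
        # hypothetical stand-in for Model.from_pretrained(...)
        return torch.nn.Linear(4, 1)

    def apply_fn(batch, rank=None):
        global model
        if model is None:
            model = load_model()  # expensive work happens only on the first call
        if rank is not None and torch.cuda.is_available():
            # with_rank=True makes datasets pass the worker rank, used here to pick a GPU
            model.to(f"cuda:{rank % torch.cuda.device_count()}")
        with torch.no_grad():
            batch["score"] = [float(model(torch.randn(4))) for _ in batch["audio"]]
        return batch

    print(apply_fn({"audio": [0, 1, 2]}))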
metadata_to_text.py ADDED
@@ -0,0 +1,287 @@
+ import numpy as np
+ import pandas as pd
+ from datasets import load_dataset, DatasetDict
+ from multiprocess import set_start_method
+ import argparse
+ from pathlib import Path
+ import os
+ import matplotlib.pyplot as plt
+ import json
+
+ SPEAKER_RATE_BINS = ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"]
+ SNR_BINS = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"]
+ REVERBERATION_BINS = ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"]
+ UTTERANCE_LEVEL_STD = ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"]
+
+ # this one is supposed to be applied to the speaker-level mean pitch, relative to gender
+ SPEAKER_LEVEL_PITCH_BINS = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]
+
+
+ def visualize_bins_to_text(values_1, values_2, name_1, name_2, text_bins, save_dir, output_column_name, default_bins=100, lower_range=None):
+     # Save both histograms into a single figure
+     fig, axs = plt.subplots(2, figsize=(8, 6), sharex=True)
+
+     # Plot histogram and vertical lines for subplot 1
+     axs[0].hist(values_1, bins=default_bins, color='blue', alpha=0.7)
+     _, bin_edges1 = np.histogram(values_1, bins=len(text_bins), range=(lower_range, values_1.max()) if lower_range else None)
+     for edge in bin_edges1:
+         axs[0].axvline(x=edge, color='red', linestyle='--', linewidth=1)
+
+
+     # Plot histogram and vertical lines for subplot 2
+     axs[1].hist(values_2, bins=50, color='green', alpha=0.7)
+     _, bin_edges2 = np.histogram(values_2, bins=len(text_bins), range=(lower_range, values_2.max()) if lower_range else None)
+     for edge in bin_edges2:
+         axs[1].axvline(x=edge, color='red', linestyle='--', linewidth=1)
+
+     # Add labels and titles
+     axs[0].set_title(name_1)
+     axs[1].set_title(name_2)
+     axs[0].set_yscale('log')
+     axs[1].set_yscale('log')
+     axs[0].set_ylabel('Frequency')
+     axs[1].set_ylabel('Frequency')
+     axs[1].set_xlabel(f'{output_column_name}')
+
+     # Adjust layout
+     plt.tight_layout()
+
+     filename = f"{output_column_name}.png"
+     filepath = os.path.join(save_dir, filename)
+     plt.savefig(filepath)
+     print(f"Plots saved at '{filepath}'!")
+
+ def bins_to_text(dataset, text_bins, column_name, output_column_name, leading_split_for_bins="train", batch_size=4, num_workers=1, std_tolerance=5, save_dir=None, only_save_plot=False, lower_range=None, bin_edges=None):
+     '''
+     Compute bins of `column_name` from the splits `leading_split_for_bins` and apply text bins to every split.
+     `leading_split_for_bins` can be a string or a list.
+     '''
+     if bin_edges is None:
+         values = []
+         for df in dataset:
+             for split in df:
+                 if leading_split_for_bins is None or leading_split_for_bins in split:
+                     values.extend(df[split][column_name])
+
+         # filter out outliers
+         values = np.array(values)
+         if std_tolerance is not None:
+             filtered_values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
+
+         if save_dir is not None:
+             visualize_bins_to_text(values, filtered_values, "Before filtering", "After filtering", text_bins, save_dir, output_column_name, lower_range=lower_range)
+
+         # speaking_rate can easily have outliers
+         if save_dir is not None and output_column_name == "speaking_rate":
+             visualize_bins_to_text(filtered_values, filtered_values, "After filtering", "After filtering", text_bins, save_dir, f"{output_column_name}_after_filtering", lower_range=lower_range)
+
+         values = filtered_values
+         hist, bin_edges = np.histogram(values, bins=len(text_bins), range=(lower_range, values.max()) if lower_range else None)
+
+         if only_save_plot:
+             return dataset, bin_edges
+     else:
+         print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")
+
+     def batch_association(batch):
+         index_bins = np.searchsorted(bin_edges, batch, side="left")
+         # do min(max(...)) when values are outside of the main bins
+         # it happens when value = min or max or has been filtered out from the bins computation
+         batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
+         return {
+             output_column_name: batch_bins
+         }
+
+     dataset = [df.map(batch_association, batched=True, batch_size=batch_size, input_columns=[column_name], num_proc=num_workers) for df in dataset]
+     return dataset, bin_edges
+
+ def speaker_level_relative_to_gender(dataset, text_bins, speaker_column_name, gender_column_name, column_name, output_column_name, batch_size=4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
+     '''
+     Computes mean values on a speaker level and computes bins on top, relative to the gender column name.
+     Then associates a text bin to the column.
+     This time, it doesn't use `leading_split_for_bins` and computes the bins over every split. Could probably be optimized.
+     '''
+     list_data = []
+     for df in dataset:
+         for split in df:
+             panda_data = df[split].remove_columns([col for col in df[split].column_names if col not in {speaker_column_name, column_name, gender_column_name}]).to_pandas()
+             list_data.append(panda_data)
+
+     dataframe = pd.concat(list_data, ignore_index=True)
+     dataframe = dataframe.groupby(speaker_column_name).agg({column_name: "mean", gender_column_name: "first"})
+     if bin_edges is None:
+         bin_edges = {}
+         if save_dir is not None:
+             save_dict = {}
+             save_dict_after_filtering = {}
+         for category in ["male", "female"]:
+             values = dataframe[dataframe[gender_column_name] == category][column_name]
+             values = np.array(values)
+             if save_dir is not None:
+                 save_dict[category] = values
+             if std_tolerance is not None:
+                 # filter out outliers
+                 values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
+                 if save_dir is not None:
+                     save_dict_after_filtering[category] = values
+             bin_edges[category] = np.histogram(values, len(text_bins))[1]
+
+         if save_dir is not None:
+             visualize_bins_to_text(save_dict["male"], save_dict["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
+             if std_tolerance is not None:
+                 visualize_bins_to_text(save_dict_after_filtering["male"], save_dict_after_filtering["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")
+
+         if only_save_plot:
+             return dataset, bin_edges
+
+     speaker_id_to_bins = dataframe.apply(lambda x: np.searchsorted(bin_edges[x[gender_column_name]], x[column_name]), axis=1).to_dict()
+
+     def batch_association(batch):
+         index_bins = [speaker_id_to_bins[speaker] for speaker in batch]
+         # do min(max(...)) when values are outside of the main bins
+         # it happens when value = min or max or has been filtered out from the bins computation
+         batch_bins = [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]
+         return {
+             output_column_name: batch_bins
+         }
+
+
+     dataset = [df.map(batch_association, batched=True, input_columns=[speaker_column_name], batch_size=batch_size, num_proc=num_workers) for df in dataset]
+     return dataset, bin_edges
+
+ if __name__ == "__main__":
+     set_start_method("spawn")
+     parser = argparse.ArgumentParser()
+
+
+     parser.add_argument("dataset_name", type=str, help="Path or name of the dataset(s). If multiple datasets, names have to be separated by `+`.")
+     parser.add_argument("--configuration", default=None, type=str, help="Dataset configuration(s) to use. If multiple datasets, configurations have to be separated by `+`.")
+     parser.add_argument("--output_dir", default=None, type=str, help="If specified, save the dataset(s) on disk. If multiple datasets, paths have to be separated by `+`.")
+     parser.add_argument("--repo_id", default=None, type=str, help="If specified, push the dataset(s) to the hub. If multiple datasets, names have to be separated by `+`.")
+     parser.add_argument("--path_to_text_bins", default=None, type=str, help="If specified, points to a JSON file which contains the text bins that will be associated to each bin. If not specified, will use the default bins.")
+     parser.add_argument("--path_to_bin_edges", default=None, type=str, help="If specified, points to a JSON file which contains the bin edges. Useful if you want to apply already computed bins to new datasets. If not specified, will recompute bin edges from scratch.")
+     parser.add_argument("--save_bin_edges", default=None, type=str, help="If specified, the name of the JSON file in which the computed bin edges will be saved. Useful if you want to reuse those bin edges on new datasets. By default, the edges are not saved.")
+     parser.add_argument("--avoid_pitch_computation", default=False, action="store_true", help="If `True`, will not compute `pitch`. Note that `pitch` is computed on a speaker level, relative to gender, so you don't need it in a mono-speaker setting.")
+     parser.add_argument("--cpu_num_workers", default=1, type=int, help="Number of CPU workers.")
+     parser.add_argument("--batch_size", default=16, type=int, help="Batch size in `Dataset.map` operations. https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Dataset.map")
+     parser.add_argument("--speaker_id_column_name", default="speaker_id", type=str, help="Speaker id column name. Only used if `avoid_pitch_computation=False`.")
+     parser.add_argument("--gender_column_name", default="gender", type=str, help="Gender column name. Only used if `avoid_pitch_computation=False`.")
+     parser.add_argument("--pitch_std_tolerance", default=2., type=float, help="Standard deviation tolerance for pitch estimation. Any value that is outside mean ± std * tolerance is discarded. Only used if `avoid_pitch_computation=False`.")
+     parser.add_argument("--speaking_rate_std_tolerance", default=4., type=float, help="Standard deviation tolerance for speaking rate estimation. Any value that is outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--snr_std_tolerance", default=3.5, type=float, help="Standard deviation tolerance for SNR estimation. Any value that is outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--reverberation_std_tolerance", default=4, type=float, help="Standard deviation tolerance for reverberation estimation. Any value that is outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--speech_monotony_std_tolerance", default=4, type=float, help="Standard deviation tolerance for speech monotony estimation. Any value that is outside mean ± std * tolerance is discarded. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--leading_split_for_bins", default=None, type=str, help="If specified, will use every split that contains this string to compute statistics. If not specified, will use every split. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--plot_directory", default=None, type=str, help="If specified, will save visualization plots to this directory. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--only_save_plot", default=False, action="store_true", help="If `True` and `--plot_directory` is specified, will only compute the plots. Only used if `path_to_bin_edges=False`.")
+     parser.add_argument("--snr_lower_range", default=50, type=float, help="The lower range of the SNR bins.")
+
+     args = parser.parse_args()
+
+     if args.plot_directory is None and args.only_save_plot:
+         raise ValueError("`only_save_plot=true` but `plot_directory` is not specified. Please give a path to the directory where you want the plots to be saved.")
+     if args.only_save_plot and args.path_to_bin_edges:
+         raise ValueError("`only_save_plot=true` but `path_to_bin_edges` is specified. Since the latter is specified, we won't redo the computations that would have been used for plotting. Choose one or the other. Note that if you use this script to label a new dataset for fine-tuning, I'd recommend avoiding plotting and setting `only_save_plot=false`.")
+
+     text_bins_dict = {}
+     if args.path_to_text_bins:
+         with open(args.path_to_text_bins) as json_file:
+             text_bins_dict = json.load(json_file)
+
+     bin_edges_dict = {}
+     if args.path_to_bin_edges:
+         with open(args.path_to_bin_edges) as json_file:
+             bin_edges_dict = json.load(json_file)
+
+     speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins", SPEAKER_LEVEL_PITCH_BINS)
+     speaker_rate_bins = text_bins_dict.get("speaker_rate_bins", SPEAKER_RATE_BINS)
+     snr_bins = text_bins_dict.get("snr_bins", SNR_BINS)
+     reverberation_bins = text_bins_dict.get("reverberation_bins", REVERBERATION_BINS)
+     utterance_level_std = text_bins_dict.get("utterance_level_std", UTTERANCE_LEVEL_STD)
+
+     output_dirs = [args.output_dir] if args.output_dir is not None else None
+     repo_ids = [args.repo_id] if args.repo_id is not None else None
+     if args.configuration:
+         if "+" in args.dataset_name:
+             dataset_names = args.dataset_name.split("+")
+             dataset_configs = args.configuration.split("+")
+             if len(dataset_names) != len(dataset_configs):
+                 raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(dataset_configs)} configurations spotted")
+
+             if args.repo_id is not None:
+                 repo_ids = args.repo_id.split("+")
+                 if len(dataset_names) != len(repo_ids):
+                     raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(repo_ids)} repository ids spotted")
+
+             if args.output_dir is not None:
+                 output_dirs = args.output_dir.split("+")
+                 if len(dataset_names) != len(output_dirs):
+                     raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(output_dirs)} local paths on which to save the datasets spotted")
+
+             dataset = []
+             for dataset_name, dataset_config in zip(dataset_names, dataset_configs):
+                 tmp_dataset = load_dataset(dataset_name, dataset_config)
+                 dataset.append(tmp_dataset)
+         else:
+             dataset = [load_dataset(args.dataset_name, args.configuration)]
+             dataset_configs = [args.configuration]
+     else:
+         if "+" in args.dataset_name:
+             dataset_names = args.dataset_name.split("+")
+             if args.repo_id is not None:
+                 repo_ids = args.repo_id.split("+")
+                 if len(dataset_names) != len(repo_ids):
+                     raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(repo_ids)} repository ids spotted")
+
+             if args.output_dir is not None:
+                 output_dirs = args.output_dir.split("+")
+                 if len(dataset_names) != len(output_dirs):
+                     raise ValueError(f"There are {len(dataset_names)} datasets spotted but {len(output_dirs)} local paths on which to save the datasets spotted")
+
+             dataset = []
+             for dataset_name in dataset_names:
+                 tmp_dataset = load_dataset(dataset_name)
+                 dataset.append(tmp_dataset)
+
+         else:
+             dataset = [load_dataset(args.dataset_name)]
+
+     if args.plot_directory:
+         Path(args.plot_directory).mkdir(parents=True, exist_ok=True)
+
+     if not args.avoid_pitch_computation:
+         bin_edges = None
+         if "pitch_bins_male" in bin_edges_dict and "pitch_bins_female" in bin_edges_dict:
+             bin_edges = {"male": bin_edges_dict["pitch_bins_male"], "female": bin_edges_dict["pitch_bins_female"]}
+
+         dataset, pitch_bin_edges = speaker_level_relative_to_gender(dataset, speaker_level_pitch_bins, args.speaker_id_column_name, args.gender_column_name, "utterance_pitch_mean", "pitch", batch_size=args.batch_size, num_workers=args.cpu_num_workers, std_tolerance=args.pitch_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges)
+
+     dataset, speaking_rate_bin_edges = bins_to_text(dataset, speaker_rate_bins, "speaking_rate", "speaking_rate", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.speaking_rate_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("speaking_rate", None))
+     dataset, noise_bin_edges = bins_to_text(dataset, snr_bins, "snr", "noise", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.snr_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("noise", None), lower_range=args.snr_lower_range)
+     dataset, reverberation_bin_edges = bins_to_text(dataset, reverberation_bins, "c50", "reverberation", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.reverberation_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("reverberation", None))
+     dataset, speech_monotony_bin_edges = bins_to_text(dataset, utterance_level_std, "utterance_pitch_std", "speech_monotony", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.speech_monotony_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("speech_monotony", None))
+
+     if args.save_bin_edges:
+         bin_edges = {
+             "speaking_rate": speaking_rate_bin_edges.tolist(),
+             "noise": noise_bin_edges.tolist(),
+             "reverberation": reverberation_bin_edges.tolist(),
+             "speech_monotony": speech_monotony_bin_edges.tolist(),
+         }
+         if not args.avoid_pitch_computation:
+             bin_edges["pitch_bins_male"] = pitch_bin_edges["male"].tolist()
+             bin_edges["pitch_bins_female"] = pitch_bin_edges["female"].tolist()
+
+         with open(args.save_bin_edges, "w") as outfile:
+             json.dump(bin_edges, outfile)
+
+     if not args.only_save_plot:
+         if args.output_dir:
+             for output_dir, df in zip(output_dirs, dataset):
+                 df.save_to_disk(output_dir)
+         if args.repo_id:
+             for i, (repo_id, df) in enumerate(zip(repo_ids, dataset)):
+                 if args.configuration:
+                     df.push_to_hub(repo_id, dataset_configs[i])
+                 else:
+                     df.push_to_hub(repo_id)
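
To make the `bins_to_text` mapping concrete: edges come from `np.histogram`, and `np.searchsorted` plus the min/max clamp used in `batch_association` picks a label. A small sketch of that logic with illustrative values (the random "speaking rates" below are not real dataset statistics):

    import numpy as np

    text_bins = ["very slowly", "quite slowly", "slightly slowly", "moderate speed",
                 "slightly fast", "quite fast", "very fast"]
    values = np.random.default_rng(0).normal(loc=12.0, scale=3.0, size=1_000)  # fake speaking rates
    _, bin_edges = np.histogram(values, bins=len(text_bins))

    def to_text(batch):
        index_bins = np.searchsorted(bin_edges, batch, side="left")
        # clamp so values at the extremes (or filtered out as outliers) still get a label
        return [text_bins[min(max(i - 1, 0), len(text_bins) - 1)] for i in index_bins]

    print(to_text([4.0, 12.0, 25.0]))  # e.g. ['very slowly', 'moderate speed', 'very fast']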
v01_bin_edges.json ADDED
@@ -0,0 +1 @@
+ {"speaking_rate": [3.508771929824561, 6.187242299296628, 8.865712668768696, 11.544183038240764, 14.22265340771283, 16.901123777184896, 19.579594146656966, 22.258064516129032], "noise": [50.0, 53.460838317871094, 56.92167663574219, 60.38251495361328, 63.843353271484375, 67.30419158935547, 70.76502990722656, 74.22586822509766], "reverberation": [30.498437881469727, 34.706024169921875, 38.91361045837402, 43.12119674682617, 47.32878303527832, 51.53636932373047, 55.74395561218262, 59.951541900634766], "speech_monotony": [0.0, 17.430070059640066, 34.86014011928013, 52.2902101789202, 69.72028023856026, 87.15035029820032, 104.5804203578404, 122.01049041748047], "pitch_bins_male": [74.04898071289062, 88.6379623413086, 103.22694396972656, 117.81592559814453, 132.4049072265625, 146.993896484375, 161.58287048339844, 176.17185974121094], "pitch_bins_female": [130.46119689941406, 149.0537567138672, 167.64630126953125, 186.23886108398438, 204.83140563964844, 223.42396545410156, 242.01651000976562, 260.60906982421875]}
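
These edges are what analyze.py loads at scan time. As an illustration of reading them back, the hypothetical snippet below labels a speaker-level mean pitch with the gender-specific edges stored above (the label list mirrors `speaker_level_pitch_bins` from v01_text_bins.json below; the file path is assumed to be relative to this folder):

    import json
    import numpy as np

    text_bins = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch",
                 "slightly high pitch", "quite high pitch", "very high pitch"]

    with open("v01_bin_edges.json") as f:  # path assumed relative to this folder
        edges = json.load(f)

    def pitch_label(mean_pitch_hz, gender):
        i = int(np.searchsorted(edges[f"pitch_bins_{gender}"], mean_pitch_hz))
        return text_bins[min(max(i - 1, 0), len(text_bins) - 1)]

    print(pitch_label(110.0, "male"))    # 'slightly low pitch' with the edges above
    print(pitch_label(210.0, "female"))  # 'slightly high pitch' with the edges above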
v01_text_bins.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "speaker_rate_bins":
+     ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"],
+     "snr_bins":
+     ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"],
+     "reverberation_bins":
+     ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"],
+     "utterance_level_std":
+     ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"],
+     "speaker_level_pitch_bins":
+     ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]
+ }