catiR committed on
Commit
779c244
·
1 Parent(s): fbb78e7
scripts/clusterprosody.py ADDED
@@ -0,0 +1,440 @@
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import soundfile as sf
+ from collections import defaultdict
+ from dtw import dtw
+ from sklearn_extra.cluster import KMedoids
+ from copy import deepcopy
+ import os, librosa, json
+
+
+ # based on the original implementation at
+ # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
+ # by Magnús Freyr Morthens 2023, supported by Rannís NSN
+
+
+ # will need:
+ # the whole sentence text (index, word) pairs
+ # the indices of the units the user wants
+ # human meta db of all human recordings
+ # tts dir, human wav + align + f0 dirs
+ # list of tts voices
+ # an actual wav file for each human rec, probably
+ # params like: use f0, use rmse, (use dur), [.....]
+ # .. check what I wrote anywhere about this.
+ # (a sketch of these inputs bundled as one config object follows)
+
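+ # A minimal sketch of how the inputs listed above might be bundled;
+ # every field name here is hypothetical, nothing else in this file
+ # defines or uses them yet.
+ from dataclasses import dataclass, field
+
+ @dataclass
+ class ClusterParams:
+     sentence_words: list          # (index, word) pairs for the whole sentence
+     unit_indices: list            # word-index spans the user selected
+     human_meta: dict              # meta db of all human recordings
+     tts_dir: str
+     wav_dir: str
+     align_dir: str
+     f0_dir: str
+     tts_voices: list = field(default_factory=list)
+     use_f0: bool = True
+     use_rmse: bool = True
+     use_dur: bool = False
+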
+
+ def z_score(x, mean, std):
+     return (x - mean) / std
+
+
+ # TODO ADJUST
+ # new input will be one Meta db
+ # output should probably be the same, e.g.
+ # {'013823-0457777': [('hvaða', 0.89, 1.35),
+ #                     ('sjúkdómar', 1.35, 2.17),
+ #                     ('geta', 2.17, 2.4),
+ #                     ('fylgt', 2.4, 2.83),
+ #                     ('óbeinum', 2.83, 3.29),
+ #                     ('reykingum', 3.29, 3.9)],
+ #  '014226-0508808': [('hvaða', 1.03, 1.45),
+ #                     ('sjúkdómar', 1.45, 2.28),
+ #                     ('geta', 2.41, 2.7),
+ #                     ('fylgt', 2.7, 3.09),
+ #                     ('óbeinum', 3.09, 3.74),
+ #                     ('reykingum', 3.74, 4.42)],
+ #  '013726-0843679': [('hvaða', 0.87, 1.14),
+ #                     ('sjúkdómar', 1.14, 1.75),
+ #                     ('geta', 1.75, 1.96),
+ #                     ('fylgt', 1.96, 2.27),
+ #                     ('óbeinum', 2.27, 2.73),
+ #                     ('reykingum', 2.73, 3.27)] }
+ def get_word_aligns(sentences, directory):
+     """
+     Returns a dictionary of word alignments for the given sentences.
+     """
+     word_aligns = defaultdict(list)
+
+     for sentence in sentences:
+         print(sentence)
+         slist = sentence.split(" ")
+
+         for filename in os.listdir(directory):
+             path = os.path.join(directory, filename)
+
+             with open(path) as f:
+                 lines = f.read().splitlines()[1:]  # skip the csv header
+                 lines = [line.split(",") for line in lines]
+                 # match this alignment file to the sentence, word by word
+                 if len(lines) >= len(slist) and all(lines[i][2] == word for i, word in enumerate(slist)):
+                     rec_id = filename.replace(".csv", "")
+                     word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j in range(len(slist))]
+                     # word_aligns[rec_id].append(word_al) # If one speaker has multiple sentences
+                     word_aligns[rec_id] = word_al
+
+             if len(word_aligns) >= 10 * len(sentences): break
+
+     return word_aligns
+
+
+ # TODO ADJUST
+ # or, honestly, it is possibly fine as is -
+ # well, what file format is it reading?
+ # either adjust my f0 file format or adjust this, a little.
+ def get_pitches(start_time, end_time, id, path):
+     """
+     Returns an array of pitch values for a given speech segment.
+     """
+
+     # appears to expect REAPER-style .f0 files: 7 header lines, then one
+     # "time voiced pitch" row per frame, with pitch -1 when unvoiced
+     f = os.path.join(path, id + ".f0")
+     with open(f) as fo:
+         lines = fo.read().splitlines()[7:]
+     lines = [[float(x) for x in line.split()] for line in lines]  # split lines into floats
+     pitches = []
+
+     # mean, std, and percentiles of all voiced pitches in the whole sentence
+     voiced = [line[2] for line in lines if line[2] != -1]
+     mean = np.mean(voiced)
+     std = np.std(voiced)
+
+     fifth_percentile = np.percentile(voiced, 5)
+     ninetyfifth_percentile = np.percentile(voiced, 95)
+
+     for line in lines:
+         time, is_pitch, pitch = line
+
+         if start_time <= time <= end_time:
+             if is_pitch:
+                 # clamp outliers to the 5th/95th percentile before z-scoring
+                 if fifth_percentile <= pitch <= ninetyfifth_percentile:
+                     pitches.append(z_score(pitch, mean, std))
+                 elif pitch < fifth_percentile:
+                     pitches.append(z_score(fifth_percentile, mean, std))
+                 elif pitch > ninetyfifth_percentile:
+                     pitches.append(z_score(ninetyfifth_percentile, mean, std))
+             else:
+                 # unvoiced frames get the floor value
+                 pitches.append(z_score(fifth_percentile, mean, std))
+
+     return pitches
+
+
+ # TODO adjust
+ # probably mainly for the assumption about the filepath,
+ # but also then, comprehend it
+ def get_rmse(start_time, end_time, id, path, pitch_len):
+     """
+     Returns an array of RMSE values for a given speech segment.
+     """
+
+     f = os.path.join(path, id + ".wav")
+     audio, sr = librosa.load(f, sr=16000)
+     segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
+     rmse = librosa.feature.rms(y=segment)  # rms takes the signal as keyword y
+     rmse = rmse[0]
+     # resample the RMSE track to the same length as the pitch track
+     idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
+     return rmse[idx]
+
+
+ TEMP_start_end_word_pairs = [
+     [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
+     [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
+ ]
+
+
+ # TODO !!!!!!!!!!!!!########
+ # make it take any list of (1st word, last word) or (word)
+ # units and do the thing for those units.
+ # make it work if the sentence has 2 of the same word -
+ # PROBABLY this means I actually need to display the sentence
+ # to the user with the words numbered,
+ # and make the user input word indices.
+ # (see the index-based sketch after this function)
+ def get_data(word_aligns, start_end_word_pairs):
+     """
+     Returns a dictionary of pitch, RMSE, and spectral centroid values
+     for the given sentence/word combinations.
+     """
+
+     data = defaultdict(list)
+     f0_dir = "aligned-reaper/samromur-queries/f0/"
+     wav_dir = "aligned-reaper/samromur-queries/wav/"
+
+     for id, word_al in word_aligns.items():
+         for sent in start_end_word_pairs:
+             for word_combs in sent:
+                 start, end = word_combs[0], word_combs[-1]
+
+                 if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
+                     start_time = [al[1] for al in word_al if al[0] == start][0]
+                     end_time = [al[2] for al in word_al if al[0] == end][0]
+
+                     pitches = get_pitches(start_time, end_time, id, f0_dir)
+                     rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
+                     # NOTE: get_spectral_centroids is not defined anywhere in this file yet
+                     spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
+                     pitches_cpy = np.array(deepcopy(pitches))
+                     rmses_cpy = np.array(deepcopy(rmses))
+                     d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
+                     words = "-".join(word_combs)
+                     data[f"{words}-{id}"] = d
+
+     return data
+ # output -
+ # {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
+ #                                     [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
+ #  'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
+ #  'hvaða-sjúkdómar-013726-0843679': [[],[]] }
+ # i.e. it seems to be a flat dict whose keys are unique speaker&unit tokens,
+ # where each entry is a list over timepoints with one value per feature (for me up to 2 feats, not 3)
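+
+ # A minimal sketch of the index-based selection the TODO above asks for:
+ # units are (first_word_index, last_word_index) spans into the numbered
+ # sentence, so duplicate words are no longer ambiguous. The function name
+ # and span format are hypothetical, not used elsewhere in this file.
+ def get_unit_times_by_index(word_al, index_spans):
+     """
+     word_al: [(word, start, end), ...] in sentence order, as from get_word_aligns.
+     index_spans: [(i, j), ...] 0-based word indices, with i == j for one-word units.
+     Returns [(label, start_time, end_time), ...].
+     """
+     units = []
+     for i, j in index_spans:
+         words = [w for w, _, _ in word_al[i:j + 1]]
+         label = "-".join(f"{k}_{w}" for k, w in zip(range(i, j + 1), words))
+         units.append((label, word_al[i][1], word_al[j][2]))
+     return units
+
+ # e.g. get_unit_times_by_index(word_aligns['013823-0457777'], [(0, 1), (4, 5)])
+ # -> [('0_hvaða-1_sjúkdómar', 0.89, 2.17), ('4_óbeinum-5_reykingum', 2.83, 3.9)]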
+
+
+ # up to here was forming the data
+ # -----------------------------------------------------
+ # from here down is probably clustering it
+
+
+ # TODO i have no idea how necessary this will be at all
+ def dtw_distance(x, y):
+     """
+     Returns the DTW distance between two pitch sequences.
+     """
+
+     alignment = dtw(x, y, keep_internals=True)
+     return alignment.normalizedDistance
+
+
+ # TODO idk but it looks pretty good
+ # HOWEVER consider excluding the 0 self-comparisons
+ # or see if there is something later that takes care of them
+ # (a sketch of the exclusion is below)
+ # NOTE: from here on, this module-level draft code assumes `data`
+ # from get_data(...) is in scope; it was ported from a notebook.
+ dtw_dists = defaultdict(list)
+
+ for key1, value1 in data.items():
+     d = key1.split("-")
+     words1 = d[:-2]
+     id1, id2 = d[-2], d[-1]
+     for key2, value2 in data.items():
+         d = key2.split("-")
+         words2 = d[:-2]
+         id3, id4 = d[-2], d[-1]
+         if words1 == words2:  # compare whole word lists; zip would truncate unequal lengths
+             dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
+
+ # dtw_dists ends up as the dict from units to lists of tuples
+ # {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
+ #                      ('013823-0457777_013698-0441666', 0.5999433281203399),
+ #                      ('013823-0457777_014675-0563760', 0.4695447105594414),
+ #                      ('014226-0508808_013823-0457777', 0.44080874425223393),
+ #                      ('014226-0508808_014226-0508808', 0.0),
+ #                      ('014226-0508808_013726-0843679', 0.5599404672667414),
+ #                      ('014226-0508808_013681-0442313', 0.6871330752342419)] }
+ # note that currently the 0 self-comparisons are still present here.
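+
+ # Sketch of the exclusion mentioned above: filter the 0.0 self-comparisons
+ # out after the fact (keys of the form "A_A"). Purely illustrative; the
+ # clustering below currently expects the full square matrix, self-pairs included.
+ dtw_dists_noself = {
+     words: [(ids, dist) for ids, dist in pairs if ids.split("_")[0] != ids.split("_")[1]]
+     for words, pairs in dtw_dists.items()
+ }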
+
+
+ # TODO
+ # a) do i need this?
+ def kmedoids_clustering(X, n_clusters=3):
+     kmedoids = KMedoids(n_clusters=n_clusters, random_state=0).fit(X)
+     y_km = kmedoids.labels_
+     return y_km, kmedoids
+
+
+ # TODO !!!!!!!!!!!! #########
+ # THIS IS LIKE THE MAIN THING, probably
+ # ok yes, it can probably use some restructuring,
+ # like something could produce the ids_dist2 format already earlier.
+ # also triple-check what kind of distance matrix is supposed to go into X
+ # and what it currently is -
+ # although ok i think it might be fine, and self-organising,
+ # and that is why it keeps the 0s and has symmetric doubles of everything.
+ # HOWEVER the 10 should possibly be replaced with an nspeakers param ?!?!??
+
+
+ # btw since i guess clustering strictly operates on X,
+ # once i reduce whatever duration thing down to pair-distances,
+ # it no longer matters that duration and pitch/energy had different dimensionality...
+ # .... in fact should i actually dtw the 3 feats pitch/energy/dur separately and cluster on a
+ # 3-dim distance mat? or can you not give it distances in multi-dim space, because distance doesn't do that,
+ # in which case i could still, you know, average the 3 distances into 1 x, although..
+ # (a precomputed-distance sketch follows)
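+
+ # One answer to the question above, sketched: KMedoids can take a square
+ # distance matrix directly with metric='precomputed', so per-feature DTW
+ # distances (pitch, energy, duration) can each become an n_tokens x n_tokens
+ # matrix and then be averaged (or weighted) into one matrix before clustering.
+ # All names here are illustrative, not used elsewhere in this file.
+ def cluster_precomputed(dist_mats, weights=None, n_clusters=3):
+     # dist_mats: list of square np arrays, one per feature, same token order
+     weights = weights or [1.0] * len(dist_mats)
+     D = sum(w * m for w, m in zip(weights, dist_mats)) / sum(weights)
+     km = KMedoids(n_clusters=n_clusters, metric="precomputed", random_state=0).fit(D)
+     return km.labels_, km
+
+ # usage: labels, km = cluster_precomputed([pitch_D, energy_D], weights=[1.0, 0.5])
+ # the diagonal of D is 0, so the self-comparisons are harmless here.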
+
+ kmedoids_cluster_dists = defaultdict(list)
+
+ for words, datas in dtw_dists.items():
+     ids_dist = {d[0]: d[1] for d in datas}  # (currently unused below)
+
+     ids_dist2 = defaultdict(list)
+
+     for d in datas:
+         id1, id2 = d[0].split("_")
+         ids_dist2[id1].append(d[1])
+
+     # reshape the flat distance list into one row of distances per token
+     X = [d[1] for d in datas]
+     X = [X[i:i+10] for i in range(0, len(X), 10)]
+     X = np.array(X)
+     y_km, kmedoids = kmedoids_clustering(X)
+     plot_clusters(X, y_km, words)  # NOTE: plot_clusters is defined at the bottom of this file
+
+     c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
+
+     result = zip(X, kmedoids.labels_)
+     sortedR = sorted(result, key=lambda x: x[1])
+
+     for dp in sortedR:
+         arr, label = dp
+         # recover which token this distance row belongs to
+         ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
+
+         if ids is None:
+             print("ID is none")
+             continue
+
+         kmedoids_cluster_dists[words].append((label, ids, arr))
+
+ # TODO probably remember to make it RETURN kmedoids_cluster_dists ..
+
+
+ # ###############
+ # TTS and misc ------------------
+ #
+
+
+ # TODO rename this get_audio_part
+ # also maybe take that tmp wav-making out of reaper and put it somewhere general,
+ # so everything gets a wav.
+ # TODO do NOT specify SR,
+ # and CHECK if everything that depends on this is ok with arbitrary SR
+ def get_audio(start_time, end_time, id, path):
+     """
+     Returns the audio segment between start_time and end_time for a given recording.
+     """
+
+     f = os.path.join(path, id + ".wav")
+     audio, sr = librosa.load(f, sr=16000)
+     segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
+     return segment
+
+
+ # see near the end of the notebook for a very nice way to grab timespans of tts audio
+ # (or just the start/end timestamps to mark them) from the alignment json,
+ # based on word position index -
+ # so probably really do show the user the sentence with each word numbered.
+ # (a sketch of that index-based lookup follows)
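+
+ # Hedged sketch of that lookup. It assumes each sentence has a list of word
+ # marks like {"time": <ms>, "value": <word>}, roughly the Polly-style
+ # speech-marks shape - the real speech_marks.json may differ (see TODO below).
+ def tts_word_span(marks, i, j, total_dur_s):
+     """Start/end seconds of words i..j, given one sentence's word marks."""
+     start_s = marks[i]["time"] / 1000
+     # a word ends where the next word begins, or at the end of the audio
+     end_s = marks[j + 1]["time"] / 1000 if j + 1 < len(marks) else total_dur_s
+     return start_s, end_s
+
+ # usage: start_s, end_s = tts_word_span(word_marks, 0, 1, len(audio) / sr)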
+
+
+ # TODO the speech_marks.json is NOT EXACTLY what you get from tiro,
+ # but I don't remember how it is different, so.
+ with open("speech_marks.json") as f:
+     speech_marks_data = json.load(f)
+ alfur_sents = speech_marks_data["Alfur"]  # (load the json before indexing into it)
+
+
+ # TODO there IS something for making tts_data,
+ # but I'm probably pretty much on my own really for that.
+ # NOTE: tts_data is assumed to exist below; it is not built in this file yet.
+
+
+ # TODO this one is very helpful,
+ # but mind that I adjusted the dictionaries earlier.
+ speaker_to_tts_dtw_dists = defaultdict(list)
+
+ for key1, value1 in data.items():
+     d = key1.split("-")
+     words1 = d[:-2]
+     id1, id2 = d[-2], d[-1]
+     for key2, value2 in tts_data.items():
+         d = key2.split("-")
+         words2 = d[:-2]
+         id3, id4 = d[-2], d[-1]
+         if words1 == words2:
+             speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
+
+
+ # TODO i think this is also great,
+ # but figure out how it is doing it,
+ # because of the dict format and stuff:
+ # working keyed by word index instead of word text, ***********
+ # and for 1-word or 3+ word units...
+ tts_dist_to_cluster = defaultdict(list)
+
+ for words1, datas1 in kmedoids_cluster_dists.items():
+     for d1 in datas1:
+         cluster, sp_id1, arr = d1
+         for words2, datas2 in speaker_to_tts_dtw_dists.items():
+             for d2 in datas2:
+                 ids, dist = d2
+                 sp_id2, tts_alfur = ids.split("_")
+                 if sp_id1 == sp_id2 and words1 == words2:
+                     tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
+
+ # mean distance from the TTS voice to each (unit, cluster); a usage sketch follows
+ tts_mean_dist_to_cluster = {
+     key: np.mean(value) for key, value in tts_dist_to_cluster.items()
+ }
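+
+ # Usage sketch: the cluster a TTS rendition is "closest to" is then just the
+ # argmin over that unit's keys. Illustrative only; the key parsing assumes
+ # the f"{words}-{cluster}" format built above.
+ def closest_cluster(unit_words, mean_dists=tts_mean_dist_to_cluster):
+     keys = [k for k in mean_dists if k.startswith(f"{unit_words}-")]
+     best = min(keys, key=lambda k: mean_dists[k])
+     return int(best.rsplit("-", 1)[1]), mean_dists[best]
+
+ # e.g. closest_cluster("hvaða-sjúkdómar") might return (2, 0.44)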
+
+
+ # THEN there is -
+ # "Plot pitch, rmse, and spectral centroid for each word combination for each speaker"
+ # - this is one person-token per graph and has a word division line - idk if it works for >2 wds.
+ # it might be good to do this for tts at least, eh
+
+
+ # Plot pitch values for each word combination for each speaker in each cluster (with word boundaries)
+ # - multiple speakers (one cluster) per graph - this will be good to show, with tts on top.
+ # i may want to recentre it around the word boundary, at least if only 2 wds.
+ # well i could just pick: it will be centred around the 1st word boundary & good luck if more.
+ # (a sketch of this cluster-plus-tts overlay is below)
+
+ # - the same as above, but rmse
+
+ # go all the way to the bottom of the notebook to see graphs with a tts added on to one cluster.
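+
+ # Minimal sketch of that overlay, assuming pitch contours are already extracted
+ # per token: all human contours of one cluster in grey on a normalised 0..1
+ # time axis, the tts contour on top, and a vertical line at the first word
+ # boundary (given as a fraction of the unit length). All names are illustrative.
+ def plot_cluster_with_tts(human_contours, tts_contour, boundary_frac, title):
+     for c in human_contours:
+         plt.plot(np.linspace(0, 1, len(c)), c, color="grey", alpha=0.5)
+     plt.plot(np.linspace(0, 1, len(tts_contour)), tts_contour, color="red", label="tts")
+     plt.axvline(boundary_frac, linestyle="--", color="black")  # 1st word boundary
+     plt.title(title)
+     plt.legend()
+     plt.show()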
+
+
+ # PLOTTING IS GOING TO BE A WHOLE NIGHTMARE
+ # that is just too bad
+
+ def plot_clusters(X, y, word):
+     u_labels = np.unique(y)
+
+     # plot the results
+     for i in u_labels:
+         plt.scatter(X[y == i, 0], X[y == i, 1], label=i)
+     plt.title(word)
+     plt.legend()
+     plt.show()
scripts/reaper2pass.py CHANGED
@@ -17,7 +17,7 @@ def reaper_soundfile(sound_path, orig_filetype):
  curdir = subprocess.run(["pwd"], capture_output=True, text=True)
  curdir = curdir.stdout.splitlines()[0]
  fname = sound_path.split('/')[-1].replace(orig_filetype,'')
- tmp_path = f'{curdir}/REAPER_TMP/{fname}_tmp.wav'
+ tmp_path = f'{curdir}/REAPER_TMP/{fname}tmp.wav'
  if not os.path.exists(f'{curdir}/REAPER_TMP'):
      os.mkdir(f'{curdir}/REAPER_TMP')
  aud_data.export(tmp_path, format="wav")
scripts/runSQ.py CHANGED
@@ -31,6 +31,9 @@ def run(sentence, voices):
  if meta:
      align_human(meta,speech_aligns,speech_dir,align_model_path)
      f0_human(meta, speech_f0, speech_dir)
+     #TODO cluster humans
+     # input - meta, speech dir, human aligns dir, human f0 dir, any cluster params.
+     # output maybe an object.
  if voices:
      temp_a_sample = get_tts(sentence,voices,tts_dir)
      f0_tts(sentence, voices, tts_dir)