SpeechScore / basis.py
alibabasglab's picture
Upload 73 files
936f6fa verified
raw
history blame
4.46 kB
class ScoreBasis:
    """Common base for scores computed on one or more audio signals.

    Subclasses implement :meth:`windowed_scoring`. :meth:`scoring` is the
    entry point: it brings the signals to the rate the score operates on and,
    when requested, evaluates the score window by window.
    """

    def __init__(self, name=None):
        # Rate the score operates on; None means "use the rate of the data".
        self.score_rate = None
        # Is the score intrusive (True: it requires a reference signal)
        # or non-intrusive?
        self.intrusive = True
        self.name = name

    def windowed_scoring(self, audios, score_rate):
        """Score one set of signals; must be specialised by each subclass.

        Args:
            audios: list of signals (whole signals, or one window of each).
            score_rate: sampling rate of ``audios``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(f'In {self.name}, windowed_scoring is not yet implemented')

    def scoring(self, data, window=None, score_rate=None):
        """Resample the signals if needed and call `windowed_scoring`.

        Args:
            data: mapping with the keys ``'audio'`` (a list of signals,
                indexed by sample along axis 0) and ``'rate'`` (their
                sampling rate). It is not modified.
            window: window length, converted to samples as
                ``window * rate``. ``None`` scores the whole signals at once.
            score_rate: accepted for interface compatibility but ignored:
                the rate used is ``self.score_rate`` when it is set,
                otherwise ``data['rate']``.

        Returns:
            The value returned by `windowed_scoring` when ``window`` is
            ``None``; otherwise a dict mapping each window index to the value
            returned by `windowed_scoring` for that window.
        """
        audios = data['audio']
        score_rate = data['rate']
        if self.score_rate is not None:
            score_rate = self.score_rate

        if score_rate != data['rate']:
            # Imported only when resampling is actually required.
            import resampy
            # Build a new list instead of writing into data['audio']:
            # data['rate'] is left untouched, so resampling in place would
            # make a second call on the same `data` resample again.
            audios = [
                resampy.resample(audio, data['rate'], score_rate, axis=0)
                for audio in audios
            ]

        if window is None:
            return self.windowed_scoring(audios, score_rate)

        # Imported only when windowed scoring is actually requested.
        from museval.metrics import Framing

        # The framer needs the total number of samples to cover; use the
        # longest signal. Signals are not zero-padded here, so a shorter
        # signal contributes shorter (possibly empty) slices at the end.
        maxlen = max((len(audio) for audio in audios), default=0)
        # Window and hop are equal: consecutive, non-overlapping windows.
        framer = Framing(window * score_rate, window * score_rate, maxlen)
        result = {}
        for t, win in enumerate(framer):
            result[t] = self.windowed_scoring(
                [audio[win] for audio in audios], score_rate
            )
        return result