ayousanz committed on
Commit
c113ea5
1 Parent(s): 377d5c9

feat: analysis yodas ja000 for WADA SNR

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.json
37
+ audio_analysis_results.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # End of https://www.toptal.com/developers/gitignore/api/python
audio_analysis_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08645bedb6fd8fee832a0c14f74a1a376322dd6773b5e9452ff46451eba0837c
3
+ size 206376891
audio_quality_analysis.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import librosa
3
+ import IPython.display as ipd
4
+ from IPython.display import Audio, display
5
+ import random
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ import numpy as np
8
+
9
+ import json
10
+
11
+
12
# Load the Japanese "ja000" subset of YODAS at import time.
# `trust_remote_code=True` matches the load performed in the __main__ block.
# NOTE(review): `ds0` is not referenced anywhere else in the visible code and the
# __main__ block loads the same configuration again, so this import-time load
# looks redundant. Because it runs on import, worker processes started with the
# "spawn" method would repeat it as well. Consider removing it; it is kept here
# so that the module's import-time behavior stays the same.
ds0 = load_dataset('espnet/yodas', 'ja000', trust_remote_code=True)
print("finished loading ja000")
14
+
15
def wada_snr(wav):
    """Blindly estimate the signal-to-noise ratio of a speech waveform (WADA-SNR).

    The estimator computes ``log(E[|z|]) - E[log|z|]`` on the peak-normalized
    waveform and maps that statistic to decibels through a precomputed table
    derived from a gamma distribution.

    Paper: http://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf
    Adapted from the MATLAB code at
    https://labrosa.ee.columbia.edu/projects/snreval/#9

    Args:
        wav: 1-D sequence or array of audio samples (any numeric dtype).

    Returns:
        The estimated SNR in dB as a Python float, clamped to the table range
        [-20, 100]. An all-zero or constant-magnitude signal yields -20.0.

    Raises:
        ValueError: if ``wav`` is empty (the peak of an empty array is
            undefined).
    """
    eps = 1e-10
    # Lookup table: g_vals[i] is the statistic expected at db_vals[i] dB.
    db_vals = np.arange(-20, 101)
    g_vals = np.array([
        0.40974774, 0.40986926, 0.40998566, 0.40969089, 0.40986186, 0.40999006,
        0.41027138, 0.41052627, 0.41101024, 0.41143264, 0.41231718, 0.41337272,
        0.41526426, 0.4178192, 0.42077252, 0.42452799, 0.42918886, 0.43510373,
        0.44234195, 0.45161485, 0.46221153, 0.47491647, 0.48883809, 0.50509236,
        0.52353709, 0.54372088, 0.56532427, 0.58847532, 0.61346212, 0.63954496,
        0.66750818, 0.69583724, 0.72454762, 0.75414799, 0.78323148, 0.81240985,
        0.84219775, 0.87166406, 0.90030504, 0.92880418, 0.95655449, 0.9835349,
        1.01047155, 1.0362095, 1.06136425, 1.08579312, 1.1094819, 1.13277995,
        1.15472826, 1.17627308, 1.19703503, 1.21671694, 1.23535898, 1.25364313,
        1.27103891, 1.28718029, 1.30302865, 1.31839527, 1.33294817, 1.34700935,
        1.3605727, 1.37345513, 1.38577122, 1.39733504, 1.40856397, 1.41959619,
        1.42983624, 1.43958467, 1.44902176, 1.45804831, 1.46669568, 1.47486938,
        1.48269965, 1.49034339, 1.49748214, 1.50435106, 1.51076426, 1.51698915,
        1.5229097, 1.528578, 1.53389835, 1.5391211, 1.5439065, 1.54858517,
        1.55310776, 1.55744391, 1.56164927, 1.56566348, 1.56938671, 1.57307767,
        1.57654764, 1.57980083, 1.58304129, 1.58602496, 1.58880681, 1.59162477,
        1.5941969, 1.59693155, 1.599446, 1.60185011, 1.60408668, 1.60627134,
        1.60826199, 1.61004547, 1.61192472, 1.61369656, 1.61534074, 1.61688905,
        1.61838916, 1.61985374, 1.62135878, 1.62268119, 1.62390423, 1.62513143,
        1.62632463, 1.6274027, 1.62842767, 1.62945532, 1.6303307, 1.63128026,
        1.63204102,
    ])

    # Work in float64: taking abs() of integer PCM can overflow (e.g. int16
    # -32768), and float64 keeps the log statistics accurate.
    wav = np.asarray(wav, dtype=np.float64)

    # Peak-normalize, take magnitudes and clip the lower bound so log() is finite.
    peak = np.abs(wav).max()
    if peak == 0:
        peak = eps
    abs_wav = np.abs(wav / peak)
    abs_wav[abs_wav < eps] = eps

    # Statistic: log(E[|z|]) - E[log|z|]
    mean_abs = max(eps, abs_wav.mean())
    mean_log = np.log(abs_wav).mean()
    stat = np.log(mean_abs) - mean_log

    # Table lookup. The first few table entries are not monotonic, so take the
    # LAST index whose value lies below the statistic rather than bisecting.
    below = np.flatnonzero(g_vals < stat)
    if below.size == 0:
        # Statistic is under the whole table: clamp to the lowest SNR.
        return float(db_vals[0])
    idx = below[-1]
    if idx == len(db_vals) - 1:
        # Statistic is above the whole table: clamp to the highest SNR.
        return float(db_vals[-1])

    # Linear interpolation between the two neighbouring table entries.
    frac = (stat - g_vals[idx]) / (g_vals[idx + 1] - g_vals[idx])
    wav_snr = db_vals[idx] + frac * (db_vals[idx + 1] - db_vals[idx])

    # The reference code splits the total energy E into signal and noise parts,
    # E*f/(1+f) and E/(1+f) with f = 10**(wav_snr/10), and returns
    # 10*log10(signal/noise). E cancels, so that ratio is exactly wav_snr;
    # returning it directly avoids a per-sample energy sum and the 0/0 = NaN
    # result that an all-zero signal used to produce.
    return float(wav_snr)
71
+
72
+
73
+ def preprocess_audio(data):
74
+ # �?ータが整数型�?�場合、浮動小数点型に変換
75
+ if data.dtype == np.int16:
76
+ data = data.astype(np.float32) / np.iinfo(np.int16).max
77
+ elif data.dtype == np.int32:
78
+ data = data.astype(np.float32) / np.iinfo(np.int32).max
79
+
80
+ # ス�?レオをモノラルに変換?���?要があれば?�?
81
+ if len(data.shape) == 2:
82
+ data = data.mean(axis=1)
83
+
84
+ return data
85
+
86
def process_audio_data(item):
    """Preprocess one dataset record and compute its WADA-SNR.

    Args:
        item: Mapping that provides ``item['audio']['array']`` (the samples),
            ``item['utt_id']`` and ``item['text']``.

    Returns:
        ``None`` when the audio array is empty; otherwise a dict holding the
        utterance id, the SNR value and the transcription under the Japanese
        keys used by the output JSON.
    """
    samples = item['audio']['array']

    # Nothing to analyse for empty audio.
    if len(samples) == 0:
        return None

    # Normalize the samples, then estimate the SNR.
    snr_db = wada_snr(preprocess_audio(samples))

    return {
        "ファイル名": item['utt_id'],
        "SNR値": snr_db,
        "トランスクリプション": item['text'],
    }
109
+
110
+ import os
111
+
112
if __name__ == '__main__':
    # Script entry point: compute the WADA-SNR of every utterance in the
    # YODAS "ja000" train split and write the results to a JSON file.
    ds = load_dataset('espnet/yodas', 'ja000', trust_remote_code=True)
    train_split = ds['train']

    # Report the number of utterances. `Dataset.dataset_size` is a size in
    # bytes, not a count, so use len() for the value this label announces.
    print("データ数: ", len(train_split))

    # Use one worker process per CPU core.
    cpu_count = os.cpu_count()

    # Run the per-utterance analysis in parallel; map() keeps dataset order.
    with ProcessPoolExecutor(max_workers=cpu_count) as executor:
        results = list(executor.map(process_audio_data, train_split))

    # Drop records that were skipped (empty audio returns None).
    results = [result for result in results if result is not None]

    # ensure_ascii=False writes the Japanese keys/text verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open('audio_analysis_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print("JSONファイルが保存されました")
audio_quality_histogram.png ADDED
generate_graph.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+
5
# Load the analysis results. The JSON holds Japanese keys and text, so read it
# as UTF-8 explicitly rather than relying on the platform default encoding.
with open('audio_analysis_results.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

# Collect the SNR value of every record.
snr_values = [result['SNR値'] for result in results]

# Histogram parameters: 121 edges from -20 dB to 100 dB, i.e. 1 dB wide bins.
bins = np.linspace(-20, 100, 121)
plt.figure(figsize=(10, 6))
plt.hist(snr_values, bins=bins, edgecolor='black', linewidth=1.2)
plt.xlabel('WADA-SNR (dB)')
plt.ylabel('Number of Samples')
plt.title('Audio Quality Histogram')
plt.xlim(-20, 100)
plt.ylim(0, plt.ylim()[1] * 1.1)  # add 10% headroom above the tallest bar
plt.grid(True)
plt.tight_layout()

# Save the figure as a PNG file.
plt.savefig('audio_quality_histogram.png', dpi=300)

print("ヒストグラムが audio_quality_histogram.png として保存されました。")