marinone94 committed on
Commit 2b994e2
1 Parent(s): 06d47ea

remove dataset scripts

Files changed (1)
  1. upload_nst_sv_to_hf_dataset.py +0 -116
upload_nst_sv_to_hf_dataset.py DELETED
@@ -1,116 +0,0 @@
- """ Script to load, transform and upload the Swedish NST dataset to 🤗 datasets.
-
- Dataset source: https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-56/
-
- Procedure:
- 1. Loop over the annotation files
- 2. Decide whether to discard each item
- 3. Create DatasetDict = {
-        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
-        num_rows: 11030
-    }
- 3b. Mapping common_voice <---> NST
-     - 'client_id': info.Speaker_ID
-     - 'path': val_recording.file
-     - 'audio': wav file (binary)
-     - 'sentence': val_recording.text
-     - 'up_votes': 0
-     - 'down_votes': 0
-     - 'age': info.Age
-     - 'gender': info.Sex
-     - 'accent': ""
-     - 'locale': "sv"
-     - 'segment': ""
- 4. Dump to parquet
- 5. Upload to hub
-
- Filter out:
- - examples consisting of a single word
- - examples consisting of a single character
- - examples whose words are split into single characters
- Also remove "\\Punkt" and "\\Komma" from sentences.
-
- """
-
- import json
- import os
-
- import pandas as pd
- import torchaudio
-
- from datasets import load_dataset
-
-
- hf_dataset_repo = "marinone94/nst_sv"
- audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
- annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
-
-
- def load_audio_file(rel_filepath):
-     # Load the waveform and return a dict shaped like the 🤗 Audio feature
-     audio_filepath = f'{audio_files_path}/{rel_filepath}'
-     data_waveform, sampling_rate = torchaudio.load(audio_filepath)
-     return {
-         "path": rel_filepath,
-         "array": data_waveform[0].t().numpy(),
-         "sampling_rate": sampling_rate
-     }
-
-
- def is_record_valid(text):
-     text_split = text.split()
-
-     # Discard empty and single-word examples
-     if len(text_split) < 2:
-         return False
-
-     # Discard examples where every token is a single character
-     if all(len(token) == 1 for token in text_split):
-         return False
-
-     return True
-
-
- def clean_text(text):
-     # Strip the NST punctuation markers from the transcript
-     return text.replace("\\\\Komma", "").replace("\\\\Punkt", "")
-
-
-
- def create_dataset_row(annotation_filename):
-     annotations_filepath = os.path.join(annotations_path, annotation_filename)
-     with open(annotations_filepath, "r") as f:
-         annotation = json.load(f)
-
-     # Build one common_voice-style row per valid recording in the file
-     dataset_rows = []
-     for recording in annotation["val_recordings"]:
-         if is_record_valid(recording["text"]):
-             # Audio files are stored as <pid>/<pid>_<file>, with a "-2.wav" suffix
-             rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
-             dataset_row = {
-                 "client_id": annotation["info"]["Speaker_ID"],
-                 "path": rel_filepath,
-                 "audio": load_audio_file(rel_filepath),
-                 "sentence": clean_text(recording["text"]),
-                 "up_votes": 0,
-                 "down_votes": 0,
-                 "age": annotation["info"]["Age"],
-                 "gender": annotation["info"]["Sex"],
-                 "accent": "",
-                 "locale": "sv",
-                 "segment": ""
-             }
-             dataset_rows.append(dataset_row)
-
-     return dataset_rows
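-
- # For reference, the annotation JSON parsed above is assumed to look
- # roughly like this (only the keys accessed by this script are shown;
- # all values are illustrative placeholders, not real NST data):
- # {
- #     "pid": "<recording session id>",
- #     "info": {"Speaker_ID": "...", "Age": "...", "Sex": "..."},
- #     "val_recordings": [{"file": "<name>.wav", "text": "<transcript>"}]
- # }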
-
-
- dataset_rows = []
- for i, filename in enumerate(os.listdir(annotations_path)):
-     dataset_rows.extend(create_dataset_row(filename))
-     # NOTE: debug limit, only the first two annotation files are processed
-     if i == 1:
-         break
-
- df = pd.DataFrame(dataset_rows)
- df.to_parquet("dataset.parquet")
- dataset = load_dataset("parquet", data_files="dataset.parquet")
- dataset.push_to_hub(hf_dataset_repo)
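For reference, once the script has run, the uploaded dataset can be loaded back with the standard datasets API. A minimal sketch, assuming the marinone94/nst_sv repo exists and is accessible ("train" is the default split name produced by the parquet load above):

from datasets import load_dataset

# Load the dataset uploaded by the script above
dataset = load_dataset("marinone94/nst_sv", split="train")

# Each row follows the common_voice-style mapping from the docstring
row = dataset[0]
print(row["client_id"], row["sentence"])
print(row["audio"]["sampling_rate"])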