strickvl committed on
Commit
470e696
1 Parent(s): c9036ae

Get all text paths

Browse files
Files changed (1) hide show
  1. src/train_tokenizer.ipynb +35 -0
src/train_tokenizer.ipynb CHANGED
@@ -20,6 +20,41 @@
20
  "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
21
  ]
22
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  {
24
  "cell_type": "code",
25
  "execution_count": null,
 
20
  "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
21
  ]
22
  },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 13,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/plain": [
31
+ "4294"
32
+ ]
33
+ },
34
+ "execution_count": 13,
35
+ "metadata": {},
36
+ "output_type": "execute_result"
37
+ }
38
+ ],
39
+ "source": [
40
+ "import os\n",
41
+ "\n",
42
+ "def get_txt_file_paths(directory):\n",
43
+ " txt_file_paths = []\n",
44
+ " for root, dirs, files in os.walk(directory):\n",
45
+ " for file in files:\n",
46
+ " if file.endswith(\".txt\"):\n",
47
+ " file_path = os.path.join(root, file)\n",
48
+ " txt_file_paths.append(file_path)\n",
49
+ " return txt_file_paths\n",
50
+ "\n",
51
+ "# Replace \"directory_path\" with the actual path of the directory you want to search\n",
52
+ "directory_path = \"../data/raw_text\"\n",
53
+ "txt_paths = get_txt_file_paths(directory_path)\n",
54
+ "\n",
55
+ "len(txt_paths)\n"
56
+ ]
57
+ },
58
  {
59
  "cell_type": "code",
60
  "execution_count": null,