S-MurilloG committed on
Commit
7450643
1 Parent(s): 4f1b3a6

Creating Training Data

Browse files
Files changed (1) hide show
  1. CARSE_Cleaning.ipynb +31 -31
CARSE_Cleaning.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 123,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -21,7 +21,7 @@
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": 105,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
@@ -64,7 +64,7 @@
64
  },
65
  {
66
  "cell_type": "code",
67
- "execution_count": 106,
68
  "metadata": {},
69
  "outputs": [
70
  {
@@ -115,7 +115,7 @@
115
  },
116
  {
117
  "cell_type": "code",
118
- "execution_count": 107,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
@@ -152,7 +152,7 @@
152
  },
153
  {
154
  "cell_type": "code",
155
- "execution_count": 108,
156
  "metadata": {},
157
  "outputs": [
158
  {
@@ -209,7 +209,7 @@
209
  },
210
  {
211
  "cell_type": "code",
212
- "execution_count": 109,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
@@ -230,7 +230,7 @@
230
  },
231
  {
232
  "cell_type": "code",
233
- "execution_count": 110,
234
  "metadata": {},
235
  "outputs": [
236
  {
@@ -288,7 +288,7 @@
288
  },
289
  {
290
  "cell_type": "code",
291
- "execution_count": 111,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
@@ -345,7 +345,7 @@
345
  },
346
  {
347
  "cell_type": "code",
348
- "execution_count": 112,
349
  "metadata": {},
350
  "outputs": [
351
  {
@@ -461,7 +461,7 @@
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
- "execution_count": 112,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
@@ -505,13 +505,13 @@
505
  },
506
  {
507
  "cell_type": "code",
508
- "execution_count": 113,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
512
  "# Ruta del archivo de texto\n",
513
- "ruta_archivo = 'Prompts_finales.txt' \n",
514
- "ruta_archivo_salida_texto = 'Prompts_finales_transformados.txt'\n",
515
  "\n",
516
  "# Leer el contenido del archivo\n",
517
  "with open(ruta_archivo, 'r', encoding='utf-8') as archivo:\n",
@@ -529,7 +529,7 @@
529
  },
530
  {
531
  "cell_type": "code",
532
- "execution_count": 114,
533
  "metadata": {},
534
  "outputs": [
535
  {
@@ -648,7 +648,7 @@
648
  "[670 rows x 2 columns]"
649
  ]
650
  },
651
- "execution_count": 114,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
@@ -667,7 +667,7 @@
667
  },
668
  {
669
  "cell_type": "code",
670
- "execution_count": 115,
671
  "metadata": {},
672
  "outputs": [],
673
  "source": [
@@ -683,7 +683,7 @@
683
  },
684
  {
685
  "cell_type": "code",
686
- "execution_count": 116,
687
  "metadata": {},
688
  "outputs": [
689
  {
@@ -827,7 +827,7 @@
827
  "[670 rows x 3 columns]"
828
  ]
829
  },
830
- "execution_count": 116,
831
  "metadata": {},
832
  "output_type": "execute_result"
833
  }
@@ -840,7 +840,7 @@
840
  },
841
  {
842
  "cell_type": "code",
843
- "execution_count": 117,
844
  "metadata": {},
845
  "outputs": [],
846
  "source": [
@@ -853,7 +853,7 @@
853
  },
854
  {
855
  "cell_type": "code",
856
- "execution_count": 118,
857
  "metadata": {},
858
  "outputs": [
859
  {
@@ -997,7 +997,7 @@
997
  "[670 rows x 3 columns]"
998
  ]
999
  },
1000
- "execution_count": 118,
1001
  "metadata": {},
1002
  "output_type": "execute_result"
1003
  }
@@ -1010,7 +1010,7 @@
1010
  },
1011
  {
1012
  "cell_type": "code",
1013
- "execution_count": 119,
1014
  "metadata": {},
1015
  "outputs": [
1016
  {
@@ -1154,7 +1154,7 @@
1154
  "[670 rows x 3 columns]"
1155
  ]
1156
  },
1157
- "execution_count": 119,
1158
  "metadata": {},
1159
  "output_type": "execute_result"
1160
  }
@@ -1174,7 +1174,7 @@
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
- "execution_count": 120,
1178
  "metadata": {},
1179
  "outputs": [],
1180
  "source": [
@@ -1198,11 +1198,11 @@
1198
  },
1199
  {
1200
  "cell_type": "code",
1201
- "execution_count": 121,
1202
  "metadata": {},
1203
  "outputs": [],
1204
  "source": [
1205
- "nombre_json = 'Training_data/prompts_finales.jsonl'\n",
1206
  "\n",
1207
  "crear_json(chat_df,nombre_json)\n"
1208
  ]
@@ -1216,7 +1216,7 @@
1216
  },
1217
  {
1218
  "cell_type": "code",
1219
- "execution_count": 122,
1220
  "metadata": {},
1221
  "outputs": [
1222
  {
@@ -1229,7 +1229,7 @@
1229
  ],
1230
  "source": [
1231
  "# Ruta del archivo JSONL\n",
1232
- "archivo_jsonl = 'Training_data/prompts_finales.jsonl'\n",
1233
  "\n",
1234
  "# Contar las líneas\n",
1235
  "try:\n",
@@ -1242,7 +1242,7 @@
1242
  },
1243
  {
1244
  "cell_type": "code",
1245
- "execution_count": 124,
1246
  "metadata": {},
1247
  "outputs": [],
1248
  "source": [
@@ -1265,11 +1265,11 @@
1265
  },
1266
  {
1267
  "cell_type": "code",
1268
- "execution_count": 125,
1269
  "metadata": {},
1270
  "outputs": [],
1271
  "source": [
1272
- "dividir_jsonl('Training_data/prompts_finales.jsonl', 5)"
1273
  ]
1274
  },
1275
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": 3,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
 
64
  },
65
  {
66
  "cell_type": "code",
67
+ "execution_count": 4,
68
  "metadata": {},
69
  "outputs": [
70
  {
 
115
  },
116
  {
117
  "cell_type": "code",
118
+ "execution_count": 5,
119
  "metadata": {},
120
  "outputs": [],
121
  "source": [
 
152
  },
153
  {
154
  "cell_type": "code",
155
+ "execution_count": 6,
156
  "metadata": {},
157
  "outputs": [
158
  {
 
209
  },
210
  {
211
  "cell_type": "code",
212
+ "execution_count": 7,
213
  "metadata": {},
214
  "outputs": [],
215
  "source": [
 
230
  },
231
  {
232
  "cell_type": "code",
233
+ "execution_count": 8,
234
  "metadata": {},
235
  "outputs": [
236
  {
 
288
  },
289
  {
290
  "cell_type": "code",
291
+ "execution_count": 9,
292
  "metadata": {},
293
  "outputs": [],
294
  "source": [
 
345
  },
346
  {
347
  "cell_type": "code",
348
+ "execution_count": 10,
349
  "metadata": {},
350
  "outputs": [
351
  {
 
461
  "10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
462
  ]
463
  },
464
+ "execution_count": 10,
465
  "metadata": {},
466
  "output_type": "execute_result"
467
  }
 
505
  },
506
  {
507
  "cell_type": "code",
508
+ "execution_count": 12,
509
  "metadata": {},
510
  "outputs": [],
511
  "source": [
512
  "# Ruta del archivo de texto\n",
513
+ "ruta_archivo = 'Raw_Data/Raw_Prompts.txt' \n",
514
+ "ruta_archivo_salida_texto = 'Raw_Data/Transformed_Prompts.txt'\n",
515
  "\n",
516
  "# Leer el contenido del archivo\n",
517
  "with open(ruta_archivo, 'r', encoding='utf-8') as archivo:\n",
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": 13,
533
  "metadata": {},
534
  "outputs": [
535
  {
 
648
  "[670 rows x 2 columns]"
649
  ]
650
  },
651
+ "execution_count": 13,
652
  "metadata": {},
653
  "output_type": "execute_result"
654
  }
 
667
  },
668
  {
669
  "cell_type": "code",
670
+ "execution_count": 14,
671
  "metadata": {},
672
  "outputs": [],
673
  "source": [
 
683
  },
684
  {
685
  "cell_type": "code",
686
+ "execution_count": 15,
687
  "metadata": {},
688
  "outputs": [
689
  {
 
827
  "[670 rows x 3 columns]"
828
  ]
829
  },
830
+ "execution_count": 15,
831
  "metadata": {},
832
  "output_type": "execute_result"
833
  }
 
840
  },
841
  {
842
  "cell_type": "code",
843
+ "execution_count": 16,
844
  "metadata": {},
845
  "outputs": [],
846
  "source": [
 
853
  },
854
  {
855
  "cell_type": "code",
856
+ "execution_count": 17,
857
  "metadata": {},
858
  "outputs": [
859
  {
 
997
  "[670 rows x 3 columns]"
998
  ]
999
  },
1000
+ "execution_count": 17,
1001
  "metadata": {},
1002
  "output_type": "execute_result"
1003
  }
 
1010
  },
1011
  {
1012
  "cell_type": "code",
1013
+ "execution_count": 18,
1014
  "metadata": {},
1015
  "outputs": [
1016
  {
 
1154
  "[670 rows x 3 columns]"
1155
  ]
1156
  },
1157
+ "execution_count": 18,
1158
  "metadata": {},
1159
  "output_type": "execute_result"
1160
  }
 
1174
  },
1175
  {
1176
  "cell_type": "code",
1177
+ "execution_count": 19,
1178
  "metadata": {},
1179
  "outputs": [],
1180
  "source": [
 
1198
  },
1199
  {
1200
  "cell_type": "code",
1201
+ "execution_count": 20,
1202
  "metadata": {},
1203
  "outputs": [],
1204
  "source": [
1205
+ "nombre_json = 'Training_Data/Training_Prompts.jsonl'\n",
1206
  "\n",
1207
  "crear_json(chat_df,nombre_json)\n"
1208
  ]
 
1216
  },
1217
  {
1218
  "cell_type": "code",
1219
+ "execution_count": 22,
1220
  "metadata": {},
1221
  "outputs": [
1222
  {
 
1229
  ],
1230
  "source": [
1231
  "# Ruta del archivo JSONL\n",
1232
+ "archivo_jsonl = 'Training_Data/Training_Prompts.jsonl'\n",
1233
  "\n",
1234
  "# Contar las líneas\n",
1235
  "try:\n",
 
1242
  },
1243
  {
1244
  "cell_type": "code",
1245
+ "execution_count": 23,
1246
  "metadata": {},
1247
  "outputs": [],
1248
  "source": [
 
1265
  },
1266
  {
1267
  "cell_type": "code",
1268
+ "execution_count": 24,
1269
  "metadata": {},
1270
  "outputs": [],
1271
  "source": [
1272
+ "dividir_jsonl('Training_Data/Training_Prompts.jsonl', 5)"
1273
  ]
1274
  },
1275
  {