Spaces:
Sleeping
Sleeping
S-MurilloG
committed on
Commit
•
7450643
1
Parent(s):
4f1b3a6
Creating Training Data
Browse files- CARSE_Cleaning.ipynb +31 -31
CARSE_Cleaning.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
@@ -21,7 +21,7 @@
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
-
"execution_count":
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
@@ -64,7 +64,7 @@
|
|
64 |
},
|
65 |
{
|
66 |
"cell_type": "code",
|
67 |
-
"execution_count":
|
68 |
"metadata": {},
|
69 |
"outputs": [
|
70 |
{
|
@@ -115,7 +115,7 @@
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
-
"execution_count":
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
121 |
"source": [
|
@@ -152,7 +152,7 @@
|
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
-
"execution_count":
|
156 |
"metadata": {},
|
157 |
"outputs": [
|
158 |
{
|
@@ -209,7 +209,7 @@
|
|
209 |
},
|
210 |
{
|
211 |
"cell_type": "code",
|
212 |
-
"execution_count":
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
215 |
"source": [
|
@@ -230,7 +230,7 @@
|
|
230 |
},
|
231 |
{
|
232 |
"cell_type": "code",
|
233 |
-
"execution_count":
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
236 |
{
|
@@ -288,7 +288,7 @@
|
|
288 |
},
|
289 |
{
|
290 |
"cell_type": "code",
|
291 |
-
"execution_count":
|
292 |
"metadata": {},
|
293 |
"outputs": [],
|
294 |
"source": [
|
@@ -345,7 +345,7 @@
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
-
"execution_count":
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
@@ -461,7 +461,7 @@
|
|
461 |
"10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
|
462 |
]
|
463 |
},
|
464 |
-
"execution_count":
|
465 |
"metadata": {},
|
466 |
"output_type": "execute_result"
|
467 |
}
|
@@ -505,13 +505,13 @@
|
|
505 |
},
|
506 |
{
|
507 |
"cell_type": "code",
|
508 |
-
"execution_count":
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
511 |
"source": [
|
512 |
"# Ruta del archivo de texto\n",
|
513 |
-
"ruta_archivo = '
|
514 |
-
"ruta_archivo_salida_texto = '
|
515 |
"\n",
|
516 |
"# Leer el contenido del archivo\n",
|
517 |
"with open(ruta_archivo, 'r', encoding='utf-8') as archivo:\n",
|
@@ -529,7 +529,7 @@
|
|
529 |
},
|
530 |
{
|
531 |
"cell_type": "code",
|
532 |
-
"execution_count":
|
533 |
"metadata": {},
|
534 |
"outputs": [
|
535 |
{
|
@@ -648,7 +648,7 @@
|
|
648 |
"[670 rows x 2 columns]"
|
649 |
]
|
650 |
},
|
651 |
-
"execution_count":
|
652 |
"metadata": {},
|
653 |
"output_type": "execute_result"
|
654 |
}
|
@@ -667,7 +667,7 @@
|
|
667 |
},
|
668 |
{
|
669 |
"cell_type": "code",
|
670 |
-
"execution_count":
|
671 |
"metadata": {},
|
672 |
"outputs": [],
|
673 |
"source": [
|
@@ -683,7 +683,7 @@
|
|
683 |
},
|
684 |
{
|
685 |
"cell_type": "code",
|
686 |
-
"execution_count":
|
687 |
"metadata": {},
|
688 |
"outputs": [
|
689 |
{
|
@@ -827,7 +827,7 @@
|
|
827 |
"[670 rows x 3 columns]"
|
828 |
]
|
829 |
},
|
830 |
-
"execution_count":
|
831 |
"metadata": {},
|
832 |
"output_type": "execute_result"
|
833 |
}
|
@@ -840,7 +840,7 @@
|
|
840 |
},
|
841 |
{
|
842 |
"cell_type": "code",
|
843 |
-
"execution_count":
|
844 |
"metadata": {},
|
845 |
"outputs": [],
|
846 |
"source": [
|
@@ -853,7 +853,7 @@
|
|
853 |
},
|
854 |
{
|
855 |
"cell_type": "code",
|
856 |
-
"execution_count":
|
857 |
"metadata": {},
|
858 |
"outputs": [
|
859 |
{
|
@@ -997,7 +997,7 @@
|
|
997 |
"[670 rows x 3 columns]"
|
998 |
]
|
999 |
},
|
1000 |
-
"execution_count":
|
1001 |
"metadata": {},
|
1002 |
"output_type": "execute_result"
|
1003 |
}
|
@@ -1010,7 +1010,7 @@
|
|
1010 |
},
|
1011 |
{
|
1012 |
"cell_type": "code",
|
1013 |
-
"execution_count":
|
1014 |
"metadata": {},
|
1015 |
"outputs": [
|
1016 |
{
|
@@ -1154,7 +1154,7 @@
|
|
1154 |
"[670 rows x 3 columns]"
|
1155 |
]
|
1156 |
},
|
1157 |
-
"execution_count":
|
1158 |
"metadata": {},
|
1159 |
"output_type": "execute_result"
|
1160 |
}
|
@@ -1174,7 +1174,7 @@
|
|
1174 |
},
|
1175 |
{
|
1176 |
"cell_type": "code",
|
1177 |
-
"execution_count":
|
1178 |
"metadata": {},
|
1179 |
"outputs": [],
|
1180 |
"source": [
|
@@ -1198,11 +1198,11 @@
|
|
1198 |
},
|
1199 |
{
|
1200 |
"cell_type": "code",
|
1201 |
-
"execution_count":
|
1202 |
"metadata": {},
|
1203 |
"outputs": [],
|
1204 |
"source": [
|
1205 |
-
"nombre_json = 'Training_data/
|
1206 |
"\n",
|
1207 |
"crear_json(chat_df,nombre_json)\n"
|
1208 |
]
|
@@ -1216,7 +1216,7 @@
|
|
1216 |
},
|
1217 |
{
|
1218 |
"cell_type": "code",
|
1219 |
-
"execution_count":
|
1220 |
"metadata": {},
|
1221 |
"outputs": [
|
1222 |
{
|
@@ -1229,7 +1229,7 @@
|
|
1229 |
],
|
1230 |
"source": [
|
1231 |
"# Ruta del archivo JSONL\n",
|
1232 |
-
"archivo_jsonl = '
|
1233 |
"\n",
|
1234 |
"# Contar las líneas\n",
|
1235 |
"try:\n",
|
@@ -1242,7 +1242,7 @@
|
|
1242 |
},
|
1243 |
{
|
1244 |
"cell_type": "code",
|
1245 |
-
"execution_count":
|
1246 |
"metadata": {},
|
1247 |
"outputs": [],
|
1248 |
"source": [
|
@@ -1265,11 +1265,11 @@
|
|
1265 |
},
|
1266 |
{
|
1267 |
"cell_type": "code",
|
1268 |
-
"execution_count":
|
1269 |
"metadata": {},
|
1270 |
"outputs": [],
|
1271 |
"source": [
|
1272 |
-
"dividir_jsonl('
|
1273 |
]
|
1274 |
},
|
1275 |
{
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
|
|
21 |
},
|
22 |
{
|
23 |
"cell_type": "code",
|
24 |
+
"execution_count": 3,
|
25 |
"metadata": {},
|
26 |
"outputs": [],
|
27 |
"source": [
|
|
|
64 |
},
|
65 |
{
|
66 |
"cell_type": "code",
|
67 |
+
"execution_count": 4,
|
68 |
"metadata": {},
|
69 |
"outputs": [
|
70 |
{
|
|
|
115 |
},
|
116 |
{
|
117 |
"cell_type": "code",
|
118 |
+
"execution_count": 5,
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
121 |
"source": [
|
|
|
152 |
},
|
153 |
{
|
154 |
"cell_type": "code",
|
155 |
+
"execution_count": 6,
|
156 |
"metadata": {},
|
157 |
"outputs": [
|
158 |
{
|
|
|
209 |
},
|
210 |
{
|
211 |
"cell_type": "code",
|
212 |
+
"execution_count": 7,
|
213 |
"metadata": {},
|
214 |
"outputs": [],
|
215 |
"source": [
|
|
|
230 |
},
|
231 |
{
|
232 |
"cell_type": "code",
|
233 |
+
"execution_count": 8,
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
236 |
{
|
|
|
288 |
},
|
289 |
{
|
290 |
"cell_type": "code",
|
291 |
+
"execution_count": 9,
|
292 |
"metadata": {},
|
293 |
"outputs": [],
|
294 |
"source": [
|
|
|
345 |
},
|
346 |
{
|
347 |
"cell_type": "code",
|
348 |
+
"execution_count": 10,
|
349 |
"metadata": {},
|
350 |
"outputs": [
|
351 |
{
|
|
|
461 |
"10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
|
462 |
]
|
463 |
},
|
464 |
+
"execution_count": 10,
|
465 |
"metadata": {},
|
466 |
"output_type": "execute_result"
|
467 |
}
|
|
|
505 |
},
|
506 |
{
|
507 |
"cell_type": "code",
|
508 |
+
"execution_count": 12,
|
509 |
"metadata": {},
|
510 |
"outputs": [],
|
511 |
"source": [
|
512 |
"# Ruta del archivo de texto\n",
|
513 |
+
"ruta_archivo = 'Raw_Data/Raw_Prompts.txt' \n",
|
514 |
+
"ruta_archivo_salida_texto = 'Raw_Data/Transformed_Prompts.txt'\n",
|
515 |
"\n",
|
516 |
"# Leer el contenido del archivo\n",
|
517 |
"with open(ruta_archivo, 'r', encoding='utf-8') as archivo:\n",
|
|
|
529 |
},
|
530 |
{
|
531 |
"cell_type": "code",
|
532 |
+
"execution_count": 13,
|
533 |
"metadata": {},
|
534 |
"outputs": [
|
535 |
{
|
|
|
648 |
"[670 rows x 2 columns]"
|
649 |
]
|
650 |
},
|
651 |
+
"execution_count": 13,
|
652 |
"metadata": {},
|
653 |
"output_type": "execute_result"
|
654 |
}
|
|
|
667 |
},
|
668 |
{
|
669 |
"cell_type": "code",
|
670 |
+
"execution_count": 14,
|
671 |
"metadata": {},
|
672 |
"outputs": [],
|
673 |
"source": [
|
|
|
683 |
},
|
684 |
{
|
685 |
"cell_type": "code",
|
686 |
+
"execution_count": 15,
|
687 |
"metadata": {},
|
688 |
"outputs": [
|
689 |
{
|
|
|
827 |
"[670 rows x 3 columns]"
|
828 |
]
|
829 |
},
|
830 |
+
"execution_count": 15,
|
831 |
"metadata": {},
|
832 |
"output_type": "execute_result"
|
833 |
}
|
|
|
840 |
},
|
841 |
{
|
842 |
"cell_type": "code",
|
843 |
+
"execution_count": 16,
|
844 |
"metadata": {},
|
845 |
"outputs": [],
|
846 |
"source": [
|
|
|
853 |
},
|
854 |
{
|
855 |
"cell_type": "code",
|
856 |
+
"execution_count": 17,
|
857 |
"metadata": {},
|
858 |
"outputs": [
|
859 |
{
|
|
|
997 |
"[670 rows x 3 columns]"
|
998 |
]
|
999 |
},
|
1000 |
+
"execution_count": 17,
|
1001 |
"metadata": {},
|
1002 |
"output_type": "execute_result"
|
1003 |
}
|
|
|
1010 |
},
|
1011 |
{
|
1012 |
"cell_type": "code",
|
1013 |
+
"execution_count": 18,
|
1014 |
"metadata": {},
|
1015 |
"outputs": [
|
1016 |
{
|
|
|
1154 |
"[670 rows x 3 columns]"
|
1155 |
]
|
1156 |
},
|
1157 |
+
"execution_count": 18,
|
1158 |
"metadata": {},
|
1159 |
"output_type": "execute_result"
|
1160 |
}
|
|
|
1174 |
},
|
1175 |
{
|
1176 |
"cell_type": "code",
|
1177 |
+
"execution_count": 19,
|
1178 |
"metadata": {},
|
1179 |
"outputs": [],
|
1180 |
"source": [
|
|
|
1198 |
},
|
1199 |
{
|
1200 |
"cell_type": "code",
|
1201 |
+
"execution_count": 20,
|
1202 |
"metadata": {},
|
1203 |
"outputs": [],
|
1204 |
"source": [
|
1205 |
+
"nombre_json = 'Training_Data/Training_Prompts.jsonl'\n",
|
1206 |
"\n",
|
1207 |
"crear_json(chat_df,nombre_json)\n"
|
1208 |
]
|
|
|
1216 |
},
|
1217 |
{
|
1218 |
"cell_type": "code",
|
1219 |
+
"execution_count": 22,
|
1220 |
"metadata": {},
|
1221 |
"outputs": [
|
1222 |
{
|
|
|
1229 |
],
|
1230 |
"source": [
|
1231 |
"# Ruta del archivo JSONL\n",
|
1232 |
+
"archivo_jsonl = 'Training_Data/Training_Prompts.jsonl'\n",
|
1233 |
"\n",
|
1234 |
"# Contar las líneas\n",
|
1235 |
"try:\n",
|
|
|
1242 |
},
|
1243 |
{
|
1244 |
"cell_type": "code",
|
1245 |
+
"execution_count": 23,
|
1246 |
"metadata": {},
|
1247 |
"outputs": [],
|
1248 |
"source": [
|
|
|
1265 |
},
|
1266 |
{
|
1267 |
"cell_type": "code",
|
1268 |
+
"execution_count": 24,
|
1269 |
"metadata": {},
|
1270 |
"outputs": [],
|
1271 |
"source": [
|
1272 |
+
"dividir_jsonl('Training_Data/Training_Prompts.jsonl', 5)"
|
1273 |
]
|
1274 |
},
|
1275 |
{
|