Spaces:
Sleeping
Sleeping
S-MurilloG
committed on
Commit
•
2da86a5
1
Parent(s):
bae67a9
Creating training files
Browse files
CARSE_00_Cleaning.ipynb
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
@@ -1242,34 +1242,38 @@
|
|
1242 |
},
|
1243 |
{
|
1244 |
"cell_type": "code",
|
1245 |
-
"execution_count":
|
1246 |
"metadata": {},
|
1247 |
"outputs": [],
|
1248 |
"source": [
|
1249 |
-
"def dividir_jsonl(
|
1250 |
-
" # Leer
|
1251 |
-
" with open(
|
1252 |
" lineas = file.readlines()\n",
|
1253 |
"\n",
|
1254 |
" # Calcular el tamaño de cada parte\n",
|
1255 |
-
"
|
1256 |
-
" tamaño_parte = math.ceil(total_lineas / num_partes)\n",
|
1257 |
"\n",
|
1258 |
-
" # Dividir y guardar las partes\n",
|
1259 |
" for i in range(num_partes):\n",
|
1260 |
-
"
|
1261 |
-
"
|
1262 |
-
"
|
1263 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
1264 |
]
|
1265 |
},
|
1266 |
{
|
1267 |
"cell_type": "code",
|
1268 |
-
"execution_count":
|
1269 |
"metadata": {},
|
1270 |
"outputs": [],
|
1271 |
"source": [
|
1272 |
-
"dividir_jsonl('Training_Data/Training_Prompts.jsonl',
|
1273 |
]
|
1274 |
},
|
1275 |
{
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
"metadata": {},
|
7 |
"outputs": [],
|
8 |
"source": [
|
|
|
1242 |
},
|
1243 |
{
|
1244 |
"cell_type": "code",
|
1245 |
+
"execution_count": 2,
|
1246 |
"metadata": {},
|
1247 |
"outputs": [],
|
1248 |
"source": [
|
1249 |
+
"def dividir_jsonl(ruta_archivo, num_partes):\n",
|
1250 |
+
" # Leer todas las líneas del archivo\n",
|
1251 |
+
" with open(ruta_archivo, 'r', encoding='utf-8') as file:\n",
|
1252 |
" lineas = file.readlines()\n",
|
1253 |
"\n",
|
1254 |
" # Calcular el tamaño de cada parte\n",
|
1255 |
+
" tamano_parte = len(lineas) // num_partes\n",
|
|
|
1256 |
"\n",
|
|
|
1257 |
" for i in range(num_partes):\n",
|
1258 |
+
" # Calcular el inicio y el fin de cada parte\n",
|
1259 |
+
" inicio = i * tamano_parte\n",
|
1260 |
+
" fin = (i + 1) * tamano_parte if i != num_partes - 1 else len(lineas)\n",
|
1261 |
+
"\n",
|
1262 |
+
" # Nombre del nuevo archivo\n",
|
1263 |
+
" nombre_nuevo_archivo = ruta_archivo.replace('.jsonl', f'_{i + 1}.jsonl')\n",
|
1264 |
+
"\n",
|
1265 |
+
" # Escribir las líneas en el nuevo archivo\n",
|
1266 |
+
" with open(nombre_nuevo_archivo, 'w', encoding='utf-8') as nuevo_archivo:\n",
|
1267 |
+
" nuevo_archivo.writelines(lineas[inicio:fin])"
|
1268 |
]
|
1269 |
},
|
1270 |
{
|
1271 |
"cell_type": "code",
|
1272 |
+
"execution_count": 3,
|
1273 |
"metadata": {},
|
1274 |
"outputs": [],
|
1275 |
"source": [
|
1276 |
+
"dividir_jsonl('Training_Data/Training_Prompts.jsonl', 3)"
|
1277 |
]
|
1278 |
},
|
1279 |
{
|
Training_Data/Training_Prompts_1.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Training_Data/Training_Prompts_2.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|
Training_Data/Training_Prompts_3.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
|
|