S-MurilloG committed on
Commit
2da86a5
1 Parent(s): bae67a9

Creating training files

Browse files
CARSE_00_Cleaning.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
@@ -1242,34 +1242,38 @@
1242
  },
1243
  {
1244
  "cell_type": "code",
1245
- "execution_count": 23,
1246
  "metadata": {},
1247
  "outputs": [],
1248
  "source": [
1249
- "def dividir_jsonl(ruta_original, num_partes):\n",
1250
- " # Leer el archivo original\n",
1251
- " with open(ruta_original, 'r', encoding='utf-8') as file:\n",
1252
  " lineas = file.readlines()\n",
1253
  "\n",
1254
  " # Calcular el tamaño de cada parte\n",
1255
- " total_lineas = len(lineas)\n",
1256
- " tamaño_parte = math.ceil(total_lineas / num_partes)\n",
1257
  "\n",
1258
- " # Dividir y guardar las partes\n",
1259
  " for i in range(num_partes):\n",
1260
- " parte = lineas[i*tamaño_parte:(i+1)*tamaño_parte]\n",
1261
- " ruta_nueva = ruta_original.replace('.jsonl', f'_{i+1}.jsonl')\n",
1262
- " with open(ruta_nueva, 'w', encoding='utf-8') as new_file:\n",
1263
- " new_file.writelines(parte)"
 
 
 
 
 
 
1264
  ]
1265
  },
1266
  {
1267
  "cell_type": "code",
1268
- "execution_count": 24,
1269
  "metadata": {},
1270
  "outputs": [],
1271
  "source": [
1272
- "dividir_jsonl('Training_Data/Training_Prompts.jsonl', 5)"
1273
  ]
1274
  },
1275
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
 
1242
  },
1243
  {
1244
  "cell_type": "code",
1245
+ "execution_count": 2,
1246
  "metadata": {},
1247
  "outputs": [],
1248
  "source": [
1249
+ "def dividir_jsonl(ruta_archivo, num_partes):\n",
1250
+ " # Leer todas las líneas del archivo\n",
1251
+ " with open(ruta_archivo, 'r', encoding='utf-8') as file:\n",
1252
  " lineas = file.readlines()\n",
1253
  "\n",
1254
  " # Calcular el tamaño de cada parte\n",
1255
+ " tamano_parte = len(lineas) // num_partes\n",
 
1256
  "\n",
 
1257
  " for i in range(num_partes):\n",
1258
+ " # Calcular el inicio y el fin de cada parte\n",
1259
+ " inicio = i * tamano_parte\n",
1260
+ " fin = (i + 1) * tamano_parte if i != num_partes - 1 else len(lineas)\n",
1261
+ "\n",
1262
+ " # Nombre del nuevo archivo\n",
1263
+ " nombre_nuevo_archivo = ruta_archivo.replace('.jsonl', f'_{i + 1}.jsonl')\n",
1264
+ "\n",
1265
+ " # Escribir las líneas en el nuevo archivo\n",
1266
+ " with open(nombre_nuevo_archivo, 'w', encoding='utf-8') as nuevo_archivo:\n",
1267
+ " nuevo_archivo.writelines(lineas[inicio:fin])"
1268
  ]
1269
  },
1270
  {
1271
  "cell_type": "code",
1272
+ "execution_count": 3,
1273
  "metadata": {},
1274
  "outputs": [],
1275
  "source": [
1276
+ "dividir_jsonl('Training_Data/Training_Prompts.jsonl', 3)"
1277
  ]
1278
  },
1279
  {
Training_Data/Training_Prompts_1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Training_Data/Training_Prompts_2.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
Training_Data/Training_Prompts_3.jsonl ADDED
The diff for this file is too large to render. See raw diff