Spaces:
Running
Running
S-MurilloG
commited on
Commit
•
1b001d2
1
Parent(s):
58a2b73
Add first scripts
Browse files- .DS_Store +0 -0
- CARSE_Cleaning.ipynb +1304 -0
- CARSE_Prep.ipynb +1 -0
- Raw_Data/Raw_Prompts.txt +0 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
CARSE_Cleaning.ipynb
ADDED
@@ -0,0 +1,1304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 123,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import json\n",
|
11 |
+
"import math\n",
|
12 |
+
"import re"
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "markdown",
|
17 |
+
"metadata": {},
|
18 |
+
"source": [
|
19 |
+
"# Función para agrupar mensajes de Remitentes"
|
20 |
+
]
|
21 |
+
},
|
22 |
+
{
|
23 |
+
"cell_type": "code",
|
24 |
+
"execution_count": 105,
|
25 |
+
"metadata": {},
|
26 |
+
"outputs": [],
|
27 |
+
"source": [
|
28 |
+
"def agrupar_remitentes(texto_in):\n",
|
29 |
+
" # Dividir el texto en líneas\n",
|
30 |
+
" lineas = texto_in.strip().split('\\n')\n",
|
31 |
+
"\n",
|
32 |
+
" # Procesar las líneas para agrupar los mensajes por remitente\n",
|
33 |
+
" texto_out = \"\"\n",
|
34 |
+
" ultimo_remitente = \"\"\n",
|
35 |
+
" timestamp_actual = \"\"\n",
|
36 |
+
"\n",
|
37 |
+
" for linea in lineas:\n",
|
38 |
+
" # Verificar si la línea tiene un remitente (es decir, si es un nuevo mensaje)\n",
|
39 |
+
" if re.match(r\"\\[\\d{2}/\\d{2}/\\d{2}, \\d{1,2}:\\d{2}:\\d{2} [a.p].m.\\] \\w+:\", linea):\n",
|
40 |
+
" timestamp, remitente, mensaje = re.match(r\"(\\[\\d{2}/\\d{2}/\\d{2}, \\d{1,2}:\\d{2}:\\d{2} [a.p].m.\\]) (\\w+): (.*)\", linea).groups()\n",
|
41 |
+
" # Si el remitente es diferente al último, se inicia un nuevo mensaje\n",
|
42 |
+
" if remitente != ultimo_remitente:\n",
|
43 |
+
" texto_out += f\"{timestamp} {remitente}: {mensaje}\\n\"\n",
|
44 |
+
" ultimo_remitente = remitente\n",
|
45 |
+
" timestamp_actual = timestamp\n",
|
46 |
+
" else:\n",
|
47 |
+
" # Añadir el mensaje actual al último mensaje del mismo remitente\n",
|
48 |
+
" texto_out = texto_out.strip() + \"\\n\" + mensaje + \"\\n\"\n",
|
49 |
+
" else:\n",
|
50 |
+
" # Si la línea no tiene remitente, se añade al mensaje actual\n",
|
51 |
+
" texto_out = texto_out.strip() + \"\\n\" + linea.strip() + \"\\n\"\n",
|
52 |
+
"\n",
|
53 |
+
" texto_out = texto_out.strip()\n",
|
54 |
+
"\n",
|
55 |
+
" return texto_out"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "markdown",
|
60 |
+
"metadata": {},
|
61 |
+
"source": [
|
62 |
+
"### Prueba"
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": 106,
|
68 |
+
"metadata": {},
|
69 |
+
"outputs": [
|
70 |
+
{
|
71 |
+
"name": "stdout",
|
72 |
+
"output_type": "stream",
|
73 |
+
"text": [
|
74 |
+
"[30/08/22, 4:11:28 p.m.] CarmenQ: Pues eso es bueno\n",
|
75 |
+
"Así te llenas cool\n",
|
76 |
+
"[30/08/22, 4:11:51 p.m.] Sebas: Pues si\n",
|
77 |
+
"Ya me la terminé\n",
|
78 |
+
"Ya voy a regresar\n",
|
79 |
+
"[30/08/22, 4:12:15 p.m.] CarmenQ: Que bueno\n",
|
80 |
+
"Ok amor\n",
|
81 |
+
"Mucha suerte\n",
|
82 |
+
"La verdad es que tengo mucho sueño\n",
|
83 |
+
"[30/08/22, 4:12:33 p.m.] Sebas: Gracias amorcito\n",
|
84 |
+
"Te amo mucho\n",
|
85 |
+
"[30/08/22, 4:12:40 p.m.] CarmenQ: Yo a ti\n"
|
86 |
+
]
|
87 |
+
}
|
88 |
+
],
|
89 |
+
"source": [
|
90 |
+
"texto_in = \"\"\"\n",
|
91 |
+
"[30/08/22, 4:11:28 p.m.] CarmenQ: Pues eso es bueno\n",
|
92 |
+
"[30/08/22, 4:11:33 p.m.] CarmenQ: Así te llenas cool\n",
|
93 |
+
"[30/08/22, 4:11:51 p.m.] Sebas: Pues si\n",
|
94 |
+
"[30/08/22, 4:11:55 p.m.] Sebas: Ya me la terminé\n",
|
95 |
+
"Ya voy a regresar\n",
|
96 |
+
"[30/08/22, 4:12:15 p.m.] CarmenQ: Que bueno\n",
|
97 |
+
"Ok amor\n",
|
98 |
+
"[30/08/22, 4:12:25 p.m.] CarmenQ: Mucha suerte\n",
|
99 |
+
"La verdad es que tengo mucho sueño\n",
|
100 |
+
"[30/08/22, 4:12:33 p.m.] Sebas: Gracias amorcito\n",
|
101 |
+
"Te amo mucho\n",
|
102 |
+
"[30/08/22, 4:12:40 p.m.] CarmenQ: Yo a ti\n",
|
103 |
+
"\"\"\"\n",
|
104 |
+
"\n",
|
105 |
+
"# Imprimir el texto transformado\n",
|
106 |
+
"print(agrupar_remitentes(texto_in))"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "markdown",
|
111 |
+
"metadata": {},
|
112 |
+
"source": [
|
113 |
+
"# Función para dejar explícitos los Saltos de Línea"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 107,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"def formatear_salto_linea(texto_entrada):\n",
|
123 |
+
" # Dividimos el texto en líneas\n",
|
124 |
+
" lineas = texto_entrada.split(\"\\n\")\n",
|
125 |
+
"\n",
|
126 |
+
" texto_salida = \"\"\n",
|
127 |
+
" mensaje_actual = \"\"\n",
|
128 |
+
"\n",
|
129 |
+
" for linea in lineas:\n",
|
130 |
+
" # Si la línea tiene el formato de timestamp, es un nuevo mensaje\n",
|
131 |
+
" if \"[\" in linea and \"]\" in linea and \":\" in linea and linea[1:3].isdigit() and linea[4:6].isdigit():\n",
|
132 |
+
" # Agregamos el mensaje anterior al texto de salida (si existe)\n",
|
133 |
+
" if mensaje_actual:\n",
|
134 |
+
" texto_salida += mensaje_actual + \"\\n\"\n",
|
135 |
+
" mensaje_actual = linea\n",
|
136 |
+
" else:\n",
|
137 |
+
" # Si no es un nuevo mensaje, es una continuación del mensaje actual\n",
|
138 |
+
" mensaje_actual += \"\\\\n\" + linea\n",
|
139 |
+
"\n",
|
140 |
+
" # Agregamos el último mensaje al texto de salida\n",
|
141 |
+
" texto_salida += mensaje_actual\n",
|
142 |
+
"\n",
|
143 |
+
" return texto_salida"
|
144 |
+
]
|
145 |
+
},
|
146 |
+
{
|
147 |
+
"cell_type": "markdown",
|
148 |
+
"metadata": {},
|
149 |
+
"source": [
|
150 |
+
"### Prueba"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "code",
|
155 |
+
"execution_count": 108,
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [
|
158 |
+
{
|
159 |
+
"name": "stdout",
|
160 |
+
"output_type": "stream",
|
161 |
+
"text": [
|
162 |
+
"\\n\n",
|
163 |
+
"[09/10/23, 3:04:35 p.m.] Sebas: Que bueno amorrr, te creo\\nPues sigue aprendiendo un buennn\n",
|
164 |
+
"[05/10/23, 5:56:18 p.m.] CarmenQ: Ya ando en casa bb\n",
|
165 |
+
"[05/10/23, 6:06:25 p.m.] Sebas: Qué bueno mi amor, llegaron con bien?\\nLa verdad es que tengo un buen de sueño\\nY te extraño\n",
|
166 |
+
"[04/10/23, 6:29:34 p.m.] CarmenQ: Pues si quieres te paso a ver a tu casita mi amor\\nTienes clase hasta las 3 no?\n",
|
167 |
+
"[04/10/23, 7:20:44 p.m.] Sebas: Sí bb, justo tengo clase a esa hora\n",
|
168 |
+
"[04/10/23, 3:37:58 p.m.] CarmenQ: Solo deja acabo mi clase y te cuento\\nPero tienes que ponerme atención eh\n",
|
169 |
+
"[04/10/23, 3:44:47 p.m.] Sebas: Okey amor, cuando puedas, no hay prisa\n",
|
170 |
+
"[02/10/23, 10:44:20 p.m.] CarmenQ: Te paso a ver más tarde\\nO el jueves voy\\nYa que no tengo clase\n",
|
171 |
+
"[02/10/23, 10:45:05 p.m.] Sebas: Como como? En la noche?\n",
|
172 |
+
"[18/06/23, 11:00:27 p.m.] CarmenQ: Te amo mucho mi amor\n",
|
173 |
+
"[18/10/23, 11:11:24 p.m.] Sebas: Te amo mucho más mi C\\n\n"
|
174 |
+
]
|
175 |
+
}
|
176 |
+
],
|
177 |
+
"source": [
|
178 |
+
"texto_entrada = \"\"\"\n",
|
179 |
+
"[09/10/23, 3:04:35 p.m.] Sebas: Que bueno amorrr, te creo\n",
|
180 |
+
"Pues sigue aprendiendo un buennn\n",
|
181 |
+
"[05/10/23, 5:56:18 p.m.] CarmenQ: Ya ando en casa bb\n",
|
182 |
+
"[05/10/23, 6:06:25 p.m.] Sebas: Qué bueno mi amor, llegaron con bien?\n",
|
183 |
+
"La verdad es que tengo un buen de sueño\n",
|
184 |
+
"Y te extraño\n",
|
185 |
+
"[04/10/23, 6:29:34 p.m.] CarmenQ: Pues si quieres te paso a ver a tu casita mi amor\n",
|
186 |
+
"Tienes clase hasta las 3 no?\n",
|
187 |
+
"[04/10/23, 7:20:44 p.m.] Sebas: Sí bb, justo tengo clase a esa hora\n",
|
188 |
+
"[04/10/23, 3:37:58 p.m.] CarmenQ: Solo deja acabo mi clase y te cuento\n",
|
189 |
+
"Pero tienes que ponerme atención eh\n",
|
190 |
+
"[04/10/23, 3:44:47 p.m.] Sebas: Okey amor, cuando puedas, no hay prisa\n",
|
191 |
+
"[02/10/23, 10:44:20 p.m.] CarmenQ: Te paso a ver más tarde\n",
|
192 |
+
"O el jueves voy\n",
|
193 |
+
"Ya que no tengo clase\n",
|
194 |
+
"[02/10/23, 10:45:05 p.m.] Sebas: Como como? En la noche?\n",
|
195 |
+
"[18/06/23, 11:00:27 p.m.] CarmenQ: Te amo mucho mi amor\n",
|
196 |
+
"[18/10/23, 11:11:24 p.m.] Sebas: Te amo mucho más mi C\n",
|
197 |
+
"\"\"\"\n",
|
198 |
+
"\n",
|
199 |
+
"# Imprimir el texto transformado\n",
|
200 |
+
"print(formatear_salto_linea(texto_entrada))"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"cell_type": "markdown",
|
205 |
+
"metadata": {},
|
206 |
+
"source": [
|
207 |
+
"# Función para eliminar Timestamps"
|
208 |
+
]
|
209 |
+
},
|
210 |
+
{
|
211 |
+
"cell_type": "code",
|
212 |
+
"execution_count": 109,
|
213 |
+
"metadata": {},
|
214 |
+
"outputs": [],
|
215 |
+
"source": [
|
216 |
+
"def eliminar_timestamps(texto_entrada):\n",
|
217 |
+
" # Usamos una expresión regular para encontrar y eliminar los timestamps\n",
|
218 |
+
" # La expresión regular busca un patrón de la forma [dd/mm/aa, hh:mm:ss am/pm]\n",
|
219 |
+
" texto_salida = re.sub(r\"\\[\\d{2}/\\d{2}/\\d{2}, \\d{1,2}:\\d{2}:\\d{2} [ap]\\.m\\.\\] \", \"\", texto_entrada)\n",
|
220 |
+
"\n",
|
221 |
+
" return texto_salida"
|
222 |
+
]
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"cell_type": "markdown",
|
226 |
+
"metadata": {},
|
227 |
+
"source": [
|
228 |
+
"### Prueba"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"cell_type": "code",
|
233 |
+
"execution_count": 110,
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [
|
236 |
+
{
|
237 |
+
"name": "stdout",
|
238 |
+
"output_type": "stream",
|
239 |
+
"text": [
|
240 |
+
"\n",
|
241 |
+
"Sebas: Que bueno amorrr, te creo\n",
|
242 |
+
"Pues sigue aprendiendo un buennn\n",
|
243 |
+
"CarmenQ: Ya ando en casa bb\n",
|
244 |
+
"Sebas: Qué bueno mi amor, llegaron con bien?\n",
|
245 |
+
"La verdad es que tengo un buen de sueño\n",
|
246 |
+
"Y te extraño\n",
|
247 |
+
"CarmenQ: Pues si quieres te paso a ver a tu casita mi amor\n",
|
248 |
+
"Tienes clase hasta las 3 no?\n",
|
249 |
+
"Sebas: Sí bb, justo tengo clase a esa hora\n",
|
250 |
+
"CarmenQ: Solo deja acabo mi clase y te cuento\n",
|
251 |
+
"Pero tienes que ponerme atención eh\n",
|
252 |
+
"Sebas: Okey amor, cuando puedas, no hay prisa\n",
|
253 |
+
"CarmenQ: Te paso a ver más tarde\n",
|
254 |
+
"O el jueves voy\n",
|
255 |
+
"Ya que no tengo clase\n",
|
256 |
+
"Sebas: Como como? En la noche?\n",
|
257 |
+
"CarmenQ: Te amo mucho mi amor\n",
|
258 |
+
"Sebas: Te amo mucho más mi C\n",
|
259 |
+
"\n"
|
260 |
+
]
|
261 |
+
}
|
262 |
+
],
|
263 |
+
"source": [
|
264 |
+
"texto_entrada = \"\"\"\n",
|
265 |
+
"[09/10/23, 3:04:35 p.m.] Sebas: Que bueno amorrr, te creo\\nPues sigue aprendiendo un buennn\n",
|
266 |
+
"[05/10/23, 5:56:18 p.m.] CarmenQ: Ya ando en casa bb\n",
|
267 |
+
"[05/10/23, 6:06:25 p.m.] Sebas: Qué bueno mi amor, llegaron con bien?\\nLa verdad es que tengo un buen de sueño\\nY te extraño\n",
|
268 |
+
"[04/10/23, 6:29:34 p.m.] CarmenQ: Pues si quieres te paso a ver a tu casita mi amor\\nTienes clase hasta las 3 no?\n",
|
269 |
+
"[04/10/23, 7:20:44 p.m.] Sebas: Sí bb, justo tengo clase a esa hora\n",
|
270 |
+
"[04/10/23, 3:37:58 p.m.] CarmenQ: Solo deja acabo mi clase y te cuento\\nPero tienes que ponerme atención eh\n",
|
271 |
+
"[04/10/23, 3:44:47 p.m.] Sebas: Okey amor, cuando puedas, no hay prisa\n",
|
272 |
+
"[02/10/23, 10:44:20 p.m.] CarmenQ: Te paso a ver más tarde\\nO el jueves voy\\nYa que no tengo clase\n",
|
273 |
+
"[02/10/23, 10:45:05 p.m.] Sebas: Como como? En la noche?\n",
|
274 |
+
"[18/06/23, 11:00:27 p.m.] CarmenQ: Te amo mucho mi amor\n",
|
275 |
+
"[18/10/23, 11:11:24 p.m.] Sebas: Te amo mucho más mi C\n",
|
276 |
+
"\"\"\"\n",
|
277 |
+
"\n",
|
278 |
+
"# Imprimir el texto transformado\n",
|
279 |
+
"print(eliminar_timestamps(texto_entrada))"
|
280 |
+
]
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"cell_type": "markdown",
|
284 |
+
"metadata": {},
|
285 |
+
"source": [
|
286 |
+
"### Función para crear un Dataset con todos los mensajes de CarmenQ y Sebas"
|
287 |
+
]
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"cell_type": "code",
|
291 |
+
"execution_count": 111,
|
292 |
+
"metadata": {},
|
293 |
+
"outputs": [],
|
294 |
+
"source": [
|
295 |
+
"def crear_dataset(texto_entrada):\n",
|
296 |
+
" # Dividimos el texto en líneas\n",
|
297 |
+
" lineas = texto_entrada.strip().split(\"\\n\")\n",
|
298 |
+
"\n",
|
299 |
+
" # Crear listas para guardar los mensajes de CarmenQ y Sebas\n",
|
300 |
+
" mensajes_carmen = []\n",
|
301 |
+
" mensajes_sebas = []\n",
|
302 |
+
"\n",
|
303 |
+
" # Variables temporales para guardar los mensajes actuales de CarmenQ y Sebas\n",
|
304 |
+
" mensaje_carmen_actual = \"\"\n",
|
305 |
+
" mensaje_sebas_actual = \"\"\n",
|
306 |
+
" es_mensaje_carmen = False\n",
|
307 |
+
"\n",
|
308 |
+
" for linea in lineas:\n",
|
309 |
+
" if linea.startswith(\"CarmenQ:\"):\n",
|
310 |
+
" # Si hay un mensaje previo de Sebas, lo guardamos junto con el mensaje de Carmen anterior\n",
|
311 |
+
" if mensaje_sebas_actual:\n",
|
312 |
+
" mensajes_carmen.append(mensaje_carmen_actual)\n",
|
313 |
+
" mensajes_sebas.append(mensaje_sebas_actual)\n",
|
314 |
+
" mensaje_sebas_actual = \"\"\n",
|
315 |
+
"\n",
|
316 |
+
" mensaje_carmen_actual = linea[len(\"CarmenQ: \"):]\n",
|
317 |
+
" es_mensaje_carmen = True\n",
|
318 |
+
" elif linea.startswith(\"Sebas:\"):\n",
|
319 |
+
" mensaje_sebas_actual = linea[len(\"Sebas: \"):]\n",
|
320 |
+
" es_mensaje_carmen = False\n",
|
321 |
+
" else:\n",
|
322 |
+
" # Continuación del mensaje actual\n",
|
323 |
+
" if es_mensaje_carmen:\n",
|
324 |
+
" mensaje_carmen_actual += \"\\\\n\" + linea\n",
|
325 |
+
" else:\n",
|
326 |
+
" mensaje_sebas_actual += \"\\\\n\" + linea\n",
|
327 |
+
"\n",
|
328 |
+
" # Agregar el último par de mensajes si existen\n",
|
329 |
+
" if mensaje_carmen_actual and mensaje_sebas_actual:\n",
|
330 |
+
" mensajes_carmen.append(mensaje_carmen_actual)\n",
|
331 |
+
" mensajes_sebas.append(mensaje_sebas_actual)\n",
|
332 |
+
"\n",
|
333 |
+
" # Crear el DataFrame\n",
|
334 |
+
" df = pd.DataFrame({'CarmenQ': mensajes_carmen, 'Sebas': mensajes_sebas})\n",
|
335 |
+
"\n",
|
336 |
+
" return df"
|
337 |
+
]
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"cell_type": "markdown",
|
341 |
+
"metadata": {},
|
342 |
+
"source": [
|
343 |
+
"### Prueba"
|
344 |
+
]
|
345 |
+
},
|
346 |
+
{
|
347 |
+
"cell_type": "code",
|
348 |
+
"execution_count": 112,
|
349 |
+
"metadata": {},
|
350 |
+
"outputs": [
|
351 |
+
{
|
352 |
+
"data": {
|
353 |
+
"text/html": [
|
354 |
+
"<div>\n",
|
355 |
+
"<style scoped>\n",
|
356 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
357 |
+
" vertical-align: middle;\n",
|
358 |
+
" }\n",
|
359 |
+
"\n",
|
360 |
+
" .dataframe tbody tr th {\n",
|
361 |
+
" vertical-align: top;\n",
|
362 |
+
" }\n",
|
363 |
+
"\n",
|
364 |
+
" .dataframe thead th {\n",
|
365 |
+
" text-align: right;\n",
|
366 |
+
" }\n",
|
367 |
+
"</style>\n",
|
368 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
369 |
+
" <thead>\n",
|
370 |
+
" <tr style=\"text-align: right;\">\n",
|
371 |
+
" <th></th>\n",
|
372 |
+
" <th>CarmenQ</th>\n",
|
373 |
+
" <th>Sebas</th>\n",
|
374 |
+
" </tr>\n",
|
375 |
+
" </thead>\n",
|
376 |
+
" <tbody>\n",
|
377 |
+
" <tr>\n",
|
378 |
+
" <th>0</th>\n",
|
379 |
+
" <td>Quieres ver \\nuna peli?</td>\n",
|
380 |
+
" <td>Sisisi, sí quierooo\\nVamos a ver Kung Fu Panda...</td>\n",
|
381 |
+
" </tr>\n",
|
382 |
+
" <tr>\n",
|
383 |
+
" <th>1</th>\n",
|
384 |
+
" <td>Podemos\\nver una\\nserie bb</td>\n",
|
385 |
+
" <td>Uyyy sí\\namor, pero cuál? Ya hemos\\nvisto muchas</td>\n",
|
386 |
+
" </tr>\n",
|
387 |
+
" <tr>\n",
|
388 |
+
" <th>2</th>\n",
|
389 |
+
" <td>Luego vemos la de Coraline?</td>\n",
|
390 |
+
" <td>Cuando quieras mi amor, ya sabes que sí\\nPero ...</td>\n",
|
391 |
+
" </tr>\n",
|
392 |
+
" <tr>\n",
|
393 |
+
" <th>3</th>\n",
|
394 |
+
" <td>Si\\nte están\\ngustando las pelis bb?</td>\n",
|
395 |
+
" <td>Si amorrrr, mucho\\nA ver cuándo me invitar a v...</td>\n",
|
396 |
+
" </tr>\n",
|
397 |
+
" <tr>\n",
|
398 |
+
" <th>4</th>\n",
|
399 |
+
" <td>Ya ponle a la peli padrino</td>\n",
|
400 |
+
" <td>JAJAJA voy amor, no me presiones</td>\n",
|
401 |
+
" </tr>\n",
|
402 |
+
" <tr>\n",
|
403 |
+
" <th>5</th>\n",
|
404 |
+
" <td>Regresando vemos la peli si?</td>\n",
|
405 |
+
" <td>Si mi amorrrr, ya justo que regreses ya ando a...</td>\n",
|
406 |
+
" </tr>\n",
|
407 |
+
" <tr>\n",
|
408 |
+
" <th>6</th>\n",
|
409 |
+
" <td>En un\\nratito quieres que veamos la peli?</td>\n",
|
410 |
+
" <td>Sí amor, estaría perfecto\\nPero si está chafa,...</td>\n",
|
411 |
+
" </tr>\n",
|
412 |
+
" <tr>\n",
|
413 |
+
" <th>7</th>\n",
|
414 |
+
" <td>O podemos ver la peli amor</td>\n",
|
415 |
+
" <td>Pues yo si quiero ir a ver la peli, así que es...</td>\n",
|
416 |
+
" </tr>\n",
|
417 |
+
" <tr>\n",
|
418 |
+
" <th>8</th>\n",
|
419 |
+
" <td>Que quieres ver bb?\\nUna serie o peli?</td>\n",
|
420 |
+
" <td>Tal vez una peli, no? Que luego me cuesta segu...</td>\n",
|
421 |
+
" </tr>\n",
|
422 |
+
" <tr>\n",
|
423 |
+
" <th>9</th>\n",
|
424 |
+
" <td>Andamos viendo una peli</td>\n",
|
425 |
+
" <td>Cómo se llama la peli?</td>\n",
|
426 |
+
" </tr>\n",
|
427 |
+
" <tr>\n",
|
428 |
+
" <th>10</th>\n",
|
429 |
+
" <td>Yo me voy a bañar rápido\\nEn lo que te aclimat...</td>\n",
|
430 |
+
" <td>Vale mi amor, disfruta tu baño\\nSabes que me e...</td>\n",
|
431 |
+
" </tr>\n",
|
432 |
+
" </tbody>\n",
|
433 |
+
"</table>\n",
|
434 |
+
"</div>"
|
435 |
+
],
|
436 |
+
"text/plain": [
|
437 |
+
" CarmenQ \\\n",
|
438 |
+
"0 Quieres ver \\nuna peli? \n",
|
439 |
+
"1 Podemos\\nver una\\nserie bb \n",
|
440 |
+
"2 Luego vemos la de Coraline? \n",
|
441 |
+
"3 Si\\nte están\\ngustando las pelis bb? \n",
|
442 |
+
"4 Ya ponle a la peli padrino \n",
|
443 |
+
"5 Regresando vemos la peli si? \n",
|
444 |
+
"6 En un\\nratito quieres que veamos la peli? \n",
|
445 |
+
"7 O podemos ver la peli amor \n",
|
446 |
+
"8 Que quieres ver bb?\\nUna serie o peli? \n",
|
447 |
+
"9 Andamos viendo una peli \n",
|
448 |
+
"10 Yo me voy a bañar rápido\\nEn lo que te aclimat... \n",
|
449 |
+
"\n",
|
450 |
+
" Sebas \n",
|
451 |
+
"0 Sisisi, sí quierooo\\nVamos a ver Kung Fu Panda... \n",
|
452 |
+
"1 Uyyy sí\\namor, pero cuál? Ya hemos\\nvisto muchas \n",
|
453 |
+
"2 Cuando quieras mi amor, ya sabes que sí\\nPero ... \n",
|
454 |
+
"3 Si amorrrr, mucho\\nA ver cuándo me invitar a v... \n",
|
455 |
+
"4 JAJAJA voy amor, no me presiones \n",
|
456 |
+
"5 Si mi amorrrr, ya justo que regreses ya ando a... \n",
|
457 |
+
"6 Sí amor, estaría perfecto\\nPero si está chafa,... \n",
|
458 |
+
"7 Pues yo si quiero ir a ver la peli, así que es... \n",
|
459 |
+
"8 Tal vez una peli, no? Que luego me cuesta segu... \n",
|
460 |
+
"9 Cómo se llama la peli? \n",
|
461 |
+
"10 Vale mi amor, disfruta tu baño\\nSabes que me e... "
|
462 |
+
]
|
463 |
+
},
|
464 |
+
"execution_count": 112,
|
465 |
+
"metadata": {},
|
466 |
+
"output_type": "execute_result"
|
467 |
+
}
|
468 |
+
],
|
469 |
+
"source": [
|
470 |
+
"text = \"\"\"\n",
|
471 |
+
"CarmenQ: Quieres ver \\nuna peli?\n",
|
472 |
+
"Sebas: Sisisi, sí quierooo\\nVamos a ver Kung Fu Panda? Jejeje\n",
|
473 |
+
"CarmenQ: Podemos\\nver una\\nserie bb\n",
|
474 |
+
"Sebas: Uyyy sí\\namor, pero cuál? Ya hemos\\nvisto muchas\n",
|
475 |
+
"CarmenQ: Luego vemos la de Coraline?\n",
|
476 |
+
"Sebas: Cuando quieras mi amor, ya sabes que sí\\nPero a ver si vamos a comprar unas palomitas también, nooo?\n",
|
477 |
+
"CarmenQ: Si\\nte están\\ngustando las pelis bb?\n",
|
478 |
+
"Sebas: Si amorrrr, mucho\\nA ver cuándo me invitar a ver las otras\n",
|
479 |
+
"CarmenQ: Ya ponle a la peli padrino\n",
|
480 |
+
"Sebas: JAJAJA voy amor, no me presiones\n",
|
481 |
+
"CarmenQ: Regresando vemos la peli si?\n",
|
482 |
+
"Sebas: Si mi amorrrr, ya justo que regreses ya ando aquí con Netflix prendido\n",
|
483 |
+
"CarmenQ: En un\\nratito quieres que veamos la peli?\n",
|
484 |
+
"Sebas: Sí amor, estaría perfecto\\nPero si está chafa, me debes un helado\n",
|
485 |
+
"CarmenQ: O podemos ver la peli amor\n",
|
486 |
+
"Sebas: Pues yo si quiero ir a ver la peli, así que estoy puesto\n",
|
487 |
+
"CarmenQ: Que quieres ver bb?\\nUna serie o peli?\n",
|
488 |
+
"Sebas: Tal vez una peli, no? Que luego me cuesta seguirle a las series\n",
|
489 |
+
"CarmenQ: Andamos viendo una peli\n",
|
490 |
+
"Sebas: Cómo se llama la peli?\n",
|
491 |
+
"CarmenQ: Yo me voy a bañar rápido\\nEn lo que te aclimatas y todo\\nY vemos la peli si tienes ganas\n",
|
492 |
+
"Sebas: Vale mi amor, disfruta tu baño\\nSabes que me encantaría ver una peli a distancia\n",
|
493 |
+
"\"\"\"\n",
|
494 |
+
"\n",
|
495 |
+
"df = crear_dataset(text)\n",
|
496 |
+
"df"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "markdown",
|
501 |
+
"metadata": {},
|
502 |
+
"source": [
|
503 |
+
"# Llamado a todas las funciones para crear el archivo de Texto Final y el Dataset con las conversaciones"
|
504 |
+
]
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"cell_type": "code",
|
508 |
+
"execution_count": 113,
|
509 |
+
"metadata": {},
|
510 |
+
"outputs": [],
|
511 |
+
"source": [
|
512 |
+
"# Ruta del archivo de texto\n",
|
513 |
+
"ruta_archivo = 'Prompts_finales.txt' \n",
|
514 |
+
"ruta_archivo_salida_texto = 'Prompts_finales_transformados.txt'\n",
|
515 |
+
"\n",
|
516 |
+
"# Leer el contenido del archivo\n",
|
517 |
+
"with open(ruta_archivo, 'r', encoding='utf-8') as archivo:\n",
|
518 |
+
" texto = archivo.read()\n",
|
519 |
+
"\n",
|
520 |
+
"# Hacer todas las transformaciones\n",
|
521 |
+
"texto_agrupado = agrupar_remitentes(texto)\n",
|
522 |
+
"texto_linea = formatear_salto_linea(texto_agrupado)\n",
|
523 |
+
"texto_sin_timestamp = eliminar_timestamps(texto_linea)\n",
|
524 |
+
"\n",
|
525 |
+
"# Escribir el texto modificado en el archivo de salida\n",
|
526 |
+
"with open(ruta_archivo_salida_texto, 'w', encoding='utf-8') as archivo_salida:\n",
|
527 |
+
" archivo_salida.write(texto_sin_timestamp)"
|
528 |
+
]
|
529 |
+
},
|
530 |
+
{
|
531 |
+
"cell_type": "code",
|
532 |
+
"execution_count": 114,
|
533 |
+
"metadata": {},
|
534 |
+
"outputs": [
|
535 |
+
{
|
536 |
+
"data": {
|
537 |
+
"text/html": [
|
538 |
+
"<div>\n",
|
539 |
+
"<style scoped>\n",
|
540 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
541 |
+
" vertical-align: middle;\n",
|
542 |
+
" }\n",
|
543 |
+
"\n",
|
544 |
+
" .dataframe tbody tr th {\n",
|
545 |
+
" vertical-align: top;\n",
|
546 |
+
" }\n",
|
547 |
+
"\n",
|
548 |
+
" .dataframe thead th {\n",
|
549 |
+
" text-align: right;\n",
|
550 |
+
" }\n",
|
551 |
+
"</style>\n",
|
552 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
553 |
+
" <thead>\n",
|
554 |
+
" <tr style=\"text-align: right;\">\n",
|
555 |
+
" <th></th>\n",
|
556 |
+
" <th>CarmenQ</th>\n",
|
557 |
+
" <th>Sebas</th>\n",
|
558 |
+
" </tr>\n",
|
559 |
+
" </thead>\n",
|
560 |
+
" <tbody>\n",
|
561 |
+
" <tr>\n",
|
562 |
+
" <th>0</th>\n",
|
563 |
+
" <td>Buenos días mi amorchis, cómo dormiste hoy?</td>\n",
|
564 |
+
" <td>Buenos días mi amorrrr\\nBien bien, pero hacía ...</td>\n",
|
565 |
+
" </tr>\n",
|
566 |
+
" <tr>\n",
|
567 |
+
" <th>1</th>\n",
|
568 |
+
" <td>Holis mi amor, espero que tengas un muy bonito...</td>\n",
|
569 |
+
" <td>Hola hola bebitaaaa, te amo mucho másss\\nTambi...</td>\n",
|
570 |
+
" </tr>\n",
|
571 |
+
" <tr>\n",
|
572 |
+
" <th>2</th>\n",
|
573 |
+
" <td>Buenos días mi bb, disfruta mucho tu día guapo</td>\n",
|
574 |
+
" <td>Buenos días amorrr, graciassssss\\nTambién disf...</td>\n",
|
575 |
+
" </tr>\n",
|
576 |
+
" <tr>\n",
|
577 |
+
" <th>3</th>\n",
|
578 |
+
" <td>Buenos días mi amor, espero que hayas podido d...</td>\n",
|
579 |
+
" <td>Hola mi amorcito, cómo amaneciste hoyyy???\\nTa...</td>\n",
|
580 |
+
" </tr>\n",
|
581 |
+
" <tr>\n",
|
582 |
+
" <th>4</th>\n",
|
583 |
+
" <td>Hellouuuuu, te amo mucho guapiiii, ten lindo dia</td>\n",
|
584 |
+
" <td>Holi mi vida, cómo estás hoy??\\nTe amo mucho m...</td>\n",
|
585 |
+
" </tr>\n",
|
586 |
+
" <tr>\n",
|
587 |
+
" <th>...</th>\n",
|
588 |
+
" <td>...</td>\n",
|
589 |
+
" <td>...</td>\n",
|
590 |
+
" </tr>\n",
|
591 |
+
" <tr>\n",
|
592 |
+
" <th>665</th>\n",
|
593 |
+
" <td>Dime algo bonito</td>\n",
|
594 |
+
" <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
|
595 |
+
" </tr>\n",
|
596 |
+
" <tr>\n",
|
597 |
+
" <th>666</th>\n",
|
598 |
+
" <td>Dime algo bonito</td>\n",
|
599 |
+
" <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
|
600 |
+
" </tr>\n",
|
601 |
+
" <tr>\n",
|
602 |
+
" <th>667</th>\n",
|
603 |
+
" <td>Oye dime algo bonito</td>\n",
|
604 |
+
" <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
|
605 |
+
" </tr>\n",
|
606 |
+
" <tr>\n",
|
607 |
+
" <th>668</th>\n",
|
608 |
+
" <td>Oye dime algo bonito</td>\n",
|
609 |
+
" <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
|
610 |
+
" </tr>\n",
|
611 |
+
" <tr>\n",
|
612 |
+
" <th>669</th>\n",
|
613 |
+
" <td>Oye dime algo bonito</td>\n",
|
614 |
+
" <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
|
615 |
+
" </tr>\n",
|
616 |
+
" </tbody>\n",
|
617 |
+
"</table>\n",
|
618 |
+
"<p>670 rows × 2 columns</p>\n",
|
619 |
+
"</div>"
|
620 |
+
],
|
621 |
+
"text/plain": [
|
622 |
+
" CarmenQ \\\n",
|
623 |
+
"0 Buenos días mi amorchis, cómo dormiste hoy? \n",
|
624 |
+
"1 Holis mi amor, espero que tengas un muy bonito... \n",
|
625 |
+
"2 Buenos días mi bb, disfruta mucho tu día guapo \n",
|
626 |
+
"3 Buenos días mi amor, espero que hayas podido d... \n",
|
627 |
+
"4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
|
628 |
+
".. ... \n",
|
629 |
+
"665 Dime algo bonito \n",
|
630 |
+
"666 Dime algo bonito \n",
|
631 |
+
"667 Oye dime algo bonito \n",
|
632 |
+
"668 Oye dime algo bonito \n",
|
633 |
+
"669 Oye dime algo bonito \n",
|
634 |
+
"\n",
|
635 |
+
" Sebas \n",
|
636 |
+
"0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
|
637 |
+
"1 Hola hola bebitaaaa, te amo mucho másss\\nTambi... \n",
|
638 |
+
"2 Buenos días amorrr, graciassssss\\nTambién disf... \n",
|
639 |
+
"3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
|
640 |
+
"4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
|
641 |
+
".. ... \n",
|
642 |
+
"665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
|
643 |
+
"666 Yo digo que tal vez el universo ha dicho que e... \n",
|
644 |
+
"667 Deja veo si ya me depositaste en mi cuenta y v... \n",
|
645 |
+
"668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
|
646 |
+
"669 Solo si veo un depósito en mi cuenta en menos ... \n",
|
647 |
+
"\n",
|
648 |
+
"[670 rows x 2 columns]"
|
649 |
+
]
|
650 |
+
},
|
651 |
+
"execution_count": 114,
|
652 |
+
"metadata": {},
|
653 |
+
"output_type": "execute_result"
|
654 |
+
}
|
655 |
+
],
|
656 |
+
"source": [
|
657 |
+
"chat_df = crear_dataset(texto_sin_timestamp)\n",
|
658 |
+
"chat_df"
|
659 |
+
]
|
660 |
+
},
|
661 |
+
{
|
662 |
+
"cell_type": "markdown",
|
663 |
+
"metadata": {},
|
664 |
+
"source": [
|
665 |
+
"# Agregamos la columna 'system' y transformamos el Dataset"
|
666 |
+
]
|
667 |
+
},
|
668 |
+
{
|
669 |
+
"cell_type": "code",
|
670 |
+
"execution_count": 115,
|
671 |
+
"metadata": {},
|
672 |
+
"outputs": [],
|
673 |
+
"source": [
|
674 |
+
"def agregar_columna_system(df):\n",
|
675 |
+
" # Definimos el contenido de la columna 'system'\n",
|
676 |
+
" contenido_system = \"\"\"CARSE es un chatbot que imita el estilo en que Sebas conversa. Fue creado con amor solo para Carmen. Su objetivo es ofrecer una experiencia de chat divertida y familiar que exprese la personalidad, el humor y el amor de Sebas hacia Carmen.\"\"\"\n",
|
677 |
+
"\n",
|
678 |
+
" # Agregamos la columna al DataFrame\n",
|
679 |
+
" df.insert(0, 'system', contenido_system)\n",
|
680 |
+
"\n",
|
681 |
+
" return df"
|
682 |
+
]
|
683 |
+
},
|
684 |
+
{
|
685 |
+
"cell_type": "code",
|
686 |
+
"execution_count": 116,
|
687 |
+
"metadata": {},
|
688 |
+
"outputs": [
|
689 |
+
{
|
690 |
+
"data": {
|
691 |
+
"text/html": [
|
692 |
+
"<div>\n",
|
693 |
+
"<style scoped>\n",
|
694 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
695 |
+
" vertical-align: middle;\n",
|
696 |
+
" }\n",
|
697 |
+
"\n",
|
698 |
+
" .dataframe tbody tr th {\n",
|
699 |
+
" vertical-align: top;\n",
|
700 |
+
" }\n",
|
701 |
+
"\n",
|
702 |
+
" .dataframe thead th {\n",
|
703 |
+
" text-align: right;\n",
|
704 |
+
" }\n",
|
705 |
+
"</style>\n",
|
706 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
707 |
+
" <thead>\n",
|
708 |
+
" <tr style=\"text-align: right;\">\n",
|
709 |
+
" <th></th>\n",
|
710 |
+
" <th>system</th>\n",
|
711 |
+
" <th>CarmenQ</th>\n",
|
712 |
+
" <th>Sebas</th>\n",
|
713 |
+
" </tr>\n",
|
714 |
+
" </thead>\n",
|
715 |
+
" <tbody>\n",
|
716 |
+
" <tr>\n",
|
717 |
+
" <th>0</th>\n",
|
718 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
719 |
+
" <td>Buenos días mi amorchis, cómo dormiste hoy?</td>\n",
|
720 |
+
" <td>Buenos días mi amorrrr\\nBien bien, pero hacía ...</td>\n",
|
721 |
+
" </tr>\n",
|
722 |
+
" <tr>\n",
|
723 |
+
" <th>1</th>\n",
|
724 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
725 |
+
" <td>Holis mi amor, espero que tengas un muy bonito...</td>\n",
|
726 |
+
" <td>Hola hola bebitaaaa, te amo mucho másss\\nTambi...</td>\n",
|
727 |
+
" </tr>\n",
|
728 |
+
" <tr>\n",
|
729 |
+
" <th>2</th>\n",
|
730 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
731 |
+
" <td>Buenos días mi bb, disfruta mucho tu día guapo</td>\n",
|
732 |
+
" <td>Buenos días amorrr, graciassssss\\nTambién disf...</td>\n",
|
733 |
+
" </tr>\n",
|
734 |
+
" <tr>\n",
|
735 |
+
" <th>3</th>\n",
|
736 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
737 |
+
" <td>Buenos días mi amor, espero que hayas podido d...</td>\n",
|
738 |
+
" <td>Hola mi amorcito, cómo amaneciste hoyyy???\\nTa...</td>\n",
|
739 |
+
" </tr>\n",
|
740 |
+
" <tr>\n",
|
741 |
+
" <th>4</th>\n",
|
742 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
743 |
+
" <td>Hellouuuuu, te amo mucho guapiiii, ten lindo dia</td>\n",
|
744 |
+
" <td>Holi mi vida, cómo estás hoy??\\nTe amo mucho m...</td>\n",
|
745 |
+
" </tr>\n",
|
746 |
+
" <tr>\n",
|
747 |
+
" <th>...</th>\n",
|
748 |
+
" <td>...</td>\n",
|
749 |
+
" <td>...</td>\n",
|
750 |
+
" <td>...</td>\n",
|
751 |
+
" </tr>\n",
|
752 |
+
" <tr>\n",
|
753 |
+
" <th>665</th>\n",
|
754 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
755 |
+
" <td>Dime algo bonito</td>\n",
|
756 |
+
" <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
|
757 |
+
" </tr>\n",
|
758 |
+
" <tr>\n",
|
759 |
+
" <th>666</th>\n",
|
760 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
761 |
+
" <td>Dime algo bonito</td>\n",
|
762 |
+
" <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
|
763 |
+
" </tr>\n",
|
764 |
+
" <tr>\n",
|
765 |
+
" <th>667</th>\n",
|
766 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
767 |
+
" <td>Oye dime algo bonito</td>\n",
|
768 |
+
" <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
|
769 |
+
" </tr>\n",
|
770 |
+
" <tr>\n",
|
771 |
+
" <th>668</th>\n",
|
772 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
773 |
+
" <td>Oye dime algo bonito</td>\n",
|
774 |
+
" <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
|
775 |
+
" </tr>\n",
|
776 |
+
" <tr>\n",
|
777 |
+
" <th>669</th>\n",
|
778 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
779 |
+
" <td>Oye dime algo bonito</td>\n",
|
780 |
+
" <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
|
781 |
+
" </tr>\n",
|
782 |
+
" </tbody>\n",
|
783 |
+
"</table>\n",
|
784 |
+
"<p>670 rows × 3 columns</p>\n",
|
785 |
+
"</div>"
|
786 |
+
],
|
787 |
+
"text/plain": [
|
788 |
+
" system \\\n",
|
789 |
+
"0 CARSE es un chatbot que imita el estilo en que... \n",
|
790 |
+
"1 CARSE es un chatbot que imita el estilo en que... \n",
|
791 |
+
"2 CARSE es un chatbot que imita el estilo en que... \n",
|
792 |
+
"3 CARSE es un chatbot que imita el estilo en que... \n",
|
793 |
+
"4 CARSE es un chatbot que imita el estilo en que... \n",
|
794 |
+
".. ... \n",
|
795 |
+
"665 CARSE es un chatbot que imita el estilo en que... \n",
|
796 |
+
"666 CARSE es un chatbot que imita el estilo en que... \n",
|
797 |
+
"667 CARSE es un chatbot que imita el estilo en que... \n",
|
798 |
+
"668 CARSE es un chatbot que imita el estilo en que... \n",
|
799 |
+
"669 CARSE es un chatbot que imita el estilo en que... \n",
|
800 |
+
"\n",
|
801 |
+
" CarmenQ \\\n",
|
802 |
+
"0 Buenos días mi amorchis, cómo dormiste hoy? \n",
|
803 |
+
"1 Holis mi amor, espero que tengas un muy bonito... \n",
|
804 |
+
"2 Buenos días mi bb, disfruta mucho tu día guapo \n",
|
805 |
+
"3 Buenos días mi amor, espero que hayas podido d... \n",
|
806 |
+
"4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
|
807 |
+
".. ... \n",
|
808 |
+
"665 Dime algo bonito \n",
|
809 |
+
"666 Dime algo bonito \n",
|
810 |
+
"667 Oye dime algo bonito \n",
|
811 |
+
"668 Oye dime algo bonito \n",
|
812 |
+
"669 Oye dime algo bonito \n",
|
813 |
+
"\n",
|
814 |
+
" Sebas \n",
|
815 |
+
"0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
|
816 |
+
"1 Hola hola bebitaaaa, te amo mucho másss\\nTambi... \n",
|
817 |
+
"2 Buenos días amorrr, graciassssss\\nTambién disf... \n",
|
818 |
+
"3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
|
819 |
+
"4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
|
820 |
+
".. ... \n",
|
821 |
+
"665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
|
822 |
+
"666 Yo digo que tal vez el universo ha dicho que e... \n",
|
823 |
+
"667 Deja veo si ya me depositaste en mi cuenta y v... \n",
|
824 |
+
"668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
|
825 |
+
"669 Solo si veo un depósito en mi cuenta en menos ... \n",
|
826 |
+
"\n",
|
827 |
+
"[670 rows x 3 columns]"
|
828 |
+
]
|
829 |
+
},
|
830 |
+
"execution_count": 116,
|
831 |
+
"metadata": {},
|
832 |
+
"output_type": "execute_result"
|
833 |
+
}
|
834 |
+
],
|
835 |
+
"source": [
|
836 |
+
"# Llamamos a la función para agregar la columna 'system'\n",
|
837 |
+
"chat_df = agregar_columna_system(chat_df)\n",
|
838 |
+
"chat_df"
|
839 |
+
]
|
840 |
+
},
|
841 |
+
{
|
842 |
+
"cell_type": "code",
|
843 |
+
"execution_count": 117,
|
844 |
+
"metadata": {},
|
845 |
+
"outputs": [],
|
846 |
+
"source": [
|
847 |
+
"def renombrar_columnas(df):\n",
|
848 |
+
" # Cambiamos el nombre de las columnas\n",
|
849 |
+
" df.rename(columns={'CarmenQ': 'user', 'Sebas': 'assistant'}, inplace=True)\n",
|
850 |
+
"\n",
|
851 |
+
" return df"
|
852 |
+
]
|
853 |
+
},
|
854 |
+
{
|
855 |
+
"cell_type": "code",
|
856 |
+
"execution_count": 118,
|
857 |
+
"metadata": {},
|
858 |
+
"outputs": [
|
859 |
+
{
|
860 |
+
"data": {
|
861 |
+
"text/html": [
|
862 |
+
"<div>\n",
|
863 |
+
"<style scoped>\n",
|
864 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
865 |
+
" vertical-align: middle;\n",
|
866 |
+
" }\n",
|
867 |
+
"\n",
|
868 |
+
" .dataframe tbody tr th {\n",
|
869 |
+
" vertical-align: top;\n",
|
870 |
+
" }\n",
|
871 |
+
"\n",
|
872 |
+
" .dataframe thead th {\n",
|
873 |
+
" text-align: right;\n",
|
874 |
+
" }\n",
|
875 |
+
"</style>\n",
|
876 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
877 |
+
" <thead>\n",
|
878 |
+
" <tr style=\"text-align: right;\">\n",
|
879 |
+
" <th></th>\n",
|
880 |
+
" <th>system</th>\n",
|
881 |
+
" <th>user</th>\n",
|
882 |
+
" <th>assistant</th>\n",
|
883 |
+
" </tr>\n",
|
884 |
+
" </thead>\n",
|
885 |
+
" <tbody>\n",
|
886 |
+
" <tr>\n",
|
887 |
+
" <th>0</th>\n",
|
888 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
889 |
+
" <td>Buenos días mi amorchis, cómo dormiste hoy?</td>\n",
|
890 |
+
" <td>Buenos días mi amorrrr\\nBien bien, pero hacía ...</td>\n",
|
891 |
+
" </tr>\n",
|
892 |
+
" <tr>\n",
|
893 |
+
" <th>1</th>\n",
|
894 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
895 |
+
" <td>Holis mi amor, espero que tengas un muy bonito...</td>\n",
|
896 |
+
" <td>Hola hola bebitaaaa, te amo mucho másss\\nTambi...</td>\n",
|
897 |
+
" </tr>\n",
|
898 |
+
" <tr>\n",
|
899 |
+
" <th>2</th>\n",
|
900 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
901 |
+
" <td>Buenos días mi bb, disfruta mucho tu día guapo</td>\n",
|
902 |
+
" <td>Buenos días amorrr, graciassssss\\nTambién disf...</td>\n",
|
903 |
+
" </tr>\n",
|
904 |
+
" <tr>\n",
|
905 |
+
" <th>3</th>\n",
|
906 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
907 |
+
" <td>Buenos días mi amor, espero que hayas podido d...</td>\n",
|
908 |
+
" <td>Hola mi amorcito, cómo amaneciste hoyyy???\\nTa...</td>\n",
|
909 |
+
" </tr>\n",
|
910 |
+
" <tr>\n",
|
911 |
+
" <th>4</th>\n",
|
912 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
913 |
+
" <td>Hellouuuuu, te amo mucho guapiiii, ten lindo dia</td>\n",
|
914 |
+
" <td>Holi mi vida, cómo estás hoy??\\nTe amo mucho m...</td>\n",
|
915 |
+
" </tr>\n",
|
916 |
+
" <tr>\n",
|
917 |
+
" <th>...</th>\n",
|
918 |
+
" <td>...</td>\n",
|
919 |
+
" <td>...</td>\n",
|
920 |
+
" <td>...</td>\n",
|
921 |
+
" </tr>\n",
|
922 |
+
" <tr>\n",
|
923 |
+
" <th>665</th>\n",
|
924 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
925 |
+
" <td>Dime algo bonito</td>\n",
|
926 |
+
" <td>Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ...</td>\n",
|
927 |
+
" </tr>\n",
|
928 |
+
" <tr>\n",
|
929 |
+
" <th>666</th>\n",
|
930 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
931 |
+
" <td>Dime algo bonito</td>\n",
|
932 |
+
" <td>Yo digo que tal vez el universo ha dicho que e...</td>\n",
|
933 |
+
" </tr>\n",
|
934 |
+
" <tr>\n",
|
935 |
+
" <th>667</th>\n",
|
936 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
937 |
+
" <td>Oye dime algo bonito</td>\n",
|
938 |
+
" <td>Deja veo si ya me depositaste en mi cuenta y v...</td>\n",
|
939 |
+
" </tr>\n",
|
940 |
+
" <tr>\n",
|
941 |
+
" <th>668</th>\n",
|
942 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
943 |
+
" <td>Oye dime algo bonito</td>\n",
|
944 |
+
" <td>Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng...</td>\n",
|
945 |
+
" </tr>\n",
|
946 |
+
" <tr>\n",
|
947 |
+
" <th>669</th>\n",
|
948 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
949 |
+
" <td>Oye dime algo bonito</td>\n",
|
950 |
+
" <td>Solo si veo un depósito en mi cuenta en menos ...</td>\n",
|
951 |
+
" </tr>\n",
|
952 |
+
" </tbody>\n",
|
953 |
+
"</table>\n",
|
954 |
+
"<p>670 rows × 3 columns</p>\n",
|
955 |
+
"</div>"
|
956 |
+
],
|
957 |
+
"text/plain": [
|
958 |
+
" system \\\n",
|
959 |
+
"0 CARSE es un chatbot que imita el estilo en que... \n",
|
960 |
+
"1 CARSE es un chatbot que imita el estilo en que... \n",
|
961 |
+
"2 CARSE es un chatbot que imita el estilo en que... \n",
|
962 |
+
"3 CARSE es un chatbot que imita el estilo en que... \n",
|
963 |
+
"4 CARSE es un chatbot que imita el estilo en que... \n",
|
964 |
+
".. ... \n",
|
965 |
+
"665 CARSE es un chatbot que imita el estilo en que... \n",
|
966 |
+
"666 CARSE es un chatbot que imita el estilo en que... \n",
|
967 |
+
"667 CARSE es un chatbot que imita el estilo en que... \n",
|
968 |
+
"668 CARSE es un chatbot que imita el estilo en que... \n",
|
969 |
+
"669 CARSE es un chatbot que imita el estilo en que... \n",
|
970 |
+
"\n",
|
971 |
+
" user \\\n",
|
972 |
+
"0 Buenos días mi amorchis, cómo dormiste hoy? \n",
|
973 |
+
"1 Holis mi amor, espero que tengas un muy bonito... \n",
|
974 |
+
"2 Buenos días mi bb, disfruta mucho tu día guapo \n",
|
975 |
+
"3 Buenos días mi amor, espero que hayas podido d... \n",
|
976 |
+
"4 Hellouuuuu, te amo mucho guapiiii, ten lindo dia \n",
|
977 |
+
".. ... \n",
|
978 |
+
"665 Dime algo bonito \n",
|
979 |
+
"666 Dime algo bonito \n",
|
980 |
+
"667 Oye dime algo bonito \n",
|
981 |
+
"668 Oye dime algo bonito \n",
|
982 |
+
"669 Oye dime algo bonito \n",
|
983 |
+
"\n",
|
984 |
+
" assistant \n",
|
985 |
+
"0 Buenos días mi amorrrr\\nBien bien, pero hacía ... \n",
|
986 |
+
"1 Hola hola bebitaaaa, te amo mucho másss\\nTambi... \n",
|
987 |
+
"2 Buenos días amorrr, graciassssss\\nTambién disf... \n",
|
988 |
+
"3 Hola mi amorcito, cómo amaneciste hoyyy???\\nTa... \n",
|
989 |
+
"4 Holi mi vida, cómo estás hoy??\\nTe amo mucho m... \n",
|
990 |
+
".. ... \n",
|
991 |
+
"665 Hmmmm vemos. Deposítame 5 mil pesotes y va JAJ... \n",
|
992 |
+
"666 Yo digo que tal vez el universo ha dicho que e... \n",
|
993 |
+
"667 Deja veo si ya me depositaste en mi cuenta y v... \n",
|
994 |
+
"668 Algo bonito? Tuuuuu, mi C, estás hermosa\\nTeng... \n",
|
995 |
+
"669 Solo si veo un depósito en mi cuenta en menos ... \n",
|
996 |
+
"\n",
|
997 |
+
"[670 rows x 3 columns]"
|
998 |
+
]
|
999 |
+
},
|
1000 |
+
"execution_count": 118,
|
1001 |
+
"metadata": {},
|
1002 |
+
"output_type": "execute_result"
|
1003 |
+
}
|
1004 |
+
],
|
1005 |
+
"source": [
|
1006 |
+
"# Llamamos a la función para renombrar las columnas en el dataset\n",
|
1007 |
+
"chat_df = renombrar_columnas(chat_df)\n",
|
1008 |
+
"chat_df"
|
1009 |
+
]
|
1010 |
+
},
|
1011 |
+
{
|
1012 |
+
"cell_type": "code",
|
1013 |
+
"execution_count": 119,
|
1014 |
+
"metadata": {},
|
1015 |
+
"outputs": [
|
1016 |
+
{
|
1017 |
+
"data": {
|
1018 |
+
"text/html": [
|
1019 |
+
"<div>\n",
|
1020 |
+
"<style scoped>\n",
|
1021 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1022 |
+
" vertical-align: middle;\n",
|
1023 |
+
" }\n",
|
1024 |
+
"\n",
|
1025 |
+
" .dataframe tbody tr th {\n",
|
1026 |
+
" vertical-align: top;\n",
|
1027 |
+
" }\n",
|
1028 |
+
"\n",
|
1029 |
+
" .dataframe thead th {\n",
|
1030 |
+
" text-align: right;\n",
|
1031 |
+
" }\n",
|
1032 |
+
"</style>\n",
|
1033 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1034 |
+
" <thead>\n",
|
1035 |
+
" <tr style=\"text-align: right;\">\n",
|
1036 |
+
" <th></th>\n",
|
1037 |
+
" <th>system</th>\n",
|
1038 |
+
" <th>user</th>\n",
|
1039 |
+
" <th>assistant</th>\n",
|
1040 |
+
" </tr>\n",
|
1041 |
+
" </thead>\n",
|
1042 |
+
" <tbody>\n",
|
1043 |
+
" <tr>\n",
|
1044 |
+
" <th>0</th>\n",
|
1045 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1046 |
+
" <td>Tengo mucho sueño aún</td>\n",
|
1047 |
+
" <td>Ay amorcito, a ver si te echas a dormir en el ...</td>\n",
|
1048 |
+
" </tr>\n",
|
1049 |
+
" <tr>\n",
|
1050 |
+
" <th>1</th>\n",
|
1051 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1052 |
+
" <td>Ay amor que frío hace</td>\n",
|
1053 |
+
" <td>Pues que bueno, mínimo no estamos sudando</td>\n",
|
1054 |
+
" </tr>\n",
|
1055 |
+
" <tr>\n",
|
1056 |
+
" <th>2</th>\n",
|
1057 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1058 |
+
" <td>Ya adivina\\nLo que se te ocurra</td>\n",
|
1059 |
+
" <td>Esque no se me ocurrió nada mas</td>\n",
|
1060 |
+
" </tr>\n",
|
1061 |
+
" <tr>\n",
|
1062 |
+
" <th>3</th>\n",
|
1063 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1064 |
+
" <td>Biennnn, pero tengo sueño aún</td>\n",
|
1065 |
+
" <td>Si tienes descanso, duerme</td>\n",
|
1066 |
+
" </tr>\n",
|
1067 |
+
" <tr>\n",
|
1068 |
+
" <th>4</th>\n",
|
1069 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1070 |
+
" <td>Yo te amo massssss</td>\n",
|
1071 |
+
" <td>Eso es mega feik eh</td>\n",
|
1072 |
+
" </tr>\n",
|
1073 |
+
" <tr>\n",
|
1074 |
+
" <th>...</th>\n",
|
1075 |
+
" <td>...</td>\n",
|
1076 |
+
" <td>...</td>\n",
|
1077 |
+
" <td>...</td>\n",
|
1078 |
+
" </tr>\n",
|
1079 |
+
" <tr>\n",
|
1080 |
+
" <th>665</th>\n",
|
1081 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1082 |
+
" <td>Pues ya toca casarnos</td>\n",
|
1083 |
+
" <td>Por fiiiin, ya te habías tardado en decir eso</td>\n",
|
1084 |
+
" </tr>\n",
|
1085 |
+
" <tr>\n",
|
1086 |
+
" <th>666</th>\n",
|
1087 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1088 |
+
" <td>Biennnnn\\n¿Y tu?</td>\n",
|
1089 |
+
" <td>Bien igual</td>\n",
|
1090 |
+
" </tr>\n",
|
1091 |
+
" <tr>\n",
|
1092 |
+
" <th>667</th>\n",
|
1093 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1094 |
+
" <td>JAJAJJAJAJA no</td>\n",
|
1095 |
+
" <td>Ya dimeeeee</td>\n",
|
1096 |
+
" </tr>\n",
|
1097 |
+
" <tr>\n",
|
1098 |
+
" <th>668</th>\n",
|
1099 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1100 |
+
" <td>Casarme contigo algún día</td>\n",
|
1101 |
+
" <td>Awwww amoorr, sí quiero casarme contigo algún ...</td>\n",
|
1102 |
+
" </tr>\n",
|
1103 |
+
" <tr>\n",
|
1104 |
+
" <th>669</th>\n",
|
1105 |
+
" <td>CARSE es un chatbot que imita el estilo en que...</td>\n",
|
1106 |
+
" <td>Me estaba acabando el café</td>\n",
|
1107 |
+
" <td>Disfruta tu cafecito</td>\n",
|
1108 |
+
" </tr>\n",
|
1109 |
+
" </tbody>\n",
|
1110 |
+
"</table>\n",
|
1111 |
+
"<p>670 rows × 3 columns</p>\n",
|
1112 |
+
"</div>"
|
1113 |
+
],
|
1114 |
+
"text/plain": [
|
1115 |
+
" system \\\n",
|
1116 |
+
"0 CARSE es un chatbot que imita el estilo en que... \n",
|
1117 |
+
"1 CARSE es un chatbot que imita el estilo en que... \n",
|
1118 |
+
"2 CARSE es un chatbot que imita el estilo en que... \n",
|
1119 |
+
"3 CARSE es un chatbot que imita el estilo en que... \n",
|
1120 |
+
"4 CARSE es un chatbot que imita el estilo en que... \n",
|
1121 |
+
".. ... \n",
|
1122 |
+
"665 CARSE es un chatbot que imita el estilo en que... \n",
|
1123 |
+
"666 CARSE es un chatbot que imita el estilo en que... \n",
|
1124 |
+
"667 CARSE es un chatbot que imita el estilo en que... \n",
|
1125 |
+
"668 CARSE es un chatbot que imita el estilo en que... \n",
|
1126 |
+
"669 CARSE es un chatbot que imita el estilo en que... \n",
|
1127 |
+
"\n",
|
1128 |
+
" user \\\n",
|
1129 |
+
"0 Tengo mucho sueño aún \n",
|
1130 |
+
"1 Ay amor que frío hace \n",
|
1131 |
+
"2 Ya adivina\\nLo que se te ocurra \n",
|
1132 |
+
"3 Biennnn, pero tengo sueño aún \n",
|
1133 |
+
"4 Yo te amo massssss \n",
|
1134 |
+
".. ... \n",
|
1135 |
+
"665 Pues ya toca casarnos \n",
|
1136 |
+
"666 Biennnnn\\n¿Y tu? \n",
|
1137 |
+
"667 JAJAJJAJAJA no \n",
|
1138 |
+
"668 Casarme contigo algún día \n",
|
1139 |
+
"669 Me estaba acabando el café \n",
|
1140 |
+
"\n",
|
1141 |
+
" assistant \n",
|
1142 |
+
"0 Ay amorcito, a ver si te echas a dormir en el ... \n",
|
1143 |
+
"1 Pues que bueno, mínimo no estamos sudando \n",
|
1144 |
+
"2 Esque no se me ocurrió nada mas \n",
|
1145 |
+
"3 Si tienes descanso, duerme \n",
|
1146 |
+
"4 Eso es mega feik eh \n",
|
1147 |
+
".. ... \n",
|
1148 |
+
"665 Por fiiiin, ya te habías tardado en decir eso \n",
|
1149 |
+
"666 Bien igual \n",
|
1150 |
+
"667 Ya dimeeeee \n",
|
1151 |
+
"668 Awwww amoorr, sí quiero casarme contigo algún ... \n",
|
1152 |
+
"669 Disfruta tu cafecito \n",
|
1153 |
+
"\n",
|
1154 |
+
"[670 rows x 3 columns]"
|
1155 |
+
]
|
1156 |
+
},
|
1157 |
+
"execution_count": 119,
|
1158 |
+
"metadata": {},
|
1159 |
+
"output_type": "execute_result"
|
1160 |
+
}
|
1161 |
+
],
|
1162 |
+
"source": [
|
1163 |
+
"# Revolvemos el orden de los datos\n",
|
1164 |
+
"chat_df = chat_df.sample(frac=1, random_state=20210703).reset_index(drop=True)\n",
|
1165 |
+
"chat_df"
|
1166 |
+
]
|
1167 |
+
},
|
1168 |
+
{
|
1169 |
+
"cell_type": "markdown",
|
1170 |
+
"metadata": {},
|
1171 |
+
"source": [
|
1172 |
+
"### Función para crear el archivo JSONL que requiere OpenAI para entrenar"
|
1173 |
+
]
|
1174 |
+
},
|
1175 |
+
{
|
1176 |
+
"cell_type": "code",
|
1177 |
+
"execution_count": 120,
|
1178 |
+
"metadata": {},
|
1179 |
+
"outputs": [],
|
1180 |
+
"source": [
|
1181 |
+
"def crear_json(df,nombre_json):\n",
|
1182 |
+
"\n",
|
1183 |
+
" jsonl_data = []\n",
|
1184 |
+
"\n",
|
1185 |
+
" for index, row in df.iterrows():\n",
|
1186 |
+
" message_list = [\n",
|
1187 |
+
" {\"role\": \"system\", \"content\": row['system']},\n",
|
1188 |
+
" {\"role\": \"user\", \"content\": row['user']},\n",
|
1189 |
+
" {\"role\": \"assistant\", \"content\": row['assistant']}\n",
|
1190 |
+
" ]\n",
|
1191 |
+
" jsonl_data.append({\"messages\": message_list})\n",
|
1192 |
+
" \n",
|
1193 |
+
" # Guardar en un archivo JSONL con codificación UTF-8\n",
|
1194 |
+
" with open(nombre_json, 'w', encoding='utf-8') as file:\n",
|
1195 |
+
" for item in jsonl_data:\n",
|
1196 |
+
" file.write(json.dumps(item, ensure_ascii=False) + '\\n')"
|
1197 |
+
]
|
1198 |
+
},
|
1199 |
+
{
|
1200 |
+
"cell_type": "code",
|
1201 |
+
"execution_count": 121,
|
1202 |
+
"metadata": {},
|
1203 |
+
"outputs": [],
|
1204 |
+
"source": [
|
1205 |
+
"nombre_json = 'Training_data/prompts_finales.jsonl'\n",
|
1206 |
+
"\n",
|
1207 |
+
"crear_json(chat_df,nombre_json)\n"
|
1208 |
+
]
|
1209 |
+
},
|
1210 |
+
{
|
1211 |
+
"cell_type": "markdown",
|
1212 |
+
"metadata": {},
|
1213 |
+
"source": [
|
1214 |
+
"### Separación del File de entrenamiento en partes iguales"
|
1215 |
+
]
|
1216 |
+
},
|
1217 |
+
{
|
1218 |
+
"cell_type": "code",
|
1219 |
+
"execution_count": 122,
|
1220 |
+
"metadata": {},
|
1221 |
+
"outputs": [
|
1222 |
+
{
|
1223 |
+
"name": "stdout",
|
1224 |
+
"output_type": "stream",
|
1225 |
+
"text": [
|
1226 |
+
"El archivo tiene 670 líneas.\n"
|
1227 |
+
]
|
1228 |
+
}
|
1229 |
+
],
|
1230 |
+
"source": [
|
1231 |
+
"# Ruta del archivo JSONL\n",
|
1232 |
+
"archivo_jsonl = 'Training_data/prompts_finales.jsonl'\n",
|
1233 |
+
"\n",
|
1234 |
+
"# Contar las líneas\n",
|
1235 |
+
"try:\n",
|
1236 |
+
" with open(archivo_jsonl, 'r', encoding='utf-8') as file:\n",
|
1237 |
+
" lineas = sum(1 for _ in file)\n",
|
1238 |
+
" print(f\"El archivo tiene {lineas} líneas.\")\n",
|
1239 |
+
"except FileNotFoundError:\n",
|
1240 |
+
" print(\"Archivo no encontrado. Por favor verifica la ruta.\")"
|
1241 |
+
]
|
1242 |
+
},
|
1243 |
+
{
|
1244 |
+
"cell_type": "code",
|
1245 |
+
"execution_count": 124,
|
1246 |
+
"metadata": {},
|
1247 |
+
"outputs": [],
|
1248 |
+
"source": [
|
1249 |
+
"def dividir_jsonl(ruta_original, num_partes):\n",
|
1250 |
+
" # Leer el archivo original\n",
|
1251 |
+
" with open(ruta_original, 'r', encoding='utf-8') as file:\n",
|
1252 |
+
" lineas = file.readlines()\n",
|
1253 |
+
"\n",
|
1254 |
+
" # Calcular el tamaño de cada parte\n",
|
1255 |
+
" total_lineas = len(lineas)\n",
|
1256 |
+
" tamaño_parte = math.ceil(total_lineas / num_partes)\n",
|
1257 |
+
"\n",
|
1258 |
+
" # Dividir y guardar las partes\n",
|
1259 |
+
" for i in range(num_partes):\n",
|
1260 |
+
" parte = lineas[i*tamaño_parte:(i+1)*tamaño_parte]\n",
|
1261 |
+
" ruta_nueva = ruta_original.replace('.jsonl', f'_{i+1}.jsonl')\n",
|
1262 |
+
" with open(ruta_nueva, 'w', encoding='utf-8') as new_file:\n",
|
1263 |
+
" new_file.writelines(parte)"
|
1264 |
+
]
|
1265 |
+
},
|
1266 |
+
{
|
1267 |
+
"cell_type": "code",
|
1268 |
+
"execution_count": 125,
|
1269 |
+
"metadata": {},
|
1270 |
+
"outputs": [],
|
1271 |
+
"source": [
|
1272 |
+
"dividir_jsonl('Training_data/prompts_finales.jsonl', 5)"
|
1273 |
+
]
|
1274 |
+
},
|
1275 |
+
{
|
1276 |
+
"cell_type": "code",
|
1277 |
+
"execution_count": null,
|
1278 |
+
"metadata": {},
|
1279 |
+
"outputs": [],
|
1280 |
+
"source": []
|
1281 |
+
}
|
1282 |
+
],
|
1283 |
+
"metadata": {
|
1284 |
+
"kernelspec": {
|
1285 |
+
"display_name": "base",
|
1286 |
+
"language": "python",
|
1287 |
+
"name": "python3"
|
1288 |
+
},
|
1289 |
+
"language_info": {
|
1290 |
+
"codemirror_mode": {
|
1291 |
+
"name": "ipython",
|
1292 |
+
"version": 3
|
1293 |
+
},
|
1294 |
+
"file_extension": ".py",
|
1295 |
+
"mimetype": "text/x-python",
|
1296 |
+
"name": "python",
|
1297 |
+
"nbconvert_exporter": "python",
|
1298 |
+
"pygments_lexer": "ipython3",
|
1299 |
+
"version": "3.9.7"
|
1300 |
+
}
|
1301 |
+
},
|
1302 |
+
"nbformat": 4,
|
1303 |
+
"nbformat_minor": 2
|
1304 |
+
}
|
CARSE_Prep.ipynb
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyMzN+lsyb4smMC9hf/gYgCs"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["#! pip install tiktoken"],"metadata":{"id":"pEb6CJl3ZFKP","executionInfo":{"status":"ok","timestamp":1702156660830,"user_tz":360,"elapsed":173,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}}},"execution_count":17,"outputs":[]},{"cell_type":"code","execution_count":18,"metadata":{"id":"zj6dSJC3Y01I","executionInfo":{"status":"ok","timestamp":1702156660831,"user_tz":360,"elapsed":2,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}}},"outputs":[],"source":["import json\n","import tiktoken # for token counting\n","import numpy as np\n","from collections import defaultdict"]},{"cell_type":"markdown","source":["<br>\n","<br>\n","<br>\n","\n","## Data Loading"],"metadata":{"id":"SmN-A6KJZQPo"}},{"cell_type":"code","source":["data_path = \"CARSE_data/prompts_finales.jsonl\"\n","\n","# Load the dataset\n","with open(data_path, 'r', encoding='utf-8') as f:\n"," dataset = [json.loads(line) for line in f]\n","\n","# Initial dataset stats\n","print(\"\\nNum examples:\", len(dataset))\n","print(\"First example:\")\n","for message in dataset[0][\"messages\"]:\n"," print(message)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-geBXCMTZObN","executionInfo":{"status":"ok","timestamp":1702156661009,"user_tz":360,"elapsed":6,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}},"outputId":"924e87bd-4c01-4eff-ac93-10f07e27fe41"},"execution_count":19,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","Num examples: 670\n","First example:\n","{'role': 'system', 'content': 'CARSE es un chatbot que imita el estilo en que Sebas conversa. Fue creado con amor solo para Carmen. Su objetivo es ofrecer una experiencia de chat divertida y familiar que exprese la personalidad, el humor y el amor de Sebas hacia Carmen.'}\n","{'role': 'user', 'content': 'Tengo mucho sueño aún'}\n","{'role': 'assistant', 'content': 'Ay amorcito, a ver si te echas a dormir en el coche un rato'}\n"]}]},{"cell_type":"markdown","source":["<br>\n","<br>\n","<br>\n","\n","## Format Validation"],"metadata":{"id":"UfxL9TQMaCCG"}},{"cell_type":"code","source":["# Format error checks\n","format_errors = defaultdict(int)\n","\n","for ex in dataset:\n"," if not isinstance(ex, dict):\n"," format_errors[\"data_type\"] += 1\n"," continue\n","\n"," messages = ex.get(\"messages\", None)\n"," if not messages:\n"," format_errors[\"missing_messages_list\"] += 1\n"," continue\n","\n"," for message in messages:\n"," if \"role\" not in message or \"content\" not in message:\n"," format_errors[\"message_missing_key\"] += 1\n","\n"," if any(k not in (\"role\", \"content\", \"name\", \"function_call\") for k in message):\n"," format_errors[\"message_unrecognized_key\"] += 1\n","\n"," if message.get(\"role\", None) not in (\"system\", \"user\", \"assistant\", \"function\"):\n"," format_errors[\"unrecognized_role\"] += 1\n","\n"," content = message.get(\"content\", None)\n"," function_call = message.get(\"function_call\", None)\n","\n"," if (not content and not function_call) or not isinstance(content, str):\n"," format_errors[\"missing_content\"] += 1\n","\n"," if not any(message.get(\"role\", None) == \"assistant\" for message in messages):\n"," format_errors[\"example_missing_assistant_message\"] += 1\n","\n","if format_errors:\n"," print(\"\\nFound errors:\")\n"," for k, v in format_errors.items():\n"," print(f\"{k}: {v}\")\n","else:\n"," print(\"\\nNo errors found\")"],"metadata":{"id":"bOXkqnLgZakM","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1702156661009,"user_tz":360,"elapsed":4,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}},"outputId":"8c716779-0f83-4fbf-c302-56d834735327"},"execution_count":20,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","No errors found\n"]}]},{"cell_type":"markdown","source":["<br>\n","<br>\n","<br>\n","\n","## Token Counting Utilities"],"metadata":{"id":"UXrIdBKtaHkx"}},{"cell_type":"code","source":["encoding = tiktoken.get_encoding(\"cl100k_base\")\n","\n","# not exact!\n","# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n","def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):\n"," num_tokens = 0\n"," for message in messages:\n"," num_tokens += tokens_per_message\n"," for key, value in message.items():\n"," num_tokens += len(encoding.encode(value))\n"," if key == \"name\":\n"," num_tokens += tokens_per_name\n"," num_tokens += 3\n"," return num_tokens\n","\n","def num_assistant_tokens_from_messages(messages):\n"," num_tokens = 0\n"," for message in messages:\n"," if message[\"role\"] == \"assistant\":\n"," num_tokens += len(encoding.encode(message[\"content\"]))\n"," return num_tokens\n","\n","def print_distribution(values, name):\n"," print(f\"\\n#### Distribution of {name}:\")\n"," print(f\"min / max: {min(values)}, {max(values)}\")\n"," print(f\"mean / median: {np.mean(values)}, {np.median(values)}\")\n"," print(f\"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}\")"],"metadata":{"id":"A6ev3s_TaGeJ","executionInfo":{"status":"ok","timestamp":1702156661009,"user_tz":360,"elapsed":3,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}}},"execution_count":21,"outputs":[]},{"cell_type":"markdown","source":["<br>\n","<br>\n","<br>\n","\n","## Data Warnings and Token Counts"],"metadata":{"id":"2duSc8L7aYki"}},{"cell_type":"code","source":["# Warnings and tokens counts\n","n_missing_system = 0\n","n_missing_user = 0\n","n_messages = []\n","convo_lens = []\n","assistant_message_lens = []\n","\n","for ex in dataset:\n"," messages = ex[\"messages\"]\n"," if not any(message[\"role\"] == \"system\" for message in messages):\n"," n_missing_system += 1\n"," if not any(message[\"role\"] == \"user\" for message in messages):\n"," n_missing_user += 1\n"," n_messages.append(len(messages))\n"," convo_lens.append(num_tokens_from_messages(messages))\n"," assistant_message_lens.append(num_assistant_tokens_from_messages(messages))\n","\n","print(\"\\nNum examples missing system message:\", n_missing_system)\n","print(\"Num examples missing user message:\", n_missing_user)\n","print_distribution(n_messages, \"num_messages_per_example\")\n","print_distribution(convo_lens, \"num_total_tokens_per_example\")\n","print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n","n_too_long = sum(l > 4096 for l in convo_lens)\n","print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CfDEmpgiaLuS","executionInfo":{"status":"ok","timestamp":1702156661009,"user_tz":360,"elapsed":3,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}},"outputId":"f1e8ed95-f77b-4340-80ce-b08ee6025a2e"},"execution_count":22,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","Num examples missing system message: 0\n","Num examples missing user message: 0\n","\n","#### Distribution of num_messages_per_example:\n","min / max: 3, 3\n","mean / median: 3.0, 3.0\n","p5 / p95: 3.0, 3.0\n","\n","#### Distribution of num_total_tokens_per_example:\n","min / max: 77, 263\n","mean / median: 102.54776119402985, 97.0\n","p5 / p95: 84.0, 120.10000000000002\n","\n","#### Distribution of num_assistant_tokens_per_example:\n","min / max: 2, 184\n","mean / median: 19.9955223880597, 15.0\n","p5 / p95: 6.0, 34.0\n","\n","0 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n"]}]},{"cell_type":"markdown","source":["<br>\n","<br>\n","<br>\n","\n","## Cost Estimation"],"metadata":{"id":"x8GusuUOapUB"}},{"cell_type":"code","source":["# Pricing and default n_epochs estimate\n","MAX_TOKENS_PER_EXAMPLE = 4096\n","\n","TARGET_EPOCHS = 3\n","MIN_TARGET_EXAMPLES = 100\n","MAX_TARGET_EXAMPLES = 25000\n","MIN_DEFAULT_EPOCHS = 1\n","MAX_DEFAULT_EPOCHS = 25\n","\n","n_epochs = TARGET_EPOCHS\n","n_train_examples = len(dataset)\n","if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:\n"," n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)\n","elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:\n"," n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)\n","\n","n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)\n","print(f\"\\nDataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n","print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n","print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"EnAWqf3SadOj","executionInfo":{"status":"ok","timestamp":1702156661200,"user_tz":360,"elapsed":194,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}},"outputId":"3d0527e9-a3cf-4b6e-9ce4-89a99611d219"},"execution_count":23,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","Dataset has ~68707 tokens that will be charged for during training\n","By default, you'll train for 3 epochs on this dataset\n","By default, you'll be charged for ~206121 tokens\n"]}]},{"cell_type":"code","source":["# Suponiendo que me cargan $0.008/1k tokens\n","value = 0.008\n","\n","final_money = (n_epochs * n_billing_tokens_in_dataset)*value/1000\n","final_money"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-a820sVicdat","executionInfo":{"status":"ok","timestamp":1702156777183,"user_tz":360,"elapsed":2,"user":{"displayName":"Sebastián M.","userId":"05974347998580401662"}},"outputId":"c96c5167-c91e-4e1c-e8a4-0bd16a6aeb70"},"execution_count":26,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1.648968"]},"metadata":{},"execution_count":26}]},{"cell_type":"code","source":[],"metadata":{"id":"0anBNWtKczWR"},"execution_count":null,"outputs":[]}]}
|
Raw_Data/Raw_Prompts.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|