MathJake committed on
Commit
68a8bf7
·
verified ·
1 Parent(s): 90d5441

Delete Rag_milvus.ipynb

Browse files
Files changed (1) hide show
  1. Rag_milvus.ipynb +0 -571
Rag_milvus.ipynb DELETED
@@ -1,571 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "bf597549",
6
- "metadata": {},
7
- "source": [
8
- "PRIMERO PREPARAMOS TODAS NUESTRAS FUNCIONES PARA PODER INVOCARLAS LUEGO."
9
- ]
10
- },
11
- {
12
- "cell_type": "markdown",
13
- "id": "7968949c",
14
- "metadata": {},
15
- "source": [
16
- "Instalamos las dependencias"
17
- ]
18
- },
19
- {
20
- "cell_type": "code",
21
- "execution_count": 1,
22
- "id": "9a192af6",
23
- "metadata": {},
24
- "outputs": [
25
- {
26
- "name": "stdout",
27
- "output_type": "stream",
28
- "text": [
29
- "Collecting qdrant-client\n",
30
- " Obtaining dependency information for qdrant-client from https://files.pythonhosted.org/packages/e4/52/f49b0aa96253010f57cf80315edecec4f469e7a39c1ed92bf727fa290e57/qdrant_client-1.14.2-py3-none-any.whl.metadata\n",
31
- " Downloading qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)\n",
32
- "Requirement already satisfied: transformers in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (4.51.3)\n",
33
- "Requirement already satisfied: torch in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (2.7.0)\n",
34
- "Requirement already satisfied: langchain in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (0.3.24)\n",
35
- "Collecting pymupdf\n",
36
- " Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/71/c2/a9059607f80dcaf2392f991748cfc53456820392c0220cff02572653512a/pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata\n",
37
- " Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata (3.4 kB)\n",
38
- "Collecting grpcio>=1.41.0 (from qdrant-client)\n",
39
- " Obtaining dependency information for grpcio>=1.41.0 from https://files.pythonhosted.org/packages/ee/3f/cf92e7e62ccb8dbdf977499547dfc27133124d6467d3a7d23775bcecb0f9/grpcio-1.71.0-cp311-cp311-win_amd64.whl.metadata\n",
40
- " Using cached grpcio-1.71.0-cp311-cp311-win_amd64.whl.metadata (4.0 kB)\n",
41
- "Requirement already satisfied: httpx[http2]>=0.20.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from qdrant-client) (0.28.1)\n",
42
- "Requirement already satisfied: numpy>=1.21 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from qdrant-client) (2.2.5)\n",
43
- "Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)\n",
44
- " Obtaining dependency information for portalocker<3.0.0,>=2.7.0 from https://files.pythonhosted.org/packages/9b/fb/a70a4214956182e0d7a9099ab17d50bfcba1056188e9b14f35b9e2b62a0d/portalocker-2.10.1-py3-none-any.whl.metadata\n",
45
- " Using cached portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)\n",
46
- "Collecting protobuf>=3.20.0 (from qdrant-client)\n",
47
- " Obtaining dependency information for protobuf>=3.20.0 from https://files.pythonhosted.org/packages/97/e9/7b9f1b259d509aef2b833c29a1f3c39185e2bf21c9c1be1cd11c22cb2149/protobuf-6.30.2-cp310-abi3-win_amd64.whl.metadata\n",
48
- " Downloading protobuf-6.30.2-cp310-abi3-win_amd64.whl.metadata (593 bytes)\n",
49
- "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.2.0,>=1.10.8 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from qdrant-client) (2.11.4)\n",
50
- "Requirement already satisfied: urllib3<3,>=1.26.14 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from qdrant-client) (2.4.0)\n",
51
- "Requirement already satisfied: filelock in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (3.18.0)\n",
52
- "Requirement already satisfied: huggingface-hub<1.0,>=0.30.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (0.30.2)\n",
53
- "Requirement already satisfied: packaging>=20.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (24.2)\n",
54
- "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (6.0.2)\n",
55
- "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (2024.11.6)\n",
56
- "Requirement already satisfied: requests in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (2.32.3)\n",
57
- "Requirement already satisfied: tokenizers<0.22,>=0.21 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (0.21.1)\n",
58
- "Requirement already satisfied: safetensors>=0.4.3 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (0.5.3)\n",
59
- "Requirement already satisfied: tqdm>=4.27 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from transformers) (4.67.1)\n",
60
- "Requirement already satisfied: typing-extensions>=4.10.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from torch) (4.13.2)\n",
61
- "Requirement already satisfied: sympy>=1.13.3 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from torch) (1.14.0)\n",
62
- "Requirement already satisfied: networkx in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from torch) (3.4.2)\n",
63
- "Requirement already satisfied: jinja2 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from torch) (3.1.6)\n",
64
- "Requirement already satisfied: fsspec in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from torch) (2025.3.2)\n",
65
- "Requirement already satisfied: langchain-core<1.0.0,>=0.3.55 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain) (0.3.56)\n",
66
- "Requirement already satisfied: langchain-text-splitters<1.0.0,>=0.3.8 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain) (0.3.8)\n",
67
- "Requirement already satisfied: langsmith<0.4,>=0.1.17 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain) (0.3.39)\n",
68
- "Requirement already satisfied: SQLAlchemy<3,>=1.4 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain) (2.0.40)\n",
69
- "Requirement already satisfied: anyio in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (4.9.0)\n",
70
- "Requirement already satisfied: certifi in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (2025.4.26)\n",
71
- "Requirement already satisfied: httpcore==1.* in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (1.0.9)\n",
72
- "Requirement already satisfied: idna in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from httpx[http2]>=0.20.0->qdrant-client) (3.10)\n",
73
- "Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client)\n",
74
- " Obtaining dependency information for h2<5,>=3 from https://files.pythonhosted.org/packages/d0/9e/984486f2d0a0bd2b024bf4bc1c62688fcafa9e61991f041fb0e2def4a982/h2-4.2.0-py3-none-any.whl.metadata\n",
75
- " Using cached h2-4.2.0-py3-none-any.whl.metadata (5.1 kB)\n",
76
- "Requirement already satisfied: h11>=0.16 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from httpcore==1.*->httpx[http2]>=0.20.0->qdrant-client) (0.16.0)\n",
77
- "Requirement already satisfied: tenacity!=8.4.0,<10.0.0,>=8.1.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain-core<1.0.0,>=0.3.55->langchain) (9.1.2)\n",
78
- "Requirement already satisfied: jsonpatch<2.0,>=1.33 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langchain-core<1.0.0,>=0.3.55->langchain) (1.33)\n",
79
- "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langsmith<0.4,>=0.1.17->langchain) (3.10.18)\n",
80
- "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langsmith<0.4,>=0.1.17->langchain) (1.0.0)\n",
81
- "Requirement already satisfied: zstandard<0.24.0,>=0.23.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from langsmith<0.4,>=0.1.17->langchain) (0.23.0)\n",
82
- "Requirement already satisfied: pywin32>=226 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from portalocker<3.0.0,>=2.7.0->qdrant-client) (310)\n",
83
- "Requirement already satisfied: annotated-types>=0.6.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.0,>=1.10.8->qdrant-client) (0.7.0)\n",
84
- "Requirement already satisfied: pydantic-core==2.33.2 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.0,>=1.10.8->qdrant-client) (2.33.2)\n",
85
- "Requirement already satisfied: typing-inspection>=0.4.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.0,>=1.10.8->qdrant-client) (0.4.0)\n",
86
- "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from requests->transformers) (3.4.1)\n",
87
- "Requirement already satisfied: greenlet>=1 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.2.1)\n",
88
- "Requirement already satisfied: mpmath<1.4,>=1.1.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from sympy>=1.13.3->torch) (1.3.0)\n",
89
- "Requirement already satisfied: colorama in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from tqdm>=4.27->transformers) (0.4.6)\n",
90
- "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from jinja2->torch) (3.0.2)\n",
91
- "Collecting hyperframe<7,>=6.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)\n",
92
- " Obtaining dependency information for hyperframe<7,>=6.1 from https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl.metadata\n",
93
- " Using cached hyperframe-6.1.0-py3-none-any.whl.metadata (4.3 kB)\n",
94
- "Collecting hpack<5,>=4.1 (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client)\n",
95
- " Obtaining dependency information for hpack<5,>=4.1 from https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl.metadata\n",
96
- " Using cached hpack-4.1.0-py3-none-any.whl.metadata (4.6 kB)\n",
97
- "Requirement already satisfied: jsonpointer>=1.9 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from jsonpatch<2.0,>=1.33->langchain-core<1.0.0,>=0.3.55->langchain) (3.0.0)\n",
98
- "Requirement already satisfied: sniffio>=1.1 in c:\\users\\adm\\documents\\rag_milvus\\.venv\\lib\\site-packages (from anyio->httpx[http2]>=0.20.0->qdrant-client) (1.3.1)\n",
99
- "Downloading qdrant_client-1.14.2-py3-none-any.whl (327 kB)\n",
100
- " ---------------------------------------- 0.0/327.7 kB ? eta -:--:--\n",
101
- " -------------- ------------------------- 122.9/327.7 kB 3.6 MB/s eta 0:00:01\n",
102
- " ---------------------------------------- 327.7/327.7 kB 5.1 MB/s eta 0:00:00\n",
103
- "Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)\n",
104
- " ---------------------------------------- 0.0/16.6 MB ? eta -:--:--\n",
105
- " - -------------------------------------- 0.5/16.6 MB 14.4 MB/s eta 0:00:02\n",
106
- " -- ------------------------------------- 1.0/16.6 MB 12.8 MB/s eta 0:00:02\n",
107
- " ---- ----------------------------------- 1.8/16.6 MB 14.0 MB/s eta 0:00:02\n",
108
- " ------ --------------------------------- 2.5/16.6 MB 14.6 MB/s eta 0:00:01\n",
109
- " ------- -------------------------------- 3.3/16.6 MB 15.0 MB/s eta 0:00:01\n",
110
- " --------- ------------------------------ 4.0/16.6 MB 14.9 MB/s eta 0:00:01\n",
111
- " ----------- ---------------------------- 4.7/16.6 MB 15.1 MB/s eta 0:00:01\n",
112
- " ------------- -------------------------- 5.4/16.6 MB 15.1 MB/s eta 0:00:01\n",
113
- " -------------- ------------------------- 6.2/16.6 MB 15.1 MB/s eta 0:00:01\n",
114
- " ---------------- ----------------------- 6.9/16.6 MB 15.2 MB/s eta 0:00:01\n",
115
- " ------------------ --------------------- 7.6/16.6 MB 15.2 MB/s eta 0:00:01\n",
116
- " -------------------- ------------------- 8.3/16.6 MB 15.6 MB/s eta 0:00:01\n",
117
- " --------------------- ------------------ 9.1/16.6 MB 15.7 MB/s eta 0:00:01\n",
118
- " ----------------------- ---------------- 9.8/16.6 MB 15.6 MB/s eta 0:00:01\n",
119
- " ------------------------- -------------- 10.4/16.6 MB 15.6 MB/s eta 0:00:01\n",
120
- " -------------------------- ------------- 11.0/16.6 MB 15.6 MB/s eta 0:00:01\n",
121
- " ---------------------------- ----------- 11.8/16.6 MB 15.6 MB/s eta 0:00:01\n",
122
- " ------------------------------ --------- 12.5/16.6 MB 15.6 MB/s eta 0:00:01\n",
123
- " ------------------------------- -------- 13.2/16.6 MB 15.6 MB/s eta 0:00:01\n",
124
- " --------------------------------- ------ 13.9/16.6 MB 15.6 MB/s eta 0:00:01\n",
125
- " ----------------------------------- ---- 14.6/16.6 MB 15.2 MB/s eta 0:00:01\n",
126
- " ------------------------------------ --- 15.3/16.6 MB 15.2 MB/s eta 0:00:01\n",
127
- " -------------------------------------- - 16.1/16.6 MB 16.0 MB/s eta 0:00:01\n",
128
- " --------------------------------------- 16.6/16.6 MB 15.6 MB/s eta 0:00:01\n",
129
- " ---------------------------------------- 16.6/16.6 MB 15.2 MB/s eta 0:00:00\n",
130
- "Using cached grpcio-1.71.0-cp311-cp311-win_amd64.whl (4.3 MB)\n",
131
- "Using cached portalocker-2.10.1-py3-none-any.whl (18 kB)\n",
132
- "Downloading protobuf-6.30.2-cp310-abi3-win_amd64.whl (431 kB)\n",
133
- " ---------------------------------------- 0.0/431.0 kB ? eta -:--:--\n",
134
- " --------------------------------------- 431.0/431.0 kB 26.3 MB/s eta 0:00:00\n",
135
- "Using cached h2-4.2.0-py3-none-any.whl (60 kB)\n",
136
- "Using cached hpack-4.1.0-py3-none-any.whl (34 kB)\n",
137
- "Using cached hyperframe-6.1.0-py3-none-any.whl (13 kB)\n",
138
- "Installing collected packages: pymupdf, protobuf, portalocker, hyperframe, hpack, grpcio, h2, qdrant-client\n",
139
- "Successfully installed grpcio-1.71.0 h2-4.2.0 hpack-4.1.0 hyperframe-6.1.0 portalocker-2.10.1 protobuf-6.30.2 pymupdf-1.25.5 qdrant-client-1.14.2\n",
140
- "Note: you may need to restart the kernel to use updated packages.\n"
141
- ]
142
- },
143
- {
144
- "name": "stderr",
145
- "output_type": "stream",
146
- "text": [
147
- "\n",
148
- "[notice] A new release of pip is available: 23.2.1 -> 25.1\n",
149
- "[notice] To update, run: python.exe -m pip install --upgrade pip\n"
150
- ]
151
- }
152
- ],
153
- "source": [
154
- "%pip install qdrant-client transformers torch langchain pymupdf"
155
- ]
156
- },
157
- {
158
- "cell_type": "markdown",
159
- "id": "a4833977",
160
- "metadata": {},
161
- "source": [
162
- "Importamos las librerías"
163
- ]
164
- },
165
- {
166
- "cell_type": "code",
167
- "execution_count": 19,
168
- "id": "1684b4de",
169
- "metadata": {},
170
- "outputs": [],
171
- "source": [
172
- "from sentence_transformers import SentenceTransformer\n",
173
- "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
174
- "from langchain.schema import Document\n",
175
- "from qdrant_client import QdrantClient\n",
176
- "from qdrant_client.models import PointStruct, Distance, VectorParams\n",
177
- "import fitz # PyMuPDF"
178
- ]
179
- },
180
- {
181
- "cell_type": "markdown",
182
- "id": "e612e674",
183
- "metadata": {},
184
- "source": [
185
- "Definimos funciones\n",
186
- "1) Cargar los PDF por bloques de páginas"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": 20,
192
- "id": "5a594ed8",
193
- "metadata": {},
194
- "outputs": [],
195
- "source": [
196
- "def pdfachunk(path, chunk_size_pages=20):\n",
197
- " doc = fitz.open(path)\n",
198
- " chunks = []\n",
199
- " for i in range(0, len(doc), chunk_size_pages):\n",
200
- " text = \"\"\n",
201
- " for page_num in range(i, min(i + chunk_size_pages, len(doc))):\n",
202
- " text += doc[page_num].get_text()\n",
203
- " chunks.append(text)\n",
204
- " doc.close()\n",
205
- " return chunks"
206
- ]
207
- },
208
- {
209
- "cell_type": "markdown",
210
- "id": "59d048b7",
211
- "metadata": {},
212
- "source": [
213
- "2) Dividir texto en chunks más pequeños con solapamiento"
214
- ]
215
- },
216
- {
217
- "cell_type": "code",
218
- "execution_count": 21,
219
- "id": "bffac6eb",
220
- "metadata": {},
221
- "outputs": [],
222
- "source": [
223
- "def split_chunks(raw_chunks, chunk_size=1024, chunk_overlap=100):\n",
224
- " docs = [Document(page_content=chunk) for chunk in raw_chunks]\n",
225
- " splitter = RecursiveCharacterTextSplitter(\n",
226
- " chunk_size=chunk_size,\n",
227
- " chunk_overlap=chunk_overlap,\n",
228
- " separators=[\"\\n\\n\", \"\\n\", \".\", \" \"]\n",
229
- " )\n",
230
- " return splitter.split_documents(docs)"
231
- ]
232
- },
233
- {
234
- "cell_type": "markdown",
235
- "id": "8664bf6f",
236
- "metadata": {},
237
- "source": [
238
- "3) Generar embeddings en batch"
239
- ]
240
- },
241
- {
242
- "cell_type": "code",
243
- "execution_count": 22,
244
- "id": "35a4df0b",
245
- "metadata": {},
246
- "outputs": [],
247
- "source": [
248
- "def generaremben(model, texts):\n",
249
- " texts = [t for t in texts if t.strip()] # filtra vacíos\n",
250
- " if not texts:\n",
251
- " raise ValueError(\"No hay textos válidos para generar embeddings.\")\n",
252
- " return model.encode(texts, batch_size=16, show_progress_bar=True)\n"
253
- ]
254
- },
255
- {
256
- "cell_type": "markdown",
257
- "id": "c28a5724",
258
- "metadata": {},
259
- "source": [
260
- "4) Insertar los documentos en Qdrant localmente"
261
- ]
262
- },
263
- {
264
- "cell_type": "code",
265
- "execution_count": 23,
266
- "id": "3c61ddca",
267
- "metadata": {},
268
- "outputs": [],
269
- "source": [
270
- "def insertarenqdra(embeddings, texts, collection_name=\"pdf_chunks\"):\n",
271
- " client = QdrantClient(path=\"./qdrant_data\") # persistente\n",
272
- "\n",
273
- " dim = len(embeddings[0])\n",
274
- " client.recreate_collection(\n",
275
- " collection_name=collection_name,\n",
276
- " vectors_config=VectorParams(size=dim, distance=Distance.COSINE)\n",
277
- " )\n",
278
- "\n",
279
- " points = [\n",
280
- " PointStruct(id=i, vector=embeddings[i].tolist(), payload={\"text\": texts[i]})\n",
281
- " for i in range(len(embeddings))\n",
282
- " ]\n",
283
- "\n",
284
- " client.upsert(collection_name=collection_name, points=points)\n",
285
- " print(f\"✅ Insertados {len(points)} vectores en Qdrant.\")"
286
- ]
287
- },
288
- {
289
- "cell_type": "markdown",
290
- "id": "566e06c8",
291
- "metadata": {},
292
- "source": [
293
- "5) Función que carga el modelo, para no cargarlo siempre"
294
- ]
295
- },
296
- {
297
- "cell_type": "code",
298
- "execution_count": 6,
299
- "id": "eec86477",
300
- "metadata": {},
301
- "outputs": [],
302
- "source": [
303
- "def load_nv_model():\n",
304
- " return AutoModel.from_pretrained(\"nvidia/NV-Embed-v2\", trust_remote_code=True)"
305
- ]
306
- },
307
- {
308
- "cell_type": "markdown",
309
- "id": "08510d36",
310
- "metadata": {},
311
- "source": [
312
- "Probamos"
313
- ]
314
- },
315
- {
316
- "cell_type": "code",
317
- "execution_count": 24,
318
- "id": "2735d1a1",
319
- "metadata": {},
320
- "outputs": [],
321
- "source": [
322
- "pdf_path=\"./DOCS/Decreto-Supremo-N_-018-2019-JUS.pdf\" "
323
- ]
324
- },
325
- {
326
- "cell_type": "code",
327
- "execution_count": 25,
328
- "id": "6ede8122",
329
- "metadata": {},
330
- "outputs": [],
331
- "source": [
332
- "pdf_chunks = pdfachunk(pdf_path)"
333
- ]
334
- },
335
- {
336
- "cell_type": "code",
337
- "execution_count": 26,
338
- "id": "8f33af13",
339
- "metadata": {},
340
- "outputs": [],
341
- "source": [
342
- "split_docs = split_chunks(pdf_chunks)"
343
- ]
344
- },
345
- {
346
- "cell_type": "code",
347
- "execution_count": 27,
348
- "id": "b0fb3761",
349
- "metadata": {},
350
- "outputs": [],
351
- "source": [
352
- "texts = [doc.page_content for doc in split_docs]"
353
- ]
354
- },
355
- {
356
- "cell_type": "markdown",
357
- "id": "85f2ee3f",
358
- "metadata": {},
359
- "source": [
360
- "Definimos nuestro modelo de embeddings"
361
- ]
362
- },
363
- {
364
- "cell_type": "markdown",
365
- "id": "5eb18c36",
366
- "metadata": {},
367
- "source": [
368
- "NECESITAMOS DATASETS Y EINOPS"
369
- ]
370
- },
371
- {
372
- "cell_type": "code",
373
- "execution_count": null,
374
- "id": "74262eaa",
375
- "metadata": {},
376
- "outputs": [],
377
- "source": [
378
- "%pip install datasets einops"
379
- ]
380
- },
381
- {
382
- "cell_type": "code",
383
- "execution_count": 28,
384
- "id": "93bbbcde",
385
- "metadata": {},
386
- "outputs": [
387
- {
388
- "name": "stderr",
389
- "output_type": "stream",
390
- "text": [
391
- "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
392
- ]
393
- }
394
- ],
395
- "source": [
396
- "model = SentenceTransformer(\"all-MiniLM-L6-v2\")"
397
- ]
398
- },
399
- {
400
- "cell_type": "code",
401
- "execution_count": 13,
402
- "id": "f9f4d5bd",
403
- "metadata": {},
404
- "outputs": [
405
- {
406
- "name": "stdout",
407
- "output_type": "stream",
408
- "text": [
409
- "<class 'transformers_modules.nvidia.NV-Embed-v2.c50d55f43bde7e6a18e0eaa15a62fd63a930f1a1.modeling_nvembed.NVEmbedModel'>\n"
410
- ]
411
- }
412
- ],
413
- "source": [
414
- "print(type(model))"
415
- ]
416
- },
417
- {
418
- "cell_type": "code",
419
- "execution_count": 29,
420
- "id": "e594d6af",
421
- "metadata": {},
422
- "outputs": [
423
- {
424
- "name": "stderr",
425
- "output_type": "stream",
426
- "text": [
427
- "Batches: 100%|██████████| 12/12 [00:03<00:00, 3.29it/s]\n"
428
- ]
429
- }
430
- ],
431
- "source": [
432
- "embeddings = generaremben(model, texts)"
433
- ]
434
- },
435
- {
436
- "cell_type": "code",
437
- "execution_count": 30,
438
- "id": "beba3991",
439
- "metadata": {},
440
- "outputs": [
441
- {
442
- "name": "stderr",
443
- "output_type": "stream",
444
- "text": [
445
- "C:\\Users\\adm\\AppData\\Local\\Temp\\ipykernel_26272\\752761030.py:5: DeprecationWarning: `recreate_collection` method is deprecated and will be removed in the future. Use `collection_exists` to check collection existence and `create_collection` instead.\n",
446
- " client.recreate_collection(\n"
447
- ]
448
- },
449
- {
450
- "name": "stdout",
451
- "output_type": "stream",
452
- "text": [
453
- "✅ Insertados 181 vectores en Qdrant.\n"
454
- ]
455
- }
456
- ],
457
- "source": [
458
- "insertarenqdra(embeddings, texts, collection_name=\"jus_decreto_018\")"
459
- ]
460
- },
461
- {
462
- "cell_type": "markdown",
463
- "id": "2c241dc6",
464
- "metadata": {},
465
- "source": [
466
- "Función para consultar en Qdrant"
467
- ]
468
- },
469
- {
470
- "cell_type": "code",
471
- "execution_count": 31,
472
- "id": "86beaf73",
473
- "metadata": {},
474
- "outputs": [],
475
- "source": [
476
- "from qdrant_client import QdrantClient\n",
477
- "import numpy as np"
478
- ]
479
- },
480
- {
481
- "cell_type": "code",
482
- "execution_count": null,
483
- "id": "b81de586",
484
- "metadata": {},
485
- "outputs": [],
486
- "source": [
487
- "def query_qdrant(query, model, collection_name, top_k=5):\n",
488
- " # Generar embedding de la consulta\n",
489
- " query_embedding = model.encode([query])[0]\n",
490
- " \n",
491
- " # Conexión al cliente Qdrant\n",
492
- " client = QdrantClient(path=\"./qdrant_data\")\n",
493
- "\n",
494
- " # Realizar búsqueda en la colección\n",
495
- " results = client.search(\n",
496
- " collection_name=collection_name,\n",
497
- " query_vector=query_embedding.tolist(),\n",
498
- " limit=top_k, # Limitar a los primeros K resultados más similares\n",
499
- " with_payload=True # Incluir el texto en los resultados\n",
500
- " )\n",
501
- "\n",
502
- " return results"
503
- ]
504
- },
505
- {
506
- "cell_type": "code",
507
- "execution_count": 36,
508
- "id": "9d448736",
509
- "metadata": {},
510
- "outputs": [
511
- {
512
- "ename": "RuntimeError",
513
- "evalue": "Storage folder ./qdrant_data is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.",
514
- "output_type": "error",
515
- "traceback": [
516
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
517
- "\u001b[31merror\u001b[39m Traceback (most recent call last)",
518
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\portalocker\\portalocker.py:49\u001b[39m, in \u001b[36mlock\u001b[39m\u001b[34m(file_, flags)\u001b[39m\n\u001b[32m 48\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m49\u001b[39m \u001b[43mwin32file\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLockFileEx\u001b[49m\u001b[43m(\u001b[49m\u001b[43mos_fh\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m-\u001b[49m\u001b[32;43m0x10000\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m__overlapped\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 50\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m pywintypes.error \u001b[38;5;28;01mas\u001b[39;00m exc_value:\n\u001b[32m 51\u001b[39m \u001b[38;5;66;03m# error: (33, 'LockFileEx', 'The process cannot access the file\u001b[39;00m\n\u001b[32m 52\u001b[39m \u001b[38;5;66;03m# because another process has locked a portion of the file.')\u001b[39;00m\n",
519
- "\u001b[31merror\u001b[39m: (33, 'LockFileEx', 'El proceso no tiene acceso al archivo porque otro proceso tiene bloqueada una parte del archivo.')",
520
- "\nThe above exception was the direct cause of the following exception:\n",
521
- "\u001b[31mAlreadyLocked\u001b[39m Traceback (most recent call last)",
522
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\qdrant_client\\local\\qdrant_local.py:133\u001b[39m, in \u001b[36mQdrantLocal._load\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 132\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m133\u001b[39m \u001b[43mportalocker\u001b[49m\u001b[43m.\u001b[49m\u001b[43mlock\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 134\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_flock_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 135\u001b[39m \u001b[43m \u001b[49m\u001b[43mportalocker\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLockFlags\u001b[49m\u001b[43m.\u001b[49m\u001b[43mEXCLUSIVE\u001b[49m\u001b[43m \u001b[49m\u001b[43m|\u001b[49m\u001b[43m \u001b[49m\u001b[43mportalocker\u001b[49m\u001b[43m.\u001b[49m\u001b[43mLockFlags\u001b[49m\u001b[43m.\u001b[49m\u001b[43mNON_BLOCKING\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 136\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 137\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m portalocker.exceptions.LockException:\n",
523
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\portalocker\\portalocker.py:54\u001b[39m, in \u001b[36mlock\u001b[39m\u001b[34m(file_, flags)\u001b[39m\n\u001b[32m 53\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m exc_value.winerror == winerror.ERROR_LOCK_VIOLATION:\n\u001b[32m---> \u001b[39m\u001b[32m54\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m exceptions.AlreadyLocked(\n\u001b[32m 55\u001b[39m exceptions.LockException.LOCK_FAILED,\n\u001b[32m 56\u001b[39m exc_value.strerror,\n\u001b[32m 57\u001b[39m fh=file_,\n\u001b[32m 58\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mexc_value\u001b[39;00m\n\u001b[32m 59\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 60\u001b[39m \u001b[38;5;66;03m# Q: Are there exceptions/codes we should be dealing with\u001b[39;00m\n\u001b[32m 61\u001b[39m \u001b[38;5;66;03m# here?\u001b[39;00m\n",
524
- "\u001b[31mAlreadyLocked\u001b[39m: (1, 'El proceso no tiene acceso al archivo porque otro proceso tiene bloqueada una parte del archivo.')",
525
- "\nDuring handling of the above exception, another exception occurred:\n",
526
- "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)",
527
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[36]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m query = \u001b[33m\"\u001b[39m\u001b[33m¿Cuál es el propósito de la Ley 018-2019?\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m results = \u001b[43mquery_qdrant\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mjus_decreto_018\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
528
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[34]\u001b[39m\u001b[32m, line 6\u001b[39m, in \u001b[36mquery_qdrant\u001b[39m\u001b[34m(query, model, collection_name, top_k)\u001b[39m\n\u001b[32m 3\u001b[39m query_embedding = model.encode([query])[\u001b[32m0\u001b[39m]\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# Conexión al cliente Qdrant\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m client = \u001b[43mQdrantClient\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m./qdrant_data\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[38;5;66;03m# Realizar búsqueda en la colección\u001b[39;00m\n\u001b[32m 9\u001b[39m results = client.search(\n\u001b[32m 10\u001b[39m collection_name=collection_name,\n\u001b[32m 11\u001b[39m query_vector=query_embedding.tolist(),\n\u001b[32m 12\u001b[39m limit=top_k, \u001b[38;5;66;03m# Limitar a los primeros K resultados más similares\u001b[39;00m\n\u001b[32m 13\u001b[39m with_payload=\u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# Incluir el texto en los resultados\u001b[39;00m\n\u001b[32m 14\u001b[39m )\n",
529
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\qdrant_client\\qdrant_client.py:133\u001b[39m, in \u001b[36mQdrantClient.__init__\u001b[39m\u001b[34m(self, location, url, port, grpc_port, prefer_grpc, https, api_key, prefix, timeout, host, path, force_disable_check_same_thread, grpc_options, auth_token_provider, cloud_inference, local_inference_batch_size, check_compatibility, **kwargs)\u001b[39m\n\u001b[32m 128\u001b[39m \u001b[38;5;28mself\u001b[39m._client = QdrantLocal(\n\u001b[32m 129\u001b[39m location=location,\n\u001b[32m 130\u001b[39m force_disable_check_same_thread=force_disable_check_same_thread,\n\u001b[32m 131\u001b[39m )\n\u001b[32m 132\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m133\u001b[39m \u001b[38;5;28mself\u001b[39m._client = \u001b[43mQdrantLocal\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 134\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 135\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_disable_check_same_thread\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_disable_check_same_thread\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 136\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 137\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 138\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m location \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m url \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
530
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\qdrant_client\\local\\qdrant_local.py:66\u001b[39m, in \u001b[36mQdrantLocal.__init__\u001b[39m\u001b[34m(self, location, force_disable_check_same_thread)\u001b[39m\n\u001b[32m 64\u001b[39m \u001b[38;5;28mself\u001b[39m.aliases: \u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mstr\u001b[39m] = {}\n\u001b[32m 65\u001b[39m \u001b[38;5;28mself\u001b[39m._flock_file: Optional[TextIOWrapper] = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m66\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 67\u001b[39m \u001b[38;5;28mself\u001b[39m._closed: \u001b[38;5;28mbool\u001b[39m = \u001b[38;5;28;01mFalse\u001b[39;00m\n",
531
- "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\adm\\Documents\\rag_MILVUS\\.venv\\Lib\\site-packages\\qdrant_client\\local\\qdrant_local.py:138\u001b[39m, in \u001b[36mQdrantLocal._load\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 133\u001b[39m portalocker.lock(\n\u001b[32m 134\u001b[39m \u001b[38;5;28mself\u001b[39m._flock_file,\n\u001b[32m 135\u001b[39m portalocker.LockFlags.EXCLUSIVE | portalocker.LockFlags.NON_BLOCKING,\n\u001b[32m 136\u001b[39m )\n\u001b[32m 137\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m portalocker.exceptions.LockException:\n\u001b[32m--> \u001b[39m\u001b[32m138\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 139\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mStorage folder \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.location\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m is already accessed by another instance of Qdrant client.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 140\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m If you require concurrent access, use Qdrant server instead.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 141\u001b[39m )\n",
532
- "\u001b[31mRuntimeError\u001b[39m: Storage folder ./qdrant_data is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead."
533
- ]
534
- }
535
- ],
536
- "source": [
537
- "query = \"¿Cuál es el propósito de la Ley 018-2019?\"\n",
538
- "results = query_qdrant(query, model,\"jus_decreto_018\")"
539
- ]
540
- },
541
- {
542
- "cell_type": "code",
543
- "execution_count": null,
544
- "id": "61d76427",
545
- "metadata": {},
546
- "outputs": [],
547
- "source": []
548
- }
549
- ],
550
- "metadata": {
551
- "kernelspec": {
552
- "display_name": ".venv",
553
- "language": "python",
554
- "name": "python3"
555
- },
556
- "language_info": {
557
- "codemirror_mode": {
558
- "name": "ipython",
559
- "version": 3
560
- },
561
- "file_extension": ".py",
562
- "mimetype": "text/x-python",
563
- "name": "python",
564
- "nbconvert_exporter": "python",
565
- "pygments_lexer": "ipython3",
566
- "version": "3.11.5"
567
- }
568
- },
569
- "nbformat": 4,
570
- "nbformat_minor": 5
571
- }