MathJake committed on
Commit
c899d69
·
verified ·
1 Parent(s): 68a8bf7

Upload Rag_milvus.py

Browse files
Files changed (1) hide show
  1. src/Rag_milvus.py +97 -0
src/Rag_milvus.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.schema import Document
3
+ from qdrant_client import QdrantClient
4
+ from qdrant_client.models import PointStruct, Distance, VectorParams
5
+ import fitz # PyMuPDF
6
+ from qdrant_client import QdrantClient
7
+ import numpy as np
8
+ import streamlit as st
9
+
10
def pdfachunk(file, chunk_size_pages=20):
    """Extract the text of an uploaded PDF, grouped into windows of pages.

    Args:
        file: Binary file-like object; its ``read()`` result is handed to
            PyMuPDF as the PDF byte stream.
        chunk_size_pages: Number of consecutive pages concatenated into each
            chunk. Must be at least 1.

    Returns:
        A list of strings in document order, one per window of
        ``chunk_size_pages`` pages. The last window may hold fewer pages.

    Raises:
        ValueError: If ``chunk_size_pages`` is smaller than 1.
    """
    # Validate before touching the upload: a zero step would otherwise fail
    # inside range() with an opaque message, and a negative step would
    # silently produce no chunks at all.
    if chunk_size_pages < 1:
        raise ValueError("chunk_size_pages debe ser un entero positivo.")

    # Use the binary buffer of the uploaded file.
    doc = fitz.open(stream=file.read(), filetype="pdf")
    try:
        total_pages = len(doc)
        return [
            # join() avoids the quadratic cost of repeated string +=.
            "".join(
                doc[page_num].get_text()
                for page_num in range(
                    start, min(start + chunk_size_pages, total_pages)
                )
            )
            for start in range(0, total_pages, chunk_size_pages)
        ]
    finally:
        # Release the document even when a page fails to extract.
        doc.close()
21
+
22
def split_chunks(raw_chunks, chunk_size=1024, chunk_overlap=100):
    """Split raw text chunks into smaller LangChain documents.

    Args:
        raw_chunks: Iterable of strings; each one is wrapped in a
            ``Document`` as its ``page_content``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for the wrapped documents.
    """
    documents = [Document(page_content=raw_text) for raw_text in raw_chunks]
    # Separators are listed from coarsest (paragraph break) to finest (space).
    return RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", " "],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_documents(documents)
30
+
31
def generaremben(model, texts, *, batch_size=16, show_progress_bar=True):
    """Encode the non-blank entries of ``texts`` with ``model``.

    Blank and whitespace-only strings are dropped before encoding, so the
    result can contain fewer rows than ``texts``. Callers that pair the
    embeddings back with their source texts must apply the same filter.

    Args:
        model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=...)`` (presumably a sentence-transformers
            model -- confirm against the caller).
        texts: Iterable of strings to embed.
        batch_size: Forwarded to ``model.encode``. Defaults to the
            previously hard-coded value of 16.
        show_progress_bar: Forwarded to ``model.encode``. Defaults to the
            previously hard-coded value of ``True``.

    Returns:
        Whatever ``model.encode`` returns for the filtered texts.

    Raises:
        ValueError: If no entry of ``texts`` contains non-whitespace
            characters.
    """
    # Filter out empty entries.
    valid_texts = [text for text in texts if text.strip()]
    if not valid_texts:
        raise ValueError("No hay textos válidos para generar embeddings.")
    return model.encode(
        valid_texts,
        batch_size=batch_size,
        show_progress_bar=show_progress_bar,
    )
36
+
37
def insertarenqdra(embeddings, texts, nombre_coleccion):
    """Replace a Qdrant collection with one point per (embedding, text) pair.

    Any existing collection with the same name is dropped and recreated
    with cosine distance and the dimensionality of the first embedding.
    Points get sequential integer ids and carry their source text in the
    ``"text"`` payload field.

    Args:
        embeddings: Sequence of vectors (NumPy rows or plain sequences of
            floats), one per entry of ``texts``.
        texts: Sequence of strings aligned index-by-index with
            ``embeddings``.
        nombre_coleccion: Name of the collection to (re)create.

    Raises:
        ValueError: If ``embeddings`` is empty or its length differs from
            that of ``texts``.
    """
    # len() rather than truthiness: a NumPy array has no unambiguous
    # truth value.
    total = len(embeddings)
    if total == 0:
        raise ValueError("No hay embeddings para insertar en Qdrant.")
    if total != len(texts):
        # A mismatch would attach payload texts to the wrong vectors.
        raise ValueError(
            f"El número de embeddings ({total}) no coincide con el "
            f"número de textos ({len(texts)})."
        )

    dim = len(embeddings[0])
    points = [
        PointStruct(
            id=idx,
            vector=np.asarray(vector).tolist(),
            payload={"text": text},
        )
        for idx, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    client = QdrantClient(path="./data_v2")  # persistent on-disk storage
    try:
        # Explicit drop-and-create replaces the deprecated
        # recreate_collection() call.
        if client.collection_exists(nombre_coleccion):
            client.delete_collection(nombre_coleccion)
        client.create_collection(
            collection_name=nombre_coleccion,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )
        client.upsert(collection_name=nombre_coleccion, points=points)
    finally:
        # Close explicitly so the local storage folder is released for the
        # next client instead of waiting on garbage collection.
        client.close()

    print(f"✅ Insertados {len(points)} vectores en Qdrant.")
53
+
54
def query_qdrant(query, model, nombre_coleccion, top_k, umbral):
    """Search a collection for the points closest to ``query``.

    Args:
        query: Text to embed and search for.
        model: Embedding model forwarded to ``generaremben``.
        nombre_coleccion: Name of the Qdrant collection to search.
        top_k: Maximum number of results, passed to Qdrant as ``limit``.
        umbral: Passed to Qdrant as ``score_threshold``.

    Returns:
        The response object returned by ``client.query_points``, with
        payloads included.

    Raises:
        ValueError: Propagated from ``generaremben`` when ``query`` is
            blank.
    """
    # Convert to a plain list of floats before handing it to the client.
    query_vector = np.array(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path="./data_v2")
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_vector,
            limit=top_k,
            with_payload=True,
            score_threshold=umbral,
        )
    finally:
        # Close explicitly so the local storage folder is released for the
        # next client instead of waiting on garbage collection.
        client.close()
71
+
72
def query_qdrant_sinumbral(query, model, nombre_coleccion, top_k=5):
    """Search a collection for the points closest to ``query``, unfiltered.

    Unlike a thresholded search, no ``score_threshold`` is sent to Qdrant.

    Args:
        query: Text to embed and search for.
        model: Embedding model forwarded to ``generaremben``.
        nombre_coleccion: Name of the Qdrant collection to search.
        top_k: Maximum number of results, passed to Qdrant as ``limit``.

    Returns:
        The response object returned by ``client.query_points``, with
        payloads included.

    Raises:
        ValueError: Propagated from ``generaremben`` when ``query`` is
            blank.
    """
    # Convert to a plain list of floats before handing it to the client.
    query_vector = np.array(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path="./data_v2")
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_vector,
            limit=top_k,
            with_payload=True,
        )
    finally:
        # Close explicitly so the local storage folder is released for the
        # next client instead of waiting on garbage collection.
        client.close()
88
+
89
+
90
def obtener_colecciones(path="./data_v2"):
    """List the collection names in a local Qdrant store.

    Args:
        path: Directory of the on-disk Qdrant storage.

    Returns:
        A list starting with the sentinel entry ``"Todas las colecciones"``
        followed by the name of every collection found in the store.
    """
    client = QdrantClient(path=path)
    try:
        names = [col.name for col in client.get_collections().collections]
    finally:
        # Close explicitly so the local storage folder is released for the
        # next client instead of waiting on garbage collection.
        client.close()
    return ["Todas las colecciones", *names]
94
+
95
+
96
+
97
+