# PrubasPy / src / Rag_milvus.py
# MathJake's picture
# Upload Rag_milvus.py
# c899d69 verified
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams
import fitz # PyMuPDF
from qdrant_client import QdrantClient
import numpy as np
import streamlit as st
def pdfachunk(file, chunk_size_pages=20):
    """Extract the text of a PDF, grouped into fixed-size runs of pages.

    Args:
        file: Binary file-like object exposing ``read()`` whose bytes are a
            PDF document (for example an uploaded file buffer).
        chunk_size_pages: Number of consecutive pages merged into each
            chunk. The last chunk may cover fewer pages.

    Returns:
        A list of strings, one per page run, in document order. An empty
        list when the document has no pages.

    Raises:
        ValueError: If ``chunk_size_pages`` is smaller than 1.
    """
    # Validate before touching the file: a step of 0 would make range() fail
    # after the document is already open, and a negative step would silently
    # yield no chunks at all.
    if chunk_size_pages < 1:
        raise ValueError("chunk_size_pages debe ser mayor o igual a 1.")

    # Read from the binary buffer of the uploaded file. The context manager
    # closes the document even if text extraction raises midway.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        total_pages = len(doc)
        return [
            "".join(
                doc[page_num].get_text()
                for page_num in range(
                    start, min(start + chunk_size_pages, total_pages)
                )
            )
            for start in range(0, total_pages, chunk_size_pages)
        ]
def split_chunks(raw_chunks, chunk_size=1024, chunk_overlap=100):
    """Split raw text chunks into smaller, overlapping LangChain documents.

    Args:
        raw_chunks: Iterable of strings; each one is wrapped in a
            ``Document`` before splitting.
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the wrapped documents.
    """
    wrapped = []
    for raw_text in raw_chunks:
        wrapped.append(Document(page_content=raw_text))

    # Separator candidates: blank line, newline, period, single space.
    separator_list = ["\n\n", "\n", ".", " "]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separator_list,
    )
    pieces = text_splitter.split_documents(wrapped)
    return pieces
def generaremben(model, texts):
    """Encode the non-blank entries of ``texts`` with ``model``.

    Entries that are empty or whitespace-only are dropped before encoding,
    so the result can have fewer rows than ``texts`` has items.

    Args:
        model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=...)``.
        texts: Iterable of strings.

    Returns:
        The value returned by ``model.encode`` for the surviving texts.

    Raises:
        ValueError: If no entry remains after discarding blank ones.
    """
    non_blank = []
    for candidate in texts:
        # strip() returns "" (falsy) for empty or whitespace-only strings.
        if candidate.strip():
            non_blank.append(candidate)

    if len(non_blank) == 0:
        raise ValueError("No hay textos válidos para generar embeddings.")

    encoded = model.encode(non_blank, batch_size=16, show_progress_bar=True)
    return encoded
def insertarenqdra(embeddings, texts, nombre_coleccion):
    """Store embeddings and their source texts in a local Qdrant collection.

    Any existing collection named ``nombre_coleccion`` is dropped and
    created again, so the collection ends up holding only the points passed
    in this call. Point ids are the positions ``0..n-1`` and each payload is
    ``{"text": <matching text>}``.

    Args:
        embeddings: Sequence of equal-length vectors (NumPy rows or plain
            sequences of floats).
        texts: Sequence of strings, aligned index-by-index with
            ``embeddings``.
        nombre_coleccion: Name of the Qdrant collection to (re)build.

    Raises:
        ValueError: If ``embeddings`` is empty, or if ``embeddings`` and
            ``texts`` differ in length (a mismatch would attach the wrong
            text to a vector).
    """
    total = len(embeddings)
    if total == 0:
        raise ValueError("No hay embeddings para insertar en Qdrant.")
    if len(texts) != total:
        raise ValueError(
            f"La cantidad de embeddings ({total}) no coincide con la de "
            f"textos ({len(texts)})."
        )

    dim = len(embeddings[0])
    points = [
        PointStruct(
            id=idx,
            vector=np.asarray(vector).tolist(),
            payload={"text": text},
        )
        for idx, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    client = QdrantClient(path="./data_v2")  # persistent on-disk storage
    try:
        # collection_exists + delete + create replaces the deprecated
        # recreate_collection call.
        if client.collection_exists(collection_name=nombre_coleccion):
            client.delete_collection(collection_name=nombre_coleccion)
        client.create_collection(
            collection_name=nombre_coleccion,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )
        client.upsert(collection_name=nombre_coleccion, points=points)
    finally:
        # Close explicitly so the storage folder is released for the next
        # client opened on the same path.
        client.close()
    print(f"✅ Insertados {len(points)} vectores en Qdrant.")
def query_qdrant(query, model, nombre_coleccion, top_k=5, umbral=None):
    """Search a local Qdrant collection for the points closest to ``query``.

    Args:
        query: Text to search for; it is embedded with ``model`` through
            ``generaremben``.
        model: Embedding model forwarded to ``generaremben``.
        nombre_coleccion: Name of the collection to search.
        top_k: Maximum number of results requested (``limit``).
        umbral: Value passed as ``score_threshold``; ``None`` applies no
            threshold.

    Returns:
        The response object returned by ``QdrantClient.query_points``,
        with payloads included.

    Raises:
        ValueError: Propagated from ``generaremben`` when ``query`` is
            empty or whitespace-only.
    """
    query_embedding = np.asarray(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path="./data_v2")
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_embedding,
            limit=top_k,
            with_payload=True,
            score_threshold=umbral,
        )
    finally:
        # Close explicitly so the storage folder is released for the next
        # client opened on the same path.
        client.close()
def query_qdrant_sinumbral(query, model, nombre_coleccion, top_k=5):
    """Search a local Qdrant collection without any score threshold.

    Thin wrapper over ``query_qdrant`` so the embedding and search logic
    lives in one place instead of being duplicated.

    Args:
        query: Text to search for.
        model: Embedding model used to encode ``query``.
        nombre_coleccion: Name of the collection to search.
        top_k: Maximum number of results requested.

    Returns:
        The response object returned by ``query_qdrant``.
    """
    # umbral=None reaches query_points as score_threshold=None, i.e. no
    # threshold is applied.
    return query_qdrant(query, model, nombre_coleccion, top_k, umbral=None)
def obtener_colecciones(path="./data_v2"):
    """List the collections in a local Qdrant storage folder.

    Args:
        path: Folder of the local Qdrant storage to open.

    Returns:
        A list of names starting with the ``"Todas las colecciones"``
        entry, followed by the name of every collection reported by the
        client.
    """
    client = QdrantClient(path=path)
    try:
        nombres = [col.name for col in client.get_collections().collections]
    finally:
        # Close explicitly so the storage folder is released for the next
        # client opened on the same path.
        client.close()
    return ["Todas las colecciones", *nombres]