|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain.schema import Document
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import PointStruct, Distance, VectorParams
|
|
import fitz
|
|
from qdrant_client import QdrantClient
|
|
import numpy as np
|
|
import streamlit as st
|
|
|
|
def pdfachunk(file, chunk_size_pages=20):
    """Extract the text of a PDF and group it into chunks of consecutive pages.

    Args:
        file: Binary file-like object whose ``read()`` returns the raw PDF
            bytes.
        chunk_size_pages: Number of consecutive pages concatenated into each
            chunk. Must be at least 1.

    Returns:
        A list of strings in document order, one per group of
        ``chunk_size_pages`` pages. The last chunk covers fewer pages when
        the page count is not a multiple of ``chunk_size_pages``.

    Raises:
        ValueError: If ``chunk_size_pages`` is smaller than 1.
    """
    # Fail fast, before consuming the stream: a zero step would make range()
    # raise later on, and a negative step would silently yield no chunks.
    if chunk_size_pages < 1:
        raise ValueError("chunk_size_pages debe ser mayor o igual que 1.")

    chunks = []
    # The context manager closes the document even if text extraction fails.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        total_pages = len(doc)
        for start in range(0, total_pages, chunk_size_pages):
            stop = min(start + chunk_size_pages, total_pages)
            # join() avoids repeated string reallocation on large page groups.
            chunks.append(
                "".join(doc[page_num].get_text() for page_num in range(start, stop))
            )
    return chunks
|
|
|
|
def split_chunks(raw_chunks, chunk_size=1024, chunk_overlap=100):
    """Split raw text chunks into smaller pieces with a recursive splitter.

    Args:
        raw_chunks: Iterable of strings; each one is wrapped in a
            ``Document`` as its ``page_content``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` on
        the wrapped documents.
    """
    # Wrap every raw string first, exactly one Document per input chunk.
    wrapped_docs = []
    for raw_text in raw_chunks:
        wrapped_docs.append(Document(page_content=raw_text))

    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", ".", " "],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    return text_splitter.split_documents(wrapped_docs)
|
|
|
|
def generaremben(model, texts):
    """Encode the non-blank entries of ``texts`` with ``model``.

    Args:
        model: Object exposing
            ``encode(texts, batch_size=..., show_progress_bar=...)``.
        texts: Iterable of strings. Entries that are empty or contain only
            whitespace are dropped before encoding.

    Returns:
        Whatever ``model.encode`` returns for the remaining texts. Because
        blank entries are dropped, positions in the result may not line up
        with positions in the original ``texts``.

    Raises:
        ValueError: If no text is left after dropping the blank entries.
    """
    # filter() keeps an entry only when its stripped form is non-empty.
    usable_texts = list(filter(lambda entry: entry.strip(), texts))

    if len(usable_texts) == 0:
        raise ValueError("No hay textos válidos para generar embeddings.")

    encode_options = {"batch_size": 16, "show_progress_bar": True}
    return model.encode(usable_texts, **encode_options)
|
|
|
|
def insertarenqdra(embeddings, texts, nombre_coleccion):
    """Rebuild a Qdrant collection and store one point per embedding.

    Any existing collection called ``nombre_coleccion`` in the local
    ``./data_v2`` storage is dropped and created again with cosine distance
    and a vector size equal to the length of the first embedding. Point ``i``
    holds ``embeddings[i]`` and the payload ``{"text": texts[i]}``.

    Args:
        embeddings: Sequence of vectors (NumPy arrays or plain sequences of
            numbers), all expected to share the same length.
        texts: Sequence of payload texts, aligned by position with
            ``embeddings``. Entries beyond ``len(embeddings)`` are ignored.
        nombre_coleccion: Name of the collection to rebuild.

    Raises:
        ValueError: If ``embeddings`` is empty or there are fewer texts than
            embeddings.
    """
    # len() instead of truthiness: the truth value of a NumPy array with more
    # than one element is ambiguous and raises.
    total = len(embeddings)
    if total == 0:
        raise ValueError("No hay embeddings para insertar en Qdrant.")
    if len(texts) < total:
        raise ValueError(
            f"Hay {total} embeddings pero solo {len(texts)} textos."
        )

    dim = len(embeddings[0])

    # Build every point BEFORE touching the storage, so malformed input can
    # never leave the existing collection wiped and empty.
    points = [
        PointStruct(
            id=idx,
            vector=np.asarray(vector).tolist(),
            payload={"text": text},
        )
        for idx, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    client = QdrantClient(path="./data_v2")
    try:
        # Explicit drop + create replaces the deprecated recreate_collection().
        if client.collection_exists(nombre_coleccion):
            client.delete_collection(nombre_coleccion)
        client.create_collection(
            collection_name=nombre_coleccion,
            vectors_config=VectorParams(size=dim, distance=Distance.COSINE),
        )
        client.upsert(collection_name=nombre_coleccion, points=points)
    finally:
        # Local (path=...) storage is locked per client instance; release it
        # deterministically instead of waiting for garbage collection.
        client.close()

    print(f"✅ Insertados {len(points)} vectores en Qdrant.")
|
|
|
|
def query_qdrant(query, model, nombre_coleccion, top_k, umbral):
    """Search a local Qdrant collection for the points closest to ``query``.

    Args:
        query: Query text; it is embedded with ``generaremben``.
        model: Embedding model forwarded to ``generaremben``.
        nombre_coleccion: Name of the collection to search.
        top_k: Maximum number of points to return (passed as ``limit``).
        umbral: Value passed to Qdrant as ``score_threshold``.

    Returns:
        The response object returned by ``QdrantClient.query_points``, with
        payloads included.

    Raises:
        ValueError: Propagated from ``generaremben`` when ``query`` is empty
            or whitespace-only.
    """
    # Qdrant expects a plain list of floats, whatever the model returned.
    query_vector = np.asarray(generaremben(model, [query])[0]).tolist()

    client = QdrantClient(path="./data_v2")
    try:
        return client.query_points(
            collection_name=nombre_coleccion,
            query=query_vector,
            limit=top_k,
            with_payload=True,
            score_threshold=umbral,
        )
    finally:
        # Local (path=...) storage is locked per client instance; release it
        # deterministically so the next call can open the same folder.
        client.close()
|
|
|
|
def query_qdrant_sinumbral(query, model, nombre_coleccion, top_k=5):
    """Search a local Qdrant collection without a minimum-score filter.

    Thin wrapper around ``query_qdrant`` so the embedding and search logic
    lives in one place.

    Args:
        query: Query text to embed and search for.
        model: Embedding model forwarded to ``query_qdrant``.
        nombre_coleccion: Name of the collection to search.
        top_k: Maximum number of points to return.

    Returns:
        The response object returned by ``QdrantClient.query_points``, with
        payloads included.

    Raises:
        ValueError: Propagated when ``query`` is empty or whitespace-only.
    """
    # score_threshold=None is the query_points default, so passing it
    # explicitly is equivalent to omitting the argument.
    return query_qdrant(query, model, nombre_coleccion, top_k, umbral=None)
|
|
|
|
|
|
def obtener_colecciones(path="./data_v2"):
    """List the collection names stored in a local Qdrant folder.

    Args:
        path: Directory of the local Qdrant storage.

    Returns:
        A list that starts with the entry ``"Todas las colecciones"``
        followed by the name of every collection reported by the client.
    """
    client = QdrantClient(path=path)
    try:
        names = [col.name for col in client.get_collections().collections]
    finally:
        # Local (path=...) storage is locked per client instance; release it
        # deterministically so later calls can open the same folder.
        client.close()

    return ["Todas las colecciones", *names]
|
|
|
|
|
|
|
|
|
|
|