import re
import uuid

import chromadb
import fitz
import torch
from chromadb.utils import embedding_functions
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import CrossEncoder

# Name of the sentence-transformers embedding model.
# NOTE(review): this constant is not referenced in the visible source; the
# embedding function below passes the short model name as a literal instead.
emb_model_name = "sentence-transformers/all-mpnet-base-v2"

# Embedding function handed to the Chroma collection created below.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

# Cross-encoder model; presumably used to re-rank retrieved passages further
# down the file -- its use is not visible here, confirm.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Persistent Chroma client stored under the relative path '.vectorstore', and
# the 'huerto' collection configured with the "cosine" HNSW space.
client = chromadb.PersistentClient(path='.vectorstore')
collection = client.get_or_create_collection(name='huerto',embedding_function=sentence_transformer_ef,metadata={"hnsw:space": "cosine"})


def parse_pdf(file):
    """Transform a PDF into a list.

    Opens ``file`` with ``fitz.open`` and walks its pages, cleaning up the
    text extracted from each one.

    NOTE(review): the visible source is truncated inside the page loop, so
    the remaining clean-up steps and the way ``output`` is filled and
    returned cannot be documented from here.

    Args:
        file: Value passed straight to ``fitz.open`` (presumably a path to a
            PDF file -- confirm against the callers).
    """
    pdf = fitz.open(file)
    output = []
    for page_num in range(pdf.page_count):
        page = pdf[page_num]
        text = page.get_text()
        # Merge hyphenated words
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Fix newlines in the middle of sentences
        # NOTE(review): the source is cut off mid-statement at this point. The
        # fragment is kept verbatim in the comment below; restore the rest of
        # the function from the original file before using this module.
        # text = re.sub(r"(?