"""Helpers to extract text from PDFs, split it into chunks and embed them."""

import textract
from datasets import Dataset as hfd
from sentence_transformers import SentenceTransformer

from config import FEATURE_EXTRACTOR_CHECKPOINT

# Loaded once at import time and shared by every call to encode_sentence.
FEATURE_EXTRACTOR = SentenceTransformer(FEATURE_EXTRACTOR_CHECKPOINT)


def encode_sentence(instance: hfd, text_col: str):
    """Embed the text stored under ``text_col`` of ``instance``.

    Args:
        instance: Object indexable by ``text_col``.
            NOTE(review): presumably a single row (or batch) handed over by
            ``Dataset.map`` -- confirm against the caller.
        text_col: Key of the text to encode.

    Returns:
        A dict with a single ``"embedding"`` key holding the output of
        ``FEATURE_EXTRACTOR.encode`` called with ``normalize_embeddings=True``.
    """
    return {
        "embedding": FEATURE_EXTRACTOR.encode(
            instance[text_col], normalize_embeddings=True
        )
    }


def parse_pdf(pdf_path: str):
    """Return the text of a PDF file, extracted with textract (pdfminer).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    # NOTE(review): the output is requested as latin-1 bytes but decoded with
    # the default codec of ``bytes.decode`` (UTF-8). Confirm this round-trip
    # is intentional before changing either side.
    txt = textract.process(pdf_path, method="pdfminer", encoding="latin-1").decode()
    return txt


def chunk_text(text: str, split_sentence="ARTÍCULO"):
    """Split ``text`` into chunks delimited by ``split_sentence``.

    Newlines inside a chunk are replaced by spaces and surrounding whitespace
    is stripped. Chunks that are empty after this clean-up are dropped.

    Args:
        text: Full text to split.
        split_sentence: Marker that opens every chunk. It is removed by the
            split and put back in front of each chunk that followed it.

    Returns:
        A list of ``{"chunk": <text>}`` dicts in document order. Text found
        before the first marker is kept as its own chunk *without* the
        marker, since it was never preceded by one.
    """
    # str.split always yields a leading piece: whatever precedes the first
    # marker (an empty string when the text starts with the marker).
    preamble, *sections = text.split(split_sentence)

    chunks = []
    preamble = preamble.replace("\n", " ").strip()
    if preamble:
        chunks.append({"chunk": preamble})

    for section in sections:
        body = section.replace("\n", " ").strip()
        # Skip marker-only pieces (e.g. two adjacent markers or a trailing
        # marker) so no chunk consists of the bare marker alone.
        if body:
            chunks.append({"chunk": split_sentence + " " + body})
    return chunks


def create_df(text_chunks: list[dict[str, str]]):
    """Create a HuggingFace dataset from a list of ``dict[str, str]`` items.

    Args:
        text_chunks: One dict per row, e.g. the output of ``chunk_text``.

    Returns:
        The result of ``Dataset.from_list(text_chunks)``.
    """
    return hfd.from_list(text_chunks)