import textract
from datasets import Dataset as hfd
from sentence_transformers import SentenceTransformer

from config import FEATURE_EXTRACTOR_CHECKPOINT

# Shared sentence-embedding model, instantiated once when this module is
# imported from the checkpoint named in config; encode_sentence() uses it.
FEATURE_EXTRACTOR = SentenceTransformer(FEATURE_EXTRACTOR_CHECKPOINT)


def encode_sentence(instance: hfd, text_col: str):
    """Embed the text found under ``text_col`` of ``instance``.

    Args:
        instance: Object indexable by column name (``instance[text_col]``).
        text_col: Name of the column holding the text to embed.

    Returns:
        A dict with a single ``"embedding"`` key containing the output of
        ``FEATURE_EXTRACTOR.encode`` called with ``normalize_embeddings=True``.
    """
    text = instance[text_col]
    embedding = FEATURE_EXTRACTOR.encode(text, normalize_embeddings=True)
    return {"embedding": embedding}


def parse_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file using textract's pdfminer backend.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    # textract.process returns bytes. The previous code asked for latin-1
    # output but decoded with the default UTF-8 codec, which fails on (or
    # garbles) accented characters such as the "Í" in "ARTÍCULO". Rely on
    # textract's default UTF-8 output and decode with the matching codec.
    raw = textract.process(pdf_path, method="pdfminer")
    return raw.decode("utf-8")


def chunk_text(text: str, split_sentence="ARTÍCULO") -> list[dict[str, str]]:
    """Split ``text`` into chunks, one per occurrence of ``split_sentence``.

    Newlines inside a chunk are replaced by spaces and surrounding
    whitespace is stripped. Every chunk that followed a marker is re-prefixed
    with ``split_sentence`` (the marker itself is consumed by the split).

    Args:
        text: Full text to split.
        split_sentence: Marker that starts each chunk.

    Returns:
        A list of ``{"chunk": <text>}`` dicts. Text appearing before the
        first marker is kept as its own chunk *without* the marker prefix;
        chunks whose body is empty are omitted.
    """
    # str.split always yields the text before the first marker as element 0.
    # It did not follow a marker, so it must not be prefixed with one.
    preamble, *bodies = text.split(split_sentence)

    chunks = []
    lead = preamble.replace("\n", " ").strip()
    if lead:
        chunks.append({"chunk": lead})

    for body in bodies:
        cleaned = body.replace("\n", " ").strip()
        # Skip markers with nothing after them instead of emitting a bare
        # "<marker> " chunk.
        if cleaned:
            chunks.append({"chunk": f"{split_sentence} {cleaned}"})
    return chunks


def create_df(text_chunks: list[dict[str, str]]):
    """Create a HuggingFace dataset from a list of ``str -> str`` dicts.

    Args:
        text_chunks: List of dicts mapping string keys to string values.

    Returns:
        The dataset produced by ``Dataset.from_list(text_chunks)``.
    """
    return hfd.from_list(text_chunks)