import hashlib
import urllib.parse
import uuid
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
from loguru import logger

from app.config.models.configs import Document, Config
from app.parsers.markdown import markdown_splitter

# Files are hashed in 64 KiB blocks to keep memory usage constant.
HASH_BLOCKSIZE = 65536


class DocumentSplitter:
    def __init__(self, config: Config) -> None:
        self.document_path_settings = config.embeddings.document_settings
        self.chunk_sizes = config.embeddings.chunk_sizes

    def split(
        self,
        limit: Optional[int] = None,
    ) -> Tuple[List[Document], pd.DataFrame, pd.DataFrame]:
        """Split all configured documents into chunks.

        Returns the chunked documents together with a file-hash -> filename
        mapping and a file-hash -> document-id mapping.
        """
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for setting in self.document_path_settings:
            passage_prefix = setting.passage_prefix
            docs_path = Path(setting.doc_path)
            extension = "md"

            for chunk_size in self.chunk_sizes:
                paths = list(docs_path.glob(f"**/*.{extension}"))
                additional_parser_settings = setting.additional_parser_settings.get(
                    extension, dict()
                )
                (
                    docs,
                    hf_mappings,
                    hd_mappings,
                ) = self._get_documents_from_custom_splitter(
                    document_paths=paths,
                    splitter_func=markdown_splitter,
                    max_size=chunk_size,
                    passage_prefix=passage_prefix,
                    **additional_parser_settings,
                )
                all_docs.extend(docs)
                hash_filename_mappings.extend(hf_mappings)
                hash_docid_mappings.extend(hd_mappings)

        all_hash_filename_mappings = pd.DataFrame(hash_filename_mappings)
        all_hash_docid_mappings = pd.concat(hash_docid_mappings, axis=0)

        if limit:
            all_docs = all_docs[:limit]
            all_hash_filename_mappings = all_hash_filename_mappings[:limit]
            all_hash_docid_mappings = all_hash_docid_mappings[:limit]

        return all_docs, all_hash_filename_mappings, all_hash_docid_mappings
    def _get_documents_from_custom_splitter(
        self,
        document_paths: List[Path],
        splitter_func,
        max_size,
        passage_prefix: str,
        **additional_kwargs,
    ) -> Tuple[List[Document], List[dict], List[pd.DataFrame]]:
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for path in document_paths:
            if path.suffix != ".md":
                continue

            filepath = str(path)
            # path.stem strips the extension correctly; the previous
            # `replace(f".{path.suffix}", "")` never matched because
            # Path.suffix already includes the leading dot.
            filename = path.stem
            additional_kwargs.update({"filename": filepath})
            docs_data = splitter_func(path, max_size, **additional_kwargs)
            file_hash = get_md5_hash(path)

            # URL-encode the path so it is safe to embed as a source link,
            # without shadowing the Path object itself.
            source_path = urllib.parse.quote(str(path))
            logger.info(source_path)

            docs = [
                Document(
                    page_content=passage_prefix + d["text"],
                    metadata={
                        **d["metadata"],
                        "source": source_path,
                        "chunk_size": max_size,
                        "document_id": str(uuid.uuid1()),
                        "label": filename,
                    },
                )
                for d in docs_data
            ]

            # Normalize missing page numbers so downstream consumers always
            # see an integer.
            for d in docs:
                if "page" in d.metadata and d.metadata["page"] is None:
                    d.metadata["page"] = -1

            all_docs.extend(docs)
            hash_filename_mappings.append(dict(filename=filepath, filehash=file_hash))

            df_hash_docid = (
                pd.DataFrame()
                .assign(docid=[d.metadata["document_id"] for d in docs])
                .assign(filehash=file_hash)
            )
            hash_docid_mappings.append(df_hash_docid)

        logger.info(f"Got {len(all_docs)} nodes.")
        return all_docs, hash_filename_mappings, hash_docid_mappings


def get_md5_hash(file_path: Path) -> str:
    hasher = hashlib.md5()
    with open(file_path, "rb") as file:
        buf = file.read(HASH_BLOCKSIZE)
        while buf:
            hasher.update(buf)
            buf = file.read(HASH_BLOCKSIZE)
    return hasher.hexdigest()
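

# Usage sketch, kept as a comment so the module stays importable. It assumes
# a Config instance built by this app's own configuration loader;
# `load_config` below is hypothetical and stands in for whatever mechanism
# actually constructs Config in this project.
#
#     config = load_config("config.yaml")
#     splitter = DocumentSplitter(config)
#     docs, hash_to_filename, hash_to_docid = splitter.split(limit=100)
#     logger.info(f"Produced {len(docs)} chunks.")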