|
|
|
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
|
|
# Embedding model; its tokenizer also drives token-based chunk sizing below.
EMBEDDING_MODEL_NAME = "OrdalieTech/Solon-embeddings-large-0.1"
|
|
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
    separator: Optional[List[str]] = None,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the
    resulting chunks as a list of documents, with exact duplicates removed.
    """
    # Chunk on token counts (not characters) using the embedding model's own
    # tokenizer, with a 10% overlap between consecutive chunks.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=separator,  # None falls back to the splitter's default separators
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Keep only the first occurrence of each chunk text.
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique
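

# Usage sketch: how split_documents fits into a minimal indexing pipeline with
# the modules imported above. The PDF path and the chunk_size / k values are
# illustrative assumptions, not part of the original code.
if __name__ == "__main__":
    # Load the PDF page by page (one LangchainDocument per page).
    loader = PyPDFLoader("my_document.pdf")
    pages = loader.load()

    # Chunk the pages into passages of at most 512 tokens of the embedding model.
    chunks = split_documents(chunk_size=512, knowledge_base=pages)

    # Embed the chunks and index them in a FAISS vector store (requires faiss-cpu).
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vector_store = FAISS.from_documents(chunks, embeddings)

    # Retrieve the chunks most similar to a query.
    for doc in vector_store.similarity_search("What is this document about?", k=3):
        print(doc.metadata, doc.page_content[:200])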