Spaces:
Runtime error
Runtime error
import glob | |
import os | |
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter | |
from transformers import AutoTokenizer | |
from langchain_community.document_loaders import PyMuPDFLoader | |
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings | |
from langchain_community.vectorstores import Qdrant | |
#from dotenv import load_dotenv | |
#load_dotenv() | |
#HF_token = os.environ["HF_TOKEN"] | |
# Base directory for the input PDFs.
# NOTE(review): nothing in the visible code reads this constant -- process_pdf()
# spells out its own "./data/..." paths -- confirm whether other modules use it.
path_to_data = "./data/"
def process_pdf():
    """Load the report PDFs, chunk them by token count and index them in Qdrant.

    Each PDF listed in ``files`` is loaded with ``PyMuPDFLoader``, split into
    chunks of at most ``chunk_size`` tokens (measured with the
    ``BAAI/bge-small-en-v1.5`` tokenizer), embedded with
    ``sentence-transformers/all-mpnet-base-v2`` and stored in its own
    in-memory Qdrant collection named after the file's key.

    A PDF that fails to load is reported on stdout and skipped; the remaining
    files are still processed.

    Returns:
        dict: maps each successfully loaded file key (e.g. ``'ABC'``) to the
        ``Qdrant`` vector store holding that file's chunks. The stores live
        only in memory, so this return value is the only handle to them.
    """
    files = {'ABC':'./data/MWTS2021.pdf',
             'XYZ':'./data/MWTS2022.pdf'}

    docs = {}
    for file, value in files.items():
        try:
            docs[file] = PyMuPDFLoader(value).load()
        except Exception as e:
            # Best effort: report the failure and carry on with the other files.
            print("Exception: ", e)

    # text splitter based on the tokenizer of a model of your choosing
    # to make texts fit exactly a transformer's context window size
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    # NOTE(review): the splitter counts tokens with the bge-small tokenizer while
    # the embeddings below come from all-mpnet-base-v2 -- confirm this mismatch
    # is intended.
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    all_documents = {}
    for file, value in docs.items():
        doc_processed = text_splitter.split_documents(value)
        for doc in doc_processed:
            # Replace the loader-provided source with the short file key.
            doc.metadata["source"] = file
        all_documents[file] = doc_processed
    print(all_documents.keys())

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    qdrant_collections = {}
    for file, value in all_documents.items():
        print("emebddings for:",file)
        qdrant_collections[file] = Qdrant.from_documents(
            value,
            embeddings,
            location=":memory:",
            collection_name=file,
        )
    print("done")

    # The original ended with a bare ``return``, which threw away the
    # in-memory collections that were just built; hand them to the caller.
    return qdrant_collections