import glob
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from transformers import AutoTokenizer
from torch import cuda
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
device = 'cuda' if cuda.is_available() else 'cpu'
#from dotenv import load_dotenv
#load_dotenv()

#HF_token = os.environ["HF_TOKEN"]
path_to_data = "./data/"


def process_pdf(files=None, chunk_size=256):
    """Load PDF reports, split them into token-sized chunks, embed them,
    and index each document family in an in-memory Qdrant collection.

    Parameters
    ----------
    files : dict[str, str] | None
        Mapping of document name -> PDF path. Each name must contain one
        of the family keys ('Consolidated' or 'MWTS') and end with a
        4-digit year (used to tag chunk metadata). Defaults to the three
        bundled ./data reports, preserving the original behavior.
    chunk_size : int
        Maximum chunk length, measured in tokens of the
        BAAI/bge-small-en-v1.5 tokenizer.

    Returns
    -------
    dict
        Family name -> in-memory ``Qdrant`` vector store.
    """
    if files is None:
        files = {
            'MWTS2021': './data/MWTS2021.pdf',
            'MWTS2022': './data/MWTS2022.pdf',
            'Consolidated2021': './data/Consolidated2021.pdf',
        }

    # Load each PDF best-effort: a missing or corrupt file is reported
    # and skipped so the remaining documents are still indexed.
    docs = {}
    for name, path in files.items():
        try:
            docs[name] = PyMuPDFLoader(path).load()
        except Exception as e:
            print("Exception: ", e)

    # Split using the embedding model's own tokenizer so chunks fit the
    # transformer's context window exactly.
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    # Bucket chunks by document family in one pass; extend() replaces the
    # original append-lists-then-flatten second pass.
    all_documents = {'Consolidated': [], 'MWTS': []}
    for name, loaded in docs.items():
        doc_processed = text_splitter.split_documents(loaded)
        for doc in doc_processed:
            doc.metadata["source"] = name
            doc.metadata["year"] = name[-4:]  # names are expected to end in a 4-digit year
        for key in all_documents:
            if key in name:
                print(key)
                all_documents[key].extend(doc_processed)

    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name="BAAI/bge-small-en-v1.5",
    )

    # One in-memory Qdrant collection per document family.
    qdrant_collections = {}
    for key, chunks in all_documents.items():
        print("embeddings for:", key)  # fixed typo in log message ("emebddings")
        qdrant_collections[key] = Qdrant.from_documents(
            chunks,
            embeddings,
            location=":memory:",
            collection_name=key,
        )
    print("done")
    return qdrant_collections