File size: 2,779 Bytes
b4bdfee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afc2218
b4bdfee
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
Indexing with vector database
"""

from pathlib import Path
import re

import chromadb

from unidecode import unidecode

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings



# Load PDF document and create doc splits
def load_doc(list_file_path, chunk_size, chunk_overlap):
    """Load PDF document and create doc splits"""

    loaders = [PyPDFLoader(x) for x in list_file_path]
    pages = []
    for loader in loaders:
        pages.extend(loader.load())
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    doc_splits = text_splitter.split_documents(pages)
    return doc_splits


# Generate collection name for vector database
#  - Use filepath as input, ensuring unicode text
#  - Handle multiple languages (arabic, chinese)
def create_collection_name(filepath):
    """Create collection name for vector database"""

    # Extract filename without extension
    collection_name = Path(filepath).stem
    # Fix potential issues from naming convention
    ## Remove space
    collection_name = collection_name.replace(" ", "-")
    ## ASCII transliterations of Unicode text
    collection_name = unidecode(collection_name)
    ## Remove special characters
    collection_name = re.sub("[^A-Za-z0-9]+", "-", collection_name)
    ## Limit length to 50 characters
    collection_name = collection_name[:50]
    ## Minimum length of 3 characters
    if len(collection_name) < 3:
        collection_name = collection_name + "xyz"
    ## Enforce start and end as alphanumeric character
    if not collection_name[0].isalnum():
        collection_name = "A" + collection_name[1:]
    if not collection_name[-1].isalnum():
        collection_name = collection_name[:-1] + "Z"
    print("\n\nFilepath: ", filepath)
    print("Collection name: ", collection_name)
    return collection_name


# Create vector database
def create_db(splits, collection_name):
    """Create embeddings and vector database"""

    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        # model_name="sentence-transformers/all-MiniLM-L6-v2",
        # model_kwargs={"device": "cpu"},
        # encode_kwargs={'normalize_embeddings': False}
    )
    chromadb.api.client.SharedSystemClient.clear_system_cache()
    new_client = chromadb.EphemeralClient()
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        client=new_client,
        collection_name=collection_name,
        # persist_directory=default_persist_directory
    )
    return vectordb