"""Build a persistent ChromaDB collection from the PDF files in ./data/:
each PDF is read with pypdf, split into fixed-size character chunks, and
upserted into a collection embedded with a SentenceTransformer model."""
import os

import chromadb
from chromadb.utils import embedding_functions
from pypdf import PdfReader

# experiment with larger models
MODEL_NAME = "Salesforce/SFR-Embedding-Mistral"  # 7B-parameter Mistral-based embedder; the weights are several GB
DISTANCE_FUNCTION = "cosine"
COLLECTION_NAME = "scheme"
EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)
client = chromadb.PersistentClient(path="./chromadb_linux_two/")
print("Getting Collection")

# get_or_create_collection makes re-runs safe; "hnsw:space" applies the chosen distance function
schemer = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=EMBEDDING_FUNC,
    metadata={"hnsw:space": DISTANCE_FUNCTION},
)
print(f"Number of entries in collection: {schemer.count()}")


###########################################################################
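# ---- PDF text extraction helpers ----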
def get_text(pdf_path: str) -> str:
    """Extract the text from every page of a PDF."""
    doc = PdfReader(pdf_path)
    text_content = ''

    for page in doc.pages:
        text_content += page.extract_text() or ''
    return text_content


def clean_text(text: str) -> str:
    """Collapse newlines so each chunk sits on a single line."""
    return text.replace('\n', ' ')

# read every PDF under ./data/ into one long string per file
files = os.listdir('./data/')
dataset = []

for file in files:
    if file.endswith(".pdf"):
        text_content = get_text(os.path.join('data', file))
        dataset.append(text_content)
        print(file)

batch_size = 1024       # characters per chunk
padding_element = '.'   # filler for the final, short chunk of each document
batch_documents = []
batch_ids = []
batch_metadata = []

for i, document in enumerate(dataset):

    # split the document into fixed-size character chunks
    for j in range(0, len(document), batch_size):
        try:
            batch = document[j:j + batch_size]

            if len(batch) < batch_size:  # pad the last chunk up to batch_size
                padding_needed = batch_size - len(batch)
                batch = batch + padding_element * padding_needed

            print(f"Doc {i+1}/{len(dataset)}: Batch {j}/{len(document)}")
            text = clean_text(batch)
            batch_documents.append(text)
            # separators keep ids unique across documents and chunk offsets
            batch_ids.append(f'batch-{i}-{j}')
            batch_metadata.append({"length": len(batch)})

        except Exception as e:
            print(f"Error processing batch {j} of document {i}: {e}")

print("Upserting into collection")
schemer.upsert(
    ids=batch_ids,
    metadatas=batch_metadata,
    documents=batch_documents,
)
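
# Optional sanity check -- a minimal sketch, not part of the ingestion flow above.
# The query string is only a placeholder; any question about the indexed PDFs works.
results = schemer.query(
    query_texts=["example question about the indexed documents"],
    n_results=3,
)
for chunk_id, chunk in zip(results["ids"][0], results["documents"][0]):
    print(chunk_id, chunk[:80])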