# %%
import logging
import os
import pathlib
import tempfile
from hashlib import sha256

import cloudpickle
import nltk
import torch
from chromadb.utils import embedding_functions
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.text_splitter import NLTKTextSplitter

from load_model import load_embedding, load_vectorstore


current_path = str(pathlib.Path(__file__).parent.resolve())

# Allow duplicate OpenMP runtimes to coexist (common workaround for libomp conflicts).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
nltk.download('punkt')  # sentence tokenizer used by NLTKTextSplitter

persist_directory = current_path + "/VectorStore"
logger = logging.getLogger()


# %%

def create_collection(collection_name, model_name, client):
    """Create a Chroma collection backed by an Instructor embedding function. Not used at the moment."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ef = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name, device=device)
    client.get_or_create_collection(collection_name, embedding_function=ef)
    return True

def create_and_add(collection_name, sub_docs, model_name, metadata):
    """Embed the given document chunks and persist them into the named collection."""
    logger.info(f"Adding documents to {collection_name}")
    embeddings = load_embedding(model_name)
    vectorstore = load_vectorstore(model_name, collection_name, metadata=metadata)
    vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
    vectorstore.persist()

    # Smoke test: reload the store and run a sample similarity search.
    vectorstore2 = load_vectorstore(model_name, collection_name, metadata=metadata)
    print(vectorstore2.similarity_search_with_score(query="What are AXA's green goals?", k=4))

    return True

def load_from_file(files):
    """Write uploaded file objects into a temporary directory and parse them as PDFs."""
    saved_files = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        temp_dir = pathlib.Path(tmpdirname)
        for file in files:
            file_name = os.path.join(temp_dir, file.name)
            saved_files.append(file_name)
            with open(file_name, mode='wb') as w:
                w.write(file.read())

        logger.info(f"Parsing {saved_files}")
        loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
    return docs

def load_from_web(urls, cache=True):
    """Download and parse PDFs from the given URLs, caching the parsed documents on disk."""
    filename = f"{current_path}/.cache/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"

    if cache and os.path.isfile(filename):
        logger.info("Using cache")
        with open(filename, "rb") as pikd:
            docs = cloudpickle.load(pikd)
    else:
        loaders = [OnlinePDFLoader(pdf) for pdf in urls]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as output:
            cloudpickle.dump(docs, output)

    # Attach source metadata; assumes each URL yields exactly one document.
    for doc, url in zip(docs, urls):
        doc.metadata = {'source': url, 'url': url, 'owner': 'Heiko Wagner'}
    return docs
        
def load_and_split(docs, chunk_size=700):
    """Split documents into sentence-aligned chunks of roughly chunk_size characters."""
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    sub_docs = text_splitter.split_documents(docs)
    return sub_docs

def metadata_generator(doc, llm, max_token=4000):
    """Ask the LLM to categorize and summarize a document, returning its raw response."""
    # Note: despite the parameter name, the slice truncates characters, not tokens.
    query = f"""
    Cluster the following input document into topic categories based on patterns seen within the text. Also mention the reasoning behind how these categories were defined.
Output format:
{{
"DOCUMENT TYPE": "",
"SUMMARY": [],
"REASONING": ""
}}

Input document:
{doc.page_content[:max_token]}
Output:
    """
    return llm(query)
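
# %%
# Usage sketch (not part of the original pipeline): wires the helpers above
# together end to end. The URL, collection name, and model name below are
# hypothetical placeholders, and the expected shape of `metadata` depends on
# the local load_model.load_vectorstore implementation.
if __name__ == "__main__":
    example_urls = ["https://example.com/annual-report.pdf"]  # placeholder URL
    docs = load_from_web(example_urls)               # download and cache the PDFs
    sub_docs = load_and_split(docs, chunk_size=700)  # sentence-aware chunking
    create_and_add(
        collection_name="example_collection",  # placeholder collection name
        sub_docs=sub_docs,
        model_name="hkunlp/instructor-large",  # assumed embedding model id
        metadata=None,                         # assumption: may be None
    )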