# %%
import logging
import os
import pathlib
import tempfile
from hashlib import sha256

import cloudpickle
import nltk
import torch
from chromadb.utils import embedding_functions
from langchain.document_loaders import OnlinePDFLoader, UnstructuredPDFLoader
from langchain.text_splitter import NLTKTextSplitter

from load_model import load_embedding, load_vectorstore

current_path = str(pathlib.Path(__file__).parent.resolve())
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
nltk.download("punkt")

persist_directory = current_path + "/VectorStore"
logger = logging.getLogger()
# %%
def create_collection(collection_name, model_name, client):
    """Create a Chroma collection that embeds with an Instructor model. Not used atm."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ef = embedding_functions.InstructorEmbeddingFunction(
        model_name=model_name, device=device
    )
    client.get_or_create_collection(collection_name, embedding_function=ef)
    return True

def create_and_add(collection_name, sub_docs, model_name, metadata):
    """Embed the given document chunks and add them to a persistent vector store."""
    logger.info(f"Adding documents to {collection_name}")
    embeddings = load_embedding(model_name)
    vectorstore = load_vectorstore(model_name, collection_name, metadata=metadata)
    vectorstore.add_documents(documents=sub_docs, embedding=embeddings)
    vectorstore.persist()

    # Sanity check: reload the persisted store and run a test query against it
    vectorstore2 = load_vectorstore(model_name, collection_name, metadata=metadata)
    print(vectorstore2.similarity_search_with_score(query="What is a transformer llm?", k=4))
    return True
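
# %%
# Illustrative sketch (the collection and model names here are placeholder
# assumptions, not values the app uses): once persisted, a collection can be
# reloaded in a later session and queried without re-embedding anything.
# vs = load_vectorstore("hkunlp/instructor-large", "my_papers", metadata={"owner": "Heiko Wagner"})
# print(vs.similarity_search_with_score(query="What is attention?", k=2))
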
def load_from_file(files):
    """Write uploaded files into a temporary directory and parse them as PDFs."""
    saved_files = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        temp_dir = pathlib.Path(tmpdirname)
        for file in files:
            file_name = os.path.join(temp_dir, file.name)
            saved_files.append(file_name)
            with open(file_name, mode="wb") as w:
                w.write(file.read())
        logger.info(f"Saved uploads to {saved_files}")
        # Parse inside the context manager, while the temporary files still exist
        loaders = [UnstructuredPDFLoader(pdf) for pdf in saved_files]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
    return docs

def load_from_web(urls, cache=True):
    """Download PDFs from the given URLs, caching the parsed documents on disk."""
    cache_dir = f"{current_path}/.cache"
    os.makedirs(cache_dir, exist_ok=True)
    filename = f"{cache_dir}/{sha256(str(urls).encode('utf-8')).hexdigest()}.pkl"
    if cache and os.path.isfile(filename):
        logger.info("Using Cache")
        with open(filename, "rb") as pikd:
            docs = cloudpickle.load(pikd)
    else:
        loaders = [OnlinePDFLoader(pdf) for pdf in urls]
        docs = []
        for loader in loaders:
            docs.extend(loader.load())
        with open(filename, "wb") as output:
            cloudpickle.dump(docs, output)
    # Update metadata: OnlinePDFLoader yields one document per URL
    for i, doc in enumerate(docs):
        doc.metadata = {"source": urls[i], "url": urls[i], "owner": "Heiko Wagner"}
    return docs

def load_and_split(docs, chunk_size=700):
    """Split documents into sentence-aligned chunks with the NLTK splitter."""
    text_splitter = NLTKTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    sub_docs = text_splitter.split_documents(docs)
    return sub_docs
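
# %%
# Minimal sketch of the splitter's behavior (the Document import path is an
# assumption tied to the pinned langchain version): NLTKTextSplitter cuts on
# sentence boundaries, so each chunk ends with a complete sentence.
# from langchain.docstore.document import Document
# sample = [Document(page_content="Attention is all you need. It scales well.", metadata={"source": "inline"})]
# for chunk in load_and_split(sample, chunk_size=30):
#     print(chunk.page_content)
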
def metadata_generator(doc, llm, max_token=4000):
    """Ask the LLM to categorize a document and return a JSON-like summary."""
    query = f"""
Cluster the following input document into topic categories based on patterns seen within the text. Also explain the reasoning behind how these categories were defined.
Output format:
{{
    "DOCUMENT TYPE": "",
    "SUMMARY": [],
    "REASONING": ""
}}
Input document:
{doc.page_content[:max_token]}
Output:
"""
    return llm(query)
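
# %%
if __name__ == "__main__":
    # End-to-end sketch of the ingestion pipeline. The URL, model name, and
    # collection name below are illustrative placeholders, not values taken
    # from the app.
    docs = load_from_web(["https://arxiv.org/pdf/1706.03762.pdf"])
    sub_docs = load_and_split(docs, chunk_size=700)
    create_and_add(
        collection_name="demo_docs",
        sub_docs=sub_docs,
        model_name="hkunlp/instructor-large",
        metadata={"owner": "Heiko Wagner"},
    )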