from langchain import HuggingFacePipeline
from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer


def prepare_data(db_path, llm_path, data_dir="./data/"):
    """Index the HTML documents into a persisted Chroma store and build the LLM.

    The files under ``data_dir`` are loaded with ``BSHTMLLoader``, split on
    newlines into chunks sized with the bloomz tokenizer (``chunk_size=100``,
    no overlap), embedded with the default ``HuggingFaceEmbeddings`` model,
    and written to a Chroma store at ``db_path``.

    Args:
        db_path: Passed to Chroma as ``persist_directory``.
        llm_path: Accepted for interface compatibility but not read; the
            model id is fixed inside the function.
        data_dir: Directory handed to ``DirectoryLoader``. Defaults to
            ``"./data/"``, the location that was previously hard-coded.

    Returns:
        The ``HuggingFacePipeline`` text-generation LLM.
    """
    # NOTE(review): llm_path is never used. It looks like it was meant to
    # select the model, but wiring it in would change behaviour for existing
    # callers, so the hard-coded id is kept - confirm the intent.
    model_id = "bigscience/bloomz-1b7"

    # Load every document found under the data directory.
    raw_docs = DirectoryLoader(data_dir, loader_cls=BSHTMLLoader).load()

    # Split with the same model's tokenizer that generation uses below.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    splitter = CharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=100,
        chunk_overlap=0,
        separator="\n",
    )
    documents = splitter.split_documents(raw_docs)

    embeddings = HuggingFaceEmbeddings()

    llm = HuggingFacePipeline.from_model_id(
        model_id=model_id,
        task="text-generation",
        model_kwargs={"temperature": 0, "max_length": 500},
    )

    # Build the vector store and write it out to db_path.
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=db_path,
    )
    vectordb.persist()

    return llm