"""Load html from files, clean up, split, ingest into Weaviate.""" import os from pathlib import Path from markdown import markdown import pickle import re from bs4 import BeautifulSoup from langchain.text_splitter import CharacterTextSplitter from langchain.embeddings import HuggingFaceInstructEmbeddings from langchain.vectorstores import FAISS from InstructorEmbedding import INSTRUCTOR print(os.environ["HUGGINFACE_APIKEY"]) def clean_data(data): html = markdown(data) soup = BeautifulSoup(html, "html.parser") text = ''.join(soup.findAll(text=True)) cleaned_text = re.sub(r"", "", text, flags=re.DOTALL) print(cleaned_text) return "\n".join([t for t in cleaned_text.split("\n") if t]) docs = [] metadatas = [] for p in Path("docs").rglob("*"): if p.is_dir(): continue if str(p).lower().endswith(('.md', '.mdx')): with open(p) as f: filename = os.path.splitext(p)[0] docs.append(clean_data(f.read())) newfile_name = filename.replace("\\", "/")[5:] print("file:" + newfile_name) metadatas.append({"source": newfile_name}) text_splitter = CharacterTextSplitter( separator="\n", chunk_size=768, chunk_overlap=128, length_function=len, ) documents = text_splitter.create_documents(docs, metadatas=metadatas) print("making embedding") model_name = "hkunlp/instructor-large" embed_instruction = "Represent the text from the Hugging Face code documentation" query_instruction = "Query the most relevant text from the Hugging Face code documentation" embedding = HuggingFaceInstructEmbeddings(model_name=model_name, embed_instruction=embed_instruction, query_instruction=query_instruction) print("beginning construction of faiss") search_index = FAISS.from_documents(documents, embedding) print("beginning pickle") with open("docs.pkl", 'wb') as f: pickle.dump(search_index, f) print("Pickle complete")