"""Load html from files, clean up, split, ingest into Weaviate."""
import os
import pickle
import re
from pathlib import Path

from bs4 import BeautifulSoup
from markdown import markdown

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
# Fail fast if the API key is missing, without leaking the secret to the logs.
assert "HUGGINGFACE_APIKEY" in os.environ, "HUGGINGFACE_APIKEY is not set"


def clean_data(data):
    """Render Markdown to HTML, strip the markup, and drop blank lines."""
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    text = "".join(soup.find_all(string=True))
    # Drop any HTML comments that survive the text extraction.
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    return "\n".join(t for t in cleaned_text.split("\n") if t)
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    if p.is_dir():
        continue
    # Only ingest Markdown sources.
    if str(p).lower().endswith((".md", ".mdx")):
        with open(p) as f:
            docs.append(clean_data(f.read()))
        # Record the source path without its extension, normalising Windows
        # separators and stripping the leading "docs/" prefix.
        filename = os.path.splitext(p)[0]
        newfile_name = filename.replace("\\", "/")[5:]
        print("file: " + newfile_name)
        metadatas.append({"source": newfile_name})
# Split on newlines into ~768-character chunks with 128 characters of overlap.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=768,
    chunk_overlap=128,
    length_function=len,
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
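# Quick sanity check on the split (assumes `documents` is the list of chunks
# returned above).
print(f"split {len(docs)} files into {len(documents)} chunks")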
print("making embedding")
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)
print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)
print("beginning pickle")
with open("docs.pkl", 'wb') as f:
pickle.dump(search_index, f)
print("Pickle complete")