Spaces:
Runtime error
Runtime error
"""Load Markdown docs from files, clean up, split, and ingest into a pickled FAISS index."""
import os | |
from pathlib import Path | |
from markdown import markdown | |
import pickle | |
import re | |
from bs4 import BeautifulSoup | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.embeddings import HuggingFaceInstructEmbeddings | |
from langchain.vectorstores import FAISS | |
from InstructorEmbedding import INSTRUCTOR | |
# Fail fast when the API key is not configured, but never echo the secret
# itself: anything printed here ends up in stdout and therefore in the logs.
# NOTE(review): the variable name is spelled "HUGGINFACE" (no second "G");
# it is kept as-is because it must match the deployment's configuration.
if "HUGGINFACE_APIKEY" not in os.environ:
    raise KeyError("HUGGINFACE_APIKEY")
def clean_data(data: str) -> str:
    """Convert Markdown source to plain text without comments or empty lines.

    The Markdown is rendered to HTML, the text nodes of the parsed tree are
    concatenated, leftover ``<!-- ... -->`` markers are removed, and empty
    lines are dropped.

    Args:
        data: Markdown source text of a single document.

    Returns:
        The extracted text, with the remaining non-empty lines joined by
        ``"\\n"``.
    """
    # bs4 is already a file-level dependency; Comment is imported locally so
    # the module's import block does not need to change.
    from bs4 import Comment

    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    # find_all(string=True) also yields Comment nodes, and their string value
    # is the comment body *without* the <!-- --> delimiters. Joining them
    # blindly leaks comment text into the output in a form the regex below can
    # no longer recognise, so comment nodes are skipped explicitly.
    text = "".join(
        node
        for node in soup.find_all(string=True)
        if not isinstance(node, Comment)
    )
    # Literal "<!-- ... -->" sequences can still be present as ordinary text
    # (for example when they were escaped in the HTML); strip those as well.
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Debug aid: echo the cleaned document to stdout.
    print(cleaned_text)
    return "\n".join(line for line in cleaned_text.split("\n") if line)
# Collect the cleaned text of every Markdown/MDX file under docs/ together
# with a matching metadata entry; docs[i] and metadatas[i] describe the same
# file.
docs = []
metadatas = []
docs_root = Path("docs")
# Sorted so the documents are ingested in the same order on every run;
# rglob's own ordering depends on the filesystem.
for p in sorted(docs_root.rglob("*")):
    if p.is_dir():
        continue
    if not p.name.lower().endswith((".md", ".mdx")):
        continue
    # Decode as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(p, encoding="utf-8") as f:
        docs.append(clean_data(f.read()))
    # Source id: path relative to docs/, extension dropped, forward slashes on
    # every platform (replaces the hard-coded "[5:]" prefix slice and the
    # manual backslash replacement).
    newfile_name = p.relative_to(docs_root).with_suffix("").as_posix()
    print("file:" + newfile_name)
    metadatas.append({"source": newfile_name})
# Split the cleaned documents on newlines into chunks; chunk_size and
# chunk_overlap are measured with len(), i.e. in characters.
text_splitter = CharacterTextSplitter(
    separator="\n", chunk_size=768, chunk_overlap=128, length_function=len
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)

# Build the instruction-tuned embedding model; separate instructions are
# supplied for embedding stored text and for embedding queries.
print("making embedding")
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)

# Embed every chunk and build the FAISS vector store from the results.
print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)

# Persist the whole vector store object to docs.pkl.
# NOTE: pickle files must only ever be loaded from a trusted source.
print("beginning pickle")
with open("docs.pkl", "wb") as index_file:
    pickle.dump(search_index, index_file)
print("Pickle complete")