"""Load html from files, clean up, split, ingest into Weaviate."""
import os
import pickle
import re
from pathlib import Path

from bs4 import BeautifulSoup
from markdown import markdown

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
# Fail fast if the API key is missing, without leaking the secret to the logs.
assert "HUGGINGFACE_APIKEY" in os.environ, "HUGGINGFACE_APIKEY is not set"


def clean_data(data):
    """Render Markdown to HTML, strip the markup, and drop blank lines."""
    html = markdown(data)
    soup = BeautifulSoup(html, "html.parser")
    text = "".join(soup.find_all(string=True))
    # Drop any HTML comments that survive the text extraction.
    cleaned_text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    return "\n".join(t for t in cleaned_text.split("\n") if t)
docs = []
metadatas = []
for p in Path("docs").rglob("*"):
    if p.is_dir():
        continue
    # Only ingest Markdown sources.
    if str(p).lower().endswith((".md", ".mdx")):
        with open(p) as f:
            docs.append(clean_data(f.read()))
        # Record the source path without its extension, normalising Windows
        # separators and stripping the leading "docs/" prefix.
        filename = os.path.splitext(p)[0]
        newfile_name = filename.replace("\\", "/")[5:]
        print("file: " + newfile_name)
        metadatas.append({"source": newfile_name})
# Split on newlines into ~768-character chunks with 128 characters of overlap.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=768,
    chunk_overlap=128,
    length_function=len,
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
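# Quick sanity check on the split (assumes `documents` is the list of chunks
# returned above).
print(f"split {len(docs)} files into {len(documents)} chunks")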
print("making embedding")
model_name = "hkunlp/instructor-large"
embed_instruction = "Represent the text from the Hugging Face code documentation"
query_instruction = "Query the most relevant text from the Hugging Face code documentation"
embedding = HuggingFaceInstructEmbeddings(
    model_name=model_name,
    embed_instruction=embed_instruction,
    query_instruction=query_instruction,
)
print("beginning construction of faiss")
search_index = FAISS.from_documents(documents, embedding)
print("beginning pickle")
with open("docs.pkl", 'wb') as f:
pickle.dump(search_index, f)
print("Pickle complete")