Spaces:

hwchase17
/

chat-langchain

Runtime error

chat-langchain / ingest.py

Harrison Chase

initial commit

3c87692 almost 2 years ago

2.55 kB

	"""Load html from files, clean up, split, ingest into Weaviate."""
	import os
	from pathlib import Path

	import weaviate
	from bs4 import BeautifulSoup
	from langchain.text_splitter import CharacterTextSplitter


	def clean_data(data):
		"""Extract the readable text of one documentation page.

		Parses ``data`` as HTML, takes the first ``<main id="main-content">``
		element and returns its text with empty lines removed.

		Args:
			data: HTML markup of a single page.

		Returns:
			The text of the main content element, with its non-empty lines
			joined by newline characters.

		Raises:
			IndexError: If the page has no ``<main id="main-content">`` element.
		"""
		# Name the parser explicitly: without it BeautifulSoup picks whichever
		# parser happens to be installed (and emits a warning), so the extracted
		# text could differ from one machine to the next.
		soup = BeautifulSoup(data, "html.parser")
		text = soup.find_all("main", {"id": "main-content"})[0].get_text()
		# Only truly empty lines are dropped; whitespace-only lines are kept.
		return "\n".join(line for line in text.split("\n") if line)


	# Cleaned page texts and, at the same index, the metadata for each page.
	docs = []
	metadatas = []
	# Walk everything below the mirrored documentation directory; directories
	# themselves are skipped, every other entry is treated as a page.
	for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
		if p.is_dir():
			continue
		# Decode as UTF-8 explicitly instead of relying on the platform's locale
		# encoding, which can mis-decode or fail on non-ASCII characters.
		# NOTE(review): assumes the saved pages are UTF-8 -- confirm if the
		# mirror is ever produced by a different tool.
		with open(p, encoding="utf-8") as f:
			docs.append(clean_data(f.read()))
			# The path is kept as the page's "source" so each chunk can be traced
			# back to the file it came from.
			metadatas.append({"source": p})


	# Splitter configuration: newline separator, chunk size 1000 with an overlap
	# of 200, both measured with the built-in ``len``.
	_splitter_settings = {
		"separator": "\n",
		"chunk_size": 1000,
		"chunk_overlap": 200,
		"length_function": len,
	}
	text_splitter = CharacterTextSplitter(**_splitter_settings)

	# Split every cleaned page, handing over the per-page metadata alongside it.
	documents = text_splitter.create_documents(docs, metadatas=metadatas)


	# Connection settings are read from the environment; a missing variable
	# raises KeyError before any connection is attempted.
	WEAVIATE_URL = os.environ["WEAVIATE_URL"]
	# The OpenAI key is passed as a request header on the Weaviate client.
	_openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
	client = weaviate.Client(url=WEAVIATE_URL, additional_headers=_openai_headers)

	# Drop any previous "Paragraph" class so the schema below is created fresh.
	client.schema.delete_class("Paragraph")
	# NOTE(review): the result of this call is discarded -- confirm whether it is
	# still needed.
	client.schema.get()


	def _text_property(name, description, skip):
		"""Build the schema entry for one text property of the Paragraph class.

		Args:
			name: Property name as stored in Weaviate.
			description: Human-readable description of the property.
			skip: Value for the ``skip`` flag of the ``text2vec-openai`` module
				configuration of this property.

		Returns:
			A dict describing the property.
		"""
		return {
			"dataType": ["text"],
			"description": description,
			"moduleConfig": {
				"text2vec-openai": {
					"skip": skip,
					"vectorizePropertyName": False,
				}
			},
			"name": name,
		}


	# Definition of the single "Paragraph" class, configured with the
	# text2vec-openai vectorizer.
	_paragraph_class = {
		"class": "Paragraph",
		"description": "A written paragraph",
		"vectorizer": "text2vec-openai",
		"moduleConfig": {
			"text2vec-openai": {
				"model": "ada",
				"modelVersion": "002",
				"type": "text",
			}
		},
		"properties": [
			_text_property("content", "The content of the paragraph", False),
			_text_property("source", "The link", True),
		],
	}
	schema = {"classes": [_paragraph_class]}

	client.schema.create(schema)

	# Upload every chunk as a "Paragraph" object through the client's batch
	# context manager.
	with client.batch as uploader:
		for chunk in documents:
			# The source is stringified before upload (it was stored as a path
			# object when the pages were loaded).
			record = {
				"content": chunk.page_content,
				"source": str(chunk.metadata["source"]),
			}
			uploader.add_data_object(record, "Paragraph")