"""Load html from files, clean up, split, ingest into Weaviate."""
import os
from pathlib import Path
import weaviate
from bs4 import BeautifulSoup
from langchain.text_splitter import CharacterTextSplitter

def clean_data(data):
    """Extract the main documentation text from a page's HTML."""
    soup = BeautifulSoup(data, "html.parser")  # explicit parser avoids bs4's guessing warning
    text = soup.find_all("main", {"id": "main-content"})[0].get_text()
    return "\n".join([t for t in text.split("\n") if t])
docs = []
metadatas = []
for p in Path("langchain.readthedocs.io/en/latest/").rglob("*"):
    if p.is_dir():
        continue
    with open(p) as f:
        docs.append(clean_data(f.read()))
        metadatas.append({"source": p})
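
# Split the cleaned pages into ~1000-character chunks with 200 characters of overlap.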
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
documents = text_splitter.create_documents(docs, metadatas=metadatas)
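
# Connect to Weaviate; the OpenAI key is forwarded so the text2vec-openai
# module can create embeddings server-side.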
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
client = weaviate.Client(
    url=WEAVIATE_URL,
    additional_headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)
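
# Drop any existing "Paragraph" class so ingestion starts from a clean slate.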
client.schema.delete_class("Paragraph")
client.schema.get()
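
# Class definition: `content` is vectorized with OpenAI's ada (version 002)
# model via text2vec-openai; `source` is stored but not vectorized.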
schema = {
"classes": [
{
"class": "Paragraph",
"description": "A written paragraph",
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada",
"modelVersion": "002",
"type": "text",
}
},
"properties": [
{
"dataType": ["text"],
"description": "The content of the paragraph",
"moduleConfig": {
"text2vec-openai": {
"skip": False,
"vectorizePropertyName": False,
}
},
"name": "content",
},
{
"dataType": ["text"],
"description": "The link",
"moduleConfig": {
"text2vec-openai": {
"skip": True,
"vectorizePropertyName": False,
}
},
"name": "source",
},
],
},
]
}
client.schema.create(schema)
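
# Batch-import every chunk; Weaviate embeds the `content` field on insert.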
with client.batch as batch:
    for text in documents:
        batch.add_data_object(
            {"content": text.page_content, "source": str(text.metadata["source"])},
            "Paragraph",
        )