# buster/data/embed_documents.py
# jerpint's picture
# move files around (#17)
# 2b4f517 unverified
# raw
# history blame
# 625 Bytes
import pandas as pd
from buster.documents_manager import DeepLakeDocumentsManager
if __name__ == "__main__":
    # Script entry point: read document chunks from a CSV file, hand them to a
    # DeepLakeDocumentsManager, then zip the result and report where it went.

    # Input CSV of chunks and the path given to the documents manager.
    csv_path = "./data/wiki_tai_langchain.csv"
    store_path = "wiki_tai_langchain"

    # Load the chunks and discard every row that contains a missing value in
    # any column, printing the row count before and after.
    chunks = pd.read_csv(csv_path)
    print(f"before drop: {len(chunks)}")
    chunks = chunks.dropna()
    print(f"after drop: {len(chunks)}")

    # NOTE(review): overwrite=True presumably replaces any existing store at
    # store_path -- confirm against DeepLakeDocumentsManager.
    manager = DeepLakeDocumentsManager(
        store_path,
        overwrite=True,
        required_columns=["url", "source", "content", "title"],
    )
    manager.batch_add(chunks)

    # to_zip() returns the path of the archive it produced.
    archive_path = manager.to_zip()
    print(f"Contents zipped to: {archive_path}")