Spaces:
Paused
Paused
File size: 2,008 Bytes
6ab28e5 9bc4a6c 6ab28e5 9bc4a6c 6ab28e5 9bc4a6c 6ab28e5 9bc4a6c 6ab28e5 9bc4a6c 6ab28e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
# from qdrant_client import QdrantClient
from config import DB_CONFIG
# Value passed as ``chunk_size`` to the text splitter in get_text_chunk.
CHUNK_SIZE = 500
def _remove_prefix_path(
    p: str, prefix: str = "data/rtdocs/nvdajp-book.readthedocs.io/"
) -> str:
    """Strip the local mirror directory prefix from a document path.

    Args:
        p: Path string to shorten (get_documents passes each document's
            ``source`` metadata value).
        prefix: Leading text to drop. Defaults to the directory the
            nvdajp-book Read the Docs pages are stored under, so existing
            single-argument callers behave exactly as before.

    Returns:
        ``p`` without the leading ``prefix``. If ``p`` does not start with
        ``prefix`` it is returned unchanged.
    """
    return p.removeprefix(prefix)
def get_documents(path: str):
    """Lazily load Read the Docs HTML pages and enrich their metadata.

    This is a generator: nothing is loaded until the first item is requested.

    Args:
        path: Directory handed to ``ReadTheDocsLoader``, e.g.
            ``"data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"``.

    Yields:
        Each loaded document, with ``metadata`` extended by:
        ``category`` (always ``"ja-book"``), ``source`` (the original
        ``source`` value with the local mirror prefix removed) and ``url``
        (``base_url`` followed by that shortened source).

    Raises:
        KeyError: If a loaded document has no ``"source"`` metadata entry.
    """
    loader = ReadTheDocsLoader(path, encoding="utf-8")
    docs = loader.load()
    base_url = "https://nvdajp-book.readthedocs.io/"
    for doc in docs:
        source = _remove_prefix_path(doc.metadata["source"])
        # Right-hand operand wins on key clashes, so the shortened "source"
        # replaces the loader-provided one.
        doc.metadata = doc.metadata | {
            "category": "ja-book",
            "source": source,
            "url": f"{base_url}{source}",
        }
        yield doc
def get_text_chunk(docs):
    """Split documents into smaller pieces for embedding.

    Args:
        docs: Documents to split; forwarded unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        Whatever ``split_documents`` returns for ``docs``, using a splitter
        configured with ``chunk_size=CHUNK_SIZE`` and ``chunk_overlap=0``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    return splitter.split_documents(docs)
def store(texts):
    """Embed the given chunks and write them to the configured Qdrant collection.

    Args:
        texts: Document chunks passed as the first argument to
            ``Qdrant.from_documents``.

    The connection settings come from ``DB_CONFIG``, which is unpacked as
    ``(url, api_key, collection_name)``. Nothing is returned.
    """
    embedder = OpenAIEmbeddings()
    url, api_key, collection_name = DB_CONFIG
    Qdrant.from_documents(
        texts,
        embedder,
        url=url,
        api_key=api_key,
        collection_name=collection_name,
    )
def main(path: str):
    """Run the full pipeline: load documents, split them, store the chunks.

    Args:
        path: Directory of downloaded HTML pages, passed to get_documents.
    """
    store(get_text_chunk(get_documents(path)))
if __name__ == "__main__":
    # Usage:
    #   $ python store.py "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"
    import sys

    # Exactly one positional argument (the HTML directory) is required.
    # Fail with a usage message instead of silently loading a hard-coded
    # path and dropping into the debugger.
    if len(sys.argv) != 2:
        sys.exit(f"usage: python {sys.argv[0]} <html_path>")
    main(sys.argv[1])
|