Spaces:

ChallengeHub
/

Chinese-LangChain

Runtime error

Chinese-LangChain / create_knowledge.py

feature@添加知识库选择功能

a10fbed over 1 year ago

1.45 kB

	#!/usr/bin/env python
	# -*- coding:utf-8 -*-
	"""
	@author:quincy qiang
	@license: Apache Licence
	@file: create_knowledge.py
	@time: 2023/04/18
	@contact: yanqiangmiffy@gamil.com
	@software: PyCharm
	@description: - emoji：https://emojixd.com/pocket/science
	"""
	import os

	from langchain.document_loaders import UnstructuredFileLoader
	from langchain.embeddings.huggingface import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from tqdm import tqdm
	# Chinese Wikipedia data import example (the Wikipedia variant is kept
	# commented out further down); the active settings below point at the
	# financial research reports corpus instead.

	# Directory that is scanned for the source documents to index.
	docs_path = '/root/GoMall/Knowledge-ChatGLM/cache/financial_research_reports'

	# Location of the embedding model handed to HuggingFaceEmbeddings.
	embedding_model_name = '/root/pretrained_models/text2vec-large-chinese'

	# Embedding backend shared by everything that builds a vector store below.
	embeddings = HuggingFaceEmbeddings(
	    model_name=embedding_model_name,
	)

	# docs = []

	# with open('docs/zh_wikipedia/zhwiki.sim.utf8', 'r', encoding='utf-8') as f:
	#     for idx, line in tqdm(enumerate(f.readlines())):
	#         metadata = {"source": f'doc_id_{idx}'}
	#         docs.append(Document(page_content=line.strip(), metadata=metadata))
	#
	# vector_store = FAISS.from_documents(docs, embeddings)
	# vector_store.save_local('cache/zh_wikipedia/')

	# Build a FAISS index from every ``.txt`` file found directly inside
	# ``docs_path`` and persist it under ``cache/financial_research_reports``.
	docs = []

	for file_name in tqdm(os.listdir(docs_path)):
	    # Only plain-text files are indexed; every other entry is skipped.
	    if not file_name.endswith('.txt'):
	        continue
	    # mode="elements" asks the loader for element-level documents rather
	    # than a single document per file.
	    loader = UnstructuredFileLoader(os.path.join(docs_path, file_name), mode="elements")
	    # load() yields a list of documents, hence extend() rather than append().
	    docs.extend(loader.load())

	# With nothing loaded there is no embedding to build the index from, so stop
	# here with an explicit message instead of failing inside the FAISS wrapper.
	if not docs:
	    raise ValueError(f'no .txt documents could be loaded from {docs_path!r}')

	vector_store = FAISS.from_documents(docs, embeddings)
	vector_store.save_local('cache/financial_research_reports')