''' Start: core logic for loading the local knowledge base. '''
import langchain
import os
# from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain_community.vectorstores import FAISS

## Load a single file (legacy example).
# filepath = "/Users/yunshi/Downloads/txt_dir/Sparks_of_AGI.pdf"
# filepath = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/大模型LLM解决方案调研问卷.pdf"
# filepath = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/gpt-index-readthedocs-io-en-latest.pdf"
# filepath = "/Users/yunshi/Downloads/txt_dir/浙江省院前急救质控统计指标.pdf"
# loader = UnstructuredFileLoader(filepath)
# docs = loader.load()  ## Note: the multi-file code below uses the variable name documents, not docs.


def localKB_construct(fileDirectory):
    ### Load multiple documents from a directory.
    # fileDirectory = "/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/rawdata/PDF/"
    # from langchain.document_loaders import Docx2txtLoader
    from langchain_community.document_loaders import Docx2txtLoader
    # from langchain.document_loaders import TextLoader
    from langchain_community.document_loaders import TextLoader
    # from langchain.document_loaders import PyPDFLoader
    from langchain_community.document_loaders import PyPDFLoader

    documents = []
    for file in os.listdir(fileDirectory):
        if file.endswith('.pdf'):
            pdf_path = fileDirectory + file
            loader = PyPDFLoader(pdf_path)
            # loader = PdfReader(pdf_path)
            documents.extend(loader.load())
        elif file.endswith('.docx') or file.endswith('.doc'):
            doc_path = fileDirectory + file
            loader = Docx2txtLoader(doc_path)
            documents.extend(loader.load())
        elif file.endswith('.txt'):
            text_path = fileDirectory + file
            loader = TextLoader(text_path)
            documents.extend(loader.load())
    print("length of all documents:", len(documents))

    ## Text splitting.
    # text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    # chunked_documents = text_splitter.split_documents(documents)
    # text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=200)
    docs = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20).split_documents(documents)  # NOTE: do not make chunk_size too large, or chunks may exceed the model's max token length.

    ## Build the vector store.
    # embeddings = OpenAIEmbeddings(disallowed_special=())
    # embedding_model_name = 'GanymedeNil/text2vec-large-chinese'
    # embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)  ## Use this when online and able to download the model from Hugging Face.
    embeddings = HuggingFaceEmbeddings(model_name='/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/bge-large-zh/')  ## Switched to the BGE embedding model.
    # embeddings = HuggingFaceEmbeddings(model_name='/Users/yunshi/Downloads/chatGLM/My_LocalKB_Project/GanymedeNil_text2vec-large-chinese/')  ## One of the Chinese embedding options. This prints a "No sentence-transformers model found with name" warning; it is not an error and does not affect usage.
    # from text2vec import SentenceModel
    # embeddings = SentenceModel('shibing624/text2vec-base-chinese-sentence', device=mps_device)
    # embeddings = HuggingFaceEmbeddings()

    vector_store = FAISS.from_documents(docs, embeddings)
    vector_store.save_local('./FAISS/')
    print('vector_store construction complete:', vector_store)
    return vector_store


# vs = localKB_construct(fileDirectory="/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/rawdata/PDF/")
if __name__ == '__main__':
    # localKB_construct(input("请输入本地文件夹路径:"))  ## Pass in the folder path; it must end with "/".
    localKB_construct("/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/Coding/gradio/中交建/产品演示DEMO/交付_简易知识库查询系统(含基座)/KB/")  ### Local folder path used to build the knowledge base.
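

### Sketch (illustrative, not part of the original construction workflow): one way the saved
### FAISS index could be reloaded and queried later. The embeddings must come from the same
### model used when the index was built; the helper name, index path, default query, and k
### value below are assumptions. Depending on the langchain_community version,
### FAISS.load_local() may additionally require allow_dangerous_deserialization=True.
def localKB_query(query,
                  index_path='./FAISS/',
                  embedding_model_path='/Users/yunshi/Downloads/360Data/Data Center/Working-On Task/演讲与培训/2023ChatGPT/RAG/bge-large-zh/'):
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_path)
    vector_store = FAISS.load_local(index_path, embeddings)
    ## Return the top-k chunks most similar to the query.
    return vector_store.similarity_search(query, k=3)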