"""Minimal RAG demo: load text files, split on Markdown headers, embed into
FAISS, retrieve the best-matching chunk for a query, and answer with GPT-3.5.

Requires env vars (via .env): HUGGINGFACEHUB_API_TOKEN, OPENAI-API-KEY.
"""

import os

import openai
from dotenv import load_dotenv
from pprint import pprint
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter, MarkdownHeaderTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

load_dotenv()

# Only re-export the token if it is actually set; assigning None into
# os.environ raises TypeError (values must be str).
_hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if _hf_token is not None:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = _hf_token

# Step 1. Collect document paths (capped at the first 30 directory entries;
# note os.listdir order is platform-dependent).
main_path = "./text-database"
text_file_path = os.listdir(main_path)[:30]
data_list = [
    TextLoader(file_path=os.path.join(main_path, name)).load()
    for name in text_file_path
]

# Step 2. Split each document into sections on Markdown headers, then flatten
# all sections into one list of plain-text chunks.
text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
)

print("檔案一共有", len(data_list), "份")

split_result = [
    chunk.page_content
    for one_data in data_list
    for chunk in text_splitter.split_text(one_data[0].page_content)
]
print(len(split_result))

# Step 3. Build embeddings and index every chunk in FAISS.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = FAISS.from_texts(
    texts=split_result,
    embedding=embeddings_model,
)

# Step 4. Retrieve the chunks most similar to the question.
query = "I want to backup my data from the database. Please introduce me about its information. "
docs = vectorstore.similarity_search(query=query)
if not docs:
    # Without a retrieved context the prompt below would IndexError; fail loudly.
    raise SystemExit("similarity_search returned no documents; nothing to answer with.")

# Step 5. Simple single-turn Q&A with GPT-3.5, using the top chunk as context.
# NOTE(review): env var name contains a hyphen ("OPENAI-API-KEY") — unusual but
# kept as-is to match the author's .env; confirm before renaming.
client = openai.OpenAI(
    api_key=os.getenv("OPENAI-API-KEY")
)
messages = [
    {"role": "system", "content": docs[0].page_content},
    {"role": "user", "content": query},
]

print("Question: ", query)
print("=" * 50)
print("Reference: ", docs[0].page_content)
print("=" * 50)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
)
print("Answer: ", response.choices[0].message.content)
print("=" * 50)