import os

# Supply your own OpenAI API key here or via the environment;
# never commit a real key to source control.
os.environ["OPENAI_API_KEY"] = "sk-..."

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from pypinyin import lazy_pinyin
from tqdm import tqdm

# Shared embedding model, reused for both indexing and re-opening stores.
embedding = OpenAIEmbeddings()


def list_files(directory):
    """Recursively collect the paths of all files under `directory`."""
    select = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            select.append(os.path.join(root, file))
    return select
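

# Hypothetical convenience helper (not part of the original pipeline): re-open
# a previously persisted Chroma index and run a scored similarity search. The
# name and signature are illustrative assumptions, sketched from the
# commented-out query code at the bottom of this file.
def search_domain_store(persist_vector_path, query, k=3):
    db = Chroma(persist_directory=persist_vector_path, embedding_function=embedding)
    # similarity_search_with_score returns (Document, distance) pairs;
    # a lower distance means a closer match.
    return db.similarity_search_with_score(query, k=k)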

if __name__ == "__main__":

    # The domain names are kept in Chinese because they must match the data
    # folders on disk. In order: agriculture, religion & culture, construction
    # & manufacturing, healthcare, national governance, laws & regulations,
    # fiscal & taxation, education, finance, trade, macroeconomics, social
    # development, science & technology, energy & environmental protection,
    # international relations, national defense & security.
    domains = ["农业", "宗教与文化", "建筑业与制造业", "医疗卫生保健", "国家治理", "法律法规", "财政税收", "教育", "金融", "贸易", "宏观经济", "社会发展", "科学技术", "能源环保", "国际关系", "国防安全"]
    for domain_name in domains:
        directory_path = f"./example_data/{domain_name}"
        select_files = list_files(directory_path)

        # Load every PDF under the domain folder and split it into pages.
        select_pages = []
        for item in tqdm(select_files, desc=domain_name):
            loader = PyPDFLoader(item)
            pages = loader.load_and_split()
            select_pages.extend(pages)

        # Romanize the domain name so the persist directory stays ASCII,
        # e.g. lazy_pinyin("农业") -> ["nong", "ye"] -> "nongye". The file and
        # page counts are recorded in the directory name.
        pinyin = "".join(lazy_pinyin(domain_name))
        persist_vector_path = f"./vector_data/{pinyin}_{len(select_files)}_{len(select_pages)}"
        print(persist_vector_path)

        # Re-chunk the pages and build a persistent Chroma index per domain.
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(select_pages)
        db = Chroma.from_documents(documents, embedding, persist_directory=persist_vector_path)
        db.persist()  # older chromadb releases only flush to disk on an explicit persist()
        # To re-open this index later instead of rebuilding it:
        # db = Chroma(persist_directory=persist_vector_path, embedding_function=embedding)

    # Example of querying the most recently built store, kept for reference:
    # docs = db.similarity_search_with_score(query="宏观经济有什么影响", k=3)  # "What impacts does the macroeconomy have?"
    # contents = [doc for doc, score in docs]
    # relevance = "  ".join(doc.page_content for doc in contents)
    # source = [doc.metadata for doc in contents]
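
# Usage sketch for the search_domain_store helper defined above. The directory
# name follows the f"{pinyin}_{n_files}_{n_pages}" pattern the build loop
# prints, but the exact counts here are placeholders; substitute a path the
# script actually produced.
#
#     results = search_domain_store("./vector_data/hongguanjingji_10_500", "宏观经济有什么影响")
#     for doc, score in results:
#         print(score, doc.metadata.get("source"), doc.page_content[:80])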