File size: 4,448 Bytes
fcf4068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import glob
import os
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader, CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_pinecone import PineconeVectorStore
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time 
from langchain_community.embeddings import SentenceTransformerEmbeddings

from dotenv import load_dotenv
load_dotenv()


# ๋ฐ์ดํ„ฐ ๋ฐ›์œผ๋ฉด ๊ฐˆ๋ผ์ค˜
def come_data(splits):
    """Rebuild each split as a fresh ``Document`` that keeps only its source.

    Args:
        splits: Sequence of document chunks. Each chunk must expose
            ``page_content`` and a ``metadata`` mapping with a ``'source'``
            key.

    Returns:
        A list of new ``Document`` objects, one per chunk and in the same
        order, whose metadata is reduced to ``{'source': ...}``.

    Raises:
        KeyError: If a chunk's metadata has no ``'source'`` entry.
    """
    return [
        Document(
            page_content=chunk.page_content,
            metadata={'source': chunk.metadata['source']},
        )
        for chunk in splits
    ]





# Flattening
def flatten_list(lst):
    """Recursively flatten arbitrarily nested lists into a single flat list.

    Args:
        lst: Any value. Only ``list`` instances are descended into; every
            other value (tuples and strings included) is treated as a leaf.

    Returns:
        A new flat list of the leaves in depth-first, left-to-right order.
        A non-list argument comes back wrapped in a one-element list.
    """
    # Leaf: wrap it so the caller can always extend with the result.
    if not isinstance(lst, list):
        return [lst]

    flat = []
    for element in lst:
        flat.extend(flatten_list(element))
    return flat


# ๋ชจ๋ธ ๋ถˆ๋Ÿฌ์™€์„œ VectorDB๋กœ ์˜ฌ๋ฆฌ๋Š” ๋ถ€๋ถ„
def all_files(path):
    print(f'RAG์— ๋“ค์–ด๊ฐˆ ๋ชจ๋“  ๋ฐ์ดํ„ฐ๋Š” {path}์— ๋‹ด์•„์ฃผ์„ธ์š”.\n\n\n')
    f = glob.glob(path + '/**', recursive=True)
    f_docs = []
    for file in f:
        a = False
        if file.endswith('.txt'):
            loader = TextLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.csv'):
            loader = CSVLoader(file)
            document = loader.load()
            a = True
        elif file.endswith('.pdf'):
            loader = PyMuPDFLoader(file)
            document = loader.load()
            a = True
        # ------------------- ํŒŒ์ผ ํƒ€์ž… ์ถ”๊ฐ€ ์‚ฌํ•ญ ์žˆ์„ ์‹œ ์œ„์— ์ถ”๊ฐ€ ----------------#
        if a:
            print(file.split('/')[-1] + ' split ์ง„ํ–‰ ์ค‘')
            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
                separator=".",
                chunk_size=500,
                chunk_overlap=0,
            )
            splits = text_splitter.split_documents(document)
            docs = come_data(splits)
            f_docs.append(docs)
            print(file.split('/')[-1] + ' split ์ง„ํ–‰ ์™„๋ฃŒ. \n' + file.split('/')[-1] + ' split ๊ฐฏ์ˆ˜ : ' + str(len(docs)))
    flattened_list = flatten_list(f_docs)
    
    '''
    flattened ๋œ docs๋ฅผ ๋ฒกํ„ฐ db๋กœ ๋„ฃ์–ด์ค„ ๊ฒƒ
    '''


    
    # ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ์„ ์–ธ
    embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True}) 
    
    # ๋ฒกํ„ฐ์Šคํ† ์–ด ์„ ์–ธ

    api_key = os.environ['PINECONE_API_KEY']
    pc = Pinecone(api_key=api_key)   

    index_name = os.getenv('INDEX_NAME')

    print('Vector DB ์ดˆ๊ธฐํ™”. Index_name = ' + str(index_name))
    spec = ServerlessSpec(cloud='aws', region='us-east-1')

    # ์ธ๋ฑ์Šค ์กด์žฌ์—ฌ๋ถ€ ํ™•์ธ ๋ฐ ์‚ญ์ œ
    collect_name = []
    for n in pc.list_indexes().indexes:
        collect_name.append(n.name)
    
    if index_name in collect_name:  
        pc.delete_index(index_name) 
        print('๊ธฐ์กด ์ธ๋ฑ์Šค ์‚ญ์ œ์™„๋ฃŒ')  
    time.sleep(3)
    
    # ํŒŒ์ธ์ฝ˜ ์ธ๋ฑ์Šค ์ƒ์„ฑ
    pc.create_index(  
        index_name,  
        dimension=768, 
        metric='cosine',  
        spec=spec  
    )  
    
    # ์ธ๋ฑ์Šค ์žฌ์ƒ์„ฑ ๋ฐ ๋ฐ์ดํ„ฐ ์ž…๋ ฅ
    # index = pc.Index(index_name)
    print('Vector DB ๋“ค์–ด๊ฐ€๋Š” ์ค‘. Index_name = ' + str(index_name))

    # # ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
    # texts = [doc.page_content for doc in flattened_list]
    # embedded_texts = []
    # for txt in texts:
    #     embedded_texts.append(embedding_model.embed_query(txt))

    
    # # ๋ฒกํ„ฐ DB์— ์ž„๋ฒ ๋”ฉ ์ถ”๊ฐ€
    # ids = [str(i) for i in range(len(embedded_texts))]
    # metadata = [doc.metadata for doc in flattened_list]
    
    # # db์˜ฌ๋ฆด๋•Œ ๋ฌด๋ฃŒ๋ฒ„์ „์ด๊ธฐ๋•Œ๋ฌธ์— ์šฉ๋Ÿ‰ ํ„ฐ์ง -> ๋‚˜๋ˆ ์„œ ์˜ฌ๋ฆฌ์ž
    # batch_size = 28
    # for i in range(0, len(embedded_texts), batch_size):
    #     batch_vectors = [{"id": id, "values": vector, "metadata": meta} for id, vector, meta in zip(ids[i:i + batch_size], embedded_texts[i:i + batch_size], metadata[i:i + batch_size])]
    #     index.upsert(vectors=batch_vectors)
        
    
    Vectorstore = PineconeVectorStore.from_documents(
    documents=flattened_list,
    index_name=index_name,
    embedding=embedding_model
    )

    print('์ €์žฅ ์™„๋ฃŒ')
    return Vectorstore, flattened_list