mintaeng committed on
Commit
6669581
•
1 Parent(s): 358b100

Update retriever.py

Browse files
Files changed (1) hide show
  1. retriever.py +39 -38
retriever.py CHANGED
@@ -1,38 +1,39 @@
1
- from langchain_core.runnables import RunnablePassthrough
2
- from langchain_core.output_parsers import StrOutputParser
3
- from langchain_community.chat_models import ChatOllama
4
- from langchain_core.prompts import ChatPromptTemplate
5
- from langchain_pinecone import PineconeVectorStore
6
- from langchain_community.embeddings import SentenceTransformerEmbeddings
7
-
8
- import os
9
- from dotenv import load_dotenv
10
- from langchain.retrievers import BM25Retriever, EnsembleRetriever
11
- from kiwipiepy import Kiwi
12
- load_dotenv()
13
-
14
- kiwi = Kiwi()
15
-
16
- def kiwi_tokenize(text):
17
- return [token.form for token in kiwi.tokenize(text)]
18
- # embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True})
19
-
20
- def retriever(pc, bm25):
21
- pcretriever = pc.as_retriever(search_kwargs={'k':4})
22
- kiwi_bm25 = BM25Retriever.from_documents(bm25,preprocess_func=kiwi_tokenize)
23
- kiwi_bm25.k=4
24
-
25
- kiwibm25_pc_37 = EnsembleRetriever(
26
- retrievers=[kiwi_bm25, pcretriever], # 사용할 검색 모델의 리스트
27
- weights=[0.3, 0.7], # 각 검색 모델의 결과에 적용할 가중치
28
- search_type="mmr", # 검색 결과의 다양성을 증진시키는 MMR 방식을 사용
29
- )
30
- # Pinecone vector store 초기화
31
- # vectorstore = PineconeVectorStore(
32
- # index_name=os.getenv("INDEX_NAME"), embedding=embedding_model
33
- # )
34
-
35
- # retriever = vectorstore.as_retriever(search_kwargs={'k': 2})
36
-
37
- return kiwibm25_pc_37
38
-
 
 
1
+ from langchain_core.runnables import RunnablePassthrough
2
+ from langchain_core.output_parsers import StrOutputParser
3
+ from langchain_community.chat_models import ChatOllama
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain_pinecone import PineconeVectorStore
6
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
7
+
8
+ import os
9
+ from dotenv import load_dotenv
10
+ from langchain_community.retrievers import BM25Retriever, EnsembleRetriever
11
+ # from langchain.retrievers import BM25Retriever, EnsembleRetriever
12
+ from kiwipiepy import Kiwi
13
+ load_dotenv()
14
+
15
+ kiwi = Kiwi()
16
+
17
+ def kiwi_tokenize(text):
18
+ return [token.form for token in kiwi.tokenize(text)]
19
+ # embedding_model = SentenceTransformerEmbeddings(model_name='BM-K/KoSimCSE-roberta-multitask', model_kwargs={"trust_remote_code":True})
20
+
21
+ def retriever(pc, bm25):
22
+ pcretriever = pc.as_retriever(search_kwargs={'k':4})
23
+ kiwi_bm25 = BM25Retriever.from_documents(bm25,preprocess_func=kiwi_tokenize)
24
+ kiwi_bm25.k=4
25
+
26
+ kiwibm25_pc_37 = EnsembleRetriever(
27
+ retrievers=[kiwi_bm25, pcretriever], # 사용할 검색 모델의 리스트
28
+ weights=[0.3, 0.7], # 각 검색 모델의 결과에 적용할 가중치
29
+ search_type="mmr", # 검색 결과의 다양성을 증진시키는 MMR 방식을 사용
30
+ )
31
+ # Pinecone vector store 초기화
32
+ # vectorstore = PineconeVectorStore(
33
+ # index_name=os.getenv("INDEX_NAME"), embedding=embedding_model
34
+ # )
35
+
36
+ # retriever = vectorstore.as_retriever(search_kwargs={'k': 2})
37
+
38
+ return kiwibm25_pc_37
39
+