ShynBui committed
Commit
6036494
1 Parent(s): c84cd95

Upload 6 files

Data/4535c3c9-7f2b-4eca-b646-879de0a63f30/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8783732ca7632e9ef581dc35eb0aa5f1de727d46f16c249daabec4824c4edf99
+ size 1676000
Data/4535c3c9-7f2b-4eca-b646-879de0a63f30/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+ size 100
Data/4535c3c9-7f2b-4eca-b646-879de0a63f30/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b19222fde386d1b2bb005fc8ab45fdbe43cb0d650a119a0fb7ef6c6c1479479
+ size 4000
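
These three additions are Git LFS pointer files: each records the LFS spec version, the SHA-256 oid of the real binary, and its size in bytes, while the payload itself lives in LFS storage. As a minimal sketch (not part of this commit), a downloaded blob could be checked against its pointer roughly like this; the helper name and the assumption that the blob was already fetched (e.g. via `git lfs pull`) are illustrative only:

import hashlib
import os

def matches_lfs_pointer(blob_path, expected_oid, expected_size):
    # Hash the downloaded binary and compare it with the pointer's oid and size fields
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(blob_path) == expected_size

# Example using the length.bin pointer values above
print(matches_lfs_pointer(
    "Data/4535c3c9-7f2b-4eca-b646-879de0a63f30/length.bin",
    "5b19222fde386d1b2bb005fc8ab45fdbe43cb0d650a119a0fb7ef6c6c1479479",
    4000,
))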
Data/chroma.sqlite3 ADDED
Binary file (147 kB).
 
raw_data/so_tay_sinh_vien_ou_data1.txt ADDED
The diff for this file is too large to render.
 
utils.py ADDED
@@ -0,0 +1,80 @@
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.docstore.document import Document
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.retrievers import BM25Retriever
+
+ import os
+
+ def split_with_source(text, source):
+     # Split the raw text into overlapping chunks and tag each chunk with its source file
+     splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=256,
+         chunk_overlap=72,
+         length_function=len,
+         add_start_index=True,
+     )
+     documents = splitter.create_documents([text])
+     for doc in documents:
+         doc.metadata["source"] = source
+
+     return documents
+
+
+ def count_files_in_folder(folder_path):
+     # Check that the folder path exists
+     if not os.path.isdir(folder_path):
+         print("Invalid path.")
+         return None
+
+     # Use os.listdir() to get the files and subfolders in the folder
+     files = os.listdir(folder_path)
+
+     # Count the number of entries
+     file_count = len(files)
+
+     return file_count
+
+ def get_document_from_raw_text():
+     # Start with an empty placeholder document, then append chunks from every file in raw_data/
+     documents = [Document(page_content="", metadata={'source': 0})]
+     raw_dir = os.path.join(os.getcwd(), "raw_data")
+     for file_name in os.listdir(raw_dir):
+         with open(os.path.join(raw_dir, file_name), 'r', encoding="utf-8") as file:
+             # Light preprocessing: collapse double newlines
+             content = file.read().replace('\n\n', "\n")
+             texts = split_with_source(content, file_name)
+             documents = documents + texts
+
+     return documents
+
+ def load_the_embedding_retrieve(is_ready=False, k=3, model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
+     embeddings = HuggingFaceEmbeddings(model_name=model)
+     if is_ready:
+         # Reuse the Chroma index persisted in the Data directory
+         retriever = Chroma(persist_directory=os.path.join(os.getcwd(), "Data"), embedding_function=embeddings).as_retriever(
+             search_kwargs={"k": k}
+         )
+     else:
+         # Build the index from the raw text files before creating the retriever
+         documents = get_document_from_raw_text()
+         retriever = Chroma.from_documents(documents, embedding=embeddings).as_retriever(
+             search_kwargs={"k": k}
+         )
+
+     return retriever
+
+ def load_the_bm25_retrieve(k=3):
+     # Sparse keyword (BM25) retriever built over the same document chunks
+     documents = get_document_from_raw_text()
+     bm25_retriever = BM25Retriever.from_documents(documents)
+     bm25_retriever.k = k
+
+     return bm25_retriever
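
For orientation, a hedged sketch of how these helpers might be wired together downstream (nothing below is part of this commit): the EnsembleRetriever combination, the 0.5/0.5 weights, and the query string are assumptions for illustration.

from langchain.retrievers import EnsembleRetriever

from utils import load_the_embedding_retrieve, load_the_bm25_retrieve

# Dense retriever backed by the Chroma index persisted under Data/
chroma_retriever = load_the_embedding_retrieve(is_ready=True, k=3)

# Sparse BM25 retriever rebuilt from the files under raw_data/
bm25_retriever = load_the_bm25_retrieve(k=3)

# Hypothetical hybrid setup weighting dense and sparse results equally
hybrid = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5])

docs = hybrid.get_relevant_documents("academic regulations for first-year students")
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:80])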