LevGervich committed on
Commit
fa4b416
·
1 Parent(s): 692156d

Fix imports

Browse files
backend/__init__.py ADDED
File without changes
backend/semantic_search.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
6
  from sentence_transformers import SentenceTransformer
7
  from FlagEmbedding import FlagReranker
8
 
9
- from gradio_app.utils.time_decorator import timeit
10
 
11
  db = lancedb.connect(".lancedb")
12
 
 
6
  from sentence_transformers import SentenceTransformer
7
  from FlagEmbedding import FlagReranker
8
 
9
+ from utils.time_decorator import timeit
10
 
11
  db = lancedb.connect(".lancedb")
12
 
backend/utils/__init__.py ADDED
File without changes
backend/utils/data_chunking.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import typing
2
+
3
+ import nltk
4
+ from transformers import AutoTokenizer
5
+ import pathlib
6
+
7
+
8
def fixed_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
    """Split ``data`` into consecutive chunks of at most ``max_length`` tokens.

    The text is tokenized once, the token ids are cut into fixed-size
    windows, and every window is decoded back to text.

    Args:
        tokenizer: Callable tokenizer returning a mapping with an
            ``'input_ids'`` sequence and offering ``decode``; it must accept
            the ``add_special_tokens`` keyword (Hugging Face tokenizers do).
        data: Text to split.
        max_length: Maximum number of tokens per chunk; must be positive.

    Returns:
        The decoded chunks in document order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is not a positive integer.
    """
    if max_length <= 0:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step would silently produce no chunks at all.
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    # Special tokens are dropped again on decode, so letting the tokenizer
    # insert them would only waste slots in the windows and shift boundaries.
    tokens = tokenizer(data, add_special_tokens=False)['input_ids']
    return [
        tokenizer.decode(tokens[start:start + max_length], skip_special_tokens=True)
        for start in range(0, len(tokens), max_length)
    ]
13
+
14
+
15
def content_aware_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
    """Split ``data`` into chunks made of whole sentences.

    Sentences come from ``nltk.sent_tokenize`` and are packed greedily: a
    sentence joins the current chunk unless the summed token counts would
    exceed ``max_length``, in which case the current chunk is closed and the
    sentence starts a new one.

    Args:
        tokenizer: Callable tokenizer returning a mapping with an
            ``'input_ids'`` sequence; its length is used as the token count
            of each sentence, measured on the sentence alone.
        data: Text to split.
        max_length: Token budget per chunk.

    Returns:
        The chunks in document order; ``[]`` when no sentence is found.
        A single sentence longer than ``max_length`` is kept whole, so such
        a chunk exceeds the budget.
    """
    chunks = []
    pending_sentences = []
    pending_length = 0
    for sentence in nltk.sent_tokenize(data):
        sentence_length = len(tokenizer(sentence)['input_ids'])
        if pending_sentences and pending_length + sentence_length > max_length:
            # Budget exceeded: close the current chunk before adding more.
            chunks.append(" ".join(pending_sentences))
            pending_sentences = []
            pending_length = 0
        pending_sentences.append(sentence)
        pending_length += sentence_length

    if pending_sentences:
        # Sentences are re-joined with a space; plain concatenation would
        # glue them together ("First.Second.").
        chunks.append(" ".join(pending_sentences))
    return chunks
36
+
37
+
38
class DataChunker:
    """Chunk every ``.txt`` file of a folder with a pluggable strategy.

    Attributes are set in ``__init__``: ``tokenizer`` is loaded with
    ``AutoTokenizer.from_pretrained(model_name)`` and ``max_length`` is the
    token budget handed to the strategy.
    """

    def __init__(self, model_name: str, max_length: int):
        """Load the tokenizer for ``model_name`` and store the token budget.

        Args:
            model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            max_length: Token budget forwarded to the chunking strategy.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length

    def chunk_folder(self, input_dir: str, output_dir: str, strategy: typing.Callable):
        """Chunk each ``*.txt`` file in ``input_dir`` into ``output_dir``.

        Only files directly inside ``input_dir`` are processed (no
        recursion). Chunk ``i`` of ``name.txt`` is written to
        ``<output_dir>/name_<i>.txt``; existing files are overwritten.

        Args:
            input_dir: Folder containing the source ``.txt`` files.
            output_dir: Destination folder; created with parents if missing.
            strategy: Callable invoked as
                ``strategy(tokenizer, text, max_length)`` and returning an
                iterable of chunk strings.
        """
        output_path = pathlib.Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        for input_file_path in pathlib.Path(input_dir).glob("*.txt"):
            # Explicit UTF-8 keeps reading and writing independent of the
            # platform's locale encoding.
            data = input_file_path.read_text(encoding="utf-8")
            chunks = strategy(self.tokenizer, data, self.max_length)
            for i, chunk in enumerate(chunks):
                chunk_path = output_path / f"{input_file_path.stem}_{i}.txt"
                chunk_path.write_text(chunk, encoding="utf-8")
55
+
56
+
57
if __name__ == "__main__":
    # Script entry point: chunk the "../docs" folder once per embedding model
    # and per strategy, writing each combination to its own output folder.
    # Paths are relative to the current working directory.
    nltk.download('punkt')  # presumably the data nltk.sent_tokenize relies on
    token_budget = 512
    for checkpoint in ("sentence-transformers/all-MiniLM-L6-v2", "BAAI/bge-large-en-v1.5"):
        chunker = DataChunker(checkpoint, token_budget)
        # Folder suffix is the part of the model id after the organisation.
        suffix = checkpoint.split("/")[1]
        chunker.chunk_folder("../docs", f"../docs_chunked_{suffix}", fixed_strategy)
        chunker.chunk_folder("../docs", f"../docs_chunked_ca_{suffix}", content_aware_strategy)
{utils → backend/utils}/llm_judge.py RENAMED
File without changes
{utils → backend/utils}/time_decorator.py RENAMED
File without changes