Spaces: Runtime error
LevGervich committed · Commit fa4b416
Parent(s): 692156d

Fix imports
backend/__init__.py ADDED
File without changes
backend/semantic_search.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 from sentence_transformers import SentenceTransformer
 from FlagEmbedding import FlagReranker
 
-from
+from utils.time_decorator import timeit
 
 db = lancedb.connect(".lancedb")
 
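The two empty __init__.py files added in this commit mark backend and backend/utils as Python packages, which is what lets the corrected import above resolve. The contents of time_decorator.py are not shown in this diff; the sketch below is a minimal guess at what such a timeit decorator usually looks like, where only the timeit name itself is confirmed by the commit and everything else is an assumption.

# Hypothetical sketch of utils/time_decorator.py; only the name timeit appears in the diff.
import functools
import time


def timeit(func):
    # Wrap a function and print how long each call takes.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.3f}s")
        return result
    return wrapper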
backend/utils/__init__.py ADDED
File without changes
backend/utils/data_chunking.py ADDED
@@ -0,0 +1,65 @@
+import typing
+
+import nltk
+from transformers import AutoTokenizer
+import pathlib
+
+
+def fixed_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
+    tokens = tokenizer(data)['input_ids']
+    token_chunks = [tokens[idx: idx + max_length] for idx in range(0, len(tokens), max_length)]  # fixed-size token windows, ignoring sentence boundaries
+    chunks = [tokenizer.decode(token_chunk, skip_special_tokens=True) for token_chunk in token_chunks]
+    return chunks
+
+
+def content_aware_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
+    sentences = nltk.sent_tokenize(data)  # pack whole sentences into each chunk
+    chunks = []
+    current_chunk = None
+    current_chunk_length = 0
+    for sentence in sentences:
+        if current_chunk is None:
+            current_chunk = sentence
+            current_chunk_length = len(tokenizer(sentence)['input_ids'])
+        else:
+            current_sentence_length = len(tokenizer(sentence)['input_ids'])
+            if current_chunk_length + current_sentence_length > max_length:
+                chunks.append(current_chunk)
+                current_chunk = sentence
+                current_chunk_length = current_sentence_length
+            else:
+                current_chunk += " " + sentence  # a separating space keeps sentences from running together
+                current_chunk_length += current_sentence_length
+    if current_chunk is not None:
+        chunks.append(current_chunk)
+    return chunks
+
+
+class DataChunker:
+    def __init__(self, model_name: str, max_length: int):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.max_length = max_length
+
+    def chunk_folder(self, input_dir: str, output_dir: str, strategy: typing.Callable):
+        p = pathlib.Path(output_dir)
+        p.mkdir(parents=True, exist_ok=True)
+        input_dir = pathlib.Path(input_dir)
+        for input_file_path in input_dir.glob("*.txt"):
+            with open(input_file_path, 'r') as f:
+                data = f.read()
+            chunks = strategy(self.tokenizer, data, self.max_length)
+            for i, chunk in enumerate(chunks):
+                new_file_path = f'{output_dir}/{input_file_path.stem}_{i}.txt'
+                with open(new_file_path, 'w') as fw:
+                    fw.write(chunk)
+
+
+if __name__ == "__main__":
+    nltk.download('punkt')
+    model_names = ["sentence-transformers/all-MiniLM-L6-v2", "BAAI/bge-large-en-v1.5"]
+    max_length = 512
+    for model_name in model_names:
+        data_chunker = DataChunker(model_name, max_length)
+        model_suffix = model_name.split("/")[1]
+        data_chunker.chunk_folder("../docs", f"../docs_chunked_{model_suffix}", fixed_strategy)
+        data_chunker.chunk_folder("../docs", f"../docs_chunked_ca_{model_suffix}", content_aware_strategy)
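A quick way to see how the two strategies differ is to run them on a short sample. The snippet below is a usage sketch, not part of the commit: it assumes it is run from the backend directory (so the utils package added above is importable) and uses a deliberately tiny max_length so the sample text splits into several chunks.

# Usage sketch (assumed to run from the backend directory); max_length=16 is
# chosen only to force splitting on this short sample.
import nltk
from transformers import AutoTokenizer
from utils.data_chunking import fixed_strategy, content_aware_strategy

nltk.download('punkt')
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
text = "First sentence here. The second one follows it. A third sentence ends the sample."

print(fixed_strategy(tokenizer, text, 16))          # may cut mid-sentence
print(content_aware_strategy(tokenizer, text, 16))  # keeps sentences intact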
{utils → backend/utils}/llm_judge.py RENAMED
File without changes
{utils → backend/utils}/time_decorator.py RENAMED
File without changes