LevGervich committed
Commit fa4b416
1 Parent(s): 692156d

Fix imports

backend/__init__.py ADDED
File without changes
backend/semantic_search.py CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 from sentence_transformers import SentenceTransformer
 from FlagEmbedding import FlagReranker
 
-from gradio_app.utils.time_decorator import timeit
+from utils.time_decorator import timeit
 
 db = lancedb.connect(".lancedb")
 
backend/utils/__init__.py ADDED
File without changes
backend/utils/data_chunking.py ADDED
@@ -0,0 +1,65 @@
+import typing
+
+import nltk
+from transformers import AutoTokenizer
+import pathlib
+
+
+def fixed_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
+    # Fixed-size chunking: split the token ids into windows of max_length and decode each window back to text.
+    tokens = tokenizer(data)['input_ids']
+    token_chunks = [tokens[idx: idx + max_length] for idx in range(0, len(tokens), max_length)]
+    chunks = [tokenizer.decode(token_chunk, skip_special_tokens=True) for token_chunk in token_chunks]
+    return chunks
+
+
+def content_aware_strategy(tokenizer, data: str, max_length: int) -> typing.List[str]:
+    # Content-aware chunking: pack whole sentences into a chunk until the token budget would overflow.
+    sentences = nltk.sent_tokenize(data)
+    chunks = []
+    current_chunk = None
+    current_chunk_length = 0
+    for sentence in sentences:
+        if current_chunk is None:
+            current_chunk = sentence
+            current_chunk_length = len(tokenizer(sentence)['input_ids'])
+        else:
+            current_sentence_length = len(tokenizer(sentence)['input_ids'])
+            if current_chunk_length + current_sentence_length > max_length:
+                chunks.append(current_chunk)
+                current_chunk = sentence
+                current_chunk_length = current_sentence_length
+            else:
+                current_chunk += " " + sentence  # join with a space so sentence boundaries stay readable
+                current_chunk_length += current_sentence_length
+    if current_chunk is not None:
+        chunks.append(current_chunk)
+    return chunks
+
+
+class DataChunker:
+    def __init__(self, model_name: str, max_length: int):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.max_length = max_length
+
+    def chunk_folder(self, input_dir: str, output_dir: str, strategy: typing.Callable):
+        # Apply the given chunking strategy to every .txt file and write each chunk to its own file.
+        p = pathlib.Path(output_dir)
+        p.mkdir(parents=True, exist_ok=True)
+        input_dir = pathlib.Path(input_dir)
+        for input_file_path in input_dir.glob("*.txt"):
+            with open(input_file_path, 'r') as f:
+                data = f.read()
+            chunks = strategy(self.tokenizer, data, self.max_length)
+            for i, chunk in enumerate(chunks):
+                new_file_path = f'{output_dir}/{input_file_path.stem}_{i}.txt'
+                with open(new_file_path, 'w') as fw:
+                    fw.write(chunk)
+
+
+if __name__ == "__main__":
+    nltk.download('punkt')
+    model_names = ["sentence-transformers/all-MiniLM-L6-v2", "BAAI/bge-large-en-v1.5"]
+    max_length = 512
+    for model_name in model_names:
+        data_chunker = DataChunker(model_name, max_length)
+        model_suffix = model_name.split("/")[1]
+        data_chunker.chunk_folder("../docs", f"../docs_chunked_{model_suffix}", fixed_strategy)
+        data_chunker.chunk_folder("../docs", f"../docs_chunked_ca_{model_suffix}", content_aware_strategy)
{utils → backend/utils}/llm_judge.py RENAMED
File without changes
{utils → backend/utils}/time_decorator.py RENAMED
File without changes
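
backend/utils/time_decorator.py is only renamed in this commit, so the timeit decorator that the fixed import points at is not shown here. As orientation only, a decorator with that name is usually a thin timing wrapper along the lines of this sketch; this is an assumption about its shape, not the repository's actual implementation.

import functools
import time


def timeit(func):
    # Hypothetical sketch (not the file's real contents): print the wall-clock time of each call.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.3f}s")
        return result
    return wrapper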