AlexanderKazakov committed on
Commit
d7fdb42
1 Parent(s): 8b1c859

improve markdown chunking

Browse files
prep_scripts/lancedb_setup.py CHANGED
@@ -11,14 +11,10 @@ import numpy as np
11
 
12
  from sentence_transformers import SentenceTransformer
13
 
 
14
  from settings import *
15
 
16
 
17
- emb_sizes = {
18
- "sentence-transformers/all-MiniLM-L6-v2": 384,
19
- "thenlper/gte-large": 0
20
- }
21
-
22
  shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
23
  db = lancedb.connect(LANCEDB_DIRECTORY)
24
  batch_size = 32
@@ -33,42 +29,60 @@ elif torch.cuda.is_available():
33
  else:
34
  device = "cpu"
35
 
36
- schema = pa.schema(
37
- [
38
- pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMB_MODEL_NAME])),
39
- pa.field(TEXT_COLUMN_NAME, pa.string())
40
- ])
41
  tbl = db.create_table(LANCEDB_TABLE_NAME, schema=schema, mode="overwrite")
42
 
43
- input_dir = Path(TEXT_CHUNKS_DIR)
44
  files = list(input_dir.rglob("*"))
45
 
46
- sentences = []
47
  for file in files:
48
- with open(file, encoding='utf-8') as f:
49
- sentences.append(f.read())
50
-
51
- for i in tqdm.tqdm(range(0, int(np.ceil(len(sentences) / batch_size)))):
52
- try:
53
- batch = [sent for sent in sentences[i * batch_size:(i + 1) * batch_size] if len(sent) > 0]
54
- encoded = model.encode(batch, normalize_embeddings=True, device=device)
55
- encoded = [list(vec) for vec in encoded]
56
-
57
- df = pd.DataFrame({
58
- VECTOR_COLUMN_NAME: encoded,
59
- TEXT_COLUMN_NAME: batch
60
- })
61
 
62
- tbl.add(df)
 
 
 
63
 
64
- except:
65
- print(f"batch {i} was skipped: {traceback.format_exc()}")
66
-
67
-
68
- '''
69
- create ivf-pd index https://lancedb.github.io/lancedb/ann_indexes/
70
- with the size of the transformer docs, index is not really needed
71
- but we'll do it for demonstration purposes
72
- '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  # tbl.create_index(num_partitions=256, num_sub_vectors=96, vector_column_name=VECTOR_COLUMN_NAME)
74
 
 
11
 
12
  from sentence_transformers import SentenceTransformer
13
 
14
+ from markdown_to_text import *
15
  from settings import *
16
 
17
 
 
 
 
 
 
18
  shutil.rmtree(LANCEDB_DIRECTORY, ignore_errors=True)
19
  db = lancedb.connect(LANCEDB_DIRECTORY)
20
  batch_size = 32
 
29
  else:
30
  device = "cpu"
31
 
32
# Table schema: one row per text chunk, holding its embedding vector,
# the chunk text, and the absolute path of the source document.
schema = pa.schema([
    pa.field(VECTOR_COLUMN_NAME, pa.list_(pa.float32(), emb_sizes[EMB_MODEL_NAME])),
    pa.field(TEXT_COLUMN_NAME, pa.string()),
    pa.field(DOCUMENT_PATH_COLUMN_NAME, pa.string()),
])
tbl = db.create_table(LANCEDB_TABLE_NAME, schema=schema, mode="overwrite")

input_dir = Path(MARKDOWN_SOURCE_DIR)
files = list(input_dir.rglob("*"))

# Build (chunk_text, absolute_source_path) pairs from every markdown file.
chunks = []
for file in files:
    if not os.path.isfile(file):
        continue

    file_path, file_ext = os.path.splitext(os.path.relpath(file, input_dir))
    if file_ext != '.md':
        print(f'Skipped {file_ext} extension: {file}')
        continue

    # Prefix each chunk with its document path so the embedded text is
    # self-describing ("dir / subdir / name:\n\n<chunk>").
    doc_header = ' / '.join(split_path(file_path)) + ':\n\n'
    with open(file, encoding='utf-8') as f:
        text = f.read()
    text = remove_comments(text)
    for chunk in split_markdown(text):
        chunks.append((doc_header + chunk, os.path.abspath(file)))

# Embed the chunks batch by batch and append them to the LanceDB table.
for i in tqdm.tqdm(range(0, int(np.ceil(len(chunks) / batch_size)))):
    texts, doc_paths = [], []
    for text, doc_path in chunks[i * batch_size:(i + 1) * batch_size]:
        if len(text) > 0:
            texts.append(text)
            doc_paths.append(doc_path)

    # Skip a batch that contained only empty chunks — encode([]) is pointless.
    if not texts:
        continue

    encoded = model.encode(texts, normalize_embeddings=True, device=device)
    encoded = [list(vec) for vec in encoded]

    df = pd.DataFrame({
        VECTOR_COLUMN_NAME: encoded,
        TEXT_COLUMN_NAME: texts,
        DOCUMENT_PATH_COLUMN_NAME: doc_paths,
    })

    tbl.add(df)


# '''
# create ivf-pd index https://lancedb.github.io/lancedb/ann_indexes/
# with the size of the transformer docs, index is not really needed
# but we'll do it for demonstration purposes
# '''
# tbl.create_index(num_partitions=256, num_sub_vectors=96, vector_column_name=VECTOR_COLUMN_NAME)
88
 
prep_scripts/markdown_to_text.py CHANGED
@@ -1,50 +1,94 @@
1
- import shutil
2
-
3
- from bs4 import BeautifulSoup
4
- from markdown import markdown
5
  import os
6
  import re
7
- from pathlib import Path
8
 
9
  from settings import *
10
 
11
 
12
- def markdown_to_text(markdown_string):
13
- """ Converts a markdown string to plaintext """
14
-
15
- # md -> html -> text since BeautifulSoup can extract text cleanly
16
- html = markdown(markdown_string)
17
-
18
- html = re.sub(r'<!--((.|\n)*)-->', '', html)
19
- html = re.sub('<code>bash', '<code>', html)
20
-
21
- # extract text
22
- soup = BeautifulSoup(html, "html.parser")
23
- text = ''.join(soup.findAll(string=True))
24
-
25
- text = re.sub('```(py|diff|python)', '', text)
26
- text = re.sub('```\n', '\n', text)
27
- text = re.sub('- .*', '', text)
28
- text = text.replace('...', '')
29
- text = re.sub('\n(\n)+', '\n\n', text)
30
-
31
- return text
32
-
33
-
34
- dir_to_scrape = Path(MARKDOWN_DIR_TO_SCRAPE)
35
- files = list(dir_to_scrape.rglob("*"))
36
-
37
- shutil.rmtree(TEXT_CHUNKS_DIR, ignore_errors=True)
38
- os.makedirs(TEXT_CHUNKS_DIR)
39
-
40
- for file in files:
41
- parent = file.parent.stem if file.parent.stem != dir_to_scrape.stem else ""
42
- if file.is_file():
43
- with open(file, encoding='utf-8') as f:
44
- md = f.read()
45
-
46
- text = markdown_to_text(md)
47
-
48
- with open(os.path.join(TEXT_CHUNKS_DIR, f"{parent}_{file.stem}.txt"), "w", encoding='utf-8') as f:
49
- f.write(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
 
 
 
 
 
1
  import os
2
  import re
 
3
 
4
  from settings import *
5
 
6
 
7
def split_path(path):
    """Split a filesystem path string into a list of its components.

    os.path-based equivalent of pathlib's ``Path.parts``:
    ``'a/b/c' -> ['a', 'b', 'c']``.  Handles trailing separators
    (``'a/b/' -> ['a', 'b']``; the previous version stopped at the first
    empty tail and returned ``['a/b']``) and returns ``[]`` for ``''``.

    :param path: path string, relative or absolute
    :return: list of components, outermost first (a root like '/' is kept)
    """
    components = []
    while True:
        head, tail = os.path.split(path)
        if tail:
            components.append(tail)
            path = head
        elif head != path:
            # Trailing separator(s): nothing to record, keep reducing.
            path = head
        else:
            # Root ('/') or empty string: os.path.split can no longer shrink it.
            if head:
                components.append(head)
            break
    components.reverse()
    return components
18
+
19
+
20
def remove_comments(md):
    """Strip HTML comments (``<!-- ... -->``) from markdown text.

    Uses a non-greedy match that spans newlines, so text between two
    separate comments is preserved.  The previous greedy pattern
    ``r'<!--((.|\n)*)-->'`` deleted everything from the first ``<!--``
    to the LAST ``-->`` in the document (and backtracked pathologically).
    """
    return re.sub(r'<!--.*?-->', '', md, flags=re.DOTALL)
22
+
23
+
24
# Matches a level 1-3 ATX markdown header on its own line, surrounded by
# blank(-ish) lines; group(1) captures the run of '#' (used as the level).
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s.*\n\s*\n')
25
+
26
+
27
def split_content(content, chunk_size=None):
    """Greedily pack the paragraphs of ``content`` into chunks of roughly
    ``chunk_size`` characters.

    Paragraphs (separated by blank lines) that are much longer than the
    target are further split into individual lines.  A very small trailing
    chunk is merged back into the previous one when that does not make the
    previous chunk overflow.

    :param content: markdown/plain text to split
    :param chunk_size: approximate chunk length in characters; defaults to
                       ``TEXT_CHUNK_SIZE`` from settings (resolved lazily,
                       so the old call signature keeps working unchanged)
    :return: list of chunk strings, each ending with a blank line
    """
    if chunk_size is None:
        chunk_size = TEXT_CHUNK_SIZE

    paragraphs = []
    for p in content.split('\n\n'):
        if len(p) < 2 * chunk_size:
            paragraphs.append(p)
        else:
            # Overlong paragraph (e.g. a big table or code block):
            # fall back to line granularity.
            paragraphs.extend(p.split('\n'))

    res = ['']
    for p in paragraphs:
        if len(res[-1]) + len(p) < chunk_size:
            res[-1] += p + '\n\n'
        else:
            res.append(p + '\n\n')

    # Avoid a tiny last chunk: fold it into the previous chunk if that fits.
    if (
        len(res) >= 2 and
        len(res[-1]) < chunk_size / 4 and
        len(res[-2]) < chunk_size
    ):
        res[-2] += res[-1]
        res.pop()

    return res
52
+
53
+
54
def split_markdown(md):
    """Split a markdown document into header-aware text chunks.

    The text under each level-1..3 header is chunked via ``split_content``,
    and every chunk is prefixed with the names of its enclosing headers so
    it stays self-describing out of context.

    Fix over the previous version: entering a new header now clears the
    stale deeper-level names (e.g. a new '#' section no longer inherits the
    '##'/'###' titles of the previous section).

    :param md: full markdown source text
    :return: list of chunk strings
    """
    def construct_chunks(content):
        # Chunk one section body and emit each piece.
        for part in split_content(content):
            construct_chunk(part)

    def construct_chunk(content):
        content = content.strip()
        if len(content) == 0:
            return
        # Prepend the current header hierarchy (levels 1..3, in order).
        chunk = ''
        for level in sorted(name_hierarchy):
            if len(name_hierarchy[level]) != 0:
                chunk += name_hierarchy[level] + '\n\n'
        chunk += content
        res.append(chunk.strip())

    md = f'\n\n{md}'  # so a header at the very top of the file is matched
    headers = list(header_pattern.finditer(md))
    name_hierarchy = {i: '' for i in (1, 2, 3)}
    res = []
    for i, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group().strip()
        # A new header invalidates every deeper level of the hierarchy.
        for deeper in range(level + 1, 4):
            name_hierarchy[deeper] = ''

        if i == 0 and header.start() != 0:
            # Preamble text before the first header.
            construct_chunks(md[:header.start()])

        start = header.end()
        end = headers[i + 1].start() if i + 1 < len(headers) else None
        construct_chunks(md[start:end])

    if len(headers) == 0:
        construct_chunks(md)

    return res
94
 
settings.py CHANGED
@@ -1,13 +1,21 @@
1
- MARKDOWN_DIR_TO_SCRAPE = "data/transformers/docs/source/en/"
2
- TEXT_CHUNKS_DIR = "data/docs_dump"
3
  EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
4
  LANCEDB_DIRECTORY = "data/lancedb"
5
  LANCEDB_TABLE_NAME = "table"
6
  VECTOR_COLUMN_NAME = "embedding"
7
  TEXT_COLUMN_NAME = "text"
 
8
  HF_LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
9
  OPENAI_LLM_NAME = "gpt-3.5-turbo"
10
 
 
 
 
 
 
 
 
 
11
  context_lengths = {
12
  "mistralai/Mistral-7B-Instruct-v0.1": 4096,
13
  "gpt-3.5-turbo": 4096,
 
1
# Project-wide configuration: data locations, model names, and table schema
# column names for the RAG ingestion pipeline.
MARKDOWN_SOURCE_DIR = "data/transformers/docs/source/en/"

EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LANCEDB_DIRECTORY = "data/lancedb"
LANCEDB_TABLE_NAME = "table"
VECTOR_COLUMN_NAME = "embedding"
TEXT_COLUMN_NAME = "text"
DOCUMENT_PATH_COLUMN_NAME = "document_path"
HF_LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
OPENAI_LLM_NAME = "gpt-3.5-turbo"

# Approximate chunk size in characters, not counting the prepended headers.
# (Was a bare string literal, which is an evaluated-and-discarded statement,
# not a comment.)
TEXT_CHUNK_SIZE = 1000

# Embedding vector dimensionality per model name (used for the table schema).
emb_sizes = {
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    # NOTE(review): 0 looks like an unfilled placeholder — confirm the real
    # dimensionality of thenlper/gte-large before using it as a schema size.
    "thenlper/gte-large": 0
}
18
+
19
  context_lengths = {
20
  "mistralai/Mistral-7B-Instruct-v0.1": 4096,
21
  "gpt-3.5-turbo": 4096,