import os
import re

from bs4 import BeautifulSoup
from markdown import markdown

# `context_lengths` and `EMBED_NAME` (used by split_content) are not defined
# in this module, so they are expected to come from this star import.
from settings import *


def split_path(path):
    """Split *path* into the list of its components, outermost first.

    ``os.path.split`` is applied repeatedly until no tail is left. Whatever
    head remains at that point (for example a root such as ``/``) becomes the
    first component when it is non-empty.
    """
    components = []
    while True:
        path, tail = os.path.split(path)
        if tail == "":
            if path != "":
                components.append(path)
            break
        components.append(tail)
    components.reverse()
    return components


# NOTE(review): the pattern originally passed to re.sub in remove_comments was
# an empty string, which made the function a no-op. It is restored here on the
# assumption that "comments" means HTML-style comments -- confirm.
_html_comment_pattern = re.compile(r'<!--.*?-->', re.DOTALL)


def remove_comments(md):
    """Return *md* with HTML-style comments (``<!-- ... -->``) removed.

    Comments may span several lines; each one is matched non-greedily so that
    text between two separate comments is preserved.
    """
    return _html_comment_pattern.sub('', md)


# A header of level 1-3 ("#", "##" or "###") that has a blank line both
# before and after it. Group 1 holds the hashes, group 2 the header text.
header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n')


def split_content(content):
    """Split *content* into chunks that fit the embedding context length.

    The size limit is ``context_lengths[EMBED_NAME] - 32`` and is compared
    against ``len()`` of the text. The text is split on blank lines first;
    any paragraph that reaches the limit is split further into single lines.
    The pieces are then packed greedily, each followed by a blank line.

    Returns a list with at least one string. A single line that is longer
    than the limit is not split any further and ends up in its own chunk.
    """
    text_chunk_size = context_lengths[EMBED_NAME] - 32

    pieces = []
    for paragraph in content.split('\n\n'):
        if len(paragraph) < text_chunk_size:
            pieces.append(paragraph)
        else:
            pieces.extend(paragraph.split('\n'))

    chunks = ['']
    for piece in pieces:
        # Open a new chunk only when the current one already holds text;
        # otherwise an oversized first piece would leave an empty chunk at
        # the front of the result.
        if chunks[-1] and len(chunks[-1]) + len(piece) >= text_chunk_size:
            chunks.append('')
        chunks[-1] += piece + '\n\n'
    return chunks


def split_markdown(md):
    """Split a markdown document into chunks prefixed with their headers.

    The document is cut at every header matched by ``header_pattern``; only
    the very first header may be first-level, later first-level headers are
    treated as ordinary content. Each section is divided with
    ``split_content`` and every non-empty piece is prefixed with the headers
    that are currently in effect.

    Returns the list of chunk strings.
    """
    name_hierarchy = {level: '' for level in (1, 2, 3)}
    res = []

    def construct_chunk(content):
        content = content.strip()
        if not content:
            return

        chunk = ''
        for i in sorted(name_hierarchy):
            name = name_hierarchy[i]
            if not name:
                continue
            # Skip a header whose text is repeated inside a deeper header.
            repeated = any(
                name in name_hierarchy[j] for j in name_hierarchy if j > i
            )
            if not repeated:
                chunk += f'{"#" * (i + 1)}{name}\n\n'

        chunk += content
        res.append(chunk.strip())

    def construct_chunks(content):
        for part in split_content(content):
            construct_chunk(part)

    # to find a header at the top of a file
    md = f'\n\n{md}'

    # header_pattern consumes the blank line after a header, so a plain
    # finditer() misses a header that directly follows another one. Resuming
    # the search right after the header text lets that blank line also serve
    # as the leading blank line of the next header.
    headers = []
    pos = 0
    while True:
        match = header_pattern.search(md, pos)
        if match is None:
            break
        headers.append(match)
        pos = match.end(2)

    # only first header can be first-level
    headers = [
        h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1
    ]

    for i, header in enumerate(headers):
        level = len(header.group(1))
        name_hierarchy[level] = header.group(2).strip()
        # A new header ends every deeper section that was open before it;
        # without this, stale sub-headers leak into later chunks.
        for deeper in name_hierarchy:
            if deeper > level:
                name_hierarchy[deeper] = ''

        if i == 0 and header.start() != 0:
            construct_chunks(md[:header.start()])

        # For back-to-back headers start > end, which yields an empty slice
        # and therefore no chunk.
        start = header.end()
        end = headers[i + 1].start() if i + 1 < len(headers) else None
        construct_chunks(md[start:end])

    if not headers:
        construct_chunks(md)
    return res


def markdown_to_text(markdown_string):
    """Convert a markdown string to plaintext.

    The markdown is rendered to HTML, the text nodes are extracted with
    BeautifulSoup, and leftover code-fence markers, lines of the form
    ``-`` followed by nine spaces, ``...`` sequences and repeated blank
    lines are cleaned up.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)
    # NOTE(review): the next two patterns look like they lost HTML markup
    # somewhere along the way (an empty pattern is a no-op, and the second
    # one only matches "bash" after a blank line). They are kept exactly as
    # found -- confirm the intended patterns.
    html = re.sub(r'', '', html)
    html = re.sub('\n\nbash', '', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))

    # "python" must come before "py": alternatives are tried left to right,
    # so the short form would otherwise win and leave "thon" behind.
    text = re.sub('```(python|py|diff)', '', text)
    text = re.sub('```\n', '\n', text)
    text = re.sub('-         .*', '', text)
    text = text.replace('...', '')
    text = re.sub('\n(\n)+', '\n\n', text)

    return text


def md2txt_then_split(md):
    """Convert markdown *md* to plaintext and split the result into chunks.

    The conversion is done by ``markdown_to_text`` and the chunking by
    ``split_content``; the list returned by the latter is passed through
    unchanged.
    """
    return split_content(markdown_to_text(md))