import os import re from bs4 import BeautifulSoup from markdown import markdown from settings import * def split_path(path): components = [] while True: path, tail = os.path.split(path) if tail == "": if path != "": components.append(path) break components.append(tail) components.reverse() return components def remove_comments(md): return re.sub(r'', '', md) header_pattern = re.compile(r'\n\s*\n(#{1,3})\s(.*)\n\s*\n') def split_content(content): text_chunk_size = context_lengths[EMBED_NAME] - 32 _parts = content.split('\n\n') parts = [] for p in _parts: if len(p) < text_chunk_size: parts.append(p) else: parts.extend(p.split('\n')) res = [''] for p in parts: if len(res[-1]) + len(p) < text_chunk_size: res[-1] += p + '\n\n' else: res.append(p + '\n\n') return res def split_markdown(md): def construct_chunks(content): parts = split_content(content) for p in parts: construct_chunk(p) def construct_chunk(content): content = content.strip() if len(content) == 0: return chunk = '' for i in sorted(name_hierarchy): if len(name_hierarchy[i]) != 0: j = i + 1 while j in name_hierarchy: if name_hierarchy[j].find(name_hierarchy[i]) != -1: break j += 1 else: chunk += f'{"#" * (i + 1)}{name_hierarchy[i]}\n\n' chunk += content chunk = chunk.strip() res.append(chunk) # to find a header at the top of a file md = f'\n\n{md}' headers = list(header_pattern.finditer(md)) # only first header can be first-level headers = [h for i, h in enumerate(headers) if i == 0 or len(h.group(1)) > 1] name_hierarchy = {i: '' for i in (1, 2, 3)} res = [] for i in range(len(headers)): header = headers[i] level = len(header.group(1)) name = header.group(2).strip() name_hierarchy[level] = name if i == 0 and header.start() != 0: construct_chunks(md[:header.start()]) start = header.end() end = headers[i + 1].start() if i + 1 < len(headers) else None construct_chunks(md[start:end]) if len(headers) == 0: construct_chunks(md) return res def markdown_to_text(markdown_string): """ Converts a markdown string to plaintext """ # md -> html -> text since BeautifulSoup can extract text cleanly html = markdown(markdown_string) html = re.sub(r'', '', html) html = re.sub('bash', '', html) # extract text soup = BeautifulSoup(html, "html.parser") text = ''.join(soup.findAll(string=True)) text = re.sub('```(py|diff|python)', '', text) text = re.sub('```\n', '\n', text) text = re.sub('- .*', '', text) text = text.replace('...', '') text = re.sub('\n(\n)+', '\n\n', text) return text def md2txt_then_split(md): txt = markdown_to_text(md) return split_content(txt)