import json
import os

import pandas as pd
import tiktoken
from langchain.text_splitter import MarkdownHeaderTextSplitter
from tqdm import tqdm


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in `string` under the given tiktoken encoding."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(string))


def drop_outlier_chunks(
    df: pd.DataFrame, max_tokens_by_chunk: int = 4500
) -> pd.DataFrame:
    """Drop chunks with abnormally high token counts from `df`.

    Outlier chunks usually contain lots of links rather than useful prose.
    Prints a summary of how many chunks were dropped and their contents.

    Args:
        df: DataFrame with a ``content`` string column.
        max_tokens_by_chunk: chunks with >= this many tokens are dropped.

    Returns:
        A filtered copy of ``df`` keeping only rows strictly under the limit.
    """
    # Fix: the original encoded every chunk twice (once for the kept rows,
    # once for the outliers). Tokenizing is expensive — do it once.
    token_counts = df.content.apply(num_tokens_from_string)
    keep_mask = token_counts < max_tokens_by_chunk
    filtered_df = df[keep_mask]
    outliers_df = df[~keep_mask]
    print(f"Dropping {len(df) - len(filtered_df)} outlier chunks")
    print(f"Dropped outliers: {outliers_df.content.to_list()}")
    return filtered_df


def find_md_files(folder_path: str) -> list[dict]:
    """Recursively find .md files, extract content and use filename as title.

    Returns a list of ``{"title": ..., "content": ...}`` dicts.
    """
    md_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".md"):
                file_path = os.path.join(root, file)
                title = os.path.splitext(file)[0]
                # Remove the trailing junk (the last word is some kind of hash,
                # e.g. as produced by a Notion export).
                # NOTE(review): a single-word filename yields an empty title
                # here — assumed not to occur in these exports; confirm.
                title = " ".join(title.split()[:-1])
                with open(file_path, "r", encoding="utf-8") as md_file:
                    content = md_file.read()
                md_files.append({"title": title, "content": content})
    return md_files


def split_string_by_max_words(input_string: str, max_words: int) -> list[str]:
    """Split `input_string` into consecutive pieces of at most `max_words` words."""
    words = input_string.split()
    return [
        " ".join(words[i : i + max_words]) for i in range(0, len(words), max_words)
    ]


def get_title_link_from_md_title(md_title: str, title_link_data: list[dict]):
    """Look up the canonical title and link for a markdown file's title.

    Fix: annotated ``title_link_data`` as ``list[dict]`` — the original said
    ``dict`` but the code iterates a list of ``{"title", "link"}`` records.

    The match is fuzzy: punctuation that the export strips from filenames
    (":", ".", "? ") is removed from the candidate title before the
    substring test.
    """
    for data in title_link_data:
        title = data["title"]
        if md_title in title.replace(":", "").replace(".", " ").replace("? ", ""):
            return data["title"], data["link"]
    # Default back to the course link if not found...
    print("\nNot found: ", md_title)
    return md_title, "https://learn.activeloop.ai/courses/langchain/"


if __name__ == "__main__":
    # Fix: removed the dead placeholder assignment ("/path/to/folder/...")
    # that was immediately overwritten by the real path below.
    folder_path = "/Users/jeremypinto/Downloads/d22d1e98-345f-490d-870e-3b082938741c_Export-0a33c13f-6d42-4a94-8f23-7459e7b2c024"

    md_files = find_md_files(folder_path)

    headers_to_split_on = [
        ("#", "#"),
        ("##", "##"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )

    with open("title_link_langchaincourse.json", "r") as f:
        title_link_data = json.load(f)

    chunks = []
    for md_file in tqdm(md_files):
        md_title = md_file["title"]
        md_raw_content = md_file["content"]
        md_header_splits = markdown_splitter.split_text(md_raw_content)
        title, link = get_title_link_from_md_title(
            md_title, title_link_data=title_link_data
        )

        for split in md_header_splits:
            # Add the markdown headers back to the content so each chunk
            # keeps its section context.
            headers = "\n".join(
                [k + " " + v for k, v in split.metadata.items()]
            )
            substrings = split_string_by_max_words(split.page_content, max_words=600)
            for substring in substrings:
                chunk = {
                    "title": title,
                    "content": headers + "\n" + substring,
                    "source": "TAI Course",
                    "url": link,
                }
                chunks.append(chunk)

    df = pd.DataFrame(chunks)
    df = drop_outlier_chunks(df, max_tokens_by_chunk=2000)
    print(f"Exported {len(df)} chunks from {len(md_files)} articles.")
    # Fix: index=False — the original wrote a spurious unnamed index column.
    df.to_csv("langchain_course.csv", index=False)