ppsingh committed on
Commit
44f0ae2
·
verified ·
1 Parent(s): 25a09d3

Create doc_process.py

Browse files
Files changed (1) hide show
  1. doc_process.py +27 -0
doc_process.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
3
+ from langchain_community.document_loaders import UnstructuredMarkdownLoader
4
# Directory (with trailing slash) that is scanned for Markdown files.
path_to_data = "./data/"


def process_markdown():
    """Load every ``*.md`` file in ``path_to_data`` and split it on headers.

    Each file is loaded with ``UnstructuredMarkdownLoader``; the text of every
    loaded document is then passed to a ``MarkdownHeaderTextSplitter``
    configured for header levels 1 through 5.

    A file that fails to load is reported on stdout and skipped, so one bad
    file does not abort the whole run.

    Returns:
        A list with one entry per successfully loaded document, each entry
        being whatever ``MarkdownHeaderTextSplitter.split_text`` returned for
        that document's text. The list is empty when no file was found or
        every load failed.
    """
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ("####", "Header 4"),
        ("#####", "Header 5"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    files = glob.glob(path_to_data + "*.md")
    docs = []
    for file in files:
        try:
            loader = UnstructuredMarkdownLoader(file)
            # load() returns a list of documents; extend() keeps ``docs`` flat
            # so each item below is a single document, not a nested list.
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort: report the failure and continue with the next file.
            print("Exception: ", e)

    # split_text() expects the Markdown text itself, not the document wrapper.
    # NOTE(review): UnstructuredMarkdownLoader may hand back page_content with
    # the Markdown markup already stripped, in which case no header splits
    # would occur -- confirm; reading the raw file text may be needed instead.
    docs_processed = [markdown_splitter.split_text(doc.page_content) for doc in docs]

    print(len(docs_processed))
    # Guard the preview: indexing [0] on an empty result raises IndexError.
    if docs_processed:
        print(docs_processed[0])
    return docs_processed