MightyOctopus committed on
Commit
3014654
·
verified ·
1 Parent(s): aa04185

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +41 -0
utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from html import unescape
3
+
4
+
5
def clean_text(t: str) -> str:
    """Strip Markdown from *t* and flatten it into single-spaced text.

    The text is first passed through ``clean_markdown``. Newlines, tabs and
    carriage returns are then turned into spaces, runs of spaces are
    collapsed to one, and a space directly before a comma is removed.

    Leading/trailing whitespace is NOT stripped; a leading or trailing
    single space may remain in the result.

    Args:
        t: Markdown (or plain) text to clean.

    Returns:
        The cleaned, single-line string.
    """
    t = clean_markdown(t)
    # Literal two-character "^M" sequence (presumably a carriage return as
    # rendered by some editors/terminals); real "\r" is handled below.
    t = t.replace("^M", " ")
    # Line breaks and tabs become plain spaces.
    t = re.sub(r"[\n\t\r]", " ", t)
    # Collapse runs of spaces BEFORE fixing commas: doing it the other way
    # round leaves "a  ,b" as "a ,b" because only one space gets removed.
    t = re.sub(r" +", " ", t)
    t = t.replace(" ,", ",")
    return t
14
+
15
def clean_markdown(md_text: str) -> str:
    """Return *md_text* with common Markdown/HTML markup removed.

    Removed entirely: fenced code blocks, inline code spans, images,
    blockquote lines, horizontal rules, pipe-delimited table rows and raw
    HTML tags/comments. Reduced to their text: links, bold/italic emphasis,
    headings and list items. HTML entities are decoded last.

    This is a regex-based best-effort cleaner, not a Markdown parser.

    Args:
        md_text: Markdown source text.

    Returns:
        The text with markup stripped. Line breaks are preserved.
    """
    # Remove code blocks
    md_text = re.sub(r'```.*?```', '', md_text, flags=re.DOTALL)
    # Remove inline code
    md_text = re.sub(r'`[^`]*`', '', md_text)
    # Remove images (must run before links: same syntax with a leading "!")
    md_text = re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
    # Remove links but keep link text
    md_text = re.sub(r'\[([^\]]+)\]\(.*?\)', r'\1', md_text)

    # Horizontal rules and list markers are handled BEFORE emphasis: they
    # also use "*" / "_" and would otherwise be half-consumed by the
    # emphasis patterns ("***" -> "*", "* item *x*" -> " item x*").
    # Remove horizontal rules
    md_text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', md_text,
                     flags=re.MULTILINE)
    # Remove list markers (bulleted or numbered, optionally indented)
    md_text = re.sub(r'^[ \t]*(?:[-*+]|\d+\.)[ \t]+', '', md_text,
                     flags=re.MULTILINE)

    # Remove bold and italic. Delimiters must hug non-space text so that
    # arithmetic like "2 * 3 * 4" survives, and underscore delimiters must
    # not sit inside a word so snake_case identifiers are left intact.
    md_text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'\1', md_text)
    md_text = re.sub(r'(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)', r'\1', md_text)
    md_text = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'\1', md_text)
    md_text = re.sub(r'(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)', r'\1', md_text)

    # Remove heading markers; anchored to the line start so that a "#"
    # inside running text (e.g. "C# ") is not touched.
    md_text = re.sub(r'^[ \t]*#+[ \t]+', '', md_text, flags=re.MULTILINE)
    # Remove blockquotes (the whole quoted line is dropped)
    md_text = re.sub(r'^>.*$', '', md_text, flags=re.MULTILINE)
    # Remove tables: drop every line that starts and ends with a pipe,
    # instead of deleting alternating cells and leaving fragments behind.
    md_text = re.sub(r'^[ \t]*\|.*\|[ \t]*$', '', md_text,
                     flags=re.MULTILINE)
    # Remove raw HTML comments and tags. A tag must start with a letter
    # (optionally preceded by "/" or "!") so "a < b and c > d" is kept.
    md_text = re.sub(r'<!--.*?-->|<[/!]?[A-Za-z][^>]*>', '', md_text,
                     flags=re.DOTALL)
    # Decode HTML entities (after tag removal, so "&lt;b&gt;" stays text)
    md_text = unescape(md_text)
    return md_text