File size: 3,585 Bytes
387e2e0
 
 
 
 
23ead00
 
387e2e0
 
 
 
23ead00
 
 
387e2e0
 
 
 
23ead00
387e2e0
 
 
 
 
23ead00
387e2e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23ead00
387e2e0
23ead00
 
 
 
835920f
 
 
23ead00
 
 
 
 
 
 
 
 
 
 
835920f
 
 
 
 
 
23ead00
 
 
 
 
 
 
 
835920f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import csv
import re

# Build a CSV of cleaned, lower-cased text taken from the markdown files in
# `md_directory` and the `.i` files in `input_directory`. Each output row is
# (title, content, source, url), where title is the originating file name.

md_directory = "allmd"
input_directory = "allTests"
csv_file = "moose_md_i.csv"
# NOTE(review): declared as a cap on the number of files to process, but
# nothing below reads it, so every matching file is processed. Either apply
# it to the loops or remove it.
num_files = 2000

url = "https://mooseframework.inl.gov/"

# Markdown chunks shorter than this after cleaning are dropped; input files
# shorter than this before cleaning are skipped.
MIN_TEXT_LENGTH = 50
# Cleaned input-file text longer than this is written as two overlapping rows.
MAX_INPUT_LENGTH = 3000
# Each of the two rows extends this many characters past the midpoint.
HALF_OVERLAP = 200

# Markdown header lines (a line starting with '#').
_MD_HEADER_RE = re.compile(r'^#.*$', flags=re.MULTILINE)

# NOTE(review): this splits only on the literal two-character sequence "|!".
# If the intent was to split on either '|' or '!', the pattern should be
# r'[|!]' -- confirm before changing, because it alters the generated rows.
_MD_DELIMITER_RE = re.compile(r'\|!')

# Substitutions applied, in this order, to every markdown chunk.
# NOTE(review): punctuation is stripped by the second entry, so by the time
# the emphasis / link / pipe / quote entries run, the characters '*', '[',
# ']', '(', ')', '|' and '"' are already gone. Those entries can therefore
# only act on underscores: any text between a pair of underscores is deleted
# together with the underscores. The order is kept so the output is unchanged.
_MD_CHUNK_SUBSTITUTIONS = (
    (re.compile(r'\s+'), ' '),                  # collapse whitespace runs
    (re.compile(r'[^\w\s]'), ''),               # strip punctuation
    (re.compile(r'(\*\*|__)(.*?)\1'), ''),      # bold spans
    (re.compile(r'(\*|_)(.*?)\1'), ''),         # italic spans
    (re.compile(r'\[.*?\]\(.*?\)'), ''),        # links
    (re.compile(r'\|'), ''),                    # pipes
    (re.compile(r'syntax description'), ''),    # the phrase "syntax description"
    (re.compile(r'\"'), ''),                    # double quotes
)

_URL_RE = re.compile(r'https?:\/\/[^\s]*')
# A character repeated four or more times and followed by whitespace.
_REPEATED_CHAR_RE = re.compile(r'(.)\1{3,}\s')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_markdown_chunk(text):
    """Return *text* after applying every markdown substitution in order."""
    for pattern, replacement in _MD_CHUNK_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text


def _clean_input_text(content):
    """Return the text of a `.i` file flattened to a single cleaned line.

    The step order matters (for example, quotes are removed only after the
    repeated-character pass), so it must not be rearranged.
    """
    content = content.replace(',', '')
    content = content.replace('\n', ' ')
    content = content.replace('#', '')
    content = _URL_RE.sub('', content)
    content = _REPEATED_CHAR_RE.sub(' ', content)
    content = content.replace('"', '')
    return _WHITESPACE_RE.sub(' ', content)


def _split_for_output(content):
    """Return *content* as one row, or as two overlapping halves if too long."""
    if len(content) <= MAX_INPUT_LENGTH:
        return [content]
    mid = len(content) // 2
    return [content[:mid + HALF_OVERLAP], content[mid - HALF_OVERLAP:]]


# os.listdir returns names in arbitrary order; sort so that the row order of
# the generated CSV is reproducible from run to run.
md_files = sorted(f for f in os.listdir(md_directory) if f.endswith(".md"))
input_files = sorted(f for f in os.listdir(input_directory) if f.endswith(".i"))

# Explicit UTF-8 everywhere so the result does not depend on the platform's
# default locale encoding.
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["title", "content", "source", "url"])  # header row

    # Markdown files: drop header lines, split into chunks, clean each chunk,
    # and write every chunk that is still long enough.
    source = "MOOSE Docs md files"
    for md_file in md_files:
        md_path = os.path.join(md_directory, md_file)
        with open(md_path, "r", encoding="utf-8") as f:
            content = f.read().lower()
        content = _MD_HEADER_RE.sub('', content)

        for text in _MD_DELIMITER_RE.split(content):
            text = _clean_markdown_chunk(text)
            if len(text) < MIN_TEXT_LENGTH:
                continue
            writer.writerow([md_file, text, source, url])

    # Input files: skip files that are too short (measured before cleaning),
    # then write the cleaned text as one row or two overlapping rows.
    source = "MOOSE Docs input files"
    for inputfile in input_files:
        input_path = os.path.join(input_directory, inputfile)
        with open(input_path, "r", encoding="utf-8") as f:
            content = f.read().lower()
        if len(content) < MIN_TEXT_LENGTH:
            continue

        for text in _split_for_output(_clean_input_text(content)):
            writer.writerow([inputfile, text, source, url])