"""Build a CSV text corpus from MOOSE markdown docs and MOOSE input files.

Every ``.md`` file in ``md_directory`` and every ``.i`` file in
``input_directory`` is read, lower-cased, cleaned and written to ``csv_file``
as rows of ``title, content, source, url``.
"""

import csv
import os
import re

md_directory = "allmd"
input_directory = "allTests"
csv_file = "moose_md_i.csv"
# NOTE(review): not referenced anywhere in this script; kept only so the
# module-level name stays available. Confirm whether a file cap was intended.
num_files = 2000

url = "https://mooseframework.inl.gov/"

# Fragments / files shorter than this many characters are skipped.
MIN_CHARS = 50
# Cleaned input files longer than SPLIT_THRESHOLD characters are written as
# two halves that each extend SPLIT_OVERLAP characters past the midpoint.
SPLIT_THRESHOLD = 3000
SPLIT_OVERLAP = 200

_HEADING_RE = re.compile(r'^#.*$', re.MULTILINE)
# NOTE(review): this matches the literal two-character sequence "|!" (same as
# the original pattern, now written as a raw string). If the intent was to
# split on either "|" or "!", the pattern should be r'[|!]' -- confirm.
_FRAGMENT_SPLIT_RE = re.compile(r'\|!')
_WHITESPACE_RE = re.compile(r'\s+')
_LINK_RE = re.compile(r'\[(.*?)\]\(.*?\)')
# Emphasis delimiters must sit at word boundaries and hug their content, so
# snake_case identifiers, list bullets and arithmetic are left alone.
_BOLD_RE = re.compile(r'(?<!\w)(\*\*|__)(?=\S)(.+?)(?<=\S)\1(?!\w)')
_ITALIC_RE = re.compile(r'(?<!\w)(\*|_)(?=\S)(.+?)(?<=\S)\1(?!\w)')
_PUNCT_RE = re.compile(r'[^\w\s]')
_URL_RE = re.compile(r'https?:\/\/[^\s]*')
_REPEAT_RE = re.compile(r'(.)\1{3,}\s')


def clean_md_fragment(text):
    """Return *text* with markdown markup and punctuation removed.

    Steps, in order: collapse whitespace runs to one space; replace inline
    links with their label; unwrap bold and italic spans (the words are kept,
    the markers dropped); strip every character that is neither a word
    character nor whitespace; remove the phrase ``syntax description``.

    The markdown-aware steps run *before* the punctuation strip. Previously
    they ran after it, so the ``*``, link, ``|`` and quote patterns could
    never match, while the underscore emphasis pattern deleted everything
    between any two underscores (e.g. ``use_displaced_mesh`` -> ``usemesh``).
    The punctuation strip already removes ``|`` and double quotes, so no
    separate step is needed for those.

    NOTE(review): emphasis and links are unwrapped rather than deleted, so
    emphasised words stay in the corpus -- confirm this is the desired output.
    """
    text = _WHITESPACE_RE.sub(' ', text)
    text = _LINK_RE.sub(r'\1', text)
    text = _BOLD_RE.sub(r'\2', text)
    text = _ITALIC_RE.sub(r'\2', text)
    text = _PUNCT_RE.sub('', text)
    return text.replace('syntax description', '')


def split_md_fragments(raw):
    """Return the cleaned fragments of one markdown document as a list of str.

    The document is lower-cased, lines starting with ``#`` are blanked, the
    text is split on the literal ``|!`` sequence and each piece is cleaned
    with :func:`clean_md_fragment`. Pieces shorter than ``MIN_CHARS`` after
    cleaning are dropped.
    """
    body = _HEADING_RE.sub('', raw.lower())
    cleaned = (clean_md_fragment(part) for part in _FRAGMENT_SPLIT_RE.split(body))
    return [fragment for fragment in cleaned if len(fragment) >= MIN_CHARS]


def clean_input_text(content):
    """Return the text of an input file flattened onto a single line.

    Commas, ``#`` characters and double quotes are removed, newlines become
    spaces, ``http(s)://`` URLs are dropped, any run of four or more identical
    characters that is followed by whitespace is replaced by one space, and
    finally whitespace runs are collapsed.
    """
    content = content.replace(',', '')
    content = content.replace('\n', ' ')
    content = content.replace('#', '')
    content = _URL_RE.sub('', content)
    content = _REPEAT_RE.sub(' ', content)
    content = content.replace('"', '')
    return _WHITESPACE_RE.sub(' ', content)


def split_with_overlap(text, limit=SPLIT_THRESHOLD, overlap=SPLIT_OVERLAP):
    """Return ``[text]``, or two overlapping halves when it exceeds *limit*.

    Each half extends *overlap* characters past the midpoint, so the two
    pieces share ``2 * overlap`` characters around the middle.
    """
    if len(text) <= limit:
        return [text]
    mid = len(text) // 2
    # max() keeps the start index non-negative for small custom limits.
    return [text[:mid + overlap], text[max(mid - overlap, 0):]]


def input_chunks(raw):
    """Return the CSV-ready chunk(s) for one input file, or ``[]`` to skip it.

    The length check is applied to the lower-cased raw text, before cleaning.
    """
    lowered = raw.lower()
    if len(lowered) < MIN_CHARS:
        return []
    return split_with_overlap(clean_input_text(lowered))


def main():
    """Read both source directories and write the combined CSV file."""
    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    md_files = sorted(f for f in os.listdir(md_directory) if f.endswith(".md"))
    input_files = sorted(
        f for f in os.listdir(input_directory) if f.endswith(".i")
    )

    with open(csv_file, "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["title", "content", "source", "url"])

        for md_file in md_files:
            path = os.path.join(md_directory, md_file)
            with open(path, "r", encoding="utf-8") as f:
                raw = f.read()
            for fragment in split_md_fragments(raw):
                writer.writerow([md_file, fragment, "MOOSE Docs md files", url])

        for inputfile in input_files:
            path = os.path.join(input_directory, inputfile)
            with open(path, "r", encoding="utf-8") as f:
                raw = f.read()
            for chunk in input_chunks(raw):
                writer.writerow(
                    [inputfile, chunk, "MOOSE Docs input files", url]
                )


if __name__ == "__main__":
    main()