"""Build a CSV corpus from MOOSE markdown docs and MOOSE input files.

Every ``.md`` file in ``md_directory`` is lower-cased, stripped of header
lines, split into chunks on ``|`` and ``!``, cleaned, and written as one CSV
row per chunk of at least 50 characters.  Every ``.i`` file in
``input_directory`` is lower-cased, cleaned, and written as one row, or as two
overlapping rows when the cleaned text is longer than 3000 characters.

The CSV columns are ``title`` (the file name), ``content``, ``source`` and
``url``.
"""
import csv
import os
import re
from typing import List, Optional

md_directory = "allmd"
input_directory = "allTests"
csv_file = "moose_md_i.csv"
num_files = 2000  # Maximum number of markdown files to process
url = "https://mooseframework.inl.gov/"

_MIN_CHARS = 50  # chunks / input files shorter than this are skipped
_SPLIT_THRESHOLD = 3000  # cleaned input text longer than this is split in two
_SPLIT_OVERLAP = 200  # characters each half extends past the midpoint

_WHITESPACE_RE = re.compile(r"\s+")
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_MD_HEADER_RE = re.compile(r"^#.*$", re.MULTILINE)
# Either delimiter splits a chunk.  The previous pattern '\|!' only matched
# the literal two-character sequence "|!".
_MD_DELIMITER_RE = re.compile(r"[|!]")
# Negated character classes keep a match inside a single link, so an unrelated
# "[...]" earlier on the line cannot swallow the prose before a later link.
_MD_LINK_RE = re.compile(r"\[[^\]]*\]\([^)]*\)")
# The look-arounds require the markers to sit at word boundaries, so the
# underscores inside identifiers such as use_displaced_mesh are left alone.
_MD_BOLD_RE = re.compile(r"(?<!\w)(\*\*|__)(.+?)\1(?!\w)")
_MD_ITALIC_RE = re.compile(r"(?<!\w)(\*|_)(.+?)\1(?!\w)")
_URL_RE = re.compile(r"https?://[^\s]*")
_REPEATED_CHAR_RE = re.compile(r"(.)\1{3,}\s")


def _read_text(path: str) -> str:
    """Return the whole content of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


def _clean_markdown_chunk(chunk: str) -> str:
    """Return *chunk* with markdown markup and punctuation removed.

    The markdown-aware steps run before the generic punctuation strip: once
    ``[^\\w\\s]`` has removed ``*``, ``[``, ``]``, ``(`` and ``)``, the link and
    emphasis patterns have nothing left to match.  Emphasis markers are
    unwrapped (the emphasised words are kept); links are removed entirely.
    """
    text = _WHITESPACE_RE.sub(" ", chunk)
    text = _MD_LINK_RE.sub("", text)
    text = _MD_BOLD_RE.sub(r"\2", text)
    text = _MD_ITALIC_RE.sub(r"\2", text)
    # This also removes any remaining '|' and '"', so no separate step is needed.
    text = _PUNCTUATION_RE.sub("", text)
    text = text.replace("syntax description", "")
    # The removals above can leave runs of spaces behind; collapse them again.
    return _WHITESPACE_RE.sub(" ", text).strip()


def _clean_input_text(raw: str) -> str:
    """Return the lower-cased input-file text *raw* flattened to one line.

    Removes commas, ``#`` characters, http(s) URLs, double quotes and runs of
    four or more identical characters that are followed by whitespace, then
    collapses all whitespace runs to single spaces.
    """
    text = raw.replace(",", "").replace("\n", " ").replace("#", "")
    text = _URL_RE.sub("", text)
    text = _REPEATED_CHAR_RE.sub(" ", text)
    text = text.replace('"', "")
    return _WHITESPACE_RE.sub(" ", text)


def _split_with_overlap(
    text: str,
    threshold: int = _SPLIT_THRESHOLD,
    overlap: int = _SPLIT_OVERLAP,
) -> List[str]:
    """Return *text* whole, or as two halves overlapping around the midpoint.

    Text of at most *threshold* characters is returned as a one-element list.
    Longer text yields two pieces, each extending *overlap* characters past
    the midpoint.
    """
    if len(text) <= threshold:
        return [text]
    mid = len(text) // 2
    return [text[: mid + overlap], text[max(mid - overlap, 0):]]


def main(
    md_dir: str = md_directory,
    input_dir: str = input_directory,
    out_path: str = csv_file,
    max_md_files: Optional[int] = num_files,
) -> None:
    """Write the CSV corpus to *out_path*.

    Args:
        md_dir: Directory scanned (non-recursively) for ``.md`` files.
        input_dir: Directory scanned (non-recursively) for ``.i`` files.
        out_path: CSV file to create or overwrite.
        max_md_files: Maximum number of markdown files to process, or
            ``None`` for no limit.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a source file is not valid UTF-8.
    """
    # Sort so that row order, and the subset picked by max_md_files, do not
    # depend on the arbitrary order os.listdir returns.
    md_files = sorted(f for f in os.listdir(md_dir) if f.endswith(".md"))
    md_files = md_files[:max_md_files]
    input_files = sorted(f for f in os.listdir(input_dir) if f.endswith(".i"))

    with open(out_path, "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["title", "content", "source", "url"])

        for md_file in md_files:
            raw = _read_text(os.path.join(md_dir, md_file)).lower()
            body = _MD_HEADER_RE.sub("", raw)
            for piece in _MD_DELIMITER_RE.split(body):
                text = _clean_markdown_chunk(piece)
                if len(text) >= _MIN_CHARS:
                    writer.writerow([md_file, text, "MOOSE Docs md files", url])

        for input_file in input_files:
            raw = _read_text(os.path.join(input_dir, input_file)).lower()
            # The length check is applied to the raw text, before cleaning.
            if len(raw) < _MIN_CHARS:
                continue
            for piece in _split_with_overlap(_clean_input_text(raw)):
                writer.writerow([input_file, piece, "MOOSE Docs input files", url])


if __name__ == "__main__":
    main()