# parrot / generateCSV.py
# aikubo's picture
# Upload folder using huggingface_hub
# 835920f verified
import os
import csv
import re
# --- Configuration -----------------------------------------------------------
md_directory = "allmd"  # directory scanned for "*.md" documentation files
input_directory = "allTests"  # directory scanned for "*.i" input files
csv_file = "moose_md_i.csv"  # output CSV path (overwritten on every run)
# NOTE(review): declared as a cap on the number of files to process, but no
# loop has ever applied it; kept for compatibility. Confirm whether a limit is
# actually wanted before wiring it in, since it would silently drop files.
num_files = 2000
url = "https://mooseframework.inl.gov/"  # written verbatim into every row

_MIN_CHARS = 50  # texts shorter than this are not written
_MAX_INPUT_CHARS = 3000  # longer input files are split into two chunks
_CHUNK_OVERLAP = 200  # characters each half extends past the midpoint

_WHITESPACE = re.compile(r'\s+')
# Negated character classes keep a match inside a single [text](target) pair.
_MD_LINK = re.compile(r'\[[^\]]*\]\([^)]*\)')
# Underscore emphasis only counts when the markers sit at word boundaries, so
# snake_case identifiers are left alone.
_MD_BOLD_UNDERSCORE = re.compile(r'(?<!\w)__(.+?)__(?!\w)')
_MD_ITALIC_UNDERSCORE = re.compile(r'(?<!\w)_(.+?)_(?!\w)')
_PUNCTUATION = re.compile(r'[^\w\s]')


def _list_files(directory, suffix):
    """Return the names in *directory* ending with *suffix*, sorted."""
    # Sorting makes the row order of the CSV reproducible across runs.
    return sorted(name for name in os.listdir(directory) if name.endswith(suffix))


def _split_markdown(content):
    """Lower-case *content*, blank out its header lines and split it into chunks."""
    content = content.lower()
    content = re.sub(r'^#.*$', '', content, flags=re.MULTILINE)  # drop "# ..." lines
    # NOTE(review): this pattern matches the literal two-character sequence
    # "|!". The original comment spoke of "multiple delimiters", so an
    # alternation such as r'[|!]' may have been intended - confirm before
    # changing, as it alters how every file is chunked.
    return re.split(r'\|!', content)


def _clean_markdown_chunk(text):
    """Return *text* with Markdown markup and punctuation removed.

    Markup is handled before punctuation is stripped; once the brackets and
    parentheses are gone a link can no longer be recognised.
    """
    text = _WHITESPACE.sub(' ', text)
    text = _MD_LINK.sub('', text)  # links are removed entirely
    # Unwrap underscore emphasis, keeping the emphasised words.
    text = _MD_BOLD_UNDERSCORE.sub(r'\1', text)
    text = _MD_ITALIC_UNDERSCORE.sub(r'\1', text)
    # Everything that is neither a word character nor whitespace goes here,
    # which also covers "*"/"**" emphasis markers, "|" and double quotes.
    text = _PUNCTUATION.sub('', text)
    text = text.replace('syntax description', '')
    # Collapse the gaps left behind by the removals above.
    return _WHITESPACE.sub(' ', text)


def _clean_input_text(content):
    """Return lower-cased input-file text flattened to one cleaned line."""
    content = content.replace(',', '')
    content = content.replace('\n', ' ')
    content = content.replace('#', '')
    content = re.sub(r'https?://[^\s]*', '', content)  # drop http(s) links
    # A character repeated four or more times and followed by whitespace
    # is replaced by a single space.
    content = re.sub(r'(.)\1{3,}\s', ' ', content)
    content = content.replace('"', '')
    return _WHITESPACE.sub(' ', content)


def _chunk_input_text(content):
    """Return *content* whole, or as two overlapping halves when it is long."""
    if len(content) <= _MAX_INPUT_CHARS:
        return [content]
    mid = len(content) // 2
    return [content[:mid + _CHUNK_OVERLAP], content[mid - _CHUNK_OVERLAP:]]


def _markdown_rows(names):
    """Yield one CSV row per sufficiently long chunk of each Markdown file."""
    source = "MOOSE Docs md files"
    for name in names:
        with open(os.path.join(md_directory, name), "r", encoding="utf-8") as handle:
            raw = handle.read()
        for chunk in _split_markdown(raw):
            text = _clean_markdown_chunk(chunk)
            if len(text) < _MIN_CHARS:
                continue
            yield [name, text, source, url]


def _input_rows(names):
    """Yield one or two CSV rows for each sufficiently long input file."""
    source = "MOOSE Docs input files"
    for name in names:
        with open(os.path.join(input_directory, name), "r", encoding="utf-8") as handle:
            content = handle.read().lower()
        # The length filter applies to the raw text, before any cleaning.
        if len(content) < _MIN_CHARS:
            continue
        for text in _chunk_input_text(_clean_input_text(content)):
            yield [name, text, source, url]


def main():
    """Write ``csv_file`` with rows built from the Markdown and input files.

    Columns are ``title`` (file name), ``content``, ``source`` and ``url``.
    Raises whatever ``os.listdir`` / ``open`` raise when a directory or file
    cannot be read.
    """
    # List both directories first so a missing directory fails before the
    # output file is truncated.
    md_files = _list_files(md_directory, ".md")
    input_files = _list_files(input_directory, ".i")

    with open(csv_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["title", "content", "source", "url"])  # header row
        writer.writerows(_markdown_rows(md_files))
        writer.writerows(_input_rows(input_files))


if __name__ == "__main__":
    main()