Spaces:

aikubo
/

parrot

Sleeping

App Files Files Community

parrot / generateCSV.py

aikubo

Upload folder using huggingface_hub

835920f verified over 1 year ago

raw

history blame contribute delete

3.59 kB

	import os
	import csv
	import re

	# Build a CSV corpus ("title", "content", "source", "url") from the Markdown
	# files in `md_directory` and the ".i" input files in `input_directory`.

	md_directory = "allmd"
	input_directory = "allTests"
	csv_file = "moose_md_i.csv"
	# NOTE(review): this limit is defined but nothing below applies it, so every
	# matching file is processed. Left unapplied so the generated CSV does not
	# change; confirm whether a cap was intended.
	num_files = 2000

	url = "https://mooseframework.inl.gov/"

	# Markdown fragments and raw input files shorter than this are not written.
	_MIN_CHARS = 50
	# Cleaned input files longer than this are written as two overlapping rows.
	_MAX_CHARS = 3000
	# How far each half extends past the midpoint when an input file is split.
	_OVERLAP = 200

	# Patterns are compiled once here instead of being re-specified inside the
	# per-file / per-fragment loops. The pattern text itself is unchanged.
	_MD_HEADER_RE = re.compile(r'^#.*$', flags=re.MULTILINE)
	# NOTE(review): this matches the literal two-character sequence "|!", not
	# "either of several delimiters" -- confirm the intended delimiters.
	_MD_DELIMITER_RE = re.compile(r'\|!')
	_WHITESPACE_RE = re.compile(r'\s+')
	_PUNCTUATION_RE = re.compile(r'[^\w\s]')
	_MD_BOLD_RE = re.compile(r'(\\\|__)(.*?)\1')
	_MD_ITALIC_RE = re.compile(r'(\\|_)(.?)\1')
	_MD_LINK_RE = re.compile(r'\[.?\]$.?$')
	_URL_RE = re.compile(r'https?:\/\/[^\s]*')
	_REPEATED_CHAR_RE = re.compile(r'(.)\1{3,}\s')


	def _list_files(directory, suffix):
	    """Return the names in *directory* ending with *suffix*, in listing order."""
	    return [name for name in os.listdir(directory) if name.endswith(suffix)]


	def _read_lowercase(path):
	    """Return the lower-cased text of the file at *path*, decoded as UTF-8."""
	    # Explicit encoding so the result does not depend on the platform locale.
	    with open(path, "r", encoding="utf-8") as handle:
	        return handle.read().lower()


	def _clean_md_fragment(text):
	    """Return one Markdown fragment with whitespace collapsed and markup stripped.

	    NOTE(review): punctuation is stripped before the bold / link / backslash-pipe /
	    quote steps, so any step that needs a backslash, ``[``, ``|`` or ``"`` can no
	    longer match; of the markup patterns only the underscore branch of the
	    italic pattern can still fire. The order and patterns are kept exactly as
	    they were so the output does not change -- confirm the intended patterns.
	    """
	    text = _WHITESPACE_RE.sub(' ', text)
	    text = _PUNCTUATION_RE.sub('', text)
	    text = _MD_BOLD_RE.sub('', text)
	    text = _MD_ITALIC_RE.sub('', text)
	    text = _MD_LINK_RE.sub('', text)
	    # Same two characters (backslash, pipe) as before, written with a valid
	    # escape so the literal no longer triggers an invalid-escape warning.
	    text = text.replace('\\|', '')
	    text = text.replace('syntax description', '')
	    return text.replace('"', '')


	def _md_fragments(content):
	    """Yield the cleaned fragments of lower-cased Markdown worth keeping.

	    Header lines (starting with ``#``) are blanked, the text is split on the
	    delimiter pattern, and fragments shorter than ``_MIN_CHARS`` after
	    cleaning are dropped.
	    """
	    without_headers = _MD_HEADER_RE.sub('', content)
	    for fragment in _MD_DELIMITER_RE.split(without_headers):
	        cleaned = _clean_md_fragment(fragment)
	        if len(cleaned) >= _MIN_CHARS:
	            yield cleaned


	def _clean_input(content):
	    """Return lower-cased input-file text flattened onto one line.

	    Commas, ``#`` and double quotes are removed, newlines become spaces,
	    http(s) URLs are dropped, a character repeated four or more times and
	    followed by whitespace is replaced by a single space, and whitespace runs
	    are collapsed last.
	    """
	    content = content.replace(',', '')
	    content = content.replace('\n', ' ')
	    content = content.replace('#', '')
	    content = _URL_RE.sub('', content)
	    content = _REPEATED_CHAR_RE.sub(' ', content)
	    content = content.replace('"', '')
	    return _WHITESPACE_RE.sub(' ', content)


	def _overlapping_chunks(content):
	    """Return *content* whole, or as two halves that overlap around the midpoint."""
	    if len(content) <= _MAX_CHARS:
	        return [content]
	    mid = len(content) // 2
	    return [content[:mid + _OVERLAP], content[mid - _OVERLAP:]]


	def main():
	    """Write `csv_file` from the Markdown and input files found on disk.

	    Both directories are listed before the CSV is opened, so a missing
	    directory raises before any output file is created. Rows are written in
	    directory-listing order: all Markdown fragments first, then input files.
	    """
	    md_files = _list_files(md_directory, ".md")
	    input_files = _list_files(input_directory, ".i")

	    with open(csv_file, "w", newline="", encoding="utf-8") as file:
	        writer = csv.writer(file)
	        writer.writerow(["title", "content", "source", "url"])  # header row

	        source = "MOOSE Docs md files"
	        for md_file in md_files:
	            content = _read_lowercase(os.path.join(md_directory, md_file))
	            for text in _md_fragments(content):
	                writer.writerow([md_file, text, source, url])

	        source = "MOOSE Docs input files"
	        for inputfile in input_files:
	            content = _read_lowercase(os.path.join(input_directory, inputfile))
	            # The length gate applies to the raw text, before any cleaning.
	            if len(content) < _MIN_CHARS:
	                continue
	            for text in _overlapping_chunks(_clean_input(content)):
	                writer.writerow([inputfile, text, source, url])


	if __name__ == "__main__":
	    main()