import re

from bs4 import BeautifulSoup

PG_ESSAYS_FILENAME = "paul_graham_essays.txt"
CLEANED_PG_ESSAYS_FILENAME = "paul_graham_essays_cleaned.txt"

# Text substituted for every masked year.
_PLACEHOLDER = '----'

# "20" followed by two digits (e.g. "2004"). The pattern is unanchored, so it
# also matches inside a longer run of digits.
_YEAR_PATTERN = re.compile(r'20\d\d')

# One run of word characters plus a single space directly before the
# placeholder (e.g. the "March" in "March ----").
_WORD_BEFORE_PLACEHOLDER_PATTERN = re.compile(r'\w+ ----')


def _mask_years(text):
    """Replace each "20dd" year with "----" and drop the word before it.

    The second pass runs over the whole text, so a word followed by a single
    space and a "----" that was already present in the input is removed too.
    """
    text = _YEAR_PATTERN.sub(_PLACEHOLDER, text)
    return _WORD_BEFORE_PLACEHOLDER_PATTERN.sub(_PLACEHOLDER, text)


def clean_pg_data(input_path=PG_ESSAYS_FILENAME,
                  output_path=CLEANED_PG_ESSAYS_FILENAME):
    """Convert an HTML file to plain text with years masked.

    Reads ``input_path``, extracts the text with BeautifulSoup's
    ``html.parser``, masks years via ``_mask_years`` and writes the result to
    ``output_path``, overwriting any existing file.

    Args:
        input_path: File containing the HTML to clean. Defaults to
            ``PG_ESSAYS_FILENAME``.
        output_path: File the cleaned text is written to. Defaults to
            ``CLEANED_PG_ESSAYS_FILENAME``.

    Raises:
        OSError: If ``input_path`` cannot be read or ``output_path`` cannot
            be written.
        UnicodeDecodeError: If ``input_path`` is not valid UTF-8.
    """
    # Pin the encoding: without it open() falls back to the locale default,
    # which is not UTF-8 on every platform.
    with open(input_path, 'r', encoding='utf-8') as f:
        html = f.read()

    text = BeautifulSoup(html, 'html.parser').get_text()
    text = _mask_years(text)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)


if __name__ == "__main__":
    clean_pg_data()