# Spaces:
# Runtime error
# Runtime error
# Input and output paths, resolved relative to the current working directory.
# The input is read as HTML; the output receives the extracted plain text.
PG_ESSAYS_FILENAME = "paul_graham_essays.txt"
CLEANED_PG_ESSAYS_FILENAME = "paul_graham_essays_cleaned.txt"
import re

from bs4 import BeautifulSoup
# Read the file PG_ESSAYS_FILENAME and convert it from HTML to plain text using the BeautifulSoup library.
# Write the resulting text to CLEANED_PG_ESSAYS_FILENAME.
def clean_pg_data():
    """Strip HTML from the raw essay file, mask years, and save the result.

    Reads ``PG_ESSAYS_FILENAME``, parses it with BeautifulSoup's built-in
    ``html.parser`` and keeps only the text content.  Two regex passes are
    then applied, in order:

    1. Every ``20`` followed by two digits is replaced with ``----``.
    2. Every run of word characters followed by a single space and
       ``----`` is collapsed to just ``----`` (e.g. ``"March ----"``
       becomes ``"----"``).

    The final text is written to ``CLEANED_PG_ESSAYS_FILENAME``,
    overwriting any existing file.

    Returns:
        None.  The only effect is the output file.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Pin the encoding so the result does not depend on the platform's
    # locale default.  NOTE(review): assumes the input file is UTF-8 --
    # confirm against however the file was produced.
    with open(PG_ESSAYS_FILENAME, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()

    # Take all instances of "20" followed by two digits and replace them
    # with "----".  The pattern is unanchored, so it also matches inside
    # longer digit runs (e.g. "12005" becomes "1----5").
    text = re.sub(r'20\d\d', '----', text)

    # Delete the word that occurs before each "----".  Only a word
    # separated by exactly one space is removed; \w+ is greedy, so the
    # whole preceding word is consumed.
    text = re.sub(r'\w+ ----', '----', text)

    with open(CLEANED_PG_ESSAYS_FILENAME, 'w', encoding='utf-8') as f:
        f.write(text)
# Run the cleaning step only when executed as a script, not when imported.
if __name__ == "__main__":
    clean_pg_data()