File size: 879 Bytes
9716b27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Input path read by clean_pg_data(); its contents are parsed as HTML.
# Relative, so it is resolved against the current working directory.
PG_ESSAYS_FILENAME = "paul_graham_essays.txt"
# Output path written (overwritten if present) by clean_pg_data() with the
# plain-text result.
CLEANED_PG_ESSAYS_FILENAME = "paul_graham_essays_cleaned.txt"

import re
from bs4 import BeautifulSoup

# Read the text file PG_ESSAYS_FILENAME and convert it from HTML to plain text using the BeautifulSoup library.
# Replace each "20xx" year, together with the word preceding it, with "----", then write the result to
# CLEANED_PG_ESSAYS_FILENAME.

def clean_pg_data():
    """Convert the essays file from HTML to plain text and mask years.

    Reads PG_ESSAYS_FILENAME, extracts its text with BeautifulSoup's
    built-in ``html.parser``, masks year-like tokens, and writes the result
    to CLEANED_PG_ESSAYS_FILENAME (overwriting any existing file).

    Masking is done in two passes:
      1. every "20" followed by two digits becomes "----";
      2. the run of word characters plus the single space directly before
         each "----" is removed (e.g. "May 2004" -> "----").

    Returns:
        None. The cleaned text is only written to disk.

    Raises:
        OSError: if either file cannot be opened.
        UnicodeDecodeError: if the input file is not valid UTF-8.
    """
    # Pin the encoding instead of relying on the platform's locale default,
    # so the same input is decoded identically on every OS.
    # NOTE(review): assumes the essays file is UTF-8 -- confirm against the
    # tool that produced it.
    with open(PG_ESSAYS_FILENAME, 'r', encoding='utf-8') as src:
        html = src.read()

    # The input handle is already closed; parsing only needs the string.
    text = BeautifulSoup(html, 'html.parser').get_text()

    # Pass 1: mask anything shaped like 20xx. This is a plain substring
    # match, so it also hits "20dd" embedded in longer digit runs.
    text = re.sub(r'20\d\d', '----', text)

    # Pass 2: drop the word immediately before each mask. Only a single
    # literal space between the word and the mask is matched.
    text = re.sub(r'\w+ ----', '----', text)

    with open(CLEANED_PG_ESSAYS_FILENAME, 'w', encoding='utf-8') as dst:
        dst.write(text)

# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    clean_pg_data()