# Spaces:
# Runtime error
# Runtime error
# Input file read by clean_pg_data(); its content is parsed as HTML.
PG_ESSAYS_FILENAME = "paul_graham_essays.txt"
# Output file that clean_pg_data() writes the plain-text result to.
CLEANED_PG_ESSAYS_FILENAME = "paul_graham_essays_cleaned.txt"
import re
from bs4 import BeautifulSoup
def clean_pg_data():
    """Convert the essays file from HTML to plain text and mask years.

    Reads PG_ESSAYS_FILENAME, extracts the text with BeautifulSoup's
    ``html.parser``, then rewrites it in two passes:

    1. every "20" followed by two digits is replaced with "----";
    2. every run of word characters followed by a single space and
       "----" is removed, leaving only the "----".

    The result is written to CLEANED_PG_ESSAYS_FILENAME, overwriting any
    existing content.

    Returns:
        None.

    Raises:
        OSError: if either file cannot be opened.
        UnicodeDecodeError: if the input file is not valid UTF-8.
    """
    # Explicit encoding so the result does not depend on the platform's
    # locale default. NOTE(review): assumes the essays file is UTF-8 --
    # confirm against how the file was produced.
    with open(PG_ESSAYS_FILENAME, 'r', encoding='utf-8') as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()

    # Pass 1: mask anything that looks like a 20xx year. The pattern is not
    # anchored, so it also matches inside longer digit runs.
    text = re.sub(r'20\d\d', '----', text)

    # Pass 2: drop the word directly before each "----". This must stay a
    # separate pass: folding it into pass 1 would let one year swallow an
    # adjacent year ("2005 2006") as its "preceding word". It also applies
    # to any "----" that was already present in the text.
    text = re.sub(r'\w+ ----', '----', text)

    with open(CLEANED_PG_ESSAYS_FILENAME, 'w', encoding='utf-8') as f:
        f.write(text)
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    clean_pg_data()