|
import random |
|
import requests |
|
import os, glob |
|
|
|
|
|
books = [ |
|
'https://www.gutenberg.org/cache/epub/1513/pg1513.txt', |
|
'https://www.gutenberg.org/files/2701/2701-0.txt', |
|
'https://www.gutenberg.org/cache/epub/84/pg84.txt', |
|
'https://www.gutenberg.org/cache/epub/2641/pg2641.txt', |
|
'https://www.gutenberg.org/cache/epub/1342/pg1342.txt', |
|
'https://www.gutenberg.org/cache/epub/100/pg100.txt' |
|
] |
|
|
|
|
|
|
|
|
|
|
|
allowed_chars = ' aäbcdefghijklmnoöpqrsßtuüvwxyzABCDEFGHIJKLMNOÖPQRSTUÜVWXYZ0123456789!@#$%^&*()-_+=\"\':;[]{}/<>,.`~\n\\' |
|
|
|
|
|
def download_book(book): |
|
return requests.get(book).content.decode('utf-8') |
|
|
|
|
|
def filter_data(data): |
|
print('Filtering data') |
|
return ''.join([char for char in data if char in allowed_chars]) |
|
|
|
|
|
def load_books(fromfolder=False): |
|
text_data = [] |
|
if fromfolder: |
|
current_working_directory = os.getcwd() |
|
print(current_working_directory) |
|
path = 'text' |
|
for filename in glob.glob(os.path.join(path, '*.txt')): |
|
with open(os.path.join(os.getcwd(), filename), 'r') as f: |
|
print(f'Loading {filename}') |
|
text_data.append(filter_data(str(f.read()))) |
|
else: |
|
print(f'Loading {len(books)} books into ram') |
|
for book in books: |
|
text_data.append(filter_data(str(download_book(book)))) |
|
print('Loaded books') |
|
return ' '.join(text_data) |
|
|
|
|
|
def random_split_chunk(data, size=14): |
|
data = data.split(' ') |
|
index = random.randrange(0, len(data)) |
|
return ' '.join(data[index:index+size]) |
|
|