import re
import string

import docx2txt

# Matches one ASCII punctuation character.  The punctuation string is passed
# through re.escape() because it contains regex-significant characters: left
# unescaped inside a character class, its "\]" pair is read as an escaped "]",
# which silently leaves the backslash itself out of the class.
_PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')


def split_string(path):
    """Return the whitespace-separated tokens of a document, punctuation removed.

    The text is obtained from ``docx2txt.process(path)``; every character in
    ``string.punctuation`` (ASCII punctuation only) is deleted, and the result
    is split on runs of whitespace.

    Because punctuation is deleted rather than replaced by a space, tokens
    joined only by punctuation are merged (``"well-known"`` -> ``"wellknown"``).
    Non-ASCII punctuation is left in place.

    Args:
        path: Location of the document, passed unchanged to
            ``docx2txt.process`` (presumably a ``.docx`` file path -- see the
            docx2txt documentation for accepted values).

    Returns:
        A list of strings; empty when the text holds no non-punctuation,
        non-whitespace characters.
    """
    text = docx2txt.process(path)
    return _PUNCTUATION_RE.sub('', text).split()