import os

from utils.preprocessing import processingpipeline
def get_paragraphs(file_path_input, file_name=None):
    """Run the preprocessing pipeline on a file and return its paragraphs.

    Args:
        file_path_input: Path of the file to process. It is handed to the
            pipeline both as ``file_paths`` and as the ``FileConverter``
            ``file_path``. Assumed to be a single path (str or path-like)
            -- TODO confirm against callers.
        file_name: Optional file name forwarded to the ``FileConverter``.
            When omitted, the base name of ``file_path_input`` is used.

    Returns:
        The value stored under the ``'paraList'`` key of the pipeline output.
    """
    # Declare params.
    split_by = 'word'
    # Usually models have a max length of 384/512.
    split_length = 100
    # Too much overlap can lead to repetitive text. The rule of thumb quoted
    # for this setting is 20% of the split length; the value configured here
    # is 10 words (10% of split_length).
    split_overlap = 10
    # The text is cleaned of HTML and other unwanted content. Enable this to
    # also strip punctuation such as ,.; -- useful for models that are not
    # Transformer-based.
    remove_punc = False
    # This param is only used when split_by == 'word'.
    respect_sentence_boundary = True

    # The original code referenced undefined ``file_path``/``file_name``
    # names here (NameError on every call); derive them from the argument.
    if file_name is None:
        file_name = os.path.basename(file_path_input)

    # Initialize the preprocessing pipeline and pass the params for each
    # node, using the variables declared above.
    prep_pipeline = processingpipeline()
    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={
            "FileConverter": {
                "file_path": file_path_input,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": remove_punc,
                "split_by": split_by,
                "split_length": split_length,
                "split_overlap": split_overlap,
                "split_respect_sentence_boundary": respect_sentence_boundary,
            },
        },
    )
    return output_pre['paraList']