# cpv_test / file_processing.py
import os

from utils.preprocessing import processingpipeline


def get_paragraphs(file_path_input):
    """Run the preprocessing pipeline on a file and return its paragraph list."""
    # Declare params
    SPLIT_BY = 'word'
    # Models usually have a max input length of 384/512 tokens.
    SPLIT_LENGTH = 100
    # Too much overlap can lead to repetitive text, but as a rule of
    # thumb we keep it around 20% of the split length.
    SPLIT_OVERLAP = 10
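    # For illustration (an assumption about the Preprocessor's behavior, since
    # utils.preprocessing is not shown here): with split_by='word',
    # split_length=100 and split_overlap=10, a 190-word document yields two
    # chunks, words 1-100 and words 91-190, i.e. each chunk repeats the last
    # 10 words of the previous one.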
    # The text is cleaned to remove HTML and other unwanted content.
    # Set REMOVE_PUNC to True if you also need to remove punctuation
    # such as ,.; etc., which is useful for non-Transformer-based models.
    REMOVE_PUNC = False
    # This param is used only for split_by = 'word'.
    RESPECT_SENTENCE_BOUNDARY = True

    # Initialize the preprocessing pipeline; params for the Preprocessor can
    # be passed inline or via the variables declared above.
    prep_pipeline = processingpipeline()
    # Derive the file name from the input path for the FileConverter node.
    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={"FileConverter": {"file_path": file_path_input,
                                  "file_name": os.path.basename(file_path_input)},
                "UdfPreProcessor": {"remove_punc": REMOVE_PUNC,
                                    "split_by": SPLIT_BY,
                                    "split_length": SPLIT_LENGTH,
                                    "split_overlap": SPLIT_OVERLAP,
                                    "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY}})
    par_list = output_pre['paraList']
    return par_list
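

# A minimal usage sketch (an illustration, not part of the module's public
# API): the file name below is hypothetical, and the exact shape of the
# 'paraList' entries depends on utils.preprocessing, which is not shown here.
if __name__ == "__main__":
    paragraphs = get_paragraphs("sample_report.pdf")  # hypothetical input file
    print(f"Extracted {len(paragraphs)} paragraphs")
    for par in paragraphs[:3]:
        print(par)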