VN_laws_qa / src /clean_dataset.py
wanderer2k1's picture
f
9833a80
raw
history blame
2.02 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on
@author:
@title: clean_dataset
@description: helper functions for splitting text columns into fixed-width chunks and for cleaning raw text.
"""
#%%
import pandas as pd
import numpy as np
import string
from itertools import chain
from textwrap3 import wrap
import re
def split_at_length(dataframe, column, length, title = True):
    """
    Wrap the text of one column into chunks of at most ``length`` characters.

    :param dataframe: pandas DataFrame holding the text; it is not modified,
        a copy with the extra column is returned instead.
    :param column: name of the column whose values are wrapped; every value
        is converted with ``str`` before wrapping.
    :param length: maximum width of one chunk, passed straight to ``wrap``.
    :param title: when truthy, also build a long-format table with one row
        per chunk, paired with the ``'title'`` value of the row the chunk
        came from (the frame must then contain a ``'title'`` column).
    :return: tuple ``(dataframe, splitted)``. ``dataframe`` is a copy of the
        input with an added ``'wrapped'`` column holding the chunks of each
        row joined by ``'; '``. ``splitted`` is a DataFrame with the columns
        ``'text'`` and ``'title'`` when ``title`` is truthy, otherwise an
        empty list.
    """
    # Keep the chunk lists around: the long-format table is built from them
    # directly. Re-splitting the '; '-joined string would also cut at every
    # "; " that is part of the original text.
    chunks_per_row = [wrap(str(value), length) for value in dataframe[column]]

    dataframe = dataframe.assign(
        wrapped=['; '.join(chunks) for chunks in chunks_per_row]
    )

    if not title:
        return dataframe, []

    texts = []
    titles = []
    for row_title, chunks in zip(dataframe['title'], chunks_per_row):
        # wrap() yields no chunk for empty / whitespace-only text; such rows
        # are still represented by a single empty-text entry.
        for chunk in chunks or ['']:
            texts.append(chunk)
            titles.append(row_title)

    # Building the frame from plain lists also works for a zero-row input,
    # where concatenating per-row Series would raise.
    splitted = pd.DataFrame({'text': texts, 'title': titles})
    return dataframe, splitted
def basic(s):
"""
:param s: string to be processed
:return: processed string: see comments in the source code for more info
"""
# Text Lowercase
s = s.lower()
# Remove punctuation
translator = str.maketrans(' ', ' ', string.punctuation)
s = s.translate(translator)
# Remove URLs
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
s = re.sub(r"http\S+", " ", s)
# Remove new line characters
s = re.sub('\n', ' ', s)
# Remove distracting single quotes
s = re.sub("\'", " ", s)
# Remove all remaining numbers and non alphanumeric characters
s = re.sub(r'\d+', ' ', s)
s = re.sub(r'\W+', ' ', s)
# define custom words to replace:
#s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
return s.strip()
def remove_linebreaks(s):
    """
    Replace every line break in a string by a single space.

    Handles Unix (``\\n``), Windows (``\\r\\n``) and bare carriage-return
    (``\\r``) line endings; a ``\\r\\n`` pair counts as one break. Leading and
    trailing whitespace is stripped from the result.

    :param s: string to be processed
    :return: processed string without line breaks
    """
    # '\r\n' is listed first so the pair is replaced by one space, not two.
    return re.sub(r'\r\n|\r|\n', ' ', s).strip()