Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
Created on | |
@author: | |
@title: clean_dataset | |
@descriptions: set of functions that enable splitting and cleaning. | |
""" | |
#%% | |
import pandas as pd | |
import numpy as np | |
import string | |
from itertools import chain | |
from textwrap3 import wrap | |
import re | |
def split_at_length(dataframe, column, length, title = True): | |
wrapped = [] | |
for i in dataframe[column]: | |
wrapped.append(wrap(str(i), length)) | |
dataframe = dataframe.assign(wrapped=wrapped) | |
dataframe['wrapped'] = dataframe['wrapped'].apply(lambda x: '; '.join(map(str, x))) | |
if title == True: | |
splitted = pd.concat([pd.Series(row['title'], row['wrapped'].split("; "), ) | |
for _, row in dataframe.iterrows()]).reset_index() | |
splitted = splitted.rename(columns={"index": "text", 0: "title"}) | |
else: | |
splitted = [] | |
return dataframe, splitted | |
def basic(s): | |
""" | |
:param s: string to be processed | |
:return: processed string: see comments in the source code for more info | |
""" | |
# Text Lowercase | |
s = s.lower() | |
# Remove punctuation | |
translator = str.maketrans(' ', ' ', string.punctuation) | |
s = s.translate(translator) | |
# Remove URLs | |
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE) | |
s = re.sub(r"http\S+", " ", s) | |
# Remove new line characters | |
s = re.sub('\n', ' ', s) | |
# Remove distracting single quotes | |
s = re.sub("\'", " ", s) | |
# Remove all remaining numbers and non alphanumeric characters | |
s = re.sub(r'\d+', ' ', s) | |
s = re.sub(r'\W+', ' ', s) | |
# define custom words to replace: | |
#s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s) | |
return s.strip() | |
def remove_linebreaks(s): | |
""" | |
:param s: string to be processed | |
:return: processed string: see comments in the source code for more info | |
""" | |
# Remove new line characters | |
s = re.sub('\n', ' ', s) | |
return s.strip() |