# NOTE: the lines below are residue from the web file viewer, not code.
# bjorn-hommel's picture
# update
# ad021d6
# raw
# history blame
# 843 Bytes
import cleantext
import ftfy
import json
import hashlib
from io import StringIO
from datetime import datetime
def clean_text(input):
    """Repair and normalise a piece of text.

    The text is first passed through ``ftfy.fix_text``; the result is then
    handed to ``cleantext.clean`` with ``extra_spaces`` and ``lowercase``
    switched on and ``stemming``, ``stopwords``, ``numbers`` and ``punct``
    switched off.

    Args:
        input: The text to clean. The name shadows the builtin ``input`` but
            is kept so existing keyword callers continue to work.

    Returns:
        Whatever ``cleantext.clean`` returns for the repaired text
        (presumably a ``str`` -- confirm against the installed version).
    """
    repaired = ftfy.fix_text(input)
    return cleantext.clean(
        repaired,
        extra_spaces=True,
        stemming=False,
        stopwords=False,
        lowercase=True,
        numbers=False,
        punct=False,
    )
def df_to_csv(df):
    """Return the CSV text produced by ``df.to_csv(..., index=False)``.

    Args:
        df: Any object exposing ``to_csv(buffer, index=False)``; presumably a
            pandas ``DataFrame`` -- confirm against callers.

    Returns:
        str: Everything ``df.to_csv`` wrote into the in-memory buffer.
    """
    buffer = StringIO()
    df.to_csv(buffer, index=False)
    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek(0) is required before reading it back.
    return buffer.getvalue()
def serialize_data(data):
    """Serialise *data* to a JSON string, rendering datetimes via ``str()``.

    Args:
        data: The object to pass to ``json.dumps``.

    Returns:
        str: The JSON document. ``datetime`` instances appear as their
        ``str()`` form (e.g. ``"2020-01-02 03:04:05"``).
    """

    def converter(o):
        """Fallback used by ``json.dumps`` for objects it cannot encode."""
        if isinstance(o, datetime):
            return str(o)
        # NOTE(review): any other unsupported object falls through and is
        # emitted as JSON ``null`` instead of raising TypeError. Kept as-is
        # because callers may rely on it; confirm whether this is intended.
        return None

    return json.dumps(data, default=converter)
def hash(input):
    """Return a SHA-1 hex digest of a JSON-serialisable value.

    The value is wrapped in a one-element list and dumped to JSON with
    ``sort_keys=True``, so dictionaries that differ only in key order produce
    the same digest. The JSON text is then encoded and hashed.

    Both the function name and the parameter name shadow builtins; they are
    kept unchanged so existing callers continue to work.

    Args:
        input: Any value that ``json.dumps`` can serialise.

    Returns:
        str: The 40-character hexadecimal SHA-1 digest.

    Raises:
        TypeError: Propagated from ``json.dumps`` when *input* is not
            JSON-serialisable.
    """
    payload = json.dumps([input], sort_keys=True).encode()
    return hashlib.sha1(payload).hexdigest()