Spaces:

GIZ
/

SDSN-demo

Running on CPU Upgrade

App Files Files Community

SDSN-demo / udfPreprocess /cleaning.py

ppsingh

new_version

22b8e0b about 2 years ago

raw

history blame

4.42 kB

	import pandas as pd
	import numpy as np
	import string
	import nltk
	import spacy
	import en_core_web_sm
	import re
	import streamlit as st

	from haystack.nodes import PreProcessor

	'''basic cleaning - suitable for transformer models'''
	def basic(s):
	    """
	    Apply light-weight cleaning to a piece of text.

	    Only URLs are removed; lowercasing, punctuation stripping, newline
	    removal, quote removal and number / non-alphanumeric removal are
	    intentionally NOT applied, so the text stays close to its raw form.

	    :param s: string to be processed
	    :return: the string with URLs replaced by a single space and with
	        leading/trailing whitespace stripped
	    """
	    # A line consisting of a scheme, exactly one character and a line break.
	    line_start_url = re.compile(r'^https?:\/\/.[\r\n]', flags=re.MULTILINE)
	    without_line_urls = line_start_url.sub(' ', s)

	    # Any remaining token that starts with "http" up to the next whitespace.
	    without_urls = re.sub(r"http\S+", " ", without_line_urls)

	    return without_urls.strip()


	def preprocessingForSDG(document):
	    """
	    Split haystack documents into word-based chunks and clean each chunk.

	    Every element of ``document`` is passed through a haystack
	    ``PreProcessor`` configured with ``split_by="word"`` and
	    ``split_length=120`` (no overlap configured), and the content of every
	    resulting chunk is cleaned with ``basic``. Chunks from all input
	    documents are accumulated, so an input with several documents keeps
	    all of them, and an empty input yields empty results instead of
	    raising.

	    :param document: iterable of haystack document objects
	    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
	        ``docs_processed`` is the list of cleaned chunk objects,
	        ``df`` is a pandas DataFrame built from that list,
	        ``all_text`` is all chunk contents joined by a single space and
	        ``par_list`` is the list of chunk contents.
	    """
	    preprocessor = PreProcessor(
	        clean_empty_lines=True,
	        clean_whitespace=True,
	        clean_header_footer=True,
	        split_by="word",
	        split_length=120,
	        split_respect_sentence_boundary=False,
	        # split_overlap is deliberately left at the library default here
	    )

	    # Accumulate instead of rebinding, otherwise only the chunks of the
	    # last document would be kept (and an empty input would leave the
	    # name undefined).
	    docs_processed = []
	    for single_doc in document:
	        docs_processed.extend(preprocessor.process([single_doc]))

	    for item in docs_processed:
	        item.content = basic(item.content)

	    st.write("your document has been splitted to", len(docs_processed), "paragraphs")

	    # create dataframe of text and list of all text; the lists are taken
	    # from the chunk objects so that an empty result does not fail on a
	    # missing "content" column
	    df = pd.DataFrame(docs_processed)
	    par_list = [item.content for item in docs_processed]
	    all_text = " ".join(par_list)

	    return docs_processed, df, all_text, par_list

	def preprocessing(document):
	    """
	    Split haystack documents into sentence-based chunks and clean each chunk.

	    Every element of ``document`` is passed through a haystack
	    ``PreProcessor`` configured with ``split_by="sentence"``,
	    ``split_length=3`` and ``split_overlap=1``, and the content of every
	    resulting chunk is cleaned with ``basic``. Chunks from all input
	    documents are accumulated, so an input with several documents keeps
	    all of them, and an empty input yields empty results instead of
	    raising.

	    :param document: iterable of haystack document objects
	    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
	        ``docs_processed`` is the list of cleaned chunk objects,
	        ``df`` is a pandas DataFrame built from that list,
	        ``all_text`` is all chunk contents joined by a single space and
	        ``par_list`` is the list of chunk contents.
	    """
	    preprocessor = PreProcessor(
	        clean_empty_lines=True,
	        clean_whitespace=True,
	        clean_header_footer=True,
	        split_by="sentence",
	        split_length=3,
	        split_respect_sentence_boundary=False,
	        split_overlap=1,
	    )

	    # Accumulate instead of rebinding, otherwise only the chunks of the
	    # last document would be kept (and an empty input would leave the
	    # name undefined).
	    docs_processed = []
	    for single_doc in document:
	        docs_processed.extend(preprocessor.process([single_doc]))

	    for item in docs_processed:
	        item.content = basic(item.content)

	    st.write("your document has been splitted to", len(docs_processed), "paragraphs")

	    # create dataframe of text and list of all text; the lists are taken
	    # from the chunk objects so that an empty result does not fail on a
	    # missing "content" column
	    df = pd.DataFrame(docs_processed)
	    par_list = [item.content for item in docs_processed]
	    all_text = " ".join(par_list)

	    return docs_processed, df, all_text, par_list

	'''processing with spacy - suitable for models such as tf-idf, word2vec'''
	# Lazily created spaCy pipeline shared by all calls of spacy_clean.
	_SPACY_PIPELINE = None


	def _get_spacy_pipeline():
	    """Return the shared spaCy pipeline, loading it on first use only."""
	    global _SPACY_PIPELINE
	    if _SPACY_PIPELINE is None:
	        _SPACY_PIPELINE = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
	    return _SPACY_PIPELINE


	def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
	    """
	    Clean and tokenise a string using Spacy. Keeps only alphabetic tokens,
	    removes stopwords and filters out all but proper nouns, nouns, verbs
	    and adjectives.

	    Parameters
	    ----------
	    alpha : str
	        The input string. When ``use_nlp`` is False this must instead be an
	        already processed, iterable sequence of tokens exposing
	        ``is_alpha``, ``is_stop``, ``pos_`` and ``lemma_``.
	    use_nlp : bool, default True
	        Indicates whether Spacy needs to run its pipeline on ``alpha``.
	        Enable this when using this function on its own. Should be set to
	        False if used inside nlp.pipeline; in that case the model is not
	        loaded at all.

	    Returns
	    -------
	    str
	        The lemmas of the kept tokens, joined by single spaces and
	        lowercased.

	    Notes
	    -----
	    Fails if alpha is an NA value. Performance decreases as len(alpha)
	    gets large. Use together with nlp.pipeline for batch processing.
	    The model is loaded once and reused across calls rather than being
	    reloaded on every invocation.
	    """
	    if use_nlp:
	        alpha = _get_spacy_pipeline()(alpha)

	    kept_pos = ('PROPN', 'NOUN', 'VERB', 'ADJ')
	    lemmas = [
	        tok.lemma_
	        for tok in alpha
	        if tok.is_alpha and not tok.is_stop and tok.pos_ in kept_pos
	    ]

	    return ' '.join(lemmas).lower()