import logging
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st

from haystack.nodes import PreProcessor

'''basic cleaning - suitable for transformer models'''
def basic(s, SDG=False):
    """
    :param s: string to be processed
    :param SDG: if True, apply the more aggressive cleaning used for the SDG pipeline
                (lowercasing plus removal of punctuation, digits and non-word characters)
    :return: processed string; see comments in the source code for more info
    """
    # remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    if SDG:
        # lowercase the text and strip punctuation
        s = s.lower()
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
        # remove new line characters, single quotes, digits and remaining non-word characters
        s = re.sub('\n', ' ', s)
        s = re.sub("\'", " ", s)
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)

    return s.strip()
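
# A minimal usage sketch of basic() (the sample string is made up); with SDG=True the
# URLs are stripped, the text is lowercased, and punctuation and digits are removed:
#
#   basic("See https://example.org: 17 Goals!", SDG=True)   # -> "see goals"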


def preprocessingForSDG(document):
    """
    Takes in a list of Haystack Document objects, splits each one into paragraphs of
    roughly 120 words and applies the simple `basic` cleaning with SDG=True.

    Returns the cleaned list of Haystack Document objects, one paragraph per object,
    together with a pandas DataFrame of those objects, all text joined into a single
    string, and the list of paragraph strings.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content, SDG=True)

    with st.spinner("Document is being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))

    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list
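
# A minimal usage sketch (hypothetical; assumes farm-haystack 1.x, where Document
# objects can be built with `Document(content=...)`):
#
#   from haystack import Document
#   docs, df, all_text, par_list = preprocessingForSDG([Document(content=raw_report_text)])
#   # docs: one cleaned, ~120-word paragraph per Document object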


def preprocessing(document):
    """
    Takes in a list of Haystack Document objects, splits each one into overlapping
    three-sentence passages and applies the simple `basic` cleaning.

    Returns the cleaned list of Haystack Document objects, one passage per object,
    together with a pandas DataFrame of those objects, all text joined into a single
    string, and the list of passage strings.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    with st.spinner("Document is being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))

    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list
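
# Same calling convention as preprocessingForSDG, but the passages are three sentences
# long with a one-sentence overlap, and basic() is applied without SDG=True, so casing
# and punctuation are preserved:
#
#   docs, df, all_text, par_list = preprocessing([Document(content=raw_report_text)])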


'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
    """
    Clean and tokenise a string using spaCy. Keeps only alphabetic characters, removes
    stopwords and filters out all but proper nouns, nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string.
    use_nlp : bool, default True
        Indicates whether spaCy needs to run its pipeline on the input. Enable this when
        using this function on its own; set it to False if the input is already a spaCy
        Doc, e.g. when batching with nlp.pipe.

    Returns
    -------
    text : str
        A space-joined string of lowercased, lemmatised tokens.

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
    Use together with nlp.pipe for batch processing.
    """

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

    if use_nlp:
        # run the spaCy pipeline to obtain a Doc with POS tags and lemmas
        alpha = nlp(alpha)

    beta = []
    for tok in alpha:
        # keep alphabetic, non-stopword tokens that are proper nouns, nouns, verbs or adjectives
        if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
            beta.append(tok.lemma_)

    text = ' '.join(beta)
    text = text.lower()
    return text
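
# A minimal usage sketch (the example sentence is made up; exact lemmas depend on the
# en_core_web_sm model version):
#
#   spacy_clean("The ministries adopted ambitious renewable energy policies.")
#   # -> something like "ministry adopt ambitious renewable energy policy"
#
# Note that the model is reloaded on every call; for large batches, load it once,
# run the texts through nlp.pipe and call spacy_clean(doc, use_nlp=False) on each Doc.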