File size: 4,898 Bytes
49a314a 22b8e0b 49a314a 22b8e0b 49a314a 22b8e0b 49a314a 22b8e0b 49a314a 22b8e0b 49a314a 22b8e0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import logging
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st
from haystack.nodes import PreProcessor
'''basic cleaning - suitable for transformer models'''
def basic(s: str, SDG: bool = False) -> str:
    """
    Apply light-weight cleaning to a piece of text.

    URLs are always removed. When ``SDG`` is truthy the text is additionally
    lower-cased, ASCII punctuation is deleted, and digits as well as any
    remaining non-word characters (newlines included) are collapsed into
    single spaces.

    :param s: string to be processed
    :param SDG: enable the more aggressive cleaning described above
    :return: processed string with leading/trailing whitespace stripped
    """
    # First pass drops every line that *starts* with a URL (the rest of that
    # line and its line break go with it); second pass drops URLs that are
    # embedded inside a line.
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    if SDG:
        s = s.lower()
        # Punctuation is deleted rather than replaced by a space, so
        # "well-known" becomes "wellknown". This also removes every single
        # quote, hence no separate quote-stripping step is needed.
        s = s.translate(str.maketrans('', '', string.punctuation))
        # Digits become spaces; the \W+ pass then collapses every run of
        # whitespace / non-word characters (including newlines) to one space.
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)

    return s.strip()
def preprocessingForSDG(document):
    """
    Split haystack documents into paragraphs of 120 words and clean them.

    Every resulting paragraph has its ``content`` cleaned in place with
    ``basic(..., SDG=True)``. Paragraphs of all input documents are collected,
    in input order, into one list.

    :param document: iterable of haystack document objects
        (presumably the list returned by a haystack converter; verify against caller)
    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
        ``docs_processed`` is the list of cleaned paragraph documents,
        ``df`` is a pandas DataFrame built from that list,
        ``all_text`` is all paragraph texts joined by single spaces, and
        ``par_list`` is the list of paragraph texts.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
    )

    # Accumulate across all input documents; assigning inside the loop would
    # keep only the last document and leave the name unbound on empty input.
    docs_processed = []
    with st.spinner("👑 document being splitted into paragraphs"):
        for doc in document:
            paragraphs = preprocessor.process([doc])
            for paragraph in paragraphs:
                paragraph.content = basic(paragraph.content, SDG=True)
            docs_processed.extend(paragraphs)
    logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))

    # Build the text views from the documents themselves so that an empty
    # input (DataFrame without a ``content`` column) does not raise.
    df = pd.DataFrame(docs_processed)
    par_list = [paragraph.content for paragraph in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list
def preprocessing(document):
    """
    Split haystack documents into overlapping 3-sentence paragraphs and clean them.

    Consecutive paragraphs overlap by one sentence. Every resulting paragraph
    has its ``content`` cleaned in place with ``basic(...)`` (URL removal and
    whitespace stripping only). Paragraphs of all input documents are
    collected, in input order, into one list.

    :param document: iterable of haystack document objects
        (presumably the list returned by a haystack converter; verify against caller)
    :return: tuple ``(docs_processed, df, all_text, par_list)`` where
        ``docs_processed`` is the list of cleaned paragraph documents,
        ``df`` is a pandas DataFrame built from that list,
        ``all_text`` is all paragraph texts joined by single spaces, and
        ``par_list`` is the list of paragraph texts.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1,
    )

    # Accumulate across all input documents; assigning inside the loop would
    # keep only the last document and leave the name unbound on empty input.
    docs_processed = []
    with st.spinner("👑 document being splitted into paragraphs"):
        for doc in document:
            paragraphs = preprocessor.process([doc])
            for paragraph in paragraphs:
                paragraph.content = basic(paragraph.content)
            docs_processed.extend(paragraphs)
    logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))

    # Build the text views from the documents themselves so that an empty
    # input (DataFrame without a ``content`` column) does not raise.
    df = pd.DataFrame(docs_processed)
    par_list = [paragraph.content for paragraph in docs_processed]
    all_text = " ".join(par_list)

    return docs_processed, df, all_text, par_list
'''processing with spacy - suitable for models such as tf-idf, word2vec'''
_SPACY_NLP = None  # lazily loaded spaCy pipeline, shared by all spacy_clean calls


def _get_spacy_nlp():
    """Load the ``en_core_web_sm`` pipeline on first use and reuse it afterwards."""
    global _SPACY_NLP
    if _SPACY_NLP is None:
        _SPACY_NLP = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
    return _SPACY_NLP


def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
    """
    Clean and tokenise a string using Spacy. Keeps only alphabetic tokens, removes stopwords and
    filters out all but proper nouns, nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string. When ``use_nlp`` is False this must instead be an
        already tokenised iterable of Spacy tokens (e.g. a ``Doc``).
    use_nlp : bool, default True
        Indicates whether Spacy needs to run its pipeline on ``alpha``. Enable this when using
        this function on its own. Should be set to False if used inside nlp.pipeline

    Returns
    -------
    str
        The lemmas of the retained tokens, joined by single spaces and lower-cased.

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
    Use together with nlp.pipeline for batch processing.
    The Spacy model is loaded only when ``use_nlp`` is True, and only once per process.
    """
    if use_nlp:
        alpha = _get_spacy_nlp()(alpha)

    kept_pos = ('PROPN', 'NOUN', 'VERB', 'ADJ')
    lemmas = [
        tok.lemma_
        for tok in alpha
        if tok.is_alpha and not tok.is_stop and tok.pos_ in kept_pos
    ]
    return ' '.join(lemmas).lower()