peter2000 committed on
Commit
be432e1
1 Parent(s): e1fbc3c

Create new file

Files changed (1)
  1. udfPreprocess/cleaning,py +144 -0
udfPreprocess/cleaning,py ADDED
@@ -0,0 +1,144 @@
+ import pandas as pd
+ import numpy as np
+ import string
+ import nltk
+ import spacy
+ import en_core_web_sm
+ import re
+ import streamlit as st
+
+ from haystack.nodes import PreProcessor
+
+ '''basic cleaning - suitable for transformer models'''
+ def basic(s):
+     """
+     :param s: string to be processed
+     :return: processed string: see comments in the source code for more info
+     """
+     # Text Lowercase
+     #s = s.lower()
+     # Remove punctuation
+     #translator = str.maketrans(' ', ' ', string.punctuation)
+     #s = s.translate(translator)
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+     # Remove new line characters
+     #s = re.sub('\n', ' ', s)
+
+     # Remove distracting single quotes
+     #s = re.sub("\'", " ", s)
+     # Remove all remaining numbers and non alphanumeric characters
+     #s = re.sub(r'\d+', ' ', s)
+     #s = re.sub(r'\W+', ' ', s)
+
+     # define custom words to replace:
+     #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
+
+     return s.strip()
+
+
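A minimal usage sketch of basic() for reference (the sample string is made up and assumes the function is already in scope; it is not part of this commit):

    # hypothetical usage; assumes basic() from this file is importable or defined in scope
    text = "For details see https://example.org/report and the annex."
    print(basic(text))
    # roughly: "For details see   and the annex."  (URL replaced by a space, outer whitespace stripped)
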
+ def preprocessingForSDG(document):
+
+     """
+     Takes in a list of haystack Document objects, splits them into paragraphs and applies simple cleaning.
+     Returns the cleaned list of haystack Document objects (one paragraph per object), a pandas DataFrame,
+     the full text joined into one string, and the list of paragraph texts.
+     """
+
+     preprocessor = PreProcessor(
+         clean_empty_lines=True,
+         clean_whitespace=True,
+         clean_header_footer=True,
+         split_by="word",
+         split_length=100,
+         split_respect_sentence_boundary=True,
+         split_overlap=4
+     )
+     # collect processed paragraphs from all input documents (instead of keeping only the last one)
+     docs_processed = []
+     for i in document:
+         for item in preprocessor.process([i]):
+             item.content = basic(item.content)
+             docs_processed.append(item)
+
+     st.write("your document has been split into", len(docs_processed), "paragraphs")
+
+     # create dataframe of text and list of all text
+     df = pd.DataFrame(docs_processed)
+     all_text = " ".join(df.content.to_list())
+     par_list = df.content.to_list()
+
+     return docs_processed, df, all_text, par_list
+
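A hedged sketch of how this function might be called (the file path and the direct Document construction are illustrative assumptions; the exact import location of Document depends on the haystack version):

    # hypothetical call; assumes farm-haystack 1.x
    from haystack import Document  # in some versions: from haystack.schema import Document

    raw = open("report.txt").read()  # "report.txt" is a placeholder path
    docs, df, all_text, par_list = preprocessingForSDG([Document(content=raw)])
    # docs: ~100-word paragraphs as Document objects; df/par_list hold the same text in tabular/list form
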
+ def preprocessing(document):
+
+     """
+     Takes in a list of haystack Document objects, splits them into short paragraphs of a few sentences
+     each and applies simple cleaning. Returns the cleaned list of haystack Document objects (one passage
+     per object), a pandas DataFrame, the full text joined into one string, and the list of passage texts.
+     """
+
+     preprocessor = PreProcessor(
+         clean_empty_lines=True,
+         clean_whitespace=True,
+         clean_header_footer=True,
+         split_by="sentence",
+         split_length=3,
+         split_respect_sentence_boundary=False,
+         split_overlap=1
+     )
+     # collect processed passages from all input documents (instead of keeping only the last one)
+     docs_processed = []
+     for i in document:
+         for item in preprocessor.process([i]):
+             item.content = basic(item.content)
+             docs_processed.append(item)
+
+     st.write("your document has been split into", len(docs_processed), "paragraphs")
+
+     # create dataframe of text and list of all text
+     df = pd.DataFrame(docs_processed)
+     all_text = " ".join(df.content.to_list())
+     par_list = df.content.to_list()
+
+     return docs_processed, df, all_text, par_list
+
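The only difference from preprocessingForSDG is the splitting granularity (three-sentence passages with one sentence of overlap instead of ~100-word paragraphs), so the same call pattern applies (again a sketch, reusing the hypothetical Document built above):

    # sentence-based splitting: ~3-sentence passages with 1 sentence of overlap
    docs, df, all_text, par_list = preprocessing([Document(content=raw)])
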
+ '''processing with spacy - suitable for models such as tf-idf, word2vec'''
+ def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
+
+     """
+     Clean and tokenise a string using spaCy. Keeps only alphabetic characters, removes stopwords and
+     filters out all but proper nouns, nouns, verbs and adjectives.
+     Parameters
+     ----------
+     alpha : str
+         The input string.
+     use_nlp : bool, default True
+         Indicates whether the spaCy pipeline should be run on the input. Enable this when using the
+         function on its own; set it to False if the input is already a spaCy Doc (e.g. when used with
+         nlp.pipe).
+     Returns
+     -------
+     text : str
+         The lemmatised tokens joined into a single lowercased string, i.e. a processed string.
+     Notes
+     -----
+     Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
+     Use together with nlp.pipe for batch processing.
+     """
+
+     # note: the model is loaded on every call; for batch processing load it once outside the function
+     nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
+
+     if use_nlp:
+         alpha = nlp(alpha)
+
+     beta = []
+     for tok in alpha:
+         # keep lemmas of alphabetic, non-stopword tokens that are proper nouns, nouns, verbs or adjectives
+         if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
+             beta.append(tok.lemma_)
+
+     text = ' '.join(beta)
+     text = text.lower()
+     return text
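
A minimal usage sketch of spacy_clean() (the sample sentence is made up; requires the en_core_web_sm model to be installed):

    # hypothetical usage of spacy_clean()
    print(spacy_clean("The committees were rapidly strengthening stakeholder engagement."))
    # roughly: "committee strengthen stakeholder engagement"

    # for many strings, run the pipeline once and pass Docs in with use_nlp=False, e.g.:
    # nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
    # cleaned = [spacy_clean(doc, use_nlp=False) for doc in nlp.pipe(list_of_strings)]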