peter2000 committed on
Commit
67458c0
1 Parent(s): 0fed137

Create new file

Files changed (1)
  1. scripts/clean.py +143 -0
scripts/clean.py ADDED
@@ -0,0 +1,143 @@
+ import pandas as pd
+ import numpy as np
+ import string
+ import nltk
+ import spacy
+ import en_core_web_sm
+ import re
+ import streamlit as st
+
+ from haystack.nodes import PreProcessor
+
+ '''basic cleaning - suitable for transformer models'''
+ def basic(s):
+     """
+     :param s: string to be processed
+     :return: processed string; see comments in the source code for more info
+     """
+     # Lowercase the text
+     #s = s.lower()
+     # Remove punctuation
+     #translator = str.maketrans(' ', ' ', string.punctuation)
+     #s = s.translate(translator)
+     # Remove URLs
+     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+     s = re.sub(r"http\S+", " ", s)
+     # Remove new line characters
+     #s = re.sub('\n', ' ', s)
+
+     # Remove distracting single quotes
+     #s = re.sub("\'", " ", s)
+     # Remove all remaining numbers and non-alphanumeric characters
+     #s = re.sub(r'\d+', ' ', s)
+     #s = re.sub(r'\W+', ' ', s)
+
+     # Define custom words to replace:
+     #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
+
+     return s.strip()
+
+
+ def preprocessingForSDG(document):
+
+     """
+     Takes in a list of haystack Document objects, splits each into ~100-word paragraphs and applies
+     simple cleaning. Returns the cleaned list of haystack Document objects (one paragraph per object),
+     a pandas DataFrame of the paragraphs, all text joined into a single string, and the list of
+     paragraph strings.
+     """
+
+     preprocessor = PreProcessor(
+         clean_empty_lines=True,
+         clean_whitespace=True,
+         clean_header_footer=True,
+         split_by="word",
+         split_length=100,
+         split_respect_sentence_boundary=True,
+         split_overlap=4
+     )
+
+     # collect processed paragraphs from every input document
+     docs_processed = []
+     for i in document:
+         docs_processed.extend(preprocessor.process([i]))
+     for item in docs_processed:
+         item.content = basic(item.content)
+
+     st.write("your document has been split into", len(docs_processed), "paragraphs")
+
+     # create dataframe of text and list of all text
+     df = pd.DataFrame(docs_processed)
+     all_text = " ".join(df.content.to_list())
+     par_list = df.content.to_list()
+
+     return docs_processed, df, all_text, par_list
+
+ def preprocessing(document):
+
+     """
+     Takes in a list of haystack Document objects, splits each into short passages of a few sentences
+     and applies simple cleaning. Returns the cleaned list of haystack Document objects (one passage per
+     object), a pandas DataFrame of the passages, all text joined into a single string, and the list of
+     passage strings.
+     """
+
+     preprocessor = PreProcessor(
+         clean_empty_lines=True,
+         clean_whitespace=True,
+         clean_header_footer=True,
+         split_by="sentence",
+         split_length=3,
+         # split_respect_sentence_boundary only applies to split_by="word",
+         # so it is disabled for sentence-based splitting
+         split_respect_sentence_boundary=False,
+         split_overlap=1
+     )
+
+     # collect processed passages from every input document
+     docs_processed = []
+     for i in document:
+         docs_processed.extend(preprocessor.process([i]))
+     for item in docs_processed:
+         item.content = basic(item.content)
+
+     st.write("your document has been split into", len(docs_processed), "paragraphs")
+
+     # create dataframe of text and list of all text
+     df = pd.DataFrame(docs_processed)
+     all_text = " ".join(df.content.to_list())
+     par_list = df.content.to_list()
+
+     return docs_processed, df, all_text, par_list
+
+ '''processing with spacy - suitable for models such as tf-idf, word2vec'''
105
+ def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
106
+
107
+ """
108
+ Clean and tokenise a string using Spacy. Keeps only alphabetic characters, removes stopwords and
109
+ filters out all but proper nouns, nounts, verbs and adjectives.
110
+ Parameters
111
+ ----------
112
+ alpha : str
113
+ The input string.
114
+ use_nlp : bool, default False
115
+ Indicates whether Spacy needs to use NLP. Enable this when using this function on its own.
116
+ Should be set to False if used inside nlp.pipeline
117
+ Returns
118
+ -------
119
+ ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string
120
+ Notes
121
+ -----
122
+ Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
123
+ Use together with nlp.pipeline for batch processing.
124
+ """
125
+
126
+ nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
127
+
128
+ if use_nlp:
129
+
130
+ alpha = nlp(alpha)
131
+
132
+ beta = []
133
+
134
+ for tok in alpha:
135
+
136
+ if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
137
+
138
+ beta.append(tok.lemma_)
139
+
140
+
141
+ text = ' '.join(beta)
142
+ text = text.lower()
143
+ return text