vasevooo committed on
Commit 7479f89
Parent: c159ad3

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. data/imdb.csv +3 -0
  3. data/rnn_preprocessing.py +81 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/imdb.csv filter=lfs diff=lfs merge=lfs -text
data/imdb.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dfc447764f82be365fa9c2beef4e8df89d3919e3da95f5088004797d79695aa2
+ size 66212309
data/rnn_preprocessing.py ADDED
@@ -0,0 +1,81 @@
+ import re
+ import string
+ 
+ import numpy as np
+ import torch
+ import nltk
+ from nltk.corpus import stopwords
+ 
+ nltk.download('stopwords')
+ stop_words = set(stopwords.words('english'))
+ 
+ 
+ def data_preprocessing(text: str) -> str:
+     """Preprocess a string: lowercase, remove HTML tags, punctuation,
+     stopwords and standalone digits.
+ 
+     Args:
+         text (str): input string for preprocessing
+ 
+     Returns:
+         str: preprocessed string
+     """
+     text = text.lower()
+     text = re.sub('<.*?>', '', text)  # strip html tags
+     text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
+     text = ' '.join([word for word in text.split() if word not in stop_words])  # remove stopwords
+     text = ' '.join([word for word in text.split() if not word.isdigit()])  # drop pure-digit tokens
+     return text
+ 
+ 
+ def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
+     """Keep only (word, count) pairs that occur more than n times."""
+     return list(filter(lambda x: x[1] > n, sorted_words))
+ 
+ 
+ def padding(review_int: list, seq_len: int) -> np.ndarray:
+     """Make left-sided padding for input lists of tokens.
+ 
+     Args:
+         review_int (list): list of token-id lists
+         seq_len (int): max sequence length; if len(review_int[i]) > seq_len the
+             sequence is trimmed, otherwise it is left-padded with zeros
+ 
+     Returns:
+         np.ndarray: padded sequences of shape (len(review_int), seq_len)
+     """
+     features = np.zeros((len(review_int), seq_len), dtype=int)
+     for i, review in enumerate(review_int):
+         if len(review) <= seq_len:
+             zeros = list(np.zeros(seq_len - len(review)))
+             new = zeros + review
+         else:
+             new = review[:seq_len]
+         features[i, :] = np.array(new)
+ 
+     return features
+ 
+ 
+ def preprocess_single_string(
+     input_string: str,
+     seq_len: int,
+     vocab_to_int: dict,
+ ) -> torch.Tensor:
+     """Run all preprocessing steps on a single string.
+ 
+     Args:
+         input_string (str): input string for preprocessing
+         seq_len (int): max sequence length; longer sequences are trimmed,
+             shorter ones are left-padded with zeros
+         vocab_to_int (dict): word-to-index mapping {'word': int index}
+ 
+     Returns:
+         torch.Tensor: padded sequence of token ids of length seq_len
+     """
+     preprocessed_string = data_preprocessing(input_string)
+     result_list = []
+     for word in preprocessed_string.split():
+         try:
+             result_list.append(vocab_to_int[word])
+         except KeyError as e:
+             print(f'{e}: not in dictionary!')  # out-of-vocabulary words are skipped
+     result_padded = padding([result_list], seq_len)[0]
+ 
+     return torch.tensor(result_padded)
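For context, a minimal usage sketch of the new module (not part of this commit): it builds a vocabulary from the uploaded CSV and encodes a single review. The 'review' column name and the import path rnn_preprocessing are assumptions; adjust them to the actual dataset layout and package structure.

# Sketch only: build a vocabulary from the IMDB reviews and encode one string.
# Assumes data/imdb.csv has a 'review' column and rnn_preprocessing is importable (both hypothetical).
from collections import Counter

import pandas as pd

from rnn_preprocessing import data_preprocessing, get_words_by_freq, preprocess_single_string

df = pd.read_csv('data/imdb.csv')
corpus = [data_preprocessing(t) for t in df['review']]

counts = Counter(word for text in corpus for word in text.split())
sorted_words = counts.most_common()                     # [(word, freq), ...] sorted by frequency
frequent_words = get_words_by_freq(sorted_words, n=10)  # keep words seen more than 10 times
vocab_to_int = {word: i + 1 for i, (word, _) in enumerate(frequent_words)}  # 0 is reserved for padding

encoded = preprocess_single_string('This movie was great!', seq_len=128, vocab_to_int=vocab_to_int)
print(encoded.shape)  # torch.Size([128])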