import re
import string

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from sklearn.base import TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def download_if_non_existent(res_path, res_name):
  """Download an NLTK resource only if it is not already available locally."""
  try:
    nltk.data.find(res_path)
  except LookupError:
    print(f'resource {res_path} not found. Downloading now...')
    nltk.download(res_name)

download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('tokenizers/punkt', 'punkt')
download_if_non_existent('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')
download_if_non_existent('corpora/wordnet', 'wordnet')

def fit_model(pipeline, x_train, y_train, x_test, y_test):
  """Fit the given scikit-learn pipeline and return a normalized confusion matrix
  display computed on the test set."""
  pipeline.fit(x_train, y_train)
  return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")
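
# Illustrative usage sketch (not from the original file): shows how fit_model might be
# called with a simple scikit-learn pipeline. The TfidfVectorizer/LogisticRegression
# combination and the toy data below are placeholder assumptions, not the project's
# actual setup.
def _example_fit_model_usage():
  from sklearn.pipeline import Pipeline
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.linear_model import LogisticRegression

  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer()),
      ('clf', LogisticRegression(max_iter=1000)),
  ])
  x_train = ["great movie", "terrible plot", "loved it", "waste of time"]
  y_train = [1, 0, 1, 0]
  x_test = ["really great", "awful movie"]
  y_test = [1, 0]
  # fit_model returns a ConfusionMatrixDisplay; call .plot() or read .confusion_matrix.
  return fit_model(pipeline, x_train, y_train, x_test, y_test)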

class LinguisticPreprocessor(TransformerMixin):
  """Scikit-learn compatible text cleaner: strips HTML, punctuation and repeated
  spaces, lemmatizes tokens, and removes English stop words."""

  def __init__(self):
    super().__init__()
    self.lemmatizer = WordNetLemmatizer()
    self.tokenizer = Tokenizer()
    self.stop_words = set(stopwords.words('english'))

  def fit(self, X, y=None):
    # Stateless transformer: there is nothing to learn from the training data.
    return self

  def transform(self, X, y=None):
    # Apply the cleaning steps in order; each helper maps over an iterable of documents.
    X = self._remove_html_tags(X)
    X = self._remove_all_punctuations(X)
    X = self._remove_double_spaces(X)
    X = self._lemmatize(X)
    X = self._remove_stopwords(X)
    return X

  def _remove_html_tags(self, X):
    X = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))
    return X

  def _remove_all_punctuations(self, X):
    X = list(
        map(
            lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text),
            X
        )
    )
    return X

  def _remove_double_spaces(self, X):
    X = list(map(lambda text: re.sub(" +", " ", text), X))
    return X

  def _remove_stopwords(self, X):
    X = list(map(
            lambda text: " ".join(
                [
                    word for word in text.split() if word not in self.stop_words
                ]
            ),
            X
        )
    )
    return X

  def _lemmatize(self, X):
    X = list(map(self._lemmatize_one_sentence, X))
    return X

  def _lemmatize_one_sentence(self, sentence):
    sentence = nltk.word_tokenize(sentence)
    sentence = list(map(self.lemmatizer.lemmatize, sentence))
    return " ".join(sentence)

def training_data(dataset_1, dataset_2, dataset_3):
  """Build train/test arrays: concatenate the three train splits, drop duplicate
  texts, and discard any training example whose text also appears in the test split
  of dataset_1, to avoid train/test leakage."""
  X_test = dataset_1['test']['text']
  y_test = dataset_1['test']['label']

  test_df = pd.DataFrame({
      'text': X_test,
      'label': y_test
  })

  combined_train_df = pd.DataFrame({
      'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
      'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
  })

  combined_train_df.drop_duplicates(subset=['text'], inplace=True)

  # Left-join against the test texts; keeping only 'left_only' rows drops every
  # training text that also occurs in the test set.
  merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
  result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

  # 'label_x' holds the training labels after the merge ('label_y' holds test labels).
  X_train = result_df['text'].tolist()
  y_train = result_df['label_x'].tolist()
  X_test = np.array(X_test)
  X_train = np.array(X_train)

  return X_train, y_train, X_test, y_test
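
# Illustrative usage sketch (not from the original file): training_data expects three
# dataset objects with 'train'/'test' splits exposing 'text' and 'label' sequences
# (e.g. Hugging Face datasets). The tiny inline dicts below are stand-ins for real data.
def _example_training_data_usage():
  ds1 = {
      'train': {'text': ["good", "bad"], 'label': [1, 0]},
      'test': {'text': ["great"], 'label': [1]},
  }
  ds2 = {'train': {'text': ["fine"], 'label': [1]}, 'test': {'text': [], 'label': []}}
  ds3 = {'train': {'text': ["awful"], 'label': [0]}, 'test': {'text': [], 'label': []}}
  X_train, y_train, X_test, y_test = training_data(ds1, ds2, ds3)
  return X_train.shape, len(y_train), X_test.shape, len(y_test)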

class CNN(nn.Module):
    """Kim-style text CNN: parallel 2-D convolutions over the embedded sequence with
    different filter heights, max-pooled over time and fed to a linear classifier."""

    def __init__(self, vocab_size, embed_size, n_filters, filter_sizes, dropout, num_classes):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # One Conv2d per filter size; each kernel spans the full embedding dimension.
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embed_size))
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes) * n_filters, num_classes)

    def forward(self, text):
        # text: [batch, seq_len] -> embedded: [batch, 1, seq_len, embed_size]
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        # Each conv output: [batch, n_filters, seq_len - fs + 1] after squeezing the width dim.
        conved = [F.leaky_relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-pool over the time dimension: [batch, n_filters] per filter size.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc1(cat)
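
# Illustrative usage sketch (not from the original file): the hyperparameters and the
# random token-id batch below are placeholders, chosen only to show the expected
# input/output tensor shapes.
def _example_cnn_forward():
    model = CNN(vocab_size=1000, embed_size=64, n_filters=32,
                filter_sizes=[3, 4, 5], dropout=0.5, num_classes=2)
    dummy_batch = torch.randint(0, 1000, (8, 50))  # [batch=8, seq_len=50] token ids
    logits = model(dummy_batch)                    # -> [batch=8, num_classes=2]
    return logits.shape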

def build_vocab(data_iter):
    """Build a torchtext vocabulary over the cleaned, tokenized texts in data_iter.
    Relies on a clean_text() helper that is expected to be defined elsewhere."""
    tokenizer = get_tokenizer("basic_english")

    def yield_tokens():
        for example in data_iter:
            cleaned_text = clean_text(example['text'])
            yield tokenizer(cleaned_text)

    vocab = build_vocab_from_iterator(yield_tokens(), specials=["<unk>", "<pad>"])
    # Unknown tokens map to <unk> instead of raising a KeyError.
    vocab.set_default_index(vocab["<unk>"])
    return vocab, tokenizer
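
# Illustrative usage sketch (not from the original file): builds a toy vocabulary
# directly with build_vocab_from_iterator (bypassing clean_text, which is assumed to
# be defined elsewhere in the project) and shows how new text is mapped to token ids.
def _example_vocab_usage():
    tokenizer = get_tokenizer("basic_english")
    corpus = ["the movie was great", "the plot was thin"]
    vocab = build_vocab_from_iterator((tokenizer(text) for text in corpus),
                                      specials=["<unk>", "<pad>"])
    vocab.set_default_index(vocab["<unk>"])
    # Out-of-vocabulary words ("terrible") fall back to the <unk> index.
    return [vocab[token] for token in tokenizer("the movie was terrible")]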