import string
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.metrics import ConfusionMatrixDisplay
from keras.preprocessing.text import Tokenizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
def download_if_non_existent(res_path, res_name):
    # Download an NLTK resource only if it is not already available locally.
    try:
        nltk.data.find(res_path)
    except LookupError:
        print(f'resource {res_path} not found. Downloading now...')
        nltk.download(res_name)


download_if_non_existent('corpora/stopwords', 'stopwords')
download_if_non_existent('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')
download_if_non_existent('corpora/wordnet', 'wordnet')

def fit_model(pipeline, x_train, y_train, x_test, y_test):
    pipeline.fit(x_train, y_train)
    return ConfusionMatrixDisplay.from_estimator(pipeline, x_test, y_test, normalize="true")

class LinguisticPreprocessor(TransformerMixin):
    """Scikit-learn compatible text cleaner: strips HTML tags, punctuation and
    stopwords, collapses repeated spaces and lemmatizes every token."""

    def __init__(self):
        super().__init__()
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = Tokenizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = self._remove_html_tags(X)
        X = self._remove_all_punctuations(X)
        X = self._remove_double_spaces(X)
        X = self._lemmatize(X)
        X = self._remove_stopwords(X)
        return X

    def _remove_html_tags(self, X):
        X = list(map(lambda x: BeautifulSoup(x, 'html.parser').get_text(), X))
        return X

    def _remove_all_punctuations(self, X):
        X = list(
            map(
                lambda text: re.sub('[%s]' % re.escape(string.punctuation), '', text),
                X
            )
        )
        return X

    def _remove_double_spaces(self, X):
        X = list(map(lambda text: re.sub(" +", " ", text), X))
        return X

    def _remove_stopwords(self, X):
        X = list(map(
            lambda text: " ".join(
                [word for word in text.split() if word not in self.stop_words]
            ),
            X
        ))
        return X

    def _lemmatize(self, X):
        X = list(map(lambda text: self._lemmatize_one_sentence(text), X))
        return X

    def _lemmatize_one_sentence(self, sentence):
        sentence = nltk.word_tokenize(sentence)
        sentence = list(map(lambda word: self.lemmatizer.lemmatize(word), sentence))
        return " ".join(sentence)

def training_data(dataset_1, dataset_2, dataset_3):
    # The test split comes from the first dataset only.
    X_test = dataset_1['test']['text']
    y_test = dataset_1['test']['label']
    test_df = pd.DataFrame({
        'text': X_test,
        'label': y_test
    })
    # Combine the training splits of all three datasets.
    combined_train_df = pd.DataFrame({
        'text': dataset_1['train']['text'] + dataset_2['train']['text'] + dataset_3['train']['text'],
        'label': dataset_1['train']['label'] + dataset_2['train']['label'] + dataset_3['train']['label']
    })
    combined_train_df.drop_duplicates(subset=['text'], inplace=True)
    # Drop any training text that also appears in the test split to avoid leakage.
    merged_df = pd.merge(combined_train_df, test_df, on="text", how='left', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
    X_train = result_df['text'].tolist()
    y_train = result_df['label_x'].tolist()
    X_test = np.array(X_test)
    X_train = np.array(X_train)
    return X_train, y_train, X_test, y_test
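
# --- Illustrative sketch of the inputs training_data expects (an assumption based
# on the indexing above): each dataset maps a split name to columns, and
# dataset[split]['text'] / dataset[split]['label'] behave like plain lists, as with
# Hugging Face DatasetDict objects. The toy datasets below exist only for the example.
def _example_training_data():
    toy_1 = {
        "train": {"text": ["good film", "bad film"], "label": [1, 0]},
        "test": {"text": ["great movie"], "label": [1]},
    }
    toy_2 = {"train": {"text": ["awful acting"], "label": [0]},
             "test": {"text": [], "label": []}}
    toy_3 = {"train": {"text": ["loved it"], "label": [1]},
             "test": {"text": [], "label": []}}
    # Training texts that also appear in toy_1's test split are dropped inside training_data.
    return training_data(toy_1, toy_2, toy_3)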

class CNN(nn.Module):
    """Text CNN: parallel 2D convolutions of different heights over the embedding
    matrix, max-pooled over time and concatenated before the final linear layer."""

    def __init__(self, vocab_size, embed_size, n_filters, filter_sizes, dropout, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embed_size))
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(filter_sizes) * n_filters, num_classes)

    def forward(self, text):
        embedded = self.embedding(text)              # [batch, seq_len, embed_size]
        embedded = embedded.unsqueeze(1)             # [batch, 1, seq_len, embed_size]
        conved = [F.leaky_relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))  # [batch, len(filter_sizes) * n_filters]
        return self.fc1(cat)
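
# --- Illustrative shape check for the CNN above. The hyperparameters are example
# values, not the ones used for the deployed model.
def _example_cnn_forward():
    model = CNN(vocab_size=1000, embed_size=128, n_filters=100,
                filter_sizes=[3, 4, 5], dropout=0.5, num_classes=2)
    dummy_batch = torch.randint(0, 1000, (8, 50))  # batch of 8 sequences, 50 token ids each
    logits = model(dummy_batch)                    # -> [8, 2]
    assert logits.shape == (8, 2)
    return logits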

def build_vocab(data_iter):
    # Build a torchtext vocabulary over the cleaned text of an iterable dataset.
    # clean_text is assumed to be defined elsewhere in the project.
    tokenizer = get_tokenizer("basic_english")

    def yield_tokens():
        for example in data_iter:
            cleaned_text = clean_text(example['text'])
            yield tokenizer(cleaned_text)

    vocab = build_vocab_from_iterator(yield_tokens(), specials=["<unk>", "<pad>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab, tokenizer
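
# --- Illustrative sketch of numericalizing raw text with the objects returned by
# build_vocab. Padding every sequence to the longest one in the batch is an
# assumption; the original batching/collate code is not part of this file.
def _example_numericalize(vocab, tokenizer, texts):
    pad_idx = vocab["<pad>"]
    encoded = [torch.tensor(vocab(tokenizer(text)), dtype=torch.long) for text in texts]
    max_len = max(len(seq) for seq in encoded)
    batch = torch.full((len(encoded), max_len), pad_idx, dtype=torch.long)
    for i, seq in enumerate(encoded):
        batch[i, :len(seq)] = seq
    return batch  # shape [batch_size, max_len], suitable as input to CNN.forward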