File size: 4,882 Bytes
cee1077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import nltk\n",
    "import csv\n",
    "\n",
    "# Load the saved CRF model\n",
    "crf_model = joblib.load(r'D:\\Thesis\\POS Tag Automation\\crf_model.pkl')\n",
    "\n",
    "def word_features(sent, i):\n",
    "    word = sent[i][0]\n",
    "    pos = sent[i][1]\n",
    "    \n",
    "    # first word\n",
    "    if i == 0:\n",
    "        prevword = '<START>'\n",
    "        prevpos = '<START>'\n",
    "    else:\n",
    "        prevword = sent[i-1][0]\n",
    "        prevpos = sent[i-1][1]\n",
    "        \n",
    "    # first or second word\n",
    "    if i == 0 or i == 1:\n",
    "        prev2word = '<START>'\n",
    "        prev2pos = '<START>'\n",
    "    else:\n",
    "        prev2word = sent[i-2][0]\n",
    "        prev2pos = sent[i-2][1]\n",
    "    \n",
    "    # last word\n",
    "    if i == len(sent) - 1:\n",
    "        nextword = '<END>'\n",
    "        nextpos = '<END>'\n",
    "    else:\n",
    "        nextword = sent[i+1][0]\n",
    "        nextpos = sent[i+1][1]\n",
    "    \n",
    "    # prefixes and suffixes of length 1-4 (a shorter word yields the whole word)\n",
    "    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]\n",
    "    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]\n",
    "    \n",
    "    return {'word':word,            \n",
    "            'prevword': prevword,\n",
    "            'prevpos': prevpos,  \n",
    "            'nextword': nextword, \n",
    "            'nextpos': nextpos,          \n",
    "            'suff_1': suff_1,  \n",
    "            'suff_2': suff_2,  \n",
    "            'suff_3': suff_3,  \n",
    "            'suff_4': suff_4, \n",
    "            'pref_1': pref_1,  \n",
    "            'pref_2': pref_2,  \n",
    "            'pref_3': pref_3, \n",
    "            'pref_4': pref_4,\n",
    "            'prev2word': prev2word,\n",
    "            'prev2pos': prev2pos           \n",
    "           }\n",
    "\n",
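    "# Tokenize a sentence, predict a tag for every token with the CRF model, and\n",
    "# return three parallel lists: tokens, predicted tags, and the sentence-level\n",
    "# label repeated once per token.\n",
    "# NOTE(review): nltk.pos_tag is called on each token separately, so the NLTK\n",
    "# tags fed into the features carry no sentence context - confirm this matches\n",
    "# how the CRF training features were built.\n",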
    "def process_sentence(sentence, label):\n",
    "    tokens = nltk.word_tokenize(sentence)\n",
    "    tagged_tokens = [(token, nltk.pos_tag([token])[0][1]) for token in tokens]\n",
    "    features = [word_features(tagged_tokens, i) for i in range(len(tagged_tokens))]\n",
    "    predicted_labels = crf_model.predict([features])[0]\n",
    "    predicted_tokens_with_labels = list(zip(tokens, predicted_labels))\n",
    "    input_tokens = [token[0] for token in predicted_tokens_with_labels]\n",
    "    pos_tags = [token[1] for token in predicted_tokens_with_labels]\n",
    "    return input_tokens, pos_tags, [label] * len(input_tokens)\n",
    "\n",
    "# Input CSV file path\n",
    "input_csv_file = r\"D:\\Thesis\\Datasets\\preprocessed_dataset.csv\"\n",
    "# Output CSV file path\n",
    "output_csv_file = \"testing_bert_finetune.csv\"\n",
    "\n",
    "# Open input CSV file for reading\n",
    "with open(input_csv_file, 'r', newline='', encoding='utf-8') as csv_input_file:\n",
    "    reader = csv.reader(csv_input_file)\n",
    "    # Open output CSV file for writing\n",
    "    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_output_file:\n",
    "        writer = csv.writer(csv_output_file)\n",
    "        # Write header to output CSV file\n",
    "        writer.writerow(['sentence', 'pos_tag', 'label'])\n",
    "        # Skip header row in input CSV file\n",
    "        next(reader)\n",
    "        # Process each row in input CSV file\n",
    "        for row in reader:\n",
    "            sentence = row[0]\n",
    "            label = row[1]\n",
    "            # Process the sentence to obtain tokens with POS tags and labels\n",
    "            tokens, pos_tags, labels = process_sentence(sentence, label)\n",
    "            # Write [CLS] token\n",
    "            writer.writerow(['[CLS]', '[CLS]', '1'])\n",
    "            # Write each token with its POS tag and label to the output CSV file\n",
    "            for token, pos_tag, label in zip(tokens, pos_tags, labels):\n",
    "                writer.writerow([token, '[POS_' + pos_tag + ']', label])\n",
    "            # Write [SEP] token at the end of the sentence\n",
    "            writer.writerow(['[SEP]', '[SEP]', '1'])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}