"""
Learn to classify the manually annotated CDA attributes (frames, 'riferimento', orientation)
"""
import json

import gensim
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC

# path to pretrained Italian GloVe vectors, used by the embedding-based setups
GLOVE_MODEL = "/net/aistaff/gminnema/thesis_data/data/glove-it/glove_WIKI"


def train(attrib):
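    """Train an SVM on one annotated CDA attribute and evaluate each feature setup on the dev set."""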
assert attrib in ["cda_frame", "riferimento", "orientation"]
# load data
print("Loading data...")
x_train, y_train, x_dev, y_dev = load_data(attrib)
print(f"\t\ttrain size: {len(x_train)}")
print(f"\t\tdev size: {len(x_dev)}")
# try different setups
print("Running training setups...")
scores = []
setups = [
# defaults: remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False
# ({}, {}, SVC(kernel='linear')),
# ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear')),
# ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5}, SVC(kernel='linear')),
# ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5, "max_freq": .70}, SVC(kernel='linear')),
# ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.6)),
# ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.7)),
# ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.8)),
({"lemmatize": True, "remove_stop": True}, {"embed": "glove"}, SVC(kernel='linear', C=0.8)),
# ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel="rbf")),
]
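    # Italian spaCy pipeline, used for tokenization and lemmatization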
nlp = spacy.load("it_core_news_md")
for s_idx, (text_options, vect_options, model) in enumerate(setups):
print(f"\tSetup #{s_idx}")
# extract features
print("\t\tExtracting features...")
x_train_fts, vectorizer = extract_features(x_train, nlp, text_options, **vect_options)
x_dev_fts, _ = extract_features(x_dev, nlp, text_options, **vect_options, vectorizer=vectorizer)
print(f"\t\t\tnum features: {len(vectorizer.vocabulary_)}")
print("\t\tTraining the model...")
model.fit(x_train_fts, y_train)
# evaluate on dev
print("\t\tValidating the model...")
y_dev_pred = model.predict(x_dev_fts)
p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
y_dev, y_dev_pred, average="micro")
p_classes, r_classes, f_classes, _ = precision_recall_fscore_support(
y_dev, y_dev_pred, average=None, labels=model.classes_, zero_division=0)
print(
f"\t\t\tOverall scores (micro-averaged):\tP={p_micro}\tR={r_micro}\tF={f_micro}"
)
        # cast numpy types to plain Python so the scores stay JSON-serializable
        scores.append({
            "micro": {
                "p": float(p_micro),
                "r": float(r_micro),
                "f": float(f_micro)
            },
            "classes": {
                "p": [(c, float(p)) for c, p in zip(model.classes_, p_classes)],
                "r": [(c, float(r)) for c, r in zip(model.classes_, r_classes)],
                "f": [(c, float(f)) for c, f in zip(model.classes_, f_classes)],
            }
        })
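        # dump per-headline predictions for this setup so errors can be inspected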
prediction_df = pd.DataFrame(zip(x_dev, y_dev, y_dev_pred), columns=["headline", "gold", "prediction"])
prediction_df.to_csv(f"output/migration/cda_classify/predictions_{s_idx:02}.csv")
with open("output/migration/cda_classify/scores.json", "w", encoding="utf-8") as f_scores:
json.dump(scores, f_scores, indent=4)


def load_data(attrib):
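    """Load the train/dev headlines and the gold labels for the requested attribute."""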
train_data = pd.read_csv(
"output/migration/preprocess/annotations_train.csv")
dev_data = pd.read_csv("output/migration/preprocess/annotations_dev.csv")
x_train = train_data["Titolo"]
x_dev = dev_data["Titolo"]
if attrib == "cda_frame":
y_train = train_data["frame"]
y_dev = dev_data["frame"]
elif attrib == "riferimento":
y_train = train_data["riferimento"]
y_dev = dev_data["riferimento"]
    else:
        y_train = train_data["orientation"]
        y_dev = dev_data["orientation"]
return x_train, y_train, x_dev, y_dev


def extract_features(headlines, nlp, text_options, min_freq=1, max_freq=1.0, embed=None, vectorizer=None):
    tokenized = [" ".join(sent) for sent in tokenize(headlines, nlp, **text_options)]
    if embed is None:
        # bag-of-words features; reuse the vectorizer fitted on the training set for the dev set
        if vectorizer is None:
            vectorizer = CountVectorizer(lowercase=False, analyzer="word", min_df=min_freq, max_df=max_freq)
            vectorized = vectorizer.fit_transform(tokenized)
        else:
            vectorized = vectorizer.transform(tokenized)
    else:
        # embedding features: represent each headline as the mean of its GloVe word vectors
        # (assumes GLOVE_MODEL is stored in word2vec text format; adjust the loader if it is not)
        if vectorizer is None:
            vectorizer = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_MODEL, binary=False)
        vectorized = np.array([
            np.mean([vectorizer[t] for t in sent.split() if t in vectorizer], axis=0)
            if any(t in vectorizer for t in sent.split())
            else np.zeros(vectorizer.vector_size)
            for sent in tokenized
        ])
    return vectorized, vectorizer


def tokenize(headlines, nlp, remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False):
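    """Tokenize each headline with spaCy, applying the requested normalization options."""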
for sent in headlines:
doc = nlp(sent)
tokens = (
t.lemma_ if lemmatize else t.text
for t in doc
if (not remove_stop or not t.is_stop) and (not remove_punct or t.pos_ not in ["PUNCT", "SYM", "X"])
)
        if lowercase:
            tokens = [t.lower() for t in tokens]
        else:
            tokens = list(tokens)
yield tokens


if __name__ == '__main__':
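    # the other annotated attributes, "riferimento" and "orientation", can be trained the same way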
train(attrib="cda_frame")