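"""Batch analysis script: runs a spaCy engagement-span model over the ICNALE
texts and, for each input file, writes a span-level CSV, an engagement-bigram
table, and a document-level CSV of category counts and diversity values, then
a combined CSV across all files.

Note: several imports below (e.g. spacy_streamlit, the visualization and
template resources) appear to be carried over from the interactive demo this
script sits alongside and are not all used here.
"""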
import re
import os
import spacy_streamlit
from collections import Counter
import glob
import spacy
from spacy.tokens import Doc
from spacy.cli._util import import_code
from utils.visualize import visualize_spans
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
from resources.text_list import TEXT_LIST
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.colors import COLORS_1
from skbio import diversity as dv
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
import pandas as pd
# from pipeline.custom_functions import custom_functions
SPAN_ATTRS = ["text", "label_", "start", "end"]
CATEGORIES = ["ATTRIBUTION", "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
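# CATEGORIES is the fixed inventory of engagement labels predicted under the
# "sc" spans key; it is used below to reindex per-document label counts so
# every document vector has the same columns.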
# spacy.prefer_gpu()
def load_model(spacy_model):
    # source = spacy.blank("en")
    nlp = spacy.load(spacy_model)  # , vocab=nlp_to_copy.vocab
    nlp.add_pipe("sentencizer")
    return nlp
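# Example use of the helper above (not called in this script, which loads the
# model directly below):
#   nlp = load_model("en_engagement_LSTM_f3")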
modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"
os.makedirs(os.path.join("/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results", modelname))
import_code("pipeline/custom_functions.py")
# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
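# Expects the ICNALE corpus as ICNALE_texts/<subcorpus>/<file>.txt; adjust the
# glob pattern for a different layout.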
inputfiles = glob.glob("ICNALE_texts/*/*.txt")
savedir = "ICNALE_analysis"
storeall = True
storage = []
os.makedirs(os.path.join("ICNALE_analysis", modelname))
doc_level_storage = []
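# Main loop: parse each file, post-process the "sc" spans, and write the
# span-level, bigram, and document-level tables per file.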
for file in inputfiles:
    filename = os.path.split(file)[-1]
    with open(file, "r") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)

    # Post-process the span annotations (see utils.utility) before tabulating.
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = [s for s in doc.spans["sc"]]
    span_ngrams = ngrammar(seq=seq, n=3)

    ### Make it a dataset
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert column types
    df = df.sort_values(by=["start"])  # and sort by span start offset

    new_col = pd.Series([filename] * df.shape[0], name="filename")
    doclen = len(doc)  # document length in spaCy tokens
    doc_len = pd.Series([doclen] * df.shape[0], name="nwords")
    df.insert(0, "filename", new_col, True)
    df.insert(1, "nwords", doc_len, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    sequences = list(df["label_"])

    # Engagement bigrams
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    new_col = pd.Series([filename] * bidf.shape[0], name="filename")
    # DataFrame.insert() mutates in place and returns None, so its result
    # must not be assigned back to bidf.
    bidf.insert(0, "filename", new_col, True)

    ## Document level
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")
    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    doc_data.insert(1, "nwords", doclen, True)  # scalar: doc_data has one row per document
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)
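# Combine the per-file span tables into a single CSV; the analogous
# document-level concatenation is left commented out below.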
alldf = pd.concat(storage)
alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")