# Spaces:
#
# egumasa
# /
#
# engagement-analyzer-demo
#
# Running
#
# File size: 4,240 Bytes

import re
import os
import spacy_streamlit
from collections import Counter
import glob

import spacy
from spacy.tokens import Doc
from spacy.cli._util import import_code

from utils.visualize import visualize_spans
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify

from resources.text_list import TEXT_LIST
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.colors import COLORS_1


from skbio import diversity as dv

from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
import pandas as pd

# from pipeline.custom_functions import custom_functions
SPAN_ATTRS = ["text", "label_", "start", "end"]
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]


# spacy.prefer_gpu()

def load_model(spacy_model):
    """Load a spaCy pipeline and make sure it contains a ``sentencizer``.

    Args:
        spacy_model: Value passed unchanged to ``spacy.load``.

    Returns:
        The loaded pipeline object, with a ``sentencizer`` component present.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe raises ValueError if a component with this name is already in
    # the pipeline, so only add it when it is missing.
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    return nlp

# source = spacy.blank("en")

# Pipeline name handed to spacy.load below; the commented lines are
# alternative models that can be swapped in.
modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# exist_ok=True: without it every run after the first one aborts with
# FileExistsError because the results directory is already there.
# NOTE(review): absolute, machine-specific path -- confirm it is still wanted.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Run the project's custom-function module before loading the model
# (presumably so the components the pipeline config refers to are registered
# first -- TODO confirm).
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
# doc = nlp(preprocess(TEXT_LIST[0]))

# cleanup_justify(doc, doc.spans["sc"])
# delete_overlapping_span(doc.spans['sc'])

# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
# seq = [s for s in doc.spans["sc"]]
# span_ngrams = ngrammar(seq=seq, n=3)

# df = pd.DataFrame(data, columns=cols)

# constant_value = 42
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')

# doclen = len(doc)
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')

# df.insert(0, "new", new_col, True)
# df.insert(1, "nwords", doc_len, True)

# df.to_csv("results/test.csv")



# Batch analysis: run the pipeline over every input text, write one span-level
# CSV and one document-level CSV per file, then one combined span-level CSV.

# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
inputfiles = glob.glob("ICNALE_texts/*/*.txt")
savedir = "ICNALE_analysis"
storeall = True  # when True, keep every per-file table for the combined export
storage = []
doc_level_storage = []

# Per-model output directory. exist_ok=True keeps a second run from failing
# with FileExistsError once the directory has been created.
outdir = os.path.join(savedir, modelname)
os.makedirs(outdir, exist_ok=True)

for file in inputfiles:
    filename = os.path.split(file)[-1]

    # NOTE(review): no explicit encoding, so the platform default is used --
    # confirm the corpus encoding before pinning one.
    with open(file, "r") as f:
        text = f.read()

    doc = nlp(preprocess(text))
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    # NOTE(review): the trigram result is computed but not written anywhere.
    span_ngrams = ngrammar(seq=seq, n=3)

    # --- Span-level table: one row per span, ordered by start position ---
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})
    df = df.sort_values(by=["start"])

    doclen = len(doc)
    # Scalars are broadcast to every row, so no index alignment is involved.
    df.insert(0, "filename", filename, allow_duplicates=True)
    df.insert(1, "nwords", doclen, allow_duplicates=True)
    df.to_csv(os.path.join(outdir, f"{filename}.csv"))

    # --- Engagement bigrams ---
    # NOTE(review): bidf is built but not saved anywhere yet.
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert works in place and returns None; the original
    # re-assigned its result and thereby replaced bidf with None.
    bidf.insert(0, "filename", filename, allow_duplicates=True)

    # --- Document-level table ---
    # Label frequencies in CATEGORIES order, 0 for labels that do not occur.
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, allow_duplicates=True)
    # Insert the scalar token count. The original inserted the per-span
    # Series, which is aligned on index and gives NaN for a document that has
    # no spans at all.
    doc_data.insert(1, "nwords", doclen, allow_duplicates=True)
    doc_data.to_csv(os.path.join(outdir, f"ddata_{filename}.csv"))

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)


# pd.concat raises ValueError on an empty list (no input files matched, or
# storeall is False), so only build the combined table when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
else:
    print("No per-file tables were stored; combined CSV not written.")


# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")