egumasa commited on
Commit
0146ef9
1 Parent(s): ede0fe0
# Specify latent true scores ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Specify latent true scores
2
+ lx1 =~ 1 * x1
3
+ lx2 =~ 1 * x2
4
+ lx3 =~ 1 * x3
5
+
6
+ # Specify mean of latent true scores
7
+ lx1 ~ gamma_lx1 * 1
8
+ lx2 ~ 0 * 1
9
+ lx3 ~ 0 * 1
10
+ # Specify variance of latent true scores
11
+ lx1 ~~ sigma2_lx1 * lx1
12
+ lx2 ~~ 0 * lx2
13
+ lx3 ~~ 0 * lx3
14
+ # Specify intercept of observed scores
15
+ x1 ~ 0 * 1
16
+ x2 ~ 0 * 1
17
+ x3 ~ 0 * 1
18
+ # Specify variance of observed scores
19
+ x1 ~~ sigma2_ux * x1
20
+ x2 ~~ sigma2_ux * x2
21
+ x3 ~~ sigma2_ux * x3
22
+ # Specify autoregressions of latent variables
23
+ lx2 ~ 1 * lx1
24
+ lx3 ~ 1 * lx2
25
+ # Specify latent change scores
26
+ dx2 =~ 1 * lx2
27
+ dx3 =~ 1 * lx3
28
+ # Specify latent change scores means
29
+ dx2 ~ 0 * 1
30
+ dx3 ~ 0 * 1
31
+ # Specify latent change scores variances
32
+ dx2 ~~ 0 * dx2
33
+ dx3 ~~ 0 * dx3
34
+ # Specify constant change factor
35
+ g2 =~ 1 * dx2 + 1 * dx3
36
+ # Specify constant change factor mean
37
+ g2 ~ alpha_g2 * 1
38
+ # Specify constant change factor variance
39
+ g2 ~~ sigma2_g2 * g2
40
+ # Specify constant change factor covariance with the initial true score
41
+ g2 ~~ sigma_g2lx1 * lx1
42
+ # Specify proportional change component
43
+ dx2 ~ beta_x * lx1
44
+ dx3 ~ beta_x * lx2
45
+ # Specify autoregression of change score
46
+ dx3 ~ phi_x * dx2
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ECCE_analysis
2
+ ECCE_texts
3
+ ICNALE_analysis
4
+ ICNALE_texts
5
+ results
6
+ inputtexts
7
+ .DS_Store
8
+
README.md CHANGED
@@ -1,14 +1,15 @@
1
  ---
2
- title: Engagement Analyzer Demo5
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: streamlit
7
- sdk_version: 1.39.0
8
- app_file: app.py
9
  pinned: false
10
- license: cc-by-nc-sa-4.0
11
- short_description: demo5
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
  ---
2
+ title: Engagement analyzer (demo)
3
+ emoji: 👀
4
+ colorFrom: gray
5
+ colorTo: red
6
  sdk: streamlit
7
+ sdk_version: 1.25.0
8
+ app_file: demo.py
9
  pinned: false
 
 
10
  ---
11
 
12
+ # engagement-analyzer-demo
13
+
14
+ This is a demo of an automatic analysis tool for Engagement (Martin & White, 2005).
15
+
analyzer.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import spacy_streamlit
4
+ from collections import Counter
5
+ import glob
6
+
7
+ import spacy
8
+ from spacy.tokens import Doc
9
+ from spacy.cli._util import import_code
10
+
11
+ from utils.visualize import visualize_spans
12
+ from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
13
+
14
+ from resources.text_list import TEXT_LIST
15
+ from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
16
+ from resources.colors import COLORS_1
17
+
18
+
19
+ from skbio import diversity as dv
20
+
21
+ from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
22
+ import pandas as pd
23
+
24
+ # from pipeline.custom_functions import custom_functions
25
# Span attributes exported into the per-span result tables below.
SPAN_ATTRS = ["text", "label_", "start", "end"]
# Engagement move labels predicted by the model; this list also fixes the
# column order when per-document counts are re-indexed further below.
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]


# spacy.prefer_gpu()
31
def load_model(spacy_model):
    """Load the requested spaCy pipeline and attach a sentencizer.

    Parameters
    ----------
    spacy_model : str
        Name or path of an installed spaCy pipeline package.

    Returns
    -------
    The loaded ``Language`` object with a ``sentencizer`` component added.
    """
    pipeline = spacy.load(spacy_model)  # , vocab=nlp_to_copy.vocab
    pipeline.add_pipe('sentencizer')
    return pipeline
36
# Batch analysis script: run the engagement span categorizer over a corpus of
# text files and write per-file span tables, per-file document-level summaries,
# and one combined CSV.

modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# NOTE(review): hard-coded absolute path — only valid on the author's machine.
# exist_ok=True so a re-run does not crash when the folder already exists.
os.makedirs(
    os.path.join("/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results", modelname),
    exist_ok=True,
)

# Register the custom spaCy factories before loading the pipeline.
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)

# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
inputfiles = glob.glob("ICNALE_texts/*/*.txt")
savedir = "ICNALE_analysis"
storeall = True
storage = []
os.makedirs(os.path.join("ICNALE_analysis", modelname), exist_ok=True)

doc_level_storage = []

for file in inputfiles:
    filename = os.path.split(file)[-1]

    with open(file, "r") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans['sc'])

    # Span-level table: one row per predicted engagement span.
    data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    span_ngrams = ngrammar(seq=seq, n=3)

    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert col type
    df = df.sort_values(by=['start'])  # and sort by start

    new_col = pd.Series([filename] * df.shape[0], name='filename')
    doclen = len(doc)
    doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
    df.insert(0, "filename", new_col, True)
    df.insert(1, "nwords", doc_len, True)
    # One CSV per input file, keyed by the source file name
    # (restored from a garbled placeholder in the source — confirm).
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    sequences = list(df['label_'])

    # Engagement bigrams for this document.
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    new_col = pd.Series([filename] * bidf.shape[0], name='filename')
    # BUG FIX: DataFrame.insert mutates in place and returns None; the
    # original `bidf = bidf.insert(...)` replaced bidf with None.
    bidf.insert(0, "filename", new_col, True)

    # Document-level summary: category counts plus diversity indices.
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient='index')

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # BUG FIX: doc_data has a single row, so insert the scalar document
    # length; inserting the per-span Series would misalign on the index.
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)

alldf = pd.concat(storage)
alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")

# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")
demo.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ # import spacy_streamlit
3
+ # from collections import Counter
4
+
5
+ import spacy
6
+ # from spacy.tokens import Doc
7
+
8
+ # from spacy_streamlit import visualize_spans
9
+
10
+ import streamlit as st
11
+
12
+ from utils.utility import delete_overlapping_span, cleanup_justify
13
+ from utils.visualize import visualize_spans
14
+
15
+ # nlp = spacy.load(
16
+ # "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
17
+ # )
18
+
19
+ # Load from local storage
20
+ # MODEL_LIST = ['en_engagement_RoBERTa-ME-AtoE.tar.gz']
21
+
22
+ # model = st.selectbox('Select model', MODEL_LIST, index=0)
23
+ # nlp = spacy.load("packages/" + model)
24
+
25
+ # Load from huggingface
26
+ # sm = spacy.load('en_core_web_sm', disable=['ner'])
27
+
28
# Configure the Streamlit page; must be the first Streamlit call in the script.
st.set_page_config(
    page_title="ENGAGEMENT analyzer (beta ver 0.3)",
    layout="wide",
    initial_sidebar_state="expanded",
)
33
+
34
+
35
@st.cache_resource
def load_model():
    """Load and cache the engagement span-categorizer pipeline.

    Cached with st.cache_resource so the model is loaded once per server
    process, not on every Streamlit rerun.
    """
    # Alternative checkpoints kept for reference:
    # "en_engagement_RoBERTa_context_flz"
    # "en_engagement_spl_RoBERTa_base_attention"
    return spacy.load("en_engagement_LSTM")


nlp = load_model()

# Warm-up / placeholder parse shown before the user submits any text.
doc = nlp(
    "Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here."
)
48
+
49
+ # TPL_ENT = """
50
+ # <mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
51
+ # {text}
52
+ # <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
53
+ # </mark>
54
+ # """
55
+
56
+ TPL_SPANS = """
57
+ <div class="spans" style="line-height: 4.5;">
58
+ {text}
59
+ {span_slices}
60
+ {span_starts}
61
+ </div>
62
+ """
63
+
64
+ TPL_SPAN = """
65
+ <span style="font-weight: bold; display: inline-block; line-height: 3; padding-bottom: 12px;position: relative;">
66
+ {text}
67
+ {span_slices}
68
+ {span_starts}
69
+ </span>
70
+ """
71
+
72
+ TPL_SPAN_SLICE = """
73
+ <span style="background: {bg}; top: {top_offset}px; display: inline-block; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
74
+ </span>
75
+ """
76
+
77
+ TPL_SPAN_START = """
78
+ <span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
79
+ <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
80
+
81
+ {label}{kb_link}
82
+ </span>
83
+ </span>
84
+
85
+ """
86
+
87
+ # TPL_SPAN_START_RTL = """
88
+ # <span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
89
+ # <span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
90
+ # {label}{kb_link}
91
+ # </span>
92
+ # </span>
93
+ # """
94
+
95
+ DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
96
+
97
+ TEXT_LIST = [
98
+ """To a significant extent, individuals can be considered responsible for the rise of Hitler to power on the 31st of January, 1933. Hitler himself, the charismatic leader of the Nazi Party, as well as creator of Nazi policy, played a key role in his own rise to power. However, other individuals in government, such as Hindenburg and von Papen were influential in Hitler’s rise. To a small extent, other factors also enabled Hitler to rise to power such as the Depression and the weakness of the political system. Nevertheless to a significant extent, individuals can be held responsible for the rise of Adolf Hitler to power.""",
99
+ """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia.""",
100
+ """Certainly, the argumentation is not without some faults. For example, the statement that “linking homosexuality to witches fulfills the same purpose” is not supported by references to the readings. It is not clear who was linking homosexuality to witches and in what context. Nevertheless, overall and in line with the general tendencies reported in the previous section, the author employs various contracting and expanding engagement resources successfully. However, a large part of the successful use of engagement resources seems to be related to how the author structures these strategies throughout the text, namely in a wave-like fashion: from acknowledging the opinions of others, to countering them by offering one’s own interpretation, to supporting it by acknowledging other sources.""",
101
+ """As the centuries passed, accounts of witchcraft became more and more specific; details of witches’ ceremonies and oaths became more concrete and whatever the condemned humans confessed to was treated as fact. As discussants correctly pointed out, Bernardino of Siena, Martin Le Franc, and the anonymous author of the Errores Gazariorum all have an even more aggressive campaign against witches than did the authors of our previous readings. By depicting their rituals and customs, they look to paint the most grotesque picture of witches possible. Their frenzied accusations, were some of the main catalysts of the subsequent witch hunts.""",
102
+ """The post labeled “Witchcraft as a Problem in Society” clearly explains the contribution that each text makes to the witch hunts. While two of the authors focused on describing, in full detail, the shocking and disturbing practices that witches partook of, the others tried to prove that the witch threat was real. These last texts sought to explain witchcraft so as to convince readers that witches actually existed. As all posts reiterate, the devil is definitely at the source of witchcraft.""",
103
+ """The third part temporarily puts aside mediation analysis and shifts the discussion to moderation analysis. In Chapter 7, I show how a multiple regression model can be made more flexible by allowing one variable’s effect to depend linearly on another variable in the model. The resulting moderated multiple regression model allows an investigator to ascertain the extent to which X’s influence on outcome variable Y is contingent on or interacts with a moderator variable W.""",
104
+ """For instance, research has shown that people have a tendency to justify close others’ unethical actions to protect them (Gino and Galinsky 2012). Research has also shown that parents who feel close to their children often adopt strict curfew practices (Elder et al., 1995). (EC-33)""",
105
+ """Fitzpatrick and Pagani (2013) found that engagement skills in classroom behaviour at kindergarten were related with better math scores and academic success. (LC-0525-EN)""",
106
+ """The COAG Reform Council (2013) indicated that when compared to other students, Australian Year 4 students who attended one year of ECEC services or programs gained 11 points higher in reading (LC-0471-MA). Preliminary evidence suggests that teaching children from low-income families using humanoid robots increases motivation, sense of community, and self-expression... (EC-64). These findings suggest that visual perception takes up only a small fraction of fixation durations. Specifically, Verdelhan (2010) proposes a two-country, one-good model in which each country has an exogenously specified i.i.d. consumption growth process. Waters & Baur (2003) suggest that children or adolescents who are overweight or obese suffer from social and psychological issues. (LC-0460-EN)""",
107
+ """According to the Australian Bureau of Statistics (2008), the percentage of obese or overweight adults is a staggering 60%.
108
+ According to George et al. (2011), in the UK immigration has improved the academic performance of the native children.
109
+ According to UNICEF (2011) a child that is breastfed within the first hour of life is fourteen times less likely to die from diarrhoea or pneumonia.""",
110
+ """As far as I am concerned, I do think globalization is good chance for China’s developing. From my point of view, I prefer to think that advantages of globalization outweighs disadvantages. """,
111
+ """As we know, China has made great progress for these years. I think it is the result of globalization. We all know China is a fast-developing country. We can seethe great progress that China has made. """,
112
+ """His idea was that an important ninth century bishop called John Anglicus may indeed have given birth to a child in full view of everyone on the streets of Rome, but that this bishop was not and never had been the pope. Of course, there is no evidence whatever for this, as Leibnitz himself well knew.""",
113
+ """On the whole, however, when evaluating meanings metaphorically, the Chinese EFL learners hedge and qualify their statements subjectively, tempering the certainty and authority of their assertions rather than using the resources of interpersonal metaphor to reinforce and substantiate their arguments. These tendencies reveal a key area for pedagogical intervention. Namely, instruction could focus on the value of construing metaphors objectively to obscure the author as the source of the evaluation. Similarly, raising students’ awareness of the space of negotiation and the value of offering assertions on a cline of certainty (e.g., IT IS EVIDENT) rather than through exclusive declarations of shared knowledge (e.g., AS WE ALL KNOW) is critical for academic writing refinement. Instructional interventions such as these are key areas for further investigation.""",
114
+ """Of the defendants involved in Utah Pie Company’s case only one seems to have emerged as exceptionally successful. However this success was not a factor of overwhelming market power, as can be seen by the dominant position of Mrs. Smith’s during this time, which had maintained a 39-45 percent market share over the corresponding period.""",
115
+ """Because of the evidence presented by Tremblay and Tremblay, it would appear that mergers in the brewing industry would have been procompetitive because of economies of scale. However, allowing a firm to acquire more than 20% of the market in Wisconsin would give it too much power to charge higher prices, even if the merger would help lower total average costs.""",
116
+ """Taken in whole, the economic evidence for grocery retailers in the decades after the Von’s decision suggests that increased concentration is pro-competitive and good for consumers, running contrary to the fears proposed by the Court.""",
117
+ """The remedies that Justice Lewis Powell prescribed did not gain the desired effect, and I feel that they were not very effective in promoting competition. (Elan, S86)""",
118
+ """There is the possibility for abuse if the producer sets different maximum prices for different retailers, allowing some to reap higher profits.""",
119
+ """Such a program, with appropriate limits, would provide a balanced structure that would ensure quality patient care.""",
120
+ """A recent survey of physician satisfaction by Harvard Medical School found that physician autonomy and the ability to provide high-quality care, not income, are most strongly associated with changes in job satisfaction . Thus, it seems reasonable to assume that health care providers would take advantage of the greater bargaining power to improve the quality of care. (Ken, S78-79)""",
121
+ """It appears, then, that maximum price fixing does the greatest harm when set below a competitive level [evidentialize]. In Case 4 it could potentially do harm to small retailers trying to enter the market [suggest], but does so for the benefit of consumers and the producer. Based purely on the models, it appears that, at the very least, maximum prices deserve a Rule of Reason approach to evaluate their cost and benefits.""",
122
+ """It could be seen that for this 68% of the respondents, Tampines was characteristically a location that provided for them all their basic needs. It can be seen from chart [11] that many people quoted accessibility and proximity to home, and even shopping as one of the ideal factors that drew them there. Accessibility is quite a key factor because it is evident that the regional centre was built on the basis of good infrastructure. In comparison, 32% of the respondents felt that the conventional downtown was still a major attraction, even though the regional centre had gained quite a vast amount of popularity and did to large extent have an air of modernity.""",
123
+ ]
124
+
125
+
126
@st.cache_resource
def preprocess(text):
    """Normalize whitespace while preserving paragraph breaks.

    Blank-line paragraph breaks are protected with a sentinel token, all
    remaining whitespace runs are collapsed to single spaces, and the
    sentinel is converted back into a blank line.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Whitespace-normalized text.
    """
    # BUG FIX: raw strings so "\s" is a regex escape, not an invalid string
    # escape (a SyntaxWarning/DeprecationWarning on modern Python).
    text = re.sub(r"\n\n", " &&&&&&&&#&#&#&#&", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"&&&&&&&&#&#&#&#&", "\n\n", text)
    return text
133
+
134
+
135
@st.cache_resource
def delete_span(span_sc):
    """Remove spans that cross a sentence boundary, in place.

    A span whose tokens fall into more than one sentence is assumed to be a
    model artifact and is dropped from the span group.

    NOTE(review): caching an in-place mutator with st.cache_resource means
    repeated calls with an equal argument may be skipped — confirm intended.

    Parameters
    ----------
    span_sc
        A spaCy span group (e.g. ``doc.spans["sc"]``); mutated in place.
    """
    # BUG FIX: the original enumerated with start=1, so every recorded index
    # was off by one (deleting the *next* span, and possibly raising on the
    # last one); it also deleted in ascending order, shifting later indices.
    doomed = [i for i, spn in enumerate(span_sc) if len(list(spn.sents)) > 1]
    # Delete from the end so earlier indices remain valid.
    for idx in reversed(doomed):
        del span_sc[idx]
148
+
149
+
150
+ # st.markdown('''
151
+ # <style>
152
+ # .sidebar .sidebar-content {{
153
+ # width: 300px;
154
+ # }}
155
+ # </style>
156
+ # ''',
157
+ # unsafe_allow_html=True)
158
+
159
+ with st.sidebar:
160
+ st.markdown("""
161
+
162
+ ## Engagement moves analyzed in this tool (adapted from Martin & White, 2005).
163
+
164
+ | Engagement moves | Description |
165
+ | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
166
+ | `Deny` (Contract -> disclaim) | An utterance which invokes a contrary position but which at the same time rejects it directly. The contrary position is hence given very little dialogic space. |
167
+ | `Counter` (Contract -> disclaim) | An utterance which expresses the present proposition as replacing and thus 'countering' another proposition which would have been expected. |
168
+ | `Concur` (Contract -> proclaim) | An utterance which shows writers' expectation/assumption that the putative readers will agree with the preposition and/or to have the same knowledge. |
169
+ | `Pronounce` (Contract -> proclaim) | An utterance which expresses a strong level of writer commitment through the author's explicit emphasis and interpolation, thereby closing down the dialogic space. |
170
+ | `Endorse` (Contract -> proclaim) | An utterance which refers to external sources as warrantable, undeniable, and/or reliable. It expresses the writer’s alignment with and endorsement of an attributed proposition. As such, the dialogic space is somewhat narrowed. |
171
+ | `Entertain` (Expand) | An utterance which indicates author's position but as only one possibility amongst others, thereby opening up dialogic space. |
172
+ | `Attribute` (Expand) | An utterance which signifies dialogic space as the writer attributes the proposition to an external source. |
173
+ | `Monogloss` | An utterance which does not employ any value of engagement. Such an utterance ignores the dialogic potential in an utterance. |
174
+
175
+ """)
176
+ # For a more complete description of the category, visit [the annotation guideline](https://egumasa.github.io/engagement-annotation-project/3_Categories/)!!
177
+
178
+ st.sidebar.markdown("""
179
+ Engagement Analyzer is developed by [Masaki Eguchi](https://masakieguchi.weebly.com).
180
+
181
+ ### Acknowledgements:
182
+
183
+ The development of this tool has been supported by the following grants:
184
+
185
+ - The TIRF Doctoral Dissertation Grant 2022 sponsored by the International Research Foundation for English Language Education (TIRF)
186
+ - The NFMLTA-MLJ Doctoral Dissertation Writing Support Grant 2022 sponsored by the National Federation of Modern Language Teachers Associations (NFMLTA)
187
+ - Duolingo English Test Doctoral Dissertation Award, 2022
188
+ - The Graduate Student Research Award sponsored by the Department of Linguistics, University of Oregon
189
+
190
+ I would also like to thank:
191
+ - Aaron Miller (Linguistics, University of Oregon) for corpus annotation
192
+ - Ryan Walker (Linguistics/Anthropology, University of Oregon) for corpus annotation
193
+ - Dr. Kristopher Kyle (Associate Professor in Linguistics, University of Oregon)
194
+ """)
195
+
196
+
197
+ cc = '<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.'
198
+
199
+ st.sidebar.markdown(cc, unsafe_allow_html=True)
200
+
201
+ st.header("Engagement Analyzer (beta ver 0.2)")
202
+ st.write(
203
+ "Engagement Analyzer is a free tool that analyzes English texts for rhetorical strategies under the Engagement system framework (Martin & White, 2005). Martin and White (2005) propose two basic stance-taking strategies: expansion and contraction, which are in turn divided into finer-grained rhetorical strategies. The current tool allows you to analyze texts for a total of nine rhetorical strategies. The definitions of each category label can be found from the side bar"
204
+ )
205
+
206
+ with st.expander("See more explanation"):
207
+ st.markdown("""
208
+ According to Martin & White (2005), Engagement is about how the writer of a text takes stances on a topic of discussion by `Expanding (= open)` or `Contracting (= close)` the discourse for alternative viewpoints.
209
+
210
+ **Expansion strategy** = Discourse moves which open-up the dialogic space; the speaker/writer actively makes allowances for dialogically alternative positions and voices. (e.g., `ENTERTAIN`, `ATTRIBUTE`)
211
+
212
+ **Contraction strategy** = Discourse moves which close down dialogic space; the speaker/writer acts to challenge, fend off or restrict other alternative positions and voices (e.g., `DENY`, `COUNTER`, `PRONOUNCE`, `ENDORSE`).
213
+
214
+ """)
215
+
216
+ st.info("""Updated on Jan.11th, 2023\n
217
+ The current version was trained on 2,519 sentences and tested on 443 sentences. It achieved the following benchmark:
218
+ - Macro F1 = .75
219
+ - Macro Precision = .78
220
+ - Macro Recall = .74
221
+ I expect that the model's performance improves as the annotated dataset gets larger.
222
+ """)
223
+
224
+ with st.form("my_form"):
225
+ st.subheader("Option 1: selecting example text from list")
226
+ text = st.selectbox("", TEXT_LIST)
227
+
228
+ st.subheader("Option 2: analyze your own text")
229
+ input_text = st.text_area(
230
+ label="",
231
+ value="I would strongly encourage you to put your texts here to analyze it for stance-taking expressions.",
232
+ height=120,
233
+ )
234
+ st.text(
235
+ "The text from the pull-down list and in the textbox cannot be analyzed at the same time. Please select the mode."
236
+ )
237
+
238
+ textmode = st.radio(
239
+ label="Choose the mode.",
240
+ options=["Option 1: Pull-down choice", "Option 2: My own text"],
241
+ index=1,
242
+ )
243
+
244
+ submitted = st.form_submit_button("Submit")
245
+ if submitted:
246
+ if textmode == "Option 2: My own text":
247
+ text = input_text
248
+ with st.spinner("Analysis in progress..."):
249
+ doc = nlp(preprocess(text))
250
+ # st.markdown("> " + input_text)
251
+ else:
252
+ with st.spinner("Analysis in progress..."):
253
+ doc = nlp(preprocess(text))
254
+ # st.markdown("> " + text)
255
+
256
+ ## Dependency parsing
257
+
258
+ # if textmode == 'My own text':
259
+ # text = input_text
260
+ # doc = nlp(preprocess(text))
261
+ # #st.markdown("> " + input_text)
262
+ # else:
263
+ # doc = nlp(preprocess(text))
264
+ # #st.markdown("> " + text)
265
+
266
+ # st.header("Text", "text")
267
+ # st.write(text)
268
+ # delete_span(doc.spans['sc'])
269
+
270
# Post-process the predicted spans, then render them with per-category colors.
cleanup_justify(doc, doc.spans["sc"])
delete_overlapping_span(doc.spans["sc"])

visualize_spans(
    doc,
    spans_key="sc",
    displacy_options={
        "template": {
            "span": TPL_SPAN,
            "slice": TPL_SPAN_SLICE,
            "start": TPL_SPAN_START,
        },
        "colors": {
            "ENTERTAIN": "#82b74b",
            "DENY": "#c94c4c",
            "COUNTER": "#eea29a",
            "PRONOUNCE": "#92a8d1",
            "ENDORSE": "#034f84",
            # "MONOGLOSS": "#3e4444",
            "ATTRIBUTE": "#f7786b",
            "ATTRIBUTION": "#f7786b",
            "PROCLAIM": "#92a8d1",
            # BUG FIX: "CITATION" appeared twice in this dict literal; the
            # first value ("#b2b2b2") was silently overridden, so only the
            # effective color is kept.
            "CITATION": "#F8C471",
            "SOURCES": "#F7DC6F",
            "JUSTIFYING": "#2ECC71",
            "ENDOPHORIC": "#FAD7A0",
        },
    },
    simple=False,
    show_diversity=True,
    show_confidence=False,
)
303
+
304
+ st.subheader("Bibliography")
305
+ st.markdown("""
306
+ * Chang, P., & Schleppegrell, M. (2011). Taking an effective authorial stance in academic writing: Making the linguistic resources explicit for L2 writers in the social sciences. _Journal of English for Academic Purposes, 10_ (3), 140–151. https://doi.org/10.1016/j.jeap.2011.05.005
307
+ * Martin, J. R., & White, P. R. R. (2005). _The language of evaluation: Appraisal in English._ Palgrave Macmillan.
308
+ * Ryshina-Pankova, M. (2014). Exploring academic argumentation in course-related blogs through ENGAGEMENT. In G. Thompson & L. Alba-Juez (Eds.), _Pragmatics & Beyond New Series (Vol. 242, pp. 281–302)_. John Benjamins Publishing Company. https://doi.org/10.1075/pbns.242.14rys
309
+ * Wu, S. M. (2007). The use of engagement resources in high- and low-rated undergraduate geography essays. _Journal of English for Academic Purposes, 6_ (3), 254–271. https://doi.org/10.1016/j.jeap.2007.09.006
310
+
311
+ """)
312
+
313
+ st.subheader("Please cite the following papers:")
314
+ st.markdown("""* Eguchi, M., & Kyle, K. (2023). Span Identification of Epistemic Stance-Taking in Academic Written English. Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), 429–442. https://aclanthology.org/2023.bea-1.35
315
+ * Eguchi, M., & Kyle, K. (2024). Building custom NLP tools to annotate discourse-functional features for second language writing research: A tutorial. *Research Methods in Applied Linguistics, 3*(3), 100153. https://doi.org/10.1016/j.rmal.2024.100153
316
+ """)
main.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import spacy_streamlit
3
+ from spacy_streamlit import visualize_parser
4
+ from collections import Counter
5
+
6
+ import spacy
7
+ import streamlit as st
8
+
9
+ # try:
10
+ # from .scripts.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
11
+ # except ImportError:
12
+ # from pipeline.custom_functions import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3
13
+ from spacy.tokens import Doc
14
+ from spacy.cli._util import import_code
15
+
16
+ from utils.visualize import visualize_spans
17
+ from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
18
+ from resources.text_list import TEXT_LIST
19
+ from resources.text_list_BAWE import TEXT_LIST_BAWE
20
+ from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
21
+ from resources.colors import COLORS_1
22
+
23
+ import_code("pipeline/custom_functions.py")
24
# --- Page configuration and model selection UI -------------------------------
# Fix: the page title previously misspelled "comparer" as "comparaer".
st.set_page_config(page_title='Engagement model comparer', layout="wide")

# spacy.prefer_gpu()  # enable when a GPU is available

# Models offered in the two dropdowns. The same package is listed twice so the
# side-by-side comparison view can be exercised even with a single model.
# (A long list of earlier experimental checkpoints used to live here as
# commented-out entries; it was removed — see repository history if needed.)
MODEL_LIST = ['en_engagement_LSTM', 'en_engagement_LSTM']

multicol = st.checkbox("Compare two models", value=False, key=None, help=None)

model1 = st.selectbox('Select model option 1', MODEL_LIST, index=0)
model2 = st.selectbox('Select model option 2', MODEL_LIST, index=1)

# Entries containing a path separator refer to unpackaged checkpoints stored
# under the local "packages/" directory rather than installed pip packages.
if '/' in model1:
    model1 = "packages/" + model1

if '/' in model2:
    model2 = "packages/" + model2
94
+
95
+
96
@st.cache_resource
def load_model(spacy_model):
    """Load a spaCy pipeline and append a sentencizer component.

    Fix: ``@st.cache(allow_output_mutation=True)`` is deprecated and removed
    in recent Streamlit releases; ``st.cache_resource`` is the designated
    replacement for caching unserializable global resources such as spaCy
    pipelines, and gives the same "shared, mutable cached object" semantics.

    Args:
        spacy_model: Package name or filesystem path of the model to load.

    Returns:
        The loaded ``spacy.Language`` pipeline with a ``sentencizer`` added.
    """
    nlp = spacy.load(spacy_model)  # vocab copying was never needed here
    nlp.add_pipe('sentencizer')
    return nlp
102
+
103
# --- Pipeline instantiation and text input -----------------------------------
nlp = load_model(model1)
if multicol:
    nlp2 = load_model(model2)

# A canned example sentence plus a free-text box; the free text takes
# precedence whenever it contains more than a single whitespace-split token.
text = st.selectbox('select sent to debug', TEXT_LIST_BAWE)
input_text = st.text_area("", height=200)

st.header("Text", "text")

source_text = input_text if len(input_text.split(" ")) > 1 else text
doc = nlp(preprocess(source_text))
if multicol:
    doc2 = nlp2(preprocess(source_text))
126
+
127
# --- Post-processing toggles ---------------------------------------------------
# Both clean-up passes default to on.
clearjustify = st.checkbox(
    "Clear problematic JUSTIFYING spans", value=True, key=None, help=None)

delete_overlaps = st.checkbox(
    "Delete overlaps", value=True, key=None, help=None)

# NOTE(review): an experimental, commented-out sketch that merged both models'
# span groups into one Doc used to sit here; it was removed for clarity — see
# repository history if it needs reviving.
160
+
161
# --- Optional span clean-up ----------------------------------------------------
if clearjustify:
    cleanup_justify(doc, doc.spans['sc'])

if delete_overlaps:
    delete_overlapping_span(doc.spans['sc'])
    if multicol:
        delete_overlapping_span(doc2.spans['sc'])


def _render_spans(target_doc, title):
    """Render one document's engagement spans with the shared templates/colors."""
    visualize_spans(target_doc,
                    spans_key="sc",
                    title=title,
                    displacy_options={
                        'template': {
                            "span": TPL_SPAN,
                            'slice': TPL_SPAN_SLICE,
                            'start': TPL_SPAN_START,
                        },
                        "colors": COLORS_1,
                    },
                    simple=False)


# Single-column view for one model; two columns when comparing.
if not multicol:
    _render_spans(doc, 'Engagement Span Anotations 1')
else:
    col1, col2 = st.columns(2)
    with col1:
        _render_spans(doc, 'Engagement Span Anotations 1')
    with col2:
        _render_spans(doc2, 'Engagement Span Anotations 2')

# Dependency parse of the (first) document, rendered below the span view.
visualize_parser(doc, displacy_options={"fine_grained": True, "distance": 120})
pipeline/__pycache__/custom_functions.cpython-39.pyc ADDED
Binary file (3.61 kB). View file
 
pipeline/__pycache__/post_processors.cpython-310.pyc ADDED
Binary file (20.4 kB). View file
 
pipeline/__pycache__/post_processors.cpython-39.pyc ADDED
Binary file (21.4 kB). View file
 
pipeline/custom_functions.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from pathlib import Path
3
+ from typing import Iterable, Callable
4
+ import spacy
5
+ from spacy.training import Example
6
+ from spacy.tokens import DocBin, Doc
7
+
8
+ # make the factory work
9
+ # from scripts.rel_pipe import make_relation_extractor
10
+
11
+ # make the config work
12
+ # from scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors
13
+ # from scripts.custom_comps.SpanCat_extention import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3, build_mean_max_reducer4
14
+
15
+ from typing import List, Tuple, cast
16
+ from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
17
+ from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init, PyTorchLSTM
18
+ from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
19
+ from thinc.types import Ragged, Floats2d
20
+
21
+ from spacy.util import registry
22
+ from spacy.tokens import Doc
23
+ from spacy.ml.extract_spans import extract_spans
24
+
25
+ # @registry.layers("spacy.LinearLogistic.v1")
26
+ # def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
27
+ # """An output layer for multi-label classification. It uses a linear layer
28
+ # followed by a logistic activation.
29
+ # """
30
+ # return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())
31
+
32
+
33
@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer1(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Pool each span four ways (last, first, mean, max), concatenate the
    pooled vectors, and project the result through one Maxout layer.

    hidden_size: output width of the Maxout projection.
    dropout: dropout rate applied inside the Maxout layer.
    """
    pooling = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    projection = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    return chain(pooling, projection)
48
+
49
+
50
@registry.layers("mean_max_reducer.v2")
def build_mean_max_reducer2(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Pool each span four ways (last, first, mean, max) and pass the
    concatenated vector through two stacked Maxout layers of equal width.

    hidden_size: width of both Maxout layers.
    dropout: dropout rate applied inside each Maxout layer.
    """
    pooling = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    hidden = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    output = Maxout(nO=hidden_size, normalize=True, dropout=dropout)
    return chain(pooling, hidden, output)
64
+
65
+
66
+ # @registry.layers("mean_max_reducer.v2")
67
+ # def build_mean_max_reducer2(hidden_size: int,
68
+ # depth: int) -> Model[Ragged, Floats2d]:
69
+ # """Reduce sequences by concatenating their mean and max pooled vectors,
70
+ # and then combine the concatenated vectors with a hidden layer.
71
+ # """
72
+ # return chain(
73
+ # concatenate(
74
+ # cast(Model[Ragged, Floats2d], reduce_last()),
75
+ # cast(Model[Ragged, Floats2d], reduce_first()),
76
+ # reduce_mean(),
77
+ # reduce_max(),
78
+ # ), Maxout(nO=hidden_size, normalize=True, dropout=0.0),
79
+ # PyTorchLSTM(nO=64, nI=hidden_size, bi=True, depth=depth, dropout=0.2))
80
+
81
+
82
@registry.layers("mean_max_reducer.v3")
def build_mean_max_reducer3(hidden_size: int,
                            maxout_pieces: int = 3,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Four-way span pooling (last, first, mean, max) followed by a
    three-layer Maxout stack; the second and third layers are half the
    width of the first.

    hidden_size: width of the first Maxout layer.
    maxout_pieces: number of Maxout pieces (``nP``) in every layer.
    dropout: dropout rate applied inside each Maxout layer.
    """
    # The original code computed the same halved width twice.
    half_width = int(hidden_size / 2)
    pooling = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooling,
        Maxout(nO=hidden_size, nP=maxout_pieces, normalize=True,
               dropout=dropout),
        Maxout(nO=half_width, nP=maxout_pieces, normalize=True,
               dropout=dropout),
        Maxout(nO=half_width, nP=maxout_pieces, normalize=True,
               dropout=dropout),
    )
110
+
111
+
112
@registry.layers("mean_max_reducer.v3.3")
def build_mean_max_reducer4(hidden_size: int,
                            depth: int) -> Model[Ragged, Floats2d]:
    """Four-way span pooling followed by three Maxout layers (``nP=3``, no
    dropout); the second and third layers use half the width of the first.

    NOTE(review): ``depth`` is accepted but never used in the body —
    presumably kept for config compatibility; confirm before removing.
    """
    half_width = int(hidden_size / 2)
    pooling = concatenate(
        cast(Model[Ragged, Floats2d], reduce_last()),
        cast(Model[Ragged, Floats2d], reduce_first()),
        reduce_mean(),
        reduce_max(),
    )
    return chain(
        pooling,
        Maxout(nO=hidden_size, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=half_width, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=half_width, nP=3, normalize=True, dropout=0.0),
    )
129
+
130
+
131
@registry.architectures("CustomSpanCategorizer.v2")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Assemble a span categorizer: embed the tokens, pull out the vectors
    for each candidate span, reduce every span to a single vector, then map
    the vectors to label probabilities.

    tok2vec (Model[List[Doc], List[Floats2d]]): The token-to-vector model.
    reducer (Model[Ragged, Floats2d]): The span-reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    """
    # Apply tok2vec only to the docs (item 0 of the (docs, spans) input) and
    # flatten its per-doc output into a single Ragged array.
    embed = cast(
        Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
        with_getitem(
            0,
            chain(tok2vec,
                  cast(Model[List[Floats2d], Ragged], list2ragged()))),
    )
    model = chain(embed, extract_spans(), reducer, scorer)
    for ref_name, layer in (("tok2vec", tok2vec),
                            ("reducer", reducer),
                            ("scorer", scorer)):
        model.set_ref(ref_name, layer)
    return model
160
+
161
+
162
+ # @registry.architectures("spacy.SpanCategorizer.v1")
163
+ # def build_spancat_model(
164
+ # tok2vec: Model[List[Doc], List[Floats2d]],
165
+ # reducer: Model[Ragged, Floats2d],
166
+ # scorer: Model[Floats2d, Floats2d],
167
+ # ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
168
+ # """Build a span categorizer model, given a token-to-vector model, a
169
+ # reducer model to map the sequence of vectors for each span down to a single
170
+ # vector, and a scorer model to map the vectors to probabilities.
171
+ # tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
172
+ # reducer (Model[Ragged, Floats2d]): The reducer model.
173
+ # scorer (Model[Floats2d, Floats2d]): The scorer model.
174
+ # """
175
+ # model = chain(
176
+ # cast(
177
+ # Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
178
+ # with_getitem(
179
+ # 0,
180
+ # chain(tok2vec,
181
+ # cast(Model[List[Floats2d], Ragged], list2ragged()))),
182
+ # ),
183
+ # extract_spans(),
184
+ # reducer,
185
+ # scorer,
186
+ # )
187
+ # model.set_ref("tok2vec", tok2vec)
188
+ # model.set_ref("reducer", reducer)
189
+ # model.set_ref("scorer", scorer)
190
+ # return model
pipeline/post_processors.py ADDED
@@ -0,0 +1,1189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
2
+ import pandas as pd
3
+ import spacy
4
+ from spacy.language import Language
5
+ from skbio import diversity as dv
6
+
7
# Span attributes surfaced in the results table (see ``simple_table``).
SPAN_ATTRS = ["text", "label_", "start", "end"]

# Engagement categories produced by the classifier, in alphabetical order.
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
20
+
21
+
22
def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Tabulate the spans stored under *spans_key* with their confidence scores.

    Each row holds the stringified span attributes from *attrs* followed by
    the raw score taken from the span group's ``attrs["scores"]``.

    Returns:
        A ``(rows, column_names)`` pair suitable for building a dataframe.
    """
    span_group = doc.spans[spans_key]
    header = attrs + ["Conf. score"]
    rows = [
        [str(getattr(span, attr)) for attr in attrs] + [score]
        for span, score in zip(span_group, span_group.attrs["scores"])
    ]
    return rows, header
35
+
36
+
37
+ # def span_info_aggregator()
38
+
39
+
40
+ def construction_classifier(doc, span):
41
+ category = None
42
+ spanroot = span.root
43
+
44
+ ## Grabbing lexico-grammatical information
45
+ span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
46
+ span_dep = [t.dep_ for t in span]
47
+ span_token = [t.norm_ for t in span]
48
+ span_tag = [t.tag_ for t in span]
49
+
50
+ c = [c for c in spanroot.children]
51
+ c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
52
+
53
+ c_norm = [c.norm_ for c in spanroot.children]
54
+ c_dep = [c.dep_ for c in spanroot.children]
55
+ c_pos = [c.pos_ for c in spanroot.children]
56
+ c_tag = [c.tag_ for c in spanroot.children]
57
+
58
+ right_dep = [c.dep_ for c in spanroot.rights]
59
+
60
+ # conditionals
61
+ subjless = all(
62
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
63
+ for c in spanroot.children
64
+ )
65
+ argmentless = all(
66
+ c.dep_
67
+ not in [
68
+ "nsubj",
69
+ "nsubjpass",
70
+ "csubj",
71
+ "csubjpass",
72
+ "dobj",
73
+ "ccomp",
74
+ "xcomp",
75
+ "dative",
76
+ "attr",
77
+ "oprd",
78
+ "acomp",
79
+ ]
80
+ for c in spanroot.children
81
+ )
82
+ argless_span = all(
83
+ c.dep_
84
+ not in [
85
+ "nsubj",
86
+ "nsubjpass",
87
+ "csubj",
88
+ "csubjpass",
89
+ "dobj",
90
+ "ccomp",
91
+ "xcomp",
92
+ "dative",
93
+ "attr",
94
+ "oprd",
95
+ "acomp",
96
+ ]
97
+ for c in span
98
+ )
99
+
100
+ ## nesting classifiers
101
+ if spanroot.dep_ == "conj":
102
+ while spanroot.dep_ == "conj":
103
+ spanroot = spanroot.head
104
+ # if spanroot.dep_ == "poss":
105
+ # while spanroot.dep_ == 'poss':
106
+ # spanroot = spanroot.head
107
+
108
+ ## Conjunctions
109
+ # Preconjunctions
110
+ if spanroot.dep_ in ["preconj", "cc"]:
111
+ category = "Conjunction"
112
+
113
+ ## NOUN PHRASES
114
+ # adverbial phrases
115
+ if spanroot.dep_ in ["amod"]:
116
+ category = "Adjectival modifier"
117
+ # adverbial phrases
118
+ if spanroot.dep_ in ["compound"]:
119
+ category = "Compound noun"
120
+
121
+ ## Nominal category
122
+ if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
123
+ if "acl" in c_dep:
124
+ category = "Noun + Complement (Object)"
125
+ else:
126
+ category = "Object"
127
+
128
+ if spanroot.dep_ in ["nsubj", "nsubjpass"]:
129
+ if "acl" in c_dep:
130
+ category = "Noun + Complement (Subject)"
131
+ else:
132
+ category = "Subject"
133
+
134
+ ## ADJUNCTS
135
+ # prep phrases
136
+ if spanroot.dep_ in ["prep", "agent"]:
137
+ category = "Prepositional phrase"
138
+ # adverbial phrases
139
+ if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
140
+ category = "Adverbial phrase"
141
+
142
+ ## Predication patterns
143
+ if spanroot.dep_ in ["acomp", "oprd"]:
144
+ if "xcomp" in c_dep:
145
+ category = "Subject predicate to-cl"
146
+ else:
147
+ category = "Adjectival complement"
148
+
149
+ if spanroot.dep_ in ["attr"]:
150
+ subjless = all(
151
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
152
+ for c in spanroot.children
153
+ )
154
+
155
+ c_head = [c.dep_ for c in spanroot.head.children]
156
+ if "expl" in c_head and "no_det" in span_t_dep_:
157
+ category = "There is/are no NOUN"
158
+ elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
159
+ category = "There is/are + Noun complement"
160
+ elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
161
+ category = "There is/are + Noun complement"
162
+
163
+ elif spanroot.pos_ in ["NOUN", "PRON"]:
164
+ if "acl" in c_dep:
165
+ category = "Noun + Complement (attr)"
166
+ else:
167
+ category = "Nominal complement"
168
+
169
+ elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
170
+ category = "Main verb 4"
171
+
172
+ elif spanroot.tag_ in ["NNP"]:
173
+ category = "Nominal complement"
174
+
175
+ ####################################
176
+ ### clausal ####
177
+ ####################################
178
+ if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
179
+ _check_to = [
180
+ c.dep_
181
+ for c in spanroot.subtree
182
+ if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
183
+ and c.head.dep_ == "xcomp"
184
+ ]
185
+ _check_ing = [
186
+ c.dep_
187
+ for c in spanroot.subtree
188
+ if "Prog" in str(c.morph) and c.dep_ == "xcomp"
189
+ ]
190
+ root_before_ccomp = [
191
+ c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
192
+ ]
193
+
194
+ _check_for_to = [
195
+ "_".join([c.norm_, c.dep_])
196
+ for c in spanroot.subtree
197
+ if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
198
+ ]
199
+ entire_cl = (
200
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
201
+ )
202
+
203
+ ## Start with broad category, which is then re-evaluated for specific constructions.
204
+ if spanroot.dep_ in ["advcl", "mark", "acl", "pcomp"]:
205
+ ## Adverbial clauses
206
+ ### Finite-adverbial clauses
207
+ ### Non-finite adverbial clauses
208
+ subjless = all(
209
+ c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
210
+ for c in spanroot.children
211
+ )
212
+ entire_cl = (
213
+ spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
214
+ )
215
+
216
+ if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
217
+ category = "Finite adverbial clause"
218
+ elif "mark" in span_dep and "aux" in span_dep:
219
+ category = "Finite adverbial clause"
220
+
221
+ elif (
222
+ "mark" in span_dep
223
+ and spanroot.pos_ in ["VERB", "AUX"]
224
+ and "expl" in c_dep
225
+ ):
226
+ category = "Finite adverbial clause"
227
+
228
+ elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
229
+ if spanroot.pos_ in ["VERB", "AUX"]:
230
+ category = "Finite adverbial clause"
231
+
232
+ elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
233
+ category = "Non-finite adv clause 1"
234
+
235
+ elif entire_cl:
236
+ category = "Finite adverbial clause"
237
+
238
+ elif (
239
+ str(spanroot.morph)
240
+ in [
241
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
242
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
243
+ ]
244
+ and "aux" not in c_dep
245
+ ):
246
+ # he doing his job
247
+ if argmentless:
248
+ # e.g., frankly speaking, strictly speaking
249
+ category = "Adverbial Phrase"
250
+ else:
251
+ category = "Non-finite adv clause 2"
252
+
253
+ elif (
254
+ spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
255
+ ):
256
+ category = "Non-finite adv clause 3"
257
+
258
+ elif "aux" in c_dep and "TO" in c_tag:
259
+ category = "Adverbial Phrase"
260
+
261
+ elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
262
+ category = "Dependent Verb phrase"
263
+
264
+ elif not argmentless:
265
+ category = "Adverbial clause"
266
+
267
+ elif spanroot.dep_ == "advcl":
268
+ category = "Adverbial phrase"
269
+
270
+ if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
271
+ head = spanroot.head
272
+ if ";" in [t.norm_ for t in head.children]:
273
+ category = "Main verb 3"
274
+ elif "nsubj" not in span_dep:
275
+ category = "Dependent verb 1"
276
+ elif "mark" in span_dep:
277
+ category = "Complement clause"
278
+ elif (
279
+ str(spanroot.morph)
280
+ in [
281
+ "Aspect=Prog|Tense=Pres|VerbForm=Part",
282
+ "Aspect=Perf|Tense=Past|VerbForm=Part",
283
+ ]
284
+ and "aux" not in c_dep
285
+ ):
286
+ category = "Non-finite complement clause"
287
+ elif spanroot.dep_ in ["relcl"]:
288
+ category = "Relative clause"
289
+ elif spanroot.dep_ in ["ccomp"]:
290
+ category = "Complement clause"
291
+ elif spanroot.dep_ in ["acl"]:
292
+ category = "Noun Complement clause"
293
+ else:
294
+ # print(_check_for_to)
295
+ category = "this one"
296
+
297
+ ## Specific constructions
298
+ # Extraposed that-clause or to-infinitives
299
+ if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
300
+ "VERB",
301
+ "AUX",
302
+ ]:
303
+ print(c_dep)
304
+ if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
305
+ # eg it seems odd (oprd) that X.
306
+ # eg it is certain (acomp) that X.
307
+ category = (
308
+ "Extraposed that-cl (adj-complement)" # e.g., it is certain that X.
309
+ )
310
+
311
+ elif "xcomp" in c_dep or ("advcl" in c_dep):
312
+ if "for_mark" in _check_for_to:
313
+ category = (
314
+ "Extraposed to-cl (explicit subj)" # eg It is possible to .
315
+ )
316
+ elif _check_to:
317
+ category = "Extraposed to-cl 1" # eg It is possible to .
318
+ elif _check_ing:
319
+ category = "Extraposed -ing 1" # eg It is possible to .
320
+ elif (
321
+ ("prep" in right_dep or "npadvmod" in right_dep)
322
+ and "ccomp" in right_dep
323
+ and spanroot.lemma_ == "be"
324
+ ):
325
+ category = "Cleft construction"
326
+
327
+ elif "attr" in c_dep:
328
+ category = "Extraposed that-cl (copula)" # eg It is a wonder that X.
329
+
330
+ else:
331
+ category = "Extraposed that-cl (VERB)"
332
+
333
+ # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
334
+ # category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
335
+ elif (
336
+ "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
337
+ ) and "acomp" in c_dep:
338
+ if "xcomp" in c_dep:
339
+ if _check_to:
340
+ category = "Extraposed to-cl 2" # eg it is difficult to decide.
341
+ elif _check_ing:
342
+ category = "Extraposed -ing 2" # eg it is difficult to decide.
343
+
344
+ else:
345
+ category = "Extraposed that-cl (adj-complement) 2"
346
+
347
+ elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
348
+ category = (
349
+ "Extraposed that-cl (adj-complement) 3" # eg it seems odd that X.
350
+ )
351
+
352
+ # something without dummy subject "it"
353
+ elif (
354
+ (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
355
+ and spanroot.pos_ in ["AUX", "VERB"]
356
+ and "it" not in c_norm
357
+ ):
358
+ # store xcomp, if the head of the xcomp is acomp
359
+ _check_xcomp = [
360
+ c.dep_
361
+ for c in spanroot.subtree
362
+ if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
363
+ ]
364
+ _check_ccomp = [
365
+ c.dep_
366
+ for c in spanroot.subtree
367
+ if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
368
+ ]
369
+ # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
370
+ # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
371
+
372
+ if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
373
+ if any(root_before_ccomp):
374
+ category = "Post-predicate that-cl"
375
+ else:
376
+ category = "Comment clause"
377
+
378
+ elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
379
+ category = "Post-predicate that-cl 2"
380
+
381
+ elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
382
+ category = "Post-predicate to-cl"
383
+
384
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
385
+ category = "Subject predicate to-cl"
386
+
387
+ elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
388
+ category = "Subject predicate to-cl (passive)"
389
+
390
+ elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
391
+ category = "Subject predicate -ing"
392
+ elif "ccomp" in c_dep:
393
+ category = "Subject predicate that-cl"
394
+ elif "acomp" in c_dep:
395
+ category = "Adjectival predicate"
396
+
397
+ elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
398
+ category = "Finite-adverbial clause"
399
+ else:
400
+ category = "Main verb 1"
401
+
402
+ ## without dummy subject it, and lexical verbs
403
+ elif (
404
+ ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
405
+ and spanroot.pos_ in ["AUX", "VERB"]
406
+ and "it" not in c_norm
407
+ and spanroot.lemma_ not in ["be"]
408
+ ):
409
+ _check_wh = [
410
+ c.dep_
411
+ for c in spanroot.subtree
412
+ if (
413
+ c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
414
+ and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
415
+ )
416
+ and c.head.dep_ == "ccomp"
417
+ ]
418
+ _check_if = [
419
+ c.dep_
420
+ for c in spanroot.subtree
421
+ if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
422
+ and c.head.dep_ == "ccomp"
423
+ ]
424
+
425
+ # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
426
+ # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]
427
+
428
+ if "ccomp" in c_dep and (_check_wh or _check_if):
429
+ category = "Post-predicate wh-cl"
430
+
431
+ elif "ccomp" in c_dep:
432
+ if any(root_before_ccomp):
433
+ category = "Post-predicate that-cl"
434
+ else:
435
+ category = "Comment clause"
436
+
437
+ elif "xcomp" in c_dep:
438
+ if _check_to:
439
+ category = "Post-predicate to-cl"
440
+ elif _check_ing:
441
+ category = "Post-predicate -ing"
442
+
443
+ # Existential
444
+ elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
445
+ category = "There is/are NOUN"
446
+
447
+ elif (
448
+ "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
449
+ ):
450
+ category = "Cleft construction"
451
+
452
+ if spanroot.dep_ in ["parataxis"]:
453
+ if "_".join(span_dep) in [
454
+ "nsubj_parataxis",
455
+ "aux_parataxis",
456
+ "nsubj_aux_parataxis",
457
+ ]:
458
+ category = "Comment clause"
459
+ else:
460
+ category = "parataxis (for now)"
461
+
462
+ ## External comp
463
+ if spanroot.dep_ in ["xcomp"]:
464
+ if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
465
+ category = "Adjective complement to-cl"
466
+ if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
467
+ category = "Verb complement to-cl"
468
+
469
+ if spanroot.dep_ in ["pcomp"]:
470
+ if (
471
+ str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
472
+ and "ccomp" in c_dep
473
+ ):
474
+ category = "Participle + that-cl"
475
+ elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
476
+ category = "Participle"
477
+
478
+ ## Simple classifier
479
+ # if spanroot.dep_ in ['pcomp']:
480
+ # if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
481
+ # category = "Gerund"
482
+
483
+ if spanroot.dep_ in ["neg"]:
484
+ category = "Negative particle"
485
+ if spanroot.dep_ in ["aux", "auxpass"]:
486
+ category = "Auxiliary"
487
+
488
+ # Modal verbs
489
+ if spanroot.tag_ == "MD":
490
+ category = "Modal auxiliary"
491
+
492
+ if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
493
+ if (
494
+ spanroot.head.dep_ in ["ROOT", "ccomp"]
495
+ and spanroot.head.pos_ in ["AUX", "VERB"]
496
+ and spanroot.pos_ in ["AUX", "VERB"]
497
+ ):
498
+ if spanroot.morph == spanroot.head.morph:
499
+ category = "Main verb 4"
500
+ else:
501
+ category = "Dependent verb 2"
502
+ elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
503
+ category = "Gerund"
504
+ elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
505
+ if spanroot.morph == spanroot.head.morph:
506
+ category = "Main verb 4"
507
+ else:
508
+ category = "Dependent verb 2"
509
+ elif "VerbForm=Fin" in str(spanroot.morph):
510
+ category = "Dependent verb 2"
511
+
512
+ # Appositive phrases
513
+ if spanroot.dep_ in ["appos"]:
514
+ if "nummod" in c_dep:
515
+ category = "Apposition"
516
+ elif spanroot.pos_ in ["PROPN"]:
517
+ category = "Appositive Proper Nouns"
518
+ elif spanroot.pos_ in ["NOUN"]:
519
+ category = "Appositive Noun Phrase"
520
+ elif spanroot.pos_ in ["VERB", "AUX"]:
521
+ _check = any(
522
+ c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
523
+ for c in spanroot.children
524
+ )
525
+ if _check:
526
+ category = "Appositive Finite-clause"
527
+
528
+ if spanroot.dep_ in ["appos", "dep", "attr"]:
529
+ if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
530
+ category = "Main verb 5"
531
+
532
+ if spanroot.dep_ in ["dep", "mark"]:
533
+ if spanroot.tag_ in ["RB", "IN", "CC"]:
534
+ category = "Conjunction"
535
+
536
+ # sometimes the extra-clausal links are not accurate
537
+ if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
538
+ if spanroot.head.dep_ == "ROOT":
539
+ category = "Main verb"
540
+ else:
541
+ category = "dependent verb 5"
542
+
543
+ if span.label_ == "CITATION":
544
+ if "NNP" in span_tag or "NNPS" in span_tag:
545
+ if span_dep[0] == "punct" and span_dep[-1] == "punct":
546
+ category = "Parenthetical Citation"
547
+ elif span_tag[0] in ["NNP", "NNPS"]:
548
+ category = "Narrative Citation"
549
+ else:
550
+ category = "Other Citation"
551
+
552
+ if category == None:
553
+ category = spanroot.dep_
554
+
555
+ return category
556
+
557
+
558
def construction_classifier2(doc, span):
    """Classify ``span`` into a grammatical-construction category label.

    A rule cascade over the span root's dependency relation, POS/TAG,
    morphology, and its children/subtree.  Later blocks may overwrite
    ``category`` set by earlier ones, so rule ORDER is significant — do
    not reorder the ``if`` blocks.

    Args:
        doc: Unused in the body; kept for interface compatibility with callers.
        span: The span to classify (presumably a spaCy ``Span``; its ``.root``,
            ``.label_`` and token attributes are used — confirm against callers).

    Returns:
        str: A construction label; falls back to the root's dependency label
        when no rule matched.

    NOTE(review): the "Posessive ..." labels are misspelled in the original;
    they are runtime output values, so they are preserved here unchanged.
    """
    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    # Token-level features over the whole span.
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Features of the root's immediate children (computed BEFORE the
    # coordination climb below, as in the original).
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
    c_norm = [c.norm_ for c in spanroot.children]
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]
    c_tag = [c.tag_ for c in spanroot.children]

    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(
        c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
        for c in spanroot.children
    )
    argmentless = all(
        c.dep_
        not in [
            "nsubj",
            "nsubjpass",
            "csubj",
            "csubjpass",
            "dobj",
            "ccomp",
            "xcomp",
            "dative",
            "attr",
            "oprd",
            "acomp",
        ]
        for c in spanroot.children
    )
    # NOTE: unused scratch variables from the original (``span_token``, ``c``,
    # two identical ``argless_span`` computations, ``entire_cl``) were removed;
    # they were never read afterwards.

    ## nesting classifiers
    # Climb out of coordination so classification applies to the first conjunct.
    if spanroot.dep_ == "conj":
        while spanroot.dep_ == "conj":
            spanroot = spanroot.head

    if spanroot.dep_ == "poss":
        head = spanroot.head
        if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
            category = "Posessive Noun (Object)"
        elif head.dep_ in ["nsubj", "nsubjpass"]:
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    # Preconjunctions
    if spanroot.dep_ in ["preconj", "cc"]:
        category = "Conjunction"

    ## NOUN PHRASES
    # adjectival modifier
    if spanroot.dep_ in ["amod"]:
        category = "Adjectival modifier"
    # compound noun
    if spanroot.dep_ in ["compound"]:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"

    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    # prep phrases
    if spanroot.dep_ in ["prep", "agent"]:
        category = "Prepositional phrase"

    # adverbial phrases
    if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ["acomp", "oprd"]:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ["attr"]:
        # Recomputed on purpose: ``spanroot`` may have been reassigned by the
        # coordination climb above.
        subjless = all(
            c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
            for c in spanroot.children
        )

        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
            category = "There is/are + Noun complement"
        elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
            category = "There is/are + Noun complement"

        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"

        elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 4"

        elif spanroot.tag_ in ["NNP"]:
            category = "Nominal complement"

    ## External comp
    if spanroot.dep_ in ["xcomp"]:
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ["pcomp"]:
        if (
            str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
            and "ccomp" in c_dep
        ):
            category = "Participle + that-cl"
        elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
            category = "Participle"

    if spanroot.dep_ in ["neg"]:
        category = "Negative particle"
    if spanroot.dep_ in ["aux", "auxpass"]:
        category = "Auxiliary"

    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    ####################################
    ### clausal ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
        # to-infinitive marker under an xcomp (e.g. "to decide")
        _check_to = [
            c.dep_
            for c in spanroot.subtree
            if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
            and c.head.dep_ == "xcomp"
        ]
        # progressive -ing complement under an xcomp
        _check_ing = [
            c.dep_
            for c in spanroot.subtree
            if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # True entries when the ccomp child follows the root (that-cl after verb)
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]

        # "for X to ..." pattern markers under an advcl
        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in spanroot.subtree
            if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
        ]

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]:  #'mark',
            ## Adverbial clauses
            subjless = all(
                c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
                for c in spanroot.children
            )

            ### Finite-adverbial clauses
            if "mark" in span_dep and (
                spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep
            ):
                category = "Finite adverbial clause"

            elif "mark" in span_dep and "aux" in span_dep:
                category = "Finite adverbial clause"

            elif (
                "mark" in span_dep
                and spanroot.pos_ in ["VERB", "AUX"]
                and "expl" in c_dep
            ):
                category = "Finite adverbial clause"

            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in ["VERB", "AUX"]:
                    category = "Finite adverbial clause"

                elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
                    category = "Non-finite adv clause 1"

                elif not argmentless:
                    category = "Finite adverbial clause"

            ## non-finite
            elif (
                str(spanroot.morph)
                in [
                    "Aspect=Prog|Tense=Pres|VerbForm=Part",
                    "Aspect=Perf|Tense=Past|VerbForm=Part",
                ]
                and "aux" not in c_dep
            ):
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"

            elif (
                spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
            ):
                category = "Non-finite adv clause 3"

            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"

            elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
                category = "Dependent Verb phrase"

            elif not argmentless:
                category = "Adverbial clause"

            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"

            else:
                # NOTE(review): this label carries a trailing space in the
                # original output; preserved to keep downstream counts stable.
                category = "Finite adverbial clause "

        if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"

            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"

            elif "mark" in span_dep:
                category = "Complement clause"
            elif (
                str(spanroot.morph)
                in [
                    "Aspect=Prog|Tense=Pres|VerbForm=Part",
                    "Aspect=Perf|Tense=Past|VerbForm=Part",
                ]
                and "aux" not in c_dep
            ):
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ["relcl"]:
                category = "Relative clause"
            elif spanroot.dep_ in ["ccomp"]:
                category = "Complement clause"
            elif spanroot.dep_ in ["acl"]:
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
            "VERB",
            "AUX",
        ]:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = (
                    "Extraposed that-cl (adj-complement)"  # e.g., it is certain that X.
                )

            elif "xcomp" in c_dep or ("advcl" in c_dep):
                if "for_mark" in _check_for_to:
                    category = (
                        "Extraposed to-cl (explicit subj)"  # eg It is possible for X to .
                    )
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"  # eg It is worth doing .
                elif (
                    ("prep" in right_dep or "npadvmod" in right_dep)
                    and "ccomp" in right_dep
                    and spanroot.lemma_ == "be"
                ):
                    category = "Cleft construction"

            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.

            else:
                category = "Extraposed that-cl (VERB)"

        elif (
            "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
        ) and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"  # eg it is difficult deciding.

            else:
                category = "Extraposed that-cl (adj-complement) 2"

        elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
            category = (
                "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
            )

        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
        ):
            # store xcomp, if the head of the xcomp is acomp
            _check_xcomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
            ]

            if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"

            elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
                category = "Subject predicate to-cl"

            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"

            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            elif not argmentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"

        ## without dummy subject it, and lexical verbs
        # BUGFIX: the original wrote ``("nsubj" in c_dep or "nsubjpass" in
        # c_dep) in c_dep`` — a bool tested for membership in a list of dep
        # strings, which is always False, so this whole branch was dead code.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
            and spanroot.lemma_ not in ["be"]
        ):
            # wh-word heading a ccomp (e.g. "know what happened")
            _check_wh = [
                c.dep_
                for c in spanroot.subtree
                if (
                    c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
                    and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
                )
                and c.head.dep_ == "ccomp"
            ]
            # whether/if complementizer heading a ccomp
            _check_if = [
                c.dep_
                for c in spanroot.subtree
                if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
                and c.head.dep_ == "ccomp"
            ]

            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"

            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"

        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"

        elif (
            "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
        ):
            category = "Cleft construction"

    ### The end of clausal analysis

    if spanroot.dep_ in ["parataxis"]:
        if "_".join(span_dep) in [
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ]:
            category = "Comment clause"
        else:
            category = "Parataxis"

    if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
        if (
            spanroot.head.dep_ in ["ROOT", "ccomp"]
            and spanroot.head.pos_ in ["AUX", "VERB"]
            and spanroot.pos_ in ["AUX", "VERB"]
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            category = "Gerund"
        elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
            spanroot.morph
        ):
            category = "Dependent verb 2"
        elif spanroot.dep_ in ["csubj", "csubjpass"]:
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if spanroot.dep_ in ["appos"]:
        if "nummod" in c_dep:
            category = "Apposition"
        # NOTE(review): plain ``if`` (not elif) — a PROPN root overrides
        # "Apposition"; confirm this asymmetry is intentional.
        if spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in ["VERB", "AUX"]:
            _check = any(
                c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
                for c in spanroot.children
            )
            if _check:
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ["appos", "dep", "attr"]:
        if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    if spanroot.dep_ in ["intj"]:
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    if (
        spanroot.dep_
        in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
        and category is None
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    if category is None:
        category = spanroot.dep_

    return category
1114
+
1115
+
1116
def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a tabular summary of the spans stored under ``doc.spans[spans_key]``.

    For every span, one row is produced: the requested span ``attrs``
    (stringified) followed by the prediction score, sentence index,
    construction category, and a set of dependency/POS/morphology features.

    Returns:
        tuple: ``(rows, columns)`` — a list of row lists and the matching
        column names, ready for a DataFrame constructor.
    """
    extra_columns = [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    columns = attrs + extra_columns

    # Map each sentence span to its ordinal position in the document.
    sent_index = {sent: i for i, sent in enumerate(doc.sents)}

    span_group = doc.spans[spans_key]
    rows = []
    for span, score in zip(span_group, span_group.attrs["scores"]):
        row = [str(getattr(span, attr)) for attr in attrs]
        row.extend(
            [
                score,
                int(sent_index[span.sent]),
                construction_classifier2(doc, span),
                span.root.dep_,
                span.root.ent_type_,
                span.root.tag_,
                "_".join(t.dep_ for t in span),
                "_".join(t.tag_ for t in span),
                "_".join(t.pos_ for t in span),
                span.root.head.norm_,
                span.root.head.dep_,
                "_".join(c.dep_ for c in span.root.children),
                str(span.root.morph),
                span.sent.text.strip(),
            ]
        )
        rows.append(row)

    return rows, columns
1163
+
1164
+
1165
def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return the n-grams (length-``n`` sliding windows) over ``seq``.

    Args:
        seq: Input sequence; items must be strings when ``concat`` is True.
        n: Window size. Windows that would run past the end are dropped,
           so fewer than ``n`` items yields an empty list.
        concat: If True, join each window into one string with ``sep``.
        sep: Separator used when ``concat`` is True.

    Returns:
        list: Sub-lists of length ``n``, or joined strings when ``concat``.
    """
    # Idiom fix: the original iterated enumerate(seq) without using the item
    # and re-checked the bound on every step; iterate window starts directly.
    result = []
    total = len(seq)
    for start in range(total):
        if start + n > total:
            break  # no further full window fits (bound is monotone in start)
        window = seq[start : start + n]
        result.append(sep.join(window) if concat else window)
    return result
1175
+
1176
+
1177
def diversity_values(count_vec: list):
    """Compute alpha-diversity indices for a vector of category counts.

    Uses the scikit-bio ``diversity`` API (imported elsewhere as ``dv``):
    Shannon entropy (base 2), Brillouin's index, Gini-Simpson
    (1 - Simpson dominance), and Simpson evenness.

    An empty ``count_vec`` is treated as ten zero counts so the metrics
    are still defined.
    """
    counts = list(count_vec) if len(count_vec) else [0] * 10

    return {
        "shannon": dv.alpha.shannon(counts, base=2),
        "brillouin_d": dv.alpha.brillouin_d(counts),
        "simpson_d": 1 - dv.alpha.simpson(counts),
        "simpson_e": dv.alpha.simpson_e(counts),
        # gini_index / faith_pd intentionally omitted (disabled in original).
    }
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pip == 24.2
2
+ spacy-streamlit #==1.0.4
3
spacy>3.4.4,<3.7
4
+ # spacy-experimental==0.6.1
5
+ # spacy-huggingface-hub==0.0.8
6
+ # spacy-transformers==1.1.8
7
+ # srsly==2.4.5
8
+ scikit-bio
9
+ # pip==23.3.1
10
+ # setuptools
11
+ # pydantic==1.* #necessary for spacy 3.4.4?
12
+ # altair<5
13
+ # streamlit
14
+ typing_extensions<4.6.0
15
+
16
+
17
+ # https://huggingface.co/egumasa/en_engagement_RoBERTa_combined/resolve/main/en_engagement_RoBERTa_combined-any-py3-none-any.whl
18
+ # https://huggingface.co/egumasa/en_engagement_RoBERTa_context_flz/resolve/main/en_engagement_RoBERTa_context_flz-any-py3-none-any.whl
19
+ # https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad_max1_do02/resolve/main/en_engagement_spl_RoBERTa_acad_max1_do02-any-py3-none-any.whl
20
+ # https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad/resolve/main/en_engagement_spl_RoBERTa_acad-any-py3-none-any.whl
21
+ # https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad2/resolve/main/en_engagement_spl_RoBERTa_acad2-any-py3-none-any.whl
22
+ # https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_acad2/resolve/main/en_engagement_spl_RoBERTa_acad2-any-py3-none-any.whl
23
+ # https://huggingface.co/egumasa/en_engagement_LSTM/resolve/main/en_engagement_LSTM-any-py3-none-any.whl
24
https://huggingface.co/egumasa/en_engagement_LSTM/resolve/main/en_engagement_LSTM-any-py3-none-any.whl # en_engagement_LSTM: best-performing model as of 2023
25
+ # https://huggingface.co/egumasa/en_engagement_spl_RoBERTa_base_attention/resolve/main/en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl
resources/__pycache__/colors.cpython-39.pyc ADDED
Binary file (447 Bytes). View file
 
resources/__pycache__/template_list.cpython-39.pyc ADDED
Binary file (2.35 kB). View file
 
resources/__pycache__/text_list.cpython-39.pyc ADDED
Binary file (122 kB). View file
 
resources/__pycache__/text_list_BAWE.cpython-39.pyc ADDED
Binary file (111 kB). View file
 
resources/colors.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Hex color palette keyed by engagement span label, used when rendering
# highlighted spans.  NOTE(review): "ATTRIBUTE"/"ATTRIBUTION" share one color,
# as do "PRONOUNCE"/"PROCLAIM" — confirm the duplication is intentional.
COLORS_1 = {
    "ENTERTAIN": "#82b74b",
    "DENY": '#c94c4c',
    "COUNTER": "#eea29a",
    "PRONOUNCE": "#92a8d1",
    "ENDORSE": "#034f84",
    "CITATION": "#b2b2b2",
    # "MONOGLOSS": "#3e4444",
    "ATTRIBUTE": "#f7786b",
    "ATTRIBUTION": "#f7786b",
    "PROCLAIM": "#92a8d1",
    "ENDOPHORIC": "#FAD7A0",
    "SOURCES": "#F9E79F"

}
16
+
resources/template_list.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# displaCy-style HTML templates for rendering annotated spans in the app.
# Placeholders ({bg}, {text}, {label}, ...) are filled via str.format at
# render time — do not alter the brace names.

# Single highlighted entity: colored <mark> with a small label badge.
TPL_ENT = """
<mark class="entity" style="background: {bg}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">
{text}
<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{label}</span>
</mark>
"""

# Container for a full spans visualization (tall line-height leaves room
# for the underline bars and labels).
TPL_SPANS = """
<div class="spans" style="line-height: 4.5;">
{text}
{span_slices}
{span_starts}
</div>
"""

# One token that participates in at least one span.
TPL_SPAN = """
<span style="font-weight: bold; display: inline-block; line-height: 3; padding-bottom: 12px;position: relative;">
{text}
{span_slices}
{span_starts}
</span>
"""

# Continuation bar drawn under a token inside a span; {top_offset} stacks
# bars for overlapping spans.
TPL_SPAN_SLICE = """
<span style="background: {bg}; top: {top_offset}px; display: inline-block; height: 4px; left: -1px; width: calc(100% + 2px); position: absolute;">
</span>
"""

# Bar segment at the FIRST token of a span, carrying the label badge.
TPL_SPAN_START = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">

{label}{kb_link}
</span>
</span>

"""

# Right-to-left variant of TPL_SPAN_START.
TPL_SPAN_START_RTL = """
<span style="background: {bg}; top: {top_offset}px; height: 4px; border-top-left-radius: 3px; border-bottom-left-radius: 3px; left: -1px; width: calc(100% + 2px); position: absolute;">
<span style="background: {bg}; z-index: 10; color: #000; top: -0.5em; padding: 2px 3px; position: absolute; font-size: 0.6em; font-weight: bold; line-height: 1; border-radius: 3px">
{label}{kb_link}
</span>
</span>
"""

# Sample text pre-loaded in the demo's input box.
DEFAULT_TEXT = """Tickner said regardless of the result, the royal commission was a waste of money and he would proceed with a separate inquiry into the issue headed by Justice Jane Matthews. His attack came as the Aboriginal women involved in the case demanded a female minister examine the religious beliefs they claim are inherent in their fight against a bridge to the island near Goolwa in South Australia."""
48
+
resources/text_list.py ADDED
The diff for this file is too large to render. See raw diff
 
resources/text_list_BAWE.py ADDED
The diff for this file is too large to render. See raw diff
 
utils/__pycache__/util.cpython-39.pyc ADDED
Binary file (2.93 kB). View file
 
utils/__pycache__/utility.cpython-310.pyc ADDED
Binary file (2.91 kB). View file
 
utils/__pycache__/utility.cpython-39.pyc ADDED
Binary file (2.93 kB). View file
 
utils/__pycache__/visualize.cpython-310.pyc ADDED
Binary file (4.42 kB). View file
 
utils/__pycache__/visualize.cpython-39.pyc ADDED
Binary file (4.18 kB). View file
 
utils/utility.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import Counter
3
+ from spacy.tokens import SpanGroup
4
+
5
+
6
def preprocess(text):
    """Normalize whitespace in *text* while preserving paragraph breaks.

    "--- Para SEP ---" markers and blank lines are treated as paragraph
    boundaries: they are shielded behind a sentinel token, every remaining
    run of whitespace is collapsed to a single space, and the sentinel is
    finally expanded back into a "\\n\\n" paragraph break.
    """
    sentinel = '&&&&&&&&#&#&#&#&'
    # Paragraph-separator markers become newlines first, so they are
    # handled like any other paragraph boundary below.
    text = text.replace("--- Para SEP ---", '\n')
    # Shield real paragraph breaks (blank lines) from the collapse step.
    text = text.replace("\n\n", ' ' + sentinel)
    # Remaining single newlines are soft wraps; flatten them to spaces.
    text = text.replace('\n', ' ')
    # Collapse any run of whitespace into one space.
    text = re.sub(r'\s+', " ", text)
    # Restore the protected paragraph breaks.
    text = text.replace(sentinel, '\n\n')
    return text
14
+
15
def del_spans(span_sc, indexes: list):
    """Delete the span at position ``idx + 1`` for each ``idx`` in *indexes*.

    Deletion walks the indexes from largest to smallest so earlier
    deletions never shift the positions still pending. Indexes whose
    ``+1`` offset falls outside the container are silently skipped.

    Args:
        span_sc: a spaCy SpanGroup (or any container supporting ``len``
            and ``del``) to delete from.
        indexes: candidate positions as recorded by the callers in this
            module; the list is NOT modified.

    NOTE(review): the ``idx + 1`` offset looks suspicious given that the
    callers enumerate spans from 0 -- confirm against SpanGroup deletion
    semantics before changing it.
    """
    # Iterate a reverse-sorted copy instead of calling indexes.sort(),
    # so the caller's list is not reordered as a hidden side effect.
    for idx in sorted(indexes, reverse=True):
        if idx + 1 < len(span_sc):
            del span_sc[idx + 1]
25
+
26
def delete_overlapping_span(span_sc: dict):
    """Resolve overlapping spans in a span group, keeping the best candidate.

    Spans that share the same start token are compared pairwise: when two
    such spans carry the same label, the one with the higher confidence
    score survives and the loser is scheduled for deletion; when the labels
    differ, the later span is scheduled. Independently, any span that
    crosses a sentence boundary is scheduled. Actual deletion is delegated
    to :func:`del_spans`.

    Args:
        span_sc: a spaCy SpanGroup whose ``attrs['scores']`` holds one
            confidence score per span, aligned with the group order.
    """
    start_token_list = [spn.start for spn in span_sc]
    dict_ = Counter(start_token_list)
    # Start tokens claimed by more than one span -> overlap candidates.
    overlap = {k: v for k, v in dict_.items() if v > 1}

    id_del = []   # enumerate-positions of spans scheduled for deletion
    id_comp = {}  # start token -> position of the current best span there

    info = {}
    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
                                     start=0):
        res = {
            'score': score,
            'spn': spn,
            'label': spn.label_,
            'start': spn.start,
            'end': spn.end,
            'compare': spn.start in overlap,
            "sents": len(list(spn.sents))
        }
        info[n] = res

        if res['compare']:
            if spn.start not in id_comp:
                # First span seen at this start token: provisional winner.
                id_comp[spn.start] = n
            else:
                same_lbl = res['label'] == info[id_comp[spn.start]]['label']
                update = res['score'] > info[id_comp[spn.start]]['score']
                if update and same_lbl:
                    # Same label, better score: dethrone the previous winner.
                    id_del.append(id_comp[spn.start])
                    id_comp[spn.start] = n
                else:
                    id_del.append(n)

        # Spans extending beyond a single sentence are always dropped.
        if len(list(spn.sents)) > 1:
            id_del.append(n)

    # BUG FIX: a span could be scheduled twice (once by the overlap logic
    # and once by the sentence check). With duplicate indexes, del_spans'
    # reverse-ordered pass deletes a *different* span on the second hit,
    # so deduplicate before deleting.
    del_spans(span_sc, list(set(id_del)))
79
+
80
def cleanup_justify(doc, span_sc: dict):
    """Re-anchor JUSTIFYING spans onto the full subtree of their root token.

    For every span labelled JUSTIFYING, a replacement span covering the
    root token's whole syntactic subtree (left edge .. right edge) is
    proposed; the original JUSTIFYING spans are deleted from the group and
    the re-anchored ones appended. A JUSTIFYING span whose root is governed
    by another JUSTIFYING root is considered redundant and only deleted.

    Args:
        doc: the spaCy Doc the spans belong to.
        span_sc: the span group (e.g. ``doc.spans['sc']``) to adjust in place.
    """
    # This function adjusts the JUSTIFYING span

    # First create an index of span with JUSTIFYING tags
    justifies = {}
    for idx, span in enumerate(span_sc):
        # temp_root = span.root
        # while span.start <= temp_root.head.i <= span.end:
        #     temp_root = temp_root.head
        if span.label_ in ['JUSTIFYING']:
            justifies[span.root] = {
                "span": span,
                "head": span.root.head,
                "start": span.start,
                "end": span.end,
                "del": False,        # set True once the span is handled
                "dependent": False,  # True if governed by another JUSTIFYING root
                "span_idx": idx      # position in span_sc, fed to del_spans
            }
    # print(justifies)

    # flagging the dependency: a JUSTIFYING root whose head is itself a
    # JUSTIFYING root is subsumed by the larger span -> delete only.
    for spanroot, info in justifies.items():
        if spanroot.head in justifies:
            info['dependent'] = True
            info['del'] = True

    # print(justifies)
    new_spans = []
    for spanroot, info in justifies.items():

        if not info['dependent']:
            # print("New Justifying candidate span:")
            # print(doc[spanroot.left_edge.i:spanroot.right_edge.i + 1])

            # Candidate replacement: the root token's entire subtree.
            new_span = doc[spanroot.left_edge.i:spanroot.right_edge.i + 1]
            new_span.label_ = "JUSTIFYING"

            if new_span not in span_sc:
                new_spans.append(new_span)
                info['del'] = True

            else:
                # An identical span already exists; just drop the original.
                info['del'] = True

    # Positions of every original JUSTIFYING span scheduled for deletion.
    to_delete = [
        info['span_idx'] for spanroot, info in justifies.items() if info['del']
    ]

    # Kept only for debugging inspection; not used below.
    to_delete_span = [
        info['span'] for spanroot, info in justifies.items() if info['del']
    ]

    # print(to_delete)
    # print(to_delete_span)

    del_spans(span_sc, to_delete)

    # Append the re-anchored spans to the group in one batch.
    span_grp = SpanGroup(doc, spans=new_spans)
    span_sc.extend(span_grp)

    # print(justifies)
utils/visualize.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ #
5
+ # This code is adapted from spacy-streamlit package by explosion
6
+ # https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
7
+ #
8
+
9
+ from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
10
+ import streamlit as st
11
+ import spacy
12
+ from spacy.language import Language
13
+ from spacy import displacy
14
+ import pandas as pd
15
+
16
+ import streamlit as st
17
+ from spacy_streamlit import visualize_spans
18
+ from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO
19
+
20
+ from pipeline.post_processors import (
21
+ simple_table,
22
+ const_table,
23
+ ngrammar,
24
+ diversity_values,
25
+ )
26
+ from skbio import diversity as dv
27
+
28
# Installed spaCy version as an int tuple, e.g. (3, 4, 1); used below to
# gate the span visualizer, which needs spacy>=3.3.0.
SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))

# fmt: off
# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
# Span attributes surfaced in the data table shown next to the rendering.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Canonical ordering of engagement labels; count tables are reindexed on
# this list so every category appears even when its count is zero.
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]
41
def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
    show_confidence: bool = False,
    show_diversity: bool = False,
    show_ngrams: bool = False,
):
    """
    Visualizer for spans.

    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
    argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
        See https://spacy.io/api/top-level#displacy_options-span
    simple (bool): Build the table with simple_table instead of const_table.
    show_confidence (bool): Also render a per-label confidence-score summary.
    show_ngrams (bool): Also render label bigrams/trigrams.
    show_diversity (bool): Also render entropy-based diversity measures.
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )
    if not displacy_options:
        displacy_options = dict()
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )
    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    if show_table:
        # BUG FIX: spans_key was ignored here (hard-coded 'sc'), so the
        # rendering above and the table below could read *different* span
        # groups whenever a caller passed a non-default key.
        if simple:
            data, cols = simple_table(doc, spans_key=spans_key, attrs=attrs)
        else:
            data, cols = const_table(doc, spans_key=spans_key, attrs=attrs)

        if data:
            df = pd.DataFrame(data, columns=cols)
            df = df.astype({"start": int, "end": int})
            df = df.sort_values(by=['start'])
            st.subheader("Engagement span information")

            # Highlight rows whose confidence score falls at or below 0.7.
            st.dataframe(
                df.style.highlight_between(subset='Conf. score', right=.7))

            # Label frequencies in canonical category order (zero-filled).
            counts = df['label_'].value_counts().reindex(CATEGORIES,
                                                         fill_value=0)

            if show_confidence:
                st.subheader("Label counts & Diagnostic confidence score summary")
                label_counts = df.groupby('label_').agg({
                    "label_": 'count',
                    "Conf. score": ['median', 'min', 'max']
                }).round(4).reindex(CATEGORIES, fill_value=0)
                st.dataframe(label_counts)

            if show_ngrams:
                sequences = list(df['label_'])

                # Engagement label n-grams over the document order.
                span_bigrams = ngrammar(seq=sequences, n=2, concat=True)
                span_trigrams = ngrammar(seq=sequences, n=3, concat=True)

                st.dataframe(pd.DataFrame(span_bigrams))
                st.code(span_trigrams)

            st.subheader("Engagement label by grammatical function")
            # NOTE(review): the 'grammatical realization' column appears to
            # be produced only by const_table -- this crosstab presumably
            # fails when simple=True; confirm against
            # pipeline.post_processors before relying on it.
            label_dep = pd.crosstab(df['grammatical realization'], df['label_'])
            st.dataframe(label_dep)

            if show_diversity:
                st.subheader('Diversity of rhetorical features')
                st.markdown("##### Entropy based diversity measures")

                # Placeholder until per-file provenance is threaded through.
                filename = "NA"

                div = diversity_values(list(counts))
                div_data = pd.DataFrame.from_dict(div, orient='index')

                doc_data = pd.concat([div_data, counts, ], axis=0).T
                doc_data.insert(0, "filename", filename, True)
                doc_data.insert(1, "nwords", len(doc), True)
                st.dataframe(doc_data)