init commit
This view is limited to 50 files because it contains too many changes.
- .DS_Store +0 -0
- __init__.py +0 -0
- app.py +24 -0
- requirements.txt +3 -0
- text_complexity_analyzer_cm/.DS_Store +0 -0
- text_complexity_analyzer_cm/__init__.py +3 -0
- text_complexity_analyzer_cm/coh_metrix_indices/.DS_Store +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__init__.py +3 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/__init__.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/connective_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/descriptive_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/lexical_diversity_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/readability_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/referential_cohesion_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_complexity_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_pattern_density_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/word_information_indices.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/coh_metrix_indices/connective_indices.py +77 -0
- text_complexity_analyzer_cm/coh_metrix_indices/descriptive_indices.py +123 -0
- text_complexity_analyzer_cm/coh_metrix_indices/lexical_diversity_indices.py +37 -0
- text_complexity_analyzer_cm/coh_metrix_indices/readability_indices.py +66 -0
- text_complexity_analyzer_cm/coh_metrix_indices/referential_cohesion_indices.py +402 -0
- text_complexity_analyzer_cm/coh_metrix_indices/syntactic_complexity_indices.py +56 -0
- text_complexity_analyzer_cm/coh_metrix_indices/syntactic_pattern_density_indices.py +126 -0
- text_complexity_analyzer_cm/coh_metrix_indices/word_information_indices.py +99 -0
- text_complexity_analyzer_cm/constants.py +22 -0
- text_complexity_analyzer_cm/perm.py +79 -0
- text_complexity_analyzer_cm/pipes/__init__.py +3 -0
- text_complexity_analyzer_cm/pipes/__pycache__/__init__.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/additive_connectives_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/adversative_connectives_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/asks_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/causal_connectives_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/emphatics_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/feature_counter.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/logical_connectives_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/negative_expression_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/noun_phrase_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/polites_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_adjacent_sentences_analyzer.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_all_sentences_analyzer.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/syllable_splitter.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/temporal_connectives_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/__pycache__/verb_phrase_tagger.cpython-39.pyc +0 -0
- text_complexity_analyzer_cm/pipes/additive_connectives_tagger.py +59 -0
- text_complexity_analyzer_cm/pipes/adversative_connectives_tagger.py +64 -0
- text_complexity_analyzer_cm/pipes/asks_tagger.py +37 -0
- text_complexity_analyzer_cm/pipes/causal_connectives_tagger.py +34 -0
- text_complexity_analyzer_cm/pipes/emphatics_tagger.py +42 -0
- text_complexity_analyzer_cm/pipes/feature_counter.py +32 -0
.DS_Store
ADDED
Binary file (6.15 kB).
__init__.py
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,24 @@
import gradio as gr
from text_complexity_analyzer_cm.perm import PERM


def predict(text):
    perm = PERM('en')
    analytics = []
    analytics.append(perm.calculate_word_information_indices_for_one_text(text, workers=-1))
    analytics.append(perm.calculate_descriptive_indices_for_one_text(text, workers=-1))
    return analytics


title = "Get the Analytics of your Message"

iface = gr.Interface(fn=predict,
                     inputs=gr.inputs.Textbox(
                         lines=3, label='Insert any given text to get textual analytics.'),
                     outputs="text",
                     title=title,
                     theme="huggingface",
                     examples=[
                         'We are going to analyze this text for persuasiveness, word information, and descriptive information.']
                     )
iface.launch()
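Note that `gr.inputs.Textbox` and the `theme="huggingface"` string belong to the Gradio 2.x API; the `gr.inputs` namespace was deprecated and later removed. A minimal sketch of the same interface against a newer Gradio release (3.x or later), not part of this commit:

    import gradio as gr

    from text_complexity_analyzer_cm.perm import PERM

    def predict(text):
        # Same analysis as above; PERM comes from this repository.
        perm = PERM('en')
        return [perm.calculate_word_information_indices_for_one_text(text, workers=-1),
                perm.calculate_descriptive_indices_for_one_text(text, workers=-1)]

    # Gradio 3+ moved input components to the top-level namespace.
    iface = gr.Interface(
        fn=predict,
        inputs=gr.Textbox(lines=3, label='Insert any given text to get textual analytics.'),
        outputs='text',
        title='Get the Analytics of your Message',
    )
    iface.launch()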
requirements.txt
ADDED
@@ -0,0 +1,3 @@
spacy
Pyphen
python3 -m spacy download en_core_web_md
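The last line above is a shell command, not a package specifier, so `pip install -r requirements.txt` will fail when it reaches it (this likely explains the Space's runtime error). A common workaround is to pin the model as a direct wheel URL; the version below is a placeholder assumption and must match the installed spaCy version:

    spacy
    Pyphen
    # Direct-URL requirement replacing the shell command; the version shown is illustrative.
    en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl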
text_complexity_analyzer_cm/.DS_Store
ADDED
Binary file (8.2 kB).
text_complexity_analyzer_cm/__init__.py
ADDED
@@ -0,0 +1,3 @@
'''
This module contains the classes and functions that handle the processing of text using the Coh-Metrix indices.
'''
text_complexity_analyzer_cm/coh_metrix_indices/.DS_Store
ADDED
Binary file (6.15 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__init__.py
ADDED
@@ -0,0 +1,3 @@
'''
This module contains the classes to calculate the Coh-Metrix indices.
'''
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (285 Bytes).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/connective_indices.cpython-39.pyc
ADDED
Binary file (8.01 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/descriptive_indices.cpython-39.pyc
ADDED
Binary file (8.17 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/lexical_diversity_indices.cpython-39.pyc
ADDED
Binary file (2.14 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/readability_indices.cpython-39.pyc
ADDED
Binary file (3.28 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/referential_cohesion_indices.cpython-39.pyc
ADDED
Binary file (17.7 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_complexity_indices.cpython-39.pyc
ADDED
Binary file (3.98 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_pattern_density_indices.cpython-39.pyc
ADDED
Binary file (7.2 kB).
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/word_information_indices.cpython-39.pyc
ADDED
Binary file (11.8 kB).
text_complexity_analyzer_cm/coh_metrix_indices/connective_indices.py
ADDED
@@ -0,0 +1,77 @@
import multiprocessing
import pyphen
import spacy
import string

from typing import Callable
from typing import List
from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
from text_complexity_analyzer_cm.utils.utils import split_text_into_sentences

class ConnectiveIndices:
    '''
    This class handles all operations to find the connective indices of a text according to Coh-Metrix.
    '''
    def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
        self.language = language
        self._nlp = nlp
        self._incidence = 1

        if descriptive_indices is None:
            self._di = DescriptiveIndices(nlp=nlp, language=language)
        else:
            self._di = descriptive_indices

    def _get_connectives_incidence(self, text: str, disable_pipeline: List, count_connectives_function: Callable, word_count: int=None, workers: int=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing.cpu_count() if workers == -1 else workers (multiprocessing not enabled yet)
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = count_connectives_function
        connectives = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))

        return (connectives / wc) * self._incidence  # Incidence per {self._incidence} words

    def get_causal_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['causal connective tagger', 'feature counter']]
        causal_connectives_counter = lambda doc: len(doc._.causal_connectives)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=causal_connectives_counter, word_count=word_count, workers=workers)

    def get_temporal_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['temporal connective tagger', 'feature counter']]
        temporal_connectives_counter = lambda doc: len(doc._.temporal_connectives)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=temporal_connectives_counter, word_count=word_count, workers=workers)

    def get_exemplifications_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['exemplifications tagger', 'tagger', 'feature counter']]
        exemplifications_counter = lambda doc: len(doc._.exemplifications)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=exemplifications_counter, word_count=word_count, workers=workers)

    def get_emphatics_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['emphatics tagger', 'tagger', 'feature counter']]
        emphatics_counter = lambda doc: len(doc._.emphatics)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=emphatics_counter, word_count=word_count, workers=workers)

    def get_asks_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['asks tagger', 'tagger', 'feature counter']]
        asks_counter = lambda doc: len(doc._.asks)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=asks_counter, word_count=word_count, workers=workers)

    def get_polites_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['polites tagger', 'tagger', 'feature counter']]
        polites_counter = lambda doc: len(doc._.polites)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=polites_counter, word_count=word_count, workers=workers)

    def get_logical_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['logical connective tagger', 'tagger', 'feature counter']]
        logical_connectives_counter = lambda doc: len(doc._.logical_connectives)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=logical_connectives_counter, word_count=word_count, workers=workers)

    def get_adversative_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['adversative connective tagger', 'tagger', 'feature counter']]
        adversative_connectives_counter = lambda doc: len(doc._.adversative_connectives)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=adversative_connectives_counter, word_count=word_count, workers=workers)

    def get_additive_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['additive connective tagger', 'tagger', 'feature counter']]
        additive_connectives_counter = lambda doc: len(doc._.additive_connectives)

        return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=additive_connectives_counter, word_count=word_count, workers=workers)
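A usage sketch for the class above. It assumes the custom pipes this file relies on ('causal connective tagger', 'feature counter', and so on) have already been registered on the spaCy pipeline; that wiring happens elsewhere in the repository and is not shown in this diff:

    import spacy

    from text_complexity_analyzer_cm.coh_metrix_indices.connective_indices import ConnectiveIndices

    nlp = spacy.load('en_core_web_md')  # the model pulled in by requirements.txt
    # ... register 'causal connective tagger', 'feature counter', etc. on nlp here ...
    ci = ConnectiveIndices(nlp, language='en')
    print(ci.get_causal_connectives_incidence('Because it rained, we stayed home.'))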
text_complexity_analyzer_cm/coh_metrix_indices/descriptive_indices.py
ADDED
@@ -0,0 +1,123 @@
import multiprocessing
import spacy
import statistics
import string

from typing import Callable
from typing import List
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES, LANGUAGES_DICTIONARY_PYPHEN
from text_complexity_analyzer_cm.pipes.syllable_splitter import SyllableSplitter
from text_complexity_analyzer_cm.utils.statistics_results import StatisticsResults
from text_complexity_analyzer_cm.utils.utils import is_word
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
from text_complexity_analyzer_cm.utils.utils import split_text_into_sentences
from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences


class DescriptiveIndices:
    '''
    This class handles all operations to find the descriptive indices of a text according to Coh-Metrix.
    '''
    def __init__(self, nlp, language: str='en') -> None:
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self.language = language
        self._nlp = nlp

    def get_paragraph_count_from_text(self, text: str) -> int:
        if len(text) == 0:
            raise ValueError('The text is empty.')

        return len(split_text_into_paragraphs(text))

    def get_sentence_count_from_text(self, text: str, workers: int=-1) -> int:
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or any positive number greater than 0')
        else:
            paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
            threads = 1  # multiprocessing.cpu_count() if workers == -1 else workers (multiprocessing not enabled yet)
            disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]

            sentence_counter = lambda doc: sum(1 for _ in doc.sents)
            self._nlp.get_pipe('feature counter').counter_function = sentence_counter

            sentences = sum(doc._.feature_count
                            for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))

            return sentences

    def get_word_count_from_text(self, text: str, workers: int=-1) -> int:
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or any positive number greater than 0')
        else:
            paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
            threads = 1  # multiprocessing.cpu_count() if workers == -1 else workers (multiprocessing not enabled yet)
            word_counter = lambda doc: sum(1 for token in doc if is_word(token))
            disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'feature counter']
            self._nlp.get_pipe('feature counter').counter_function = word_counter

            total_words = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))

            return total_words

    def _get_mean_std_of_metric(self, text: str, disable_pipeline: List, counter_function: Callable, statistic_type: str='all', workers=-1) -> StatisticsResults:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing not enabled yet
        self._nlp.get_pipe('feature counter').counter_function = counter_function
        counter = []

        for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
            current_result = doc._.feature_count  # Find the values to add to the counter
            if not isinstance(current_result, list):  # Add any plain number directly
                counter.append(current_result)
            else:
                if len(current_result) > 0:  # Only extend when the list is not empty
                    counter.extend(current_result)

        stat_results = StatisticsResults()
        if statistic_type in ['std', 'all']:
            stat_results.std = statistics.pstdev(counter)

        if statistic_type in ['mean', 'all']:
            stat_results.mean = statistics.mean(counter)

        return stat_results

    def get_length_of_paragraphs(self, text: str, workers: int=-1) -> StatisticsResults:
        count_length_of_paragraphs = lambda doc: sum(1 for _ in split_doc_into_sentences(doc))  # Paragraph length measured in sentences
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]

        return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_length_of_paragraphs, statistic_type='all', workers=workers)

    def get_length_of_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
        count_length_of_sentences = lambda doc: [len([1 for token in sentence
                                                      if is_word(token)])
                                                 for sentence in doc.sents]  # Sentence length measured in words

        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]

        return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_length_of_sentences, statistic_type='all', workers=workers)

    def get_length_of_words(self, text: str, workers: int=-1) -> StatisticsResults:
        count_letters_per_word = lambda doc: [len(token)
                                              for token in doc
                                              if is_word(token)]  # Word length measured in letters

        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'feature counter']

        return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_letters_per_word, statistic_type='all', workers=workers)

    def get_syllables_per_word(self, text: str, workers=-1) -> StatisticsResults:
        count_syllables_per_word = lambda doc: [len(token._.syllables)
                                                for token in doc
                                                if is_word(token) and token._.syllables is not None]

        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['syllables', 'feature counter']]

        return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_syllables_per_word, statistic_type='all', workers=workers)
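For intuition on `_get_mean_std_of_metric`: sentence lengths of 4, 6 and 8 words give a mean of 6 and a population standard deviation (`statistics.pstdev`) of about 1.63. A minimal usage sketch, assuming `nlp` carries the repository's 'sentencizer' and 'feature counter' pipes:

    di = DescriptiveIndices(nlp, language='en')  # nlp as set up earlier
    lengths = di.get_length_of_sentences('The cat sat. The dog slept soundly today.')
    print(lengths.mean, lengths.std)  # StatisticsResults exposes .mean and .std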
text_complexity_analyzer_cm/coh_metrix_indices/lexical_diversity_indices.py
ADDED
@@ -0,0 +1,37 @@
import multiprocessing
import spacy
import string

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import is_content_word
from text_complexity_analyzer_cm.utils.utils import is_word
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs

class LexicalDiversityIndices:
    '''
    This class handles all operations to find the lexical diversity indices of a text according to Coh-Metrix.
    '''
    def __init__(self, nlp, language: str='en') -> None:
        self.language = language
        self._nlp = nlp

    def get_type_token_ratio_between_all_words(self, text: str, workers=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing not enabled yet
        disable_pipeline = []

        tokens = [token.text.lower()
                  for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads)
                  for token in doc
                  if is_word(token)]

        return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)

    def get_type_token_ratio_of_content_words(self, text: str, workers=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing not enabled yet
        disable_pipeline = []

        tokens = [token.text.lower()
                  for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads)
                  for token in doc
                  if is_content_word(token)]

        return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
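The type-token ratio above is simply unique types over total tokens. A worked example independent of spaCy:

    tokens = ['the', 'cat', 'saw', 'the', 'dog']
    ttr = len(set(tokens)) / len(tokens)
    print(ttr)  # 4 unique types / 5 tokens = 0.8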
text_complexity_analyzer_cm/coh_metrix_indices/readability_indices.py
ADDED
@@ -0,0 +1,66 @@
import multiprocessing

import spacy

from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES


class ReadabilityIndices:
    '''
    This class will handle all operations to find the readability indices of a text according to Coh-Metrix.
    '''

    def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
        '''
        The constructor will initialize this object that calculates the readability indices for a specific language of those that are available.

        Parameters:
        nlp: The spacy model that corresponds to a language.
        language(str): The language that the texts to process will have.
        descriptive_indices(DescriptiveIndices): The class that calculates the descriptive indices of a text in a certain language.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')
        elif descriptive_indices is not None and descriptive_indices.language != language:
            raise ValueError('The descriptive indices analyzer must be of the same language as the readability analyzer.')

        self.language = language
        self._nlp = nlp

        if descriptive_indices is None:  # Assign the descriptive indices to an attribute
            self._di = DescriptiveIndices(language=language, nlp=nlp)
        else:
            self._di = descriptive_indices

    def calculate_fernandez_huertas_grade_level(self, text: str=None, mean_syllables_per_word: float=None, mean_words_per_sentence: float=None, workers: int=-1) -> float:
        '''
        This function obtains the Fernández-Huertas readability index for a text.

        Parameters:
        text(str): The text to be analyzed.
        mean_syllables_per_word(float): The mean of syllables per word in the text.
        mean_words_per_sentence(float): The mean amount of words per sentence in the text.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The Fernández-Huertas readability index for a text.
        '''
        if self.language != 'es':  # The Fernández-Huertas index is defined for Spanish
            raise ValueError('This readability index is only available for Spanish.')
        elif text is not None and len(text) == 0:
            raise ValueError('The text is empty.')
        elif text is None and (mean_syllables_per_word is None or mean_words_per_sentence is None):
            raise ValueError('If there\'s no text, then you must pass mean_syllables_per_word and mean_words_per_sentence at the same time.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or any positive number greater than 0')
        else:
            threads = multiprocessing.cpu_count() if workers == -1 else workers
            # The DescriptiveIndices methods return StatisticsResults, so take the mean of each metric.
            mspw = mean_syllables_per_word if mean_syllables_per_word is not None else self._di.get_syllables_per_word(text=text, workers=threads).mean
            mwps = mean_words_per_sentence if mean_words_per_sentence is not None else self._di.get_length_of_sentences(text=text, workers=threads).mean

            return 206.84 - 0.6 * mspw - 1.02 * mwps
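A worked example of the formula as coded: with a mean of 1.8 syllables per word and 15 words per sentence, the score is 206.84 - 0.6 * 1.8 - 1.02 * 15 = 190.46. Note that the published Fernández-Huertas index applies the 0.60 coefficient to syllables per 100 words, so the scale here follows this implementation rather than the original formulation.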
text_complexity_analyzer_cm/coh_metrix_indices/referential_cohesion_indices.py
ADDED
@@ -0,0 +1,402 @@
import multiprocessing
import spacy
import statistics

from itertools import combinations
from spacy.tokens import Span
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.statistics_results import StatisticsResults
from text_complexity_analyzer_cm.utils.utils import is_word
from text_complexity_analyzer_cm.utils.utils import is_content_word
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
from typing import Callable
from typing import List


class ReferentialCohesionIndices:
    '''
    This class will handle all operations to find the referential cohesion indices of a text according to Coh-Metrix.
    '''
    # TODO: Implement multiprocessing
    def __init__(self, nlp, language: str='en') -> None:
        '''
        The constructor will initialize this object that calculates the referential cohesion indices for a specific language of those that are available.

        Parameters:
        nlp: The spacy model that corresponds to a language.
        language(str): The language that the texts to process will have.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self.language = language
        self._nlp = nlp

    def _calculate_overlap_for_adjacent_sentences(self, text: str, disable_pipeline: List, sentence_analyzer: Callable, statistic_type: str='mean', workers: int=-1) -> StatisticsResults:
        '''
        This method calculates the overlap for adjacent sentences in a text. Multiprocessing is still not implemented.

        Parameters:
        text(str): The text to be analyzed.
        disable_pipeline(List): The pipeline elements to be disabled.
        sentence_analyzer(Callable): The function that analyzes sentences to check cohesion.
        statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The standard deviation and mean of the overlap.
        '''
        # TODO: multiprocessing. 'workers' is just a placeholder for now.
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif statistic_type not in ['mean', 'std', 'all']:
            raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
        else:
            self._nlp.get_pipe('referential cohesion adjacent sentences analyzer').sentence_analyzer = sentence_analyzer
            doc = self._nlp(text, disable=disable_pipeline)
            stat_results = StatisticsResults()  # Create empty container

            if len(doc._.referential_cohesion_adjacent) == 0:
                return stat_results
            else:
                if statistic_type in ['mean', 'all']:
                    stat_results.mean = statistics.mean(doc._.referential_cohesion_adjacent)

                if statistic_type in ['std', 'all']:
                    stat_results.std = statistics.pstdev(doc._.referential_cohesion_adjacent)

                return stat_results

    def _calculate_overlap_for_all_sentences(self, text: str, disable_pipeline: List, sentence_analyzer: Callable, statistic_type: str='all', workers: int=-1) -> StatisticsResults:
        '''
        This method calculates the overlap for all sentence pairs in a text. Multiprocessing is still not implemented.

        Parameters:
        text(str): The text to be analyzed.
        disable_pipeline(List): The pipeline elements to be disabled.
        sentence_analyzer(Callable): The function that analyzes sentences to check cohesion.
        statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The standard deviation and mean of the overlap.
        '''
        # TODO: multiprocessing. 'workers' is just a placeholder for now.
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif statistic_type not in ['mean', 'std', 'all']:
            raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
        else:
            self._nlp.get_pipe('referential cohesion all sentences analyzer').sentence_analyzer = sentence_analyzer
            doc = self._nlp(text, disable=disable_pipeline)
            stat_results = StatisticsResults()  # Create empty container

            if len(doc._.referential_cohesion_all) == 0:
                return stat_results
            else:
                if statistic_type in ['mean', 'all']:
                    stat_results.mean = statistics.mean(doc._.referential_cohesion_all)

                if statistic_type in ['std', 'all']:
                    stat_results.std = statistics.pstdev(doc._.referential_cohesion_all)

                return stat_results

    def get_noun_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the noun overlap for adjacent sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean noun overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]

        return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_noun_overlap, statistic_type='mean').mean

    def get_noun_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the noun overlap for all sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean noun overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]

        return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_noun_overlap, statistic_type='mean').mean

    def get_argument_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the argument overlap for adjacent sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean argument overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]

        return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_argument_overlap, statistic_type='mean').mean

    def get_argument_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the argument overlap for all sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean argument overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]

        return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_argument_overlap, statistic_type='mean').mean

    def get_stem_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the stem overlap for adjacent sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean stem overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]

        return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_stem_overlap, statistic_type='mean').mean

    def get_stem_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the stem overlap for all sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean stem overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]

        return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_stem_overlap, statistic_type='mean').mean

    def get_content_word_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
        '''
        This method calculates the mean and standard deviation of the content word overlap for adjacent sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The mean and standard deviation of the content word overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]

        return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_content_word_overlap, statistic_type='all')

    def get_content_word_overlap_all_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
        '''
        This method calculates the mean and standard deviation of the content word overlap for all sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The mean and standard deviation of the content word overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]

        return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_content_word_overlap, statistic_type='all')

    def get_anaphore_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the mean of the anaphore overlap for adjacent sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean of the anaphore overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]

        return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_anaphore_overlap, statistic_type='all').mean

    def get_anaphore_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
        '''
        This method calculates the mean of the anaphore overlap for all sentences in a text.

        Parameters:
        text(str): The text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The mean of the anaphore overlap.
        '''
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]

        return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_anaphore_overlap, statistic_type='all').mean


def analyze_noun_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
    '''
    This function analyzes whether or not there's noun overlap between two sentences for a language.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the tokens in a dictionary for search efficiency
    prev_sentence_noun_tokens = {token.text.lower(): None
                                 for token in prev_sentence
                                 if is_word(token) and token.pos_ == 'NOUN'}

    for token in cur_sentence:
        if language == 'en':
            if is_word(token) and token.pos_ == 'NOUN' and token.text.lower() in prev_sentence_noun_tokens:
                return 1  # There's cohesion

    return 0  # No cohesion


def analyze_argument_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
    '''
    This function analyzes whether or not there's argument overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the tokens in a dictionary for search efficiency
    prev_sentence_noun_tokens = {token.lemma_.lower(): None
                                 for token in prev_sentence
                                 if is_word(token) and token.pos_ == 'NOUN'}

    prev_sentence_personal_pronouns_tokens = {token.text.lower(): None
                                              for token in prev_sentence
                                              if is_word(token) and 'PronType=Prs' in token.tag_}

    for token in cur_sentence:  # Iterate every token of the current sentence
        if language == 'en':
            if is_word(token) and token.pos_ == 'NOUN' and token.lemma_.lower() in prev_sentence_noun_tokens:
                return 1  # There's cohesion by noun lemma

            if is_word(token) and 'PronType=Prs' in token.tag_ and token.text.lower() in prev_sentence_personal_pronouns_tokens:
                return 1  # There's cohesion by personal pronoun

    return 0  # No cohesion


def analyze_stem_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
    '''
    This function analyzes whether or not there's stem overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the tokens in a dictionary for search efficiency
    prev_sentence_content_stem_tokens = {token.lemma_.lower(): None
                                         for token in prev_sentence
                                         if is_content_word(token)}

    for token in cur_sentence:
        if language == 'en':
            if is_word(token) and token.pos_ in ['NOUN', 'PROPN'] and token.lemma_.lower() in prev_sentence_content_stem_tokens:
                return 1  # There's cohesion

    return 0  # No cohesion


def analyze_content_word_overlap(prev_sentence: Span, cur_sentence: Span, language='en') -> float:
    '''
    This function calculates the proportional content word overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    float: Proportion of tokens that overlap between the current and previous sentences.
    '''
    total_tokens = len([token for token in prev_sentence if is_content_word(token)]) + len([token for token in cur_sentence if is_content_word(token)])

    if total_tokens == 0:  # Nothing to compute
        return 0
    else:
        prev_sentence_content_words_tokens = {token.text.lower(): None
                                              for token in prev_sentence
                                              if is_content_word(token)}
        matches = 0  # Match counter

        for token in cur_sentence:
            if language == 'en':
                if is_content_word(token) and token.text.lower() in prev_sentence_content_words_tokens:
                    matches += 2  # Count the match on both sides, since total_tokens spans both sentences

        return matches / total_tokens


def analyze_anaphore_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
    '''
    This function analyzes whether or not there's anaphore overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the tokens in a dictionary for search efficiency
    prev_sentence_pronoun_tokens = {token.text.lower(): None
                                    for token in prev_sentence
                                    if is_word(token) and token.pos_ == 'PRON'}

    for token in cur_sentence:
        if language == 'en':
            if is_word(token) and token.pos_ == 'PRON' and token.text.lower() in prev_sentence_pronoun_tokens:
                return 1  # There's cohesion

    return 0  # No cohesion
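A minimal sketch of the noun-overlap check implemented by `analyze_noun_overlap`, assuming an English spaCy model whose tagger labels 'dog' as a NOUN:

    import spacy

    nlp = spacy.load('en_core_web_md')
    doc = nlp('The dog barked. The dog slept.')
    prev_sentence, cur_sentence = list(doc.sents)
    print(analyze_noun_overlap(prev_sentence, cur_sentence, language='en'))  # 1: 'dog' recurs as a noun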
text_complexity_analyzer_cm/coh_metrix_indices/syntactic_complexity_indices.py
ADDED
@@ -0,0 +1,56 @@
import multiprocessing
from typing import Tuple

import spacy
import statistics

from spacy.tokens import Span
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import is_word
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences


class SyntacticComplexityIndices:
    '''
    This class handles all operations to find the syntactic complexity indices of a text according to Coh-Metrix.
    '''
    def __init__(self, nlp, language: str='en') -> None:
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self.language = language
        self._nlp = nlp

    def get_mean_number_of_modifiers_per_noun_phrase(self, text: str, workers: int=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing not enabled yet
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['parser', 'tagger', 'noun phrase tagger', 'feature counter']]
        modifiers_counter = lambda doc: [sum(1 for token in nph if token.pos_ == 'ADJ')
                                         for nph in doc._.noun_phrases]
        self._nlp.get_pipe('feature counter').counter_function = modifiers_counter
        modifiers_per_noun_phrase = []

        for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
            modifiers_per_noun_phrase.extend(doc._.feature_count)

        return statistics.mean(modifiers_per_noun_phrase)

    def get_mean_number_of_words_before_main_verb(self, text: str, workers: int=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        threads = 1  # multiprocessing not enabled yet
        words_before_main_verb = []
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['feature counter', 'sentencizer']]
        words_before_main_verb_counter = lambda doc: [amount_of_words_before_main_verb(s) for s in split_doc_into_sentences(doc)]
        self._nlp.get_pipe('feature counter').counter_function = words_before_main_verb_counter

        for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
            words_before_main_verb.extend(doc._.feature_count)

        return statistics.mean(words_before_main_verb)


def amount_of_words_before_main_verb(sentence: Span) -> int:
    '''
    Counts the words that appear before the ROOT verb (or auxiliary) of a sentence.
    '''
    left_words = []

    for token in sentence:
        if token.pos_ in ['VERB', 'AUX'] and token.dep_ == 'ROOT':
            break
        else:
            if is_word(token):
                left_words.append(token.text)

    return len(left_words)
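To illustrate `amount_of_words_before_main_verb`: in 'Yesterday the tired old man slept.', five words precede the verb, so the function returns 5, assuming the parser marks 'slept' as the ROOT:

    doc = nlp('Yesterday the tired old man slept.')  # nlp as set up earlier
    sentence = list(doc.sents)[0]
    print(amount_of_words_before_main_verb(sentence))  # expected: 5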
text_complexity_analyzer_cm/coh_metrix_indices/syntactic_pattern_density_indices.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import multiprocessing
|
2 |
+
|
3 |
+
import spacy
|
4 |
+
|
5 |
+
from typing import Callable
|
6 |
+
from typing import List
|
7 |
+
from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
|
8 |
+
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
|
9 |
+
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
|
10 |
+
|
11 |
+
|
12 |
+
class SyntacticPatternDensityIndices:
|
13 |
+
'''
|
14 |
+
This class will handle all operations to find the synthactic pattern density indices of a text according to Coh-Metrix.
|
15 |
+
'''
|
16 |
+
|
17 |
+
def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
|
18 |
+
'''
|
19 |
+
The constructor will initialize this object that calculates the synthactic pattern density indices for a specific language of those that are available.
|
20 |
+
|
21 |
+
Parameters:
|
22 |
+
nlp: The spacy model that corresponds to a language.
|
23 |
+
language(str): The language that the texts to process will have.
|
24 |
+
descriptive_indices(DescriptiveIndices): The class that calculates the descriptive indices of a text in a certain language.
|
25 |
+
|
26 |
+
Returns:
|
27 |
+
None.
|
28 |
+
'''
|
29 |
+
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')
        elif descriptive_indices is not None and descriptive_indices.language != language:
            raise ValueError('The descriptive indices analyzer must be of the same language as the syntactic pattern density analyzer.')

        self.language = language
        self._nlp = nlp
        self._incidence = 1000

        if descriptive_indices is None:  # Assign the descriptive indices to an attribute
            self._di = DescriptiveIndices(language=language, nlp=nlp)
        else:
            self._di = descriptive_indices

    def _get_syntactic_pattern_density(self, text: str, disable_pipeline: List, sp_counter_function: Callable=None, word_count: int=None, workers: int=-1) -> float:
        '''
        This function obtains the incidence of a syntactic pattern in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        disable_pipeline(List): The pipeline components to be disabled.
        sp_counter_function(Callable): The function that counts a syntactic pattern in a spaCy document. It returns an integer.
        word_count(int): The amount of words in the text.
        workers(int): Amount of threads that will complete this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of a syntactic pattern per {self._incidence} words.
        '''
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or any positive number greater than 0.')
        else:
            paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
            threads = multiprocessing.cpu_count() if workers == -1 else workers
            wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
            self._nlp.get_pipe('feature counter').counter_function = sp_counter_function
            density = sum(doc._.feature_count
                          for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))  # Calculate with multiprocessing

            return (density / wc) * self._incidence

    def get_noun_phrase_density(self, text: str, word_count: int=None, workers: int=-1) -> float:
        '''
        This function obtains the incidence of noun phrases in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        word_count(int): The amount of words in the text.
        workers(int): Amount of threads that will complete this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of noun phrases per {self._incidence} words.
        '''
        count_noun_phrases = lambda doc: len(doc._.noun_phrases)
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['noun phrase tagger', 'tagger', 'parser', 'feature counter']]

        return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_noun_phrases, word_count=word_count, workers=workers)

    def get_verb_phrase_density(self, text: str, word_count: int=None, workers: int=-1) -> float:
        '''
        This function obtains the incidence of verb phrases in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        word_count(int): The amount of words in the text.
        workers(int): Amount of threads that will complete this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of verb phrases per {self._incidence} words.
        '''
        count_verb_phrases = lambda doc: len(doc._.verb_phrases)
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['verb phrase tagger', 'tagger', 'feature counter']]

        return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_verb_phrases, word_count=word_count, workers=workers)

    def get_negation_expressions_density(self, text: str, word_count: int=None, workers: int=-1) -> float:
        '''
        This function obtains the incidence of negation expressions in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        word_count(int): The amount of words in the text.
        workers(int): Amount of threads that will complete this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of negation expressions per {self._incidence} words.
        '''
        count_negation_expressions = lambda doc: len(doc._.negation_expressions)
        disable_pipeline = [pipe
                            for pipe in self._nlp.pipe_names
                            if pipe not in ['negative expression tagger', 'tagger', 'feature counter']]

        return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_negation_expressions, word_count=word_count, workers=workers)
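Usage note (illustrative, not part of the commit): a minimal sketch of how these density methods are meant to be called. It assumes the class is named SyntacticPatternDensityIndices and that the custom 'noun phrase tagger' and 'feature counter' pipes have already been registered and added to the pipeline, similar to what the PERM class in perm.py below does for its own pipes.

# Hypothetical usage sketch; pipe names come from the disable lists above.
import spacy
from text_complexity_analyzer_cm.coh_metrix_indices.syntactic_pattern_density_indices import SyntacticPatternDensityIndices
from text_complexity_analyzer_cm.pipes.noun_phrase_tagger import NounPhraseTagger  # registers the factory
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter      # registers the factory

nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.add_pipe('noun phrase tagger', after='parser')                     # assumed factory name
nlp.add_pipe('feature counter', config={'language': 'en'}, last=True)
spdi = SyntacticPatternDensityIndices(nlp=nlp, language='en')
print(spdi.get_noun_phrase_density('The quick brown fox jumps over the lazy dog.'))  # noun phrases per 1000 words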
text_complexity_analyzer_cm/coh_metrix_indices/word_information_indices.py
ADDED
@@ -0,0 +1,99 @@
import multiprocessing
import spacy

from typing import Callable
from typing import List
from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.utils.utils import is_word
from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs

class WordInformationIndices:
    def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
        self.language = language
        self._nlp = nlp
        self._incidence = 1000
        if descriptive_indices is None:
            self._di = DescriptiveIndices(language=language, nlp=nlp)
        else:
            self._di = descriptive_indices

    def _get_word_type_incidence(self, text: str, disable_pipeline: List, counter_function: Callable, word_count: int=None, workers: int=-1) -> float:
        paragraphs = split_text_into_paragraphs(text)
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = counter_function
        words = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=1, disable=disable_pipeline, n_process=1))
        # NOTE: incidence normalization, (words / wc) * self._incidence, is currently
        # disabled, so a raw count is returned and `wc` and `workers` go unused.
        return words

    def get_noun_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        noun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ in ['NOUN', 'PROPN'])
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=noun_counter, word_count=word_count, workers=workers)

    def get_verb_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        verb_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'VERB')
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=verb_counter, word_count=word_count, workers=workers)

    def get_adjective_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        adjective_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'ADJ')
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=adjective_counter, word_count=word_count, workers=workers)

    def get_adverb_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        adverb_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'ADV')
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=adverb_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON')
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_first_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=1' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_first_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=1' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_second_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=2' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_second_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=2' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_third_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=3' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)

    def get_personal_pronoun_third_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
        pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=3' in token.morph)
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
        return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, word_count=word_count, workers=workers)
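Usage note (illustrative, not part of the commit): a sketch of calling these counters directly. With the incidence normalization disabled, each method returns a raw count for the whole text, and the 'feature counter' pipe must already be on the pipeline.

# Hypothetical usage sketch for WordInformationIndices.
import spacy
from text_complexity_analyzer_cm.coh_metrix_indices.word_information_indices import WordInformationIndices
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter  # registers the factory

nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.add_pipe('feature counter', config={'language': 'en'}, last=True)
wii = WordInformationIndices(nlp=nlp, language='en')
print(wii.get_noun_incidence('The cat sat on the mat.'))  # raw noun count, e.g. 2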
text_complexity_analyzer_cm/constants.py
ADDED
@@ -0,0 +1,22 @@
'''
This module contains constants that will be used across the entire library.
'''

import os

# NOTE: `language` appears to be an unused duplicate of ACCEPTED_LANGUAGES.
language = {
    'es': 'es_core_news_lg',
    'en': 'en_core_web_sm'
}

ACCEPTED_LANGUAGES = {
    'es': 'es_core_news_lg',
    'en': 'en_core_web_sm',
}

LANGUAGES_DICTIONARY_PYPHEN = {
    'es': 'es',
    'en': 'en'
}

BASE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
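These constants are how the rest of the library resolves a language code to a spaCy model, exactly as perm.py does below:

# ACCEPTED_LANGUAGES maps a language code to the spaCy model to load.
import spacy
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

nlp = spacy.load(ACCEPTED_LANGUAGES['en'])  # loads en_core_web_sm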
text_complexity_analyzer_cm/perm.py
ADDED
@@ -0,0 +1,79 @@
import spacy
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
from text_complexity_analyzer_cm.coh_metrix_indices.connective_indices import ConnectiveIndices
from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
from text_complexity_analyzer_cm.coh_metrix_indices.lexical_diversity_indices import LexicalDiversityIndices
from text_complexity_analyzer_cm.coh_metrix_indices.syntactic_complexity_indices import SyntacticComplexityIndices
from text_complexity_analyzer_cm.coh_metrix_indices.word_information_indices import WordInformationIndices
from text_complexity_analyzer_cm.pipes.syllable_splitter import SyllableSplitter
from text_complexity_analyzer_cm.pipes.causal_connectives_tagger import CausalConnectivesTagger
from text_complexity_analyzer_cm.pipes.emphatics_tagger import EmphaticsTagger
from text_complexity_analyzer_cm.pipes.asks_tagger import AsksTagger
from text_complexity_analyzer_cm.pipes.polites_tagger import PolitesTagger
from text_complexity_analyzer_cm.pipes.logical_connectives_tagger import LogicalConnectivesTagger
from text_complexity_analyzer_cm.pipes.adversative_connectives_tagger import AdversativeConnectivesTagger
from text_complexity_analyzer_cm.pipes.temporal_connectives_tagger import TemporalConnectivesTagger
from text_complexity_analyzer_cm.pipes.additive_connectives_tagger import AdditiveConnectivesTagger
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter
from typing import Dict

class PERM:
    def __init__(self, language: str='en') -> None:
        self.language = language
        self._nlp = spacy.load(ACCEPTED_LANGUAGES[language], disable=['ner'])
        self._nlp.max_length = 3000000
        self._nlp.add_pipe('sentencizer')
        # Forward the selected language to each custom pipe instead of hard-coding 'en'.
        self._nlp.add_pipe('syllables', config={'language': language}, after='tagger')
        self._nlp.add_pipe('causal connective tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('temporal connective tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('emphatics tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('asks tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('polites tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('logical connective tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('adversative connective tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('additive connective tagger', config={'language': language}, after='tagger')
        self._nlp.add_pipe('feature counter', config={'language': language}, last=True)
        self._di = DescriptiveIndices(language=language, nlp=self._nlp)
        self._ci = ConnectiveIndices(language=language, nlp=self._nlp, descriptive_indices=self._di)
        self._ldi = LexicalDiversityIndices(language=language, nlp=self._nlp)
        self._sci = SyntacticComplexityIndices(language=language, nlp=self._nlp)
        self._wii = WordInformationIndices(language=language, nlp=self._nlp, descriptive_indices=self._di)

    def calculate_descriptive_indices_for_one_text(self, text: str, workers: int=-1) -> Dict:
        indices = {}

        indices['Paragraph Count'] = self._di.get_paragraph_count_from_text(text=text)
        indices['Sentence Count'] = self._di.get_sentence_count_from_text(text=text, workers=workers)
        indices['Word Count'] = self._di.get_word_count_from_text(text=text, workers=workers)
        length_of_paragraph = self._di.get_length_of_paragraphs(text=text, workers=workers)
        indices['Mean Length of Paragraphs'] = length_of_paragraph.mean
        length_of_sentences = self._di.get_length_of_sentences(text=text, workers=workers)
        indices['Mean Length of Sentences'] = length_of_sentences.mean
        length_of_words = self._di.get_length_of_words(text=text, workers=workers)
        indices['Mean Length of Words'] = length_of_words.mean
        syllables_per_word = self._di.get_syllables_per_word(text=text, workers=workers)
        indices['Mean Syllables of Words'] = syllables_per_word.mean

        return indices

    def calculate_word_information_indices_for_one_text(self, text: str, workers: int=-1, word_count: int=None) -> Dict:
        indices = {}

        indices['#Nouns'] = self._wii.get_noun_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Verbs'] = self._wii.get_verb_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Adjectives'] = self._wii.get_adjective_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Adverbs'] = self._wii.get_adverb_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Personal Pronouns'] = self._wii.get_personal_pronoun_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers1s'] = self._wii.get_personal_pronoun_first_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers1p'] = self._wii.get_personal_pronoun_first_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers2s'] = self._wii.get_personal_pronoun_second_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers2p'] = self._wii.get_personal_pronoun_second_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers3s'] = self._wii.get_personal_pronoun_third_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
        indices['#Pers3p'] = self._wii.get_personal_pronoun_third_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)

        return indices
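Usage note (illustrative, not part of the commit): an end-to-end sketch of driving the PERM facade; the dictionary keys follow the two methods above.

# Hypothetical usage sketch for the PERM facade.
from text_complexity_analyzer_cm.perm import PERM

perm = PERM(language='en')
text = 'First paragraph of the essay.\n\nA second paragraph, with a few more words in it.'
descriptive = perm.calculate_descriptive_indices_for_one_text(text)
word_info = perm.calculate_word_information_indices_for_one_text(text, word_count=descriptive['Word Count'])
print(descriptive['Sentence Count'], word_info['#Nouns'])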
text_complexity_analyzer_cm/pipes/__init__.py
ADDED
@@ -0,0 +1,3 @@
'''
This module contains the spaCy pipes used to calculate certain Coh-Metrix indices.
'''
text_complexity_analyzer_cm/pipes/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (285 Bytes).
text_complexity_analyzer_cm/pipes/__pycache__/additive_connectives_tagger.cpython-39.pyc
ADDED
Binary file (2.98 kB).
text_complexity_analyzer_cm/pipes/__pycache__/adversative_connectives_tagger.cpython-39.pyc
ADDED
Binary file (3.09 kB).
text_complexity_analyzer_cm/pipes/__pycache__/asks_tagger.cpython-39.pyc
ADDED
Binary file (2.06 kB).
text_complexity_analyzer_cm/pipes/__pycache__/causal_connectives_tagger.cpython-39.pyc
ADDED
Binary file (2.46 kB).
text_complexity_analyzer_cm/pipes/__pycache__/emphatics_tagger.cpython-39.pyc
ADDED
Binary file (2.29 kB).
text_complexity_analyzer_cm/pipes/__pycache__/feature_counter.cpython-39.pyc
ADDED
Binary file (1.42 kB).
text_complexity_analyzer_cm/pipes/__pycache__/logical_connectives_tagger.cpython-39.pyc
ADDED
Binary file (2.63 kB).
text_complexity_analyzer_cm/pipes/__pycache__/negative_expression_tagger.cpython-39.pyc
ADDED
Binary file (3.04 kB).
text_complexity_analyzer_cm/pipes/__pycache__/noun_phrase_tagger.cpython-39.pyc
ADDED
Binary file (2.61 kB).
text_complexity_analyzer_cm/pipes/__pycache__/polites_tagger.cpython-39.pyc
ADDED
Binary file (2.61 kB).
text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_adjacent_sentences_analyzer.cpython-39.pyc
ADDED
Binary file (2.25 kB).
text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_all_sentences_analyzer.cpython-39.pyc
ADDED
Binary file (2.18 kB).
text_complexity_analyzer_cm/pipes/__pycache__/syllable_splitter.cpython-39.pyc
ADDED
Binary file (1.25 kB).
text_complexity_analyzer_cm/pipes/__pycache__/temporal_connectives_tagger.cpython-39.pyc
ADDED
Binary file (2.54 kB).
text_complexity_analyzer_cm/pipes/__pycache__/verb_phrase_tagger.cpython-39.pyc
ADDED
Binary file (3.01 kB).
text_complexity_analyzer_cm/pipes/additive_connectives_tagger.py
ADDED
@@ -0,0 +1,59 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

additive_connectives_getter = lambda doc: [doc[span['start']:span['end']]
                                           for span in doc._.additive_connectives_span_indices]

Doc.set_extension('additive_connectives_span_indices', force=False, default=[])
Doc.set_extension('additive_connectives', force=False, getter=additive_connectives_getter)

@Language.factory('additive connective tagger')
class AdditiveConnectivesTagger:
    def __init__(self, name, nlp, language: str='en') -> None:
        '''
        This constructor will initialize the object that tags additive connectives.

        Parameters:
        nlp: The spaCy model to use this tagger with.
        language: The language that this pipeline will be used in.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self._connectives = []
        if language == 'en':  # Additive connectives for English
            self._connectives = ['then', 'moreover', 'after', 'from here on', 'even', 'next', 'to top it all', 'another', 'finally', 'of equal importance', 'is more', 'first', 'besides', 'gradually', 'too', 'last', 'equally important', 'third', 'as soon as', 'on the other hand', 'furthermore', 'to begin with', 'above', 'also', 'first ', 'likewise', 'in addition', 'second', 'inclusive', 'further', 'before', 'hence', 'in the end', 'last of all']
        else:  # Support for future languages
            # Spanish candidates kept for reference:
            # self._connectives = ['asimismo', 'igualmente' 'de igual modo', 'de igual manera', 'de igual forma', 'del mismo modo', 'de la misma manera', 'de la misma forma', 'en primer lugar', 'en segundo lugar', 'en tercer lugar', 'en último lugar', 'por su parte', 'por otro lado', 'además', 'encima', 'es más', 'por añadidura', 'incluso', 'inclusive', 'para colmo']
            pass

        for con in self._connectives:
            # spaCy v3 PhraseMatcher signature: add(key, list_of_patterns)
            self._matcher.add(con, [nlp(con)])

    def __call__(self, doc: Doc) -> Doc:
        '''
        This method will find all additive connectives and store them in an iterable.

        Parameters:
        doc(Doc): A spaCy document.
        '''
        matches = self._matcher(doc)
        additive_connectives_spans = [doc[start:end] for _, start, end in matches]

        doc._.additive_connectives_span_indices = [{'start': span.start,
                                                    'end': span.end,
                                                    'label': span.label}
                                                   for span in filter_spans(additive_connectives_spans)]  # Save the additive connectives found

        return doc
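Usage note (illustrative, not part of the commit): once the factory above is registered (importing the module is enough), the pipe can be added by name and its matches read back through the Doc extensions.

# Hypothetical usage sketch for the additive connective tagger.
import spacy
from text_complexity_analyzer_cm.pipes.additive_connectives_tagger import AdditiveConnectivesTagger  # registers the factory

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('additive connective tagger', config={'language': 'en'}, after='tagger')
doc = nlp('Moreover, the results also improved.')
print([span.text for span in doc._.additive_connectives])  # expected: ['Moreover', 'also']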
text_complexity_analyzer_cm/pipes/adversative_connectives_tagger.py
ADDED
@@ -0,0 +1,64 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

adversative_connectives_getter = lambda doc: [doc[span['start']:span['end']]
                                              for span in doc._.adversative_connectives_span_indices]

Doc.set_extension('adversative_connectives_span_indices', force=False, default=[])
Doc.set_extension('adversative_connectives', force=False, getter=adversative_connectives_getter)

@Language.factory('adversative connective tagger')
class AdversativeConnectivesTagger:
    '''
    This tagger finds all adversative connectives in a document. It must run after the 'tagger' pipeline component.
    '''
    name = 'adversative connective tagger'

    def __init__(self, name, nlp, language: str='en') -> None:
        '''
        This constructor will initialize the object that tags adversative connectives.

        Parameters:
        nlp: The spaCy model to use this tagger with.
        language: The language that this pipeline will be used in.

        Returns:
        None.
        '''
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self._connectives = []
        if language == 'en':  # Adversative connectives for English
            self._connectives = ['although', 'instead', 'and yet', 'nonetheless', 'nevertheless', 'rather', 'more than', 'yet', 'in fact', 'on the other hand', 'but yes', 'now well', 'on the contrary', 'however', 'in spite of this', 'conversely', 'still', 'less', 'actually', 'but rather', 'in contrast', 'but', 'except', 'only']
            # Spanish equivalents kept for reference:
            # self._connectives = ['pero', 'sino', 'no obstante', 'sino que', 'sin embargo', 'pero sí', 'aunque', 'menos', 'solo', 'excepto', 'salvo', 'más que', 'en cambio', 'ahora bien', 'más bien']
        else:  # Support for future languages
            pass

        for con in self._connectives:
            # spaCy v3 PhraseMatcher signature: add(key, list_of_patterns)
            self._matcher.add(con, [nlp(con)])

    def __call__(self, doc: Doc) -> Doc:
        '''
        This method will find all adversative connectives and store them in an iterable.

        Parameters:
        doc(Doc): A spaCy document.
        '''
        matches = self._matcher(doc)
        adversative_connectives_spans = [doc[start:end] for _, start, end in matches]

        doc._.adversative_connectives_span_indices = [{'start': span.start,
                                                       'end': span.end,
                                                       'label': span.label}
                                                      for span in filter_spans(adversative_connectives_spans)]  # Save the adversative connectives found

        return doc
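Because multi-word connectives overlap their single-word prefixes ('but rather' contains 'but'), these taggers pass all raw matches through spacy.util.filter_spans, which keeps only the longest non-overlapping span. A standalone sketch of that behavior:

# Standalone sketch of the filter_spans behavior the taggers rely on.
import spacy
from spacy.util import filter_spans

nlp = spacy.blank('en')
doc = nlp('but rather')
spans = [doc[0:1], doc[0:2]]                   # 'but' overlaps 'but rather'
print([s.text for s in filter_spans(spans)])   # ['but rather']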
text_complexity_analyzer_cm/pipes/asks_tagger.py
ADDED
@@ -0,0 +1,37 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

asks_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.asks_span_indices]

Doc.set_extension('asks_span_indices', force=False, default=[])
Doc.set_extension('asks', force=False, getter=asks_getter)

@Language.factory('asks tagger')
class AsksTagger:
    def __init__(self, name, nlp, language: str='en') -> None:
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self._connectives = []
        if language == 'en':  # Question words and the question mark
            self._connectives = ['?', 'what', 'how', 'who', 'when']
        else:  # Support for future languages
            pass

        for con in self._connectives:
            # spaCy v3 PhraseMatcher signature: add(key, list_of_patterns)
            self._matcher.add(con, [nlp(con)])

    def __call__(self, doc: Doc) -> Doc:
        matches = self._matcher(doc)
        asks_spans = [doc[start:end] for _, start, end in matches]
        doc._.asks_span_indices = [{'start': span.start, 'end': span.end, 'label': span.label} for span in filter_spans(asks_spans)]  # Save the question expressions found

        return doc
text_complexity_analyzer_cm/pipes/causal_connectives_tagger.py
ADDED
@@ -0,0 +1,34 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

causal_connectives_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.causal_connectives_span_indices]

Doc.set_extension('causal_connectives_span_indices', force=False, default=[])
Doc.set_extension('causal_connectives', force=False, getter=causal_connectives_getter)

@Language.factory('causal connective tagger')
class CausalConnectivesTagger:
    def __init__(self, name, nlp, language) -> None:
        if language not in ACCEPTED_LANGUAGES:  # Validate like the sibling taggers
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.causal_connectives = []
        if language == 'en':
            self.causal_connectives = ['to repeat, briefly', 'finally', 'therefore', 'with this in mind', 'in conclusion', 'because of this', 'because of', 'as a consequence', 'to this end', 'on the score of', 'then', 'because', 'so', 'later', 'hence', 'in short', 'for this reason', 'thus', 'so much that', 'accordingly', 'for', 'so then', 'as I have said', 'therefore', 'in summary', 'on the whole', 'consequently', 'for this purpose', 'since', 'as a result', 'to sum up', 'so that', 'as you can see']
        else:
            pass
        for con in self.causal_connectives:
            # spaCy v3 PhraseMatcher signature: add(key, list_of_patterns)
            self._matcher.add(con, [nlp(con)])

    def __call__(self, doc: Doc) -> Doc:
        matches = self._matcher(doc)
        causal_connectives_spans = [doc[start:end] for _, start, end in matches]
        doc._.causal_connectives_span_indices = [{'start': span.start,
                                                  'end': span.end,
                                                  'label': span.label}
                                                 for span in filter_spans(causal_connectives_spans)]  # Save the causal connectives found
        return doc
text_complexity_analyzer_cm/pipes/emphatics_tagger.py
ADDED
@@ -0,0 +1,42 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.util import filter_spans
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

emphatics_getter = lambda doc: [doc[span['start']:span['end']]
                                for span in doc._.emphatics_span_indices]

Doc.set_extension('emphatics_span_indices', force=False, default=[])
Doc.set_extension('emphatics', force=False, getter=emphatics_getter)

@Language.factory('emphatics tagger')
class EmphaticsTagger:
    def __init__(self, name, nlp, language) -> None:
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')

        self._language = language
        self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self._connectives = []
        if language == 'en':  # Emphatic words for English
            self._connectives = ['him', 'there', 'their', 'it', 'he', 'she', 'we', 'who', 'them', 'they', 'you', 'himself', 'her', 'whom', 'itself', 'somebody', 'something', 'us', 'anybody', 'herself', 'anyone', 'everybody', 'nobody', 'everyone', 'themselves', 'yourself', 'someone', 'his', 'yours']
        else:  # Support for future languages
            pass

        for con in self._connectives:
            # spaCy v3 PhraseMatcher signature: add(key, list_of_patterns)
            self._matcher.add(con, [nlp(con)])

    def __call__(self, doc: Doc) -> Doc:
        matches = self._matcher(doc)
        emphatics_spans = [doc[start:end] for _, start, end in matches]

        doc._.emphatics_span_indices = [{'start': span.start,
                                         'end': span.end,
                                         'label': span.label}
                                        for span in filter_spans(emphatics_spans)]  # Save the emphatic words found

        return doc
text_complexity_analyzer_cm/pipes/feature_counter.py
ADDED
@@ -0,0 +1,32 @@
from spacy.tokens import Doc
from spacy.language import Language

from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

Doc.set_extension('feature_count', default=None, force=True)

@Language.factory('feature counter')
class FeatureCounter:

    def __init__(self, nlp, name, language) -> None:
        if language not in ACCEPTED_LANGUAGES:
            raise ValueError(f'Language {language} is not supported yet')
        self.language = language
        self.counter_function = None

    def __call__(self, doc: Doc) -> Doc:
        '''
        This method applies 'counter_function' to the document; each caller assigns the function that performs its specific counting.

        Parameters:
        doc(Doc): A spaCy document.
        '''
        if self.counter_function is None:
            raise AttributeError('No function to count features was provided.')
        doc._.feature_count = self.counter_function(doc)

        return doc
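Usage note (illustrative, not part of the commit): the feature counter is deliberately generic. Callers assign a counting function, run the pipeline, and read the result from doc._.feature_count, which is exactly how the index classes above use it.

# Hypothetical usage sketch for the feature counter pipe.
import spacy
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter  # registers the factory

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('feature counter', config={'language': 'en'}, last=True)
nlp.get_pipe('feature counter').counter_function = lambda doc: sum(1 for t in doc if t.pos_ == 'NOUN')
doc = nlp('The cat sat on the mat.')
print(doc._.feature_count)  # 2 ('cat' and 'mat')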