seduerr committed on
Commit 900d7f1
1 Parent(s): 19b9c2d

init commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. __init__.py +0 -0
  3. app.py +24 -0
  4. requirements.txt +3 -0
  5. text_complexity_analyzer_cm/.DS_Store +0 -0
  6. text_complexity_analyzer_cm/__init__.py +3 -0
  7. text_complexity_analyzer_cm/coh_metrix_indices/.DS_Store +0 -0
  8. text_complexity_analyzer_cm/coh_metrix_indices/__init__.py +3 -0
  9. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/__init__.cpython-39.pyc +0 -0
  10. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/connective_indices.cpython-39.pyc +0 -0
  11. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/descriptive_indices.cpython-39.pyc +0 -0
  12. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/lexical_diversity_indices.cpython-39.pyc +0 -0
  13. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/readability_indices.cpython-39.pyc +0 -0
  14. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/referential_cohesion_indices.cpython-39.pyc +0 -0
  15. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_complexity_indices.cpython-39.pyc +0 -0
  16. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_pattern_density_indices.cpython-39.pyc +0 -0
  17. text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/word_information_indices.cpython-39.pyc +0 -0
  18. text_complexity_analyzer_cm/coh_metrix_indices/connective_indices.py +77 -0
  19. text_complexity_analyzer_cm/coh_metrix_indices/descriptive_indices.py +123 -0
  20. text_complexity_analyzer_cm/coh_metrix_indices/lexical_diversity_indices.py +37 -0
  21. text_complexity_analyzer_cm/coh_metrix_indices/readability_indices.py +66 -0
  22. text_complexity_analyzer_cm/coh_metrix_indices/referential_cohesion_indices.py +402 -0
  23. text_complexity_analyzer_cm/coh_metrix_indices/syntactic_complexity_indices.py +56 -0
  24. text_complexity_analyzer_cm/coh_metrix_indices/syntactic_pattern_density_indices.py +126 -0
  25. text_complexity_analyzer_cm/coh_metrix_indices/word_information_indices.py +99 -0
  26. text_complexity_analyzer_cm/constants.py +22 -0
  27. text_complexity_analyzer_cm/perm.py +79 -0
  28. text_complexity_analyzer_cm/pipes/__init__.py +3 -0
  29. text_complexity_analyzer_cm/pipes/__pycache__/__init__.cpython-39.pyc +0 -0
  30. text_complexity_analyzer_cm/pipes/__pycache__/additive_connectives_tagger.cpython-39.pyc +0 -0
  31. text_complexity_analyzer_cm/pipes/__pycache__/adversative_connectives_tagger.cpython-39.pyc +0 -0
  32. text_complexity_analyzer_cm/pipes/__pycache__/asks_tagger.cpython-39.pyc +0 -0
  33. text_complexity_analyzer_cm/pipes/__pycache__/causal_connectives_tagger.cpython-39.pyc +0 -0
  34. text_complexity_analyzer_cm/pipes/__pycache__/emphatics_tagger.cpython-39.pyc +0 -0
  35. text_complexity_analyzer_cm/pipes/__pycache__/feature_counter.cpython-39.pyc +0 -0
  36. text_complexity_analyzer_cm/pipes/__pycache__/logical_connectives_tagger.cpython-39.pyc +0 -0
  37. text_complexity_analyzer_cm/pipes/__pycache__/negative_expression_tagger.cpython-39.pyc +0 -0
  38. text_complexity_analyzer_cm/pipes/__pycache__/noun_phrase_tagger.cpython-39.pyc +0 -0
  39. text_complexity_analyzer_cm/pipes/__pycache__/polites_tagger.cpython-39.pyc +0 -0
  40. text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_adjacent_sentences_analyzer.cpython-39.pyc +0 -0
  41. text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_all_sentences_analyzer.cpython-39.pyc +0 -0
  42. text_complexity_analyzer_cm/pipes/__pycache__/syllable_splitter.cpython-39.pyc +0 -0
  43. text_complexity_analyzer_cm/pipes/__pycache__/temporal_connectives_tagger.cpython-39.pyc +0 -0
  44. text_complexity_analyzer_cm/pipes/__pycache__/verb_phrase_tagger.cpython-39.pyc +0 -0
  45. text_complexity_analyzer_cm/pipes/additive_connectives_tagger.py +59 -0
  46. text_complexity_analyzer_cm/pipes/adversative_connectives_tagger.py +64 -0
  47. text_complexity_analyzer_cm/pipes/asks_tagger.py +37 -0
  48. text_complexity_analyzer_cm/pipes/causal_connectives_tagger.py +34 -0
  49. text_complexity_analyzer_cm/pipes/emphatics_tagger.py +42 -0
  50. text_complexity_analyzer_cm/pipes/feature_counter.py +32 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,24 @@
+ import gradio as gr
+ from text_complexity_analyzer_cm.perm import PERM
+
+
+ def predict(text):
+     perm = PERM('en')
+     analytics = []
+     analytics.append(perm.calculate_word_information_indices_for_one_text(text, workers=-1))
+     analytics.append(perm.calculate_descriptive_indices_for_one_text(text, workers=-1))
+     return analytics
+
+
+ title = "Get the Analytics of your Message"
+
+ iface = gr.Interface(fn=predict,
+                      inputs=gr.inputs.Textbox(
+                          lines=3, label='Insert any given text to get textual analytics.'),
+                      outputs="text",
+                      title=title,
+                      theme="huggingface",
+                      examples=[
+                          'We are going to analyze this text for persuasiveness, word information, and descriptive information.']
+                      )
+ iface.launch()
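For reference, the two PERM entry points wired into the Gradio callback above can also be called directly in Python. A minimal sketch, assuming the package is importable and the en_core_web_md model is installed (the exact return type of the two calls is defined in perm.py, which is not shown in this view):

from text_complexity_analyzer_cm.perm import PERM

# Build the analyzer once and reuse it; predict() above re-creates it on every request.
perm = PERM('en')
sample = 'This is a short sample paragraph. It has two sentences.'
# Same two calls that app.py appends to its analytics list.
word_info = perm.calculate_word_information_indices_for_one_text(sample, workers=-1)
descriptive = perm.calculate_descriptive_indices_for_one_text(sample, workers=-1)
print(word_info)
print(descriptive)

Instantiating PERM once outside the request handler avoids reloading the spaCy pipeline on every prediction, which the predict() function above does per call.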
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ spacy
+ Pyphen
+ # en_core_web_md is not a pip requirement; it is installed separately with: python3 -m spacy download en_core_web_md
text_complexity_analyzer_cm/.DS_Store ADDED
Binary file (8.2 kB).
 
text_complexity_analyzer_cm/__init__.py ADDED
@@ -0,0 +1,3 @@
+ '''
+ This module contains the classes and functions that handle the processing of text using the Coh-Metrix indices.
+ '''
text_complexity_analyzer_cm/coh_metrix_indices/.DS_Store ADDED
Binary file (6.15 kB).
 
text_complexity_analyzer_cm/coh_metrix_indices/__init__.py ADDED
@@ -0,0 +1,3 @@
+ '''
+ This module contains the classes that calculate the Coh-Metrix indices.
+ '''
text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (285 Bytes).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/connective_indices.cpython-39.pyc ADDED
Binary file (8.01 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/descriptive_indices.cpython-39.pyc ADDED
Binary file (8.17 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/lexical_diversity_indices.cpython-39.pyc ADDED
Binary file (2.14 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/readability_indices.cpython-39.pyc ADDED
Binary file (3.28 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/referential_cohesion_indices.cpython-39.pyc ADDED
Binary file (17.7 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_complexity_indices.cpython-39.pyc ADDED
Binary file (3.98 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/syntactic_pattern_density_indices.cpython-39.pyc ADDED
Binary file (7.2 kB).

text_complexity_analyzer_cm/coh_metrix_indices/__pycache__/word_information_indices.cpython-39.pyc ADDED
Binary file (11.8 kB).
text_complexity_analyzer_cm/coh_metrix_indices/connective_indices.py ADDED
@@ -0,0 +1,77 @@
+ import multiprocessing
+ import pyphen
+ import spacy
+ import string
+
+ from typing import Callable
+ from typing import List
+ from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_sentences
+
+ class ConnectiveIndices:
+     def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
+         self.language = language
+         self._nlp = nlp
+         self._incidence = 1
+         if descriptive_indices is None:
+             self._di = DescriptiveIndices(language)
+         else:
+             self._di = descriptive_indices
+
+     def _get_connectives_incidence(self, text: str, disable_pipeline: List, count_connectives_function: Callable, word_count: int=None, workers: int=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         pc = len(paragraphs)
+         threads = 1
+         wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
+         self._nlp.get_pipe('feature counter').counter_function = count_connectives_function
+         connectives = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))
+         return connectives
+
+     def get_causal_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['causal connective tagger', 'feature counter']]
+         causal_connectives_counter = lambda doc: len(doc._.causal_connectives)
+         result = self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=causal_connectives_counter, workers=workers)
+         return result
+
+     def get_temporal_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['temporal connective tagger', 'feature counter']]
+         temporal_connectives_counter = lambda doc: len(doc._.temporal_connectives)
+         result = self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=temporal_connectives_counter, workers=workers)
+         return result
+
+     def get_exemplifications_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['exemplifications tagger', 'tagger', 'feature counter']]
+         exemplifications_counter = lambda doc: len(doc._.exemplifications)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=exemplifications_counter, workers=workers)
+
+     def get_emphatics_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['emphatics tagger', 'tagger', 'feature counter']]
+         emphatics_counter = lambda doc: len(doc._.emphatics)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=emphatics_counter, workers=workers)
+
+     def get_asks_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['asks tagger', 'tagger', 'feature counter']]
+         asks_counter = lambda doc: len(doc._.asks)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=asks_counter, workers=workers)
+
+     def get_polites_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['polites tagger', 'tagger', 'feature counter']]
+         polites_counter = lambda doc: len(doc._.polites)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=polites_counter, workers=workers)
+
+     def get_logical_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['logical connective tagger', 'tagger', 'feature counter']]
+         logical_connectives_counter = lambda doc: len(doc._.logical_connectives)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=logical_connectives_counter, workers=workers)
+
+     def get_adversative_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['adversative connective tagger', 'tagger', 'feature counter']]
+         adversative_connectives_counter = lambda doc: len(doc._.adversative_connectives)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=adversative_connectives_counter, workers=workers)
+
+     def get_additive_connectives_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['additive connective tagger', 'tagger', 'feature counter']]
+         additive_connectives_counter = lambda doc: len(doc._.additive_connectives)
+         return self._get_connectives_incidence(text, disable_pipeline=disable_pipeline, count_connectives_function=additive_connectives_counter, workers=workers)
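The taggers and the 'feature counter' pipe that these index classes disable and configure live under text_complexity_analyzer_cm/pipes, which this truncated view does not show. As an illustration only, and not the package's actual implementation, a configurable counting pipe of this kind can be registered in spaCy 3 roughly as follows (all names here are hypothetical):

import spacy
from spacy.language import Language
from spacy.tokens import Doc

# Custom Doc attribute that the counting pipe writes and the index classes read.
if not Doc.has_extension('feature_count'):
    Doc.set_extension('feature_count', default=0)

class FeatureCounterDemo:
    '''Applies a swappable counter_function to every Doc it processes.'''
    def __init__(self):
        self.counter_function = lambda doc: 0

    def __call__(self, doc):
        doc._.feature_count = self.counter_function(doc)
        return doc

@Language.factory('feature_counter_demo')
def create_feature_counter(nlp, name):
    return FeatureCounterDemo()

nlp = spacy.load('en_core_web_md')
nlp.add_pipe('feature_counter_demo', last=True)
# Swap the counter at call time, the same way _get_connectives_incidence assigns counter_function.
nlp.get_pipe('feature_counter_demo').counter_function = lambda doc: sum(token.pos_ == 'NOUN' for token in doc)
print(nlp('Dogs chase cats in the park.')._.feature_count)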
text_complexity_analyzer_cm/coh_metrix_indices/descriptive_indices.py ADDED
@@ -0,0 +1,123 @@
+ import multiprocessing
+ import spacy
+ import statistics
+ import string
+
+ from typing import Callable
+ from typing import List
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES, LANGUAGES_DICTIONARY_PYPHEN
+ from text_complexity_analyzer_cm.pipes.syllable_splitter import SyllableSplitter
+ from text_complexity_analyzer_cm.utils.statistics_results import StatisticsResults
+ from text_complexity_analyzer_cm.utils.utils import is_word
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_sentences
+ from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences
+
+
+ class DescriptiveIndices:
+     def __init__(self, nlp, language: str='en') -> None:
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self.language = language
+         self._nlp = nlp
+
+     def get_paragraph_count_from_text(self, text: str) -> int:
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+
+         return len(split_text_into_paragraphs(text))
+
+     def get_sentence_count_from_text(self, text: str, workers: int=-1) -> int:
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif workers == 0 or workers < -1:
+             raise ValueError('Workers must be -1 or a positive integer.')
+         else:
+             paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
+             threads = 1  # multiprocessing.cpu_count() if workers == -1 else workers
+             disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]
+
+             sentence_counter = lambda doc: sum(1 for _ in doc.sents)
+             self._nlp.get_pipe('feature counter').counter_function = sentence_counter
+
+             sentences = sum(doc._.feature_count
+                             for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))
+             return sentences
+
+     def get_word_count_from_text(self, text: str, workers: int=-1) -> int:
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif workers == 0 or workers < -1:
+             raise ValueError('Workers must be -1 or a positive integer.')
+         else:
+             paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
+             threads = 1  # multiprocessing.cpu_count() if workers == -1 else workers
+             word_counter = lambda doc: sum(1 for token in doc if is_word(token))
+             disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'feature counter']
+             self._nlp.get_pipe('feature counter').counter_function = word_counter
+
+             total_words = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))
+
+             return total_words
+
+     def _get_mean_std_of_metric(self, text: str, disable_pipeline: List, counter_function: Callable, statistic_type: str='all', workers=-1) -> StatisticsResults:
+         paragraphs = split_text_into_paragraphs(text)
+         threads = 1
+         self._nlp.get_pipe('feature counter').counter_function = counter_function
+         counter = []
+
+         for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
+             current_result = doc._.feature_count  # Find the values to add to the counter
+             if not isinstance(current_result, list):  # Add any single number directly
+                 counter.append(current_result)
+             else:
+                 if len(current_result) > 0:  # Only add values if it's not an empty list
+                     counter.extend(current_result)
+
+         stat_results = StatisticsResults()
+         if statistic_type in ['std', 'all']:
+             stat_results.std = statistics.pstdev(counter)
+
+         if statistic_type in ['mean', 'all']:
+             stat_results.mean = statistics.mean(counter)
+
+         return stat_results
+
+     def get_length_of_paragraphs(self, text: str, workers: int=-1) -> StatisticsResults:
+         count_length_of_paragraphs = lambda doc: sum(1 for _ in split_doc_into_sentences(doc))
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]
+         return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_length_of_paragraphs, statistic_type='all', workers=workers)
+
+     def get_length_of_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
+         count_length_of_sentences = lambda doc: [len([1 for token in sentence
+                                                       if is_word(token)])
+                                                  for sentence in doc.sents]
+
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['sentencizer', 'feature counter']]
+
+         return self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_length_of_sentences, statistic_type='all', workers=workers)
+
+     def get_length_of_words(self, text: str, workers: int=-1) -> StatisticsResults:
+         count_letters_per_word = lambda doc: [len(token)
+                                               for token in doc
+                                               if is_word(token)]
+
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'feature counter']
+
+         result = self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_letters_per_word, statistic_type='all', workers=workers)
+
+         return result
+
+     def get_syllables_per_word(self, text: str, workers=-1) -> StatisticsResults:
+         count_syllables_per_word = lambda doc: [len(token._.syllables)
+                                                 for token in doc
+                                                 if is_word(token) and token._.syllables is not None]
+
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['syllables', 'feature counter']]
+
+         result = self._get_mean_std_of_metric(text, disable_pipeline=disable_pipeline, counter_function=count_syllables_per_word, statistic_type='all', workers=workers)
+
+         return result
text_complexity_analyzer_cm/coh_metrix_indices/lexical_diversity_indices.py ADDED
@@ -0,0 +1,37 @@
+ import multiprocessing
+ import spacy
+ import string
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.utils import is_content_word
+ from text_complexity_analyzer_cm.utils.utils import is_word
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+
+ class LexicalDiversityIndices:
+     def __init__(self, nlp, language: str='en') -> None:
+         self.language = language
+         self._nlp = nlp
+
+     def get_type_token_ratio_between_all_words(self, text: str, workers=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         threads = 1
+         tokens = []
+         disable_pipeline = []
+
+         tokens = [token.text.lower()
+                   for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads)
+                   for token in doc
+                   if is_word(token)]
+
+         return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
+
+     def get_type_token_ratio_of_content_words(self, text: str, workers=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         threads = 1
+         tokens = []
+         disable_pipeline = []
+         tokens = [token.text.lower()
+                   for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads)
+                   for token in doc
+                   if is_content_word(token)]
+         return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
text_complexity_analyzer_cm/coh_metrix_indices/readability_indices.py ADDED
@@ -0,0 +1,66 @@
+ import multiprocessing
+
+ import spacy
+
+ from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+
+ class ReadabilityIndices:
+     '''
+     This class will handle all operations to find the readability indices of a text according to Coh-Metrix.
+     '''
+
+     def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
+         '''
+         The constructor will initialize this object that calculates the readability indices for a specific language of those that are available.
+
+         Parameters:
+         nlp: The spacy model that corresponds to a language.
+         language(str): The language that the texts to process will have.
+         descriptive_indices(DescriptiveIndices): The class that calculates the descriptive indices of a text in a certain language.
+
+         Returns:
+         None.
+         '''
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+         elif descriptive_indices is not None and descriptive_indices.language != language:
+             raise ValueError(f'The descriptive indices analyzer must be of the same language as the readability analyzer.')
+
+         self.language = language
+         self._nlp = nlp
+
+         if descriptive_indices is None:  # Assign the descriptive indices to an attribute
+             self._di = DescriptiveIndices(language=language, nlp=nlp)
+         else:
+             self._di = descriptive_indices
+
+     def calculate_fernandez_huertas_grade_level(self, text: str=None, mean_syllables_per_word: int=None, mean_words_per_sentence: int=None, workers: int=-1) -> float:
+         '''
+         This function obtains the Fernández-Huertas readability index for a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         word_count(int): The amount of words in the text.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+         mean_syllables_per_word(int): The mean of syllables per word in the text.
+         mean_words_per_sentence(int): The mean amount of words per sentence in the text.
+
+         Returns:
+         float: The Fernández-Huertas readability index for a text.
+         '''
+         if self.language != 'en':
+             raise ValueError('This readability index was designed for Spanish.')
+         elif text is not None and len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif text is None and (mean_syllables_per_word is None or mean_words_per_sentence is None):
+             raise ValueError('If there\'s no text, then you must pass mean_syllables_per_word and mean_words_per_sentence at the same time.')
+         elif workers == 0 or workers < -1:
+             raise ValueError('Workers must be -1 or a positive integer.')
+         else:
+             threads = multiprocessing.cpu_count() if workers == -1 else workers
+             mspw = mean_syllables_per_word if mean_syllables_per_word is not None else self._di.get_mean_of_syllables_per_word(text=text, workers=threads)
+             mwps = mean_words_per_sentence if mean_words_per_sentence is not None else self._di.get_mean_of_length_of_sentences(text=text, workers=threads)
+
+             return 206.84 - 0.6 * mspw - 1.02 * mwps
text_complexity_analyzer_cm/coh_metrix_indices/referential_cohesion_indices.py ADDED
@@ -0,0 +1,402 @@
+ import multiprocessing
+ import spacy
+ import statistics
+
+ from itertools import combinations
+ from spacy.tokens import Span
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.statistics_results import StatisticsResults
+ from text_complexity_analyzer_cm.utils.utils import is_word
+ from text_complexity_analyzer_cm.utils.utils import is_content_word
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+ from typing import Callable
+ from typing import List
+
+
+ class ReferentialCohesionIndices:
+     '''
+     This class will handle all operations to find the referential cohesion indices of a text according to Coh-Metrix.
+     '''
+     # TODO: Implement multiprocessing
+     def __init__(self, nlp, language: str='en') -> None:
+         '''
+         The constructor will initialize this object that calculates the referential cohesion indices for a specific language of those that are available.
+
+         Parameters:
+         nlp: The spacy model that corresponds to a language.
+         language(str): The language that the texts to process will have.
+
+         Returns:
+         None.
+         '''
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self.language = language
+         self._nlp = nlp
+
+     def _calculate_overlap_for_adjacent_sentences(self, text: str, disable_pipeline: List, sentence_analyzer: Callable, statistic_type: str='mean', workers: int=-1) -> StatisticsResults:
+         '''
+         This method calculates the overlap for adjacent sentences in a text. MULTIPROCESSING STILL NOT IMPLEMENTED.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         disable_pipeline(List): The pipeline elements to be disabled.
+         sentence_analyzer(Callable): The function that analyzes sentences to check cohesion.
+         statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         StatisticsResults: The standard deviation and mean of the overlap.
+         '''
+         # TODO MULTIPROCESSING. WORKERS IS JUST A PLACEHOLDER
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif statistic_type not in ['mean', 'std', 'all']:
+             raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
+         else:
+             self._nlp.get_pipe('referential cohesion adjacent sentences analyzer').sentence_analyzer = sentence_analyzer
+             doc = self._nlp(text, disable=disable_pipeline)
+             stat_results = StatisticsResults()  # Create empty container
+
+             if len(doc._.referential_cohesion_adjacent) == 0:
+                 return stat_results
+             else:
+                 if statistic_type in ['mean', 'all']:
+                     stat_results.mean = statistics.mean(doc._.referential_cohesion_adjacent)
+
+                 if statistic_type in ['std', 'all']:
+                     stat_results.std = statistics.pstdev(doc._.referential_cohesion_adjacent)
+
+                 return stat_results
+
+     def _calculate_overlap_for_all_sentences(self, text: str, disable_pipeline: List, sentence_analyzer: Callable, statistic_type: str='all', workers: int=-1) -> StatisticsResults:
+         '''
+         This method calculates the overlap for all sentences in a text. MULTIPROCESSING STILL NOT IMPLEMENTED.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         disable_pipeline(List): The pipeline elements to be disabled.
+         sentence_analyzer(Callable): The function that analyzes sentences to check cohesion.
+         statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         StatisticsResults: The standard deviation and mean of the overlap.
+         '''
+         # TODO MULTIPROCESSING. WORKERS IS JUST A PLACEHOLDER.
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif statistic_type not in ['mean', 'std', 'all']:
+             raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
+         else:
+             self._nlp.get_pipe('referential cohesion all sentences analyzer').sentence_analyzer = sentence_analyzer
+             doc = self._nlp(text, disable=disable_pipeline)
+             stat_results = StatisticsResults()  # Create empty container
+
+             if len(doc._.referential_cohesion_all) == 0:
+                 return stat_results
+             else:
+                 if statistic_type in ['mean', 'all']:
+                     stat_results.mean = statistics.mean(doc._.referential_cohesion_all)
+
+                 if statistic_type in ['std', 'all']:
+                     stat_results.std = statistics.pstdev(doc._.referential_cohesion_all)
+
+                 return stat_results
+
+     def get_noun_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the noun overlap for adjacent sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean noun overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]
+         return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_noun_overlap, statistic_type='mean').mean
+
+     def get_noun_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the noun overlap for all sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean noun overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]
+         return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_noun_overlap, statistic_type='mean').mean
+
+     def get_argument_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the argument overlap for adjacent sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean argument overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]
+         return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_argument_overlap, statistic_type='mean').mean
+
+     def get_argument_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the argument overlap for all sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean argument overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]
+         return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_argument_overlap, statistic_type='mean').mean
+
+     def get_stem_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the stem overlap for adjacent sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean stem overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]
+         return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_stem_overlap, statistic_type='mean').mean
+
+     def get_stem_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the stem overlap for all sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean stem overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]
+         return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_stem_overlap, statistic_type='mean').mean
+
+     def get_content_word_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
+         '''
+         This method calculates the mean and standard deviation of the content word overlap for adjacent sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         StatisticsResults: The mean and standard deviation of the content word overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]
+         return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_content_word_overlap, statistic_type='all')
+
+     def get_content_word_overlap_all_sentences(self, text: str, workers: int=-1) -> StatisticsResults:
+         '''
+         This method calculates the mean and standard deviation of the content word overlap for all sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         StatisticsResults: The mean and standard deviation of the content word overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]
+         return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_content_word_overlap, statistic_type='all')
+
+     def get_anaphore_overlap_adjacent_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the mean of the anaphora overlap for adjacent sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean of the anaphora overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion adjacent sentences analyzer']]
+         return self._calculate_overlap_for_adjacent_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_anaphore_overlap, statistic_type='all').mean
+
+     def get_anaphore_overlap_all_sentences(self, text: str, workers: int=-1) -> float:
+         '''
+         This method calculates the mean of the anaphora overlap for all sentences in a text.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         float: The mean of the anaphora overlap.
+         '''
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['sentencizer', 'tagger', 'referential cohesion all sentences analyzer']]
+         return self._calculate_overlap_for_all_sentences(text=text, workers=workers, disable_pipeline=disable_pipeline, sentence_analyzer=analyze_anaphore_overlap, statistic_type='all').mean
+
+ def analyze_noun_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
+     '''
+     This function analyzes whether or not there's noun overlap between two sentences for a language.
+
+     Parameters:
+     prev_sentence(Span): The previous sentence to analyze.
+     cur_sentence(Span): The current sentence to analyze.
+     language(str): The language of the sentences.
+
+     Returns:
+     int: 1 if there's overlap between the two sentences, 0 otherwise.
+     '''
+     # Place the tokens in a dictionary for search efficiency
+     prev_sentence_noun_tokens = {token.text.lower(): None
+                                  for token in prev_sentence
+                                  if is_word(token) and token.pos_ == 'NOUN'}
+
+     for token in cur_sentence:
+         if language == 'en':
+             if is_word(token) and token.pos_ == 'NOUN' and token.text.lower() in prev_sentence_noun_tokens:
+                 return 1  # There's cohesion
+
+     return 0  # No cohesion
+
+
+ def analyze_argument_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
+     '''
+     This function analyzes whether or not there's argument overlap between two sentences.
+
+     Parameters:
+     prev_sentence(Span): The previous sentence to analyze.
+     cur_sentence(Span): The current sentence to analyze.
+     language(str): The language of the sentences.
+
+     Returns:
+     int: 1 if there's overlap between the two sentences, 0 otherwise.
+     '''
+     # Place the tokens in a dictionary for search efficiency
+     prev_sentence_noun_tokens = {token.lemma_.lower(): None
+                                  for token in prev_sentence
+                                  if is_word(token) and token.pos_ == 'NOUN'}
+
+     prev_sentence_personal_pronouns_tokens = {token.text.lower(): None
+                                               for token in prev_sentence
+                                               if is_word(token) and 'PronType=Prs' in token.tag_}
+
+     for token in cur_sentence:  # Iterate every token of the current sentence
+         if language == 'en':
+             if is_word(token) and token.pos_ == 'NOUN' and token.lemma_.lower() in prev_sentence_noun_tokens:
+                 return 1  # There's cohesion by noun lemma
+
+             if is_word(token) and 'PronType=Prs' in token.tag_ and token.text.lower() in prev_sentence_personal_pronouns_tokens:
+                 return 1  # There's cohesion by personal pronoun
+
+     return 0  # No cohesion
+
+
+ def analyze_stem_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
+     '''
+     This function analyzes whether or not there's stem overlap between two sentences.
+
+     Parameters:
+     prev_sentence(Span): The previous sentence to analyze.
+     cur_sentence(Span): The current sentence to analyze.
+     language(str): The language of the sentences.
+
+     Returns:
+     int: 1 if there's overlap between the two sentences, 0 otherwise.
+     '''
+     # Place the tokens in a dictionary for search efficiency
+     prev_sentence_content_stem_tokens = {token.lemma_.lower(): None
+                                          for token in prev_sentence
+                                          if is_content_word(token)}
+
+     for token in cur_sentence:
+         if language == 'en':
+             if is_word(token) and token.pos_ in ['NOUN', 'PROPN'] and token.lemma_.lower() in prev_sentence_content_stem_tokens:
+                 return 1  # There's cohesion
+
+     return 0  # No cohesion
+
+
+ def analyze_content_word_overlap(prev_sentence: Span, cur_sentence: Span, language='en') -> float:
+     '''
+     This function calculates the proportional content word overlap between two sentences.
+
+     Parameters:
+     prev_sentence(Span): The previous sentence to analyze.
+     cur_sentence(Span): The current sentence to analyze.
+     language(str): The language of the sentences.
+
+     Returns:
+     float: Proportion of tokens that overlap between the current and previous sentences.
+     '''
+     total_tokens = len([token for token in prev_sentence if is_content_word(token)]) + len([token for token in cur_sentence if is_content_word(token)])
+
+     if total_tokens == 0:  # Nothing to compute
+         return 0
+     else:
+         prev_sentence_content_words_tokens = {token.text.lower(): None
+                                               for token in prev_sentence
+                                               if is_content_word(token)}
+         matches = 0  # Match counter
+
+         for token in cur_sentence:
+             if language == 'en':
+                 if is_content_word(token) and token.text.lower() in prev_sentence_content_words_tokens:
+                     matches += 2  # There's cohesion
+
+         return matches / total_tokens
+
+
+ def analyze_anaphore_overlap(prev_sentence: Span, cur_sentence: Span, language: str='en') -> int:
+     '''
+     This function analyzes whether or not there's anaphora overlap between two sentences.
+
+     Parameters:
+     prev_sentence(Span): The previous sentence to analyze.
+     cur_sentence(Span): The current sentence to analyze.
+     language(str): The language of the sentences.
+
+     Returns:
+     int: 1 if there's overlap between the two sentences, 0 otherwise.
+     '''
+     # Place the tokens in a dictionary for search efficiency
+     prev_sentence_pronoun_tokens = {token.text.lower(): None
+                                     for token in prev_sentence
+                                     if is_word(token) and token.pos_ == 'PRON'}
+
+     for token in cur_sentence:
+         if language == 'en':
+             if is_word(token) and token.pos_ == 'PRON' and token.text.lower() in prev_sentence_pronoun_tokens:
+                 return 1  # There's cohesion
+
+     return 0  # No cohesion
text_complexity_analyzer_cm/coh_metrix_indices/syntactic_complexity_indices.py ADDED
@@ -0,0 +1,56 @@
+ import multiprocessing
+ from typing import Tuple
+
+ import spacy
+ import statistics
+
+ from spacy.tokens import Span
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.utils import is_word
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+ from text_complexity_analyzer_cm.utils.utils import split_doc_into_sentences
+
+
+ class SyntacticComplexityIndices:
+     def __init__(self, nlp, language: str='en') -> None:
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self.language = language
+         self._nlp = nlp
+
+     def get_mean_number_of_modifiers_per_noun_phrase(self, text: str, workers: int=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         threads = 1
+         modifiers_per_noun_phrase = []
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['parser', 'tagger', 'noun phrase tagger', 'feature counter']]
+         modifiers_counter = lambda doc: [sum(1 for token in nph if token.pos_ == 'ADJ')
+                                          for nph in doc._.noun_phrases]
+         self._nlp.get_pipe('feature counter').counter_function = modifiers_counter
+         modifiers_per_noun_phrase = []
+
+         for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
+             modifiers_per_noun_phrase.extend(doc._.feature_count)
+
+         return statistics.mean(modifiers_per_noun_phrase)
+
+     def get_mean_number_of_words_before_main_verb(self, text: str, workers: int=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         threads = 1
+         words_before_main_verb = []
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['feature counter', 'sentencizer']]
+         words_before_main_verb_counter = lambda doc: [amount_of_words_before_main_verb(s) for s in split_doc_into_sentences(doc)]
+         self._nlp.get_pipe('feature counter').counter_function = words_before_main_verb_counter
+         for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads):
+             words_before_main_verb.extend(doc._.feature_count)
+         return statistics.mean(words_before_main_verb)
+
+ def amount_of_words_before_main_verb(sentence: Span) -> int:
+     left_words = []
+     for token in sentence:
+         if token.pos_ in ['VERB', 'AUX'] and token.dep_ == 'ROOT':
+             break
+         else:
+             if is_word(token):
+                 left_words.append(token.text)
+     return len(left_words)
text_complexity_analyzer_cm/coh_metrix_indices/syntactic_pattern_density_indices.py ADDED
@@ -0,0 +1,126 @@
+ import multiprocessing
+
+ import spacy
+
+ from typing import Callable
+ from typing import List
+ from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+
+
+ class SyntacticPatternDensityIndices:
+     '''
+     This class will handle all operations to find the syntactic pattern density indices of a text according to Coh-Metrix.
+     '''
+
+     def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
+         '''
+         The constructor will initialize this object that calculates the syntactic pattern density indices for a specific language of those that are available.
+
+         Parameters:
+         nlp: The spacy model that corresponds to a language.
+         language(str): The language that the texts to process will have.
+         descriptive_indices(DescriptiveIndices): The class that calculates the descriptive indices of a text in a certain language.
+
+         Returns:
+         None.
+         '''
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+         elif descriptive_indices is not None and descriptive_indices.language != language:
+             raise ValueError(f'The descriptive indices analyzer must be of the same language as the syntactic pattern density analyzer.')
+
+         self.language = language
+         self._nlp = nlp
+         self._incidence = 1000
+
+         if descriptive_indices is None:  # Assign the descriptive indices to an attribute
+             self._di = DescriptiveIndices(language=language, nlp=nlp)
+         else:
+             self._di = descriptive_indices
+
+     def _get_syntactic_pattern_density(self, text: str, disable_pipeline: List, sp_counter_function: Callable=None, word_count: int=None, workers: int=-1) -> int:
+         '''
+         This function obtains the incidence of a syntactic pattern that exists in a text per {self._incidence} words.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         disable_pipeline(List): The pipeline elements to be disabled.
+         sp_counter_function(Callable): The function that counts a syntactic pattern for a Spacy document. It returns an integer.
+         word_count(int): The amount of words in the text.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         int: The incidence of a syntactic pattern per {self._incidence} words.
+         '''
+         if len(text) == 0:
+             raise ValueError('The text is empty.')
+         elif workers == 0 or workers < -1:
+             raise ValueError('Workers must be -1 or a positive integer.')
+         else:
+             paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
+             threads = multiprocessing.cpu_count() if workers == -1 else workers
+             wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
+             self._nlp.get_pipe('feature counter').counter_function = sp_counter_function
+             density = sum(doc._.feature_count
+                           for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads))  # Calculate with multiprocessing
+
+             return (density / wc) * self._incidence
+
+     def get_noun_phrase_density(self, text: str, word_count: int=None, workers: int=-1) -> int:
+         '''
+         This function obtains the incidence of noun phrases that exist in a text per {self._incidence} words.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         word_count(int): The amount of words in the text.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         int: The incidence of noun phrases per {self._incidence} words.
+         '''
+         count_noun_phrases = lambda doc: len(doc._.noun_phrases)
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['noun phrase tagger', 'tagger', 'parser', 'feature counter']]
+
+         return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_noun_phrases, workers=workers)
+
+     def get_verb_phrase_density(self, text: str, word_count: int=None, workers: int=-1) -> int:
+         '''
+         This function obtains the incidence of verb phrases that exist in a text per {self._incidence} words.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         word_count(int): The amount of words in the text.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         int: The incidence of verb phrases per {self._incidence} words.
+         '''
+         count_verb_phrases = lambda doc: len(doc._.verb_phrases)
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['verb phrase tagger', 'tagger', 'feature counter']]
+
+         return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_verb_phrases, workers=workers)
+
+     def get_negation_expressions_density(self, text: str, word_count: int=None, workers: int=-1) -> int:
+         '''
+         This function obtains the incidence of negation expressions that exist in a text per {self._incidence} words.
+
+         Parameters:
+         text(str): The text to be analyzed.
+         word_count(int): The amount of words in the text.
+         workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.
+
+         Returns:
+         int: The incidence of negation expressions per {self._incidence} words.
+         '''
+         count_negation_expressions = lambda doc: len(doc._.negation_expressions)
+         disable_pipeline = [pipe
+                             for pipe in self._nlp.pipe_names
+                             if pipe not in ['negative expression tagger', 'tagger', 'feature counter']]
+
+         return self._get_syntactic_pattern_density(text, disable_pipeline=disable_pipeline, sp_counter_function=count_negation_expressions, workers=workers)
text_complexity_analyzer_cm/coh_metrix_indices/word_information_indices.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import multiprocessing
+ import spacy
+
+ from typing import Callable
+ from typing import List
+ from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.utils.utils import is_word
+ from text_complexity_analyzer_cm.utils.utils import split_text_into_paragraphs
+
+ class WordInformationIndices:
+     def __init__(self, nlp, language: str='en', descriptive_indices: DescriptiveIndices=None) -> None:
+         self.language = language
+         self._nlp = nlp
+         self._incidence = 1000
+         if descriptive_indices is None:
+             self._di = DescriptiveIndices(language=language, nlp=nlp)
+         else:
+             self._di = descriptive_indices
+
+     def _get_word_type_incidence(self, text: str, disable_pipeline: List, counter_function: Callable, word_count: int=None, workers: int=-1) -> float:
+         paragraphs = split_text_into_paragraphs(text)
+         wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
+         self._nlp.get_pipe('feature counter').counter_function = counter_function
+         words = sum(doc._.feature_count for doc in self._nlp.pipe(paragraphs, batch_size=1, disable=disable_pipeline, n_process=1))
+         result = words  # Returns the raw count; the (words / wc) incidence normalization is currently disabled.
+         return result
+
+     def get_noun_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         noun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ in ['NOUN', 'PROPN'])
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         result = self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=noun_counter, workers=workers)
+         return result
+
+     def get_verb_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         verb_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'VERB')
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=verb_counter, workers=workers)
+
+     def get_adjective_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         adjective_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'ADJ')
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=adjective_counter, workers=workers)
+
+     def get_adverb_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         adverb_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'ADV')
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=adverb_counter, workers=workers)
+
+     def get_personal_pronoun_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON')
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_first_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=1' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_first_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=1' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_second_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=2' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_second_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=2' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_third_person_singular_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Sing' in token.morph and 'Person=3' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
+
+     def get_personal_pronoun_third_person_plural_form_incidence(self, text: str, word_count: int=None, workers: int=-1) -> float:
+         pronoun_counter = lambda doc: sum(1 for token in doc if is_word(token) and token.pos_ == 'PRON' and 'Number=Plur' in token.morph and 'Person=3' in token.morph)
+         disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe not in ['tok2vec', 'tagger', 'attribute_ruler', 'feature counter']]
+         return self._get_word_type_incidence(text, disable_pipeline=disable_pipeline, counter_function=pronoun_counter, workers=workers)
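A minimal usage sketch (not part of the commit): it assumes the en_core_web_sm model is installed and that the utils and DescriptiveIndices modules referenced above are importable; importing FeatureCounter registers the 'feature counter' factory that this class relies on.

import spacy
from text_complexity_analyzer_cm.coh_metrix_indices.word_information_indices import WordInformationIndices
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter  # registers the 'feature counter' factory

nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.add_pipe('feature counter', config={'language': 'en'}, last=True)

wii = WordInformationIndices(nlp=nlp, language='en')
text = 'She quickly read the long report. They will discuss it tomorrow.'
print(wii.get_noun_incidence(text))             # raw noun count (incidence normalization is disabled above)
print(wii.get_personal_pronoun_incidence(text))  # raw pronoun count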
text_complexity_analyzer_cm/constants.py ADDED
@@ -0,0 +1,22 @@
+ '''
+ This module contains constants that are used across the entire library.
+ '''
+
+ import os
+
+ language = {
+     'es': 'es_core_news_lg',
+     'en': 'en_core_web_sm'
+ }
+
+ ACCEPTED_LANGUAGES = {
+     'es': 'es_core_news_lg',
+     'en': 'en_core_web_sm',
+ }
+
+ LANGUAGES_DICTIONARY_PYPHEN = {
+     'es': 'es',
+     'en': 'en'
+ }
+
+ BASE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
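For illustration only (assuming the listed spaCy models are installed), this is how the mapping is meant to be consumed when picking a model for a language, as perm.py below does:

import spacy
from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES

nlp = spacy.load(ACCEPTED_LANGUAGES['en'], disable=['ner'])  # 'en' -> en_core_web_sm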
text_complexity_analyzer_cm/perm.py ADDED
@@ -0,0 +1,79 @@
+ import spacy
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+ from text_complexity_analyzer_cm.coh_metrix_indices.connective_indices import ConnectiveIndices
+ from text_complexity_analyzer_cm.coh_metrix_indices.descriptive_indices import DescriptiveIndices
+ from text_complexity_analyzer_cm.coh_metrix_indices.lexical_diversity_indices import LexicalDiversityIndices
+ from text_complexity_analyzer_cm.coh_metrix_indices.syntactic_complexity_indices import SyntacticComplexityIndices
+ from text_complexity_analyzer_cm.coh_metrix_indices.word_information_indices import WordInformationIndices
+ from text_complexity_analyzer_cm.pipes.syllable_splitter import SyllableSplitter
+ from text_complexity_analyzer_cm.pipes.causal_connectives_tagger import CausalConnectivesTagger
+ from text_complexity_analyzer_cm.pipes.emphatics_tagger import EmphaticsTagger
+ from text_complexity_analyzer_cm.pipes.asks_tagger import AsksTagger
+ from text_complexity_analyzer_cm.pipes.polites_tagger import PolitesTagger
+ from text_complexity_analyzer_cm.pipes.logical_connectives_tagger import LogicalConnectivesTagger
+ from text_complexity_analyzer_cm.pipes.adversative_connectives_tagger import AdversativeConnectivesTagger
+ from text_complexity_analyzer_cm.pipes.temporal_connectives_tagger import TemporalConnectivesTagger
+ from text_complexity_analyzer_cm.pipes.additive_connectives_tagger import AdditiveConnectivesTagger
+ from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter
+ from typing import Dict
+
+ class PERM:
+     def __init__(self, language: str='en') -> None:
+         self.language = language
+         self._nlp = spacy.load(ACCEPTED_LANGUAGES[language], disable=['ner'])
+         self._nlp.max_length = 3000000
+         self._nlp.add_pipe('sentencizer')
+         # Register the custom pipes, configured for the requested language.
+         self._nlp.add_pipe('syllables', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('causal connective tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('temporal connective tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('emphatics tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('asks tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('polites tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('logical connective tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('adversative connective tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('additive connective tagger', config={'language': language}, after='tagger')
+         self._nlp.add_pipe('feature counter', config={'language': language}, last=True)
+         self._di = DescriptiveIndices(language=language, nlp=self._nlp)
+         self._ci = ConnectiveIndices(language=language, nlp=self._nlp, descriptive_indices=self._di)
+         self._ldi = LexicalDiversityIndices(language=language, nlp=self._nlp)
+         self._sci = SyntacticComplexityIndices(language=language, nlp=self._nlp)
+         self._wii = WordInformationIndices(language=language, nlp=self._nlp, descriptive_indices=self._di)
+
+     def calculate_descriptive_indices_for_one_text(self, text: str, workers: int=-1) -> Dict:
+         indices = {}
+
+         indices['Paragraph Count'] = self._di.get_paragraph_count_from_text(text=text)
+         indices['Sentence Count'] = self._di.get_sentence_count_from_text(text=text, workers=workers)
+         indices['Word Count'] = self._di.get_word_count_from_text(text=text, workers=workers)
+         length_of_paragraph = self._di.get_length_of_paragraphs(text=text, workers=workers)
+         indices['Mean Length of Paragraphs'] = length_of_paragraph.mean
+         length_of_sentences = self._di.get_length_of_sentences(text=text, workers=workers)
+         indices['Mean Length of Sentences'] = length_of_sentences.mean
+         length_of_words = self._di.get_length_of_words(text=text, workers=workers)
+         indices['Mean Length of Words'] = length_of_words.mean
+         syllables_per_word = self._di.get_syllables_per_word(text=text, workers=workers)
+         indices['Mean Syllables of Words'] = syllables_per_word.mean
+
+         return indices
+
+     def calculate_word_information_indices_for_one_text(self, text: str, workers: int=-1, word_count: int=None) -> Dict:
+         indices = {}
+
+         indices['#Nouns'] = self._wii.get_noun_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Verbs'] = self._wii.get_verb_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Adjectives'] = self._wii.get_adjective_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Adverbs'] = self._wii.get_adverb_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Personal Pronouns'] = self._wii.get_personal_pronoun_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers1s'] = self._wii.get_personal_pronoun_first_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers1p'] = self._wii.get_personal_pronoun_first_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers2s'] = self._wii.get_personal_pronoun_second_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers2p'] = self._wii.get_personal_pronoun_second_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers3s'] = self._wii.get_personal_pronoun_third_person_singular_form_incidence(text=text, workers=workers, word_count=word_count)
+         indices['#Pers3p'] = self._wii.get_personal_pronoun_third_person_plural_form_incidence(text=text, workers=workers, word_count=word_count)
+
+         return indices
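An end-to-end sketch of the intended calling pattern (assuming en_core_web_sm plus all the pipe modules imported above, including the syllable splitter's dependency, are installed):

from text_complexity_analyzer_cm.perm import PERM

perm = PERM(language='en')
text = 'First, we read the report. Moreover, she explained it to them, but they still had questions.'
descriptive = perm.calculate_descriptive_indices_for_one_text(text)
word_info = perm.calculate_word_information_indices_for_one_text(text, word_count=descriptive['Word Count'])
print(descriptive['Word Count'], word_info['#Nouns'], word_info['#Pers3p'])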
text_complexity_analyzer_cm/pipes/__init__.py ADDED
@@ -0,0 +1,3 @@
+ '''
+ This module contains the spaCy pipes used to calculate certain Coh-Metrix indices.
+ '''
text_complexity_analyzer_cm/pipes/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (285 Bytes).
text_complexity_analyzer_cm/pipes/__pycache__/additive_connectives_tagger.cpython-39.pyc ADDED
Binary file (2.98 kB).
text_complexity_analyzer_cm/pipes/__pycache__/adversative_connectives_tagger.cpython-39.pyc ADDED
Binary file (3.09 kB).
text_complexity_analyzer_cm/pipes/__pycache__/asks_tagger.cpython-39.pyc ADDED
Binary file (2.06 kB).
text_complexity_analyzer_cm/pipes/__pycache__/causal_connectives_tagger.cpython-39.pyc ADDED
Binary file (2.46 kB).
text_complexity_analyzer_cm/pipes/__pycache__/emphatics_tagger.cpython-39.pyc ADDED
Binary file (2.29 kB).
text_complexity_analyzer_cm/pipes/__pycache__/feature_counter.cpython-39.pyc ADDED
Binary file (1.42 kB).
text_complexity_analyzer_cm/pipes/__pycache__/logical_connectives_tagger.cpython-39.pyc ADDED
Binary file (2.63 kB).
text_complexity_analyzer_cm/pipes/__pycache__/negative_expression_tagger.cpython-39.pyc ADDED
Binary file (3.04 kB).
text_complexity_analyzer_cm/pipes/__pycache__/noun_phrase_tagger.cpython-39.pyc ADDED
Binary file (2.61 kB).
text_complexity_analyzer_cm/pipes/__pycache__/polites_tagger.cpython-39.pyc ADDED
Binary file (2.61 kB).
text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_adjacent_sentences_analyzer.cpython-39.pyc ADDED
Binary file (2.25 kB).
text_complexity_analyzer_cm/pipes/__pycache__/referential_cohesion_all_sentences_analyzer.cpython-39.pyc ADDED
Binary file (2.18 kB).
text_complexity_analyzer_cm/pipes/__pycache__/syllable_splitter.cpython-39.pyc ADDED
Binary file (1.25 kB).
text_complexity_analyzer_cm/pipes/__pycache__/temporal_connectives_tagger.cpython-39.pyc ADDED
Binary file (2.54 kB).
text_complexity_analyzer_cm/pipes/__pycache__/verb_phrase_tagger.cpython-39.pyc ADDED
Binary file (3.01 kB).
text_complexity_analyzer_cm/pipes/additive_connectives_tagger.py ADDED
@@ -0,0 +1,59 @@
+ from spacy.matcher import PhraseMatcher
+ from spacy.tokens import Doc
+ from spacy.tokens import Span
+ from spacy.util import filter_spans
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ additive_connectives_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.additive_connectives_span_indices]
+
+ Doc.set_extension('additive_connectives_span_indices', force=False, default=[])
+ Doc.set_extension('additive_connectives', force=False, getter=additive_connectives_getter)
+
+ @Language.factory('additive connective tagger')
+ class AdditiveConnectivesTagger:
+     def __init__(self, name, nlp, language: str='en') -> None:
+         '''
+         This constructor initializes the object that tags additive connectives.
+
+         Parameters:
+         nlp: The spaCy model to use this tagger with.
+         language: The language that this pipeline will be used in.
+
+         Returns:
+         None.
+         '''
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self._language = language
+         self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+         self._connectives = []
+         if language == 'en':  # Additive connectives for English
+             self._connectives = ['then', 'moreover', 'after', 'from here on', 'even', 'next', 'to top it all', 'another', 'finally', 'of equal importance', 'is more', 'first', 'besides', 'gradually', 'too', 'last', 'equally important', 'third', 'as soon as', 'on the other hand', 'furthermore', 'to begin with', 'above', 'also', 'first ', 'likewise', 'in addition', 'second', 'inclusive', 'further', 'before', 'hence', 'in the end', 'last of all']
+         else:  # Support for future languages, e.g. Spanish
+             # self._connectives = ['asimismo', 'igualmente' 'de igual modo', 'de igual manera', 'de igual forma', 'del mismo modo', 'de la misma manera', 'de la misma forma', 'en primer lugar', 'en segundo lugar', 'en tercer lugar', 'en último lugar', 'por su parte', 'por otro lado', 'además', 'encima', 'es más', 'por añadidura', 'incluso', 'inclusive', 'para colmo']
+             pass
+
+         for con in self._connectives:
+             self._matcher.add(con, [nlp(con)])  # spaCy 3 API: patterns are passed as a list
+
+     def __call__(self, doc: Doc) -> Doc:
+         '''
+         This method finds all additive connectives and stores them in an iterable.
+
+         Parameters:
+         doc(Doc): A spaCy document.
+         '''
+         matches = self._matcher(doc)
+         additive_connectives_spans = [doc[start:end] for _, start, end in matches]
+
+         doc._.additive_connectives_span_indices = [{'start': span.start,
+                                                     'end': span.end,
+                                                     'label': span.label}
+                                                    for span in filter_spans(additive_connectives_spans)]  # Save the additive connectives found
+
+         return doc
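A short sketch of how the tagger is meant to be used once its factory is registered (assumes en_core_web_sm is installed):

import spacy
from text_complexity_analyzer_cm.pipes.additive_connectives_tagger import AdditiveConnectivesTagger  # registers the factory

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('additive connective tagger', config={'language': 'en'}, after='tagger')

doc = nlp('Moreover, the results were also consistent. In addition, the method is fast.')
print([span.text for span in doc._.additive_connectives])  # expected: ['Moreover', 'also', 'In addition']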
text_complexity_analyzer_cm/pipes/adversative_connectives_tagger.py ADDED
@@ -0,0 +1,64 @@
+ from spacy.matcher import PhraseMatcher
+ from spacy.tokens import Doc
+ from spacy.tokens import Span
+ from spacy.util import filter_spans
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ adversative_connectives_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.adversative_connectives_span_indices]
+
+ Doc.set_extension('adversative_connectives_span_indices', force=False, default=[])
+ Doc.set_extension('adversative_connectives', force=False, getter=adversative_connectives_getter)
+
+ @Language.factory('adversative connective tagger')
+ class AdversativeConnectivesTagger:
+     '''
+     This tagger finds all adversative connectives in a document. It must run after the 'tagger' pipeline component.
+     '''
+     name = 'adversative connective tagger'
+
+     def __init__(self, name, nlp, language: str='en') -> None:
+         '''
+         This constructor initializes the object that tags adversative connectives.
+
+         Parameters:
+         nlp: The spaCy model to use this tagger with.
+         language: The language that this pipeline will be used in.
+
+         Returns:
+         None.
+         '''
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self._language = language
+         self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+         self._connectives = []
+         if language == 'en':  # Adversative connectives for English
+             self._connectives = ['although', 'instead', 'and yet', 'nonetheless', 'nevertheless', 'rather', 'more than', 'yet', 'in fact', 'on the other hand', 'but yes', 'now well', 'on the contrary', 'however', 'in spite of this', 'conversely', 'still', 'less', 'actually', 'but rather', 'in contrast', 'but', 'except', 'only']
+             # Spanish equivalents, kept for future support:
+             # self._connectives = ['pero', 'sino', 'no obstante', 'sino que', 'sin embargo', 'pero sí', 'aunque', 'menos', 'solo', 'excepto', 'salvo', 'más que', 'en cambio', 'ahora bien', 'más bien']
+         else:  # Support for future languages
+             pass
+
+         for con in self._connectives:
+             self._matcher.add(con, [nlp(con)])  # spaCy 3 API: patterns are passed as a list
+
+     def __call__(self, doc: Doc) -> Doc:
+         '''
+         This method finds all adversative connectives and stores them in an iterable.
+
+         Parameters:
+         doc(Doc): A spaCy document.
+         '''
+         matches = self._matcher(doc)
+         adversative_connectives_spans = [doc[start:end] for _, start, end in matches]
+
+         doc._.adversative_connectives_span_indices = [{'start': span.start,
+                                                        'end': span.end,
+                                                        'label': span.label}
+                                                       for span in filter_spans(adversative_connectives_spans)]  # Save the adversative connectives found
+
+         return doc
text_complexity_analyzer_cm/pipes/asks_tagger.py ADDED
@@ -0,0 +1,37 @@
+ from spacy.matcher import PhraseMatcher
+ from spacy.tokens import Doc
+ from spacy.tokens import Span
+ from spacy.util import filter_spans
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ asks_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.asks_span_indices]
+
+ Doc.set_extension('asks_span_indices', force=False, default=[])
+ Doc.set_extension('asks', force=False, getter=asks_getter)
+
+ @Language.factory('asks tagger')
+ class AsksTagger:
+     def __init__(self, name, nlp, language: str='en') -> None:
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self._language = language
+         self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+         self._connectives = []
+         if language == 'en':  # Question words and the question mark
+             self._connectives = ['?', 'what', 'how', 'who', 'when']
+         else:  # Support for future languages
+             pass
+
+         for con in self._connectives:
+             self._matcher.add(con, [nlp(con)])  # spaCy 3 API: patterns are passed as a list
+
+     def __call__(self, doc: Doc) -> Doc:
+         matches = self._matcher(doc)
+         asks_spans = [doc[start:end] for _, start, end in matches]
+         doc._.asks_span_indices = [{'start': span.start, 'end': span.end, 'label': span.label} for span in filter_spans(asks_spans)]  # Save the question cues found
+
+         return doc
text_complexity_analyzer_cm/pipes/causal_connectives_tagger.py ADDED
@@ -0,0 +1,34 @@
+ from spacy.matcher import PhraseMatcher
+ from spacy.tokens import Doc
+ from spacy.tokens import Span
+ from spacy.util import filter_spans
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ causal_connectives_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.causal_connectives_span_indices]
+
+ Doc.set_extension('causal_connectives_span_indices', force=False, default=[])
+ Doc.set_extension('causal_connectives', force=False, getter=causal_connectives_getter)
+
+ @Language.factory('causal connective tagger')
+ class CausalConnectivesTagger:
+     def __init__(self, name, nlp, language) -> None:
+         self._language = language
+         self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+         self.causal_connectives = []
+         if language == 'en':  # Causal connectives for English
+             self.causal_connectives = ['to repeat, briefly', 'finally', 'therefore', 'with this in mind', 'in conclusion', 'because of this', 'because of', 'as a consequence', 'to this end', 'on the score of', 'then', 'because', 'so', 'later', 'hence', 'in short', 'for this reason', 'thus', 'so much that', 'accordingly', 'for', 'so then', 'as I have said', 'therefore', 'in summary', 'on the whole', 'consequently', 'for this purpose', 'since', 'as a result', 'to sum up', 'so that', 'as you can see']
+         else:  # Support for future languages
+             pass
+         for con in self.causal_connectives:
+             self._matcher.add(con, [nlp(con)])  # spaCy 3 API: patterns are passed as a list
+
+     def __call__(self, doc: Doc) -> Doc:
+         matches = self._matcher(doc)
+         causal_connectives_spans = [doc[start:end] for _, start, end in matches]
+         doc._.causal_connectives_span_indices = [{'start': span.start,
+                                                   'end': span.end,
+                                                   'label': span.label}
+                                                  for span in filter_spans(causal_connectives_spans)]  # Save the causal connectives found
+         return doc
text_complexity_analyzer_cm/pipes/emphatics_tagger.py ADDED
@@ -0,0 +1,42 @@
+ from spacy.matcher import PhraseMatcher
+ from spacy.tokens import Doc
+ from spacy.tokens import Span
+ from spacy.util import filter_spans
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ emphatics_getter = lambda doc: [doc[span['start']:span['end']] for span in doc._.emphatics_span_indices]
+
+ Doc.set_extension('emphatics_span_indices', force=False, default=[])
+ Doc.set_extension('emphatics', force=False, getter=emphatics_getter)
+
+ @Language.factory('emphatics tagger')
+ class EmphaticsTagger:
+     def __init__(self, name, nlp, language) -> None:
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+
+         self._language = language
+         self._matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
+         self._connectives = []
+         if language == 'en':  # Emphatic cue words for English
+             self._connectives = ['him', 'there', 'their', 'it', 'he', 'she', 'we', 'who', 'them', 'they', 'you', 'himself', 'her', 'whom', 'itself', 'somebody', 'something', 'us', 'anybody', 'herself', 'anyone', 'everybody', 'nobody', 'everyone', 'themselves', 'yourself', 'someone', 'his', 'yours']
+         else:  # Support for future languages
+             pass
+
+         for con in self._connectives:
+             self._matcher.add(con, [nlp(con)])  # spaCy 3 API: patterns are passed as a list
+
+     def __call__(self, doc: Doc) -> Doc:
+         matches = self._matcher(doc)
+         emphatics_spans = [doc[start:end] for _, start, end in matches]
+
+         doc._.emphatics_span_indices = [{'start': span.start,
+                                          'end': span.end,
+                                          'label': span.label}
+                                         for span in filter_spans(emphatics_spans)]  # Save the emphatics found
+
+         return doc
text_complexity_analyzer_cm/pipes/feature_counter.py ADDED
@@ -0,0 +1,32 @@
+ from itertools import tee
+ from spacy.tokens import Doc
+ from spacy.tokens import Token
+ from spacy.language import Language
+
+ from text_complexity_analyzer_cm.constants import ACCEPTED_LANGUAGES
+
+ Doc.set_extension('feature_count', default=None, force=True)
+
+ @Language.factory('feature counter')
+ class FeatureCounter:
+
+     def __init__(self, nlp, name, language) -> None:
+         if not language in ACCEPTED_LANGUAGES:
+             raise ValueError(f'Language {language} is not supported yet')
+         self.language = language
+         self.counter_function = None
+
+     def __call__(self, doc: Doc) -> Doc:
+         '''
+         This method applies the current 'counter_function' to the document. That function implements whatever feature counting the caller needs.
+
+         Parameters:
+         doc(Doc): A spaCy document.
+         '''
+         if self.counter_function is None:
+             raise AttributeError('No function to count features was provided.')
+         # Store the result on the document so downstream code can read doc._.feature_count.
+         doc._.feature_count = self.counter_function(doc)
+
+         return doc
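A small sketch of the calling pattern this pipe expects (the same pattern WordInformationIndices uses above): assign counter_function before running the pipeline, then read doc._.feature_count.

import spacy
from text_complexity_analyzer_cm.pipes.feature_counter import FeatureCounter  # registers the 'feature counter' factory

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('feature counter', config={'language': 'en'}, last=True)

# Count adjectives, as an illustrative counter function.
nlp.get_pipe('feature counter').counter_function = lambda doc: sum(1 for token in doc if token.pos_ == 'ADJ')
doc = nlp('The quick brown fox jumps over the lazy dog.')
print(doc._.feature_count)  # expected 3 with a typical English model: quick, brown, lazy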