Goran Glavaš committed on
Commit
cf27868
1 Parent(s): f6fa5b0

Code, binary, data, and README

Files changed (41)
  1. README.txt +60 -0
  2. binary/graphseg.jar +3 -0
  3. data/manifestos-gold-segmented/61320_200411.txt +0 -0
  4. data/manifestos-gold-segmented/61320_200811.txt +0 -0
  5. data/manifestos-gold-segmented/61320_201211.txt +0 -0
  6. data/manifestos-gold-segmented/61620_200411.txt +0 -0
  7. data/manifestos-gold-segmented/61620_200811.txt +0 -0
  8. data/manifestos-gold-segmented/61620_201211.txt +0 -0
  9. data/manifestos-original-clean/61320_200411.txt +0 -0
  10. data/manifestos-original-clean/61320_200811.txt +0 -0
  11. data/manifestos-original-clean/61320_201211.txt +0 -0
  12. data/manifestos-original-clean/61620_200411.txt +0 -0
  13. data/manifestos-original-clean/61620_200811.txt +0 -0
  14. data/manifestos-original-clean/61620_201211.txt +0 -0
  15. source/pom.xml +85 -0
  16. source/src/config.properties +3 -0
  17. source/src/edu/uma/nlp/graphseg/ClusteringHandler.java +206 -0
  18. source/src/edu/uma/nlp/graphseg/GraphHandler.java +134 -0
  19. source/src/edu/uma/nlp/graphseg/IOHandler.java +33 -0
  20. source/src/edu/uma/nlp/graphseg/STSHandler.java +37 -0
  21. source/src/edu/uma/nlp/graphseg/Start.java +122 -0
  22. source/src/edu/uma/nlp/graphseg/preprocessing/Annotation.java +36 -0
  23. source/src/edu/uma/nlp/graphseg/preprocessing/AnnotationType.java +14 -0
  24. source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorChain.java +35 -0
  25. source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorType.java +11 -0
  26. source/src/edu/uma/nlp/graphseg/preprocessing/Document.java +110 -0
  27. source/src/edu/uma/nlp/graphseg/preprocessing/IAnnotator.java +9 -0
  28. source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityAnnotation.java +88 -0
  29. source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityTokenAnnotation.java +38 -0
  30. source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityType.java +18 -0
  31. source/src/edu/uma/nlp/graphseg/preprocessing/PartOfSpeechAnnotation.java +69 -0
  32. source/src/edu/uma/nlp/graphseg/preprocessing/SentenceAnnotation.java +66 -0
  33. source/src/edu/uma/nlp/graphseg/preprocessing/StanfordAnnotator.java +142 -0
  34. source/src/edu/uma/nlp/graphseg/preprocessing/TokenAnnotation.java +104 -0
  35. source/src/edu/uma/nlp/graphseg/semantics/InformationContent.java +77 -0
  36. source/src/edu/uma/nlp/graphseg/semantics/SemanticSimilarity.java +252 -0
  37. source/src/edu/uma/nlp/graphseg/semantics/WordVectorSpace.java +151 -0
  38. source/src/edu/uma/nlp/graphseg/utils/ApplicationConfiguration.java +49 -0
  39. source/src/edu/uma/nlp/graphseg/utils/IOHelper.java +385 -0
  40. source/src/edu/uma/nlp/graphseg/utils/MemoryStorage.java +26 -0
  41. source/src/edu/uma/nlp/graphseg/utils/VectorOperations.java +45 -0
README.txt ADDED
@@ -0,0 +1,60 @@
+ About
+ ========
+
+ GraphSeg is a tool for semantic/topical segmentation of text: it employs semantic relatedness and a graph-based algorithm to identify semantically coherent segments.
+ Segmentation is performed at the sentence level (no intra-sentential segment beginnings or ends).
+
+ Content
+ ========
+
+ This repository contains:
+
+ (1) the Java source code (as a Maven project)
+ (2) the ready-to-use binary version of the tool (graphseg.jar in the /binary folder)
+ (3) the dataset of political manifestos manually annotated with segments (used for evaluation in the research paper that the GraphSeg tool accompanies).
+
+ Usage
+ ========
+
+ The following command with four arguments runs the GraphSeg tool:
+
+ java -jar graphseg.jar <input-folder-path> <output-folder-path> <relatedness-threshold> <minimal-segment-size>
+
+ The arguments (all mandatory) to be provided are:
+
+ (1) <input-folder-path> is the path to the folder (directory) containing the raw text documents that need to be topically/semantically segmented;
+ (2) <output-folder-path> is the path to the folder in which the semantically/topically segmented input documents are to be stored;
+ (3) <relatedness-threshold> is the value of the relatedness threshold (decimal number) to be used in the construction of the relatedness graph: larger values will yield a larger number of smaller segments, whereas smaller threshold values will produce a smaller number of coarser segments;
+ (4) <minimal-segment-size> defines the minimal segment size m (in number of sentences). This means that GraphSeg will not produce segments containing fewer than m sentences.
+
+ Example command:
+
+ java -jar graphseg.jar /home/seg-input /home/seg-output 0.25 3
+
+ Credit
+ ========
+
+ If you use GraphSeg in your research, please give appropriate credit to our work by citing the following publication:
+
+ @InProceedings{glavavs-nanni-ponzetto:2016:*SEM,
+   author    = {Glava\v{s}, Goran and Nanni, Federico and Ponzetto, Simone Paolo},
+   title     = {Unsupervised Text Segmentation Using Semantic Relatedness Graphs},
+   booktitle = {Proceedings of the Fifth Joint Conference on Lexical and Computational Semantics},
+   month     = {August},
+   year      = {2016},
+   address   = {Berlin, Germany},
+   publisher = {Association for Computational Linguistics},
+   pages     = {125--130},
+   url       = {http://anthology.aclweb.org/S16-2016}
+ }
+
+ Contact
+ ========
+
+ Please address all questions about the GraphSeg tool and the *SEM publication to:
+
+ Dr. Goran Glavaš
+ Data and Web Science Group
+ University of Mannheim
+
+ Email: goran@informatik.uni-mannheim.de
binary/graphseg.jar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83ac4ce85663bd97072a2fad76349bf923d1869b7acd7a67f797e6c16a1a47b2
+ size 350762888
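
Note: binary/graphseg.jar is tracked with Git LFS, so the hunk above shows only the LFS pointer, not the jar itself (roughly 350 MB, per the size field). After cloning, fetching the actual binary requires a Git LFS client, e.g.:

    git lfs install
    git lfs pull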
data/manifestos-gold-segmented/61320_200411.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-gold-segmented/61320_200811.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-gold-segmented/61320_201211.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-gold-segmented/61620_200411.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-gold-segmented/61620_200811.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-gold-segmented/61620_201211.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61320_200411.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61320_200811.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61320_201211.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61620_200411.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61620_200811.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/manifestos-original-clean/61620_201211.txt ADDED
The diff for this file is too large to render. See raw diff
 
source/pom.xml ADDED
@@ -0,0 +1,85 @@
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+   <modelVersion>4.0.0</modelVersion>
+   <groupId>edu.uma.nlp.graphseg</groupId>
+   <artifactId>graphseg</artifactId>
+   <version>0.0.1-SNAPSHOT</version>
+   <name>graphseg</name>
+   <description>Text segmentation using a graph-based algorithm and semantic relatedness</description>
+   <build>
+     <sourceDirectory>src</sourceDirectory>
+     <resources>
+       <resource>
+         <directory>src</directory>
+         <excludes>
+           <exclude>**/*.java</exclude>
+         </excludes>
+       </resource>
+     </resources>
+     <plugins>
+       <plugin>
+         <artifactId>maven-compiler-plugin</artifactId>
+         <version>3.3</version>
+         <configuration>
+           <source>1.8</source>
+           <target>1.8</target>
+         </configuration>
+       </plugin>
+       <plugin>
+         <artifactId>maven-assembly-plugin</artifactId>
+         <configuration>
+           <archive>
+             <manifest>
+               <mainClass>edu.uma.nlp.graphseg.Start</mainClass>
+             </manifest>
+           </archive>
+           <descriptorRefs>
+             <descriptorRef>jar-with-dependencies</descriptorRef>
+           </descriptorRefs>
+         </configuration>
+         <executions>
+           <execution>
+             <id>make-assembly</id>
+             <!-- bind to the packaging phase -->
+             <phase>package</phase>
+             <goals>
+               <goal>single</goal>
+             </goals>
+           </execution>
+         </executions>
+       </plugin>
+     </plugins>
+   </build>
+   <dependencies>
+     <dependency>
+       <groupId>org.jgrapht</groupId>
+       <artifactId>jgrapht-core</artifactId>
+       <version>0.9.1</version>
+     </dependency>
+     <dependency>
+       <groupId>org.javatuples</groupId>
+       <artifactId>javatuples</artifactId>
+       <version>1.2</version>
+     </dependency>
+     <dependency>
+       <groupId>commons-io</groupId>
+       <artifactId>commons-io</artifactId>
+       <version>2.4</version>
+     </dependency>
+     <dependency>
+       <groupId>org.apache.commons</groupId>
+       <artifactId>commons-lang3</artifactId>
+       <version>3.4</version>
+     </dependency>
+     <dependency>
+       <groupId>edu.stanford.nlp</groupId>
+       <artifactId>stanford-corenlp</artifactId>
+       <version>3.5.2</version>
+     </dependency>
+     <dependency>
+       <groupId>edu.stanford.nlp</groupId>
+       <artifactId>stanford-corenlp</artifactId>
+       <version>3.5.2</version>
+       <classifier>models</classifier>
+     </dependency>
+   </dependencies>
+ </project>
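
With the assembly-plugin configuration above, the self-contained jar is produced by a standard Maven invocation from the source folder (a sketch; the artifact name follows Maven's jar-with-dependencies convention):

    cd source
    mvn package

The runnable jar should then appear as target/graphseg-0.0.1-SNAPSHOT-jar-with-dependencies.jar, with edu.uma.nlp.graphseg.Start as its main class.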
source/src/config.properties ADDED
@@ -0,0 +1,3 @@
+ inf-cont-path=C:/Goran/Corpora/unigram-freqs-english.txt
+ word-vec-path=C:/Goran/Corpora/WordVectors/glove-vectors-6b-200d.txt
+ stop-words-path=C:/Goran/Corpora/stopwords.txt
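
These three properties are read at startup by Start.java: word-vec-path must point to a word embedding file (here GloVe 6B, 200-dimensional vectors), inf-cont-path to a unigram frequency list from which information content is computed, and stop-words-path to a plain-text stopword list. The committed values are local Windows paths from the author's machine; to run from source they need to be adapted, for example (hypothetical Unix paths):

    inf-cont-path=/data/corpora/unigram-freqs-english.txt
    word-vec-path=/data/corpora/glove.6B.200d.txt
    stop-words-path=/data/corpora/stopwords.txt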
source/src/edu/uma/nlp/graphseg/ClusteringHandler.java ADDED
@@ -0,0 +1,206 @@
+ package edu.uma.nlp.graphseg;
+
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.Optional;
+ import java.util.stream.Collectors;
+
+ public class ClusteringHandler {
+
+     public List<List<Integer>> getSequentialClusters(List<List<Integer>> cliques, Map<Integer, Map<Integer, Double>> allSimilarities, int largestTooSmallClusterSize)
+     {
+         List<List<Integer>> sequentialClusters = new ArrayList<List<Integer>>();
+
+         System.out.println("Merging cliques...");
+         mergeCliques(cliques, sequentialClusters);
+         System.out.println("Merging singletons...");
+         mergeSingletons(cliques, sequentialClusters, allSimilarities);
+         System.out.println("Merging too small sequences...");
+         mergeTooSmallSequences(sequentialClusters, allSimilarities, largestTooSmallClusterSize);
+
+         return sequentialClusters;
+     }
+
+     private void mergeCliques(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters)
+     {
+         boolean change = true;
+         while(change)
+         {
+             change = false;
+             for(List<Integer> clique : cliques)
+             {
+                 for(int i = 0; i < clique.size() - 1; i++)
+                 {
+                     for(int j = i+1; j < clique.size(); j++)
+                     {
+                         int ind = i;
+                         int jond = j;
+                         Optional<List<Integer>> existingClusterFirst = sequentialClusters.stream().filter(sc -> sc.contains(clique.get(ind))).findFirst();
+                         Optional<List<Integer>> existingClusterSecond = sequentialClusters.stream().filter(sc -> sc.contains(clique.get(jond))).findFirst();
+
+                         // Both nodes from the clique already placed in clusters
+                         if (existingClusterFirst.isPresent() && existingClusterSecond.isPresent())
+                         {
+                             continue;
+                         }
+
+                         // Neither of the nodes is in the cluster
+                         else if (!existingClusterFirst.isPresent() && !existingClusterSecond.isPresent())
+                         {
+                             // if these are consecutive sentences, we make a new cluster
+                             if (Math.abs(clique.get(i) - clique.get(j)) == 1)
+                             {
+                                 List<Integer> newCluster = new ArrayList<Integer>();
+                                 newCluster.add(Math.min(clique.get(i), clique.get(j)));
+                                 newCluster.add(Math.max(clique.get(i), clique.get(j)));
+
+                                 int insertIndex = -1;
+                                 for(int k = 0; k < sequentialClusters.size(); k++)
+                                 {
+                                     if (newCluster.get(newCluster.size() - 1) < sequentialClusters.get(k).get(0))
+                                     {
+                                         insertIndex = k;
+                                         break;
+                                     }
+                                 }
+
+                                 if (insertIndex >= 0) sequentialClusters.add(insertIndex, newCluster);
+                                 else sequentialClusters.add(newCluster);
+
+                                 change = true;
+                             }
+                         }
+
+                         // one node is in one cluster, the other isn't
+                         else
+                         {
+                             List<Integer> cluster = existingClusterFirst.isPresent() ? existingClusterFirst.get() : existingClusterSecond.get();
+                             int node = existingClusterFirst.isPresent() ? clique.get(j) : clique.get(i);
+
+                             if ((node == cluster.get(0) - 1) || (node == cluster.get(cluster.size()-1) + 1))
+                             {
+                                 cluster.add(node);
+                                 cluster.sort((e1, e2) -> e1 < e2 ? -1 : (e1 > e2 ? 1 : 0));
+
+                                 change = true;
+                             }
+                         }
+                     }
+                 }
+             }
+         }
+     }
+
+     private List<Integer> computeSingletons(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters)
+     {
+         List<Integer> singletons = new ArrayList<Integer>();
+         for(List<Integer> c : cliques)
+         {
+             for(int n : c)
+             {
+                 if (!sequentialClusters.stream().anyMatch(sc -> sc.contains(n))) singletons.add(n);
+             }
+         }
+
+         singletons = singletons.stream().distinct().collect(Collectors.toList());
+         singletons.sort((s1, s2) -> s1 < s2 ? -1 : (s1 > s2 ? 1 : 0));
+         return singletons;
+     }
+
+     private void mergeTooSmallSequences(List<List<Integer>> sequentialClusters, Map<Integer, Map<Integer, Double>> allSimilarities, int largestSmallCluster)
+     {
+         boolean change = true;
+         while(change)
+         {
+             change = false;
+             Optional<List<Integer>> firstSmallCluster = sequentialClusters.stream().filter(c -> c.size() <= largestSmallCluster).findFirst();
+             if (firstSmallCluster.isPresent())
+             {
+                 int i = sequentialClusters.indexOf(firstSmallCluster.get());
+                 double similarityPrevious = (i == 0) ? 0 : averageClusterSimilarity(sequentialClusters.get(i-1), sequentialClusters.get(i), allSimilarities);
+                 double similarityNext = (i == (sequentialClusters.size() - 1)) ? 0 : averageClusterSimilarity(sequentialClusters.get(i), sequentialClusters.get(i+1), allSimilarities);
+
+                 List<Integer> clusterToMergeWith = (similarityPrevious > similarityNext) ? sequentialClusters.get(i-1) : sequentialClusters.get(i+1);
+                 List<Integer> newCluster = new ArrayList<Integer>();
+                 newCluster.addAll(clusterToMergeWith);
+                 newCluster.addAll(sequentialClusters.get(i));
+                 newCluster.sort((i1, i2) -> i1 > i2 ? 1 : (i1 < i2 ? -1 : 0));
+
+                 sequentialClusters.add((similarityPrevious > similarityNext) ? i-1 : i, newCluster);
+                 sequentialClusters.remove(firstSmallCluster.get());
+                 sequentialClusters.remove(clusterToMergeWith);
+
+                 change = true;
+             }
+         }
+     }
+
+     private double averageClusterSimilarity(List<Integer> first, List<Integer> second, Map<Integer, Map<Integer, Double>> allSimilarities)
+     {
+         double sum = 0;
+         for(int i = 0; i < first.size(); i++)
+         {
+             for(int j = 0; j < second.size(); j++)
+             {
+                 sum += allSimilarities.get(Math.min(first.get(i), second.get(j))).get(Math.max(first.get(i), second.get(j)));
+             }
+         }
+         return sum / ((double)(first.size() * second.size()));
+     }
+
+     private void mergeSingletons(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters, Map<Integer, Map<Integer, Double>> allSimilarities)
+     {
+         List<Integer> singletons = computeSingletons(cliques, sequentialClusters);
+
+         while(singletons.size() > 0)
+         {
+             if (singletons.size() % 10 == 0) System.out.println("Remaining singletons: " + singletons.size());
+
+             int node = singletons.get(0);
+             Optional<List<Integer>> previousNodeCluster = sequentialClusters.stream().filter(sc -> sc.contains(node - 1)).findFirst();
+             Optional<List<Integer>> nextNodeCluster = sequentialClusters.stream().filter(sc -> sc.contains(node + 1)).findFirst();
+
+             double similarityPrevious = node == 0 ? -1.0 : (previousNodeCluster.isPresent() ? similarityNodeCluster(node, previousNodeCluster.get(), allSimilarities) : allSimilarities.get(node - 1).get(node));
+             double similarityNext = node == allSimilarities.size() ? -1.0 : (nextNodeCluster.isPresent() ? similarityNodeCluster(node, nextNodeCluster.get(), allSimilarities) : allSimilarities.get(node).get(node + 1));
+
+             boolean previous = similarityPrevious >= similarityNext;
+             boolean mergeWithCluster = previous ? previousNodeCluster.isPresent() : nextNodeCluster.isPresent();
+
+             if (mergeWithCluster)
+             {
+                 if (previous) previousNodeCluster.get().add(node);
+                 else nextNodeCluster.get().add(0, node);
+             }
+             else
+             {
+                 List<Integer> newCluster = new ArrayList<Integer>();
+                 newCluster.add(previous ? node - 1 : node);
+                 newCluster.add(previous ? node : node + 1);
+
+                 int insertIndex = -1;
+
+                 for(int k = 0; k < sequentialClusters.size(); k++)
+                 {
+                     if (newCluster.get(newCluster.size() - 1) < sequentialClusters.get(k).get(0))
+                     {
+                         insertIndex = k;
+                         break;
+                     }
+                 }
+
+                 if (insertIndex >= 0) sequentialClusters.add(insertIndex, newCluster);
+                 else sequentialClusters.add(newCluster);
+             }
+
+             singletons = computeSingletons(cliques, sequentialClusters);
+         }
+     }
+
+     private double similarityNodeCluster(int node, List<Integer> cluster, Map<Integer, Map<Integer, Double>> allSimilarities)
+     {
+         double average = 0;
+         for(Integer n2 : cluster) average += allSimilarities.get(Math.min(node, n2)).get(Math.max(node, n2));
+         return average /((double)cluster.size());
+     }
+ }
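
ClusteringHandler is normally driven by GraphHandler's outputs, but it can be exercised in isolation. The following is a minimal, hypothetical sketch (ClusteringDemo is not part of the commit; the cliques and similarity values are invented for illustration): five sentences, cliques {0,1}, {2} and {3,4}, and a similarity map in the same triangular layout that GraphHandler.constructGraph produces. Sentence 2 starts as a singleton and is merged into the neighbouring segment it is more similar to.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    import edu.uma.nlp.graphseg.ClusteringHandler;

    public class ClusteringDemo {
        public static void main(String[] args) {
            // maximal cliques of a (hypothetical) relatedness graph over sentences 0..4
            List<List<Integer>> cliques = new ArrayList<List<Integer>>();
            cliques.add(new ArrayList<Integer>(Arrays.asList(0, 1)));
            cliques.add(new ArrayList<Integer>(Arrays.asList(2)));
            cliques.add(new ArrayList<Integer>(Arrays.asList(3, 4)));

            // pairwise similarities, keyed as allSimilarities[i][j] with i < j
            Map<Integer, Map<Integer, Double>> sims = new HashMap<Integer, Map<Integer, Double>>();
            for (int i = 0; i < 4; i++) sims.put(i, new HashMap<Integer, Double>());
            sims.get(0).put(1, 0.8); sims.get(0).put(2, 0.1);
            sims.get(1).put(2, 0.2);
            sims.get(2).put(3, 0.6); sims.get(2).put(4, 0.5);
            sims.get(3).put(4, 0.9);

            // minimal segment size 1: prints [[0, 1], [2, 3, 4]], since sentence 2
            // is more similar to the segment {3, 4} (0.55 on average) than to {0, 1} (0.15)
            List<List<Integer>> segments = new ClusteringHandler().getSequentialClusters(cliques, sims, 1);
            System.out.println(segments);
        }
    }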
source/src/edu/uma/nlp/graphseg/GraphHandler.java ADDED
@@ -0,0 +1,134 @@
+ package edu.uma.nlp.graphseg;
+
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.stream.Collectors;
+
+ import org.jgrapht.UndirectedGraph;
+ import org.jgrapht.alg.BronKerboschCliqueFinder;
+ import org.jgrapht.alg.ConnectivityInspector;
+ import org.jgrapht.alg.KuhnMunkresMinimalWeightBipartitePerfectMatching;
+ import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator;
+ import org.jgrapht.generate.WeightedGraphGeneratorAdapter;
+ import org.jgrapht.graph.DefaultEdge;
+ import org.jgrapht.graph.DefaultWeightedEdge;
+ import org.jgrapht.graph.SimpleGraph;
+ import org.jgrapht.graph.SimpleWeightedGraph;
+
+ import edu.uma.nlp.graphseg.preprocessing.Document;
+ import edu.uma.nlp.graphseg.preprocessing.TokenAnnotation;
+ import edu.uma.nlp.graphseg.utils.MemoryStorage;
+
+ public class GraphHandler {
+
+     private static List<String> stopwords;
+     public static void setStopwords(List<String> stwrds)
+     {
+         stopwords = stwrds;
+     }
+
+     private static Map<Integer, Map<Integer, Double>> allSimilarities;
+     public static Map<Integer, Map<Integer, Double>> getAllSimilarities()
+     {
+         return allSimilarities;
+     }
+
+     public static UndirectedGraph<Integer, DefaultEdge> constructGraph(List<Document> snippets, double similarityThreshold)
+     {
+         int localizationSize = 100;
+         allSimilarities = new HashMap<Integer, Map<Integer, Double>>();
+
+         UndirectedGraph<Integer, DefaultEdge> graph = new SimpleGraph<Integer, DefaultEdge>(DefaultEdge.class);
+
+         snippets.forEach(s -> graph.addVertex(Integer.parseInt(s.getId())));
+         for(int i = 0; i < snippets.size() - 1; i++)
+         {
+             if (i % 10 == 0) System.out.println("Constructing graph, outer loop " + i + "/" + snippets.size());
+             allSimilarities.put(i, new HashMap<Integer, Double>());
+
+             for(int j = i + 1; j < Math.min(snippets.size(), i + localizationSize); j++)
+             {
+                 List<TokenAnnotation> contentTokenFirst = snippets.get(i).getTokens().stream().filter(t -> t.getPartOfSpeech().isContent() && !stopwords.contains(t.getLemma().toLowerCase())).collect(Collectors.toList());
+                 List<TokenAnnotation> contentTokenSecond = snippets.get(j).getTokens().stream().filter(t -> t.getPartOfSpeech().isContent() && !stopwords.contains(t.getLemma().toLowerCase())).collect(Collectors.toList());
+
+                 if (contentTokenFirst.size() == 0 || contentTokenSecond.size() == 0)
+                 {
+                     allSimilarities.get(i).put(j, 0.0);
+                     continue;
+                 }
+
+                 // preparing for bipartite graph min matching
+                 double[][] dissimilarities = new double[Math.max(contentTokenFirst.size(), contentTokenSecond.size())][Math.max(contentTokenFirst.size(), contentTokenSecond.size())];
+                 List<Integer> firstPartition = new ArrayList<Integer>();
+                 List<Integer> secondPartition = new ArrayList<Integer>();
+
+                 for(int k = 0; k < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); k++)
+                 {
+                     for(int l = 0; l < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); l++)
+                     {
+                         if (k >= contentTokenFirst.size() || l >= contentTokenSecond.size())
+                         {
+                             dissimilarities[k][l] = 1;
+                         }
+                         else
+                         {
+                             double icFactor = Math.max(MemoryStorage.getInformationContent().getRelativeInformationContent(contentTokenFirst.get(k).getLemma().toLowerCase()), MemoryStorage.getInformationContent().getRelativeInformationContent(contentTokenSecond.get(l).getLemma().toLowerCase()));
+                             double simTokens = MemoryStorage.getWordVectorSpace().similarity(contentTokenFirst.get(k).getLemma().toLowerCase(), contentTokenSecond.get(l).getLemma().toLowerCase());
+                             if (simTokens < 0) simTokens = 0;
+
+                             dissimilarities[k][l] = 1 - icFactor * simTokens;
+                         }
+                     }
+                 }
+                 for(int z = 0; z < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); z++)
+                 {
+                     firstPartition.add(z);
+                     secondPartition.add(z + Math.max(contentTokenFirst.size(), contentTokenSecond.size()));
+                 }
+
+                 double bmScore = minimumAverageBipartiteGraphMatchingScore(dissimilarities, firstPartition, secondPartition) - (Math.abs(contentTokenFirst.size() - contentTokenSecond.size()));
+                 double similarityNonNormalized = Math.min(contentTokenFirst.size(), contentTokenSecond.size()) - bmScore;
+                 double similarity = ((similarityNonNormalized / contentTokenFirst.size()) + (similarityNonNormalized / contentTokenSecond.size())) / 2.0;
+
+                 //double similarity = SemanticSimilarity.greedyAlignmentOverlapFScore(snippets.get(i).getTokens(), snippets.get(j).getTokens(), MemoryStorage.getWordVectorSpace(), MemoryStorage.getInformationContent(), true);
+                 allSimilarities.get(i).put(j, similarity);
+
+                 if (similarity > similarityThreshold)
+                 {
+                     graph.addEdge(Integer.parseInt(snippets.get(i).getId()), Integer.parseInt(snippets.get(j).getId()));
+                 }
+             }
+         }
+
+         return graph;
+     }
+
+     public static double minimumAverageBipartiteGraphMatchingScore(double[][] dissimilarities, List<Integer> firstPartition, List<Integer> secondPartition)
+     {
+         SimpleWeightedGraph<Integer, DefaultWeightedEdge> bipartiteGraph = new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
+         WeightedGraphGeneratorAdapter<Integer, DefaultWeightedEdge, Integer> generator =
+             new SimpleWeightedBipartiteGraphMatrixGenerator<Integer, DefaultWeightedEdge>()
+                 .first (firstPartition)
+                 .second (secondPartition)
+                 .weights(dissimilarities);
+
+         generator.generateGraph(bipartiteGraph, null, null);
+         KuhnMunkresMinimalWeightBipartitePerfectMatching<Integer, DefaultWeightedEdge> bipartiteMatching = new KuhnMunkresMinimalWeightBipartitePerfectMatching<Integer, DefaultWeightedEdge>(bipartiteGraph, firstPartition, secondPartition);
+
+         return bipartiteMatching.getMatchingWeight();
+     }
+
+     public static List<List<Integer>> getAllCliques(UndirectedGraph<Integer, DefaultEdge> graph)
+     {
+         BronKerboschCliqueFinder<Integer, DefaultEdge> finder = new BronKerboschCliqueFinder<Integer, DefaultEdge>(graph);
+         return finder.getAllMaximalCliques().stream().map(x -> x.stream().collect(Collectors.toList())).collect(Collectors.toList());
+     }
+
+     public static List<List<Integer>> getAllConnectedComponents(UndirectedGraph<Integer, DefaultEdge> graph)
+     {
+         ConnectivityInspector<Integer, DefaultEdge> finder = new ConnectivityInspector<Integer, DefaultEdge>(graph);
+         return finder.connectedSets().stream().map(x -> x.stream().collect(Collectors.toList())).collect(Collectors.toList());
+     }
+ }
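
To make the similarity normalization in constructGraph concrete, a small worked example (all numbers invented for illustration): let sentence A have 3 content tokens and sentence B have 2. The dissimilarity matrix is padded to 3x3, and every padded cell is fixed at dissimilarity 1, so any perfect matching pays exactly |3 - 2| = 1 for padding; this is why Math.abs(...) is subtracted from the matching weight. If the Kuhn-Munkres matching weight is 1.4, then bmScore = 1.4 - 1 = 0.4, similarityNonNormalized = min(3, 2) - 0.4 = 1.6, and similarity = ((1.6 / 3) + (1.6 / 2)) / 2 ≈ 0.67, i.e. the average of the normalizations by each sentence's length. An edge between the two sentences is added to the relatedness graph only if this value exceeds the relatedness threshold.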
source/src/edu/uma/nlp/graphseg/IOHandler.java ADDED
@@ -0,0 +1,33 @@
+ package edu.uma.nlp.graphseg;
+
+ import java.io.BufferedWriter;
+ import java.io.File;
+ import java.io.FileOutputStream;
+ import java.io.OutputStreamWriter;
+ import java.util.List;
+
+ public class IOHandler {
+     public static void writeSegmentation(List<String> rawLines, List<List<Integer>> segmentation, String path)
+     {
+         try {
+             File fout = new File(path);
+             FileOutputStream fos;
+             fos = new FileOutputStream(fout);
+             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos));
+
+             for(int i = 0; i < segmentation.size(); i++)
+             {
+                 for(int j = 0; j < segmentation.get(i).size(); j++)
+                 {
+                     bw.write(rawLines.get(segmentation.get(i).get(j)) + "\n");
+                 }
+                 bw.write("==========\n");
+             }
+             bw.close();
+         }
+         catch (Exception e) {
+             e.printStackTrace();
+         }
+     }
+ }
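
The output files written by writeSegmentation thus contain the input sentences in their original order, one per line, with each segment (including the last) terminated by a line of ten '=' characters:

    First sentence of segment one.
    Second sentence of segment one.
    ==========
    First sentence of segment two.
    ==========

(The sentences shown are placeholders; the separator is the literal string written by the code above.)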
source/src/edu/uma/nlp/graphseg/STSHandler.java ADDED
@@ -0,0 +1,37 @@
+ package edu.uma.nlp.graphseg;
+
+ import java.util.ArrayList;
+ import java.util.List;
+
+ import org.javatuples.Triplet;
+
+ import edu.uma.nlp.graphseg.preprocessing.Document;
+ import edu.uma.nlp.graphseg.semantics.InformationContent;
+ import edu.uma.nlp.graphseg.semantics.SemanticSimilarity;
+ import edu.uma.nlp.graphseg.semantics.WordVectorSpace;
+
+ public class STSHandler {
+     public static List<Triplet<Document, Document, Double>> getSemanticSimilarities(List<Document> snippets, double simThreshold, WordVectorSpace vectorSpace, InformationContent informationContent)
+     {
+         List<Triplet<Document, Document, Double>> similarityGraph = new ArrayList<Triplet<Document, Document, Double>>();
+
+         for(int i = 0; i < snippets.size() - 1; i++)
+         {
+             System.out.println("Outer loop: " + String.valueOf(i+1) + "/" + String.valueOf(snippets.size() - 1));
+             for(int j = i + 1; j < snippets.size(); j++)
+             {
+                 //if (j % 100 == 0) System.out.println("Inner loop: " + String.valueOf(j+1) + "/" + String.valueOf(snippets.size()));
+
+                 double similarity = SemanticSimilarity.greedyAlignmentOverlapFScore(snippets.get(i).getTokens(), snippets.get(j).getTokens(), vectorSpace, informationContent, true);
+                 if (similarity > simThreshold)
+                 {
+                     similarityGraph.add(new Triplet<Document, Document, Double>(snippets.get(i), snippets.get(j), similarity));
+                 }
+             }
+         }
+
+         similarityGraph.sort((i1, i2) -> i1.getValue2() > i2.getValue2() ? -1 : (i1.getValue2() < i2.getValue2() ? 1 : 0));
+         return similarityGraph;
+     }
+ }
source/src/edu/uma/nlp/graphseg/Start.java ADDED
@@ -0,0 +1,122 @@
+ package edu.uma.nlp.graphseg;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.nio.file.Files;
+ import java.nio.file.Path;
+ import java.nio.file.Paths;
+ import java.util.ArrayList;
+ import java.util.Arrays;
+ import java.util.List;
+ import java.util.stream.Collectors;
+
+ import org.apache.commons.io.FileUtils;
+ import org.jgrapht.UndirectedGraph;
+ import org.jgrapht.graph.DefaultEdge;
+
+ import edu.uma.nlp.graphseg.preprocessing.Document;
+ import edu.uma.nlp.graphseg.preprocessing.StanfordAnnotator;
+ import edu.uma.nlp.graphseg.semantics.InformationContent;
+ import edu.uma.nlp.graphseg.semantics.SemanticSimilarity;
+ import edu.uma.nlp.graphseg.semantics.WordVectorSpace;
+ import edu.uma.nlp.graphseg.utils.ApplicationConfiguration;
+ import edu.uma.nlp.graphseg.utils.IOHelper;
+ import edu.uma.nlp.graphseg.utils.MemoryStorage;
+
+ public class Start {
+
+     public static void main(String[] args) throws NumberFormatException, IOException {
+
+         // checking the arguments
+         if (args.length < 4)
+         {
+             System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-threshold> <min-segment>");
+             return;
+         }
+
+         File inputDirFile = new File(args[0]);
+         File outputDirFile = new File(args[1]);
+
+         if (!inputDirFile.exists() || !outputDirFile.exists() || !inputDirFile.isDirectory() || !outputDirFile.isDirectory())
+         {
+             System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-threshold (double, in [0,1])> <min-segment (int)>");
+             return;
+         }
+
+         double threshold = 0;
+         try
+         {
+             threshold = Double.parseDouble(args[2]);
+             if (threshold < 0 || threshold > 1)
+             {
+                 // out-of-range values are reported the same way as unparsable ones
+                 throw new NumberFormatException();
+             }
+         }
+         catch(NumberFormatException ex)
+         {
+             System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-threshold (double, in [0,1])> <min-segment (int)>");
+             return;
+         }
+
+         int minseg = 0;
+         try
+         {
+             minseg = Integer.parseInt(args[3]);
+             if (minseg < 1)
+             {
+                 throw new NumberFormatException();
+             }
+         }
+         catch(NumberFormatException ex)
+         {
+             System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-threshold (double, in [0,1])> <min-segment (int, >= 1)>");
+             return;
+         }
+
+         List<String> stopwords = IOHelper.getAllLines(ApplicationConfiguration.config.getValue("stop-words-path"));
+         MemoryStorage.setWordVectorSpace(new WordVectorSpace());
+         MemoryStorage.getWordVectorSpace().load(ApplicationConfiguration.config.getValue("word-vec-path"), null);
+
+         MemoryStorage.setInformationContent(new InformationContent(ApplicationConfiguration.config.getValue("inf-cont-path"), 1));
+
+         SemanticSimilarity.setStopwords(stopwords);
+         GraphHandler.setStopwords(stopwords);
+
+         StanfordAnnotator annotator = new StanfordAnnotator();
+
+         for(Path file : Files.walk(Paths.get(args[0])).filter(x -> (new File(x.toString()).isFile())).collect(Collectors.toList()))
+         {
+             System.out.println("Segmenting file: " + file.toString());
+
+             annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit")));
+
+             String content = FileUtils.readFileToString(new File(file.toString()));
+             Document doc = new Document();
+             doc.setText(content);
+             annotator.annotate(doc);
+
+             annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit", "pos", "lemma")));
+
+             List<Document> snippets = new ArrayList<Document>();
+             for(int i = 0; i < doc.getSentences().size(); i++)
+             {
+                 Document snippet = new Document(doc.getSentences().get(i).getText());
+                 annotator.annotate(snippet);
+                 snippet.setId(String.valueOf(i));
+                 snippets.add(snippet);
+             }
+
+             UndirectedGraph<Integer, DefaultEdge> graph = GraphHandler.constructGraph(snippets, threshold);
+             System.out.println("Computing cliques...");
+             List<List<Integer>> cliques = GraphHandler.getAllCliques(graph);
+
+             ClusteringHandler clusterer = new ClusteringHandler();
+             System.out.println("Constructing linear segments...");
+             List<List<Integer>> sequentialClusters = clusterer.getSequentialClusters(cliques, GraphHandler.getAllSimilarities(), minseg);
+             IOHandler.writeSegmentation(doc.getSentences().stream().map(x -> x.getText()).collect(Collectors.toList()), sequentialClusters, args[1] + (args[1].endsWith("/") ? "" : "/") + file.getFileName().toString());
+         }
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/Annotation.java ADDED
@@ -0,0 +1,36 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+
+ public class Annotation {
+
+     protected HashMap<AnnotationType, List<Annotation>> childAnnotations;
+
+     public Annotation()
+     {
+         childAnnotations = new HashMap<AnnotationType, List<Annotation>>();
+     }
+
+     public List<Annotation> getChildAnnotations(AnnotationType type)
+     {
+         if (childAnnotations.containsKey(type)) return childAnnotations.get(type);
+         else return new ArrayList<Annotation>();
+     }
+
+     public void addChildAnnotation(Annotation annotation, AnnotationType type)
+     {
+         if (!childAnnotations.containsKey(type)) childAnnotations.put(type, new ArrayList<Annotation>());
+         childAnnotations.get(type).add(annotation);
+     }
+
+     public void removeChildAnnotation(Annotation annotation)
+     {
+         // the map is keyed by AnnotationType, so the original lookup of the
+         // annotation object as a key could never match; remove the annotation
+         // from the per-type lists instead
+         for (List<Annotation> annotations : childAnnotations.values())
+         {
+             annotations.remove(annotation);
+         }
+     }
+
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotationType.java ADDED
@@ -0,0 +1,14 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ public enum AnnotationType {
+     Corpus,
+     Document,
+     SentenceAnnotation,
+     TokenAnnotation,
+     MorphologyAnnotation,
+     PartOfSpeechAnnotation,
+     NamedEntityAnnotation,
+     NamedEntityTokenAnnotation,
+     ChunkAnnotation,
+     DependencyAnnotation
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorChain.java ADDED
@@ -0,0 +1,35 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.ArrayList;
+ import java.util.List;
+
+ public class AnnotatorChain {
+
+     private List<IAnnotator> chain;
+
+     public AnnotatorChain()
+     {
+     }
+
+     public AnnotatorChain(List<IAnnotator> annotators)
+     {
+         chain = annotators;
+     }
+
+     public AnnotatorChain addAnnotator(IAnnotator annotator)
+     {
+         if (chain == null) chain = new ArrayList<IAnnotator>();
+         chain.add(annotator);
+         return this;
+     }
+
+     public void apply(Annotation textUnit)
+     {
+         for (int i = 0; i < chain.size(); i++)
+         {
+             chain.get(i).annotate(textUnit);
+         }
+     }
+
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorType.java ADDED
@@ -0,0 +1,11 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ public enum AnnotatorType
+ {
+     SentenceSplitter,
+     Tokenizer,
+     POSTagger,
+     Morphology,
+     NamedEntityExtractor,
+     Chunker
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/Document.java ADDED
@@ -0,0 +1,110 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.stream.Collectors;
+
+ public class Document extends Annotation {
+
+     private String id;
+
+     public String getId() {
+         return id;
+     }
+
+     public void setId(String id) {
+         this.id = id;
+     }
+
+     private String path;
+
+     public String getPath() {
+         return path;
+     }
+
+     public void setPath(String path) {
+         this.path = path;
+     }
+
+     private String text;
+
+     public String getText() {
+         return text;
+     }
+
+     public void setText(String text) {
+         this.text = text;
+     }
+
+     // Lazy loading
+
+     private List<TokenAnnotation> tokens;
+
+     public List<TokenAnnotation> getTokens() {
+         if (tokens == null) tokens = loadTokens();
+         return tokens;
+     }
+
+     private List<SentenceAnnotation> sentences;
+
+     public List<SentenceAnnotation> getSentences() {
+         if (sentences == null) sentences = loadSentences();
+         return sentences;
+     }
+
+     private List<NamedEntityAnnotation> namedEntities;
+
+     public List<NamedEntityAnnotation> getNamedEntities() {
+         if (namedEntities == null) namedEntities = loadNamedEntities();
+         return namedEntities;
+     }
+
+     private List<TokenAnnotation> loadTokens()
+     {
+         if (getSentences() != null)
+         {
+             List<TokenAnnotation> toks = new ArrayList<TokenAnnotation>();
+             for (int i = 0; i < sentences.size(); i++)
+             {
+                 toks.addAll(sentences.get(i).getTokens());
+             }
+
+             toks.sort((t1, t2) -> (t1.getStartPosition() < t2.getStartPosition()) ? -1 : ((t1.getStartPosition() > t2.getStartPosition()) ? 1 : 0));
+             return toks;
+         }
+         else return null;
+     }
+
+     private List<SentenceAnnotation> loadSentences()
+     {
+         if (childAnnotations.containsKey(AnnotationType.SentenceAnnotation))
+             return childAnnotations.get(AnnotationType.SentenceAnnotation).stream().map(x -> (SentenceAnnotation)x).collect(Collectors.toList());
+         else return null;
+     }
+
+     private List<NamedEntityAnnotation> loadNamedEntities()
+     {
+         if (childAnnotations.containsKey(AnnotationType.NamedEntityAnnotation))
+             return childAnnotations.get(AnnotationType.NamedEntityAnnotation).stream().map(x -> (NamedEntityAnnotation)x).collect(Collectors.toList());
+         else return null;
+     }
+
+     // Ctors
+
+     public Document()
+     {
+     }
+
+     public Document(String text)
+     {
+         this.text = text;
+     }
+
+     public Document(String id, String text)
+     {
+         this.id = id;
+         this.text = text;
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/IAnnotator.java ADDED
@@ -0,0 +1,9 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.List;
+
+ public interface IAnnotator
+ {
+     void annotate(Annotation textUnit);
+     List<Annotation> annotate(String text);
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityAnnotation.java ADDED
@@ -0,0 +1,88 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.List;
+ import java.util.stream.Collectors;
+
+ import org.apache.commons.lang3.StringUtils;
+
+ public class NamedEntityAnnotation extends Annotation {
+
+     private NamedEntityType namedEntityType;
+
+     public NamedEntityType getNamedEntityType() {
+         return namedEntityType;
+     }
+
+     public void setNamedEntityType(NamedEntityType namedEntityType) {
+         this.namedEntityType = namedEntityType;
+     }
+
+     private String text;
+
+     public String getText() {
+         if ((text == null || StringUtils.isEmpty(text)) && getTokens().size() > 0)
+         {
+             text = "";
+             for(int i = 0; i < tokens.size(); i++)
+             {
+                 text += tokens.get(i).getText() + " ";
+             }
+             text = text.trim();
+         }
+         return text;
+     }
+
+     public void setText(String text) {
+         this.text = text;
+     }
+
+     private int startPosition;
+
+     public int getStartPosition() {
+         return startPosition;
+     }
+
+     public void setStartPosition(int startPosition) {
+         this.startPosition = startPosition;
+     }
+
+     public Boolean isPerson()
+     {
+         return namedEntityType == NamedEntityType.Person;
+     }
+
+     public Boolean isLocation()
+     {
+         return namedEntityType == NamedEntityType.Location;
+     }
+
+     public Boolean isOrganization()
+     {
+         return namedEntityType == NamedEntityType.Organization;
+     }
+
+     private List<TokenAnnotation> tokens;
+     public List<TokenAnnotation> getTokens()
+     {
+         if (tokens == null) tokens = loadTokens();
+         return tokens;
+     }
+
+     private List<TokenAnnotation> loadTokens()
+     {
+         if (childAnnotations.containsKey(AnnotationType.TokenAnnotation)) return childAnnotations.get(AnnotationType.TokenAnnotation).stream().map(x -> (TokenAnnotation)x).collect(Collectors.toList());
+         else return null;
+     }
+
+     public NamedEntityAnnotation(NamedEntityType type)
+     {
+         namedEntityType = type;
+     }
+
+     public NamedEntityAnnotation(NamedEntityType type, String text, int startPosition)
+     {
+         this.namedEntityType = type;
+         this.text = text;
+         this.startPosition = startPosition;
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityTokenAnnotation.java ADDED
@@ -0,0 +1,38 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ public class NamedEntityTokenAnnotation extends Annotation {
+
+     private String namedEntityLabel;
+
+     public String getNamedEntityLabel() {
+         return namedEntityLabel;
+     }
+
+     public void setNamedEntityLabel(String namedEntityLabel) {
+         this.namedEntityLabel = namedEntityLabel;
+     }
+
+     public NamedEntityTokenAnnotation()
+     {
+     }
+
+     public NamedEntityTokenAnnotation(String label)
+     {
+         namedEntityLabel = label;
+     }
+
+     public Boolean constitutesNamedEntity()
+     {
+         return startsNamedEntity() || insideNamedEntity();
+     }
+
+     public Boolean startsNamedEntity()
+     {
+         return namedEntityLabel.startsWith("B");
+     }
+
+     public Boolean insideNamedEntity()
+     {
+         return namedEntityLabel.startsWith("I");
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityType.java ADDED
@@ -0,0 +1,18 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ public enum NamedEntityType {
+     Person,
+     Location,
+     Organization,
+     Money,
+     Percentage,
+     Date,
+     Time,
+     Ordinal,
+     Percent,
+     Number,
+     Set,
+     Duration,
+     Misc,
+     None
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/PartOfSpeechAnnotation.java ADDED
@@ -0,0 +1,69 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.Arrays;
+ import java.util.List;
+
+ public class PartOfSpeechAnnotation extends Annotation {
+
+     private String tag;
+
+     public String getTag() {
+         return tag;
+     }
+
+     public void setTag(String tag) {
+         this.tag = tag;
+     }
+
+     private String chunkTag;
+
+     public String getChunkTag() {
+         return chunkTag;
+     }
+
+     public void setChunkTag(String chunkTag) {
+         this.chunkTag = chunkTag;
+     }
+
+     private String coarseTag;
+     public String getCoarseTag() {
+         if (tag != null) coarseTag = loadCoarsePoSTag();
+         return coarseTag;
+     }
+
+     private String loadCoarsePoSTag()
+     {
+         if (isNoun()) return "N";
+         else if (isVerb()) return "V";
+         else if (isAdjective()) return "J";
+         else if (isAdverb()) return "R";
+         else return "O";
+     }
+
+     public Boolean isContent()
+     {
+         List<String> otherContentPOS = Arrays.asList("CD", "FW", "MD", "SYM", "UH");
+         return isNoun() || isVerb() || isAdjective() || isAdverb() || otherContentPOS.contains(tag);
+     }
+
+     public Boolean isNoun()
+     {
+         return tag != null && tag.startsWith("N");
+     }
+
+     public Boolean isVerb()
+     {
+         return tag != null && tag.startsWith("V");
+     }
+
+     public Boolean isAdjective()
+     {
+         return tag != null && tag.startsWith("J");
+     }
+
+     public Boolean isAdverb()
+     {
+         return tag != null && tag.startsWith("RB");
+     }
+
+ }
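
The predicates above operate on standard Penn Treebank tags. A minimal, hypothetical usage sketch (PosDemo is not part of the commit; the example tags and words are illustrative):

    import edu.uma.nlp.graphseg.preprocessing.PartOfSpeechAnnotation;

    public class PosDemo {
        public static void main(String[] args) {
            PartOfSpeechAnnotation pos = new PartOfSpeechAnnotation();
            pos.setTag("NNS");                       // plural noun, e.g. "manifestos"
            System.out.println(pos.isContent());     // true: nouns are content words
            System.out.println(pos.getCoarseTag());  // "N"
            pos.setTag("DT");                        // determiner, e.g. "the"
            System.out.println(pos.isContent());     // false: function words are filtered out
        }
    }

GraphHandler relies on exactly this predicate (together with the stopword list) to keep only content tokens when comparing sentences.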
source/src/edu/uma/nlp/graphseg/preprocessing/SentenceAnnotation.java ADDED
@@ -0,0 +1,66 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.List;
+ import java.util.stream.Collectors;
+
+ public class SentenceAnnotation extends Annotation {
+
+     // Fields & properties
+
+     private String text;
+
+     public String getText() {
+         return text;
+     }
+
+     public void setText(String text) {
+         this.text = text;
+     }
+
+     private int startPosition;
+
+     public int getStartPosition() {
+         return startPosition;
+     }
+
+     public void setStartPosition(int startPosition) {
+         this.startPosition = startPosition;
+     }
+
+     public int getEndPosition()
+     {
+         return text != null ? startPosition + text.length() - 1 : startPosition;
+     }
+
+     // Lazy properties
+
+     private List<TokenAnnotation> tokens;
+     public List<TokenAnnotation> getTokens()
+     {
+         if (tokens == null) tokens = loadTokens();
+         return tokens;
+     }
+
+     public void setTokens(List<TokenAnnotation> tokens)
+     {
+         this.tokens = tokens;
+     }
+
+     private List<TokenAnnotation> loadTokens()
+     {
+         if (childAnnotations.containsKey(AnnotationType.TokenAnnotation)) return childAnnotations.get(AnnotationType.TokenAnnotation).stream().map(x -> (TokenAnnotation)x).collect(Collectors.toList());
+         else return null;
+     }
+
+     // Ctors
+     public SentenceAnnotation()
+     {
+     }
+
+     public SentenceAnnotation(String text, int startPosition)
+     {
+         this.text = text;
+         this.startPosition = startPosition;
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/StanfordAnnotator.java ADDED
@@ -0,0 +1,142 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ import java.util.ArrayList;
+ import java.util.Arrays;
+ import java.util.List;
+ import java.util.Properties;
+
+ import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
+ import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
+ import edu.stanford.nlp.ling.CoreLabel;
+ import edu.stanford.nlp.pipeline.StanfordCoreNLP;
+ import edu.stanford.nlp.util.CoreMap;
+
+ public class StanfordAnnotator implements IAnnotator {
+
+     private List<String> stanfordAnnotators;
+     private String stanfordAnnotatorsString;
+
+     public void setStanfordAnnotators(List<String> stanfordAnnotators) {
+         this.stanfordAnnotators = stanfordAnnotators;
+
+         stanfordAnnotatorsString = "";
+         for(int i = 0; i < this.stanfordAnnotators.size(); i++)
+         {
+             if (i == 0) stanfordAnnotatorsString += this.stanfordAnnotators.get(i);
+             else stanfordAnnotatorsString += ", " + this.stanfordAnnotators.get(i);
+         }
+     }
+
+     @Override
+     public void annotate(Annotation textUnit)
+     {
+         if (textUnit instanceof Document)
+         {
+             Properties props = new Properties();
+             props.setProperty("annotators", stanfordAnnotatorsString);
+
+             StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
+
+             edu.stanford.nlp.pipeline.Annotation docAnnotation = new edu.stanford.nlp.pipeline.Annotation(((Document)textUnit).getText());
+             pipeline.annotate(docAnnotation);
+
+             List<CoreMap> sentences = docAnnotation.get(SentencesAnnotation.class);
+
+             for(CoreMap stanfordSentence : sentences)
+             {
+                 SentenceAnnotation sentence = new SentenceAnnotation();
+                 sentence.setText(stanfordSentence.get(TextAnnotation.class));
+                 sentence.setStartPosition(stanfordSentence.get(CharacterOffsetBeginAnnotation.class));
+
+                 for (CoreLabel stanfordToken: stanfordSentence.get(TokensAnnotation.class))
+                 {
+                     TokenAnnotation token = new TokenAnnotation(stanfordToken.get(TextAnnotation.class));
+                     token.setStartPosition(stanfordToken.beginPosition());
+                     token.setSentenceIndex(stanfordToken.sentIndex());
+
+                     if (stanfordAnnotators.contains("lemma"))
+                     {
+                         token.setLemma(stanfordToken.lemma());
+                     }
+
+                     if (stanfordAnnotators.contains("pos"))
+                     {
+                         PartOfSpeechAnnotation posAnnotation = new PartOfSpeechAnnotation();
+                         posAnnotation.setTag(stanfordToken.get(edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation.class));
+                         token.addChildAnnotation(posAnnotation, AnnotationType.PartOfSpeechAnnotation);
+                     }
+
+                     if (stanfordAnnotators.contains("ner"))
+                     {
+                         NamedEntityTokenAnnotation neta = new NamedEntityTokenAnnotation(stanfordToken.get(NamedEntityTagAnnotation.class));
+                         token.addChildAnnotation(neta, AnnotationType.NamedEntityTokenAnnotation);
+                     }
+
+                     sentence.addChildAnnotation(token, AnnotationType.TokenAnnotation);
+                 }
+
+                 // linking continuous token-level NE annotations into whole named entity annotations
+                 if (stanfordAnnotators.contains("ner"))
+                 {
+                     List<NamedEntityAnnotation> nes = new ArrayList<NamedEntityAnnotation>();
+                     NamedEntityAnnotation ne = null;
+                     for(int i = 0; i < sentence.getTokens().size(); i++)
+                     {
+                         String neLabel = sentence.getTokens().get(i).getNamedEntityLabel().getNamedEntityLabel();
+                         String neLabelPrevious = i > 0 ? sentence.getTokens().get(i-1).getNamedEntityLabel().getNamedEntityLabel() : "O";
+
+                         if (neLabel.compareTo("O") == 0)
+                         {
+                             if (ne != null)
+                             {
+                                 nes.add(ne);
+                                 ne = null;
+                             }
+                         }
+                         else if (neLabel.compareTo(neLabelPrevious) != 0)
+                         {
+                             // close the previous entity first: without this, an entity
+                             // immediately followed by an entity of a different type was dropped
+                             if (ne != null)
+                             {
+                                 nes.add(ne);
+                             }
+
+                             NamedEntityType type = Arrays.stream(NamedEntityType.values()).filter(e -> e.name().equalsIgnoreCase(neLabel)).findAny().orElse(null);
+                             if (type == null)
+                             {
+                                 throw new UnsupportedOperationException("Unknown named entity type!");
+                             }
+
+                             ne = new NamedEntityAnnotation(type);
+                             ne.setStartPosition(sentence.getTokens().get(i).getStartPosition());
+                             ne.addChildAnnotation(sentence.getTokens().get(i), AnnotationType.TokenAnnotation);
+                         }
+                         else
+                         {
+                             ne.addChildAnnotation(sentence.getTokens().get(i), AnnotationType.TokenAnnotation);
+                         }
+                     }
+                     if (ne != null) nes.add(ne);
+
+                     nes.forEach(n -> textUnit.addChildAnnotation(n, AnnotationType.NamedEntityAnnotation));
+                 }
+
+                 textUnit.addChildAnnotation(sentence, AnnotationType.SentenceAnnotation);
+             }
+
+             // coreference, crosses sentence borders
+             if (stanfordAnnotators.contains("dcoref"))
+             {
+                 // TODO: coref annotations
+             }
+         }
+         else throw new UnsupportedOperationException("Only whole documents can be processed by Stanford's CoreNLP pipeline");
+     }
+
+     @Override
+     public List<Annotation> annotate(String text)
+     {
+         Document document = new Document(text);
+         annotate(document);
+
+         return new ArrayList<Annotation>(Arrays.asList(document));
+     }
+ }
source/src/edu/uma/nlp/graphseg/preprocessing/TokenAnnotation.java ADDED
@@ -0,0 +1,104 @@
+ package edu.uma.nlp.graphseg.preprocessing;
+
+ public class TokenAnnotation extends Annotation {
+
+     // Fields & properties
+
+     private String text;
+
+     public String getText() {
+         return text;
+     }
+     public void setText(String text) {
+         this.text = text;
+     }
+
+     private String lemma;
+
+     public String getLemma() {
+         return lemma;
+     }
+     public void setLemma(String lemma) {
+         this.lemma = lemma;
+     }
+
+     private int startPosition;
+
+     public int getStartPosition() {
+         return startPosition;
+     }
+
+     public void setStartPosition(int startPosition) {
+         this.startPosition = startPosition;
+     }
+
+     public int getEndPosition() {
+         return text != null ? startPosition + text.length() - 1 : startPosition;
+     }
+
+     private int startPositionSentence;
+
+     public int getStartPositionSentence() {
+         return startPositionSentence;
+     }
+
+     public void setStartPositionSentence(int startPositionSentence) {
+         this.startPositionSentence = startPositionSentence;
+     }
+
+     public int getEndPositionSentence() {
+         return text != null ? startPositionSentence + text.length() - 1 : startPositionSentence;
+     }
+
+     private int sentenceIndex;
+
+     public int getSentenceIndex() {
+         return sentenceIndex;
+     }
+
+     public void setSentenceIndex(int sentenceIndex) {
+         this.sentenceIndex = sentenceIndex;
+     }
+
+     // Lazy loading properties
+
+     private PartOfSpeechAnnotation partOfSpeech;
+     public PartOfSpeechAnnotation getPartOfSpeech()
+     {
+         if (partOfSpeech == null) partOfSpeech = loadPartOfSpeech();
+         return partOfSpeech;
+     }
+
+     private NamedEntityTokenAnnotation namedEntityLabel;
+     public NamedEntityTokenAnnotation getNamedEntityLabel()
+     {
+         if (namedEntityLabel == null) namedEntityLabel = loadTokenNELabel();
+         return namedEntityLabel;
+     }
+
+     private PartOfSpeechAnnotation loadPartOfSpeech()
+     {
+         if (!childAnnotations.containsKey(AnnotationType.PartOfSpeechAnnotation)) this.addChildAnnotation(new PartOfSpeechAnnotation(), AnnotationType.PartOfSpeechAnnotation);
+         return ((PartOfSpeechAnnotation)(getChildAnnotations(AnnotationType.PartOfSpeechAnnotation).get(0)));
+     }
+
+     private NamedEntityTokenAnnotation loadTokenNELabel()
+     {
+         if (!childAnnotations.containsKey(AnnotationType.NamedEntityTokenAnnotation)) return null;
+         else return ((NamedEntityTokenAnnotation)(childAnnotations.get(AnnotationType.NamedEntityTokenAnnotation).get(0)));
+     }
+
+     public TokenAnnotation(String text, int startPosition, int startPositionSentence, int sentenceIndex)
+     {
+         this.text = text;
+         this.startPosition = startPosition;
+         this.startPositionSentence = startPositionSentence;
+         this.sentenceIndex = sentenceIndex;
+     }
+
+     public TokenAnnotation(String text)
+     {
+         this.text = text;
+     }
+ }
source/src/edu/uma/nlp/graphseg/semantics/InformationContent.java ADDED
@@ -0,0 +1,77 @@
package edu.uma.nlp.graphseg.semantics;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;

import edu.uma.nlp.graphseg.utils.IOHelper;

public class InformationContent {

    private HashMap<String, Integer> frequencies = new HashMap<String, Integer>();
    private double sumFrequencies = 0;
    private double minFreq = 1;
    private double divideFactor = 1;

    public InformationContent(String path, double divideFactor) {
        this.divideFactor = divideFactor;

        frequencies = IOHelper.loadCounts(path);
        initialize();
    }

    public InformationContent(InputStream stream, double divideFactor) throws UnsupportedEncodingException, IOException {
        this.divideFactor = divideFactor;

        frequencies = IOHelper.loadCounts(stream);
        initialize();
    }

    public InformationContent(HashMap<String, Integer> frequenciesDictionary, double divideFactor) {
        this.divideFactor = divideFactor;

        frequencies = frequenciesDictionary;
        initialize();
    }

    private void initialize() {
        sumFrequencies = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).sum();
        minFreq = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).min().getAsDouble();
        // Guard against a zero minimum frequency, which would make the smoothed IC infinite
        if (minFreq == 0) minFreq = 1;
    }

    // IC(w) = -log of the (minFreq-smoothed) relative corpus frequency of w;
    // unseen words receive the maximal IC
    public double getInformationContent(String word) {
        if (frequencies.containsKey(word.toLowerCase()))
            return (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies);
        else
            return (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
    }

    // IC(w) normalized by the maximal IC (that of an unseen word), so the result is in (0, 1]
    public double getRelativeInformationContent(String word) {
        double maxInfCont = (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
        double infCont = frequencies.containsKey(word.toLowerCase())
                ? (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies)
                : maxInfCont;

        return infCont / maxInfCont;
    }

    public double getLogRelativeInformationContent(String word) {
        double maxInfCont = (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
        double infCont = frequencies.containsKey(word.toLowerCase())
                ? (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies)
                : maxInfCont;

        return Math.log(infCont) / Math.log(maxInfCont);
    }

    // Product of the ICs of the phrase's words
    public double getInformationContent(List<String> phrase) {
        double ic = 1;
        for (String w : phrase) {
            ic *= getInformationContent(w);
        }
        return ic;
    }
}
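
Note: a minimal usage sketch of the information-content scorer, not part of the commit. The toy counts below are hypothetical; GraphSeg itself loads a large word-frequency resource.

import java.util.HashMap;

import edu.uma.nlp.graphseg.semantics.InformationContent;

public class InformationContentDemo {
    public static void main(String[] args) {
        // Hypothetical corpus counts standing in for a real frequency file
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("the", 1000000);
        counts.put("segmentation", 120);

        InformationContent ic = new InformationContent(counts, 1.0);

        // Frequent words get low IC, rare words higher IC, unseen words the maximum
        System.out.println(ic.getInformationContent("the"));
        System.out.println(ic.getInformationContent("segmentation"));
        System.out.println(ic.getRelativeInformationContent("zeitgeist")); // unseen -> 1.0
    }
}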
source/src/edu/uma/nlp/graphseg/semantics/SemanticSimilarity.java ADDED
@@ -0,0 +1,252 @@
package edu.uma.nlp.graphseg.semantics;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;

import org.javatuples.Triplet;

import edu.uma.nlp.graphseg.preprocessing.TokenAnnotation;
import edu.uma.nlp.graphseg.utils.VectorOperations;

public class SemanticSimilarity {

    private static List<String> stopwords;

    public static void setStopwords(List<String> stwrds) {
        stopwords = stwrds;
    }

    public static double greedyAlignmentOverlapFScore(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue2();
    }

    public static double greedyAlignmentOverlapPrecision(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue0();
    }

    public static double greedyAlignmentOverlapRecall(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue1();
    }

    // Repeatedly aligns the currently most similar token pair between the two phrases and removes it,
    // optionally weighting each aligned pair by the information content of its more informative token.
    // Returns the triple (precision, recall, F1-score) of the alignment overlap.
    public static Triplet<Double, Double, Double> greedyAlignmentOverlap(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        List<TokenAnnotation> firstPhraseCopy = new ArrayList<TokenAnnotation>();
        List<TokenAnnotation> secondPhraseCopy = new ArrayList<TokenAnnotation>();
        if (contentWordsOnly) {
            firstPhraseCopy.addAll(firstPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList()));
            secondPhraseCopy.addAll(secondPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList()));
        }
        else {
            firstPhraseCopy.addAll(firstPhrase);
            secondPhraseCopy.addAll(secondPhrase);
        }

        if (stopwords != null && stopwords.size() > 0) {
            firstPhraseCopy = firstPhraseCopy.stream().filter(t -> !stopwords.contains(t.getLemma().toLowerCase()) && !stopwords.contains(t.getText().toLowerCase())).collect(Collectors.toList());
            secondPhraseCopy = secondPhraseCopy.stream().filter(t -> !stopwords.contains(t.getLemma().toLowerCase()) && !stopwords.contains(t.getText().toLowerCase())).collect(Collectors.toList());
        }

        List<Double> pairSimilarities = new ArrayList<Double>();
        while (firstPhraseCopy.size() > 0 && secondPhraseCopy.size() > 0) {
            double maxSim = -1;
            TokenAnnotation firstToken = null;
            TokenAnnotation secondToken = null;
            for (TokenAnnotation nf : firstPhraseCopy) {
                for (TokenAnnotation ns : secondPhraseCopy) {
                    double sim = vectorSpace.similarity(nf.getText().toLowerCase(), ns.getText().toLowerCase());
                    if (sim < 0) sim = 0;

                    if (sim > maxSim) {
                        firstToken = nf;
                        secondToken = ns;
                        maxSim = sim;
                    }
                }
            }

            // Weight the aligned pair by the IC of its more informative token
            if (informationContent != null) {
                pairSimilarities.add(maxSim * Math.max(informationContent.getInformationContent(firstToken.getText().toLowerCase()), informationContent.getInformationContent(secondToken.getText().toLowerCase())));
            }
            else pairSimilarities.add(maxSim);

            firstPhraseCopy.remove(firstToken);
            secondPhraseCopy.remove(secondToken);
        }

        double precision = 0;
        double recall = 0;
        double overlap = pairSimilarities.stream().mapToDouble(s -> s).sum();

        if (informationContent != null) {
            double infContentFirst = contentWordsOnly ?
                firstPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum() :
                firstPhrase.stream().mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum();

            double infContentSecond = contentWordsOnly ?
                secondPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum() :
                secondPhrase.stream().mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum();

            precision = overlap / infContentFirst;
            recall = overlap / infContentSecond;
        }
        else {
            precision = overlap / firstPhrase.size();
            recall = overlap / secondPhrase.size();
        }

        double fScore = 0;
        if (precision == 0 && recall == 0) fScore = 0;
        else fScore = (2 * precision * recall) / (precision + recall);
        if (Double.isNaN(fScore)) fScore = 0;

        return new Triplet<Double, Double, Double>(precision, recall, fScore);
    }

    // Cosine of the two phrases' embedding sums, each word vector scaled by its information content
    public static double embeddingSumSimilarity(List<TokenAnnotation> first, List<TokenAnnotation> second, WordVectorSpace vectorSpace, int embeddingLength, Boolean content, List<InformationContent> infContents) {
        double[] embeddingFirst = new double[embeddingLength];
        double[] embeddingSecond = new double[embeddingLength];

        if (content) {
            first = first.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList());
            second = second.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList());
        }

        first.forEach(x -> {
            double[] wordEmbedding = vectorSpace.getEmbedding(x.getText().trim());
            if (wordEmbedding == null) {
                wordEmbedding = vectorSpace.getEmbedding(x.getText().trim().toLowerCase());
            }
            if (wordEmbedding != null) {
                // Work on a copy: multiply() mutates its argument, and getEmbedding()
                // returns the array cached inside the vector space
                wordEmbedding = wordEmbedding.clone();
                double ic = 1;
                for (InformationContent inco : infContents) {
                    ic *= inco.getInformationContent(x.getText().trim().toLowerCase());
                }
                VectorOperations.multiply(wordEmbedding, ic);
                VectorOperations.addVector(embeddingFirst, wordEmbedding);
            }
        });

        second.forEach(x -> {
            double[] wordEmbedding = vectorSpace.getEmbedding(x.getText().trim());
            if (wordEmbedding == null) {
                wordEmbedding = vectorSpace.getEmbedding(x.getText().trim().toLowerCase());
            }
            if (wordEmbedding != null) {
                // Same copy-before-scaling as above
                wordEmbedding = wordEmbedding.clone();
                double ic = 1;
                for (InformationContent inco : infContents) {
                    ic *= inco.getInformationContent(x.getText().trim().toLowerCase());
                }
                VectorOperations.multiply(wordEmbedding, ic);
                VectorOperations.addVector(embeddingSecond, wordEmbedding);
            }
        });

        double res;
        try {
            res = VectorOperations.cosine(embeddingFirst, embeddingSecond);
        } catch (Exception e) {
            e.printStackTrace();
            res = 0;
        }
        if (Double.isNaN(res)) {
            res = 0;
        }

        return res;
    }

    public static double averagePhraseGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        double sum = 0;
        double counter = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                double sim = greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                sum += sim;
                counter++;
            }
        }

        double score = sum / counter;
        if (Double.isNaN(score) || Double.isInfinite(score)) return 0;
        else return score;
    }

    public static double maxPhraseGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        double maxSim = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                double sim = greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                if (sim > maxSim) {
                    maxSim = sim;
                }
            }
        }
        return maxSim;
    }

    public static int numSufficientlySimilarPhrasesGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, double threshold, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly) {
        int counter = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                double sim = greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                if (sim >= threshold) {
                    counter++;
                }
            }
        }
        return counter;
    }

    public static HashMap<String, Double> allToAllSimilarity(WordVectorSpace vectorSpace, List<String> vocabulary) {
        HashMap<String, Double> similarities = new HashMap<String, Double>();
        for (int i = 0; i < vocabulary.size() - 1; i++) {
            if (i % 100 == 0) System.out.println("Outer loop: " + String.valueOf(i + 1) + "/" + String.valueOf(vocabulary.size() - 1));
            for (int j = i + 1; j < vocabulary.size(); j++) {
                double sim = vectorSpace.similarity(vocabulary.get(i), vocabulary.get(j));
                // similarity() returns -2 when a word has no embedding; keep only real cosine scores
                if (sim >= -1) {
                    similarities.put(vocabulary.get(i).compareTo(vocabulary.get(j)) < 0 ? vocabulary.get(i) + "<=>" + vocabulary.get(j) : vocabulary.get(j) + "<=>" + vocabulary.get(i), sim);
                }
            }
        }
        return similarities;
    }
}
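
Note: a minimal usage sketch of the greedy alignment similarity, not part of the commit. The vector-file path is a placeholder; any word2vec/GloVe-style text file with one "word v1 v2 ... vn" entry per line should work.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import edu.uma.nlp.graphseg.preprocessing.TokenAnnotation;
import edu.uma.nlp.graphseg.semantics.SemanticSimilarity;
import edu.uma.nlp.graphseg.semantics.WordVectorSpace;

public class SimilarityDemo {
    public static void main(String[] args) throws Exception {
        WordVectorSpace space = new WordVectorSpace();
        space.load("/path/to/word-vectors.txt", null); // placeholder path

        List<TokenAnnotation> s1 = Arrays.asList("parliament", "passed", "the", "law")
                .stream().map(TokenAnnotation::new).collect(Collectors.toList());
        List<TokenAnnotation> s2 = Arrays.asList("congress", "approved", "the", "bill")
                .stream().map(TokenAnnotation::new).collect(Collectors.toList());

        // No IC weighting and no content-word filter: plain greedy alignment F-score
        double f = SemanticSimilarity.greedyAlignmentOverlapFScore(s1, s2, space, null, false);
        System.out.println("similarity = " + f);
    }
}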
source/src/edu/uma/nlp/graphseg/semantics/WordVectorSpace.java ADDED
@@ -0,0 +1,151 @@
package edu.uma.nlp.graphseg.semantics;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.javatuples.Pair;

import edu.uma.nlp.graphseg.utils.VectorOperations;

public class WordVectorSpace {

    private HashMap<String, double[]> embeddings;
    private int dimension;

    public int getDimension() {
        return dimension;
    }

    // Loads embeddings from a text file with one "word v1 v2 ... vn" entry per line.
    // If filters is non-null, only words present (lower-cased) as keys in filters are kept.
    public void load(String path, HashMap<String, Integer> filters) throws FileNotFoundException, IOException {
        embeddings = new HashMap<String, double[]>();

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            int counter = 0;
            while ((line = br.readLine()) != null) {
                try {
                    String split[] = line.trim().split("\\s+");

                    if (filters == null || filters.containsKey(split[0].toLowerCase())) {
                        dimension = split.length - 1;

                        if (!embeddings.containsKey(split[0])) embeddings.put(split[0], new double[split.length - 1]);
                        for (int i = 1; i < split.length; i++) {
                            embeddings.get(split[0])[i - 1] = Double.parseDouble(split[i]);
                        }
                    }
                    counter++;
                    if (counter % 1000 == 0) {
                        System.out.println("Loading vectors... " + String.valueOf(counter));
                    }
                }
                catch (Exception e) {
                    // Skip malformed lines and keep loading
                    System.out.println("Error processing line!");
                    continue;
                }
            }
        }
    }

    public void save(String path) throws Exception {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path))));

        embeddings.forEach((key, value) -> {
            try {
                writer.write(key + " ");
                for (int i = 0; i < value.length; i++) {
                    writer.write(String.valueOf(value[i]) + " ");
                }
                writer.newLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        writer.close();
    }

    // Cosine similarity of the two words' vectors; -2 signals that at least one word is unknown
    public double similarity(String word1, String word2) {
        if (word1.compareTo(word2) == 0) return 1;
        if (embeddings.containsKey(word1) && embeddings.containsKey(word2)) {
            try {
                return VectorOperations.cosine(embeddings.get(word1), embeddings.get(word2));
            } catch (Exception e) {
                return -2;
            }
        }
        else return -2;
    }

    public double[] getEmbedding(String word) {
        if (embeddings.containsKey(word)) return embeddings.get(word);
        else return null;
    }

    public List<Pair<String, Double>> getMostSimilar(String word, int numMostSimilar) {
        List<Pair<String, Double>> mostSimilar = new ArrayList<Pair<String, Double>>();
        if (embeddings.containsKey(word)) {
            embeddings.forEach((key, val) -> {
                // Fixed: the original used reference comparison (key.trim() != word),
                // which does not reliably exclude the query word itself
                if (!key.trim().equals(word)) {
                    double sim;
                    try {
                        sim = VectorOperations.cosine(embeddings.get(word), val);
                    } catch (Exception e) {
                        sim = -2;
                    }
                    if (mostSimilar.size() < numMostSimilar) {
                        mostSimilar.add(new Pair<String, Double>(key, sim));
                        mostSimilar.sort((x, y) -> x.getValue1() > y.getValue1() ? -1 : (x.getValue1() < y.getValue1() ? 1 : 0));
                    }
                    else if (sim > mostSimilar.get(mostSimilar.size() - 1).getValue1()) {
                        // Replace the weakest of the current top-N and re-sort
                        mostSimilar.set(mostSimilar.size() - 1, new Pair<String, Double>(key, sim));
                        mostSimilar.sort((x, y) -> x.getValue1() > y.getValue1() ? -1 : (x.getValue1() < y.getValue1() ? 1 : 0));
                    }
                }
            });

            return mostSimilar;
        }
        else return null;
    }
}
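
Note: a minimal loading sketch, not part of the commit; the path is a placeholder. Passing a filter vocabulary keeps memory use bounded, since only the listed words are stored.

import java.util.HashMap;

import edu.uma.nlp.graphseg.semantics.WordVectorSpace;

public class VectorSpaceDemo {
    public static void main(String[] args) throws Exception {
        // Keys must be lower-cased; the Integer values are ignored by load()
        HashMap<String, Integer> vocabulary = new HashMap<String, Integer>();
        vocabulary.put("segment", 1);
        vocabulary.put("topic", 1);

        WordVectorSpace space = new WordVectorSpace();
        space.load("/path/to/word-vectors.txt", vocabulary); // placeholder path

        System.out.println("dimension: " + space.getDimension());
        // Returns -2 if either word was filtered out or missing from the file
        System.out.println("cos(segment, topic) = " + space.similarity("segment", "topic"));
    }
}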
source/src/edu/uma/nlp/graphseg/utils/ApplicationConfiguration.java ADDED
@@ -0,0 +1,49 @@
package edu.uma.nlp.graphseg.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ApplicationConfiguration {

    // Shared application-wide instance
    public static ApplicationConfiguration config = new ApplicationConfiguration();

    private Properties prop;

    public ApplicationConfiguration() {
        prop = new Properties();
        InputStream inStream = getClass().getClassLoader().getResourceAsStream("config.properties");

        if (inStream != null) {
            try {
                prop.load(inStream);
            }
            catch (IOException e) {
                e.printStackTrace();
            }
            finally {
                try {
                    inStream.close();
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public String getValue(String key) {
        if (prop != null) {
            return prop.getProperty(key);
        }
        else return null;
    }
}
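
Note: a minimal usage sketch, not part of the commit. The key name below is hypothetical; the actual keys are defined in source/src/config.properties, which is bundled on the classpath.

import edu.uma.nlp.graphseg.utils.ApplicationConfiguration;

public class ConfigDemo {
    public static void main(String[] args) {
        // getValue returns null when the key is absent or config.properties was not found
        String value = ApplicationConfiguration.config.getValue("some.key"); // hypothetical key
        System.out.println(value != null ? value : "key not set");
    }
}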
source/src/edu/uma/nlp/graphseg/utils/IOHelper.java ADDED
@@ -0,0 +1,385 @@
package edu.uma.nlp.graphseg.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;

public class IOHelper {

    public static List<String> getAllLines(String path) {
        try {
            return FileUtils.readLines(new File(path));
        } catch (IOException e) {
            System.out.println("File not found or error reading the file: " + path);
            System.out.println(e.getMessage());
            return null;
        }
    }

    public static List<String> getAllLinesWithoutEmpty(String path) {
        try {
            List<String> allLines = Files.readAllLines(Paths.get(path));
            List<String> noEmpty = new ArrayList<String>();

            for (int i = 0; i < allLines.size(); i++) {
                if (!StringUtils.isEmpty(allLines.get(i).trim())) {
                    noEmpty.add(allLines.get(i));
                }
            }

            return noEmpty;

        } catch (IOException e) {
            System.out.println("File not found or error reading the file: " + path);
            return null;
        }
    }

    public static void writeLines(List<String> lines, String path) {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < lines.size(); i++) {
            builder.append(lines.get(i) + "\n");
        }

        try {
            FileUtils.writeStringToFile(new File(path), builder.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void writeCounts(Map<String, Integer> dictionary, String path, Boolean ordered) {
        writeCounts(dictionary.entrySet().stream().collect(Collectors.toList()), path, ordered);
    }

    // Writes "<key> <count>" lines; if ordered, sorts by count descending first
    public static void writeCounts(List<Map.Entry<String, Integer>> entries, String path, Boolean ordered) {
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path)), "UTF-8"));

            if (ordered) entries.sort((i1, i2) -> i1.getValue() > i2.getValue() ? -1 : (i2.getValue() > i1.getValue() ? 1 : 0));

            for (int i = 0; i < entries.size(); i++) {
                bw.write(entries.get(i).getKey() + " " + entries.get(i).getValue() + "\n");
            }

            bw.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void writeScores(Map<String, Double> dictionary, String path, Boolean orderedDescending, Map<String, Integer> additionalData, Boolean mweUnderscore) {
        writeScores(dictionary.entrySet().stream().collect(Collectors.toList()), path, orderedDescending, additionalData, mweUnderscore);
    }

    // Writes "<key> <score>[ <additional>]" lines; with mweUnderscore, multi-word keys are joined with "_"
    public static void writeScores(List<Map.Entry<String, Double>> entries, String path, Boolean orderedDescending, Map<String, Integer> additionalData, Boolean mweUnderscore) {
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path)), "UTF-8"));

            entries.sort((i1, i2) -> i1.getValue() > i2.getValue() ? (orderedDescending ? -1 : 1) : (i2.getValue() > i1.getValue() ? (orderedDescending ? 1 : -1) : 0));

            for (int i = 0; i < entries.size(); i++) {
                String line = "";
                if (mweUnderscore) {
                    String[] split = entries.get(i).getKey().split("\\s+");
                    StringBuilder singlePhrase = new StringBuilder();
                    for (int j = 0; j < split.length; j++) {
                        singlePhrase.append(split[j]);
                        if (j < split.length - 1) singlePhrase.append("_");
                    }
                    line += singlePhrase.toString() + " ";
                }
                else line = entries.get(i).getKey() + " ";

                line += String.valueOf(entries.get(i).getValue());

                if (additionalData != null) {
                    line += " " + String.valueOf(additionalData.get(entries.get(i).getKey()));
                }
                bw.write(line.trim() + "\n");
            }

            bw.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Loads "<word> <count>" lines; on duplicate words the first occurrence wins
    public static HashMap<String, Integer> loadCounts(String path) {
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++) {
            String split[] = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0])) {
                dict.put(split[0], Integer.parseInt(split[1]));
            }
        }
        return dict;
    }

    public static HashMap<String, Integer> loadCounts(InputStream stream) throws UnsupportedEncodingException, IOException {
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim())) {
                    String split[] = line.split("\\s+");
                    if (!dict.containsKey(split[0])) {
                        dict.put(split[0], Integer.parseInt(split[1]));
                    }
                }
            }
        }
        return dict;
    }

    public static HashMap<String, Double> loadScores(String path) {
        HashMap<String, Double> dict = new HashMap<String, Double>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++) {
            String split[] = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0])) {
                dict.put(split[0], Double.parseDouble(split[1]));
            }
        }
        return dict;
    }

    // Copies the first numLines lines of a (potentially huge) file to outputPath
    public static void peekTopLines(String inputPath, String outputPath, int numLines) {
        List<String> lines = new ArrayList<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(inputPath))) {
            for (int i = 0; i < numLines; i++) {
                String line = br.readLine();
                if (line == null) break; // fewer lines in the file than requested
                lines.add(line);
            }
            IOHelper.writeLines(lines, outputPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static Map<String, Double> loadScoresLineByLine(String path) {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim())) {
                    String split[] = line.split("\\s+");
                    if (!dict.containsKey(split[0])) {
                        dict.put(split[0], Double.parseDouble(split[1]));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    // Assigns each word its (1-based) non-empty line number as its rank
    public static Map<String, Integer> loadRanks(String path) {
        Map<String, Integer> dict = Collections.synchronizedMap(new HashMap<String, Integer>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            int counter = 0;
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim())) {
                    counter++;
                    String split[] = line.split("\\s+");
                    if (!dict.containsKey(split[0])) {
                        dict.put(split[0], counter);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    // Loads only entries with score >= threshold; if the file is sorted descending,
    // reading stops at the first entry below the threshold
    public static Map<String, Double> loadScoresLineByLine(String path, double threshold, Boolean sorted) {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim())) {
                    String split[] = line.split("\\s+");
                    if (!dict.containsKey(split[0])) {
                        Double score = Double.parseDouble(split[1]);
                        if (score >= threshold) dict.put(split[0], score);
                        else if (sorted) {
                            return dict;
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static Map<String, Double> loadScoresLineByLine(String path, int topN) {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (int i = 0; i < topN; i++) {
                String line = br.readLine();
                if (line == null) break; // fewer lines in the file than topN
                if (StringUtils.isNotEmpty(line.trim())) {
                    String split[] = line.split("\\s+");
                    if (!dict.containsKey(split[0])) {
                        dict.put(split[0], Double.parseDouble(split[1]));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static HashMap<String, String> loadMappings(String path) {
        HashMap<String, String> dict = new HashMap<String, String>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++) {
            String split[] = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0])) {
                dict.put(split[0], split[1]);
            }
        }
        return dict;
    }

    public static HashMap<String, List<String>> loadMultiMappings(String path) {
        HashMap<String, List<String>> dict = new HashMap<String, List<String>>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++) {
            String split[] = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0])) {
                dict.put(split[0], new ArrayList<String>());
            }
            dict.get(split[0]).add(split[1]);
        }
        return dict;
    }

    public static void writeStringToFile(String content, String path) throws IOException {
        FileUtils.writeStringToFile(new File(path), content, "UTF-8");
    }
}
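
Note: all loaders above expect one whitespace-separated record per line ("word count", "word score", or "word mapping"), and on duplicate keys the first line wins. A minimal round trip, not part of the commit; the output path is a placeholder:

import java.util.HashMap;

import edu.uma.nlp.graphseg.utils.IOHelper;

public class CountsDemo {
    public static void main(String[] args) {
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("segment", 42);
        counts.put("topic", 17);

        // Writes "word count" lines, most frequent first
        IOHelper.writeCounts(counts, "/tmp/counts.txt", true); // placeholder path

        HashMap<String, Integer> reloaded = IOHelper.loadCounts("/tmp/counts.txt");
        System.out.println(reloaded.get("segment")); // 42
    }
}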
source/src/edu/uma/nlp/graphseg/utils/MemoryStorage.java ADDED
@@ -0,0 +1,26 @@
package edu.uma.nlp.graphseg.utils;

import edu.uma.nlp.graphseg.semantics.InformationContent;
import edu.uma.nlp.graphseg.semantics.WordVectorSpace;

// Static holders for resources shared across the application (word vectors, IC frequencies),
// so they are loaded only once per run
public class MemoryStorage {

    private static WordVectorSpace wordVectorSpace;

    public static WordVectorSpace getWordVectorSpace() {
        return wordVectorSpace;
    }

    public static void setWordVectorSpace(WordVectorSpace wordVectorSpace) {
        MemoryStorage.wordVectorSpace = wordVectorSpace;
    }

    private static InformationContent informationContent;

    public static InformationContent getInformationContent() {
        return informationContent;
    }

    public static void setInformationContent(InformationContent informationContent) {
        MemoryStorage.informationContent = informationContent;
    }
}
source/src/edu/uma/nlp/graphseg/utils/VectorOperations.java ADDED
@@ -0,0 +1,45 @@
package edu.uma.nlp.graphseg.utils;

public class VectorOperations {

    public static double cosine(double[] vector, double[] otherVector) throws Exception {
        if (vector.length != otherVector.length) {
            throw new UnsupportedOperationException("Vectors are of different length");
        }

        double dp = 0;
        double sum1 = 0;
        double sum2 = 0;

        for (int i = 0; i < vector.length; i++) {
            dp += vector[i] * otherVector[i];
            sum1 += vector[i] * vector[i];
            sum2 += otherVector[i] * otherVector[i];
        }

        return dp / (Math.sqrt(sum1) * Math.sqrt(sum2));
    }

    // Scales vector in place by factor
    public static void multiply(double[] vector, double factor) {
        for (int i = 0; i < vector.length; i++) vector[i] *= factor;
    }

    // Returns a new vector; fixed: the original was a non-static method whose body
    // (result[i] = vector[i] += otherVector[i]) also mutated its first argument
    public static double[] sumVectors(double[] vector, double[] otherVector) {
        if (vector.length != otherVector.length) throw new UnsupportedOperationException("Vectors are of different length");

        double[] result = new double[vector.length];
        for (int i = 0; i < vector.length; i++) result[i] = vector[i] + otherVector[i];

        return result;
    }

    // Adds otherVector to vector in place
    public static void addVector(double[] vector, double[] otherVector) {
        if (vector.length != otherVector.length) throw new UnsupportedOperationException("Vectors are of different length");
        for (int i = 0; i < vector.length; i++) vector[i] += otherVector[i];
    }
}
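
Note: a quick sanity check for cosine(), not part of the commit; the vectors are chosen so the exact answer is 0.5.

import edu.uma.nlp.graphseg.utils.VectorOperations;

public class CosineDemo {
    public static void main(String[] args) throws Exception {
        double[] a = { 1.0, 0.0 };
        double[] b = { 1.0, Math.sqrt(3.0) };
        // dot = 1, |a| = 1, |b| = 2, so cosine = 1 / (1 * 2) = 0.5
        System.out.println(VectorOperations.cosine(a, b));
    }
}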