xin committed
Commit 22738ca
Parent: ec77065

initial commit

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +2 -0
  2. App/__init__.py +0 -0
  3. App/__pycache__/__init__.cpython-38.pyc +3 -0
  4. App/assets/FakeConcept +2 -0
  5. App/assets/FilterS +3 -0
  6. App/assets/FilterWords +7 -0
  7. App/assets/Incrementer +4 -0
  8. App/assets/OneCommaDiscriminator +11 -0
  9. App/assets/abbreviation_sentence_splitter +24 -0
  10. App/assets/claims_indices +1 -0
  11. App/assets/commonWords +0 -0
  12. App/assets/dicts/dec.yml +2 -0
  13. App/assets/dicts/inc.yml +4 -0
  14. App/assets/dicts/inv.yml +6 -0
  15. App/assets/dicts/negative.yml +102 -0
  16. App/assets/dicts/positive.yml +113 -0
  17. App/assets/dropPart +39 -0
  18. App/assets/examplificationclues +8 -0
  19. App/assets/exclude_from_parameters +22 -0
  20. App/assets/exclusionList +73 -0
  21. App/assets/getFromClaims +1 -0
  22. App/assets/includeLinks +7 -0
  23. App/assets/inclusionList +17 -0
  24. App/assets/parameter_core +237 -0
  25. App/assets/problem_markers +151 -0
  26. App/assets/referencing_indices +4 -0
  27. App/assets/removeItems +22 -0
  28. App/assets/stopword_common_English +500 -0
  29. App/assets/stopword_patent_English +38 -0
  30. App/assets/stopword_rake.txt +590 -0
  31. App/assets/stopwords +567 -0
  32. App/assets/trainingsNegative +216 -0
  33. App/assets/trainingsPositive +213 -0
  34. App/assets/wordAfterNumber +95 -0
  35. App/assets/wordBeforeNumber +20 -0
  36. App/assets/wordtagVerb +2 -0
  37. App/bin/ClassifierWithIncr.py +189 -0
  38. App/bin/ComplexParser.py +45 -0
  39. App/bin/CorpusProcessor.py +460 -0
  40. App/bin/FiguresCleaner.py +44 -0
  41. App/bin/FindTechnologies.py +64 -0
  42. App/bin/InformationExtractor.py +588 -0
  43. App/bin/InformationExtractor_Claims.py +165 -0
  44. App/bin/InputHandler.py +35 -0
  45. App/bin/MagicParser.py +45 -0
  46. App/bin/PGProcessor.py +107 -0
  47. App/bin/ParamProcessor.py +99 -0
  48. App/bin/ParameterExtractor.py +51 -0
  49. App/bin/PatentHandler.py +254 -0
  50. App/bin/SentenceClassifier.py +60 -0
.gitattributes CHANGED
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ *.pyc filter=lfs diff=lfs merge=lfs -text
App/__init__.py ADDED
File without changes
App/__pycache__/__init__.cpython-38.pyc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ecebdb48cd6c7c6738e1a46ec4a92320a1e39163fdf8b688b16d9101dc77606
+ size 161
App/assets/FakeConcept ADDED
@@ -0,0 +1,2 @@
+ such as
+ as such
App/assets/FilterS ADDED
@@ -0,0 +1,3 @@
+ figure
+ figures
+ previous
App/assets/FilterWords ADDED
@@ -0,0 +1,7 @@
+ if
+ claim 1
+ claim 2
+ claim 3
+ however
+ but
+ cause
App/assets/Incrementer ADDED
@@ -0,0 +1,4 @@
+ approximately
+ suitable
+ good
+ preferably
App/assets/OneCommaDiscriminator ADDED
@@ -0,0 +1,11 @@
+ preferably,\s
+ in addition
+ which
+ thus
+ preferably
+ but
+ generally
+ conventional
+ in particular
+ specifically
+ as necessary
App/assets/abbreviation_sentence_splitter ADDED
@@ -0,0 +1,24 @@
+ vol
+ no
+ pp
+ p
+ ch
+ pat
+ i.e
+ fig
+ vdd
+ p
+ sec
+ centigrade
+ vols
+ figs
+ approx
+ e.g
+ etc
+ cf
+ ser
+ \]
+ deg
+ ver
+ sup
+ A
App/assets/claims_indices ADDED
@@ -0,0 +1 @@
+ ^.*\bclaim [1-3]\b\s?,?(.*$)
App/assets/commonWords ADDED
The diff for this file is too large to render. See raw diff
App/assets/dicts/dec.yml ADDED
@@ -0,0 +1,2 @@
+ barely: [dec]
+ little: [dec]
App/assets/dicts/inc.yml ADDED
@@ -0,0 +1,4 @@
+ too: [inc]
+ very: [inc]
+ sorely: [inc]
+ is: [inc]
App/assets/dicts/inv.yml ADDED
@@ -0,0 +1,6 @@
+ increase: [inv]
+ decrease: [inv]
+ increases: [inv]
+ decreases: [inv]
+ increased: [inv]
+ decreased: [inv]
App/assets/dicts/negative.yml ADDED
@@ -0,0 +1,102 @@
+ bad: [negative]
+ uninspired: [negative]
+ expensive: [negative]
+ dissapointed: [negative]
+ recommend others to avoid: [negative]
+ do not: [negative]
+ costly: [negative]
+ drawbacks: [negative]
+ drawbacks: [negative]
+ problem: [negative]
+ uneven: [negative]
+ perturbation: [negative]
+ may not: [negative]
+ is not: [negative]
+ are not: [negative]
+ fragil: [negative]
+ damage: [negative]
+ prevent: [negative]
+ can vary: [negative]
+ can not: [negative]
+ cannot: [negative]
+ due to: [negative]
+ thus no longer: [negative]
+ no longer: [negative]
+ annoying: [negative]
+ annoy: [negative]
+ annoys: [negative]
+ annoyed: [negative]
+ saturated: [negative]
+ undesirable: [negative]
+ pollute: [negative]
+ pollutes: [negative]
+ saturates: [negative]
+ saturating: [negative]
+ saturate: [negative]
+ blocked: [negative]
+ dirt: [negative]
+ dirts: [negative]
+ complexity: [negative]
+ tiresome: [negative]
+ does not: [negative]
+ abnormally: [negative]
+ abruptly: [negative]
+ critically: [negative]
+ dangerously: [negative]
+ disadvantages: [negative]
+ disadvantage: [negative]
+ dramatically: [negative]
+ erroneously: [negative]
+ exceedingly: [negative]
+ excessively: [negative]
+ hardly: [negative]
+ heavily: [negative]
+ irreversibly: [negative]
+ poorly: [negative]
+ randomly: [negative]
+ severely: [negative]
+ unacceptably: [negative]
+ unconditionally: [negative]
+ unevenly: [negative]
+ unexpectedly: [negative]
+ unfortunately: [negative]
+ unusually: [negative]
+ uselessly: [negative]
+ badly: [negative]
+ impair: [negative]
+ increasing: [negative]
+ weight: [negative]
+ costs: [negative]
+ cost: [negative]
+ is not: [negative]
+ damage: [negative]
+ complicate: [negative]
+ destroy: [negative]
+ was necessary: [negative]
+ annoy: [negative]
+ annoys: [negative]
+ annoyed: [negative]
+ annoying: [negative]
+ noise: [negative]
+ inevitably: [negative]
+ be annoying: [negative]
+ is annoying: [negative]
+ difficulties: [negative]
+ difficulty: [negative]
+ difficult: [negative]
+ fatty: [negative]
+ prevent: [negative]
+ leakage: [negative]
+ corrosive: [negative]
+ require: [negative]
+ deformation: [negative]
+ necessity: [negative]
+ error: [negative]
+ errors: [negative]
+ occur: [negative]
+ occurs: [negative]
+ may nevertheless: [negative]
+ may not: [negative]
+ continue to increase: [negative]
+ decrease: [negative]
+
App/assets/dicts/positive.yml ADDED
@@ -0,0 +1,113 @@
+ nice: [positive]
+ awesome: [positive]
+ cool: [positive]
+ superb: [positive]
+ comprises: [positive]
+ is: [positive]
+ advantageously: [positive]
+ advantageous: [positive]
+ preferably: [positive]
+ advantage: [positive]
+ advantages: [positive]
+ avoid: [positive]
+ useful: [positve]
+ good: [positive]
+ suitable: [positive]
+ be: [positive]
+ improve: [positive]
+ improved: [positive]
+ improves: [positive]
+ does not have to: [positive]
+ removed: [positive]
+ acceptably: [positive]
+ accurately: [positive]
+ acutely: [positive]
+ adequately: [positive]
+ advantageously: [positive]
+ appreciably: [positive]
+ beneficially: [positive]
+ brightly: [positive]
+ cheaply: [positive]
+ clearly: [positive]
+ conveniently: [positive]
+ correctly: [positive]
+ cost-effectively: [positive]
+ desirably: [positive]
+ effectively: [positive]
+ effectually: [positive]
+ efficaciously: [positive]
+ efficiently: [positive]
+ elaborately: [positive]
+ favourably: [positive]
+ finely: [positive]
+ greatly: [positive]
+ inexpensively: [positive]
+ interestingly: [positive]
+ measurably: [positive]
+ necessarily: [positive]
+ determines: [positive]
+ determine: [positive]
+ optimally: [positive]
+ potentially: [positive]
+ precisely: [positive]
+ properly: [positive]
+ purposefully: [positive]
+ qualitatively: [positive]
+ quantitatively: [positive]
+ quite: [positive]
+ rapidly: [positive]
+ safely: [positive]
+ satisfactorily: [positive]
+ securely: [positive]
+ sharply: [positive]
+ uniform: [positive]
+ distribution: [positive]
+ reduce: [positive]
+ reduced: [positive]
+ reduces: [positive]
+ desire: [positive]
+ desired: [positive]
+ desires: [positive]
+ have: [positive]
+ has: [positive]
+ be: [positive]
+ about: [positive]
+ drive: [positive]
+ drives: [positive]
+ generally: [positive]
+ accomplish: [positive]
+ accomplished: [positive]
+ consists: [positive]
+ consist: [positive]
+ include: [positive]
+ includes: [positive]
+ remove: [positive]
+ removes: [positive]
+ present: [positive]
+ presents: [positive]
+ separates: [positive]
+ connected: [positive]
+ enclosed: [positive]
+ with respect: [positive]
+ disconnected: [positive]
+ relevance: [positive]
+ extend: [positive]
+ enclose: [positive]
+ create: [positive]
+ creates: [positive]
+ so that: [positive]
+ well known: [positive]
+ contain: [positive]
+ contains: [positive]
+ receive: [positive]
+ receives: [positive]
+ processes: [positive]
+ processe: [positive]
+ optimal: [positive]
+ inexpensive: [positive]
+ operates: [positive]
+ operate: [positive]
+ minimize: [positive]
+ can not therefore invade: [positive]
+ installed: [positive]
+ install: [positive]
App/assets/dropPart ADDED
@@ -0,0 +1,39 @@
+ DETAILED DESCRIPTION OF THE INVENTION
+ BRIEF DESCRIPTION OF DRAWINGS
+ BACKGROUND OF INVENTION
+ BACKGROUND OF THE INVENTION
+ FIELD OF INVENTION
+ BRIEF DESCRIPTION OF THE DRAWINGS
+ Brief Description of the Drawings
+ Brief Description of the Prior Art
+ DETAILED DESCRIPTION OF THE PREFERRED EMBODIMENTS
+ Object of the Invention
+ DETALIED DESCRIPTION
+ Field of the Invention
+ Background Art
+ Background art
+ Object of the Invention
+ DESCRIPTION
+ BACKGROUND
+ DRAWINGS
+ SUMMARY
+ Brief Description of the Prior Art
+ detailed description of the invention and preferred embodiments
+ detailed description of the preferred embodiments
+ detailed description of the preferred embodiment
+ detailed description
+ background of the invention
+ Background of the Invention
+ detailed description of the invention
+ summary of the inventions
+ summary of the invention
+ background and summary of the invention
+ field of the invention
+ description of the related art
+ related background art
+ description of the prior art
+ field of the invention
+ brief description of the invention
+ description of the invention
+ description of the preferred embodiments.
+ what is claimed is
App/assets/examplificationclues ADDED
@@ -0,0 +1,8 @@
+ moreover,\s
+ in addition,\s
+ in pratice,\s
+ lastly,\s
+ such a\s
+ such\s
+ typically,\s
+ in this example,\s
App/assets/exclude_from_parameters ADDED
@@ -0,0 +1,22 @@
+ invention
+ comprising
+ that
+ initiation
+ between
+ investigation
+ investigations
+ example
+ illustration
+ examples
+ illustrations
+ thereof
+ increase
+ decrease
+ under
+ problem
+ as
+ parison
+ regardless
+ by
+ through
+ another
App/assets/exclusionList ADDED
@@ -0,0 +1,73 @@
+ figure
+ figures
+ shows
+ ;
+ wherein
+ the present invention
+ list of reference characters
+ brief description of the drawings
+ detailed description of preferred embodiments
+ description of a preferred embodiment
+ :
+ for example
+ this
+ drawing
+ drawings
+ embodiments
+ embodiment
+ arrangement
+ arrangements
+ invention
+ inventions
+ provisional application
+ provisional patent application
+ method
+ present application
+ international application
+ reference characters
+ description
+ descriptions
+ in addition
+ likewise
+ figure
+ Figure
+ Figures
+ figures
+ comprise
+ comprises
+ comprising
+ Another purpose
+ related patent
+ DETAILED
+ OVERVIEW
+ summary
+ present disclosure
+ cross-references
+ cross-reference
+ disclosed herein
+ aforesaid document
+ describes
+ describe
+ relates
+ relate
+ present inventive
+ Description of the Related Art
+ publication
+ Publication
+ referred
+ we think
+ thereafter
+ Apparatus
+ apparatus
+ what is claimed is
+ What is claimed is
+ herein
+ Herein
+ will not be presented more in details
+ discloses
+ previous
+ this
+ This
+ table
+ Table
+ TABLE
App/assets/getFromClaims ADDED
@@ -0,0 +1 @@
+ wherein
App/assets/includeLinks ADDED
@@ -0,0 +1,7 @@
+ However
+ however
+ If
+ Because
+ When\b
+ Since
+ since
App/assets/inclusionList ADDED
@@ -0,0 +1,17 @@
+ however
+ may
+ if
+ are
+ advantageously
+ may be
+ may cause
+ cause
+ causes
+ is
+ when
+ since
+ in order to
+ in practice
+ many
+ Many
+ problem
App/assets/parameter_core ADDED
@@ -0,0 +1,237 @@
+ ability
+ absorption
+ accent
+ accuracy
+ adaptability
+ air pressure
+ albedo
+ amount
+ anatomy
+ area
+ arena
+ atmospheric pressure
+ automation
+ bar
+ baron
+ beat
+ bod
+ boiling point
+ book
+ breadth
+ brittleness
+ broker
+ build
+ bulk
+ cadence
+ capacitance
+ catching
+ charge
+ chassis
+ cistron
+ color
+ comfort
+ commission
+ complexity
+ complexness
+ concentration
+ condition
+ configuration
+ conformation
+ constancy
+ constituent
+ content
+ continuance
+ contour
+ density
+ departure
+ dependability
+ dependableness
+ deprivation
+ dielectric constant
+ difficultness
+ difficulty
+ direction
+ distance
+ distribution
+ divisor
+ domain
+ ductility
+ durability
+ duration
+ ease
+ easiness
+ effect
+ effectiveness
+ efficacy
+ electric charge
+ electric field
+ electric potential
+ electrical conductivity
+ electrical impedance
+ electrical resistivity
+ emission
+ emphasis
+ enduringness
+ energy
+ espial
+ essence
+ exercising weight
+ exit
+ expanse
+ expiration
+ exponent
+ extent
+ fabrication
+ factor
+ factors
+ fastness
+ field
+ flesh
+ flexibility
+ flow
+ flow rate
+ flowing
+ fluidity
+ focus
+ force
+ force-out
+ forcefulness
+ form
+ frequency
+ gist
+ guidance
+ hardness
+ height
+ hurrying
+ illuminance
+ illumination
+ index
+ inductance
+ informality
+ ingredient
+ insistence
+ insistency
+ instruction
+ intensity
+ intensiveness
+ intrinsic impedance
+ inwardness
+ irradiance
+ issue
+ kernel
+ king
+ length
+ level
+ light
+ location
+ loss
+ loudness
+ luminance
+ luster
+ magnate
+ magnetic field
+ magnetic flux
+ malleability
+ management
+ manufacture
+ manufacturing
+ marrow
+ mass
+ material body
+ meaning
+ measure
+ measurement
+ measuring
+ melting point
+ menses
+ menstruation
+ mensuration
+ meter
+ metre
+ mightiness
+ moment
+ momentum
+ muscularity
+ number
+ office
+ orbit
+ passing
+ pattern
+ period
+ permeability
+ permittivity
+ physical body
+ physique
+ pith
+ point
+ posture
+ potency
+ power
+ powerfulness
+ preciseness
+ precision
+ pressure
+ productiveness
+ productivity
+ push
+ quantity
+ radiance
+ rate
+ rate of flow
+ ratio
+ reflectivity
+ region
+ release
+ reliability
+ reliableness
+ resistivity
+ saturation
+ shape
+ simpleness
+ simplicity
+ solubility
+ speciality
+ specialty
+ specific heat
+ speed
+ speeding
+ sphere
+ spin
+ spotting
+ spying
+ stability
+ stableness
+ standard
+ steering
+ step
+ strain
+ stream
+ strength
+ stress
+ substance
+ surface
+ swiftness
+ temperature
+ tenseness
+ tension
+ thermal conductivity
+ touchstone
+ trouble
+ truth
+ tycoon
+ velocity
+ versatility
+ vigor
+ vigour
+ violence
+ viscosity
+ vitality
+ vividness
+ voltage
+ volume
+ wave impedance
+ weight
+ weightiness
+ weighting
+ weights
+ width
App/assets/problem_markers ADDED
@@ -0,0 +1,151 @@
+ would
+ may
+ cause without present invention or method etc
+ conventional
+ problem
+ turbulence
+ polluting
+ aggravate
+ uneven
+ irregular
+ pertubation
+ disturbance
+ dissipation
+ affect
+ noise-inducing
+ dirt
+ undesirable
+ pollute
+ contaminant
+ difficulty
+ difficult
+ rancid
+ hazard
+ time-consuming
+ painstaking
+ byproduct
+ leak
+ leakage
+ blemish
+ blemished
+ blemishes
+ blemishing
+ blemishings
+ break
+ breaking
+ breakings
+ breaks
+ broken
+ bug
+ bugs
+ cause
+ caused
+ causes
+ causing
+ complicate
+ complicated
+ complicates
+ complicating
+ complication
+ crack
+ cracked
+ cracking
+ crackings
+ critical
+ damage
+ damaged
+ damages
+ damaging
+ defect
+ defected
+ defecting
+ defection
+ defections
+ defects
+ deficiencies
+ deficiency
+ deform
+ deformed
+ deformities
+ deformity
+ degradation
+ degrade
+ degraded
+ degrades
+ deprivation
+ deprive
+ deprived
+ deprives
+ depriving
+ destroy
+ destroyed
+ destroying
+ destroys
+ destruction
+ deteriorate
+ deteriorates
+ deteriorating
+ deteriored
+ detriment
+ difficulties
+ difficulty
+ difficult
+ disadvantage
+ disparates
+ drawbacks
+ grave
+ hard
+ hamper
+ hampered
+ hampering
+ hampers
+ harm
+ harmed
+ harming
+ harms
+ hinder
+ impair
+ impaired
+ impairing
+ impairs
+ imperfection
+ imperfections
+ incident
+ instabilities
+ instability
+ mar
+ marring
+ prejudice
+ problem
+ problems
+ serious
+ severe
+ smashes
+ smashing
+ spoil
+ spoiling
+ stain
+ stains
+ trouble
+ troubled
+ troubles
+ weaken
+ failure
+ unintended
+ limitation
+ limitations
+ drawback
+ weakened
+ degrading
+ undesired
+ weakening
+ disadvantageous
+ weakness
+ weaknesses
+ worse
+ worseing
+ worsen
+ worsened
+ worsens
+
+
App/assets/referencing_indices ADDED
@@ -0,0 +1,4 @@
+ explained before
+ but as
+ most of
+ ^if
App/assets/removeItems ADDED
@@ -0,0 +1,22 @@
+ brief description of the drawings
+ detailed description of preferred embodiments
+ description of a preferred embodiment
+ detailed description of the invention and preferred embodiments
+ detailed description of the preferred embodiments
+ detailed description of the preferred embodiment
+ detailed description
+ background of the invention
+ detailed description of the invention
+ summary of the inventions
+ summary of the invention
+ background and summary of the invention
+ field of the invention
+ description of the related art
+ related background art
+ description of the prior art
+ field of the invention
+ brief description of the invention
+ description of the invention
+ description of the preferred embodiments.
+ summary
+ mitsubishi heavy ind ltd [jp]
App/assets/stopword_common_English ADDED
@@ -0,0 +1,500 @@
+ a
+ able
+ about
+ above
+ abst
+ accordance
+ according
+ accordingly
+ across
+ act
+ actually
+ added
+ adj
+ affected
+ affecting
+ affects
+ after
+ afterwards
+ again
+ against
+ ah
+ all
+ almost
+ alone
+ along
+ already
+ also
+ although
+ always
+ am
+ among
+ amongst
+ an
+ and
+ announce
+ another
+ any
+ anybody
+ anyhow
+ anymore
+ anyone
+ anything
+ anyway
+ anyways
+ anywhere
+ apparently
+ approximately
+ are
+ aren
+ arent
+ arise
+ around
+ as
+ aside
+ ask
+ asking
+ at
+ auth
+ available
+ away
+ awfully
+ b
+ back
+ be
+ became
+ because
+ become
+ becomes
+ becoming
+ been
+ before
+ beforehand
+ begin
+ beginning
+ beginnings
+ begins
+ behind
+ being
+ believe
+ below
+ beside
+ besides
+ between
+ beyond
+ biol
+ both
+ brief
+ briefly
+ but
+ by
+ c
+ ca
+ came
+ can
+ cannot
+ can't
+ cause
+ causes
+ certain
+ certainly
+ co
+ com
+ come
+ comes
+ contain
+ containing
+ contains
+ could
+ couldnt
+ d
+ date
+ did
+ didn't
+ different
+ do
+ does
+ doesn't
+ doing
+ done
+ don't
+ down
+ downwards
+ due
+ during
+ e
+ each
+ ed
+ edu
+ effect
+ eg
+ eight
+ eighty
+ either
+ else
+ elsewhere
+ end
+ ending
+ enough
+ especially
+ et
+ et-al
+ etc
+ even
+ ever
+ every
+ everybody
+ everyone
+ everything
+ everywhere
+ ex
+ except
+ f
+ far
+ few
+ ff
+ fifth
+ first
+ five
+ fix
+ followed
+ following
+ follows
+ for
+ former
+ formerly
+ forth
+ found
+ four
+ from
+ further
+ furthermore
+ g
+ gave
+ get
+ gets
+ getting
+ give
+ given
+ gives
+ giving
+ go
+ goes
+ gone
+ got
+ gotten
+ h
+ had
+ happens
+ hardly
+ has
+ hasn't
+ have
+ haven't
+ having
+ he
+ hed
+ hence
+ her
+ here
+ hereafter
+ hereby
+ herein
+ heres
+ hereupon
+ hers
+ herself
+ hes
+ hi
+ hid
+ him
+ himself
+ his
+ hither
+ home
+ how
+ howbeit
+ however
+ hundred
+ i
+ id
+ ie
+ if
+ i'll
+ im
+ immediate
+ immediately
+ importance
+ important
+ in
+ inc
+ indeed
+ index
+ information
+ instead
+ into
+ invention
+ inward
+ is
+ isn't
+ it
+ itd
+ it'll
+ its
+ itself
+ i've
+ j
+ just
+ k
+ keep keeps
+ kept
+ kg
+ km
+ know
+ known
+ knows
+ l
+ largely
+ last
+ lately
+ later
+ latter
+ latterly
+ least
+ less
+ lest
+ let
+ lets
+ like
+ liked
+ likely
+ line
+ little
+ 'll
+ look
+ looking
+ looks
+ ltd
+ m
+ made
+ mainly
+ make
+ makes
+ many
+ may
+ maybe
+ me
+ mean
+ means
+ meantime
+ meanwhile
+ merely
+ mg
+ might
+ million
+ miss
+ ml
+ more
+ moreover
+ most
+ mostly
+ mr
+ mrs
+ much
+ mug
+ must
+ my
+ myself
+ n
+ na
+ name
+ namely
+ nay
+ nd
+ near
+ nearly
+ necessarily
+ necessary
+ need
+ needs
+ neither
+ never
+ nevertheless
+ new
+ next
+ nine
+ ninety
+ no
+ nobody
+ non
+ none
+ nonetheless
+ noone
+ nor
+ normally
+ nos
+ not
+ noted
+ nothing
+ now
+ nowhere
+ o
+ obtain
+ obtained
+ obviously
+ of
+ off
+ often
+ oh
+ ok
+ okay
+ old
+ omitted
+ on
+ once
+ one
+ ones
+ only
+ onto
+ or
+ ord
+ other
+ others
+ otherwise
+ ought
+ our
+ ours
+ ourselves
+ out
+ outside
+ over
+ overall
+ owing
+ own
+ p
+ page
+ pages
+ part
+ particular
+ particularly
+ past
+ per
+ perhaps
+ placed
+ please
+ plus
+ poorly
+ possible
+ possibly
+ potentially
+ pp
+ predominantly
+ present
+ previously
+ primarily
+ probably
+ promptly
+ proud
+ provides
+ put
+ q
+ que
+ quickly
+ quite
+ qv
+ r
+ ran
+ rather
+ rd
+ re
+ readily
+ really
+ recent
+ recently
+ ref
+ refs
+ regarding
+ regardless
+ regards
+ related
+ relatively
+ research
+ respectively
+ resulted
+ resulting
+ results
+ right
+ run
+ s
+ said
+ same
+ saw
+ say
+ saying
+ says
+ sec
+ section
+ see
+ seeing
+ seem
+ seemed
+ seeming
+ seems
+ seen
+ self
+ selves
+ sent
+ seven
+ several
+ shall
+ she
+ shed
+ she'll
+ shes
+ should
+ shouldn't
+ show
+ showed
+ shown
+ showns
+ shows
+ significant
+ significantly
+ similar
+ similarly
+ since
+ six
+ slightly
+ so
+ some
+ somebody
+ somehow
+ someone
+ somethan
+ something
+ sometime
+ sometimes
+ somewhat
+ somewhere
+ soon
+ sorry
+ specifically
+ specified
+ specify
+ specifying
+ still
+ stop
+ strongly
+ sub
+ substantially
+ successfully
+ such
+ sufficiently
+ suggest
+ sup
+ sure
+ invention
+ method
+
+
+
App/assets/stopword_patent_English ADDED
@@ -0,0 +1,38 @@
+ scope
+ about
+ contiguous
+ device for
+ dispose
+ include
+ furtherheretofore
+ indicium
+ means
+ member
+ multitude
+ pivotability
+ whereby
+ such that
+ thereby
+ wherein
+ surrounding
+ substantially
+ so that
+ thereof
+ system
+ process
+ invention
+ a kind of
+ equipment
+ belong to
+ according to
+ characterized in that
+ provide
+ get
+ characterized in that
+ handle
+ achieve
+ form
+ used for
+ utilize
+ adopt
+ get
App/assets/stopword_rake.txt ADDED
@@ -0,0 +1,590 @@
+ #stop word list from SMART (Salton,1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
+ a
+ a's
+ able
+ about
+ above
+ according
+ accordingly
+ across
+ actually
+ after
+ afterwards
+ again
+ against
+ ain't
+ all
+ allow
+ allows
+ almost
+ alone
+ along
+ already
+ also
+ although
+ always
+ am
+ among
+ amongst
+ an
+ and
+ another
+ any
+ anybody
+ anyhow
+ anyone
+ anything
+ anyway
+ anyways
+ anywhere
+ apart
+ appear
+ appreciate
+ appropriate
+ are
+ aren't
+ around
+ as
+ aside
+ ask
+ asking
+ associated
+ at
+ available
+ away
+ awfully
+ b
+ be
+ became
+ because
+ become
+ becomes
+ becoming
+ been
+ before
+ beforehand
+ behind
+ being
+ believe
+ below
+ beside
+ besides
+ best
+ better
+ between
+ beyond
+ both
+ brief
+ but
+ by
+ c
+ c'mon
+ c's
+ came
+ can
+ can't
+ cannot
+ cant
+ cause
+ causes
+ certain
+ certainly
+ changes
+ clearly
+ co
+ com
+ come
+ comes
+ concerning
+ consequently
+ consider
+ considering
+ contain
+ containing
+ contains
+ corresponding
+ could
+ couldn't
+ course
+ currently
+ d
+ definitely
+ described
+ despite
+ did
+ didn't
+ different
+ do
+ does
+ doesn't
+ doing
+ don't
+ done
+ down
+ downwards
+ during
+ e
+ each
+ edu
+ eg
+ eight
+ either
+ else
+ elsewhere
+ enough
+ entirely
+ especially
+ et
+ etc
+ even
+ ever
+ every
+ everybody
+ everyone
+ everything
+ everywhere
+ ex
+ exactly
+ example
+ except
+ f
+ far
+ few
+ fifth
+ first
+ five
+ followed
+ following
+ follows
+ for
+ former
+ formerly
+ forth
+ four
+ from
+ further
+ furthermore
+ g
+ get
+ gets
+ getting
+ given
+ gives
+ go
+ goes
+ going
+ gone
+ got
+ gotten
+ greetings
+ h
+ had
+ hadn't
+ happens
+ hardly
+ has
+ hasn't
+ have
+ haven't
+ having
+ he
+ he's
+ hello
+ help
+ hence
+ her
+ here
+ here's
+ hereafter
+ hereby
+ herein
+ hereupon
+ hers
+ herself
+ hi
+ him
+ himself
+ his
+ hither
+ hopefully
+ how
+ howbeit
+ however
+ i
+ i'd
+ i'll
+ i'm
+ i've
+ ie
+ if
+ ignored
+ immediate
+ in
+ inasmuch
+ inc
+ indeed
+ indicate
+ indicated
+ indicates
+ inner
+ insofar
+ instead
+ into
+ inward
+ is
+ isn't
+ it
+ it'd
+ it'll
+ it's
+ its
+ itself
+ j
+ just
+ k
+ keep
+ keeps
+ kept
+ know
+ knows
+ known
+ l
+ last
+ lately
+ later
+ latter
+ latterly
+ least
+ less
+ lest
+ let
+ let's
+ like
+ liked
+ likely
+ little
+ look
+ looking
+ looks
+ ltd
+ m
+ mainly
+ many
+ may
+ maybe
+ me
+ mean
+ meanwhile
+ merely
+ might
+ more
+ moreover
+ most
+ mostly
+ much
+ must
+ my
+ myself
+ n
+ name
+ namely
+ nd
+ near
+ nearly
+ necessary
+ need
+ needs
+ neither
+ never
+ nevertheless
+ new
+ next
+ nine
+ no
+ nobody
+ non
+ none
+ noone
+ nor
+ normally
+ not
+ nothing
+ novel
+ now
+ nowhere
+ o
+ obviously
+ of
+ off
+ often
+ oh
+ ok
+ okay
+ old
+ on
+ once
+ one
+ ones
+ only
+ onto
+ or
+ other
+ others
+ otherwise
+ ought
+ our
+ ours
+ ourselves
+ out
+ outside
+ over
+ overall
+ own
+ p
+ particular
+ particularly
+ per
+ perhaps
+ placed
+ please
+ plus
+ possible
+ presumably
+ probably
+ provides
+ q
+ que
+ quite
+ qv
+ r
+ rather
+ present
+ rd
+ wherein
+ comprises
+ device
+ method
+ disclosure
+ comprising
+ providing
+ including
+ re
+ really
+ reasonably
+ regarding
+ regardless
+ regards
+ relatively
+ respectively
+ right
+ s
+ said
+ assembly
+ same
+ saw
+ say
+ saying
+ says
+ second
+ secondly
+ see
+ seeing
+ seem
+ seemed
+ seeming
+ seems
+ seen
+ self
+ selves
+ element
+ sensible
+ sent
+ serious
+ seriously
+ seven
+ several
+ shall
+ she
+ should
+ shouldn't
+ since
+ six
+ so
+ some
+ somebody
+ somehow
+ someone
+ something
+ sometime
+ sometimes
+ somewhat
+ somewhere
+ soon
+ sorry
+ specified
+ specify
+ specifying
+ still
+ sub
+ such
+ sup
+ sure
+ t
+ t's
+ take
+ taken
+ tell
+ tends
+ th
+ than
+ thank
+ thanks
+ thanx
+ that
+ that's
+ thats
+ the
+ their
+ theirs
+ them
+ themselves
+ then
+ thence
+ there
+ there's
+ thereafter
+ thereby
+ therefore
+ therein
+ theres
+ thereupon
+ these
+ they
+ they'd
+ they'll
+ they're
+ they've
+ think
+ third
+ this
+ thorough
+ thoroughly
+ those
+ though
+ three
+ through
+ throughout
+ thru
+ thus
+ to
+ together
+ too
+ took
+ toward
+ towards
+ tried
+ tries
+ truly
+ try
+ trying
+ twice
+ two
+ u
+ un
+ under
+ unfortunately
+ unless
+ unlikely
+ until
+ unto
+ up
+ upon
+ us
+ use
+ used
+ useful
+ uses
+ using
+ usually
+ uucp
+ v
+ value
+ various
+ very
+ via
+ viz
+ vs
+ w
+ want
+ wants
+ was
+ wasn't
+ way
+ we
+ we'd
+ we'll
+ we're
+ we've
+ welcome
+ well
+ went
+ were
+ weren't
+ what
+ what's
+ whatever
+ when
+ whence
+ whenever
+ where
+ where's
+ whereafter
+ whereas
+ whereby
+ wherein
+ whereupon
+ wherever
+ whether
+ which
+ while
+ whither
+ who
+ who's
+ whoever
+ whole
+ whom
+ whose
+ why
+ will
+ willing
+ wish
+ unit
+ with
+ comprising
+ including
+ within
+ without
+ won't
+ wonder
+ section
+ would
+ would
+ wouldn't
+ system
+ amount
+ comprise
+ x
+ y
+ yes
+ yet
+ you
+ you'd
+ you'll
+ you're
+ you've
+ your
+ yours
+ yourself
+ yourselves
+ z
+ zero
App/assets/stopwords ADDED
@@ -0,0 +1,567 @@
+ 'll
+ a
+ a kind of
+ able
+ about
+ above
+ abst
+ accordance
+ according
+ according to
+ accordingly
+ achieve
+ across
+ act
+ actually
+ added
+ adj
+ adopt
+ affected
+ affecting
+ affects
+ after
+ afterwards
+ again
+ against
+ ah
+ all
+ almost
+ alone
+ along
+ already
+ also
+ although
+ always
+ am
+ among
+ amongst
+ an
+ and
+ announce
+ another
+ any
+ anybody
+ anyhow
+ anymore
+ anyone
+ anything
+ anyway
+ anyways
+ anywhere
+ apparently
+ approximately
+ are
+ aren
+ arent
+ arise
+ around
+ as
+ aside
+ ask
+ asking
+ at
+ auth
+ available
+ away
+ awfully
+ b
+ back
+ be
+ became
+ because
+ become
+ becomes
+ becoming
+ been
+ before
+ beforehand
+ begin
+ beginning
+ beginnings
+ begins
+ behind
+ being
+ believe
+ belong to
+ below
+ beside
+ besides
+ between
+ beyond
+ biol
+ both
+ brief
+ briefly
+ but
+ by
+ c
+ ca
+ came
+ can
+ can't
+ cannot
+ cause
+ causes
+ certain
+ certainly
+ characterized in that
+ co
+ com
+ come
+ comes
+ contain
+ containing
+ contains
+ contiguous
+ could
+ couldnt
+ d
+ comprises
+ date
+ device for
+ did
+ didn't
+ different
+ dispose
+ do
+ does
+ doesn't
+ doing
+ don't
+ done
+ down
+ downwards
+ due
+ during
+ e
+ each
+ ed
+ edu
+ effect
+ eg
+ eight
+ eighty
+ either
+ else
+ elsewhere
+ end
+ ending
+ enough
+ equipment
+ especially
+ et
+ et-al
+ etc
+ even
+ ever
+ every
+ everybody
+ everyone
+ everything
+ everywhere
+ ex
+ except
+ f
+ far
+ few
+ ff
+ fifth
+ first
+ five
+ fix
+ followed
+ following
+ follows
+ for
+ form
+ former
+ formerly
+ forth
+ found
+ four
+ from
+ further
+ furtherheretofore
+ furthermore
+ g
+ gave
+ get
+ gets
+ getting
+ give
+ given
+ gives
+ giving
+ go
+ goes
+ gone
+ got
+ gotten
+ h
+ had
+ handle
+ happens
+ hardly
+ has
+ hasn't
+ have
+ haven't
+ having
+ he
+ hed
+ hence
+ her
+ here
+ hereafter
+ hereby
+ herein
+ heres
+ hereupon
+ hers
+ herself
+ hes
+ hi
+ hid
+ him
+ himself
+ his
+ hither
+ home
+ how
+ howbeit
+ however
+ hundred
+ i
+ i'll
+ i've
+ id
+ ie
+ if
+ im
+ immediate
+ immediately
+ importance
+ important
+ in
+ inc
+ include
+ indeed
+ index
+ indicium
+ information
+ instead
+ into
+ invention
+ inward
+ is
+ isn't
+ it
+ it'll
+ itd
+ its
+ itself
+ j
+ just
+ k
+ keep
+ kept
+ kg
+ km
+ know
+ known
+ knows
+ l
+ largely
+ last
+ lately
+ later
+ latter
+ latterly
+ least
+ less
+ lest
+ let
+ lets
+ like
+ liked
+ likely
+ line
+ little
+ look
+ looking
+ looks
+ ltd
+ m
+ made
+ mainly
+ keeps
+ make
+ makes
+ comprise
+ comprises
+ many
+ may
+ maybe
+ me
+ mean
+ means
+ meantime
+ meanwhile
+ member
+ merely
+ method
+ mg
+ might
+ million
+ miss
+ ml
+ more
+ moreover
+ most
+ mostly
+ mr
+ mrs
+ much
+ mug
+ multitude
+ must
+ my
+ myself
+ n
+ na
+ name
+ namely
+ nay
+ nd
+ near
+ nearly
+ necessarily
+ necessary
+ need
+ needs
+ neither
+ never
+ nevertheless
+ new
+ next
+ nine
+ ninety
+ no
+ nobody
+ non
+ none
+ nonetheless
+ noone
+ nor
+ normally
+ nos
+ not
+ noted
+ nothing
+ now
+ nowhere
+ o
+ obtain
+ obtained
+ obviously
+ of
+ off
+ often
+ oh
+ ok
+ okay
+ old
+ omitted
+ on
+ once
+ one
+ ones
+ only
+ onto
+ or
+ ord
+ other
+ others
+ otherwise
+ ought
+ our
+ ours
+ ourselves
+ out
+ outside
+ over
+ overall
+ owing
+ own
+ p
+ page
+ pages
+ part
+ particular
+ particularly
+ past
+ per
+ perhaps
+ pivotability
+ placed
+ please
+ plus
+ poorly
+ possible
+ possibly
+ potentially
+ pp
+ predominantly
+ present
+ previously
+ primarily
+ probably
+ process
+ promptly
+ proud
+ provide
+ provides
+ put
+ q
+ que
+ quickly
+ quite
+ qv
+ r
+ ran
+ rather
+ rd
+ re
+ readily
+ really
+ recent
+ recently
+ ref
+ refs
+ regarding
+ regardless
+ regards
+ related
+ relatively
+ research
+ respectively
+ resulted
+ resulting
+ results
+ right
+ run
+ s
+ said
+ same
+ saw
+ say
+ saying
+ says
+ scope
+ sec
+ section
+ see
+ seeing
+ seem
+ seemed
+ seeming
+ seems
+ seen
+ self
+ selves
+ sent
+ seven
+ several
+ shall
+ she
+ she'll
+ shed
+ shes
+ should
+ shouldn't
+ show
+ showed
+ shown
+ showns
+ shows
+ significant
+ significantly
+ similar
+ similarly
+ since
+ six
+ slightly
+ so
+ so that
+ some
+ somebody
+ somehow
+ someone
+ somethan
+ something
+ sometime
+ sometimes
+ somewhat
+ somewhere
+ soon
+ sorry
+ specifically
+ specified
+ specify
+ specifying
+ still
+ stop
+ strongly
+ sub
+ substantially
+ successfully
+ such
+ such that
+ sufficiently
+ suggest
+ sup
+ sure
+ surrounding
+ system
+ thereby
+ thereof
+ used for
+ utilize
+ whereby
+ wherein
+ u.s. pat
+ pat
+ patent
+ present invention
+ deg
+ bottom thereof
+ certain embodiments
+ field
+ a
+ b
+ c
+ d
+ e
+ f
+ g
+ h
+ i
+ j
+ k
+ l
+ m
+ n
+ o
+ p
+ q
+ r
+ s
+ t
+ u
+ v
+ w
+ x
+ y
+ z
+ ]
+ [
App/assets/trainingsNegative ADDED
@@ -0,0 +1,216 @@
+ extremely low:problem
+ creates undesirable:problem
+ their lifetime is also limited to a few months:problem
+ cooking creates undesirable by-products as smoke and odor that can pollute an inhabited airspace:problem
+ it will be understood that some leakage may occur:problem
+ becomes heavy:problem
+ are not likely:problem
+ is designed:problem
+ could cause:problem
+ may impair:problem
+ do not:problem
+ signs of wear:problem
+ sign of wear:problem
+ wear:problem
+ blemish:problem
+ break:problem
+ bug:problem
+ complicate:problem
+ crack:problem
+ damage:problem
+ deflect:problem
+ deform:problem
+ usually face difficulties:problem
+ degrade:problem
+ the smoke particles are permanently trapped within the filter element because they are larger than the openings in the filter media:problem
+ deprive:problem
+ destroy:problem
+ deteriorate:problem
+ the disadvantage of a polarizing filter is that it withdraws observation the nondesired reflections but also of relevant information relating to the surface quality of the skin:problem
+ disparate:problem
+ fail:problem
+ hamper:problem
+ harm:problem
+ hinder:problem
+ impair:problem
+ smash:problem
+ spoil:problem
+ stain:problem
+ trouble:problem
+ weaken:problem
+ worsen:problem
+ break:problem
+ blemish:problem
+ bug:problem
+ cause:problem
+ complication:problem
+ crack:problem
+ damage:problem
+ defect:problem
+ deficiency:problem
+ deformity:problem
+ degradation:problem
+ deprivation:problem
+ destruction:problem
+ deterioration:problem
+ detriment:problem
+ difficulty:problem
+ disadvantage:problem
+ disadvantages:problem
+ a consultation of important duration presents disadvantages for the patient like for the user:problem
+ drawback:problem
+ failure:problem
+ flaw:problem
+ hamper:problem
+ harm:problem
+ impairing:problem
+ imperfection:problem
+ instability:problem
+ limitation:problem
+ prejudice:problem
+ problem:problem
+ spoiling:problem
+ stain:problem
+ trouble:problem
+ weakness:problem
+ difficut:problem
+ worse:problem
+ abnormal:problem
+ abolish:problem
+ abominable:problem
+ abominably:problem
+ abominate:problem
+ abomination:problem
+ abort:problem
+ aborted:problem
+ aborts:problem
+ abrade:problem
+ abrasive:problem
+ abrupt:problem
+ abruptly:problem
+ abscond:problem
+ absence:problem
+ absent-minded:problem
+ absentee:problem
+ absurd:problem
+ absurdity:problem
+ absurdly:problem
+ absurdness:problem
+ abuse:problem
+ abused:problem
+ abuses:problem
+ abusive:problem
+ abysmal:problem
+ abysmally:problem
+ abyss:problem
+ accidental:problem
+ accost:problem
+ accursed:problem
+ accusation:problem
+ accusations:problem
+ accuse:problem
+ accuses:problem
+ accusing:problem
+ accusingly:problem
+ acerbate:problem
+ acerbic:problem
+ acerbically:problem
+ ache:problem
+ ached:problem
+ aches:problem
+ achey:problem
+ aching:problem
+ acrid:problem
+ acridly:problem
+ acridness:problem
+ acrimonious:problem
+ acrimoniously:problem
+ acrimony:problem
+ adamant:problem
+ adamantly:problem
+ addict:problem
+ addicted:problem
+ addicting:problem
+ addicts:problem
+ admonish:problem
+ admonisher:problem
+ admonishingly:problem
+ admonishment:problem
+ admonition:problem
+ adulterate:problem
+ adulterated:problem
+ adulteration:problem
+ adulterier:problem
+ adversarial:problem
+ adversary:problem
+ adverse:problem
+ adversity:problem
+ afflict:problem
+ affliction:problem
+ afflictive:problem
+ affront:problem
+ afraid:problem
+ aggravate:problem
+ aggravating:problem
+ aggravation:problem
+ aggression:problem
+ aggressive:problem
+ aggressiveness:problem
+ aggressor:problem
+ aggrieve:problem
+ aggrieved:problem
+ aggrivation:problem
+ aghast:problem
+ agonies:problem
+ agonize:problem
+ agonizing:problem
+ agonizingly:problem
+ agony:problem
+ aground:problem
+ ail:problem
+ ailing:problem
+ ailment:problem
+ aimless:problem
+ alarm:problem
+ alarmed:problem
+ alarming:problem
+ alarmingly:problem
+ alienate:problem
+ alienated:problem
+ alienation:problem
+ allegation:problem
+ allegations:problem
+ allege:problem
+ allergic:problem
+ allergies:problem
+ allergy:problem
+ aloof:problem
+ altercation:problem
+ ambiguity:problem
+ ambiguous:problem
+ ambivalence:problem
+ ambivalent:problem
+ ambush:problem
+ amiss:problem
+ amputate:problem
+ anarchism:problem
+ anarchist:problem
+ anarchistic:problem
+ kokai publication no. 2000-331699 is not provided with a leaking liquid sensor:problem
+ anarchy:problem
+ anemic:problem
+ are not compatible:problem
+ an aqueous solution of methanol or water leaks from the dmfc unit:problem
+ many commercially available flow-rate sensors are generally considered to be incompatible with existing liquid-cooling systems suitable for computer systems:problem
+ leakage may occur during mating and demating:problem
+ a maximum number of passengers is thus attained for such a configuration of 550 passengers (with five doors):problem
+ it is however not ruled out to provide a stairway or an elevator in the aircraft:problem
+ many of these requirements are imposed by law or regulation:problem
+ such doors are known in the aircraft industry:problem
+ that space is lost with the curved ramps proposed by ep2460727:problem
+ the additional equipment required for multiple use needs to be stowed away in a space-saving way in the aircraft during the time it is not required:problem
+ the first and second supply areas may coincide or may partly overlap:problem
+ the ud-ccrc is one example of a crc:problem
+ therefore a flexibility of the light projection is considerably increased:problem
+ thus the possible scope of use of the module is significantly enhanced:problem
+ some of the sealed perforations may pop so that a hole exists in the continuous sheet of polymer:Problem
App/assets/trainingsPositive ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ ameliorate:partialSolution
+ detect:partialSolution
+ enhance:partialSolution
+ ameliorate:partialSolution
+ detect:partialSolution
+ enhance:partialSolution
+ unit 102 comprises two beds and an upper stowage compartment:partialSolution
+ provides:partialSolution
+ are within the scope:partialSolution
+ facilitate:partialSolution
+ improve:partialSolution
+ maintain:partialSolution
+ measure:partialSolution
+ preserve:partialSolution
+ save:partialSolution
+ stabilize:partialSolution
+ including:partialSolution
+ includes:partialSolution
+ included:partialSolution
+ better:partialSolution
+ allow:partialSolution
+ detect:partialSolution
+ amelioration:partialSolution
+ ensure:partialSolution
+ ensures:partialSolution
+ enhancement:partialSolution
+ improvement:partialSolution
+ detection:partialSolution
+ improvement:partialSolution
+ maintenance:partialSolution
+ at least:partialSolution
+ are:partialSolution
+ solution:partialSolution
+ include:partialSolution
+ includes:partialSolution
+ small construction:partialSolution
+ low power consumption:partialSolution
+ advantageously:partialSolution
+ is adapted:partialSolution
+ provided:partialSolution
+ can be:partialSolution
+ beneficiate:partialSolution
+ abound:partialSolution
+ abounds:partialSolution
+ abundance:partialSolution
+ abundant:partialSolution
+ accessable:partialSolution
+ accessible:partialSolution
+ acclaim:partialSolution
+ acclaimed:partialSolution
+ acclamation:partialSolution
+ accolade:partialSolution
+ accolades:partialSolution
+ accommodative:partialSolution
+ accomodative:partialSolution
+ accomplish:partialSolution
+ accomplished:partialSolution
+ accomplishment:partialSolution
+ accomplishments:partialSolution
+ accurate:partialSolution
+ accurately:partialSolution
+ achievable:partialSolution
+ achievement:partialSolution
+ achievements:partialSolution
+ achievible:partialSolution
+ acumen:partialSolution
+ adaptable:partialSolution
+ adaptive:partialSolution
+ adequate:partialSolution
+ adjustable:partialSolution
+ admirable:partialSolution
+ admirably:partialSolution
+ admiration:partialSolution
+ admire:partialSolution
+ admirer:partialSolution
+ admiring:partialSolution
+ admiringly:partialSolution
+ adorable:partialSolution
+ adore:partialSolution
+ adored:partialSolution
+ adorer:partialSolution
+ adoring:partialSolution
+ adoringly:partialSolution
+ adroit:partialSolution
+ adroitly:partialSolution
+ adulate:partialSolution
+ adulation:partialSolution
+ adulatory:partialSolution
+ advanced:partialSolution
+ advantage:partialSolution
+ advantageous:partialSolution
+ advantageously:partialSolution
+ advantages:partialSolution
+ adventuresome:partialSolution
+ adventurous:partialSolution
+ advocate:partialSolution
+ advocated:partialSolution
+ advocates:partialSolution
+ affability:partialSolution
+ affable:partialSolution
+ affably:partialSolution
+ affectation:partialSolution
+ affection:partialSolution
+ affectionate:partialSolution
+ affinity:partialSolution
+ affirm:partialSolution
+ affirmation:partialSolution
+ affirmative:partialSolution
+ affluence:partialSolution
+ affluent:partialSolution
+ afford:partialSolution
+ affordable:partialSolution
+ affordably:partialSolution
+ afordable:partialSolution
+ agile:partialSolution
+ agilely:partialSolution
+ agility:partialSolution
+ agreeable:partialSolution
+ agreeableness:partialSolution
+ agreeably:partialSolution
+ all-around:partialSolution
+ alluring:partialSolution
+ alluringly:partialSolution
+ altruistic:partialSolution
+ altruistically:partialSolution
+ amaze:partialSolution
+ amazed:partialSolution
+ amazement:partialSolution
+ amazes:partialSolution
+ amazing:partialSolution
+ amazingly:partialSolution
+ ambitious:partialSolution
+ ambitiously:partialSolution
+ ameliorate:partialSolution
+ amenable:partialSolution
+ amenity:partialSolution
+ amiability:partialSolution
+ amiabily:partialSolution
+ amiable:partialSolution
+ amicability:partialSolution
+ amicable:partialSolution
+ amicably:partialSolution
+ amity:partialSolution
+ ample:partialSolution
+ amply:partialSolution
+ amuse:partialSolution
+ amusing:partialSolution
+ amusingly:partialSolution
+ angel:partialSolution
+ angelic:partialSolution
+ apotheosis:partialSolution
+ appeal:partialSolution
+ appealing:partialSolution
+ applaud:partialSolution
+ appreciable:partialSolution
+ appreciate:partialSolution
+ appreciated:partialSolution
+ appreciates:partialSolution
+ appreciative:partialSolution
+ appreciatively:partialSolution
+ appropriately:partialSolution
+ the accumulated grease and particulate matter within reservoir is then appropriately discarded:partialSolution
+ such materials are inexpensive and readily processed:partialSolution
+ more or fewer sensors per hose line or hose line segment may be deployed if desired:partialSolution
+ cable is saved in the supply channel due to the use of the flexible supply module:partialSolution
+ these carts 10 are arranged in three superposed rows of four carts 10:partialSolution
+ these supply modules are usually arranged above the group of seats beneath the overhead compartment:partialSolution
+ the cabin lighting is seated on the light halo:partialSolution
+ the continuous adaptable light projection is adapted as a projection of a holographic film:partialSolution
+ a corresponding design of a fastening element may make it possible in a simple manner to provide a combined rotational and translational movement:partialSolution
+ a crc that is also to be used by passengers should have a height clearance that makes it possible for people to comfortably stand up:partialSolution
+ a further possible supply medium is medical oxygen:partialSolution
+ a larger variety of surfaces and positions can be offered to the passenger and therefore the comfort and the flexibility of handling of functional units is increased by means of the operator surface:partialSolution
+ a motorized mechanism not described in detail here allows movement of platform 30 :partialSolution
+ a passenger service unit comprising a lighting device according to claim 1 :partialSolution
+ a passenger service unit comprising a lighting device according to one of claims 1 to 8:partialSolution
+ a small overall height may result in a weight of only a few kg:partialSolution
+ all the seats are disposed in groups 42 of two columns of seats:partialSolution
+ control surfaces may also be activated in a variable manner and made available to each individual passenger:partialSolution
+ crcs are separate rooms that are only available for use by members of the crew:partialSolution
+ each gasper 66 includes a vent opening through which air is controllably emitted:partialSolution
+ each of the supply modules 710 is connected to a data bus 720 of the cms:partialSolution
+ a supply unit comprises a lighting device:partialSolution
+ the operator surface comprises a continuous adaptable illumination pattern:partialSolution
+ the suitable body is formed by a passenger's hand:partialSolution
+ it may also be used to receive additional seats:partialSolution
+ massage is thus not only relaxing but also health promoting:partialSolution
+ one or two rows of seats may be supplied:partialSolution
+ seats may also be disposed at that location:partialSolution
+ second compartment 24 thus is arranged above first compartment 22 but at a distance from the latter:partialSolution
+ steps 901 and 904 are optional:partialSolution
+ synonyms for certain terms are provided:partialSolution
+ the above-mentioned electro-optical devices may be used for modulating the light:partialSolution
+ the centre of the room comprises a universal gaming table 3702 :partialSolution
+ the external system comprises a cms:partialSolution
+ the light forming module 60 is adapted to the light source:partialSolution
+ the light source of the illumination unit may be a single white led which generates a very intense light with a low power consumption:partialSolution
+ the pisa has a small construction and a low power consumption:partialSolution
+ the psu pod assembly 77 can also include a panel stiffener 96 :partialSolution
+ the psu pods are positioned below the first height and the psu channel is positioned above the first height:partialSolution
+ the psu pods are positioned below the system components:partialSolution
+ the resulting connection is then rather of a visual nature:partialSolution
+ the resulting laser beam is combined with a mems mirror/scanner 4 for a dynamic light modulation and a reflection sensor 23 :partialSolution
+ the second zone of the first deck advantageously has a substantially planar floor:partialSolution
+ the space adjacent the ramps is not lost space:partialSolution
+ the stowage space 76 then advantageously continues over the trolleys or the toilets:partialSolution
+ the unit further comprises the front panel 1202 :partialSolution
+ these sites 62 may be equipped for securing in place a wheelchair:partialSolution
+ unit 102 comprises two beds and an upper stowage compartment:partialSolution
+ unit 104 comprises three beds :partialSolution
+ whole-body or partial-body massage are offered:partialSolution
+ the analysis element is powered by durable long duration battery:partialSolution
+ to ml of 2.5 % w/v pva in water in a ml glass vial is added ml of the polymer solution dropwise with stirring:partialSolution
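The trainings files above hold one record per line in the form `phrase:label`. How the application consumes them is not shown in this commit; the following is a minimal sketch of a parser for that format (the helper name is hypothetical, the split convention is taken from the lines above):

def load_training_pairs(path):
    """Parse one 'phrase:label' record per line into (phrase, label) tuples."""
    pairs = []
    with open(path, encoding="utf-8") as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            # split on the last colon so phrases may themselves contain ':'
            phrase, _, label = line.rpartition(":")
            pairs.append((phrase.strip(), label.strip()))
    return pairs

# e.g. load_training_pairs("App/assets/trainingsPositive")[:2]
# -> [('ameliorate', 'partialSolution'), ('detect', 'partialSolution')]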
App/assets/wordAfterNumber ADDED
@@ -0,0 +1,95 @@
+ \b[ mm ]\b
+ \bmm\b
+ \bnm\b
+ \bor\b
+ \bcm\b
+ \]
+ \[
+ <
+ >
+ %
+ \'
+ °
+ \brpm\b
+ \bdeg\b
+ \bdegree\b
+ \bdegrees\b
+ \bx\b
+ \bkPa\b
+ \bm\b
+ \bpounds\b
+ \bseconds\b
+ \bsecond\b
+ \bcc\b
+ \bcc/sec\b
+ \bpsi\b
+ \bmol/mol\b
+ \bm2\b
+ \bm/s2\b
+ \bm/m\b
+ \bhz\b
+ \bm\b
+ \bcm2\b
+ \br/min\b
+ \bm/m\b
+ \bg/mol\b
+ \bkg\b
+ \bkg/s\b
+ \bm2/s\b
+ \bpa\b
+ \bkg/m3\b
+ \bpa/pa\b
+ \bµm\b
+ \bk\b
+ \bcm\b
+ \b°c\b
+ \b°f\b
+ \b°\b
+ \bs\b
+ \bm3\b
+ \bm3/s\b
+ \bg/g\b
+ \bar\b
+ \bc\b
+ \bch2o\b
+ \bch3oh\b
+ \bch4\b
+ \bc2h4o\b
+ \bc2h5oh\b
+ \bc2h6\b
+ \bc3h7oh\b
+ \bc3h8\b
+ \bc4h10\b
+ \bc5h12\b
+ \bco\b
+ \bco2\b
+ \bh\b
+ \bh2\b
+ \bh2o\b
+ \bh2so4\b
+ \bhc\b
+ \bhe\b
+ \b85kr\b
+ \bn2\b
+ \bnh3\b
+ \bnmhc\b
+ \bnmhce\b
+ \bno\b
+ \bno2\b
+ \bnox\b
+ \bn2o\b
+ \bnmog\b
+ \bnonmhc\b
+ \bnothc\b
+ \bo2\b
+ \bohc\b
+ \bpm\b
+ \bs\b
+ \bsvoc\b
+ \bthc\b
+ \bthce\b
+ \bzro2\b
+ \bpercent\b
+ \bpercents\b
+ \.
+ \bto\b
App/assets/wordBeforeNumber ADDED
@@ -0,0 +1,20 @@
+ \bclaim\b
+ \bclaims\b
+ \[
+ \bmm\b
+ \bnm\b
+ \bto\b
+ <
+ >
+ %
+ °
+ \ba\b
+ \.
+ \bkPa\b
+ \bof\b
+ us
+ \bx\b
+ \bapproximately\b
+ \bthe\b
+ \bbetween\b
+ \brpm\b
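Both wordAfterNumber and wordBeforeNumber hold one regex fragment per line; FiguresCleaner.py later in this commit joins them with `|` into a single pattern and uses re.match to test the token next to a number. A minimal illustration of that usage (the sample tokens are invented):

import re

with open("App/assets/wordAfterNumber", encoding="utf-8") as handle:
    after_words_pattern = re.compile("|".join(handle.read().splitlines()))

# a number followed by a unit word is a measurement; a bare number is
# treated as a figure reference
print(re.match(after_words_pattern, "mm"))    # match -> '5 mm' is kept
print(re.match(after_words_pattern, "seat"))  # None  -> '5' would be dropped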
App/assets/wordtagVerb ADDED
@@ -0,0 +1,2 @@
+ VBZ
+ VBG
App/bin/ClassifierWithIncr.py ADDED
@@ -0,0 +1,189 @@
+ # -*- coding: utf-8 -*-
+ """
+ basic_sentiment_analysis
+ ~~~~~~~~~~~~~~~~~~~~~~~~
+ 
+ This module contains the code and examples described in
+ http://fjavieralba.com/basic-sentiment-analysis-with-python.html
+ 
+ """
+ 
+ import nltk
+ import yaml
+ from App.bin.constants import ASSETS
+ 
+ 
+ class Splitter(object):
+     def __init__(self):
+         self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
+         self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
+ 
+     def split(self, text):
+         """
+         input format: a paragraph of text
+         output format: a list of lists of words.
+         e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
+         """
+         sentences = self.nltk_splitter.tokenize(text)
+         tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
+         return tokenized_sentences
+ 
+ 
+ class POSTagger(object):
+     def __init__(self):
+         pass
+ 
+     def pos_tag(self, sentences):
+         """
+         input format: list of lists of words
+         e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
+         output format: list of lists of tagged tokens. Each tagged token has a
+         form, a lemma, and a list of tags
+         e.g.: [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
+                [('this', 'this', ['DT']), ('is', 'be', ['VB']), ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
+         """
+         pos = [nltk.pos_tag(sentence) for sentence in sentences]
+         # adapt format
+         pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
+         return pos
+ 
+ 
+ class DictionaryTagger(object):
+     def __init__(self, dictionary_paths):
+         files = [open(path, 'r') for path in dictionary_paths]
+         dictionaries = [yaml.safe_load(dict_file) for dict_file in files]
+         # close the dictionary files explicitly (a bare map() would be lazy in Python 3)
+         for dict_file in files:
+             dict_file.close()
+         self.dictionary = {}
+         self.max_key_size = 0
+         for curr_dict in dictionaries:
+             for key in curr_dict:
+                 if key in self.dictionary:
+                     self.dictionary[key].extend(curr_dict[key])
+                 else:
+                     self.dictionary[key] = curr_dict[key]
+                     self.max_key_size = max(self.max_key_size, len(key))
+ 
+     def tag(self, postagged_sentences):
+         return [self.tag_sentence(sentence) for sentence in postagged_sentences]
+ 
+     def tag_sentence(self, sentence, tag_with_lemmas=False):
+         """
+         the result is only one tagging of all the possible ones.
+         The resulting tagging is determined by these two priority rules:
+         - longest matches have higher priority
+         - search is made from left to right
+         """
+         tag_sentence = []
+         N = len(sentence)
+         if self.max_key_size == 0:
+             self.max_key_size = N
+         i = 0
+         while i < N:
+             j = min(i + self.max_key_size, N)  # avoid overflow
+             tagged = False
+             while j > i:
+                 expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
+                 expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
+                 if tag_with_lemmas:
+                     literal = expression_lemma
+                 else:
+                     literal = expression_form
+                 if literal in self.dictionary:
+                     # self.logger.debug("found: %s" % literal)
+                     is_single_token = j - i == 1
+                     original_position = i
+                     i = j
+                     taggings = [tag for tag in self.dictionary[literal]]
+                     tagged_expression = (expression_form, expression_lemma, taggings)
+                     if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings:
+                         original_token_tagging = sentence[original_position][2]
+                         tagged_expression[2].extend(original_token_tagging)
+                     tag_sentence.append(tagged_expression)
+                     tagged = True
+                 else:
+                     j = j - 1
+             if not tagged:
+                 tag_sentence.append(sentence[i])
+                 i += 1
+         return tag_sentence
+ 
+ 
+ class ClassifyWithIncr_it(object):
+ 
+     def __init__(self):
+         print("printing")
+ 
+     def value_of(self, sentiment):
+         if sentiment == 'positive': return 1
+         if sentiment == 'negative': return -1
+         return 0
+ 
+     def sentence_score(self, sentence_tokens, previous_token, acum_score):
+         if not sentence_tokens:
+             return acum_score
+         else:
+             current_token = sentence_tokens[0]
+             tags = current_token[2]
+             token_score = sum([self.value_of(tag) for tag in tags])
+             if previous_token is not None:
+                 previous_tags = previous_token[2]
+                 if 'inc' in previous_tags:
+                     token_score *= 2.0
+                 elif 'dec' in previous_tags:
+                     token_score /= 2.0
+                 elif 'inv' in previous_tags:
+                     token_score *= -1.0
+             return self.sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)
+ 
+     def sentiment_score(self, review):
+         return sum([self.sentence_score(sentence, None, 0.0) for sentence in review])
+ 
+     def main(self, sentence):
+         splitter = Splitter()
+         postagger = POSTagger()
+         pos = ASSETS + "dicts/positive.yml"
+         neg = ASSETS + "dicts/negative.yml"
+         inc = ASSETS + "dicts/inc.yml"
+         dec = ASSETS + "dicts/dec.yml"
+         inv = ASSETS + "dicts/inv.yml"
+         dicttagger = DictionaryTagger([pos, neg, inc, dec, inv])
+ 
+         splitted_sentences = splitter.split(sentence)
+         pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
+         dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
+ 
+         print("Classification...")
+         result = self.sentiment_score(dict_tagged_sentences)
+         print(result)
+         if result < 0:
+             polarity = "problem"
+         elif result > 0:
+             polarity = "partialSolution"
+         else:
+             polarity = "neutre"
+         return polarity
+ 
+ 
+ if __name__ == '__main__':
+     text = """this/these can be annoying"""
+     test = ClassifyWithIncr_it()
+     print(test.main(text))
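The inc/dec/inv arithmetic in sentence_score is easiest to see on a hand-built token list in the (form, lemma, tags) shape that DictionaryTagger produces. This standalone trace (invented tokens, no dictionary assets needed) replays the same rules:

# token = (form, lemma, [tags]); scoring: 'positive' = +1, 'negative' = -1
tokens = [("not", "not", ["inv"]),
          ("very", "very", ["inc"]),
          ("good", "good", ["positive"])]

score, previous = 0.0, None
for token in tokens:
    token_score = sum(1 if t == "positive" else -1 if t == "negative" else 0
                      for t in token[2])
    if previous is not None:
        if "inc" in previous[2]:
            token_score *= 2.0      # an incrementer doubles the next token
        elif "dec" in previous[2]:
            token_score /= 2.0
        elif "inv" in previous[2]:
            token_score *= -1.0     # an inverter flips the next token's sign
    score += token_score
    previous = token
print(score)  # 2.0: 'good' (+1) doubled by 'very'; 'not' only affects the token right after it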
App/bin/ComplexParser.py ADDED
@@ -0,0 +1,45 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on Mon Nov 28 16:02:26 2016
+ 
+ @author: Achille Souili
+ """
+ import nltk
+ 
+ 
+ class ComplexParser(object):
+ 
+     def __init__(self, sentence):
+         self.sentence = sentence
+ 
+     def extract_parameters(self):
+         sentence = self.sentence
+         concept = []
+ 
+         words = nltk.word_tokenize(sentence)
+         sentence = nltk.pos_tag(words)
+         grammar = """CLAUSES: {<DT>?<JJ.*>?<DT><NN><.*>?<VB.*>?<.*>+}
+                   """
+         parameter_parser = nltk.RegexpParser(grammar)
+         tree = parameter_parser.parse(sentence)
+         for subtree in tree.subtrees():
+             if subtree.label() == 'CLAUSES':
+                 # print(subtree)
+                 parameter_candidate = " ".join(word for word, tag in subtree.leaves())
+                 concept.append(parameter_candidate)
+         # join the candidate clauses into a single string
+         concept = ", ".join(concept)
+         return concept
+ 
+ 
+ if __name__ == "__main__":
+ 
+     Paragraph = "in which the surface of diffusion (24) is concave."
+     words = nltk.word_tokenize(Paragraph)
+     tagged = nltk.pos_tag(words)
+     print(tagged)
+     get_parameter = ComplexParser(Paragraph)
+     parameters_list = get_parameter.extract_parameters()
+ 
+     print(parameters_list)
App/bin/CorpusProcessor.py ADDED
@@ -0,0 +1,460 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ 
+ 
+ import json
+ import os
+ import re
+ import matplotlib.pyplot as plt
+ import Levenshtein
+ from io import StringIO
+ from App.bin import constants
+ from collections import OrderedDict
+ from App.bin.InformationExtractor import InformationExtractor
+ from App.bin.ParameterExtractor import ParameterExtractor
+ from App.bin.TechnologyFinder import TechnologyFinder
+ from App.bin.InformationExtractor_Claims import InformationExtractorClaims
+ 
+ 
+ class CorpusProcessor(object):
+ 
+     def __init__(self, patents, input_folder, file_extension):
+         self.patents = patents
+         self.input_folder = input_folder
+         self.file_extension = file_extension
+         print("Processing started")
+ 
+     def make_graphic(self, sizes, text, colors, labels):
+ 
+         col = [[i / 255. for i in c] for c in colors]
+ 
+         fig, ax = plt.subplots()
+         ax.axis('equal')
+         width = 0.35
+         kwargs = dict(colors=col, startangle=180)
+         outside, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2, labels=labels, **kwargs)
+         plt.setp(outside, width=width, edgecolor='white')
+ 
+         kwargs = dict(size=20, fontweight='bold', va='center')
+         ax.text(0, 0, text, ha='center', **kwargs)
+ 
+         plt.show()
+ 
+     def change_keys(self, dictionnary, number):
+         number = number + '-'
+         if type(dictionnary) is dict:
+             return dict([(number + str(k), self.change_keys(v, number)) for k, v in dictionnary.items()])
+         else:
+             return dictionnary
+ 
+     def process_corpus(self):
+ 
+         count_abstract = 0
+         count_claims = 0
+         count_description = 0
+         count_patent = 0
+         total_sentences_number = 0
+         count_concepts_solupart = 0
+         count_concepts_problem = 0
+         patents = self.patents
+         input_folder = self.input_folder
+         file_extension = self.file_extension
+         project_folder = os.path.basename(os.path.normpath(input_folder))
+         graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
+         extracted_concepts = []
+         output_result = []
+         parameters_graph = []
+         reduced_content = []
+         patent_corpus = []
+         source_list = []
+         parameters_list = []
+         technologies_graph = []
+ 
+         for patent_file in patents:
+             output_json_claims = {}
+             total_sentences_number_claims = 0
+ 
+             if type(patent_file) is dict:
+                 patent_file = json.dumps(patent_file)
+ 
+             read_patent = StringIO(patent_file)
+             patent = json.load(read_patent)
+             nNumber = patent['number']
+             aAbstract = patent['abstract']
+             cClaims = patent['claims']
+             dDescription = patent['description']
+ 
+             root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
+             root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
+ 
+             if nNumber is not None:
+                 match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', nNumber)
+                 # CC for country code
+                 CC = match.group(1)
+                 # NR for number
+                 NR = match.group(2)
+                 NR = re.sub(r'\s', '', NR)
+                 # KC for kind code
+                 KC = match.group(4)
+ 
+                 urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
+                 urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
+ 
+             # Find a more elegant way to do it
+             patent_content = aAbstract + cClaims + dDescription
+             patent_content = patent_content.splitlines()
+             # for line in patent_content:
+             #     line = self.dataCleaner(line)
+             #     reduced_content.append(line)
+ 
+             for line in patent_content:
+                 get_parameters = ParameterExtractor(line)
+                 parameters = get_parameters.extract_parameters()
+                 if parameters:
+                     parameters_list.extend(parameters)
+             # iterate over copies so items can be removed safely
+             for i in list(parameters_list):
+                 for j in parameters_list:
+                     if i != j and len(i.split()) == 1:
+                         if j.find(i) > -1 and i in parameters_list:
+                             parameters_list.remove(i)
+ 
+             parameters_list = list(set(parameters_list))
+             if len(parameters_list) > 50:
+                 for i in list(parameters_list):
+                     for j in list(parameters_list):
+                         if i != j:
+                             comp = Levenshtein.ratio(i, j)
+                             if comp >= .4 and i in parameters_list and j in parameters_list:
+                                 if len(i) > len(j):
+                                     # print('{} is near duplicate of {}'.format(i, j))
+                                     parameters_list.remove(i)
+ 
+             for el in list(parameters_list):
+                 if len(el.split()) == 1:
+                     parameters_list.remove(el)
+ 
+             parameters = dict(enumerate(parameters_list, 1))
+ 
+             parameters = self.change_keys(parameters, nNumber.lower())
+ 
+             source = input_folder + "/" + nNumber + file_extension.strip("*")
+ 
+             parameters_array = OrderedDict({
+                 "concept": {
+                     "source": source,
+                     "valeurs": parameters,
+                     "image": urlImg,
+                     "pdf": urlPDF
+                 }
+             })
+             pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
+ 
+             parameters_graph.append(pParameters)
+ 
+             if dDescription != "" or cClaims != "":
+                 count_description += 1
+                 extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
+                 output_json, total_sentences_number_description = extract_concepts.get_from_description()
+                 extract_concepts_claims = InformationExtractorClaims(cClaims, input_folder, file_extension, nNumber)
+                 output_json_claims_result = extract_concepts_claims.main()
+                 if output_json_claims_result is not None:
+                     output_json_claims, total_sentences_number_claims = output_json_claims_result
+                     count_claims += 1
+                 if output_json is not None:
+                     if type(output_json) is dict:
+                         output_json = json.dumps(output_json)
+                     extracted_concepts.append(output_json)
+                     total_sentences_number += total_sentences_number_description
+                 if output_json_claims is not None:
+                     if type(output_json_claims) is dict:
+                         output_json_claims = json.dumps(output_json_claims)
+                     extracted_concepts.append(output_json_claims)
+                     total_sentences_number += total_sentences_number_claims
+             elif cClaims != "":
+                 count_claims += 1
+                 print('Processing claims')
+             else:
+                 count_abstract += 1
+                 print("processing abstract")
+             count_patent += 1
+ 
+             # print(source)
+             source_list.append(source)
+             patent_corpus.append(reduced_content)
+         patent_corpus = dict(zip(source_list, patent_corpus))
+         '''
+         get_patent_technologies = TechnologyFinder(patent_corpus)
+         technologies = get_patent_technologies.get_technologies()
+ 
+         for source_file, technologies_list in technologies.items():
+ 
+             technologies_array = OrderedDict({
+                 "concept": {
+                     "source": source_file,
+                     "values": technologies_list
+                 }
+             })
+             tTechnologies = json.dumps(technologies_array, sort_keys=True, indent=4, separators=(',', ': '))
+ 
+             technologies_graph.append(tTechnologies)
+         '''
+         print(type(extracted_concepts))
+         header = '{'
+         graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
+         parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
+         # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
+         footer = '}'
+         # output_result.extend((header, graph, parameters_output, technologies_output, footer))
+         output_result.extend((header, graph, parameters_output, footer))
+ 
+         output_result = "".join(output_result)
+         output_result = re.sub(r'\,{2,}', ',', output_result)
+         output_result = re.sub(r'\}\,\]', '}]', output_result)
+ 
+         # exit()
+         # print(output_result)
+         concepts_json = json.loads(output_result)
+ 
+         count_concepts = len(concepts_json['problem_graph'])
+         for item, value in concepts_json.items():
+             # if cle == "type" and value == "partialSolution":
+             #     print("yes")
+             for element in value:
+                 for cle, valeur in element.items():
+                     for k, v in valeur.items():
+                         if k == "type" and v == "partialSolution":
+                             count_concepts_solupart += 1
+                         elif k == "type" and v == "problem":
+                             count_concepts_problem += 1
+         json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
+         # print(concepts_json.keys())
+ 
+         with open(graph_folder + "graph.json", 'w') as json_graph:
+             json_graph.write(json_write_to_file)
+         number_neutre = count_concepts - count_concepts_problem - count_concepts_solupart
+         print("The corpus contained %s patents: %s abstracts, %s claims sections and %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
+         print("%s sentence(s) were analysed" % (total_sentences_number))
+         print("%s concept(s) were found: %s problems, %s partial solutions and %s neutral" % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))
+ 
+         # Display graphics
+         first_color = (46, 204, 113)
+         second_color = (245, 176, 65)
+         # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio", [first_color, second_color], ['Problems', 'Partial Solutions'])
+         return json_write_to_file
+ 
+     def process_corpus_json(self):
+ 
+         count_abstract = 0
+         count_claims = 0
+         count_description = 0
+         count_patent = 0
+         total_sentences_number = 0
+         count_concepts_solupart = 0
+         count_concepts_problem = 0
+         patents = self.patents
+         input_folder = self.input_folder
+         file_extension = self.file_extension
+         project_folder = os.path.basename(os.path.normpath(input_folder))
+         graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
+         extracted_concepts = []
+         output_result = []
+         parameters_graph = []
+         reduced_content = []
+         patent_corpus = []
+         source_list = []
+         parameters_list = []
+         technologies_graph = []
+         for patent_file in patents:
+             # print(type(patent_file))
+ 
+             # if type(patent_file) is dict:
+             patent_file = json.dumps(patent_file)
+ 
+             read_patent = StringIO(patent_file)
+             patent = json.load(read_patent)
+             # print(type(patent))
+             filename = patent['filename']
+             nNumber = patent['number']
+             aAbstract = patent['abstract']
+             cClaims = patent['claims']
+             dDescription = patent['description']
+ 
+             # Find a more elegant way to do it
+             patent_content = aAbstract + cClaims + dDescription
+             patent_content = patent_content.splitlines()
+             # for line in patent_content:
+             #     line = self.dataCleaner(line)
+             #     reduced_content.append(line)
+ 
+             for line in patent_content:
+                 get_parameters = ParameterExtractor(line)
+                 parameters = get_parameters.extract_parameters()
+                 if parameters:
+                     parameters_list.extend(parameters)
+             # iterate over copies so items can be removed safely
+             for i in list(parameters_list):
+                 for j in parameters_list:
+                     if i != j and len(i.split()) == 1:
+                         if j.find(i) > -1 and i in parameters_list:
+                             parameters_list.remove(i)
+ 
+             parameters_list = list(set(parameters_list))
+ 
+             if len(parameters_list) > 50:
+                 for i in list(parameters_list):
+                     for j in list(parameters_list):
+                         if i != j:
+                             comp = Levenshtein.ratio(i, j)
+                             if comp >= .4 and i in parameters_list and j in parameters_list:
+                                 if len(i) > len(j):
+                                     # print('{} is near duplicate of {}'.format(i, j))
+                                     parameters_list.remove(i)
+ 
+             for el in list(parameters_list):
+                 if len(el.split()) == 1:
+                     parameters_list.remove(el)
+ 
+             print('{} {}'.format('Size:', len(parameters_list)))
+ 
+             parameters = dict(enumerate(parameters_list, 1))
+ 
+             parameters = self.change_keys(parameters, nNumber.lower())
+ 
+             source = input_folder + "/" + nNumber + file_extension.strip("*")
+ 
+             parameters_array = OrderedDict({
+                 "concept": {
+                     "source": source,
+                     "valeurs": parameters
+                 }
+             })
+             pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
+ 
+             parameters_graph.append(pParameters)
+ 
+             # if dDescription != "" and cClaims != "":
+             if dDescription != "":
+                 count_description += 1
+                 extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, filename)
+                 output_json, total_sentences_number_d = extract_concepts.get_from_description()
+                 if output_json != "":
+                     extracted_concepts.append(output_json)
+                     total_sentences_number += total_sentences_number_d
+                 # count_claims += 1
+                 # extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
+                 # output_json, total_sentences_number_c = extract_concepts.get_from_claims()
+                 # if output_json != "":
+                 #     extracted_concepts.append(output_json)
+                 #     total_sentences_number_c += total_sentences_number_c
+                 # total_sentences_number = total_sentences_number_c + total_sentences_number_d
+ 
+             elif cClaims != "":
+                 count_claims += 1
+                 extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
+                 output_json, total_sentences_number_c = extract_concepts.get_from_claims()
+                 if output_json != "":
+                     extracted_concepts.append(output_json)
+                     total_sentences_number += total_sentences_number_c
+             elif dDescription != "":
+                 count_description += 1
+                 extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
+                 output_json, total_sentences_number_d = extract_concepts.get_from_description()
+                 if output_json != "":
+                     extracted_concepts.append(output_json)
+                     total_sentences_number += total_sentences_number_d
+                 count_claims += 1
+ 
+             else:
+                 count_abstract += 1
+                 print("processing abstract")
+             count_patent += 1
+ 
+             # print(source)
+             # source_list.append(source)
+             # patent_corpus.append(reduced_content)
+             # patent_corpus = dict(zip(source_list, patent_corpus))
+         '''
+         get_patent_technologies = TechnologyFinder(patent_corpus)
+         technologies = get_patent_technologies.get_technologies()
+ 
+         for source_file, technologies_list in technologies.items():
+ 
+             technologies_array = OrderedDict({
+                 "concept": {
+                     "source": source_file,
+                     "values": technologies_list
+                 }
+             })
+             tTechnologies = json.dumps(technologies_array, sort_keys=True, indent=4, separators=(',', ': '))
+ 
+             technologies_graph.append(tTechnologies)
+         '''
+ 
+         header = '{'
+         graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
+         parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
+         # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
+         footer = '}'
+         # output_result.extend((header, graph, parameters_output, technologies_output, footer))
+         output_result.extend((header, graph, parameters_output, footer))
+ 
+         output_result = "".join(output_result)
+         output_result = re.sub(r'\,{2,}', ',', output_result)
+         output_result = re.sub(r'\}\,\]', '}]', output_result)
+         concepts_json = json.loads(output_result)
+ 
+         count_concepts = len(concepts_json['problem_graph'])
+         for item, value in concepts_json.items():
+             # if cle == "type" and value == "partialSolution":
+             #     print("yes")
+             for element in value:
+                 for cle, valeur in element.items():
+                     for k, v in valeur.items():
+                         if k == "type" and v == "partialSolution":
+                             count_concepts_solupart += 1
+                         elif k == "type" and v == "problem":
+                             count_concepts_problem += 1
+         json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
+         # print(concepts_json.keys())
+         with open(graph_folder + "graph.json", 'w') as json_graph:
+             json_graph.write(json_write_to_file)
+ 
+         print("The corpus contained %s patents: %s abstracts, %s claims sections and %s descriptions" % (
+             count_patent, count_abstract, count_claims, count_description))
+         print("%s sentence(s) were analysed" % (total_sentences_number))
+         print("%s concept(s) were found: %s problems and %s partial solutions" % (
+             count_concepts, count_concepts_problem, count_concepts_solupart))
+ 
+         # Display graphics
+         first_color = (46, 204, 113)
+         second_color = (245, 176, 65)
+         # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio", [first_color, second_color], ['Problems', 'Partial Solutions'])
+         return json_write_to_file
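Both variants of the method end by serialising the same two-key structure, {"problem_graph": [...], "parameters": [...]}, into graph.json. A small sketch of reading it back, mirroring the counting loop above (the path is an example, not a path defined in this commit):

import json

with open("graph.json", encoding="utf-8") as handle:  # example path
    graph = json.load(handle)

counts = {"problem": 0, "partialSolution": 0}
for node in graph["problem_graph"]:
    node_type = node["concept"].get("type")
    if node_type in counts:
        counts[node_type] += 1
print(counts)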
App/bin/FiguresCleaner.py ADDED
@@ -0,0 +1,44 @@
+ # -*- coding: utf-8 -*-
+ 
+ import re
+ import nltk
+ 
+ from App.bin import constants
+ 
+ 
+ class FiguresCleaner(object):
+ 
+     def __init__(self, sections):
+         self.sections = sections
+ 
+     def clean_figures(self):
+         sections = self.sections
+         clean_content = []
+         # unit words that may legitimately follow a number (mm, %, rpm, ...)
+         with open(constants.ASSETS + "wordAfterNumber", 'r') as l:
+             after_words = l.read().splitlines()
+         after_words_patterns = re.compile('|'.join(after_words))
+         # words that may legitimately precede a number (claim, approximately, ...)
+         with open(constants.ASSETS + "wordBeforeNumber", 'r') as l:
+             before_words = l.read().splitlines()
+         before_words_patterns = re.compile('|'.join(before_words))
+ 
+         # sections = sections.splitlines()
+         words = nltk.word_tokenize(sections)
+         tagged_words = nltk.pos_tag(words)
+         for i in range(len(tagged_words)):
+             if i < len(tagged_words) - 1:
+                 next_word = tagged_words[i + 1][0]
+                 current_word = tagged_words[i][0]
+                 previous_word = tagged_words[i - 1][0]
+                 currentWordTag = tagged_words[i][1]
+                 # a cardinal number with no unit context is taken to be a figure reference
+                 if currentWordTag == 'CD' and re.match(after_words_patterns, next_word) is None \
+                         and re.match(before_words_patterns, previous_word) is None:
+                     if re.search(r'\d', current_word) is not None:
+                         continue  # drop the bare figure-reference number
+                     else:
+                         clean_content.append(current_word + " ")
+                 else:
+                     clean_content.append(current_word + " ")  # keep ordinary tokens
+ 
+         return clean_content
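A short usage sketch, assuming the two word-list assets are installed under ASSETS (the sample sentence is invented):

from App.bin.FiguresCleaner import FiguresCleaner

# '710' has no unit word before or after it, so it is treated as a
# figure reference and dropped; the surrounding words are kept
cleaner = FiguresCleaner("the supply module 710 is connected to a data bus")
print("".join(cleaner.clean_figures()))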
App/bin/FindTechnologies.py ADDED
@@ -0,0 +1,64 @@
+ #!/usr/bin/python3
+ # -*- coding: utf-8 -*
+ import sys
+ import os
+ import math
+ import xlsxwriter
+ from textblob import TextBlob as tb
+ 
+ 
+ class FindTechnologies(object):
+ 
+     def __init__(self):
+         print("Starting")
+ 
+     @staticmethod
+     def tf(word, blob):
+         return float(blob.noun_phrases.count(word)) / float(len(blob.noun_phrases))
+ 
+     @staticmethod
+     def n_containing(word, bloblist):
+         return sum(1 for blob in bloblist if word in blob.noun_phrases)
+ 
+     @staticmethod
+     def idf(word, bloblist):
+         return math.log(len(bloblist) / float(1 + FindTechnologies.n_containing(word, bloblist)))
+ 
+     @staticmethod
+     def tfidf(word, blob, bloblist):
+         return FindTechnologies.tf(word, blob) * FindTechnologies.idf(word, bloblist)
+ 
+     # Create an excel file for validation purposes
+ 
+     def get_technologies(self):
+         folder_path = "C:/Users/asouili01/Documents/PatSemBeta-v3/Data/input/Gaggenau/"
+         stopwords = open('C:/Users/asouili01/Documents/PIXSEB/Ressources/stopwords.txt', 'r').read().split('\r\n')
+         bloblist = []
+ 
+         filenamelist = []
+ 
+         for path, dirs, files in os.walk(folder_path):
+             for filename in files:
+                 print(filename)
+                 filenamelist.append(filename)
+                 name, extension = filename.split('.')
+                 filepath = folder_path + "/" + filename
+                 filehandler = open(filepath, "r", encoding="utf-8")
+ 
+                 content = filehandler.read()
+                 # filter stopwords word by word
+                 filteredtext = [t for t in content.split() if t.lower() not in stopwords]
+                 filteredcontent = ' '.join(filteredtext)
+                 blob = 'blob_' + name.lower()
+                 print(blob)
+                 blob = tb(filteredcontent.lower())
+                 bloblist.append(blob)
+ 
+         print(bloblist)
+ 
+         for i, blob in enumerate(bloblist):
+             print("Top words in document {}".format(i + 1))
+             scores = {word: self.tfidf(word, blob, bloblist) for word in blob.noun_phrases}
+             sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+             for word, score in sorted_words[:5]:
+                 print("\tWord: {}, TF-IDF: {}".format(word, round(score, 10)))
App/bin/InformationExtractor.py ADDED
@@ -0,0 +1,588 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
4
+ import nltk
5
+ nltk.download('all')
6
+ import os
7
+ import re
8
+ import json
9
+ import hashlib
10
+ import Levenshtein
11
+ import uuid
12
+ from App.bin import constants
13
+ from collections import OrderedDict
14
+ from nltk import word_tokenize
15
+
16
+ from App.bin.SharpClassifier import SharpClassifier
17
+ from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
18
+ from App.bin.SentenceClassifier import SentenceClassifier
19
+ from App.bin.ParameterExtractor import ParameterExtractor
20
+
21
+ class InformationExtractor(object):
22
+
23
+ patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
24
+ sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
25
+ sentence_finder._params.abbrev_types.update(patent_abbreviations)
26
+
27
+ def __init__(self, section, input_folder,file_extension, file_name):
28
+ self.section = section
29
+ self.input_folder = input_folder
30
+ self.file_extension = file_extension
31
+ self.file_name = file_name
32
+
33
+ print("Extracting problem graph")
34
+
35
+ #@staticmethod
36
+
37
+
38
+ def discardLines(self, line,lexic):
39
+ with open (constants.ASSETS+ lexic) as m:
40
+ exclusion_list = m.read().splitlines()
41
+ if any(word in line for word in exclusion_list):
42
+ pass
43
+ else:
44
+ return line
45
+
46
+
47
+ def selectLines(self, line, lexic):
48
+ with open(constants.ASSETS + lexic) as n:
49
+ inclusion_list = n.read().splitlines()
50
+ if any(word in line for word in inclusion_list):
51
+ return line
52
+
53
+ def last_cleansing(self, concept):
54
+ concept = str(concept)
55
+ concept = concept.lower()
56
+ if concept.endswith("."):
57
+ concept = concept.strip(".")
58
+ concept = re.sub(r'^consequently ','', concept)
59
+ concept = re.sub(r'^such ', '', concept)
60
+ concept = re.sub(r'^said ', '', concept)
61
+ concept = re.sub(r'^\s+', '', concept)
62
+ concept = re.sub(r'^it is worth noting that ', '', concept)
63
+ concept = re.sub(r'^example of ', '', concept)
64
+ concept = re.sub(r'^since ', '', concept)
65
+ concept = re.sub(r'^\( |\)$ ', '', concept)
66
+ return concept
67
+
68
+ # def get_from_claims(self):
69
+ #
70
+ # section = self.section
71
+ # content = []
72
+ # sentence_finder = InformationExtractor.sentence_finder
73
+ # sentences = sentence_finder.tokenize(section.strip())
74
+ # with open(constants.ASSETS + "getFromClaims") as concept:
75
+ # # next(concept)
76
+ # included_words = concept.read().splitlines()
77
+ # include_link_pattern = re.compile('|'.join(included_words))
78
+
79
+
80
+ def get_from_description(self):
81
+ previous_polarity = ''
82
+ noise_trash =[]
83
+
84
+ content = []
85
+ include_links = []
86
+ output_content = []
87
+ ex_output_content = []
88
+ output_result=[]
89
+ output_linked_content = []
90
+ output_inter_content = []
91
+ uniq_output_linked_content =[]
92
+ ex_output_content_linked =[]
93
+ section = self.section
94
+ input_folder = self.input_folder
95
+ file_name = self.file_name
96
+ file_extension = self.file_extension
97
+ projectFolder = os.path.basename(os.path.normpath(input_folder))
98
+ output_file_name = input_folder+"/"+file_name+file_extension.strip("*")
99
+
100
+ graphItemId = hashlib.md5(file_name.encode())
101
+ graphItemIdValue = graphItemId.hexdigest()
102
+ graphItemIdValue = str(uuid.uuid4())
103
+ t_sline = ""
104
+ t_sline_ex =[]
105
+ compt_Id = 30
106
+ compt_Id_ex = 40
107
+
108
+ root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
109
+ root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
110
+
111
+ if file_name is not None:
112
+ match = re.search('(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
113
+ # CC for country code
114
+ CC = match.group(1)
115
+ # NR for Number
116
+ NR = match.group(2)
117
+ NR = re.sub(r'\s', '', NR)
118
+ # KC for Kind code
119
+ KC = match.group(4)
120
+
121
+ urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
122
+ urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
123
+
124
+ sentence_finder = InformationExtractor.sentence_finder
125
+
126
+ #section = self.dataCleaner(section)
127
+ #print(section)
128
+ sentences = sentence_finder.tokenize(section.strip())
129
+
130
+
131
+ with open(constants.ASSETS + "includeLinks") as concept:
132
+ # next(concept)
133
+ included_words = concept.read().splitlines()
134
+ include_link_pattern = re.compile('|'.join(included_words))
135
+ #open examplification wordfile
136
+ with open(constants.ASSETS + "examplificationclues") as examplif:
137
+ # next(concept)
138
+ exam_words = examplif.read().splitlines()
139
+ examplif_word_pattern = re.compile('|'.join(exam_words))
140
+
141
+ description_sentences_number = len(sentences)
142
+ number_of_words = 0
143
+ for sentence in sentences:
144
+
145
+ # with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
146
+ # for item in sentences:
147
+ # file_handler.write("{}\n".format(item))
148
+ number_of_word = len(nltk.word_tokenize(sentence))
149
+ number_of_words += number_of_word
150
+
151
+
152
+ sentenced = self.discardLines(sentence, "exclusionList")
153
+
154
+
155
+ if sentenced is not None:
156
+
157
+
158
+ content.append(sentenced)
159
+ #print("origine=> "+sentence)
160
+ total_sentences_number = len(sentences)
161
+ # mean_sentence_length = int(round(number_of_words/total_sentences_number))
162
+ # print(mean_sentence_length)
163
+
164
+ for line in content:
165
+
166
+ line = self.selectLines(line, "inclusionList")
167
+
168
+
169
+
170
+ if line is not None:
171
+
172
+ if re.match(include_link_pattern, line):
173
+ include_links.append(line)
174
+ #print(line)
175
+ if line.count(',') == 0:
176
+ output_content.append(line)
177
+ # content.remove(line)
178
+ if line.count(',') > 0:
179
+ output_inter_content.append(line)
180
+ content.remove(line)
181
+ for s in content:
182
+ # print(s, file_name)
183
+ sentence = self.discardLines(s, "FilterS")
184
+ if sentence is not None:
185
+ if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
186
+ s = str(s)
187
+ cs = s.lower()
188
+ cs = re.sub(examplif_word_pattern, '', cs)
189
+ cs = re.sub('which', 'this/these', cs)
190
+ cs = re.sub(r'\.$', '', cs)
191
+ #print(s)
192
+ if cs.count(',') == 1 and cs.count('such as')==0:
193
+ ex_output_content_linked.append(cs)
194
+ else:
195
+ ex_output_content.append(cs)
196
+ elif s.count(',') == 1:
197
+ s = str(s)
198
+ s = s.lower()
199
+ s = self.selectLines(s, "OneCommaDiscriminator")
200
+ if s is not None:
201
+ #s = re.sub('which', 'this/these', s)
202
+ #print(s)
203
+ s = re.sub(r'^thus, ', '', s)
204
+ s = re.sub(r'^preferably, ', '', s)
205
+ s = re.sub(r'^conventional ', '', s)
206
+ s = re.sub(r'^in particular, ', '', s)
207
+ s = re.sub(r'^specifically, ', '', s)
208
+ s = re.sub(r'^as necessary, ', '', s)
209
+ s = re.sub(', which', ',this/these', s)
210
+ s = re.sub(r'\.$', '', s)
211
+
212
+ if s.count(',')==1:
213
+ ex_output_content_linked.append(s)
214
+ else:
215
+ ex_output_content.append(s)
216
+ else:
217
+ pass
218
+
219
+ print(len(ex_output_content_linked))
220
+ ex_output_content_linked = list(set(ex_output_content_linked))
221
+ for line in ex_output_content_linked:
222
+ line = line.lower()
223
+ if 'figure' not in line:
224
+ #if line.count(',') <= 1:
225
+ t_sline_ex = line.strip().split(',')
226
+ #print("outpib"+str(t_sline_ex))
227
+ for concept in t_sline_ex:
228
+ #print("outpib" + str(concept))
229
+ words = nltk.word_tokenize(concept)
230
+ tagged = nltk.pos_tag(words)
231
+ #print(tagged)
232
+ parameters_list = []
233
+ compteur = 0
234
+ compt_Id_ex += 1
235
+ tagged = nltk.pos_tag(word_tokenize(concept))
236
+ tags = [word for word, pos in tagged if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
237
+ if len(tags) < 1:
238
+ continue
239
+ # classifyT = SentenceClassifier(concept)
240
+ # polarite = classifyT.classifySentence()
241
+ classifyT = ClassifyWithIncr_it()
242
+ polarite = classifyT.main(concept)
243
+ # if polarite == 'neutre':
244
+ # classify = SentenceClassifier(concept)
245
+ # polarite = classify.classifySentence()
246
+ # print(concept)
247
+
248
+ get_parameters = ParameterExtractor(concept)
249
+ parameters = get_parameters.extract_parameters()
250
+
251
+ parameters_list.extend( parameters)
252
+ # parameters_list=", ".join(parameters_list)
253
+ # parameters_list = parameters_list
254
+ #print("Index is: ")
255
+ #print(t_sline_ex.index(concept))
256
+ #print(concept)
257
+
258
+ clean_concept = self.last_cleansing(concept)
259
+ # if polarite == 'neutre':
260
+ # words = word_tokenize(clean_concept)
261
+ # hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
262
+ # noise_trash.append(hit)
263
+
264
+ validity = self.discardLines(concept, 'referencing_indices')
265
+ if t_sline_ex.index(concept) == 0 and validity is not None:
266
+ previous_polarity = polarite
267
+ values = OrderedDict({
268
+ "concept": {
269
+ "type": polarite,
270
+ "enfants": graphItemIdValue + str(compt_Id_ex + 1),
271
+ "id": graphItemIdValue + str(compt_Id_ex),
272
+ "sentence": clean_concept,
273
+ "source": output_file_name,
274
+ "parameters":parameters_list,
275
+ "image": urlImg,
276
+ "pdf": urlPDF
277
+ }
278
+
279
+ })
280
+
281
+ else:
282
+ print("Previous polarity is : " + str(previous_polarity))
283
+ if previous_polarity =='partialSolution' or validity is None:
284
+ continue
285
+ else:
286
+ compteur += 1
287
+ values = OrderedDict({
288
+ "concept": {
289
+ "type": polarite,
290
+ "parents": graphItemIdValue + str(compt_Id_ex - 1),
291
+ "id": graphItemIdValue + str(compt_Id_ex),
292
+ "sentence": clean_concept,
293
+ "source": output_file_name,
294
+ "parameters": parameters_list,
295
+ "image": urlImg,
296
+ "pdf": urlPDF
297
+
298
+ }
299
+
300
+ })
301
+
302
+ json_string_linkes = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
303
+
304
+ output_result.append(json_string_linkes)
305
+
306
+
307
+
308
+ #for line in output_content:
309
+ #print ("include=> "+line)
310
+ #just examplification sentences
311
+ #make a function of that
312
+ ex_output_content = list(set(ex_output_content))
313
+ for concept in ex_output_content:
314
+ tagged = nltk.pos_tag(word_tokenize(concept))
315
+ tags = [word for word, pos in tagged if
316
+ pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
317
+ if len(tags) < 1:
318
+ continue
319
+ parameters_list = []
320
+ concept = concept.lower()
321
+ compt_Id_ex += 1
322
+ # classify = SentenceClassifier(sline)
323
+ # polarite = classify.classifySentence()
324
+ classifyT = ClassifyWithIncr_it()
325
+ polarite = classifyT.main(concept)
326
+
327
+ # if polarite =='neutre':
328
+ # classify = SentenceClassifier(concept)
329
+ # polarite = classify.classifySentence()
330
+ # print(sline)
331
+
332
+ #if polarite == 'partialSolution':
333
+ #print(sline)
334
+ #Insert a classifier here
335
+ get_parameters = ParameterExtractor(concept)
336
+ parameters = get_parameters.extract_parameters()
337
+
338
+ clean_concept = self.last_cleansing(concept)
339
+ parameters_list.extend(parameters)
340
+ # if polarite == 'neutre':
341
+ # words = word_tokenize(clean_concept)
342
+ # hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
343
+ # noise_trash.append(hit)
344
+ # parameters_list = ", ".join(parameters_list)
345
+ validity = self.discardLines(concept, 'referencing_indices')
346
+ if polarite != 'partialSolution' and validity is not None:
347
+
348
+ values = OrderedDict({
349
+ "concept": {
350
+ "type": polarite,
351
+ "id": graphItemIdValue + str(compt_Id_ex),
352
+ "sentence": clean_concept,
353
+ "source": output_file_name,
354
+ "parameters": parameters_list,
355
+ "image": urlImg,
356
+ "pdf": urlPDF
357
+
358
+
359
+ }
360
+
361
+ })
362
+ json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
363
+ output_result.append(json_string)
364
+
365
+
366
+
367
+ for line in include_links:
368
+ #print(line)
369
+ #Put in lower case to improve matching
370
+ line = line.lower()
371
+
372
+ if re.match(r'however', line) and line.count(',') <= 1:
373
+ line = str(line)
374
+ sline = re.sub(r'however|,', '', line)
375
+ if sline not in output_linked_content:
376
+ output_linked_content.append(sline)
377
+ if re.match(r'however', line) and line.count(',') > 1:
378
+ sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
379
+ # sline = re.sub(r'however,.+, that ', '', sline)
380
+ sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
381
+ sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
382
+ sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
383
+ sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
384
+ sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
385
+ sline = re.sub(r'^however,?\s*', '', sline)
386
+ if sline not in output_linked_content:
387
+ output_linked_content.append(sline)
388
+ if re.match(r'if', line) and line.count(',') <= 1:
389
+ line = str(line)
390
+ sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
391
+ sline = re.sub(r'^if,?\s*', '', sline)
392
+ sline = re.sub(r'^if ', '', sline)
393
+ if sline not in output_linked_content:
394
+ output_linked_content.append(sline)
395
+ # print (sline)
396
+
397
+ if re.match(r'when', line):
398
+ line = str(line)
399
+ line = line.lower()
400
+ sline = re.sub(r'^when\s*', '', line)
401
+ sline = re.sub(r'^when,?\s*', '', sline)
402
+ sline = re.sub(r'^when ', '', sline)
403
+ if sline not in output_linked_content:
404
+ output_linked_content.append(sline)
405
+ if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
406
+ sline = re.sub(r'^since', '', line)
407
+ sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
408
+ if sline not in output_linked_content:
409
+ output_linked_content.append(sline)
410
+
411
+ for line in output_content:
412
+ line = line.lower()
413
+ if re.match(r'if', line):
414
+ line = str(line)
415
+ sline = re.sub(r'^if ', '', line)
416
+ if sline not in output_linked_content:
417
+ output_content.append(sline)
418
+ #output_content.remove(line)
419
+
420
+ uniq_output_linked_content = list(set(output_linked_content))
421
+ for line in uniq_output_linked_content:
422
+ #print("long sentences = > " + line)
423
+ # line = str(i)
424
+ #print(line)
425
+ line = line.lower()
426
+ if 'figure' in line:
427
+ uniq_output_linked_content.remove(line)
428
+ sline = re.sub(r'^\s+', '', line)
429
+ sline = re.sub(r'^\d+\.+$', '', sline)
430
+
431
+ if sline.count(',') <= 1:
432
+ t_sline = tuple(sline.strip().split(', '))
433
+ #print("outpib"+str(t_sline))
434
+ for concept in t_sline:
435
+ tagged = nltk.pos_tag(word_tokenize(concept))
436
+ tags = [word for word, pos in tagged if
437
+ pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
438
+ if len(tags) < 1:
439
+ continue
440
+ else:
441
+ parameters_list = []
442
+ compteur = 0
443
+ compt_Id += 1
444
+ # classifyT = SentenceClassifier(concept)
445
+ # polarite = classifyT.classifySentence()
446
+ tagged = nltk.pos_tag(word_tokenize(concept))
447
+ tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
448
+ if len(tags) < 1:
449
+ continue
450
+ classifyT = ClassifyWithIncr_it()
451
+ polarite = classifyT.main(concept)
452
+
453
+
454
+ # if polarite == 'neutre':
455
+ # classify = SentenceClassifier(concept)
456
+ # polarite = classify.classifySentence()
457
+ # print(concept)
458
+
459
+ get_parameters = ParameterExtractor(concept)
460
+ parameters = get_parameters.extract_parameters()
461
+
462
+ parameters_list.extend( parameters)
463
+ # parameters_list=", ".join(parameters_list)
464
+ # parameters_list = parameters_list
465
+
466
+ clean_concept = self.last_cleansing(concept)
467
+ validity = self.discardLines(concept, 'referencing_indices')
468
+ # if polarite == 'neutre':
469
+ # words = word_tokenize(clean_concept)
470
+ # hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
471
+ # noise_trash.append(hit)
472
+
473
+
474
+ if t_sline.index(concept) == 0 and validity is not None:
475
+ previous_polarity = polarite
476
+ values = OrderedDict({
477
+ "concept": {
478
+ "type": polarite,
479
+ "enfants": graphItemIdValue + str(compt_Id + 1),
480
+ "id": graphItemIdValue + str(compt_Id),
481
+ "sentence": clean_concept,
482
+ "source": output_file_name,
483
+ "parameters":parameters_list,
484
+ "image": urlImg,
485
+ "pdf": urlPDF
486
+ }
487
+
488
+ })
489
+
490
+ else:
491
+ print("Previous polarity is : " + str(previous_polarity))
492
+ if previous_polarity == 'partialSolution' or validity is None:
493
+ continue
494
+ else:
495
+ compteur += 1
496
+ values = OrderedDict({
497
+ "concept": {
498
+ "type": polarite,
499
+ "parents": graphItemIdValue + str(compt_Id - 1),
500
+ "id": graphItemIdValue + str(compt_Id),
501
+ "sentence": clean_concept,
502
+ "source": output_file_name,
503
+ "parameters": parameters_list,
504
+ "image": urlImg,
505
+ "pdf": urlPDF
506
+
507
+ }
508
+
509
+ })
510
+
511
+ json_string_linked = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))  # sort_keys expects a bool; False preserves the OrderedDict insertion order
512
+
513
+ output_result.append(json_string_linked)
514
+
515
+
516
+ uniq_output_content = list(set(output_content))
517
+ for s in list(uniq_output_content):
518
+ for y in list(uniq_output_content):
519
+ if s != y and s in uniq_output_content and y in uniq_output_content:
520
+ result = Levenshtein.ratio(s, y)
521
+ if result > .7:
522
+ # print(s + " :IS SIMILAR TO: " + y)
523
+ if len(s) > len(y):
524
+ uniq_output_content.remove(y)
525
+ elif len(y) > len(s):
526
+ uniq_output_content.remove(s)
527
+
528
+
529
+ for concept in uniq_output_content:
530
+ tagged = nltk.pos_tag(word_tokenize(concept))
531
+ tags = [word for word, pos in tagged if
532
+ pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
533
+ if len(tags) < 1:
534
+ continue
535
+ parameters_list = []
536
+ concept = concept.lower()
537
+ compt_Id += 1
538
+ sline = re.sub(r'^if ', '', concept)
539
+ sline = re.sub(r'^(if|preferably) ', '', sline)
540
+ sline = re.sub(r'^\s+?said ', '', sline)
541
+ # classify = SentenceClassifier(sline)
542
+ # polarite = classify.classifySentence()
543
+ classifyT = ClassifyWithIncr_it()
544
+ polarite = classifyT.main(concept)
545
+ # if polarite =='neutre':
546
+ # classify = SentenceClassifier(sline)
547
+ # polarite = classify.classifySentence()
548
+ # print(sline)
549
+
550
+ #if polarite == 'partialSolution':
551
+ #print(sline)
552
+ #Insert a classifier here
553
+ get_parameters = ParameterExtractor(concept)
554
+ parameters = get_parameters.extract_parameters()
555
+
556
+ parameters_list.extend(parameters)
557
+ # parameters_list = ", ".join(parameters_list)
558
+ clean_concept = self.last_cleansing(sline)
559
+ # if polarite == 'neutre':
560
+ # words = word_tokenize(clean_concept)
561
+ # hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
562
+ # noise_trash.append(hit)
563
+
564
+ validity = self.discardLines(concept, 'referencing_indices')
565
+ if polarite !='partialSolution' and validity is not None:
566
+
567
+ values = OrderedDict({
568
+ "concept": {
569
+ "type": polarite,
570
+ "id": graphItemIdValue + str(compt_Id),
571
+ "sentence": clean_concept,
572
+ "source": output_file_name,
573
+ "parameters": parameters_list,
574
+ "image": urlImg,
575
+ "pdf": urlPDF
576
+ }
577
+
578
+ })
579
+ json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
580
+ output_result.append(json_string)
581
+ output_result = list(set(output_result))
582
+
583
+
584
+
585
+
586
+
587
+ output_json = ",".join(output_result)
588
+ return output_json, total_sentences_number
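Note: get_from_description() returns the concepts as a comma-joined string of JSON fragments rather than a parsed list. A minimal sketch of how a caller can turn that return value back into a structure, mirroring the wrapping PGProcessor performs later in this commit (the sample fragment is invented):

import json

def wrap_problem_graph(output_json):
    # output_json is the comma-joined string returned above;
    # wrapping it in a JSON array makes it parseable in one call
    return json.loads('{"problem_graph": [%s]}' % output_json)

graph = wrap_problem_graph('{"concept": {"type": "problem", "id": "x1", "sentence": "the seal leaks"}}')
assert graph["problem_graph"][0]["concept"]["type"] == "problem"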
App/bin/InformationExtractor_Claims.py ADDED
@@ -0,0 +1,165 @@
1
+ from App.bin.FiguresCleaner import FiguresCleaner
2
+ from App.bin.ParameterExtractor import ParameterExtractor
3
+ from App.bin import constants
4
+ import nltk
5
+ import re
6
+ import os
7
+
8
+ import json
9
+ import hashlib
10
+ import Levenshtein
11
+ import uuid
12
+ from collections import OrderedDict
13
+ from App.bin.SharpClassifier import SharpClassifier
14
+ from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
15
+
16
+
17
+ class InformationExtractorClaims(object):
18
+
19
+ def __init__(self, section, input_folder, file_extension, file_name):
20
+ self.section = section
21
+ self.input_folder = input_folder
22
+ self.file_extension = file_extension
23
+ self.file_name = file_name
24
+
25
+ patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
26
+ sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
27
+ sentence_finder._params.abbrev_types.update(patent_abbreviations)
28
+ self.sentence_finder = sentence_finder
29
+
30
+ def clean_data (self, sentence):
31
+
32
+ sentence = str(sentence.lower())
33
+ sentence = re.sub(r'\(\s,?\s?\)', '', sentence)
34
+ sentence = re.sub(r'\s+,', ',', sentence)
35
+ sentence = re.sub(r'^\d+', '', sentence)
36
+ sentence = re.sub(r'\s+', ' ', sentence)
37
+ if sentence is not None:
38
+ return sentence
39
+
40
+ def truncate_data (self, sentence):
41
+
42
+ sentence = str(sentence.lower())
43
+ sentence = re.sub(r'wherein said\s*', '', sentence)
44
+ sentence = re.sub(r'characterized in that said\s*|characterised in that said\s*', '', sentence)
45
+ sentence = re.sub(r'wherein\s*', '', sentence)
46
+ sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
47
+ sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
48
+ sentence = re.sub(r'where said\s*', '', sentence)
49
+ sentence = re.sub(r'where\s*', '', sentence)
50
+ sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
51
+ sentence = re.sub(r'.*thereof\s*\,?', '', sentence)
52
+ sentence = re.sub(r'^\s+', '', sentence)
53
+ sentence = re.sub(r'\s+\.$', '', sentence)
54
+ if sentence is not None:
55
+ return sentence
56
+
57
+ def selectLines(self, line, lexic):
58
+ with open(constants.ASSETS + lexic) as n:
59
+ inclusion_list = n.read().splitlines()
60
+ claims_words = re.compile('|'.join(inclusion_list))
61
+ m = re.search(claims_words, line)
62
+ if m is not None:
63
+ return m.group(1)
64
+ # pass
65
+ # return line
66
+ def main(self):
67
+
68
+ output_result = []
69
+ compt_Id = 50
70
+ count_concept = 3
71
+
72
+ clean_content_list = []
73
+ concept_list = []
74
+
75
+ output_content = []
76
+
77
+ uniq_output_linked_content =[]
78
+ parameters_list = []
79
+ total_sentences_number =0
80
+ section = self.section
81
+ input_folder = self.input_folder
82
+ file_name = self.file_name
83
+ file_extension = self.file_extension
84
+ projectFolder = os.path.basename(os.path.normpath(input_folder))
85
+ output_file_name = input_folder+"/"+file_name+file_extension.strip("*")
86
+
87
+ root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
88
+ root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
89
+
90
+
91
+
92
+ if file_name is not None:
93
+ match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
94
+ # CC for country code
95
+ CC = match.group(1)
96
+ #NR for Number
97
+ NR = match.group(2)
98
+ NR = re.sub(r'\s', '', NR)
99
+ #KC for Kind code
100
+ KC = match.group(4)
101
+
102
+ urlImg = root_img_url+'CC='+CC+'&NR='+NR+'&KC='+KC  # root URL already ends with '&'
103
+ urlPDF = root_pdf_url+'CC='+CC+'&NR='+NR+'&KC='+KC+'&FT=D&ND=3&date='+'&DB=&locale=en_EP#'
104
+
105
+ graphItemId = hashlib.md5(file_name.encode())
106
+ graphItemIdValue = graphItemId.hexdigest()
107
+ graphItemIdValue = str(uuid.uuid4())  # overrides the md5-based id above with a random uuid
108
+
109
+ sentence_finder = self.sentence_finder
110
+ sentences = sentence_finder.tokenize(section.strip())
111
+ for sentence in sentences:
112
+ # print(sentence)
113
+ sentence = self.clean_data(sentence)
114
+ if sentence:  # clean_data may return None
115
+ clean_content_list.append(sentence)
116
+ for line in clean_content_list:
117
+ # print(len(line.split()))
118
+ if not re.match(r'^\s*$', line):
119
+
120
+ line = self.selectLines(line, 'claims_indices')
121
+
122
+ if line is not None and count_concept > 0:
123
+ line = self.truncate_data(line)
124
+ line = re.sub(r'in that', '', line)
125
+ # print(line, len(line.split()))
126
+ concept_list.append(line)
127
+ count_concept -= 1
128
+
129
+ count_concept = 3
130
+ if concept_list:
131
+ total_sentences_number = len(concept_list)
132
+ for concept in concept_list :
133
+
134
+
135
+ if concept is not None and not re.match(r'^\s,', concept) and len(concept.split())<50:
136
+ classifyT = ClassifyWithIncr_it()
137
+ polarite = classifyT.main(concept)
138
+ get_parameters = ParameterExtractor(concept)
139
+ parameters = get_parameters.extract_parameters()
140
+
141
+ parameters_list = list(parameters)  # one concept's parameters only, matching the description extractor
142
+
143
+ values = OrderedDict({
144
+ "concept": {
145
+ "type": polarite,
146
+ "id": graphItemIdValue + str(compt_Id),
147
+ "sentence": concept,
148
+ "source": output_file_name,
149
+ "parameters": parameters_list,
150
+ "image": urlImg,
151
+ "pdf": urlPDF
152
+
153
+ }
154
+
155
+ })
156
+ json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
157
+ output_result.append(json_string)
158
+ output_result = list(set(output_result))
159
+
160
+ output_json = ",".join(output_result)
161
+
162
+ return output_json, total_sentences_number
163
+
164
+
165
+
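To see what truncate_data does to a claim, here is a self-contained sketch applying the same substitutions to an invented sample sentence (not part of the module):

import re

claim = "wherein said valve further comprising a spring-loaded seal ."
s = re.sub(r'wherein said\s*', '', claim)
s = re.sub(r'further comprising', 'the system or method comprises', s)
s = re.sub(r'\s+\.$', '', s)
print(s)  # "valve the system or method comprises a spring-loaded seal"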
App/bin/InputHandler.py ADDED
@@ -0,0 +1,35 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
4
+ import glob
5
+ import os
6
+
7
+
8
+ class InputHandler(object):
9
+
10
+ def __init__(self, folder_path, extension):
11
+ self.folder_path = folder_path
12
+ self.extension = extension
13
+
14
+ print("Handling Corpus...")
15
+
16
+
17
+ def _get_dirs(self, base):
18
+ return [x for x in glob.iglob(os.path.join(base, '*')) if os.path.isdir(x)]
19
+
20
+ def get_base_file(self, base, pattern):
21
+ lList = []
22
+ lList.extend(glob.glob(os.path.join(base, pattern)))
23
+ dirs = self._get_dirs(base)
24
+ if len(dirs):
25
+ for d in dirs:
26
+ lList.extend(self.get_base_file(d, pattern))  # d already includes base, from _get_dirs
27
+ return lList
28
+
29
+ def get_input(self):
30
+ folder_path = self.folder_path
31
+ extension = self.extension
32
+ patent_files = self.get_base_file(folder_path, extension)
33
+ return patent_files
34
+
35
+
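get_base_file walks sub-directories by hand; on Python 3.5+ the same listing can be obtained with glob's recursive mode. A possible simplification, not the module's actual code:

import glob
import os

def get_input_recursive(folder_path, extension):
    # extension is a pattern such as "*.txt", as in InputHandler
    return glob.glob(os.path.join(folder_path, '**', extension), recursive=True)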
App/bin/MagicParser.py ADDED
@@ -0,0 +1,45 @@
1
+ import json
2
+ from App.bin import constants
3
+
4
+
5
+ class MagicParser(object):
6
+
7
+ def __init__(self, jsonFile):
8
+
9
+ self.jsonFile = jsonFile
10
+
11
+
12
+ def get_graph(self):
13
+
14
+ jsonFile = self.jsonFile
15
+ with open(jsonFile) as data_file:
16
+ data = json.load(data_file)
17
+ return data
18
+
19
+ def magic_parse(self):
20
+
21
+ count_problem = 0
22
+ count_partial_solution = 0
23
+ count_concepts = 0
24
+ count_parameters = 0
25
+ parameters = []
26
+ graph = self.get_graph()  # get_graph takes no argument and reads self.jsonFile itself
27
+
28
+ for item in graph['problem_graph']:
29
+ count_concepts +=1
30
+ for sub_item, value in item.items():
31
+ if value['type'] =='partialSolution':
32
+ count_partial_solution +=1
33
+ else:
34
+ count_problem +=1
35
+
36
+ for item in graph['parameters']:
37
+ for sub_item, value in item.items():
38
+ for id, parameter in value['valeurs'].items():
39
+ parameters.append(parameter)
40
+ count_parameters += 1
41
+
42
+ uniq_parameters_number = len(list(set(parameters)))
43
+
44
+ return {"concepts_number":count_concepts, "problems_number": count_problem, "partialSol_numbers":count_partial_solution, "parameters_number": count_parameters, "uniq_param_number": uniq_parameters_number}
45
+
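magic_parse assumes a graph document with "problem_graph" and "parameters" keys. A minimal sketch of the shape it counts, with invented values:

graph = {
    "problem_graph": [
        {"concept": {"type": "problem"}},
        {"concept": {"type": "partialSolution"}},
    ],
    "parameters": [
        {"concept": {"valeurs": {"ep1-1": "temperature", "ep1-2": "pressure"}}},
    ],
}
problems = sum(1 for item in graph["problem_graph"]
               for value in item.values()
               if value["type"] != "partialSolution")
assert problems == 1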
App/bin/PGProcessor.py ADDED
@@ -0,0 +1,107 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ from io import StringIO
9
+ from App.bin import constants
10
+ from collections import OrderedDict
11
+ from App.bin.InformationExtractor import InformationExtractor
12
+ from App.bin.ParameterExtractor import ParameterExtractor
13
+ from App.bin.TechnologyFinder import TechnologyFinder
14
+
15
+ class PGProcessor(object):
16
+
17
+ def __init__(self, patents,input_folder, file_extension):
18
+ self.patents = patents
19
+ self.input_folder = input_folder
20
+ self.file_extension = file_extension
21
+ print("Processing started")
22
+
23
+ def process_corpus(self):
24
+
25
+ count_abstract = 0
26
+ count_claims = 0
27
+ count_description = 0
28
+ count_patent = 0
29
+ total_sentences_number =0
30
+ count_concepts_solupart = 0
31
+ count_concepts_problem = 0
32
+ patents = self.patents
33
+ input_folder = self.input_folder
34
+ file_extension = self.file_extension
35
+ project_folder = os.path.basename(os.path.normpath(input_folder))
36
+ graph_folder = constants.GRAPH_FOLDER + project_folder+"/"
37
+ extracted_concepts = []
38
+ output_result = []
39
+ parameters_graph = []
40
+ reduced_content = []
41
+ patent_corpus = []
42
+ source_list = []
43
+ parameters_list =[]
44
+ technologies_graph =[]
45
+
46
+
47
+ for patent_file in patents:
48
+
49
+ read_patent = StringIO(patent_file)
50
+ patent = json.load(read_patent)
51
+ nNumber = patent['number']
52
+ aAbstract = patent['abstract']
53
+ cClaims = patent['claims']
54
+ dDescription = patent['description']
55
+ source = patent['source']
56
+
57
+ if dDescription !="":
58
+ count_description +=1
59
+ extract_concepts = InformationExtractor(dDescription,input_folder, file_extension, nNumber, source )
60
+ output_json, sentences_in_patent = extract_concepts.get_from_description()
61
+ if output_json != "":
62
+ extracted_concepts.append(output_json)
63
+ total_sentences_number += sentences_in_patent
64
+ elif cClaims !="":
65
+ count_claims +=1
66
+ print('Processing claims')
67
+ else:
68
+ count_abstract +=1
69
+ print("processing abstract")
70
+ count_patent +=1
71
+
72
+
73
+ #print(source)
74
+ source_list.append(source)
75
+
76
+
77
+ header = '{'
78
+ graph = '"problem_graph": [%s]' % ','.join(extracted_concepts)
79
+ footer = '}'
80
+ output_result.extend((header, graph, footer))
81
+ output_result = "".join(output_result)
82
+ concepts_json = json.loads(output_result)
83
+ count_concepts = len(concepts_json['problem_graph'])
84
+ for item, value in concepts_json.items():
85
+ #if cle == "type" and value =="partialSolution":
86
+ # print ("yes")
87
+ for element in value:
88
+ for cle, valeur in element.items():
89
+ for k,v in valeur.items():
90
+ if k == "type" and v =="partialSolution":
91
+ count_concepts_solupart += 1
92
+ elif k == "type" and v =="problem":
93
+ count_concepts_problem += 1
94
+ json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
95
+ #print(concepts_json.keys())
96
+ with open(graph_folder+"graph.json", 'w') as json_graph:
97
+ json_graph.write(json_write_to_file)
98
+
99
+ print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
100
+ print("%s phrases ont été analysée(s)" % (total_sentences_number))
101
+ print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (count_concepts, count_concepts_problem, count_concepts_solupart))
102
+
103
+ #Display graphics
104
+ first_color = (46, 204, 113)
105
+ second_color = (245, 176, 65)
106
+ #self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
107
+ return concepts_json
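process_corpus assembles the output document by string concatenation and then re-parses it with json.loads. An equivalent sketch that builds the structure directly, assuming each entry in extracted_concepts holds a single JSON object (illustrative only):

import json

extracted_concepts = ['{"concept": {"type": "problem", "id": "x1"}}']
concepts_json = {"problem_graph": [json.loads(c) for c in extracted_concepts]}
print(json.dumps(concepts_json, indent=4))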
App/bin/ParamProcessor.py ADDED
@@ -0,0 +1,99 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ from io import StringIO
9
+ from App.bin import constants
10
+ from collections import OrderedDict
11
+ from App.bin.InformationExtractor import InformationExtractor
12
+ from App.bin.ParameterExtractor import ParameterExtractor
13
+ from App.bin.TechnologyFinder import TechnologyFinder
14
+
15
+ class ParamProcessor(object):
16
+
17
+ def __init__(self, patents,input_folder, file_extension):
18
+ self.patents = patents
19
+ self.input_folder = input_folder
20
+ self.file_extension = file_extension
21
+ print("Processing started")
22
+
23
+ def change_keys(self, dictionnary, number):
24
+ number = number+'-'
25
+ if type(dictionnary) is dict:
26
+ return dict([(number+str(k) , self.change_keys(v, number)) for k, v in dictionnary.items()])
27
+ else:
28
+ return dictionnary
29
+
30
+ def process_corpus(self):
31
+
32
+ count_patent = 0
33
+ patents = self.patents
34
+ input_folder = self.input_folder
35
+ project_folder = os.path.basename(os.path.normpath(input_folder))
36
+ graph_folder = constants.GRAPH_FOLDER + project_folder+"/"
37
+ output_result = []
38
+ parameters_graph = []
39
+ reduced_content = []
40
+ patent_corpus = []
41
+ source_list = []
42
+ parameters_list =[]
43
+
44
+
45
+ for patent_file in patents:
+ parameters_list = []  # reset per patent; otherwise earlier patents' parameters leak into this entry
46
+
47
+ read_patent = StringIO(patent_file)
48
+ patent = json.load(read_patent)
49
+ nNumber = patent['number']
50
+ aAbstract = patent['abstract']
51
+ cClaims = patent['claims']
52
+ dDescription = patent['description']
53
+ source = patent['source']
54
+
55
+ patent_content = aAbstract + cClaims + dDescription
56
+ patent_content = patent_content.splitlines()
57
+
58
+ for line in patent_content:
59
+ get_parameters = ParameterExtractor(line)
60
+ parameters = get_parameters.extract_parameters()
61
+ if parameters:
62
+ parameters_list.extend( parameters)
63
+
64
+
65
+ parameters_list=list(set(parameters_list))
66
+
67
+ parameters = dict(enumerate(parameters_list, 1))
68
+
69
+ parameters = self.change_keys(parameters, nNumber.lower())
70
+
71
+ parameters_array = OrderedDict({
72
+ "concept": {
73
+ "source": source,
74
+ "valeurs": parameters,
75
+
76
+ }
77
+
78
+ })
79
+ pParameters = json.dumps(parameters_array, sort_keys=False, indent=4, separators=(',', ': '))
80
+ parameters_graph.append(pParameters)
81
+ count_patent +=1
82
+ source_list.append(source)
83
+ patent_corpus.append(reduced_content)
84
+
85
+ header = '{'
86
+ parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
87
+ footer = '}'
88
+ output_result.extend((header, parameters_output, footer))
89
+
90
+ output_result = "".join(output_result)
91
+ concepts_json = json.loads(output_result)
92
+
93
+
94
+ json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
95
+
96
+ with open(graph_folder+"parameters-graph.json", 'w') as json_graph:
97
+ json_graph.write(json_write_to_file)
98
+
99
+ return concepts_json
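change_keys prefixes every key of the parameters dict with the patent number, which keeps ids unique when several patents are merged into one graph. A quick usage sketch of the same logic on a flat dict, with invented values:

parameters = {1: 'temperature', 2: 'pressure'}
prefixed = {('ep1234567a1-' + str(k)): v for k, v in parameters.items()}
print(prefixed)  # {'ep1234567a1-1': 'temperature', 'ep1234567a1-2': 'pressure'}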
App/bin/ParameterExtractor.py ADDED
@@ -0,0 +1,51 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import re
4
+ import nltk
5
+ import Levenshtein
6
+ from App.bin import constants
7
+
8
+ class ParameterExtractor(object):
9
+
10
+ def __init__(self, sentence):
11
+ self.sentence = sentence
12
+
13
+ def clean_parameter(self, parameter):
14
+ line = re.sub(r'\s[a-zA-Z]$', r'', parameter)
15
+ line = line.strip()
16
+ return line
17
+
18
+ def extract_parameters(self):
19
+ sentence = self.sentence
20
+ parameters_list = []
21
+ with open(constants.ASSETS + "parameter_core", 'r') as l:
22
+ words_list = l.read().splitlines()
23
+ match_word = re.compile(r'(\b(?:%s)\b)' % '|'.join(words_list))
24
+
25
+ with open(constants.ASSETS + "exclude_from_parameters", 'r') as m:
26
+ not_included_words_list = m.read().splitlines()
27
+ match_not_included_word = re.compile(r'(\b(?:%s)\b)' % '|'.join(not_included_words_list))
28
+
29
+ parameter_indice = re.search(match_word, sentence)
30
+ if parameter_indice:
31
+ words = nltk.word_tokenize(sentence)
32
+ sentence = nltk.pos_tag(words)
33
+ grammar = """PARAMETER:{<NN>+<IN><DT>?<NN.*>+}
34
+ {<NN*>+}
35
+ """
36
+ parameter_parser = nltk.RegexpParser(grammar)
37
+ tree = parameter_parser.parse(sentence)
38
+ for subtree in tree.subtrees():
39
+ if subtree.label() == 'PARAMETER':
40
+ parameter_candidate = " ".join(word for word, tag in subtree.leaves())
41
+ parameter_candidate_indice = re.search(match_word, parameter_candidate)
42
+ not_parameter = re.search(match_not_included_word, parameter_candidate)
43
+ if parameter_candidate_indice and not not_parameter :
44
+ #parameter_candidate=self.clean_parameter(parameter_candidate)
45
+ parameters_list.append(parameter_candidate)
46
+ parameters_list = list(set(parameters_list))
47
+
48
+
49
+
50
+ return list(parameters_list)
51
+
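The PARAMETER grammar chunks noun groups such as "temperature of the fluid". A self-contained sketch of the same chunking on an invented sentence (requires the punkt and averaged_perceptron_tagger NLTK data packages):

import nltk

sentence = "the temperature of the fluid increases rapidly"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
grammar = r"""PARAMETER: {<NN>+<IN><DT>?<NN.*>+}
                         {<NN.*>+}
"""
tree = nltk.RegexpParser(grammar).parse(tagged)
for subtree in tree.subtrees():
    if subtree.label() == 'PARAMETER':
        print(" ".join(word for word, tag in subtree.leaves()))
# expected chunk: "temperature of the fluid"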
App/bin/PatentHandler.py ADDED
@@ -0,0 +1,254 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
4
+ import glob
5
+ import nltk
6
+ import os
7
+ import re
8
+ import codecs
9
+ import chardet
10
+ import shutil
11
+ import json
12
+ from io import StringIO
13
+ from App.bin import constants
14
+ from App.bin.FiguresCleaner import FiguresCleaner
15
+
16
+
17
+ from collections import OrderedDict
18
+
19
+ class PatentHandler(object):
20
+
21
+ def __init__(self, patents):
22
+ self.patents = patents
23
+
24
+ def custom_cleaner(self, line):
25
+ line = str(line)
26
+ #line = line.lower()
27
+ line = re.sub(r'PatentInspiration Url', '', line)
28
+ line = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', line)
29
+ line = re.sub(r'{', '(', line)
30
+ line = re.sub(r'&quot;', '\'', line)
31
+ line = re.sub(r'}', ')', line)
32
+ line = re.sub(r'\t.*patentinspiration.*\n', '', line)
33
+ line = re.sub(r'(?:^|\n{2,})\bAbstract\b\n?', '', line)
34
+ line = re.sub(r'(?:^|\n{2,})\bClaims\b\n?', '', line)
35
+ line = re.sub(r'(?:^|\n{2,})\bDescription\b\n?', '', line)
36
+ line = re.sub(r'fig\.', 'figure', line)
37
+ line = re.sub(r'Fig\.', 'Figure', line)
38
+ line = re.sub(r'FIG\.', 'Figure', line)
39
+ line = re.sub(r'figs\.', 'figures', line)
40
+ line = re.sub(r'FIGS\.', 'Figures', line)
41
+ line = re.sub(r'(\w+\.)', r'\1 ', line)
42
+ line = re.sub(r'&#39;', '\'', line)
43
+ line = re.sub(r'&gt;', '>', line)
44
+ line = re.sub(r'&lt;', '<', line)
45
+ line = re.sub(r'&#176;', ' deg.', line)
46
+ line = re.sub(r' ', ' ', line)
47
+ line = line.strip()
48
+ return line
49
+
50
+ def dataCleaner(self,line):
51
+ with open(constants.ASSETS + "dropPart") as l:
52
+ # next(l)
53
+ drop_part = l.read().splitlines()
54
+ drop_part_pattern = re.compile('|'.join(drop_part))
55
+
56
+ line = str(line)
57
+ #line = line.lower()
58
+ line = re.sub(r'^([A-Z-/]+\s)+([A-Z])', r'\n\2', line)
59
+ line = re.sub(drop_part_pattern, r'\n', line)
60
+ line = re.sub(r'\s+\.\s?\d+\s+', ' ', line)
61
+ line = line.strip()
62
+ return line
63
+
64
+ def smooth_data_cleaner(self,line):
65
+ line = str(line)
66
+ # line = line.lower()
67
+ line = re.sub(r'\s+,', ',', line)
68
+ line = re.sub(r'\d\w-\d\w (and? \d\w-\d\w)?', '', line)
69
+ line = re.sub(r'\d\w-\d\w', '', line)
70
+ line = re.sub(r'\(\s?(,\s?|;\s?)+\s?\)', '', line)
71
+ line = re.sub(r'\s+\.\s\.', '.\n', line)
72
+ line = re.sub(r'\s+\.\s+([a-z]+)', r' \1', line)
73
+ line = re.sub(r'\s+(\.)\s+\[\s?\d+\s?]\s+', r'.\n', line)
74
+ line = re.sub(r'\s?\[\s?\d+\s?]\s+', r'\n', line)
75
+ line = re.sub(r'\s+(\.)\s+([A-Z]+)', r'.\n\2', line)
76
+ line = re.sub(r'\s+;\s+', '; ', line)
77
+ line = re.sub(r'\(\s+\'\s+\)', '', line)
78
+ line = re.sub(r'\(\s+\)', '', line)
79
+ line = re.sub(r'\(\s?\.\s?\)', '', line)
80
+ line = re.sub(r'\(\s/\s?\)', '', line)
81
+ line = re.sub(r'\s{2,}', ' ', line)
82
+ line = re.sub(r'(\d+)\s+(\.)\s+(\d+)', r'\1.\3', line)
83
+ line = line.strip()
84
+ return line
85
+
86
+
87
+ def get_project_folder(self):
88
+ patents = self.patents
89
+ if patents:
90
+ file = patents[0]
91
+ project_folder = os.path.basename(os.path.dirname(file))
92
+ return project_folder
93
+
94
+ def convert_to_utf8(self, input_file_name, output_file_name, file_encoding):
95
+
96
+ BLOCKSIZE = 1048576
97
+ with codecs.open(input_file_name, "r", file_encoding) as input_file:
98
+ with codecs.open(output_file_name, "w", "utf-8") as output_file:
99
+ while True:
100
+ file_contents = input_file.read(BLOCKSIZE)
101
+ if not file_contents:
102
+ break
103
+ output_file.write(file_contents)
104
+
105
+ def sectionFinder(self, file_name, start_delimiter, end_delimiter):
106
+
107
+ patent_file = open(file_name, encoding='utf-8')
108
+ section = ""
109
+ found = False
110
+
111
+ for line in patent_file:
112
+ if found :
113
+ section += line
114
+ if line.strip() == end_delimiter:
115
+ break
116
+ else:
117
+ if line.strip() == start_delimiter:
118
+ found = True
119
+ # abstract = "Abstract\n"
120
+ return section
121
+
122
+ def pretreat_data(self):
123
+ clean_patent_data= []
124
+ patents = self.patents
125
+
126
+ project_folder = self.get_project_folder()
127
+
128
+ # original code
129
+ # corpus_folder = constants.CORPUS + project_folder + "/"
130
+
131
+ corpus_folder = str(constants.CORPUS)+str(project_folder)+"/"
132
+ temp_folder = str(constants.TEMP)+str(project_folder)+"/"
133
+ graph_folder = str(constants.GRAPH_FOLDER)+str(project_folder)+"/"
134
+
135
+ folders = [corpus_folder, temp_folder, graph_folder]
136
+ for folder in folders:
137
+ if not os.path.exists(folder):
138
+ os.makedirs(folder)
139
+ else:
140
+ shutil.rmtree(folder)
141
+ os.makedirs(folder)
142
+
143
+ for patent in patents:
144
+
145
+ patent_name_with_extension = os.path.basename(patent)
146
+ patent_name, extension = patent_name_with_extension.rsplit('.', 1)
147
+ corpus_patent_path = corpus_folder + patent_name_with_extension
148
+ #temp_patent_path = temp_folder + patent_name+'.json'
149
+
150
+ patent_binary = open(patent, 'rb').read()
151
+
152
+ file_encoding = chardet.detect(patent_binary)
153
+ file_encoding = file_encoding['encoding']
154
+ self.convert_to_utf8(patent, corpus_patent_path, file_encoding)
155
+
156
+ temp_file = StringIO()
157
+ #print(temp_patent_path)
158
+ a_abstract = self.sectionFinder(corpus_patent_path,"Abstract", "Claims")
159
+ a_abstract= self.custom_cleaner(a_abstract)
160
+ abstract_cleaner = FiguresCleaner(a_abstract)
161
+ a_abstract = ''.join(abstract_cleaner.clean_figures())
162
+ a_abstract = self.smooth_data_cleaner(a_abstract)
163
+ a_abstract = self.dataCleaner(a_abstract)
164
+
165
+ c_claims = self.sectionFinder(corpus_patent_path, "Claims", "")
166
+ c_claims = self.custom_cleaner(c_claims)
167
+ claims_cleaner = FiguresCleaner(c_claims)
168
+ c_claims = ''.join(claims_cleaner.clean_figures())
169
+ c_claims = self.smooth_data_cleaner(c_claims)
170
+ c_claims = self.smooth_data_cleaner(c_claims)
171
+
172
+ d_description = self.sectionFinder(corpus_patent_path,"Description", "Claims")
173
+ d_description = self.custom_cleaner(d_description)
174
+ description_cleaner = FiguresCleaner(d_description)
175
+ d_description = ''.join(description_cleaner.clean_figures())
176
+ d_description = self.smooth_data_cleaner(d_description)
177
+ d_description = self.dataCleaner(d_description)
178
+
179
+ #TODO Manipulate data on system memory.
180
+
181
+ data = {
182
+
183
+ 'number': patent_name,
184
+ 'abstract': a_abstract,
185
+ 'claims': c_claims,
186
+ 'description': d_description
187
+ }
188
+
189
+ json.dump(data, temp_file)
190
+ clean_patent_data.append(temp_file.getvalue())
191
+ return clean_patent_data
192
+
193
+
194
+ def pretreat_json(self):
195
+ clean_patent_data= []
196
+ patents = self.patents
197
+ temp_file = StringIO()
198
+
199
+ for patent in patents:
200
+ patent = json.dumps(patent)
201
+
202
+ read_patent_t = StringIO(patent)
203
+ patent_section = json.load(read_patent_t)
204
+ filename = patent_section['filename']
205
+ number = patent_section['number']
206
+
207
+ a_abstract = patent_section['abstract']
208
+ a_abstract= self.custom_cleaner(a_abstract)
209
+ abstract_cleaner = FiguresCleaner(a_abstract)
210
+ a_abstract = ''.join(abstract_cleaner.clean_figures())
211
+ a_abstract = self.smooth_data_cleaner(a_abstract)
212
+ a_abstract = self.dataCleaner(a_abstract)
213
+
214
+ c_claims = patent_section['claims']
215
+ c_claims = self.custom_cleaner(c_claims)
216
+ claims_cleaner = FiguresCleaner(c_claims)
217
+ c_claims = ''.join(claims_cleaner.clean_figures())
218
+ c_claims = self.smooth_data_cleaner(c_claims)
219
+ c_claims = self.smooth_data_cleaner(c_claims)
220
+
221
+ d_description = patent_section['description']
222
+ d_description = self.custom_cleaner(d_description)
223
+ description_cleaner = FiguresCleaner(d_description)
224
+ d_description = ''.join(description_cleaner.clean_figures())
225
+ d_description = self.smooth_data_cleaner(d_description)
226
+ d_description = self.dataCleaner(d_description)
227
+
228
+ #TODO Manipulate data on system memory.
229
+
230
+ data = {
231
+ 'filename': filename,
232
+ 'number': number,
233
+ 'abstract': a_abstract,
234
+ 'claims': c_claims,
235
+ 'description': d_description
236
+ }
237
+
238
+
239
+ clean_patent_data.append(data)
240
+ #json.dumps(clean_patent_data, temp_file)
241
+
242
+ #print(json.dumps(clean_patent_data))
243
+ return clean_patent_data
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
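convert_to_utf8 detects the source encoding with chardet and transcodes in fixed-size blocks. A condensed sketch of the same detect-then-transcode idea for a file small enough to read at once (illustrative):

import codecs
import chardet

def to_utf8(src, dst):
    raw = open(src, 'rb').read()
    encoding = chardet.detect(raw)['encoding']  # best-guess source encoding
    with codecs.open(src, 'r', encoding) as f_in, codecs.open(dst, 'w', 'utf-8') as f_out:
        f_out.write(f_in.read())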
App/bin/SentenceClassifier.py ADDED
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+
4
+ import nltk
5
+ from App.bin import constants
6
+
7
+ class SentenceClassifier(object):
8
+ def __init__(self, sentence):
9
+ self.sentence = sentence
10
+ print("Classification....")
11
+
12
+
13
+ def classifySentence(self):
14
+
15
+ sentence = self.sentence
16
+
17
+ def bagOfWords(labelled):
18
+ wordsList = []
19
+ for (words, sentiment) in labelled:
20
+ wordsList.extend(words)
21
+ return wordsList
22
+
23
+ def wordFeatures(wordList):
24
+ wordList = nltk.FreqDist(wordList)
25
+ wordFeatures = wordList.keys()
26
+ return wordFeatures
27
+
28
+ def extract_Features(doc):
29
+ docWords = set(doc)
30
+ feat = {}
31
+ for word in wordFeatures:
32
+ feat['contains(%s)' % word] = (word in docWords)
33
+ return feat
34
+
35
+
36
+ with open(constants.ASSETS+"trainingsNegative") as l:
37
+ problems = [tuple(map(str, i.strip().split(':'))) for i in l]
38
+ with open(constants.ASSETS+"trainingsPositive") as f:
39
+ solutions = [tuple(map(str, i.strip().split(':'))) for i in f]
40
+
41
+ labelled = []
42
+ for (words, polarity) in solutions + problems:
43
+ words_filtered = [e.lower() for e in nltk.word_tokenize(words) if len(e) >= 3]
44
+ labelled.append((words_filtered, polarity))
45
+
46
+
47
+
48
+ wordFeatures = wordFeatures(bagOfWords(labelled))  # rebinds the name: from here on wordFeatures is the vocabulary used by extract_Features
49
+
50
+ training_set = nltk.classify.apply_features(extract_Features, labelled)
51
+
52
+ classifier = nltk.NaiveBayesClassifier.train(training_set)
53
+
54
+ #print(classifier.show_most_informative_features(32))
55
+
56
+
57
+ #print (sentence)
58
+ #print("{0} \n Polarity: {1} \n".format(sentence, classifier.classify(extract_Features(sentence.split()))))
59
+ classes = classifier.classify(extract_Features(sentence.split()))
60
+ return classes
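classifySentence retrains a Naive Bayes model from the two asset files on every call. A toy end-to-end sketch of the same NLTK pipeline with inline, invented training data:

import nltk

labelled = [
    (["leaks", "fails", "problem"], "problem"),
    (["improves", "reduces", "cost"], "partialSolution"),
]
vocabulary = list(nltk.FreqDist(w for words, _ in labelled for w in words))

def extract_features(doc):
    doc_words = set(doc)
    return {'contains(%s)' % w: (w in doc_words) for w in vocabulary}

train_set = nltk.classify.apply_features(extract_features, labelled)
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(extract_features("the seal fails".split())))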