ppolx committed on
Commit
e7de395
·
1 Parent(s): 50550af

runnability

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. recognize.py +61 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .DS_Store
2
+ .idea
3
+
recognize.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # recognize: processes two-levels-topic-recognition
2
+ # Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
3
+
4
+ # Installation: Install spacy then...
5
+ # Usage: python recognize.py l1-model l2-models "this is a text"
6
+ # l1-model path: the level-1 model, passed unchanged to spacy.load (a relative path is therefore resolved against the current working directory, not against this script)
7
+ # l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
8
+ # "this is a text": the text to recognize
9
+
10
+
11
+ import sys # System-specific parameters and functions, part of Py
12
+ import spacy # Natural language processing
13
+ from pathlib import Path # Object-oriented filesystem paths, part of Py
14
+ import json # JSON object dumping functions
15
+
16
+
17
# --- Tunables -------------------------------------------------------------
# NOTE(review): RoundTo, Encoding, ParagraphMinLetters and ListMinLetters are
# not referenced by the code visible in this script; they look like leftovers
# from an HTML-enrichment variant of this tool -- confirm before removing.
RoundTo = 2                # Round to precision of n decimals
Encoding = 'utf8'          # Text encoding (presumably for HTML input read elsewhere)
ScoreThreshold = 0.2       # A category is kept only if its score is strictly above this value
MaxResults = 3             # Max. number of labels kept per model run
ParagraphMinLetters = 10   # Min. number of letters of paragraph to be considered in analysis
ListMinLetters = 10        # Min. number of letters of <ul> and <ol> to be considered in analysis

# Cache of level-2 models, keyed by level-1 label.
SubModels = {}

# Fail early with a usage hint instead of an IndexError on sys.argv below.
if len(sys.argv) < 4:
    sys.exit("Usage: python recognize.py l1-model l2-models \"this is a text\"")

# Level-1 model: argv[1] is handed to spacy.load unchanged.
Nlp = spacy.load(sys.argv[1])
# Folder holding one sub-folder (a level-2 model) per level-1 label,
# resolved relative to the directory containing this script.
SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute()
# The text to classify. The name shadows the builtin `input`; it is kept
# because the final print statement of this script refers to it.
input = sys.argv[3]
29
+
30
+
31
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Select the best-scoring categories of a classified document.

    Args:
        doc: Object exposing a ``cats`` mapping of label -> score.
        scoreThreshold: Only labels whose score is strictly greater than
            this value are kept.
        maxResults: Upper bound on the number of labels returned.
        roundTo: Number of decimals each returned score is rounded to.

    Returns:
        A dict of label -> rounded score, ordered by descending score.
    """
    # Keep only the (label, score) pairs that clear the threshold.
    candidates = [
        (name, score)
        for name, score in doc.cats.items()
        if score > scoreThreshold
    ]
    # Highest score first; the sort is stable, so ties keep their input order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return {
        name: round(score, roundTo)
        for name, score in candidates[0:maxResults]
    }
38
+
39
+
40
+
41
def recognize(text):
    """Run the two-level topic recognition on ``text``.

    The level-1 model (module-level ``Nlp``) classifies the text first. For
    every level-1 label that survives filtering, a level-2 model is looked up
    in the folder ``SubModelDir/<label>`` and, when present, run on the same
    text.

    Args:
        text: The text to classify.

    Returns:
        A dict keyed by the (whitespace-stripped) level-1 label. Each value
        holds ``'score'`` (the level-1 score) and, when a sub-model folder
        exists, ``'subs'`` (label -> score from the level-2 model). If a
        sub-model folder is missing, the top-level key ``"messages"`` holds
        a note about the most recent missing path.
    """
    # Level 1: classify with the top-level model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults)

    # Level 2: refine each level-1 label with its own sub-model.
    relabels = {}
    for label, score in labels.items():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()

        if not subModelPath.exists():
            relabels[label2] = {'score': score}
            # NOTE(review): a single string that is overwritten on every
            # missing sub-model and shares the namespace with label keys;
            # left as-is so the emitted JSON keeps its shape.
            relabels["messages"] = "Submodel path \"" + str(subModelPath) + "\" not found"
            continue

        # Load the sub-model into a local (never rebind the global level-1
        # model, which would corrupt later calls) and cache it for reuse.
        subNlp = SubModels.get(label2)
        if subNlp is None:
            subNlp = spacy.load(subModelPath)
            SubModels[label2] = subNlp

        docSub = filterDoc(subNlp(text), ScoreThreshold, MaxResults)
        relabels[label2] = {'score': score, 'subs': docSub}

    return relabels
60
+
61
+ print(json.dumps(recognize(input)))