PH-Weingarten
/

AISOP-oop-classifier

Text Classification

computer-science

Model card Files Files and versions

ppolx committed on Oct 30

Commit

e7de395

·

1 Parent(s): 50550af

runnability

Files changed (2) hide show

.gitignore +3 -0
recognize.py +61 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ .DS_Store
2	+ .idea
3	+

recognize.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# recognize: performs two-level topic recognition
+# Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
+# Installation: Install spacy then...
+# Usage: python recognize.py l1-model l2-models "this is a text"
+# l1-model path: a relative path (starting from this script) pointing to the level-1 model folder
+# l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
+# "this is a text": the text to recognize
+import sys                       # System-specific parameters and functions, part of Py
+import spacy                     # Natural language processing
+from pathlib import Path         # Object-oriented filesystem paths, part of Py
+import json                      # JSON object dumping functions
# --- Settings ----------------------------------------------------------------
RoundTo = 2                      # Number of decimals that scores are rounded to
Encoding = 'utf8'                # Text encoding setting; not referenced anywhere in this script
ScoreThreshold = 0.2             # Min. spacy category score for a label to be reported
MaxResults = 3                   # Max. number of labels reported per model
ParagraphMinLetters = 10         # Min. number of letters of a paragraph; not referenced in this script
ListMinLetters = 10              # Min. number of letters of a list; not referenced in this script
SubModels = {}                   # Mapping reserved for level-2 models

# --- Command line ------------------------------------------------------------
# Fail with a readable usage line instead of a bare IndexError when arguments
# are missing.
if len(sys.argv) < 4:
    sys.exit('Usage: python recognize.py l1-model l2-models "this is a text"')

ScriptDir = Path(__file__).parent

# The usage header documents the level-1 model path as relative to this script,
# so resolve it the same way as the level-2 folder below.  When no such folder
# exists, fall back to the raw argument so that working-directory-relative
# paths (and anything else spacy.load accepts) keep working as before.
L1ModelPath = ScriptDir.joinpath(sys.argv[1]).absolute()
Nlp = spacy.load(L1ModelPath if L1ModelPath.exists() else sys.argv[1])

# Folder expected to contain one sub-folder (a spacy model) per level-1 label.
SubModelDir = ScriptDir.joinpath(sys.argv[2]).absolute()

# NOTE: this name shadows the builtin input(); it is kept because the rest of
# the script refers to it.
input = sys.argv[3]
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Return the best-scoring categories of *doc* as a ``{label: score}`` dict.

    Args:
        doc: Object exposing a ``cats`` mapping of label -> score.
        scoreThreshold: A label is kept only if its score is strictly greater
            than this value.
        maxResults: Upper bound on the number of labels returned.  ``None``
            means no limit; a negative value is treated as ``0``.
        roundTo: Number of decimals the returned scores are rounded to.

    Returns:
        A dict whose entries are ordered by descending (unrounded) score.
        Labels with equal scores keep the order they have in ``doc.cats``.
    """
    # A negative bound would otherwise slice from the end and silently drop
    # the weakest labels instead of returning nothing.
    if maxResults is not None and maxResults < 0:
        maxResults = 0

    kept = [(label, score) for label, score in doc.cats.items()
            if score > scoreThreshold]
    # list.sort is stable, also with reverse=True, so ties keep source order.
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return {label: round(score, roundTo) for label, score in kept[:maxResults]}
def recognize(text):
    """Classify *text* with the level-1 model, then refine each hit at level 2.

    The level-1 model ``Nlp`` yields up to ``MaxResults`` labels scoring above
    ``ScoreThreshold``.  For every such label, a model is looked up in the
    sub-folder of ``SubModelDir`` named after the (stripped) label and run on
    the same text.

    Args:
        text: The text to recognize.

    Returns:
        A dict keyed by level-1 label.  Each value holds ``'score'`` (the
        level-1 score) and, when a sub-model folder exists, ``'subs'`` (the
        level-2 ``{label: score}`` dict).  If one or more sub-model folders
        are missing, the extra key ``"messages"`` holds the corresponding
        notices joined by ``"; "``.
    """
    # Level 1.  The module-level model is only read here: rebinding it to a
    # sub-model would make any later call run level 1 with a level-2 model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults, RoundTo)

    # Level 2.
    relabels = {}
    messages = []
    for label, score in labels.items():
        label2 = label.strip()
        subModel = SubModels.get(label2)
        if subModel is None:
            subModelPath = SubModelDir.joinpath(label2).absolute()
            if not subModelPath.exists():
                relabels[label2] = {'score': score}
                messages.append(f'Submodel path "{subModelPath}" not found')
                continue
            # Load each sub-model once and keep it for later calls.
            subModel = SubModels[label2] = spacy.load(subModelPath)
        docSub = filterDoc(subModel(text), ScoreThreshold, MaxResults, RoundTo)
        relabels[label2] = {'score': score, 'subs': docSub}

    # Report every missing sub-model, not just the last one encountered.
    if messages:
        relabels["messages"] = "; ".join(messages)
    return relabels
# Run the two-level recognition on the command-line text and write the
# resulting dict to stdout as a single JSON document.
_payload = json.dumps(recognize(input))
print(_payload)