wietsedv committed on
Commit
0d557da
1 Parent(s): 9bce6fd

Fix token aggregation

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -31,8 +31,14 @@ def tag(text, lang_index):
31
  loaded_model_id = model_id
32
  pipe = pipeline("token-classification", model_id, aggregation_strategy="first")
33
 
34
- out = pipe(text)
35
- out = [(g["word"], g["entity_group"]) for g in out]
 
 
 
 
 
 
36
 
37
  return out, model_link(model_id)
38
 
31
  loaded_model_id = model_id
32
  pipe = pipeline("token-classification", model_id, aggregation_strategy="first")
33
 
34
+ # Aggregate words:
35
+ # split on whitespace and PUNCT, but merge other subtokens (keep first tag)
36
+ out = []
37
+ for g in pipe(text):
38
+ if g["word"][0] == "▁" or g["entity"] == "PUNCT":
39
+ out.append((g["word"].lstrip("▁"), g["entity"]))
40
+ else:
41
+ out[-1] = (out[-1][0] + g["word"], out[-1][1])
42
 
43
  return out, model_link(model_id)
44