File size: 702 Bytes
b11ac48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import json

import spacy


# Coarse part-of-speech tags (compared against spaCy's ``Token.pos_``) whose
# tokens are kept as candidate predicates by ``do_frameid``.
TARGET_POS = ["NOUN", "VERB", "ADJ", "ADV"]


def do_frameid():
    """Write POS-based predicate candidates for every line of the titles corpus.

    Loads the spaCy pipeline ``it_core_news_md``, runs it on each
    whitespace-stripped line of ``data/migration/corpus_titoli_all_raw.txt``
    and writes one JSON object per input line to
    ``output/migration/pos_based_targetid/corpus_titoli_all_raw.jsonl``.

    Each JSON object has two keys:

    * ``"tokens"``: the token texts of the line, in order;
    * ``"predicates"``: the indices (into ``"tokens"``) of the tokens whose
      ``pos_`` tag is listed in ``TARGET_POS``.

    Every input line produces exactly one output line, so the two files stay
    line-aligned.
    """
    in_path = "data/migration/corpus_titoli_all_raw.txt"
    out_path = "output/migration/pos_based_targetid/corpus_titoli_all_raw.jsonl"

    nlp = spacy.load("it_core_news_md")

    # open(..., "w") does not create missing parent directories; make sure the
    # nested output folder exists so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(in_path, encoding="utf-8") as f_in, \
            open(out_path, "w", encoding="utf-8") as f_out:
        for line in f_in:
            doc = nlp(line.strip())
            out = {
                "tokens": [t.text for t in doc],
                "predicates": [i for i, t in enumerate(doc) if t.pos_ in TARGET_POS],
            }
            # Always terminate with "\n": the file is opened in text mode, which
            # already translates "\n" to the platform line ending. Writing
            # os.linesep here would produce "\r\r\n" on Windows.
            f_out.write(json.dumps(out) + "\n")


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    do_frameid()