Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
702 Bytes
import os
import json
import spacy
# Part-of-speech tags compared against spaCy's ``Token.pos_``; a token whose
# tag appears here is recorded as a predicate candidate by ``do_frameid``.
TARGET_POS = ["NOUN", "VERB", "ADJ", "ADV"]
def do_frameid(
    input_path="data/migration/corpus_titoli_all_raw.txt",
    output_path="output/migration/pos_based_targetid/corpus_titoli_all_raw.jsonl",
    model_name="it_core_news_md",
):
    """Write POS-based target candidates for every line of a text corpus.

    Each input line is stripped, run through a spaCy pipeline, and emitted as
    one JSON object per output line (JSONL) with two keys:

    * ``"tokens"``: the text of every token in the line.
    * ``"predicates"``: the indices of the tokens whose ``pos_`` tag is
      listed in ``TARGET_POS``.

    Lines that are empty after stripping still produce an output record, so
    input and output line numbers stay aligned.

    Args:
        input_path: UTF-8 text file with one sentence/title per line.
        output_path: Destination JSONL file; overwritten if it exists.
        model_name: Name of the spaCy pipeline passed to ``spacy.load``.
    """
    nlp = spacy.load(model_name)

    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError when the output file is opened.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(input_path, encoding="utf-8") as f_in, \
            open(output_path, "w", encoding="utf-8") as f_out:
        for line in f_in:
            doc = nlp(line.strip())
            out = {
                "tokens": [t.text for t in doc],
                "predicates": [
                    i for i, t in enumerate(doc) if t.pos_ in TARGET_POS
                ],
            }
            # Use "\n", not os.linesep: the file is opened in text mode, which
            # already translates "\n" to the platform line ending. Writing
            # os.linesep would yield "\r\r\n" on Windows.
            f_out.write(json.dumps(out) + "\n")
# Script entry point: run the POS-based target identification only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    do_frameid()