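"""Extract sentences whose targets evoke selected frames from the RAI femicides split.

For every document in the main split, this script loads the "lome_0shot"
frame-semantic analysis, keeps annotations whose frame is in a small list of
frames of interest (e.g. "Killing", "Death"), pairs each target with its
syntactic construction, and groups the resulting sentences by
"<frame>++<construction>" key in output/scoring/extracted_frames.json.
"""
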
from collections import defaultdict
import json
import pandas as pd
from sociofillmore.common.analyze_text import analyze_single_document, load_deep_frames_cache


def main():
    # Frames whose evoking targets we want to extract
    frames_of_interest = ["Killing", "Death", "Dead_or_alive", "Event", "Catastrophe", "Undergoing"]

    # Metadata for all texts in the main RAI split
    texts_df = pd.read_csv(
        "output/femicides/split_data/rai/split_main.texts.meta.csv")
    deep_frames_cache = load_deep_frames_cache()

    # Maps "<frame>++<construction>" keys to the sentences in which they occur
    fcp_to_sentences = defaultdict(list)

    for doc_idx, (_, row) in enumerate(texts_df.iterrows()):
        # Lightweight progress indicator
        if doc_idx % 100 == 0:
            print(doc_idx)

        doc_analysis = analyze_single_document(row["text_id"], row["event_id"], "lome_0shot",
                                               "femicides/rai_main", texts_df, deep_frames_cache)
        for sent_idx, sent_analysis in enumerate(doc_analysis):
            sentence = " ".join(sent_analysis["sentence"])
            for fn_st in sent_analysis["fn_structures"]:
                frame = fn_st["frame"]
                # Index of the first target token, used to look up its syntactic construction
                tgt_idx = str(fn_st["target"]["tokens_idx"][0])
                if frame in frames_of_interest:
                    construction = sent_analysis["syntax"][tgt_idx][0]["syn_construction"]
                    fcp_to_sentences[f"{frame}++{construction}"].append({
                        "event_id": row["event_id"],
                        "frame": frame,
                        "construction": construction,
                        "target": fn_st["target"]["tokens_str"],
                        "text_id": row["text_id"],
                        "sentence_idx": sent_idx,
                        "sentence_str": sentence,
                        "selected_frame": frame,
                        "selected_cx": construction
                    })

    # Write all extracted frame/construction pairs to a JSON file for scoring
    with open("output/scoring/extracted_frames.json", "w") as f:
        json.dump(fcp_to_sentences, f, indent=4, sort_keys=True)
if __name__ == "__main__":
main()