LasRuinasCirculares committed
Commit 861bb01
1 Parent(s): 2a5821d

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ knowledge_conflict_entity_based/result/entity_info.json filter=lfs diff=lfs merge=lfs -text
knowledge_conflict_entity_based/.DS_Store ADDED
Binary file (6.15 kB)
 
knowledge_conflict_entity_based/entity_substitute.py ADDED
@@ -0,0 +1,126 @@
+ import spacy
+ import zstandard as zstd
+ import json
+ import typing
+ import os
+ from tqdm import tqdm
+ import multiprocessing
+ import random
+ from langdetect import detect
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input_dir', type=str, help='Path to the input directory')
+ args = parser.parse_args()
+ input_dir = args.input_dir
+
+
+ def is_english(text):
+     try:
+         # langdetect raises LangDetectException on empty or undetectable text
+         return detect(text) == 'en'
+     except Exception:
+         return False
+
+
+ def process_text(texts, model, out_f, lock):
+     for text in texts:
+         doc = model(text)
+         # Count entity mentions by surface text, keeping one representative
+         # span per text so label_ and kb_id_ stay available.
+         freq_cnt = {}
+         span_by_text = {}
+         for e in doc.ents:
+             span_by_text[e.text] = e
+             freq_cnt[e.text] = freq_cnt.get(e.text, 0) + 1
+         if len(freq_cnt) == 0:
+             continue
+         sorted_freq = sorted(freq_cnt.items(), key=lambda x: x[1])
+         most_freq = span_by_text[sorted_freq[-1][0]]
+         data = {'text': text, 'main_entity': most_freq.text, 'label': most_freq.label_, 'id': most_freq.kb_id_}
+         json_data = json.dumps(data)
+         with lock:
+             out_f.write(json_data + '\n')
+             out_f.flush()
+
+
+ def run_ner_linking(texts: typing.List[str], ner_model_path: str):
+     nlp = spacy.load(ner_model_path)
+     out_f = open('result/temp_store_data.json', 'w', encoding='utf-8')
+     lock = multiprocessing.Lock()
+     processes = []
+
+     # One worker per 1000-document chunk; relies on fork-style multiprocessing,
+     # so children inherit nlp, out_f and the lock that serializes writes.
+     for i in tqdm(range(0, len(texts), 1000)):
+         p = multiprocessing.Process(target=process_text, args=(texts[i:i+1000], nlp, out_f, lock))
+         processes.append(p)
+         p.start()
+
+     for p in processes:
+         p.join()
+
+     out_f.close()
+     return
+
+
+ # Pass 1: collect English Wikipedia documents from the RedPajama .zst shards.
+ wikipedia_out_path = 'result/wikipedia.json'
+ subdirectories = [f.path for f in os.scandir(input_dir) if f.is_dir()]
+ wikipedia_data = []
+ for sub_dir in subdirectories:
+     chunk_dir = sub_dir + '/'
+     zst_files = [f for f in os.listdir(chunk_dir) if f.endswith('.zst')]
+     for file in tqdm(zst_files):
+         with open(chunk_dir + file, 'rb') as compressed_file:
+             decompressor = zstd.ZstdDecompressor()
+             with decompressor.stream_reader(compressed_file) as reader:
+                 decompressed_data = reader.read()
+                 for line in decompressed_data.splitlines():
+                     data = json.loads(line)
+                     if data['meta']['redpajama_set_name'] == 'RedPajamaWikipedia':
+                         if is_english(data['text']):
+                             wikipedia_data.append(data)
+
+ with open(wikipedia_out_path, 'w', encoding='utf-8') as f:
+     for data in wikipedia_data:
+         json_data = json.dumps(data)
+         f.write(json_data + '\n')
+
+ # Pass 2: tag each document's most frequent entity with the NER/EL model.
+ wikipedia_data = []
+ ner_model_path = 'kc-ner-model'
+ with open(wikipedia_out_path, 'r', encoding='utf-8') as f:
+     for line in tqdm(f):
+         data = json.loads(line)
+         wikipedia_data.append(data['text'])
+ run_ner_linking(wikipedia_data, ner_model_path)
+
+ entity_info_path = 'result/entity_info.json'
+ with open(entity_info_path, 'r', encoding='utf-8') as f:
+     entity_info = json.load(f)
+
+ # Group main entities by NER label so substitutes share the entity type.
+ category = {}
+ all_data = []
+ with open('result/temp_store_data.json', 'r', encoding='utf-8') as f:
+     for line in f:
+         data = json.loads(line)
+         all_data.append(data)
+         if data['label'] not in category:
+             category[data['label']] = []
+         category[data['label']].append(data['main_entity'])
+
+ # Pass 3: replace the main entity (and its aliases) with a random
+ # same-label entity to create the knowledge conflict.
+ with open('result/processed_data.json', 'w', encoding='utf-8') as f:
+     for data in tqdm(all_data):
+         text = data['text']
+         main_entity = [data['main_entity']]
+         if data['id'] in entity_info:
+             main_entity.extend(entity_info[data['id']]['aliases'])
+         candidates = [e for e in category[data['label']] if e not in main_entity]
+         if not candidates:
+             # No same-label substitute exists; skip instead of looping forever.
+             continue
+         replaced_entity = random.choice(candidates)
+         for entity in main_entity:
+             text = text.replace(entity, replaced_entity)
+         data = {
+             'text': text,
+             'original_main_entity': main_entity,
+             'replaced_entity': replaced_entity
+         }
+         json_data = json.dumps(data)
+         f.write(json_data + '\n')
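Note: the script builds entity-based knowledge conflicts in three passes: filter English RedPajama Wikipedia documents, tag each document's most frequent named entity with the NER/EL model, then rewrite the text by swapping that entity and its aliases for a random entity of the same NER label. A minimal sketch of the substitution step, using a hypothetical record and alias table in place of the real temp_store_data.json and entity_info.json:

import json
import random

# Hypothetical stand-ins for one temp_store_data.json record and entity_info.json.
record = {'text': 'Paris is the capital of France. Paris hosts the Louvre.',
          'main_entity': 'Paris', 'label': 'GPE', 'id': 'Q90'}
entity_info = {'Q90': {'aliases': ['City of Light']}}
category = {'GPE': ['Paris', 'London', 'Berlin']}  # main entities grouped by NER label

names = [record['main_entity']] + entity_info.get(record['id'], {}).get('aliases', [])

# Pick a same-label replacement that is neither the entity nor one of its aliases.
replacement = random.choice([e for e in category[record['label']] if e not in names])

text = record['text']
for name in names:
    text = text.replace(name, replacement)

print(json.dumps({'text': text, 'original_main_entity': names, 'replaced_entity': replacement}))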
knowledge_conflict_entity_based/requirements.txt ADDED
@@ -0,0 +1,5 @@
+ spacy==2.2.4
+ langdetect
+ zstandard
+ tqdm
+ wget
knowledge_conflict_entity_based/result/.DS_Store ADDED
Binary file (6.15 kB)
 
knowledge_conflict_entity_based/result/entity_info.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:423a217fa602456b961b6169b0bac15659ec85c90de2b261ca924c0ebe7d04a4
+ size 742977816
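entity_info.json is stored through Git LFS, so the repository holds only the pointer above: the spec version, the SHA-256 object id, and the size in bytes (about 743 MB). A small sketch, assuming a checkout where `git lfs pull` has not yet been run, that distinguishes a pointer from the real file before entity_substitute.py tries to json.load() it:

def is_lfs_pointer(path: str) -> bool:
    # A Git LFS pointer is a tiny text file whose first line names the spec.
    with open(path, 'rb') as f:
        first = f.readline()
    return first.startswith(b'version https://git-lfs.github.com/spec/')

if is_lfs_pointer('knowledge_conflict_entity_based/result/entity_info.json'):
    print('Run `git lfs pull` first to fetch the real entity_info.json.')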
knowledge_conflict_entity_based/run.sh ADDED
@@ -0,0 +1,2 @@
+ ### The processed data will be written to result/processed_data.json
+ python entity_substitute.py --input_dir /opt/data/private/szc/ml-knowledge-conflicts-main/test
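After the run completes, each line of result/processed_data.json is one JSON object holding the substituted text, the original main entity with its aliases, and the replacement. A minimal sketch for inspecting the output (field names taken from entity_substitute.py above):

import json

with open('result/processed_data.json', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        ex = json.loads(line)
        print(ex['replaced_entity'], '<-', ex['original_main_entity'])
        if i == 4:  # peek at the first five examples
            break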
knowledge_conflict_entity_based/setup.sh ADDED
@@ -0,0 +1,6 @@
+ pip install -r requirements.txt
+
+ # Download the SpaCy Named Entity Recognizer (NER) and Entity Linker (EL) model
+ # See https://spacy.io/usage/linguistic-features#named-entities and https://v2.spacy.io/usage/training#entity-linker
+ wget https://docs-assets.developer.apple.com/ml-research/models/kc-ner/model.gz -O kc-ner-model.gz
+ mkdir -p kc-ner-model && tar -xvzf kc-ner-model.gz -C kc-ner-model
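A quick sanity check after setup, assuming the archive unpacks into kc-ner-model/ in a layout loadable by spaCy v2 (as pinned in requirements.txt), that the NER/EL model loads and emits linked entities:

import spacy

# Assumes setup.sh has unpacked the model into ./kc-ner-model
nlp = spacy.load('kc-ner-model')
doc = nlp('Barack Obama was born in Hawaii.')
for ent in doc.ents:
    # kb_id_ is the linked knowledge-base id; empty if the linker found no match
    print(ent.text, ent.label_, ent.kb_id_)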