import pandas as pd
import json
import glob


def tag_document(keywords, tokens):

    # Initialize the tags list with all O's
    tags = ['O'] * len(tokens)

    # Loop over the keywords and tag the document
    for keyword in keywords:
        # Split the keyword into words
        keyword_words = keyword.split()
        # Skip empty keywords; otherwise keyword_words[0] below would raise an IndexError
        if not keyword_words:
            continue

        # Loop over the words in the document
        for i in range(len(tokens)):
            # If the current word matches the first word of the keyword
            if tokens[i] == keyword_words[0]:
                match = True
                # Check if the rest of the words in the keyword match the following words in the document
                for j in range(1, len(keyword_words)):
                    if i+j >= len(tokens) or tokens[i+j] != keyword_words[j]:
                        match = False
                        break
                # If all the words in the keyword match the following words in the document, tag them as B-KEYWORD and I-KEYWORD
                if match:
                    tags[i] = 'B-KEYWORD'
                    for j in range(1, len(keyword_words)):
                        tags[i+j] = 'I-KEYWORD'

    return tags
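
# Illustrative example (not from the original script): for the keyword
# "machine learning" the function returns BIO tags aligned with the tokens, e.g.
#   tag_document(["machine learning"], "we study machine learning daily".split())
#   -> ['O', 'O', 'B-KEYWORD', 'I-KEYWORD', 'O']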


def create_tner_dataset(all_tags, all_tokens, output_file_addr):
    # Write tab-separated token/tag pairs in CoNLL style, one pair per line,
    # with a blank line separating documents; the with-block ensures the file
    # is flushed and closed.
    with open(output_file_addr, 'a+') as output_f:
        for tags, tokens in zip(all_tags, all_tokens):
            for tag, tok in zip(tags, tokens):
                output_f.write('\t'.join([tok, tag]))
                output_f.write('\n')
            output_f.write('\n')
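
# Illustrative output format (tokens and tags are examples, not real data),
# shown with "\t" standing in for a literal tab character:
#   machine\tB-KEYWORD
#   learning\tI-KEYWORD
#   is\tO
#   fun\tO
#   (blank line between documents)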
    

if __name__ == '__main__':

    data_df = pd.read_csv('truncated_wiki_plus_shuffled_41203.csv')
    id2document = data_df.set_index('id')['truncated_text_300'].to_dict()
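    # id2document maps each row's id to its truncated text, roughly
    # {101: 'Anarchism is a political philosophy ...', ...} (values illustrative only).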

    
    # Tag documents
    print('------------------  tag documents --------------------')
    all_tags = []
    all_tokens = []
    for tagged_data_addr in glob.iglob('./tagged_data*'):
        with open(tagged_data_addr) as tagged_f:
            for line in tagged_f:
                item = json.loads(line)
                # Skip records whose keyphrases field is not a list (e.g. missing values)
                if not isinstance(item['keyphrases'], list):
                    continue
                keywords = item['keyphrases']
                document = id2document[item['id']]
                tokens = document.split()
                tags = tag_document(keywords, tokens)
                assert len(tokens) == len(tags)
                all_tags.append(tags)
                all_tokens.append(tokens)
                print(len(keywords), len(tags), len(tokens),
                      len([t for t in tags if t[0] == 'B']))
    nerda_dataset = {'sentences': all_tokens, 'tags': all_tags}
    with open('nerda_dataset.json', 'w+') as f:
        json.dump(nerda_dataset, f)
    # create_tner_dataset(all_tags, all_tokens, output_file_addr='./sample_train.conll')
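
    # A minimal sketch (an assumption, not part of this script) of how the dumped
    # JSON could be reloaded and split 90/10 into training and validation sets:
    #   with open('nerda_dataset.json') as f:
    #       ds = json.load(f)
    #   cut = int(0.9 * len(ds['sentences']))
    #   training = {'sentences': ds['sentences'][:cut], 'tags': ds['tags'][:cut]}
    #   validation = {'sentences': ds['sentences'][cut:], 'tags': ds['tags'][cut:]}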