fukatani committed
Commit fce29f6
1 Parent(s): c183948
app.py CHANGED
@@ -1,5 +1,26 @@
  import streamlit as st
 
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
+ from japanese.embedding import encode_sentences, get_cadidate_embeddings
+ from japanese.tokenizer import extract_keyphrase_candidates
+ from japanese.ranker import DirectedCentralityRnak
+
+ from transformers import AutoTokenizer
+ from transformers import AutoModel
+
+
+ if __name__ == '__main__':
+     # load model
+     model = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese')
+     tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
+
+     text = "紀元前509年、第7代の王タルクィニウス・スペルブスを追放し共和制を敷いたローマだが、問題は山積していた。まず、王に代わった執政官(コンスル)が元老院の意向で決められるようになったこと、またその被選挙権が40歳以上に限定されていたことから、若い市民を中心としてタルクィニウスを王位に復する王政復古の企みが起こった。これは失敗して、初代執政官ルキウス・ユニウス・ブルトゥスは、彼自身の息子ティトゥスを含む陰謀への参加者を処刑した。ラテン同盟諸都市やエトルリア諸都市との同盟は、これらの都市とローマ王との同盟という形であったため、王の追放で当然に同盟は解消され、対立関係となった。"
+     tokens, keyphrases = extract_keyphrase_candidates(text, tokenizer)
+
+     document_embs = encode_sentences([tokens], tokenizer, model)
+     document_feats = get_cadidate_embeddings([keyphrases], document_embs, [tokens])
+     ranker = DirectedCentralityRnak(document_feats, beta=0.1, lambda1=1, lambda2=0.9, alpha=1.2, processors=8)
+     phrases = ranker.extract_summary()
+     x = st.slider('Select a value')
+     st.write(x, 'squared is', x * x)
+     phrases
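For orientation, a minimal sketch of how the value computed above could be consumed, assuming (as the ranker code below suggests) that ranker.extract_summary() returns one list of keyphrase strings per input document, ordered best-first; the top_n cutoff is an arbitrary illustration and the snippet reuses the names from app.py above, so it is not part of this commit:

    # Illustrative only; `phrases` and `st` come from app.py above.
    top_n = 10                       # arbitrary display cutoff (assumption)
    for doc_phrases in phrases:      # app.py passes a single document, so one iteration
        st.write(doc_phrases[:top_n])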
japanese/__init__.py ADDED
File without changes
japanese/embedding.py ADDED
@@ -0,0 +1,100 @@
+ import os
+ import re
+ import string
+ import pickle
+
+ import torch
+ import numpy as np
+
+ from transformers import BertTokenizer, AutoTokenizer, BertModel, AutoModel
+
+
+ def encode_sentence(tokenizer, model, tokens):
+     # Sub-tokenize each pre-tokenized word, tracking how many word pieces it produced,
+     # and cap the sequence below BERT's 512-token limit.
+     is_split = []
+     input_tokens = ['[CLS]']
+     for token in tokens:
+         tmp = tokenizer.tokenize(token)
+
+         if len(input_tokens) + len(tmp) >= 511:
+             break
+         else:
+             input_tokens.extend(tmp)
+             is_split.append(len(tmp))
+     input_tokens += ["[SEP]"]
+     input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
+
+     input_ids = torch.LongTensor([input_ids])
+     outputs = model(input_ids, output_hidden_states=True).last_hidden_state.detach().numpy()
+     bertcls = outputs[0, 0, :]
+     o1 = outputs[0, :, :]
+     cls_token = o1[0]
+
+     # Collapse word-piece vectors back to one vector per original token (mean over pieces).
+     tokens_emb = []
+     i = 1
+     for j in is_split:
+         if j == 1:
+             tokens_emb.append(o1[i])
+             i += 1
+         else:
+             tokens_emb.append(sum(o1[i:i+j]) / j)
+             # tokens_emb.append(np.max(np.array(o1[i: i+j]), axis=0))
+             i += j
+         # if i >= len(is_split):
+         #     break
+     assert len(tokens_emb) == len(is_split)
+     return tokens_emb, bertcls, cls_token
+
+
+ def flat_list(l):
+     return [x for ll in l for x in ll]
+
+
+ def encode_sentences(token_list, tokenizer, model):
+     tokenizer.do_word_tokenize = False
+
+     document_embeddings = []
+     cnt = 0
+     for tokens in token_list:
+         tokens_emb, bertcls, cls_token = encode_sentence(tokenizer, model, tokens)
+
+         document_embeddings.append({
+             'document_id': cnt,
+             'doc_cls': cls_token,
+             'doc_bertcls': bertcls,
+             'tokens': tokens_emb
+         })
+         cnt += 1
+
+     return document_embeddings
+
+
+ def get_cadidate_embeddings(token_list, document_embeddings, tokens):
+     document_feats = []
+     cnt = 0
+     for candidate_phrase, document_emb, each_tokens in zip(token_list, document_embeddings, tokens):
+         sentence_emb = document_emb['tokens']
+
+         tmp_embeddings = []
+         tmp_candidate_phrase = []
+
+         # Keep phrases whose token span fits inside the encoded sequence and
+         # max-pool the token vectors in that span into one phrase embedding.
+         for tmp, (i, j) in candidate_phrase:
+             if j <= i:
+                 continue
+             if j >= len(sentence_emb):
+                 break
+             # tmp_embeddings.append(sum(sentence_emb[i:j]) / (j-i))
+             tmp_embeddings.append(np.max(np.array(sentence_emb[i:j]), axis=0))
+             tmp_candidate_phrase.append(tmp)
+
+         candidate_phrases_embeddings = tmp_embeddings
+         candidate_phrases = tmp_candidate_phrase
+
+         document_feats.append({
+             'document_id': cnt,
+             'tokens': each_tokens,
+             'candidate_phrases': candidate_phrases,
+             'candidate_phrases_embeddings': candidate_phrases_embeddings,
+             # 'sentence_embeddings': document_emb['doc_bertcls'],
+             'sentence_embeddings': document_emb['doc_cls'],
+         })
+         cnt += 1
+     return document_feats
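As a reading aid, here is a rough sketch of the structures this module produces for the single-document call in app.py; the shapes assume cl-tohoku/bert-base-japanese (hidden size 768), and the variable names are taken from app.py:

    # tokens:     list[str] from extract_keyphrase_candidates
    # keyphrases: list of (phrase, (start, end)) token spans into `tokens`
    document_embs = encode_sentences([tokens], tokenizer, model)
    # -> [{'document_id': 0,
    #      'doc_cls':     (768,)-vector for [CLS],
    #      'doc_bertcls': (768,)-vector for [CLS],
    #      'tokens':      one (768,)-vector per original token (word pieces averaged)}]

    document_feats = get_cadidate_embeddings([keyphrases], document_embs, [tokens])
    # -> [{'document_id': 0,
    #      'tokens': tokens,
    #      'candidate_phrases': the phrases whose spans fit in the encoded sequence,
    #      'candidate_phrases_embeddings': one (768,)-vector per kept phrase
    #                                      (max-pooled over the span's token vectors),
    #      'sentence_embeddings': the document's [CLS] vector}]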
japanese/ranker.py ADDED
@@ -0,0 +1,147 @@
+ import os
+ import pickle
+ import re
+ import string
+ import sys
+ from multiprocessing import Pool
+
+ import numpy as np
+ from numpy.linalg import norm
+
+
+ class DirectedCentralityRnak(object):
+     def __init__(self,
+                  document_feats,
+                  extract_num=20,
+                  beta=0.2,
+                  lambda1=1,
+                  lambda2=0.8,
+                  alpha=1,
+                  processors=8):
+         self.extract_num = extract_num
+         self.processors = processors
+         self.beta = beta
+         self.lambda1 = lambda1
+         self.lambda2 = lambda2
+         self.alpha = alpha
+
+         self.candidate_phrases = [x['candidate_phrases'] for x in document_feats]
+         self.doc_embeddings = [x['sentence_embeddings'] for x in document_feats]
+         self.tokens_embeddings = [x['candidate_phrases_embeddings'] for x in document_feats]
+
+     def flat_list(self, l):
+         return [x for ll in l for x in ll]
+
+     def extract_summary(self):
+         paired_scores = self.rank()
+
+         rank_list_phrases = []
+         for candidate, paired_score in zip(self.candidate_phrases, paired_scores):
+             candidates = []
+             for i in range(len(candidate)):
+                 phrase = candidate[i]
+                 candidates.append([phrase, paired_score[i][0], paired_score[i][1]])
+             rank_list_phrases.append(candidates)
+
+         # Weight each candidate by a softmax over reciprocal position, then sort by the
+         # position-weighted centrality score (highest first).
+         predicted_candidation = []
+         for i in range(len(rank_list_phrases)):
+             final_score = []
+             position_weight = 1 / (np.array(list(range(1, len(rank_list_phrases[i]) + 1))))
+             position_weight = np.exp(position_weight) / np.sum(np.exp(position_weight))
+             cnt = 0
+             for candidate, index, score in rank_list_phrases[i]:
+                 final_score.append([candidate, score * position_weight[cnt]])
+                 cnt += 1
+             final_score.sort(key=lambda x: x[1], reverse=True)
+             candidates = [x[0].strip() for x in final_score]
+             predicted_candidation.append(candidates)
+         return predicted_candidation
+
+     def pairdown(self, scores, pair_indice, length):
+         out_matrix = np.ones((length, length))
+         for pair in pair_indice:
+             out_matrix[pair[0][0]][pair[0][1]] = scores[pair[1]]
+             out_matrix[pair[0][1]][pair[0][0]] = scores[pair[1]]
+
+         return out_matrix
+
+     def get_similarity_matrix(self, sentence_embeddings):
+         # Pairwise dot products between phrase embeddings, arranged as a symmetric matrix.
+         pairs = []
+         scores = []
+         cnt = 0
+         for i in range(len(sentence_embeddings) - 1):
+             for j in range(i, len(sentence_embeddings)):
+                 if type(sentence_embeddings[i]) == float or type(sentence_embeddings[j]) == float:
+                     scores.append(0)
+                 else:
+                     scores.append(np.dot(sentence_embeddings[i], sentence_embeddings[j]))
+
+                 pairs.append(([i, j], cnt))
+                 cnt += 1
+         return self.pairdown(scores, pairs, len(sentence_embeddings))
+
+     def compute_scores(self, similarity_matrix, edge_threshold=0):
+         forward_scores = [1e-10 for i in range(len(similarity_matrix))]
+         backward_scores = [1e-10 for i in range(len(similarity_matrix))]
+         edges = []
+         n = len(similarity_matrix)
+         alpha = self.alpha
+         for i in range(len(similarity_matrix)):
+             for j in range(i + 1, len(similarity_matrix[i])):
+                 edge_score = similarity_matrix[i][j]
+                 # boundary_position_function
+                 db_i = min(i, alpha * (n - i))
+                 db_j = min(j, alpha * (n - j))
+                 if edge_score > edge_threshold:
+                     if db_i < db_j:
+                         forward_scores[i] += edge_score
+                         backward_scores[j] += edge_score
+                         edges.append((i, j, edge_score))
+                     else:
+                         forward_scores[j] += edge_score
+                         backward_scores[i] += edge_score
+                         edges.append((j, i, edge_score))
+
+         return np.asarray(forward_scores), np.asarray(backward_scores), edges
+
+     def _rank_part(self, similarity_matrix, doc_vector, candidate_phrases_embeddings):
+         # Shift the similarity matrix so only edges above a relative threshold (set by beta)
+         # stay positive, then score each node by directed centrality weighted by the
+         # inverse L1 distance between its embedding and the document vector.
+         min_score = np.min(similarity_matrix)
+         max_score = np.max(similarity_matrix)
+         threshold = min_score + self.beta * (max_score - min_score)
+         new_matrix = similarity_matrix - threshold
+         dist = []
+         for emb in candidate_phrases_embeddings:
+             if type(doc_vector) == float or type(emb) == float:
+                 dist.append(0)
+             else:
+                 dist.append(1 / np.sum(np.abs(emb - doc_vector)))
+
+         forward_score, backward_score, _ = self.compute_scores(new_matrix)
+
+         paired_scores = []
+         for node in range(len(forward_score)):
+             paired_scores.append([node, (self.lambda1 * forward_score[node] + self.lambda2 * backward_score[node]) * (dist[node])])
+
+         return paired_scores
+
+     def rank(self):
+         similarity_matrix = []
+         extracted_list = []
+         for embedded in self.tokens_embeddings:
+             similarity_matrix.append(self.get_similarity_matrix(embedded))
+         for matrix, doc_vector, candidate_phrases_embeddings in zip(similarity_matrix, self.doc_embeddings, self.tokens_embeddings):
+             extracted_list.append(self._rank_part(matrix, doc_vector, candidate_phrases_embeddings))
+         return extracted_list
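To make the directed scoring concrete, here is a toy run of compute_scores on a hand-written 3x3 similarity matrix (the numbers are invented purely for illustration). Each edge above the threshold adds its weight to the forward score of the endpoint closer to a document boundary, per min(i, alpha * (n - i)), and to the backward score of the other endpoint:

    import numpy as np
    from japanese.ranker import DirectedCentralityRnak

    sim = np.array([[1.0, 0.8, 0.2],
                    [0.8, 1.0, 0.5],
                    [0.2, 0.5, 1.0]])   # toy phrase-similarity matrix

    ranker = DirectedCentralityRnak([], alpha=1)   # no document_feats needed for this call
    fwd, bwd, edges = ranker.compute_scores(sim, edge_threshold=0)
    # edges -> [(0, 1, 0.8), (0, 2, 0.2), (2, 1, 0.5)]
    #   (0,1) and (0,2) run from phrase 0, which sits on the document boundary;
    #   the boundary tie between phrases 1 and 2 falls to the else branch, so that edge runs 2 -> 1.
    # fwd   -> approx. [1.0, 0.0, 0.5]
    # bwd   -> approx. [0.0, 1.3, 0.2]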
japanese/tokenizer.py ADDED
@@ -0,0 +1,61 @@
+ import MeCab
+ import os
+
+
+ def extract_keyphrase_candidates(text, tokenizer):
+     tagger = MeCab.Tagger()
+     tagger.parse("")
+
+     # (surface, coarse POS) pairs from MeCab, e.g. 名詞 = noun, 形容詞 = adjective
+     t = [to.split('\t') for to in tagger.parse(text).split('\n') if to]
+     t = [(to[0], to[1].split(',')[0]) for to in t if len(to) > 1]
+
+     keyphrase_candidates = []
+     phrase = []
+
+     tokens = []
+     idx = len(t) - 1
+     start_pos = -1
+     end_pos = -1
+     cnt = 0
+     phrase_set = set()
+
+     # Walk the tokens from the end of the text towards the start, collecting runs of
+     # consecutive nouns (optionally preceded by adjectives) as keyphrase candidates.
+     while idx >= 0:
+         while idx >= 0 and t[idx][1] != '名詞':   # skip until a noun
+             tokens.append(t[idx][0])
+             idx -= 1
+
+         if idx >= 0 and t[idx][1] == '名詞':
+             tokens.append(t[idx][0])
+             end_pos = len(tokens)
+             phrase.append(t[idx][0])
+             idx -= 1
+
+             while idx >= 0 and t[idx][1] == '名詞':   # extend over the noun run
+                 tokens.append(t[idx][0])
+                 phrase.append(t[idx][0])
+                 idx -= 1
+
+             while idx >= 0 and t[idx][1] == '形容詞':  # include preceding adjectives
+                 tokens.append(t[idx][0])
+                 phrase.append(t[idx][0])
+                 idx -= 1
+
+             if len(phrase) > 1:
+                 start_pos = len(tokens)
+                 keyphrase_candidates.append(('_'.join(phrase[::-1]), (len(t) - start_pos, len(t) - end_pos)))
+
+             phrase = []
+             start_pos = -1
+             end_pos = -1
+
+     while idx >= 0:
+         tokens.extend(tokenizer.tokenize(t[idx][0])[::-1])
+         idx -= 1
+
+     # Drop duplicate phrases, restoring original (reading) order.
+     outputs = []
+     for keyphrase in keyphrase_candidates[::-1]:
+         if keyphrase[0] not in phrase_set:
+             outputs.append(keyphrase)
+             phrase_set.add(keyphrase[0])
+     return tokens[::-1], outputs
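Finally, a short sketch of the contract the rest of the commit relies on from this function; the sample text is arbitrary and the exact segmentation is hypothetical, since the output depends on the installed MeCab dictionary:

    from transformers import AutoTokenizer
    from japanese.tokenizer import extract_keyphrase_candidates

    tokenizer = AutoTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
    tokens, keyphrases = extract_keyphrase_candidates('共和制を敷いたローマ', tokenizer)
    # tokens     -> MeCab surface tokens of the text, in reading order
    # keyphrases -> [(phrase, (start, end)), ...] where `phrase` is a run of consecutive
    #               nouns, optionally preceded by adjectives, joined with '_' and kept only
    #               when it spans at least two tokens; duplicates are dropped, and
    #               get_cadidate_embeddings later slices token embeddings with (start, end).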