# Spaces: Sleeping  (page-status text captured together with the file; not part of the program)
import pandas as pd | |
# Square labels written as an ASCII digit (1-9) followed by a kanji numeral
# (一-九): all nine labels for digit 1 first, then digit 2, and so on.
# The list is index-aligned with KIFU_FROM_SQUARE_NAMES.
KIFU_TO_SQUARE_NAMES = [
    first_digit + kanji_numeral
    for first_digit in "123456789"
    for kanji_numeral in "一二三四五六七八九"
]
# Two-digit ASCII square codes "11" .. "99" (neither digit is ever 0), with
# the second digit varying fastest.  The list is index-aligned with
# KIFU_TO_SQUARE_NAMES.
KIFU_FROM_SQUARE_NAMES = [
    str(first_digit) + str(second_digit)
    for first_digit in range(1, 10)
    for second_digit in range(1, 10)
]
def nomalize_precedence_name(df):
    """Strip ranks and title names from the first player's name column.

    Every value of ``df["precedence_name"]`` is cleaned in this order:

    1. ASCII spaces and ideographic spaces (U+3000) are removed.
    2. If the name ends with "段", its last two characters are dropped
       (for example a trailing "九段").
    3. The lifetime-Meijin titles, the title names, the "・" separator and
       the "二冠" / "三冠" markers are removed wherever they occur.

    The column is rebuilt and assigned in one step.  Writing cell by cell
    through ``df[col].iloc[i] = ...`` is chained assignment: it emits
    ``SettingWithCopyWarning`` and does not modify ``df`` at all when pandas
    Copy-on-Write is active.

    Args:
        df: DataFrame with a ``precedence_name`` column of strings.  Missing
            values are left untouched.  The frame is modified in place.

    Returns:
        The same ``df`` object.
    """
    # Order matters: the "...世名人" titles must go before the bare "名人".
    removable = (
        "十七世名人", "十八世名人", "十九世名人",
        "王将", "王座", "名人", "竜王", "棋聖", "叡王", "王位", "棋王",
        "・", "二冠", "三冠",
    )

    def _clean(name):
        name = name.replace(" ", "").replace("\u3000", "")
        if name.endswith("段"):
            # Drop the rank suffix: one numeral character plus "段".
            name = name[:-2]
        for token in removable:
            name = name.replace(token, "")
        return name

    df["precedence_name"] = df["precedence_name"].map(_clean, na_action="ignore")
    return df
def nomalize_kif(df):
    """Collapse each game's move list into one cleaned string.

    Every cell of ``df["kif"]`` is the textual form of a list of move
    strings.  For each move:

    * leading ASCII digits (the move number) are stripped,
    * everything from the first "(" onwards is cut off (the original comment
      says this removes the time consumed),
    * ideographic spaces (U+3000) are removed.

    The cleaned moves are concatenated without a separator and written back
    to the ``kif`` column.

    Compared with the cell-by-cell version, the column is assigned in one
    step (chained ``df["kif"].iloc[i] = ...`` is unreliable and a no-op under
    pandas Copy-on-Write), and empty or digit-only entries become an empty
    string instead of raising ``IndexError``.

    Args:
        df: DataFrame with a ``kif`` column holding list literals as strings.
            Modified in place.

    Returns:
        The same ``df`` object.
    """

    def _clean_move(move):
        # Only ASCII digits are stripped, so full-width digits inside the
        # move notation itself are preserved.
        without_number = move.lstrip("0123456789")
        head = without_number.partition("(")[0]
        return head.replace("\u3000", "")

    def _normalize(raw):
        # NOTE(review): eval() executes arbitrary code.  It is kept because
        # the stored format is not visible here; if the cells are plain list
        # literals, ast.literal_eval is the safe replacement.
        moves = eval(raw)
        return "".join(_clean_move(move) for move in moves)

    df["kif"] = df["kif"].map(_normalize)
    return df
def nomalize_comment(df):
    """Drop sentences containing unwanted keywords from the ``output`` column.

    Each text is split on "。".  A sentence is discarded when it is empty or
    contains any of the marker substrings below; every remaining sentence is
    re-joined with a trailing "。" (so a final sentence that had no "。" in
    the input gains one).

    The column is assigned in one step instead of through chained
    ``df["output"].iloc[i] = ...`` writes, which are unreliable and a no-op
    under pandas Copy-on-Write.

    Args:
        df: DataFrame with an ``output`` column of strings.  Missing values
            are left untouched.  The frame is modified in place.

    Returns:
        The same ``df`` object.
    """
    drop_markers = (
        "期", "出身", "優勝", "受賞", "回", "記録", "棋士番号", "勝", "敗", "名人",
        "時", "分", "成績", "棋戦", "段", "本日", "立会", "ABEMA", "第", "本局",
        "対局", "永世",
    )

    def _filter(text):
        kept = [
            sentence + "。"
            for sentence in text.split("。")
            if sentence != ""
            and not any(marker in sentence for marker in drop_markers)
        ]
        return "".join(kept)

    df["output"] = df["output"].map(_filter, na_action="ignore")
    return df
def accuracy_bestlist(df):
    """Print and return how often the played move matches a listed candidate.

    For every row the ``bestlist``, ``best2list`` and ``kif`` cells are
    evaluated into Python objects.  For each index ``x >= 1`` of the kif
    list, the move counts as a hit when ``bestlist[x-1][0]`` or
    ``best2list[x-1][0]`` occurs inside ``kif[x]``.  A row's accuracy is
    ``hits / len(kif)``; the function prints it per row and prints the mean
    over all rows at the end.

    Changes from the print-only version:

    * an empty kif list yields accuracy 0.0 and an empty frame yields mean
      0.0, where both previously raised ``ZeroDivisionError``;
    * only the lookup and containment errors expected from short or malformed
      candidate lists are skipped, where previously every exception was
      swallowed;
    * the mean is returned as well as printed.

    Args:
        df: DataFrame with ``bestlist``, ``best2list`` and ``kif`` columns
            whose cells are Python literals stored as strings.

    Returns:
        The mean per-row accuracy as a float (0.0 for an empty frame).
    """
    total = 0.0
    games = 0
    rows = zip(df["bestlist"], df["best2list"], df["kif"])
    for z, (best_raw, second_raw, kif_raw) in enumerate(rows):
        # NOTE(review): eval() executes arbitrary code; if these cells are
        # plain literals, ast.literal_eval is the safe replacement.
        blist = eval(best_raw)
        b2list = eval(second_raw)
        te = eval(kif_raw)

        hits = 0
        for x in range(1, len(te)):
            try:
                # The candidate at position x-1 is compared with move x.
                if blist[x - 1][0] in te[x] or b2list[x - 1][0] in te[x]:
                    hits += 1
            except (IndexError, KeyError, TypeError):
                # Candidate list shorter than the game, or a malformed
                # entry: the move simply does not count as a hit.
                continue

        if hits == 0:
            print("accuracy = 0", z)
        accuracy = hits / len(te) if len(te) else 0.0
        print("z = ", z, " accuracy = ", accuracy)
        total += accuracy
        games += 1

    mean = total / games if games else 0.0
    print("mean_acuuracy", mean)
    return mean
def nomalize_sfen(s):
    """Extract the move list from a sequence of text lines.

    The first two entries of ``s`` are skipped, as are entries of 30 or more
    characters (presumably headers / long comment lines; verify against the
    input format).  For each remaining line the second
    whitespace-separated token is the move:

    * if it is one of the game-ending markers ("投了", "千日手", "持将棋",
      "反則勝ち") it is appended unchanged and processing stops;
    * otherwise its leading two-character square code is translated from
      ``KIFU_FROM_SQUARE_NAMES`` to the matching ``KIFU_TO_SQUARE_NAMES``
      entry, and promoted-piece names are rewritten (竜→龍, 成銀→全,
      成桂→圭, 成香→杏).

    Fixes relative to the scanning version:

    * a move whose first two characters are not a known square code (for
      example one starting with "同") is kept as written; previously the
      previous line's result was appended again, or ``NameError`` was raised
      on the first move;
    * lines with fewer than two tokens are skipped instead of raising
      ``IndexError``;
    * the square lookup uses a dict, not a scan over both tables.

    Args:
        s: Indexable sequence of line strings.

    Returns:
        List of normalised move strings, ending with the game-ending marker
        when one was found.
    """
    terminal_moves = ("投了", "千日手", "持将棋", "反則勝ち")
    square_names = dict(zip(KIFU_FROM_SQUARE_NAMES, KIFU_TO_SQUARE_NAMES))

    movelist = []
    for x in range(2, len(s)):
        line = s[x]
        if len(line) >= 30:
            continue
        tokens = line.split()
        if len(tokens) < 2:
            continue

        move = tokens[1]
        if move in terminal_moves:
            movelist.append(move)
            # Nothing after the game-ending marker is a move.
            break

        # Convert the half-width square code to its table notation.
        square = square_names.get(move[:2])
        word = move if square is None else square + move[2:]
        word = (
            word.replace("竜", "龍")
            .replace("成銀", "全")
            .replace("成桂", "圭")
            .replace("成香", "杏")
        )
        movelist.append(word)
    return movelist
def make_triplets(df, column):
    """Build (anchor, positive, negative) kif triplets grouped by ``column``.

    For every row (the anchor) one positive and one negative row are drawn
    at random:

    * positive: same ``column`` value as the anchor, different ``kif``;
    * negative: different ``column`` value and different ``kif``.

    Fixes relative to the earlier version:

    * the collected list is returned (it used to be built and discarded, so
      the function always returned ``None``);
    * anchors without any positive or negative candidate are skipped instead
      of letting ``sample`` raise ``ValueError``;
    * values are read from the sampled row itself rather than re-looked-up by
      index label, which could select a different row when labels repeat.

    Args:
        df: DataFrame with a ``kif`` column and the grouping ``column``.
        column: Name of the column that decides positive / negative.

    Returns:
        List of tuples ``(anchor_kif, positive_kif, negative_kif,
        positive_column_value, negative_column_value)``.
    """
    triplets = []
    for position in range(len(df)):
        anchor = df.iloc[position]
        different_kif = df["kif"] != anchor["kif"]
        positives = df.loc[(df[column] == anchor[column]) & different_kif]
        negatives = df.loc[(df[column] != anchor[column]) & different_kif]
        if positives.empty or negatives.empty:
            # No valid partner exists for this anchor.
            continue

        # Positive: shares the anchor's column value but is another kif.
        positive_row = positives.sample(n=1)
        # Negative: has a different column value from the anchor.
        negative_row = negatives.sample(n=1)

        triplets.append((
            anchor["kif"],
            positive_row["kif"].values[0],
            negative_row["kif"].values[0],
            positive_row[column].values[0],
            negative_row[column].values[0],
        ))
    return triplets
def add_symbol(df, column):
    """Prefix every move in ``column`` with the side-to-move marker.

    Each cell is iterated element by element (character by character for a
    string).  Whenever an element is an ASCII digit, "同", or an int 0-9, the
    current marker is inserted before it and the marker alternates between
    "▲" and "△".  All other elements are copied through unchanged.

    Fixes relative to the earlier version:

    * the marker restarts at "▲" for every row; previously it carried over,
      so a row with an odd number of moves made the next row start with "△";
    * the column is assigned in one step instead of through chained
      ``df[column].iloc[i] = ...`` writes, which are unreliable and a no-op
      under pandas Copy-on-Write;
    * int elements are converted to text before concatenation (they are
      accepted by the membership test but ``str + int`` raised ``TypeError``).

    Args:
        df: DataFrame to update in place.
        column: Name of the column holding the move text.

    Returns:
        The same ``df`` object.
    """
    move_starts = (
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "同",
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    )

    def _mark(cell):
        teban = "▲"
        pieces = []
        for element in cell:
            if element in move_starts:
                pieces.append(teban + str(element))
                teban = "△" if teban == "▲" else "▲"
            else:
                pieces.append(element)
        return "".join(pieces)

    df[column] = df[column].map(_mark)
    return df