File size: 7,658 Bytes
654e9ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import pandas as pd
import numpy as np
import math
import time

def main(folderlocation="D:/PycharmProjects/TerraBot/terra-mystica"):
    """This is the script version of creatingVPdata.ipynb for the dvc pipeline.

    Loads the game CSV files from ``folderlocation``, defines the helpers that
    turn the data of one game into a target (victory point) row and a feature
    row, and filters out games whose ``player_count`` is 1, 6 or 7.

    Args:
        folderlocation: Directory containing ``game_events.csv``,
            ``games.csv``, ``game_scoring_tiles.csv``, ``game_options.csv``
            and ``stats.csv``. Defaults to the previously hard-coded path.

    Returns:
        None. The row counts before/after filtering are printed.
    """
    # load data in
    gameevents = pd.read_csv(f'{folderlocation}/game_events.csv')
    games = pd.read_csv(f'{folderlocation}/games.csv')
    # NOTE(review): gameslist, allfactions, gameoptions and stats are loaded or
    # derived here but are not consumed anywhere else in this function yet.
    gameslist = list(pd.unique(gameevents['game']))
    allfactions = pd.unique(gameevents['faction'])
    gamescoringtiles = pd.read_csv(f'{folderlocation}/game_scoring_tiles.csv')
    gameoptions = pd.read_csv(f'{folderlocation}/game_options.csv')
    stats = pd.read_csv(f'{folderlocation}/stats.csv')

    # base_map hash -> one-hot column name of the map used
    mapdict = {'126fe960806d587c78546b30f1a90853b1ada468': 'map1',
               '95a66999127893f5925a5f591d54f8bcb9a670e6': 'map2',
               'be8f6ebf549404d015547152d5f2a1906ae8dd90': 'map3'
               }

    # NOTE(review): the row-building helpers below (get_vp_from_game,
    # get_features_from_game) are defined but not called in this function.

    # two vp dataset functions
    def makenewdf():
        """Make an empty dataframe laid out like the target data, ready to be populated.

        Returns:
            (vpdf, dfcols, validfactions): the empty frame, its column names
            ('game' followed by one column per faction) and the faction names.
        """
        validfactions = ['witches', 'auren', 'swarmlings', 'mermaids', 'cultists', 'halflings', 'dwarves', 'engineers', 'chaosmagicians', 'giants', 'fakirs', 'nomads', 'darklings', 'alchemists']
        dfcols = ['game'] + validfactions
        vpdf = pd.DataFrame(columns=dfcols)

        return vpdf, dfcols, validfactions

    vpdf, dfcols, validfactions = makenewdf()

    def get_vp_from_game(singleGameEvents):
        """Build the target row for one game.

        Args:
            singleGameEvents: pd.DataFrame with the game events of a single
                game (columns 'game', 'faction', 'event', 'num' are read).

        Returns:
            A one-row pd.DataFrame with columns ``dfcols`` where each valid
            faction found in the game has its summed vp populated (the rest
            are nans).

        Raises:
            IndexError: if the table holds no game id (e.g. it is empty).
        """
        # assign the game number; only the first id is used, the table is
        # expected to hold a single game
        gameno = list(pd.unique(singleGameEvents['game']))
        try:
            game = gameno[0]
        except IndexError:
            print(f'DEBUGGING: len of table is {len(singleGameEvents)}')
            print(f'DEBUGGING: gamnos list: {gameno}')
            print(singleGameEvents)
            raise

        # Collect the values in a plain dict and build the frame once. Writing
        # through `df[col].replace(..., inplace=True)` is chained assignment
        # and does not update the frame under pandas Copy-on-Write.
        row = dict.fromkeys(dfcols, np.nan)
        row['game'] = game

        is_vp = singleGameEvents['event'] == 'vp'
        for faction in pd.unique(singleGameEvents['faction']):
            # there are some artifacts in the data, e.g. the "faction" "all";
            # filter them out
            if faction not in validfactions:
                continue
            in_faction = singleGameEvents['faction'] == faction
            # skipna=False keeps a missing 'num' visible as NaN, exactly like
            # the built-in sum() did; float() keeps the column dtype float
            # like the NaN placeholders of the other factions.
            row[faction] = float(singleGameEvents.loc[is_vp & in_faction, 'num'].sum(skipna=False))

        return pd.DataFrame([row], columns=dfcols)

    # two features dataset functions
    def emptyfeaturesdf():
        """Make an empty dataframe laid out like the feature data, ready to be populated.

        Returns:
            (featuresdf, colnames): the empty frame and its column names.
        """
        colnames = ['game']
        uniqueScoreTiles = np.sort(pd.unique(gamescoringtiles['tile']))

        # One-hot of round tiles, for each round
        for gameround in range(1, 7):
            colnames.extend(f'r{gameround}_' + tile for tile in uniqueScoreTiles)

        # Boolean of bonus tiles
        colnames.extend(f'BON{bon}' for bon in range(1, 11))

        # One-hot player count (from 2, 3, 4 or 5 players)
        colnames.extend(f'{player}players' for player in range(2, 6))

        # one hot of the map used (map1, map2, map3 - see mapdict)
        colnames.extend(mapdict.values())

        featuresdf = pd.DataFrame(columns=colnames)

        return featuresdf, colnames

    featuresdf, featcolnames = emptyfeaturesdf()

    def get_features_from_game(singlegameevents, singlegamemeta, singlegameST, singleendplayers=None):
        """Build the feature row for one game.

        Inputs:
            singlegameevents <pd.DataFrame>   - is game events for a single game
            singlegamemeta   <pd.DataFrame>   - is a single row from `games` that gives map & player count
            singlegameST     <pd.DataFrame>   - is the rows from `gamescoringtiles` of the game that give the score tile per round
            singleendplayers <pd.DataFrame>   - is a single row from `end players` that gives the amount of players at end of game, after dropouts
        Return:              <pd.DataFrame>   - a row where features have been found (will be sparse)
        Raises:
            KeyError   - if a found feature has no column in `featcolnames`, or the base map is unknown
            IndexError - if a round (1-6) has no scoring tile in `singlegameST`
        """
        # Values are collected in a dict and the frame is built once; see
        # get_vp_from_game for why chained in-place replace is avoided.
        row = dict.fromkeys(featcolnames, 0)

        def setflag(column):
            # fail loudly on a feature without a column, like df[column] would
            if column not in row:
                raise KeyError(column)
            row[column] = 1

        meta = singlegamemeta.iloc[0]

        # assign game string
        row['game'] = meta['game']

        # find the round tiles for each round
        for gameround in range(1, 7):
            roundtile = singlegameST[singlegameST['round'] == gameround]['tile'].values[0]
            setflag(f'r{gameround}_' + roundtile)

        # Boolean of bonus tiles: events look like 'pass:BON<n>', the column
        # name is the part after 'pass:'. Non-string (missing) events are skipped.
        prefixlen = len('pass:')
        for event in pd.unique(singlegameevents['event']):
            if isinstance(event, str) and event.startswith('pass:BON'):
                setflag(event[prefixlen:])

        # One-hot player count (from 2, 3, 4 or 5 players)
        if singleendplayers is None:
            noplayers = meta['player_count']
            print('gamemeta used for player count')
        else:
            noplayers = singleendplayers.iloc[0]['endplayers']
        setflag(f'{noplayers}players')

        # one hot of the map used
        setflag(mapdict[meta['base_map']])

        return pd.DataFrame([row], columns=featcolnames)

    # filtering
    # making a dataset for ease
    data = {
        'gameevents': gameevents,
        'games': games,
        'gamescoringtiles': gamescoringtiles,
    }

    def filteringByBadgames(data, badgames):
        """Drop every row that belongs to a bad game from the three tables.

        Args:
            data: dict containing 'gameevents', 'games', 'gamescoringtiles'.
            badgames: pd.DataFrame that contains ['game'] to filter by.

        Returns:
            A new dict with the same keys as ``data`` where the three tables
            are filtered; the input dict is left untouched.
        """
        badgameslist = badgames['game']

        filtered = dict(data)
        for key in ('gameevents', 'games', 'gamescoringtiles'):
            table = data[key]
            filtered[key] = table[~table['game'].isin(badgameslist)]

        eventsbefore, eventsafter = len(data['gameevents']), len(filtered['gameevents'])
        gamesbefore, gamesafter = len(data['games']), len(filtered['games'])
        stbefore, stafter = len(data['gamescoringtiles']), len(filtered['gamescoringtiles'])

        print(f'game events before: {eventsbefore}, game events after: {eventsafter}, game events removed: {eventsbefore-eventsafter}')
        print(f'games before: {gamesbefore}, games after: {gamesafter}, games removed: {gamesbefore-gamesafter}')
        print(f'gameST before: {stbefore}, gameST after: {stafter}, games removed: {stbefore-stafter}')

        return filtered

    # player count
    badgames = games[games["player_count"].isin([1, 6, 7])]
    data = filteringByBadgames(data, badgames)


# Run the pipeline step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()