In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import sklearn
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [2]:
# Read the training feature table and the test feature table from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/features.csv', 'data/features_test.csv')
)

In [3]:
# Keep 90% of the rows for training and hold out the remaining 10% for validation.
# A fixed random_state makes the split, and every score computed from it below,
# reproducible across kernel restarts.
# NOTE: this rebinds `train`, so re-running this cell on its own splits the already
# reduced frame again; re-run from the data-loading cell instead.
train, validation = train_test_split(train, train_size=0.9, random_state=42)

In [4]:
# Preview the first five rows of the training split to sanity-check the loaded columns.
train.head()

Unnamed: 0,match_id,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
16710,19688,1437741248,1,39,5,2160,2091,20,2,1,...,3,3,0,-33.0,3243,1,1920,1536,48,0
29564,34758,1442612264,7,67,3,1067,1071,12,0,1,...,2,3,1,14.0,2679,0,0,1972,0,63
41771,49130,1444844127,1,46,4,1951,1928,13,2,0,...,3,3,1,10.0,1990,0,1926,1974,51,63
50162,58986,1446664551,1,46,4,1768,1299,17,0,0,...,3,2,1,-24.0,2044,1,2036,1542,63,35
43810,51522,1445267298,1,10,3,1083,1741,24,1,0,...,3,2,0,-37.0,1927,1,1983,0,63,0


In [5]:
# Rank every column by its correlation with the target, strongest positive first.
# NOTE(review): the top correlates in the recorded output (barracks_status_*,
# tower_status_*) look like end-of-match fields - presumably not available at
# prediction time; confirm before using them as features.
(
    train
    .corrwith(train['radiant_win'])
    .sort_values(ascending=False)
)

radiant_win                1.000000
barracks_status_radiant    0.747533
tower_status_radiant       0.729062
radiant_boots_count        0.119573
first_blood_player2        0.114069
r2_gold                    0.099887
r5_gold                    0.094885
r1_gold                    0.094100
r4_gold                    0.092888
dire_tpscroll_count        0.088654
r3_gold                    0.088604
d1_deaths                  0.085848
d5_deaths                  0.085722
r1_kills                   0.083411
d2_deaths                  0.080585
d3_deaths                  0.080545
d4_deaths                  0.080394
r2_kills                   0.077740
r3_kills                   0.076147
r4_kills                   0.074169
r5_kills                   0.073717
dire_bottle_time           0.065992
r2_lh                      0.064364
r2_xp                      0.061239
r5_xp                      0.061105
r5_lh                      0.059644
r4_xp                      0.059320
r4_lh                      0

## Пробуем идеи и смотрим на качество на train

### Идея 1: если команда сделала first_blood (первая убила противника), она выиграет матч

In [6]:
def predict_first_blood(df):
    """Add a binary ``first_blood_prediction`` column to ``df`` in place.

    The prediction is 1 where ``first_blood_team`` equals 0 and 0 everywhere
    else; missing values compare unequal to 0 and therefore also yield 0.
    (0 is presumably the Radiant team id - confirm against the dataset docs.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``first_blood_team`` column. It is modified in place.

    Returns
    -------
    None
    """
    radiant_drew_first_blood = df['first_blood_team'].eq(0.0)
    df['first_blood_prediction'] = radiant_drew_first_blood.astype(int)

predict_first_blood(train)
# Share of training rows where the first-blood rule agrees with the actual outcome.
first_blood_hits = train['first_blood_prediction'].eq(train['radiant_win'])
print('Training accuracy = %0.4f' % first_blood_hits.mean())

Training accuracy = 0.5534


#### Но важнее AUC

In [7]:
# ROC AUC of the binary first-blood flag against the true outcome on the training split.
first_blood_train_auc = roc_auc_score(
    train['radiant_win'],
    train['first_blood_prediction'],
)
print('Training ROC AUC score = %0.4f' % first_blood_train_auc)

Training ROC AUC score = 0.5578


### Идея 2: если команда купила много ботинок, она вероятнее выиграет матч

In [8]:
def predict_boots(df):
    """Add a ``boots_prediction`` column to ``df`` in place.

    The score is simply ``radiant_boots_count`` cast to int, so it is an
    unbounded count rather than a probability; it is only meaningful for
    rank-based metrics such as ROC AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``radiant_boots_count`` column. It is modified in place.

    Returns
    -------
    None
    """
    boots_bought = df['radiant_boots_count']
    df['boots_prediction'] = boots_bought.astype(int)

predict_boots(train)
# ROC AUC of the raw boots count against the true outcome on the training split.
boots_train_auc = roc_auc_score(
    train['radiant_win'],
    train['boots_prediction'],
)
print('Training ROC AUC score = %0.4f' % boots_train_auc)

Training ROC AUC score = 0.5662


#### Идея с ботинками оказалась лучше! Проверим качество на валидации

In [9]:
# Score both heuristics on the held-out split. Each line is labelled with the
# heuristic it belongs to; with two identical labels the output was ambiguous.
predict_boots(validation)
predict_first_blood(validation)
for label, column in (('first blood', 'first_blood_prediction'),
                      ('boots', 'boots_prediction')):
    auc = roc_auc_score(validation['radiant_win'], validation[column])
    print('Validation ROC AUC score (%s) = %0.4f' % (label, auc))

Validation ROC AUC score = 0.5644
Validation ROC AUC score = 0.5640


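#### На валидации обе идеи дали практически одинаковый AUC (0.5644 у first blood против 0.5640 у ботинок), так что преимущество ботинок не подтвердилось. Засылаем вариант с ботинками!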

In [10]:
# Apply the boots heuristic to the test frame and export it as the submission file.
# NOTE(review): `radiant_win` holds the raw boots count, not a probability - fine
# for a rank-based metric, but confirm the expected submission format.
predict_boots(test)
test['radiant_win'] = test['boots_prediction']
submission_columns = ['match_id', 'radiant_win']
test.loc[:, submission_columns].to_csv('prediction.csv', index=False)