{ "cells": [ { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from scipy.sparse import csr_matrix\n", "import sklearn\n", "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import roc_auc_score\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, HashingVectorizer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Работа с категориальными признаками" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Все эксперименты в этой лабораторной работе предлагается проводить на данных соревнования Amazon Employee Access Challenge: https://www.kaggle.com/c/amazon-employee-access-challenge\n", "\n", "В данной задаче предлагается предсказать, будет ли одобрен запрос сотрудника на получение доступа к тому или иному ресурсу. Все признаки являются категориальными.\n", "\n", "Данные доступны по ссылке https://www.dropbox.com/s/q6fbs1vvhd5kvek/amazon.csv\n", "\n", "Для оценки качества используется ROC-AUC" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | ACTION | \n", "RESOURCE | \n", "MGR_ID | \n", "ROLE_ROLLUP_1 | \n", "ROLE_ROLLUP_2 | \n", "ROLE_DEPTNAME | \n", "ROLE_TITLE | \n", "ROLE_FAMILY_DESC | \n", "ROLE_FAMILY | \n", "ROLE_CODE | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "39353 | \n", "85475 | \n", "117961 | \n", "118300 | \n", "123472 | \n", "117905 | \n", "117906 | \n", "290919 | \n", "117908 | \n", "
1 | \n", "1 | \n", "17183 | \n", "1540 | \n", "117961 | \n", "118343 | \n", "123125 | \n", "118536 | \n", "118536 | \n", "308574 | \n", "118539 | \n", "
2 | \n", "1 | \n", "36724 | \n", "14457 | \n", "118219 | \n", "118220 | \n", "117884 | \n", "117879 | \n", "267952 | \n", "19721 | \n", "117880 | \n", "
3 | \n", "1 | \n", "36135 | \n", "5396 | \n", "117961 | \n", "118343 | \n", "119993 | \n", "118321 | \n", "240983 | \n", "290919 | \n", "118322 | \n", "
4 | \n", "1 | \n", "42680 | \n", "5905 | \n", "117929 | \n", "117930 | \n", "119569 | \n", "119323 | \n", "123932 | \n", "19793 | \n", "119325 | \n", "