
Feature Engineering: Code

Feature engineering on the New York apartment rental listing data.

# Python 2.7
import sys
sys.path.append('./data')  # add ./data to the module search path so that MeanEncoder.py can be imported
import MeanEncoder

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import sparse

from sklearn.preprocessing import LabelEncoder

from sklearn.cluster import KMeans
from nltk.metrics import distance as distance

from sklearn.model_selection import StratifiedKFold


Reading the data

dpath = './data/'
train = pd.read_json(dpath + 'RentListingInquries_train.json')
test = pd.read_json(dpath + 'RentListingInquries_test.json')
train.head()

[image: train.head() output]

Label: interest_level

Encode the categorical label interest_level as integers.

From the earlier analysis and common sense, listing_id is of no use for determining interest_level, so it is dropped. Feature encoding has to be applied to both the training set and the test set, so the two are simply concatenated and processed together.

y_map = {'low' : 2 , 'medium' : 1 , 'high' : 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])

#y_train = train.interest_level.values
y_train = train.interest_level
train = train.drop(['listing_id' , 'interest_level'] , axis = 1)

listing_id = test.listing_id.values
test = test.drop('listing_id' , axis = 1)

ntrain = train.shape[0]
# pd.concat is a pandas function that concatenates objects along a given axis; here train and test are stacked row-wise
train_test = pd.concat((train , test) , axis = 0).reset_index(drop = True)
y_train

out:

10        1
10000     2
100004    0
100007    2
100013    2
100014    1
100016    2
100020    2
100026    1
100027    2
100030    2
10004     2
100044    0
100048    2
10005     2
100051    1
100052    2
100053    2
100055    2
100058    2
100062    2
100063    1
100065    2
100066    2
10007     1
100071    2
100075    1
100076    2
100079    0
100081    2
         ..
99915     2
99917     2
99919     1
99921     1
99923     2
99924     2
99931     2
99933     2
99935     2
99937     2
9994      2
99953     2
99956     2
99960     1
99961     2
99964     1
99965     2
99966     2
99979     2
99980     2
99982     0
99984     2
99986     2
99987     2
99988     1
9999      1
99991     2
99992     2
99993     2
99994     2
Name: interest_level, dtype: int64

price, bathrooms, bedrooms

These are numeric features; new features can be built from them with + / - / * / /. Monotonic transformations of a single feature (e.g. taking the log) are unnecessary for XGBoost, because tree splits only compare values against thresholds and are invariant to such transforms.
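To see the invariance claim concretely, here is a minimal sketch with made-up toy data (not from this notebook): a decision tree trained on a feature and one trained on its log produce identical predictions.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

# toy data: one numeric feature, binary label
x = np.array([[100.], [200.], [400.], [800.], [1600.], [3200.]])
y = np.array([0, 0, 0, 1, 1, 1])

t1 = DecisionTreeClassifier(random_state=0).fit(x, y)
t2 = DecisionTreeClassifier(random_state=0).fit(np.log(x), y)

# a tree only compares feature values against thresholds, so a monotonic
# transform changes the thresholds but not the induced partition
print((t1.predict(x) == t2.predict(np.log(x))).all())  # True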

# remove some noise: cap extreme prices
#ulimit = np.percentile(train_test.price.values , 99)
train_test.loc[train_test['price'] > 13000, 'price'] = 13000

# remove some noise
# df.loc[row_indexer, column_indexer] selects the given rows/columns for assignment,
# e.g. df.loc['a':'b'] selects rows a..b and df.loc[:, 'one'] selects column 'one'
# replace the obviously wrong bathroom counts 112, 10 and 20 with 1.5, 1 and 2
train_test.loc[train_test["bathrooms"] == 112, "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10, "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20, "bathrooms"] = 2

# new features
# price_bathrooms: price per bathroom
# price_bedrooms: price per bedroom
train_test['price_bathrooms'] = train_test["price"] / (train_test["bathrooms"] + 1.0)
train_test['price_bedrooms'] = train_test["price"] / (train_test["bedrooms"] + 1.0)

# new features
# room_diff: number of bathrooms - number of bedrooms
# room_num: number of bathrooms + number of bedrooms
train_test["room_diff"] = train_test["bathrooms"] - train_test["bedrooms"]
train_test["room_num"] = train_test["bedrooms"] + train_test["bathrooms"]
print train_test["bathrooms"]

out:

0         1.5
1         1.0
2         1.0
3         1.0
4         1.0
5         2.0
6         1.0
7         2.0
8         1.0
9         2.0
10        1.0
11        1.0
12        1.0
13        2.0
14        1.0
15        1.0
16        1.0
17        1.0
18        1.0
19        1.0
20        1.0
21        1.0
22        1.0
23        1.0
24        2.0
25        3.5
26        1.0
27        1.0
28        1.0
29        2.0
         ... 
123981    1.0
123982    2.0
123983    1.0
123984    1.0
123985    2.0
123986    1.0
123987    2.0
123988    1.0
123989    2.0
123990    1.0
123991    0.0
123992    1.0
123993    2.0
123994    1.0
123995    1.0
123996    2.0
123997    1.0
123998    1.0
123999    1.0
124000    1.0
124001    1.0
124002    1.0
124003    2.0
124004    1.0
124005    1.0
124006    1.0
124007    1.0
124008    1.0
124009    1.0
124010    1.0
Name: bathrooms, dtype: float64

created (date features)

train_test['Date'] = pd.to_datetime(train_test['created'])
train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour

train_test = train_test.drop(['Date', 'created'], axis=1)

description

# count of words present in the description column
train_test["num_description_words"] = train_test["description"].apply(lambda x: len(x.split(" ")))
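# note (added): splitting on a single space counts consecutive spaces as extra
# "words"; splitting on any whitespace run gives a slightly different count:
#train_test["num_description_words"] = train_test["description"].apply(lambda x: len(x.split()))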
train_test = train_test.drop(['description'] , axis = 1)

manager_id

Bucket managers into tiers by how many listings they have: top 1%, 2%, 5%, 10%, 15%, 20%, 25%, 30%, 50%.

managers_count = train_test['manager_id'].value_counts()

# for each threshold, flag managers whose listing count lies in the top tier:
# top_1_manager = top 1% (count >= 99th percentile of all counts), top_2 = top 2%, etc.
for pct in [99, 98, 95, 90, 85, 80, 75, 70, 50]:
    top = set(managers_count.index.values[
        managers_count.values >= np.percentile(managers_count.values, pct)])
    train_test['top_%d_manager' % (100 - pct)] = train_test['manager_id'].isin(top).astype(int)
print train_test['top_1_manager']

out:

1         0
2         1
3         1
4         0
5         1
6         0
7         1
8         0
9         0
10        0
11        0
12        1
13        0
14        0
15        1
16        0
17        0
18        1
19        1
20        0
21        1
22        1
23        1
24        0
25        1
26        1
27        0
28        0
29        0
         ..
123981    1
123982    1
123983    0
123984    0
123985    1
123986    0
123987    0
123988    0
123989    0
123990    0
123991    1
123992    0
123993    0
123994    0
123995    1
123996    0
123997    0
123998    0
123999    0
124000    0
124001    1
124002    0
124003    1
124004    0
124005    0
124006    0
124007    1
124008    0
124009    0
124010    0
Name: top_1_manager, dtype: int64


building_id

Same treatment as for manager_id.

buildings_count = train_test['building_id'].value_counts()

for pct in [99, 98, 95, 90, 85, 80, 75, 70, 50]:
    top = set(buildings_count.index.values[
        buildings_count.values >= np.percentile(buildings_count.values, pct)])
    train_test['top_%d_building' % (100 - pct)] = train_test['building_id'].isin(top).astype(int)

photos

train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'] , axis = 1 , inplace = True)
print train_test['photos_count']

out:

0          5
1         11
2          8
3          3
4          3
5          5
6         10
7          5
8          5
9          9
10         1
11         5
12         4
13         6
14         6
15         2
16         5
17         8
18         0
19        32
20         6
21         5
22         0
23         4
24         6
25         7
26         6
27         6
28         1
29         1
          ..
123981     9
123982     5
123983     5
123984     1
123985     6
123986     1
123987     6
123988     4
123989     4
123990    10
123991     6
123992    21
123993     5
123994     7
123995     5
123996     4
123997     3
123998     5
123999     6
124000     5
124001     3
124002     3
124003     7
124004     9
124005     0
124006     0
124007     0
124008     2
124009     3
124010     2
Name: photos_count, dtype: int64

latitude, longitude

Encode the coordinates by clustering: fit the clustering on the training data only, then assign clusters to both the training and the test data. Also add the distance to the center (forum discussions suggested that the distance to the Manhattan center works even better).

# Clustering
train_location = train_test.loc[:ntrain-1, ['latitude', 'longitude']]
test_location = train_test.loc[ntrain:, ['latitude', 'longitude']]

# fit KMeans on the training coordinates only, then assign every row a cluster
kmeans_cluster = KMeans(n_clusters=20)
kmeans_cluster.fit(train_location)
res = kmeans_cluster.predict(pd.concat((train_location, test_location), axis=0).reset_index(drop=True))

train_test['centroid'] = res

# L1 (Manhattan) distance to the mean coordinate of the training data
center = [train_location['latitude'].mean(), train_location['longitude'].mean()]
train_test['distance'] = abs(train_test['latitude'] - center[0]) + abs(train_test['longitude'] - center[1])
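The code above uses the mean coordinate of the training data as the center. The forum suggestion of measuring distance to the Manhattan center instead might look like the sketch below; the coordinates are approximate values for central Manhattan and the column name distance_manhattan is introduced here, neither is from the original notebook.

# approximate coordinates of central Manhattan (hypothetical choice of center)
manhattan_center = (40.7831, -73.9712)
train_test['distance_manhattan'] = (abs(train_test['latitude'] - manhattan_center[0])
                                    + abs(train_test['longitude'] - manhattan_center[1]))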

display_address

train_test['display_address'] = train_test['display_address'].apply(lambda x: x.lower().strip())

street_address

train_test['street_address'] = train_test['street_address'].apply(lambda x: x.lower().strip())
print train_test['street_address']

out:

0         21630
1         21801
2          8941
3         13018
4         17671
5         13968
6          6744
7          3723
8         20097
9         18372
10         4449
11        20320
12         2493
13        10803
14        11947
15        13455
16         6955
17        22855
18         2014
19        10214
20         1688
21         6659
22        12597
23          151
24        10780
25        11655
26        16591
27        19731
28         1708
29        16321
          ...  
123981     9772
123982     5623
123983     7040
123984    12511
123985    15322
123986    10911
123987     5995
123988    22254
123989      797
123990     8795
123991     5871
123992     5080
123993    10206
123994    13569
123995    18949
123996     9193
123997    20031
123998     7789
123999    16247
124000     1293
124001     3831
124002    10982
124003     9739
124004     6945
124005       28
124006    22740
124007     9322
124008    13984
124009    13025
124010    22493
Name: street_address, dtype: int64

Categorical features

LabelEncoder

categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
#categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))

Define an encoding for the high-cardinality categorical features (manager_id, building_id, display_address, street_address): mean encoding, which replaces a feature value with the probability of each label class given that value, so the original one-dimensional feature becomes C-1 dimensions, where C is the number of label classes.
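The MeanEncoder implementation itself lives in MeanEncoder.py and is not shown here. As an illustration of the technique only (not the actual MeanEncoder code), a minimal out-of-fold version could look like the sketch below. It produces one probability column per class (dropping one column gives the C-1 dimensions mentioned above); a production version would also smooth the per-category estimates toward the global class prior. The function name mean_encode_oof is introduced here.

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def mean_encode_oof(X, y, col, n_splits=5):
    # Out-of-fold mean encoding of one categorical column: for each label
    # class c, add a column estimating P(y == c | category value), where the
    # estimate for each row is computed on the *other* folds only, so a row
    # never sees its own label (this is what limits target leakage).
    X = X.reset_index(drop=True)
    y = pd.Series(np.asarray(y))  # align y with X positionally
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for c in np.unique(y):
        new_col = '%s_pred_%s' % (col, c)
        X[new_col] = np.nan
        target = (y == c).astype(float)
        for fit_idx, enc_idx in skf.split(X, y):
            # class frequency of each category value, computed on the fit folds
            means = target[fit_idx].groupby(X[col][fit_idx]).mean()
            X.loc[enc_idx, new_col] = X[col][enc_idx].map(means).values
        X[new_col] = X[new_col].fillna(target.mean())  # categories unseen in fit folds
    return X

# usage (illustrative): mean_encode_oof(train_test.iloc[:ntrain, :], y_train, 'manager_id')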

from MeanEncoder import MeanEncoder
me = MeanEncoder(categoricals)

#train
train_new = train_test.iloc[:ntrain, :]
train_new_cat = me.fit_transform(train_new, y_train)

#test
test_new = train_test.iloc[ntrain:, :]
test_new_cat = me.transform(test_new)
train_new

out:

[image: train_new output]

train_new_cat

[image: train_new_cat output]

features

Two things are extracted: the number of features listed, and the term frequency of the words appearing in the features field. The latter is essentially a one-hot encoding whose dictionary is the set of words occurring in features across the dataset (CountVectorizer produces counts, but in this task a given word rarely appears more than once per listing).
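A toy illustration of what CountVectorizer produces (the two feature strings here are made up):

from sklearn.feature_extraction.text import CountVectorizer

demo = ['doorman elevator cats allowed', 'elevator laundry']
v = CountVectorizer()
m = v.fit_transform(demo)
print(v.get_feature_names())  # [u'allowed', u'cats', u'doorman', u'elevator', u'laundry']
print(m.toarray())            # [[1 1 1 1 0]
                              #  [0 0 0 1 1]]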

train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(train_test['features2'])
c_vect_sparse_cols = c_vect.get_feature_names()

train_test.drop(['features', 'features2'], axis=1, inplace=True)

#hstack is the last step of feature processing; all other features must be converted to numeric types first
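# added sanity check (not in the original notebook): sparse.hstack requires
# every remaining train_test column to be numeric, so fail loudly otherwise
non_numeric = [c for c in train_test.columns if train_test[c].dtype == 'object']
assert len(non_numeric) == 0, non_numeric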
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()
train_test['features_count']

out:

0          0
1          5
2          4
3          2
4          1
5          0
6          8
7          8
8          4
9          2
10         0
11         7
12         4
13         6
14         5
15         5
16         1
17         5
18         3
19         2
20         4
21         9
22         2
23        10
24        12
25        12
26         3
27         8
28         0
29         8
          ..
123981    15
123982     7
123983     0
123984     4
123985    10
123986    10
123987    10
123988     4
123989     2
123990     3
123991     9
123992     3
123993     7
123994     4
123995     4
123996     5
123997     2
123998     6
123999     1
124000     9
124001     3
124002     6
124003    10
124004     1
124005     2
124006     3
124007     0
124008     6
124009     3
124010     3
Name: features_count, dtype: int64
c_vect_sparse_cols
[u'1br',
 u'24',
 u'2br',
 u'3br',
 u'ac',
 u'access',
 u'actual',
 u'air',
 u'allowed',
 u'appliances',
 u'approval',
 u'apt',
 u'areas',
 u'attended',
 u'available',
 u'backyard',
 u'balcony',
 u'basement',
 u'bath',
 u'bathroom',
 u'bedford',
 u'bedroom',
 u'bedrooms',
 u'bike',
 u'billiards',
 u'bldg',
 u'blks',
 u'brand',
 u'brick',
 u'brownstone',
 u'building',
 u'business',
 u'cable',
 u'cats',
 u'ceiling',
 u'ceilings',
 u'center',
 u'central',
 u'chef',
 u'children',
 u'childrens',
 u'clean',
 u'cleaning',
 u'close',
 u'closet',
 u'closets',
 u'club',
 u'common',
 u'concierge',
 u'conditioning',
 u'construction',
 u'courtyard',
 u'deck',
 u'decorative',
 u'details',
 u'dining',
 u'dishwasher',
 u'dogs',
 u'doorman',
 u'dry',
 u'dryer',
 u'duplex',
 u'eat',
 u'elev',
 u'elevator',
 u'entertainment',
 u'exclusive',
 u'exposed',
 u'facilities',
 u'fee',
 u'fireplace',
 u'fitness',
 u'flex',
 u'floor',
 u'floors',
 u'free',
 u'friendly',
 u'ft',
 u'fully',
 u'furnished',
 u'garage',
 u'garden',
 u'granite',
 u'green',
 u'gut',
 u'gym',
 u'hardwood',
 u'health',
 u'hi',
 u'high',
 u'highrise',
 u'home',
 u'hour',
 u'housekeeping',
 u'huge',
 u'included',
 u'indoor',
 u'intercom',
 u'internet',
 u'kitchen',
 u'large',
 u'laundry',
 u'level',
 u'light',
 u'live',
 u'living',
 u'lndry',
 u'lobby',
 u'loft',
 u'lot',
 u'lounge',
 u'lowrise',
 u'luxury',
 u'marble',
 u'massive',
 u'microwave',
 u'midrise',
 u'modern',
 u'month',
 u'multi',
 u'natural',
 u'new',
 u'newly',
 u'ok',
 u'outdoor',
 u'oversized',
 u'park',
 u'parking',
 u'patio',
 u'pet',
 u'pets',
 u'photos',
 u'playroom',
 u'pool',
 u'post',
 u'pre',
 u'prewar',
 u'private',
 u'publicoutdoor',
 u'queen',
 u'ramp',
 u'reduced',
 u'renovated',
 u'rent',
 u'residents',
 u'rise',
 u'roof',
 u'roofdeck',
 u'rooftop',
 u'room',
 u'sauna',
 u'service',
 u'services',
 u'share',
 u'shared',
 u'shares',
 u'short',
 u'simplex',
 u'site',
 u'sized',
 u'skylight',
 u'space',
 u'spacious',
 u'speed',
 u'sprawling',
 u'ss',
 u'stainless',
 u'steel',
 u'steps',
 u'storage',
 u'studio',
 u'sublet',
 u'subway',
 u'super',
 u'superintendent',
 u'swimming',
 u'tenant',
 u'term',
 u'terrace',
 u'time',
 u'tons',
 u'train',
 u'true',
 u'tv',
 u'unit',
 u'utilities',
 u'valet',
 u'video',
 u'view',
 u'views',
 u'virtual',
 u'walk',
 u'walls',
 u'war',
 u'washer',
 u'water',
 u'wheelchair',
 u'wifi',
 u'windows',
 u'work']

Saving the engineered features to files

#save as CSV so the result is easy to inspect in Excel
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]

# y_train still carries the original listing index while X_train has a fresh 0..n index,
# so reset it before concatenating, otherwise the labels would misalign
train_new = pd.concat((X_train, y_train.reset_index(drop=True)), axis=1)
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)
from scipy.io import mmwrite

X_train_sparse = train_test_sparse[:ntrain, :]
X_test_sparse = train_test_sparse[ntrain:, :]

train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr()

mmwrite(dpath + 'RentListingInquries_FE_train.txt',train_sparse)
mmwrite(dpath + 'RentListingInquries_FE_test.txt',X_test_sparse)

#saving in libsvm sparse format is more efficient if XGBoost is called directly
#from sklearn.datasets import dump_svmlight_file
#dump_svmlight_file(X_train_sparse, y_train, dpath + 'RentListingInquries_FE_train.txt')
#dump_svmlight_file(X_test_sparse, np.zeros(X_test_sparse.shape[0]), dpath + 'RentListingInquries_FE_test.txt')  # dump_svmlight_file requires a y, so a dummy vector is used for the test set
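To load the saved matrices back and train, something along these lines should work (a sketch; it assumes the xgboost package is installed, and note that depending on the scipy version mmwrite may have appended a .mtx extension to the file names above):

from scipy.io import mmread
import xgboost as xgb

# mmread returns a COO matrix; convert to CSR so columns can be sliced
train_sparse = mmread(dpath + 'RentListingInquries_FE_train.txt').tocsr()
X = train_sparse[:, :-1]                   # feature columns
y = train_sparse[:, -1].toarray().ravel()  # the last column holds the label
dtrain = xgb.DMatrix(X, label=y)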