# python 2.7
import sys
sys.path.append('./data')  # add ./data to the module search path so the local MeanEncoder.py can be imported

import MeanEncoder
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from nltk.metrics import distance as distance
from sklearn.model_selection import StratifiedKFold
#from MeanEncoder import MeanEncoder
dpath = './data/'
train = pd.read_json(dpath + 'RentListingInquries_train.json')
test = pd.read_json(dpath + 'RentListingInquries_test.json')
train.head()
From the earlier analysis (and common sense), listing_id tells us nothing about interest_level, so we drop it. Feature encoding has to be applied to both the training and the test set, so the simplest approach is to concatenate the two and process them together.
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])
#y_train = train.interest_level.values
y_train = train.interest_level
train = train.drop(['listing_id', 'interest_level'], axis=1)

listing_id = test.listing_id.values
test = test.drop('listing_id', axis=1)

ntrain = train.shape[0]
# pd.concat stacks DataFrames along the given axis; axis=0 appends test below train
train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
y_train
out:
10        1
10000     2
100004    0
100007    2
100013    2
100014    1
100016    2
100020    2
100026    1
100027    2
         ..
99982     0
99984     2
99986     2
99987     2
99988     1
9999      1
99991     2
99992     2
99993     2
99994     2
Name: interest_level, dtype: int64
Numerical features can be combined with + / - / * / /. Monotonic transformations of a single feature are unnecessary for XGBoost: tree splits depend only on the ordering of the values, and a monotonic transform preserves that ordering.
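As a quick illustration (a synthetic check, not part of the original notebook), a decision tree trained on a feature and one trained on its logarithm find equivalent splits and make identical predictions:

# Synthetic demo: trees are invariant to monotonic transforms of a feature
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
price = rng.uniform(500, 13000, size=(1000, 1))
y = (price[:, 0] > 4000).astype(int)   # toy target driven by a price threshold

tree_raw = DecisionTreeClassifier(max_depth=3, random_state=0).fit(price, y)
tree_log = DecisionTreeClassifier(max_depth=3, random_state=0).fit(np.log(price), y)

print (tree_raw.predict(price) == tree_log.predict(np.log(price))).all()  # True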
# remove some noise: cap extreme prices
#ulimit = np.percentile(train_test.price.values, 99)
train_test.loc[train_test['price'] > 13000, 'price'] = 13000
# remove some noise
# df.loc[row_selector, column_label] selects the given rows/columns for assignment
# The lines below replace the obviously mistyped bathroom counts 112, 10 and 20
# with 1.5, 1 and 2 respectively
train_test.loc[train_test["bathrooms"] == 112, "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10, "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20, "bathrooms"] = 2
# New features:
# price_bathrooms: price per bathroom
# price_bedrooms:  price per bedroom
# (+1.0 in the denominator avoids division by zero)
train_test['price_bathrooms'] = train_test["price"] / (train_test["bathrooms"] + 1.0)
train_test['price_bedrooms'] = train_test["price"] / (train_test["bedrooms"] + 1.0)
# New features:
# room_diff: number of bathrooms minus number of bedrooms
# room_num:  number of bathrooms plus number of bedrooms
train_test["room_diff"] = train_test["bathrooms"] - train_test["bedrooms"]
train_test["room_num"] = train_test["bedrooms"] + train_test["bathrooms"]
print train_test["bathrooms"]
0         1.5
1         1.0
2         1.0
3         1.0
4         1.0
5         2.0
6         1.0
7         2.0
8         1.0
9         2.0
         ...
124001    1.0
124002    1.0
124003    2.0
124004    1.0
124005    1.0
124006    1.0
124007    1.0
124008    1.0
124009    1.0
124010    1.0
Name: bathrooms, dtype: float64
train_test['Date'] = pd.to_datetime(train_test['created'])
train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour
train_test = train_test.drop(['Date', 'created'], axis=1)
# count of words in the description column
train_test["num_description_words"] = train_test["description"].apply(lambda x: len(x.split(" ")))
train_test = train_test.drop(['description'], axis=1)
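Note that len(x.split(" ")) also counts the empty strings produced by consecutive spaces. If that matters, a slightly more robust variant (a sketch, not in the original notebook) counts word tokens instead:

# Count word tokens rather than splitting on a single space
import re
count_words = lambda x: len(re.findall(r'\w+', x))
# train_test["num_description_words"] = train_test["description"].apply(count_words)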
Bucket managers into tiers by listing count: top 1%, 2%, 5%, 10%, 15%, 20%, 25%, 30%, 50%.
managers_count = train_test['manager_id'].value_counts()

train_test['top_1_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 99)] else 0)
train_test['top_2_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 98)] else 0)
train_test['top_5_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 95)] else 0)
train_test['top_10_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 90)] else 0)
train_test['top_15_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 85)] else 0)
train_test['top_20_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 80)] else 0)
train_test['top_25_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 75)] else 0)
train_test['top_30_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 70)] else 0)
train_test['top_50_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 50)] else 0)
print train_test['top_1_manager']
1         0
2         1
3         1
4         0
5         1
6         0
7         1
8         0
9         0
10        0
         ..
124001    1
124002    0
124003    1
124004    0
124005    0
124006    0
124007    1
124008    0
124009    0
124010    0
Name: top_1_manager, dtype: int64
building_id gets the same treatment as manager_id (see the compact loop after the next block).
buildings_count = train_test['building_id'].value_counts()

train_test['top_1_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 99)] else 0)
train_test['top_2_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 98)] else 0)
train_test['top_5_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 95)] else 0)
train_test['top_10_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 90)] else 0)
train_test['top_15_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 85)] else 0)
train_test['top_20_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 80)] else 0)
train_test['top_25_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 75)] else 0)
train_test['top_30_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 70)] else 0)
train_test['top_50_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 50)] else 0)
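The two tier blocks above are nine near-identical statements each; an equivalent, more compact version (a sketch with the same behavior, using a set for faster membership tests) loops over the thresholds:

# Compact equivalent of the manager/building tier features above
for name, counts in [('manager', managers_count), ('building', buildings_count)]:
    for tier, pct in [(1, 99), (2, 98), (5, 95), (10, 90), (15, 85),
                      (20, 80), (25, 75), (30, 70), (50, 50)]:
        top_ids = set(counts.index.values[counts.values >= np.percentile(counts.values, pct)])
        train_test['top_%d_%s' % (tier, name)] = train_test['%s_id' % name].apply(
            lambda x: 1 if x in top_ids else 0)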
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'], axis=1, inplace=True)
print train_test['photos_count']
0          5
1         11
2          8
3          3
4          3
5          5
6         10
7          5
8          5
9          9
          ..
124001     3
124002     3
124003     7
124004     9
124005     0
124006     0
124007     0
124008     2
124009     3
124010     2
Name: photos_count, dtype: int64
Cluster the locations to get a low-dimensional encoding (fit on the training data only, then transform both the training and the test data), plus the distance to the center point (forum threads suggest the Manhattan / L1 distance to the center works better).
# Clustering: fit KMeans on the training rows only, then assign a cluster
# label to every row (train + test)
train_location = train_test.loc[:ntrain - 1, ['latitude', 'longitude']]
test_location = train_test.loc[ntrain:, ['latitude', 'longitude']]

kmeans_cluster = KMeans(n_clusters=20)
kmeans_cluster.fit(train_location)
res = kmeans_cluster.predict(
    pd.concat((train_location, test_location), axis=0).reset_index(drop=True))
train_test['centroid'] = res

# L1 (Manhattan) distance to the mean location of the training data
center = [train_location['latitude'].mean(), train_location['longitude'].mean()]
train_test['distance'] = abs(train_test['latitude'] - center[0]) + abs(train_test['longitude'] - center[1])
train_test['display_address'] = train_test['display_address'].apply(lambda x: x.lower().strip())
train_test['street_address'] = train_test['street_address'].apply(lambda x: x.lower().strip())
print train_test['street_address']
0         21630
1         21801
2          8941
3         13018
4         17671
5         13968
6          6744
7          3723
8         20097
9         18372
          ...
124001     3831
124002    10982
124003     9739
124004     6945
124005       28
124006    22740
124007     9322
124008    13984
124009    13025
124010    22493
Name: street_address, dtype: int64

(The integer values show that this output was captured after the label encoding below had already been run; before encoding, street_address still contains strings.)
Label-encode the categorical ID and address features.
categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
#categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))
Mean-encode the high-cardinality categorical features (manager_id, building_id, display_address, street_address): each category value is replaced by the estimated probability of each target class given that value, so the original single column becomes C-1 new columns, where C is the number of target classes.
from MeanEncoder import MeanEncoder
me = MeanEncoder(categoricals)

# train
#import pdb
#pdb.set_trace()
train_new = train_test.iloc[:ntrain, :]
train_new_cat = me.fit_transform(train_new, y_train)

# test
test_new = train_test.iloc[ntrain:, :]
test_new_cat = me.transform(test_new)
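MeanEncoder.py is a local helper whose source is not shown here, so the interface above is taken as given. As a rough sketch of the underlying idea (my own simplified version for a binary indicator target; the real class presumably adds smoothing and the multi-class bookkeeping), out-of-fold mean encoding of a single column looks like:

from sklearn.model_selection import StratifiedKFold

def mean_encode_column(train_col, y, n_splits=5):
    # Out-of-fold target rate per category: each row is encoded with statistics
    # computed on the other folds, which limits target leakage.
    encoded = pd.Series(np.nan, index=train_col.index)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for fit_idx, enc_idx in skf.split(train_col, y):
        # category -> mean target, computed on the fitting folds only
        rates = y.iloc[fit_idx].groupby(train_col.iloc[fit_idx]).mean()
        encoded.iloc[enc_idx] = train_col.iloc[enc_idx].map(rates).values
    return encoded.fillna(y.mean())  # categories unseen in the fitting folds get the prior

For the three-class target one would call this once per class on the indicator (y_train == c).astype(int), which yields the C-1 informative extra columns mentioned above.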
train_new
train_new_cat
Also record the number of entries in the features field, plus word counts over the features text: this is effectively a one-hot encoding whose dictionary is the set of words appearing in the dataset's features column (strictly these are word counts, but in this task each word rarely occurs more than once per listing).
train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(train_test['features2'])
c_vect_sparse_cols = c_vect.get_feature_names()

train_test.drop(['features', 'features2'], axis=1, inplace=True)

# hstack is the last step of feature processing; every remaining column must be
# numeric before it can be stacked with the sparse word-count matrix
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()
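To make the CountVectorizer step concrete, here is a toy run on two made-up feature strings (hypothetical inputs, not from the dataset):

# Toy demo of CountVectorizer: vocabulary is sorted, 'no' is an English stop word
docs = ['doorman elevator laundry', 'no fee laundry dishwasher']
cv = CountVectorizer(stop_words='english')
m = cv.fit_transform(docs)
print cv.get_feature_names()  # [u'dishwasher', u'doorman', u'elevator', u'fee', u'laundry']
print m.toarray()             # [[0 1 1 0 1]
                              #  [1 0 0 1 1]]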
train_test['features_count']
0          0
1          5
2          4
3          2
4          1
5          0
6          8
7          8
8          4
9          2
          ..
124001     3
124002     6
124003    10
124004     1
124005     2
124006     3
124007     0
124008     6
124009     3
124010     3
Name: features_count, dtype: int64
c_vect_sparse_cols
[u'1br', u'24', u'2br', u'3br', u'ac', u'access', u'actual', u'air', u'allowed', u'appliances', u'approval', u'apt', u'areas', u'attended', u'available', u'backyard', u'balcony', u'basement', u'bath', u'bathroom', u'bedford', u'bedroom', u'bedrooms', u'bike', u'billiards', u'bldg', u'blks', u'brand', u'brick', u'brownstone', u'building', u'business', u'cable', u'cats', u'ceiling', u'ceilings', u'center', u'central', u'chef', u'children', u'childrens', u'clean', u'cleaning', u'close', u'closet', u'closets', u'club', u'common', u'concierge', u'conditioning', u'construction', u'courtyard', u'deck', u'decorative', u'details', u'dining', u'dishwasher', u'dogs', u'doorman', u'dry', u'dryer', u'duplex', u'eat', u'elev', u'elevator', u'entertainment', u'exclusive', u'exposed', u'facilities', u'fee', u'fireplace', u'fitness', u'flex', u'floor', u'floors', u'free', u'friendly', u'ft', u'fully', u'furnished', u'garage', u'garden', u'granite', u'green', u'gut', u'gym', u'hardwood', u'health', u'hi', u'high', u'highrise', u'home', u'hour', u'housekeeping', u'huge', u'included', u'indoor', u'intercom', u'internet', u'kitchen', u'large', u'laundry', u'level', u'light', u'live', u'living', u'lndry', u'lobby', u'loft', u'lot', u'lounge', u'lowrise', u'luxury', u'marble', u'massive', u'microwave', u'midrise', u'modern', u'month', u'multi', u'natural', u'new', u'newly', u'ok', u'outdoor', u'oversized', u'park', u'parking', u'patio', u'pet', u'pets', u'photos', u'playroom', u'pool', u'post', u'pre', u'prewar', u'private', u'publicoutdoor', u'queen', u'ramp', u'reduced', u'renovated', u'rent', u'residents', u'rise', u'roof', u'roofdeck', u'rooftop', u'room', u'sauna', u'service', u'services', u'share', u'shared', u'shares', u'short', u'simplex', u'site', u'sized', u'skylight', u'space', u'spacious', u'speed', u'sprawling', u'ss', u'stainless', u'steel', u'steps', u'storage', u'studio', u'sublet', u'subway', u'super', u'superintendent', u'swimming', u'tenant', u'term', u'terrace', u'time', u'tons', u'train', u'true', u'tv', u'unit', u'utilities', u'valet', u'video', u'view', u'views', u'virtual', u'walk', u'walls', u'war', u'washer', u'water', u'wheelchair', u'wifi', u'windows', u'work']
# Save as csv so the result is easy to inspect in Excel
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]

# y_train keeps the original json index, so reset it before concatenating;
# otherwise concat aligns on index and produces NaNs
train_new = pd.concat((X_train, y_train.reset_index(drop=True)), axis=1)
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)
from scipy.io import mmwrite

X_train_sparse = train_test_sparse[:ntrain, :]
X_test_sparse = train_test_sparse[ntrain:, :]
train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr()

# Matrix Market sparse format
mmwrite(dpath + 'RentListingInquries_FE_train.txt', train_sparse)
mmwrite(dpath + 'RentListingInquries_FE_test.txt', X_test_sparse)

# Alternatively, save in libsvm sparse format, which XGBoost can read directly
# and more efficiently:
#from sklearn.datasets import dump_svmlight_file
#dump_svmlight_file(X_train_sparse, y_train, dpath + 'RentListingInquries_FE_train.txt')
#dump_svmlight_file(X_test_sparse, np.zeros(X_test_sparse.shape[0]), dpath + 'RentListingInquries_FE_test.txt')  # dummy labels for the test set
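To read the Matrix Market files back later, a usage sketch (note that scipy's mmwrite typically appends a .mtx extension when the target name does not end in .mtx, and mmread tries that variant automatically):

# Load the saved sparse matrix and split features from the appended label column
from scipy.io import mmread

train_sparse_loaded = mmread(dpath + 'RentListingInquries_FE_train.txt').tocsr()
X_train_loaded = train_sparse_loaded[:, :-1]                                # features
y_train_loaded = np.asarray(train_sparse_loaded[:, -1].todense()).ravel()  # label column appended above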