The data come from the Two Sigma Connect: Rental Listing Inquiries competition hosted on Kaggle in 2017; example data-analysis kernels: https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/kernels
Knowing the basic characteristics of the dataset is useful for the later feature engineering and for choosing a machine learning model.
# python 2.7
from __future__ import print_function  # make print() behave as a function under Python 2

# import the required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
%matplotlib inline
dpath = './data/'
train = pd.read_json(dpath + 'RentListingInquries_train.json')
train.head()
Check the data size. (The test data can be read and inspected in the same way.)
print("Train:" , train.shape)
This dataset contains features of many types: numerical features, categorical features, date features, geographic-location features, text features, and image features.
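As a quick peek at one value of each non-numeric feature type (a minimal sketch; the column names follow the competition's data description and are all used later in this notebook):

# One example value per non-numeric feature type
print(train['created'].iloc[0])            # date feature, e.g. '2016-06-24 07:54:24'
print(train['features'].iloc[0])           # list of text tags describing the listing
print(train['photos'].iloc[0])             # list of photo URLs (the image feature)
print(train['description'].iloc[0][:100])  # free-text feature (first 100 characters)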
train.info()
# summary statistics
train.describe()
# check for missing values
train.isnull().sum()
There are many ways to visualize data in Python. Matplotlib is very powerful, and also complex: you can do almost anything with it, but it is not easy to learn. Many tools (notably Pandas and Seaborn) wrap it. Pandas provides built-in plotting, so you can draw various charts directly from a pandas.DataFrame; this is handy for quick, simple plots, but customizing them still requires learning matplotlib. Seaborn adds a higher-level API on top of matplotlib that makes plotting easier: in most cases seaborn alone produces attractive plots, while matplotlib lets you build more specialized ones.
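As a minimal sketch of that trade-off (using the train DataFrame loaded above): pandas can draw a quick bar chart of the label counts in one line; the seaborn version of the same chart, with explicit category ordering, is the next cell.

# pandas built-in plotting: a quick one-line bar chart from a Series
train['interest_level'].value_counts().plot(kind = 'bar')
plt.show()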
# This is a classification problem; use countplot for the histogram of a categorical feature
sns.countplot(train.interest_level , order = ['low' , 'medium' , 'high'])
plt.xlabel('Interest Level')
plt.ylabel('Number of occurrences')
# Encode with ordered integers: 0: low, 1: medium, 2: high
train['interest'] = np.where(train.interest_level == 'low' , 0 ,
                             np.where(train.interest_level == 'medium' , 1 , 2))
Most samples have interest_level low, followed by medium, and finally high. LabelEncoder is not used here because LabelEncoder cannot be told which integer each label should map to. Another conversion that works: target_num_map = {'high':2, 'medium':1, 'low':0}; y = train["interest_level"].apply(lambda x: target_num_map[x]), as sketched below.
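A minimal sketch of that alternative, plus a check that it agrees with the np.where encoding above:

# Map each label string to its ordinal code
target_num_map = {'high': 2 , 'medium': 1 , 'low': 0}
y = train['interest_level'].apply(lambda x: target_num_map[x])

# Should print True: both encodings assign the same ordinals
print((y == train['interest']).all())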
bathrooms , bedrooms , price — the bathrooms and bedrooms features take only a few distinct values, so they can be treated as categorical features and their distributions drawn with seaborn.countplot; price can take many values, so its distribution is drawn with seaborn.distplot.
fig = plt.figure()
### Number of occurrences
sns.countplot(train.bathrooms)
plt.xlabel('Number of Bathrooms')
plt.ylabel('Number of occurrences')
While we are here, look at the relationship between bathrooms and the label: a stripplot of bathrooms under each interest_level. Scatter points can overlap, so stripplot adds some random jitter when drawing them.
order = ['low' , 'medium' , 'high']
sns.stripplot(train["interest_level"] , train["bathrooms"] , jitter = True , order = order)
plt.title("Number of Bathrooms Vs Interest_level")
The histogram also shows that very few listings have more than 4 bathrooms.
# Drop samples whose bathrooms value is greater than or equal to 10
ulimit = 10
train = train[train['bathrooms'] < ulimit]
train.shape
One sample was removed.
fig = plt.figure()
# Number of occurrences
sns.countplot(train.bathrooms)
plt.xlabel('Number of Bathrooms')
plt.ylabel('Number of occurrences')
sns.stripplot(y = "bathrooms" , x = "interest_level" , data = train , jitter = True , order = order)
sns.countplot(x = "bathrooms" , hue = "interest_level" , data = train)
Listings with no bathroom almost never draw high interest.
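To put a number on this claim (a minimal sketch; crosstab's normalize argument requires pandas >= 0.18.1), a row-normalized cross-tabulation gives the share of each interest level for every bathrooms value:

# Fraction of low/medium/high interest within each bathroom count
print(pd.crosstab(train['bathrooms'] , train['interest_level'] , normalize = 'index'))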
fig = plt.figure()
# Number of occurrences
sns.countplot(train.bedrooms)
plt.xlabel("Number of Bedrooms")
plt.ylabel("Number of occurrences")
Look at the relationship between bedrooms and the label.
order = ['low' , 'medium' , 'high']
sns.stripplot(train["interest_level"] , train["bedrooms"] , jitter = True , order = order)
plt.title("Number of Bedrooms Vs Interest_level")
sns.countplot(x = "bedrooms" , hue = "interest_level" , data = train)
plt.scatter(range(train.shape[0]) , train["price"].values , color = 'purple')
plt.title("Distribution of Price")
# Drop samples whose price is greater than or equal to 1000000
ulimit = 1000000
train = train[train['price'] < ulimit]
train.shape
plt.scatter(range(train.shape[0]) , train["price"].values , color = 'purple')
plt.title("Distribution of Price")
# clip at the 99th percentile
ulimit = np.percentile(train.price.values , 99)
train.loc[train['price'] > ulimit , 'price'] = ulimit
sns.distplot(train.price.values , bins = 50 , kde = True)
plt.xlabel('price' , fontsize = 12)
plt.show()
The distribution is right skewed; applying a log transform makes the data look more symmetric, closer to a normal distribution.
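A minimal numeric check of the skew claim: the sample skewness should drop toward 0 after the log1p transform.

# Sample skewness before and after the log transform (0 = symmetric)
print('raw price skew: {:.2f}'.format(train['price'].skew()))
print('log1p price skew: {:.2f}'.format(np.log1p(train['price']).skew()))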
plt.figure(figsize = (13 , 9))
sns.distplot(np.log1p(train["price"]))
Look at the relationship between price and the label.
order = ['low' , 'medium' , 'high']
sns.stripplot(train["interest_level"] , train["price"] , jitter = True , order = order)
plt.title("Price Vs Interest_level")
The prices of low-interest listings look roughly uniformly spread, while medium- and high-interest prices concentrate more in the 1500-8000 range. A violinplot shows more of a feature's distribution under each class: a kernel density estimate (KDE), the three quartiles (1/4, 1/2, 3/4), and 1.5 times the interquartile range (IQR). The IQR is the difference between the third and first quartiles (Q3 - Q1); it measures how spread out the variable is and is a more robust statistic than the variance.
order = ['low' , 'medium' , 'high']
sns.violinplot(x = 'interest_level' , y = 'price' , data = train , order = order)
plt.xlabel('Interest level' , fontsize = 12)
plt.ylabel('price' , fontsize = 12)
plt.show()
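As a numeric companion to the violin plot (a minimal sketch), the quartiles and IQR of price can be computed directly:

# Quartiles of price; IQR = Q3 - Q1, the robust spread drawn inside each violin
q1 , q2 , q3 = np.percentile(train['price'].values , [25 , 50 , 75])
print('Q1: {} median: {} Q3: {} IQR: {}'.format(q1 , q2 , q3 , q3 - q1))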
sns.distplot(train.listing_id.values , bins = 50 , kde = True)
plt.xlabel('listing_id')
plt.show()
The relationship between listing_id and the label:
order = ['low' , 'medium' , 'high']
sns.stripplot(train["interest_level"] , train["listing_id"] , jitter = True , order = order)
plt.title("listing_id Vs Interest_level")
order = ['low' , 'medium' , 'high']
sns.violinplot(x = 'interest_level' , y = 'listing_id' , data = train , order = order)
plt.xlabel('Interest level' , fontsize = 12)
plt.ylabel('listing_id' , fontsize = 12)
plt.show()
Latitude & longitude are numerical variables, but their physical meaning is the geographic location of the listing.
sns.lmplot(x = "longitude" , y = "latitude" , fit_reg = False , hue = 'interest_level',
           hue_order = ['low', 'medium', 'high'] , size = 9,
           scatter_kws = {'alpha':0.4 , 's':30},
           data = train[(train.longitude > train.longitude.quantile(0.005))
                      & (train.longitude < train.longitude.quantile(0.995))
                      & (train.latitude > train.latitude.quantile(0.005))
                      & (train.latitude < train.latitude.quantile(0.995))])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
The plot above drops the data points with extreme longitude or latitude values. The high-interest listings are visibly concentrated in a small area. You can load Google Earth to inspect their exact locations. Both code snippets below display the data on a map; try either one, but the corresponding packages must be installed first.

from mpl_toolkits.basemap import Basemap
from matplotlib import cm
west, south, east, north = -74.02, 40.64, -73.85, 40.86
fig = plt.figure(figsize=(14,10))
ax = fig.add_subplot(111)
m = Basemap(projection='merc', llcrnrlat=south, urcrnrlat=north,
            llcrnrlon=west, urcrnrlon=east, lat_ts=south, resolution='i')
x, y = m(train['longitude'].values, train['latitude'].values)
m.hexbin(x, y, gridsize=200, bins='log', cmap=cm.YlOrRd_r)
import gpxpy
import gpxpy.gpx
gpx = gpxpy.gpx.GPX()
for index, row in train.iterrows():
    #print (row['latitude'], row['longitude'])
    # keep only the high-interest listings: plotting all points makes Google Earth slow
    if row['interest_level'] == 'high':
        gps_waypoint = gpxpy.gpx.GPXWaypoint(row['latitude'], row['longitude'], elevation=10)
        gpx.waypoints.append(gps_waypoint)
filename = "GoogleEarth.gpx" FILE = open(filename,"w") FILE.writelines(gpx.to_xml()) FILE.close()
""" cnt_srs为pandas中series类对象,左边的地址为索引,右边的对索引对应的值,即该地址出现了几次 cnt_srs的打印值为: display_address 135 \tEast 10th Street and East 11th Street 1 \tWashington Street 1 2 Allen street 1 Hell's Kitchen, Midtown West 3 Hell's Kitchen, Midtown West 2 Hell's Kitchen, Midtown West 4 11th Street 2 17th Street 1 1st Ave. 1 """ cnt_srs = train.groupby('display_address')['display_address'].count() # (cnt_srs < i ).mean()为cnt_srs中,值小于i的值占总数的百分比(值范围0-1) for i in [2 , 10 , 50 , 100 , 500]: print('Display_address tha appear less than: {}%'.format(i , round((cnt_srs < i ).mean() * 100 , 2))) plt.figure() plt.hist(cnt_srs.values , bins = 100 , log = True , alpha = 0.9) #sns.distplot(cnt_srs.values , bins = 100) plt.xlabel('Number of times display_address appeared') plt.ylabel('log(Count)')
# Print the 10 display_address values that appear most often
### Let's get a list of top 10 display addresses
top10da = train.display_address.value_counts().nlargest(10).index.tolist()
fig = plt.figure()
ax = sns.countplot(x="display_address", hue="interest_level",
                   data=train[train.display_address.isin(top10da)])
plt.xlabel('display_address')
plt.ylabel('Number of advert occurrences')
### Display addresses are too long. Let's remove the tick labels
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off') # tick labels along the bottom edge are off
### Adding percents over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol = int(len(height)/3)
total = [height[i] + height[i + ncol] + height[i + 2*ncol] for i in range(ncol)] * 3
for i, p in enumerate(ax.patches):
    ax.text(p.get_x() + p.get_width()/2, height[i] + 20,
            '{:1.0%}'.format(height[i]/total[i]), ha="center")
### Let's get a list of top 10 building ids
top10building = train.building_id.value_counts().nlargest(10).index.tolist()
### ...and plot the number of rental adverts at each interest level for each of them
fig = plt.figure()
ax = sns.countplot(x="building_id", hue="interest_level",
                   data=train[train.building_id.isin(top10building)])
plt.xlabel('Building')
plt.ylabel('Number of advert occurrences')
### Building ids are too long. Let's remove the tick labels
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off') # tick labels along the bottom edge are off
### Adding percents over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol = int(len(height)/3)
total = [height[i] + height[i + ncol] + height[i + 2*ncol] for i in range(ncol)] * 3
for i, p in enumerate(ax.patches):
    ax.text(p.get_x() + p.get_width()/2, height[i] + 20,
            '{:1.0%}'.format(height[i]/total[i]), ha="center")
manager_id is handled in the same way as building_id.
### Let's get a list of top 10 managers
top10managers = train.manager_id.value_counts().nlargest(10).index.tolist()
### ...and plot the number of rental adverts at each interest level for each of them
fig = plt.figure()
ax = sns.countplot(x="manager_id", hue="interest_level",
                   data=train[train.manager_id.isin(top10managers)])
plt.xlabel('Manager')
plt.ylabel('Number of advert occurrences')
### Manager_ids are too long. Let's remove the tick labels
plt.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off') # tick labels along the bottom edge are off
### Adding percents over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol = int(len(height)/3)
total = [height[i] + height[i + ncol] + height[i + 2*ncol] for i in range(ncol)] * 3
for i, p in enumerate(ax.patches):
    ax.text(p.get_x() + p.get_width()/2, height[i] + 20,
            '{:1.0%}'.format(height[i]/total[i]), ha="center")
Format example of the created field: 2016-06-24 07:54:24
train['created'] = pd.to_datetime(train['created'])
train['date'] = train['created'].dt.date
train['year'] = train['created'].dt.year
train['month'] = train['created'].dt.month
train['day'] = train['created'].dt.day
train['hour'] = train['created'].dt.hour
train['weekday'] = train['created'].dt.weekday
train['week'] = train['created'].dt.week
train['quarter'] = train['created'].dt.quarter
# weekday 5 and 6 are Saturday and Sunday
train['weekend'] = ((train['weekday'] == 5) | (train['weekday'] == 6))
train['wd'] = ((train['weekday'] != 5) & (train['weekday'] != 6))
cnt_srs = train['date'].value_counts()
plt.figure(figsize = (12 , 4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index , cnt_srs.values)
ax.xaxis_date()
plt.xticks(rotation = 'vertical')
plt.show()
hourDF = train.groupby(['hour' , 'interest_level'])['hour'].count().unstack('interest_level').fillna(0)
hourDF[['low' , 'medium' , 'high']].plot(kind = 'bar' , stacked = True)
monthDF = train.groupby(['month' , 'interest_level'])['month'].count().unstack('interest_level').fillna(0)
monthDF[['low' , 'medium' , 'high']].plot(kind = 'bar' , stacked = True)
train['num_photos'] = train['photos'].apply(len)
# np.percentile(a, q) computes the q-th percentile of array a
ulimit = np.percentile(train.num_photos.values , 99)
# clip: any num_photos value above ulimit is set to ulimit
train.loc[train['num_photos'] > ulimit , 'num_photos'] = ulimit
sns.countplot(train.num_photos)
plt.xlabel('Number of photos')
plt.ylabel('Number of occurrences')
train.loc[train['num_photos'] > 15 , 'num_photos'] = 15
#sns.stripplot(y="num_photos", x="interest_level", data=train, jitter=True, order=order)
plt.figure()
sns.violinplot(x="num_photos", y="interest_level", data=train, order=['low','medium','high'])
plt.xlabel('Number of Photos')
plt.ylabel('Interest Level')
plt.show()
train['len_features'] = train['features'].apply(len)
sns.countplot(train.len_features)
plt.xlabel('Length of features')
plt.ylabel('Number of occurrences')
train.loc[train['len_features'] > 16 , 'len_features'] = 16
plt.figure()
sns.violinplot(x="len_features", y="interest_level", data=train, order=['low','medium','high'])
plt.xlabel('Length of Features')
plt.ylabel('Interest Level')
plt.show()
The next cell counts the number of words in the description text, using a single space as the separator.
train['num_description_words'] = train['description'].apply(lambda x: len(x.split(' ')))
train['len_description'] = train['description'].apply(len)
#ulimit = np.percentile(train.len_description.values, 99)
#train.loc[train['len_description'] > ulimit , 'len_description'] = ulimit
sns.countplot(train.len_description)
plt.xlabel('Length of description')
plt.ylabel('Number of occurrences')
plt.show()
fig = plt.figure()
order = ['low', 'medium', 'high']
#ulimit = np.percentile(train.len_description.values, 99)
#train.loc[train['len_description'] > ulimit , 'len_description'] = ulimit
sns.stripplot(train["interest_level"] , train["len_description"] , jitter=True , order=order)
plt.title("Length of description Vs Interest_level")
plt.figure()
sns.violinplot(x="len_description", y="interest_level", data=train, order=['low','medium','high'])
plt.xlabel('Length of Description')
plt.ylabel('Interest Level')
plt.show()
sns.countplot(train.num_description_words)
plt.xlabel('Number of words of description')
plt.ylabel('Number of occurrences')
fig = plt.figure()
order = ['low', 'medium', 'high']
#ulimit = np.percentile(train.num_description_words.values, 99)
#ulimit = 500
#train.loc[train['num_description_words'] > ulimit , 'num_description_words'] = ulimit
sns.stripplot(train["interest_level"] , train["num_description_words"] , jitter=True , order=order)
plt.title("Number of description words Vs Interest_level")
plt.figure()
sns.violinplot(x="num_description_words", y="interest_level", data=train, order=['low','medium','high'])
plt.xlabel('Number of Description Words')
plt.ylabel('Interest Level')
plt.show()
WordCloud homepage: https://amueller.github.io/word_cloud/
GitHub: https://github.com/amueller/word_cloud
WordCloud downloads (Windows wheels): https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
from wordcloud import WordCloud

text = ''
text_da = ''
text_street = ''
#text_desc = ''
for ind, row in train.iterrows():
    for feature in row['features']:
        text = " ".join([text, "_".join(feature.strip().split(" "))])
    text_da = " ".join([text_da, "_".join(row['display_address'].strip().split(" "))])
    text_street = " ".join([text_street, "_".join(row['street_address'].strip().split(" "))])
    #text_desc = " ".join([text_desc, row['description']])
text = text.strip()
text_da = text_da.strip()
text_street = text_street.strip()
#text_desc = text_desc.strip()

# wordcloud for features
plt.figure(figsize=(12,6))
wordcloud = WordCloud(background_color='white', width=600, height=300,
                      max_font_size=50, max_words=40).generate(text)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for features", fontsize=30)
plt.axis("off")
plt.show()

# wordcloud for display address
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300,
                      max_font_size=50, max_words=40).generate(text_da)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for Display Address", fontsize=30)
plt.axis("off")
plt.show()

# wordcloud for street address
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300,
                      max_font_size=50, max_words=40).generate(text_street)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for Street Address", fontsize=30)
plt.axis("off")
plt.show()
contFeatureslist = []
contFeatureslist.append("bathrooms")
contFeatureslist.append("bedrooms")
contFeatureslist.append("price")
print(contFeatureslist)
correlationMatrix = train[contFeatureslist].corr().abs()

plt.subplots(figsize=(13, 9))
sns.heatmap(correlationMatrix, annot=True)

# Mask every cell whose |correlation| is below 1 (only the diagonal remains)
sns.heatmap(correlationMatrix, mask=correlationMatrix < 1, cbar=False)
plt.show()
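Since the correlation matrix is symmetric, a common alternative (a sketch, not part of the original kernel) is to mask the redundant upper triangle instead:

# Hide the upper triangle; the lower triangle carries all the information
mask = np.zeros_like(correlationMatrix , dtype = bool)
mask[np.triu_indices_from(mask)] = True
plt.subplots(figsize = (13 , 9))
sns.heatmap(correlationMatrix , mask = mask , annot = True)
plt.show()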