# A Series is a one-dimensional labelled sequence; a DataFrame is a
# two-dimensional table.  Every column of a DataFrame can be viewed as a Series.
import pandas as pd
import numpy as np

# --- Series with the default integer index --------------------------------
s1 = pd.Series([4, 7, -5, 3])
print(s1)
print('serise的值为:\n', s1.values)  # underlying values
# The index prints as RangeIndex(start=0, stop=4, step=1): it starts at 0,
# stops before 4 and advances in steps of 1.
print("series的索引为:\n", s1.index)
print('------------------------')

# --- Series with explicit labels -------------------------------------------
s2 = pd.Series([4.0, 6.5, -0.5, 4.2], index=['d', 'b', 'a', 'c'])
print(s2)
print("s2['a']的值为:\n", s2['a'])  # look up a single label
print("s2[['a','b','c']]的值为:\n", s2[['a', 'b', 'c']])  # look up several labels
print('------------------------')

# Membership tests are evaluated against the index labels, not the values.
print('b' in s2)
print('e' in s2)

# A Series can be thought of as a fixed-length ordered dict.
dic1 = {'apple': 5, 'pen': 3, 'applepen': 10}
s3 = pd.Series(dic1)
print(s3)
print('------------------------')

# --- DataFrame built from a dict of equal-length columns -------------------
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [10000, 30000, 50000, 80000],
    'pay': [5000, 20000, 30000, 30000],
}
df1 = pd.DataFrame(data)
print(df1)
print('df1的列名为:\n', df1.columns)  # column labels
print('df1的行信息为:\n', df1.index)  # row labels
print('df1的值为:\n', df1.values)
print('df1的统计信息为:\n', df1.describe())
print('df1的转置为:\n', df1.T)
print('------------------------')

# --- Sorting by labels and by values ---------------------------------------
df3 = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['a', 'c', 'b'],
    columns=[2, 33, 44, 5],
)
print(df3)
df3_col_sort = df3.sort_index(axis=1)  # sort by column labels
print('列排序的结果为:\n', df3_col_sort)
df3_row_sort = df3.sort_index(axis=0)  # sort by row labels
print('行排序的结果为:\n', df3_row_sort)
df3_by_44 = df3.sort_values(by=44)  # sort rows by the values in column 44
print('对列名为44的列排序的结果为:\n', df3_by_44)
# Selecting data from a DataFrame whose rows are labelled by six consecutive
# dates and whose columns are 'A'..'D'.
dates = pd.date_range('20170101', periods=6)
df1 = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df1)

# --- Basic selection --------------------------------------------------------
print('将A列获取为一个series后的结果为:\n', df1['A'])  # one column as a Series
print('选择第B的结果为:\n', df1.B)  # attribute access to column B
print('选择0-1行的结果为:\n', df1[0:2])  # a slice inside [] selects rows 0-1
# A label slice inside [] also selects ROWS (both end points included).
print('选择20170102-20170104的列的结果为:\n', df1['20170102':'20170104'])
print('------------------------')

# --- Selection by label (.loc) ---------------------------------------------
print('\n以下为通过标签选择数据:\n')
print('通过标签20170102选择数据的结果为:\n', df1.loc['20170102'])
print('通过行标签20170101和列标签A、C选择数据的结果为:\n',
      df1.loc['20170101', ['A', 'C']])
print('选择列标签为A和B的所有数据的结果为:\n', df1.loc[:, ['A', 'B']])
print('------------------------')

# --- Selection by position (.iloc) -----------------------------------------
print('\n以下为通过位置选择数据:\n')
print('选择第二行的数据的结果为:\n', df1.iloc[2])  # row at position 2
print('选择第1到3行和2到4列的结果为:\n', df1.iloc[1:3, 2:4])
print('选择1,2,4行和1,3列的结果为:\n', df1.iloc[[1, 2, 4], [1, 3]])
print('------------------------')

# --- Mixed position / label selection --------------------------------------
# DataFrame.ix was removed in pandas 1.0, so each mixed lookup is written as
# an explicit positional step (.iloc) combined with a label step (.loc).
print('\n以下为混合标签位置选择:\n')
rows_by_pos_cols_by_label = df1.iloc[2:4].loc[:, ['A', 'C']]
print('选择第2-4行和A、C列的结果为:\n', rows_by_pos_cols_by_label)
rows_by_label_cols_by_pos = df1.loc['20170102':'20170104'].iloc[:, 2:4]
print('选择20170104行和2-4列的结果为:\n', rows_by_label_cols_by_pos)

# --- Boolean selection ------------------------------------------------------
print('第A列的值与6进行逻辑比较后的结果为:\n', df1.A > 6)
print('选择第A列中,值大于6的所有行:\n', df1[df1.A > 6])
# Updating, adding and removing cells, columns and rows of a DataFrame whose
# rows are labelled by the integers 20170101..20170106.
dates = np.arange(20170101, 20170107)
df1 = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df1)

# --- Updating single cells --------------------------------------------------
df1.iloc[2, 2] = 100  # by position
print('更新后第2行第2列的值为:', df1.iloc[2, 2])
df1.loc[20170102, 'B'] = 200  # by label
print('更新后行标签为20170102,列标签为B的值为:', df1.loc[20170102, 'B'])
print('------------------------')

# --- Updating through boolean masks ----------------------------------------
df1[df1.A > 10] = 0  # zero every row whose A value exceeds 10
print('将第A列的值大于10的所有行赋值为0:\n', df1)
# Use a single .loc call instead of the chained `df1.A[mask] = 1`: chained
# assignment may write to a temporary copy and leave df1 unchanged.
df1.loc[df1.A == 0, 'A'] = 1
print('将第A列的值等于0的行中,将标签为A的行的值赋值为1:\n', df1)

# --- Adding columns and rows ------------------------------------------------
df1['E'] = 10  # scalar broadcast to a new column
print('添加一列值为10的列:\n', df1)
df1['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)  # aligned on the index
print('将一个指定的series添加为一列:\n', df1)
df1.loc[20170107, ['A', 'B', 'C']] = [1, 2, 3]  # new row; other columns are NaN
print('添加一行,且该行A、B、C列的值分别为1,2,3:\n', df1)
print('------------------------')

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=['A', 'B', 'C', 'D', 'E', 'F'])
s1.name = 'S1'
# DataFrame.append was removed in pandas 2.0; turning the Series into a
# one-row frame (labelled by its name) and concatenating is the replacement.
df2 = pd.concat([df1, s1.to_frame().T])
print('将一个定义好的series添加为df1的一行:\n', df2)

# --- Inserting, moving and deleting columns --------------------------------
# Insert df2's column E at position 1 under the name G; values are aligned on
# df1's index, so the extra 'S1' row of df2 is ignored.
df1.insert(1, 'G', df2['E'])
print('在第一列插入索引为G的df2中的第E列:\n', df1)
g = df1.pop('G')  # remove column G and keep it
df1.insert(len(df1.columns), 'G', g)  # re-insert it as the last column
print('弹出第G列,并将其插入df1中的最后一列:\n', df1)
del df1['G']  # delete column G in place
print('删除第G列后的结果为:\n', df1)

# drop() returns a new frame and leaves df1 untouched.
df2 = df1.drop(['A', 'B'], axis=1)  # drop columns A and B
print('删除第A列和B列后的结果为:\n', df2)
df2 = df1.drop([20170101, 20170102], axis=0)  # drop two rows by label
print('删除标签为20170101和20170102的行:\n', df2)
# Detecting, dropping and filling missing values (NaN).
dates = np.arange(20170101, 20170105)
df1 = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=dates,
    columns=['A', 'B', 'C'],
)
print(df1)

# Re-building from df1 with two extra column labels creates D and E as
# all-NaN columns.
df2 = pd.DataFrame(df1, index=dates, columns=['A', 'B', 'C', 'D', 'E'])
print(df2)

# Assignment aligns on the index, so the row missing from each Series
# stays NaN.
s1 = pd.Series([3, 4, 6], index=dates[:3])
s2 = pd.Series([32, 5, 2], index=dates[1:])
df2['D'] = s1
df2['E'] = s2
print(df2)
print('------------------------')

# dropna: axis=0 works on rows, axis=1 on columns; how='any' drops when at
# least one value is missing, how='all' only when every value is missing.
df3 = df2.dropna(axis=0, how='any')
print("去除掉有空值的行:\n", df3)
df3 = df2.dropna(axis=1, how='any')
print("去除掉有空值的列:\n", df3)

df3 = df2.fillna(value=0)  # replace missing values with 0
print('将空值赋值为0:\n', df3)
df3 = df2.isnull()  # boolean mask of missing values
print("空值的查看:\n", df3)

# Reduce over the underlying ndarray so the result is a single boolean
# regardless of how numpy reductions are dispatched to a DataFrame.
df3 = df2.isnull().values.any()  # True if at least one value is missing
print('检测是否有空值:\n', df3)
df3 = df2.isnull().values.all()  # True only if every value is missing
print('检测是否所有值都是空值:\n', df3)
# Read a GBK-encoded CSV file, change one cell and write the result back out.
# NOTE(review): assumes 'file1.txt' exists in the working directory and has
# at least two data rows - confirm against the data file.
file = pd.read_csv('file1.txt', encoding='gbk')
print(file)

file.iloc[1, 0] = '皮卡丘'  # overwrite the first column of the second row
print(file)

# The row index is written as the first column (to_csv default).
file.to_csv('people2.csv', encoding='gbk')
# Combining DataFrames with pd.concat.

# --- Frames with identical columns -----------------------------------------
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.arange(24, 36).reshape((3, 4)), columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)
print('------------------------')

df4 = pd.concat([df1, df2, df3], axis=0)  # stack vertically, keep row labels
print('纵向合并df1,df2,df3:\n', df4)
# Stack vertically and renumber the rows instead of keeping the old labels.
df4 = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print('不考虑原来的Index的纵向合并:\n', df4)
print('------------------------')

df5 = pd.concat([df1, df2, df3], axis=1)  # place side by side
print('横向合并df1,df2,df3:\n', df5)
# join='inner' keeps only the columns present in every input.
df7 = pd.concat([df1, df2], join='inner', ignore_index=True)
print('合并两个表,缺少的部分去掉:\n', df7)
print('------------------------')

# --- Frames with partly different columns ----------------------------------
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'c', 'd', 'e'])
print('df1为:\n', df1)
print('df2为:\n', df2)
# join='outer' keeps the union of the columns and fills the gaps with NaN.
df6 = pd.concat([df1, df2], join='outer', ignore_index=True)
print('合并两个表,缺少的部分填充NaN', df6)
print('------------------------')

# --- Frames with a different number of rows --------------------------------
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((4, 3)), columns=['a', 'c', 'd'])
print('df1为:\n', df1)
print('df2为:\n', df2)
# The join_axes argument was removed in pandas 1.0; re-indexing the result
# on df1's index restricts the rows to those of df1 in the same way.
df8_on_df1_index = pd.concat([df1, df2], axis=1).reindex(df1.index)
print('横向合并df1,df2,index使用df1的index:\n', df8_on_df1_index)
df8 = pd.concat([df1, df2], axis=1)  # plain side-by-side concat (union of rows)
print('横向合并df1,df2:\n', df8)
# Joining DataFrames with pd.merge (comparable to SQL joins).

# --- Merge on a single key column ------------------------------------------
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
res = pd.merge(left, right, on='key')  # join the two tables on 'key'
print("\n", res)
print('------------------------')

# --- Merge on two key columns ----------------------------------------------
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K3'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# `how` is one of 'left', 'right', 'inner', 'outer'; the default is 'inner'.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print('outer方式合并:\n', res)
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print('inner方式合并:\n', res)
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print('left方式合并:\n', res)
# indicator=True adds a '_merge' column naming the source of each row.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer', indicator=True)
print('out方式合并,显示merge信息:\n', res)
# Passing a string instead chooses the name of that indicator column.
res = pd.merge(left, right, on=['key1', 'key2'], how='outer',
               indicator='indicator_column')
print('out方式合并,显示merge信息,并指定merge信息的标签:\n', res)
print('------------------------')

# --- Merge on the row index -------------------------------------------------
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print('out方式合并:\n', res)

# --- Overlapping column names ----------------------------------------------
# Both frames have an 'age' column; `suffixes` renames them in the result.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
print(girls)
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')
print('合并时对同名列重命名示例:\n', res)
import matplotlib.pyplot as plt

# --- Line plot of a Series --------------------------------------------------
# Cumulative sum of 1000 standard-normal samples.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# --- Line plot of a DataFrame (one line per column) ------------------------
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# --- Two scatter plots sharing one set of axes -----------------------------
ax = data.plot.scatter(x='A', y='B', color='Blue', label='class 1')
# Passing ax= draws the second scatter on the axes returned by the first.
data.plot.scatter(x='A', y='C', color='Green', label='class 2', ax=ax)
plt.show()