# 查看df2对象各列的数据类型
print(df2.dtypes)
>
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
观察数据
查看一个DataFrame对象的前几行和最后几行。
1 2 3 4 5 6 7 8 9 10 11 12 13 14
print(df.head())
print(df.tail(3))
# .head()和.tail()默认分别输出开头和末尾的5行,也可以手动指定输出行数。
>
                   A         B         C         D
2013-01-01  0.194508 -0.897507  0.224745  0.090260
2013-01-02  2.412146 -1.191852 -1.644737  0.190971
2013-01-03 -0.674645  0.395960  1.425822 -0.718231
2013-01-04  0.349046  0.315026  2.058357 -0.882345
2013-01-05  1.467093  0.146932 -0.680309 -0.519155
                   A         B         C         D
2013-01-04  0.349046  0.315026  2.058357 -0.882345
2013-01-05  1.467093  0.146932 -0.680309 -0.519155
2013-01-06  2.223284 -0.305247 -0.559043  1.017710
# 统计
print(df.describe())
>
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.995239 -0.256115  0.137473 -0.136798
std    1.231715  0.663815  1.391936  0.711615
min   -0.674645 -1.191852 -1.644737 -0.882345
25%    0.233143 -0.749442 -0.649992 -0.668462
50%    0.908069 -0.079158 -0.167149 -0.214448
75%    2.034236  0.273003  1.125553  0.165793
max    2.412146  0.395960  2.058357  1.017710
# 用[]对DataFrame进行切片
print(df[0:3])
print(df['20130102':'20130104'])
>
                   A         B         C         D
2013-01-01  0.194508 -0.897507  0.224745  0.090260
2013-01-02  2.412146 -1.191852 -1.644737  0.190971
2013-01-03 -0.674645  0.395960  1.425822 -0.718231
                   A         B         C         D
2013-01-02  2.412146 -1.191852 -1.644737  0.190971
2013-01-03 -0.674645  0.395960  1.425822 -0.718231
2013-01-04  0.349046  0.315026  2.058357 -0.882345
# 选中一整行
print(df.loc[dates[0]])
>
A    0.194508
B   -0.897507
C    0.224745
D    0.090260
Name: 2013-01-01 00:00:00, dtype: float64
# 按标签选中多列(所有行)
print(df.loc[:,['A','B']])
>
                   A         B
2013-01-01  0.194508 -0.897507
2013-01-02  2.412146 -1.191852
2013-01-03 -0.674645  0.395960
2013-01-04  0.349046  0.315026
2013-01-05  1.467093  0.146932
2013-01-06  2.223284 -0.305247
# 行/列同时划分(包括起止点)
print(df.loc['20130102':'20130104',['A','B']])
>
                   A         B
2013-01-02  2.412146 -1.191852
2013-01-03 -0.674645  0.395960
2013-01-04  0.349046  0.315026
# 位置索引为3的行(从0开始,所以其实是第4行)
print(df.iloc[3])
>
A    0.349046
B    0.315026
C    2.058357
D   -0.882345
Name: 2013-01-04 00:00:00, dtype: float64
# 按位置索引对DataFrame进行切片
print(df.iloc[3:5,0:2])
print(df.iloc[[1,2,4],[0,2]])
>
                   A         B
2013-01-04  0.349046  0.315026
2013-01-05  1.467093  0.146932
# 直接定位一个特定元素
df.iloc[1,1]
df.iat[1,1]
# 注:下面的输出是 df.iloc[[1,2,4],[0,2]] 的结果;df.iloc[1,1] 和 df.iat[1,1] 返回的是单个标量值。
>
                   A         C
2013-01-02  2.412146 -1.644737
2013-01-03 -0.674645  1.425822
2013-01-05  1.467093 -0.680309
# 提取df2中'E'值属于['two', 'four']的行
print(df2[df2['E'].isin(['two','four'])])
>
                   A         B         C         D     E
2013-01-03 -0.674645  0.395960  1.425822 -0.718231   two
2013-01-05  1.467093  0.146932 -0.680309 -0.519155  four
print(df1)
>
                   A         B         C         D    E
2013-01-01  0.194508 -0.897507  0.224745  0.090260  1.0
2013-01-02  2.412146 -1.191852 -1.644737  0.190971  1.0
2013-01-03 -0.674645  0.395960  1.425822 -0.718231  NaN
2013-01-04  0.349046  0.315026  2.058357 -0.882345  NaN
# 求平均值
print(df.mean())
>
A   -0.190821
B   -0.050040
C   -0.203207
D    5.000000
F    3.000000
dtype: float64
# 指定求平均值的轴
print(df.mean(1))
>
2013-01-01    1.264749
2013-01-02    1.049748
2013-01-03    1.578067
2013-01-04    1.035639
2013-01-05    1.855754
2013-01-06    1.936110
Freq: D, dtype: float64
# 创建Series对象s,以dates为索引并平移2个位置
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
print(s)
>
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
# 从df中逐列减去s(若有NaN则得NaN)
print(df.sub(s, axis='index'))
>
                   A         B         C    D    F
2013-01-01       NaN       NaN       NaN  NaN  NaN
2013-01-02       NaN       NaN       NaN  NaN  NaN
2013-01-03 -1.572312  0.570952 -1.108305  4.0  1.0
2013-01-04 -3.401496 -4.304842 -4.115467  2.0  0.0
2013-01-05 -4.955735 -5.576715 -4.188780  0.0 -1.0
2013-01-06       NaN       NaN       NaN  NaN  NaN
# 对每一列沿行方向逐行累加(每列各自求累计和)
print(df.apply(np.cumsum))
>
                   A         B         C   D     F
2013-01-01  0.000000  0.000000  0.058997   5   NaN
2013-01-02  0.277465 -0.161767 -0.807960  10   1.0
2013-01-03 -0.294847  1.409186 -0.916265  15   3.0
2013-01-04 -0.696343  0.104344 -2.031732  20   6.0
2013-01-05 -0.652078 -0.472372 -1.220512  25  10.0
2013-01-06 -1.144929 -0.300242 -1.219241  30  15.0
# 每列的最大值减最小值 print(df.apply(lambda x: x.max() - x.min())) > A 0.849776 B 2.875794 C 1.926687 D 0.000000 F 4.000000 dtype: float64
字符
Series对象的str属性具有一系列字符处理方法,可以很轻松地操作数组的每个元素。
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# 字符串变为小写 s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) print(s.str.lower()) > 0 a 1 b 2 c 3 aaba 4 baca 5 NaN 6 caba 7 dog 8 cat dtype: object
print(df)
>
     A      B         C         D
0  foo    one  0.298545 -0.101893
1  bar    one  1.080680 -0.717276
2  foo    two  1.365395  0.939482
3  bar  three  0.783108 -0.575995
4  foo    two -1.089990 -0.033826
5  bar    two  0.442084  1.533146
6  foo    one  0.041715  0.190613
7  foo  three  0.529231  0.380723
# 按'A'列分组并应用.sum()函数
print(df.groupby('A').sum())
>
            C         D
A
bar  2.305871  0.239875
foo  1.144897  1.375099
# 按'A'、'B'两列分组,形成层级索引,再应用.sum()函数
print(df.groupby(['A','B']).sum())
>
                  C         D
A   B
bar one    1.080680 -0.717276
    three  0.783108 -0.575995
    two    0.442084  1.533146
foo one    0.340260  0.088720
    three  0.529231  0.380723
    two    0.275406  0.905656
print(df2)
>
                     A         B
first second
bar   one    -1.144920 -0.823033
      two     0.250615 -1.423107
baz   one     0.291435 -1.580619
      two    -0.574831 -0.742291
# .stack()方法将DataFrame的列“压缩”了一级 stacked = df2.stack() print(stacked) > first second bar one A -1.144920 B -0.823033 two A 0.250615 B -1.423107 baz one A 0.291435 B -1.580619 two A -0.574831 B -0.742291 dtype: float64
print(stacked.unstack())
>
                     A         B
first second
bar   one    -1.144920 -0.823033
      two     0.250615 -1.423107
baz   one     0.291435 -1.580619
      two    -0.574831 -0.742291
print(stacked.unstack(1))
>
second        one       two
first
bar   A -1.144920  0.250615
      B -0.823033 -1.423107
baz   A  0.291435 -0.574831
      B -1.580619 -0.742291
print(stacked.unstack(0))
>
first          bar       baz
second
one    A -1.144920  0.291435
       B -0.823033 -1.580619
two    A  0.250615 -0.574831
       B -1.423107 -0.742291
print(df)
>
        A  B    C         D         E
0     one  A  foo -0.411674  0.284523
1     one  B  foo -1.217944  1.519293
2     two  C  foo  0.502824 -0.167898
3   three  A  bar  0.565186  0.226860
4     one  B  bar  0.626023  0.401529
5     one  C  bar -0.437217  0.832881
6     two  A  foo -0.825128  0.346303
7   three  B  foo  0.069236  0.728729
8     one  C  foo  1.647690 -0.531091
9     one  A  bar -0.881553  0.070718
10    two  B  bar  0.203672  1.601761
11  three  C  bar  1.334214 -0.778639
# 生成数据透视表
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
>
C             bar       foo
A     B
one   A -0.881553 -0.411674
      B  0.626023 -1.217944
      C -0.437217  1.647690
three A  0.565186       NaN
      B       NaN  0.069236
      C  1.334214       NaN
two   A       NaN -0.825128
      B  0.203672       NaN
      C       NaN  0.502824