Python練手，pandas

發布時間：2020-07-21 03:47:05 來源：網絡閱讀：936 作者：hadoooo 欄目：大數據
'''
http://pandas.pydata.org/pandas-docs/stable/10min.html

    numpy的主要數據結構是ndarray
    pandas的主要數據結構是Series、DataFrame
'''


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  

# df1: a 6x4 integer table holding 101..124 in row-major order,
# with row labels 0..5 and column labels A..D.
df1 = pd.DataFrame(
    np.arange(101, 125).reshape(6, 4),
    index=range(6),
    columns=['A', 'B', 'C', 'D'],
)
print(df1)
# Output recorded by the original author:
#      A    B    C    D
# 0  101  102  103  104
# 1  105  106  107  108
# 2  109  110  111  112
# 3  113  114  115  116
# 4  117  118  119  120
# 5  121  122  123  124

# df2: the demo transaction table used by most of the examples below.
#
# The frame and every Series column share the same index labels (0..5).
# The labels passed via `index=` are labels, not positions: if a column is
# supplied as a Series, pandas aligns it on those labels, so a Series with
# different labels would end up misaligned (NaN rows) instead of raising.
#
# `columns=` pins the column order explicitly. Without it the order follows
# the dict (insertion order on current pandas, sorted keys on old releases),
# while the recorded outputs and the positional iloc/iat examples later in
# this script assume the alphabetical order listed here.
df2 = pd.DataFrame(
    {
        'custID': ['C0001', 'C0002', 'C0004', 'C0004', 'C0004', 'C0003'],
        'accountID': pd.Series(['6214C000101',
                                '6214C000201',
                                '6214C000401',
                                '6214C000403',
                                '6214C000402',
                                '6214C000301'], index=range(6), dtype='str'),
        'tradeDate': pd.Series(['2018-01-18 14:00:00',
                                '2018-01-18 14:00:00',
                                '2018-01-18 14:00:01',
                                '2018-01-18 14:00:03',
                                '2018-01-18 14:00:02',
                                '2018-01-18 14:00:00'], index=range(6), dtype='str'),
        'tradeAmt': pd.Series([100.0,
                               100.0,
                               101.0,
                               103.0,
                               102.0,
                               100.0], index=range(6), dtype='float'),
        'tradeDesc': 'xxxxxx',  # scalar: broadcast to every row
        'mark': pd.Categorical(["row1", "row2", "row3", "row4", "row5", "row6"]),
    },
    index=range(6),
    columns=['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'],
)
print(df2)
# Output recorded by the original author:
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Quick inspection of df2. The commented outputs are the results recorded by
# the original author; exact formatting may differ between pandas versions.

print(df2.dtypes)  # dtype of every column
# accountID      object
# custID         object
# mark         category
# tradeAmt      float64
# tradeDate      object
# tradeDesc      object
# dtype: object

print(df2.index)  # row labels
# RangeIndex(start=0, stop=6, step=1)

print(df2.columns)  # column labels
# Index(['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'], dtype='object')

print(df2.values)  # the data as a NumPy array
# [['6214C000101' 'C0001' 'row1' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000201' 'C0002' 'row2' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000401' 'C0004' 'row3' 101.0 '2018-01-18 14:00:01' 'xxxxxx']
#  ['6214C000403' 'C0004' 'row4' 103.0 '2018-01-18 14:00:03' 'xxxxxx']
#  ['6214C000402' 'C0004' 'row5' 102.0 '2018-01-18 14:00:02' 'xxxxxx']
#  ['6214C000301' 'C0003' 'row6' 100.0 '2018-01-18 14:00:00' 'xxxxxx']]

print(df2.head(n=2))  # first two rows
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx

print(df2.tail(n=2))  # last two rows
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx


# Summary statistics: only the numeric column shows up in the recorded
# output; the non-numeric columns are not summarised.
print(df2.describe())
#          tradeAmt
# count    6.000000
# mean   101.000000
# std      1.264911
# min    100.000000
# 25%    100.000000
# 50%    100.500000
# 75%    101.750000
# max    103.000000

print(df2.transpose())  # rows become columns and vice versa
#                              0                    1                    2  \
# accountID          6214C000101          6214C000201          6214C000401
# custID                   C0001                C0002                C0004
# mark                      row1                 row2                 row3
# tradeAmt                   100                  100                  101
# tradeDate  2018-01-18 14:00:00  2018-01-18 14:00:00  2018-01-18 14:00:01
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
#
#                              3                    4                    5
# accountID          6214C000403          6214C000402          6214C000301
# custID                   C0004                C0004                C0003
# mark                      row4                 row5                 row6
# tradeAmt                   103                  102                  100
# tradeDate  2018-01-18 14:00:03  2018-01-18 14:00:02  2018-01-18 14:00:00
# tradeDesc               xxxxxx               xxxxxx               xxxxxx

print('------------------------------------------------------------------------------------')

# Sorting examples. Outputs are the ones recorded by the original author.

# Sort by the values of one column, descending.
print(df2.sort_values('tradeDate', ascending=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Composite sort: custID ascending, then tradeDate descending within each custID.
print(df2.sort_values(['custID', 'tradeDate'], ascending=[True, False]))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx

# Sort by the row labels, descending.
print(df2.sort_index(axis='index', ascending=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx

# Sort by the column labels (ascending is the default), i.e. reorder the
# columns by name while leaving the rows where they are.
print(df2.sort_index(axis='columns'))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

print('------------------------------------------------------------------------------------')

# Selection cheat-sheet:
#   iloc selects by integer position, loc  selects by label (rows/columns);
#   iat  reads one cell by position,  at   reads one cell by label.
#
# The positional examples below depend on df2's column order. The recorded
# outputs assume the order
#   accountID, custID, mark, tradeAmt, tradeDate, tradeDesc.

print(df2['custID'])  # a single column, returned as a Series
# 0    C0001
# 1    C0002
# 2    C0004
# 3    C0004
# 4    C0004
# 5    C0003
# Name: custID, dtype: object

print(df2[0:4])  # slicing with [] selects rows (end is exclusive)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2[1:4])  # rows 1..3
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2.loc[1, 'accountID'])  # lookup by row/column LABEL, not by position
# 6214C000201

print(df2.iloc[3])  # 4th row (position 3), returned as a Series
# accountID            6214C000403
# custID                     C0004
# mark                        row4
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# Name: 3, dtype: object

print(df2.iloc[3, 4])  # 4th row, 5th column
# 2018-01-18 14:00:03

print(df2.iloc[3:4])  # positional row slice 3..4, end exclusive -> only the 4th row
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print(df2.iloc[3:5, 1:3])  # 4th-5th rows, 2nd-3rd columns (by position)
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5
print(df2.iloc[[3, 4], [1, 2]])  # same selection using explicit position lists
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5

print(df2.iloc[3:5, :])  # 4th-5th rows, all columns
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx

print(df2.iloc[:, 1:3])  # all rows, 2nd-3rd columns
#   custID  mark
# 0  C0001  row1
# 1  C0002  row2
# 2  C0004  row3
# 3  C0004  row4
# 4  C0004  row5
# 5  C0003  row6

print(df2[df2.tradeAmt > 101.0])  # boolean filter: keep rows where tradeAmt > 101
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx

print('------------------------------------------------------------------------------------')

# Updating values on a copy, so df2 itself stays untouched.
df3 = df2.copy()
df3["custID"] = ["NEW"] * len(df3)        # replace a whole column
df3.loc[:, 'tradeAmt'] = range(len(df3))  # update via label-based selection
df3.at[1, 'accountID'] = '==========='    # update one cell by row/column label
# Update one cell by position. The column position is looked up by name so
# the cell written is accountID (as in the recorded output) regardless of
# how the columns happen to be ordered.
df3.iat[0, df3.columns.get_loc('accountID')] = '+++++++++++'
# Tip from the original author: with an all-numeric frame, matching rows can
# be negated in place, e.g.
#   df3[df3.tradeDate == '2018-01-18 14:00:03'] = -df3
print(df3)
# Output recorded by the original author (NOTE: depending on the pandas
# version, tradeAmt may be displayed as 0.0 .. 5.0 instead of integers):
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  +++++++++++    NEW  row1         0  2018-01-18 14:00:00    xxxxxx
# 1  ===========    NEW  row2         1  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401    NEW  row3         2  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403    NEW  row4         3  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402    NEW  row5         4  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301    NEW  row6         5  2018-01-18 14:00:00    xxxxxx

print('------------------------------------------------------------------------------------')

# Re-shape df2 into a smaller frame: keep row labels 0..3 and three columns.
df4 = df2.reindex(index=range(4), columns=['custID', 'accountID', 'tradeAmt'])
# .loc slices by label and includes both ends, so 0:1 covers rows 0 and 1.
df4.loc[0:1, 'tradeAmt'] = 200   # existing column -> values are updated
df4.loc[0:1, 'newColumn'] = 1    # missing column  -> column is created, other rows NaN
print(df4)
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0        NaN
# 3  C0004  6214C000403     103.0        NaN

# Drop every row containing at least one missing value (how='any' is the default).
print(df4.dropna())
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0

# Replace missing values with a constant.
print(df4.fillna(999))
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0      999.0
# 3  C0004  6214C000403     103.0      999.0

# Boolean mask marking the missing cells.
print(df4.isnull())
#   custID accountID tradeAmt newColumn
# 0  False     False    False     False
# 1  False     False    False     False
# 2  False     False    False      True
# 3  False     False    False      True

print('------------------------------------------------------------------------------------')

print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx

# Column means. df2 mixes string, categorical and float columns, so the
# reduction is restricted to numeric columns explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises TypeError instead.
print(df2.mean(numeric_only=True))
# tradeAmt    101.0
# dtype: float64



# shift(2) moves the values two rows down; the first two slots become NaN
# and the last two values fall off the end.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=range(6)).shift(2)
print(s)
# 0    NaN
# 1    NaN
# 2    1.0
# 3    3.0
# 4    5.0
# 5    NaN
# dtype: float64

# The same shift applied to every column of a DataFrame.
print(df2.shift(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0          NaN    NaN   NaN       NaN                  NaN       NaN
# 1          NaN    NaN   NaN       NaN                  NaN       NaN
# 2  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 5  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx

print('------------------------------------------------------------------------------------')

# apply() calls the function once per column; a lambda or a named function
# both work. Here each column is reduced to its largest element.
print(df2.apply(lambda column: max(column)))
# accountID            6214C000403
# custID                     C0004
# mark                        row6
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# dtype: object

print('------------------------------------------------------------------------------------')

# Frequency of each distinct value (comparable to GROUP BY ... COUNT in SQL).
print(df2["custID"].value_counts())
# C0004    3
# C0001    1
# C0002    1
# C0003    1
# Name: custID, dtype: int64

print('------------------------------------------------------------------------------------')


# Vectorised string methods live under the .str accessor; upper() changes case.
print(df2["mark"].str.upper())
# 0    ROW1
# 1    ROW2
# 2    ROW3
# 3    ROW4
# 4    ROW5
# 5    ROW6
# Name: mark, dtype: object

print('------------------------------------------------------------------------------------')

# Concatenation demo on a 9x3 frame of random normal values.
# (The numbers below come from the original author's run and change every time.)
df5 = pd.DataFrame(np.random.randn(9, 3))
print(df5)
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 2  2.063722  0.229955  0.020368
# 3 -2.024974  0.307957 -0.579090
# 4 -1.571883  0.260561 -0.884209
# 5  2.465572 -1.001873  1.243028
# 6  0.025388 -0.372608  1.431214
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827

# Cut out a head, a middle and a tail slice (by row position) ...
pieces = [df5.iloc[:2], df5.iloc[5:6], df5.iloc[7:]]
print(pieces)
# Printing the list shows each piece with its own column header:
# [          0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800,           0         1         2
# 5  2.465572 -1.001873  1.243028,           0         1         2
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827]

# ... and glue them back together row-wise; the original row labels are kept.
print(pd.concat(pieces))
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 5  2.465572 -1.001873  1.243028
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827

print('------------------------------------------------------------------------------------')

# SQL-style joins between two small frames.
df_left = pd.DataFrame({'key': ['001', '002', '007'],
                        'val': ['999', '1', '2']})
df_right = pd.DataFrame({'key': ['001', '002', '009'],
                         'key2': ['001', '002', '009'],
                         'val': ['999', '3', '4']})

print(df_left)
#    key  val
# 0  001  999
# 1  002    1
# 2  007    2
print(df_right)
#    key key2  val
# 0  001  001  999
# 1  002  002    3
# 2  009  009    4

# Inner join on one shared column; clashing column names get _x / _y suffixes.
print(df_left.merge(df_right, how='inner', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3

# Inner join where the key columns are named differently on each side.
print(df_left.merge(df_right, how='inner', left_on='key', right_on='key2'))
#   key_x val_x key_y key2 val_y
# 0   001   999   001  001   999
# 1   002     1   002  002     3

# Inner join on several columns at once.
print(df_left.merge(df_right, how='inner', on=['key', 'val']))
#    key  val key2
# 0  001  999  001

# Left outer join: every left row is kept, unmatched right columns are NaN.
print(df_left.merge(df_right, how='left', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  007     2  NaN   NaN

# Right outer join: every right row is kept, unmatched left columns are NaN.
print(df_left.merge(df_right, how='right', on='key'))
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  009   NaN  009     4

print('------------------------------------------------------------------------------------')

# Appending rows. DataFrame.append() was removed in pandas 2.0, so the rows
# are appended with pd.concat(), which is available in every pandas version.

# Take the first three rows of df2 and append them to df2 itself.
# ignore_index=True discards the labels of both inputs and renumbers 0..n-1.
print(pd.concat([df2, df2[:3]], ignore_index=True))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 6  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 7  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 8  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx  (appended)

# ignore_index=False keeps the labels of the appended slice; note that
# duplicate index labels are allowed.
print(pd.concat([df2, df2[:3]], ignore_index=False))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx  (appended)
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx  (appended)

print('------------------------------------------------------------------------------------')

# zip() pairs up the elements of its arguments position by position, giving
# one (first, second) tuple per row label.
tuples = list(zip(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']))
# A two-level row index (MultiIndex) built from those tuples.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# Random demo values; the numbers shown are from the original author's run.
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

# stack(): turn the table into a "stacked" Series by moving the column labels
# into a new innermost level of the row index.
stacked = df6.stack()
print(stacked)
# first  second
# bar    one     A   -0.101234
#                B   -0.956210
#        two     A   -0.480354
#                B    1.308950
# baz    one     A    0.943706
#                B    0.976480
#        two     A   -0.788852
#                B   -1.556547
# foo    one     A    0.997527
#                B   -0.337391
#        two     A   -0.191448
#                B   -0.083129
# qux    one     A   -0.919527
#                B   -0.414051
#        two     A   -0.579727
#                B    1.595290
# dtype: float64

# A stacked Series can be addressed level by level, much like a nested array.
print(stacked.loc[("bar", "one", "A")])
# -0.101233870095

# unstack(): the inverse operation; the innermost row level becomes columns again.
unstacked = stacked.unstack()
print(unstacked)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290

# Further row levels can be moved into the columns: level 0 ("first") ...
unstacked_unstacked_0 = unstacked.unstack(0)
print(unstacked_unstacked_0)
#                A                                      B
# first        bar       baz       foo       qux      bar       baz       foo        qux
# second
# one    -0.101234  0.943706  0.997527 -0.919527 -0.95621  0.976480 -0.337391  -0.414051
# two    -0.480354 -0.788852 -0.191448 -0.579727  1.30895 -1.556547 -0.083129   1.595290

# ... or level 1 ("second").
unstacked_unstacked_1 = unstacked.unstack(1)
print(unstacked_unstacked_1)
#                A                   B
# second       one       two       one       two
# first
# bar    -0.101234 -0.480354 -0.956210  1.308950
# baz     0.943706 -0.788852  0.976480 -1.556547
# foo     0.997527 -0.191448 -0.337391 -0.083129
# qux    -0.919527 -0.579727 -0.414051  1.595290


print('------------------------------------------------------------------------------------')

# Twelve demo rows: three label columns (A, B, C) built by repeating short
# patterns, plus two columns of random values (D, E).
df7 = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})

print(df7)
# Sample from the original author's run (the random values differ every run):
#         A  B    C         D         E
# 0     one  A  foo -0.516297 -0.860641
# 1     one  B  foo -1.560483 -1.647366
# 2     two  C  foo  1.124756  0.329971
# 3   three  A  bar -0.312954  0.040263
# 4     one  B  bar -1.355079  0.358829
# 5     one  C  bar  0.749617  0.978513
# 6     two  A  foo -2.173830  0.434789
# 7   three  B  foo -1.070213  0.641253
# 8     one  C  foo -0.515032  0.127273
# 9     one  A  bar -1.408970  0.025128
# 10    two  B  bar -0.390044  0.060392
# 11  three  C  bar  0.067667  0.676595

# Pivot table: rows keyed by (A, B), one column per distinct C, cells taken
# from D; combinations that never occur show up as NaN.
print(df7.pivot_table(values='D', index=['A', 'B'], columns=['C']))
# C             bar       foo
# A     B
# one   A -1.408970 -0.516297
#       B -1.355079 -1.560483
#       C  0.749617 -0.515032
# three A -0.312954       NaN
#       B       NaN -1.070213
#       C  0.067667       NaN
# two   A       NaN -2.173830
#       B -0.390044       NaN
#       C       NaN  1.124756

print('------------------------------------------------------------------------------------')


# Ten timestamps, one minute apart; the result is a DatetimeIndex.
rng = pd.date_range(start='1/1/2012', periods=10, freq='min')
print(rng)
# DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:01:00',
#                '2012-01-01 00:02:00', '2012-01-01 00:03:00',
#                '2012-01-01 00:04:00', '2012-01-01 00:05:00',
#                '2012-01-01 00:06:00', '2012-01-01 00:07:00',
#                '2012-01-01 00:08:00', '2012-01-01 00:09:00'],
#               dtype='datetime64[ns]', freq='T')

# A time series: the values 0..9 indexed by those timestamps.
ts = pd.Series(range(10), index=rng)
print(ts)
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32

# resample() regroups a time series into new time buckets; here the values
# are summed per 5-minute bucket.
print(ts.resample('5min').sum())
# 2012-01-01 00:00:00    10
# 2012-01-01 00:05:00    35
# Freq: 5T, dtype: int32

# Attach a time zone (UTC) to the index.
ts_utc = ts.tz_localize('UTC')
print(ts_utc)
# 2012-01-01 00:00:00+00:00    0
# 2012-01-01 00:01:00+00:00    1
# 2012-01-01 00:02:00+00:00    2
# 2012-01-01 00:03:00+00:00    3
# 2012-01-01 00:04:00+00:00    4
# 2012-01-01 00:05:00+00:00    5
# 2012-01-01 00:06:00+00:00    6
# 2012-01-01 00:07:00+00:00    7
# 2012-01-01 00:08:00+00:00    8
# 2012-01-01 00:09:00+00:00    9
# Freq: T, dtype: int32

# Convert the localized series to another time zone.
print(ts_utc.tz_convert('US/Eastern'))
# 2011-12-31 19:00:00-05:00    0
# 2011-12-31 19:01:00-05:00    1
# 2011-12-31 19:02:00-05:00    2
# 2011-12-31 19:03:00-05:00    3
# 2011-12-31 19:04:00-05:00    4
# 2011-12-31 19:05:00-05:00    5
# 2011-12-31 19:06:00-05:00    6
# 2011-12-31 19:07:00-05:00    7
# 2011-12-31 19:08:00-05:00    8
# 2011-12-31 19:09:00-05:00    9
# Freq: T, dtype: int32


# Period representation: labels are shown only down to the series' frequency
# unit (minutes here).
print(ts.to_period())
# 2012-01-01 00:00    0
# 2012-01-01 00:01    1
# 2012-01-01 00:02    2
# 2012-01-01 00:03    3
# 2012-01-01 00:04    4
# 2012-01-01 00:05    5
# 2012-01-01 00:06    6
# 2012-01-01 00:07    7
# 2012-01-01 00:08    8
# 2012-01-01 00:09    9
# Freq: T, dtype: int32

# ... and back from periods to full timestamps.
print(ts.to_period().to_timestamp())
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32

print('------------------------------------------------------------------------------------')

# Categorical data: a column whose values come from a fixed set of labels.
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")  # new column stored as category dtype

print(df["grade"])
# 0    a
# 1    b
# 2    b
# 3    a
# 4    a
# 5    e
# Name: grade, dtype: category

# Rename the existing categories positionally (a -> very good, b -> good,
# e -> very bad). Assigning to `.cat.categories` is no longer supported
# (removed in pandas 2.0); rename_categories() returns the renamed column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
# Redefine the full category set and its order; labels without any row
# ("bad", "medium") become empty categories.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
print(df["grade"])
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]

# Row count per category. observed=False is passed explicitly so that empty
# categories are reported with a count of 0, as in the recorded output.
print(df.groupby("grade", observed=False).size())
# grade
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
# dtype: int64


print('------------------------------------------------------------------------------------')

# 1000 daily timestamps starting 2000-01-01, each with a random value,
# turned into a running total with cumsum().
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)).cumsum()

print(ts)
ts.plot()   # some environments already render the figure at this point
plt.show()  # others need matplotlib.pyplot's show() to open the figure
# The figure is a single curve: x axis = the 1000 days, y axis = the running total.

# Same idea with four columns sharing the time index; cumsum() runs per column.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D']).cumsum()
df.plot()
plt.show()
# The figure has four curves: x axis = the 1000 days, y axis = each column's running total.
向AI問一下細節
91超碰碰碰碰久久久久久综合_超碰av人澡人澡人澡人澡人掠_国产黄大片在线观看画质优化_txt小说免费全本

Python練手，pandas

猜你喜歡

91超碰碰碰碰久久久久久综合_超碰av人澡人澡人澡人澡人掠_国产黄大片在线观看画质优化_txt小说免费全本

Python練手，pandas

猜你喜歡

最新資訊

相關推薦

相關標簽