Python practice: pandas
'''
http://pandas.pydata.org/pandas-docs/stable/10min.html
numpy's main data structure is the ndarray;
pandas' main data structures are Series and DataFrame.
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df1 = pd.DataFrame(np.array(range(101, 125)).reshape(6, 4), index=range(6), columns=list('ABCD'))
print(df1)
#      A    B    C    D
# 0  101  102  103  104
# 1  105  106  107  108
# 2  109  110  111  112
# 3  113  114  115  116
# 4  117  118  119  120
# 5  121  122  123  124

df2 = pd.DataFrame({'custID': ['C0001', 'C0002', 'C0004', 'C0004', 'C0004', 'C0003'],
                    'accountID': pd.Series(['6214C000101', '6214C000201', '6214C000401',
                                            '6214C000403', '6214C000402', '6214C000301'],
                                           index=range(6), dtype='str'),
                    'tradeDate': pd.Series(['2018-01-18 14:00:00', '2018-01-18 14:00:00',
                                            '2018-01-18 14:00:01', '2018-01-18 14:00:03',
                                            '2018-01-18 14:00:02', '2018-01-18 14:00:00'],
                                           index=range(6), dtype='str'),
                    'tradeAmt': pd.Series([100.0, 100.0, 101.0, 103.0, 102.0, 100.0],
                                          index=range(6), dtype='float'),
                    'tradeDesc': 'xxxxxx',
                    'mark': pd.Categorical(["row1", "row2", "row3", "row4", "row5", "row6"])},
                   index=range(6))
# Note: the DataFrame's index and each column Series' index must stay consistent.
# A DataFrame's underlying positions start at 0 by default; what you set here are index *labels*.
# If you customize the DataFrame's index labels and a column is given as a Series,
# that Series must carry the same labels, otherwise the values end up misaligned.
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
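To make the alignment note above concrete, here is a minimal standalone sketch (toy data, not part of the original script) of what happens when a column Series carries labels that do not match the DataFrame's index:

import pandas as pd

s = pd.Series([1, 2, 3], index=[10, 11, 12])   # Series labeled 10..12
df = pd.DataFrame({'x': s}, index=[0, 1, 2])   # DataFrame labeled 0..2
print(df)
#     x
# 0 NaN
# 1 NaN
# 2 NaN   <- no label overlap, so every value is NaN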
print(df2.dtypes)
# accountID      object
# custID         object
# mark         category
# tradeAmt      float64
# tradeDate      object
# tradeDesc      object
# dtype: object
print(df2.index)
# RangeIndex(start=0, stop=6, step=1)
print(df2.columns)
# Index(['accountID', 'custID', 'mark', 'tradeAmt', 'tradeDate', 'tradeDesc'], dtype='object')
print(df2.values)
# [['6214C000101' 'C0001' 'row1' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000201' 'C0002' 'row2' 100.0 '2018-01-18 14:00:00' 'xxxxxx']
#  ['6214C000401' 'C0004' 'row3' 101.0 '2018-01-18 14:00:01' 'xxxxxx']
#  ['6214C000403' 'C0004' 'row4' 103.0 '2018-01-18 14:00:03' 'xxxxxx']
#  ['6214C000402' 'C0004' 'row5' 102.0 '2018-01-18 14:00:02' 'xxxxxx']
#  ['6214C000301' 'C0003' 'row6' 100.0 '2018-01-18 14:00:00' 'xxxxxx']]
print(df2.head(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
print(df2.tail(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
print(df2.describe())  # summary statistics; only numeric columns are included, the rest are skipped
#          tradeAmt
# count    6.000000
# mean   101.000000
# std      1.264911
# min    100.000000
# 25%    100.000000
# 50%    100.500000
# 75%    101.750000
# max    103.000000
print(df2.T)  # transpose
#                              0                    1                    2  \
# accountID          6214C000101          6214C000201          6214C000401
# custID                   C0001                C0002                C0004
# mark                      row1                 row2                 row3
# tradeAmt                   100                  100                  101
# tradeDate  2018-01-18 14:00:00  2018-01-18 14:00:00  2018-01-18 14:00:01
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
#
#                              3                    4                    5
# accountID          6214C000403          6214C000402          6214C000301
# custID                   C0004                C0004                C0003
# mark                      row4                 row5                 row6
# tradeAmt                   103                  102                  100
# tradeDate  2018-01-18 14:00:03  2018-01-18 14:00:02  2018-01-18 14:00:00
# tradeDesc               xxxxxx               xxxxxx               xxxxxx
print('------------------------------------------------------------------------------------')
print(df2.sort_values(by='tradeDate', ascending=False))  # sort by a column's values, descending
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
print(df2.sort_values(by=['custID', 'tradeDate'], ascending=[True, False]))  # multi-column sort
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
print(df2.sort_index(axis=0, ascending=False))  # sort by the row index
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
print(df2.sort_index(axis=1, ascending=True))  # sort by the column index (derived from the column names by default)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
print('------------------------------------------------------------------------------------')
'''
iloc looks up by position, loc looks up by label;
iat looks up by position, at looks up by label.
'''
print(df2['custID'])
# 0    C0001
# 1    C0002
# 2    C0004
# 3    C0004
# 4    C0004
# 5    C0003
# Name: custID, dtype: object
print(df2[0:4])  # slice by row position
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
print(df2[1:4])  # slice by row position
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
print(df2.loc[1, 'accountID'])  # lookup by row/column label, not by position
# 6214C000201
print(df2.iloc[3])  # 4th row
# accountID            6214C000403
# custID                     C0004
# mark                        row4
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# Name: 3, dtype: object
print(df2.iloc[3, 4])  # 4th row, 5th column
# 2018-01-18 14:00:03
print(df2.iloc[3:4])  # rows 4 up to 5 (end exclusive, so just row 4)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
print(df2.iloc[3:5, 1:3])  # rows 4-5, columns 2-3 (column positions follow column-name order unless customized)
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5
print(df2.iloc[[3, 4], [1, 2]])  # rows 4-5, columns 2-3
#   custID  mark
# 3  C0004  row4
# 4  C0004  row5
print(df2.iloc[3:5, :])  # rows 4-5, all columns
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
print(df2.iloc[:, 1:3])  # all rows, columns 2-3
#   custID  mark
# 0  C0001  row1
# 1  C0002  row2
# 2  C0004  row3
# 3  C0004  row4
# 4  C0004  row5
# 5  C0003  row6
print(df2[df2.tradeAmt > 101.0])  # boolean filtering
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
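The difference between label and positional lookup only becomes visible once the index labels stop matching the positions. A small standalone sketch with invented data:

import pandas as pd

df = pd.DataFrame({'v': [10, 20, 30]}, index=[2, 1, 0])  # labels run backwards on purpose
print(df.loc[0, 'v'])   # 30 -- loc matches the row *labeled* 0
print(df.iloc[0, 0])    # 10 -- iloc takes the row at *position* 0
print(df.at[0, 'v'])    # 30 -- at: scalar lookup by label
print(df.iat[0, 0])     # 10 -- iat: scalar lookup by position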
print('------------------------------------------------------------------------------------')
df3 = df2.copy()
df3["custID"] = ["NEW", "NEW", "NEW", "NEW", "NEW", "NEW"]  # update an entire column
df3.loc[:, 'tradeAmt'] = range(len(df3))          # update via row/column labels
df3.at[range(7)[1], 'accountID'] = '==========='  # update a scalar via row/column labels
df3.iat[0, 0] = '+++++++++++'                     # update a scalar via row/column positions
# df3[df3.tradeDate == '2018-01-18 14:00:03'] = -df3  # select matching rows and negate them; only works if every column is numeric
print(df3)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  +++++++++++    NEW  row1         0  2018-01-18 14:00:00    xxxxxx
# 1  ===========    NEW  row2         1  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401    NEW  row3         2  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403    NEW  row4         3  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402    NEW  row5         4  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301    NEW  row6         5  2018-01-18 14:00:00    xxxxxx
print('------------------------------------------------------------------------------------')
df4 = df2.reindex(index=range(4), columns=['custID', 'accountID', 'tradeAmt'])  # recombine: extract a subset of rows and columns
df4.loc[0:1, 'tradeAmt'] = 200   # if the column exists, update it
df4.loc[0:1, 'newColumn'] = 1    # if the column does not exist, create it
print(df4)
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0        NaN
# 3  C0004  6214C000403     103.0        NaN
print(df4.dropna(how='any'))  # drop every row that contains a missing value
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
print(df4.fillna(value=999))  # fill missing values
#   custID    accountID  tradeAmt  newColumn
# 0  C0001  6214C000101     200.0        1.0
# 1  C0002  6214C000201     200.0        1.0
# 2  C0004  6214C000401     101.0      999.0
# 3  C0004  6214C000403     103.0      999.0
print(pd.isnull(df4))  # test for missing values
#   custID  accountID  tradeAmt  newColumn
# 0  False      False     False      False
# 1  False      False     False      False
# 2  False      False     False       True
# 3  False      False     False       True
print('------------------------------------------------------------------------------------')
print(df2)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
print(df2.mean())
# tradeAmt    101.0
# dtype: float64
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=range(6)).shift(2)  # shift values down 2 rows; the vacated slots become NaN
print(s)
# 0    NaN
# 1    NaN
# 2    1.0
# 3    3.0
# 4    5.0
# 5    NaN
# dtype: float64
print(df2.shift(2))
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0          NaN    NaN   NaN       NaN                  NaN       NaN
# 1          NaN    NaN   NaN       NaN                  NaN       NaN
# 2  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 3  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 4  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 5  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
print('------------------------------------------------------------------------------------')
print(df2.apply(lambda x: max(x)))  # apply a function per column; a lambda or a named function both work
# accountID            6214C000403
# custID                     C0004
# mark                        row6
# tradeAmt                     103
# tradeDate    2018-01-18 14:00:03
# tradeDesc                 xxxxxx
# dtype: object
print('------------------------------------------------------------------------------------')
print(df2["custID"].value_counts())  # like GROUP BY ... COUNT in SQL
# C0004    3
# C0001    1
# C0002    1
# C0003    1
# Name: custID, dtype: int64
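Since value_counts behaves like GROUP BY ... COUNT, the same numbers can be reproduced with an explicit groupby; a standalone sketch (tie order among equal counts may differ):

import pandas as pd

s = pd.Series(['C0004', 'C0001', 'C0004', 'C0002', 'C0004'])
print(s.value_counts())                                   # counts per value, descending
print(s.groupby(s).size().sort_values(ascending=False))   # the groupby equivalent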
print('------------------------------------------------------------------------------------')
print(df2["mark"].str.upper())  # string case conversion
# 0    ROW1
# 1    ROW2
# 2    ROW3
# 3    ROW4
# 4    ROW5
# 5    ROW6
# Name: mark, dtype: object
print('------------------------------------------------------------------------------------')
df5 = pd.DataFrame(np.random.randn(9, 3))
print(df5)
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 2  2.063722  0.229955  0.020368
# 3 -2.024974  0.307957 -0.579090
# 4 -1.571883  0.260561 -0.884209
# 5  2.465572 -1.001873  1.243028
# 6  0.025388 -0.372608  1.431214
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827
pieces = [df5[:2], df5[5:6], df5[7:]]  # cut a few slices (head, middle, tail) to stitch back together
print(pieces)  # a list of three DataFrames; the column header is printed once per piece
# [          0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800,           0         1         2
# 5  2.465572 -1.001873  1.243028,           0         1         2
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827]
print(pd.concat(pieces))  # concatenate the pieces back into one DataFrame
#           0         1         2
# 0  1.303158 -0.125934 -0.205285
# 1  0.760388 -1.004298  1.143800
# 5  2.465572 -1.001873  1.243028
# 7 -0.079416 -0.401075 -0.973337
# 8 -1.088755 -1.947188 -1.100827
print('------------------------------------------------------------------------------------')
df_left = pd.DataFrame({'key': ['001', '002', '007'], 'val': ['999', '1', '2']})
df_right = pd.DataFrame({'key': ['001', '002', '009'], 'key2': ['001', '002', '009'], 'val': ['999', '3', '4']})
print(df_left)
#    key  val
# 0  001  999
# 1  002    1
# 2  007    2
print(df_right)
#    key key2  val
# 0  001  001  999
# 1  002  002    3
# 2  009  009    4
print(pd.merge(df_left, df_right, how='inner', on='key'))  # inner join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
print(pd.merge(df_left, df_right, how='inner', left_on='key', right_on='key2'))  # inner join on differently named columns
#   key_x val_x key_y key2 val_y
# 0   001   999   001  001   999
# 1   002     1   002  002     3
print(pd.merge(df_left, df_right, how='inner', on=['key', 'val']))  # inner join on multiple columns
#    key  val key2
# 0  001  999  001
print(pd.merge(df_left, df_right, how='left', on='key'))  # left outer join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  007     2  NaN   NaN
print(pd.merge(df_left, df_right, how='right', on='key'))  # right outer join
#    key val_x key2 val_y
# 0  001   999  001   999
# 1  002     1  002     3
# 2  009   NaN  009     4
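The script covers inner, left, and right joins; for completeness, a sketch of the remaining variant, how='outer', which keeps keys from both sides (toy data, custom suffixes instead of the default _x/_y):

import pandas as pd

left = pd.DataFrame({'key': ['001', '007'], 'val': [1, 2]})
right = pd.DataFrame({'key': ['001', '009'], 'val': [3, 4]})
print(pd.merge(left, right, how='outer', on='key', suffixes=('_l', '_r')))
#    key  val_l  val_r
# 0  001    1.0    3.0
# 1  007    2.0    NaN
# 2  009    NaN    4.0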
print('------------------------------------------------------------------------------------')
print(df2.append(df2[:3], ignore_index=True))  # slice rows off the original and append them back; ignore_index=True discards the slice's labels and renumbers from 0
# (note: DataFrame.append was removed in pandas 2.0; pd.concat([df2, df2[:3]], ignore_index=True) does the same job)
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 6  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 7  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 8  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx   (appended)
print(df2.append(df2[:3], ignore_index=False))  # keep the slice's index labels; notice that index labels are allowed to repeat
#      accountID custID  mark  tradeAmt            tradeDate tradeDesc
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx
# 3  6214C000403  C0004  row4     103.0  2018-01-18 14:00:03    xxxxxx
# 4  6214C000402  C0004  row5     102.0  2018-01-18 14:00:02    xxxxxx
# 5  6214C000301  C0003  row6     100.0  2018-01-18 14:00:00    xxxxxx
# 0  6214C000101  C0001  row1     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 1  6214C000201  C0002  row2     100.0  2018-01-18 14:00:00    xxxxxx   (appended)
# 2  6214C000401  C0004  row3     101.0  2018-01-18 14:00:01    xxxxxx   (appended)
print('------------------------------------------------------------------------------------')
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
# zip() takes iterables, pairs up their corresponding elements into tuples, and returns those tuples
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])  # multi-level index labels: MultiIndex
df6 = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
print(df6)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290
stacked = df6.stack()  # turn the tabular layout into a "stacked" layout: the column labels are appended after the row labels
print(stacked)
# first  second
# bar    one     A   -0.101234
#                B   -0.956210
#        two     A   -0.480354
#                B    1.308950
# baz    one     A    0.943706
#                B    0.976480
#        two     A   -0.788852
#                B   -1.556547
# foo    one     A    0.997527
#                B   -0.337391
#        two     A   -0.191448
#                B   -0.083129
# qux    one     A   -0.919527
#                B   -0.414051
#        two     A   -0.579727
#                B    1.595290
# dtype: float64
print(stacked["bar"]["one"]["A"])  # the upside of the stacked layout: it can be accessed like a nested, multi-level array
# -0.101233870095
unstacked = stacked.unstack()  # reverse it: turn the stacked layout back into a table by moving the innermost row level into columns
print(unstacked)
#                      A         B
# first second
# bar   one    -0.101234 -0.956210
#       two    -0.480354  1.308950
# baz   one     0.943706  0.976480
#       two    -0.788852 -1.556547
# foo   one     0.997527 -0.337391
#       two    -0.191448 -0.083129
# qux   one    -0.919527 -0.414051
#       two    -0.579727  1.595290
unstacked_unstacked_0 = unstacked.unstack(0)  # row labels can keep being pivoted into column labels; 0 pivots the first level
print(unstacked_unstacked_0)
#                A                                       B
# first        bar       baz       foo       qux       bar       baz       foo       qux
# second
# one    -0.101234  0.943706  0.997527 -0.919527 -0.956210  0.976480 -0.337391 -0.414051
# two    -0.480354 -0.788852 -0.191448 -0.579727  1.308950 -1.556547 -0.083129  1.595290
unstacked_unstacked_1 = unstacked.unstack(1)  # same idea, but pivot the second row-label level into columns
print(unstacked_unstacked_1)
#                A                   B
# second       one       two       one       two
# first
# bar    -0.101234 -0.480354 -0.956210  1.308950
# baz     0.943706 -0.788852  0.976480 -1.556547
# foo     0.997527 -0.191448 -0.337391 -0.083129
# qux    -0.919527 -0.579727 -0.414051  1.595290
print('------------------------------------------------------------------------------------')
df7 = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                    'B': ['A', 'B', 'C'] * 4,
                    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D': np.random.randn(12),
                    'E': np.random.randn(12)})
print(df7)
#         A  B    C         D         E
# 0     one  A  foo -0.516297 -0.860641
# 1     one  B  foo -1.560483 -1.647366
# 2     two  C  foo  1.124756  0.329971
# 3   three  A  bar -0.312954  0.040263
# 4     one  B  bar -1.355079  0.358829
# 5     one  C  bar  0.749617  0.978513
# 6     two  A  foo -2.173830  0.434789
# 7   three  B  foo -1.070213  0.641253
# 8     one  C  foo -0.515032  0.127273
# 9     one  A  bar -1.408970  0.025128
# 10    two  B  bar -0.390044  0.060392
# 11  three  C  bar  0.067667  0.676595
print(pd.pivot_table(df7, values='D', index=['A', 'B'], columns=['C']))  # pivot table
# C             bar       foo
# A     B
# one   A -1.408970 -0.516297
#       B -1.355079 -1.560483
#       C  0.749617 -0.515032
# three A -0.312954       NaN
#       B       NaN -1.070213
#       C  0.067667       NaN
# two   A       NaN -2.173830
#       B -0.390044       NaN
#       C       NaN  1.124756
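pivot_table aggregates with the mean by default, which is why single observations pass through unchanged above; other reducers can be supplied via aggfunc. A standalone sketch with toy data:

import pandas as pd

df = pd.DataFrame({'A': ['x', 'x', 'y'],
                   'C': ['foo', 'foo', 'bar'],
                   'D': [1.0, 2.0, 3.0]})
print(pd.pivot_table(df, values='D', index='A', columns='C', aggfunc='sum'))
# C    bar  foo
# A
# x    NaN  3.0
# y    3.0  NaN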
print('------------------------------------------------------------------------------------')
rng = pd.date_range('1/1/2012', periods=10, freq='min')  # the result is a DatetimeIndex
print(rng)
# DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:01:00',
#                '2012-01-01 00:02:00', '2012-01-01 00:03:00',
#                '2012-01-01 00:04:00', '2012-01-01 00:05:00',
#                '2012-01-01 00:06:00', '2012-01-01 00:07:00',
#                '2012-01-01 00:08:00', '2012-01-01 00:09:00'],
#               dtype='datetime64[ns]', freq='T')
ts = pd.Series(range(10), index=rng)  # time series data
print(ts)
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32
print(ts.resample('5Min').sum())  # resample() is the convenient way to re-bucket time series data
# 2012-01-01 00:00:00    10
# 2012-01-01 00:05:00    35
# Freq: 5T, dtype: int32
ts_utc = ts.tz_localize('UTC')  # attach a time zone (UTC: Coordinated Universal Time; GMT: Greenwich Mean Time)
print(ts_utc)
# 2012-01-01 00:00:00+00:00    0
# 2012-01-01 00:01:00+00:00    1
# 2012-01-01 00:02:00+00:00    2
# 2012-01-01 00:03:00+00:00    3
# 2012-01-01 00:04:00+00:00    4
# 2012-01-01 00:05:00+00:00    5
# 2012-01-01 00:06:00+00:00    6
# 2012-01-01 00:07:00+00:00    7
# 2012-01-01 00:08:00+00:00    8
# 2012-01-01 00:09:00+00:00    9
# Freq: T, dtype: int32
print(ts_utc.tz_convert('US/Eastern'))  # convert between time zones
# 2011-12-31 19:00:00-05:00    0
# 2011-12-31 19:01:00-05:00    1
# 2011-12-31 19:02:00-05:00    2
# 2011-12-31 19:03:00-05:00    3
# 2011-12-31 19:04:00-05:00    4
# 2011-12-31 19:05:00-05:00    5
# 2011-12-31 19:06:00-05:00    6
# 2011-12-31 19:07:00-05:00    7
# 2011-12-31 19:08:00-05:00    8
# 2011-12-31 19:09:00-05:00    9
# Freq: T, dtype: int32
print(ts.to_period())  # period display format: only shown down to the unit you defined
# 2012-01-01 00:00    0
# 2012-01-01 00:01    1
# 2012-01-01 00:02    2
# 2012-01-01 00:03    3
# 2012-01-01 00:04    4
# 2012-01-01 00:05    5
# 2012-01-01 00:06    6
# 2012-01-01 00:07    7
# 2012-01-01 00:08    8
# 2012-01-01 00:09    9
# Freq: T, dtype: int32
print(ts.to_period().to_timestamp())  # and back to full timestamp format
# 2012-01-01 00:00:00    0
# 2012-01-01 00:01:00    1
# 2012-01-01 00:02:00    2
# 2012-01-01 00:03:00    3
# 2012-01-01 00:04:00    4
# 2012-01-01 00:05:00    5
# 2012-01-01 00:06:00    6
# 2012-01-01 00:07:00    7
# 2012-01-01 00:08:00    8
# 2012-01-01 00:09:00    9
# Freq: T, dtype: int32
print('------------------------------------------------------------------------------------')
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")  # create a new column with category dtype (category is a categorical label type)
print(df["grade"])
# 0    a
# 1    b
# 2    b
# 3    a
# 4    a
# 5    e
# Name: grade, dtype: category
df["grade"].cat.categories = ["very good", "good", "very bad"]  # rename the categories (newer pandas removed this assignment form; .cat.rename_categories does the same)
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])  # redefine the category set, replacing the old one
print(df["grade"])
# 0    very good
# 1         good
# 2         good
# 3    very good
# 4    very good
# 5     very bad
# Name: grade, dtype: category
# Categories (5, object): [very bad, bad, medium, good, very good]
print(df.groupby("grade").size())  # count per category; empty categories show up as 0
# grade
# very bad     1
# bad          0
# medium       0
# good         2
# very good    3
# dtype: int64
print('------------------------------------------------------------------------------------')
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))  # a 1000-day time series of random numbers
ts = ts.cumsum()  # running total
print(ts)
ts.plot()   # in some environments this alone renders the chart
plt.show()  # in others you need matplotlib.pyplot to open the window
# the figure is a single curve: x axis = the 1000 days, y axis = the running total per day
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])  # a 4-column table on the same date index
df = df.cumsum()  # running total per column
df.plot()
plt.show()
# the figure is four curves: x axis = the 1000 days, y axis = each column's running total per day
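One practical note on the plotting at the end: plt.show() needs an interactive display. On a headless machine (a server, a CI job), a common alternative is to save the figure to a file instead; a minimal sketch, assuming the standard non-interactive Agg backend that ships with matplotlib:

import matplotlib
matplotlib.use('Agg')  # non-interactive backend; must be set before importing pyplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000)).cumsum()
ts.plot()
plt.savefig('cumsum.png')  # write the chart to disk instead of opening a window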