参考

创建数据框架

import numpy as np
import pandas as pd

### 创建pandas序列

s = pd.Series([1, 2, 3, np.nan, 5, 6])
print(s)
# 0 1.0
# 1 2.0
# 2 3.0
# 3 NaN
# 4 5.0
# 5 6.0
# dtype: float64

# 创建日期序列
dates = pd.date_range('20160101', periods=6)
print(dates)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
# '2016-01-05', '2016-01-06'],
# dtype='datetime64[ns]', freq='D')

# 自定义创建有名称的数据
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
# a b c d
# 2016-01-01 1.089923 0.339409 -1.985301 0.092102
# 2016-01-02 -1.435824 -1.532524 0.111640 -1.111556
# 2016-01-03 -2.353343 -0.647695 1.373838 -1.570191
# 2016-01-04 0.890142 0.242409 1.408043 1.513587
# 2016-01-05 0.844925 -0.448368 0.307331 -0.418320
# 2016-01-06 0.425206 -0.162394 -0.440428 2.418776

df = pd.DataFrame(np.random.rand(5, 5))
print(df)
# 0 1 2 3 4
# 0 0.923439 0.666389 0.004973 0.756111 0.035804
# 1 0.803169 0.739656 0.386507 0.681868 0.013253
# 2 0.090509 0.978360 0.007774 0.720060 0.965550
# 3 0.206403 0.345211 0.835298 0.340604 0.801049
# 4 0.624743 0.679517 0.520393 0.167856 0.868801

df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
# 0 1 2 3 4
# 0 0.671648 0.524993 0.503769 0.554579 0.653587
# 1 0.479951 0.599060 0.899605 0.868518 0.629865
# 2 0.517626 0.029945 0.093076 0.687467 0.238109
# 3 0.506243 0.861011 0.588524 0.519029 0.020215
# 4 0.667327 0.143652 0.749364 0.300464 0.140889


# 按列定义数据
df2 = pd.DataFrame({
'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': pd.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'
})
print(df2)
# A B C D E F
# 0 1.0 2013-01-02 1.0 3 test foo
# 1 1.0 2013-01-02 1.0 3 train foo
# 2 1.0 2013-01-02 1.0 3 test foo
# 3 1.0 2013-01-02 1.0 3 train foo

print(df2.dtypes)
# A float64
# B datetime64[ns]
# C float32
# D int32
# E category
# F object
# dtype: object

print(df2.index)
# Int64Index([0, 1, 2, 3], dtype='int64')

print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

print(df2.values)
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
# [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
# [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

print(df2.describe())
# A C D
# count 4.0 4.0 4.0
# mean 1.0 1.0 3.0
# std 0.0 0.0 0.0
# min 1.0 1.0 3.0
# 25% 1.0 1.0 3.0
# 50% 1.0 1.0 3.0
# 75% 1.0 1.0 3.0
# max 1.0 1.0 3.0

print(df2.T)
# 0 ... 3
# A 1.0 ... 1.0
# B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00
# C 1.0 ... 1.0
# D 3 ... 3
# E test ... train
# F foo ... foo

# 对标签排序
print(df2.sort_index(axis=1,ascending=False))
# [6 rows x 4 columns]
# F E D C B A
# 0 foo test 3 1.0 2013-01-02 1.0
# 1 foo train 3 1.0 2013-01-02 1.0
# 2 foo test 3 1.0 2013-01-02 1.0
# 3 foo train 3 1.0 2013-01-02 1.0
print(df2.sort_index(axis=0,ascending=False))
# A B C D E F
# 3 1.0 2013-01-02 1.0 3 train foo
# 2 1.0 2013-01-02 1.0 3 test foo
# 1 1.0 2013-01-02 1.0 3 train foo
# 0 1.0 2013-01-02 1.0 3 test foo

# 对某一项标签排序
print(df2.sort_values(by='E'))
# A B C D E F
# 0 1.0 2013-01-02 1.0 3 test foo
# 2 1.0 2013-01-02 1.0 3 test foo
# 1 1.0 2013-01-02 1.0 3 train foo
# 3 1.0 2013-01-02 1.0 3 train foo

选择数据

# 选择数据

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((4, 6)))
print(df)
# 0 1 2 3 4 5
# 0 0 1 2 3 4 5
# 1 6 7 8 9 10 11
# 2 12 13 14 15 16 17
# 3 18 19 20 21 22 23
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# A B C D
# 2013-01-01 0 1 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11
# 2013-01-04 12 13 14 15
# 2013-01-05 16 17 18 19
# 2013-01-06 20 21 22 23
print(df['A'], df.A)
# 2013-01-01 0
# 2013-01-02 4
# 2013-01-03 8
# 2013-01-04 12
# 2013-01-05 16
# 2013-01-06 20
# Freq: D, Name: A

# dtype: int32
# 2013-01-01 0
# 2013-01-02 4
# 2013-01-03 8
# 2013-01-04 12
# 2013-01-05 16
# 2013-01-06 20
print(df[0:3], df['20130102':'20130104'])
# A B C D
# 2013-01-01 0 1 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11

# A B C D
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11
# 2013-01-04 12 13 14 15

# 通过label:loc选择
print(df.loc['20130102'])
# A 4
# B 5
# C 6
# D 7
# Name: 2013-01-02 00:00:00, dtype: int32
print(df.loc['20130102', ['A', 'B']])
# A 4
# B 5
# Name: 2013-01-02 00:00:00, dtype: int32


# 通过position筛选,iloc
print(df.iloc[[1, 3, 5], 1: 3]) # 1、3、5行, 1至2列
# B C
# 2013-01-02 5 6
# 2013-01-04 13 14
# 2013-01-06 21 22

# 混合使用 ix 但是新版被官方弃用了
#print(pd.ix[:3, ['A', 'C']])

# Boolean indexing 是或否筛选
print(df[df.A >= 8 ])
# A B C D
# 2013-01-03 8 9 10 11
# 2013-01-04 12 13 14 15
# 2013-01-05 16 17 18 19
# 2013-01-06 20 21 22 23

赋值

# 重新赋值
dates = pd.date_range('20220909', periods=6)
print(dates)
# DatetimeIndex(['2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12',
# '2022-09-13', '2022-09-14'],
# dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# A B C D
# 2022-09-09 0 1 2 3
# 2022-09-10 4 5 6 7
# 2022-09-11 8 9 10 11
# 2022-09-12 12 13 14 15
# 2022-09-13 16 17 18 19
# 2022-09-14 20 21 22 23

# iloc位置选择
for i in range(0,4):
df.iloc[i,i] = 0
print(df)
# A B C D
# 2022-09-09 0 1 2 3
# 2022-09-10 4 0 6 7
# 2022-09-11 8 9 0 11
# 2022-09-12 12 13 14 0
# 2022-09-13 16 17 18 19
# 2022-09-14 20 21 22 23

# 也可用loc定位再改值
df.loc['20220909','A'] = 2222
print(df)
# A B C D
# 2022-09-09 2222 1 2 3
# 2022-09-10 4 0 6 7
# 2022-09-11 8 9 0 11
# 2022-09-12 12 13 14 0
# 2022-09-13 16 17 18 19
# 2022-09-14 20 21 22 23


# 只更改A的那一列
df.A[df.A >4] = 0
print(df)
# A B C D
# 2022-09-09 0 1 2 3
# 2022-09-10 4 0 6 7
# 2022-09-11 0 9 0 11
# 2022-09-12 0 13 14 0
# 2022-09-13 0 17 18 19
# 2022-09-14 0 21 22 23

# 判断将A项 > 10 的那一行赋值为0
df[df.A > 3] = 0
print(df)
# A B C D
# 2022-09-09 0 1 2 3
# 2022-09-10 0 0 0 0
# 2022-09-11 0 9 0 11
# 2022-09-12 0 13 14 0
# 2022-09-13 0 17 18 19
# 2022-09-14 0 21 22 23

# 加空行
df['F'] = np.nan
print(df)
# A B C D F
# 2022-09-09 0 1 2 3 NaN
# 2022-09-10 0 0 0 0 NaN
# 2022-09-11 0 9 0 11 NaN
# 2022-09-12 0 13 14 0 NaN
# 2022-09-13 0 17 18 19 NaN
# 2022-09-14 0 21 22 23 NaN


df['E'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20220909',periods=6))
print(df)
# A B C D F E
# 2022-09-09 0 1 2 3 NaN 1
# 2022-09-10 0 0 0 0 NaN 2
# 2022-09-11 0 9 0 11 NaN 3
# 2022-09-12 0 13 14 0 NaN 4
# 2022-09-13 0 17 18 19 NaN 5
# 2022-09-14 0 21 22 23 NaN 6

处理数据缺失

# 处理数据缺失
dates = pd.date_range('20220909', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# A B C D
# 2022-09-09 0 1 2 3
# 2022-09-10 4 5 6 7
# 2022-09-11 8 9 10 11
# 2022-09-12 12 13 14 15
# 2022-09-13 16 17 18 19
# 2022-09-14 20 21 22 23

# 现在假设有两个丢失的数据
df.iloc[1, 1] = np.nan
df.iloc[2, 2] = np.nan
df['E'] = np.nan
print(df)
# A B C D E
# 2022-09-09 0 1.0 2.0 3 NaN
# 2022-09-10 4 NaN 6.0 7 NaN
# 2022-09-11 8 9.0 NaN 11 NaN
# 2022-09-12 12 13.0 14.0 15 NaN
# 2022-09-13 16 17.0 18.0 19 NaN
# 2022-09-14 20 21.0 22.0 23 NaN

# 丢掉有nan的行, axis = 1表示丢掉列,也就是第二维的数据,all必须全为nan才除去
print(df.dropna(axis=0, how='any')) # how = { 'any','all}
# Empty DataFrame
# Columns: [A, B, C, D, E]
# Index: []
# A B C D E
# 2022-09-09 0 1.0 2.0 3 0.0
# 2022-09-10 4 0.0 6.0 7 0.0
# 2022-09-11 8 9.0 0.0 11 0.0
# 2022-09-12 12 13.0 14.0 15 0.0
# 2022-09-13 16 17.0 18.0 19 0.0
# 2022-09-14 20 21.0 22.0 23 0.0

# 将nan数据填充为0
print(df.fillna(value=0))
# A B C D E
# 2022-09-09 0 1.0 2.0 3 0.0
# 2022-09-10 4 0.0 6.0 7 0.0
# 2022-09-11 8 9.0 0.0 11 0.0
# 2022-09-12 12 13.0 14.0 15 0.0
# 2022-09-13 16 17.0 18.0 19 0.0
# 2022-09-14 20 21.0 22.0 23 0.0

# 检测是否有缺失数据
print(df.isnull())
# A B C D E
# 2022-09-09 False False False False True
# 2022-09-10 False True False False True
# 2022-09-11 False False True False True
# 2022-09-12 False False False False True
# 2022-09-13 False False False False True
# 2022-09-14 False False False False True

# 判断表格中是否有一个是缺失的
print(np.any(df.isnull()) == True)
# True

数据的导入

# 数据的导入导出
# pandas支持多种格式的读取,html、exel、推荐用csv读取
# 具体查看 https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

# 有许多参数可以设定
data = pd.read_csv('student.csv')
print(data)
print(help(pd.read_csv))
# 保存,pickle它能够实现任意对象与文本之间的相互转化,也可以实现任意对象与二进制之间的相互转化。也就是说,pickle 可以实现 Python 对象的存储及恢复。
data.to_pickle('student.pickle')

数据合并

# 数据合并

res = pd.concat([df1, df2, df3], axis=0, ignore_index=True) # 一维项合并
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0

# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1, '\n', df2)
# 示例数据
# a b c d
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0
# b c d e
# 2 1.0 1.0 1.0 1.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0

# 默认是outer模式
res = pd.concat([df1, df2], axis=0)
print(res)
# a b c d e
# 1 0.0 0.0 0.0 0.0 NaN
# 2 0.0 0.0 0.0 0.0 NaN
# 3 0.0 0.0 0.0 0.0 NaN
# 2 NaN 1.0 1.0 1.0 1.0
# 3 NaN 1.0 1.0 1.0 1.0
# 4 NaN 1.0 1.0 1.0 1.0

# inner属性,默认合并有相同数据的列
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)
# b c d
# 0 0.0 0.0 0.0
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0
# 5 1.0 1.0 1.0

# join_axes现在没有这个方法了,可以采用merge
res = pd.merge(df1, df2, how='left', left_index=True, right_index=True)
print(res)

# append
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# res = df1.append(df2, ignore_index=True)
# res = df1.append([df2, df3])
A = [df1, df2, df3]
for i in A:
print(i)
# The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
print(df1.append(df2, ignore_index=True))
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
#

merge合并

# merge 合并

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
# key A B
# 0 K0 A0 B0
# 1 K1 A1 B1
# 2 K2 A2 B2
# 3 K3 A3 B3
print(right)
# key C D
# 0 K0 C0 D0
# 1 K1 C1 D1
# 2 K2 C2 D2
# 3 K3 C3 D3

# on 属性设置依据什么合并
res = pd.merge(left, right, on='key')
print(res)
# key A B C D
# 0 K0 A0 B0 C0 D0
# 1 K1 A1 B1 C1 D1
# 2 K2 A2 B2 C2 D2
# 3 K3 A3 B3 C3 D3

# 依据两组key合并
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
# key1 key2 A B
# 0 K0 K0 A0 B0
# 1 K0 K1 A1 B1
# 2 K1 K0 A2 B2
# 3 K2 K1 A3 B3
print(right)
# key1 key2 C D
# 0 K0 K0 C0 D0
# 1 K1 K0 C1 D1
# 2 K1 K0 C2 D2
# 3 K2 K0 C3 D3

# 依据key1与key2 columns进行合并,并打印出四种结果['left', 'right', 'outer', 'inner']

# how = ['left', 'right', 'outer', 'inner']
# inner 合并两个key,必须两个key都相同才合并
# outer 不考虑两个key是否相同
# left 考虑左边的值是否相同
# right 考虑右边的值是否相同
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2

# 比较左边,此时如果某一行key不相同,不相同的那一行后面补nan,比较的那行顺延与下一行比较是否相同
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K0 K1 A1 B1 NaN NaN
# 2 K1 K0 A2 B2 C1 D1
# 3 K1 K0 A2 B2 C2 D2
# 4 K2 K1 A3 B3 NaN NaN

# indicator
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
# col1 col_left
# 0 0 a
# 1 1 b

print(df2)
# col1 col_right
# 0 1 2
# 1 2 2
# 2 2 2

res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
# col1 col_left col_right _merge
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only

# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
# col1 col_left col_right indicator_column
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only


# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])

print(left)
# A B
# K0 A0 B0
# K1 A1 B1
# K2 A2 B2

print(right)
# C D
# K0 C0 D0
# K2 C2 D2
# K3 C3 D3

# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3

# 只合并index相同的行
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K2 A2 B2 C2 D2


# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
print(girls)
# k age
# 0 K0 4
# 1 K0 5
# 2 K3 6

# suffixes 添加后缀
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
# k age_boy age_girl
# 0 K0 1 4
# 1 K0 1 5

图像显示

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 图像显示

# Series
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
print(data)
# 0 0.950266
# 1 0.305635
# 2 0.036322
# 3 1.848374
# 4 -2.056577
# ...
# 995 -0.434937
# 996 2.124252
# 997 -1.048899
# 998 -1.059221
# 999 0.711013
# Length: 1000, dtype: float64
# 进行累加
data = data.cumsum()
data.plot()
# plt.plot(x = , y = )
plt.show()



# DataFrame
data = pd.DataFrame(np.random.randn(10000,4), index=np.arange(10000),columns = list("ABCD"))
data = data.cumsum()
print(data)
# A B C D
# 0 0.400010 0.627432 1.149208 -0.627808
# 1 -0.552524 -0.708598 2.404034 -1.542343
# 2 0.591223 -0.263857 1.726772 -1.780456
# 3 -0.330246 -0.481474 0.354288 -2.018577
# 4 -1.586038 0.690416 -1.213552 -1.286241
# ... ... ... ... ...
# 9995 38.894470 106.568333 -95.487749 -50.046078
# 9996 39.309699 107.150841 -96.254740 -49.966948
# 9997 39.594292 108.638756 -96.410577 -52.685819
# 9998 40.168192 111.200276 -97.354596 -53.236421
# 9999 40.896590 111.690111 -97.617748 -53.540661
# [10000 rows x 4 columns]
data.plot()
plt.show() #很多关于图像的参数

# 很多图像的显示
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()