Pandas笔记

参考

莫烦教程 https://mofanpy.com/tutorials/data-manipulation/np-pd/pd-intro

创建数据框架

import numpy as np
import pandas as pd

### 创建pandas序列

s = pd.Series([1, 2, 3, np.nan, 5, 6])
print(s)
# 0    1.0
# 1    2.0
# 2    3.0
# 3    NaN
# 4    5.0
# 5    6.0
# dtype: float64

# 创建日期序列
dates = pd.date_range('20160101', periods=6)
print(dates)
# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#                '2016-01-05', '2016-01-06'],
#               dtype='datetime64[ns]', freq='D')

# 自定义创建有名称的数据
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)
#                    a         b         c         d
# 2016-01-01  1.089923  0.339409 -1.985301  0.092102
# 2016-01-02 -1.435824 -1.532524  0.111640 -1.111556
# 2016-01-03 -2.353343 -0.647695  1.373838 -1.570191
# 2016-01-04  0.890142  0.242409  1.408043  1.513587
# 2016-01-05  0.844925 -0.448368  0.307331 -0.418320
# 2016-01-06  0.425206 -0.162394 -0.440428  2.418776

df = pd.DataFrame(np.random.rand(5, 5))
print(df)
#           0         1         2         3         4
# 0  0.923439  0.666389  0.004973  0.756111  0.035804
# 1  0.803169  0.739656  0.386507  0.681868  0.013253
# 2  0.090509  0.978360  0.007774  0.720060  0.965550
# 3  0.206403  0.345211  0.835298  0.340604  0.801049
# 4  0.624743  0.679517  0.520393  0.167856  0.868801

df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
#           0         1         2         3         4
# 0  0.671648  0.524993  0.503769  0.554579  0.653587
# 1  0.479951  0.599060  0.899605  0.868518  0.629865
# 2  0.517626  0.029945  0.093076  0.687467  0.238109
# 3  0.506243  0.861011  0.588524  0.519029  0.020215
# 4  0.667327  0.143652  0.749364  0.300464  0.140889


# 按列定义数据
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': pd.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
print(df2)
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

print(df2.dtypes)
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

print(df2.index)
# Int64Index([0, 1, 2, 3], dtype='int64')

print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

print(df2.values)
# [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]

print(df2.describe())
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

print(df2.T)
#                      0  ...                    3
# A                  1.0  ...                  1.0
# B  2013-01-02 00:00:00  ...  2013-01-02 00:00:00
# C                  1.0  ...                  1.0
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo

# 对标签排序
print(df2.sort_index(axis=1,ascending=False))
# [6 rows x 4 columns]
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2013-01-02  1.0
# 1  foo  train  3  1.0 2013-01-02  1.0
# 2  foo   test  3  1.0 2013-01-02  1.0
# 3  foo  train  3  1.0 2013-01-02  1.0
print(df2.sort_index(axis=0,ascending=False))
#      A          B    C  D      E    F
# 3  1.0 2013-01-02  1.0  3  train  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 0  1.0 2013-01-02  1.0  3   test  foo

# 对某一项标签排序
print(df2.sort_values(by='E'))
#      A          B    C  D      E    F
# 0  1.0 2013-01-02  1.0  3   test  foo
# 2  1.0 2013-01-02  1.0  3   test  foo
# 1  1.0 2013-01-02  1.0  3  train  foo
# 3  1.0 2013-01-02  1.0  3  train  foo

选择数据

# 选择数据

dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((4, 6)))
print(df)
#     0   1   2   3   4   5
# 0   0   1   2   3   4   5
# 1   6   7   8   9  10  11
# 2  12  13  14  15  16  17
# 3  18  19  20  21  22  23
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2013-01-01   0   1   2   3
# 2013-01-02   4   5   6   7
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15
# 2013-01-05  16  17  18  19
# 2013-01-06  20  21  22  23
print(df['A'], df.A)
# 2013-01-01     0
# 2013-01-02     4
# 2013-01-03     8
# 2013-01-04    12
# 2013-01-05    16
# 2013-01-06    20
# Freq: D, Name: A

# dtype: int32
# 2013-01-01     0
# 2013-01-02     4
# 2013-01-03     8
# 2013-01-04    12
# 2013-01-05    16
# 2013-01-06    20
print(df[0:3], df['20130102':'20130104'])
#             A  B   C   D
# 2013-01-01  0  1   2   3
# 2013-01-02  4  5   6   7
# 2013-01-03  8  9  10  11

#              A   B   C   D
# 2013-01-02   4   5   6   7
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15

# 通过label:loc选择
print(df.loc['20130102'])
# A    4
# B    5
# C    6
# D    7
# Name: 2013-01-02 00:00:00, dtype: int32
print(df.loc['20130102', ['A', 'B']])
# A    4
# B    5
# Name: 2013-01-02 00:00:00, dtype: int32


# 通过position筛选,iloc
print(df.iloc[[1, 3, 5], 1: 3])  # 1、3、5行, 1至2列
#              B   C
# 2013-01-02   5   6
# 2013-01-04  13  14
# 2013-01-06  21  22

# 混合使用 ix 但是新版被官方弃用了
#print(pd.ix[:3, ['A', 'C']])

# Boolean indexing  是或否筛选
print(df[df.A >= 8 ])
#              A   B   C   D
# 2013-01-03   8   9  10  11
# 2013-01-04  12  13  14  15
# 2013-01-05  16  17  18  19
# 2013-01-06  20  21  22  23

赋值

# 重新赋值
dates = pd.date_range('20220909', periods=6)
print(dates)
# DatetimeIndex(['2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12',
#                '2022-09-13', '2022-09-14'],
#               dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2022-09-09   0   1   2   3
# 2022-09-10   4   5   6   7
# 2022-09-11   8   9  10  11
# 2022-09-12  12  13  14  15
# 2022-09-13  16  17  18  19
# 2022-09-14  20  21  22  23

# iloc位置选择
for i in range(0,4):
    df.iloc[i,i] = 0
print(df)
#              A   B   C   D
# 2022-09-09   0   1   2   3
# 2022-09-10   4   0   6   7
# 2022-09-11   8   9   0  11
# 2022-09-12  12  13  14   0
# 2022-09-13  16  17  18  19
# 2022-09-14  20  21  22  23

# 也可用loc定位再改值
df.loc['20220909','A'] = 2222
print(df)
#                A   B   C   D
# 2022-09-09  2222   1   2   3
# 2022-09-10     4   0   6   7
# 2022-09-11     8   9   0  11
# 2022-09-12    12  13  14   0
# 2022-09-13    16  17  18  19
# 2022-09-14    20  21  22  23


# 只更改A的那一列
df.A[df.A >4] = 0
print(df)
#             A   B   C   D
# 2022-09-09  0   1   2   3
# 2022-09-10  4   0   6   7
# 2022-09-11  0   9   0  11
# 2022-09-12  0  13  14   0
# 2022-09-13  0  17  18  19
# 2022-09-14  0  21  22  23

# 判断将A项 > 10 的那一行赋值为0
df[df.A > 3] = 0
print(df)
#             A   B   C   D
# 2022-09-09  0   1   2   3
# 2022-09-10  0   0   0   0
# 2022-09-11  0   9   0  11
# 2022-09-12  0  13  14   0
# 2022-09-13  0  17  18  19
# 2022-09-14  0  21  22  23

# 加空行
df['F'] = np.nan
print(df)
#             A   B   C   D   F
# 2022-09-09  0   1   2   3 NaN
# 2022-09-10  0   0   0   0 NaN
# 2022-09-11  0   9   0  11 NaN
# 2022-09-12  0  13  14   0 NaN
# 2022-09-13  0  17  18  19 NaN
# 2022-09-14  0  21  22  23 NaN


df['E'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20220909',periods=6))
print(df)
#             A   B   C   D   F  E
# 2022-09-09  0   1   2   3 NaN  1
# 2022-09-10  0   0   0   0 NaN  2
# 2022-09-11  0   9   0  11 NaN  3
# 2022-09-12  0  13  14   0 NaN  4
# 2022-09-13  0  17  18  19 NaN  5
# 2022-09-14  0  21  22  23 NaN  6

处理数据缺失

# 处理数据缺失
dates = pd.date_range('20220909', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2022-09-09   0   1   2   3
# 2022-09-10   4   5   6   7
# 2022-09-11   8   9  10  11
# 2022-09-12  12  13  14  15
# 2022-09-13  16  17  18  19
# 2022-09-14  20  21  22  23

# 现在假设有两个丢失的数据
df.iloc[1, 1] = np.nan
df.iloc[2, 2] = np.nan
df['E'] = np.nan
print(df)
#              A     B     C   D   E
# 2022-09-09   0   1.0   2.0   3 NaN
# 2022-09-10   4   NaN   6.0   7 NaN
# 2022-09-11   8   9.0   NaN  11 NaN
# 2022-09-12  12  13.0  14.0  15 NaN
# 2022-09-13  16  17.0  18.0  19 NaN
# 2022-09-14  20  21.0  22.0  23 NaN

# 丢掉有nan的行, axis = 1表示丢掉列，也就是第二维的数据，all必须全为nan才除去
print(df.dropna(axis=0, how='any'))  # how = { 'any','all}
# Empty DataFrame
# Columns: [A, B, C, D, E]
# Index: []
#              A     B     C   D    E
# 2022-09-09   0   1.0   2.0   3  0.0
# 2022-09-10   4   0.0   6.0   7  0.0
# 2022-09-11   8   9.0   0.0  11  0.0
# 2022-09-12  12  13.0  14.0  15  0.0
# 2022-09-13  16  17.0  18.0  19  0.0
# 2022-09-14  20  21.0  22.0  23  0.0

# 将nan数据填充为0
print(df.fillna(value=0))
#              A     B     C   D    E
# 2022-09-09   0   1.0   2.0   3  0.0
# 2022-09-10   4   0.0   6.0   7  0.0
# 2022-09-11   8   9.0   0.0  11  0.0
# 2022-09-12  12  13.0  14.0  15  0.0
# 2022-09-13  16  17.0  18.0  19  0.0
# 2022-09-14  20  21.0  22.0  23  0.0

# 检测是否有缺失数据
print(df.isnull())
#                 A      B      C      D     E
# 2022-09-09  False  False  False  False  True
# 2022-09-10  False   True  False  False  True
# 2022-09-11  False  False   True  False  True
# 2022-09-12  False  False  False  False  True
# 2022-09-13  False  False  False  False  True
# 2022-09-14  False  False  False  False  True

# 判断表格中是否有一个是缺失的
print(np.any(df.isnull()) == True)
# True

数据的导入

# 数据的导入导出
# pandas支持多种格式的读取，html、exel、推荐用csv读取
# 具体查看 https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

# 有许多参数可以设定
data = pd.read_csv('student.csv')
print(data)
print(help(pd.read_csv))
# 保存，pickle它能够实现任意对象与文本之间的相互转化，也可以实现任意对象与二进制之间的相互转化。也就是说，pickle 可以实现 Python 对象的存储及恢复。
data.to_pickle('student.pickle')

数据合并

# 数据合并

res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)  # 一维项合并
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# join,['inner','outer']
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1, '\n', df2)
# 示例数据
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# 默认是outer模式
res = pd.concat([df1, df2], axis=0)
print(res)
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

# inner属性，默认合并有相同数据的列
res = pd.concat([df1, df2], join='inner', ignore_index=True)
print(res)
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

# join_axes现在没有这个方法了,可以采用merge
res = pd.merge(df1, df2, how='left', left_index=True, right_index=True)
print(res)

# append
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# res = df1.append(df2, ignore_index=True)
# res = df1.append([df2, df3])
A = [df1, df2, df3]
for i in A:
    print(i)
# The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
print(df1.append(df2, ignore_index=True))
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
#

merge合并

# merge 合并

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

print(left)
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
print(right)
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3

# on 属性设置依据什么合并
res = pd.merge(left, right, on='key')
print(res)
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3

# 依据两组key合并
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3
print(right)
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3

# 依据key1与key2 columns进行合并，并打印出四种结果['left', 'right', 'outer', 'inner']

# how = ['left', 'right', 'outer', 'inner']
# inner 合并两个key，必须两个key都相同才合并
# outer 不考虑两个key是否相同
# left 考虑左边的值是否相同
# right 考虑右边的值是否相同
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# 比较左边，此时如果某一行key不相同，不相同的那一行后面补nan，比较的那行顺延与下一行比较是否相同
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
#   key1 key2   A   B    C    D
# 0   K0   K0  A0  B0   C0   D0
# 1   K0   K1  A1  B1  NaN  NaN
# 2   K1   K0  A2  B2   C1   D1
# 3   K1   K0  A2  B2   C2   D2
# 4   K2   K1  A3  B3  NaN  NaN

# indicator
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b

print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only


# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])

print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2

print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# 只合并index相同的行
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2


# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
#     k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
print(girls)
#     k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6

# suffixes 添加后缀
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5

图像显示

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 图像显示

# Series
data = pd.Series(np.random.randn(1000),index=np.arange(1000))
print(data)
# 0      0.950266
# 1      0.305635
# 2      0.036322
# 3      1.848374
# 4     -2.056577
#          ...
# 995   -0.434937
# 996    2.124252
# 997   -1.048899
# 998   -1.059221
# 999    0.711013
# Length: 1000, dtype: float64
# 进行累加
data = data.cumsum()
data.plot()
# plt.plot(x = , y = )
plt.show()



# DataFrame
data = pd.DataFrame(np.random.randn(10000,4), index=np.arange(10000),columns = list("ABCD"))
data = data.cumsum()
print(data)
#               A           B          C          D
# 0      0.400010    0.627432   1.149208  -0.627808
# 1     -0.552524   -0.708598   2.404034  -1.542343
# 2      0.591223   -0.263857   1.726772  -1.780456
# 3     -0.330246   -0.481474   0.354288  -2.018577
# 4     -1.586038    0.690416  -1.213552  -1.286241
# ...         ...         ...        ...        ...
# 9995  38.894470  106.568333 -95.487749 -50.046078
# 9996  39.309699  107.150841 -96.254740 -49.966948
# 9997  39.594292  108.638756 -96.410577 -52.685819
# 9998  40.168192  111.200276 -97.354596 -53.236421
# 9999  40.896590  111.690111 -97.617748 -53.540661
# [10000 rows x 4 columns]
data.plot()
plt.show()  #很多关于图像的参数

# 很多图像的显示
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()