1.导包
import numpy as np
import pandas as pd
2.删除重复行
def make_df(indexs, columns):
    """Build a demo DataFrame whose cells spell out their own position.

    Every cell is the column label followed by the row label, so the cell
    at row ``1`` / column ``"A"`` holds the string ``"A1"``.

    Args:
        indexs: Sequence of row labels; also used as the DataFrame index.
        columns: Sequence of column labels; also used as the DataFrame
            columns.

    Returns:
        A ``pandas.DataFrame`` of strings with one row per entry of
        ``indexs`` and one column per entry of ``columns``.
    """
    # Outer loop walks rows, inner loop walks columns, matching the
    # row-major layout that pd.DataFrame expects for nested lists.
    data = [[f"{col}{row}" for col in columns] for row in indexs]
    return pd.DataFrame(data=data, index=indexs, columns=columns)
使用 duplicated() 函数检测重复的行
返回元素为布尔类型的 Series 对象,每个元素对应一行;默认(keep="first")情况下,如果该行不是第一次出现,则元素为 True
df = make_df( [ 1 , 2 , 3 , 4 ] , list ( "ABCD" ) )
df
A B C D 1 A1 B1 C1 D1 2 A2 B2 C2 D2 3 A3 B3 C3 D3 4 A4 B4 C4 D4
df. loc[ 1 ] = df. loc[ 2 ]
df
A B C D 1 A2 B2 C2 D2 2 A2 B2 C2 D2 3 A3 B3 C3 D3 4 A4 B4 C4 D4
df. duplicated( )
1 False
2 True
3 False
4 False
dtype: bool
df. duplicated( keep= "first" )
1 False
2 True
3 False
4 False
dtype: bool
df. duplicated( keep= "last" )
1 True
2 False
3 False
4 False
dtype: bool
df. duplicated( keep= False )
1 True
2 True
3 False
4 False
dtype: bool
df. loc[ 1 , "D" ] = "DDD"
df
A B C D 1 A2 B2 C2 DDD 2 A2 B2 C2 D2 3 A3 B3 C3 D3 4 A4 B4 C4 D4
df. duplicated( )
1 False
2 False
3 False
4 False
dtype: bool
df. duplicated( subset= [ "A" , "B" , "C" ] )
1 False
2 True
3 False
4 False
dtype: bool
使用 drop_duplicates() 函数删除重复的行
df. drop_duplicates( )
A B C D 1 A2 B2 C2 DDD 2 A2 B2 C2 D2 3 A3 B3 C3 D3 4 A4 B4 C4 D4
df. drop_duplicates( subset= [ "A" , "B" , "C" ] )
A B C D 1 A2 B2 C2 DDD 3 A3 B3 C3 D3 4 A4 B4 C4 D4
df. drop_duplicates( subset= [ "A" , "B" , "C" ] , keep= "last" )
A B C D 2 A2 B2 C2 D2 3 A3 B3 C3 D3 4 A4 B4 C4 D4
3.映射
映射的含义:创建一个映射关系列表,把 values 元素和一个特定的标签或者字符串绑定。
包含三种操作:
replace()函数:替换元素;map()函数:新建一列,最重要;rename()函数:替换索引
(1)replace()函数:替换元素
使用replace()函数,对values进行替换操作
index = [ "张三" , "张三丰" , "李白" , "杜甫" ]
columns = [ "Python" , "Java" , "H5" , "UI" ]
data = np. random. randint( 0 , 100 , size= ( 4 , 4 ) )
df = pd. DataFrame( data= data, index= index, columns= columns)
df
Python Java H5 UI 张三 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df. replace( { 25 : 50 , 17 : 100 } )
Python Java H5 UI 张三 35 35 46 59 张三丰 46 96 50 48 李白 35 86 34 32 杜甫 100 56 50 29
(2)map()函数:适合处理某一单独的列
df2 = df. copy( )
df2
Python Java H5 UI 张三 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df2[ "Python" ] . map ( { 35 : 35 , 46 : 46 , 17 : 100 } )
张三 35
张三丰 46
李白 35
杜甫 100
Name: Python, dtype: int64
df2[ "Python" ] . map ( lambda x: x* 10 )
张三 350
张三丰 460
李白 350
杜甫 170
Name: Python, dtype: int64
df2[ "Pandas" ] = df2[ "Python" ] . map ( lambda x: x* 10 )
df2
Python Java H5 UI Pandas 张三 35 35 46 59 350 张三丰 46 96 25 48 460 李白 35 86 34 32 350 杜甫 17 56 50 29 170
df2[ "Java是否及格" ] = df2[ "Java" ] . map ( lambda n: "及格" if n>= 60 else "不及格" )
df2
Python Java H5 UI Pandas Java是否及格 张三 35 35 46 59 350 不及格 张三丰 46 96 25 48 460 及格 李白 35 86 34 32 350 及格 杜甫 17 56 50 29 170 不及格
def fn(n):
    """Map a numeric score to a grade label.

    Args:
        n: Score to classify; compared against the thresholds 40 and 50.

    Returns:
        ``"不及格"`` (fail) when ``n < 40``, ``"及格"`` (pass) when
        ``40 <= n < 50``, otherwise ``"优秀"`` (excellent).
    """
    if n < 40:
        return "不及格"
    if n < 50:
        return "及格"
    return "优秀"
df2 [ "UI等级" ] = df2[ "UI" ] . map ( fn)
df2
Python Java H5 UI Pandas Java是否及格 UI等级 张三 35 35 46 59 350 不及格 优秀 张三丰 46 96 25 48 460 及格 及格 李白 35 86 34 32 350 及格 不及格 杜甫 17 56 50 29 170 不及格 不及格
(3)rename()函数:替换索引
df3 = df. copy( )
df3
Python Java H5 UI 张三 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df3. rename( { "张三" : "Mr Zhang" } )
Python Java H5 UI Mr Zhang 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df3. rename( index= { "张三" : "Mr Zhang" } )
Python Java H5 UI Mr Zhang 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df3. rename( { "Python" : "派森" } , axis= 1 )
派森 Java H5 UI 张三 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df3. rename( columns= { "Python" : "派森" } )
派森 Java H5 UI 张三 35 35 46 59 张三丰 46 96 25 48 李白 35 86 34 32 杜甫 17 56 50 29
df3. reset_index( )
index Python Java H5 UI 0 张三 35 35 46 59 1 张三丰 46 96 25 48 2 李白 35 86 34 32 3 杜甫 17 56 50 29
df3. set_index( keys= [ "H5" ] )
Python Java UI H5 46 35 35 59 25 46 96 48 34 35 86 32 50 17 56 29
(4)apply()函数:既支持 Series,也支持 DataFrame
df = pd. DataFrame( data= np. random. randint( 0 , 10 , size= ( 5 , 3 ) ) ,
index= list ( "ABCDE" ) ,
columns= [ "Python" , "NumPy" , "Pandas" ]
)
df
Python NumPy Pandas A 4 0 2 B 6 5 5 C 8 7 0 D 1 4 5 E 8 7 2
df[ "Python" ] . apply ( lambda x: True if x> 5 else False )
A False
B True
C True
D False
E True
Name: Python, dtype: bool
df. apply ( lambda x: x. mean( ) , axis= 0 )
Python 5.4
NumPy 4.6
Pandas 2.8
dtype: float64
df. apply ( lambda x: x. mean( ) , axis= 1 )
A 2.000000
B 5.333333
C 5.000000
D 3.333333
E 5.666667
dtype: float64
def fn2(x):
    """Summarise a Series as a ``(mean, count)`` pair.

    Intended for ``DataFrame.apply``, which passes each row or column in
    as a Series.

    Args:
        x: Object exposing ``mean()`` and ``count()`` (e.g. a pandas
            Series).

    Returns:
        A 2-tuple: the mean rounded to one decimal place, and the value
        returned by ``x.count()``.
    """
    return (np.round(x.mean(), 1), x.count())
df. apply ( fn2, axis= 1 )
A (2.0, 3)
B (5.3, 3)
C (5.0, 3)
D (3.3, 3)
E (5.7, 3)
dtype: object
df. applymap( lambda x: x + 100 )
Python NumPy Pandas A 104 100 102 B 106 105 105 C 108 107 100 D 101 104 105 E 108 107 102
(5)transform()函数
df = pd. DataFrame( data= np. random. randint( 0 , 10 , size= ( 5 , 3 ) ) ,
index= list ( "ABCDE" ) ,
columns= [ "Python" , "NumPy" , "Pandas" ]
)
df
Python NumPy Pandas A 7 9 1 B 1 6 9 C 5 7 4 D 1 1 7 E 1 6 2
df[ "Python" ] . transform( [ np. sqrt, np. exp] )
sqrt exp A 2.645751 1096.633158 B 1.000000 2.718282 C 2.236068 148.413159 D 1.000000 2.718282 E 1.000000 2.718282
def convert(x):
    """Scale a Series by +10 or -10 depending on its mean.

    Intended for ``DataFrame.transform``, which passes each column (or
    each row with ``axis=1``) in as a Series.

    Args:
        x: Object exposing ``mean()`` and supporting multiplication by an
            int (e.g. a pandas Series).

    Returns:
        ``x * 10`` when the mean of ``x`` is strictly greater than 5,
        otherwise ``x * (-10)``.
    """
    if x.mean() > 5:
        return x * 10
    return x * (-10)
df. transform( convert)
Python NumPy Pandas A -70 90 -10 B -10 60 -90 C -50 70 -40 D -10 10 -70 E -10 60 -20
df. transform( convert, axis= 1 )
Python NumPy Pandas A 70 90 10 B 10 60 90 C 50 70 40 D -10 -10 -70 E -10 -60 -20
4.异常值检测和过滤
df = pd. DataFrame( data= np. random. randint( 0 , 10 , size= ( 5 , 3 ) ) ,
index= list ( "ABCDE" ) ,
columns= [ "Python" , "NumPy" , "Pandas" ]
)
df
Python NumPy Pandas A 0 1 2 B 7 8 1 C 6 1 5 D 2 8 4 E 6 6 7
df. describe( )
Python NumPy Pandas count 5.00000 5.000000 5.000000 mean 4.20000 4.800000 3.800000 std 3.03315 3.563706 2.387467 min 0.00000 1.000000 1.000000 25% 2.00000 1.000000 2.000000 50% 6.00000 6.000000 4.000000 75% 6.00000 8.000000 5.000000 max 7.00000 8.000000 7.000000
df. describe( [ 0.01 , 0.3 , 0.4 , 0.9 , 0.99 ] )
Python NumPy Pandas count 5.00000 5.000000 5.000000 mean 4.20000 4.800000 3.800000 std 3.03315 3.563706 2.387467 min 0.00000 1.000000 1.000000 1% 0.08000 1.000000 1.040000 30% 2.80000 2.000000 2.400000 40% 4.40000 4.000000 3.200000 50% 6.00000 6.000000 4.000000 90% 6.60000 8.000000 6.200000 99% 6.96000 8.000000 6.920000 max 7.00000 8.000000 7.000000
df. describe( [ 0.01 , 0.3 , 0.4 , 0.9 , 0.99 ] ) . T
count mean std min 1% 30% 40% 50% 90% 99% max Python 5.0 4.2 3.033150 0.0 0.08 2.8 4.4 6.0 6.6 6.96 7.0 NumPy 5.0 4.8 3.563706 1.0 1.00 2.0 4.0 6.0 8.0 8.00 8.0 Pandas 5.0 3.8 2.387467 1.0 1.04 2.4 3.2 4.0 6.2 6.92 7.0
df.std():可以求得DataFrame对象每一列的标准差
df. std( )
Python 3.033150
NumPy 3.563706
Pandas 2.387467
dtype: float64
df2 = df. copy( )
df2
Python NumPy Pandas A 0 1 2 B 7 8 1 C 6 1 5 D 2 8 4 E 6 6 7
df2. drop( "A" )
Python NumPy Pandas B 7 8 1 C 6 1 5 D 2 8 4 E 6 6 7
df2. drop( index= "A" )
Python NumPy Pandas B 7 8 1 C 6 1 5 D 2 8 4 E 6 6 7
df2. drop( "Python" , axis= 1 )
NumPy Pandas A 1 2 B 8 1 C 1 5 D 8 4 E 6 7
df2. drop( columns= "Python" )
NumPy Pandas A 1 2 B 8 1 C 1 5 D 8 4 E 6 7
df2. drop( columns= [ "NumPy" , "Python" ] )
df2. drop( index= [ "A" , "B" ] )
Python NumPy Pandas C 6 1 5 D 2 8 4 E 6 6 7
df2. drop( index= [ "A" , "B" ] , inplace= True )
df2
Python NumPy Pandas C 6 1 5 D 2 8 4 E 6 6 7
unique():唯一,去重(只能用于Series一维数组)
df[ "Python" ] . unique( )
array([0, 7, 6, 2])
df. query( "Python == 6" )
Python NumPy Pandas C 6 1 5 E 6 6 7
df. query( "Python > 6" )
df. query( "Python < 6" )
Python NumPy Pandas A 0 1 2 D 2 8 4
df. query( "Python > 6 and NumPy == 8" )
df. query( "Python > 6 & NumPy == 8" )
df. query( "Python > 6 or NumPy == 8" )
Python NumPy Pandas B 7 8 1 D 2 8 4
df. query( "Python > 6 | NumPy == 8" )
Python NumPy Pandas B 7 8 1 D 2 8 4
df. query( "Python in [0,2,7]" )
Python NumPy Pandas A 0 1 2 B 7 8 1 D 2 8 4
n = 7
df. query( "Python == @n" )
m = [ 0 , 2 , 7 ]
df. query( "Python in @m" )
Python NumPy Pandas A 0 1 2 B 7 8 1 D 2 8 4
df.sort_values():根据值排序;df.sort_index():根据索引排序
df. sort_values( "Python" )
Python NumPy Pandas A 0 1 2 D 2 8 4 C 6 1 5 E 6 6 7 B 7 8 1
df. sort_values( "Python" , ascending= False )
Python NumPy Pandas B 7 8 1 C 6 1 5 E 6 6 7 D 2 8 4 A 0 1 2
df. sort_values( "B" , axis= 1 )
Pandas Python NumPy A 2 0 1 B 1 7 8 C 5 6 1 D 4 2 8 E 7 6 6
df. sort_index( ascending= False )
Python NumPy Pandas E 6 6 7 D 2 8 4 C 6 1 5 B 7 8 1 A 0 1 2
df. sort_index( ascending= False , axis= 1 )
Python Pandas NumPy A 0 2 1 B 7 1 8 C 6 5 1 D 2 4 8 E 6 7 6
df. info( )
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Python 5 non-null int32
1 NumPy 5 non-null int32
2 Pandas 5 non-null int32
dtypes: int32(3)
memory usage: 272.0+ bytes
新建一个形状为 10000*3 的标准正态分布的 DataFrame(np.random.randn),去除掉所有满足以下情况的行:其中任一元素的绝对值大于 3 倍标准差
df = pd. DataFrame( np. random. randn( 10000 , 3 ) )
df
0 1 2 0 -0.742336 1.219393 -0.982067 1 -0.432367 1.697112 -0.159940 2 0.079646 -0.234158 -0.500751 3 -0.106336 -0.033381 0.645390 4 -1.799314 -1.511863 -1.160497 ... ... ... ... 9995 -1.480727 0.716863 0.080706 9996 0.453228 -0.360598 1.056758 9997 1.365573 1.100529 0.574414 9998 0.606161 -0.111569 -0.423250 9999 -0.199662 -1.031638 1.920479
10000 rows × 3 columns
cond = df. abs ( ) > df. std( ) * 3
cond
0 1 2 0 False False False 1 False False False 2 False False False 3 False False False 4 False False False ... ... ... ... 9995 False False False 9996 False False False 9997 False False False 9998 False False False 9999 False False False
10000 rows × 3 columns
cond2 = cond. any ( axis= 1 )
cond2
0 False
1 False
2 False
3 False
4 False
...
9995 False
9996 False
9997 False
9998 False
9999 False
Length: 10000, dtype: bool
df. loc[ ~ cond2]
0 1 2 0 -0.742336 1.219393 -0.982067 1 -0.432367 1.697112 -0.159940 2 0.079646 -0.234158 -0.500751 3 -0.106336 -0.033381 0.645390 4 -1.799314 -1.511863 -1.160497 ... ... ... ... 9995 -1.480727 0.716863 0.080706 9996 0.453228 -0.360598 1.056758 9997 1.365573 1.100529 0.574414 9998 0.606161 -0.111569 -0.423250 9999 -0.199662 -1.031638 1.920479
9925 rows × 3 columns
5.抽样
使用 .take() 函数排序,可以借助 np.random.permutation() 函数实现随机排序
无放回抽样
df2 = pd. DataFrame( data= np. random. randint( 0 , 10 , size= ( 3 , 3 ) ) ,
index= list ( "CDE" ) ,
columns= [ "Python" , "NumPy" , "Pandas" ]
)
df2
Python NumPy Pandas C 7 6 9 D 8 9 6 E 7 4 4
df2. take( [ 1 , 0 , 2 ] )
Python NumPy Pandas D 8 9 6 C 7 6 9 E 7 4 4
df2. take( [ 1 , 0 , 2 ] , axis= 1 )
NumPy Python Pandas C 6 7 9 D 9 8 6 E 4 7 4
np. random. permutation( [ 0 , 1 , 2 ] )
array([0, 2, 1])
df2. take( np. random. permutation( [ 0 , 1 , 2 ] ) )
Python NumPy Pandas E 7 4 4 D 8 9 6 C 7 6 9
有放回抽样
np. random. randint( 0 , 3 , size= 5 )
array([1, 2, 1, 0, 2])
df2. take( np. random. randint( 0 , 3 , size= 5 ) )
Python NumPy Pandas E 7 4 4 D 8 9 6 E 7 4 4 D 8 9 6 D 8 9 6