Basic and tricky pandas functions

Main preprocessing techniques for pandas package

In [ ]:
import pandas as pd
import numpy as np

read csv file and print first five samples

In [ ]:
dataset_path = 'https://datahub.io/machine-learning/iris/r/iris.csv' 
dataframe = pd.read_csv(dataset_path)
dataframe.head()
Out[ ]:
sepallength sepalwidth petallength petalwidth class
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

save dataframe as tsv file ( tab separated)

In [ ]:
dataframe.to_csv('iris.tsv',sep='\t')

sort based on petalwidth and petallength

In [ ]:
# sort by petalwidth and petal length 
dataframe = dataframe.sort_values(by=['petalwidth','petallength'])
dataframe.head(10)
Out[ ]:
sepallength sepalwidth petallength petalwidth class
13 4.3 3.0 1.1 0.1 Iris-setosa
12 4.8 3.0 1.4 0.1 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
32 5.2 4.1 1.5 0.1 Iris-setosa
34 4.9 3.1 1.5 0.1 Iris-setosa
37 4.9 3.1 1.5 0.1 Iris-setosa
22 4.6 3.6 1.0 0.2 Iris-setosa
14 5.8 4.0 1.2 0.2 Iris-setosa
35 5.0 3.2 1.2 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa

numerical data ranks convert data to ranks of values

select sepallength of first five samples

In [ ]:
dataframe['sepallength'].head()
Out[ ]:
0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepallength, dtype: float64

group by: get mean of each class using groupby function

In [ ]:
dataframe.groupby(['class']).mean()
Out[ ]:
sepallength sepalwidth petallength petalwidth
class
Iris-setosa 5.006 3.418 1.464 0.244
Iris-versicolor 5.936 2.770 4.260 1.326
Iris-virginica 6.588 2.974 5.552 2.026

save first ten rows to a csv file

In [ ]:
dataframe.head(10).to_csv('first_ten_rows.csv',index=False)

describe overview of dataset main details using describe function

In [ ]:
dataframe.describe()
Out[ ]:
sepallength sepalwidth petallength petalwidth
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

find memory usage of each column

In [ ]:
dataframe.memory_usage()
Out[ ]:
Index           128
sepallength    1200
sepalwidth     1200
petallength    1200
petalwidth     1200
class          1200
dtype: int64

convert type of sepalwidth to int

In [ ]:
dataframe['sepalwidth'] = dataframe.sepalwidth.astype(int)
dataframe.head()
Out[ ]:
sepallength sepalwidth petallength petalwidth class
0 5.1 3 1.4 0.2 Iris-setosa
1 4.9 3 1.4 0.2 Iris-setosa
2 4.7 3 1.3 0.2 Iris-setosa
3 4.6 3 1.5 0.2 Iris-setosa
4 5.0 3 1.4 0.2 Iris-setosa

count values of petalwidth

In [ ]:
dataframe['petalwidth'].value_counts()
Out[ ]:
0.2    28
1.3    13
1.8    12
1.5    12
1.4     8
2.3     8
1.0     7
0.4     7
0.3     7
0.1     6
2.1     6
2.0     6
1.2     5
1.9     5
1.6     4
2.5     3
2.2     3
2.4     3
1.1     3
1.7     2
0.6     1
0.5     1
Name: petalwidth, dtype: int64

merge two dataframes outer join

In [ ]:
import random
data1 = [[i ,random.randint(0,20)] for i in range(100)]
data2 = [[random.randint(0,20) , i] for i in range(40,140)]

dataframe1 = pd.DataFrame(data1 , columns=['id1','code1'])
dataframe2 = pd.DataFrame(data2 , columns=['code2','id2'])
merged = dataframe1.merge(dataframe2,left_on='id1', right_on='id2', suffixes=('_left', '_right'),how='outer')

select columns 2 and 3 of row 100 to row 150

In [ ]:
dataframe.iloc[100:150,1:3]
Out[ ]:
sepalwidth petallength
100 3.3 6.0
101 2.7 5.1
102 3.0 5.9
103 2.9 5.6
104 3.0 5.8
105 3.0 6.6
106 2.5 4.5
107 2.9 6.3
108 2.5 5.8
109 3.6 6.1
110 3.2 5.1
111 2.7 5.3
112 3.0 5.5
113 2.5 5.0
114 2.8 5.1
115 3.2 5.3
116 3.0 5.5
117 3.8 6.7
118 2.6 6.9
119 2.2 5.0
120 3.2 5.7
121 2.8 4.9
122 2.8 6.7
123 2.7 4.9
124 3.3 5.7
125 3.2 6.0
126 2.8 4.8
127 3.0 4.9
128 2.8 5.6
129 3.0 5.8
130 2.8 6.1
131 3.8 6.4
132 2.8 5.6
133 2.8 5.1
134 2.6 5.6
135 3.0 6.1
136 3.4 5.6
137 3.1 5.5
138 3.0 4.8
139 3.1 5.4
140 3.1 5.6
141 3.1 5.1
142 2.7 5.1
143 3.2 5.9
144 3.3 5.7
145 3.0 5.2
146 2.5 5.0
147 3.0 5.2
148 3.4 5.4
149 3.0 5.1

select petalwidth , sepallength where class is Iris-setosa

In [ ]:
dataframe.loc[(dataframe['class']=='Iris-setosa'),['petalwidth','sepallength']]
Out[ ]:
petalwidth sepallength
0 0.2 5.1
1 0.2 4.9
2 0.2 4.7
3 0.2 4.6
4 0.2 5.0
5 0.4 5.4
6 0.3 4.6
7 0.2 5.0
8 0.2 4.4
9 0.1 4.9
10 0.2 5.4
11 0.2 4.8
12 0.1 4.8
13 0.1 4.3
14 0.2 5.8
15 0.4 5.7
16 0.4 5.4
17 0.3 5.1
18 0.3 5.7
19 0.3 5.1
20 0.2 5.4
21 0.4 5.1
22 0.2 4.6
23 0.5 5.1
24 0.2 4.8
25 0.2 5.0
26 0.4 5.0
27 0.2 5.2
28 0.2 5.2
29 0.2 4.7
30 0.2 4.8
31 0.4 5.4
32 0.1 5.2
33 0.2 5.5
34 0.1 4.9
35 0.2 5.0
36 0.2 5.5
37 0.1 4.9
38 0.2 4.4
39 0.2 5.1
40 0.3 5.0
41 0.3 4.5
42 0.2 4.4
43 0.6 5.0
44 0.4 5.1
45 0.3 4.8
46 0.2 5.1
47 0.2 4.6
48 0.2 5.3
49 0.2 5.0

select all rows with petalwidth > 0.1 and sepallength < 5.0

In [ ]:
mask = (dataframe['petalwidth'] > 0.1) & (dataframe['sepallength'] < 5.0)
dataframe[mask]
Out[ ]:
sepallength sepalwidth petallength petalwidth class
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
11 4.8 3.4 1.6 0.2 Iris-setosa
22 4.6 3.6 1.0 0.2 Iris-setosa
24 4.8 3.4 1.9 0.2 Iris-setosa
29 4.7 3.2 1.6 0.2 Iris-setosa
30 4.8 3.1 1.6 0.2 Iris-setosa
38 4.4 3.0 1.3 0.2 Iris-setosa
41 4.5 2.3 1.3 0.3 Iris-setosa
42 4.4 3.2 1.3 0.2 Iris-setosa
45 4.8 3.0 1.4 0.3 Iris-setosa
47 4.6 3.2 1.4 0.2 Iris-setosa
57 4.9 2.4 3.3 1.0 Iris-versicolor
106 4.9 2.5 4.5 1.7 Iris-virginica