# Basic and tricky pandas functions
# Main preprocessing techniques with the pandas package
import pandas as pd
import numpy as np
# Read the CSV file and show the first five samples
# Remote location of the Iris data set in CSV form.
dataset_path = 'https://datahub.io/machine-learning/iris/r/iris.csv'
# Load the CSV into a DataFrame; every later step works on this frame.
dataframe = pd.read_csv(filepath_or_buffer=dataset_path)
# Preview the first five rows (five is also the default for head()).
dataframe.head(n=5)
# Save the dataframe as a TSV (tab-separated) file
# Write the frame as tab-separated text. index=False leaves out the
# automatically generated row index, which would otherwise be written as an
# unnamed leading column; this matches how first_ten_rows.csv is saved.
dataframe.to_csv('iris.tsv', sep='\t', index=False)
# Sort by petalwidth, then by petallength
# Ascending sort: petalwidth is the primary key and petallength breaks ties.
# The sorted result replaces `dataframe`; rows keep their original index
# labels, so positional (iloc) and label-based (loc) lookups differ from here.
dataframe = dataframe.sort_values(by=['petalwidth', 'petallength'], ascending=True)
# Preview the ten rows that now come first.
dataframe.head(n=10)
# Numerical data ranks: convert values to their ranks, e.g. dataframe['sepallength'].rank()
# Select sepallength of the first five samples
# First five sepallength values in the frame's current row order.
dataframe.sepallength.head(n=5)
# Group by: get the mean of each class using the groupby function
# One output row per distinct value of `class`, holding the mean of every
# other column within that group.
dataframe.groupby(by=['class']).mean()
# Save the first ten rows to a CSV file
# Take the first ten rows by position and write them without the index column.
dataframe.iloc[:10].to_csv(path_or_buf='first_ten_rows.csv', index=False)
# Get an overview of the dataset's main statistics using the describe function
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# include=None is the default: for a frame with mixed dtypes only the numeric
# columns are summarized.
dataframe.describe(include=None)
# Find the memory usage of each column
# Bytes used by the index and by each column. These are the defaults spelled
# out: deep=False does not inspect the contents of object (e.g. string)
# columns, so those figures are a lower bound.
dataframe.memory_usage(index=True, deep=False)
# Convert the type of sepalwidth to int
# Overwrite sepalwidth with its integer cast; any fractional part is dropped.
# NOTE(review): assumes the column has no missing values - confirm, since
# NaN cannot be cast to int.
dataframe['sepalwidth'] = dataframe['sepalwidth'].astype(int)
# Preview the frame to see the converted column.
dataframe.head(n=5)
# Count the occurrences of each petalwidth value
# Frequency of every distinct petalwidth value, most common first.
dataframe.petalwidth.value_counts()
# Merge two dataframes with an outer join
import random
# Two toy tables with partially overlapping ids: id1 covers 0..99 and id2
# covers 40..139. The code columns hold random integers between 0 and 20.
data1 = [[row_id, random.randint(0, 20)] for row_id in range(100)]
data2 = [[random.randint(0, 20), row_id] for row_id in range(40, 140)]
dataframe1 = pd.DataFrame(data1, columns=['id1', 'code1'])
dataframe2 = pd.DataFrame(data2, columns=['code2', 'id2'])
# Outer join on id1 == id2: ids present in only one table are kept, with the
# other table's columns left missing. The suffixes only apply to column names
# shared by both tables; none are shared here, so the names stay unchanged.
merged = pd.merge(
    dataframe1,
    dataframe2,
    how='outer',
    left_on='id1',
    right_on='id2',
    suffixes=('_left', '_right'),
)
# Select columns 2 and 3 (positions 1 and 2) for the rows at positions 100 through 149
# Purely positional selection; both slices exclude their end position, so
# this yields row positions 100..149 and column positions 1 and 2.
dataframe.iloc[100:150, 1:3]
# Select petalwidth and sepallength where class is Iris-setosa
# Row filter: class equals 'Iris-setosa'; column filter: the two named
# columns, returned in the order listed.
dataframe.loc[dataframe['class'].eq('Iris-setosa'), ['petalwidth', 'sepallength']]
# Select all rows with petalwidth > 0.1 and sepallength < 5.0
# Boolean row mask: True only where both conditions hold for that row.
mask = dataframe['petalwidth'].gt(0.1) & dataframe['sepallength'].lt(5.0)
# Keep only the rows flagged True by the mask.
dataframe.loc[mask]