# sample data
= {
data "Name" : [ 'Alice', 'Bob', 'Charlie', 'David', 'Eva'],
"Age" : [ 25, 30, 35, 40, 45 ],
"City" : [ 'New York', 'Los Angeles', 'Chicago', 'Houston',
'Phoenix' ]
}
Introduction to Pandas
data
{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'Age': [25, 30, 35, 40, 45],
'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']}
type(data)
dict
import pandas as pd
= pd.DataFrame(data) df
df.head()
Name | Age | City | |
---|---|---|---|
0 | Alice | 25 | New York |
1 | Bob | 30 | Los Angeles |
2 | Charlie | 35 | Chicago |
3 | David | 40 | Houston |
4 | Eva | 45 | Phoenix |
type(df)
pandas.core.frame.DataFrame
df.shape
(5, 3)
df.columns
Index(['Name', 'Age', 'City'], dtype='object')
df.index
RangeIndex(start=0, stop=5, step=1)
df.dtypes
Name object
Age int64
City object
dtype: object
Data Series
df.Age
0 25
1 30
2 35
3 40
4 45
Name: Age, dtype: int64
type(df.Age)
pandas.core.series.Series
type(df)
pandas.core.frame.DataFrame
Data Cleaning
import numpy as np
# create a dataframe with missing values
= {
data "Name": ['Alice', 'Bob', None, 'David', 'Eva'],
"Age" : [ 25, None, 35, 40, 'Unknown' ],
"Salary": [50000, 60000, 55000, None, 45000],
"City": ['New York', 'Los Angeles', np.nan, 'Houston',
'Phoenix' ]
}
= pd.DataFrame(data)
df df
Name | Age | Salary | City | |
---|---|---|---|---|
0 | Alice | 25 | 50000.0 | New York |
1 | Bob | None | 60000.0 | Los Angeles |
2 | None | 35 | 55000.0 | NaN |
3 | David | 40 | NaN | Houston |
4 | Eva | Unknown | 45000.0 | Phoenix |
'Name'].fillna('Unknown', inplace=True)
df['City'].fillna('Unknown', inplace=True)
df[ df
Name | Age | Salary | City | |
---|---|---|---|---|
0 | Alice | 25 | 50000.0 | New York |
1 | Bob | None | 60000.0 | Los Angeles |
2 | Unknown | 35 | 55000.0 | Unknown |
3 | David | 40 | NaN | Houston |
4 | Eva | Unknown | 45000.0 | Phoenix |
'Age'] = pd.to_numeric(df['Age'], errors='coerce') df[
df
Name | Age | Salary | City | |
---|---|---|---|---|
0 | Alice | 25.0 | 50000.0 | New York |
1 | Bob | NaN | 60000.0 | Los Angeles |
2 | Unknown | 35.0 | 55000.0 | Unknown |
3 | David | 40.0 | NaN | Houston |
4 | Eva | NaN | 45000.0 | Phoenix |
'Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df[ df
Name | Age | Salary | City | |
---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York |
1 | Bob | 33.333333 | 60000.0 | Los Angeles |
2 | Unknown | 35.000000 | 55000.0 | Unknown |
3 | David | 40.000000 | 52500.0 | Houston |
4 | Eva | 33.333333 | 45000.0 | Phoenix |
Handling dtypes
= {
fips_codes 'New York': '36061',
'Los Angeles': '06037',
'Unknown': '00000',
'Houston': '48201',
'Phoenix': '04013'
} fips_codes
{'New York': '36061',
'Los Angeles': '06037',
'Unknown': '00000',
'Houston': '48201',
'Phoenix': '04013'}
df
Name | Age | Salary | City | |
---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York |
1 | Bob | 33.333333 | 60000.0 | Los Angeles |
2 | Unknown | 35.000000 | 55000.0 | Unknown |
3 | David | 40.000000 | 52500.0 | Houston |
4 | Eva | 33.333333 | 45000.0 | Phoenix |
'FIPS'] = df['City'].map(fips_codes)
df[ df
Name | Age | Salary | City | FIPS | |
---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 |
= df.FIPS.astype(int)
fips_int
fips_int
0 36061
1 6037
2 0
3 48201
4 4013
Name: FIPS, dtype: int64
df.FIPS
0 36061
1 06037
2 00000
3 48201
4 04013
Name: FIPS, dtype: object
'fips_int'] = fips_int df[
df.head()
Name | Age | Salary | City | FIPS | fips_int | |
---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 |
'new_fips'] = df.fips_int.astype(str).str.zfill(5)
df[ df.head()
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
Indexing
df
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
3] df.loc[
Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
3] df.iloc[
Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
df
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
='Name') df.sort_values(by
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
= df.sort_values(by='Name')
dfsorted dfsorted
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3] dfsorted.loc[
Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
3] dfsorted.iloc[
Name Eva
Age 33.333333
Salary 45000.0
City Phoenix
FIPS 04013
fips_int 4013
new_fips 04013
Name: 4, dtype: object
dfsorted
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
=True, drop=True)
dfsorted.reset_index(inplace dfsorted
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
3 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
4 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
0,1], [2, 3]] df.iloc[[
Salary | City | |
---|---|---|
0 | 50000.0 | New York |
1 | 60000.0 | Los Angeles |
0,1], [0, 4]] df.iloc[[
Name | FIPS | |
---|---|---|
0 | Alice | 36061 |
1 | Bob | 06037 |
0,2]] df.iloc[[
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.0 | 50000.0 | New York | 36061 | 36061 | 36061 |
2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
0,2]].loc[:, ['City','FIPS']] df.iloc[[
City | FIPS | |
---|---|---|
0 | New York | 36061 |
2 | Unknown | 00000 |
Filtering
df
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
> 50000] df[df.Salary
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
> 50000 df.Salary
0 False
1 True
2 True
3 True
4 False
Name: Salary, dtype: bool
>= 35] df[df.Age
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.0 | 52500.0 | Houston | 48201 | 48201 | 48201 |
'Age'] >= 35) & (df['Salary'] > 53000) ] df[ (df[
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
'Age'] >= 35) | (df['Salary'] > 53000) ] df[ (df[
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
Dropping
df
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
!= 'Unknown'] df[df.Name
Name | Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|---|
0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
'Name', inplace=True)
df.set_index( df
Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|
Name | ||||||
Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
='Unknown') df.drop(index
Age | Salary | City | FIPS | fips_int | new_fips | |
---|---|---|---|---|---|---|
Name | ||||||
Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |