# sample data
data = {
"Name" : [ 'Alice', 'Bob', 'Charlie', 'David', 'Eva'],
"Age" : [ 25, 30, 35, 40, 45 ],
"City" : [ 'New York', 'Los Angeles', 'Chicago', 'Houston',
'Phoenix' ]
}
Introduction to Pandas
data{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'Age': [25, 30, 35, 40, 45],
'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']}
type(data)dict
import pandas as pddf = pd.DataFrame(data)df.head()| Name | Age | City | |
|---|---|---|---|
| 0 | Alice | 25 | New York |
| 1 | Bob | 30 | Los Angeles |
| 2 | Charlie | 35 | Chicago |
| 3 | David | 40 | Houston |
| 4 | Eva | 45 | Phoenix |
type(df)pandas.core.frame.DataFrame
df.shape(5, 3)
df.columnsIndex(['Name', 'Age', 'City'], dtype='object')
df.indexRangeIndex(start=0, stop=5, step=1)
df.dtypesName object
Age int64
City object
dtype: object
Data Series
df.Age0 25
1 30
2 35
3 40
4 45
Name: Age, dtype: int64
type(df.Age)pandas.core.series.Series
type(df)pandas.core.frame.DataFrame
Data Cleaning
import numpy as np# create a dataframe with missing values
data = {
"Name": ['Alice', 'Bob', None, 'David', 'Eva'],
"Age" : [ 25, None, 35, 40, 'Unknown' ],
"Salary": [50000, 60000, 55000, None, 45000],
"City": ['New York', 'Los Angeles', np.nan, 'Houston',
'Phoenix' ]
}df = pd.DataFrame(data)
df| Name | Age | Salary | City | |
|---|---|---|---|---|
| 0 | Alice | 25 | 50000.0 | New York |
| 1 | Bob | None | 60000.0 | Los Angeles |
| 2 | None | 35 | 55000.0 | NaN |
| 3 | David | 40 | NaN | Houston |
| 4 | Eva | Unknown | 45000.0 | Phoenix |
df['Name'].fillna('Unknown', inplace=True)
df['City'].fillna('Unknown', inplace=True)
df| Name | Age | Salary | City | |
|---|---|---|---|---|
| 0 | Alice | 25 | 50000.0 | New York |
| 1 | Bob | None | 60000.0 | Los Angeles |
| 2 | Unknown | 35 | 55000.0 | Unknown |
| 3 | David | 40 | NaN | Houston |
| 4 | Eva | Unknown | 45000.0 | Phoenix |
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')df| Name | Age | Salary | City | |
|---|---|---|---|---|
| 0 | Alice | 25.0 | 50000.0 | New York |
| 1 | Bob | NaN | 60000.0 | Los Angeles |
| 2 | Unknown | 35.0 | 55000.0 | Unknown |
| 3 | David | 40.0 | NaN | Houston |
| 4 | Eva | NaN | 45000.0 | Phoenix |
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df| Name | Age | Salary | City | |
|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown |
| 3 | David | 40.000000 | 52500.0 | Houston |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix |
Handling dtypes
fips_codes = {
'New York': '36061',
'Los Angeles': '06037',
'Unknown': '00000',
'Houston': '48201',
'Phoenix': '04013'
}
fips_codes{'New York': '36061',
'Los Angeles': '06037',
'Unknown': '00000',
'Houston': '48201',
'Phoenix': '04013'}
df| Name | Age | Salary | City | |
|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown |
| 3 | David | 40.000000 | 52500.0 | Houston |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix |
df['FIPS'] = df['City'].map(fips_codes)
df| Name | Age | Salary | City | FIPS | |
|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 |
fips_int = df.FIPS.astype(int)
fips_int
0 36061
1 6037
2 0
3 48201
4 4013
Name: FIPS, dtype: int64
df.FIPS0 36061
1 06037
2 00000
3 48201
4 04013
Name: FIPS, dtype: object
df['fips_int'] = fips_intdf.head()| Name | Age | Salary | City | FIPS | fips_int | |
|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 |
df['new_fips'] = df.fips_int.astype(str).str.zfill(5)
df.head()| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
Indexing
df| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df.loc[3]Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
df.iloc[3]Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
df| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df.sort_values(by='Name')| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
dfsorted = df.sort_values(by='Name')
dfsorted| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
dfsorted.loc[3]Name David
Age 40.0
Salary 52500.0
City Houston
FIPS 48201
fips_int 48201
new_fips 48201
Name: 3, dtype: object
dfsorted.iloc[3]Name Eva
Age 33.333333
Salary 45000.0
City Phoenix
FIPS 04013
fips_int 4013
new_fips 04013
Name: 4, dtype: object
dfsorted| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
dfsorted.reset_index(inplace=True, drop=True)
dfsorted| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 3 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
| 4 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
df.iloc[[0,1], [2, 3]]| Salary | City | |
|---|---|---|
| 0 | 50000.0 | New York |
| 1 | 60000.0 | Los Angeles |
df.iloc[[0,1], [0, 4]]| Name | FIPS | |
|---|---|---|
| 0 | Alice | 36061 |
| 1 | Bob | 06037 |
df.iloc[[0,2]]| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.0 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
df.iloc[[0,2]].loc[:, ['City','FIPS']]| City | FIPS | |
|---|---|---|
| 0 | New York | 36061 |
| 2 | Unknown | 00000 |
Filtering
df| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df[df.Salary > 50000]| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
df.Salary > 500000 False
1 True
2 True
3 True
4 False
Name: Salary, dtype: bool
df[df.Age >= 35]| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.0 | 52500.0 | Houston | 48201 | 48201 | 48201 |
df[ (df['Age'] >= 35) & (df['Salary'] > 53000) ]| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 2 | Unknown | 35.0 | 55000.0 | Unknown | 00000 | 0 | 00000 |
df[ (df['Age'] >= 35) | (df['Salary'] > 53000) ]| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
Dropping
df| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 2 | Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df[df.Name != 'Unknown']| Name | Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|---|
| 0 | Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| 1 | Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| 3 | David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| 4 | Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df.set_index('Name', inplace=True)
df| Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|
| Name | ||||||
| Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| Unknown | 35.000000 | 55000.0 | Unknown | 00000 | 0 | 00000 |
| David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |
df.drop(index='Unknown')| Age | Salary | City | FIPS | fips_int | new_fips | |
|---|---|---|---|---|---|---|
| Name | ||||||
| Alice | 25.000000 | 50000.0 | New York | 36061 | 36061 | 36061 |
| Bob | 33.333333 | 60000.0 | Los Angeles | 06037 | 6037 | 06037 |
| David | 40.000000 | 52500.0 | Houston | 48201 | 48201 | 48201 |
| Eva | 33.333333 | 45000.0 | Phoenix | 04013 | 4013 | 04013 |