Introduction to Pandas

# sample data

data = {
    "Name" : [ 'Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    "Age" : [ 25, 30, 35, 40, 45 ],
    "City" : [ 'New York', 'Los Angeles', 'Chicago',  'Houston',
              'Phoenix' ]
}
    
            
data
{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
 'Age': [25, 30, 35, 40, 45],
 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']}
type(data)
dict
import pandas as pd
df = pd.DataFrame(data)
df.head()
Name Age City
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
3 David 40 Houston
4 Eva 45 Phoenix
type(df)
pandas.core.frame.DataFrame
df.shape
(5, 3)
df.columns
Index(['Name', 'Age', 'City'], dtype='object')
df.index
RangeIndex(start=0, stop=5, step=1)
df.dtypes
Name    object
Age      int64
City    object
dtype: object

Data Series

df.Age
0    25
1    30
2    35
3    40
4    45
Name: Age, dtype: int64
type(df.Age)
pandas.core.series.Series
type(df)
pandas.core.frame.DataFrame

Data Cleaning

import numpy as np
# create a dataframe with missing values

data = {
    "Name": ['Alice', 'Bob', None, 'David', 'Eva'], 
    "Age" : [ 25, None, 35, 40, 'Unknown' ],
    "Salary": [50000,  60000, 55000, None, 45000],
    "City": ['New York', 'Los Angeles', np.nan, 'Houston',
             'Phoenix' ]
}
df = pd.DataFrame(data)
df
Name Age Salary City
0 Alice 25 50000.0 New York
1 Bob None 60000.0 Los Angeles
2 None 35 55000.0 NaN
3 David 40 NaN Houston
4 Eva Unknown 45000.0 Phoenix
df['Name'].fillna('Unknown', inplace=True)
df['City'].fillna('Unknown', inplace=True)
df
Name Age Salary City
0 Alice 25 50000.0 New York
1 Bob None 60000.0 Los Angeles
2 Unknown 35 55000.0 Unknown
3 David 40 NaN Houston
4 Eva Unknown 45000.0 Phoenix
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df
Name Age Salary City
0 Alice 25.0 50000.0 New York
1 Bob NaN 60000.0 Los Angeles
2 Unknown 35.0 55000.0 Unknown
3 David 40.0 NaN Houston
4 Eva NaN 45000.0 Phoenix
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df
Name Age Salary City
0 Alice 25.000000 50000.0 New York
1 Bob 33.333333 60000.0 Los Angeles
2 Unknown 35.000000 55000.0 Unknown
3 David 40.000000 52500.0 Houston
4 Eva 33.333333 45000.0 Phoenix

Handling dtypes

fips_codes = {
    'New York': '36061',
    'Los Angeles': '06037',
    'Unknown': '00000',
    'Houston': '48201',
    'Phoenix': '04013'
}
fips_codes
{'New York': '36061',
 'Los Angeles': '06037',
 'Unknown': '00000',
 'Houston': '48201',
 'Phoenix': '04013'}
df
Name Age Salary City
0 Alice 25.000000 50000.0 New York
1 Bob 33.333333 60000.0 Los Angeles
2 Unknown 35.000000 55000.0 Unknown
3 David 40.000000 52500.0 Houston
4 Eva 33.333333 45000.0 Phoenix
df['FIPS'] = df['City'].map(fips_codes)
df
Name Age Salary City FIPS
0 Alice 25.000000 50000.0 New York 36061
1 Bob 33.333333 60000.0 Los Angeles 06037
2 Unknown 35.000000 55000.0 Unknown 00000
3 David 40.000000 52500.0 Houston 48201
4 Eva 33.333333 45000.0 Phoenix 04013
fips_int = df.FIPS.astype(int)
fips_int
    
0    36061
1     6037
2        0
3    48201
4     4013
Name: FIPS, dtype: int64
df.FIPS
0    36061
1    06037
2    00000
3    48201
4    04013
Name: FIPS, dtype: object
df['fips_int'] = fips_int
df.head()
Name Age Salary City FIPS fips_int
0 Alice 25.000000 50000.0 New York 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037
2 Unknown 35.000000 55000.0 Unknown 00000 0
3 David 40.000000 52500.0 Houston 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013
df['new_fips'] = df.fips_int.astype(str).str.zfill(5)
df.head()
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013

Indexing

df
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df.loc[3]
Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object
df.iloc[3]
Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object
df
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df.sort_values(by='Name')
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
dfsorted = df.sort_values(by='Name')
dfsorted
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
dfsorted.loc[3]
Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object
dfsorted.iloc[3]
Name              Eva
Age         33.333333
Salary        45000.0
City          Phoenix
FIPS            04013
fips_int         4013
new_fips        04013
Name: 4, dtype: object
dfsorted
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
dfsorted.reset_index(inplace=True, drop=True)
dfsorted
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 David 40.000000 52500.0 Houston 48201 48201 48201
3 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
4 Unknown 35.000000 55000.0 Unknown 00000 0 00000
df.iloc[[0,1], [2, 3]]
Salary City
0 50000.0 New York
1 60000.0 Los Angeles
df.iloc[[0,1], [0, 4]]
Name FIPS
0 Alice 36061
1 Bob 06037
df.iloc[[0,2]]
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.0 50000.0 New York 36061 36061 36061
2 Unknown 35.0 55000.0 Unknown 00000 0 00000
df.iloc[[0,2]].loc[:, ['City','FIPS']]
City FIPS
0 New York 36061
2 Unknown 00000

Filtering

df
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df[df.Salary > 50000]
Name Age Salary City FIPS fips_int new_fips
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
df.Salary > 50000
0    False
1     True
2     True
3     True
4    False
Name: Salary, dtype: bool
df[df.Age >= 35]
Name Age Salary City FIPS fips_int new_fips
2 Unknown 35.0 55000.0 Unknown 00000 0 00000
3 David 40.0 52500.0 Houston 48201 48201 48201
df[ (df['Age'] >= 35) & (df['Salary'] > 53000) ]
Name Age Salary City FIPS fips_int new_fips
2 Unknown 35.0 55000.0 Unknown 00000 0 00000
df[ (df['Age'] >= 35) | (df['Salary'] > 53000) ]
Name Age Salary City FIPS fips_int new_fips
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201

Dropping

df
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
2 Unknown 35.000000 55000.0 Unknown 00000 0 00000
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df[df.Name != 'Unknown']
Name Age Salary City FIPS fips_int new_fips
0 Alice 25.000000 50000.0 New York 36061 36061 36061
1 Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
3 David 40.000000 52500.0 Houston 48201 48201 48201
4 Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df.set_index('Name', inplace=True)
df
Age Salary City FIPS fips_int new_fips
Name
Alice 25.000000 50000.0 New York 36061 36061 36061
Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
Unknown 35.000000 55000.0 Unknown 00000 0 00000
David 40.000000 52500.0 Houston 48201 48201 48201
Eva 33.333333 45000.0 Phoenix 04013 4013 04013
df.drop(index='Unknown')
Age Salary City FIPS fips_int new_fips
Name
Alice 25.000000 50000.0 New York 36061 36061 36061
Bob 33.333333 60000.0 Los Angeles 06037 6037 06037
David 40.000000 52500.0 Houston 48201 48201 48201
Eva 33.333333 45000.0 Phoenix 04013 4013 04013