Regional Inequality Decomposition

import seaborn
import pandas
import geopandas
import pysal
import numpy
import mapclassify
import matplotlib.pyplot as plt
from pysal.explore import esda
from pysal.lib import weights

/opt/tljh/user/lib/python3.10/site-packages/numba/core/decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement.
  warnings.warn(msg, NumbaDeprecationWarning)
/opt/tljh/user/lib/python3.10/site-packages/quantecon/lss.py:20: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def simulate_linear_model(A, x0, v, ts_length):
/opt/tljh/user/lib/python3.10/site-packages/spaghetti/network.py:40: FutureWarning: The next major release of pysal/spaghetti (2.0.0) will drop support for all ``libpysal.cg`` geometries. This change is a first step in refactoring ``spaghetti`` that is expected to result in dramatically reduced runtimes for network instantiation and operations. Users currently requiring network and point pattern input as ``libpysal.cg`` geometries should prepare for this simply by converting to ``shapely`` geometries.
  warnings.warn(dep_msg, FutureWarning, stacklevel=1)

pci_df = geopandas.read_file(
    "uscountypcincome.gpkg"
)
pci_df.columns

ERROR 1: PROJ: proj_create_from_database: Open of /opt/tljh/user/share/proj failed

Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME', 'NAMELSAD', 'LSAD',
       'CLASSFP', 'MTFCC', 'CSAFP', 'CBSAFP', 'METDIVFP', 'FUNCSTAT', 'ALAND',
       'AWATER', 'INTPTLAT', 'INTPTLON', 'GeoFIPS', 'GeoName', 'Region',
       'TableName', 'LineCode', 'Descriptio', 'Unit', '1969', '1970', '1971',
       '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980',
       '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989',
       '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', 'index', 'IndustryCl', 'Descript_1', 'geometry'],
      dtype='object')

pci_df.shape

(3076, 77)

pci_df.query('NAME == "Jackson" & STATEFP == "28"').loc[
    :, "1969":"1979"
]

	1969	1970	1971	1972	1973	1974	1975	1976	1977	1978	1979
1417	2957	3121	3327	3939	4203	4547	5461	5927	6315	6619	6967

seaborn.histplot(x=pci_df["1969"], kde=True);

pci_df = pci_df.to_crs(
    # Albers Equal Area North America
    epsg=5070
)

ax = pci_df.plot(
    column="1969",
    scheme="Quantiles",
    legend=True,
    edgecolor="none",
    legend_kwds={"loc": "lower left"},
    figsize=(12, 12),
)
ax.set_axis_off()
plt.show()

top20, bottom20 = pci_df["1969"].quantile([0.8, 0.2])

top20/bottom20

1.5022494887525562

def ineq_20_20(values):
    top20, bottom20 = values.quantile([0.8, 0.2])
    return top20 / bottom20


# Generate range of strings from 1969 to 2018
years = numpy.arange(1969, 2018).astype(str)
# Compute 20:20 ratio for every year
ratio_2020 = pci_df[years].apply(ineq_20_20, axis=0)
# Plot evolution of 20:20 ratio
ax = plt.plot(years, ratio_2020)

# Grab figure generated in the plot
figure = plt.gcf()
# Replace tick labels with every other year
plt.xticks(years[::2])
# Set vertical label
plt.ylabel("20:20 ratio")
# Set horizontal label
plt.xlabel("Year")
# Rotate year labels
figure.autofmt_xdate(rotation=45)

plt.show()

from pysal.explore import inequality

n = len(pci_df)
share_of_population = numpy.arange(1, n + 1) / n

incomes = pci_df["1969"].sort_values()

shares = incomes / incomes.sum()

cumulative_share = shares.cumsum()

# Generate figure with one axis
f, ax = plt.subplots()
# Plot Lorenz Curve
ax.plot(share_of_population, cumulative_share, label="Lorenz Curve")
# Plot line of perfect equality
ax.plot((0, 1), (0, 1), color="r", label="Perfect Equality")
# Label horizontal axis
ax.set_xlabel("Share of population")
# Label vertical axis
ax.set_ylabel("Share of income")
# Add legend
ax.legend()

plt.show()

def lorenz(y):
    y = numpy.asarray(y)
    incomes = numpy.sort(y)
    income_shares = (incomes / incomes.sum()).cumsum()
    N = y.shape[0]
    pop_shares = numpy.arange(1, N + 1) / N
    return pop_shares, income_shares

lorenz(pci_df["1969"])

(array([3.25097529e-04, 6.50195059e-04, 9.75292588e-04, ...,
        9.99349805e-01, 9.99674902e-01, 1.00000000e+00]),
 array([1.22486441e-04, 2.52956561e-04, 3.83636778e-04, ...,
        9.98429316e-01, 9.99176315e-01, 1.00000000e+00]))

lorenz_curves = pci_df[years].apply(lorenz, axis=0)

# Set up figure with one axis
f, ax = plt.subplots()
# Plot line of perfect equality
ax.plot((0, 1), (0, 1), color="r")
# Loop over every year in the series
for year in lorenz_curves.columns:
    # Extract the two arrays or each dimension
    year_pop_shares, year_inc_shares = lorenz_curves[year].values
    # Plot Lorenz curve for a given year
    ax.plot(year_pop_shares, year_inc_shares, color="k", alpha=0.05)

g69 = inequality.gini.Gini(pci_df["1969"].values)

g69.g

0.13556175504269904

def gini_by_col(column):
    return inequality.gini.Gini(column.values).g

inequalities = (
    pci_df[years].apply(gini_by_col, axis=0).to_frame("gini")
)

inequalities.head()

	gini
1969	0.135562
1970	0.130076
1971	0.128540
1972	0.129126
1973	0.142166

inequalities.plot(figsize=(10, 3));

def theil(column):
    return inequality.theil.Theil(column.values).T


inequalities["theil"] = pci_df[years].apply(theil, axis=0)

_ = seaborn.regplot(x="theil", y="gini", data=inequalities)

wq = weights.Queen.from_dataframe(pci_df)

/tmp/ipykernel_268899/2833574135.py:1: FutureWarning: `use_index` defaults to False but will default to True in future. Set True/False directly to control this behavior and silence this warning
  wq = weights.Queen.from_dataframe(pci_df)

def moran_by_col(y, w=wq):
    mo = esda.Moran(y, w=w)
    mo_s = pandas.Series(
        {"I": mo.I, "I-P value": mo.p_sim},
    )
    return mo_s

moran_stats = pci_df[years].apply(moran_by_col, axis=0).T

moran_stats.head()

	I	I-P value
1969	0.649090	0.001
1970	0.647438	0.001
1971	0.626546	0.001
1972	0.606760	0.001
1973	0.640226	0.001

inequalities = inequalities.join(moran_stats)

inequalities[["I", "I-P value"]].plot(subplots=True, figsize=(10, 6))
plt.show()

region_names = {
    1: "New England",
    2: "Mideast",
    3: "Great Lakes",
    4: "Plains",
    5: "Southeast",
    6: "Southwest",
    7: "Rocky Mountain",
    8: "Far West",
}

ax = pci_df.assign(Region_Name=pci_df.Region.map(region_names)).plot(
    "Region_Name",
    linewidth=0,
    legend=True,
    categorical=True,
    legend_kwds=dict(bbox_to_anchor=(1.2, 0.5)),
)
ax.set_axis_off();

theil_dr = inequality.theil.TheilD(
    pci_df[years].values, pci_df.Region
)

theil_dr.bg

array([0.00914353, 0.00822696, 0.00782675, 0.00768201, 0.01022634,
       0.0081274 , 0.00783943, 0.00572543, 0.00560271, 0.0054971 ,
       0.00511791, 0.00566001, 0.00486877, 0.00466134, 0.00474425,
       0.00424528, 0.00428434, 0.00453503, 0.00465829, 0.00456699,
       0.00467363, 0.00412391, 0.00366334, 0.00342112, 0.00327131,
       0.00312475, 0.00326071, 0.00359733, 0.00327591, 0.00363014,
       0.00382409, 0.00436261, 0.00399156, 0.00402506, 0.00397   ,
       0.00394649, 0.00353368, 0.00362698, 0.00400508, 0.00449814,
       0.0043533 , 0.00470988, 0.0063954 , 0.00642426, 0.00694236,
       0.00644971, 0.00591871, 0.00554072, 0.00528702])

inequalities["theil_between"] = theil_dr.bg
inequalities["theil_within"] = theil_dr.wg

inequalities["theil_between_share"] = (
    inequalities["theil_between"] / inequalities["theil"]
)

inequalities[
    ["theil_between", "theil_within", "theil_between_share"]
].plot(subplots=True, figsize=(10, 8));

theil_drs = inequality.theil.TheilDSim(
    pci_df[years].values, pci_df.Region
)

theil_drs.bg_pvalue

array([0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01])

sim = theil_drs.bg[1:,:]

sim.shape

(99, 49)

import pandas as pd
df = pd.DataFrame(index=years, data={'mean':sim.mean(axis=0), 'std': sim.std(axis=0)})

df.plot(kind='bar', y='mean', yerr='std', rot=0)

<Axes: >

fig, ax = plt.subplots()
ax.plot(years, theil_drs.bg[0], 'o', color='r')
ax.fill_between(years, df['mean']- 1.96*df['std'], df['mean']+ 1.96*df['std'], alpha=0.2)
ax.plot(years, df['mean']);

import matplotlib.ticker as mticker

myLocator = mticker.MultipleLocator(5)
ax.xaxis.set_major_locator(myLocator)
fig, ax = plt.subplots()
ax.plot(years, theil_drs.bg[0], 'o', color='r')
ax.fill_between(years, df['mean']- 1.96*df['std'], df['mean']+ 1.96*df['std'], alpha=0.2)
ax.plot(years, df['mean'])
myLocator = mticker.MultipleLocator(5)
ax.xaxis.set_major_locator(myLocator);

df.columns

Index(['mean', 'std'], dtype='object')

df.head()

	mean	std
1969	0.000071	0.000037
1970	0.000067	0.000034
1971	0.000065	0.000036
1972	0.000068	0.000036
1973	0.000084	0.000046