---
title: Geocoding addresses
execute:
  eval: false
  echo: true
---
```{python}
import osmnx as ox
import pandas as pd
```
```{python}
import pandas as pd
```
```{python}
# Load the input table; its `address` column is what gets geocoded further down.
df = pd.read_csv('~/data/385/financial_ao.csv' )
```
```{python}
# Preview the first rows of the raw table.
df.head()
```
```{python}
import geopandas
```
```{python}
%% time
results = geopandas.tools.geocode(df.address, provider= 'nominatim' , user_agent= 'geog385' )
```
```{python}
# (rows, columns) of the geocoding output.
results.shape
```
```{python}
# (rows, columns) of the input table, for comparison with the geocoding output.
df.shape
```
```{python}
# Show the container type of the input table and of the geocoding output side by side.
type (df), type (results)
```
```{python}
# Interactive map of the geocoded points; handy for spotting locations that landed in the wrong place.
results.explore()
```
```{python}
# Preview the first rows of the geocoding output.
results.head()
```
```{python}
# Continue under the name `gdf`. This binds the same object; it does not copy it.
gdf = results
```
### Drop the obvious error
- Use a point-in-polygon test: keep only the geocoded points that fall inside the San Diego County boundary.
```{python}
import osmnx as ox
# Look up the San Diego County boundary by place name; the result is used
# below as the polygon layer for the point-in-polygon filter.
county_boundary = ox.geocode_to_gdf("San Diego County, California, USA" )
```
```{python}
# Quick static plot to check the boundary that was retrieved.
county_boundary.plot()
```
```{python}
# Inner spatial join with predicate "within": only points lying inside the
# county boundary are kept, and the boundary's attribute columns are appended
# to each retained point.
points_in_county = geopandas.sjoin(gdf, county_boundary, how= "inner" , predicate= "within" )
```
```{python}
# Interactive map of the points that survived the county filter.
points_in_county.explore()
```
```{python}
# From here on `gdf` refers to the county-filtered points (same object, no copy).
gdf = points_in_county
```
```{python}
# Preview the filtered points, including the columns added by the spatial join.
gdf.head()
```
```{python}
# Write the county-filtered points to disk as GeoJSON. The driver is passed
# explicitly so the output format does not depend on geopandas inferring it
# from the `.json` file extension.
gdf.to_file('sd_banks.json', driver='GeoJSON')
```
```{python}
# Split each address on commas, take the second-to-last piece and trim whitespace.
# NOTE(review): presumably that piece is the postal code in the geocoder's
# address format — verify against the actual `address` values.
gdf.address.str .split("," ).str [- 2 ].str .strip()
```
```{python}
# Store the second-to-last comma-separated piece of each address, stripped of
# surrounding whitespace, in a new `zipcode` column.
address_parts = gdf.address.str.split(",")
gdf['zipcode'] = address_parts.str[-2].str.strip()
```
```{python}
# Group by `zipcode` and count the non-null values in every other column.
gdf.groupby(by= 'zipcode' ).count()
```
```{python}
from geosnap import DataStore
```
```{python}
# Create a geosnap DataStore, passing a local directory path.
# NOTE(review): this path is machine-specific — confirm it exists where the document is run.
datasets = DataStore("/srv/data/geosnap" )
```
```{python}
# List the attributes and methods available on the data store object.
dir (datasets)
```
```{python}
from geosnap import io as gio
```
```{python}
from geosnap.io import get_acs
# Request tract-level ACS data for state FIPS '06' and the year 2016.
ca = get_acs(datasets, state_fips= ['06' ], level= 'tract' , years= [2016 ])
```
```{python}
# Keep only the rows whose `geoid` starts with '06073'.
in_county = ca.geoid.str.startswith('06073')
sd = ca[in_county]
```
```{python}
# Quick static plot of the selected tracts.
sd.plot()
```
```{python}
# Coordinate reference system of the tract layer.
sd.crs
```
```{python}
# Coordinate reference system of the point layer.
gdf.crs
```
```{python}
# Reproject the tracts into the points' CRS so both layers share one
# coordinate system before they are spatially joined.
sd = sd.to_crs(gdf.crs)
```
```{python}
# Sanity check that the two layers now report the same CRS.
sd.crs == gdf.crs
```
```{python}
# Work under the names `tracts` / `points` for the join below.
tracts = sd
# Keep only the columns needed from the point layer.
# NOTE(review): `name` is not created anywhere in this document; presumably it
# was appended by the earlier county spatial join — confirm it is the intended column.
points = gdf[['geometry' , 'address' , 'name' ]]
```
```{python}
# Confirm which columns the trimmed point layer carries.
points.columns
```
```{python}
# Inner spatial join with predicate "within": each point receives the
# attributes of the tract that contains it, and points inside no tract are
# dropped. `lsuffix` / `rsuffix` disambiguate column names present in both layers.
points_with_tracts = geopandas.sjoin(points, tracts, how= "inner" , predicate= "within" , lsuffix= "left" , rsuffix= "right" )
```
```{python}
# Preview the points together with their joined tract attributes.
points_with_tracts.head()
```
```{python}
# (rows, columns) of the joined table.
points_with_tracts.shape
```
```{python}
# Preview the tract layer.
tracts.head()
```
```{python}
# Count the joined points per tract, grouping on the tract identifier column `geoid`.
# `reset_index(name=...)` turns the result into a table with columns `geoid` and `point_count`.
tract_counts = points_with_tracts.groupby('geoid' ).size().reset_index(name= 'point_count' )
```
```{python}
# Left-merge the per-tract counts so that tracts without any points are kept;
# their missing counts are filled with 0 and the column is cast back to int.
merged = tracts.merge(tract_counts, on='geoid', how='left')
merged['point_count'] = merged['point_count'].fillna(0).astype(int)
tracts = merged
```
```{python}
# Map of the tracts coloured by `point_count`, with a legend.
tracts.plot('point_count' , legend= True )
```
```{python}
# The ten tracts with the highest point counts.
tracts[['geoid' , 'point_count' ]].sort_values(by= 'point_count' , ascending= False ).head(10 )
```
```{python}
# Distribution of counts: for each distinct `point_count` value, how many tracts have it.
tracts[['geoid' , 'point_count' ]].groupby(by= 'point_count' ).count()
```