Predictive Modelling

Random Forest

We’ll extend our analysis with a hyperparameter grid search to find the best model configuration, choosing variables based on their correlations with the target and piping those into a random forest model. Factors like the percentage of people taking public transit and the share of residents with longer commute times have a negative correlation with the number of parking meters per street, indicating that as these variables increase, streets tend to have fewer meters.

Code
# Import packages

import altair as alt
import geopandas as gpd
import pandas as pd
import numpy as np
import hvplot.pandas
from matplotlib import pyplot as plt
import holoviews as hv
from shapely.geometry import Polygon
from shapely.geometry import MultiPolygon
import requests
import geoviews as gv
import geoviews.tile_sources as gvts
import folium
from folium import plugins
from shapely.geometry import Point
import xyzservices
import osmnx as ox
import networkx as nx
import pygris
import cenpy

import seaborn as sns
import contextily as ctx

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

np.random.seed(42)



%matplotlib inline

# Show more rows and wider columns in output
pd.options.display.max_rows = 9999 
pd.options.display.max_colwidth = 200

# Hide warnings due to issue in shapely package 
# See: https://github.com/shapely/shapely/issues/1345
np.seterr(invalid="ignore");
Code
# Bring in the data
# Join the census and OSM data

#columns_to_drop = ['in']

#sf_final.drop(columns=columns_to_drop, inplace=True)
sf_final = gpd.read_file("./data/census.geojson")
merged_gdf = gpd.read_file("./data/merged_gdf.geojson")

sf_final = sf_final.to_crs('EPSG:3857')

merged_gdf = merged_gdf.to_crs('EPSG:3857')

# Use 'predicate' (the 'op' keyword is deprecated in recent geopandas releases)
final_gdf = gpd.sjoin(merged_gdf, sf_final, how='left', predicate='intersects')

#columns_to_drop = ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'GEOID', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'INTPTLAT', 'INTPTLON']

#final_gdf.drop(columns=columns_to_drop, inplace=True)


#final_gdf.head()
Code
# Select the variables with strong correlations with the target

cols = [
    "truecount",
    "Total_Pop",
    "Travel_Time",
    "Means_of_Transport",
    "Total_Public_Trans",
    "workforce_16",
    "households_car_pct",
    "commute_30_90min_pct",
    "Drive_Work_pct",
]

# Trim to these columns and remove NaNs
model_data = final_gdf[cols + ["geometry"]].dropna()
model_data.head()
truecount Total_Pop Travel_Time Means_of_Transport Total_Public_Trans workforce_16 households_car_pct commute_30_90min_pct Drive_Work_pct geometry
0 0.350877 7621.0 135615.0 4821.0 1299.0 4821.0 0.240937 0.416719 0.196225 LINESTRING (-13623984.058 4546847.107, -13623982.756 4546832.729)
1 0.090023 4218.0 59975.0 2653.0 554.0 2653.0 0.120824 0.390124 0.354693 LINESTRING (-13629159.824 4541826.151, -13629200.645 4541836.792)
2 0.087558 3772.0 54970.0 2319.0 467.0 2319.0 0.414820 0.339802 0.149202 LINESTRING (-13625599.493 4548143.825, -13625630.050 4548174.641)
3 0.488599 2024.0 41265.0 1402.0 464.0 1402.0 0.211877 0.399429 0.048502 LINESTRING (-13627260.424 4549700.424, -13627229.711 4549705.397)
4 0.031525 3772.0 54970.0 2319.0 467.0 2319.0 0.414820 0.339802 0.149202 LINESTRING (-13625103.854 4548726.940, -13625181.255 4548634.529)
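
The “good correlations” criterion can be made concrete by ranking the numeric columns of final_gdf by the absolute value of their correlation with truecount. The snippet below is a minimal sketch of that screening step, assuming the final_gdf built above; it is illustrative rather than part of the original selection.

Code
# Rank numeric columns by the strength of their correlation with the meter count
numeric = final_gdf.select_dtypes("number")

corr_with_target = (
    numeric.corr()["truecount"]
    .drop("truecount")
    .sort_values(key=abs, ascending=False)
)
corr_with_target.head(10)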

Splitting the data

Randomly split the data into 70% training and 30% test sets.

Code
# Split the data 70/30
train_set, test_set = train_test_split(model_data, test_size=0.3, random_state=42)

# the target labels
y_train = train_set["truecount"]
y_test = test_set["truecount"]

y_test.head()
2511    0.122466
3603    0.326433
1275    0.019455
1237    0.059726
2502    0.129769
Name: truecount, dtype: float64

Transforming the columns

Code
# Set up the column transformer to standardize the numeric columns
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), cols)
    ]
)
Code
pipe = make_pipeline(
    transformer, RandomForestRegressor(random_state=42)
)
pipe.fit(train_set, y_train);
pipe.score(test_set, y_test)
0.9903700341139564
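
To connect the fitted forest back to the opening claim about which neighborhood factors matter, we can read off its feature importances. This is a small illustrative sketch, not part of the original analysis; because the column transformer only standardizes the columns, the output order matches cols.

Code
# Feature importances from the fitted random forest, labeled with the input columns
rf = pipe.named_steps["randomforestregressor"]
importances = pd.Series(rf.feature_importances_, index=cols).sort_values(ascending=False)
print(importances)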

Optimize hyperparameters

Code
param_grid = {
    "randomforestregressor__n_estimators": [5, 10, 25, 50, 100, 200], #Reduce estimators to reduce run time
    "randomforestregressor__max_depth": [None, 2, 5, 7, 9],#Reduce depth to reduce run time
}

grid = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1)  # Utilize all available processors

# Run the search
grid.fit(train_set, y_train)
GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['truecount',
                                                                          'Total_Pop',
                                                                          'Travel_Time',
                                                                          'Means_of_Transport',
                                                                          'Total_Public_Trans',
                                                                          'workforce_16',
                                                                          'households_car_pct',
                                                                          'commute_30_90min_pct',
                                                                          'Drive_Work_pct'])])),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestregressor__max_depth': [None, 2, 5, 7, 9],
                         'randomforestregressor__n_estimators': [5, 10, 25, 50,
                                                                 100, 200]})
Code
best_random = grid.best_estimator_
grid.score(test_set, y_test)
0.999872947311154
Code
print("Best R^2 score:", grid.best_score_)
Best R^2 score: 0.9999591625411464
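
Since the stated goal of the grid search is to find the best model configuration, it is also worth looking at which hyperparameters were actually selected. A small sketch, assuming the fitted grid above:

Code
# Hyperparameters chosen by the grid search
print(grid.best_params_)

# Cross-validation results for every combination, best-ranked first
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values("rank_test_score")[["params", "mean_test_score", "std_test_score"]].head()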

High R²: the model is likely overfit. The near-perfect scores also reflect target leakage: truecount, the value we are predicting, is included in the feature columns handed to the ColumnTransformer, so the model can effectively read the answer off its inputs.
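
As a sanity check (an illustrative sketch, not part of the original analysis), we can rebuild the same pipeline with truecount dropped from the feature list and re-score it; a leakage-free model should report a far more modest R².

Code
# Re-fit the same pipeline without the target in the feature list
feature_cols = [c for c in cols if c != "truecount"]

leak_free = make_pipeline(
    ColumnTransformer(transformers=[("num", StandardScaler(), feature_cols)]),
    RandomForestRegressor(random_state=42),
)
leak_free.fit(train_set, y_train)
leak_free.score(test_set, y_test)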

Code
# Predictions on the test set
predictions = grid.best_estimator_.predict(test_set)

X = model_data.loc[test_set.index]

# Attach the predictions (truecount was never log-transformed, so no back-transform is needed)
X['prediction'] = predictions

#X.head()
ValueError: Length of values (1615) does not match length of index (1751)
Code
# Ensure test_set indices are in model_data
common_indices = test_set.index.intersection(model_data.index)

# Filter model_data based on common indices
X = model_data.loc[common_indices]

# Check the length of predictions before assigning the 'prediction' column
if len(predictions) == len(common_indices):
    X['prediction'] = predictions
else:
    print("Length mismatch between predictions and common_indices.")

#X.head()
Length mismatch between predictions and common_indices.
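
The mismatch happens because the spatial join leaves duplicate index labels in model_data, so label-based .loc lookups return more rows than there are predictions. A simpler alignment, sketched below under the assumption that the fitted grid from above is still in scope, attaches the predictions directly to test_set, whose rows correspond one-to-one with the output of predict(). Alternatively, calling model_data.reset_index(drop=True) before the train/test split would remove the duplicate labels at the source.

Code
# Attach predictions directly to the test rows; their order matches predict()'s output
X = test_set.copy()
X["prediction"] = grid.best_estimator_.predict(test_set)

X[["truecount", "prediction"]].head()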