Predictive Modelling

Random Forest

We’ll extend our analysis with a hyperparameter grid search to find the best model configuration, choosing variables based on their correlations with the target and piping those into a random forest model. Factors like the percentage of people taking public transit and the share of residents with longer commute times have a negative correlation with the number of parking meters per street, indicating that as these variables increase, streets tend to have fewer meters.

Code
# Import packages

import altair as alt
import geopandas as gpd
import pandas as pd
import numpy as np
import hvplot.pandas
from matplotlib import pyplot as plt
import holoviews as hv
from shapely.geometry import Polygon
from shapely.geometry import MultiPolygon
import requests
import geoviews as gv
import geoviews.tile_sources as gvts
import folium
from folium import plugins
from shapely.geometry import Point
import xyzservices
import osmnx as ox
import networkx as nx
import pygris
import cenpy

import seaborn as sns
import contextily as ctx

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

np.random.seed(42)



%matplotlib inline

# Show more rows and wider columns in output
pd.options.display.max_rows = 9999 
pd.options.display.max_colwidth = 200

# Hide warnings due to issue in shapely package 
# See: https://github.com/shapely/shapely/issues/1345
np.seterr(invalid="ignore");
Code
# Bring in the data
# Join the census and OSM data

#columns_to_drop = ['in']

#sf_final.drop(columns=columns_to_drop, inplace=True)
sf_final = gpd.read_file("./data/census.geojson")
merged_gdf = gpd.read_file("./data/merged_gdf.geojson")

sf_final = sf_final.to_crs('EPSG:3857')

merged_gdf = merged_gdf.to_crs('EPSG:3857')

# Use 'predicate' (the 'op' keyword is deprecated in recent geopandas releases)
final_gdf = gpd.sjoin(merged_gdf, sf_final, how='left', predicate='intersects')

#columns_to_drop = ['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'GEOID', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'INTPTLAT', 'INTPTLON']

#final_gdf.drop(columns=columns_to_drop, inplace=True)


#final_gdf.head()
Code
# Select the variables with strong correlations with the target

cols = [
    "truecount",
    "Total_Pop",
    "Travel_Time",
    "Means_of_Transport",
    "Total_Public_Trans",
    "workforce_16",
    "households_car_pct",
    "commute_30_90min_pct",
    "Drive_Work_pct",
]

# Trim to these columns and remove NaNs
model_data = final_gdf[cols + ["geometry"]].dropna()
model_data.head()
truecount Total_Pop Travel_Time Means_of_Transport Total_Public_Trans workforce_16 households_car_pct commute_30_90min_pct Drive_Work_pct geometry
0 0.350877 7621.0 135615.0 4821.0 1299.0 4821.0 0.240937 0.416719 0.196225 LINESTRING (-13623984.058 4546847.107, -13623982.756 4546832.729)
1 0.090023 4218.0 59975.0 2653.0 554.0 2653.0 0.120824 0.390124 0.354693 LINESTRING (-13629159.824 4541826.151, -13629200.645 4541836.792)
2 0.087558 3772.0 54970.0 2319.0 467.0 2319.0 0.414820 0.339802 0.149202 LINESTRING (-13625599.493 4548143.825, -13625630.050 4548174.641)
3 0.488599 2024.0 41265.0 1402.0 464.0 1402.0 0.211877 0.399429 0.048502 LINESTRING (-13627260.424 4549700.424, -13627229.711 4549705.397)
4 0.031525 3772.0 54970.0 2319.0 467.0 2319.0 0.414820 0.339802 0.149202 LINESTRING (-13625103.854 4548726.940, -13625181.255 4548634.529)
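
The “good correlations” criterion can be made concrete by ranking the numeric columns of final_gdf by the absolute value of their correlation with truecount. The snippet below is a minimal sketch of that screening step, assuming the final_gdf built above; it is illustrative rather than part of the original selection.

Code
# Rank numeric columns by the strength of their correlation with the meter count
numeric = final_gdf.select_dtypes("number")

corr_with_target = (
    numeric.corr()["truecount"]
    .drop("truecount")
    .sort_values(key=abs, ascending=False)
)
corr_with_target.head(10)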

Splitting the data

Randomly split the data into 70% training and 30% test sets.

Code
# Split the data 70/30
train_set, test_set = train_test_split(model_data, test_size=0.3, random_state=42)

# the target labels
y_train = train_set["truecount"]
y_test = test_set["truecount"]

y_test.head()
2511    0.122466
3603    0.326433
1275    0.019455
1237    0.059726
2502    0.129769
Name: truecount, dtype: float64

Transforming the columns

Code
# Set up the column transformer to standardize the numeric columns
transformer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), cols)
    ]
)
Code
pipe = make_pipeline(
    transformer, RandomForestRegressor(random_state=42)
)
pipe.fit(train_set, y_train);
pipe.score(test_set, y_test)
0.9903700341139564
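
To connect the fitted forest back to the opening claim about which neighborhood factors matter, we can read off its feature importances. This is a small illustrative sketch, not part of the original analysis; because the column transformer only standardizes the columns, the output order matches cols.

Code
# Feature importances from the fitted random forest, labeled with the input columns
rf = pipe.named_steps["randomforestregressor"]
importances = pd.Series(rf.feature_importances_, index=cols).sort_values(ascending=False)
print(importances)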

Optimize hyperparameters

Code
param_grid = {
    "randomforestregressor__n_estimators": [5, 10, 25, 50, 100, 200], #Reduce estimators to reduce run time
    "randomforestregressor__max_depth": [None, 2, 5, 7, 9],#Reduce depth to reduce run time
}

grid = GridSearchCV(pipe, param_grid, cv=10, n_jobs=-1)  # Utilize all available processors

# Run the search
grid.fit(train_set, y_train)
GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['truecount',
                                                                          'Total_Pop',
                                                                          'Travel_Time',
                                                                          'Means_of_Transport',
                                                                          'Total_Public_Trans',
                                                                          'workforce_16',
                                                                          'households_car_pct',
                                                                          'commute_30_90min_pct',
                                                                          'Drive_Work_pct'])])),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'randomforestregressor__max_depth': [None, 2, 5, 7, 9],
                         'randomforestregressor__n_estimators': [5, 10, 25, 50,
                                                                 100, 200]})
Code
best_random = grid.best_estimator_
grid.score(test_set, y_test)
0.999872947311154
Code
print("Best R^2 score:", grid.best_score_)
Best R^2 score: 0.9999591625411464
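
Since the stated goal of the grid search is to find the best model configuration, it is also worth looking at which hyperparameters were actually selected. A small sketch, assuming the fitted grid above:

Code
# Hyperparameters chosen by the grid search
print(grid.best_params_)

# Cross-validation results for every combination, best-ranked first
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values("rank_test_score")[["params", "mean_test_score", "std_test_score"]].head()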

High R²: the model is likely overfit. The near-perfect scores also reflect target leakage: truecount, the value we are predicting, is included in the feature columns handed to the ColumnTransformer, so the model can effectively read the answer off its inputs.
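
As a sanity check (an illustrative sketch, not part of the original analysis), we can rebuild the same pipeline with truecount dropped from the feature list and re-score it; a leakage-free model should report a far more modest R².

Code
# Re-fit the same pipeline without the target in the feature list
feature_cols = [c for c in cols if c != "truecount"]

leak_free = make_pipeline(
    ColumnTransformer(transformers=[("num", StandardScaler(), feature_cols)]),
    RandomForestRegressor(random_state=42),
)
leak_free.fit(train_set, y_train)
leak_free.score(test_set, y_test)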

Code
# Predictions on the test set
predictions = grid.best_estimator_.predict(test_set)

X = model_data.loc[test_set.index]

# Attach the predictions (truecount was never log-transformed, so no back-transform is needed)
X['prediction'] = predictions

#X.head()
ValueError: Length of values (1615) does not match length of index (1751)
Code
# Ensure test_set indices are in model_data
common_indices = test_set.index.intersection(model_data.index)

# Filter model_data based on common indices
X = model_data.loc[common_indices]

# Check the length of predictions before assigning the 'prediction' column
if len(predictions) == len(common_indices):
    X['prediction'] = predictions
else:
    print("Length mismatch between predictions and common_indices.")

#X.head()
Length mismatch between predictions and common_indices.
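
The mismatch happens because the spatial join leaves duplicate index labels in model_data, so label-based .loc lookups return more rows than there are predictions. A simpler alignment, sketched below under the assumption that the fitted grid from above is still in scope, attaches the predictions directly to test_set, whose rows correspond one-to-one with the output of predict(). Alternatively, calling model_data.reset_index(drop=True) before the train/test split would remove the duplicate labels at the source.

Code
# Attach predictions directly to the test rows; their order matches predict()'s output
X = test_set.copy()
X["prediction"] = grid.best_estimator_.predict(test_set)

X[["truecount", "prediction"]].head()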