Random Forest

This notebook trains random forest (RF) models to classify dominant tree species type and regress forest AGB using LiDAR metrics.

LiDAR metrics were calculated using the code below:

LiDAR Metric Derivation

Load packages

Code

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, mean_squared_error, r2_score
import rioxarray as rio

from src.utils.regression_utils import plot_regression

Set filepaths

LABELS_FPATH = r'data/petawawa/labels.csv'

METRICS_FPATH = r'data/petawawa/lidar_metrics.csv'

Load labels and LiDAR metrics

metrics_df = pd.read_csv(METRICS_FPATH)

metric_names = metrics_df.drop('plot_id', axis=1).columns.tolist()

df = (pd.read_csv(LABELS_FPATH)
        .merge(metrics_df, on="plot_id", how='left')
        .rename(columns={'total_agb_mg_ha': 'agb_mg_ha_obs'})
        )

# View relationship between mean height and biomass
ax = sns.scatterplot(df, y='agb_mg_ha_obs', x='zmean')
ax.set(xlabel='Mean Height', ylabel='AGB (Mg/ha)')

Prepare data for RF Modelling

# Check which ALS metric cols have NAs
na_counts = df.isna().sum()

# Print all cols with NAs
for col, na_count in na_counts.items():
    if na_count > 0:
        print(f"{col}: {na_count} NAs")

# Drop rows with any NAs
df = df.dropna()
df

	plot_id	dom_sp_type	perc_decid	agb_mg_ha_obs	split	total_agb_z	n	zmax	zmin	zmean	...	vzsd	vzcv	OpenGapSpace	ClosedGapSpace	Euphotic	Oligophotic	kde_peaks_count	kde_peak1_elev	kde_peak1_value	HOME
0	PRF013	conif	0.000000	42.263648	train	-1.326847	19533	12.11	-0.46	4.449033	...	0.339932	86.693595	48.448852	24.437781	19.086611	8.026756	1	4.310411	0.149233	12.11
1	PRF090	mixed	52.285188	127.743892	train	-0.130919	37466	29.91	-0.50	12.639579	...	0.350017	107.110615	38.931903	43.288246	12.285448	5.494403	3	27.776575	0.009072	29.91
2	PRF015	conif	5.186690	172.347008	train	0.493109	18859	30.50	-0.75	15.571104	...	0.198087	77.504066	32.883558	49.915951	10.412975	6.787515	3	22.196184	0.062241	30.50
3	PRF146	conif	3.228891	174.069178	train	0.517204	17623	33.47	-0.34	10.340951	...	0.174707	179.575843	53.155319	36.181278	6.507298	4.156105	3	29.429452	0.043175	33.47
4	PRF208R	conif	0.000000	13.298206	train	-1.732093	20061	5.60	-0.14	2.092026	...	0.389900	76.030501	37.703165	13.280582	25.684346	23.331908	1	2.157182	0.163660	5.60
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
244	PRF212	conif	26.719682	141.950651	test	0.067843	17128	32.35	-0.45	16.189422	...	0.191256	76.990694	35.943012	48.303935	10.384441	5.368611	2	18.623973	0.058992	32.35
245	PRF049	mixed	66.054144	36.550006	test	-1.406785	19089	18.85	0.00	5.244531	...	0.243248	128.427960	58.916293	23.871450	10.014948	7.197309	2	13.318982	0.035563	18.85
246	PRF161	decid	98.688073	131.689708	test	-0.075714	17089	23.53	-0.16	17.017618	...	0.334060	133.360794	21.892537	62.961194	14.095522	1.050746	2	17.447084	0.134810	23.53
247	PRF199	conif	5.880717	133.009047	test	-0.057256	58565	27.82	-0.38	10.962897	...	0.192104	136.609411	40.449952	41.689097	11.096440	6.764512	3	20.052877	0.061518	27.82
248	PRF072	mixed	54.907966	65.969887	test	-0.995180	17283	21.11	-0.15	7.421538	...	0.362311	84.959235	50.054600	26.167076	13.452088	10.326235	1	7.062877	0.096511	21.11

249 rows × 102 columns

# Assign nummieric IDs to each dominant species
df['dom_sp_type_id'] = df['dom_sp_type'].astype('category').cat.codes

# Create a dictionary mapping species names to their IDs
dom_sp_dict = (df[['dom_sp_type', 'dom_sp_type_id']]
               .drop_duplicates()
               .sort_values(by='dom_sp_type_id')
               .set_index('dom_sp_type_id', drop=True)
               .to_dict()['dom_sp_type'])

dom_sp_dict

{0: 'conif', 1: 'decid', 2: 'mixed'}

train_df = df[df['split'] == 'train'].reset_index(drop=True)
test_df = df[df['split'].isin(('val', 'test'))].reset_index(drop=True)

train_df.columns

Index(['plot_id', 'dom_sp_type', 'perc_decid', 'agb_mg_ha_obs', 'split',
       'total_agb_z', 'n', 'zmax', 'zmin', 'zmean',
       ...
       'vzcv', 'OpenGapSpace', 'ClosedGapSpace', 'Euphotic', 'Oligophotic',
       'kde_peaks_count', 'kde_peak1_elev', 'kde_peak1_value', 'HOME',
       'dom_sp_type_id'],
      dtype='object', length=103)

# Split data into train and test sets (combine val into test since not needed as seperate for RF)
train_df = df[df['split'].isin(('train', 'val'))].reset_index(drop=True)
test_df = df[df['split'] == 'test'].reset_index(drop=True)

# Create histograms showing each dataset class distributions
fig, axes = plt.subplots(1, 2, figsize=(10, 8), sharey=True)
sns.countplot(x='dom_sp_type_id', data=train_df, ax=axes[0])
axes[0].set_title('Training Set Class Distribution')
sns.countplot(x='dom_sp_type_id', data=test_df, ax=axes[1])
axes[1].set_title('Test Set Class Distribution')


plt.show()

Train RF for Dominant Species Type Classification

def plot_feature_importance(rf, metric_names, title):
    importances = rf.feature_importances_
    importance_df = pd.DataFrame({'Metric': metric_names, 'importance': importances
                                    }).sort_values(by='importance', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='Metric', hue='importance', data=importance_df, palette='viridis', legend=False)
    plt.title(title)
    plt.yticks(fontsize=5)
    plt.xlabel('Importance Score')
    plt.show()

rf = RandomForestClassifier(n_estimators=100, random_state=25)

rf.fit(train_df[metric_names], train_df['dom_sp_type_id'])

# Apply the model to the test set
test_df['pred_dom_sp_group_id'] = rf.predict(test_df[metric_names])

# Calculate accuracy and kappa
accuracy = accuracy_score(test_df['dom_sp_type_id'], test_df['pred_dom_sp_group_id'])
kappa = cohen_kappa_score(test_df['dom_sp_type_id'], test_df['pred_dom_sp_group_id'])

print(f"Overall Accuracy: {accuracy:.2f}")
print(f"Kappa: {kappa:.2f}")

# Class specific accuracies
class_accuracies = {}
for class_id in test_df['dom_sp_type_id'].unique():
    class_mask = test_df['dom_sp_type_id'] == class_id
    class_accuracy = accuracy_score(test_df.loc[class_mask, 'dom_sp_type_id'],
                                    test_df.loc[class_mask, 'pred_dom_sp_group_id'])
    class_accuracies[class_id] = class_accuracy
    print(f"Class {class_id} ({dom_sp_dict[class_id]}): Accuracy = {class_accuracy:.2f}")


# Print confusion matrix
conf_matrix = confusion_matrix(test_df['dom_sp_type_id'], test_df['pred_dom_sp_group_id'])

fig, ax = plt.subplots(figsize=(6,5), layout='tight')

sns.heatmap(ax=ax, data=conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=dom_sp_dict.values(),
            yticklabels=dom_sp_dict.values())

ax.set_xlabel('Predicted Label')
ax.set_ylabel('Observed Label')
ax.set_title('Random Forest', fontsize=14, fontweight='bold')

fig.savefig("images/rf_cls_confusion_matrix.jpg", dpi=300)

# View metric importance
plot_feature_importance(rf, metric_names, title='Feature Importance from Random Forest Classifier')

Overall Accuracy: 0.76
Kappa: 0.57
Class 1 (decid): Accuracy = 0.67
Class 0 (conif): Accuracy = 0.95
Class 2 (mixed): Accuracy = 0.44

Train RF for AGB estimation

rf = RandomForestRegressor(n_estimators=100, random_state=25)

rf.fit(train_df[metric_names], train_df['agb_mg_ha_obs'])

test_df['agb_mg_ha_pred'] = rf.predict(test_df[metric_names])

# Plot regression results
plot_regression(obs=test_df['agb_mg_ha_obs'], 
                pred=test_df['agb_mg_ha_pred'],
                title='Random Forest',
                ax_lims=(0, 400),
                fig_out_fpath='images/rf_regression_plot.jpg')

plot_feature_importance(rf, metric_names, title='Feature Importance from Random Forest Regressor')