Code
library(tidyverse)
library(Metrics)
library(cowplot)
library(here)
library(patchwork)
library(tidymodels)
library(broom)
library(kableExtra)February 14, 2024

Data Description:
Data used in this analysis comes from a study conducted by Warren Abrahamson which looked at survival, growth, and biomass estimated of two dominant palmetto species of South Florida: Serenoa repens and Sabal etonia. The study includes three data sets, and the one used in this analysis, palmetto_data, contains survival and growth data from 1981 through 1997, and then again in 2001 and 2017; data collection is ongoing at 5-year intervals.
Objective:
This analysis aims to test the feasibility of using variables plant height, canopy length, canopy width, and number of green leaves to classify whether a palmetto species is Serenoa repens or Sabal etonia. It will examine two different models with combinations of these predictor variables, determine which model is the ‘best’, and then examine the success of that model in classifying plant species correctly.
Data Citation:
Abrahamson, W.G. 2019. Survival, growth and biomass estimates of two dominant palmetto species of south-central Florida from 1981 - 2017, ongoing at 5-year intervals ver 1. Environmental Data Initiative. https://doi.org/10.6073/pasta/f2f96ec76fbbd4b9db431c79a770c4d5
Pseudo-code:
palmetto data, clean it up, and wrangle it to prep for exploratory data visualizations.#create a df to visualize canopy height for each species
palmetto_height <- palmetto_mod %>%
group_by(spec_fac) %>%
summarize(height_avg = mean(height, na.rm = TRUE))
palmetto_height$height_avg <- round(palmetto_height$height_avg, digits = 1)
#create a df to visualize canopy length
palmetto_length <- palmetto_mod %>%
group_by(spec_fac) %>%
summarize(length_avg = mean(length, na.rm = TRUE))
palmetto_length$length_avg <- round(palmetto_length$length_avg, digits = 1)
#create a df to visualize canopy width
palmetto_width <- palmetto_mod %>%
group_by(spec_fac) %>%
summarize(width_avg = mean(width, na.rm = TRUE))
palmetto_width$width_avg <- round(palmetto_width$width_avg, digits = 1)
#Create a df to visualize # of green leaves
palmetto_leaves <- palmetto_mod %>%
group_by(spec_fac, year) %>%
summarize(leaves_avg = mean(green_lvs, na.rm = TRUE))height_plot <- ggplot(palmetto_height,
aes(x = spec_fac, y = height_avg, fill = spec_fac)) +
geom_col() +
geom_text(aes(label = height_avg), vjust = -0.5, size = 3.5) +
theme_bw() +
labs(x = " ",
y = "Average measurement (cm)",
title = "Height") +
scale_fill_manual(values = c("1" = "chartreuse3",
"2" = "darkolivegreen")) +
theme(legend.position = "none") +
scale_x_discrete(labels = c("Serenoa repens", "Sabal etonia")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
height_plot <- height_plot +
coord_cartesian(ylim = c(0, 125)) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
#height_plotlength_plot <- ggplot(palmetto_length,
aes(x = spec_fac, y = length_avg, fill = spec_fac)) +
geom_col() +
geom_text(aes(label = length_avg), vjust = -0.5, size = 3.5) +
theme_bw() +
labs(x = " ",
y = " ",
title = "Length") +
scale_fill_manual(values = c("1" = "chartreuse3",
"2" = "darkolivegreen")) +
theme(legend.position = "none") +
scale_x_discrete(labels = c("Serenoa repens", "Sabal etonia")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
length_plot <- length_plot +
coord_cartesian(ylim = c(0, 175)) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
#length_plotwidth_plot <- ggplot(palmetto_width,
aes(x = spec_fac, y = width_avg, fill = spec_fac)) +
geom_col() +
geom_text(aes(label = width_avg), vjust = -0.5, size = 3.5) +
theme_bw() +
labs(x = " ",
y = " ",
title = "Width") +
scale_fill_manual(values = c("1" = "chartreuse3",
"2" = "darkolivegreen")) +
theme(legend.position = "none") +
scale_x_discrete(labels = c("Serenoa repens", "Sabal etonia")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
width_plot <- width_plot +
coord_cartesian(ylim = c(0, 125)) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
#width_plot
ggplot(palmetto_leaves,
aes(x = spec_fac, y = leaves_avg, fill = spec_fac)) +
geom_boxplot(alpha = 0.5) +
theme_bw() +
scale_x_discrete(labels = c("Serenoa repens", "Sabal etonia")) +
labs(y = "Average count of leaves",
x = " ") +
theme(legend.position = "none") +
scale_fill_manual(values = c("1" = "chartreuse3",
"2" = "darkolivegreen")) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
Key Takeaways:
#define the two models
m1 <- spec_fac ~ height + length + width + green_lvs
m2 <- spec_fac ~ height + width + green_lvs
#create each of the BLR's with the two models
blr1 <- glm(formula = m1, data = palmetto_mod, family = binomial)
summary(blr1)
blr2 <- glm(formula = m2, data = palmetto_mod, family = binomial)
summary(blr2)
# AIC & BIC scores for each model
AIC(blr1) #5195
AIC(blr2) #5987
BIC(blr1) #5231
BIC(blr2) #6017#check balance of the spec_fac column
palmetto_mod %>%
group_by(spec_fac) %>%
summarize(n = n()) %>%
ungroup() %>%
mutate(prop = n / sum(n))
#species counts are pretty much split 50/50
#no need to choose a stratified split
spec_split <- initial_split(palmetto_mod, prop = 0.80)
spec_train_df <- training(spec_split)
spec_test_df <- testing(spec_split)## Model 1 ##
spec_test_predict1 <- spec_test_df %>%
mutate(predict(blr1_fit, new_data = spec_test_df)) %>%
mutate(predict(blr1_fit, new_data = ., type = 'prob'))
## Model 2 ##
spec_test_predict2 <- spec_test_df %>%
mutate(predict(blr2_fit, new_data = spec_test_df)) %>%
mutate(predict(blr2_fit, new_data = ., type = 'prob'))
# Examine relationship b/w `.pred_class`, `.pred_1`, and `.pred_2`.
table(spec_test_predict1 %>%
select(spec_fac, .pred_class))
table(spec_test_predict2 %>%
select(spec_fac, .pred_class))
# Examine the accuracy of each model
accuracy(spec_test_predict1, truth = spec_fac, estimate = .pred_class)
accuracy(spec_test_predict2, truth = spec_fac, estimate = .pred_class)## Model 1 ##
blr1_roc_df <- roc_curve(spec_test_predict1,
truth = spec_fac, .pred_1)
#autoplot(blr1_roc_df)
## Model 2 ##
blr2_roc_df <- roc_curve(spec_test_predict2,
truth = spec_fac, .pred_1)
#autoplot(blr2_roc_df)
# Calculate area under curve
# 50% is random guessing, 100% is perfect classifier
yardstick::roc_auc(spec_test_predict1, truth = spec_fac, .pred_1)
yardstick::roc_auc(spec_test_predict2, truth = spec_fac, .pred_1)# Define the number of folds, and number of repeats
spec_train_folds <- vfold_cv(spec_train_df, v = 10, repeats = 10)
spec_train_folds
##### Model 1 #####
# Create a workflow that combines our model & a formula
blr1_wf <- workflow() %>%
add_model(blr_mdl) %>%
add_formula(spec_fac ~ height + length + width + green_lvs)
# Apply the workflow to the train dataset & see how it performs
blr1_fit_folds <- blr1_wf %>%
fit_resamples(spec_train_folds)
blr1_fit_folds
### Average the predictive performance of the ten models:
collect_metrics(blr1_fit_folds)
##### Model 2 #####
# Create a workflow that combines our model & a formula
blr2_wf <- workflow() %>%
add_model(blr_mdl) %>%
add_formula(spec_fac ~ height + width + green_lvs)
# Apply the workflow to the train dataset & see how it performs
blr2_fit_folds <- blr2_wf %>%
fit_resamples(spec_train_folds)
blr2_fit_folds
### Average the predictive performance of the ten models:
collect_metrics(blr2_fit_folds)tidy(last_mdl_fit) %>%
kableExtra::kable(col.names = c("Variable", "Coefficient Estimate",
"Standard Error", "Statistic",
"p-value")) %>%
kableExtra::kable_classic_2()
# tidy(blr1_fit) %>%
# kableExtra::kable(col.names = c("Variable", "Coefficient Estimate",
# "Standard Error", "Statistic",
# "p-value"),
# caption = "Table 1. Fill in later") %>%
# kableExtra::kable_classic()| Variable | Coefficient Estimate | Standard Error | Statistic | p-value |
|---|---|---|---|---|
| (Intercept) | 3.2266851 | 0.1420708 | 22.71180 | 0 |
| height | -0.0292173 | 0.0023061 | -12.66984 | 0 |
| length | 0.0458233 | 0.0018661 | 24.55600 | 0 |
| width | 0.0394434 | 0.0021000 | 18.78227 | 0 |
| green_lvs | -1.9084747 | 0.0388634 | -49.10728 | 0 |
### Last Fit ###
# Apply our workflow to the train dataset & see how it performs
last_mdl_eval <- blr1_wf %>%
last_fit(spec_split)
### Average the predictive performance of the ten models:
collect_metrics(last_mdl_eval)
### Look at how well the trained model 1 predicts species ###
spec_pred_final <- palmetto_mod %>%
mutate(predict(last_mdl_fit, new_data = palmetto_mod)) %>%
mutate(predict(last_mdl_fit, new_data = ., type = 'prob'))
table(spec_pred_final %>%
select(spec_fac, .pred_class))
# Examine the accuracy
accuracy(spec_pred_final, truth = spec_fac, estimate = .pred_class)Species <- c("Serenoa repens", "Sabal etonia")
correctly_classified <- c(5548, 5701)
incorrectly_classified <- c(564, 454)
percent_correct <- c(90.77, 92.62)
percent_incorrect <- c(9.23, 7.38)
mdl_1_sucess <- tibble(Species = Species,
Correct = correctly_classified,
Incorrect = incorrectly_classified,
Percent.Correct = percent_correct,
Perfect.Incorrect = percent_incorrect)
kable(mdl_1_sucess, col.names = c("Species", "Correct", "Incorrect",
"% Correct", "% Incorrect")) %>%
kable_classic_2() %>%
kable_styling(full_width = FALSE,
bootstrap_options = c("striped", "hover", "condensed"),
font_size = 14)| Species | Correct | Incorrect | % Correct | % Incorrect |
|---|---|---|---|---|
| Serenoa repens | 5548 | 564 | 90.77 | 9.23 |
| Sabal etonia | 5701 | 454 | 92.62 | 7.38 |
Conclusion:
After generating and comparing two models to classify whether a palmetto species is Serenoa repens or Sabal etonia, we found that model 1 (species ~ height + length + width + green_lvs) performed best. We then trained this model on the entire dataset and evaluated its classification success, finding that it correctly identified Serenoa repens about 90.77% of the time, and Sabal etonia about 92.62% of the time (see Table 2 above).
This assignment was created and organized by Casey O’Hara at the Bren School for ESM 244 (Advanced Data Analysis for Environmental Science & Management). ESM 244 is offered in the Master of Environmental Science & Management (MESM) program.
@online{pepperdine2024,
author = {Pepperdine, Maxwell},
title = {Plant Classification Using Logistic Regression},
date = {2024-02-14},
url = {https://maxpepperdine.github.io/posts/2024-02-14-blr-plant-classification/},
langid = {en}
}