library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyverse)
library(wesanderson)
library(factoextra) # fviz_cluster
library(gridExtra)
library(cluster)
library(ggpubr)
# color palette
pal = wes_palette("Darjeeling1")
# bring in data
df <- read.csv('data/whales_rings_chl_tmp.csv')
df<- df %>%
mutate(date = lubridate::mdy(date),
year = lubridate::year(date),
week = lubridate::week(date))
# df.21.23 <- df %>% filter(year %in% c(2021:2023)) %>%
# mutate(num_ring = as.integer(num_ring))
# create a variable for proportion of rings associated with streamer
# df.21.23$prop_stream_ring <- (df.21.23$num_ring_with_streamer)/(df.21.23$num_ring)
df$prop_stream_ring <- (df$num_ring_with_streamer)/(df$num_ring)
df_tally <- df %>%
group_by(year) %>%
summarise(ttl_right = sum(na.omit(right_whale_sight)),
ttl_other = sum(na.omit(other_whale_sight)),
ttl_unidentified = sum(na.omit(unidentified_whale_sight)),
ttl_streamer_ring = sum(na.omit(num_ring_with_streamer)),
ttl_ring = sum(na.omit(num_ring)))
df_tally$prop_stream_ring <- df_tally$ttl_streamer_ring/df_tally$ttl_ring
df_tally$ttl_whale <- rowSums(df_tally[,c(2:4)])
tally.long <- df_tally %>%
pivot_longer(ttl_right:ttl_whale, names_to = 'whales', values_to = 'count')
This is just a very quick preliminary look at the data we have available so far. This work mostly compares number of whale sightings to number of warm core rings associated with the date of the whale sighting. If there was not a JC chart for the exact date of the sighting I used the available chart from the closest date prior to the sighting. For instance, if a whale was sighted on 5/5/23, and there is not a chart on that day, I used a chart from 5/4/23 or 5/3/23.
Figure 1. Total number of whale sightings from Offshore Flight data
Sightings were summed across years. Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales.
g = ggplot() +
geom_col(data = tally.long %>%
filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
aes(x = factor(year), y = count, fill = whales),
size = 1, color = 'black',
stat='identity', position='dodge') +
scale_fill_manual(values = pal,
name = 'Species',
labels = c('Right Whale',
'Other Large Whale',
'Unidentified Large Whale')) +
# geom_line(data = tally.long %>% filter(whales =='ttl_whale'),
# aes(x = factor(year), y = count), size = 1.5,
# color = 'black', group = 1) +
xlab('Year') +
ylab('Number of Whale Sightings (n)') +
#labs(title = 'Offshore Flight Data Summary')+
theme_classic()
g + theme(text = element_text(size = 14),
axis.text = element_text(size = 13),
axis.title = element_text(size = 15),
legend.position = c(0.25, 0.85),
legend.background = element_rect(fill = "white", color = "black"),
legend.title = element_text(size = 11),
legend.text = element_text(size = 11))
Figure 2. Number of whale sightings and number of rings with streamers over time.
Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales. The black line is the number of rings associated with a shelf streamer present (West of -65W) on the date of sighting. Note, I did not have JC charts for all the years, thus the yellow points on top of the black line are showing the years for which I had JC charts to reference. This data is also based on a limited set of JC charts, with most years only having images from Jan-May.
g2 = ggplot() +
geom_col(data = tally.long %>%
filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
aes(x = factor(year), y = count, fill = whales),
size = 1, color = 'black',
stat='identity', position='dodge') +
scale_fill_manual(values = pal,
name = 'Species',
labels = c('Right Whale',
'Other Large Whale',
'Unidentified Large Whale')) +
geom_line(data = tally.long %>% filter(whales =='ttl_streamer_ring'),
aes(x = factor(year), y = count), size = 1.5,
color = 'black', group = 1) +
geom_point(data = tally.long %>% filter(whales =='ttl_streamer_ring' & year %in% c(2014, 2016, 2017, 2019, 2020, 2021, 2022,2023 )),
aes(x = factor(year), y = count), bg = 'yellow', size = 3, pch = 21,
color = 'black', group = 1) +
#scale_y_continuous(sec.axis = sec_axis(~., name = 'Catch (lbs)')) +
xlab('Year') +
ylab('Number of Whale Sightings / Number of Rings w/Streamer') +
#labs(title = 'Offshore Flight Data Summary')+
theme_classic()
g2 + theme(text = element_text(size = 14),
axis.text = element_text(size = 13),
axis.title = element_text(size = 15),
legend.position = c(0.25, 0.85),
legend.background = element_rect(fill = "white", color = "black"),
legend.title = element_text(size = 11),
legend.text = element_text(size = 11))
Figure 3. Number of whale sightings and percentage of rings with streamers over time.
Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales. The black line is the percentage of all rings present (West of -65) that were associated with a shelf streamer on the given date of sighting. Note, I did not have JC charts for all the years, thus the yellow points on top of the black line are showing the years for which I had JC charts to reference. This data is also based on a limited set of JC charts, with most years only having images from Jan-May.
g3 = ggplot() +
geom_col(data = tally.long %>%
filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
aes(x = factor(year), y = count, fill = whales),
size = 1, color = 'black',
stat='identity', position='dodge') +
scale_fill_manual(values = pal,
name = 'Species',
labels = c('Right Whale',
'Other Large Whale',
'Unidentified Large Whale')) +
geom_line(data = tally.long %>% filter(whales =='prop_stream_ring'),
aes(x = factor(year), y = count*100), size = 1.5,
color = 'black', group = 1) +
geom_point(data = tally.long %>% filter(whales =='prop_stream_ring' & year %in% c(2014, 2016, 2017, 2019, 2020, 2021, 2022,2023 )),
aes(x = factor(year), y = count*100), bg = 'yellow', size = 3, pch = 21,
color = 'black', group = 1) +
scale_y_continuous(sec.axis = sec_axis(~., name = 'Proportion of rings w/streamer')) +
xlab('Year') +
ylab('Number of Whale Sightings / Number of Rings w/Streamer') +
#labs(title = 'Offshore Flight Data Summary')+
theme_classic()
g3 + theme(text = element_text(size = 14),
axis.text = element_text(size = 13),
axis.title = element_text(size = 15),
legend.position = c(0.25, 0.85),
legend.background = element_rect(fill = "white", color = "black"),
legend.title = element_text(size = 11),
legend.text = element_text(size = 11))
K-means clustering (MacQueen 1967) is an unsupervised machine learning algorithm for partitioning a given data set into a set of k groups (i.e. k clusters). The value K (the number of groups) is specified ahead of time.
This clustering is used to classify observations into k groups based on their similarity. Each group is represented by its center (cluster centroid), which correspsonds to the mean of the points assigned to the cluster.
objects within the same cluster are as similar as possible (i.e., high intra-class similarity),
objects from different clusters are as dissimilar as possible (i.e., low inter-class similarity)
More details can be found here
Figure 4. Cluster plot of Right whales and number of WCRs with shelf streamers
Each data point in the following scatter plot is colored according to its cluster assignment. In order to get a better sense of what these clusters represent, qualitative values were assigned to each variable. To do this, first, the number of rings associated with a shelf streamer was divided by the total number of rings for a given date. Next, the resulting proportion of rings with shelf streamers was categorized as either:
This allowed me to represent the data points according to whether the rings with streamers made up a high (circle), moderate (square), or low (triangle) proportion of the total rings present on the day of the whale sighting.
Note, only rings west of -65W were counted for this tally. Going forward, we may want to specify a more specific latitudinal and longitudinal range.
### Cluster : Right whales
selected.vars <- df[, c('right_whale_sight',
'num_ring_with_streamer',
'num_ring', 'prop_stream_ring')]
# remove NAs for kmeans to work
selected.vars <- na.omit(selected.vars)
# optimal number of clusters:
fviz_nbclust(selected.vars[,-c(3,4)], kmeans, method = "silhouette")
# -- Assigning a catagorical attribute to better understand clusters -- #
# Using proportion of rings that are associated with streamers
## High low ##
# selected.vars$prop <- case_when(
# selected.vars$prop_stream_ring >= .5 ~ 'high',
# selected.vars$prop_stream_ring < .5 ~ 'low'
# )
## High med low ##
selected.vars$prop <- case_when(
selected.vars$prop_stream_ring >= .6 ~ 'high',
selected.vars$prop_stream_ring >= .3 & selected.vars$prop_stream_ring < .6 ~ 'mod',
selected.vars$prop_stream_ring <= .3 ~ 'low'
)
# Compute k-means with k = 2
set.seed(123)
res.km <- kmeans(scale(selected.vars[, -c(3,4,5)]), 2, nstart = 25)
# K-means clusters showing the group of each individuals
# res.km$cluster
# Dimension reduction using PCA
res.pca <- prcomp(selected.vars[,-c(3,4,5)], scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Add Species groups from the original data sett
ind.coord$prop <- selected.vars$prop
# Data inspection
head(ind.coord)
## Dim.1 Dim.2 cluster prop
## 2 0.91655346 1.5852977 1 high
## 5 -1.68152275 -1.0127785 2 low
## 6 0.52820252 0.2415978 1 high
## 7 0.80859583 -1.7708463 1 high
## 8 -1.68152275 -1.0127785 2 low
## 11 0.05052805 0.7192723 1 high
# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.2 58.1 58.1
## Dim.2 0.8 41.9 100.0
ggscatter(
ind.coord, x = "Dim.1", y = "Dim.2",
color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
shape = 'prop', size = 3, legend = "right", ggtheme = theme_bw(),
xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
stat_mean(aes(color = cluster), size = 4)
Figure 5. Cluster plot of other large whales and number of WCRs with shelf streamers
### Cluster : Other whales
selected.vars.ow <- df[, c('other_whale_sight',
'num_ring_with_streamer',
'num_ring', 'prop_stream_ring')]
selected.vars.ow <- na.omit(selected.vars.ow)
# optimal number of clusters:
fviz_nbclust(selected.vars.ow[,-c(3,4)], kmeans, method = "silhouette")
# High low
selected.vars.ow$prop <- case_when(
selected.vars.ow$prop_stream_ring >= .5 ~ 'high',
selected.vars.ow$prop_stream_ring < .5 ~ 'low'
)
# High med low
selected.vars.ow$prop <- case_when(
selected.vars.ow$prop_stream_ring >= .6 ~ 'high',
selected.vars.ow$prop_stream_ring >= .3 & selected.vars.ow$prop_stream_ring < .6 ~ 'mod',
selected.vars.ow$prop_stream_ring <= .3 ~ 'low'
)
# Compute k-means with k = 3
set.seed(123)
res.km <- kmeans(scale(selected.vars.ow[, -c(3,4,5)]), 3, nstart = 25)
# K-means clusters showing the group of each individuals
# res.km$cluster
# Dimension reduction using PCA
res.pca <- prcomp(selected.vars.ow[,-c(3,4,5)], scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Add Species groups from the original data sett
ind.coord$prop <- selected.vars.ow$prop
# Data inspection
head(ind.coord)
## Dim.1 Dim.2 cluster prop
## 2 -2.04084739 0.3139595 1 high
## 5 0.80100838 -1.7349375 2 low
## 6 -0.03628385 0.6882721 1 high
## 7 -1.89739403 -2.8030890 3 high
## 8 1.19748773 -1.3384582 2 low
## 11 0.06283599 0.7873920 1 high
# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.1 54.4 54.4
## Dim.2 0.9 45.6 100.0
ggscatter(
ind.coord, x = "Dim.1", y = "Dim.2",
color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
shape = 'prop', size = 3, legend = "right", ggtheme = theme_bw(),
xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
stat_mean(aes(color = cluster), size = 4)
A few caveats to Kmeans clustering:
References: