Preliminary look at whale sighting data

library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyverse)
library(wesanderson)
library(factoextra) # fviz_cluster
library(gridExtra)
library(cluster)
library(ggpubr)

# color palette
pal = wes_palette("Darjeeling1")
# bring in data 
df <- read.csv('data/whales_rings_chl_tmp.csv')
df<- df %>% 
  mutate(date = lubridate::mdy(date),
         year = lubridate::year(date), 
         week = lubridate::week(date))
# df.21.23 <- df %>% filter(year %in% c(2021:2023)) %>%
#  mutate(num_ring = as.integer(num_ring))
# create a variable for proportion of rings associated with streamer
# df.21.23$prop_stream_ring <- (df.21.23$num_ring_with_streamer)/(df.21.23$num_ring)
df$prop_stream_ring <- (df$num_ring_with_streamer)/(df$num_ring)
df_tally <- df %>%
  group_by(year) %>%
  summarise(ttl_right = sum(na.omit(right_whale_sight)), 
            ttl_other = sum(na.omit(other_whale_sight)), 
            ttl_unidentified = sum(na.omit(unidentified_whale_sight)),
            ttl_streamer_ring = sum(na.omit(num_ring_with_streamer)), 
            ttl_ring = sum(na.omit(num_ring)))
df_tally$prop_stream_ring <- df_tally$ttl_streamer_ring/df_tally$ttl_ring
df_tally$ttl_whale <- rowSums(df_tally[,c(2:4)])

tally.long <- df_tally %>%
  pivot_longer(ttl_right:ttl_whale, names_to = 'whales', values_to = 'count')

This is just a very quick preliminary look at the data we have available so far. This work mostly compares number of whale sightings to number of warm core rings associated with the date of the whale sighting. If there was not a JC chart for the exact date of the sighting I used the available chart from the closest date prior to the sighting. For instance, if a whale was sighted on 5/5/23, and there is not a chart on that day, I used a chart from 5/4/23 or 5/3/23.

Visualing Offshore Flight Summary Data

Figure 1. Total number of whale sightings from Offshore Flight data

Sightings were summed across years. Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales.

g = ggplot() +
  geom_col(data = tally.long %>% 
             filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
           aes(x = factor(year), y = count, fill = whales), 
           size = 1, color = 'black',
           stat='identity', position='dodge') +
  scale_fill_manual(values = pal,
                    name = 'Species',
                    labels = c('Right Whale',
                               'Other Large Whale',
                               'Unidentified Large Whale')) +
  # geom_line(data = tally.long %>% filter(whales =='ttl_whale'), 
  #                                 aes(x = factor(year), y = count), size = 1.5, 
  #           color = 'black', group = 1) +
  xlab('Year') +
  ylab('Number of Whale Sightings (n)') +
  #labs(title = 'Offshore Flight Data Summary')+
  theme_classic()  
g +  theme(text = element_text(size = 14), 
         axis.text = element_text(size = 13), 
         axis.title = element_text(size = 15),
         legend.position = c(0.25, 0.85),
         legend.background = element_rect(fill = "white", color = "black"), 
         legend.title = element_text(size = 11),
         legend.text = element_text(size = 11))

Figure 2. Number of whale sightings and number of rings with streamers over time.

Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales. The black line is the number of rings associated with a shelf streamer present (West of -65W) on the date of sighting. Note, I did not have JC charts for all the years, thus the yellow points on top of the black line are showing the years for which I had JC charts to reference. This data is also based on a limited set of JC charts, with most years only having images from Jan-May.

g2 = ggplot() +
  geom_col(data = tally.long %>%
             filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
           aes(x = factor(year), y = count, fill = whales),
           size = 1, color = 'black',
           stat='identity', position='dodge') +
  scale_fill_manual(values = pal,
                    name = 'Species',
                    labels = c('Right Whale',
                               'Other Large Whale',
                               'Unidentified Large Whale')) +
  geom_line(data = tally.long %>% filter(whales =='ttl_streamer_ring'), 
            aes(x = factor(year), y = count), size = 1.5, 
            color = 'black', group = 1) +
  geom_point(data = tally.long %>% filter(whales =='ttl_streamer_ring' & year %in% c(2014, 2016, 2017, 2019, 2020, 2021, 2022,2023 )), 
             aes(x = factor(year), y = count), bg = 'yellow', size = 3, pch = 21,
             color = 'black', group = 1) +
  #scale_y_continuous(sec.axis = sec_axis(~., name = 'Catch (lbs)')) + 
  xlab('Year') +
  ylab('Number of Whale Sightings / Number of Rings w/Streamer') +
  #labs(title = 'Offshore Flight Data Summary')+
  theme_classic()  
g2 +  theme(text = element_text(size = 14), 
           axis.text = element_text(size = 13), 
           axis.title = element_text(size = 15),
           legend.position = c(0.25, 0.85),
           legend.background = element_rect(fill = "white", color = "black"), 
           legend.title = element_text(size = 11),
           legend.text = element_text(size = 11))

Figure 3. Number of whale sightings and percentage of rings with streamers over time.

Bars indicate the whale sightings, where red represents right whales, green represents other large whales and yellow indicates unidentified large whales. The black line is the percentage of all rings present (West of -65) that were associated with a shelf streamer on the given date of sighting. Note, I did not have JC charts for all the years, thus the yellow points on top of the black line are showing the years for which I had JC charts to reference. This data is also based on a limited set of JC charts, with most years only having images from Jan-May.

g3 = ggplot() +
  geom_col(data = tally.long %>% 
             filter(whales %in% c('ttl_right','ttl_other','ttl_unidentified')),
           aes(x = factor(year), y = count, fill = whales), 
           size = 1, color = 'black',
           stat='identity', position='dodge') +
  scale_fill_manual(values = pal,
                    name = 'Species',
                    labels = c('Right Whale',
                               'Other Large Whale',
                               'Unidentified Large Whale')) +
  geom_line(data = tally.long %>% filter(whales =='prop_stream_ring'), 
            aes(x = factor(year), y = count*100), size = 1.5, 
            color = 'black', group = 1) +
  geom_point(data = tally.long %>% filter(whales =='prop_stream_ring' & year %in% c(2014, 2016, 2017, 2019, 2020, 2021, 2022,2023 )), 
             aes(x = factor(year), y = count*100), bg = 'yellow', size = 3, pch = 21,
             color = 'black', group = 1) +
  scale_y_continuous(sec.axis = sec_axis(~., name = 'Proportion of rings w/streamer')) + 
  xlab('Year') +
  ylab('Number of Whale Sightings / Number of Rings w/Streamer') +
  #labs(title = 'Offshore Flight Data Summary')+
  theme_classic()  
g3 +  theme(text = element_text(size = 14), 
           axis.text = element_text(size = 13), 
           axis.title = element_text(size = 15),
           legend.position = c(0.25, 0.85),
           legend.background = element_rect(fill = "white", color = "black"), 
           legend.title = element_text(size = 11),
           legend.text = element_text(size = 11))

Kmeans clustering

K-means clustering (MacQueen 1967) is an unsupervised machine learning algorithm for partitioning a given data set into a set of k groups (i.e. k clusters). The value K (the number of groups) is specified ahead of time.

This clustering is used to classify observations into k groups based on their similarity. Each group is represented by its center (cluster centroid), which correspsonds to the mean of the points assigned to the cluster.

objects within the same cluster are as similar as possible (i.e., high intra-class similarity),
objects from different clusters are as dissimilar as possible (i.e., low inter-class similarity)

More details can be found here

Right Whales:

Figure 4. Cluster plot of Right whales and number of WCRs with shelf streamers

Each data point in the following scatter plot is colored according to its cluster assignment. In order to get a better sense of what these clusters represent, qualitative values were assigned to each variable. To do this, first, the number of rings associated with a shelf streamer was divided by the total number of rings for a given date. Next, the resulting proportion of rings with shelf streamers was categorized as either:

high (> 0.6, or > %60)
moderate (>= 0.3 < 0.6, or 31-59%)
low (< 0.3, or < %30)

This allowed me to represent the data points according to whether the rings with streamers made up a high (circle), moderate (square), or low (triangle) proportion of the total rings present on the day of the whale sighting.

Note, only rings west of -65W were counted for this tally. Going forward, we may want to specify a more specific latitudinal and longitudinal range.

### Cluster : Right whales 
selected.vars <- df[, c('right_whale_sight', 
                              'num_ring_with_streamer',
                              'num_ring', 'prop_stream_ring')]
# remove NAs for kmeans to work 
selected.vars <- na.omit(selected.vars)
# optimal number of clusters:   
fviz_nbclust(selected.vars[,-c(3,4)], kmeans, method = "silhouette")

# -- Assigning a catagorical attribute to better understand clusters -- #
# Using proportion of rings that are associated with streamers
## High low ##
# selected.vars$prop <- case_when(
#   selected.vars$prop_stream_ring >= .5 ~ 'high',
#   selected.vars$prop_stream_ring < .5 ~ 'low'
# )
## High med low ##
selected.vars$prop <- case_when(
  selected.vars$prop_stream_ring >= .6 ~ 'high',
  selected.vars$prop_stream_ring >= .3 & selected.vars$prop_stream_ring < .6 ~ 'mod',
  selected.vars$prop_stream_ring <= .3 ~ 'low'
)
# Compute k-means with k = 2
set.seed(123)
res.km <- kmeans(scale(selected.vars[, -c(3,4,5)]), 2, nstart = 25)
# K-means clusters showing the group of each individuals
# res.km$cluster
# Dimension reduction using PCA
res.pca <- prcomp(selected.vars[,-c(3,4,5)],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Add Species groups from the original data sett
ind.coord$prop <- selected.vars$prop
# Data inspection
head(ind.coord)

##          Dim.1      Dim.2 cluster prop
## 2   0.91655346  1.5852977       1 high
## 5  -1.68152275 -1.0127785       2  low
## 6   0.52820252  0.2415978       1 high
## 7   0.80859583 -1.7708463       1 high
## 8  -1.68152275 -1.0127785       2  low
## 11  0.05052805  0.7192723       1 high

# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1        1.2             58.1                        58.1
## Dim.2        0.8             41.9                       100.0

ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  shape = 'prop', size = 3,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)

Other Large Whales:

Figure 5. Cluster plot of other large whales and number of WCRs with shelf streamers

### Cluster : Other whales 
selected.vars.ow <- df[, c('other_whale_sight', 
                               'num_ring_with_streamer',
                               'num_ring', 'prop_stream_ring')]
selected.vars.ow <- na.omit(selected.vars.ow)
# optimal number of clusters:   
fviz_nbclust(selected.vars.ow[,-c(3,4)], kmeans, method = "silhouette")

# High low
selected.vars.ow$prop <- case_when(
  selected.vars.ow$prop_stream_ring >= .5 ~ 'high',
  selected.vars.ow$prop_stream_ring < .5 ~ 'low'
)
# High med low
selected.vars.ow$prop <- case_when(
  selected.vars.ow$prop_stream_ring >= .6 ~ 'high',
  selected.vars.ow$prop_stream_ring >= .3 & selected.vars.ow$prop_stream_ring < .6 ~ 'mod',
  selected.vars.ow$prop_stream_ring <= .3 ~ 'low'
)
# Compute k-means with k = 3
set.seed(123)
res.km <- kmeans(scale(selected.vars.ow[, -c(3,4,5)]), 3, nstart = 25)
# K-means clusters showing the group of each individuals
# res.km$cluster
# Dimension reduction using PCA
res.pca <- prcomp(selected.vars.ow[,-c(3,4,5)],  scale = TRUE)
# Coordinates of individuals
ind.coord <- as.data.frame(get_pca_ind(res.pca)$coord)
# Add clusters obtained using the K-means algorithm
ind.coord$cluster <- factor(res.km$cluster)
# Add Species groups from the original data sett
ind.coord$prop <- selected.vars.ow$prop
# Data inspection
head(ind.coord)

##          Dim.1      Dim.2 cluster prop
## 2  -2.04084739  0.3139595       1 high
## 5   0.80100838 -1.7349375       2  low
## 6  -0.03628385  0.6882721       1 high
## 7  -1.89739403 -2.8030890       3 high
## 8   1.19748773 -1.3384582       2  low
## 11  0.06283599  0.7873920       1 high

# Percentage of variance explained by dimensions
eigenvalue <- round(get_eigenvalue(res.pca), 1)
variance.percent <- eigenvalue$variance.percent
head(eigenvalue)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1        1.1             54.4                        54.4
## Dim.2        0.9             45.6                       100.0

ggscatter(
  ind.coord, x = "Dim.1", y = "Dim.2", 
  color = "cluster", palette = "npg", ellipse = TRUE, ellipse.type = "convex",
  shape = 'prop', size = 3,  legend = "right", ggtheme = theme_bw(),
  xlab = paste0("Dim 1 (", variance.percent[1], "% )" ),
  ylab = paste0("Dim 2 (", variance.percent[2], "% )" )
) +
  stat_mean(aes(color = cluster), size = 4)

A few caveats to Kmeans clustering:

Assumes prior knowledge of data and requires analyst to choose the appropriate number of clusters (k) in advance
Final results are sensitive to initial random selection of cluster centers, outliers, and order of data.

This means, each algorithm run may lead to different clustering results.

References:

Hartigan, JA, and MA Wong. 1979. “Algorithm AS 136: A K-means clustering algorithm.” Applied Statistics. Royal Statistical Society, 100–108.

MacQueen, J. 1967. “Some Methods for Classification and Analysis of Multivariate Observations.” In Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1: Statistics, 281–97. Berkeley, Calif.: University of California Press. http://projecteuclid.org:443/euclid.bsmsp/1200512992.