Setting up your environment


Load libraries

## Wrangling 
library(dplyr)
library(tidyverse)
library(tidyr)          ## gather
## Diagnostics
library(car)            ## Anova, influence plot
library(rsample)       ## data splitting 
library(randomForest)   ## basic implementation
library(ranger)         ## a faster implementation of randomForest
#library(caret)         ## a package for performing many machine learning models
#library(h2o)           ## an extremely fast java-based platform         
library(mgcv)           ## gam, multinom
## Extraction/graphics
library(synchrony)      ## latlon2dist
library(patchwork)
library(geosphere)
library(ggplot2)
#library(broom.mixed)
#library(emmeans)
#library(dotwhisker)
#library(gridExtra)
## From GitHub: 
#library(r2glmm)        ## bbolker/r2glmm
#library(remef)         ## hohenstein/remef
#source("utils.R")
#source("gamm4_utils.R")

Import data

aa <- read.csv('AA_1997_2019_illex_formatted_with_vesstype.csv')
sf <- read.csv('crb&nefop_illex_2008-2020_atleast100lb.csv')
obs <- read.csv('OBLEN_DATA_VIEW.csv')
cbwk <- read.csv('cbwk_env.csv')
sys_state <- read.csv('sys_states.csv') 
demise_fs <- read.csv('demise_fs.csv')
nafo <- read.csv('NAFO_timeshelf_weeks.csv')

Calculate summary statistics for Illex catch

The following code subsets data by selecting weeks 18-44 to represent catch that occurs only during weeks when fishery may be open in any given year.

VTR LPUE and Study Fleet CPUE during the fishing season

# Full VTR data 
# Modify aa dataframe to fishing season
aa <- subset(aa, WEEK %in% c(18:44))
aa$score <- NA
a <- subset(sys_state, score == 'A', select = 'year')
g <- subset(sys_state, score == 'G', select = 'year')
p <- subset(sys_state, score == 'P', select = 'year')
aa.years <- unique(aa$YEAR)
locs1 <- which(aa$YEAR %in% a$year)
locs2 <- which(aa$YEAR %in% g$year)
locs3 <- which(aa$YEAR %in% p$year)
aa[locs1,39] <- 'A'
aa[locs2,39] <- 'G'
aa[locs3,39] <- 'P'

sf$week <- lubridate::week(sf[,28])
sf <- subset(sf, week %in% c(18:44))
sf$score <- NA
locs1 <- which(sf$YEAR %in% a$year)
locs2 <- which(sf$YEAR %in% g$year)
locs3 <- which(sf$YEAR %in% p$year)
sf[locs1,62] <- 'A'
sf[locs2,62] <- 'G'
sf[locs3,62] <- 'P'
# Calc summary stats across all years
wk_aa <- aa %>%
  dplyr::group_by(WEEK, score) %>%
  summarise(n = sum(illexLPUE),
            mean_cpue = mean(illexLPUE),
            sd_cpue = sd(illexLPUE),
            max_cpue = max(illexLPUE),
            min_cpue = min(illexLPUE))


# Study Fleet (not targeted trips)
wk_sf <- sf %>%
  dplyr::group_by(week, score) %>%
  summarise(n = sum(illexCPUE),
            mean_cpue = mean(illexCPUE),
            sd_cpue = sd(illexCPUE),
            max_cpue = max(illexCPUE),
            min_cpue = min(illexCPUE))

# use for percentage of catch that week/month/year
# dt <- aa %>%
#   dplyr::group_by(illexLPUE, score)%>%
#   dplyr::tally()%>%
#   dplyr::mutate(percent=n/sum(n))
# # Generate a series of figures
# g1 <- ggplot(data = dt, aes(x = illexLPUE, y = n, fill = score))+
#   geom_bar(stat="identity")+
#   geom_text(aes(label=paste0(sprintf("%1.1f", percent*100),"%")),
#             position=position_stack(vjust=0.5),
#             colour="white", size = 2) +
#   xlab('Days old') +
#   ylab('Number of Rings') +
#   labs(title = 'Age') +
#   theme_minimal()
p0 <- ggplot(wk_sf, aes(x = week, y = mean_cpue, fill = score)) +
  geom_bar(stat="identity" ) +
  xlab('Week') +
  ylab('Mean CPUE (lbs)') +
  labs(title = 'Mean CPUE Study Fleet') +
  #geom_text(aes(label=year), vjust=1.6, color="white", size=3.5)+
  theme_minimal()

p1 = ggplot(data = wk_sf,
       mapping = aes(x = week,
                     y = mean_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(18,44, by = 2)) +
  xlab('Week') +
  ylab('Mean CPUE SF') + 
  labs(title = 'Mean CPUE SF', fill = 'System state') +
  geom_errorbar(aes(ymin = mean_cpue - sd_cpue, ymax = mean_cpue + sd_cpue),
                width = .2, position = position_dodge(.9)) + 
  theme_minimal() +   
  facet_wrap(~score)
p1.1 = ggplot(data = wk_sf,
       mapping = aes(x = week,
                     y = mean_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(18,44, by = 2)) +
  xlab('Week') +
  ylab('Mean CPUE SF') + 
  labs(title = 'Mean CPUE SF', fill = 'System state') +
  theme_minimal() +   
  facet_wrap(~score)
p2 = ggplot(data = wk_sf,
       mapping = aes(x = week,
                     y = max_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(18,44, by = 2)) +
  xlab('Week') +
  ylab('Max CPUE') + 
  labs(title = 'Max CPUE SF', fill = 'System state') +
  theme_minimal() +   
  facet_wrap(~score)

p3 = ggplot(data = wk_aa,
            mapping = aes(x = WEEK,
                          y = mean_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(1,53, by = 5)) +
  xlab('Week') +
  ylab('Mean LPUE') + 
  labs(title = 'Mean LPUE AA') +
  theme_minimal() +   
  facet_wrap(~score)
p4 = ggplot(data = wk_aa,
       mapping = aes(x = WEEK,
                     y = mean_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(1,53, by = 5)) +
  xlab('Week') +
  ylab('Mean LPUE') + 
  labs(title = 'Mean LPUE AA') +
  geom_errorbar(aes(ymin = mean_cpue - sd_cpue, ymax = mean_cpue + sd_cpue),
                width = .2, position = position_dodge(.9)) + 
  theme_minimal() +   
  facet_wrap(~score)

p5 = ggplot(data = wk_aa,
       mapping = aes(x = WEEK,
                     y = max_cpue)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(1,53, by = 5)) +
  xlab('Week') +
  ylab('Max LPUE') + 
  labs(title = 'Max LPUE AA') +
  theme_minimal() +   
  facet_wrap(~score)

p6 = ggplot(data = wk_aa,
       mapping = aes(x = WEEK,
                     y = n)) +
  geom_line(color = 'black') +
  geom_point(aes(fill = factor(score)), shape = 21,  size = 4) +
  scale_x_continuous(breaks = seq(1,53, by = 5)) +
  xlab('Week') +
  ylab('Sum of LPUE') + 
  labs(title = 'Total LPUE AA') +
  theme_minimal() +   
  facet_wrap(~score)

p7 = ggplot(wk_aa, aes(x = WEEK, y = mean_cpue, fill = score)) +
  geom_bar(stat="identity" ) +
  xlab('Week') +
  ylab('Mean LPUE (lbs)') +
  labs(title = 'Mean LPUE AA') +
  #geom_text(aes(label=year), vjust=1.6, color="white", size=3.5)+
  theme_minimal()

VTR 1997:2020

This includes data during the fishing season only (weeks 18:44) across all available years (1997 through 2020) for both targeted and not targeted trips.

p6

p3/p4

p5

p7

Study Fleet 2011:2020

This includes data during the fishing season only (weeks 18:44) across a portion of years (2011 through 2020). Includes both targeted and not targeted trips.

p0
## Warning: Removed 46 rows containing missing values (position_stack).

p1
## Warning: Removed 46 rows containing missing values (geom_point).

p1.1
## Warning: Removed 46 rows containing missing values (geom_point).

p2
## Warning: Removed 46 rows containing missing values (geom_point).

Modify data


Add in NAFO subareas, system state classifications, ring info

# Change the format of catch by week data frame (cbwk) to include nafo zone
# as a variable 
names(cbwk) <- tolower(names(cbwk))
cbwk <- cbwk[,-c(9:16)] # remove empty columns 
nafo_zones <- c('5ze', '5zw', '6a', '6b', '6c')
z1 <- cbwk[,c(2:9, 14, 19, 24)]
colnames(z1) <- c('year', 'month', 'week', 'lon', 'lat', 'permit', 
                  'cpue', 'chl_anom', 'sst_anom', 'sdsst', 'msst')
#z1$zone <- '5ze'
z1$zone <- 1
z2 <- cbwk[,c(2:8, 10, 15, 20, 25)]
colnames(z2) <- c('year', 'month', 'week', 'lon', 'lat', 'permit',
                  'cpue','chl_anom', 'sst_anom', 'sdsst', 'msst')
#z2$zone <- '5zw'
z2$zone <- 2
z3 <- cbwk[,c(2:8, 11, 16, 21, 26)]
colnames(z3) <- c('year', 'month', 'week', 'lon', 'lat', 'permit',
                  'cpue', 'chl_anom', 'sst_anom', 'sdsst', 'msst')
#z3$zone <- '6a'
z3$zone <- 3
z4 <- cbwk[,c(2:8, 12, 17, 22, 27)]
colnames(z4) <- c('year', 'month', 'week', 'lon', 'lat', 'permit', 
                  'cpue', 'chl_anom', 'sst_anom', 'sdsst', 'msst')
#z4$zone <- '6b'
z4$zone <- 4
z5 <-  cbwk[,c(2:8, 13, 18, 23, 28)]
colnames(z5) <- c('year', 'month', 'week', 'lon', 'lat', 'permit', 
                  'cpue', 'chl_anom', 'sst_anom', 'sdsst', 'msst')
#z5$zone <- '6c'
z5$zone <- 5

# Join to make a data frame in long format
df <- rbind(z1,z2, z3,z4,z5)
colnames(df)[12] <- 'subarea'

# Add score
df$score <- NA
locs1 <- which(df$year == 2011)
df[locs1,13] <- 'A'
locs2 <- which(df$year == 2012)
df[locs2,13] <- 'A'
locs3 <- which(df$year == 2013)
df[locs3,13] <- 'P'
locs4 <- which(df$year == 2014)
df[locs4,13] <- 'A'
locs5 <- which(df$year == 2015)
df[locs5,13] <- 'P'
locs6 <- which(df$year == 2016)
df[locs6,13] <- 'P'
locs7 <- which(df$year == 2017)
df[locs7,13] <- 'G'
locs8 <- which(df$year == 2018)
df[locs8,13] <- 'G'
locs9 <- which(df$year == 2019)
df[locs9,13] <- 'G'
locs10 <- which(df$year == 2020)
df[locs10,13] <- 'G'
# Add system state
df$state <- NA
locs1 <- which(df$year == 2011)
df[locs1,14] <- '1'
locs2 <- which(df$year == 2012)
df[locs2,14] <- '1'
locs3 <- which(df$year == 2013)
df[locs3,14] <- '0'
locs4 <- which(df$year == 2014)
df[locs4,14] <- '1'
locs5 <- which(df$year == 2015)
df[locs5,14] <- '0'
locs6 <- which(df$year == 2016)
df[locs6,14] <- '0'
locs7 <- which(df$year == 2017)
df[locs7,14] <- '2'
locs8 <- which(df$year == 2018)
df[locs8,14] <- '2'
locs9 <- which(df$year == 2019)
df[locs9,14] <- '2'
locs10 <- which(df$year == 2020)
df[locs10,14] <- '2'


# Add id to match with occupancy
df$id <- NA
locs1 <- which(df$subarea == 1)
df[locs1,15] <- '5ze'
locs2 <- which(df$subarea == 2)
df[locs2,15] <- '5zw'
locs3 <- which(df$subarea == 3)
df[locs3,15] <- '6a'
locs4 <- which(df$subarea == 4)
df[locs4,15] <- '6b'
locs5 <- which(df$subarea == 5)
df[locs5,15] <- '6c'

Ring Occupancy


The following code uses A.Silver’s occupancy metric. “Occupancy is a count of how many days a ring was present in a given .1° bin of latitude and longitude. All rings present on a given day where found. For each ring all .1° latitude and longitude bins within the equivalent area of the ring (calculated from the radius) get a count of 1 ring day added to the occupancy. This process was repeated for all days within the given timeframe.”

Modify occupancy data

# Modify occupancy data 
# convert to long format
occupancy <- gather(nafo, id, occupancy, z6C:z5Ze, factor_key = TRUE)
occupancy$score <- NA
colnames(occupancy) <- c('year', 'week', 'total.occ', 'id', 
                         'occ.by.zone', 'score')
locs1 <- which(occupancy$year == 2011)
occupancy[locs1,6] <- 'A'
locs2 <- which(occupancy$year == 2012)
occupancy[locs2,6] <- 'A'
locs3 <- which(occupancy$year == 2013)
occupancy[locs3,6] <- 'P'
locs4 <- which(occupancy$year == 2014)
occupancy[locs4,6] <- 'A'
locs5 <- which(occupancy$year == 2015)
occupancy[locs5,6] <- 'P'
locs6 <- which(occupancy$year == 2016)
occupancy[locs6,6] <- 'P'
locs7 <- which(occupancy$year == 2017)
occupancy[locs7,6] <- 'G'
locs8 <- which(occupancy$year == 2018)
occupancy[locs8,6] <- 'G'
locs9 <- which(occupancy$year == 2019)
occupancy[locs9,6] <- 'G'
locs10 <- which(occupancy$year == 2020)
occupancy[locs10,6] <- 'G'

The following figures explore ring occupancy for rings existing West of -65 during the fishing season and hatching season

Hatching season consists of 2 time periods: Dec/Jan, March/May. The Dec/Jan subsets represent paralarvae that are expected to hatch during December-January and potentially make up the majority of the spring/early summer catch, based on a 6-7 month maturation phase (Hendrickson_ea_04). The second hatching season is proposed to take place in March - May and the mature and spawning squid would then likely make up the majority of the Fall catch. This 6-7 month maturation phase, () fairly consistent with other Illex species (Schwarz_Perez_12). Schwarz and collegues have found Illex argentinus generally spend ~30 days as paralarvae, ~ 130 days as juveniles and 30-60 days as mature/spawning adults.

The following code uses these assumptions about the timing of maturation to explore WCR occupancy during paralarvae and juvenile phases - where rings may serve as transport or food resource aggregation mechansisms.

Looking at Ring occupancy during fishing season

# Plot
year_plot

#occ_plot
tyear

myear

g1

g2

Looking at Ring occupancy during prelarvae and juvenile time periods

# Plot 
g3/g4

g5/g6

g6/g7

Quick ANOVA to test differences:

# Quantify
aov2.1 <- aov(total.occ ~ score, data = occ)
Anova(aov2.1, type = 'III')
## Anova Table (Type III tests)
## 
## Response: total.occ
##              Sum Sq   Df F value    Pr(>F)    
## (Intercept)  5371.9    1 279.185 < 2.2e-16 ***
## score         551.4    2  14.329 6.951e-07 ***
## Residuals   25918.2 1347                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov2.1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = total.occ ~ score, data = occ)
## 
## $score
##           diff       lwr        upr     p adj
## G-A -0.5493827 -1.225923  0.1271575 0.1375346
## P-A -1.6172840 -2.340536 -0.8940321 0.0000005
## P-G -1.0679012 -1.744441 -0.3913610 0.0006472
# Consider only rings that were on shelf for greater than 0 days
# aov2.1 <- aov(timeshelf ~ score, data = occ[which(occ$timeshelf>0),])
# Anova(aov2.1, type = 'III')
# TukeyHSD(aov2.1)

Merge occupancy data to study fleet environmental data

# Try to put occupancy and df together

occ_subareas <- df %>% 
  dplyr::filter(weeks_old == 1) # create new dataframe which isolates 
                                # the first week of observation for each ring
z4 <-origin %>%
  dplyr::filter(lon > -60.000, lon < -55.000) # subset 'origin' df by zone
z3 <-origin %>%
  dplyr::filter(lon > -65.000, lon < -60.000)
z2 <-origin %>%
  dplyr::filter(lon > -70.000, lon < -65.000)
z1 <-origin %>%
  dplyr::filter(lon > -75.000, lon < -70.000)
# Number of rings from each zone should sum to 281 (total rings)..
test = sum(nrow(z4)+ nrow(z3)+ nrow(z2) + nrow(z1))

# Identify where rings that hit the shelf during the fishing season originated,
# keeping an eye out for those that were born in zones 2 or 1 in particular. 
origin_key <- rbind(cbind(as.character(z1$id), rep("zone1", nrow(z1))),
                    cbind(as.character(z2$id), rep("zone2", nrow(z2))),
                    cbind(as.character(z3$id), rep("zone3", nrow(z3))),
                    cbind(as.character(z4$id), rep("zone4", nrow(z4))))
colnames(origin_key) <- c("id", "origin")
origin_key <- as.data.frame(origin_key)
new_data <- dplyr::left_join(df,
                             origin_key,
                             by = "id")



a <- subset(occupancy, score == 'A', select = 'year')
g <- subset(sys_state, score == 'G', select = 'year')
p <- subset(sys_state, score == 'P', select = 'year')
aa.years <- unique(aa$YEAR)
locs1 <- which(aa$YEAR %in% a$year)
locs2 <- which(aa$YEAR %in% g$year)
locs3 <- which(aa$YEAR %in% p$year)
aa[locs1,39] <- 'A'
aa[locs2,39] <- 'G'
aa[locs3,39] <- 'P'


# Add # of rings decaying
# Identify unique years for subsetting
years <- sort(unique(df$year))
# Extract env values at fishing locations by year/week 
for(i in 1:length(years)){
  tmp <- subset(df, year == years[i])
  weeks = sort(unique(tmp$week))
  for(k in 1:length(bricks)){
    btmp <- brick(bricks[k])
    btmp@data@names <- as.numeric(c(1:52))
    if(substr(bricks[k], start =  82, stop =  83) == substr(years[i], start =  3, stop =  4)){
      for(j in 1:length(weeks)){
        tmp2 <- subset(tmp, WEEK == weeks[j])
        locs <- which(mean_cbwk$YEAR == years[i] & mean_cbwk$WEEK == weeks[j])
        mean_cbwk$sst[locs] <- sst
        
      }
    }
  }
}
df$weeks_old <- NA
df$days_old <- NA
for(i in 1:length(ringnames)){
  #if(ringname[i] == d[i,1] ){
  tmp2 <- subset(d, id == ringnames[1,i])
  for(j in 1:length(tmp2[,1])){
    tmp2[j, 5] <-  (j)
    tmp2[j, 6] <- ((tmp2[j,5]+1) - tmp2[1, 5])*7
    locs <- which(d$id == ringnames[1,i])
    d$weeks_old[locs] <- tmp2[,5]
    d$days_old[locs] <- tmp2[,6]
  }
}

Determine distance between fishing points and ring corridor

coords=c(32, -125, 43, -130)
# Compute great circle distance
latlon2dist(coords)





dlrlog_shelfdist <- dist2isobath(pt.baty415, x = dlrlog2[,"LONGDD"], y = dlrlog2[,"LATDD"], isobath = -200)
dlrlog2$shelfdist <- dlrlog_shelfdist$distance
dlrlog1 <- left_join(dlrlog1, dlrlog2)
rm(latlon,UTMcoord)


# Nahant location
nahant_lat_north <- 42.4266
nahant_lon_west <- -70.9223
nahant_lon_east <- nahant_lon_west + 360
# Dates of interest
dates_start <- as.Date("1870-01-01")
dates_stop <- as.Date("2099-12-30")
# list.files function altered to read files from another path * see above(path =) 
files <- list.files(path = path, pattern = glob2rx("tos_day_*.nc"), full.names = TRUE)
files2 <- list.files(path = path, pattern = glob2rx("tos_day_*.nc"), full.names = FALSE)
closest_dists <- numeric(length(files))
for (i in 1:length(files)) {
  nc <- nc_open(files[i])
  # Get attributes of time
  # ncatt_get(nc, "time")
  t <- as.character(ncvar_get(nc, "time"))
  tos <- ncvar_get(nc, "tos")
  lat <- ncvar_get(nc, "lat")
  lon <- ncvar_get(nc, "lon")
  year <- substr(t, start = 1, stop = 4)
  month <- substr(t, start = 5, stop = 6)
  day <- substr(t, start = 7, stop = 8)
  date <- as.Date(paste(year, month, day, sep = "/"))
  
  all_locs <- as.matrix(expand.grid(lat = lat, lon = lon))
  all_dists <-numeric(length = nrow(all_locs))
  
  # Compute the distance between all locations and Nahant
  for (j in 1:nrow(all_locs)) {
    # Ignore locations with no temperature data
    if (is.na(tos[lon == all_locs[j, 2], lat == all_locs[j, 1], 1])) {
      all_dists[j] <- NA
    } else {
      all_dists[j] <- latlon2dist(c(nahant_lat_north, 
                                    nahant_lon_east, 
                                    all_locs[j, ]))
    }
  }
  # Find closest location with temperature data
  closest <- which.min(all_dists)
  closest_dists[i] <- min(all_dists, na.rm = TRUE)
  closest_coords <- all_locs[closest, ]
  closest_lat_loc <- which(lat == closest_coords[1])
  closest_lon_loc <- which(lon == closest_coords[2])
  # longitude, latitude, time
  closest_tos <- tos[lon == lon[closest_lon_loc], 
                     lat == lat[closest_lat_loc], 
                     date >= dates_start & date <= dates_stop]
  tos_df <- data.frame(date = na.omit(date[date >= dates_start & date <= dates_stop]), 
                       tos = na.omit(k2c(closest_tos)))
  write.csv(file = paste0("nahant_", 
                          substr(files2[i], 
                                 start = 1, 
                                 stop = nchar(files2[i]) - 3), 
                          ".csv"), tos_df)
  nc_close(nc)
}
write.csv(file = "closest_dists_nahant.csv", 
          data.frame(model = files2, closest_dist = closest_dists))

  # get indices of the grid cell closest to Spencer Canyon 
  tlon <- -73.1650; tlat <- 38.6319
  j <- which.min(abs(lon-tlon))
  k <- which.min(abs(lat-tlat))
  print(c(j, lon[j], k, lat[k]))
  
  # get time series for the closest gridpoint
  fprob_spc1 <- fprob_ts1[j, k, ]
  fprob_spc2 <- fprob_ts2[j, k, ]

Relationship between pulses & states

No relationship

ss<- sys_state[-24,]
ss2<- sys_state[-23,]
table(ss$leslie, ss$score)   
##    
##     A G P
##   F 2 2 2
##   P 1 1 1
#chisq.test(table(ss$leslie, ss$score))$expected
fisher.test(table(ss$leslie, ss$score))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(ss$leslie, ss$score)
## p-value = 1
## alternative hypothesis: two.sided
table(ss2$leslie, ss2$score)   
##    
##     A G P
##   F 2 1 2
##   P 1 2 1
#chisq.test(table(ss2$leslie, ss2$score))$expected
fisher.test(table(ss2$leslie, ss2$score))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(ss2$leslie, ss2$score)
## p-value = 1
## alternative hypothesis: two.sided

Run Random Forest

# Create training (70%) and test (30%) sets for the data.
# Use set.seed for reproducibility

set.seed(123)
df_split <- initial_split(df, prop = .7) # can add ,lag = n
df_train <- training(df_split)
df_test  <- testing(df_split)

# 1.  Given training data set
# 2.  Select number of trees to build (ntrees)
# 3.  for i = 1 to ntrees do
# 4.  |  Generate a bootstrap sample of the original data
# 5.  |  Grow a regression tree to the bootstrapped data
# 6.  |  for each split do
# 7.  |  | Select m variables at random from all p variables
# 8.  |  | Pick the best variable/split-point among the m
# 9.  |  | Split the node into two child nodes
# 10. |  end
# 11. | Use typical tree model stopping criteria to determine when a tree is complete (but do not prune)
# 12. end


# for reproduciblity
set.seed(123)

# default RF model
dfrf <- df[complete.cases(df$cpue),]
dfrf <- df[complete.cases(df), ]

df <- df %>% drop_na() 
df_split <- initial_split(df, prop = .7) # can add ,lag = n
df_train <- training(df_split)
df_test  <- testing(df_split)

# m1 <- randomForest(
#   formula = cpue ~ .,
#   data    = dfrf
# )
# 
# m1
# plot(m1)
# # number of trees with lowest MSE
# which.min(m1$mse)
# 
# 
# # RMSE of this optimal random forest
# sqrt(m1$mse[which.min(m1$mse)])


optimal_ranger <- ranger(
    formula         = cpue ~ ., 
    data            = df_train, 
    num.trees       = 500,
    mtry            = 13,
    min.node.size   = 5,
    sample.fraction = .8,
    importance      = 'impurity'
  )
## Growing trees.. Progress: 59%. Estimated remaining time: 21 seconds.
optimal_ranger$variable.importance %>% 
  tidy() %>%
  dplyr::arrange(desc(x)) %>%
  dplyr::top_n(25) %>%
  ggplot(aes(reorder(names, x), x)) +
  geom_col(fill = 'slategrey') +
  coord_flip() +
  ggtitle("Top 13 important variables")+
  theme_minimal()
## Warning: 'tidy.numeric' is deprecated.
## See help("Deprecated")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## Selecting by x

Set up for GAMs

## Set up models - Dealer/VTR Data - GAMs

## Make minor modifications to dataframe, consistent with study fleet and 
#observer above (add 1 pound to all LPUE so that the log() link will behave
#and create an integer column of rounded LPUE to use the negative binomial).

## For model fitting, I'll fit a null model and a model with only year effects first with each distributional assumption (lognormal, gamma, and negative binomial). Then I'll use step() and/or manual stepwise selection to add more variables to the one with the best fit/diagnostics. 

####-------------------------------filter and format: Dealer/VTR -------------------####

# aa <- dplyr::filter(aa, istarget == "target") %>% 
#   droplevels() %>% 
#   mutate(lpue = illexLPUE + 1 # add a small constant to all  so log link can be applied
#          , year = as.factor(as.character(year))
#          , week = as.factor(as.character(week))
#          , lpue.int = round(illexLPUE, 0)
#   )
aa.mod <- aa %>% 
  #droplevels() %>% 
  mutate(lpue = illexLPUE + 1 # add a small constant to all  so log link can be applied
         , year = as.factor(as.character(YEAR))
         , week = as.factor(as.character(WEEK))
         , lpue.int = round(illexLPUE, 0)
  )


df.mod <- df %>% 
  #droplevels() %>% 
  mutate(cpue = cpue + 1 # add a small constant to all  so log link can be applied
         , year = as.factor(as.character(year))
         , week = as.factor(as.character(week))
         , cpue.int = round(cpue, 0)
  )

Run GAMs

# GAM with zone as a factor
mod <- gam(cpue ~ s(sdsst) + subarea,
            data = df, method = 'REML')
summary(mod)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## cpue ~ s(sdsst) + subarea
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2003.80      68.32  29.328  < 2e-16 ***
## subarea        60.20      20.66   2.914  0.00357 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value    
## s(sdsst) 8.706  8.976 68.43  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00759   Deviance explained = 0.771%
## -REML = 8.2348e+05  Scale est. = 6.5338e+07  n = 79055
#plot(mod, pch = 1, pages = 1, all.terms = TRUE)
#gam.check(mod, page = 1)
plot(mod, all.terms = TRUE,  pages = 1,shade = TRUE, shade.col = "grey", 
     shift = coef(mod)[1], seWithMean = TRUE)

#?family.mgcv
#gam0 <- gam(cpue ~  year, data = df, family = nb)

## Try with subareas as random effects (bs = 're')

## Try with Markov Random Field Smooths (bs= 'mrf')

# Different GAMs for each zone
# g5zw <- gam(cpue ~ s(sst_anom_5zw), data = cbwk, method = 'REML')
# plot(g5zw, pch = 1, main = '5zw',  shade = TRUE, shade.col = "grey", 
#      shift = coef(g5zw)[1], seWithMean = TRUE)
# g5ze <-  gam(cpue ~ s(sst_anom_5ze), data = cbwk, method = 'REML')
# plot(g5ze, pch = 1, main = '5zw',  shade = TRUE, shade.col = "grey", 
#      shift = coef(g5ze)[1], seWithMean = TRUE)
# g6a <- gam(cpue ~ s(sst_anom_6a), data = cbwk, method = 'REML')
# plot(g6a, pch = 1, main = '5zw',  shade = TRUE, shade.col = "grey", 
#      shift = coef(g6a)[1], seWithMean = TRUE)
# g6b <- gam(cpue ~ s(sst_anom_6b), data = cbwk, method = 'REML')
# plot(g6b, pch = 1, main = '5zw',  shade = TRUE, shade.col = "grey", 
#      shift = coef(g6b)[1], seWithMean = TRUE)
# g6c <- gam(cpue ~ s(sst_anom_6c), data = cbwk, method = 'REML')
# plot(g6c, pch = 1, main = '5zw',  shade = TRUE, shade.col = "grey", 
#      shift = coef(g6c)[1], seWithMean = TRUE)
# g5zw <- gam(cpue ~ s(chl_anom_5zw, k = 10) + s(sst_anom_5zw, k = 10),
#             data = cbwk, method = 'REML')
# summary(g5zw)
# gam.check(g5zw, page = 1)
# concurvity(g5zw, full = TRUE)
# concurvity(g5zw, full = FALSE)
# 
# plot(g5zw, pages = 1, main = '5Zw', all.terms = TRUE,  shade = TRUE, 
#      shade.col = "lightgrey", 
#      shift = coef(g5zw)[1], seWithMean = TRUE)
# g5ze <-  gam(cpue ~ s(chl_anom_5ze), data = cbwk, method = 'REML')
# plot(g5ze, pch = 1, main = '5ze')
# g6a <- gam(cpue ~ s(chl_anom_6a), data = cbwk, method = 'REML')
# plot(g6a, pch = 1, main = '6a')
# g6b <- gam(cpue ~ s(chl_anom_6b), data = cbwk, method = 'REML')
# plot(g6b, pch = 1, main = '6b')
# g6c <- gam(cpue ~ s(chl_anom_6c) + s(sst_anom_6c), data = cbwk, method = 'REML')
# 
# plot(g6c, main = '6C', all.terms = TRUE,  shade = TRUE, shade.col = "lightgrey", 
#      shift = coef(g6c)[1], seWithMean = TRUE)
####### a function for model comparisons ####### 
compare.fun <- function(modlist) {
  modcompare <- data.frame(matrix(NA, nrow=length(modlist),ncol = 6))
  modcompare[,1] <- as.character()
  for (i in 1:length(modlist)) {
    modcompare[i,1] <- as.character(modlist[[i]]$call$formula[3])
    # modcompare[i,1] <- as.character(modlist[[i]]$formula[3])
    modcompare[i,2] <- round(modlist[[i]]$df.residual, digits = 0)
    modcompare[i,3] <- modlist[[i]]$deviance
    modcompare[i,4] <- round(modlist[[i]]$deviance/modlist[[i]]$df.residual, digits = 2)
    modcompare[i,5] <- round(((modlist[[i]]$null.deviance - modlist[[i]]$deviance) / modlist[[i]]$null.deviance)*100, digits=2)
    modcompare[i,6] <- modlist[[i]]$aic
  }
  colnames(modcompare) <- c("Formula",  "df", "ResDeviance", "Dev/df", "PercDevExplained", "AIC")
  return(modcompare %>% arrange(AIC))
}

yreffectplots <- function(mydata, mymodel, myY, mytitle) {
  
  years <- sort(unique(mydata$YEAR))
  coefs <- c(mymodel$coefficients[1], mymodel$coefficients[1] + mymodel$coefficients[2:length(years)])
  
  par(mfrow=c(1,1)) # reset plot frame
  
  ggplot() + 
    geom_point(data = mydata, aes(x = YEAR, y = mydata[, myY]), alpha = 0.8, position = position_jitter()) +
    geom_boxplot(data = mydata, aes(x = YEAR, y = mydata[, myY]), fill = NA,  outlier.shape = NA) +
    geom_point(aes(x = years, y = exp(coefs)), color = "red") +
    #theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
    labs(x = "YEAR" , y = "Observed (black) and fitted (red) CPUE") +
    ggplot() + 
    geom_point(data = mydata, aes(x = YEAR, y = log(mydata[, myY])), alpha = 0.8, position = position_jitter()) + 
    geom_boxplot(data = mydata, aes(x = YEAR, y = log(mydata[, myY])), fill = NA,  outlier.shape = NA) + 
    geom_point(aes(x = years, y = coefs), color = "red") +
    #theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
    labs(x = "YEAR", y = "Observed log(CPUE) (black) and \n model coefficients (red) ") +
    plot_annotation(title = paste(mytitle, mymodel$family ,"GAM")
                    , subtitle = as.character(mymodel$formula[3]))    
}