Load libraries
## Wrangling
library(dplyr)
library(tidyverse)
library(tidyr) ## gather
## Diagnostics
library(car) ## Anova, influence plot
library(rsample) ## data splitting
library(randomForest) ## basic implementation
library(ranger) ## a faster implementation of randomForest
#library(caret) ## a package for performing many machine learning models
#library(h2o) ## an extremely fast java-based platform
library(mgcv) ## gam, multinom
## Extraction/graphics
library(synchrony) ## latlon2dist
library(patchwork)
library(geosphere)
library(ncdf4) ## nc_open, ncvar_get (netCDF extraction below)
library(raster) ## brick
library(marmap) ## dist2isobath
library(ggplot2)
#library(broom.mixed)
#library(emmeans)
#library(dotwhisker)
#library(gridExtra)
## From GitHub:
#library(r2glmm) ## bbolker/r2glmm
#library(remef) ## hohenstein/remef
#source("utils.R")
#source("gamm4_utils.R")
Import data
aa <- read.csv('AA_1997_2019_illex_formatted_with_vesstype.csv')
sf <- read.csv('crb&nefop_illex_2008-2020_atleast100lb.csv')
obs <- read.csv('OBLEN_DATA_VIEW.csv')
cbwk <- read.csv('cbwk_env.csv')
sys_state <- read.csv('sys_states.csv')
demise_fs <- read.csv('demise_fs.csv')
nafo <- read.csv('NAFO_timeshelf_weeks.csv')
The following code subsets the data to weeks 18-44, representing catch that occurs only during weeks when the fishery may be open in any given year.
VTR LPUE and Study Fleet CPUE during the fishing season
# Full VTR data
# Modify aa dataframe to fishing season
aa <- subset(aa, WEEK %in% c(18:44))
aa$score <- NA
a <- subset(sys_state, score == 'A', select = 'year')
g <- subset(sys_state, score == 'G', select = 'year')
p <- subset(sys_state, score == 'P', select = 'year')
aa.years <- unique(aa$YEAR)
# Assign each trip the system-state score for its year
aa$score[aa$YEAR %in% a$year] <- 'A'
aa$score[aa$YEAR %in% g$year] <- 'G'
aa$score[aa$YEAR %in% p$year] <- 'P'
sf$week <- lubridate::week(sf[,28]) # column 28 is a date column
sf <- subset(sf, week %in% c(18:44))
sf$score <- NA
sf$score[sf$YEAR %in% a$year] <- 'A'
sf$score[sf$YEAR %in% g$year] <- 'G'
sf$score[sf$YEAR %in% p$year] <- 'P'
# Calc summary stats across all years
wk_aa <- aa %>%
  dplyr::group_by(WEEK, score) %>%
  summarise(n = sum(illexLPUE), # 'n' holds summed LPUE (plotted below as total LPUE)
            mean_cpue = mean(illexLPUE),
            sd_cpue = sd(illexLPUE),
            max_cpue = max(illexLPUE),
            min_cpue = min(illexLPUE))
# Study Fleet (not targeted trips)
wk_sf <- sf %>%
  dplyr::group_by(week, score) %>%
  summarise(n = sum(illexCPUE), # 'n' holds summed CPUE
            mean_cpue = mean(illexCPUE),
            sd_cpue = sd(illexCPUE),
            max_cpue = max(illexCPUE),
            min_cpue = min(illexCPUE))
# use for percentage of catch that week/month/year
# dt <- aa %>%
# dplyr::group_by(illexLPUE, score)%>%
# dplyr::tally()%>%
# dplyr::mutate(percent=n/sum(n))
# # Generate a series of figures
# g1 <- ggplot(data = dt, aes(x = illexLPUE, y = n, fill = score))+
# geom_bar(stat="identity")+
# geom_text(aes(label=paste0(sprintf("%1.1f", percent*100),"%")),
# position=position_stack(vjust=0.5),
# colour="white", size = 2) +
# xlab('Days old') +
# ylab('Number of Rings') +
# labs(title = 'Age') +
# theme_minimal()
p0 <- ggplot(wk_sf, aes(x = week, y = mean_cpue, fill = score)) +
geom_bar(stat="identity" ) +
xlab('Week') +
ylab('Mean CPUE (lbs)') +
labs(title = 'Mean CPUE Study Fleet') +
#geom_text(aes(label=year), vjust=1.6, color="white", size=3.5)+
theme_minimal()
p1 = ggplot(data = wk_sf,
mapping = aes(x = week,
y = mean_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(18,44, by = 2)) +
xlab('Week') +
ylab('Mean CPUE SF') +
labs(title = 'Mean CPUE SF', fill = 'System state') +
geom_errorbar(aes(ymin = mean_cpue - sd_cpue, ymax = mean_cpue + sd_cpue),
width = .2, position = position_dodge(.9)) +
theme_minimal() +
facet_wrap(~score)
p1.1 = ggplot(data = wk_sf,
mapping = aes(x = week,
y = mean_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(18,44, by = 2)) +
xlab('Week') +
ylab('Mean CPUE SF') +
labs(title = 'Mean CPUE SF', fill = 'System state') +
theme_minimal() +
facet_wrap(~score)
p2 = ggplot(data = wk_sf,
mapping = aes(x = week,
y = max_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(18,44, by = 2)) +
xlab('Week') +
ylab('Max CPUE') +
labs(title = 'Max CPUE SF', fill = 'System state') +
theme_minimal() +
facet_wrap(~score)
p3 = ggplot(data = wk_aa,
mapping = aes(x = WEEK,
y = mean_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(1,53, by = 5)) +
xlab('Week') +
ylab('Mean LPUE') +
labs(title = 'Mean LPUE AA') +
theme_minimal() +
facet_wrap(~score)
p4 = ggplot(data = wk_aa,
mapping = aes(x = WEEK,
y = mean_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(1,53, by = 5)) +
xlab('Week') +
ylab('Mean LPUE') +
labs(title = 'Mean LPUE AA') +
geom_errorbar(aes(ymin = mean_cpue - sd_cpue, ymax = mean_cpue + sd_cpue),
width = .2, position = position_dodge(.9)) +
theme_minimal() +
facet_wrap(~score)
p5 = ggplot(data = wk_aa,
mapping = aes(x = WEEK,
y = max_cpue)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(1,53, by = 5)) +
xlab('Week') +
ylab('Max LPUE') +
labs(title = 'Max LPUE AA') +
theme_minimal() +
facet_wrap(~score)
p6 = ggplot(data = wk_aa,
mapping = aes(x = WEEK,
y = n)) +
geom_line(color = 'black') +
geom_point(aes(fill = factor(score)), shape = 21, size = 4) +
scale_x_continuous(breaks = seq(1,53, by = 5)) +
xlab('Week') +
ylab('Sum of LPUE') +
labs(title = 'Total LPUE AA') +
theme_minimal() +
facet_wrap(~score)
p7 = ggplot(wk_aa, aes(x = WEEK, y = mean_cpue, fill = score)) +
geom_bar(stat="identity" ) +
xlab('Week') +
ylab('Mean LPUE (lbs)') +
labs(title = 'Mean LPUE AA') +
#geom_text(aes(label=year), vjust=1.6, color="white", size=3.5)+
theme_minimal()
This includes data during the fishing season only (weeks 18-44) across all available years (1997 through 2020), for both targeted and non-targeted trips.
p6
p3/p4
p5
p7
This includes data during the fishing season only (weeks 18-44) across a subset of years (2011 through 2020), for both targeted and non-targeted trips.
p0
## Warning: Removed 46 rows containing missing values (position_stack).
p1
## Warning: Removed 46 rows containing missing values (geom_point).
p1.1
## Warning: Removed 46 rows containing missing values (geom_point).
p2
## Warning: Removed 46 rows containing missing values (geom_point).
Add in NAFO subareas, system state classifications, ring info
# Change the format of catch by week data frame (cbwk) to include nafo zone
# as a variable
names(cbwk) <- tolower(names(cbwk))
cbwk <- cbwk[,-c(9:16)] # remove empty columns
nafo_zones <- c('5ze', '5zw', '6a', '6b', '6c')
# Each zone's columns follow a fixed offset pattern (zone i holds its cpue and
# env values in columns 8+i, 13+i, 18+i, 23+i), so build the long data frame in a loop
zone_list <- vector('list', length(nafo_zones))
for (i in seq_along(nafo_zones)) {
  zi <- cbwk[, c(2:8, 8 + i, 13 + i, 18 + i, 23 + i)]
  colnames(zi) <- c('year', 'month', 'week', 'lon', 'lat', 'permit',
                    'cpue', 'chl_anom', 'sst_anom', 'sdsst', 'msst')
  zi$zone <- i # numeric zone code; the matching name is nafo_zones[i]
  zone_list[[i]] <- zi
}
# Join to make a data frame in long format
df <- do.call(rbind, zone_list)
colnames(df)[12] <- 'subarea'
# Add score: annual system-state classification by year
score_by_year <- c('2011' = 'A', '2012' = 'A', '2013' = 'P', '2014' = 'A',
                   '2015' = 'P', '2016' = 'P', '2017' = 'G', '2018' = 'G',
                   '2019' = 'G', '2020' = 'G')
df$score <- unname(score_by_year[as.character(df$year)])
# Add system state: numeric recoding of score (P = '0', A = '1', G = '2')
state_by_score <- c(P = '0', A = '1', G = '2')
df$state <- unname(state_by_score[df$score])
# Add id to match with occupancy (NAFO subarea names, from nafo_zones above)
df$id <- nafo_zones[df$subarea]
The following code uses A. Silver’s occupancy metric: “Occupancy is a count of how many days a ring was present in a given 0.1° bin of latitude and longitude. All rings present on a given day were found. For each ring, all 0.1° latitude and longitude bins within the equivalent area of the ring (calculated from the radius) get a count of 1 ring-day added to the occupancy. This process was repeated for all days within the given timeframe.”
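That metric is computed elsewhere; below is a minimal sketch of the logic described in the quote (not A. Silver’s code), assuming a hypothetical data frame `ring_days` with one row per ring per day and columns lat, lon, and radius_km.
# Build the set of occupied 0.1-degree bins for each ring-day
bins_per_day <- lapply(seq_len(nrow(ring_days)), function(i) {
  r <- ring_days[i, ]
  dlat <- r$radius_km / 111                           # ~111 km per degree latitude
  dlon <- r$radius_km / (111 * cos(r$lat * pi / 180)) # km per degree longitude here
  g <- expand.grid(
    lat_bin = seq(round(r$lat - dlat, 1), round(r$lat + dlat, 1), by = 0.1),
    lon_bin = seq(round(r$lon - dlon, 1), round(r$lon + dlon, 1), by = 0.1))
  # keep bins whose centers fall inside the ring's equivalent-area circle
  g[sqrt(((g$lat_bin - r$lat) * 111)^2 +
         ((g$lon_bin - r$lon) * 111 * cos(r$lat * pi / 180))^2) <= r$radius_km, ]
})
# Occupancy = ring-days accumulated per bin
occ_grid <- dplyr::bind_rows(bins_per_day) %>%
  dplyr::count(lat_bin, lon_bin, name = "occupancy")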
# Modify occupancy data
# convert to long format
occupancy <- gather(nafo, id, occupancy, z6C:z5Ze, factor_key = TRUE)
occupancy$score <- NA
colnames(occupancy) <- c('year', 'week', 'total.occ', 'id',
                         'occ.by.zone', 'score')
# Reuse the year -> score lookup defined above
occupancy$score <- unname(score_by_year[as.character(occupancy$year)])
The following figures explore ring occupancy for rings existing west of -65° during the fishing season and the hatching season.
Hatching season consists of two time periods: Dec-Jan and March-May. The Dec-Jan subset represents paralarvae expected to hatch during December-January, which potentially make up the majority of the spring/early-summer catch based on a 6-7 month maturation phase (Hendrickson_ea_04). The second hatching season is proposed to take place in March-May; those mature and spawning squid would then likely make up the majority of the fall catch. This 6-7 month maturation phase is fairly consistent with other Illex species (Schwarz_Perez_12): Schwarz and colleagues found that Illex argentinus generally spend ~30 days as paralarvae, ~130 days as juveniles, and 30-60 days as mature/spawning adults.
The following code uses these assumptions about the timing of maturation to explore WCR occupancy during the paralarval and juvenile phases, when rings may serve as transport or food-resource aggregation mechanisms; a small arithmetic sketch of the timing follows.
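A small sketch of the month arithmetic behind those assumptions (month numbers only; the 6-7 month lag is taken from the text above):
# Shift hatch months forward 6-7 months to get the expected catch window;
# months wrap around the calendar via %% 12
catch_window <- function(hatch_months, lag = 6:7) {
  sort(unique(((rep(hatch_months, each = length(lag)) + lag - 1) %% 12) + 1))
}
catch_window(c(12, 1)) # Dec/Jan hatch -> months 6-8 (Jun-Aug): spring/early-summer catch
catch_window(3:5)      # Mar-May hatch -> months 9-12 (Sep-Dec): fall catch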
# Plot
year_plot
#occ_plot
tyear
myear
g1
g2
# Plot
g3/g4
g5/g6
g6/g7
Quick ANOVA to test differences:
# Quantify
aov2.1 <- aov(total.occ ~ score, data = occ)
Anova(aov2.1, type = 'III')
## Anova Table (Type III tests)
##
## Response: total.occ
## Sum Sq Df F value Pr(>F)
## (Intercept) 5371.9 1 279.185 < 2.2e-16 ***
## score 551.4 2 14.329 6.951e-07 ***
## Residuals 25918.2 1347
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov2.1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = total.occ ~ score, data = occ)
##
## $score
## diff lwr upr p adj
## G-A -0.5493827 -1.225923 0.1271575 0.1375346
## P-A -1.6172840 -2.340536 -0.8940321 0.0000005
## P-G -1.0679012 -1.744441 -0.3913610 0.0006472
# Consider only rings that were on shelf for greater than 0 days
# aov2.1 <- aov(timeshelf ~ score, data = occ[which(occ$timeshelf>0),])
# Anova(aov2.1, type = 'III')
# TukeyHSD(aov2.1)
# Try to put occupancy and df together
occ_subareas <- df %>%
dplyr::filter(weeks_old == 1) # create new dataframe which isolates
# the first week of observation for each ring
z4 <- origin %>%
  dplyr::filter(lon > -60.000, lon < -55.000) # subset 'origin' df by zone
z3 <- origin %>%
  dplyr::filter(lon > -65.000, lon < -60.000)
z2 <- origin %>%
  dplyr::filter(lon > -70.000, lon < -65.000)
z1 <- origin %>%
  dplyr::filter(lon > -75.000, lon < -70.000)
# Number of rings from each zone should sum to 281 (total rings)
test <- nrow(z1) + nrow(z2) + nrow(z3) + nrow(z4)
# Identify where rings that hit the shelf during the fishing season originated,
# keeping an eye out for those that were born in zones 2 or 1 in particular.
origin_key <- rbind(cbind(as.character(z1$id), rep("zone1", nrow(z1))),
cbind(as.character(z2$id), rep("zone2", nrow(z2))),
cbind(as.character(z3$id), rep("zone3", nrow(z3))),
cbind(as.character(z4$id), rep("zone4", nrow(z4))))
colnames(origin_key) <- c("id", "origin")
origin_key <- as.data.frame(origin_key)
new_data <- dplyr::left_join(df,
origin_key,
by = "id")
# Add # of rings decaying
# Identify unique years for subsetting
years <- sort(unique(df$year))
# Extract env values at fishing locations by year/week ('bricks' is a vector of
# raster file paths and 'mean_cbwk' is built in an unshown chunk)
for(i in 1:length(years)){
  tmp <- subset(df, year == years[i])
  weeks <- sort(unique(tmp$week))
  for(k in 1:length(bricks)){
    # match the 2-digit year embedded in the file path to the current year
    if(substr(bricks[k], start = 82, stop = 83) == substr(years[i], start = 3, stop = 4)){
      btmp <- brick(bricks[k]) # raster::brick
      names(btmp) <- 1:52 # one layer per week
      for(j in 1:length(weeks)){
        tmp2 <- subset(tmp, week == weeks[j]) # df uses lowercase 'week'
        locs <- which(mean_cbwk$YEAR == years[i] & mean_cbwk$WEEK == weeks[j])
        mean_cbwk$sst[locs] <- sst # extraction of 'sst' from btmp/tmp2 is elided
      }
    }
  }
}
df$weeks_old <- NA
df$days_old <- NA
# Age each ring: weeks_old counts weeks since first observation; days_old is
# that age in days ('d' holds one row per ring per week, in week order)
for(i in 1:length(ringnames)){
  locs <- which(d$id == ringnames[i])
  d$weeks_old[locs] <- seq_along(locs)
  d$days_old[locs] <- seq_along(locs) * 7
}
# Compute great-circle distance between two points: c(lat1, lon1, lat2, lon2)
coords <- c(32, -125, 43, -130)
latlon2dist(coords)
# Distance from each position to the 200 m isobath (marmap::dist2isobath;
# 'pt.baty415' is a bathymetry object loaded in an unshown chunk)
dlrlog_shelfdist <- dist2isobath(pt.baty415, x = dlrlog2[,"LONGDD"], y = dlrlog2[,"LATDD"], isobath = -200)
dlrlog2$shelfdist <- dlrlog_shelfdist$distance
dlrlog1 <- left_join(dlrlog1, dlrlog2)
rm(latlon,UTMcoord)
# Nahant location
nahant_lat_north <- 42.4266
nahant_lon_west <- -70.9223
nahant_lon_east <- nahant_lon_west + 360 # convert to the 0-360 degrees-east convention used by the model grids
# Dates of interest
dates_start <- as.Date("1870-01-01")
dates_stop <- as.Date("2099-12-30")
# Read the model files from an external directory ('path' is set in an unshown chunk)
files <- list.files(path = path, pattern = glob2rx("tos_day_*.nc"), full.names = TRUE)
files2 <- list.files(path = path, pattern = glob2rx("tos_day_*.nc"), full.names = FALSE)
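# 'k2c' used below is assumed to be a Kelvin-to-Celsius helper defined in an
# unshown chunk, e.g. k2c <- function(x) x - 273.15 (if 'tos' is stored in Kelvin)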
closest_dists <- numeric(length(files))
for (i in 1:length(files)) {
nc <- nc_open(files[i]) # nc_open()/ncvar_get() are from the ncdf4 package
# Get attributes of time
# ncatt_get(nc, "time")
t <- as.character(ncvar_get(nc, "time"))
tos <- ncvar_get(nc, "tos")
lat <- ncvar_get(nc, "lat")
lon <- ncvar_get(nc, "lon")
year <- substr(t, start = 1, stop = 4)
month <- substr(t, start = 5, stop = 6)
day <- substr(t, start = 7, stop = 8)
date <- as.Date(paste(year, month, day, sep = "/"))
all_locs <- as.matrix(expand.grid(lat = lat, lon = lon))
all_dists <- numeric(length = nrow(all_locs))
# Compute the distance between all locations and Nahant
for (j in 1:nrow(all_locs)) {
# Ignore locations with no temperature data
if (is.na(tos[lon == all_locs[j, 2], lat == all_locs[j, 1], 1])) {
all_dists[j] <- NA
} else {
all_dists[j] <- latlon2dist(c(nahant_lat_north,
nahant_lon_east,
all_locs[j, ]))
}
}
# Find closest location with temperature data
closest <- which.min(all_dists)
closest_dists[i] <- min(all_dists, na.rm = TRUE)
closest_coords <- all_locs[closest, ]
closest_lat_loc <- which(lat == closest_coords[1])
closest_lon_loc <- which(lon == closest_coords[2])
# longitude, latitude, time
closest_tos <- tos[lon == lon[closest_lon_loc],
lat == lat[closest_lat_loc],
date >= dates_start & date <= dates_stop]
tos_df <- data.frame(date = na.omit(date[date >= dates_start & date <= dates_stop]),
tos = na.omit(k2c(closest_tos)))
write.csv(file = paste0("nahant_",
substr(files2[i],
start = 1,
stop = nchar(files2[i]) - 3),
".csv"), tos_df)
nc_close(nc)
}
write.csv(file = "closest_dists_nahant.csv",
data.frame(model = files2, closest_dist = closest_dists))
# get indices of the grid cell closest to Spencer Canyon
tlon <- -73.1650; tlat <- 38.6319
j <- which.min(abs(lon-tlon))
k <- which.min(abs(lat-tlat))
print(c(j, lon[j], k, lat[k]))
# get time series for the closest gridpoint
fprob_spc1 <- fprob_ts1[j, k, ]
fprob_spc2 <- fprob_ts2[j, k, ]
No relationship between the Leslie classification and the system-state score:
ss <- sys_state[-24,]
ss2 <- sys_state[-23,]
table(ss$leslie, ss$score)
##
## A G P
## F 2 2 2
## P 1 1 1
#chisq.test(table(ss$leslie, ss$score))$expected
fisher.test(table(ss$leslie, ss$score))
##
## Fisher's Exact Test for Count Data
##
## data: table(ss$leslie, ss$score)
## p-value = 1
## alternative hypothesis: two.sided
table(ss2$leslie, ss2$score)
##
## A G P
## F 2 1 2
## P 1 2 1
#chisq.test(table(ss2$leslie, ss2$score))$expected
fisher.test(table(ss2$leslie, ss2$score))
##
## Fisher's Exact Test for Count Data
##
## data: table(ss2$leslie, ss2$score)
## p-value = 1
## alternative hypothesis: two.sided
# Create training (70%) and test (30%) sets for the data.
# Use set.seed for reproducibility
set.seed(123)
df_split <- initial_split(df, prop = .7) # for a time-series split with a lag, see rsample::initial_time_split(..., lag = n)
df_train <- training(df_split)
df_test <- testing(df_split)
# 1. Given training data set
# 2. Select number of trees to build (ntrees)
# 3. for i = 1 to ntrees do
# 4. | Generate a bootstrap sample of the original data
# 5. | Grow a regression tree to the bootstrapped data
# 6. | for each split do
# 7. | | Select m variables at random from all p variables
# 8. | | Pick the best variable/split-point among the m
# 9. | | Split the node into two child nodes
# 10. | end
# 11. | Use typical tree model stopping criteria to determine when a tree is complete (but do not prune)
# 12. end
# for reproducibility
set.seed(123)
# random forests cannot handle missing values, so keep complete cases only
dfrf <- df[complete.cases(df), ]
df <- df %>% drop_na()
# re-split the complete-case data
df_split <- initial_split(df, prop = .7)
df_train <- training(df_split)
df_test <- testing(df_split)
# default RF model
# m1 <- randomForest(
# formula = cpue ~ .,
# data = dfrf
# )
#
# m1
# plot(m1)
# # number of trees with lowest MSE
# which.min(m1$mse)
#
#
# # RMSE of this optimal random forest
# sqrt(m1$mse[which.min(m1$mse)])
optimal_ranger <- ranger(
formula = cpue ~ .,
data = df_train,
num.trees = 500,
mtry = 13,
min.node.size = 5,
sample.fraction = .8,
importance = 'impurity'
)
## Growing trees.. Progress: 59%. Estimated remaining time: 21 seconds.
# Plot variable importance ('tidy.numeric' is deprecated, so convert the named
# importance vector with tibble::enframe() instead)
optimal_ranger$variable.importance %>%
  tibble::enframe(name = 'names', value = 'x') %>%
  dplyr::arrange(desc(x)) %>%
  dplyr::top_n(25, x) %>%
  ggplot(aes(reorder(names, x), x)) +
  geom_col(fill = 'slategrey') +
  coord_flip() +
  ggtitle("Top 25 important variables") +
  theme_minimal()
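The 70/30 split above is otherwise unused in this chunk; a short sketch of scoring the tuned ranger model on the held-out set (using the objects created above):
# Predict on the held-out rows and compute test RMSE
pred <- predict(optimal_ranger, data = df_test)
sqrt(mean((df_test$cpue - pred$predictions)^2)) # test RMSE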
## Set up models - Dealer/VTR Data - GAMs
## Make minor modifications to the dataframe, consistent with the study fleet and
## observer data above (add 1 pound to all LPUE so that the log() link behaves,
## and create an integer column of rounded LPUE for the negative binomial).
## For model fitting, I'll fit a null model and a model with only year effects first
## under each distributional assumption (lognormal, gamma, and negative binomial).
## Then I'll use step() and/or manual stepwise selection to add more variables to the
## one with the best fit/diagnostics; a sketch follows the data prep below.
####-------------------------------filter and format: Dealer/VTR -------------------####
# aa <- dplyr::filter(aa, istarget == "target") %>%
# droplevels() %>%
# mutate(lpue = illexLPUE + 1 # add a small constant to all so log link can be applied
# , year = as.factor(as.character(year))
# , week = as.factor(as.character(week))
# , lpue.int = round(illexLPUE, 0)
# )
aa.mod <- aa %>%
#droplevels() %>%
mutate(lpue = illexLPUE + 1 # add a small constant to all so log link can be applied
, year = as.factor(as.character(YEAR))
, week = as.factor(as.character(WEEK))
, lpue.int = round(illexLPUE, 0)
)
df.mod <- df %>%
#droplevels() %>%
mutate(cpue = cpue + 1 # add a small constant to all so log link can be applied
, year = as.factor(as.character(year))
, week = as.factor(as.character(week))
, cpue.int = round(cpue, 0)
)
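# A sketch of the fitting plan described above (model names illustrative). Note
# the lognormal fit is on the log scale, so its AIC is not directly comparable
# to the gamma and negative binomial fits without a Jacobian adjustment.
gam.null.gamma <- gam(lpue ~ 1, data = aa.mod, family = Gamma(link = 'log'), method = 'REML')
gam.yr.gamma <- gam(lpue ~ year, data = aa.mod, family = Gamma(link = 'log'), method = 'REML')
gam.yr.nb <- gam(lpue.int ~ year, data = aa.mod, family = nb(), method = 'REML')
gam.yr.ln <- gam(log(lpue) ~ year, data = aa.mod, method = 'REML')
AIC(gam.null.gamma, gam.yr.gamma, gam.yr.nb) # then inspect gam.check() diagnostics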
# GAM with subarea as a linear (numeric) covariate; recode it as a factor to
# estimate separate zone effects
mod <- gam(cpue ~ s(sdsst) + subarea,
data = df, method = 'REML')
summary(mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## cpue ~ s(sdsst) + subarea
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2003.80 68.32 29.328 < 2e-16 ***
## subarea 60.20 20.66 2.914 0.00357 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(sdsst) 8.706 8.976 68.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00759 Deviance explained = 0.771%
## -REML = 8.2348e+05 Scale est. = 6.5338e+07 n = 79055
#plot(mod, pch = 1, pages = 1, all.terms = TRUE)
#gam.check(mod, page = 1)
plot(mod, all.terms = TRUE, pages = 1,shade = TRUE, shade.col = "grey",
shift = coef(mod)[1], seWithMean = TRUE)
#?family.mgcv
#gam0 <- gam(cpue ~ year, data = df, family = nb)
## Try with subareas as random effects (bs = 're')
## Try with Markov Random Field Smooths (bs= 'mrf')
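# For example, a sketch of the random-effect version ('re' smooths need a factor):
# df$fsubarea <- factor(df$subarea)
# mod.re <- gam(cpue ~ s(sdsst) + s(fsubarea, bs = 're'), data = df, method = 'REML')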
# Different GAMs for each zone
# g5zw <- gam(cpue ~ s(sst_anom_5zw), data = cbwk, method = 'REML')
# plot(g5zw, pch = 1, main = '5zw', shade = TRUE, shade.col = "grey",
# shift = coef(g5zw)[1], seWithMean = TRUE)
# g5ze <- gam(cpue ~ s(sst_anom_5ze), data = cbwk, method = 'REML')
# plot(g5ze, pch = 1, main = '5zw', shade = TRUE, shade.col = "grey",
# shift = coef(g5ze)[1], seWithMean = TRUE)
# g6a <- gam(cpue ~ s(sst_anom_6a), data = cbwk, method = 'REML')
# plot(g6a, pch = 1, main = '5zw', shade = TRUE, shade.col = "grey",
# shift = coef(g6a)[1], seWithMean = TRUE)
# g6b <- gam(cpue ~ s(sst_anom_6b), data = cbwk, method = 'REML')
# plot(g6b, pch = 1, main = '5zw', shade = TRUE, shade.col = "grey",
# shift = coef(g6b)[1], seWithMean = TRUE)
# g6c <- gam(cpue ~ s(sst_anom_6c), data = cbwk, method = 'REML')
# plot(g6c, pch = 1, main = '5zw', shade = TRUE, shade.col = "grey",
# shift = coef(g6c)[1], seWithMean = TRUE)
# g5zw <- gam(cpue ~ s(chl_anom_5zw, k = 10) + s(sst_anom_5zw, k = 10),
# data = cbwk, method = 'REML')
# summary(g5zw)
# gam.check(g5zw, page = 1)
# concurvity(g5zw, full = TRUE)
# concurvity(g5zw, full = FALSE)
#
# plot(g5zw, pages = 1, main = '5Zw', all.terms = TRUE, shade = TRUE,
# shade.col = "lightgrey",
# shift = coef(g5zw)[1], seWithMean = TRUE)
# g5ze <- gam(cpue ~ s(chl_anom_5ze), data = cbwk, method = 'REML')
# plot(g5ze, pch = 1, main = '5ze')
# g6a <- gam(cpue ~ s(chl_anom_6a), data = cbwk, method = 'REML')
# plot(g6a, pch = 1, main = '6a')
# g6b <- gam(cpue ~ s(chl_anom_6b), data = cbwk, method = 'REML')
# plot(g6b, pch = 1, main = '6b')
# g6c <- gam(cpue ~ s(chl_anom_6c) + s(sst_anom_6c), data = cbwk, method = 'REML')
#
# plot(g6c, main = '6C', all.terms = TRUE, shade = TRUE, shade.col = "lightgrey",
# shift = coef(g6c)[1], seWithMean = TRUE)
####### a function for model comparisons #######
compare.fun <- function(modlist) {
modcompare <- data.frame(matrix(NA, nrow=length(modlist),ncol = 6))
modcompare[,1] <- NA_character_ # make column 1 character (it will hold formulas)
for (i in 1:length(modlist)) {
modcompare[i,1] <- as.character(modlist[[i]]$call$formula[3])
# modcompare[i,1] <- as.character(modlist[[i]]$formula[3])
modcompare[i,2] <- round(modlist[[i]]$df.residual, digits = 0)
modcompare[i,3] <- modlist[[i]]$deviance
modcompare[i,4] <- round(modlist[[i]]$deviance/modlist[[i]]$df.residual, digits = 2)
modcompare[i,5] <- round(((modlist[[i]]$null.deviance - modlist[[i]]$deviance) / modlist[[i]]$null.deviance)*100, digits=2)
modcompare[i,6] <- modlist[[i]]$aic
}
colnames(modcompare) <- c("Formula", "df", "ResDeviance", "Dev/df", "PercDevExplained", "AIC")
return(modcompare %>% arrange(AIC))
}
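# Example usage with a list of fitted models (model names illustrative, e.g.
# from the GAM sketch above):
# compare.fun(list(gam.yr.gamma, gam.yr.nb))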
yreffectplots <- function(mydata, mymodel, myY, mytitle) {
years <- sort(unique(mydata$YEAR))
coefs <- c(mymodel$coefficients[1], mymodel$coefficients[1] + mymodel$coefficients[2:length(years)]) # year effects on the link scale
par(mfrow=c(1,1)) # reset plot frame
ggplot() +
geom_point(data = mydata, aes(x = YEAR, y = mydata[, myY]), alpha = 0.8, position = position_jitter()) +
geom_boxplot(data = mydata, aes(x = YEAR, y = mydata[, myY]), fill = NA, outlier.shape = NA) +
geom_point(aes(x = years, y = exp(coefs)), color = "red") +
#theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
labs(x = "YEAR" , y = "Observed (black) and fitted (red) CPUE") +
ggplot() +
geom_point(data = mydata, aes(x = YEAR, y = log(mydata[, myY])), alpha = 0.8, position = position_jitter()) +
geom_boxplot(data = mydata, aes(x = YEAR, y = log(mydata[, myY])), fill = NA, outlier.shape = NA) +
geom_point(aes(x = years, y = coefs), color = "red") +
#theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 1)) +
labs(x = "YEAR", y = "Observed log(CPUE) (black) and \n model coefficients (red) ") +
plot_annotation(title = paste(mytitle, mymodel$family ,"GAM")
, subtitle = as.character(mymodel$formula[3]))
}
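A usage sketch (argument values illustrative; assumes a year-effects model with a log link, e.g. gam.yr.gamma from the sketch above):
yreffectplots(mydata = aa.mod, mymodel = gam.yr.gamma, myY = "lpue", mytitle = "Dealer/VTR")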