b1_indicator_imputation_gam_step.Rmd
The code was written by MF to apply a Generalized Additive Model (GAM) method to imputate counts. The script replaced the previous imputation step using Thomas method which created a number of outling counts.
## Script for the imputation step based on the use of generalise additive model to
## impute missing counts in the colonies time series survey.
## MF 27/04/2022 B1 indicator JNCC
## the conditon that will apply to the imputation is that at least three counts must be present.
## In case the colony reported less than 3 counts will be discarded because the imputation will be not possible.
##libraries required
library(mgcv)
library(dplyr)
library(tidyverse)
library(readr)
library(fs)
##upload the file from breeding or wintering data
lollo<-read_tsv(path("data", "breeding", "ForImputationBreeding.txt"))
##check of the subregion
unique(lollo$OSPARsubRegion)
##Selection of the subregion of interest
out <- lollo[(lollo$OSPAR == 2 & lollo$OSPARsubRegion == "d"), ] #
#initialise the dummy dataframe to use to collate the imputation: always resent when Region/Subregion change
out_imputed<-data.frame()
new_out_colony<-data.frame(matrix(0, ncol = 14, nrow = 37))
colnames(new_out_colony)<-c("Country","Colony","ColonyNumber","CommonName","Species","Year","Count","Sample","Plot","OSPAR","OSPARsubRegion","Place","Imputation","gamCount")
##check of the Sample: 2=colony, 1=plot
unique(out$Sample)
#intialise the dataseries we want to impute: please change accordingly
newYear <-data.frame(seq (1984,2020,1))
colnames(newYear)<-"Year"
## basic data cleaning from Adam
out <- out[out$Count > -1,] ## get rid of missing data
#If Plot and Whole colony count is present we choose the ehole colony count and discar the plot data. If only plot OR whole colony data are present is fine and this step must be omitted
#out <- out[out$Sample ==2 ,] ## get rid of missing data
out$Plot[out$Plot < 0] <- NA ## show missing plots
out$Place <- paste(as.character(out$Colony), ",plot", as.character(out$Plot), sep="") ## set up a place indicator
out$Place[is.na(out$Plot)] <- as.character(out$Colony[is.na(out$Plot)])
# initialisiation of the vector reporting the failing colonies (i.e., colonies with less than 3 counts)
list_failed_colonies<-"NA"
##specie selection and loop for imputation
specie_list<-unique(out$CommonName)
for (l in 1:length(specie_list)){
out_species<-out[out$CommonName==specie_list[l],]
colony_list<-unique(out_species$Colony)
length(colony_list)
for (j in 1:length(colony_list)){
out_colony<-out_species[out_species$Colony==colony_list[j],]
if (nrow(out_colony)>2){
model<-gam(Count~s(Year, k=nrow(out_colony)),data=out_colony)
pred1 <- predict.gam(model,newYear)
pred1<-ifelse(pred1<0,0,pred1)
new_out_colony<-data.frame(matrix(0, ncol = 14, nrow = 37))
colnames(new_out_colony)<-c("Country","Colony","ColonyNumber","CommonName","Species","Year","Count","Sample","Plot","OSPAR","OSPARsubRegion","Place","Imputation","gamCount")
new_out_colony$Country<-out_colony$Country[1]
new_out_colony$Colony<-out_colony$Colony[1]
new_out_colony$ColonyNumber<-out_colony$ColonyNumber[1]
new_out_colony$CommonName<-out_colony$CommonName[1]
new_out_colony$Species<-out_colony$Species[1]
new_out_colony$Year<-newYear$Year
new_out_colony$Sample<-out_colony$Sample[1]
new_out_colony$Plot<-out_colony$Plot[1]
new_out_colony$OSPAR<-out_colony$OSPAR[1]
new_out_colony$OSPARsubRegion<-out_colony$OSPARsubRegion[1]
new_out_colony$Place<-out_colony$Place[1]
new_out_colony$gamCount<-round(pred1,0)
new_out_colony$Count="NA"
f=1
for (i in 1:37){
if(f<=nrow(out_colony)){
if (newYear$Year[i]==out_colony$Year[f]) {
new_out_colony$Imputation[i]="counted"
new_out_colony$Count[i]=out_colony$Count[f]
new_out_colony$gamCount[i]=out_colony$Count[f]
f=f+1}
else {
new_out_colony$Imputation[i]="imputed"}
}
else {
new_out_colony$Imputation[i]="imputed"
}
}
out_imputed<-rbind(out_imputed,new_out_colony)
}
else {list_failed_colonies<-append(list_failed_colonies,out_colony$Colony)}
}
}
##change the filename according with the Region /Subregion imputed
write.csv(list_failed_colonies, path("data", "breeding", "list_failed_colonies_2d.csv"))
write.csv(out_imputed, path("data", "breeding", "out_imputed_2d.csv"))