diff --git a/.gitignore b/.gitignore
index 61ac5e194db331708d2d46e12ae0ea12c4d44392..b4244590c4c38346ee16b6329dfada94a303b939 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 
 *.sav
 Stats/.Rhistory
+Stats/.RData
+Stats/.Rhistory
+Stats/.Rhistory
diff --git a/Stats/.Rhistory b/Stats/.Rhistory
index 363f710925e38fb4086501d3a4a42c1e6636ad78..0557ac4b0d57c105e9c1818e857089aec69ee2fd 100644
--- a/Stats/.Rhistory
+++ b/Stats/.Rhistory
@@ -1,3 +1,61 @@
+library(multcompView) # "multcompLetters" function
+library(ggplot2)
+library(pgirmess)
+library(postHoc)
+#library(tidyquant)    # geom_ma() if rolling average needed
+################# DATASET IMPORTS #####################################
+folder <- './../'
+whistles.dta <-read.table(file=paste0(folder,
+'Whistles/Evaluation/whistles_durations.csv'),
+sep = ',', header=TRUE)
+whistles.dta <- whistles.dta[order(whistles.dta$audio_names),]
+bbp.dta <-read.table(file=paste0(folder,
+'BBPs/Results/16-06-22_14h00_number_of_BBP.csv'),
+sep = ',', header=TRUE)
+bbp.dta <- bbp.dta[order(bbp.dta$audio_names),]
+clicks.dta <-read.table(file=paste0(folder,
+'Clicks/Results/projection_updated_number_of_clicks_02052022.csv'), #number_of_clicks_02052022.csv
+sep = ',', header=TRUE)
+clicks.dta <- clicks.dta[order(clicks.dta$audio_names),]
+# Merge files into 1 dataset
+acoustic.dta <- clicks.dta
+acoustic.dta$number_of_bbp <- bbp.dta$number_of_BBP
+acoustic.dta$total_whistles_duration <- whistles.dta$total_whistles_duration
+rm(whistles.dta, bbp.dta, clicks.dta)
+# add group IDs
+id2020 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2020.csv'),
+sep = ',', header=TRUE)[1:396,]
+id2021 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2021.csv'),
+sep = ',', header=TRUE)[1:96,]
+id2021$ID <- id2021$ID+max(id2020$ID)
+id2021$Seq <- id2021$Seq+max(id2020$Seq)
+id.dta <- rbind(id2020, id2021)
+id.dta$Fichier.Audio <-str_sub(id.dta$Fichier.Audio, -27, -5)
+acoustic.dta$ID <- rep(-1, 490)
+for (name in acoustic.dta$audio_names){
+acoustic.dta$ID[match(name, acoustic.dta$audio_names)] <- id.dta$ID[match(name, id.dta$Fichier.Audio)]
+}
+acoustic.dta$ID <- as.factor(acoustic.dta$ID)
+rm(id2020, id2021, id.dta)
+# suppress "T" acoustic data (other groups not tested on our variables)
+acoustic.dta <- acoustic.dta[acoustic.dta$acoustic!="T",]
+# shuffle dataframe
+acoustic.dta <- acoustic.dta[sample(1:nrow(acoustic.dta)), ]
+acoustic.dta$acoustic <- factor(acoustic.dta$acoustic)
+#################### DATA INSPECTION  #################################
+# Data description
+names(acoustic.dta)
+# self explenatory except acoustic : correspond to the activation sequence.
+# Look for obvious correlations
+plot(acoustic.dta) # nothing that we can see
+# Look for zero-inflation
+100*sum(acoustic.dta$number_of_clicks == 0)/nrow(acoustic.dta)
+100*sum(acoustic.dta$number_of_bbp == 0)/nrow(acoustic.dta)
+100*sum(acoustic.dta$total_whistles_duration == 0)/nrow(acoustic.dta)
+# 5.8%, 53.7% & 27.1% of our data are zeros. Will have to be dealt with.
+# QUESTION: This study is aimed at understanding if dolphin's acoustic activity
+# is influenced by their behavior, the emission of a pinger or a fishing net.
+# Dependent variables (Y): number_of_clicks, number_of_bbp, total_whistles_duration.
 # Explanatory variables (X): acoustic, fishing_net, behavior, beacon, net, number.
 # What are the H0/ H1 hypotheses ?
 # H0 : No influence of any of the explanatory variables on a dependant one.
@@ -78,7 +136,6 @@ data=acoustic.dta)
 vuong(zero.whi, nb.whi)  #(if p-value<0.05 then first model in comparison is better)
 mod.whi <- zero.whi # => zeroinflated model is indeed better suited
 car::Anova(mod.whi, type=3)
-shapiro.test(residuals(mod.whi)) # H0 : normality -> not rejected if p>0.05
 dwtest(mod.whi) # H0 -> independent if p>0.05 (autocorrelation if p<0.05)
 bptest(mod.whi) # H0 -> homoscedasticity if p<0.05
 # No normality but we do not need it
@@ -92,7 +149,6 @@ data=acoustic.dta)
 car::Anova(mod.bbp, type=3)
 dwtest(mod.bbp) # H0 -> independent if p>0.05 (autocorrelation if p<0.05)
 bptest(mod.bbp) # H0 -> homoscedasticity if p<0.05
-# Normality not needed in GLM, hypotheses verified !
 mod.bbp$deviance/mod.bbp$df.residual
 # slight underdispersion, not improved with ZINB so we keep this
 ### Model for clicks
@@ -103,7 +159,6 @@ car::Anova(mod.cli, type=3)
 shapiro.test(residuals(mod.cli)) # H0 : normality -> cannot be rejected if p > 0.05
 dwtest(mod.cli) # H0 -> independent if p>0.05 (autocorrelation if p<0.05)
 bptest(mod.cli) # H0 -> homoscedasticity if p<0.05
-# Normality not needed in GLM, hypotheses verified !
 mod.cli$deviance/mod.cli$df.residual
 # slight overdispersion. (ZINB does not clearly improve results so we keep this)
 # FYI1: Comparison of combination of explanatory variables between models
@@ -443,70 +498,15 @@ data_test <- acoustic.dta[acoustic.dta$ID!="2",]
 print( posthocKW(data_test$whistling_time_per_dolphin, data_test$ID))
 print( posthocKW(data_test$BBPs_per_dolphin, data_test$ID))
 print( posthocKW(data_test$clicks_per_dolphin, data_test$ID))
-hist(acoustic.dta$ID)
-hist(acoustic.dta$number)
-Adapted from Yannick OUTREMAN
-#    Agrocampus Ouest - 2020
-#######################################################################
-library(pscl)
-library(MASS)
-library(lmtest)
-library(multcomp)
-library(emmeans)
-library(dplyr)        # "%>%" function
-library(forcats)      # "fct_relevel" function
-library(stringr)      # "gsub" function
-library(rcompanion)   # "fullPTable" function
-library(multcompView) # "multcompLetters" function
-library(ggplot2)
-library(pgirmess)
-library(postHoc)
-#library(tidyquant)    # geom_ma() if rolling average needed
-################# DATASET IMPORTS #####################################
-folder <- './../'
-whistles.dta <-read.table(file=paste0(folder,
-'Whistles/Evaluation/whistles_durations.csv'),
-sep = ',', header=TRUE)
-whistles.dta <- whistles.dta[order(whistles.dta$audio_names),]
-bbp.dta <-read.table(file=paste0(folder,
-'BBPs/Results/16-06-22_14h00_number_of_BBP.csv'),
-sep = ',', header=TRUE)
-bbp.dta <- bbp.dta[order(bbp.dta$audio_names),]
-clicks.dta <-read.table(file=paste0(folder,
-'Clicks/Results/projection_updated_number_of_clicks_02052022.csv'), #number_of_clicks_02052022.csv
-sep = ',', header=TRUE)
-clicks.dta <- clicks.dta[order(clicks.dta$audio_names),]
-# Merge files into 1 dataset
-acoustic.dta <- clicks.dta
-acoustic.dta$number_of_bbp <- bbp.dta$number_of_BBP
-acoustic.dta$total_whistles_duration <- whistles.dta$total_whistles_duration
-rm(whistles.dta, bbp.dta, clicks.dta)
-# add group IDs
-id2020 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2020.csv'),
-sep = ',', header=TRUE)[1:396,]
-id2021 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2021.csv'),
-sep = ',', header=TRUE)[1:96,]
-id2021$ID <- id2021$ID+max(id2020$ID)
-id2021$Seq <- id2021$Seq+max(id2020$Seq)
-id.dta <- rbind(id2020, id2021)
-id.dta$Fichier.Audio <-str_sub(id.dta$Fichier.Audio, -27, -5)
-acoustic.dta$ID <- rep(-1, 490)
-for (name in acoustic.dta$audio_names){
-acoustic.dta$ID[match(name, acoustic.dta$audio_names)] <- id.dta$ID[match(name, id.dta$Fichier.Audio)]
-}
-acoustic.dta$ID <- as.factor(acoustic.dta$ID)
-rm(id2020, id2021, id.dta)
-# suppress "T" acoustic data (other groups not tested on our variables)
-acoustic.dta <- acoustic.dta[acoustic.dta$acoustic!="T",]
-# shuffle dataframe
-acoustic.dta <- acoustic.dta[sample(1:nrow(acoustic.dta)), ]
-acoustic.dta$acoustic <- factor(acoustic.dta$acoustic)
-#################### DATA INSPECTION  #################################
-# Data description
-names(acoustic.dta)
-# Look for obvious correlations
-plot(acoustic.dta) # nothing that we can see
-# Look for zero-inflation
-100*sum(acoustic.dta$number_of_clicks == 0)/nrow(acoustic.dta)
-100*sum(acoustic.dta$number_of_bbp == 0)/nrow(acoustic.dta)
-100*sum(acoustic.dta$total_whistles_duration == 0)/nrow(acoustic.dta)
+mod.cli$deviance/mod.cli$df.residual
+mod.bbp$deviance/mod.bbp$df.residual
+mod.whi$deviance/mod.whi$df.residual
+mod.whi$deviance/mod.whi$df.residual
+mod.whi$deviance
+mod.whi
+mod.whi$fitted.values
+mod.whi$df.null
+mod.whi$df.residual
+mod.whi$df.null/mod.whi$df.residual
+mod.bbp$deviance/mod.bbp$df.residual
+mod.cli$deviance/mod.cli$df.residual
diff --git a/Stats/BBP-click-whistles_3models.R b/Stats/BBP-click-whistles_3models.R
index 0744f046b3664c566e051d6c79a80f7d30009270..8a05c3c6465cc47de8ecd404573a760bd692921d 100644
--- a/Stats/BBP-click-whistles_3models.R
+++ b/Stats/BBP-click-whistles_3models.R
@@ -125,7 +125,7 @@ shapiro.test(acoustic.dta$total_whistles_duration)
 shapiro.test(acoustic.dta$number_of_bbp)
 shapiro.test(acoustic.dta$number_of_clicks)
 # p-values are significant => they do not follow normal distributions
-# will need a transformation or the use of a glm model
+# will need a transformation or the use of a glim model
 
 # X Number of individuals per level
 summary(factor(acoustic.dta$acoustic))
@@ -145,11 +145,12 @@ ftable(factor(acoustic.dta$fishing_net), factor(acoustic.dta$behavior), factor(a
 
 ##################### STATISTICAL MODELLING ###########################
 ### Model tested
-# LM: Linear model (residual hypothesis: normality, homoscedasticity, independant)
-# GLM: Generalized linear model (residual hypothesis: homoscedasticity, independant)
-# NB : Negative Binomial model (usually, when overdispersion with GLM)
-# ZINB: Zero inflated negative binomial model (residual hypothesis: homoscedasticity, independant
-# using number as an offset (more dolphins => more signals)
+# GLM: General linear model (residual hypothesis: normality, homoscedasticity, independant)
+# GLIM: Generalized linear model (residual hypothesis: uncorrelated residuals)
+# NB : Negative Binomial model (residual hypothesis: independantM)
+# ZINB: Zero inflated negative binomial model (residual hypothesis: independant)
+
+# We are using number as an offset (more dolphins => more signals)
 
 # beacon and net explanatory variables could not be tested in models 
 # as they contain information already present in "fishing_net" which is more 
@@ -161,13 +162,13 @@ ftable(factor(acoustic.dta$fishing_net), factor(acoustic.dta$behavior), factor(a
 par(mfrow=c(1,1))
 
 ### Model for whistles
-# Residual hypotheses not verified for LM
-# Overdipsersion when using GLM (negative binomial)
+# Residual hypotheses not verified for GLM
+# Overdipsersion when using GLIM (negative binomial)
 # Using ZINB:
 zero.whi <- zeroinfl(total_whistles_duration ~ 
                       acoustic + fishing_net + behavior + offset(log(number)), 
                     data=acoustic.dta, dist='negbin')
-nb.whi <- glm.nb(total_whistles_duration ~ 
+nb.whi <- glim.nb(total_whistles_duration ~ 
                      acoustic + fishing_net + behavior + offset(log(number)), 
                    data=acoustic.dta)
 # comparison ZINB VS NB model
@@ -176,14 +177,14 @@ mod.whi <- zero.whi # => zeroinflated model is indeed better suited
 car::Anova(mod.whi, type=3)
 dwtest(mod.whi) # H0 -> independent if p>0.05 (autocorrelation if p<0.05)
 bptest(mod.whi) # H0 -> homoscedasticity if p<0.05
-# No normality but we do not need it
-
+mod.whi$df.null/mod.whi$df.residual 
+# no dispersion, perfect
 
 ### Model for BBP
-# No normality of residuals for LM
-# overdispersion with GLM quasipoisson
-#try with glm NB:
-mod.bbp <- glm.nb(number_of_bbp ~ acoustic + fishing_net + behavior 
+# No normality of residuals for GLM
+# overdispersion with GLIM quasipoisson
+#try with glim NB:
+mod.bbp <- glim.nb(number_of_bbp ~ acoustic + fishing_net + behavior 
                   + offset(log(number)),
                   data=acoustic.dta)
 car::Anova(mod.bbp, type=3)
@@ -195,7 +196,7 @@ mod.bbp$deviance/mod.bbp$df.residual
 
 ### Model for clicks
 # Using NB model:
-mod.cli <- glm.nb(number_of_clicks ~ acoustic + fishing_net + acoustic:fishing_net + offset(log(number)), 
+mod.cli <- gilm.nb(number_of_clicks ~ acoustic + fishing_net + acoustic:fishing_net + offset(log(number)), 
                data=acoustic.dta)
 car::Anova(mod.cli, type=3)
 shapiro.test(residuals(mod.cli)) # H0 : normality -> cannot be rejected if p > 0.05