diff --git a/.gitignore b/.gitignore index 61ac5e194db331708d2d46e12ae0ea12c4d44392..b4244590c4c38346ee16b6329dfada94a303b939 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ *.sav Stats/.Rhistory +Stats/.RData +Stats/.Rhistory +Stats/.Rhistory diff --git a/Stats/.Rhistory b/Stats/.Rhistory index 363f710925e38fb4086501d3a4a42c1e6636ad78..0557ac4b0d57c105e9c1818e857089aec69ee2fd 100644 --- a/Stats/.Rhistory +++ b/Stats/.Rhistory @@ -1,3 +1,61 @@ +library(multcompView) # "multcompLetters" function +library(ggplot2) +library(pgirmess) +library(postHoc) +#library(tidyquant) # geom_ma() if rolling average needed +################# DATASET IMPORTS ##################################### +folder <- './../' +whistles.dta <-read.table(file=paste0(folder, +'Whistles/Evaluation/whistles_durations.csv'), +sep = ',', header=TRUE) +whistles.dta <- whistles.dta[order(whistles.dta$audio_names),] +bbp.dta <-read.table(file=paste0(folder, +'BBPs/Results/16-06-22_14h00_number_of_BBP.csv'), +sep = ',', header=TRUE) +bbp.dta <- bbp.dta[order(bbp.dta$audio_names),] +clicks.dta <-read.table(file=paste0(folder, +'Clicks/Results/projection_updated_number_of_clicks_02052022.csv'), #number_of_clicks_02052022.csv +sep = ',', header=TRUE) +clicks.dta <- clicks.dta[order(clicks.dta$audio_names),] +# Merge files into 1 dataset +acoustic.dta <- clicks.dta +acoustic.dta$number_of_bbp <- bbp.dta$number_of_BBP +acoustic.dta$total_whistles_duration <- whistles.dta$total_whistles_duration +rm(whistles.dta, bbp.dta, clicks.dta) +# add group IDs +id2020 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2020.csv'), +sep = ',', header=TRUE)[1:396,] +id2021 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2021.csv'), +sep = ',', header=TRUE)[1:96,] +id2021$ID <- id2021$ID+max(id2020$ID) +id2021$Seq <- id2021$Seq+max(id2020$Seq) +id.dta <- rbind(id2020, id2021) +id.dta$Fichier.Audio <-str_sub(id.dta$Fichier.Audio, -27, -5) +acoustic.dta$ID <- rep(-1, 490) +for (name in acoustic.dta$audio_names){ +acoustic.dta$ID[match(name, acoustic.dta$audio_names)] <- id.dta$ID[match(name, id.dta$Fichier.Audio)] +} +acoustic.dta$ID <- as.factor(acoustic.dta$ID) +rm(id2020, id2021, id.dta) +# suppress "T" acoustic data (other groups not tested on our variables) +acoustic.dta <- acoustic.dta[acoustic.dta$acoustic!="T",] +# shuffle dataframe +acoustic.dta <- acoustic.dta[sample(1:nrow(acoustic.dta)), ] +acoustic.dta$acoustic <- factor(acoustic.dta$acoustic) +#################### DATA INSPECTION ################################# +# Data description +names(acoustic.dta) +# self explenatory except acoustic : correspond to the activation sequence. +# Look for obvious correlations +plot(acoustic.dta) # nothing that we can see +# Look for zero-inflation +100*sum(acoustic.dta$number_of_clicks == 0)/nrow(acoustic.dta) +100*sum(acoustic.dta$number_of_bbp == 0)/nrow(acoustic.dta) +100*sum(acoustic.dta$total_whistles_duration == 0)/nrow(acoustic.dta) +# 5.8%, 53.7% & 27.1% of our data are zeros. Will have to be dealt with. +# QUESTION: This study is aimed at understanding if dolphin's acoustic activity +# is influenced by their behavior, the emission of a pinger or a fishing net. +# Dependent variables (Y): number_of_clicks, number_of_bbp, total_whistles_duration. # Explanatory variables (X): acoustic, fishing_net, behavior, beacon, net, number. # What are the H0/ H1 hypotheses ? # H0 : No influence of any of the explanatory variables on a dependant one. @@ -78,7 +136,6 @@ data=acoustic.dta) vuong(zero.whi, nb.whi) #(if p-value<0.05 then first model in comparison is better) mod.whi <- zero.whi # => zeroinflated model is indeed better suited car::Anova(mod.whi, type=3) -shapiro.test(residuals(mod.whi)) # H0 : normality -> not rejected if p>0.05 dwtest(mod.whi) # H0 -> independent if p>0.05 (autocorrelation if p<0.05) bptest(mod.whi) # H0 -> homoscedasticity if p<0.05 # No normality but we do not need it @@ -92,7 +149,6 @@ data=acoustic.dta) car::Anova(mod.bbp, type=3) dwtest(mod.bbp) # H0 -> independent if p>0.05 (autocorrelation if p<0.05) bptest(mod.bbp) # H0 -> homoscedasticity if p<0.05 -# Normality not needed in GLM, hypotheses verified ! mod.bbp$deviance/mod.bbp$df.residual # slight underdispersion, not improved with ZINB so we keep this ### Model for clicks @@ -103,7 +159,6 @@ car::Anova(mod.cli, type=3) shapiro.test(residuals(mod.cli)) # H0 : normality -> cannot be rejected if p > 0.05 dwtest(mod.cli) # H0 -> independent if p>0.05 (autocorrelation if p<0.05) bptest(mod.cli) # H0 -> homoscedasticity if p<0.05 -# Normality not needed in GLM, hypotheses verified ! mod.cli$deviance/mod.cli$df.residual # slight overdispersion. (ZINB does not clearly improve results so we keep this) # FYI1: Comparison of combination of explanatory variables between models @@ -443,70 +498,15 @@ data_test <- acoustic.dta[acoustic.dta$ID!="2",] print( posthocKW(data_test$whistling_time_per_dolphin, data_test$ID)) print( posthocKW(data_test$BBPs_per_dolphin, data_test$ID)) print( posthocKW(data_test$clicks_per_dolphin, data_test$ID)) -hist(acoustic.dta$ID) -hist(acoustic.dta$number) -Adapted from Yannick OUTREMAN -# Agrocampus Ouest - 2020 -####################################################################### -library(pscl) -library(MASS) -library(lmtest) -library(multcomp) -library(emmeans) -library(dplyr) # "%>%" function -library(forcats) # "fct_relevel" function -library(stringr) # "gsub" function -library(rcompanion) # "fullPTable" function -library(multcompView) # "multcompLetters" function -library(ggplot2) -library(pgirmess) -library(postHoc) -#library(tidyquant) # geom_ma() if rolling average needed -################# DATASET IMPORTS ##################################### -folder <- './../' -whistles.dta <-read.table(file=paste0(folder, -'Whistles/Evaluation/whistles_durations.csv'), -sep = ',', header=TRUE) -whistles.dta <- whistles.dta[order(whistles.dta$audio_names),] -bbp.dta <-read.table(file=paste0(folder, -'BBPs/Results/16-06-22_14h00_number_of_BBP.csv'), -sep = ',', header=TRUE) -bbp.dta <- bbp.dta[order(bbp.dta$audio_names),] -clicks.dta <-read.table(file=paste0(folder, -'Clicks/Results/projection_updated_number_of_clicks_02052022.csv'), #number_of_clicks_02052022.csv -sep = ',', header=TRUE) -clicks.dta <- clicks.dta[order(clicks.dta$audio_names),] -# Merge files into 1 dataset -acoustic.dta <- clicks.dta -acoustic.dta$number_of_bbp <- bbp.dta$number_of_BBP -acoustic.dta$total_whistles_duration <- whistles.dta$total_whistles_duration -rm(whistles.dta, bbp.dta, clicks.dta) -# add group IDs -id2020 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2020.csv'), -sep = ',', header=TRUE)[1:396,] -id2021 <- read.table(file=paste0(folder, 'CSV_data/Audio_Data_2021.csv'), -sep = ',', header=TRUE)[1:96,] -id2021$ID <- id2021$ID+max(id2020$ID) -id2021$Seq <- id2021$Seq+max(id2020$Seq) -id.dta <- rbind(id2020, id2021) -id.dta$Fichier.Audio <-str_sub(id.dta$Fichier.Audio, -27, -5) -acoustic.dta$ID <- rep(-1, 490) -for (name in acoustic.dta$audio_names){ -acoustic.dta$ID[match(name, acoustic.dta$audio_names)] <- id.dta$ID[match(name, id.dta$Fichier.Audio)] -} -acoustic.dta$ID <- as.factor(acoustic.dta$ID) -rm(id2020, id2021, id.dta) -# suppress "T" acoustic data (other groups not tested on our variables) -acoustic.dta <- acoustic.dta[acoustic.dta$acoustic!="T",] -# shuffle dataframe -acoustic.dta <- acoustic.dta[sample(1:nrow(acoustic.dta)), ] -acoustic.dta$acoustic <- factor(acoustic.dta$acoustic) -#################### DATA INSPECTION ################################# -# Data description -names(acoustic.dta) -# Look for obvious correlations -plot(acoustic.dta) # nothing that we can see -# Look for zero-inflation -100*sum(acoustic.dta$number_of_clicks == 0)/nrow(acoustic.dta) -100*sum(acoustic.dta$number_of_bbp == 0)/nrow(acoustic.dta) -100*sum(acoustic.dta$total_whistles_duration == 0)/nrow(acoustic.dta) +mod.cli$deviance/mod.cli$df.residual +mod.bbp$deviance/mod.bbp$df.residual +mod.whi$deviance/mod.whi$df.residual +mod.whi$deviance/mod.whi$df.residual +mod.whi$deviance +mod.whi +mod.whi$fitted.values +mod.whi$df.null +mod.whi$df.residual +mod.whi$df.null/mod.whi$df.residual +mod.bbp$deviance/mod.bbp$df.residual +mod.cli$deviance/mod.cli$df.residual diff --git a/Stats/BBP-click-whistles_3models.R b/Stats/BBP-click-whistles_3models.R index 0744f046b3664c566e051d6c79a80f7d30009270..8a05c3c6465cc47de8ecd404573a760bd692921d 100644 --- a/Stats/BBP-click-whistles_3models.R +++ b/Stats/BBP-click-whistles_3models.R @@ -125,7 +125,7 @@ shapiro.test(acoustic.dta$total_whistles_duration) shapiro.test(acoustic.dta$number_of_bbp) shapiro.test(acoustic.dta$number_of_clicks) # p-values are significant => they do not follow normal distributions -# will need a transformation or the use of a glm model +# will need a transformation or the use of a glim model # X Number of individuals per level summary(factor(acoustic.dta$acoustic)) @@ -145,11 +145,12 @@ ftable(factor(acoustic.dta$fishing_net), factor(acoustic.dta$behavior), factor(a ##################### STATISTICAL MODELLING ########################### ### Model tested -# LM: Linear model (residual hypothesis: normality, homoscedasticity, independant) -# GLM: Generalized linear model (residual hypothesis: homoscedasticity, independant) -# NB : Negative Binomial model (usually, when overdispersion with GLM) -# ZINB: Zero inflated negative binomial model (residual hypothesis: homoscedasticity, independant -# using number as an offset (more dolphins => more signals) +# GLM: General linear model (residual hypothesis: normality, homoscedasticity, independant) +# GLIM: Generalized linear model (residual hypothesis: uncorrelated residuals) +# NB : Negative Binomial model (residual hypothesis: independantM) +# ZINB: Zero inflated negative binomial model (residual hypothesis: independant) + +# We are using number as an offset (more dolphins => more signals) # beacon and net explanatory variables could not be tested in models # as they contain information already present in "fishing_net" which is more @@ -161,13 +162,13 @@ ftable(factor(acoustic.dta$fishing_net), factor(acoustic.dta$behavior), factor(a par(mfrow=c(1,1)) ### Model for whistles -# Residual hypotheses not verified for LM -# Overdipsersion when using GLM (negative binomial) +# Residual hypotheses not verified for GLM +# Overdipsersion when using GLIM (negative binomial) # Using ZINB: zero.whi <- zeroinfl(total_whistles_duration ~ acoustic + fishing_net + behavior + offset(log(number)), data=acoustic.dta, dist='negbin') -nb.whi <- glm.nb(total_whistles_duration ~ +nb.whi <- glim.nb(total_whistles_duration ~ acoustic + fishing_net + behavior + offset(log(number)), data=acoustic.dta) # comparison ZINB VS NB model @@ -176,14 +177,14 @@ mod.whi <- zero.whi # => zeroinflated model is indeed better suited car::Anova(mod.whi, type=3) dwtest(mod.whi) # H0 -> independent if p>0.05 (autocorrelation if p<0.05) bptest(mod.whi) # H0 -> homoscedasticity if p<0.05 -# No normality but we do not need it - +mod.whi$df.null/mod.whi$df.residual +# no dispersion, perfect ### Model for BBP -# No normality of residuals for LM -# overdispersion with GLM quasipoisson -#try with glm NB: -mod.bbp <- glm.nb(number_of_bbp ~ acoustic + fishing_net + behavior +# No normality of residuals for GLM +# overdispersion with GLIM quasipoisson +#try with glim NB: +mod.bbp <- glim.nb(number_of_bbp ~ acoustic + fishing_net + behavior + offset(log(number)), data=acoustic.dta) car::Anova(mod.bbp, type=3) @@ -195,7 +196,7 @@ mod.bbp$deviance/mod.bbp$df.residual ### Model for clicks # Using NB model: -mod.cli <- glm.nb(number_of_clicks ~ acoustic + fishing_net + acoustic:fishing_net + offset(log(number)), +mod.cli <- gilm.nb(number_of_clicks ~ acoustic + fishing_net + acoustic:fishing_net + offset(log(number)), data=acoustic.dta) car::Anova(mod.cli, type=3) shapiro.test(residuals(mod.cli)) # H0 : normality -> cannot be rejected if p > 0.05