Checking reproducibility of FishMap results

This notebook gathers the analyses performed to compare the output of the original FishMap main.R script between the Client and ThinkR. We compare the results obtained with identical seeds after executing the script dev/run_main_and_save_output.R.

Here, we use high resolution parameters :

  • k = 0.75
  • month_start <- 10
  • month_end <- 12

Executing main.R

We generate the outputs on the ThinkR machine.

> Make sure your .Renviron variables FISHMAP_UPDATE_OUTPUTS and FISHMAP_OUTPUT_DIR are correctly set.

# First run: execute the wrapper script dev/run_main_and_save_output.R, whose
# path is built with here::here().
# (Model files are compiled during this run.)
source(here::here("dev", "run_main_and_save_output.R"))

Executing this code results in four outputs to be compared.

# Output folder, as configured by the FISHMAP_OUTPUT_DIR environment variable
output_dir <- Sys.getenv("FISHMAP_OUTPUT_DIR")
# Names of the files currently stored in that folder
list.files(output_dir)
## character(0)

To check whether the seed actually makes the output reproducible, we run main.R a second time with the same seed.

# Change the output dir to avoid overwriting the files of the previous run.
# The same folder is hard-coded again further down as thinkr1_output_dir
# (file.path("~", "shared", "outputs_fishmap_highres_rerun")): keep both in sync.
# NOTE(review): presumably the sourced script reads FISHMAP_OUTPUT_DIR - confirm.
Sys.setenv(FISHMAP_OUTPUT_DIR = "~/shared/outputs_fishmap_highres_rerun")

# Run main.R a second time through the same wrapper script
# (model files are already compiled from the first run)
source(here::here("dev", "run_main_and_save_output.R"))
## Error in file(filename, "r", encoding = encoding): cannot open the connection

Note: model fit is performed in 186 steps in this run.

We list the resulting output files.

# Folder of the second ThinkR run (the "rerun" folder).
# Mind the naming: the thinkr1_* objects refer to the second run.
thinkr1_output_dir <- file.path("~", "shared", "outputs_fishmap_highres_rerun")

# Full paths of the files found there, one per line
thinkr1_output <- paste(
  list.files(thinkr1_output_dir, full.names = TRUE),
  collapse = "\n"
)

glue::glue("The paths to ThinkR's second run output files are :\n {thinkr1_output}")
## The paths to ThinkR's second run output files are :
# Folder of the first ThinkR run.
# Mind the naming: the thinkr2_* objects refer to the first run.
thinkr2_output_dir <- file.path("~", "shared", "outputs_fishmap_highres")

# Full paths of the files found there, one per line
thinkr2_output <- paste(
  list.files(thinkr2_output_dir, full.names = TRUE),
  collapse = "\n"
)

glue::glue("The paths to ThinkR's first run output files are :\n {thinkr2_output}")
## The paths to ThinkR's first run output files are :
## /home/rstudio/shared/outputs_fishmap_highres/converge_output.rds
## /home/rstudio/shared/outputs_fishmap_highres/obj_input.rds
## /home/rstudio/shared/outputs_fishmap_highres/opt_output.rds
## /home/rstudio/shared/outputs_fishmap_highres/part1_output0.25.rds
## /home/rstudio/shared/outputs_fishmap_highres/part2_output0.25.rds
## /home/rstudio/shared/outputs_fishmap_highres/part3_output.rds
## /home/rstudio/shared/outputs_fishmap_highres/report_output.rds

Checking seed reproducibility

In order to contrast output files from both ThinkR runs, we will use the package waldo.

# List of output files to contrast (file names found in the second run folder)
files_to_contrast <- list.files(path = thinkr1_output_dir)

# Fail early when there is nothing to compare: mapping over an empty vector
# returns an empty list, which would otherwise read as "no difference found".
if (length(files_to_contrast) == 0) {
  stop(
    "No output file found in '", thinkr1_output_dir,
    "': there is nothing to compare between the two ThinkR runs.",
    call. = FALSE
  )
}

# A file that only exists in the second run folder cannot be compared either
files_missing_in_first_run <- setdiff(
  files_to_contrast,
  list.files(path = thinkr2_output_dir)
)
if (length(files_missing_in_first_run) > 0) {
  stop(
    "File(s) missing from '", thinkr2_output_dir, "': ",
    paste(files_missing_in_first_run, collapse = ", "),
    call. = FALSE
  )
}

# Run waldo on each file, comparing the two ThinkR runs; the result is a list
# named after the compared files
purrr::map(
  .x = files_to_contrast,
  ~ waldo::compare(
    x = readRDS(file.path(thinkr1_output_dir, .x)),
    y = readRDS(file.path(thinkr2_output_dir, .x))
  )
) %>% setNames(files_to_contrast)
## named list()

Both ThinkR outputs are identical. We will now use one of them for the comparison with the Client’s output.

Loading Client’s output

We now load the outputs generated by the Clients (BA and JC) into a temporary folder.

# Temporary folder that will receive the Client outputs
tmp_folder <- tempfile(pattern = "fishmap_highres")
dir.create(tmp_folder)

# Fetch the archive of JC high resolution outputs attached to the Git repo
jc_zip_file_url <- "https://github.com/balglave/FishMap/files/10970908/outputs_fishmap_highres.zip"
download.file(jc_zip_file_url, destfile = file.path(tmp_folder, "jc_output.zip"))

# Extract the archive into <tmp_folder>/jc_output
unzip(file.path(tmp_folder, "jc_output.zip"), exdir = file.path(tmp_folder, "jc_output"))

# The .rds files are read from the outputs_fishmap_highres sub-folder
jc_output_dir <- file.path(tmp_folder, "jc_output", "outputs_fishmap_highres")

# Full paths of the extracted files, one per line
jc_output <- paste(
  list.files(jc_output_dir, full.names = TRUE),
  collapse = "\n"
)

glue::glue("The paths to Juliette's output files are :\n {jc_output}")
## The paths to Juliette's output files are :
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/jc_output/outputs_fishmap_highres/converge_output.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/jc_output/outputs_fishmap_highres/obj_input.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/jc_output/outputs_fishmap_highres/opt_output.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/jc_output/outputs_fishmap_highres/report_output.rds
# Fetch the archive of BA high resolution outputs attached to the Git repo
ba_zip_file_url <- "https://github.com/balglave/FishMap/files/10982408/outputs_fishmap_highres.zip"
download.file(ba_zip_file_url, destfile = file.path(tmp_folder, "ba_output.zip"))

# Extract the archive into <tmp_folder>/ba_output
unzip(file.path(tmp_folder, "ba_output.zip"), exdir = file.path(tmp_folder, "ba_output"))

# The .rds files are read from the outputs_fishmap_highres sub-folder
ba_output_dir <- file.path(tmp_folder, "ba_output", "outputs_fishmap_highres")

# Full paths of the extracted files, one per line
ba_output <- paste(
  list.files(ba_output_dir, full.names = TRUE),
  collapse = "\n"
)

glue::glue("The paths to Baptiste's output files are :\n {ba_output}")
## The paths to Baptiste's output files are :
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/ba_output/outputs_fishmap_highres/converge_output.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/ba_output/outputs_fishmap_highres/obj_input.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/ba_output/outputs_fishmap_highres/opt_output.rds
## /tmp/RtmpMWf0bp/fishmap_highres5ded6e7c2666/ba_output/outputs_fishmap_highres/report_output.rds

Contrasting files

The results obtained by ThinkR and by the Clients are not perfectly identical.

Important note: to compare numerical results, we set the tolerance on numerical differences to 10e-4 (i.e. 1e-3).

We find differences in the code of the functions stored in the outputs (see the section on obj_input.rds). This might indicate a difference in package versions; cf. the sessionInfo() provided by the Clients.

Create contrast function

We again use waldo, wrapped in a function, to display the exact differences for each file.

# Contrast one output file between a Client run and the ThinkR run.
#
# The Client file is read from jc_output_dir or ba_output_dir (selected by
# `author`) and the ThinkR file from thinkr1_output_dir; both objects are then
# compared with waldo::compare().
#
# Arguments:
#   file_name  name of the .rds file to compare, e.g. "opt_output.rds"
#   author     either "juliette" or "baptiste"
# Value:
#   the result of waldo::compare(), with the Client as x and ThinkR as y
# Errors:
#   stops when `author` is not one of the two accepted values, or when one of
#   the two files to compare does not exist

compare_output_file <- function(file_name, author) {
  # `author` must be a single, non-missing string before it is dispatched on
  if (!is.character(author) || length(author) != 1L || is.na(author)) {
    stop("author must be either juliette or baptiste")
  }

  # Pick the Client folder; the unnamed last argument is the fallback branch
  client_output_dir <- switch(
    author,
    juliette = jc_output_dir,
    baptiste = ba_output_dir,
    stop("author must be either juliette or baptiste")
  )

  client_file <- file.path(client_output_dir, file_name)
  thinkr_file <- file.path(thinkr1_output_dir, file_name)

  # running waldo on one file (thinkR ~ client)
  message(glue::glue("contrasting output of {file_name} between thinkr and {author}"))

  # Report missing files explicitly instead of letting readRDS() fail with a
  # generic "cannot open the connection" error
  files_to_read <- c(client_file, thinkr_file)
  missing_files <- files_to_read[!file.exists(files_to_read)]
  if (length(missing_files) > 0) {
    stop(
      "cannot compare ", file_name, ", missing file(s): ",
      paste(missing_files, collapse = ", "),
      call. = FALSE
    )
  }

  waldo::compare(
    x = readRDS(client_file),
    y = readRDS(thinkr_file),
    x_arg = author,
    y_arg = "thinkr",
    max_diffs = 100,
    # 10e-4 is 1e-3
    tolerance = 10e-4
  )
}

Contrasting converge_output.rds output

compare_output_file(file_name = "converge_output.rds", author = "baptiste")
## contrasting output of converge_output.rds between thinkr and baptiste
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/converge_output.rds', probable reason 'No
## such file or directory'
## Error in gzfile(file, "rb"): cannot open the connection
compare_output_file(file_name = "converge_output.rds", author = "juliette")
## contrasting output of converge_output.rds between thinkr and juliette
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/converge_output.rds', probable reason 'No
## such file or directory'
## Error in gzfile(file, "rb"): cannot open the connection

Contrasting opt_output.rds output

compare_output_file(file_name = "opt_output.rds", author = "baptiste")
## contrasting output of opt_output.rds between thinkr and baptiste
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/opt_output.rds', probable reason 'No such
## file or directory'
## Error in gzfile(file, "rb"): cannot open the connection
compare_output_file(file_name = "opt_output.rds", author = "juliette")
## contrasting output of opt_output.rds between thinkr and juliette
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/opt_output.rds', probable reason 'No such
## file or directory'
## Error in gzfile(file, "rb"): cannot open the connection

Contrasting report_output.rds output

compare_output_file(file_name = "report_output.rds", author = "baptiste")
## contrasting output of report_output.rds between thinkr and baptiste
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/report_output.rds', probable reason 'No
## such file or directory'
## Error in gzfile(file, "rb"): cannot open the connection
compare_output_file(file_name = "report_output.rds", author = "juliette")
## contrasting output of report_output.rds between thinkr and juliette
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/report_output.rds', probable reason 'No
## such file or directory'
## Error in gzfile(file, "rb"): cannot open the connection

Contrasting obj_input.rds output

compare_output_file(file_name = "obj_input.rds", author = "juliette")
## contrasting output of obj_input.rds between thinkr and juliette
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/obj_input.rds', probable reason 'No such
## file or directory'
## Error in gzfile(file, "rb"): cannot open the connection

We encounter an error when contrasting the fn and env elements of the obj_input.rds object between Baptiste’s version and ThinkR’s version. We therefore run waldo element by element, skipping the objects of class environment and using a specific parameter (ignore_function_env = TRUE) that ignores function environments and allows the comparison to succeed. This error might come from a difference in package versions (cf. the comment at the beginning of this section).

# Settings for the manual comparison of obj_input.rds with Baptiste's output:
# file to read, label used for the Client side, and matching Client folder.
file_name <- "obj_input.rds"
author <- "baptiste"
client_output_dir <- ba_output_dir

# Announce which file and which author are being contrasted
message(glue::glue("contrasting elemts of {file_name} element by element between thinkr and {author}"))
## contrasting elemts of obj_input.rds element by element between thinkr and baptiste
# Load the Client version and the ThinkR version of the same file
client_obj <- readRDS(file.path(client_output_dir, file_name))

thinkr_obj <- readRDS(file.path(thinkr1_output_dir, file_name))
## Warning in gzfile(file, "rb"): cannot open compressed file '/home/rstudio/shared/outputs_fishmap_highres_rerun/obj_input.rds', probable reason 'No such
## file or directory'
## Error in gzfile(file, "rb"): cannot open the connection
# Flag the elements that are environments, then keep the names of the others
list_is_env <- purrr::map_lgl(
  client_obj,
  function(element) inherits(element, "environment")
)
list_names_not_env <- names(client_obj)[!list_is_env]

# Compare the remaining elements one by one, ignoring function environments.
# NOTE(review): tolerance is 10e-6 here while compare_output_file() uses
# 10e-4 - confirm that the stricter value is intended.
purrr::map(
  .x = list_names_not_env,
  .f = function(element_name) {
    waldo::compare(
      x = client_obj[[element_name]],
      y = thinkr_obj[[element_name]],
      x_arg = author,
      y_arg = "thinkr",
      tolerance = 10e-6,
      max_diffs = 100,
      ignore_function_env = TRUE
    )
  }
) %>% setNames(list_names_not_env)
## Error in is_missing(y): object 'thinkr_obj' not found
# Delete the temporary folder and everything downloaded into it.
# unlink() only removes a directory when recursive = TRUE; without it the
# folder and its content are left in place.
unlink(tmp_folder, recursive = TRUE)
# Knit the source Rmd stored in dev/archive and write the compiled Rmd into
# vignettes/; both paths are built with here::here().
knitr::knit(
  input = here::here("dev/archive/dev_check_model_reproducibility_highres.Rmd"),
  output = here::here("vignettes/dev-check-model-reproducibility-highres.Rmd")
)