Differential expression analysis


Project: GSE107401


The aim of this document is to present the data and differential expression analysis between samples from the project GSE107401.

The main steps are:

For a quick access to the figures, go to:
- Normalisation > Visualisation for normalised counts
- Results subsections for the pairwise comparisons

All figures are saved in PNG and PDF format and accessible in the ./project_GSE107401/deseq2_figures/ folder. You can download the code behind this analysis through the “Code” button in the top right corner of the document.

1 Settings

The aim of this section is to:

  • load the tools of interest
  • define custom functions
  • load the input parameters

1.1 Environment

We load the tools to handle data and figures.

library(dplyr)
library(ggplot2)

1.2 Custom functions

We further define custom functions to avoid repetitions in the code.

# -----------------------------------------------------------------------------
# buildCountMatrix
# Create a matrix of reads count
#
# Input:
#   files : a vector of files names
#   sampleLabel : a vector of sample names
#   projectPath: path to the project directory
#
# Output:
#   countMatrix : a reads count matrix
#
# Original author : Vivien DESHAIES
# -----------------------------------------------------------------------------
buildCountMatrix = function(files, sampleLabel, expHeader) {
  
  # read first file and create countMatrix
  countMatrix = read.table(files[1],
                           header = expHeader,
                           stringsAsFactors = F,
                           quote = "")
  colnames(countMatrix) = c("id", "count1")
  
  # read and merge all remaining files with the first
  for (i in 2:length(files)) {
    
    # read files
    exp = read.table(files[i],
                     header = expHeader,
                     stringsAsFactors = F,
                     quote = "")
    
    # lowercase exp columns names
    colnames(exp) = c("id", paste0("count", i))
    
    # merge file data to count matrix by id
    countMatrix = merge(countMatrix, exp, by = "id", suffixes = "_")
  }
  
  # name rows
  rownames(countMatrix) = countMatrix[,1]
  
  # delete first row containing row names
  countMatrix = countMatrix[,-1]
  
  # name columns
  colnames(countMatrix) = sampleLabel
  return(countMatrix)
}

###############################################################################
# -----------------------------------------------------------------------------
# saveRawCountMatrix
#
#   A specific treatment is needed for the raw count matrix because the object
#   DESeq2::counts(dds) does not include the names of the samples as column names
#
#   input: dds -> DESeq object
#          fileName -> character (name of the file to output)
#   output: rawCountMatrix -> file
#
# -----------------------------------------------------------------------------

saveRawCountMatrix = function(dds,fileName) {
  
  countMatrix = DESeq2::counts(dds)
  countMatrix = cbind(countMatrix, row.names(countMatrix))
  
  # Add the samples names as column names + add column Id
  colData = SummarizedExperiment::colData(dds)
  colnames(countMatrix) = c(colData$Name, "Id")
  
  # Put Ids on the first column
  countMatrix = countMatrix[, c("Id", colData$Name)]
  
  # Write the count matrix in a file
  write.table(countMatrix,
              file = paste0(fileName),
              sep = "\t",
              row.names = FALSE,
              quote = FALSE)
}

###############################################################################
# -----------------------------------------------------------------------------
# saveTable
#
#   input: dataframe -> data frame (count matrix)
#          fileName -> character (name of the file to output)
#   output: countMatrix -> file
# -----------------------------------------------------------------------------

saveTable = function(dataframe, fileName) {
  # Add column Id
  column_names = colnames(dataframe)
  dataframe = cbind(row.names(dataframe), dataframe)
  colnames(dataframe) = c("Id", column_names)
  
  # Write the table in a file
  write.table(dataframe,
              file = paste0(fileName),
              sep = "\t",
              row.names = FALSE,
              quote = FALSE)
}

# -----------------------------------------------------------------------------
# plotDendrogram
# Create a ggplot object corresponding to a dendrogram
#
# Input:
#   hc_data : object obtained with the ggdendro::dendro_data function
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
#
# Original author : Audrey ONFROY
# -----------------------------------------------------------------------------
plotDendrogram = function(hc_data,
                          fig_title = "",
                          col_vector = colors_condition) {
  
  p = ggplot2::ggplot() +
    # Draw dendrogram
    ggplot2::geom_segment(data = ggdendro::segment(hcdata), 
                          aes(x = x, y = y, xend = xend, yend = yend)) +
    # Leaf label
    ggplot2::geom_label(data = ggdendro::label(hcdata), 
                        aes(x = x, y = y, label = label),
                        linewidth = 0, fill = NA,
                        size = 3, angle = 90, hjust = 1,
                        nudge_y = -0.02*max(ggdendro::segment(hcdata)$yend)) +
    # Leaf colors
    ggplot2::geom_point(data = ggdendro::label(hcdata),
                        ggplot2::aes(x = x, y = y, col = Condition),
                        size = 3, pch = 19) +
    ggplot2::scale_color_manual(values = col_vector,
                                breaks = names(col_vector)) +
    # Style
    ggplot2::labs(y = "Distance",
                  title = fig_title) +
    ggplot2::coord_cartesian(clip = "off") +
    ggplot2::theme_minimal() +
    ggplot2::theme(axis.text.x = element_blank(),
                   axis.ticks.x = element_blank(),
                   axis.title.x = element_blank(),
                   axis.line.y = element_line(),
                   axis.ticks.y = element_line(),
                   panel.grid = element_blank(),
                   plot.title = element_text(hjust = 0.5),
                   plot.margin = ggplot2::unit(c(0.5, 0.5, max(nchar(hcdata$labels$label))/6, 0.5), "cm"),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotPCA
# Create a ggplot object corresponding to the PCA plot of individuals
#
# Input:
#   pca_df : dataframe
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotPCA = function(pca_df,
                   fig_title = "",
                   col_vector = colors_condition) {
  
  p = ggplot2::ggplot(pca_df, aes(x = Dim.1, y = Dim.2, col = Condition, label = sample)) +
    ggplot2::geom_hline(yintercept = 0, lty = 2) +
    ggplot2::geom_vline(xintercept = 0, lty = 2) +
    # Samples
    ggplot2::geom_point(size = 3, pch = 19) +
    ggrepel::geom_label_repel(fill = NA, label.size = 0) +
    ggplot2::scale_color_manual(values = col_vector,
                                breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = paste0("Dim1 (", round(pcaCount$eig[1, 2], 2), "%)"),
                  y = paste0("Dim2 (", round(pcaCount$eig[2, 2], 2), "%)"),
                  title = fig_title) +
    ggplot2::theme_minimal() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotBarplot
# Create a ggplot object corresponding to a barplot
#
# Input:
#   df : dataframe
#   y_column : column in df for the Y-axis
#   y_title : Y-axis title
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotBarplot = function(df,
                       y_column,
                       y_title = "",
                       fig_title = "",
                       col_vector = colors_condition) {
  df$y_value = df[, y_column]
  
  p = ggplot2::ggplot(df, aes(x = sample, y = y_value, fill = Condition)) +
    ggplot2::geom_bar(stat = "identity", col = "black") +
    ggplot2::scale_fill_manual(values = col_vector,
                               breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = "",
                  y = y_title,
                  title = fig_title) +
    ggplot2::scale_y_continuous(expand = c(0,0)) +
    ggplot2::theme_classic() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   axis.text.x = element_text(angle = 90, hjust = 1),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotBoxplot
# Create a ggplot object corresponding to a boxplot
#
# Input:
#   df : dataframe
#   y_column : column in df for the Y-axis
#   y_title : Y-axis title
#   fig_title : figure title
#   colors_condition: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotBoxplot = function(df,
                       y_column,
                       y_title = "",
                       fig_title = "",
                       col_vector = colors_condition) {
  df$y_value = df[, y_column]
  
  p = ggplot2::ggplot(df, aes(x = sample, y = y_value, fill = Condition)) +
    ggplot2::geom_boxplot(col = "black") +
    ggplot2::scale_fill_manual(values = col_vector,
                               breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = "",
                  y = y_title,
                  title = fig_title) +
    ggplot2::scale_y_continuous(expand = c(0,0)) +
    ggplot2::theme_classic() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   axis.text.x = element_text(angle = 90, hjust = 1),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# expressionColors
# Define a color panel for the heatmap
#
# Input:
#   range_values : a numerical vector with two values (min and max)
#
# Output:
#   expression_colors : a color or a function built with circlize::colorRamp2
# -----------------------------------------------------------------------------
expressionColors = function(range_values) {
  min_value = range_values[1]
  max_value = range_values[2]
  
  # min_value < 0 < max_value
  if (min_value < 0 & 0 < max_value) {
    expression_colors = circlize::colorRamp2(
      breaks = c(min_value, 0, max_value),
      colors = c("#224897", "#F7F7F7", "#D32A24"))
  } else
    
    # min_value < max_value < 0
    if (min_value < max_value & max_value < 0) {
      expression_colors = circlize::colorRamp2(
        breaks = c(min_value, 0),
        colors = c("#224897", "#F7F7F7"))
    } else
      
      # 0 < min_value < max_value
      if (0 < min_value & min_value < max_value) {
        expression_colors = circlize::colorRamp2(
          breaks = c(0, max_value),
          colors = c("#F7F7F7", "#D32A24"))
      } else {
        
        # min_value == max_value
        expression_colors = "#F7F7F7"
      }
  
  return(expression_colors)
}

# -----------------------------------------------------------------------------
# buildContrast
# Define the contrast vector to extract DE results from the dds object
#
# Input:
#   dds       : DESeqDataSet
#   group1    : list of character vectors, for instance:
#                           list(c("sexe", "F"), c("genotype", "HETERO")
#   group2    : list of character vectors, for instance:
#                           list(c("sexe", "F"), c("genotype", "HOMO")
#   weighted  : boolean     whether to weight the contrast matrix by the
#                           number of replicates per condition or not
#
# Output: a list of 3 elements
#   contrast        : a numerical vector
#   samples_group1  : a character vector containing sample names in group 1
#   samples_group2  : a character vector containing sample names in group 2
#
# Source:
# Modified from: https://www.r-bloggers.com/2024/05/a-guide-to-designs-and-contrasts-in-deseq2/
# -----------------------------------------------------------------------------
buildContrast = function(dds,
                         group1,
                         group2,
                         weighted = FALSE){
  #-- Model matrix for each sample 
  mod_mat = model.matrix(DESeq2::design(dds),
                         SummarizedExperiment::colData(dds))
  
  #-- Samples in group1
  # A list of TRUE / FALSE
  # Each element of the list is of length the number of samples
  # The i-th element of the list corresponds to the i-th condition in group1
  grp1_rows = lapply(c(1:length(group1)), FUN = function(group1_i) {
    group1_i_name = group1[[group1_i]][1]
    group1_i_values = group1[[group1_i]][2:length(group1_i)]
    
    samples_in_group1_i = (SummarizedExperiment::colData(dds)[[group1_i_name]] %in% group1_i_values) # TRUE / FALSE
    
    return(samples_in_group1_i)
  })
  grp1_rows = Reduce(function(x, y) x & y, grp1_rows)
  
  grp2_rows = lapply(c(1:length(group2)), FUN = function(group2_i) {
    group2_i_name = group2[[group2_i]][1]
    group2_i_values = group2[[group2_i]][2:length(group2_i)]
    
    samples_in_group2_i = (SummarizedExperiment::colData(dds)[[group2_i_name]] %in% group2_i_values) # TRUE / FALSE
    
    return(samples_in_group2_i)
  })
  grp2_rows = Reduce(function(x, y) x & y, grp2_rows)
  
  #-- Model matrix with only samples of interest
  mod_mat1 = mod_mat[grp1_rows, , drop = FALSE]
  mod_mat2 = mod_mat[grp2_rows, , drop = FALSE]
  
  #-- Consider the weight or not
  if(!weighted){
    mod_mat1 = mod_mat1[!duplicated(mod_mat1), , drop = FALSE]
    mod_mat2 = mod_mat2[!duplicated(mod_mat2), , drop = FALSE]
  }
  
  #-- Output
  contrast = colMeans(mod_mat1)-colMeans(mod_mat2)
  samples_group1 = colnames(dds)[grp1_rows]
  samples_group2 = colnames(dds)[grp2_rows]
  output = list(contrast = contrast,
                samples_group1 = samples_group1,
                samples_group2 = samples_group2)
  
  return(output)
}

These functions are listed below.

  • buildContrast
  • buildCountMatrix
  • current_version
  • expressionColors
  • plotBarplot
  • plotBoxplot
  • plotDendrogram
  • plotPCA
  • saveRawCountMatrix
  • saveTable
  • version_tested

1.3 Parameters

The table below summarizes all the input parameters.

projectName     = params$projectName
designPath      = params$designPath
comparisonPath  = params$comparisonPath
diffanaTest     = as.logical(toupper(params$diffanaTest))
expHeader       = as.logical(toupper(params$expHeader))
deseqModel      = params$deseqModel
sizeFactorType  = params$sizeFactorType
fitType         = params$fitType
statisticTest   = params$statisticTest
weightContrast  = params$weightContrast
prefix          = params$prefix
plotInteractive = params$plotInteractive
leaveOnError    = params$leaveOnError

# Display them as a table
c("Project name (projectName)" = projectName,
  "Path to the design file (designPath)" = designPath,
  "Path to the comparison file (comparisonPath)" = comparisonPath,
  "Whether to perform the differential expression analysis or not (diffanaTest)" = diffanaTest,
  "Whether the design table has a header or not (expHeader)" = expHeader,
  "DESeq2 model (deseqModel)" = deseqModel,
  "Size factor type (sizeFactorType)" = sizeFactorType,
  "Fit type (fitType)" = fitType,
  "Statistic test (statisticTest)" = statisticTest,
  "Whether to weight the contrast vector by the number of samples or not (weightContrast)" = weightContrast,
  "Prefix to save files (prefix)" = prefix,
  "Whether to make interactive plots or not (plotInteractive)" = plotInteractive,
  "Whether to stop the rendering in case of error (leaveOnError)" = leaveOnError) %>%
  data.frame(Parameter = names(.),
             Value     = .,
             row.names = NULL) %>%
  knitr::kable("html", escape = FALSE) %>%
  kableExtra::kable_styling("striped", full_width = FALSE) %>%
  kableExtra::column_spec(1, bold = TRUE)
Parameter Value
Project name (projectName) GSE107401
Path to the design file (designPath) ./project_GSE107401/deseq2_GSE107401-deseq2Design.txt
Path to the comparison file (comparisonPath) ./project_GSE107401/deseq2_GSE107401-comparisonFile.txt
Whether to perform the differential expression analysis or not (diffanaTest) TRUE
Whether the design table has a header or not (expHeader) TRUE
DESeq2 model (deseqModel) ~Condition
Size factor type (sizeFactorType) ratio
Fit type (fitType) parametric
Statistic test (statisticTest) Wald
Whether to weight the contrast vector by the number of samples or not (weightContrast) TRUE
Prefix to save files (prefix) ./project_GSE107401/deseq2_
Whether to make interactive plots or not (plotInteractive) TRUE
Whether to stop the rendering in case of error (leaveOnError) FALSE

2 Data preparation

The purpose of this section is to:

  • load the design file, containing metadata about each sample ;
  • load the individual sample count data, assemble them as a gene-by-sample count matrix, and save it as a TSV file ; and,
  • build a dataset in a format compatible with the differential expression analysis (DESeq2-related format).

2.1 Load metadata

We load the design file and display the 6 first rows.

design = read.table(designPath,
                    sep = "\t",
                    header = TRUE,
                    dec = ".",
                    stringsAsFactors = FALSE)

# About a warning in DESeq2
design$Condition = stringr::str_replace_all(
  design$Condition,
  pattern = "-",
  replacement = "_")

# Add Reference if does not exist
if (!("Reference" %in% colnames(design))) {
  design$Reference = Inf
  complexMode = TRUE
} else {
  complexMode = FALSE
}

# Reorder samples based on Condition
design = design %>%
  dplyr::arrange(Condition, RepTechGroup, Name) %>%
  dplyr::mutate(Condition = factor(Condition, levels = unique(Condition))) %>%
  dplyr::mutate(RepTechGroup = factor(RepTechGroup, levels = unique(RepTechGroup))) %>%
  dplyr::mutate(Name = factor(Name, levels = unique(Name)))

# Remove columns out of interest
design = design[, !(colnames(design) %in% c("SampleId", "Description", "Date", "FastqFormat"))]

# Display
head(design) %>%
  knitr::kable("html") 
Name expressionFile RepTechGroup Condition Reference
GSM2866306 ./project_GSE107401/GSM2866306_expression_output_expression_20130267.tsv KO1 KO 1
GSM2866307 ./project_GSE107401/GSM2866307_expression_output_expression_20130268.tsv KO2 KO 1
GSM2866308 ./project_GSE107401/GSM2866308_expression_output_expression_20130269.tsv KO3 KO 1
GSM2866303 ./project_GSE107401/GSM2866303_expression_output_expression_20130270.tsv WT1 WT 0
GSM2866304 ./project_GSE107401/GSM2866304_expression_output_expression_20130271.tsv WT2 WT 0
GSM2866305 ./project_GSE107401/GSM2866305_expression_output_expression_20130272.tsv WT3 WT 0

Description of the columns:

  • Name: Unique identifier of each technical replicate.
  • expressionFile: File path to the count expression data. The TSV file contains two columns: one for the gene identifiers and one for the number of reads mapping to their genomic location. For compatibility with Excel, please use the XLSX file instead of the TSV file (read more). Furthermore, the XLSX file contains additional columns, such as a description of each gene.
  • RepTechGroup: Samples with the same value in the RepTechGroup column are technical replicates. This occurs when the sequencing library is split over sequencing lanes. Each lane generates a technical replicate from the same biological library.
  • Condition: Samples with the same value in the Condition column are biological replicates. They are produced by the experimenter.
  • Reference: Numerical values used to define the pairwise comparisons between conditions in the differential expression analysis.

We associate a color to each condition.

colors_condition = unique(design$Condition) %>%
  setNames(nm = .,
           grDevices::hcl.colors(length(.), "Spectral"))

data.frame(Color = unname(colors_condition),
           Condition = names(colors_condition)) %>%
  knitr::kable("html",
               col.names = c("Color (hex code)", "Condition")) %>%
  kableExtra::column_spec(1, color = colors_condition) %>%
  kableExtra::kable_styling(full_width = FALSE)
Color (hex code) Condition
#A71B4B KO
#584B9F WT

2.2 Build count matrix

From the individual count data, we build a gene-by-sample count matrix. The count value for a given gene corresponds to the number (integer value) of sequencing reads that map to its genomic region. We display the top left corner of this matrix, meaning 5 genes and 5 samples. The complete matrix is saved as a TSV file (quick access).

count_mat = buildCountMatrix(files = design$expressionFile,
                             sampleLabel = design$Name,
                             expHeader = expHeader)

# Reorder columns
count_mat = count_mat[, levels(design$Name)]

# Display
count_mat[c(1:5), c(1:5)] %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
GSM2866306 GSM2866307 GSM2866308 GSM2866303 GSM2866304
ENSMUSG00000000001 10177 7880 10372 7353 8767
ENSMUSG00000000003 0 0 1 0 0
ENSMUSG00000000028 933 915 1184 204 317
ENSMUSG00000000031 66488 44221 88514 72959 70285
ENSMUSG00000000037 89 65 120 119 85

The count matrix contains 46500 genes, 6 samples and 41.94% of null values.

2.3 Create DESeqDataSet object

We create the DESeqDataSet object that includes the count matrix and the design file.

dds = DESeq2::DESeqDataSetFromMatrix(
  countData = count_mat,
  colData = design,
  design = as.formula(deseqModel))

2.4 Save count matrix

We save the count matrix as a TSV file.

filename = paste0(prefix, projectName, "-normalisation_rawCountMatrix.tsv")

saveRawCountMatrix(dds,
                   file = filename)

Output file name:
./project_GSE107401/deseq2_GSE107401-normalisation_rawCountMatrix.tsv

3 Quality control

The aim of this section is to better apprehend the variability between the technical replicates. For information on the correspondence between the identifiers of technical replicates and sample names, please refer to the design file.

3.1 Raw counts dendrogram

We compute euclidean distances between each pair of samples, considering the raw count values.

dist.mat = dist(t(count_mat))
hc = stats::hclust(dist.mat)
hcdata = ggdendro::dendro_data(hc, type = "rectangle")

# Add grouping for colors
hcdata$labels = dplyr::left_join(x = hcdata$labels,
                                 y = design[, c("Name", "Condition")],
                                 by = c("label" = "Name"))

We visualise these distances through a dendrogram. It informs whether technical replicates or closely related conditions are closer together than with other samples.

plotDendrogram(hc_data,
               fig_title = paste0("Dendrogram on the raw count matrix - ",
                                  projectName))

3.2 Raw counts PCA

The 6 samples are represented in a space of 46500 dimensions, corresponding to genes. We perform a Principal Component Analysis (PCA) to visualise samples on a two-dimension space such that it represents the highest variability between samples.

pcaCount = count_mat %>%
  t() %>%
  FactoMineR::PCA(., graph = FALSE)

We visualise the samples on the two-dimensional projection. The coordinates of each sample depend on those of the other samples. This projection captures 69.93% of the total variability between samples. Therefore, it lacks 30.07% of the initial information. In this representation, two points (samples) are close together if they have similar global transcriptomic profiles. We expect the technical and biological replicates to be closer to each other than to the other samples and conditions.

pcaCount$ind$coord %>%
  as.data.frame() %>%
  dplyr::mutate(sample = rownames(.)) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("Name", "Condition")]),
                   by = c("sample" = "Name")) %>%
  plotPCA(.,
          fig_title =  paste0("PCA of the raw count matrix - ", projectName))

3.3 Null counts barplot

This figure displays the proportion of null counts per sample, meaning, the genes that are not being expressed or not captured by the protocol, within the biological samples. We expect samples from the same condition to be similar.

colMeans(count_mat == 0) %>%
  data.frame(sample = factor(names(.), levels = levels(design$Name)),
             prop_null_counts = .) %>%
  dplyr::mutate(prop_null_counts = 100*prop_null_counts) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBarplot(.,
              y_column = "prop_null_counts",
              y_title = "Proportion of null counts (%)",
              fig_title = paste0("Proportion of null counts per sample - ",
                                 projectName))

3.4 Raw counts barplot

This figure displays the total number of reads per sample, also known as library size. We can assess if some samples have a low library size compared to others, and whether technical replicates share a similar number of reads or not.

colSums(DESeq2::counts(dds)) %>%
  data.frame(sample = factor(names(.), levels = levels(design$Name)),
             read_counts = .) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBarplot(.,
              y_column = "read_counts",
              y_title = "Total read counts",
              fig_title = paste0("Number of reads per sample - ", projectName))

3.5 Raw counts boxplot

This figure displays, as a boxplot, the number of raw counts for each gene, per sample. It informs whether the genes show a similar count distribution between samples and conditions.

log2(DESeq2::counts(dds)+1) %>%
  reshape2::melt() %>%
  `colnames<-`(c("gene_ID", "sample", "log2_counts")) %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$Name))) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBoxplot(.,
              y_column = "log2_counts",
              y_title = "log_2 (counts+1)",
              fig_title = paste0("Raw counts distribution per sample - ", projectName))


In this context, there is no technical replicate.

4 Normalisation

The aim of this section is to normalise the count matrix, before performing the differential expression analysis. The normalisation method considers the sequencing depth and RNA composition of each sample to compute normalised count values.

For more details, please visit:

https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html

4.1 Data handling

We normalise the count matrix. We display the top left corner of this matrix, meaning 5 genes and 5 samples. Unlike count data, normalised count values are not necessarily integers.

dds = DESeq2::estimateSizeFactors(dds,
                                  type = sizeFactorType)

DESeq2::counts(dds, normalized = TRUE)[c(1:5), c(1:5)] %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
KO1 KO2 KO3 WT1 WT2
ENSMUSG00000000001 8545.49706 8663.24632 8.367172e+03 8697.1743 7718.40931
ENSMUSG00000000003 0.00000 0.00000 8.067077e-01 0.0000 0.00000
ENSMUSG00000000028 783.42820 1005.94802 9.551419e+02 241.2925 279.08472
ENSMUSG00000000031 55829.12536 48616.42328 7.140492e+04 86296.3607 61878.45308
ENSMUSG00000000037 74.73216 71.46079 9.680492e+01 140.7539 74.83344

We save the normalised count matrix as a TSV file.

filename = paste0(prefix, projectName,
                  "-normalisation_normalisedCountMatrix.tsv")

saveTable(DESeq2::counts(dds, normalized = TRUE),
          fileName = filename)

Output file name:
./project_GSE107401/deseq2_GSE107401-normalisation_normalisedCountMatrix.tsv

4.2 Visualisation

4.2.1 Normalised counts dendrogram

We compute euclidean distances between each pair of samples, considering the normalised count values.

ddsStabilized = dds %>%
  DESeq2::varianceStabilizingTransformation() %>%
  SummarizedExperiment::assay()
dist.mat = dist(t(ddsStabilized))
hc = stats::hclust(dist.mat)
hcdata = ggdendro::dendro_data(hc, type = "rectangle")

# Add grouping for colors
hcdata$labels = dplyr::left_join(
  x = hcdata$labels,
  y = unique(design[, c("RepTechGroup", "Condition")]),
  by = c("label" = "RepTechGroup"))

We visualise these distances through a dendrogram. It informs whether the samples share a similar transcriptomic profile per condition or not.

plotDendrogram(
  hc_data,
  fig_title = paste0("Dendrogram on the normalised count matrix - ", projectName))

4.2.2 Normalised counts PCA

We perform a PCA for all samples using the normalised count matrix.

pcaCount = dds %>%
  DESeq2::counts(., normalized = TRUE) %>%
  t() %>%
  FactoMineR::PCA(., graph = FALSE)

We visualise the samples on the two-dimensional projection. The coordinates of each sample depend on those of the other samples. This projection captures 65.42% of the total variability between samples. Therefore, it lacks 34.58% of the initial information. In this representation, two points (samples) are close together if they have similar global transcriptomic profiles. We expect samples from the same biological condition to be closer to each other than to samples from other conditions.

pcaCount$ind$coord %>%
  as.data.frame() %>%
  dplyr::mutate(sample = rownames(.)) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  plotPCA(.,
          fig_title =  paste0("PCA of the normalised count matrix - ", projectName))

4.2.3 Normalised counts boxplot

This figure displays, as a boxplot, the number of normalised counts for each gene, per sample.

log2(DESeq2::counts(dds, normalized = TRUE)+1) %>%
  reshape2::melt() %>%
  `colnames<-`(c("gene_ID", "sample", "log2_counts")) %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$RepTechGroup))) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  # Plot
  plotBoxplot(
    .,
    y_column = "log2_counts",
    y_title = "log_2 (counts+1)",
    fig_title = paste0("Normalised counts distribution per sample - ", projectName))

4.2.4 Most expressed genes plot

We identified the most expressed gene in each sample.

# preparation of 2 data frame with the same number of column than
# the dds count matrix
maxCounts = DESeq2::counts(dds)[1,]
transcriptNames = DESeq2::counts(dds)[1,]

# for each sample (column)
for (i in 1:ncol(DESeq2::counts(dds))) {
  
  # selection of the maximum number of count
  maxCounts[i] = (max(DESeq2::counts(dds, normalized = TRUE)[,i])/
                    sum(DESeq2::counts(dds, normalized = TRUE)[,i]))*100
  
  # selection of the name of the genes this the maximum of count
  transcriptNames[i] = row.names(
    subset(DESeq2::counts(dds, normalized = TRUE),
           DESeq2::counts(dds, normalized = TRUE)[,i] == 
             max(DESeq2::counts(dds, normalized = TRUE)[,i])))
}

We represent the information as a barplot.

dplyr::left_join(x = data.frame(sample = names(maxCounts),
                                max_counts = maxCounts),
                 y = data.frame(sample = names(transcriptNames),
                                gene_id = transcriptNames),
                 by = "sample") %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$RepTechGroup))) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  # Plot
  ggplot2::ggplot(., aes(x = sample, y = max_counts,
                         fill = Condition, label = gene_id)) +
  ggplot2::geom_bar(stat = "identity", col = "black") +
  ggplot2::geom_label(mapping = aes(x = sample, y = 0),
                      linewidth = 0, fill = NA, hjust = 0) +
  ggplot2::scale_fill_manual(values = colors_condition,
                             breaks = names(colors_condition)) +
  # Style
  ggplot2::labs(y = "Proportion of reads (%)",
                title = paste0("Most expressed genes - ", projectName)) +
  ggplot2::scale_y_continuous(expand = c(0,0)) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 axis.title.y = element_blank(),
                 axis.text.x = element_text(angle = 90, hjust = 1),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

5 Differential analysis

The aim of this section is to perform the differential expression analysis.

The code behind this section can be downloaded at:

5.1 Dispersion estimates

After the normalisation, the first step of the differential expression analysis is the estimation of gene-wise dispersion. The dispersion estimates reflect the variability in the expression of a given gene across samples.

dds = DESeq2::estimateDispersions(dds,
                                  fitType = fitType)

5.2 Dispersion plot

We visualise the dispersion estimates of each gene (black dots) as a function of the mean of normalised counts across samples. The red curve corresponds to the expected dispersion value for genes, based on their mean normalised counts. Then, a shrinkage of the gene-wise dispersion estimates consists in computed a final dispersion value for each gene (blue dots). This is important to reduce the false positives in the differential expression analysis.

DESeq2::plotDispEsts(
  dds,
  main = paste0("Dispersion estimation scatter plot\n", projectName))

For more details, please visit:

https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html

5.3 Generalized linear model

The last step of the differential expression analysis pipeline is to fit a model to each gene to compute their log2 fold change. A Wald statistical test is then used to evaluate if the values are distinct from 0, meaning if the genes are differentially expressed or not.

dds = DESeq2::DESeq(dds,
                    test = statisticTest,
                    fitType = fitType,
                    betaPrior = FALSE)

6 Results

The aim of this section is to extract the results for each paired comparison and build relevant figures.

List of comparisons

The table below is a summary of the conditions of interest.

Condition Reference
WT 0
KO 1

The values in the Reference column are used to define the pairwise comparisons:

  • Reference < 0: the samples were considered by the normalisation step but not by the differential expression analysis
  • Reference = 0: the condition associated with these samples is considered as a reference in the differential expression analysis
  • Reference > 0: the condition associated with these samples is compared to each condition having a lower reference value

The table below summarised the paired comparisons (condition1 vs condition2) of interest.

condition1 condition2
Comparison 1 KO WT

The code behind each section below can be downloaded at:

6.1 KO vs_WT

Details:

  • number of samples in the KO condition: 3
  • number of samples in the WT condition: 3

6.1.1 Results table

We extract the dataframe containing the differential analysis results between condition KO and condition WT. We display the 6 first rows of this table. The complete table is saved as a TSV file (quick access).

res = DESeq2::results(dds,
                      contrast = comparison_contrast) %>%
  as.data.frame() %>%
  dplyr::mutate(ID = rownames(.)) %>%
  dplyr::relocate(ID)

head(res[, -1]) %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
baseMean log2FoldChange lfcSE stat pvalue padj
ENSMUSG00000000001 8.545201e+03 -0.0065238 0.1155344 -0.0564664 0.9549703 0.9707938
ENSMUSG00000000003 1.344513e-01 0.6813419 4.0804729 0.1669762 0.8673888 NA
ENSMUSG00000000028 5.881430e+02 1.8048132 0.1745103 10.3421607 0.0000000 0.0000000
ENSMUSG00000000031 6.747721e+04 -0.3810503 0.1980244 -1.9242593 0.0543221 0.1064735
ENSMUSG00000000037 9.885491e+01 -0.5189149 0.3569505 -1.4537448 0.1460171 0.2411026
ENSMUSG00000000049 1.637230e+02 -2.6679215 0.3835931 -6.9550819 0.0000000 0.0000000

Description of the columns:

  • The rownames correspond to the gene identifiers.
  • baseMean: Average normalised expression of the gene across all samples, after correcting for sequencing depth (size factors). It informs about how much the gene is expressed overall.
  • log2FoldChange: log2 of the ratio of the average expression of samples from the condition KO to samples from the condition WT. Positive values correspond to a higher expression in the condition KO, and conversely. A value of 1 means that the gene is twice more expressed in the condition KO compared to the condition WT.
  • lfcSE: Standard error of the log2 fold change. It reflects the uncertainty in the estimated fold change.
  • stat: Test statistic, corresponding to the ratio between log2FoldChange and lfcSE.
  • pvalue: Raw p-value assessing if the log2 fold change is different from 0 or not, meaning if the gene is differentially expressed or not.
  • padj: Adjusted p-value, corrected for multiple testing using the Benjamini-Hochberg method. It controls the false discovery rate and must be used to decide significance.

6.1.2 p-value distribution

Among 46500 genes, there are 31405 (68%) genes with an available p-value and 15095 (32%) with a non-available (NA) p-value. Below is the summary for the available values and a figure depicting the distribution of the raw p-values.

Value
Min. 0.0000000
1st Qu. 0.0011700
Median 0.1772734
Mean 0.3184053
3rd Qu. 0.6320702
Max. 0.9999217
ggplot2::ggplot(res, aes(x = pvalue)) +
  ggplot2::geom_histogram(col = "black", fill = "#A20F58", bins = 50) +
  # Style
  ggplot2::labs(x = "Adjusted p-value",
                y = "Number of genes",
                title = "Raw p-value distribution",
                subtitle = comparison_subtitle) +
  ggplot2::scale_y_continuous(expand = c(0,0)) +
  ggplot2::theme_classic() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

6.1.3 Adjusted p-value distribution

Among 46500 genes, there are 24771 (53%) genes with an available adjusted p-value and 21729 (47%) with a non-available (NA) adjusted p-value. Below is the summary for the available values and a figure depicting the distribution of the adjusted p-values.

Value
Min. 0.0000000
1st Qu. 0.0003975
Median 0.0948678
Mean 0.2706592
3rd Qu. 0.5215269
Max. 0.9999217
ggplot2::ggplot(res, aes(x = padj)) +
  ggplot2::geom_histogram(col = "black", fill = "#77B241", bins = 50) +
  # Style
  ggplot2::labs(x = "Adjusted p-value",
                y = "Number of genes",
                title = "Adjusted p-value distribution",
                subtitle = comparison_subtitle) +
  ggplot2::scale_y_continuous(expand = c(0,0)) +
  ggplot2::theme_classic() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

6.1.4 MA plot

The MA plot represents the log2 fold change of each gene (dot) as a function of their mean normalised expression value across all samples. Dots are colored based on the significance of the statistical test, using a significant threshold of 0.05 for the adjusted p-value (padj).

ggplot2::ggplot(res, aes(x = baseMean, y = log2FoldChange,
                         col = padj < 0.05)) +
  ggplot2::geom_point(size = 1) +
  ggplot2::geom_hline(yintercept = 0, lty = 1, col = "lightgray") +
  ggplot2::scale_color_manual(name = "padj < 0.05",
                              breaks = c(FALSE, TRUE, NA),
                              values = c("black", "#D32A24", "lightgray"),
                              na.value = "lightgray") +
  # Style
  ggplot2::scale_x_log10() +
  ggplot2::labs(x = "Mean of normalised counts",
                y = "Log_2 fold change",
                title = "MA plot",
                subtitle = comparison_subtitle) +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

For more details, please visit:

https://hbctraining.github.io/DGE_workshop/lessons/05_DGE_DESeq2_analysis2.html

6.1.5 Number of differentially expressed genes

We compute the number of significantly differentially expressed genes for varying significant adjusted p-value thresholds.

signif_limit_vec = c(1e-6, 1e-5, 1e-4, 1e-3, 0.01, 0.05)
nb_DE_genes = lapply(signif_limit_vec, FUN = function(signif_limit) {
  nb_genes = res %>%
    # Differentially expressed genes
    dplyr::filter(abs(log2FoldChange) > 1) %>%
    # Significantly, based on the threshold
    dplyr::filter(padj < signif_limit) %>%
    # Number of genes
    nrow()
  
  return(nb_genes)
}) %>% unlist() %>%
  data.frame(signif_limit = signif_limit_vec,
             nb_genes = .)

The barplot indicates the number of significantly differentially expressed genes according to the significant adjusted p-value threshold. Only genes with a log2 fold change greater than 1 or smaller than -1 are considered.

Considering an adjusted p-value threshold of 1e-6, 2822 genes are significantly differentially expressed. When considering an adjusted p-value threshold of 0.05, 5525 genes are significantly differentially expressed, meaning, 2703 additional genes.

p = ggplot2::ggplot(nb_DE_genes, aes(x = signif_limit, y = nb_genes)) +
  ggplot2::geom_bar(stat = "identity", col = "black", fill = "#A3ADB8") +
  ggplot2::geom_label(aes(x = signif_limit,
                          y = nb_genes + 0.03*nb_DE_genes[6, 2],
                          label = nb_genes),
                      linewidth = 0, size = 5, fill = NA) +
  # Style
  ggplot2::scale_x_log10(breaks = signif_limit_vec,
                         labels = signif_limit_vec) +
  ggplot2::labs(x = "Adjusted p-value threshold",
                y = "Number of differentially expressed genes",
                title = "Impact of the adjusted p-value threshold on the number of differentially expressed genes",
                subtitle = comparison_subtitle) +
  ggplot2::theme_classic() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

if (nb_DE_genes[6, 2] != 0) {
  p = p +
    ggplot2::scale_y_continuous(expand = c(0,0),
                                limits = c(0, 1.1*nb_DE_genes[6, 2]))
}

p

6.1.6 Volcano plot

The volcano plot represents the adjusted p-value as a function of the log2 fold change between condition KO and condition WT. We label a maximum of 10 genes in both directions. We identify such genes.

padj_threshold = 0.05
log2FC_threshold = 2

genes_up = res %>%
  dplyr::filter(padj < padj_threshold) %>%
  dplyr::filter(log2FoldChange > 0) %>%
  dplyr::top_n(n = 10, wt = log2FoldChange) %>%
  dplyr::pull(ID)

genes_dn = res %>%
  dplyr::filter(padj < padj_threshold) %>%
  dplyr::filter(log2FoldChange < 0) %>%
  dplyr::top_n(n = 10, wt = -log2FoldChange) %>%
  dplyr::pull(ID)

We label 10 significantly over-regulated and 10 significantly down-regulated genes, considering an adjusted p-value of 0.05.

non_zero_min = min(res$padj[res$padj != 0], na.rm = TRUE)

p = ggplot2::ggplot(res %>% dplyr::filter(!is.na(padj)),
                    aes(x = log2FoldChange,
                        y = ifelse(padj == 0, # -log10(0) = Inf
                                   yes = -log10(non_zero_min),
                                   no = -log10(padj)),
                        label = ID,
                        text = paste0(ID, "<br>",
                                      "log_2 (fold change): ", round(log2FoldChange, 2), "<br>",
                                      "adjusted p-value: ", round(padj, 4)))) +
  # Lines
  ggplot2::geom_hline(yintercept = 0, col = "lightgray") +
  ggplot2::geom_hline(yintercept = -log10(padj_threshold), lty = 2, col = "#A3ADB8") +
  ggplot2::geom_vline(xintercept = log2FC_threshold, lty = 2, col = "#A3ADB8") +
  ggplot2::geom_vline(xintercept = -log2FC_threshold, lty = 2, col = "#A3ADB8") +
  # Points
  ggplot2::geom_point(data = res %>%
                        dplyr::filter(padj >= padj_threshold),
                      mapping = aes(col = "Others")) +
  ggplot2::geom_point(data = res %>%
                        dplyr::filter(log2FoldChange > -log2FC_threshold &
                                        log2FoldChange < log2FC_threshold),
                      mapping = aes(col = "Others")) +
  ggplot2::geom_point(data = res %>%
                        dplyr::filter(padj < padj_threshold) %>%
                        dplyr::filter(log2FoldChange >= log2FC_threshold),
                      mapping = aes(col = "Significantly up-regulated")) +
  ggplot2::geom_point(data = res %>%
                        dplyr::filter(padj < padj_threshold) %>%
                        dplyr::filter(log2FoldChange <= -log2FC_threshold),
                      mapping = aes(col = "Significantly down-regulated")) +
  ggplot2::scale_color_manual(name = "genes",
                              values = c("#D32A24", "#224897", "#A3ADB8"),
                              breaks = c("Significantly up-regulated",
                                         "Significantly down-regulated",
                                         "Others")) +
  # Labels
  ggplot2::labs(x = "log_2 (fold change)",
                y = "-log10 (adjusted p-value)",
                title = "Volcano plot",
                subtitle = comparison_subtitle) +
  # Style
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 plot.subtitle = element_text(hjust = 0.5),
                 legend.position = "bottom",
                 legend.direction = "horizontal",
                 legend.text = element_text(margin = margin(l = 0.5)),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())

Static

p + ggrepel::geom_label_repel(data = res %>% dplyr::filter(ID %in% c(genes_up, genes_dn)),
                              fill = NA, label.size = 0,
                              size = 3, max.overlaps = 50)

Interactive

p = p + ggplot2::theme(legend.position = "none")

plotly::ggplotly(p, tooltip = "text") %>%
  plotly::layout(paper_bgcolor = "rgba(0,0,0,0)",
                 plot_bgcolor  = "rgba(0,0,0,0)")

6.1.7 Heatmap

We draw a heatmap for the 20 gene(s) presented above. We compute Z-scores from the normalised count matrix and visualise the top left corner of the resulting table, meaning the 20 gene(s) and their Z-scores in two samples. The Z-scores are only used for visualisation. The Z-score matrix is not saved.

samples_of_interest = c(comparison_samples_group1,
                        comparison_samples_group2)

# if-else in case on 0 differential expression gene
nb_genes = length(c(genes_up, genes_dn))
if (nb_genes < 1) {
  zscore_mat = matrix(data = 0,
                      nrow = 1,
                      ncol = length(samples_of_interest),
                      dimnames = list(c(""),
                                      samples_of_interest))
} else {
  zscore_mat = DESeq2::counts(dds, normalized = TRUE)[c(genes_up, genes_dn), samples_of_interest]
  
  # Special case if there is only one gene: vector instead of matrix
  if (nb_genes == 1) {
    zscore_mat = matrix(zscore_mat,
                        nrow = nb_genes,
                        dimnames = list(row_names = c(genes_up, genes_dn),
                                        col_names = samples_of_interest))
  }
  
  # Z-score
  zscore_mat = zscore_mat %>%
    t() %>% scale() %>% t()
}

# The code below is compatible with 1 gene
# Otherwise, zscore_mat[, c(1:2)] would have sufficed and been much simpler
matrix(zscore_mat[, c(1:2)],
       nrow = max(1, nb_genes),
       dimnames = list(row_names = c(genes_up, genes_dn),
                       col_names = samples_of_interest[c(1:2)])) %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
KO1 KO2
ENSMUSG00000005220 0.1308313 1.6997768
ENSMUSG00000005994 1.0693969 1.2403233
ENSMUSG00000015437 -0.4848211 1.9206515
ENSMUSG00000042514 -0.5728235 0.3946229
ENSMUSG00000043168 -0.6032827 0.6301209
ENSMUSG00000043549 1.2225999 -0.0009235
ENSMUSG00000046160 0.4559807 1.6655504
ENSMUSG00000067860 0.0741459 1.6942920
ENSMUSG00000075217 0.0315710 1.9744592
ENSMUSG00000093400 0.0829010 1.7700690
ENSMUSG00000001672 -0.9057268 -0.9057268
ENSMUSG00000026535 -0.8900501 -0.8931796
ENSMUSG00000026824 -0.5341863 -0.5341863
ENSMUSG00000028148 -0.8708964 -0.8627911
ENSMUSG00000032517 -0.7686147 -0.7701786
ENSMUSG00000046317 -0.9120461 -0.9120461
ENSMUSG00000048377 -0.9095362 -0.9095362
ENSMUSG00000050917 -0.9002409 -0.9012911
ENSMUSG00000059213 -0.9022808 -0.9011421
ENSMUSG00000100444 -0.7429518 -0.7429518

On the heatmap, each gene is one row and each sample is one column. Samples are annotated for their condition. The sample dendrogram is calculated based on the distances between samples considering only the genes represented on the heatmap. The condition associated to each sample is not considered by the method (unsupervised method). The genes dendrogram is calculated based on the Z-score of that gene among all samples.

# Heatmap top annotation
sub_design = design %>%
  dplyr::select(RepTechGroup, Condition) %>%
  dplyr::filter(RepTechGroup %in% samples_of_interest) %>%
  unique() %>%
  `rownames<-`(.$RepTechGroup)

if (complexMode) {
  sub_design = sub_design %>%
    # Add the complex condition
    `colnames<-`(c("RepTechGroup", "SubCondition")) %>%
    dplyr::mutate(Condition = ifelse(RepTechGroup %in% comparison_samples_group1,
                                     yes = condition1,
                                     no = condition2)) %>%
    dplyr::select(Condition, SubCondition)
  sub_design = sub_design[samples_of_interest, ] # re-order
  
  # Annotation
  ha_top = ComplexHeatmap::HeatmapAnnotation(
    df = sub_design,
    col = list(Condition = setNames(nm = c(condition1, condition2),
                                    c("#3AA5C1", "#FF6800")),
               SubCondition = colors_condition[unique(sub_design$SubCondition)]),
    na_col = "#F7F7F7")
} else {
  # Re-order
  sub_design = sub_design %>%
    dplyr::select(Condition)
  sub_design = sub_design[samples_of_interest, ] # re-order
  
  # Annotation
  ha_top = ComplexHeatmap::HeatmapAnnotation(
    df = sub_design,
    col = list(Condition = colors_condition[c(condition1, condition2)]),
    na_col = "#F7F7F7")
}

# Heatmap title
if (nb_genes > 1) {
  ht_title = paste0("Heatmap of ", nb_genes,
                    " differentially expressed genes\n",
                    comparison_subtitle)
} else {
  ht_title = paste0("Heatmap of ", nb_genes,
                    " differentially expressed gene\n",
                    comparison_subtitle)
}

# Heatmap main
ht = ComplexHeatmap::Heatmap(
  as.matrix(zscore_mat),
  # Expression colors
  col = expressionColors(range(zscore_mat)),
  heatmap_legend_param = list(title = "Z-score"),
  # Annotation
  top_annotation = ha_top,
  # Sample order
  cluster_columns = TRUE,
  # gene order
  cluster_rows = TRUE,
  # Visual aspect
  show_heatmap_legend = TRUE,
  border = TRUE,
  use_raster = FALSE,
  # Title
  column_title = ht_title)

# Draw
ComplexHeatmap::draw(ht,
                     merge_legend = TRUE,
                     heatmap_legend_side = "left",
                     annotation_legend_side = "left",
                     padding = unit(c(samples_padding, 2, 2, 5), "mm"),
                     background = "transparent")

Note: The clustering of samples may change, depending on the genes that are considered.

6.1.8 Save

We re-order the genes based on the adjusted p-value and edit the column names for enhanced readability. We display the 6 first rows of this table.

res = res %>%
  dplyr::mutate(dispersions.dds. = DESeq2::dispersions(dds)) %>%
  dplyr::arrange(padj) %>%
  `colnames<-`( c("ID",
                  "baseMean",
                  paste0("log2foldchange ", condition1, " vs ", condition2),
                  "standard error",
                  "Wald statistic",
                  "Wald test p-value",
                  "BH adjusted p-values",
                  "dispersions.dds.") )

head(res[, -1]) %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
baseMean log2foldchange KO vs WT standard error Wald statistic Wald test p-value BH adjusted p-values dispersions.dds.
ENSMUSG00000000223 69095.112 -8.825746 0.2039640 -43.27110 0 0 0.0284741
ENSMUSG00000021061 11643.299 -7.241691 0.1319283 -54.89112 0 0 0.0095483
ENSMUSG00000059213 10761.636 -9.379653 0.2173631 -43.15200 0 0 0.0200004
ENSMUSG00000056569 2192423.490 -4.989391 0.1308831 -38.12096 0 0 0.0123420
ENSMUSG00000033361 24278.624 -6.175509 0.1627914 -37.93510 0 0 0.0183975
ENSMUSG00000022456 5115.165 -6.712565 0.1814406 -36.99593 0 0 0.0189653

We save the table as a TSV file.

filename = paste0(prefix, projectName, "-diffana_",
                  condition1, "_vs_", condition2,".tsv")

saveTable(dataframe = res,
          fileName = filename)

Output file name:
./project_GSE107401/deseq2_GSE107401-diffana_KO_vs_WT.tsv


7 Versions

▼ show
## [1] "Packages location:"
## [1] "/usr/local/lib/R/library"
## [1] ""
## [1] "Session Information:"
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/lib/R/lib/libRblas.so 
## LAPACK: /usr/local/lib/R/lib/libRlapack.so;  LAPACK version 3.12.1
## 
## locale:
## [1] C
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_4.0.0 dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##   [1] rlang_1.1.6                 magrittr_2.0.3             
##   [3] clue_0.3-66                 GetoptLong_1.0.5           
##   [5] matrixStats_1.5.0           compiler_4.5.1             
##   [7] png_0.1-8                   systemfonts_1.3.1          
##   [9] vctrs_0.6.5                 reshape2_1.4.4             
##  [11] stringr_1.5.2               shape_1.4.6.1              
##  [13] pkgconfig_2.0.3             crayon_1.5.3               
##  [15] fastmap_1.2.0               XVector_0.48.0             
##  [17] rmdformats_1.0.4            labeling_0.4.3             
##  [19] rmarkdown_2.29              UCSC.utils_1.4.0           
##  [21] purrr_1.0.4                 xfun_0.52                  
##  [23] cachem_1.1.0                GenomeInfoDb_1.44.0        
##  [25] jsonlite_2.0.0              flashClust_1.01-2          
##  [27] DelayedArray_0.34.1         BiocParallel_1.42.0        
##  [29] parallel_4.5.1              cluster_2.1.8.1            
##  [31] R6_2.6.1                    bslib_0.9.0                
##  [33] stringi_1.8.7               RColorBrewer_1.1-3         
##  [35] GenomicRanges_1.60.0        jquerylib_0.1.4            
##  [37] estimability_1.5.1          Rcpp_1.0.14                
##  [39] bookdown_0.46               SummarizedExperiment_1.38.0
##  [41] iterators_1.0.14            knitr_1.50                 
##  [43] base64enc_0.1-3             IRanges_2.42.0             
##  [45] Matrix_1.7-3                tidyselect_1.2.1           
##  [47] rstudioapi_0.17.1           abind_1.4-8                
##  [49] yaml_2.3.10                 doParallel_1.0.17          
##  [51] codetools_0.2-20            lattice_0.22-7             
##  [53] tibble_3.2.1                plyr_1.8.9                 
##  [55] Biobase_2.68.0              withr_3.0.2                
##  [57] S7_0.2.0                    evaluate_1.0.3             
##  [59] xml2_1.3.8                  circlize_0.4.16            
##  [61] pillar_1.10.2               MatrixGenerics_1.20.0      
##  [63] DT_0.34.0                   foreach_1.5.2              
##  [65] stats4_4.5.1                plotly_4.11.0              
##  [67] generics_0.1.3              S4Vectors_0.46.0           
##  [69] scales_1.4.0                leaps_3.2                  
##  [71] glue_1.8.0                  emmeans_2.0.0              
##  [73] scatterplot3d_0.3-44        lazyeval_0.2.2             
##  [75] tools_4.5.1                 data.table_1.17.0          
##  [77] locfit_1.5-9.12             mvtnorm_1.3-3              
##  [79] grid_4.5.1                  tidyr_1.3.1                
##  [81] crosstalk_1.2.1             colorspace_2.1-1           
##  [83] GenomeInfoDbData_1.2.14     cli_3.6.5                  
##  [85] kableExtra_1.4.0            textshaping_1.0.0          
##  [87] S4Arrays_1.8.0              viridisLite_0.4.2          
##  [89] svglite_2.2.2               ComplexHeatmap_2.24.0      
##  [91] ggdendro_0.2.0              gtable_0.3.6               
##  [93] DESeq2_1.48.0               sass_0.4.10                
##  [95] digest_0.6.37               BiocGenerics_0.54.0        
##  [97] SparseArray_1.8.0           ggrepel_0.9.6              
##  [99] rjson_0.2.23                FactoMineR_2.12            
## [101] htmlwidgets_1.6.4           farver_2.1.2               
## [103] htmltools_0.5.8.1           lifecycle_1.0.4            
## [105] httr_1.4.7                  multcompView_0.1-10        
## [107] GlobalOptions_0.1.2         MASS_7.3-65

8 Reference

The differential expression analysis uses the DESeq2 R package (vignette).

Love, M.I., Huber, W. & Anders, S., Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol 15, 550 (2014). DOI: 10.1186/s13059-014-0550-8

---
params:
  projectName: "GSE107401"
  designPath: "./project_GSE107401/deseq2_GSE107401-deseq2Design.txt"
  comparisonPath: "./project_GSE107401/deseq2_GSE107401-comparisonFile.txt"
  diffanaTest: TRUE
  expHeader: TRUE
  deseqModel: "~Condition"
  sizeFactorType: "ratio"
  fitType: "parametric"
  statisticTest: "Wald"
  weightContrast: FALSE
  prefix: "./project_GSE107401/deseq2_"
  plotInteractive: TRUE
  logoUrl: "https://www.outils.genomique.biologie.ens.fr/eoulsan/images/eoulsan.png"
  authorName: "Eoulsan"
  authorEmail: "eoulsan@bio.ens.psl.eu"
  leaveOnError: TRUE
title: "<center>Differential expression analysis</center>"
subtitle: "<center>`r paste0('Project: ', params$projectName)`</center>"
date: "`r invisible(Sys.setlocale('LC_TIME', 'C')) ; paste0('Analysis run on: ', format(Sys.time(), '%d %B %Y'))`
<br><span class='glyphicon glyphicon-pushpin'></span>`r paste0(' Project: ', params$projectName)`
<br><span class='glyphicon glyphicon-menu-right'></span> Version 2.1
<br><span class='glyphicon glyphicon-link'></span> Generated by <a href='https://www.outils.genomique.biologie.ens.fr/eoulsan/'>Eoulsan</a>"
# above is a small hack to add new lines in the postample without editing the template
author:
  - name: "`r params$authorName`"
    email: "`r params$authorEmail`"
output:
  rmdformats::readthedown:
    code_folding: show
    code_download: true
    number_sections: true
    toc_depth: 3
---

<!-- CSS to customize the colors -->
```{css style_section1, echo = FALSE}
body {
text-align: justify
}

/* Width of the main div */
#content {
max-width: 80%;
}
```

```{r style_section2, echo = FALSE, results = "asis"}
# NOTE: A CSS chunk cannot interpret the R input parameter. In order to parse the 'params$logoUrl' parameter, we require an R chunk. Therefore, the CSS content is displayed through an R chunk.

# Header in the sidebar
cat(paste0("
<style>
#sidebar h2 {
background-color:#A3ADB8;
background-size: 32%;
background-repeat: no-repeat;
background-position: center bottom;
background-image:url('", params$logoUrl, "');
background-origin: content-box;
height: 13%;
}
</style>
"))
```

```{css style_section3, echo = FALSE}
/* Sidebar body */
#sidebar {
background:#535F6A;
}

/* Small space about the authoring section */
#postamble {
border-top:solid 10px #A3ADB8;
}

/* Titles */
h1, h2, h3, h4, h5, h6, legend {
color: #000038;
}

/* Two-column mode */
.cols {
display: flex;
align-items: center;
}

.col-center {
justify-content: center;
}

.col-right {
margin-left: auto;
}

/* Button */
.btn-custom {
font-family: inherit;
font-size: inherit;
color: inherit;
background-color: #E3E3E3;
border-color: #E3E3E3;
padding: 1px 2px;
border-radius: 5px;
border-style: solid;
color: #000038;
}

.btn-custom:hover {
background-color: #535F6A;
border-color: #535F6A;
color: #FCFCFC;
}
```

<!-- Automatically computes and prints in the output the running time for any code chunk -->
```{r, echo = FALSE}
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)
    
    if (isFALSE(options[[paste0("fold_", type)]])) return(res)
    
    paste0(
      "<details><summary>", "\U25BC show", "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
knitr::knit_hooks$set(
  output = hook_foldable("output"),
  plot = hook_foldable("plot")
)
```

<!-- knit setup -->
```{r knit_setup, echo = FALSE}
fig_save_folder = paste0(params$prefix, "figures/")

knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  cache.lazy = FALSE,
  message = FALSE,
  warning = FALSE,
  fold_output = FALSE,
  fold_plot = FALSE,
  fig.align = "center",
  class.source = "fold-hide",
  # R base plots on transparent background
  dev.args = list(bg = 'transparent'),
  # Export figures
  dev = c('png', 'pdf'),
  fig.path = fig_save_folder,
  pdf.options(encoding = "ISOLatin9.enc"))
```

------------------------------------------------------------------------

The aim of this document is to present the data and differential expression analysis between samples from the project **`r params$projectName`**.

The main steps are:

- Data preparation
- Quality control
- Normalisation
- Differential expression analysis

```{r echo = FALSE, results = "asis"}
# NOTE: This section is very ugly. Due to the conditional section, we need to evaluate in advance whether the section will be executed or not, in order to create a short link to the conditionally-executed section. This requires the design file to be loaded. This section is hidden in the final rendered notebook. We only make calculations here to print a message. But, calculations are performed again in a visible way in the next sections. Not elegant and (low) resource-consuming.

text_to_print = "For a quick access to the figures, go to"

design_tmp = read.table(params$designPath,
                        sep = "\t",
                        header = TRUE,
                        dec = ".",
                        stringsAsFactors = FALSE)

if (!("Reference" %in% colnames(design_tmp))) {
  # Complex design, no reference column
  design_tmp = c(0,1)
} else {
  # Simple design, check if there are diverse Reference values above 0
  design_tmp = design_tmp |>
    dplyr::filter(Reference >= 0) |>
    dplyr::pull(Reference) |>
    unique()
}

if (params$diffanaTest & length(design_tmp) > 1) {
  text_to_print = paste0(
    text_to_print, ":<br>",
    "- [Normalisation > Visualisation](#normalisation_visu) for normalised counts<br>",
    "- [Results subsections](#results_subsection) for the pairwise comparisons")
} else {
  text_to_print = paste0(
    text_to_print, " the section [Normalisation > Visualisation](#normalisation_visu).")
}

cat(paste0(text_to_print, "\n"))
```

All figures are saved in PNG and PDF format and accessible in the *`r fig_save_folder`* folder. You can download the code behind this analysis through the "Code" button in the top right corner of the document.

# Settings

The aim of this section is to:

- load the tools of interest
- define custom functions
- load the input parameters

## Environment

We load the tools to handle data and figures.

```{r library, fold_output=TRUE}
library(dplyr)
library(ggplot2)
```

```{r check_deseq2_version, echo = FALSE}
version_tested = c(1, 48, 0) # version 1.48.0

current_version = utils::packageVersion("DESeq2") %>% # A.B.C
  as.character() %>%
  base::strsplit(., split = "[^0-9]+") %>%
  unlist() %>%
  as.numeric()

if (current_version[[1]] < version_tested[[1]] |
    (current_version[[1]] <= version_tested[[1]] &
     current_version[[2]] < version_tested[[2]])) {
  warning("The notebook has been tested using version 1.48.0 of DESeq2",
          " You are using an older version (",
          utils::packageVersion("DESeq2"),
          ") so it cannot be guaranteed that the script will work...")
}
```

## Custom functions

We further define custom functions to avoid repetitions in the code.

```{r custom_functions, class.source = "fold-hide"}
# -----------------------------------------------------------------------------
# buildCountMatrix
# Create a matrix of reads count
#
# Input:
#   files : a vector of files names
#   sampleLabel : a vector of sample names
#   projectPath: path to the project directory
#
# Output:
#   countMatrix : a reads count matrix
#
# Original author : Vivien DESHAIES
# -----------------------------------------------------------------------------
buildCountMatrix = function(files, sampleLabel, expHeader) {
  
  # read first file and create countMatrix
  countMatrix = read.table(files[1],
                           header = expHeader,
                           stringsAsFactors = F,
                           quote = "")
  colnames(countMatrix) = c("id", "count1")
  
  # read and merge all remaining files with the first
  for (i in 2:length(files)) {
    
    # read files
    exp = read.table(files[i],
                     header = expHeader,
                     stringsAsFactors = F,
                     quote = "")
    
    # lowercase exp columns names
    colnames(exp) = c("id", paste0("count", i))
    
    # merge file data to count matrix by id
    countMatrix = merge(countMatrix, exp, by = "id", suffixes = "_")
  }
  
  # name rows
  rownames(countMatrix) = countMatrix[,1]
  
  # delete first row containing row names
  countMatrix = countMatrix[,-1]
  
  # name columns
  colnames(countMatrix) = sampleLabel
  return(countMatrix)
}

###############################################################################
# -----------------------------------------------------------------------------
# saveRawCountMatrix
#
#   A specific treatment is needed for the raw count matrix because the object
#   DESeq2::counts(dds) does not include the names of the samples as column names
#
#   input: dds -> DESeq object
#          fileName -> character (name of the file to output)
#   output: rawCountMatrix -> file
#
# -----------------------------------------------------------------------------

saveRawCountMatrix = function(dds,fileName) {
  
  countMatrix = DESeq2::counts(dds)
  countMatrix = cbind(countMatrix, row.names(countMatrix))
  
  # Add the samples names as column names + add column Id
  colData = SummarizedExperiment::colData(dds)
  colnames(countMatrix) = c(colData$Name, "Id")
  
  # Put Ids on the first column
  countMatrix = countMatrix[, c("Id", colData$Name)]
  
  # Write the count matrix in a file
  write.table(countMatrix,
              file = paste0(fileName),
              sep = "\t",
              row.names = FALSE,
              quote = FALSE)
}

###############################################################################
# -----------------------------------------------------------------------------
# saveTable
#
#   input: dataframe -> data frame (count matrix)
#          fileName -> character (name of the file to output)
#   output: countMatrix -> file
# -----------------------------------------------------------------------------

saveTable = function(dataframe, fileName) {
  # Add column Id
  column_names = colnames(dataframe)
  dataframe = cbind(row.names(dataframe), dataframe)
  colnames(dataframe) = c("Id", column_names)
  
  # Write the table in a file
  write.table(dataframe,
              file = paste0(fileName),
              sep = "\t",
              row.names = FALSE,
              quote = FALSE)
}

# -----------------------------------------------------------------------------
# plotDendrogram
# Create a ggplot object corresponding to a dendrogram
#
# Input:
#   hc_data : object obtained with the ggdendro::dendro_data function
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
#
# Original author : Audrey ONFROY
# -----------------------------------------------------------------------------
plotDendrogram = function(hc_data,
                          fig_title = "",
                          col_vector = colors_condition) {
  
  p = ggplot2::ggplot() +
    # Draw dendrogram
    ggplot2::geom_segment(data = ggdendro::segment(hcdata), 
                          aes(x = x, y = y, xend = xend, yend = yend)) +
    # Leaf label
    ggplot2::geom_label(data = ggdendro::label(hcdata), 
                        aes(x = x, y = y, label = label),
                        linewidth = 0, fill = NA,
                        size = 3, angle = 90, hjust = 1,
                        nudge_y = -0.02*max(ggdendro::segment(hcdata)$yend)) +
    # Leaf colors
    ggplot2::geom_point(data = ggdendro::label(hcdata),
                        ggplot2::aes(x = x, y = y, col = Condition),
                        size = 3, pch = 19) +
    ggplot2::scale_color_manual(values = col_vector,
                                breaks = names(col_vector)) +
    # Style
    ggplot2::labs(y = "Distance",
                  title = fig_title) +
    ggplot2::coord_cartesian(clip = "off") +
    ggplot2::theme_minimal() +
    ggplot2::theme(axis.text.x = element_blank(),
                   axis.ticks.x = element_blank(),
                   axis.title.x = element_blank(),
                   axis.line.y = element_line(),
                   axis.ticks.y = element_line(),
                   panel.grid = element_blank(),
                   plot.title = element_text(hjust = 0.5),
                   plot.margin = ggplot2::unit(c(0.5, 0.5, max(nchar(hcdata$labels$label))/6, 0.5), "cm"),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotPCA
# Create a ggplot object corresponding to the PCA plot of individuals
#
# Input:
#   pca_df : dataframe
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotPCA = function(pca_df,
                   fig_title = "",
                   col_vector = colors_condition) {
  
  p = ggplot2::ggplot(pca_df, aes(x = Dim.1, y = Dim.2, col = Condition, label = sample)) +
    ggplot2::geom_hline(yintercept = 0, lty = 2) +
    ggplot2::geom_vline(xintercept = 0, lty = 2) +
    # Samples
    ggplot2::geom_point(size = 3, pch = 19) +
    ggrepel::geom_label_repel(fill = NA, label.size = 0) +
    ggplot2::scale_color_manual(values = col_vector,
                                breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = paste0("Dim1 (", round(pcaCount$eig[1, 2], 2), "%)"),
                  y = paste0("Dim2 (", round(pcaCount$eig[2, 2], 2), "%)"),
                  title = fig_title) +
    ggplot2::theme_minimal() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotBarplot
# Create a ggplot object corresponding to a barplot
#
# Input:
#   df : dataframe
#   y_column : column in df for the Y-axis
#   y_title : Y-axis title
#   fig_title : figure title
#   col_vector: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotBarplot = function(df,
                       y_column,
                       y_title = "",
                       fig_title = "",
                       col_vector = colors_condition) {
  df$y_value = df[, y_column]
  
  p = ggplot2::ggplot(df, aes(x = sample, y = y_value, fill = Condition)) +
    ggplot2::geom_bar(stat = "identity", col = "black") +
    ggplot2::scale_fill_manual(values = col_vector,
                               breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = "",
                  y = y_title,
                  title = fig_title) +
    ggplot2::scale_y_continuous(expand = c(0,0)) +
    ggplot2::theme_classic() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   axis.text.x = element_text(angle = 90, hjust = 1),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# plotBoxplot
# Create a ggplot object corresponding to a boxplot
#
# Input:
#   df : dataframe
#   y_column : column in df for the Y-axis
#   y_title : Y-axis title
#   fig_title : figure title
#   colors_condition: named vector of colors
#
# Output:
#   p : ggplot object
# -----------------------------------------------------------------------------
plotBoxplot = function(df,
                       y_column,
                       y_title = "",
                       fig_title = "",
                       col_vector = colors_condition) {
  df$y_value = df[, y_column]
  
  p = ggplot2::ggplot(df, aes(x = sample, y = y_value, fill = Condition)) +
    ggplot2::geom_boxplot(col = "black") +
    ggplot2::scale_fill_manual(values = col_vector,
                               breaks = names(col_vector)) +
    # Style
    ggplot2::labs(x = "",
                  y = y_title,
                  title = fig_title) +
    ggplot2::scale_y_continuous(expand = c(0,0)) +
    ggplot2::theme_classic() +
    ggplot2::theme(plot.title = element_text(hjust = 0.5),
                   axis.text.x = element_text(angle = 90, hjust = 1),
                   # Transparent background
                   panel.background = ggplot2::element_blank(),
                   plot.background = ggplot2::element_blank(),
                   legend.background = ggplot2::element_blank())
  
  return(p)
}

# -----------------------------------------------------------------------------
# expressionColors
# Define a color panel for the heatmap
#
# Input:
#   range_values : a numerical vector with two values (min and max)
#
# Output:
#   expression_colors : a color or a function built with circlize::colorRamp2
# -----------------------------------------------------------------------------
expressionColors = function(range_values) {
  min_value = range_values[1]
  max_value = range_values[2]
  
  # min_value < 0 < max_value
  if (min_value < 0 & 0 < max_value) {
    expression_colors = circlize::colorRamp2(
      breaks = c(min_value, 0, max_value),
      colors = c("#224897", "#F7F7F7", "#D32A24"))
  } else
    
    # min_value < max_value < 0
    if (min_value < max_value & max_value < 0) {
      expression_colors = circlize::colorRamp2(
        breaks = c(min_value, 0),
        colors = c("#224897", "#F7F7F7"))
    } else
      
      # 0 < min_value < max_value
      if (0 < min_value & min_value < max_value) {
        expression_colors = circlize::colorRamp2(
          breaks = c(0, max_value),
          colors = c("#F7F7F7", "#D32A24"))
      } else {
        
        # min_value == max_value
        expression_colors = "#F7F7F7"
      }
  
  return(expression_colors)
}

# -----------------------------------------------------------------------------
# buildContrast
# Define the contrast vector to extract DE results from the dds object
#
# Input:
#   dds       : DESeqDataSet
#   group1    : list of character vectors, for instance:
#                           list(c("sexe", "F"), c("genotype", "HETERO")
#   group2    : list of character vectors, for instance:
#                           list(c("sexe", "F"), c("genotype", "HOMO")
#   weighted  : boolean     whether to weight the contrast matrix by the
#                           number of replicates per condition or not
#
# Output: a list of 3 elements
#   contrast        : a numerical vector
#   samples_group1  : a character vector containing sample names in group 1
#   samples_group2  : a character vector containing sample names in group 2
#
# Source:
# Modified from: https://www.r-bloggers.com/2024/05/a-guide-to-designs-and-contrasts-in-deseq2/
# -----------------------------------------------------------------------------
buildContrast = function(dds,
                         group1,
                         group2,
                         weighted = FALSE){
  #-- Model matrix for each sample 
  mod_mat = model.matrix(DESeq2::design(dds),
                         SummarizedExperiment::colData(dds))
  
  #-- Samples in group1
  # A list of TRUE / FALSE
  # Each element of the list is of length the number of samples
  # The i-th element of the list corresponds to the i-th condition in group1
  grp1_rows = lapply(c(1:length(group1)), FUN = function(group1_i) {
    group1_i_name = group1[[group1_i]][1]
    group1_i_values = group1[[group1_i]][2:length(group1_i)]
    
    samples_in_group1_i = (SummarizedExperiment::colData(dds)[[group1_i_name]] %in% group1_i_values) # TRUE / FALSE
    
    return(samples_in_group1_i)
  })
  grp1_rows = Reduce(function(x, y) x & y, grp1_rows)
  
  grp2_rows = lapply(c(1:length(group2)), FUN = function(group2_i) {
    group2_i_name = group2[[group2_i]][1]
    group2_i_values = group2[[group2_i]][2:length(group2_i)]
    
    samples_in_group2_i = (SummarizedExperiment::colData(dds)[[group2_i_name]] %in% group2_i_values) # TRUE / FALSE
    
    return(samples_in_group2_i)
  })
  grp2_rows = Reduce(function(x, y) x & y, grp2_rows)
  
  #-- Model matrix with only samples of interest
  mod_mat1 = mod_mat[grp1_rows, , drop = FALSE]
  mod_mat2 = mod_mat[grp2_rows, , drop = FALSE]
  
  #-- Consider the weight or not
  if(!weighted){
    mod_mat1 = mod_mat1[!duplicated(mod_mat1), , drop = FALSE]
    mod_mat2 = mod_mat2[!duplicated(mod_mat2), , drop = FALSE]
  }
  
  #-- Output
  contrast = colMeans(mod_mat1)-colMeans(mod_mat2)
  samples_group1 = colnames(dds)[grp1_rows]
  samples_group2 = colnames(dds)[grp2_rows]
  output = list(contrast = contrast,
                samples_group1 = samples_group1,
                samples_group2 = samples_group2)
  
  return(output)
}
```

These functions are listed below.

```{r print_functions, echo = FALSE, results = 'asis'}
cat(paste0("- ", setdiff(ls(), c("hooks", "params", "hook_foldable", "design_tmp",
                                 "fig_save_folder", "text_to_print"))),
    sep = "\n")
```

## Parameters

The table below summarizes all the input parameters.

```{r input_parameter, class.source = "fold-hide"}
projectName     = params$projectName
designPath      = params$designPath
comparisonPath  = params$comparisonPath
diffanaTest     = as.logical(toupper(params$diffanaTest))
expHeader       = as.logical(toupper(params$expHeader))
deseqModel      = params$deseqModel
sizeFactorType  = params$sizeFactorType
fitType         = params$fitType
statisticTest   = params$statisticTest
weightContrast  = params$weightContrast
prefix          = params$prefix
plotInteractive = params$plotInteractive
leaveOnError    = params$leaveOnError

# Display them as a table
c("Project name (projectName)" = projectName,
  "Path to the design file (designPath)" = designPath,
  "Path to the comparison file (comparisonPath)" = comparisonPath,
  "Whether to perform the differential expression analysis or not (diffanaTest)" = diffanaTest,
  "Whether the design table has a header or not (expHeader)" = expHeader,
  "DESeq2 model (deseqModel)" = deseqModel,
  "Size factor type (sizeFactorType)" = sizeFactorType,
  "Fit type (fitType)" = fitType,
  "Statistic test (statisticTest)" = statisticTest,
  "Whether to weight the contrast vector by the number of samples or not (weightContrast)" = weightContrast,
  "Prefix to save files (prefix)" = prefix,
  "Whether to make interactive plots or not (plotInteractive)" = plotInteractive,
  "Whether to stop the rendering in case of error (leaveOnError)" = leaveOnError) %>%
  data.frame(Parameter = names(.),
             Value     = .,
             row.names = NULL) %>%
  knitr::kable("html", escape = FALSE) %>%
  kableExtra::kable_styling("striped", full_width = FALSE) %>%
  kableExtra::column_spec(1, bold = TRUE)
```

# Data preparation

The purpose of this section is to:

* load the design file, containing metadata about each sample ;
* load the individual sample count data, assemble them as a gene-by-sample count matrix, and save it as a TSV file ; and,
* build a dataset in a format compatible with the differential expression analysis (DESeq2-related format).

## Load metadata

We load the design file and display the 6 first rows. 

```{r load_design}
design = read.table(designPath,
                    sep = "\t",
                    header = TRUE,
                    dec = ".",
                    stringsAsFactors = FALSE)

# About a warning in DESeq2
design$Condition = stringr::str_replace_all(
  design$Condition,
  pattern = "-",
  replacement = "_")

# Add Reference if does not exist
if (!("Reference" %in% colnames(design))) {
  design$Reference = Inf
  complexMode = TRUE
} else {
  complexMode = FALSE
}

# Reorder samples based on Condition
design = design %>%
  dplyr::arrange(Condition, RepTechGroup, Name) %>%
  dplyr::mutate(Condition = factor(Condition, levels = unique(Condition))) %>%
  dplyr::mutate(RepTechGroup = factor(RepTechGroup, levels = unique(RepTechGroup))) %>%
  dplyr::mutate(Name = factor(Name, levels = unique(Name)))

# Remove columns out of interest
design = design[, !(colnames(design) %in% c("SampleId", "Description", "Date", "FastqFormat"))]

# Display
head(design) %>%
  knitr::kable("html") 
```


Description of the columns:

```{r print_metadata_help, echo = FALSE, results = 'asis'}
the_message = c(Name = "- **`Name`**: Unique identifier of each technical replicate.",
                expressionFile = "- **`expressionFile`**: File path to the count expression data. The **TSV** file contains two columns: one for the gene identifiers and one for the number of reads mapping to their genomic location. For compatibility with Excel, please use the **XLSX** file instead of the TSV file ([read more](https://pmc.ncbi.nlm.nih.gov/articles/PMC8357140/)). Furthermore, the XLSX file contains additional columns, such as a description of each gene.",
                Reads = "- **`Reads`**: File path to the sequencing reads, in FASTQ format ([read more](https://en.wikipedia.org/wiki/FASTQ_format)).",
                RepTechGroup = "- **`RepTechGroup`**: Samples with the same value in the `RepTechGroup` column are **technical replicates**. This occurs when the sequencing library is split over sequencing lanes. Each lane generates a technical replicate from the same biological library.",
                Condition = "- **`Condition`**: Samples with the same value in the `Condition` column are **biological replicates**. They are produced by the experimenter.",
                Reference = "- **`Reference`**: Numerical values used to define the pairwise comparisons between conditions in the differential expression analysis.")
the_message = the_message[names(the_message) %in% colnames(design)]

cat(paste0(the_message), sep = "\n")
```

We associate a color to each condition.

```{r add_colors}
colors_condition = unique(design$Condition) %>%
  setNames(nm = .,
           grDevices::hcl.colors(length(.), "Spectral"))

data.frame(Color = unname(colors_condition),
           Condition = names(colors_condition)) %>%
  knitr::kable("html",
               col.names = c("Color (hex code)", "Condition")) %>%
  kableExtra::column_spec(1, color = colors_condition) %>%
  kableExtra::kable_styling(full_width = FALSE)
```

## Build count matrix

From the individual count data, we build a gene-by-sample count matrix. The count value for a given gene corresponds to the number (integer value) of sequencing reads that map to its genomic region. We display the top left corner of this matrix, meaning 5 genes and 5 samples. The complete matrix is saved as a TSV file ([quick access](#save_count_matrix)).

```{r count_mat}
count_mat = buildCountMatrix(files = design$expressionFile,
                             sampleLabel = design$Name,
                             expHeader = expHeader)

# Reorder columns
count_mat = count_mat[, levels(design$Name)]

# Display
count_mat[c(1:5), c(1:5)] %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
```

The count matrix contains **`r nrow(count_mat)`** genes, **`r ncol(count_mat)`** samples and **`r round(100*mean(count_mat == 0),2)`**% of null values.

## Create DESeqDataSet object

We create the DESeqDataSet object that includes the count matrix and the design file.

```{r dds_object}
dds = DESeq2::DESeqDataSetFromMatrix(
  countData = count_mat,
  colData = design,
  design = as.formula(deseqModel))
```

## Save count matrix {#save_count_matrix}

We save the count matrix as a TSV file.

```{r save_count_matrix}
filename = paste0(prefix, projectName, "-normalisation_rawCountMatrix.tsv")

saveRawCountMatrix(dds,
                   file = filename)
```

Output file name:<br>*`r filename`*

# Quality control

The aim of this section is to better apprehend the variability between the technical replicates. For information on the correspondence between the identifiers of technical replicates and sample names, please refer to the design file.

## Raw counts dendrogram

We compute euclidean distances between each pair of samples, considering the raw count values.

```{r dendrogram_raw_count_matrix}
dist.mat = dist(t(count_mat))
hc = stats::hclust(dist.mat)
hcdata = ggdendro::dendro_data(hc, type = "rectangle")

# Add grouping for colors
hcdata$labels = dplyr::left_join(x = hcdata$labels,
                                 y = design[, c("Name", "Condition")],
                                 by = c("label" = "Name"))
```

We visualise these distances through a dendrogram. It informs whether technical replicates or closely related conditions are closer together than with other samples.

```{r fig-normalisation_unpooled_clustering, fig.width = 10, fig.height = 6}
plotDendrogram(hc_data,
               fig_title = paste0("Dendrogram on the raw count matrix - ",
                                  projectName))
```

## Raw counts PCA

The **`r ncol(count_mat)`** samples are represented in a space of **`r nrow(count_mat)`** dimensions, corresponding to genes. We perform a Principal Component Analysis (PCA) to visualise samples on a two-dimension space such that it represents the highest variability between samples.

```{r run_pca}
pcaCount = count_mat %>%
  t() %>%
  FactoMineR::PCA(., graph = FALSE)
```

We visualise the samples on the two-dimensional projection. The coordinates of each sample depend on those of the other samples. This projection captures **`r round(pcaCount$eig[2, 3], 2)`**% of the total variability between samples. Therefore, it lacks **`r 100-round(pcaCount$eig[2, 3], 2)`**% of the initial information. In this representation, two points (samples) are close together if they have similar global transcriptomic profiles. We expect the technical and biological replicates to be closer to each other than to the other samples and conditions.

```{r fig-normalisation_unpooled_PCA, fig.width = 10, fig.height = 6}
pcaCount$ind$coord %>%
  as.data.frame() %>%
  dplyr::mutate(sample = rownames(.)) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("Name", "Condition")]),
                   by = c("sample" = "Name")) %>%
  plotPCA(.,
          fig_title =  paste0("PCA of the raw count matrix - ", projectName))
```

## Null counts barplot

This figure displays the proportion of null counts per sample, meaning, the genes that are not being expressed or not captured by the protocol, within the biological samples. We expect samples from the same condition to be similar.

```{r fig-normalisation_null_counts, fig.width = 10, fig.height = 6}
colMeans(count_mat == 0) %>%
  data.frame(sample = factor(names(.), levels = levels(design$Name)),
             prop_null_counts = .) %>%
  dplyr::mutate(prop_null_counts = 100*prop_null_counts) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBarplot(.,
              y_column = "prop_null_counts",
              y_title = "Proportion of null counts (%)",
              fig_title = paste0("Proportion of null counts per sample - ",
                                 projectName))
```

## Raw counts barplot

This figure displays the total number of reads per sample, also known as library size. We can assess if some samples have a low library size compared to others, and whether technical replicates share a similar number of reads or not.

```{r fig-normalisation_barplot_counts, fig.width = 10, fig.height = 6}
colSums(DESeq2::counts(dds)) %>%
  data.frame(sample = factor(names(.), levels = levels(design$Name)),
             read_counts = .) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBarplot(.,
              y_column = "read_counts",
              y_title = "Total read counts",
              fig_title = paste0("Number of reads per sample - ", projectName))
```

## Raw counts boxplot

This figure displays, as a boxplot, the number of raw counts for each gene, per sample. It informs whether the genes show a similar count distribution between samples and conditions.

```{r fig-normalisation_boxplot_count, fig.width = 10, fig.height = 6}
log2(DESeq2::counts(dds)+1) %>%
  reshape2::melt() %>%
  `colnames<-`(c("gene_ID", "sample", "log2_counts")) %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$Name))) %>%
  dplyr::left_join(x = .,
                   y = design[, c("Name", "Condition")],
                   by = c("sample" = "Name")) %>%
  # Plot
  plotBoxplot(.,
              y_column = "log2_counts",
              y_title = "log_2 (counts+1)",
              fig_title = paste0("Raw counts distribution per sample - ", projectName))
```


<!-- Conditional execution in case technical replicates are present -->
<br>

```{r tech_rep_present, echo = FALSE}
tech_rep_present = (max(table(design$RepTechGroup, design$Condition)) > 1)

if (tech_rep_present) {
  the_message = "In this context, there are technical replicates."
} else {
  the_message = "In this context, there is no technical replicate."
  
  # Anyway, we change the sample labels to the biological meaningful ones.
  dds = DESeq2::collapseReplicates(dds,
                                   groupby = dds$RepTechGroup)
}
```

`r the_message`

```{r collapse_replicates, echo = FALSE, results = 'asis', eval = tech_rep_present}
res = knitr::knit_child(
  input = '02_child_collapse_replicates.Rmd',
  quiet = TRUE)

cat(res, sep = '\n')
```

# Normalisation

The aim of this section is to normalise the count matrix, before performing the differential expression analysis. The normalisation method considers the sequencing depth and RNA composition of each sample to compute normalised count values.

For more details, please visit:

[https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html](https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html)

## Data handling

We normalise the count matrix. We display the top left corner of this matrix, meaning 5 genes and 5 samples. Unlike count data, normalised count values are not necessarily integers.

```{r estimateSizeFactors}
dds = DESeq2::estimateSizeFactors(dds,
                                  type = sizeFactorType)

DESeq2::counts(dds, normalized = TRUE)[c(1:5), c(1:5)] %>%
  knitr::kable("html") %>%
  kableExtra::kable_styling(full_width = FALSE)
```

We save the normalised count matrix as a TSV file.

```{r save_norm_count_matrix}
filename = paste0(prefix, projectName,
                  "-normalisation_normalisedCountMatrix.tsv")

saveTable(DESeq2::counts(dds, normalized = TRUE),
          fileName = filename)
```

Output file name:<br>*`r filename`*

## Visualisation {#normalisation_visu}

### Normalised counts dendrogram

We compute euclidean distances between each pair of samples, considering the normalised count values.

```{r dendrogram_pooled_normalised_count_matrix}
ddsStabilized = dds %>%
  DESeq2::varianceStabilizingTransformation() %>%
  SummarizedExperiment::assay()
dist.mat = dist(t(ddsStabilized))
hc = stats::hclust(dist.mat)
hcdata = ggdendro::dendro_data(hc, type = "rectangle")

# Add grouping for colors
hcdata$labels = dplyr::left_join(
  x = hcdata$labels,
  y = unique(design[, c("RepTechGroup", "Condition")]),
  by = c("label" = "RepTechGroup"))
```

We visualise these distances through a dendrogram. It informs whether the samples share a similar transcriptomic profile per condition or not.

```{r fig-normalisation_pooled_clustering, fig.width = 10, fig.height = 6}
plotDendrogram(
  hc_data,
  fig_title = paste0("Dendrogram on the normalised count matrix - ", projectName))
```

### Normalised counts PCA

We perform a PCA for all samples using the normalised count matrix.

```{r run_pca2}
pcaCount = dds %>%
  DESeq2::counts(., normalized = TRUE) %>%
  t() %>%
  FactoMineR::PCA(., graph = FALSE)
```

We visualise the samples on the two-dimensional projection. The coordinates of each sample depend on those of the other samples. This projection captures **`r round(pcaCount$eig[2, 3], 2)`**% of the total variability between samples. Therefore, it lacks **`r 100-round(pcaCount$eig[2, 3], 2)`**% of the initial information. In this representation, two points (samples) are close together if they have similar global transcriptomic profiles. We expect samples from the same biological condition to be closer to each other than to samples from other conditions.

```{r fig-normalisation_pooled_PCA, fig.width = 10, fig.height = 6}
pcaCount$ind$coord %>%
  as.data.frame() %>%
  dplyr::mutate(sample = rownames(.)) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  plotPCA(.,
          fig_title =  paste0("PCA of the normalised count matrix - ", projectName))
```

### Normalised counts boxplot

This figure displays, as a boxplot, the number of normalised counts for each gene, per sample.

```{r fig-normalisation_normalised_boxplot_count, fig.width = 10, fig.height = 6}
log2(DESeq2::counts(dds, normalized = TRUE)+1) %>%
  reshape2::melt() %>%
  `colnames<-`(c("gene_ID", "sample", "log2_counts")) %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$RepTechGroup))) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  # Plot
  plotBoxplot(
    .,
    y_column = "log2_counts",
    y_title = "log_2 (counts+1)",
    fig_title = paste0("Normalised counts distribution per sample - ", projectName))
```

### Most expressed genes plot

We identified the most expressed gene in each sample.

```{r identify_most_expressed_genes}
# preparation of 2 data frame with the same number of column than
# the dds count matrix
maxCounts = DESeq2::counts(dds)[1,]
transcriptNames = DESeq2::counts(dds)[1,]

# for each sample (column)
for (i in 1:ncol(DESeq2::counts(dds))) {
  
  # selection of the maximum number of count
  maxCounts[i] = (max(DESeq2::counts(dds, normalized = TRUE)[,i])/
                    sum(DESeq2::counts(dds, normalized = TRUE)[,i]))*100
  
  # selection of the name of the genes this the maximum of count
  transcriptNames[i] = row.names(
    subset(DESeq2::counts(dds, normalized = TRUE),
           DESeq2::counts(dds, normalized = TRUE)[,i] == 
             max(DESeq2::counts(dds, normalized = TRUE)[,i])))
}
```

We represent the information as a barplot.

```{r fig-normalisation_most_expressed_genes, fig.width = 10, fig.height = 2+ncol(count_mat)/4}
dplyr::left_join(x = data.frame(sample = names(maxCounts),
                                max_counts = maxCounts),
                 y = data.frame(sample = names(transcriptNames),
                                gene_id = transcriptNames),
                 by = "sample") %>%
  dplyr::mutate(sample = factor(sample, levels = levels(design$RepTechGroup))) %>%
  dplyr::left_join(x = .,
                   y = unique(design[, c("RepTechGroup", "Condition")]),
                   by = c("sample" = "RepTechGroup")) %>%
  # Plot
  ggplot2::ggplot(., aes(x = sample, y = max_counts,
                         fill = Condition, label = gene_id)) +
  ggplot2::geom_bar(stat = "identity", col = "black") +
  ggplot2::geom_label(mapping = aes(x = sample, y = 0),
                      linewidth = 0, fill = NA, hjust = 0) +
  ggplot2::scale_fill_manual(values = colors_condition,
                             breaks = names(colors_condition)) +
  # Style
  ggplot2::labs(y = "Proportion of reads (%)",
                title = paste0("Most expressed genes - ", projectName)) +
  ggplot2::scale_y_continuous(expand = c(0,0)) +
  ggplot2::coord_flip() +
  ggplot2::theme_classic() +
  ggplot2::theme(plot.title = element_text(hjust = 0.5),
                 axis.title.y = element_blank(),
                 axis.text.x = element_text(angle = 90, hjust = 1),
                 # Transparent background
                 panel.background = ggplot2::element_blank(),
                 plot.background = ggplot2::element_blank(),
                 legend.background = ggplot2::element_blank())
```


<!-- Conditional execution in case differential expression analysis should be done -->

```{r diffana, echo = FALSE, results = 'asis', eval = diffanaTest}
res = knitr::knit_child(
  input = '03_child_differential_expression.Rmd',
  quiet = TRUE)

cat(res, sep = '\n')
```


# Versions

```{r sessioninfo, echo = FALSE, fold_output = TRUE, results = "hold"}
print("Packages location:")
.libPaths()
print("")
print("Session Information:")
sessionInfo()
```

# Reference

The differential expression analysis uses the `DESeq2` R package ([vignette](https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)).

> Love, M.I., Huber, W. & Anders, S., Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol 15, 550 (2014). DOI: [10.1186/s13059-014-0550-8](https://doi.org/10.1186/s13059-014-0550-8)
