5 Differential Expression

top_p_contrast1 <- 1e-8
top_fc_contrast1 <- 1
de_padj_threshold_contrast1 <- 0.001
de_fc_threshold_contrast1 <- 0.5

top_p_contrast2 <- 1e-10
top_fc_contrast2 <- 0.5
de_padj_threshold_contrast2 <- 0.001
de_fc_threshold_contrast2 <- 0.5

Differential expression (DE) is a common method used to compare gene expression between sample groups and to quantify the significance of these differences. We perform DE on a per-gene basis where the normalized expression is modeled using a linear mixed-effect model (LMM) to account for the sampling of multiple ROI/AOI segments per tissue. For more details, please see the LMM section in the Appendix.

For a given comparison, we will subset the data down to the relevant segments and perform DE. We provide tables, heatmaps, violin plots, and volcano plots of the top differentially expressed genes.

# convert test variables to factors
for (variable in factors_of_interest){
  pData(target_data)[[variable]] <- factor(pData(target_data)[[variable]])
  rm(variable)
  }
if ("slide name" %in% factors_of_interest){
    pData(target_data)[["slide"]] <- factor(pData(target_data)[["slide name"]])
}

We will now focus on the two key experimental comparisons:

Question 1: What are the differences between glomeruli and tubules in kidneys?

Question 2: What are the differences between glomeruli from healthy versus diseased (DKD) kidneys?

5.1 Glomeruli vs Tubules

We will compare glomeruli and tubules across the kidney dataset.

The following formula was used to model differences for a given gene:

\[ gene \sim region + (1+region|slide) \]

We adjust for the multiple sampling of ROI/AOI segments per tissue with the \(slide\) variable.

contrast1 <- target_data # ie, no filtering for this contrast

if(file.exists(file.path(de_data_dir, "de_contrast1.RDS"))){
 de_contrast1 <- readRDS(file.path(de_data_dir, "de_contrast1.RDS"))
} else {
  de_contrast1 <- mixedModelDE(contrast1,
                   elt = "log_q",
                   modelFormula = ~ region + (1 + region|slide), 
                   groupVar = "region",
                   nCores = parallel::detectCores()-1,
                   multiCore = FALSE)
  de_contrast1 <- formatLMMResults(de_contrast1, p_adjust_method = p_adjust_method_selected)
  saveRDS(de_contrast1, file=file.path(de_data_dir, "de_contrast1.RDS"))

}

Figures in the tabs below can be used to visualize differences between our two groups. The Volcano tab plots the log2 fold change (FC) between groups within the contrast and significance (p-value) for all genes. Additionally, a FDR (False Discovery Rate) correction was applied. The Heatmap tab visualizes the top genes using unsupervised hierarchical clustering on Pearson distances of the normalized data z-scores.

Analyst Notes: Overall, 3048 genes were significant at FDR < 0.05 (False Discovery Rate). 202 genes were significant using the dual thresholding of FDR < 0.001 (False Discovery Rate) and |log2 fold change| > 0.5 (datapoints in blue). A heatmap of these 202 genes show clustering primarily by region. Glomeruli show elevated EMP3 and PODXL expression relative to tubules. In contrast, STX3 and SUCLG1 are higher in tubules relative to glomeruli.

de_contrast1_volcano_list <- makeVolcano(df=de_contrast1, 
                                         to_label=label_de1,
                                         label_color="grey20", 
                                         SCALE=1.4, LWD=1,
                                         the_name = de_contrast1$Comparison[1],
                                         log_type="log2(FC)", 
                                         fc_cutoff=1,
                                         p_adjust_column = p_adjust_method,
                                         pval_cutoff=0.05)

write.table(de_contrast1_volcano_list[[2]], file.path(de_data_dir, "contrast1_de_volcano.tsv"), sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
p <- de_contrast1_volcano_list[[1]]
ggsave(p, filename = file.path(de_plot_dir, "contrast1_de_volcano.svg"), width=8, height=5)
p


heatmap1_features <- unique(c(foi,
                            subset(de_contrast1, de_contrast1[[p_adjust_method]] < de_padj_threshold_contrast1 & 
                                    abs(`Estimate`) > de_fc_threshold_contrast1)$Feature))
heatmap_dat1 <- t(scale(t(
 assayDataElement(contrast1[heatmap1_features, ], elt = "q_norm"))))
contrast1_de_heatmap <- 
 Heatmap(heatmap_dat1,
         col = colorRamp2(c(seq(-3, 3, 0.05)), 
                          c(colorRampPalette(c("#0092b5", "white", "#a6ce39"))(121))),
         name = 'z-score', use_raster = TRUE,
         clustering_distance_rows = "pearson", clustering_method_rows = "average",
         clustering_distance_columns = "pearson", clustering_method_columns = "average",
         row_split = 2, column_split = 2,
         border_gp = gpar(col = "darkgray"),
         show_row_names = FALSE, show_column_names = FALSE,
         right_annotation = 
          rowAnnotation(foo = anno_mark(at = match(label_de1, rownames(heatmap_dat1)),
                                        labels = label_de1)),
         top_annotation = 
          HeatmapAnnotation(df= pData(contrast1)[,factors_of_interest],
                            col = pal_heatmaps))

svglite::svglite(filename = file.path(de_plot_dir, "contrast1_de_heatmap.svg"),
                width=12, height=10)
draw(contrast1_de_heatmap, merge_legend = TRUE, heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
dev.off()
## png 
##   2
draw(contrast1_de_heatmap, merge_legend = TRUE, heatmap_legend_side = "bottom", 
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)

if(any(label_de1 %in% colnames(pData(contrast1)))){
  stop("column names in pheno data cannot be label feature names.")
}
# individual points
contrast_factor <- factors_of_interest[3]
violin_df <- cbind(pData(contrast1) %>% dplyr::select(eval(contrast_factor)),
                  t(assayDataElement(contrast1[label_de1, ], elt = "q_norm")))
violin_df <- violin_df %>% tidyr::pivot_longer(cols=-1, names_to = "Feature", values_to = "Expression")
colnames(violin_df)[1] <- "contrast_factor"
# Format summary stats
violin_p_df <- filter(de_contrast1, Feature %in% label_de1)
violin_p_df <- violin_p_df %>% tidyr::separate(col = Comparison, into=c("group1", "group2"), sep=" vs ")
violin_p_df[p_adjust_method] <- signif(violin_p_df[p_adjust_method], 3)
violin_exp_max <- ddply(violin_df, .(Feature), summarize,
                        y.position=((max(Expression)+1)*1.1)) # +1 for safe log2
violin_p_df <- base::merge(violin_p_df, violin_exp_max, by="Feature")


p <- ggplot(violin_df, 
            aes(x=contrast_factor, y=Expression, fill=contrast_factor)) + 
  geom_violin() +
  geom_jitter(width=0.25, height=0, size = 1.5) + 
  scale_fill_manual(values = pal_main[[contrast_factor]]) +
  facet_wrap(~Feature, scales = "free_y") +
  labs(x = eval(contrast_factor), y = "Expression (normalized counts)") +
  scale_y_continuous(trans = "log2", expand = expansion(mult = 0.2)) +
  theme_bw(base_size = 14) + 
  guides(fill=guide_legend(title = eval(contrast_factor)))
p <- p + ggprism::add_pvalue(
  violin_p_df,
  label="{p_adjust_class} = {violin_p_df[[p_adjust_method]]}", label.size = 3.6,
  y.position = violin_p_df$y.position
) + theme(legend.position="bottom")
ggsave(p, filename = file.path(de_plot_dir, "contrast1_FOI.svg"), width=12, height=8)
p

The searchable table below lists log2 fold change estimates and p-values for each gene used in the above heatmap.

# Change the adjusted P-value column name to its classified adjusted P-value method for display.
if(p_adjust_method_selected == "none"){
 dt1 <- de_contrast1[, c("Feature", "Comparison", "Estimate", "P")]
 selected_dt1 <- subset(dt1, Feature %in% heatmap1_features) %>% arrange(P)
} else {
 dt1 <- de_contrast1
 colnames(dt1)[which(colnames(de_contrast1) == p_adjust_method)] <- p_adjust_class
 selected_dt1 <- subset(dt1, Feature %in% heatmap1_features) %>% arrange(P)
}

dt_params$buttons <-   list(list(extend = "copy"),
                       list(extend = "csv", filename = "03.contrast1_de_top_features"),
                       list(extend = "excel", filename = "03.contrast1_de_top_features"))
DT::datatable(
  selected_dt1,
  # dt1 (for all genes) 
  extensions = c("Buttons", "Scroller", "FixedColumns"),
  options = dt_params,
  rownames = FALSE) %>% DT::formatRound(columns=3, digits=3) %>% DT::formatSignif(columns=4:5)

5.2 Healthy vs Diseased (DKD) Glomeruli

We will now compare glomeruli from healthy and diseased (DKD) kidneys.

The following formula was used to model differences for a given gene:

\[ gene \sim class + (1|slide) \]

We adjust for the multiple sampling of ROI/AOI segments per tissue with the \(slide\) variable.

# Subset the data to only glomeruli.
contrast2_samples <- 
 rownames(filter(pData(target_data),
                 region=="glomerulus"))

contrast2 <- target_data[,contrast2_samples]

if(file.exists(file.path(de_data_dir, "de_contrast2.RDS"))){
  de_contrast2 <- readRDS(file.path(de_data_dir, "de_contrast2.RDS"))
} else {
  de_contrast2 <-
    mixedModelDE(contrast2,
                 elt = "log_q",
                 modelFormula = ~ class + (1 | slide), 
                 groupVar = "class",
                 nCores = parallel::detectCores()-1,
                 multiCore = FALSE)
  de_contrast2 <- formatLMMResults(de_contrast2, p_adjust_method = p_adjust_method_selected)
  saveRDS(de_contrast2, file=file.path(de_data_dir, "de_contrast2.RDS"))
}

Figures in the tabs below can be used to visualize differences between our two groups. The Volcano tab plots the log2 fold change (FC) between groups within the contrast and significance (p-value) for all genes. Additionally, a FDR (False Discovery Rate) correction was applied. The Heatmap tab visualizes the top genes using unsupervised hierarchical clustering on Pearson distances of the normalized data z-scores.

Let’s visualize our genes of interest to understand their biological heterogeneity. A simple and effective plot to view individual genes is the violin plot. This visualization reveals both the dynamic range and shape of the distribution for a gene target.

Analyst Notes: Overall, 107 genes were significant at FDR < 0.05 (False Discovery Rate). 20 genes were significant using the dual thresholding of FDR < 0.001 (False Discovery Rate) and |log2 fold change| > 0.5 (datapoints in blue). Most of these genes are enriched in DKD samples (see Heatmap), suggesting a disease-state specific response. For example, the violin plot of the log2 expression of LSP1 shows that this gene is elevated in DKD samples.

de_contrast2_volcano_list <- makeVolcano(df=de_contrast2, 
                                         to_label=label_de2,
                                         label_color="grey20", 
                                         SCALE=1.4, LWD=1,
                                         the_name = de_contrast2$Comparison[1],
                                         log_type="log2(FC)", 
                                         fc_cutoff=0.5,
                                         p_adjust_column = p_adjust_method,
                                         pval_cutoff=0.05)

write.table(de_contrast2_volcano_list[[2]], file.path(de_data_dir, "contrast2_de_volcano.tsv"), sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE)

p <- de_contrast2_volcano_list[[1]]
ggsave(p, filename = file.path(de_plot_dir, "contrast2_de_volcano.svg"), width=8, height=5)
p

heatmap2_features <- unique(c(foi,
                            subset(de_contrast2, de_contrast2[[p_adjust_method]] < de_padj_threshold_contrast2 & 
                                    abs(`Estimate`) > de_fc_threshold_contrast2)$Feature))
heatmap_dat2 <- t(scale(t(
 assayDataElement(contrast2[heatmap2_features, ], elt = "q_norm"))))
contrast2_de_heatmap <- 
 Heatmap(heatmap_dat2,
         col = colorRamp2(c(seq(-3, 3, 0.05)), 
                          c(colorRampPalette(c("#0092b5", "white", "#a6ce39"))(121))),
         name = 'z-score', use_raster = TRUE,
         clustering_distance_rows = "pearson", clustering_method_rows = "average",
         clustering_distance_columns = "pearson", clustering_method_columns = "average",
         row_split = 2, column_split = 2,
         border_gp = gpar(col = "darkgray"),
         show_row_names = FALSE, show_column_names = FALSE,
         right_annotation = 
          rowAnnotation(foo = anno_mark(at = match(label_de2, rownames(heatmap_dat2)),
                                        labels = label_de2)),
         top_annotation = 
          HeatmapAnnotation(df= pData(contrast2)[, factors_of_interest],
                            col = pal_heatmaps))

svglite::svglite(filename = file.path(de_plot_dir, "contrast2_de_heatmap.svg"),
                width=12, height=10)
draw(contrast2_de_heatmap, merge_legend = TRUE, heatmap_legend_side = "bottom",
     annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)
dev.off()
## png 
##   2
draw(contrast2_de_heatmap, merge_legend = TRUE, heatmap_legend_side = "bottom", 
      annotation_legend_side = "bottom", adjust_annotation_extension = TRUE)

if(any(label_de2 %in% colnames(pData(contrast2)))){
  stop("column names in pheno data cannot be label feature names.")
}
# individual points
contrast_factor <- factors_of_interest[2]
violin_df <- cbind(pData(contrast2) %>% dplyr::select(eval(contrast_factor)),
                  t(assayDataElement(contrast2[label_de2, ], elt = "q_norm")))
violin_df <- violin_df %>% tidyr::pivot_longer(cols=-1, names_to = "Feature", values_to = "Expression")
colnames(violin_df)[1] <- "contrast_factor"
# Format summary stats
violin_p_df <- filter(de_contrast2, Feature %in% label_de2)
violin_p_df <- violin_p_df %>% tidyr::separate(col = Comparison, into=c("group1", "group2"), sep=" vs ")
violin_p_df[p_adjust_method] <- signif(violin_p_df[p_adjust_method], 3)
violin_exp_max <- ddply(violin_df, .(Feature), summarize,
                        y.position=((max(Expression)+1)*1.1)) # +1 for safe log2
violin_p_df <- base::merge(violin_p_df, violin_exp_max, by="Feature")


p <- ggplot(violin_df, 
            aes(x=contrast_factor, y=Expression, fill=contrast_factor)) + 
  geom_violin() +
  geom_jitter(width=0.25, height=0, size = 1.5) + 
  scale_fill_manual(values = pal_main[[contrast_factor]]) +
  facet_wrap(~Feature, scales = "free_y") +
  labs(x = eval(contrast_factor), y = "Expression (normalized counts)") +
  scale_y_continuous(trans = "log2", expand = expansion(mult = c(0.1, 0.2)), labels = scales::number_format(accuracy = 0.01)) +
  theme_bw(base_size = 14) +  
  guides(fill=guide_legend(title = eval(contrast_factor)))

p <- p + ggprism::add_pvalue(
  violin_p_df,
  label="{p_adjust_class} = {violin_p_df[[p_adjust_method]]}", label.size = 3.6,
  y.position = violin_p_df$y.position
) + theme(legend.position="bottom")
ggsave(p, filename = file.path(de_plot_dir, "contrast2_FOI.svg"), width=12, height=8)
p

The searchable table below lists log2 fold change estimates and p-values for each gene used in the above heatmap.

# Change the adjusted P-value column name to its classified adjusted P-value method for display.
if(p_adjust_method_selected == "none"){
 dt2 <- de_contrast2[, c("Feature", "Comparison", "Estimate", "P")]
 selected_dt2 <- subset(dt2, Feature %in% heatmap2_features) %>% arrange(P)
} else {
 dt2 <- de_contrast2
 colnames(dt2)[which(colnames(de_contrast2) == p_adjust_method)] <- p_adjust_class
 selected_dt2 <- subset(dt2, Feature %in% heatmap2_features) %>% arrange(P)
}

dt_params$buttons <-   list(list(extend = "copy"),
                       list(extend = "csv", filename = "03.contrast2_de_top_features"),
                       list(extend = "excel", filename = "03.contrast2_de_top_features"))
DT::datatable(
  selected_dt2,
  # dt2 (for all genes) 
  extensions = c("Buttons", "Scroller", "FixedColumns"),
  options = dt_params,
  rownames = FALSE) %>% DT::formatRound(columns=3, digits=3) %>% DT::formatSignif(columns=4:5)