在生信分析中,|log2FC| >1,p<0.05就是我們做分析想要的結(jié)果。不管是用GEO數(shù)據(jù)、TCGA數(shù)據(jù)等做分析,甚至是自己花錢做單細胞、多組學(xué),沒有差異也是枉然。在實驗中,表達差異和表型驗證,也要看到基因表達對表型的影響具有統(tǒng)計學(xué)差異。我們總結(jié)說過, 生信分析的數(shù)據(jù)可視化的方式要根據(jù)待展示差異基因的數(shù)目而定 。

如果待展示的基因較少,柱狀圖、箱線圖、散點圖、小提琴圖等都可以(10^1數(shù)量級);如果待展示的差異基因數(shù)為幾十個甚至上百個,首選熱圖(10^2數(shù)量級);如果待展示的基因數(shù)目已經(jīng)幾百,甚至幾千個(10^3數(shù)量級及以上),那么火山圖絕對是首選。

在銅死亡論文中,圖A(左)篩選流程,比較兩種銅離子載體的共同耐藥基因。圖A(右)是篩選結(jié)果,藍色標記是保護性基因(敲除后細胞存活),紅色標記是敏感基因(敲除后細胞更易死亡)。FDX1是最高置信度的基因,表明其在銅死亡中起核心作用。圖B和圖C火山圖展示篩選結(jié)果,橫軸是基因敲除效應(yīng)(log fold change),縱軸是統(tǒng)計顯著性。FDX1、LIAS、LIPT1、DLD等基因位于右上角,表明它們對銅死亡至關(guān)重要。

差異基因火山圖。關(guān)鍵基因:下調(diào)基因(胰腺癌 vs 健康對照):Robo1(軸突導(dǎo)向)、Mir6236(非編碼RNA)、Lin28b(干細胞調(diào)控)。上調(diào)基因:Gal(神經(jīng)肽)、Erbb3(生長因子受體)、Mif(免疫調(diào)節(jié)因子)。這篇Nature文章是以橫軸火山圖來展示的。

好看的火山圖真的很多。這里,上次我們用DeepSeek對GEO芯片數(shù)據(jù)處理,并進行差異分析和初步火山圖繪制。這次,我們繪制了多種火山圖,供果友們參考。需要注意的是,代碼在實際運行過程中,是需要微調(diào)的,而不是直接拿來就能用的。

library(dplyr)
library(ggplot2)
library(ggprism)
library(ggrepel)
library(patchwork) # 用于圖形排版
library(tibble)
library(tidyverse)
library(ggfun)
library(grid)

data = DM1_vs_CTRL

# 數(shù)據(jù)預(yù)處理函數(shù)
data = rownames_to_column(data, var = "genesymbol")
logFC_threshold = 1.5
pval_threshold = 0.05

# 標記上下調(diào)基因
# 第一步:只創(chuàng)建分組
data <- data %>% 
  mutate(group = case_when(
    adj.P.Val < pval_threshold & logFC > logFC_threshold ~ "Up",
    adj.P.Val < pval_threshold & logFC < -logFC_threshold ~ "Down",
    TRUE ~ "NS"
  ))

# 第二步:轉(zhuǎn)換為因子
data$group <- factor(data$group, levels = c("Up", "NS", "Down"))

# 基礎(chǔ)火山圖函數(shù)
basic_volcano <- function(data, logFC_threshold = 1.5, pval_threshold = 0.05, 
                          title = "Volcano Plot") {
  ggplot(data, aes(x = logFC, y = -log10(adj.P.Val), color = group)) +
    geom_point(aes(color = group), alpha = 0.85, size = 1.5) +
    scale_color_manual(values = c('red', 'gray', 'blue')) +
    geom_vline(xintercept = c(-logFC_threshold, logFC_threshold), 
               linetype = "dashed", color = "black", linewidth = 0.8) +
    geom_hline(yintercept = -log10(pval_threshold), 
               linetype = "dashed", color = "black", linewidth = 0.8) +
    labs(x = "log2 (fold change)", y = "-log10 (adj.P.Val)", title = title) +
    theme_prism(border = TRUE) +
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.position = "right",
      legend.title = element_blank()
    )
}

# 帶標簽的火山圖函數(shù)
labeled_volcano <- function(data, genes_to_label = NULL, 
                            logFC_threshold = 1.5, pval_threshold = 0.05,
                            title = "Volcano Plot with Labels") {

  # 添加標簽列
  if (!is.null(genes_to_label)) {
    data <- data %>%
      mutate(label = ifelse(genesymbol %in% genes_to_label, genesymbol, ""))
  } else {
    data$label <- ifelse(data$adj.P.Val < pval_threshold & 
                           abs(data$logFC) > logFC_threshold, 
                         data$genesymbol, "")
  }

  basic_volcano(data, logFC_threshold, pval_threshold, title) +
    geom_label_repel(
      data = filter(data, label != ""),
      aes(label = label),
      size = 3,
      box.padding = 0.5,
      point.padding = 0.8,
      segment.color = "black",
      show.legend = FALSE,
      max.overlaps = Inf
    )
}

# 美化火山圖函數(shù)
enhanced_volcano <- function(data, top_n = 15, 
                             logFC_threshold = 1.5, pval_threshold = 0.05) {

  # 移除p值為0的基因
  data <- data %>% filter(adj.P.Val != 0)

  # 獲取上下調(diào)top基因
  top_up <- data %>%
    filter(group == "Up") %>%
    arrange(desc(-log10(adj.P.Val))) %>%
    slice(1:top_n)

  top_down <- data %>%
    filter(group == "Down") %>%
    arrange(desc(-log10(adj.P.Val))) %>%
    slice(1:top_n)

  # 繪圖
  ggplot(data, aes(x = logFC, y = -log10(adj.P.Val))) +
    # 背景點
    geom_point(aes(color = logFC, size = -log10(adj.P.Val)), alpha = 0.7) +
    # 突出顯示top基因
    geom_point(
      data = bind_rows(top_up, top_down),
      aes(fill = logFC, size = -log10(adj.P.Val)),
      shape = 21, color = "black", show.legend = FALSE
    ) +
    # 標簽
    geom_text_repel(
      data = top_up,
      aes(label = genesymbol),
      box.padding = 0.5,
      nudge_x = 0.5,
      nudge_y = 0.2,
      segment.curvature = -0.1,
      segment.ncp = 3,
      direction = "y",
      hjust = "left"
    ) +
    geom_text_repel(
      data = top_down,
      aes(label = genesymbol),
      box.padding = 0.5,
      nudge_x = -0.2,
      nudge_y = 0.2,
      segment.curvature = -0.1,
      segment.ncp = 3,
      segment.angle = 20,
      direction = "y",
      hjust = "right"
    ) +
    # 顏色和大小比例尺
    scale_color_gradientn(
      colours = c("#3288bd", "#66c2a5", "#ffffbf", "#f46d43", "#9e0142"),
      values = scales::rescale(c(min(data$logFC), -1, 0, 1, max(data$logFC)))
    ) +
    scale_fill_gradientn(
      colours = c("#3288bd", "#66c2a5", "#ffffbf", "#f46d43", "#9e0142"),
      values = scales::rescale(c(min(data$logFC), -1, 0, 1, max(data$logFC)))
    ) +
    scale_size(range = c(1, 7)) +
    # 參考線
    geom_vline(xintercept = c(-logFC_threshold, logFC_threshold), linetype = "dashed") +
    geom_hline(yintercept = -log10(pval_threshold), linetype = "dashed") +
    # 坐標軸限制
    xlim(c(-max(abs(data$logFC)), max(abs(data$logFC)))) +
    ylim(c(0, max(-log10(data$adj.P.Val)) + 2)) +
    # 注釋和主題
    labs(
      x = "log2 (fold change)",
      y = "-log10 (adj.P.Val)",
      title = "Enhanced Volcano Plot",
      subtitle = paste("Top", top_n, "up/down regulated genes")
    ) +
    theme_bw() +
    theme(
      panel.grid = element_blank(),
      legend.background = element_roundrect(color = "#808080", linetype = 1),
      axis.text = element_text(size = 13, color = "black"),
      axis.title = element_text(size = 15),
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5)
    ) +
    # 添加方向箭頭
    annotation_custom(
      grob = segmentsGrob(
        y0 = unit(-10, "pt"),
        y1 = unit(-10, "pt"),
        arrow = arrow(angle = 45, length = unit(0.2, "cm"), ends = "first"),
        gp = gpar(lwd = 3, col = "#74add1")
      ),
      xmin = -max(abs(data$logFC)),
      xmax = -1,
      ymin = max(-log10(data$adj.P.Val)) + 1,
      ymax = max(-log10(data$adj.P.Val)) + 1
    ) +
    annotation_custom(
      grob = textGrob(label = "Down", gp = gpar(col = "#74add1")),
      xmin = -max(abs(data$logFC)),
      xmax = -1,
      ymin = max(-log10(data$adj.P.Val)) + 1.5,
      ymax = max(-log10(data$adj.P.Val)) + 1.5
    ) +
    annotation_custom(
      grob = segmentsGrob(
        y0 = unit(-10, "pt"),
        y1 = unit(-10, "pt"),
        arrow = arrow(angle = 45, length = unit(0.2, "cm"), ends = "last"),
        gp = gpar(lwd = 3, col = "#d73027")
      ),
      xmin = max(abs(data$logFC)),
      xmax = 1,
      ymin = max(-log10(data$adj.P.Val)) + 1,
      ymax = max(-log10(data$adj.P.Val)) + 1
    ) +
    annotation_custom(
      grob = textGrob(label = "Up", gp = gpar(col = "#d73027")),
      xmin = max(abs(data$logFC)),
      xmax = 1,
      ymin = max(-log10(data$adj.P.Val)) + 1.5,
      ymax = max(-log10(data$adj.P.Val)) + 1.5
    )
}

# MA圖函數(shù)
ma_plot <- function(data, logFC_threshold = 1.5, pval_threshold = 0.05,
                    highlight_genes = NULL) {

  # 如果沒有AveExpr列,使用平均表達量
  if (!"AveExpr" %in% colnames(data)) {
    warning("No 'AveExpr' column found. Using row means if available.")
    if (any(grepl("^GSM", colnames(data)))) {
      expr_cols <- grep("^GSM", colnames(data), value = TRUE)
      data$AveExpr <- rowMeans(data[, expr_cols], na.rm = TRUE)
    } else {
      stop("Cannot calculate average expression. Please provide 'AveExpr' column.")
    }
  }

  # 標記顯著基因
  significant <- data %>%
    filter(adj.P.Val < pval_threshold & abs(logFC) > logFC_threshold)

  ggplot(data, aes(x = log2(AveExpr), y = logFC, color = group)) +
    geom_point(alpha = 0.6, size = 1.5) +
    scale_color_manual(values = c("red", "grey50", "blue")) +
    geom_point(
      data = significant,
      aes(fill = group),
      shape = 21, color = "black", size = 2, alpha = 0.8
    ) +
    geom_hline(yintercept = c(-logFC_threshold, 0, logFC_threshold),
               linetype = c("dashed", "solid", "dashed"),
               linewidth = c(0.8, 1.2, 0.8)) +
    labs(
      x = "log2 (Average Expression)",
      y = "log2 (Fold Change)",
      title = "MA Plot"
    ) +
    theme_bw() +
    theme(
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.line = element_line(colour = "black"),
      legend.position = "top"
    ) +
    {
      if (!is.null(highlight_genes)) {
        geom_text_repel(
          data = filter(data, genesymbol %in% highlight_genes),
          aes(label = genesymbol),
          box.padding = 0.5,
          max.overlaps = Inf
        )
      }
    }
}

# 使用示例
# 1. 準備數(shù)據(jù)
data_prepared <- data

# 2. 創(chuàng)建各種圖形
p1 <- basic_volcano(data_prepared, title = "Basic Volcano Plot")
p2 <- labeled_volcano(data_prepared, genes_to_label = c("CD36", "CX3CR1", "MAFF", "ACTN3"))
p3 <- enhanced_volcano(data_prepared)
p4 <- ma_plot(data_prepared, highlight_genes = c("CD36", "CX3CR1", "MAFF", "ACTN3"))

# 3. 組合圖形
(p1 + p2) 
(p3 + p4)

參考資料:https://mp.weixin.qq.com/s/RJhEX2GiF720_GQmjwcIlQ