在生信分析中,|log2FC| >1,p<0.05就是我們做分析想要的結(jié)果。不管是用GEO數(shù)據(jù)、TCGA數(shù)據(jù)等做分析,甚至是自己花錢做單細胞、多組學(xué),沒有差異也是枉然。在實驗中,表達差異和表型驗證,也要看到基因表達對表型的影響具有統(tǒng)計學(xué)差異。我們總結(jié)說過, 生信分析的數(shù)據(jù)可視化的方式要根據(jù)待展示差異基因的數(shù)目而定 。
如果待展示的基因較少,柱狀圖、箱線圖、散點圖、小提琴圖等都可以(10^1數(shù)量級);如果待展示的差異基因數(shù)為幾十個甚至上百個,首選熱圖(10^2數(shù)量級);如果待展示的基因數(shù)目已經(jīng)幾百,甚至幾千個(10^3數(shù)量級及以上),那么火山圖絕對是首選。
在銅死亡論文中,圖A(左)篩選流程,比較兩種銅離子載體的共同耐藥基因。圖A(右)是篩選結(jié)果,藍色標記是保護性基因(敲除后細胞存活),紅色標記是敏感基因(敲除后細胞更易死亡)。FDX1是最高置信度的基因,表明其在銅死亡中起核心作用。圖B和圖C火山圖展示篩選結(jié)果,橫軸是基因敲除效應(yīng)(log fold change),縱軸是統(tǒng)計顯著性。FDX1、LIAS、LIPT1、DLD等基因位于右上角,表明它們對銅死亡至關(guān)重要。
差異基因火山圖。關(guān)鍵基因:下調(diào)基因(胰腺癌 vs 健康對照):Robo1(軸突導(dǎo)向)、Mir6236(非編碼RNA)、Lin28b(干細胞調(diào)控)。上調(diào)基因:Gal(神經(jīng)肽)、Erbb3(生長因子受體)、Mif(免疫調(diào)節(jié)因子)。這篇Nature文章是以橫軸火山圖來展示的。
好看的火山圖真的很多。這里,上次我們用DeepSeek對GEO芯片數(shù)據(jù)處理,并進行差異分析和初步火山圖繪制。這次,我們繪制了多種火山圖,供果友們參考。需要注意的是,代碼在實際運行過程中,是需要微調(diào)的,而不是直接拿來就能用的。
library(dplyr)
library(ggplot2)
library(ggprism)
library(ggrepel)
library(patchwork) # 用于圖形排版
library(tibble)
library(tidyverse)
library(ggfun)
library(grid)
data = DM1_vs_CTRL
# 數(shù)據(jù)預(yù)處理函數(shù)
data = rownames_to_column(data, var = "genesymbol")
logFC_threshold = 1.5
pval_threshold = 0.05
# 標記上下調(diào)基因
# 第一步:只創(chuàng)建分組
data <- data %>%
mutate(group = case_when(
adj.P.Val < pval_threshold & logFC > logFC_threshold ~ "Up",
adj.P.Val < pval_threshold & logFC < -logFC_threshold ~ "Down",
TRUE ~ "NS"
))
# 第二步:轉(zhuǎn)換為因子
data$group <- factor(data$group, levels = c("Up", "NS", "Down"))
# 基礎(chǔ)火山圖函數(shù)
basic_volcano <- function(data, logFC_threshold = 1.5, pval_threshold = 0.05,
title = "Volcano Plot") {
ggplot(data, aes(x = logFC, y = -log10(adj.P.Val), color = group)) +
geom_point(aes(color = group), alpha = 0.85, size = 1.5) +
scale_color_manual(values = c('red', 'gray', 'blue')) +
geom_vline(xintercept = c(-logFC_threshold, logFC_threshold),
linetype = "dashed", color = "black", linewidth = 0.8) +
geom_hline(yintercept = -log10(pval_threshold),
linetype = "dashed", color = "black", linewidth = 0.8) +
labs(x = "log2 (fold change)", y = "-log10 (adj.P.Val)", title = title) +
theme_prism(border = TRUE) +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "right",
legend.title = element_blank()
)
}
# 帶標簽的火山圖函數(shù)
labeled_volcano <- function(data, genes_to_label = NULL,
logFC_threshold = 1.5, pval_threshold = 0.05,
title = "Volcano Plot with Labels") {
# 添加標簽列
if (!is.null(genes_to_label)) {
data <- data %>%
mutate(label = ifelse(genesymbol %in% genes_to_label, genesymbol, ""))
} else {
data$label <- ifelse(data$adj.P.Val < pval_threshold &
abs(data$logFC) > logFC_threshold,
data$genesymbol, "")
}
basic_volcano(data, logFC_threshold, pval_threshold, title) +
geom_label_repel(
data = filter(data, label != ""),
aes(label = label),
size = 3,
box.padding = 0.5,
point.padding = 0.8,
segment.color = "black",
show.legend = FALSE,
max.overlaps = Inf
)
}
# 美化火山圖函數(shù)
enhanced_volcano <- function(data, top_n = 15,
logFC_threshold = 1.5, pval_threshold = 0.05) {
# 移除p值為0的基因
data <- data %>% filter(adj.P.Val != 0)
# 獲取上下調(diào)top基因
top_up <- data %>%
filter(group == "Up") %>%
arrange(desc(-log10(adj.P.Val))) %>%
slice(1:top_n)
top_down <- data %>%
filter(group == "Down") %>%
arrange(desc(-log10(adj.P.Val))) %>%
slice(1:top_n)
# 繪圖
ggplot(data, aes(x = logFC, y = -log10(adj.P.Val))) +
# 背景點
geom_point(aes(color = logFC, size = -log10(adj.P.Val)), alpha = 0.7) +
# 突出顯示top基因
geom_point(
data = bind_rows(top_up, top_down),
aes(fill = logFC, size = -log10(adj.P.Val)),
shape = 21, color = "black", show.legend = FALSE
) +
# 標簽
geom_text_repel(
data = top_up,
aes(label = genesymbol),
box.padding = 0.5,
nudge_x = 0.5,
nudge_y = 0.2,
segment.curvature = -0.1,
segment.ncp = 3,
direction = "y",
hjust = "left"
) +
geom_text_repel(
data = top_down,
aes(label = genesymbol),
box.padding = 0.5,
nudge_x = -0.2,
nudge_y = 0.2,
segment.curvature = -0.1,
segment.ncp = 3,
segment.angle = 20,
direction = "y",
hjust = "right"
) +
# 顏色和大小比例尺
scale_color_gradientn(
colours = c("#3288bd", "#66c2a5", "#ffffbf", "#f46d43", "#9e0142"),
values = scales::rescale(c(min(data$logFC), -1, 0, 1, max(data$logFC)))
) +
scale_fill_gradientn(
colours = c("#3288bd", "#66c2a5", "#ffffbf", "#f46d43", "#9e0142"),
values = scales::rescale(c(min(data$logFC), -1, 0, 1, max(data$logFC)))
) +
scale_size(range = c(1, 7)) +
# 參考線
geom_vline(xintercept = c(-logFC_threshold, logFC_threshold), linetype = "dashed") +
geom_hline(yintercept = -log10(pval_threshold), linetype = "dashed") +
# 坐標軸限制
xlim(c(-max(abs(data$logFC)), max(abs(data$logFC)))) +
ylim(c(0, max(-log10(data$adj.P.Val)) + 2)) +
# 注釋和主題
labs(
x = "log2 (fold change)",
y = "-log10 (adj.P.Val)",
title = "Enhanced Volcano Plot",
subtitle = paste("Top", top_n, "up/down regulated genes")
) +
theme_bw() +
theme(
panel.grid = element_blank(),
legend.background = element_roundrect(color = "#808080", linetype = 1),
axis.text = element_text(size = 13, color = "black"),
axis.title = element_text(size = 15),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
) +
# 添加方向箭頭
annotation_custom(
grob = segmentsGrob(
y0 = unit(-10, "pt"),
y1 = unit(-10, "pt"),
arrow = arrow(angle = 45, length = unit(0.2, "cm"), ends = "first"),
gp = gpar(lwd = 3, col = "#74add1")
),
xmin = -max(abs(data$logFC)),
xmax = -1,
ymin = max(-log10(data$adj.P.Val)) + 1,
ymax = max(-log10(data$adj.P.Val)) + 1
) +
annotation_custom(
grob = textGrob(label = "Down", gp = gpar(col = "#74add1")),
xmin = -max(abs(data$logFC)),
xmax = -1,
ymin = max(-log10(data$adj.P.Val)) + 1.5,
ymax = max(-log10(data$adj.P.Val)) + 1.5
) +
annotation_custom(
grob = segmentsGrob(
y0 = unit(-10, "pt"),
y1 = unit(-10, "pt"),
arrow = arrow(angle = 45, length = unit(0.2, "cm"), ends = "last"),
gp = gpar(lwd = 3, col = "#d73027")
),
xmin = max(abs(data$logFC)),
xmax = 1,
ymin = max(-log10(data$adj.P.Val)) + 1,
ymax = max(-log10(data$adj.P.Val)) + 1
) +
annotation_custom(
grob = textGrob(label = "Up", gp = gpar(col = "#d73027")),
xmin = max(abs(data$logFC)),
xmax = 1,
ymin = max(-log10(data$adj.P.Val)) + 1.5,
ymax = max(-log10(data$adj.P.Val)) + 1.5
)
}
# MA圖函數(shù)
ma_plot <- function(data, logFC_threshold = 1.5, pval_threshold = 0.05,
highlight_genes = NULL) {
# 如果沒有AveExpr列,使用平均表達量
if (!"AveExpr" %in% colnames(data)) {
warning("No 'AveExpr' column found. Using row means if available.")
if (any(grepl("^GSM", colnames(data)))) {
expr_cols <- grep("^GSM", colnames(data), value = TRUE)
data$AveExpr <- rowMeans(data[, expr_cols], na.rm = TRUE)
} else {
stop("Cannot calculate average expression. Please provide 'AveExpr' column.")
}
}
# 標記顯著基因
significant <- data %>%
filter(adj.P.Val < pval_threshold & abs(logFC) > logFC_threshold)
ggplot(data, aes(x = log2(AveExpr), y = logFC, color = group)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("red", "grey50", "blue")) +
geom_point(
data = significant,
aes(fill = group),
shape = 21, color = "black", size = 2, alpha = 0.8
) +
geom_hline(yintercept = c(-logFC_threshold, 0, logFC_threshold),
linetype = c("dashed", "solid", "dashed"),
linewidth = c(0.8, 1.2, 0.8)) +
labs(
x = "log2 (Average Expression)",
y = "log2 (Fold Change)",
title = "MA Plot"
) +
theme_bw() +
theme(
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.position = "top"
) +
{
if (!is.null(highlight_genes)) {
geom_text_repel(
data = filter(data, genesymbol %in% highlight_genes),
aes(label = genesymbol),
box.padding = 0.5,
max.overlaps = Inf
)
}
}
}
# 使用示例
# 1. 準備數(shù)據(jù)
data_prepared <- data
# 2. 創(chuàng)建各種圖形
p1 <- basic_volcano(data_prepared, title = "Basic Volcano Plot")
p2 <- labeled_volcano(data_prepared, genes_to_label = c("CD36", "CX3CR1", "MAFF", "ACTN3"))
p3 <- enhanced_volcano(data_prepared)
p4 <- ma_plot(data_prepared, highlight_genes = c("CD36", "CX3CR1", "MAFF", "ACTN3"))
# 3. 組合圖形
(p1 + p2)
(p3 + p4)
參考資料:https://mp.weixin.qq.com/s/RJhEX2GiF720_GQmjwcIlQ
熱門跟貼