【原】R語言ggplot2繪制箱線圖

醫(yī)科研 2021-01-25

展開全文

歡迎來到醫(yī)科研，這里是白介素2的讀書筆記，跟我一起聊臨床與科研的故事, 生物醫(yī)學數據挖掘，R語言，TCGA、GEO數據挖掘。

R語言ggplot2繪制箱線圖

差異基因與基因交集

if(T){
require(tidyverse)
AS_gene<-read.table("F:/Bioinfor_project/Breast/AS_research/AS/result/BRCA_marker_cox_single_all.txt",header =T,sep = "\t") 
AS_gene[1:5,1:5]
final_DEmRNA<-read.csv("F:/Bioinfor_project/Breast/AS_research/AS/result/TNBC_DEmRNA.csv",header=T,row.names = 1)
DEG_2<-final_DEmRNA %>% 
  rownames_to_column("genename") 
hubgene<-intersect(AS_gene$symbol,DEG_2$genename)
}

清洗數據-進一步探索

if(T){
require(dplyr)
## 提取已標準化的數據
mRNA_exp<-read.csv(file="F:/Bioinfor_project/Breast/AS_research/AS/result/TNBC_mRNA_exp_log.csv",header = T,row.names = 1)
load("F:/Bioinfor_project/Breast/TCGA/RNA-seq-model/BRCA_all.Rdata")
BRCA_all[1:5,1:5]
dim(BRCA_all)## 24491 1200
head(phe)
colnames(phe)

##
mRNA_exp[1:5,1:5]
dim(mRNA_exp)
data_exp<-mRNA_exp[hubgene,]
## 提取匹配的臨床信息
data_phe<-phe %>%
  filter(ID %in% colnames(mRNA_exp))

## 合并臨床信息與表達數據
data<-data_exp %>% t() %>% as.data.frame() %>% 
    ##行名變列名
    rownames_to_column(var = "ID") %>% 
    as_tibble() %>% 
    ## 合并臨床信息與表達數據
    inner_join(data_phe,by="ID") %>% 
    ## 增加分組信息
    mutate(group=ifelse(substring(colnames(mRNA_exp),14,15)=="01","TNBC","Normal")) %>% 
    ## 分組提前
    dplyr::select(ID,group,everything())
group=factor(c(rep(1,115),rep(0,113)))
head(data)  
#save(data,file = "F:/Bioinfor_project/Breast/AS_research/AS/result/hubgene.Rdata")
}
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>

載入數據

數據清洗-gather-spread

load(file = "F:/Bioinfor_project/Breast/AS_research/AS/result/hubgene.Rdata")
head(data)
## # A tibble: 6 x 22
##   ID    group CCL14  HBA1 CCL16 TUBB3 PAM50 Os_time OS_event RFS_time
##   <chr> <chr> <dbl> <dbl> <dbl> <dbl> <fct>   <int>    <int>    <int>
## 1 TCGA~ TNBC   3.37  0     0     5.94 Basal     967        1       NA
## 2 TCGA~ TNBC   4.97  3.39  0     6.00 Basal     584        0       NA
## 3 TCGA~ TNBC   4.77  0     0     5.04 Basal    2654        0       NA
## 4 TCGA~ TNBC   4.19  3.74  2.84  5.13 Basal     754        1       NA
## 5 TCGA~ TNBC   0     0     0     5.65 Basal    2048        0     2048
## 6 TCGA~ TNBC   5.34  0     4.07  6.04 Basal    1027        0     1027
## # ... with 12 more variables: RFS_event <int>, age <int>, ER <fct>,
## #   PR <fct>, gender <fct>, HER2 <fct>, Margin_status <fct>, Node <int>,
## #   M_stage <fct>, N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>
require(cowplot)

調整清洗數據至想要的格式

require(tidyverse)
require(ggplot2)
require(ggsci)
## Loading required package: ggsci
mydata<-data %>% 
  ## 基因表達數據gather,gather的范圍應調整
  gather(key="gene",value="Expression",CCL14:TUBB3) %>% 
  ##
  dplyr::select(ID,gene,Expression,everything()) 
head(mydata)  
## # A tibble: 6 x 20
##   ID    gene  Expression group PAM50 Os_time OS_event RFS_time RFS_event
##   <chr> <chr>      <dbl> <chr> <fct>   <int>    <int>    <int>     <int>
## 1 TCGA~ CCL14       3.37 TNBC  Basal     967        1       NA        NA
## 2 TCGA~ CCL14       4.97 TNBC  Basal     584        0       NA        NA
## 3 TCGA~ CCL14       4.77 TNBC  Basal    2654        0       NA        NA
## 4 TCGA~ CCL14       4.19 TNBC  Basal     754        1       NA        NA
## 5 TCGA~ CCL14       0    TNBC  Basal    2048        0     2048         0
## 6 TCGA~ CCL14       5.34 TNBC  Basal    1027        0     1027         0
## # ... with 11 more variables: age <int>, ER <fct>, PR <fct>, gender <fct>,
## #   HER2 <fct>, Margin_status <fct>, Node <int>, M_stage <fct>,
## #   N_stage <fct>, T_stage <fct>, `Pathologic stage` <fct>

繪制簡單箱線圖

P<-mydata %>% 
   ## 確定x,y
   ggplot(aes(x = gene, y = Expression, fill = group)) +
        geom_boxplot(alpha=0.7) +
        scale_y_continuous(name = "Expression")+
        scale_x_discrete(name = "Gene") +
        ggtitle("Boxplot of hub gene") +
        theme_bw() +
        theme(plot.title = element_text(size = 14, face =  "bold"),
              text = element_text(size = 12),
              axis.title = element_text(face="bold"),
              axis.text.x=element_text(size = 11)) 
P

ggsci調色

調用scale_fill_lancet，調色lancet配色

p2<-P+scale_fill_lancet()
p2
#ggsave(p2,filename = "hub_gene_boxplot.pdf",width = 5,height = 5)

分面圖形

facet_wrap()

P<-mydata %>% 
   ## 確定x,y
   ggplot(aes(x = group, y = Expression, fill = group)) +
        geom_boxplot(alpha=0.7) +
        scale_y_continuous(name = "Expression")+
        scale_x_discrete(name = "group") +
        ggtitle("Boxplot of hub gene") +
        theme_bw() +
        theme(plot.title = element_text(size = 14, face =  "bold"),
              text = element_text(size = 12),
              axis.title = element_text(face="bold"),
              axis.text.x=element_text(size = 11)) 
P

p3<-P+scale_fill_lancet()+facet_wrap(~gene)
p3
#ggsave(p3,filename = "hub_gene_boxplot_facet.pdf",width = 5,height = 5)

加入其它變量并分面-PAM50

p<-mydata %>% 
  ## 篩選數據,選定x,y
  filter(group=="TNBC" & PAM50!="NA") %>% 
  ggplot(aes(x = PAM50, y = Expression, fill = PAM50)) +
        geom_boxplot(alpha=0.7) +
        scale_y_continuous(name = "Expression")+
        scale_x_discrete(name = "PAM50") +
        ggtitle("Boxplot of hub gene") +
        theme_bw() +
        theme(plot.title = element_text(size = 14, face =  "bold"),
              text = element_text(size = 12),
              axis.title = element_text(face="bold"),
              axis.text.x=element_text(size = 11)) 
p+scale_fill_lancet()+
  facet_wrap(~gene)##分面變量

for循環(huán)實現批量繪圖

for循環(huán)：命名空對象list 配合do.call函數

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
p <- list()
genename<-hubgene
for (j in genename) {
  p[[j]] <- ggplot(data=data, aes_string(x=group, y=j)) + 
            geom_boxplot(alpha=0.7,aes(fill=group)) +
            scale_y_continuous(name = "Expression")+
            scale_x_discrete(name = j) +    
            ggtitle(paste0("boxlot of ",j) ) +##ggtitle指定
            theme_bw() +
            theme(plot.title = element_text(size = 14, face =  "bold"),
            text = element_text(size = 12),
            axis.title = element_text(face="bold"),
            axis.text.x=element_text(size = 11))+
            scale_fill_lancet()
  #ggsave(sprintf("boxplot%s.pdf",j),width=5,height=5,onefile=T)
}
do.call(grid.arrange, c(p, ncol=2))