This document composes different stages of the differential transcript expression and differential transcript usage analysis protocol and displays the expected outputs when running on the test data we provided.
Except for the packages required by the protocol, we use an addition
package tictoc to calculate the time required by each stage
that runs R code. The elapsed time of each R-based stage is printed at
the end of each stage.
All code shown in the report is imported from scripts for each stage. Code in Stage 0 and Stage 1 are UNIX commands and should be run in the terminal, while code in Stages 2-6 are R code and should be run in R.
This stage should be run in bash. Here, we only display the commands.
# run the next line if gffread and salmon haven't been added to path; replace <$1> with the correct directory
# export PATH="$1/software/bin/:$PATH"
# 1. Prepare transcriptome sequence. Extract the transcript sequences of all transcripts in the gene annotation GFF file using GffRead
gffread -w data/reference/transcripts.fa \
-g data/reference/chm13v2.0_maskedY.fa \
data/reference/chm13.draft_v2.0.gene_annotation.gff3
# 2. Combine transcriptome and genome sequence in a new file as gentrome
cat data/reference/transcripts.fa \
data/reference/chm13v2.0_maskedY.fa \
> data/reference/gentrome.fa
# 3. Extract the name of the genome targets to create a list of the decoys
grep "^>" data/reference/chm13v2.0_maskedY.fa > data/reference/decoys.txt
cut -d " " -f 1 data/reference/decoys.txt > data/reference/decoys.tmp
sed 's/^>//' data/reference/decoys.tmp > data/reference/decoys.txt
rm data/reference/decoys.tmp
# 4. Build decoy-aware transcriptome index
salmon index -t data/reference/gentrome.fa \
-d data/reference/decoys.txt \
-p 8 \
-i data/reference/salmon_index
This stage should be run in bash. Here, we only display the commands.
# run the next line if salmon hasn't been added to path; replace <$1> with the correct directory
# export PATH="$1/software/bin/:$PATH"
for SAMPLE in SRR1428605{7..9} SRR14286066 SRR14286069 SRR31761441_merged; do
mkdir -p results/salmon_output/$SAMPLE
salmon quant -i data/reference/salmon_index\
-l A \
-1 data/reads/$SAMPLE\_1.fastq.gz \
-2 data/reads/$SAMPLE\_2.fastq.gz \
-o results/salmon_output/$SAMPLE \
-p 8 \
--numGibbsSamples 50
done
if (!requireNamespace("tictoc", quietly = TRUE)) {
install.packages("tictoc")
}
library(tictoc)
tic()
library(rtracklayer)
gff <- import("data/reference/chm13.draft_v2.0.gene_annotation.gff3")
transcripts <- gff[gff$type == "transcript", ]
transcript_annotation <- data.frame(
gene_id = transcripts$gene_id,
transcript_id = transcripts$ID,
ensembl = substr(transcripts$source_transcript, 1, 15),
gene = substr(transcripts$source_gene, 1, 15),
symbol = transcripts$gene_name,
gene_biotype = transcripts$gene_biotype,
transcript_biotype = transcripts$transcript_biotype,
chromosome = as.character(seqnames(transcripts))
)
symbols <- unlist(
lapply(split(transcript_annotation, transcript_annotation$gene_id),
function(df) {
x <- df$symbol
if (length(unique(x)) > 1) {
unique(x[!grepl("^MSTRG", x)])[1]
} else {
unique(x)
}
})
)
transcript_annotation$symbol <- symbols[match(transcript_annotation$gene_id,
names(symbols))]
write.csv(transcript_annotation, "data/reference/transcript_annotation.csv",
row.names = FALSE)
toc()
## 127.317 sec elapsed
tic()
library(edgeR)
library(limma)
salmon_dir <- "results/salmon_output"
samples <- list.files(salmon_dir)
samples
## [1] "SRR14286057" "SRR14286058" "SRR14286059"
## [4] "SRR14286066" "SRR14286069" "SRR31761441_merged"
targets <- read.delim("data/targets.txt", row.names = 1)
targets
## GEO SRA CellLine Replicate
## H1975-1 GSM5255695 SRR14286057 H1975 1
## H1975-2 GSM5255696 SRR14286058 H1975 2
## H1975-3 GSM5255697 SRR14286059 H1975 3
## HCC827-1 GSM5255704 SRR14286066 HCC827 1
## HCC827-3 GSM5255706 SRR14286069 HCC827 3
## HCC827-2 GSM5255705 SRR31761441, SRR31761442 HCC827 2
counts <- catchSalmon(file.path(salmon_dir, samples))
## Reading results/salmon_output/SRR14286057, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286058, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286059, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286066, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286069, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR31761441_merged, 232530 transcripts, 50 gibbs samples
y <- DGEList(counts = counts$counts / counts$annotation$Overdispersion,
genes = counts$annotation, group = factor(targets$CellLine))
colnames(y) <- rownames(targets)
transcript_annotation <- read.csv("data/reference/transcript_annotation.csv")
m <- match(rownames(y), transcript_annotation$transcript_id)
y$genes <- cbind(y$genes, transcript_annotation[m, ])
filt <- filterByExpr(y, min.count = 3, min.total.count = 10)
table(filt)
## filt
## FALSE TRUE
## 178225 54305
y <- y[filt, , keep.lib.sizes=FALSE]
keep <- !grepl("pseudogene", y$genes$gene_biotype)
table(keep)
## keep
## FALSE TRUE
## 1546 52759
y <- y[keep, , keep.lib.sizes = FALSE]
y <- normLibSizes(y)
y$samples
## group lib.size norm.factors
## H1975-1 H1975 14519929 1.150
## H1975-2 H1975 15690030 1.186
## H1975-3 H1975 13082720 1.175
## HCC827-1 HCC827 15359554 0.929
## HCC827-3 HCC827 15065879 0.832
## HCC827-2 HCC827 65654638 0.807
col <- c("red", "blue")
# Figure 3a
# pdf("results/figure/MDS.pdf", height = 6, width = 6)
plotMDS(y, col = col[as.numeric(y$samples$group)], pch = 16)
legend("topleft", levels(y$samples$group), text.col = col)
# dev.off()
toc()
## 5.735 sec elapsed
tic()
# Data preprocess using step 3 script
# source("workflow/3_count_preprocess.R")
# create design
design <- model.matrix(~ 0 + y$samples$group)
colnames(design) <- sub("y$samples$group", "", colnames(design), fixed = TRUE)
design
## H1975 HCC827
## 1 1 0
## 2 1 0
## 3 1 0
## 4 0 1
## 5 0 1
## 6 0 1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`y$samples$group`
## [1] "contr.treatment"
# model fitting
y <- estimateDisp(y, design = design)
# Figure 3b
# pdf("results/figure/BCV.pdf", height = 4, width = 5)
# par(mgp = c(2, 1, 0))
plotBCV(y)
# dev.off()
fit <- glmQLFit(y, design = design)
# Figure 3c
# pdf("results/figure/QL.pdf", height = 4, width = 5)
# par(mgp = c(2, 1, 0))
plotQLDisp(fit)
# dev.off()
toc()
## 9.219 sec elapsed
tic()
# model fitting using step 4 script
# source("workflow/4_fit_model.R")
# test for specific comparisons
contrast <- makeContrasts(HCC827 - H1975, levels = design)
res <- glmQLFTest(fit, contrast = contrast)
# test for DE
is_de <- decideTests(res)
summary(is_de)
## -1*H1975 1*HCC827
## Down 14644
## NotSig 25159
## Up 12956
# top DE results
topTags(res)
## Coefficient: -1*H1975 1*HCC827
## Length EffectiveLength Overdispersion gene_id
## CHM13_T0082563 704 513 1.00 CHM13_G0021776
## CHM13_T0201110 3165 2974 1.01 CHM13_G0051570
## CHM13_T0139266 1889 1698 3.59 CHM13_G0035163
## CHM13_T0059911 3701 3510 1.37 CHM13_G0015985
## CHM13_T0055092 1805 1614 1.02 CHM13_G0014390
## CHM13_T0060257 1676 1485 1.23 CHM13_G0016039
## CHM13_T0057475 6243 6052 1.07 CHM13_G0015147
## CHM13_T0113733 3504 3296 2.29 CHM13_G0028818
## CHM13_T0144141 2716 2525 2.06 CHM13_G0036585
## CHM13_T0112035 1460 1269 1.03 CHM13_G0028503
## transcript_id ensembl gene symbol
## CHM13_T0082563 CHM13_T0082563 ENST00000306061 ENSG00000169715 MT1E
## CHM13_T0201110 CHM13_T0201110 ENST00000223095 ENSG00000106366 SERPINE1
## CHM13_T0139266 CHM13_T0139266 ENST00000372431 ENSG00000100979 PLTP
## CHM13_T0059911 CHM13_T0059911 ENST00000311852 ENSG00000157227 MMP14
## CHM13_T0055092 CHM13_T0055092 ENST00000655811 ENSG00000287530 AL445985.1
## CHM13_T0060257 CHM13_T0060257 ENST00000250383 ENSG00000100867 DHRS2
## CHM13_T0057475 CHM13_T0057475 ENST00000377474 ENSG00000178695 KCTD12
## CHM13_T0113733 CHM13_T0113733 ENST00000221992 ENSG00000105388 CEACAM5
## CHM13_T0144141 CHM13_T0144141 ENST00000429300 ENSG00000100033 PRODH
## CHM13_T0112035 CHM13_T0112035 ENST00000004982 ENSG00000004776 HSPB6
## gene_biotype transcript_biotype chromosome logFC logCPM F
## CHM13_T0082563 protein_coding protein_coding chr16 -12.13 6.96 2522
## CHM13_T0201110 protein_coding protein_coding chr7 -8.48 8.46 3238
## CHM13_T0139266 protein_coding protein_coding chr20 -11.30 5.78 1672
## CHM13_T0059911 protein_coding protein_coding chr14 -8.86 7.19 2739
## CHM13_T0055092 lncRNA lncRNA chr13 -9.16 4.73 1403
## CHM13_T0060257 protein_coding protein_coding chr14 -6.47 9.34 2091
## CHM13_T0057475 protein_coding protein_coding chr13 -6.83 6.91 2067
## CHM13_T0113733 protein_coding protein_coding chr19 10.02 4.91 990
## CHM13_T0144141 protein_coding retained_intron chr22 9.77 4.84 1002
## CHM13_T0112035 protein_coding protein_coding chr19 -8.14 5.52 1845
## PValue FDR
## CHM13_T0082563 1.08e-20 5.69e-16
## CHM13_T0201110 8.11e-19 1.49e-14
## CHM13_T0139266 8.45e-19 1.49e-14
## CHM13_T0059911 3.07e-18 4.05e-14
## CHM13_T0055092 9.23e-18 9.73e-14
## CHM13_T0060257 2.05e-17 1.68e-13
## CHM13_T0057475 2.24e-17 1.68e-13
## CHM13_T0113733 2.54e-17 1.68e-13
## CHM13_T0144141 3.87e-17 2.27e-13
## CHM13_T0112035 5.11e-17 2.70e-13
# save complete DTE results table into TSV file
tt <- topTags(res, n = Inf)
write.table(tt, file = "results/DTE_results.tsv", sep = "\t")
# Figure 4a
# pdf("results/figure/MD_small.pdf", height = 5, width = 4)
plotMD(res, main = "HCC827 vs H1975", cex = 0.3)
# dev.off()
# volcano plot
library(ggplot2)
# library(ggrepel)
fdr_cutoff <- 0.05
tt$table$DE_status <- ifelse(tt$table$FDR > fdr_cutoff, "NotSig",
ifelse(tt$table$logFC > 0, "Up", "Down"))
# Create a column for highlighting top 2 DE transcripts
tt$table$top2 <- ""
tt$table$top2[1:2] <- tt$table$transcript_id[1:2]
# pdf("results/figure/volcano_small.pdf", height = 5, width = 4)
ggplot(tt$table, aes(x = logFC, y = -log10(FDR), label = top2)) +
geom_point(aes(colour = DE_status), size = 0.3) +
geom_text(hjust = -0.05) +
scale_colour_manual(values = c(
"Up" = "red",
"Down" = "blue",
"NotSig" = "black"
)) +
geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed")+
theme_classic()
# dev.off()
toc()
## 3.023 sec elapsed
tic()
# model fitting using step 4 script
# source("workflow/4_fit_model.R")
# test for specific comparisons
# make contrast
contrast <- makeContrasts(HCC827 - H1975, levels = design)
# test for differential transcript usage for the specified contrast
ds <- diffSplice(fit, contrast = contrast,
geneid = "gene_id", exonid = "transcript_id")
## Total number of exons: 52759
## Total number of genes: 17364
## Number of genes with 1 exon: 6161
## Mean number of exons in a gene: 3
## Max number of exons in a gene: 30
# generate a list of all differentially spliced genes (simes test)
ts_simes <- topSplice(ds, test = "simes", number = Inf)
# print the number of significant differentially spliced genes (simes test)
table(ts_simes$FDR < 0.05)
##
## FALSE TRUE
## 7775 3428
# print top differentially spliced genes (simes test)
topSplice(ds, test = "simes")
## gene_id gene symbol gene_biotype
## CHM13_G0047330 CHM13_G0047330 ENSG00000206503 HLA-A protein_coding
## CHM13_G0029617 CHM13_G0029617 ENSG00000170889 RPS9 protein_coding
## CHM13_G0031869 CHM13_G0031869 ENSG00000115641 FHL2 protein_coding
## CHM13_G0029793 CHM13_G0029793 ENSG00000166770 ZNF667-AS1 lncRNA
## CHM13_G0029891 CHM13_G0029891 ENSG00000121413 ZSCAN18 protein_coding
## CHM13_G0009404 CHM13_G0009404 ENSG00000168003 SLC3A2 protein_coding
## CHM13_G0029458 CHM13_G0029458 ENSG00000221923 ZNF880 protein_coding
## CHM13_G0012439 CHM13_G0012439 ENSG00000197111 PCBP2 protein_coding
## CHM13_G0009145 CHM13_G0009145 ENSG00000198561 CTNND1 protein_coding
## CHM13_G0027857 CHM13_G0027857 ENSG00000167460 TPM4 protein_coding
## chromosome NExons P.Value FDR
## CHM13_G0047330 chr6 6 2.89e-29 3.23e-25
## CHM13_G0029617 chr19 11 8.17e-28 4.58e-24
## CHM13_G0031869 chr2 7 9.84e-27 3.40e-23
## CHM13_G0029793 chr19 9 1.22e-26 3.40e-23
## CHM13_G0029891 chr19 9 1.93e-22 3.72e-19
## CHM13_G0009404 chr11 13 1.99e-22 3.72e-19
## CHM13_G0029458 chr19 5 4.20e-22 6.73e-19
## CHM13_G0012439 chr12 15 7.00e-22 9.80e-19
## CHM13_G0009145 chr11 13 1.80e-21 2.24e-18
## CHM13_G0027857 chr19 9 2.78e-21 3.11e-18
# generate a list of all differentially spliced transcripts
ts_transcript <- topSplice(ds, test = "t", number = Inf)
# print the number of significant differentially spliced transcripts
table(ts_transcript$FDR < 0.05)
##
## FALSE TRUE
## 39311 7287
# print top differentially spliced transcripts
topSplice(ds, test = "t")
## Length EffectiveLength Overdispersion gene_id
## CHM13_T0185741 1854 1663 5.31 CHM13_G0047330
## CHM13_T0117294 772 581 7.64 CHM13_G0029617
## CHM13_T0118000 1503 1312 6.01 CHM13_G0029793
## CHM13_T0126251 1601 1410 3.93 CHM13_G0031869
## CHM13_T0034581 2161 1970 3.99 CHM13_G0009404
## CHM13_T0118392 1766 1575 1.00 CHM13_G0029891
## CHM13_T0047324 1786 1595 1.43 CHM13_G0012439
## CHM13_T0116766 2026 1835 1.02 CHM13_G0029458
## CHM13_T0033511 5340 5149 34.99 CHM13_G0009145
## CHM13_T0109301 2598 2407 13.38 CHM13_G0027857
## transcript_id ensembl gene symbol
## CHM13_T0185741 CHM13_T0185741 ENST00000376806 ENSG00000206503 HLA-A
## CHM13_T0117294 CHM13_T0117294 ENST00000391752 ENSG00000170889 RPS9
## CHM13_T0118000 CHM13_T0118000 ENST00000654382 ENSG00000166770 ZNF667-AS1
## CHM13_T0126251 CHM13_T0126251 ENST00000409807 ENSG00000115641 FHL2
## CHM13_T0034581 CHM13_T0034581 ENST00000377890 ENSG00000168003 SLC3A2
## CHM13_T0118392 CHM13_T0118392 ENST00000595944 ENSG00000121413 ZSCAN18
## CHM13_T0047324 CHM13_T0047324 ENST00000553064 ENSG00000197111 PCBP2
## CHM13_T0116766 CHM13_T0116766 ENST00000422689 ENSG00000221923 ZNF880
## CHM13_T0033511 CHM13_T0033511 ENST00000673683 ENSG00000198561 CTNND1
## CHM13_T0109301 CHM13_T0109301 ENST00000646974 ENSG00000167460 TPM4
## gene_biotype transcript_biotype chromosome logFC t
## CHM13_T0185741 protein_coding protein_coding chr6 6.36 26.1
## CHM13_T0117294 protein_coding protein_coding chr19 -7.77 -20.0
## CHM13_T0118000 lncRNA lncRNA chr19 -7.51 -19.8
## CHM13_T0126251 protein_coding protein_coding chr2 -4.54 -25.5
## CHM13_T0034581 protein_coding protein_coding chr11 3.77 14.8
## CHM13_T0118392 protein_coding protein_coding chr19 -6.39 -16.5
## CHM13_T0047324 protein_coding protein_coding chr12 2.05 13.9
## CHM13_T0116766 protein_coding protein_coding chr19 -3.12 -19.7
## CHM13_T0033511 protein_coding protein_coding chr11 -2.44 -14.1
## CHM13_T0109301 protein_coding protein_coding chr19 -2.77 -15.5
## P.Value FDR
## CHM13_T0185741 4.81e-30 2.24e-25
## CHM13_T0117294 7.43e-29 1.73e-24
## CHM13_T0118000 1.35e-27 1.64e-23
## CHM13_T0126251 1.41e-27 1.64e-23
## CHM13_T0034581 1.53e-23 1.43e-19
## CHM13_T0118392 2.15e-23 1.67e-19
## CHM13_T0047324 4.66e-23 3.11e-19
## CHM13_T0116766 8.41e-23 4.90e-19
## CHM13_T0033511 1.39e-22 7.17e-19
## CHM13_T0109301 3.09e-22 1.44e-18
# save complete DTU results tables into TSV files
write.table(ts_simes, file = "results/DTU_gene_results.tsv", sep = "\t")
write.table(ts_transcript, file = "results/DTU_transcript_results.tsv",
sep = "\t")
# calculate TPM and expression proportion of transcripts
TPM <- tpm(y, effective.tx.length = fit$genes$EffectiveLength,
rta.overdispersion = fit$genes$Overdispersion)
TPMProp <- tpmProp(TPM, geneid = y$genes$gene_id)
# Select example gene ZNF880 (CHM13_G0029458)
g <- which(y$genes$gene_id == "CHM13_G0029458")
TPM[g, ]
## H1975-1 H1975-2 H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0116765 10.364 4.82 7.09 1.051 0.00 0.000
## CHM13_T0116766 66.571 64.15 59.90 1.394 1.59 1.598
## CHM13_T0116767 12.544 13.80 16.04 11.063 10.49 10.755
## CHM13_T0116768 4.852 6.28 7.43 2.911 2.43 2.144
## CHM13_T0116770 0.854 1.09 1.06 0.427 0.00 0.192
TPMProp[g, ]
## H1975-1 H1975-2 H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0116765 0.10889 0.0535 0.0775 0.0624 0.000 0.0000
## CHM13_T0116766 0.69938 0.7116 0.6545 0.0828 0.110 0.1088
## CHM13_T0116767 0.13179 0.1531 0.1753 0.6567 0.723 0.7322
## CHM13_T0116768 0.05097 0.0697 0.0812 0.1728 0.167 0.1460
## CHM13_T0116770 0.00897 0.0121 0.0116 0.0254 0.000 0.0131
# add annotation to transcript expression proportion data frame and save it into TSV file
AnnotatedTPMProp <- cbind(y$genes, TPMProp)
head(AnnotatedTPMProp)
## Length EffectiveLength Overdispersion gene_id
## CHM13_T0000008 1645 1454 3.29 CHM13_G0000006
## CHM13_T0000017 1317 1126 1.08 CHM13_G0000013
## CHM13_T0000019 6617 6426 2.32 CHM13_G0000012
## CHM13_T0000020 5443 5252 3.10 CHM13_G0000012
## CHM13_T0000023 5482 5291 3.64 CHM13_G0000012
## CHM13_T0000026 1653 1462 4.64 CHM13_G0000012
## transcript_id ensembl gene symbol
## CHM13_T0000008 CHM13_T0000008 ENST00000623180 ENSG00000280279 AC240565.2
## CHM13_T0000017 CHM13_T0000017 ENST00000473798 ENSG00000225880 LINC00115
## CHM13_T0000019 CHM13_T0000019 ENST00000445118 ENSG00000228794 LINC01128
## CHM13_T0000020 CHM13_T0000020 ENST00000669922 ENSG00000228794 LINC01128
## CHM13_T0000023 CHM13_T0000023 ENST00000666741 ENSG00000228794 LINC01128
## CHM13_T0000026 CHM13_T0000026 ENST00000608189 ENSG00000228794 LINC01128
## gene_biotype transcript_biotype chromosome H1975-1 H1975-2
## CHM13_T0000008 lncRNA lncRNA chr1 1.0000 1.0000
## CHM13_T0000017 lncRNA lncRNA chr1 1.0000 1.0000
## CHM13_T0000019 lncRNA lncRNA chr1 0.2502 0.2279
## CHM13_T0000020 lncRNA lncRNA chr1 0.0487 0.0353
## CHM13_T0000023 lncRNA lncRNA chr1 0.0573 0.0157
## CHM13_T0000026 lncRNA lncRNA chr1 0.0000 0.2116
## H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0000008 1.0000 1.0000 1.0000 1.0000
## CHM13_T0000017 1.0000 1.0000 1.0000 1.0000
## CHM13_T0000019 0.2165 0.0683 0.0562 0.0277
## CHM13_T0000020 0.0000 0.1104 0.0568 0.0000
## CHM13_T0000023 0.0429 0.1218 0.0925 0.0264
## CHM13_T0000026 0.0000 0.0000 0.3277 0.3628
write.table(AnnotatedTPMProp, file = "results/transcript_proportions.tsv")
# Results visualization
# make heatmaps to visualize the transcript usage
library(pheatmap)
annotation_col = data.frame(group = y$samples$group)
ann_colors = list(group = c(H1975 = "red", HCC827 = "blue"))
rownames(annotation_col) <- colnames(y)
# Figure 4b
# pdf("results/figure/DTU_heatmap_scaled_TPM.pdf", height = 4, width = 5)
pheatmap(log(TPM[g, ] + 1), scale = "column", cluster_cols = FALSE,
annotation_col = annotation_col, annotation_colors = ann_colors,
main="ZNF880")
# dev.off()
# make bar plots to visualize the transcript usage
dat <- as.data.frame(TPMProp[g, ])
# transform expression proportion data into long format to prepare for visualization
dat$transcript <- rownames(dat)
dat <- reshape(dat, varying = list(names(dat)[-ncol(dat)]),
v.names = "proportion", timevar = "sample",
times = colnames(dat)[-ncol(dat)], idvar = "transcript",
direction = "long")
# add sample information
dat$group <- y$samples$group[match(dat$sample, rownames(y$samples))]
# add transcript information
library(RColorBrewer)
transcript_colors <- brewer.pal(
sum(transcript_annotation$gene_id == "CHM13_G0029458"), "Set2")
names(transcript_colors) <- transcript_annotation$transcript_id[
transcript_annotation$gene_id == "CHM13_G0029458"]
# make stacked proportional bar plot to visualize the transcript usage
library(ggplot2)
# Figure 4c
# pdf("results/figure/DTU_barplot.pdf", height = 4, width = 5)
ggplot(dat, aes(x = sample, y = proportion, fill = transcript)) +
geom_bar(stat = "identity") +
facet_grid(cols = vars(group), scales = "free") +
labs(y = "transcript usage proportion", title = "ZNF880 transcript usage")+
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
scale_fill_manual(values = transcript_colors)
# dev.off()
# Visualize annotation transcripts
# Skip the next line if code from stage 2 has been run
gff <- rtracklayer::import("data/reference/chm13.draft_v2.0.gene_annotation.gff3")
annotation_DTU <- gff[gff$gene_id == "CHM13_G0029458" & gff$type == "exon"]
annotation_DTU$transcript <- annotation_DTU$transcript_id
annotation_DTU <- annotation_DTU[order(annotation_DTU$transcript_id)]
library(Gviz)
axisTrack <- GenomeAxisTrack()
grtrack <- GeneRegionTrack(
annotation_DTU,
transcriptAnnotation = "transcript",
fill = transcript_colors[as.character(annotation_DTU$transcript_id)]
)
# Figure 4d
# pdf("results/figure/DTU_transcript_model.pdf", height = 3, width = 8)
plotTracks(list(axisTrack, grtrack))
# dev.off()
toc()
## 95.316 sec elapsed
The R session information including version information of used packages is shown below.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] Gviz_1.54.0 RColorBrewer_1.1-3 pheatmap_1.0.13
## [4] ggplot2_4.0.2 edgeR_4.8.2 limma_3.66.0
## [7] rtracklayer_1.70.1 GenomicRanges_1.62.1 Seqinfo_1.0.0
## [10] IRanges_2.44.0 S4Vectors_0.48.1 BiocGenerics_0.56.0
## [13] generics_0.1.4 tictoc_1.2.1
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.18.0 jsonlite_2.0.0
## [3] magrittr_2.0.4 GenomicFeatures_1.62.0
## [5] farver_2.1.2 rmarkdown_2.30
## [7] BiocIO_1.20.0 vctrs_0.7.1
## [9] memoise_2.0.1 Rsamtools_2.26.0
## [11] RCurl_1.98-1.17 base64enc_0.1-6
## [13] htmltools_0.5.9 S4Arrays_1.10.1
## [15] progress_1.2.3 curl_7.0.0
## [17] SparseArray_1.10.10 Formula_1.2-5
## [19] sass_0.4.10 bslib_0.10.0
## [21] htmlwidgets_1.6.4 httr2_1.2.2
## [23] cachem_1.1.0 GenomicAlignments_1.46.0
## [25] lifecycle_1.0.5 pkgconfig_2.0.3
## [27] Matrix_1.7-4 R6_2.6.1
## [29] fastmap_1.2.0 MatrixGenerics_1.22.0
## [31] digest_0.6.39 colorspace_2.1-2
## [33] AnnotationDbi_1.72.0 Hmisc_5.2-5
## [35] RSQLite_2.4.6 filelock_1.0.3
## [37] labeling_0.4.3 httr_1.4.8
## [39] abind_1.4-8 compiler_4.5.2
## [41] bit64_4.6.0-1 withr_3.0.2
## [43] htmlTable_2.4.3 S7_0.2.1
## [45] backports_1.5.0 BiocParallel_1.44.0
## [47] DBI_1.3.0 biomaRt_2.66.2
## [49] rappdirs_0.3.4 DelayedArray_0.36.1
## [51] rjson_0.2.23 tools_4.5.2
## [53] foreign_0.8-90 otel_0.2.0
## [55] nnet_7.3-20 glue_1.8.0
## [57] restfulr_0.0.16 checkmate_2.3.4
## [59] cluster_2.1.8.1 gtable_0.3.6
## [61] BSgenome_1.78.0 tzdb_0.5.0
## [63] ensembldb_2.34.0 data.table_1.18.2.1
## [65] hms_1.1.4 XVector_0.50.0
## [67] pillar_1.11.1 stringr_1.6.0
## [69] vroom_1.7.0 dplyr_1.2.0
## [71] BiocFileCache_3.0.0 lattice_0.22-7
## [73] deldir_2.0-4 bit_4.6.0
## [75] biovizBase_1.58.0 tidyselect_1.2.1
## [77] locfit_1.5-9.12 Biostrings_2.78.0
## [79] knitr_1.51 gridExtra_2.3
## [81] ProtGenerics_1.42.0 SummarizedExperiment_1.40.0
## [83] xfun_0.56 Biobase_2.70.0
## [85] statmod_1.5.1 matrixStats_1.5.0
## [87] stringi_1.8.7 UCSC.utils_1.6.1
## [89] lazyeval_0.2.2 yaml_2.3.12
## [91] evaluate_1.0.5 codetools_0.2-20
## [93] cigarillo_1.0.0 interp_1.1-6
## [95] tibble_3.3.1 cli_3.6.5
## [97] rpart_4.1.24 jquerylib_0.1.4
## [99] Rcpp_1.1.1 dichromat_2.0-0.1
## [101] GenomeInfoDb_1.46.2 dbplyr_2.5.2
## [103] png_0.1-8 XML_3.99-0.22
## [105] parallel_4.5.2 readr_2.2.0
## [107] blob_1.3.0 prettyunits_1.2.0
## [109] jpeg_0.1-11 latticeExtra_0.6-31
## [111] AnnotationFilter_1.34.0 bitops_1.0-9
## [113] VariantAnnotation_1.56.0 scales_1.4.0
## [115] crayon_1.5.3 rlang_1.1.7
## [117] KEGGREST_1.50.0