1 Introduction

This document composes different stages of the differential transcript expression and differential transcript usage analysis protocol and displays the expected outputs when running on the test data we provided.

Except for the packages required by the protocol, we use an addition package tictoc to calculate the time required by each stage that runs R code. The elapsed time of each R-based stage is printed at the end of each stage.

All code shown in the report is imported from scripts for each stage. Code in Stage 0 and Stage 1 are UNIX commands and should be run in the terminal, while code in Stages 2-6 are R code and should be run in R.

2 Stage 0: preparation of transcriptome index

This stage should be run in bash. Here, we only display the commands.

# run the next line if gffread and salmon haven't been added to path; replace <$1> with the correct directory
# export PATH="$1/software/bin/:$PATH"

# 1.    Prepare transcriptome sequence. Extract the transcript sequences of all transcripts in the gene annotation GFF file using GffRead
gffread -w data/reference/transcripts.fa \
  -g data/reference/chm13v2.0_maskedY.fa \
  data/reference/chm13.draft_v2.0.gene_annotation.gff3

# 2.    Combine transcriptome and genome sequence in a new file as gentrome
cat data/reference/transcripts.fa \
  data/reference/chm13v2.0_maskedY.fa \
  > data/reference/gentrome.fa

# 3.    Extract the name of the genome targets to create a list of the decoys
grep "^>" data/reference/chm13v2.0_maskedY.fa > data/reference/decoys.txt 
cut -d " " -f 1 data/reference/decoys.txt > data/reference/decoys.tmp 
sed 's/^>//' data/reference/decoys.tmp > data/reference/decoys.txt 
rm data/reference/decoys.tmp 

# 4.    Build decoy-aware transcriptome index
salmon index -t data/reference/gentrome.fa \
  -d data/reference/decoys.txt \
  -p 8 \
  -i data/reference/salmon_index

3 Stage 1: transcript quantification (quasi-mapping/selective alignment)

This stage should be run in bash. Here, we only display the commands.

# run the next line if salmon hasn't been added to path; replace <$1> with the correct directory
# export PATH="$1/software/bin/:$PATH"

for SAMPLE in SRR1428605{7..9} SRR14286066 SRR14286069 SRR31761441_merged; do
  mkdir -p results/salmon_output/$SAMPLE
  salmon quant -i data/reference/salmon_index\
    -l A \
    -1 data/reads/$SAMPLE\_1.fastq.gz \
    -2 data/reads/$SAMPLE\_2.fastq.gz \
    -o results/salmon_output/$SAMPLE \
    -p  8 \
    --numGibbsSamples 50
done

4 Stage 2: genes and transcripts annotation

if (!requireNamespace("tictoc", quietly = TRUE)) {
    install.packages("tictoc")
}
library(tictoc)
tic()
library(rtracklayer)
gff <- import("data/reference/chm13.draft_v2.0.gene_annotation.gff3")

transcripts <- gff[gff$type == "transcript", ]

transcript_annotation <- data.frame(
  gene_id = transcripts$gene_id,
  transcript_id = transcripts$ID,
  ensembl = substr(transcripts$source_transcript, 1, 15),
  gene = substr(transcripts$source_gene, 1, 15),
  symbol = transcripts$gene_name,
  gene_biotype = transcripts$gene_biotype,
  transcript_biotype = transcripts$transcript_biotype,
  chromosome = as.character(seqnames(transcripts))
)

symbols <- unlist(
  lapply(split(transcript_annotation, transcript_annotation$gene_id),
         function(df) {
           x <- df$symbol
           if (length(unique(x)) > 1) {
             unique(x[!grepl("^MSTRG", x)])[1]
           } else {
             unique(x)
           }
         })
)
transcript_annotation$symbol <- symbols[match(transcript_annotation$gene_id, 
                                              names(symbols))]

write.csv(transcript_annotation, "data/reference/transcript_annotation.csv", 
          row.names = FALSE)
toc()
## 127.317 sec elapsed

5 Stage 3: transcript-level count data preprocessing

tic()
library(edgeR)
library(limma)

salmon_dir <- "results/salmon_output"
samples <- list.files(salmon_dir)
samples
## [1] "SRR14286057"        "SRR14286058"        "SRR14286059"       
## [4] "SRR14286066"        "SRR14286069"        "SRR31761441_merged"
targets <- read.delim("data/targets.txt", row.names = 1)
targets
##                 GEO                      SRA CellLine Replicate
## H1975-1  GSM5255695              SRR14286057    H1975         1
## H1975-2  GSM5255696              SRR14286058    H1975         2
## H1975-3  GSM5255697              SRR14286059    H1975         3
## HCC827-1 GSM5255704              SRR14286066   HCC827         1
## HCC827-3 GSM5255706              SRR14286069   HCC827         3
## HCC827-2 GSM5255705 SRR31761441, SRR31761442   HCC827         2
counts <- catchSalmon(file.path(salmon_dir, samples))
## Reading results/salmon_output/SRR14286057, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286058, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286059, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286066, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR14286069, 232530 transcripts, 50 gibbs samples
## Reading results/salmon_output/SRR31761441_merged, 232530 transcripts, 50 gibbs samples
y <- DGEList(counts = counts$counts / counts$annotation$Overdispersion, 
             genes = counts$annotation, group = factor(targets$CellLine))

colnames(y) <- rownames(targets)

transcript_annotation <- read.csv("data/reference/transcript_annotation.csv")

m <- match(rownames(y), transcript_annotation$transcript_id)
y$genes <- cbind(y$genes, transcript_annotation[m, ])

filt <- filterByExpr(y, min.count = 3, min.total.count = 10)
table(filt)
## filt
##  FALSE   TRUE 
## 178225  54305
y <- y[filt, , keep.lib.sizes=FALSE]

keep <- !grepl("pseudogene", y$genes$gene_biotype)
table(keep)
## keep
## FALSE  TRUE 
##  1546 52759
y <- y[keep, , keep.lib.sizes = FALSE]

y <- normLibSizes(y)
y$samples
##           group lib.size norm.factors
## H1975-1   H1975 14519929        1.150
## H1975-2   H1975 15690030        1.186
## H1975-3   H1975 13082720        1.175
## HCC827-1 HCC827 15359554        0.929
## HCC827-3 HCC827 15065879        0.832
## HCC827-2 HCC827 65654638        0.807
col <- c("red", "blue")
# Figure 3a
# pdf("results/figure/MDS.pdf", height = 6, width = 6)
plotMDS(y, col = col[as.numeric(y$samples$group)], pch = 16)
legend("topleft", levels(y$samples$group), text.col = col)

# dev.off()
toc()
## 5.735 sec elapsed

6 Stage 4: generalized linear model fitting

tic()
# Data preprocess using step 3 script
# source("workflow/3_count_preprocess.R")

# create design
design <- model.matrix(~ 0 + y$samples$group)
colnames(design) <- sub("y$samples$group", "", colnames(design), fixed = TRUE)
design
##   H1975 HCC827
## 1     1      0
## 2     1      0
## 3     1      0
## 4     0      1
## 5     0      1
## 6     0      1
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`y$samples$group`
## [1] "contr.treatment"
# model fitting
y <- estimateDisp(y, design = design)

# Figure 3b
# pdf("results/figure/BCV.pdf", height = 4, width = 5)
# par(mgp = c(2, 1, 0)) 
plotBCV(y)

# dev.off()

fit <- glmQLFit(y, design = design)

# Figure 3c
# pdf("results/figure/QL.pdf", height = 4, width = 5)
# par(mgp = c(2, 1, 0)) 
plotQLDisp(fit)

# dev.off()
toc()
## 9.219 sec elapsed

7 Stage 5: differential expression analysis

tic()
# model fitting using step 4 script
# source("workflow/4_fit_model.R")

# test for specific comparisons
contrast <- makeContrasts(HCC827 - H1975, levels = design)

res <- glmQLFTest(fit, contrast = contrast)

# test for DE
is_de <- decideTests(res)
summary(is_de)
##        -1*H1975 1*HCC827
## Down               14644
## NotSig             25159
## Up                 12956
# top DE results
topTags(res)
## Coefficient:  -1*H1975 1*HCC827 
##                Length EffectiveLength Overdispersion        gene_id
## CHM13_T0082563    704             513           1.00 CHM13_G0021776
## CHM13_T0201110   3165            2974           1.01 CHM13_G0051570
## CHM13_T0139266   1889            1698           3.59 CHM13_G0035163
## CHM13_T0059911   3701            3510           1.37 CHM13_G0015985
## CHM13_T0055092   1805            1614           1.02 CHM13_G0014390
## CHM13_T0060257   1676            1485           1.23 CHM13_G0016039
## CHM13_T0057475   6243            6052           1.07 CHM13_G0015147
## CHM13_T0113733   3504            3296           2.29 CHM13_G0028818
## CHM13_T0144141   2716            2525           2.06 CHM13_G0036585
## CHM13_T0112035   1460            1269           1.03 CHM13_G0028503
##                 transcript_id         ensembl            gene     symbol
## CHM13_T0082563 CHM13_T0082563 ENST00000306061 ENSG00000169715       MT1E
## CHM13_T0201110 CHM13_T0201110 ENST00000223095 ENSG00000106366   SERPINE1
## CHM13_T0139266 CHM13_T0139266 ENST00000372431 ENSG00000100979       PLTP
## CHM13_T0059911 CHM13_T0059911 ENST00000311852 ENSG00000157227      MMP14
## CHM13_T0055092 CHM13_T0055092 ENST00000655811 ENSG00000287530 AL445985.1
## CHM13_T0060257 CHM13_T0060257 ENST00000250383 ENSG00000100867      DHRS2
## CHM13_T0057475 CHM13_T0057475 ENST00000377474 ENSG00000178695     KCTD12
## CHM13_T0113733 CHM13_T0113733 ENST00000221992 ENSG00000105388    CEACAM5
## CHM13_T0144141 CHM13_T0144141 ENST00000429300 ENSG00000100033      PRODH
## CHM13_T0112035 CHM13_T0112035 ENST00000004982 ENSG00000004776      HSPB6
##                  gene_biotype transcript_biotype chromosome  logFC logCPM    F
## CHM13_T0082563 protein_coding     protein_coding      chr16 -12.13   6.96 2522
## CHM13_T0201110 protein_coding     protein_coding       chr7  -8.48   8.46 3238
## CHM13_T0139266 protein_coding     protein_coding      chr20 -11.30   5.78 1672
## CHM13_T0059911 protein_coding     protein_coding      chr14  -8.86   7.19 2739
## CHM13_T0055092         lncRNA             lncRNA      chr13  -9.16   4.73 1403
## CHM13_T0060257 protein_coding     protein_coding      chr14  -6.47   9.34 2091
## CHM13_T0057475 protein_coding     protein_coding      chr13  -6.83   6.91 2067
## CHM13_T0113733 protein_coding     protein_coding      chr19  10.02   4.91  990
## CHM13_T0144141 protein_coding    retained_intron      chr22   9.77   4.84 1002
## CHM13_T0112035 protein_coding     protein_coding      chr19  -8.14   5.52 1845
##                  PValue      FDR
## CHM13_T0082563 1.08e-20 5.69e-16
## CHM13_T0201110 8.11e-19 1.49e-14
## CHM13_T0139266 8.45e-19 1.49e-14
## CHM13_T0059911 3.07e-18 4.05e-14
## CHM13_T0055092 9.23e-18 9.73e-14
## CHM13_T0060257 2.05e-17 1.68e-13
## CHM13_T0057475 2.24e-17 1.68e-13
## CHM13_T0113733 2.54e-17 1.68e-13
## CHM13_T0144141 3.87e-17 2.27e-13
## CHM13_T0112035 5.11e-17 2.70e-13
# save complete DTE results table into TSV file
tt <- topTags(res, n = Inf)
write.table(tt, file = "results/DTE_results.tsv", sep = "\t")

# Figure 4a
# pdf("results/figure/MD_small.pdf", height = 5, width = 4)
plotMD(res, main = "HCC827 vs H1975", cex = 0.3)

# dev.off()

# volcano plot
library(ggplot2)
# library(ggrepel)
fdr_cutoff <- 0.05
tt$table$DE_status <- ifelse(tt$table$FDR > fdr_cutoff, "NotSig", 
                         ifelse(tt$table$logFC > 0, "Up", "Down"))

# Create a column for highlighting top 2 DE transcripts
tt$table$top2 <- ""
tt$table$top2[1:2] <- tt$table$transcript_id[1:2]
# pdf("results/figure/volcano_small.pdf", height = 5, width = 4)
ggplot(tt$table, aes(x = logFC, y = -log10(FDR),  label = top2)) +
  geom_point(aes(colour = DE_status), size = 0.3) +
  geom_text(hjust = -0.05) +
  scale_colour_manual(values = c(
    "Up" = "red",
    "Down" = "blue",
    "NotSig" = "black"
  )) +
  geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed")+
  theme_classic()

# dev.off()
toc()
## 3.023 sec elapsed

8 Stage 6: differential usage analysis

tic()
# model fitting using step 4 script
# source("workflow/4_fit_model.R")

# test for specific comparisons
# make contrast 
contrast <- makeContrasts(HCC827 - H1975, levels = design)

# test for differential transcript usage for the specified contrast
ds <- diffSplice(fit, contrast = contrast, 
                 geneid = "gene_id", exonid = "transcript_id")
## Total number of exons:  52759 
## Total number of genes:  17364 
## Number of genes with 1 exon:  6161 
## Mean number of exons in a gene:  3 
## Max number of exons in a gene:  30
# generate a list of all differentially spliced genes (simes test)
ts_simes <- topSplice(ds, test = "simes", number = Inf)
# print the number of significant differentially spliced genes (simes test)
table(ts_simes$FDR < 0.05)
## 
## FALSE  TRUE 
##  7775  3428
# print top differentially spliced genes (simes test)
topSplice(ds, test = "simes")
##                       gene_id            gene     symbol   gene_biotype
## CHM13_G0047330 CHM13_G0047330 ENSG00000206503      HLA-A protein_coding
## CHM13_G0029617 CHM13_G0029617 ENSG00000170889       RPS9 protein_coding
## CHM13_G0031869 CHM13_G0031869 ENSG00000115641       FHL2 protein_coding
## CHM13_G0029793 CHM13_G0029793 ENSG00000166770 ZNF667-AS1         lncRNA
## CHM13_G0029891 CHM13_G0029891 ENSG00000121413    ZSCAN18 protein_coding
## CHM13_G0009404 CHM13_G0009404 ENSG00000168003     SLC3A2 protein_coding
## CHM13_G0029458 CHM13_G0029458 ENSG00000221923     ZNF880 protein_coding
## CHM13_G0012439 CHM13_G0012439 ENSG00000197111      PCBP2 protein_coding
## CHM13_G0009145 CHM13_G0009145 ENSG00000198561     CTNND1 protein_coding
## CHM13_G0027857 CHM13_G0027857 ENSG00000167460       TPM4 protein_coding
##                chromosome NExons  P.Value      FDR
## CHM13_G0047330       chr6      6 2.89e-29 3.23e-25
## CHM13_G0029617      chr19     11 8.17e-28 4.58e-24
## CHM13_G0031869       chr2      7 9.84e-27 3.40e-23
## CHM13_G0029793      chr19      9 1.22e-26 3.40e-23
## CHM13_G0029891      chr19      9 1.93e-22 3.72e-19
## CHM13_G0009404      chr11     13 1.99e-22 3.72e-19
## CHM13_G0029458      chr19      5 4.20e-22 6.73e-19
## CHM13_G0012439      chr12     15 7.00e-22 9.80e-19
## CHM13_G0009145      chr11     13 1.80e-21 2.24e-18
## CHM13_G0027857      chr19      9 2.78e-21 3.11e-18
# generate a list of all differentially spliced transcripts
ts_transcript <- topSplice(ds, test = "t", number = Inf)
# print the number of significant differentially spliced transcripts
table(ts_transcript$FDR < 0.05)
## 
## FALSE  TRUE 
## 39311  7287
# print top differentially spliced transcripts
topSplice(ds, test = "t")
##                Length EffectiveLength Overdispersion        gene_id
## CHM13_T0185741   1854            1663           5.31 CHM13_G0047330
## CHM13_T0117294    772             581           7.64 CHM13_G0029617
## CHM13_T0118000   1503            1312           6.01 CHM13_G0029793
## CHM13_T0126251   1601            1410           3.93 CHM13_G0031869
## CHM13_T0034581   2161            1970           3.99 CHM13_G0009404
## CHM13_T0118392   1766            1575           1.00 CHM13_G0029891
## CHM13_T0047324   1786            1595           1.43 CHM13_G0012439
## CHM13_T0116766   2026            1835           1.02 CHM13_G0029458
## CHM13_T0033511   5340            5149          34.99 CHM13_G0009145
## CHM13_T0109301   2598            2407          13.38 CHM13_G0027857
##                 transcript_id         ensembl            gene     symbol
## CHM13_T0185741 CHM13_T0185741 ENST00000376806 ENSG00000206503      HLA-A
## CHM13_T0117294 CHM13_T0117294 ENST00000391752 ENSG00000170889       RPS9
## CHM13_T0118000 CHM13_T0118000 ENST00000654382 ENSG00000166770 ZNF667-AS1
## CHM13_T0126251 CHM13_T0126251 ENST00000409807 ENSG00000115641       FHL2
## CHM13_T0034581 CHM13_T0034581 ENST00000377890 ENSG00000168003     SLC3A2
## CHM13_T0118392 CHM13_T0118392 ENST00000595944 ENSG00000121413    ZSCAN18
## CHM13_T0047324 CHM13_T0047324 ENST00000553064 ENSG00000197111      PCBP2
## CHM13_T0116766 CHM13_T0116766 ENST00000422689 ENSG00000221923     ZNF880
## CHM13_T0033511 CHM13_T0033511 ENST00000673683 ENSG00000198561     CTNND1
## CHM13_T0109301 CHM13_T0109301 ENST00000646974 ENSG00000167460       TPM4
##                  gene_biotype transcript_biotype chromosome logFC     t
## CHM13_T0185741 protein_coding     protein_coding       chr6  6.36  26.1
## CHM13_T0117294 protein_coding     protein_coding      chr19 -7.77 -20.0
## CHM13_T0118000         lncRNA             lncRNA      chr19 -7.51 -19.8
## CHM13_T0126251 protein_coding     protein_coding       chr2 -4.54 -25.5
## CHM13_T0034581 protein_coding     protein_coding      chr11  3.77  14.8
## CHM13_T0118392 protein_coding     protein_coding      chr19 -6.39 -16.5
## CHM13_T0047324 protein_coding     protein_coding      chr12  2.05  13.9
## CHM13_T0116766 protein_coding     protein_coding      chr19 -3.12 -19.7
## CHM13_T0033511 protein_coding     protein_coding      chr11 -2.44 -14.1
## CHM13_T0109301 protein_coding     protein_coding      chr19 -2.77 -15.5
##                 P.Value      FDR
## CHM13_T0185741 4.81e-30 2.24e-25
## CHM13_T0117294 7.43e-29 1.73e-24
## CHM13_T0118000 1.35e-27 1.64e-23
## CHM13_T0126251 1.41e-27 1.64e-23
## CHM13_T0034581 1.53e-23 1.43e-19
## CHM13_T0118392 2.15e-23 1.67e-19
## CHM13_T0047324 4.66e-23 3.11e-19
## CHM13_T0116766 8.41e-23 4.90e-19
## CHM13_T0033511 1.39e-22 7.17e-19
## CHM13_T0109301 3.09e-22 1.44e-18
# save complete DTU results tables into TSV files
write.table(ts_simes, file = "results/DTU_gene_results.tsv", sep = "\t")
write.table(ts_transcript, file = "results/DTU_transcript_results.tsv", 
            sep = "\t")

# calculate TPM and expression proportion of transcripts
TPM <- tpm(y, effective.tx.length = fit$genes$EffectiveLength,
           rta.overdispersion = fit$genes$Overdispersion)
TPMProp <- tpmProp(TPM, geneid = y$genes$gene_id)
# Select example gene ZNF880 (CHM13_G0029458)
g <- which(y$genes$gene_id == "CHM13_G0029458")
TPM[g, ]
##                H1975-1 H1975-2 H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0116765  10.364    4.82    7.09    1.051     0.00    0.000
## CHM13_T0116766  66.571   64.15   59.90    1.394     1.59    1.598
## CHM13_T0116767  12.544   13.80   16.04   11.063    10.49   10.755
## CHM13_T0116768   4.852    6.28    7.43    2.911     2.43    2.144
## CHM13_T0116770   0.854    1.09    1.06    0.427     0.00    0.192
TPMProp[g, ]
##                H1975-1 H1975-2 H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0116765 0.10889  0.0535  0.0775   0.0624    0.000   0.0000
## CHM13_T0116766 0.69938  0.7116  0.6545   0.0828    0.110   0.1088
## CHM13_T0116767 0.13179  0.1531  0.1753   0.6567    0.723   0.7322
## CHM13_T0116768 0.05097  0.0697  0.0812   0.1728    0.167   0.1460
## CHM13_T0116770 0.00897  0.0121  0.0116   0.0254    0.000   0.0131
# add annotation to transcript expression proportion data frame and save it into TSV file
AnnotatedTPMProp <- cbind(y$genes, TPMProp)
head(AnnotatedTPMProp)
##                Length EffectiveLength Overdispersion        gene_id
## CHM13_T0000008   1645            1454           3.29 CHM13_G0000006
## CHM13_T0000017   1317            1126           1.08 CHM13_G0000013
## CHM13_T0000019   6617            6426           2.32 CHM13_G0000012
## CHM13_T0000020   5443            5252           3.10 CHM13_G0000012
## CHM13_T0000023   5482            5291           3.64 CHM13_G0000012
## CHM13_T0000026   1653            1462           4.64 CHM13_G0000012
##                 transcript_id         ensembl            gene     symbol
## CHM13_T0000008 CHM13_T0000008 ENST00000623180 ENSG00000280279 AC240565.2
## CHM13_T0000017 CHM13_T0000017 ENST00000473798 ENSG00000225880  LINC00115
## CHM13_T0000019 CHM13_T0000019 ENST00000445118 ENSG00000228794  LINC01128
## CHM13_T0000020 CHM13_T0000020 ENST00000669922 ENSG00000228794  LINC01128
## CHM13_T0000023 CHM13_T0000023 ENST00000666741 ENSG00000228794  LINC01128
## CHM13_T0000026 CHM13_T0000026 ENST00000608189 ENSG00000228794  LINC01128
##                gene_biotype transcript_biotype chromosome H1975-1 H1975-2
## CHM13_T0000008       lncRNA             lncRNA       chr1  1.0000  1.0000
## CHM13_T0000017       lncRNA             lncRNA       chr1  1.0000  1.0000
## CHM13_T0000019       lncRNA             lncRNA       chr1  0.2502  0.2279
## CHM13_T0000020       lncRNA             lncRNA       chr1  0.0487  0.0353
## CHM13_T0000023       lncRNA             lncRNA       chr1  0.0573  0.0157
## CHM13_T0000026       lncRNA             lncRNA       chr1  0.0000  0.2116
##                H1975-3 HCC827-1 HCC827-3 HCC827-2
## CHM13_T0000008  1.0000   1.0000   1.0000   1.0000
## CHM13_T0000017  1.0000   1.0000   1.0000   1.0000
## CHM13_T0000019  0.2165   0.0683   0.0562   0.0277
## CHM13_T0000020  0.0000   0.1104   0.0568   0.0000
## CHM13_T0000023  0.0429   0.1218   0.0925   0.0264
## CHM13_T0000026  0.0000   0.0000   0.3277   0.3628
write.table(AnnotatedTPMProp, file = "results/transcript_proportions.tsv")

# Results visualization
# make heatmaps to visualize the transcript usage
library(pheatmap)
annotation_col = data.frame(group = y$samples$group)
ann_colors = list(group = c(H1975 = "red", HCC827 = "blue"))
rownames(annotation_col) <- colnames(y)
# Figure 4b
# pdf("results/figure/DTU_heatmap_scaled_TPM.pdf", height = 4, width = 5)
pheatmap(log(TPM[g, ] + 1), scale = "column", cluster_cols = FALSE, 
         annotation_col = annotation_col, annotation_colors = ann_colors,
         main="ZNF880")

# dev.off()


# make bar plots to visualize the transcript usage
dat <- as.data.frame(TPMProp[g, ])
# transform expression proportion data into long format to prepare for visualization
dat$transcript <- rownames(dat)
dat <- reshape(dat, varying = list(names(dat)[-ncol(dat)]),
               v.names = "proportion", timevar = "sample",
               times = colnames(dat)[-ncol(dat)], idvar = "transcript",
               direction = "long")
# add sample information
dat$group <- y$samples$group[match(dat$sample, rownames(y$samples))]
# add transcript information
library(RColorBrewer)
transcript_colors <- brewer.pal(
  sum(transcript_annotation$gene_id == "CHM13_G0029458"), "Set2")
names(transcript_colors) <- transcript_annotation$transcript_id[
  transcript_annotation$gene_id == "CHM13_G0029458"]
# make stacked proportional bar plot to visualize the transcript usage
library(ggplot2)
# Figure 4c
# pdf("results/figure/DTU_barplot.pdf", height = 4, width = 5)
ggplot(dat, aes(x = sample, y = proportion, fill = transcript)) +
  geom_bar(stat = "identity") +
  facet_grid(cols = vars(group), scales = "free") +
  labs(y = "transcript usage proportion", title = "ZNF880 transcript usage")+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
  scale_fill_manual(values = transcript_colors)

# dev.off()

# Visualize annotation transcripts
# Skip the next line if code from stage 2 has been run
gff <- rtracklayer::import("data/reference/chm13.draft_v2.0.gene_annotation.gff3")

annotation_DTU <- gff[gff$gene_id == "CHM13_G0029458" & gff$type == "exon"]
annotation_DTU$transcript <- annotation_DTU$transcript_id
annotation_DTU <- annotation_DTU[order(annotation_DTU$transcript_id)]

library(Gviz)
axisTrack <- GenomeAxisTrack()
grtrack <- GeneRegionTrack(
  annotation_DTU,
  transcriptAnnotation = "transcript",
  fill = transcript_colors[as.character(annotation_DTU$transcript_id)]
)
# Figure 4d
# pdf("results/figure/DTU_transcript_model.pdf", height = 3, width = 8)
plotTracks(list(axisTrack, grtrack))

# dev.off()
toc()
## 95.316 sec elapsed

9 Session information

The R session information including version information of used packages is shown below.

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Gviz_1.54.0          RColorBrewer_1.1-3   pheatmap_1.0.13     
##  [4] ggplot2_4.0.2        edgeR_4.8.2          limma_3.66.0        
##  [7] rtracklayer_1.70.1   GenomicRanges_1.62.1 Seqinfo_1.0.0       
## [10] IRanges_2.44.0       S4Vectors_0.48.1     BiocGenerics_0.56.0 
## [13] generics_0.1.4       tictoc_1.2.1        
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.18.0           jsonlite_2.0.0             
##   [3] magrittr_2.0.4              GenomicFeatures_1.62.0     
##   [5] farver_2.1.2                rmarkdown_2.30             
##   [7] BiocIO_1.20.0               vctrs_0.7.1                
##   [9] memoise_2.0.1               Rsamtools_2.26.0           
##  [11] RCurl_1.98-1.17             base64enc_0.1-6            
##  [13] htmltools_0.5.9             S4Arrays_1.10.1            
##  [15] progress_1.2.3              curl_7.0.0                 
##  [17] SparseArray_1.10.10         Formula_1.2-5              
##  [19] sass_0.4.10                 bslib_0.10.0               
##  [21] htmlwidgets_1.6.4           httr2_1.2.2                
##  [23] cachem_1.1.0                GenomicAlignments_1.46.0   
##  [25] lifecycle_1.0.5             pkgconfig_2.0.3            
##  [27] Matrix_1.7-4                R6_2.6.1                   
##  [29] fastmap_1.2.0               MatrixGenerics_1.22.0      
##  [31] digest_0.6.39               colorspace_2.1-2           
##  [33] AnnotationDbi_1.72.0        Hmisc_5.2-5                
##  [35] RSQLite_2.4.6               filelock_1.0.3             
##  [37] labeling_0.4.3              httr_1.4.8                 
##  [39] abind_1.4-8                 compiler_4.5.2             
##  [41] bit64_4.6.0-1               withr_3.0.2                
##  [43] htmlTable_2.4.3             S7_0.2.1                   
##  [45] backports_1.5.0             BiocParallel_1.44.0        
##  [47] DBI_1.3.0                   biomaRt_2.66.2             
##  [49] rappdirs_0.3.4              DelayedArray_0.36.1        
##  [51] rjson_0.2.23                tools_4.5.2                
##  [53] foreign_0.8-90              otel_0.2.0                 
##  [55] nnet_7.3-20                 glue_1.8.0                 
##  [57] restfulr_0.0.16             checkmate_2.3.4            
##  [59] cluster_2.1.8.1             gtable_0.3.6               
##  [61] BSgenome_1.78.0             tzdb_0.5.0                 
##  [63] ensembldb_2.34.0            data.table_1.18.2.1        
##  [65] hms_1.1.4                   XVector_0.50.0             
##  [67] pillar_1.11.1               stringr_1.6.0              
##  [69] vroom_1.7.0                 dplyr_1.2.0                
##  [71] BiocFileCache_3.0.0         lattice_0.22-7             
##  [73] deldir_2.0-4                bit_4.6.0                  
##  [75] biovizBase_1.58.0           tidyselect_1.2.1           
##  [77] locfit_1.5-9.12             Biostrings_2.78.0          
##  [79] knitr_1.51                  gridExtra_2.3              
##  [81] ProtGenerics_1.42.0         SummarizedExperiment_1.40.0
##  [83] xfun_0.56                   Biobase_2.70.0             
##  [85] statmod_1.5.1               matrixStats_1.5.0          
##  [87] stringi_1.8.7               UCSC.utils_1.6.1           
##  [89] lazyeval_0.2.2              yaml_2.3.12                
##  [91] evaluate_1.0.5              codetools_0.2-20           
##  [93] cigarillo_1.0.0             interp_1.1-6               
##  [95] tibble_3.3.1                cli_3.6.5                  
##  [97] rpart_4.1.24                jquerylib_0.1.4            
##  [99] Rcpp_1.1.1                  dichromat_2.0-0.1          
## [101] GenomeInfoDb_1.46.2         dbplyr_2.5.2               
## [103] png_0.1-8                   XML_3.99-0.22              
## [105] parallel_4.5.2              readr_2.2.0                
## [107] blob_1.3.0                  prettyunits_1.2.0          
## [109] jpeg_0.1-11                 latticeExtra_0.6-31        
## [111] AnnotationFilter_1.34.0     bitops_1.0-9               
## [113] VariantAnnotation_1.56.0    scales_1.4.0               
## [115] crayon_1.5.3                rlang_1.1.7                
## [117] KEGGREST_1.50.0