Methodology

  • RNA-Seq differential expression
    • We obtained raw reads through the SRA database
    • To create the alignment index, we used the Homo sapiens GRCh38 transcriptome, from Ensembl version 106
    • We aligned using Kallisto version 0.48.0
    • We read the count matrix obtained with Kallisto with the tximport R package, which also aggregates transcript-level counts at the gene-level.
    • Finally, we used DESeq2 to perform differential expression.

Differential Expression

After alignment with Kallisto, we added all count matrices to the ‘resultados/’ directory, with separate datasets in their own subdirectories.

Loading libraries and functions

library(BiocParallel)
library(tximport)
library(biomaRt)
library(DESeq2)
library(readr)
library(dplyr)

# Use contrasts from the original GSE table
get_phenodata <- function(gse_id) {
  read_csv("data/Plataformas.csv") %>%
    janitor::clean_names() %>%
    filter(gse == gse_id) %>%
    dplyr::select(gsm, grupo, grupo_no_trabalho_original)
}

# List Kallisto output files
get_files <- function(gse_id) {
  list.files(
    paste0("data/", gse_id),
    pattern =  ".h5",
    full.names = TRUE,
    recursive = TRUE
  )
}

# Acquire metadata for the study
get_metadata <- function(gse_id, run_names) {
  metadata <- read_csv(paste0("data/", gse_id, "_metadata.txt")) %>%
    janitor::clean_names() %>%
    inner_join(get_phenodata(gse_id), by = c("sample_name" = "gsm")) %>%
    arrange(match(run, run_names))
  
  samples <- metadata %>%
    dplyr::select(run, grupo) %>%
    setNames(c("run", "contrast"))
  
  samples$contrast <- as.factor(samples$contrast)
  
  return(samples)
}

# Read abundances (counts) from Kallisto outputs
get_count_matrix <- function(filenames) {
  gtf <- "data/Homo_sapiens.GRCh38.107.gtf.gz"
  txdb.filename <- "data/Homo_sapiens.GRCh38.107.gtf.sqlite"
  
  if (!("Homo_sapiens.GRCh38.107.gtf.sqlite" %in% list.files("data"))) {
    txdb <- GenomicFeatures::makeTxDbFromGFF(gtf, format = "gtf")
    AnnotationDbi::saveDb(txdb, txdb.filename)
  }
  
  # Load txdb
  txdb <- AnnotationDbi::loadDb(txdb.filename)
  txdf <-
    AnnotationDbi::select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID")
  tab <- table(txdf$GENEID)
  txdf$ntx <- tab[match(txdf$GENEID, names(tab))]
  tx2gene <- data.frame(
    tx = txdf$TXNAME,
    gene = txdf$GENEID,
    stringsAsFactors = F
  )
  
  tximport(
    files = filenames,
    type = "kallisto",
    tx2gene = tx2gene,
    ignoreTxVersion = T
  )
}

# Run differential expression with DESeq2
run_dge <- function(count_matrix, contrast_table) {
  deseqobj <-
    DESeqDataSetFromTximport(count_matrix,
                             colData = contrast_table,
                             design = ~ contrast)
  
  dds <-
    DESeq(deseqobj,
          parallel = TRUE,
          BPPARAM = MulticoreParam(2))
  
  results(dds,
          contrast = c("contrast", "Tratado", "Controle"),
          tidy = TRUE)
  
}

# Translate Ensembl Gene IDs (ENSG) to HGNC symbols and
# add confidence intervals
translate_dges <- function(results) {
  ensembl <-
    useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
  
  names <- results %>% dplyr::select(row)
  
  translated <- getBM(
    attributes = c('ensembl_gene_id',
                   'hgnc_symbol'),
    filters = 'ensembl_gene_id',
    values = names,
    mart = ensembl
  )
  
  results %>%
    left_join(translated, by = c("row" = "ensembl_gene_id")) %>%
    group_by(hgnc_symbol) %>%
    slice(which.max(abs(log2FoldChange))) %>%
    ungroup() %>% 
    filter(hgnc_symbol != "") %>% 
    # Add confidence intervals
    mutate(CI.L = log2FoldChange - (qnorm(0.95) * lfcSE),
         CI.R = log2FoldChange + (qnorm(0.95) * lfcSE))
} 

Analysis

  • First, we read the metadata tables for each study;
  • Then we read the count matrices, with the tximport package, which will also convert transcript level quantifications to gene level quantifications (ENST -> ENSG).
  • Now that we have the contrasts and the count matrices, we can run DESeq2 to perform the differential expression analysis.
  • Finally, we added the correspondent gene symbols for each ENSG and the confidence intervals for the log2FoldChanges.
GSE91383_files <- get_files("GSE91383")
GSE91383_run_names <- stringr::str_extract_all(GSE91383_files, "SRR\\d+", simplify = TRUE)
GSE91383_metadata <- get_metadata("GSE91383", GSE91383_run_names)
GSE91383_matrix <- get_count_matrix(GSE91383_files)
GSE91383 <- run_dge(GSE91383_matrix, GSE91383_metadata)
GSE91383_translated <- translate_dges(GSE91383)
GSE91383_translated %>% 
  write_csv('results/GSE91383.csv')

GSE116127_files <- get_files("GSE116127")
GSE116127_run_names <- stringr::str_extract_all(GSE116127_files, "SRR\\d+", simplify = TRUE)
GSE116127_metadata <- get_metadata("GSE116127", GSE116127_run_names)
GSE116127_matrix <- get_count_matrix(GSE116127_files)
GSE116127 <- run_dge(GSE116127_matrix, GSE116127_metadata)
GSE116127_translated <- translate_dges(GSE116127)
GSE116127_translated %>% 
  write_csv('results/GSE116127.csv')