After alignment with Kallisto, we added all count matrices to the ‘resultados/’ directory, with separate datasets in their own subdirectories.
library(BiocParallel)
library(tximport)
library(biomaRt)
library(DESeq2)
library(readr)
library(dplyr)
# Use contrasts from the original GSE table
get_phenodata <- function(gse_id) {
read_csv("data/Plataformas.csv") %>%
janitor::clean_names() %>%
filter(gse == gse_id) %>%
dplyr::select(gsm, grupo, grupo_no_trabalho_original)
}
# List Kallisto output files
get_files <- function(gse_id) {
list.files(
paste0("data/", gse_id),
pattern = ".h5",
full.names = TRUE,
recursive = TRUE
)
}
# Acquire metadata for the study
get_metadata <- function(gse_id, run_names) {
metadata <- read_csv(paste0("data/", gse_id, "_metadata.txt")) %>%
janitor::clean_names() %>%
inner_join(get_phenodata(gse_id), by = c("sample_name" = "gsm")) %>%
arrange(match(run, run_names))
samples <- metadata %>%
dplyr::select(run, grupo) %>%
setNames(c("run", "contrast"))
samples$contrast <- as.factor(samples$contrast)
return(samples)
}
# Read abundances (counts) from Kallisto outputs
get_count_matrix <- function(filenames) {
gtf <- "data/Homo_sapiens.GRCh38.107.gtf.gz"
txdb.filename <- "data/Homo_sapiens.GRCh38.107.gtf.sqlite"
if (!("Homo_sapiens.GRCh38.107.gtf.sqlite" %in% list.files("data"))) {
txdb <- GenomicFeatures::makeTxDbFromGFF(gtf, format = "gtf")
AnnotationDbi::saveDb(txdb, txdb.filename)
}
# Load txdb
txdb <- AnnotationDbi::loadDb(txdb.filename)
txdf <-
AnnotationDbi::select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID")
tab <- table(txdf$GENEID)
txdf$ntx <- tab[match(txdf$GENEID, names(tab))]
tx2gene <- data.frame(
tx = txdf$TXNAME,
gene = txdf$GENEID,
stringsAsFactors = F
)
tximport(
files = filenames,
type = "kallisto",
tx2gene = tx2gene,
ignoreTxVersion = T
)
}
# Run differential expression with DESeq2
run_dge <- function(count_matrix, contrast_table) {
deseqobj <-
DESeqDataSetFromTximport(count_matrix,
colData = contrast_table,
design = ~ contrast)
dds <-
DESeq(deseqobj,
parallel = TRUE,
BPPARAM = MulticoreParam(2))
results(dds,
contrast = c("contrast", "Tratado", "Controle"),
tidy = TRUE)
}
# Translate Ensembl Gene IDs (ENSG) to HGNC symbols and
# add confidence intervals
translate_dges <- function(results) {
ensembl <-
useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
names <- results %>% dplyr::select(row)
translated <- getBM(
attributes = c('ensembl_gene_id',
'hgnc_symbol'),
filters = 'ensembl_gene_id',
values = names,
mart = ensembl
)
results %>%
left_join(translated, by = c("row" = "ensembl_gene_id")) %>%
group_by(hgnc_symbol) %>%
slice(which.max(abs(log2FoldChange))) %>%
ungroup() %>%
filter(hgnc_symbol != "") %>%
# Add confidence intervals
mutate(CI.L = log2FoldChange - (qnorm(0.95) * lfcSE),
CI.R = log2FoldChange + (qnorm(0.95) * lfcSE))
}
GSE91383_files <- get_files("GSE91383")
GSE91383_run_names <- stringr::str_extract_all(GSE91383_files, "SRR\\d+", simplify = TRUE)
GSE91383_metadata <- get_metadata("GSE91383", GSE91383_run_names)
GSE91383_matrix <- get_count_matrix(GSE91383_files)
GSE91383 <- run_dge(GSE91383_matrix, GSE91383_metadata)
GSE91383_translated <- translate_dges(GSE91383)
GSE91383_translated %>%
write_csv('results/GSE91383.csv')
GSE116127_files <- get_files("GSE116127")
GSE116127_run_names <- stringr::str_extract_all(GSE116127_files, "SRR\\d+", simplify = TRUE)
GSE116127_metadata <- get_metadata("GSE116127", GSE116127_run_names)
GSE116127_matrix <- get_count_matrix(GSE116127_files)
GSE116127 <- run_dge(GSE116127_matrix, GSE116127_metadata)
GSE116127_translated <- translate_dges(GSE116127)
GSE116127_translated %>%
write_csv('results/GSE116127.csv')