T. Cruzi - Strain expression analysis

Authors

João V. F. Cavalcante

Rodrigo J. S. Dalmolin

1 Analysis of gene expression between two strains of T. cruzi

This dataset was originally generated as part of the following study:

Silveira, Anna Clara Azevedo, de Souza, Iara Dantas, Cavalcante, João Vitor Ferreira, Teixeira, Thaise Lara, Orikaza, Cristina Mary, Dalmolin, Rodrigo Juliani Siqueira, da Silveira, José Franco, da Silva, Claudio Vieira, P21 Ablation Unveils Strain-Specific Transcriptional Reprogramming in Trypanosoma cruzi Amastigotes, International Journal of Microbiology, 2025, 9919200, 12 pages, 2025. https://doi.org/10.1155/ijm/9919200

1.1 Differential gene expression

“Upregulated” here refers to higher expression in the G strain relative to the Y strain. We’ll exclude samples used for the previous study (P21 knockouts).

set.seed(1024)

# --- Load and Merge Counts ---
counts_cepag <- read.table("data/dados_cepag/count_table_star.txt", header = TRUE, row.names = 1)
counts_cepay <- read.table("data/dados_cepay_ref_cepag/count_table_star.txt", header = TRUE, row.names = 1)

# Select relevant columns
cepag_cols <- colnames(counts_cepag)[grep("^cepag", colnames(counts_cepag))]
cepay_cols <- colnames(counts_cepay)[grep("^cepay", colnames(counts_cepay))]

# Subset the data
counts_cepag_filtered <- counts_cepag[, cepag_cols]
counts_cepay_filtered <- counts_cepay[, cepay_cols]

# Ensure row consistency
if (!identical(rownames(counts_cepag_filtered), rownames(counts_cepay_filtered))) {
  common_genes <- intersect(rownames(counts_cepag_filtered), rownames(counts_cepay_filtered))
  counts_cepag_filtered <- counts_cepag_filtered[common_genes, ]
  counts_cepay_filtered <- counts_cepay_filtered[common_genes, ]
}

merged_counts <- cbind(counts_cepag_filtered, counts_cepay_filtered)

# --- DESeq2 Setup ---
sample_names <- colnames(merged_counts)
strains <- sub("_.*", "", sample_names)
colData <- data.frame(row.names = sample_names, strain = as.factor(strains))

dds <- DESeqDataSetFromMatrix(countData = merged_counts,
                              colData = colData,
                              design = ~ strain)

# Pre-filter rows with low counts
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Run DESeq2
dds <- DESeq(dds)

# Save checkpoint
saveRDS(dds, "results/deseq2_dds.rds")

res <- results(dds, contrast=c("strain", "cepag", "cepay"))
res_df <- as.data.frame(res)
res_df <- res_df %>% filter(!is.na(padj))

res_df$gene_id <- rownames(res_df)

res_df$category <- "Not significant"
res_df$category[res_df$log2FoldChange > 0 & res_df$padj < 0.001] <- "Upregulated"
res_df$category[res_df$log2FoldChange < 0 & res_df$padj < 0.001] <- "Downregulated"
res_df$category <- factor(res_df$category, levels = c("Downregulated", "Not significant", "Upregulated"))

write.csv(res_df, "results/tables/deseq2_results.csv")

volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = category)) +
  geom_point(size = 1.0) + 
  scale_color_manual(values = c("Downregulated" = "#3b4992", 
                                "Not significant" = "#bdbdbd", 
                                "Upregulated" = "#ee0000")) + 
  labs(title = "G strain vs Y strain", 
       x = "log2FoldChange",
       y = "-log10 (padj)",
       color = NULL) + 
  theme_light() + 
  theme(
    plot.title = element_text(hjust = 0.5, size = 14), 
    legend.position = "bottom",                        
    legend.justification = "left",                     
    legend.key = element_blank(),                      
    legend.text = element_text(size = 11)              
  ) + 
  guides(color = guide_legend(override.aes = list(size = 3)))

print(volcano_plot)

ggsave("results/figures/volcano_plot_styled.svg", plot = volcano_plot, width = 7, height = 6)

1.2 Functional Enrichment (ORA)

We’ll use the GAF file provided by TriTrypDB.

# Use res_df from memory (no need to read CSV)

all_genes_universe <- rownames(res_df) 
N_universe <- length(all_genes_universe)

upregulated_genes <- res_df %>%
  filter(category == "Upregulated") %>%
  rownames()

downregulated_genes <- res_df %>%
  filter(category == "Downregulated") %>%
  rownames()

# Read the GAF file
gaf_file <- "data/TriTrypDB-64_TcruziG_GO.gaf"
gaf <- read.delim(gaf_file, header = FALSE, comment.char = "!", sep = "\t", quote = "",
                  col.names = c("DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier",
                                "GO_ID", "DB_Reference", "Evidence_Code", "With_From",
                                "Aspect", "DB_Object_Name", "DB_Object_Synonym",
                                "DB_Object_Type", "Taxon", "Date", "Assigned_By",
                                "Annotation_Extension", "Gene_Product_Form_ID"))

go_term_descriptions <- toTable(GOTERM) %>%
  dplyr::select(2, 3) %>%
  distinct()

create_go_list <- function(go_df, desc_df, ontology_code, min_size = 5) {
  go_filtered <- go_df %>% 
    filter(Aspect == ontology_code) %>% 
    dplyr::select(gene = DB_Object_ID, go_id = GO_ID)
  
  ls_go <- split(go_filtered$gene, go_filtered$go_id)
  term_names <- desc_df$Term[match(names(ls_go), desc_df$go_id)]
  names(ls_go) <- term_names
  ls_go <- ls_go[!is.na(names(ls_go))]
  ls_go <- ls_go[sapply(ls_go, function(x) length(unique(x)) >= min_size)]
  return(ls_go)
}

t_cruzi_enrich <- function(ls_go, dge, universe_count) {
  n_terms <- sapply(ls_go, function(x) length(unique(x)))
  n_intersect <- sapply(ls_go, function(x) { length(intersect(x, dge)) })
  n_dge <- length(dge)
  
  intersect_genes <- sapply(ls_go, function(x) { 
    ids <- intersect(x, dge)
    paste0(ids, collapse = "/")
  })
  
  gene_ratio_numeric <- n_intersect / n_dge
  pvalue <- (1 - phyper(n_intersect - 1, n_terms, universe_count - n_terms, n_dge))
  padj <- p.adjust(pvalue, method = "BH")
  
  res <- data.frame(
    description = names(ls_go),
    pvalue = pvalue,
    padj = padj,
    gene_ratio = gene_ratio_numeric, 
    count = n_intersect,
    n_dge = n_dge,
    genes = intersect_genes
  ) %>% 
    arrange(padj) %>% 
    filter(count > 0)
  
  return(res)
}

create_ora_plot <- function(go_enrich_up, go_enrich_down, title) {
  
  if(nrow(go_enrich_up) == 0 & nrow(go_enrich_down) == 0) {
    message(paste("No significant terms found for", title))
    return(NULL)
  }
  
  if(nrow(go_enrich_up) > 0) go_enrich_up <- go_enrich_up %>% mutate(direction = "Upregulated genes")
  if(nrow(go_enrich_down) > 0) go_enrich_down <- go_enrich_down %>% mutate(direction = "Downregulated genes")
  
  go_enrich <- bind_rows(go_enrich_up, go_enrich_down)
  
  go_enrich <- go_enrich %>%
    group_by(direction) %>%
    slice_min(order_by = padj, n = 10) %>% 
    ungroup()
  
  go_enrich$direction <- factor(go_enrich$direction, levels = c("Downregulated genes", "Upregulated genes"))

  p <- ggplot(go_enrich, aes(x = gene_ratio, 
                             y = fct_reorder(description, gene_ratio), 
                             size = count, 
                             col = padj)) +
    geom_point() +
    facet_wrap(~ direction, scales = "free_y") +
    scale_color_gradient(low = "black", high = "#56B1F7", limits = c(0, 0.05), guide = guide_colorbar(reverse = FALSE)) +
    labs(x = "Gene ratio", 
         y = NULL, 
         color = "Adjusted\np-value", 
         size = "Intersection",
         title = title) +
    theme_bw() +
    theme(
      panel.background = element_rect(fill = "white", color = "black"),
      panel.grid.major = element_line(linetype = "dotted", color = "gray60"),
      panel.grid.minor = element_blank(),
      strip.background = element_rect(fill = "gray90", color = "black", size = 1),
      strip.text = element_text(color = "black", face = "plain", size = 10),
      axis.text.y = element_text(size = 10, color = "black"),
      axis.text.x = element_text(size = 9, color = "black"),
      plot.title = element_text(hjust = 0.5, face = "plain"),
      legend.position = "right",
      legend.key = element_blank()
    )
  
  return(p)
}

ls_go_bp <- create_go_list(gaf, go_term_descriptions, "P", min_size = 5)
ls_go_cc <- create_go_list(gaf, go_term_descriptions, "C", min_size = 5)
ls_go_mf <- create_go_list(gaf, go_term_descriptions, "F", min_size = 5)

bp_up <- t_cruzi_enrich(ls_go_bp, upregulated_genes, N_universe) %>% filter(padj <= 0.05)
bp_down <- t_cruzi_enrich(ls_go_bp, downregulated_genes, N_universe) %>% filter(padj <= 0.05)
cc_up <- t_cruzi_enrich(ls_go_cc, upregulated_genes, N_universe) %>% filter(padj <= 0.05)
cc_down <- t_cruzi_enrich(ls_go_cc, downregulated_genes, N_universe) %>% filter(padj <= 0.05)
mf_up <- t_cruzi_enrich(ls_go_mf, upregulated_genes, N_universe) %>% filter(padj <= 0.05)
mf_down <- t_cruzi_enrich(ls_go_mf, downregulated_genes, N_universe) %>% filter(padj <= 0.05)

bp_up |> 
  mutate(gene_group = "Upregulated") |> 
  bind_rows(bp_down |> mutate(gene_group = "Downregulated")) |> 
  write.xlsx("results/tables/Supplementary_Table_2_BP.xlsx")

mf_up |> 
  mutate(gene_group = "Upregulated") |> 
  bind_rows(mf_down |> mutate(gene_group = "Downregulated")) |> 
  write.xlsx("results/tables/Supplementary_Table_3_MF.xlsx")

p_bp <- create_ora_plot(bp_up, bp_down, title = "G strain vs Y strain - Biological Process")
if(!is.null(p_bp)) ggsave("results/figures/bp_enrichment_styled.svg", p_bp, width = 12, height = 7)

p_cc <- create_ora_plot(cc_up, cc_down, title = "G strain vs Y strain - Cellular Component")
if(!is.null(p_cc)) ggsave("results/figures/cc_enrichment_styled.svg", p_cc, width = 12, height = 7)

p_mf <- create_ora_plot(mf_up, mf_down, title = "G strain vs Y strain - Molecular Function")
if(!is.null(p_mf)) ggsave("results/figures/mf_enrichment_styled.svg", p_mf, width = 12, height = 7)

p_bp

p_mf

1.3 Visualize the expression of particular gene families in the DEGs

gtf_path <- "data/refs/TriTrypDB-64_TcruziG.gtf"
gtf_data <- rtracklayer::import(gtf_path, format = "gtf")

gene_info <- data.frame(
  gene_id = gtf_data$ID,
  type = gtf_data$type,
  description = gtf_data$description
) %>% 
  filter(type == "protein_coding_gene") %>% 
  dplyr::select(gene_id, description) %>%
  distinct()

genes_of_interest <- c(
  "trans-sialidase",
  "MASP",
  "mucin-like glycoprotein",
  "mucin",
  "GP63"
)

# Join gene_info with dds rownames
genes_df <- data.frame(gene_id = rownames(dds)) %>%
  left_join(gene_info, by = "gene_id")

genes_df$group <- apply(
  sapply(genes_of_interest, function(x) {
    ifelse(grepl(x, genes_df$description, ignore.case = TRUE), x, NA) 
  }), 1, function(y) {
    matches <- y[!is.na(y)]
    ifelse(length(matches) > 0, matches[1], NA)
  }
)

genes_selected <- genes_df %>% 
  filter(!is.na(group))

norm_counts <- counts(dds, normalized = TRUE)
heatmap_data <- norm_counts[rownames(norm_counts) %in% genes_selected$gene_id, ]
z_scores <- t(apply(heatmap_data, 1, scale, center = TRUE))
colnames(z_scores) <- colnames(heatmap_data)

# Customize Labels
new_col_labels <- colnames(z_scores)
new_col_labels <- gsub("cepag_", "G Strain ", new_col_labels)
new_col_labels <- gsub("cepay_", "Y Strain ", new_col_labels)

annotation_col <- data.frame(row.names = colnames(z_scores))
annotation_col$Strain <- ifelse(grepl("cepag", colnames(z_scores)), "G Strain", "Y Strain")

annotation_row <- genes_selected %>%
  filter(gene_id %in% rownames(z_scores)) %>%
  dplyr::select(gene_id, group) %>%
  tibble::column_to_rownames("gene_id") %>%
  dplyr::rename(Family = group)

# --- Plot ---
hm_obj <- pheatmap(z_scores,
         annotation_col = annotation_col,
         annotation_row = annotation_row,
         labels_col = new_col_labels, 
         annotation_names_col = FALSE, 
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
         show_rownames = FALSE, 
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         silent = TRUE)


svglite("results/figures/heatmap_gene_families.svg", width = 10, height = 12)
grid::grid.draw(hm_obj$gtable)
dev.off()

png 
  2

grid::grid.draw(hm_obj$gtable)

1.4 Save final supplementary table (DGEs with descriptions)

annotated_results <- res_df %>%
  left_join(gene_info, by = "gene_id") %>%
  dplyr::select(gene_id, description, everything())

write.xlsx(annotated_results, "results/tables/Supplementary_Table_1.xlsx")