Gene homology Part 2 - creating directed networks with igraph

2016-12-14 00:00:00 +0000

In my last post I created a gene homology network for human genes. In this post I want to extend the network to include edges for other species.

First, I am loading biomaRt and extract a list of all available datasets.

library(biomaRt)
ensembl = useMart("ensembl")
datasets <- listDatasets(ensembl)

For this network I don’t want to include all species with available information in Biomart because it would get way too messy. So, I am only picking the 11 species below: Human, fruitfly, mouse, C. elegans, dog, zebrafish, chicken, chimp, rat, yeast and pig. For all of these species, I am going to use their respective org.db database to extract gene information.

In order to loop over all species’ org.dbs for installation and loading, I create a new column with the name of the library.

datasets$orgDb <- NA

datasets[grep("hsapiens", datasets$dataset), "orgDb"] <- "org.Hs.eg.db"
datasets[grep("dmel", datasets$dataset), "orgDb"] <- "org.Dm.eg.db"
datasets[grep("mmus", datasets$dataset), "orgDb"] <- "org.Mm.eg.db"
datasets[grep("celegans", datasets$dataset), "orgDb"] <- "org.Ce.eg.db"
datasets[grep("cfam", datasets$dataset), "orgDb"] <- "org.Cf.eg.db"
datasets[grep("drerio", datasets$dataset), "orgDb"] <- "org.Dr.eg.db"
datasets[grep("ggallus", datasets$dataset), "orgDb"] <- "org.Gg.eg.db"
datasets[grep("ptrog", datasets$dataset), "orgDb"] <- "org.Pt.eg.db"
datasets[grep("rnor", datasets$dataset), "orgDb"] <- "org.Rn.eg.db"
datasets[grep("scer", datasets$dataset), "orgDb"] <- "org.Sc.sgd.db"
datasets[grep("sscrofa", datasets$dataset), "orgDb"] <- "org.Ss.eg.db"

datasets <- datasets[!is.na(datasets$orgDb), ]
datasets
##                       dataset                              description
## 9    rnorvegicus_gene_ensembl                     Rat genes (Rnor_6.0)
## 13   scerevisiae_gene_ensembl Saccharomyces cerevisiae genes (R64-1-1)
## 14      celegans_gene_ensembl  Caenorhabditis elegans genes (WBcel235)
## 22  ptroglodytes_gene_ensembl            Chimpanzee genes (CHIMP2.1.4)
## 26       sscrofa_gene_ensembl                  Pig genes (Sscrofa10.2)
## 32      hsapiens_gene_ensembl                  Human genes (GRCh38.p7)
## 36       ggallus_gene_ensembl        Chicken genes (Gallus_gallus-5.0)
## 41        drerio_gene_ensembl                 Zebrafish genes (GRCz10)
## 53 dmelanogaster_gene_ensembl                   Fruitfly genes (BDGP6)
## 61     mmusculus_gene_ensembl                  Mouse genes (GRCm38.p5)
## 69   cfamiliaris_gene_ensembl                    Dog genes (CanFam3.1)
##              version         orgDb
## 9           Rnor_6.0  org.Rn.eg.db
## 13           R64-1-1 org.Sc.sgd.db
## 14          WBcel235  org.Ce.eg.db
## 22        CHIMP2.1.4  org.Pt.eg.db
## 26       Sscrofa10.2  org.Ss.eg.db
## 32         GRCh38.p7  org.Hs.eg.db
## 36 Gallus_gallus-5.0  org.Gg.eg.db
## 41            GRCz10  org.Dr.eg.db
## 53             BDGP6  org.Dm.eg.db
## 61         GRCm38.p5  org.Mm.eg.db
## 69         CanFam3.1  org.Cf.eg.db

Now I can load the Ensembl mart for each species.

for (i in 1:nrow(datasets)) {
  ensembl <- datasets[i, 1]
  assign(paste0(ensembl), useMart("ensembl", dataset = paste0(ensembl)))
}

specieslist <- datasets$dataset

And install all org.db libraries, if necessary.

library(AnnotationDbi)

load_orgDb <- function(orgDb){
  if(!orgDb %in% installed.packages()[,"Package"]){
    source("https://bioconductor.org/biocLite.R")
    biocLite(orgDb)
  }
}

sapply(datasets$orgDb, load_orgDb, simplify = TRUE, USE.NAMES = TRUE)

Once they are all installed, I can load the libraries.

lapply(datasets$orgDb, require, character.only = TRUE)

Because I want to exctract and compare information on homologus genes for all possible combinations of species, they need to have a common identifier. To find them, I first produce a list with all available keytypes and then ask for common elements using rlist’s list.common() function.

keytypes_list <- lapply(datasets$orgDb, function(x) NULL)
names(keytypes_list) <- paste(datasets$orgDb)

for (orgDb in datasets$orgDb){
  keytypes_list[[orgDb]] <- keytypes(get(orgDb))
}

library(rlist)
list.common(keytypes_list)
##  [1] "ENTREZID"    "ENZYME"      "EVIDENCE"    "EVIDENCEALL" "GENENAME"   
##  [6] "GO"          "GOALL"       "ONTOLOGY"    "ONTOLOGYALL" "PATH"       
## [11] "PMID"        "REFSEQ"      "UNIPROT"

Entrez IDs are available for all species, so these are the keys I am using as gene identifiers. I am first creating a table of homologous genes for each species with all other species separately by looping over the datasets table.

for (i in 1:nrow(datasets)){
  orgDbs <- datasets$orgDb[i]
  values <- keys(get(orgDbs), keytype = "ENTREZID")

  ds <- datasets$dataset[i]
  mart <- useMart("ensembl", dataset = paste(ds))
  print(mart)

  if (!is.na(listFilters(mart)$name[grep("^entrezgene$", listFilters(mart)$name)])){
    if (!is.na(listAttributes(mart)$name[grep("^entrezgene$", listAttributes(mart)$name)])){
    print("TRUE")
    for (species in specieslist) {
      print(species)
      if (species != ds){
        assign(paste("homologs", orgDbs, species, sep = "_"), getLDS(attributes = c("entrezgene"),
                                                                     filters = "entrezgene",
                                                                     values = values,
                                                                     mart = mart,
                                                                     attributesL = c("entrezgene"),
                                                                     martL = get(species)))
      }
    }
  }
  }
}

Now I sort and combine these tables into one big table and remove duplicate entries.

library(dplyr)

for (i in 1:nrow(datasets)){
  orgDbs <- datasets$orgDb[i]
  values <- data.frame(GeneID = keys(get(orgDbs), keytype = "ENTREZID"))
  values$GeneID <- as.character(values$GeneID)
  ds <- datasets$dataset[i]

  for (j in 1:length(specieslist)){
    species <- specieslist[j]

    if (j == 1){
      homologs_table <- values
    }

    if (species != ds){
      homologs_species <- get(paste("homologs", orgDbs, species, sep = "_"))
      homologs_species$EntrezGene.ID <- as.character(homologs_species$EntrezGene.ID)

      homologs <- left_join(values, homologs_species, by = c("GeneID" = "EntrezGene.ID"))
      homologs <- homologs[!duplicated(homologs$GeneID), ]
      colnames(homologs)[2] <- paste(species)

      homologs_table <- left_join(homologs_table, homologs, by = "GeneID")
  }
  }

  colnames(homologs_table)[1] <- paste(ds)

  assign(paste("homologs_table", ds, sep = "_"), homologs_table)
}


for (i in 1:nrow(datasets)){
  ds <- datasets$dataset[i]

  if (i == 1){
    homologs_table_combined <- get(paste("homologs_table", ds, sep = "_"))
    homologs_table_combined <- homologs_table_combined[, order(colnames(homologs_table_combined))]
  } else {
    homologs_table_species <- get(paste("homologs_table", ds, sep = "_"))
    homologs_table_species <- homologs_table_species[, order(colnames(homologs_table_species))]

    homologs_table_combined <- rbind(homologs_table_combined, homologs_table_species)
  }
}

homologs_table_combined <- homologs_table_combined[!duplicated(homologs_table_combined), ]

Now each row in the table shows one gene and its homologs in all of the 11 species. Some genes have multiple homologs in other species (e.g. gene 44071 of the fruitfly has two homologs in zebrafish).

head(homologs_table_combined)
##   celegans_gene_ensembl cfamiliaris_gene_ensembl
## 1                    NA                       NA
## 2                    NA                   477699
## 3                    NA                   477023
## 4                173979                   490207
## 5                179795                   607852
## 6                177055                       NA
##   dmelanogaster_gene_ensembl drerio_gene_ensembl ggallus_gene_ensembl
## 1                         NA                  NA                   NA
## 2                      44071                  NA                   NA
## 3                         NA              431754               770094
## 4                      38864              406283                   NA
## 5                      36760              794259               395373
## 6                    3772179              541489               421909
##   hsapiens_gene_ensembl mmusculus_gene_ensembl ptroglodytes_gene_ensembl
## 1                    NA                     NA                        NA
## 2                     2                 232345                    465372
## 3                    30                 235674                    460268
## 4                    34                  11364                    469356
## 5                    47                 104112                    454672
## 6                    52                  11431                    458990
##   rnorvegicus_gene_ensembl scerevisiae_gene_ensembl sscrofa_gene_ensembl
## 1                    24152                       NA                   NA
## 2                    24153                       NA               403166
## 3                    24157                   854646            100515577
## 4                    24158                       NA               397104
## 5                    24159                   854310                   NA
## 6                    24161                   856187            100737301

In order to create the cooccurrence matrix, I am converting gene names to “1” and NAs to “0” and multiply the matrix with its transposed self. This is now the basis for igraph’s graph_from_adjacency_matrix() function with which I’m creating a directed network. Before plotting the network, I am removing multiple entries and loops.

homologs_table_combined_matrix <- as.matrix(ifelse(is.na(homologs_table_combined), 0, 1))

co_occurrence <- t(as.matrix(homologs_table_combined_matrix)) %*% as.matrix(homologs_table_combined_matrix)

library(igraph)
g <- graph_from_adjacency_matrix(co_occurrence,
                                 weighted = TRUE,
                                 diag = FALSE,
                                 mode = "directed")

g <- simplify(g, remove.multiple = TRUE, remove.loops = TRUE)

I am also preparing a node annotation table with colors for each species group and the count of unique gene identifiers for species.

datasets[, 2] <- gsub("(.*)( genes (.*))", "\\1", datasets[, 2])
datasets$description[grep("Saccharomyces", datasets$description)] <- "Yeast"
datasets$description[grep("elegans", datasets$description)] <- "C. elegans"

datasets$group <- ifelse(datasets$description == "Yeast", "fungus",
                         ifelse(datasets$description == "C. elegans", "roundworm",
                                ifelse(datasets$description == "Chicken", "bird",
                                       ifelse(datasets$dataset == "Zebrafish", "fish",
                                              ifelse(datasets$description == "Fruitfly", "insect", "mammal")))))

list <- as.character(unique(datasets$dataset))

library(RColorBrewer)
set3 <- brewer.pal(length(list), "Set3")

datasets$col <- NA
for (i in 1:length(list)){
  datasets[datasets$dataset == list[i], "col"] <- set3[i]
}

no_genes <- as.data.frame(apply(homologs_table_combined, 2, function(x) length(unique(x))))
no_genes$dataset <- rownames(no_genes)
colnames(no_genes)[1] <- "no_genes"

library(dplyr)
datasets <- left_join(datasets, no_genes, by = "dataset")

datasets <- datasets[order(datasets$dataset), ]

I also want to have the proportion of genes that have homologs in each of the respective other species as edge attributes. I am preparing an extra table for this of all possible combinations of nodes.

for (i in 1:ncol(homologs_table_combined)){
  input <- unique(na.omit(homologs_table_combined[, i]))
  homologs_table_combined_subset <- homologs_table_combined[!is.na(homologs_table_combined[, i]), ]

  for (j in 1:ncol(homologs_table_combined)){
    if (i != j){
      output <- unique(na.omit(homologs_table_combined_subset[, j]))
      
      if (i == 1 & j == 2){
        edge_table <- data.frame(source = colnames(homologs_table_combined)[i], 
                                 target = colnames(homologs_table_combined)[j], 
                                 weight = length(output)/length(input))
      } else {
        table <- data.frame(source = colnames(homologs_table_combined)[i], 
                            target = colnames(homologs_table_combined)[j], 
                            weight = length(output)/length(input))
        edge_table <- rbind(edge_table, table)
      }
    }
  }
}

list <- as.character(unique(edge_table$source))

edge_table$col <- NA
for (i in 1:length(list)){
  edge_table[edge_table$source == list[i], "col"] <- set3[i]
}

And finally I can produce the plot:

V(g)$color <- datasets$col
V(g)$label <- datasets$description
V(g)$size <- datasets$no_genes/2000
E(g)$arrow.size <- 3
E(g)$width <- edge_table$weight*25

plot(g,
     vertex.label.font = 1,
     vertex.shape = "sphere",
     vertex.label.cex = 3,
     vertex.label.color = "black",
     vertex.frame.color = NA,
     edge.curved = rep(0.1, ecount(g)),
     edge.color = edge_table$col,
     layout = layout_in_circle)

Node size shows the total number of genes of each species and edge width shows the proportion of these genes with homologs in the respective other species. Edge and node colors show the 11 species and their outgoing edges.

Weirdly, the mouse seems to have more gene entries in the org.db library than human. If anyone knows why that is, please let me know!



## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.1
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] dplyr_0.5.0          RColorBrewer_1.1-2   igraph_1.0.1        
##  [4] rlist_0.4.6.1        org.Cf.eg.db_3.4.0   org.Mm.eg.db_3.4.0  
##  [7] org.Dm.eg.db_3.4.0   org.Dr.eg.db_3.4.0   org.Gg.eg.db_3.4.0  
## [10] org.Hs.eg.db_3.4.0   org.Ss.eg.db_3.4.0   org.Pt.eg.db_3.4.0  
## [13] org.Ce.eg.db_3.4.0   org.Sc.sgd.db_3.4.0  org.Rn.eg.db_3.4.0  
## [16] AnnotationDbi_1.36.0 IRanges_2.8.1        S4Vectors_0.12.1    
## [19] Biobase_2.34.0       BiocGenerics_0.20.0  biomaRt_2.30.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.8       bitops_1.0-6      tools_3.3.2      
##  [4] digest_0.6.10     tibble_1.2        RSQLite_1.1-1    
##  [7] evaluate_0.10     memoise_1.0.0     DBI_0.5-1        
## [10] yaml_2.1.14       stringr_1.1.0     knitr_1.15.1     
## [13] rprojroot_1.1     data.table_1.10.0 R6_2.2.0         
## [16] XML_3.98-1.5      rmarkdown_1.2     magrittr_1.5     
## [19] backports_1.0.4   htmltools_0.3.5   assertthat_0.1   
## [22] stringi_1.1.2     RCurl_1.95-4.8

Creating a network of human gene homology with R and D3

2016-12-11 00:00:00 +0000

Edited on 20 December 2016


This week I want to explore how many of our human genes have homologs in other species. I will use this question to show how to visualize dendrograms and (D3-) networks.

Here, I am looking at gene homologs for all genes of the human genome. In Part 2 I am creating a full network between a subset of the species from this post.


Accessing species information from Biomart

I am using the biomaRt package to access all datasets available in the Biomart Ensembl database. The listDatasets() function shows that currently there is data for 69 species (including humans). This dataframe gives us the name of each dataset in Biomart Ensembl, the respective common species name and version number.

library(biomaRt)
ensembl = useMart("ensembl")
datasets <- listDatasets(ensembl)

human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")

I am then looping over each species to call useMart() and assign an object to each species’ Ensembl database.

for (i in 1:nrow(datasets)) {
  ensembl <- datasets[i, 1]
  assign(paste0(ensembl), useMart("ensembl", dataset = paste0(ensembl)))
}

specieslist <- datasets$dataset


Identifying human gene homologs

Protein coding genes

The majority of human genes are protein coding genes. This means that their DNA sequence will be translated into a protein with specific cellular functions. For an overview of all gene biotypes in the human genome, have a look at this previous post.

To identify all protein coding genes in the human genome, I am using AnnotationDbi and the EnsDb.Hsapiens.v79 database.

library(AnnotationDbi)
library(EnsDb.Hsapiens.v79)

# get all Ensembl gene IDs
human_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "GENEID")

# get the biotype of each gene ID
human_gene_EnsDb <- ensembldb::select(EnsDb.Hsapiens.v79, keys = human_EnsDb, columns = "GENEBIOTYPE", keytype = "GENEID")

# and keep only protein coding genes
human_prot_coding_genes <- human_gene_EnsDb[which(human_gene_EnsDb$GENEBIOTYPE == "protein_coding"), ]

To get a dataframe of human genes and their homologs in other species, I am looping over all Biomart Ensembl databases using biomaRt’s getLDS() function. getLDS() links two datasets and is equivalent to homology mapping with Ensembl. I am also including the human database so that I can see how many genes from EnsDb.Hsapiens.v79 are found via Biomart.

for (species in specieslist) {
  print(species)
  assign(paste0("homologs_human_", species), getLDS(attributes = c("ensembl_gene_id", "chromosome_name"), 
       filters = "ensembl_gene_id", 
       values = human_prot_coding_genes$GENEID, 
       mart = human, 
       attributesL = c("ensembl_gene_id", "chromosome_name"), 
       martL = get(species)))
}

Based on EnsDb.Hsapiens.v79 protein coding genes, I now have 69 individual datasets with homologous genes found in other species. I now want to combine these 69 datasets into one cooccurrence matrix. To do this, I am first joining the homologous gene subsets back to the original dataframe with all protein coding genes. Then, I convert existing homologous gene names to 1 and NAs to 0. These, I then merge into one table. This final table now contains the information whether each gene has a homolog in each of the other species (“1”) or not (“0”).

library(dplyr)

for (i in 1:length(specieslist)){
  species <- specieslist[i]
  homologs_human <- left_join(human_prot_coding_genes, get(paste0("homologs_human_", species)), by = c("GENEID" = "Gene.ID"))
  homologs_human[, paste0(species)] <- ifelse(is.na(homologs_human$Gene.ID.1), 0, 1)
  homologs_human <- homologs_human[, c(1, 6)]
  homologs_human <- homologs_human[!duplicated(homologs_human$GENEID), ]
  
  if (i == 1){
    homologs_human_table <- homologs_human
  } else {
    homologs_human_table <- left_join(homologs_human_table, homologs_human, by = "GENEID")
  }
}

I only want to keep the 21,925 human genes found via Biomart (77 genes were not found).

homologs_human_table <- homologs_human_table[which(homologs_human_table[, grep("sapiens", colnames(homologs_human_table))] == 1), ]

The final steps to creating a cooccurrence matrix are

  • removing the column with Ensembl gene IDs (because we have the Homo sapiens column from Biomart),
  • multiplying with the transposed matrix and
  • setting all rows and columns which don’t show cooccurrences with humans to 0 (because I only looked at homology from the human perspective).
gene_matrix <- homologs_human_table[, -1]

co_occurrence <- t(as.matrix(gene_matrix)) %*% as.matrix(gene_matrix)
co_occurrence[-grep("sapiens", rownames(co_occurrence)), -grep("sapiens", colnames(co_occurrence))] <- 0

The first information I want to extract from this data is how many and what proportion of human genes had a homolog in each of the other species.

genes <- data.frame(organism = colnames(gene_matrix),
                    number_genes = colSums(gene_matrix),
                    proportion_human_genes = colSums(gene_matrix)/nrow(homologs_human_table))

I also want to know which species are similar regarding the gene homology they share with humans. This, I will visualize with a hierarchical clustering dendrogram using the dendextend and circlize packages.

Before I can produce plots however, I want to create annotation attributes for each species:

  • their common name (because most people won’t know off the top of their heads what species T. nigroviridis is) and
  • a grouping attribute, e.g. mammal, fish, bird, etc.

In order to have the correct order of species, I create the annotation dataframe based off the dendrogram.

Most of the common names I can extract from the list of Biomart Ensembl datasets but a few I have to change manually.

library(dendextend)
library(circlize)

# create a dendrogram
h <- hclust(dist(scale(as.matrix(t(gene_matrix))), method = "manhattan"))
dend <- as.dendrogram(h)

library(dplyr)
labels <- as.data.frame(dend %>% labels) %>%
  left_join(datasets, by = c("dend %>% labels" = "dataset")) %>%
  left_join(genes, by = c("dend %>% labels" = "organism"))

labels[, 2] <- gsub("(.*)( genes (.*))", "\\1", labels[, 2])
labels$group <- c(rep("mammal", 2), rep("fish", 8), "amphibia", rep("fish", 4), rep("bird", 5), rep("reptile", 2), rep("mammal", 41), "fungus", "lamprey", rep("seasquirt", 2), "nematode", "insect")

labels$description[grep("hedgehog", labels$description)] <- "Hedgehog Tenrec"
labels$description[grep("Saccharomyces", labels$description)] <- "Yeast"
labels$description[grep("savignyi", labels$description)] <- "C. savignyi"
labels$description[grep("intestinalis", labels$description)] <- "C. intestinalis"
labels$description[grep("elegans", labels$description)] <- "C. elegans"
labels$description[grep("turtle", labels$description)] <- "Turtle"
labels$description[grep("Vervet", labels$description)] <- "Vervet"

The first plot shows the proportion of human genes with a homolog in each of the other species.

library(ggplot2)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "navy"),
    legend.position = "bottom",
    legend.background = element_blank(),
    panel.margin = unit(.5, "lines"),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}

labels_p <- labels[-1, ]

f = as.factor(labels_p[order(labels_p$proportion_human_genes, decreasing = TRUE), "description"])
labels_p <- within(labels_p, description <- factor(description, levels = f))

ggplot(labels_p, aes(x = description, y = proportion_human_genes, fill = group)) +
  geom_bar(stat = "identity") +
  my_theme() +
  labs(
    x = "",
    y = "Proportion of homology",
    fill = "",
    title = "Human gene homology",
    subtitle = "Proportion of human protein coding genes with homologs in 68 other species"
  )

As expected, the highest proportions of homologous genes are found between humans and most other mammals: between 81 and 70% of our genes have a homolog in most mammals. The lowest proportion of homologous genes are found in yeast, but even such a simply organism shares more than 20% of its genes with us. Think about that the next time you enjoy a delicious freshly baked bread or drink a nice cold glass of beer…

However, based on these Biomart Ensembl databases, more of our genes have a homolog in mice than in primates. I would have expected primates to come out on top, but what we see here might be confounded by the different amounts of information we have on species’ genomes and the accuracy of this information: there has been more research done on the mouse than e.g. on the alpaca. We therefore have more and more accurate information on the mouse genome and have identified more homologous genes than in other species.


For plotting the dendrograms, I am adding a column with color specifications for each group to the annotation table.

labels$col <- c(rep("aquamarine4", 2), rep("darkorange3", 8), "blue4", rep("darkorange3", 4), rep("darkred", 5), rep("darkmagenta", 2), rep("aquamarine4", 41), "darkslateblue", "deepskyblue1", rep("deeppink3", 2), "forestgreen", "brown3")

This dendrogram shows the hierarchical clustering of human gene homology with 68 other species. The species are grouped by color (see legend).

labels(dend) <- labels$description
dend %>% 
  set("labels_col", labels$col) %>% 
  set("labels_cex", 2) %>% 
  circlize_dendrogram(labels_track_height = NA, dend_track_height = 0.3) 

legend("topleft", unique(labels$group), pch = 19,
       col = unique(labels$col), pt.cex = 3, cex = 3, bty = "n", ncol = 1)

The most distant group from all other species include yeast, fruit fly, lamprey, C. elegans and the two seasquirt species, followed by tarsier, alpaca, sloth and shrew. The next group includes the fish, reptiles, amphibiens, birds and the platypus. Interestingly, the platypus the Xenopus are more similar to fish than to birds and reptiles.

And finally, we have the remaining mammals: here, the primates cluster nicely together.


In order to show the homology between human genes with the other species, we can also plot a network. The 2D-network is created from the cooccurrence matrix with node size and edge width representing the proportion of homology and colors depicting the species’ group.

# plot the network graph
library(igraph)
g <- graph_from_adjacency_matrix(co_occurrence,
                         weighted = TRUE,
                         diag = FALSE,
                         mode = "undirected")

g <- simplify(g, remove.multiple = F, remove.loops = T, edge.attr.comb = c(weight = "sum", type = "ignore"))

labels_2 <- labels[match(rownames(co_occurrence), labels[, 1]), ]

V(g)$color <- labels_2$col
V(g)$label <- labels_2$description
V(g)$size <- labels_2$proportion_human_genes*25
E(g)$arrow.size <- 0.2
E(g)$edge.color <- "gray80"
E(g)$width <- labels_2$proportion_human_genes*10
plot(g,
     vertex.label.font = 1,
     vertex.shape = "sphere",
     vertex.label.cex = 1,
     vertex.label.color = "white",
     vertex.frame.color = NA)

legend("topleft", unique(labels_2$group), pch = 19,
       col = unique(labels_2$col), pt.cex = 2, cex = 1.5, bty = "n", ncol = 1)

Another nice way to visualize networks is to create a D3 JavaScript network graph with the networkD3 package. It shows the same information as the 2D-network above but here you can interact with the nodes, which makes the species names easier to read.

library(networkD3)

net_d3 <- igraph_to_networkD3(g, group = labels_2$group)
net_d3$nodes <- merge(net_d3$nodes, genes, by.x = "name", by.y = "row.names")
net_d3$nodes$proportion_human_genes <- net_d3$nodes$proportion_human_genes
net_d3$nodes <- merge(net_d3$nodes, labels_2[, 1:2], by.x = "name", by.y = "dend %>% labels")
net_d3$nodes[, 1] <- net_d3$nodes$description

net_d3$nodes <- net_d3$nodes[match(labels_2[, 1], net_d3$nodes$organism), ]

# Create force directed network plot
forceNetwork(Links = net_d3$links, 
             Nodes = net_d3$nodes,
             Nodesize = "proportion_human_genes",
             radiusCalculation = JS(" Math.sqrt(d.nodesize)*15"),
             linkDistance = 100,
             zoom = TRUE, 
             opacity = 0.9,
             Source = 'source', 
             Target = 'target',
             linkWidth = networkD3::JS("function(d) { return d.value/5; }"),
             bounded = FALSE, 
             colourScale = JS("d3.scale.category10()"),
             NodeID = 'name', 
             Group = 'group', 
             fontSize = 10, 
             charge = -200,
             opacityNoHover = 0.7)

Click to open the interactive network and use the mouse to zoom and turn the plot:

networkD3



## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] networkD3_0.2.13         igraph_1.0.1            
##  [3] ggplot2_2.2.0            dplyr_0.5.0             
##  [5] circlize_0.3.9           dendextend_1.3.0        
##  [7] EnsDb.Hsapiens.v79_1.1.0 ensembldb_1.6.0         
##  [9] GenomicFeatures_1.26.0   GenomicRanges_1.26.0    
## [11] GenomeInfoDb_1.10.0      AnnotationDbi_1.36.0    
## [13] IRanges_2.8.0            S4Vectors_0.12.0        
## [15] Biobase_2.34.0           BiocGenerics_0.20.0     
## [17] biomaRt_2.30.0          
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.1                    jsonlite_1.1                 
##  [3] AnnotationHub_2.6.0           shiny_0.14.2                 
##  [5] assertthat_0.1                interactiveDisplayBase_1.12.0
##  [7] Rsamtools_1.26.0              yaml_2.1.14                  
##  [9] robustbase_0.92-6             RSQLite_1.0.0                
## [11] lattice_0.20-34               digest_0.6.10                
## [13] XVector_0.14.0                colorspace_1.3-1             
## [15] htmltools_0.3.5               httpuv_1.3.3                 
## [17] Matrix_1.2-7.1                plyr_1.8.4                   
## [19] XML_3.98-1.5                  zlibbioc_1.20.0              
## [21] xtable_1.8-2                  mvtnorm_1.0-5                
## [23] scales_0.4.1                  whisker_0.3-2                
## [25] BiocParallel_1.8.0            tibble_1.2                   
## [27] SummarizedExperiment_1.4.0    nnet_7.3-12                  
## [29] lazyeval_0.2.0                magrittr_1.5                 
## [31] mime_0.5                      mclust_5.2                   
## [33] evaluate_0.10                 MASS_7.3-45                  
## [35] class_7.3-14                  BiocInstaller_1.24.0         
## [37] tools_3.3.2                   GlobalOptions_0.0.10         
## [39] trimcluster_0.1-2             stringr_1.1.0                
## [41] kernlab_0.9-25                munsell_0.4.3                
## [43] cluster_2.0.5                 fpc_2.1-10                   
## [45] Biostrings_2.42.0             grid_3.3.2                   
## [47] RCurl_1.95-4.8                htmlwidgets_0.8              
## [49] bitops_1.0-6                  labeling_0.3                 
## [51] rmarkdown_1.1                 gtable_0.2.0                 
## [53] flexmix_2.3-13                DBI_0.5-1                    
## [55] R6_2.2.0                      GenomicAlignments_1.10.0     
## [57] knitr_1.15                    prabclus_2.2-6               
## [59] rtracklayer_1.34.0            shape_1.4.2                  
## [61] modeltools_0.2-21             stringi_1.1.2                
## [63] Rcpp_0.12.8                   DEoptimR_1.0-8               
## [65] diptest_0.75-7

How to set up your own R blog with Github pages and Jekyll Bootstrap

2016-12-04 00:00:00 +0000

This post is in reply to a request: How did I set up this R blog?

I have wanted to have my own R blog for a while before I actually went ahead and realised this page. I had seen all the cool things people do with R by following R-bloggers and reading their newsletter every day!

While I am using R every day at work, the thematic scope there is of course very bioinformatics-centric (with a little bit of Machine Learning and lots of visualization and statistics), I wanted to have an incentive to regularly explore other types of analyses and other types of data that I don’t normally work with. I have also very often benefited from other people’s published code in that it gave me ideas for my own work and I hope that sharing my own analyses will inspire others as much as I often am by what can be be done with data.

Moreover, since I’ve started my R blog, I’ve very much enjoyed interacting more with other R-bloggers and R users and getting valuable feedback. I definitely feel more part of the community now that I’m an active contributor!


In this post, I will describe how I go about publishing R code and its output to my blog. I will not go into detail on how to customize your page but only give a quick introduction to setting up a blog with Github pages and JekyllBoostrap. I will focus on how to write content with RMarkdown and RStudio.


Setting up Github pages

I decided to host my blog on Github pages because I was already used to working with Github (it’s where I store my exprAnalysis R package).

Setting up your own Github page is free and very easy! All you need is a Github account. On Github you need to create a new repository called “username.github.io”” (substitute username for your Github user name, in my case: ShirinG.github.io).

I set up the rest via the terminal interface:

  • go to the folder where your repository should live (in this example called “Test”)
  • clone the repository and change to its directory
  • pull from Github to make sure everything is synchronized and up to date between your computer and Github
cd Home/Documents/Test
git clone https://github.com/username/username.github.io
cd username.github.io
git pull origin master


JekyllBoostrap

Github pages also supports Jekyll, an easy to use static site generator perfect for blogging. I set up my blog with JekyllBoostrap, a full blog scaffold for Jekyll based blogs. Follow their Quick Start Guide for instructions on how to install it.

It also gives detailed instructions on how to set up comments, analytics and permalinks.


Customizing your page

Most of the heavy-lifting has already been done for you with JekyllBootstrap but of course there are many options for customizing and adapting your blog’s layout to your liking.

The main file for customizing your page is Jekyll’s “*_config.yml*” file (check back with JekyllBoostrap for details). Here you need to fill it all the relevant information about your page, like title, author, etc.

If you know a little bit about JavaScript, CSS and HTML, you can also directly modify the style sheets in the directory “*_includes*”.

You can change your page’s theme by following this tutorial.

In your page repository’s main folder you have a markdown called “index”. This is your main page and is already set up to show a list of all your blog post.

You can have more pages in the navigation bar if you want, like an “About me”, etc. You simply need to create a new markdown with the respective content in your page repository’s main folder.

You will also automatically get .xml files with your blog’s RSS and Atom feeds.


Preparing R blog posts with RMarkdown

I am generally working in RStudio, so this is where I’m preparing all my blog posts. I’m writing my code in RMarkdown and produce html output files until I deem a blog post finished.

This is how an example Rmarkdown document could look like.

Your Rmarkdown will need a header with title, author, date and output information:

---
title: "This is a test post for my R blog"
author: "Dr. Shirin Glander"
date: '`r Sys.Date()`'
output: html_document
---

Below the header you can write text, insert pictures, create tables and much more. Code chunks will go in between brackets.

```{r, echo=TRUE}
summary(cars)
```

As always, there is much more information on how to work with RMarkdown on the webs.

The final code chunk will look something like this in the output:

library(ggplot2)
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(
    title = "Fuel efficiency generally decreases with engine size",
    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
    caption = "Data from fueleconomy.gov"
  )


Once I’m happy with my finished blog post, I’m changing the output format in the YAML header to create a markdown file (I’m using markdown_github) and knit again.

output: 
  md_document:
    variant: markdown_github

This will produce a markdown document (ending in .md) of your blog post. This markdown is now almost ready to be published but first you need to add a header in the markdown document to the top of your finished markdown for Jekyll to recognize. An example markdown header looks like this:

---
layout: post
title: "This is a test post for my R blog"
date: 2016-12-01
categories: rblogging
tags: test ggplot2
---

It needs to contain the layout definition for post, the title and date. Categories and tags are optional but I like using them to categorize the posts I have written and make them easier to find again.

Now this file is ready to go on your blog.


Publishing a blog post

I’m working in a different directory to produce my posts and test out things (in this example called “blog_prep”), so once I have my markdown file ready (in this example called “example_post.md”), I am going back to the terminal window and copy it to my blog repository’s folder “_post” and rename it so that it has the following format: Date-name.md (of course, you could also manually copy, paste and rename the file).

cp ../blog_prep/example_post.md _posts
mv _posts/example_post.md _posts/2016-11-20-example_post.md

If you also have figures or graphs in your blog post, there will be a folder generated along with your markdown with the name “example_post_files”. You need to copy this folder to your blog repository as well. If you gave your post a category, it will look for the files in a folder with the same name as your category with subfolders for the date. With the example above, which was categorized as “rblogging”, I will need to have/ make a folder with the same name and create subdirectories for the date on which my post is published:

mkdir -p rblogging/2016/12/01
cp ../blog_prep/example_post_files rblogging/2016/12/01


Testing your blog page locally

Before I push my updated blog to Github where it will go live, I’m testing the output locally with Jekyll. In order to be able to run Jekyll locally, you need to follow these excellent guidelines. Once you have Jekyll up and running, all you need to do is to be in your page repository and run jekyll serve.

bundle exec jekyll serve

This will build your page from the contents of your repository and you can preview whether everything looks the way you want it to.


If my new blog post looks okay and I don’t want to change anything any more, I can finally stage, commit and push the updated blog to Github.

git add *
git commit -m "name"
git push origin master


Alternatively, if you’d rather use RStudio for pushing changes to Github, you can also connect your blog repository to RStudio and stage, commit and push from there.


And voilà, your new R blog post is out there for the world to see!

Extreme Gradient Boosting and Preprocessing in Machine Learning - Addendum to predicting flu outcome with R

2016-12-02 00:00:00 +0000

In last week’s post I explored whether machine learning models can be applied to predict flu deaths from the 2013 outbreak of influenza A H7N9 in China. There, I compared random forests, elastic-net regularized generalized linear models, k-nearest neighbors, penalized discriminant analysis, stabilized linear discriminant analysis, nearest shrunken centroids, single C5.0 tree and partial least squares.


Extreme gradient boosting

Extreme gradient boosting (XGBoost) is a faster and improved implementation of gradient boosting for supervised learning and has recently been very successfully applied in Kaggle competitions. Because I’ve heard XGBoost’s praise being sung everywhere lately, I wanted to get my feet wet with it too. So this week I want to compare the prediction success of gradient boosting with the same dataset. Additionally, I want to test the influence of different preprocessing methods on the outcome.


“XGBoost uses a more regularized model formalization to control over-fitting, which gives it better performance.” Tianqi Chen, developer of xgboost

XGBoost is a tree ensemble model, which means the sum of predictions from a set of classification and regression trees (CART). In that, XGBoost is similar to Random Forests but it uses a different approach to model training.


Starting with the same test and training data (partitioned into validation test and validation train subsets) from last week’s post, I am training extreme gradient boosting models as implemented in the xgboost and caret packages with different preprocessing settings.

Out of the different implementations and variations of gradient boosting algorithms, caret performed best on PCA-preprocessed data in the validation set. These paramteres were then used to predict the outcome in the test set and compare it to last week’s predictions.

Compared to last week, there is much less uncertainty in the predictions from XGBoost. Overall, I would say that this algorithm is superior to the others I have used before.


xgboost

Extreme gradient boosting is implemented in the xgboost package.

# install the stable/pre-compiled version from CRAN

install.packages('xgboost')

# or install from weekly updated drat repo

install.packages("drat", repos="https://cran.rstudio.com")
drat:::addRepo("dmlc")
install.packages("xgboost", repos="http://dmlc.ml/drat/", type="source")

XGBoost supports only numbers, so the outcome classes have to be converted into integers and both training and test data have to be in numeric matrix format.

library(xgboost)
matrix_train <- apply(val_train_X, 2, function(x) as.numeric(as.character(x)))
outcome_death_train <- ifelse(val_train_data$outcome == "Death", 1, 0)

matrix_test <- apply(val_test_X, 2, function(x) as.numeric(as.character(x)))
outcome_death_test <- ifelse(val_test_data$outcome == "Death", 1, 0)

xgb_train_matrix <- xgb.DMatrix(data = as.matrix(matrix_train), label = outcome_death_train)
xgb_test_matrix <- xgb.DMatrix(data = as.matrix(matrix_test), label = outcome_death_test)

watchlist <- list(train = xgb_train_matrix, test = xgb_test_matrix)
label <- getinfo(xgb_test_matrix, "label")

I am using cross validation to evaluate the error rate.

param <- list("objective" = "binary:logistic")

xgb.cv(param = param, 
       data = xgb_train_matrix, 
       nfold = 3,
       label = getinfo(xgb_train_matrix, "label"),
       nrounds = 5)
## [1]  train-error:0.133950+0.022131   test-error:0.341131+0.129919 
## [2]  train-error:0.124941+0.033432   test-error:0.358674+0.105191 
## [3]  train-error:0.115932+0.045501   test-error:0.340156+0.092598 
## [4]  train-error:0.124941+0.033432   test-error:0.340156+0.092598 
## [5]  train-error:0.089616+0.051301   test-error:0.394737+0.098465


Training with gbtree

gbtree is the default booster for xgb.train.

bst_1 <- xgb.train(data = xgb_train_matrix, 
                   label = getinfo(xgb_train_matrix, "label"),
                   max.depth = 2, 
                   eta = 1, 
                   nthread = 4, 
                   nround = 50, # number of trees used for model building
                   watchlist = watchlist, 
                   objective = "binary:logistic")
## [1]  train-error:0.250000    test-error:0.347826 
## [2]  train-error:0.178571    test-error:0.304348 
## [3]  train-error:0.142857    test-error:0.347826 
## [4]  train-error:0.089286    test-error:0.260870 
## [5]  train-error:0.053571    test-error:0.347826 
## [6]  train-error:0.053571    test-error:0.391304 
## [7]  train-error:0.035714    test-error:0.347826 
## [8]  train-error:0.000000    test-error:0.391304 
## [9]  train-error:0.017857    test-error:0.391304 
## [10] train-error:0.000000    test-error:0.391304 
## [11] train-error:0.000000    test-error:0.391304 
## [12] train-error:0.000000    test-error:0.347826 
## [13] train-error:0.000000    test-error:0.347826 
## [14] train-error:0.000000    test-error:0.347826 
## [15] train-error:0.000000    test-error:0.347826 
## [16] train-error:0.000000    test-error:0.347826 
## [17] train-error:0.000000    test-error:0.347826 
## [18] train-error:0.000000    test-error:0.347826 
## [19] train-error:0.000000    test-error:0.347826 
## [20] train-error:0.000000    test-error:0.347826 
## [21] train-error:0.000000    test-error:0.347826 
## [22] train-error:0.000000    test-error:0.347826 
## [23] train-error:0.000000    test-error:0.347826 
## [24] train-error:0.000000    test-error:0.347826 
## [25] train-error:0.000000    test-error:0.347826 
## [26] train-error:0.000000    test-error:0.347826 
## [27] train-error:0.000000    test-error:0.347826 
## [28] train-error:0.000000    test-error:0.347826 
## [29] train-error:0.000000    test-error:0.347826 
## [30] train-error:0.000000    test-error:0.347826 
## [31] train-error:0.000000    test-error:0.347826 
## [32] train-error:0.000000    test-error:0.347826 
## [33] train-error:0.000000    test-error:0.347826 
## [34] train-error:0.000000    test-error:0.347826 
## [35] train-error:0.000000    test-error:0.347826 
## [36] train-error:0.000000    test-error:0.347826 
## [37] train-error:0.000000    test-error:0.347826 
## [38] train-error:0.000000    test-error:0.347826 
## [39] train-error:0.000000    test-error:0.347826 
## [40] train-error:0.000000    test-error:0.347826 
## [41] train-error:0.000000    test-error:0.347826 
## [42] train-error:0.000000    test-error:0.347826 
## [43] train-error:0.000000    test-error:0.347826 
## [44] train-error:0.000000    test-error:0.347826 
## [45] train-error:0.000000    test-error:0.347826 
## [46] train-error:0.000000    test-error:0.347826 
## [47] train-error:0.000000    test-error:0.347826 
## [48] train-error:0.000000    test-error:0.347826 
## [49] train-error:0.000000    test-error:0.347826 
## [50] train-error:0.000000    test-error:0.347826

Each feature is grouped by importance with k-means clustering. Gain is the improvement in accuracy that the addition of a feature brings to the branches it is on.

features = colnames(matrix_train)
importance_matrix_1 <- xgb.importance(features, model = bst_1)
print(importance_matrix_1)
##                   Feature       Gain      Cover  Frequency
## 1:                    age 0.54677689 0.45726557 0.43333333
## 2:  days_onset_to_outcome 0.14753698 0.11686537 0.11111111
## 3:            early_onset 0.06724531 0.07365416 0.07777778
## 4:          early_outcome 0.06679880 0.08869426 0.07777778
## 5:      province_Shanghai 0.05325295 0.04317030 0.04444444
## 6: days_onset_to_hospital 0.03883631 0.11300882 0.13333333
## 7:         province_other 0.03502971 0.03977657 0.04444444
## 8:               gender_f 0.02610959 0.02484170 0.01111111
## 9:               hospital 0.01841345 0.04272324 0.06666667
xgb.ggplot.importance(importance_matrix_1) +
  theme_minimal()

pred_1 <- predict(bst_1, xgb_test_matrix)

result_1 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction_p_death = round(pred_1, digits = 2),
                       prediction = as.integer(pred_1 > 0.5),
                       prediction_eval = ifelse(as.integer(pred_1 > 0.5) != label, "wrong", "correct"))
result_1
##     case_ID outcome label prediction_p_death prediction prediction_eval
## 1  case_123   Death     1               0.01          0           wrong
## 2  case_127 Recover     0               0.00          0         correct
## 3  case_128 Recover     0               0.96          1           wrong
## 4   case_14 Recover     0               0.76          1           wrong
## 5   case_19   Death     1               0.95          1         correct
## 6    case_2   Death     1               0.85          1         correct
## 7   case_20 Recover     0               0.98          1           wrong
## 8   case_21 Recover     0               0.06          0         correct
## 9   case_34   Death     1               1.00          1         correct
## 10  case_37 Recover     0               0.00          0         correct
## 11   case_5 Recover     0               0.03          0         correct
## 12  case_51 Recover     0               0.27          0         correct
## 13  case_55 Recover     0               0.11          0         correct
## 14   case_6   Death     1               0.26          0           wrong
## 15  case_61   Death     1               1.00          1         correct
## 16  case_65 Recover     0               0.00          0         correct
## 17  case_74 Recover     0               0.92          1           wrong
## 18  case_78   Death     1               0.03          0           wrong
## 19  case_79 Recover     0               0.00          0         correct
## 20   case_8   Death     1               0.06          0           wrong
## 21  case_87   Death     1               0.64          1         correct
## 22  case_91 Recover     0               0.00          0         correct
## 23  case_94 Recover     0               0.01          0         correct
err <- as.numeric(sum(as.integer(pred_1 > 0.5) != label))/length(label)
print(paste("test-error =", round(err, digits = 2)))
## [1] "test-error = 0.35"


Training with gblinear

bst_2 <- xgb.train(data = xgb_train_matrix, 
                   booster = "gblinear", 
                   label = getinfo(xgb_train_matrix, "label"),
                   max.depth = 2, 
                   eta = 1, 
                   nthread = 4, 
                   nround = 50, # number of trees used for model building
                   watchlist = watchlist, 
                   objective = "binary:logistic")
## [1]  train-error:0.339286    test-error:0.434783 
## [2]  train-error:0.303571    test-error:0.391304 
## [3]  train-error:0.285714    test-error:0.347826 
## [4]  train-error:0.303571    test-error:0.347826 
## [5]  train-error:0.285714    test-error:0.347826 
## [6]  train-error:0.285714    test-error:0.391304 
## [7]  train-error:0.285714    test-error:0.391304 
## [8]  train-error:0.267857    test-error:0.391304 
## [9]  train-error:0.250000    test-error:0.391304 
## [10] train-error:0.250000    test-error:0.478261 
## [11] train-error:0.250000    test-error:0.434783 
## [12] train-error:0.232143    test-error:0.434783 
## [13] train-error:0.232143    test-error:0.434783 
## [14] train-error:0.232143    test-error:0.434783 
## [15] train-error:0.232143    test-error:0.434783 
## [16] train-error:0.232143    test-error:0.434783 
## [17] train-error:0.232143    test-error:0.434783 
## [18] train-error:0.232143    test-error:0.434783 
## [19] train-error:0.232143    test-error:0.434783 
## [20] train-error:0.232143    test-error:0.434783 
## [21] train-error:0.214286    test-error:0.434783 
## [22] train-error:0.214286    test-error:0.434783 
## [23] train-error:0.214286    test-error:0.434783 
## [24] train-error:0.214286    test-error:0.434783 
## [25] train-error:0.214286    test-error:0.434783 
## [26] train-error:0.214286    test-error:0.434783 
## [27] train-error:0.214286    test-error:0.434783 
## [28] train-error:0.214286    test-error:0.434783 
## [29] train-error:0.214286    test-error:0.434783 
## [30] train-error:0.232143    test-error:0.434783 
## [31] train-error:0.214286    test-error:0.434783 
## [32] train-error:0.214286    test-error:0.434783 
## [33] train-error:0.214286    test-error:0.434783 
## [34] train-error:0.214286    test-error:0.434783 
## [35] train-error:0.214286    test-error:0.434783 
## [36] train-error:0.214286    test-error:0.434783 
## [37] train-error:0.214286    test-error:0.434783 
## [38] train-error:0.214286    test-error:0.434783 
## [39] train-error:0.214286    test-error:0.434783 
## [40] train-error:0.214286    test-error:0.434783 
## [41] train-error:0.214286    test-error:0.434783 
## [42] train-error:0.214286    test-error:0.434783 
## [43] train-error:0.214286    test-error:0.434783 
## [44] train-error:0.214286    test-error:0.434783 
## [45] train-error:0.214286    test-error:0.434783 
## [46] train-error:0.214286    test-error:0.434783 
## [47] train-error:0.214286    test-error:0.434783 
## [48] train-error:0.214286    test-error:0.434783 
## [49] train-error:0.214286    test-error:0.434783 
## [50] train-error:0.214286    test-error:0.434783

Each feature is grouped by importance with k-means clustering. Gain is the improvement in accuracy that the addition of a feature brings to the branches it is on.

features = colnames(matrix_train)
importance_matrix_2 <- xgb.importance(features, model = bst_2)
print(importance_matrix_2)
##                    Feature     Weight
##  1:                    age  0.0439012
##  2:               hospital -0.3131160
##  3:               gender_f  0.8067830
##  4:       province_Jiangsu -0.8807380
##  5:      province_Shanghai -0.4065330
##  6:      province_Zhejiang -0.1284590
##  7:         province_other  0.4510330
##  8:  days_onset_to_outcome  0.0209952
##  9: days_onset_to_hospital -0.0856504
## 10:            early_onset  0.6452190
## 11:          early_outcome  2.5134300
xgb.ggplot.importance(importance_matrix_2) +
  theme_minimal()

pred_2 <- predict(bst_2, xgb_test_matrix)

result_2 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction_p_death = round(pred_2, digits = 2),
                       prediction = as.integer(pred_2 > 0.5),
                       prediction_eval = ifelse(as.integer(pred_2 > 0.5) != label, "wrong", "correct"))
result_2
##     case_ID outcome label prediction_p_death prediction prediction_eval
## 1  case_123   Death     1               0.10          0           wrong
## 2  case_127 Recover     0               0.03          0         correct
## 3  case_128 Recover     0               0.13          0         correct
## 4   case_14 Recover     0               0.15          0         correct
## 5   case_19   Death     1               0.28          0           wrong
## 6    case_2   Death     1               0.28          0           wrong
## 7   case_20 Recover     0               0.80          1           wrong
## 8   case_21 Recover     0               0.79          1           wrong
## 9   case_34   Death     1               0.74          1         correct
## 10  case_37 Recover     0               0.06          0         correct
## 11   case_5 Recover     0               0.14          0         correct
## 12  case_51 Recover     0               0.77          1           wrong
## 13  case_55 Recover     0               0.23          0         correct
## 14   case_6   Death     1               0.47          0           wrong
## 15  case_61   Death     1               0.52          1         correct
## 16  case_65 Recover     0               0.03          0         correct
## 17  case_74 Recover     0               0.70          1           wrong
## 18  case_78   Death     1               0.22          0           wrong
## 19  case_79 Recover     0               0.06          0         correct
## 20   case_8   Death     1               0.36          0           wrong
## 21  case_87   Death     1               0.65          1         correct
## 22  case_91 Recover     0               0.05          0         correct
## 23  case_94 Recover     0               0.07          0         correct
err <- as.numeric(sum(as.integer(pred_2 > 0.5) != label))/length(label)
print(paste("test-error =", round(err, digits = 2)))
## [1] "test-error = 0.43"


caret

Extreme gradient boosting is also implemented in the caret package. Caret also provides options for preprocessing, of which I will compare a few.


No preprocessing

library(caret)

set.seed(27)
model_xgb_null <-train(outcome ~ .,
                 data=val_train_data,
                 method="xgbTree",
                 preProcess = NULL,
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
confusionMatrix(predict(model_xgb_null, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       5       4
##    Recover     4      10
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2698          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5556          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3913          
##          Detection Rate : 0.2174          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.6349          
##                                           
##        'Positive' Class : Death           
## 


Scaling and centering

With this method the column variables are centered (subtracting the column mean from each value in a column) and standardized (dividing by the column standard deviation).

set.seed(27)
model_xgb_sc_cen <-train(outcome ~ .,
                 data=val_train_data,
                 method="xgbTree",
                 preProcess = c("scale", "center"),
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
confusionMatrix(predict(model_xgb_sc_cen, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       5       4
##    Recover     4      10
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2698          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5556          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3913          
##          Detection Rate : 0.2174          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.6349          
##                                           
##        'Positive' Class : Death           
## 

Scaling and centering did not improve the predictions.

pred_3 <- predict(model_xgb_sc_cen, val_test_data[, -1])
pred_3b <- round(predict(model_xgb_sc_cen, val_test_data[, -1], type="prob"), digits = 2)

result_3 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction = pred_3,
                       pred_3b)
result_3$prediction_eval <- ifelse(result_3$prediction != result_3$outcome, "wrong", "correct")
result_3
##     case_ID outcome label prediction Death Recover prediction_eval
## 1  case_123   Death     1    Recover  0.04    0.96           wrong
## 2  case_127 Recover     0    Recover  0.04    0.96         correct
## 3  case_128 Recover     0      Death  0.81    0.19           wrong
## 4   case_14 Recover     0      Death  0.70    0.30           wrong
## 5   case_19   Death     1      Death  0.67    0.33         correct
## 6    case_2   Death     1      Death  0.66    0.34         correct
## 7   case_20 Recover     0      Death  0.76    0.24           wrong
## 8   case_21 Recover     0    Recover  0.17    0.83         correct
## 9   case_34   Death     1      Death  0.94    0.06         correct
## 10  case_37 Recover     0    Recover  0.02    0.98         correct
## 11   case_5 Recover     0    Recover  0.09    0.91         correct
## 12  case_51 Recover     0    Recover  0.41    0.59         correct
## 13  case_55 Recover     0    Recover  0.12    0.88         correct
## 14   case_6   Death     1    Recover  0.30    0.70           wrong
## 15  case_61   Death     1      Death  0.99    0.01         correct
## 16  case_65 Recover     0    Recover  0.01    0.99         correct
## 17  case_74 Recover     0      Death  0.85    0.15           wrong
## 18  case_78   Death     1    Recover  0.21    0.79           wrong
## 19  case_79 Recover     0    Recover  0.01    0.99         correct
## 20   case_8   Death     1    Recover  0.20    0.80           wrong
## 21  case_87   Death     1      Death  0.65    0.35         correct
## 22  case_91 Recover     0    Recover  0.01    0.99         correct
## 23  case_94 Recover     0    Recover  0.07    0.93         correct


Box-Cox transformation

The Box-Cox power transformation is used to normalize data.

set.seed(27)
model_xgb_BoxCox <-train(outcome ~ .,
                 data=val_train_data,
                 method="xgbTree",
                 preProcess = "BoxCox",
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
confusionMatrix(predict(model_xgb_BoxCox, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       5       4
##    Recover     4      10
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2698          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5556          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3913          
##          Detection Rate : 0.2174          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.6349          
##                                           
##        'Positive' Class : Death           
## 

Box-Cox transformation did not improve the predictions either.

pred_4 <- predict(model_xgb_BoxCox, val_test_data[, -1])
pred_4b <- round(predict(model_xgb_BoxCox, val_test_data[, -1], type="prob"), digits = 2)

result_4 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction = pred_4,
                       pred_4b)
result_4$prediction_eval <- ifelse(result_4$prediction != result_4$outcome, "wrong", "correct")
result_4
##     case_ID outcome label prediction Death Recover prediction_eval
## 1  case_123   Death     1    Recover  0.01    0.99           wrong
## 2  case_127 Recover     0    Recover  0.01    0.99         correct
## 3  case_128 Recover     0      Death  0.88    0.12           wrong
## 4   case_14 Recover     0      Death  0.81    0.19           wrong
## 5   case_19   Death     1      Death  0.88    0.12         correct
## 6    case_2   Death     1      Death  0.79    0.21         correct
## 7   case_20 Recover     0      Death  0.84    0.16           wrong
## 8   case_21 Recover     0    Recover  0.07    0.93         correct
## 9   case_34   Death     1      Death  0.97    0.03         correct
## 10  case_37 Recover     0    Recover  0.01    0.99         correct
## 11   case_5 Recover     0    Recover  0.06    0.94         correct
## 12  case_51 Recover     0    Recover  0.27    0.73         correct
## 13  case_55 Recover     0    Recover  0.09    0.91         correct
## 14   case_6   Death     1    Recover  0.30    0.70           wrong
## 15  case_61   Death     1      Death  1.00    0.00         correct
## 16  case_65 Recover     0    Recover  0.00    1.00         correct
## 17  case_74 Recover     0      Death  0.90    0.10           wrong
## 18  case_78   Death     1    Recover  0.11    0.89           wrong
## 19  case_79 Recover     0    Recover  0.00    1.00         correct
## 20   case_8   Death     1    Recover  0.13    0.87           wrong
## 21  case_87   Death     1      Death  0.61    0.39         correct
## 22  case_91 Recover     0    Recover  0.00    1.00         correct
## 23  case_94 Recover     0    Recover  0.04    0.96         correct


Principal Component Analysis (PCA)

PCA is used for dimensionality reduction. When applied as a preprocessing method the number of features are reduced by using the eigenvectors of the covariance matrix.

set.seed(27)
model_xgb_pca <-train(outcome ~ .,
                 data=val_train_data,
                 method="xgbTree",
                 preProcess = "pca",
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
confusionMatrix(predict(model_xgb_pca, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       4       2
##    Recover     5      12
##                                           
##                Accuracy : 0.6957          
##                  95% CI : (0.4708, 0.8679)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.2644          
##                                           
##                   Kappa : 0.3207          
##  Mcnemar's Test P-Value : 0.4497          
##                                           
##             Sensitivity : 0.4444          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.6667          
##          Neg Pred Value : 0.7059          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1739          
##    Detection Prevalence : 0.2609          
##       Balanced Accuracy : 0.6508          
##                                           
##        'Positive' Class : Death           
## 

PCA did improve the predictions slightly.

pred_5 <- predict(model_xgb_pca, val_test_data[, -1])
pred_5b <- round(predict(model_xgb_pca, val_test_data[, -1], type="prob"), digits = 2)

result_5 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction = pred_5,
                       pred_5b)
result_5$prediction_eval <- ifelse(result_5$prediction != result_5$outcome, "wrong", "correct")
result_5
##     case_ID outcome label prediction Death Recover prediction_eval
## 1  case_123   Death     1    Recover  0.02    0.98           wrong
## 2  case_127 Recover     0    Recover  0.29    0.71         correct
## 3  case_128 Recover     0    Recover  0.01    0.99         correct
## 4   case_14 Recover     0    Recover  0.44    0.56         correct
## 5   case_19   Death     1    Recover  0.07    0.93           wrong
## 6    case_2   Death     1    Recover  0.26    0.74           wrong
## 7   case_20 Recover     0    Recover  0.16    0.84         correct
## 8   case_21 Recover     0      Death  0.57    0.43           wrong
## 9   case_34   Death     1      Death  0.93    0.07         correct
## 10  case_37 Recover     0    Recover  0.03    0.97         correct
## 11   case_5 Recover     0    Recover  0.11    0.89         correct
## 12  case_51 Recover     0    Recover  0.19    0.81         correct
## 13  case_55 Recover     0    Recover  0.16    0.84         correct
## 14   case_6   Death     1      Death  0.55    0.45         correct
## 15  case_61   Death     1    Recover  0.22    0.78           wrong
## 16  case_65 Recover     0    Recover  0.08    0.92         correct
## 17  case_74 Recover     0      Death  0.67    0.33           wrong
## 18  case_78   Death     1      Death  0.87    0.13         correct
## 19  case_79 Recover     0    Recover  0.16    0.84         correct
## 20   case_8   Death     1    Recover  0.22    0.78           wrong
## 21  case_87   Death     1      Death  0.66    0.34         correct
## 22  case_91 Recover     0    Recover  0.01    0.99         correct
## 23  case_94 Recover     0    Recover  0.14    0.86         correct


Median imputation

set.seed(27)
model_xgb_medianImpute <-train(outcome ~ .,
                 data=val_train_data,
                 method="xgbTree",
                 preProcess = "medianImpute",
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
confusionMatrix(predict(model_xgb_medianImpute, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       5       4
##    Recover     4      10
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2698          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5556          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3913          
##          Detection Rate : 0.2174          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.6349          
##                                           
##        'Positive' Class : Death           
## 

Median imputation did not improve the predictions either.

pred_6 <- predict(model_xgb_medianImpute, val_test_data[, -1])
pred_6b <- round(predict(model_xgb_medianImpute, val_test_data[, -1], type="prob"), digits = 2)

result_6 <- data.frame(case_ID = rownames(val_test_data),
                       outcome = val_test_data$outcome, 
                       label = label, 
                       prediction = pred_6,
                       pred_6b)
result_6$prediction_eval <- ifelse(result_6$prediction != result_6$outcome, "wrong", "correct")
result_6
##     case_ID outcome label prediction Death Recover prediction_eval
## 1  case_123   Death     1    Recover  0.04    0.96           wrong
## 2  case_127 Recover     0    Recover  0.04    0.96         correct
## 3  case_128 Recover     0      Death  0.81    0.19           wrong
## 4   case_14 Recover     0      Death  0.70    0.30           wrong
## 5   case_19   Death     1      Death  0.67    0.33         correct
## 6    case_2   Death     1      Death  0.66    0.34         correct
## 7   case_20 Recover     0      Death  0.76    0.24           wrong
## 8   case_21 Recover     0    Recover  0.17    0.83         correct
## 9   case_34   Death     1      Death  0.94    0.06         correct
## 10  case_37 Recover     0    Recover  0.02    0.98         correct
## 11   case_5 Recover     0    Recover  0.09    0.91         correct
## 12  case_51 Recover     0    Recover  0.41    0.59         correct
## 13  case_55 Recover     0    Recover  0.12    0.88         correct
## 14   case_6   Death     1    Recover  0.30    0.70           wrong
## 15  case_61   Death     1      Death  0.99    0.01         correct
## 16  case_65 Recover     0    Recover  0.01    0.99         correct
## 17  case_74 Recover     0      Death  0.85    0.15           wrong
## 18  case_78   Death     1    Recover  0.21    0.79           wrong
## 19  case_79 Recover     0    Recover  0.01    0.99         correct
## 20   case_8   Death     1    Recover  0.20    0.80           wrong
## 21  case_87   Death     1      Death  0.65    0.35         correct
## 22  case_91 Recover     0    Recover  0.01    0.99         correct
## 23  case_94 Recover     0    Recover  0.07    0.93         correct


Comparison of extreme gradient boosting models

Combining results

library(dplyr)
result <- left_join(result_1[, c(1, 2, 6)], result_2[, c(1, 6)], by = "case_ID")
result <- left_join(result, result_3[, c(1, 7)], by = "case_ID")
result <- left_join(result, result_4[, c(1, 7)], by = "case_ID")
result <- left_join(result, result_5[, c(1, 7)], by = "case_ID")
result <- left_join(result, result_6[, c(1, 7)], by = "case_ID")
colnames(result)[-c(1:2)] <- c("pred_xgboost_gbtree", "pred_xgboost_gblinear", "model_xgb_sc_cen", "model_xgb_BoxCox", "pred_xgbTree_pca", "model_xgb_medianImpute")


What’s the rate of correctly predicted cases in the validation data?

round(sum(result$pred_xgboost_gbtree == "correct")/nrow(result), digits = 2)
## [1] 0.65
round(sum(result$pred_xgboost_gblinear == "correct")/nrow(result), digits = 2)
## [1] 0.57
round(sum(result$model_xgb_sc_cen == "correct")/nrow(result), digits = 2)
## [1] 0.65
round(sum(result$model_xgb_BoxCox == "correct")/nrow(result), digits = 2)
## [1] 0.65
round(sum(result$pred_xgbTree_pca == "correct")/nrow(result), digits = 2)
## [1] 0.7
round(sum(result$model_xgb_medianImpute == "correct")/nrow(result), digits = 2)
## [1] 0.65

PCA preprocessing combined with caret’s implementation of extreme gradient boosting had the highest number of accurately predicted cases in the validation data set.


Predicting unknown output

Because PCA preprocessing and caret’s implementation of extreme gradient boosting had the highest prediction accuracy, this model will be used to predict the unknown cases from the original dataset.

Uncertainty of prediction is accounted for with a cutoff below 0.7.

set.seed(27)
model_xgb_pca <-train(outcome ~ .,
                 data = train_data,
                 method = "xgbTree",
                 preProcess = "pca",
                 trControl = trainControl(method = "repeatedcv", number = 5, repeats = 10, verboseIter = FALSE))
pred <- predict(model_xgb_pca, test_data)
predb <- round(predict(model_xgb_pca, test_data, type="prob"), digits = 2)

result <- data.frame(case_ID = rownames(test_data),
                       prediction = pred,
                       predb)
result$predicted_outcome <- ifelse(result$Death > 0.7, "Death",
                                   ifelse(result$Recover > 0.7, "Recover", "uncertain"))
result
##     case_ID prediction Death Recover predicted_outcome
## 1  case_100    Recover  0.08    0.92           Recover
## 2  case_101    Recover  0.11    0.89           Recover
## 3  case_102    Recover  0.04    0.96           Recover
## 4  case_103    Recover  0.01    0.99           Recover
## 5  case_104    Recover  0.19    0.81           Recover
## 6  case_105    Recover  0.01    0.99           Recover
## 7  case_108      Death  0.99    0.01             Death
## 8  case_109    Recover  0.30    0.70         uncertain
## 9  case_110    Recover  0.16    0.84           Recover
## 10 case_112      Death  0.54    0.46         uncertain
## 11 case_113    Recover  0.02    0.98           Recover
## 12 case_114    Recover  0.30    0.70         uncertain
## 13 case_115    Recover  0.02    0.98           Recover
## 14 case_118      Death  0.65    0.35         uncertain
## 15 case_120      Death  0.89    0.11             Death
## 16 case_122    Recover  0.00    1.00           Recover
## 17 case_126    Recover  0.04    0.96           Recover
## 18 case_130    Recover  0.42    0.58         uncertain
## 19 case_132    Recover  0.47    0.53         uncertain
## 20 case_136      Death  0.86    0.14             Death
## 21  case_15      Death  0.99    0.01             Death
## 22  case_16      Death  0.91    0.09             Death
## 23  case_22      Death  0.80    0.20             Death
## 24  case_28      Death  0.91    0.09             Death
## 25  case_31      Death  1.00    0.00             Death
## 26  case_32      Death  0.93    0.07             Death
## 27  case_38      Death  0.87    0.13             Death
## 28  case_39      Death  0.88    0.12             Death
## 29   case_4      Death  0.99    0.01             Death
## 30  case_40      Death  1.00    0.00             Death
## 31  case_41      Death  0.95    0.05             Death
## 32  case_42    Recover  0.49    0.51         uncertain
## 33  case_47      Death  0.91    0.09             Death
## 34  case_48      Death  0.95    0.05             Death
## 35  case_52      Death  0.62    0.38         uncertain
## 36  case_54    Recover  0.43    0.57         uncertain
## 37  case_56      Death  0.81    0.19             Death
## 38  case_62      Death  0.58    0.42         uncertain
## 39  case_63    Recover  0.44    0.56         uncertain
## 40  case_66    Recover  0.18    0.82           Recover
## 41  case_67    Recover  0.02    0.98           Recover
## 42  case_68    Recover  0.27    0.73           Recover
## 43  case_69    Recover  0.35    0.65         uncertain
## 44  case_70    Recover  0.02    0.98           Recover
## 45  case_71    Recover  0.17    0.83           Recover
## 46  case_80    Recover  0.05    0.95           Recover
## 47  case_84    Recover  0.02    0.98           Recover
## 48  case_85      Death  0.99    0.01             Death
## 49  case_86    Recover  0.02    0.98           Recover
## 50  case_88    Recover  0.02    0.98           Recover
## 51   case_9      Death  0.98    0.02             Death
## 52  case_90    Recover  0.08    0.92           Recover
## 53  case_92    Recover  0.04    0.96           Recover
## 54  case_93    Recover  0.43    0.57         uncertain
## 55  case_95    Recover  0.11    0.89           Recover
## 56  case_96      Death  0.68    0.32         uncertain
## 57  case_99    Recover  0.04    0.96           Recover


Comparison with predicted outcome from last week’s analyses


Plotting predicted outcome

Results from last week’s analysis come from the following part of the analysis:

results <- data.frame(randomForest = predict(model_rf, newdata = test_data, type="prob"),
glmnet = predict(model_glmnet, newdata = test_data, type="prob"),
kknn = predict(model_kknn, newdata = test_data, type="prob"),
pda = predict(model_pda, newdata = test_data, type="prob"),
slda = predict(model_slda, newdata = test_data, type="prob"),
pam = predict(model_pam, newdata = test_data, type="prob"),
C5.0Tree = predict(model_C5.0Tree, newdata = test_data, type="prob"),
pls = predict(model_pls, newdata = test_data, type="prob"))

results$sum_Death <- rowSums(results[, grep("Death", colnames(results))])
results$sum_Recover <- rowSums(results[, grep("Recover", colnames(results))])
results$log2_ratio <- log2(results$sum_Recover/results$sum_Death)
results$predicted_outcome <- ifelse(results$log2_ratio > 1.5, "Recover", ifelse(results$log2_ratio < -1.5, "Death", "uncertain"))
results[, -c(1:16)]

results <- results_last_week

Combining the table with predicted outcomes from this and last week with the original data.

result <- merge(result[, c(1, 5)], results_last_week[, ncol(results_last_week), drop = FALSE], by.x = "case_ID", by.y = "row.names")
colnames(result)[2:3] <- c("predicted_outcome_xgboost", "predicted_outcome_last_week")

results_combined <- merge(result, fluH7N9.china.2013[which(fluH7N9.china.2013$case.ID %in% result$case_ID), ], 
                          by.x = "case_ID", by.y = "case.ID")
results_combined <- results_combined[, -c(6,7)]

For plotting with ggplot2, the dataframe needs to be gathered.

library(tidyr)
results_combined_gather <- results_combined %>%
  gather(group_dates, date, date.of.onset:date.of.hospitalisation)

results_combined_gather$group_dates <- factor(results_combined_gather$group_dates, levels = c("date.of.onset", "date.of.hospitalisation"))

results_combined_gather$group_dates <- mapvalues(results_combined_gather$group_dates, from = c("date.of.onset", "date.of.hospitalisation"), 
                                             to = c("Date of onset", "Date of hospitalisation"))

results_combined_gather$gender <- mapvalues(results_combined_gather$gender, from = c("f", "m"), 
                                             to = c("Female", "Male"))
levels(results_combined_gather$gender) <- c(levels(results_combined_gather$gender), "unknown")
results_combined_gather$gender[is.na(results_combined_gather$gender)] <- "unknown"

results_combined_gather <- results_combined_gather %>%
  gather(group_pred, prediction, predicted_outcome_xgboost:predicted_outcome_last_week)

results_combined_gather$group_pred <- mapvalues(results_combined_gather$group_pred, from = c("predicted_outcome_xgboost", "predicted_outcome_last_week"), 
                                             to = c("Predicted outcome from XGBoost", "Predicted outcome from last week"))

Setting a custom theme for plotting:

library(ggplot2)
my_theme <- function(base_size = 12, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "black"),
    legend.position = "bottom",
    legend.justification = "top", 
    legend.box = "horizontal",
    legend.box.background = element_rect(colour = "grey50"),
    legend.background = element_blank(),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5),
    panel.spacing = unit(1, "lines")
  )
}
results_combined_gather$province <- mapvalues(results_combined_gather$province, 
                                                from = c("Anhui", "Beijing", "Fujian", "Guangdong", "Hebei", "Henan", "Hunan", "Jiangxi", "Shandong", "Taiwan"), 
                                                to = rep("Other", 10))

levels(results_combined_gather$gender) <- c(levels(results_combined_gather$gender), "unknown")
results_combined_gather$gender[is.na(results_combined_gather$gender)] <- "unknown"

results_combined_gather$province <- factor(results_combined_gather$province, levels = c("Jiangsu",  "Shanghai", "Zhejiang", "Other"))
ggplot(data = subset(results_combined_gather, group_dates == "Date of onset"), aes(x = date, y = as.numeric(age), fill = prediction)) +
  stat_density2d(aes(alpha = ..level..), geom = "polygon") +
  geom_jitter(aes(color = prediction, shape = gender), size = 2) +
  geom_rug(aes(color = prediction)) +
  labs(
    fill = "Predicted outcome",
    color = "Predicted outcome",
    alpha = "Level",
    shape = "Gender",
    x = "Date of onset in 2013",
    y = "Age",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Predicted outcome of cases with unknown outcome",
    caption = ""
  ) +
  facet_grid(group_pred ~ province) +
  my_theme() +
  scale_shape_manual(values = c(15, 16, 17)) +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  scale_fill_brewer(palette="Set1")


There is much less uncertainty in the XGBoost data, even tough I used slightly different methods for classifying uncertainty: In last week’s analysis I based uncertainty on the ratio of combined prediction values from all analyses, this week uncertainty is based on the prediction value from one analysis.


For a comparison of feature selection methods see here.


If you are interested in more machine learning posts, check out the category listing for machine_learning.


sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252 LC_NUMERIC=C                           LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_0.6.0     plyr_1.8.4      xgboost_0.6-0   caret_6.0-73    ggplot2_2.2.0   lattice_0.20-34 mice_2.25       Rcpp_0.12.8     dplyr_0.5.0     outbreaks_1.0.0
## 
## loaded via a namespace (and not attached):
##  [1] Ckmeans.1d.dp_3.4.6-4 RColorBrewer_1.1-2    compiler_3.3.2        nloptr_1.0.4          class_7.3-14          iterators_1.0.8       tools_3.3.2           rpart_4.1-10          digest_0.6.10         lme4_1.1-12           evaluate_0.10         tibble_1.2            gtable_0.2.0          nlme_3.1-128          mgcv_1.8-16           Matrix_1.2-7.1        foreach_1.4.3         DBI_0.5-1             parallel_3.3.2        yaml_2.1.14           SparseM_1.74          e1071_1.6-7           stringr_1.1.0         knitr_1.15            MatrixModels_0.4-1    stats4_3.3.2          grid_3.3.2            nnet_7.3-12           data.table_1.9.6      R6_2.2.0              survival_2.40-1       rmarkdown_1.1         minqa_1.2.4           reshape2_1.4.2        car_2.1-3             magrittr_1.5          scales_0.4.1          codetools_0.2-15      ModelMetrics_1.1.0    htmltools_0.3.5       MASS_7.3-45           splines_3.3.2         assertthat_0.1        pbkrtest_0.4-6        colorspace_1.3-0      labeling_0.3          quantreg_5.29         stringi_1.1.2         lazyeval_0.2.0        munsell_0.4.3         chron_2.3-47

Can we predict flu deaths with Machine Learning and R?

2016-11-27 00:00:00 +0000

Edited on 26 December 2016


Among the many R packages, there is the outbreaks package. It contains datasets on epidemics, on of which is from the 2013 outbreak of influenza A H7N9 in China, as analysed by Kucharski et al. (2014):

A. Kucharski, H. Mills, A. Pinsent, C. Fraser, M. Van Kerkhove, C. A. Donnelly, and S. Riley. 2014. Distinguishing between reservoir exposure and human-to-human transmission for emerging pathogens using case onset data. PLOS Currents Outbreaks. Mar 7, edition 1. doi: 10.1371/currents.outbreaks.e1473d9bfc99d080ca242139a06c455f.

A. Kucharski, H. Mills, A. Pinsent, C. Fraser, M. Van Kerkhove, C. A. Donnelly, and S. Riley. 2014. Data from: Distinguishing between reservoir exposure and human-to-human transmission for emerging pathogens using case onset data. Dryad Digital Repository. http://dx.doi.org/10.5061/dryad.2g43n.

I will be using their data as an example to test whether we can use Machine Learning algorithms for predicting disease outcome.

To do so, I selected and extracted features from the raw data, including age, days between onset and outcome, gender, whether the patients were hospitalised, etc. Missing values were imputed and different model algorithms were used to predict outcome (death or recovery). The prediction accuracy, sensitivity and specificity. The thus prepared dataset was devided into training and testing subsets. The test subset contained all cases with an unknown outcome. Before I applied the models to the test data, I further split the training data into validation subsets.

The tested modeling algorithms were similarly successful at predicting the outcomes of the validation data. To decide on final classifications, I compared predictions from all models and defined the outcome “Death” or “Recovery” as a function of all models, whereas classifications with a low prediction probability were flagged as “uncertain”. Accounting for this uncertainty led to a 100% correct classification of the validation test set.

The training cases with unknown outcome were then classified based on the same algorithms. From 57 unknown cases, 14 were classified as “Recovery”, 10 as “Death” and 33 as uncertain.


In a Part 2 I am looking at how extreme gradient boosting performs on this dataset.


The data

The dataset contains case ID, date of onset, date of hospitalisation, date of outcome, gender, age, province and of course the outcome: Death or Recovery. I can already see that there are a couple of missing values in the data, which I will deal with later.

# install and load package
if (!require("outbreaks")) install.packages("outbreaks")
library(outbreaks)
fluH7N9.china.2013_backup <- fluH7N9.china.2013 # back up original dataset in case something goes awry along the way

# convert ? to NAs
fluH7N9.china.2013$age[which(fluH7N9.china.2013$age == "?")] <- NA

# create a new column with case ID
fluH7N9.china.2013$case.ID <- paste("case", fluH7N9.china.2013$case.ID, sep = "_")
head(fluH7N9.china.2013)
##   case.ID date.of.onset date.of.hospitalisation date.of.outcome outcome gender age province
## 1  case_1    2013-02-19                    <NA>      2013-03-04   Death      m  87 Shanghai
## 2  case_2    2013-02-27              2013-03-03      2013-03-10   Death      m  27 Shanghai
## 3  case_3    2013-03-09              2013-03-19      2013-04-09   Death      f  35    Anhui
## 4  case_4    2013-03-19              2013-03-27            <NA>    <NA>      f  45  Jiangsu
## 5  case_5    2013-03-19              2013-03-30      2013-05-15 Recover      f  48  Jiangsu
## 6  case_6    2013-03-21              2013-03-28      2013-04-26   Death      f  32  Jiangsu


Before I start preparing the data for Machine Learning, I want to get an idea of the distribution of the data points and their different variables by plotting.

Most provinces have only a handful of cases, so I am combining them into the category “other” and keep only Jiangsu, Shanghai and Zhejian and separate provinces.

# gather for plotting with ggplot2
library(tidyr)
fluH7N9.china.2013_gather <- fluH7N9.china.2013 %>%
  gather(Group, Date, date.of.onset:date.of.outcome)

# rearrange group order
fluH7N9.china.2013_gather$Group <- factor(fluH7N9.china.2013_gather$Group, levels = c("date.of.onset", "date.of.hospitalisation", "date.of.outcome"))

# rename groups
library(plyr)
fluH7N9.china.2013_gather$Group <- mapvalues(fluH7N9.china.2013_gather$Group, from = c("date.of.onset", "date.of.hospitalisation", "date.of.outcome"), 
          to = c("Date of onset", "Date of hospitalisation", "Date of outcome"))

# renaming provinces
fluH7N9.china.2013_gather$province <- mapvalues(fluH7N9.china.2013_gather$province, 
                                                from = c("Anhui", "Beijing", "Fujian", "Guangdong", "Hebei", "Henan", "Hunan", "Jiangxi", "Shandong", "Taiwan"), 
                                                to = rep("Other", 10))

# add a level for unknown gender
levels(fluH7N9.china.2013_gather$gender) <- c(levels(fluH7N9.china.2013_gather$gender), "unknown")
fluH7N9.china.2013_gather$gender[is.na(fluH7N9.china.2013_gather$gender)] <- "unknown"

# rearrange province order so that Other is the last
fluH7N9.china.2013_gather$province <- factor(fluH7N9.china.2013_gather$province, levels = c("Jiangsu",  "Shanghai", "Zhejiang", "Other"))

# convert age to numeric
fluH7N9.china.2013_gather$age <- as.numeric(as.character(fluH7N9.china.2013_gather$age))
# preparing my ggplot2 theme of choice

library(ggplot2)
my_theme <- function(base_size = 12, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "black"),
    legend.position = "bottom",
    legend.justification = "top", 
    legend.box = "horizontal",
    legend.box.background = element_rect(colour = "grey50"),
    legend.background = element_blank(),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}
# plotting raw data

ggplot(data = fluH7N9.china.2013_gather, aes(x = Date, y = age, fill = outcome)) +
  stat_density2d(aes(alpha = ..level..), geom = "polygon") +
  geom_jitter(aes(color = outcome, shape = gender), size = 1.5) +
  geom_rug(aes(color = outcome)) +
  labs(
    fill = "Outcome",
    color = "Outcome",
    alpha = "Level",
    shape = "Gender",
    x = "Date in 2013",
    y = "Age",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Dataset from 'outbreaks' package (Kucharski et al. 2014)",
    caption = ""
  ) +
  facet_grid(Group ~ province) +
  my_theme() +
  scale_shape_manual(values = c(15, 16, 17)) +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  scale_fill_brewer(palette="Set1")

This plot shows the dates of onset, hospitalisation and outcome (if known) of each data point. Outcome is marked by color and age shown on the y-axis. Gender is marked by point shape.

The density distribution of date by age for the cases seems to indicate that older people died more frequently in the Jiangsu and Zhejiang province than in Shanghai and in other provinces.

When we look at the distribution of points along the time axis, it suggests that their might be a positive correlation between the likelihood of death and an early onset or early outcome.


I also want to know how many cases there are for each gender and province and compare the genders’ age distribution.

fluH7N9.china.2013_gather_2 <- fluH7N9.china.2013_gather[, -4] %>%
  gather(group_2, value, gender:province)

fluH7N9.china.2013_gather_2$value <- mapvalues(fluH7N9.china.2013_gather_2$value, from = c("m", "f", "unknown", "Other"), 
          to = c("Male", "Female", "Unknown gender", "Other province"))

fluH7N9.china.2013_gather_2$value <- factor(fluH7N9.china.2013_gather_2$value, 
                                            levels = c("Female", "Male", "Unknown gender", "Jiangsu", "Shanghai", "Zhejiang", "Other province"))

p1 <- ggplot(data = fluH7N9.china.2013_gather_2, aes(x = value, fill = outcome, color = outcome)) +
  geom_bar(position = "dodge", alpha = 0.7, size = 1) +
  my_theme() +
  scale_fill_brewer(palette="Set1", na.value = "grey50") +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  labs(
    color = "",
    fill = "",
    x = "",
    y = "Count",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Gender and Province numbers of flu cases",
    caption = ""
  )

p2 <- ggplot(data = fluH7N9.china.2013_gather, aes(x = age, fill = outcome, color = outcome)) +
  geom_density(alpha = 0.3, size = 1) +
  geom_rug() +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  scale_fill_brewer(palette="Set1", na.value = "grey50") +
  my_theme() +
  labs(
    color = "",
    fill = "",
    x = "Age",
    y = "Density",
    title = "",
    subtitle = "Age distribution of flu cases",
    caption = ""
  )

library(gridExtra)
library(grid)

grid.arrange(p1, p2, ncol = 2)

In the dataset, there are more male than female cases and correspondingly, we see more deaths, recoveries and unknown outcomes in men than in women. This is potentially a problem later on for modeling because the inherent likelihoods for outcome are not directly comparable between the sexes.

Most unknown outcomes were recorded in Zhejiang. Similarly to gender, we don’t have an equal distribution of data points across provinces either.

When we look at the age distribution it is obvious that people who died tended to be slightly older than those who recovered. The density curve of unknown outcomes is more similar to that of death than of recovery, suggesting that among these people there might have been more deaths than recoveries.


And lastly, I want to plot how many days passed between onset, hospitalisation and outcome for each case.

ggplot(data = fluH7N9.china.2013_gather, aes(x = Date, y = age, color = outcome)) +
  geom_point(aes(shape = gender), size = 1.5, alpha = 0.6) +
  geom_path(aes(group = case.ID)) +
  facet_wrap( ~ province, ncol = 2) +
  my_theme() +
  scale_shape_manual(values = c(15, 16, 17)) +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  scale_fill_brewer(palette="Set1") +
  labs(
    color = "Outcome",
    shape = "Gender",
    x = "Date in 2013",
    y = "Age",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Dataset from 'outbreaks' package (Kucharski et al. 2014)",
    caption = "\nTime from onset of flu to outcome."
  )

This plot shows that there are many missing values in the dates, so it is hard to draw a general conclusion.


Features

In Machine Learning-speak features are the variables used for model training. Using the right features dramatically influences the accuracy of the model.

Because we don’t have many features, I am keeping age as it is, but I am also generating new features:

  • from the date information I am calculating the days between onset and outcome and between onset and hospitalisation
  • I am converting gender into numeric values with 1 for female and 0 for male
  • similarly, I am converting provinces to binary classifiers (yes == 1, no == 0) for Shanghai, Zhejiang, Jiangsu and other provinces
  • the same binary classification is given for whether a case was hospitalised, and whether they had an early onset or early outcome (earlier than the median date)
# preparing the data frame for modeling
# 
library(dplyr)

dataset <- fluH7N9.china.2013 %>%
  mutate(hospital = as.factor(ifelse(is.na(date.of.hospitalisation), 0, 1)),
         gender_f = as.factor(ifelse(gender == "f", 1, 0)),
         province_Jiangsu = as.factor(ifelse(province == "Jiangsu", 1, 0)),
         province_Shanghai = as.factor(ifelse(province == "Shanghai", 1, 0)),
         province_Zhejiang = as.factor(ifelse(province == "Zhejiang", 1, 0)),
         province_other = as.factor(ifelse(province == "Zhejiang" | province == "Jiangsu" | province == "Shanghai", 0, 1)),
         days_onset_to_outcome = as.numeric(as.character(gsub(" days", "",
                                      as.Date(as.character(date.of.outcome), format = "%Y-%m-%d") - 
                                        as.Date(as.character(date.of.onset), format = "%Y-%m-%d")))),
         days_onset_to_hospital = as.numeric(as.character(gsub(" days", "",
                                      as.Date(as.character(date.of.hospitalisation), format = "%Y-%m-%d") - 
                                        as.Date(as.character(date.of.onset), format = "%Y-%m-%d")))),
         age = as.numeric(as.character(age)),
         early_onset = as.factor(ifelse(date.of.onset < summary(fluH7N9.china.2013$date.of.onset)[[3]], 1, 0)),
         early_outcome = as.factor(ifelse(date.of.outcome < summary(fluH7N9.china.2013$date.of.outcome)[[3]], 1, 0))) %>%
  subset(select = -c(2:4, 6, 8))
rownames(dataset) <- dataset$case.ID
dataset <- dataset[, -1]
head(dataset)
##        outcome age hospital gender_f province_Jiangsu province_Shanghai province_Zhejiang province_other days_onset_to_outcome days_onset_to_hospital early_onset early_outcome
## case_1   Death  87        0        0                0                 1                 0              0                    13                     NA           1             1
## case_2   Death  27        1        0                0                 1                 0              0                    11                      4           1             1
## case_3   Death  35        1        1                0                 0                 0              1                    31                     10           1             1
## case_4    <NA>  45        1        1                1                 0                 0              0                    NA                      8           1          <NA>
## case_5 Recover  48        1        1                1                 0                 0              0                    57                     11           1             0
## case_6   Death  32        1        1                1                 0                 0              0                    36                      7           1             1


Imputing missing values

When looking at the dataset I created for modeling, it is obvious that we have quite a few missing values.

The missing values from the outcome column are what I want to predict but for the rest I would either have to remove the entire row from the data or impute the missing information. I decided to try the latter with the mice package.

# impute missing data

library(mice)

dataset_impute <- mice(dataset[, -1],  print = FALSE)
dataset_impute
## Multiply imputed dataset
## Call:
## mice(data = dataset[, -1], printFlag = FALSE)
## Number of multiple imputations:  5
## Missing cells per column:
##                    age               hospital               gender_f       province_Jiangsu      province_Shanghai      province_Zhejiang         province_other  days_onset_to_outcome days_onset_to_hospital            early_onset          early_outcome 
##                      2                      0                      2                      0                      0                      0                      0                     67                     74                     10                     65 
## Imputation methods:
##                    age               hospital               gender_f       province_Jiangsu      province_Shanghai      province_Zhejiang         province_other  days_onset_to_outcome days_onset_to_hospital            early_onset          early_outcome 
##                  "pmm"                     ""               "logreg"                     ""                     ""                     ""                     ""                  "pmm"                  "pmm"               "logreg"               "logreg" 
## VisitSequence:
##                    age               gender_f  days_onset_to_outcome days_onset_to_hospital            early_onset          early_outcome 
##                      1                      3                      8                      9                     10                     11 
## PredictorMatrix:
##                        age hospital gender_f province_Jiangsu province_Shanghai province_Zhejiang province_other days_onset_to_outcome days_onset_to_hospital early_onset early_outcome
## age                      0        1        1                1                 1                 1              1                     1                      1           1             1
## hospital                 0        0        0                0                 0                 0              0                     0                      0           0             0
## gender_f                 1        1        0                1                 1                 1              1                     1                      1           1             1
## province_Jiangsu         0        0        0                0                 0                 0              0                     0                      0           0             0
## province_Shanghai        0        0        0                0                 0                 0              0                     0                      0           0             0
## province_Zhejiang        0        0        0                0                 0                 0              0                     0                      0           0             0
## province_other           0        0        0                0                 0                 0              0                     0                      0           0             0
## days_onset_to_outcome    1        1        1                1                 1                 1              1                     0                      1           1             1
## days_onset_to_hospital   1        1        1                1                 1                 1              1                     1                      0           1             1
## early_onset              1        1        1                1                 1                 1              1                     1                      1           0             1
## early_outcome            1        1        1                1                 1                 1              1                     1                      1           1             0
## Random generator seed value:  NA
# recombine imputed data frame with the outcome column

dataset_complete <- merge(dataset[, 1, drop = FALSE], mice::complete(dataset_impute, 1), by = "row.names", all = TRUE)
rownames(dataset_complete) <- dataset_complete$Row.names
dataset_complete <- dataset_complete[, -1]


Test, train and validation datasets

For building the model, I am separating the imputed data frame into training and test data. Test data are the 57 cases with unknown outcome.

summary(dataset$outcome)
##   Death Recover    NA's 
##      32      47      57


The training data will be further devided for validation of the models: 70% of the training data will be kept for model building and the remaining 30% will be used for model testing.

I am using the caret package for modeling.

train_index <- which(is.na(dataset_complete$outcome))
train_data <- dataset_complete[-train_index, ]
test_data  <- dataset_complete[train_index, -1]

library(caret)
set.seed(27)
val_index <- createDataPartition(train_data$outcome, p = 0.7, list=FALSE)
val_train_data <- train_data[val_index, ]
val_test_data  <- train_data[-val_index, ]
val_train_X <- val_train_data[,-1]
val_test_X <- val_test_data[,-1]


Decision trees

To get an idea about how each feature contributes to the prediction of the outcome, I first built a decision tree based on the training data using rpart and rattle.

library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)

set.seed(27)
fit <- rpart(outcome ~ .,
                    data = train_data,
                    method = "class",
                    control = rpart.control(xval = 10, minbucket = 2, cp = 0), parms = list(split = "information"))

fancyRpartPlot(fit)

This randomly generated decision tree shows that cases with an early outcome were most likely to die when they were 68 or older, when they also had an early onset and when they were sick for fewer than 13 days. If a person was not among the first cases and was younger than 52, they had a good chance of recovering, but if they were 82 or older, they were more likely to die from the flu.


Feature Importance

Not all of the features I created will be equally important to the model. The decision tree already gave me an idea of which features might be most important but I also want to estimate feature importance using a Random Forest approach with repeated cross validation.

# prepare training scheme
control <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# train the model
set.seed(27)
model <- train(outcome ~ ., data = train_data, method = "rf", preProcess = NULL, trControl = control)

# estimate variable importance
importance <- varImp(model, scale=TRUE)
# prepare for plotting
importance_df_1 <- importance$importance
importance_df_1$group <- rownames(importance_df_1)

importance_df_1$group <- mapvalues(importance_df_1$group, 
                                           from = c("age", "hospital2", "gender_f2", "province_Jiangsu2", "province_Shanghai2", "province_Zhejiang2",
                                                    "province_other2", "days_onset_to_outcome", "days_onset_to_hospital", "early_onset2", "early_outcome2" ), 
                                           to = c("Age", "Hospital", "Female", "Jiangsu", "Shanghai", "Zhejiang",
                                                    "Other province", "Days onset to outcome", "Days onset to hospital", "Early onset", "Early outcome" ))
f = importance_df_1[order(importance_df_1$Overall, decreasing = FALSE), "group"]

importance_df_2 <- importance_df_1
importance_df_2$Overall <- 0

importance_df <- rbind(importance_df_1, importance_df_2)

# setting factor levels
importance_df <- within(importance_df, group <- factor(group, levels = f))
importance_df_1 <- within(importance_df_1, group <- factor(group, levels = f))

ggplot() +
  geom_point(data = importance_df_1, aes(x = Overall, y = group, color = group), size = 2) +
  geom_path(data = importance_df, aes(x = Overall, y = group, color = group, group = group), size = 1) +
  scale_color_manual(values = rep(brewer.pal(1, "Set1")[1], 11)) +
  my_theme() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5)) +
  labs(
    x = "Importance",
    y = "",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Scaled feature importance",
    caption = "\nDetermined with Random Forest and
    repeated cross validation (10 repeats, 10 times)"
  )

This tells me that age is the most important determining factor for predicting disease outcome, followed by days between onset an outcome, early outcome and days between onset and hospitalisation.


Feature Plot

Before I start actually building models, I want to check whether the distribution of feature values is comparable between training, validation and test datasets.

# prepare for plotting
dataset_complete_gather <- dataset_complete %>%
  mutate(set = ifelse(rownames(dataset_complete) %in% rownames(test_data), "Test Data", 
                               ifelse(rownames(dataset_complete) %in% rownames(val_train_data), "Validation Train Data",
                                      ifelse(rownames(dataset_complete) %in% rownames(val_test_data), "Validation Test Data", "NA"))),
         case_ID = rownames(.)) %>%
  gather(group, value, age:early_outcome)

dataset_complete_gather$group <- mapvalues(dataset_complete_gather$group, 
                                           from = c("age", "hospital", "gender_f", "province_Jiangsu", "province_Shanghai", "province_Zhejiang",
                                                    "province_other", "days_onset_to_outcome", "days_onset_to_hospital", "early_onset", "early_outcome" ), 
                                           to = c("Age", "Hospital", "Female", "Jiangsu", "Shanghai", "Zhejiang",
                                                    "Other province", "Days onset to outcome", "Days onset to hospital", "Early onset", "Early outcome" ))
ggplot(data = dataset_complete_gather, aes(x = as.numeric(value), fill = outcome, color = outcome)) +
  geom_density(alpha = 0.2) +
  geom_rug() +
  scale_color_brewer(palette="Set1", na.value = "grey50") +
  scale_fill_brewer(palette="Set1", na.value = "grey50") +
  my_theme() +
  facet_wrap(set ~ group, ncol = 11, scales = "free") +
  labs(
    x = "Value",
    y = "Density",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Features for classifying outcome",
    caption = "\nDensity distribution of all features used for classification of flu outcome."
  )
ggplot(subset(dataset_complete_gather, group == "Age" | group == "Days onset to hospital" | group == "Days onset to outcome"), 
       aes(x=outcome, y=as.numeric(value), fill=set)) + geom_boxplot() +
  my_theme() +
  scale_fill_brewer(palette="Set1", type = "div ") +
  facet_wrap( ~ group, ncol = 3, scales = "free") +
  labs(
    fill = "",
    x = "Outcome",
    y = "Value",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Features for classifying outcome",
    caption = "\nBoxplot of the features age, days from onset to hospitalisation and days from onset to outcome."
  )

Luckily, the distributions looks reasonably similar between the validation and test data, except for a few outliers.



Comparing Machine Learning algorithms

Before I try to predict the outcome of the unknown cases, I am testing the models’ accuracy with the validation datasets on a couple of algorithms. I have chosen only a few more well known algorithms, but caret implements many more.

I have chose to not do any preprocessing because I was worried that the different data distributions with continuous variables (e.g. age) and binary variables (i.e. 0, 1 classification of e.g. hospitalisation) would lead to problems.


Random Forest

Random Forests predictions are based on the generation of multiple classification trees.

This model classified 14 out of 23 cases correctly.

set.seed(27)
model_rf <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "rf",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_rf
## Random Forest 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7801905  0.5338408
##    6    0.7650952  0.4926366
##   11    0.7527619  0.4638073
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
confusionMatrix(predict(model_rf, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       4       4
##    Recover     5      10
##                                           
##                Accuracy : 0.6087          
##                  95% CI : (0.3854, 0.8029)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.5901          
##                                           
##                   Kappa : 0.1619          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.4444          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1739          
##    Detection Prevalence : 0.3478          
##       Balanced Accuracy : 0.5794          
##                                           
##        'Positive' Class : Death           
## 


Elastic-Net Regularized Generalized Linear Models

Lasso or elastic net regularization for generalized linear model regression are based on linear regression models and is useful when we have feature correlation in our model.

This model classified 13 out of 23 cases correctly.

set.seed(27)
model_glmnet <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "glmnet",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_glmnet
## glmnet 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        Accuracy   Kappa    
##   0.10   0.0005154913  0.7211905  0.4218952
##   0.10   0.0051549131  0.7189524  0.4189835
##   0.10   0.0515491312  0.7469524  0.4704687
##   0.55   0.0005154913  0.7211905  0.4218952
##   0.55   0.0051549131  0.7236190  0.4280528
##   0.55   0.0515491312  0.7531905  0.4801031
##   1.00   0.0005154913  0.7228571  0.4245618
##   1.00   0.0051549131  0.7278571  0.4361809
##   1.00   0.0515491312  0.7678571  0.5094194
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.05154913.
confusionMatrix(predict(model_glmnet, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       3       4
##    Recover     6      10
##                                           
##                Accuracy : 0.5652          
##                  95% CI : (0.3449, 0.7681)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.7418          
##                                           
##                   Kappa : 0.0496          
##  Mcnemar's Test P-Value : 0.7518          
##                                           
##             Sensitivity : 0.3333          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.4286          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1304          
##    Detection Prevalence : 0.3043          
##       Balanced Accuracy : 0.5238          
##                                           
##        'Positive' Class : Death           
## 


k-Nearest Neighbors

k-nearest neighbors predicts based on point distances with predefined constants.

This model classified 14 out of 23 cases correctly.

set.seed(27)
model_kknn <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "kknn",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_kknn
## k-Nearest Neighbors 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   kmax  Accuracy   Kappa    
##   5     0.6531905  0.2670442
##   7     0.7120476  0.4031836
##   9     0.7106190  0.4004564
## 
## Tuning parameter 'distance' was held constant at a value of 2
## Tuning parameter 'kernel' was held constant at a value of optimal
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were kmax = 7, distance = 2 and kernel = optimal.
confusionMatrix(predict(model_kknn, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       3       3
##    Recover     6      11
##                                           
##                Accuracy : 0.6087          
##                  95% CI : (0.3854, 0.8029)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.5901          
##                                           
##                   Kappa : 0.1266          
##  Mcnemar's Test P-Value : 0.5050          
##                                           
##             Sensitivity : 0.3333          
##             Specificity : 0.7857          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.6471          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1304          
##    Detection Prevalence : 0.2609          
##       Balanced Accuracy : 0.5595          
##                                           
##        'Positive' Class : Death           
## 


Penalized Discriminant Analysis

Penalized Discriminant Analysis is the penalized linear discriminant analysis and is also useful for when we have highly correlated features.

This model classified 14 out of 23 cases correctly.

set.seed(27)
model_pda <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "pda",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pda
## Penalized Discriminant Analysis 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   lambda  Accuracy   Kappa    
##   0e+00         NaN        NaN
##   1e-04   0.7255238  0.4373766
##   1e-01   0.7235238  0.4342554
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was lambda = 1e-04.
confusionMatrix(predict(model_pda, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       4       4
##    Recover     5      10
##                                           
##                Accuracy : 0.6087          
##                  95% CI : (0.3854, 0.8029)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.5901          
##                                           
##                   Kappa : 0.1619          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.4444          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1739          
##    Detection Prevalence : 0.3478          
##       Balanced Accuracy : 0.5794          
##                                           
##        'Positive' Class : Death           
## 


Stabilized Linear Discriminant Analysis

Stabilized Linear Discriminant Analysis is designed for high-dimensional data and correlated co-variables.

This model classified 15 out of 23 cases correctly.

set.seed(27)
model_slda <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "slda",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_slda
## Stabilized Linear Discriminant Analysis 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.6886667  0.3512234
## 
## 
confusionMatrix(predict(model_slda, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       3       2
##    Recover     6      12
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2069          
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.3333          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1304          
##    Detection Prevalence : 0.2174          
##       Balanced Accuracy : 0.5952          
##                                           
##        'Positive' Class : Death           
## 


Nearest Shrunken Centroids

Nearest Shrunken Centroids computes a standardized centroid for each class and shrinks each centroid toward the overall centroid for all classes.

This model classified 15 out of 23 cases correctly.

set.seed(27)
model_pam <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "pam",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pam
## Nearest Shrunken Centroids 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   threshold  Accuracy   Kappa    
##   0.1200215  0.7283333  0.4418904
##   1.7403111  0.6455714  0.2319674
##   3.3606007  0.5904762  0.0000000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was threshold = 0.1200215.
confusionMatrix(predict(model_pam, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       4       3
##    Recover     5      11
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2397          
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.4444          
##             Specificity : 0.7857          
##          Pos Pred Value : 0.5714          
##          Neg Pred Value : 0.6875          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1739          
##    Detection Prevalence : 0.3043          
##       Balanced Accuracy : 0.6151          
##                                           
##        'Positive' Class : Death           
## 


Single C5.0 Tree

C5.0 is another tree-based modeling algorithm.

This model classified 15 out of 23 cases correctly.

set.seed(27)
model_C5.0Tree <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "C5.0Tree",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_C5.0Tree
## Single C5.0 Tree 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7103333  0.4047334
## 
## 
confusionMatrix(predict(model_C5.0Tree, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       5       4
##    Recover     4      10
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2698          
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5556          
##             Specificity : 0.7143          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.3913          
##          Detection Rate : 0.2174          
##    Detection Prevalence : 0.3913          
##       Balanced Accuracy : 0.6349          
##                                           
##        'Positive' Class : Death           
## 


Partial Least Squares

Partial least squares regression combined principal component analysis and multiple regression and is useful for modeling with correlated features.

This model classified 15 out of 23 cases correctly.

set.seed(27)
model_pls <- caret::train(outcome ~ .,
                             data = val_train_data,
                             method = "pls",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pls
## Partial Least Squares 
## 
## 56 samples
## 11 predictors
##  2 classes: 'Death', 'Recover' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 50, 50, 51, 51, 51, 50, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  Accuracy   Kappa    
##   1      0.6198571  0.2112982
##   2      0.6376190  0.2436222
##   3      0.6773810  0.3305780
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was ncomp = 3.
confusionMatrix(predict(model_pls, val_test_data[, -1]), val_test_data$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Death Recover
##    Death       3       2
##    Recover     6      12
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.4273, 0.8362)
##     No Information Rate : 0.6087          
##     P-Value [Acc > NIR] : 0.4216          
##                                           
##                   Kappa : 0.2069          
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.3333          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.6000          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.3913          
##          Detection Rate : 0.1304          
##    Detection Prevalence : 0.2174          
##       Balanced Accuracy : 0.5952          
##                                           
##        'Positive' Class : Death           
## 


Comparing accuracy of models

All models were similarly accurate.

# Create a list of models
models <- list(rf = model_rf, glmnet = model_glmnet, kknn = model_kknn, pda = model_pda, slda = model_slda,
               pam = model_pam, C5.0Tree = model_C5.0Tree, pls = model_pls)

# Resample the models
resample_results <- resamples(models)

# Generate a summary
summary(resample_results, metric = c("Kappa", "Accuracy"))
## 
## Call:
## summary.resamples(object = resample_results, metric = c("Kappa", "Accuracy"))
## 
## Models: rf, glmnet, kknn, pda, slda, pam, C5.0Tree, pls 
## Number of resamples: 100 
## 
## Kappa 
##             Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## rf       -0.3636  0.3214 0.5714 0.5338  0.6800    1    0
## glmnet   -0.3636  0.2768 0.5455 0.5094  0.6667    1    0
## kknn     -0.3636  0.1667 0.5035 0.4032  0.6282    1    0
## pda      -0.6667  0.2431 0.5455 0.4374  0.6667    1    0
## slda     -0.5217  0.0000 0.4308 0.3512  0.6667    1    0
## pam      -0.5000  0.2292 0.5455 0.4419  0.6667    1    0
## C5.0Tree -0.4286  0.1667 0.4167 0.4047  0.6154    1    0
## pls      -0.6667  0.0000 0.3333 0.3306  0.6282    1    0
## 
## Accuracy 
##            Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## rf       0.4000  0.6667 0.8000 0.7802  0.8393    1    0
## glmnet   0.4000  0.6500 0.8000 0.7679  0.8333    1    0
## kknn     0.3333  0.6000 0.7571 0.7120  0.8333    1    0
## pda      0.2000  0.6000 0.8000 0.7255  0.8333    1    0
## slda     0.2000  0.6000 0.6905 0.6887  0.8333    1    0
## pam      0.2000  0.6000 0.8000 0.7283  0.8333    1    0
## C5.0Tree 0.2000  0.6000 0.7143 0.7103  0.8333    1    0
## pls      0.1429  0.5929 0.6667 0.6774  0.8333    1    0
bwplot(resample_results , metric = c("Kappa","Accuracy"))


Combined results of predicting validation test samples

To compare the predictions from all models, I summed up the prediction probabilities for Death and Recovery from all models and calculated the log2 of the ratio between the summed probabilities for Recovery by the summed probabilities for Death. All cases with a log2 ratio bigger than 1.5 were defined as Recover, cases with a log2 ratio below -1.5 were defined as Death, and the remaining cases were defined as uncertain.

results <- data.frame(randomForest = predict(model_rf, newdata = val_test_data[, -1], type="prob"),
                      glmnet = predict(model_glmnet, newdata = val_test_data[, -1], type="prob"),
                      kknn = predict(model_kknn, newdata = val_test_data[, -1], type="prob"),
                      pda = predict(model_pda, newdata = val_test_data[, -1], type="prob"),
                      slda = predict(model_slda, newdata = val_test_data[, -1], type="prob"),
                      pam = predict(model_pam, newdata = val_test_data[, -1], type="prob"),
                      C5.0Tree = predict(model_C5.0Tree, newdata = val_test_data[, -1], type="prob"),
                      pls = predict(model_pls, newdata = val_test_data[, -1], type="prob"))

results$sum_Death <- rowSums(results[, grep("Death", colnames(results))])
results$sum_Recover <- rowSums(results[, grep("Recover", colnames(results))])
results$log2_ratio <- log2(results$sum_Recover/results$sum_Death)
results$true_outcome <- val_test_data$outcome
results$pred_outcome <- ifelse(results$log2_ratio > 1.5, "Recover", ifelse(results$log2_ratio < -1.5, "Death", "uncertain"))
results$prediction <- ifelse(results$pred_outcome == results$true_outcome, "CORRECT", 
                             ifelse(results$pred_outcome == "uncertain", "uncertain", "wrong"))
results[, -c(1:16)]
##          sum_Death sum_Recover  log2_ratio true_outcome pred_outcome prediction
## case_123 2.2236374    5.776363  1.37723972        Death    uncertain  uncertain
## case_127 0.7522674    7.247733  3.26821230      Recover      Recover    CORRECT
## case_128 2.4448101    5.555190  1.18411381      Recover    uncertain  uncertain
## case_14  2.9135620    5.086438  0.80387172      Recover    uncertain  uncertain
## case_19  2.2113377    5.788662  1.38831062        Death    uncertain  uncertain
## case_2   3.6899508    4.310049  0.22410277        Death    uncertain  uncertain
## case_20  4.5621189    3.437881 -0.40818442      Recover    uncertain  uncertain
## case_21  4.9960238    3.003976 -0.73390698      Recover    uncertain  uncertain
## case_34  6.1430233    1.856977 -1.72599312        Death        Death    CORRECT
## case_37  0.8717595    7.128241  3.03154401      Recover      Recover    CORRECT
## case_5   1.7574945    6.242505  1.82860496      Recover      Recover    CORRECT
## case_51  3.8917736    4.108226  0.07808791      Recover    uncertain  uncertain
## case_55  3.1548368    4.845163  0.61897986      Recover    uncertain  uncertain
## case_6   3.5163388    4.483661  0.35060317        Death    uncertain  uncertain
## case_61  5.0803241    2.919676 -0.79911232        Death    uncertain  uncertain
## case_65  1.1282313    6.871769  2.60661863      Recover      Recover    CORRECT
## case_74  5.3438427    2.656157 -1.00853699      Recover    uncertain  uncertain
## case_78  3.2183947    4.781605  0.57115378        Death    uncertain  uncertain
## case_79  1.9521617    6.047838  1.63134700      Recover      Recover    CORRECT
## case_8   3.6106045    4.389395  0.28178184        Death    uncertain  uncertain
## case_87  4.9712879    3.028712 -0.71491522        Death    uncertain  uncertain
## case_91  1.8648837    6.135116  1.71800508      Recover      Recover    CORRECT
## case_94  2.1198087    5.880191  1.47192901      Recover    uncertain  uncertain

All predictions based on all models were either correct or uncertain.


Predicting unknown outcomes

The above models will now be used to predict the outcome of cases with unknown fate.

set.seed(27)
model_rf <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "rf",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_glmnet <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "glmnet",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_kknn <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "kknn",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pda <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "pda",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_slda <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "slda",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pam <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "pam",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_C5.0Tree <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "C5.0Tree",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))
model_pls <- caret::train(outcome ~ .,
                             data = train_data,
                             method = "pls",
                             preProcess = NULL,
                             trControl = trainControl(method = "repeatedcv", number = 10, repeats = 10, verboseIter = FALSE))

models <- list(rf = model_rf, glmnet = model_glmnet, kknn = model_kknn, pda = model_pda, slda = model_slda,
               pam = model_pam, C5.0Tree = model_C5.0Tree, pls = model_pls)

# Resample the models
resample_results <- resamples(models)

bwplot(resample_results , metric = c("Kappa","Accuracy"))

Here again, the accuracy is similar in all models.


The final results are calculate as described above.

results <- data.frame(randomForest = predict(model_rf, newdata = test_data, type="prob"),
                      glmnet = predict(model_glmnet, newdata = test_data, type="prob"),
                      kknn = predict(model_kknn, newdata = test_data, type="prob"),
                      pda = predict(model_pda, newdata = test_data, type="prob"),
                      slda = predict(model_slda, newdata = test_data, type="prob"),
                      pam = predict(model_pam, newdata = test_data, type="prob"),
                      C5.0Tree = predict(model_C5.0Tree, newdata = test_data, type="prob"),
                      pls = predict(model_pls, newdata = test_data, type="prob"))

results$sum_Death <- rowSums(results[, grep("Death", colnames(results))])
results$sum_Recover <- rowSums(results[, grep("Recover", colnames(results))])
results$log2_ratio <- log2(results$sum_Recover/results$sum_Death)
results$predicted_outcome <- ifelse(results$log2_ratio > 1.5, "Recover", ifelse(results$log2_ratio < -1.5, "Death", "uncertain"))
results[, -c(1:16)]
##          sum_Death sum_Recover  log2_ratio predicted_outcome
## case_100 3.6707808    4.329219  0.23801986         uncertain
## case_101 6.3194220    1.680578 -1.91083509             Death
## case_102 6.4960471    1.503953 -2.11080274             Death
## case_103 2.3877223    5.612278  1.23295134         uncertain
## case_104 4.1254604    3.874540 -0.09053024         uncertain
## case_105 2.1082161    5.891784  1.48268174         uncertain
## case_108 6.5941436    1.405856 -2.22973606             Death
## case_109 2.2780779    5.721922  1.32868278         uncertain
## case_110 1.7853361    6.214664  1.79948072           Recover
## case_112 4.5102439    3.489756 -0.37007925         uncertain
## case_113 4.7047554    3.295245 -0.51373420         uncertain
## case_114 1.6397105    6.360290  1.95565132           Recover
## case_115 1.2351519    6.764848  2.45336902           Recover
## case_118 1.4675571    6.532443  2.15420601           Recover
## case_120 1.9322149    6.067785  1.65091447           Recover
## case_122 1.1165786    6.883421  2.62404109           Recover
## case_126 5.4933927    2.506607 -1.13196145         uncertain
## case_130 4.3035732    3.696427 -0.21940367         uncertain
## case_132 5.0166184    2.983382 -0.74976671         uncertain
## case_136 2.6824566    5.317543  0.98720505         uncertain
## case_15  5.8545370    2.145463 -1.44826607         uncertain
## case_16  6.5063794    1.493621 -2.12304122             Death
## case_22  4.2929861    3.707014 -0.21172398         uncertain
## case_28  6.6129447    1.387055 -2.25326754             Death
## case_31  6.2625800    1.737420 -1.84981057             Death
## case_32  6.0986316    1.901368 -1.68144746             Death
## case_38  1.7581540    6.241846  1.82791134           Recover
## case_39  2.4765188    5.523481  1.15726428         uncertain
## case_4   5.1309910    2.869009 -0.83868503         uncertain
## case_40  6.6069297    1.393070 -2.24571191             Death
## case_41  5.5499144    2.450086 -1.17963336         uncertain
## case_42  2.0709160    5.929084  1.51754019           Recover
## case_47  6.1988812    1.801119 -1.78311450             Death
## case_48  5.6500772    2.349923 -1.26565724         uncertain
## case_52  5.9096543    2.090346 -1.49933214         uncertain
## case_54  2.6763066    5.323693  0.99218409         uncertain
## case_56  2.6826414    5.317359  0.98705557         uncertain
## case_62  6.0722552    1.927745 -1.65531833             Death
## case_63  3.9541381    4.045862  0.03308379         uncertain
## case_66  5.8320010    2.167999 -1.42762690         uncertain
## case_67  2.1732059    5.826794  1.42287745         uncertain
## case_68  4.3464174    3.653583 -0.25051491         uncertain
## case_69  1.8902845    6.109715  1.69250181           Recover
## case_70  4.5986242    3.401376 -0.43508393         uncertain
## case_71  2.7966938    5.203306  0.89570626         uncertain
## case_80  2.7270986    5.272901  0.95123017         uncertain
## case_84  1.9485253    6.051475  1.63490411           Recover
## case_85  3.0076953    4.992305  0.73104755         uncertain
## case_86  2.0417802    5.958220  1.54505376           Recover
## case_88  0.7285419    7.271458  3.31916083           Recover
## case_9   2.2188620    5.781138  1.38153353         uncertain
## case_90  1.3973262    6.602674  2.24038155           Recover
## case_92  5.0994993    2.900501 -0.81405366         uncertain
## case_93  4.3979048    3.602095 -0.28798006         uncertain
## case_95  3.5561792    4.443821  0.32147260         uncertain
## case_96  1.3359082    6.664092  2.31858744           Recover
## case_99  5.0776686    2.922331 -0.79704644         uncertain

From 57 cases, 14 were defined as Recover, 10 as Death and 33 as uncertain.

results_combined <- merge(results[, -c(1:16)], fluH7N9.china.2013[which(fluH7N9.china.2013$case.ID %in% rownames(results)), ], 
                          by.x = "row.names", by.y = "case.ID")
results_combined <- results_combined[, -c(2, 3, 8, 9)]
results_combined_gather <- results_combined %>%
  gather(group_dates, date, date.of.onset:date.of.hospitalisation)

results_combined_gather$group_dates <- factor(results_combined_gather$group_dates, levels = c("date.of.onset", "date.of.hospitalisation"))

results_combined_gather$group_dates <- mapvalues(results_combined_gather$group_dates, from = c("date.of.onset", "date.of.hospitalisation"), 
                                             to = c("Date of onset", "Date of hospitalisation"))

results_combined_gather$gender <- mapvalues(results_combined_gather$gender, from = c("f", "m"), 
                                             to = c("Female", "Male"))
levels(results_combined_gather$gender) <- c(levels(results_combined_gather$gender), "unknown")
results_combined_gather$gender[is.na(results_combined_gather$gender)] <- "unknown"
results_combined_gather$age <- as.numeric(as.character(results_combined_gather$age))
ggplot(data = results_combined_gather, aes(x = date, y = log2_ratio, color = predicted_outcome)) +
  geom_jitter(aes(size = age), alpha = 0.3) +
  geom_rug() +
  facet_grid(gender ~ group_dates) +
  labs(
    color = "Predicted outcome",
    size = "Age",
    x = "Date in 2013",
    y = "log2 ratio of prediction Recover vs Death",
    title = "2013 Influenza A H7N9 cases in China",
    subtitle = "Predicted outcome",
    caption = ""
  ) +
  my_theme() +
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1")

The comparison of date of onset, data of hospitalisation, gender and age with predicted outcome shows that predicted deaths were associated with older age than predicted Recoveries. Date of onset does not show an obvious bias in either direction.


Conclusions

This dataset posed a couple of difficulties to begin with, like unequal distribution of data points across variables and missing data. This makes the modeling inherently prone to flaws. However, real life data isn’t perfect either, so I went ahead and tested the modeling success anyway.

By accounting for uncertain classification with low predictions probability, the validation data could be classified accurately. However, for a more accurate model, these few cases don’t give enough information to reliably predict the outcome. More cases, more information (i.e. more features) and fewer missing data would improve the modeling outcome.

Also, this example is only applicable for this specific case of flu. In order to be able to draw more general conclusions about flu outcome, other cases and additional information, for example on medical parameters like preexisting medical conditions, disase parameters, demographic information, etc. would be necessary.

All in all, this dataset served as a nice example of the possibilities (and pitfalls) of machine learning applications and showcases a basic workflow for building prediction models with R.


For a comparison of feature selection methods see here.

If you see any mistakes or have tips and tricks for improvement, please don’t hesitate to let me know! Thanks. :-)


If you are interested in more machine learning posts, check out the category listing for machine_learning.


sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252 LC_NUMERIC=C                           LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pls_2.5-0           C50_0.1.0-24        pamr_1.55           survival_2.40-1     cluster_2.0.5       ipred_0.9-5         mda_0.4-9           class_7.3-14        kknn_1.3.1          glmnet_2.0-5        foreach_1.4.3       Matrix_1.2-7.1      randomForest_4.6-12 RColorBrewer_1.1-2  rpart.plot_2.1.0    rattle_4.1.0        rpart_4.1-10        caret_6.0-73        lattice_0.20-34     mice_2.25           Rcpp_0.12.7         dplyr_0.5.0         gridExtra_2.2.1     ggplot2_2.2.0       plyr_1.8.4          tidyr_0.6.0         outbreaks_1.0.0    
## 
## loaded via a namespace (and not attached):
##  [1] RGtk2_2.20.31      assertthat_0.1     digest_0.6.10      R6_2.2.0           MatrixModels_0.4-1 stats4_3.3.2       evaluate_0.10      e1071_1.6-7        lazyeval_0.2.0     minqa_1.2.4        SparseM_1.74       car_2.1-3          nloptr_1.0.4       partykit_1.1-1     rmarkdown_1.1      labeling_0.3       splines_3.3.2      lme4_1.1-12        stringr_1.1.0      igraph_1.0.1       munsell_0.4.3      compiler_3.3.2     mgcv_1.8-16        htmltools_0.3.5    nnet_7.3-12        tibble_1.2         prodlim_1.5.7      codetools_0.2-15   MASS_7.3-45        ModelMetrics_1.1.0 nlme_3.1-128       gtable_0.2.0       DBI_0.5-1          magrittr_1.5       scales_0.4.1       stringi_1.1.2      reshape2_1.4.2     Formula_1.2-1      lava_1.4.5         iterators_1.0.8    tools_3.3.2        parallel_3.3.2     pbkrtest_0.4-6     yaml_2.1.14        colorspace_1.3-0   knitr_1.15         quantreg_5.29

Analysing the Gilmore Girls' coffee addiction with R

2016-11-20 00:00:00 +0000

Last week’s post showed how to create a Gilmore Girls character network.

In this week’s short post, I want to explore the Gilmore Girls’ famous coffee addiction by analysing the same episode transcripts that were also used last week.

I am also showcasing how to use the recently updated ggplot2 2.2.0.

The transcripts were prepared as described in last week’s post. Full R code can be found below.


How often do they talk about coffee?

The first question I want to explore is how often Lorelai, Rory, Luke, Sookie, Lane and Paris talked about coffee. For this, I subsetted the lines of these characters to where they talk about coffee (i.e. lines that include the word “coffee” at least once). I then created a plot showing how often they talk about coffee per episode and season.



The differences in coffee mention between characters are of course somewhat biased by differences in the total number of lines of each character in the respective episodes, as well by the length of the episodes.

This led me to wonder what proportion of each episode was occupied by which character.





R code

# set names
names <- c("LORELAI", "RORY", "LUKE", "SOOKIE", "LANE", "PARIS")

# extract lines for each character in names
for (name in names){
  assign(paste("lines", name, sep = "_"), transcripts_2[grep(paste0(name), transcripts_2$character), ])
}

# extract lines not from characters in names
lines_OTHER <- transcripts_2[!grepl("LORELAI|RORY|LUKE|SOOKIE|LANE|PARIS", transcripts_2$character), ]

# update names to include "other"
names <- c(names, "OTHER")
# which lines mention coffee.
for (name in names){
  df <- get(paste("lines", name, sep = "_"))
  assign(paste("coffee", name, sep = "_"), df[grep("coffee", df$dialogue), ])
}
# calculate number of lines in total and with coffee mentions per episode
library(dplyr)

for (name in names){
  lines_p_ep <- get(paste("lines", name, sep = "_"))
  lines_p_ep <- as.data.frame(table(lines_p_ep$episode))
  
  df <- get(paste("coffee", name, sep = "_"))
  df_2 <- as.data.frame(table(df$episode))
  
  # calculate coffee lines per episode
  coffee_lines_p_ep <- full_join(lines_p_ep, df_2, by = "Var1")
  coffee_lines_p_ep$ratio_p_ep <- coffee_lines_p_ep$Freq.y/coffee_lines_p_ep$Freq.x
  
  coffee_lines_p_ep$Season <- gsub("_.*", "", coffee_lines_p_ep$Var1)
  coffee_lines_p_ep$character <- paste(name)
  
  assign(paste("coffee_lines_p_ep", name, sep = "_"), coffee_lines_p_ep)
}

coffee_df <- rbind(coffee_lines_p_ep_LORELAI, coffee_lines_p_ep_RORY, coffee_lines_p_ep_LUKE, coffee_lines_p_ep_SOOKIE, 
                   coffee_lines_p_ep_LANE, coffee_lines_p_ep_PARIS)
library(ggplot2)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.5),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "black"),
    legend.position = "bottom",
    legend.justification = "top", 
    legend.box = "horizontal",
    legend.box.background = element_rect(colour = "grey50"),
    legend.background = element_blank(),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}
library(tidyr)

coffee_df_gather <- coffee_df %>% 
  gather(group, value, Freq.x:ratio_p_ep)

coffee_df_gather[which(coffee_df_gather$group == "Freq.x"), "group"] <- "Lines per episode"
coffee_df_gather[which(coffee_df_gather$group == "Freq.y"), "group"] <- "Coffee lines per episode"
coffee_df_gather[which(coffee_df_gather$group == "ratio_p_ep"), "group"] <- "Ratio of coffee lines per episode"

coffee_df_gather$character <- factor(coffee_df_gather$character, levels = names)
coffee_df_gather$group <- factor(coffee_df_gather$group, levels = c("Lines per episode", "Coffee lines per episode", "Ratio of coffee lines per episode"))

ggplot(data = coffee_df_gather, aes(x = character, y = value, color = Season, fill = Season)) +
  geom_boxplot(alpha = 0.5) +
  labs(
    x = "",
    y = "Number or ratio of lines",
    title = "Lorelai & Luke are the coffee queen and king",
    subtitle = "Coffee mentions per episode and season",
    caption = "\nThese boxplots show the number of total lines per episode and season, lines with coffee mentions per epsiode and season,
     as well as the ratio between the two for the main characters Lorelai, Rory, Luke, Sookie, Lane and Paris. Lorelai consistently had
     the most lines per episode, followed closely by Rory and Luke. Sookie, Lane and Paris had roughly the same numbers of lines per episode,
     although their variance is higher. The same trend is reflected by the number of lines with coffee mentions, except that Rory and Luke seem
     to talk similarly often about coffee. Interestingly, Lorelai's (verbal) coffee obsession seems to have decreased slightly over the seasons.
     The ratio of coffee lines divided by the total number of lines for each episode reflects that even though Lorelai, Rory and Luke talk a lot about coffee,
     they also talk a lot in general, so their ratio of coffee mentions is only slightly higher than Sookie's, Lane's and Paris'. 
     While the latter's mean ratio is lower. they have a much higher variance with a few episodes where they seem to have talked relatively much about coffee.
     During the first seasons, Luke is the character with the highest ratio of coffee vs general talk but this is decreasing over time and becomes more
     similar to Lorelai and Rory's coffee talk ratio."
  ) +
  my_theme() +
  facet_wrap(~ group, ncol = 3, scales = "free") +
  guides(fill = guide_legend(nrow = 1, byrow = TRUE)) +
  scale_fill_brewer(palette="Dark2") +
  scale_color_brewer(palette="Dark2")
# calculate number of lines in total
for (name in names){
  lines_p_ep <- get(paste("lines", name, sep = "_"))
  lines_p_ep <- as.data.frame(table(lines_p_ep$episode))

  lines_p_ep$character <- paste(name)
  
  assign(paste("lines_p_ep", name, sep = "_"), lines_p_ep)
}

# combine and convert to wide format to make calculating proportions easier
lines_df_wide <- spread(rbind(lines_p_ep_LORELAI, lines_p_ep_RORY, lines_p_ep_LUKE, lines_p_ep_SOOKIE, 
                              lines_p_ep_LANE, lines_p_ep_PARIS, lines_p_ep_OTHER), Var1, Freq)
rownames(lines_df_wide) <- lines_df_wide[, 1]
lines_df_wide <- as.data.frame(t(lines_df_wide[, -1]))

# calculate proportions and percentage
prop_lines_df_wide <- sweep(lines_df_wide, 1, rowSums(lines_df_wide), FUN = "/")
percent_lines_df_wide <- prop_lines_df_wide * 100
percent_lines_df_wide$Episode <- rownames(percent_lines_df_wide)
percent_lines_df_wide$Season <- gsub("_.*", "", percent_lines_df_wide$Episode)

# gather for plotting
percent_lines_df_gather <- percent_lines_df_wide %>% 
  gather(Character, Percent, LANE:SOOKIE)

percent_lines_df_gather$Character <- factor(percent_lines_df_gather$Character, levels = names)
percent_lines_df_gather <- merge(percent_lines_df_gather, subset(lines_LORELAI, select = c(episode, episode_running_nr)), by.x = "Episode", by.y = "episode")
percent_lines_df_gather$Episode <- factor(percent_lines_df_gather$Episode, levels = percent_lines_df_gather[order(percent_lines_df_gather$episode_running_nr, decreasing = TRUE), "Episode"])
percent_lines_df_gather <- percent_lines_df_gather[!duplicated(percent_lines_df_gather), ]
ggplot(data = percent_lines_df_gather, aes(x = Character, y = Percent, fill = Season)) +
  geom_boxplot(alpha = 0.5) +
  labs(
    x = "",
    y = "Percent",
    title = "Who's the biggest talker? (a)",
    subtitle = "Percentage of lines per episode spoken by main characters",
    caption = "\nThe boxplot shows the percentages of lines spoken by the main characters per episode and season.
    In the first three seasons the majority of lines are clearly spoken by Lorelai and Rory, followed with some distance by Luke.
    During the later seasons, the gap between Lorelai, Rory and the rest is not as pronounced any more."
  ) +
  my_theme() +
  facet_wrap(~ Season, ncol = 4, scales = "free") +
  guides(fill = guide_legend(nrow = 1, byrow = TRUE)) +
  scale_fill_brewer(palette = "Dark2") +
  scale_color_brewer(palette = "Dark2")
# calculate number of lines in total
for (name in names){
  lines_p_season <- get(paste("lines", name, sep = "_"))
  lines_p_season <- as.data.frame(table(lines_p_season$season))

  lines_p_season$character <- paste(name)
  
  assign(paste("lines_p_season", name, sep = "_"), lines_p_season)
}

# combine and convert to wide format to make calculating proportions easier
lines_df_wide <- spread(rbind(lines_p_season_LORELAI, lines_p_season_RORY, lines_p_season_LUKE, lines_p_season_SOOKIE, 
                              lines_p_season_LANE, lines_p_season_PARIS, lines_p_season_OTHER), Var1, Freq)
rownames(lines_df_wide) <- lines_df_wide[, 1]
lines_df_wide <- as.data.frame(t(lines_df_wide[, -1]))

# calculate proportions and percentage
prop_lines_df_wide <- sweep(lines_df_wide, 1, rowSums(lines_df_wide), FUN = "/")
percent_lines_df_wide <- prop_lines_df_wide * 100
percent_lines_df_wide$Season <- rownames(percent_lines_df_wide)

# gather for plotting
percent_lines_df_gather <- percent_lines_df_wide %>%
  gather(Character, Percent, LANE:SOOKIE)

percent_lines_df_gather$Character <- factor(percent_lines_df_gather$Character, levels = names)
# plot
bp <- ggplot(percent_lines_df_gather, aes(x = "", y = Percent, fill = Character)) + 
  geom_bar(width = 1, stat = "identity") + theme_minimal() +
  scale_fill_brewer(palette = "Spectral")

pie <- bp + coord_polar("y", start = 0) +
  theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.ticks = element_blank(),
  plot.title = element_text(size = 14, face = "bold"),
  legend.title = element_blank(),
  legend.position = "bottom",
  legend.text = element_text(size = 8),
  legend.justification = "top", 
    legend.box = "horizontal",
    legend.box.background = element_rect(colour = "grey50"),
    legend.background = element_blank(),
  ) + guides(fill = guide_legend(nrow = 1, byrow = TRUE))

pie + facet_wrap( ~ Season, ncol = 4) +
  labs(
    title = "Who's the biggest talker? (b)",
    subtitle = "Percent of lines per season spoken by main characters")



sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_2.2.0         dplyr_0.5.0           splitstackshape_1.4.2
## [4] data.table_1.9.6      tidyr_0.6.0          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.8        knitr_1.15         magrittr_1.5      
##  [4] munsell_0.4.3      colorspace_1.3-1   R6_2.2.0          
##  [7] stringr_1.1.0      plyr_1.8.4         tools_3.3.2       
## [10] grid_3.3.2         gtable_0.2.0       DBI_0.5-1         
## [13] htmltools_0.3.5    lazyeval_0.2.0     yaml_2.1.14       
## [16] assertthat_0.1     digest_0.6.10      tibble_1.2        
## [19] RColorBrewer_1.1-2 evaluate_0.10      rmarkdown_1.1     
## [22] labeling_0.3       stringi_1.1.2      scales_0.4.1      
## [25] chron_2.3-47

Creating a Gilmore Girls character network with R

2016-11-13 00:00:00 +0000

With the impending (and by many - including me - much awaited) Gilmore Girls Revival, I wanted to take a somewhat different look at our beloved characters from Stars Hollow.

I had recently read a few cool examples of how to create co-occurrence networks and wanted to combine this with an analysis similar to what David Robinson did for Love Actually.

Fortunately, there are people out there, who have invested their time (and a lot of it, I imagine) to write up transcripts for every Gilmore Girls episode. I chose www.crazy-internet-people.com’s list of transcripts.

Based on these transcripts I calculated the main character’s number of lines per episode and from there the co-occurrence matrix with other characters. This told me with which other major characters they appeared together in episodes and how often. This network nicely illustrates that Lorelai has the most lines of all characters (node size reflects total number of lines in all episodes a character had), followed by Rory. No surprise there but interestingly, the third place goes to Luke and fourth to Emily. Most interaction happened between Lorelai and Rory of course (edge width reflects number of co-occurences in episodes), but Lorelai and Luke, Rory and Luke, Emily and Rory and Lorelai and Sookie follow suit. What the network also shows is that Lorelai has major connections with most other characters - more so than Rory.

A cluster dendrogram shows us the character co-occurrence in a slightly different way: The further down in the dendrogram tree two nodes split, the more episodes these characters had in common. Again, no surprise here that Lorelai and Rory have the most closely connected nodes, closely followed by Luke. We can also see quite nicely that the couples Sookie and Jackson, Lane and Zack and Emily and Richard share a lot of episodes.


Of course, these numbers also reflect the total number of episodes that characters were in, so that there is an inherent bias for characters with short occurrences in many episodes being more strongly connected to e.g. Lorelai than characters with fewer but more important plots. It would have been interesting to calculate the co-occurrence per scene instead of episode but unfortunately, this information was not given in the transcripts (if someone knows transcripts that denote scene number, please contact me).

I also wanted to see in which episodes these 20 characters appeared. Of course, Lorelai and Rory appeared in every episode but for other characters, there are clear gaps.



And finally, I wanted to know how many lines per episode each of them spoke: This boxplot shows the median number of lines per episode for each character (middle line of the boxes), as well as the lower and upper quartiles (outer edges of the boxes) and the outlier episodes (dots).



There will be a part 2 next week, where I will explore the Gilmore Girls a bit more. No spoilers, but among other things, I’ll be looking at their coffee consumption through the data lens…


For a detailed description, plus R code for the plots see further below or find the R Markdown on Github.

If you don’t care about the show and have not (unlike me) watched every episode at least twice, maybe you’ll be interested in using my R code to recreate a similar character network for other TV shows, movies or books (and if you do, please share them with me)!



Obtaining all episode transcripts

The transcript URLs from www.crazy-internet-people.com have the following scheme: “http://www.crazy-internet-people.com/site/gilmoregirls/pages/, s#/, s#s/, §.html” (#: number of season, of which there are seven; §: running number of episode, from 1 to 153). Following this scheme, I looped over all seasons and episodes to read the lines for each HTML directly into R via their respective URLs.

Because the raw HTML looked a bit messy, I had to do some tidying of the text:

  • First, I grabbed only lines with a character name at the beginning, which indicates the character who is speaking (these were all in caps).
  • Then, I had to remove the remaining HTML tags/ descriptors.
  • After this, I had the transcript text remaining, which I could transform into a data frame
  • and add season, episode number and running episode number to each line of text.
  • And finally, I combined all transcripts into one object.
for(i in 1:7){                                            # there are 7 seasons

  if(i == 1){                                             # all seasons except the first have 22 episodes (the first has 21)

    for(j in 1:21){

      cat("\nSeason", i, ", Episode", j, "\n")            # to see the progress I am printing the season and episode number

      thepage <- readLines(paste0("http://www.crazy-internet-people.com/site/gilmoregirls/pages/s", i, "/s", i, "s/", j, ".html"))

      thepage <- thepage[grep("^[[:upper:]]+:", thepage)]  # grabbing character lines only
      thepage <- gsub("\t", "", thepage)                  # removing HTML tags
      thepage <- gsub("<.*>", "", thepage)                # removing some more HTML tags

      thepage <- as.data.frame(thepage)
      thepage$season <- i                                 # add season number
      thepage$episode <- paste(i, j, sep = "_")           # add episode number
      thepage$episode_running_nr <- j                     # add running epsiode number

      if(i == 1 & j == 1){                                # combine all transcripts into one object
        transcripts <- thepage
      } else {
        transcripts <- rbind(transcripts, thepage)
      }
    }

    } else {                                              # repeat for seasons 2 to 7

      for(j in 1:22){

        cat("\nSeason", i, ", Episode", j, "\n")

        if(i == 2){                                       # to get the running episode number, 
                                                          # I have to add the number of episodes from previous seasons
          n <- j+21
        }

        if(i == 3){
          n <- j+21+22
        }

        if(i == 4){
          n <- j+21+22+22
        }

        if(i == 5){
          n <- j+21+22+22+22
        }

        if(i == 6){
          n <- j+21+22+22+22+22
        }

        if(i == 7){
          n <- j+21+22+22+22+22+22
        }

        # rinse and repeat
        
        thepage <- readLines(paste0("http://www.crazy-internet-people.com/site/gilmoregirls/pages/s", i, "/s", i, "s/", n, ".html"))

        thepage <- thepage[grep("^[[:upper:]]{2,}:", thepage)]
        thepage <- gsub("\t", "", thepage)
        thepage <- gsub("<.*>", "", thepage)

        thepage <- as.data.frame(thepage)
        thepage$season <- i
        thepage$episode <- paste(i, j, sep = "_")
        thepage$episode_running_nr <- n

        if(i == 1 & j == 1){
          transcripts <- thepage
        } else {
          transcripts <- rbind(transcripts, thepage)
        }
        }
  }
}


Some of the lines were empty, so I removed those.

transcripts$thepage <- as.character(transcripts$thepage)  # convert to character vector
transcripts <- transcripts[!transcripts$thepage == "", ]  # remove empty lines


This is how the data frame looked like at this point:

  • each row contains one line of dialogue with the character name in caps coming before their lines.
head(transcripts)
##                                          thepage season episode
## 1 LORELAI: Please, Luke. Please, please, please.      1     1_1
## 2 LUKE: How many cups have you had this morning?      1     1_1
## 3                                 LORELAI: None.      1     1_1
## 4                                  LUKE: Plus...      1     1_1
## 5            LORELAI: Five, but yours is better.      1     1_1
## 6                      LUKE: You have a problem.      1     1_1
##   episode_running_nr
## 1                  1
## 2                  1
## 3                  1
## 4                  1
## 5                  1
## 6                  1


To be able to count the characters, I separated the character names from their lines. This was done by splitting the first column after the first colon, using the tidyr package.

I also removed all leading and trailing whitespace from the character names, changed all letters in the character column to all caps and changed “ands” and apostrophes to the proper encoding. And I also had to manually correct quite a few misspelled character names.

# separate first column after first colon
library(tidyr)
transcripts_2 <- separate(transcripts, "thepage", into = c("character", "dialogue"), sep = ":", extra = "merge", fill = "right")

# remove leading and trailing whitespace
transcripts_2$character <- gsub("^\\s+|\\s+$", "", transcripts_2$character)

# convert all character names to all upper case
transcripts_2$character <- toupper(transcripts_2$character)

# fix misspelled character names
transcripts_2$character <- gsub("ZACK", "ZACH", transcripts_2$character)
transcripts_2$character <- gsub("LORLEAI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LOREALI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LORELI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LORLAI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LORELA$", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LORLELAI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("^ORELAI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("LOREAI", "LORELAI", transcripts_2$character)
transcripts_2$character <- gsub("^ORY", "RORY", transcripts_2$character)
transcripts_2$character <- gsub("LUK$", "LUKE", transcripts_2$character)
transcripts_2$character <- gsub("BABETE", "BABETTE", transcripts_2$character)
transcripts_2$character <- gsub("BABETTER", "BABETTE", transcripts_2$character)
transcripts_2$character <- gsub("BARBETTE", "BABETTE", transcripts_2$character)
transcripts_2$character <- gsub("BABETTE/MISS PATTY", "BABETTE AND MISS PATTY", transcripts_2$character)
transcripts_2$character <- gsub("JACKSON/SOOKIE", "JACKSON AND SOOKIE", transcripts_2$character)
transcripts_2$character <- gsub("LORELAI/SOOKIE", "LORELAI AND SOOKIE", transcripts_2$character)
transcripts_2$character <- gsub("LORELAI/RORY", "LORELAI AND RORY", transcripts_2$character)
transcripts_2$character <- gsub("TAYOR", "TAYLOR", transcripts_2$character)
transcripts_2$character <- gsub("TRISTIN", "TRISTAN", transcripts_2$character)
transcripts_2$character <- gsub("MICHE$", "MICHEL", transcripts_2$character)
transcripts_2$character <- gsub("MICHELL", "MICHEL", transcripts_2$character)
transcripts_2$character <- gsub("SOOKI$", "SOOKIE", transcripts_2$character)
transcripts_2$character <- gsub("SOOKEI", "SOOKIE", transcripts_2$character)
transcripts_2$character <- gsub("SOOKIES", "SOOKIE", transcripts_2$character)
transcripts_2$character <- gsub("Mrs.KIM", "MRS KIM", transcripts_2$character)
transcripts_2$character <- gsub("MRS.KIM", "MRS KIM", transcripts_2$character)
transcripts_2$character <- gsub("MRS KIM", "MRS KIM", transcripts_2$character)
transcripts_2$character <- gsub("RICHRAD", "RICHARD", transcripts_2$character)
transcripts_2$character <- gsub("RMILY", "EMILY", transcripts_2$character)
transcripts_2$character <- gsub("CHRISTOHPER", "CHRISTOPHER", transcripts_2$character)
transcripts_2$character <- gsub("CHRISTOPER", "CHRISTOPHER", transcripts_2$character)
transcripts_2$character <- gsub("CHRSTOPHER", "CHRISTOPHER", transcripts_2$character)
transcripts_2$character <- gsub("CHRIS$", "CHRISTOPHER", transcripts_2$character)
transcripts_2$character <- gsub("CHERRY", "SHERRY", transcripts_2$character)
transcripts_2$character <- gsub("LINDAY", "LINDSAY", transcripts_2$character)

# substitute &#146; with apostrophe
transcripts_2$character <- gsub("&#146;", "'", transcripts_2$character)

# some ANDs are written as &AMP; so they will be changed as well
transcripts_2$character <- gsub("&AMP;", "AND", transcripts_2$character)

# and finally I want ANDs to be written as semicolons
transcripts_2$character <- gsub(" AND ", ";", transcripts_2$character)

#  and remove disclaimer lines
transcripts_2 <- transcripts_2[-which(transcripts_2$character == "DISCLAIMER"), ] 
head(transcripts_2)
##   character                                  dialogue season episode
## 1   LORELAI     Please, Luke. Please, please, please.      1     1_1
## 2      LUKE  How many cups have you had this morning?      1     1_1
## 3   LORELAI                                     None.      1     1_1
## 4      LUKE                                   Plus...      1     1_1
## 5   LORELAI                Five, but yours is better.      1     1_1
## 6      LUKE                       You have a problem.      1     1_1
##   episode_running_nr
## 1                  1
## 2                  1
## 3                  1
## 4                  1
## 5                  1
## 6                  1

This is how the data frame looked like after tidying.

nrow(transcripts_2)
## [1] 116954

In total there are now 116,983 lines.


How many characters are there and how many lines do they have?

To find out how many characters there were in Gilmore Girls during 153 episodes, I couldn’t simply count them because there are combined characters (e.g. Lorelai and Rory speaking together) and voice overs among them.

First, I want to duplicate all lines with two speakers, to make them count for each character. I also want to only count lines where there is only one character, so I removed all character fields with multiple, generic or unspecific characters. And I don’t want to have voice overs either.

Most of the characters, however, were still not recurring characters, so I filtered out all those characters that only occurred in one episode.

# separating all rows where multiple characters spoke into one line per character with duplicate line text
library(splitstackshape)
transcripts_2 <- cSplit(transcripts_2, splitCols = "character", sep = ";", direction = "long")

# manually removing characters I don't want to keep
characters <- as.data.frame(table(transcripts_2$character, transcripts_2$episode)) %>%
  subset(!grepl(" VOICE", Var1)) %>%
  subset(!grepl("^ALL", Var1)) %>%
  subset(!grepl("AS A GROUP", Var1)) %>%
  subset(!grepl("CROWD", Var1)) %>%
  subset(!grepl("RADIO", Var1)) %>%
  subset(!grepl("BIKERS", Var1)) %>%
  subset(!grepl("ANNOUNCER", Var1)) %>%
  subset(!grepl("BOTH", Var1)) %>%
  subset(!grepl("WOMAN", Var1)) %>%
  subset(!grepl("VOICE", Var1)) %>%
  subset(!grepl("BARTENDER", Var1)) %>%
  subset(!grepl("OFFICER", Var1)) %>%
  subset(!grepl("GIRL", Var1)) %>%
  subset(!grepl("GIRLS", Var1)) %>%
  subset(!grepl("BOYS", Var1)) %>%
  subset(!grepl("EVERYONE", Var1)) %>%
  subset(!grepl("SUPERVISOR", Var1)) %>%
  subset(!grepl("PHOTOGRAPHER", Var1)) %>%
  subset(!grepl("RECEPTIONIST", Var1)) %>%
  subset(!grepl("CUSTOMER", Var1)) %>%
  subset(!grepl("TV", Var1)) %>%
  subset(!grepl("VET", Var1)) %>%
  subset(!grepl("KID", Var1)) %>%
  subset(!grepl("MOM", Var1)) %>%
  subset(!grepl("DOCTOR", Var1)) %>%
  subset(!grepl("BOUNCER", Var1)) %>%
  subset(!grepl("LYRICS", Var1)) %>%
  subset(!grepl("SPEAKER", Var1)) %>%
  subset(!grepl("BOY", Var1)) %>%
  subset(!grepl("TEACHER", Var1)) %>%
  subset(!grepl("EMPLOYEE", Var1)) %>%
  subset(!grepl("MAID", Var1)) %>%
  subset(!grepl("CASHIER", Var1)) %>%
  subset(!grepl("MAN", Var1)) %>%
  subset(!grepl("NURSE", Var1)) %>%
  subset(!grepl("PLAYER", Var1)) %>%
  subset(!grepl("WAITER", Var1)) %>%
  subset(!grepl("WAITRESS", Var1)) %>%
  subset(!grepl("DRIVER", Var1)) %>%
  subset(!grepl("GRANDMA", Var1)) %>%
  subset(!grepl("LADIES", Var1)) %>%
  subset(!grepl("OPERATOR", Var1)) %>%
  subset(!grepl("TOURIST", Var1)) %>%
  subset(!grepl("CHEF", Var1)) %>%
  subset(!grepl("GUEST", Var1)) %>%
  subset(!grepl("HOSTESS", Var1)) %>%
  subset(!grepl("JUDGE", Var1)) %>%
  subset(!grepl("LADY", Var1)) %>%
  subset(!grepl("SECRETARY", Var1)) %>%
  subset(!grepl("PRIEST", Var1)) %>%
  subset(!grepl("STUDENT", Var1)) %>%
  subset(!grepl("PROFESSOR", Var1)) %>%
  subset(!grepl("ANSWERING MACHINE", Var1))

# find out which characters had the most lines and which to remove
library(reshape2)
characters_2 <- dcast(characters, Var1 ~ Var2)
rownames(characters_2) <- characters_2[, 1]
characters_2 <- characters_2[, -1]

characters_3 <- rowSums(characters_2)
major_characters <- characters_3[order(characters_3, decreasing = TRUE)]

characters_2 <- ifelse(characters_2 > 0, 1, 0)

characters_to_remove <- rownames(characters_2[which(rowSums(characters_2) <= 1), ])

characters <- characters[-which(characters$Var1 %in% characters_to_remove), ]
length(unique(as.character(characters$Var1)))
## [1] 153

This leaves us with these 153 characters:

unique(as.character(characters$Var1))
##   [1] "AK"               "ALEX"             "ANDREW"          
##   [4] "ANNA"             "APRIL"            "ASHER"           
##   [7] "AUDREY"           "AUNT JUN"         "BABETTE"         
##  [10] "BARBARA"          "BILL"             "BILLY"           
##  [13] "BOBBI"            "BOBBY"            "BOOTSY"          
##  [16] "BRAD"             "BRIAN"            "BRUCE"           
##  [19] "CAESAR"           "CAESER"           "CARL"            
##  [22] "CAROL"            "CARRIE"           "CHARLESTON"      
##  [25] "CHARLIE"          "CHRISTOPHER"      "CLARA"           
##  [28] "CLAUDE"           "COLIN"            "DAVE"            
##  [31] "DEAN"             "DENNIS"           "DEREK"           
##  [34] "DOUG"             "DOYLE"            "DR SCHULTZ"      
##  [37] "DRELLA"           "ED"               "EMCEE"           
##  [40] "EMILY"            "FINN"             "FLOYD"           
##  [43] "FRAN"             "FRANCIE"          "FRANCINE"        
##  [46] "FRED"             "GIGI"             "GIL"             
##  [49] "GLENN"            "GUY"              "GYPSY"           
##  [52] "HARRY"            "HEADMASTER"       "HENRY"           
##  [55] "HONOR"            "JACK"             "JACKSON"         
##  [58] "JACOB"            "JAMIE"            "JANET"           
##  [61] "JASON"            "JESS"             "JIMMY"           
##  [64] "JOE"              "JOEL"             "JOHN"            
##  [67] "JONI"             "JORDAN"           "JULIET"          
##  [70] "KAREN"            "KIRK"             "KYLE"            
##  [73] "KYON"             "LANCE"            "LANE"            
##  [76] "LAURA"            "LINDSAY"          "LIZ"             
##  [79] "LIZA"             "LOGAN"            "LORELAI"         
##  [82] "LOU"              "LOUISE"           "LUCY"            
##  [85] "LUKE"             "LULU"             "MADELINE"        
##  [88] "MAGGIE"           "MARCIA"           "MARILYN"         
##  [91] "MARSHALL"         "MARTY"            "MAUREEN"         
##  [94] "MAX"              "MEGAN"            "MIA"             
##  [97] "MICHEL"           "MISS PATTY"       "MITCHUM"         
## [100] "MOREY"            "MR. HUNTER"       "MRS KIM"         
## [103] "MRS. CASSINI"     "MRS. KIM"         "MRS. LISTER"     
## [106] "MRS. O'MALLEY"    "MUSIC"            "NANCY"           
## [109] "NATALIE"          "NICK"             "NICOLE"          
## [112] "NORA"             "OLIVIA"           "PARIS"           
## [115] "PATTY"            "PAUL ANKA"        "PHILLIP"         
## [118] "PRINCIPAL"        "RABBI"            "RACHEL"          
## [121] "RAJ"              "REVEREND"         "REVEREND SKINNER"
## [124] "RICHARD"          "ROB"              "ROBERT"          
## [127] "RORY"             "ROSEMARY"         "RUNE"            
## [130] "SANDRA"           "SARAH"            "SHANE"           
## [133] "SHEILA"           "SHERRY"           "SHIRA"           
## [136] "SIMON"            "SOOKIE"           "SOPHIE"          
## [139] "STRAUB"           "SUE"              "SUSAN"           
## [142] "T.J.'S BROTHER"   "TANNA"            "TAYLOR"          
## [145] "TJ"               "TOBIN"            "TOM"             
## [148] "TRISTAN"          "TRIX"             "TROUBADOUR"      
## [151] "VIVIAN"           "YOUNG CHUI"       "ZACH"


Character co-occurrence network

The major part was to calculate the co-occurence matrix for characters per episode (i.e. which characters co-occurred together in episodes and how often). Because the network would get too big had I used too many characters, I restricted the network to the 20 main characters with the most lines over all episodes.

# extracting only those characters with the most lines
transcripts_3 <- transcripts_2[which(transcripts_2$character %in% names(major_characters[1:20])), ]
transcripts_3$season_name <- paste("Season", transcripts_3$season, sep = " ")
# create lines per episode matrix for each of these characters
library(reshape2)
speaker_scene_matrix <- transcripts_3 %>%
  acast(character ~ episode, fun.aggregate = length)

The network was calculated from the co-occurrence matrix as a weighted network. Node colors reflect the character’s gender.

# calculate co-occurrence matrix
data_matrix <- as.matrix(t(speaker_scene_matrix))
total_occurrences <- colSums(t(speaker_scene_matrix))

co_occurrence <- t(data_matrix) %*% data_matrix

# plot the network graph
library(igraph)
g <- graph.adjacency(co_occurrence,
                         weighted = TRUE,
                         diag = FALSE,
                         mode = "upper")

g <- simplify(g, remove.multiple = F, remove.loops = T, edge.attr.comb = c(weight = "sum", type = "ignore"))

females <- c("EMILY", "LANE", "LORELAI", "MISS PATTY", "PARIS", "RORY", "SOOKIE") 
        
V(g)$gender <- ifelse(V(g)$name %in% females, "female", "male")

plot(g,
     vertex.label.family = "Helvetica",
     vertex.label.font = 1,
     vertex.shape = "sphere",
     vertex.size=total_occurrences/800,
     vertex.label.cex=0.8,
     vertex.color=c( "pink", "skyblue")[1+(V(g)$gender=="male")],
     vertex.label.color="black",
     vertex.frame.color = NA,
     edge.width = E(g)$weight/100000,
     edge.curved=.1,
     layout=layout_in_circle)


The cluster dendrogram shows us the character co-occurrence in a slightly different way:

norm <- speaker_scene_matrix / rowSums(speaker_scene_matrix)

h <- hclust(dist(norm, method = "manhattan"))

plot(h)


And finally, I wanted to plot in which episodes these 20 characters appeared:

library(ggplot2)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "black"),
    legend.position = "none",
    legend.background = element_blank(),
    panel.margin = unit(.5, "lines"),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

ggplot(transcripts_3, aes(x = episode_running_nr, y = character, color = season_name)) +
  geom_point(size = 2) +
  geom_path(aes(group = episode_running_nr)) +
  my_theme() +
  facet_wrap(~ season_name, ncol = 2, scales = "free_x") +
  scale_color_hue(l = 50) +
  labs(y = "",
       x = "Episode running number",
       title = "Gilmore Girls major character's occurence in episodes")


And I wanted to know how many lines per episode each of them spoke:

# setting factor levels
f = characters[order(characters$Freq, decreasing = TRUE), "Var1"]
characters <- within(characters, Var1 <- factor(Var1, levels = f))

ggplot(data = subset(characters, Var1 %in% names(major_characters[1:20])), aes(x = Var1, y = Freq)) +
  geom_boxplot(fill = "navy", alpha = 0.8) +
  labs(y = "Number of Lines",
       title = "How many lines do the main characters have per epsisode?", 
       x = "Characters") +
  my_theme()



sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_2.1.0         igraph_1.0.1          reshape2_1.4.2       
## [4] splitstackshape_1.4.2 data.table_1.9.6      tidyr_0.6.0          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.7      knitr_1.14       magrittr_1.5     munsell_0.4.3   
##  [5] colorspace_1.2-7 stringr_1.1.0    plyr_1.8.4       tools_3.3.2     
##  [9] grid_3.3.2       gtable_0.2.0     htmltools_0.3.5  yaml_2.1.13     
## [13] assertthat_0.1   digest_0.6.10    tibble_1.2       formatR_1.4     
## [17] evaluate_0.10    rmarkdown_1.1    labeling_0.3     stringi_1.1.2   
## [21] scales_0.4.0     chron_2.3-47

Is 'Yeah' Josh and Chuck's favorite word?

2016-11-06 00:00:00 +0000

Text mining and sentiment analysis of a Stuff You Should Know Podcast


Stuff You Should Know (or SYSK) is one of the many great podcasts from How Stuff Works. The two SYSK hosts Josh and Chuck have taught me so many fascinating things over the years, and today I want to use one of their podcasts to learn a little bit about text analysis in R.

Initially, I wanted to explore all SYSK podcasts. Unfortunately however, I could only find a transcript for the episode Earwax: Live With It, posted on March 19, 2015.


The complete R code can be found at the end of this post or as an R-Markdown on Github.


The podcast transcript

I copied the episode transcript from its web page and saved it as a tab delimited text file. The file can be downloaded from Github.


Separating Josh and Chuck

Of course, I wouldn’t actually want to separate Josh and Chuck. But for comparison’s sake in this analysis, I am creating two separate files for lines of dialogue spoken by either Josh or Chuck. I am also keeping the combination of both for background information.


How emotional are Josh and Chuck?

Sentiment analysis

The first thing I want to explore is a sentiment analysis of the lines spoken by Josh and Chuck. Sentiment analysis categorizes text data into positive and negative sentiments and gives information about the emotional state or attitude of the speaker or the contents of a text.

I am using the package syuzhet for sentiment analysis.


NRC sentiments

Saif Mohammad’s NRC Emotion Lexicon is a collection of words that were manually categorized based on their association with the emotions anger, anticipation, disgust, fear, joy, sadness, surprise, and trust, and with positive and negative sentiments.


What sentiment did the podcast have?

Before I go ahead with the sentiment analysis I want to get an idea of the podcast’s words’ association with the NRC categories.

As can be seen by the words and their associated emotions/ sentiments, sentiment analysis is not perfect. Most words make a lot of intuitive sense with their category (e.g. gross, fungus and spider in disgust), but a few I find to be really strange (like, why would waffle be associated with anger?). Still, the majority of categorisations make sense, so let’s go ahead with the sentiment analysis.


Does the podcast’s sentiment change over time?

Sentiment analysis for each line of dialogue produces a matrix with one column per sentiment/ emotion and one row per line. If any of the words in a line of dialogue could be associated with a given category, this category would get a value of 1 in the matrix. If there was no association with a category, its value would be 0. The lines of dialogue are sorted according to the original input text, in this case this means that they represent the order in which they were spoken in the podcast. Because the plot would get too big with all categories, I split the data into positive and negative sentiments and emotions.

For analysing positive and negative sentiments, syuzhet implements four different methods, each of which uses a slightly different scale. But all of them assign negative values to indicate negative sentiment and positive values to indicate positive sentiments.

All of the methods rely on a precomputed lexicon of word-sentiment score associations. The emotional or sentiment valence is then computed based on the scores of the words from each line of dialogue.

The upper two plots show on the x-axis the progression of dialogue over time with each point being a line of dialogue. The sentiment score on the y-axis shows the intensity of the sentiment or emotions in the respective line of dialogue, i.e. the more words in a line were associated with the given category, the higher the line’s sentiment score.

From a first glance at these emotions and sentiments, it seems to me that the podcast is more positive than negative but we can get a better overview of positive and negative sentiments by scoring only positive and negative sentiments.

In the third graph we can see quite well that the trend goes towards positive scores, meaning the podcast is overall upbeat. While there are different peaks in both positive and negative directions in Chuck’s and Josh’s lines, there is no overall bias for one being more positive (or negative) than the other.

Finally, I am looking at the sentiment percentage values to get an idea about the percentage of positive versus negative scores along the podcast’s trajectory. Here, the podcast was divided into 20 bins and the mean sentiment valence calculated for each. This last plot shows a clear trend of increasing positivity towards the end of the podcast in Chuck’s lines. Josh on the other hand doesn’t change very much over the progression of the podcast. Interesting…


Quantitative text analysis

Building a corpus

In text analysis, a corpus refers to a collection of documents. Here, I am using the tm package to create my corpus from the character vectors of Josh’s and Chuck’s lines. SnowballC is used for word stemming.

Before I can analyse the text data meaningfully, however, I have to do some pre-processing:

  1. Removing punctuation

    Here, I am removing all punctuation marks. Before I do that, I will transform all hyphens to a space, because the text includes some words which are connected by hyphens and would otherwise be connected if I simply removed the hyphen with the removePunctuation function.

  2. Transforming to lower case

    R character string processing is case-sensitive, so everything will be converted to lower case.

  3. Stripping numbers

    Numbers are usually not very meaningful for text analysis (if we are not specifically interested in dates, for example), so they are removed as well.

  4. Removing stopwords

    Stopwords are collections of very common words which by themselves don’t tell us very much about the content of a text (e.g. and, but, the, however, etc.). The tm package includes a list of stopwords for the English language.

  5. Stripping whitespace

    I’m also removing superfluous whitespace around words.

  6. Stemming

    Finally, I’m stemming the words in the corpus. This means that words with a common root are shortened to this root. Even though stemming algorithms are not perfect, they allow us to compare conjugated words from the same origin.


Creating the Document Term Matrix

The document term matrix (DTM) lists the number of occurrences of each word per document in the corpus. Here, each document in the corpus represents one line of dialogue from the original transcript.

By restricting the DTM to words with a minimum number of letters and an occurrence in at least a minimum number of lines of dialogue (cutoff), we exclude less specific terms.


Is “Yeah” Josh and Chucks favorite word?

Most frequent words

By summing up the occurrences of each word over all documents we get the word count frequencies.

When accounting for stem words and the cutoff I set for the DTM to evaluate, Josh and Chuck spoke roughly the same number of words (Josh: 880, Chuck: 865) and had almost the same number of dialogue lines (Josh: 202, Chuck: 193). So, good job on neither one dominating the discussion. ;-)

The lefthand plot shows the number of words spoken per line of dialogue. The background barplot shows the mean number of words spoken per line, the boxplot shows all individual data points (each point represents one line of dialogue and its corresponding word count). While the total number of words and of dialogue lines were basically the same, Chuck’s lines had a stronger deviation around the median with few very long lines. Josh on the other hand seems to have spoken lines with a more consistent length.

The righthand plot shows the most common words and how often they were used overall (red) and by Josh and Chuck separately (green and blue). The most frequent words include (not surprisingly) “ear” and “earwax”, but funnily also “yeah”. To be honest, while listening to the podcast I never noticed yeah being said exceptionally often but I guess the data doesn’t lie…

Wordclouds are another way to visualize the frequency of words. The frequency is indicated both by the size of the words (bigger words are more frequent than smaller words) and their color.


Word association

Associations among words bigger than 60% were plotted in a heatmap to find words that most often co-occured in the same line of dialogue.

Among the most conspicuous associations were cotton and swab in Josh’s lines (this was probably a hyphenated word to begin with: cotton-swab) and between secret and gland in Chuck’s line (probably secretory gland).


Hierarchical clustering

Hierarchical clustering can be used to classify words by sorting them into clusters according to similarity of occurence (i.e. their frequency or count).

We already knew that the words “earwax”, “ear” and “yeah” were the most common, so they were clustered accordingly.


Knowledge for the masses

Shorter words are more frequent than longer words

Most words have 4 or 5 letters, only a handful are longer than 7 letters. We don’t have words with fewer than 3 letters because they were excluded in the beginning when obtaining the DTM.

As we can see above, there is only a very small correlation between the length of words and how often they are used. As expected from what we intuitively know, the most common words tend to be shorter while long words are used only occasionally because they are often more specific terms. And there is no real difference between Josh or Chuck when it comes to the length (and complexity?) of the words they use. In general the words they use are rather on the short site, which makes sense as a big part of what makes their podcast so great is that they convey information in a down-to-earth, understandable way.

The frequency plot of all the letters in the alphabet shows that vocals are more common than consonants.


The plot above shows how often each letter occurs at which position in a word in all the words used by Josh and Chuck. For example, the letter j occured only once and at position one (so 100% of js are at this position). The other letters are more equally distributed but according to the fact that fewer words were longer than 6 letters, there are fewer letters at positions 7 to 10.


Conclusion

This little excursion into text analysis gave an interesting different look at a podcast one would normally “evaluate” intuitively while listening. This hard cold look through the data lens can highlight aspects that probably would be overlooked otherwise; for example, I never noticed that Josh and Chuck used the word yeah that much!

It would be very interesting to broaden the analysis to more podcasts to see if the yeah-thing was just a fluke of this episode or whether it’s a recurrent thing, maybe by trying speech-to-text-conversion tools.



R code

# setting my custom theme of choice
library(ggplot2)
library(ggrepel)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, color = "navy"),
    legend.position = "bottom",
    legend.background = element_blank(),
    panel.margin = unit(.5, "lines"),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}
# reading lines of transcript
raw <- readLines("sysk_earwax.transcript.txt")
head(raw)
## [1] "Josh: Hey, and welcome to the podcast. I'm Josh Clark. There is Charles W. \"Chuck\" Bryant, there is Jeri. Yeah, it's Stuff You Should Know."
## [2] ""                                                                                                                                             
## [3] "Chuck: He just shrugged."                                                                                                                     
## [4] ""                                                                                                                                             
## [5] "Josh: Yeah, like \"eh, what are we going to do? That's what we are.\""                                                                        
## [6] ""
# extracting lines beginning with Josh:/ Chuck: from transcript by looping over the names
names <- c("Josh", "Chuck")

for (name in names){
  # grep lines from Josh or Chuck
  assign(paste("lines", name, sep = "_"), as.character(raw[grep(paste0("^", name, ":"), raw)]))
}
# and removing the beginning of each line that indicates who's speaking
lines_Josh <- gsub("^Josh: ", "", lines_Josh)
lines_Chuck <- gsub("^Chuck: ", "", lines_Chuck)

# Combining Josh's and Chuck's lines
lines_bg <- c(lines_Josh, lines_Chuck)

names <- c("Josh", "Chuck", "bg")
# get NRC sentiments for 
# a) each line of dialogue and
# b) each word spoken in the podcast
library(syuzhet)

for (name in names){
  assign(paste("get_nrc_sentiment", name, sep = "_"), data.frame(line_number = 1:length(get(paste("lines", name, sep = "_"))), 
                                                                 get_nrc_sentiment(get(paste("lines", name, sep = "_")))))
  
  get_tokens <- get_tokens(get(paste("lines", name, sep = "_")))
  assign(paste("get_nrc_sentiments_tokens", name, sep = "_"), data.frame(word = get_tokens,
                                                                         get_nrc_sentiment(get_tokens)))
}
# gather word sentiments for plotting
library(tidyr)

get_nrc_sentiments_tokens_bg_gather <- get_nrc_sentiments_tokens_bg %>% 
  gather(word, sentiment, anger:positive)
colnames(get_nrc_sentiments_tokens_bg_gather)[2:3] <- c("sentiment", "value")
get_nrc_sentiments_tokens_bg_gather$name <- "bg"
get_nrc_sentiments_tokens_bg_gather <- get_nrc_sentiments_tokens_bg_gather[which(get_nrc_sentiments_tokens_bg_gather$value != 0), ]
get_nrc_sentiments_tokens_bg_gather <- get_nrc_sentiments_tokens_bg_gather[!duplicated(get_nrc_sentiments_tokens_bg_gather$word), ]
ggplot(data = get_nrc_sentiments_tokens_bg_gather, aes(x = value, y = value, color = word, label = word)) + 
  geom_line(size = 1) +
  facet_wrap(~ sentiment, ncol = 2) +
  my_theme() + 
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank()) +
  geom_text_repel(segment.color = "aliceblue", segment.alpha = 0) +
  labs(title = "Sentiment categories of words used in podcast")
get_nrc_sentiment_Josh_gather <- get_nrc_sentiment_Josh %>% 
  gather(line_number, sentiment, anger:positive)
colnames(get_nrc_sentiment_Josh_gather)[2:3] <- c("sentiment", "value")
get_nrc_sentiment_Josh_gather$name <- "Josh"

get_nrc_sentiment_Chuck_gather <- get_nrc_sentiment_Chuck %>% 
  gather(line_number, sentiment, anger:positive)
colnames(get_nrc_sentiment_Chuck_gather)[2:3] <- c("sentiment", "value")
get_nrc_sentiment_Chuck_gather$name <- "Chuck"

get_nrc_sentiment_gather <- rbind(get_nrc_sentiment_Josh_gather, get_nrc_sentiment_Chuck_gather)
# split into positive and negative emotions/ sentiments
get_nrc_sentiment_gather_pos <- get_nrc_sentiment_gather[which(get_nrc_sentiment_gather$sentiment %in% 
                                                                 c("anticipation", "joy", "positive", "surprise", "trust")), ]
get_nrc_sentiment_gather_neg <- get_nrc_sentiment_gather[which(get_nrc_sentiment_gather$sentiment %in% 
                                                                 c("anger", "disgust", "fear", "negative", "sadness")), ]

library(RColorBrewer)

p1 <- ggplot(data = get_nrc_sentiment_gather_pos, aes(x = line_number, y = value)) + 
  geom_line(size = 1, color = brewer.pal(3, "Set1")[3]) +
  facet_grid(name ~ sentiment) +
  my_theme() + 
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
    legend.position = "none") +
  labs(x = "Dialogue line number", y = "Sentiment valence",
       title = "Sentiments during podcast progression (positive sentiments)")

p2 <- ggplot(data = get_nrc_sentiment_gather_neg, aes(x = line_number, y = value)) + 
  geom_line(size = 1, color = brewer.pal(3, "Set1")[1]) +
  facet_grid(name ~ sentiment) +
  my_theme() + 
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
    legend.position = "none") +
  labs(x = "Dialogue line number", y = "Sentiment valence",
       title = "Sentiments during podcast progression (negative sentiments)")
# get sentiment scores
for (name in names){
  sentiment <- data.frame(line_number = 1:length(get(paste("lines", name, sep = "_"))), 
                              syuzhet = get_sentiment(get(paste("lines", name, sep = "_")), method = "syuzhet"),
                              bing = get_sentiment(get(paste("lines", name, sep = "_")), method = "bing"),
                              afinn = get_sentiment(get(paste("lines", name, sep = "_")), method = "afinn"),
                              nrc = get_sentiment(get(paste("lines", name, sep = "_")), method = "nrc"))
  assign(paste("sentiment", name, sep = "_"), sentiment)
}
# gather for plotting
sentiment_Josh_gather <- sentiment_Josh %>% 
  gather(line_number, analysis, syuzhet:nrc)
colnames(sentiment_Josh_gather)[2:3] <- c("algorithm", "value")
sentiment_Josh_gather$name <- "Josh"

sentiment_Chuck_gather <- sentiment_Chuck %>% 
  gather(line_number, analysis, syuzhet:nrc)
colnames(sentiment_Chuck_gather)[2:3] <- c("algorithm", "value")
sentiment_Chuck_gather$name <- "Chuck"

sentiment_gather <- rbind(sentiment_Josh_gather, sentiment_Chuck_gather)
p3 <- ggplot(data = sentiment_gather, aes(x = line_number, y = value, color = algorithm)) + 
  geom_hline(aes(yintercept=0), linetype="dashed", size = 1) +
  geom_line(size = 1) +
  facet_grid(name ~ algorithm) +
  my_theme() + 
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
    legend.position = "none") +
  labs(x = "Dialogue line number", y = "Sentiment valence",
       title = "Sentiment during podcast progression according to different lexicons")
# get sentiment percent values
for (name in names){
  sentiment_percent_vals <- data.frame(bin = 1:20,
                              syuzhet = get_percentage_values(get_sentiment(get(paste("lines", name, sep = "_")), method = "syuzhet"), bins = 20),
                              bing = get_percentage_values(get_sentiment(get(paste("lines", name, sep = "_")), method = "bing"), bins = 20),
                              afinn = get_percentage_values(get_sentiment(get(paste("lines", name, sep = "_")), method = "afinn"), bins = 20),
                              nrc = get_percentage_values(get_sentiment(get(paste("lines", name, sep = "_")), method = "nrc"), bins = 20))
  assign(paste("sentiment_percent_vals", name, sep = "_"), sentiment_percent_vals)
}
# gather for plotting
sentiment_percent_vals_Josh_gather <- sentiment_percent_vals_Josh %>% 
  gather(bin, analysis, syuzhet:nrc)
colnames(sentiment_percent_vals_Josh_gather)[2:3] <- c("algorithm", "value")
sentiment_percent_vals_Josh_gather$name <- "Josh"

sentiment_percent_vals_Chuck_gather <- sentiment_percent_vals_Chuck %>% 
  gather(bin, analysis, syuzhet:nrc)
colnames(sentiment_percent_vals_Chuck_gather)[2:3] <- c("algorithm", "value")
sentiment_percent_vals_Chuck_gather$name <- "Chuck"

sentiment_percent_vals_gather <- rbind(sentiment_percent_vals_Josh_gather, sentiment_percent_vals_Chuck_gather)
p4 <- ggplot(data = sentiment_percent_vals_gather, aes(x = bin, y = value, color = algorithm)) + 
  geom_hline(aes(yintercept=0), linetype="dashed", size = 1) +
  geom_line(size = 1) +
  facet_grid(name ~ algorithm) +
  my_theme() + 
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
    legend.position = "none") +
  labs(x = "Bin", y = "Sentiment percent values",
       title = "Sentiment percent values during podcast progression")

library(gridExtra)
library(grid)

grid.arrange(p1, p2, p3, p4, ncol = 1)
library(tm)
data("crude")
library(SnowballC)

transform_to_space <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

for (name in names){
  corpus <- Corpus(VectorSource(get(paste("lines", name, sep = "_"))))

  # Removing punctuation
  corpus <- tm_map(corpus, transform_to_space, "-")
  corpus <- tm_map(corpus, removePunctuation)
  
  # Transforming to lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
    
  # Stripping numbers
  corpus <- tm_map(corpus, removeNumbers)
    
  # Removing stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  
  # Stripping whitespace
  corpus <- tm_map(corpus, stripWhitespace)
    
  # Stemming
  corpus <- tm_map(corpus, stemDocument)

  # Converting said to say because this isn't included in stemDocument
  corpus <- tm_map(corpus, gsub, pattern = "said", replacement = "say")
  
  # There are a couple of words not included among the stopwords that I would like to remove
  myStopwords <- c("one", "two", "three", "just", "your", "that", "can")
  corpus <- tm_map(corpus, removeWords, myStopwords)
  
  # Assigning to object and treat as text document
  assign(paste("corpus", name, sep = "_"), tm_map(corpus, PlainTextDocument))
}

corpus_Josh
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 227
corpus_Chuck
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 226
corpus_bg
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 453
#  but here I will keep them all to make the numbers comparable to background later
for (name in names){
  assign(paste("dtm", name, sep = "_"), DocumentTermMatrix(get(paste("corpus", name, sep = "_")), 
         control = list(wordLengths = c(3, 100), bounds = list(global = c(3, length(get(paste("lines", name, sep = "_"))))))))
}

dtm_Josh
## <<DocumentTermMatrix (documents: 227, terms: 126)>>
## Non-/sparse entries: 801/27801
## Sparsity           : 97%
## Maximal term length: 10
## Weighting          : term frequency (tf)
dtm_Chuck
## <<DocumentTermMatrix (documents: 226, terms: 122)>>
## Non-/sparse entries: 779/26793
## Sparsity           : 97%
## Maximal term length: 10
## Weighting          : term frequency (tf)
dtm_bg
## <<DocumentTermMatrix (documents: 453, terms: 260)>>
## Non-/sparse entries: 1945/115835
## Sparsity           : 98%
## Maximal term length: 10
## Weighting          : term frequency (tf)
# get word frequencies from DTM
for (name in names){
  assign(paste("freq_dtm", name, sep = "_"), colSums(as.matrix(get(paste("dtm", name, sep = "_")))))
}
# find most frequent terms and strongest associations with "ear"
for (name in names){
  print(findFreqTerms(get(paste("dtm", name, sep = "_")), lowfreq=40))
  print(findAssocs(get(paste("dtm", name, sep = "_")), "ear", 0.4))
}
## [1] "ear"    "earwax" "yeah"  
## $ear
## canal candl 
##  0.44  0.44 
## 
## [1] "ear"  "yeah"
## $ear
## canal  want clean 
##  0.54  0.44  0.42 
## 
## [1] "dont"   "ear"    "earwax" "like"   "right"  "say"    "think"  "yeah"  
## $ear
## canal candl 
##  0.49  0.42
length(freq_dtm_Josh)
## [1] 126
length(freq_dtm_Chuck)
## [1] 122
length(freq_dtm_bg)
## [1] 260
# get number of words per line
for (name in names){
  words_per_doc <- data.frame(wordcount = rowSums(as.matrix(get(paste("dtm", name, sep = "_")))))
  words_per_doc$name <- rep(name, nrow(words_per_doc))
  
  assign(paste("words_per_doc", name, sep = "_"), words_per_doc)
}

words_per_doc <- rbind(words_per_doc_Josh, words_per_doc_Chuck)
words_per_doc <- words_per_doc[which(words_per_doc$wordcount > 0), ]

word_count <- data.frame(name = c(names[-3]), 
                         number_all_words = c(sum(words_per_doc[which(words_per_doc$name == "Josh"), 1]), 
                                       sum(words_per_doc[which(words_per_doc$name == "Chuck"), 1])),
                         number_lines = c(nrow(words_per_doc[which(words_per_doc$name == "Josh"), ]), 
                                       nrow(words_per_doc[which(words_per_doc$name == "Chuck"), ])))
word_count$words_per_line <- word_count$number_all_words/word_count$number_lines
word_count
##    name number_all_words number_lines words_per_line
## 1  Josh              880          202       4.356436
## 2 Chuck              865          193       4.481865
p1 <- ggplot(words_per_doc, aes(x = name, y = wordcount, fill = name)) +
  geom_bar(data = word_count, aes(x = name, y = words_per_line, fill = name), stat = "identity", alpha = .5) + 
  geom_boxplot() + 
  scale_fill_brewer(palette = "Set1", name = "") +
  my_theme() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
        legend.position = "none") +
  labs(y = "Number of words",
       title = "Number of words spoken\n per line of dialogue")
# create merged data frame for plotting
library(dplyr)

freq_df_Josh <- data.frame(words = names(freq_dtm_Josh), freq = freq_dtm_Josh)
freq_df_Chuck <- data.frame(words = names(freq_dtm_Chuck), freq = freq_dtm_Chuck)

freq_df <- full_join(freq_df_Josh, freq_df_Chuck, by = "words")

freq_df_bg  <- data.frame(words = names(freq_dtm_bg), freq = freq_dtm_bg)
freq_df <- full_join(freq_df, freq_df_bg, by = "words")

freq_df[is.na(freq_df)] <- 0
colnames(freq_df)[2:4] <- names
colnames(freq_df)[4] <- "background"
# subsetting only those words which occur at least 15 times in either Josh's, Chuck's or their combined lines
freq_df_subs <- freq_df[apply(freq_df[, -1], 1, function(x) any(x > 15)), ]

# gather for plotting
freq_df_subs_gather <- freq_df_subs %>% 
  gather(words, count)
colnames(freq_df_subs_gather)[2] <- "name"
# setting factor levels
f = as.factor(freq_df_bg[order(freq_df_bg$freq, decreasing = TRUE), "words"])
freq_df_subs_gather <- within(freq_df_subs_gather, words <- factor(words, levels = f))

p2 <- ggplot(freq_df_subs_gather, aes(x = words, y = count, fill = name)) +
  geom_bar(data = subset(freq_df_subs_gather, name == "background"), alpha = .3, stat = "identity") + 
  geom_bar(data = subset(freq_df_subs_gather, name != "background"), stat = "identity", position = position_dodge(width = .8), width = .7) + 
  scale_fill_brewer(palette = "Set1", name = "") +
  my_theme() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        legend.position = c(.9, .85)) +
  labs(y = "Frequency of use (count)",
       title = "Frequency of the most common words in Josh and Chuck's podcast")
grid.arrange(p1, p2, widths = c(0.3, 0.7), ncol = 2)
library(wordcloud)

#setting the same seed makes the look consistent
set.seed(42)

layout(matrix(c(1, 2, 3, 4), nrow = 2, byrow = TRUE), heights = c(0.1, 1))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Josh:", cex = 2)
plot.new()
text(x=0.5, y=0.5, "Chuck:", cex = 2)
wordcloud(names(freq_dtm_Josh), freq_dtm_Josh, min.freq = 8, colors = rev(brewer.pal(8, "Spectral")))
wordcloud(names(freq_dtm_Chuck), freq_dtm_Chuck, min.freq = 8, colors = rev(brewer.pal(8, "Spectral")))
library(dplyr)

for (name in names){
  
  for (i in 1:length(names(get(paste("freq_dtm", name, sep = "_"))))){
      associations <- as.data.frame(findAssocs(get(paste("dtm", name, sep = "_")), names(get(paste("freq_dtm", name, sep = "_")))[i], 0))
      associations$word <- rownames(associations)
  
  if (i == 1){
    associations_table <- associations[, c(2, 1)]
  } else {
    associations_table <- full_join(associations_table, associations, by = "word")
  }
  
  if (i == length(names(get(paste("freq_dtm", name, sep = "_"))))){
    associations_table[is.na(associations_table)] <- 0
    assign(paste("associations_table", name, sep = "_"), associations_table)
  }
  }
}
library(reshape2)

cor = 0.6

for (name in names){
  top_words <- get(paste("associations_table", name, sep = "_"))
  top_words <- top_words[apply(top_words[, -1], 1, function(x) any(x > cor)), c(TRUE, apply(top_words[, -1], 2, function(x) any(x > cor)))]
  
  associations_table_melt <- melt(top_words)

  p <- ggplot(data = associations_table_melt, aes(x = word, y = variable, fill = value)) + 
    geom_tile(width=.9, height=.9) +
    scale_fill_gradient2(low = "white", high = "red",  
                         limit = c(0,1), space = "Lab", 
                         name="") +
    my_theme() + 
    theme(
      axis.title = element_blank(),
      legend.position = "right") +
    coord_fixed() +
    labs(title = paste0("Word pair associations > ", cor, " in ", name, "'s lines"))
  
  print(p)
}
library(ggdendro)

for (name in names){
  #convert dtm to matrix
  m <- as.matrix(subset(get(paste("freq_df", name, sep = "_")), freq > 10))
  d <- dist(m, method = "maximum")
  hc <- hclust(d, method = "average")
  
  assign(paste("hc", name, sep = "_"), dendro_data(as.dendrogram(hc)))
}
ggplot() + 
  ylim(-5, 50) +
  geom_segment(data = segment(hc_bg), aes(x = x, y = y+40, xend = xend, yend = yend+40)) + 
  geom_segment(data = segment(hc_Josh), aes(x = x+5, y = y, xend = xend+5, yend = yend)) + 
  geom_segment(data = segment(hc_Chuck), aes(x = x+22, y = y, xend = xend+22, yend = yend)) + 
  geom_text(data = label(hc_bg), aes(x = x, y = y+37, label = label, color = label), angle = 45, lineheight = 0, vjust = 0, hjust = 0.6) +
  geom_text(data = label(hc_Josh), aes(x = x+5, y = y-3, label = label, color = label), angle = 45, lineheight = 0, vjust = 0, hjust = 0.6) +
  geom_text(data = label(hc_Chuck), aes(x = x+22, y = y-3, label = label, color = label), angle = 45, lineheight = 0, vjust = 0, hjust = 0.6) +
  ggplot2::annotate("text", x = 4, y = 48, label = "Background:") +
  ggplot2::annotate("text", x = 4.9, y = 26, label = "Josh:") +
  ggplot2::annotate("text", x = 21.5, y = 26, label = "Chuck:") +
  my_theme() + 
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = "none",
    panel.grid.major = element_blank()) +
  labs(title = "Hierarchical clustering dendrograms of words with at least 10 occurences")
freq_df$characters <- nchar(freq_df$words)
  
p1 <- ggplot(data = freq_df, aes(x = characters)) +
  geom_histogram(binwidth = 1, fill = "navy") +
  geom_vline(xintercept = round(mean(freq_df$characters), digits = 0),
             color = "red", size = 2, alpha = .5) +
  labs(x = "Letters", y = "Words",
       title = "Histogram of\nword length") +
  my_theme() +
  theme(axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5))
freq_df_gather <- freq_df %>% 
  gather(words, count, Josh:background)
colnames(freq_df_gather)[3] <- "name"

p2 <- ggplot(subset(freq_df_gather, name != "background"), aes(x = characters, y = count, label = words)) + 
  geom_point(color = "deepskyblue4", alpha = 0.5) + 
  my_theme() +
  geom_smooth(method = "lm", se = TRUE, size = 1, color = "black") +
  facet_grid(~ name) +
  theme(axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5)) +
  labs(x = "Frequency of use (count)", y = "Word length",
       title = "Correlation between word length and frequency\nof use in Josh's and Chuck's lines") +
  geom_text_repel(data = subset(freq_df_gather, name != "background" & count > 25))
library(stringr)
letters <- unlist(sapply(freq_df$words, function(x) str_split(x, "")))

library(qdap)
letters_tab <- dist_tab(letters)

# setting factor levels
f = as.factor(rev(letters_tab$interval))
letters_tab <- within(letters_tab, interval <- factor(interval, levels = f))

p3 <- ggplot(letters_tab, aes(x = interval, y = percent)) +
  geom_bar(stat = "identity", fill = "navy", alpha = 0.8) +
  coord_flip() +
  my_theme() +
  theme(axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5)) +
  labs(x = "Letter", y = "Percentage",
       title = "Frequency of letters")
grid.arrange(p1, p2, p3, widths = c(0.2, 0.5, 0.3), ncol = 3)
letters_df <- as.data.frame(letters)
letters_df$position <- as.factor(unlist(regmatches(rownames(letters_df), gregexpr("[0-9]+", rownames(letters_df)))))

letters_df_table <- as.data.frame(table(letters_df))
total_number_letter <- as.data.frame(table(letters_df$letters))

letters_df_table <- left_join(letters_df_table, total_number_letter, by = c("letters" = "Var1"))
letters_df_table$percent <- letters_df_table$Freq.x*100/letters_df_table$Freq.y
letters_df_table$percent[letters_df_table$percent == 0] <- Inf

letters_df_table$Freq.x[letters_df_table$Freq.x == 0] <- Inf

letters_df_table_gather <- letters_df_table[, -4] %>% 
  gather(letters, position, Freq.x:percent)
colnames(letters_df_table_gather)[3:4] <- c("measure", "value")

letters_df_table_gather$measure <- ifelse(letters_df_table_gather$measure == "Freq.x", "Count", "Percent")

# setting factor levels
f = as.factor(rev(1:10))
letters_df_table_gather <- within(letters_df_table_gather, position <- factor(position, levels = f))
ggplot(data = subset(letters_df_table_gather, !is.infinite(value)), aes(x = letters, y = position, fill = value)) + 
  geom_tile(width=.9, height=.9) +
  facet_grid(measure ~ .) +
  scale_fill_gradient(low = "moccasin", high = "red") +
  coord_flip() +
  my_theme() + 
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0, hjust = 0.5),
    legend.position = "right") +
  coord_fixed() +
  labs(x = "", y = "Position",
       title = "How often does each letter occur\nat which position in a word?")
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] qdap_2.2.5             qdapTools_1.3.1        qdapRegex_0.6.0       
##  [4] qdapDictionaries_1.0.6 stringr_1.1.0          ggdendro_0.1-20       
##  [7] reshape2_1.4.2         wordcloud_2.5          dplyr_0.5.0           
## [10] SnowballC_0.5.1        tm_0.6-2               NLP_0.1-9             
## [13] gridExtra_2.2.1        RColorBrewer_1.1-2     tidyr_0.6.0           
## [16] syuzhet_1.0.0          ggrepel_0.6.3          ggplot2_2.1.0         
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.5.0        venneuler_1.1-0     slam_0.1-37        
##  [4] rJava_0.9-8         reports_0.1.4       colorspace_1.2-7   
##  [7] htmltools_0.3.5     yaml_2.1.13         chron_2.3-47       
## [10] XML_3.98-1.4        DBI_0.5-1           plyr_1.8.4         
## [13] munsell_0.4.3       gtable_0.2.0        evaluate_0.10      
## [16] labeling_0.3        knitr_1.14          gender_0.5.1       
## [19] parallel_3.3.2      xlsxjars_0.6.1      Rcpp_0.12.7        
## [22] scales_0.4.0        formatR_1.4         gdata_2.17.0       
## [25] plotrix_3.6-3       xlsx_0.5.7          openNLPdata_1.5.3-2
## [28] digest_0.6.10       stringi_1.1.2       tools_3.3.2        
## [31] bitops_1.0-6        magrittr_1.5        lazyeval_0.2.0     
## [34] RCurl_1.95-4.8      tibble_1.2          MASS_7.3-45        
## [37] data.table_1.9.6    assertthat_0.1      rmarkdown_1.1      
## [40] openNLP_0.2-6       R6_2.2.0            igraph_1.0.1

Exploring the human genome (Part 2) - Transcripts

2016-11-01 00:00:00 +0000

How many transcripts and proteins do genes have?

In Exploring the human genome (Part 1) - Gene Annotations I examined Ensembl, Entrez and HGNC gene annotations with AnnotationDbi via three R packages: org.Hs.eg.db, EnsDb.Hsapiens.v79 and TxDb.Hsapiens.UCSC.hg38.knownGene.

Now, I want to know how many transcripts there are for genes in these databases.


What is a transcript?

While a gene is defined as a unit of DNA information which encodes for the production of a protein, it is really more a concept than an actual physical unit. Human genes conists of exons and introns, which can often be transcribed in different combinations - a process called alternative splicing.

“While the concept of a gene has been helpful in defining the relationship of a portion of a genome to a phenotype, this traditional term may not be as useful as it once was. Currently, “gene” has come to refer principally to a genomic region producing a polyadenylated mRNA that encodes a protein. However, the recent emergence of a large collection of unannotated transcripts with apparently little protein coding capacity, collectively called transcripts of unknown function (TUFs), has begun to blur the physical boundaries and genomic organization of genic regions with noncoding transcripts often overlapping protein-coding genes on the same (sense) and opposite strand (antisense). Moreover, they are often located in intergenic regions, making the genic portions of the human genome an interleaved network of both annotated polyadenylated and nonpolyadenylated transcripts, including splice variants with novel 5′ ends extending hundreds of kilobase. This complex transcriptional organization and other recently observed features of genomes argue for the reconsideration of the term “gene” and suggests that transcripts may be used to define the operational unit of a genome.” Thomas Gingeras, Genome Res. 2007


org.Hs.eg.db

With org.Hs.eg.db I am using Entrez and Ensembl IDs to obtain Ensembl transcript IDs, which show splice variants of a gene, including protein-coding and non-coding transcripts.

library(AnnotationDbi)
library(org.Hs.eg.db)

ENTREZID_org <- keys(org.Hs.eg.db, keytype = "ENTREZID")
ENSEMBL_org <- keys(org.Hs.eg.db, keytype = "ENSEMBL")

# Summarize number of transcripts per gene Entrez ID
org_trans_entrez <- AnnotationDbi::select(org.Hs.eg.db, keys = ENTREZID_org, columns = c("ENSEMBLTRANS"), keytype = "ENTREZID")
org_transcript_num_table_entrez <- as.data.frame(table(org_trans_entrez$ENTREZID))
colnames(org_transcript_num_table_entrez) <- c("Entrez", "orgDb")

# Summarize number of transcripts per gene Ensembl ID
org_trans_ensembl <- AnnotationDbi::select(org.Hs.eg.db, keys = ENSEMBL_org, columns = c("ENSEMBLTRANS"), keytype = "ENSEMBL")
org_transcript_num_table_ensembl <- as.data.frame(table(org_trans_ensembl$ENSEMBL))
colnames(org_transcript_num_table_ensembl) <- c("Ensembl", "orgDb")

# how many NAs are in each column?
sapply(org_trans_entrez, function(x) sum(is.na(x)))
##     ENTREZID ENSEMBLTRANS 
##            0        52081
sapply(org_trans_ensembl, function(x) sum(is.na(x)))
##      ENSEMBL ENSEMBLTRANS 
##            0        17678
head(org_trans_entrez)
##   ENTREZID    ENSEMBLTRANS
## 1        1            <NA>
## 2        2            <NA>
## 3        3 ENST00000543404
## 4        3 ENST00000566278
## 5        3 ENST00000545343
## 6        3 ENST00000544183
head(org_trans_ensembl)
##           ENSEMBL    ENSEMBLTRANS
## 1 ENSG00000121410            <NA>
## 2 ENSG00000175899            <NA>
## 3 ENSG00000256069 ENST00000543404
## 4 ENSG00000256069 ENST00000566278
## 5 ENSG00000256069 ENST00000545343
## 6 ENSG00000256069 ENST00000544183

Strangely, some genes, like A1BG (Entrez ID 1) are listed with one gene ID but no Ensembl transcript ID. This is weird, especially since I happen to know for this particular gene that it has several transcripts. Also, because each gene must have at least one transcript, all NAs are counted as 1 transcript in the summary table.

Let’s check other databases…


TxDb.Hsapiens.UCSC.hg38.knownGene

TxDb.Hsapiens.UCSC.hg38.knownGene only has Entrez IDs to identify genes and UCSC transcript ID to identify transcripts.

library(TxDb.Hsapiens.UCSC.hg38.knownGene)

ENTREZID_TxDb <- keys(TxDb.Hsapiens.UCSC.hg38.knownGene, keytype = "GENEID")
TxDb_trans <- AnnotationDbi::select(TxDb.Hsapiens.UCSC.hg38.knownGene, keys = ENTREZID_TxDb, columns = c("TXID"), keytype = "GENEID")

# Summarize number of transcripts per gene Entrez ID
TxDb_transcript_num_table_entrez <- as.data.frame(table(TxDb_trans$GENEID))
colnames(TxDb_transcript_num_table_entrez) <- c("Entrez", "TxDb")

# how many NAs are in each column?
sapply(TxDb_trans, function(x) sum(is.na(x)))
## GENEID   TXID 
##      0      0
head(TxDb_trans)
##   GENEID   TXID
## 1      1 166436
## 2      1 166437
## 3      1 166438
## 4      1 166439
## 5      1 166440
## 6      1 166441

Here, we don’t have NAs in the data and in contrast to org.Hs.eg.db we find at least six transcripts for A1BG.


EnsDb.Hsapiens.v79

Like org.Hs.eg.db EnsDb.Hsapiens.v79 has Entrez and Ensembl IDs in reference to Ensembl transcript IDs.

library(EnsDb.Hsapiens.v79)

ENSEMBL_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "GENEID")
ENTREZ_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "ENTREZID")

EnsDb_trans_ensembl <- ensembldb::select(EnsDb.Hsapiens.v79, keys = ENSEMBL_EnsDb, columns = c("TXID"), keytype = "GENEID")
EnsDb_trans_entrez <- ensembldb::select(EnsDb.Hsapiens.v79, keys = ENTREZ_EnsDb, columns = c("TXID"), keytype = "ENTREZID")

# somehow there are empty fields in the Entrez ID column, replacing them with NA
EnsDb_trans_entrez[EnsDb_trans_entrez ==  ""] <- NA

# how many NAs are in each column?
sapply(EnsDb_trans_entrez, function(x) sum(is.na(x)))
## ENTREZID     TXID 
##    47828        0
# and removing NA rows
EnsDb_trans_entrez <- EnsDb_trans_entrez[!is.na(EnsDb_trans_entrez$ENTREZID), ]
# order by Entrez ID to compare with other databases
EnsDb_trans_entrez <- EnsDb_trans_entrez[order(EnsDb_trans_entrez$ENTREZID),]

head(EnsDb_trans_ensembl)
##            GENEID            TXID
## 1 ENSG00000000003 ENST00000373020
## 2 ENSG00000000003 ENST00000496771
## 3 ENSG00000000003 ENST00000494424
## 4 ENSG00000000003 ENST00000612152
## 5 ENSG00000000003 ENST00000614008
## 6 ENSG00000000005 ENST00000373031
head(EnsDb_trans_entrez)
##        ENTREZID            TXID
## 95915         1 ENST00000596924
## 95916         1 ENST00000263100
## 95917         1 ENST00000595014
## 95918         1 ENST00000598345
## 95919         1 ENST00000600966
## 135994       10 ENST00000286479

Here, we find five transcripts for A1BG.


# Summarize number of transcripts per gene Entrez ID
EnsDb_transcript_num_table_entrez <- as.data.frame(table(EnsDb_trans_entrez$ENTREZID))
colnames(EnsDb_transcript_num_table_entrez) <- c("Entrez", "EnsDb")

# Summarize number of transcripts per gene Ensembl ID
EnsDb_transcript_num_table_ensembl <- as.data.frame(table(EnsDb_trans_ensembl$GENEID))
colnames(EnsDb_transcript_num_table_ensembl) <- c("Ensembl", "EnsDb")

# In the Entrez column, there are some with multiple entries
# divide entries with multiple gene names into one row per gene/ entry
head(EnsDb_transcript_num_table_entrez[grep(";", EnsDb_transcript_num_table_entrez$Entrez), ])
##                                     Entrez EnsDb
## 13                     100033415;100033421     1
## 15                     100033417;100033419     2
## 18                     100033421;100033415     1
## 40                     100033446;100033449     1
## 42 100033448;100033803;100033817;100033810     2
## 43                     100033449;100033446     1
library(splitstackshape)
out <- as.data.frame(cSplit(EnsDb_transcript_num_table_entrez, splitCols = "Entrez", sep = ";", direction = "long"), stringsAsFactors = FALSE)
out$Entrez <- as.character(out$Entrez)

# remove duplicates and take the mean
library(plyr)
EnsDb_transcript_num_table_entrez <- ddply(out, "Entrez", numcolwise(mean))


Comparison of transcript numbers per database package

The majority of genes have only one transcript. The number of genes with more transcripts decreases with number of transcripts; this can be seen in the plots below.

Entrez IDs

# merging datasets by Entrez ID
library(dplyr)
transcript_num_table_entrez <- full_join(org_transcript_num_table_entrez, TxDb_transcript_num_table_entrez, by = "Entrez")
transcript_num_table_entrez <- full_join(transcript_num_table_entrez, EnsDb_transcript_num_table_entrez, by = "Entrez")

# gather for plotting
library(tidyr)
transcript_num_table_gather_entrez <- transcript_num_table_entrez %>% 
  gather(DB, count, orgDb:EnsDb)

# How many counts are NA?
sapply(transcript_num_table_gather_entrez, function(x) sum(is.na(x)))
## Entrez     DB  count 
##      0      0  71066
# removing rows with NA counts
transcript_num_table_gather_entrez <- transcript_num_table_gather_entrez[!is.na(transcript_num_table_gather_entrez$count), ]

# because there are only a handful of genes with many transcripts, they can't be plotted together with genes with few transcripts
# separating them in high and low
transcript_num_table_gather_entrez$group <- ifelse(transcript_num_table_gather_entrez$count > 25, "high", "low")

# setting factor levels
f = c("low", "high")
transcript_num_table_gather_entrez <- within(transcript_num_table_gather_entrez, group <- factor(group, levels = f))
# setting my custom theme of choice
library(ggplot2)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "aliceblue"),
    strip.background = element_rect(fill = "lightgrey", color = "grey", size = 1),
    strip.text = element_text(face = "bold", size = 12, colour = "navy"),
    legend.position = "bottom",
    panel.margin = unit(.05, "lines"),
    panel.border = element_rect(color = "grey", fill = NA, size = 0.5)
  )
}
p <- ggplot(data = transcript_num_table_gather_entrez, aes(as.numeric(count))) + 
  geom_histogram(fill = "deepskyblue4") +
  my_theme() +
  labs(title = "Histogram of number of transcripts per gene (Entrez ID)", 
       x = "Number of transcripts per gene", y = "Count") +
  facet_wrap(DB ~ group, scales = "free", ncol = 2)

ann_text_entrez <- data.frame(x = c(20, 300, 20, 100, 20, 100),
                       y = c(4000, 350, 40000, 15, 5000, 280),
                       group = rep(c("low", "high"), 3),
                       DB = rep(c("EnsDb", "orgDb", "TxDb"), each = 2),
                       labs = c(paste(length(which(transcript_num_table_gather_entrez$group ==  "low" & 
                                                     transcript_num_table_gather_entrez$DB ==  "EnsDb"))), 
                                paste(length(which(transcript_num_table_gather_entrez$group ==  "high" & 
                                                     transcript_num_table_gather_entrez$DB ==  "EnsDb"))),
                                paste(length(which(transcript_num_table_gather_entrez$group ==  "low" & 
                                                     transcript_num_table_gather_entrez$DB ==  "orgDb"))), 
                                paste(length(which(transcript_num_table_gather_entrez$group ==  "high" & 
                                                     transcript_num_table_gather_entrez$DB ==  "orgDb"))),
                                paste(length(which(transcript_num_table_gather_entrez$group ==  "low" & 
                                                     transcript_num_table_gather_entrez$DB ==  "TxDb"))), 
                                paste(length(which(transcript_num_table_gather_entrez$group ==  "high" & 
                                                     transcript_num_table_gather_entrez$DB ==  "TxDb")))))

p + geom_text(data = ann_text_entrez, aes(x, y, label = labs, group = NULL), size = 8)

This time, orgDb has the highest number of Entrez ID gene entries with corresponding transcript information, the majority of which have fewer than 25 transcripts (59984 genes); only 152 genes have more than 25 transcripts. EnsDb and TxDB have comparable numbers of gene entries, also for genes with few (23971 and 24669 genes) and many transcripts (566 and 552 genes).


Ensembl IDs

# merging datasets by Ensembl ID
transcript_num_table_ensembl <- full_join(org_transcript_num_table_ensembl, EnsDb_transcript_num_table_ensembl, by = "Ensembl")

# gather for plotting
transcript_num_table_gather_ensembl <- transcript_num_table_ensembl %>% 
  gather(DB, count, orgDb:EnsDb)

# How many counts are NA?
sapply(transcript_num_table_gather_ensembl, function(x) sum(is.na(x)))
## Ensembl      DB   count 
##       0       0   38659
# removing rows with NA counts
transcript_num_table_gather_ensembl <- transcript_num_table_gather_ensembl[!is.na(transcript_num_table_gather_ensembl$count), ]

# because there are only a handful of genes with many transcripts, they can't be plotted together with genes with few transcripts
# separating them in high and low
transcript_num_table_gather_ensembl$group <- ifelse(transcript_num_table_gather_ensembl$count > 25, "high", "low")

# setting factor levels
f = c("low", "high")
transcript_num_table_gather_ensembl <- within(transcript_num_table_gather_ensembl, group <- factor(group, levels = f))
p <- ggplot(data = transcript_num_table_gather_ensembl, aes(as.numeric(count))) + 
  geom_histogram(fill = "deepskyblue4") +
  my_theme() +
  labs(title = "Histogram of number of transcripts per gene (Ensembl ID)", 
       x = "Number of transcripts per gene", y = "Count") +
  facet_wrap(DB ~ group, scales = "free", ncol = 2)

ann_text_ensembl <- data.frame(x = c(10, 75, 10, 100),
                              y = c(30000, 100, 15000, 100),
                              group = rep(c("low", "high"), 4),
                              DB = rep(c("EnsDb", "orgDb"), each = 2),
                              labs = c(paste(length(which(transcript_num_table_gather_ensembl$group ==  "low" & 
                                                            transcript_num_table_gather_ensembl$DB ==  "EnsDb"))), 
                                       paste(length(which(transcript_num_table_gather_ensembl$group ==  "high" & 
                                                            transcript_num_table_gather_ensembl$DB ==  "EnsDb"))),
                                       paste(length(which(transcript_num_table_gather_ensembl$group ==  "low" & 
                                                            transcript_num_table_gather_ensembl$DB ==  "orgDb"))), 
                                       paste(length(which(transcript_num_table_gather_ensembl$group ==  "high" & 
                                                            transcript_num_table_gather_ensembl$DB ==  "orgDb")))))

p + geom_text(data = ann_text_ensembl, aes(x, y, label = labs, group = NULL), size = 8)

For Ensembl gene IDs EnsDb has the higher number of entries with transcript information; most of these genes (65374) have fewer than 25 transcripts. While orgDb has fewer Ensembl-transcript entries, proportionally more have a high number of transcripts (1035 vs 400), while 26980 genes have low transcript numbers.


Comparison of transcript numbers per gene

Now that we know what the distribution of transcript numbers per gene is in different database packages, I want to directly compare the numbers of transcripts that are identified for each gene.

# for comparison replacing NAs with 0
transcript_num_table_ensembl_NAtozero <- transcript_num_table_ensembl
transcript_num_table_ensembl_NAtozero[is.na(transcript_num_table_ensembl_NAtozero)] <- 0

transcript_num_table_entrez_NAtozero <- transcript_num_table_entrez
transcript_num_table_entrez_NAtozero[is.na(transcript_num_table_entrez_NAtozero)] <- 0
library(ggrepel)

p1 <- ggplot(transcript_num_table_ensembl_NAtozero, aes(x = orgDb, y = EnsDb)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +            # Add a loess smoothed fit curve with confidence region
  labs(title = "Number of transcripts per Ensembl ID") +
  geom_text_repel(data = subset(transcript_num_table_ensembl_NAtozero, orgDb > 140 | EnsDb > 150), aes(label = Ensembl))

p2 <- ggplot(transcript_num_table_entrez_NAtozero, aes(x = orgDb, y = TxDb)) + 
  geom_abline(linetype="dashed") + 
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts per Entrez ID") +
  geom_text_repel(data = subset(transcript_num_table_entrez_NAtozero, orgDb > 100 | TxDb > 200), aes(label = Entrez))

p3 <- ggplot(transcript_num_table_entrez_NAtozero, aes(x = orgDb, y = EnsDb)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts per Entrez ID") +
  geom_text_repel(data = subset(transcript_num_table_entrez_NAtozero, orgDb > 100 | EnsDb > 900), aes(label = Entrez))

p4 <- ggplot(transcript_num_table_entrez_NAtozero, aes(x = TxDb, y = EnsDb)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts per Entrez ID") +
  geom_text_repel(data = subset(transcript_num_table_entrez_NAtozero, TxDb > 150 | EnsDb > 900), aes(label = Entrez))

library(gridExtra)
library(grid)

grid.arrange(p1, p2, p3, p4, ncol = 2)

The plots above show scatterplots with fitted loess curves with confidence regions of the numbers of transcripts for each gene with either Entrez or Ensembl entries from different database packages; dashed lines show 1:1 ratio. For most extreme genes, the Ensembl/ Entrez ID is given in the plots.

While some genes do have the same number of transcripts in different database packages, the majority do not. Entrez IDs seem to give a better correlation than Ensembl IDs, except for a few extreme outliers.


Is there a correlation between number of transcripts and number of exons of a gene?

Because the number of exons a gene has exponentially increases the number of potential transcripts produced via alternative splicing, it would make sense if the number of exons correlates with the number of transcripts of a gene. However, not all combinations of exons will produce a functioning protein, so there won’t be as many transcripts as theoretically possible.

Exon IDs are only available in TxDb.Hsapiens.UCSC.hg38.knownGene and EnsDb.Hsapiens.v79.

TxDb_exons <- AnnotationDbi::select(TxDb.Hsapiens.UCSC.hg38.knownGene, keys = ENTREZID_TxDb, columns = c("EXONID"), keytype = "GENEID")

EnsDb_exons_ensembl <- ensembldb::select(EnsDb.Hsapiens.v79, keys = ENSEMBL_EnsDb, columns = c("EXONID"), keytype = "GENEID")
EnsDb_exons_entrez <- ensembldb::select(EnsDb.Hsapiens.v79, keys = ENTREZ_EnsDb, columns = c("EXONID"), keytype = "ENTREZID")

# summarize number of exons per gene
TxDb_exons_num_table_entrez <- as.data.frame(table(TxDb_exons$GENEID))

EnsDb_exons_num_table_ensembl <- as.data.frame(table(EnsDb_exons_ensembl$GENEID))
EnsDb_exons_num_table_entrez <- as.data.frame(table(EnsDb_exons_entrez$ENTREZID))

# there are over 98000 entries without corresponding Entrez ID, removing those
EnsDb_exons_num_table_entrez <- EnsDb_exons_num_table_entrez[-1, ]
EnsDb_trans_vs_exons_ensembl <- full_join(EnsDb_transcript_num_table_ensembl, EnsDb_exons_num_table_ensembl, by = c("Ensembl" = "Var1"))
colnames(EnsDb_trans_vs_exons_ensembl)[2:3] <- c("EnsDb_transcripts", "EnsDb_exons")

EnsDb_trans_vs_exons_entrez <- full_join(EnsDb_transcript_num_table_entrez, EnsDb_exons_num_table_entrez, by = c("Entrez" = "Var1"))
colnames(EnsDb_trans_vs_exons_entrez)[2:3] <- c("EnsDb_transcripts", "EnsDb_exons")

TxDb_trans_vs_exons_entrez <- full_join(TxDb_transcript_num_table_entrez, TxDb_exons_num_table_entrez, by = c("Entrez" = "Var1"))
colnames(TxDb_trans_vs_exons_entrez)[2:3] <- c("TxDb_transcripts", "TxDb_exons")
p1 <- ggplot(EnsDb_trans_vs_exons_ensembl, aes(x = EnsDb_transcripts, y = EnsDb_exons)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts vs number of exons\nper Ensembl ID in EnsDb") +
  geom_text_repel(data = subset(EnsDb_trans_vs_exons_ensembl, EnsDb_transcripts > 150 | EnsDb_exons > 300), aes(label = Ensembl))

p2 <- ggplot(EnsDb_trans_vs_exons_entrez, aes(x = EnsDb_transcripts, y = EnsDb_exons)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts vs number of exons\nper Entrez ID in EnsDb") +
  geom_text_repel(data = subset(EnsDb_trans_vs_exons_entrez, EnsDb_transcripts > 300 | EnsDb_exons > 1000), aes(label = Entrez))

p3 <- ggplot(TxDb_trans_vs_exons_entrez, aes(x = TxDb_transcripts, y = TxDb_exons)) + 
  geom_abline(linetype="dashed") +
  geom_point(colour = "deepskyblue4", alpha = 0.3) + 
  my_theme() +
  geom_smooth(size = 1, color = "black") +
  labs(title = "Number of transcripts vs number of exons\nper Entrez ID in TxDb") +
  geom_text_repel(data = subset(TxDb_trans_vs_exons_entrez, TxDb_transcripts > 200 | TxDb_exons > 400), aes(label = Entrez))

grid.arrange(p1, p2, p3, ncol = 1)

Indeed, there is a reasonably good correlation between number of transcripts and number of exons ins genes. The scatterplots above show transcript vs exon number per gene in the three database packages with fitted loess curves with confidence regions. As expected, the increase is not exponential as theoretically possible, but stronger than 1:1. For Entrez IDs, in both EnsDb and TxDbm genes with up to about 100 transcripts show a steeper correlation than genes with more than 100 transcripts. For Ensembl IDs the cutoff is around 40 transcripts.


What genes have many exons but few transcripts?

Interestingly, there are a couple of genes with many exons but only few transcripts. To find out more about these genes, I will compare their proportion of gene biotypes.

# gene biotypes are only available in EnsDb
EnsDb_trans_vs_exons_ensembl$ratio <- EnsDb_trans_vs_exons_ensembl$EnsDb_exons/EnsDb_trans_vs_exons_ensembl$EnsDb_transcripts
EnsDb_trans_vs_exons_ensembl_genes <- as.character(EnsDb_trans_vs_exons_ensembl[order(EnsDb_trans_vs_exons_ensembl$ratio, decreasing = TRUE), "Ensembl"][1:100])

EnsDb_trans_vs_exons_entrez$ratio <- EnsDb_trans_vs_exons_entrez$EnsDb_exons/EnsDb_trans_vs_exons_entrez$EnsDb_transcripts
EnsDb_trans_vs_exons_entrez_genes <- as.character(EnsDb_trans_vs_exons_entrez[order(EnsDb_trans_vs_exons_entrez$ratio, decreasing = TRUE), "Entrez"][1:100])
# get all gene biotypes for all gene IDs
biotypes_exons_trans_EnsDb_ensembl_all <- ensembldb::select(EnsDb.Hsapiens.v79, keys = EnsDb_trans_vs_exons_ensembl_genes, 
                                          columns = "GENEBIOTYPE", keytype = "GENEID")

biotypes_exons_trans_EnsDb_entrez_all <- ensembldb::select(EnsDb.Hsapiens.v79, keys = EnsDb_trans_vs_exons_entrez_genes, 
                                          columns = "GENEBIOTYPE", keytype = "ENTREZID")
# get all gene biotypes for all gene IDs
biotypes_EnsDb_ensembl_all <- ensembldb::select(EnsDb.Hsapiens.v79, keys = keys(EnsDb.Hsapiens.v79, keytype = "GENEID"), 
                                          columns = "GENEBIOTYPE", keytype = "GENEID")

biotypes_EnsDb_entrez_all <- ensembldb::select(EnsDb.Hsapiens.v79, keys = keys(EnsDb.Hsapiens.v79, keytype = "ENTREZID"), 
                                          columns = "GENEBIOTYPE", keytype = "ENTREZID")
# how many NAs are in each column?
sapply(biotypes_exons_trans_EnsDb_ensembl_all, function(x) sum(is.na(x)))
##      GENEID GENEBIOTYPE 
##           0           0
sapply(biotypes_exons_trans_EnsDb_entrez_all, function(x) sum(is.na(x)))
##    ENTREZID GENEBIOTYPE 
##           0           0
# summary table of number of genebiotypes
biotypes_exons_trans_EnsDb_ensembl_all_num_table <- as.data.frame(table(biotypes_exons_trans_EnsDb_ensembl_all$GENEBIOTYPE))
biotypes_exons_trans_EnsDb_entrez_all_num_table <- as.data.frame(table(biotypes_exons_trans_EnsDb_entrez_all$GENEBIOTYPE))
# subset to keep only those biotypes that are among the top genes
biotypes_EnsDb_ensembl_all_subs <- biotypes_EnsDb_ensembl_all[which(biotypes_EnsDb_ensembl_all$GENEBIOTYPE %in% 
                                                                      unique(biotypes_exons_trans_EnsDb_ensembl_all$GENEBIOTYPE)), ]

biotypes_EnsDb_entrez_all_subs <- biotypes_EnsDb_entrez_all[which(biotypes_EnsDb_entrez_all$GENEBIOTYPE %in% 
                                                                      unique(biotypes_exons_trans_EnsDb_entrez_all$GENEBIOTYPE)), ]

# summary table of biotypes background (all genes)
biotypes_EnsDb_ensembl_all_subs_table <- as.data.frame(table(biotypes_EnsDb_ensembl_all_subs$GENEBIOTYPE))
biotypes_EnsDb_entrez_all_subs_table <- as.data.frame(table(biotypes_EnsDb_entrez_all_subs$GENEBIOTYPE))

# add background number of biotypes to number among top genes
biotypes_EnsDb_ensembl_top_num_table <- merge(biotypes_exons_trans_EnsDb_ensembl_all_num_table, biotypes_EnsDb_ensembl_all_subs_table, by = "Var1", all.x = TRUE)
biotypes_EnsDb_entrez_top_num_table <- merge(biotypes_exons_trans_EnsDb_entrez_all_num_table, biotypes_EnsDb_entrez_all_subs_table, by = "Var1", all.x = TRUE)
# merge
biotypes_top_num_table <- merge(biotypes_EnsDb_ensembl_top_num_table, biotypes_EnsDb_entrez_top_num_table, by = "Var1", all = TRUE)
colnames(biotypes_top_num_table) <- c("GENEBIOTYPE", "Ensembl_top", "Ensembl_bg", "Entrez_top", "Entrez_bg")

biotypes_top_num_table$Ensembl <- biotypes_top_num_table$Ensembl_top/biotypes_top_num_table$Ensembl_bg
biotypes_top_num_table$Entrez <- biotypes_top_num_table$Entrez_top/biotypes_top_num_table$Entrez_bg

# gather for plotting
biotypes_top_num_table_gather <- biotypes_top_num_table[, c(1, 6, 7)] %>% 
  gather(GENEBIOTYPE, count, Ensembl:Entrez)
colnames(biotypes_top_num_table_gather)[2] <- "Gene_ID"

# replace NA with 0
biotypes_top_num_table_gather[is.na(biotypes_top_num_table_gather)] <- 0
ggplot(biotypes_top_num_table_gather, aes(x = factor(GENEBIOTYPE), y = count, fill = Gene_ID)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  my_theme() + 
  theme(legend.position = "right") + coord_flip() +
  labs(title = "Gene biotypes of genes with many exons but few transcripts", x = "", y = "Ratio of biotype count vs background number",
       fill = "Gene ID") + 
  scale_x_discrete(breaks = biotypes_top_num_table_gather$GENEBIOTYPE,
                      labels = gsub("_", " ", biotypes_top_num_table_gather$GENEBIOTYPE)) +
  scale_fill_brewer(palette = "Set1")

Compared to the background distribution of biotypes, the proportionally most strongly represented group are LRG genes. As defined by Ensembl’s glossary:

“An LRG (Locus Reference Genomic) is a fixed sequence, independent of the genome, specifically created for the diagnostic community to record DNA sequence variation on a fixed framework. Sequence variants in LSDBs (Locus Specific Databases) are reported using LRG sequences.”

Other than this, there are only protein coding, processed and pseudogenes among genes with many exons and few transcripts. So, nothing exceptional about these genes’s biotypes.


What types of genes have the most transcripts?

Is there a biological function associated with having many transcripts? In order to find out more about these genes, I am looking at

  1. their biotypes and
  2. for over-represented gene ontology (GO) categories.
# mean number of transcripts over all database packages
mean_num_transcripts_ensembl <- data.frame(mean_num_transcripts = rowMeans(transcript_num_table_ensembl_NAtozero[, -1]),
               row.names = transcript_num_table_ensembl_NAtozero[, 1])

mean_num_transcripts_entrez <- data.frame(mean_num_transcripts = rowMeans(transcript_num_table_entrez_NAtozero[, -1]),
               row.names = transcript_num_table_entrez_NAtozero[, 1])

# standard error of mean
mean_num_transcripts_ensembl$sem <- apply(transcript_num_table_ensembl_NAtozero[, -1], 1, function(x) sd(x)/sqrt(length(x)))
mean_num_transcripts_entrez$sem <- apply(transcript_num_table_entrez_NAtozero[, -1], 1, function(x) sd(x)/sqrt(length(x)))

# ranking the mean
mean_num_transcripts_ensembl$rank <- rank(mean_num_transcripts_ensembl$mean_num_transcripts)
mean_num_transcripts_ensembl <- mean_num_transcripts_ensembl[order(mean_num_transcripts_ensembl$rank, decreasing = TRUE), ]

mean_num_transcripts_entrez$rank <- rank(mean_num_transcripts_entrez$mean_num_transcripts)
mean_num_transcripts_entrez <- mean_num_transcripts_entrez[order(mean_num_transcripts_entrez$rank, decreasing = TRUE), ]


I am picking the top 1000 genes with highest mean number of transcripts.

# picking top genes
mean_num_transcripts_ensembl_top <- mean_num_transcripts_ensembl[1:1000, ]
mean_num_transcripts_entrez_top <- mean_num_transcripts_entrez[1:1000, ]

# distribution of mean number of transcripts
p1 <- ggplot(mean_num_transcripts_ensembl_top, aes(x=mean_num_transcripts)) + geom_density(fill="navy") + my_theme() +
  labs(title = "Mean number of transcripts\nper Ensembl ID for the top genes", x = "mean number of transcripts pr gene")
p2 <- ggplot(mean_num_transcripts_entrez_top, aes(x=mean_num_transcripts)) + geom_density(fill="navy") + my_theme() +
  labs(title = "Mean number of transcripts\nper Entrez ID for the top genes", x = "mean number of transcripts per gene")
grid.arrange(p1, p2, ncol = 1)

The density distribution of the mean number of transcripts per gene (mean over database packages) for the top genes with highest transcript numbers shows the stark difference between Ensembl and Entrez IDs.


Gene biotypes of top genes with highest mean number of transcripts

biotypes_EnsDb_ensembl_top <- ensembldb::select(EnsDb.Hsapiens.v79, keys = rownames(mean_num_transcripts_ensembl_top), 
                                          columns = "GENEBIOTYPE", keytype = "GENEID")

biotypes_EnsDb_entrez_top <- ensembldb::select(EnsDb.Hsapiens.v79, keys = rownames(mean_num_transcripts_entrez_top), 
                                          columns = "GENEBIOTYPE", keytype = "ENTREZID")

# how many NAs are in each column?
sapply(biotypes_EnsDb_ensembl_top, function(x) sum(is.na(x)))
##      GENEID GENEBIOTYPE 
##           0           0
sapply(biotypes_EnsDb_entrez_top, function(x) sum(is.na(x)))
##    ENTREZID GENEBIOTYPE 
##           0           0
# summary table of number of genebiotypes
biotypes_EnsDb_ensembl_top_num_table <- as.data.frame(table(biotypes_EnsDb_ensembl_top$GENEBIOTYPE))
biotypes_EnsDb_entrez_top_num_table <- as.data.frame(table(biotypes_EnsDb_entrez_top$GENEBIOTYPE))
# subset to keep only those biotypes that are among the top genes
biotypes_EnsDb_ensembl_all_subs <- biotypes_EnsDb_ensembl_all[which(biotypes_EnsDb_ensembl_all$GENEBIOTYPE %in% 
                                                                      unique(biotypes_EnsDb_ensembl_top_num_table$Var1)), ]

biotypes_EnsDb_entrez_all_subs <- biotypes_EnsDb_entrez_all[which(biotypes_EnsDb_entrez_all$GENEBIOTYPE %in% 
                                                                      unique(biotypes_EnsDb_entrez_top_num_table$Var1)), ]

# summary table of biotypes background (all genes)
biotypes_EnsDb_ensembl_all_subs_table <- as.data.frame(table(biotypes_EnsDb_ensembl_all_subs$GENEBIOTYPE))
biotypes_EnsDb_entrez_all_subs_table <- as.data.frame(table(biotypes_EnsDb_entrez_all_subs$GENEBIOTYPE))

# add background number of biotypes to number among top genes
biotypes_EnsDb_ensembl_top_num_table <- merge(biotypes_EnsDb_ensembl_top_num_table, biotypes_EnsDb_ensembl_all_subs_table, by = "Var1", all.x = TRUE)
biotypes_EnsDb_entrez_top_num_table <- merge(biotypes_EnsDb_entrez_top_num_table, biotypes_EnsDb_entrez_all_subs_table, by = "Var1", all.x = TRUE)
# merge
biotypes_top_num_table <- merge(biotypes_EnsDb_ensembl_top_num_table, biotypes_EnsDb_entrez_top_num_table, by = "Var1", all = TRUE)
colnames(biotypes_top_num_table) <- c("GENEBIOTYPE", "Ensembl_top", "Ensembl_bg", "Entrez_top", "Entrez_bg")

biotypes_top_num_table$Ensembl <- biotypes_top_num_table$Ensembl_top/biotypes_top_num_table$Ensembl_bg
biotypes_top_num_table$Entrez <- biotypes_top_num_table$Entrez_top/biotypes_top_num_table$Entrez_bg

# gather for plotting
biotypes_top_num_table_gather <- biotypes_top_num_table[, c(1, 6, 7)] %>% 
  gather(GENEBIOTYPE, count, Ensembl:Entrez)
colnames(biotypes_top_num_table_gather)[2] <- "Gene_ID"

# replace NA with 0
biotypes_top_num_table_gather[is.na(biotypes_top_num_table_gather)] <- 0
ggplot(biotypes_top_num_table_gather, aes(x = factor(GENEBIOTYPE), y = count, fill = Gene_ID)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  my_theme() + 
  theme(legend.position = "right") + coord_flip() +
  labs(title = "Gene biotypes of top genes\nwith highest transcript numbers", x = "", y = "Ratio of biotype count vs background number",
       fill = "Gene ID") + 
  scale_x_discrete(breaks = biotypes_top_num_table_gather$GENEBIOTYPE,
                      labels = gsub("_", " ", biotypes_top_num_table_gather$GENEBIOTYPE)) +
  scale_fill_brewer(palette = "Set1")

Again, compared to background, we find a high proportion of LRG, protein coding, processed and pseudogenes. But here, we also see a few other biotypes represented among genes with most transcripts. Especially interesting is that we find lincRNAs and miRNAs among them, not something I would have expected…


Enrichment Analysis of top genes with highest mean number of transcripts

Functional profiles for the top 100 genes with most transcripts were created with clusterProfiler’s compareCluster function.


Grouping GO terms

With groupGO, we can see the distribution of GO categories among input genes.


GO category enrichment

Over-representation of GO categories is calculated with enrichGO.

Especially interesting to me is that the GO enrichment analysis of genes with many transcripts finds these genes to be overproportionally involved in immune system functions like MCH class II and antigen binding. These functions are known for their high variability, which is necessary for efficient immune defenses against a wide variety of intruders (pathogens).

We also see an enrichment of genes involved in cell-cell-adhesion, specifically cadherin binding. These have also been known for their high variability via alternative splicing.



## R version 3.3.1 (2016-06-21)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.12 (Sierra)
## 
## locale:
## [1] de_DE.UTF-8/de_DE.UTF-8/de_DE.UTF-8/C/de_DE.UTF-8/de_DE.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] clusterProfiler_3.2.0                  
##  [2] DOSE_3.0.0                             
##  [3] gridExtra_2.2.1                        
##  [4] ggrepel_0.5                            
##  [5] ggplot2_2.1.0                          
##  [6] tidyr_0.6.0                            
##  [7] dplyr_0.5.0                            
##  [8] plyr_1.8.4                             
##  [9] splitstackshape_1.4.2                  
## [10] data.table_1.9.6                       
## [11] EnsDb.Hsapiens.v79_1.1.0               
## [12] ensembldb_1.6.0                        
## [13] TxDb.Hsapiens.UCSC.hg38.knownGene_3.4.0
## [14] GenomicFeatures_1.26.0                 
## [15] GenomicRanges_1.26.0                   
## [16] GenomeInfoDb_1.10.0                    
## [17] org.Hs.eg.db_3.4.0                     
## [18] AnnotationDbi_1.36.0                   
## [19] IRanges_2.8.0                          
## [20] S4Vectors_0.12.0                       
## [21] Biobase_2.34.0                         
## [22] BiocGenerics_0.20.0                    
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.1                    AnnotationHub_2.6.0          
##  [3] splines_3.3.1                 shiny_0.14.1                 
##  [5] assertthat_0.1                DO.db_2.9                    
##  [7] interactiveDisplayBase_1.12.0 Rsamtools_1.26.0             
##  [9] yaml_2.1.13                   RSQLite_1.0.0                
## [11] lattice_0.20-34               chron_2.3-47                 
## [13] digest_0.6.10                 RColorBrewer_1.1-2           
## [15] XVector_0.14.0                qvalue_2.6.0                 
## [17] colorspace_1.2-7              htmltools_0.3.5              
## [19] httpuv_1.3.3                  Matrix_1.2-7.1               
## [21] XML_3.98-1.4                  biomaRt_2.30.0               
## [23] zlibbioc_1.20.0               xtable_1.8-2                 
## [25] GO.db_3.4.0                   scales_0.4.0                 
## [27] BiocParallel_1.8.0            tibble_1.2                   
## [29] mgcv_1.8-15                   SummarizedExperiment_1.4.0   
## [31] lazyeval_0.2.0                magrittr_1.5                 
## [33] mime_0.5                      evaluate_0.10                
## [35] nlme_3.1-128                  BiocInstaller_1.24.0         
## [37] tools_3.3.1                   formatR_1.4                  
## [39] stringr_1.1.0                 munsell_0.4.3                
## [41] Biostrings_2.42.0             RCurl_1.95-4.8               
## [43] igraph_1.0.1                  bitops_1.0-6                 
## [45] labeling_0.3                  rmarkdown_1.1                
## [47] gtable_0.2.0                  DBI_0.5-1                    
## [49] reshape2_1.4.1                R6_2.2.0                     
## [51] GenomicAlignments_1.10.0      knitr_1.14                   
## [53] rtracklayer_1.34.0            fastmatch_1.0-4              
## [55] fgsea_1.0.0                   stringi_1.1.2                
## [57] GOSemSim_2.0.0                Rcpp_0.12.7

Exploring the human genome (Part 1) - Gene Annotations

2016-10-23 00:00:00 +0000

When working with any type of genome data, we often look for annotation information about genes, e.g. what’s the gene’s full name, what’s its abbreviated symbol, what ID it has in other databases, what functions have been described, how many and which transcripts exist, etc.

However, when looking for this information we (luckily) find a number of different databases and packages to access their information in R. But with this often comes confusion, because they all vary - sometimes only slightly, other times quite strongly - in how many genes they list and what they consider to be a “gene”.



What exactly is a gene?

The narrow traditional definition of a gene is that it is a hereditary unit of information, which meant that it is a unit of DNA which encodes for the production of a protein. The Human Genome Project has estimated that the human genome comprises 20000 to 25000 genes.

However, if we take the definition of gene more liberally, we could also consider chunks of DNA which code for other functional units, like miRNAs, pseudogenes or other regulatory elements.



AnnotationDbi

The R package AnnotationDbi provides connections to a large number of annotation resources for gene and genome information of many organisms. It can access feature data from a number of gene or genome centric packages.

Here, I want to explore some of these packages and compare the information they contain.



How many genes are there in different databases?

Because genes can have identifiers from many databases (but not necessarily from all and there is not always a one-to-one mapping between them), we compare the number of unique identifiers for Entrez and Ensembl, as well as for gene symbol.

Entrez Gene is NCBI’s database for gene-specific information. It does not include all known or predicted genes; instead Entrez Gene focuses on the genomes that have been completely sequenced, that have an active research community to contribute gene-specific information, or that are scheduled for intense sequence analysis.” Maglott et al., Nucleic Acids Research 2005

Ensembl is a genome browser but it also includes gene annotations.

Gene symbols represent the official gene name abbreviation from HGNC.



org.Hs.eg.db

The first package I looked at was org.Hs.eg.db, which contains genome wide annotations for the human genome.

library(AnnotationDbi)
library(org.Hs.eg.db)

# We can find out which keys and/or columns can be accessed by running
# keytypes(org.Hs.eg.db) or columns(org.Hs.eg.db)

ENTREZID_org <- keys(org.Hs.eg.db, keytype = "ENTREZID")
ENSEMBL_org <- keys(org.Hs.eg.db, keytype = "ENSEMBL")
HGNC_org <- keys(org.Hs.eg.db, keytype = "SYMBOL")


EnsDb.Hsapiens.v79

EnsDb.Hsapiens.v79 contains annotation information from Ensembl. Here, we also find genes based on Ensembl, Entrez and HGNC IDs.

library(EnsDb.Hsapiens.v79)

# keytypes(EnsDb.Hsapiens.v79)

ENTREZID_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "ENTREZID")
ENSEMBL_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "GENEID")
HGNC_EnsDb <- keys(EnsDb.Hsapiens.v79, keytype = "SYMBOL")


TxDb.Hsapiens.UCSC.hg38.knownGene

TxDb.Hsapiens.UCSC.hg38.knownGene provides annotation information from UCSC, specifically on transcripts in form of TxDb objects. Here, we only find comparable information on Entrez IDs.

library(TxDb.Hsapiens.UCSC.hg38.knownGene)

# keytypes(TxDb.Hsapiens.UCSC.hg38.knownGene)

ENTREZID_TxDb <- keys(TxDb.Hsapiens.UCSC.hg38.knownGene, keytype = "GENEID")


Comparison of gene numbers from the three databases accessed via the three R packages

# setting my custom theme of choice
library(ggplot2)

my_theme <- function(base_size = 12, base_family = "sans"){
  theme_grey(base_size = base_size, base_family = base_family) +
  theme(
    axis.text = element_text(size = 14, angle = 90, hjust = 1),
    axis.title = element_text(size = 14),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white"),
    strip.background = element_rect(fill = "cornsilk", color = "maroon", size = 2),
    strip.text = element_text(face = "bold", size = 15, colour = "maroon"),
    legend.position = "bottom"
  )
}
# preparing the gene table for plotting
gene_table <- data.frame(DB = c("Entrez_orgDb", "Ensembl_orgDb", "HGNC_orgDb", 
                                "Entrez_EnsDb", "Ensembl_EnsDb", "HGNC_EnsDb", "Entrez_TxDb"), 
                         no_genes = c(length(unique(ENTREZID_org)), length(unique(ENSEMBL_org)), 
                                      length(unique(HGNC_org)), length(unique(ENTREZID_EnsDb)),
                                      length(unique(ENSEMBL_EnsDb)), length(unique(HGNC_EnsDb)), 
                                      length(unique(ENTREZID_TxDb))))

p <- ggplot(gene_table, aes(x = factor(DB), y = no_genes, label = no_genes)) + 
  geom_bar(stat = "identity", fill = "maroon") +
  my_theme() +
  labs(title = "Number of genes in databases", y = "Count", x = "")

p + geom_text(size = 6, vjust = 1.5)

When we compare the numbers of genes we can obtain from the three databases Ensembl, Entrez and HGNC via three R packages (org.Hs.eg.db, EnsDb.Hsapiens.v79 and TxDb.Hsapiens.UCSC.hg38.knownGene), we see that there are strong differences: There seem to be either around 25000 to 28000 genes or around 60000. The only database with consistent numbers is HGNC. The reason for these divergent numbers could be due to how they classify a “gene”. Everything that is around 25000 could be because they consider protein coding genes only. Will we check if that’s indeed the case in a minute.



Overlap between databases

First, I want to know how well the database information from different packages overlaps. Venn diagrams show us the number of genes overlapping between the three databases.

library(gplots)

venn(list(orgDb = unique(ENTREZID_org), EnsDb = unique(ENTREZID_EnsDb), TxDb = unique(ENTREZID_TxDb)), 
     show.plot = TRUE, small = 0.7, showSetLogicLabel = FALSE, simplify = TRUE)
mtext("Entrez database", side=3, cex = 1.2)

venn(list(orgDb = unique(ENSEMBL_org), EnsDb = unique(ENSEMBL_EnsDb)), 
     show.plot = TRUE, small = 0.7, showSetLogicLabel = FALSE, simplify = TRUE)
mtext("Ensembl database", side=3, cex = 1.2)

While there is a reasonably large overlap between packages, there are still many genes which are either in one or the other package.

For gene names this is worst, but it might be due to changing names of genes (one gene often has several symbols which can be used, the preferred one often changing with new knowledge about the gene’s function).

For Ensembl and Entrez it seems like one of the databases simply contains more gene entries than the other(s), while the majority of genes from the smaller database are included.



Gene biotypes

Gene biotype annotation tells us the general category of a gene. The biggest category is protein coding genes. They allow us to check whether the genes in different databases and package consider different categories as “genes”.

Gene biotype information is only available for genes in EnsemblDb! Thus, the classification of genes in the other packages is biased in that only genes in EnsemblDb could be considered.

# gene dataframe for org.Hs.eg.db
gene_dataframe_orgDb <- AnnotationDbi::select(org.Hs.eg.db, keys=ENTREZID_org, columns=c("ENSEMBL", "SYMBOL"), keytype="ENTREZID")
colnames(gene_dataframe_orgDb) <- c("Entrez", "Ensembl", "HGNC")
colnames(gene_dataframe_orgDb) <- paste(colnames(gene_dataframe_orgDb), "orgDb", sep = "_")
# additional column with HGNC added for merging
gene_dataframe_orgDb$HGNC <- gene_dataframe_orgDb$HGNC_orgDb

# gene dataframe for EnsDb.Hsapiens.v79
gene_dataframe_EnsDb <- ensembldb::select(EnsDb.Hsapiens.v79, keys=ENSEMBL_EnsDb, 
                                          columns=c("ENTREZID", "SYMBOL", "GENEBIOTYPE"), keytype="GENEID")
colnames(gene_dataframe_EnsDb) <- c("Ensembl", "Entrez", "HGNC", "GENEBIOTYPE")
colnames(gene_dataframe_EnsDb) <- paste(colnames(gene_dataframe_EnsDb), "EnsDb", sep = "_")
# additional column with HGNC added for merging & keeping one copy of it in the merged dataframe
gene_dataframe_EnsDb$HGNC <- gene_dataframe_EnsDb$HGNC_EnsDb

# merging the two dataframes by HGNC
library(dplyr)
gene_dataframe <- full_join(gene_dataframe_EnsDb, gene_dataframe_orgDb, by = "HGNC")
gene_dataframe <- gene_dataframe[, -5]

# making a dataframe with additional column for merging & keeping one copy in the merged dataframe
ENTREZID_TxDb_df <- data.frame(ENTREZID = as.character(ENTREZID_TxDb), Entrez_TxDb = ENTREZID_TxDb)
gene_dataframe <- left_join(gene_dataframe, ENTREZID_TxDb_df, by = c("Entrez_orgDb" = "ENTREZID")) 
# calculating percentages of gene biotypes
for (i in 1:length(colnames(gene_dataframe))){
  key = colnames(gene_dataframe)[i]
  
  # remove duplicate and NA plots
  gene_dataframe_subs <- gene_dataframe[!duplicated(gene_dataframe[, key]), c(key, "GENEBIOTYPE_EnsDb")]
  gene_dataframe_subs <- gene_dataframe_subs[!is.na(gene_dataframe_subs[, key]), ]
  
  cat("\nKey:", key, "has", nrow(gene_dataframe_subs), "unique rows without NAs.\nOf these,", 
      sum(is.na(gene_dataframe_subs$GENEBIOTYPE_EnsDb)), "don't have a gene biotype annotation\n")
  
  if (i == 1){
    genebiotypes_table <- as.data.frame(table(gene_dataframe_subs$GENEBIOTYPE_EnsDb))
    genebiotypes_table <- rbind(genebiotypes_table,
                                      data.frame(Var1 = "NA", 
                                                 Freq = sum(is.na(gene_dataframe_subs$GENEBIOTYPE_EnsDb))))
    genebiotypes_table$percent <- round(genebiotypes_table$Freq / 
                                                sum(genebiotypes_table$Freq) * 100, digits = 4)
  } else {
      genebiotypes_table_pre <- as.data.frame(table(gene_dataframe_subs$GENEBIOTYPE_EnsDb))
      genebiotypes_table_pre <- rbind(genebiotypes_table_pre,
                                      data.frame(Var1 = "NA", 
                                                 Freq = sum(is.na(gene_dataframe_subs$GENEBIOTYPE_EnsDb))))
      genebiotypes_table_pre$percent <- round(genebiotypes_table_pre$Freq / 
                                                sum(genebiotypes_table_pre$Freq) * 100, digits = 4)
      genebiotypes_table <- full_join(genebiotypes_table, genebiotypes_table_pre, by = "Var1")
  }
}
## 
## Key: Ensembl_EnsDb has 65774 unique rows without NAs.
## Of these, 0 don't have a gene biotype annotation
## 
## Key: Entrez_EnsDb has 24263 unique rows without NAs.
## Of these, 0 don't have a gene biotype annotation
## 
## Key: HGNC_EnsDb has 59074 unique rows without NAs.
## Of these, 0 don't have a gene biotype annotation
## 
## Key: Entrez_orgDb has 60136 unique rows without NAs.
## Of these, 25740 don't have a gene biotype annotation
## 
## Key: Ensembl_orgDb has 28015 unique rows without NAs.
## Of these, 2456 don't have a gene biotype annotation
## 
## Key: HGNC_orgDb has 60083 unique rows without NAs.
## Of these, 25691 don't have a gene biotype annotation
## 
## Key: Entrez_TxDb has 25217 unique rows without NAs.
## Of these, 2087 don't have a gene biotype annotation
# create unique colnames
colnames(genebiotypes_table) <- make.unique(c("Key", rep(colnames(gene_dataframe), each = 2)))
genebiotypes_table <- genebiotypes_table[, -c(8, 9)]

# order by percentage of Ensembl EnsDb
genebiotypes_table <- genebiotypes_table[order(genebiotypes_table$Ensembl_EnsDb.1, decreasing = TRUE), ]


How many protein coding genes are in each database and annotation package?

Ensembl IDs from EnsDb.Hsapiens.v79 have the most protein coding genes, which is consistent with also having the highest number of annotations in general. The number of protein coding genes in the other databases/ packages is only slightly lower, however and fits well with the predicted 20000 to 25000.

genebiotypes_table_protein_coding <- genebiotypes_table[
  which(genebiotypes_table$Key %in% c("protein_coding", "NA")), ]

genebiotypes_table_count <- genebiotypes_table_protein_coding[1, c(1, grep(".1", colnames(genebiotypes_table_protein_coding)) - 1)]

library(tidyr)
genebiotypes_table_gather <- genebiotypes_table_count %>% 
  gather(Key, Count, Ensembl_EnsDb:Entrez_TxDb)
colnames(genebiotypes_table_gather)[2] <- "DB"

# plot
bp <- ggplot(genebiotypes_table_gather[, -1], aes(x = factor(DB), y = Count, label = Count)) + 
  geom_bar(stat = "identity", fill = "maroon") + my_theme()
bp + labs(title = "Number of protein coding genes in databases", y = "Count", x = "") +
  geom_text(size = 6, vjust = 1.5)

When looking at the percentage of protein coding genes among all gene annotations we can confirm that the database/ package combinations with lower gene numbers (Ensembl org.Db, Entrez EnsDb and Entrez TxDb) are mainly comprised of protein coding genes while databases/ packages with around 60000 gene entries consider other functional units as well.

genebiotypes_table_percent <- genebiotypes_table_protein_coding[, c(1, grep(".1", colnames(genebiotypes_table_protein_coding)))]

library(tidyr)
genebiotypes_table_gather <- genebiotypes_table_percent %>% 
  gather(Key, Percent, Ensembl_EnsDb.1:Entrez_TxDb.1)
colnames(genebiotypes_table_gather)[2] <- "DB"
genebiotypes_table_gather$DB <- gsub(".1", "", genebiotypes_table_gather$DB)

# plot
bp <- ggplot(genebiotypes_table_gather, aes(x = "", y = Percent, fill = Key)) + 
  geom_bar(width = 1, stat = "identity") + theme_minimal()

pie <- bp + coord_polar("y", start = 0) +
  ggtitle("Percentage of protein coding genes in gene databases") +
  theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.ticks = element_blank(),
  plot.title = element_text(size = 14, face = "bold"),
  legend.title = element_blank(),
  legend.position = "bottom",
  legend.text = element_text(size = 8)
  )
pie + facet_wrap(~DB, ncol = 4)


Percentages of gene biotypes

What are these other biotypes that most databases/ packages consider as genes? The second largest group following protein coding genes are processed pseudogenes, then come lincRNAs and other small RNAs.

genebiotypes_table_percent <- genebiotypes_table[, c(1, grep(".1", colnames(genebiotypes_table)))]
genebiotypes_table_percent$Key <- factor(genebiotypes_table_percent$Key, 
                                         levels = paste0(genebiotypes_table_percent$Key))

# gather dataframe for plotting
library(tidyr)
genebiotypes_table_gather <- genebiotypes_table_percent %>% 
  gather(Key, Percent, Ensembl_EnsDb.1:Entrez_TxDb.1)
colnames(genebiotypes_table_gather)[2] <- "DB"
genebiotypes_table_gather$DB <-