Preprocess Functions

This document contains the functions used to process data in the MOGDx pipeline.

Import Libraries

removeTraitNAs

Remove NAs from trait data

removeTraitNAs <- function(traitDF, otherDFs, trait) {
  rowsToKeep <- !is.na(traitDF[[trait]])
  traitDF <- traitDF[rowsToKeep, ]
  otherDFs <- lapply(otherDFs, function(df) {
    if (is.data.frame(df) || is.matrix(df)) {
      df[rowsToKeep, ]
    } else if (is.null(df)) {
      # For example, if foldID is NULL in cvTrait
      df
    } else {
      # Assumes df is a vector
      df[rowsToKeep]
    }
  })
  list(traitDF = traitDF, otherDFs = otherDFs)
}

cvTrait

Perform cross validated lasso regression on trait of interest

cvTrait <- function(trainMethyl, trainPhenotypes, trait, nFolds) {
  print(paste0('Removing rows with missing ', trait, ' from training data.'))
  trainRemoveNAResult <- removeTraitNAs(trainPhenotypes, list(trainMethyl = trainMethyl), trait)
  trainPhenotypes <- trainRemoveNAResult$traitDF
  trainMethyl <- trainRemoveNAResult$otherDFs$trainMethyl
  
  print('Fitting lasso model')
  methylModel <- cv.glmnet(x = trainMethyl,
                           y = as.factor(trainPhenotypes[[trait]]),
                           seed = 42,
                           family = 'multinomial',
                           type.measure = "class",
                           alpha = 1,
                           nFolds = nFolds,
                           parallel = TRUE,
                           trace.it = 1)
  print(methylModel)
  list(trait = trait, model = methylModel)
}

diff_expr

Perfrorm differential expression analysis and return differentially expressed genes

diff_expr <- function(count_mtx , datMeta , trait , n_genes , modality) {
  
  # Remove Genes with low level of expression -------------------------------
  to_keep = rowSums(count_mtx) > 0 #removed 1157 genes
  print(paste0('Removing ',length(to_keep) - sum(to_keep),' Genes with all 0'))
  
  count_mtx <- count_mtx[to_keep,]
  datExpr <- count_mtx
  
  if (modality != 'miRNA') {
    to_keep = filterByExpr(datExpr , group = datMeta[[trait]])
    
    print(paste0('keeping ',sum(to_keep) ,' genes'))
    print(paste0("Removing ",length(to_keep) - sum(to_keep)," Genes"))
    
    count_mtx = count_mtx[to_keep,]
  }
  
  # Remove Outliers ---------------------------------------------------------
  print('removing outliers')
  absadj = count_mtx %>% bicor %>% abs
  netsummary = fundamentalNetworkConcepts(absadj)
  ku = netsummary$Connectivity
  z.ku = (ku-mean(ku))/sqrt(var(ku))

  to_keep = z.ku > -2
  print(paste0("Keeping ",sum(to_keep)," Samples"))
  print(paste0("Removed ",length(to_keep) - sum(to_keep), " Samples"))

  count_mtx <- count_mtx[,to_keep] #removed 36
  datMeta <- datMeta[to_keep,]
  
  # Normalisation Using DESeq -----------------------------------------------
  plot_data = data.frame('ID'=rownames(count_mtx), 'Mean'=rowMeans(count_mtx), 'SD'=apply(count_mtx,1,sd))
  
  plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='black') +
    scale_x_log10() + scale_y_log10() + theme_minimal()  + theme(plot.title = element_text(hjust = 0.5)) 
  
  datMeta[[trait]] <- as.factor(datMeta[[trait]])
  dds = DESeqDataSetFromMatrix(countData = count_mtx, colData = datMeta , design = formula(paste("~ 0 +",trait)))
  
  print('performing DESeq')
  dds = DESeq(dds)
  
  # DEA Plots ---------------------------------------------------------------
  DE_info = results(dds)
  DESeq2::plotMA(DE_info, main= 'Original LFC values')
  
  # VST Transformation of Data ----------------------------------------------
  nsub_check = sum( rowMeans( counts(dds, normalized=TRUE)) > 5 )
  if (nsub_check < 1000) {
    vsd = vst(dds , nsub= nsub_check)
  } else {
    vsd = vst(dds)
  }
  
  datExpr_vst = assay(vsd)
  datMeta_vst = colData(vsd)
  
  meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal() + ylim(c(0,2))
  
  plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
  
  plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
    scale_x_log10() + scale_y_log10() + theme_minimal()
  
  subtypes <- levels(as.factor(datMeta[[trait]]))
  top_genes = c()
  for (subtype1 in subtypes[1:length(subtypes)-1]) {
    subtypes = subtypes[subtypes != subtype1]
    for (subtype2 in subtypes)  {
      if (subtype1 != subtype2) {
        res <- results(dds , contrast = c(trait , subtype1 , subtype2))
      }
      top_genes = unique(c(top_genes , head(order(res$padj) , n_genes) ))
    }
  }
  
  list(dds = dds , datExpr = datExpr_vst, datMeta = datMeta_vst , top_genes = top_genes)
}

make.knn.graph

Perform K Nearest Neighbour calculation and return igraph object

make.knn.graph<-function(D,k){
  # calculate euclidean distances between cells
  dist<-as.matrix(dist(D))
  # make a list of edges to k nearest neighbors for each cell
  edges <- mat.or.vec(0,2)
  for (i in 1:nrow(dist)){
    # find closes neighbours
    matches <- setdiff(order(dist[i,],decreasing = F)[1:(k+1)],i)
    if (length(matches) > k) {
      edges <- rbind(edges,cbind(rep(i,length(matches)),matches))
      #edges <- rbind(edges,cbind(matches,rep(i,length(matches))))
    } else {
      edges <- rbind(edges,cbind(rep(i,k),matches))
      #edges <- rbind(edges,cbind(matches,rep(i,k)))
    }
    # add edges in both directions
    
    #edges <- rbind(edges,cbind(matches,rep(i,k)))  
  }
  # create a graph from the edgelist
  graph <- graph_from_edgelist(edges,directed=F)
  V(graph)$frame.color <- NA
  # make a layout for visualizing in 2D
  set.seed(1)
  g.layout<-layout_with_fr(graph)
  return(list(graph=graph,layout=g.layout))        
}

expr.to.graph

Perform KNN graph generation, plot graph and save as long data frame

expr.to.graph<-function(datExpr , datMeta , trait , top_genes , modality){
  
  if (modality %in% c('mRNA' , 'miRNA')) {
    mat <- datExpr[top_genes, ]
  } else {
    mat <- t(datExpr[ , top_genes[[trait]]])
  }
  
  if (modality %in% c('mRNA' , 'miRNA' , 'DNAm' , 'RPPA' , 'CSF')) {
    mat <- mat - rowMeans(mat)
    corr_mat <- cor(mat, method="pearson")
  } else {
    corr_mat <- t(mat)
  }
  
  print(dim(mat))
  g <- make.knn.graph(corr_mat , 15)
  
  plot.igraph(g$graph,layout=g$layout, vertex.frame.color='black', vertex.color=as.factor(datMeta[[trait]]),
              vertex.size=5,vertex.label=NA, vertex.label.cex = 0.3 , main=modality )
  
  g <- g$graph
  g <- simplify(g, remove.multiple=TRUE, remove.loops=TRUE)
  
  # Remove any vertices remaining that have no edges
  g <- delete.vertices(g, degree(g)==0)
  
  # Assign names to the graph vertices 
  V(g)$name <- rownames(datMeta)
  V(g)$class <- as.character(datMeta[[trait]])
  V(g)$color <- as.numeric(as.factor(V(g)$class))
  V(g)$vertex.frame.color <- "black"
  
  return(as_long_data_frame(g))
}

snf.to.graph

Perform KNN on SNF adjacenecy matrix and return igraph object

snf.to.graph <- function(W , datMeta , trait , idx , sub_mod_list) {
  
  g <- make.knn.graph(W , 15)
  
  plot.igraph(g$graph,layout=g$layout, vertex.frame.color='black', vertex.color=as.numeric(as.factor(datMeta[idx,][[trait]])),
              vertex.size=5,vertex.label=NA,main=paste0(sub_mod_list , collapse = '_'))
  
  g <- g$graph
  g <- simplify(g, remove.multiple=TRUE, remove.loops=TRUE)
  
  # Remove any vertices remaining that have no edges
  g <- delete.vertices(g, degree(g)==0)
  
  # Assign names to the graph vertices 
  V(g)$name <- rownames(datMeta[idx,])
  V(g)$class <- as.character(datMeta[idx,][[trait]])
  V(g)$color <- as.numeric(as.factor(V(g)$class))
  V(g)$vertex.frame.color <- "black"
  
  return(g)
}