Source code for clubcpg_prelim.PReLIM

from __future__ import print_function

"""
Author: 
Jack Duryea
Waterland Lab
Computational Epigenetics Section
Baylor College of Medicine

Created April 2018

Updated April 11 2019: use random forests as model


PReLIM: Precise Read-Level Imputation of Methylation

PReLIM imputes missing CpG methylation
states in CpG matrices.

"""

# standard imports
import numpy as np
from tqdm import tqdm
import copy
import time

from collections import defaultdict
import random

# sklearn imports
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Pickle
try:
	import cPickle as p
except ImportError:
	import pickle as p


# TODO: most of these fields are redundant in our application
class CpGBin():
	"""
	Constructor for a bin
	"""
	def __init__(self,
			matrix,  # relative_positions
			binStartInc=None,
			binEndInc=None,
			cpgPositions=None,
			sequence="",
			encoding=None,
			missingToken=-1,
			chromosome=None,
			binSize=100,
			species="MM10",
			verbose=True,
			tag1=None,
			tag2=None):
		"""
		:param matrix: numpy array, the bin's CpG matrix.
		:param binStartInc: integer, the starting, inclusive, chromosomal index of the bin.
		:param binEndInc: integer, the ending, inclusive, chromosomal index of the bin.
		:param cpgPositions: array of integers, the chromosomal positions of the CpGs in the bin.
		:param sequence: string, nucleotide sequence (A,C,G,T)
		:param encoding: array, a reduced representation of the bin's CpG matrix
		:param missingToken: integer, the token that represents missing data in the matrix.
		:param chromosome: string, the chromosome this bin resides in.
		:param binSize: integer, the number of base pairs this bin covers
		:param species: string, the species this bin belongs to.
		:param verbose: boolean, print warnings; set to False for no error checking and faster speed
		:param tag1: anything, for custom use.
		:param tag2: anything, for custom use.
		"""
		self.cpgDensity = matrix.shape[1]
		self.readDepth = matrix.shape[0]
		self.matrix = np.array(matrix, dtype=float)
		self.binStartInc = binStartInc
		self.binEndInc = binEndInc
		self.cpgPositions = cpgPositions
		self.sequence = sequence
		self.missingToken = missingToken
		self.chromosome = chromosome
		self.binSize = binSize
		self.species = species
		self.tag1 = tag1
		self.tag2 = tag2
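
# Example (illustrative sketch, not part of the library): wrapping a
# 3-read x 2-CpG matrix, where 1 = methylated, 0 = unmethylated, and
# -1 = missing; readDepth and cpgDensity are derived from the shape:
#
#   bin_ = CpGBin(matrix=np.array([[ 1,  0],
#                                  [ 1, -1],
#                                  [ 0,  0]], dtype=float))
#   bin_.readDepth   # 3
#   bin_.cpgDensity  # 2
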
class PReLIM():
	"""
	PReLIM imputation class to handle training and predicting from models.
	"""
	def __init__(self, cpgDensity=2):
		self.model = None
		self.cpgDensity = cpgDensity
		self.METHYLATED = 1
		self.UNMETHYLATED = 0
		self.MISSING = -1
		self.methylated = 1
		self.unmethylated = 0
		self.unknown = -1
	# Train a model
	def train(self, bin_matrices, model_file="no", verbose=False):
		"""
		bin_matrices: list of cpg matrices

		model_file, string, the name of the file to save the model to.
			If None, create a file name that includes a timestamp.
			If you don't want to save a file, set this to "no".
		"""
		X, y = self.get_X_y(bin_matrices, model_file=model_file, verbose=False)

		# Train the random forest model
		self.fit(X, y, model_file=model_file, verbose=verbose)
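
	# Usage sketch (assumption: `mats` is a hypothetical list of 2-CpG
	# matrices, some fully observed to provide ground truth and some
	# containing -1s to supply realistic missingness masks):
	#
	#   model = PReLIM(cpgDensity=2)
	#   model.train(mats, model_file="no")  # "no" skips pickling to disk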
	def fit(self,
			X_train,
			y_train,
			n_estimators=[10, 50, 100, 500, 1000],
			cores=-1,
			max_depths=[1, 5, 10, 20, 30],
			model_file=None,
			verbose=False):
		"""
		Inputs:
		1. X_train, numpy array, contains feature vectors.
		2. y_train, numpy array, contains labels for training data.
		3. n_estimators, list, the numbers of estimators to try during the grid search.
		4. max_depths, list, the maximum tree depths to try during the grid search.
		5. cores, the number of cores to use during training, helpful for the grid search.
		6. model_file, string, the name of the file to save the model to. If None,
			create a file name that includes a timestamp. If you don't want to
			save a file, set this to "no".

		5-fold cross-validation is built into the grid search.

		Outputs:
		The trained model

		Usage:
		model.fit(X_train, y_train)
		"""
		grid_param = {
			"n_estimators": n_estimators,
			"max_depth": max_depths,
		}

		# Note: let the grid search use a lot of cores, but only use 1 for each
		# forest, since dispatching can take a lot of time
		rf = RandomForestClassifier(n_jobs=1)
		self.model = GridSearchCV(rf, grid_param, n_jobs=cores, cv=5, verbose=verbose)

		if len(X_train) > 0:
			self.model.fit(X_train, y_train)

			# save the model
			if model_file == "no":
				return self.model
			if not model_file:
				model_file = "PReLIM_model" + str(time.time())
			p.dump(self.model, open(model_file, "wb"))
			return self.model
		else:
			self.model = None
			return self.model
	# Feature collection directly from bins
	def get_X_y(self, bin_matrices, model_file=None, verbose=False):
		bins = []

		# convert to bin objects for ease of use
		for matrix in bin_matrices:
			mybin = CpGBin(matrix=matrix)
			bins.append(mybin)

		# find bins with no missing data
		complete_bins = _filter_missing_data(bins)
		random.shuffle(complete_bins)

		# apply masks
		masked_bins = _apply_masks(complete_bins, bins)

		# extract features
		X, y = self._collectFeatures(masked_bins)
		return X, y
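
	# Worked example of the masking step above (illustrative values), using
	# _apply_masks defined at module level below: given a fully observed truth
	# matrix and a mask borrowed from real data (-1 = missing, 2 = known),
	# np.minimum(truth, mask) hides exactly the masked cells:
	#
	#   truth = [[1, 0],    mask = [[2, -1],    np.minimum(truth, mask)
	#            [0, 1]]            [2,  2]]      -> [[1, -1],
	#                                                 [0,  1]]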
	# Return a vector of predicted classes
	def predict_classes(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted class labels

		Usage:
		y_pred = model.predict_classes(X)
		"""
		return self.model.predict(X)
	# Return a vector of probabilities for methylation
	def predict(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted methylation probabilities

		Usage:
		y_pred = model.predict(X)
		"""
		return self.model.predict_proba(X)[:, 1]
	def predict_proba(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted methylation probabilities

		Usage:
		y_pred = model.predict_proba(X)
		"""
		# index [:, 1] selects the probability of the methylated class
		return self.model.predict_proba(X)[:, 1]
	# Load a saved model
	def loadWeights(self, model_file):
		"""
		Inputs:
		1. model_file, string, name of a file containing a saved model

		Outputs:
		None

		Effects:
		self.model is loaded from the provided file
		"""
		self.model = p.load(open(model_file, "rb"))
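
	# Usage sketch (the file name below is hypothetical; fit() generates a
	# timestamped name of the form "PReLIM_model<time>" when model_file is None):
	#
	#   model = PReLIM(cpgDensity=2)
	#   model.loadWeights("PReLIM_model1554998400.0")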
	def _get_imputation_features(self, matrix):
		"""
		Returns the features needed for the imputation of this matrix.

		Inputs:
		1. matrix, a 2d np array, dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A 2d numpy array with one feature vector per missing state in the matrix
		"""
		X = []
		numReads = matrix.shape[0]
		density = matrix.shape[1]

		# row and column means, ignoring missing values
		nan_copy = np.copy(matrix)
		nan_copy[nan_copy == -1] = np.nan
		column_means = np.nanmean(nan_copy, axis=0)
		row_means = np.nanmean(nan_copy, axis=1)

		encoding = self._encode_input_matrix(matrix)[0]

		for i in range(numReads):
			for j in range(density):
				observed_state = matrix[i, j]
				if observed_state != -1:
					continue
				row_mean = row_means[i]
				col_mean = column_means[j]
				row = np.copy(matrix[i])
				row[j] = -1
				data = [row_mean] + [col_mean] + [i, j] + list(row) + list(encoding)
				X.append(data)

		X = np.array(X)
		return X
	# Imputes missing values in bins
	def impute(self, matrix):
		"""
		Inputs:
		1. matrix, a 2d np array, dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A 2d numpy array with predicted probabilities of methylation
		"""
		X = self._get_imputation_features(matrix)
		if len(X) == 0:  # nothing to impute
			return matrix

		predictions = self.predict(X)

		k = 0  # keep track of prediction index for missing states
		predicted_matrix = np.copy(matrix)
		for i in range(predicted_matrix.shape[0]):
			for j in range(predicted_matrix.shape[1]):
				if predicted_matrix[i, j] == -1:
					predicted_matrix[i, j] = predictions[k]
					k += 1
		return predicted_matrix
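
	# Example (illustrative): imputing a single 2-CpG matrix,
	#
	#   model.impute(np.array([[ 1, -1],
	#                          [ 0,  1]], dtype=float))
	#
	# returns a copy in which the -1 entry is replaced by the model's
	# predicted probability of methylation; observed 0/1 entries are untouched.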
	def impute_many(self, matrices):
		"""
		Imputes many matrices at the same time to speed up imputation.

		Inputs:
		1. matrices: array-like (i.e. list), where each element is a 2d np array,
			dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A list of 2d numpy arrays with predicted probabilities of methylation
			for unknown values.
		"""
		# Extract all features for all matrices so we can predict in bulk;
		# this is where the speedup comes from.
		X = np.array([features
				for matrix_features in [self._get_imputation_features(matrix) for matrix in matrices]
				for features in matrix_features])

		if len(X) == 0:
			return matrices

		predictions = self.predict(X)

		predicted_matrices = []

		# TODO: lots of for-loops here, could be sped up?
		k = 0  # keep track of prediction index for missing states, order is crucial!
		for matrix in matrices:
			predicted_matrix = np.copy(matrix)
			for i in range(predicted_matrix.shape[0]):
				for j in range(predicted_matrix.shape[1]):
					if predicted_matrix[i, j] == -1:
						predicted_matrix[i, j] = predictions[k]
						k += 1
			predicted_matrices.append(predicted_matrix)
		return predicted_matrices
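
	# Usage sketch (m1, m2 are hypothetical CpG matrices): the imputed copies
	# come back in the same order as the inputs,
	#
	#   imputed_1, imputed_2 = model.impute_many([m1, m2])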
	### Helper functions, for private use only ###

	# Returns a vector encoding of a CpG matrix
	def _encode_input_matrix(self, m):
		matrix = np.copy(m)
		n_cpgs = matrix.shape[1]
		matrix += 1  # deal with -1s

		# each read maps to a base-3 number over the shifted states {0, 1, 2}
		base_3_vec = np.power(3, np.arange(n_cpgs - 1, -1, -1))
		encodings = np.dot(base_3_vec, matrix.T)

		# histogram over all 3^n possible read patterns
		encoded_vector_dim = np.power(3, n_cpgs)
		encoded_vector = np.zeros(encoded_vector_dim)
		for x in encodings:
			encoded_vector[int(x)] += 1

		num_reads = encodings.shape[0]

		# now we normalize
		encoded_vector_norm = normalize([encoded_vector], norm="l1")
		return encoded_vector_norm[0], num_reads

	# finds the mean methylation of the given column, discounting the current cpg
	def _get_column_mean(self, matrix, col_i, current_cpg_state):
		sub = matrix[:, col_i]
		return self._get_mean(sub, current_cpg_state)

	# finds the mean methylation of the given read, discounting the current cpg
	def _get_read_mean(self, matrix, read_i, current_cpg_state):
		sub = matrix[read_i, :]
		return self._get_mean(sub, current_cpg_state)

	def _get_mean(self, sub_matrix, current_cpg_state):
		num_methy = np.count_nonzero(sub_matrix == self.METHYLATED)
		num_unmethy = np.count_nonzero(sub_matrix == self.UNMETHYLATED)

		if current_cpg_state == self.METHYLATED:
			num_methy -= 1
			num_methy = max(0, num_methy)
		if current_cpg_state == self.UNMETHYLATED:
			num_unmethy -= 1
			num_unmethy = max(0, num_unmethy)

		if float(num_methy + num_unmethy) == 0:
			return -2
		return float(num_methy) / float(num_methy + num_unmethy)

	# Returns X, y
	# note: y can contain the labels 1, 0, -1
	def _collectFeatures(self, bins):
		X = []
		Y = []
		for Bin in tqdm(bins):
			observed_matrix = Bin.tag2["observed"]
			truth_matrix = Bin.tag2["truth"]

			encoding = self._encode_input_matrix(observed_matrix)[0]

			numReads = observed_matrix.shape[0]
			density = observed_matrix.shape[1]
			#positions = Bin.cpgPositions

			nan_copy = np.copy(observed_matrix)
			nan_copy[nan_copy == -1] = np.nan
			column_means = np.nanmean(nan_copy, axis=0)
			row_means = np.nanmean(nan_copy, axis=1)

			for i in range(numReads):
				for j in range(density):
					observed_state = observed_matrix[i, j]
					if observed_state != -1:
						continue
					state = truth_matrix[i, j]
					Y.append(state)
					row_mean = row_means[i]
					col_mean = column_means[j]
					# j is the current index in the row
					# encoding is the matrix encoding vector
					row = np.copy(observed_matrix[i])
					row[j] = -1
					data = [row_mean] + [col_mean] + [i, j] + list(row) + list(encoding)
					X.append(data)

		X = np.array(X)
		Y = np.array(Y)
		Y = Y.astype(int)  # astype returns a copy, so reassign
		return X, Y
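
# Worked example of the base-3 encoding in _encode_input_matrix above
# (illustrative): for a 2-CpG matrix, states are shifted by +1 so that
# {-1, 0, 1} -> {0, 1, 2}; a read [1, 0] becomes [2, 1] and is scored as
# 2*3 + 1 = 7, incrementing cell 7 of a 3^2 = 9-cell histogram, which is
# then L1-normalized over all reads in the matrix.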
# Helper functions

# returns a list of bins similar to the input,
# but with matrix rows containing missing values removed
def _filter_bad_reads(bins):
	filtered_bins = []
	for Bin in bins:
		newBin = copy.deepcopy(Bin)
		matrix = newBin.matrix

		# find rows with missing values
		counts = np.count_nonzero(matrix == -1, axis=1)
		idx = counts == 0
		matrix_filtered = matrix[idx]
		newBin.matrix = matrix_filtered
		filtered_bins.append(newBin)
	return filtered_bins


# returns a mapping of matrix dimensions to lists of masks that can be
# used on data of that size.
# the missing pattern is in matrix form: -1 is missing, 2 is known
def _extract_masks(bins):
	masks = defaultdict(lambda: [])
	for Bin in tqdm(bins):
		matrix = np.copy(Bin.matrix)
		matrix[matrix >= 0] = 2

		min_missing = 1  # must have at least 1 missing value
		if np.count_nonzero(matrix == -1) >= min_missing:
			masks[matrix.shape].append(matrix)
	return masks


def _apply_masks(filtered_bins, all_bins):
	masks = _extract_masks(all_bins)
	ready_bins = []

	for Bin in filtered_bins:
		truth_matrix = Bin.matrix
		m_shape = truth_matrix.shape
		if m_shape in masks and len(masks[m_shape]) > 0:
			mask = random.choice(masks[m_shape])

			# -1 (missing) in the mask overwrites the known 0/1 states
			observed = np.minimum(truth_matrix, mask)
			Bin.tag2 = {"truth": truth_matrix, "observed": observed, "mask": mask}
			ready_bins.append(Bin)
	return ready_bins


# get a set of bins with no missing data
def _filter_missing_data(bins, min_read_depth=1):
	cpg_bins_complete = _filter_bad_reads(bins)

	# secondary depth filter
	cpg_bins_complete_depth = [bin_ for bin_ in cpg_bins_complete
			if bin_.matrix.shape[0] >= min_read_depth]
	return cpg_bins_complete_depth
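

if __name__ == "__main__":
	# Minimal end-to-end sketch (an illustrative demo, not part of the
	# library): train on synthetic 2-CpG matrices, then impute a query matrix.
	# All sizes, counts, and random data below are assumptions, and the default
	# grid search in fit() can take a few minutes.
	rng = np.random.RandomState(42)

	# fully observed matrices supply the ground-truth labels...
	complete = [rng.randint(0, 2, size=(10, 2)).astype(float) for _ in range(200)]

	# ...while matrices containing -1s supply realistic missingness masks
	masked = []
	for _ in range(50):
		m = rng.randint(0, 2, size=(10, 2)).astype(float)
		m[rng.rand(10, 2) < 0.2] = -1
		masked.append(m)

	model = PReLIM(cpgDensity=2)
	model.train(complete + masked, model_file="no")  # "no" skips saving to disk

	query = np.array([[ 1, -1],
	                  [ 0,  1],
	                  [-1,  0]], dtype=float)
	print(model.impute(query))  # -1 entries become methylation probabilities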