Source code for clubcpg_prelim.PReLIM

from __future__ import print_function

"""
Author: 
Jack Duryea
Waterland Lab
Computational Epigenetics Section
Baylor College of Medicine

Created April 2018

Updated April 11 2019: use random forests as model


PReLIM: Precise Read-Level Imputation of Methylation

PReLIM imputes missing CpG methylation
states in CpG matrices.

"""

# standard imports
import numpy as np
from tqdm import tqdm
import copy
import time

from collections import defaultdict
import random

# sklearn imports
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Pickle
try:
	import cPickle as p
except ImportError:
	import pickle as p


# TODO: most of these fields are redundant in our application
class CpGBin():
	"""
	Constructor for a bin
	"""
	def __init__(self,
			matrix,  # relative_positions
			binStartInc=None,
			binEndInc=None,
			cpgPositions=None,
			sequence="",
			encoding=None,
			missingToken=-1,
			chromosome=None,
			binSize=100,
			species="MM10",
			verbose=True,
			tag1=None,
			tag2=None):
		"""
		:param matrix: numpy array, the bin's CpG matrix.
		:param binStartInc: integer, the starting, inclusive, chromosomal index of the bin.
		:param binEndInc: integer, the ending, inclusive, chromosomal index of the bin.
		:param cpgPositions: array of integers, the chromosomal positions of the CpGs in the bin.
		:param sequence: string, nucleotide sequence (A,C,G,T)
		:param encoding: array, a reduced representation of the bin's CpG matrix
		:param missingToken: integer, the token that represents missing data in the matrix.
		:param chromosome: string, the chromosome this bin resides in.
		:param binSize: integer, the number of base pairs this bin covers
		:param species: string, the species this bin belongs to.
		:param verbose: boolean, print warnings; set to False for no error checking and faster speed
		:param tag1: anything, for custom use.
		:param tag2: anything, for custom use.
		"""
		self.cpgDensity = matrix.shape[1]
		self.readDepth = matrix.shape[0]
		self.matrix = np.array(matrix, dtype=float)
		self.binStartInc = binStartInc
		self.binEndInc = binEndInc
		self.cpgPositions = cpgPositions
		self.sequence = sequence
		self.missingToken = missingToken
		self.chromosome = chromosome
		self.binSize = binSize
		self.species = species
		self.tag1 = tag1
		self.tag2 = tag2
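
# Example (illustrative sketch, not part of the library): wrapping a
# 3-read x 2-CpG matrix, where 1 = methylated, 0 = unmethylated, and
# -1 = missing; readDepth and cpgDensity are derived from the shape:
#
#   bin_ = CpGBin(matrix=np.array([[ 1,  0],
#                                  [ 1, -1],
#                                  [ 0,  0]], dtype=float))
#   bin_.readDepth   # 3
#   bin_.cpgDensity  # 2
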
class PReLIM():
	"""
	PReLIM imputation class to handle training and predicting from models.
	"""
	def __init__(self, cpgDensity=2):
		self.model = None
		self.cpgDensity = cpgDensity
		self.METHYLATED = 1
		self.UNMETHYLATED = 0
		self.MISSING = -1
		self.methylated = 1
		self.unmethylated = 0
		self.unknown = -1
	# Train a model
	def train(self, bin_matrices, model_file="no", verbose=False):
		"""
		bin_matrices: list of cpg matrices

		model_file, string, the name of the file to save the model to.
			If None, create a file name that includes a timestamp.
			If you don't want to save a file, set this to "no".
		"""
		X, y = self.get_X_y(bin_matrices, model_file=model_file, verbose=False)

		# Train the random forest model
		self.fit(X, y, model_file=model_file, verbose=verbose)
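
	# Usage sketch (assumption: `mats` is a hypothetical list of 2-CpG
	# matrices, some fully observed to provide ground truth and some
	# containing -1s to supply realistic missingness masks):
	#
	#   model = PReLIM(cpgDensity=2)
	#   model.train(mats, model_file="no")  # "no" skips pickling to disk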
	def fit(self,
			X_train,
			y_train,
			n_estimators=[10, 50, 100, 500, 1000],
			cores=-1,
			max_depths=[1, 5, 10, 20, 30],
			model_file=None,
			verbose=False):
		"""
		Inputs:
		1. X_train, numpy array, contains feature vectors.
		2. y_train, numpy array, contains labels for training data.
		3. n_estimators, list, the numbers of estimators to try during the grid search.
		4. max_depths, list, the maximum tree depths to try during the grid search.
		5. cores, the number of cores to use during training, helpful for the grid search.
		6. model_file, string, the name of the file to save the model to. If None,
			create a file name that includes a timestamp. If you don't want to
			save a file, set this to "no".

		5-fold cross-validation is built into the grid search.

		Outputs:
		The trained model

		Usage:
		model.fit(X_train, y_train)
		"""
		grid_param = {
			"n_estimators": n_estimators,
			"max_depth": max_depths,
		}

		# Note: let the grid search use a lot of cores, but only use 1 for each
		# forest, since dispatching can take a lot of time
		rf = RandomForestClassifier(n_jobs=1)
		self.model = GridSearchCV(rf, grid_param, n_jobs=cores, cv=5, verbose=verbose)

		if len(X_train) > 0:
			self.model.fit(X_train, y_train)

			# save the model
			if model_file == "no":
				return self.model
			if not model_file:
				model_file = "PReLIM_model" + str(time.time())
			p.dump(self.model, open(model_file, "wb"))
			return self.model
		else:
			self.model = None
			return self.model
	# Feature collection directly from bins
	def get_X_y(self, bin_matrices, model_file=None, verbose=False):
		bins = []

		# convert to bin objects for ease of use
		for matrix in bin_matrices:
			mybin = CpGBin(matrix=matrix)
			bins.append(mybin)

		# find bins with no missing data
		complete_bins = _filter_missing_data(bins)
		random.shuffle(complete_bins)

		# apply masks
		masked_bins = _apply_masks(complete_bins, bins)

		# extract features
		X, y = self._collectFeatures(masked_bins)
		return X, y
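
	# Worked example of the masking step above (illustrative values), using
	# _apply_masks defined at module level below: given a fully observed truth
	# matrix and a mask borrowed from real data (-1 = missing, 2 = known),
	# np.minimum(truth, mask) hides exactly the masked cells:
	#
	#   truth = [[1, 0],    mask = [[2, -1],    np.minimum(truth, mask)
	#            [0, 1]]            [2,  2]]      -> [[1, -1],
	#                                                 [0,  1]]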
	# Return a vector of predicted classes
	def predict_classes(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted class labels

		Usage:
		y_pred = model.predict_classes(X)
		"""
		return self.model.predict(X)
	# Return a vector of probabilities for methylation
	def predict(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted methylation probabilities

		Usage:
		y_pred = model.predict(X)
		"""
		return self.model.predict_proba(X)[:, 1]
	def predict_proba(self, X):
		"""
		Inputs:
		1. X, numpy array, contains feature vectors

		Outputs:
		1. 1-d numpy array of predicted methylation probabilities

		Usage:
		y_pred = model.predict_proba(X)
		"""
		# index [:, 1] selects the probability of the methylated class
		return self.model.predict_proba(X)[:, 1]
	# Load a saved model
	def loadWeights(self, model_file):
		"""
		Inputs:
		1. model_file, string, name of a file containing a saved model

		Outputs:
		None

		Effects:
		self.model is loaded from the provided file
		"""
		self.model = p.load(open(model_file, "rb"))
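
	# Usage sketch (the file name below is hypothetical; fit() generates a
	# timestamped name of the form "PReLIM_model<time>" when model_file is None):
	#
	#   model = PReLIM(cpgDensity=2)
	#   model.loadWeights("PReLIM_model1554998400.0")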
	def _get_imputation_features(self, matrix):
		"""
		Returns the features needed for the imputation of this matrix.

		Inputs:
		1. matrix, a 2d np array, dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A 2d numpy array with one feature vector per missing state in the matrix
		"""
		X = []
		numReads = matrix.shape[0]
		density = matrix.shape[1]

		# row and column means, ignoring missing values
		nan_copy = np.copy(matrix)
		nan_copy[nan_copy == -1] = np.nan
		column_means = np.nanmean(nan_copy, axis=0)
		row_means = np.nanmean(nan_copy, axis=1)

		encoding = self._encode_input_matrix(matrix)[0]

		for i in range(numReads):
			for j in range(density):
				observed_state = matrix[i, j]
				if observed_state != -1:
					continue
				row_mean = row_means[i]
				col_mean = column_means[j]
				row = np.copy(matrix[i])
				row[j] = -1
				data = [row_mean] + [col_mean] + [i, j] + list(row) + list(encoding)
				X.append(data)

		X = np.array(X)
		return X
	# Imputes missing values in bins
	def impute(self, matrix):
		"""
		Inputs:
		1. matrix, a 2d np array, dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A 2d numpy array with predicted probabilities of methylation
		"""
		X = self._get_imputation_features(matrix)
		if len(X) == 0:  # nothing to impute
			return matrix

		predictions = self.predict(X)

		k = 0  # keep track of prediction index for missing states
		predicted_matrix = np.copy(matrix)
		for i in range(predicted_matrix.shape[0]):
			for j in range(predicted_matrix.shape[1]):
				if predicted_matrix[i, j] == -1:
					predicted_matrix[i, j] = predictions[k]
					k += 1
		return predicted_matrix
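
	# Example (illustrative): imputing a single 2-CpG matrix,
	#
	#   model.impute(np.array([[ 1, -1],
	#                          [ 0,  1]], dtype=float))
	#
	# returns a copy in which the -1 entry is replaced by the model's
	# predicted probability of methylation; observed 0/1 entries are untouched.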
	def impute_many(self, matrices):
		"""
		Imputes many matrices at the same time to speed up imputation.

		Inputs:
		1. matrices: array-like (i.e. list), where each element is a 2d np array,
			dtype=float, representing a CpG matrix,
			1 = methylated, 0 = unmethylated, -1 = unknown

		Outputs:
		1. A list of 2d numpy arrays with predicted probabilities of methylation
			for unknown values.
		"""
		# Extract all features for all matrices so we can predict in bulk;
		# this is where the speedup comes from.
		X = np.array([features
				for matrix_features in [self._get_imputation_features(matrix) for matrix in matrices]
				for features in matrix_features])

		if len(X) == 0:
			return matrices

		predictions = self.predict(X)

		predicted_matrices = []

		# TODO: lots of for-loops here, could be sped up?
		k = 0  # keep track of prediction index for missing states, order is crucial!
		for matrix in matrices:
			predicted_matrix = np.copy(matrix)
			for i in range(predicted_matrix.shape[0]):
				for j in range(predicted_matrix.shape[1]):
					if predicted_matrix[i, j] == -1:
						predicted_matrix[i, j] = predictions[k]
						k += 1
			predicted_matrices.append(predicted_matrix)
		return predicted_matrices
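
	# Usage sketch (m1, m2 are hypothetical CpG matrices): the imputed copies
	# come back in the same order as the inputs,
	#
	#   imputed_1, imputed_2 = model.impute_many([m1, m2])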
	### Helper functions, for private use only ###

	# Returns a vector encoding of a CpG matrix
	def _encode_input_matrix(self, m):
		matrix = np.copy(m)
		n_cpgs = matrix.shape[1]
		matrix += 1  # deal with -1s

		# each read maps to a base-3 number over the shifted states {0, 1, 2}
		base_3_vec = np.power(3, np.arange(n_cpgs - 1, -1, -1))
		encodings = np.dot(base_3_vec, matrix.T)

		# histogram over all 3^n possible read patterns
		encoded_vector_dim = np.power(3, n_cpgs)
		encoded_vector = np.zeros(encoded_vector_dim)
		for x in encodings:
			encoded_vector[int(x)] += 1

		num_reads = encodings.shape[0]

		# now we normalize
		encoded_vector_norm = normalize([encoded_vector], norm="l1")
		return encoded_vector_norm[0], num_reads

	# finds the mean methylation of the given column, discounting the current cpg
	def _get_column_mean(self, matrix, col_i, current_cpg_state):
		sub = matrix[:, col_i]
		return self._get_mean(sub, current_cpg_state)

	# finds the mean methylation of the given read, discounting the current cpg
	def _get_read_mean(self, matrix, read_i, current_cpg_state):
		sub = matrix[read_i, :]
		return self._get_mean(sub, current_cpg_state)

	def _get_mean(self, sub_matrix, current_cpg_state):
		num_methy = np.count_nonzero(sub_matrix == self.METHYLATED)
		num_unmethy = np.count_nonzero(sub_matrix == self.UNMETHYLATED)

		if current_cpg_state == self.METHYLATED:
			num_methy -= 1
			num_methy = max(0, num_methy)
		if current_cpg_state == self.UNMETHYLATED:
			num_unmethy -= 1
			num_unmethy = max(0, num_unmethy)

		if float(num_methy + num_unmethy) == 0:
			return -2
		return float(num_methy) / float(num_methy + num_unmethy)

	# Returns X, y
	# note: y can contain the labels 1, 0, -1
	def _collectFeatures(self, bins):
		X = []
		Y = []
		for Bin in tqdm(bins):
			observed_matrix = Bin.tag2["observed"]
			truth_matrix = Bin.tag2["truth"]

			encoding = self._encode_input_matrix(observed_matrix)[0]

			numReads = observed_matrix.shape[0]
			density = observed_matrix.shape[1]
			#positions = Bin.cpgPositions

			nan_copy = np.copy(observed_matrix)
			nan_copy[nan_copy == -1] = np.nan
			column_means = np.nanmean(nan_copy, axis=0)
			row_means = np.nanmean(nan_copy, axis=1)

			for i in range(numReads):
				for j in range(density):
					observed_state = observed_matrix[i, j]
					if observed_state != -1:
						continue
					state = truth_matrix[i, j]
					Y.append(state)
					row_mean = row_means[i]
					col_mean = column_means[j]
					# j is the current index in the row
					# encoding is the matrix encoding vector
					row = np.copy(observed_matrix[i])
					row[j] = -1
					data = [row_mean] + [col_mean] + [i, j] + list(row) + list(encoding)
					X.append(data)

		X = np.array(X)
		Y = np.array(Y)
		Y = Y.astype(int)  # astype returns a copy, so reassign
		return X, Y
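
# Worked example of the base-3 encoding in _encode_input_matrix above
# (illustrative): for a 2-CpG matrix, states are shifted by +1 so that
# {-1, 0, 1} -> {0, 1, 2}; a read [1, 0] becomes [2, 1] and is scored as
# 2*3 + 1 = 7, incrementing cell 7 of a 3^2 = 9-cell histogram, which is
# then L1-normalized over all reads in the matrix.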
# Helper functions

# returns a list of bins similar to the input,
# but with matrix rows containing missing values removed
def _filter_bad_reads(bins):
	filtered_bins = []
	for Bin in bins:
		newBin = copy.deepcopy(Bin)
		matrix = newBin.matrix

		# find rows with missing values
		counts = np.count_nonzero(matrix == -1, axis=1)
		idx = counts == 0
		matrix_filtered = matrix[idx]
		newBin.matrix = matrix_filtered
		filtered_bins.append(newBin)
	return filtered_bins


# returns a mapping of matrix dimensions to lists of masks that can be
# used on data of that size.
# the missing pattern is in matrix form: -1 is missing, 2 is known
def _extract_masks(bins):
	masks = defaultdict(lambda: [])
	for Bin in tqdm(bins):
		matrix = np.copy(Bin.matrix)
		matrix[matrix >= 0] = 2

		min_missing = 1  # must have at least 1 missing value
		if np.count_nonzero(matrix == -1) >= min_missing:
			masks[matrix.shape].append(matrix)
	return masks


def _apply_masks(filtered_bins, all_bins):
	masks = _extract_masks(all_bins)
	ready_bins = []

	for Bin in filtered_bins:
		truth_matrix = Bin.matrix
		m_shape = truth_matrix.shape
		if m_shape in masks and len(masks[m_shape]) > 0:
			mask = random.choice(masks[m_shape])

			# -1 (missing) in the mask overwrites the known 0/1 states
			observed = np.minimum(truth_matrix, mask)
			Bin.tag2 = {"truth": truth_matrix, "observed": observed, "mask": mask}
			ready_bins.append(Bin)
	return ready_bins


# get a set of bins with no missing data
def _filter_missing_data(bins, min_read_depth=1):
	cpg_bins_complete = _filter_bad_reads(bins)

	# secondary depth filter
	cpg_bins_complete_depth = [bin_ for bin_ in cpg_bins_complete
			if bin_.matrix.shape[0] >= min_read_depth]
	return cpg_bins_complete_depth
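

if __name__ == "__main__":
	# Minimal end-to-end sketch (an illustrative demo, not part of the
	# library): train on synthetic 2-CpG matrices, then impute a query matrix.
	# All sizes, counts, and random data below are assumptions, and the default
	# grid search in fit() can take a few minutes.
	rng = np.random.RandomState(42)

	# fully observed matrices supply the ground-truth labels...
	complete = [rng.randint(0, 2, size=(10, 2)).astype(float) for _ in range(200)]

	# ...while matrices containing -1s supply realistic missingness masks
	masked = []
	for _ in range(50):
		m = rng.randint(0, 2, size=(10, 2)).astype(float)
		m[rng.rand(10, 2) < 0.2] = -1
		masked.append(m)

	model = PReLIM(cpgDensity=2)
	model.train(complete + masked, model_file="no")  # "no" skips saving to disk

	query = np.array([[ 1, -1],
	                  [ 0,  1],
	                  [-1,  0]], dtype=float)
	print(model.impute(query))  # -1 entries become methylation probabilities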