# Source code for clubcpg.Imputation

import pandas as pd
import numpy as np
import logging
import os
from clubcpg.ConnectToCpGNet import TrainWithPReLIM
from clubcpg.ParseBam import BamFileReadParser
from clubcpg_prelim import PReLIM
from pebble import ProcessPool
from joblib import load

class Imputation:
    """Convenient APIs to train PReLIM models and impute CpG matrices with them."""

    # Per-task timeout handed to pebble's ``ProcessPool.map`` (seconds).
    _EXTRACT_TIMEOUT = 5
    # Width of the window read upstream of each bin coordinate.
    # TODO unhardcode bin size
    _BIN_SIZE = 100

    def __init__(self, cpg_density: int, bam_file: str, mbias_read1_5=None,
                 mbias_read1_3=None, mbias_read2_5=None, mbias_read2_3=None,
                 processes=-1):
        """Store the configuration used by every extraction/imputation call.

        Arguments:
            cpg_density {int} -- Number of CpGs this class instance will be used for
            bam_file {str} -- path to the bam file

        Keyword Arguments:
            mbias_read1_5 -- forwarded to BamFileReadParser as ``read1_5`` (default: {None})
            mbias_read1_3 -- forwarded to BamFileReadParser as ``read1_3`` (default: {None})
            mbias_read2_5 -- forwarded to BamFileReadParser as ``read2_5`` (default: {None})
            mbias_read2_3 -- forwarded to BamFileReadParser as ``read2_3`` (default: {None})
            processes {int} -- number of CPUs to use when parallelization can be
                utilized; any non-positive value means all available (default: {-1})
        """
        # NOTE(review): the mbias_* values are presumably per-end trimming
        # lengths for M-bias correction -- confirm against BamFileReadParser.
        self.cpg_density = cpg_density
        self.bam_file = bam_file
        self.mbias_read1_5 = mbias_read1_5
        self.mbias_read1_3 = mbias_read1_3
        self.mbias_read2_5 = mbias_read2_5
        self.mbias_read2_3 = mbias_read2_3
        self.processes = processes

    def extract_matrices(self, coverage_data_frame: pd.DataFrame,
                         sample_limit: int = None, return_bins=False):
        """Extract CpG matrices from bam file.

        Arguments:
            coverage_data_frame {pd.DataFrame} -- Output of clubcpg-coverage read
                in as a csv file; must provide the ``cpgs`` and ``bin`` columns

        Keyword Arguments:
            sample_limit {int} -- Maximum number of bins to extract; when more
                bins are available a random subset (without repeats) is used
                (default: {None})
            return_bins {bool} -- Return the bin location along with the matrix
                (default: {False})

        Returns:
            [tuple] -- Returns tuple of (bins, matrices) lists if
                return_bins = True else returns only the list of matrices
        """
        subset = coverage_data_frame[coverage_data_frame['cpgs'] == self.cpg_density]
        bins_of_interest = subset['bin'].unique()

        # Downsample the training bins if requested and necessary.
        # replace=False so that no bin is drawn twice.
        if sample_limit and len(bins_of_interest) > sample_limit:
            bins_of_interest = np.random.choice(
                bins_of_interest, size=sample_limit, replace=False)

        # Nothing to extract: skip spawning worker processes altogether.
        if len(bins_of_interest) == 0:
            complete_results = []
        else:
            complete_results = self._run_extraction(bins_of_interest)

        clean_bins, clean_matrices = self._filter_by_density(complete_results)

        if return_bins:
            return clean_bins, clean_matrices
        return clean_matrices

    def _run_extraction(self, bins_of_interest):
        """Run ``_multiprocess_extract`` over the bins in a pebble ProcessPool.

        Arguments:
            bins_of_interest {iterable} -- bin ids to extract

        Returns:
            [list] -- (bin, matrix) tuples for every task that completed
        """
        # Before Python 3.11 the futures timeout is not the builtin one.
        from concurrent.futures import TimeoutError as FuturesTimeoutError

        # The documented default of -1 means "all CPUs"; translate it into a
        # worker count the pool can actually use.
        workers = self.processes
        if not workers or workers < 1:
            workers = os.cpu_count() or 1

        complete_results = []
        # Use the pebble ProcessPool because it can handle hanging processes
        # with a timeout. Leaving the ``with`` block closes and joins the pool.
        with ProcessPool(max_workers=workers) as pool:
            future = pool.map(self._multiprocess_extract, bins_of_interest,
                              timeout=self._EXTRACT_TIMEOUT)
            iterator = future.result()
            while True:
                try:
                    complete_results.append(next(iterator))
                except StopIteration:
                    break
                except (TimeoutError, FuturesTimeoutError) as error:
                    detail = error.args[1] if len(error.args) > 1 else error
                    print("Timeout caught - {}".format(detail))
                except Exception as error:
                    # Best effort: a failing bin must not abort the whole run.
                    print("Unknown exception = {}".format(error))
        return complete_results

    def _filter_by_density(self, results):
        """Keep only matrices whose column count equals ``self.cpg_density``.

        Arguments:
            results {iterable} -- (bin, matrix) tuples

        Returns:
            [tuple] -- (clean_bins, clean_matrices) as two parallel lists
        """
        clean_bins = []
        clean_matrices = []
        for bin_, matrix in results:
            try:
                if matrix.shape[1] == self.cpg_density:
                    clean_matrices.append(matrix)
                    clean_bins.append(bin_)
            except IndexError as e:
                # Failed extractions come back as an empty 1-D array, which
                # has no second dimension.
                logging.info("Index error at bin {}".format(bin_))
                logging.error(str(e))
                continue
        return clean_bins, clean_matrices

    def _multiprocess_extract(self, one_bin: str):
        """Function to be used for multiprocessing

        Arguments:
            one_bin {str} -- bin id as "chr7_222222"

        Returns:
            [tuple] -- bin, matrix; the matrix is an empty array when the bin
                could not be processed
        """
        try:
            # NOTE(review): 20 is presumably the minimum mapping quality
            # expected by BamFileReadParser -- confirm.
            read_parser = BamFileReadParser(self.bam_file, 20,
                                            read1_5=self.mbias_read1_5,
                                            read1_3=self.mbias_read1_3,
                                            read2_5=self.mbias_read2_5,
                                            read2_3=self.mbias_read2_3)
            # Split on the last underscore so contig names that themselves
            # contain underscores are still parsed.
            chrom, loc = one_bin.rsplit("_", 1)
            loc = int(loc)
            reads = read_parser.parse_reads(chrom, loc - self._BIN_SIZE, loc)
            matrix = read_parser.create_matrix(reads)
            matrix = matrix.dropna(how="all")
            # Missing calls are encoded as -1, the marker impute_from_model
            # looks for.
            matrix = matrix.fillna(-1)
            matrix = np.array(matrix)
            matrix = matrix.astype('int8')
        except Exception:
            # Deliberate best effort: report the bin with an empty matrix so
            # the caller can filter it out instead of losing the whole batch.
            logging.debug("Could not extract bin %s", one_bin, exc_info=True)
            return (one_bin, np.array([]))
        return (one_bin, matrix)

    def train_model(self, output_folder: str, matrices: iter):
        """Train a PReLIM model using :class:`.TrainWithPReLIM`

        Arguments:
            output_folder {str} -- Folder to save trained models
            matrices {iter} -- An iterable of CpGMatrices - ideally obtained
                through Imputation.extract_matrices()

        Returns:
            Whatever ``TrainWithPReLIM.train_model`` returns (the trained model)
        """
        trainer = TrainWithPReLIM(cpg_density=self.cpg_density, save_path=output_folder)
        return trainer.train_model(matrices)

    @staticmethod
    def postprocess_predictions(predicted_matrix, lower=0.2, upper=0.8):
        """Round predicted values to 0 or 1 when they are confident enough.

        Values that are exactly 0 or 1 are kept, values ``<= lower`` become 0.0,
        values ``>= upper`` become 1.0 and everything in between becomes NaN.

        Arguments:
            predicted_matrix -- matrix generated by imputation

        Keyword Arguments:
            lower {float} -- upper bound for rounding down to 0 (default: {0.2})
            upper {float} -- lower bound for rounding up to 1 (default: {0.8})

        Returns:
            [np.ndarray] -- float array with predictions as 1, 0, or NaN
        """
        values = np.asarray(predicted_matrix, dtype=float)
        rounded = np.full(values.shape, np.nan)
        rounded[values >= upper] = 1.0
        # Applied second so the lower bound wins if the bounds ever overlap.
        rounded[values <= lower] = 0.0
        # Exact calls are passed through whatever the thresholds are.
        exact = (values == 0) | (values == 1)
        rounded[exact] = values[exact]
        return rounded

    def impute_from_model(self, models_folder: str, matrices: iter, postprocess=True):
        """Generator to provide imputed matrices on-the-fly

        Arguments:
            models_folder {str} -- Path to directory containing trained PReLIM
                models named ``saved_model_<cpg_density>_cpgs.prelim``
            matrices {iter} -- An iterable of CpG matrices in which unknown
                values are encoded as -1

        Keyword Arguments:
            postprocess {bool} -- Round imputed values to 1s and 0s (default: {True})

        Yields:
            One matrix per input matrix, in the same order
        """
        model_path = os.path.join(
            models_folder, "saved_model_{}_cpgs.prelim".format(self.cpg_density))
        trained_model = PReLIM(cpgDensity=self.cpg_density)
        # SECURITY: joblib.load unpickles the file; only load models that come
        # from a trusted source.
        trained_model.model = load(model_path)
        print("Successfully loaded model: {}".format(model_path), flush=True)

        for m in matrices:
            if -1 in m:
                # only impute if there is an unknown
                pm = trained_model.impute(m.astype(float))
                if postprocess:
                    pm = self.postprocess_predictions(pm)
            else:
                # Nothing to impute, passback original matrix to keep list in order
                pm = m.copy()
            yield pm