Source code for genipe.formats.impute2


# This file is part of genipe.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


import logging

import numpy as np

from ..error import GenipeError


__author__ = "Louis-Philippe Lemieux Perreault"
__copyright__ = "Copyright 2014, Beaulieu-Saucier Pharmacogenomics Centre"
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


__all__ = ["matrix_from_line", "get_good_probs", "maf_from_probs",
           "dosage_from_probs", "hard_calls_from_probs",
           "maf_dosage_from_probs", "additive_from_probs"]


[docs]def matrix_from_line(impute2_line): """Generates the probability matrix from an IMPUTE2 line. Args: impute2_line (list): a single line from IMPUTE2's result (split by space) Returns: tuple: a tuple containing the marker's information (first five values of the line) and the matrix probability (numpy array, float) The shape of the matrix is n x 3 where n is the number of samples. The columns represent the probability for AA, AB and BB. Note ---- The ``impute2_line`` variable is a list of str, corresponding to a line from the IMPUTE2's result, split by space. """ # Creating the array and changing it's shape probabilities = np.array(impute2_line[5:], dtype=float) probabilities.shape = (len(probabilities) // 3, 3) return impute2_line[:5], probabilities
[docs]def get_good_probs(prob_matrix, min_prob=0.9): """Gathers good imputed genotypes (>= probability threshold). Args: prob_matrix (numpy.array): the probability matrix min_prob (float): the probability threshold Returns: numpy.array: a mask array containing the positions where the probabilities are equal or higher to the threshold """ return np.amax(prob_matrix, axis=1) >= min_prob
[docs]def maf_from_probs(prob_matrix, a1, a2, gender=None, site_name=None): """Computes MAF from a probability matrix (and gender if chromosome X). Args: prob_matrix (numpy.array): the probability matrix a1 (str): the first allele a2 (str): the second allele gender (numpy.array): the gender of the samples site_name (str): the name for this site Returns: tuple: a tuple containing three values: the minor allele frequency, the minor and the major allele. When 'gender' is not None, we assume that the MAF on chromosome X is required (hence, males count as 1, and females as 2 alleles). There is also an Exception raised if there are any heterozygous males. """ # By default, the MAF is NA, and a1=major, a2=minor maf = "NA" major, minor = a1, a2 # If there are no data, we return default values if prob_matrix.shape[0] == 0: return maf, minor, major if gender is None: # Not checking gender (this isn't chromosome X) nb_geno = np.bincount(np.argmax(prob_matrix, axis=1), minlength=3) maf = ((nb_geno[2] * 2) + nb_geno[1]) / (np.sum(nb_geno) * 2) else: # Getting the males and females males = (gender == 1) females = (gender == 2) # Male counts males_nb_geno = np.bincount(np.argmax(prob_matrix[males], axis=1), minlength=3) # Female counts females_nb_geno = np.bincount(np.argmax(prob_matrix[females], axis=1), minlength=3) # The total number of genotypes total_geno_males = np.sum(males_nb_geno) total_geno_females = np.sum(females_nb_geno) # If there are no genotypes, we return default values if (total_geno_males + total_geno_females) == 0: return maf, minor, major # There shouldn't be heterozygous genotypes for males if males_nb_geno[1] > 0: raise GenipeError("{}: heterozygous male " "present".format(site_name)) # Computing the frequencies maf = males_nb_geno[2] + (females_nb_geno[2] * 2) + females_nb_geno[1] maf /= (total_geno_males + (total_geno_females * 2)) # Is this the MAF? if maf != "NA" and maf > 0.5: minor, major = a1, a2 maf = 1 - maf return maf, minor, major
[docs]def maf_dosage_from_probs(prob_matrix, a1, a2, scale=2, gender=None, site_name=None): """Computes MAF and dosage vector from probs matrix. Args: prob_matrix (numpy.array): the probability matrix a1 (str): the first allele a2 (str): the second allele scale (int): the scale value gender (numpy.array): the gender of the samples site_name (str): the name for this site Returns: tuple: a tuple containing four values: the dosage vector, the minor allele frequency, the minor and the major allele. When 'gender' is not None, we assume that the MAF on chromosome X is required (hence, males count as 1, and females as 2 alleles). There is also an Exception raised if there are any heterozygous males. """ # By default, the MAF is NA, and a1=major, a2=minor maf = "NA" major, minor = a1, a2 # If there are no data, we return default values if prob_matrix.shape[0] == 0: return np.array([], dtype=float), maf, minor, major # Getting the dosage by default (by default, on the second allele) dosage = dosage_from_probs( homo_probs=prob_matrix[:, 2], hetero_probs=prob_matrix[:, 1], scale=scale, ) set_no_maf = False if gender is None: # Not checking gender (this isn't chromosome X) maf = dosage.sum() / (len(dosage) * 2) else: # Getting the males and females m = (gender == 1) f = (gender == 2) # Checking males genotype males_nb_geno = np.bincount(np.argmax(prob_matrix[m], axis=1), minlength=3) if males_nb_geno[1] > 0: raise GenipeError("{}: heterozygous male " "present".format(site_name)) # The number of alleles nb_alleles = m.sum() + (f.sum() * 2) if nb_alleles == 0: # Gender is unknown for all, so we compute frequency for right # dosage value, then we set the MAF to NA afterwards logging.warning("All samples have unknown gender, MAF will be NA") maf = dosage.sum() / (len(dosage) * 2) set_no_maf = True else: # Computing frequencies maf = (dosage[f].sum() + (dosage[m].sum() / 2)) / nb_alleles # Is this the MAF? if maf != "NA" and maf > 0.5: minor, major = a1, a2 maf = 1 - maf dosage = 2 - dosage return dosage, maf if not set_no_maf else "NA", minor, major
[docs]def dosage_from_probs(homo_probs, hetero_probs, scale=2): """Computes dosage from probability matrix (for the minor allele). Args: homo_probs (numpy.array): the probabilities for the homozygous genotype hetero_probs (numpy.array): the probabilities for the heterozygous genotype scale (int): the scale value Returns: numpy.array: the dosage computed from the probabilities """ return (homo_probs + (hetero_probs / 2)) * scale
[docs]def hard_calls_from_probs(a1, a2, probs): """Computes hard calls from probability matrix. Args: a1 (str): the first allele a2 (str): the second allele probs (numpy.array): the probability matrix Returns: numpy.array: the hard calls computed from the probabilities """ possible_geno = np.array([ " ".join([a1] * 2), # Homo A1 " ".join([a1, a2]), # Hetero " ".join([a2] * 2), # Homo A2 ]) return possible_geno[np.argmax(probs, axis=1)]
[docs]def additive_from_probs(a1, a2, probs): """Compute additive format from probability matrix. Args: a1 (str): the a1 allele a2 (str): the a2 allele probs (numpy.array): the probability matrix Returns: tuple: the additive format computed from the probabilities, the minor and major allele. The encoding is as follow: 0 when homozygous major allele, 1 when heterozygous and 2 when homozygous minor allele. The minor and major alleles are inferred by looking at the MAF. By default, we think a2 is the minor allele, but flip if required. """ calls = np.argmax(probs, axis=1) minor = a2 major = a1 if np.sum(calls) / (len(calls)*2) > 0.5: calls = 2 - calls minor = a1 major = a2 return calls, minor, major