# Source code for genipe.reporting.autoreport


# This file is part of genipe.
#
# This work is licensed under the Creative Commons Attribution-NonCommercial
# 4.0 International License. To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative
# Commons, PO Box 1866, Mountain View, CA 94042, USA.


import os
import shutil
from datetime import date

from pkg_resources import resource_filename

from . import utils
from .. import __version__
from ..error import GenipeError


__author__ = "Louis-Philippe Lemieux Perreault"
__copyright__ = "Copyright 2014, Beaulieu-Saucier Pharmacogenomics Centre"
__license__ = "Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)"


__all__ = ["generate_report", ]


def generate_report(out_dir, run_opts, run_info):
    """Generate the LaTeX report and its companion files.

    Args:
        out_dir (str): the output directory for the report
        run_opts (object): the run options (read through attribute access,
                           e.g. ``run_opts.report_title``)
        run_info (dict): the run information

    Raises:
        GenipeError: if a ``FileNotFoundError`` occurs while writing
                     ``report.tex``

    Note:
        ``run_info`` is modified: each copied figure has its entry replaced
        by the base name of the file, and the same dictionary is handed to
        every section generator.

    """
    # Configuring Jinja2
    jinja2_env = utils.config_jinja2()

    # Gathering the report data (everything is sanitized for LaTeX)
    today = date.today()
    report_data = {
        "report_number": utils.sanitize_tex(run_opts.report_number),
        "title": utils.sanitize_tex(run_opts.report_title),
        "author": utils.sanitize_tex(run_opts.report_author),
        "month": utils.sanitize_tex("{:%B}".format(today)),
        "day": utils.sanitize_tex("{:%d}".format(today)),
        "year": utils.sanitize_tex("{:%Y}".format(today)),
        "package_name": utils.sanitize_tex(__name__.split(".")[0]),
        "package_version": utils.sanitize_tex(__version__),
    }

    # The figures are copied next to the report, and only their base name is
    # kept in the run information
    for figure in ("frequency_barh",):
        assert figure in run_info, figure
        figure_path = run_info[figure]
        if figure_path == "":
            continue
        shutil.copy(figure_path, out_dir)
        run_info[figure] = os.path.basename(figure_path)

    # Gathering the report content (the sections are generated in order)
    section_builders = (
        _generate_background,
        _generate_methods,
        _generate_results,
        _generate_conclusions,
    )
    report_data["report_content"] = "".join(
        build(jinja2_env, run_opts, run_info) for build in section_builders
    )

    # Gathering the annex content
    report_data["annex_content"] = _generate_annex(
        jinja2_env, run_opts, run_info,
    )

    # Getting the template
    main_template = jinja2_env.get_template("main_template.tex")

    # Writing the report
    report_filename = os.path.join(out_dir, "report.tex")
    try:
        with open(report_filename, "w") as report_file:
            print(main_template.render(**report_data), file=report_file)
    except FileNotFoundError:
        raise GenipeError("{}: cannot write file".format(report_filename))

    # Copying the bibliography file, the bibliography style and the Makefile
    # (to help build the report)
    packaged_files = (
        ("biblio", "references.bib"),
        ("biblio", "references.bst"),
        ("utils", "Makefile"),
    )
    for directory, basename in packaged_files:
        packaged_file = resource_filename(
            __name__,
            os.path.join("templates", directory, basename),
        )
        shutil.copy(packaged_file, out_dir)
def _generate_background(templates, run_options, run_information):
    """Generates the background section of the report.

    Args:
        templates (jinja2.Environment): the jinja2 template environment
        run_options (object): the run options (read through attribute
                              access)
        run_information (dict): the run information (unused here)

    Returns:
        str: a string representation of the "background" section

    """
    # Some assertion
    assert "report_background" in run_options

    # The background can either be a file or a string. If it is a file, its
    # non-empty lines are joined using a single space.
    background_content = run_options.report_background
    if os.path.isfile(background_content):
        with open(background_content, "r") as i_file:
            background_content = " ".join(
                line for line in i_file.read().splitlines() if line != ""
            )

    # Loading the template
    section_template = templates.get_template("section_template.tex")

    # Returning the section
    return section_template.render(
        section_name="Background",
        section_type="section",
        section_label="sec:background",
        section_content=utils.sanitize_tex(background_content),
    )


def _generate_methods(templates, run_options, run_information):
    """Generate the method section of the report.

    Args:
        templates (jinja2.Environment): the jinja2 template environment
        run_options (object): the run options (read through attribute
                              access)
        run_information (dict): the run information

    Returns:
        str: a string representation of the "methods" section

    """
    # Some assertions
    required_variables = ["shapeit_version", "impute2_version",
                          "plink_version", "initial_nb_markers",
                          "initial_nb_samples", "nb_duplicates",
                          "nb_ambiguous", "nb_flip", "nb_exclude",
                          "nb_phasing_markers", "nb_flip_reference",
                          "nb_special_markers", "reference_checked",
                          "no_marker_left", "no_imputed_sites",
                          "nb_samples_no_gender"]
    for required_variable in required_variables:
        assert required_variable in run_information, required_variable

    # Loading the templates
    section_template = templates.get_template("section_template.tex")
    itemize_template = templates.get_template("iterate_template.tex")
    methods = templates.get_template("parts/methods.tex")

    # Are there any filtering rules?
    filtering_rules = ""
    if run_options.filtering_rules is not None:
        last_rule = len(run_options.filtering_rules) - 1
        filtering_rules = utils.sanitize_tex(" (filtering out sites where")
        for i, rule in enumerate(run_options.filtering_rules):
            # The first rule directly follows the text, the last one is
            # introduced by "or", and the others are comma separated
            separator = ", "
            if i == 0:
                separator = " "
            elif i == last_rule:
                separator = " or "
            filtering_rules += utils.sanitize_tex(separator)
            filtering_rules += utils.format_tex(
                utils.sanitize_tex(rule),
                "texttt",
            )
        filtering_rules += utils.sanitize_tex(")")

    # The input files
    data_files = [
        "{}.{}".format(run_options.bfile, ext)
        for ext in ("bed", "bim", "fam")
    ]

    # The text for the different steps
    steps = []

    # Was there an initial reference check?
    to_add_1 = ""
    to_add_2 = ""
    if run_information["reference_checked"]:
        to_add_1 = utils.sanitize_tex(
            "An initial strand check was also performed using the human "
            "reference genome. "
        )
        to_add_2 = utils.format_tex(
            utils.sanitize_tex(
                " Also, {nb_flip} markers were flipped because of strand "
                "issue.".format(
                    nb_flip=run_information["nb_flip_reference"],
                )
            ),
            "textbf",
        )

    # The ambiguous and duplicated markers that were removed
    steps.append(utils.wrap_tex(
        utils.sanitize_tex("Ambiguous markers with alleles ") +
        utils.format_tex("A", "texttt") + "/" +
        utils.format_tex("T", "texttt") + " and " +
        utils.format_tex("C", "texttt") + "/" +
        utils.format_tex("G", "texttt") +
        utils.sanitize_tex(
            ", duplicated markers (same position), and markers located on "
            "the mitochondrial or the Y chromosomes were excluded from the "
            "imputation. "
        ) + to_add_1 + utils.format_tex(
            utils.sanitize_tex(
                "In total, {ambiguous} ambiguous, {duplicated} duplicated and "
                "{special} Y/mitochondrial markers were excluded.".format(
                    ambiguous=run_information["nb_ambiguous"],
                    duplicated=run_information["nb_duplicates"],
                    special=run_information["nb_special_markers"],
                )
            ),
            "textbf",
        ) + to_add_2
    ))

    # The number of markers that were flipped
    steps.append(utils.wrap_tex(
        utils.sanitize_tex(
            "Markers' strand was checked using the SHAPEIT algorithm and "
            "IMPUTE2's reference files. "
        ) + utils.format_tex(
            utils.sanitize_tex(
                "In total, {nb_markers} markers had an incorrect strand and "
                "were flipped using Plink.".format(
                    nb_markers=run_information["nb_flip"],
                )
            ),
            "textbf",
        )
    ))

    # The number of excluded markers because of strand problem
    steps.append(utils.wrap_tex(
        utils.sanitize_tex(
            "The strand of each marker was checked again using SHAPEIT against "
            "IMPUTE2's reference files. "
        ) + utils.format_tex(
            utils.sanitize_tex(
                "In total, {nb_markers} markers were found to still be on the "
                "wrong strand, and were hence excluded from the final dataset "
                "using Plink.".format(
                    nb_markers=run_information["nb_exclude"],
                )
            ),
            "textbf",
        )
    ))

    steps = itemize_template.render(iteration_type="enumerate",
                                    iteration_list=steps)

    # Returning the section
    return section_template.render(
        section_name="Methods",
        section_type="section",
        section_label="sec:methods",
        section_content=methods.render(
            data_files=data_files,
            steps_data=steps,
            filtering_rules=filtering_rules,
            **run_information
        ),
    )


def _cross_validation_tabulars(table_1, table_2, tabular_template,
                               header_table_1, header_table_2):
    """Creates the two side by side cross-validation tabulars.

    Args:
        table_1 (list): the rows of the first table (the first column
                        contains the interval)
        table_2 (list): the rows of the second table (the first column
                        contains the interval, where ``>=`` is replaced by
                        ``\\geq``)
        tabular_template (jinja2.Template): the template for the tabular
        header_table_1 (list): the header of the first table
        header_table_2 (list): the header of the second table

    Returns:
        str: the two tabulars separated by ``\\hfill``

    Note:
        New rows are created (instead of modifying the rows in place) so
        that the tables received from the caller keep their raw values.

    """
    data_1 = [
        [utils.tex_inline_math(row[0])] + list(row[1:]) for row in table_1
    ]
    tabular_1 = utils.create_tabular(
        template=tabular_template,
        header=header_table_1,
        col_align=["c", "r", "r"],
        data=data_1,
    )

    data_2 = [
        [utils.tex_inline_math(row[0].replace(">=", r"\geq "))] +
        list(row[1:])
        for row in table_2
    ]
    tabular_2 = utils.create_tabular(
        template=tabular_template,
        header=header_table_2,
        col_align=["c", "r", "r"],
        data=data_2,
    )

    return tabular_1 + r"\hfill" + tabular_2


def _generate_results(templates, run_options, run_information):
    """Generates the results section of the report.

    Args:
        templates (jinja2.Environment): the jinja2 template environment
        run_options (object): the run options (read through attribute
                              access)
        run_information (dict): the run information

    Returns:
        str: a string representation of the "results" section

    Note:
        The ``frequency_float`` key is added to ``run_information`` (it is
        the empty string when there is no frequency bar plot). The
        cross-validation tables of ``run_information`` are not modified.

    """
    # Some assertions
    required_variables = ["cross_validation_final_nb_genotypes",
                          "cross_validation_nb_genotypes_chrom",
                          "cross_validation_table_1",
                          "cross_validation_table_2",
                          "cross_validation_table_1_chrom",
                          "cross_validation_table_2_chrom",
                          "prob_threshold", "nb_imputed",
                          "average_comp_rate", "rate_threshold",
                          "info_threshold", "nb_good_sites",
                          "average_comp_rate_cleaned", "mean_missing",
                          "nb_samples", "nb_genotyped",
                          "nb_genotyped_not_complete",
                          "pct_genotyped_not_complete",
                          "nb_geno_now_complete", "pct_geno_now_complete",
                          "nb_site_now_complete", "pct_good_sites",
                          "nb_missing_geno", "nb_maf_nan",
                          "nb_marker_with_maf", "nb_maf_geq_01",
                          "nb_maf_geq_05", "nb_maf_lt_05", "nb_maf_lt_01",
                          "nb_maf_geq_01_lt_05", "pct_maf_geq_01",
                          "pct_maf_geq_05", "pct_maf_lt_05",
                          "pct_maf_lt_01", "pct_maf_geq_01_lt_05",
                          "frequency_barh"]
    for required_variable in required_variables:
        assert required_variable in run_information, required_variable

    # Loading the templates
    section_template = templates.get_template("section_template.tex")
    tabular_template = templates.get_template("tabular_template.tex")
    graphics_template = templates.get_template("graphics_template.tex")
    float_template = templates.get_template("float_template.tex")
    cross_validation = templates.get_template("parts/cross_validation.tex")
    completion_rate = templates.get_template("parts/completion_rate.tex")
    frequencies = templates.get_template("parts/frequencies.tex")

    # The header of the two kind of tables
    header_table_1 = [
        utils.format_tex(utils.sanitize_tex("Interval"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Nb Geno"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Concordance (%)"), "textbf"),
    ]
    header_table_2 = [
        utils.format_tex(utils.sanitize_tex("Interval"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Called (%)"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Concordance (%)"), "textbf"),
    ]

    # Creating the tables
    tables = ""

    # Adding the table for each of the chromosomes
    for chrom in run_options.required_chrom:
        # Getting the two tables
        chrom_tabulars = _cross_validation_tabulars(
            table_1=run_information["cross_validation_table_1_chrom"][chrom],
            table_2=run_information["cross_validation_table_2_chrom"][chrom],
            tabular_template=tabular_template,
            header_table_1=header_table_1,
            header_table_2=header_table_2,
        )

        # The number of genotypes
        nb_genotypes = run_information["cross_validation_nb_genotypes_chrom"]
        nb_genotypes = nb_genotypes[chrom]

        # Adding the float
        tables += utils.create_float(
            template=float_template,
            float_type="table",
            caption=utils.wrap_tex(utils.sanitize_tex(
                "IMPUTE2's internal cross-validation for chromosome {}. "
                "Tables show the percentage of concordance between genotyped "
                "calls and imputed calls for {:,d} "
                "genotypes.".format(chrom, nb_genotypes)
            )),
            label="tab:cross_validation_chr_{}".format(chrom),
            placement="H",
            content=chrom_tabulars,
        )

    # The tables for all the chromosomes (always created, but only added to
    # the report if there is more than one chromosome)
    genome_tabulars = _cross_validation_tabulars(
        table_1=run_information["cross_validation_table_1"],
        table_2=run_information["cross_validation_table_2"],
        tabular_template=tabular_template,
        header_table_1=header_table_1,
        header_table_2=header_table_2,
    )

    # The number of genotypes
    nb_genotypes = run_information["cross_validation_final_nb_genotypes"]

    # Adding the float
    if len(run_options.required_chrom) > 1:
        tables += "\n\n" + utils.create_float(
            template=float_template,
            float_type="table",
            caption=utils.wrap_tex(utils.sanitize_tex(
                "IMPUTE2's internal cross-validation across the genome. "
                "Tables show the percentage of concordance between genotyped "
                "calls and imputed calls for {:,d} "
                "genotypes.".format(nb_genotypes)
            )),
            label="tab:cross_validation",
            placement="H",
            content=genome_tabulars,
        )

    # Creating the cross-validation subsection
    cross_validation_content = section_template.render(
        section_name="Cross-validation",
        section_type="subsection",
        section_content=cross_validation.render(
            single_chromosome=len(run_options.required_chrom) == 1,
            first_chrom=run_options.required_chrom[0],
            last_chrom=run_options.required_chrom[-1],
            tables=tables,
        ),
        section_label="subsec:cross_validation",
    )

    # Creating the completion rate subsection
    completion_rate_content = section_template.render(
        section_name="Completion rate",
        section_type="subsection",
        section_content=completion_rate.render(**run_information),
        section_label="subsec:completion_rate",
    )

    # Do we have a frequency bar plot?
    frequency_float = ""
    if run_information["frequency_barh"] != "":
        frequency_float = utils.create_float(
            template=float_template,
            float_type="figure",
            caption=utils.wrap_tex(utils.sanitize_tex(
                "Proportions of minor allele frequencies for imputed "
                "sites with a completion rate of {}% or "
                "more at a probability of {}% or "
                "more.".format(run_information["rate_threshold"],
                               run_information["prob_threshold"])
            )),
            label="fig:frequency_barh",
            placement="H",
            content=graphics_template.render(
                width=r"0.9\textwidth",
                path=run_information["frequency_barh"],
            ),
        )
    # The float is passed to the template through the run information
    run_information["frequency_float"] = frequency_float

    # Creating the frequency subsection
    frequencies_content = section_template.render(
        section_name="Minor allele frequencies",
        section_type="subsection",
        section_content=frequencies.render(**run_information),
        section_label="subsec:maf",
    )

    # The final content
    content = (cross_validation_content + completion_rate_content +
               frequencies_content)

    return section_template.render(section_name="Results",
                                   section_type="section",
                                   section_content=content,
                                   section_label="sec:results")
def _describe_output_file(filename, *description):
    """Formats one entry of the list of output files.

    Args:
        filename (str): the name (pattern) of the output file (it is
                        sanitized and typeset using ``texttt``)
        description (str): the pieces of the description (already formatted
                           for LaTeX), concatenated after the file name

    Returns:
        str: the wrapped entry

    """
    return utils.wrap_tex(
        utils.format_tex(utils.sanitize_tex(filename), "texttt") +
        "".join(description)
    )


def _generate_conclusions(templates, run_options, run_information):
    """Generates the conclusions section of the report.

    Args:
        templates (jinja2.Environment): the jinja2 template environment
        run_options (object): the run options (read through attribute
                              access)
        run_information (dict): the run information

    Returns:
        str: a string representation of the "conclusions" section

    Note:
        The ``output_dir``, ``output_dir_chrom``, ``output_final_impute2``
        and ``output_files`` keys are added to ``run_information``.

    """
    # Some assertions
    required_variables = ["nb_good_sites", "prob_threshold",
                          "rate_threshold", "info_threshold",
                          "nb_genotyped"]
    for required_variable in required_variables:
        assert required_variable in run_information, required_variable

    # Loading the template
    section_template = templates.get_template("section_template.tex")
    conclusions = templates.get_template("parts/conclusions.tex")
    itemize_template = templates.get_template("iterate_template.tex")

    # Adding the required information (output directories)
    run_information["output_dir"] = utils.sanitize_tex(run_options.out_dir)
    run_information["output_dir_chrom"] = utils.sanitize_tex(
        os.path.join(run_options.out_dir, "chr*")
    )
    run_information["output_final_impute2"] = utils.sanitize_tex(
        os.path.join(run_options.out_dir, "chr*", "final_impute2")
    )

    # The thresholds (in math mode) which are part of the descriptions
    prob_threshold = utils.tex_inline_math(
        r"\geq {}\%".format(run_information["prob_threshold"])
    )
    rate_threshold = utils.tex_inline_math(
        r"\geq {}\%".format(run_information["rate_threshold"])
    )
    info_threshold = utils.tex_inline_math(
        r"\geq {}".format(run_information["info_threshold"])
    )

    # Output files
    output_files = [
        _describe_output_file(
            "chr*.imputed.alleles",
            utils.sanitize_tex(": description of the reference and "
                               "alternative allele at each site."),
        ),
        _describe_output_file(
            "chr*.imputed.completion_rates",
            utils.sanitize_tex(": number of missing values and completion "
                               "rate for all site (using a probability "
                               "threshold "),
            prob_threshold,
            ").",
        ),
        _describe_output_file(
            "chr*.imputed.good_sites",
            utils.sanitize_tex(": list of sites which pass the information "
                               "threshold ("),
            info_threshold,
            utils.sanitize_tex(") and the completion rate threshold ("),
            rate_threshold,
            utils.sanitize_tex(") using the probability threshold "),
            prob_threshold,
            ".",
        ),
        _describe_output_file(
            "chr*.imputed.impute2",
            utils.sanitize_tex(": imputation results (merged from all "
                               "segments)."),
        ),
        _describe_output_file(
            "chr*.imputed.impute2_info",
            utils.sanitize_tex(": the IMPUTE2 marker-wise information file "
                               "(merged from all segments)."),
        ),
        _describe_output_file(
            "chr*.imputed.imputed_sites",
            utils.sanitize_tex(": list of imputed sites (excluding sites that "
                               "were previously genotyped in the study "
                               "cohort)."),
        ),
        _describe_output_file(
            "chr*.imputed.log",
            utils.sanitize_tex(": log file of the merging procedure."),
        ),
        _describe_output_file(
            "chr*.imputed.maf",
            utils.sanitize_tex(": minor allele frequency (along with minor "
                               "allele identification) for all sites using "
                               "the probability threshold "),
            prob_threshold,
            ".",
        ),
        _describe_output_file(
            "chr*.imputed.map",
            utils.sanitize_tex(": a map file describing the genomic location "
                               "of all sites."),
        ),
        _describe_output_file(
            "chr*.imputed.sample",
            utils.sanitize_tex(": the sample file generated by the phasing "
                               "step."),
        ),
    ]

    # Formating the enumeration of files
    run_information["output_files"] = itemize_template.render(
        iteration_type="itemize",
        iteration_list=output_files,
    )

    # Returning the section
    return section_template.render(
        section_name="Conclusions",
        section_type="section",
        section_label="sec:conclusions",
        section_content=conclusions.render(**run_information),
    )


def _generate_annex(templates, run_options, run_information):
    """Generates the annex section of the report (execution times).

    Args:
        templates (jinja2.Environment): the jinja2 template environment
        run_options (object): the run options (unused here)
        run_information (dict): the run information

    Returns:
        str: a string representation of the "Annex" section

    """
    # Some assertions
    required_variables = ["plink_exclude_exec_time",
                          "shapeit_check_1_exec_time",
                          "shapeit_check_2_exec_time",
                          "plink_missing_exec_time", "plink_flip_exec_time",
                          "plink_final_exec_time", "shapeit_phase_exec_time",
                          "merge_impute2_exec_time", "impute2_exec_time",
                          "bgzip_exec_time"]
    for required_variable in required_variables:
        assert required_variable in run_information, required_variable

    # Loading the templates
    tabular_template = templates.get_template("tabular_template.tex")
    float_template = templates.get_template("float_template.tex")

    # This section content
    content = ("The following tables show the execution time required by all "
               "the different tasks. All tasks are split by chromosomes. "
               "Execution times for imputation for each chromosome are means "
               "of individual segment times. Computing all genotyped markers' "
               "missing rate took {}.")
    content = utils.wrap_tex(utils.sanitize_tex(content.format(
        utils.format_time(
            run_information["plink_missing_exec_time"],
            written_time=True,
        ),
    )))

    # The header of the tables
    table_header = [
        utils.format_tex(utils.sanitize_tex("Chrom"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Time"), "textbf"),
    ]

    # The header of the imputation table (which has more columns)
    impute2_header = [
        utils.format_tex(utils.sanitize_tex("Chrom"), "textbf"),
        utils.format_tex(utils.sanitize_tex("Nb Seg."), "textbf"),
        utils.format_tex(utils.sanitize_tex("Mean T."), "textbf"),
        utils.format_tex(utils.sanitize_tex("Max T."), "textbf"),
    ]

    # The tables, in the order they appear in the annex:
    # (run information key, task name, label, header, first time column)
    time_tables = [
        # The first table (plink_exclude_chr*)
        ("plink_exclude_exec_time", "plink_exclude_chr*",
         "plink_exclude_exec_time", table_header, 1),
        # The second table (shapeit_check_chr*_1)
        ("shapeit_check_1_exec_time", "shapeit_check_chr*_1",
         "shapeit_check_1_exec_time", table_header, 1),
        # The third table (plink_flip_chr*)
        ("plink_flip_exec_time", "plink_flip_chr*",
         "plink_flip_exec_time", table_header, 1),
        # The fourth table (shapeit_check_chr*_2)
        ("shapeit_check_2_exec_time", "shapeit_check_chr*_2",
         "shapeit_check_2_exec_time", table_header, 1),
        # The fifth table (plink_final_exclude_chr*)
        ("plink_final_exec_time", "plink_final_exclude_chr*",
         "plink_final_exclude_exec_time", table_header, 1),
        # The sixth table (shapeit_phase_chr*)
        ("shapeit_phase_exec_time", "shapeit_phase_chr*",
         "shapeit_phase_exec_time", table_header, 1),
        # The seventh table (impute2_chr*), where time starts at column 2
        ("impute2_exec_time", "impute2_chr*",
         "impute2_exec_time", impute2_header, 2),
        # The eighth table (merge_impute2_chr*)
        ("merge_impute2_exec_time", "merge_impute2_chr*",
         "merge_impute2_exec_time", table_header, 1),
    ]

    # The last table (bgzip_chr*) only if present
    if run_information["bgzip_exec_time"]:
        time_tables.append(
            ("bgzip_exec_time", "bgzip_chr*", "bgzip_exec_time",
             table_header, 1),
        )

    # The tables are separated from the text by an empty line
    content += "\n\n"
    for key, task_name, label, header, first_time_col in time_tables:
        content += _generate_time_float(
            table=run_information[key],
            header=header,
            task_name=task_name,
            label=label,
            tabular_t=tabular_template,
            float_t=float_template,
            first_time_col=first_time_col,
        )

    return content


def _generate_time_float(task_name, label, table, header, tabular_t, float_t,
                         first_time_col=1):
    """Generates time tables (split one long table in two).

    Args:
        task_name (str): the name of the task
        label (str): the label for the float
        table (list): the data for the float
        header (list): the header for the tables
        tabular_t (jinja2.Template): the template for the tabular
        float_t (jinja2.Template): the template for the float
        first_time_col (int): the first column containing time (base 0)

    Returns:
        str: a LaTeX float

    """
    # Tables with more than 11 rows are split in two halves (shown side by
    # side)
    two_tables = True
    sep = len(table) // 2
    if len(table) <= 11:
        two_tables = False
        sep = len(table)

    # Adding the first table
    table_1 = utils.create_tabular(
        template=tabular_t,
        header=header,
        col_align=["r"] * len(header),
        data=_format_time_columns(table[:sep], first_time_col),
    )

    # Adding the second table
    table_2 = ""
    if two_tables:
        table_2 = r"\hspace{1cm}"
        table_2 += utils.create_tabular(
            template=tabular_t,
            header=header,
            col_align=["r"] * len(header),
            data=_format_time_columns(table[sep:], first_time_col),
        )

    # The caption
    caption = utils.sanitize_tex("Execution time for the '")
    caption += utils.format_tex(utils.sanitize_tex(task_name), "texttt")
    caption += utils.sanitize_tex("' tasks.")

    # Returning the float
    return utils.create_float(
        template=float_t,
        float_type="table",
        caption=utils.wrap_tex(caption),
        label="tab:{}".format(label),
        placement="H",
        content=table_1 + table_2,
    )


def _format_time_columns(table, first_col):
    """Colorize the time in the table (columns ``first_col`` and up).

    Args:
        table (list): the data for the tabular
        first_col (int): the first column containing time (base 0)

    Returns:
        list: a copy of the data, where the time columns are colorized

    Note:
        New rows are returned and ``table`` is left untouched. Slices of the
        caller's table share their rows with it, so modifying the rows in
        place would overwrite the execution times of the caller.

    """
    return [
        list(row[:first_col]) +
        [utils.colorize_time(value) for value in row[first_col:]]
        for row in table
    ]