# Source code for pytximport._cli

"""Expose the tximport function as a command-line tool."""

from logging import basicConfig, log, warning
from pathlib import Path

import click
import numpy as np
from click_default_group import DefaultGroup

from .core import tximport
from .utils import create_transcript_gene_map_from_annotation



# [docs]
@click.group(
    help="Welcome to the pytximport command-line interface for importing transcript-level quantification files.",
    cls=DefaultGroup,
    default_if_no_args=True,
    default="run",
)
@click.pass_context
def cli(  # type: ignore  # pragma: no cover
    ctx: click.Context,
):
    """Act as the root command group of the pytximport command-line interface.

    The group performs no work of its own; it only hosts the sub-commands registered on it via
    `@cli.command`. It is configured with `DefaultGroup`, `default="run"` and `default_if_no_args=True`.
    The user-facing help text is passed explicitly through `help=` rather than taken from this docstring.

    Args:
        ctx (click.Context): The click context injected by `click.pass_context`. It is not used here.
    """



@cli.command(
    no_args_is_help=True,
)
@click.option(
    "-i",
    "--file_paths",
    "--file-paths",
    type=click.Path(exists=False),
    multiple=True,
    help="The path to a quantification file. To provide multiple input files, use `-i input1.sf -i input2.sf ...`.",
    required=True,
)
@click.option(
    "-t",
    "--data_type",
    "--data-type",
    type=click.Choice(["kallisto", "salmon", "sailfish", "oarfish", "piscem", "stringtie", "rsem", "tsv"]),
    help="The type of quantification files.",
    required=True,
)
@click.option(
    "-m",
    "--transcript_gene_map",
    "--transcript-gene-map",
    type=click.Path(exists=True),
    help=(
        "The path to the transcript to gene map. Either a tab-separated (.tsv) or comma-separated (.csv) file. "
        "Expected column names are `transcript_id` and `gene_id`."
    ),
)
@click.option(
    "-c",
    "--counts_from_abundance",
    "--counts-from-abundance",
    type=click.Choice(["scaled_tpm", "length_scaled_tpm", "dtu_scaled_tpm"]),
    help=(
        "The method to calculate the counts from the abundance. Leave empty to use counts. "
        "For differential gene expression analysis, we recommend using `length_scaled_tpm`. "
        "For differential transcript expression analysis, we recommend using `scaled_tpm`. "
        "For differential isoform usage analysis, we recommend using `dtu_scaled_tpm`."
    ),
)
@click.option(
    "-o",
    # The first `--` name determines the keyword passed to the callback, so `--output_path` stays first.
    "--output_path",
    # Kebab-case spelling, matching the snake_case/kebab-case pair offered by every other option.
    "--output-path",
    # Previously accepted spelling, kept so existing invocations continue to work.
    "--save-path",
    type=click.Path(),
    help="The output path to save the resulting counts to.",
    required=True,
)
@click.option(
    "-of",
    "--output_format",
    "--output-format",
    type=click.Choice(["csv", "h5ad"]),
    help="The format of the output file.",
)
@click.option(
    "-ow",
    # The first `--` name determines the keyword passed to the callback, so this one stays first.
    "--output_path_overwrite",
    # Kebab-case spelling, matching the snake_case/kebab-case pair offered by every other option.
    "--output-path-overwrite",
    # Previously accepted spelling, kept so existing invocations continue to work.
    "--save-path-overwrite",
    is_flag=True,
    help="Provide this flag to overwrite an existing file at the output path.",
)
@click.option(
    "--ignore_after_bar",
    "--ignore-after-bar",
    type=bool,
    default=True,
    help="Whether to split the transcript id after the bar character (`|`).",
)
@click.option(
    "--ignore_transcript_version",
    "--ignore-transcript-version",
    type=bool,
    default=True,
    help="Whether to ignore the transcript version.",
)
@click.option(
    "-gl",
    "--gene_level",
    "--gene-level",
    is_flag=True,
    help="Provide this flag when importing gene-level counts from RSEM files.",
)
@click.option(
    "-tx",
    "--return_transcript_data",
    "--return-transcript-data",
    is_flag=True,
    help=(
        "Provide this flag to return transcript-level instead of gene-summarized data. "
        "Incompatible with gene-level input and `counts_from_abundance=length_scaled_tpm`."
    ),
)
@click.option(
    "-ir",
    "--inferential_replicates",
    "--inferential-replicates",
    is_flag=True,
    help="Provide this flag to make use of inferential replicates. Will use the median of the inferential replicates.",
)
@click.option(
    "-id",
    "--id_column",
    "--id-column",
    type=str,
    help="The column name for the transcript id.",
)
@click.option(
    "-counts",
    "--counts_column",
    "--counts-column",
    type=str,
    help="The column name for the counts.",
)
@click.option(
    "-length",
    "--length_column",
    "--length-column",
    type=str,
    help="The column name for the length.",
)
@click.option(
    "-tpm",
    "--abundance_column",
    "--abundance-column",
    type=str,
    help="The column name for the abundance.",
)
@click.option(
    "--existence_optional",
    "--existence-optional",
    is_flag=True,
    help="Whether the existence of the files is optional.",
)
def run(  # type: ignore  # pragma: no cover
    **kwargs,
) -> None:
    """Call the tximport function via the command line."""
    # NOTE: no `help=` is passed to `@cli.command`, so click uses the docstring above as the command's
    # help text. Keep it short and user-facing; implementation notes belong in comments like this one.
    #
    # `kwargs` holds one entry per `click.option` declared above, keyed by the option's snake_case name.
    # The whole mapping is forwarded unchanged to `tximport`, together with the three fixed entries below.

    # Level 25 sits between INFO (20) and WARNING (30) of the standard logging levels.
    basicConfig(level=25, format="%(asctime)s: %(message)s")

    # Settings that are not exposed as command-line options and are therefore fixed for every CLI run.
    kwargs["return_data"] = False
    kwargs["output_type"] = "anndata"
    # Always supplied, whether or not the inferential replicates flag was given; takes the median along axis 1.
    kwargs["inferential_replicate_transformer"] = lambda x: np.median(x, axis=1)

    tximport(**kwargs)  # type: ignore


@cli.command(
    no_args_is_help=True,
)
@click.option(
    "-i",
    "--input_file",
    "--input",
    type=click.Path(exists=True),
    help="The path to the annotation GTF file.",
    required=True,
)
@click.option(
    "-o",
    "--output_file",
    "--output",
    type=click.Path(),
    help="The output path to save the resulting transcript-to-gene mapping file to.",
    required=True,
)
@click.option(
    "-ow",
    "--output_path_overwrite",
    "--save-path-overwrite",
    is_flag=True,
    help="Provide this flag to overwrite an existing file at the output path.",
)
@click.option(
    "--source-field",
    "--source_field",
    type=str,
    help="The annotation field to use as the source in the mapping file.",
    required=False,
)
@click.option(
    "--target-field",
    "--target_field",
    type=str,
    multiple=True,
    help="The annotation field(s) to use as the target in the mapping file.",
    required=False,
)
@click.option(
    "--keep-biotype",
    "--keep_biotype",
    is_flag=True,
    help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
)
def create_map(  # type: ignore  # pragma: no cover
    **kwargs,
) -> None:
    """Create a transcript-to-gene mapping file via the command line."""
    # NOTE: no `help=` is passed to `@cli.command`, so click uses the docstring above as the command's
    # help text. Keep it short and user-facing; implementation notes belong in comments like this one.
    #
    # `kwargs` holds one entry per `click.option` declared above: `input_file`, `output_file`,
    # `output_path_overwrite`, `source_field`, `target_field` and `keep_biotype`.

    # Level 25 sits between INFO (20) and WARNING (30) of the standard logging levels.
    basicConfig(level=25, format="%(asctime)s: %(message)s")

    output_file = Path(kwargs["output_file"])
    overwrite = kwargs["output_path_overwrite"]
    already_exists_message = (
        f"Could not save the transcript-to-gene mapping file. File already exists at {kwargs['output_file']}. "
        "Use the `-ow` flag to overwrite."
    )

    # Fail fast: if the result could not be saved anyway, do not parse the annotation first
    # (presumably the expensive part of this command).
    if output_file.exists() and not overwrite:
        warning(already_exists_message)
        return

    log(25, "Creating a transcript-to-gene mapping file.")

    # click hands over the values of a `multiple=True` option as a tuple; the helper is given a list instead.
    target_field = kwargs["target_field"]
    if isinstance(target_field, tuple):
        target_field = list(target_field)

    df = create_transcript_gene_map_from_annotation(
        kwargs["input_file"],
        # Fall back to the default field names when the options were not provided (None or empty).
        source_field=kwargs["source_field"] or "transcript_id",
        target_field=target_field or "gene_id",
        keep_biotype=kwargs["keep_biotype"],
    )
    log(25, "Created the transcript-to-gene mapping file. Saving the file...")

    # Check again right before writing so that a file which appeared in the meantime is still not overwritten.
    if output_file.exists() and not overwrite:
        warning(already_exists_message)
        return

    df.to_csv(
        kwargs["output_file"],
        # Comma-separated for a `.csv` output path, tab-separated for everything else.
        sep=("," if kwargs["output_file"].endswith(".csv") else "\t"),
        index=False,
    )
    log(25, f"Saved the transcript-to-gene mapping file to {kwargs['output_file']}.")