Source code for pytximport.importers._read_kallisto

from logging import warning
from pathlib import Path
from typing import Optional, Union

import numpy as np
from h5py import File

from ..definitions import InferentialReplicates, TranscriptData
from ..utils._convert_counts_to_tpm import convert_counts_to_tpm
from ._read_tsv import read_tsv



[docs]
def read_inferential_replicates_kallisto(
    file_path: Union[str, Path],
) -> InferentialReplicates:
    """Read inferential replicates from a kallisto quantification file.

    Args:
        file_path (Union[str, Path]): The path to the quantification file.

    Returns:
        InferentialReplicates: The inferential replicates.
    """
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    if file_path.is_dir():
        file_path = file_path / "abundance.h5"

    if file_path.suffix != ".h5":
        raise ImportError("Only .h5 files are supported.")

    if not file_path.exists():
        raise ImportError(f"The file does not exist: {file_path}")

    with File(file_path, "r") as f:
        if "/bootstrap" not in f.keys():
            raise ImportError("No inferential replicates found.")

        bootstraps = f["/bootstrap"]
        target_count = len(bootstraps["bs0"])
        inferential_replicates = np.zeros((target_count, len(bootstraps.keys())))

        for bootstrap_idx in range(len(bootstraps.keys())):
            inferential_replicates[:, bootstrap_idx] = bootstraps[f"bs{bootstrap_idx}"][:]

        vars = np.var(inferential_replicates, axis=1, ddof=1)
        return InferentialReplicates(
            variance=vars,
            replicates=inferential_replicates,
        )




[docs]
def read_kallisto(
    file_path: Union[str, Path],
    id_column: str = "aux/ids",
    counts_column: str = "est_counts",
    length_column: str = "aux/eff_lengths",
    abundance_column: Optional[str] = None,
    inferential_replicates: bool = False,
    recompute_counts: bool = False,
) -> TranscriptData:
    """Read a kallisto quantification file.

    Args:
        file_path (Union[str, Path]): The path to the quantification file.
        id_column (str, optional): The column name for the transcript id. Defaults to "aux/ids".
        counts_column (str, optional): The column name for the counts. Defaults to "est_counts".
        length_column (str, optional): The column name for the length. Defaults to "aux/eff_lengths".
        abundance_column (Optional[str], optional): The column name for the abundance. Defaults to None.
        inferential_replicates (bool, optional): Whether to read inferential replicates. Defaults to False.
        recompute_counts (bool, optional): Whether inferential replicates will be used to recompute counts and
            abundances. If true, the counts and abundances will not be read from the file. Defaults to False.

    Returns:
        TranscriptData: The transcript-level expression.
    """
    if not isinstance(file_path, Path):
        file_path = Path(file_path)

    if file_path.is_dir():
        file_path = file_path / "abundance.h5"

    if file_path.suffix not in [".h5", ".tsv"]:
        raise ImportError("Only .h5 and .tsv files are supported.")

    if not file_path.exists():
        raise ImportError(f"The file does not exist: {file_path}")

    # Read the quantification file
    if file_path.suffix == ".h5":
        with File(file_path, "r") as f:
            # Get the transcript-level expression
            transcript_ids = f.file[id_column][:].astype(str)
            counts = f.file[counts_column][:]
            length = f.file[length_column][:]

            # Get the abundance if it was specified, else it will be calculated
            if abundance_column is not None:
                abundance = f.file[abundance_column][:]

        # Check that the length of the counts, length, and abundances are the same
        assert len(transcript_ids) == len(counts) == len(length), (
            "The transcript ids, counts and length have different length."
        )

        # Calculate the transcript-level TPM if the abundance was not included
        if abundance_column is None:
            warning("Abundance column not provided, calculating TPM.")
            abundance = convert_counts_to_tpm(counts, length)
        else:
            assert len(transcript_ids) == len(abundance), "The transcript ids and abundance have different length."

        # Create a DataFrame with the transcript-level expression
        transcripts = TranscriptData(
            transcript_id=transcript_ids,
            counts=counts,
            length=length,
            abundance=abundance,
            inferential_replicates=None,
        )

        if inferential_replicates:
            transcripts["inferential_replicates"] = read_inferential_replicates_kallisto(
                file_path,
            )

    elif file_path.suffix == ".tsv":
        transcripts = read_tsv(
            file_path,
            id_column=id_column,
            counts_column=counts_column,
            length_column=length_column,
            abundance_column=abundance_column,
            recompute_counts=recompute_counts,
        )

        if inferential_replicates:
            # Check whether there is a matching inferential replicates file
            inferential_replicates_file = file_path.parent / "abundance.h5"

            if inferential_replicates_file.exists():
                transcripts["inferential_replicates"] = read_inferential_replicates_kallisto(
                    inferential_replicates_file,
                )
            else:
                raise ImportError("No inferential replicates found. Please provide the path to the abundance.h5 file.")

    return transcripts