Source code for pytximport.importers._read_piscem

from pathlib import Path
from typing import Literal, Union

import numpy as np
import pandas as pd

from ..definitions import InferentialReplicates, TranscriptData
from ._read_tsv import read_tsv



[docs]
def read_inferential_replicates_piscem(
    file_path: Union[str, Path],
) -> InferentialReplicates:
    """Read inferential replicates from a piscem quantification file.

    The replicates are looked up in a parquet file that is colocated with the quantification file and shares
    its prefix, e.g. ``sample.infreps.pq`` for ``sample.quant`` as well as for ``sample.quant.gz``.

    Args:
        file_path (Union[str, Path]): The path to the quantification file. The file should be a .quant file
            (optionally gzip-compressed, i.e. .quant.gz) that is colocated with the inferential replicates
            file (.infreps.pq).

    Returns:
        InferentialReplicates: The replicates as a float64 array together with their sample variance
            (``ddof=1``) computed along axis 1, i.e. one variance value per row of the replicate table.

    Raises:
        ImportError: If the inferential replicates file does not exist, or if no parquet engine (pyarrow or
            fastparquet) is available to read it.
    """
    quant_path = Path(file_path)

    # `with_suffix` only swaps the *last* suffix, so a compressed "sample.quant.gz" has to lose its ".gz"
    # first; otherwise the lookup would target "sample.quant.infreps.pq" instead of "sample.infreps.pq".
    if quant_path.suffix == ".gz":
        quant_path = quant_path.with_suffix("")

    # Replace the remaining suffix (".quant") with ".infreps.pq"
    infreps_path = quant_path.with_suffix(".infreps.pq")

    # Check whether the file exists. ImportError is kept for backward compatibility with existing callers.
    if not infreps_path.exists():
        raise ImportError(f"The file does not exist: {infreps_path}")

    # Read the inferential replicates
    try:
        # Pandas will attempt to use pyarrow by default with a fallback to fastparquet
        bootstrap_data = pd.read_parquet(infreps_path)
    except ImportError as err:
        # Chain the original error so the underlying missing-engine message is not lost.
        raise ImportError(
            "Could not read inferential replicates. "
            "Either pyarrow or fastparquet is required to read inferential replicates."
        ) from err

    # Convert once and reuse the array for both the variance and the replicates themselves.
    replicates = bootstrap_data.to_numpy(dtype=np.float64)

    return InferentialReplicates(
        variance=np.var(replicates, axis=1, ddof=1),
        replicates=replicates,
    )




[docs]
def read_piscem(
    file_path: Union[str, Path],
    id_column: str = "target_name",
    counts_column: str = "ecount",
    length_column: str = "eeln",
    abundance_column: str = "tpm",
    inferential_replicates: bool = False,
    recompute_counts: bool = False,
) -> TranscriptData:
    """Read a piscem-infer quantification file.

    Args:
        file_path (Union[str, Path]): The path to the quantification file. Its last suffix must be either
            ".quant" or ".gz".
        id_column (str, optional): The column name for the transcript id. Defaults to "target_name".
        counts_column (str, optional): The column name for the counts. Defaults to "ecount".
        length_column (str, optional): The column name for the length. Defaults to "eeln".
        abundance_column (str, optional): The column name for the abundance. Defaults to "tpm".
        inferential_replicates (bool, optional): Whether to read inferential replicates. If true, they are
            stored under the "inferential_replicates" key of the result. Defaults to False.
        recompute_counts (bool, optional): Whether inferential replicates will be used to recompute counts and
            abundances. If true, the counts and abundances will not be read from the file. Defaults to False.

    Returns:
        TranscriptData: The transcript-level expression.

    Raises:
        ImportError: If the file suffix is neither ".quant" nor ".gz". Errors raised while reading the
            inferential replicates are propagated unchanged.
    """
    quant_path = file_path if isinstance(file_path, Path) else Path(file_path)

    # Only the last suffix is inspected, so any path ending in ".quant" or ".gz" passes this guard.
    if quant_path.suffix not in (".quant", ".gz"):
        raise ImportError("Only .quant and .gz files are supported.")

    # The column mapping is forwarded verbatim to the generic TSV reader.
    reader_options = {
        "id_column": id_column,
        "counts_column": counts_column,
        "length_column": length_column,
        "abundance_column": abundance_column,
        "recompute_counts": recompute_counts,
    }
    result = read_tsv(quant_path, **reader_options)

    if not inferential_replicates:
        return result

    # Attach the replicates that live next to the quantification file.
    result["inferential_replicates"] = read_inferential_replicates_piscem(quant_path)
    return result