# Source code for pytximport.importers._read_tsv

import importlib.util
from logging import warning
from pathlib import Path
from typing import Literal, Optional, Union, cast

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike, DTypeLike
from pandas._typing import DtypeArg

from ..definitions import TranscriptData
from ..utils._convert_counts_to_tpm import convert_counts_to_tpm



# [docs]
def parse_dataframe(
    transcript_dataframe: pd.DataFrame,
    id_column: str,
    counts_column: str,
    length_column: str,
    abundance_column: Optional[str] = None,
    recompute_counts: bool = False,
) -> TranscriptData:
    """Parse a DataFrame with the transcript-level expression.

    Args:
        transcript_dataframe (pd.DataFrame): The DataFrame with the transcript-level expression.
        id_column (str): The column name for the transcript id.
        counts_column (str): The column name for the counts.
        length_column (str): The column name for the length.
        abundance_column (Optional[str], optional): The column name for the abundance. If None, the abundance is
            computed from the counts and the length via `convert_counts_to_tpm`. Defaults to None.
        recompute_counts (bool, optional): Whether inferential replicates will be used to recompute counts and
            abundances. If true, the counts and abundances will not be read from the DataFrame and are returned as
            None. Defaults to False.

    Returns:
        TranscriptData: The transcript-level expression.

    Raises:
        AssertionError: If the id or length column is missing, or, when `recompute_counts` is False, if the counts
            column or the provided abundance column is missing.
    """
    available_columns = transcript_dataframe.columns

    def require_column(column: str, description: str) -> None:
        # Raised explicitly rather than through an `assert` statement so that the check still runs when Python is
        # started with `-O`; the exception type and message are the same as before.
        if column not in available_columns:
            raise AssertionError(f"Could not find the {description} column `{column}`.")

    # The id and the length are always required
    require_column(id_column, "transcript id")
    require_column(length_column, "length")

    length = np.asarray(transcript_dataframe[length_column].values)

    counts: Optional[ArrayLike] = None
    abundance: Optional[ArrayLike] = None

    # Counts and abundance are only taken from the table if they are not recomputed later on
    if not recompute_counts:
        require_column(counts_column, "counts")
        counts = np.asarray(transcript_dataframe[counts_column].values)

        if abundance_column is None:
            # Calculate the transcript-level TPM if the abundance was not included
            warning("Abundance column not provided, calculating TPM.")
            abundance = convert_counts_to_tpm(counts=counts, length=length)
        else:
            require_column(abundance_column, "abundance")
            abundance = np.asarray(transcript_dataframe[abundance_column].values)

    # Bundle the transcript-level expression; the ids are always converted to a list of strings
    return TranscriptData(
        transcript_id=transcript_dataframe[id_column].values.astype(str).tolist(),
        counts=counts,
        length=length,
        abundance=abundance,
        inferential_replicates=None,
    )




# [docs]
def read_tsv(
    file_path: Union[str, Path],
    id_column: str,
    counts_column: str,
    length_column: str,
    abundance_column: Optional[str] = None,
    recompute_counts: bool = False,
) -> TranscriptData:
    """Load a tab-separated quantification file and parse it into transcript-level data.

    Only the requested columns are read. The id column is read as string, all other requested columns as
    64-bit floats. Files with a `.gz` suffix are read as gzip, everything else as uncompressed text.

    Args:
        file_path (Union[str, Path]): The path to the quantification file.
        id_column (str): The column name for the transcript id.
        counts_column (str): The column name for the counts.
        length_column (str): The column name for the length.
        abundance_column (Optional[str], optional): The column name for the abundance. Defaults to None.
        recompute_counts (bool, optional): Whether inferential replicates will be used to recompute counts and
            abundances. If true, the counts and abundances will not be read from the file. Defaults to False.

    Returns:
        TranscriptData: The transcript-level expression.

    Raises:
        ImportError: If the file does not exist.
    """
    quant_path = file_path if isinstance(file_path, Path) else Path(file_path)

    if not quant_path.exists():
        raise ImportError(f"The file does not exist: {quant_path}")

    # Columns to load and the dtype each of them is read with
    selected_columns = [id_column, length_column]
    column_dtypes: dict[str, DTypeLike] = {id_column: str, length_column: np.float64}

    if not recompute_counts:
        selected_columns.append(counts_column)
        column_dtypes[counts_column] = np.float64

        # Only request the abundance column if it was given and is not one of the columns already selected
        if not (abundance_column is None or abundance_column in selected_columns):
            selected_columns.append(abundance_column)
            column_dtypes[abundance_column] = np.float64

    # Use the pyarrow parser when the package can be found, otherwise fall back to the C parser
    engine: Literal["pyarrow", "c"]
    if importlib.util.find_spec("pyarrow") is not None:
        engine = "pyarrow"
    else:
        engine = "c"
        warning("pyarrow is not available, consider installing it to improve import performance.")

    compression = "gzip" if quant_path.suffix == ".gz" else None

    # The first line of the file holds the column names
    quant_table = pd.read_table(
        quant_path,
        header=0,
        sep="\t",
        compression=compression,
        engine=engine,
        usecols=selected_columns,
        dtype=cast(DtypeArg, column_dtypes),
        na_filter=False,
    )

    return parse_dataframe(
        quant_table,
        id_column=id_column,
        counts_column=counts_column,
        length_column=length_column,
        abundance_column=abundance_column,
        recompute_counts=recompute_counts,
    )