Source code for pytximport.importers._read_salmon

import gzip
import json
import os
from pathlib import Path
from typing import Literal, Union

import numpy as np

from ..definitions import InferentialReplicates, TranscriptData
from ._read_tsv import read_tsv


[docs] def read_inferential_replicates_salmon( file_path: Union[str, Path], aux_dir_name: Literal["aux_info", "aux"] = "aux_info", ) -> InferentialReplicates: """Read inferential replicates from a salmon quantification file. Args: file_path (Union[str, Path]): The path to the quantification file. aux_dir_name (Literal["aux_info", "aux"], optional): The name of the aux directory. Defaults to "aux_info". Returns: InferentialReplicates: The inferential replicates. """ if not isinstance(file_path, Path): file_path = Path(file_path) if not file_path.is_dir(): file_path = file_path.parent cmd_info_path = file_path / "cmd_info.json" if not os.path.exists(cmd_info_path): raise ImportError("cmd_info.json not found.") with open(cmd_info_path, "r") as f: cmd_info = json.load(f) if "auxDir" in cmd_info: aux_dir_name = cmd_info["auxDir"] aux_dir = file_path / aux_dir_name if not os.path.exists(aux_dir): raise ImportError("Auxiliary directory not found.") meta_info_path = aux_dir / "meta_info.json" with open(meta_info_path) as f: meta_info = json.load(f) if "salmon_version" in meta_info: assert meta_info["salmon_version"] >= "0.8.0", "Salmon version must be >= 0.8.0 to use inferential replicates." if "sailfish_version" in meta_info: assert meta_info["sailfish_version"] >= "0.9.0", ( "Sailfish version must be >= 0.9.0 to use inferential replicates." ) bootstrap_count = meta_info.get("num_bootstraps", 0) if bootstrap_count == 0: raise ImportError("No bootstraps found.") bootstrap_path = aux_dir / "bootstrap" / "bootstraps.gz" if not os.path.exists(bootstrap_path): raise ImportError("Bootstraps file not found.") if "num_valid_targets" in meta_info: meta_info["num_targets"] = meta_info["num_valid_targets"] target_count = meta_info.get("num_targets", 0) if target_count == 0: raise ImportError("No inferential replicate targets found.") expected_n = target_count * bootstrap_count try: # Try to read as floats with gzip.open(bootstrap_path, "rb") as f: bootstrap_data = np.frombuffer(f.read(), dtype=np.float64, count=expected_n) assert len(bootstrap_data) == expected_n except (AssertionError, ValueError): # Try to read as integers with gzip.open(bootstrap_path, "rb") as f: bootstrap_data = np.frombuffer(f.read(), dtype=np.int32, count=expected_n) bootstrap_data = bootstrap_data.reshape((bootstrap_count, target_count)).T return InferentialReplicates( variance=np.var(bootstrap_data, axis=1, ddof=1), replicates=bootstrap_data, )
[docs] def read_salmon( file_path: Union[str, Path], id_column: str = "Name", counts_column: str = "NumReads", length_column: str = "EffectiveLength", abundance_column: str = "TPM", aux_dir_name: Literal["aux_info", "aux"] = "aux_info", inferential_replicates: bool = False, recompute_counts: bool = False, ) -> TranscriptData: """Read a salmon quantification file. Args: file_path (Union[str, Path]): The path to the quantification file. id_column (str, optional): The column name for the transcript id. Defaults to "Name". counts_column (str, optional): The column name for the counts. Defaults to "NumReads". length_column (str, optional): The column name for the length. Defaults to "EffectiveLength". abundance_column (str, optional): The column name for the abundance. Defaults to "TPM". aux_dir_name (Literal["aux_info", "aux"], optional): The name of the aux directory. Defaults to "aux_info". inferential_replicates (bool, optional): Whether to read inferential replicates. Defaults to False. recompute_counts (bool, optional): Whether inferential replicates will be used to recompute counts and abundances. If true, the counts and abundances will not be read from the file. Defaults to False. Returns: TranscriptData: The transcript-level expression. """ if not isinstance(file_path, Path): file_path = Path(file_path) if file_path.is_dir(): # Add quant.sf to the file path file_path = file_path / "quant.sf" # Check that we are importing a .sf file if not file_path.suffix == ".sf" and not file_path.suffix == ".gz": raise ImportError("Only .sf and .gz files are supported.") transcript_data = read_tsv( file_path, id_column=id_column, counts_column=counts_column, length_column=length_column, abundance_column=abundance_column, recompute_counts=recompute_counts, ) if inferential_replicates: transcript_data["inferential_replicates"] = read_inferential_replicates_salmon( file_path, aux_dir_name=aux_dir_name, ) return transcript_data