msaexplorer._data_classes

This module contains the dataclasses used to store data for the MSA explorer. They are not meant to be used outside of this package.

  1"""
  2this contains the dataclasses used to store the data for the msa explorer. these are not meant to be used outside of this package.
  3"""
  4
  5# built-in
  6from dataclasses import dataclass, field
  7
  8# libs
  9from numpy import ndarray
 10
 11
 12@dataclass(frozen=True)
 13class SingleNucleotidePolymorphism:
 14    """
 15    SNP data for one alignment position.
 16    """
 17
 18    ref: str
 19    alt: dict[str, tuple[float, tuple[str, ...]]] = field(default_factory=dict)
 20
 21
 22@dataclass(frozen=True)
 23class VariantCollection:
 24    """Container for SNPs"""
 25
 26    chrom: str
 27    positions: dict[int, SingleNucleotidePolymorphism] = field(default_factory=dict)
 28
 29    def __len__(self) -> int:
 30        return len(self.positions)
 31
 32    def __iter__(self):
 33        return iter(self.positions)
 34
 35    def __contains__(self, position: int) -> bool:
 36        return position in self.positions
 37    
 38    def __getitem__(self, position: int) -> SingleNucleotidePolymorphism:
 39        """Access a SNP by position."""
 40        return self.positions[position]
 41
 42
 43@dataclass(frozen=True)
 44class AlignmentStats:
 45    """
 46    Generic result container for position-based statistics.
 47    """
 48
 49    stat_name: str
 50    positions: ndarray
 51    values: ndarray
 52
 53    def __post_init__(self):
 54        if self.positions.shape != self.values.shape:
 55            raise ValueError("positions and values must have the same shape")
 56
 57    # dunder methods
 58    def __len__(self) -> int:
 59        return len(self.values)
 60
 61    def __getitem__(self, index: int) -> float:
 62        return self.values[index]
 63
 64    def __contains__(self, position: int) -> bool:
 65        return position in self.positions
 66
 67
 68@dataclass(frozen=True)
 69class LengthStats:
 70    """Summary statistics for ungapped sequence lengths in an alignment."""
 71
 72    n_sequences: int
 73    mean_length: float
 74    std_length: float
 75    min_length: int
 76    max_length: int
 77
 78
 79@dataclass(frozen=True)
 80class PairwiseDistance:
 81    """
 82    Result container for Pairwise distances. Array can either be a 2D (compared to reference) or 3D array.
 83    """
 84
 85    reference_id: str | None
 86    sequence_ids: list[str]
 87    distances: ndarray
 88
 89    # dunder methods
 90    def __len__(self) -> int:
 91        return len(self.sequence_ids)
 92
 93    def __getitem__(self, index: int | str) -> float:
 94        """
 95        Different ways to access the distance matrix.
 96        - pd[0] -> Distance first sequence to all other sequences
 97        - pd['seq_name'] -> Distance of a specific sequence to all other sequences
 98        """
 99        if isinstance(index, str):
100            idx = self.sequence_ids.index(index)
101            return self.distances[idx]
102        return self.distances[index]
103
104    def __contains__(self, item: str) -> bool:
105        """Seq ID present"""
106        return item in self.sequence_ids
107
108
@dataclass(frozen=True)
class OpenReadingFrame:
    """
    Immutable description of one conserved ORF detected across an alignment.

    Attributes:
        orf_id:       Unique identifier, e.g. ``'ORF_0'``.
        location:     Tuple of ``(start, stop)`` pairs (0-based, half-open).
                      The first pair is the main ORF; further pairs may carry
                      additional coordinates for split ORFs.
        frame:        Reading frame (0, 1, or 2).
        strand:       ``'+'`` for forward, ``'-'`` for reverse complement.
        conservation: Percentage of fully identical alignment columns inside
                      the ORF.
        internal:     Tuple of ``(start, stop)`` pairs for nested (internal)
                      ORFs that share the same stop codon; empty by default.

    Raises:
        ValueError: On construction, if ``strand`` is not ``'+'``/``'-'`` or
            ``frame`` lies outside ``0..2``.
    """

    orf_id: str
    location: tuple[tuple[int, int], ...]
    frame: int
    strand: str
    conservation: float
    internal: tuple[tuple[int, int], ...] = field(default_factory=tuple)

    def __post_init__(self):
        """Validate ``strand`` first, then ``frame``."""
        allowed_strands = ('+', '-')
        if self.strand not in allowed_strands:
            raise ValueError(f"strand must be '+' or '-', got {self.strand!r}")
        frame_in_range = 0 <= self.frame <= 2
        if not frame_in_range:
            raise ValueError(f"frame must be 0, 1, or 2, got {self.frame!r}")

    def __len__(self) -> int:
        """Length of the main ORF (first ``location`` pair) in alignment columns."""
        main_segment = self.location[0]
        return main_segment[1] - main_segment[0]

    def __contains__(self, position: int) -> bool:
        """True if *position* (0-based) lies inside the main ORF; stop is exclusive."""
        begin, end = self.location[0]
        return begin <= position < end
143    def __contains__(self, position: int) -> bool:
144        """True if *position* (0-based) falls inside the main ORF."""
145        start, stop = self.location[0]
146        return start <= position < stop
147
148
@dataclass(frozen=True)
class OrfCollection:
    """
    Ordered collection of `OpenReadingFrame` objects returned by
    `MSA.get_conserved_orfs` or `MSA.get_non_overlapping_conserved_orfs`.

    The class intentionally mimics a *read-only dict* interface keyed by
    ``orf_id``; in addition, ``[]`` accepts an integer position.

    Attributes:
        orfs: The stored ORFs, in order; an empty tuple when omitted.
    """

    orfs: tuple[OpenReadingFrame, ...] = field(default_factory=tuple)

    # dict-like interface
    def keys(self) -> list[str]:
        """Return ORF identifiers in insertion order."""
        return [orf.orf_id for orf in self.orfs]

    def values(self) -> list[OpenReadingFrame]:
        """Return :class:`OpenReadingFrame` objects in insertion order."""
        return list(self.orfs)

    def items(self) -> list[tuple[str, OpenReadingFrame]]:
        """Return ``(orf_id, OpenReadingFrame)`` pairs in insertion order."""
        return [(orf.orf_id, orf) for orf in self.orfs]

    # dunder methods
    def __len__(self) -> int:
        """Number of stored ORFs."""
        return len(self.orfs)

    def __bool__(self) -> bool:
        """True if at least one ORF is stored."""
        return len(self.orfs) > 0

    def __iter__(self):
        """Iterate over ORF identifiers (mirrors ``dict.__iter__``)."""
        return (orf.orf_id for orf in self.orfs)

    def __getitem__(self, key: str | int) -> OpenReadingFrame:
        """
        Access an ORF by identifier string or integer index.

        Examples::
            orfs['ORF_0']   # by identifier
            orfs[0]         # by index

        Any index-like object (one exposing ``__index__``, e.g. a NumPy
        integer) is treated as a position, not only built-in ``int``.

        Raises:
            IndexError: If an integer position is out of range.
            KeyError:   If no stored ORF has identifier *key*.
        """
        # Strings never define __index__, so identifiers fall through to the
        # scan below, while every integer-like key is handled positionally.
        if hasattr(key, "__index__"):
            return self.orfs[key]
        for orf in self.orfs:
            if orf.orf_id == key:
                return orf
        raise KeyError(key)

    def __contains__(self, orf_id: str) -> bool:
        """True if an ORF with *orf_id* is present in the collection."""
        return any(orf.orf_id == orf_id for orf in self.orfs)
@dataclass(frozen=True)
class SingleNucleotidePolymorphism:
@dataclass(frozen=True)
class SingleNucleotidePolymorphism:
    """
    Immutable SNP record for a single alignment position.

    Attributes:
        ref: Reference state at this position (a string).
        alt: Dict keyed by string whose values are ``(float, tuple of str)``
             pairs; a fresh empty dict is created when omitted.
             Presumably alternative character -> (frequency, IDs of the
             sequences carrying it) -- TODO confirm against the producer.

    Note:
        ``frozen=True`` only blocks re-assigning ``ref``/``alt``; the dict
        stored in ``alt`` is itself still mutable.
    """

    ref: str
    alt: dict[str, tuple[float, tuple[str, ...]]] = field(default_factory=dict)

SNP data for one alignment position.

SingleNucleotidePolymorphism(ref: str, alt: dict[str, tuple[float, tuple[str, ...]]] = <factory>)
ref: str
alt: dict[str, tuple[float, tuple[str, ...]]]
@dataclass(frozen=True)
class VariantCollection:
@dataclass(frozen=True)
class VariantCollection:
    """
    Immutable, mapping-like container for SNPs.

    Attributes:
        chrom:     Name stored with the SNPs (presumably the sequence or
                   chromosome they belong to -- TODO confirm).
        positions: Dict from integer position to its SNP record; a fresh
                   empty dict is created when omitted.

    ``len()``, iteration, ``in`` and ``[]`` are all forwarded to ``positions``.
    """

    chrom: str
    positions: dict[int, SingleNucleotidePolymorphism] = field(default_factory=dict)

    def __getitem__(self, position: int) -> SingleNucleotidePolymorphism:
        """Return the SNP stored under *position* (plain dict lookup)."""
        snp = self.positions[position]
        return snp

    def __contains__(self, position: int) -> bool:
        """True if *position* is a key of ``positions``."""
        has_snp = position in self.positions
        return has_snp

    def __iter__(self):
        """Iterate over the stored positions (the dict keys)."""
        return iter(self.positions.keys())

    def __len__(self) -> int:
        """Number of stored positions."""
        n_positions = len(self.positions)
        return n_positions

Container for SNPs

VariantCollection( chrom: str, positions: dict[int, SingleNucleotidePolymorphism] = <factory>)
chrom: str
positions: dict[int, SingleNucleotidePolymorphism]
@dataclass(frozen=True)
class AlignmentStats:
@dataclass(frozen=True)
class AlignmentStats:
    """
    Immutable container pairing positions with one statistic value each.

    Attributes:
        stat_name: Name of the statistic held in ``values``.
        positions: Array of positions; must have the same shape as ``values``.
        values:    Array of statistic values.

    Raises:
        ValueError: On construction, if ``positions`` and ``values`` differ
            in shape.
    """

    stat_name: str
    positions: ndarray
    values: ndarray

    def __post_init__(self):
        """Reject mismatching array shapes right after construction."""
        # Guard clause: matching shapes need no further action.
        if self.positions.shape == self.values.shape:
            return
        raise ValueError("positions and values must have the same shape")

    # dunder methods
    def __len__(self) -> int:
        """Length of ``values`` along its first axis."""
        stat_values = self.values
        return len(stat_values)

    def __getitem__(self, index: int) -> float:
        """Return ``values[index]``; *index* addresses the array, not ``positions``."""
        stat_values = self.values
        return stat_values[index]

    def __contains__(self, position: int) -> bool:
        """True if *position* occurs in ``positions``."""
        known_positions = self.positions
        return position in known_positions

Generic result container for position-based statistics.

AlignmentStats(stat_name: str, positions: numpy.ndarray, values: numpy.ndarray)
stat_name: str
positions: numpy.ndarray
values: numpy.ndarray
@dataclass(frozen=True)
class LengthStats:
@dataclass(frozen=True)
class LengthStats:
    """
    Immutable summary of the ungapped sequence lengths in an alignment.

    Attributes:
        n_sequences: Number of sequences summarised.
        mean_length: Mean length.
        std_length:  Standard deviation of the lengths (whether population
                     or sample is not visible here -- TODO confirm).
        min_length:  Shortest length.
        max_length:  Longest length.
    """

    n_sequences: int
    mean_length: float
    std_length: float
    min_length: int
    max_length: int

Summary statistics for ungapped sequence lengths in an alignment.

LengthStats( n_sequences: int, mean_length: float, std_length: float, min_length: int, max_length: int)
n_sequences: int
mean_length: float
std_length: float
min_length: int
max_length: int
@dataclass(frozen=True)
class PairwiseDistance:
 80@dataclass(frozen=True)
 81class PairwiseDistance:
 82    """
 83    Result container for Pairwise distances. Array can either be a 2D (compared to reference) or 3D array.
 84    """
 85
 86    reference_id: str | None
 87    sequence_ids: list[str]
 88    distances: ndarray
 89
 90    # dunder methods
 91    def __len__(self) -> int:
 92        return len(self.sequence_ids)
 93
 94    def __getitem__(self, index: int | str) -> float:
 95        """
 96        Different ways to access the distance matrix.
 97        - pd[0] -> Distance first sequence to all other sequences
 98        - pd['seq_name'] -> Distance of a specific sequence to all other sequences
 99        """
100        if isinstance(index, str):
101            idx = self.sequence_ids.index(index)
102            return self.distances[idx]
103        return self.distances[index]
104
105    def __contains__(self, item: str) -> bool:
106        """Seq ID present"""
107        return item in self.sequence_ids

Result container for pairwise distances. The array can be either a 2D array (compared to a reference) or a 3D array.

PairwiseDistance( reference_id: str | None, sequence_ids: list[str], distances: numpy.ndarray)
reference_id: str | None
sequence_ids: list[str]
distances: numpy.ndarray
@dataclass(frozen=True)
class OpenReadingFrame:
@dataclass(frozen=True)
class OpenReadingFrame:
    """
    Immutable description of one conserved ORF detected across an alignment.

    Attributes:
        orf_id:       Unique identifier, e.g. ``'ORF_0'``.
        location:     Tuple of ``(start, stop)`` pairs (0-based, half-open).
                      The first pair is the main ORF; further pairs may carry
                      additional coordinates for split ORFs.
        frame:        Reading frame (0, 1, or 2).
        strand:       ``'+'`` for forward, ``'-'`` for reverse complement.
        conservation: Percentage of fully identical alignment columns inside
                      the ORF.
        internal:     Tuple of ``(start, stop)`` pairs for nested (internal)
                      ORFs that share the same stop codon; empty by default.

    Raises:
        ValueError: On construction, if ``strand`` is not ``'+'``/``'-'`` or
            ``frame`` lies outside ``0..2``.
    """

    orf_id: str
    location: tuple[tuple[int, int], ...]
    frame: int
    strand: str
    conservation: float
    internal: tuple[tuple[int, int], ...] = field(default_factory=tuple)

    def __post_init__(self):
        """Validate ``strand`` first, then ``frame``."""
        allowed_strands = ('+', '-')
        if self.strand not in allowed_strands:
            raise ValueError(f"strand must be '+' or '-', got {self.strand!r}")
        frame_in_range = 0 <= self.frame <= 2
        if not frame_in_range:
            raise ValueError(f"frame must be 0, 1, or 2, got {self.frame!r}")

    def __len__(self) -> int:
        """Length of the main ORF (first ``location`` pair) in alignment columns."""
        main_segment = self.location[0]
        return main_segment[1] - main_segment[0]

    def __contains__(self, position: int) -> bool:
        """True if *position* (0-based) lies inside the main ORF; stop is exclusive."""
        begin, end = self.location[0]
        return begin <= position < end

Represents a single conserved ORF detected across an alignment.

Attributes:
- orf_id: Unique identifier, e.g. 'ORF_0'.
- location: Main ORF boundaries as a tuple of (start, stop) pairs (0-based, half-open). Typically a single pair, but may carry additional coordinates for split ORFs.
- frame: Reading frame (0, 1, or 2).
- strand: '+' for forward, '-' for reverse complement.
- conservation: Percentage of fully identical alignment columns inside the ORF.
- internal: Tuple of (start, stop) pairs for nested (internal) ORFs that share the same stop codon.

OpenReadingFrame( orf_id: str, location: tuple[tuple[int, int], ...], frame: int, strand: str, conservation: float, internal: tuple[tuple[int, int], ...] = <factory>)
orf_id: str
location: tuple[tuple[int, int], ...]
frame: int
strand: str
conservation: float
internal: tuple[tuple[int, int], ...]
@dataclass(frozen=True)
class OrfCollection:
@dataclass(frozen=True)
class OrfCollection:
    """
    Ordered collection of `OpenReadingFrame` objects returned by
    `MSA.get_conserved_orfs` or `MSA.get_non_overlapping_conserved_orfs`.

    The class intentionally mimics a *read-only dict* interface keyed by
    ``orf_id``; in addition, ``[]`` accepts an integer position.

    Attributes:
        orfs: The stored ORFs, in order; an empty tuple when omitted.
    """

    orfs: tuple[OpenReadingFrame, ...] = field(default_factory=tuple)

    # dict-like interface
    def keys(self) -> list[str]:
        """Return ORF identifiers in insertion order."""
        return [orf.orf_id for orf in self.orfs]

    def values(self) -> list[OpenReadingFrame]:
        """Return :class:`OpenReadingFrame` objects in insertion order."""
        return list(self.orfs)

    def items(self) -> list[tuple[str, OpenReadingFrame]]:
        """Return ``(orf_id, OpenReadingFrame)`` pairs in insertion order."""
        return [(orf.orf_id, orf) for orf in self.orfs]

    # dunder methods
    def __len__(self) -> int:
        """Number of stored ORFs."""
        return len(self.orfs)

    def __bool__(self) -> bool:
        """True if at least one ORF is stored."""
        return len(self.orfs) > 0

    def __iter__(self):
        """Iterate over ORF identifiers (mirrors ``dict.__iter__``)."""
        return (orf.orf_id for orf in self.orfs)

    def __getitem__(self, key: str | int) -> OpenReadingFrame:
        """
        Access an ORF by identifier string or integer index.

        Examples::
            orfs['ORF_0']   # by identifier
            orfs[0]         # by index

        Any index-like object (one exposing ``__index__``, e.g. a NumPy
        integer) is treated as a position, not only built-in ``int``.

        Raises:
            IndexError: If an integer position is out of range.
            KeyError:   If no stored ORF has identifier *key*.
        """
        # Strings never define __index__, so identifiers fall through to the
        # scan below, while every integer-like key is handled positionally.
        if hasattr(key, "__index__"):
            return self.orfs[key]
        for orf in self.orfs:
            if orf.orf_id == key:
                return orf
        raise KeyError(key)

    def __contains__(self, orf_id: str) -> bool:
        """True if an ORF with *orf_id* is present in the collection."""
        return any(orf.orf_id == orf_id for orf in self.orfs)

Ordered collection of OpenReadingFrame objects returned by MSA.get_conserved_orfs or MSA.get_non_overlapping_conserved_orfs.

The class intentionally mimics a read-only dict interface

OrfCollection( orfs: tuple[OpenReadingFrame, ...] = <factory>)
orfs: tuple[OpenReadingFrame, ...]
def keys(self) -> list[str]:
162    def keys(self) -> list[str]:
163        """Return ORF identifiers in insertion order."""
164        return [orf.orf_id for orf in self.orfs]

Return ORF identifiers in insertion order.

def values(self) -> list[OpenReadingFrame]:
166    def values(self) -> list[OpenReadingFrame]:
167        """Return :class:`OpenReadingFrame` objects in insertion order."""
168        return list(self.orfs)

Return OpenReadingFrame objects in insertion order.

def items(self) -> list[tuple[str, OpenReadingFrame]]:
170    def items(self) -> list[tuple[str, OpenReadingFrame]]:
171        """Return ``(orf_id, OpenReadingFrame)`` pairs in insertion order."""
172        return [(orf.orf_id, orf) for orf in self.orfs]

Return (orf_id, OpenReadingFrame) pairs in insertion order.