msaexplorer._data_classes
This module contains the dataclasses used to store the data for the MSA explorer. They are not meant to be used outside of this package.
1""" 2this contains the dataclasses used to store the data for the msa explorer. these are not meant to be used outside of this package. 3""" 4 5# built-in 6from dataclasses import dataclass, field 7 8# libs 9from numpy import ndarray 10 11 12@dataclass(frozen=True) 13class SingleNucleotidePolymorphism: 14 """ 15 SNP data for one alignment position. 16 """ 17 18 ref: str 19 alt: dict[str, tuple[float, tuple[str, ...]]] = field(default_factory=dict) 20 21 22@dataclass(frozen=True) 23class VariantCollection: 24 """Container for SNPs""" 25 26 chrom: str 27 positions: dict[int, SingleNucleotidePolymorphism] = field(default_factory=dict) 28 29 def __len__(self) -> int: 30 return len(self.positions) 31 32 def __iter__(self): 33 return iter(self.positions) 34 35 def __contains__(self, position: int) -> bool: 36 return position in self.positions 37 38 def __getitem__(self, position: int) -> SingleNucleotidePolymorphism: 39 """Access a SNP by position.""" 40 return self.positions[position] 41 42 43@dataclass(frozen=True) 44class AlignmentStats: 45 """ 46 Generic result container for position-based statistics. 47 """ 48 49 stat_name: str 50 positions: ndarray 51 values: ndarray 52 53 def __post_init__(self): 54 if self.positions.shape != self.values.shape: 55 raise ValueError("positions and values must have the same shape") 56 57 # dunder methods 58 def __len__(self) -> int: 59 return len(self.values) 60 61 def __getitem__(self, index: int) -> float: 62 return self.values[index] 63 64 def __contains__(self, position: int) -> bool: 65 return position in self.positions 66 67 68@dataclass(frozen=True) 69class LengthStats: 70 """Summary statistics for ungapped sequence lengths in an alignment.""" 71 72 n_sequences: int 73 mean_length: float 74 std_length: float 75 min_length: int 76 max_length: int 77 78 79@dataclass(frozen=True) 80class PairwiseDistance: 81 """ 82 Result container for Pairwise distances. Array can either be a 2D (compared to reference) or 3D array. 
83 """ 84 85 reference_id: str | None 86 sequence_ids: list[str] 87 distances: ndarray 88 89 # dunder methods 90 def __len__(self) -> int: 91 return len(self.sequence_ids) 92 93 def __getitem__(self, index: int | str) -> float: 94 """ 95 Different ways to access the distance matrix. 96 - pd[0] -> Distance first sequence to all other sequences 97 - pd['seq_name'] -> Distance of a specific sequence to all other sequences 98 """ 99 if isinstance(index, str): 100 idx = self.sequence_ids.index(index) 101 return self.distances[idx] 102 return self.distances[index] 103 104 def __contains__(self, item: str) -> bool: 105 """Seq ID present""" 106 return item in self.sequence_ids 107 108 109@dataclass(frozen=True) 110class OpenReadingFrame: 111 """ 112 Represents a single conserved ORF detected across an alignment. 113 114 Attributes: 115 orf_id: Unique identifier, e.g. ``'ORF_0'``. 116 location: Main ORF boundaries as a tuple of ``(start, stop)`` pairs 117 (0-based, half-open). Typically a single pair, but may 118 carry additional coordinates for split ORFs. 119 frame: Reading frame (0, 1, or 2). 120 strand: ``'+'`` for forward, ``'-'`` for reverse complement. 121 conservation: Percentage of fully identical alignment columns inside the ORF. 122 internal: Tuple of ``(start, stop)`` pairs for nested (internal) ORFs 123 that share the same stop codon. 124 """ 125 126 orf_id: str 127 location: tuple[tuple[int, int], ...] 128 frame: int 129 strand: str 130 conservation: float 131 internal: tuple[tuple[int, int], ...] 
= field(default_factory=tuple) 132 133 def __post_init__(self): 134 if self.strand not in ('+', '-'): 135 raise ValueError(f"strand must be '+' or '-', got {self.strand!r}") 136 if not (0 <= self.frame <= 2): 137 raise ValueError(f"frame must be 0, 1, or 2, got {self.frame!r}") 138 139 def __len__(self) -> int: 140 """Length of the main ORF in alignment columns.""" 141 return self.location[0][1] - self.location[0][0] 142 143 def __contains__(self, position: int) -> bool: 144 """True if *position* (0-based) falls inside the main ORF.""" 145 start, stop = self.location[0] 146 return start <= position < stop 147 148 149@dataclass(frozen=True) 150class OrfCollection: 151 """ 152 Ordered collection of `OpenReadingFrame` objects returned by 153 `MSA.get_conserved_orfs` or`MSA.get_non_overlapping_conserved_orfs`. 154 155 The class intentionally mimics a *read-only dict* interface 156 """ 157 158 orfs: tuple[OpenReadingFrame, ...] = field(default_factory=tuple) 159 160 # dict-like interface 161 def keys(self) -> list[str]: 162 """Return ORF identifiers in insertion order.""" 163 return [orf.orf_id for orf in self.orfs] 164 165 def values(self) -> list[OpenReadingFrame]: 166 """Return :class:`OpenReadingFrame` objects in insertion order.""" 167 return list(self.orfs) 168 169 def items(self) -> list[tuple[str, OpenReadingFrame]]: 170 """Return ``(orf_id, OpenReadingFrame)`` pairs in insertion order.""" 171 return [(orf.orf_id, orf) for orf in self.orfs] 172 173 # dunder methods 174 def __len__(self) -> int: 175 return len(self.orfs) 176 177 def __bool__(self) -> bool: 178 return len(self.orfs) > 0 179 180 def __iter__(self): 181 """Iterate over ORF identifiers (mirrors ``dict.__iter__``).""" 182 return iter(orf.orf_id for orf in self.orfs) 183 184 def __getitem__(self, key: str | int) -> OpenReadingFrame: 185 """ 186 Access an ORF by identifier string or integer index. 
187 188 Examples:: 189 orfs['ORF_0'] # by identifier 190 orfs[0] # by index 191 """ 192 if isinstance(key, int): 193 return self.orfs[key] 194 for orf in self.orfs: 195 if orf.orf_id == key: 196 return orf 197 raise KeyError(key) 198 199 def __contains__(self, orf_id: str) -> bool: 200 """True if an ORF with *orf_id* is present in the collection.""" 201 return any(orf.orf_id == orf_id for orf in self.orfs)
@dataclass(frozen=True)
class SingleNucleotidePolymorphism:
    """
    SNP record for a single alignment position.

    Attributes:
        ref: Reference character at this position.
        alt: Mapping from an alternative character to a
            ``(float, tuple[str, ...])`` pair. Defaults to an empty dict.
            NOTE(review): presumably the frequency and the ids of the
            sequences carrying the variant - confirm against the producer.
    """

    ref: str
    alt: dict[str, tuple[float, tuple[str, ...]]] = field(default_factory=dict)
SNP data for one alignment position.
@dataclass(frozen=True)
class VariantCollection:
    """
    Container for SNPs, keyed by position.

    Attributes:
        chrom: Name stored alongside the SNPs.
        positions: Mapping from position to its SNP record. Defaults to an
            empty dict.
    """

    chrom: str
    positions: dict[int, SingleNucleotidePolymorphism] = field(default_factory=dict)

    def __getitem__(self, position: int) -> SingleNucleotidePolymorphism:
        """Return the SNP stored at *position* (``KeyError`` if absent)."""
        snp = self.positions[position]
        return snp

    def __contains__(self, position: int) -> bool:
        """True if a SNP is stored for *position*."""
        return position in self.positions.keys()

    def __iter__(self):
        """Iterate over the stored positions."""
        return iter(self.positions.keys())

    def __len__(self) -> int:
        """Number of stored positions."""
        return len(self.positions.keys())
Container for SNPs
@dataclass(frozen=True)
class AlignmentStats:
    """
    Position-indexed statistic computed over an alignment.

    Attributes:
        stat_name: Name of the statistic.
        positions: Array of positions.
        values: Array of values; must have the same shape as ``positions``.

    Raises:
        ValueError: If ``positions`` and ``values`` differ in shape.
    """

    stat_name: str
    positions: ndarray
    values: ndarray

    def __post_init__(self):
        # every value needs a matching position
        shapes_match = self.positions.shape == self.values.shape
        if not shapes_match:
            raise ValueError("positions and values must have the same shape")

    # dunder methods
    def __contains__(self, position: int) -> bool:
        """True if *position* occurs in ``positions``."""
        return position in self.positions

    def __getitem__(self, index: int) -> float:
        """Return the entry of ``values`` at *index*."""
        return self.values[index]

    def __len__(self) -> int:
        """Length of ``values``."""
        return len(self.values)
Generic result container for position-based statistics.
@dataclass(frozen=True)
class LengthStats:
    """
    Summary of the ungapped sequence lengths found in an alignment.

    Attributes:
        n_sequences: Number of sequences.
        mean_length: Mean length.
        std_length: Standard deviation of the lengths.
        min_length: Shortest length.
        max_length: Longest length.
    """

    n_sequences: int
    mean_length: float
    std_length: float
    min_length: int
    max_length: int
Summary statistics for ungapped sequence lengths in an alignment.
80@dataclass(frozen=True) 81class PairwiseDistance: 82 """ 83 Result container for Pairwise distances. Array can either be a 2D (compared to reference) or 3D array. 84 """ 85 86 reference_id: str | None 87 sequence_ids: list[str] 88 distances: ndarray 89 90 # dunder methods 91 def __len__(self) -> int: 92 return len(self.sequence_ids) 93 94 def __getitem__(self, index: int | str) -> float: 95 """ 96 Different ways to access the distance matrix. 97 - pd[0] -> Distance first sequence to all other sequences 98 - pd['seq_name'] -> Distance of a specific sequence to all other sequences 99 """ 100 if isinstance(index, str): 101 idx = self.sequence_ids.index(index) 102 return self.distances[idx] 103 return self.distances[index] 104 105 def __contains__(self, item: str) -> bool: 106 """Seq ID present""" 107 return item in self.sequence_ids
Result container for pairwise distances. The array can be either 2D (compared to reference) or 3D.
@dataclass(frozen=True)
class OpenReadingFrame:
    """
    Represents a single conserved ORF detected across an alignment.

    Attributes:
        orf_id: Unique identifier, e.g. ``'ORF_0'``.
        location: Main ORF boundaries as a tuple of ``(start, stop)`` pairs
            (0-based, half-open). Typically a single pair, but may
            carry additional coordinates for split ORFs.
        frame: Reading frame (0, 1, or 2).
        strand: ``'+'`` for forward, ``'-'`` for reverse complement.
        conservation: Percentage of fully identical alignment columns inside the ORF.
        internal: Tuple of ``(start, stop)`` pairs for nested (internal) ORFs
            that share the same stop codon.

    Raises:
        ValueError: If ``strand`` is not ``'+'``/``'-'`` or ``frame`` is
            outside 0..2.
    """

    orf_id: str
    location: tuple[tuple[int, int], ...]
    frame: int
    strand: str
    conservation: float
    internal: tuple[tuple[int, int], ...] = field(default_factory=tuple)

    def __post_init__(self):
        if self.strand not in ('+', '-'):
            raise ValueError(f"strand must be '+' or '-', got {self.strand!r}")
        if not (0 <= self.frame <= 2):
            raise ValueError(f"frame must be 0, 1, or 2, got {self.frame!r}")

    def __len__(self) -> int:
        """
        Length of the main ORF (first ``location`` pair) in alignment columns.

        Returns 0 when ``location`` is empty instead of raising ``IndexError``.
        """
        if not self.location:
            return 0
        main = self.location[0]
        return main[1] - main[0]

    def __contains__(self, position: int) -> bool:
        """
        True if *position* (0-based) falls inside the main ORF.

        Only the first ``location`` pair is consulted; an empty ``location``
        contains nothing.
        """
        if not self.location:
            return False
        start, stop = self.location[0]
        return start <= position < stop
Represents a single conserved ORF detected across an alignment.
Attributes:
orf_id: Unique identifier, e.g. 'ORF_0'.
location: Main ORF boundaries as a tuple of (start, stop) pairs
(0-based, half-open). Typically a single pair, but may
carry additional coordinates for split ORFs.
frame: Reading frame (0, 1, or 2).
strand: '+' for forward, '-' for reverse complement.
conservation: Percentage of fully identical alignment columns inside the ORF.
internal: Tuple of (start, stop) pairs for nested (internal) ORFs
that share the same stop codon.
@dataclass(frozen=True)
class OrfCollection:
    """
    Ordered collection of `OpenReadingFrame` objects returned by
    `MSA.get_conserved_orfs` or `MSA.get_non_overlapping_conserved_orfs`.

    The class intentionally mimics a *read-only dict* interface, with the
    ORF identifier acting as key.
    """

    orfs: tuple[OpenReadingFrame, ...] = field(default_factory=tuple)

    # dict-like interface
    def keys(self) -> list[str]:
        """Return ORF identifiers in insertion order."""
        identifiers: list[str] = []
        for entry in self.orfs:
            identifiers.append(entry.orf_id)
        return identifiers

    def values(self) -> list[OpenReadingFrame]:
        """Return :class:`OpenReadingFrame` objects in insertion order."""
        return [*self.orfs]

    def items(self) -> list[tuple[str, OpenReadingFrame]]:
        """Return ``(orf_id, OpenReadingFrame)`` pairs in insertion order."""
        pairs: list[tuple[str, OpenReadingFrame]] = []
        for entry in self.orfs:
            pairs.append((entry.orf_id, entry))
        return pairs

    # dunder methods
    def __len__(self) -> int:
        """Number of ORFs in the collection."""
        return len(self.orfs)

    def __bool__(self) -> bool:
        """True if the collection holds at least one ORF."""
        return bool(self.orfs)

    def __iter__(self):
        """Iterate over ORF identifiers (mirrors ``dict.__iter__``)."""
        return (entry.orf_id for entry in self.orfs)

    def __getitem__(self, key: str | int) -> OpenReadingFrame:
        """
        Access an ORF by identifier string or integer index.

        Examples::
            orfs['ORF_0']   # by identifier
            orfs[0]         # by index

        Raises ``KeyError`` when no ORF carries the identifier; an integer
        out of range raises ``IndexError`` from the underlying tuple.
        """
        if isinstance(key, int):
            return self.orfs[key]
        # first ORF whose identifier matches wins
        found = next((entry for entry in self.orfs if entry.orf_id == key), None)
        if found is None:
            raise KeyError(key)
        return found

    def __contains__(self, orf_id: str) -> bool:
        """True if an ORF with *orf_id* is present in the collection."""
        for entry in self.orfs:
            if entry.orf_id == orf_id:
                return True
        return False
Ordered collection of OpenReadingFrame objects returned by
MSA.get_conserved_orfs or MSA.get_non_overlapping_conserved_orfs.
The class intentionally mimics a read-only dict interface
def keys(self) -> list[str]:
    """Collect the ``orf_id`` of every stored ORF, in insertion order."""
    identifiers: list[str] = []
    for entry in self.orfs:
        identifiers.append(entry.orf_id)
    return identifiers
Return ORF identifiers in insertion order.
def values(self) -> list[OpenReadingFrame]:
    """Return the stored :class:`OpenReadingFrame` objects as a new list, in insertion order."""
    return [*self.orfs]
Return OpenReadingFrame objects in insertion order.