msaexplorer.draw

The draw module

The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a matplotlib axes has to be passed to the plotting function.

Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments. The functions will raise the appropriate exception in such a case.

Functions

  1r"""
  2# The draw module
  3
  4The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a
  5`matplotlib axes` has to be passed to the plotting function.
  6
  7Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments.
  8The functions will raise the appropriate exception in such a case.
  9
 10## Functions
 11
 12"""
 13
 14# built-in
 15from itertools import chain
 16from typing import Callable, Dict
 17from copy import deepcopy
 18import os
 19
 20import matplotlib
 21from numpy import ndarray
 22
 23# MSAexplorer
 24from msaexplorer import explore, config
 25
 26# libs
 27import numpy as np
 28import matplotlib.pyplot as plt
 29import matplotlib.patches as patches
 30from matplotlib.cm import ScalarMappable
 31from matplotlib.colors import is_color_like, Normalize, to_rgba
 32from matplotlib.collections import PatchCollection, PolyCollection
 33from matplotlib.text import TextPath
 34from matplotlib.patches import PathPatch
 35from matplotlib.font_manager import FontProperties
 36from matplotlib.transforms import Affine2D
 37
 38
 39# general helper functions
 40def _validate_input_parameters(aln: explore.MSA, ax: plt.Axes, annotation: explore.Annotation | None = None):
 41    """
 42    Validate MSA class and axis.
 43    """
 44    if not isinstance(aln, explore.MSA):
 45        raise ValueError('alignment has to be an MSA class. use explore.MSA() to read in alignment')
 46    if not isinstance(ax, plt.Axes):
 47        raise ValueError('ax has to be an matplotlib axis')
 48    if annotation is not None:
 49        if not isinstance(annotation, explore.Annotation):
 50            raise ValueError('annotation has to be an annotation class. use explore.Annotation() to read in annotation')
 51
 52
 53def _format_x_axis(aln: explore.MSA, ax: plt.Axes, show_x_label: bool, show_left: bool):
 54    """
 55    General axis formatting.
 56    """
 57    ax.set_xlim(
 58        (aln.zoom[0] - 0.5, aln.zoom[0] + aln.length - 0.5) if aln.zoom is not None else (-0.5, aln.length - 0.5)
 59    )
 60    ax.spines['top'].set_visible(False)
 61    ax.spines['right'].set_visible(False)
 62    if show_x_label:
 63        ax.set_xlabel('alignment position')
 64    if not show_left:
 65        ax.spines['left'].set_visible(False)
 66
 67
 68def _find_stretches(row, non_nan_only=False) -> list[tuple[int, int, int]] | list[tuple[int, int]] :
 69    """
 70    Finds consecutive stretches of values in an array, with an option to exclude NaN stretches.
 71
 72    :param row: NumPy array of values
 73    :param non_nan_only: If True, only returns stretches of non-NaN values.
 74    :return: List of stretches (start, end, value at start) for all values or non-NaN values only.
 75    """
 76    if row.size == 0:
 77        return []
 78
 79    if non_nan_only:
 80        # Create a boolean mask for non-NaN values
 81        non_nan_mask = ~np.isnan(row)
 82        # Find changes in the mask
 83        changes = np.diff(non_nan_mask.astype(int)) != 0
 84        change_idx = np.nonzero(changes)[0]
 85        starts = np.concatenate(([0], change_idx + 1))
 86        ends = np.concatenate((change_idx, [len(row) - 1]))
 87
 88        # Return only stretches that start with non-NaN values
 89        return [(start, end) for start, end in zip(starts, ends) if non_nan_mask[start]]
 90    else:
 91        # Find change points: where adjacent cells differ.
 92        changes = np.diff(row) != 0
 93        change_idx = np.nonzero(changes)[0]
 94        starts = np.concatenate(([0], change_idx + 1))
 95        ends = np.concatenate((change_idx, [len(row) - 1]))
 96
 97        return [(start, end, row[start]) for start, end in zip(starts, ends)]
 98
 99
100def _seq_names(aln: explore.MSA, ax: plt.Axes, custom_seq_names: tuple, show_seq_names: bool):
101    """
102    Validate custom names and set show names to True. Format axis accordingly.
103    """
104    if custom_seq_names:
105        show_seq_names = True
106        if not isinstance(custom_seq_names, tuple):
107            raise ValueError('configure your custom names list: custom_names=(name1, name2...)')
108        if len(custom_seq_names) != len(aln.alignment.keys()):
109            raise ValueError('length of sequences not equal to number of custom names')
110    if show_seq_names:
111        ax.yaxis.set_ticks_position('none')
112        ax.set_yticks(np.arange(len(aln.alignment)) + 0.4)
113        if custom_seq_names:
114            ax.set_yticklabels(custom_seq_names[::-1])
115        else:
116            names = [x.split(' ')[0] for x in list(aln.alignment.keys())[::-1]]
117            ax.set_yticklabels(names)
118    else:
119        ax.set_yticks([])
120
121
122def _create_identity_patch(row, aln: explore.MSA, col: list, zoom: tuple[int, int], y_position: float | int, reference_color: str, seq_name: str, identity_color: str | ndarray, show_gaps: bool):
123    """
124    Creates the initial patch.
125    """
126    if show_gaps:
127        # plot a rectangle for parts that do not have gaps
128        for stretch in _find_stretches(row, True):
129            col.append(patches.Rectangle((stretch[0] + zoom[0] - 0.5, y_position), stretch[1] - stretch[0] + 1, 0.8,
130                                                           facecolor=reference_color if seq_name == aln.reference_id else identity_color
131                                                           )
132
133                                         )
134    # just plot a rectangle
135    else:
136        col.append(patches.Rectangle((zoom[0] - 0.5, y_position), zoom[1] - zoom[0], 0.8,
137                                     facecolor=reference_color if seq_name == aln.reference_id else identity_color
138                                     )
139                   )
140
141
142def _create_polygons(stretches: list, identity_values: list | ndarray, zoom: tuple, y_position: int, polygons: list, aln_colors: dict | ScalarMappable, polygon_colors: list, detected_identity_values: set | None = None):
143    """
144    create the individual polygons
145    """
146
147    for start, end, value in stretches:
148        if value not in identity_values:
149            continue
150        if detected_identity_values is not None:
151            detected_identity_values.add(value)
152        width = end + 1 - start
153        # Calculate x coordinates adjusted for zoom and centering
154        x0 = start + zoom[0] - 0.5
155        x1 = x0 + width
156        # Define the rectangle corners
157        rect_coords = [
158            (x0, y_position),
159            (x1, y_position),
160            (x1, y_position + 0.8),
161            (x0, y_position + 0.8),
162            (x0, y_position)
163        ]
164        polygons.append(rect_coords)
165        if type(aln_colors) != ScalarMappable:
166            polygon_colors.append(aln_colors[value]['color'])
167        else:
168            polygon_colors.append(aln_colors.to_rgba(value))
169
170
171def _plot_annotation(annotation_dict: dict, ax: plt.Axes, direction_marker_size: int | None, color: str | ScalarMappable):
172    """
173    Plot annotations
174    :param annotation_dict: dict of annotations
175    :param ax: matplotlib Axes
176    :param direction_marker_size: size of marker
177    :param color: color of annotation (color or scalar)
178    """
179    for annotation in annotation_dict:
180        for locations in annotation_dict[annotation]['location']:
181            x_value = locations[0]
182            length = locations[1] - locations[0]
183            ax.add_patch(
184                patches.FancyBboxPatch(
185                    (x_value, annotation_dict[annotation]['track'] + 1),
186                    length,
187                    0.8,
188                    boxstyle="Round, pad=0",
189                    ec="black",
190                    fc=color.to_rgba(annotation_dict[annotation]['conservation']) if isinstance(color, ScalarMappable) else color,
191                )
192            )
193            if direction_marker_size is not None:
194                if annotation_dict[annotation]['strand'] == '-':
195                    marker = '<'
196                else:
197                    marker = '>'
198                ax.plot(x_value + length/2, annotation_dict[annotation]['track'] + 1.4, marker=marker, markersize=direction_marker_size, color='white', markeredgecolor='black')
199
200        # plot linked annotations (such as splicing)
201        if len(annotation_dict[annotation]['location']) > 1:
202            y_value = annotation_dict[annotation]['track'] + 1.4
203            start = None
204            for locations in annotation_dict[annotation]['location']:
205                if start is None:
206                    start = locations[1]
207                    continue
208                ax.plot([start, locations[0]], [y_value, y_value], '--', linewidth=2, color='black')
209                start = locations[1]
210
211
212def _add_track_positions(annotation_dic):
213    # create a dict and sort
214    annotation_dic = dict(sorted(annotation_dic.items(), key=lambda x: x[1]['location'][0][0]))
215
216    # remember for each track the largest stop
217    track_stops = [0]
218
219    for ann in annotation_dic:
220        flattened_locations = list(chain.from_iterable(annotation_dic[ann]['location']))  # flatten list
221        track = 0
222        # check if a start of a gene is smaller than the stop of the current track
223        # -> move to new track
224        while flattened_locations[0] < track_stops[track]:
225            track += 1
226            # if all prior tracks are potentially causing an overlap
227            # create a new track and break
228            if len(track_stops) <= track:
229                track_stops.append(0)
230                break
231        # in the current track remember the stop of the current gene
232        track_stops[track] = flattened_locations[-1]
233        # and indicate the track in the dict
234        annotation_dic[ann]['track'] = track
235
236    return annotation_dic
237
238
239def _get_contrast_text_color(rgba_color):
240    """
241    compute the brightness of a color
242    """
243    r, g, b, a = rgba_color
244    brightness = (r * 299 + g * 587 + b * 114) / 1000
245    return 'white' if brightness < 0.5 else 'black'
246
247
248def _plot_sequence_text(aln: explore.MSA, seq_name: str, ref_name: str, allways_text: bool, values: ndarray, matrix: ndarray, ax:plt.Axes, zoom: tuple, y_position:int, value_to_skip: int, ref_color:str, show_gaps: bool, cmap: None | ScalarMappable = None):
249    """
250    Plot sequence text - however this will be done even if there is not enough space.
251    Might need some rework in the future.
252    """
253    x_text = 0
254    if seq_name == ref_name:
255        different_cols = np.any((matrix != value_to_skip) & ~np.isnan(matrix), axis=0)
256    else:
257        different_cols = [False]*aln.length
258
259    for idx, (character, value) in enumerate(zip(aln.alignment[seq_name], values)):
260        if value != value_to_skip and character != '-' or seq_name == ref_name and character != '-' or character == '-'and not show_gaps or allways_text and character != '-':
261            # text color
262            if seq_name == ref_name:
263                text_color = _get_contrast_text_color(to_rgba(ref_color))
264            elif cmap is not None:
265                text_color = _get_contrast_text_color(cmap.to_rgba(value))
266            else:
267                text_color = 'black'
268
269            ax.text(
270                x=x_text + zoom[0] if zoom is not None else x_text,
271                y=y_position + 0.4,
272                s=character,
273                fontweight='bold' if different_cols[idx] else 'normal',
274                ha='center',
275                va='center_baseline',
276                c=text_color if value != value_to_skip or seq_name == ref_name else 'dimgrey'
277            )
278        x_text += 1
279
280
281def identity_alignment(aln: explore.MSA, ax: plt.Axes, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)):
282    """
283    Generates an identity alignment overview plot.
284    :param aln: alignment MSA class
285    :param ax: matplotlib axes
286    :param show_title: whether to show title
287    :param show_seq_names: whether to show seq names
288    :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues
289    :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
290    :param custom_seq_names: custom seq names
291    :param reference_color: color of reference sequence
292    :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch
293    :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch
294    :param fancy_gaps: show gaps with a small black bar
295    :param show_mismatches: whether to show mismatches otherwise it will be shown as match
296    :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences
297    :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity
298    :param show_x_label: whether to show x label
299    :param show_legend: whether to show the legend
300    :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html
301    """
302
303    # Validate inputs and colors
304    _validate_input_parameters(aln, ax)
305    if not is_color_like(reference_color):
306        raise ValueError(f'{reference_color} for reference is not a color')
307
308    if color_scheme is not None:
309        if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']:
310            raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity')
311        elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']:
312            raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak')
313
314    # Both options for gaps work hand in hand
315    if fancy_gaps:
316        show_gaps = True
317
318    # Determine zoom
319    zoom = (0, aln.length) if aln.zoom is None else aln.zoom
320
321    # Set up color mapping and identity values
322    aln_colors = config.IDENTITY_COLORS.copy()
323    identity_values = [-1, -2, -3]  # -1 = mismatch, -2 = mask, -3 ambiguity
324    if color_scheme is not None:
325        colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme]
326        identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping
327        # use the standard setting for the index (same as in aln.calc_identity_alignment)
328        # and map the corresponding color scheme to it
329        for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']):
330            aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]}
331
332    # Compute identity alignment
333    identity_aln = aln.calc_identity_alignment(
334        encode_mask=show_mask,
335        encode_gaps=show_gaps,
336        encode_mismatches=show_mismatches,
337        encode_ambiguities=show_ambiguities,
338        encode_each_mismatch_char=True if color_scheme is not None else False
339    )
340
341    # List to store polygons
342    detected_identity_values = {0}
343    polygons, polygon_colors, patch_list = [], [], []
344
345    for i, seq_name in enumerate(aln.alignment):
346        y_position = len(aln.alignment) - i - 1
347        row = identity_aln[i]
348
349        # plot a line below everything
350        if fancy_gaps:
351            ax.hlines(
352                y_position + 0.4,
353                xmin=zoom[0] - 0.5,
354                xmax=zoom[1] + 0.5,
355                color='black',
356                linestyle='-',
357                zorder=0,
358                linewidth=0.75
359            )
360
361        # plot the basic shape per sequence with gaps
362        _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps)
363
364        # find consecutive stretches
365        stretches = _find_stretches(row)
366        # create polygons per stretch
367        _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values)
368
369        # add sequence text
370        if show_identity_sequence or show_sequence_all:
371            _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax,
372                                zoom, y_position, 0, reference_color, show_gaps)
373
374    # Create the LineCollection: each segment is drawn in a single call.
375    ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt'))
376    ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors))
377
378    # custom legend
379    if show_legend:
380        if color_scheme is not None and color_scheme != 'standard':
381            for x in aln_colors:
382                for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]:
383                    if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]:
384                        aln_colors[x]['type'] = group
385                        break
386        # create it
387        handels, labels, detected_groups = [], [], set()
388        for x in aln_colors:
389            if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups:
390                handels.append(
391                    ax.add_line(
392                        plt.Line2D(
393                            [],
394                            [],
395                            color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][
396                                config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0]
397                            ],
398                            marker='s',
399                            markeredgecolor='grey',
400                            linestyle='',
401                            markersize=10))
402                )
403                labels.append(aln_colors[x]['type'])
404                detected_groups.add(aln_colors[x]['type'])
405
406        # plot it
407        ax.legend(
408            handels,
409            labels,
410            loc='lower right',
411            bbox_to_anchor=bbox_to_anchor,
412            ncols=(len(detected_identity_values) + 1) / 2 if aln.aln_type == 'AA' and color_scheme == 'standard' else len(detected_identity_values),
413            frameon=False
414        )
415
416    _seq_names(aln, ax, custom_seq_names, show_seq_names)
417
418    # configure axis
419    ax.set_ylim(0, len(aln.alignment))
420    if show_title:
421        ax.set_title('identity', loc='left')
422    _format_x_axis(aln, ax, show_x_label, show_left=False)
423
424
425def similarity_alignment(aln: explore.MSA, ax: plt.Axes, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1):
426    """
427    Generates a similarity alignment overview plot. Importantly the similarity values are normalized!
428    :param aln: alignment MSA class
429    :param ax: matplotlib axes
430    :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65
431    :param show_title: whether to show title
432    :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues
433    :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
434    :param show_seq_names: whether to show seq names
435    :param custom_seq_names: custom seq names
436    :param reference_color: color of reference sequence
437    :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html
438    :param show_gaps: whether to show gaps otherwise it will be ignored
439    :param fancy_gaps: show gaps with a small black bar
440    :param show_x_label: whether to show x label
441    :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
442    :param cbar_fraction: fraction of the original ax reserved for the legend
443    """
444    # input check
445    _validate_input_parameters(aln, ax)
446
447    # validate colors
448    if not is_color_like(reference_color):
449        raise ValueError(f'{reference_color} for reference is not a color')
450
451    # Both options for gaps work hand in hand
452    if fancy_gaps:
453        show_gaps = True
454
455    # define zoom to plot
456    if aln.zoom is None:
457        zoom = (0, aln.length)
458    else:
459        zoom = aln.zoom
460
461    # get data
462    similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type)  # use normalized values here
463    similarity_aln = similarity_aln.round(2)  # round data for good color mapping
464
465    # determine min max values of the underlying matrix and create cmap
466    min_value, max_value = 0, 1
467    cmap = ScalarMappable(
468        norm=Normalize(
469            vmin=min_value,
470            vmax=max_value
471        ),
472        cmap=plt.get_cmap(cmap)
473    )
474
475    # create similarity values
476    similarity_values = np.arange(start=min_value, stop=max_value, step=0.01)
477    # round it to be absolutely sure that values match with rounded sim alignment
478    similarity_values = similarity_values.round(2)
479
480    # create plot
481    polygons, polygon_colors, patch_list = [], [], []
482
483    for i, seq_name in enumerate(aln.alignment):
484        y_position = len(aln.alignment) - i - 1
485        row = similarity_aln[i]
486
487        # plot a line below everything
488        if fancy_gaps:
489            ax.hlines(
490                y_position + 0.4,
491                xmin=zoom[0] - 0.5,
492                xmax=zoom[1] + 0.5,
493                color='black',
494                linestyle='-',
495                zorder=0,
496                linewidth=0.75
497            )
498
499        # plot the basic shape per sequence with gaps
500        _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name,
501                               cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color,
502                               show_gaps)
503
504        # find consecutive stretches
505        stretches = _find_stretches(row)
506        # create polygons per stretch
507        _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors)
508
509        # add sequence text
510        if show_sequence_all or show_similarity_sequence:
511            _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax,
512                                zoom, y_position, 1, reference_color, show_gaps, cmap)
513
514    # Create the LineCollection: each segment is drawn in a single call.
515    ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt'))
516    ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors))
517
518    # legend
519    if show_cbar:
520        cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction)
521        cbar.set_ticks([min_value, max_value])
522        cbar.set_ticklabels(['low', 'high'])
523
524    # format seq names
525    _seq_names(aln, ax, custom_seq_names, show_seq_names)
526
527    # configure axis
528    ax.set_ylim(0, len(aln.alignment))
529    if show_title:
530        ax.set_title('similarity', loc='left')
531    _format_x_axis(aln, ax, show_x_label, show_left=False)
532
533
534def _moving_average(arr: ndarray, window_size: int, zoom: tuple | None, aln_length: int) -> tuple[ndarray, ndarray]:
535    """
536    Calculate the moving average of an array.
537    :param arr: array with values
538    :param window_size: size of the moving average
539    :param zoom: zoom of the alignment
540    :param aln_length: length of the alignment
541    :return: new array with moving average
542    """
543    if window_size > 1:
544        i = 0
545        moving_averages, plotting_idx = [], []
546        while i < len(arr) + 1:
547            half_window_size = window_size // 2
548            window_left = arr[i - half_window_size : i] if i > half_window_size else arr[0:i]
549            window_right = arr[i: i + half_window_size] if i < len(arr) - half_window_size else arr[i: len(arr)]
550            moving_averages.append((sum(window_left) + sum(window_right)) / (len(window_left) + len(window_right)))
551            plotting_idx.append(i)
552            i += 1
553
554        return np.array(moving_averages), np.array(plotting_idx) if zoom is None else np.array(plotting_idx) + zoom[0]
555    else:
556        return arr, np.arange(zoom[0], zoom[1]) if zoom is not None else np.arange(aln_length)
557
558
559def stat_plot(aln: explore.MSA, ax: plt.Axes, stat_type: str, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True):
560    """
561    Generate a plot for the various alignment stats.
562    :param aln: alignment MSA class
563    :param ax: matplotlib axes
564    :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65)
565    :param line_color: color of the line
566    :param line_width: width of the line
567    :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids
568    :param show_x_label: whether to show the x-axis label
569    :param show_title: whether to show the title
570    """
571
572    # define possible functions to calc here
573    stat_functions: Dict[str, Callable[[], list | ndarray]] = {
574        'gc': aln.calc_gc,
575        'entropy': aln.calc_entropy,
576        'coverage': aln.calc_coverage,
577        'identity': aln.calc_identity_alignment,
578        'similarity': aln.calc_similarity_alignment,
579        'ts tv score': aln.calc_transition_transversion_score
580    }
581
582    if stat_type not in stat_functions:
583        raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys())))
584
585    # input check
586    _validate_input_parameters(aln, ax)
587    if not is_color_like(line_color):
588        raise ValueError('line color is not a color')
589
590    # validate rolling average
591    if rolling_average < 1 or rolling_average > aln.length:
592        raise ValueError('rolling_average must be between 1 and length of sequence')
593
594    # generate input data
595    array = stat_functions[stat_type]()
596
597    if stat_type == 'identity':
598        min_value, max_value = -1, 0
599    elif stat_type == 'ts tv score':
600        min_value, max_value = -1, 1
601    else:
602        min_value, max_value = 0, 1
603    if stat_type in ['identity', 'similarity']:
604        # for the mean nan values get handled as the lowest possible number in the matrix
605        array = np.nan_to_num(array, True, min_value)
606        array = np.mean(array, axis=0)
607    data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length)
608
609    # plot the data
610    ax.fill_between(
611        # this add dummy data left and right for better plotting
612        # otherwise only half of the step is shown
613        np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx,
614        np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data,
615        min_value,
616        linewidth = line_width,
617        edgecolor=line_color,
618        step='mid' if rolling_average == 1 else None,
619        facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none'
620    )
621    if stat_type == 'gc':
622        ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1)
623
624    # format axis
625    ax.set_ylim(min_value, max_value*0.1+max_value)
626    ax.set_yticks([min_value, max_value])
627    if stat_type == 'gc':
628        ax.set_yticklabels(['0', '100'])
629    elif stat_type == 'ts tv score':
630        ax.set_yticklabels(['tv', 'ts'])
631    else:
632        ax.set_yticklabels(['low', 'high'])
633
634    # show title
635    if show_title:
636        ax.set_title(
637            f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position',
638            loc='left'
639        )
640
641    _format_x_axis(aln, ax, show_x_label, show_left=True)
642
643
644def variant_plot(aln: explore.MSA, ax: plt.Axes, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)):
645    """
646    Plots variants.
647    :param aln: alignment MSA class
648    :param ax: matplotlib axes
649    :param lollisize: (stem_size, head_size)
650    :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
651    :param show_x_label:  whether to show the x-axis label
652    :param show_legend: whether to show the legend
653    :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html
654    """
655
656    # validate input
657    _validate_input_parameters(aln, ax)
658    if not isinstance(lollisize, tuple) or len(lollisize) != 2:
659        raise ValueError('lollisize must be tuple of length 2 (stem, head)')
660    for _size in lollisize:
661        if not isinstance(_size, float | int) or _size <= 0:
662            raise ValueError('lollisize must be floats greater than zero')
663
664    # define colors
665    colors = config.CHAR_COLORS[aln.aln_type][color_scheme]
666    # get snps
667    snps = aln.get_snps()
668    # define where to plot (each ref type gets a separate line)
669    ref_y_positions, y_pos, detected_var = {}, 0, set()
670
671    # iterate over snp dict
672    for pos in snps['POS']:
673        for identifier in snps['POS'][pos]:
674            # fill in y pos dict
675            if identifier == 'ref':
676                if snps['POS'][pos]['ref'] not in ref_y_positions:
677                    ref_y_positions[snps['POS'][pos]['ref']] = y_pos
678                    y_pos += 1.1
679                    continue
680            # plot
681            if identifier == 'ALT':
682                for alt in snps['POS'][pos]['ALT']:
683                    ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos,
684                              ymin=ref_y_positions[snps['POS'][pos]['ref']],
685                              ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'],
686                              color=colors[alt],
687                              zorder=100,
688                              linewidth=lollisize[0]
689                              )
690                    ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos,
691                            ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'],
692                            color=colors[alt],
693                            marker='o',
694                            markersize=lollisize[1]
695                            )
696                    detected_var.add(alt)
697
698    # plot hlines
699    for y_char in ref_y_positions:
700        ax.hlines(
701            ref_y_positions[y_char],
702            xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5,
703            xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5,
704            color='black',
705            linestyle='-',
706            zorder=0,
707            linewidth=0.75
708        )
709    # create a custom legend
710    if show_legend:
711        custom_legend = [
712            ax.add_line(
713                plt.Line2D(
714                    [],
715                    [],
716                    color=colors[char],
717                    marker='o',
718                    linestyle='',
719                    markersize=5
720                )
721            ) for char in colors if char in detected_var
722        ]
723        ax.legend(
724            custom_legend,
725            [char for char in colors if char in detected_var],  # ensures correct sorting
726            loc='lower right',
727            title='variant',
728            bbox_to_anchor=bbox_to_anchor,
729            ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var),
730            frameon=False
731        )
732
733    # format axis
734    _format_x_axis(aln, ax, show_x_label, show_left=False)
735    ax.spines['left'].set_visible(False)
736    ax.set_yticks([ref_y_positions[x] for x in ref_y_positions])
737    ax.set_yticklabels(ref_y_positions.keys())
738    ax.set_ylim(0, y_pos)
739    ax.set_ylabel('reference')
740
741
742def orf_plot(aln: explore.MSA, ax: plt.Axes, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1):
743    """
744    Plot conserved ORFs.
745    :param aln: alignment MSA class
746    :param ax: matplotlib axes
747    :param min_length: minimum length of orf
748    :param non_overlapping_orfs: whether to consider overlapping orfs
749    :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html
750    :param direction_marker_size: marker size for direction marker, not shown if marker_size == None
751    :param show_x_label: whether to show the x-axis label
752    :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
753    :param cbar_fraction: fraction of the original ax reserved for the colorbar
754    """
755
756    # normalize colorbar
757    cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap))
758
759    # validate input
760    _validate_input_parameters(aln, ax)
761
762    # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only
763    # searched within the zoomed region)
764    aln_temp = deepcopy(aln)
765    aln_temp.zoom = None
766    if non_overlapping_orfs:
767        annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length)
768    else:
769        annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length)
770
771    # filter dict for zoom
772    if aln.zoom is not None:
773        annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])}
774
775    # add track for plotting
776    _add_track_positions(annotation_dict)
777
778    # plot
779    _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap)
780
781    # legend
782    if show_cbar:
783        cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction)
784        cbar.set_label('% identity')
785        cbar.set_ticks([0, 100])
786
787    # format axis
788    _format_x_axis(aln, ax, show_x_label, show_left=False)
789    ax.set_ylim(bottom=0.8)
790    ax.set_yticks([])
791    ax.set_yticklabels([])
792    ax.set_title('conserved orfs', loc='left')
793
794
795def annotation_plot(aln: explore.MSA, annotation: explore.Annotation | str, ax: plt.Axes, feature_to_plot: str, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False):
796    """
797    Plot annotations from bed, gff or gb files. Are automatically mapped to alignment.
798    :param aln: alignment MSA class
799    :param annotation: annotation class | path to annotation file
800    :param ax: matplotlib axes
801    :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature)
802    :param color: color for the annotation
803    :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True
804    :param show_x_label: whether to show the x-axis label
805    """
806    # helper function
807    def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation:
808        """
809        Parse annotation.
810        :param path: path to annotation
811        :param msa: msa object
812        :return: parsed annotation
813        """
814        if os.path.exists(path):
815            # reset zoom so the annotation is correctly parsed
816            msa_temp = deepcopy(msa)
817            msa_temp.zoom = None
818            return explore.Annotation(msa_temp, path)
819        else:
820            raise FileNotFoundError()
821
822    # parse from path
823    if type(annotation) is str:
824        annotation = parse_annotation_from_string(annotation, aln)
825
826    # validate input
827    _validate_input_parameters(aln, ax, annotation)
828    if not is_color_like(color):
829        raise ValueError(f'{color} for reference is not a color')
830
831    # ignore features to plot for bed files (here it is written into one feature)
832    if annotation.ann_type == 'bed':
833        annotation_dict = annotation.features['region']
834        feature_to_plot = 'bed regions'
835    else:
836        # try to subset the annotation dict
837        try:
838            annotation_dict = annotation.features[feature_to_plot]
839        except KeyError:
840            raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.')
841
842    # plotting and formating
843    _add_track_positions(annotation_dict)
844    _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color)
845    _format_x_axis(aln, ax, show_x_label, show_left=False)
846    ax.set_ylim(bottom=0.8)
847    ax.set_yticks([])
848    ax.set_yticklabels([])
849    ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left')
850
851
852def sequence_logo(aln:explore.MSA, ax:plt.Axes, color_scheme: str = 'standard', plot_type: str = 'logo', show_x_label:bool = False):
853    """
854    Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The
855    logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency
856    is scaled to the information content at each position. --> identical to how Geneious calculates it.
857
858    :param aln: alignment MSA class
859    :param ax: matplotlib axes
860    :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
861    :param plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot
862    :param show_x_label: whether to show the x-axis label
863    """
864
865    _validate_input_parameters(aln, ax)
866    # calc matrix
867    matrix = aln.calc_position_matrix('IC') * aln.calc_position_matrix('PPM')
868    letters_to_plot = list(config.CHAR_COLORS[aln.aln_type]['standard'].keys())[:-1]
869
870    #matrix = np.clip(matrix, 0, None)  # only positive values
871    # plot
872    if plot_type == 'logo':
873        for pos in range(matrix.shape[1]):
874            # sort the positive matrix row values by size
875            items = [(letters_to_plot[i], matrix[i, pos]) for i in range(len(letters_to_plot)) if matrix[i, pos] > 0]
876            items.sort(key=lambda x: x[1])
877            # plot each position
878            y_offset = 0
879            for letter, h in items:
880                tp = TextPath((aln.zoom[0] - 0.325 if aln.zoom is not None else - 0.325, 0), letter, size=1, prop=FontProperties(weight='bold'))
881                bb = tp.get_extents()
882                glyph_height = bb.height if bb.height > 0 else 1e-6  # avoid div by zero
883                scale_to_1 = 1.0 / glyph_height
884
885                transform = (Affine2D()
886                             .scale(1.0, h * scale_to_1)  # scale manually by IC and normalize font
887                             .translate(pos, y_offset)
888                             + ax.transData)
889
890                patch = PathPatch(tp, transform=transform,
891                                  facecolor=config.CHAR_COLORS[aln.aln_type][color_scheme][letter],
892                                  edgecolor='none')
893                ax.add_patch(patch)
894                y_offset += h
895    elif plot_type == 'stacked':
896        y_values = np.zeros(matrix.shape[1])
897        x_values = np.arange(0, matrix.shape[1]) if aln.zoom is None else np.arange(aln.zoom[0], aln.zoom[1])
898        for row in range(matrix.shape[0]):
899            y = matrix[row]
900            ax.fill_between(x_values,
901                            y_values,
902                            y_values + y,
903                            fc=config.CHAR_COLORS[aln.aln_type][color_scheme].get(letters_to_plot[row]),
904                            ec='None',
905                            label=letters_to_plot[row],
906                            step='pre')
907            y_values += y
908
909    # adjust limits & labels
910    _format_x_axis(aln, ax, show_x_label, show_left=True)
911    if aln.aln_type == 'AA':
912        ax.set_ylim(bottom=0, top=5)
913    else:
914        ax.set_ylim(bottom=0, top=2)
915    ax.spines['top'].set_visible(False)
916    ax.spines['right'].set_visible(False)
917    ax.set_ylabel('bits')
def identity_alignment( aln: msaexplorer.explore.MSA, ax: matplotlib.axes._axes.Axes, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask: bool = True, show_gaps: bool = True, fancy_gaps: bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float | int, float | int] | list[float | int, float | int] = (1, 1)):
282def identity_alignment(aln: explore.MSA, ax: plt.Axes, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)):
283    """
284    Generates an identity alignment overview plot.
285    :param aln: alignment MSA class
286    :param ax: matplotlib axes
287    :param show_title: whether to show title
288    :param show_seq_names: whether to show seq names
289    :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues
290    :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
291    :param custom_seq_names: custom seq names
292    :param reference_color: color of reference sequence
293    :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch
294    :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch
295    :param fancy_gaps: show gaps with a small black bar
296    :param show_mismatches: whether to show mismatches otherwise it will be shown as match
297    :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences
298    :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity
299    :param show_x_label: whether to show x label
300    :param show_legend: whether to show the legend
301    :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html
302    """
303
304    # Validate inputs and colors
305    _validate_input_parameters(aln, ax)
306    if not is_color_like(reference_color):
307        raise ValueError(f'{reference_color} for reference is not a color')
308
309    if color_scheme is not None:
310        if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']:
311            raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity')
312        elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']:
313            raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak')
314
315    # Both options for gaps work hand in hand
316    if fancy_gaps:
317        show_gaps = True
318
319    # Determine zoom
320    zoom = (0, aln.length) if aln.zoom is None else aln.zoom
321
322    # Set up color mapping and identity values
323    aln_colors = config.IDENTITY_COLORS.copy()
324    identity_values = [-1, -2, -3]  # -1 = mismatch, -2 = mask, -3 ambiguity
325    if color_scheme is not None:
326        colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme]
327        identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping
328        # use the standard setting for the index (same as in aln.calc_identity_alignment)
329        # and map the corresponding color scheme to it
330        for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']):
331            aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]}
332
333    # Compute identity alignment
334    identity_aln = aln.calc_identity_alignment(
335        encode_mask=show_mask,
336        encode_gaps=show_gaps,
337        encode_mismatches=show_mismatches,
338        encode_ambiguities=show_ambiguities,
339        encode_each_mismatch_char=True if color_scheme is not None else False
340    )
341
342    # List to store polygons
343    detected_identity_values = {0}
344    polygons, polygon_colors, patch_list = [], [], []
345
346    for i, seq_name in enumerate(aln.alignment):
347        y_position = len(aln.alignment) - i - 1
348        row = identity_aln[i]
349
350        # plot a line below everything
351        if fancy_gaps:
352            ax.hlines(
353                y_position + 0.4,
354                xmin=zoom[0] - 0.5,
355                xmax=zoom[1] + 0.5,
356                color='black',
357                linestyle='-',
358                zorder=0,
359                linewidth=0.75
360            )
361
362        # plot the basic shape per sequence with gaps
363        _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps)
364
365        # find consecutive stretches
366        stretches = _find_stretches(row)
367        # create polygons per stretch
368        _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values)
369
370        # add sequence text
371        if show_identity_sequence or show_sequence_all:
372            _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax,
373                                zoom, y_position, 0, reference_color, show_gaps)
374
375    # Create the LineCollection: each segment is drawn in a single call.
376    ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt'))
377    ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors))
378
379    # custom legend
380    if show_legend:
381        if color_scheme is not None and color_scheme != 'standard':
382            for x in aln_colors:
383                for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]:
384                    if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]:
385                        aln_colors[x]['type'] = group
386                        break
387        # create it
388        handels, labels, detected_groups = [], [], set()
389        for x in aln_colors:
390            if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups:
391                handels.append(
392                    ax.add_line(
393                        plt.Line2D(
394                            [],
395                            [],
396                            color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][
397                                config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0]
398                            ],
399                            marker='s',
400                            markeredgecolor='grey',
401                            linestyle='',
402                            markersize=10))
403                )
404                labels.append(aln_colors[x]['type'])
405                detected_groups.add(aln_colors[x]['type'])
406
407        # plot it
408        ax.legend(
409            handels,
410            labels,
411            loc='lower right',
412            bbox_to_anchor=bbox_to_anchor,
413            ncols=(len(detected_identity_values) + 1) / 2 if aln.aln_type == 'AA' and color_scheme == 'standard' else len(detected_identity_values),
414            frameon=False
415        )
416
417    _seq_names(aln, ax, custom_seq_names, show_seq_names)
418
419    # configure axis
420    ax.set_ylim(0, len(aln.alignment))
421    if show_title:
422        ax.set_title('identity', loc='left')
423    _format_x_axis(aln, ax, show_x_label, show_left=False)

Generates an identity alignment overview plot.

Parameters
  • aln: alignment MSA class
  • ax: matplotlib axes
  • show_title: whether to show title
  • show_seq_names: whether to show seq names
  • show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues
  • show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
  • custom_seq_names: custom seq names
  • reference_color: color of reference sequence
  • show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch
  • show_gaps: whether to show gaps otherwise it will be shown as match or mismatch
  • fancy_gaps: show gaps with a small black bar
  • show_mismatches: whether to show mismatches otherwise it will be shown as match
  • show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences
  • color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity
  • show_x_label: whether to show x label
  • show_legend: whether to show the legend
  • bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
def similarity_alignment( aln: msaexplorer.explore.MSA, ax: matplotlib.axes._axes.Axes, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps: bool = True, fancy_gaps: bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1):
426def similarity_alignment(aln: explore.MSA, ax: plt.Axes, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1):
427    """
428    Generates a similarity alignment overview plot. Importantly the similarity values are normalized!
429    :param aln: alignment MSA class
430    :param ax: matplotlib axes
431    :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65
432    :param show_title: whether to show title
433    :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues
434    :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
435    :param show_seq_names: whether to show seq names
436    :param custom_seq_names: custom seq names
437    :param reference_color: color of reference sequence
438    :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html
439    :param show_gaps: whether to show gaps otherwise it will be ignored
440    :param fancy_gaps: show gaps with a small black bar
441    :param show_x_label: whether to show x label
442    :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
443    :param cbar_fraction: fraction of the original ax reserved for the legend
444    """
445    # input check
446    _validate_input_parameters(aln, ax)
447
448    # validate colors
449    if not is_color_like(reference_color):
450        raise ValueError(f'{reference_color} for reference is not a color')
451
452    # Both options for gaps work hand in hand
453    if fancy_gaps:
454        show_gaps = True
455
456    # define zoom to plot
457    if aln.zoom is None:
458        zoom = (0, aln.length)
459    else:
460        zoom = aln.zoom
461
462    # get data
463    similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type)  # use normalized values here
464    similarity_aln = similarity_aln.round(2)  # round data for good color mapping
465
466    # determine min max values of the underlying matrix and create cmap
467    min_value, max_value = 0, 1
468    cmap = ScalarMappable(
469        norm=Normalize(
470            vmin=min_value,
471            vmax=max_value
472        ),
473        cmap=plt.get_cmap(cmap)
474    )
475
476    # create similarity values
477    similarity_values = np.arange(start=min_value, stop=max_value, step=0.01)
478    # round it to be absolutely sure that values match with rounded sim alignment
479    similarity_values = similarity_values.round(2)
480
481    # create plot
482    polygons, polygon_colors, patch_list = [], [], []
483
484    for i, seq_name in enumerate(aln.alignment):
485        y_position = len(aln.alignment) - i - 1
486        row = similarity_aln[i]
487
488        # plot a line below everything
489        if fancy_gaps:
490            ax.hlines(
491                y_position + 0.4,
492                xmin=zoom[0] - 0.5,
493                xmax=zoom[1] + 0.5,
494                color='black',
495                linestyle='-',
496                zorder=0,
497                linewidth=0.75
498            )
499
500        # plot the basic shape per sequence with gaps
501        _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name,
502                               cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color,
503                               show_gaps)
504
505        # find consecutive stretches
506        stretches = _find_stretches(row)
507        # create polygons per stretch
508        _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors)
509
510        # add sequence text
511        if show_sequence_all or show_similarity_sequence:
512            _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax,
513                                zoom, y_position, 1, reference_color, show_gaps, cmap)
514
515    # Create the LineCollection: each segment is drawn in a single call.
516    ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt'))
517    ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors))
518
519    # legend
520    if show_cbar:
521        cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction)
522        cbar.set_ticks([min_value, max_value])
523        cbar.set_ticklabels(['low', 'high'])
524
525    # format seq names
526    _seq_names(aln, ax, custom_seq_names, show_seq_names)
527
528    # configure axis
529    ax.set_ylim(0, len(aln.alignment))
530    if show_title:
531        ax.set_title('similarity', loc='left')
532    _format_x_axis(aln, ax, show_x_label, show_left=False)

Generates a similarity alignment overview plot. Importantly the similarity values are normalized!

Parameters
  • aln: alignment MSA class
  • ax: matplotlib axes
  • matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65
  • show_title: whether to show title
  • show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues
  • show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
  • show_seq_names: whether to show seq names
  • custom_seq_names: custom seq names
  • reference_color: color of reference sequence
  • cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
  • show_gaps: whether to show gaps otherwise it will be ignored
  • fancy_gaps: show gaps with a small black bar
  • show_x_label: whether to show x label
  • show_cbar: whether to show the legend - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
  • cbar_fraction: fraction of the original ax reserved for the legend
def stat_plot( aln: msaexplorer.explore.MSA, ax: matplotlib.axes._axes.Axes, stat_type: str, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True):
560def stat_plot(aln: explore.MSA, ax: plt.Axes, stat_type: str, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True):
561    """
562    Generate a plot for the various alignment stats.
563    :param aln: alignment MSA class
564    :param ax: matplotlib axes
565    :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65)
566    :param line_color: color of the line
567    :param line_width: width of the line
568    :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids
569    :param show_x_label: whether to show the x-axis label
570    :param show_title: whether to show the title
571    """
572
573    # define possible functions to calc here
574    stat_functions: Dict[str, Callable[[], list | ndarray]] = {
575        'gc': aln.calc_gc,
576        'entropy': aln.calc_entropy,
577        'coverage': aln.calc_coverage,
578        'identity': aln.calc_identity_alignment,
579        'similarity': aln.calc_similarity_alignment,
580        'ts tv score': aln.calc_transition_transversion_score
581    }
582
583    if stat_type not in stat_functions:
584        raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys())))
585
586    # input check
587    _validate_input_parameters(aln, ax)
588    if not is_color_like(line_color):
589        raise ValueError('line color is not a color')
590
591    # validate rolling average
592    if rolling_average < 1 or rolling_average > aln.length:
593        raise ValueError('rolling_average must be between 1 and length of sequence')
594
595    # generate input data
596    array = stat_functions[stat_type]()
597
598    if stat_type == 'identity':
599        min_value, max_value = -1, 0
600    elif stat_type == 'ts tv score':
601        min_value, max_value = -1, 1
602    else:
603        min_value, max_value = 0, 1
604    if stat_type in ['identity', 'similarity']:
605        # for the mean nan values get handled as the lowest possible number in the matrix
606        array = np.nan_to_num(array, True, min_value)
607        array = np.mean(array, axis=0)
608    data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length)
609
610    # plot the data
611    ax.fill_between(
612        # this add dummy data left and right for better plotting
613        # otherwise only half of the step is shown
614        np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx,
615        np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data,
616        min_value,
617        linewidth = line_width,
618        edgecolor=line_color,
619        step='mid' if rolling_average == 1 else None,
620        facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none'
621    )
622    if stat_type == 'gc':
623        ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1)
624
625    # format axis
626    ax.set_ylim(min_value, max_value*0.1+max_value)
627    ax.set_yticks([min_value, max_value])
628    if stat_type == 'gc':
629        ax.set_yticklabels(['0', '100'])
630    elif stat_type == 'ts tv score':
631        ax.set_yticklabels(['tv', 'ts'])
632    else:
633        ax.set_yticklabels(['low', 'high'])
634
635    # show title
636    if show_title:
637        ax.set_title(
638            f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position',
639            loc='left'
640        )
641
642    _format_x_axis(aln, ax, show_x_label, show_left=True)

Generate a plot for the various alignment stats.

Parameters
  • aln: alignment MSA class
  • ax: matplotlib axes
  • stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65)
  • line_color: color of the line
  • line_width: width of the line
  • rolling_average: average rolling window size left and right of a position in nucleotides or amino acids
  • show_x_label: whether to show the x-axis label
  • show_title: whether to show the title
def variant_plot( aln: msaexplorer.explore.MSA, ax: matplotlib.axes._axes.Axes, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float | int, float | int] | list[float | int, float | int] = (1, 1)):
645def variant_plot(aln: explore.MSA, ax: plt.Axes, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)):
646    """
647    Plots variants.
648    :param aln: alignment MSA class
649    :param ax: matplotlib axes
650    :param lollisize: (stem_size, head_size)
651    :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
652    :param show_x_label:  whether to show the x-axis label
653    :param show_legend: whether to show the legend
654    :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html
655    """
656
657    # validate input
658    _validate_input_parameters(aln, ax)
659    if not isinstance(lollisize, tuple) or len(lollisize) != 2:
660        raise ValueError('lollisize must be tuple of length 2 (stem, head)')
661    for _size in lollisize:
662        if not isinstance(_size, float | int) or _size <= 0:
663            raise ValueError('lollisize must be floats greater than zero')
664
665    # define colors
666    colors = config.CHAR_COLORS[aln.aln_type][color_scheme]
667    # get snps
668    snps = aln.get_snps()
669    # define where to plot (each ref type gets a separate line)
670    ref_y_positions, y_pos, detected_var = {}, 0, set()
671
672    # iterate over snp dict
673    for pos in snps['POS']:
674        for identifier in snps['POS'][pos]:
675            # fill in y pos dict
676            if identifier == 'ref':
677                if snps['POS'][pos]['ref'] not in ref_y_positions:
678                    ref_y_positions[snps['POS'][pos]['ref']] = y_pos
679                    y_pos += 1.1
680                    continue
681            # plot
682            if identifier == 'ALT':
683                for alt in snps['POS'][pos]['ALT']:
684                    ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos,
685                              ymin=ref_y_positions[snps['POS'][pos]['ref']],
686                              ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'],
687                              color=colors[alt],
688                              zorder=100,
689                              linewidth=lollisize[0]
690                              )
691                    ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos,
692                            ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'],
693                            color=colors[alt],
694                            marker='o',
695                            markersize=lollisize[1]
696                            )
697                    detected_var.add(alt)
698
699    # plot hlines
700    for y_char in ref_y_positions:
701        ax.hlines(
702            ref_y_positions[y_char],
703            xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5,
704            xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5,
705            color='black',
706            linestyle='-',
707            zorder=0,
708            linewidth=0.75
709        )
710    # create a custom legend
711    if show_legend:
712        custom_legend = [
713            ax.add_line(
714                plt.Line2D(
715                    [],
716                    [],
717                    color=colors[char],
718                    marker='o',
719                    linestyle='',
720                    markersize=5
721                )
722            ) for char in colors if char in detected_var
723        ]
724        ax.legend(
725            custom_legend,
726            [char for char in colors if char in detected_var],  # ensures correct sorting
727            loc='lower right',
728            title='variant',
729            bbox_to_anchor=bbox_to_anchor,
730            ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var),
731            frameon=False
732        )
733
734    # format axis
735    _format_x_axis(aln, ax, show_x_label, show_left=False)
736    ax.spines['left'].set_visible(False)
737    ax.set_yticks([ref_y_positions[x] for x in ref_y_positions])
738    ax.set_yticklabels(ref_y_positions.keys())
739    ax.set_ylim(0, y_pos)
740    ax.set_ylabel('reference')

Plots variants.

Parameters
  • aln: alignment MSA class
  • ax: matplotlib axes
  • lollisize: (stem_size, head_size)
  • color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
  • show_x_label: whether to show the x-axis label
  • show_legend: whether to show the legend
  • bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
def orf_plot( aln: msaexplorer.explore.MSA, ax: matplotlib.axes._axes.Axes, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1):
743def orf_plot(aln: explore.MSA, ax: plt.Axes, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1):
744    """
745    Plot conserved ORFs.
746    :param aln: alignment MSA class
747    :param ax: matplotlib axes
748    :param min_length: minimum length of orf
749    :param non_overlapping_orfs: whether to consider overlapping orfs
750    :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html
751    :param direction_marker_size: marker size for direction marker, not shown if marker_size == None
752    :param show_x_label: whether to show the x-axis label
753    :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
754    :param cbar_fraction: fraction of the original ax reserved for the colorbar
755    """
756
757    # normalize colorbar
758    cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap))
759
760    # validate input
761    _validate_input_parameters(aln, ax)
762
763    # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only
764    # searched within the zoomed region)
765    aln_temp = deepcopy(aln)
766    aln_temp.zoom = None
767    if non_overlapping_orfs:
768        annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length)
769    else:
770        annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length)
771
772    # filter dict for zoom
773    if aln.zoom is not None:
774        annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])}
775
776    # add track for plotting
777    _add_track_positions(annotation_dict)
778
779    # plot
780    _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap)
781
782    # legend
783    if show_cbar:
784        cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction)
785        cbar.set_label('% identity')
786        cbar.set_ticks([0, 100])
787
788    # format axis
789    _format_x_axis(aln, ax, show_x_label, show_left=False)
790    ax.set_ylim(bottom=0.8)
791    ax.set_yticks([])
792    ax.set_yticklabels([])
793    ax.set_title('conserved orfs', loc='left')

Plot conserved ORFs.

Parameters
  • aln: alignment MSA class
  • ax: matplotlib axes
  • min_length: minimum length of orf
  • non_overlapping_orfs: whether to consider overlapping orfs
  • cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
  • direction_marker_size: marker size for direction marker, not shown if marker_size == None
  • show_x_label: whether to show the x-axis label
  • show_cbar: whether to show the colorbar - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
  • cbar_fraction: fraction of the original ax reserved for the colorbar
def annotation_plot( aln: msaexplorer.explore.MSA, annotation: msaexplorer.explore.Annotation | str, ax: matplotlib.axes._axes.Axes, feature_to_plot: str, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False):
796def annotation_plot(aln: explore.MSA, annotation: explore.Annotation | str, ax: plt.Axes, feature_to_plot: str, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False):
797    """
798    Plot annotations from bed, gff or gb files. Are automatically mapped to alignment.
799    :param aln: alignment MSA class
800    :param annotation: annotation class | path to annotation file
801    :param ax: matplotlib axes
802    :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature)
803    :param color: color for the annotation
804    :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True
805    :param show_x_label: whether to show the x-axis label
806    """
807    # helper function
808    def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation:
809        """
810        Parse annotation.
811        :param path: path to annotation
812        :param msa: msa object
813        :return: parsed annotation
814        """
815        if os.path.exists(path):
816            # reset zoom so the annotation is correctly parsed
817            msa_temp = deepcopy(msa)
818            msa_temp.zoom = None
819            return explore.Annotation(msa_temp, path)
820        else:
821            raise FileNotFoundError()
822
823    # parse from path
824    if type(annotation) is str:
825        annotation = parse_annotation_from_string(annotation, aln)
826
827    # validate input
828    _validate_input_parameters(aln, ax, annotation)
829    if not is_color_like(color):
830        raise ValueError(f'{color} for reference is not a color')
831
832    # ignore features to plot for bed files (here it is written into one feature)
833    if annotation.ann_type == 'bed':
834        annotation_dict = annotation.features['region']
835        feature_to_plot = 'bed regions'
836    else:
837        # try to subset the annotation dict
838        try:
839            annotation_dict = annotation.features[feature_to_plot]
840        except KeyError:
841            raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.')
842
843    # plotting and formating
844    _add_track_positions(annotation_dict)
845    _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color)
846    _format_x_axis(aln, ax, show_x_label, show_left=False)
847    ax.set_ylim(bottom=0.8)
848    ax.set_yticks([])
849    ax.set_yticklabels([])
850    ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left')

Plot annotations from bed, gff or gb files. Are automatically mapped to alignment.

Parameters
  • aln: alignment MSA class
  • annotation: annotation class | path to annotation file
  • ax: matplotlib axes
  • feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature)
  • color: color for the annotation
  • direction_marker_size: marker size for direction marker, only relevant if show_direction is True
  • show_x_label: whether to show the x-axis label