msaexplorer.draw
The draw module
The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a
matplotlib axes
has to be passed to the plotting function.
Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments. The functions will raise the appropriate exception in such a case.
Functions
1r""" 2# The draw module 3 4The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a 5`matplotlib axes` has to be passed to the plotting function. 6 7Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments. 8The functions will raise the appropriate exception in such a case. 9 10## Functions 11 12""" 13 14# built-in 15from itertools import chain 16from typing import Callable, Dict 17from copy import deepcopy 18import os 19 20import matplotlib 21from numpy import ndarray 22 23# MSAexplorer 24from msaexplorer import explore, config 25 26# libs 27import numpy as np 28import matplotlib.pyplot as plt 29import matplotlib.patches as patches 30from matplotlib.cm import ScalarMappable 31from matplotlib.colors import is_color_like, Normalize, to_rgba 32from matplotlib.collections import PatchCollection, PolyCollection 33from matplotlib.text import TextPath 34from matplotlib.patches import PathPatch 35from matplotlib.font_manager import FontProperties 36from matplotlib.transforms import Affine2D 37 38 39# general helper functions 40def _validate_input_parameters(aln: explore.MSA, ax: plt.Axes, annotation: explore.Annotation | None = None): 41 """ 42 Validate MSA class and axis. 43 """ 44 if not isinstance(aln, explore.MSA): 45 raise ValueError('alignment has to be an MSA class. use explore.MSA() to read in alignment') 46 if not isinstance(ax, plt.Axes): 47 raise ValueError('ax has to be an matplotlib axis') 48 if annotation is not None: 49 if not isinstance(annotation, explore.Annotation): 50 raise ValueError('annotation has to be an annotation class. use explore.Annotation() to read in annotation') 51 52 53def _format_x_axis(aln: explore.MSA, ax: plt.Axes, show_x_label: bool, show_left: bool): 54 """ 55 General axis formatting. 56 """ 57 ax.set_xlim( 58 (aln.zoom[0] - 0.5, aln.zoom[0] + aln.length - 0.5) if aln.zoom is not None else (-0.5, aln.length - 0.5) 59 ) 60 ax.spines['top'].set_visible(False) 61 ax.spines['right'].set_visible(False) 62 if show_x_label: 63 ax.set_xlabel('alignment position') 64 if not show_left: 65 ax.spines['left'].set_visible(False) 66 67 68def _find_stretches(row, non_nan_only=False) -> list[tuple[int, int, int]] | list[tuple[int, int]] : 69 """ 70 Finds consecutive stretches of values in an array, with an option to exclude NaN stretches. 71 72 :param row: NumPy array of values 73 :param non_nan_only: If True, only returns stretches of non-NaN values. 74 :return: List of stretches (start, end, value at start) for all values or non-NaN values only. 75 """ 76 if row.size == 0: 77 return [] 78 79 if non_nan_only: 80 # Create a boolean mask for non-NaN values 81 non_nan_mask = ~np.isnan(row) 82 # Find changes in the mask 83 changes = np.diff(non_nan_mask.astype(int)) != 0 84 change_idx = np.nonzero(changes)[0] 85 starts = np.concatenate(([0], change_idx + 1)) 86 ends = np.concatenate((change_idx, [len(row) - 1])) 87 88 # Return only stretches that start with non-NaN values 89 return [(start, end) for start, end in zip(starts, ends) if non_nan_mask[start]] 90 else: 91 # Find change points: where adjacent cells differ. 92 changes = np.diff(row) != 0 93 change_idx = np.nonzero(changes)[0] 94 starts = np.concatenate(([0], change_idx + 1)) 95 ends = np.concatenate((change_idx, [len(row) - 1])) 96 97 return [(start, end, row[start]) for start, end in zip(starts, ends)] 98 99 100def _seq_names(aln: explore.MSA, ax: plt.Axes, custom_seq_names: tuple, show_seq_names: bool): 101 """ 102 Validate custom names and set show names to True. Format axis accordingly. 103 """ 104 if custom_seq_names: 105 show_seq_names = True 106 if not isinstance(custom_seq_names, tuple): 107 raise ValueError('configure your custom names list: custom_names=(name1, name2...)') 108 if len(custom_seq_names) != len(aln.alignment.keys()): 109 raise ValueError('length of sequences not equal to number of custom names') 110 if show_seq_names: 111 ax.yaxis.set_ticks_position('none') 112 ax.set_yticks(np.arange(len(aln.alignment)) + 0.4) 113 if custom_seq_names: 114 ax.set_yticklabels(custom_seq_names[::-1]) 115 else: 116 names = [x.split(' ')[0] for x in list(aln.alignment.keys())[::-1]] 117 ax.set_yticklabels(names) 118 else: 119 ax.set_yticks([]) 120 121 122def _create_identity_patch(row, aln: explore.MSA, col: list, zoom: tuple[int, int], y_position: float | int, reference_color: str, seq_name: str, identity_color: str | ndarray, show_gaps: bool): 123 """ 124 Creates the initial patch. 125 """ 126 if show_gaps: 127 # plot a rectangle for parts that do not have gaps 128 for stretch in _find_stretches(row, True): 129 col.append(patches.Rectangle((stretch[0] + zoom[0] - 0.5, y_position), stretch[1] - stretch[0] + 1, 0.8, 130 facecolor=reference_color if seq_name == aln.reference_id else identity_color 131 ) 132 133 ) 134 # just plot a rectangle 135 else: 136 col.append(patches.Rectangle((zoom[0] - 0.5, y_position), zoom[1] - zoom[0], 0.8, 137 facecolor=reference_color if seq_name == aln.reference_id else identity_color 138 ) 139 ) 140 141 142def _create_polygons(stretches: list, identity_values: list | ndarray, zoom: tuple, y_position: int, polygons: list, aln_colors: dict | ScalarMappable, polygon_colors: list, detected_identity_values: set | None = None): 143 """ 144 create the individual polygons 145 """ 146 147 for start, end, value in stretches: 148 if value not in identity_values: 149 continue 150 if detected_identity_values is not None: 151 detected_identity_values.add(value) 152 width = end + 1 - start 153 # Calculate x coordinates adjusted for zoom and centering 154 x0 = start + zoom[0] - 0.5 155 x1 = x0 + width 156 # Define the rectangle corners 157 rect_coords = [ 158 (x0, y_position), 159 (x1, y_position), 160 (x1, y_position + 0.8), 161 (x0, y_position + 0.8), 162 (x0, y_position) 163 ] 164 polygons.append(rect_coords) 165 if type(aln_colors) != ScalarMappable: 166 polygon_colors.append(aln_colors[value]['color']) 167 else: 168 polygon_colors.append(aln_colors.to_rgba(value)) 169 170 171def _plot_annotation(annotation_dict: dict, ax: plt.Axes, direction_marker_size: int | None, color: str | ScalarMappable): 172 """ 173 Plot annotations 174 :param annotation_dict: dict of annotations 175 :param ax: matplotlib Axes 176 :param direction_marker_size: size of marker 177 :param color: color of annotation (color or scalar) 178 """ 179 for annotation in annotation_dict: 180 for locations in annotation_dict[annotation]['location']: 181 x_value = locations[0] 182 length = locations[1] - locations[0] 183 ax.add_patch( 184 patches.FancyBboxPatch( 185 (x_value, annotation_dict[annotation]['track'] + 1), 186 length, 187 0.8, 188 boxstyle="Round, pad=0", 189 ec="black", 190 fc=color.to_rgba(annotation_dict[annotation]['conservation']) if isinstance(color, ScalarMappable) else color, 191 ) 192 ) 193 if direction_marker_size is not None: 194 if annotation_dict[annotation]['strand'] == '-': 195 marker = '<' 196 else: 197 marker = '>' 198 ax.plot(x_value + length/2, annotation_dict[annotation]['track'] + 1.4, marker=marker, markersize=direction_marker_size, color='white', markeredgecolor='black') 199 200 # plot linked annotations (such as splicing) 201 if len(annotation_dict[annotation]['location']) > 1: 202 y_value = annotation_dict[annotation]['track'] + 1.4 203 start = None 204 for locations in annotation_dict[annotation]['location']: 205 if start is None: 206 start = locations[1] 207 continue 208 ax.plot([start, locations[0]], [y_value, y_value], '--', linewidth=2, color='black') 209 start = locations[1] 210 211 212def _add_track_positions(annotation_dic): 213 # create a dict and sort 214 annotation_dic = dict(sorted(annotation_dic.items(), key=lambda x: x[1]['location'][0][0])) 215 216 # remember for each track the largest stop 217 track_stops = [0] 218 219 for ann in annotation_dic: 220 flattened_locations = list(chain.from_iterable(annotation_dic[ann]['location'])) # flatten list 221 track = 0 222 # check if a start of a gene is smaller than the stop of the current track 223 # -> move to new track 224 while flattened_locations[0] < track_stops[track]: 225 track += 1 226 # if all prior tracks are potentially causing an overlap 227 # create a new track and break 228 if len(track_stops) <= track: 229 track_stops.append(0) 230 break 231 # in the current track remember the stop of the current gene 232 track_stops[track] = flattened_locations[-1] 233 # and indicate the track in the dict 234 annotation_dic[ann]['track'] = track 235 236 return annotation_dic 237 238 239def _get_contrast_text_color(rgba_color): 240 """ 241 compute the brightness of a color 242 """ 243 r, g, b, a = rgba_color 244 brightness = (r * 299 + g * 587 + b * 114) / 1000 245 return 'white' if brightness < 0.5 else 'black' 246 247 248def _plot_sequence_text(aln: explore.MSA, seq_name: str, ref_name: str, allways_text: bool, values: ndarray, matrix: ndarray, ax:plt.Axes, zoom: tuple, y_position:int, value_to_skip: int, ref_color:str, show_gaps: bool, cmap: None | ScalarMappable = None): 249 """ 250 Plot sequence text - however this will be done even if there is not enough space. 251 Might need some rework in the future. 252 """ 253 x_text = 0 254 if seq_name == ref_name: 255 different_cols = np.any((matrix != value_to_skip) & ~np.isnan(matrix), axis=0) 256 else: 257 different_cols = [False]*aln.length 258 259 for idx, (character, value) in enumerate(zip(aln.alignment[seq_name], values)): 260 if value != value_to_skip and character != '-' or seq_name == ref_name and character != '-' or character == '-'and not show_gaps or allways_text and character != '-': 261 # text color 262 if seq_name == ref_name: 263 text_color = _get_contrast_text_color(to_rgba(ref_color)) 264 elif cmap is not None: 265 text_color = _get_contrast_text_color(cmap.to_rgba(value)) 266 else: 267 text_color = 'black' 268 269 ax.text( 270 x=x_text + zoom[0] if zoom is not None else x_text, 271 y=y_position + 0.4, 272 s=character, 273 fontweight='bold' if different_cols[idx] else 'normal', 274 ha='center', 275 va='center_baseline', 276 c=text_color if value != value_to_skip or seq_name == ref_name else 'dimgrey' 277 ) 278 x_text += 1 279 280 281def identity_alignment(aln: explore.MSA, ax: plt.Axes, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)): 282 """ 283 Generates an identity alignment overview plot. 284 :param aln: alignment MSA class 285 :param ax: matplotlib axes 286 :param show_title: whether to show title 287 :param show_seq_names: whether to show seq names 288 :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues 289 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 290 :param custom_seq_names: custom seq names 291 :param reference_color: color of reference sequence 292 :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch 293 :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch 294 :param fancy_gaps: show gaps with a small black bar 295 :param show_mismatches: whether to show mismatches otherwise it will be shown as match 296 :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences 297 :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity 298 :param show_x_label: whether to show x label 299 :param show_legend: whether to show the legend 300 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 301 """ 302 303 # Validate inputs and colors 304 _validate_input_parameters(aln, ax) 305 if not is_color_like(reference_color): 306 raise ValueError(f'{reference_color} for reference is not a color') 307 308 if color_scheme is not None: 309 if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']: 310 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity') 311 elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']: 312 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak') 313 314 # Both options for gaps work hand in hand 315 if fancy_gaps: 316 show_gaps = True 317 318 # Determine zoom 319 zoom = (0, aln.length) if aln.zoom is None else aln.zoom 320 321 # Set up color mapping and identity values 322 aln_colors = config.IDENTITY_COLORS.copy() 323 identity_values = [-1, -2, -3] # -1 = mismatch, -2 = mask, -3 ambiguity 324 if color_scheme is not None: 325 colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme] 326 identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping 327 # use the standard setting for the index (same as in aln.calc_identity_alignment) 328 # and map the corresponding color scheme to it 329 for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']): 330 aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]} 331 332 # Compute identity alignment 333 identity_aln = aln.calc_identity_alignment( 334 encode_mask=show_mask, 335 encode_gaps=show_gaps, 336 encode_mismatches=show_mismatches, 337 encode_ambiguities=show_ambiguities, 338 encode_each_mismatch_char=True if color_scheme is not None else False 339 ) 340 341 # List to store polygons 342 detected_identity_values = {0} 343 polygons, polygon_colors, patch_list = [], [], [] 344 345 for i, seq_name in enumerate(aln.alignment): 346 y_position = len(aln.alignment) - i - 1 347 row = identity_aln[i] 348 349 # plot a line below everything 350 if fancy_gaps: 351 ax.hlines( 352 y_position + 0.4, 353 xmin=zoom[0] - 0.5, 354 xmax=zoom[1] + 0.5, 355 color='black', 356 linestyle='-', 357 zorder=0, 358 linewidth=0.75 359 ) 360 361 # plot the basic shape per sequence with gaps 362 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps) 363 364 # find consecutive stretches 365 stretches = _find_stretches(row) 366 # create polygons per stretch 367 _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values) 368 369 # add sequence text 370 if show_identity_sequence or show_sequence_all: 371 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax, 372 zoom, y_position, 0, reference_color, show_gaps) 373 374 # Create the LineCollection: each segment is drawn in a single call. 375 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 376 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 377 378 # custom legend 379 if show_legend: 380 if color_scheme is not None and color_scheme != 'standard': 381 for x in aln_colors: 382 for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]: 383 if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]: 384 aln_colors[x]['type'] = group 385 break 386 # create it 387 handels, labels, detected_groups = [], [], set() 388 for x in aln_colors: 389 if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups: 390 handels.append( 391 ax.add_line( 392 plt.Line2D( 393 [], 394 [], 395 color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][ 396 config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0] 397 ], 398 marker='s', 399 markeredgecolor='grey', 400 linestyle='', 401 markersize=10)) 402 ) 403 labels.append(aln_colors[x]['type']) 404 detected_groups.add(aln_colors[x]['type']) 405 406 # plot it 407 ax.legend( 408 handels, 409 labels, 410 loc='lower right', 411 bbox_to_anchor=bbox_to_anchor, 412 ncols=(len(detected_identity_values) + 1) / 2 if aln.aln_type == 'AA' and color_scheme == 'standard' else len(detected_identity_values), 413 frameon=False 414 ) 415 416 _seq_names(aln, ax, custom_seq_names, show_seq_names) 417 418 # configure axis 419 ax.set_ylim(0, len(aln.alignment)) 420 if show_title: 421 ax.set_title('identity', loc='left') 422 _format_x_axis(aln, ax, show_x_label, show_left=False) 423 424 425def similarity_alignment(aln: explore.MSA, ax: plt.Axes, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1): 426 """ 427 Generates a similarity alignment overview plot. Importantly the similarity values are normalized! 428 :param aln: alignment MSA class 429 :param ax: matplotlib axes 430 :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65 431 :param show_title: whether to show title 432 :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues 433 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 434 :param show_seq_names: whether to show seq names 435 :param custom_seq_names: custom seq names 436 :param reference_color: color of reference sequence 437 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 438 :param show_gaps: whether to show gaps otherwise it will be ignored 439 :param fancy_gaps: show gaps with a small black bar 440 :param show_x_label: whether to show x label 441 :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 442 :param cbar_fraction: fraction of the original ax reserved for the legend 443 """ 444 # input check 445 _validate_input_parameters(aln, ax) 446 447 # validate colors 448 if not is_color_like(reference_color): 449 raise ValueError(f'{reference_color} for reference is not a color') 450 451 # Both options for gaps work hand in hand 452 if fancy_gaps: 453 show_gaps = True 454 455 # define zoom to plot 456 if aln.zoom is None: 457 zoom = (0, aln.length) 458 else: 459 zoom = aln.zoom 460 461 # get data 462 similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type) # use normalized values here 463 similarity_aln = similarity_aln.round(2) # round data for good color mapping 464 465 # determine min max values of the underlying matrix and create cmap 466 min_value, max_value = 0, 1 467 cmap = ScalarMappable( 468 norm=Normalize( 469 vmin=min_value, 470 vmax=max_value 471 ), 472 cmap=plt.get_cmap(cmap) 473 ) 474 475 # create similarity values 476 similarity_values = np.arange(start=min_value, stop=max_value, step=0.01) 477 # round it to be absolutely sure that values match with rounded sim alignment 478 similarity_values = similarity_values.round(2) 479 480 # create plot 481 polygons, polygon_colors, patch_list = [], [], [] 482 483 for i, seq_name in enumerate(aln.alignment): 484 y_position = len(aln.alignment) - i - 1 485 row = similarity_aln[i] 486 487 # plot a line below everything 488 if fancy_gaps: 489 ax.hlines( 490 y_position + 0.4, 491 xmin=zoom[0] - 0.5, 492 xmax=zoom[1] + 0.5, 493 color='black', 494 linestyle='-', 495 zorder=0, 496 linewidth=0.75 497 ) 498 499 # plot the basic shape per sequence with gaps 500 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, 501 cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color, 502 show_gaps) 503 504 # find consecutive stretches 505 stretches = _find_stretches(row) 506 # create polygons per stretch 507 _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors) 508 509 # add sequence text 510 if show_sequence_all or show_similarity_sequence: 511 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax, 512 zoom, y_position, 1, reference_color, show_gaps, cmap) 513 514 # Create the LineCollection: each segment is drawn in a single call. 515 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 516 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 517 518 # legend 519 if show_cbar: 520 cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 521 cbar.set_ticks([min_value, max_value]) 522 cbar.set_ticklabels(['low', 'high']) 523 524 # format seq names 525 _seq_names(aln, ax, custom_seq_names, show_seq_names) 526 527 # configure axis 528 ax.set_ylim(0, len(aln.alignment)) 529 if show_title: 530 ax.set_title('similarity', loc='left') 531 _format_x_axis(aln, ax, show_x_label, show_left=False) 532 533 534def _moving_average(arr: ndarray, window_size: int, zoom: tuple | None, aln_length: int) -> tuple[ndarray, ndarray]: 535 """ 536 Calculate the moving average of an array. 537 :param arr: array with values 538 :param window_size: size of the moving average 539 :param zoom: zoom of the alignment 540 :param aln_length: length of the alignment 541 :return: new array with moving average 542 """ 543 if window_size > 1: 544 i = 0 545 moving_averages, plotting_idx = [], [] 546 while i < len(arr) + 1: 547 half_window_size = window_size // 2 548 window_left = arr[i - half_window_size : i] if i > half_window_size else arr[0:i] 549 window_right = arr[i: i + half_window_size] if i < len(arr) - half_window_size else arr[i: len(arr)] 550 moving_averages.append((sum(window_left) + sum(window_right)) / (len(window_left) + len(window_right))) 551 plotting_idx.append(i) 552 i += 1 553 554 return np.array(moving_averages), np.array(plotting_idx) if zoom is None else np.array(plotting_idx) + zoom[0] 555 else: 556 return arr, np.arange(zoom[0], zoom[1]) if zoom is not None else np.arange(aln_length) 557 558 559def stat_plot(aln: explore.MSA, ax: plt.Axes, stat_type: str, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True): 560 """ 561 Generate a plot for the various alignment stats. 562 :param aln: alignment MSA class 563 :param ax: matplotlib axes 564 :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65) 565 :param line_color: color of the line 566 :param line_width: width of the line 567 :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids 568 :param show_x_label: whether to show the x-axis label 569 :param show_title: whether to show the title 570 """ 571 572 # define possible functions to calc here 573 stat_functions: Dict[str, Callable[[], list | ndarray]] = { 574 'gc': aln.calc_gc, 575 'entropy': aln.calc_entropy, 576 'coverage': aln.calc_coverage, 577 'identity': aln.calc_identity_alignment, 578 'similarity': aln.calc_similarity_alignment, 579 'ts tv score': aln.calc_transition_transversion_score 580 } 581 582 if stat_type not in stat_functions: 583 raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys()))) 584 585 # input check 586 _validate_input_parameters(aln, ax) 587 if not is_color_like(line_color): 588 raise ValueError('line color is not a color') 589 590 # validate rolling average 591 if rolling_average < 1 or rolling_average > aln.length: 592 raise ValueError('rolling_average must be between 1 and length of sequence') 593 594 # generate input data 595 array = stat_functions[stat_type]() 596 597 if stat_type == 'identity': 598 min_value, max_value = -1, 0 599 elif stat_type == 'ts tv score': 600 min_value, max_value = -1, 1 601 else: 602 min_value, max_value = 0, 1 603 if stat_type in ['identity', 'similarity']: 604 # for the mean nan values get handled as the lowest possible number in the matrix 605 array = np.nan_to_num(array, True, min_value) 606 array = np.mean(array, axis=0) 607 data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length) 608 609 # plot the data 610 ax.fill_between( 611 # this add dummy data left and right for better plotting 612 # otherwise only half of the step is shown 613 np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx, 614 np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data, 615 min_value, 616 linewidth = line_width, 617 edgecolor=line_color, 618 step='mid' if rolling_average == 1 else None, 619 facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none' 620 ) 621 if stat_type == 'gc': 622 ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1) 623 624 # format axis 625 ax.set_ylim(min_value, max_value*0.1+max_value) 626 ax.set_yticks([min_value, max_value]) 627 if stat_type == 'gc': 628 ax.set_yticklabels(['0', '100']) 629 elif stat_type == 'ts tv score': 630 ax.set_yticklabels(['tv', 'ts']) 631 else: 632 ax.set_yticklabels(['low', 'high']) 633 634 # show title 635 if show_title: 636 ax.set_title( 637 f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position', 638 loc='left' 639 ) 640 641 _format_x_axis(aln, ax, show_x_label, show_left=True) 642 643 644def variant_plot(aln: explore.MSA, ax: plt.Axes, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)): 645 """ 646 Plots variants. 647 :param aln: alignment MSA class 648 :param ax: matplotlib axes 649 :param lollisize: (stem_size, head_size) 650 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 651 :param show_x_label: whether to show the x-axis label 652 :param show_legend: whether to show the legend 653 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 654 """ 655 656 # validate input 657 _validate_input_parameters(aln, ax) 658 if not isinstance(lollisize, tuple) or len(lollisize) != 2: 659 raise ValueError('lollisize must be tuple of length 2 (stem, head)') 660 for _size in lollisize: 661 if not isinstance(_size, float | int) or _size <= 0: 662 raise ValueError('lollisize must be floats greater than zero') 663 664 # define colors 665 colors = config.CHAR_COLORS[aln.aln_type][color_scheme] 666 # get snps 667 snps = aln.get_snps() 668 # define where to plot (each ref type gets a separate line) 669 ref_y_positions, y_pos, detected_var = {}, 0, set() 670 671 # iterate over snp dict 672 for pos in snps['POS']: 673 for identifier in snps['POS'][pos]: 674 # fill in y pos dict 675 if identifier == 'ref': 676 if snps['POS'][pos]['ref'] not in ref_y_positions: 677 ref_y_positions[snps['POS'][pos]['ref']] = y_pos 678 y_pos += 1.1 679 continue 680 # plot 681 if identifier == 'ALT': 682 for alt in snps['POS'][pos]['ALT']: 683 ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos, 684 ymin=ref_y_positions[snps['POS'][pos]['ref']], 685 ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 686 color=colors[alt], 687 zorder=100, 688 linewidth=lollisize[0] 689 ) 690 ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos, 691 ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 692 color=colors[alt], 693 marker='o', 694 markersize=lollisize[1] 695 ) 696 detected_var.add(alt) 697 698 # plot hlines 699 for y_char in ref_y_positions: 700 ax.hlines( 701 ref_y_positions[y_char], 702 xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5, 703 xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5, 704 color='black', 705 linestyle='-', 706 zorder=0, 707 linewidth=0.75 708 ) 709 # create a custom legend 710 if show_legend: 711 custom_legend = [ 712 ax.add_line( 713 plt.Line2D( 714 [], 715 [], 716 color=colors[char], 717 marker='o', 718 linestyle='', 719 markersize=5 720 ) 721 ) for char in colors if char in detected_var 722 ] 723 ax.legend( 724 custom_legend, 725 [char for char in colors if char in detected_var], # ensures correct sorting 726 loc='lower right', 727 title='variant', 728 bbox_to_anchor=bbox_to_anchor, 729 ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var), 730 frameon=False 731 ) 732 733 # format axis 734 _format_x_axis(aln, ax, show_x_label, show_left=False) 735 ax.spines['left'].set_visible(False) 736 ax.set_yticks([ref_y_positions[x] for x in ref_y_positions]) 737 ax.set_yticklabels(ref_y_positions.keys()) 738 ax.set_ylim(0, y_pos) 739 ax.set_ylabel('reference') 740 741 742def orf_plot(aln: explore.MSA, ax: plt.Axes, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1): 743 """ 744 Plot conserved ORFs. 745 :param aln: alignment MSA class 746 :param ax: matplotlib axes 747 :param min_length: minimum length of orf 748 :param non_overlapping_orfs: whether to consider overlapping orfs 749 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 750 :param direction_marker_size: marker size for direction marker, not shown if marker_size == None 751 :param show_x_label: whether to show the x-axis label 752 :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 753 :param cbar_fraction: fraction of the original ax reserved for the colorbar 754 """ 755 756 # normalize colorbar 757 cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap)) 758 759 # validate input 760 _validate_input_parameters(aln, ax) 761 762 # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only 763 # searched within the zoomed region) 764 aln_temp = deepcopy(aln) 765 aln_temp.zoom = None 766 if non_overlapping_orfs: 767 annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length) 768 else: 769 annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length) 770 771 # filter dict for zoom 772 if aln.zoom is not None: 773 annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])} 774 775 # add track for plotting 776 _add_track_positions(annotation_dict) 777 778 # plot 779 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap) 780 781 # legend 782 if show_cbar: 783 cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 784 cbar.set_label('% identity') 785 cbar.set_ticks([0, 100]) 786 787 # format axis 788 _format_x_axis(aln, ax, show_x_label, show_left=False) 789 ax.set_ylim(bottom=0.8) 790 ax.set_yticks([]) 791 ax.set_yticklabels([]) 792 ax.set_title('conserved orfs', loc='left') 793 794 795def annotation_plot(aln: explore.MSA, annotation: explore.Annotation | str, ax: plt.Axes, feature_to_plot: str, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False): 796 """ 797 Plot annotations from bed, gff or gb files. Are automatically mapped to alignment. 798 :param aln: alignment MSA class 799 :param annotation: annotation class | path to annotation file 800 :param ax: matplotlib axes 801 :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature) 802 :param color: color for the annotation 803 :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True 804 :param show_x_label: whether to show the x-axis label 805 """ 806 # helper function 807 def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation: 808 """ 809 Parse annotation. 810 :param path: path to annotation 811 :param msa: msa object 812 :return: parsed annotation 813 """ 814 if os.path.exists(path): 815 # reset zoom so the annotation is correctly parsed 816 msa_temp = deepcopy(msa) 817 msa_temp.zoom = None 818 return explore.Annotation(msa_temp, path) 819 else: 820 raise FileNotFoundError() 821 822 # parse from path 823 if type(annotation) is str: 824 annotation = parse_annotation_from_string(annotation, aln) 825 826 # validate input 827 _validate_input_parameters(aln, ax, annotation) 828 if not is_color_like(color): 829 raise ValueError(f'{color} for reference is not a color') 830 831 # ignore features to plot for bed files (here it is written into one feature) 832 if annotation.ann_type == 'bed': 833 annotation_dict = annotation.features['region'] 834 feature_to_plot = 'bed regions' 835 else: 836 # try to subset the annotation dict 837 try: 838 annotation_dict = annotation.features[feature_to_plot] 839 except KeyError: 840 raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.') 841 842 # plotting and formating 843 _add_track_positions(annotation_dict) 844 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color) 845 _format_x_axis(aln, ax, show_x_label, show_left=False) 846 ax.set_ylim(bottom=0.8) 847 ax.set_yticks([]) 848 ax.set_yticklabels([]) 849 ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left') 850 851 852def sequence_logo(aln:explore.MSA, ax:plt.Axes, color_scheme: str = 'standard', plot_type: str = 'logo', show_x_label:bool = False): 853 """ 854 Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The 855 logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency 856 is scaled to the information content at each position. --> identical to how Geneious calculates it. 857 858 :param aln: alignment MSA class 859 :param ax: matplotlib axes 860 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 861 :param plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot 862 :param show_x_label: whether to show the x-axis label 863 """ 864 865 _validate_input_parameters(aln, ax) 866 # calc matrix 867 matrix = aln.calc_position_matrix('IC') * aln.calc_position_matrix('PPM') 868 letters_to_plot = list(config.CHAR_COLORS[aln.aln_type]['standard'].keys())[:-1] 869 870 #matrix = np.clip(matrix, 0, None) # only positive values 871 # plot 872 if plot_type == 'logo': 873 for pos in range(matrix.shape[1]): 874 # sort the positive matrix row values by size 875 items = [(letters_to_plot[i], matrix[i, pos]) for i in range(len(letters_to_plot)) if matrix[i, pos] > 0] 876 items.sort(key=lambda x: x[1]) 877 # plot each position 878 y_offset = 0 879 for letter, h in items: 880 tp = TextPath((aln.zoom[0] - 0.325 if aln.zoom is not None else - 0.325, 0), letter, size=1, prop=FontProperties(weight='bold')) 881 bb = tp.get_extents() 882 glyph_height = bb.height if bb.height > 0 else 1e-6 # avoid div by zero 883 scale_to_1 = 1.0 / glyph_height 884 885 transform = (Affine2D() 886 .scale(1.0, h * scale_to_1) # scale manually by IC and normalize font 887 .translate(pos, y_offset) 888 + ax.transData) 889 890 patch = PathPatch(tp, transform=transform, 891 facecolor=config.CHAR_COLORS[aln.aln_type][color_scheme][letter], 892 edgecolor='none') 893 ax.add_patch(patch) 894 y_offset += h 895 elif plot_type == 'stacked': 896 y_values = np.zeros(matrix.shape[1]) 897 x_values = np.arange(0, matrix.shape[1]) if aln.zoom is None else np.arange(aln.zoom[0], aln.zoom[1]) 898 for row in range(matrix.shape[0]): 899 y = matrix[row] 900 ax.fill_between(x_values, 901 y_values, 902 y_values + y, 903 fc=config.CHAR_COLORS[aln.aln_type][color_scheme].get(letters_to_plot[row]), 904 ec='None', 905 label=letters_to_plot[row], 906 step='pre') 907 y_values += y 908 909 # adjust limits & labels 910 _format_x_axis(aln, ax, show_x_label, show_left=True) 911 if aln.aln_type == 'AA': 912 ax.set_ylim(bottom=0, top=5) 913 else: 914 ax.set_ylim(bottom=0, top=2) 915 ax.spines['top'].set_visible(False) 916 ax.spines['right'].set_visible(False) 917 ax.set_ylabel('bits')
282def identity_alignment(aln: explore.MSA, ax: plt.Axes, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)): 283 """ 284 Generates an identity alignment overview plot. 285 :param aln: alignment MSA class 286 :param ax: matplotlib axes 287 :param show_title: whether to show title 288 :param show_seq_names: whether to show seq names 289 :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues 290 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 291 :param custom_seq_names: custom seq names 292 :param reference_color: color of reference sequence 293 :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch 294 :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch 295 :param fancy_gaps: show gaps with a small black bar 296 :param show_mismatches: whether to show mismatches otherwise it will be shown as match 297 :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences 298 :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity 299 :param show_x_label: whether to show x label 300 :param show_legend: whether to show the legend 301 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 302 """ 303 304 # Validate inputs and colors 305 _validate_input_parameters(aln, ax) 306 if not is_color_like(reference_color): 307 raise ValueError(f'{reference_color} for reference is not a color') 308 309 if color_scheme is not None: 310 if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']: 311 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity') 312 elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']: 313 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak') 314 315 # Both options for gaps work hand in hand 316 if fancy_gaps: 317 show_gaps = True 318 319 # Determine zoom 320 zoom = (0, aln.length) if aln.zoom is None else aln.zoom 321 322 # Set up color mapping and identity values 323 aln_colors = config.IDENTITY_COLORS.copy() 324 identity_values = [-1, -2, -3] # -1 = mismatch, -2 = mask, -3 ambiguity 325 if color_scheme is not None: 326 colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme] 327 identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping 328 # use the standard setting for the index (same as in aln.calc_identity_alignment) 329 # and map the corresponding color scheme to it 330 for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']): 331 aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]} 332 333 # Compute identity alignment 334 identity_aln = aln.calc_identity_alignment( 335 encode_mask=show_mask, 336 encode_gaps=show_gaps, 337 encode_mismatches=show_mismatches, 338 encode_ambiguities=show_ambiguities, 339 encode_each_mismatch_char=True if color_scheme is not None else False 340 ) 341 342 # List to store polygons 343 detected_identity_values = {0} 344 polygons, polygon_colors, patch_list = [], [], [] 345 346 for i, seq_name in enumerate(aln.alignment): 347 y_position = len(aln.alignment) - i - 1 348 row = identity_aln[i] 349 350 # plot a line below everything 351 if fancy_gaps: 352 ax.hlines( 353 y_position + 0.4, 354 xmin=zoom[0] - 0.5, 355 xmax=zoom[1] + 0.5, 356 color='black', 357 linestyle='-', 358 zorder=0, 359 linewidth=0.75 360 ) 361 362 # plot the basic shape per sequence with gaps 363 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps) 364 365 # find consecutive stretches 366 stretches = _find_stretches(row) 367 # create polygons per stretch 368 _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values) 369 370 # add sequence text 371 if show_identity_sequence or show_sequence_all: 372 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax, 373 zoom, y_position, 0, reference_color, show_gaps) 374 375 # Create the LineCollection: each segment is drawn in a single call. 376 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 377 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 378 379 # custom legend 380 if show_legend: 381 if color_scheme is not None and color_scheme != 'standard': 382 for x in aln_colors: 383 for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]: 384 if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]: 385 aln_colors[x]['type'] = group 386 break 387 # create it 388 handels, labels, detected_groups = [], [], set() 389 for x in aln_colors: 390 if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups: 391 handels.append( 392 ax.add_line( 393 plt.Line2D( 394 [], 395 [], 396 color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][ 397 config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0] 398 ], 399 marker='s', 400 markeredgecolor='grey', 401 linestyle='', 402 markersize=10)) 403 ) 404 labels.append(aln_colors[x]['type']) 405 detected_groups.add(aln_colors[x]['type']) 406 407 # plot it 408 ax.legend( 409 handels, 410 labels, 411 loc='lower right', 412 bbox_to_anchor=bbox_to_anchor, 413 ncols=(len(detected_identity_values) + 1) / 2 if aln.aln_type == 'AA' and color_scheme == 'standard' else len(detected_identity_values), 414 frameon=False 415 ) 416 417 _seq_names(aln, ax, custom_seq_names, show_seq_names) 418 419 # configure axis 420 ax.set_ylim(0, len(aln.alignment)) 421 if show_title: 422 ax.set_title('identity', loc='left') 423 _format_x_axis(aln, ax, show_x_label, show_left=False)
Generates an identity alignment overview plot.
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- show_title: whether to show title
- show_seq_names: whether to show seq names
- show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues
- show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
- custom_seq_names: custom seq names
- reference_color: color of reference sequence
- show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch
- show_gaps: whether to show gaps otherwise it will be shown as match or mismatch
- fancy_gaps: show gaps with a small black bar
- show_mismatches: whether to show mismatches otherwise it will be shown as match
- show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences
- color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity
- show_x_label: whether to show x label
- show_legend: whether to show the legend
- bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
426def similarity_alignment(aln: explore.MSA, ax: plt.Axes, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1): 427 """ 428 Generates a similarity alignment overview plot. Importantly the similarity values are normalized! 429 :param aln: alignment MSA class 430 :param ax: matplotlib axes 431 :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65 432 :param show_title: whether to show title 433 :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues 434 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 435 :param show_seq_names: whether to show seq names 436 :param custom_seq_names: custom seq names 437 :param reference_color: color of reference sequence 438 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 439 :param show_gaps: whether to show gaps otherwise it will be ignored 440 :param fancy_gaps: show gaps with a small black bar 441 :param show_x_label: whether to show x label 442 :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 443 :param cbar_fraction: fraction of the original ax reserved for the legend 444 """ 445 # input check 446 _validate_input_parameters(aln, ax) 447 448 # validate colors 449 if not is_color_like(reference_color): 450 raise ValueError(f'{reference_color} for reference is not a color') 451 452 # Both options for gaps work hand in hand 453 if fancy_gaps: 454 show_gaps = True 455 456 # define zoom to plot 457 if aln.zoom is None: 458 zoom = (0, aln.length) 459 else: 460 zoom = aln.zoom 461 462 # get data 463 similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type) # use normalized values here 464 similarity_aln = similarity_aln.round(2) # round data for good color mapping 465 466 # determine min max values of the underlying matrix and create cmap 467 min_value, max_value = 0, 1 468 cmap = ScalarMappable( 469 norm=Normalize( 470 vmin=min_value, 471 vmax=max_value 472 ), 473 cmap=plt.get_cmap(cmap) 474 ) 475 476 # create similarity values 477 similarity_values = np.arange(start=min_value, stop=max_value, step=0.01) 478 # round it to be absolutely sure that values match with rounded sim alignment 479 similarity_values = similarity_values.round(2) 480 481 # create plot 482 polygons, polygon_colors, patch_list = [], [], [] 483 484 for i, seq_name in enumerate(aln.alignment): 485 y_position = len(aln.alignment) - i - 1 486 row = similarity_aln[i] 487 488 # plot a line below everything 489 if fancy_gaps: 490 ax.hlines( 491 y_position + 0.4, 492 xmin=zoom[0] - 0.5, 493 xmax=zoom[1] + 0.5, 494 color='black', 495 linestyle='-', 496 zorder=0, 497 linewidth=0.75 498 ) 499 500 # plot the basic shape per sequence with gaps 501 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, 502 cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color, 503 show_gaps) 504 505 # find consecutive stretches 506 stretches = _find_stretches(row) 507 # create polygons per stretch 508 _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors) 509 510 # add sequence text 511 if show_sequence_all or show_similarity_sequence: 512 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax, 513 zoom, y_position, 1, reference_color, show_gaps, cmap) 514 515 # Create the LineCollection: each segment is drawn in a single call. 516 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 517 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 518 519 # legend 520 if show_cbar: 521 cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 522 cbar.set_ticks([min_value, max_value]) 523 cbar.set_ticklabels(['low', 'high']) 524 525 # format seq names 526 _seq_names(aln, ax, custom_seq_names, show_seq_names) 527 528 # configure axis 529 ax.set_ylim(0, len(aln.alignment)) 530 if show_title: 531 ax.set_title('similarity', loc='left') 532 _format_x_axis(aln, ax, show_x_label, show_left=False)
Generates a similarity alignment overview plot. Importantly the similarity values are normalized!
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65
- show_title: whether to show title
- show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues
- show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
- show_seq_names: whether to show seq names
- custom_seq_names: custom seq names
- reference_color: color of reference sequence
- cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
- show_gaps: whether to show gaps otherwise it will be ignored
- fancy_gaps: show gaps with a small black bar
- show_x_label: whether to show x label
- show_cbar: whether to show the legend - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
- cbar_fraction: fraction of the original ax reserved for the legend
560def stat_plot(aln: explore.MSA, ax: plt.Axes, stat_type: str, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True): 561 """ 562 Generate a plot for the various alignment stats. 563 :param aln: alignment MSA class 564 :param ax: matplotlib axes 565 :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65) 566 :param line_color: color of the line 567 :param line_width: width of the line 568 :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids 569 :param show_x_label: whether to show the x-axis label 570 :param show_title: whether to show the title 571 """ 572 573 # define possible functions to calc here 574 stat_functions: Dict[str, Callable[[], list | ndarray]] = { 575 'gc': aln.calc_gc, 576 'entropy': aln.calc_entropy, 577 'coverage': aln.calc_coverage, 578 'identity': aln.calc_identity_alignment, 579 'similarity': aln.calc_similarity_alignment, 580 'ts tv score': aln.calc_transition_transversion_score 581 } 582 583 if stat_type not in stat_functions: 584 raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys()))) 585 586 # input check 587 _validate_input_parameters(aln, ax) 588 if not is_color_like(line_color): 589 raise ValueError('line color is not a color') 590 591 # validate rolling average 592 if rolling_average < 1 or rolling_average > aln.length: 593 raise ValueError('rolling_average must be between 1 and length of sequence') 594 595 # generate input data 596 array = stat_functions[stat_type]() 597 598 if stat_type == 'identity': 599 min_value, max_value = -1, 0 600 elif stat_type == 'ts tv score': 601 min_value, max_value = -1, 1 602 else: 603 min_value, max_value = 0, 1 604 if stat_type in ['identity', 'similarity']: 605 # for the mean nan values get handled as the lowest possible number in the matrix 606 array = np.nan_to_num(array, True, min_value) 607 array = np.mean(array, axis=0) 608 data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length) 609 610 # plot the data 611 ax.fill_between( 612 # this add dummy data left and right for better plotting 613 # otherwise only half of the step is shown 614 np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx, 615 np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data, 616 min_value, 617 linewidth = line_width, 618 edgecolor=line_color, 619 step='mid' if rolling_average == 1 else None, 620 facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none' 621 ) 622 if stat_type == 'gc': 623 ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1) 624 625 # format axis 626 ax.set_ylim(min_value, max_value*0.1+max_value) 627 ax.set_yticks([min_value, max_value]) 628 if stat_type == 'gc': 629 ax.set_yticklabels(['0', '100']) 630 elif stat_type == 'ts tv score': 631 ax.set_yticklabels(['tv', 'ts']) 632 else: 633 ax.set_yticklabels(['low', 'high']) 634 635 # show title 636 if show_title: 637 ax.set_title( 638 f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position', 639 loc='left' 640 ) 641 642 _format_x_axis(aln, ax, show_x_label, show_left=True)
Generate a plot for the various alignment stats.
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65)
- line_color: color of the line
- line_width: width of the line
- rolling_average: average rolling window size left and right of a position in nucleotides or amino acids
- show_x_label: whether to show the x-axis label
- show_title: whether to show the title
645def variant_plot(aln: explore.MSA, ax: plt.Axes, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)): 646 """ 647 Plots variants. 648 :param aln: alignment MSA class 649 :param ax: matplotlib axes 650 :param lollisize: (stem_size, head_size) 651 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 652 :param show_x_label: whether to show the x-axis label 653 :param show_legend: whether to show the legend 654 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 655 """ 656 657 # validate input 658 _validate_input_parameters(aln, ax) 659 if not isinstance(lollisize, tuple) or len(lollisize) != 2: 660 raise ValueError('lollisize must be tuple of length 2 (stem, head)') 661 for _size in lollisize: 662 if not isinstance(_size, float | int) or _size <= 0: 663 raise ValueError('lollisize must be floats greater than zero') 664 665 # define colors 666 colors = config.CHAR_COLORS[aln.aln_type][color_scheme] 667 # get snps 668 snps = aln.get_snps() 669 # define where to plot (each ref type gets a separate line) 670 ref_y_positions, y_pos, detected_var = {}, 0, set() 671 672 # iterate over snp dict 673 for pos in snps['POS']: 674 for identifier in snps['POS'][pos]: 675 # fill in y pos dict 676 if identifier == 'ref': 677 if snps['POS'][pos]['ref'] not in ref_y_positions: 678 ref_y_positions[snps['POS'][pos]['ref']] = y_pos 679 y_pos += 1.1 680 continue 681 # plot 682 if identifier == 'ALT': 683 for alt in snps['POS'][pos]['ALT']: 684 ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos, 685 ymin=ref_y_positions[snps['POS'][pos]['ref']], 686 ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 687 color=colors[alt], 688 zorder=100, 689 linewidth=lollisize[0] 690 ) 691 ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos, 692 ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 693 color=colors[alt], 694 marker='o', 695 markersize=lollisize[1] 696 ) 697 detected_var.add(alt) 698 699 # plot hlines 700 for y_char in ref_y_positions: 701 ax.hlines( 702 ref_y_positions[y_char], 703 xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5, 704 xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5, 705 color='black', 706 linestyle='-', 707 zorder=0, 708 linewidth=0.75 709 ) 710 # create a custom legend 711 if show_legend: 712 custom_legend = [ 713 ax.add_line( 714 plt.Line2D( 715 [], 716 [], 717 color=colors[char], 718 marker='o', 719 linestyle='', 720 markersize=5 721 ) 722 ) for char in colors if char in detected_var 723 ] 724 ax.legend( 725 custom_legend, 726 [char for char in colors if char in detected_var], # ensures correct sorting 727 loc='lower right', 728 title='variant', 729 bbox_to_anchor=bbox_to_anchor, 730 ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var), 731 frameon=False 732 ) 733 734 # format axis 735 _format_x_axis(aln, ax, show_x_label, show_left=False) 736 ax.spines['left'].set_visible(False) 737 ax.set_yticks([ref_y_positions[x] for x in ref_y_positions]) 738 ax.set_yticklabels(ref_y_positions.keys()) 739 ax.set_ylim(0, y_pos) 740 ax.set_ylabel('reference')
Plots variants.
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- lollisize: (stem_size, head_size)
- color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
- show_x_label: whether to show the x-axis label
- show_legend: whether to show the legend
- bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
743def orf_plot(aln: explore.MSA, ax: plt.Axes, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1): 744 """ 745 Plot conserved ORFs. 746 :param aln: alignment MSA class 747 :param ax: matplotlib axes 748 :param min_length: minimum length of orf 749 :param non_overlapping_orfs: whether to consider overlapping orfs 750 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 751 :param direction_marker_size: marker size for direction marker, not shown if marker_size == None 752 :param show_x_label: whether to show the x-axis label 753 :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 754 :param cbar_fraction: fraction of the original ax reserved for the colorbar 755 """ 756 757 # normalize colorbar 758 cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap)) 759 760 # validate input 761 _validate_input_parameters(aln, ax) 762 763 # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only 764 # searched within the zoomed region) 765 aln_temp = deepcopy(aln) 766 aln_temp.zoom = None 767 if non_overlapping_orfs: 768 annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length) 769 else: 770 annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length) 771 772 # filter dict for zoom 773 if aln.zoom is not None: 774 annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])} 775 776 # add track for plotting 777 _add_track_positions(annotation_dict) 778 779 # plot 780 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap) 781 782 # legend 783 if show_cbar: 784 cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 785 cbar.set_label('% identity') 786 cbar.set_ticks([0, 100]) 787 788 # format axis 789 _format_x_axis(aln, ax, show_x_label, show_left=False) 790 ax.set_ylim(bottom=0.8) 791 ax.set_yticks([]) 792 ax.set_yticklabels([]) 793 ax.set_title('conserved orfs', loc='left')
Plot conserved ORFs.
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- min_length: minimum length of orf
- non_overlapping_orfs: whether to consider overlapping orfs
- cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
- direction_marker_size: marker size for direction marker, not shown if marker_size == None
- show_x_label: whether to show the x-axis label
- show_cbar: whether to show the colorbar - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
- cbar_fraction: fraction of the original ax reserved for the colorbar
796def annotation_plot(aln: explore.MSA, annotation: explore.Annotation | str, ax: plt.Axes, feature_to_plot: str, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False): 797 """ 798 Plot annotations from bed, gff or gb files. Are automatically mapped to alignment. 799 :param aln: alignment MSA class 800 :param annotation: annotation class | path to annotation file 801 :param ax: matplotlib axes 802 :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature) 803 :param color: color for the annotation 804 :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True 805 :param show_x_label: whether to show the x-axis label 806 """ 807 # helper function 808 def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation: 809 """ 810 Parse annotation. 811 :param path: path to annotation 812 :param msa: msa object 813 :return: parsed annotation 814 """ 815 if os.path.exists(path): 816 # reset zoom so the annotation is correctly parsed 817 msa_temp = deepcopy(msa) 818 msa_temp.zoom = None 819 return explore.Annotation(msa_temp, path) 820 else: 821 raise FileNotFoundError() 822 823 # parse from path 824 if type(annotation) is str: 825 annotation = parse_annotation_from_string(annotation, aln) 826 827 # validate input 828 _validate_input_parameters(aln, ax, annotation) 829 if not is_color_like(color): 830 raise ValueError(f'{color} for reference is not a color') 831 832 # ignore features to plot for bed files (here it is written into one feature) 833 if annotation.ann_type == 'bed': 834 annotation_dict = annotation.features['region'] 835 feature_to_plot = 'bed regions' 836 else: 837 # try to subset the annotation dict 838 try: 839 annotation_dict = annotation.features[feature_to_plot] 840 except KeyError: 841 raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.') 842 843 # plotting and formating 844 _add_track_positions(annotation_dict) 845 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color) 846 _format_x_axis(aln, ax, show_x_label, show_left=False) 847 ax.set_ylim(bottom=0.8) 848 ax.set_yticks([]) 849 ax.set_yticklabels([]) 850 ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left')
Plot annotations from bed, gff or gb files. Are automatically mapped to alignment.
Parameters
- aln: alignment MSA class
- annotation: annotation class | path to annotation file
- ax: matplotlib axes
- feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature)
- color: color for the annotation
- direction_marker_size: marker size for direction marker, only relevant if show_direction is True
- show_x_label: whether to show the x-axis label
853def sequence_logo(aln:explore.MSA, ax:plt.Axes, color_scheme: str = 'standard', plot_type: str = 'logo', show_x_label:bool = False): 854 """ 855 Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The 856 logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency 857 is scaled to the information content at each position. --> identical to how Geneious calculates it. 858 859 :param aln: alignment MSA class 860 :param ax: matplotlib axes 861 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 862 :param plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot 863 :param show_x_label: whether to show the x-axis label 864 """ 865 866 _validate_input_parameters(aln, ax) 867 # calc matrix 868 matrix = aln.calc_position_matrix('IC') * aln.calc_position_matrix('PPM') 869 letters_to_plot = list(config.CHAR_COLORS[aln.aln_type]['standard'].keys())[:-1] 870 871 #matrix = np.clip(matrix, 0, None) # only positive values 872 # plot 873 if plot_type == 'logo': 874 for pos in range(matrix.shape[1]): 875 # sort the positive matrix row values by size 876 items = [(letters_to_plot[i], matrix[i, pos]) for i in range(len(letters_to_plot)) if matrix[i, pos] > 0] 877 items.sort(key=lambda x: x[1]) 878 # plot each position 879 y_offset = 0 880 for letter, h in items: 881 tp = TextPath((aln.zoom[0] - 0.325 if aln.zoom is not None else - 0.325, 0), letter, size=1, prop=FontProperties(weight='bold')) 882 bb = tp.get_extents() 883 glyph_height = bb.height if bb.height > 0 else 1e-6 # avoid div by zero 884 scale_to_1 = 1.0 / glyph_height 885 886 transform = (Affine2D() 887 .scale(1.0, h * scale_to_1) # scale manually by IC and normalize font 888 .translate(pos, y_offset) 889 + ax.transData) 890 891 patch = PathPatch(tp, transform=transform, 892 facecolor=config.CHAR_COLORS[aln.aln_type][color_scheme][letter], 893 edgecolor='none') 894 ax.add_patch(patch) 895 y_offset += h 896 elif plot_type == 'stacked': 897 y_values = np.zeros(matrix.shape[1]) 898 x_values = np.arange(0, matrix.shape[1]) if aln.zoom is None else np.arange(aln.zoom[0], aln.zoom[1]) 899 for row in range(matrix.shape[0]): 900 y = matrix[row] 901 ax.fill_between(x_values, 902 y_values, 903 y_values + y, 904 fc=config.CHAR_COLORS[aln.aln_type][color_scheme].get(letters_to_plot[row]), 905 ec='None', 906 label=letters_to_plot[row], 907 step='pre') 908 y_values += y 909 910 # adjust limits & labels 911 _format_x_axis(aln, ax, show_x_label, show_left=True) 912 if aln.aln_type == 'AA': 913 ax.set_ylim(bottom=0, top=5) 914 else: 915 ax.set_ylim(bottom=0, top=2) 916 ax.spines['top'].set_visible(False) 917 ax.spines['right'].set_visible(False) 918 ax.set_ylabel('bits')
Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency is scaled to the information content at each position. --> identical to how Geneious calculates it.
Parameters
- aln: alignment MSA class
- ax: matplotlib axes
- color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
- plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot
- show_x_label: whether to show the x-axis label