msaexplorer.draw
The draw module
The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a
matplotlib axes
has to be passed to the plotting function.
Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments. The functions will raise the appropriate exception in such a case.
Functions
1r""" 2# The draw module 3 4The draw module lets you draw alignments and statistic plots such as SNPs, ORFs, entropy and much more. For each plot a 5`matplotlib axes` has to be passed to the plotting function. 6 7Importantly some of the plotting features can only be accessed for nucleotide alignments but not for amino acid alignments. 8The functions will raise the appropriate exception in such a case. 9 10## Functions 11 12""" 13import pathlib 14# built-in 15from itertools import chain 16from typing import Callable, Dict 17from copy import deepcopy 18import os 19 20import matplotlib 21from numpy import ndarray 22 23# MSAexplorer 24from msaexplorer import explore, config 25 26# libs 27import numpy as np 28import matplotlib.pyplot as plt 29import matplotlib.patches as patches 30from matplotlib.cm import ScalarMappable 31from matplotlib.colors import is_color_like, Normalize, to_rgba 32from matplotlib.collections import PatchCollection, PolyCollection 33from matplotlib.text import TextPath 34from matplotlib.patches import PathPatch 35from matplotlib.font_manager import FontProperties 36from matplotlib.transforms import Affine2D 37 38 39# general helper functions 40def _validate_input_parameters(aln: explore.MSA, ax: plt.Axes, annotation: explore.Annotation | None = None): 41 """ 42 Validate MSA class and axis. 43 """ 44 if not isinstance(aln, explore.MSA): 45 raise ValueError('alignment has to be an MSA class. use explore.MSA() to read in alignment') 46 if ax is not None and not isinstance(ax, plt.Axes): 47 raise ValueError('ax has to be an matplotlib axis') 48 if annotation is not None: 49 if not isinstance(annotation, explore.Annotation): 50 raise ValueError('annotation has to be an annotation class. use explore.Annotation() to read in annotation') 51 52 53def _format_x_axis(aln: explore.MSA, ax: plt.Axes, show_x_label: bool, show_left: bool): 54 """ 55 General axis formatting. 56 """ 57 ax.set_xlim( 58 (aln.zoom[0] - 0.5, aln.zoom[0] + aln.length - 0.5) if aln.zoom is not None else (-0.5, aln.length - 0.5) 59 ) 60 ax.spines['top'].set_visible(False) 61 ax.spines['right'].set_visible(False) 62 if show_x_label: 63 ax.set_xlabel('alignment position') 64 if not show_left: 65 ax.spines['left'].set_visible(False) 66 67 68def _find_stretches(row, non_nan_only=False) -> list[tuple[int, int, int]] | list[tuple[int, int]] : 69 """ 70 Finds consecutive stretches of values in an array, with an option to exclude NaN stretches. 71 72 :param row: NumPy array of values 73 :param non_nan_only: If True, only returns stretches of non-NaN values. 74 :return: List of stretches (start, end, value at start) for all values or non-NaN values only. 75 """ 76 if row.size == 0: 77 return [] 78 79 if non_nan_only: 80 # Create a boolean mask for non-NaN values 81 non_nan_mask = ~np.isnan(row) 82 # Find changes in the mask 83 changes = np.diff(non_nan_mask.astype(int)) != 0 84 change_idx = np.nonzero(changes)[0] 85 starts = np.concatenate(([0], change_idx + 1)) 86 ends = np.concatenate((change_idx, [len(row) - 1])) 87 88 # Return only stretches that start with non-NaN values 89 return [(start, end) for start, end in zip(starts, ends) if non_nan_mask[start]] 90 else: 91 # Find change points: where adjacent cells differ. 92 changes = np.diff(row) != 0 93 change_idx = np.nonzero(changes)[0] 94 starts = np.concatenate(([0], change_idx + 1)) 95 ends = np.concatenate((change_idx, [len(row) - 1])) 96 97 return [(start, end, row[start]) for start, end in zip(starts, ends)] 98 99 100def _seq_names(aln: explore.MSA, ax: plt.Axes, custom_seq_names: tuple, show_seq_names: bool): 101 """ 102 Validate custom names and set show names to True. Format axis accordingly. 103 """ 104 if custom_seq_names: 105 show_seq_names = True 106 if not isinstance(custom_seq_names, tuple): 107 raise ValueError('configure your custom names list: custom_names=(name1, name2...)') 108 if len(custom_seq_names) != len(aln.alignment.keys()): 109 raise ValueError('length of sequences not equal to number of custom names') 110 if show_seq_names: 111 ax.yaxis.set_ticks_position('none') 112 ax.set_yticks(np.arange(len(aln.alignment)) + 0.4) 113 if custom_seq_names: 114 ax.set_yticklabels(custom_seq_names[::-1]) 115 else: 116 names = [x.split(' ')[0] for x in list(aln.alignment.keys())[::-1]] 117 ax.set_yticklabels(names) 118 else: 119 ax.set_yticks([]) 120 121 122def _create_identity_patch(row, aln: explore.MSA, col: list, zoom: tuple[int, int], y_position: float | int, reference_color: str, seq_name: str, identity_color: str | ndarray, show_gaps: bool): 123 """ 124 Creates the initial patch. 125 """ 126 if show_gaps: 127 # plot a rectangle for parts that do not have gaps 128 for stretch in _find_stretches(row, True): 129 col.append(patches.Rectangle((stretch[0] + zoom[0] - 0.5, y_position), stretch[1] - stretch[0] + 1, 0.8, 130 facecolor=reference_color if seq_name == aln.reference_id else identity_color 131 ) 132 133 ) 134 # just plot a rectangle 135 else: 136 col.append(patches.Rectangle((zoom[0] - 0.5, y_position), zoom[1] - zoom[0], 0.8, 137 facecolor=reference_color if seq_name == aln.reference_id else identity_color 138 ) 139 ) 140 141 142def _create_polygons(stretches: list, identity_values: list | ndarray, zoom: tuple, y_position: int, polygons: list, aln_colors: dict | ScalarMappable, polygon_colors: list, detected_identity_values: set | None = None): 143 """ 144 create the individual polygons 145 """ 146 147 for start, end, value in stretches: 148 if value not in identity_values: 149 continue 150 if detected_identity_values is not None: 151 detected_identity_values.add(value) 152 width = end + 1 - start 153 # Calculate x coordinates adjusted for zoom and centering 154 x0 = start + zoom[0] - 0.5 155 x1 = x0 + width 156 # Define the rectangle corners 157 rect_coords = [ 158 (x0, y_position), 159 (x1, y_position), 160 (x1, y_position + 0.8), 161 (x0, y_position + 0.8), 162 (x0, y_position) 163 ] 164 polygons.append(rect_coords) 165 if type(aln_colors) != ScalarMappable: 166 polygon_colors.append(aln_colors[value]['color']) 167 else: 168 polygon_colors.append(aln_colors.to_rgba(value)) 169 170 171def _plot_annotation(annotation_dict: dict, ax: plt.Axes, direction_marker_size: int | None, color: str | ScalarMappable): 172 """ 173 Plot annotations 174 :param annotation_dict: dict of annotations 175 :param ax: matplotlib Axes 176 :param direction_marker_size: size of marker 177 :param color: color of annotation (color or scalar) 178 """ 179 for annotation in annotation_dict: 180 for locations in annotation_dict[annotation]['location']: 181 x_value = locations[0] 182 length = locations[1] - locations[0] 183 ax.add_patch( 184 patches.FancyBboxPatch( 185 (x_value, annotation_dict[annotation]['track'] + 1), 186 length, 187 0.8, 188 boxstyle="Round, pad=0", 189 ec="black", 190 fc=color.to_rgba(annotation_dict[annotation]['conservation']) if isinstance(color, ScalarMappable) else color, 191 ) 192 ) 193 if direction_marker_size is not None: 194 if annotation_dict[annotation]['strand'] == '-': 195 marker = '<' 196 else: 197 marker = '>' 198 ax.plot(x_value + length/2, annotation_dict[annotation]['track'] + 1.4, marker=marker, markersize=direction_marker_size, color='white', markeredgecolor='black') 199 200 # plot linked annotations (such as splicing) 201 if len(annotation_dict[annotation]['location']) > 1: 202 y_value = annotation_dict[annotation]['track'] + 1.4 203 start = None 204 for locations in annotation_dict[annotation]['location']: 205 if start is None: 206 start = locations[1] 207 continue 208 ax.plot([start, locations[0]], [y_value, y_value], '--', linewidth=2, color='black') 209 start = locations[1] 210 211 212def _add_track_positions(annotation_dic): 213 # create a dict and sort 214 annotation_dic = dict(sorted(annotation_dic.items(), key=lambda x: x[1]['location'][0][0])) 215 216 # remember for each track the largest stop 217 track_stops = [0] 218 219 for ann in annotation_dic: 220 flattened_locations = list(chain.from_iterable(annotation_dic[ann]['location'])) # flatten list 221 track = 0 222 # check if a start of a gene is smaller than the stop of the current track 223 # -> move to new track 224 while flattened_locations[0] < track_stops[track]: 225 track += 1 226 # if all prior tracks are potentially causing an overlap 227 # create a new track and break 228 if len(track_stops) <= track: 229 track_stops.append(0) 230 break 231 # in the current track remember the stop of the current gene 232 track_stops[track] = flattened_locations[-1] 233 # and indicate the track in the dict 234 annotation_dic[ann]['track'] = track 235 236 return annotation_dic 237 238 239def _get_contrast_text_color(rgba_color): 240 """ 241 compute the brightness of a color 242 """ 243 r, g, b, a = rgba_color 244 brightness = (r * 299 + g * 587 + b * 114) / 1000 245 246 return 'white' if brightness < 0.5 else 'black' 247 248 249def _plot_sequence_text(aln: explore.MSA, seq_name: str, ref_name: str, always_text: bool, values: ndarray, matrix: ndarray, ax:plt.Axes, zoom: tuple, y_position:int, value_to_skip: int, ref_color:str, show_gaps: bool, cmap: None | ScalarMappable = None, colorscheme: None | dict = None): 250 """ 251 Plot sequence text - however this will be done even if there is not enough space. 252 Might need some rework in the future. 253 """ 254 x_text = 0 255 if seq_name == ref_name: 256 different_cols = np.any((matrix != value_to_skip) & ~np.isnan(matrix), axis=0) 257 else: 258 different_cols = [False]*aln.length 259 260 for idx, (character, value) in enumerate(zip(aln.alignment[seq_name], values)): 261 if value != value_to_skip and character != '-' or seq_name == ref_name and character != '-' or character == '-'and not show_gaps or always_text and character != '-': 262 # text color 263 if seq_name == ref_name: 264 text_color = _get_contrast_text_color(to_rgba(ref_color)) 265 elif cmap is not None: 266 text_color = _get_contrast_text_color(cmap.to_rgba(value)) 267 else: 268 text_color = _get_contrast_text_color(to_rgba(colorscheme[value]['color'])) 269 270 ax.text( 271 x=x_text + zoom[0] if zoom is not None else x_text, 272 y=y_position + 0.4, 273 s=character, 274 fontweight='bold' if different_cols[idx] else 'normal', 275 ha='center', 276 va='center_baseline', 277 c=text_color if value != value_to_skip or seq_name == ref_name else 'dimgrey' 278 ) 279 x_text += 1 280 281 282def identity_alignment(aln: explore.MSA | str, ax: plt.Axes | None = None, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)): 283 """ 284 Generates an identity alignment overview plot. 285 :param aln: alignment MSA class or path 286 :param ax: matplotlib axes 287 :param show_title: whether to show title 288 :param show_seq_names: whether to show seq names 289 :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues 290 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 291 :param custom_seq_names: custom seq names 292 :param reference_color: color of reference sequence 293 :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch 294 :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch 295 :param fancy_gaps: show gaps with a small black bar 296 :param show_mismatches: whether to show mismatches otherwise it will be shown as match 297 :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences 298 :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity 299 :param show_x_label: whether to show x label 300 :param show_legend: whether to show the legend 301 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 302 """ 303 304 # Validate inputs and colors 305 if type(aln) is str: 306 if os.path.exists(aln): 307 aln = explore.MSA(aln) 308 else: 309 raise FileNotFoundError(f'File {aln} does not exist') 310 311 _validate_input_parameters(aln, ax) 312 if ax is None: 313 ax = plt.gca() 314 if not is_color_like(reference_color): 315 raise ValueError(f'{reference_color} for reference is not a color') 316 317 if color_scheme is not None: 318 if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']: 319 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity') 320 elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']: 321 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak') 322 323 # Both options for gaps work hand in hand 324 if fancy_gaps: 325 show_gaps = True 326 327 # Determine zoom 328 zoom = (0, aln.length) if aln.zoom is None else aln.zoom 329 330 # Set up color mapping and identity values 331 aln_colors = config.IDENTITY_COLORS.copy() 332 identity_values = [-1, -2, -3] # -1 = mismatch, -2 = mask, -3 ambiguity 333 if color_scheme is not None: 334 colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme] 335 identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping 336 # use the standard setting for the index (same as in aln.calc_identity_alignment) 337 # and map the corresponding color scheme to it 338 for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']): 339 aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]} 340 341 # Compute identity alignment 342 identity_aln = aln.calc_identity_alignment( 343 encode_mask=show_mask, 344 encode_gaps=show_gaps, 345 encode_mismatches=show_mismatches, 346 encode_ambiguities=show_ambiguities, 347 encode_each_mismatch_char=True if color_scheme is not None else False 348 ) 349 350 # List to store polygons 351 detected_identity_values = {0} 352 polygons, polygon_colors, patch_list = [], [], [] 353 354 for i, seq_name in enumerate(aln.alignment): 355 y_position = len(aln.alignment) - i - 1 356 row = identity_aln[i] 357 358 # plot a line below everything 359 if fancy_gaps: 360 ax.hlines( 361 y_position + 0.4, 362 xmin=zoom[0] - 0.5, 363 xmax=zoom[1] + 0.5, 364 color='black', 365 linestyle='-', 366 zorder=0, 367 linewidth=0.75 368 ) 369 370 # plot the basic shape per sequence with gaps 371 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps) 372 373 # find consecutive stretches 374 stretches = _find_stretches(row) 375 # create polygons per stretch 376 _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values) 377 378 # add sequence text 379 if show_identity_sequence or show_sequence_all: 380 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax, zoom, y_position, 0, reference_color, show_gaps, colorscheme=aln_colors) 381 382 # Create the LineCollection: each segment is drawn in a single call. 383 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 384 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 385 386 # custom legend 387 if show_legend: 388 if color_scheme is not None and color_scheme != 'standard': 389 for x in aln_colors: 390 for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]: 391 if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]: 392 aln_colors[x]['type'] = group 393 break 394 # create it 395 handels, labels, detected_groups = [], [], set() 396 for x in aln_colors: 397 if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups: 398 handels.append( 399 ax.add_line( 400 plt.Line2D( 401 [], 402 [], 403 color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][ 404 config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0] 405 ], 406 marker='s', 407 markeredgecolor='grey', 408 linestyle='', 409 markersize=10)) 410 ) 411 labels.append(aln_colors[x]['type']) 412 detected_groups.add(aln_colors[x]['type']) 413 414 # ncols 415 if color_scheme is None or aln.aln_type != 'AA': 416 ncols = len(detected_identity_values) 417 elif color_scheme == 'standard': 418 ncols = (len(detected_identity_values) + 1) / 2 419 else: 420 ncols = (len(detected_groups) + 1) / 2 421 422 # plot it 423 ax.legend( 424 handels, 425 labels, 426 loc='lower right', 427 bbox_to_anchor=bbox_to_anchor, 428 ncols=ncols, 429 frameon=False 430 ) 431 _seq_names(aln, ax, custom_seq_names, show_seq_names) 432 433 # configure axis 434 ax.set_ylim(0, len(aln.alignment)) 435 if show_title: 436 ax.set_title('identity', loc='left') 437 _format_x_axis(aln, ax, show_x_label, show_left=False) 438 439 return ax 440 441 442def similarity_alignment(aln: explore.MSA | str, ax: plt.Axes | None = None, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1): 443 """ 444 Generates a similarity alignment overview plot. Importantly the similarity values are normalized! 445 :param aln: alignment MSA class or path 446 :param ax: matplotlib axes 447 :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65 448 :param show_title: whether to show title 449 :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues 450 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 451 :param show_seq_names: whether to show seq names 452 :param custom_seq_names: custom seq names 453 :param reference_color: color of reference sequence 454 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 455 :param show_gaps: whether to show gaps otherwise it will be ignored 456 :param fancy_gaps: show gaps with a small black bar 457 :param show_x_label: whether to show x label 458 :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 459 :param cbar_fraction: fraction of the original ax reserved for the legend 460 """ 461 # input check 462 if type(aln) is str: 463 if os.path.exists(aln): 464 aln = explore.MSA(aln) 465 else: 466 raise FileNotFoundError(f'File {aln} does not exist') 467 _validate_input_parameters(aln, ax) 468 if ax is None: 469 ax = plt.gca() 470 # validate colors 471 if not is_color_like(reference_color): 472 raise ValueError(f'{reference_color} for reference is not a color') 473 474 # Both options for gaps work hand in hand 475 if fancy_gaps: 476 show_gaps = True 477 478 # define zoom to plot 479 if aln.zoom is None: 480 zoom = (0, aln.length) 481 else: 482 zoom = aln.zoom 483 484 # get data 485 similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type) # use normalized values here 486 similarity_aln = similarity_aln.round(2) # round data for good color mapping 487 488 # determine min max values of the underlying matrix and create cmap 489 min_value, max_value = 0, 1 490 cmap = ScalarMappable( 491 norm=Normalize( 492 vmin=min_value, 493 vmax=max_value 494 ), 495 cmap=plt.get_cmap(cmap) 496 ) 497 498 # create similarity values 499 similarity_values = np.arange(start=min_value, stop=max_value, step=0.01) 500 # round it to be absolutely sure that values match with rounded sim alignment 501 similarity_values = similarity_values.round(2) 502 503 # create plot 504 polygons, polygon_colors, patch_list = [], [], [] 505 506 for i, seq_name in enumerate(aln.alignment): 507 y_position = len(aln.alignment) - i - 1 508 row = similarity_aln[i] 509 510 # plot a line below everything 511 if fancy_gaps: 512 ax.hlines( 513 y_position + 0.4, 514 xmin=zoom[0] - 0.5, 515 xmax=zoom[1] + 0.5, 516 color='black', 517 linestyle='-', 518 zorder=0, 519 linewidth=0.75 520 ) 521 522 # plot the basic shape per sequence with gaps 523 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, 524 cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color, 525 show_gaps) 526 527 # find consecutive stretches 528 stretches = _find_stretches(row) 529 # create polygons per stretch 530 _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors) 531 532 # add sequence text 533 if show_sequence_all or show_similarity_sequence: 534 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax, 535 zoom, y_position, 1, reference_color, show_gaps, cmap) 536 537 # Create the LineCollection: each segment is drawn in a single call. 538 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 539 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 540 541 # legend 542 if show_cbar: 543 cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 544 cbar.set_ticks([min_value, max_value]) 545 cbar.set_ticklabels(['low', 'high']) 546 547 # format seq names 548 _seq_names(aln, ax, custom_seq_names, show_seq_names) 549 550 # configure axis 551 ax.set_ylim(0, len(aln.alignment)) 552 if show_title: 553 ax.set_title('similarity', loc='left') 554 _format_x_axis(aln, ax, show_x_label, show_left=False) 555 556 return ax 557 558 559def _moving_average(arr: ndarray, window_size: int, zoom: tuple | None, aln_length: int) -> tuple[ndarray, ndarray]: 560 """ 561 Calculate the moving average of an array. 562 :param arr: array with values 563 :param window_size: size of the moving average 564 :param zoom: zoom of the alignment 565 :param aln_length: length of the alignment 566 :return: new array with moving average 567 """ 568 if window_size > 1: 569 i = 0 570 moving_averages, plotting_idx = [], [] 571 while i < len(arr) + 1: 572 half_window_size = window_size // 2 573 window_left = arr[i - half_window_size : i] if i > half_window_size else arr[0:i] 574 window_right = arr[i: i + half_window_size] if i < len(arr) - half_window_size else arr[i: len(arr)] 575 moving_averages.append((sum(window_left) + sum(window_right)) / (len(window_left) + len(window_right))) 576 plotting_idx.append(i) 577 i += 1 578 579 return np.array(moving_averages), np.array(plotting_idx) if zoom is None else np.array(plotting_idx) + zoom[0] 580 else: 581 return arr, np.arange(zoom[0], zoom[1]) if zoom is not None else np.arange(aln_length) 582 583 584def stat_plot(aln: explore.MSA | str, stat_type: str, ax: plt.Axes | None = None, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True): 585 """ 586 Generate a plot for the various alignment stats. 587 :param aln: alignment MSA class or path 588 :param ax: matplotlib axes 589 :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65) 590 :param line_color: color of the line 591 :param line_width: width of the line 592 :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids 593 :param show_x_label: whether to show the x-axis label 594 :param show_title: whether to show the title 595 """ 596 597 # define possible functions to calc here 598 stat_functions: Dict[str, Callable[[], list | ndarray]] = { 599 'gc': aln.calc_gc, 600 'entropy': aln.calc_entropy, 601 'coverage': aln.calc_coverage, 602 'identity': aln.calc_identity_alignment, 603 'similarity': aln.calc_similarity_alignment, 604 'ts tv score': aln.calc_transition_transversion_score 605 } 606 607 if stat_type not in stat_functions: 608 raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys()))) 609 610 # input check 611 if type(aln) is str: 612 if os.path.exists(aln): 613 aln = explore.MSA(aln) 614 else: 615 raise FileNotFoundError(f'File {aln} does not exist') 616 _validate_input_parameters(aln, ax) 617 if ax is None: 618 ax = plt.gca() 619 if not is_color_like(line_color): 620 raise ValueError('line color is not a color') 621 622 # validate rolling average 623 if rolling_average < 1 or rolling_average > aln.length: 624 raise ValueError('rolling_average must be between 1 and length of sequence') 625 626 # generate input data 627 array = stat_functions[stat_type]() 628 629 if stat_type == 'identity': 630 min_value, max_value = -1, 0 631 elif stat_type == 'ts tv score': 632 min_value, max_value = -1, 1 633 else: 634 min_value, max_value = 0, 1 635 if stat_type in ['identity', 'similarity']: 636 # for the mean nan values get handled as the lowest possible number in the matrix 637 array = np.nan_to_num(array, True, min_value) 638 array = np.mean(array, axis=0) 639 data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length) 640 641 # plot the data 642 ax.fill_between( 643 # this add dummy data left and right for better plotting 644 # otherwise only half of the step is shown 645 np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx, 646 np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data, 647 min_value, 648 linewidth = line_width, 649 edgecolor=line_color, 650 step='mid' if rolling_average == 1 else None, 651 facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none' 652 ) 653 if stat_type == 'gc': 654 ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1) 655 656 # format axis 657 ax.set_ylim(min_value, max_value*0.1+max_value) 658 ax.set_yticks([min_value, max_value]) 659 if stat_type == 'gc': 660 ax.set_yticklabels(['0', '100']) 661 elif stat_type == 'ts tv score': 662 ax.set_yticklabels(['tv', 'ts']) 663 else: 664 ax.set_yticklabels(['low', 'high']) 665 666 # show title 667 if show_title: 668 ax.set_title( 669 f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position', 670 loc='left' 671 ) 672 673 _format_x_axis(aln, ax, show_x_label, show_left=True) 674 675 return ax 676 677 678def variant_plot(aln: explore.MSA | str, ax: plt.Axes | None = None, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)): 679 """ 680 Plots variants. 681 :param aln: alignment MSA class or path 682 :param ax: matplotlib axes 683 :param lollisize: (stem_size, head_size) 684 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 685 :param show_x_label: whether to show the x-axis label 686 :param show_legend: whether to show the legend 687 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 688 """ 689 690 # validate input 691 if type(aln) is str: 692 if os.path.exists(aln): 693 aln = explore.MSA(aln) 694 else: 695 raise FileNotFoundError(f'File {aln} does not exist') 696 _validate_input_parameters(aln, ax) 697 if ax is None: 698 ax = plt.gca() 699 if not isinstance(lollisize, tuple) or len(lollisize) != 2: 700 raise ValueError('lollisize must be tuple of length 2 (stem, head)') 701 for _size in lollisize: 702 if not isinstance(_size, float | int) or _size <= 0: 703 raise ValueError('lollisize must be floats greater than zero') 704 705 # define colors 706 colors = config.CHAR_COLORS[aln.aln_type][color_scheme] 707 # get snps 708 snps = aln.get_snps() 709 # define where to plot (each ref type gets a separate line) 710 ref_y_positions, y_pos, detected_var = {}, 0, set() 711 712 # iterate over snp dict 713 for pos in snps['POS']: 714 for identifier in snps['POS'][pos]: 715 # fill in y pos dict 716 if identifier == 'ref': 717 if snps['POS'][pos]['ref'] not in ref_y_positions: 718 ref_y_positions[snps['POS'][pos]['ref']] = y_pos 719 y_pos += 1.1 720 continue 721 # plot 722 if identifier == 'ALT': 723 for alt in snps['POS'][pos]['ALT']: 724 ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos, 725 ymin=ref_y_positions[snps['POS'][pos]['ref']], 726 ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 727 color=colors[alt], 728 zorder=100, 729 linewidth=lollisize[0] 730 ) 731 ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos, 732 ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 733 color=colors[alt], 734 marker='o', 735 markersize=lollisize[1] 736 ) 737 detected_var.add(alt) 738 739 # plot hlines 740 for y_char in ref_y_positions: 741 ax.hlines( 742 ref_y_positions[y_char], 743 xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5, 744 xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5, 745 color='black', 746 linestyle='-', 747 zorder=0, 748 linewidth=0.75 749 ) 750 # create a custom legend 751 if show_legend: 752 custom_legend = [ 753 ax.add_line( 754 plt.Line2D( 755 [], 756 [], 757 color=colors[char], 758 marker='o', 759 linestyle='', 760 markersize=5 761 ) 762 ) for char in colors if char in detected_var 763 ] 764 ax.legend( 765 custom_legend, 766 [char for char in colors if char in detected_var], # ensures correct sorting 767 loc='lower right', 768 title='variant', 769 bbox_to_anchor=bbox_to_anchor, 770 ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var), 771 frameon=False 772 ) 773 774 # format axis 775 _format_x_axis(aln, ax, show_x_label, show_left=False) 776 ax.spines['left'].set_visible(False) 777 ax.set_yticks([ref_y_positions[x] for x in ref_y_positions]) 778 ax.set_yticklabels(ref_y_positions.keys()) 779 ax.set_ylim(0, y_pos) 780 ax.set_ylabel('reference') 781 782 return ax 783 784 785def orf_plot(aln: explore.MSA | str, ax: plt.Axes | None = None, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1): 786 """ 787 Plot conserved ORFs. 788 :param aln: alignment MSA class or path 789 :param ax: matplotlib axes 790 :param min_length: minimum length of orf 791 :param non_overlapping_orfs: whether to consider overlapping orfs 792 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 793 :param direction_marker_size: marker size for direction marker, not shown if marker_size == None 794 :param show_x_label: whether to show the x-axis label 795 :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 796 :param cbar_fraction: fraction of the original ax reserved for the colorbar 797 """ 798 799 # normalize colorbar 800 cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap)) 801 802 # validate input 803 if type(aln) is str: 804 if os.path.exists(aln): 805 aln = explore.MSA(aln) 806 else: 807 raise FileNotFoundError(f'File {aln} does not exist') 808 _validate_input_parameters(aln, ax) 809 _validate_input_parameters(aln, ax) 810 if ax is None: 811 ax = plt.gca() 812 813 # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only 814 # searched within the zoomed region) 815 aln_temp = deepcopy(aln) 816 aln_temp.zoom = None 817 if non_overlapping_orfs: 818 annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length) 819 else: 820 annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length) 821 822 # filter dict for zoom 823 if aln.zoom is not None: 824 annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])} 825 826 # add track for plotting 827 _add_track_positions(annotation_dict) 828 829 # plot 830 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap) 831 832 # legend 833 if show_cbar: 834 cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 835 cbar.set_label('% identity') 836 cbar.set_ticks([0, 100]) 837 838 # format axis 839 _format_x_axis(aln, ax, show_x_label, show_left=False) 840 ax.set_ylim(bottom=0.8) 841 ax.set_yticks([]) 842 ax.set_yticklabels([]) 843 ax.set_title('conserved orfs', loc='left') 844 845 return ax 846 847 848def annotation_plot(aln: explore.MSA | str, annotation: explore.Annotation | str, feature_to_plot: str, ax: plt.Axes | None = None, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False): 849 """ 850 Plot annotations from bed, gff or gb files. Are automatically mapped to alignment. 851 :param aln: alignment MSA class 852 :param annotation: annotation class | path to annotation file 853 :param ax: matplotlib axes 854 :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature) 855 :param color: color for the annotation 856 :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True 857 :param show_x_label: whether to show the x-axis label 858 """ 859 # helper function 860 def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation: 861 """ 862 Parse annotation. 863 :param path: path to annotation 864 :param msa: msa object 865 :return: parsed annotation 866 """ 867 if os.path.exists(path): 868 # reset zoom so the annotation is correctly parsed 869 msa_temp = deepcopy(msa) 870 msa_temp.zoom = None 871 return explore.Annotation(msa_temp, path) 872 else: 873 raise FileNotFoundError() 874 875 # validate input 876 if type(aln) is str: 877 if os.path.exists(aln): 878 aln = explore.MSA(aln) 879 else: 880 raise FileNotFoundError(f'File {aln} does not exist') 881 if type(annotation) is str: 882 annotation = parse_annotation_from_string(annotation, aln) 883 _validate_input_parameters(aln, ax, annotation) 884 if ax is None: 885 ax = plt.gca() 886 if not is_color_like(color): 887 raise ValueError(f'{color} for reference is not a color') 888 889 # ignore features to plot for bed files (here it is written into one feature) 890 if annotation.ann_type == 'bed': 891 annotation_dict = annotation.features['region'] 892 feature_to_plot = 'bed regions' 893 else: 894 # try to subset the annotation dict 895 try: 896 annotation_dict = annotation.features[feature_to_plot] 897 except KeyError: 898 raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.') 899 900 # plotting and formating 901 _add_track_positions(annotation_dict) 902 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color) 903 _format_x_axis(aln, ax, show_x_label, show_left=False) 904 ax.set_ylim(bottom=0.8) 905 ax.set_yticks([]) 906 ax.set_yticklabels([]) 907 ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left') 908 909 return ax 910 911 912def sequence_logo(aln:explore.MSA | str, ax:plt.Axes | None = None, color_scheme: str = 'standard', plot_type: str = 'stacked', show_x_label:bool = False): 913 """ 914 Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The 915 logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency 916 is scaled to the information content at each position. --> identical to how Geneious calculates it. 917 918 :param aln: alignment MSA class or path 919 :param ax: matplotlib axes 920 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 921 :param plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot 922 :param show_x_label: whether to show the x-axis label 923 """ 924 925 if type(aln) is str: 926 if os.path.exists(aln): 927 aln = explore.MSA(aln) 928 else: 929 raise FileNotFoundError(f'File {aln} does not exist') 930 _validate_input_parameters(aln, ax) 931 if ax is None: 932 ax = plt.gca() 933 # calc matrix 934 matrix = aln.calc_position_matrix('IC') * aln.calc_position_matrix('PPM') 935 letters_to_plot = list(config.CHAR_COLORS[aln.aln_type]['standard'].keys())[:-1] 936 937 #matrix = np.clip(matrix, 0, None) # only positive values 938 # plot 939 if plot_type == 'logo': 940 for pos in range(matrix.shape[1]): 941 # sort the positive matrix row values by size 942 items = [(letters_to_plot[i], matrix[i, pos]) for i in range(len(letters_to_plot)) if matrix[i, pos] > 0] 943 items.sort(key=lambda x: x[1]) 944 # plot each position 945 y_offset = 0 946 for letter, h in items: 947 tp = TextPath((aln.zoom[0] - 0.325 if aln.zoom is not None else - 0.325, 0), letter, size=1, prop=FontProperties(weight='bold')) 948 bb = tp.get_extents() 949 glyph_height = bb.height if bb.height > 0 else 1e-6 # avoid div by zero 950 scale_to_1 = 1.0 / glyph_height 951 952 transform = (Affine2D() 953 .scale(1.0, h * scale_to_1) # scale manually by IC and normalize font 954 .translate(pos, y_offset) 955 + ax.transData) 956 957 patch = PathPatch(tp, transform=transform, 958 facecolor=config.CHAR_COLORS[aln.aln_type][color_scheme][letter], 959 edgecolor='none') 960 ax.add_patch(patch) 961 y_offset += h 962 elif plot_type == 'stacked': 963 y_values = np.zeros(matrix.shape[1]) 964 x_values = np.arange(0, matrix.shape[1]) if aln.zoom is None else np.arange(aln.zoom[0], aln.zoom[1]) 965 for row in range(matrix.shape[0]): 966 y = matrix[row] 967 ax.fill_between(x_values, 968 y_values, 969 y_values + y, 970 fc=config.CHAR_COLORS[aln.aln_type][color_scheme].get(letters_to_plot[row]), 971 ec='None', 972 label=letters_to_plot[row], 973 step='mid') 974 y_values += y 975 976 # adjust limits & labels 977 _format_x_axis(aln, ax, show_x_label, show_left=True) 978 if aln.aln_type == 'AA': 979 ax.set_ylim(bottom=0, top=5) 980 else: 981 ax.set_ylim(bottom=0, top=2) 982 ax.spines['top'].set_visible(False) 983 ax.spines['right'].set_visible(False) 984 ax.set_ylabel('bits') 985 986 return ax
283def identity_alignment(aln: explore.MSA | str, ax: plt.Axes | None = None, show_title: bool = True, show_identity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', show_mask:bool = True, show_gaps:bool = True, fancy_gaps:bool = False, show_mismatches: bool = True, show_ambiguities: bool = False, color_scheme: str | None = None, show_x_label: bool = True, show_legend: bool = False, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int]= (1, 1)): 284 """ 285 Generates an identity alignment overview plot. 286 :param aln: alignment MSA class or path 287 :param ax: matplotlib axes 288 :param show_title: whether to show title 289 :param show_seq_names: whether to show seq names 290 :param show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues 291 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 292 :param custom_seq_names: custom seq names 293 :param reference_color: color of reference sequence 294 :param show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch 295 :param show_gaps: whether to show gaps otherwise it will be shown as match or mismatch 296 :param fancy_gaps: show gaps with a small black bar 297 :param show_mismatches: whether to show mismatches otherwise it will be shown as match 298 :param show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences 299 :param color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity 300 :param show_x_label: whether to show x label 301 :param show_legend: whether to show the legend 302 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 303 """ 304 305 # Validate inputs and colors 306 if type(aln) is str: 307 if os.path.exists(aln): 308 aln = explore.MSA(aln) 309 else: 310 raise FileNotFoundError(f'File {aln} does not exist') 311 312 _validate_input_parameters(aln, ax) 313 if ax is None: 314 ax = plt.gca() 315 if not is_color_like(reference_color): 316 raise ValueError(f'{reference_color} for reference is not a color') 317 318 if color_scheme is not None: 319 if aln.aln_type == 'AA' and color_scheme not in ['standard', 'clustal', 'zappo', 'hydrophobicity']: 320 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, clustal, zappo, hydrophobicity') 321 elif aln.aln_type in ['DNA', 'RNA'] and color_scheme not in ['standard', 'purine_pyrimidine', 'strong_weak']: 322 raise ValueError(f'{color_scheme} is not a supported coloring scheme for {aln.aln_type} alignments. Supported are: standard, purine_pyrimidine, strong_weak') 323 324 # Both options for gaps work hand in hand 325 if fancy_gaps: 326 show_gaps = True 327 328 # Determine zoom 329 zoom = (0, aln.length) if aln.zoom is None else aln.zoom 330 331 # Set up color mapping and identity values 332 aln_colors = config.IDENTITY_COLORS.copy() 333 identity_values = [-1, -2, -3] # -1 = mismatch, -2 = mask, -3 ambiguity 334 if color_scheme is not None: 335 colors_to_extend = config.CHAR_COLORS[aln.aln_type][color_scheme] 336 identity_values += [x + 1 for x in range(len(colors_to_extend))] # x+1 is needed to allow correct mapping 337 # use the standard setting for the index (same as in aln.calc_identity_alignment) 338 # and map the corresponding color scheme to it 339 for idx, char in enumerate(config.CHAR_COLORS[aln.aln_type]['standard']): 340 aln_colors[idx + 1] = {'type': char, 'color': colors_to_extend[char]} 341 342 # Compute identity alignment 343 identity_aln = aln.calc_identity_alignment( 344 encode_mask=show_mask, 345 encode_gaps=show_gaps, 346 encode_mismatches=show_mismatches, 347 encode_ambiguities=show_ambiguities, 348 encode_each_mismatch_char=True if color_scheme is not None else False 349 ) 350 351 # List to store polygons 352 detected_identity_values = {0} 353 polygons, polygon_colors, patch_list = [], [], [] 354 355 for i, seq_name in enumerate(aln.alignment): 356 y_position = len(aln.alignment) - i - 1 357 row = identity_aln[i] 358 359 # plot a line below everything 360 if fancy_gaps: 361 ax.hlines( 362 y_position + 0.4, 363 xmin=zoom[0] - 0.5, 364 xmax=zoom[1] + 0.5, 365 color='black', 366 linestyle='-', 367 zorder=0, 368 linewidth=0.75 369 ) 370 371 # plot the basic shape per sequence with gaps 372 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, aln_colors[0]['color'], show_gaps) 373 374 # find consecutive stretches 375 stretches = _find_stretches(row) 376 # create polygons per stretch 377 _create_polygons(stretches, identity_values, zoom, y_position, polygons, aln_colors, polygon_colors, detected_identity_values) 378 379 # add sequence text 380 if show_identity_sequence or show_sequence_all: 381 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, identity_aln[i], identity_aln, ax, zoom, y_position, 0, reference_color, show_gaps, colorscheme=aln_colors) 382 383 # Create the LineCollection: each segment is drawn in a single call. 384 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 385 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 386 387 # custom legend 388 if show_legend: 389 if color_scheme is not None and color_scheme != 'standard': 390 for x in aln_colors: 391 for group in config.CHAR_GROUPS[aln.aln_type][color_scheme]: 392 if aln_colors[x]['type'] in config.CHAR_GROUPS[aln.aln_type][color_scheme][group]: 393 aln_colors[x]['type'] = group 394 break 395 # create it 396 handels, labels, detected_groups = [], [], set() 397 for x in aln_colors: 398 if x in detected_identity_values and aln_colors[x]['type'] not in detected_groups: 399 handels.append( 400 ax.add_line( 401 plt.Line2D( 402 [], 403 [], 404 color=aln_colors[x]['color'] if color_scheme != 'hydrophobicity' or x == 0 else config.CHAR_COLORS[aln.aln_type]['hydrophobicity'][ 405 config.CHAR_GROUPS[aln.aln_type]['hydrophobicity'][aln_colors[x]['type']][0] 406 ], 407 marker='s', 408 markeredgecolor='grey', 409 linestyle='', 410 markersize=10)) 411 ) 412 labels.append(aln_colors[x]['type']) 413 detected_groups.add(aln_colors[x]['type']) 414 415 # ncols 416 if color_scheme is None or aln.aln_type != 'AA': 417 ncols = len(detected_identity_values) 418 elif color_scheme == 'standard': 419 ncols = (len(detected_identity_values) + 1) / 2 420 else: 421 ncols = (len(detected_groups) + 1) / 2 422 423 # plot it 424 ax.legend( 425 handels, 426 labels, 427 loc='lower right', 428 bbox_to_anchor=bbox_to_anchor, 429 ncols=ncols, 430 frameon=False 431 ) 432 _seq_names(aln, ax, custom_seq_names, show_seq_names) 433 434 # configure axis 435 ax.set_ylim(0, len(aln.alignment)) 436 if show_title: 437 ax.set_title('identity', loc='left') 438 _format_x_axis(aln, ax, show_x_label, show_left=False) 439 440 return ax
Generates an identity alignment overview plot.
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- show_title: whether to show title
- show_seq_names: whether to show seq names
- show_identity_sequence: whether to show sequence for only differences and reference - zoom in to avoid plotting issues
- show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
- custom_seq_names: custom seq names
- reference_color: color of reference sequence
- show_mask: whether to show N or X chars otherwise it will be shown as match or mismatch
- show_gaps: whether to show gaps otherwise it will be shown as match or mismatch
- fancy_gaps: show gaps with a small black bar
- show_mismatches: whether to show mismatches otherwise it will be shown as match
- show_ambiguities: whether to show non-N ambiguities -> only relevant for RNA/DNA sequences
- color_scheme: color mismatching chars with their unique color. Options for DNA/RNA are: standard, purine_pyrimidine, strong_weak; and for AS: standard, clustal, zappo, hydrophobicity
- show_x_label: whether to show x label
- show_legend: whether to show the legend
- bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
443def similarity_alignment(aln: explore.MSA | str, ax: plt.Axes | None = None, matrix_type: str | None = None, show_title: bool = True, show_similarity_sequence: bool = False, show_sequence_all: bool = False, show_seq_names: bool = False, custom_seq_names: tuple | list = (), reference_color: str = 'lightsteelblue', cmap: str = 'twilight_r', show_gaps:bool = True, fancy_gaps:bool = False, show_x_label: bool = True, show_cbar: bool = False, cbar_fraction: float = 0.1): 444 """ 445 Generates a similarity alignment overview plot. Importantly the similarity values are normalized! 446 :param aln: alignment MSA class or path 447 :param ax: matplotlib axes 448 :param matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65 449 :param show_title: whether to show title 450 :param show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues 451 :param show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues 452 :param show_seq_names: whether to show seq names 453 :param custom_seq_names: custom seq names 454 :param reference_color: color of reference sequence 455 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 456 :param show_gaps: whether to show gaps otherwise it will be ignored 457 :param fancy_gaps: show gaps with a small black bar 458 :param show_x_label: whether to show x label 459 :param show_cbar: whether to show the legend - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 460 :param cbar_fraction: fraction of the original ax reserved for the legend 461 """ 462 # input check 463 if type(aln) is str: 464 if os.path.exists(aln): 465 aln = explore.MSA(aln) 466 else: 467 raise FileNotFoundError(f'File {aln} does not exist') 468 _validate_input_parameters(aln, ax) 469 if ax is None: 470 ax = plt.gca() 471 # validate colors 472 if not is_color_like(reference_color): 473 raise ValueError(f'{reference_color} for reference is not a color') 474 475 # Both options for gaps work hand in hand 476 if fancy_gaps: 477 show_gaps = True 478 479 # define zoom to plot 480 if aln.zoom is None: 481 zoom = (0, aln.length) 482 else: 483 zoom = aln.zoom 484 485 # get data 486 similarity_aln = aln.calc_similarity_alignment(matrix_type=matrix_type) # use normalized values here 487 similarity_aln = similarity_aln.round(2) # round data for good color mapping 488 489 # determine min max values of the underlying matrix and create cmap 490 min_value, max_value = 0, 1 491 cmap = ScalarMappable( 492 norm=Normalize( 493 vmin=min_value, 494 vmax=max_value 495 ), 496 cmap=plt.get_cmap(cmap) 497 ) 498 499 # create similarity values 500 similarity_values = np.arange(start=min_value, stop=max_value, step=0.01) 501 # round it to be absolutely sure that values match with rounded sim alignment 502 similarity_values = similarity_values.round(2) 503 504 # create plot 505 polygons, polygon_colors, patch_list = [], [], [] 506 507 for i, seq_name in enumerate(aln.alignment): 508 y_position = len(aln.alignment) - i - 1 509 row = similarity_aln[i] 510 511 # plot a line below everything 512 if fancy_gaps: 513 ax.hlines( 514 y_position + 0.4, 515 xmin=zoom[0] - 0.5, 516 xmax=zoom[1] + 0.5, 517 color='black', 518 linestyle='-', 519 zorder=0, 520 linewidth=0.75 521 ) 522 523 # plot the basic shape per sequence with gaps 524 _create_identity_patch(row, aln, patch_list, zoom, y_position, reference_color, seq_name, 525 cmap.to_rgba(max_value) if seq_name != aln.reference_id else reference_color, 526 show_gaps) 527 528 # find consecutive stretches 529 stretches = _find_stretches(row) 530 # create polygons per stretch 531 _create_polygons(stretches, similarity_values, zoom, y_position, polygons, cmap, polygon_colors) 532 533 # add sequence text 534 if show_sequence_all or show_similarity_sequence: 535 _plot_sequence_text(aln, list(aln.alignment.keys())[i], aln.reference_id, show_sequence_all, similarity_aln[i], similarity_aln, ax, 536 zoom, y_position, 1, reference_color, show_gaps, cmap) 537 538 # Create the LineCollection: each segment is drawn in a single call. 539 ax.add_collection(PatchCollection(patch_list, match_original=True, linewidths='none', joinstyle='miter', capstyle='butt')) 540 ax.add_collection(PolyCollection(polygons, facecolors=polygon_colors, linewidths=0.5, edgecolors=polygon_colors)) 541 542 # legend 543 if show_cbar: 544 cbar = plt.colorbar(cmap, ax=ax, location= 'top', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 545 cbar.set_ticks([min_value, max_value]) 546 cbar.set_ticklabels(['low', 'high']) 547 548 # format seq names 549 _seq_names(aln, ax, custom_seq_names, show_seq_names) 550 551 # configure axis 552 ax.set_ylim(0, len(aln.alignment)) 553 if show_title: 554 ax.set_title('similarity', loc='left') 555 _format_x_axis(aln, ax, show_x_label, show_left=False) 556 557 return ax
Generates a similarity alignment overview plot. Importantly the similarity values are normalized!
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- matrix_type: substitution matrix - see config.SUBS_MATRICES, standard: NT - TRANS, AA - BLOSUM65
- show_title: whether to show title
- show_similarity_sequence: whether to show sequence only for differences and reference - zoom in to avoid plotting issues
- show_sequence_all: whether to show all sequences - zoom in to avoid plotting issues
- show_seq_names: whether to show seq names
- custom_seq_names: custom seq names
- reference_color: color of reference sequence
- cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
- show_gaps: whether to show gaps otherwise it will be ignored
- fancy_gaps: show gaps with a small black bar
- show_x_label: whether to show x label
- show_cbar: whether to show the legend - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
- cbar_fraction: fraction of the original ax reserved for the legend
585def stat_plot(aln: explore.MSA | str, stat_type: str, ax: plt.Axes | None = None, line_color: str = 'burlywood', line_width: int | float = 2, rolling_average: int = 20, show_x_label: bool = False, show_title: bool = True): 586 """ 587 Generate a plot for the various alignment stats. 588 :param aln: alignment MSA class or path 589 :param ax: matplotlib axes 590 :param stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65) 591 :param line_color: color of the line 592 :param line_width: width of the line 593 :param rolling_average: average rolling window size left and right of a position in nucleotides or amino acids 594 :param show_x_label: whether to show the x-axis label 595 :param show_title: whether to show the title 596 """ 597 598 # define possible functions to calc here 599 stat_functions: Dict[str, Callable[[], list | ndarray]] = { 600 'gc': aln.calc_gc, 601 'entropy': aln.calc_entropy, 602 'coverage': aln.calc_coverage, 603 'identity': aln.calc_identity_alignment, 604 'similarity': aln.calc_similarity_alignment, 605 'ts tv score': aln.calc_transition_transversion_score 606 } 607 608 if stat_type not in stat_functions: 609 raise ValueError('stat_type must be one of {}'.format(list(stat_functions.keys()))) 610 611 # input check 612 if type(aln) is str: 613 if os.path.exists(aln): 614 aln = explore.MSA(aln) 615 else: 616 raise FileNotFoundError(f'File {aln} does not exist') 617 _validate_input_parameters(aln, ax) 618 if ax is None: 619 ax = plt.gca() 620 if not is_color_like(line_color): 621 raise ValueError('line color is not a color') 622 623 # validate rolling average 624 if rolling_average < 1 or rolling_average > aln.length: 625 raise ValueError('rolling_average must be between 1 and length of sequence') 626 627 # generate input data 628 array = stat_functions[stat_type]() 629 630 if stat_type == 'identity': 631 min_value, max_value = -1, 0 632 elif stat_type == 'ts tv score': 633 min_value, max_value = -1, 1 634 else: 635 min_value, max_value = 0, 1 636 if stat_type in ['identity', 'similarity']: 637 # for the mean nan values get handled as the lowest possible number in the matrix 638 array = np.nan_to_num(array, True, min_value) 639 array = np.mean(array, axis=0) 640 data, plot_idx = _moving_average(array, rolling_average, aln.zoom, aln.length) 641 642 # plot the data 643 ax.fill_between( 644 # this add dummy data left and right for better plotting 645 # otherwise only half of the step is shown 646 np.concatenate(([plot_idx[0] - 0.5], plot_idx, [plot_idx[-1] + 0.5])) if rolling_average == 1 else plot_idx, 647 np.concatenate(([data[0]], data, [data[-1]])) if rolling_average == 1 else data, 648 min_value, 649 linewidth = line_width, 650 edgecolor=line_color, 651 step='mid' if rolling_average == 1 else None, 652 facecolor=(line_color, 0.6) if stat_type not in ['ts tv score', 'gc'] else 'none' 653 ) 654 if stat_type == 'gc': 655 ax.hlines(0.5, xmin=0, xmax=aln.zoom[0] + aln.length if aln.zoom is not None else aln.length, color='black', linestyles='--', linewidth=1) 656 657 # format axis 658 ax.set_ylim(min_value, max_value*0.1+max_value) 659 ax.set_yticks([min_value, max_value]) 660 if stat_type == 'gc': 661 ax.set_yticklabels(['0', '100']) 662 elif stat_type == 'ts tv score': 663 ax.set_yticklabels(['tv', 'ts']) 664 else: 665 ax.set_yticklabels(['low', 'high']) 666 667 # show title 668 if show_title: 669 ax.set_title( 670 f'{stat_type} (average over {rolling_average} positions)' if rolling_average > 1 else f'{stat_type} for each position', 671 loc='left' 672 ) 673 674 _format_x_axis(aln, ax, show_x_label, show_left=True) 675 676 return ax
Generate a plot for the various alignment stats.
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- stat_type: 'entropy', 'gc', 'coverage', 'ts/tv', 'identity' or 'similarity' -> (here default matrices are used NT - TRANS, AA - BLOSUM65)
- line_color: color of the line
- line_width: width of the line
- rolling_average: average rolling window size left and right of a position in nucleotides or amino acids
- show_x_label: whether to show the x-axis label
- show_title: whether to show the title
679def variant_plot(aln: explore.MSA | str, ax: plt.Axes | None = None, lollisize: tuple[int, int] | list[int, int] = (1, 3), color_scheme: str = 'standard', show_x_label: bool = False, show_legend: bool = True, bbox_to_anchor: tuple[float|int, float|int] | list[float|int, float|int] = (1, 1)): 680 """ 681 Plots variants. 682 :param aln: alignment MSA class or path 683 :param ax: matplotlib axes 684 :param lollisize: (stem_size, head_size) 685 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 686 :param show_x_label: whether to show the x-axis label 687 :param show_legend: whether to show the legend 688 :param bbox_to_anchor: bounding box coordinates for the legend - see: https://matplotlib.org/stable/api/legend_api.html 689 """ 690 691 # validate input 692 if type(aln) is str: 693 if os.path.exists(aln): 694 aln = explore.MSA(aln) 695 else: 696 raise FileNotFoundError(f'File {aln} does not exist') 697 _validate_input_parameters(aln, ax) 698 if ax is None: 699 ax = plt.gca() 700 if not isinstance(lollisize, tuple) or len(lollisize) != 2: 701 raise ValueError('lollisize must be tuple of length 2 (stem, head)') 702 for _size in lollisize: 703 if not isinstance(_size, float | int) or _size <= 0: 704 raise ValueError('lollisize must be floats greater than zero') 705 706 # define colors 707 colors = config.CHAR_COLORS[aln.aln_type][color_scheme] 708 # get snps 709 snps = aln.get_snps() 710 # define where to plot (each ref type gets a separate line) 711 ref_y_positions, y_pos, detected_var = {}, 0, set() 712 713 # iterate over snp dict 714 for pos in snps['POS']: 715 for identifier in snps['POS'][pos]: 716 # fill in y pos dict 717 if identifier == 'ref': 718 if snps['POS'][pos]['ref'] not in ref_y_positions: 719 ref_y_positions[snps['POS'][pos]['ref']] = y_pos 720 y_pos += 1.1 721 continue 722 # plot 723 if identifier == 'ALT': 724 for alt in snps['POS'][pos]['ALT']: 725 ax.vlines(x=pos + aln.zoom[0] if aln.zoom is not None else pos, 726 ymin=ref_y_positions[snps['POS'][pos]['ref']], 727 ymax=ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 728 color=colors[alt], 729 zorder=100, 730 linewidth=lollisize[0] 731 ) 732 ax.plot(pos + aln.zoom[0] if aln.zoom is not None else pos, 733 ref_y_positions[snps['POS'][pos]['ref']] + snps['POS'][pos]['ALT'][alt]['AF'], 734 color=colors[alt], 735 marker='o', 736 markersize=lollisize[1] 737 ) 738 detected_var.add(alt) 739 740 # plot hlines 741 for y_char in ref_y_positions: 742 ax.hlines( 743 ref_y_positions[y_char], 744 xmin=aln.zoom[0] - 0.5 if aln.zoom is not None else -0.5, 745 xmax=aln.zoom[0] + aln.length + 0.5 if aln.zoom is not None else aln.length + 0.5, 746 color='black', 747 linestyle='-', 748 zorder=0, 749 linewidth=0.75 750 ) 751 # create a custom legend 752 if show_legend: 753 custom_legend = [ 754 ax.add_line( 755 plt.Line2D( 756 [], 757 [], 758 color=colors[char], 759 marker='o', 760 linestyle='', 761 markersize=5 762 ) 763 ) for char in colors if char in detected_var 764 ] 765 ax.legend( 766 custom_legend, 767 [char for char in colors if char in detected_var], # ensures correct sorting 768 loc='lower right', 769 title='variant', 770 bbox_to_anchor=bbox_to_anchor, 771 ncols=len(detected_var)/2 if aln.aln_type == 'AA' else len(detected_var), 772 frameon=False 773 ) 774 775 # format axis 776 _format_x_axis(aln, ax, show_x_label, show_left=False) 777 ax.spines['left'].set_visible(False) 778 ax.set_yticks([ref_y_positions[x] for x in ref_y_positions]) 779 ax.set_yticklabels(ref_y_positions.keys()) 780 ax.set_ylim(0, y_pos) 781 ax.set_ylabel('reference') 782 783 return ax
Plots variants.
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- lollisize: (stem_size, head_size)
- color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
- show_x_label: whether to show the x-axis label
- show_legend: whether to show the legend
- bbox_to_anchor: bounding box coordinates for the legend - see: https: //matplotlib.org/stable/api/legend_api.html
786def orf_plot(aln: explore.MSA | str, ax: plt.Axes | None = None, min_length: int = 500, non_overlapping_orfs: bool = True, cmap: str = 'Blues', direction_marker_size: int | None = 5, show_x_label: bool = False, show_cbar: bool = False, cbar_fraction: float = 0.1): 787 """ 788 Plot conserved ORFs. 789 :param aln: alignment MSA class or path 790 :param ax: matplotlib axes 791 :param min_length: minimum length of orf 792 :param non_overlapping_orfs: whether to consider overlapping orfs 793 :param cmap: color mapping for % identity - see https://matplotlib.org/stable/users/explain/colors/colormaps.html 794 :param direction_marker_size: marker size for direction marker, not shown if marker_size == None 795 :param show_x_label: whether to show the x-axis label 796 :param show_cbar: whether to show the colorbar - see https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html 797 :param cbar_fraction: fraction of the original ax reserved for the colorbar 798 """ 799 800 # normalize colorbar 801 cmap = ScalarMappable(norm=Normalize(0, 100), cmap=plt.get_cmap(cmap)) 802 803 # validate input 804 if type(aln) is str: 805 if os.path.exists(aln): 806 aln = explore.MSA(aln) 807 else: 808 raise FileNotFoundError(f'File {aln} does not exist') 809 _validate_input_parameters(aln, ax) 810 _validate_input_parameters(aln, ax) 811 if ax is None: 812 ax = plt.gca() 813 814 # get orfs --> first deepcopy and reset zoom that the orfs are also zoomed in (by default, the orfs are only 815 # searched within the zoomed region) 816 aln_temp = deepcopy(aln) 817 aln_temp.zoom = None 818 if non_overlapping_orfs: 819 annotation_dict = aln_temp.get_non_overlapping_conserved_orfs(min_length=min_length) 820 else: 821 annotation_dict = aln_temp.get_conserved_orfs(min_length=min_length) 822 823 # filter dict for zoom 824 if aln.zoom is not None: 825 annotation_dict = {key:val for key, val in annotation_dict.items() if max(val['location'][0][0], aln.zoom[0]) <= min(val['location'][0][1], aln.zoom[1])} 826 827 # add track for plotting 828 _add_track_positions(annotation_dict) 829 830 # plot 831 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=cmap) 832 833 # legend 834 if show_cbar: 835 cbar = plt.colorbar(cmap,ax=ax, location= 'top', orientation='horizontal', anchor=(1,0), shrink=0.2, pad=2/ax.bbox.height, fraction=cbar_fraction) 836 cbar.set_label('% identity') 837 cbar.set_ticks([0, 100]) 838 839 # format axis 840 _format_x_axis(aln, ax, show_x_label, show_left=False) 841 ax.set_ylim(bottom=0.8) 842 ax.set_yticks([]) 843 ax.set_yticklabels([]) 844 ax.set_title('conserved orfs', loc='left') 845 846 return ax
Plot conserved ORFs.
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- min_length: minimum length of orf
- non_overlapping_orfs: whether to consider overlapping orfs
- cmap: color mapping for % identity - see https: //matplotlib.org/stable/users/explain/colors/colormaps.html
- direction_marker_size: marker size for direction marker, not shown if marker_size == None
- show_x_label: whether to show the x-axis label
- show_cbar: whether to show the colorbar - see https: //matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.colorbar.html
- cbar_fraction: fraction of the original ax reserved for the colorbar
849def annotation_plot(aln: explore.MSA | str, annotation: explore.Annotation | str, feature_to_plot: str, ax: plt.Axes | None = None, color: str = 'wheat', direction_marker_size: int | None = 5, show_x_label: bool = False): 850 """ 851 Plot annotations from bed, gff or gb files. Are automatically mapped to alignment. 852 :param aln: alignment MSA class 853 :param annotation: annotation class | path to annotation file 854 :param ax: matplotlib axes 855 :param feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature) 856 :param color: color for the annotation 857 :param direction_marker_size: marker size for direction marker, only relevant if show_direction is True 858 :param show_x_label: whether to show the x-axis label 859 """ 860 # helper function 861 def parse_annotation_from_string(path: str, msa: explore.MSA) -> explore.Annotation: 862 """ 863 Parse annotation. 864 :param path: path to annotation 865 :param msa: msa object 866 :return: parsed annotation 867 """ 868 if os.path.exists(path): 869 # reset zoom so the annotation is correctly parsed 870 msa_temp = deepcopy(msa) 871 msa_temp.zoom = None 872 return explore.Annotation(msa_temp, path) 873 else: 874 raise FileNotFoundError() 875 876 # validate input 877 if type(aln) is str: 878 if os.path.exists(aln): 879 aln = explore.MSA(aln) 880 else: 881 raise FileNotFoundError(f'File {aln} does not exist') 882 if type(annotation) is str: 883 annotation = parse_annotation_from_string(annotation, aln) 884 _validate_input_parameters(aln, ax, annotation) 885 if ax is None: 886 ax = plt.gca() 887 if not is_color_like(color): 888 raise ValueError(f'{color} for reference is not a color') 889 890 # ignore features to plot for bed files (here it is written into one feature) 891 if annotation.ann_type == 'bed': 892 annotation_dict = annotation.features['region'] 893 feature_to_plot = 'bed regions' 894 else: 895 # try to subset the annotation dict 896 try: 897 annotation_dict = annotation.features[feature_to_plot] 898 except KeyError: 899 raise KeyError(f'Feature {feature_to_plot} not found. Use annotation.features.keys() to see available features.') 900 901 # plotting and formating 902 _add_track_positions(annotation_dict) 903 _plot_annotation(annotation_dict, ax, direction_marker_size=direction_marker_size, color=color) 904 _format_x_axis(aln, ax, show_x_label, show_left=False) 905 ax.set_ylim(bottom=0.8) 906 ax.set_yticks([]) 907 ax.set_yticklabels([]) 908 ax.set_title(f'{annotation.locus} ({feature_to_plot})', loc='left') 909 910 return ax
Plot annotations from bed, gff or gb files. Are automatically mapped to alignment.
Parameters
- aln: alignment MSA class
- annotation: annotation class | path to annotation file
- ax: matplotlib axes
- feature_to_plot: potential feature to plot (not for bed files as it is parsed as one feature)
- color: color for the annotation
- direction_marker_size: marker size for direction marker, only relevant if show_direction is True
- show_x_label: whether to show the x-axis label
913def sequence_logo(aln:explore.MSA | str, ax:plt.Axes | None = None, color_scheme: str = 'standard', plot_type: str = 'stacked', show_x_label:bool = False): 914 """ 915 Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The 916 logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency 917 is scaled to the information content at each position. --> identical to how Geneious calculates it. 918 919 :param aln: alignment MSA class or path 920 :param ax: matplotlib axes 921 :param color_scheme: color scheme for characters. see config.CHAR_COLORS for available options 922 :param plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot 923 :param show_x_label: whether to show the x-axis label 924 """ 925 926 if type(aln) is str: 927 if os.path.exists(aln): 928 aln = explore.MSA(aln) 929 else: 930 raise FileNotFoundError(f'File {aln} does not exist') 931 _validate_input_parameters(aln, ax) 932 if ax is None: 933 ax = plt.gca() 934 # calc matrix 935 matrix = aln.calc_position_matrix('IC') * aln.calc_position_matrix('PPM') 936 letters_to_plot = list(config.CHAR_COLORS[aln.aln_type]['standard'].keys())[:-1] 937 938 #matrix = np.clip(matrix, 0, None) # only positive values 939 # plot 940 if plot_type == 'logo': 941 for pos in range(matrix.shape[1]): 942 # sort the positive matrix row values by size 943 items = [(letters_to_plot[i], matrix[i, pos]) for i in range(len(letters_to_plot)) if matrix[i, pos] > 0] 944 items.sort(key=lambda x: x[1]) 945 # plot each position 946 y_offset = 0 947 for letter, h in items: 948 tp = TextPath((aln.zoom[0] - 0.325 if aln.zoom is not None else - 0.325, 0), letter, size=1, prop=FontProperties(weight='bold')) 949 bb = tp.get_extents() 950 glyph_height = bb.height if bb.height > 0 else 1e-6 # avoid div by zero 951 scale_to_1 = 1.0 / glyph_height 952 953 transform = (Affine2D() 954 .scale(1.0, h * scale_to_1) # scale manually by IC and normalize font 955 .translate(pos, y_offset) 956 + ax.transData) 957 958 patch = PathPatch(tp, transform=transform, 959 facecolor=config.CHAR_COLORS[aln.aln_type][color_scheme][letter], 960 edgecolor='none') 961 ax.add_patch(patch) 962 y_offset += h 963 elif plot_type == 'stacked': 964 y_values = np.zeros(matrix.shape[1]) 965 x_values = np.arange(0, matrix.shape[1]) if aln.zoom is None else np.arange(aln.zoom[0], aln.zoom[1]) 966 for row in range(matrix.shape[0]): 967 y = matrix[row] 968 ax.fill_between(x_values, 969 y_values, 970 y_values + y, 971 fc=config.CHAR_COLORS[aln.aln_type][color_scheme].get(letters_to_plot[row]), 972 ec='None', 973 label=letters_to_plot[row], 974 step='mid') 975 y_values += y 976 977 # adjust limits & labels 978 _format_x_axis(aln, ax, show_x_label, show_left=True) 979 if aln.aln_type == 'AA': 980 ax.set_ylim(bottom=0, top=5) 981 else: 982 ax.set_ylim(bottom=0, top=2) 983 ax.spines['top'].set_visible(False) 984 ax.spines['right'].set_visible(False) 985 ax.set_ylabel('bits') 986 987 return ax
Plot sequence logo or stacked area plot (use the first one with appropriate zoom levels). The logo visualizes the relative frequency of nt or aa characters in the alignment. The char frequency is scaled to the information content at each position. --> identical to how Geneious calculates it.
Parameters
- aln: alignment MSA class or path
- ax: matplotlib axes
- color_scheme: color scheme for characters. see config.CHAR_COLORS for available options
- plot_type: 'logo' for sequence logo, 'stacked' for stacked area plot
- show_x_label: whether to show the x-axis label