Visualization Module

MS2LDA Visualisation

MS2LDA_visualisation

compute_coherence_values

compute_coherence_values(spectra_path, limit, start, step)

Compute c_v coherence for a range of topic numbers

Parameters:

spectra_path : Path to the MGF file with the input spectra
limit : Maximum number of topics (exclusive)
start : Number of topics to start from
step : Step size between topic numbers

Returns:

model_list : List of trained LDA topic models
coherence_values : Coherence values corresponding to the models with the respective number of topics

Source code in MS2LDA/Visualisation/MS2LDA_visualisation.py
def compute_coherence_values(spectra_path, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        spectra = load_mgf(spectra_path)
        cleaned_spectra = clean_spectra(spectra)
        fragment_words, loss_words = features_to_words(cleaned_spectra)
        feature_words = combine_features(fragment_words, loss_words)
        ms2lda = define_model(n_motifs=n_motifs)
        train_parameters = {"parallel": 4}
        trained_ms2lda = train_model(
            ms2lda, feature_words, iterations=300, train_parameters=train_parameters
        )
        model_list.append(trained_ms2lda)
        coherence_model_lda = CoherenceModel(
            model=lda_model, texts=spectra, dictionary=id2word, coherence="c_v"
        )
        coherence_values.append(coherence_model_lda.get_coherence())
    # Plotting
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc="best")
    plt.ylim(0, 1)
    plt.show()

    return None
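
A minimal usage sketch (the MGF path and the topic range are illustrative; with the fixes above the function also returns the trained models and their coherence scores):

from MS2LDA.Visualisation.MS2LDA_visualisation import compute_coherence_values

# scans 10, 20, ..., 90 motifs and plots the c_v coherence curve
models, scores = compute_coherence_values("data/spectra.mgf", limit=100, start=10, step=10)
best_n = max(zip(scores, range(10, 100, 10)))[1]  # topic count with the highest coherence
print(f"Best number of motifs by c_v coherence: {best_n}")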

General Visualisation

visualisation

create_interactive_motif_network

create_interactive_motif_network(
    spectra,
    significant_figures,
    motif_sizes,
    smiles_clusters,
    spectra_cluster,
    motif_colors,
    file_generation=False,
)

Generates a network for the annotated optimized spectra after running Spec2Vec annotation. Clicking a node shows the spectrum and the molecules associated with it.

Parameters:

spectra (list): list of matchms.Spectrum objects; after Spec2Vec annotation. required
significant_figures (int): number of significant figures to round the m/z values. required
motif_sizes (list): list of sizes for the motif_{i} nodes; should match the length of spectra. required
smiles_clusters (list): list of SMILES lists, one per motif. required
spectra_cluster (list): list of matched spectra per motif. required
motif_colors (list): list of node colors, one per motif; None uses the default color. required
file_generation (bool): if True, also writes the network to lda_model_output.graphml. False

Returns:

network (nx.Graph): network with nodes and edges, spectra and structures

Source code in MS2LDA/Visualisation/visualisation.py
def create_interactive_motif_network(
    spectra,
    significant_figures,
    motif_sizes,
    smiles_clusters,
    spectra_cluster,
    motif_colors,
    file_generation=False,
):
    """
    Generates a network for the annotated optimized spectra after running Spec2Vec
    annotation. Clicking a node shows the spectrum and the molecules associated with it.

    ARGS:
        spectra (list): list of matchms.Spectrum objects; after Spec2Vec annotation
        significant_figures (int): number of significant figures to round the m/z values
        motif_sizes (list, optional): list of sizes for the `motif_{i}` nodes; should match the length of spectra
        smiles_clusters (list): list of SMILES lists, one per motif
        spectra_cluster (list): list of matched spectra per motif
        motif_colors (list, optional): list of node colors, one per motif
        file_generation (bool, optional): if True, also writes the network to lda_model_output.graphml

    RETURNS:
        network (nx.Graph): network with nodes and edges, spectra and structures
    """

    if motif_sizes is not None:
        motif_sizes_filtered = list(
            map(lambda x: 0.7 if x == "?" else x, motif_sizes)
        )  # filtering '?'

    G = nx.Graph()

    if motif_sizes is not None and len(motif_sizes) != len(spectra):
        raise ValueError("Length of motif_sizes must match the number of spectra")

    for i, spectrum in enumerate(spectra):
        motif_node = f"motif_{i}"
        G.add_node(motif_node)

        if spectrum.peaks:
            peak_list = spectrum.peaks.mz
            rounded_peak_list = [round(x, significant_figures) for x in peak_list]
            int_peak_list = spectrum.peaks.intensities

            for edge, weight in zip(rounded_peak_list, int_peak_list):
                G.add_edge(motif_node, edge, weight=weight, color="red")

        if spectrum.losses:
            loss_list = spectrum.losses.mz
            rounded_loss_list = [round(x, significant_figures) for x in loss_list]
            int_losses_list = spectrum.losses.intensities

            for edge, weight in zip(rounded_loss_list, int_losses_list):
                G.add_edge(motif_node, edge, weight=weight, color="blue")

    node_sizes = {}
    if motif_sizes is None:
        default_size = 800
        for i in range(len(spectra)):  # start at 0 so motif_0 also gets a size
            node_sizes[f"motif_{i}"] = default_size
    else:
        n_smiles_cluster = []
        for i in smiles_clusters:
            n_smiles_cluster.append(len(i))
        max_n_smiles_cluster = max(n_smiles_cluster)

        n_frags_cluster = []
        for i in spectra:
            n_frags_cluster.append(len(i.peaks.mz))
        max_n_frags_cluster = max(n_frags_cluster)

        for i in range(len(spectra)):
            node_sizes[f"motif_{i}"] = (
                ((motif_sizes_filtered[i] * 10) ** 3) / 3
                + (((n_smiles_cluster[i] / max_n_smiles_cluster) * 10) ** 3) / 3
                + (((n_frags_cluster[i] / max_n_frags_cluster) * 10) ** 3) / 3
            )

    # new; for tox
    node_colors = {}
    if motif_colors is None:
        default_color = "#210070"
        for i in range(len(spectra)):
            node_colors[f"motif_{i}"] = default_color
    else:
        for i in range(len(spectra)):
            node_colors[f"motif_{i}"] = motif_colors[i]
    # --------------------------

    pos = nx.spring_layout(G)
    fig, ax = plt.subplots(figsize=(10, 50))

    edges = G.edges(data=True)
    weights = [d["weight"] * 10 for (u, v, d) in edges]
    edge_colors = [d["color"] for (u, v, d) in edges]
    nx.draw_networkx_edges(G, pos, alpha=0.3, width=weights, edge_color=edge_colors)
    node_size_list = [node_sizes.get(node, 100) for node in G.nodes]
    node_color_list = [node_colors.get(node, "green") for node in G.nodes]
    nx.draw_networkx_nodes(
        G, pos, node_size=node_size_list, node_color=node_color_list, alpha=0.9
    )

    label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
    nx.draw_networkx_labels(G, pos, font_size=6, bbox=label_options)

    def on_click(event):
        for node, (x, y) in pos.items():
            dist = (x - event.xdata) ** 2 + (y - event.ydata) ** 2
            if dist < 0.00025:
                if isinstance(
                    node, str
                ):  # Check if the node is a string and matches "motif_x"
                    node_number = int(node.split("_")[1])
                    # print(f"Node {node} clicked!\n"
                    # f"Cluster similarity: {motif_sizes_filtered[node_number]*100}%\n"
                    # f"N of compounds: {(n_smiles_cluster[node_number]/max_n_smiles_cluster)*100}"
                    # f"N of features: {(n_frags_cluster[node_number]/max_n_frags_cluster)*100}"
                    # f"Fragments: {spectra[node_number].peaks.mz}\n"
                    # f"Losses: {spectra[node_number].losses.mz}")
                    mols = [MolFromSmiles(smi) for smi in smiles_clusters[node_number]]
                    img = MolsToGridImage(mols)

                    pil_img = Image.open(io.BytesIO(img.data))

                    # Display new window
                    pil_img.show()

                    for spec in spectra_cluster[node_number]:
                        spectra[node_number].plot_against(spec)

                break

    # Connect the click event to the on_click function
    fig.canvas.mpl_connect("button_press_event", on_click)

    plt.show()
    if file_generation:
        nx.write_graphml(G, "lda_model_output.graphml")

    return G
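
A runnable sketch with toy inputs (the spectra, sizes, and SMILES are illustrative; in practice they come from the Spec2Vec annotation step):

import numpy as np
from matchms import Spectrum
from MS2LDA.Visualisation.visualisation import create_interactive_motif_network

motifs = [
    Spectrum(mz=np.array([77.04, 105.03]), intensities=np.array([0.6, 1.0]), metadata={"id": "motif_0"}),
    Spectrum(mz=np.array([91.05, 119.09]), intensities=np.array([1.0, 0.4]), metadata={"id": "motif_1"}),
]
G = create_interactive_motif_network(
    spectra=motifs,
    significant_figures=2,
    motif_sizes=[0.9, "?"],                          # '?' falls back to 0.7
    smiles_clusters=[["c1ccccc1O"], ["Cc1ccccc1"]],  # one SMILES list per motif
    spectra_cluster=[[motifs[0]], [motifs[1]]],      # matched spectra per motif
    motif_colors=None,                               # None -> default node color
    file_generation=False,
)

Clicking a motif node opens the molecule grid for that motif and plots the motif against each of its matched spectra.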

create_network

create_network(
    spectra, significant_figures=2, motif_sizes=None, file_generation=False
)

Generates a network for the motif spectra, where the nodes are the motifs (output of the LDA model) and the edges are the peaks and losses of the spectra. The node sizes can be adjusted via motif_sizes (e.g., refined annotation scores).

Parameters:

spectra (list): list of matchms.Spectrum objects; after LDA modelling. required
significant_figures (int): number of significant figures to round the m/z values. default: 2
motif_sizes (list): list of sizes for the motif_{i} nodes; should match the length of spectra. default: None
file_generation (bool): if True, writes the network to lda_model_output.graphml. default: False

Returns:

network (nx.Graph): network with nodes and edges

Source code in MS2LDA/Visualisation/visualisation.py
def create_network(
    spectra, significant_figures=2, motif_sizes=None, file_generation=False
):
    """
    Generates a network for the motifs spectra, where the nodes are the motifs (output of LDA model)
    and the edges are the peaks and losses of the spectra. The size of the nodes can be adjusted with the motif_sizes refined annotation

    ARGS:
        spectra (list): list of matchms.Spectrum.objects; after LDA modelling
        significant_figures (int, optional): number of significant figures to round the mz values
        motif_sizes (list, optional): list of sizes for the `motif_{i}` nodes; should match the length of spectra

    RETURNS:
        network (nx.Graph): network with nodes and edges
    """
    if motif_sizes is not None:
        motif_sizes_filtered = list(
            map(lambda x: 0.7 if x == "?" else x, motif_sizes)
        )  # filtering ?

    G = nx.Graph()

    if motif_sizes is not None and len(motif_sizes) != len(spectra):
        raise ValueError("Length of motif_sizes must match the number of spectra")

    for i, spectrum in enumerate(spectra):
        motif_node = f"motif_{i}"
        G.add_node(motif_node)

        peak_list = spectrum.peaks.mz
        rounded_peak_list = [round(x, significant_figures) for x in peak_list]
        int_peak_list = spectrum.peaks.intensities
        for edge, weight in zip(rounded_peak_list, int_peak_list):
            G.add_edge(motif_node, edge, weight=weight, color="red")

        if spectrum.losses:
            loss_list = spectrum.losses.mz
            rounded_loss_list = [round(x, significant_figures) for x in loss_list]
            int_losses_list = spectrum.losses.intensities

            for edge, weight in zip(rounded_loss_list, int_losses_list):
                G.add_edge(motif_node, edge, weight=weight, color="blue")

    # Arranging node size - motifs
    node_sizes = {}
    if motif_sizes is None:
        default_size = 1000
        for i in range(len(spectra)):  # start at 0 so motif_0 also gets a size
            node_sizes[f"motif_{i}"] = default_size

    else:
        for i in range(len(spectra)):
            node_sizes[f"motif_{i}"] = ((motif_sizes_filtered[i] * 100) ** 2) / 2

    fig, ax = plt.subplots(figsize=(50, 50))
    edges = G.edges(data=True)
    weights = [d["weight"] * 10 for (u, v, d) in edges]
    edge_colors = [d["color"] for (u, v, d) in edges]
    pos = nx.spring_layout(G)

    nx.draw_networkx_edges(G, pos, alpha=0.3, width=weights, edge_color=edge_colors)
    node_size_list = [node_sizes.get(node, 100) for node in G.nodes]
    nx.draw_networkx_nodes(
        G, pos, node_size=node_size_list, node_color="#210070", alpha=0.9
    )

    label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
    nx.draw_networkx_labels(G, pos, font_size=14, bbox=label_options)

    if file_generation:
        nx.write_graphml(G, "lda_model_output.graphml")
    return G
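
A runnable sketch with two toy motif spectra (m/z and intensity values are illustrative):

import numpy as np
from matchms import Spectrum
from MS2LDA.Visualisation.visualisation import create_network

motifs = [
    Spectrum(mz=np.array([77.0389, 105.0335]), intensities=np.array([0.6, 1.0]), metadata={"id": "motif_0"}),
    Spectrum(mz=np.array([77.0391, 119.0855]), intensities=np.array([1.0, 0.4]), metadata={"id": "motif_1"}),
]
# both motifs share the rounded peak 77.04, so their nodes connect to a common m/z node
G = create_network(motifs, significant_figures=2, file_generation=True)  # also writes lda_model_output.graphml
print(G.number_of_nodes(), G.number_of_edges())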

show_annotated_motifs

show_annotated_motifs(
    opt_motif_spectra, motif_spectra, clustered_smiles, savefig=None
)

Show side-by-side RDKit molecule images from clustered SMILES, and plot motif vs. optimized motif.

  • If in a Jupyter notebook, we'll try the 'Notebook-friendly' style.
  • If not in Jupyter, we'll switch to a headless backend (no GUI windows), skip plt.show(), and just close figures if not saving.
Source code in MS2LDA/Visualisation/visualisation.py
def show_annotated_motifs(
    opt_motif_spectra, motif_spectra, clustered_smiles, savefig=None
):
    """
    Show side-by-side RDKit molecule images from clustered SMILES,
    and plot motif vs. optimized motif.

    - If in a Jupyter notebook, we'll try the 'Notebook-friendly' style.
    - If not in Jupyter, we'll switch to a headless backend (no GUI windows),
      skip plt.show(), and just close figures if not saving.
    """
    assert len(opt_motif_spectra) == len(
        motif_spectra
    ), "Lengths of opt_motif_spectra and motif_spectra must match!"

    # Create output folder if needed
    if savefig is not None:
        os.makedirs(savefig, exist_ok=True)

    # Pass returnPNG=not_in_jupyter: in Jupyter, MolsToGridImage is called with
    # returnPNG=False; outside Jupyter (e.g. in Dash) it returns PNG bytes.
    not_in_jupyter = not _in_nb

    for m in range(len(motif_spectra)):
        mass_to_charge_opt = opt_motif_spectra[m].peaks.mz
        intensities_opt = opt_motif_spectra[m].peaks.intensities
        mass_to_charge = motif_spectra[m].peaks.mz
        intensities = motif_spectra[m].peaks.intensities

        loss_to_charge_opt = opt_motif_spectra[m].losses.mz
        loss_intensities_opt = opt_motif_spectra[m].losses.intensities
        loss_to_charge = motif_spectra[m].losses.mz
        loss_intensities = motif_spectra[m].losses.intensities

        # Convert SMILES -> RDKit mols
        mols = []
        for smi in clustered_smiles[m]:
            mol = MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)

        # If no valid SMILES, fallback to blank
        pil_img = None
        if len(mols) == 0:
            pil_img = Image.new("RGB", (400, 400), "white")
        else:
            # Attempt to get either a PIL object or bytes from MolsToGridImage
            # If in Jupyter => returnPNG=False
            # if not => returnPNG=True
            result = MolsToGridImage(
                mols,
                molsPerRow=len(mols),
                subImgSize=(400, 400),
                returnPNG=not_in_jupyter,
            )
            pil_img = _convert_molgrid_result_to_pil(result)
            if pil_img is None:
                # If that fails, fallback blank
                pil_img = Image.new("RGB", (400, 400), "white")

        # Make figure
        fig = plt.figure(figsize=(10, 6), facecolor="none", edgecolor="none")

        # Top subplot: molecule grid
        ax_top = fig.add_subplot(3, 1, 1)
        ax_top.imshow(pil_img)
        ax_top.axis("off")
        top_pos = ax_top.get_position()
        ax_top.set_position([top_pos.x0, top_pos.y0, top_pos.width, top_pos.height])

        # Bottom subplot: motif vs. optimized motif
        ax_bot = fig.add_subplot(3, 1, 3)
        if len(mass_to_charge):
            ax_bot.stem(
                mass_to_charge,
                intensities,
                basefmt="k-",
                markerfmt="",
                linefmt="black",
                label=f"motif_{m}",
            )
        if len(mass_to_charge_opt) > 0:
            ax_bot.stem(
                mass_to_charge_opt,
                intensities_opt,
                basefmt="k-",
                markerfmt="",
                linefmt="#FF9E01",
                label=f"opt motif_{m}",
            )

        ax_bot.set_ylim(0)
        ax_bot.set_xlabel("m/z", fontsize=12)
        ax_bot.set_ylabel("Intensity", fontsize=12)
        ax_bot.spines["right"].set_visible(False)
        ax_bot.spines["top"].set_visible(False)
        ax_bot.spines["left"].set_color("black")
        ax_bot.spines["bottom"].set_color("black")
        ax_bot.spines["left"].set_linewidth(1.5)
        ax_bot.spines["bottom"].set_linewidth(1.5)
        ax_bot.tick_params(
            axis="both",
            which="major",
            direction="out",
            length=6,
            width=1.5,
            color="black",
        )

        # middle subplot: motif vs. optimized motif losses
        ax_midd = fig.add_subplot(3, 1, 2)
        if len(loss_to_charge):
            ax_midd.stem(
                loss_to_charge,
                loss_intensities,
                basefmt="k-",
                markerfmt="",
                linefmt="black",
                bottom=0,
                label=f"motif_{m}",
            )
        if len(loss_to_charge_opt) > 0:
            ax_midd.stem(
                loss_to_charge_opt,
                loss_intensities_opt,
                basefmt="k-",
                markerfmt="",
                linefmt="#3979A9",
                bottom=0,
                label=f"opt motif_{m}",
            )
        # Ensure stems originate from the x-axis (y=0)
        ax_midd.axhline(y=0, color="black", linewidth=1.5)  # Explicitly draw x-axis

        # Only set ylim if there are valid intensities
        if any(loss_intensities):
            ax_midd.set_ylim(0, max(loss_intensities) * 1.1)
        else:
            ax_midd.set_ylim(0, 1)  # Default if no data

        ax_midd.yaxis.set_label_position("right")
        ax_midd.xaxis.set_label_position("top")
        ax_midd.set_xlabel("loss", fontsize=12)
        ax_midd.set_ylabel("Intensity", fontsize=12)
        ax_midd.invert_xaxis()
        ax_midd.invert_yaxis()
        ax_midd.xaxis.tick_top()
        ax_midd.yaxis.tick_right()

        ax_midd.spines["left"].set_visible(False)
        ax_midd.spines["bottom"].set_visible(False)
        ax_midd.spines["right"].set_color("black")
        ax_midd.spines["top"].set_color("black")
        ax_midd.spines["right"].set_linewidth(1.5)
        ax_midd.spines["top"].set_linewidth(1.5)
        ax_midd.tick_params(
            axis="both",
            which="major",
            direction="out",
            length=6,
            width=1.5,
            color="black",
        )

        plt.legend(loc="best")

        # Save or close
        if savefig:
            outfile = os.path.join(savefig, f"motif_{m}.png")
            plt.savefig(outfile, format="png", dpi=400)
            plt.close(fig)
        else:
            # If in Jupyter => show
            if _in_nb:
                plt.show()
            else:
                plt.close(fig)
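
A runnable sketch, assuming matchms's add_losses filter is available to populate the losses (the function reads .losses unconditionally); all values are toys:

import numpy as np
from matchms import Spectrum
from matchms.filtering import add_losses
from MS2LDA.Visualisation.visualisation import show_annotated_motifs

def toy_motif(mz, intensities, name):
    spec = Spectrum(
        mz=np.array(mz),
        intensities=np.array(intensities),
        metadata={"id": name, "precursor_mz": 200.0},  # precursor_mz lets add_losses compute losses
    )
    return add_losses(spec)

motifs = [toy_motif([81.07, 109.10], [0.5, 1.0], "motif_0")]
opt_motifs = [toy_motif([81.07], [1.0], "motif_0_optimized")]
clustered_smiles = [["CC(=O)Oc1ccccc1C(=O)O"]]  # one illustrative SMILES cluster per motif
show_annotated_motifs(opt_motifs, motifs, clustered_smiles, savefig="motif_figs")  # writes motif_figs/motif_0.png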

LDA Dictionary

ldadict

generate_corpusjson_from_tomotopy

generate_corpusjson_from_tomotopy(
    model,
    documents,
    spectra,
    doc_metadata,
    min_prob_to_keep_beta=0.001,
    min_prob_to_keep_phi=0.01,
    min_prob_to_keep_theta=0.01,
    filename=None,
)

Generates an lda_dict in a format similar to that used by the previous MS2LDA app.

Source code in MS2LDA/Visualisation/ldadict.py
def generate_corpusjson_from_tomotopy(
    model,
    documents,
    spectra,
    doc_metadata,
    min_prob_to_keep_beta=1e-3,
    min_prob_to_keep_phi=1e-2,
    min_prob_to_keep_theta=1e-2,
    filename=None,
):
    """
    Generates an lda_dict in a format similar to that used by the previous MS2LDA app.
    """

    # Build doc names
    if spectra is not None and len(spectra) == len(documents):
        doc_names = [spec.get("id") for spec in spectra]
    else:
        doc_names = list(doc_metadata.keys())

    # Gather all unique tokens from `documents`:
    unique_words = set()
    for doc in documents:
        unique_words.update(doc)
    word_list = sorted(unique_words)
    word_index = {word: idx for idx, word in enumerate(word_list)}
    n_words = len(word_index)

    # doc_index
    doc_index = {name: i for i, name in enumerate(doc_names)}
    n_docs = len(doc_index)

    # corpus
    corpus = {}
    for doc_name, doc_words in zip(doc_names, documents):
        word_counts = {}
        for w in doc_words:
            word_counts[w] = word_counts.get(w, 0) + 1
        corpus[doc_name] = word_counts

    # Number of topics and alpha
    K = model.k
    alpha = list(map(float, model.alpha))

    # Map each token => model’s vocab index if present
    model_vocab = model.used_vocabs
    model_vocab_index = {w: i for i, w in enumerate(model_vocab)}
    word_to_model_idx = {}
    for w in word_index:
        if w in model_vocab_index:
            word_to_model_idx[word_index[w]] = model_vocab_index[w]

    # Construct beta_matrix (K x n_words)
    beta_matrix = np.zeros((K, n_words), dtype=float)
    for k_idx in range(K):
        word_probs = model.get_topic_word_dist(k_idx)
        for w_i in range(n_words):
            if w_i in word_to_model_idx:
                model_wi = word_to_model_idx[w_i]
                beta_matrix[k_idx, w_i] = word_probs[model_wi]
            else:
                beta_matrix[k_idx, w_i] = 0.0

    # Construct gamma_matrix (doc-topic distribution)
    gamma_matrix = np.zeros((n_docs, K), dtype=float)
    for d_idx, doc in enumerate(model.docs):
        gamma_matrix[d_idx, :] = doc.get_topic_dist()

    # Construct phi_matrix
    phi_matrix = {}
    for d_idx, doc in enumerate(model.docs):
        doc_name = doc_names[d_idx]
        phi_matrix[doc_name] = defaultdict(lambda: np.zeros(K, dtype=float))
        for word_id, topic_id in zip(doc.words, doc.topics):
            w_str = model.vocabs[word_id]
            phi_matrix[doc_name][w_str][topic_id] += 1.0
        # Normalize each word’s topic distribution
        for w_str, topic_vec in phi_matrix[doc_name].items():
            total = topic_vec.sum()
            if total > 0.0:
                phi_matrix[doc_name][w_str] = topic_vec / total

    # Build topic_index + metadata
    topic_index = {f"motif_{k}": k for k in range(K)}
    topic_metadata = {
        f"motif_{k}": {"name": f"motif_{k}", "type": "learnt"} for k in range(K)
    }

    # For convenience, extract “features” if they look like "frag@X" or "loss@X".
    features_to_mz = {}
    for w in word_list:
        if w.startswith("frag@") or w.startswith("loss@"):
            try:
                val = float(w.split("@")[1])
                features_to_mz[w] = (val, val)
            except (ValueError, IndexError):  # token is not of the form frag@X / loss@X
                pass

    # Prepare the final dictionary
    lda_dict = {
        "corpus": corpus,
        "word_index": word_index,
        "doc_index": doc_index,
        "K": K,
        "alpha": alpha,
        "beta": {},
        "doc_metadata": doc_metadata,
        "topic_index": topic_index,
        "topic_metadata": topic_metadata,
        "features": features_to_mz,
        "gamma": [list(map(float, row)) for row in gamma_matrix],
    }

    reverse_word_index = {v: k for k, v in word_index.items()}
    reverse_topic_index = {v: k for k, v in topic_index.items()}

    # Fill in beta
    for k_idx in range(K):
        t_name = reverse_topic_index[k_idx]  # e.g. "motif_49"
        t_dict = {}
        for w_i in range(n_words):
            val = beta_matrix[k_idx, w_i]
            if val > min_prob_to_keep_beta:
                w_str = reverse_word_index[w_i]
                t_dict[w_str] = float(val)
        lda_dict["beta"][t_name] = t_dict

    # Build theta from gamma
    e_theta = gamma_matrix / gamma_matrix.sum(axis=1, keepdims=True)
    lda_dict["theta"] = {}
    for d_idx in range(n_docs):
        doc_name = doc_names[d_idx]
        row = {}
        for k_idx in range(K):
            val = e_theta[d_idx, k_idx]
            if val > min_prob_to_keep_theta:
                row[reverse_topic_index[k_idx]] = float(val)
        lda_dict["theta"][doc_name] = row

    # Compute overlap_scores
    overlap_scores = {}
    for doc_name, phi_dict in phi_matrix.items():
        doc_overlaps = {}
        for w_str, topic_vec in phi_dict.items():

            # For each topic that has p >= min_prob_to_keep_phi, check if we pass that threshold
            for k_idx, p in enumerate(topic_vec):
                if p < min_prob_to_keep_phi:
                    continue

                t = reverse_topic_index[k_idx]
                if t not in lda_dict["beta"]:
                    continue
                # add a zero if not in doc_overlaps
                if t not in doc_overlaps:
                    doc_overlaps[t] = 0.0
                # multiply
                doc_overlaps[t] += lda_dict["beta"][t].get(w_str, 0.0) * p

        overlap_scores[doc_name] = {}
        for t in doc_overlaps:
            overlap_scores[doc_name][t] = float(doc_overlaps[t])
    lda_dict["overlap_scores"] = overlap_scores

    # Optionally save to JSON
    if filename:
        with open(filename, "w") as f:
            json.dump(lda_dict, f, indent=2)

    return lda_dict
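
A small end-to-end sketch with a toy tomotopy model (tokens and counts are illustrative):

import tomotopy as tp
from MS2LDA.Visualisation.ldadict import generate_corpusjson_from_tomotopy

docs = [
    ["frag@77.04", "frag@105.03", "loss@28.00"],
    ["frag@91.05", "frag@77.04", "loss@18.01"],
]
model = tp.LDAModel(k=2, seed=42)
for tokens in docs:
    model.add_doc(tokens)
model.train(100)

doc_metadata = {f"spec_{i}": {"placeholder": f"Doc {i}"} for i in range(len(docs))}
lda_dict = generate_corpusjson_from_tomotopy(model, docs, spectra=None, doc_metadata=doc_metadata)
print(list(lda_dict["topic_index"]))  # ['motif_0', 'motif_1']
print(lda_dict["features"])           # m/z values parsed from the frag@/loss@ tokens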

save_visualization_data

save_visualization_data(
    trained_ms2lda,
    cleaned_spectra,
    optimized_motifs,
    doc2spec_map,
    output_folder,
    filename="ms2lda_viz.json",
    min_prob_to_keep_beta=0.001,
    min_prob_to_keep_phi=0.01,
    min_prob_to_keep_theta=0.01,
    run_parameters=None,
)

Creates the final data structure needed by the MS2LDA UI (clustered_smiles_data, optimized_motifs_data, lda_dict, spectra_data) and saves it to JSON in output_folder/filename.

Parameters:

trained_ms2lda (tomotopy.LDAModel): the trained LDA model in memory. required
cleaned_spectra (list of Spectrum): final cleaned spectra. required
optimized_motifs (list of Spectrum): annotated + optimized motifs. required
doc2spec_map (dict): doc-hash to original Spectrum map. required
output_folder (str): folder path for saving the .json. required
filename (str): name of the saved JSON. default: 'ms2lda_viz.json'
min_prob_to_keep_beta (float): threshold for storing the topic-word distribution in beta. default: 0.001
min_prob_to_keep_phi (float): threshold for storing the word-topic distribution in phi (used for the overlap calculation). default: 0.01
min_prob_to_keep_theta (float): threshold for the doc-topic distribution in theta. default: 0.01
run_parameters (dict, optional): run settings stored alongside the data. default: None

Returns:

None

Source code in MS2LDA/Visualisation/ldadict.py
def save_visualization_data(
    trained_ms2lda,
    cleaned_spectra,
    optimized_motifs,
    doc2spec_map,
    output_folder,
    filename="ms2lda_viz.json",
    min_prob_to_keep_beta=1e-3,
    min_prob_to_keep_phi=1e-2,
    min_prob_to_keep_theta=1e-2,
    run_parameters=None,
):
    """
    Creates the final data structure needed by the MS2LDA UI
    (clustered_smiles_data, optimized_motifs_data, lda_dict, spectra_data)
    and saves it to JSON in `output_folder/filename`.

    Args:
        trained_ms2lda (tomotopy.LDAModel): the trained LDA model in memory
        cleaned_spectra (list of Spectrum): final cleaned spectra
        optimized_motifs (list of Spectrum): annotated + optimized motifs
        doc2spec_map (dict): doc-hash to original Spectrum map
        output_folder (str): folder path for saving the .json
        filename (str): name of the saved JSON (default "ms2lda_viz.json")
        min_prob_to_keep_beta (float): threshold for storing topic-word distribution in beta
        min_prob_to_keep_phi (float): threshold for storing word-topic distribution in phi (used for overlap calc)
        min_prob_to_keep_theta (float): threshold for doc-topic distribution in theta

    Returns:
        None
    """

    # 1) Build "documents" & doc_metadata from the model in memory
    documents = []
    for doc_idx, doc in enumerate(trained_ms2lda.docs):
        tokens = [trained_ms2lda.vocabs[word_id] for word_id in doc.words]
        documents.append(tokens)

    doc_metadata = {}
    for i, doc in enumerate(trained_ms2lda.docs):
        doc_name = f"spec_{i}"
        doc_metadata[doc_name] = {"placeholder": f"Doc {i}"}

    # 2) Generate the main LDA dictionary (which does *not* store phi in final dict)
    lda_dict = generate_corpusjson_from_tomotopy(
        model=trained_ms2lda,
        documents=documents,
        spectra=None,  # skip re-cleaning for token consistency
        doc_metadata=doc_metadata,
        min_prob_to_keep_beta=min_prob_to_keep_beta,
        min_prob_to_keep_phi=min_prob_to_keep_phi,
        min_prob_to_keep_theta=min_prob_to_keep_theta,
        filename=None,  # we won't save it here
    )

    # 3) Convert each cleaned spectrum to a dictionary
    def spectrum_to_dict(s):
        dct = {
            "metadata": s.metadata.copy(),
            "mz": [float(mz) for mz in s.peaks.mz],
            "intensities": [float(i) for i in s.peaks.intensities],
        }
        if s.losses:
            dct["metadata"]["losses"] = [
                {"loss_mz": float(mz_), "loss_intensity": float(int_)}
                for mz_, int_ in zip(s.losses.mz, s.losses.intensities)
            ]
        return dct

    # 4) Convert each optimized motif to a dictionary
    def motif_to_dict(m):
        dct = {
            "metadata": m.metadata.copy(),
            "mz": [float(x) for x in m.peaks.mz],
            "intensities": [float(x) for x in m.peaks.intensities],
        }
        if m.losses:
            dct["metadata"]["losses"] = [
                {"loss_mz": float(mz_), "loss_intensity": float(int_)}
                for mz_, int_ in zip(m.losses.mz, m.losses.intensities)
            ]
        return dct

    optimized_motifs_data = [motif_to_dict(m) for m in optimized_motifs]
    spectra_data = [spectrum_to_dict(s) for s in cleaned_spectra]

    # 5) Gather short_annotation from each optimized motif for "clustered_smiles_data"
    clustered_smiles_data = []
    for motif in optimized_motifs:
        ann = motif.get("auto_annotation")
        if isinstance(ann, list):
            clustered_smiles_data.append(ann)
        elif ann is None:
            clustered_smiles_data.append([])
        else:
            # single string
            clustered_smiles_data.append([ann])

    # 6) Build the doc→spec index from doc2spec_map
    # (computed once for all documents; it does not depend on the motifs)
    # Map Spectrum object -> integer index
    spectrum_to_idx = {spec: i for i, spec in enumerate(cleaned_spectra)}
    doc_to_spec_index = {}

    for d_idx, doc in enumerate(trained_ms2lda.docs):
        words = [trained_ms2lda.vocabs[w_id] for w_id in doc.words]
        doc_text = "".join(words)
        hashed = hashlib.md5(doc_text.encode("utf-8")).hexdigest()
        if hashed in doc2spec_map:
            real_spec = doc2spec_map[hashed]
            doc_to_spec_index[str(d_idx)] = spectrum_to_idx.get(real_spec, -1)
        else:
            doc_to_spec_index[str(d_idx)] = -1  # hash not found

    lda_dict["doc_to_spec_index"] = doc_to_spec_index

    final_data = {
        "clustered_smiles_data": clustered_smiles_data,
        "optimized_motifs_data": optimized_motifs_data,
        "lda_dict": lda_dict,
        "spectra_data": spectra_data,
        "run_parameters": run_parameters if run_parameters else {},
    }

    # 7) Compress & Save as .json.gz
    os.makedirs(output_folder, exist_ok=True)
    outpath = os.path.join(output_folder, filename + ".gz")  # e.g. ms2lda_viz.json.gz
    with gzip.open(outpath, "wt", encoding="utf-8") as gz_file:
        json.dump(final_data, gz_file, indent=2)

    print(f"Visualization data saved (gzipped) to: {outpath}")

Motifset Similarity Plotting

plot_MotifsetSimilarity

vis_motifset_similarity

vis_motifset_similarity(
    motifset_a, motifset_b, names=["First", "Second"], save=False
)

Compares two sets of motifs and draws lines between similar ones.

Parameters:

motifset_a (list): a list of indices which have a similarity to motifset_b. required
motifset_b (list): a list of indices which have a similarity to motifset_a. required
names (list): names for the two motif sets. default: ['First', 'Second']
save (bool): whether the generated plot should be saved. default: False

Returns:

None

Source code in MS2LDA/Visualisation/plot_MotifsetSimilarity.py
def vis_motifset_similarity(
    motifset_a, motifset_b, names=["First", "Second"], save=False
):
    """compares two set of motifs and draw lines between similar ones

    ARGS:
        motifset_a (list): a list of indices which have a similarity to motifset_b
        motifset_b (list): a list of indices which have a similarity to motifset_a
        names (list): names for motifsets
        saves (True or False): parameter if generated plot should be saved

    RETURNS:
        None
    """

    similarity_sets = list(zip(motifset_a, motifset_b))
    n_motifset_a = list(range(max(motifset_a) + 1))
    n_motifset_b = list(range(max(motifset_b) + 1))

    unique_entries_a = len(set(motifset_a))
    unique_entries_b = len(set(motifset_b))

    plt.figure(figsize=(25, 10))

    plt.text(-0.5, 1.5, f"{names[0]} motifset with {unique_entries_a} unique motifs")
    for idx in n_motifset_a:
        if idx in motifset_a:
            plt.text(idx, 1.01, idx, ha="center", va="center", fontsize=5, color="blue")

    plt.text(-0.5, -1.5, f"{names[1]} motifset with {unique_entries_b} unique motifs")
    for idx in n_motifset_b:
        if idx in motifset_b:
            plt.text(idx, -1.01, idx, ha="center", va="center", fontsize=5, color="red")

    for motifset_a_index, motifset_b_index in similarity_sets:
        plt.plot(
            [n_motifset_a[motifset_a_index], n_motifset_b[motifset_b_index]],
            [1, -1],
            "k-",
            lw=0.5,
        )

    plt.xlim(0, max(n_motifset_a + n_motifset_b))
    plt.ylim(-2, 2)
    plt.axis("off")

    plt.title("Similarities between to Motif sets")

    if save:
        plt.savefig(f"{names[0]}_{names[1]}_comparison.jpg")
    plt.show()
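
A minimal usage sketch (the index pairs and set names are illustrative):

from MS2LDA.Visualisation.plot_MotifsetSimilarity import vis_motifset_similarity

# motif 0 of the first set matches motif 3 of the second set, and so on
vis_motifset_similarity(
    motifset_a=[0, 1, 4],
    motifset_b=[3, 0, 2],
    names=["Reference", "New"],
    save=True,  # writes Reference_New_comparison.jpg
)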