Visualization Module

MS2LDA Visualisation

MS2LDA_visualisation

compute_coherence_values

compute_coherence_values(spectra_path, limit, start, step)

Compute c_v coherence for a range of topic numbers

Parameters:

spectra_path : Path to the MGF file with the input spectra
limit : Maximum number of topics (exclusive)
start : Number of topics to start from
step : Step size between topic numbers

Returns:

model_list : List of trained LDA topic models
coherence_values : Coherence values corresponding to the models with the respective number of topics

Source code in MS2LDA/Visualisation/MS2LDA_visualisation.py
def compute_coherence_values(spectra_path, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        spectra = load_mgf(spectra_path)
        cleaned_spectra = clean_spectra(spectra)
        fragment_words, loss_words = features_to_words(cleaned_spectra)
        feature_words = combine_features(fragment_words, loss_words)
        ms2lda = define_model(n_motifs=n_motifs)
        train_parameters = {"parallel": 4}
        trained_ms2lda = train_model(
            ms2lda, feature_words, iterations=300, train_parameters=train_parameters
        )
        model_list.append(trained_ms2lda)
        coherence_model_lda = CoherenceModel(
            model=lda_model, texts=spectra, dictionary=id2word, coherence="c_v"
        )
        coherence_values.append(coherence_model_lda.get_coherence())
    # Plotting
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc="best")
    plt.ylim(0, 1)
    plt.show()

    return None
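
A minimal usage sketch (the MGF path and the topic range are illustrative; with the fixes above the function also returns the trained models and their coherence scores):

from MS2LDA.Visualisation.MS2LDA_visualisation import compute_coherence_values

# scans 10, 20, ..., 90 motifs and plots the c_v coherence curve
models, scores = compute_coherence_values("data/spectra.mgf", limit=100, start=10, step=10)
best_n = max(zip(scores, range(10, 100, 10)))[1]  # topic count with the highest coherence
print(f"Best number of motifs by c_v coherence: {best_n}")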

General Visualisation

visualisation

create_interactive_motif_network

create_interactive_motif_network(
    spectra,
    significant_figures,
    motif_sizes,
    smiles_clusters,
    spectra_cluster,
    motif_colors,
    file_generation=False,
)

Generates a network for the annotated optimized spectra after running Spec2Vec annotation. Clicking a node shows the spectrum and the molecules associated with it.

Parameters:

spectra (list): list of matchms.Spectrum objects; after Spec2Vec annotation. required
significant_figures (int): number of significant figures to round the m/z values. required
motif_sizes (list): list of sizes for the motif_{i} nodes; should match the length of spectra. required
smiles_clusters (list): list of SMILES lists, one per motif. required
spectra_cluster (list): list of matched spectra per motif. required
motif_colors (list): list of node colors, one per motif; None uses the default color. required
file_generation (bool): if True, also writes the network to lda_model_output.graphml. False

Returns:

network (nx.Graph): network with nodes and edges, spectra and structures

Source code in MS2LDA/Visualisation/visualisation.py
def create_interactive_motif_network(
    spectra,
    significant_figures,
    motif_sizes,
    smiles_clusters,
    spectra_cluster,
    motif_colors,
    file_generation=False,
):
    """
    Generates a network for the annotated optimized spectra after running Spec2Vec
    annotation. Clicking a node shows the spectrum and the molecules associated with it.

    ARGS:
        spectra (list): list of matchms.Spectrum objects; after Spec2Vec annotation
        significant_figures (int): number of significant figures to round the m/z values
        motif_sizes (list, optional): list of sizes for the `motif_{i}` nodes; should match the length of spectra
        smiles_clusters (list): list of SMILES lists, one per motif
        spectra_cluster (list): list of matched spectra per motif
        motif_colors (list, optional): list of node colors, one per motif
        file_generation (bool, optional): if True, also writes the network to lda_model_output.graphml

    RETURNS:
        network (nx.Graph): network with nodes and edges, spectra and structures
    """

    if motif_sizes is not None:
        motif_sizes_filtered = list(
            map(lambda x: 0.7 if x == "?" else x, motif_sizes)
        )  # filtering '?'

    G = nx.Graph()

    if motif_sizes is not None and len(motif_sizes) != len(spectra):
        raise ValueError("Length of motif_sizes must match the number of spectra")

    for i, spectrum in enumerate(spectra):
        motif_node = f"motif_{i}"
        G.add_node(motif_node)

        if spectrum.peaks:
            peak_list = spectrum.peaks.mz
            rounded_peak_list = [round(x, significant_figures) for x in peak_list]
            int_peak_list = spectrum.peaks.intensities

            for edge, weight in zip(rounded_peak_list, int_peak_list):
                G.add_edge(motif_node, edge, weight=weight, color="red")

        if spectrum.losses:
            loss_list = spectrum.losses.mz
            rounded_loss_list = [round(x, significant_figures) for x in loss_list]
            int_losses_list = spectrum.losses.intensities

            for edge, weight in zip(rounded_loss_list, int_losses_list):
                G.add_edge(motif_node, edge, weight=weight, color="blue")

    node_sizes = {}
    if motif_sizes is None:
        default_size = 800
        for i in range(len(spectra)):  # start at 0 so motif_0 also gets a size
            node_sizes[f"motif_{i}"] = default_size
    else:
        n_smiles_cluster = []
        for i in smiles_clusters:
            n_smiles_cluster.append(len(i))
        max_n_smiles_cluster = max(n_smiles_cluster)

        n_frags_cluster = []
        for i in spectra:
            n_frags_cluster.append(len(i.peaks.mz))
        max_n_frags_cluster = max(n_frags_cluster)

        for i in range(len(spectra)):
            node_sizes[f"motif_{i}"] = (
                ((motif_sizes_filtered[i] * 10) ** 3) / 3
                + (((n_smiles_cluster[i] / max_n_smiles_cluster) * 10) ** 3) / 3
                + (((n_frags_cluster[i] / max_n_frags_cluster) * 10) ** 3) / 3
            )

    # new; for tox
    node_colors = {}
    if motif_colors is None:
        default_color = "#210070"
        for i in range(len(spectra)):
            node_colors[f"motif_{i}"] = default_color
    else:
        for i in range(len(spectra)):
            node_colors[f"motif_{i}"] = motif_colors[i]
    # --------------------------

    pos = nx.spring_layout(G)
    fig, ax = plt.subplots(figsize=(10, 50))

    edges = G.edges(data=True)
    weights = [d["weight"] * 10 for (u, v, d) in edges]
    edge_colors = [d["color"] for (u, v, d) in edges]
    nx.draw_networkx_edges(G, pos, alpha=0.3, width=weights, edge_color=edge_colors)
    node_size_list = [node_sizes.get(node, 100) for node in G.nodes]
    node_color_list = [node_colors.get(node, "green") for node in G.nodes]
    nx.draw_networkx_nodes(
        G, pos, node_size=node_size_list, node_color=node_color_list, alpha=0.9
    )

    label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
    nx.draw_networkx_labels(G, pos, font_size=6, bbox=label_options)

    def on_click(event):
        for node, (x, y) in pos.items():
            dist = (x - event.xdata) ** 2 + (y - event.ydata) ** 2
            if dist < 0.00025:
                if isinstance(
                    node, str
                ):  # Check if the node is a string and matches "motif_x"
                    node_number = int(node.split("_")[1])
                    # print(f"Node {node} clicked!\n"
                    # f"Cluster similarity: {motif_sizes_filtered[node_number]*100}%\n"
                    # f"N of compounds: {(n_smiles_cluster[node_number]/max_n_smiles_cluster)*100}"
                    # f"N of features: {(n_frags_cluster[node_number]/max_n_frags_cluster)*100}"
                    # f"Fragments: {spectra[node_number].peaks.mz}\n"
                    # f"Losses: {spectra[node_number].losses.mz}")
                    mols = [MolFromSmiles(smi) for smi in smiles_clusters[node_number]]
                    img = MolsToGridImage(mols)

                    pil_img = Image.open(io.BytesIO(img.data))

                    # Display new window
                    pil_img.show()

                    for spec in spectra_cluster[node_number]:
                        spectra[node_number].plot_against(spec)

                break

    # Connect the click event to the on_click function
    fig.canvas.mpl_connect("button_press_event", on_click)

    plt.show()
    if file_generation:
        nx.write_graphml(G, "lda_model_output.graphml")

    return G
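
A runnable sketch with toy inputs (the spectra, sizes, and SMILES are illustrative; in practice they come from the Spec2Vec annotation step):

import numpy as np
from matchms import Spectrum
from MS2LDA.Visualisation.visualisation import create_interactive_motif_network

motifs = [
    Spectrum(mz=np.array([77.04, 105.03]), intensities=np.array([0.6, 1.0]), metadata={"id": "motif_0"}),
    Spectrum(mz=np.array([91.05, 119.09]), intensities=np.array([1.0, 0.4]), metadata={"id": "motif_1"}),
]
G = create_interactive_motif_network(
    spectra=motifs,
    significant_figures=2,
    motif_sizes=[0.9, "?"],                          # '?' falls back to 0.7
    smiles_clusters=[["c1ccccc1O"], ["Cc1ccccc1"]],  # one SMILES list per motif
    spectra_cluster=[[motifs[0]], [motifs[1]]],      # matched spectra per motif
    motif_colors=None,                               # None -> default node color
    file_generation=False,
)

Clicking a motif node opens the molecule grid for that motif and plots the motif against each of its matched spectra.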

create_network

create_network(
    spectra, significant_figures=2, motif_sizes=None, file_generation=False
)

Generates a network for the motif spectra, where the nodes are the motifs (output of the LDA model) and the edges are the peaks and losses of the spectra. The node sizes can be adjusted via motif_sizes (e.g., refined annotation scores).

Parameters:

spectra (list): list of matchms.Spectrum objects; after LDA modelling. required
significant_figures (int): number of significant figures to round the m/z values. default: 2
motif_sizes (list): list of sizes for the motif_{i} nodes; should match the length of spectra. default: None
file_generation (bool): if True, writes the network to lda_model_output.graphml. default: False

Returns:

network (nx.Graph): network with nodes and edges

Source code in MS2LDA/Visualisation/visualisation.py
def create_network(
    spectra, significant_figures=2, motif_sizes=None, file_generation=False
):
    """
    Generates a network for the motifs spectra, where the nodes are the motifs (output of LDA model)
    and the edges are the peaks and losses of the spectra. The size of the nodes can be adjusted with the motif_sizes refined annotation

    ARGS:
        spectra (list): list of matchms.Spectrum.objects; after LDA modelling
        significant_figures (int, optional): number of significant figures to round the mz values
        motif_sizes (list, optional): list of sizes for the `motif_{i}` nodes; should match the length of spectra

    RETURNS:
        network (nx.Graph): network with nodes and edges
    """
    if motif_sizes is not None:
        motif_sizes_filtered = list(
            map(lambda x: 0.7 if x == "?" else x, motif_sizes)
        )  # filtering ?

    G = nx.Graph()

    if motif_sizes is not None and len(motif_sizes) != len(spectra):
        raise ValueError("Length of motif_sizes must match the number of spectra")

    for i, spectrum in enumerate(spectra):
        motif_node = f"motif_{i}"
        G.add_node(motif_node)

        peak_list = spectrum.peaks.mz
        rounded_peak_list = [round(x, significant_figures) for x in peak_list]
        int_peak_list = spectrum.peaks.intensities
        for edge, weight in zip(rounded_peak_list, int_peak_list):
            G.add_edge(motif_node, edge, weight=weight, color="red")

        if spectrum.losses:
            loss_list = spectrum.losses.mz
            rounded_loss_list = [round(x, significant_figures) for x in loss_list]
            int_losses_list = spectrum.losses.intensities

            for edge, weight in zip(rounded_loss_list, int_losses_list):
                G.add_edge(motif_node, edge, weight=weight, color="blue")

    # Arranging node size - motifs
    node_sizes = {}
    if motif_sizes is None:
        default_size = 1000
        for i in range(len(spectra)):  # start at 0 so motif_0 also gets a size
            node_sizes[f"motif_{i}"] = default_size

    else:
        for i in range(len(spectra)):
            node_sizes[f"motif_{i}"] = ((motif_sizes_filtered[i] * 100) ** 2) / 2

    fig, ax = plt.subplots(figsize=(50, 50))
    edges = G.edges(data=True)
    weights = [d["weight"] * 10 for (u, v, d) in edges]
    edge_colors = [d["color"] for (u, v, d) in edges]
    pos = nx.spring_layout(G)

    nx.draw_networkx_edges(G, pos, alpha=0.3, width=weights, edge_color=edge_colors)
    node_size_list = [node_sizes.get(node, 100) for node in G.nodes]
    nx.draw_networkx_nodes(
        G, pos, node_size=node_size_list, node_color="#210070", alpha=0.9
    )

    label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
    nx.draw_networkx_labels(G, pos, font_size=14, bbox=label_options)

    if file_generation:
        nx.write_graphml(G, "lda_model_output.graphml")
    return G
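
A runnable sketch with two toy motif spectra (m/z and intensity values are illustrative):

import numpy as np
from matchms import Spectrum
from MS2LDA.Visualisation.visualisation import create_network

motifs = [
    Spectrum(mz=np.array([77.0389, 105.0335]), intensities=np.array([0.6, 1.0]), metadata={"id": "motif_0"}),
    Spectrum(mz=np.array([77.0391, 119.0855]), intensities=np.array([1.0, 0.4]), metadata={"id": "motif_1"}),
]
# both motifs share the rounded peak 77.04, so their nodes connect to a common m/z node
G = create_network(motifs, significant_figures=2, file_generation=True)  # also writes lda_model_output.graphml
print(G.number_of_nodes(), G.number_of_edges())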

show_annotated_motifs

show_annotated_motifs(
    opt_motif_spectra, motif_spectra, clustered_smiles, savefig=None
)

Show side-by-side RDKit molecule images from clustered SMILES, and plot motif vs. optimized motif.

  • If in a Jupyter notebook, we'll try the 'Notebook-friendly' style.
  • If not in Jupyter, we'll switch to a headless backend (no GUI windows), skip plt.show(), and just close figures if not saving.
Source code in MS2LDA/Visualisation/visualisation.py
def show_annotated_motifs(
    opt_motif_spectra, motif_spectra, clustered_smiles, savefig=None
):
    """
    Show side-by-side RDKit molecule images from clustered SMILES,
    and plot motif vs. optimized motif.

    - If in a Jupyter notebook, we'll try the 'Notebook-friendly' style.
    - If not in Jupyter, we'll switch to a headless backend (no GUI windows),
      skip plt.show(), and just close figures if not saving.
    """
    assert len(opt_motif_spectra) == len(
        motif_spectra
    ), "Lengths of opt_motif_spectra and motif_spectra must match!"

    # Create output folder if needed
    if savefig is not None:
        os.makedirs(savefig, exist_ok=True)

    # Pass returnPNG=not_in_jupyter: in Jupyter, MolsToGridImage is called with
    # returnPNG=False; outside Jupyter (e.g. in Dash) it returns PNG bytes.
    not_in_jupyter = not _in_nb

    for m in range(len(motif_spectra)):
        mass_to_charge_opt = opt_motif_spectra[m].peaks.mz
        intensities_opt = opt_motif_spectra[m].peaks.intensities
        mass_to_charge = motif_spectra[m].peaks.mz
        intensities = motif_spectra[m].peaks.intensities

        loss_to_charge_opt = opt_motif_spectra[m].losses.mz
        loss_intensities_opt = opt_motif_spectra[m].losses.intensities
        loss_to_charge = motif_spectra[m].losses.mz
        loss_intensities = motif_spectra[m].losses.intensities

        # Convert SMILES -> RDKit mols
        mols = []
        for smi in clustered_smiles[m]:
            mol = MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)

        # If no valid SMILES, fallback to blank
        pil_img = None
        if len(mols) == 0:
            pil_img = Image.new("RGB", (400, 400), "white")
        else:
            # Attempt to get either a PIL object or bytes from MolsToGridImage
            # If in Jupyter => returnPNG=False
            # if not => returnPNG=True
            result = MolsToGridImage(
                mols,
                molsPerRow=len(mols),
                subImgSize=(400, 400),
                returnPNG=not_in_jupyter,
            )
            pil_img = _convert_molgrid_result_to_pil(result)
            if pil_img is None:
                # If that fails, fallback blank
                pil_img = Image.new("RGB", (400, 400), "white")

        # Make figure
        fig = plt.figure(figsize=(10, 6), facecolor="none", edgecolor="none")

        # Top subplot: molecule grid
        ax_top = fig.add_subplot(3, 1, 1)
        ax_top.imshow(pil_img)
        ax_top.axis("off")
        top_pos = ax_top.get_position()
        ax_top.set_position([top_pos.x0, top_pos.y0, top_pos.width, top_pos.height])

        # Bottom subplot: motif vs. optimized motif
        ax_bot = fig.add_subplot(3, 1, 3)
        if len(mass_to_charge):
            ax_bot.stem(
                mass_to_charge,
                intensities,
                basefmt="k-",
                markerfmt="",
                linefmt="black",
                label=f"motif_{m}",
            )
        if len(mass_to_charge_opt) > 0:
            ax_bot.stem(
                mass_to_charge_opt,
                intensities_opt,
                basefmt="k-",
                markerfmt="",
                linefmt="#FF9E01",
                label=f"opt motif_{m}",
            )

        ax_bot.set_ylim(0)
        ax_bot.set_xlabel("m/z", fontsize=12)
        ax_bot.set_ylabel("Intensity", fontsize=12)
        ax_bot.spines["right"].set_visible(False)
        ax_bot.spines["top"].set_visible(False)
        ax_bot.spines["left"].set_color("black")
        ax_bot.spines["bottom"].set_color("black")
        ax_bot.spines["left"].set_linewidth(1.5)
        ax_bot.spines["bottom"].set_linewidth(1.5)
        ax_bot.tick_params(
            axis="both",
            which="major",
            direction="out",
            length=6,
            width=1.5,
            color="black",
        )

        # middle subplot: motif vs. optimized motif losses
        ax_midd = fig.add_subplot(3, 1, 2)
        if len(loss_to_charge):
            ax_midd.stem(
                loss_to_charge,
                loss_intensities,
                basefmt="k-",
                markerfmt="",
                linefmt="black",
                bottom=0,
                label=f"motif_{m}",
            )
        if len(loss_to_charge_opt) > 0:
            ax_midd.stem(
                loss_to_charge_opt,
                loss_intensities_opt,
                basefmt="k-",
                markerfmt="",
                linefmt="#3979A9",
                bottom=0,
                label=f"opt motif_{m}",
            )
        # Ensure stems originate from the x-axis (y=0)
        ax_midd.axhline(y=0, color="black", linewidth=1.5)  # Explicitly draw x-axis

        # Only set ylim if there are valid intensities
        if any(loss_intensities):
            ax_midd.set_ylim(0, max(loss_intensities) * 1.1)
        else:
            ax_midd.set_ylim(0, 1)  # Default if no data

        ax_midd.yaxis.set_label_position("right")
        ax_midd.xaxis.set_label_position("top")
        ax_midd.set_xlabel("loss", fontsize=12)
        ax_midd.set_ylabel("Intensity", fontsize=12)
        ax_midd.invert_xaxis()
        ax_midd.invert_yaxis()
        ax_midd.xaxis.tick_top()
        ax_midd.yaxis.tick_right()

        ax_midd.spines["left"].set_visible(False)
        ax_midd.spines["bottom"].set_visible(False)
        ax_midd.spines["right"].set_color("black")
        ax_midd.spines["top"].set_color("black")
        ax_midd.spines["right"].set_linewidth(1.5)
        ax_midd.spines["top"].set_linewidth(1.5)
        ax_midd.tick_params(
            axis="both",
            which="major",
            direction="out",
            length=6,
            width=1.5,
            color="black",
        )

        plt.legend(loc="best")

        # Save or close
        if savefig:
            outfile = os.path.join(savefig, f"motif_{m}.png")
            plt.savefig(outfile, format="png", dpi=400)
            plt.close(fig)
        else:
            # If in Jupyter => show
            if _in_nb:
                plt.show()
            else:
                plt.close(fig)
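
A runnable sketch, assuming matchms's add_losses filter is available to populate the losses (the function reads .losses unconditionally); all values are toys:

import numpy as np
from matchms import Spectrum
from matchms.filtering import add_losses
from MS2LDA.Visualisation.visualisation import show_annotated_motifs

def toy_motif(mz, intensities, name):
    spec = Spectrum(
        mz=np.array(mz),
        intensities=np.array(intensities),
        metadata={"id": name, "precursor_mz": 200.0},  # precursor_mz lets add_losses compute losses
    )
    return add_losses(spec)

motifs = [toy_motif([81.07, 109.10], [0.5, 1.0], "motif_0")]
opt_motifs = [toy_motif([81.07], [1.0], "motif_0_optimized")]
clustered_smiles = [["CC(=O)Oc1ccccc1C(=O)O"]]  # one illustrative SMILES cluster per motif
show_annotated_motifs(opt_motifs, motifs, clustered_smiles, savefig="motif_figs")  # writes motif_figs/motif_0.png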

LDA Dictionary

ldadict

generate_corpusjson_from_tomotopy

generate_corpusjson_from_tomotopy(
    model,
    documents,
    spectra,
    doc_metadata,
    min_prob_to_keep_beta=0.001,
    min_prob_to_keep_phi=0.01,
    min_prob_to_keep_theta=0.01,
    filename=None,
)

Generates an lda_dict in a format similar to that used by the previous MS2LDA app.

Source code in MS2LDA/Visualisation/ldadict.py
def generate_corpusjson_from_tomotopy(
    model,
    documents,
    spectra,
    doc_metadata,
    min_prob_to_keep_beta=1e-3,
    min_prob_to_keep_phi=1e-2,
    min_prob_to_keep_theta=1e-2,
    filename=None,
):
    """
    Generates an lda_dict in a format similar to that used by the previous MS2LDA app.
    """

    # Build doc names
    if spectra is not None and len(spectra) == len(documents):
        doc_names = [spec.get("id") for spec in spectra]
    else:
        doc_names = list(doc_metadata.keys())

    # Gather all unique tokens from `documents`:
    unique_words = set()
    for doc in documents:
        unique_words.update(doc)
    word_list = sorted(unique_words)
    word_index = {word: idx for idx, word in enumerate(word_list)}
    n_words = len(word_index)

    # doc_index
    doc_index = {name: i for i, name in enumerate(doc_names)}
    n_docs = len(doc_index)

    # corpus
    corpus = {}
    for doc_name, doc_words in zip(doc_names, documents):
        word_counts = {}
        for w in doc_words:
            word_counts[w] = word_counts.get(w, 0) + 1
        corpus[doc_name] = word_counts

    # Number of topics and alpha
    K = model.k
    alpha = list(map(float, model.alpha))

    # Map each token => model’s vocab index if present
    model_vocab = model.used_vocabs
    model_vocab_index = {w: i for i, w in enumerate(model_vocab)}
    word_to_model_idx = {}
    for w in word_index:
        if w in model_vocab_index:
            word_to_model_idx[word_index[w]] = model_vocab_index[w]

    # Construct beta_matrix (K x n_words)
    beta_matrix = np.zeros((K, n_words), dtype=float)
    for k_idx in range(K):
        word_probs = model.get_topic_word_dist(k_idx)
        for w_i in range(n_words):
            if w_i in word_to_model_idx:
                model_wi = word_to_model_idx[w_i]
                beta_matrix[k_idx, w_i] = word_probs[model_wi]
            else:
                beta_matrix[k_idx, w_i] = 0.0

    # Construct gamma_matrix (doc-topic distribution)
    gamma_matrix = np.zeros((n_docs, K), dtype=float)
    for d_idx, doc in enumerate(model.docs):
        gamma_matrix[d_idx, :] = doc.get_topic_dist()

    # Construct phi_matrix
    phi_matrix = {}
    for d_idx, doc in enumerate(model.docs):
        doc_name = doc_names[d_idx]
        phi_matrix[doc_name] = defaultdict(lambda: np.zeros(K, dtype=float))
        for word_id, topic_id in zip(doc.words, doc.topics):
            w_str = model.vocabs[word_id]
            phi_matrix[doc_name][w_str][topic_id] += 1.0
        # Normalize each word’s topic distribution
        for w_str, topic_vec in phi_matrix[doc_name].items():
            total = topic_vec.sum()
            if total > 0.0:
                phi_matrix[doc_name][w_str] = topic_vec / total

    # Build topic_index + metadata
    topic_index = {f"motif_{k}": k for k in range(K)}
    topic_metadata = {
        f"motif_{k}": {"name": f"motif_{k}", "type": "learnt"} for k in range(K)
    }

    # For convenience, extract “features” if they look like "frag@X" or "loss@X".
    features_to_mz = {}
    for w in word_list:
        if w.startswith("frag@") or w.startswith("loss@"):
            try:
                val = float(w.split("@")[1])
                features_to_mz[w] = (val, val)
            except (ValueError, IndexError):  # token is not of the form frag@X / loss@X
                pass

    # Prepare the final dictionary
    lda_dict = {
        "corpus": corpus,
        "word_index": word_index,
        "doc_index": doc_index,
        "K": K,
        "alpha": alpha,
        "beta": {},
        "doc_metadata": doc_metadata,
        "topic_index": topic_index,
        "topic_metadata": topic_metadata,
        "features": features_to_mz,
        "gamma": [list(map(float, row)) for row in gamma_matrix],
    }

    reverse_word_index = {v: k for k, v in word_index.items()}
    reverse_topic_index = {v: k for k, v in topic_index.items()}

    # Fill in beta
    for k_idx in range(K):
        t_name = reverse_topic_index[k_idx]  # e.g. "motif_49"
        t_dict = {}
        for w_i in range(n_words):
            val = beta_matrix[k_idx, w_i]
            if val > min_prob_to_keep_beta:
                w_str = reverse_word_index[w_i]
                t_dict[w_str] = float(val)
        lda_dict["beta"][t_name] = t_dict

    # Build theta from gamma
    e_theta = gamma_matrix / gamma_matrix.sum(axis=1, keepdims=True)
    lda_dict["theta"] = {}
    for d_idx in range(n_docs):
        doc_name = doc_names[d_idx]
        row = {}
        for k_idx in range(K):
            val = e_theta[d_idx, k_idx]
            if val > min_prob_to_keep_theta:
                row[reverse_topic_index[k_idx]] = float(val)
        lda_dict["theta"][doc_name] = row

    # Compute overlap_scores
    overlap_scores = {}
    for doc_name, phi_dict in phi_matrix.items():
        doc_overlaps = {}
        for w_str, topic_vec in phi_dict.items():

            # For each topic that has p >= min_prob_to_keep_phi, check if we pass that threshold
            for k_idx, p in enumerate(topic_vec):
                if p < min_prob_to_keep_phi:
                    continue

                t = reverse_topic_index[k_idx]
                if t not in lda_dict["beta"]:
                    continue
                # add a zero if not in doc_overlaps
                if t not in doc_overlaps:
                    doc_overlaps[t] = 0.0
                # multiply
                doc_overlaps[t] += lda_dict["beta"][t].get(w_str, 0.0) * p

        overlap_scores[doc_name] = {}
        for t in doc_overlaps:
            overlap_scores[doc_name][t] = float(doc_overlaps[t])
    lda_dict["overlap_scores"] = overlap_scores

    # Optionally save to JSON
    if filename:
        with open(filename, "w") as f:
            json.dump(lda_dict, f, indent=2)

    return lda_dict
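
A small end-to-end sketch with a toy tomotopy model (tokens and counts are illustrative):

import tomotopy as tp
from MS2LDA.Visualisation.ldadict import generate_corpusjson_from_tomotopy

docs = [
    ["frag@77.04", "frag@105.03", "loss@28.00"],
    ["frag@91.05", "frag@77.04", "loss@18.01"],
]
model = tp.LDAModel(k=2, seed=42)
for tokens in docs:
    model.add_doc(tokens)
model.train(100)

doc_metadata = {f"spec_{i}": {"placeholder": f"Doc {i}"} for i in range(len(docs))}
lda_dict = generate_corpusjson_from_tomotopy(model, docs, spectra=None, doc_metadata=doc_metadata)
print(list(lda_dict["topic_index"]))  # ['motif_0', 'motif_1']
print(lda_dict["features"])           # m/z values parsed from the frag@/loss@ tokens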

save_visualization_data

save_visualization_data(
    trained_ms2lda,
    cleaned_spectra,
    optimized_motifs,
    doc2spec_map,
    output_folder,
    filename="ms2lda_viz.json",
    min_prob_to_keep_beta=0.001,
    min_prob_to_keep_phi=0.01,
    min_prob_to_keep_theta=0.01,
    run_parameters=None,
)

Creates the final data structure needed by the MS2LDA UI (clustered_smiles_data, optimized_motifs_data, lda_dict, spectra_data) and saves it to JSON in output_folder/filename.

Parameters:

trained_ms2lda (tomotopy.LDAModel): the trained LDA model in memory. required
cleaned_spectra (list of Spectrum): final cleaned spectra. required
optimized_motifs (list of Spectrum): annotated + optimized motifs. required
doc2spec_map (dict): doc-hash to original Spectrum map. required
output_folder (str): folder path for saving the .json. required
filename (str): name of the saved JSON. default: 'ms2lda_viz.json'
min_prob_to_keep_beta (float): threshold for storing the topic-word distribution in beta. default: 0.001
min_prob_to_keep_phi (float): threshold for storing the word-topic distribution in phi (used for the overlap calculation). default: 0.01
min_prob_to_keep_theta (float): threshold for the doc-topic distribution in theta. default: 0.01
run_parameters (dict, optional): run settings stored alongside the data. default: None

Returns:

None

Source code in MS2LDA/Visualisation/ldadict.py
def save_visualization_data(
    trained_ms2lda,
    cleaned_spectra,
    optimized_motifs,
    doc2spec_map,
    output_folder,
    filename="ms2lda_viz.json",
    min_prob_to_keep_beta=1e-3,
    min_prob_to_keep_phi=1e-2,
    min_prob_to_keep_theta=1e-2,
    run_parameters=None,
):
    """
    Creates the final data structure needed by the MS2LDA UI
    (clustered_smiles_data, optimized_motifs_data, lda_dict, spectra_data)
    and saves it to JSON in `output_folder/filename`.

    Args:
        trained_ms2lda (tomotopy.LDAModel): the trained LDA model in memory
        cleaned_spectra (list of Spectrum): final cleaned spectra
        optimized_motifs (list of Spectrum): annotated + optimized motifs
        doc2spec_map (dict): doc-hash to original Spectrum map
        output_folder (str): folder path for saving the .json
        filename (str): name of the saved JSON (default "ms2lda_viz.json")
        min_prob_to_keep_beta (float): threshold for storing topic-word distribution in beta
        min_prob_to_keep_phi (float): threshold for storing word-topic distribution in phi (used for overlap calc)
        min_prob_to_keep_theta (float): threshold for doc-topic distribution in theta

    Returns:
        None
    """

    # 1) Build "documents" & doc_metadata from the model in memory
    documents = []
    for doc_idx, doc in enumerate(trained_ms2lda.docs):
        tokens = [trained_ms2lda.vocabs[word_id] for word_id in doc.words]
        documents.append(tokens)

    doc_metadata = {}
    for i, doc in enumerate(trained_ms2lda.docs):
        doc_name = f"spec_{i}"
        doc_metadata[doc_name] = {"placeholder": f"Doc {i}"}

    # 2) Generate the main LDA dictionary (which does *not* store phi in final dict)
    lda_dict = generate_corpusjson_from_tomotopy(
        model=trained_ms2lda,
        documents=documents,
        spectra=None,  # skip re-cleaning for token consistency
        doc_metadata=doc_metadata,
        min_prob_to_keep_beta=min_prob_to_keep_beta,
        min_prob_to_keep_phi=min_prob_to_keep_phi,
        min_prob_to_keep_theta=min_prob_to_keep_theta,
        filename=None,  # we won't save it here
    )

    # 3) Convert each cleaned spectrum to a dictionary
    def spectrum_to_dict(s):
        dct = {
            "metadata": s.metadata.copy(),
            "mz": [float(mz) for mz in s.peaks.mz],
            "intensities": [float(i) for i in s.peaks.intensities],
        }
        if s.losses:
            dct["metadata"]["losses"] = [
                {"loss_mz": float(mz_), "loss_intensity": float(int_)}
                for mz_, int_ in zip(s.losses.mz, s.losses.intensities)
            ]
        return dct

    # 4) Convert each optimized motif to a dictionary
    def motif_to_dict(m):
        dct = {
            "metadata": m.metadata.copy(),
            "mz": [float(x) for x in m.peaks.mz],
            "intensities": [float(x) for x in m.peaks.intensities],
        }
        if m.losses:
            dct["metadata"]["losses"] = [
                {"loss_mz": float(mz_), "loss_intensity": float(int_)}
                for mz_, int_ in zip(m.losses.mz, m.losses.intensities)
            ]
        return dct

    optimized_motifs_data = [motif_to_dict(m) for m in optimized_motifs]
    spectra_data = [spectrum_to_dict(s) for s in cleaned_spectra]

    # 5) Gather short_annotation from each optimized motif for "clustered_smiles_data"
    clustered_smiles_data = []
    for motif in optimized_motifs:
        ann = motif.get("auto_annotation")
        if isinstance(ann, list):
            clustered_smiles_data.append(ann)
        elif ann is None:
            clustered_smiles_data.append([])
        else:
            # single string
            clustered_smiles_data.append([ann])

    # 6) Build the doc→spec index from doc2spec_map
    # (computed once for all documents; it does not depend on the motifs)
    # Map Spectrum object -> integer index
    spectrum_to_idx = {spec: i for i, spec in enumerate(cleaned_spectra)}
    doc_to_spec_index = {}

    for d_idx, doc in enumerate(trained_ms2lda.docs):
        words = [trained_ms2lda.vocabs[w_id] for w_id in doc.words]
        doc_text = "".join(words)
        hashed = hashlib.md5(doc_text.encode("utf-8")).hexdigest()
        if hashed in doc2spec_map:
            real_spec = doc2spec_map[hashed]
            doc_to_spec_index[str(d_idx)] = spectrum_to_idx.get(real_spec, -1)
        else:
            doc_to_spec_index[str(d_idx)] = -1  # hash not found

    lda_dict["doc_to_spec_index"] = doc_to_spec_index

    final_data = {
        "clustered_smiles_data": clustered_smiles_data,
        "optimized_motifs_data": optimized_motifs_data,
        "lda_dict": lda_dict,
        "spectra_data": spectra_data,
        "run_parameters": run_parameters if run_parameters else {},
    }

    # 7) Compress & Save as .json.gz
    os.makedirs(output_folder, exist_ok=True)
    outpath = os.path.join(output_folder, filename + ".gz")  # e.g. ms2lda_viz.json.gz
    with gzip.open(outpath, "wt", encoding="utf-8") as gz_file:
        json.dump(final_data, gz_file, indent=2)

    print(f"Visualization data saved (gzipped) to: {outpath}")

Motifset Similarity Plotting

plot_MotifsetSimilarity

vis_motifset_similarity

vis_motifset_similarity(
    motifset_a, motifset_b, names=["First", "Second"], save=False
)

Compares two sets of motifs and draws lines between similar ones.

Parameters:

motifset_a (list): a list of indices which have a similarity to motifset_b. required
motifset_b (list): a list of indices which have a similarity to motifset_a. required
names (list): names for the two motif sets. default: ['First', 'Second']
save (bool): whether the generated plot should be saved. default: False

Returns:

None

Source code in MS2LDA/Visualisation/plot_MotifsetSimilarity.py
def vis_motifset_similarity(
    motifset_a, motifset_b, names=["First", "Second"], save=False
):
    """compares two set of motifs and draw lines between similar ones

    ARGS:
        motifset_a (list): a list of indices which have a similarity to motifset_b
        motifset_b (list): a list of indices which have a similarity to motifset_a
        names (list): names for motifsets
        saves (True or False): parameter if generated plot should be saved

    RETURNS:
        None
    """

    similarity_sets = list(zip(motifset_a, motifset_b))
    n_motifset_a = list(range(max(motifset_a) + 1))
    n_motifset_b = list(range(max(motifset_b) + 1))

    unique_entries_a = len(set(motifset_a))
    unique_entries_b = len(set(motifset_b))

    plt.figure(figsize=(25, 10))

    plt.text(-0.5, 1.5, f"{names[0]} motifset with {unique_entries_a} unique motifs")
    for idx in n_motifset_a:
        if idx in motifset_a:
            plt.text(idx, 1.01, idx, ha="center", va="center", fontsize=5, color="blue")

    plt.text(-0.5, -1.5, f"{names[1]} motifset with {unique_entries_b} unique motifs")
    for idx in n_motifset_b:
        if idx in motifset_b:
            plt.text(idx, -1.01, idx, ha="center", va="center", fontsize=5, color="red")

    for motifset_a_index, motifset_b_index in similarity_sets:
        plt.plot(
            [n_motifset_a[motifset_a_index], n_motifset_b[motifset_b_index]],
            [1, -1],
            "k-",
            lw=0.5,
        )

    plt.xlim(0, max(n_motifset_a + n_motifset_b))
    plt.ylim(-2, 2)
    plt.axis("off")

    plt.title("Similarities between to Motif sets")

    if save:
        plt.savefig(f"{names[0]}_{names[1]}_comparison.jpg")
    plt.show()
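
A minimal usage sketch (the index pairs and set names are illustrative):

from MS2LDA.Visualisation.plot_MotifsetSimilarity import vis_motifset_similarity

# motif 0 of the first set matches motif 3 of the second set, and so on
vis_motifset_similarity(
    motifset_a=[0, 1, 4],
    motifset_b=[3, 0, 2],
    names=["Reference", "New"],
    save=True,  # writes Reference_New_comparison.jpg
)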