
Modeling Module

modeling

calculate_document_entropy

calculate_document_entropy(model)

Mean entropy of the per-document topic distributions

Source code in MS2LDA/modeling.py
def calculate_document_entropy(model):
    """Entropy for Document-Topic Distribution"""
    entropy_values = []

    for doc in model.docs:
        topic_dist = doc.get_topic_dist()
        topic_dist = np.where(np.array(topic_dist) == 0, 1e-12, topic_dist)

        entropy = -np.sum(topic_dist * np.log(topic_dist))
        entropy_values.append(entropy)

    return np.mean(entropy_values)
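
The zeros in each distribution are replaced with 1e-12 before the logarithm so that np.log never receives 0. A minimal sketch of the same computation on a hand-made topic distribution (the numbers are illustrative only):

import numpy as np

# toy topic distribution over 4 motifs; one entry is exactly zero
topic_dist = np.array([0.7, 0.2, 0.1, 0.0])

# replace zeros so that log(0) = -inf never enters the sum
safe_dist = np.where(topic_dist == 0, 1e-12, topic_dist)

entropy = -np.sum(safe_dist * np.log(safe_dist))
print(entropy)  # ~0.80 nats; a uniform distribution over 4 topics would give log(4) ~ 1.39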

calculate_topic_entropy

calculate_topic_entropy(model)

Mean entropy of the topic-word distributions

Source code in MS2LDA/modeling.py
def calculate_topic_entropy(model):
    """Entropy for Topic-Word Distribution"""
    entropy_values = []

    for k in range(model.k):
        word_dist = model.get_topic_word_dist(k)
        word_dist = np.where(np.array(word_dist) == 0, 1e-12, word_dist)

        entropy = -np.sum(word_dist * np.log(word_dist))
        entropy_values.append(entropy)

    return np.mean(entropy_values)
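
Lower values mean more peaked topic-word distributions, i.e. motifs dominated by a few features. A quick comparison on made-up distributions, reusing the same entropy formula:

import numpy as np

def entropy(p):
    p = np.where(np.array(p) == 0, 1e-12, p)
    return -np.sum(p * np.log(p))

peaked = [0.97, 0.01, 0.01, 0.01]   # topic dominated by one feature
uniform = [0.25, 0.25, 0.25, 0.25]  # topic spread evenly over all features

print(entropy(peaked))   # ~0.17 nats: concentrated topic
print(entropy(uniform))  # log(4) ~ 1.39 nats: maximally spread topic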

check_convergence

check_convergence(entropy_history, epsilon=0.001, n=3)

Returns True if the relative changes over the last n entries of entropy_history are all below epsilon

Source code in MS2LDA/modeling.py
def check_convergence(entropy_history, epsilon=0.001, n=3):
    """no"""
    changes = [
        abs(entropy_history[i] - entropy_history[i - 1]) / entropy_history[i - 1]
        for i in range(1, len(entropy_history))
    ]

    return all(change < epsilon for change in changes[-n:])
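
The function computes the relative change between consecutive history entries and reports convergence once the last n changes are all below epsilon. A small illustration with a fabricated entropy history:

# fabricated entropy history: large early changes, then a plateau
history = [5.0, 4.0, 3.5, 3.49, 3.487, 3.486]

# relative changes: 0.2, 0.125, ~0.0029, ~0.0009, ~0.0003
print(check_convergence(history, epsilon=0.001, n=3))  # False: ~0.0029 exceeds epsilon
print(check_convergence(history, epsilon=0.01, n=3))   # True: last 3 changes are all < 0.01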

create_motif_spectra

create_motif_spectra(
    motif_features, charge=1, motifset_name="unknown", significant_digits=2
)

creates a matchms Spectrum object for each found motif

Parameters:

    motif_features (list): list of lists of (feature, importance) tuples; one inner list per motif, containing the spectral features assigned to that motif and their motif importances (required)

Returns:

    motif_spectra (list): list of matchms Spectrum objects; one for each motif

Source code in MS2LDA/modeling.py
def create_motif_spectra(
    motif_features, charge=1, motifset_name="unknown", significant_digits=2
):
    """creates a matchms spectrum object for the found motifs

    ARGS:
        motif_features (list): tuples within a list of lists with spectral features assigned per motif and their given motif importance

    RETURNS:
        motif_spectra (list): list of matchms spectrum objects; one for each motif
    """

    motif_spectra = []

    for k, motif_k_features in enumerate(motif_features):
        motif_spectrum = create_spectrum(
            motif_k_features,
            k,
            charge=charge,
            motifset=motifset_name,
            significant_digits=significant_digits,
        )
        motif_spectra.append(motif_spectrum)

    return motif_spectra
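
A sketch of how the function is typically chained after extract_motifs; the motif features below are made up for illustration:

# motif_features as produced by extract_motifs(model, top_n=...)
motif_features = [
    [("frag@77.04", 0.12), ("loss@18.01", 0.08)],   # motif 0 (made-up features)
    [("frag@105.07", 0.15), ("frag@91.05", 0.10)],  # motif 1
]

motif_spectra = create_motif_spectra(
    motif_features, charge=1, motifset_name="my_motifset", significant_digits=2
)
print(len(motif_spectra))  # one matchms Spectrum per motif -> 2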

define_model

define_model(n_motifs, model_parameters={})

creates an LDA model using the tomotopy library

Parameters:

    n_motifs (int): number of motifs that will be generated (required)
    model_parameters (dict): all further parameters that can be set on the tomotopy LDA model (see https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel) (default: {})

Returns:

    model: tomotopy LDAModel instance

Source code in MS2LDA/modeling.py
def define_model(n_motifs, model_parameters={}):
    """creating a LDA model using the tomotopy library

    ARGS:
        n_motifs (int): number of motifs that will be generated
        model_parameters (dict): defines all further parameters that can be set in the tomotopy LDA model (see https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel)

    RETURNS:
        model: tomotopy LDAModel class
    """

    model = tp.LDAModel(k=n_motifs, **model_parameters)

    return model
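
Since model_parameters is forwarded unchanged to tp.LDAModel, any constructor argument from the linked tomotopy documentation can be passed through; the values below are illustrative only:

# 200 motifs with explicit Dirichlet priors and a fixed random seed
model = define_model(
    n_motifs=200,
    model_parameters={"alpha": 0.1, "eta": 0.01, "seed": 42},
)
print(model.k)  # 200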

extract_motifs

extract_motifs(model, top_n=50)

extracts motifs from the trained LDA model

Parameters:

    model: tomotopy LDAModel instance (required)
    top_n (int): number of top-ranked features extracted per motif (default: 50)

Returns:

    motif_features (list): list of lists of (feature, importance) tuples; one inner list per motif, containing the spectral features assigned to that motif and their motif importances

Source code in MS2LDA/modeling.py
def extract_motifs(model, top_n=50):
    """extract motifs from the trained LDA model

    ARGS:
        model: tomotopy LDAModel class
        top_n (int): number of top n features extracted per motif

    RETURNS:
        motif_features (list): tuples within a list of lists with spectral features assigned per motif and their given motif importance
    """

    motif_features = []

    for motif_index in range(model.k):
        motif_k_features = model.get_topic_words(motif_index, top_n=top_n)
        motif_features.append(motif_k_features)

    return motif_features
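
Each inner list comes from tomotopy's get_topic_words and therefore holds (word, probability) tuples sorted by probability. A sketch of inspecting the result on an already trained model (feature names are illustrative):

motif_features = extract_motifs(model, top_n=3)

for motif_index, features in enumerate(motif_features):
    # features looks like [("frag@77.04", 0.05), ("loss@18.01", 0.03), ...]
    top_feature, importance = features[0]
    print(f"motif {motif_index}: top feature {top_feature} ({importance:.3f})")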

train_model

train_model(
    model,
    documents,
    iterations=100,
    train_parameters={},
    convergence_parameters={
        "type": "entropy_history_doc",
        "threshold": 0.01,
        "window_size": 3,
        "step_size": 10,
    },
)

trains the LDA model on the given documents

Parameters:

    model: tomotopy LDAModel instance (required)
    documents (list): list of lists with frag@/loss@ strings representing spectral features (required)
    iterations (int): maximum number of training iterations (default: 100)
    train_parameters (dict): all further parameters that can be set on the tomotopy train function (see https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train) (default: {})
    convergence_parameters (dict): convergence criterion; "type" selects the tracked metric, "threshold" is the maximum allowed relative change, "window_size" is the number of recent steps that must stay below the threshold, and "step_size" is the number of iterations per training step (default: {"type": "entropy_history_doc", "threshold": 0.01, "window_size": 3, "step_size": 10})

Returns:

    model: trained tomotopy LDAModel instance
    convergence_history (dict): metric histories (document entropy, topic entropy, perplexity, log likelihood), each appended to after every step_size iterations

Source code in MS2LDA/modeling.py
def train_model(
    model,
    documents,
    iterations=100,
    train_parameters={},
    convergence_parameters={
        "type": "entropy_history_doc",
        "threshold": 0.01,
        "window_size": 3,
        "step_size": 10,
    },
):
    """trains the LDA model on the given documents

    ARGS:
        model: tomotopy LDAModel class
        documents (list): list of lists with frag@/loss@ strings representing spectral features
        iterations (int): number of iterations in the training
        train_parameters (dict): defines all further parameters that can be set in the tomotopy training function (see https://bab2min.github.io/tomotopy/v0.12.6/en/#tomotopy.LDAModel.train)

    RETURNS:
        model: tomotopy LDAModel class
        convergence_curve (list): list containing the model perplexity values for after every 10 iterations
    """

    for doc in documents:
        model.add_doc(doc)

    convergence_history = {
        "entropy_history_doc": [],
        "entropy_history_topic": [],
        "perplexity_history": [],
        "log_likelihood_history": [],
    }

    for _ in tqdm(range(0, iterations, convergence_parameters["step_size"])):
        model.train(convergence_parameters["step_size"], **train_parameters)

        # calculate the perplexity score and save it
        perplexity = model.perplexity
        convergence_history["perplexity_history"].append(perplexity)

        # calculate the log-likelihood score and save it
        log_likelihood = model.ll_per_word
        convergence_history["log_likelihood_history"].append(log_likelihood)

        # calculate the document-topic entropy and save it
        current_doc_entropy = calculate_document_entropy(model)
        convergence_history["entropy_history_doc"].append(current_doc_entropy)

        # calculate the topic-word entropy and save it
        current_topic_entropy = calculate_topic_entropy(model)
        convergence_history["entropy_history_topic"].append(current_topic_entropy)

        # check the convergence criterion on the tracked metric
        tracked_history = convergence_history[convergence_parameters["type"]]
        model_converged = len(tracked_history) > convergence_parameters[
            "window_size"
        ] and check_convergence(
            tracked_history,
            epsilon=convergence_parameters["threshold"],
            n=convergence_parameters["window_size"],
        )

        # early stopping
        if model_converged:
            print("Model has converged")
            return model, convergence_history

    else:
        # for-else: runs only when the loop completed all iterations without converging
        print("Model did not converge")
        return model, convergence_history
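
Putting the module together, a minimal end-to-end sketch, assuming documents already holds tokenized spectra as frag@/loss@ strings (the two toy documents below are made up):

# documents: list of lists of frag@/loss@ strings, prepared elsewhere
documents = [
    ["frag@77.04", "frag@105.07", "loss@18.01"],
    ["frag@91.05", "loss@18.01"],
]

model = define_model(n_motifs=2, model_parameters={"seed": 42})
model, convergence_history = train_model(
    model,
    documents,
    iterations=200,
    convergence_parameters={
        "type": "entropy_history_doc",
        "threshold": 0.01,
        "window_size": 3,
        "step_size": 10,
    },
)

motif_features = extract_motifs(model, top_n=10)
motif_spectra = create_motif_spectra(motif_features, motifset_name="demo")
print(convergence_history["perplexity_history"][-1])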