Skip to content

Preprocessing Module

Load and Clean

load_and_clean

clean_spectra

clean_spectra(spectra, preprocessing_parameters={})

uses matchms to normalize intensities, add information and add losses to the spectra

Parameters:

Name Type Description Default
spectra generator

generator object of matchms.Spectrum.objects loaded via matchms in python

required
preprocessing_parameters dict

dictionary of filter settings (min_mz, max_mz, max_frags, min_frags, min_intensity, max_intensity); missing keys are filled with defaults

{}

Returns:

Name Type Description
cleaned_spectra list

list of matchms.Spectrum.objects; spectra that do not fit will be removed

Source code in MS2LDA/Preprocessing/load_and_clean.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def clean_spectra(spectra, preprocessing_parameters=None):
    """uses matchms to normalize intensities, add information and add losses to the spectra

    ARGS:
        spectra (generator): generator object of matchms.Spectrum.objects loaded via matchms in python
        preprocessing_parameters (dict, optional): filter settings; missing keys are
            filled in place with defaults: min_mz (0), max_mz (1000), max_frags (500),
            min_frags (3), min_intensity (0.001), max_intensity (1)

    RETURNS:
        cleaned_spectra (list): list of matchms.Spectrum.objects; spectra that do not fit will be removed
    """
    # A mutable default ({}) would be shared across calls and permanently polluted
    # by the setdefault() calls below, so default to None and build a fresh dict.
    if preprocessing_parameters is None:
        preprocessing_parameters = {}

    defaults = {
        "min_mz": 0,
        "max_mz": 1000,
        "max_frags": 500,
        "min_frags": 3,
        "min_intensity": 0.001,
        "max_intensity": 1,
    }
    for key, default in defaults.items():
        preprocessing_parameters.setdefault(key, default)

    cleaned_spectra = []
    count = 0

    for spectrum in spectra:
        # metadata filters
        spectrum = msfilters.default_filters(spectrum)
        spectrum = msfilters.add_retention_index(spectrum)
        spectrum = msfilters.add_retention_time(spectrum)
        # spectrum = msfilters.require_precursor_mz(spectrum) # do we need this

        # normalize and filter peaks
        spectrum = msfilters.normalize_intensities(spectrum)
        spectrum = msfilters.select_by_relative_intensity(
            spectrum,
            intensity_from=preprocessing_parameters["min_intensity"],
            intensity_to=preprocessing_parameters["max_intensity"],
        )
        spectrum = msfilters.select_by_mz(
            spectrum,
            mz_from=preprocessing_parameters["min_mz"],
            mz_to=preprocessing_parameters["max_mz"],
        )
        spectrum = msfilters.reduce_to_number_of_peaks(
            spectrum, n_max=preprocessing_parameters["max_frags"]
        )
        spectrum = msfilters.require_minimum_number_of_peaks(
            spectrum, n_required=preprocessing_parameters["min_frags"]
        )
        # spectrum = msfilters.add_losses(spectrum)

        # matchms filters return None when a spectrum fails a requirement;
        # only survivors are kept and re-indexed sequentially
        if spectrum:
            spectrum.set("id", f"spec_{count}")  # reindex
            cleaned_spectra.append(spectrum)
            count += 1

    return cleaned_spectra

load_mgf

load_mgf(spectra_path)

loads spectra from a mgf file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.mgf file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
11
12
13
14
15
16
17
18
19
20
21
22
23
def load_mgf(spectra_path):
    """Load spectra from an .mgf file via matchms.

    ARGS:
        spectra_path (str): path to the spectra.mgf file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """
    # thin wrapper around the matchms importer
    return load_from_mgf(spectra_path)

load_msp

load_msp(spectra_path)

loads spectra from an msp file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.msp file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
41
42
43
44
45
46
47
48
49
50
51
52
53
def load_msp(spectra_path):
    """loads spectra from an msp file

    ARGS:
        spectra_path (str): path to the spectra.msp file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """

    spectra = load_from_msp(spectra_path)

    return spectra

load_mzml

load_mzml(spectra_path)

loads spectra from a mzml file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.mzml file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
56
57
58
59
60
61
62
63
64
65
66
67
68
def load_mzml(spectra_path):
    """loads spectra from an mzml file

    ARGS:
        spectra_path (str): path to the spectra.mzml file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """

    spectra = load_from_mzml(spectra_path)

    return spectra

Generate Corpus

generate_corpus

combine_features

combine_features(dataset_frag, dataset_loss)

combines fragments and losses for a list of spectra

Parameters:

Name Type Description Default
dataset_frag list

list of lists where each list represents fragments from one spectrum

required
dataset_loss list

list of lists where each list represents the losses from one spectrum

required

Returns:

Name Type Description
frag_and_loss list

list of list where each list represents the fragments and losses from one spectrum

Source code in MS2LDA/Preprocessing/generate_corpus.py
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def combine_features(dataset_frag, dataset_loss):
    """combines fragments and losses for a list of spectra

    ARGS:
        dataset_frag (list): list of lists where each list represents fragments from one spectrum
        dataset_loss (list): list of lists where each list represents the losses from one spectrum

    RETURNS:
        frag_and_loss (list): list of lists where each inner list holds the fragments
            and losses of one spectrum
    """
    # pair the per-spectrum fragment and loss lists positionally and concatenate each pair
    return [frags + losses for frags, losses in zip(dataset_frag, dataset_loss)]

features_to_words

features_to_words(spectra, significant_figures=2, acquisition_type='DDA')

generates a list of lists for fragments and losses for a dataset

Parameters:

Name Type Description Default
spectra list

list of matchms.Spectrum.objects; they should be cleaned beforehand e.g. intensity normalization, add losses

required

Returns:

Name Type Description
dataset_frag list

is a list of lists where each list represents fragments from one spectrum

dataset_loss list

is a list of lists where each list represents the losses from one spectrum

Source code in MS2LDA/Preprocessing/generate_corpus.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def features_to_words(
    spectra, significant_figures=2, acquisition_type="DDA"
):  # You should write some unittests for this function; seems to be error prone
    """generates a list of lists for fragments and losses for a dataset

    ARGS:
        spectra (list): list of matchms.Spectrum.objects; they should be cleaned beforehand e.g. intensity normalization, add losses
        significant_figures (int): decimal places used when rounding m/z values into words
        acquisition_type (str): "DDA" additionally builds loss words; "DIA" builds fragment words only

    RETURNS:
        (list): per-spectrum word lists; fragments combined with losses when any
            losses were generated, otherwise fragments only

    RAISES:
        ValueError: if no words could be generated at all
    """
    dataset_frag = []
    dataset_loss = []

    for spectrum in spectra:
        # scale normalized intensities (assumed 0..1 after cleaning — TODO confirm)
        # to integer counts 0..100; intensity weighting is applied by repeating a
        # word once per count
        intensities_from_0_to_100 = (spectrum.peaks.intensities * 100).round()

        frag_with_n_digits = [
            ["frag@" + str(round(mz, significant_figures))] for mz in spectrum.peaks.mz
        ]  # round mz and add identifier -> frag@
        frag_multiplied_intensities = [
            frag * int(intensity)
            for frag, intensity in zip(frag_with_n_digits, intensities_from_0_to_100)
        ]  # weight fragments: each single-word list repeated int(intensity) times
        frag_flattend = list(chain(*frag_multiplied_intensities))  # flatten lists
        dataset_frag.append(frag_flattend)

        if acquisition_type == "DIA":
            continue

        elif acquisition_type == "DDA":

            loss_with_n_digits = [
                ["loss@" + str(round(mz, significant_figures))]
                for mz in spectrum.losses.mz
            ]  # round mz and add identifier -> loss@
            loss_multiplied_intensities = [
                loss * int(intensity)
                for loss, intensity in zip(
                    loss_with_n_digits, intensities_from_0_to_100
                )
            ]  # NOTE(review): losses are weighted with the PEAK intensity at the same
            # index; matchms typically stores losses sorted by loss m/z (reverse of
            # peak order), so this pairing may misalign — verify against add_losses
            loss_flattend = list(chain(*loss_multiplied_intensities))  # flatten lists
            loss_without_zeros = list(
                filter(lambda loss: float(loss[5:]) > 0.01, loss_flattend)
            )  # drop words whose loss m/z (the digits after "loss@") is <= 0.01
            dataset_loss.append(loss_without_zeros)

    # DDA path: merge fragments and losses per spectrum; DIA path: fragments only
    if dataset_loss:
        return combine_features(dataset_frag, dataset_loss)
    elif dataset_frag and not dataset_loss:
        return dataset_frag
    else:
        raise ValueError("Something went wrong! No vocabulary generated!")

map_doc2spec

map_doc2spec(feature_words, spectra)

generates hash keys to find the original spectrum for a generated document

Parameters:

Name Type Description Default
feature_words
required
metadata
required

Returns:

Name Type Description
doc2spec_map
Source code in MS2LDA/Preprocessing/generate_corpus.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def map_doc2spec(feature_words, spectra):
    """generates hash keys to find the original spectrum for a generated document

    ARGS:
        feature_words (list): per-spectrum lists of word strings, aligned with spectra
        spectra (list): spectrum objects in the same order as feature_words

    RETURNS:
        doc2spec_map (dict): md5 hex digest of the concatenated words -> spectrum
    """
    doc2spec_map = {}
    for words, spectrum in zip(feature_words, spectra):
        # the concatenated word list acts as a stable fingerprint of the document
        joined = "".join(words)
        digest = hashlib.md5(joined.encode("utf-8")).hexdigest()
        doc2spec_map[digest] = spectrum
    return doc2spec_map