Skip to content

Preprocessing Module

Load and Clean

load_and_clean

clean_spectra

clean_spectra(spectra, preprocessing_parameters={})

uses matchms to normalize intensities, add information and add losses to the spectra

Parameters:

Name Type Description Default
spectra generator

generator object of matchms.Spectrum.objects loaded via matchms in python

required
preprocessing_parameters dict

dictionary of filter settings (min_mz, max_mz, max_frags, min_frags, min_intensity, max_intensity); missing keys are filled with defaults

{}

Returns:

Name Type Description
cleaned_spectra list

list of matchms.Spectrum.objects; spectra that do not fit will be removed

Source code in MS2LDA/Preprocessing/load_and_clean.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def clean_spectra(spectra, preprocessing_parameters=None):
    """uses matchms to normalize intensities, add information and add losses to the spectra

    ARGS:
        spectra (generator): generator object of matchms.Spectrum.objects loaded via matchms in python
        preprocessing_parameters (dict, optional): filter settings; missing keys are
            filled in place with defaults: min_mz (0), max_mz (1000), max_frags (500),
            min_frags (3), min_intensity (0.001), max_intensity (1)

    RETURNS:
        cleaned_spectra (list): list of matchms.Spectrum.objects; spectra that do not fit will be removed
    """
    # A mutable default ({}) would be shared across calls and permanently polluted
    # by the setdefault() calls below, so default to None and build a fresh dict.
    if preprocessing_parameters is None:
        preprocessing_parameters = {}

    defaults = {
        "min_mz": 0,
        "max_mz": 1000,
        "max_frags": 500,
        "min_frags": 3,
        "min_intensity": 0.001,
        "max_intensity": 1,
    }
    for key, default in defaults.items():
        preprocessing_parameters.setdefault(key, default)

    cleaned_spectra = []
    count = 0

    for spectrum in spectra:
        # metadata filters
        spectrum = msfilters.default_filters(spectrum)
        spectrum = msfilters.add_retention_index(spectrum)
        spectrum = msfilters.add_retention_time(spectrum)
        # spectrum = msfilters.require_precursor_mz(spectrum) # do we need this

        # normalize and filter peaks
        spectrum = msfilters.normalize_intensities(spectrum)
        spectrum = msfilters.select_by_relative_intensity(
            spectrum,
            intensity_from=preprocessing_parameters["min_intensity"],
            intensity_to=preprocessing_parameters["max_intensity"],
        )
        spectrum = msfilters.select_by_mz(
            spectrum,
            mz_from=preprocessing_parameters["min_mz"],
            mz_to=preprocessing_parameters["max_mz"],
        )
        spectrum = msfilters.reduce_to_number_of_peaks(
            spectrum, n_max=preprocessing_parameters["max_frags"]
        )
        spectrum = msfilters.require_minimum_number_of_peaks(
            spectrum, n_required=preprocessing_parameters["min_frags"]
        )
        # spectrum = msfilters.add_losses(spectrum)

        # matchms filters return None when a spectrum fails a requirement;
        # only survivors are kept and re-indexed sequentially
        if spectrum:
            spectrum.set("id", f"spec_{count}")  # reindex
            cleaned_spectra.append(spectrum)
            count += 1

    return cleaned_spectra

load_mgf

load_mgf(spectra_path)

loads spectra from a mgf file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.mgf file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
11
12
13
14
15
16
17
18
19
20
21
22
23
def load_mgf(spectra_path):
    """Load spectra from an .mgf file via matchms.

    ARGS:
        spectra_path (str): path to the spectra.mgf file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """
    # thin wrapper around the matchms importer
    return load_from_mgf(spectra_path)

load_msp

load_msp(spectra_path)

loads spectra from an msp file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.msp file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
41
42
43
44
45
46
47
48
49
50
51
52
53
def load_msp(spectra_path):
    """loads spectra from an msp file

    ARGS:
        spectra_path (str): path to the spectra.msp file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """

    spectra = load_from_msp(spectra_path)

    return spectra

load_mzml

load_mzml(spectra_path)

loads spectra from a mzml file

Parameters:

Name Type Description Default
spectra_path str

path to the spectra.mzml file

required

Returns:

Name Type Description
spectra generator

matchms generator object with the loaded spectra

Source code in MS2LDA/Preprocessing/load_and_clean.py
56
57
58
59
60
61
62
63
64
65
66
67
68
def load_mzml(spectra_path):
    """loads spectra from an mzml file

    ARGS:
        spectra_path (str): path to the spectra.mzml file

    RETURNS:
        spectra (generator): matchms generator object with the loaded spectra
    """

    spectra = load_from_mzml(spectra_path)

    return spectra

Generate Corpus

generate_corpus

combine_features

combine_features(dataset_frag, dataset_loss)

combines fragments and losses for a list of spectra

Parameters:

Name Type Description Default
dataset_frag list

list of lists where each list represents fragments from one spectrum

required
dataset_loss list

list of lists where each list represents the losses from one spectrum

required

Returns:

Name Type Description
frag_and_loss list

list of list where each list represents the fragments and losses from one spectrum

Source code in MS2LDA/Preprocessing/generate_corpus.py
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def combine_features(dataset_frag, dataset_loss):
    """combines fragments and losses for a list of spectra

    ARGS:
        dataset_frag (list): list of lists where each list represents fragments from one spectrum
        dataset_loss (list): list of lists where each list represents the losses from one spectrum

    RETURNS:
        frag_and_loss (list): list of lists where each inner list holds the fragments
            and losses of one spectrum
    """
    # pair the per-spectrum fragment and loss lists positionally and concatenate each pair
    return [frags + losses for frags, losses in zip(dataset_frag, dataset_loss)]

features_to_words

features_to_words(spectra, significant_figures=2, acquisition_type='DDA')

generates a list of lists for fragments and losses for a dataset

Parameters:

Name Type Description Default
spectra list

list of matchms.Spectrum.objects; they should be cleaned beforehand e.g. intensity normalization, add losses

required

Returns:

Name Type Description
dataset_frag list

is a list of lists where each list represents fragments from one spectrum

dataset_loss list

is a list of lists where each list represents the losses from one spectrum

Source code in MS2LDA/Preprocessing/generate_corpus.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def features_to_words(
    spectra, significant_figures=2, acquisition_type="DDA"
):  # You should write some unittests for this function; seems to be error prone
    """generates a list of lists for fragments and losses for a dataset

    ARGS:
        spectra (list): list of matchms.Spectrum.objects; they should be cleaned beforehand e.g. intensity normalization, add losses
        significant_figures (int): decimal places used when rounding m/z values into words
        acquisition_type (str): "DDA" additionally builds loss words; "DIA" builds fragment words only

    RETURNS:
        (list): per-spectrum word lists; fragments combined with losses when any
            losses were generated, otherwise fragments only

    RAISES:
        ValueError: if no words could be generated at all
    """
    dataset_frag = []
    dataset_loss = []

    for spectrum in spectra:
        # scale normalized intensities (assumed 0..1 after cleaning — TODO confirm)
        # to integer counts 0..100; intensity weighting is applied by repeating a
        # word once per count
        intensities_from_0_to_100 = (spectrum.peaks.intensities * 100).round()

        frag_with_n_digits = [
            ["frag@" + str(round(mz, significant_figures))] for mz in spectrum.peaks.mz
        ]  # round mz and add identifier -> frag@
        frag_multiplied_intensities = [
            frag * int(intensity)
            for frag, intensity in zip(frag_with_n_digits, intensities_from_0_to_100)
        ]  # weight fragments: each single-word list repeated int(intensity) times
        frag_flattend = list(chain(*frag_multiplied_intensities))  # flatten lists
        dataset_frag.append(frag_flattend)

        if acquisition_type == "DIA":
            continue

        elif acquisition_type == "DDA":

            loss_with_n_digits = [
                ["loss@" + str(round(mz, significant_figures))]
                for mz in spectrum.losses.mz
            ]  # round mz and add identifier -> loss@
            loss_multiplied_intensities = [
                loss * int(intensity)
                for loss, intensity in zip(
                    loss_with_n_digits, intensities_from_0_to_100
                )
            ]  # NOTE(review): losses are weighted with the PEAK intensity at the same
            # index; matchms typically stores losses sorted by loss m/z (reverse of
            # peak order), so this pairing may misalign — verify against add_losses
            loss_flattend = list(chain(*loss_multiplied_intensities))  # flatten lists
            loss_without_zeros = list(
                filter(lambda loss: float(loss[5:]) > 0.01, loss_flattend)
            )  # drop words whose loss m/z (the digits after "loss@") is <= 0.01
            dataset_loss.append(loss_without_zeros)

    # DDA path: merge fragments and losses per spectrum; DIA path: fragments only
    if dataset_loss:
        return combine_features(dataset_frag, dataset_loss)
    elif dataset_frag and not dataset_loss:
        return dataset_frag
    else:
        raise ValueError("Something went wrong! No vocabulary generated!")

map_doc2spec

map_doc2spec(feature_words, spectra)

generates hash keys to find the original spectrum for a generated document

Parameters:

Name Type Description Default
feature_words
required
metadata
required

Returns:

Name Type Description
doc2spec_map
Source code in MS2LDA/Preprocessing/generate_corpus.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def map_doc2spec(feature_words, spectra):
    """generates hash keys to find the original spectrum for a generated document

    ARGS:
        feature_words (list): per-spectrum lists of word strings, aligned with spectra
        spectra (list): spectrum objects in the same order as feature_words

    RETURNS:
        doc2spec_map (dict): md5 hex digest of the concatenated words -> spectrum
    """
    doc2spec_map = {}
    for words, spectrum in zip(feature_words, spectra):
        # the concatenated word list acts as a stable fingerprint of the document
        joined = "".join(words)
        digest = hashlib.md5(joined.encode("utf-8")).hexdigest()
        doc2spec_map[digest] = spectrum
    return doc2spec_map