Source code for crowsetta.formats.seq.simple

"""Module with functions meant to handle
any simple sequence-like annotation format.

The annotations can be a csv or txt file;
the format should have 3 columns that represent
the onset and offset times in seconds
and the labels of the segments
in the annotated sequences.

The default is to assume
a comma-separated values file
with a header 'onset_s, offset_s, label',
but this can be modified
with keyword arguments.

This format also assumes that each annotation file
corresponds to one annotated source file,
i.e. a single audio or spectrogram file.
"""
import pathlib
from typing import ClassVar, Mapping, Optional

import attr
import numpy as np
import pandas as pd
import pandera
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike



[docs]
class SimpleSeqSchema(pandera.SchemaModel):
    """A :class:`pandera.SchemaModel`
    that validates :class:`pandas.DataFrame` instances
    loaded from a csv or txt file in a 'simple-seq' format.

    :meth:`SimpleSeq.from_file` loads the :class:`pandas.DataFrame`
    and makes any changes needed to get it to this format
    before validation, e.g., changing column names.
    """

    # Onset time of each segment, in seconds.
    onset_s: Optional[Series[float]] = pandera.Field()
    # Offset time of each segment, in seconds.
    offset_s: Optional[Series[float]] = pandera.Field()
    # Label of each segment; ``coerce=True`` asks pandera to cast
    # the column to the pandas string dtype during validation.
    label: Series[pd.StringDtype] = pandera.Field(coerce=True)


[docs]
    class Config:
        # Schema-wide pandera options (see the pandera ``Config`` docs):
        # ``ordered`` -- columns must appear in the order declared on the schema.
        ordered = True
        # ``strict`` -- columns not declared on the schema fail validation.
        strict = True





[docs]
@crowsetta.interface.SeqLike.register
@attr.define
class SimpleSeq:
    """Class meant to represent any simple sequence-like annotation format.

    The annotations can be a csv or txt file;
    the format should have 3 columns that represent
    the onset and offset times in seconds
    and the labels of the segments
    in the annotated sequences.

    The default is to assume
    a comma-separated values file
    with a header 'onset_s, offset_s, label',
    but this can be modified
    with keyword arguments.

    This format also assumes that each annotation file
    corresponds to one annotated source file,
    i.e. a single audio or spectrogram file.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'simple-seq'``.
    ext: tuple of str
        Extensions of files in annotation format:
        ``('.csv', '.txt')``
    onsets_s : numpy.ndarray
        Vector of floats corresponding
        to beginning of segments, i.e. onsets, in seconds
    offsets_s : numpy.ndarray
        Vector of floats corresponding
        to ends of segments, i.e. offsets, in seconds
    labels : numpy.ndarray
        Vector of string labels for segments
    annot_path : str, pathlib.Path
        Path to file from which annotations were loaded.
    notated_path : str, pathlib.Path
        Path to file that ``annot_path`` annotates.
        E.g., an audio file, or an array file
        that contains a spectrogram generated from audio.
        Optional, default is None.
    """

    # Class-level constants describing the format; ``ClassVar`` keeps them
    # out of the attrs-generated ``__init__``.
    name: ClassVar[str] = "simple-seq"
    ext: ClassVar[tuple] = (".csv", ".txt")

    # Array-valued fields are compared with ``numpy.array_equal`` so that
    # ``==`` between two instances uses whole-array equality for them.
    onsets_s: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    offsets_s: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    labels: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    annot_path: pathlib.Path
    # ``None`` is kept as-is; any other value is converted to ``pathlib.Path``.
    notated_path: Optional[pathlib.Path] = attr.field(default=None, converter=attr.converters.optional(pathlib.Path))


[docs]
    @classmethod
    def from_file(
        cls,
        annot_path: PathLike,
        notated_path: Optional[PathLike] = None,
        columns_map: Optional[Mapping] = None,
        read_csv_kwargs: Optional[Mapping] = None,
    ) -> "Self":  # noqa: F821
        """Load annotations from a file
        in the 'simple-seq' format.

        The annotations can be a csv or txt file;
        the format should have 3 columns that represent
        the onset and offset times in seconds
        and the labels of the segments
        in the annotated sequences.

        The default is to assume
        a comma-separated values file
        with a header 'onset_s, offset_s, label',
        but this can be modified
        with keyword arguments.

        This format also assumes that each annotation file
        corresponds to one annotated source file,
        i.e. a single audio or spectrogram file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to an annotation file,
            with one of the extensions {'.csv', '.txt'}.
        notated_path : str, pathlib.Path
            Path to file that ``annot_path`` annotates.
            E.g., an audio file, or an array file
            that contains a spectrogram generated from audio.
            Optional, default is None.
        columns_map : dict-like
            Maps column names in header of ``annot_path``
            to the standardized names
            used by this format.
            E.g., ``{'begin_time': 'onset_s', 'end_time': 'offset_s', 'text': 'label'}``.
            Only the columns that need renaming have to be listed;
            columns that are not in the mapping keep their names.
            Optional, default is None--assumes that
            columns have the standardized names.
        read_csv_kwargs : dict
            Keyword arguments passed to
            :func:`pandas.read_csv`. Default is None,
            in which case all defaults for
            :func:`pandas.read_csv` will be used.

        Returns
        -------
        simple : SimpleSeq
            Instance holding the onsets, offsets and labels
            loaded from ``annot_path``.

        Raises
        ------
        KeyError
            If, after applying ``columns_map``, any of the columns
            ``'onset_s'``, ``'offset_s'`` or ``'label'`` is missing.
        pandera.errors.SchemaError
            If the loaded columns do not pass :class:`SimpleSeqSchema`.

        Examples
        --------
        >>> example = crowsetta.data.get('simple-seq')
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(example.annot_path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        """
        annot_path = pathlib.Path(annot_path)
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)

        # An absent or empty ``read_csv_kwargs`` means "use pandas defaults".
        df = pd.read_csv(annot_path, **(read_csv_kwargs or {}))

        if columns_map:
            # ``rename`` leaves columns that are absent from ``columns_map``
            # untouched, so a partial mapping (or a file with extra columns)
            # no longer fails with a ``KeyError`` during renaming.
            df = df.rename(columns=columns_map)
        # Keep only the three standardized columns, in the order the schema expects.
        df = df[["onset_s", "offset_s", "label"]]
        df = SimpleSeqSchema.validate(df)

        return cls(
            onsets_s=df["onset_s"].values,
            offsets_s=df["offset_s"].values,
            labels=df["label"].values,
            annot_path=annot_path,
            notated_path=notated_path,
        )



[docs]
    def to_seq(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Sequence:
        """Build a :class:`crowsetta.Sequence` from this annotation.

        Parameters
        ----------
        round_times : bool
            Whether to round ``onsets_s`` and ``offsets_s``
            before building the sequence. Default is True.
        decimals : int
            Number of decimal places to round to.
            Ignored when ``round_times`` is False.
            Default is 3, i.e. times are rounded to milliseconds.

        Returns
        -------
        seq : crowsetta.Sequence

        Examples
        --------
        >>> example = crowsetta.data.get('simple-seq')
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(example.annot_path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        >>> seq = simple.to_seq()

        Notes
        -----
        Rounding is offered to reduce platform-dependent differences
        caused by floating point error, so that, e.g., loading an
        annotation file and writing it to a csv file gives
        the same result on Windows and Linux.
        """
        # Start from the stored times; replace them with rounded copies on request.
        onsets_s, offsets_s = self.onsets_s, self.offsets_s
        if round_times:
            onsets_s = np.around(onsets_s, decimals=decimals)
            offsets_s = np.around(offsets_s, decimals=decimals)

        return crowsetta.Sequence.from_keyword(labels=self.labels, onsets_s=onsets_s, offsets_s=offsets_s)



[docs]
    def to_annot(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Annotation:
        """Build a :class:`crowsetta.Annotation` from this annotation.

        Parameters
        ----------
        round_times : bool
            Whether to round ``onsets_s`` and ``offsets_s``.
            Default is True.
        decimals : int
            Number of decimal places to round to.
            Ignored when ``round_times`` is False.
            Default is 3, i.e. times are rounded to milliseconds.

        Returns
        -------
        annot : crowsetta.Annotation

        Examples
        --------
        >>> example = crowsetta.data.get('simple-seq')
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(example.annot_path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        >>> annot = simple.to_annot()

        Notes
        -----
        Rounding is offered to reduce platform-dependent differences
        caused by floating point error, so that, e.g., loading an
        annotation file and writing it to a csv file gives
        the same result on Windows and Linux.
        """
        # The sequence is produced by ``to_seq``; the paths are passed through unchanged.
        return crowsetta.Annotation(
            seq=self.to_seq(round_times, decimals),
            annot_path=self.annot_path,
            notated_path=self.notated_path,
        )



[docs]
    def to_file(self, annot_path: PathLike, to_csv_kwargs: Optional[Mapping] = None) -> None:
        """Write this 'simple-seq' annotation to a csv file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path, including filename, of the csv file to write.
        to_csv_kwargs : dict-like
            Keyword arguments passed to
            :meth:`pandas.DataFrame.to_csv`.
            Default is None, in which case the file is written
            with ``index=False`` and otherwise default arguments.
            When keyword arguments are given they are passed
            as-is, without adding ``index=False``.

        Raises
        ------
        ValueError
            If the dataframe built from this annotation
            does not pass :class:`SimpleSeqSchema`.
        """
        column_order = ["onset_s", "offset_s", "label"]
        records = {"onset_s": self.onsets_s, "offset_s": self.offsets_s, "label": self.labels}
        df = pd.DataFrame.from_records(records)[column_order]  # put in correct order

        try:
            validated = SimpleSeqSchema.validate(df)
        except pandera.errors.SchemaError as e:
            raise ValueError(f"Annotations produced an invalid dataframe, cannot convert to csv:\n{df}") from e

        if not to_csv_kwargs:
            # No options supplied: leave the row index out of the file.
            validated.to_csv(annot_path, index=False)
            return
        validated.to_csv(annot_path, **to_csv_kwargs)