# Source code for crowsetta.formats.seq.audseq

"""module for Audacity LabelTrack
in standard/default format exported to txt files
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
"""
import pathlib
from typing import ClassVar, Optional

import attr
import numpy as np
import pandas as pd
import pandera
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike



class AudSeqSchema(pandera.SchemaModel):
    """A :class:`pandera.SchemaModel` that validates
    a :class:`pandas.DataFrame` loaded from
    Audacity LabelTrack annotations
    exported to txt files in the standard format.

    The schema declares three columns, in this order:
    ``start_time``, ``end_time``, and ``label``.

    The standard format is described here:
    https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
    """

    # ``Optional`` marks a column as one the schema does not require to be present;
    # when present, both time columns must already have a float dtype.
    start_time: Optional[Series[float]] = pandera.Field()
    end_time: Optional[Series[float]] = pandera.Field()
    # ``coerce=True`` casts whatever dtype was loaded to pandas' string dtype,
    # so e.g. purely numeric labels are still accepted.
    label: Series[pd.StringDtype] = pandera.Field(coerce=True)

    class Config:
        # Columns must appear in the order they are declared above.
        ordered = True
        # No columns other than the ones declared above are allowed.
        strict = True





@crowsetta.interface.SeqLike.register
@attr.define
class AudSeq:
    """Class meant to represent
    Audacity LabelTrack annotations
    exported to txt files in the standard format [1]_.

    The txt file will have 3 tab-separated columns
    that represent the start time, end time, and labels
    of annotated regions.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'aud-seq'``.
    ext: str
        Extension of files in annotation format, ``'.txt'``.
    start_times : numpy.ndarray
        Vector of floating point times corresponding
        to beginning of segments, i.e. onsets.
        Passed to :class:`crowsetta.Sequence` as ``onsets_s``.
    end_times : numpy.ndarray
        Vector of floating point times corresponding
        to ends of segments, i.e. offsets.
        Passed to :class:`crowsetta.Sequence` as ``offsets_s``.
    labels : numpy.ndarray
        Vector of string labels for segments;
        each element is the text of one labeled region.
    annot_path : str, pathlib.Path
        Path to file from which annotations were loaded.
        Converted to a :class:`pathlib.Path`.
    notated_path : str, pathlib.Path
        Path to file that ``annot_path`` annotates.
        E.g., an audio file, or an array file
        that contains a spectrogram generated from audio.
        Optional, default is None.

    References
    ----------
    .. [1] https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
    """

    name: ClassVar[str] = "aud-seq"
    ext: ClassVar[str] = ".txt"

    # Arrays need ``np.array_equal`` for ``==``; the default element-wise
    # comparison would not reduce to a single bool.
    start_times: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    end_times: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    labels: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    # Normalize to ``pathlib.Path`` so that direct construction with a ``str``
    # behaves the same as ``from_file``, and the same way ``notated_path`` does.
    annot_path: pathlib.Path = attr.field(converter=pathlib.Path)
    notated_path: Optional[pathlib.Path] = attr.field(default=None, converter=attr.converters.optional(pathlib.Path))

    @classmethod
    def from_file(
        cls,
        annot_path: PathLike,
        notated_path: Optional[PathLike] = None,
    ) -> "Self":  # noqa: F821
        """Load annotations from a file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to an annotation file,
            with '.txt' extension.
        notated_path : str, pathlib.Path
            Path to file that ``annot_path`` annotates.
            E.g., an audio file, or an array file
            that contains a spectrogram generated from audio.
            Optional, default is None.

        Returns
        -------
        audseq : AudSeq
            Instance holding the start times, end times,
            and labels read from ``annot_path``.

        Raises
        ------
        ValueError
            If the file does not have exactly three tab-separated columns.

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        """
        annot_path = pathlib.Path(annot_path)
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
        # By default pandas parses text such as "NA", "null" or "nan" as a
        # missing value, which would make perfectly good label text fail
        # validation. Only treat truly empty fields as missing.
        df = pd.read_csv(
            annot_path,
            sep="\t",
            header=None,
            keep_default_na=False,
            na_values=[""],
        )
        columns = ["start_time", "end_time", "label"]
        if df.shape[1] != len(columns):
            # Fail with a message about the file, instead of pandas'
            # "Length mismatch" error from assigning the column names.
            raise ValueError(
                f"Expected {len(columns)} tab-separated columns (start time, end time, label) "
                f"in Audacity LabelTrack txt file, but found {df.shape[1]}: {annot_path}"
            )
        df.columns = columns
        df = AudSeqSchema.validate(df)

        return cls(
            start_times=df["start_time"].values,
            end_times=df["end_time"].values,
            labels=df["label"].values,
            annot_path=annot_path,
            notated_path=notated_path,
        )

    def to_seq(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Sequence:
        """Convert this annotation to a :class:`crowsetta.Sequence`.

        Parameters
        ----------
        round_times : bool
            If True, round ``onsets_s`` and ``offsets_s``.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        seq : crowsetta.Sequence

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        >>> seq = audseq.to_seq()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        if round_times:
            onsets_s = np.around(self.start_times, decimals=decimals)
            offsets_s = np.around(self.end_times, decimals=decimals)
        else:
            onsets_s = self.start_times
            offsets_s = self.end_times

        return crowsetta.Sequence.from_keyword(labels=self.labels, onsets_s=onsets_s, offsets_s=offsets_s)

    def to_annot(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Annotation:
        """Convert this annotation to a :class:`crowsetta.Annotation`.

        Parameters
        ----------
        round_times : bool
            If True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        annot : crowsetta.Annotation

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        >>> annot = audseq.to_annot()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        seq = self.to_seq(round_times=round_times, decimals=decimals)
        return crowsetta.Annotation(annot_path=self.annot_path, notated_path=self.notated_path, seq=seq)

    def to_file(self, annot_path: PathLike) -> None:
        """Save this 'aud-seq' annotation to a txt file
        in the standard/default Audacity LabelTrack format.

        The file is written as tab-separated values,
        with no header row and no index column.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path with filename of txt file that should be saved.

        Raises
        ------
        ValueError
            If the start times, end times, and labels
            do not form a dataframe that passes :class:`AudSeqSchema`.
        """
        # A dict keeps insertion order, so the columns come out
        # in the order the schema requires.
        df = pd.DataFrame(
            {
                "start_time": self.start_times,
                "end_time": self.end_times,
                "label": self.labels,
            }
        )
        try:
            df = AudSeqSchema.validate(df)
        except pandera.errors.SchemaError as e:
            raise ValueError(
                "Annotations produced an invalid dataframe, "
                f"cannot convert to Audacity LabelTrack txt file:\n{df}"
            ) from e
        df.to_csv(annot_path, sep="\t", header=False, index=False)