Source code for crowsetta.formats.seq.generic

"""
Generic sequence format,
meant to be an abstraction of
any sequence-like format.

Consists of :class:`crowsetta.Annotation`
instances, each with a :class:`crowsetta.Sequence`
made up of :class:`crowsetta.Segment`s.

Functions in this module
load the format from a csv file,
or write a csv file in the generic format.
Other formats that convert to
:class:`~crowsetta.Annotation`s
with :class:`~crowsetta.Sequence`s can be converted
to this format.
"""
import os
from collections import OrderedDict
from typing import ClassVar, List, Optional, Union

import attr
import pandas as pd
import pandera
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike

# Error message shared by the dataframe-level checks declared on
# ``GenericSeqSchema`` that validate which onset/offset columns are present.
ONSET_OFFSET_COLS_ERR = """For onset times and offset times,
all values must be specified in at least one unit:
seconds (float), or sample number (integer). All rows must be non-null for either
'onset_s' and 'offset_s' or 'onset_sample' and 'offset_sample'.
Both units can also be specified. Conversion between units is not validated.
"""



[docs]
class GenericSeqSchema(pandera.SchemaModel):
    """A :class:`pandera.SchemaModel` that validates
    :class:`pandas.DataFrame` instances
    loaded from a csv file in the ``'generic-seq'`` annotation
    format.

    Each row describes one segment. The onset/offset columns are
    individually optional, but the dataframe-level checks declared
    on this class constrain which combinations of them are accepted
    (see ``ONSET_OFFSET_COLS_ERR`` for the user-facing explanation).
    """

    # label of the segment; coerced to the pandas string dtype
    label: Series[pd.StringDtype] = pandera.Field(coerce=True)
    # onset / offset of the segment, as floats (seconds, per the error message above)
    onset_s: Optional[Series[float]] = pandera.Field()
    offset_s: Optional[Series[float]] = pandera.Field()
    # onset / offset of the segment, as integers (sample number, per the error message above)
    onset_sample: Optional[Series[int]] = pandera.Field()
    offset_sample: Optional[Series[int]] = pandera.Field()

    # ``notated_path`` of the Annotation the row came from;
    # ``annot2df`` writes the string "None" when that attribute is None
    notated_path: Series[str] = pandera.Field(coerce=True)
    # ``annot_path`` of the Annotation the row came from
    annot_path: Series[str] = pandera.Field(coerce=True)
    # index of the sequence within its annotation (``seq_num`` in ``annot2df``)
    sequence: Series[int] = pandera.Field()
    # index of the annotation (``annot_num`` in ``annot2df``);
    # ``csv2annot`` groups rows by this column to rebuild Annotations
    annotation: Series[int] = pandera.Field()


[docs]
    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def both_onset_s_and_offset_s_if_either(cls, df: pd.DataFrame) -> bool:
        """Validate that the ``'onset_s'`` and ``'offset_s'`` columns
        are either both present in ``df`` or both absent from it."""
        has_onset = "onset_s" in df
        has_offset = "offset_s" in df
        # the only invalid case is exactly one column of the pair being present
        return has_onset == has_offset



[docs]
    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def both_onset_sample_and_offset_sample_if_either(cls, df: pd.DataFrame) -> bool:
        """Validate that the ``'onset_sample'`` and ``'offset_sample'`` columns
        are either both present in ``df`` or both absent from it."""
        has_onset = "onset_sample" in df
        has_offset = "offset_sample" in df
        # the only invalid case is exactly one column of the pair being present
        return has_onset == has_offset



[docs]
    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def onset_offset_s_and_ind_are_not_both_missing(cls, df: pd.DataFrame) -> bool:
        """Check that at least one of the on/offset column pairs is present:
        either {'onset_s', 'offset_s'} or {'onset_sample', 'offset_sample'}.

        Returns
        -------
        bool
            True if at least one complete pair of columns is in ``df``,
            False otherwise. Always an explicit boolean, including for
            inputs where only half of a pair is present.
        """
        has_seconds_pair = "onset_s" in df and "offset_s" in df
        has_sample_pair = "onset_sample" in df and "offset_sample" in df
        # A half-present pair is not this check's concern: it passes here as
        # long as the other pair is complete.
        return has_seconds_pair or has_sample_pair



[docs]
    class Config:
        # Schema-wide options for this pandera model.
        # NOTE(review): per the pandera documentation, ``ordered`` makes
        # validation check column order and ``strict`` rejects columns that
        # are not declared on the model -- confirm against the pandera
        # version this project pins.
        ordered = True
        strict = True





[docs]
def annot2df(
    annot: Union[crowsetta.Annotation, List[crowsetta.Annotation]], abspath: bool = False, basename: bool = False
) -> pd.DataFrame:
    """Convert sequence-like :class:`crowsetta.Annotation`
    to a :class:`pandas.DataFrame` in the ``'generic-seq'`` format.

    Parameters
    ----------
    annot : crowsetta.Annotation, or list of Annotations
    abspath : bool
        If True, converts ``annot_path`` and ``notated_path`` of each
        annotation into absolute paths. Default is False.
    basename : bool
        If True, discard any information about path and just use file name.
        Default is False.

    Returns
    -------
    df : pandas.DataFrame
        One row per segment, validated with :class:`GenericSeqSchema`.

    Raises
    ------
    TypeError
        If ``annot`` is not an Annotation or a list of Annotations,
        or if the ``seq`` attribute of an annotation is neither a
        :class:`crowsetta.Sequence` nor a list.
    ValueError
        If both ``abspath`` and ``basename`` are True.

    Notes
    -----
    The abspath and basename parameters specify how file names are saved.
    These options are useful when working with multiple copies of files, and for
    reproducibility (so you know which copy of a file you were working with).
    Default for both is False, in which case the paths are saved just as they
    are stored on the :class:`crowsetta.Annotation`.
    """
    if not isinstance(annot, (crowsetta.Annotation, list)):
        raise TypeError(f"annot must be Annotation or list of Annotations, not type {type(annot)}")

    if isinstance(annot, crowsetta.Annotation):
        # put in a list so we can iterate over it
        annot = [annot]

    if not all(isinstance(annot_, crowsetta.Annotation) for annot_ in annot):
        raise TypeError("not all objects in annot are of type Annotation")

    if abspath and basename:
        raise ValueError(
            "abspath and basename arguments cannot both be set to True, "
            "unclear whether absolute path should be saved or if no path "
            "information (just base filename) should be saved."
        )

    records = []
    for annot_num, annot_ in enumerate(annot):
        if isinstance(annot_.seq, crowsetta.Sequence):
            seq_list = [annot_.seq]
        elif isinstance(annot_.seq, list):
            seq_list = annot_.seq
        else:
            # without this branch, ``seq_list`` would be unbound on the first
            # iteration, or silently reused from the previous annotation
            raise TypeError(
                f"'seq' attribute of annotation #{annot_num} must be a Sequence "
                f"or a list of Sequences, not type {type(annot_.seq)}"
            )

        # paths are the same for every segment of an annotation, so compute them once
        annot_path = annot_.annot_path
        notated_path = annot_.notated_path
        if abspath:
            annot_path = os.path.abspath(annot_path)
            if notated_path is not None:
                notated_path = os.path.abspath(notated_path)
        elif basename:
            annot_path = os.path.basename(annot_path)
            if notated_path is not None:
                notated_path = os.path.basename(notated_path)
        if notated_path is None:
            # the schema requires a string in this column
            notated_path = "None"

        for seq_num, seq in enumerate(seq_list):
            for segment in seq.segments:
                row = OrderedDict(
                    (key, val)
                    for key, val in segment.asdict().items()
                    # don't keep onset or offset if they are None
                    # but keep any other Nones, so those other Nones will raise expected errors downstream
                    if not (val is None and key.startswith(("onset", "offset")))
                )
                # column order matters: notated_path goes in before annot_path
                row["notated_path"] = notated_path
                row["annot_path"] = annot_path
                # we use 'sequence' and 'annotation' fields when we are
                # loading back into Annotations
                row["sequence"] = seq_num
                row["annotation"] = annot_num
                records.append(row)

    df = pd.DataFrame.from_records(records)
    df = GenericSeqSchema.validate(df)
    return df




[docs]
def annot2csv(
    annot: Union[crowsetta.Annotation, List[crowsetta.Annotation]],
    csv_path: PathLike,
    abspath: bool = False,
    basename: bool = False,
) -> None:
    """Save sequence-like :class:`crowsetta.Annotation` instances
    as a csv file in the ``'generic-seq'`` format.

    The annotations are first converted with :func:`annot2df`,
    and the resulting dataframe is written without its index.

    Parameters
    ----------
    annot : crowsetta.Annotation, or list of Annotations
    csv_path : str, pathlib.Path
        Path, including filename, of the csv file to write.
        The file is created, or overwritten if it already exists.
    abspath : bool
        If True, paths are converted to absolute paths before saving.
        Default is False.
    basename : bool
        If True, only the file name of each path is saved.
        Default is False.

    Notes
    -----
    ``abspath`` and ``basename`` control how paths are saved,
    which helps with reproducibility when several copies of the same
    files exist. With both left at False, paths are saved as they
    are stored on the annotations.
    """
    annot2df(annot, abspath, basename).to_csv(csv_path, index=False)




[docs]
def csv2annot(csv_path: PathLike) -> List[crowsetta.Annotation]:
    """Loads a comma-separated values (csv) file containing annotations
    for song files, returns contents as a
    :class:`list` of :class:`crowsetta.Annotation` instances.

    Parameters
    ----------
    csv_path : str, pathlib.Path
        Path to csv file containing annotations
        saved in the ``'generic-seq'`` format.

    Returns
    -------
    annot_list : list
        A :class:`list` of :class:`crowsetta.Annotation` instances,
        one for each unique value in the ``'annotation'`` column.

    Raises
    ------
    ValueError
        If the rows of one annotation have more than one value for
        ``'annot_path'`` or ``'notated_path'``, or more than one
        value for ``'sequence'`` (multiple sequences per annotation
        are not implemented).
    """
    df = pd.read_csv(csv_path)
    df = GenericSeqSchema.validate(df)

    # Every per-annotation slice has the same columns as ``df``, so test for
    # them once. Each column is tested explicitly: the expression
    # ``"a" and "b" in df`` only checks for "b", since "a" is a truthy literal.
    has_seconds = "onset_s" in df and "offset_s" in df
    has_samples = "onset_sample" in df and "offset_sample" in df

    annot_list = []
    # tried doing this various ways with `pandas.DataFrame.groupby('annotation')`
    # but they are all less readable +
    # required more work to convert -> `crowsetta.Annotation` instances
    for annotation_ind in df.annotation.unique():
        df_annot = df[df.annotation == annotation_ind]
        # ---- get what we need to build an Annotation instance
        # 1. annot_path
        annot_path = df_annot.annot_path.unique()
        if len(annot_path) > 1:
            raise ValueError(
                f"found multiple values for 'annot_path' for annotation #{annotation_ind}:" f"\n{annot_path}"
            )
        annot_path = annot_path[0]
        # 2. notated_path
        # NOTE(review): ``annot2df`` writes the string "None" when an annotation
        # has no notated_path; that string is passed through unchanged here --
        # confirm whether it should be converted back to ``None``.
        notated_path = df_annot.notated_path.unique()
        if len(notated_path) > 1:
            raise ValueError(
                f"found multiple values for 'notated_path' for annotation #{annotation_ind}:" f"\n{notated_path}"
            )
        notated_path = notated_path[0]
        # 3. Sequence
        # ``df_annot`` is selected by a value taken from the column itself,
        # so it always has at least one row and hence at least one sequence value
        if len(df_annot.sequence.unique()) > 1:
            raise ValueError("Multiple sequences per annotation are not implemented")
        labels = df_annot.label.values
        if has_seconds:
            onsets_s = df_annot.onset_s.values
            offsets_s = df_annot.offset_s.values
        else:
            onsets_s = None
            offsets_s = None
        if has_samples:
            onsets_inds = df_annot.onset_sample.values
            offsets_inds = df_annot.offset_sample.values
        else:
            onsets_inds = None
            offsets_inds = None
        seq = crowsetta.Sequence.from_keyword(
            labels=labels,
            onsets_s=onsets_s,
            offsets_s=offsets_s,
            onset_samples=onsets_inds,
            offset_samples=offsets_inds,
        )
        annot = crowsetta.Annotation(annot_path=annot_path, notated_path=notated_path, seq=seq)
        annot_list.append(annot)

    return annot_list




[docs]
@crowsetta.interface.SeqLike.register
@attr.define
class GenericSeq:
    """Class that represents annotations from a generic format,
    meant to be an abstraction of
    any sequence-like format.

    Consists of :class:`crowsetta.Annotation`s,
    each with a :class:`crowsetta.Sequence` made up
    of :class:`crowsetta.Segment`s.

    Other formats that convert to :class:`~crowsetta.Annotation`s
    with :class:`~crowsetta.Sequence`s can be converted
    to this format.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'generic-seq'``
    ext: str
        Extension of files in annotation format: ``'.csv'``
    annots : list
        A :class:`list` of :class:`crowsetta.Annotation` instances.
    """

    # class-level constants describing the format
    # NOTE(review): presumably attrs skips ``ClassVar``-annotated names when
    # generating ``__init__``, leaving ``annots`` as the only field -- confirm
    name: ClassVar[str] = "generic-seq"
    ext: ClassVar[str] = ".csv"

    # the annotations held by an instance; passed by keyword in ``from_file``
    annots: List[crowsetta.Annotation]


[docs]
    @classmethod
    def from_file(cls, annot_path: PathLike) -> "Self":  # noqa: F821
        """Build an instance from a csv file in the ``'generic-seq'`` format.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to csv file containing annotations
            saved in the ``'generic-seq'`` format.

        Returns
        -------
        generic : GenericSeq
            Instance whose ``annots`` attribute holds the annotations
            loaded from ``annot_path`` by :func:`csv2annot`.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        """
        return cls(annots=csv2annot(csv_path=annot_path))



[docs]
    def to_seq(self) -> List[crowsetta.Sequence]:
        """Collect the ``seq`` attribute of every annotation.

        Returns
        -------
        seqs : list
            A :class:`list` with the ``seq`` of each
            :class:`crowsetta.Annotation` in ``self.annots``,
            in the same order.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        >>> seqs = generic.to_seq()
        """
        return [annotation.seq for annotation in self.annots]



[docs]
    def to_annot(self) -> List[crowsetta.Annotation]:
        """Return the annotations held by this instance.

        Equivalent to reading the ``annots`` attribute directly:
        the very same :class:`list` object is returned, not a copy.
        The method exists so that this class conforms with the
        :class:`crowsetta.interface.seq.SeqLike` interface.

        Returns
        -------
        annots : list
            A :class:`list` of :class:`crowsetta.Annotation` instances.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        >>> annots = generic.to_annot()
        """
        return self.annots



[docs]
    def to_df(self, abspath: bool = False, basename: bool = False) -> pd.DataFrame:
        """Return these annotations as a :class:`pandas.DataFrame`
        in the ``'generic-seq'`` format, by calling :func:`annot2df`.

        Parameters
        ----------
        abspath : bool
            If True, paths are converted to absolute paths.
            Default is False.
        basename : bool
            If True, only the file name of each path is kept.
            Default is False.

        Returns
        -------
        df : pandas.DataFrame
            The dataframe returned by :func:`annot2df`.
        """
        return annot2df(self.annots, abspath=abspath, basename=basename)



[docs]
    def to_file(self, annot_path: PathLike, abspath: bool = False, basename: bool = False) -> None:
        """Save these annotations as a csv file
        in the ``'generic-seq'`` format, by calling :func:`annot2csv`.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path, including filename, of the csv file to write.
            The file is created, or overwritten if it already exists.
        abspath : bool
            If True, paths are converted to absolute paths before saving.
            Default is False.
        basename : bool
            If True, only the file name of each path is saved.
            Default is False.
        """
        annot2csv(annot=self.annots, csv_path=annot_path, abspath=abspath, basename=basename)