# Source code for crowsetta.formats.seq.generic

"""
Generic sequence format,
meant to be an abstraction of
any sequence-like format.

Consists of :class:`crowsetta.Annotation`
instances, each with a :class:`crowsetta.Sequence`
made up of :class:`crowsetta.Segment`s.

Functions in this module
load the format from a csv file,
or write a csv file in the generic format.
Other formats that convert to
:class:`~crowsetta.Annotation`s
with :class:`~crowsetta.Sequence`s can be converted
to this format.
"""
import os
from collections import OrderedDict
from typing import ClassVar, List, Optional, Union

import attr
import pandas as pd
import pandera
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike

# Error message shared by the dataframe-level checks in this module that
# validate which onset/offset columns are present (it is passed as ``error=``
# to each ``pandera.dataframe_check``).
ONSET_OFFSET_COLS_ERR = """For onset times and offset times,
all values must be specified in at least one unit:
seconds (float), or sample number (integer). All rows must be non-null for either
'onset_s' and 'offset_s' or 'onset_sample' and 'offset_sample'.
Both units can also be specified. Conversion between units is not validated.
"""


class GenericSeqSchema(pandera.SchemaModel):
    """A :class:`pandera.SchemaModel` that validates
    :class:`pandas.DataFrame` instances loaded from a csv file
    in the ``'generic-seq'`` annotation format.

    Onset and offset times may be given in seconds
    (``onset_s``, ``offset_s``), in sample numbers
    (``onset_sample``, ``offset_sample``), or both.
    The dataframe-level checks below require that each pair
    is either entirely present or entirely absent, and that
    at least one of the two pairs is present.
    """

    # segment label; coerced to pandas' string dtype
    label: Series[pd.StringDtype] = pandera.Field(coerce=True)
    # onset/offset columns are optional individually;
    # the dataframe checks below enforce the valid combinations
    onset_s: Optional[Series[float]] = pandera.Field()
    offset_s: Optional[Series[float]] = pandera.Field()
    onset_sample: Optional[Series[int]] = pandera.Field()
    offset_sample: Optional[Series[int]] = pandera.Field()
    # path columns; coerced to str
    notated_path: Series[str] = pandera.Field(coerce=True)
    annot_path: Series[str] = pandera.Field(coerce=True)
    # integer ids used to group rows back into sequences / annotations
    sequence: Series[int] = pandera.Field()
    annotation: Series[int] = pandera.Field()

    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def both_onset_s_and_offset_s_if_either(cls, df: pd.DataFrame) -> bool:
        """check that, if one of {'onset_s', 'offset_s'} column is present,
        then both are present"""
        # "both or neither" is the same as the two membership tests agreeing
        return ("onset_s" in df) == ("offset_s" in df)

    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def both_onset_sample_and_offset_sample_if_either(cls, df: pd.DataFrame) -> bool:
        """check that, if one of {'onset_sample', 'offset_sample'} column is present,
        then both are present"""
        # "both or neither" is the same as the two membership tests agreeing
        return ("onset_sample" in df) == ("offset_sample" in df)

    @pandera.dataframe_check(error=ONSET_OFFSET_COLS_ERR)
    def onset_offset_s_and_ind_are_not_both_missing(cls, df: pd.DataFrame) -> bool:
        """check that at least one of the on/offset column pairs is present:
        either {'onset_s', 'offset_s'} or {'onset_sample', 'offset_sample'}"""
        has_seconds_pair = "onset_s" in df and "offset_s" in df
        has_sample_pair = "onset_sample" in df and "offset_sample" in df
        # Always return an explicit bool. A half-present pair (e.g. only
        # 'onset_s') does not count as a pair here; it is reported by the
        # "both ... if either" checks above.
        return has_seconds_pair or has_sample_pair

    class Config:
        # pandera schema options: ``ordered`` validates the column order,
        # ``strict`` rejects columns that are not declared in the schema
        ordered = True
        strict = True
def annot2df(
    annot: Union[crowsetta.Annotation, List[crowsetta.Annotation]], abspath: bool = False, basename: bool = False
) -> pd.DataFrame:
    """Convert sequence-like :class:`crowsetta.Annotation`
    to a :class:`pandas.DataFrame` in the ``'generic-seq'`` format.

    Parameters
    ----------
    annot : crowsetta.Annotation, or list of Annotations
    abspath : bool
        If True, converts filename for each audio file into absolute path.
        Default is False.
    basename : bool
        If True, discard any information about path and just use file name.
        Default is False.

    Returns
    -------
    df : pandas.DataFrame
        One row per segment, with the segment's fields plus the columns
        ``'notated_path'``, ``'annot_path'``, ``'sequence'`` and
        ``'annotation'``. Validated with :class:`GenericSeqSchema`
        before it is returned.

    Raises
    ------
    TypeError
        If ``annot`` is not an Annotation or a list of Annotations,
        or if the ``seq`` attribute of an Annotation is neither
        a Sequence nor a list.
    ValueError
        If both ``abspath`` and ``basename`` are True.

    Notes
    -----
    The abspath and basename parameters specify how file names for audio files are saved.
    These options are useful when working with multiple copies of files, and for
    reproducibility (so you know which copy of a file you were working with).
    Default for both is False, in which case the filename is saved just as it is passed to
    this function in a :class:`crowsetta.Sequence` object.
    """
    if not isinstance(annot, (crowsetta.Annotation, list)):
        raise TypeError("annot must be Annotation or list of Annotations, " f"not type {type(annot)})")

    if isinstance(annot, crowsetta.Annotation):
        # put in a list so we can iterate over it
        annot = [annot]

    if not all(isinstance(annot_, crowsetta.Annotation) for annot_ in annot):
        raise TypeError("not all objects in annot are of type Annotation")

    if abspath and basename:
        raise ValueError(
            "abspath and basename arguments cannot both be set to True, "
            "unclear whether absolute path should be saved or if no path "
            "information (just base filename) should be saved."
        )

    records = []
    for annot_num, annot_ in enumerate(annot):
        if isinstance(annot_.seq, crowsetta.Sequence):
            seq_list = [annot_.seq]
        elif isinstance(annot_.seq, list):
            seq_list = annot_.seq
        else:
            # Without this branch ``seq_list`` would be unbound on the first
            # annotation, or silently reuse the previous annotation's
            # sequences on later ones.
            raise TypeError(
                "seq attribute of Annotation must be a Sequence or list of Sequences, "
                f"not type {type(annot_.seq)}"
            )

        for seq_num, seq in enumerate(seq_list):
            for segment in seq.segments:
                # don't keep onset or offset if they are None
                # but keep any other Nones, so those other Nones will raise expected errors downstream
                row = OrderedDict(
                    (key, val)
                    for key, val in segment.asdict().items()
                    if not (val is None and key.startswith(("onset", "offset")))
                )

                annot_path = annot_.annot_path
                notated_path = annot_.notated_path
                if abspath:
                    annot_path = os.path.abspath(annot_path)
                    if notated_path is not None:
                        notated_path = os.path.abspath(notated_path)
                elif basename:
                    annot_path = os.path.basename(annot_path)
                    if notated_path is not None:
                        notated_path = os.path.basename(notated_path)

                # need to put in notated_path before annot_path
                # (a missing notated_path is stored as the string "None")
                row["notated_path"] = notated_path if notated_path is not None else "None"
                row["annot_path"] = annot_path
                # we use 'sequence' and 'annotation' fields when we are
                # loading back into Annotations
                row["sequence"] = seq_num
                row["annotation"] = annot_num
                records.append(row)

    df = pd.DataFrame.from_records(records)
    df = GenericSeqSchema.validate(df)
    return df
def annot2csv(
    annot: Union[crowsetta.Annotation, List[crowsetta.Annotation]],
    csv_path: PathLike,
    abspath: bool = False,
    basename: bool = False,
) -> None:
    """Save sequence-like :class:`crowsetta.Annotation` instances
    as a csv file in the ``'generic-seq'`` format.

    The annotations are first converted to a dataframe with
    :func:`annot2df`, which is then written to ``csv_path``
    without the index column.

    Parameters
    ----------
    annot : crowsetta.Annotation, or list of Annotations
    csv_path : str, pathlib.Path
        Path including filename of csv file to write to,
        will be created (or overwritten if it exists already)
    abspath : bool
        If True, converts filename for each audio file into absolute path.
        Default is False.
    basename : bool
        If True, discard any information about path and just use file name.
        Default is False.

    Notes
    -----
    ``abspath`` and ``basename`` control how file names for audio files
    are saved. These options are useful when working with multiple copies
    of files, and for reproducibility (so you know which copy of a file
    you were working with). Both default to False, in which case the
    filename is saved just as it is passed to this function
    in a Sequence object.
    """
    annot2df(annot, abspath, basename).to_csv(csv_path, index=False)
def csv2annot(csv_path: PathLike) -> List[crowsetta.Annotation]:
    """Loads a comma-separated values (csv) file containing annotations
    for song files, returns contents as a
    :class:`list` of :class:`crowsetta.Annotation` instances.

    Parameters
    ----------
    csv_path : str, pathlib.Path
        Path to csv file containing annotations
        saved in the ``'generic-seq'`` format.

    Returns
    -------
    annot_list : list
        A :class:`list` of :class:`crowsetta.Annotation` instances.

    Raises
    ------
    ValueError
        If the rows of one annotation contain more than one distinct
        value for ``'annot_path'`` or ``'notated_path'``, or belong to
        more than one sequence (multiple sequences per annotation
        are not implemented).
    """
    df = pd.read_csv(csv_path)
    df = GenericSeqSchema.validate(df)

    annot_list = []
    # tried doing this various ways with `pandas.DataFrame.groupby('annotation')`
    # but they are all less readable +
    # required more work to convert -> `crowsetta.Annotation` instances
    for annotation_ind in df.annotation.unique():
        df_annot = df[df.annotation == annotation_ind]

        # ---- get what we need to build an Annotation instance
        # 1. annot_path
        annot_path = df_annot.annot_path.unique()
        if len(annot_path) > 1:
            raise ValueError(
                f"found multiple values for 'annot_path' for annotation #{annotation_ind}:" f"\n{annot_path}"
            )
        annot_path = annot_path[0]

        # 2. notated_path
        notated_path = df_annot.notated_path.unique()
        if len(notated_path) > 1:
            raise ValueError(
                f"found multiple values for 'notated_path' for annotation #{annotation_ind}:" f"\n{notated_path}"
            )
        notated_path = notated_path[0]

        # 3. Sequence
        seq_uniq = df_annot.sequence.unique()
        # internal invariant: df_annot was selected by an existing annotation id,
        # so it has at least one row and therefore at least one sequence id
        assert len(seq_uniq) > 0
        if len(seq_uniq) > 1:
            raise ValueError("Multiple sequences per annotation are not implemented")

        labels = df_annot.label.values

        # Test for BOTH column names explicitly: ``"a" and "b" in df`` only
        # checks "b", because the non-empty string "a" is always truthy.
        if "onset_s" in df_annot and "offset_s" in df_annot:
            onsets_s = df_annot.onset_s.values
            offsets_s = df_annot.offset_s.values
        else:
            onsets_s = None
            offsets_s = None

        if "onset_sample" in df_annot and "offset_sample" in df_annot:
            onsets_inds = df_annot.onset_sample.values
            offsets_inds = df_annot.offset_sample.values
        else:
            onsets_inds = None
            offsets_inds = None

        seq = crowsetta.Sequence.from_keyword(
            labels=labels,
            onsets_s=onsets_s,
            offsets_s=offsets_s,
            onset_samples=onsets_inds,
            offset_samples=offsets_inds,
        )
        annot = crowsetta.Annotation(annot_path=annot_path, notated_path=notated_path, seq=seq)
        annot_list.append(annot)

    return annot_list
@crowsetta.interface.SeqLike.register
@attr.define
class GenericSeq:
    """Annotations in a generic sequence format,
    intended as an abstraction of any sequence-like format.

    Holds a list of :class:`crowsetta.Annotation` instances,
    each with a :class:`crowsetta.Sequence`
    made up of :class:`crowsetta.Segment` instances.

    Any other format that converts to
    :class:`~crowsetta.Annotation` instances
    with :class:`~crowsetta.Sequence` instances
    can be converted to this format.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'generic-seq'``
    ext: str
        Extension of files in annotation format:
        ``'.csv'``
    annots : list
        A :class:`list` of :class:`crowsetta.Annotation` instances.
    """

    name: ClassVar[str] = "generic-seq"
    ext: ClassVar[str] = ".csv"

    annots: List[crowsetta.Annotation]

    @classmethod
    def from_file(cls, annot_path: PathLike) -> "Self":  # noqa: F821
        """Build an instance from a csv file of annotations
        in the ``'generic-seq'`` format.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to csv file containing annotations
            saved in the ``'generic-seq'`` format.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        """
        return cls(annots=csv2annot(csv_path=annot_path))

    def to_seq(self) -> List[crowsetta.Sequence]:
        """Collect the :class:`crowsetta.Sequence` of every annotation
        into a :class:`list`, one entry per annotation.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        >>> seqs = generic.to_seq()
        """
        seqs = []
        for annot in self.annots:
            seqs.append(annot.seq)
        return seqs

    def to_annot(self) -> List[crowsetta.Annotation]:
        """Return the :class:`list` of :class:`crowsetta.Annotation`
        instances held by this object.

        Equivalent to reading the ``annots`` attribute directly;
        the method exists so that this class conforms with the
        :class:`crowsetta.interface.seq.SeqLike` interface.

        Examples
        --------
        >>> example = crowsetta.data.get('generic-seq')
        >>> generic = crowsetta.formats.seq.GenericSeq.from_file(example.annot_path)
        >>> annots = generic.to_annot()
        """
        return self.annots

    def to_df(self, abspath: bool = False, basename: bool = False) -> pd.DataFrame:
        """Return these annotations as a :class:`pandas.DataFrame`,
        as produced by :func:`annot2df`.

        Parameters
        ----------
        abspath : bool
            If True, converts filename for each audio file into absolute path.
            Default is False.
        basename : bool
            If True, discard any information about path and just use file name.
            Default is False.
        """
        return annot2df(self.annots, abspath, basename)

    def to_file(self, annot_path: PathLike, abspath: bool = False, basename: bool = False) -> None:
        """Save these annotations to a csv file
        in the ``'generic-seq'`` format, via :func:`annot2csv`.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path including filename of csv file to write to,
            will be created (or overwritten if it exists already)
        abspath : bool
            If True, converts filename for each audio file into absolute path.
            Default is False.
        basename : bool
            If True, discard any information about path and just use file name.
            Default is False.
        """
        annot2csv(annot=self.annots, csv_path=annot_path, abspath=abspath, basename=basename)