Source code for crowsetta.formats.seq.simple

"""Module with functions meant to handle
any simple sequence-like annotation format.

The annotations can be a csv or txt file;
the format should have 3 columns that represent
the onset and offset times in seconds
and the labels of the segments
in the annotated sequences.

The default is to assume
a comma-separated values file
with a header 'onset_s, offset_s, label',
but this can be modified
with keyword arguments.

This format also assumes that each annotation file
corresponds to one annotated source file,
i.e. a single audio or spectrogram file.
"""

import pathlib
from typing import ClassVar, Mapping, Optional

import attr
import numpy as np
import pandas as pd
import pandera.pandas
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class SimpleSeqSchema(pandera.pandas.DataFrameModel):
    """Schema for a :class:`pandas.DataFrame` in the 'simple-seq' format.

    This :class:`pandera.pandas.DataFrameModel` is used to validate
    a dataframe that was loaded from a csv or txt file.
    :meth:`SimpleSeq.from_file` reads the file and first brings
    the dataframe into this layout (e.g., by renaming columns);
    only then is the dataframe validated against this model.
    """

    # Segment onset times, in seconds.
    onset_s: Optional[Series[float]] = pandera.pandas.Field()
    # Segment offset times, in seconds.
    offset_s: Optional[Series[float]] = pandera.pandas.Field()
    # Segment labels; the field is declared with ``coerce=True``
    # and the pandas string dtype.
    label: Series[pd.StringDtype] = pandera.pandas.Field(coerce=True)

    class Config:
        """Schema-wide pandera options."""

        # NOTE(review): presumably ``ordered`` enforces the column order above
        # and ``strict`` rejects columns not declared here -- confirm in pandera docs.
        ordered = True
        strict = True
# Standardized column names of the 'simple-seq' format, in their expected order.
# ``SimpleSeq.from_file`` uses this tuple to check the values of ``columns_map``,
# to verify that a loaded dataframe has the required columns,
# and to select and reorder those columns before validation.
SIMPLESEQ_COLUMNS = ("onset_s", "offset_s", "label")
@crowsetta.interface.SeqLike.register
@attr.define
class SimpleSeq:
    """Class meant to represent any simple sequence-like annotation format.

    The annotations can be a csv or txt file;
    the format should have 3 columns that represent
    the onset and offset times in seconds
    and the labels of the segments
    in the annotated sequences.

    The default is to assume
    a comma-separated values file
    with a header 'onset_s, offset_s, label',
    but this can be modified
    with keyword arguments.

    This format also assumes that each annotation file
    corresponds to one annotated source file,
    i.e. a single audio or spectrogram file.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'simple-seq'``.
    ext: tuple of str
        Extensions of files in annotation format: ``('.csv', '.txt')``
    onsets_s : numpy.ndarray
        Vector of floats corresponding to beginning of segments,
        i.e. onsets, in seconds
    offsets_s : numpy.ndarray
        Vector of floats corresponding to ends of segments,
        i.e. offsets, in seconds
    labels : numpy.ndarray
        Vector of string labels for segments
    annot_path : str, pathlib.Path
        Path to file from which annotations were loaded.
    notated_path : str, pathlib.Path
        Path to file that ``annot_path`` annotates.
        E.g., an audio file, or an array file
        that contains a spectrogram generated from audio.
        Optional, default is None.
    """

    name: ClassVar[str] = "simple-seq"
    ext: ClassVar[tuple[str, ...]] = (".csv", ".txt")

    # Array-valued fields are compared with ``np.array_equal`` so that
    # ``==`` between two instances yields a single bool.
    onsets_s: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    offsets_s: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    labels: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    annot_path: pathlib.Path
    notated_path: Optional[pathlib.Path] = attr.field(
        default=None, converter=attr.converters.optional(pathlib.Path)
    )

    @classmethod
    def from_file(
        cls,
        annot_path: PathLike,
        notated_path: Optional[PathLike] = None,
        columns_map: Optional[Mapping] = None,
        read_csv_kwargs: Optional[Mapping] = None,
        default_label: str = "-",
    ) -> "Self":  # noqa: F821
        """Load annotations from a file in the 'simple-seq' format.

        The annotations can be a csv or txt file;
        the format should have 3 columns that represent
        the onset and offset times in seconds
        and the labels of the segments
        in the annotated sequences.

        The default is to assume
        a comma-separated values file
        with a header 'onset_s, offset_s, label',
        but this can be modified
        with keyword arguments.

        This format also assumes that each annotation file
        corresponds to one annotated source file,
        i.e. a single audio or spectrogram file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to an annotation file,
            with one of the extensions {'.csv', '.txt'}.
        notated_path : str, pathlib.Path
            Path to file that ``annot_path`` annotates.
            E.g., an audio file, or an array file
            that contains a spectrogram generated from audio.
            Optional, default is None.
        columns_map : dict
            Maps column names in header of ``annot_path``
            to the standardized names used by this format.
            E.g., ``{'begin_time': 'onset_s', 'end_time': 'offset_s', 'text': 'label'}``.
            Optional, default is None--assumes that
            columns have the standardized names.
        read_csv_kwargs : dict
            Keyword arguments passed to
            :func:`pandas.read_csv`. Default is None,
            in which case all defaults for
            :func:`pandas.read_csv` will be used.
        default_label : str
            Label assigned to every segment when the loaded file
            has no ``'label'`` column (after ``columns_map`` is applied).
            Default is ``'-'``.

        Returns
        -------
        simple : SimpleSeq
            Instance holding the onsets, offsets and labels loaded from
            ``annot_path``. If the file contains no rows, the instance has
            empty ``onsets_s``, ``offsets_s`` and ``labels``.

        Raises
        ------
        TypeError
            If ``columns_map`` is given but is not a ``dict``.
        ValueError
            If ``columns_map`` has non-string keys or values,
            if it maps to names other than ``'onset_s'``, ``'offset_s'``, ``'label'``,
            or if the required columns are missing after re-mapping.

        Examples
        --------
        >>> path = crowsetta.example('bl26lb16', return_path=True)
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        """
        annot_path = pathlib.Path(annot_path)
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)

        df = pd.read_csv(annot_path, **(read_csv_kwargs or {}))

        if len(df) == 0:
            # handle empty csv file, fixes https://github.com/vocalpy/crowsetta/issues/264
            return cls(
                onsets_s=[],
                offsets_s=[],
                labels=[],
                annot_path=annot_path,
                notated_path=notated_path,
            )

        if columns_map is not None:
            if not isinstance(columns_map, dict):
                # report the type of the argument actually passed in
                raise TypeError(
                    f"The `columns_map` argument must be a `dict` but type was: {type(columns_map)}"
                )
            if not all(isinstance(k, str) and isinstance(v, str) for k, v in columns_map.items()):
                raise ValueError(
                    "The `columns_map` argument must be a dict that maps string keys to string values, "
                    "but not all keys and values were strings."
                )
            invalid_values = [v for v in columns_map.values() if v not in SIMPLESEQ_COLUMNS]
            if invalid_values:
                raise ValueError(
                    f"The `columns_map` argument must map keys (column names in the csv) "
                    'to these values: ("onset_s", "offset_s", "label"). '
                    f"The following values are invalid: {invalid_values}"
                )
            # rename only the mapped columns; any other column keeps its name
            df.columns = [columns_map.get(column_name, column_name) for column_name in df.columns]

        if "label" not in df.columns:
            # files without labels get the same placeholder label on every row
            df["label"] = default_label

        if not all(col_name in df.columns for col_name in SIMPLESEQ_COLUMNS):
            raise ValueError(
                "Annotations loaded from path did not have expected column names. "
                f"Column names from loaded csv file were: {df.columns.to_list()}\n"
                f"Expected column names are: {SIMPLESEQ_COLUMNS}\n"
                f"Please either re-map the column names using the `columns_map` argument, "
                "or if needed modify the csv file directly.\n"
                "Note that you only need to remap the column names that correspond to onset "
                "times, offset times, and labels; other columns will be ignored."
            )

        df = df[list(SIMPLESEQ_COLUMNS)]  # drop extra columns and put in correct order
        df = SimpleSeqSchema.validate(df)

        return cls(
            onsets_s=df["onset_s"].values,
            offsets_s=df["offset_s"].values,
            labels=df["label"].values,
            annot_path=annot_path,
            notated_path=notated_path,
        )

    def to_seq(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Sequence:
        """Convert this annotation to a :class:`crowsetta.Sequence`.

        Parameters
        ----------
        round_times : bool
            If True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        seq : crowsetta.Sequence

        Examples
        --------
        >>> path = crowsetta.example('bl26lb16', return_path=True)
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        >>> seq = simple.to_seq()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        if round_times:
            onsets_s = np.around(self.onsets_s, decimals=decimals)
            offsets_s = np.around(self.offsets_s, decimals=decimals)
        else:
            onsets_s = self.onsets_s
            offsets_s = self.offsets_s

        return crowsetta.Sequence.from_keyword(labels=self.labels, onsets_s=onsets_s, offsets_s=offsets_s)

    def to_annot(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Annotation:
        """Convert this annotation to a :class:`crowsetta.Annotation`.

        Parameters
        ----------
        round_times : bool
            If True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        annot : crowsetta.Annotation

        Examples
        --------
        >>> path = crowsetta.example('bl26lb16', return_path=True)
        >>> simple = crowsetta.formats.seq.SimpleSeq.from_file(path,
        ...                                                    columns_map={'start_seconds': 'onset_s',
        ...                                                                 'stop_seconds': 'offset_s',
        ...                                                                 'name': 'label'},
        ...                                                    read_csv_kwargs={'index_col': 0})
        >>> annot = simple.to_annot()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        seq = self.to_seq(round_times, decimals)
        return crowsetta.Annotation(annot_path=self.annot_path, notated_path=self.notated_path, seq=seq)

    def to_file(self, annot_path: PathLike, to_csv_kwargs: Optional[Mapping] = None) -> None:
        """Save this 'simple-seq' annotation to a csv file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path with filename of csv file that should be saved
        to_csv_kwargs : dict-like
            Keyword arguments passed to :meth:`pandas.DataFrame.to_csv`.
            Default is None, in which case defaults for
            :meth:`pandas.DataFrame.to_csv` will be used,
            except ``index`` is set to False.
            If keyword arguments are given they are passed through as-is,
            so ``index=False`` must be included explicitly if it is wanted.

        Raises
        ------
        ValueError
            If the dataframe built from this annotation
            fails validation against :class:`SimpleSeqSchema`.
        """
        df = pd.DataFrame.from_records({"onset_s": self.onsets_s, "offset_s": self.offsets_s, "label": self.labels})
        df = df[list(SIMPLESEQ_COLUMNS)]  # put in correct order
        try:
            df = SimpleSeqSchema.validate(df)
        except pandera.errors.SchemaError as e:
            raise ValueError(f"Annotations produced an invalid dataframe, cannot convert to csv:\n{df}") from e

        if to_csv_kwargs:
            df.to_csv(annot_path, **to_csv_kwargs)
        else:
            df.to_csv(annot_path, index=False)