Source code for crowsetta.formats.seq.audseq

"""module for Audacity LabelTrack
in standard/default format exported to txt files
https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
"""
import pathlib
from typing import ClassVar, Optional

import attr
import numpy as np
import pandas as pd
import pandera
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike


class AudSeqSchema(pandera.SchemaModel):
    """Schema for the table stored in an Audacity LabelTrack txt file.

    A :class:`pandera.SchemaModel` that validates
    :class:`pandas.DataFrame` instances loaded from
    Audacity Labeltrack annotations
    exported to txt files in the standard format.

    The standard format is described here:
    https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
    """

    # Columns are declared in the order in which they are expected;
    # ``Config.ordered`` below makes that order part of the schema.
    start_time: Optional[Series[float]] = pandera.Field()
    end_time: Optional[Series[float]] = pandera.Field()
    # Labels are coerced to the pandas string dtype during validation.
    label: Series[pd.StringDtype] = pandera.Field(coerce=True)

    class Config:
        # Schema-wide options; see the pandera documentation
        # for the exact semantics of ``strict`` and ``ordered``.
        strict = True
        ordered = True
@crowsetta.interface.SeqLike.register
@attr.define
class AudSeq:
    """Class meant to represent
    Audacity Labeltrack annotations
    exported to txt files in the standard format [1]_.

    The txt file will have 3 tab-separated columns
    that represent the start time, end time, and labels
    of annotated regions.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'aud-seq'``.
    ext: str
        Extension of files in annotation format, ``'.txt'``.
    start_times : numpy.ndarray
        Vector of floating-point times at which segments begin,
        i.e. onsets. :meth:`to_seq` passes these on as ``onsets_s``.
    end_times : numpy.ndarray
        Vector of floating-point times at which segments end,
        i.e. offsets. :meth:`to_seq` passes these on as ``offsets_s``.
    labels : numpy.ndarray
        Vector of string labels for segments.
    annot_path : str, pathlib.Path
        Path to file from which annotations were loaded.
    notated_path : str, pathlib.Path
        Path to file that ``annot_path`` annotates.
        E.g., an audio file, or an array file
        that contains a spectrogram generated from audio.
        Optional, default is None.

    References
    ----------
    .. [1] https://manual.audacityteam.org/man/importing_and_exporting_labels.html#Standard_.28default.29_format
    """

    name: ClassVar[str] = "aud-seq"
    ext: ClassVar[str] = ".txt"

    # Arrays are compared with ``np.array_equal`` so that ``==`` between two
    # instances yields a single bool instead of an element-wise array.
    start_times: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    end_times: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    labels: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    annot_path: pathlib.Path
    notated_path: Optional[pathlib.Path] = attr.field(
        default=None, converter=attr.converters.optional(pathlib.Path)
    )

    @classmethod
    def from_file(
        cls,
        annot_path: PathLike,
        notated_path: Optional[PathLike] = None,
    ) -> "Self":  # noqa: F821
        """Load annotations from a file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to an annotation file,
            with '.txt' extension.
        notated_path : str, pathlib.Path
            Path to file that ``annot_path`` annotates.
            E.g., an audio file, or an array file
            that contains a spectrogram generated from audio.
            Optional, default is None.

        Returns
        -------
        audseq : AudSeq
            Instance holding the start times, end times, and labels
            read from ``annot_path``.

        Raises
        ------
        pandera.errors.SchemaError
            If the table read from ``annot_path`` does not pass
            validation against :class:`AudSeqSchema`.

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        """
        annot_path = pathlib.Path(annot_path)
        # Check the file extension against ``cls.ext`` before reading anything.
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)

        # The file has no header row; columns are named after loading
        # so they match the field names declared on AudSeqSchema.
        df = pd.read_csv(annot_path, sep="\t", header=None)
        df.columns = ["start_time", "end_time", "label"]
        df = AudSeqSchema.validate(df)

        return cls(
            start_times=df["start_time"].values,
            end_times=df["end_time"].values,
            labels=df["label"].values,
            annot_path=annot_path,
            notated_path=notated_path,
        )

    def to_seq(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Sequence:
        """Convert this annotation to a :class:`crowsetta.Sequence`.

        Parameters
        ----------
        round_times : bool
            If True, round ``onsets_s`` and ``offsets_s``.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        seq : crowsetta.Sequence

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        >>> seq = audseq.to_seq()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        if round_times:
            onsets_s = np.around(self.start_times, decimals=decimals)
            offsets_s = np.around(self.end_times, decimals=decimals)
        else:
            onsets_s = self.start_times
            offsets_s = self.end_times

        return crowsetta.Sequence.from_keyword(
            labels=self.labels, onsets_s=onsets_s, offsets_s=offsets_s
        )

    def to_annot(self, round_times: bool = True, decimals: int = 3) -> crowsetta.Annotation:
        """Convert this annotation to a :class:`crowsetta.Annotation`.

        Parameters
        ----------
        round_times : bool
            If True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        annot : crowsetta.Annotation

        Examples
        --------
        >>> example = crowsetta.data.get('aud-seq')
        >>> audseq = crowsetta.formats.seq.AudSeq.from_file(example.annot_path)
        >>> annot = audseq.to_annot()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        seq = self.to_seq(round_times, decimals)
        return crowsetta.Annotation(
            annot_path=self.annot_path, notated_path=self.notated_path, seq=seq
        )

    def to_file(self, annot_path: PathLike) -> None:
        """Save this 'aud-seq' annotation to a txt file
        in the standard/default Audacity LabelTrack format.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path with filename of txt file that should be saved.

        Raises
        ------
        ValueError
            If the table built from ``start_times``, ``end_times``,
            and ``labels`` does not pass validation
            against :class:`AudSeqSchema`.
        """
        # The dict's insertion order becomes the column order, which matters
        # because the file is written without a header row.
        df = pd.DataFrame(
            {
                "start_time": self.start_times,
                "end_time": self.end_times,
                "label": self.labels,
            }
        )
        try:
            df = AudSeqSchema.validate(df)
        except pandera.errors.SchemaError as e:
            raise ValueError(
                "Annotations produced an invalid dataframe, "
                f"cannot convert to Audacity LabelTrack txt file:\n{df}"
            ) from e
        df.to_csv(annot_path, sep="\t", header=False, index=False)