# Source code for crowsetta.formats.seq.textgrid.textgrid

"""Module with functions for working with Praat TextGrid annotation files"""
from __future__ import annotations

import pathlib
import reprlib
from typing import ClassVar, Optional, Union

import attr
import numpy as np

import crowsetta
from crowsetta.typing import PathLike

from .classes import IntervalTier, PointTier
from .parse import parse



# [docs]
@crowsetta.interface.SeqLike.register
@attr.define
class TextGrid:
    """Class that represents annotations
    from TextGrid [1]_ files
    produced by the application Praat [2]_.

    This class can load TextGrid files
    saved by Praat as text files,
    in either the default format
    or the "short" format,
    as described in the specification [1]_.
    The class can load either UTF-8 or UTF-16 text files.
    It should detect both the encoding (UTF-8 or UTF-16)
    and the format (default or "short") automatically.

    The class does not currently parse binary TextGrid files
    (although there is an issue to add this,
    see https://github.com/vocalpy/crowsetta/issues/242).
    Please "thumbs up" that issue and comment
    if you would find this helpful.

    This class can parse both interval tiers
    and point tiers in TextGrid files,
    but when converting to a
    :class:`crowsetta.Annotation` it can only
    convert :class:`~crowsetta.formats.seq.textgrid.classes.IntervalTier`
    instances to :class:`crowsetta.Sequence` instances.
    See the :meth:`~crowsetta.formats.seq.textgrid.TextGrid.to_seq`
    method for details.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'textgrid'``.
    ext: str
        Extension of files in annotation format: ``'.TextGrid'``.
    xmin: float
        Start time in seconds of this TextGrid.
    xmax: float
        End time in seconds of this TextGrid.
    tiers: list
        The tiers in this TextGrid,
        a list of IntervalTier and/or PointTier instances.
    annot_path : str, pathlib.Path
        The path to the TextGrid file from which annotations were loaded.
    audio_path : str, pathlib.Path
        The path to the audio file that ``annot_path`` annotates.
        Optional, default is None.

    Examples
    --------
    Loading the example textgrid

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> print(textgrid)
    TextGrid(tiers=[PointTier(nam...ark='L+!H-')]), IntervalTier(...aleila\\-^')]), IntervalTier(...t='earlier')])], xmin=0.0, xmax=2.4360509767904546, annot_path=PosixPath('/home/pimienta/.local/share/crowsetta/5.0.0rc2/textgrid/AVO-maea-basic.TextGrid'), audio_path=None)  # noqa: E501

    Determining the number of tiers in the textgrid

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> len(textgrid)
    3

    Getting the names of the tiers in the textgrid

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> textgrid.tier_names
    ['Tones', 'Samoan', 'Gloss']

    Getting a tier from the TextGrid by name

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> textgrid['Gloss']
    IntervalTier(name='Gloss', xmin=0.0, xmax=2.4360509767904546, intervals=[Interval(xmin=0.0, xmax=0.051451575248407266, text='PRES'), Interval(xmin=0.051451575248407266, xmax=0.6407379583230295, text='Sione'), Interval(xmin=0.6407379583230295, xmax=0.7544662733943284, text='PAST'), Interval(xmin=0.7544662733943284, xmax=1.244041566788134, text='pull-ES'), Interval(xmin=1.244041566788134, xmax=1.3481058803597676, text='DET'), Interval(xmin=1.3481058803597676, xmax=1.70760078178904, text='rope'), Interval(xmin=1.70760078178904, xmax=2.4360509767904546, text='earlier')])  # noqa: E501

    Getting a tier from the TextGrid by index

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> textgrid[2]  # same tier we just got by name
    IntervalTier(name='Gloss', xmin=0.0, xmax=2.4360509767904546, intervals=[Interval(xmin=0.0, xmax=0.051451575248407266, text='PRES'), Interval(xmin=0.051451575248407266, xmax=0.6407379583230295, text='Sione'), Interval(xmin=0.6407379583230295, xmax=0.7544662733943284, text='PAST'), Interval(xmin=0.7544662733943284, xmax=1.244041566788134, text='pull-ES'), Interval(xmin=1.244041566788134, xmax=1.3481058803597676, text='DET'), Interval(xmin=1.3481058803597676, xmax=1.70760078178904, text='rope'), Interval(xmin=1.70760078178904, xmax=2.4360509767904546, text='earlier')])  # noqa: E501

    Calling the :meth:`~crowsetta.formats.seq.TextGrid.to_seq` method
    with no arguments will convert all interval tiers
    to :class:`~crowsetta.Sequence` instances,
    in the order they appear in the TextGrid.

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> textgrid.to_seq()
    [<Sequence with 7 segments>, <Sequence with 7 segments>]

    Call the :meth:`~crowsetta.formats.seq.TextGrid.to_seq` method
    with a ``tier`` argument to convert a specific
    :class:`~crowsetta.formats.seq.textgrid.classes.IntervalTier` to a
    single :class:`~crowsetta.Sequence` instance.

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> textgrid.to_seq(tier=2)
    <Sequence with 7 segments>

    When calling :meth:`~crowsetta.formats.seq.TextGrid.to_seq`
    you can specify the ``tier`` as an int,
    or the name of the tier as a string.
    I.e., this parameter works the same way
    as square bracket access to a TextGrid as shown above.

    >>> example = crowsetta.data.get('textgrid')
    >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
    >>> seq1 = textgrid.to_seq(tier=2)
    >>> seq2 = textgrid.to_seq(tier="Gloss")
    >>> seq1 == seq2
    True

    Notes
    -----
    Code for parsing TextGrids is adapted from several sources,
    all under MIT license.
    The main logic in
    :func:`~crowsetta.formats.seq.textgrid.parse.parse_fp`
    is from <https://github.com/dopefishh/pympi>
    which is perhaps the most concise
    Python code I have found for parsing TextGrids.
    However, there are also good ideas in
    https://github.com/kylebgorman/textgrid/blob/master/textgrid/textgrid.py
    (__getitem__ method for tier access) and
    https://github.com/timmahrt/praatIO
    (data classes, handling encoding).

    For some documentation of the binary format see
    https://github.com/Legisign/Praat-textgrids
    and for a citable library with docs see
    https://github.com/hbuschme/TextGridTools
    but note that both of these have a GPL license.

    References
    ----------

    .. [1] https://www.fon.hum.uva.nl/praat/manual/TextGrid_file_formats.html

    .. [2]  Boersma, Paul & Weenink, David (2023).
       Praat: doing phonetics by computer [Computer program].
       Version 6.3.09, retrieved 2 March 2023 from http://www.praat.org/
    """

    # Shorthand name for this annotation format; a ClassVar, so shared by all instances.
    name: ClassVar[str] = "textgrid"
    # File extension for this format; ``from_file`` passes it to
    # ``crowsetta.validation.validate_ext`` to check ``annot_path``.
    ext: ClassVar[str] = ".TextGrid"

    # Tiers of the TextGrid (IntervalTier and/or PointTier instances).
    # ``reprlib.repr`` is used as this field's repr function.
    tiers: list[Union[IntervalTier, PointTier]] = attr.field(repr=reprlib.repr)
    # Start time of the TextGrid (seconds, per the class docstring).
    xmin: float
    # End time of the TextGrid (seconds, per the class docstring).
    xmax: float
    # Path to the TextGrid file; no converter is attached to this field
    # (``from_file`` converts to ``pathlib.Path`` before constructing the instance).
    annot_path: pathlib.Path
    # Optional path to the annotated audio file; converted to ``pathlib.Path`` unless None.
    audio_path: Optional[pathlib.Path] = attr.field(default=None, converter=attr.converters.optional(pathlib.Path))


    # [docs]
    @classmethod
    def from_file(
        cls,
        annot_path: PathLike,
        audio_path: Optional[PathLike] = None,
        keep_empty: bool = False,
    ) -> "Self":  # noqa: F821
        """Build a :class:`TextGrid` by parsing a TextGrid file
        in the format used by Praat.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            The path to the TextGrid file to load annotations from.
            Its extension is validated against ``cls.ext``.
        audio_path : str, pathlib.Path
            The path to the audio file that ``annot_path`` annotates.
            Optional, default is None.
        keep_empty : bool
            If True, keep intervals in
            interval tiers that have empty labels
            (i.e., the empty string "").
            Passed through to the parser.
            Default is False.

        Returns
        -------
        textgrid : TextGrid
            Instance populated with the ``tiers``, ``xmin`` and ``xmax``
            values returned by the parser, plus the two paths.

        Examples
        --------
        >>> example = crowsetta.data.get('textgrid')
        >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
        >>> print(textgrid)
        TextGrid(tiers=[PointTier(nam...ark='L+!H-')]), IntervalTier(...aleila\\-^')]), IntervalTier(...t='earlier')])], xmin=0.0, xmax=2.4360509767904546, annot_path=PosixPath('/home/pimienta/.local/share/crowsetta/5.0.0rc2/textgrid/AVO-maea-basic.TextGrid'), audio_path=None)  # noqa: E501

        For usage, see the
        "Examples" section in :class:`crowsetta.formats.seq.textgrid.TextGrid`.

        See Also
        --------
        :class:`crowsetta.formats.seq.textgrid.TextGrid`
        """
        annot_path = pathlib.Path(annot_path)
        # Validate the file extension against ``cls.ext`` before parsing.
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)

        parsed = parse(annot_path, keep_empty)
        # Only these three entries of the parser's result are used.
        init_kwargs = {field: parsed[field] for field in ("tiers", "xmin", "xmax")}

        return cls(annot_path=annot_path, audio_path=audio_path, **init_kwargs)


    def __len__(self):
        return len(self.tiers)

    @property
    def tier_names(self):
        return list(tier.name for tier in self.tiers)

    def __getitem__(self, key: Union[str, int, slice]) -> Union[IntervalTier, PointTier]:
        if isinstance(key, str):
            matching_name_inds = [tier_ind for tier_ind, tier in enumerate(self.tiers) if tier.name == key]
            if len(matching_name_inds) > 1:
                raise ValueError(
                    f"Multiple tiers have the name '{key}', tiers are: {matching_name_inds}."
                    "Please access tiers with one of those integer indices, "
                    "or give the tiers unique names to be able to access with a string."
                )
            ind = matching_name_inds[0]
            return self.tiers[ind]

        elif isinstance(key, (int, slice)):
            return self.tiers[key]

        else:
            raise TypeError(f"Tiers must be accessed with a string key or an integer index, but got a {type(key)}.")

    @staticmethod
    def _interval_tier_to_seq(
        interval_tier: IntervalTier, round_times: bool = True, decimals: int = 3
    ) -> crowsetta.Sequence:
        """Build a ``crowsetta.Sequence`` from one IntervalTier.

        Helper used by ``to_seq``. Each interval's ``xmin`` becomes an onset,
        its ``xmax`` an offset, and its ``text`` a label.
        When ``round_times`` is True, onsets and offsets are rounded
        to ``decimals`` decimal places with ``np.around``.
        """
        intervals = interval_tier.intervals
        starts = np.array([interval.xmin for interval in intervals])
        stops = np.array([interval.xmax for interval in intervals])
        texts = np.array([interval.text for interval in intervals])

        if round_times:
            starts, stops = (np.around(times, decimals=decimals) for times in (starts, stops))

        return crowsetta.Sequence.from_keyword(labels=texts, onsets_s=starts, offsets_s=stops)


    # [docs]
    def to_seq(
        self, tier: int | str | None = None, round_times: bool = True, decimals: int = 3
    ) -> crowsetta.Sequence | list[crowsetta.Sequence]:
        """Convert an IntervalTier from this TextGrid annotation
        into a :class:`crowsetta.Sequence`.

        Currently, there is only support for converting a single IntervalTier
        to a single :class:`~crowsetta.Sequence`.

        Parameters
        ----------
        tier : int
            Index or string name of interval tier in TextGrid file
            from which annotations should be taken.
            Default is None, in which case all interval tiers
            are converted to :class:`crowsetta.Sequence`s.
        round_times : bool
            If True, round times of onsets and offsets.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        seq : crowsetta.Sequence

        Examples
        --------
        Calling the :meth:`~crowsetta.formats.seq.TextGrid.to_seq` method
        with no arguments will convert all interval tiers
        :class:`~crowsetta.Sequence` instances,
        in the order they appear in the TextGrid.

        >>> example = crowsetta.data.get('textgrid')
        >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
        >>> textgrid.to_seq()
        [<Sequence with 7 segments>, <Sequence with 7 segments>]

        Call the :meth:`~crowsetta.formats.seq.TextGrid.to_seq` method
        with a ``tier`` arguments to convert a specific
        :class:`~crowsetta.formats.seq.textgrid.classes.IntervalTier` to a
        single :class:`~crowsetta.Sequence`.

        >>> example = crowsetta.data.get('textgrid')
        >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
        >>> textgrid.to_seq(tier=2)
        [<Sequence with 7 segments>]

        When calling :meth:`~crowsetta.formats.seq.TextGrid.to_seq`
        you can specify the ``tier`` as an int,
        or the name of the tier as a string.
        I.e., this parameter works the same way
        as square bracket access to a TextGrid as shown above.

        >>> example = crowsetta.data.get('textgrid')
        >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
        >>> seq1 = textgrid.to_seq(tier=2)
        >>> seq2 = textgrid.to_seq(tier="Gloss")
        >>> seq1 == seq2
        True

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        if tier is not None:
            tier_ = self.__getitem__(tier)
            if not isinstance(tier_, IntervalTier):
                raise ValueError(
                    f"The specified tier ({tier}) is not an interval tier, type is {type(tier_)}."
                    f"Cannot convert to a crowsetta.Sequence"
                )
            return self._interval_tier_to_seq(tier_, round_times, decimals)

        seq = [
            self._interval_tier_to_seq(tier, round_times, decimals)
            for tier in self.tiers
            if isinstance(tier, IntervalTier)
        ]
        if len(seq) == 1:
            seq = seq[0]

        return seq



    # [docs]
    def to_annot(
        self, tier: int | str | None = None, round_times: bool = True, decimals: int = 3
    ) -> crowsetta.Annotation:
        """Wrap the result of :meth:`to_seq` in a :class:`crowsetta.Annotation`.

        The annotation's ``seq`` is whatever :meth:`to_seq` returns
        for the same arguments; ``annot_path`` and ``notated_path``
        are taken from this TextGrid's ``annot_path`` and ``audio_path``.

        Parameters
        ----------
        tier : int or str, optional
            Index or string name of interval tier in TextGrid file
            from which annotations should be taken.
            Default is None, in which case all interval tiers
            are converted to :class:`crowsetta.Sequence` instances.
        round_times : bool
            If True, round times of onsets and offsets.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.

        Returns
        -------
        annot : crowsetta.Annotation

        Examples
        --------
        >>> example = crowsetta.data.get('textgrid')
        >>> textgrid = crowsetta.formats.seq.TextGrid.from_file(example.annot_path)
        >>> annot = textgrid.to_annot()

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        return crowsetta.Annotation(
            seq=self.to_seq(tier=tier, round_times=round_times, decimals=decimals),
            annot_path=self.annot_path,
            notated_path=self.audio_path,
        )