Source code for crowsetta.formats.seq.timit

"""Module with functions that handle phn annotation files
from the TIMIT[1]_ dataset.

.. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
   Web Download. Philadelphia: Linguistic Data Consortium, 1993.
"""
import pathlib
import warnings
from typing import ClassVar, Optional

import attr
import numpy as np
import pandas as pd
import pandera
import soundfile
from pandera.typing import Series

import crowsetta
from crowsetta.typing import PathLike



[docs]
class TimitTranscriptSchema(pandera.SchemaModel):
    """A :class:`pandera.SchemaModel` that validates a :class:`pandas.DataFrame`
    loaded from a phn or wrd file in the TIMIT[1]_ transcription format.

    The schema declares three columns: ``begin_sample``, ``end_sample``
    and ``text``. Each row of the dataframe corresponds to one annotated
    segment, i.e., one line of the transcription file.

    References
    ----------
    .. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
       Web Download. Philadelphia: Linguistic Data Consortium, 1993.
    """

    # Sample number where each segment begins / ends.
    # NOTE(review): in pandera's class-based API an ``Optional[...]`` annotation
    # marks a column as not required -- confirm that is intended here.
    begin_sample: Optional[Series[int]] = pandera.Field()
    end_sample: Optional[Series[int]] = pandera.Field()
    # Label for each segment; ``coerce=True`` asks pandera to cast the column
    # to the pandas string dtype during validation.
    text: Series[pd.StringDtype] = pandera.Field(coerce=True)

[docs]
    class Config:
        # Schema-wide pandera options.
        # ``ordered``: columns are validated in the order they are declared
        # on the schema (begin_sample, end_sample, text).
        ordered = True
        # ``strict``: columns that are not declared on the schema are rejected.
        strict = True





[docs]
@crowsetta.interface.SeqLike.register
@attr.define
class Timit:
    """Class that represents annotations from transcription files in the
    DARPA TIMIT Acoustic-Phonetic Continuous Speech Corpus[1]_.

    Attributes
    ----------
    name: str
        Shorthand name for annotation format: ``'timit'``.
    ext: tuple of str
        Extensions of files in annotation format:
        ``('.phn', '.PHN', '.wrd', '.WRD')``
    begin_samples : numpy.ndarray
        Vector of integer sample numbers corresponding
        to beginning of segments, i.e. onsets
    end_samples : numpy.ndarray
        Vector of integer sample numbers corresponding
        to ends of segments, i.e. offsets
    text : numpy.ndarray
        Vector of string labels for segments;
        each element is either a single word,
        or a single phonetic transcription code.
    annot_path : str, pathlib.Path
        Path to TIMIT transcription file from which annotations were loaded.
    audio_path : str, pathlib.Path
        Path to audio file that the TIMIT transcription file annotates.
        Optional, default is None.

    References
    ----------
    .. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
       Web Download. Philadelphia: Linguistic Data Consortium, 1993.
    """

    # Class-level constants describing the format; not instance fields.
    name: ClassVar[str] = "timit"
    # NOTE: annotated as ``ClassVar[str]`` but the value is a tuple of extensions.
    ext: ClassVar[str] = (".phn", ".PHN", ".wrd", ".WRD")

    # Array-valued fields are compared with ``np.array_equal`` so that ``==``
    # between two instances yields a single bool instead of an element-wise array.
    begin_samples: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    end_samples: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    text: np.ndarray = attr.field(eq=attr.cmp_using(eq=np.array_equal))
    annot_path: pathlib.Path
    # ``None`` is kept as-is; any other value is converted to ``pathlib.Path``.
    audio_path: Optional[pathlib.Path] = attr.field(default=None, converter=attr.converters.optional(pathlib.Path))


[docs]
    @classmethod
    def from_file(cls, annot_path: PathLike, audio_path: Optional[PathLike] = None) -> "Self":  # noqa: F821
        """Load annotations from a TIMIT[1]_ transcription file.

        Parameters
        ----------
        annot_path : str, pathlib.Path
            Path to a TIMIT transcription file,
            with one of the extensions {'.phn', '.PHN', '.wrd', '.WRD'}.
        audio_path : str, pathlib.Path
            Optional, defaults to ``annot_path`` with the extension
            changed to '.wav' or '.WAV'. Both extensions are checked
            and if either file exists, that one is used. Otherwise,
            defaults to '.wav' in lowercase.

        Returns
        -------
        timit : Timit
            Instance holding the begin samples, end samples and text
            loaded from ``annot_path``.

        Examples
        --------
        >>> example = crowsetta.data.get('timit')
        >>> timit = crowsetta.formats.seq.Timit.from_file(example.annot_path)

        Notes
        -----
        Versions of the dataset exist with the extensions
        in capital letters. Some platforms may not have case-sensitive paths.

        Labels are always read as literal text: a word such as ``null``,
        ``NA`` or ``nan`` is kept as a label instead of being parsed
        as a missing value. Only an empty field counts as missing.

        References
        ----------
        .. [1] Garofolo, John S., et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1.
           Web Download. Philadelphia: Linguistic Data Consortium, 1993.
        """
        annot_path = pathlib.Path(annot_path)
        # note multiple extensions, both all-uppercase and all-lowercase `.phn` exist,
        # depending on which version of TIMIT dataset you have
        crowsetta.validation.validate_ext(annot_path, extension=cls.ext)

        # Assume file is space-separated with no header.
        # pandas' default NA sentinels ("NA", "null", "nan", ...) are disabled
        # so that labels spelled like them are not turned into missing values,
        # which would then fail validation of the non-nullable ``text`` column.
        # An empty field is still treated as missing.
        df = pd.read_csv(annot_path, sep=" ", header=None, keep_default_na=False, na_values=[""])
        df.columns = ["begin_sample", "end_sample", "text"]
        df = TimitTranscriptSchema.validate(df)

        if audio_path is None:
            # Prefer whichever of '.wav' / '.WAV' exists next to the annotation file;
            # if neither exists, just default to lower-case '.wav'.
            candidates = [annot_path.parent / (annot_path.stem + ext) for ext in (".wav", ".WAV")]
            audio_path = next((path for path in candidates if path.exists()), candidates[0])

        return cls(
            annot_path=annot_path,
            begin_samples=df["begin_sample"].values,
            end_samples=df["end_sample"].values,
            text=df["text"].values,
            audio_path=audio_path,
        )



[docs]
    def to_seq(
        self, round_times: bool = True, decimals: int = 3, samplerate: Optional[int] = None
    ) -> crowsetta.Sequence:
        """Convert this TIMIT annotation to a :class:`crowsetta.Sequence`.

        Parameters
        ----------
        round_times : bool
            if True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.
        samplerate : int
            Sampling rate for wave files. Used to convert
            ``begin_samples`` and ``end_samples``
            from sample number to seconds.
            Default is None, in which case this function
            tries to open ``audio_path`` and determine
            the actual sampling rate. If this does not work
            (or ``audio_path`` is None), a ``UserWarning`` is issued
            and the ``onsets_s`` and ``offsets_s`` attributes
            of the :class:`crowsetta.Sequence` are left as None.

        Examples
        --------
        >>> example = crowsetta.data.get('timit')
        >>> timit = crowsetta.formats.seq.Timit.from_file(example.annot_path)
        >>> seq = timit.to_seq()

        Returns
        -------
        phn_seq : crowsetta.Sequence

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        onset_samples = self.begin_samples
        offset_samples = self.end_samples
        labels = self.text

        if samplerate is None and self.audio_path is not None:
            try:
                samplerate = soundfile.info(self.audio_path).samplerate
            except RuntimeError:
                # The audio file could not be opened or read;
                # handled by the ``samplerate is None`` branch below.
                samplerate = None

        if samplerate is None:
            warnings.warn(
                f"wav file not found: {self.audio_path}."
                f"Could not determine sampling rate to convert onsets and offsets to seconds. "
                f"To use a fixed sampling rate for all files, pass in a value for the `samplerate` "
                f"argument, but be aware that this may not be the correct sampling rate for some files.",
                UserWarning,
                stacklevel=2,
            )
            # Without a sampling rate the sample numbers cannot be converted to seconds,
            # so only the sample-based onsets and offsets are passed on.
            onsets_s = None
            offsets_s = None
        else:
            onsets_s = onset_samples / samplerate
            offsets_s = offset_samples / samplerate

            if round_times:
                onsets_s = np.around(onsets_s, decimals=decimals)
                offsets_s = np.around(offsets_s, decimals=decimals)

        phn_seq = crowsetta.Sequence.from_keyword(
            labels=labels,
            onset_samples=onset_samples,
            offset_samples=offset_samples,
            onsets_s=onsets_s,
            offsets_s=offsets_s,
        )
        return phn_seq



[docs]
    def to_annot(
        self, round_times: bool = True, decimals: int = 3, samplerate: Optional[int] = None
    ) -> crowsetta.Annotation:
        """Convert this TIMIT annotation to a :class:`crowsetta.Annotation`.

        The sequence is built by :meth:`to_seq`; all three arguments
        are forwarded to that method unchanged.

        Parameters
        ----------
        round_times : bool
            If True, round onsets_s and offsets_s.
            Default is True.
        decimals : int
            Number of decimals places to round floating point numbers to.
            Only meaningful if round_times is True.
            Default is 3, so that times are rounded to milliseconds.
        samplerate : int
            Sampling rate for wave files. Used to convert
            ``begin_samples`` and ``end_samples``
            from sample number to seconds.
            Default is None, in which case :meth:`to_seq`
            tries to open ``audio_path`` and determine
            the actual sampling rate.

        Examples
        --------
        >>> example = crowsetta.data.get('timit')
        >>> timit = crowsetta.formats.seq.Timit.from_file(example.annot_path)
        >>> annot = timit.to_annot()

        Returns
        -------
        annot : crowsetta.Annotation
            Annotation whose ``annot_path`` is this instance's ``annot_path``
            and whose ``notated_path`` is this instance's ``audio_path``.

        Notes
        -----
        The ``round_times`` and ``decimals`` arguments are provided
        to reduce differences across platforms
        due to floating point error, e.g. when loading annotation files
        and then sending them to a csv file,
        the result should be the same on Windows and Linux.
        """
        return crowsetta.Annotation(
            seq=self.to_seq(round_times, decimals, samplerate),
            annot_path=self.annot_path,
            notated_path=self.audio_path,
        )



[docs]
    def to_file(self, annot_path: PathLike) -> None:
        """Make a phn file in the TIMIT format
        from this instance.

        Writes one line per segment, in the form
        ``"<begin_sample> <end_sample> <text>"``.

        Parameters
        ----------
        annot_path : str, pathlib.Path
             Path including filename where file should be saved.
             Must have a valid extension for TIMIT transcription files,
             one of {'.phn', '.PHN', '.wrd', '.WRD'}.
        """
        # Accept plain strings as well as Path objects: ``.open`` below
        # only exists on ``pathlib.Path``.
        annot_path = pathlib.Path(annot_path)
        crowsetta.validation.validate_ext(annot_path, extension=self.ext)

        # ``tolist`` converts the sample numbers to built-in Python numbers
        # before they are formatted into the output lines.
        lines = [
            f"{begin_sample} {end_sample} {text}\n"
            for begin_sample, end_sample, text in zip(
                self.begin_samples.tolist(), self.end_samples.tolist(), self.text
            )
        ]

        with annot_path.open("w") as fp:
            fp.writelines(lines)