Source code for crowsetta.formats.seq.textgrid.parse

"""Functions for parsing TextGrid files.

Code for parsing TextGrids is adapted from several sources,
all under MIT license.
The main logic in
:func:`~crowsetta.formats.seq.textgrid.parse.parse_fp`
is from <https://github.com/dopefishh/pympi>
which is perhaps the most concise
Python code I have found for parsing TextGrids.
However, there are also good ideas in
https://github.com/kylebgorman/textgrid/blob/master/textgrid/textgrid.py
(__getitem__ method) and
https://github.com/timmahrt/praatIO
(data classes, handling encoding).
For some documentation of the binary format see
https://github.com/Legisign/Praat-textgrids
and for a citable library with docs see
https://github.com/hbuschme/TextGridTools
but note that both of these have a GPL license.
"""
from __future__ import annotations

import pathlib
import re
from typing import Final, TextIO

from .classes import Interval, IntervalTier, Point, PointTier

# Regex patterns applied to a single line of a TextGrid file.
# Each one is anchored at the end of the line (allowing trailing whitespace)
# and captures the value in group 1, so it works both when the line
# is just the value and when the value follows other text such as ``xmin = ``.

# A float, with optional sign and optional exponent, so that values
# such as ``-0.5`` or ``1.5e-05`` are captured whole instead of being
# truncated to their trailing run of digits.
FLOAT_PAT: Final = re.compile(
    r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\s*$", flags=re.UNICODE
)
# A run of digits, e.g. the number of tiers or the number of entries in a tier.
INT_PAT: Final = re.compile(r"([\d]+)\s*$", flags=re.UNICODE)
# A double-quoted string; the capture excludes the outer quotes.
STR_PAT: Final = re.compile(r'"(.*)"\s*$', flags=re.UNICODE)


def search_next_line(fp: TextIO, pat: re.Pattern) -> str:
    """Get next line from a text stream and search it for a regex pattern.

    This is a helper function used by
    :func:`~crowsetta.formats.seq.textgrid.parse.get_float_from_next_line`,
    :func:`~crowsetta.formats.seq.textgrid.parse.get_int_from_next_line`,
    and :func:`~crowsetta.formats.seq.textgrid.parse.get_str_from_next_line`.

    Parameters
    ----------
    fp : TextIO
        Python text stream from an open TextGrid file.
        Exactly one line is consumed from it per call.
    pat : re.Pattern
        A compiled regex pattern with at least one capturing group.

    Returns
    -------
    match : str
        The text captured by the first group of ``pat``.

    Raises
    ------
    ValueError
        If the line that was read does not match ``pat``.
        This includes the end of the stream,
        where ``readline`` returns the empty string.
    """
    line = fp.readline()
    match = pat.search(line)
    if match is None:
        # Fail with the offending line instead of an AttributeError on None,
        # so that a malformed or truncated file is easy to diagnose.
        raise ValueError(
            f"Could not parse line {line!r} of TextGrid: "
            f"no match for pattern {pat.pattern!r}"
        )
    return match.group(1)
def get_float_from_next_line(fp: TextIO) -> float:
    """Read the next line of a text stream and return the float found in it.

    The line is searched with ``FLOAT_PAT`` and the captured text
    is converted with :class:`float`.

    Helper function used by
    :func:`~crowsetta.formats.seq.textgrid.parse.parse_fp`,
    e.g., to parse the ``xmin`` and ``xmax`` times of a tier.

    Parameters
    ----------
    fp : TextIO
        Python text stream from an open TextGrid file.
        One line is consumed from it.

    Returns
    -------
    val : float
    """
    matched = search_next_line(fp, FLOAT_PAT)
    return float(matched)
def get_int_from_next_line(fp: TextIO) -> int:
    """Read the next line of a text stream and return the integer found in it.

    The line is searched with ``INT_PAT`` and the captured text
    is converted with :class:`int`.

    Helper function used by
    :func:`~crowsetta.formats.seq.textgrid.parse.parse_fp`,
    e.g., to parse the number of tiers,
    or the number of intervals in an interval tier.

    Parameters
    ----------
    fp : TextIO
        Python text stream from an open TextGrid file.
        One line is consumed from it.

    Returns
    -------
    val : int
    """
    matched = search_next_line(fp, INT_PAT)
    return int(matched)
def get_str_from_next_line(fp: TextIO) -> str:
    """Read the next line of a text stream and return the quoted string in it.

    The line is searched with ``STR_PAT``, which looks for
    a double-quoted string, and the text between the quotes is returned.

    Helper function used by
    :func:`~crowsetta.formats.seq.textgrid.parse.parse_fp`,
    e.g., to parse the type and name of a tier,
    the ``text`` of an ``Interval``, or the ``mark`` of a ``Point``.

    Parameters
    ----------
    fp : TextIO
        Python text stream from an open TextGrid file.
        One line is consumed from it.

    Returns
    -------
    val : str
    """
    # the captured group is already a str, so no conversion is required
    val = search_next_line(fp, STR_PAT)
    return val
# Tier-type strings as they are read from a TextGrid file.
# ``parse_fp`` compares the type string of each tier against these
# to decide whether to build an interval tier or a point tier.
INTERVAL_TIER: Final[str] = "IntervalTier"
POINT_TIER: Final[str] = "TextTier"
def parse_fp(fp: TextIO, keep_empty: bool = False) -> dict:
    """Parse a TextGrid file passed in as an open text stream,
    converting it to a :class:`dict`.

    Helper function called by
    :func:`~crowsetta.formats.seq.textgrid.parse.parse`.

    Both the "short" and the "full" text formats are handled.
    The format is detected from the line that follows
    the ``xmin`` and ``xmax`` of the TextGrid:
    if that line is just ``<exists>`` the format is "short",
    anything else is treated as "full".

    Parameters
    ----------
    fp : TextIO
        Python text stream from an open TextGrid file.
    keep_empty : bool
        If True, keep intervals in interval tiers
        that have empty labels (i.e., the empty string "").
        Default is False.

    Returns
    -------
    tg : dict
        A parsed TextGrid as a :class:`dict`,
        with keys ``'xmin'``, ``'xmax'``, and ``'tiers'``.
        The value for ``'tiers'`` is a list with one
        ``IntervalTier`` or ``PointTier`` per tier, in file order.

    Raises
    ------
    ValueError
        If the type string of a tier is neither
        ``INTERVAL_TIER`` nor ``POINT_TIER``.
    """
    # Skip the header lines and the empty line
    for _ in range(3):
        fp.readline()

    xmin_tg = get_float_from_next_line(fp)
    xmax_tg = get_float_from_next_line(fp)

    # We don't use the next line except to determine the format:
    # if it's just '<exists>' then format is "short", anything else is "full"
    is_short = fp.readline().strip() == "<exists>"
    n_tier = get_int_from_next_line(fp)
    if not is_short:
        fp.readline()  # skip item[]:

    tiers = []
    for _ in range(n_tier):
        if not is_short:
            fp.readline()  # skip item[\d]: (where \d is some number)
        tier_type = get_str_from_next_line(fp)
        if tier_type not in (INTERVAL_TIER, POINT_TIER):
            # Without this check an unknown type would either fail later with
            # an unbound local variable, or silently re-append the previous
            # tier, since neither branch below would assign a new one.
            raise ValueError(
                f"Unrecognized tier type {tier_type!r}; "
                f"expected {INTERVAL_TIER!r} or {POINT_TIER!r}"
            )
        is_interval_tier = tier_type == INTERVAL_TIER
        tier_name = get_str_from_next_line(fp)
        xmin_tier = get_float_from_next_line(fp)
        xmax_tier = get_float_from_next_line(fp)
        n_entries = get_int_from_next_line(fp)

        entries = []  # intervals or points depending on tier type
        for _ in range(n_entries):
            if not is_short:
                fp.readline()  # skip intervals [\d] / points [\d]
            if is_interval_tier:
                xmin = get_float_from_next_line(fp)
                xmax = get_float_from_next_line(fp)
                text = get_str_from_next_line(fp)
                # All three lines are always consumed before deciding
                # to drop the interval, so the stream stays in sync.
                if text == "" and not keep_empty:
                    continue
                entries.append(Interval(xmin=xmin, xmax=xmax, text=text))
            else:
                number = get_float_from_next_line(fp)
                mark = get_str_from_next_line(fp)
                entries.append(Point(number=number, mark=mark))

        if is_interval_tier:
            tier = IntervalTier(name=tier_name, xmin=xmin_tier, xmax=xmax_tier, intervals=entries)
        else:
            tier = PointTier(name=tier_name, xmin=xmin_tier, xmax=xmax_tier, points=entries)
        tiers.append(tier)

    return {
        "xmin": xmin_tg,
        "xmax": xmax_tg,
        "tiers": tiers,
    }
def parse(textgrid_path: str | pathlib.Path, keep_empty: bool = False) -> dict:
    """Parse a TextGrid file, loading it into a :class:`dict`.

    This function is used by
    :meth:`crowsetta.formats.seq.TextGrid.from_file`
    to load and parse the TextGrid file
    passed in as the ``annot_path`` argument.

    The file is first opened with the UTF-16 encoding.
    If that attempt raises a :class:`UnicodeError`,
    the file is re-opened and parsed from the start as UTF-8.

    Parameters
    ----------
    textgrid_path : str, pathlib.Path
        The path to a TextGrid file.
    keep_empty : bool
        If True, keep intervals in interval tiers
        that have empty labels (i.e., the empty string "").
        Default is False.

    Returns
    -------
    textgrid_raw : dict
        A dict with keys 'xmin', 'xmax', and 'tiers'.
    """
    path = pathlib.Path(textgrid_path)

    def _parse_with(encoding: str) -> dict:
        # open the file with the given encoding and hand the stream to parse_fp
        with path.open("r", encoding=encoding) as fp:
            return parse_fp(fp, keep_empty)

    try:
        return _parse_with("utf-16")
    except UnicodeError:
        return _parse_with("utf-8")