# Source code for aeneas.ttswrappers.festivalttswrapper

#!/usr/bin/env python
# coding=utf-8

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
#
# Copyright (C) 2012-2013, Alberto Pettarin (www.albertopettarin.it)
# Copyright (C) 2013-2015, ReadBeyond Srl   (www.readbeyond.it)
# Copyright (C) 2015-2017, Alberto Pettarin (www.albertopettarin.it)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
This module contains the following classes:

* :class:`~aeneas.ttswrappers.festivalttswrapper.FESTIVALTTSWrapper`,
  a wrapper for the ``Festival`` TTS engine.

Please refer to
http://www.cstr.ed.ac.uk/projects/festival/
for further details.
"""

from __future__ import absolute_import
from __future__ import print_function

from aeneas.exacttiming import TimeValue
from aeneas.language import Language
from aeneas.runtimeconfiguration import RuntimeConfiguration
from aeneas.ttswrappers.basettswrapper import BaseTTSWrapper
import aeneas.globalfunctions as gf


class FESTIVALTTSWrapper(BaseTTSWrapper):
    """
    A wrapper for the ``Festival`` TTS engine.

    This wrapper supports calling the TTS engine
    via ``subprocess`` or via Python C++ extension.

    .. warning::
        The C++ extension call is experimental and
        probably works only on Linux at the moment.

    In abstract terms, it performs one or more calls like ::

        $ echo text | text2wave -eval "(language_italian)" -o output_file.wav

    To use this TTS engine, specify ::

        "tts=festival"

    in the ``RuntimeConfiguration`` object.
    To execute from a non-default location: ::

        "tts=festival|tts_path=/path/to/text2wave"

    See :class:`~aeneas.ttswrappers.basettswrapper.BaseTTSWrapper`
    for the available functions.
    Below are listed the languages supported by this wrapper.

    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    """

    CES = Language.CES
    """ Czech """

    CYM = Language.CYM
    """ Welsh """

    ENG = Language.ENG
    """ English """

    FIN = Language.FIN
    """ Finnish """

    ITA = Language.ITA
    """ Italian """

    RUS = Language.RUS
    """ Russian """

    SPA = Language.SPA
    """ Spanish """

    ENG_GBR = "eng-GBR"
    """ English (GB) """

    ENG_SCT = "eng-SCT"
    """ English (Scotland) """

    ENG_USA = "eng-USA"
    """ English (USA) """

    # Identity mapping: for this wrapper every supported language code
    # is also its own voice code.
    LANGUAGE_TO_VOICE_CODE = {
        CES: CES,
        CYM: CYM,
        ENG: ENG,
        SPA: SPA,
        FIN: FIN,
        ITA: ITA,
        RUS: RUS,
        ENG_GBR: ENG_GBR,
        ENG_SCT: ENG_SCT,
        ENG_USA: ENG_USA,
    }

    # Language used by the C extension path when a fragment has no language.
    DEFAULT_LANGUAGE = ENG_USA

    # Human-readable name of each supported language code.
    CODE_TO_HUMAN = {
        CES: u"Czech",
        CYM: u"Welsh",
        ENG: u"English",
        FIN: u"Finnish",
        ITA: u"Italian",
        RUS: u"Russian",
        SPA: u"Spanish",
        ENG_GBR: u"English (GB)",
        ENG_SCT: u"English (Scotland)",
        ENG_USA: u"English (USA)",
    }

    # Sorted list of "code<TAB>human name" strings built from CODE_TO_HUMAN.
    CODE_TO_HUMAN_LIST = sorted([u"%s\t%s" % (k, v) for k, v in CODE_TO_HUMAN.items()])

    # Festival Scheme expression selecting the language for each voice code;
    # it is passed to ``text2wave`` after ``-eval`` and to the C extension.
    VOICE_CODE_TO_SUBPROCESS = {
        CES: u"(language_czech)",
        CYM: u"(language_welsh)",
        ENG: u"(language_english)",
        ENG_GBR: u"(language_british_english)",
        ENG_SCT: u"(language_scots_gaelic)",
        ENG_USA: u"(language_american_english)",
        SPA: u"(language_castillian_spanish)",
        FIN: u"(language_finnish)",
        ITA: u"(language_italian)",
        RUS: u"(language_russian)",
    }

    # Name of the Festival executable invoked when no ``tts_path`` is given.
    DEFAULT_TTS_PATH = "text2wave"

    # NOTE(review): presumably (codec, number of channels, sample rate in Hz)
    # as consumed by BaseTTSWrapper -- confirm against the base class.
    OUTPUT_AUDIO_FORMAT = ("pcm_s16le", 1, 16000)

    # This wrapper can be driven through a subprocess call...
    HAS_SUBPROCESS_CALL = True

    # ...and through the Python C/C++ extension named below.
    HAS_C_EXTENSION_CALL = True

    C_EXTENSION_NAME = "cfw"

    TAG = u"FESTIVALTTSWrapper"

    def __init__(self, rconf=None, logger=None):
        """
        Create the wrapper and register the ``text2wave`` command line.

        The ``CLI_PARAMETER_*`` placeholders are not defined in this class;
        presumably :class:`~aeneas.ttswrappers.basettswrapper.BaseTTSWrapper`
        replaces them with the actual values at call time.

        :param rconf: a runtime configuration
        :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
        :param logger: the logger object
        :type  logger: :class:`~aeneas.logger.Logger`
        """
        super(FESTIVALTTSWrapper, self).__init__(rconf=rconf, logger=logger)
        # template of: echo text | text2wave -eval "(language_xxx)" -o out.wav
        self.set_subprocess_arguments([
            self.tts_path,
            self.CLI_PARAMETER_VOICE_CODE_FUNCTION,
            u"-o",
            self.CLI_PARAMETER_WAVE_PATH,
            self.CLI_PARAMETER_TEXT_STDIN
        ])

    def _voice_code_to_subprocess(self, voice_code):
        """
        Return the ``text2wave`` arguments selecting the given voice.

        :param string voice_code: a key of ``VOICE_CODE_TO_SUBPROCESS``
        :rtype: list of strings
        :raises KeyError: if ``voice_code`` is not a supported voice code
        """
        return [u"-eval", self.VOICE_CODE_TO_SUBPROCESS[voice_code]]

    def _synthesize_multiple_c_extension(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, using the cfw extension.

        Return a tuple ``(success, data)``.
        On success, ``data`` is the tuple ``(anchors, total_time, num_chars)``,
        where each anchor is a list ``[begin_time, fragment_id, fragment_text]``.
        If importing or calling the extension raises an exception,
        the error is logged and ``(False, None)`` is returned.

        :param text_file: the object whose ``fragments`` are synthesized
        :param string output_file_path: the path of the audio file to be written
        :param quit_after: value forwarded to the extension as a float;
                           ``0.0`` is forwarded if it is ``None``
        :param bool backwards: if ``True``, ask the extension to synthesize
                               backwards and pair the returned intervals
                               with the fragments in reverse order
        :rtype: (bool, (list, :class:`~aeneas.exacttiming.TimeValue`, int))
        """
        self.log(u"Synthesizing using C extension...")

        # convert parameters from Python values to C values
        try:
            c_quit_after = float(quit_after)
        except TypeError:
            # quit_after is None (or not a number-like object)
            c_quit_after = 0.0
        c_backwards = 1 if backwards else 0
        self.log([u"output_file_path: %s", output_file_path])
        self.log([u"c_quit_after:     %.3f", c_quit_after])
        self.log([u"c_backwards:      %d", c_backwards])

        # one (festival language expression, text) pair per fragment
        self.log(u"Preparing u_text...")
        u_text = []
        fragments = text_file.fragments
        for fragment in fragments:
            f_lang = fragment.language
            f_text = fragment.filtered_text
            if f_lang is None:
                f_lang = self.DEFAULT_LANGUAGE
            f_voice_code = self.VOICE_CODE_TO_SUBPROCESS[self._language_to_voice_code(f_lang)]
            if f_text is None:
                f_text = u""
            u_text.append((f_voice_code, f_text))
        self.log(u"Preparing u_text... done")

        # call C extension
        sr = None
        sf = None
        intervals = None

        self.log(u"Preparing c_text...")
        # Python 2 => pass byte strings, Python 3 => pass Unicode strings
        to_c_string = gf.safe_bytes if gf.PY2 else gf.safe_unicode
        c_text = [(to_c_string(code), to_c_string(text)) for code, text in u_text]
        self.log(u"Preparing c_text... done")

        self.log(u"Calling aeneas.cfw directly")
        try:
            # imported here so that a missing extension is reported
            # as a failed synthesis instead of breaking the module import
            self.log(u"Importing aeneas.cfw...")
            import aeneas.cfw.cfw
            self.log(u"Importing aeneas.cfw... done")
            self.log(u"Calling aeneas.cfw...")
            sr, sf, intervals = aeneas.cfw.cfw.synthesize_multiple(
                output_file_path,
                c_quit_after,
                c_backwards,
                c_text
            )
            self.log(u"Calling aeneas.cfw... done")
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cfw", exc, False, None)
            return (False, None)

        # NOTE(review): sr is presumably the sample rate; sf is used below
        # as the number of fragments actually synthesized
        self.log([u"sr: %d", sr])
        self.log([u"sf: %d", sf])

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        if backwards:
            # intervals are paired with the fragments in reverse order
            fragments = fragments[::-1]
        for i in range(sf):
            # get the correct fragment and its (begin, end) interval
            fragment = fragments[i]
            begin, end = intervals[i][0], intervals[i][1]
            # store for later output
            anchors.append([
                TimeValue(begin),
                fragment.identifier,
                fragment.filtered_text
            ])
            # increase the character counter
            num_chars += fragment.characters
            # update current_time to the end of the last synthesized fragment
            current_time = TimeValue(end)

        # return output
        # NOTE anchors do not make sense if backwards == True
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Synthesizing using C extension... done")
        return (True, (anchors, current_time, num_chars))