Source code for aeneas.ttswrappers.nuancettswrapper

#!/usr/bin/env python
# coding=utf-8

# aeneas is a Python/C library and a set of tools
# to automagically synchronize audio and text (aka forced alignment)
#
# Copyright (C) 2012-2013, Alberto Pettarin (www.albertopettarin.it)
# Copyright (C) 2013-2015, ReadBeyond Srl   (www.readbeyond.it)
# Copyright (C) 2015-2017, Alberto Pettarin (www.albertopettarin.it)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
This module contains the following classes:

* :class:`~aeneas.ttswrappers.nuancettswrapper.NuanceTTSWrapper`,
  a wrapper for the Nuance TTS API engine.

Please refer to
https://developer.nuance.com/
for further details.

.. note:: This module requires Python module ``requests`` (``pip install requests``).

.. warning:: You will be billed according to your Nuance Developers account plan.

.. warning:: This module is experimental, use at your own risk.

.. versionadded:: 1.5.0
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy
import time
import uuid

from aeneas.audiofile import AudioFile
from aeneas.exacttiming import TimeValue
from aeneas.language import Language
from aeneas.runtimeconfiguration import RuntimeConfiguration
from aeneas.ttswrappers.basettswrapper import BaseTTSWrapper
import aeneas.globalfunctions as gf


class NuanceTTSWrapper(BaseTTSWrapper):
    """
    A wrapper for the Nuance Developers TTS API.

    This wrapper supports calling the TTS engine
    only via Python.

    In abstract terms, it performs one or more calls to the
    Nuance TTS API service, and concatenate the resulting WAVE files,
    returning their anchor times.

    To use this TTS engine, specify ::

        "tts=nuance|nuance_tts_api_id=...|nuance_tts_api_key=..."

    in the ``RuntimeConfiguration`` object,
    substituting your Nuance Developer API ID and Key.

    You might also want to enable the TTS caching,
    to reduce the number of API calls ::

        "tts=nuance|tts_cache=True"

    See :class:`~aeneas.ttswrappers.basettswrapper.BaseTTSWrapper`
    for the available functions.
    Below are listed the languages supported by this wrapper.

    :param rconf: a runtime configuration
    :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type logger: :class:`~aeneas.logger.Logger`
    """

    ARA = Language.ARA
    """ Arabic """

    CAT = Language.CAT
    """ Catalan """

    CES = Language.CES
    """ Czech """

    CMN = Language.CMN
    """ Mandarin Chinese """

    DAN = Language.DAN
    """ Danish """

    DEU = Language.DEU
    """ German """

    ELL = Language.ELL
    """ Greek (Modern) """

    ENG = Language.ENG
    """ English """

    EUS = Language.EUS
    """ Basque """

    FIN = Language.FIN
    """ Finnish """

    FRA = Language.FRA
    """ French """

    GLG = Language.GLG
    """ Galician """

    HEB = Language.HEB
    """ Hebrew """

    HIN = Language.HIN
    """ Hindi """

    HUN = Language.HUN
    """ Hungarian """

    IND = Language.IND
    """ Indonesian """

    ITA = Language.ITA
    """ Italian """

    JPN = Language.JPN
    """ Japanese """

    KOR = Language.KOR
    """ Korean """

    NLD = Language.NLD
    """ Dutch """

    NOR = Language.NOR
    """ Norwegian """

    POL = Language.POL
    """ Polish """

    POR = Language.POR
    """ Portuguese """

    RON = Language.RON
    """ Romanian """

    RUS = Language.RUS
    """ Russian """

    SLK = Language.SLK
    """ Slovak """

    SPA = Language.SPA
    """ Spanish """

    SWE = Language.SWE
    """ Swedish """

    THA = Language.THA
    """ Thai """

    TUR = Language.TUR
    """ Turkish """

    YUE = Language.YUE
    """ Yue Chinese """

    CMN_CHN = "cmn-CHN"
    """ Mandarin Chinese (China) """

    CMN_TWN = "cmn-TWN"
    """ Mandarin Chinese (Taiwan) """

    ENG_AUS = "eng-AUS"
    """ English (Australia) """

    ENG_GBR = "eng-GBR"
    """ English (GB) """

    ENG_IND = "eng-IND"
    """ English (India) """

    ENG_IRL = "eng-IRL"
    """ English (Ireland) """

    ENG_SCT = "eng-SCT"
    """ English (Scotland) """

    ENG_ZAF = "eng-ZAF"
    """ English (South Africa) """

    ENG_USA = "eng-USA"
    """ English (USA) """

    FRA_CAN = "fra-CAN"
    """ French (Canada) """

    NLD_BEL = "nld-BEL"
    """ Dutch (Belgium) """

    POR_BRA = "por-BRA"
    """ Portuguese (Brazil) """

    POR_PRT = "por-PRT"
    """ Portuguese (Portugal) """

    SPA_COL = "spa-COL"
    """ Spanish (Colombia) """

    SPA_ESP = "spa-ESP"
    """ Spanish (Spain) """

    SPA_MEX = "spa-MEX"
    """ Spanish (Mexico) """

    # Maps each supported language code to its human-readable name.
    CODE_TO_HUMAN = {
        ARA: u"Arabic",
        CAT: u"Catalan",
        CES: u"Czech",
        CMN: u"Mandarin Chinese",
        DAN: u"Danish",
        DEU: u"German",
        ELL: u"Greek (Modern)",
        ENG: u"English",
        EUS: u"Basque",
        FIN: u"Finnish",
        FRA: u"French",
        GLG: u"Galician",
        HEB: u"Hebrew",
        HIN: u"Hindi",
        HUN: u"Hungarian",
        IND: u"Indonesian",
        ITA: u"Italian",
        JPN: u"Japanese",
        KOR: u"Korean",
        NLD: u"Dutch",
        NOR: u"Norwegian",
        POL: u"Polish",
        POR: u"Portuguese",
        RON: u"Romanian",
        RUS: u"Russian",
        SLK: u"Slovak",
        SPA: u"Spanish",
        SWE: u"Swedish",
        THA: u"Thai",
        TUR: u"Turkish",
        YUE: u"Yue Chinese",
        CMN_CHN: u"Mandarin Chinese (China)",
        CMN_TWN: u"Mandarin Chinese (Taiwan)",
        ENG_AUS: u"English (Australia)",
        ENG_GBR: u"English (GB)",
        ENG_IND: u"English (India)",
        ENG_IRL: u"English (Ireland)",
        ENG_SCT: u"English (Scotland)",
        ENG_USA: u"English (USA)",
        ENG_ZAF: u"English (South Africa)",
        FRA_CAN: u"French (Canada)",
        NLD_BEL: u"Dutch (Belgium)",
        POR_BRA: u"Portuguese (Brazil)",
        POR_PRT: u"Portuguese (Portugal)",
        SPA_COL: u"Spanish (Colombia)",
        SPA_ESP: u"Spanish (Spain)",
        SPA_MEX: u"Spanish (Mexico)",
    }

    # Sorted "code<TAB>name" lines built from CODE_TO_HUMAN.
    CODE_TO_HUMAN_LIST = sorted([u"%s\t%s" % (k, v) for k, v in CODE_TO_HUMAN.items()])

    # Maps each language code to the voice name sent as the ``voice``
    # query parameter. The trailing comments list the gender of the
    # chosen voice and the alternative voices noted by the author.
    LANGUAGE_TO_VOICE_CODE = {
        ARA: "Laila",           # F, M: Maged, Tarik
        CAT: "Montserrat",      # F, M: Jordi
        CES: "Iveta",           # F, F: Zuzana
        CMN: "Tian-Tian",       # F
        DAN: "Ida",             # F, M: Magnus
        DEU: "Anna-ML",         # F-ML, F-ML: Petra-ML, M: Markus, Yannick
        ELL: "Melina",          # F, M: Nikos
        ENG: "Kate",            # F
        EUS: "Miren",           # F
        FIN: "Satu",            # F
        FRA: "Audrey-ML",       # F-ML, F: Aurelie, M: Thomas
        GLG: "Carmela",         # F
        HEB: "Carmit",          # F
        HIN: "Lekha",           # F
        HUN: "Mariska",         # F
        IND: "Damayanti",       # F
        ITA: "Alice-ML",        # F-ML, F: Federica, Paola, M: Luca
        JPN: "Kyoko",           # F, M: Otoya
        KOR: "Sora",            # F
        NLD: "Claire",          # F, M: Xander
        NOR: "Nora",            # F, M: Henrik
        POL: "Ewa",             # F, F: Zosia
        POR: "Catarina",        # F
        RON: "Ioana",           # F
        RUS: "Katya",           # F, F: Milena, M: Yuri
        SLK: "Laura",           # F
        SPA: "Monica",          # F, M: Jorge
        SWE: "Alva",            # F, M: Oskar
        THA: "Kanya",           # F
        TUR: "Yelda",           # F, M: Cem
        YUE: "Sin-Ji",          # F
        CMN_CHN: "Tian-Tian",   # F
        CMN_TWN: "Mei-Jia",     # F
        FRA_CAN: "Amelie",      # F, F: Chantal, M: Nicolas
        ENG_AUS: "Karen",       # F, M: Lee
        ENG_GBR: "Kate",        # F, F: Serena, M: Daniel, Oliver
        ENG_IND: "Veena",       # F
        ENG_IRL: "Moira",       # F
        ENG_SCT: "Fiona",       # F
        ENG_USA: "Ava",         # F, F: Allison, Samantha, Susan, Zoe, M: Tom
        ENG_ZAF: "Tessa",       # F
        NLD_BEL: "Ellen",       # F
        POR_BRA: "Luciana",     # F, M: Felipe
        POR_PRT: "Catarina",    # F, F: Joana
        SPA_COL: "Soledad",     # F, M: Carlos
        SPA_ESP: "Monica",      # F, F (Valencian): Empar
        SPA_MEX: "Angelica",    # F, F: Paulina, M: Juan
    }

    DEFAULT_LANGUAGE = ENG_GBR

    OUTPUT_AUDIO_FORMAT = ("pcm_s16le", 1, 16000)

    HAS_PYTHON_CALL = True

    # Nuance TTS API specific
    END_POINT = "NMDPTTSCmdServlet/tts"
    """ Nuance TTS API end point """

    SAMPLE_RATE = 16000
    """ Synthesize 16kHz PCM16 mono """

    URL = "https://tts.nuancemobility.net"
    """ Nuance TTS API URL """

    TAG = u"NuanceTTSWrapper"

    def __init__(self, rconf=None, logger=None):
        super(NuanceTTSWrapper, self).__init__(rconf=rconf, logger=logger)

    def _synthesize_single_python_helper(self, text, voice_code, output_file_path=None, return_audio_data=True):
        """
        Synthesize a single text fragment by POSTing it to the Nuance TTS API.

        The request is repeated, sleeping ``TTS_API_SLEEP`` seconds before
        each attempt, until a response with status code 200 is received or
        ``TTS_API_RETRY_ATTEMPTS`` attempts have been used.
        The response body is treated as 16-bit PCM mono samples
        at ``SAMPLE_RATE`` Hz.

        :param text: the text to synthesize (it is encoded as UTF-8 before posting)
        :param voice_code: the voice name, sent as the ``voice`` query parameter
        :param output_file_path: if not ``None``, the samples are also written
                                 to a WAVE file at this path
        :param return_audio_data: accepted for interface compatibility;
                                  this implementation does not read it and
                                  always returns the audio data
        :rtype: tuple ``(True, (audio_length, sample_rate, "pcm16", samples))``
                where ``samples`` is a float64 numpy array scaled by 1/32768

        Failures (an exception during the HTTP POST, or no attempt returning
        status code 200) are reported through ``self.log_exc`` with
        ``ValueError`` as the exception type; ``log_exc`` is defined outside
        this class and is presumably raising it.
        """
        # requests is imported lazily, so the module itself can be imported
        # even when requests is not installed
        self.log(u"Importing requests...")
        import requests
        self.log(u"Importing requests... done")

        # prepare request header and contents
        request_id = str(uuid.uuid4()).replace("-", "")[0:16]
        headers = {
            u"Content-Type": u"text/plain; charset=utf-8",
            u"Accept": u"audio/x-wav;codec=pcm;bit=16;rate=%d" % self.SAMPLE_RATE
        }
        text_to_synth = text.encode("utf-8")
        url = "%s/%s?appId=%s&appKey=%s&id=%s&voice=%s" % (
            self.URL,
            self.END_POINT,
            self.rconf[RuntimeConfiguration.NUANCE_TTS_API_ID],
            self.rconf[RuntimeConfiguration.NUANCE_TTS_API_KEY],
            request_id,
            voice_code
        )

        # post request
        sleep_delay = self.rconf[RuntimeConfiguration.TTS_API_SLEEP]
        attempts = self.rconf[RuntimeConfiguration.TTS_API_RETRY_ATTEMPTS]
        self.log([u"Sleep delay: %.3f", sleep_delay])
        self.log([u"Retry attempts: %d", attempts])
        while attempts > 0:
            self.log(u"Sleeping to throttle API usage...")
            time.sleep(sleep_delay)
            self.log(u"Sleeping to throttle API usage... done")
            self.log(u"Posting...")
            try:
                response = requests.post(url, data=text_to_synth, headers=headers)
            except Exception as exc:
                self.log_exc(u"Unexpected exception on HTTP POST. Are you offline?", exc, True, ValueError)
            self.log(u"Posting... done")
            status_code = response.status_code
            self.log([u"Status code: %d", status_code])
            if status_code == 200:
                self.log(u"Got status code 200, break")
                break
            else:
                self.log_warn(u"Got status code other than 200, retry")
                attempts -= 1

        # attempts reaches zero only if no request returned status code 200
        # (or if the configured number of attempts was not positive)
        if attempts <= 0:
            self.log_exc(u"All API requests returned status code != 200", None, True, ValueError)

        # save to file if requested
        if output_file_path is None:
            self.log(u"output_file_path is None => not saving to file")
        else:
            self.log(u"output_file_path is not None => saving to file...")
            import wave
            output_file = wave.open(output_file_path, "wb")
            # try/finally guarantees the file handle is released
            # even if writing the header or the frames fails
            try:
                output_file.setframerate(self.SAMPLE_RATE)  # sample rate
                output_file.setnchannels(1)                 # 1 channel, i.e. mono
                output_file.setsampwidth(2)                 # 16 bit/sample, i.e. 2 bytes/sample
                output_file.writeframes(response.content)
            finally:
                output_file.close()
            self.log(u"output_file_path is not None => saving to file... done")

        # get length and data
        audio_sample_rate = self.SAMPLE_RATE
        # integer division: a frame count is an integer (2 bytes per mono frame),
        # while "/" would yield a float because of "from __future__ import division"
        number_of_frames = len(response.content) // 2
        audio_length = TimeValue(number_of_frames / audio_sample_rate)
        self.log([u"Response (bytes): %d", len(response.content)])
        self.log([u"Number of frames: %d", number_of_frames])
        self.log([u"Audio length (s): %.3f", audio_length])
        audio_format = "pcm16"
        # numpy.frombuffer replaces the deprecated binary mode of numpy.fromstring;
        # astype() copies the data, so the returned array is independent of
        # (and not read-only like) the response buffer
        audio_samples = numpy.frombuffer(response.content, dtype=numpy.int16).astype("float64") / 32768

        # return data
        return (True, (audio_length, audio_sample_rate, audio_format, audio_samples))