123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import logging
- from re import I
- import uuid
- import json
- import threading
- from nls.core import NlsCore
- from . import logging
- from . import util
- from .exception import (StartTimeoutException,
- CompleteTimeoutException,
- InvalidParameter)
# Value of the request header's 'namespace' field for regular synthesis.
__SPEECH_SYNTHESIZER_NAMESPACE__ = 'SpeechSynthesizer'
# Value used for 'namespace' instead when long-text synthesis is enabled.
__SPEECH_LONG_SYNTHESIZER_NAMESPACE__ = 'SpeechLongSynthesizer'

# Maps a client-side action to the command sent in the header's 'name' field.
__SPEECH_SYNTHESIZER_REQUEST_CMD__ = {'start': 'StartSynthesis'}

# Websocket endpoint used when the caller does not supply a url.
__URL__ = 'wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1'

__all__ = ['NlsSpeechSynthesizer']
class NlsSpeechSynthesizer:
    """
    Api for text-to-speech

    Sends a synthesis request over an NlsCore connection and relays the
    server's messages and binary audio data to user-supplied callbacks.
    """

    def __init__(self,
                 url=__URL__,
                 token=None,
                 appkey=None,
                 long_tts=False,
                 on_metainfo=None,
                 on_data=None,
                 on_completed=None,
                 on_error=None,
                 on_close=None,
                 callback_args=None):
        """
        NlsSpeechSynthesizer initialization

        Parameters:
        -----------
        url: str
            websocket url.
        token: str
            access token from aliyun, required.
        appkey: str
            appkey from aliyun, required.
        long_tts: bool
            whether using long-text synthesis support, default is False.
            long-text synthesis can support longer text but more expensive.
        on_metainfo: function
            Callback object which is called when a 'MetaInfo' message
            arrives.
            on_metainfo has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_data: function
            Callback object which is called when partial synthesis result
            arrived.
            on_data has two arguments.
            The 1st argument is binary data corresponding to aformat in
            start method.
            The 2nd argument is *args which is callback_args.
        on_completed: function
            Callback object which is called when synthesis is completed.
            on_completed has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_error: function
            Callback object which is called when the task failed.
            on_error has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_close: function
            Callback object which is called when connection closed.
            on_close has one argument.
            The 1st argument is *args which is callback_args.
        callback_args: list
            callback_args will return in callbacks above for *args.
            Defaults to an empty list.

        Raises:
        -------
        InvalidParameter
            if token or appkey is missing.
        """
        if not token or not appkey:
            raise InvalidParameter('Must provide token and appkey')

        # Dispatch table: server message name -> handler method.
        self.__response_handler__ = {
            'MetaInfo': self.__metainfo,
            'SynthesisCompleted': self.__synthesis_completed,
            'TaskFailed': self.__task_failed
        }
        # A fresh list per instance; a literal [] default would be shared
        # by every instance created without callback_args.
        self.__callback_args = [] if callback_args is None else callback_args
        self.__url = url
        self.__appkey = appkey
        self.__token = token
        self.__long_tts = long_tts
        # No connection exists until start() is called.
        self.__nls = None
        # __start_flag is guarded by __start_cond; it is True between the
        # connection opening and the task finishing, failing or closing.
        self.__start_cond = threading.Condition()
        self.__start_flag = False
        self.__on_metainfo = on_metainfo
        self.__on_data = on_data
        self.__on_completed = on_completed
        self.__on_error = on_error
        self.__on_close = on_close
        self.__allow_aformat = (
            'pcm', 'wav', 'mp3'
        )
        self.__allow_sample_rate = (
            8000, 11025, 16000, 22050,
            24000, 32000, 44100, 48000
        )

    def __handle_message(self, message):
        """Parse a text message and dispatch it by its header name."""
        logging.debug('__handle_message')
        # Only the parsing is guarded, so an exception raised inside a user
        # callback is not mistaken for a malformed message.
        try:
            name = json.loads(message)['header']['name']
        except (json.JSONDecodeError, KeyError, TypeError):
            logging.error('cannot parse message:{}'.format(message))
            return
        handler = self.__response_handler__.get(name)
        if handler is None:
            logging.error('cannot handle cmd{}'.format(name))
            return
        # Handlers receive the raw message string, not the parsed object.
        handler(message)

    def __syn_core_on_open(self):
        """Mark the session as started and wake the thread in start()."""
        logging.debug('__syn_core_on_open')
        with self.__start_cond:
            self.__start_flag = True
            self.__start_cond.notify()

    def __syn_core_on_data(self, data, opcode, flag):
        """Forward binary data to the on_data callback, if any."""
        logging.debug('__syn_core_on_data')
        if self.__on_data:
            self.__on_data(data, *self.__callback_args)

    def __syn_core_on_msg(self, msg, *args):
        """Hand a text message from the connection to the dispatcher."""
        logging.debug('__syn_core_on_msg:msg={} args={}'.format(msg, args))
        self.__handle_message(msg)

    def __syn_core_on_error(self, msg, *args):
        """Log a connection-level error; no user callback is invoked."""
        logging.debug('__syn_core_on_error:msg={} args={}'.format(msg, args))

    def __syn_core_on_close(self):
        """Notify on_close, then clear the started flag and wake waiters."""
        logging.debug('__syn_core_on_close')
        if self.__on_close:
            self.__on_close(*self.__callback_args)
        with self.__start_cond:
            self.__start_flag = False
            self.__start_cond.notify()

    def __metainfo(self, message):
        """Forward a 'MetaInfo' message to the on_metainfo callback."""
        logging.debug('__metainfo')
        if self.__on_metainfo:
            self.__on_metainfo(message, *self.__callback_args)

    def __synthesis_completed(self, message):
        """Shut the connection down, notify on_completed, wake waiters."""
        logging.debug('__synthesis_completed')
        self.__nls.shutdown()
        logging.debug('__synthesis_completed shutdown done')
        if self.__on_completed:
            self.__on_completed(message, *self.__callback_args)
        with self.__start_cond:
            self.__start_flag = False
            self.__start_cond.notify()

    def __task_failed(self, message):
        """Clear the started flag, wake waiters, then notify on_error."""
        logging.debug('__task_failed')
        with self.__start_cond:
            self.__start_flag = False
            self.__start_cond.notify()
        if self.__on_error:
            self.__on_error(message, *self.__callback_args)

    def start(self,
              text=None,
              voice='xiaoyun',
              aformat='pcm',
              sample_rate=16000,
              volume=50,
              speech_rate=0,
              pitch_rate=0,
              wait_complete=True,
              start_timeout=10,
              completed_timeout=60,
              ex: dict = None):
        """
        Synthesis start

        Returns without doing anything if a session is already started.

        Parameters:
        -----------
        text: str
            utf-8 text
        voice: str
            voice for text-to-speech, default is xiaoyun
        aformat: str
            audio binary format, support: 'pcm', 'wav', 'mp3', default is
            'pcm'
        sample_rate: int
            audio sample rate, default is 16000, support: 8000, 11025,
            16000, 22050, 24000, 32000, 44100, 48000
        volume: int
            audio volume, from 0~100, default is 50
        speech_rate: int
            speech rate from -500~500, default is 0
        pitch_rate: int
            pitch for voice from -500~500, default is 0
        wait_complete: bool
            whether block until synthesis completed or timeout for
            completed timeout
        start_timeout: int
            timeout in seconds for connection established
        completed_timeout: int
            timeout in seconds for waiting synthesis completed from
            connection established
        ex: dict
            dict which will merge into 'payload' field in request

        Raises:
        -------
        InvalidParameter
            if text is None or an argument is outside its supported values.
        StartTimeoutException
            if the connection is not established within start_timeout.
        CompleteTimeoutException
            if wait_complete is set and no completion, failure or close is
            signalled within completed_timeout.
        """
        # Validate everything before touching instance state, so a rejected
        # call cannot disturb a connection that is already running.
        if text is None:
            raise InvalidParameter('Text cannot be None')
        if aformat not in self.__allow_aformat:
            raise InvalidParameter('format {} not support'.format(aformat))
        if sample_rate not in self.__allow_sample_rate:
            raise InvalidParameter(
                'samplerate {} not support'.format(sample_rate))
        if volume < 0 or volume > 100:
            raise InvalidParameter('volume {} not support'.format(volume))
        if speech_rate < -500 or speech_rate > 500:
            raise InvalidParameter(
                'speech_rate {} not support'.format(speech_rate))
        if pitch_rate < -500 or pitch_rate > 500:
            raise InvalidParameter(
                'pitch rate {} not support'.format(pitch_rate))

        namespace = __SPEECH_SYNTHESIZER_NAMESPACE__
        if self.__long_tts:
            namespace = __SPEECH_LONG_SYNTHESIZER_NAMESPACE__

        message_id = uuid.uuid4().hex
        task_id = uuid.uuid4().hex
        header = {
            'message_id': message_id,
            'task_id': task_id,
            'namespace': namespace,
            'name': __SPEECH_SYNTHESIZER_REQUEST_CMD__['start'],
            'appkey': self.__appkey
        }
        payload = {
            'text': text,
            'voice': voice,
            'format': aformat,
            'sample_rate': sample_rate,
            'volume': volume,
            'speech_rate': speech_rate,
            'pitch_rate': pitch_rate
        }
        if ex:
            # Caller-supplied entries override the defaults built above.
            payload.update(ex)
        request = json.dumps({
            'header': header,
            'payload': payload,
            'context': util.GetDefaultContext()
        })

        with self.__start_cond:
            if self.__start_flag:
                logging.debug('already start...')
                return
            # Replace the connection and task id only once we know a new
            # session really begins; otherwise shutdown() would lose its
            # handle on the connection that is still live.
            self.__task_id = task_id
            self.__nls = NlsCore(
                url=self.__url,
                token=self.__token,
                on_open=self.__syn_core_on_open,
                on_message=self.__syn_core_on_msg,
                on_data=self.__syn_core_on_data,
                on_close=self.__syn_core_on_close,
                on_error=self.__syn_core_on_error,
                callback_args=[])
            self.__nls.start(request, ping_interval=0, ping_timeout=None)
            if not self.__start_flag:
                # Woken by on_open (flag True) or by close/failure
                # (flag False); a timeout means neither happened.
                if not self.__start_cond.wait(start_timeout):
                    logging.debug('syn start timeout')
                    raise StartTimeoutException(
                        f'Waiting Start over {start_timeout}s')
            if self.__start_flag and wait_complete:
                if not self.__start_cond.wait(completed_timeout):
                    raise CompleteTimeoutException(
                        f'Waiting Complete over {completed_timeout}s')

    def shutdown(self):
        """
        Shutdown connection immediately

        Does nothing if start() has never created a connection.
        """
        if self.__nls is not None:
            self.__nls.shutdown()
|