client_service
/
voice-gateway-service


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
							# Copyright (c) Alibaba, Inc. and its affiliates.

import logging
import uuid
import json
import threading
from enum import IntEnum

from nls.core import NlsCore
from . import logging
from .exception import StartTimeoutException, WrongStateException, InvalidParameter

__STREAM_INPUT_TTS_NAMESPACE__ = "FlowingSpeechSynthesizer"

__STREAM_INPUT_TTS_REQUEST_CMD__ = {
    "start": "StartSynthesis",
    "send": "RunSynthesis",
    "stop": "StopSynthesis",
}
__STREAM_INPUT_TTS_REQUEST_NAME__ = {
    "started": "SynthesisStarted",
    "sentence_begin": "SentenceBegin",
    "sentence_synthesis": "SentenceSynthesis",
    "sentence_end": "SentenceEnd",
    "completed": "SynthesisCompleted",
    "task_failed": "TaskFailed",
}

__URL__ = "wss://nls-gateway.cn-shanghai.aliyuncs.com/ws/v1"

__all__ = ["NlsStreamInputTtsSynthesizer"]


class NlsStreamInputTtsRequest:
    def __init__(self, task_id, session_id, appkey):
        self.task_id = task_id
        self.appkey = appkey
        self.session_id = session_id

    def getStartCMD(self, voice, format, sample_rate, volumn, speech_rate, pitch_rate, ex):
        self.voice = voice
        self.format = format
        self.sample_rate = sample_rate
        self.volumn = volumn
        self.speech_rate = speech_rate
        self.pitch_rate = pitch_rate
        cmd = {
            "header": {
                "message_id": uuid.uuid4().hex,
                "task_id": self.task_id,
                "name": __STREAM_INPUT_TTS_REQUEST_CMD__["start"],
                "namespace": __STREAM_INPUT_TTS_NAMESPACE__,
                "appkey": self.appkey,
            },
            "payload": {
                "session_id": self.session_id,
                "voice": self.voice,
                "format": self.format,
                "sample_rate": self.sample_rate,
                "volumn": self.volumn,
                "speech_rate": self.speech_rate,
                "pitch_rate": self.pitch_rate,
            },
        }
        if ex:
            cmd["payload"].update(ex)        
        return json.dumps(cmd)

    def getSendCMD(self, text):
        cmd = {
            "header": {
                "message_id": uuid.uuid4().hex,
                "task_id": self.task_id,
                "name": __STREAM_INPUT_TTS_REQUEST_CMD__["send"],
                "namespace": __STREAM_INPUT_TTS_NAMESPACE__,
                "appkey": self.appkey,
            },
            "payload": {"text": text},
        }
        return json.dumps(cmd)

    def getStopCMD(self):
        cmd = {
            "header": {
                "message_id": uuid.uuid4().hex,
                "task_id": self.task_id,
                "name": __STREAM_INPUT_TTS_REQUEST_CMD__["stop"],
                "namespace": __STREAM_INPUT_TTS_NAMESPACE__,
                "appkey": self.appkey,
            },
        }
        return json.dumps(cmd)


class NlsStreamInputTtsStatus(IntEnum):
    Begin = 1
    Start = 2
    Started = 3
    WaitingComplete = 3
    Completed = 4
    Failed = 5
    Closed = 6

class ThreadSafeStatus:
    def __init__(self, state: NlsStreamInputTtsStatus):
        self._state = state
        self._lock = threading.Lock()
    
    def get(self) -> NlsStreamInputTtsStatus:
        with self._lock:
            return self._state
    
    def set(self, state: NlsStreamInputTtsStatus):
        with self._lock:
            self._state = state


class NlsStreamInputTtsSynthesizer:
    """
    Api for text-to-speech
    """

    def __init__(
        self,
        url=__URL__,
        token=None,
        appkey=None,
        session_id=None,
        on_data=None,
        on_sentence_begin=None,
        on_sentence_synthesis=None,
        on_sentence_end=None,
        on_completed=None,
        on_error=None,
        on_close=None,
        callback_args=[],
    ):
        """
        NlsSpeechSynthesizer initialization

        Parameters:
        -----------
        url: str
            websocket url.
        akid: str
            access id from aliyun. if you provide a token, ignore this argument.
        appkey: str
            appkey from aliyun
        session_id: str
            32-character string, if empty, sdk will generate a random string.
        on_data: function
            Callback object which is called when partial synthesis result arrived
            arrived.
            on_result_changed has two arguments.
            The 1st argument is binary data corresponding to aformat in start
            method.
            The 2nd argument is *args which is callback_args.
        on_sentence_begin: function
            Callback object which is called when detected sentence start.
            on_start has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_sentence_synthesis: function
            Callback object which is called when detected sentence synthesis.
            The incremental timestamp is returned within payload.
            on_start has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_sentence_end: function
            Callback object which is called when detected sentence end.
            The timestamp of the whole sentence is returned within payload.
            on_start has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_completed: function
            Callback object which is called when recognition is completed.
            on_completed has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_error: function
            Callback object which is called when any error occurs.
            on_error has two arguments.
            The 1st argument is message which is a json format string.
            The 2nd argument is *args which is callback_args.
        on_close: function
            Callback object which is called when connection closed.
            on_close has one arguments.
            The 1st argument is *args which is callback_args.
        callback_args: list
            callback_args will return in callbacks above for *args.
        """
        if not token or not appkey:
            raise InvalidParameter("Must provide token and appkey")
        self.__response_handler__ = {
            __STREAM_INPUT_TTS_REQUEST_NAME__["started"]: self.__synthesis_started,
            __STREAM_INPUT_TTS_REQUEST_NAME__["sentence_begin"]: self.__sentence_begin,
            __STREAM_INPUT_TTS_REQUEST_NAME__[
                "sentence_synthesis"
            ]: self.__sentence_synthesis,
            __STREAM_INPUT_TTS_REQUEST_NAME__["sentence_end"]: self.__sentence_end,
            __STREAM_INPUT_TTS_REQUEST_NAME__["completed"]: self.__synthesis_completed,
            __STREAM_INPUT_TTS_REQUEST_NAME__["task_failed"]: self.__task_failed,
        }
        self.__callback_args = callback_args
        self.__url = url
        self.__appkey = appkey
        self.__token = token
        self.__session_id = session_id
        self.start_sended = threading.Event()
        self.started_event = threading.Event()
        self.complete_event = threading.Event()
        self.__on_sentence_begin = on_sentence_begin
        self.__on_sentence_synthesis = on_sentence_synthesis
        self.__on_sentence_end = on_sentence_end
        self.__on_data = on_data
        self.__on_completed = on_completed
        self.__on_error = on_error
        self.__on_close = on_close
        self.__allow_aformat = ("pcm", "wav", "mp3")
        self.__allow_sample_rate = (
            8000,
            11025,
            16000,
            22050,
            24000,
            32000,
            44100,
            48000,
        )
        self.state = ThreadSafeStatus(NlsStreamInputTtsStatus.Begin)     
        if not self.__session_id:
            self.__session_id = uuid.uuid4().hex   
        self.request = NlsStreamInputTtsRequest(
            uuid.uuid4().hex, self.__session_id, self.__appkey
        )

    def __handle_message(self, message):
        logging.debug("__handle_message")
        try:
            __result = json.loads(message)
            if __result["header"]["name"] in self.__response_handler__:
                __handler = self.__response_handler__[__result["header"]["name"]]
                __handler(message)
            else:
                logging.error("cannot handle cmd{}".format(__result["header"]["name"]))
                return
        except json.JSONDecodeError:
            logging.error("cannot parse message:{}".format(message))
            return

    def __syn_core_on_open(self):
        logging.debug("__syn_core_on_open")
        self.start_sended.set()

    def __syn_core_on_data(self, data, opcode, flag):
        logging.debug("__syn_core_on_data")
        if self.__on_data:
            self.__on_data(data, *self.__callback_args)

    def __syn_core_on_msg(self, msg, *args):
        logging.debug("__syn_core_on_msg:msg={} args={}".format(msg, args))
        self.__handle_message(msg)

    def __syn_core_on_error(self, msg, *args):
        logging.debug("__sr_core_on_error:msg={} args={}".format(msg, args))

    def __syn_core_on_close(self):
        logging.debug("__sr_core_on_close")
        if self.__on_close:
            self.__on_close(*self.__callback_args)
        self.state.set(NlsStreamInputTtsStatus.Closed)
        self.start_sended.set()
        self.started_event.set()
        self.complete_event.set()

    def __synthesis_started(self, message):
        logging.debug("__synthesis_started")
        self.started_event.set()

    def __sentence_begin(self, message):
        logging.debug("__sentence_begin")
        if self.__on_sentence_begin:
            self.__on_sentence_begin(message, *self.__callback_args)

    def __sentence_synthesis(self, message):
        logging.debug("__sentence_synthesis")
        if self.__on_sentence_synthesis:
            self.__on_sentence_synthesis(message, *self.__callback_args)

    def __sentence_end(self, message):
        logging.debug("__sentence_end")
        if self.__on_sentence_end:
            self.__on_sentence_end(message, *self.__callback_args)

    def __synthesis_completed(self, message):
        logging.debug("__synthesis_completed")
        if self.__on_completed:
            self.__on_completed(message, *self.__callback_args)
        self.__nls.shutdown()
        logging.debug("__synthesis_completed shutdown done")
        self.complete_event.set()


    def __task_failed(self, message):
        logging.debug("__task_failed")
        self.start_sended.set()
        self.started_event.set()
        self.complete_event.set()
        if self.__on_error:
            self.__on_error(message, *self.__callback_args)
        self.state.set(NlsStreamInputTtsStatus.Failed)

    def startStreamInputTts(
        self,
        voice="longxiaochun",
        aformat="pcm",
        sample_rate=24000,
        volume=50,
        speech_rate=0,
        pitch_rate=0,
        ex:dict=None,
    ):
        """
        Synthesis start

        Parameters:
        -----------
        voice: str
            voice for text-to-speech, default is xiaoyun
        aformat: str
            audio binary format, support: 'pcm', 'wav', 'mp3', default is 'pcm'
        sample_rate: int
            audio sample rate, default is 24000, support:8000, 11025, 16000, 22050,
            24000, 32000, 44100, 48000
        volume: int
            audio volume, from 0~100, default is 50
        speech_rate: int
            speech rate from -500~500, default is 0
        pitch_rate: int
            pitch for voice from -500~500, default is 0
        ex: dict
            dict which will merge into 'payload' field in request
        """

        self.__nls = NlsCore(
            url=self.__url,
            token=self.__token,
            on_open=self.__syn_core_on_open,
            on_message=self.__syn_core_on_msg,
            on_data=self.__syn_core_on_data,
            on_close=self.__syn_core_on_close,
            on_error=self.__syn_core_on_error,
            callback_args=[],
        )

        if aformat not in self.__allow_aformat:
            raise InvalidParameter("format {} not support".format(aformat))
        if sample_rate not in self.__allow_sample_rate:
            raise InvalidParameter("samplerate {} not support".format(sample_rate))
        if volume < 0 or volume > 100:
            raise InvalidParameter("volume {} not support".format(volume))
        if speech_rate < -500 or speech_rate > 500:
            raise InvalidParameter("speech_rate {} not support".format(speech_rate))
        if pitch_rate < -500 or pitch_rate > 500:
            raise InvalidParameter("pitch rate {} not support".format(pitch_rate))

        request = self.request.getStartCMD(
            voice, aformat, sample_rate, volume, speech_rate, pitch_rate, ex
        )
        
        last_state = self.state.get()
        if last_state != NlsStreamInputTtsStatus.Begin:
            logging.debug("start with wrong state {}".format(last_state))
            self.state.set(NlsStreamInputTtsStatus.Failed)
            raise WrongStateException("start with wrong state {}".format(last_state))

        logging.debug("start with request: {}".format(request))
        self.__nls.start(request, ping_interval=0, ping_timeout=None)
        self.state.set(NlsStreamInputTtsStatus.Start)
        if not self.start_sended.wait(timeout=10):
            logging.debug("syn start timeout")
            raise StartTimeoutException(f"Waiting Connection before Start over 10s")

        if last_state != NlsStreamInputTtsStatus.Begin:
            logging.debug("start with wrong state {}".format(last_state))
            self.state.set(NlsStreamInputTtsStatus.Failed)
            raise WrongStateException("start with wrong state {}".format(last_state))

        if not self.started_event.wait(timeout=10):
            logging.debug("syn started timeout")
            self.state.set(NlsStreamInputTtsStatus.Failed)
            raise StartTimeoutException(f"Waiting Started over 10s")
        self.state.set(NlsStreamInputTtsStatus.Started)

    def sendStreamInputTts(self, text):
        """
        send text to server

        Parameters:
        -----------
        text: str
            utf-8 text
        """
        last_state = self.state.get()
        if last_state != NlsStreamInputTtsStatus.Started:
            logging.debug("send with wrong state {}".format(last_state))
            self.state.set(NlsStreamInputTtsStatus.Failed)
            raise WrongStateException("send with wrong state {}".format(last_state))

        request = self.request.getSendCMD(text)
        logging.debug("send with request: {}".format(request))
        self.__nls.send(request, None)

    def stopStreamInputTts(self):
        """
        Synthesis end
        """

        last_state = self.state.get()
        if last_state != NlsStreamInputTtsStatus.Started:
            logging.debug("send with wrong state {}".format(last_state))
            self.state.set(NlsStreamInputTtsStatus.Failed)
            raise WrongStateException("stop with wrong state {}".format(last_state))


        request = self.request.getStopCMD()
        logging.debug("stop with request: {}".format(request))
        self.__nls.send(request, None)
        self.state.set(NlsStreamInputTtsStatus.WaitingComplete)
        self.complete_event.wait()
        self.state.set(NlsStreamInputTtsStatus.Completed)
        self.shutdown()

    def shutdown(self):
        """
        Shutdown connection immediately
        """

        self.__nls.shutdown()