From e92d0692c2509aa58c87735555b6a6b7c802ea31 Mon Sep 17 00:00:00 2001 From: Markus Toman Date: Thu, 18 Dec 2014 12:11:55 +0100 Subject: [PATCH] SAPI Bookmarks added SAPI bookmarks are now supported (allows e.g. JAWS to continuously read Word documents). Avoid hts_engine crash when an empty labels array is provided (previously there were always at least "sil" phones; with the new option to turn them off, an empty array can now occur). --- engine/manager/include/Text.h | 1 - engine/manager/src/HTSEngineSynthesizer.cpp | 46 +++++++++-------- engine/manager/src/Text.cpp | 9 ---- sapi/htstts.cpp | 57 ++++++++++++++++----- 4 files changed, 69 insertions(+), 44 deletions(-) diff --git a/engine/manager/include/Text.h b/engine/manager/include/Text.h index 14215b4..5e41fc5 100644 --- a/engine/manager/include/Text.h +++ b/engine/manager/include/Text.h @@ -27,7 +27,6 @@ class TextFragment { std::string text; ///< actual text public: - // TextFragment( const std::string& text, const VoiceDataPtr& voiceData ); TextFragment(const std::string& text, const FragmentPropertiesPtr& properties); const std::string& GetText() const { diff --git a/engine/manager/src/HTSEngineSynthesizer.cpp b/engine/manager/src/HTSEngineSynthesizer.cpp index 4896516..0adcbd6 100644 --- a/engine/manager/src/HTSEngineSynthesizer.cpp +++ b/engine/manager/src/HTSEngineSynthesizer.cpp @@ -33,6 +33,8 @@ HTSEngineSynthesizer::~HTSEngineSynthesizer() { **/ TTSResultPtr HTSEngineSynthesizer::SynthesizeLabels(const FragmentPropertiesPtr& properties, const LabelsPtr& labels) { + TTSResultPtr result(new TTSResult()); + //- no model given? 
if (properties->find(PROPERTY_KEY_VOICE_PATH) == properties->end()) { throw PropertyMissingException(PROPERTY_KEY_VOICE_PATH); @@ -92,36 +94,38 @@ TTSResultPtr HTSEngineSynthesizer::SynthesizeLabels(const FragmentPropertiesPtr& } //- synthesize - //- this is a big ugly hack as we need char** instead of std::string - //- we currently use an array of pointers to the internal c_str of the label data. - //- this is faster but also unsafer than copying it. - char** labelData = new char* [ labels->size() ]; - std::vector::iterator it = labels->begin(); - for (int i = 0; it != labels->end(); ++it, ++i) { - LabelPtr lab = *it; - labelData[i] = const_cast(lab->GetAsHTKLabel().c_str()); - } - HTS_Engine_synthesize_from_strings(&engine, labelData, labels->size()); - delete[] labelData; + if (labels->size() > 0) { + //- this is a big ugly hack as we need char** instead of std::string + //- we currently use an array of pointers to the internal c_str of the label data. + //- this is faster but also unsafer than copying it. 
+ char** labelData = new char* [ labels->size() ]; + std::vector::iterator it = labels->begin(); + for (int i = 0; it != labels->end(); ++it, ++i) { + LabelPtr lab = *it; + labelData[i] = const_cast(lab->GetAsHTKLabel().c_str()); + } - //- store result data - TTSResultPtr result(new TTSResult()); - int num_samples = HTS_Engine_get_nsamples(&engine); - result->GetFrames().reserve(num_samples); + HTS_Engine_synthesize_from_strings(&engine, labelData, labels->size()); - for (int i = 0; i < num_samples; ++i) { - result->GetFrames().push_back(HTS_Engine_get_generated_speech(&engine, i)); + //- store result data + int num_samples = HTS_Engine_get_nsamples(&engine); + result->GetFrames().reserve(num_samples); + + for (int i = 0; i < num_samples; ++i) { + result->GetFrames().push_back(HTS_Engine_get_generated_speech(&engine, i)); + } + delete[] labelData; + + //- store labels + // we could add time alignment information to labels here + result->GetLabels().insert(result->GetLabels().begin(), labels->begin(), labels->end()); } //- store meta information // additional meta information in TTS result can be added here. 
result->SetSamplingRate(HTS_Engine_get_sampling_frequency(&engine)); - //- store labels - // we could add time alignment information to labels here - result->GetLabels().insert(result->GetLabels().begin(), labels->begin(), labels->end()); - return result; } diff --git a/engine/manager/src/Text.cpp b/engine/manager/src/Text.cpp index 442f6d7..01a3555 100644 --- a/engine/manager/src/Text.cpp +++ b/engine/manager/src/Text.cpp @@ -22,15 +22,6 @@ Text::~Text() { } -/** - * TextFragment - **/ -/* -TextFragment::TextFragment( const std::string& text ) { - this->text = text; - this->properties = FragmentPropertiesPtr(new FragmentProperties()); -}*/ - /** * TextFragment **/ diff --git a/sapi/htstts.cpp b/sapi/htstts.cpp index 5e1d64c..f58db2d 100644 --- a/sapi/htstts.cpp +++ b/sapi/htstts.cpp @@ -1,9 +1,8 @@ -ulTextLen; ULONG currlen; ULONG FRAG_SIZE = 500; + + // if this fragment has some changed properties for volume/pitch/etc., change the properties, else use voice properties FragmentPropertiesPtr props = AdjustProperties(&(curr_frag->State), this->voiceProperties); switch (curr_frag->State.eAction) { @@ -172,8 +185,12 @@ HTSTTS::Speak(DWORD dwSpeakFlags, LOG_DEBUG("[Speak] Should spell out something"); case SPVA_Pronounce: LOG_DEBUG("[Speak] Should pronounce something"); + case SPVA_Bookmark: + // bookmarks are treated like normal text fragments, + // but will set a special property + LOG_DEBUG("[Speak] Set a bookmark here"); + (*props)[SAPI_PROPERTY_BOOKMARK] = std::string((char*)currStart, charsLeft * sizeof(wchar_t)) + std::string("\0"); case SPVA_Speak: - LOG_DEBUG("[Speak] Converting text"); if (curr_frag->ulTextLen == 0) { @@ -198,8 +215,6 @@ HTSTTS::Speak(DWORD dwSpeakFlags, WideCharToMultiByte(CP_UTF8, 0, currStart, currlen, tmptext, len, NULL, NULL); tmptext[len] = 0; - //TODO: copy and modify properties if this fragment has specific needs - LOG_DEBUG("[Speak] Text = " << tmptext); fullText.push_back(TextFragmentPtr(new TextFragment(tmptext, props))); @@ -215,8 
+230,6 @@ HTSTTS::Speak(DWORD dwSpeakFlags, case SPVA_Silence: LOG_DEBUG("[Speak] Should do silence"); break; - case SPVA_Bookmark: - break; default: break; } @@ -235,11 +248,23 @@ HTSTTS::Speak(DWORD dwSpeakFlags, break; } + // is this actually a bookmark? + //TODO: only send if interest is there + if (tf->GetProperties()->find(SAPI_PROPERTY_BOOKMARK) != tf->GetProperties()->end()) { + SPEVENT evt; + evt.eEventId = SPEI_TTS_BOOKMARK; + evt.elParamType = SPET_LPARAM_IS_STRING; + evt.ullAudioStreamOffset = 0L; + evt.wParam = atol(tf->GetText().c_str()); + evt.lParam = (LPARAM)((*tf->GetProperties())[SAPI_PROPERTY_BOOKMARK].c_str()); + (strdup(tf->GetText().c_str())); + pOutputSite->AddEvents(&evt, 1); + continue; + } + LOG_DEBUG("[Speak] Synthesize text fragment"); try { - //TODO: modify volume and speaking rate according to live action events for future fragments - // combine it with values from SAPI XML TTSResultPtr result = ttsManager.SynthesizeTextFragment(tf); //TODO: is result->GetSamplingRate() correct? @@ -349,18 +374,14 @@ void HTSTTS::HandleActions(ISpTTSEngineSite* site) { //- change base speaking rate action if (actions & SPVES_RATE) { long adj; - site->GetRate(&adj); - ttsManager.SetBaseSpeakingRate(ConvertSapiRate(adj)); } //- change base volume if (actions & SPVES_VOLUME) { USHORT adj; - site->GetVolume(&adj); - ttsManager.SetBaseVolume((int)adj); } } @@ -378,6 +399,14 @@ FragmentPropertiesPtr HTSTTS::AdjustProperties(const SPVSTATE* state, FragmentPr //TODO: if( state.EmphAdj ) //TODO: if( state.PitchAdj.MiddleAdj ) + // bookmark always needs special properties. + if (state->eAction == SPVA_Bookmark) { + if (newProps == NULL) { + newProps = new FragmentProperties(*props); + } + //(*newProps)[SAPI_PROPERTY_BOOKMARK] = PROPERTY_VALUE_TRUE; + } + //- speaking rate changed for this fragment? 
if (state->RateAdj) { std::stringstream ss; @@ -403,6 +432,8 @@ FragmentPropertiesPtr HTSTTS::AdjustProperties(const SPVSTATE* state, FragmentPr //- return either the new properties //- or if nothing has changed, the old ones if (newProps) { + (*newProps)[PROPERTY_KEY_FRAGMENT_NOSIL_END] = PROPERTY_VALUE_TRUE; + (*newProps)[PROPERTY_KEY_FRAGMENT_NOSIL_BEGIN] = PROPERTY_VALUE_TRUE; return FragmentPropertiesPtr(newProps); } else {