From e92d0692c2509aa58c87735555b6a6b7c802ea31 Mon Sep 17 00:00:00 2001
From: Markus Toman <m.toman@neuratec.com>
Date: Thu, 18 Dec 2014 12:11:55 +0100
Subject: [PATCH] SAPI Bookmarks added

SAPI bookmarks are now supported (this allows e.g. JAWS to read a Word
document continuously).
Avoid an hts_engine crash when an empty labels array is provided (previously
there were always at least "sil" phones; with the new option to turn
them off, an empty array might sometimes happen).
---
 engine/manager/include/Text.h               |  1 -
 engine/manager/src/HTSEngineSynthesizer.cpp | 46 +++++++++--------
 engine/manager/src/Text.cpp                 |  9 ----
 sapi/htstts.cpp                             | 57 ++++++++++++++++-----
 4 files changed, 69 insertions(+), 44 deletions(-)
diff --git a/engine/manager/include/Text.h b/engine/manager/include/Text.h
index 14215b4..5e41fc5 100644
--- a/engine/manager/include/Text.h
+++ b/engine/manager/include/Text.h
@@ -27,7 +27,6 @@ class TextFragment {
       std::string text;                       ///< actual text
 
    public:
-      // TextFragment( const std::string& text, const VoiceDataPtr& voiceData );
       TextFragment(const std::string& text, const FragmentPropertiesPtr& properties);
 
       const std::string& GetText() const {
diff --git a/engine/manager/src/HTSEngineSynthesizer.cpp b/engine/manager/src/HTSEngineSynthesizer.cpp
index 4896516..0adcbd6 100644
--- a/engine/manager/src/HTSEngineSynthesizer.cpp
+++ b/engine/manager/src/HTSEngineSynthesizer.cpp
@@ -33,6 +33,8 @@ HTSEngineSynthesizer::~HTSEngineSynthesizer() {
 **/
 TTSResultPtr HTSEngineSynthesizer::SynthesizeLabels(const FragmentPropertiesPtr& properties, const LabelsPtr& labels) {
 
+   TTSResultPtr result(new TTSResult());
+
    //- no model given?
    if (properties->find(PROPERTY_KEY_VOICE_PATH) == properties->end()) {
       throw PropertyMissingException(PROPERTY_KEY_VOICE_PATH);
@@ -92,36 +94,38 @@ TTSResultPtr HTSEngineSynthesizer::SynthesizeLabels(const FragmentPropertiesPtr&
    }
 
    //- synthesize
-   //- this is a big ugly hack as we need char** instead of std::string
-   //- we currently use an array of pointers to the internal c_str of the label data.
-   //- this is faster but also unsafer than copying it.
-   char** labelData = new char* [ labels->size() ];
-   std::vector<LabelPtr>::iterator it = labels->begin();
-   for (int i = 0; it != labels->end(); ++it, ++i) {
-      LabelPtr lab = *it;
-      labelData[i] = const_cast<char*>(lab->GetAsHTKLabel().c_str());
-   }
 
-   HTS_Engine_synthesize_from_strings(&engine, labelData, labels->size());
-   delete[] labelData;
+   if (labels->size() > 0) {
+      //- this is a big ugly hack as we need char** instead of std::string
+      //- we currently use an array of pointers to the internal c_str of the label data.
+      //- this is faster but also less safe than copying it.
+      char** labelData = new char* [ labels->size() ];
+      std::vector<LabelPtr>::iterator it = labels->begin();
+      for (int i = 0; it != labels->end(); ++it, ++i) {
+         LabelPtr lab = *it;
+         labelData[i] = const_cast<char*>(lab->GetAsHTKLabel().c_str());
+      }
 
-   //- store result data
-   TTSResultPtr result(new TTSResult());
-   int num_samples = HTS_Engine_get_nsamples(&engine);
-   result->GetFrames().reserve(num_samples);
+      HTS_Engine_synthesize_from_strings(&engine, labelData, labels->size());
 
-   for (int i = 0; i < num_samples; ++i) {
-      result->GetFrames().push_back(HTS_Engine_get_generated_speech(&engine, i));
+      //- store result data
+      int num_samples = HTS_Engine_get_nsamples(&engine);
+      result->GetFrames().reserve(num_samples);
+
+      for (int i = 0; i < num_samples; ++i) {
+         result->GetFrames().push_back(HTS_Engine_get_generated_speech(&engine, i));
+      }
+      delete[] labelData;
+
+      //- store labels
+      // we could add time alignment information to labels here
+      result->GetLabels().insert(result->GetLabels().begin(), labels->begin(), labels->end());
    }
 
    //- store meta information
    // additional meta information in TTS result can be added here.
    result->SetSamplingRate(HTS_Engine_get_sampling_frequency(&engine));
 
-   //- store labels
-   // we could add time alignment information to labels here
-   result->GetLabels().insert(result->GetLabels().begin(), labels->begin(), labels->end());
-
    return result;
 }
 
diff --git a/engine/manager/src/Text.cpp b/engine/manager/src/Text.cpp
index 442f6d7..01a3555 100644
--- a/engine/manager/src/Text.cpp
+++ b/engine/manager/src/Text.cpp
@@ -22,15 +22,6 @@ Text::~Text() {
 }
 
 
-/**
- * TextFragment
- **/
-/*
-TextFragment::TextFragment( const std::string& text  ) {
-    this->text = text;
-    this->properties = FragmentPropertiesPtr(new FragmentProperties());
-}*/
-
 /**
  * TextFragment
  **/
diff --git a/sapi/htstts.cpp b/sapi/htstts.cpp
index 5e1d64c..f58db2d 100644
--- a/sapi/htstts.cpp
+++ b/sapi/htstts.cpp
@@ -1,9 +1,8 @@
-<// htstts.cpp : Implementation of HTSTTS
+// htstts.cpp : Implementation of HTSTTS
 
 #include "stdafx.h"
 #include "htstts.h"
 
-
 #include "TTSManager.h"
 #include "TTSLogger.h"
 
@@ -14,10 +13,22 @@ static double ConvertSapiRate(int r);
 
 static TTSManager ttsManager;
 
+#define SAPI_PROPERTY_BOOKMARK "SAPIROPBKMK"
+
+/******************************************************************************
+* Subclass of TextFragment to add some SAPI specific properties.
+******************************************************************************/
+class SAPITextFragment : public TextFragment {
+   public:
+      std::string bookmark;
+};
+
 /******************************************************************************
 * HTSTTS Constructor
 ******************************************************************************/
 HTSTTS::HTSTTS() : voiceProperties(new FragmentProperties()) {
+   (*voiceProperties)[PROPERTY_KEY_FRAGMENT_NOSIL_END] = PROPERTY_VALUE_TRUE;
+   (*voiceProperties)[PROPERTY_KEY_FRAGMENT_NOSIL_BEGIN] = PROPERTY_VALUE_TRUE;
    ResetActions();
 }
 
@@ -164,6 +175,8 @@ HTSTTS::Speak(DWORD dwSpeakFlags,
       ULONG charsLeft = curr_frag->ulTextLen;
       ULONG currlen;
       ULONG FRAG_SIZE = 500;
+
+      // use per-fragment properties if this fragment changes volume/pitch/etc.; otherwise fall back to the voice properties
       FragmentPropertiesPtr props = AdjustProperties(&(curr_frag->State), this->voiceProperties);
 
       switch (curr_frag->State.eAction) {
@@ -172,8 +185,12 @@ HTSTTS::Speak(DWORD dwSpeakFlags,
          LOG_DEBUG("[Speak] Should spell out something");
       case SPVA_Pronounce:
          LOG_DEBUG("[Speak] Should pronounce something");
+      case SPVA_Bookmark:
+         // bookmarks are treated like normal text fragments,
+         // but will set a special property
+         LOG_DEBUG("[Speak] Set a bookmark here");
+         (*props)[SAPI_PROPERTY_BOOKMARK] = std::string((char*)currStart, charsLeft * sizeof(wchar_t)) + std::string("\0");
       case SPVA_Speak:
-
          LOG_DEBUG("[Speak] Converting text");
 
          if (curr_frag->ulTextLen == 0) {
@@ -198,8 +215,6 @@ HTSTTS::Speak(DWORD dwSpeakFlags,
             WideCharToMultiByte(CP_UTF8, 0, currStart,  currlen, tmptext, len, NULL, NULL);
             tmptext[len] = 0;
 
-            //TODO: copy and modify properties if this fragment has specific needs
-
             LOG_DEBUG("[Speak] Text = " << tmptext);
             fullText.push_back(TextFragmentPtr(new TextFragment(tmptext, props)));
 
@@ -215,8 +230,6 @@ HTSTTS::Speak(DWORD dwSpeakFlags,
       case SPVA_Silence:
          LOG_DEBUG("[Speak] Should do silence");
          break;
-      case SPVA_Bookmark:
-         break;
       default:
          break;
       }
@@ -235,11 +248,23 @@ HTSTTS::Speak(DWORD dwSpeakFlags,
          break;
       }
 
+      // is this actually a bookmark?
+      //TODO: only send if interest is there
+      if (tf->GetProperties()->find(SAPI_PROPERTY_BOOKMARK) != tf->GetProperties()->end()) {
+         SPEVENT evt;
+         evt.eEventId = SPEI_TTS_BOOKMARK;
+         evt.elParamType = SPET_LPARAM_IS_STRING;
+         evt.ullAudioStreamOffset = 0L;
+         evt.wParam = atol(tf->GetText().c_str());
+         evt.lParam = (LPARAM)((*tf->GetProperties())[SAPI_PROPERTY_BOOKMARK].c_str());
+         (strdup(tf->GetText().c_str()));
+         pOutputSite->AddEvents(&evt, 1);
+         continue;
+      }
+
       LOG_DEBUG("[Speak] Synthesize text fragment");
 
       try {
-         //TODO: modify volume and speaking rate according to live action events for future fragments
-         //      combine it with values from SAPI XML
          TTSResultPtr result = ttsManager.SynthesizeTextFragment(tf);
 
          //TODO: is result->GetSamplingRate() correct?
@@ -349,18 +374,14 @@ void HTSTTS::HandleActions(ISpTTSEngineSite* site) {
    //- change base speaking rate action
    if (actions & SPVES_RATE) {
       long adj;
-
       site->GetRate(&adj);
-
       ttsManager.SetBaseSpeakingRate(ConvertSapiRate(adj));
    }
 
    //- change base volume
    if (actions & SPVES_VOLUME) {
       USHORT adj;
-
       site->GetVolume(&adj);
-
       ttsManager.SetBaseVolume((int)adj);
    }
 }
@@ -378,6 +399,14 @@ FragmentPropertiesPtr HTSTTS::AdjustProperties(const SPVSTATE* state, FragmentPr
    //TODO: if( state.EmphAdj )
    //TODO: if( state.PitchAdj.MiddleAdj )
 
+   // bookmark always needs special properties.
+   if (state->eAction == SPVA_Bookmark) {
+      if (newProps == NULL) {
+         newProps = new FragmentProperties(*props);
+      }
+      //(*newProps)[SAPI_PROPERTY_BOOKMARK] = PROPERTY_VALUE_TRUE;
+   }
+
    //- speaking rate changed for this fragment?
    if (state->RateAdj) {
       std::stringstream ss;
@@ -403,6 +432,8 @@ FragmentPropertiesPtr HTSTTS::AdjustProperties(const SPVSTATE* state, FragmentPr
    //- return either the new properties
    //- or if nothing has changed, the old ones
    if (newProps) {
+      (*newProps)[PROPERTY_KEY_FRAGMENT_NOSIL_END] = PROPERTY_VALUE_TRUE;
+      (*newProps)[PROPERTY_KEY_FRAGMENT_NOSIL_BEGIN] = PROPERTY_VALUE_TRUE;
       return FragmentPropertiesPtr(newProps);
    }
    else {