feat: latest_long as the default google ts model (#526)
* JIT-12948 set the Google transcription model to latest_long as the default

* JIT-12948 allow the selection of any google model

* JIT-12948 remove single utterance
rpurdel authored Mar 20, 2024
1 parent 6e60482 commit 551ae20
Showing 1 changed file with 25 additions and 63 deletions.
@@ -165,12 +165,6 @@ public class GoogleCloudTranscriptionService
      */
     private final static boolean RETRIEVE_INTERIM_RESULTS = true;
 
-    /**
-     * Whether the Google Cloud API only listens for a single utterance
-     * or continuous to listen once an utterance is over
-     */
-    private final static boolean SINGLE_UTTERANCE_ONLY = true;
-
     /**
      * The amount of ms after which a StreamingRecognize session will be closed
      * when no new audio is given. This is to make sure the session retrieves
@@ -180,22 +174,21 @@ public class GoogleCloudTranscriptionService
     private final static int STREAMING_SESSION_TIMEOUT_MS = 2000;
 
     /**
-     * Property name to determine whether to use the Google Speech API's
-     * video model
+     * Property name to determine which Google Speech API model to use
      */
-    private final static String P_NAME_USE_VIDEO_MODEL
-        = "org.jitsi.jigasi.transcription.USE_VIDEO_MODEL";
+    private final static String GOOGLE_MODEL
+        = "org.jitsi.jigasi.transcription.google_model";
 
     /**
-     * The default value for the property USE_VIDEO_MODEL
+     * The default value for the property GOOGLE_MODEL
      */
-    private final static boolean DEFAULT_VALUE_USE_VIDEO_MODEL = false;
+    private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long";
 
     /**
      * Check whether the given string contains a supported language tag
      *
      * @param tag the language tag
-     * @throws UnsupportedOperationException when the google cloud API does not
+     * @throws UnsupportedOperationException when the Google cloud API does not
      * support the given language
      */
     private static void validateLanguageTag(String tag)
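Note: the new org.jitsi.jigasi.transcription.google_model property accepts any model name the Speech-to-Text API offers (for example latest_long, latest_short, phone_call or video) rather than the old boolean video-model toggle. A minimal sketch of how a deployment might pick a different model, assuming the usual Jigasi properties file:

    # hypothetical sip-communicator.properties entry
    org.jitsi.jigasi.transcription.google_model=latest_short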
@@ -229,10 +222,9 @@ public boolean supportsLanguageRouting()
     private List<SpeechContext> speechContexts = null;
 
     /**
-     * Whether to use the more expensive video model when making
-     * requests.
+     * The model used for STT
      */
-    private boolean useVideoModel;
+    private final String useModel;
 
     /**
      * Creates the RecognitionConfig the Google service uses based
@@ -262,15 +254,10 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
                 "encoding");
         }
 
-        // set the model to use. It will default to a cheaper model with
-        // lower performance when not set.
-        if (useVideoModel)
+        builder.setModel(useModel);
+        if (logger.isDebugEnabled())
         {
-            if (logger.isDebugEnabled())
-            {
-                logger.debug("Using the more expensive video model");
-            }
-            builder.setModel("video");
+            logger.debug("Using model " + useModel);
         }
 
         // set the Language tag
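For context, a self-contained sketch of the kind of RecognitionConfig this method now builds with the google-cloud-speech v1 client; the encoding, sample rate and language values are illustrative, and only the unconditional setModel call reflects the commit:

    import com.google.cloud.speech.v1.RecognitionConfig;

    public class RecognitionConfigSketch
    {
        public static RecognitionConfig build(String useModel)
        {
            // The model is now always taken from configuration instead of
            // being switched to "video" behind a boolean flag.
            return RecognitionConfig.newBuilder()
                .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
                .setSampleRateHertz(16000)   // illustrative
                .setLanguageCode("en-US")    // illustrative
                .setModel(useModel)          // e.g. "latest_long"
                .build();
        }
    }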
@@ -287,13 +274,13 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
     }
 
     /**
-     * Create a TranscriptionService which will send audio to the google cloud
+     * Create a TranscriptionService which will send audio to the Google cloud
      * platform to get a transcription
      */
     public GoogleCloudTranscriptionService()
     {
-        useVideoModel = JigasiBundleActivator.getConfigurationService()
-            .getBoolean(P_NAME_USE_VIDEO_MODEL, DEFAULT_VALUE_USE_VIDEO_MODEL);
+        useModel = JigasiBundleActivator.getConfigurationService()
+            .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL);
     }
 
     /**
@@ -435,7 +422,7 @@ public class GoogleCloudStreamingRecognitionSession
         private RequestApiStreamObserverManager requestManager;
 
         /**
-         * A single thread which is used to sent all requests to the API.
+         * A single thread which is used to send all requests to the API.
          * This is needed to reliably sent the first request to the service
          */
        private ExecutorService service = Executors.newSingleThreadExecutor();
@@ -513,7 +500,7 @@ public void addTranscriptionListener(TranscriptionListener listener)
     private static class GoogleCloudCostLogger
     {
         /**
-         * The length of a cost interval of the google cloud speech-to-text API
+         * The length of a cost interval of the Google cloud speech-to-text API
          */
         private final static int INTERVAL_LENGTH_MS = 15000;
 
@@ -563,7 +550,7 @@ private static class GoogleCloudCostLogger
 
         /**
          * Tell the {@link GoogleCloudCostLogger} that a certain length of audio
-         * was send.
+         * was sent.
          *
          * @param ms the length of the audio chunk sent to the API
          */
@@ -657,13 +644,13 @@ private class RequestApiStreamObserverManager
         private boolean stopped = false;
 
         /**
-         * Used to log the cost of every request which is send
+         * Used to log the cost of every request which is sent
         */
         private final GoogleCloudCostLogger costLogger;
 
         /**
         * Create a new RequestApiStreamObserverManager, which will try
-         * to mimic a streaming session of indefinite lenth
+         * to mimic a streaming session of indefinite length
         *
         * @param client the SpeechClient with which to open new sessions
         * @param debugName extra text which will be added to logs
@@ -686,7 +673,7 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
             RecognitionConfig config)
         {
             // Each observer gets its own responseObserver to be able to
-            // to get an unique ID
+            // get a unique ID
             ResponseApiStreamingObserver<StreamingRecognizeResponse>
                 responseObserver =
                     new ResponseApiStreamingObserver<StreamingRecognizeResponse>(
@@ -700,8 +687,6 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
                 StreamingRecognitionConfig.newBuilder()
                     .setConfig(config)
                     .setInterimResults(RETRIEVE_INTERIM_RESULTS)
-                    .setSingleUtterance(!useVideoModel &&
-                        SINGLE_UTTERANCE_ONLY)
                     .build();
 
             // StreamingCallable manages sending the audio and receiving
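With setSingleUtterance gone, a streaming session keeps transcribing across pauses until Jigasi closes it or it times out. A minimal sketch of the resulting setup with the v1 client, assuming RETRIEVE_INTERIM_RESULTS is true as above:

    import com.google.cloud.speech.v1.RecognitionConfig;
    import com.google.cloud.speech.v1.StreamingRecognitionConfig;

    public class StreamingConfigSketch
    {
        // Build a streaming config that returns interim results and, with
        // setSingleUtterance(...) removed, keeps listening across utterances.
        public static StreamingRecognitionConfig forSession(RecognitionConfig config)
        {
            return StreamingRecognitionConfig.newBuilder()
                .setConfig(config)
                .setInterimResults(true) // RETRIEVE_INTERIM_RESULTS
                .build();
        }
    }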
@@ -922,16 +907,12 @@ public void onNext(StreamingRecognizeResponse message)
                 if (logger.isDebugEnabled())
                     logger.debug(
                         debugName + ": received error from StreamingRecognizeResponse: "
-                        + message.getError().getMessage());
+                            + message.getError().getMessage());
                 requestManager.terminateCurrentSession();
                 return;
             }
 
-            // This will happen when SINGLE_UTTERANCE is set to true
-            // and the server has detected the end of the user's speech
-            // utterance.
-            if (isEndOfSingleUtteranceMessage(message) ||
-                message.getResultsCount() == 0)
+            if (message.getResultsCount() == 0)
             {
                 if (logger.isDebugEnabled())
                     logger.debug(
@@ -944,14 +925,14 @@ public void onNext(StreamingRecognizeResponse message)
             List<StreamingRecognitionResult> results = message.getResultsList();
             StreamingRecognitionResult result = results.get(0);
 
-            // If empty, the session has reached it's time limit and
+            // If empty, the session has reached its time limit and
             // nothing new was said, but there should be an error in the message
             // so this is never supposed to happen
             if (result.getAlternativesList().isEmpty())
             {
                 logger.warn(
                     debugName + ": received a list of alternatives which"
-                    + " was empty");
+                        + " was empty");
                 requestManager.terminateCurrentSession();
                 return;
             }
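As background for the empty-alternatives guard above, a short illustrative sketch (not the commit's code) of what a non-empty StreamingRecognitionResult carries in the v1 client:

    import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
    import com.google.cloud.speech.v1.StreamingRecognitionResult;

    public class ResultSketch
    {
        // Print each transcript candidate with its confidence score.
        public static void inspect(StreamingRecognitionResult result)
        {
            for (SpeechRecognitionAlternative alternative : result.getAlternativesList())
            {
                System.out.println(
                    alternative.getConfidence() + ": " + alternative.getTranscript());
            }
        }
    }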
@@ -981,28 +962,9 @@ else if (logger.isDebugEnabled())
             }
         }
 
-        /**
-         * Get whether a {@link StreamingRecognizeResponse} has an
-         * {@link StreamingRecognizeResponse#speechEventType_} of
-         * {@link StreamingRecognizeResponse.SpeechEventType#
-         * END_OF_SINGLE_UTTERANCE}
-         *
-         * @param message the message to check
-         * @return true if the message has the eventType
-         * {@link StreamingRecognizeResponse.SpeechEventType
-         * #END_OF_SINGLE_UTTERANCE}, false otherwise
-         */
-        private boolean isEndOfSingleUtteranceMessage(
-            StreamingRecognizeResponse message)
-        {
-            return message.getSpeechEventType().
-                equals(StreamingRecognizeResponse.SpeechEventType.
-                    END_OF_SINGLE_UTTERANCE);
-        }
-
         /**
          * Handle a single {@link StreamingRecognitionResult} by creating
-         * a {@link TranscriptionResult} based on the result and notifying all
+         * a {@link TranscriptionResult} based on the result and notifying
          * all registered {@link TranscriptionListener}s
          *
          * @param result the result to handle
