feat: latest_long as the default google ts model (#526)
* JIT-12948 set the Google transcription model to latest_long as the default

* JIT-12948 allow the selection of any google model

* JIT-12948 remove single utterance
rpurdel authored Mar 20, 2024
1 parent 6e60482 commit 551ae20
Showing 1 changed file with 25 additions and 63 deletions.
@@ -165,12 +165,6 @@ public class GoogleCloudTranscriptionService
      */
     private final static boolean RETRIEVE_INTERIM_RESULTS = true;
 
-    /**
-     * Whether the Google Cloud API only listens for a single utterance
-     * or continuous to listen once an utterance is over
-     */
-    private final static boolean SINGLE_UTTERANCE_ONLY = true;
-
     /**
      * The amount of ms after which a StreamingRecognize session will be closed
      * when no new audio is given. This is to make sure the session retrieves
@@ -180,22 +174,21 @@ public class GoogleCloudTranscriptionService
     private final static int STREAMING_SESSION_TIMEOUT_MS = 2000;
 
     /**
-     * Property name to determine whether to use the Google Speech API's
-     * video model
+     * Property name to determine which Google Speech API model to use
      */
-    private final static String P_NAME_USE_VIDEO_MODEL
-        = "org.jitsi.jigasi.transcription.USE_VIDEO_MODEL";
+    private final static String GOOGLE_MODEL
+        = "org.jitsi.jigasi.transcription.google_model";
 
     /**
-     * The default value for the property USE_VIDEO_MODEL
+     * The default value for the property GOOGLE_MODEL
      */
-    private final static boolean DEFAULT_VALUE_USE_VIDEO_MODEL = false;
+    private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long";
 
     /**
      * Check whether the given string contains a supported language tag
      *
      * @param tag the language tag
-     * @throws UnsupportedOperationException when the google cloud API does not
+     * @throws UnsupportedOperationException when the Google cloud API does not
      * support the given language
      */
     private static void validateLanguageTag(String tag)
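Note: the new org.jitsi.jigasi.transcription.google_model property accepts any model name the Speech-to-Text API offers (for example latest_long, latest_short, phone_call or video) rather than the old boolean video-model toggle. A minimal sketch of how a deployment might pick a different model, assuming the usual Jigasi properties file:

    # hypothetical sip-communicator.properties entry
    org.jitsi.jigasi.transcription.google_model=latest_short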
@@ -229,10 +222,9 @@ public boolean supportsLanguageRouting()
     private List<SpeechContext> speechContexts = null;
 
     /**
-     * Whether to use the more expensive video model when making
-     * requests.
+     * The model used for STT
      */
-    private boolean useVideoModel;
+    private final String useModel;
 
     /**
      * Creates the RecognitionConfig the Google service uses based
@@ -262,15 +254,10 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
                 "encoding");
         }
 
-        // set the model to use. It will default to a cheaper model with
-        // lower performance when not set.
-        if (useVideoModel)
+        builder.setModel(useModel);
+        if (logger.isDebugEnabled())
         {
-            if (logger.isDebugEnabled())
-            {
-                logger.debug("Using the more expensive video model");
-            }
-            builder.setModel("video");
+            logger.debug("Using model " + useModel);
         }
 
         // set the Language tag
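For context, a self-contained sketch of the kind of RecognitionConfig this method now builds with the google-cloud-speech v1 client; the encoding, sample rate and language values are illustrative, and only the unconditional setModel call reflects the commit:

    import com.google.cloud.speech.v1.RecognitionConfig;

    public class RecognitionConfigSketch
    {
        public static RecognitionConfig build(String useModel)
        {
            // The model is now always taken from configuration instead of
            // being switched to "video" behind a boolean flag.
            return RecognitionConfig.newBuilder()
                .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
                .setSampleRateHertz(16000)   // illustrative
                .setLanguageCode("en-US")    // illustrative
                .setModel(useModel)          // e.g. "latest_long"
                .build();
        }
    }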
@@ -287,13 +274,13 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
     }
 
     /**
-     * Create a TranscriptionService which will send audio to the google cloud
+     * Create a TranscriptionService which will send audio to the Google cloud
      * platform to get a transcription
      */
     public GoogleCloudTranscriptionService()
     {
-        useVideoModel = JigasiBundleActivator.getConfigurationService()
-            .getBoolean(P_NAME_USE_VIDEO_MODEL, DEFAULT_VALUE_USE_VIDEO_MODEL);
+        useModel = JigasiBundleActivator.getConfigurationService()
+            .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL);
     }
 
     /**
@@ -435,7 +422,7 @@ public class GoogleCloudStreamingRecognitionSession
         private RequestApiStreamObserverManager requestManager;
 
         /**
-         * A single thread which is used to sent all requests to the API.
+         * A single thread which is used to send all requests to the API.
          * This is needed to reliably sent the first request to the service
          */
        private ExecutorService service = Executors.newSingleThreadExecutor();
@@ -513,7 +500,7 @@ public void addTranscriptionListener(TranscriptionListener listener)
     private static class GoogleCloudCostLogger
     {
         /**
-         * The length of a cost interval of the google cloud speech-to-text API
+         * The length of a cost interval of the Google cloud speech-to-text API
          */
         private final static int INTERVAL_LENGTH_MS = 15000;
 
@@ -563,7 +550,7 @@ private static class GoogleCloudCostLogger
 
         /**
          * Tell the {@link GoogleCloudCostLogger} that a certain length of audio
-         * was send.
+         * was sent.
          *
          * @param ms the length of the audio chunk sent to the API
          */
@@ -657,13 +644,13 @@ private class RequestApiStreamObserverManager
         private boolean stopped = false;
 
         /**
-         * Used to log the cost of every request which is send
+         * Used to log the cost of every request which is sent
         */
         private final GoogleCloudCostLogger costLogger;
 
         /**
         * Create a new RequestApiStreamObserverManager, which will try
-         * to mimic a streaming session of indefinite lenth
+         * to mimic a streaming session of indefinite length
         *
         * @param client the SpeechClient with which to open new sessions
         * @param debugName extra text which will be added to logs
@@ -686,7 +673,7 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
             RecognitionConfig config)
         {
             // Each observer gets its own responseObserver to be able to
-            // to get an unique ID
+            // get a unique ID
             ResponseApiStreamingObserver<StreamingRecognizeResponse>
                 responseObserver =
                     new ResponseApiStreamingObserver<StreamingRecognizeResponse>(
@@ -700,8 +687,6 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
                 StreamingRecognitionConfig.newBuilder()
                     .setConfig(config)
                     .setInterimResults(RETRIEVE_INTERIM_RESULTS)
-                    .setSingleUtterance(!useVideoModel &&
-                        SINGLE_UTTERANCE_ONLY)
                     .build();
 
             // StreamingCallable manages sending the audio and receiving
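With setSingleUtterance gone, a streaming session keeps transcribing across pauses until Jigasi closes it or it times out. A minimal sketch of the resulting setup with the v1 client, assuming RETRIEVE_INTERIM_RESULTS is true as above:

    import com.google.cloud.speech.v1.RecognitionConfig;
    import com.google.cloud.speech.v1.StreamingRecognitionConfig;

    public class StreamingConfigSketch
    {
        // Build a streaming config that returns interim results and, with
        // setSingleUtterance(...) removed, keeps listening across utterances.
        public static StreamingRecognitionConfig forSession(RecognitionConfig config)
        {
            return StreamingRecognitionConfig.newBuilder()
                .setConfig(config)
                .setInterimResults(true) // RETRIEVE_INTERIM_RESULTS
                .build();
        }
    }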
@@ -922,16 +907,12 @@ public void onNext(StreamingRecognizeResponse message)
                 if (logger.isDebugEnabled())
                     logger.debug(
                         debugName + ": received error from StreamingRecognizeResponse: "
-                        + message.getError().getMessage());
+                            + message.getError().getMessage());
                 requestManager.terminateCurrentSession();
                 return;
             }
 
-            // This will happen when SINGLE_UTTERANCE is set to true
-            // and the server has detected the end of the user's speech
-            // utterance.
-            if (isEndOfSingleUtteranceMessage(message) ||
-                message.getResultsCount() == 0)
+            if (message.getResultsCount() == 0)
             {
                 if (logger.isDebugEnabled())
                     logger.debug(
@@ -944,14 +925,14 @@ public void onNext(StreamingRecognizeResponse message)
             List<StreamingRecognitionResult> results = message.getResultsList();
             StreamingRecognitionResult result = results.get(0);
 
-            // If empty, the session has reached it's time limit and
+            // If empty, the session has reached its time limit and
             // nothing new was said, but there should be an error in the message
             // so this is never supposed to happen
             if (result.getAlternativesList().isEmpty())
             {
                 logger.warn(
                     debugName + ": received a list of alternatives which"
-                    + " was empty");
+                        + " was empty");
                 requestManager.terminateCurrentSession();
                 return;
             }
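As background for the empty-alternatives guard above, a short illustrative sketch (not the commit's code) of what a non-empty StreamingRecognitionResult carries in the v1 client:

    import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
    import com.google.cloud.speech.v1.StreamingRecognitionResult;

    public class ResultSketch
    {
        // Print each transcript candidate with its confidence score.
        public static void inspect(StreamingRecognitionResult result)
        {
            for (SpeechRecognitionAlternative alternative : result.getAlternativesList())
            {
                System.out.println(
                    alternative.getConfidence() + ": " + alternative.getTranscript());
            }
        }
    }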
@@ -981,28 +962,9 @@ else if (logger.isDebugEnabled())
             }
         }
 
-        /**
-         * Get whether a {@link StreamingRecognizeResponse} has an
-         * {@link StreamingRecognizeResponse#speechEventType_} of
-         * {@link StreamingRecognizeResponse.SpeechEventType#
-         * END_OF_SINGLE_UTTERANCE}
-         *
-         * @param message the message to check
-         * @return true if the message has the eventType
-         * {@link StreamingRecognizeResponse.SpeechEventType
-         * #END_OF_SINGLE_UTTERANCE}, false otherwise
-         */
-        private boolean isEndOfSingleUtteranceMessage(
-            StreamingRecognizeResponse message)
-        {
-            return message.getSpeechEventType().
-                equals(StreamingRecognizeResponse.SpeechEventType.
-                    END_OF_SINGLE_UTTERANCE);
-        }
-
         /**
          * Handle a single {@link StreamingRecognitionResult} by creating
-         * a {@link TranscriptionResult} based on the result and notifying all
+         * a {@link TranscriptionResult} based on the result and notifying
          * all registered {@link TranscriptionListener}s
          *
          * @param result the result to handle
