I am developing a .NET 9 MAUI mobile app for Android. This app includes a page where I load a HybridWebView in my XAML page:
<StackLayout Grid.Row="0" Orientation="Horizontal" VerticalOptions="Center" Margin="0,10,0,10"> <Grid RowDefinitions="Auto,*"
ColumnDefinitions="*"
MaximumWidthRequest="400"
MaximumHeightRequest="1000">
<HybridWebView x:Name="hybridWebView" RawMessageReceived="OnHybridWebViewRawMessageReceived"
Grid.Row="1" />
</Grid> </StackLayout>
This HybridWebView will later show an Azure text to speech avatar video (a standard Azure Speech avatar). I call the JavaScript I need to generate the avatar through this call:
var result = await hybridWebView.InvokeJavaScriptAsync(
    "speak",
    HybridJsContext.Default.DictionaryStringString,
    [com.Message, Controllers.CurrentLanguage],
    [HybridJsContext.Default.String, HybridJsContext.Default.String]);
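For reference, the HybridJsContext used above is a source-generated System.Text.Json serializer context, as InvokeJavaScriptAsync expects in .NET 9. A minimal sketch (the actual class in my project may register more types) would be:

using System.Collections.Generic;
using System.Text.Json.Serialization;

// Minimal sketch of the source-generated JSON context assumed by the call above;
// the generator produces the Default.DictionaryStringString and Default.String properties.
[JsonSourceGenerationOptions(PropertyNamingPolicy = JsonKnownNamingPolicy.CamelCase)]
[JsonSerializable(typeof(Dictionary<string, string>))]
[JsonSerializable(typeof(string))]
internal partial class HybridJsContext : JsonSerializerContext
{
}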
Now, in my index.html page, I use this HTML:
<div class="avatar-container">
<div id="videoContainer">
<div id="overlayArea" style="position: absolute; top: 0; left: 0;" hidden="hidden">
<p id="overlayText" style="font-size: large;"></p>
<img id="overlayImage" src="your-image-source.png" alt="Overlay Image">
</div>
<div id="remoteVideo"></div>
<canvas id="canvas" width="1920" height="1080" style="background-color: transparent; background-image: url('AvatarBackground.jpg'); background-size: cover; background-position: center;" hidden="hidden"></canvas>
<canvas id="tmpCanvas" width="1920" height="1080" hidden="hidden"></canvas>
</div>
</div>
And this script:
if (typeof SpeechSDK === "undefined") {
    console.log("INFO FROM INDEX.HTML: ERROR-IN-SCRIPT: Speech SDK is not loaded.", "color: red; font-size: 16px;");
} else {
    window.speechConfig = SpeechSDK.SpeechConfig.fromSubscription("[SUBSCRIPTIONKEY]", "[REGION]");
    window.avatarConfig = new SpeechSDK.AvatarConfig("lisa", "casual-sitting");
    window.peerConnection = new RTCPeerConnection({
        iceServers: [{
            urls: ["[TURN-URL-AND-PORT]"],
            username: "[USERNAME]",
            credential: "[CREDENTIALS]"
        }]
    });

    // Log connection state changes
    window.peerConnection.oniceconnectionstatechange = function () {
        console.log("INFO FROM INDEX.HTML: ICE Connection State: " + window.peerConnection.iceConnectionState, "color: orange; font-size: 14px;");
        if (window.peerConnection.iceConnectionState === 'connected') {
            console.log("INFO FROM INDEX.HTML: TURN server connection established.", "color: green; font-size: 14px;");
        } else if (window.peerConnection.iceConnectionState === 'failed') {
            console.error("ERROR FROM INDEX.HTML: TURN server connection failed.", "color: red; font-size: 14px;");
        }
    };
    window.peerConnection.ontrack = function (event) {
        console.log("INFO FROM INDEX.HTML: Track received: " + event.track.kind, "color: blue; font-size: 14px;");
        if (event.track.kind === 'video') {
            const videoElement = document.createElement('video');
            videoElement.srcObject = event.streams[0];
            videoElement.autoplay = true;
            videoElement.muted = true; // Muted so that autoplay is allowed
            videoElement.playsInline = true; // For iOS
            videoElement.style.width = '100%';
            window.remoteVideoDiv = document.getElementById('remoteVideo');
            remoteVideoDiv.innerHTML = '';
            remoteVideoDiv.appendChild(videoElement);
            console.info("INFO FROM INDEX.HTML: Video element appended.", "color: green; font-size: 12px;");
        }
        if (event.track.kind === 'audio') {
            window.audioElement = document.createElement('audio');
            audioElement.srcObject = event.streams[0];
            audioElement.autoplay = true;
            audioElement.muted = true;
            document.body.appendChild(audioElement);
            console.info("INFO FROM INDEX.HTML: Audio element appended.", "color: green; font-size: 12px;");
        }
    };
    window.peerConnection.addTransceiver('video', { direction: 'sendrecv' });
    window.peerConnection.addTransceiver('audio', { direction: 'sendrecv' });

    window.avatarSynthesizer = new SpeechSDK.AvatarSynthesizer(window.speechConfig, window.avatarConfig);
    window.avatarSynthesizer.startAvatarAsync(window.peerConnection).then(
        () => {
            console.info("INFO FROM INDEX.HTML: Avatar started successfully.", "color: green; font-size: 14px;");
            window.HybridWebView.InvokeDotNet('FinishInitializeAvatar');
            // window.speak("Hello, this is a test message from your avatar.", "en");
        }
    ).catch(
        (error) => {
            console.error("ERROR FROM INDEX.HTML: ERROR-IN-SCRIPT: Avatar failed to start. Error: " + error, "color: red; font-size: 16px;");
        }
    );
}
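On the .NET side, the window.HybridWebView.InvokeDotNet('FinishInitializeAvatar') call above is handled by a method on the object registered with SetInvokeJavaScriptTarget. A minimal sketch of that code-behind (the page name and method body are assumptions, not my actual code) looks like this:

// Sketch of the code-behind that receives InvokeDotNet('FinishInitializeAvatar');
// the class name AvatarPage is hypothetical.
public partial class AvatarPage : ContentPage
{
    public AvatarPage()
    {
        InitializeComponent();

        // Expose this page's public methods to JavaScript running inside the HybridWebView.
        hybridWebView.SetInvokeJavaScriptTarget(this);
    }

    // Called from index.html once startAvatarAsync has completed,
    // i.e. it is now safe to call InvokeJavaScriptAsync("speak", ...).
    public void FinishInitializeAvatar()
    {
        // e.g. enable the UI that triggers the "speak" call
    }
}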
And finally, this is the code I call to execute the speech:
window.speak = (textToSpeech, detectedLanguage) => {
    const audioElement = document.getElementById('audio');
    if (audioElement) {
        audioElement.muted = false;
    }
    const videoElement = document.getElementById('video');
    if (videoElement) {
        videoElement.muted = false;
    }
    window.audioElement.muted = false;
    window.remoteVideoDiv.muted = false;
    let myMap = new Map([["de", "de-DE"], ["en", "en-GB"], ["fr", "fr-FR"], ["es", "es-ES"], ["pl", "pl-PL"], ["pt", "pt-PT"], ["tr", "tr-TR"], ["nl", "nl-NL"]]);
    let language = myMap.get(detectedLanguage);
    let spokenText = textToSpeech;
    let spokenSsml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang="en-US">
        <voice name='en-US-JennyMultilingualNeural'>
            <lang xml:lang='${language}'>
                ${htmlEncode(spokenText)}
            </lang>
        </voice>
    </speak>`;
    return window.avatarSynthesizer.speakSsmlAsync(spokenSsml).then(
        (result) => {
            if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
                console.log("INFO FROM INDEX.HTML: Speech synthesized successfully with text " + spokenText, "color: green; font-size: 12px;");
                return "success";
            }
            return "failed";
        })
        .catch((error) => {
            console.log("ERROR FROM INDEX.HTML: ERROR-IN-SCRIPT: Failed to synthesize speech. Error: " + error, "color: red; font-size: 16px;");
            //let jsonObject = JSON.stringify("failed");
            //return jsonObject;
            return "failed";
        });
};
This is all working fine. I can see the avatar, and I can see that the avatar is speaking (lips are moving, etc.), BUT I hear NO AUDIO! I have tested Android permissions, which look like this:
<uses-permission android:name="android.permission.ACCESS_FINE_LOCATION" />
<uses-permission android:name="android.permission.ACCESS_COARSE_LOCATION" />
<uses-permission android:name="android.permission.ACCESS_NETWORK_STATE" />
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
<uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
<uses-permission android:name="android.permission.CAMERA" />
<uses-permission android:name="android.permission.RECORD_AUDIO" />
<uses-permission android:name="android.permission.READ_DEVICE_CONFIG" />
<uses-permission android:name="android.permission.MODIFY_AUDIO_SETTINGS" />
<uses-permission android:name="android.permission.BLUETOOTH_CONNECT" />
I have changed headsets, changed devices, and tried the Android Emulator as well as real devices (Pixel 8). I tried different ways to unmute the elements, like this:
const audioElement = document.getElementById('audio');
if (audioElement) {
    audioElement.muted = false;
}
const videoElement = document.getElementById('video');
if (videoElement) {
    videoElement.muted = false;
}
window.audioElement.muted = false;
window.remoteVideoDiv.muted = false;
After a long time of trial and error and research, the solution for this is quite simple. The code is 100% correct, but (at least on Android) the audio is blocked. It is blocked because there is no user interaction on the HybridWebView itself, only on the MAUI UI. If I add a button to the HybridWebView page looking like this:
<button id="enableAudioButton">Enable Audio</button>
and add this code to my JavaScript:
document.getElementById('enableAudioButton').addEventListener('click', () => {
    const audioElement = document.getElementById('audio');
    if (audioElement) {
        audioElement.muted = false;
        audioElement.play().then(() => {
            console.log("enableAudioButton Audio is playing.");
        }).catch((error) => {
            console.error("enableAudioButton Error attempting to play audio:", error);
        });
    }
});
and press the "Enable Audio" button, the audio works. What I am trying to do now is to automate the click on the button, which is the next challenge.
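One direction I am exploring (not verified yet) is to trigger that click from the MAUI side after a native user interaction, for example via EvaluateJavaScriptAsync. The handler name and wiring below are only an assumption, and a purely programmatic click() may not count as a user gesture for the WebView's autoplay policy:

// Hypothetical sketch: after a tap on the native MAUI UI, forward a "click"
// to the Enable Audio button inside the HybridWebView.
// Caveat: the WebView may not treat a programmatic click() as a user gesture,
// so this is an experiment, not a confirmed fix.
private async void OnNativeUiTapped(object sender, TappedEventArgs e)
{
    await hybridWebView.EvaluateJavaScriptAsync(
        "document.getElementById('enableAudioButton').click();");
}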