Hi,
I am trying to use the Azure AI Speech service, specifically the pronunciation assessment feature of its speech-to-text functionality. Below is the code I used:
def speech_recognize_with_pronuncation(filePath, text_script):
    """Run pronunciation assessment on an audio file and return the raw JSON.

    Args:
        filePath: Path of the audio file handed to ``AudioConfig``.
        text_script: Reference text the speech is scored against. A falsy
            value (``None`` or ``""``) is sent as an empty reference text.

    Returns:
        The value of the ``SpeechServiceResponse_JsonResult`` property of the
        result returned by ``recognize_once()``.
    """
    # Function-scope import so this snippet stays self-contained.
    import json

    reference_text = text_script or ""

    speech_config = speechsdk.SpeechConfig(
        subscription=config("SPEECH_KEY"),
        region=config("SERVICE_REGION"),
    )
    speech_config.speech_recognition_language = "en-GB"
    audio_config = speechsdk.audio.AudioConfig(filename=filePath)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    # Serialize with json.dumps rather than str.format: a reference text that
    # contains quotes, backslashes or newlines would otherwise produce invalid
    # JSON (or silently change the meaning of the configuration).
    assessment_params = json.dumps(
        {
            "referenceText": reference_text,
            "gradingSystem": "HundredMark",
            "granularity": "Phoneme",
            "phonemeAlphabet": "IPA",
        },
        ensure_ascii=False,
    )
    # NOTE(review): the empty "Phoneme"/"Syllable" names in the response may be
    # tied to the "en-GB" locale combined with the IPA alphabet -- confirm
    # which locales return phoneme names in the service documentation.
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        json_string=assessment_params
    )
    pronunciation_config.enable_prosody_assessment()
    pronunciation_config.enable_content_assessment_with_topic("greeting")
    pronunciation_config.apply_to(speech_recognizer)

    speech_recognition_result = speech_recognizer.recognize_once()
    return speech_recognition_result.properties.get(
        speechsdk.PropertyId.SpeechServiceResponse_JsonResult
    )
The problem I am facing is that the phonemes returned by the analysis are empty strings, even though accuracy scores are present for each of them. Below is the response, showing empty phonemes that nevertheless have score values:
{
"status_code": 200,
"message": "Successful",
"detail": {
"Id": "d954ca5f17f54eb9a36978c67211b4fc",
"RecognitionStatus": "Success",
"Offset": 800000,
"Duration": 4000000,
"Channel": 0,
"DisplayText": "Heart.",
"SNR": 46.816277,
"NBest": [
{
"Confidence": 0.91353595,
"Lexical": "heart",
"ITN": "heart",
"MaskedITN": "heart",
"Display": "Heart.",
"PronunciationAssessment": {
"AccuracyScore": 42.0,
"FluencyScore": 100.0,
"ProsodyScore": 0.0,
"CompletenessScore": 100.0,
"PronScore": 28.4
},
"Words": [
{
"Word": "heart",
"Offset": 800000,
"Duration": 4000000,
"PronunciationAssessment": {
"AccuracyScore": 42.0,
"ErrorType": "Mispronunciation",
"Feedback": {
"Prosody": {
"Break": {
"ErrorTypes": [
"None"
],
"BreakLength": 0
},
"Intonation": {
"ErrorTypes": [],
"Monotone": {
"SyllablePitchDeltaConfidence": 0.50218755
}
}
}
}
},
"Syllables": [
{
"Syllable": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 800000,
"Duration": 4000000
}
],
"Phonemes": [
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 800000,
"Duration": 1300000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 2200000,
"Duration": 500000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 2800000,
"Duration": 700000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 3600000,
"Duration": 1200000,
"blend_shape": []
}
]
}
],
"Incorrect_spoken_words": [
{
"Word": "heart",
"Offset": 800000,
"Duration": 4000000,
"PronunciationAssessment": {
"AccuracyScore": 42.0,
"ErrorType": "Mispronunciation",
"Feedback": {
"Prosody": {
"Break": {
"ErrorTypes": [
"None"
],
"BreakLength": 0
},
"Intonation": {
"ErrorTypes": [],
"Monotone": {
"SyllablePitchDeltaConfidence": 0.50218755
}
}
}
}
},
"Syllables": [
{
"Syllable": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 800000,
"Duration": 4000000
}
],
"Phonemes": [
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 800000,
"Duration": 1300000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 2200000,
"Duration": 500000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 2800000,
"Duration": 700000,
"blend_shape": []
},
{
"Phoneme": "",
"PronunciationAssessment": {
"AccuracyScore": 42.0
},
"Offset": 3600000,
"Duration": 1200000,
"blend_shape": []
}
]
}
]
}
]
}
}
Please address this issue as soon as possible, since this feature is crucial to my project and we will be using it heavily.
Thank you
Umair Habib