The Azure Cognitive Services Speech is repeating the audio twice

Question

While I am using Azure Cognitive Services Speech, the synthesized audio is played twice.

Answer

Hi @Vijay Sheth,

Thank you for reaching out to Microsoft Q&A forum!

I understand that you are experiencing an issue with the Azure Speech service. Before we proceed, may I confirm if you are using the Azure Text-to-Speech service to generate audio with the given text? If not, could you please provide more details about the resource and region you are working with?

Regarding the issue you mentioned, I tried using Azure TTS to reproduce it on my end, but it seems to be working fine. It's possible that this is an intermittent issue, so I suggest trying again after some time. If the issue persists, please let me know and we can work together to find a solution.

I hope you understand. Do let us know if you have any further queries.

Do not forget to click "Accept Answer" and "Yes" for "Was this answer helpful?".

Answer

Here is the code which I am using:

import streamlit as st
import speech_recognition as sr
import openai
import pandas as pd
import docx
import PyPDF2
import azure.cognitiveservices.speech as speechsdk
import tempfile
import os
from pydub import AudioSegment
from pydub.playback import play

# OpenAI credentials (fill in before running)
openai.api_key = ''


# Seed every session-state slot the app relies on, leaving any value that is
# already present untouched.
for _state_key, _initial_value in (
    ("stop_execution", False),
    ("tmp_file_path", None),
    ("playing", False),
    ("audio_thread", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# Azure Speech resource key and region
AZURE_SPEECH_KEY = ''
AZURE_SERVICE_REGION = 'australiaeast'

# Function to capture voice input
def capture_voice():
    """Listen on the microphone and transcribe what was heard.

    Progress and error messages are written to the Streamlit page.

    Returns:
        The recognized text, or None when the audio could not be understood
        or the recognition request failed.
    """
    speech_recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        st.write("Listening...")
        recording = speech_recognizer.listen(mic)

    try:
        st.write("Recognizing...")
        transcript = speech_recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        st.write("Sorry, I could not understand the audio.")
        return None
    except sr.RequestError:
        st.write("Could not request results; check your network connection.")
        return None

    st.write(f"You said: {transcript}")
    return transcript

# Function to query OpenAI's GPT-3
def query_chat_model(prompt):
    """Send *prompt* to the gpt-3.5-turbo chat model and return its reply.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The stripped content of the first choice. On a rate-limit error or
        any other exception, a message is written to the page and a fixed
        fallback string is returned instead.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            max_tokens=150,
        )
        first_choice = completion.choices[0]
        return first_choice.message['content'].strip()
    except openai.error.RateLimitError:
        st.write("Rate limit exceeded. Please wait and try again later.")
        return "Rate limit exceeded. Please try again later."
    except Exception as exc:
        st.write(f"Error querying the OpenAI API: {exc}")
        return "An error occurred while querying the OpenAI API."

# Function to read content from an Excel file
def read_excel_file(file):
    """Load an Excel workbook into a DataFrame using the openpyxl engine.

    Args:
        file: Path or file-like object handed to ``pandas.read_excel``.

    Returns:
        The loaded DataFrame, or None if reading failed (the error is
        written to the page).
    """
    try:
        return pd.read_excel(file, engine='openpyxl')
    except Exception as exc:
        st.write(f"Error reading the Excel file: {exc}")
        return None

# Function to read content from a PDF file
def read_pdf_file(file):
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file: Source passed to ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined without a separator, or None if reading
        failed (the error is written to the page).
    """
    try:
        pdf = PyPDF2.PdfReader(file)
        return "".join(page.extract_text() for page in pdf.pages)
    except Exception as exc:
        st.write(f"Error reading the PDF file: {exc}")
        return None

# Function to read content from a Word file
def read_word_file(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: Source passed to ``docx.Document``.

    Returns:
        Each paragraph's text followed by a newline, concatenated in
        document order, or None if reading failed (the error is written
        to the page).
    """
    try:
        document = docx.Document(file)
        # The separator must be the escape sequence "\n"; a literal line
        # break inside the quotes is an unterminated string.
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as e:
        st.write(f"Error reading the Word file: {e}")
        return None

# Function to read content from a TXT file
def read_txt_file(file):
    """Read a binary file-like object and decode its contents as UTF-8.

    Args:
        file: Object whose ``read()`` returns bytes.

    Returns:
        The decoded text, or None if reading or decoding failed (the error
        is written to the page).
    """
    try:
        raw_bytes = file.read()
        return raw_bytes.decode("utf-8")
    except Exception as exc:
        st.write(f"Error reading the TXT file: {exc}")
        return None

# Function to generate a response using GPT-3 and the provided content
def generate_response(query, content):
    """Ask the chat model to answer *query* using *content* as its data.

    Args:
        query: The question to answer.
        content: Text embedded in the prompt as the data to draw on.

    Returns:
        Whatever ``query_chat_model`` returns for the combined prompt.
    """
    return query_chat_model(
        f"Using the following data: {content}, answer the question: {query}"
    )

# Function to convert text to speech using Azure Cognitive Services and play it
def text_to_speech(text, lang='en'):
    """Synthesize *text* with Azure Speech and play it exactly once.

    The synthesizer is created with ``audio_config=None`` so that it only
    hands back the audio bytes; playback is done a single time through
    ``play_audio``. Creating it with the default-speaker output as well made
    every response audible twice: once from the SDK during synthesis and
    once more from ``play_audio``.

    Args:
        text: Text to speak. When empty or None a message is shown and
            nothing is synthesized.
        lang: Kept for backward compatibility; currently not used.

    Returns:
        Path of the temporary WAV file holding the synthesized audio, or
        None if no audio was written. The WAV file is left on disk for the
        caller; the intermediate MP3 is removed by ``play_audio``.
    """
    if not text:
        st.write("No text to speak.")
        return None

    tmp_file_path = None
    try:
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
        # No audio output device: keep the result in memory only.
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
        result = synthesizer.speak_text_async(text).get()

        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            st.write(f"Speech synthesis failed: {result.reason}")
            return None

        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_file.write(result.audio_data)
            tmp_file_path = tmp_file.name

        # Convert the WAV file to MP3 using pydub. Only the extension is
        # swapped, so a ".wav" elsewhere in the path is left alone.
        audio = AudioSegment.from_wav(tmp_file_path)
        mp3_file_path = os.path.splitext(tmp_file_path)[0] + '.mp3'
        audio.export(mp3_file_path, format='mp3')

        # Single playback of the synthesized audio.
        play_audio(mp3_file_path)
    except Exception as e:
        st.write(f"Error in text-to-speech conversion: {e}")

    # Returned outside of a ``finally`` so that KeyboardInterrupt/SystemExit
    # are no longer silently swallowed.
    return tmp_file_path

# Function to play audio file
def play_audio(file_path):
    """Play an MP3 file with pydub, then delete it.

    Does nothing when *file_path* is empty or does not exist. While the
    file is being loaded and played, ``st.session_state.playing`` is True.
    The flag is reset and the file removed even if loading or playback
    raises, so a failure cannot leave a stale flag or a leaked temp file.

    Args:
        file_path: Path of the MP3 file to play.
    """
    if not file_path or not os.path.exists(file_path):
        return

    st.session_state.playing = True
    try:
        play(AudioSegment.from_mp3(file_path))
    finally:
        st.session_state.playing = False
        os.remove(file_path)

# Function to stop the playback
def stop_playback():
    """Clear the session's ``playing`` flag and report it on the page.

    Does nothing when the flag is not set. Only the flag is changed here.
    """
    if not st.session_state.playing:
        return
    st.session_state.playing = False
    st.write("Playback stopped")

# Main function to integrate all functionalities
def main():
    """Render the page: file upload, then "Speak" and "Stop" buttons.

    An uploaded xlsx/pdf/docx/txt file is read into text. Pressing "Speak"
    captures a spoken question, answers it from the file content via the
    chat model, shows the answer and speaks it. Pressing "Stop" calls
    ``stop_playback``.
    """
    st.title("Voice Controlled GPT-3 with File Data")

    uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])
    file_content = None

    if uploaded_file is not None:
        # Lower-case the extension so "REPORT.PDF" is handled like
        # "report.pdf" instead of being silently ignored.
        file_type = uploaded_file.name.split('.')[-1].lower()
        if file_type == "xlsx":
            df = read_excel_file(uploaded_file)
            if df is not None:
                st.write("Excel Data:")
                st.dataframe(df)
                file_content = df.to_string(index=False)
        elif file_type == "pdf":
            file_content = read_pdf_file(uploaded_file)
        elif file_type == "docx":
            file_content = read_word_file(uploaded_file)
        elif file_type == "txt":
            file_content = read_txt_file(uploaded_file)

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Speak", key="speak_button"):
            st.session_state.stop_execution = False
            voice_input = capture_voice()
            if st.session_state.stop_execution:
                return
            if voice_input and file_content:
                response_text = generate_response(voice_input, file_content)
                st.write(f"Response: {response_text}")
                text_to_speech(response_text)
            elif voice_input:
                st.write("Please upload a file first.")
            else:
                st.write("Failed to capture voice input.")

    with col2:
        if st.button("Stop", key="stop_button"):
            stop_playback()


if __name__ == "__main__":
    main()

Answer

# File: app.py
import streamlit as st
import speech_recognition as sr
import openai
import pandas as pd
import docx
import PyPDF2
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, AudioDataStream, ResultReason
import tempfile
import os
import base64

# Set your OpenAI API key and Azure Speech credentials before running.
# Each value must be a single-line string literal; a line break inside the
# quotes is a syntax error.
openai.api_key = ""
azure_speech_key = ""
azure_service_region = ""

# Function to capture voice input
def capture_voice():
    """Listen on the microphone and return the transcript in lower case.

    Progress is written to the Streamlit page; failures are shown as errors.

    Returns:
        The recognized text converted to lower case, or None when the audio
        could not be understood or the recognition request failed.
    """
    speech_recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        st.write("Listening...")
        recording = speech_recognizer.listen(mic)

    try:
        st.write("Recognizing...")
        transcript = speech_recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        st.error("Sorry, I could not understand the audio.")
        return None
    except sr.RequestError:
        st.error("Could not request results; check your network connection.")
        return None

    st.write(f"You said: {transcript}")
    # Lower case makes later comparisons easier.
    return transcript.lower()

# Function to query OpenAI's GPT-3
def query_chat_model(prompt):
    """Send *prompt* to the gpt-3.5-turbo chat model and return its reply.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The stripped content of the first choice. On a rate-limit error or
        any other exception, an error is shown on the page and a fixed
        fallback string is returned instead.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=150,
        )
        reply = completion.choices[0].message['content']
        return reply.strip()
    except openai.error.RateLimitError:
        st.error("Rate limit exceeded. Please wait and try again later.")
        return "Rate limit exceeded. Please try again later."
    except Exception as exc:
        st.error(f"Error querying the OpenAI API: {exc}")
        return "An error occurred while querying the OpenAI API."

# Function to read content from an Excel file
def read_excel_file(file):
    """Load an Excel workbook into a DataFrame using the openpyxl engine.

    Args:
        file: Path or file-like object handed to ``pandas.read_excel``.

    Returns:
        The loaded DataFrame, or None if reading failed (an error is shown
        on the page).
    """
    try:
        return pd.read_excel(file, engine='openpyxl')
    except Exception as exc:
        st.error(f"Error reading the Excel file: {exc}")
        return None

# Function to read content from a PDF file
def read_pdf_file(file):
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file: Source passed to ``PyPDF2.PdfReader``.

    Returns:
        The page texts joined without a separator, or None if reading
        failed (an error is shown on the page).
    """
    try:
        pdf = PyPDF2.PdfReader(file)
        return "".join(page.extract_text() for page in pdf.pages)
    except Exception as exc:
        st.error(f"Error reading the PDF file: {exc}")
        return None

# Function to read content from a Word file
def read_word_file(file):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file: Source passed to ``docx.Document``.

    Returns:
        Each paragraph's text followed by a newline, concatenated in
        document order, or None if reading failed (an error is shown on
        the page).
    """
    try:
        document = docx.Document(file)
        # The separator must be the escape sequence "\n"; a literal line
        # break inside the quotes is an unterminated string.
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as e:
        st.error(f"Error reading the Word file: {e}")
        return None

# Function to read content from a TXT file
def read_txt_file(file):
    """Read a binary file-like object and decode its contents as UTF-8.

    Args:
        file: Object whose ``read()`` returns bytes.

    Returns:
        The decoded text, or None if reading or decoding failed (an error
        is shown on the page).
    """
    try:
        raw_bytes = file.read()
        return raw_bytes.decode("utf-8")
    except Exception as exc:
        st.error(f"Error reading the TXT file: {exc}")
        return None

# Function to generate a response using GPT-3 and the provided content
def generate_response(query, content):
    """Ask the chat model to answer *query* using *content* as its data.

    Args:
        query: The question to answer.
        content: Text embedded in the prompt as the data to draw on.

    Returns:
        Whatever ``query_chat_model`` returns for the combined prompt.
    """
    return query_chat_model(
        f"Using the following data: {content}, answer the question: {query}"
    )

# Function to convert text to speech and get audio file path
def text_to_speech(text):
    """Synthesize *text* with Azure Speech and save it as a WAV file.

    No audio output config is given to the synthesizer; the audio is taken
    from the synthesis result and written to ``output.wav`` in the system
    temp directory.

    Args:
        text: Text to speak.

    Returns:
        Path of the written WAV file, or None when *text* is empty,
        synthesis did not complete, or an exception occurred (an error is
        shown on the page in each case).
    """
    if not text:
        st.error("No text to speak.")
        return None

    try:
        config = SpeechConfig(subscription=azure_speech_key, region=azure_service_region)
        config.speech_synthesis_voice_name = "en-US-JennyNeural"
        tts = SpeechSynthesizer(speech_config=config, audio_config=None)

        outcome = tts.speak_text_async(text).get()
        if outcome.reason != ResultReason.SynthesizingAudioCompleted:
            st.error("Error during text-to-speech conversion")
            return None

        stream = AudioDataStream(outcome)
        wav_path = os.path.join(tempfile.gettempdir(), "output.wav")
        stream.save_to_wav_file(wav_path)
        return wav_path
    except Exception as exc:
        st.error(f"Error in text-to-speech conversion: {exc}")
        return None

# Function to play audio automatically
def play_audio(file_path):
    """Embed a WAV file in the page as an auto-playing HTML audio element.

    The file is read, base64-encoded and placed in a ``data:`` URI inside an
    ``<audio autoplay>`` tag rendered through ``st.markdown``. The markup
    passed to ``st.markdown`` was previously an empty string, so nothing was
    ever played.

    Args:
        file_path: Path of the WAV file to play.
    """
    try:
        # Context manager so the file handle is closed after reading.
        with open(file_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
        audio_html = (
            '<audio autoplay="true">'
            f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">'
            '</audio>'
        )
        st.markdown(audio_html, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Error playing audio: {e}")

# Main function to integrate all functionalities
def main():
    """Render the page and answer one spoken question per capture.

    "Speak" stores the captured question in ``st.session_state.voice_input``;
    an uploaded file stores its text in ``st.session_state.file_content``.
    Once both are available the question is answered, shown and spoken a
    single time, and the question is then removed from session state. On
    later runs of the script only the stored answer text is shown again, so
    the chat model is not queried again and the audio is not replayed.
    """
    st.title("Voice Controlled GPT-3 with File Data")

    response_container = st.container()
    upload_container = st.container()

    with response_container:
        if st.button("Speak"):
            st.session_state.voice_input = capture_voice()

    with upload_container:
        uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])

        if uploaded_file is not None:
            # Lower-case the extension so "REPORT.PDF" is handled like
            # "report.pdf" instead of being silently ignored.
            file_type = uploaded_file.name.split('.')[-1].lower()
            # Plain-text formats: reader function and the heading shown
            # above the extracted content.
            text_readers = {
                "pdf": (read_pdf_file, "PDF Content:"),
                "docx": (read_word_file, "Word Document Content:"),
                "txt": (read_txt_file, "Text File Content:"),
            }
            if file_type == "xlsx":
                df = read_excel_file(uploaded_file)
                if df is not None:
                    st.session_state.file_content = df.to_string(index=False)
                    st.write("Excel Data:")
                    st.dataframe(df)
            elif file_type in text_readers:
                reader, heading = text_readers[file_type]
                file_content = reader(uploaded_file)
                if file_content:
                    st.session_state.file_content = file_content
                    st.write(heading)
                    st.write(file_content)

    # A failed capture stores None, which must not be sent as a question.
    has_question = "voice_input" in st.session_state and bool(st.session_state.voice_input)

    if has_question and "file_content" in st.session_state:
        question = st.session_state.voice_input
        # Consume the question so later runs do not query the model or play
        # the audio again.
        del st.session_state["voice_input"]

        response_text = generate_response(question, st.session_state.file_content)
        st.session_state.response_text = response_text

        with response_container:
            st.write(f"Response: {response_text}")
            audio_file_path = text_to_speech(response_text)
            if audio_file_path:
                play_audio(audio_file_path)
    elif "response_text" in st.session_state:
        # Nothing new was asked: show the last answer without replaying it.
        with response_container:
            st.write(f"Response: {st.session_state.response_text}")


if __name__ == "__main__":
    main()

This is working now; the voice is no longer repeated.

Share via

The Azure Cognitive Services Speech is repeating the audio twice

3 answers