The Azure Cognitive Services Speech is repeating the audio twice

Vijay Sheth 0 Reputation points
2024-06-18T07:02:02.35+00:00

While I am using the Azure Cognitive Services Speech is repeating the audio twice ,

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
1,516 questions
{count} votes

3 answers

Sort by: Most helpful
  1. santoshkc 6,310 Reputation points Microsoft Vendor
    2024-06-18T08:44:34.4733333+00:00

    Hi @Vijay Sheth,

    Thank you for reaching out to Microsoft Q&A forum!

    I understand that you are experiencing an issue with the Azure Speech service. Before we proceed, may I confirm if you are using the Azure Text-to-Speech service to generate audio with the given text? If not, could you please provide more details about the resource and region you are working with?

    Regarding the issue you mentioned, I tried using Azure TTS to reproduce it on my end, but it seems to be working fine. It's possible that this is an intermittent issue, so I suggest trying again after some time. If the issue persists, please let me know and we can work together to find a solution.

    I hope you understand. Do let us know if you any further queries.


    Do not forget to click Accept Answer and Yes for was this answer helpful.


  2. Vijay Sheth 0 Reputation points
    2024-06-19T12:10:34.7966667+00:00

    Here is the code which i am using

    import streamlit as st
    import speech_recognition as sr
    import openai
    import pandas as pd
    import docx
    import PyPDF2
    import azure.cognitiveservices.speech as speechsdk
    import tempfile
    import os
    from pydub import AudioSegment
    from pydub.playback import play
    
    # Set your OpenAI API key
    openai.api_key = ''
    
    
    # Initialize session state
    if "stop_execution" not in st.session_state:
        st.session_state.stop_execution = False
    if "tmp_file_path" not in st.session_state:
        st.session_state.tmp_file_path = None
    if "playing" not in st.session_state:
        st.session_state.playing = False
    if "audio_thread" not in st.session_state:
        st.session_state.audio_thread = None
    
    # Azure Cognitive Services configuration
    AZURE_SPEECH_KEY = ''
    AZURE_SERVICE_REGION = 'australiaeast'
    
    # Function to capture voice input
    def capture_voice():
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            st.write("Listening...")
            audio = recognizer.listen(source)
    
        try:
            st.write("Recognizing...")
            text = recognizer.recognize_google(audio)
            st.write(f"You said: {text}")
            return text
        except sr.UnknownValueError:
            st.write("Sorry, I could not understand the audio.")
            return None
        except sr.RequestError:
            st.write("Could not request results; check your network connection.")
            return None
    
    # Function to query OpenAI's GPT-3
    def query_chat_model(prompt):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=150
            )
            return response.choices[0].message['content'].strip()
        except openai.error.RateLimitError:
            st.write("Rate limit exceeded. Please wait and try again later.")
            return "Rate limit exceeded. Please try again later."
        except Exception as e:
            st.write(f"Error querying the OpenAI API: {e}")
            return "An error occurred while querying the OpenAI API."
    
    # Function to read content from an Excel file
    def read_excel_file(file):
        try:
            df = pd.read_excel(file, engine='openpyxl')
            return df
        except Exception as e:
            st.write(f"Error reading the Excel file: {e}")
            return None
    
    # Function to read content from a PDF file
    def read_pdf_file(file):
        try:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            return text
        except Exception as e:
            st.write(f"Error reading the PDF file: {e}")
            return None
    
    # Function to read content from a Word file
    def read_word_file(file):
        try:
            doc = docx.Document(file)
            text = ""
            for para in doc.paragraphs:
                text += para.text + "\n"
            return text
        except Exception as e:
            st.write(f"Error reading the Word file: {e}")
            return None
    
    # Function to read content from a TXT file
    def read_txt_file(file):
        try:
            text = file.read().decode("utf-8")
            return text
        except Exception as e:
            st.write(f"Error reading the TXT file: {e}")
            return None
    
    # Function to generate a response using GPT-3 and the provided content
    def generate_response(query, content):
        prompt = f"Using the following data: {content}, answer the question: {query}"
        response = query_chat_model(prompt)
        return response
    
    # Function to convert text to speech using Azure Cognitive Services and play it
    def text_to_speech(text, lang='en'):
        if not text:
            st.write("No text to speak.")
            return
    
        tmp_file_path = None
        try:
            speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
            audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)  # Explicitly use default speaker
    
            synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
            result = synthesizer.speak_text_async(text).get()
    
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                    tmp_file.write(result.audio_data)
                    tmp_file_path = tmp_file.name
    
                # Convert the WAV file to MP3 using pydub
                audio = AudioSegment.from_wav(tmp_file_path)
                mp3_file_path = tmp_file_path.replace('.wav', '.mp3')
                audio.export(mp3_file_path, format='mp3')
    
                # Play sound using pydub
                play_audio(mp3_file_path)
    
            else:
                st.write(f"Speech synthesis failed: {result.reason}")
    
        except Exception as e:
            st.write(f"Error in text-to-speech conversion: {e}")
        finally:
            return tmp_file_path
    
    # Function to play audio file
    def play_audio(file_path):
        if file_path and os.path.exists(file_path):
            audio = AudioSegment.from_mp3(file_path)
            st.session_state.playing = True
            play(audio)
            st.session_state.playing = False
            os.remove(file_path)
    
    # Function to stop the playback
    def stop_playback():
        if st.session_state.playing:
            st.session_state.playing = False
            st.write("Playback stopped")
    
    # Main function to integrate all functionalities
    def main():
        st.title("Voice Controlled GPT-3 with File Data")
    
        uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])
        file_content = None
    
        if uploaded_file is not None:
            file_type = uploaded_file.name.split('.')[-1]
            if file_type == "xlsx":
                df = read_excel_file(uploaded_file)
                if df is not None:
                    st.write("Excel Data:")
                    st.dataframe(df)
                    file_content = df.to_string(index=False)
            elif file_type == "pdf":
                file_content = read_pdf_file(uploaded_file)
            elif file_type == "docx":
                file_content = read_word_file(uploaded_file)
            elif file_type == "txt":
                file_content = read_txt_file(uploaded_file)
    
        col1, col2 = st.columns(2)
        with col1:
            if st.button("Speak", key="speak_button"):
                st.session_state.stop_execution = False
                voice_input = capture_voice()
                if st.session_state.stop_execution:
                    return
                if voice_input and file_content:
                    response_text = generate_response(voice_input, file_content)
                    st.write(f"Response: {response_text}")
                    text_to_speech(response_text)
                elif voice_input:
                    st.write("Please upload a file first.")
                else:
                    st.write("Failed to capture voice input.")
        
        with col2:
            if st.button("Stop", key="stop_button"):
                stop_playback()
    
    if __name__ == "__main__":
        main()
    
    

  3. Vijay Sheth 0 Reputation points
    2024-06-24T10:20:11.1866667+00:00
    # File: app.py
    import streamlit as st
    import speech_recognition as sr
    import openai
    import pandas as pd
    import docx
    import PyPDF2
    from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, AudioDataStream, ResultReason
    import tempfile
    import os
    import base64
    
    # Set your OpenAI API key
    openai.api_key = "
    "
    azure_speech_key = ""
    azure_service_region = ""
    
    # Function to capture voice input
    def capture_voice():
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            st.write("Listening...")
            audio = recognizer.listen(source)
    
        try:
            st.write("Recognizing...")
            text = recognizer.recognize_google(audio)
            st.write(f"You said: {text}")
            return text.lower()  # Convert to lower case for easier comparison
        except sr.UnknownValueError:
            st.error("Sorry, I could not understand the audio.")
            return None
        except sr.RequestError:
            st.error("Could not request results; check your network connection.")
            return None
    
    # Function to query OpenAI's GPT-3
    def query_chat_model(prompt):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=150
            )
            return response.choices[0].message['content'].strip()
        except openai.error.RateLimitError:
            st.error("Rate limit exceeded. Please wait and try again later.")
            return "Rate limit exceeded. Please try again later."
        except Exception as e:
            st.error(f"Error querying the OpenAI API: {e}")
            return "An error occurred while querying the OpenAI API."
    
    # Function to read content from an Excel file
    def read_excel_file(file):
        try:
            df = pd.read_excel(file, engine='openpyxl')
            return df
        except Exception as e:
            st.error(f"Error reading the Excel file: {e}")
            return None
    
    # Function to read content from a PDF file
    def read_pdf_file(file):
        try:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
            return text
        except Exception as e:
            st.error(f"Error reading the PDF file: {e}")
            return None
    
    # Function to read content from a Word file
    def read_word_file(file):
        try:
            doc = docx.Document(file)
            text = ""
            for para in doc.paragraphs:
                text += para.text + "\n"
            return text
        except Exception as e:
            st.error(f"Error reading the Word file: {e}")
            return None
    
    # Function to read content from a TXT file
    def read_txt_file(file):
        try:
            text = file.read().decode("utf-8")
            return text
        except Exception as e:
            st.error(f"Error reading the TXT file: {e}")
            return None
    
    # Function to generate a response using GPT-3 and the provided content
    def generate_response(query, content):
        prompt = f"Using the following data: {content}, answer the question: {query}"
        response = query_chat_model(prompt)
        return response
    
    # Function to convert text to speech and get audio file path
    def text_to_speech(text):
        if not text:
            st.error("No text to speak.")
            return None
    
        try:
            speech_config = SpeechConfig(subscription=azure_speech_key, region=azure_service_region)
            speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"
            synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)
    
            result = synthesizer.speak_text_async(text).get()
            if result.reason == ResultReason.SynthesizingAudioCompleted:
                audio_stream = AudioDataStream(result)
                temp_audio_path = os.path.join(tempfile.gettempdir(), "output.wav")
                audio_stream.save_to_wav_file(temp_audio_path)
                return temp_audio_path
            else:
                st.error("Error during text-to-speech conversion")
                return None
        except Exception as e:
            st.error(f"Error in text-to-speech conversion: {e}")
            return None
    
    # Function to play audio automatically
    def play_audio(file_path):
        try:
            audio_file = open(file_path, 'rb').read()
            audio_base64 = base64.b64encode(audio_file).decode('utf-8')
            audio_html = f'<audio autoplay="true" controls><source src="data:audio/wav;base64,{audio_base64}" type="audio/wav"></audio>'
            st.markdown(audio_html, unsafe_allow_html=True)
        except Exception as e:
            st.error(f"Error playing audio: {e}")
    
    # Main function to integrate all functionalities
    def main():
        st.title("Voice Controlled GPT-3 with File Data")
    
        response_container = st.container()
        upload_container = st.container()
    
        with response_container:
            if st.button("Speak"):
                voice_input = capture_voice()
                st.session_state.voice_input = voice_input
    
            if "response_text" in st.session_state:
                st.write(f"Response: {st.session_state.response_text}")
                audio_file_path = text_to_speech(st.session_state.response_text)
                if audio_file_path:
                    play_audio(audio_file_path)
    
        with upload_container:
            uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])
            file_content = None
    
            if uploaded_file is not None:
                file_type = uploaded_file.name.split('.')[-1]
                if file_type == "xlsx":
                    df = read_excel_file(uploaded_file)
                    if df is not None:
                        file_content = df.to_string(index=False)
                        st.session_state.file_content = file_content
                        st.write("Excel Data:")
                        st.dataframe(df)
                elif file_type == "pdf":
                    file_content = read_pdf_file(uploaded_file)
                    if file_content:
                        st.session_state.file_content = file_content
                        st.write("PDF Content:")
                        st.write(file_content)
                elif file_type == "docx":
                    file_content = read_word_file(uploaded_file)
                    if file_content:
                        st.session_state.file_content = file_content
                        st.write("Word Document Content:")
                        st.write(file_content)
                elif file_type == "txt":
                    file_content = read_txt_file(uploaded_file)
                    if file_content:
                        st.session_state.file_content = file_content
                        st.write("Text File Content:")
                        st.write(file_content)
    
        if "voice_input" in st.session_state and "file_content" in st.session_state:
            response_text = generate_response(st.session_state.voice_input, st.session_state.file_content)
            st.session_state.response_text = response_text
    
            with response_container:
                st.write(f"Response: {response_text}")
                audio_file_path = text_to_speech(response_text)
                if audio_file_path:
                    play_audio(audio_file_path)
    
    if __name__ == "__main__":
        main()
    
    
    

    This is working now , voice is not getting repeated.

    0 comments No comments