initial version of voice interaction with ChatGPT
parent 6fdab94f1b
commit c6e0a995be

10 changed files with 241 additions and 0 deletions

chatbot/.gitignore (new file, vendored, 12 lines)
@@ -0,0 +1,12 @@
# Python
venv
__pycache__

# Project specific
config.json

# Media files, Zip files
*.wav
*.m4a
*.mov
*.zip

chatbot/.python-version (new file, 1 line)
@@ -0,0 +1 @@
3.8

chatbot/Readme.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# Setup and run

Within the `chat` directory:

    pyenv local 3.8
    python -m venv venv
    source venv/bin/activate
    pip install -r requirements.txt

Install ESpeak, see the "Speech Synthesis" section below.

**Set config:**

In the `chat` folder, copy the file `config-sample.json` to a new file named `config.json` and set at least the OpenAI API key; the other values can be kept at their defaults.

See the section "Recording Audio" for getting the correct device ID on a Raspberry Pi. You can execute `record_audio.py` to see a list of available devices with their IDs.
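
For reference, the sample config added in this commit (`chat/config-sample.json`) looks like this; `MIC_DEVICE_ID` selects the input device and `SECONDS_RECORDING` sets the recording length in seconds:

    {
        "OPENAI_KEY": "xxxxxxxxx",
        "MIC_DEVICE_ID": 0,
        "SECONDS_RECORDING": 5,
        "ESPEAK_VOICE": "de+f4"
    }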

**Run:**

Execute `main.py`, e.g.

    python -m main

# Recording Audio

## Python package PyAudio

Installed via pip.

For more examples, e.g. conversion to MP3, see
https://realpython.com/playing-and-recording-sound-python/#recording-audio
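
A minimal sketch for listing the available devices and their IDs with PyAudio (this mirrors the `get_mics()` helper in `record_audio.py`):

    import pyaudio

    # print every audio device PyAudio can see, with its index;
    # use your microphone's index as MIC_DEVICE_ID in config.json
    p = pyaudio.PyAudio()
    for i in range(p.get_device_count()):
        print(f"{i}: {p.get_device_info_by_index(i).get('name')}")
    p.terminate()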

## Raspberry Pi

Set up the microphone and find its device ID:

https://makersportal.com/blog/2018/8/23/recording-audio-on-the-raspberry-pi-with-python-and-a-usb-microphone
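
To quickly verify that the USB microphone is detected on the Pi (assuming ALSA's `arecord` utility is installed):

    arecord -l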

# Speech Synthesis

## Raspberry/Linux: Pico TTS

https://github.com/cyberbotics/picotts

Pico TTS for the Raspberry Pi:
https://cstan.io/post/2020/03/pico-tts-unter-raspbian-uebersetzen/

## Mac

### ESpeak

See https://espeak.sourceforge.net/

Possible improvement: MBROLA voices for ESpeak (https://github.com/numediart/MBROLA)

Install:

    brew install espeak

Test:

    espeak -vde -p20 -k20 "Hallo Welt"

MBROLA for Mac: https://github.com/pettarin/setup-festival-mbrola
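
Within this project, ESpeak is driven from Python via `subprocess` (see `speech_synthesis.py` in this commit); a minimal sketch of that call:

    import subprocess

    # speak a sentence with the German female voice "de+f4",
    # pitch 20 and speed 150 (the defaults used by speech_synthesis.py)
    subprocess.call(["espeak", "-vde+f4", "-p20", "-s150", "Hallo Welt"])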

### Maybe Pico TTS could run on Mac

https://github.com/cyberbotics/webots/wiki/Pico-Compilation

# Audio for Raspberry

https://naomiproject.github.io/docs/

# Similar projects

https://github.com/anatolybazarov/oracle.sh

chatbot/chat/chatgpt.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import openai


def chat(message, openai_key, shortAnswer=True):
    print("###################")
    openai.api_key = openai_key
    if shortAnswer:
        # append a request for brevity (German: "a short answer, please!")
        message = message + " - eine kurze Antwort bitte!"

    print(f"Asking ChatGPT for an answer to: \n {message} \n ...")
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": message}
        ]
    )
    first_answer = completion.choices[0].message.content
    print(f"Got answer from ChatGPT:\n {first_answer}")
    return first_answer


if __name__ == "__main__":
    # ad-hoc test (German: "Tell me something about Murnau")
    answer = chat("Erzähle mir was über Murnau")
    print(answer)

chatbot/chat/config-sample.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "OPENAI_KEY": "xxxxxxxxx",
    "MIC_DEVICE_ID": 0,
    "SECONDS_RECORDING": 5,
    "ESPEAK_VOICE": "de+f4"
}

chatbot/chat/main.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import os
import json

import speech_recognition
import speech_synthesis
import chatgpt
import record_audio


if __name__ == "__main__":
    # load the configuration: API key, mic device, recording length, voice
    dirname = os.path.dirname(__file__)
    with open(os.path.join(dirname, "config.json")) as config_file:
        config = json.load(config_file)
    openai_key = config["OPENAI_KEY"]

    # record from the microphone, transcribe, ask ChatGPT, speak the answer
    record_audio.record(dirname,
                        device_id=config["MIC_DEVICE_ID"],
                        max_recording_time_s=config["SECONDS_RECORDING"])
    transcribed = speech_recognition.transcribe(dirname, openai_key)
    answer = chatgpt.chat(transcribed, openai_key)
    # Idea: recognize the language and pass it to the speech synthesizer.
    # Unfortunately the detected language is currently not returned by the OpenAI API.
    speech_synthesis.speak(answer, voice=config["ESPEAK_VOICE"])

chatbot/chat/record_audio.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import os

import pyaudio
import wave


def record(dirname, device_id=0, max_recording_time_s=3):
    print("###################")
    fname = os.path.join(dirname, "recorded.wav")

    form_1 = pyaudio.paInt16            # 16-bit resolution
    chans = 1                           # 1 channel
    samp_rate = 44100                   # 44.1 kHz sampling rate
    chunk = 4096                        # 2^12 samples per buffer
    record_secs = max_recording_time_s  # seconds to record

    audio = pyaudio.PyAudio()           # create PyAudio instance

    # open a PyAudio input stream on the chosen device
    stream = audio.open(format=form_1, rate=samp_rate, channels=chans,
                        input_device_index=device_id, input=True,
                        frames_per_buffer=chunk)
    print(f"Recording via microphone for {max_recording_time_s} seconds")
    frames = []

    # loop through the stream and append audio chunks to the frame list
    for ii in range(0, int((samp_rate / chunk) * record_secs)):
        data = stream.read(chunk)
        frames.append(data)

    print("Finished recording")

    # stop the stream, close it, and terminate the PyAudio instance
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # save the audio frames as a .wav file
    wavefile = wave.open(fname, 'wb')
    wavefile.setnchannels(chans)
    wavefile.setsampwidth(audio.get_sample_size(form_1))
    wavefile.setframerate(samp_rate)
    wavefile.writeframes(b''.join(frames))
    wavefile.close()


def get_mics():
    # list all audio devices PyAudio can see, with their indices
    p = pyaudio.PyAudio()
    for ii in range(p.get_device_count()):
        device_name = p.get_device_info_by_index(ii).get('name')
        print(f"{ii}: {device_name}")


if __name__ == "__main__":
    get_mics()
    dirname = os.path.dirname(__file__)
    record(dirname, device_id=0, max_recording_time_s=5)

chatbot/chat/speech_recognition.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import os
import json

import openai


def transcribe(dirname, openai_key, file="recorded.wav"):
    print("###################")
    openai.api_key = openai_key
    fname = os.path.join(dirname, file)
    print("Transcribing audio via OpenAI Whisper ...")
    # send the recorded audio to the Whisper API and return the recognized text
    with open(fname, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
    recognized_text = transcript.text
    print(f"Recognized text: \n > {recognized_text}")
    return recognized_text


if __name__ == "__main__":
    dirname = os.path.dirname(__file__)
    with open(os.path.join(dirname, "config.json")) as config_file:
        config = json.load(config_file)
    openai_key = config["OPENAI_KEY"]
    transcribed = transcribe(dirname, openai_key, file="test.m4a")
    print(transcribed)

chatbot/chat/speech_synthesis.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import subprocess


def speak(text, voice="de+f4"):
    print("###################")
    print("Generating audio from text")
    # call espeak with the given voice, pitch 20 and speed 150;
    # passing an argument list (instead of a shell string) keeps
    # quotes and other shell metacharacters in the text intact
    subprocess.call(["espeak", f"-v{voice}", "-p20", "-s150", text])


if __name__ == "__main__":
    # ad-hoc test: speak a German sentence about Murnau with the "klatt2" voice variant
    speak("Murnau ist eine kleine Stadt in Bayern, Deutschland, bekannt für seine atemberaubende Natur, "
          "insbesondere den Murnauer Moos Nationalpark, sowie für seine malerische Architektur und deutsche Kultur.",
          voice="de+klatt2")

chatbot/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
openai
pyaudio