From c6e0a995bea089756229d3d50cf3e976626e621b Mon Sep 17 00:00:00 2001
From: Lars Haferkamp <>
Date: Fri, 21 Apr 2023 22:17:24 +0200
Subject: [PATCH] initial version of voice interaction with ChatGPT

---
Note: this is a reviewed revision of the patch. The blob ids on the
"index" lines below still refer to the contents of the original commit.

 chatbot/.gitignore                 | 12 +++++
 chatbot/.python-version            |  1 +
 chatbot/Readme.md                  | 80 ++++++++++++++++++++++++++++++
 chatbot/chat/chatgpt.py            | 43 ++++++++++++++++
 chatbot/chat/config-sample.json    |  6 +++
 chatbot/chat/main.py               | 32 ++++++++++++
 chatbot/chat/record_audio.py       | 74 +++++++++++++++++++++++++++
 chatbot/chat/speech_recognition.py | 37 ++++++++++++++
 chatbot/chat/speech_synthesis.py   | 28 +++++++++++
 chatbot/requirements.txt           |  3 ++
 10 files changed, 316 insertions(+)
 create mode 100644 chatbot/.gitignore
 create mode 100644 chatbot/.python-version
 create mode 100644 chatbot/Readme.md
 create mode 100644 chatbot/chat/chatgpt.py
 create mode 100644 chatbot/chat/config-sample.json
 create mode 100644 chatbot/chat/main.py
 create mode 100644 chatbot/chat/record_audio.py
 create mode 100644 chatbot/chat/speech_recognition.py
 create mode 100644 chatbot/chat/speech_synthesis.py
 create mode 100644 chatbot/requirements.txt

diff --git a/chatbot/.gitignore b/chatbot/.gitignore
new file mode 100644
index 0000000..32b2dbb
--- /dev/null
+++ b/chatbot/.gitignore
@@ -0,0 +1,12 @@
+# Python
+venv
+__pycache__
+
+# Project specific
+config.json
+
+# Media files, Zip files
+*.wav
+*.m4a
+*.mov
+*.zip
diff --git a/chatbot/.python-version b/chatbot/.python-version
new file mode 100644
index 0000000..cc1923a
--- /dev/null
+++ b/chatbot/.python-version
@@ -0,0 +1 @@
+3.8
diff --git a/chatbot/Readme.md b/chatbot/Readme.md
new file mode 100644
index 0000000..818c03b
--- /dev/null
+++ b/chatbot/Readme.md
@@ -0,0 +1,80 @@
+# Setup and run
+
+Within the `chatbot` directory:
+
+    pyenv local 3.8
+    python -m venv venv
+    source venv/bin/activate
+    pip install -r requirements.txt
+
+Install eSpeak, see "Speech Synthesis" below.
+
+**Set config:**
+
+In the `chat` folder, copy `config-sample.json` to a new file named `config.json` and set at least the OpenAI API key;
+the other values can be kept at their defaults.
+
+See the section "Recording Audio" for finding the correct device ID on a Raspberry Pi. You can run `record_audio.py` to print a list of the available devices with their IDs.
+
+**Run:**
+
+Execute `main.py` from within the `chat` folder, e.g.
+
+    python -m main
+
+# Recording Audio
+
+## Python package PyAudio
+
+Installed via pip.
+
+For more examples, e.g. conversion into MP3, see
+https://realpython.com/playing-and-recording-sound-python/#recording-audio
+
+## Raspberry Pi
+
+Set up the microphone and device ID:
+
+https://makersportal.com/blog/2018/8/23/recording-audio-on-the-raspberry-pi-with-python-and-a-usb-microphone
+
+
+# Speech Synthesis
+
+## Raspberry/Linux: Pico TTS
+
+https://github.com/cyberbotics/picotts
+
+Pico TTS for Raspberry:
+https://cstan.io/post/2020/03/pico-tts-unter-raspbian-uebersetzen/
+
+
+## Mac
+
+### ESpeak
+See https://espeak.sourceforge.net/
+
+Improvement: MBROLA voices for eSpeak (https://github.com/numediart/MBROLA)
+
+
+Install
+
+    brew install espeak
+
+Test
+
+    espeak -vde -p20 -k20 "Hallo Welt"
+
+MBROLA for Mac: https://github.com/pettarin/setup-festival-mbrola
+
+### Maybe Pico TTS could run on Mac
+ https://github.com/cyberbotics/webots/wiki/Pico-Compilation
+
+
+# Audio for Raspberry
+
+https://naomiproject.github.io/docs/
+
+
+# Similar projects
+
+https://github.com/anatolybazarov/oracle.sh
diff --git a/chatbot/chat/chatgpt.py b/chatbot/chat/chatgpt.py
new file mode 100644
index 0000000..5ffa4bd
--- /dev/null
+++ b/chatbot/chat/chatgpt.py
@@ -0,0 +1,43 @@
+"""Ask ChatGPT a question via the OpenAI chat completion API."""
+import json
+import os
+
+import openai
+
+
+def chat(message, openai_key, shortAnswer=True):
+    """Send ``message`` to ChatGPT and return the text of the first answer.
+
+    Args:
+        message: The user's question or statement.
+        openai_key: OpenAI API key used for the request.
+        shortAnswer: If true, a German request for a short answer is
+            appended to the message before it is sent.
+
+    Returns:
+        The content of the first completion choice.
+    """
+    print("###################")
+    openai.api_key = openai_key
+    if shortAnswer:
+        message = message + " - eine kurze Antwort bitte!"
+
+    print(f"Asking ChatGPT for an answer to: \n {message} \n ...")
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "user", "content": message}
+        ]
+    )
+    first_answer = completion.choices[0].message.content
+    print(f"Got answer from ChatGPT:\n {first_answer}")
+    return first_answer
+
+
+if __name__ == "__main__":
+    # chat() requires the API key; load it from config.json like main.py does.
+    dirname = os.path.dirname(__file__)
+    with open(os.path.join(dirname, "config.json")) as config_file:
+        config = json.load(config_file)
+    answer = chat("Erzähle mir was über Murnau", config["OPENAI_KEY"])
+    print(answer)
diff --git a/chatbot/chat/config-sample.json b/chatbot/chat/config-sample.json
new file mode 100644
index 0000000..b3a1642
--- /dev/null
+++ b/chatbot/chat/config-sample.json
@@ -0,0 +1,6 @@
+{
+    "OPENAI_KEY": "xxxxxxxxx",
+    "MIC_DEVICE_ID": 0,
+    "SECONDS_RECORDING": 5,
+    "ESPEAK_VOICE": "de+f4"
+}
diff --git a/chatbot/chat/main.py b/chatbot/chat/main.py
new file mode 100644
index 0000000..40e4ec4
--- /dev/null
+++ b/chatbot/chat/main.py
@@ -0,0 +1,32 @@
+"""Voice chat: record a question, transcribe it, ask ChatGPT, speak the answer."""
+import os
+import json
+
+import speech_recognition
+import speech_synthesis
+import chatgpt
+import record_audio
+
+
+def main():
+    """Run one record -> transcribe -> chat -> speak round trip."""
+    dirname = os.path.dirname(__file__)
+    with open(os.path.join(dirname, "config.json")) as config_file:
+        config = json.load(config_file)
+    openai_key = config["OPENAI_KEY"]
+
+    record_audio.record(
+        dirname,
+        device_id=config["MIC_DEVICE_ID"],
+        max_recording_time_s=config["SECONDS_RECORDING"],
+    )
+    transcribed = speech_recognition.transcribe(dirname, openai_key)
+    answer = chatgpt.chat(transcribed, openai_key)
+    # Idea: recognize the language and pass it to the speech synthesizer.
+    # Unfortunately the detected language is currently not returned by the
+    # OpenAI API.
+    speech_synthesis.speak(answer, voice=config["ESPEAK_VOICE"])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/chatbot/chat/record_audio.py b/chatbot/chat/record_audio.py
new file mode 100644
index 0000000..44dd71f
--- /dev/null
+++ b/chatbot/chat/record_audio.py
@@ -0,0 +1,74 @@
+"""Record audio from a microphone into a WAV file using PyAudio."""
+import os
+import wave
+
+import pyaudio
+
+
+def record(dirname, device_id=0, max_recording_time_s=3):
+    """Record from an input device and save the audio as ``recorded.wav``.
+
+    Args:
+        dirname: Directory in which ``recorded.wav`` is written.
+        device_id: PyAudio index of the input device (see ``get_mics``).
+        max_recording_time_s: Recording duration in seconds. Only whole
+            buffers are read, so the recording can be slightly shorter.
+    """
+    print("###################")
+    fname = os.path.join(dirname, "recorded.wav")
+
+    form_1 = pyaudio.paInt16  # 16-bit resolution
+    chans = 1  # 1 channel
+    samp_rate = 44100  # 44.1kHz sampling rate
+    chunk = 4096  # 2^12 samples for buffer
+    record_secs = max_recording_time_s  # seconds to record
+
+    audio = pyaudio.PyAudio()  # create pyaudio instantiation
+    try:
+        # Query the sample width before the PyAudio instance is terminated.
+        sample_width = audio.get_sample_size(form_1)
+
+        # create pyaudio stream
+        stream = audio.open(format=form_1, rate=samp_rate, channels=chans,
+                            input_device_index=device_id, input=True,
+                            frames_per_buffer=chunk)
+        try:
+            print(f"Recording via microphone for {max_recording_time_s} seconds")
+            frames = []
+
+            # loop through stream and append audio chunks to frame array
+            for _ in range(int((samp_rate / chunk) * record_secs)):
+                frames.append(stream.read(chunk))
+
+            print("Finished recording")
+        finally:
+            # stop and close the stream even if reading failed
+            stream.stop_stream()
+            stream.close()
+    finally:
+        # always terminate the pyaudio instantiation
+        audio.terminate()
+
+    # save the audio frames as .wav file
+    with wave.open(fname, "wb") as wavefile:
+        wavefile.setnchannels(chans)
+        wavefile.setsampwidth(sample_width)
+        wavefile.setframerate(samp_rate)
+        wavefile.writeframes(b"".join(frames))
+
+
+def get_mics():
+    """Print the index and name of every audio device PyAudio reports."""
+    p = pyaudio.PyAudio()
+    try:
+        for ii in range(p.get_device_count()):
+            device_name = p.get_device_info_by_index(ii).get('name')
+            print(f"{ii}: {device_name}")
+    finally:
+        p.terminate()
+
+
+if __name__ == "__main__":
+    get_mics()
+    dirname = os.path.dirname(__file__)
+    record(dirname, device_id=0, max_recording_time_s=5)
diff --git a/chatbot/chat/speech_recognition.py b/chatbot/chat/speech_recognition.py
new file mode 100644
index 0000000..5694822
--- /dev/null
+++ b/chatbot/chat/speech_recognition.py
@@ -0,0 +1,37 @@
+"""Transcribe recorded audio with the OpenAI Whisper API."""
+import os
+import json
+
+import openai
+
+
+def transcribe(dirname, openai_key, file="recorded.wav"):
+    """Transcribe an audio file via OpenAI Whisper and return the text.
+
+    Args:
+        dirname: Directory that contains the audio file.
+        openai_key: OpenAI API key used for the request.
+        file: Name of the audio file inside ``dirname``.
+
+    Returns:
+        The recognized text.
+    """
+    print("###################")
+    openai.api_key = openai_key
+    fname = os.path.join(dirname, file)
+    # Use a context manager so the audio file is closed after the request.
+    with open(fname, "rb") as audio_file:
+        print("Transcribing audio via OpenAI Whisper ...")
+        transcript = openai.Audio.transcribe("whisper-1", audio_file)
+    recognized_text = transcript.text
+    print(f"Recognized text: \n > {recognized_text}")
+    return recognized_text
+
+
+if __name__ == "__main__":
+    dirname = os.path.dirname(__file__)
+    with open(os.path.join(dirname, "config.json")) as config_file:
+        config = json.load(config_file)
+    openai_key = config['OPENAI_KEY']
+    transcribed = transcribe(dirname, openai_key, file="test.m4a")
+    print(transcribed)
diff --git a/chatbot/chat/speech_synthesis.py b/chatbot/chat/speech_synthesis.py
new file mode 100644
index 0000000..1b52ebc
--- /dev/null
+++ b/chatbot/chat/speech_synthesis.py
@@ -0,0 +1,28 @@
+"""Speak text aloud with the eSpeak command line tool."""
+import subprocess
+
+
+def speak(text, voice="de+f4"):
+    """Speak ``text`` with the ``espeak`` command line program.
+
+    Args:
+        text: The text to speak.
+        voice: Voice name passed to espeak's ``-v`` option.
+    """
+    print("###################")
+    print("Generating audio from text")
+    # Pass an argument list instead of a shell string so that quotes,
+    # backticks or "$" in the text (e.g. in a ChatGPT answer) can neither
+    # break the command nor be executed by the shell.
+    cmd = ["espeak", f"-v{voice}", "-p20", "-s150", text]
+    try:
+        subprocess.call(cmd)
+    except FileNotFoundError:
+        # Without a shell a missing binary raises; keep it non-fatal as before.
+        print("espeak executable not found - is eSpeak installed?")
+
+
+if __name__ == "__main__":
+    speak("Murnau ist eine kleine Stadt in Bayern, Deutschland, bekannt für seine atemberaubende Natur, \
+    insbesondere den Murnauer Moos Nationalpark, sowie für seine malerische Architektur und deutsche Kultur.",
+          voice="de+klatt2")
diff --git a/chatbot/requirements.txt b/chatbot/requirements.txt
new file mode 100644
index 0000000..b9ca856
--- /dev/null
+++ b/chatbot/requirements.txt
@@ -0,0 +1,3 @@
+openai<1
+pyaudio
+