initial version of voice interaction with ChatGPT

This commit is contained in:
Lars Haferkamp 2023-04-21 22:17:24 +02:00
parent 6fdab94f1b
commit c6e0a995be
10 changed files with 241 additions and 0 deletions

12
chatbot/.gitignore vendored Normal file
View file

@ -0,0 +1,12 @@
# Python
venv
__pycache__
# Project specific
config.json
# Media files, Zip files
*.wav
*.m4a
*.mov
*.zip

1
chatbot/.python-version Normal file
View file

@ -0,0 +1 @@
3.8

80
chatbot/Readme.md Normal file
View file

@ -0,0 +1,80 @@
# Setup and run
Within the `chatbot` directory (the one containing `requirements.txt`):
pyenv local 3.8
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
Install ESpeak, see below
**Set config:**
In the `chat` folder, copy the file `config-sample.json` to a new file named `config.json` and set, in particular, the OpenAI API key;
the other values can be kept at their defaults.
See the section "Recording Audio" for how to find the correct device ID on the Raspberry Pi. You can execute `record_audio.py` to see a list of the available devices with their IDs.
**Run:**
Execute `main.py` from within the `chat` folder, e.g.
python -m main
# Recording Audio
## Python package PyAudio
Installed via pip
For more examples, e.g. conversion into MP3 see
https://realpython.com/playing-and-recording-sound-python/#recording-audio
## Raspberry Pi
Setup microphone and Device Id:
https://makersportal.com/blog/2018/8/23/recording-audio-on-the-raspberry-pi-with-python-and-a-usb-microphone
# Speech Synthesis
## Raspberry/Linux: Pico TTS
https://github.com/cyberbotics/picotts
Pico TTS for Raspberry:
https://cstan.io/post/2020/03/pico-tts-unter-raspbian-uebersetzen/
## Mac
### ESpeak
See https://espeak.sourceforge.net/
Improvement: MBROLA voices for eSpeak (https://github.com/numediart/MBROLA)
Install
brew install espeak
Test
espeak -vde -p20 -k20 "Hallo Welt"
MBROLA for Mac: https://github.com/pettarin/setup-festival-mbrola
### Maybe Pico TTS could run on Mac
https://github.com/cyberbotics/webots/wiki/Pico-Compilation
# Audio for Raspberry
https://naomiproject.github.io/docs/
# Similar projects
https://github.com/anatolybazarov/oracle.sh

25
chatbot/chat/chatgpt.py Normal file
View file

@ -0,0 +1,25 @@
import os
import openai
def chat(message, openai_key, shortAnswer=True):
    """Ask ChatGPT a single question and return the text of its reply.

    Args:
        message: The user's question.
        openai_key: OpenAI API key; it is stored on the ``openai`` module
            before the request is made.
        shortAnswer: When true, a German request for a short answer
            (" - eine kurze Antwort bitte!") is appended to the question.

    Returns:
        The message content of the first choice in the completion.
    """
    print("###################")
    openai.api_key = openai_key

    # Build the final prompt; the suffix is only added on request.
    suffix = " - eine kurze Antwort bitte!" if shortAnswer else ""
    message = message + suffix
    print(f"Asking ChatGPT for an answer to: \n {message} \n ...")

    # Single-turn conversation: only the user's message is sent.
    conversation = [{"role": "user", "content": message}]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )

    reply = completion.choices[0].message.content
    print(f"Got answer from ChatGPT:\n {reply}")
    return reply
if __name__ == "__main__":
    # Manual smoke test. chat() requires the API key as its second
    # positional argument, so read it from the config.json that sits next
    # to this file instead of calling chat() with the question only.
    import json

    config_path = os.path.join(os.path.dirname(__file__), "config.json")
    with open(config_path) as config_file:
        config = json.load(config_file)
    answer = chat("Erzähle mir was über Murnau", config["OPENAI_KEY"])
    print(answer)

View file

@ -0,0 +1,6 @@
{
"OPENAI_KEY": "xxxxxxxxx",
"MIC_DEVICE_ID": 0,
"SECONDS_RECORDING": 5,
"ESPEAK_VOICE": "de+f4"
}

21
chatbot/chat/main.py Normal file
View file

@ -0,0 +1,21 @@
import os
import json
import speech_recognition
import speech_synthesis
import chatgpt
import record_audio
if __name__ == "__main__":
    # One complete voice round trip:
    # record -> transcribe -> ask ChatGPT -> speak the answer.
    base_dir = os.path.dirname(__file__)
    config_path = os.path.join(base_dir, "config.json")
    with open(config_path) as fh:
        settings = json.load(fh)
    api_key = settings['OPENAI_KEY']

    # Capture the question from the configured microphone.
    record_audio.record(
        base_dir,
        device_id=settings["MIC_DEVICE_ID"],
        max_recording_time_s=settings["SECONDS_RECORDING"],
    )

    # Speech -> text -> answer.
    question = speech_recognition.transcribe(base_dir, api_key)
    reply = chatgpt.chat(question, api_key)

    # Idea: recognize the language and pass it to the speech synthesizer.
    # Unfortunately the detected language is currently not returned by the
    # OpenAI API.
    speech_synthesis.speak(reply, voice=settings["ESPEAK_VOICE"])

View file

@ -0,0 +1,54 @@
import os
import pyaudio
import wave
def record(dirname, device_id=0, max_recording_time_s=3):
    """Record mono 16-bit audio from an input device into ``recorded.wav``.

    Args:
        dirname: Directory in which ``recorded.wav`` is written.
        device_id: PyAudio input device index to record from.
        max_recording_time_s: Recording duration in seconds.

    The stream and the PyAudio instance are released even when opening the
    device or reading from it fails, and the wave file is always closed.
    """
    print("###################")
    fname = os.path.join(dirname, "recorded.wav")

    sample_format = pyaudio.paInt16  # 16-bit resolution
    channels = 1                     # 1 channel
    sample_rate = 44100              # 44.1kHz sampling rate
    chunk = 4096                     # 2^12 samples for buffer
    # Only whole chunks are read, so the recording can be marginally
    # shorter than the requested duration.
    num_chunks = int((sample_rate / chunk) * max_recording_time_s)

    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(sample_format)
        stream = audio.open(format=sample_format, rate=sample_rate,
                            channels=channels,
                            input_device_index=device_id, input=True,
                            frames_per_buffer=chunk)
        try:
            print(f"Recording via microphone for {max_recording_time_s} seconds")
            frames = [stream.read(chunk) for _ in range(num_chunks)]
            print("Finished recording")
        finally:
            # Stop and close the stream even if reading failed.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # Save the audio frames as a .wav file; the context manager closes it.
    with wave.open(fname, 'wb') as wavefile:
        wavefile.setnchannels(channels)
        wavefile.setsampwidth(sample_width)
        wavefile.setframerate(sample_rate)
        wavefile.writeframes(b''.join(frames))
def get_mics():
    """Print the index and name of every audio device PyAudio reports.

    The printed index is the value to use as ``device_id`` when recording.
    """
    p = pyaudio.PyAudio()
    try:
        for index in range(p.get_device_count()):
            device_name = p.get_device_info_by_index(index).get('name')
            print(f"{index}: {device_name}")
    finally:
        # Release the PyAudio instance instead of leaking it.
        p.terminate()
if __name__ == "__main__":
    # Manual test: list the available devices, then record a 5 second clip
    # from device 0 into the directory of this script.
    get_mics()
    script_dir = os.path.dirname(__file__)
    record(script_dir, device_id=0, max_recording_time_s=5)

View file

@ -0,0 +1,25 @@
import os
import json
import openai
def transcribe(dirname, openai_key, file = "recorded.wav"):
    """Transcribe an audio file to text with OpenAI's ``whisper-1`` model.

    Args:
        dirname: Directory containing the audio file.
        openai_key: OpenAI API key; it is stored on the ``openai`` module
            before the request is made.
        file: Name of the audio file inside ``dirname``.

    Returns:
        The recognized text (``transcript.text``).
    """
    print("###################")
    openai.api_key = openai_key
    fname = os.path.join(dirname, file)
    # The context manager closes the file handle once the upload is done,
    # also when the API call raises.
    with open(fname, "rb") as audio_file:
        print("Transcribing audio via OpenAI Whisper ...")
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
    recognized_text = transcript.text
    print(f"Recognized text: \n > {recognized_text}")
    return recognized_text
if __name__ == "__main__":
    # Manual test: transcribe test.m4a from this script's directory, using
    # the API key stored in config.json in the same directory.
    here = os.path.dirname(__file__)
    with open(os.path.join(here, "config.json")) as fh:
        api_key = json.load(fh)['OPENAI_KEY']
    print(transcribe(here, api_key, file="test.m4a"))

View file

@ -0,0 +1,14 @@
import subprocess
def speak(text, voice="de+f4"):
    """Speak *text* aloud through the ``espeak`` command-line tool.

    Args:
        text: The text to speak.
        voice: eSpeak voice name, passed via ``-v``.

    The command is run without a shell and with the text as its own
    argument, so quotes, ``$`` or backticks in the text cannot break the
    command line or run other commands.
    """
    print("###################")
    print("Generating audio from text")
    # "--" ends option parsing so that text starting with "-" is still
    # treated as the text to speak.
    cmd = ["espeak", f"-v{voice}", "-p20", "-s150", "--", text]
    try:
        subprocess.call(cmd)
    except FileNotFoundError:
        # Without a shell a missing executable raises instead of returning
        # a non-zero status; keep the best-effort behaviour and report it.
        print("espeak executable not found - is eSpeak installed?")
if __name__ == "__main__":
    # Manual test: speak a fixed German sentence with the "de+klatt2" voice.
    # The backslash at the end of the first line continues the string
    # literal onto the next line.
    speak("Murnau ist eine kleine Stadt in Bayern, Deutschland, bekannt für seine atemberaubende Natur, \
insbesondere den Murnauer Moos Nationalpark, sowie für seine malerische Architektur und deutsche Kultur.",
          voice="de+klatt2")

3
chatbot/requirements.txt Normal file
View file

@ -0,0 +1,3 @@
openai
pyaudio