initial version of voice interaction with ChatGPT
parent 6fdab94f1b
commit c6e0a995be

10 changed files with 241 additions and 0 deletions

chatbot/.gitignore (new file, vendored, 12 lines)
@@ -0,0 +1,12 @@
# Python
venv
__pycache__

# Project specific
config.json

# Media files, Zip files
*.wav
*.m4a
*.mov
*.zip

chatbot/.python-version (new file, 1 line)
@@ -0,0 +1 @@
3.8

chatbot/Readme.md (new file, 80 lines)
@@ -0,0 +1,80 @@
# Setup and run

Within the `chat` directory:

    pyenv local 3.8
    python -m venv venv
    source venv/bin/activate
    pip install -r requirements.txt

Install ESpeak, see the "Speech Synthesis" section below.

**Set config:**

In the `chat` folder, copy the file `config-sample.json` to a new file named `config.json` and set at least the OpenAI API key; the other values can be kept at their defaults.

See the section "Recording Audio" for getting the correct device ID on a Raspberry Pi. You can execute `record_audio.py` to see a list of available devices with their IDs.
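
For reference, the sample config added in this commit (`chat/config-sample.json`) looks like this; `MIC_DEVICE_ID` selects the input device and `SECONDS_RECORDING` sets the recording length in seconds:

    {
        "OPENAI_KEY": "xxxxxxxxx",
        "MIC_DEVICE_ID": 0,
        "SECONDS_RECORDING": 5,
        "ESPEAK_VOICE": "de+f4"
    }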

**Run:**

Execute `main.py`, e.g.

    python -m main

# Recording Audio

## Python package PyAudio

Installed via pip.

For more examples, e.g. conversion to MP3, see
https://realpython.com/playing-and-recording-sound-python/#recording-audio
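
A minimal sketch for listing the available devices and their IDs with PyAudio (this mirrors the `get_mics()` helper in `record_audio.py`):

    import pyaudio

    # print every audio device PyAudio can see, with its index;
    # use your microphone's index as MIC_DEVICE_ID in config.json
    p = pyaudio.PyAudio()
    for i in range(p.get_device_count()):
        print(f"{i}: {p.get_device_info_by_index(i).get('name')}")
    p.terminate()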

## Raspberry Pi

Set up the microphone and find its device ID:

https://makersportal.com/blog/2018/8/23/recording-audio-on-the-raspberry-pi-with-python-and-a-usb-microphone
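
To quickly verify that the USB microphone is detected on the Pi (assuming ALSA's `arecord` utility is installed):

    arecord -l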

# Speech Synthesis

## Raspberry/Linux: Pico TTS

https://github.com/cyberbotics/picotts

Pico TTS for the Raspberry Pi:
https://cstan.io/post/2020/03/pico-tts-unter-raspbian-uebersetzen/

## Mac

### ESpeak

See https://espeak.sourceforge.net/

Possible improvement: MBROLA voices for ESpeak (https://github.com/numediart/MBROLA)

Install:

    brew install espeak

Test:

    espeak -vde -p20 -k20 "Hallo Welt"

MBROLA for Mac: https://github.com/pettarin/setup-festival-mbrola
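
Within this project, ESpeak is driven from Python via `subprocess` (see `speech_synthesis.py` in this commit); a minimal sketch of that call:

    import subprocess

    # speak a sentence with the German female voice "de+f4",
    # pitch 20 and speed 150 (the defaults used by speech_synthesis.py)
    subprocess.call(["espeak", "-vde+f4", "-p20", "-s150", "Hallo Welt"])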

### Maybe Pico TTS could run on Mac

https://github.com/cyberbotics/webots/wiki/Pico-Compilation

# Audio for Raspberry

https://naomiproject.github.io/docs/

# Similar projects

https://github.com/anatolybazarov/oracle.sh

chatbot/chat/chatgpt.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import openai


def chat(message, openai_key, shortAnswer=True):
    print("###################")
    openai.api_key = openai_key
    if shortAnswer:
        # append a request for brevity (German: "a short answer, please!")
        message = message + " - eine kurze Antwort bitte!"

    print(f"Asking ChatGPT for an answer to: \n {message} \n ...")
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": message}
        ]
    )
    first_answer = completion.choices[0].message.content
    print(f"Got answer from ChatGPT:\n {first_answer}")
    return first_answer


if __name__ == "__main__":
    # ad-hoc test (German: "Tell me something about Murnau")
    answer = chat("Erzähle mir was über Murnau")
    print(answer)

chatbot/chat/config-sample.json (new file, 6 lines)
@@ -0,0 +1,6 @@
{
    "OPENAI_KEY": "xxxxxxxxx",
    "MIC_DEVICE_ID": 0,
    "SECONDS_RECORDING": 5,
    "ESPEAK_VOICE": "de+f4"
}

chatbot/chat/main.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import os
import json

import speech_recognition
import speech_synthesis
import chatgpt
import record_audio


if __name__ == "__main__":
    # load the configuration: API key, mic device, recording length, voice
    dirname = os.path.dirname(__file__)
    with open(os.path.join(dirname, "config.json")) as config_file:
        config = json.load(config_file)
    openai_key = config["OPENAI_KEY"]

    # record from the microphone, transcribe, ask ChatGPT, speak the answer
    record_audio.record(dirname,
                        device_id=config["MIC_DEVICE_ID"],
                        max_recording_time_s=config["SECONDS_RECORDING"])
    transcribed = speech_recognition.transcribe(dirname, openai_key)
    answer = chatgpt.chat(transcribed, openai_key)
    # Idea: recognize the language and pass it to the speech synthesizer.
    # Unfortunately the detected language is currently not returned by the OpenAI API.
    speech_synthesis.speak(answer, voice=config["ESPEAK_VOICE"])

chatbot/chat/record_audio.py (new file, 54 lines)
@@ -0,0 +1,54 @@
import os

import pyaudio
import wave


def record(dirname, device_id=0, max_recording_time_s=3):
    print("###################")
    fname = os.path.join(dirname, "recorded.wav")

    form_1 = pyaudio.paInt16            # 16-bit resolution
    chans = 1                           # 1 channel
    samp_rate = 44100                   # 44.1 kHz sampling rate
    chunk = 4096                        # 2^12 samples per buffer
    record_secs = max_recording_time_s  # seconds to record

    audio = pyaudio.PyAudio()           # create PyAudio instance

    # open a PyAudio input stream on the chosen device
    stream = audio.open(format=form_1, rate=samp_rate, channels=chans,
                        input_device_index=device_id, input=True,
                        frames_per_buffer=chunk)
    print(f"Recording via microphone for {max_recording_time_s} seconds")
    frames = []

    # loop through the stream and append audio chunks to the frame list
    for ii in range(0, int((samp_rate / chunk) * record_secs)):
        data = stream.read(chunk)
        frames.append(data)

    print("Finished recording")

    # stop the stream, close it, and terminate the PyAudio instance
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # save the audio frames as a .wav file
    wavefile = wave.open(fname, 'wb')
    wavefile.setnchannels(chans)
    wavefile.setsampwidth(audio.get_sample_size(form_1))
    wavefile.setframerate(samp_rate)
    wavefile.writeframes(b''.join(frames))
    wavefile.close()


def get_mics():
    # list all audio devices PyAudio can see, with their indices
    p = pyaudio.PyAudio()
    for ii in range(p.get_device_count()):
        device_name = p.get_device_info_by_index(ii).get('name')
        print(f"{ii}: {device_name}")


if __name__ == "__main__":
    get_mics()
    dirname = os.path.dirname(__file__)
    record(dirname, device_id=0, max_recording_time_s=5)

chatbot/chat/speech_recognition.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import os
import json

import openai


def transcribe(dirname, openai_key, file="recorded.wav"):
    print("###################")
    openai.api_key = openai_key
    fname = os.path.join(dirname, file)
    print("Transcribing audio via OpenAI Whisper ...")
    # send the recorded audio to the Whisper API and return the recognized text
    with open(fname, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)
    recognized_text = transcript.text
    print(f"Recognized text: \n > {recognized_text}")
    return recognized_text


if __name__ == "__main__":
    dirname = os.path.dirname(__file__)
    with open(os.path.join(dirname, "config.json")) as config_file:
        config = json.load(config_file)
    openai_key = config["OPENAI_KEY"]
    transcribed = transcribe(dirname, openai_key, file="test.m4a")
    print(transcribed)

chatbot/chat/speech_synthesis.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import subprocess


def speak(text, voice="de+f4"):
    print("###################")
    print("Generating audio from text")
    # call espeak with the given voice, pitch 20 and speed 150;
    # passing an argument list (instead of a shell string) keeps
    # quotes and other shell metacharacters in the text intact
    subprocess.call(["espeak", f"-v{voice}", "-p20", "-s150", text])


if __name__ == "__main__":
    # ad-hoc test: speak a German sentence about Murnau with the "klatt2" voice variant
    speak("Murnau ist eine kleine Stadt in Bayern, Deutschland, bekannt für seine atemberaubende Natur, "
          "insbesondere den Murnauer Moos Nationalpark, sowie für seine malerische Architektur und deutsche Kultur.",
          voice="de+klatt2")

chatbot/requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
openai
pyaudio