From c6e0a995bea089756229d3d50cf3e976626e621b Mon Sep 17 00:00:00 2001
From: Lars Haferkamp <>
Date: Fri, 21 Apr 2023 22:17:24 +0200
Subject: [PATCH] initial version of voice interaction with ChatGPT

---
 chatbot/.gitignore                 | 12 +++++
 chatbot/.python-version            |  1 +
 chatbot/Readme.md                  | 80 ++++++++++++++++++++++++++++++
 chatbot/chat/chatgpt.py            | 25 ++++++++++
 chatbot/chat/config-sample.json    |  6 +++
 chatbot/chat/main.py               | 21 ++++++++
 chatbot/chat/record_audio.py       | 54 ++++++++++++++++++++
 chatbot/chat/speech_recognition.py | 25 ++++++++++
 chatbot/chat/speech_synthesis.py   | 14 ++++++
 chatbot/requirements.txt           |  3 ++
 10 files changed, 241 insertions(+)
 create mode 100644 chatbot/.gitignore
 create mode 100644 chatbot/.python-version
 create mode 100644 chatbot/Readme.md
 create mode 100644 chatbot/chat/chatgpt.py
 create mode 100644 chatbot/chat/config-sample.json
 create mode 100644 chatbot/chat/main.py
 create mode 100644 chatbot/chat/record_audio.py
 create mode 100644 chatbot/chat/speech_recognition.py
 create mode 100644 chatbot/chat/speech_synthesis.py
 create mode 100644 chatbot/requirements.txt

diff --git a/chatbot/.gitignore b/chatbot/.gitignore
new file mode 100644
index 0000000..32b2dbb
--- /dev/null
+++ b/chatbot/.gitignore
@@ -0,0 +1,12 @@
+# Python
+venv
+__pycache__
+
+# Project specific
+config.json
+
+# Media files, Zip files
+*.wav
+*.m4a
+*.mov
+*.zip
diff --git a/chatbot/.python-version b/chatbot/.python-version
new file mode 100644
index 0000000..cc1923a
--- /dev/null
+++ b/chatbot/.python-version
@@ -0,0 +1 @@
+3.8
diff --git a/chatbot/Readme.md b/chatbot/Readme.md
new file mode 100644
index 0000000..818c03b
--- /dev/null
+++ b/chatbot/Readme.md
@@ -0,0 +1,80 @@
+# Setup and run
+
+Within the `chatbot` directory (the one containing `requirements.txt`):
+
+    pyenv local 3.8
+    python -m venv venv
+    source venv/bin/activate
+    pip install -r requirements.txt
+
+Install ESpeak, see below
+
+**Set config:**
+
+In the `chat` folder, copy `config-sample.json` to a new file named `config.json` and set at least the OpenAI API key;
+the other values can be kept at their defaults.
+
+See section "Recording Audio" for how to find the correct device ID on a Raspberry Pi. You can execute `record_audio.py` to see a list of available devices with their IDs (it then makes a 5-second test recording from device 0).
+
+**Run:**
+
+Execute `main.py` from within the `chat` directory, e.g.
+
+    python -m main
+
+# Recording Audio 
+
+## Python package PyAudio
+
+Installed via pip
+
+For more examples, e.g. conversion into MP3 see
+https://realpython.com/playing-and-recording-sound-python/#recording-audio
+
+## Raspberry Pi
+
+Setup microphone and Device Id:
+
+https://makersportal.com/blog/2018/8/23/recording-audio-on-the-raspberry-pi-with-python-and-a-usb-microphone
+
+
+# Speech Synthesis
+
+## Raspberry/Linux: Pico TTS
+
+https://github.com/cyberbotics/picotts
+
+Pico TTS for Raspberry:
+https://cstan.io/post/2020/03/pico-tts-unter-raspbian-uebersetzen/
+
+
+## Mac
+
+### ESpeak
+See https://espeak.sourceforge.net/
+
+Improvement: MBROLA voices for eSpeak (https://github.com/numediart/MBROLA)
+
+
+Install
+
+    brew install espeak
+
+Test
+
+    espeak -vde -p20 -k20 "Hallo Welt"
+
+MBROLA for Mac: https://github.com/pettarin/setup-festival-mbrola
+
+### Maybe Pico TTS could run on Mac
+ https://github.com/cyberbotics/webots/wiki/Pico-Compilation
+
+
+# Audio for Raspberry 
+
+https://naomiproject.github.io/docs/
+
+
+# Similar projects
+
+https://github.com/anatolybazarov/oracle.sh
diff --git a/chatbot/chat/chatgpt.py b/chatbot/chat/chatgpt.py
new file mode 100644
index 0000000..5ffa4bd
--- /dev/null
+++ b/chatbot/chat/chatgpt.py
@@ -0,0 +1,25 @@
+import os
+import openai
+
+def chat(message, openai_key, shortAnswer=True):
+    print("###################")
+    openai.api_key = openai_key
+    if shortAnswer:
+        message = message + " - eine kurze Antwort bitte!"
+
+    print(f"Asking ChatGPT for an answer to: \n {message} \n ...")
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "user", "content": message}
+        ]
+    )
+    first_answer = completion.choices[0].message.content
+    print(f"Got answer from ChatGPT:\n {first_answer}")
+    return first_answer
+
+if __name__ == "__main__":
+    answer = chat("Erzähle mir was über Murnau")
+    print(answer)
+
+
diff --git a/chatbot/chat/config-sample.json b/chatbot/chat/config-sample.json
new file mode 100644
index 0000000..b3a1642
--- /dev/null
+++ b/chatbot/chat/config-sample.json
@@ -0,0 +1,6 @@
+{
+    "OPENAI_KEY": "xxxxxxxxx",
+    "MIC_DEVICE_ID": 0,
+    "SECONDS_RECORDING": 5,
+    "ESPEAK_VOICE": "de+f4"
+}
diff --git a/chatbot/chat/main.py b/chatbot/chat/main.py
new file mode 100644
index 0000000..40e4ec4
--- /dev/null
+++ b/chatbot/chat/main.py
@@ -0,0 +1,21 @@
+import os
+import json
+
+import speech_recognition
+import speech_synthesis
+import chatgpt
+import record_audio
+
+if __name__ == "__main__":
+    dirname = os.path.dirname(__file__)
+    with open(os.path.join(dirname, "config.json")) as config_file:
+        config = json.load(config_file)
+        openai_key=config['OPENAI_KEY']
+        record_audio.record(dirname, 
+                            device_id=config["MIC_DEVICE_ID"], 
+                            max_recording_time_s=config["SECONDS_RECORDING"])
+        transcribed = speech_recognition.transcribe(dirname, openai_key)
+        answer = chatgpt.chat(transcribed, openai_key)
+        #Idea: recognize language and pass it to speech synthesizer. Unfortunately the detected language currently not returned by the OpenAI API
+        speech_synthesis.speak(answer, voice=config["ESPEAK_VOICE"])
+
diff --git a/chatbot/chat/record_audio.py b/chatbot/chat/record_audio.py
new file mode 100644
index 0000000..44dd71f
--- /dev/null
+++ b/chatbot/chat/record_audio.py
@@ -0,0 +1,54 @@
+import os
+
+import pyaudio
+import wave
+
+def record(dirname, device_id=0, max_recording_time_s=3):
+    print("###################")
+    fname = os.path.join(dirname, "recorded.wav")
+
+    form_1 = pyaudio.paInt16 # 16-bit resolution
+    chans = 1 # 1 channel
+    samp_rate = 44100 # 44.1kHz sampling rate
+    chunk = 4096 # 2^12 samples for buffer
+    record_secs = max_recording_time_s # seconds to record
+
+    audio = pyaudio.PyAudio() # create pyaudio instantiation
+
+    # create pyaudio stream
+    stream = audio.open(format = form_1,rate = samp_rate,channels = chans, \
+                        input_device_index = device_id,input = True, \
+                        frames_per_buffer=chunk)
+    print(f"Recording via microphone for {max_recording_time_s} seconds")
+    frames = []
+
+    # loop through stream and append audio chunks to frame array
+    for ii in range(0,int((samp_rate/chunk)*record_secs)):
+        data = stream.read(chunk)
+        frames.append(data)
+
+    print("Finished recording")
+
+    # stop the stream, close it, and terminate the pyaudio instantiation
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+
+    # save the audio frames as .wav file
+    wavefile = wave.open(fname,'wb')
+    wavefile.setnchannels(chans)
+    wavefile.setsampwidth(audio.get_sample_size(form_1))
+    wavefile.setframerate(samp_rate)
+    wavefile.writeframes(b''.join(frames))
+    wavefile.close()
+
+def get_mics():
+    p = pyaudio.PyAudio()
+    for ii in range(p.get_device_count()):
+        device_name = p.get_device_info_by_index(ii).get('name')
+        print(f"{ii}: {device_name}")
+
+if __name__ == "__main__":
+    get_mics()
+    dirname = os.path.dirname(__file__)
+    record(dirname, device_id=0, max_recording_time_s=5)
diff --git a/chatbot/chat/speech_recognition.py b/chatbot/chat/speech_recognition.py
new file mode 100644
index 0000000..5694822
--- /dev/null
+++ b/chatbot/chat/speech_recognition.py
@@ -0,0 +1,25 @@
+import os
+import json
+
+import openai
+
+def transcribe(dirname, openai_key, file = "recorded.wav"):  
+    print("###################")  
+    openai.api_key = openai_key
+    fname = os.path.join(dirname, file)
+    audio_file = open(fname, "rb")
+    print(f"Transcribing audio via OpenAI Whisper ...")
+    transcript = openai.Audio.transcribe("whisper-1", audio_file)
+    recognized_text = transcript.text
+    print(f"Recognized text: \n > {recognized_text}")
+    return recognized_text
+
+if __name__ == "__main__":
+    dirname = os.path.dirname(__file__)
+    with open(os.path.join(dirname, "config.json")) as config_file:
+        config = json.load(config_file)
+        openai_key=config['OPENAI_KEY']
+        transcribed = transcribe(dirname, openai_key, file="test.m4a")
+        print(transcribed)
+
+
diff --git a/chatbot/chat/speech_synthesis.py b/chatbot/chat/speech_synthesis.py
new file mode 100644
index 0000000..1b52ebc
--- /dev/null
+++ b/chatbot/chat/speech_synthesis.py
@@ -0,0 +1,14 @@
+import subprocess
+
+def speak(text, voice="de+f4"):
+    print("###################")
+    print("Generating audio from text")
+    cmd = f'espeak -v{voice} -p20 -s150 "{text}"'
+    process_return = subprocess.call(cmd, shell=True)
+    #print(process_return)
+
+
+if __name__ == "__main__":
+    speak("Murnau ist eine kleine Stadt in Bayern, Deutschland, bekannt für seine atemberaubende Natur, \
+          insbesondere den Murnauer Moos Nationalpark, sowie für seine malerische Architektur und deutsche Kultur.",
+          voice="de+klatt2")
diff --git a/chatbot/requirements.txt b/chatbot/requirements.txt
new file mode 100644
index 0000000..b9ca856
--- /dev/null
+++ b/chatbot/requirements.txt
@@ -0,0 +1,3 @@
+openai
+pyaudio
+