# SUSE_audio_assistant/demo_assistant.py
#
# Minimal demo (MVP) of the concept of an AI audio assistant.
# Author: Alex Lau (AvengerMoJo) <alau@suse.com>, 2023-11-10
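#
# Pipeline: a recording thread pushes raw microphone frames onto a queue;
# a speech-to-text thread batches them into clips, transcribes them with
# faster-whisper, listens for the "hey ... suse" wake phrase, generates an
# answer with GPT4All, and speaks the reply through Coqui TTS.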


import re
import tempfile
import time
import queue
import threading
import wave

import pyaudio
import torch
from pydub import AudioSegment
from faster_whisper import WhisperModel
from TTS.api import TTS
from gpt4all import GPT4All
# Audio capture parameters
CHUNK = 1024
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1
RATE = 16000  # sample rate (Hz)
DURATION = 3  # seconds recorded per clip while idle (wake-word detection)
Q_DURATION = 7  # seconds recorded per clip while active (question capture)

# Loose wake-word pattern: matches "suse" and tolerates repeated letters (e.g. "suuse")
SUSE = r"s+u+s+e+"
# Strip "Thank you" artifacts that the recognizer tends to emit on near-silence
THANK = r"Thank\s*(?:you|u)\b"
g_active = False  # True while the assistant is capturing a question
g_wait = False  # True while the assistant itself is speaking
g_lock = threading.Lock()
counter = 0  # number of active-mode clips recorded since the wake word
p_audio = pyaudio.PyAudio()
# Playback opened at 24 kHz, matching the TTS model's expected output rate
playback_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=24000, output=True)
record_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
def record_audio():
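    """Producer thread: continuously read microphone frames into audio_queue.

    While g_wait is set (the assistant is speaking), buffered frames are
    drained and discarded so the assistant does not transcribe itself.
    """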
    global p_audio
    global g_active
    global g_wait
    print("Recording started...")
    while True:
        if g_active:
            print("c", end="")
        if g_wait:
            print("w", end="")
            # Drop any buffered audio while the assistant is talking
            while not audio_queue.empty():
                audio_queue.get()
            time.sleep(1)
            continue
        audio_data = record_stream.read(CHUNK, exception_on_overflow=False)
        audio_queue.put(audio_data)
        print(".", end="")
    # Unreachable while the loop above runs forever; kept for cleanup symmetry
    record_stream.stop_stream()
    record_stream.close()
    p_audio.terminate()
def speech_to_text():
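    """Consumer thread: batch queued frames into clips and transcribe them.

    Idle mode records DURATION-second clips and scans them for the wake
    phrase ("hey" plus "suse"); active mode records Q_DURATION-second clips,
    accumulates the question, answers it with GPT4All, and speaks the reply.
    """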
    global p_audio
    global g_active
    global g_wait
    global counter
    while True:
        # Record longer clips while a question is being captured
        if g_active:
            time_duration = Q_DURATION
        else:
            time_duration = DURATION
        tf = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
        mp3_tf = tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb')
        with wave.open(tf.name, 'wb') as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))
            wav_file.setframerate(RATE)
            # Read audio data from the queue for the specified duration
            for _ in range(0, RATE // CHUNK * time_duration):
                print("r", end="")
                audio_data = audio_queue.get()
                wav_file.writeframes(audio_data)
                audio_queue.task_done()
        print(f"{time_duration} sec recording done.")
        # Perform speech recognition
        audio = AudioSegment.from_wav(tf.name)
        audio.export(mp3_tf.name, format="mp3")
        # segments, info = model.transcribe(mp3_tf.name, beam_size=5)
        segments, _ = model.transcribe(mp3_tf.name)
        questions = []
        if g_active:
            counter += 1
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                if segment.text:
                    questions.append(segment.text)
            q = re.sub(THANK, "", " ".join(questions))
            print(f"Question: {q} counter: {counter}")
            # Only answer once a long enough question has accumulated
            if len(q) > 40 and counter >= 2:
                counter = 0
                output = gpt_model.generate(" ".join(questions), max_tokens=50)
                print(f"Answer: {output}")
                reply_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
                tts.tts_to_file(text=output, file_path=reply_wav.name)
                play(playback_stream, reply_wav.name)
                with g_lock:
                    g_active = False
                time.sleep(5)
            continue
        # Idle: scan the transcription for the wake phrase
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            text_input = segment.text.lower()
            if text_input.find("hey") != -1:
                if re.search(SUSE, text_input):
                    counter = 1
                    with g_lock:
                        g_active = True
                        g_wait = True
                    play(playback_stream, "data/audio/suse_intro.wav")
                    print("Finish suse")
                    with g_lock:
                        g_wait = False
                    time.sleep(5)
def play(play_stream, filename):
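    """Stream a WAV file to the given PyAudio output stream in CHUNK-sized reads."""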
    wave_file = wave.open(filename, 'rb')
    print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")
    out_data = wave_file.readframes(CHUNK)
    while out_data:
        play_stream.write(out_data)
        out_data = wave_file.readframes(CHUNK)
    wave_file.close()
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Init TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)
gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
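# Note: GPT4All fetches the named .gguf model file on first run if it is not cached locally.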
# Create a queue to share audio data between threads
audio_queue = queue.Queue()
# model_size = "large-v2"
model_size = "small.en"
# model_size = "tiny.en"
# Run on CPU with INT8 quantization (use device="cuda", compute_type="float16" for GPU)
model = WhisperModel(model_size, device="cpu", compute_type="int8")
print(f"Stream: playback->{playback_stream.get_write_available()}")
# Create and start the recording thread
recording_thread = threading.Thread(target=record_audio)
recording_thread.start()
# Create and start the speech-to-text thread
speech_to_text_thread = threading.Thread(target=speech_to_text)
speech_to_text_thread.start()
# Wait for the recording thread to finish (you can define conditions to stop the recording)
recording_thread.join()
# Stop the speech-to-text thread
speech_to_text_thread.join()
p_audio.terminate()