# SUSE_audio_assistant/demo_assistant.py
#
# Minimal demo (MVP) of the concept of an AI audio assistant.
# Author: Alex Lau (AvengerMoJo) <alau@suse.com>, 2023-11-10
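#
# Pipeline: a recording thread pushes raw microphone frames onto a queue;
# a speech-to-text thread batches them into clips, transcribes them with
# faster-whisper, listens for the "hey ... suse" wake phrase, generates an
# answer with GPT4All, and speaks the reply through Coqui TTS.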


import re
import tempfile
import time
import queue
import threading
import wave

import pyaudio
import torch
from pydub import AudioSegment
from faster_whisper import WhisperModel
from TTS.api import TTS
from gpt4all import GPT4All
# Audio capture parameters
CHUNK = 1024
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1
RATE = 16000  # sample rate (Hz)
DURATION = 3  # seconds recorded per clip while idle (wake-word detection)
Q_DURATION = 7  # seconds recorded per clip while active (question capture)

# Loose wake-word pattern: matches "suse" and tolerates repeated letters (e.g. "suuse")
SUSE = r"s+u+s+e+"
# Strip "Thank you" artifacts that the recognizer tends to emit on near-silence
THANK = r"Thank\s*(?:you|u)\b"
g_active = False  # True while the assistant is capturing a question
g_wait = False  # True while the assistant itself is speaking
g_lock = threading.Lock()
counter = 0  # number of active-mode clips recorded since the wake word
p_audio = pyaudio.PyAudio()
# Playback opened at 24 kHz, matching the TTS model's expected output rate
playback_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=24000, output=True)
record_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
def record_audio():
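    """Producer thread: continuously read microphone frames into audio_queue.

    While g_wait is set (the assistant is speaking), buffered frames are
    drained and discarded so the assistant does not transcribe itself.
    """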
    global p_audio
    global g_active
    global g_wait
    print("Recording started...")
    while True:
        if g_active:
            print("c", end="")
        if g_wait:
            print("w", end="")
            # Drop any buffered audio while the assistant is talking
            while not audio_queue.empty():
                audio_queue.get()
            time.sleep(1)
            continue
        audio_data = record_stream.read(CHUNK, exception_on_overflow=False)
        audio_queue.put(audio_data)
        print(".", end="")
    # Unreachable while the loop above runs forever; kept for cleanup symmetry
    record_stream.stop_stream()
    record_stream.close()
    p_audio.terminate()
def speech_to_text():
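    """Consumer thread: batch queued frames into clips and transcribe them.

    Idle mode records DURATION-second clips and scans them for the wake
    phrase ("hey" plus "suse"); active mode records Q_DURATION-second clips,
    accumulates the question, answers it with GPT4All, and speaks the reply.
    """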
    global p_audio
    global g_active
    global g_wait
    global counter
    while True:
        # Record longer clips while a question is being captured
        if g_active:
            time_duration = Q_DURATION
        else:
            time_duration = DURATION
        tf = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
        mp3_tf = tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb')
        with wave.open(tf.name, 'wb') as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))
            wav_file.setframerate(RATE)
            # Read audio data from the queue for the specified duration
            for _ in range(0, RATE // CHUNK * time_duration):
                print("r", end="")
                audio_data = audio_queue.get()
                wav_file.writeframes(audio_data)
                audio_queue.task_done()
        print(f"{time_duration} sec recording done.")
        # Perform speech recognition
        audio = AudioSegment.from_wav(tf.name)
        audio.export(mp3_tf.name, format="mp3")
        # segments, info = model.transcribe(mp3_tf.name, beam_size=5)
        segments, _ = model.transcribe(mp3_tf.name)
        questions = []
        if g_active:
            counter += 1
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                if segment.text:
                    questions.append(segment.text)
            q = re.sub(THANK, "", " ".join(questions))
            print(f"Question: {q} counter: {counter}")
            # Only answer once a long enough question has accumulated
            if len(q) > 40 and counter >= 2:
                counter = 0
                output = gpt_model.generate(" ".join(questions), max_tokens=50)
                print(f"Answer: {output}")
                reply_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
                tts.tts_to_file(text=output, file_path=reply_wav.name)
                play(playback_stream, reply_wav.name)
                with g_lock:
                    g_active = False
                time.sleep(5)
            continue
        # Idle: scan the transcription for the wake phrase
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            text_input = segment.text.lower()
            if text_input.find("hey") != -1:
                if re.search(SUSE, text_input):
                    counter = 1
                    with g_lock:
                        g_active = True
                        g_wait = True
                    play(playback_stream, "data/audio/suse_intro.wav")
                    print("Finish suse")
                    with g_lock:
                        g_wait = False
                    time.sleep(5)
def play(play_stream, filename):
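    """Stream a WAV file to the given PyAudio output stream in CHUNK-sized reads."""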
    wave_file = wave.open(filename, 'rb')
    print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")
    out_data = wave_file.readframes(CHUNK)
    while out_data:
        play_stream.write(out_data)
        out_data = wave_file.readframes(CHUNK)
    wave_file.close()
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Init TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)
gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
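# Note: GPT4All fetches the named .gguf model file on first run if it is not cached locally.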
# Create a queue to share audio data between threads
audio_queue = queue.Queue()
# model_size = "large-v2"
model_size = "small.en"
# model_size = "tiny.en"
# Run on CPU with INT8 quantization (use device="cuda", compute_type="float16" for GPU)
model = WhisperModel(model_size, device="cpu", compute_type="int8")
print(f"Stream: playback->{playback_stream.get_write_available()}")
# Create and start the recording thread
recording_thread = threading.Thread(target=record_audio)
recording_thread.start()
# Create and start the speech-to-text thread
speech_to_text_thread = threading.Thread(target=speech_to_text)
speech_to_text_thread.start()
# Wait for the recording thread to finish (you can define conditions to stop the recording)
recording_thread.join()
# Stop the speech-to-text thread
speech_to_text_thread.join()
p_audio.terminate()