SUSE_audio_assistant/test/fast_whisper2.py

import io
import tempfile
import pyaudio
from pydub import AudioSegment
import wave
import re
import time

import queue
import threading
from transformers import pipeline
from datasets import load_dataset
from faster_whisper import WhisperModel
import torch
from TTS.api import TTS
from gpt4all import GPT4All

from audio_utils import AudioSplit


# Microphone capture format: 16 kHz, mono, 16-bit integer samples.
RATE = 16000
CHANNELS = 1
FORMAT = pyaudio.paInt16
# Number of frames requested per stream read (and per WAV read on playback).
CHUNK = 1024
# Nominal length, in seconds, of each audio window sent to the recogniser.
DURATION = 5
# Wake-word pattern, searched in lower-cased transcripts.  Every letter of
# "suse" must occur at least once (repeated letters are tolerated).  The
# previous form, s*u*s*e, made the first three letters optional, so any text
# containing the letter "e" counted as a match.
SUSE = r"s+u+s+e"
# Matches "Thank you" / "Thank u" so the phrase can be stripped from a transcript.
THANK = r"Thank\s*(?:you|u)\b"


# State shared between the recording and recognition threads.
# g_lock is held by the code that flips the two flags below.
g_lock = threading.Lock()
# g_active: set once the wake word has been heard, cleared after a reply.
# g_wait:   set while the intro clip is playing so queued audio is discarded.
g_active = g_wait = False
# Window counter, starts at zero.
# NOTE(review): confirm the function that updates it declares it global.
counter = 0

# Single PyAudio instance that owns both streams below.
p_audio = pyaudio.PyAudio()

# Output stream used for the intro clip and spoken replies (opened at 24 kHz).
playback_stream = p_audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=24000,
    output=True,
)

# Input stream for the microphone, buffered CHUNK frames at a time.
record_stream = p_audio.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=RATE,
    input=True,
    frames_per_buffer=CHUNK,
)
def record_audio():
    """Feed microphone audio into ``audio_queue``; body of the recording thread.

    Loops forever.  On each pass it either reads one CHUNK-frame buffer from
    ``record_stream`` and queues it, or -- while the assistant is active and
    ``g_wait`` is set -- throws away everything queued so far and idles for a
    second before checking again.

    The input stream is stopped and closed if the loop is left by an
    exception.  The shared ``p_audio`` instance is deliberately not terminated
    here because the playback stream is opened on the same instance.
    """
    print("Recording started...")
    try:
        while True:
            if g_active:
                print("c", end="")
                if g_wait:
                    print("w", end="")
                    # Non-blocking drain: a blocking get() after an empty()
                    # check could hang if the queue emptied in between.
                    try:
                        while True:
                            audio_queue.get_nowait()
                    except queue.Empty:
                        pass
                    time.sleep(1)
                    continue
            audio_queue.put(record_stream.read(CHUNK))
            print(".", end="")
    finally:
        # Previously placed after the infinite loop, where it could never run.
        record_stream.stop_stream()
        record_stream.close()

def _transcribe_next_window():
    """Capture one window of queued audio and return its transcript segments.

    Takes ``RATE // CHUNK * DURATION`` buffers from ``audio_queue``, writes
    them to a temporary WAV file, converts that to MP3 and passes the MP3 to
    ``model.transcribe``.  Both temporary files are removed on return.

    Returns:
        list: the segments produced by the model for this window.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb') as tf, \
            tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb') as mp3_tf:
        with wave.open(tf.name, 'wb') as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))
            wav_file.setframerate(RATE)
            # Read audio data from the queue for the specified duration
            for _ in range(RATE // CHUNK * DURATION):
                print("r", end="")
                wav_file.writeframes(audio_queue.get())
                # One task_done() per get(), as the Queue protocol expects.
                audio_queue.task_done()
        print(f"{DURATION} sec recording done.")
        # Perform speech recognition
        AudioSegment.from_wav(tf.name).export(mp3_tf.name, format="mp3")
        segments, _ = model.transcribe(mp3_tf.name)
        # faster-whisper yields segments lazily; collect them while the
        # temporary files still exist.
        return list(segments)


def speech_to_text():
    """Transcribe audio windows forever; body of the recognition thread.

    While the assistant is idle, each window is scanned for the wake phrase
    ("hey" plus the SUSE pattern).  A hit marks the assistant active, plays
    the intro clip with ``g_wait`` set, then pauses for five seconds.

    While the assistant is active, each window's text (with the THANK phrase
    removed) is treated as a question.  Once it is longer than 40 characters
    and more than three windows have been counted, the question is sent to
    ``gpt_model``, the answer is synthesised with ``tts`` and played back, and
    the assistant returns to idle.
    """
    global g_active
    global g_wait
    # Declared global so the module-level counter is the one being updated
    # instead of an implicit function-local shadow.
    global counter
    while True:
        segments = _transcribe_next_window()
        if g_active:
            counter += 1
            questions = []
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                if segment.text:
                    questions.append(segment.text)
            q = re.sub(THANK, "", " ".join(questions))
            print(f"Question:{q} counter{counter}")
            if len(q) > 40 and counter > 3:
                counter = 0
                # Send the same filtered text that was printed and
                # length-checked above (the unfiltered text was sent before).
                output = gpt_model.generate(q, max_tokens=50)
                print(f"Answer:{output}")
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb') as reply_wav:
                    tts.tts_to_file(text=output, file_path=reply_wav.name)
                    play(playback_stream, reply_wav.name)
                with g_lock:
                    g_active = False
                time.sleep(5)
            continue
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            text_input = segment.text.lower()
            if "hey" in text_input and re.search(SUSE, text_input):
                counter = 1
                with g_lock:
                    g_active = True
                    g_wait = True
                play(playback_stream, "data/audio/suse_intro.wav")
                print("Finish suse")
                with g_lock:
                    g_wait = False
                time.sleep(5)

def play(play_stream, filename):
    """Play a WAV file through an already-open output stream.

    Args:
        play_stream: object exposing ``write(data)``; receives the raw frames.
        filename: path of the WAV file to play.

    The file's rate, channel count and sample width are printed but are not
    checked against the stream's own configuration.  The file is closed when
    playback finishes or fails.
    """
    with wave.open(filename, 'rb') as wave_file:
        print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")
        # Stream the file CHUNK frames at a time until readframes() returns b"".
        out_data = wave_file.readframes(CHUNK)
        while out_data:
            play_stream.write(out_data)
            out_data = wave_file.readframes(CHUNK)

# Hand-off buffer: the recording thread puts raw chunks, the recogniser takes them.
audio_queue = queue.Queue()

# Use CUDA for the TTS model when torch reports a usable GPU, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text-to-speech voice.
# Alternative voice: "tts_models/multilingual/multi-dataset/xtts_v2".
tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)

# Local language model that answers the transcribed question.
gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

# Whisper checkpoint to load; alternatives are "large-v2" and "tiny.en".
model_size = "small.en"
# The recogniser is pinned to the CPU with int8 computation, whatever `device` is.
model = WhisperModel(model_size, device="cpu", compute_type="int8")


print(f"Stream: playback->{playback_stream.get_write_available()}")

# Producer thread: microphone -> audio_queue.
recording_thread = threading.Thread(target=record_audio)
recording_thread.start()

# Consumer thread: audio_queue -> transcription -> reply.
speech_to_text_thread = threading.Thread(target=speech_to_text)
speech_to_text_thread.start()

# join() only waits; neither worker has a stop condition, so this blocks
# until each thread exits on its own (in practice, when it raises).
recording_thread.join()
speech_to_text_thread.join()

# Release the PyAudio instance.  This used to call p.terminate(), but no name
# `p` exists in this file; the instance is bound to `p_audio`.
p_audio.terminate()
Just MVT demo for the concept of AI audio assistant. Signed-off-by: Alex Lau (AvengerMoJo) <alau@suse.com> 2023-11-09 19:26:16 +01:00
			`import io`
			`import tempfile`
			`import pyaudio`
			`from pydub import AudioSegment`
			`import wave`
			`import re`
			`import time`

			`import queue`
			`import threading`
			`from transformers import pipeline`
			`from datasets import load_dataset`
			`from faster_whisper import WhisperModel`
			`import torch`
			`from TTS.api import TTS`
			`from gpt4all import GPT4All`

			`from audio_utils import AudioSplit`


			`CHUNK = 1024`
			`FORMAT = pyaudio.paInt16 # 16-bit resolution`
			`CHANNELS = 1`
			`RATE = 16000 # sample rate`
			`DURATION = 5`
			`SUSE = r"sus*e"`
			`THANK= r"Thank\s*(?:you\|u)\b"`


			`g_active = False`
			`g_wait = False`
			`g_lock = threading.Lock()`
			`counter = 0`

			`p_audio = pyaudio.PyAudio()`
			`playback_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=24000, output=True)`
			`record_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)`
			`def record_audio():`
			`global p_audio`
			`global g_active`
			`global g_wait`
			`print("Recording started...")`
			`while True:`
			`if g_active == True:`
			`print("c", end="")`
			`if g_wait == True:`
			`print("w", end="")`
			`while not audio_queue.empty():`
			`audio_queue.get()`
			`time.sleep(1)`
			`continue`
			`audio_data = record_stream.read(CHUNK)`
			`audio_queue.put(audio_data)`
			`print(".", end="")`

			`record_stream.stop_stream()`
			`record_stream.close()`
			`p_audio.terminate()`

			`def speech_to_text():`
			`global p_audio`
			`global g_active`
			`global g_wait`
			`while True:`
			`tf = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')`
			`mp3_tf = tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb')`
			`with wave.open(tf.name, 'wb') as wav_file:`
			`wav_file.setnchannels(CHANNELS)`
			`wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))`
			`wav_file.setframerate(RATE)`
			`# Read audio data from the stream for the specified duration`
			`for i in range(0, RATE // CHUNK * DURATION):`
			`print("r", end="")`
			`audio_data = audio_queue.get()`
			`wav_file.writeframes(audio_data)`
			`audio_queue.task_done()`
			`print(f"{DURATION} sec recording done.")`
			`# Perform speech recognition`
			`audio = AudioSegment.from_wav(tf.name)`
			`audio.export(mp3_tf.name, format="mp3")`
			`# segments, info = model.transcribe(mp3_tf_filename, beam_size=5)`
			`segments, _ = model.transcribe(mp3_tf.name)`
			`questions = []`
			`if g_active:`
			`counter += 1`
			`for segment in segments:`
			`print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))`
			`if segment.text:`
			`questions.append(segment.text)`
			`q = re.sub(THANK, "", " ".join(questions))`
			`print(f"Question:{q} counter{counter}")`
			`if len(q) > 40 and counter > 3:`
			`counter = 0`
			`output = gpt_model.generate(" ".join(questions), max_tokens=50)`
			`print(f"Answer:{output}")`
			`reply_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')`
			`tts.tts_to_file(text=output, file_path=reply_wav.name)`
			`play(playback_stream, reply_wav.name)`
			`with g_lock:`
			`g_active = False`
			`time.sleep(5)`
			`continue`
			`for segment in segments:`
			`print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))`
			`text_input = segment.text.lower()`
			`if text_input.find("hey") != -1:`
			`if re.search(SUSE, text_input):`
			`counter = 1`
			`with g_lock:`
			`g_active = True`
			`g_wait = True`
			`play(playback_stream, "data/audio/suse_intro.wav")`
			`print("Finish suse")`
			`with g_lock:`
			`g_wait = False`
			`time.sleep(5)`

			`def play(play_stream, filename):`
			`wave_file = wave.open(filename, 'rb')`
			`print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")`
			`out_data = wave_file.readframes(CHUNK)`
			`while out_data:`
			`play_stream.write(out_data)`
			`out_data = wave_file.readframes(CHUNK)`

			`# Get device`
			`device = "cuda" if torch.cuda.is_available() else "cpu"`

			`# Init TTS`
			`# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)`
			`tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)`
			`gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")`
			`# Create a queue to share audio data between threads`
			`audio_queue = queue.Queue()`
			`# model_size = "large-v2"`
			`model_size = "small.en"`
			`# model_size = "tiny.en"`

			`# Run on GPU with FP16`
			`model = WhisperModel(model_size, device="cpu", compute_type="int8")`


			`print(f"Stream: playback->{playback_stream.get_write_available()}")`
			`# Create and start the recording thread`
			`recording_thread = threading.Thread(target=record_audio)`
			`recording_thread.start()`

			`# Create and start the speech-to-text thread`
			`speech_to_text_thread = threading.Thread(target=speech_to_text)`
			`speech_to_text_thread.start()`

			`# Wait for the recording thread to finish (you can define conditions to stop the recording)`
			`recording_thread.join()`

			`# Stop the speech-to-text thread`
			`speech_to_text_thread.join()`

			`p.terminate()`