"""Voice assistant loop: microphone -> Whisper -> GPT4All -> TTS -> speaker.

Two threads cooperate through ``audio_queue``:

* ``record_audio`` reads fixed-size chunks from the microphone stream and
  enqueues them.
* ``speech_to_text`` dequeues a time window of chunks, transcribes it and
  either looks for the wake phrase (idle mode) or treats the transcript as a
  question for the language model (active mode).
"""

import io
import tempfile
import pyaudio
from pydub import AudioSegment
import wave
import re
import time
import queue
import threading
from transformers import pipeline
from datasets import load_dataset
from faster_whisper import WhisperModel
import torch
from TTS.api import TTS
from gpt4all import GPT4All
from audio_utils import AudioSplit

CHUNK = 1024  # frames read from the microphone per call
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1
RATE = 16000  # sample rate
PLAYBACK_RATE = 24000  # sample rate the playback stream is opened with
DURATION = 3  # seconds per window while waiting for the wake phrase
Q_DURATION = 7  # seconds per window while listening for a question
# NOTE(review): every quantified character here is optional, so this matches
# any text containing "e" (including "hey" itself) - confirm that is intended.
SUSE = r"s*u*s*e"
THANK = r"Thank\s*(?:you|u)\b"

g_active = False  # True while the assistant is listening for a question
g_wait = False  # True while the recorder should discard microphone input
g_lock = threading.Lock()  # guards writes to g_active / g_wait
counter = 0  # number of windows processed since the wake phrase was heard

p_audio = pyaudio.PyAudio()
playback_stream = p_audio.open(format=FORMAT,
                               channels=CHANNELS,
                               rate=PLAYBACK_RATE,
                               output=True)
record_stream = p_audio.open(format=FORMAT,
                             channels=CHANNELS,
                             rate=RATE,
                             input=True,
                             frames_per_buffer=CHUNK)


def _drain_audio_queue():
    """Discard every chunk currently waiting in ``audio_queue``."""
    # get_nowait() instead of empty()+get(): the consumer thread may take the
    # last item between the two calls, which would block the recorder forever.
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def record_audio():
    """Continuously read microphone chunks and push them onto ``audio_queue``.

    While ``g_wait`` is set the queue is emptied and nothing is recorded, the
    loop just re-checks once per second. The loop never exits on its own; the
    microphone stream is closed only if reading raises.
    """
    print("Recording started...")
    try:
        while True:
            if g_active:
                print("c", end="")
            if g_wait:
                print("w", end="")
                _drain_audio_queue()
                time.sleep(1)
                continue
            audio_data = record_stream.read(CHUNK, exception_on_overflow=False)
            audio_queue.put(audio_data)
            print(".", end="")
    finally:
        # PyAudio itself is terminated once, by the main thread, after both
        # worker threads have been joined.
        record_stream.stop_stream()
        record_stream.close()


def _print_segment(segment):
    """Print one transcription segment with its start/end timestamps."""
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))


def _transcribe_window(time_duration):
    """Record ``time_duration`` seconds from the queue and transcribe them.

    The queued chunks are written to a temporary WAV file, converted to MP3
    and passed to the Whisper model.

    Returns:
        A list of the segments produced by ``model.transcribe``.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb') as tf, \
            tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb') as mp3_tf:
        with wave.open(tf.name, 'wb') as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))
            wav_file.setframerate(RATE)
            # Read audio data from the queue for the specified duration
            for _ in range(RATE // CHUNK * time_duration):
                print("r", end="")
                audio_data = audio_queue.get()
                wav_file.writeframes(audio_data)
                audio_queue.task_done()
        print(f"{time_duration} sec recording done.")

        # Perform speech recognition
        audio = AudioSegment.from_wav(tf.name)
        audio.export(mp3_tf.name, format="mp3")
        segments, _ = model.transcribe(mp3_tf.name, beam_size=5)
        # Materialise the segments before the temporary files are removed;
        # transcription is performed while the result is iterated.
        return list(segments)


def _handle_question(segments):
    """Treat the transcript as a question and speak the model's answer.

    An answer is generated only when the transcript (with "Thank you"
    removed) is longer than 40 characters and at least two windows have been
    processed since the wake phrase. After answering, the assistant returns
    to idle mode.
    """
    global g_active, counter

    counter += 1
    questions = []
    for segment in segments:
        _print_segment(segment)
        if segment.text:
            questions.append(segment.text)

    heard = " ".join(questions)
    q = re.sub(THANK, "", heard)
    print(f"Question:{q} counter{counter}")
    if len(q) <= 40 or counter < 2:
        return

    counter = 0
    # NOTE(review): the prompt is the unfiltered transcript, not ``q`` -
    # confirm whether the "Thank you" removal should apply here as well.
    output = gpt_model.generate(heard, max_tokens=50)
    print(f"Answer:{output}")
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb') as reply_wav:
        tts.tts_to_file(text=output, file_path=reply_wav.name)
        play(playback_stream, reply_wav.name)
    with g_lock:
        g_active = False
    time.sleep(5)


def _handle_wake_word(segments):
    """Scan the transcript for the wake phrase and switch to active mode.

    On a match the intro clip is played; recording is paused (``g_wait``)
    for the duration of the clip.
    """
    global g_active, g_wait, counter

    for segment in segments:
        _print_segment(segment)
        text_input = segment.text.lower()
        if "hey" not in text_input:
            continue
        if not re.search(SUSE, text_input):
            continue
        counter = 1
        with g_lock:
            g_active = True
            g_wait = True
        play(playback_stream, "data/audio/suse_intro.wav")
        print("Finish suse")
        with g_lock:
            g_wait = False
        time.sleep(5)


def speech_to_text():
    """Consume recorded audio forever, alternating wake-word and Q&A handling.

    Each iteration transcribes one window (``Q_DURATION`` seconds when active,
    ``DURATION`` otherwise) and dispatches the segments according to the
    current mode.
    """
    while True:
        time_duration = Q_DURATION if g_active else DURATION
        segments = _transcribe_window(time_duration)
        if g_active:
            _handle_question(segments)
        else:
            _handle_wake_word(segments)


def play(play_stream, filename):
    """Write the frames of the WAV file ``filename`` to ``play_stream``.

    Args:
        play_stream: An object with a ``write(bytes)`` method (the PyAudio
            output stream in this script).
        filename: Path of the WAV file to play.
    """
    with wave.open(filename, 'rb') as wave_file:
        print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")
        out_data = wave_file.readframes(CHUNK)
        while out_data:
            play_stream.write(out_data)
            out_data = wave_file.readframes(CHUNK)


# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Init TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)

gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

# Create a queue to share audio data between threads
audio_queue = queue.Queue()

# model_size = "large-v2"
model_size = "small.en"
# model_size = "tiny.en"

# Whisper runs on the CPU with int8 weights
model = WhisperModel(model_size, device="cpu", compute_type="int8")

print(f"Stream: playback->{playback_stream.get_write_available()}")

# Create and start the recording thread
recording_thread = threading.Thread(target=record_audio)
recording_thread.start()

# Create and start the speech-to-text thread
speech_to_text_thread = threading.Thread(target=speech_to_text)
speech_to_text_thread.start()

# Wait for the recording thread to finish (you can define conditions to stop the recording)
recording_thread.join()

# Wait for the speech-to-text thread to finish
speech_to_text_thread.join()

p_audio.terminate()