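"""Voice assistant demo: a recorder thread streams microphone audio into a
queue while a second thread transcribes it with faster-whisper, waits for the
wake phrase "hey suse", answers the captured question with a local GPT4All
model, and speaks the reply through Coqui TTS."""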
import tempfile
import wave
import re
import time
import queue
import threading

import pyaudio
import torch
from pydub import AudioSegment
from faster_whisper import WhisperModel
from TTS.api import TTS
from gpt4all import GPT4All
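
# Audio capture parameters: 16 kHz mono, 16-bit PCM, read in 1024-frame
# chunks; each transcription window covers DURATION seconds.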
CHUNK = 1024
FORMAT = pyaudio.paInt16  # 16-bit resolution
CHANNELS = 1
RATE = 16000  # sample rate
DURATION = 5

# Fuzzy match for the wake word "suse" (tolerates stretched letters).
SUSE = r"s+u+s+e+"
# Whisper often hallucinates "Thank you" on near-silence; this strips it.
THANK = r"Thank\s*(?:you|u)\b"
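
# State shared between the recorder and transcriber threads:
# g_active - the wake word was heard and a question is being captured
# g_wait   - playback is in progress, so captured audio should be discarded
# counter  - transcription windows seen since the wake word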
g_active = False
g_wait = False
g_lock = threading.Lock()
counter = 0
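
# One PyAudio instance backs both streams. Playback runs at 24 kHz (the rate
# the TTS voice is expected to produce); capture runs at RATE.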
p_audio = pyaudio.PyAudio()
playback_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=24000, output=True)
record_stream = p_audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)


def record_audio():
    """Producer thread: continuously push microphone chunks onto audio_queue."""
    global p_audio
    global g_active
    global g_wait
    print("Recording started...")
    while True:
        if g_active:
            print("c", end="")
        if g_wait:
            # Playback in progress: drop buffered audio so the assistant does
            # not transcribe its own voice, then poll again.
            print("w", end="")
            while not audio_queue.empty():
                audio_queue.get()
            time.sleep(1)
            continue
        audio_data = record_stream.read(CHUNK)
        audio_queue.put(audio_data)
        print(".", end="")

    # Cleanup; only reached if the loop above gains a break condition.
    record_stream.stop_stream()
    record_stream.close()
    p_audio.terminate()
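
# Consumer thread: drains DURATION seconds of audio from the queue, writes it
# to a temporary WAV, converts it to MP3, and transcribes it. While idle it
# scans segments for the wake phrase; once active it accumulates a question,
# asks the local LLM, and speaks the answer.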
def speech_to_text():
    global p_audio
    global g_active
    global g_wait
    global counter  # reassigned below, so the declaration is required
    while True:
        # NamedTemporaryFile(delete=True) is reopened by name below, which
        # works on POSIX systems.
        tf = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
        mp3_tf = tempfile.NamedTemporaryFile(suffix=".mp3", delete=True, mode='wb')
        with wave.open(tf.name, 'wb') as wav_file:
            wav_file.setnchannels(CHANNELS)
            wav_file.setsampwidth(p_audio.get_sample_size(FORMAT))
            wav_file.setframerate(RATE)
            # Read audio data from the queue for the specified duration
            for _ in range(0, RATE // CHUNK * DURATION):
                print("r", end="")
                audio_data = audio_queue.get()
                wav_file.writeframes(audio_data)
                audio_queue.task_done()
        print(f"{DURATION} sec recording done.")
        # Perform speech recognition
        audio = AudioSegment.from_wav(tf.name)
        audio.export(mp3_tf.name, format="mp3")
        # segments, info = model.transcribe(mp3_tf.name, beam_size=5)
        segments, _ = model.transcribe(mp3_tf.name)
        questions = []
        if g_active:
            # Wake word already heard: collect this window's text as the question.
            counter += 1
            for segment in segments:
                print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
                if segment.text:
                    questions.append(segment.text)
            q = re.sub(THANK, "", " ".join(questions))
            print(f"Question: {q} counter: {counter}")
            if len(q) > 40 and counter > 3:
                # Enough material captured: answer, speak, and return to idle.
                counter = 0
                output = gpt_model.generate(" ".join(questions), max_tokens=50)
                print(f"Answer: {output}")
                reply_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=True, mode='wb')
                tts.tts_to_file(text=output, file_path=reply_wav.name)
                play(playback_stream, reply_wav.name)
                with g_lock:
                    g_active = False
                time.sleep(5)
            continue
        # Idle: look for the wake phrase ("hey" followed by something like "suse").
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            text_input = segment.text.lower()
            if "hey" in text_input and re.search(SUSE, text_input):
                counter = 1
                with g_lock:
                    g_active = True
                    g_wait = True
                play(playback_stream, "data/audio/suse_intro.wav")
                print("Finish suse")
                with g_lock:
                    g_wait = False
                time.sleep(5)
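
# Playback helper shared by the wake-word intro clip and the TTS replies.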
def play(play_stream, filename):
    """Stream a WAV file to the given output stream, CHUNK frames at a time."""
    wave_file = wave.open(filename, 'rb')
    print(f"Wave: rate={wave_file.getframerate()} channels={wave_file.getnchannels()} width={wave_file.getsampwidth()}")
    out_data = wave_file.readframes(CHUNK)
    while out_data:
        play_stream.write(out_data)
        out_data = wave_file.readframes(CHUNK)
    wave_file.close()
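
# One-time setup: device selection, TTS voice, local LLM, shared audio queue,
# and the faster-whisper STT model.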
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Init TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
tts = TTS("tts_models/en/blizzard2013/capacitron-t2-c150_v2").to(device)

# Local LLM used to answer questions
gpt_model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

# Create a queue to share audio data between threads
audio_queue = queue.Queue()

# model_size = "large-v2"
model_size = "small.en"
# model_size = "tiny.en"

# Run on CPU with INT8 (use device="cuda", compute_type="float16" for GPU FP16)
model = WhisperModel(model_size, device="cpu", compute_type="int8")
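
# Start the pipeline: one thread records, the other transcribes and replies.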
print(f"Stream: playback->{playback_stream.get_write_available()}")
|
||
|
# Create and start the recording thread
|
||
|
recording_thread = threading.Thread(target=record_audio)
|
||
|
recording_thread.start()
|
||
|
|
||
|
# Create and start the speech-to-text thread
|
||
|
speech_to_text_thread = threading.Thread(target=speech_to_text)
|
||
|
speech_to_text_thread.start()
|
||
|
|
||
|
# Wait for the recording thread to finish (you can define conditions to stop the recording)
|
||
|
recording_thread.join()
|
||
|
|
||
|
# Stop the speech-to-text thread
|
||
|
speech_to_text_thread.join()
|
||
|
|
||
|
p.terminate()
|