|
|
import datetime |
|
|
import json |
|
|
import time |
|
|
import pytz |
|
|
|
|
|
import gradio as gr |
|
|
from huggingface_hub import snapshot_download |
|
|
from omegaconf import OmegaConf |
|
|
from vosk import KaldiRecognizer, Model |
|
|
|
|
|
# Per-word confidence thresholds for accepting a detected wake phrase.
# THRESHOLD_G is the general threshold (first word of "EVAS GO"/"EVAS STOP",
# and always the second word of any phrase); THRESHOLD_H is a lower threshold
# used for the first word of the "HEY ..."/"HI ..." phrases.
THRESHOLD_G = 0.5
THRESHOLD_H = 0.34
|
|
|
|
|
|
|
|
def load_vosk(model_id: str):
    """Download the snapshot for *model_id* from the Hugging Face Hub and
    return it loaded as a ``vosk.Model``."""
    return Model(model_path=snapshot_download(model_id))
|
|
|
|
|
|
|
|
# Let configs/models.yaml instantiate Vosk models inline via the
# "${load_vosk:<model_id>}" resolver syntax.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

# Resolve the config to a plain dict. Entries are looked up as
# models_config[model_id]["model"] below, so each value presumably maps
# "model" to a loaded vosk.Model — verify against configs/models.yaml.
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
|
|
|
|
|
|
|
|
def automatic_speech_recognition(model_id: str, stream: "tuple | None", new_chunk: "tuple"):
    """Streaming wakeword detection callback for the Gradio audio widget.

    Args:
        model_id: Key into ``models_config`` selecting the Vosk model.
        stream: Per-session state threaded through ``gr.State`` — either
            ``None`` on the first chunk or the ``(rec, results, start_time)``
            tuple returned by the previous call.
        new_chunk: ``(sample_rate, audio_array)`` tuple from ``gr.Audio``
            in ``type="numpy"`` mode.

    Returns:
        ``((rec, results, start_time), output_text)`` — the updated session
        state and the newline-joined log of timestamped detections.
    """
    model = models_config[model_id]["model"]

    sample_rate, audio_array = new_chunk
    # Down-mix stereo to mono by keeping the first channel only.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]

    # Vosk consumes raw PCM bytes (dtype as delivered by Gradio —
    # presumably int16; confirm against the audio widget's output).
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a session: build a fresh recognizer.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)          # include per-word results (needed for "conf" below)
        rec.SetMaxAlternatives(0)   # single best hypothesis only
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Reset the recognizer if no detection happened for >10 s;
    # FinalResult() is called for its flush/reset side effect.
    if time.time() - start_time > 10:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        if text_result != "" and result["text"] != "<SIL>":
            for word in ["HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"]:
                if word in text_result:
                    print(result)

                    # Confidences of the first two recognized words.
                    # NOTE(review): this assumes the wake phrase is the first
                    # two words of the utterance and that result["result"] has
                    # at least two entries — an IndexError is possible if the
                    # phrase occurs later in a longer utterance; confirm.
                    conf_result = (round(result["result"][0]["conf"], 3), round(result["result"][1]["conf"], 3))
                    # "HEY"/"HI" get the looser first-word threshold.
                    if "HI" in word or "HEY" in word:
                        threshold_1 = THRESHOLD_H
                    else:
                        threshold_1 = THRESHOLD_G
                    if conf_result[0] > threshold_1 and conf_result[1] > THRESHOLD_G:
                        # Log the detection with a Taipei-local timestamp.
                        results.append(
                            datetime.datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S") + " " + text_result
                        )
                        # Reset recognizer state after a successful detection.
                        rec.FinalResult()
                        start_time = time.time()
                        break

    if len(results) > 0:
        output_text = "\n".join(results)
    else:
        output_text = ""

    return (rec, results, start_time), output_text
|
|
|
|
|
|
|
|
# Top-level Blocks app: custom "tauhu-oo" webfont pulled in via CSS, with
# Google "Source Sans Pro" and generic families as fallbacks.
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
|
|
|
|
|
with demo:
    # Model selector, defaulting to the first configured model.
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Holds the (rec, results, start_time) tuple between streaming calls.
    state = gr.State()
    # Microphone capture streamed as (sample_rate, numpy array) chunks.
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    # Live interface: a chunk is pushed to the recognizer every 0.25 s;
    # state round-trips through both inputs and outputs.
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="輸出"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # launch() blocks while serving, so the two extra launch() calls that
    # used to follow were dead duplication — a single call suffices.
    demo.launch()
|
|
|