# wakeword / app.py
# hungshinlee's picture
# Update app.py
# dc31fd6 verified
import datetime
import json
import time
import pytz
import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model
# Minimum per-word confidence a detected wake phrase must exceed.
# THRESHOLD_G is applied to the second word of every phrase, and to the first
# word of phrases that contain neither "HI" nor "HEY".
THRESHOLD_G = 0.5
# Looser threshold applied to the first word of phrases containing "HI" or "HEY".
THRESHOLD_H = 0.34
def load_vosk(model_id: str):
    """Build a Vosk ``Model`` from a Hugging Face Hub repository.

    Args:
        model_id: Repository id handed to ``snapshot_download``.

    Returns:
        A ``Model`` constructed from the directory that ``snapshot_download``
        returns for ``model_id``.
    """
    return Model(model_path=snapshot_download(model_id))
# Expose load_vosk to OmegaConf under the resolver name "load_vosk" so that the
# configuration file can reference it through interpolation.
OmegaConf.register_new_resolver("load_vosk", load_vosk)
# Load the model table and convert it into plain Python containers.
# NOTE(review): presumably each entry's "model" value is a ${load_vosk:...}
# interpolation, in which case the Vosk models are downloaded and loaded here,
# at import time -- confirm against configs/models.yaml.
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
def _phrase_start(spoken_words, phrase_words):
    """Return the index where ``phrase_words`` begins inside ``spoken_words``.

    Both arguments are lists of words. Returns -1 when the phrase does not
    occur as a contiguous run of whole words.
    """
    span = len(phrase_words)
    for start in range(len(spoken_words) - span + 1):
        if spoken_words[start:start + span] == phrase_words:
            return start
    return -1


def automatic_speech_recognition(model_id: str, stream, new_chunk):
    """Feed one streamed audio chunk to Vosk and collect wake-phrase hits.

    Args:
        model_id: Key into ``models_config``; only consulted when a new
            recognizer has to be created (first chunk of a session).
        stream: Session state from the previous call, a tuple
            ``(recognizer, results, start_time)``, or ``None`` on the first
            call.
        new_chunk: ``(sample_rate, samples)`` where ``samples`` is an array
            with ``ndim``/``tobytes``; ``None`` is tolerated and leaves the
            state untouched.

    Returns:
        ``(state, output_text)`` where ``state`` is the tuple to pass back in
        as ``stream`` and ``output_text`` is every accepted detection so far,
        one ``"HH:MM:SS <text>"`` line each (empty string if none).
    """
    # Nothing to recognise: keep the session as it is instead of failing on
    # tuple unpacking below.
    if new_chunk is None:
        previous = stream[1] if stream is not None else []
        return stream, "\n".join(previous)

    sample_rate, audio_array = new_chunk
    if audio_array.ndim == 2:
        # Multi-channel input: keep only the first channel.
        audio_array = audio_array[:, 0]
    # NOTE(review): the raw buffer is passed straight to Vosk, so this assumes
    # the samples are already 16-bit PCM -- confirm against the Gradio version.
    audio_bytes = audio_array.tobytes()

    if stream is None:
        model = models_config[model_id]["model"]
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)  # per-word entries are needed for the confidences
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        # NOTE(review): the recognizer stored in the state is reused, so a
        # change of model_id during a session has no effect.
        rec, results, start_time = stream

    # Every 10 seconds without a detection, finalise the recognizer (result
    # discarded) and restart the timer.
    if time.time() - start_time > 10:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result.get("text", "")
        word_entries = result.get("result", [])
        if text_result and text_result != "<SIL>":
            spoken_words = [entry["word"] for entry in word_entries]
            for phrase in ("HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"):
                # Locate the phrase as whole words so that the confidences
                # read below belong to the phrase itself, wherever it occurs
                # in the utterance.
                start = _phrase_start(spoken_words, phrase.split())
                if start < 0:
                    continue
                print(result)
                first_conf = round(word_entries[start]["conf"], 3)
                second_conf = round(word_entries[start + 1]["conf"], 3)
                if "HI" in phrase or "HEY" in phrase:
                    first_threshold = THRESHOLD_H
                else:
                    first_threshold = THRESHOLD_G
                if first_conf > first_threshold and second_conf > THRESHOLD_G:
                    stamp = datetime.datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S")
                    results.append(f"{stamp} {text_result}")
                    # Accepted detection: finalise the recognizer and restart
                    # the 10-second timer.
                    rec.FinalResult()
                    start_time = time.time()
                    break

    return (rec, results, start_time), "\n".join(results)
# Top-level Blocks container for the demo page.
# The custom CSS imports the tauhu-oo stylesheet (presumably the definition of
# the "tauhu-oo" font), and the theme lists that font first, followed by
# fallbacks.
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
# Page layout: model picker, title, then a live streaming interface that
# passes microphone audio through automatic_speech_recognition.
with demo:
    # The first configured model is pre-selected.
    default_model_id = list(models_config)[0]
    model_drop_down = gr.Dropdown(
        choices=models_config.keys(),
        label="模型",
        value=default_model_id,
    )

    gr.Markdown("\n# Wakeword Demo\n")

    # Per-session state handed back to the callback on every chunk.
    state = gr.State()
    audio = gr.Audio(
        sources=["microphone"],
        streaming=True,
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
        label="錄音",
    )

    gr.Interface(
        fn=automatic_speech_recognition,
        inputs=[model_drop_down, state, audio],
        outputs=[state, gr.Text(interactive=False, label="輸出")],
        live=True,
        stream_every=0.25,
        clear_btn=None,
    )
# Start the Gradio server; a single launch() call is sufficient.
demo.launch()