import datetime
import json
import time

import pytz
import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model
# Confidence thresholds for accepting a detected wake phrase: the lower
# THRESHOLD_H applies to the first word of "HEY ..."/"HI ..." phrases,
# THRESHOLD_G to everything else.
THRESHOLD_G = 0.5
THRESHOLD_H = 0.34

def load_vosk(model_id: str):
    """Download a Vosk model snapshot from the Hugging Face Hub and load it."""
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Register load_vosk as an OmegaConf resolver so entries in models.yaml can
# reference it, then materialize the config into a plain dict.
OmegaConf.register_new_resolver("load_vosk", load_vosk)
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
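# A minimal sketch of what configs/models.yaml is assumed to contain (the
# model key and repo ID below are hypothetical; the actual file is not shown):
#
#   my-wakeword-model:
#     model: ${load_vosk:"username/vosk-wakeword-model"}
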
def automatic_speech_recognition(model_id: str, stream, new_chunk):
    model = models_config[model_id]["model"]
    sample_rate, audio_array = new_chunk
    # Down-mix stereo input to mono.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a session: create a fresh recognizer and state.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Flush the recognizer every 10 seconds so stale audio does not accumulate.
    if time.time() - start_time > 10:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        if text_result not in ("", "<SIL>"):
            for word in ["HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"]:
                if word in text_result:
                    print(result)
                    # Confidences of the first two recognized words (assumed
                    # to be the two words of the wake phrase).
                    conf_result = (
                        round(result["result"][0]["conf"], 3),
                        round(result["result"][1]["conf"], 3),
                    )
                    # "HEY ..."/"HI ..." phrases use the lower first-word threshold.
                    if "HI" in word or "HEY" in word:
                        threshold_1 = THRESHOLD_H
                    else:
                        threshold_1 = THRESHOLD_G
                    if conf_result[0] > threshold_1 and conf_result[1] > THRESHOLD_G:
                        # Timestamp the detection in Taipei time.
                        results.append(
                            datetime.datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S")
                            + " "
                            + text_result
                        )
                    # Reset the recognizer after handling a wake phrase.
                    rec.FinalResult()
                    start_time = time.time()
                    break
    # rec.AcceptWaveform(audio_bytes)
    # partial_result = json.loads(rec.PartialResult())
    # if partial_result["partial"] != "":
    #     print(partial_result)
    #     results.append(
    #         datetime.datetime.now().strftime("%H:%M:%S")
    #         + " "
    #         + partial_result["partial"]
    #     )

    output_text = "\n".join(results)
    return (rec, results, start_time), output_text
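
# A minimal usage sketch of the streaming contract above, mirroring the
# gr.Interface wiring below (the `chunks` iterable is hypothetical):
#
#   state = None
#   for chunk in chunks:  # each chunk is a (sample_rate, numpy array) pair
#       state, text = automatic_speech_recognition(model_id, state, chunk)
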
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="Model",
    )
    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Per-session streaming state: (recognizer, results, start_time).
    state = gr.State()
    audio = gr.Audio(
        label="Recording",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="Output"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
        # flagging_mode="auto",
    )
demo.launch()