File size: 3,847 Bytes
3b70fe7
b5c6cd4
4cb8cfb
d8ecd12
b5c6cd4
 
 
 
 
 
22eb6d8
dc31fd6
0432374
b5c6cd4
 
 
 
 
 
 
 
 
 
 
abc7c42
 
b5c6cd4
 
 
 
 
 
 
 
 
 
3b70fe7
5a21062
f972aa2
b5c6cd4
f972aa2
4cb8cfb
f972aa2
4cb8cfb
f972aa2
b5c6cd4
4985536
 
 
4cb8cfb
0432374
 
 
 
 
22eb6d8
dc31fd6
22eb6d8
dc31fd6
 
b017623
 
 
 
 
 
4985536
 
 
 
 
 
 
 
 
 
 
f468a6d
5a21062
3b70fe7
b5c6cd4
3b70fe7
b5c6cd4
f972aa2
b5c6cd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b0a155
b5c6cd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b0a155
b5c6cd4
 
 
 
 
 
 
 
 
4985536
4cb8cfb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import datetime
import json
import time
import pytz

import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model

# Word-level confidence thresholds for accepting a detected wake phrase.
# THRESHOLD_H is the looser threshold applied to the greeting word of
# "HI ..."/"HEY ..." phrases; THRESHOLD_G gates every other word.
THRESHOLD_G = 0.5
THRESHOLD_H = 0.34


def load_vosk(model_id: str):
    """Download a Vosk model snapshot from the Hugging Face Hub and load it.

    Args:
        model_id: Hub repository id of the Vosk model to fetch.

    Returns:
        A ``vosk.Model`` built from the downloaded snapshot directory.
    """
    return Model(model_path=snapshot_download(model_id))


# Expose "${load_vosk:<repo-id>}" inside the YAML config so model entries can
# download and construct their Vosk model while the config is resolved.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

# Mapping of model-id -> config dict; each entry's "model" key holds a loaded
# vosk.Model (presumably produced by the load_vosk resolver above — confirm
# against configs/models.yaml).
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def _matched_confidences(words, tokens):
    """Return rounded confidences for the first token-aligned occurrence of
    ``tokens`` within Vosk's word-level results, or ``None`` if the phrase
    never appears as whole consecutive words.

    ``words`` is the ``result["result"]`` list of ``{"word", "conf", ...}``
    dicts emitted when ``SetWords(True)`` is enabled.
    """
    recognized = [entry["word"] for entry in words]
    span = len(tokens)
    for i in range(len(recognized) - span + 1):
        if recognized[i:i + span] == tokens:
            return [round(words[i + j]["conf"], 3) for j in range(span)]
    return None


def automatic_speech_recognition(model_id: str, stream, new_chunk):
    """Streaming wake-word detector invoked by Gradio for each audio chunk.

    Args:
        model_id: Key into ``models_config`` selecting the Vosk model.
        stream: ``None`` on the first chunk of a session; afterwards the
            ``(recognizer, results, start_time)`` tuple returned previously.
        new_chunk: ``(sample_rate, audio_array)`` pair from ``gr.Audio``.

    Returns:
        ``((recognizer, results, start_time), output_text)`` where
        ``output_text`` is the newline-joined list of timestamped detections.
    """
    model = models_config[model_id]["model"]

    sample_rate, audio_array = new_chunk
    # Down-mix stereo to one channel; the recognizer is fed raw mono PCM bytes.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a session: build a fresh recognizer with per-word
        # confidences enabled (needed for the threshold checks below).
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Flush the utterance every 10 s so the hypothesis cannot grow unbounded.
    if time.time() - start_time > 10:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        if text_result not in ("", "<SIL>"):
            words = result.get("result", [])
            for phrase in ("HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"):
                if phrase not in text_result:
                    continue
                print(result)

                # BUGFIX: the original read result["result"][0]/[1] — the
                # confidences of the first two words of the whole utterance,
                # not of the matched phrase — so any preceding word (or a
                # cross-word substring match) was scored incorrectly.  Look
                # up the phrase's own words instead.
                confs = _matched_confidences(words, phrase.split())
                if confs is None:
                    # Phrase only matched as a substring of other words, or
                    # word-level results are missing: not a real detection.
                    continue

                # Greeting phrases get the looser first-word threshold.
                first_threshold = (
                    THRESHOLD_H if phrase.startswith(("HI", "HEY")) else THRESHOLD_G
                )
                if confs[0] > first_threshold and confs[1] > THRESHOLD_G:
                    stamp = datetime.datetime.now(
                        pytz.timezone("Asia/Taipei")
                    ).strftime("%H:%M:%S")
                    results.append(stamp + " " + text_result)
                    # Flush so the same phrase is not reported again.
                    rec.FinalResult()
                    start_time = time.time()
                    break

    # "\n".join([]) == "", so the empty case needs no special branch.
    output_text = "\n".join(results)

    return (rec, results, start_time), output_text


# Top-level Gradio app container.  The CSS @import pulls in the "tauhu-oo"
# webfont, and the theme's font stack falls back through Source Sans Pro to
# generic system fonts.
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    # Default to the first model declared in configs/models.yaml.
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Per-session recognizer state: holds the (KaldiRecognizer, results,
    # start_time) tuple threaded through automatic_speech_recognition.
    state = gr.State()
    audio = gr.Audio(
        label="錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            # 16 kHz capture — presumably what the configured Vosk models
            # expect; TODO confirm against the models in configs/models.yaml.
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    # live=True + streaming audio makes Gradio call the recognizer with a new
    # chunk every stream_every seconds, feeding back the state output.
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="輸出"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
        # flagging_mode="auto",
    )


# Start the Gradio server.  launch() blocks until shutdown, so calling it
# once is sufficient — the original's two duplicate calls were dead code
# (they could only run after the server stopped, or error out).
demo.launch()