import datetime
import json
import time

import pytz
import gradio as gr
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model
# Confidence thresholds for accepting a detected wake phrase: the lower
# THRESHOLD_H applies to the first word of "HEY ..."/"HI ..." phrases,
# THRESHOLD_G to everything else.
THRESHOLD_G = 0.5
THRESHOLD_H = 0.34

def load_vosk(model_id: str):
    """Download a Vosk model snapshot from the Hugging Face Hub and load it."""
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


# Register load_vosk as an OmegaConf resolver so entries in models.yaml can
# reference it, then materialize the config into a plain dict.
OmegaConf.register_new_resolver("load_vosk", load_vosk)
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
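# A minimal sketch of what configs/models.yaml is assumed to contain (the
# model key and repo ID below are hypothetical; the actual file is not shown):
#
#   my-wakeword-model:
#     model: ${load_vosk:"username/vosk-wakeword-model"}
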
def automatic_speech_recognition(model_id: str, stream, new_chunk):
    model = models_config[model_id]["model"]
    sample_rate, audio_array = new_chunk
    # Down-mix stereo input to mono.
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]
    audio_bytes = audio_array.tobytes()

    if stream is None:
        # First chunk of a session: create a fresh recognizer and state.
        rec = KaldiRecognizer(model, sample_rate)
        rec.SetWords(True)
        rec.SetMaxAlternatives(0)
        results = []
        start_time = time.time()
    else:
        rec, results, start_time = stream

    # Flush the recognizer every 10 seconds so stale audio does not accumulate.
    if time.time() - start_time > 10:
        rec.FinalResult()
        start_time = time.time()

    if rec.AcceptWaveform(audio_bytes):
        result = json.loads(rec.Result())
        text_result = result["text"]
        if text_result not in ("", "<SIL>"):
            for word in ["HEY EVAS", "HI EVAS", "EVAS GO", "EVAS STOP"]:
                if word in text_result:
                    print(result)
                    # Confidences of the first two recognized words (assumed
                    # to be the two words of the wake phrase).
                    conf_result = (
                        round(result["result"][0]["conf"], 3),
                        round(result["result"][1]["conf"], 3),
                    )
                    # "HEY ..."/"HI ..." phrases use the lower first-word threshold.
                    if "HI" in word or "HEY" in word:
                        threshold_1 = THRESHOLD_H
                    else:
                        threshold_1 = THRESHOLD_G
                    if conf_result[0] > threshold_1 and conf_result[1] > THRESHOLD_G:
                        # Timestamp the detection in Taipei time.
                        results.append(
                            datetime.datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S")
                            + " "
                            + text_result
                        )
                    # Reset the recognizer after handling a wake phrase.
                    rec.FinalResult()
                    start_time = time.time()
                    break
    # rec.AcceptWaveform(audio_bytes)
    # partial_result = json.loads(rec.PartialResult())
    # if partial_result["partial"] != "":
    #     print(partial_result)
    #     results.append(
    #         datetime.datetime.now().strftime("%H:%M:%S")
    #         + " "
    #         + partial_result["partial"]
    #     )

    output_text = "\n".join(results)
    return (rec, results, start_time), output_text
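
# A minimal usage sketch of the streaming contract above, mirroring the
# gr.Interface wiring below (the `chunks` iterable is hypothetical):
#
#   state = None
#   for chunk in chunks:  # each chunk is a (sample_rate, numpy array) pair
#       state, text = automatic_speech_recognition(model_id, state, chunk)
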
demo = gr.Blocks(
    title="Wakeword Demo",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)
with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="Model",
    )
    gr.Markdown(
        """
        # Wakeword Demo
        """
    )
    # Per-session streaming state: (recognizer, results, start_time).
    state = gr.State()
    audio = gr.Audio(
        label="Recording",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
        sources=["microphone"],
        streaming=True,
    )
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            state,
            audio,
        ],
        outputs=[
            state,
            gr.Text(interactive=False, label="Output"),
        ],
        live=True,
        stream_every=0.25,
        clear_btn=None,
        # flagging_mode="auto",
    )
demo.launch()