Spaces:

andito
/

speech-to-speech-demo

Runtime error

App Files Files Community

andito HF Staff committed on Sep 26, 2024

Commit

b5b0b9a

1 Parent(s): 8b97d54

update streaming

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +18 -15
audio_streaming_client.py +128 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 📚
 colorFrom: yellow
 colorTo: purple
 sdk: gradio
-sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: apache-2.0

 colorFrom: yellow
 colorTo: purple
 sdk: gradio
+sdk_version: 5.0.0b3
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,20 +1,23 @@
 import gradio as gr
-def pass_audio(audio, test):  # debug pass-through: logs what arrives in `test`, then returns both arguments unchanged
-    if test is not None:  # NOTE(review): the indexing below suggests a two-item value such as (sample_rate, samples) — confirm
-        print(len(test))  # number of items in `test`
-        print(test[0])  # first item
-        print(len(test[1]))  # length of the second item
-        print(test[1])  # the second item itself
-    else:
-        print("test is None")
-    return audio, test  # returned in the same order they were received
-demo = gr.Interface(  # wraps pass_audio in an Interface that re-runs on input changes (live=True)
-    pass_audio,
-    ["state", gr.Audio(sources=["microphone"], streaming=True)],  # inputs: a state value and streaming microphone audio
-    ["state", gr.Audio(streaming=True, autoplay=True)],  # outputs: a state value and an autoplaying streaming audio component
-    live=True,
-)
 demo.launch()  # start the Gradio app

 import gradio as gr
+from audio_streaming_client import AudioStreamingClient
+audio_streaming_client = AudioStreamingClient()  # single module-level client used by every stream_audio call
+audio_streaming_client.start()  # runs at import time; start() waits on the client's ws_ready event before returning
+def stream_audio(audio):
+    sample_rate = audio[0]
+    audio_streaming_client.put_audio(audio[1], sample_rate)
+    output_size = len(audio[1])
+    output_audio = audio_streaming_client.get_audio(sample_rate, output_size)
+    return (sample_rate, output_audio)
+with gr.Blocks() as demo:  # UI: a heading, a microphone input and an autoplaying streaming output
+    gr.Markdown("# Speech to speech in an inference endpoint 🎤")
+    inp = gr.Audio(sources=["microphone"], type="numpy")  # microphone capture; type="numpy" matches the audio[0]/audio[1] indexing in stream_audio
+    out =  gr.Audio(streaming=True, autoplay=True)  # streaming output component with autoplay enabled
+    inp.stream(stream_audio, inp, out, time_limit=600, stream_every=1)  # feed the input stream through stream_audio into `out`; NOTE(review): time_limit/stream_every are presumably seconds — confirm in the Gradio docs
 demo.launch()  # start the Gradio app

audio_streaming_client.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import threading
+from queue import Queue, Empty
+import numpy as np
+import requests
+import base64
+import time
+from dataclasses import dataclass, field
+import websocket
+import threading
+import ssl
+import librosa
+import os
+class AudioStreamingClient:
+    def __init__(self):
+        self.auth_token = os.environ.get("HF_AUTH_TOKEN", None)
+        self.api_url = os.environ.get("HF_API_URL", None)
+        self.stop_event = threading.Event()
+        self.send_queue = Queue()
+        self.recv_queue = Queue()
+        self.session_id = None
+        self.headers = {
+            "Accept": "application/json",
+            "Authorization": f"Bearer {self.auth_token}",
+            "Content-Type": "application/json"
+        }
+        self.session_state = "idle"  # Possible states: idle, sending, processing, waiting
+        self.ws_ready = threading.Event()
+    def start(self):
+        print("Starting audio streaming...")
+        ws_url = self.api_url.replace("http", "ws") + "/ws"
+        self.ws = websocket.WebSocketApp(
+            ws_url,
+            header=[f"{key}: {value}" for key, value in self.headers.items()],
+            on_open=self.on_open,
+            on_message=self.on_message,
+            on_error=self.on_error,
+            on_close=self.on_close
+        )
+        self.ws_thread = threading.Thread(target=self.ws.run_forever, kwargs={'sslopt': {"cert_reqs": ssl.CERT_NONE}})
+        self.ws_thread.start()
+        # Wait for the WebSocket to be ready
+        self.ws_ready.wait()
+        self.send_thread = threading.Thread(target=self.send_audio)
+        self.send_thread.start()
+    def on_close(self):
+        self.stop_event.set()
+        self.send_thread.join()
+        self.ws.close()
+        self.ws_thread.join()
+        print("Audio streaming stopped.")
+    def on_open(self, ws):
+        print("WebSocket connection opened.")
+        self.ws_ready.set()  # Signal that the WebSocket is ready
+    def on_message(self, ws, message):
+        # message is bytes
+        if message == b'DONE':
+            print("listen")
+            self.session_state = "listen"
+        else:
+            print("processing")
+            self.session_state = "processing"
+            audio_np = np.frombuffer(message, dtype=np.int16)
+            self.recv_queue.put(audio_np)
+    def on_error(self, ws, error):
+        print(f"WebSocket error: {error}")
+    def on_close(self, ws, close_status_code, close_msg):
+        print("WebSocket connection closed.")
+    def send_audio(self):
+        while not self.stop_event.is_set():
+            if not self.send_queue.empty():
+                chunk = self.send_queue.get()
+                if self.session_state != "processing":
+                    self.ws.send(chunk.tobytes(), opcode=websocket.ABNF.OPCODE_BINARY)
+                else:
+                    self.ws.send([], opcode=websocket.ABNF.OPCODE_BINARY)  # handshake
+            time.sleep(0.01)
+    def put_audio(self, chunk, sample_rate):
+        chunk = np.clip(chunk, -32768, 32767).astype(np.int16)
+        chunk = chunk.astype(np.float32) / 32768.0
+        chunk = librosa.resample(chunk, orig_sr=48000, target_sr=16000)
+        chunk = (chunk * 32768.0).astype(np.int16)
+        self.send_queue.put(chunk)
+    def get_audio(self, sample_rate, output_size):
+        output_chunk = np.array([], dtype=np.int16)
+        output_sample_rate = 16000
+        output_chunk_size = int(output_size*output_sample_rate/sample_rate)
+        while output_chunk.size < output_chunk_size:
+            try:
+                self.ws.send([], opcode=websocket.ABNF.OPCODE_BINARY)  # handshake
+                chunk = self.recv_queue.get(timeout=0.1)
+            except Empty:
+                chunk = None
+            if chunk is not None:
+                # Ensure chunk is int16 and clip to valid range
+                chunk_int16 = np.clip(chunk, -32768, 32767).astype(np.int16)
+                output_chunk = np.concatenate([output_chunk, chunk_int16])
+            else:
+                print("padding chunk of size ", len(output_chunk))
+                output_chunk = np.pad(output_chunk, (0, output_chunk_size - len(output_chunk)))
+        output_chunk = output_chunk.astype(np.float32) / 32768.0
+        output_chunk = librosa.resample(output_chunk, orig_sr=output_sample_rate, target_sr=sample_rate)
+        output_chunk = (output_chunk * 32768.0).astype(np.int16)
+        print("output_chunk size: ", len(output_chunk))
+        output_chunk = output_chunk[:output_size]
+        return np.pad(output_chunk, (0, output_size - len(output_chunk)))
+if __name__ == "__main__":  # manual smoke test: connect to the endpoint when the module is run directly
+    client = AudioStreamingClient()
+    client.start()  # opens the WebSocket and starts the sender thread