Commit 939b9ab
Parent(s): 755dbb1

Audio processing refactoring

Changed files: api/audio.py (+253, -129)

api/audio.py (CHANGED)
@@ -1,25 +1,31 @@
 import io
 import wave
-
 import numpy as np
 import requests
-
 from openai import OpenAI
-
-from utils.errors import APIError, AudioConversionError
-from typing import List, Optional, Generator, Tuple
 import webrtcvad
-
 from transformers import pipeline


-def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: int = 30) -> bool:
-    vad = webrtcvad.Vad()
-    vad.set_mode(3)  # Aggressiveness mode: 0 (least aggressive) to 3 (most aggressive)

-
-

     num_samples_per_frame = int(sample_rate * frame_duration / 1000)
     frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]
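The hunk stops right after the frame split; the per-frame voting that decides the return value (old lines 26-34, new lines 32-40) sits outside the diff context and is not shown anywhere on this page. A minimal sketch of what that elided tail plausibly does, assuming webrtcvad's is_speech(frame, sample_rate) check, is:

    # Hypothetical completion of detect_voice, not part of this diff: vote per frame.
    # Only full-length frames are valid input for webrtcvad, hence the length check.
    speech_frames = [
        frame
        for frame in frames
        if len(frame) == num_samples_per_frame * 2 and vad.is_speech(frame, sample_rate)
    ]
    return len(speech_frames) > 0  # the real threshold in the file may differ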
@@ -35,34 +41,43 @@ def detect_voice(audio: np.ndarray, sample_rate: int = 48000, frame_duration: in


 class STTManager:
-
-        self.SAMPLE_RATE = 48000
-        self.CHUNK_LENGTH = 5
-        self.STEP_LENGTH = 3
-        self.MAX_RELIABILITY_CUTOFF = self.CHUNK_LENGTH - 1

-
-
-

         if config.stt.type == "HF_LOCAL":
             self.pipe = pipeline("automatic-speech-recognition", model=config.stt.name)

     def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
         """
-        Convert

-        :
-
-
-
-

         buffer = io.BytesIO()
         try:
             with wave.open(buffer, "wb") as wf:
-                wf.setnchannels(
-                wf.setsampwidth(
                 wf.setframerate(self.SAMPLE_RATE)
                 wf.writeframes(audio_data.tobytes())
         except Exception as e:
@@ -71,112 +86,164 @@ class STTManager:

     def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
-        Process

-        :
-
-
-        """

         has_voice = detect_voice(audio[1])
         ended = len(audio[1]) % 24000 != 0
-
         if has_voice:
             audio_buffer = np.concatenate((audio_buffer, audio[1]))
-
         is_short = len(audio_buffer) / self.SAMPLE_RATE < 1.0
-
         if is_short or (has_voice and not ended):
             return audio_buffer, np.array([], dtype=np.int16)
-
         return np.array([], dtype=np.int16), audio_buffer

     def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
         """
-

-        :
-
-
-        """

         if len(audio) < 500:
             return text
-
-        transcript = self.transcribe_numpy_array(audio, context=text)
-        return text + " " + transcript

-
-        """
-        Add a text message to the chat history.

-
-        :param chat: List of chat messages.
-        :return: Updated chat history.
         """
-
-        return chat

-
-

-
-
-

-    def
         """
-

-        :
-
-        :return: Updated chat history.
         """
-
-
         return chat

     def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
-

-        :
-
-        :return: Transcribed text.
         """
         try:
-
-
-
-
-
-                    model=self.config.stt.name, file=data, response_format="text", prompt=context
-                )
-            elif self.config.stt.type == "HF_API":
-                audio_bytes = self.numpy_audio_to_bytes(audio)
-                headers = {"Authorization": "Bearer " + self.config.stt.key}
-                response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
-                transcription = response.json().get("text", None)
-                if transcription is None:
-                    raise APIError("STT Error: No transcription returned by HF API")
-            elif self.config.stt.type == "HF_LOCAL":
-                result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
-                transcription = result["text"]
-        except APIError:
-            raise
         except Exception as e:
             raise APIError(f"STT Error: Unexpected error: {e}")

         return transcription

     def test_stt(self) -> bool:
         """
-        Test

-        :
         """
         try:
             self.transcribe_audio(np.zeros(10000))
@@ -186,15 +253,29 @@ class STTManager:


 class TTSManager:
-
         self.config = config
-        self.
-        self.

-    def test_tts(self, stream) -> bool:
         """
-        Test
-
         """
         try:
             list(self.read_text("Handshake", stream=stream))
@@ -204,52 +285,95 @@ class TTSManager:

     def read_text(self, text: str, stream: Optional[bool] = None) -> Generator[bytes, None, None]:
         """
-        Convert text to speech
-
-        :
-
-

         if not text:
             yield b""
             return

-        if stream is None
-            stream = self.streaming

-        headers = {"Authorization": "Bearer
         data = {"model": self.config.tts.name, "input": text, "voice": "alloy", "response_format": "opus"}

         try:
-            if
-                if self.config.tts.type == "OPENAI_API":
-                    response = requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data)
-                elif self.config.tts.type == "HF_API":
-                    response = requests.post(self.config.tts.url, headers=headers, json={"inputs": text})
-
-                if response.status_code != 200:
-                    error_details = response.json().get("error", "No error message provided")
-                    raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
-                yield response.content
-            else:
-                if self.config.tts.type != "OPENAI_API":
-                    raise APIError("TTS Error: Streaming not supported for this TTS type")
-
-                with requests.post(self.config.tts.url + "/audio/speech", headers=headers, json=data, stream=True) as response:
-                    if response.status_code != 200:
-                        error_details = response.json().get("error", "No error message provided")
-                        raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
-                    yield from response.iter_content(chunk_size=1024)
         except APIError:
             raise
         except Exception as e:
             raise APIError(f"TTS Error: Unexpected error: {e}")

     def read_last_message(self, chat_history: List[List[Optional[str]]]) -> Generator[bytes, None, None]:
         """
-        Read the last message in the chat history
-
-        :
         """
-        if
             yield from self.read_text(chat_history[-1][1])
 import io
 import wave
 import numpy as np
 import requests
 from openai import OpenAI
 import webrtcvad
 from transformers import pipeline
+from typing import List, Optional, Generator, Tuple, Any
+from utils.errors import APIError, AudioConversionError

+SAMPLE_RATE: int = 48000
+FRAME_DURATION: int = 30


+def detect_voice(audio: np.ndarray, sample_rate: int = SAMPLE_RATE, frame_duration: int = FRAME_DURATION) -> bool:
+    """
+    Detect voice activity in the given audio data.
+
+    Args:
+        audio (np.ndarray): Audio data as a numpy array.
+        sample_rate (int): Sample rate of the audio. Defaults to SAMPLE_RATE.
+        frame_duration (int): Duration of each frame in milliseconds. Defaults to FRAME_DURATION.

+    Returns:
+        bool: True if voice activity is detected, False otherwise.
+    """
+    vad = webrtcvad.Vad(3)  # Aggressiveness mode: 3 (most aggressive)
+    audio_bytes = audio.tobytes()
     num_samples_per_frame = int(sample_rate * frame_duration / 1000)
     frames = [audio_bytes[i : i + num_samples_per_frame * 2] for i in range(0, len(audio_bytes), num_samples_per_frame * 2)]



 class STTManager:
+    """Manages speech-to-text operations."""

+    def __init__(self, config: Any):
+        """
+        Initialize the STTManager.

+        Args:
+            config (Any): Configuration object containing STT settings.
+        """
+        self.config = config
+        self.SAMPLE_RATE: int = SAMPLE_RATE
+        self.CHUNK_LENGTH: int = 5
+        self.STEP_LENGTH: int = 3
+        self.MAX_RELIABILITY_CUTOFF: int = self.CHUNK_LENGTH - 1
+        self.status: bool = self.test_stt()
+        self.streaming: bool = self.status
         if config.stt.type == "HF_LOCAL":
             self.pipe = pipeline("automatic-speech-recognition", model=config.stt.name)

     def numpy_audio_to_bytes(self, audio_data: np.ndarray) -> bytes:
         """
+        Convert numpy array audio data to bytes.

+        Args:
+            audio_data (np.ndarray): Audio data as a numpy array.
+
+        Returns:
+            bytes: Audio data as bytes.

+        Raises:
+            AudioConversionError: If there's an error during conversion.
+        """
         buffer = io.BytesIO()
         try:
             with wave.open(buffer, "wb") as wf:
+                wf.setnchannels(1)
+                wf.setsampwidth(2)
                 wf.setframerate(self.SAMPLE_RATE)
                 wf.writeframes(audio_data.tobytes())
         except Exception as e:

     def process_audio_chunk(self, audio: Tuple[int, np.ndarray], audio_buffer: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
         """
+        Process an audio chunk and update the audio buffer.

+        Args:
+            audio (Tuple[int, np.ndarray]): Audio chunk data.
+            audio_buffer (np.ndarray): Existing audio buffer.

+        Returns:
+            Tuple[np.ndarray, np.ndarray]: Updated audio buffer and processed audio.
+        """
         has_voice = detect_voice(audio[1])
         ended = len(audio[1]) % 24000 != 0
         if has_voice:
             audio_buffer = np.concatenate((audio_buffer, audio[1]))
         is_short = len(audio_buffer) / self.SAMPLE_RATE < 1.0
         if is_short or (has_voice and not ended):
             return audio_buffer, np.array([], dtype=np.int16)
         return np.array([], dtype=np.int16), audio_buffer

     def transcribe_audio(self, audio: np.ndarray, text: str = "") -> str:
         """
+        Transcribe audio data and append to existing text.

+        Args:
+            audio (np.ndarray): Audio data to transcribe.
+            text (str): Existing text to append to. Defaults to empty string.

+        Returns:
+            str: Transcribed text appended to existing text.
+        """
         if len(audio) < 500:
             return text
+        transcript = self.transcribe_numpy_array(audio, context=text)

+        return f"{text} {transcript}".strip()

+    def transcribe_and_add_to_chat(self, audio: np.ndarray, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
         """
+        Transcribe audio and add the result to the chat history.

+        Args:
+            audio (np.ndarray): Audio data to transcribe.
+            chat (List[List[Optional[str]]]): Existing chat history.

+        Returns:
+            List[List[Optional[str]]]: Updated chat history with transcribed text.
+        """
+        text = self.transcribe_audio(audio)
+        return self.add_to_chat(text, chat)

+    def add_to_chat(self, text: str, chat: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
         """
+        Add text to the chat history.
+
+        Args:
+            text (str): Text to add to chat.
+            chat (List[List[Optional[str]]]): Existing chat history.
+            editable_chat (bool): Whether the chat is editable. Defaults to True.

+        Returns:
+            List[List[Optional[str]]]: Updated chat history.
         """
+        if not text:
+            return chat
+        if not chat or chat[-1][0] is None:
+            chat.append(["", None])
+        chat[-1][0] = text
         return chat

     def transcribe_numpy_array(self, audio: np.ndarray, context: Optional[str] = None) -> str:
         """
+        Transcribe audio data using the configured STT service.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            context (Optional[str]): Optional context for transcription.
+
+        Returns:
+            str: Transcribed text.

+        Raises:
+            APIError: If there's an unexpected error during transcription.
         """
+        transcription_methods = {
+            "OPENAI_API": self._transcribe_openai,
+            "HF_API": self._transcribe_hf_api,
+            "HF_LOCAL": self._transcribe_hf_local,
+        }
+
         try:
+            transcribe_method = transcription_methods.get(self.config.stt.type)
+            if transcribe_method:
+                return transcribe_method(audio, context)
+            else:
+                raise APIError(f"Unsupported STT type: {self.config.stt.type}")
         except Exception as e:
             raise APIError(f"STT Error: Unexpected error: {e}")

+    def _transcribe_openai(self, audio: np.ndarray, context: Optional[str]) -> str:
+        """
+        Transcribe audio using OpenAI API.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            context (Optional[str]): Optional context for transcription.
+
+        Returns:
+            str: Transcribed text.
+        """
+        audio_bytes = self.numpy_audio_to_bytes(audio)
+        data = ("temp.wav", audio_bytes, "audio/wav")
+        client = OpenAI(base_url=self.config.stt.url, api_key=self.config.stt.key)
+        return client.audio.transcriptions.create(model=self.config.stt.name, file=data, response_format="text", prompt=context)
+
+    def _transcribe_hf_api(self, audio: np.ndarray, _context: Optional[str]) -> str:
+        """
+        Transcribe audio using Hugging Face API.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            _context (Optional[str]): Unused context parameter.
+
+        Returns:
+            str: Transcribed text.
+
+        Raises:
+            APIError: If there's an error in the API response.
+        """
+        audio_bytes = self.numpy_audio_to_bytes(audio)
+        headers = {"Authorization": f"Bearer {self.config.stt.key}"}
+        response = requests.post(self.config.stt.url, headers=headers, data=audio_bytes)
+        if response.status_code != 200:
+            error_details = response.json().get("error", "No error message provided")
+            raise APIError("STT Error: HF API error", status_code=response.status_code, details=error_details)
+        transcription = response.json().get("text")
+        if transcription is None:
+            raise APIError("STT Error: No transcription returned by HF API")
         return transcription

+    def _transcribe_hf_local(self, audio: np.ndarray, _context: Optional[str]) -> str:
+        """
+        Transcribe audio using local Hugging Face model.
+
+        Args:
+            audio (np.ndarray): Audio data as a numpy array.
+            _context (Optional[str]): Unused context parameter.
+
+        Returns:
+            str: Transcribed text.
+        """
+        result = self.pipe({"sampling_rate": self.SAMPLE_RATE, "raw": audio.astype(np.float32) / 32768.0})
+        return result["text"]
+
     def test_stt(self) -> bool:
         """
+        Test the STT functionality.

+        Returns:
+            bool: True if the test is successful, False otherwise.
         """
         try:
             self.transcribe_audio(np.zeros(10000))


 class TTSManager:
+    """Manages text-to-speech operations."""
+
+    def __init__(self, config: Any):
+        """
+        Initialize the TTSManager.
+
+        Args:
+            config (Any): Configuration object containing TTS settings.
+        """
         self.config = config
+        self.SAMPLE_RATE: int = SAMPLE_RATE
+        self.status: bool = self.test_tts(stream=False)
+        self.streaming: bool = self.test_tts(stream=True) if self.status else False

+    def test_tts(self, stream: bool) -> bool:
         """
+        Test the TTS functionality.
+
+        Args:
+            stream (bool): Whether to test streaming TTS.
+
+        Returns:
+            bool: True if the test is successful, False otherwise.
         """
         try:
             list(self.read_text("Handshake", stream=stream))

     def read_text(self, text: str, stream: Optional[bool] = None) -> Generator[bytes, None, None]:
         """
+        Convert text to speech using the configured TTS service.
+
+        Args:
+            text (str): Text to convert to speech.
+            stream (Optional[bool]): Whether to stream the audio. Defaults to self.streaming if not provided.
+
+        Yields:
+            bytes: Audio data in bytes.

+        Raises:
+            APIError: If there's an unexpected error during text-to-speech conversion.
+        """
         if not text:
             yield b""
             return

+        stream = self.streaming if stream is None else stream

+        headers = {"Authorization": f"Bearer {self.config.tts.key}"}
         data = {"model": self.config.tts.name, "input": text, "voice": "alloy", "response_format": "opus"}

         try:
+            yield from self._read_text_stream(headers, data) if stream else self._read_text_non_stream(headers, data)
         except APIError:
             raise
         except Exception as e:
             raise APIError(f"TTS Error: Unexpected error: {e}")

+    def _read_text_non_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
+        """
+        Handle non-streaming TTS requests.
+
+        Args:
+            headers (dict): Request headers.
+            data (dict): Request data.
+
+        Yields:
+            bytes: Audio data in bytes.
+
+        Raises:
+            APIError: If there's an error in the API response.
+        """
+        if self.config.tts.type == "OPENAI_API":
+            url = f"{self.config.tts.url}/audio/speech"
+        elif self.config.tts.type == "HF_API":
+            url = self.config.tts.url
+            data = {"inputs": data["input"]}
+        else:
+            raise APIError(f"TTS Error: Unsupported TTS type: {self.config.tts.type}")
+
+        response = requests.post(url, headers=headers, json=data)
+        if response.status_code != 200:
+            error_details = response.json().get("error", "No error message provided")
+            raise APIError(f"TTS Error: {self.config.tts.type} error", status_code=response.status_code, details=error_details)
+        yield response.content
+
+    def _read_text_stream(self, headers: dict, data: dict) -> Generator[bytes, None, None]:
+        """
+        Handle streaming TTS requests.
+
+        Args:
+            headers (dict): Request headers.
+            data (dict): Request data.
+
+        Yields:
+            bytes: Audio data in bytes.
+
+        Raises:
+            APIError: If there's an error in the API response or if streaming is not supported.
+        """
+        if self.config.tts.type != "OPENAI_API":
+            raise APIError("TTS Error: Streaming not supported for this TTS type")
+
+        url = f"{self.config.tts.url}/audio/speech"
+        with requests.post(url, headers=headers, json=data, stream=True) as response:
+            if response.status_code != 200:
+                error_details = response.json().get("error", "No error message provided")
+                raise APIError("TTS Error: OPENAI API error", status_code=response.status_code, details=error_details)
+            yield from response.iter_content(chunk_size=1024)
+
     def read_last_message(self, chat_history: List[List[Optional[str]]]) -> Generator[bytes, None, None]:
         """
+        Read the last message in the chat history.
+
+        Args:
+            chat_history (List[List[Optional[str]]]): Chat history.
+
+        Yields:
+            bytes: Audio data for the last message.
         """
+        if chat_history and chat_history[-1][1]:
             yield from self.read_text(chat_history[-1][1])
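To put the refactored classes in context, here is a usage sketch; it is not part of the commit. The config object is a stand-in built with SimpleNamespace, and every value in it (model names, URLs, key placeholders) is an assumption inferred from the attributes api/audio.py reads (config.stt.type, config.stt.name, config.tts.url, and so on); the Space presumably constructs its real config elsewhere.

# Usage sketch only; the config layout and all values below are assumptions.
from types import SimpleNamespace

import numpy as np

from api.audio import STTManager, TTSManager, detect_voice

config = SimpleNamespace(
    stt=SimpleNamespace(type="HF_LOCAL", name="openai/whisper-tiny.en", url=None, key=None),
    tts=SimpleNamespace(type="OPENAI_API", name="tts-1", url="https://api.openai.com/v1", key="<placeholder>"),
)

stt_manager = STTManager(config)  # __init__ runs test_stt() before wiring the local pipeline
tts_manager = TTSManager(config)  # __init__ runs test_tts(), which issues real HTTP requests

# One second of int16 silence at the module-level 48 kHz SAMPLE_RATE.
silence = np.zeros(48000, dtype=np.int16)
print(detect_voice(silence))  # expected: False, since silence carries no voice activity

# Transcribe an audio chunk and append the result as the newest user turn of the chat.
chat = stt_manager.transcribe_and_add_to_chat(silence, [])

# Speak the assistant's latest reply; read_last_message yields encoded audio chunks.
# This needs a reachable TTS endpoint and a valid key in place of the placeholder.
chat_history = [["hello", "Hi there!"]]
audio = b"".join(tts_manager.read_last_message(chat_history))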