tts.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
# -*- coding: utf-8 -*-
"""
TTS (Text-to-Speech) interface.

Defines the interface for TTS providers (plugins).
"""
  6. from abc import ABC, abstractmethod
  7. from dataclasses import dataclass, field
  8. from enum import Enum
  9. from typing import Any, AsyncIterator
  10. class TTSState(Enum):
  11. """Zustand des TTS-Providers."""
  12. UNINITIALIZED = "uninitialized"
  13. READY = "ready"
  14. SYNTHESIZING = "synthesizing"
  15. ERROR = "error"
  16. @dataclass
  17. class Voice:
  18. """Beschreibung einer TTS-Stimme."""
  19. id: str
  20. name: str
  21. language: str
  22. gender: str = "neutral" # male, female, neutral
  23. description: str = ""
  24. # Qualität/Stil
  25. style: str = "default"
  26. sample_rate: int = 22050
  27. # Provider-spezifisch
  28. provider: str = ""
  29. model: str = ""
  30. metadata: dict[str, Any] = field(default_factory=dict)
  31. def to_dict(self) -> dict:
  32. """Konvertiert zu Dictionary."""
  33. return {
  34. "id": self.id,
  35. "name": self.name,
  36. "language": self.language,
  37. "gender": self.gender,
  38. "description": self.description,
  39. "style": self.style,
  40. "sample_rate": self.sample_rate,
  41. "provider": self.provider,
  42. "model": self.model,
  43. }
  44. @dataclass
  45. class TTSConfig:
  46. """Konfiguration für TTS-Provider."""
  47. # Sprache & Stimme
  48. language: str = "de-DE"
  49. voice_id: str | None = None
  50. voice_name: str | None = None
  51. # Audio-Format
  52. sample_rate: int = 22050
  53. channels: int = 1
  54. sample_width: int = 2 # 16-bit
  55. # Sprachparameter
  56. speed: float = 1.0 # 0.5 - 2.0
  57. pitch: float = 1.0 # 0.5 - 2.0
  58. volume: float = 1.0 # 0.0 - 1.0
  59. # Performance
  60. use_gpu: bool = False
  61. num_threads: int = 4
  62. # Modell
  63. model_path: str | None = None
  64. model_name: str | None = None
  65. # Caching
  66. enable_cache: bool = True
  67. cache_dir: str | None = None
  68. @dataclass
  69. class TTSResult:
  70. """Ergebnis einer TTS-Synthese."""
  71. # Audio-Daten
  72. audio_data: bytes
  73. sample_rate: int = 22050
  74. channels: int = 1
  75. # Metadaten
  76. text: str = ""
  77. duration_seconds: float = 0.0
  78. processing_time_ms: float = 0.0
  79. # Stimme
  80. voice_id: str = ""
  81. voice_name: str = ""
  82. language: str = "de-DE"
  83. # Provider-Info
  84. provider: str = ""
  85. model: str = ""
  86. # Zusätzliche Daten
  87. raw_result: Any = None
  88. def to_dict(self) -> dict:
  89. """Konvertiert zu Dictionary (ohne Audio-Daten)."""
  90. return {
  91. "text": self.text,
  92. "duration_seconds": self.duration_seconds,
  93. "processing_time_ms": self.processing_time_ms,
  94. "sample_rate": self.sample_rate,
  95. "channels": self.channels,
  96. "voice_id": self.voice_id,
  97. "voice_name": self.voice_name,
  98. "language": self.language,
  99. "provider": self.provider,
  100. "model": self.model,
  101. "audio_size_bytes": len(self.audio_data),
  102. }
  103. def save_wav(self, file_path: str) -> None:
  104. """Speichert Audio als WAV-Datei."""
  105. import wave
  106. with wave.open(file_path, "wb") as wf:
  107. wf.setnchannels(self.channels)
  108. wf.setsampwidth(2) # 16-bit
  109. wf.setframerate(self.sample_rate)
  110. wf.writeframes(self.audio_data)
  111. class TTSProvider(ABC):
  112. """
  113. Abstrakte Basisklasse für TTS-Provider.
  114. Plugins implementieren diese Klasse um Text-to-Speech
  115. Funktionalität bereitzustellen.
  116. Events:
  117. - tts_started: TTS-Synthese gestartet
  118. - tts_completed: TTS erfolgreich abgeschlossen
  119. - tts_error: Fehler bei TTS
  120. - tts_chunk: Audio-Chunk bei Streaming
  121. """
  122. def __init__(self, config: TTSConfig | None = None):
  123. """
  124. Initialisiert den Provider.
  125. Args:
  126. config: TTS-Konfiguration
  127. """
  128. self._config = config or TTSConfig()
  129. self._state = TTSState.UNINITIALIZED
  130. self._model_loaded = False
  131. self._current_voice: Voice | None = None
  132. @property
  133. def config(self) -> TTSConfig:
  134. """Aktuelle Konfiguration."""
  135. return self._config
  136. @property
  137. def state(self) -> TTSState:
  138. """Aktueller Zustand."""
  139. return self._state
  140. @property
  141. def is_ready(self) -> bool:
  142. """Prüft ob Provider bereit ist."""
  143. return self._state == TTSState.READY
  144. @property
  145. def current_voice(self) -> Voice | None:
  146. """Aktuelle Stimme."""
  147. return self._current_voice
  148. @property
  149. @abstractmethod
  150. def name(self) -> str:
  151. """Name des Providers (z.B. 'piper', 'google')."""
  152. pass
  153. @property
  154. @abstractmethod
  155. def supported_languages(self) -> list[str]:
  156. """Liste unterstützter Sprachen."""
  157. pass
  158. @property
  159. def supports_streaming(self) -> bool:
  160. """Prüft ob Streaming unterstützt wird."""
  161. return False
  162. @property
  163. def supports_ssml(self) -> bool:
  164. """Prüft ob SSML unterstützt wird."""
  165. return False
  166. # === Lifecycle ===
  167. @abstractmethod
  168. async def initialize(self) -> None:
  169. """
  170. Initialisiert den Provider (lädt Modell, etc.).
  171. Raises:
  172. RuntimeError: Bei Initialisierungsfehler
  173. """
  174. pass
  175. @abstractmethod
  176. async def shutdown(self) -> None:
  177. """Fährt den Provider herunter und gibt Ressourcen frei."""
  178. pass
  179. # === Stimmen ===
  180. @abstractmethod
  181. async def get_voices(
  182. self,
  183. language: str | None = None,
  184. ) -> list[Voice]:
  185. """
  186. Gibt verfügbare Stimmen zurück.
  187. Args:
  188. language: Filter nach Sprache (optional)
  189. Returns:
  190. Liste von Voice-Objekten
  191. """
  192. pass
  193. async def set_voice(self, voice_id: str) -> bool:
  194. """
  195. Setzt die aktive Stimme.
  196. Args:
  197. voice_id: ID der Stimme
  198. Returns:
  199. True wenn erfolgreich
  200. """
  201. voices = await self.get_voices()
  202. for voice in voices:
  203. if voice.id == voice_id:
  204. self._current_voice = voice
  205. return True
  206. return False
  207. # === Synthese ===
  208. @abstractmethod
  209. async def synthesize(
  210. self,
  211. text: str,
  212. voice_id: str | None = None,
  213. language: str | None = None,
  214. ) -> TTSResult:
  215. """
  216. Synthetisiert Text zu Audio.
  217. Args:
  218. text: Zu synthetisierender Text
  219. voice_id: Stimmen-ID (optional, default aus Config)
  220. language: Sprache (optional, default aus Config)
  221. Returns:
  222. TTSResult mit Audio-Daten
  223. Raises:
  224. RuntimeError: Bei Synthesefehler
  225. """
  226. pass
  227. async def synthesize_ssml(
  228. self,
  229. ssml: str,
  230. voice_id: str | None = None,
  231. ) -> TTSResult:
  232. """
  233. Synthetisiert SSML zu Audio.
  234. Args:
  235. ssml: SSML-formatierter Text
  236. voice_id: Stimmen-ID (optional)
  237. Returns:
  238. TTSResult mit Audio-Daten
  239. """
  240. if not self.supports_ssml:
  241. # Fallback: SSML-Tags entfernen und als Plain-Text behandeln
  242. import re
  243. plain_text = re.sub(r"<[^>]+>", "", ssml)
  244. return await self.synthesize(plain_text, voice_id)
  245. raise NotImplementedError("SSML nicht implementiert")
  246. # === Streaming (optional) ===
  247. async def synthesize_stream(
  248. self,
  249. text: str,
  250. voice_id: str | None = None,
  251. language: str | None = None,
  252. chunk_size: int = 4096,
  253. ) -> AsyncIterator[bytes]:
  254. """
  255. Synthetisiert Text zu Audio-Stream.
  256. Args:
  257. text: Zu synthetisierender Text
  258. voice_id: Stimmen-ID (optional)
  259. language: Sprache (optional)
  260. chunk_size: Größe der Audio-Chunks
  261. Yields:
  262. Audio-Chunks als bytes
  263. """
  264. # Default-Implementierung: Synthetisiere komplett und chunke
  265. result = await self.synthesize(text, voice_id, language)
  266. for i in range(0, len(result.audio_data), chunk_size):
  267. yield result.audio_data[i:i + chunk_size]
  268. # === Utilities ===
  269. def estimate_duration(self, text: str) -> float:
  270. """
  271. Schätzt Audio-Dauer für Text.
  272. Args:
  273. text: Text
  274. Returns:
  275. Geschätzte Dauer in Sekunden
  276. """
  277. # Grobe Schätzung: ~150 Wörter pro Minute
  278. words = len(text.split())
  279. return (words / 150) * 60 * (1 / self._config.speed)