main.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. # -*- coding: utf-8 -*-
  2. """
  3. Google Cloud TTS Plugin - Text-to-Speech mit Google Cloud.
  4. Google Cloud Text-to-Speech bietet hochwertige, natürlich klingende Stimmen.
  5. Unterstützt:
  6. - WaveNet-Stimmen (höchste Qualität)
  7. - Standard-Stimmen
  8. - Viele Sprachen und Stimmen
  9. - SSML-Unterstützung
  10. - Anpassbare Sprechgeschwindigkeit und Tonhöhe
  11. Erfordert:
  12. - Google Cloud Konto
  13. - Text-to-Speech API aktiviert
  14. - Credentials (JSON-Schlüsseldatei)
  15. """
  16. import asyncio
  17. import time
  18. from pathlib import Path
  19. from typing import Any, AsyncIterator
  20. from trixy_core.plugins import TrixyPlugin
  21. from trixy_core.audio.tts import TTSProvider, TTSConfig, TTSResult, TTSState, Voice
  class GoogleTTSProvider(TTSProvider):
      """TTS provider backed by the Google Cloud Text-to-Speech API.

      The blocking Google client is driven through the event loop's default
      executor so the coroutines here never block the loop. Every synthesis
      call requests LINEAR16 mono audio at 24 kHz.
      """

      # Output format requested from Google for every synthesis call.
      _SAMPLE_RATE_HZ = 24000
      _SAMPLE_WIDTH_BYTES = 2  # LINEAR16 means 16-bit samples
      _CHANNELS = 1
      # Google prepends a canonical PCM WAV header to LINEAR16 responses;
      # those bytes are not samples and must not count towards the duration.
      _WAV_HEADER_BYTES = 44

      # Substrings of a voice name mapped to the quality label used in the
      # voice description. Checked in order; the first match wins.
      _QUALITY_LABELS = (
          ("Wavenet", "WaveNet (Hochwertig)"),
          ("Neural2", "Neural2 (Sehr hochwertig)"),
          ("Studio", "Studio (Premium)"),
      )

      def __init__(
          self,
          config: TTSConfig | None = None,
          credentials_path: str | None = None,
          voice_name: str = "de-DE-Wavenet-C",
          speaking_rate: float = 1.0,
          pitch: float = 0.0,
      ):
          """Store the settings; no network access happens until ``initialize``.

          Args:
              config: Generic TTS configuration forwarded to the base class.
              credentials_path: Path to a service-account JSON key file. When
                  set, it is exported as ``GOOGLE_APPLICATION_CREDENTIALS``
                  during ``initialize``.
              voice_name: Google voice name used when a call passes no voice.
              speaking_rate: Value passed as ``speaking_rate`` in the audio
                  config of every request.
              pitch: Value passed as ``pitch`` in the audio config of every
                  request.
          """
          super().__init__(config)
          self._credentials_path = credentials_path
          self._voice_name = voice_name
          self._speaking_rate = speaking_rate
          self._pitch = pitch
          self._client = None
          # Voices reported by Google, keyed by voice name.
          self._available_voices: dict[str, Voice] = {}

      @property
      def name(self) -> str:
          """Identifier of this provider."""
          return "google"

      @property
      def supported_languages(self) -> list[str]:
          """Static list of language codes advertised by this provider."""
          return [
              "de-DE", "en-US", "en-GB", "en-AU",
              "fr-FR", "fr-CA", "es-ES", "es-US",
              "it-IT", "pt-BR", "pt-PT", "nl-NL",
              "pl-PL", "ru-RU", "ja-JP", "ko-KR",
              "zh-CN", "zh-TW", "ar-XA", "hi-IN",
          ]

      @property
      def supports_streaming(self) -> bool:
          """Always False: audio is only available once fully synthesized."""
          return False

      @staticmethod
      def _looks_like_ssml(text: str) -> bool:
          """Return True when *text* starts with an SSML ``<speak`` root tag."""
          stripped = text.lstrip()
          if not stripped.startswith("<speak"):
              return False
          # Accept "<speak>" and "<speak attr=...>", but not other tags that
          # merely share the prefix (e.g. "<speaker>").
          following = stripped[6:7]
          return following == ">" or following.isspace()

      @classmethod
      def _linear16_duration(cls, audio_data: bytes) -> float:
          """Return the playback length in seconds of a LINEAR16 response."""
          payload_bytes = len(audio_data)
          if audio_data[:4] == b"RIFF":
              payload_bytes = max(payload_bytes - cls._WAV_HEADER_BYTES, 0)
          return (payload_bytes // cls._SAMPLE_WIDTH_BYTES) / cls._SAMPLE_RATE_HZ

      async def initialize(self) -> None:
          """Create the Google client and load the list of available voices.

          Raises:
              RuntimeError: If ``google-cloud-texttospeech`` is not installed
                  or if creating the client / listing the voices fails. The
                  provider state is set to ``TTSState.ERROR`` in both cases.
          """
          try:
              from google.cloud import texttospeech
          except ImportError as exc:
              self._state = TTSState.ERROR
              raise RuntimeError(
                  "Google Cloud TTS nicht installiert. "
                  "Installieren mit: pip install google-cloud-texttospeech"
              ) from exc

          # NOTE(review): SYNTHESIZING doubles as the "busy" marker during
          # start-up; switch to a dedicated loading state if TTSState has one.
          self._state = TTSState.SYNTHESIZING
          try:
              if self._credentials_path:
                  import os

                  # The Google client reads its credentials from this
                  # process-wide environment variable.
                  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path

              # Client construction is blocking, so keep it off the event loop.
              loop = asyncio.get_running_loop()
              self._client = await loop.run_in_executor(
                  None,
                  texttospeech.TextToSpeechClient,
              )
              await self._load_voices()
          except Exception as exc:
              self._state = TTSState.ERROR
              raise RuntimeError(
                  f"Fehler beim Initialisieren von Google TTS: {exc}"
              ) from exc

          self._model_loaded = True
          self._state = TTSState.READY

      async def _load_voices(self) -> None:
          """Fetch the voice list from Google and cache it by voice name.

          Does nothing when no client exists. If the configured default voice
          is among the results it becomes ``self._current_voice``.
          """
          if not self._client:
              return

          from google.cloud import texttospeech

          gender_names = {
              texttospeech.SsmlVoiceGender.MALE: "male",
              texttospeech.SsmlVoiceGender.FEMALE: "female",
              texttospeech.SsmlVoiceGender.NEUTRAL: "neutral",
          }

          loop = asyncio.get_running_loop()
          response = await loop.run_in_executor(None, self._client.list_voices)

          for voice in response.voices:
              voice_id = voice.name
              gender = gender_names.get(voice.ssml_gender, "unknown")
              quality = next(
                  (label for marker, label in self._QUALITY_LABELS if marker in voice_id),
                  "Standard",
              )
              # Prefer the rate reported by the API; fall back to the
              # name-based guess when the field is missing or zero.
              guessed_rate = 24000 if "Wavenet" in voice_id or "Neural" in voice_id else 22050
              sample_rate = getattr(voice, "natural_sample_rate_hertz", 0) or guessed_rate

              # The cache is keyed by voice name, so for a voice with several
              # language codes the last code listed is the one that is kept.
              for language_code in voice.language_codes:
                  self._available_voices[voice_id] = Voice(
                      id=voice_id,
                      name=voice_id,
                      language=language_code,
                      gender=gender,
                      description=f"Google {quality} Stimme",
                      sample_rate=sample_rate,
                  )

          if self._voice_name in self._available_voices:
              self._current_voice = self._available_voices[self._voice_name]

      async def shutdown(self) -> None:
          """Drop the client and cached voices and mark the provider uninitialized."""
          self._client = None
          self._model_loaded = False
          self._available_voices.clear()
          self._state = TTSState.UNINITIALIZED

      async def get_voices(self, language: str | None = None) -> list[Voice]:
          """Return the cached voices, optionally filtered by language.

          Args:
              language: Language code such as ``"de"`` or ``"de-DE"``. Only the
                  primary subtag (the part before the first ``-``) is compared,
                  case-insensitively. ``None`` or ``""`` returns every voice.

          Returns:
              A new list of matching voices.
          """
          voices = list(self._available_voices.values())
          if language:
              # Compare whole subtags: a prefix test would make "fi" (Finnish)
              # match "fil-PH" (Filipino) as well.
              wanted = language.split("-")[0].lower()
              voices = [
                  voice
                  for voice in voices
                  if voice.language.split("-")[0].lower() == wanted
              ]
          return voices

      async def synthesize(
          self,
          text: str,
          voice_id: str | None = None,
          language: str | None = None,
      ) -> TTSResult:
          """Synthesize *text* with Google TTS and return the audio.

          Args:
              text: Plain text, or SSML when it starts with a ``<speak`` tag.
              voice_id: Google voice name; defaults to the configured voice.
              language: Language code for the request. When omitted, the
                  language of an explicitly requested, known voice is used;
                  otherwise the configured language.

          Returns:
              A ``TTSResult`` carrying the LINEAR16 audio exactly as returned
              by Google, plus timing and voice metadata.

          Raises:
              RuntimeError: If the provider is not initialized or the request
                  fails. On failure the state is set to ``TTSState.ERROR``.
          """
          if not self._client:
              raise RuntimeError("Google TTS Client nicht initialisiert")

          from google.cloud import texttospeech

          self._state = TTSState.SYNTHESIZING
          started = time.perf_counter()

          try:
              voice_name = voice_id or self._voice_name
              known_voice = self._available_voices.get(voice_name)

              if language:
                  lang = language
              elif voice_id and known_voice is not None:
                  # A per-call voice override must be paired with that voice's
                  # own language, not with the configured default language.
                  lang = known_voice.language
              else:
                  lang = self._config.language

              if self._looks_like_ssml(text):
                  synthesis_input = texttospeech.SynthesisInput(ssml=text)
              else:
                  synthesis_input = texttospeech.SynthesisInput(text=text)

              voice_params = texttospeech.VoiceSelectionParams(
                  language_code=lang,
                  name=voice_name,
              )
              audio_config = texttospeech.AudioConfig(
                  audio_encoding=texttospeech.AudioEncoding.LINEAR16,
                  speaking_rate=self._speaking_rate,
                  pitch=self._pitch,
                  sample_rate_hertz=self._SAMPLE_RATE_HZ,
              )

              def do_synthesize():
                  return self._client.synthesize_speech(
                      input=synthesis_input,
                      voice=voice_params,
                      audio_config=audio_config,
                  )

              # The Google client call is blocking; run it in the executor.
              loop = asyncio.get_running_loop()
              response = await loop.run_in_executor(None, do_synthesize)

              processing_time = (time.perf_counter() - started) * 1000
              audio_data = response.audio_content
              self._state = TTSState.READY

              return TTSResult(
                  audio_data=audio_data,
                  sample_rate=self._SAMPLE_RATE_HZ,
                  channels=self._CHANNELS,
                  text=text,
                  duration_seconds=self._linear16_duration(audio_data),
                  processing_time_ms=processing_time,
                  voice_id=voice_name,
                  voice_name=known_voice.name if known_voice is not None else voice_name,
                  language=lang,
                  provider="google",
                  model=voice_name,
              )
          except Exception as exc:
              # NOTE(review): the state stays ERROR after a failed request. If
              # readiness is derived from the state, later requests may be
              # rejected until the provider is re-initialized -- confirm.
              self._state = TTSState.ERROR
              raise RuntimeError(f"Google TTS Synthese fehlgeschlagen: {exc}") from exc

      async def synthesize_stream(
          self,
          text: str,
          voice_id: str | None = None,
          language: str | None = None,
          chunk_size: int = 4096,
      ) -> AsyncIterator[bytes]:
          """Yield the synthesized audio in chunks (simulated streaming).

          The complete audio is synthesized first and then sliced, so the
          first chunk is only available once synthesis has finished.

          Args:
              text: Text or SSML to synthesize.
              voice_id: Google voice name; defaults to the configured voice.
              language: Language code for the request.
              chunk_size: Maximum number of bytes per yielded chunk.

          Raises:
              ValueError: If *chunk_size* is not positive.
              RuntimeError: Propagated from ``synthesize``.
          """
          # Validate before synthesizing so a bad argument costs no API call.
          if chunk_size <= 0:
              raise ValueError("chunk_size must be a positive integer")

          result = await self.synthesize(text, voice_id, language)
          audio = result.audio_data
          for offset in range(0, len(audio), chunk_size):
              yield audio[offset:offset + chunk_size]
  class GoogleTTSPlugin(TrixyPlugin):
      """Trixy plugin that exposes Google Cloud TTS through a GoogleTTSProvider."""

      def __init__(self, application, plugin_path, config: dict | None = None):
          """Forward all arguments to the base plugin; the provider is built in ``on_load``."""
          super().__init__(application, plugin_path, config)
          self._provider: GoogleTTSProvider | None = None

      async def on_load(self) -> None:
          """Build and initialize the provider, then hook it into the application.

          Reads ``credentials_path``, ``voice``, ``language``, ``speaking_rate``
          and ``pitch`` from the plugin config. Registers the provider at the
          ``conversation.tts`` extension point when the application offers
          one, and subscribes to ``tts_request`` events.
          """
          from trixy_core.utils.debug import pinfo

          pinfo("Google TTS Plugin: Lade...")

          settings = self.config
          credentials_path = settings.get("credentials_path")
          voice_name = settings.get("voice", "de-DE-Wavenet-C")
          language = settings.get("language", "de-DE")
          speaking_rate = settings.get("speaking_rate", 1.0)
          pitch = settings.get("pitch", 0.0)

          self._provider = GoogleTTSProvider(
              TTSConfig(language=language, voice_id=voice_name),
              credentials_path=credentials_path,
              voice_name=voice_name,
              speaking_rate=speaking_rate,
              pitch=pitch,
          )
          await self._provider.initialize()

          # Extension points are optional on the application object.
          app = self.application
          if hasattr(app, "extension_points"):
              tts_slot = app.extension_points.get("conversation.tts")
              if tts_slot:
                  tts_slot.register(self._provider)
                  pinfo("Google TTS Plugin: Extension registriert")

          self._register_event_handlers()

          pinfo(f"Google TTS Plugin: Geladen (Stimme: {voice_name})")

      def _register_event_handlers(self) -> None:
          """Subscribe the ``tts_request`` handler on the application's event manager."""
          em = self.application.events

          @em.on("tts_request")
          async def on_tts_request(event_name: str, data: dict) -> None:
              """Synthesize the requested text and emit ``tts_completed`` or ``tts_error``."""
              provider = self._provider
              if not (provider and provider.is_ready):
                  return

              text = data.get("text")
              satellite_id = data.get("satellite_id")
              session_id = data.get("session_id")
              voice_id = data.get("voice_id")
              language = data.get("language")

              # Requests without text are ignored silently.
              if not text:
                  return

              from trixy_core.utils.debug import pinfo

              pinfo(f"Google TTS: Synthetisiere '{text[:50]}...'")

              try:
                  result = await provider.synthesize(
                      text,
                      voice_id=voice_id,
                      language=language,
                  )

                  # The audio bytes are sent hex-encoded in the event payload.
                  completed = {
                      "audio_data": result.audio_data.hex(),
                      "sample_rate": result.sample_rate,
                      "duration_seconds": result.duration_seconds,
                      "provider": "google",
                      "text": text,
                      "satellite_id": satellite_id,
                      "session_id": session_id,
                      "processing_time_ms": result.processing_time_ms,
                  }
                  await em.emit("tts_completed", completed)

                  pinfo(f"Google TTS: Fertig ({result.duration_seconds:.1f}s Audio, "
                        f"{result.processing_time_ms:.0f}ms)")

              except Exception as exc:
                  from trixy_core.utils.debug import perror

                  perror(f"Google TTS Fehler: {exc}")

                  failed = {
                      "error": str(exc),
                      "provider": "google",
                      "text": text,
                      "satellite_id": satellite_id,
                      "session_id": session_id,
                  }
                  await em.emit("tts_error", failed)

      async def on_unload(self) -> None:
          """Shut the provider down (if any) and log the unload."""
          if self._provider:
              await self._provider.shutdown()
              self._provider = None

          from trixy_core.utils.debug import pinfo

          pinfo("Google TTS Plugin: Entladen")

      @property
      def provider(self) -> GoogleTTSProvider | None:
          """The active TTS provider, or ``None`` when the plugin is not loaded."""
          return self._provider
  # Module-level alias for the plugin class (presumably the name the Trixy
  # plugin loader looks up -- confirm against the loader).
  Plugin = GoogleTTSPlugin