| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382 |
- # -*- coding: utf-8 -*-
- """
- Google Cloud TTS Plugin - Text-to-Speech mit Google Cloud.
- Google Cloud Text-to-Speech bietet hochwertige, natürlich klingende Stimmen.
- Unterstützt:
- - WaveNet-Stimmen (höchste Qualität)
- - Standard-Stimmen
- - Viele Sprachen und Stimmen
- - SSML-Unterstützung
- - Anpassbare Sprechgeschwindigkeit und Tonhöhe
- Erfordert:
- - Google Cloud Konto
- - Text-to-Speech API aktiviert
- - Credentials (JSON-Schlüsseldatei)
- """
- import asyncio
- import time
- from pathlib import Path
- from typing import Any, AsyncIterator
- from trixy_core.plugins import TrixyPlugin
- from trixy_core.audio.tts import TTSProvider, TTSConfig, TTSResult, TTSState, Voice
class GoogleTTSProvider(TTSProvider):
    """Text-to-speech provider backed by Google Cloud Text-to-Speech.

    The Google client library is blocking, so every API call is dispatched
    to the event loop's default executor. Audio is always requested as
    16-bit linear PCM, mono, at 24 kHz.
    """

    # Sample rate requested from Google and reported in every TTSResult.
    _SAMPLE_RATE_HZ = 24000
    # Google documents that LINEAR16 responses carry a WAV header; this
    # assumes the canonical 44-byte RIFF header layout.
    _WAV_HEADER_BYTES = 44

    def __init__(
        self,
        config: TTSConfig | None = None,
        credentials_path: str | None = None,
        voice_name: str = "de-DE-Wavenet-C",
        speaking_rate: float = 1.0,
        pitch: float = 0.0,
    ):
        """Store the settings; no network access happens here.

        Args:
            config: Generic TTS configuration forwarded to the base class.
            credentials_path: Path to a service-account JSON key file. When
                set, it is exported as ``GOOGLE_APPLICATION_CREDENTIALS``
                during :meth:`initialize`.
            voice_name: Google voice used when a request names no voice.
            speaking_rate: Speaking rate passed to Google's ``AudioConfig``.
            pitch: Pitch passed to Google's ``AudioConfig``.
        """
        super().__init__(config)
        self._credentials_path = credentials_path
        self._voice_name = voice_name
        self._speaking_rate = speaking_rate
        self._pitch = pitch
        self._client = None
        self._available_voices: dict[str, Voice] = {}

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "google"

    @property
    def supported_languages(self) -> list[str]:
        """Static list of language codes this provider advertises."""
        return [
            "de-DE", "en-US", "en-GB", "en-AU",
            "fr-FR", "fr-CA", "es-ES", "es-US",
            "it-IT", "pt-BR", "pt-PT", "nl-NL",
            "pl-PL", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-XA", "hi-IN",
        ]

    @property
    def supports_streaming(self) -> bool:
        """Always ``False``: audio is fetched in one request.

        :meth:`synthesize_stream` only slices the finished audio.
        """
        return False

    async def initialize(self) -> None:
        """Create the Google client and load the voice catalogue.

        Raises:
            RuntimeError: If ``google-cloud-texttospeech`` is not installed,
                or if creating the client or listing the voices fails.
        """
        # Import separately so that only a genuinely missing package is
        # reported as "not installed".
        try:
            from google.cloud import texttospeech
        except ImportError as e:
            raise RuntimeError(
                "Google Cloud TTS nicht installiert. "
                "Installieren mit: pip install google-cloud-texttospeech"
            ) from e

        try:
            # NOTE(review): SYNTHESIZING is reused as the "busy" marker while
            # loading; switch to a dedicated loading state if TTSState has one.
            self._state = TTSState.SYNTHESIZING

            if self._credentials_path:
                import os

                # The Google client discovers credentials via this variable.
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path

            # Client construction is blocking; keep it off the event loop.
            loop = asyncio.get_running_loop()
            self._client = await loop.run_in_executor(
                None,
                texttospeech.TextToSpeechClient,
            )

            await self._load_voices()
            self._model_loaded = True
            self._state = TTSState.READY
        except Exception as e:
            self._state = TTSState.ERROR
            raise RuntimeError(
                f"Fehler beim Initialisieren von Google TTS: {e}"
            ) from e

    async def _load_voices(self) -> None:
        """Fetch the voice list from Google and cache it by voice name.

        Does nothing when no client exists. A voice that lists several
        language codes is stored once, under its name, with the last listed
        code as its language.
        """
        if not self._client:
            return

        from google.cloud import texttospeech

        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, self._client.list_voices)

        gender_labels = {
            texttospeech.SsmlVoiceGender.MALE: "male",
            texttospeech.SsmlVoiceGender.FEMALE: "female",
            texttospeech.SsmlVoiceGender.NEUTRAL: "neutral",
        }

        for voice in response.voices:
            voice_id = voice.name

            # These values depend only on the voice, not on the language
            # code, so compute them once per voice.
            gender = gender_labels.get(voice.ssml_gender, "unknown")

            # Derive the quality tier from the voice name.
            if "Wavenet" in voice_id:
                quality = "WaveNet (Hochwertig)"
            elif "Neural2" in voice_id:
                quality = "Neural2 (Sehr hochwertig)"
            elif "Studio" in voice_id:
                quality = "Studio (Premium)"
            else:
                quality = "Standard"

            if "Wavenet" in voice_id or "Neural" in voice_id:
                sample_rate = 24000
            else:
                sample_rate = 22050

            for language_code in voice.language_codes:
                self._available_voices[voice_id] = Voice(
                    id=voice_id,
                    name=voice_id,
                    language=language_code,
                    gender=gender,
                    description=f"Google {quality} Stimme",
                    sample_rate=sample_rate,
                )

        # Select the configured voice if Google offers it.
        if self._voice_name in self._available_voices:
            self._current_voice = self._available_voices[self._voice_name]

    async def shutdown(self) -> None:
        """Drop the client and the cached voices and reset the state."""
        self._client = None
        self._model_loaded = False
        self._available_voices.clear()
        self._state = TTSState.UNINITIALIZED

    async def get_voices(self, language: str | None = None) -> list[Voice]:
        """Return the cached voices, optionally filtered by language.

        Args:
            language: Language code such as ``"de-DE"`` or ``"de"``. Only the
                part before the first ``-`` is used, as a prefix match.

        Returns:
            A new list of matching voices.
        """
        voices = list(self._available_voices.values())
        if language:
            prefix = language.split("-")[0]
            voices = [v for v in voices if v.language.startswith(prefix)]
        return voices

    async def synthesize(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
    ) -> TTSResult:
        """Synthesize ``text`` and return the complete audio.

        Args:
            text: Plain text, or SSML when it starts with a ``<speak>`` tag.
            voice_id: Google voice name; defaults to the configured voice.
            language: Language code for the request. When omitted, the
                language of the selected voice is used if that voice is
                known, otherwise the configured default language.

        Returns:
            The audio (LINEAR16, mono, 24 kHz) together with timing and
            voice metadata.

        Raises:
            RuntimeError: If the client is not initialized or the request
                fails.
        """
        if not self._client:
            raise RuntimeError("Google TTS Client nicht initialisiert")

        from google.cloud import texttospeech

        self._state = TTSState.SYNTHESIZING
        # perf_counter is monotonic, so the measured latency cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        try:
            voice_name = voice_id or self._voice_name
            current_voice = self._available_voices.get(voice_name)

            # A named voice must be requested with a matching language code,
            # so prefer the voice's own language over the global default.
            if language:
                lang = language
            elif current_voice is not None:
                lang = current_voice.language
            else:
                lang = self._config.language

            # SSML documents start with <speak>, optionally with attributes.
            if text.strip().startswith(("<speak>", "<speak ")):
                synthesis_input = texttospeech.SynthesisInput(ssml=text)
            else:
                synthesis_input = texttospeech.SynthesisInput(text=text)

            voice_params = texttospeech.VoiceSelectionParams(
                language_code=lang,
                name=voice_name,
            )

            audio_config = texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.LINEAR16,
                speaking_rate=self._speaking_rate,
                pitch=self._pitch,
                sample_rate_hertz=self._SAMPLE_RATE_HZ,
            )

            def do_synthesize():
                return self._client.synthesize_speech(
                    input=synthesis_input,
                    voice=voice_params,
                    audio_config=audio_config,
                )

            # The client call blocks, so run it in the default executor.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(None, do_synthesize)

            processing_time = (time.perf_counter() - start_time) * 1000
            audio_data = response.audio_content
            self._state = TTSState.READY

            # Duration of 16-bit mono audio; do not count a leading WAV
            # header as samples.
            pcm_bytes = len(audio_data)
            if audio_data[:4] == b"RIFF":
                pcm_bytes = max(0, pcm_bytes - self._WAV_HEADER_BYTES)
            duration = (pcm_bytes // 2) / self._SAMPLE_RATE_HZ

            return TTSResult(
                audio_data=audio_data,
                sample_rate=self._SAMPLE_RATE_HZ,
                channels=1,
                text=text,
                duration_seconds=duration,
                processing_time_ms=processing_time,
                voice_id=voice_name,
                voice_name=current_voice.name if current_voice else voice_name,
                language=lang,
                provider="google",
                model=voice_name,
            )
        except Exception as e:
            self._state = TTSState.ERROR
            raise RuntimeError(f"Google TTS Synthese fehlgeschlagen: {e}") from e

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
        chunk_size: int = 4096,
    ) -> AsyncIterator[bytes]:
        """Yield the synthesized audio in chunks (simulated streaming).

        The full audio is produced by :meth:`synthesize` first and then
        sliced into pieces of at most ``chunk_size`` bytes.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
            RuntimeError: Propagated from :meth:`synthesize`.
        """
        # Reject bad sizes before spending an API call on the synthesis.
        if chunk_size <= 0:
            raise ValueError("chunk_size muss größer als 0 sein")

        result = await self.synthesize(text, voice_id, language)
        audio = result.audio_data
        for offset in range(0, len(audio), chunk_size):
            yield audio[offset:offset + chunk_size]
class GoogleTTSPlugin(TrixyPlugin):
    """Trixy plugin that makes Google Cloud TTS available to the application."""

    def __init__(self, application, plugin_path, config: dict | None = None):
        super().__init__(application, plugin_path, config)
        # Created in on_load, released in on_unload.
        self._provider: GoogleTTSProvider | None = None

    @property
    def provider(self) -> GoogleTTSProvider | None:
        """The active TTS provider, or ``None`` while the plugin is not loaded."""
        return self._provider

    async def on_load(self) -> None:
        """Build and initialize the provider, then hook it into the application."""
        from trixy_core.utils.debug import pinfo

        pinfo("Google TTS Plugin: Lade...")

        # Read the plugin settings, falling back to German WaveNet defaults.
        settings = self.config
        credentials_path = settings.get("credentials_path")
        voice_name = settings.get("voice", "de-DE-Wavenet-C")
        language = settings.get("language", "de-DE")
        speaking_rate = settings.get("speaking_rate", 1.0)
        pitch = settings.get("pitch", 0.0)

        provider = GoogleTTSProvider(
            TTSConfig(language=language, voice_id=voice_name),
            credentials_path=credentials_path,
            voice_name=voice_name,
            speaking_rate=speaking_rate,
            pitch=pitch,
        )
        self._provider = provider

        # Connects to Google and loads the voice list.
        await provider.initialize()

        # Register at the TTS extension point when the application has one.
        app = self.application
        if hasattr(app, "extension_points") and (
            tts_point := app.extension_points.get("conversation.tts")
        ):
            tts_point.register(provider)
            pinfo("Google TTS Plugin: Extension registriert")

        self._register_event_handlers()

        pinfo(f"Google TTS Plugin: Geladen (Stimme: {voice_name})")

    def _register_event_handlers(self) -> None:
        """Subscribe to ``tts_request`` events on the application event manager."""
        em = self.application.events

        @em.on("tts_request")
        async def on_tts_request(event_name: str, data: dict) -> None:
            """Synthesize the requested text and emit the result or an error."""
            provider = self._provider
            if not provider or not provider.is_ready:
                return

            text = data.get("text")
            satellite_id = data.get("satellite_id")
            session_id = data.get("session_id")
            voice_id = data.get("voice_id")
            language = data.get("language")

            if not text:
                return

            from trixy_core.utils.debug import pinfo

            pinfo(f"Google TTS: Synthetisiere '{text[:50]}...'")

            # Fields shared by the success and the error event.
            request_info = {
                "provider": "google",
                "text": text,
                "satellite_id": satellite_id,
                "session_id": session_id,
            }

            try:
                result = await self._provider.synthesize(
                    text,
                    voice_id=voice_id,
                    language=language,
                )

                completed = {
                    # Audio bytes are sent hex-encoded.
                    "audio_data": result.audio_data.hex(),
                    "sample_rate": result.sample_rate,
                    "duration_seconds": result.duration_seconds,
                    **request_info,
                    "processing_time_ms": result.processing_time_ms,
                }
                await em.emit("tts_completed", completed)

                pinfo(f"Google TTS: Fertig ({result.duration_seconds:.1f}s Audio, "
                      f"{result.processing_time_ms:.0f}ms)")
            except Exception as e:
                from trixy_core.utils.debug import perror

                perror(f"Google TTS Fehler: {e}")
                await em.emit("tts_error", {"error": str(e), **request_info})

    async def on_unload(self) -> None:
        """Shut the provider down and release it."""
        active = self._provider
        if active:
            await active.shutdown()
            self._provider = None

        from trixy_core.utils.debug import pinfo

        pinfo("Google TTS Plugin: Entladen")
- # Plugin export: module-level alias `Plugin` for the plugin class defined in this file.
- Plugin = GoogleTTSPlugin
|