# -*- coding: utf-8 -*- """ TTS (Text-to-Speech) Interface. Definiert die Schnittstelle für TTS-Provider (Plugins). """ from abc import ABC, abstractmethod from dataclasses import dataclass, field from enum import Enum from typing import Any, AsyncIterator class TTSState(Enum): """Zustand des TTS-Providers.""" UNINITIALIZED = "uninitialized" READY = "ready" SYNTHESIZING = "synthesizing" ERROR = "error" @dataclass class Voice: """Beschreibung einer TTS-Stimme.""" id: str name: str language: str gender: str = "neutral" # male, female, neutral description: str = "" # Qualität/Stil style: str = "default" sample_rate: int = 22050 # Provider-spezifisch provider: str = "" model: str = "" metadata: dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict: """Konvertiert zu Dictionary.""" return { "id": self.id, "name": self.name, "language": self.language, "gender": self.gender, "description": self.description, "style": self.style, "sample_rate": self.sample_rate, "provider": self.provider, "model": self.model, } @dataclass class TTSConfig: """Konfiguration für TTS-Provider.""" # Sprache & Stimme language: str = "de-DE" voice_id: str | None = None voice_name: str | None = None # Audio-Format sample_rate: int = 22050 channels: int = 1 sample_width: int = 2 # 16-bit # Sprachparameter speed: float = 1.0 # 0.5 - 2.0 pitch: float = 1.0 # 0.5 - 2.0 volume: float = 1.0 # 0.0 - 1.0 # Performance use_gpu: bool = False num_threads: int = 4 # Modell model_path: str | None = None model_name: str | None = None # Caching enable_cache: bool = True cache_dir: str | None = None @dataclass class TTSResult: """Ergebnis einer TTS-Synthese.""" # Audio-Daten audio_data: bytes sample_rate: int = 22050 channels: int = 1 # Metadaten text: str = "" duration_seconds: float = 0.0 processing_time_ms: float = 0.0 # Stimme voice_id: str = "" voice_name: str = "" language: str = "de-DE" # Provider-Info provider: str = "" model: str = "" # Zusätzliche Daten raw_result: Any = None def to_dict(self) -> dict: """Konvertiert zu Dictionary (ohne Audio-Daten).""" return { "text": self.text, "duration_seconds": self.duration_seconds, "processing_time_ms": self.processing_time_ms, "sample_rate": self.sample_rate, "channels": self.channels, "voice_id": self.voice_id, "voice_name": self.voice_name, "language": self.language, "provider": self.provider, "model": self.model, "audio_size_bytes": len(self.audio_data), } def save_wav(self, file_path: str) -> None: """Speichert Audio als WAV-Datei.""" import wave with wave.open(file_path, "wb") as wf: wf.setnchannels(self.channels) wf.setsampwidth(2) # 16-bit wf.setframerate(self.sample_rate) wf.writeframes(self.audio_data) class TTSProvider(ABC): """ Abstrakte Basisklasse für TTS-Provider. Plugins implementieren diese Klasse um Text-to-Speech Funktionalität bereitzustellen. Events: - tts_started: TTS-Synthese gestartet - tts_completed: TTS erfolgreich abgeschlossen - tts_error: Fehler bei TTS - tts_chunk: Audio-Chunk bei Streaming """ def __init__(self, config: TTSConfig | None = None): """ Initialisiert den Provider. Args: config: TTS-Konfiguration """ self._config = config or TTSConfig() self._state = TTSState.UNINITIALIZED self._model_loaded = False self._current_voice: Voice | None = None @property def config(self) -> TTSConfig: """Aktuelle Konfiguration.""" return self._config @property def state(self) -> TTSState: """Aktueller Zustand.""" return self._state @property def is_ready(self) -> bool: """Prüft ob Provider bereit ist.""" return self._state == TTSState.READY @property def current_voice(self) -> Voice | None: """Aktuelle Stimme.""" return self._current_voice @property @abstractmethod def name(self) -> str: """Name des Providers (z.B. 'piper', 'google').""" pass @property @abstractmethod def supported_languages(self) -> list[str]: """Liste unterstützter Sprachen.""" pass @property def supports_streaming(self) -> bool: """Prüft ob Streaming unterstützt wird.""" return False @property def supports_ssml(self) -> bool: """Prüft ob SSML unterstützt wird.""" return False # === Lifecycle === @abstractmethod async def initialize(self) -> None: """ Initialisiert den Provider (lädt Modell, etc.). Raises: RuntimeError: Bei Initialisierungsfehler """ pass @abstractmethod async def shutdown(self) -> None: """Fährt den Provider herunter und gibt Ressourcen frei.""" pass # === Stimmen === @abstractmethod async def get_voices( self, language: str | None = None, ) -> list[Voice]: """ Gibt verfügbare Stimmen zurück. Args: language: Filter nach Sprache (optional) Returns: Liste von Voice-Objekten """ pass async def set_voice(self, voice_id: str) -> bool: """ Setzt die aktive Stimme. Args: voice_id: ID der Stimme Returns: True wenn erfolgreich """ voices = await self.get_voices() for voice in voices: if voice.id == voice_id: self._current_voice = voice return True return False # === Synthese === @abstractmethod async def synthesize( self, text: str, voice_id: str | None = None, language: str | None = None, ) -> TTSResult: """ Synthetisiert Text zu Audio. Args: text: Zu synthetisierender Text voice_id: Stimmen-ID (optional, default aus Config) language: Sprache (optional, default aus Config) Returns: TTSResult mit Audio-Daten Raises: RuntimeError: Bei Synthesefehler """ pass async def synthesize_ssml( self, ssml: str, voice_id: str | None = None, ) -> TTSResult: """ Synthetisiert SSML zu Audio. Args: ssml: SSML-formatierter Text voice_id: Stimmen-ID (optional) Returns: TTSResult mit Audio-Daten """ if not self.supports_ssml: # Fallback: SSML-Tags entfernen und als Plain-Text behandeln import re plain_text = re.sub(r"<[^>]+>", "", ssml) return await self.synthesize(plain_text, voice_id) raise NotImplementedError("SSML nicht implementiert") # === Streaming (optional) === async def synthesize_stream( self, text: str, voice_id: str | None = None, language: str | None = None, chunk_size: int = 4096, ) -> AsyncIterator[bytes]: """ Synthetisiert Text zu Audio-Stream. Args: text: Zu synthetisierender Text voice_id: Stimmen-ID (optional) language: Sprache (optional) chunk_size: Größe der Audio-Chunks Yields: Audio-Chunks als bytes """ # Default-Implementierung: Synthetisiere komplett und chunke result = await self.synthesize(text, voice_id, language) for i in range(0, len(result.audio_data), chunk_size): yield result.audio_data[i:i + chunk_size] # === Utilities === def estimate_duration(self, text: str) -> float: """ Schätzt Audio-Dauer für Text. Args: text: Text Returns: Geschätzte Dauer in Sekunden """ # Grobe Schätzung: ~150 Wörter pro Minute words = len(text.split()) return (words / 150) * 60 * (1 / self._config.speed)