# -*- coding: utf-8 -*-
"""
TTS (Text-to-Speech) interface.

Defines the interface for TTS providers (plugins).
"""
- from abc import ABC, abstractmethod
- from dataclasses import dataclass, field
- from enum import Enum
- from typing import Any, AsyncIterator
class TTSState(Enum):
    """State of a TTS provider."""

    # Initial state assigned by TTSProvider.__init__.
    UNINITIALIZED = "uninitialized"

    # State for which TTSProvider.is_ready reports True.
    READY = "ready"

    SYNTHESIZING = "synthesizing"
    ERROR = "error"
@dataclass
class Voice:
    """Description of a single TTS voice."""

    id: str
    name: str
    language: str
    gender: str = "neutral"  # male, female, neutral
    description: str = ""

    # Quality / style
    style: str = "default"
    sample_rate: int = 22050

    # Provider-specific
    provider: str = ""
    model: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the voice as a plain dictionary.

        Returns:
            A dict with the keys ``id``, ``name``, ``language``, ``gender``,
            ``description``, ``style``, ``sample_rate``, ``provider`` and
            ``model``, in that order. ``metadata`` is not included.
        """
        # Key order below is the order of the resulting dictionary.
        exported_fields = (
            "id",
            "name",
            "language",
            "gender",
            "description",
            "style",
            "sample_rate",
            "provider",
            "model",
        )
        return {key: getattr(self, key) for key in exported_fields}
- @dataclass
- class TTSConfig:
- """Konfiguration für TTS-Provider."""
- # Sprache & Stimme
- language: str = "de-DE"
- voice_id: str | None = None
- voice_name: str | None = None
- # Audio-Format
- sample_rate: int = 22050
- channels: int = 1
- sample_width: int = 2 # 16-bit
- # Sprachparameter
- speed: float = 1.0 # 0.5 - 2.0
- pitch: float = 1.0 # 0.5 - 2.0
- volume: float = 1.0 # 0.0 - 1.0
- # Performance
- use_gpu: bool = False
- num_threads: int = 4
- # Modell
- model_path: str | None = None
- model_name: str | None = None
- # Caching
- enable_cache: bool = True
- cache_dir: str | None = None
@dataclass
class TTSResult:
    """Result of a TTS synthesis."""

    # Audio data
    audio_data: bytes
    sample_rate: int = 22050
    channels: int = 1

    # Metadata
    text: str = ""
    duration_seconds: float = 0.0
    processing_time_ms: float = 0.0

    # Voice
    voice_id: str = ""
    voice_name: str = ""
    language: str = "de-DE"

    # Provider info
    provider: str = ""
    model: str = ""

    # Additional data
    raw_result: Any = None

    def to_dict(self) -> dict:
        """Return the result as a dictionary without the audio payload.

        Returns:
            A dict with the metadata fields plus ``audio_size_bytes``, the
            length of ``audio_data``. Neither ``audio_data`` nor
            ``raw_result`` is included.
        """
        return {
            "text": self.text,
            "duration_seconds": self.duration_seconds,
            "processing_time_ms": self.processing_time_ms,
            "sample_rate": self.sample_rate,
            "channels": self.channels,
            "voice_id": self.voice_id,
            "voice_name": self.voice_name,
            "language": self.language,
            "provider": self.provider,
            "model": self.model,
            "audio_size_bytes": len(self.audio_data),
        }

    def save_wav(self, file_path: str, sample_width: int = 2) -> None:
        """Write the audio to a WAV file.

        ``audio_data`` is written unchanged as the frame data; the header
        is built from ``channels``, ``sample_rate`` and ``sample_width``.

        Args:
            file_path: Path of the WAV file to create.
            sample_width: Bytes per sample recorded in the WAV header.
                Defaults to 2 (16-bit), which keeps the previous behaviour
                for callers that do not pass it.
        """
        import wave

        with wave.open(file_path, "wb") as wav_file:
            wav_file.setnchannels(self.channels)
            wav_file.setsampwidth(sample_width)
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(self.audio_data)
class TTSProvider(ABC):
    """Abstract base class for TTS providers.

    Plugins implement this class to provide text-to-speech functionality.

    Events (listed for implementers; nothing in this base class emits them):
        - tts_started: TTS synthesis started
        - tts_completed: TTS finished successfully
        - tts_error: error during TTS
        - tts_chunk: audio chunk while streaming
    """

    def __init__(self, config: TTSConfig | None = None):
        """Initialize the provider.

        Args:
            config: TTS configuration. A default ``TTSConfig()`` is created
                when ``None`` is passed.
        """
        # Explicit None check so only a missing config triggers the default.
        self._config = config if config is not None else TTSConfig()
        self._state = TTSState.UNINITIALIZED
        # Not read anywhere in this base class; presumably maintained by
        # subclasses once their model is loaded.
        self._model_loaded = False
        self._current_voice: Voice | None = None

    @property
    def config(self) -> TTSConfig:
        """Current configuration."""
        return self._config

    @property
    def state(self) -> TTSState:
        """Current state."""
        return self._state

    @property
    def is_ready(self) -> bool:
        """Whether the provider is in the ``READY`` state."""
        return self._state == TTSState.READY

    @property
    def current_voice(self) -> Voice | None:
        """Currently selected voice, or ``None`` if none has been set."""
        return self._current_voice

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the provider (e.g. 'piper', 'google')."""

    @property
    @abstractmethod
    def supported_languages(self) -> list[str]:
        """List of supported languages."""

    @property
    def supports_streaming(self) -> bool:
        """Whether streaming is supported (``False`` unless overridden)."""
        return False

    @property
    def supports_ssml(self) -> bool:
        """Whether SSML is supported (``False`` unless overridden)."""
        return False

    # === Lifecycle ===

    @abstractmethod
    async def initialize(self) -> None:
        """Initialize the provider (load the model, etc.).

        Raises:
            RuntimeError: On initialization failure.
        """

    @abstractmethod
    async def shutdown(self) -> None:
        """Shut the provider down and release its resources."""

    # === Voices ===

    @abstractmethod
    async def get_voices(
        self,
        language: str | None = None,
    ) -> list[Voice]:
        """Return the available voices.

        Args:
            language: Optional language filter.

        Returns:
            List of ``Voice`` objects.
        """

    async def set_voice(self, voice_id: str) -> bool:
        """Select the active voice by its ID.

        Looks the ID up in the unfiltered result of ``get_voices()``.

        Args:
            voice_id: ID of the voice to activate.

        Returns:
            ``True`` if a voice with that ID was found and selected,
            ``False`` otherwise (the current voice is then left unchanged).
        """
        for candidate in await self.get_voices():
            if candidate.id == voice_id:
                self._current_voice = candidate
                return True
        return False

    # === Synthesis ===

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
    ) -> TTSResult:
        """Synthesize text to audio.

        Args:
            text: Text to synthesize.
            voice_id: Voice ID (optional, default taken from the config).
            language: Language (optional, default taken from the config).

        Returns:
            ``TTSResult`` containing the audio data.

        Raises:
            RuntimeError: On synthesis failure.
        """

    async def synthesize_ssml(
        self,
        ssml: str,
        voice_id: str | None = None,
    ) -> TTSResult:
        """Synthesize SSML to audio.

        When ``supports_ssml`` is ``False`` the markup is reduced to plain
        text and passed to ``synthesize``. Providers that report SSML
        support must override this method.

        Args:
            ssml: SSML-formatted text.
            voice_id: Voice ID (optional).

        Returns:
            ``TTSResult`` containing the audio data.

        Raises:
            NotImplementedError: If ``supports_ssml`` is ``True`` but the
                method has not been overridden.
        """
        if self.supports_ssml:
            raise NotImplementedError("SSML nicht implementiert")

        # Fallback: drop the SSML tags and treat the rest as plain text.
        import html
        import re

        without_tags = re.sub(r"<[^>]+>", "", ssml)
        # SSML is XML, so literal characters arrive escaped (e.g. "&amp;").
        # Decode them after tag removal so escaped angle brackets are kept
        # as text rather than being mistaken for markup.
        plain_text = html.unescape(without_tags)
        return await self.synthesize(plain_text, voice_id)

    # === Streaming (optional) ===

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
        chunk_size: int = 4096,
    ) -> AsyncIterator[bytes]:
        """Synthesize text to an audio stream.

        Default implementation: runs a complete ``synthesize`` call and
        slices its ``audio_data`` into chunks.

        Args:
            text: Text to synthesize.
            voice_id: Voice ID (optional).
            language: Language (optional).
            chunk_size: Size of each audio chunk in bytes; must be positive.

        Yields:
            Audio chunks as ``bytes``; the last chunk may be shorter than
            ``chunk_size``.

        Raises:
            ValueError: If ``chunk_size`` is not positive. As this is an
                async generator, the error surfaces on first iteration.
        """
        # Reject bad sizes before paying for a synthesis: a negative step
        # would silently produce an empty stream, and zero is invalid for
        # range().
        if chunk_size <= 0:
            raise ValueError("chunk_size muss positiv sein")

        result = await self.synthesize(text, voice_id, language)
        audio = result.audio_data
        for start in range(0, len(audio), chunk_size):
            yield audio[start:start + chunk_size]

    # === Utilities ===

    def estimate_duration(self, text: str) -> float:
        """Estimate the audio duration for a text.

        Rough estimate based on about 150 words per minute, scaled by the
        inverse of the configured ``speed``.

        Args:
            text: Text whose spoken duration is estimated.

        Returns:
            Estimated duration in seconds.

        Raises:
            ZeroDivisionError: If the configured ``speed`` is 0.
        """
        word_count = len(text.split())
        minutes_at_normal_speed = word_count / 150
        return minutes_at_normal_speed * 60 * (1 / self._config.speed)
|