| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433 |
- # -*- coding: utf-8 -*-
- """
- Coqui TTS Plugin - Text-to-Speech mit Coqui TTS.
- Coqui TTS ist ein Open-Source Deep Learning TTS Framework.
- Unterstützt:
- - Viele vortrainierte Modelle
- - Multi-Speaker Modelle
- - GPU-Beschleunigung (optional)
- - Lokale Synthese (offline)
- - Hochwertige neuronale Stimmen
- - Automatischer Modell-Download
- Modelle:
- - Tacotron2: Gute Qualität, schnell
- - VITS: Sehr natürlich, End-to-End
- - YourTTS: Multi-Speaker, Multi-Lingual
- """
- import asyncio
- import io
- import os
- import time
- import wave
- from pathlib import Path
- from typing import Any, AsyncIterator
- from trixy_core.plugins import TrixyPlugin
- from trixy_core.audio.tts import TTSProvider, TTSConfig, TTSResult, TTSState, Voice
# Catalogue of known Coqui TTS models, keyed by the Coqui model identifier.
# Every entry carries the display name, language tag, speaker gender, a
# human-readable description (runtime text, kept in German) and the sample
# rate in Hz that is advertised for the model's voice.
COQUI_MODELS = {
    "tts_models/de/thorsten/tacotron2-DDC": dict(
        name="Thorsten Tacotron2",
        language="de-DE",
        gender="male",
        description="Deutsche männliche Stimme (Tacotron2)",
        sample_rate=22050,
    ),
    "tts_models/de/thorsten/vits": dict(
        name="Thorsten VITS",
        language="de-DE",
        gender="male",
        description="Deutsche männliche Stimme (VITS, sehr natürlich)",
        sample_rate=22050,
    ),
    "tts_models/de/thorsten/tacotron2-DCA": dict(
        name="Thorsten Tacotron2-DCA",
        language="de-DE",
        gender="male",
        description="Deutsche männliche Stimme (Tacotron2 DCA)",
        sample_rate=22050,
    ),
    "tts_models/en/ljspeech/tacotron2-DDC": dict(
        name="LJSpeech Tacotron2",
        language="en-US",
        gender="female",
        description="Englische weibliche Stimme (Tacotron2)",
        sample_rate=22050,
    ),
    "tts_models/en/ljspeech/vits": dict(
        name="LJSpeech VITS",
        language="en-US",
        gender="female",
        description="Englische weibliche Stimme (VITS)",
        sample_rate=22050,
    ),
    "tts_models/multilingual/multi-dataset/your_tts": dict(
        name="YourTTS Multilingual",
        language="multi",
        gender="multi",
        description="Multilinguale Multi-Speaker Stimme",
        sample_rate=16000,
    ),
}
class CoquiTTSProvider(TTSProvider):
    """Text-to-speech provider that synthesizes audio with a Coqui TTS model.

    The model is loaded by :meth:`initialize`. Model loading and synthesis run
    in the event loop's default executor so the loop itself is not blocked.
    Synthesized audio is returned as mono 16-bit PCM inside a WAV container.
    """

    # Sample rate used when neither the loaded synthesizer nor the model
    # catalogue reports one.
    _FALLBACK_SAMPLE_RATE = 22050
    # Language reported when no TTSConfig is available.
    _FALLBACK_LANGUAGE = "de-DE"

    def __init__(
        self,
        config: TTSConfig | None = None,
        models_dir: Path | None = None,
        model_name: str = "tts_models/de/thorsten/tacotron2-DDC",
        vocoder_name: str | None = None,
        speaker: str | None = None,
        use_cuda: bool = False,
    ):
        """Store the settings; no model is loaded until :meth:`initialize`.

        Args:
            config: Generic TTS configuration forwarded to the base class.
            models_dir: Directory exported as ``TTS_HOME`` before loading.
            model_name: Coqui model identifier to load.
            vocoder_name: Optional vocoder setting forwarded to Coqui.
            speaker: Default speaker used when a voice id names none.
            use_cuda: Whether Coqui should be asked to use the GPU.
        """
        super().__init__(config)
        self._models_dir = models_dir
        self._model_name = model_name
        self._vocoder_name = vocoder_name
        self._speaker = speaker
        self._use_cuda = use_cuda
        self._tts = None
        self._available_voices: dict[str, Voice] = {}

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "coqui"

    @property
    def supported_languages(self) -> list[str]:
        """Language tags this provider advertises."""
        return [
            "de-DE", "en-US", "en-GB", "fr-FR",
            "es-ES", "it-IT", "pt-BR", "pl-PL",
            "nl-NL", "ru-RU", "ja-JP", "zh-CN",
        ]

    @property
    def supports_streaming(self) -> bool:
        """Always ``False``: Coqui TTS produces complete audio clips."""
        return False

    async def initialize(self) -> None:
        """Load the configured Coqui TTS model and populate the voice list.

        Raises:
            RuntimeError: If the ``TTS`` package cannot be imported, or if
                loading the model fails for any other reason.
        """
        # Only the import itself may be reported as "not installed". An
        # ImportError raised later, while the model is loading, is a load
        # failure and must go through the generic handler below so that the
        # state is set to ERROR.
        try:
            from TTS.api import TTS
        except ImportError as exc:
            raise RuntimeError(
                "Coqui TTS nicht installiert. "
                "Installieren mit: pip install TTS"
            ) from exc

        try:
            from trixy_core.utils.debug import pinfo

            # NOTE(review): SYNTHESIZING doubles as the "busy loading" state
            # here; switch to a dedicated loading state if TTSState has one.
            self._state = TTSState.SYNTHESIZING

            if self._models_dir:
                self._models_dir.mkdir(parents=True, exist_ok=True)
                # Per the original author's note, Coqui defaults to
                # HOME/.local/share/tts; TTS_HOME redirects the model store.
                os.environ["TTS_HOME"] = str(self._models_dir)

            pinfo(f"Lade Coqui TTS Modell: {self._model_name}...")
            pinfo("Bei erstem Start wird das Modell heruntergeladen...")

            def load_tts():
                # NOTE(review): a vocoder *name* is passed as ``vocoder_path``;
                # confirm this matches the installed Coqui API.
                return TTS(
                    model_name=self._model_name,
                    vocoder_path=self._vocoder_name,
                    progress_bar=True,
                    gpu=self._use_cuda,
                )

            # Loading (and a possible first-time download) blocks, so it runs
            # in the default executor.
            loop = asyncio.get_running_loop()
            self._tts = await loop.run_in_executor(None, load_tts)

            await self._load_voices()
            self._model_loaded = True
            self._state = TTSState.READY
            pinfo(f"Coqui TTS Modell geladen: {self._model_name}")
        except Exception as exc:
            self._state = TTSState.ERROR
            raise RuntimeError(
                f"Fehler beim Laden des Coqui TTS Modells: {exc}"
            ) from exc

    def _config_language(self) -> str:
        """Return the configured language, or the fallback without a config."""
        return self._config.language if self._config else self._FALLBACK_LANGUAGE

    def _output_sample_rate(self) -> int:
        """Return the output sample rate in Hz.

        Preference order: the loaded synthesizer's ``output_sample_rate``,
        then the catalogue entry for the model, then the fallback.
        """
        synthesizer = getattr(self._tts, "synthesizer", None)
        rate = getattr(synthesizer, "output_sample_rate", None)
        if not rate:
            rate = COQUI_MODELS.get(self._model_name, {}).get(
                "sample_rate", self._FALLBACK_SAMPLE_RATE
            )
        return int(rate)

    def _synthesis_kwargs(
        self, speaker: str | None, language: str | None
    ) -> dict[str, Any]:
        """Build the ``speaker``/``language`` arguments for ``tts()``.

        Each argument is dropped when the loaded model reports that it is not
        multi-speaker / multi-lingual. If the model exposes no such flag, the
        value is passed through unchanged.

        NOTE(review): this assumes Coqui's argument check rejects a speaker or
        language on single-speaker / single-language models; confirm against
        the installed Coqui version.
        """
        tts = self._tts
        if not getattr(tts, "is_multi_speaker", True):
            speaker = None
        if not getattr(tts, "is_multi_lingual", True):
            language = None
        return {"speaker": speaker, "language": language}

    @staticmethod
    def _to_pcm16(wav_data: Any):
        """Convert model output to a flat int16 sample array.

        Float input (ndarray, list, ...) is treated as normalized audio: it is
        clipped to [-1, 1] and scaled to the int16 range. Integer input is
        taken to be PCM already and is only cast.
        """
        import numpy as np

        samples = np.asarray(wav_data)
        if np.issubdtype(samples.dtype, np.integer):
            return samples.astype(np.int16).ravel()
        if not np.issubdtype(samples.dtype, np.floating):
            samples = samples.astype(np.float32)
        clipped = np.clip(samples, -1.0, 1.0)
        return (clipped * 32767).astype(np.int16).ravel()

    async def _load_voices(self) -> None:
        """Fill the voice table from the catalogue and the model's speakers."""
        # Every catalogue model is offered as a voice.
        for model_id, info in COQUI_MODELS.items():
            self._available_voices[model_id] = Voice(
                id=model_id,
                name=info["name"],
                language=info["language"],
                gender=info["gender"],
                description=info["description"],
                sample_rate=info["sample_rate"],
            )

        # Speakers of the loaded model become "<model>:<speaker>" voices.
        speakers = getattr(self._tts, "speakers", None) if self._tts else None
        if speakers:
            language = self._config_language()
            sample_rate = self._output_sample_rate()
            for speaker in speakers:
                voice_id = f"{self._model_name}:{speaker}"
                self._available_voices[voice_id] = Voice(
                    id=voice_id,
                    name=f"{speaker}",
                    language=language,
                    gender="unknown",
                    description=f"Speaker: {speaker}",
                    sample_rate=sample_rate,
                )

        # The configured model is the current voice when it is in the table.
        if self._model_name in self._available_voices:
            self._current_voice = self._available_voices[self._model_name]

    async def shutdown(self) -> None:
        """Drop the model and reset the provider to the uninitialized state."""
        self._tts = None
        self._model_loaded = False
        self._available_voices.clear()
        self._state = TTSState.UNINITIALIZED

    async def get_voices(self, language: str | None = None) -> list[Voice]:
        """Return the known voices, optionally filtered by language.

        Args:
            language: Language tag such as ``"de-DE"``. Only the part before
                the first ``-`` is compared. Voices whose language is
                ``"multi"`` always match.
        """
        voices = list(self._available_voices.values())
        if not language:
            return voices
        prefix = language.split("-")[0]
        return [
            voice for voice in voices
            if voice.language == "multi" or voice.language.startswith(prefix)
        ]

    async def synthesize(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
    ) -> TTSResult:
        """Synthesize ``text`` and return it as a WAV-encoded result.

        Args:
            text: Text to speak.
            voice_id: Optional voice id. A ``"<model>:<speaker>"`` id selects
                that speaker; otherwise the configured default speaker is used.
            language: Optional language forwarded to multi-lingual models.

        Returns:
            A ``TTSResult`` with mono 16-bit WAV bytes, the duration in
            seconds and the model run time in milliseconds.

        Raises:
            RuntimeError: If no model is loaded or synthesis fails.
        """
        if not self._tts:
            raise RuntimeError("Coqui TTS nicht geladen")

        self._state = TTSState.SYNTHESIZING
        started = time.perf_counter()
        try:
            # A "<model>:<speaker>" voice id overrides the default speaker.
            speaker = self._speaker
            if voice_id and ":" in voice_id:
                speaker = voice_id.split(":", 1)[1]

            # Capture the model handle so that a concurrent shutdown() cannot
            # null it while the worker thread is running.
            tts = self._tts
            call_kwargs = self._synthesis_kwargs(speaker, language)

            def run_model():
                return tts.tts(text=text, **call_kwargs)

            loop = asyncio.get_running_loop()
            wav_data = await loop.run_in_executor(None, run_model)
            processing_time = (time.perf_counter() - started) * 1000

            # The model output may be a plain list of floats; normalising it
            # through one path avoids truncating such samples to silence.
            pcm = self._to_pcm16(wav_data)
            sample_rate = self._output_sample_rate()

            audio_buffer = io.BytesIO()
            with wave.open(audio_buffer, "wb") as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit samples
                wav_file.setframerate(sample_rate)
                wav_file.writeframes(pcm.tobytes())
            audio_data = audio_buffer.getvalue()

            self._state = TTSState.READY
            return TTSResult(
                audio_data=audio_data,
                sample_rate=sample_rate,
                channels=1,
                text=text,
                duration_seconds=len(pcm) / sample_rate,
                processing_time_ms=processing_time,
                voice_id=voice_id or self._model_name,
                voice_name=self._current_voice.name if self._current_voice else "",
                language=language or self._config_language(),
                provider="coqui",
                model=self._model_name,
            )
        except Exception as exc:
            # NOTE(review): the state stays ERROR after a failed request;
            # confirm callers that gate on readiness can recover from that.
            self._state = TTSState.ERROR
            raise RuntimeError(
                f"Coqui TTS Synthese fehlgeschlagen: {exc}"
            ) from exc

    async def synthesize_stream(
        self,
        text: str,
        voice_id: str | None = None,
        language: str | None = None,
        chunk_size: int = 4096,
    ) -> AsyncIterator[bytes]:
        """Yield the synthesized WAV bytes in ``chunk_size`` pieces.

        This is simulated streaming: the whole clip is synthesized first and
        then sliced, so the first chunk starts with the WAV header.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
            RuntimeError: Propagated from :meth:`synthesize`.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        result = await self.synthesize(text, voice_id, language)
        audio = result.audio_data
        for offset in range(0, len(audio), chunk_size):
            yield audio[offset:offset + chunk_size]
class CoquiTTSPlugin(TrixyPlugin):
    """Trixy plugin that creates a ``CoquiTTSProvider`` and wires it into the app."""

    def __init__(self, application, plugin_path, config: dict | None = None):
        super().__init__(application, plugin_path, config)
        # Created in on_load(), dropped again in on_unload().
        self._provider: CoquiTTSProvider | None = None

    @property
    def provider(self) -> CoquiTTSProvider | None:
        """The active TTS provider, or ``None`` while the plugin is not loaded."""
        return self._provider

    async def on_load(self) -> None:
        """Build the provider from the plugin config, load the model, register hooks."""
        from trixy_core.utils.debug import pinfo as log_info

        log_info("Coqui TTS Plugin: Lade...")

        # Read the plugin settings.
        settings = self.config
        chosen_model = settings.get("model", "tts_models/de/thorsten/tacotron2-DDC")
        chosen_vocoder = settings.get("vocoder")
        chosen_speaker = settings.get("speaker")
        chosen_language = settings.get("language", "de-DE")
        wants_cuda = settings.get("use_cuda", False)

        # Models are kept in a "models" folder inside the plugin directory.
        model_store = self.plugin_path / "models"
        model_store.mkdir(parents=True, exist_ok=True)

        provider = CoquiTTSProvider(
            TTSConfig(language=chosen_language, voice_id=chosen_model),
            models_dir=model_store,
            model_name=chosen_model,
            vocoder_name=chosen_vocoder,
            speaker=chosen_speaker,
            use_cuda=wants_cuda,
        )
        self._provider = provider

        # Load the model (this may download it first).
        await provider.initialize()

        # Register with the "conversation.tts" extension point when available.
        app = self.application
        if hasattr(app, "extension_points"):
            tts_point = app.extension_points.get("conversation.tts")
            if tts_point:
                tts_point.register(provider)
                log_info("Coqui TTS Plugin: Extension registriert")

        self._register_event_handlers()
        log_info(f"Coqui TTS Plugin: Geladen (Modell: {chosen_model})")

    def _register_event_handlers(self) -> None:
        """Subscribe the ``tts_request`` handler on the application's event manager."""
        events = self.application.events

        async def on_tts_request(event_name: str, data: dict) -> None:
            """Synthesize the requested text and emit ``tts_completed`` or ``tts_error``."""
            provider = self._provider
            if not (provider and provider.is_ready):
                return

            text = data.get("text")
            if not text:
                return
            satellite_id = data.get("satellite_id")
            session_id = data.get("session_id")

            from trixy_core.utils.debug import pinfo as log_info

            log_info(f"Coqui TTS: Synthetisiere '{text[:50]}...'")
            try:
                result = await provider.synthesize(
                    text,
                    voice_id=data.get("voice_id"),
                    language=data.get("language"),
                )
                completed_payload = {
                    # Audio bytes are sent hex-encoded.
                    "audio_data": result.audio_data.hex(),
                    "sample_rate": result.sample_rate,
                    "duration_seconds": result.duration_seconds,
                    "provider": "coqui",
                    "text": text,
                    "satellite_id": satellite_id,
                    "session_id": session_id,
                    "processing_time_ms": result.processing_time_ms,
                }
                await events.emit("tts_completed", completed_payload)
                log_info(
                    f"Coqui TTS: Fertig ({result.duration_seconds:.1f}s Audio, "
                    f"{result.processing_time_ms:.0f}ms)"
                )
            except Exception as e:
                from trixy_core.utils.debug import perror as log_error

                log_error(f"Coqui TTS Fehler: {e}")
                failure_payload = {
                    "error": str(e),
                    "provider": "coqui",
                    "text": text,
                    "satellite_id": satellite_id,
                    "session_id": session_id,
                }
                await events.emit("tts_error", failure_payload)

        # Same effect as decorating the handler with @events.on("tts_request").
        events.on("tts_request")(on_tts_request)

    async def on_unload(self) -> None:
        """Shut the provider down, if there is one, and forget it."""
        active = self._provider
        if active:
            await active.shutdown()
            self._provider = None

        from trixy_core.utils.debug import pinfo as log_info

        log_info("Coqui TTS Plugin: Entladen")


# Plugin export: module-level alias for the plugin class
# (presumably the name the plugin loader looks up; verify against the loader).
Plugin = CoquiTTSPlugin
|