main.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. # -*- coding: utf-8 -*-
  2. """
  3. Coqui TTS Plugin - Text-to-Speech mit Coqui TTS.
  4. Coqui TTS ist ein Open-Source Deep Learning TTS Framework.
  5. Unterstützt:
  6. - Viele vortrainierte Modelle
  7. - Multi-Speaker Modelle
  8. - GPU-Beschleunigung (optional)
  9. - Lokale Synthese (offline)
  10. - Hochwertige neuronale Stimmen
  11. - Automatischer Modell-Download
  12. Modelle:
  13. - Tacotron2: Gute Qualität, schnell
  14. - VITS: Sehr natürlich, End-to-End
  15. - YourTTS: Multi-Speaker, Multi-Lingual
  16. """
  17. import asyncio
  18. import io
  19. import os
  20. import time
  21. import wave
  22. from pathlib import Path
  23. from typing import Any, AsyncIterator
  24. from trixy_core.plugins import TrixyPlugin
  25. from trixy_core.audio.tts import TTSProvider, TTSConfig, TTSResult, TTSState, Voice
  26. # Coqui TTS Modell-Katalog
  27. COQUI_MODELS = {
  28. "tts_models/de/thorsten/tacotron2-DDC": {
  29. "name": "Thorsten Tacotron2",
  30. "language": "de-DE",
  31. "gender": "male",
  32. "description": "Deutsche männliche Stimme (Tacotron2)",
  33. "sample_rate": 22050,
  34. },
  35. "tts_models/de/thorsten/vits": {
  36. "name": "Thorsten VITS",
  37. "language": "de-DE",
  38. "gender": "male",
  39. "description": "Deutsche männliche Stimme (VITS, sehr natürlich)",
  40. "sample_rate": 22050,
  41. },
  42. "tts_models/de/thorsten/tacotron2-DCA": {
  43. "name": "Thorsten Tacotron2-DCA",
  44. "language": "de-DE",
  45. "gender": "male",
  46. "description": "Deutsche männliche Stimme (Tacotron2 DCA)",
  47. "sample_rate": 22050,
  48. },
  49. "tts_models/en/ljspeech/tacotron2-DDC": {
  50. "name": "LJSpeech Tacotron2",
  51. "language": "en-US",
  52. "gender": "female",
  53. "description": "Englische weibliche Stimme (Tacotron2)",
  54. "sample_rate": 22050,
  55. },
  56. "tts_models/en/ljspeech/vits": {
  57. "name": "LJSpeech VITS",
  58. "language": "en-US",
  59. "gender": "female",
  60. "description": "Englische weibliche Stimme (VITS)",
  61. "sample_rate": 22050,
  62. },
  63. "tts_models/multilingual/multi-dataset/your_tts": {
  64. "name": "YourTTS Multilingual",
  65. "language": "multi",
  66. "gender": "multi",
  67. "description": "Multilinguale Multi-Speaker Stimme",
  68. "sample_rate": 16000,
  69. },
  70. }
  71. class CoquiTTSProvider(TTSProvider):
  72. """Coqui TTS Provider."""
  73. def __init__(
  74. self,
  75. config: TTSConfig | None = None,
  76. models_dir: Path | None = None,
  77. model_name: str = "tts_models/de/thorsten/tacotron2-DDC",
  78. vocoder_name: str | None = None,
  79. speaker: str | None = None,
  80. use_cuda: bool = False,
  81. ):
  82. super().__init__(config)
  83. self._models_dir = models_dir
  84. self._model_name = model_name
  85. self._vocoder_name = vocoder_name
  86. self._speaker = speaker
  87. self._use_cuda = use_cuda
  88. self._tts = None
  89. self._available_voices: dict[str, Voice] = {}
  90. @property
  91. def name(self) -> str:
  92. return "coqui"
  93. @property
  94. def supported_languages(self) -> list[str]:
  95. return [
  96. "de-DE", "en-US", "en-GB", "fr-FR",
  97. "es-ES", "it-IT", "pt-BR", "pl-PL",
  98. "nl-NL", "ru-RU", "ja-JP", "zh-CN",
  99. ]
  100. @property
  101. def supports_streaming(self) -> bool:
  102. return False # Coqui TTS generiert vollständige Audio-Dateien
  103. async def initialize(self) -> None:
  104. """Lädt das Coqui TTS Modell."""
  105. try:
  106. from TTS.api import TTS
  107. from trixy_core.utils.debug import pinfo
  108. self._state = TTSState.SYNTHESIZING
  109. # Models-Verzeichnis für Coqui setzen
  110. if self._models_dir:
  111. self._models_dir.mkdir(parents=True, exist_ok=True)
  112. # Coqui TTS nutzt HOME/.local/share/tts
  113. os.environ["TTS_HOME"] = str(self._models_dir)
  114. model_info = COQUI_MODELS.get(self._model_name, {})
  115. pinfo(f"Lade Coqui TTS Modell: {self._model_name}...")
  116. pinfo("Bei erstem Start wird das Modell heruntergeladen...")
  117. # TTS laden (in Thread wegen Modell-Download)
  118. loop = asyncio.get_event_loop()
  119. def load_tts():
  120. return TTS(
  121. model_name=self._model_name,
  122. vocoder_path=self._vocoder_name,
  123. progress_bar=True,
  124. gpu=self._use_cuda,
  125. )
  126. self._tts = await loop.run_in_executor(None, load_tts)
  127. # Verfügbare Stimmen laden
  128. await self._load_voices()
  129. self._model_loaded = True
  130. self._state = TTSState.READY
  131. pinfo(f"Coqui TTS Modell geladen: {self._model_name}")
  132. except ImportError:
  133. raise RuntimeError(
  134. "Coqui TTS nicht installiert. "
  135. "Installieren mit: pip install TTS"
  136. )
  137. except Exception as e:
  138. self._state = TTSState.ERROR
  139. raise RuntimeError(f"Fehler beim Laden des Coqui TTS Modells: {e}")
  140. async def _load_voices(self) -> None:
  141. """Lädt verfügbare Stimmen."""
  142. # Bekannte Modelle als Voices hinzufügen
  143. for model_id, info in COQUI_MODELS.items():
  144. self._available_voices[model_id] = Voice(
  145. id=model_id,
  146. name=info["name"],
  147. language=info["language"],
  148. gender=info["gender"],
  149. description=info["description"],
  150. sample_rate=info["sample_rate"],
  151. )
  152. # Multi-Speaker Stimmen
  153. if self._tts and hasattr(self._tts, "speakers") and self._tts.speakers:
  154. for speaker in self._tts.speakers:
  155. voice_id = f"{self._model_name}:{speaker}"
  156. self._available_voices[voice_id] = Voice(
  157. id=voice_id,
  158. name=f"{speaker}",
  159. language=self._config.language if self._config else "de-DE",
  160. gender="unknown",
  161. description=f"Speaker: {speaker}",
  162. sample_rate=22050,
  163. )
  164. # Aktuelle Stimme setzen
  165. if self._model_name in self._available_voices:
  166. self._current_voice = self._available_voices[self._model_name]
  167. async def shutdown(self) -> None:
  168. """Gibt Ressourcen frei."""
  169. self._tts = None
  170. self._model_loaded = False
  171. self._available_voices.clear()
  172. self._state = TTSState.UNINITIALIZED
  173. async def get_voices(self, language: str | None = None) -> list[Voice]:
  174. """Gibt verfügbare Stimmen zurück."""
  175. voices = list(self._available_voices.values())
  176. if language:
  177. lang_prefix = language.split("-")[0]
  178. voices = [
  179. v for v in voices
  180. if v.language == "multi" or v.language.startswith(lang_prefix)
  181. ]
  182. return voices
  183. async def synthesize(
  184. self,
  185. text: str,
  186. voice_id: str | None = None,
  187. language: str | None = None,
  188. ) -> TTSResult:
  189. """Synthetisiert Text mit Coqui TTS."""
  190. if not self._tts:
  191. raise RuntimeError("Coqui TTS nicht geladen")
  192. self._state = TTSState.SYNTHESIZING
  193. start_time = time.time()
  194. try:
  195. # Speaker aus voice_id extrahieren
  196. speaker = self._speaker
  197. if voice_id and ":" in voice_id:
  198. _, speaker = voice_id.split(":", 1)
  199. # Synthese in Thread
  200. loop = asyncio.get_event_loop()
  201. def do_synthesize():
  202. # Coqui TTS gibt numpy array zurück
  203. wav = self._tts.tts(
  204. text=text,
  205. speaker=speaker,
  206. language=language,
  207. )
  208. return wav
  209. wav_data = await loop.run_in_executor(None, do_synthesize)
  210. processing_time = (time.time() - start_time) * 1000
  211. # Numpy array zu WAV bytes konvertieren
  212. import numpy as np
  213. # Normalisieren und zu int16 konvertieren
  214. if isinstance(wav_data, np.ndarray):
  215. wav_normalized = np.clip(wav_data, -1.0, 1.0)
  216. wav_int16 = (wav_normalized * 32767).astype(np.int16)
  217. else:
  218. wav_int16 = np.array(wav_data, dtype=np.int16)
  219. # WAV-Datei erstellen
  220. sample_rate = self._tts.synthesizer.output_sample_rate if hasattr(self._tts, 'synthesizer') else 22050
  221. audio_buffer = io.BytesIO()
  222. with wave.open(audio_buffer, "wb") as wav_file:
  223. wav_file.setnchannels(1)
  224. wav_file.setsampwidth(2) # 16-bit
  225. wav_file.setframerate(sample_rate)
  226. wav_file.writeframes(wav_int16.tobytes())
  227. audio_data = audio_buffer.getvalue()
  228. self._state = TTSState.READY
  229. # Dauer berechnen
  230. duration = len(wav_int16) / sample_rate
  231. return TTSResult(
  232. audio_data=audio_data,
  233. sample_rate=sample_rate,
  234. channels=1,
  235. text=text,
  236. duration_seconds=duration,
  237. processing_time_ms=processing_time,
  238. voice_id=voice_id or self._model_name,
  239. voice_name=self._current_voice.name if self._current_voice else "",
  240. language=language or self._config.language,
  241. provider="coqui",
  242. model=self._model_name,
  243. )
  244. except Exception as e:
  245. self._state = TTSState.ERROR
  246. raise RuntimeError(f"Coqui TTS Synthese fehlgeschlagen: {e}")
  247. async def synthesize_stream(
  248. self,
  249. text: str,
  250. voice_id: str | None = None,
  251. language: str | None = None,
  252. chunk_size: int = 4096,
  253. ) -> AsyncIterator[bytes]:
  254. """Synthetisiert Text als Stream (simuliert)."""
  255. # Coqui TTS unterstützt kein echtes Streaming
  256. result = await self.synthesize(text, voice_id, language)
  257. for i in range(0, len(result.audio_data), chunk_size):
  258. yield result.audio_data[i:i + chunk_size]
  259. class CoquiTTSPlugin(TrixyPlugin):
  260. """Coqui TTS Plugin für Trixy."""
  261. def __init__(self, application, plugin_path, config: dict | None = None):
  262. super().__init__(application, plugin_path, config)
  263. self._provider: CoquiTTSProvider | None = None
  264. async def on_load(self) -> None:
  265. """Plugin wird geladen."""
  266. from trixy_core.utils.debug import pinfo
  267. pinfo("Coqui TTS Plugin: Lade...")
  268. # Konfiguration
  269. model_name = self.config.get("model", "tts_models/de/thorsten/tacotron2-DDC")
  270. vocoder_name = self.config.get("vocoder")
  271. speaker = self.config.get("speaker")
  272. language = self.config.get("language", "de-DE")
  273. use_cuda = self.config.get("use_cuda", False)
  274. # Models-Verzeichnis im Plugin-Ordner
  275. models_dir = self.plugin_path / "models"
  276. models_dir.mkdir(parents=True, exist_ok=True)
  277. tts_config = TTSConfig(
  278. language=language,
  279. voice_id=model_name,
  280. )
  281. # Provider erstellen
  282. self._provider = CoquiTTSProvider(
  283. tts_config,
  284. models_dir=models_dir,
  285. model_name=model_name,
  286. vocoder_name=vocoder_name,
  287. speaker=speaker,
  288. use_cuda=use_cuda,
  289. )
  290. # Modell laden (und ggf. downloaden)
  291. await self._provider.initialize()
  292. # Extension registrieren
  293. if hasattr(self.application, "extension_points"):
  294. ext_point = self.application.extension_points.get("conversation.tts")
  295. if ext_point:
  296. ext_point.register(self._provider)
  297. pinfo("Coqui TTS Plugin: Extension registriert")
  298. # Event-Handler registrieren
  299. self._register_event_handlers()
  300. pinfo(f"Coqui TTS Plugin: Geladen (Modell: {model_name})")
  301. def _register_event_handlers(self) -> None:
  302. """Registriert Event-Handler."""
  303. em = self.application.events
  304. @em.on("tts_request")
  305. async def on_tts_request(event_name: str, data: dict) -> None:
  306. """Verarbeitet TTS-Anfragen."""
  307. if not self._provider or not self._provider.is_ready:
  308. return
  309. text = data.get("text")
  310. satellite_id = data.get("satellite_id")
  311. session_id = data.get("session_id")
  312. voice_id = data.get("voice_id")
  313. language = data.get("language")
  314. if not text:
  315. return
  316. from trixy_core.utils.debug import pinfo
  317. pinfo(f"Coqui TTS: Synthetisiere '{text[:50]}...'")
  318. try:
  319. result = await self._provider.synthesize(
  320. text,
  321. voice_id=voice_id,
  322. language=language,
  323. )
  324. await em.emit("tts_completed", {
  325. "audio_data": result.audio_data.hex(),
  326. "sample_rate": result.sample_rate,
  327. "duration_seconds": result.duration_seconds,
  328. "provider": "coqui",
  329. "text": text,
  330. "satellite_id": satellite_id,
  331. "session_id": session_id,
  332. "processing_time_ms": result.processing_time_ms,
  333. })
  334. pinfo(f"Coqui TTS: Fertig ({result.duration_seconds:.1f}s Audio, "
  335. f"{result.processing_time_ms:.0f}ms)")
  336. except Exception as e:
  337. from trixy_core.utils.debug import perror
  338. perror(f"Coqui TTS Fehler: {e}")
  339. await em.emit("tts_error", {
  340. "error": str(e),
  341. "provider": "coqui",
  342. "text": text,
  343. "satellite_id": satellite_id,
  344. "session_id": session_id,
  345. })
  346. async def on_unload(self) -> None:
  347. """Plugin wird entladen."""
  348. if self._provider:
  349. await self._provider.shutdown()
  350. self._provider = None
  351. from trixy_core.utils.debug import pinfo
  352. pinfo("Coqui TTS Plugin: Entladen")
  353. @property
  354. def provider(self) -> CoquiTTSProvider | None:
  355. """TTS-Provider."""
  356. return self._provider
# Plugin export: module-level alias for the plugin class
# (presumably the name the plugin loader looks up - confirm against the loader).
Plugin = CoquiTTSPlugin