# -*- coding: utf-8 -*-
"""
DeepSpeech STT plugin - speech-to-text with Mozilla DeepSpeech.

Note: Mozilla DeepSpeech is no longer actively developed.
This plugin supports existing DeepSpeech models.

Supports:
- Offline recognition
- Pretrained models
- Streaming
- Automatic download of models
"""
import asyncio
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Any, AsyncIterator

from trixy_core.audio.stt import STTProvider, STTConfig, STTResult, STTState, WordTiming
from trixy_core.plugins import TrixyPlugin
  18. # DeepSpeech Modell-Katalog
  19. DEEPSPEECH_MODELS = {
  20. "de": {
  21. "name": "deepspeech-0.9.3-models-de",
  22. "model_url": "https://github.com/AASHISHAG/deepspeech-german/releases/download/v0.9.0/output_graph.pbmm",
  23. "scorer_url": "https://github.com/AASHISHAG/deepspeech-german/releases/download/v0.9.0/kenlm.scorer",
  24. "model_file": "deepspeech-de.pbmm",
  25. "scorer_file": "deepspeech-de.scorer",
  26. "size": "~1.8 GB",
  27. "description": "Deutsches Modell (Community)",
  28. },
  29. "en": {
  30. "name": "deepspeech-0.9.3-models",
  31. "model_url": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm",
  32. "scorer_url": "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer",
  33. "model_file": "deepspeech-0.9.3-models.pbmm",
  34. "scorer_file": "deepspeech-0.9.3-models.scorer",
  35. "size": "~1.0 GB",
  36. "description": "Offizielles englisches Modell",
  37. },
  38. }
  39. class DeepSpeechSTTProvider(STTProvider):
  40. """DeepSpeech-basierter STT-Provider."""
  41. def __init__(
  42. self,
  43. config: STTConfig | None = None,
  44. models_dir: Path | None = None,
  45. model_name: str = "de",
  46. auto_download: bool = True,
  47. event_manager: Any = None,
  48. ):
  49. super().__init__(config)
  50. self._models_dir = models_dir
  51. self._model_name = model_name
  52. self._auto_download = auto_download
  53. self._event_manager = event_manager
  54. self._model = None
  55. self._model_path: Path | None = None
  56. self._scorer_path: Path | None = None
  57. @property
  58. def name(self) -> str:
  59. return "deepspeech"
  60. @property
  61. def supported_languages(self) -> list[str]:
  62. return ["de-DE", "en-US", "en-GB"]
  63. @property
  64. def supports_streaming(self) -> bool:
  65. return True
  66. @property
  67. def supports_word_timings(self) -> bool:
  68. return True
  69. async def initialize(self) -> None:
  70. """Lädt das DeepSpeech-Modell."""
  71. try:
  72. from deepspeech import Model
  73. from trixy_core.utils.debug import pinfo, pwarn
  74. pwarn("DeepSpeech wird nicht mehr aktiv entwickelt. "
  75. "Erwäge Whisper oder Vosk als Alternative.")
  76. self._state = STTState.PROCESSING
  77. # Modell-Info
  78. if self._model_name not in DEEPSPEECH_MODELS:
  79. raise RuntimeError(
  80. f"Unbekanntes Modell: {self._model_name}. "
  81. f"Verfügbar: {', '.join(DEEPSPEECH_MODELS.keys())}"
  82. )
  83. model_info = DEEPSPEECH_MODELS[self._model_name]
  84. self._model_path = self._models_dir / model_info["model_file"]
  85. self._scorer_path = self._models_dir / model_info["scorer_file"]
  86. # Auto-Download wenn nicht vorhanden
  87. if not self._model_path.exists():
  88. if self._auto_download:
  89. await self._download_model()
  90. else:
  91. raise RuntimeError(
  92. f"DeepSpeech-Modell nicht gefunden: {self._model_path}. "
  93. "Auto-Download ist deaktiviert."
  94. )
  95. # Modell laden (in Thread)
  96. loop = asyncio.get_event_loop()
  97. def load_model():
  98. model = Model(str(self._model_path))
  99. if self._scorer_path.exists():
  100. model.enableExternalScorer(str(self._scorer_path))
  101. return model
  102. pinfo(f"Lade DeepSpeech-Modell: {self._model_name}...")
  103. self._model = await loop.run_in_executor(None, load_model)
  104. self._model_loaded = True
  105. self._state = STTState.READY
  106. pinfo(f"DeepSpeech-Modell geladen: {self._model_name}")
  107. except ImportError:
  108. raise RuntimeError(
  109. "DeepSpeech nicht installiert. "
  110. "Installieren mit: pip install deepspeech"
  111. )
  112. except Exception as e:
  113. self._state = STTState.ERROR
  114. raise RuntimeError(f"Fehler beim Laden des DeepSpeech-Modells: {e}")
  115. async def _download_model(self) -> None:
  116. """Lädt das Modell herunter."""
  117. from trixy_core.utils.debug import pinfo
  118. from trixy_core.utils.download import download_file
  119. model_info = DEEPSPEECH_MODELS[self._model_name]
  120. self._models_dir.mkdir(parents=True, exist_ok=True)
  121. pinfo(f"Lade DeepSpeech-Modell: {self._model_name} ({model_info['size']})...")
  122. # Progressbar wird automatisch von DownloadProgress angezeigt
  123. success = await download_file(
  124. url=model_info["model_url"],
  125. dest_path=self._model_path,
  126. event_manager=self._event_manager,
  127. download_id=f"deepspeech-{self._model_name}-model",
  128. )
  129. if not success:
  130. raise RuntimeError(f"Download des Modells fehlgeschlagen")
  131. # Scorer herunterladen (optional)
  132. pinfo("Lade Scorer...")
  133. await download_file(
  134. url=model_info["scorer_url"],
  135. dest_path=self._scorer_path,
  136. event_manager=self._event_manager,
  137. download_id=f"deepspeech-{self._model_name}-scorer",
  138. )
  139. pinfo(f"DeepSpeech-Modell heruntergeladen: {self._model_name}")
  140. async def shutdown(self) -> None:
  141. """Gibt Ressourcen frei."""
  142. self._model = None
  143. self._model_loaded = False
  144. self._state = STTState.UNINITIALIZED
  145. async def transcribe(
  146. self,
  147. audio_data: bytes,
  148. language: str | None = None,
  149. ) -> STTResult:
  150. """Transkribiert Audio mit DeepSpeech."""
  151. if not self._model:
  152. raise RuntimeError("DeepSpeech-Modell nicht geladen")
  153. self._state = STTState.PROCESSING
  154. start_time = time.time()
  155. try:
  156. import numpy as np
  157. # Audio-Daten zu numpy array (16-bit signed int)
  158. audio_np = np.frombuffer(audio_data, dtype=np.int16)
  159. # Transkription in Thread
  160. loop = asyncio.get_event_loop()
  161. def do_transcribe():
  162. return self._model.sttWithMetadata(audio_np, 1)
  163. metadata = await loop.run_in_executor(None, do_transcribe)
  164. processing_time = (time.time() - start_time) * 1000
  165. # Ergebnis extrahieren
  166. text = ""
  167. word_timings = []
  168. confidence = 0.0
  169. if metadata.transcripts:
  170. transcript = metadata.transcripts[0]
  171. confidence = transcript.confidence
  172. # Text und Timings aus Tokens
  173. current_word = ""
  174. word_start = 0.0
  175. for token in transcript.tokens:
  176. if token.text == " ":
  177. if current_word:
  178. word_timings.append(WordTiming(
  179. word=current_word,
  180. start_time=word_start,
  181. end_time=token.start_time,
  182. confidence=1.0,
  183. ))
  184. current_word = ""
  185. else:
  186. if not current_word:
  187. word_start = token.start_time
  188. current_word += token.text
  189. # Letztes Wort
  190. if current_word:
  191. word_timings.append(WordTiming(
  192. word=current_word,
  193. start_time=word_start,
  194. end_time=token.start_time + 0.1,
  195. confidence=1.0,
  196. ))
  197. text = " ".join(w.word for w in word_timings)
  198. self._state = STTState.READY
  199. return STTResult(
  200. text=text.strip(),
  201. confidence=confidence,
  202. language=language or self._config.language,
  203. duration_seconds=self.get_audio_duration(audio_data),
  204. processing_time_ms=processing_time,
  205. word_timings=word_timings,
  206. provider="deepspeech",
  207. model=f"deepspeech-{self._model_name}",
  208. raw_result=metadata,
  209. )
  210. except Exception as e:
  211. self._state = STTState.ERROR
  212. raise RuntimeError(f"DeepSpeech-Transkription fehlgeschlagen: {e}")
  213. async def transcribe_stream(
  214. self,
  215. audio_stream: AsyncIterator[bytes],
  216. language: str | None = None,
  217. ) -> AsyncIterator[STTResult]:
  218. """Transkribiert Audio-Stream mit DeepSpeech."""
  219. if not self._model:
  220. raise RuntimeError("DeepSpeech-Modell nicht geladen")
  221. import numpy as np
  222. # Stream-Kontext erstellen
  223. stream = self._model.createStream()
  224. start_time = time.time()
  225. all_audio = b""
  226. async for chunk in audio_stream:
  227. all_audio += chunk
  228. # Chunk verarbeiten
  229. audio_np = np.frombuffer(chunk, dtype=np.int16)
  230. stream.feedAudioContent(audio_np)
  231. # Intermediate result
  232. text = stream.intermediateDecode()
  233. if text:
  234. yield STTResult(
  235. text=text.strip(),
  236. confidence=0.5, # Intermediate hat keine Confidence
  237. language=language or self._config.language,
  238. is_final=False,
  239. provider="deepspeech",
  240. )
  241. # Final result
  242. metadata = stream.finishStreamWithMetadata(1)
  243. processing_time = (time.time() - start_time) * 1000
  244. text = ""
  245. confidence = 0.0
  246. word_timings = []
  247. if metadata.transcripts:
  248. transcript = metadata.transcripts[0]
  249. confidence = transcript.confidence
  250. current_word = ""
  251. word_start = 0.0
  252. for token in transcript.tokens:
  253. if token.text == " ":
  254. if current_word:
  255. word_timings.append(WordTiming(
  256. word=current_word,
  257. start_time=word_start,
  258. end_time=token.start_time,
  259. confidence=1.0,
  260. ))
  261. current_word = ""
  262. else:
  263. if not current_word:
  264. word_start = token.start_time
  265. current_word += token.text
  266. if current_word:
  267. word_timings.append(WordTiming(
  268. word=current_word,
  269. start_time=word_start,
  270. end_time=token.start_time + 0.1,
  271. confidence=1.0,
  272. ))
  273. text = " ".join(w.word for w in word_timings)
  274. yield STTResult(
  275. text=text.strip(),
  276. confidence=confidence,
  277. language=language or self._config.language,
  278. duration_seconds=self.get_audio_duration(all_audio),
  279. processing_time_ms=processing_time,
  280. word_timings=word_timings,
  281. is_final=True,
  282. provider="deepspeech",
  283. raw_result=metadata,
  284. )
  285. class DeepSpeechSTTPlugin(TrixyPlugin):
  286. """DeepSpeech STT Plugin für Trixy."""
  287. def __init__(self, application, plugin_path, config: dict | None = None):
  288. super().__init__(application, plugin_path, config)
  289. self._provider: DeepSpeechSTTProvider | None = None
  290. async def on_load(self) -> None:
  291. """Plugin wird geladen."""
  292. from trixy_core.utils.debug import pinfo, pwarn
  293. pinfo("DeepSpeech STT Plugin: Lade...")
  294. # Konfiguration
  295. model_name = self.config.get("model", "de")
  296. language = self.config.get("language", "de-DE")
  297. auto_download = self.config.get("auto_download", True)
  298. # Models-Verzeichnis im Plugin-Ordner
  299. models_dir = self.plugin_path / "models"
  300. models_dir.mkdir(parents=True, exist_ok=True)
  301. stt_config = STTConfig(
  302. language=language,
  303. )
  304. # Provider erstellen
  305. # Provider erstellen (mit EventManager für Download-Events)
  306. self._provider = DeepSpeechSTTProvider(
  307. stt_config,
  308. models_dir=models_dir,
  309. model_name=model_name,
  310. auto_download=auto_download,
  311. event_manager=self.application.events,
  312. )
  313. # Modell laden (und ggf. downloaden)
  314. await self._provider.initialize()
  315. # Extension registrieren
  316. if hasattr(self.application, "extension_points"):
  317. ext_point = self.application.extension_points.get("conversation.stt")
  318. if ext_point:
  319. ext_point.register(self._provider)
  320. pinfo("DeepSpeech STT Plugin: Extension registriert")
  321. # Event-Handler registrieren
  322. self._register_event_handlers()
  323. pinfo(f"DeepSpeech STT Plugin: Geladen (Modell: {model_name})")
  324. def _register_event_handlers(self) -> None:
  325. """Registriert Event-Handler."""
  326. em = self.application.events
  327. @em.on("raw_audio_received")
  328. async def on_audio_received(event_name: str, data: dict) -> None:
  329. """Verarbeitet empfangene Audio-Daten."""
  330. if not self._provider or not self._provider.is_ready:
  331. return
  332. audio_data = data.get("audio_data")
  333. satellite_id = data.get("satellite_id")
  334. session_id = data.get("session_id")
  335. if not audio_data:
  336. return
  337. if isinstance(audio_data, str):
  338. audio_data = bytes.fromhex(audio_data)
  339. from trixy_core.utils.debug import pinfo
  340. pinfo(f"DeepSpeech STT: Verarbeite Audio ({len(audio_data)} bytes)")
  341. try:
  342. result = await self._provider.transcribe(audio_data)
  343. await em.emit("stt_completed", {
  344. "text": result.text,
  345. "confidence": result.confidence,
  346. "language": result.language,
  347. "provider": "deepspeech",
  348. "satellite_id": satellite_id,
  349. "session_id": session_id,
  350. "duration_seconds": result.duration_seconds,
  351. "processing_time_ms": result.processing_time_ms,
  352. "word_timings": [w.__dict__ for w in result.word_timings],
  353. })
  354. pinfo(f"DeepSpeech STT: '{result.text}' ({result.processing_time_ms:.0f}ms)")
  355. # Event-Brücke: speech_recognized für NLP-Pipeline
  356. if result.text.strip():
  357. from trixy_core.events.event_data.basic import SpeechRecognized
  358. speech_event = SpeechRecognized(
  359. satellite_id=satellite_id or "",
  360. text=result.text,
  361. confidence=result.confidence,
  362. language=result.language or "de",
  363. is_final=True,
  364. source="stt",
  365. )
  366. speech_event.metadata["session_id"] = session_id or ""
  367. await em.trigger("speech_recognized", speech_event)
  368. except Exception as e:
  369. from trixy_core.utils.debug import perror
  370. perror(f"DeepSpeech STT Fehler: {e}")
  371. await em.emit("stt_error", {
  372. "error": str(e),
  373. "provider": "deepspeech",
  374. "satellite_id": satellite_id,
  375. "session_id": session_id,
  376. })
  377. async def on_unload(self) -> None:
  378. """Plugin wird entladen."""
  379. if self._provider:
  380. await self._provider.shutdown()
  381. self._provider = None
  382. from trixy_core.utils.debug import pinfo
  383. pinfo("DeepSpeech STT Plugin: Entladen")
  384. @property
  385. def provider(self) -> DeepSpeechSTTProvider | None:
  386. """STT-Provider."""
  387. return self._provider
  388. # Plugin-Export
  389. Plugin = DeepSpeechSTTPlugin