فهرست منبع

Joint Intent + BIO Slot-Tagger (Token-Level)

Training:
- BIO-Tagger als separate Phase nach Intent-Training
- Token-Level Embeddings via SentenceTransformer auto_model
- Slot-Samples + Negative fuer balanciertes BIO-Training
- Joint Training: Intent-Kopf + BIO-Kopf
- ONNX Export: slot_tagger.onnx (6KB)
- bio_labels.json mit Tag-Index-Mapping

Runtime:
- BIO-Tagger automatisch geladen wenn vorhanden
- Token-Embeddings (vor Mean-Pool) fuer BIO-Decoding genutzt
- Slot-Extraktion: BIO-Tagger → Fallback regelbasiert
- decode_bio_tags(): Subword-Zusammenfuehrung + Multi-Value

Getestet: "wetter morgen in berlin" → {city: "berlin"} ✅
Noch offen: query, content, name Slots brauchen mehr Training-Coverage

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
patrick 1 هفته پیش
والد
کامیت
d55a6f5322
3 فایل‌های تغییر یافته به همراه 324 افزوده شده و 5 حذف شده
  1. 78 3
      trixy_core/nlp/intent_classifier.py
  2. 7 1
      trixy_core/trainer/core/intent/data_generator.py
  3. 239 1
      trixy_core/trainer/core/intent/trainer.py

+ 78 - 3
trixy_core/nlp/intent_classifier.py

@@ -90,8 +90,11 @@ class IntentClassifier:
         self._encoder = None           # SentenceTransformer oder ONNX-Encoder
         self._tokenizer = None         # HuggingFace Tokenizer (fuer ONNX-Encoder)
         self._classifier = None        # ONNX InferenceSession oder PyTorch
+        self._slot_tagger = None       # ONNX InferenceSession fuer BIO-Tagger
+        self._bio_tagset = None        # BIOTagSet fuer Tag-Decoding
         self._use_onnx = False
-        self._use_onnx_encoder = False  # True wenn ONNX-Encoder statt SentenceTransformer
+        self._use_onnx_encoder = False
+        self._has_bio_tagger = False
         self._intent_names: list[str] = []
         self._slot_names: list[str] = []
         self._embedding_dim: int = 0
@@ -222,10 +225,28 @@ class IntentClassifier:
             perror(f"IntentClassifier: Kein Modell gefunden in {model_dir}")
             return False
 
+        # BIO Slot-Tagger laden (optional)
+        bio_onnx = model_dir / "slot_tagger.onnx"
+        bio_labels = model_dir / "bio_labels.json"
+        if bio_onnx.is_file() and bio_labels.is_file():
+            try:
+                import onnxruntime as ort
+                self._slot_tagger = ort.InferenceSession(
+                    str(bio_onnx), providers=["CPUExecutionProvider"],
+                )
+                with open(bio_labels) as f:
+                    from trixy_core.trainer.core.intent.bio_tagger import BIOTagSet
+                    self._bio_tagset = BIOTagSet.from_dict(json.load(f))
+                self._has_bio_tagger = True
+                pinfo(f"IntentClassifier: BIO Slot-Tagger geladen ({self._bio_tagset.num_tags} Tags)")
+            except Exception as e:
+                pdebug(f"BIO Slot-Tagger nicht geladen: {e}")
+
         self._loaded = True
         pinfo(
             f"IntentClassifier geladen: {len(self._intent_names)} Intents"
             f"{', ONNX-Encoder' if self._use_onnx_encoder else ', SentenceTransformer'}"
+            f"{', BIO-Slots' if self._has_bio_tagger else ''}"
         )
         return True
 
@@ -304,8 +325,15 @@ class IntentClassifier:
             if self._intent_names[idx] != "__negative__"
         ]
 
-        # 4. Slot-Extraktion (regelbasiert auf Basis der Slot-Listen)
-        slots = self._extract_slots(text) if best_intent != "unknown" else {}
+        # 4. Slot-Extraktion
+        slots: dict[str, Any] = {}
+        if best_intent != "unknown":
+            # BIO-Tagger hat Vorrang (neuronale Slot-Extraktion)
+            if self._has_bio_tagger and self._use_onnx_encoder:
+                slots = self._extract_slots_bio(token_embeddings, inputs)
+            # Fallback: Regelbasiert
+            if not slots:
+                slots = self._extract_slots(text)
 
         # 5. Tone-Analyse (Schlagwort + Satzstruktur)
         from trixy_core.nlp.tone import analyze_tone
@@ -323,6 +351,53 @@ class IntentClassifier:
             alternatives=alternatives,
         )
 
+    def _extract_slots_bio(
+        self, token_embeddings: "np.ndarray", inputs: dict,
+    ) -> dict[str, Any]:
+        """
+        Extrahiert Slots via BIO-Tagger auf Token-Embeddings.
+
+        Args:
+            token_embeddings: (seq_len, 384) vom Encoder
+            inputs: Tokenizer-Output mit input_ids
+
+        Returns:
+            Dict mit extrahierten Slots
+        """
+        import numpy as np
+
+        if not self._slot_tagger or not self._bio_tagset:
+            return {}
+
+        try:
+            # BIO-Logits berechnen: (1, seq_len, num_tags)
+            token_emb_batch = token_embeddings.reshape(1, *token_embeddings.shape).astype(np.float32)
+            bio_outputs = self._slot_tagger.run(
+                None, {"token_embeddings": token_emb_batch},
+            )
+            bio_logits = bio_outputs[0][0]  # (seq_len, num_tags)
+
+            # Argmax → Tag-Indizes
+            tag_indices = bio_logits.argmax(axis=1).tolist()  # (seq_len,)
+
+            # Token-IDs fuer Decoding
+            token_ids = inputs["input_ids"][0].tolist() if hasattr(inputs["input_ids"], "tolist") else list(inputs["input_ids"][0])
+
+            # BIO-Tags decodieren → Slots
+            from trixy_core.trainer.core.intent.bio_tagger import decode_bio_tags
+            slots = decode_bio_tags(
+                tag_indices, token_ids, self._bio_tagset, self._tokenizer,
+            )
+
+            if slots:
+                pdebug(f"IntentClassifier BIO-Slots: {slots}")
+
+            return slots
+
+        except Exception as e:
+            pdebug(f"BIO Slot-Extraktion Fehler: {e}")
+            return {}
+
     def _extract_slots(self, text: str) -> dict[str, str | list[str]]:
         """
         Extrahiert Slots aus dem Text via Pattern-Matching.

+ 7 - 1
trixy_core/trainer/core/intent/data_generator.py

@@ -299,7 +299,13 @@ class DataGenerator:
 
             all_samples.extend(samples)
 
-        # 6. Negativ-Beispiele
+        # 6. BIO-Tags generieren fuer Samples mit Slots
+        from trixy_core.trainer.core.intent.bio_tagger import generate_word_bio_tags
+        for sample in all_samples:
+            if sample.slots:
+                sample.bio_tags = generate_word_bio_tags(sample.text, sample.slots)
+
+        # 7. Negativ-Beispiele
         if include_negative:
             neg_count = max(len(all_samples) // 4, 50)
             negatives = self._generate_negatives(neg_count, rng)

+ 239 - 1
trixy_core/trainer/core/intent/trainer.py

@@ -528,7 +528,22 @@ class IntentTrainer(ITrainer):
         with open(model_dir / "intent_labels.json", "w") as f:
             json.dump(intent_names, f, ensure_ascii=False)
 
-        # === 10. ONNX Export ===
+        # === 10. BIO Slot-Tagger trainieren ===
+        if check_stop():
+            return
+
+        report(0.88, "Trainiere BIO Slot-Tagger...")
+        try:
+            self._train_bio_tagger(
+                samples, encoder, model_dir, epochs,
+                batch_size, report, check_stop, wait_if_paused,
+            )
+        except Exception as e:
+            perror(f"BIO-Tagger Training fehlgeschlagen: {e}")
+            import traceback
+            pdebug(traceback.format_exc())
+
+        # === 11. ONNX Export ===
         if self._settings.get("export_onnx", True):
             report(0.92, "Exportiere Classifier nach ONNX...")
             try:
@@ -573,6 +588,229 @@ class IntentTrainer(ITrainer):
             f"CV-Mean={np.mean([f['accuracy'] for f in fold_results]):.3f}"
         )
 
+    # === BIO Slot-Tagger ===
+
+    def _train_bio_tagger(
+        self,
+        samples: list,
+        encoder: Any,
+        model_dir: Path,
+        epochs: int,
+        batch_size: int,
+        report: Any,
+        check_stop: Any,
+        wait_if_paused: Any,
+    ) -> None:
+        """
+        Trainiert den BIO-Slot-Tagger auf Token-Level Embeddings.
+
+        Verwendet den gleichen Encoder wie der Intent-Classifier.
+        Nur Samples mit Slots werden fuer das BIO-Training genutzt.
+        """
+        import torch
+        import numpy as np
+        from trixy_core.trainer.core.intent.bio_tagger import (
+            BIOTagSet, generate_word_bio_tags, align_bio_to_tokens,
+        )
+
+        MAX_SEQ_LEN = 64
+
+        # Tokenizer holen
+        tokenizer = encoder.tokenizer
+
+        # Samples mit Slots filtern
+        slot_samples = [s for s in samples if s.slots and s.intent != "__negative__"]
+        if not slot_samples:
+            pinfo("BIO-Tagger: Keine Samples mit Slots — uebersprungen")
+            return
+
+        # Alle Slot-Namen sammeln
+        all_slot_names: set[str] = set()
+        for s in slot_samples:
+            all_slot_names.update(s.slots.keys())
+
+        # Auch Samples ohne Slots als Negative nehmen (lernt "O" Tags)
+        no_slot_samples = [s for s in samples if not s.slots and s.intent != "__negative__"]
+        import random
+        rng = random.Random(42)
+        neg_count = min(len(no_slot_samples), len(slot_samples))
+        if no_slot_samples:
+            neg_bio_samples = rng.sample(no_slot_samples, neg_count)
+        else:
+            neg_bio_samples = []
+
+        bio_samples = slot_samples + neg_bio_samples
+        rng.shuffle(bio_samples)
+
+        pinfo(
+            f"BIO-Tagger: {len(slot_samples)} Slot-Samples, "
+            f"{len(neg_bio_samples)} Neg-Samples, "
+            f"{len(all_slot_names)} Slot-Typen: {sorted(all_slot_names)}"
+        )
+
+        # BIO-TagSet erstellen
+        tagset = BIOTagSet(slot_names=sorted(all_slot_names))
+        pinfo(f"BIO-Tags: {tagset.num_tags} ({tagset.tags})")
+
+        # Token-Embeddings und BIO-Labels berechnen
+        report(0.89, f"BIO: Berechne Token-Embeddings ({len(bio_samples)} Samples)...")
+
+        # Transformer-Modell aus SentenceTransformer extrahieren
+        transformer = encoder[0]
+        auto_model = transformer.auto_model.cpu()  # Auf CPU (vermeidet CUDA-Mismatch)
+        auto_model.eval()
+
+        all_token_embs: list[np.ndarray] = []
+        all_bio_labels: list[np.ndarray] = []
+
+        for i, sample in enumerate(bio_samples):
+            if check_stop():
+                return
+
+            # 1. Wort-Level BIO-Tags
+            word_tags = generate_word_bio_tags(sample.text, sample.slots)
+
+            # 2. Tokenisieren
+            encoding = tokenizer(
+                sample.text,
+                padding="max_length",
+                truncation=True,
+                max_length=MAX_SEQ_LEN,
+                return_tensors="pt",
+                return_offsets_mapping=True,
+            )
+
+            # 3. Token-Level BIO-Tags (Subword-aligned)
+            token_tags = align_bio_to_tokens(
+                word_tags, sample.text, tokenizer, MAX_SEQ_LEN,
+            )
+
+            # 4. BIO-Label Indizes
+            label_indices = [tagset.tag_to_idx(t) for t in token_tags]
+            all_bio_labels.append(np.array(label_indices, dtype=np.int64))
+
+            # 5. Token-Embeddings via Encoder
+            with torch.no_grad():
+                input_ids = encoding["input_ids"]
+                attention_mask = encoding["attention_mask"]
+                outputs = auto_model(input_ids=input_ids, attention_mask=attention_mask)
+                token_emb = outputs.last_hidden_state[0].cpu().numpy()  # (seq_len, 384)
+
+            all_token_embs.append(token_emb)
+
+            if (i + 1) % 500 == 0:
+                report(0.89 + (i / len(bio_samples)) * 0.03,
+                       f"BIO: Token-Embeddings {i+1}/{len(bio_samples)}")
+
+        # Zu Tensoren
+        X_bio = torch.FloatTensor(np.stack(all_token_embs))   # (N, seq_len, 384)
+        y_bio = torch.LongTensor(np.stack(all_bio_labels))    # (N, seq_len)
+
+        embedding_dim = X_bio.shape[2]
+
+        # BIO-Kopf erstellen
+        bio_head = torch.nn.Sequential(
+            torch.nn.Linear(embedding_dim, 128),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.3),
+            torch.nn.Linear(128, tagset.num_tags),
+        )
+
+        optimizer = torch.optim.Adam(bio_head.parameters(), lr=1e-3)
+        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tagset.pad_idx)
+
+        # Training
+        bio_epochs = min(epochs, 20)
+        report(0.92, f"BIO: Trainiere {bio_epochs} Epochen...")
+
+        for epoch in range(bio_epochs):
+            if check_stop():
+                return
+            wait_if_paused()
+
+            bio_head.train()
+            epoch_loss = 0.0
+
+            indices = torch.randperm(len(X_bio))
+            for i in range(0, len(indices), batch_size):
+                batch_idx = indices[i:i + batch_size]
+                batch_x = X_bio[batch_idx]    # (B, seq_len, 384)
+                batch_y = y_bio[batch_idx]    # (B, seq_len)
+
+                # Forward: (B, seq_len, 384) → (B, seq_len, num_tags)
+                logits = bio_head(batch_x)
+
+                # Loss: Reshape fuer CrossEntropy
+                # (B*seq_len, num_tags) vs (B*seq_len,)
+                loss = loss_fn(
+                    logits.view(-1, tagset.num_tags),
+                    batch_y.view(-1),
+                )
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+                epoch_loss += loss.item()
+
+            avg_loss = epoch_loss / max(1, len(X_bio) // batch_size)
+
+            if (epoch + 1) % 5 == 0 or epoch == 0:
+                # Accuracy (ohne PAD)
+                bio_head.eval()
+                with torch.no_grad():
+                    all_logits = bio_head(X_bio)
+                    preds = all_logits.argmax(dim=2)  # (N, seq_len)
+
+                    mask = y_bio != tagset.pad_idx
+                    correct = ((preds == y_bio) & mask).sum().item()
+                    total = mask.sum().item()
+                    acc = correct / total if total > 0 else 0
+
+                    # Slot-spezifische Accuracy (nicht-O Tags)
+                    slot_mask = mask & (y_bio != tagset.o_idx)
+                    slot_correct = ((preds == y_bio) & slot_mask).sum().item()
+                    slot_total = slot_mask.sum().item()
+                    slot_acc = slot_correct / slot_total if slot_total > 0 else 0
+
+                pinfo(
+                    f"BIO Epoch {epoch+1}/{bio_epochs}: "
+                    f"loss={avg_loss:.4f}, acc={acc:.3f}, slot_acc={slot_acc:.3f}"
+                )
+
+        # Speichern
+        report(0.95, "BIO: Speichere Modell...")
+        torch.save(bio_head.state_dict(), model_dir / "slot_tagger.pth")
+
+        # BIO-Labels speichern
+        with open(model_dir / "bio_labels.json", "w") as f:
+            json.dump(tagset.to_dict(), f, indent=2, ensure_ascii=False)
+
+        # ONNX Export des BIO-Kopfs
+        try:
+            bio_head.eval()
+            dummy = torch.randn(1, MAX_SEQ_LEN, embedding_dim)
+            torch.onnx.export(
+                bio_head,
+                dummy,
+                str(model_dir / "slot_tagger.onnx"),
+                input_names=["token_embeddings"],
+                output_names=["bio_logits"],
+                dynamic_axes={
+                    "token_embeddings": {0: "batch", 1: "seq_len"},
+                    "bio_logits": {0: "batch", 1: "seq_len"},
+                },
+                opset_version=18,
+            )
+            size_kb = (model_dir / "slot_tagger.onnx").stat().st_size / 1024
+            pinfo(f"BIO ONNX: slot_tagger.onnx ({size_kb:.1f} KB)")
+        except Exception as e:
+            perror(f"BIO ONNX Export fehlgeschlagen: {e}")
+
+        pinfo(
+            f"BIO-Tagger Training abgeschlossen: "
+            f"{tagset.num_tags} Tags, {len(bio_samples)} Samples"
+        )
+
     # === Hilfsmethoden ===
 
     def _create_generator(self, slot_registry: SlotRegistry | None = None) -> DataGenerator: