فهرست منبع

Joint Intent + BIO Slot-Tagger (Token-Level)

Training:
- BIO-Tagger als separate Phase nach Intent-Training
- Token-Level Embeddings via SentenceTransformer auto_model
- Slot-Samples + Negative fuer balanciertes BIO-Training
- Joint Training: Intent-Kopf + BIO-Kopf
- ONNX Export: slot_tagger.onnx (6KB)
- bio_labels.json mit Tag-Index-Mapping

Runtime:
- BIO-Tagger automatisch geladen wenn vorhanden
- Token-Embeddings (vor Mean-Pool) fuer BIO-Decoding genutzt
- Slot-Extraktion: BIO-Tagger → Fallback regelbasiert
- decode_bio_tags(): Subword-Zusammenfuehrung + Multi-Value

Getestet: "wetter morgen in berlin" → {city: "berlin"} ✅
Noch offen: query, content, name Slots brauchen mehr Training-Coverage

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
patrick 1 هفته پیش
والد
کامیت
d55a6f5322
3 فایل‌های تغییر یافته به همراه 324 افزوده شده و 5 حذف شده
  1. 78 3
      trixy_core/nlp/intent_classifier.py
  2. 7 1
      trixy_core/trainer/core/intent/data_generator.py
  3. 239 1
      trixy_core/trainer/core/intent/trainer.py

+ 78 - 3
trixy_core/nlp/intent_classifier.py

@@ -90,8 +90,11 @@ class IntentClassifier:
         self._encoder = None           # SentenceTransformer oder ONNX-Encoder
         self._tokenizer = None         # HuggingFace Tokenizer (fuer ONNX-Encoder)
         self._classifier = None        # ONNX InferenceSession oder PyTorch
+        self._slot_tagger = None       # ONNX InferenceSession fuer BIO-Tagger
+        self._bio_tagset = None        # BIOTagSet fuer Tag-Decoding
         self._use_onnx = False
-        self._use_onnx_encoder = False  # True wenn ONNX-Encoder statt SentenceTransformer
+        self._use_onnx_encoder = False
+        self._has_bio_tagger = False
         self._intent_names: list[str] = []
         self._slot_names: list[str] = []
         self._embedding_dim: int = 0
@@ -222,10 +225,28 @@ class IntentClassifier:
             perror(f"IntentClassifier: Kein Modell gefunden in {model_dir}")
             return False
 
+        # BIO Slot-Tagger laden (optional)
+        bio_onnx = model_dir / "slot_tagger.onnx"
+        bio_labels = model_dir / "bio_labels.json"
+        if bio_onnx.is_file() and bio_labels.is_file():
+            try:
+                import onnxruntime as ort
+                self._slot_tagger = ort.InferenceSession(
+                    str(bio_onnx), providers=["CPUExecutionProvider"],
+                )
+                with open(bio_labels) as f:
+                    from trixy_core.trainer.core.intent.bio_tagger import BIOTagSet
+                    self._bio_tagset = BIOTagSet.from_dict(json.load(f))
+                self._has_bio_tagger = True
+                pinfo(f"IntentClassifier: BIO Slot-Tagger geladen ({self._bio_tagset.num_tags} Tags)")
+            except Exception as e:
+                pdebug(f"BIO Slot-Tagger nicht geladen: {e}")
+
         self._loaded = True
         pinfo(
             f"IntentClassifier geladen: {len(self._intent_names)} Intents"
             f"{', ONNX-Encoder' if self._use_onnx_encoder else ', SentenceTransformer'}"
+            f"{', BIO-Slots' if self._has_bio_tagger else ''}"
         )
         return True
 
@@ -304,8 +325,15 @@ class IntentClassifier:
             if self._intent_names[idx] != "__negative__"
         ]
 
-        # 4. Slot-Extraktion (regelbasiert auf Basis der Slot-Listen)
-        slots = self._extract_slots(text) if best_intent != "unknown" else {}
+        # 4. Slot-Extraktion
+        slots: dict[str, Any] = {}
+        if best_intent != "unknown":
+            # BIO-Tagger hat Vorrang (neuronale Slot-Extraktion)
+            if self._has_bio_tagger and self._use_onnx_encoder:
+                slots = self._extract_slots_bio(token_embeddings, inputs)
+            # Fallback: Regelbasiert
+            if not slots:
+                slots = self._extract_slots(text)
 
         # 5. Tone-Analyse (Schlagwort + Satzstruktur)
         from trixy_core.nlp.tone import analyze_tone
@@ -323,6 +351,53 @@ class IntentClassifier:
             alternatives=alternatives,
         )
 
+    def _extract_slots_bio(
+        self, token_embeddings: "np.ndarray", inputs: dict,
+    ) -> dict[str, Any]:
+        """
+        Extrahiert Slots via BIO-Tagger auf Token-Embeddings.
+
+        Args:
+            token_embeddings: (seq_len, 384) vom Encoder
+            inputs: Tokenizer-Output mit input_ids
+
+        Returns:
+            Dict mit extrahierten Slots
+        """
+        import numpy as np
+
+        if not self._slot_tagger or not self._bio_tagset:
+            return {}
+
+        try:
+            # BIO-Logits berechnen: (1, seq_len, num_tags)
+            token_emb_batch = token_embeddings.reshape(1, *token_embeddings.shape).astype(np.float32)
+            bio_outputs = self._slot_tagger.run(
+                None, {"token_embeddings": token_emb_batch},
+            )
+            bio_logits = bio_outputs[0][0]  # (seq_len, num_tags)
+
+            # Argmax → Tag-Indizes
+            tag_indices = bio_logits.argmax(axis=1).tolist()  # (seq_len,)
+
+            # Token-IDs fuer Decoding
+            token_ids = inputs["input_ids"][0].tolist() if hasattr(inputs["input_ids"], "tolist") else list(inputs["input_ids"][0])
+
+            # BIO-Tags decodieren → Slots
+            from trixy_core.trainer.core.intent.bio_tagger import decode_bio_tags
+            slots = decode_bio_tags(
+                tag_indices, token_ids, self._bio_tagset, self._tokenizer,
+            )
+
+            if slots:
+                pdebug(f"IntentClassifier BIO-Slots: {slots}")
+
+            return slots
+
+        except Exception as e:
+            pdebug(f"BIO Slot-Extraktion Fehler: {e}")
+            return {}
+
     def _extract_slots(self, text: str) -> dict[str, str | list[str]]:
         """
         Extrahiert Slots aus dem Text via Pattern-Matching.

+ 7 - 1
trixy_core/trainer/core/intent/data_generator.py

@@ -299,7 +299,13 @@ class DataGenerator:
 
             all_samples.extend(samples)
 
-        # 6. Negativ-Beispiele
+        # 6. BIO-Tags generieren fuer Samples mit Slots
+        from trixy_core.trainer.core.intent.bio_tagger import generate_word_bio_tags
+        for sample in all_samples:
+            if sample.slots:
+                sample.bio_tags = generate_word_bio_tags(sample.text, sample.slots)
+
+        # 7. Negativ-Beispiele
         if include_negative:
             neg_count = max(len(all_samples) // 4, 50)
             negatives = self._generate_negatives(neg_count, rng)

+ 239 - 1
trixy_core/trainer/core/intent/trainer.py

@@ -528,7 +528,22 @@ class IntentTrainer(ITrainer):
         with open(model_dir / "intent_labels.json", "w") as f:
             json.dump(intent_names, f, ensure_ascii=False)
 
-        # === 10. ONNX Export ===
+        # === 10. BIO Slot-Tagger trainieren ===
+        if check_stop():
+            return
+
+        report(0.88, "Trainiere BIO Slot-Tagger...")
+        try:
+            self._train_bio_tagger(
+                samples, encoder, model_dir, epochs,
+                batch_size, report, check_stop, wait_if_paused,
+            )
+        except Exception as e:
+            perror(f"BIO-Tagger Training fehlgeschlagen: {e}")
+            import traceback
+            pdebug(traceback.format_exc())
+
+        # === 11. ONNX Export ===
         if self._settings.get("export_onnx", True):
             report(0.92, "Exportiere Classifier nach ONNX...")
             try:
@@ -573,6 +588,229 @@ class IntentTrainer(ITrainer):
             f"CV-Mean={np.mean([f['accuracy'] for f in fold_results]):.3f}"
         )
 
+    # === BIO Slot-Tagger ===
+
+    def _train_bio_tagger(
+        self,
+        samples: list,
+        encoder: Any,
+        model_dir: Path,
+        epochs: int,
+        batch_size: int,
+        report: Any,
+        check_stop: Any,
+        wait_if_paused: Any,
+    ) -> None:
+        """
+        Trainiert den BIO-Slot-Tagger auf Token-Level Embeddings.
+
+        Verwendet den gleichen Encoder wie der Intent-Classifier.
+        Nur Samples mit Slots werden fuer das BIO-Training genutzt.
+        """
+        import torch
+        import numpy as np
+        from trixy_core.trainer.core.intent.bio_tagger import (
+            BIOTagSet, generate_word_bio_tags, align_bio_to_tokens,
+        )
+
+        MAX_SEQ_LEN = 64
+
+        # Tokenizer holen
+        tokenizer = encoder.tokenizer
+
+        # Samples mit Slots filtern
+        slot_samples = [s for s in samples if s.slots and s.intent != "__negative__"]
+        if not slot_samples:
+            pinfo("BIO-Tagger: Keine Samples mit Slots — uebersprungen")
+            return
+
+        # Alle Slot-Namen sammeln
+        all_slot_names: set[str] = set()
+        for s in slot_samples:
+            all_slot_names.update(s.slots.keys())
+
+        # Auch Samples ohne Slots als Negative nehmen (lernt "O" Tags)
+        no_slot_samples = [s for s in samples if not s.slots and s.intent != "__negative__"]
+        import random
+        rng = random.Random(42)
+        neg_count = min(len(no_slot_samples), len(slot_samples))
+        if no_slot_samples:
+            neg_bio_samples = rng.sample(no_slot_samples, neg_count)
+        else:
+            neg_bio_samples = []
+
+        bio_samples = slot_samples + neg_bio_samples
+        rng.shuffle(bio_samples)
+
+        pinfo(
+            f"BIO-Tagger: {len(slot_samples)} Slot-Samples, "
+            f"{len(neg_bio_samples)} Neg-Samples, "
+            f"{len(all_slot_names)} Slot-Typen: {sorted(all_slot_names)}"
+        )
+
+        # BIO-TagSet erstellen
+        tagset = BIOTagSet(slot_names=sorted(all_slot_names))
+        pinfo(f"BIO-Tags: {tagset.num_tags} ({tagset.tags})")
+
+        # Token-Embeddings und BIO-Labels berechnen
+        report(0.89, f"BIO: Berechne Token-Embeddings ({len(bio_samples)} Samples)...")
+
+        # Transformer-Modell aus SentenceTransformer extrahieren
+        transformer = encoder[0]
+        auto_model = transformer.auto_model.cpu()  # Auf CPU (vermeidet CUDA-Mismatch)
+        auto_model.eval()
+
+        all_token_embs: list[np.ndarray] = []
+        all_bio_labels: list[np.ndarray] = []
+
+        for i, sample in enumerate(bio_samples):
+            if check_stop():
+                return
+
+            # 1. Wort-Level BIO-Tags
+            word_tags = generate_word_bio_tags(sample.text, sample.slots)
+
+            # 2. Tokenisieren
+            encoding = tokenizer(
+                sample.text,
+                padding="max_length",
+                truncation=True,
+                max_length=MAX_SEQ_LEN,
+                return_tensors="pt",
+                return_offsets_mapping=True,
+            )
+
+            # 3. Token-Level BIO-Tags (Subword-aligned)
+            token_tags = align_bio_to_tokens(
+                word_tags, sample.text, tokenizer, MAX_SEQ_LEN,
+            )
+
+            # 4. BIO-Label Indizes
+            label_indices = [tagset.tag_to_idx(t) for t in token_tags]
+            all_bio_labels.append(np.array(label_indices, dtype=np.int64))
+
+            # 5. Token-Embeddings via Encoder
+            with torch.no_grad():
+                input_ids = encoding["input_ids"]
+                attention_mask = encoding["attention_mask"]
+                outputs = auto_model(input_ids=input_ids, attention_mask=attention_mask)
+                token_emb = outputs.last_hidden_state[0].cpu().numpy()  # (seq_len, 384)
+
+            all_token_embs.append(token_emb)
+
+            if (i + 1) % 500 == 0:
+                report(0.89 + (i / len(bio_samples)) * 0.03,
+                       f"BIO: Token-Embeddings {i+1}/{len(bio_samples)}")
+
+        # Zu Tensoren
+        X_bio = torch.FloatTensor(np.stack(all_token_embs))   # (N, seq_len, 384)
+        y_bio = torch.LongTensor(np.stack(all_bio_labels))    # (N, seq_len)
+
+        embedding_dim = X_bio.shape[2]
+
+        # BIO-Kopf erstellen
+        bio_head = torch.nn.Sequential(
+            torch.nn.Linear(embedding_dim, 128),
+            torch.nn.ReLU(),
+            torch.nn.Dropout(0.3),
+            torch.nn.Linear(128, tagset.num_tags),
+        )
+
+        optimizer = torch.optim.Adam(bio_head.parameters(), lr=1e-3)
+        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tagset.pad_idx)
+
+        # Training
+        bio_epochs = min(epochs, 20)
+        report(0.92, f"BIO: Trainiere {bio_epochs} Epochen...")
+
+        for epoch in range(bio_epochs):
+            if check_stop():
+                return
+            wait_if_paused()
+
+            bio_head.train()
+            epoch_loss = 0.0
+
+            indices = torch.randperm(len(X_bio))
+            for i in range(0, len(indices), batch_size):
+                batch_idx = indices[i:i + batch_size]
+                batch_x = X_bio[batch_idx]    # (B, seq_len, 384)
+                batch_y = y_bio[batch_idx]    # (B, seq_len)
+
+                # Forward: (B, seq_len, 384) → (B, seq_len, num_tags)
+                logits = bio_head(batch_x)
+
+                # Loss: Reshape fuer CrossEntropy
+                # (B*seq_len, num_tags) vs (B*seq_len,)
+                loss = loss_fn(
+                    logits.view(-1, tagset.num_tags),
+                    batch_y.view(-1),
+                )
+
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+                epoch_loss += loss.item()
+
+            avg_loss = epoch_loss / max(1, len(X_bio) // batch_size)
+
+            if (epoch + 1) % 5 == 0 or epoch == 0:
+                # Accuracy (ohne PAD)
+                bio_head.eval()
+                with torch.no_grad():
+                    all_logits = bio_head(X_bio)
+                    preds = all_logits.argmax(dim=2)  # (N, seq_len)
+
+                    mask = y_bio != tagset.pad_idx
+                    correct = ((preds == y_bio) & mask).sum().item()
+                    total = mask.sum().item()
+                    acc = correct / total if total > 0 else 0
+
+                    # Slot-spezifische Accuracy (nicht-O Tags)
+                    slot_mask = mask & (y_bio != tagset.o_idx)
+                    slot_correct = ((preds == y_bio) & slot_mask).sum().item()
+                    slot_total = slot_mask.sum().item()
+                    slot_acc = slot_correct / slot_total if slot_total > 0 else 0
+
+                pinfo(
+                    f"BIO Epoch {epoch+1}/{bio_epochs}: "
+                    f"loss={avg_loss:.4f}, acc={acc:.3f}, slot_acc={slot_acc:.3f}"
+                )
+
+        # Speichern
+        report(0.95, "BIO: Speichere Modell...")
+        torch.save(bio_head.state_dict(), model_dir / "slot_tagger.pth")
+
+        # BIO-Labels speichern
+        with open(model_dir / "bio_labels.json", "w") as f:
+            json.dump(tagset.to_dict(), f, indent=2, ensure_ascii=False)
+
+        # ONNX Export des BIO-Kopfs
+        try:
+            bio_head.eval()
+            dummy = torch.randn(1, MAX_SEQ_LEN, embedding_dim)
+            torch.onnx.export(
+                bio_head,
+                dummy,
+                str(model_dir / "slot_tagger.onnx"),
+                input_names=["token_embeddings"],
+                output_names=["bio_logits"],
+                dynamic_axes={
+                    "token_embeddings": {0: "batch", 1: "seq_len"},
+                    "bio_logits": {0: "batch", 1: "seq_len"},
+                },
+                opset_version=18,
+            )
+            size_kb = (model_dir / "slot_tagger.onnx").stat().st_size / 1024
+            pinfo(f"BIO ONNX: slot_tagger.onnx ({size_kb:.1f} KB)")
+        except Exception as e:
+            perror(f"BIO ONNX Export fehlgeschlagen: {e}")
+
+        pinfo(
+            f"BIO-Tagger Training abgeschlossen: "
+            f"{tagset.num_tags} Tags, {len(bio_samples)} Samples"
+        )
+
     # === Hilfsmethoden ===
 
     def _create_generator(self, slot_registry: SlotRegistry | None = None) -> DataGenerator: