|
|
@@ -528,7 +528,22 @@ class IntentTrainer(ITrainer):
|
|
|
with open(model_dir / "intent_labels.json", "w") as f:
|
|
|
json.dump(intent_names, f, ensure_ascii=False)
|
|
|
|
|
|
- # === 10. ONNX Export ===
|
|
|
+ # === 10. BIO Slot-Tagger trainieren ===
|
|
|
+ if check_stop():
|
|
|
+ return
|
|
|
+
|
|
|
+ report(0.88, "Trainiere BIO Slot-Tagger...")
|
|
|
+ try:
|
|
|
+ self._train_bio_tagger(
|
|
|
+ samples, encoder, model_dir, epochs,
|
|
|
+ batch_size, report, check_stop, wait_if_paused,
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ perror(f"BIO-Tagger Training fehlgeschlagen: {e}")
|
|
|
+ import traceback
|
|
|
+ pdebug(traceback.format_exc())
|
|
|
+
|
|
|
+ # === 11. ONNX Export ===
|
|
|
if self._settings.get("export_onnx", True):
|
|
|
report(0.92, "Exportiere Classifier nach ONNX...")
|
|
|
try:
|
|
|
@@ -573,6 +588,229 @@ class IntentTrainer(ITrainer):
|
|
|
f"CV-Mean={np.mean([f['accuracy'] for f in fold_results]):.3f}"
|
|
|
)
|
|
|
|
|
|
+ # === BIO Slot-Tagger ===
|
|
|
+
|
|
|
+ def _train_bio_tagger(
|
|
|
+ self,
|
|
|
+ samples: list,
|
|
|
+ encoder: Any,
|
|
|
+ model_dir: Path,
|
|
|
+ epochs: int,
|
|
|
+ batch_size: int,
|
|
|
+ report: Any,
|
|
|
+ check_stop: Any,
|
|
|
+ wait_if_paused: Any,
|
|
|
+ ) -> None:
|
|
|
+ """
|
|
|
+ Trainiert den BIO-Slot-Tagger auf Token-Level Embeddings.
|
|
|
+
|
|
|
+ Verwendet den gleichen Encoder wie der Intent-Classifier.
|
|
|
+ Nur Samples mit Slots werden fuer das BIO-Training genutzt.
|
|
|
+ """
|
|
|
+ import torch
|
|
|
+ import numpy as np
|
|
|
+ from trixy_core.trainer.core.intent.bio_tagger import (
|
|
|
+ BIOTagSet, generate_word_bio_tags, align_bio_to_tokens,
|
|
|
+ )
|
|
|
+
|
|
|
+ MAX_SEQ_LEN = 64
|
|
|
+
|
|
|
+ # Tokenizer holen
|
|
|
+ tokenizer = encoder.tokenizer
|
|
|
+
|
|
|
+ # Samples mit Slots filtern
|
|
|
+ slot_samples = [s for s in samples if s.slots and s.intent != "__negative__"]
|
|
|
+ if not slot_samples:
|
|
|
+ pinfo("BIO-Tagger: Keine Samples mit Slots — uebersprungen")
|
|
|
+ return
|
|
|
+
|
|
|
+ # Alle Slot-Namen sammeln
|
|
|
+ all_slot_names: set[str] = set()
|
|
|
+ for s in slot_samples:
|
|
|
+ all_slot_names.update(s.slots.keys())
|
|
|
+
|
|
|
+ # Auch Samples ohne Slots als Negative nehmen (lernt "O" Tags)
|
|
|
+ no_slot_samples = [s for s in samples if not s.slots and s.intent != "__negative__"]
|
|
|
+ import random
|
|
|
+ rng = random.Random(42)
|
|
|
+ neg_count = min(len(no_slot_samples), len(slot_samples))
|
|
|
+ if no_slot_samples:
|
|
|
+ neg_bio_samples = rng.sample(no_slot_samples, neg_count)
|
|
|
+ else:
|
|
|
+ neg_bio_samples = []
|
|
|
+
|
|
|
+ bio_samples = slot_samples + neg_bio_samples
|
|
|
+ rng.shuffle(bio_samples)
|
|
|
+
|
|
|
+ pinfo(
|
|
|
+ f"BIO-Tagger: {len(slot_samples)} Slot-Samples, "
|
|
|
+ f"{len(neg_bio_samples)} Neg-Samples, "
|
|
|
+ f"{len(all_slot_names)} Slot-Typen: {sorted(all_slot_names)}"
|
|
|
+ )
|
|
|
+
|
|
|
+ # BIO-TagSet erstellen
|
|
|
+ tagset = BIOTagSet(slot_names=sorted(all_slot_names))
|
|
|
+ pinfo(f"BIO-Tags: {tagset.num_tags} ({tagset.tags})")
|
|
|
+
|
|
|
+ # Token-Embeddings und BIO-Labels berechnen
|
|
|
+ report(0.89, f"BIO: Berechne Token-Embeddings ({len(bio_samples)} Samples)...")
|
|
|
+
|
|
|
+ # Transformer-Modell aus SentenceTransformer extrahieren
|
|
|
+ transformer = encoder[0]
|
|
|
+ auto_model = transformer.auto_model.cpu() # Auf CPU (vermeidet CUDA-Mismatch)
|
|
|
+ auto_model.eval()
|
|
|
+
|
|
|
+ all_token_embs: list[np.ndarray] = []
|
|
|
+ all_bio_labels: list[np.ndarray] = []
|
|
|
+
|
|
|
+ for i, sample in enumerate(bio_samples):
|
|
|
+ if check_stop():
|
|
|
+ return
|
|
|
+
|
|
|
+ # 1. Wort-Level BIO-Tags
|
|
|
+ word_tags = generate_word_bio_tags(sample.text, sample.slots)
|
|
|
+
|
|
|
+ # 2. Tokenisieren
|
|
|
+ encoding = tokenizer(
|
|
|
+ sample.text,
|
|
|
+ padding="max_length",
|
|
|
+ truncation=True,
|
|
|
+ max_length=MAX_SEQ_LEN,
|
|
|
+ return_tensors="pt",
|
|
|
+ return_offsets_mapping=True,
|
|
|
+ )
|
|
|
+
|
|
|
+ # 3. Token-Level BIO-Tags (Subword-aligned)
|
|
|
+ token_tags = align_bio_to_tokens(
|
|
|
+ word_tags, sample.text, tokenizer, MAX_SEQ_LEN,
|
|
|
+ )
|
|
|
+
|
|
|
+ # 4. BIO-Label Indizes
|
|
|
+ label_indices = [tagset.tag_to_idx(t) for t in token_tags]
|
|
|
+ all_bio_labels.append(np.array(label_indices, dtype=np.int64))
|
|
|
+
|
|
|
+ # 5. Token-Embeddings via Encoder
|
|
|
+ with torch.no_grad():
|
|
|
+ input_ids = encoding["input_ids"]
|
|
|
+ attention_mask = encoding["attention_mask"]
|
|
|
+ outputs = auto_model(input_ids=input_ids, attention_mask=attention_mask)
|
|
|
+                token_emb = outputs.last_hidden_state[0].cpu().numpy()  # (seq_len, hidden_dim)
|
|
|
+
|
|
|
+ all_token_embs.append(token_emb)
|
|
|
+
|
|
|
+ if (i + 1) % 500 == 0:
|
|
|
+ report(0.89 + (i / len(bio_samples)) * 0.03,
|
|
|
+ f"BIO: Token-Embeddings {i+1}/{len(bio_samples)}")
|
|
|
+
|
|
|
+ # Zu Tensoren
|
|
|
+        X_bio = torch.FloatTensor(np.stack(all_token_embs))  # (N, seq_len, hidden_dim)
|
|
|
+ y_bio = torch.LongTensor(np.stack(all_bio_labels)) # (N, seq_len)
|
|
|
+
|
|
|
+ embedding_dim = X_bio.shape[2]
|
|
|
+
|
|
|
+ # BIO-Kopf erstellen
|
|
|
+ bio_head = torch.nn.Sequential(
|
|
|
+ torch.nn.Linear(embedding_dim, 128),
|
|
|
+ torch.nn.ReLU(),
|
|
|
+ torch.nn.Dropout(0.3),
|
|
|
+ torch.nn.Linear(128, tagset.num_tags),
|
|
|
+ )
|
|
|
+
|
|
|
+ optimizer = torch.optim.Adam(bio_head.parameters(), lr=1e-3)
|
|
|
+ loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tagset.pad_idx)
|
|
|
+
|
|
|
+ # Training
|
|
|
+ bio_epochs = min(epochs, 20)
|
|
|
+ report(0.92, f"BIO: Trainiere {bio_epochs} Epochen...")
|
|
|
+
|
|
|
+ for epoch in range(bio_epochs):
|
|
|
+ if check_stop():
|
|
|
+ return
|
|
|
+ wait_if_paused()
|
|
|
+
|
|
|
+ bio_head.train()
|
|
|
+ epoch_loss = 0.0
|
|
|
+
|
|
|
+ indices = torch.randperm(len(X_bio))
|
|
|
+ for i in range(0, len(indices), batch_size):
|
|
|
+ batch_idx = indices[i:i + batch_size]
|
|
|
+                batch_x = X_bio[batch_idx]  # (B, seq_len, hidden_dim)
|
|
|
+ batch_y = y_bio[batch_idx] # (B, seq_len)
|
|
|
+
|
|
|
+                # Forward: (B, seq_len, hidden_dim) → (B, seq_len, num_tags)
|
|
|
+ logits = bio_head(batch_x)
|
|
|
+
|
|
|
+ # Loss: Reshape fuer CrossEntropy
|
|
|
+ # (B*seq_len, num_tags) vs (B*seq_len,)
|
|
|
+ loss = loss_fn(
|
|
|
+ logits.view(-1, tagset.num_tags),
|
|
|
+ batch_y.view(-1),
|
|
|
+ )
|
|
|
+
|
|
|
+ optimizer.zero_grad()
|
|
|
+ loss.backward()
|
|
|
+ optimizer.step()
|
|
|
+ epoch_loss += loss.item()
|
|
|
+
|
|
|
+ avg_loss = epoch_loss / max(1, len(X_bio) // batch_size)
|
|
|
+
|
|
|
+ if (epoch + 1) % 5 == 0 or epoch == 0:
|
|
|
+ # Accuracy (ohne PAD)
|
|
|
+ bio_head.eval()
|
|
|
+ with torch.no_grad():
|
|
|
+ all_logits = bio_head(X_bio)
|
|
|
+ preds = all_logits.argmax(dim=2) # (N, seq_len)
|
|
|
+
|
|
|
+ mask = y_bio != tagset.pad_idx
|
|
|
+ correct = ((preds == y_bio) & mask).sum().item()
|
|
|
+ total = mask.sum().item()
|
|
|
+ acc = correct / total if total > 0 else 0
|
|
|
+
|
|
|
+ # Slot-spezifische Accuracy (nicht-O Tags)
|
|
|
+ slot_mask = mask & (y_bio != tagset.o_idx)
|
|
|
+ slot_correct = ((preds == y_bio) & slot_mask).sum().item()
|
|
|
+ slot_total = slot_mask.sum().item()
|
|
|
+ slot_acc = slot_correct / slot_total if slot_total > 0 else 0
|
|
|
+
|
|
|
+ pinfo(
|
|
|
+ f"BIO Epoch {epoch+1}/{bio_epochs}: "
|
|
|
+ f"loss={avg_loss:.4f}, acc={acc:.3f}, slot_acc={slot_acc:.3f}"
|
|
|
+ )
|
|
|
+
|
|
|
+ # Speichern
|
|
|
+ report(0.95, "BIO: Speichere Modell...")
|
|
|
+ torch.save(bio_head.state_dict(), model_dir / "slot_tagger.pth")
|
|
|
+
|
|
|
+ # BIO-Labels speichern
|
|
|
+ with open(model_dir / "bio_labels.json", "w") as f:
|
|
|
+ json.dump(tagset.to_dict(), f, indent=2, ensure_ascii=False)
|
|
|
+
|
|
|
+ # ONNX Export des BIO-Kopfs
|
|
|
+ try:
|
|
|
+ bio_head.eval()
|
|
|
+ dummy = torch.randn(1, MAX_SEQ_LEN, embedding_dim)
|
|
|
+ torch.onnx.export(
|
|
|
+ bio_head,
|
|
|
+ dummy,
|
|
|
+ str(model_dir / "slot_tagger.onnx"),
|
|
|
+ input_names=["token_embeddings"],
|
|
|
+ output_names=["bio_logits"],
|
|
|
+ dynamic_axes={
|
|
|
+ "token_embeddings": {0: "batch", 1: "seq_len"},
|
|
|
+ "bio_logits": {0: "batch", 1: "seq_len"},
|
|
|
+ },
|
|
|
+ opset_version=18,
|
|
|
+ )
|
|
|
+ size_kb = (model_dir / "slot_tagger.onnx").stat().st_size / 1024
|
|
|
+ pinfo(f"BIO ONNX: slot_tagger.onnx ({size_kb:.1f} KB)")
|
|
|
+ except Exception as e:
|
|
|
+ perror(f"BIO ONNX Export fehlgeschlagen: {e}")
|
|
|
+
|
|
|
+ pinfo(
|
|
|
+ f"BIO-Tagger Training abgeschlossen: "
|
|
|
+ f"{tagset.num_tags} Tags, {len(bio_samples)} Samples"
|
|
|
+ )
|
|
|
+
|
|
|
# === Hilfsmethoden ===
|
|
|
|
|
|
def _create_generator(self, slot_registry: SlotRegistry | None = None) -> DataGenerator:
|