ONNX encoder export via optimum (int8-quantized, 113 MB)

- Encoder export: optimum ORTModelForFeatureExtraction + int8 quantization
  448 MB float32 → 113 MB int8 (4x smaller)
- Runtime: ONNX encoder preferred, SentenceTransformer as fallback
  No PyTorch needed on the Pi when the ONNX encoder is present
- Inference: 1.8-5.2 ms with the ONNX encoder (previously 4-8 ms with SentenceTransformer)
- Removed the _tag_names reference (BIO tags were dropped in a previous commit)

Deployment bundle models/intent/ (loading sketch below):
  encoder_onnx/model_quantized.onnx (113 MB) — encoder
  intent_classifier.onnx (~5 KB) — classifier head
  tokenizer/ — HuggingFace tokenizer
  metafile.json + intent_labels.json

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
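
For reference, a minimal sketch of consuming this bundle without PyTorch (an
illustration, not code from this commit: it assumes onnxruntime and transformers
are installed, that intent_labels.json holds an ordered list of label names, and
that the classifier head takes the pooled embedding as its single input):

    # Sketch: load the deployment bundle and classify one utterance.
    import json
    from pathlib import Path

    import numpy as np
    import onnxruntime as ort
    from transformers import AutoTokenizer

    bundle = Path("models/intent")
    tokenizer = AutoTokenizer.from_pretrained(bundle / "tokenizer")
    encoder = ort.InferenceSession(str(bundle / "encoder_onnx" / "model_quantized.onnx"))
    classifier = ort.InferenceSession(str(bundle / "intent_classifier.onnx"))
    labels = json.loads((bundle / "intent_labels.json").read_text())

    inputs = tokenizer("mach das licht an", return_tensors="np",
                       padding=True, truncation=True, max_length=128)
    # Feed only the inputs the encoder graph actually declares (some exports
    # omit token_type_ids).
    names = {i.name for i in encoder.get_inputs()}
    feed = {k: v for k, v in inputs.items() if k in names}
    hidden = encoder.run(None, feed)[0][0]              # (seq_len, dim)

    # Mean pooling over non-padding tokens, matching the runtime diff below.
    mask = inputs["attention_mask"][0].astype(np.float32)
    embedding = (hidden * mask[:, None]).sum(axis=0) / mask.sum()

    logits = classifier.run(None, {classifier.get_inputs()[0].name:
                                   embedding.reshape(1, -1).astype(np.float32)})[0]
    print(labels[int(np.argmax(logits))])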
Author: patrick
Commit: 0ebac77fe3
2 changed files with 103 additions and 40 deletions:
  1. trixy_core/nlp/intent_classifier.py (+58 -14)
  2. trixy_core/trainer/core/intent/trainer.py (+45 -26)

trixy_core/nlp/intent_classifier.py (+58 -14)

@@ -87,11 +87,12 @@ class IntentClassifier:
     """
 
     def __init__(self) -> None:
-        self._encoder = None           # SentenceTransformer
+        self._encoder = None           # SentenceTransformer or ONNX encoder
+        self._tokenizer = None         # HuggingFace tokenizer (for the ONNX encoder)
         self._classifier = None        # ONNX InferenceSession or PyTorch
         self._use_onnx = False
+        self._use_onnx_encoder = False  # True when the ONNX encoder replaces SentenceTransformer
         self._intent_names: list[str] = []
-        self._tag_names: list[str] = []
         self._slot_names: list[str] = []
         self._embedding_dim: int = 0
         self._loaded = False
@@ -147,14 +148,40 @@ class IntentClassifier:
             perror(f"IntentClassifier: Metadaten-Fehler: {e}")
             return False
 
-        # Load the encoder
-        try:
-            from sentence_transformers import SentenceTransformer
-            self._encoder = SentenceTransformer(base_model)
-            pdebug(f"IntentClassifier: encoder loaded ({base_model})")
-        except ImportError:
-            perror("sentence-transformers not installed")
-            return False
+        # Load the encoder (ONNX preferred, SentenceTransformer as fallback)
+        encoder_dir = model_dir / "encoder_onnx"
+        has_onnx_encoder = encoder_dir.is_dir() and any(
+            f.suffix == ".onnx" for f in encoder_dir.iterdir() if f.is_file()
+        )
+        if has_onnx_encoder:
+            # ONNX encoder (no PyTorch needed, ~112 MB)
+            try:
+                from optimum.onnxruntime import ORTModelForFeatureExtraction
+                from transformers import AutoTokenizer
+
+                # Prefer the quantized model
+                onnx_files = [f.name for f in encoder_dir.iterdir() if f.suffix == ".onnx"]
+                file_name = "model_quantized.onnx" if "model_quantized.onnx" in onnx_files else None
+                self._encoder = ORTModelForFeatureExtraction.from_pretrained(
+                    str(encoder_dir), file_name=file_name,
+                )
+                self._tokenizer = AutoTokenizer.from_pretrained(str(encoder_dir))
+                self._use_onnx_encoder = True
+                pinfo(f"IntentClassifier: ONNX-Encoder geladen ({encoder_dir})")
+            except ImportError:
+                pdebug("optimum nicht verfuegbar, versuche SentenceTransformer")
+            except Exception as e:
+                pdebug(f"ONNX-Encoder Fehler: {e}, versuche SentenceTransformer")
+
+        if not self._use_onnx_encoder:
+            # Fallback: SentenceTransformer (needs PyTorch, ~400 MB)
+            try:
+                from sentence_transformers import SentenceTransformer
+                self._encoder = SentenceTransformer(base_model)
+                pdebug(f"IntentClassifier: SentenceTransformer geladen ({base_model})")
+            except ImportError:
+                perror("Weder optimum noch sentence-transformers installiert")
+                return False
 
         # Load the classifier (ONNX preferred, PyTorch fallback)
         onnx_path = model_dir / "intent_classifier.onnx"
@@ -197,8 +224,8 @@ class IntentClassifier:
 
         self._loaded = True
         pinfo(
-            f"IntentClassifier geladen: {len(self._intent_names)} Intents, "
-            f"{len(self._tag_names)} Slot-Tags"
+            f"IntentClassifier geladen: {len(self._intent_names)} Intents"
+            f"{', ONNX-Encoder' if self._use_onnx_encoder else ', SentenceTransformer'}"
         )
         return True
 
@@ -226,8 +253,25 @@ class IntentClassifier:
 
         # 1. Compute the embedding
         import numpy as np
-        embedding = self._encoder.encode([text], show_progress_bar=False)
-        embedding = np.array(embedding, dtype=np.float32)
+
+        if self._use_onnx_encoder:
+            # ONNX encoder: tokenize → encode → mean-pool
+            inputs = self._tokenizer(
+                text, return_tensors="np",
+                padding=True, truncation=True, max_length=128,
+            )
+            outputs = self._encoder(**inputs)
+            # Mean pooling over token embeddings (padding masked out)
+            token_embeddings = outputs.last_hidden_state[0]  # (seq_len, dim)
+            attention_mask = inputs["attention_mask"][0]      # (seq_len,)
+            mask = attention_mask.astype(np.float32)
+            masked = token_embeddings * mask[:, np.newaxis]
+            embedding = masked.sum(axis=0) / mask.sum()
+            embedding = embedding.reshape(1, -1).astype(np.float32)
+        else:
+            # SentenceTransformer: computes the embedding directly
+            embedding = self._encoder.encode([text], show_progress_bar=False)
+            embedding = np.array(embedding, dtype=np.float32)
 
         # 2. Run the classifier
         if self._use_onnx:

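Since the ONNX path re-implements the mean pooling that SentenceTransformer
normally performs internally, a parity check after export is a cheap safeguard.
A sketch, assuming both stacks are installed on the training machine (paths and
the sample sentence are illustrative):

    # Sketch: cosine similarity between the float32 SentenceTransformer
    # embedding and the int8 ONNX embedding. Quantization costs a little
    # precision, so expect a value near 1.0 rather than exactly 1.0.
    import numpy as np
    from optimum.onnxruntime import ORTModelForFeatureExtraction
    from sentence_transformers import SentenceTransformer
    from transformers import AutoTokenizer

    base = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    onnx_dir = "models/intent/encoder_onnx"
    text = "wie wird das wetter morgen"

    ref = SentenceTransformer(base).encode([text])[0]

    tok = AutoTokenizer.from_pretrained(onnx_dir)
    enc = ORTModelForFeatureExtraction.from_pretrained(
        onnx_dir, file_name="model_quantized.onnx",
    )
    inputs = tok(text, return_tensors="np",
                 padding=True, truncation=True, max_length=128)
    hidden = enc(**inputs).last_hidden_state[0]
    mask = inputs["attention_mask"][0].astype(np.float32)
    emb = (hidden * mask[:, None]).sum(axis=0) / mask.sum()

    cos = float(ref @ emb / (np.linalg.norm(ref) * np.linalg.norm(emb)))
    print(f"cosine(float32, int8) = {cos:.4f}")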
trixy_core/trainer/core/intent/trainer.py (+45 -26)

@@ -592,34 +592,53 @@ class IntentTrainer(ITrainer):
     def _export_encoder_onnx(
         self, encoder: Any, model_dir: Path,
     ) -> None:
-        """Exportiert den SentenceTransformer-Encoder nach ONNX."""
-        import torch
+        """
+        Exportiert den SentenceTransformer-Encoder nach ONNX.
 
-        # Den internen Transformer extrahieren
-        transformer = encoder[0]  # SentenceTransformer → Transformer module
-        auto_model = transformer.auto_model
+        Nutzt die optimum-Library fuer zuverlaessigen Export.
+        Quantisiert auf Int8 (~112MB statt 448MB).
+        Auf dem Pi braucht man dann kein PyTorch/sentence-transformers.
+        """
+        try:
+            from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTQuantizer
+            from optimum.onnxruntime.configuration import AutoQuantizationConfig
+            from transformers import AutoTokenizer
+        except ImportError:
+            perror("optimum nicht installiert — pip install optimum[onnxruntime]")
+            return
 
-        # Dummy input
-        tokenizer = encoder.tokenizer
-        dummy = tokenizer(
-            "test satz", return_tensors="pt",
-            padding=True, truncation=True, max_length=128,
+        base_model = self._settings.get(
+            "base_model",
+            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
         )
 
-        onnx_path = model_dir / "encoder.onnx"
-        torch.onnx.export(
-            auto_model,
-            (dummy["input_ids"], dummy["attention_mask"]),
-            str(onnx_path),
-            input_names=["input_ids", "attention_mask"],
-            output_names=["last_hidden_state"],
-            dynamic_axes={
-                "input_ids": {0: "batch", 1: "sequence"},
-                "attention_mask": {0: "batch", 1: "sequence"},
-                "last_hidden_state": {0: "batch", 1: "sequence"},
-            },
-            opset_version=18,
-        )
+        encoder_dir = model_dir / "encoder_onnx"
+        encoder_float_dir = model_dir / "encoder_float32"
+
+        # 1. Export the float32 ONNX model
+        pinfo("ONNX encoder: exporting float32...")
+        model = ORTModelForFeatureExtraction.from_pretrained(base_model, export=True)
+        model.save_pretrained(str(encoder_float_dir))
+
+        # 2. Save the tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(base_model)
+        tokenizer.save_pretrained(str(encoder_float_dir))
 
-        file_size_mb = onnx_path.stat().st_size / (1024 * 1024)
-        pinfo(f"ONNX Encoder: {onnx_path} ({file_size_mb:.1f} MB)")
+        # 3. Quantization (int8 — ~4x smaller)
+        pinfo("ONNX encoder: quantizing to int8...")
+        quantizer = ORTQuantizer.from_pretrained(str(encoder_float_dir))
+        qconfig = AutoQuantizationConfig.avx2(is_static=False)
+        quantizer.quantize(save_dir=str(encoder_dir), quantization_config=qconfig)
+
+        # Copy the tokenizer into the quantized directory
+        tokenizer.save_pretrained(str(encoder_dir))
+
+        # Delete the float32 version (saves ~340 MB)
+        import shutil
+        shutil.rmtree(str(encoder_float_dir), ignore_errors=True)
+
+        # Report the bundle size
+        total_size = sum(
+            f.stat().st_size for f in encoder_dir.rglob("*") if f.is_file()
+        )
+        pinfo(f"ONNX Encoder: {encoder_dir} ({total_size / 1024 / 1024:.1f} MB, Int8)")