Parcourir la source

Fuellwort-Injection fuer Intent-Training (multilingual via JSON)

- config/fillers_de.json: Deutsche Fuellwoerter in 4 Prefix-Kategorien
  (hesitation, casual, polite, emotional) sowie suffixes und insertions
- DataGenerator.load_fillers(): Laedt Fuellwoerter aus externer Datei
- _inject_fillers(): Erzeugt ~3 Varianten pro Sample mit Fuellwoertern
  (nur das erste Zeichen wird kleingeschrieben, die Wortfolge bleibt erhalten)
  "Spiel Musik" → "koenntest du bitte spiel Musik"
  "Timer 5 Minuten" → "aehm timer 5 Minuten bitte"
  "Wie spaet ist es" → "sag mal wie spaet ist es"
- Multilingual: Pro Sprache eine fillers_{lang}.json Datei

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
patrick il y a 2 mois
Parent
commit
36433645ac

+ 30 - 0
config/fillers_de.json

@@ -0,0 +1,30 @@
+{
+  "_comment": "Fuellwoerter fuer Intent-Training Datengenerierung (Deutsch)",
+  "language": "de",
+
+  "prefixes": {
+    "hesitation": ["aehm ", "also ", "na ", "ok "],
+    "casual": ["hey ", "ey ", "du ", "sag mal ", "mal eben "],
+    "polite": [
+      "bitte ", "koenntest du ", "kannst du ", "wuerdest du ",
+      "koenntest du bitte ", "kannst du bitte ",
+      "koenntest du mir ", "kannst du mir ",
+      "sei so lieb und ", "waerst du so nett und ",
+      "wuerdest du bitte "
+    ],
+    "emotional": [
+      "verdammt ", "schnell ", "sofort ", "endlich ",
+      "jetzt ", "jetzt mal ", "nun "
+    ]
+  },
+
+  "suffixes": [
+    " bitte", " mal", " kurz", " schnell",
+    " fuer mich", " bitte danke", " ja"
+  ],
+
+  "insertions": {
+    "_comment": "Woerter die an beliebiger Stelle eingefuegt werden koennen",
+    "words": ["doch", "mal", "eben", "halt", "eigentlich", "denn"]
+  }
+}

+ 144 - 1
trixy_core/trainer/core/intent/data_generator.py

@@ -66,6 +66,14 @@ class DataGenerator:
         self._slots = slot_registry
         self._intents: dict[str, IntentDefinition] = {}
         self._negatives_file: str = "trainer/data/intent/negatives.txt"
+        self._fillers_file: str = "config/fillers_de.json"
+
+        # Fuellwoerter (aus JSON geladen via load_fillers())
+        self._filler_prefixes: list[str] = []
+        self._filler_polite: list[str] = []
+        self._filler_emotional: list[str] = []
+        self._filler_suffixes: list[str] = []
+        self._filler_insertions: list[str] = []
 
     def add_intent(self, intent: IntentDefinition) -> None:
         """Fuegt einen Intent hinzu oder merged mit bestehendem."""
@@ -268,7 +276,11 @@ class DataGenerator:
                 expanded = self._expand_pattern(pattern, intent_name, rng)
                 samples.extend(expanded)
 
-            # 3. Augmentation falls noetig
+            # 3. Fuellwort-Varianten generieren
+            filler_samples = self._inject_fillers(samples, rng)
+            samples.extend(filler_samples)
+
+            # 4. Augmentation falls noetig
             if len(samples) < samples_per_intent:
                 augmented = self._augment(
                     samples,
@@ -402,6 +414,137 @@ class DataGenerator:
 
         return text, slots
 
+    # === Fuellwort-Injection ===
+
+    def load_fillers(self, path: str | Path = "") -> bool:
+        """
+        Laedt Fuellwoerter aus einer JSON-Datei.
+
+        Format: config/fillers_de.json (multilingual austauschbar)
+
+        Returns:
+            True bei Erfolg
+        """
+        import json as _json
+
+        fpath = Path(path or self._fillers_file)
+        if not fpath.is_file():
+            pdebug(f"Fuellwoerter nicht gefunden: {fpath}")
+            return False
+
+        try:
+            with open(fpath, encoding="utf-8") as f:
+                data = _json.load(f)
+
+            prefixes = data.get("prefixes", {})
+            self._filler_prefixes = prefixes.get("hesitation", []) + prefixes.get("casual", [])
+            self._filler_polite = prefixes.get("polite", [])
+            self._filler_emotional = prefixes.get("emotional", [])
+            self._filler_suffixes = data.get("suffixes", [])
+            self._filler_insertions = data.get("insertions", {}).get("words", [])
+
+            total = (len(self._filler_prefixes) + len(self._filler_polite)
+                     + len(self._filler_emotional) + len(self._filler_suffixes)
+                     + len(self._filler_insertions))
+            pdebug(f"Fuellwoerter geladen: {total} aus {fpath}")
+            return True
+
+        except Exception as e:
+            pdebug(f"Fuellwoerter Fehler: {e}")
+            return False
+
+    def _inject_fillers(
+        self,
+        samples: list[TrainingSample],
+        rng: random.Random,
+        variants_per_sample: int = 3,
+    ) -> list[TrainingSample]:
+        """
+        Erzeugt Varianten mit Fuellwoertern aus der JSON-Datei.
+
+        Pro Sample werden bis zu 3 Varianten erzeugt:
+        - Mit hoeflichem Prefix ("koenntest du bitte ...")
+        - Mit Fuellwort-Prefix ("aehm ...", "also ...")
+        - Mit Suffix ("... bitte", "... mal")
+        - Mit Insertion ("kannst du doch mal ...")
+        """
+        if not (self._filler_prefixes or self._filler_polite
+                or self._filler_emotional or self._filler_suffixes):
+            return []
+
+        filler_types = []
+        if self._filler_polite:
+            filler_types.append("polite")
+        if self._filler_prefixes:
+            filler_types.append("prefix")
+        if self._filler_suffixes:
+            filler_types.append("suffix")
+        if self._filler_emotional:
+            filler_types.append("emotional")
+        if self._filler_insertions:
+            filler_types.append("insertion")
+        if self._filler_prefixes and self._filler_suffixes:
+            filler_types.append("combo")
+
+        if not filler_types:
+            return []
+
+        result: list[TrainingSample] = []
+        seen = {s.text.lower() for s in samples}
+
+        for sample in samples:
+            text = sample.text
+            if len(text.split()) < 2:
+                continue
+
+            for _ in range(variants_per_sample):
+                variant = text
+                filler_type = rng.choice(filler_types)
+
+                if filler_type == "polite":
+                    prefix = rng.choice(self._filler_polite)
+                    variant = prefix + text[0].lower() + text[1:]
+
+                elif filler_type == "prefix":
+                    prefix = rng.choice(self._filler_prefixes)
+                    variant = prefix + text[0].lower() + text[1:]
+
+                elif filler_type == "suffix":
+                    suffix = rng.choice(self._filler_suffixes)
+                    variant = text.rstrip("?!.") + suffix
+
+                elif filler_type == "emotional":
+                    prefix = rng.choice(self._filler_emotional)
+                    variant = prefix + text[0].lower() + text[1:]
+
+                elif filler_type == "insertion":
+                    words = text.split()
+                    if len(words) >= 3:
+                        pos = rng.randint(1, len(words) - 1)
+                        filler = rng.choice(self._filler_insertions)
+                        words.insert(pos, filler)
+                        variant = " ".join(words)
+
+                elif filler_type == "combo":
+                    prefix = rng.choice(self._filler_prefixes + self._filler_polite)
+                    suffix = rng.choice(self._filler_suffixes)
+                    variant = prefix + text[0].lower() + text[1:]
+                    variant = variant.rstrip("?!.") + suffix
+
+                # Bereinigen
+                variant = re.sub(r"\s+", " ", variant).strip()
+
+                if variant.lower() not in seen and variant:
+                    seen.add(variant.lower())
+                    result.append(TrainingSample(
+                        text=variant,
+                        intent=sample.intent,
+                        slots=dict(sample.slots),
+                        source="filler",
+                    ))
+
+        return result
+
     # === Augmentation ===
 
     # Prefixe und Suffixe fuer natuerliche Varianten

+ 3 - 0
trixy_core/trainer/core/intent/trainer.py

@@ -272,6 +272,9 @@ class IntentTrainer(ITrainer):
             self._settings.get("core_nlp_dir", "trixy_core/nlp"),
         )
 
+        # Fuellwoerter laden (fuer Trainings-Augmentation)
+        generator.load_fillers("config/fillers_de.json")
+
         # Zusaetzliche Beispiele laden
         examples_dir = self._dataset_config.get("additional_examples_dir", "")
         if examples_dir and os.path.isdir(examples_dir):