|
|
@@ -66,6 +66,14 @@ class DataGenerator:
|
|
|
self._slots = slot_registry
|
|
|
self._intents: dict[str, IntentDefinition] = {}
|
|
|
self._negatives_file: str = "trainer/data/intent/negatives.txt"
|
|
|
+ self._fillers_file: str = "config/fillers_de.json"
|
|
|
+
|
|
|
+ # Fuellwoerter (aus JSON geladen via load_fillers())
|
|
|
+ self._filler_prefixes: list[str] = []
|
|
|
+ self._filler_polite: list[str] = []
|
|
|
+ self._filler_emotional: list[str] = []
|
|
|
+ self._filler_suffixes: list[str] = []
|
|
|
+ self._filler_insertions: list[str] = []
|
|
|
|
|
|
def add_intent(self, intent: IntentDefinition) -> None:
|
|
|
"""Fuegt einen Intent hinzu oder merged mit bestehendem."""
|
|
|
@@ -268,7 +276,11 @@ class DataGenerator:
|
|
|
expanded = self._expand_pattern(pattern, intent_name, rng)
|
|
|
samples.extend(expanded)
|
|
|
|
|
|
- # 3. Augmentation falls noetig
|
|
|
+ # 3. Fuellwort-Varianten generieren
|
|
|
+ filler_samples = self._inject_fillers(samples, rng)
|
|
|
+ samples.extend(filler_samples)
|
|
|
+
|
|
|
+ # 4. Augmentation falls noetig
|
|
|
if len(samples) < samples_per_intent:
|
|
|
augmented = self._augment(
|
|
|
samples,
|
|
|
@@ -402,6 +414,137 @@ class DataGenerator:
|
|
|
|
|
|
return text, slots
|
|
|
|
|
|
+ # === Fuellwort-Injection ===
|
|
|
+
|
|
|
def load_fillers(self, path: str | Path = "") -> bool:
    """
    Load filler words from a JSON file into the generator.

    Layout read from the file (see config/fillers_de.json; swapping
    the file swaps the language)::

        {
          "prefixes":   {"hesitation": [...], "casual": [...],
                         "polite": [...], "emotional": [...]},
          "suffixes":   [...],
          "insertions": {"words": [...]}
        }

    Every key is optional; a missing key yields an empty list.
    "hesitation" and "casual" are merged into one prefix list.

    The file is parsed and validated completely before any attribute
    is touched, so a failed load never leaves the instance with a
    partially replaced filler set.

    Args:
        path: JSON file to read. An empty value falls back to
            ``self._fillers_file``.

    Returns:
        True if the file was read and all filler lists were replaced,
        False if the file is missing, unreadable or malformed (the
        previously loaded fillers are kept in that case).
    """
    import json as _json

    fpath = Path(path or self._fillers_file)
    if not fpath.is_file():
        pdebug(f"Fuellwoerter nicht gefunden: {fpath}")
        return False

    def _words(section: object, key: str) -> list[str]:
        # Return a copy of section[key], insisting on a list of strings.
        # Anything else would only blow up later inside rng.choice() or
        # string concatenation, far away from the actual cause.
        if not isinstance(section, dict):
            raise TypeError(
                f"section holding '{key}' must be a JSON object, "
                f"got {type(section).__name__}"
            )
        value = section.get(key, [])
        if not isinstance(value, list) or not all(
            isinstance(word, str) for word in value
        ):
            raise TypeError(f"'{key}' must be a list of strings")
        return list(value)

    try:
        with open(fpath, encoding="utf-8") as f:
            data = _json.load(f)

        if not isinstance(data, dict):
            raise TypeError(
                f"top-level JSON value must be an object, "
                f"got {type(data).__name__}"
            )

        prefixes = data.get("prefixes", {})
        insertions = data.get("insertions", {})

        # Build everything in locals first; nothing on self changes
        # until the whole file has been validated.
        filler_prefixes = _words(prefixes, "hesitation") + _words(prefixes, "casual")
        filler_polite = _words(prefixes, "polite")
        filler_emotional = _words(prefixes, "emotional")
        filler_suffixes = _words(data, "suffixes")
        filler_insertions = _words(insertions, "words")

    except Exception as e:
        # Deliberately broad: loading fillers is best-effort and callers
        # only look at the boolean result.
        pdebug(f"Fuellwoerter Fehler: {e}")
        return False

    # Commit atomically.
    self._filler_prefixes = filler_prefixes
    self._filler_polite = filler_polite
    self._filler_emotional = filler_emotional
    self._filler_suffixes = filler_suffixes
    self._filler_insertions = filler_insertions

    total = (
        len(filler_prefixes)
        + len(filler_polite)
        + len(filler_emotional)
        + len(filler_suffixes)
        + len(filler_insertions)
    )
    pdebug(f"Fuellwoerter geladen: {total} aus {fpath}")
    return True
|
|
|
+
|
|
|
def _inject_fillers(
    self,
    samples: list[TrainingSample],
    rng: random.Random,
    variants_per_sample: int = 3,
) -> list[TrainingSample]:
    """
    Create filler-word variants of the given samples.

    For every sample whose text has at least two whitespace-separated
    words, ``variants_per_sample`` attempts are made. Each attempt
    picks one filler type at random among those that have words loaded:

    - "polite":    polite prefix in front of the text
    - "prefix":    hesitation/casual prefix in front of the text
    - "emotional": emotional prefix in front of the text
    - "suffix":    trailing "?", "!" and "." removed, suffix appended
    - "insertion": filler word inserted between two words
                   (only for texts with three or more words)
    - "combo":     prefix (hesitation/casual/polite) plus suffix;
                   offered only when both prefixes and suffixes exist

    When a prefix is applied, the first character of the original text
    is lower-cased. Prefixes and suffixes are concatenated verbatim,
    so any separating whitespace has to be part of the filler strings
    themselves. Whitespace runs are collapsed afterwards.

    A variant is dropped if its lower-cased text equals an input sample
    or an earlier variant, so fewer than ``variants_per_sample``
    variants per sample may be produced.

    Args:
        samples: Source samples; the list and its items are not modified.
        rng: Random generator used for all choices (reproducible per seed).
        variants_per_sample: Number of attempts per sample.

    Returns:
        Only the newly created samples, each with ``source="filler"``,
        the intent of its source sample and a shallow copy of its slots.
        Empty if no filler words are loaded.
    """
    # Collect the filler types that actually have words available.
    # This list alone decides whether there is anything to do; the
    # order is kept stable because it feeds rng.choice().
    filler_types: list[str] = []
    if self._filler_polite:
        filler_types.append("polite")
    if self._filler_prefixes:
        filler_types.append("prefix")
    if self._filler_suffixes:
        filler_types.append("suffix")
    if self._filler_emotional:
        filler_types.append("emotional")
    if self._filler_insertions:
        filler_types.append("insertion")
    if self._filler_prefixes and self._filler_suffixes:
        filler_types.append("combo")

    if not filler_types:
        return []

    # Loop-invariant: candidate prefixes for the "combo" type.
    combo_prefixes = self._filler_prefixes + self._filler_polite

    result: list[TrainingSample] = []
    seen = {s.text.lower() for s in samples}

    for sample in samples:
        text = sample.text
        words = text.split()
        if len(words) < 2:
            continue

        # Text as it appears behind a prefix (first character lower-cased).
        lowered = text[0].lower() + text[1:]

        for _ in range(variants_per_sample):
            variant = text
            filler_type = rng.choice(filler_types)

            if filler_type == "polite":
                variant = rng.choice(self._filler_polite) + lowered

            elif filler_type == "prefix":
                variant = rng.choice(self._filler_prefixes) + lowered

            elif filler_type == "suffix":
                variant = text.rstrip("?!.") + rng.choice(self._filler_suffixes)

            elif filler_type == "emotional":
                variant = rng.choice(self._filler_emotional) + lowered

            elif filler_type == "insertion":
                # Needs an inner gap; shorter texts stay unchanged and
                # are then rejected as duplicates below.
                if len(words) >= 3:
                    pos = rng.randint(1, len(words) - 1)
                    filler = rng.choice(self._filler_insertions)
                    with_filler = list(words)
                    with_filler.insert(pos, filler)
                    variant = " ".join(with_filler)

            elif filler_type == "combo":
                prefix = rng.choice(combo_prefixes)
                suffix = rng.choice(self._filler_suffixes)
                variant = (prefix + lowered).rstrip("?!.") + suffix

            # Normalise whitespace introduced by the fillers.
            variant = re.sub(r"\s+", " ", variant).strip()

            key = variant.lower()
            if variant and key not in seen:
                seen.add(key)
                # NOTE(review): slots are copied unchanged; if slot
                # values carry character offsets they would no longer
                # match the modified text - confirm.
                result.append(TrainingSample(
                    text=variant,
                    intent=sample.intent,
                    slots=dict(sample.slots),
                    source="filler",
                ))

    return result
|
|
|
+
|
|
|
# === Augmentation ===
|
|
|
|
|
|
# Prefixe und Suffixe fuer natuerliche Varianten
|