|
|
@@ -531,6 +531,40 @@ class DataGenerator:
|
|
|
pdebug(f"Fuellwoerter Fehler: {e}")
|
|
|
return False
|
|
|
|
|
|
# Imperative verb forms (lower-case, umlauts transliterated) that may be
# rewritten into a polite request. The polite form uses "koenntest du" plus
# the infinitive instead of the imperative, so each key maps to the
# infinitive that gets moved to the end of the sentence.
#
# Both the short and the "-e" imperative are listed where German allows
# both. "oeffne" has no short form ("oeffn" is not a valid imperative).
_IMPERATIV_TO_INFINITIV: dict[str, str] = {
    "spiel": "spielen", "spiele": "spielen",
    "stell": "stellen", "stelle": "stellen",
    "sag": "sagen", "sage": "sagen",
    "zeig": "zeigen", "zeige": "zeigen",
    "mach": "machen", "mache": "machen",
    "stopp": "stoppen", "stoppe": "stoppen",
    "halt": "halten", "halte": "halten",
    "schalt": "schalten", "schalte": "schalten",
    "oeffne": "oeffnen",
    # "schliesse" was missing although every other regular verb lists
    # both imperative variants.
    "schliess": "schliessen", "schliesse": "schliessen",
    "ruf": "rufen", "rufe": "rufen",
    "such": "suchen", "suche": "suchen",
    "start": "starten", "starte": "starten",
    "setz": "setzen", "setze": "setzen",
    "erstell": "erstellen", "erstelle": "erstellen",
    "bestell": "bestellen", "bestelle": "bestellen",
    "loesch": "loeschen", "loesche": "loeschen",
    "entfern": "entfernen", "entferne": "entfernen",
    "aktivier": "aktivieren", "aktiviere": "aktivieren",
    "deaktivier": "deaktivieren", "deaktiviere": "deaktivieren",
    "leere": "leeren", "leer": "leeren",
    "wechsle": "wechseln", "wechsel": "wechseln",
    "nenn": "nennen", "nenne": "nennen",
    "nimm": "nehmen", "nehme": "nehmen",
    "gib": "geben", "gebe": "geben",
    "lies": "lesen", "lese": "lesen",
    "merke": "merken", "merk": "merken",
    "notiere": "notieren", "notier": "notieren",
    "erinnere": "erinnern", "erinner": "erinnern",
    "weck": "wecken", "wecke": "wecken",
}
|
|
|
+
|
|
|
def _inject_fillers(
    self,
    samples: list[TrainingSample],
    # NOTE(review): this parameter line is elided by the hunk boundary in
    # the visible diff; the name is taken from the `rng.choice(...)` calls
    # in the body. Confirm position and annotation against the real file.
    rng: "random.Random",
    variants_per_sample: int = 3,
) -> list[TrainingSample]:
    """
    Create grammatically sound filler-word variants of training samples.

    Rules:
    - Only samples whose ``source`` is "example" or "pattern" are used, so
      variants are never built on top of other generated samples.
    - Samples with fewer than two words or containing "[" (unresolved
      placeholder) are skipped.
    - Polite prefixes are applied only to imperative sentences that are
      not already polite and are not questions; the imperative verb is
      replaced by its infinitive and moved to the end
      ("Loesche den Cache" -> "Koenntest du den Cache loeschen").
    - Suffixes are applied only to sentences that do not contain "bitte".
    - Plain and emotional prefixes are never stacked on a sentence that
      already starts with a polite word; emotional prefixes additionally
      require an imperative.

    Args:
        samples: Source samples to derive variants from.
        rng: Random source; only ``rng.choice`` is used.
        variants_per_sample: Maximum number of new variants per sample. Up
            to three times as many attempts are made, because an attempt
            can be rejected (duplicate text or doubled "mir").

    Returns:
        The newly created samples (``source="filler"``), with the intent
        and a shallow copy of the slots of their origin sample. Variants
        whose lower-cased text equals an input sample or an earlier
        variant are dropped. Returns an empty list when no prefix, polite,
        emotional or suffix fillers are loaded.
    """
    if not (
        self._filler_prefixes
        or self._filler_polite
        or self._filler_emotional
        or self._filler_suffixes
    ):
        return []

    # Loop-invariant lookup sets.
    polite_starts = frozenset(
        {"kannst", "koenntest", "wuerdest", "waerst", "sei", "bitte"}
    )
    question_starts = frozenset({
        "wie", "was", "wer", "wo", "wann", "warum", "welche", "welcher",
        "welches", "wieviel",
    })
    max_attempts = variants_per_sample * 3

    result: list[TrainingSample] = []
    seen = {s.text.lower() for s in samples}

    for sample in samples:
        # Only original samples, no filler/augmented ones.
        if sample.source not in ("example", "pattern"):
            continue

        text = sample.text
        # Skip samples that still carry unresolved placeholders.
        if "[" in text:
            continue

        words = text.split()
        if len(words) < 2:
            continue

        first_word_lower = words[0].lower()
        has_polite = first_word_lower in polite_starts
        has_bitte = "bitte" in text.lower()
        is_imperativ = first_word_lower in self._IMPERATIV_TO_INFINITIV
        is_question = (
            text.rstrip().endswith("?") or first_word_lower in question_starts
        )
        # Word-level test so that a trailing "mir" ("Zeig es mir") or one
        # followed by punctuation is detected as well; a plain substring
        # test for " mir " misses both.
        has_mir = any(
            word.strip("?!.,;:").lower() == "mir" for word in words[1:]
        )

        # Decide which filler kinds are grammatical for this sentence.
        # The order is kept stable because it feeds rng.choice().
        allowed = []
        if self._filler_suffixes and not has_bitte:
            allowed.append("suffix")
        if (
            self._filler_polite
            and is_imperativ
            and not has_polite
            and not is_question
        ):
            allowed.append("polite")
        if self._filler_prefixes and not has_polite:
            allowed.append("prefix")
        if self._filler_emotional and is_imperativ and not has_polite:
            allowed.append("emotional")

        if not allowed:
            continue

        generated = 0
        attempts = 0
        while generated < variants_per_sample and attempts < max_attempts:
            attempts += 1
            filler_type = rng.choice(allowed)

            if filler_type == "suffix":
                # "suffix" is only allowed when the text has no "bitte",
                # so no further "bitte" check is needed here.
                suffix = rng.choice(self._filler_suffixes)
                variant = text.rstrip("?!.") + suffix

            elif filler_type == "polite":
                prefix = rng.choice(self._filler_polite)
                # Avoid a doubled "mir" ("kannst du mir" + "... mir ...").
                if "mir " in prefix and has_mir:
                    continue
                # Imperative -> infinitive, moved to the end.
                infinitiv = self._IMPERATIV_TO_INFINITIV[first_word_lower]
                rest = " ".join(words[1:]).rstrip("?!.")
                if rest:
                    variant = f"{prefix}{rest} {infinitiv}"
                else:
                    variant = f"{prefix}{infinitiv}"

            elif filler_type == "prefix":
                prefix = rng.choice(self._filler_prefixes)
                variant = prefix + text[0].lower() + text[1:]

            else:  # "emotional"
                prefix = rng.choice(self._filler_emotional)
                variant = prefix + text[0].lower() + text[1:]

            # Collapse whitespace introduced by the concatenation.
            variant = re.sub(r"\s+", " ", variant).strip()
            if not variant:
                continue

            key = variant.lower()
            if key in seen:
                continue

            seen.add(key)
            result.append(TrainingSample(
                text=variant,
                intent=sample.intent,
                slots=dict(sample.slots),
                source="filler",
            ))
            generated += 1

    return result
|
|
|
|