alpha01 backoffice: crossword engine, lexicon curation and JSON contract

2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions
--- a/clue_generator.py
+++ b/clue_generator.py
@@ -0,0 +1,423 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
+from crossword_generator import HORIZONTAL, Placement
+
+
+@dataclass(frozen=True)
+class Clue:
+    """Immutable clue attached to one placed word, as built by ``generate_clues``."""
+
+    # 1-based position of the placement in the order it was iterated.
+    number: int
+    # The placed word (the answer), copied from the placement.
+    word: str
+    # Display label: "orizzontale" for horizontal placements, otherwise "verticale".
+    direction: str
+    # Coordinates copied unchanged from the placement.
+    x: int
+    y: int
+    # Clue text shown for the word.
+    text: str
+    # Label of the data source that produced the text, or "fallback".
+    source: str
+
+
+@dataclass(frozen=True)
+class ClueCandidate:
+    """Immutable candidate clue text plus the metadata ``score_candidate`` ranks it by."""
+
+    # Cleaned clue text (output of ``clean_definition``).
+    text: str
+    # Label of the data source, e.g. "semantic" or "babelnet".
+    source: str
+    # Candidate family; selects the family bonus in ``score_candidate``.
+    family: str
+    # Style hint produced by ``candidate_hint``: "direct", "balanced" or "oblique".
+    difficulty_hint: str
+    # Topic relevance; ``score_candidate`` rewards values > 0 and, more, values >= 40.
+    topic_score: int
+    # Extra flag that earns a small bonus in ``score_candidate`` when true.
+    strong_topic: bool
+
+
+# Accepted difficulty spellings mapped to the canonical level names.
+# Numeric levels are given as strings; both "4" and "5" collapse to "expert".
+# ``normalize_difficulty`` falls back to "medium" for anything not listed here.
+DIFFICULTY_ALIASES = {
+    "1": "easy",
+    "2": "medium",
+    "3": "hard",
+    "4": "expert",
+    "5": "expert",
+    "easy": "easy",
+    "medium": "medium",
+    "hard": "hard",
+    "expert": "expert",
+}
+
+# Lower-case substrings that mark a clue as generic filler text.
+# ``score_candidate`` looks for them in the lower-cased candidate text and
+# applies a penalty when any of them is present.
+GENERIC_CLUE_PATTERNS = (
+    "termine da ricavare dagli incroci",
+    "termine lessicale collegato",
+    "collegato a:",
+)
+
+
+def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
+    """Load the enriched lexicon and index its entries by lower-cased form.
+
+    Args:
+        path: Location of the enriched lexicon JSON file.
+
+    Returns:
+        Mapping from the lower-cased ``form`` of each entry to the entry
+        itself. Entries that are not dicts or have no truthy ``form`` are
+        skipped; when two entries share a form the later one wins. An empty
+        mapping is returned when the payload is not a dict.
+    """
+    # The second argument is presumably the default used by ``load_json``
+    # when the file cannot be read -- confirm against build_enriched_lexicon.
+    payload = load_json(path, {"entries": []})
+    if not isinstance(payload, dict):
+        return {}
+
+    by_form: Dict[str, Dict[str, object]] = {}
+    for entry in payload.get("entries", []) or []:
+        if not isinstance(entry, dict) or not entry.get("form"):
+            continue
+        by_form[str(entry.get("form", "")).lower()] = entry
+    return by_form
+
+
+def normalize_difficulty(value: Optional[str]) -> str:
+    """Map a difficulty spelling to its canonical name.
+
+    Falsy values and spellings missing from ``DIFFICULTY_ALIASES`` both
+    resolve to ``"medium"``. Matching ignores case and surrounding whitespace.
+    """
+    lookup_key = str(value or "medium").strip().lower()
+    return DIFFICULTY_ALIASES.get(lookup_key, "medium")
+
+
+def clean_definition(text: str, answer: str) -> str:
+    """Turn a raw dictionary definition into a presentable clue sentence.
+
+    Steps, in order: bracketed annotations (``[...]``) are dropped, whole-word
+    occurrences of ``answer`` are masked with "questa parola", empty
+    parentheses are removed, whitespace is collapsed, spaces before commas and
+    semicolons are removed, edge punctuation is trimmed, the first letter is
+    capitalised and a final period is appended.
+
+    Args:
+        text: Raw definition; falsy values are treated as empty.
+        answer: Word the clue must not reveal. Matching is case-insensitive
+            and limited to whole words so that unrelated longer words that
+            merely contain the answer are left intact. An empty answer
+            disables masking.
+
+    Returns:
+        The cleaned clue ending with ".", or "" when nothing usable remains.
+    """
+    clue = re.sub(r"\[[^\]]*\]", " ", str(text or ""))
+
+    needle = str(answer or "").strip()
+    if needle:
+        # Look-arounds instead of a bare substring match: a plain replace
+        # would corrupt words that only embed the answer. An empty needle is
+        # skipped because it would match between every character.
+        clue = re.sub(
+            rf"(?<!\w){re.escape(needle)}(?!\w)",
+            "questa parola",
+            clue,
+            flags=re.IGNORECASE,
+        )
+
+    # Normalise only after every removal, so no double spaces, dangling
+    # punctuation or "empty" clues survive the substitutions above.
+    clue = re.sub(r"\(\s*\)", " ", clue)
+    clue = re.sub(r"\s+", " ", clue)
+    clue = re.sub(r"\s+([,;])", r"\1", clue)
+    clue = clue.strip(" .;:-")
+    if not clue:
+        return ""
+
+    if clue[0].islower():
+        clue = clue[0].upper() + clue[1:]
+    return clue + "."
+
+
+def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
+    """Tell whether ``synset`` carries one of the strong domains of ``topic``.
+
+    The check is vacuously true when no specific topic is requested (falsy or
+    "general") and when ``TOPIC_DOMAIN_RULES`` lists no "strong" domains for
+    the topic. Domain names are compared case-insensitively.
+    """
+    topic_is_specific = bool(topic) and topic != "general"
+    if not topic_is_specific:
+        return True
+
+    topic_rules = TOPIC_DOMAIN_RULES.get(topic, {})
+    required = {str(name).upper() for name in topic_rules.get("strong", ())}
+    if not required:
+        return True
+
+    tagged = {str(name).upper() for name in synset.get("domains", []) or []}
+    return not required.isdisjoint(tagged)
+
+
+def text_contains_answer(text: str, answer: str) -> bool:
+    """Return True when ``answer`` occurs anywhere in ``text``, ignoring case.
+
+    The answer is matched literally (regex metacharacters are escaped) and as
+    a plain substring, so it is also found inside longer words.
+    """
+    literal = re.compile(re.escape(answer), re.IGNORECASE)
+    return literal.search(text) is not None
+
+
+def directness_score(text: str) -> int:
+    """Estimate how plainly ``text`` names the category of the answer.
+
+    Each category keyword found in the lower-cased text is worth 8 points
+    (counted once per keyword, as a substring); the presence of at least one
+    explanatory connective ("cioè", "ossia", "ovvero") adds a single 4.
+    """
+    folded = text.lower()
+    category_words = (
+        "strumento",
+        "veicolo",
+        "animale",
+        "pianta",
+        "titolo",
+        "edificio",
+        "persona",
+        "luogo",
+        "malattia",
+        "farmaco",
+        "mezzo",
+        "parte di",
+    )
+    hits = sum(1 for word in category_words if word in folded)
+    points = 8 * hits
+    if "cioè" in folded or "ossia" in folded or "ovvero" in folded:
+        points += 4
+    return points
+
+
+def preferred_length_range(difficulty: str) -> Tuple[int, int]:
+    """Return the ``(min, max)`` clue length, in characters, preferred for a level.
+
+    Any level other than "easy", "medium" or "hard" gets the tightest range.
+    """
+    ranges_by_level = (
+        ("easy", (24, 90)),
+        ("medium", (20, 75)),
+        ("hard", (16, 60)),
+    )
+    for level, bounds in ranges_by_level:
+        if difficulty == level:
+            return bounds
+    return (14, 50)
+
+
+def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
+    """Rate how suitable ``candidate`` is as a clue for ``answer``.
+
+    The score is a sum of independent bonuses and penalties: generic filler
+    wording, leaking the answer, fit to the preferred length range,
+    directness weighed by difficulty, candidate family, style hint, use of
+    semicolons, topic relevance, register labels and excessive length.
+
+    Args:
+        candidate: Candidate to rate.
+        answer: The word being clued.
+        difficulty: Canonical level ("easy", "medium", "hard" or "expert").
+
+    Returns:
+        The total score (higher is better), or -10000 for text shorter than
+        12 characters.
+    """
+    clue_text = candidate.text
+    folded = clue_text.lower()
+
+    # Too short to be a usable clue: reject outright.
+    if not clue_text or len(clue_text) < 12:
+        return -10_000
+
+    size = len(clue_text)
+    total = 0
+
+    if any(pattern in folded for pattern in GENERIC_CLUE_PATTERNS):
+        total -= 120
+
+    # A clue that spells out its own answer is heavily penalised.
+    total += -140 if text_contains_answer(clue_text, answer) else 40
+
+    # Inside the preferred range earns a flat bonus; outside it the penalty
+    # grows with the distance (full weight when too long, half when too short).
+    shortest, longest = preferred_length_range(difficulty)
+    if size > longest:
+        total -= size - longest
+    elif size < shortest:
+        total -= (shortest - size) // 2
+    else:
+        total += 28
+
+    # Direct wording helps on the lower levels and hurts on the higher ones.
+    bluntness = directness_score(clue_text)
+    if difficulty == "easy":
+        total += 2 * bluntness
+    elif difficulty == "medium":
+        total += bluntness
+    elif difficulty == "hard":
+        total -= max(0, bluntness - 6)
+    else:
+        total -= bluntness
+
+    total += {
+        "semantic_definition": 56,
+        "semantic_gloss": 34,
+        "refined_sense": 30,
+        "babelnet_best_gloss": 18,
+        "babelnet_gloss": 10,
+        "fallback": 0,
+    }.get(candidate.family, 0)
+
+    plain_hints = {"direct", "balanced"}
+    subtle_hints = {"balanced", "oblique"}
+    welcome_hints = {
+        "easy": plain_hints,
+        "medium": plain_hints,
+        "hard": subtle_hints,
+        "expert": subtle_hints,
+    }.get(difficulty, {"balanced"})
+    if candidate.difficulty_hint in welcome_hints:
+        total += 18
+
+    if ";" in clue_text:
+        if difficulty == "easy":
+            total += 8
+        elif difficulty in {"hard", "expert"}:
+            total -= 8
+
+    if candidate.topic_score >= 40:
+        total += 18
+    elif candidate.topic_score > 0:
+        total += 8
+    elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}:
+        # BabelNet glosses with no topic relevance at all are heavily penalised.
+        total -= 140
+
+    if candidate.strong_topic:
+        total += 10
+
+    # Register labels such as "(fig.)" are penalised on the lower levels.
+    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", folded):
+        total -= 28
+
+    # Cumulative penalties for very long clues.
+    if size > 120:
+        total -= 45
+    if size > 180:
+        total -= 90
+
+    return total
+
+
+def candidate_hint(text: str, family: str) -> str:
+    """Classify a clue as "direct", "oblique" or "balanced".
+
+    Rules are tried in order: short (<= 70 chars) semantic definitions and
+    glosses are "direct"; text carrying a figurative/literary marker is
+    "oblique"; text longer than 85 characters is "direct"; everything else is
+    "balanced".
+    """
+    folded = text.lower()
+    size = len(text)
+    oblique_markers = ("fig.", "figurato", "poetico", "letterario")
+
+    is_semantic = family in {"semantic_definition", "semantic_gloss"}
+    if is_semantic and size <= 70:
+        return "direct"
+    for marker in oblique_markers:
+        if marker in folded:
+            return "oblique"
+    return "direct" if size > 85 else "balanced"
+
+
+def add_candidate(
+    candidates: List[ClueCandidate],
+    seen: set[Tuple[str, str]],
+    *,
+    text: str,
+    answer: str,
+    source: str,
+    family: str,
+    topic_score: int = 0,
+    strong_topic: bool = False,
+) -> None:
+    """Clean ``text`` and append it to ``candidates`` unless empty or duplicate.
+
+    Both ``candidates`` and ``seen`` are mutated in place. Duplicates are
+    detected per family on the lower-cased cleaned text.
+    """
+    clue_text = clean_definition(text, answer)
+    if clue_text:
+        dedupe_key = (clue_text.lower(), family)
+        if dedupe_key not in seen:
+            seen.add(dedupe_key)
+            hint = candidate_hint(clue_text, family)
+            candidate = ClueCandidate(
+                text=clue_text,
+                source=source,
+                family=family,
+                difficulty_hint=hint,
+                topic_score=topic_score,
+                strong_topic=strong_topic,
+            )
+            candidates.append(candidate)
+
+
+def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
+    """Collect clue candidates from the ``semantic`` section of a lexicon entry.
+
+    Synset definitions become "semantic_definition" candidates and the
+    section's glosses become "semantic_gloss" candidates, all with source
+    "semantic". Missing or null texts are skipped instead of being
+    stringified, so a JSON ``null`` never turns into the clue "None.".
+
+    Args:
+        entry: Lexicon entry; a non-dict ``semantic`` value yields no candidates.
+        answer: Word being clued, masked out of the candidate texts.
+
+    Returns:
+        Candidates in discovery order, de-duplicated per family.
+    """
+    semantic = entry.get("semantic", {})
+    if not isinstance(semantic, dict):
+        return []
+
+    candidates: List[ClueCandidate] = []
+    seen: set[Tuple[str, str]] = set()
+
+    def offer(raw: object, family: str) -> None:
+        # None (absent key or JSON null) carries no text; str(None) would
+        # otherwise produce the literal word "None".
+        if raw is None:
+            return
+        add_candidate(
+            candidates,
+            seen,
+            text=str(raw),
+            answer=answer,
+            source="semantic",
+            family=family,
+        )
+
+    for synset in semantic.get("synsets", []) or []:
+        if isinstance(synset, dict):
+            offer(synset.get("definition"), "semantic_definition")
+
+    for gloss in semantic.get("glosses", []) or []:
+        offer(gloss, "semantic_gloss")
+
+    return candidates
+
+
+def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
+    """Collect clue candidates from the ``babelnet`` section of a lexicon entry.
+
+    Only sections whose ``status`` is "enriched" or "ambiguous" are used.
+    Glosses of ``best_synset`` become "babelnet_best_gloss" candidates; glosses
+    of the other synsets become "babelnet_gloss" candidates. When a specific
+    topic is requested, synsets lacking a strong domain for it are skipped and
+    the remaining ones get a topic score of 40. Null glosses are ignored.
+
+    Args:
+        entry: Lexicon entry.
+        answer: Word being clued, masked out of the candidate texts.
+        topic: Requested topic; falsy or "general" disables topic filtering.
+
+    Returns:
+        Candidates in discovery order, de-duplicated per family.
+    """
+    babelnet = entry.get("babelnet", {})
+    if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}:
+        return []
+
+    candidates: List[ClueCandidate] = []
+    seen: set[Tuple[str, str]] = set()
+
+    def offer(gloss: object, family: str, topic_score: int, strong_topic: bool) -> None:
+        # A JSON null gloss would otherwise be stringified to "None".
+        if gloss is None:
+            return
+        add_candidate(
+            candidates,
+            seen,
+            text=str(gloss),
+            answer=answer,
+            source="babelnet",
+            family=family,
+            topic_score=topic_score,
+            strong_topic=strong_topic,
+        )
+
+    best_synset = babelnet.get("best_synset", {})
+    if isinstance(best_synset, dict):
+        best_score = int(best_synset.get("topic_score", 0) or 0)
+        best_is_strong = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(best_synset, topic)
+        for gloss in best_synset.get("glosses", []) or []:
+            offer(gloss, "babelnet_best_gloss", best_score, best_is_strong)
+
+    # Loop-invariant: every synset that survives the topical filter below has
+    # a strong domain, so its score and flag depend only on the topic.
+    topical = bool(topic) and topic != "general"
+    synset_score = 40 if topical else 0
+    for synset in babelnet.get("synsets", []) or []:
+        if not isinstance(synset, dict):
+            continue
+        if topical and not synset_has_strong_topic_domain(synset, topic):
+            continue
+        for gloss in synset.get("glosses", []) or []:
+            offer(gloss, "babelnet_gloss", synset_score, topical)
+
+    return candidates
+
+
+def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
+    """Collect clue candidates from the ``senses`` list of a lexicon entry.
+
+    Each sense with a definition becomes a "refined_sense" candidate. Its
+    ``confidence`` is converted to a topic score on a 0-100 scale, and a
+    confidence of at least 0.75 sets the strong-topic flag. Senses whose
+    definition is missing or null are skipped; a missing, null or empty
+    ``source`` falls back to the label "refined".
+
+    Args:
+        entry: Lexicon entry; a non-list ``senses`` value yields no candidates.
+        answer: Word being clued, masked out of the candidate texts.
+
+    Returns:
+        Candidates in discovery order, de-duplicated on cleaned text.
+    """
+    senses = entry.get("senses", [])
+    if not isinstance(senses, list):
+        return []
+
+    candidates: List[ClueCandidate] = []
+    seen: set[Tuple[str, str]] = set()
+    for sense in senses:
+        if not isinstance(sense, dict):
+            continue
+        definition = sense.get("definition")
+        if definition is None:
+            # Nothing to clue with; str(None) would yield the text "None".
+            continue
+        confidence = float(sense.get("confidence", 0.0) or 0.0)
+        add_candidate(
+            candidates,
+            seen,
+            text=str(definition),
+            answer=answer,
+            source=str(sense.get("source") or "refined"),
+            family="refined_sense",
+            topic_score=int(confidence * 100),
+            strong_topic=confidence >= 0.75,
+        )
+    return candidates
+
+
+def fallback_definition(entry: Dict[str, object], answer: str) -> str:
+    """Build a generic clue for an entry that produced no usable candidate.
+
+    Args:
+        entry: Lexicon entry. ``pos`` and ``topics`` are optional and may be
+            null; a missing ``pos`` is rendered as "lessicale".
+        answer: Unused; kept for signature compatibility with callers.
+
+    Returns:
+        A sentence naming the entry's topics (falsy topics and "general" are
+        left out), or the generic crossings hint when no topic remains.
+    """
+    # ``or`` guards cover explicit nulls as well as absent keys: a null
+    # ``topics`` is not iterable and a null ``pos`` would print as "none".
+    pos = str(entry.get("pos") or "").lower()
+    raw_topics = entry.get("topics") or []
+    topics = ", ".join(
+        str(topic) for topic in raw_topics if topic and str(topic).lower() != "general"
+    )
+    if topics:
+        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
+    return "Termine da ricavare dagli incroci."
+
+
+def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
+    """Gather every clue candidate available for ``entry``.
+
+    The result lists semantic candidates first, then refined senses, then
+    BabelNet glosses; only the BabelNet step takes ``topic`` into account.
+    """
+    return [
+        *semantic_candidates(entry, answer),
+        *refined_sense_candidates(entry, answer),
+        *babelnet_candidates(entry, answer, topic),
+    ]
+
+
+def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
+    """Pick the best-ranked candidate, or None when there is none.
+
+    Candidates are ranked by score, then topic score, then text length
+    (longer first); on a full tie the earliest candidate wins.
+    """
+
+    def rank(candidate: ClueCandidate) -> Tuple[int, int, int]:
+        return (
+            score_candidate(candidate, answer, difficulty),
+            candidate.topic_score,
+            len(candidate.text),
+        )
+
+    # max() keeps the first of several equal maxima, matching a stable
+    # descending sort followed by taking the head.
+    return max(candidates, key=rank, default=None)
+
+
+def definition_for_word(
+    word: str,
+    entries: Dict[str, Dict[str, object]],
+    topic: Optional[str] = None,
+    difficulty: Optional[str] = None,
+) -> tuple[str, str]:
+    """Return the clue text and its source label for ``word``.
+
+    Args:
+        word: Word to clue; looked up in ``entries`` by its lower-cased form.
+        entries: Lexicon entries keyed by lower-cased form.
+        topic: Optional topic forwarded to candidate collection.
+        difficulty: Optional difficulty spelling, normalised before scoring.
+
+    Returns:
+        ``(text, source)``. The source is "fallback" when the word has no
+        (non-empty) entry or when its entry yields no candidate.
+    """
+    answer = word.lower()
+    entry = entries.get(answer, {})
+    if entry:
+        level = normalize_difficulty(difficulty)
+        winner = choose_candidate(all_candidates(entry, answer, topic), answer, level)
+        if winner:
+            return winner.text, winner.source
+        return fallback_definition(entry, answer), "fallback"
+
+    return "Termine da ricavare dagli incroci.", "fallback"
+
+
+def generate_clues(
+    placements: Iterable[Placement],
+    entries: Dict[str, Dict[str, object]],
+    topic: Optional[str] = None,
+    difficulty: Optional[str] = None,
+) -> List[Clue]:
+    """Build one clue per placement, numbered from 1 in iteration order.
+
+    Args:
+        placements: Placed words; ``word``, ``direction``, ``x`` and ``y`` are read.
+        entries: Lexicon entries keyed by lower-cased form.
+        topic: Optional topic forwarded to ``definition_for_word``.
+        difficulty: Optional difficulty forwarded to ``definition_for_word``.
+
+    Returns:
+        The clues, in the same order as ``placements``.
+    """
+
+    def clue_for(number: int, placement: Placement) -> Clue:
+        text, source = definition_for_word(placement.word, entries, topic, difficulty)
+        label = "verticale"
+        if placement.direction == HORIZONTAL:
+            label = "orizzontale"
+        return Clue(
+            number=number,
+            word=placement.word,
+            direction=label,
+            x=placement.x,
+            y=placement.y,
+            text=text,
+            source=source,
+        )
+
+    return [clue_for(number, placement) for number, placement in enumerate(placements, start=1)]