alpha01 backoffice: crossword engine, lexicon curation and JSON contract

2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions
--- a/curate_lexicon_alpha.py
+++ b/curate_lexicon_alpha.py
@@ -0,0 +1,611 @@
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
+
+
+# Default output locations: both files are created next to this script.
+CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
+TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")
+
+# Difficulty levels for which curate_entry selects a clue, in processing order.
+DIFFICULTIES = ("easy", "medium", "hard", "expert")
+
+# Literal text repairs used by normalize_text, which walks this mapping in
+# insertion order (so the order of the keys matters).
+TEXT_REPLACEMENTS = {
+    " ngrandimento": " ingrandimento",
+    "superificie": "superficie",
+    "quantitaaa": "quantità",
+    "quantitaaaa": "quantità",
+    "quantit": "quantità",
+    "sanit_militare": "sanità_militare",
+    " unaparola ": " una parola ",
+    "questa parola, ": "",
+    "questa parola; ": "",
+}
+
+# Regexes searched by suspicious_proper_noun_definition in accent-folded,
+# lower-cased text -- hence the unaccented spelling "citta".
+SUSPICIOUS_PROPER_PATTERNS = (
+    r"\bepisodio\b",
+    r"\bfilm\b",
+    r"\bserie tv\b",
+    r"\bfamiglia\b",
+    r"\bcomune italiano\b",
+    r"\bfrazione del comune\b",
+    r"\bcitta metropolitana\b",
+    r"\bpersonaggio\b",
+    r"\balbum\b",
+    r"\bcognome\b",
+    r"\bnome proprio\b",
+)
+
+# Topic name -> substrings that topic_alignment_score looks for in the
+# accent-folded, lower-cased definition text.
+DOMAIN_HINTS = {
+    "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"),
+    "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"),
+    "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"),
+    "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"),
+    "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"),
+    "sea": ("acque", "salate", "superficie terrestre", "oceano"),
+}
+
+# Regexes searched by likely_abstract_detour in accent-folded, lower-cased text.
+ABSTRACT_PATTERNS = (
+    r"\bgrande quantita\b",
+    r"\bfigurato\b",
+    r"\bsenso figurato\b",
+)
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line parser and return the parsed options.
+
+    Returns:
+        Namespace with ``input``, ``output`` and ``review_output`` (paths) and
+        ``max_review`` (int).
+    """
+    cli = argparse.ArgumentParser(
+        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
+    )
+    # The three path options differ only by flag, default and help text.
+    path_options = (
+        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
+        ("--output", CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
+        (
+            "--review-output",
+            TO_BE_REVIEW_OUTPUT_PATH,
+            "File JSON con le voci che richiedono revisione umana.",
+        ),
+    )
+    for flag, default_path, help_text in path_options:
+        cli.add_argument(flag, type=Path, default=default_path, help=help_text)
+    cli.add_argument(
+        "--max-review",
+        type=int,
+        default=0,
+        help="Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
+    )
+    return cli.parse_args()
+
+
+def load_json(path: Path) -> Dict[str, object]:
+    """Read the UTF-8 file at *path* and return its decoded JSON content."""
+    with path.open(encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def write_json(path: Path, payload: Dict[str, object]) -> None:
+    """Write *payload* to *path* as UTF-8 JSON, indented by two spaces.
+
+    Non-ASCII characters are written as-is rather than escaped.
+    """
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    with path.open("w", encoding="utf-8") as handle:
+        handle.write(serialized)
+
+
+def dedupe(items: Iterable[str]) -> List[str]:
+    """Return the stripped, non-empty items without case-insensitive repeats.
+
+    The first spelling of each value wins and the original order is kept.
+    """
+    first_spelling: Dict[str, str] = {}
+    for raw in items:
+        stripped = str(raw).strip()
+        if stripped:
+            # setdefault leaves an already recorded spelling untouched.
+            first_spelling.setdefault(stripped.lower(), stripped)
+    return list(first_spelling.values())
+
+
+def ascii_fold(text: str) -> str:
+    """Return *text* with the diacritics removed from its letters.
+
+    Case is preserved. Upper-case accented letters ("À"), accents stored as
+    separate combining characters, and diacritics beyond the Italian vowels
+    are all folded, so the result can be matched against unaccented patterns
+    whatever the casing or Unicode normalization of the input.
+
+    Args:
+        text: Value to fold; it is converted with ``str()`` first.
+
+    Returns:
+        The folded string.
+    """
+    # Function-scope import keeps this block self-contained.
+    import unicodedata
+
+    # NFD splits every accented letter into base letter + combining marks;
+    # dropping the marks leaves the bare letter.
+    decomposed = unicodedata.normalize("NFD", str(text))
+    stripped = "".join(char for char in decomposed if not unicodedata.combining(char))
+    # Recompose whatever NFD split without involving a combining mark.
+    return unicodedata.normalize("NFC", stripped)
+
+
+def normalize_text(text: str) -> str:
+    """Clean one definition fragment into a capitalised sentence ending in ".".
+
+    Steps: apply TEXT_REPLACEMENTS, collapse whitespace, normalise spacing
+    around ";" and ",", trim surrounding punctuation, upper-case the first
+    letter and append a full stop.
+
+    Args:
+        text: Raw fragment; falsy values are treated as empty.
+
+    Returns:
+        The cleaned sentence, or "" when nothing but whitespace/punctuation
+        is left.
+    """
+    value = str(text or "").strip()
+    if not value:
+        return ""
+    for old, new in TEXT_REPLACEMENTS.items():
+        if not old:
+            # An empty key would match between every pair of characters.
+            continue
+        # Keys that begin/end with a word character must match on a word
+        # boundary there. A bare substring match lets "quantit" rewrite the
+        # already correct "quantità" into "quantitàà" (and "quantitativo"
+        # into "quantitàativo").
+        left_guard = r"(?<!\w)" if re.match(r"\w", old[0]) else ""
+        right_guard = r"(?!\w)" if re.match(r"\w", old[-1]) else ""
+        # A callable replacement keeps backslashes in `new` literal.
+        value = re.sub(
+            left_guard + re.escape(old) + right_guard,
+            lambda _match, fixed=new: fixed,
+            value,
+        )
+    value = re.sub(r"\s+", " ", value)
+    value = re.sub(r"\s*;\s*", "; ", value)
+    value = re.sub(r"\s*,\s*", ", ", value)
+    value = value.strip(" .;:-")
+    if not value:
+        # Punctuation-only input: do not fabricate the definition ".".
+        return ""
+    if value[0].islower():
+        value = value[0].upper() + value[1:]
+    return value + "."
+
+
+def split_definition_text(text: str) -> List[str]:
+    """Split a raw definition into normalised fragments.
+
+    The text is cut at ";" followed by whitespace and at ". " followed by a
+    letter; each piece goes through normalize_text and empty results are
+    dropped.
+    """
+    raw = str(text or "").strip()
+    if not raw:
+        return []
+    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", raw, flags=re.IGNORECASE)
+    cleaned_fragments = (normalize_text(fragment) for fragment in fragments)
+    return [fragment for fragment in cleaned_fragments if fragment]
+
+
+def entry_is_common_word(entry: Dict[str, object]) -> bool:
+    """Tell whether *entry* is a lower-case form without any name tags."""
+    form = str(entry.get("form", ""))
+    if not form or not form[0].islower():
+        return False
+    return not entry.get("name_tags")
+
+
+def definition_mentions_answer(text: str, answer: str) -> bool:
+    """Tell whether *answer* occurs inside *text*.
+
+    Both strings are accent-folded and lower-cased first, and the answer is
+    looked up as a plain substring.
+
+    Args:
+        text: Candidate definition.
+        answer: Solution word of the entry.
+
+    Returns:
+        True when the folded answer appears in the folded text. A blank
+        answer never counts as mentioned.
+    """
+    normalized_answer = ascii_fold(answer).lower()
+    if not normalized_answer.strip():
+        # An empty needle is contained in every string, which would mark
+        # every definition of a form-less entry as giving the answer away.
+        return False
+    return normalized_answer in ascii_fold(text).lower()
+
+
+def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
+    """Tell whether a common word is defined with proper-noun wording.
+
+    Entries that are not common words are never reported. For the others the
+    folded, lower-cased text is tested against SUSPICIOUS_PROPER_PATTERNS.
+    """
+    if entry_is_common_word(entry):
+        folded = ascii_fold(text).lower()
+        for pattern in SUSPICIOUS_PROPER_PATTERNS:
+            if re.search(pattern, folded):
+                return True
+    return False
+
+
+def likely_abstract_detour(text: str) -> bool:
+    """Tell whether the folded, lower-cased text matches an ABSTRACT_PATTERNS regex."""
+    folded = ascii_fold(text).lower()
+    for pattern in ABSTRACT_PATTERNS:
+        if re.search(pattern, folded) is not None:
+            return True
+    return False
+
+
+def semantic_topics(entry: Dict[str, object]) -> List[str]:
+    """Collect the lower-cased topics found in the enrichment sections.
+
+    Reads ``semantic.semantic_topics`` first and ``wiktextract.topic_hints``
+    second, skipping a section that is not a dict, and returns the values
+    de-duplicated in that order.
+    """
+    collected: List[str] = []
+    for section_name, field_name in (
+        ("semantic", "semantic_topics"),
+        ("wiktextract", "topic_hints"),
+    ):
+        section = entry.get(section_name, {})
+        if not isinstance(section, dict):
+            continue
+        for item in section.get(field_name, []) or []:
+            collected.append(str(item).lower())
+    return dedupe(collected)
+
+
+def lexical_topics(entry: Dict[str, object]) -> List[str]:
+    """Return the truthy values of ``entry["topics"]`` as lower-case strings."""
+    lowered: List[str] = []
+    for item in entry.get("topics", []) or []:
+        if item:
+            lowered.append(str(item).lower())
+    return lowered
+
+
+def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
+    """Score how well *text* fits the topics of *entry*.
+
+    Every DOMAIN_HINTS substring of every entry topic (lexical or semantic)
+    that occurs in the folded, lower-cased text is worth 16 points.
+    """
+    folded = ascii_fold(text).lower()
+    entry_topics = {*lexical_topics(entry), *semantic_topics(entry)}
+    hits = sum(
+        1
+        for topic in entry_topics
+        for hint in DOMAIN_HINTS.get(topic, ())
+        if hint in folded
+    )
+    return 16 * hits
+
+
+def candidate_style(text: str) -> str:
+    """Classify a definition as "direct", "balanced" or "oblique".
+
+    Text containing ";" or longer than 90 characters is "direct"; otherwise
+    text containing one of the marker substrings is "balanced"; anything else
+    is "oblique".
+    """
+    folded = ascii_fold(text).lower()
+    if ";" in text or len(text) > 90:
+        return "direct"
+    balanced_markers = ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")
+    for marker in balanced_markers:
+        if marker in folded:
+            return "balanced"
+    return "oblique"
+
+
+def length_window(difficulty: str) -> Tuple[int, int]:
+    """Return the ``(min, max)`` preferred clue length for *difficulty*.
+
+    Any value other than "easy", "medium" or "hard" gets the narrowest
+    window, ``(12, 55)``.
+    """
+    windows = {
+        "easy": (18, 90),
+        "medium": (18, 78),
+        "hard": (14, 62),
+    }
+    return windows.get(difficulty, (12, 55))
+
+
+def build_candidate(
+    text: str,
+    *,
+    source: str,
+    family: str,
+    confidence: float,
+    priority: int = 0,
+) -> Dict[str, object]:
+    """Wrap a definition fragment into a candidate record.
+
+    The text is normalised with normalize_text and its style is derived from
+    the normalised form; the remaining fields are stored as given.
+    """
+    normalized = normalize_text(text)
+    candidate: Dict[str, object] = {
+        "text": normalized,
+        "source": source,
+        "family": family,
+        "confidence": confidence,
+    }
+    candidate["style"] = candidate_style(normalized)
+    candidate["priority"] = priority
+    return candidate
+
+
+def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
+    """Gather every candidate definition available for *entry*.
+
+    Sources are visited in a fixed order: semantic synset definitions,
+    semantic glosses, refined senses, BabelNet best-synset glosses and
+    Wiktextract definitions. Each raw text is split into fragments and a
+    fragment is kept once per ``(lower-cased text, family)`` pair.
+
+    Args:
+        entry: Lexicon entry; sections that are missing or not dicts are
+            skipped.
+
+    Returns:
+        Candidate records (see build_candidate) in discovery order.
+    """
+    candidates: List[Dict[str, object]] = []
+    seen = set()
+
+    def _register(raw_text, *, source, family, confidence, priority) -> None:
+        # Split one raw text and append every fragment not seen before.
+        # A JSON null must count as "no text": str(None) would otherwise be
+        # normalised into the literal candidate "None.".
+        raw = "" if raw_text is None else str(raw_text)
+        for piece in split_definition_text(raw):
+            candidate = build_candidate(
+                piece,
+                source=source,
+                family=family,
+                # Converted here so a malformed confidence only matters for
+                # senses that actually produce a fragment.
+                confidence=float(confidence),
+                priority=priority,
+            )
+            key = (candidate["text"].lower(), candidate["family"])
+            if candidate["text"] and key not in seen:
+                seen.add(key)
+                candidates.append(candidate)
+
+    semantic = entry.get("semantic", {})
+    if isinstance(semantic, dict):
+        for index, synset in enumerate(semantic.get("synsets", []) or []):
+            if not isinstance(synset, dict):
+                continue
+            _register(
+                synset.get("definition"),
+                source="semantic",
+                family="semantic_definition",
+                confidence=0.9,
+                priority=max(0, 100 - index * 12),
+            )
+        for index, gloss in enumerate(semantic.get("glosses", []) or []):
+            _register(
+                gloss,
+                source="semantic_gloss",
+                family="semantic_gloss",
+                confidence=0.8,
+                priority=max(0, 90 - index * 10),
+            )
+
+    for index, sense in enumerate(entry.get("senses", []) or []):
+        if not isinstance(sense, dict):
+            continue
+        # A null or empty source falls back to "refined" instead of "None".
+        sense_source = str(sense.get("source") or "refined")
+        _register(
+            sense.get("definition"),
+            source="refined" if sense_source == "semantic" else sense_source,
+            family="refined_sense",
+            confidence=sense.get("confidence", 0.7) or 0.7,
+            priority=max(0, 80 - index * 8),
+        )
+
+    babelnet = entry.get("babelnet", {})
+    if isinstance(babelnet, dict):
+        best_synset = babelnet.get("best_synset", {})
+        if isinstance(best_synset, dict):
+            babelnet_confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
+            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
+                _register(
+                    gloss,
+                    source="babelnet",
+                    family="babelnet_gloss",
+                    confidence=babelnet_confidence,
+                    priority=max(0, 60 - index * 8),
+                )
+
+    wiktextract = entry.get("wiktextract", {})
+    if isinstance(wiktextract, dict):
+        wiktextract_confidence = 0.78 if wiktextract.get("matched") else 0.45
+        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
+            _register(
+                definition,
+                source="wiktextract",
+                family="wiktextract_definition",
+                confidence=wiktextract_confidence,
+                priority=max(0, 88 - index * 9),
+            )
+
+    return candidates
+
+
+def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
+    """Score how suitable *candidate* is as a clue for *entry* at *difficulty*.
+
+    The score is a sum of independent bonuses and penalties: answer leakage,
+    proper-noun and abstract wording, length window, source and family
+    weights, candidate priority and confidence, topic alignment, style fit
+    and the entry's review flag. Texts shorter than 12 characters are
+    rejected with -10000.
+    """
+    text = str(candidate["text"])
+    answer = str(entry.get("form", "")).lower()
+    source = str(candidate.get("source"))
+    family = str(candidate.get("family"))
+    confidence = float(candidate.get("confidence", 0.0) or 0.0)
+    text_length = len(text)
+
+    if text_length < 12:
+        return -10_000
+
+    # A clue that contains its own answer is heavily penalised.
+    total = -140 if definition_mentions_answer(text, answer) else 30
+
+    if suspicious_proper_noun_definition(text, entry):
+        total -= 220
+    if likely_abstract_detour(text):
+        total -= 80
+
+    shortest, longest = length_window(difficulty)
+    if text_length > longest:
+        # Overlong text loses one point per extra character...
+        total -= text_length - longest
+    elif text_length < shortest:
+        # ...while text that is too short loses half a point per missing one.
+        total -= (shortest - text_length) // 2
+    else:
+        total += 24
+
+    source_weights = {
+        "semantic": 55,
+        "semantic_gloss": 40,
+        "babelnet": 24,
+        "refined": 30,
+        "wiktextract": 52,
+    }
+    family_weights = {
+        "semantic_definition": 30,
+        "semantic_gloss": 18,
+        "babelnet_gloss": 8,
+        "refined_sense": 22,
+        "wiktextract_definition": 28,
+    }
+    total += source_weights.get(source, 10)
+    total += family_weights.get(family, 0)
+    total += int(candidate.get("priority", 0) or 0)
+    total += int(confidence * 35)
+
+    alignment = topic_alignment_score(text, entry)
+    total += alignment
+    entry_topics = {*lexical_topics(entry), *semantic_topics(entry)}
+    concrete_domains = {"religion", "transport", "health", "nature", "geography", "sea"}
+    # An entry with a concrete topic whose clue shares none of the topic
+    # hints is penalised.
+    if alignment == 0 and entry_topics & concrete_domains:
+        total -= 45
+
+    style = str(candidate.get("style"))
+    style_bonus = {
+        ("easy", "direct"): 16,
+        ("medium", "direct"): 14,
+        ("medium", "balanced"): 14,
+        ("hard", "balanced"): 10,
+        ("expert", "oblique"): 10,
+    }
+    total += style_bonus.get((difficulty, style), 0)
+
+    if difficulty in ("easy", "medium") and re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
+        total -= 30
+    if difficulty in ("hard", "expert") and ";" in text:
+        total -= 10
+
+    if entry.get("needs_review"):
+        total -= 8
+
+    return total
+
+
+def choose_best_candidate(
+    candidates: Sequence[Dict[str, object]],
+    entry: Dict[str, object],
+    difficulty: str,
+) -> Optional[Dict[str, object]]:
+    """Return the top-ranked candidate for *difficulty*, or None if there is none.
+
+    Candidates are ranked by score, then confidence, then priority, then
+    shorter text; among complete ties the earliest candidate wins.
+    """
+
+    def ranking_key(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
+        return (
+            score_candidate(candidate, entry, difficulty),
+            float(candidate.get("confidence", 0.0)),
+            float(candidate.get("priority", 0.0)),
+            -len(str(candidate.get("text", ""))),
+        )
+
+    # max() yields the first maximal item, which is exactly the head of a
+    # stable descending sort.
+    return max(candidates, key=ranking_key, default=None)
+
+
+def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
+    """List the reasons why *entry* should be looked at by a human.
+
+    Args:
+        entry: Entry whose ``preferred_definition`` / ``preferred_source``
+            fields are read together with the enrichment sections.
+        candidates: Candidate definitions collected for the entry.
+
+    Returns:
+        Reason codes in check order without repeats; empty when nothing was
+        raised.
+    """
+    reasons: List[str] = []
+    form = str(entry.get("form", ""))
+    lowered_topics = set(lexical_topics(entry))
+    semantic_topic_set = set(semantic_topics(entry))
+    # Sections that are null or not dicts are read as "no status" instead of
+    # raising AttributeError.
+    babelnet = entry.get("babelnet")
+    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""
+    wiktextract = entry.get("wiktextract", {})
+    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
+    preferred_definition = str(entry.get("preferred_definition", ""))
+    preferred_source = str(entry.get("preferred_source", ""))
+    # "senses": null must behave like an empty list.
+    senses = [sense for sense in (entry.get("senses") or []) if isinstance(sense, dict)]
+
+    if not candidates:
+        reasons.append("no_viable_definition")
+    if not preferred_definition and entry.get("needs_review"):
+        reasons.append("flagged_by_refined_stage")
+    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
+        reasons.append("proper_noun_collision")
+    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
+        reasons.append("babelnet_ambiguous")
+    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
+        reasons.append("wiktextract_missing")
+    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
+        reasons.append("only_general_topics")
+    if len(form) <= 2:
+        reasons.append("very_short_word")
+    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
+        reasons.append("function_word")
+    # The check runs on the string form of "topics", so a missing/None value
+    # and a None item inside the list are both caught.
+    if preferred_source == "babelnet" and any("None" in str(sense.get("topics")) for sense in senses):
+        reasons.append("unresolved_sense_topics")
+    # A blank form cannot be "mentioned": an empty needle matches any text.
+    if preferred_definition and form.strip() and definition_mentions_answer(preferred_definition, form.lower()):
+        reasons.append("candidate_mentions_answer")
+
+    return dedupe(reasons)
+
+
+def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
+    """Curate one lexicon entry and decide whether it needs human review.
+
+    The entry is deep-copied, so the caller's dict is left untouched. For
+    each level in DIFFICULTIES the best candidate definition is recorded; the
+    "medium" clue (or else the "easy" one) becomes the preferred definition.
+    An entry is ``alpha_ready`` when it has a preferred definition and none
+    of the severe review reasons.
+
+    Args:
+        entry: Refined lexicon entry.
+
+    Returns:
+        ``(curated, review_item)``: the enriched copy of the entry, and the
+        summary destined for the review file or ``None`` when no review
+        reason was raised.
+    """
+    curated = deepcopy(entry)
+    candidates = collect_candidates(curated)
+
+    def _section(name: str) -> Dict[str, object]:
+        # Return a dict section of the entry, or {} when it is null or has
+        # another type, so the review summary never fails on `.get`.
+        value = curated.get(name)
+        return value if isinstance(value, dict) else {}
+
+    clue_definitions: Dict[str, str] = {}
+    clue_sources: Dict[str, str] = {}
+    clue_scores: Dict[str, int] = {}
+    curation_notes: List[str] = []
+
+    for difficulty in DIFFICULTIES:
+        best = choose_best_candidate(candidates, curated, difficulty)
+        if best:
+            clue_definitions[difficulty] = str(best["text"])
+            clue_sources[difficulty] = str(best["source"])
+            clue_scores[difficulty] = score_candidate(best, curated, difficulty)
+
+    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
+    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"
+
+    if preferred_definition:
+        curation_notes.append(f"preferred_from={preferred_source}")
+    if clue_scores.get("medium", -9999) < 20:
+        curation_notes.append("weak_medium_definition")
+
+    curated["curated_glosses"] = dedupe(candidate["text"] for candidate in candidates)
+    curated["curated_senses"] = [
+        {
+            "definition": candidate["text"],
+            "source": candidate["source"],
+            "family": candidate["family"],
+            "confidence": candidate["confidence"],
+            "priority": candidate["priority"],
+        }
+        for candidate in candidates
+    ]
+    curated["preferred_definition"] = preferred_definition
+    curated["preferred_source"] = preferred_source
+    curated["clue_definitions"] = clue_definitions
+    curated["clue_sources"] = clue_sources
+    curated["clue_scores"] = clue_scores
+    curated["curation_notes"] = curation_notes
+
+    # The reasons are computed on the curated copy, i.e. against the
+    # preferred definition chosen above.
+    reasons = review_reasons(curated, candidates)
+    severe = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
+    alpha_ready = bool(preferred_definition) and not severe.intersection(reasons)
+    curated["alpha_ready"] = alpha_ready
+    curated["review_reasons"] = reasons
+
+    review_item = None
+    if reasons:
+        babelnet = _section("babelnet")
+        review_item = {
+            "form": curated.get("form"),
+            "lemma": curated.get("lemma"),
+            "pos": curated.get("pos"),
+            "topics": curated.get("topics"),
+            "topic_suggestions": curated.get("topic_suggestions"),
+            "preferred_definition": preferred_definition,
+            "preferred_source": preferred_source,
+            "clue_definitions": clue_definitions,
+            "review_reasons": reasons,
+            "semantic_glosses": _section("semantic").get("glosses", []),
+            "senses": curated.get("senses", []),
+            "babelnet_status": babelnet.get("status"),
+            "babelnet_best_synset": babelnet.get("best_synset"),
+            "wiktextract_status": _section("wiktextract").get("status"),
+            "wiktextract": curated.get("wiktextract"),
+            # Only the first 12 candidates are exported for the reviewer.
+            "candidate_pool": [
+                {
+                    "text": candidate["text"],
+                    "source": candidate["source"],
+                    "family": candidate["family"],
+                    "confidence": candidate["confidence"],
+                    "priority": candidate["priority"],
+                }
+                for candidate in candidates[:12]
+            ],
+        }
+
+    return curated, review_item
+
+
+def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
+    """Curate every entry of the input lexicon and build both output payloads.
+
+    Args:
+        args: Parsed options; ``input`` (path of the refined lexicon) and
+            ``max_review`` (cap on exported review items, ignored unless
+            positive) are read.
+
+    Returns:
+        ``(curated_payload, review_payload)``, each a dict with ``meta`` and
+        ``entries``.
+
+    Raises:
+        ValueError: If the input is not a JSON object with an ``entries`` key.
+    """
+    payload = load_json(args.input)
+    if not isinstance(payload, dict) or "entries" not in payload:
+        raise ValueError(f"Lessico refined non valido: {args.input}")
+
+    curated_entries: List[Dict[str, object]] = []
+    review_entries: List[Dict[str, object]] = []
+
+    for entry in payload.get("entries", []) or []:
+        if not isinstance(entry, dict):
+            continue
+        curated, review_item = curate_entry(entry)
+        curated_entries.append(curated)
+        if review_item:
+            review_entries.append(review_item)
+
+    # Counted before truncation: the curated meta describes the whole
+    # lexicon, like entry_count and alpha_ready_count, not just the export.
+    flagged_total = len(review_entries)
+    if args.max_review > 0:
+        review_entries = review_entries[: args.max_review]
+
+    # One timestamp for both payloads, so the two files of a run always agree.
+    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")
+
+    curated_payload = {
+        "meta": {
+            "language": "it",
+            "version": 1,
+            "base_lexicon": args.input.name,
+            "generated_at": generated_at,
+            "entry_count": len(curated_entries),
+            "alpha_ready_count": sum(1 for item in curated_entries if item.get("alpha_ready")),
+            "review_count": flagged_total,
+        },
+        "entries": curated_entries,
+    }
+
+    review_payload = {
+        "meta": {
+            "language": "it",
+            "version": 1,
+            "base_lexicon": args.input.name,
+            "generated_at": generated_at,
+            "entry_count": len(review_entries),
+        },
+        "entries": review_entries,
+    }
+
+    return curated_payload, review_payload
+
+
+def main() -> None:
+    """Run the curation from the command line and print a short summary.
+
+    Writes the curated lexicon and the review file to the configured paths.
+    """
+    options = parse_args()
+    curated_payload, review_payload = build_curated_lexicon(options)
+    outputs = (
+        (options.output, curated_payload),
+        (options.review_output, review_payload),
+    )
+    for destination, document in outputs:
+        write_json(destination, document)
+
+    curated_meta = curated_payload["meta"]
+    summary = [
+        f"Lessico curated generato: {options.output}",
+        f"Voci totali: {curated_meta['entry_count']}",
+        f"Voci alpha_ready: {curated_meta['alpha_ready_count']}",
+        f"Voci da revisionare: {review_payload['meta']['entry_count']}",
+        f"File review generato: {options.review_output}",
+    ]
+    print("\n".join(summary))
+
+
+# Run the curation only when this file is executed as a script.
+if __name__ == "__main__":
+    main()