alpha01 backoffice: crossword engine, lexicon curation and JSON contract

2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions
--- a/babelnet_incremental_enricher.py
+++ b/babelnet_incremental_enricher.py
@@ -0,0 +1,583 @@
+from __future__ import annotations
+
+import argparse
+import os
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Dict, Iterable, List, Optional, Tuple
+
+from build_babelnet_enrichment import (
+    BABELNET_CACHE_PATH,
+    BABELNET_ENV_KEY,
+    BABELNET_OUTPUT_PATH,
+    POS_TO_BABELNET,
+    enrich_entry,
+    load_json,
+    write_json,
+)
+from build_enriched_lexicon import (
+    ENRICHED_LEXICON_OUTPUT_PATH,
+    build_enriched_lexicon,
+    write_json as write_enriched_json,
+)
+from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
+
+
+# Named difficulty levels accepted by --difficulty, mapped onto the numeric 1-5 scale.
+# No alias maps to level 3: that level can only be requested numerically.
+DIFFICULTY_ALIASES: Dict[str, int] = {
+    "easy": 1,
+    "medium": 2,
+    "hard": 4,
+    "expert": 5,
+}
+# Topic used when none is given; topic-specific filtering and scoring are skipped for it.
+DEFAULT_TOPIC = "general"
+# Word endings that are penalised or rejected for the topics listed in CONCRETE_TOPICS.
+ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
+# Parts of speech a candidate must have (it must also be a key of POS_TO_BABELNET).
+FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
+# Minimum quality_score and maximum word length for candidates that are not
+# explicitly tagged with the requested topic (see select_candidates).
+GENERAL_FILL_MIN_QUALITY = 6
+GENERAL_FILL_MAX_LENGTH = 10
+# Cap on how many of those short, untagged candidates are kept before the final ranking.
+SOFT_RELATED_FILL_LIMIT = 120
+# Topics for which words ending in ABSTRACTISH_SUFFIXES are rejected/penalised and
+# for which a match on the semantic topics alone is not enough to be eligible
+# (see eligible_for_babelnet and topic_score).
+CONCRETE_TOPICS = {
+    "animals",
+    "plants",
+    "nature",
+    "ecology",
+    "geography",
+    "weather",
+    "sea",
+    "mountain",
+    "health",
+    "science",
+    "sport",
+    "history",
+    "school",
+    "cinema",
+    "literature",
+    "food",
+    "city",
+    "transport",
+    "work",
+    "home",
+}
+
+# Per-topic substrings: a word containing any of them (anywhere, not only as a
+# prefix) earns the root bonus in topic_score. Topics without an entry here
+# always receive that bonus (see matches_topic_roots).
+TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
+    "transport": (
+        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
+        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
+        "trattor", "carr", "vap", "rota", "ruot",
+    ),
+    "animals": (
+        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
+        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
+    ),
+    "nature": (
+        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
+        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
+        "litoral", "piogg", "nev", "onda", "clim",
+    ),
+    "cinema": (
+        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
+        "comic", "div", "docu", "pellic", "spettacol",
+    ),
+}
+
+# Per-topic substrings: a word containing any of them receives a heavy penalty
+# in topic_score (see is_blocked_for_topic).
+TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
+    "transport": (
+        "intervist", "intratten", "speriment", "stermin", "investig",
+        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
+        "eccit", "traduz", "fluttu", "sollecit",
+    ),
+    "animals": (
+        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
+        "estens", "diffus", "difensor", "spessor", "maggior",
+    ),
+    "cinema": (
+        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
+        "malumor", "eversor",
+    ),
+}
+
+# BabelNet statuses for which an entry may be (re)requested. "no_match" is added
+# at call time only when --retry-no-match is set.
+ENRICHABLE_STATUSES = {"not_requested", "api_error"}
+
+# Per-topic word prefixes that make an entry eligible for the topic even when the
+# topic is not among the entry's own topics (see matches_safe_babelnet_roots).
+# Topics without an entry here never match by prefix.
+BABELNET_TOPIC_SAFE_PREFIXES: Dict[str, Tuple[str, ...]] = {
+    "transport": (
+        "ambul",
+        "aer",
+        "autobus",
+        "autocar",
+        "automob",
+        "autostrad",
+        "autoveic",
+        "autovett",
+        "bicicl",
+        "ciclo",
+        "imbarc",
+        "locom",
+        "motoc",
+        "motr",
+        "navig",
+        "rimorch",
+        "trattor",
+        "tren",
+        "veicol",
+        "vettur",
+    ),
+}
+
+
+def parse_difficulty(value: str) -> int:
+    """Translate a ``--difficulty`` value into a numeric level.
+
+    The value is stringified, stripped and lower-cased, then resolved either
+    through ``DIFFICULTY_ALIASES`` or as an integer between 1 and 5.
+
+    Raises:
+        SystemExit: if the value is neither a known alias nor an integer, or
+            if the integer falls outside the 1-5 range.
+    """
+    normalized = str(value).strip().lower()
+
+    alias_level = DIFFICULTY_ALIASES.get(normalized)
+    if alias_level is not None:
+        return alias_level
+
+    try:
+        numeric_level = int(normalized)
+    except ValueError as exc:
+        raise SystemExit(
+            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
+        ) from exc
+
+    if numeric_level < 1 or numeric_level > 5:
+        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
+    return numeric_level
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the incremental enricher.
+
+    Returns:
+        The ``argparse`` namespace with the attributes ``api_key``, ``topic``,
+        ``difficulty``, ``limit``, ``sleep``, ``semantic``, ``babelnet``,
+        ``enriched``, ``dry_run``, ``retry_no_match`` and ``words``.
+    """
+    # Options are declared as (flag, settings) rows and registered in order,
+    # which also determines the order shown by --help.
+    option_table = (
+        (
+            "--api-key",
+            {
+                "default": os.environ.get(BABELNET_ENV_KEY),
+                "help": f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
+            },
+        ),
+        (
+            "--topic",
+            {
+                "default": DEFAULT_TOPIC,
+                "help": "Topic per cui scegliere le prossime parole da arricchire.",
+            },
+        ),
+        (
+            "--difficulty",
+            {
+                "default": "medium",
+                "help": "Difficolta massima: easy, medium, hard, expert oppure 1-5.",
+            },
+        ),
+        (
+            "--limit",
+            {
+                "type": int,
+                "default": 50,
+                "help": "Numero massimo di parole da arricchire in questa esecuzione.",
+            },
+        ),
+        (
+            "--sleep",
+            {
+                "type": float,
+                "default": 0.2,
+                "help": "Pausa tra richieste API.",
+            },
+        ),
+        (
+            "--semantic",
+            {
+                "type": Path,
+                "default": SEMANTIC_LEXICON_OUTPUT_PATH,
+                "help": "Lessico semantico completo di partenza.",
+            },
+        ),
+        (
+            "--babelnet",
+            {
+                "type": Path,
+                "default": BABELNET_OUTPUT_PATH,
+                "help": "Archivio degli arricchimenti BabelNet parziali.",
+            },
+        ),
+        (
+            "--enriched",
+            {
+                "type": Path,
+                "default": ENRICHED_LEXICON_OUTPUT_PATH,
+                "help": "Lessico arricchito da aggiornare.",
+            },
+        ),
+        (
+            "--dry-run",
+            {
+                "action": "store_true",
+                "help": "Mostra le parole candidate senza chiamare BabelNet e senza scrivere file.",
+            },
+        ),
+        (
+            "--retry-no-match",
+            {
+                "action": "store_true",
+                "help": "Riprova anche parole gia marcate come no_match.",
+            },
+        ),
+        (
+            "--words",
+            {
+                "nargs": "*",
+                "default": None,
+                "help": "Parole specifiche da arricchire, utile per generare definizioni sul cruciverba finale.",
+            },
+        ),
+    )
+
+    cli = argparse.ArgumentParser(
+        description=(
+            "Arricchisce incrementalmente il lessico: seleziona parole mancanti, "
+            "chiama BabelNet entro un limite e rigenera lexicon_it_enriched.json."
+        )
+    )
+    for flag, settings in option_table:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
+    """Return the ``(form, pos)`` pair used to identify a lexicon entry.
+
+    The form comes from ``normalized_form`` when it is truthy, otherwise from
+    ``form``; it is stripped and lower-cased. The part of speech is stripped
+    and upper-cased. Missing or falsy values become empty strings.
+    """
+    raw_form = entry.get("normalized_form") or entry.get("form") or ""
+    raw_pos = entry.get("pos") or ""
+    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
+
+
+def dedupe(items: Iterable[Dict[str, object]]) -> List[Dict[str, object]]:
+    """Drop entries whose ``entry_key`` was already seen.
+
+    The first entry for each key is kept and the input order is preserved.
+    """
+    # A dict keeps insertion order, and setdefault never replaces an existing
+    # value, so the first entry seen for each key is the one that survives.
+    first_by_key: Dict[Tuple[str, str], Dict[str, object]] = {}
+    for candidate in items:
+        first_by_key.setdefault(entry_key(candidate), candidate)
+    return list(first_by_key.values())
+
+
+def entry_topics(entry: Dict[str, object]) -> Tuple[set[str], set[str]]:
+    """Return the entry's explicit topics and its semantic topics, lower-cased.
+
+    Args:
+        entry: Lexicon entry. ``topics`` is read from the entry itself and
+            ``semantic_topics`` from its ``semantic`` mapping.
+
+    Returns:
+        ``(topics, semantic_topics)`` as two sets of lower-cased strings.
+        Falsy items are skipped. A missing key, an explicit ``None`` (JSON
+        ``null``) or a ``semantic`` value that is not a dict yields an empty set.
+    """
+    # ``.get(key, [])`` only covers a missing key. ``or []`` also covers a key
+    # stored as null, which would otherwise raise TypeError when iterated.
+    topics = {str(item).lower() for item in entry.get("topics") or [] if item}
+    semantic = entry.get("semantic")
+    semantic_topics: set[str] = set()
+    if isinstance(semantic, dict):
+        semantic_topics = {
+            str(item).lower() for item in semantic.get("semantic_topics") or [] if item
+        }
+    return topics, semantic_topics
+
+
+def current_babelnet_status(entry: Dict[str, object]) -> str:
+    """Return the entry's BabelNet status as a string.
+
+    ``"not_requested"`` is returned when the ``babelnet`` value is not a dict
+    or when it has no ``status`` key.
+    """
+    babelnet = entry.get("babelnet", {})
+    if not isinstance(babelnet, dict):
+        return "not_requested"
+    return str(babelnet.get("status", "not_requested"))
+
+
+def matches_topic_roots(word: str, topic: str) -> bool:
+    """Tell whether ``word`` contains one of the topic's seed substrings.
+
+    Topics with no configured substrings match every word.
+    """
+    roots = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(topic, ())
+    if not roots:
+        return True
+    for root in roots:
+        if root in word:
+            return True
+    return False
+
+
+def matches_safe_babelnet_roots(word: str, topic: str) -> bool:
+    """Tell whether ``word`` starts with one of the topic's safe prefixes.
+
+    Topics without configured prefixes never match.
+    """
+    prefixes = BABELNET_TOPIC_SAFE_PREFIXES.get(topic)
+    # str.startswith accepts a tuple and is true when any prefix matches.
+    return prefixes is not None and word.startswith(prefixes)
+
+
+def is_blocked_for_topic(word: str, topic: str) -> bool:
+    """Tell whether ``word`` contains a substring blocked for ``topic``."""
+    blocked_parts = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(topic, ())
+    for part in blocked_parts:
+        if part in word:
+            return True
+    return False
+
+
+def topic_score(entry: Dict[str, object], topic: str) -> int:
+    """Score how well ``entry`` fits ``topic``.
+
+    The default topic always scores 20. For any other topic the score is the
+    sum of the bonuses and penalties whose condition holds for the entry.
+    """
+    if topic == DEFAULT_TOPIC:
+        return 20
+
+    word = str(entry.get("form", "")).lower()
+    explicit_topics, semantic_topics = entry_topics(entry)
+
+    # (condition, score delta) pairs; every delta whose condition holds is added.
+    adjustments = (
+        (topic in explicit_topics, 100),
+        (topic in semantic_topics, 45),
+        (matches_topic_roots(word, topic), 35),
+        (DEFAULT_TOPIC in explicit_topics, 5),
+        (is_blocked_for_topic(word, topic), -100),
+        (topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES), -30),
+    )
+    return sum(delta for applies, delta in adjustments if applies)
+
+
+def candidate_score(entry: Dict[str, object], topic: str) -> Tuple[int, int, int, int, int, str]:
+    """Build the sort key used to rank enrichment candidates.
+
+    Returns:
+        ``(topic score, quality score, POS bonus, semantic bonus, length bonus,
+        word)``. Tuples compare element by element, so earlier components
+        dominate later ones.
+    """
+    word = str(entry.get("form", ""))
+    pos = str(entry.get("pos", ""))
+
+    relevance = topic_score(entry, topic)
+    quality = int(entry.get("quality_score", 0))
+
+    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4}
+    pos_bonus = pos_weights.get(pos, 0)
+
+    semantic = entry.get("semantic", {})
+    if isinstance(semantic, dict) and semantic.get("matched"):
+        semantic_bonus = 3
+    else:
+        semantic_bonus = 0
+
+    # 4-10 letters score best; anything else up to 14 letters scores 1.
+    word_length = len(word)
+    if 4 <= word_length <= 10:
+        length_bonus = 4
+    elif word_length <= 14:
+        length_bonus = 1
+    else:
+        length_bonus = -3
+
+    return (relevance, quality, pos_bonus, semantic_bonus, length_bonus, word)
+
+
+def eligible_for_babelnet(entry: Dict[str, object], topic: str, difficulty_level: int, retry_no_match: bool) -> bool:
+    """Decide whether ``entry`` may be sent to BabelNet for ``topic``.
+
+    An entry qualifies when its BabelNet status is enrichable, its form is
+    purely alphabetic with 3-16 characters, its part of speech is supported,
+    its ``difficulty_word`` does not exceed ``difficulty_level`` and it is
+    allowed in crosswords. For a non-default topic the entry must also be
+    linked to the topic by its explicit topics, by a safe prefix, or (for
+    non-concrete topics only) by its semantic topics.
+    """
+    word = str(entry.get("form", "")).lower()
+    pos = str(entry.get("pos", ""))
+    topics, semantic_topics = entry_topics(entry)
+
+    allowed_statuses = set(ENRICHABLE_STATUSES)
+    if retry_no_match:
+        allowed_statuses.add("no_match")
+
+    if current_babelnet_status(entry) not in allowed_statuses:
+        return False
+    if not (word.isalpha() and 3 <= len(word) <= 16):
+        return False
+    if not (pos in POS_TO_BABELNET and pos in FILL_ALLOWED_POS):
+        return False
+    if int(entry.get("difficulty_word", 5)) > difficulty_level:
+        return False
+    if not entry.get("allowed_in_crossword", False):
+        return False
+
+    if topic == DEFAULT_TOPIC:
+        return True
+
+    is_concrete_topic = topic in CONCRETE_TOPICS
+    if is_concrete_topic and word.endswith(ABSTRACTISH_SUFFIXES):
+        return False
+
+    # Concrete topics do not accept a link through semantic topics alone.
+    return (
+        topic in topics
+        or matches_safe_babelnet_roots(word, topic)
+        or (topic in semantic_topics and not is_concrete_topic)
+    )
+
+
+def select_candidates(payload: Dict[str, object], topic: str, difficulty_level: int, limit: int, retry_no_match: bool) -> List[Dict[str, object]]:
+    """Pick the best entries to enrich for ``topic``.
+
+    Args:
+        payload: Lexicon payload; its ``entries`` list is scanned.
+        topic: Normalised topic name (``DEFAULT_TOPIC`` disables grouping).
+        difficulty_level: Maximum ``difficulty_word`` accepted.
+        limit: Maximum number of entries returned; values below zero are
+            treated as zero.
+        retry_no_match: Also accept entries already marked ``no_match``.
+
+    Returns:
+        Eligible entries, de-duplicated by ``entry_key`` and ordered by
+        descending ``candidate_score``, truncated to ``limit``.
+
+    For a non-default topic the eligible entries are split into three groups:
+    *strong* (topic among the entry's own topics), *soft* (good quality and
+    short, capped at ``SOFT_RELATED_FILL_LIMIT`` after ranking) and *support*
+    (good quality, longer, not ending in an abstract-looking suffix). Entries
+    that fit none of the groups are dropped.
+    """
+
+    def rank(item: Dict[str, object]) -> Tuple[int, int, int, int, int, str]:
+        return candidate_score(item, topic)
+
+    entries = [
+        entry
+        for entry in payload.get("entries", []) or []
+        if isinstance(entry, dict) and eligible_for_babelnet(entry, topic, difficulty_level, retry_no_match)
+    ]
+
+    if topic != DEFAULT_TOPIC:
+        # Single-pass partition. Each group is defined by a predicate on the
+        # entry alone, so this matches filtering with "entry not in <group>"
+        # without the quadratic list scans and deep dict comparisons.
+        strong: List[Dict[str, object]] = []
+        soft: List[Dict[str, object]] = []
+        support: List[Dict[str, object]] = []
+        for entry in entries:
+            if topic in entry_topics(entry)[0]:
+                strong.append(entry)
+                continue
+            if int(entry.get("quality_score", 0)) < GENERAL_FILL_MIN_QUALITY:
+                continue
+            form = str(entry.get("form", ""))
+            if len(form) <= GENERAL_FILL_MAX_LENGTH:
+                soft.append(entry)
+            elif not form.endswith(ABSTRACTISH_SUFFIXES):
+                support.append(entry)
+
+        soft.sort(key=rank, reverse=True)
+        support.sort(key=rank, reverse=True)
+        entries = strong + soft[:SOFT_RELATED_FILL_LIMIT] + support
+
+    entries = dedupe(entries)
+    entries.sort(key=rank, reverse=True)
+    # A negative limit would slice from the tail, so clamp it to "nothing".
+    return entries[: max(0, limit)]
+
+
+def select_word_candidates(
+    payload: Dict[str, object],
+    words: Iterable[str],
+    limit: int,
+    retry_no_match: bool,
+) -> List[Dict[str, object]]:
+    """Pick the lexicon entries for an explicit list of words.
+
+    Args:
+        payload: Lexicon payload; its ``entries`` list is searched by ``form``.
+        words: Requested words. They are stripped, lower-cased and
+            de-duplicated, keeping the order of first appearance.
+        limit: Maximum number of entries returned; zero or less returns none.
+        retry_no_match: Also accept entries already marked ``no_match``.
+
+    Returns:
+        At most one entry per requested word, in request order. An entry is
+        selectable when its BabelNet status is enrichable, its part of speech
+        is a key of ``POS_TO_BABELNET`` and its form is purely alphabetic.
+        When several entries share a form, the last selectable one is used.
+    """
+    # Checking the limit only after appending would return one entry for limit=0.
+    if limit <= 0:
+        return []
+
+    normalized_words = (str(word).strip().lower() for word in words)
+    requested = list(dict.fromkeys(word for word in normalized_words if word))
+
+    # Keep every entry per form. A single-entry map lets the last homograph
+    # hide an earlier one that is still selectable.
+    entries_by_word: Dict[str, List[Dict[str, object]]] = {}
+    for entry in payload.get("entries", []) or []:
+        if isinstance(entry, dict):
+            entries_by_word.setdefault(str(entry.get("form", "")).lower(), []).append(entry)
+
+    allowed_statuses = set(ENRICHABLE_STATUSES)
+    if retry_no_match:
+        allowed_statuses.add("no_match")
+
+    def is_selectable(entry: Dict[str, object]) -> bool:
+        return (
+            current_babelnet_status(entry) in allowed_statuses
+            and str(entry.get("pos", "")) in POS_TO_BABELNET
+            and str(entry.get("form", "")).isalpha()
+        )
+
+    selected: List[Dict[str, object]] = []
+    for word in requested:
+        # Later entries take precedence, so results are unchanged whenever the
+        # last entry for the form is selectable.
+        match = next(
+            (entry for entry in reversed(entries_by_word.get(word, [])) if is_selectable(entry)),
+            None,
+        )
+        if match is None:
+            continue
+        selected.append(match)
+        if len(selected) >= limit:
+            break
+
+    return selected
+
+
+def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]:
+    """Load the lexicon to select candidates from.
+
+    The enriched lexicon is preferred when its file exists and holds a dict
+    with an ``entries`` key; otherwise the semantic lexicon is used.
+
+    Raises:
+        ValueError: if neither file yields a dict with an ``entries`` key.
+    """
+    sources = []
+    if enriched_path.exists():
+        sources.append(enriched_path)
+    sources.append(semantic_path)
+
+    for source in sources:
+        payload = load_json(source, {})
+        if isinstance(payload, dict) and "entries" in payload:
+            return payload
+
+    raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}")
+
+
+def merge_babelnet_entries(existing_payload: Dict[str, object], new_entries: List[Dict[str, object]], topic: str, difficulty: str) -> Dict[str, object]:
+    """Merge freshly enriched entries into the BabelNet archive payload.
+
+    Entries are keyed by ``entry_key``; a new entry replaces an archived one
+    with the same key and is stamped with ``babelnet_generated_at``. Inputs
+    are deep-copied, not mutated.
+
+    Returns:
+        ``{"meta": ..., "entries": ...}`` with entries sorted by form and part
+        of speech, and the metadata refreshed (timestamp, last topic, last
+        difficulty, entry count).
+    """
+    merged: Dict[Tuple[str, str], Dict[str, object]] = {}
+    for archived in existing_payload.get("entries", []) or []:
+        if isinstance(archived, dict):
+            merged[entry_key(archived)] = deepcopy(archived)
+
+    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")
+    for fresh in new_entries:
+        stamped = deepcopy(fresh)
+        stamped["babelnet_generated_at"] = generated_at
+        merged[entry_key(stamped)] = stamped
+
+    entries = list(merged.values())
+    entries.sort(key=lambda item: (str(item.get("form", "")), str(item.get("pos", ""))))
+
+    previous_meta = existing_payload.get("meta", {})
+    meta = dict(previous_meta) if isinstance(previous_meta, dict) else {}
+    # Language and version are carried over when already present.
+    meta.update(
+        language=meta.get("language", "it"),
+        version=max(1, int(meta.get("version", 1))),
+        source="BabelNet API",
+        updated_at=generated_at,
+        last_topic=topic,
+        last_difficulty=difficulty,
+        entry_count=len(entries),
+    )
+    return {"meta": meta, "entries": entries}
+
+
+def rebuild_enriched(semantic_path: Path, babelnet_path: Path, enriched_path: Path, topic: str) -> Dict[str, object]:
+    """Regenerate the enriched lexicon and write it to ``enriched_path``.
+
+    ``build_enriched_lexicon`` is called with a namespace exposing
+    ``semantic``, ``babelnet``, ``output`` and ``topic``.
+
+    Returns:
+        The payload that was written.
+    """
+    options = {
+        "semantic": semantic_path,
+        "babelnet": babelnet_path,
+        "output": enriched_path,
+        "topic": topic,
+    }
+    enriched_payload = build_enriched_lexicon(SimpleNamespace(**options))
+    write_enriched_json(enriched_path, enriched_payload)
+    return enriched_payload
+
+
+def run_incremental_enrichment(args: argparse.Namespace) -> Dict[str, object]:
+    """Run one incremental enrichment pass and return a summary of it.
+
+    Steps: normalise the topic and difficulty, load the source lexicon, select
+    the candidates (by explicit ``words`` when given, otherwise by topic), call
+    ``enrich_entry`` for each of them, merge the results into the BabelNet
+    archive, write it and rebuild the enriched lexicon.
+
+    Args:
+        args: Namespace with the attributes produced by ``parse_args``
+            (``words`` is optional and read with ``getattr``).
+
+    Returns:
+        In dry-run mode a dict with ``mode == "dry-run"`` and the selected
+        words; nothing is requested or written. Otherwise a dict with
+        ``mode == "enriched"``, per-word logs and aggregate counters.
+
+    Raises:
+        SystemExit: if the difficulty is invalid, or if no API key is
+            available outside dry-run mode.
+        ValueError: propagated from ``load_source_payload`` when no valid
+            lexicon is found.
+    """
+    normalized_topic = args.topic.strip().lower()
+    difficulty_level = parse_difficulty(str(args.difficulty))
+    source_payload = load_source_payload(args.enriched, args.semantic)
+    target_words = getattr(args, "words", None)
+    # An explicit word list takes precedence over topic-based selection; in that
+    # mode topic and difficulty are not used to filter candidates.
+    if target_words:
+        candidates = select_word_candidates(
+            source_payload,
+            target_words,
+            max(0, args.limit),
+            args.retry_no_match,
+        )
+    else:
+        candidates = select_candidates(
+            source_payload,
+            normalized_topic,
+            difficulty_level,
+            max(0, args.limit),
+            args.retry_no_match,
+        )
+
+    # Dry-run returns before the API-key check, so it works without a key.
+    if args.dry_run:
+        return {
+            "mode": "dry-run",
+            "topic": normalized_topic,
+            "difficulty": args.difficulty,
+            "selected_count": len(candidates),
+            "selected_words": [entry.get("form") for entry in candidates],
+        }
+
+    if not args.api_key:
+        raise SystemExit(
+            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
+        )
+
+    # Fall back to empty structures when the stored files do not hold a dict.
+    cache = load_json(BABELNET_CACHE_PATH, {})
+    if not isinstance(cache, dict):
+        cache = {}
+    babelnet_payload = load_json(args.babelnet, {"entries": []})
+    if not isinstance(babelnet_payload, dict):
+        babelnet_payload = {"entries": []}
+
+    enriched_candidates = []
+    word_logs = []
+    for index, entry in enumerate(candidates, start=1):
+        # Work on a copy so the loaded source payload is left untouched, and
+        # drop any previous BabelNet block before asking for a new one.
+        updated = deepcopy(entry)
+        updated.pop("babelnet", None)
+        # Per-word counters; presumably enrich_entry updates them in place —
+        # TODO confirm against build_babelnet_enrichment.
+        stats = {"api_calls": 0, "cache_hits": 0, "responses": 0}
+        updated["babelnet"] = enrich_entry(updated, args.api_key, cache, args.sleep, stats)
+        enriched_candidates.append(updated)
+        # The cache file is rewritten after every word; presumably so that an
+        # interrupted run keeps the responses already obtained.
+        write_json(BABELNET_CACHE_PATH, cache)
+        word_logs.append(
+            {
+                "word": updated["form"],
+                "api_calls": stats["api_calls"],
+                "cache_hits": stats["cache_hits"],
+                "responses": stats["responses"],
+                "matched": bool(updated["babelnet"].get("matched")),
+                "synsets": len(updated["babelnet"].get("synsets", []) or []),
+                "reason": updated["babelnet"].get("reason"),
+            }
+        )
+        print(
+            f"[{index}/{len(candidates)}] {updated['form']}: "
+            f"api_calls={stats['api_calls']} cache_hits={stats['cache_hits']} "
+            f"risposta={stats['responses'] > 0} match={updated['babelnet'].get('matched')}"
+        )
+
+    # The archive is written only after all candidates have been processed,
+    # then the enriched lexicon is rebuilt from the semantic lexicon and the archive.
+    merged_babelnet = merge_babelnet_entries(
+        babelnet_payload,
+        enriched_candidates,
+        normalized_topic,
+        str(args.difficulty),
+    )
+    write_json(args.babelnet, merged_babelnet)
+    enriched_payload = rebuild_enriched(args.semantic, args.babelnet, args.enriched, normalized_topic)
+
+    return {
+        "mode": "enriched",
+        "topic": normalized_topic,
+        "difficulty": args.difficulty,
+        "selected_count": len(candidates),
+        "matched_count": sum(1 for entry in enriched_candidates if entry.get("babelnet", {}).get("matched")),
+        "api_call_count": sum(item["api_calls"] for item in word_logs),
+        "cache_hit_count": sum(item["cache_hits"] for item in word_logs),
+        "word_logs": word_logs,
+        "babelnet_entry_count": merged_babelnet["meta"]["entry_count"],
+        "enriched_status_counts": enriched_payload["meta"]["babelnet_status_counts"],
+    }
+
+
+def main() -> None:
+    """CLI entry point: run the enrichment and print a plain-text report."""
+    result = run_incremental_enrichment(parse_args())
+
+    if result["mode"] == "dry-run":
+        report = [
+            "Dry-run BabelNet incrementale",
+            f"Topic: {result['topic']}",
+            f"Difficolta: {result['difficulty']}",
+            f"Parole selezionate: {result['selected_count']}",
+        ]
+        report.extend(
+            f"{position:2d}. {word}"
+            for position, word in enumerate(result["selected_words"], start=1)
+        )
+    else:
+        report = [
+            "Arricchimento BabelNet completato",
+            f"Topic: {result['topic']}",
+            f"Parole interrogate: {result['selected_count']}",
+            f"Chiamate API BabelNet reali: {result['api_call_count']}",
+            f"Risposte da cache: {result['cache_hit_count']}",
+            f"Match BabelNet: {result['matched_count']}",
+        ]
+        report.extend(
+            f"- {item['word']}: api_calls={item['api_calls']}, "
+            f"cache_hits={item['cache_hits']}, risposta={item['responses'] > 0}, "
+            f"match={item['matched']}, synsets={item['synsets']}"
+            for item in result["word_logs"]
+        )
+        report.append(f"Voci BabelNet archiviate: {result['babelnet_entry_count']}")
+        report.append(f"Stati lessico arricchito: {result['enriched_status_counts']}")
+
+    for line in report:
+        print(line)
+
+
+# Run the command-line interface only when the module is executed as a script.
+if __name__ == "__main__":
+    main()