feat: consolida lessico semantico, temi controllati e filler a quota tematica

2026-04-15 15:37:52 +02:00
parent b172b9c04b
commit a1f8cb8577
8 changed files with 14030 additions and 46434 deletions
--- a/build_babelnet_enrichment.py
+++ b/build_babelnet_enrichment.py
@@ -0,0 +1,291 @@
 from __future__ import annotations
 import argparse
 import json
 import os
 import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional
 from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
 from main import parse_difficulty
 # Default output file, created next to this script.
 BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json")
 # JSON file, next to this script, where API responses are cached between runs.
 BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json")
 # Root URL that endpoint names are appended to.
 BABELNET_API_BASE = "https://babelnet.io/v9"
 # Name of the environment variable consulted when --api-key is not given.
 BABELNET_ENV_KEY = "BABELNET_API_KEY"
 # Maps the lexicon's POS tags to the POS names sent to BabelNet; entries whose
 # tag is missing from this table are not enriched.
 POS_TO_BABELNET = {
    "NOUN": "NOUN",
    "VERB": "VERB",
    "ADJ": "ADJECTIVE",
    "ADV": "ADVERB",
 }
 def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the BabelNet enrichment script.

    Returns:
        The namespace holding ``api_key``, ``topic``, ``difficulty``,
        ``limit``, ``sleep`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
    )
    # The environment variable only provides the default; an explicit
    # --api-key on the command line takes precedence.
    env_api_key = os.environ.get(BABELNET_ENV_KEY)
    api_key_help = f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}."
    cli.add_argument("--api-key", default=env_api_key, help=api_key_help)
    cli.add_argument(
        "--topic",
        default=None,
        help="Topic opzionale da usare per limitare le voci da arricchire.",
    )
    cli.add_argument(
        "--difficulty",
        default="medium",
        help="Difficolta massima delle voci da arricchire: easy, medium, hard, expert oppure 1-5.",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Numero massimo di lemmi da interrogare in questa esecuzione.",
    )
    cli.add_argument(
        "--sleep",
        type=float,
        default=0.2,
        help="Pausa tra richieste API, utile per non stressare il servizio.",
    )
    cli.add_argument(
        "--output",
        type=Path,
        default=BABELNET_OUTPUT_PATH,
        help="File JSON di output.",
    )
    return cli.parse_args()
 def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at ``path``, or ``default`` if the file is absent.

    Args:
        path: File to read as UTF-8 JSON.
        default: Value returned unchanged when ``path`` does not exist.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
 def write_json(path: Path, payload: object) -> None:
    """Serialise ``payload`` as indented, non-ASCII-escaped JSON and write it to ``path`` (UTF-8)."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
 def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object:
    """Fetch a BabelNet endpoint as JSON, memoising the response in ``cache``.

    Responses are cached under the request URL *without* the ``key``
    parameter, so the API key does not end up inside the cache mapping.
    Entries stored under the older key (the full URL, API key included) are
    still honoured so an existing cache keeps working.

    Args:
        endpoint: Endpoint name appended to ``BABELNET_API_BASE``.
        params: Query parameters, including the API key under ``"key"``.
        cache: Mutable mapping of cache key to decoded response; updated in place.

    Returns:
        The decoded JSON payload (from the cache when available).

    Raises:
        RuntimeError: If BabelNet answers with an HTTP error status.
    """
    url = f"{BABELNET_API_BASE}/{endpoint}?{urllib.parse.urlencode(params)}"
    public_params = {name: value for name, value in params.items() if name != "key"}
    cache_key = f"{BABELNET_API_BASE}/{endpoint}?{urllib.parse.urlencode(public_params)}"
    if cache_key in cache:
        return cache[cache_key]
    if url in cache:
        # Legacy entry written when the cache was keyed by the full URL.
        return cache[url]
    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc
    cache[cache_key] = payload
    return payload
 def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Return the entry's ``topics`` values as a set of lower-cased strings."""
    lowered = set()
    for raw_topic in entry.get("topics", []):
        lowered.add(str(raw_topic).lower())
    return lowered
 def select_entries(payload: Dict[str, object], topic: Optional[str], difficulty_level: int, limit: int) -> List[Dict[str, object]]:
    """Pick, in lexicon order, the entries worth querying on BabelNet.

    An entry is kept when its form is purely alphabetic and 3-16 characters
    long, its ``difficulty_word`` does not exceed ``difficulty_level``, its
    POS is present in ``POS_TO_BABELNET`` and, when ``topic`` is given, the
    topic is among the entry's topics.

    Args:
        payload: Lexicon document with an ``entries`` list.
        topic: Optional topic filter (compared case-insensitively).
        difficulty_level: Maximum accepted ``difficulty_word``.
        limit: Selection stops once this many entries were collected.

    Returns:
        The selected entries (the original dict objects, not copies).
    """
    wanted_topic = topic.strip().lower() if topic else None
    picked = []
    for candidate in payload.get("entries", []):
        form = str(candidate.get("form", ""))
        if not (form and form.isalpha()):
            continue
        if not 3 <= len(form) <= 16:
            continue
        if int(candidate.get("difficulty_word", 5)) > difficulty_level:
            continue
        if str(candidate.get("pos", "")) not in POS_TO_BABELNET:
            continue
        if wanted_topic and wanted_topic not in entry_topics(candidate):
            continue
        picked.append(candidate)
        # NOTE(review): the limit is tested after appending, so a limit of
        # zero or less still yields one entry.
        if len(picked) >= limit:
            break
    return picked
 def compact_synset_id(payload: Dict[str, object]) -> Dict[str, object]:
    """Reduce a synset reference to its ``id``, ``pos`` and ``source`` fields (``None`` when absent)."""
    kept_fields = ("id", "pos", "source")
    return {field: payload.get(field) for field in kept_fields}
 def extract_glosses(payload: Dict[str, object]) -> List[str]:
    """Return up to five distinct glosses whose language is Italian or unspecified.

    A missing or null ``glosses`` value is treated as an empty list.
    """
    accepted_languages = {"IT", "ITA", ""}
    collected = []
    for record in payload.get("glosses", []) or []:
        language_code = str(record.get("language", "")).upper()
        text = str(record.get("gloss", "")).strip()
        if not text or language_code not in accepted_languages:
            continue
        collected.append(text)
    unique_glosses = dedupe(collected)
    return unique_glosses[:5]
 def extract_senses(payload: Dict[str, object]) -> List[str]:
    """Return up to twenty distinct lemmas whose language is Italian or unspecified.

    For every sense the lemma is ``properties.simpleLemma`` when available,
    otherwise ``fullLemma``; underscores are replaced by spaces. A missing or
    null ``senses`` list, and a missing or null ``properties`` object, are
    both tolerated.
    """
    senses = []
    for item in payload.get("senses", []) or []:
        language = str(item.get("language", "")).upper()
        # ``.get("properties", {})`` would return None for an explicit null and
        # crash on the next ``.get``; mirror the ``or []`` guard used above.
        properties = item.get("properties") or {}
        lemma = str(properties.get("simpleLemma") or item.get("fullLemma") or "").strip()
        if lemma and language in {"IT", "ITA", ""}:
            senses.append(lemma.replace("_", " "))
    return dedupe(senses)[:20]
 def extract_categories(payload: Dict[str, object]) -> List[str]:
    """Return up to twenty distinct, non-empty category names from ``payload["categories"]``.

    A missing or null ``categories`` value is treated as an empty list.
    """
    stripped_names = (
        str(record.get("category", "")).strip()
        for record in payload.get("categories", []) or []
    )
    non_empty = [name for name in stripped_names if name]
    return dedupe(non_empty)[:20]
 def extract_domains(payload: Dict[str, object]) -> List[str]:
    """Normalise ``payload["domains"]`` to a list of domain names.

    A mapping yields the sorted keys whose value is truthy; a list yields up
    to twenty distinct truthy items; any other value yields an empty list.
    """
    raw_domains = payload.get("domains", [])
    if isinstance(raw_domains, dict):
        truthy_names = [str(name) for name, weight in raw_domains.items() if weight]
        truthy_names.sort()
        return truthy_names
    if not isinstance(raw_domains, list):
        return []
    return dedupe(str(domain) for domain in raw_domains if domain)[:20]
 def dedupe(items: Iterable[str]) -> List[str]:
    """Strip every item and return the distinct non-empty results in first-seen order."""
    # A dict keeps insertion order, so its keys double as an ordered set.
    ordered_unique = {}
    for raw_item in items:
        cleaned = str(raw_item).strip()
        if cleaned:
            ordered_unique.setdefault(cleaned, None)
    return list(ordered_unique)
 def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]:
    """Look up one lexicon entry on BabelNet and summarise the matching synsets.

    Args:
        entry: Lexicon entry; its ``form`` and ``pos`` fields are read.
        api_key: BabelNet API key sent with every request.
        cache: Response cache handed to ``request_json`` (updated in place).
        sleep_seconds: Pause performed after each ``request_json`` call;
            a falsy value disables the pause.

    Returns:
        A dict with ``matched`` and ``synsets``. The early exits additionally
        carry a ``reason`` (``unsupported_pos`` or ``no_synsets``); the normal
        exit additionally carries ``synset_refs``.

    Raises:
        RuntimeError: Propagated from ``request_json`` on an HTTP error.
    """
    word = str(entry.get("form", ""))
    pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
    if not pos:
        return {"matched": False, "reason": "unsupported_pos", "synsets": []}
    # Step 1: resolve the lemma to synset identifiers.
    synset_ids = request_json(
        "getSynsetIds",
        {
            "lemma": word,
            "searchLang": "IT",
            "pos": pos,
            "key": api_key,
        },
        cache,
    )
    # The pause happens after every request_json call, cached or not.
    if sleep_seconds:
        time.sleep(sleep_seconds)
    if not isinstance(synset_ids, list) or not synset_ids:
        return {"matched": False, "reason": "no_synsets", "synsets": []}
    # Step 2: fetch details for the first three synsets only.
    synsets = []
    for synset_ref in synset_ids[:3]:
        # A reference is either an object with an "id" field or a bare value.
        synset_id = synset_ref.get("id") if isinstance(synset_ref, dict) else str(synset_ref)
        if not synset_id:
            continue
        synset_payload = request_json(
            "getSynset",
            {
                "id": synset_id,
                "targetLang": "IT",
                "key": api_key,
            },
            cache,
        )
        if sleep_seconds:
            time.sleep(sleep_seconds)
        if not isinstance(synset_payload, dict):
            continue
        synsets.append(
            {
                "id": synset_id,
                "senses": extract_senses(synset_payload),
                "glosses": extract_glosses(synset_payload),
                "categories": extract_categories(synset_payload),
                "domains": extract_domains(synset_payload),
            }
        )
    return {
        "matched": bool(synsets),
        # Compact references cover the first five ids (dict-shaped ones only),
        # i.e. more than the three synsets that were fetched in detail.
        "synset_refs": [compact_synset_id(item) for item in synset_ids[:5] if isinstance(item, dict)],
        "synsets": synsets,
    }
 def build_babelnet_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Build the BabelNet-enriched lexicon document for the parsed CLI options.

    Args:
        args: Namespace providing ``api_key``, ``topic``, ``difficulty``,
            ``limit`` and ``sleep``.

    Returns:
        A dict with a ``meta`` section and the list of enriched ``entries``.

    Raises:
        SystemExit: If no API key is available.
        FileNotFoundError: If the semantic lexicon file does not exist.
    """
    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )
    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico semantico non trovato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    payload = load_json(SEMANTIC_LEXICON_OUTPUT_PATH, {})
    cache = load_json(BABELNET_CACHE_PATH, {})
    # A cache file that does not contain a JSON object is discarded.
    if not isinstance(cache, dict):
        cache = {}
    difficulty_level = parse_difficulty(str(args.difficulty))
    selected_entries = select_entries(payload, args.topic, difficulty_level, args.limit)
    enriched_entries = []
    for index, entry in enumerate(selected_entries, start=1):
        # Shallow copy, so the "babelnet" key is not added to the loaded entry.
        enriched = dict(entry)
        enriched["babelnet"] = enrich_entry(enriched, args.api_key, cache, args.sleep)
        enriched_entries.append(enriched)
        print(f"[{index}/{len(selected_entries)}] {entry['form']}: {enriched['babelnet'].get('matched')}")
        # The cache is rewritten after every entry, so responses fetched so
        # far are already on disk if a later request raises.
        write_json(BABELNET_CACHE_PATH, cache)
    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": SEMANTIC_LEXICON_OUTPUT_PATH.name,
            "source": "BabelNet API",
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "topic": args.topic,
            "difficulty": args.difficulty,
            "requested_limit": args.limit,
            "entry_count": len(enriched_entries),
        },
        "entries": enriched_entries,
    }
 def main() -> None:
    """Run the enrichment from the command line, write the output file and print a summary."""
    args = parse_args()
    result = build_babelnet_enrichment(args)
    write_json(args.output, result)
    matched_entries = [
        entry
        for entry in result["entries"]
        if entry.get("babelnet", {}).get("matched")
    ]
    matched = len(matched_entries)
    entry_count = result["meta"]["entry_count"]
    print(f"Lessico BabelNet generato: {args.output}")
    print(f"Voci arricchite: {entry_count}")
    print(f"Voci con match BabelNet: {matched}")
 if __name__ == "__main__":
    main()
--- a/build_lexicon.py
+++ b/build_lexicon.py
@@ -83,8 +83,9 @@ TOPIC_KEYWORDS = {
        "aula", "figura", "titolo",
    },
    "cinema": {
-        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
+        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "cinema",
-        "finale", "figura",
+        "doppiatore", "documentario", "cinegiornale", "colossal", "commedia", "comparsa",
        "controfigura", "diva", "divo", "cabaret", "cartoon",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
@@ -99,8 +100,12 @@ TOPIC_KEYWORDS = {
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
-        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
+        "automobile", "auto", "automezzo", "autoveicolo", "autovettura", "autobus", "autocarro",
-        "volo", "aeroporto", "vettura",
+        "aeromobile", "aeroplano", "aeroporto", "ambulanza", "autoambulanza", "astronave",
        "barca", "barchetta", "bastimento", "bicicletta", "bici", "bimotore", "bireattore",
        "bombardiere", "imbarcazione", "motrice", "motore", "nave", "pista", "porto",
        "quadrimotore", "reattore", "rimorchio", "rimorchiatore", "rotaia", "ruota", "trattore",
        "treno", "vapore", "vela", "veliero", "vettura", "volante", "volo",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
@@ -115,11 +120,6 @@ TOPIC_KEYWORDS = {
 TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
    "geography": ("montagna", "isola", "deserto", "confine"),
    "city": ("strada", "palazzo", "porta", "ponte"),
 }
@@ -135,7 +135,7 @@ def infer_topics(word: str, tags: List[str]) -> List[str]:
    if "verb_infinitive" in tags:
        topics.add("actions")
-    if any(word.endswith(suffix) for suffix in ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")):
+    if any(word.endswith(suffix) for suffix in ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza")):
        topics.add("abstract")
    for topic, keywords in TOPIC_KEYWORDS.items():
--- a/build_semantic_lexicon.py
+++ b/build_semantic_lexicon.py
@@ -9,7 +9,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Tuple
-from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
+from build_lexicon import LEXICON_OUTPUT_PATH
 IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
@@ -356,8 +356,7 @@ def enrich_entry(
    ][:20]
    glosses = dedupe_keep_order(glosses)
    semantic_topics = dedupe_keep_order(
-        list(entry.get("topics", []))
+        semantic_topics_from_text(
        + semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
@@ -365,7 +364,6 @@ def enrich_entry(
            + raw_relation_terms.get("similar", [])
        )
    )
    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
--- a/create_passo4.bat
+++ b/create_passo4.bat
@@ -4,7 +4,7 @@ setlocal
 cd /d "%~dp0"
 set "BRANCH_NAME=passo4"
-set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
+set "COMMIT_MSG=feat: consolida lessico semantico, temi controllati e filler a quota tematica"
 if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
@@ -32,8 +32,8 @@ if errorlevel 1 (
 if errorlevel 1 exit /b 1
 echo.
-echo Aggiungo le modifiche...
+echo Aggiungo le modifiche di progetto, escludendo cache Python e cache API...
-git add .
+git add *.py *.bat *.txt lexicon_it.json lexicon_it_semantic.json vocaboli_it_metadata.json package iwn-omw-main
 if errorlevel 1 exit /b 1
 echo.
--- a/crossword_filler.py
+++ b/crossword_filler.py
@@ -62,7 +62,7 @@ class FillCandidate:
    slot: FillSlot
    new_letters: int
    reused_letters: int
-    local_score: Tuple[int, int, int]
+    local_score: Tuple[int, ...]
 class CrosswordFiller:
@@ -73,6 +73,9 @@ class CrosswordFiller:
        *,
        target_empty_ratio: float = TARGET_EMPTY_RATIO,
        vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
        semantic_metadata: Optional[Dict[str, Dict[str, object]]] = None,
        selected_topic: str = "general",
        max_themed_fill_words: int = 10,
        seed: Optional[int] = None,
    ) -> None:
        self.state = state.copy()
@@ -83,6 +86,9 @@ class CrosswordFiller:
        self.vocabulary = self._normalize_vocabulary(vocabulary)
        self.words_by_length = self._index_vocabulary(self.vocabulary)
        self.vocabulary_metadata = vocabulary_metadata or {}
        self.semantic_metadata = semantic_metadata or {}
        self.selected_topic = selected_topic.strip().lower()
        self.max_themed_fill_words = max(0, max_themed_fill_words)
        self.seed = seed
        self.rng = random.Random(seed)
        self.bounds = self._compute_bounds(self.state.grid)
@@ -281,9 +287,11 @@ class CrosswordFiller:
            new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid)
            reused_letters = slot.fixed_letters
            local_score = (
                self._semantic_topic_score(word),
                reused_letters,
                new_letters,
                self._word_quality(word),
                self._semantic_quality(word),
                len(set(word)),
            )
            candidates.append(
@@ -311,6 +319,56 @@ class CrosswordFiller:
        except (TypeError, ValueError):
            return 0
    def _semantic_entry(self, word: str) -> Dict[str, object]:
        return self.semantic_metadata.get(word, {})
    def _semantic_quality(self, word: str) -> int:
        entry = self._semantic_entry(word)
        semantic = entry.get("semantic", {})
        score = 0
        if semantic.get("matched"):
            score += 2
        score += min(3, len(semantic.get("glosses", [])))
        score += min(2, len(semantic.get("synonyms", [])))
        return score
    def _semantic_topic_score(self, word: str) -> int:
        if not self.selected_topic or self.selected_topic == "general":
            return 0
        entry = self._semantic_entry(word)
        try:
            relevance = int(entry.get("_topic_relevance", 0))
        except (TypeError, ValueError):
            relevance = 0
        if relevance:
            if self._themed_added_count() < self.max_themed_fill_words:
                return relevance
            return min(relevance, 10)
        topics = {str(item).lower() for item in entry.get("topics", [])}
        semantic = entry.get("semantic", {})
        semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])}
        score = 0
        if self.selected_topic in topics:
            score += 4
        if self.selected_topic in semantic_topics:
            score += 6
        if "general" in topics:
            score += 1
        return score
    def _themed_added_count(self) -> int:
        total = 0
        for placement in self.added_words:
            entry = self._semantic_entry(placement.word)
            try:
                if int(entry.get("_strong_topic_relevance", 0)) > 0:
                    total += 1
            except (TypeError, ValueError):
                continue
        return total
    def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
        dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
        before = (slot.x - dx, slot.y - dy)
@@ -380,6 +438,7 @@ class CrosswordFiller:
            f"vuote={self.empty_cells_count()}/{self.total_cells} "
            f"target={self.target_empty_cells} "
            f"aggiunte={len(self.added_words)} "
            f"tema={self._themed_added_count()}/{self.max_themed_fill_words} "
            f"ultima={self.last_word} "
            f"t={elapsed:0.1f}s"
        )
--- a/lexicon_it.json
+++ b/lexicon_it.json
--- a/lexicon_it_semantic.json
+++ b/lexicon_it_semantic.json
--- a/main.py
+++ b/main.py
@@ -25,6 +25,72 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
 }
 # Neutral topic: used as the --topic default and as the fallback pool.
 DEFAULT_TOPIC = "general"
 # Default for --initial-word-count: as many seeds as the built-in WORDS list.
 DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
 # Word endings treated as "abstract-looking"; matched with str.endswith.
 ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
 # Parts of speech accepted for filler vocabulary.
 FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
 # Minimum quality_score and maximum length for general-purpose filler words.
 GENERAL_FILL_MIN_QUALITY = 6
 GENERAL_FILL_MAX_LENGTH = 10
 # Maximum number of loosely topic-related entries kept in the filler vocabulary.
 SOFT_RELATED_FILL_LIMIT = 120
 # Default for --themed-fill-count.
 DEFAULT_THEMED_FILL_WORD_COUNT = 10
 # Topics for which abstract-looking words are penalised in topic_relevance and
 # for which seed words must be nouns without an abstract suffix.
 CONCRETE_TOPICS = {
    "animals",
    "plants",
    "nature",
    "ecology",
    "geography",
    "weather",
    "sea",
    "mountain",
    "health",
    "science",
    "sport",
    "history",
    "school",
    "cinema",
    "literature",
    "food",
    "city",
    "transport",
    "work",
    "home",
 }
 # Per-topic word fragments: a word containing any of them counts as matching
 # the topic's roots (substring test, anywhere in the word).
 TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
        "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
        "comic", "div", "docu", "pellic", "spettacol",
    ),
 }
 # Per-topic word fragments that veto a word even if a required fragment also
 # matches (substring test, anywhere in the word).
 TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
        "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
        "malumor", "eversor",
    ),
 }
 def parse_args() -> argparse.Namespace:
@@ -95,6 +161,18 @@ def parse_args() -> argparse.Namespace:
        default=DEFAULT_TOPIC,
        help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
    )
    parser.add_argument(
        "--initial-word-count",
        type=int,
        default=DEFAULT_INITIAL_WORD_COUNT,
        help="Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
    )
    parser.add_argument(
        "--themed-fill-count",
        type=int,
        default=DEFAULT_THEMED_FILL_WORD_COUNT,
        help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
    )
    return parser.parse_args()
@@ -165,42 +243,328 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
    return path.read_text(encoding="utf-8").splitlines()
-def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
+def load_semantic_payload() -> Dict[str, object]:
-    if not LEXICON_OUTPUT_PATH.exists():
+    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
-        lexicon = build_lexicon()
+        lexicon = build_semantic_lexicon()
-        LEXICON_OUTPUT_PATH.write_text(
+        SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
            json.dumps(lexicon, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
-    payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
+
 def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
    """Return ``(topics, semantic_topics)`` of an entry, both as lower-cased sets.

    ``topics`` comes from the entry itself, ``semantic_topics`` from its
    nested ``semantic`` object; missing fields yield empty sets.
    """
    lexical_labels = entry.get("topics", [])
    semantic_labels = entry.get("semantic", {}).get("semantic_topics", [])
    lexical = {str(label).lower() for label in lexical_labels}
    semantic = {str(label).lower() for label in semantic_labels}
    return lexical, semantic
 def matches_topic_roots(word: str, selected_topic: str) -> bool:
    """Tell whether ``word`` contains a root fragment of the topic and no blocked fragment.

    Topics without an entry in ``TOPIC_SEED_REQUIRED_SUBSTRINGS`` never match.
    """
    # A blocked fragment vetoes the word regardless of any root match.
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return False
    for fragment in TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return True
    return False
 def topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Score how strongly ``entry`` relates to ``topic`` (higher is more relevant).

    The default topic always scores 20. Otherwise the score is the sum of the
    bonuses and penalties listed in the table below; it can be negative.
    """
    wanted = topic.strip().lower()
    if wanted == DEFAULT_TOPIC:
        return 20
    form = str(entry.get("form", ""))
    lexical, semantic = entry_topics(entry)
    blocked_fragments = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(wanted, ())
    # (condition, score delta) pairs; every satisfied condition contributes.
    adjustments = (
        (wanted in lexical, 100),
        (wanted in semantic, 45),
        (matches_topic_roots(form, wanted), 35),
        ("general" in lexical, 5),
        (any(fragment in form for fragment in blocked_fragments), -80),
        (wanted in CONCRETE_TOPICS and form.endswith(ABSTRACTISH_SUFFIXES), -15),
    )
    return sum(delta for applies, delta in adjustments if applies)
 def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Return 100 when ``topic`` is one of the entry's own topics, else 0.

    Semantic topics are ignored here. The default topic always yields 20.
    """
    wanted = topic.strip().lower()
    if wanted == DEFAULT_TOPIC:
        return 20
    lexical_topics = entry_topics(entry)[0]
    if wanted in lexical_topics:
        return 100
    return 0
 def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank filler entries for ``topic``.

    The key compares, in order: topic relevance, quality score, a
    part-of-speech bonus, a bonus for semantically matched entries, a length
    bonus and finally the word itself as a tie-breaker.
    """
    form = str(entry.get("form", ""))
    quality = int(entry.get("quality_score", 0))
    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4, "PREP": 2, "CONJ": 2}
    pos_bonus = pos_weights.get(str(entry.get("pos", "")), 0)
    semantic_bonus = 3 if entry.get("semantic", {}).get("matched") else 0
    size = len(form)
    if 4 <= size <= 10:
        length_bonus = 3
    elif 2 <= size <= 13:
        length_bonus = 1
    else:
        length_bonus = -4
    relevance = topic_relevance(entry, topic)
    return (relevance, quality, pos_bonus, semantic_bonus, length_bonus, form)
 def is_general_fill_support(entry: Dict[str, object]) -> bool:
    """Tell whether ``entry`` qualifies as a general-purpose filler word.

    It must reach the minimum quality, fit the maximum length, not end with an
    abstract-looking suffix and list the default topic among its topics.
    """
    form = str(entry.get("form", ""))
    good_enough = int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
    if not good_enough or len(form) > GENERAL_FILL_MAX_LENGTH or form.endswith(ABSTRACTISH_SUFFIXES):
        return False
    lowered_topics = {str(label).lower() for label in entry.get("topics", [])}
    return DEFAULT_TOPIC in lowered_topics
 def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
-    def matches(entry: Dict[str, object], selected_topic: str) -> bool:
+    eligible = [
-        topics = [str(item).lower() for item in entry.get("topics", [])]
+        entry
        return selected_topic in topics
    words = [
        entry["form"]
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
-        and matches(entry, normalized_topic)
+        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]
-    if words:
+    if normalized_topic == DEFAULT_TOPIC:
-        return words
+        selected = eligible
-
+    else:
-    if normalized_topic != DEFAULT_TOPIC:
+        strong_topic = [entry for entry in eligible if strong_topic_relevance(entry, normalized_topic) > 0]
-        return [
+        soft_related = [
-            entry["form"]
+            entry
-            for entry in payload.get("entries", [])
+            for entry in eligible
-            if entry.get("allowed_in_crossword", False)
+            if entry not in strong_topic
-            and int(entry.get("difficulty_word", 5)) <= level
+            and topic_relevance(entry, normalized_topic) > 0
-            and matches(entry, DEFAULT_TOPIC)
+            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
        soft_related.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
-    return words
+        general_support = [
            entry
            for entry in eligible
            if entry not in strong_topic
            and is_general_fill_support(entry)
        ]
        general_support.sort(key=lambda entry: lexical_fill_score(entry, DEFAULT_TOPIC), reverse=True)
        selected = strong_topic + soft_related[:SOFT_RELATED_FILL_LIMIT]
        selected += [entry for entry in general_support if entry not in selected]
    selected.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
    return selected
 def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the word forms of the entries selected by ``load_filtered_entries``, in the same order."""
    forms = []
    for entry in load_filtered_entries(level, topic):
        forms.append(str(entry["form"]))
    return forms
 def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
    """Map each requested word to a copy of its lexicon entry, annotated for ``topic``.

    Every copy gains ``_topic_relevance`` and ``_strong_topic_relevance``.
    Words absent from the semantic lexicon are simply missing from the result;
    if a form occurs more than once, the last entry wins.
    """
    wanted_forms = set(words)
    annotated: Dict[str, Dict[str, object]] = {}
    for raw_entry in load_semantic_payload().get("entries", []):
        form = str(raw_entry.get("form", ""))
        if form in wanted_forms:
            # Shallow copy so the annotations do not touch the loaded entry.
            record = dict(raw_entry)
            record["_topic_relevance"] = topic_relevance(record, topic)
            record["_strong_topic_relevance"] = strong_topic_relevance(record, topic)
            annotated[form] = record
    return annotated
 def select_initial_words(level: int, topic: str, count: int) -> List[str]:
    """Choose up to ``count`` seed words for the initial grid.

    Candidates are the lexicon entries allowed in crosswords whose
    ``difficulty_word`` does not exceed ``level``. For the default topic the
    pool is every candidate tagged ``general``; for any other topic it is the
    candidates tagged with that topic, falling back to the ``general`` pool
    when none exists. Seeds are then picked greedily from the seed-friendly
    part of the pool, favouring words that share letters with those already
    chosen.

    Only the default topic tops the selection up (first from the unfiltered
    pool, then from ``WORDS``); for other topics fewer than ``count`` words
    may be returned.

    Args:
        level: Maximum accepted ``difficulty_word``.
        topic: Requested topic (compared case-insensitively).
        count: Number of seed words wanted.

    Returns:
        At most ``count`` word forms.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
    abstract_like_topics = {"abstract", "actions"}
    def matches(entry: Dict[str, object], selected_topic: str) -> bool:
        # Only the entry's own topics count; semantic_topics is not consulted.
        topics, semantic_topics = entry_topics(entry)
        return selected_topic in topics
    def word_score(entry: Dict[str, object], selected_topic: str) -> tuple[int, int, int, int, int, int, int, str]:
        # Sort key ranking an entry as a seed for selected_topic; components
        # are compared in the order of the returned tuple.
        topics, semantic_topics = entry_topics(entry)
        quality = int(entry.get("quality_score", 0))
        semantic = entry.get("semantic", {})
        semantic_match = 1 if semantic.get("matched") else 0
        glossary_bonus = min(3, len(semantic.get("glosses", [])))
        word = str(entry.get("form", ""))
        length = len(word)
        topical_concreteness_penalty = 0
        topic_bonus = 0
        pos_bonus = 0
        if selected_topic in topics:
            topic_bonus += 4
        if "general" in topics:
            topic_bonus += 1
        if str(entry.get("pos", "")) == "NOUN":
            pos_bonus += 4
        elif str(entry.get("pos", "")) == "ADJ":
            pos_bonus += 1
        # For topics other than abstract/actions/general, push down entries
        # that look abstract, action-like or are not nouns.
        if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
            if "abstract" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 3
            if "actions" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 2
            if word.endswith(ABSTRACTISH_SUFFIXES):
                topical_concreteness_penalty -= 4
            if str(entry.get("pos", "")) != "NOUN":
                topical_concreteness_penalty -= 3
        if 5 <= length <= 10:
            length_bonus = 3
        elif 4 <= length <= 12:
            length_bonus = 1
        else:
            length_bonus = -2
        return (
            topic_bonus,
            pos_bonus,
            topical_concreteness_penalty,
            quality,
            semantic_match,
            glossary_bonus,
            length_bonus,
            word,
        )
    def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
        # Hard filter applied before ranking: length 4-13, and for concrete
        # topics a noun without abstract suffix that contains a required
        # fragment (when the topic defines any) and no blocked fragment.
        word = str(entry.get("form", ""))
        pos = str(entry.get("pos", ""))
        topics, semantic_topics = entry_topics(entry)
        topic_hit = selected_topic in topics
        if len(word) < 4 or len(word) > 13:
            return False
        if selected_topic in CONCRETE_TOPICS and pos != "NOUN":
            return False
        if selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES):
            return False
        blocked_substrings = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
        if any(part in word for part in blocked_substrings):
            return False
        required_substrings = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
        if (
            selected_topic in CONCRETE_TOPICS
            and required_substrings
            and selected_topic != DEFAULT_TOPIC
            and not any(part in word for part in required_substrings)
        ):
            return False
        if selected_topic != DEFAULT_TOPIC and not topic_hit:
            return False
        return True
    def overlap_score(left: str, right: str) -> int:
        # Number of letters the two words have in common, counting repeats.
        shared = set(left) & set(right)
        return sum(min(left.count(ch), right.count(ch)) for ch in shared)
    def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
        # Greedy selection: start from the best-ranked entry, then repeatedly
        # add the entry with the highest key relative to the words chosen so far.
        if not entries:
            return []
        ranked = sorted(entries, key=lambda entry: word_score(entry, selected_topic), reverse=True)
        chosen: List[str] = []
        # NOTE(review): chosen_entries is filled but never read in this function.
        chosen_entries: List[Dict[str, object]] = []
        first = ranked[0]
        chosen.append(str(first["form"]))
        chosen_entries.append(first)
        while len(chosen) < target_count:
            best_entry = None
            best_key = None
            for entry in ranked:
                word = str(entry.get("form", ""))
                if word in chosen:
                    continue
                overlap_total = sum(overlap_score(word, existing) for existing in chosen)
                max_overlap = max((overlap_score(word, existing) for existing in chosen), default=0)
                distinct_letters = len(set(word))
                same_length_penalty = -sum(1 for existing in chosen if len(existing) == len(word))
                # Priority: shares >= 2 letters with some chosen word, then
                # total and maximum overlap, then fewer chosen words of the
                # same length, then letter variety, then the base word score.
                key = (
                    1 if max_overlap >= 2 else 0,
                    overlap_total,
                    max_overlap,
                    same_length_penalty,
                    distinct_letters,
                    word_score(entry, selected_topic),
                )
                # Strict ">" keeps the earlier-ranked entry on ties.
                if best_key is None or key > best_key:
                    best_key = key
                    best_entry = entry
            if best_entry is None:
                # Every ranked word is already chosen.
                break
            chosen.append(str(best_entry["form"]))
            chosen_entries.append(best_entry)
        return chosen
    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
    ]
    lexical_topical = []
    for entry in eligible:
        topics, semantic_topics = entry_topics(entry)
        if normalized_topic in topics:
            lexical_topical.append(entry)
    fallback = [entry for entry in eligible if matches(entry, DEFAULT_TOPIC)]
    if normalized_topic == DEFAULT_TOPIC:
        pool = fallback
    else:
        pool = list(lexical_topical)
        if not pool:
            pool = fallback
    strict_pool = [entry for entry in pool if is_seed_friendly(entry, normalized_topic)]
    relaxed_pool = sorted(pool, key=lambda entry: word_score(entry, normalized_topic), reverse=True)
    selected = pick_seed_set(strict_pool, normalized_topic, count)
    # Top-up 1 (default topic only): seeds picked from the unfiltered pool.
    if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
        relaxed_selected = pick_seed_set(relaxed_pool, normalized_topic, count)
        for word in relaxed_selected:
            if word not in selected:
                selected.append(word)
            if len(selected) >= count:
                break
    # Top-up 2 (default topic only): the built-in WORDS list.
    if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
        for word in WORDS:
            if word in selected:
                continue
            selected.append(word)
            if len(selected) >= count:
                break
    return selected[:count]
 def main() -> None:
@@ -209,9 +573,10 @@ def main() -> None:
    ensure_lexicon(args)
    ensure_semantic_lexicon(args)
    difficulty_level = parse_difficulty(args.difficulty)
    initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)
    generator = CrosswordGenerator(
-        WORDS,
+        initial_words,
        diffxy=args.diffxy,
        time_limit_seconds=args.time_limit,
        max_candidates_per_word=args.max_candidates,
@@ -220,6 +585,7 @@ def main() -> None:
    initial_state = generator.solve()
    print("Griglia iniziale")
    print(f"Parole-seme richieste: {len(initial_words)}")
    print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
    print(f"Intersezioni: {initial_state.intersections}")
    print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
@@ -229,17 +595,24 @@ def main() -> None:
        print(f"Seed: {args.seed}")
    print()
    print(render_grid(initial_state.grid, initial_state.placements))
    print()
    print("Parole-seme selezionate:")
    print(", ".join(initial_words))
    if args.skip_fill:
        return
    vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
    metadata = load_vocabulary_metadata()
    semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, args.topic) if not args.vocabulary else {}
    filler = CrosswordFiller(
        initial_state,
        vocabulary,
        target_empty_ratio=args.target_empty_ratio,
        vocabulary_metadata=metadata,
        semantic_metadata=semantic_metadata,
        selected_topic=args.topic,
        max_themed_fill_words=args.themed_fill_count,
        seed=args.seed,
    )
    final_state = filler.fill()