feat: consolida lessico semantico, temi controllati e filler a quota tematica

2026-04-15 15:37:52 +02:00
parent b172b9c04b
commit a1f8cb8577
8 changed files with 14030 additions and 46434 deletions
--- a/build_babelnet_enrichment.py
+++ b/build_babelnet_enrichment.py
@@ -0,0 +1,291 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional
+
+from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
+from main import parse_difficulty
+
+
+# Default destination of the enriched lexicon, placed next to this script.
+BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json")
+# File next to this script where the API response cache is loaded from and saved to.
+BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json")
+# Base URL that request_json prefixes to every endpoint name.
+BABELNET_API_BASE = "https://babelnet.io/v9"
+# Name of the environment variable that supplies the default for --api-key.
+BABELNET_ENV_KEY = "BABELNET_API_KEY"
+
+# Translates the lexicon's POS tags into the value sent as the "pos" request
+# parameter; entries whose POS is not a key here are never queried.
+POS_TO_BABELNET = {
+    "NOUN": "NOUN",
+    "VERB": "VERB",
+    "ADJ": "ADJECTIVE",
+    "ADV": "ADVERB",
+}
+
+
+def parse_args() -> argparse.Namespace:
+    """Build the command-line interface and parse the process arguments.
+
+    Returns:
+        The parsed namespace with ``api_key``, ``topic``, ``difficulty``,
+        ``limit``, ``sleep`` and ``output`` attributes.
+    """
+    # Options are declared as data so that one loop registers them, in this order.
+    option_table = (
+        (
+            "--api-key",
+            {
+                "default": os.environ.get(BABELNET_ENV_KEY),
+                "help": f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
+            },
+        ),
+        (
+            "--topic",
+            {
+                "default": None,
+                "help": "Topic opzionale da usare per limitare le voci da arricchire.",
+            },
+        ),
+        (
+            "--difficulty",
+            {
+                "default": "medium",
+                "help": "Difficolta massima delle voci da arricchire: easy, medium, hard, expert oppure 1-5.",
+            },
+        ),
+        (
+            "--limit",
+            {
+                "type": int,
+                "default": 100,
+                "help": "Numero massimo di lemmi da interrogare in questa esecuzione.",
+            },
+        ),
+        (
+            "--sleep",
+            {
+                "type": float,
+                "default": 0.2,
+                "help": "Pausa tra richieste API, utile per non stressare il servizio.",
+            },
+        ),
+        (
+            "--output",
+            {
+                "type": Path,
+                "default": BABELNET_OUTPUT_PATH,
+                "help": "File JSON di output.",
+            },
+        ),
+    )
+
+    cli = argparse.ArgumentParser(
+        description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
+    )
+    for flag, settings in option_table:
+        cli.add_argument(flag, **settings)
+    return cli.parse_args()
+
+
+def load_json(path: Path, default: object) -> object:
+    """Return the JSON document stored at *path*, or *default* when the file is absent."""
+    if path.exists():
+        raw_text = path.read_text(encoding="utf-8")
+        return json.loads(raw_text)
+    return default
+
+
+def write_json(path: Path, payload: object) -> None:
+    """Serialize *payload* as indented UTF-8 JSON and store it at *path*.
+
+    The text is written to a sibling ``<name>.tmp`` file first and then moved
+    over *path*. The response cache is rewritten after every processed entry
+    and parsed again on the next run, so an interruption in the middle of a
+    direct write would leave a truncated file that no longer parses.
+
+    Args:
+        path: Destination file; an existing file is overwritten.
+        payload: Any JSON-serializable object.
+    """
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    staging_path = path.with_name(f"{path.name}.tmp")
+    staging_path.write_text(serialized, encoding="utf-8")
+    # Path.replace overwrites an existing destination with a single rename.
+    staging_path.replace(path)
+
+
+def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object:
+    """Fetch one BabelNet endpoint as decoded JSON, going through *cache*.
+
+    The cache is keyed by the request URL built WITHOUT the ``key`` parameter:
+    the cache is saved to disk by the caller, and keying it by the full URL
+    would store the API key in that file. Entries written under the full URL
+    by earlier runs are still honoured and are re-keyed on first use.
+
+    Args:
+        endpoint: Endpoint name appended to BABELNET_API_BASE.
+        params: Query parameters, including the API key under ``"key"``.
+        cache: Mapping of cache key to decoded response; updated in place.
+
+    Returns:
+        The decoded JSON response, from the cache when available.
+
+    Raises:
+        RuntimeError: If the server answers with an HTTP error status.
+    """
+    base_url = f"{BABELNET_API_BASE}/{endpoint}"
+    url = f"{base_url}?{urllib.parse.urlencode(params)}"
+    public_params = {name: value for name, value in params.items() if name != "key"}
+    cache_key = f"{base_url}?{urllib.parse.urlencode(public_params)}"
+
+    if cache_key in cache:
+        return cache[cache_key]
+    if url in cache:
+        # Legacy entry keyed by the full URL: move it so the secret is dropped
+        # from the cache the next time it is saved.
+        payload = cache.pop(url)
+        cache[cache_key] = payload
+        return payload
+
+    request = urllib.request.Request(url, headers={"Accept": "application/json"})
+    try:
+        with urllib.request.urlopen(request, timeout=30) as response:
+            payload = json.loads(response.read().decode("utf-8"))
+    except urllib.error.HTTPError as exc:
+        detail = exc.read().decode("utf-8", errors="replace")
+        raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc
+
+    cache[cache_key] = payload
+    return payload
+
+
+def entry_topics(entry: Dict[str, object]) -> set[str]:
+    """Return the values of the entry's "topics" list as lowercase strings."""
+    lowered = set()
+    for raw_topic in entry.get("topics", []):
+        lowered.add(str(raw_topic).lower())
+    return lowered
+
+
+def select_entries(payload: Dict[str, object], topic: Optional[str], difficulty_level: int, limit: int) -> List[Dict[str, object]]:
+    """Choose the lexicon entries that will be sent to BabelNet.
+
+    An entry is kept when its "form" is purely alphabetic and 3 to 16
+    characters long, its "difficulty_word" does not exceed *difficulty_level*,
+    its "pos" is a key of POS_TO_BABELNET and, when *topic* is given, the
+    topic appears among the entry's topics (case-insensitively).
+
+    Args:
+        payload: Parsed lexicon; its "entries" list is scanned in order.
+        topic: Optional topic filter; empty or None disables it.
+        difficulty_level: Highest accepted "difficulty_word" value.
+        limit: Maximum number of entries to return; zero or a negative
+            number yields an empty list.
+
+    Returns:
+        The first *limit* matching entries, in lexicon order.
+    """
+    selected: List[Dict[str, object]] = []
+    # Checked up front: testing the size only after appending would let one
+    # entry through even when no entry was requested.
+    if limit <= 0:
+        return selected
+    normalized_topic = topic.strip().lower() if topic else None
+
+    for entry in payload.get("entries", []):
+        word = str(entry.get("form", ""))
+        if not word or not word.isalpha():
+            continue
+        if len(word) < 3 or len(word) > 16:
+            continue
+        if int(entry.get("difficulty_word", 5)) > difficulty_level:
+            continue
+        if str(entry.get("pos", "")) not in POS_TO_BABELNET:
+            continue
+        if normalized_topic and normalized_topic not in entry_topics(entry):
+            continue
+        selected.append(entry)
+        if len(selected) >= limit:
+            break
+
+    return selected
+
+
+def compact_synset_id(payload: Dict[str, object]) -> Dict[str, object]:
+    """Reduce a synset reference to its "id", "pos" and "source" fields.
+
+    Fields missing from *payload* are reported as None.
+    """
+    kept_fields = ("id", "pos", "source")
+    return {field: payload.get(field) for field in kept_fields}
+
+
+def extract_glosses(payload: Dict[str, object]) -> List[str]:
+    """Collect up to five distinct glosses from a synset payload.
+
+    Only glosses whose "language" is IT, ITA or unspecified are considered
+    (compared case-insensitively); blank glosses are ignored.
+    """
+    accepted_languages = {"IT", "ITA", ""}
+    collected = []
+    for gloss_item in payload.get("glosses", []) or []:
+        if str(gloss_item.get("language", "")).upper() not in accepted_languages:
+            continue
+        text = str(gloss_item.get("gloss", "")).strip()
+        if text:
+            collected.append(text)
+    unique_glosses = dedupe(collected)
+    return unique_glosses[:5]
+
+
+def extract_senses(payload: Dict[str, object]) -> List[str]:
+    """Collect up to twenty distinct lemmas from a synset payload's senses.
+
+    For each sense whose "language" is IT, ITA or unspecified, the lemma is
+    taken from ``properties.simpleLemma`` and, failing that, from
+    ``fullLemma``. Underscores in the lemma are replaced by spaces.
+    """
+    senses = []
+    for item in payload.get("senses", []) or []:
+        language = str(item.get("language", "")).upper()
+        # A "properties" key that is present but null (or not an object) must
+        # not break the lookup: fall back to "fullLemma" in that case.
+        properties = item.get("properties")
+        if not isinstance(properties, dict):
+            properties = {}
+        lemma = str(properties.get("simpleLemma") or item.get("fullLemma") or "").strip()
+        if lemma and language in {"IT", "ITA", ""}:
+            senses.append(lemma.replace("_", " "))
+    return dedupe(senses)[:20]
+
+
+def extract_categories(payload: Dict[str, object]) -> List[str]:
+    """Return up to twenty distinct, non-blank "category" values of a synset payload."""
+    raw_items = payload.get("categories", []) or []
+    names = (str(raw_item.get("category", "")).strip() for raw_item in raw_items)
+    return dedupe(name for name in names if name)[:20]
+
+
+def extract_domains(payload: Dict[str, object]) -> List[str]:
+    """Return the domain names of a synset payload.
+
+    A list of domains is de-duplicated and capped at twenty items; a mapping
+    contributes the sorted names whose value is truthy; anything else gives
+    an empty list.
+    """
+    raw_domains = payload.get("domains", [])
+    if isinstance(raw_domains, list):
+        return dedupe(str(domain) for domain in raw_domains if domain)[:20]
+    if isinstance(raw_domains, dict):
+        truthy_names = [str(name) for name, weight in raw_domains.items() if weight]
+        truthy_names.sort()
+        return truthy_names
+    return []
+
+
+def dedupe(items: Iterable[str]) -> List[str]:
+    """Strip every item and return the non-empty values once each, in first-seen order."""
+    stripped = (str(item).strip() for item in items)
+    # A dict keeps insertion order and drops repeated keys.
+    return list(dict.fromkeys(text for text in stripped if text))
+
+
+def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]:
+    """Look up one lexicon entry on BabelNet and summarise its first synsets.
+
+    Args:
+        entry: Lexicon entry; its "form" is sent as the lemma and its "pos"
+            is translated through POS_TO_BABELNET.
+        api_key: BabelNet API key sent with every request.
+        cache: Response cache shared with request_json; updated in place.
+        sleep_seconds: Pause applied after each lookup that added a response
+            to the cache, i.e. that actually reached the API. Lookups served
+            from the cache are not followed by a pause.
+
+    Returns:
+        A dict with "matched" and "synsets". Early exits carry a "reason"
+        ("unsupported_pos" or "no_synsets"); otherwise "synset_refs" holds
+        the compact form of up to five of the returned references, and
+        "synsets" details up to three of them.
+    """
+    word = str(entry.get("form", ""))
+    pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
+    if not pos:
+        return {"matched": False, "reason": "unsupported_pos", "synsets": []}
+
+    def fetch(endpoint: str, params: Dict[str, str]) -> object:
+        # request_json stores a new key only after a real network call, so an
+        # unchanged cache size means the answer was cached and the service
+        # was not contacted: no throttling pause is needed.
+        size_before = len(cache)
+        response = request_json(endpoint, params, cache)
+        if sleep_seconds and len(cache) != size_before:
+            time.sleep(sleep_seconds)
+        return response
+
+    synset_ids = fetch(
+        "getSynsetIds",
+        {
+            "lemma": word,
+            "searchLang": "IT",
+            "pos": pos,
+            "key": api_key,
+        },
+    )
+
+    if not isinstance(synset_ids, list) or not synset_ids:
+        return {"matched": False, "reason": "no_synsets", "synsets": []}
+
+    synsets = []
+    # Only the first three references are expanded into full synsets.
+    for synset_ref in synset_ids[:3]:
+        synset_id = synset_ref.get("id") if isinstance(synset_ref, dict) else str(synset_ref)
+        if not synset_id:
+            continue
+        synset_payload = fetch(
+            "getSynset",
+            {
+                "id": synset_id,
+                "targetLang": "IT",
+                "key": api_key,
+            },
+        )
+        if not isinstance(synset_payload, dict):
+            continue
+        synsets.append(
+            {
+                "id": synset_id,
+                "senses": extract_senses(synset_payload),
+                "glosses": extract_glosses(synset_payload),
+                "categories": extract_categories(synset_payload),
+                "domains": extract_domains(synset_payload),
+            }
+        )
+
+    return {
+        "matched": bool(synsets),
+        "synset_refs": [compact_synset_id(item) for item in synset_ids[:5] if isinstance(item, dict)],
+        "synsets": synsets,
+    }
+
+
+def build_babelnet_enrichment(args: argparse.Namespace) -> Dict[str, object]:
+    """Enrich the selected lexicon entries with BabelNet data.
+
+    Each selected entry is copied, given a "babelnet" field produced by
+    enrich_entry, and reported on standard output; the response cache is
+    saved after every entry.
+
+    Args:
+        args: Parsed options (``api_key``, ``topic``, ``difficulty``,
+            ``limit``, ``sleep``).
+
+    Returns:
+        A dict with a "meta" section describing the run and the enriched
+        "entries".
+
+    Raises:
+        SystemExit: If no API key was supplied.
+        FileNotFoundError: If the semantic lexicon file does not exist.
+    """
+    if not args.api_key:
+        raise SystemExit(
+            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
+        )
+    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
+        raise FileNotFoundError(f"Lessico semantico non trovato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
+
+    payload = load_json(SEMANTIC_LEXICON_OUTPUT_PATH, {})
+    stored_cache = load_json(BABELNET_CACHE_PATH, {})
+    # Anything other than a JSON object is discarded and replaced by an empty cache.
+    cache = stored_cache if isinstance(stored_cache, dict) else {}
+
+    difficulty_level = parse_difficulty(str(args.difficulty))
+    selected_entries = select_entries(payload, args.topic, difficulty_level, args.limit)
+
+    enriched_entries = []
+    index = 0
+    for entry in selected_entries:
+        index += 1
+        enriched = dict(entry)
+        enriched["babelnet"] = enrich_entry(enriched, args.api_key, cache, args.sleep)
+        enriched_entries.append(enriched)
+        print(f"[{index}/{len(selected_entries)}] {entry['form']}: {enriched['babelnet'].get('matched')}")
+        write_json(BABELNET_CACHE_PATH, cache)
+
+    meta = {
+        "language": "it",
+        "version": 1,
+        "base_lexicon": SEMANTIC_LEXICON_OUTPUT_PATH.name,
+        "source": "BabelNet API",
+        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+        "topic": args.topic,
+        "difficulty": args.difficulty,
+        "requested_limit": args.limit,
+        "entry_count": len(enriched_entries),
+    }
+    return {"meta": meta, "entries": enriched_entries}
+
+
+def main() -> None:
+    """Run the enrichment, write the output file and print a short summary."""
+    args = parse_args()
+    payload = build_babelnet_enrichment(args)
+    write_json(args.output, payload)
+
+    matched = 0
+    for entry in payload["entries"]:
+        if entry.get("babelnet", {}).get("matched"):
+            matched += 1
+
+    summary_lines = (
+        f"Lessico BabelNet generato: {args.output}",
+        f"Voci arricchite: {payload['meta']['entry_count']}",
+        f"Voci con match BabelNet: {matched}",
+    )
+    for line in summary_lines:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()
--- a/build_lexicon.py
+++ b/build_lexicon.py
@@ -83,8 +83,9 @@ TOPIC_KEYWORDS = {
        "aula", "figura", "titolo",
    },
    "cinema": {
-        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
-        "finale", "figura",
+        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "cinema",
+        "doppiatore", "documentario", "cinegiornale", "colossal", "commedia", "comparsa",
+        "controfigura", "diva", "divo", "cabaret", "cartoon",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
@@ -99,8 +100,12 @@ TOPIC_KEYWORDS = {
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
-        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
-        "volo", "aeroporto", "vettura",
+        "automobile", "auto", "automezzo", "autoveicolo", "autovettura", "autobus", "autocarro",
+        "aeromobile", "aeroplano", "aeroporto", "ambulanza", "autoambulanza", "astronave",
+        "barca", "barchetta", "bastimento", "bicicletta", "bici", "bimotore", "bireattore",
+        "bombardiere", "imbarcazione", "motrice", "motore", "nave", "pista", "porto",
+        "quadrimotore", "reattore", "rimorchio", "rimorchiatore", "rotaia", "ruota", "trattore",
+        "treno", "vapore", "vela", "veliero", "vettura", "volante", "volo",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
@@ -115,11 +120,6 @@ TOPIC_KEYWORDS = {
 TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
-    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
-    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
-    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
-    "geography": ("montagna", "isola", "deserto", "confine"),
-    "city": ("strada", "palazzo", "porta", "ponte"),
 }


@@ -135,7 +135,7 @@ def infer_topics(word: str, tags: List[str]) -> List[str]:

    if "verb_infinitive" in tags:
        topics.add("actions")
-    if any(word.endswith(suffix) for suffix in ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")):
+    if any(word.endswith(suffix) for suffix in ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza")):
        topics.add("abstract")

    for topic, keywords in TOPIC_KEYWORDS.items():
--- a/build_semantic_lexicon.py
+++ b/build_semantic_lexicon.py
@@ -9,7 +9,7 @@ from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Tuple

-from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
+from build_lexicon import LEXICON_OUTPUT_PATH


 IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
@@ -356,8 +356,7 @@ def enrich_entry(
    ][:20]
    glosses = dedupe_keep_order(glosses)
    semantic_topics = dedupe_keep_order(
-        list(entry.get("topics", []))
-        + semantic_topics_from_text(
+        semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
@@ -365,7 +364,6 @@ def enrich_entry(
            + raw_relation_terms.get("similar", [])
        )
    )
-    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
--- a/create_passo4.bat
+++ b/create_passo4.bat
@@ -4,7 +4,7 @@ setlocal
 cd /d "%~dp0"

 set "BRANCH_NAME=passo4"
-set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
+set "COMMIT_MSG=feat: consolida lessico semantico, temi controllati e filler a quota tematica"

 if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
@@ -32,8 +32,8 @@ if errorlevel 1 (
 if errorlevel 1 exit /b 1

 echo.
-echo Aggiungo le modifiche...
-git add .
+echo Aggiungo le modifiche di progetto, escludendo cache Python e cache API...
+git add *.py *.bat *.txt lexicon_it.json lexicon_it_semantic.json vocaboli_it_metadata.json package iwn-omw-main
 if errorlevel 1 exit /b 1

 echo.
--- a/crossword_filler.py
+++ b/crossword_filler.py
@@ -62,7 +62,7 @@ class FillCandidate:
    slot: FillSlot
    new_letters: int
    reused_letters: int
-    local_score: Tuple[int, int, int]
+    local_score: Tuple[int, ...]


 class CrosswordFiller:
@@ -73,6 +73,9 @@ class CrosswordFiller:
        *,
        target_empty_ratio: float = TARGET_EMPTY_RATIO,
        vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
+        semantic_metadata: Optional[Dict[str, Dict[str, object]]] = None,
+        selected_topic: str = "general",
+        max_themed_fill_words: int = 10,
        seed: Optional[int] = None,
    ) -> None:
        self.state = state.copy()
@@ -83,6 +86,9 @@ class CrosswordFiller:
        self.vocabulary = self._normalize_vocabulary(vocabulary)
        self.words_by_length = self._index_vocabulary(self.vocabulary)
        self.vocabulary_metadata = vocabulary_metadata or {}
+        self.semantic_metadata = semantic_metadata or {}
+        self.selected_topic = selected_topic.strip().lower()
+        self.max_themed_fill_words = max(0, max_themed_fill_words)
        self.seed = seed
        self.rng = random.Random(seed)
        self.bounds = self._compute_bounds(self.state.grid)
@@ -281,9 +287,11 @@ class CrosswordFiller:
            new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid)
            reused_letters = slot.fixed_letters
            local_score = (
+                self._semantic_topic_score(word),
                reused_letters,
                new_letters,
                self._word_quality(word),
+                self._semantic_quality(word),
                len(set(word)),
            )
            candidates.append(
@@ -311,6 +319,56 @@ class CrosswordFiller:
        except (TypeError, ValueError):
            return 0

+    def _semantic_entry(self, word: str) -> Dict[str, object]:
+        """Return the semantic metadata stored for *word*, or an empty dict."""
+        metadata = self.semantic_metadata
+        return metadata.get(word, {})
+
+    def _semantic_quality(self, word: str) -> int:
+        """Score how rich the semantic data of *word* is.
+
+        Two points for a matched entry, one per gloss (at most three) and
+        one per synonym (at most two).
+        """
+        semantic = self._semantic_entry(word).get("semantic", {})
+        matched_bonus = 2 if semantic.get("matched") else 0
+        gloss_bonus = min(3, len(semantic.get("glosses", [])))
+        synonym_bonus = min(2, len(semantic.get("synonyms", [])))
+        return matched_bonus + gloss_bonus + synonym_bonus
+
+    def _semantic_topic_score(self, word: str) -> int:
+        """Score how well *word* fits the selected topic.
+
+        Returns 0 when no topic or the "general" topic is selected. A
+        non-zero "_topic_relevance" stored for the word is returned as is
+        while fewer than ``max_themed_fill_words`` themed words have been
+        added, and capped at 10 afterwards. Without such a value the score
+        is derived from the word's topic lists.
+        """
+        topic = self.selected_topic
+        if not topic or topic == "general":
+            return 0
+
+        entry = self._semantic_entry(word)
+        try:
+            relevance = int(entry.get("_topic_relevance", 0))
+        except (TypeError, ValueError):
+            relevance = 0
+        if relevance != 0:
+            quota_open = self._themed_added_count() < self.max_themed_fill_words
+            return relevance if quota_open else min(relevance, 10)
+
+        lexical_topics = {str(item).lower() for item in entry.get("topics", [])}
+        semantic_topics = {
+            str(item).lower()
+            for item in entry.get("semantic", {}).get("semantic_topics", [])
+        }
+        weighted_hits = (
+            (4, topic in lexical_topics),
+            (6, topic in semantic_topics),
+            (1, "general" in lexical_topics),
+        )
+        return sum(points for points, hit in weighted_hits if hit)
+
+    def _themed_added_count(self) -> int:
+        """Count the added words whose "_strong_topic_relevance" is positive.
+
+        Words whose stored value cannot be converted to an integer are not counted.
+        """
+
+        def is_strongly_themed(word: str) -> bool:
+            entry = self._semantic_entry(word)
+            try:
+                return int(entry.get("_strong_topic_relevance", 0)) > 0
+            except (TypeError, ValueError):
+                return False
+
+        return sum(1 for placement in self.added_words if is_strongly_themed(placement.word))
+
    def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
        dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
        before = (slot.x - dx, slot.y - dy)
@@ -380,6 +438,7 @@ class CrosswordFiller:
            f"vuote={self.empty_cells_count()}/{self.total_cells} "
            f"target={self.target_empty_cells} "
            f"aggiunte={len(self.added_words)} "
+            f"tema={self._themed_added_count()}/{self.max_themed_fill_words} "
            f"ultima={self.last_word} "
            f"t={elapsed:0.1f}s"
        )
--- a/lexicon_it.json
+++ b/lexicon_it.json
--- a/lexicon_it_semantic.json
+++ b/lexicon_it_semantic.json
--- a/main.py
+++ b/main.py
@@ -25,6 +25,72 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
 }

 DEFAULT_TOPIC = "general"
+# Default for --initial-word-count: as many seeds as there are built-in WORDS.
+DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
+# Word endings treated as "abstract-looking": they lower a word's relevance for
+# concrete topics and exclude it from seeds and from the general fill support.
+ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
+# Parts of speech accepted when the fill vocabulary is loaded from the lexicon.
+FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
+# Lowest "quality_score" and greatest length accepted for off-topic fill words.
+GENERAL_FILL_MIN_QUALITY = 6
+GENERAL_FILL_MAX_LENGTH = 10
+# Number of loosely related (not strongly on-topic) words kept in the fill vocabulary.
+SOFT_RELATED_FILL_LIMIT = 120
+# Default for --themed-fill-count.
+DEFAULT_THEMED_FILL_WORD_COUNT = 10
+# Topics considered concrete: for these, abstract-looking suffixes are penalised
+# and seed words must be nouns.
+CONCRETE_TOPICS = {
+    "animals",
+    "plants",
+    "nature",
+    "ecology",
+    "geography",
+    "weather",
+    "sea",
+    "mountain",
+    "health",
+    "science",
+    "sport",
+    "history",
+    "school",
+    "cinema",
+    "literature",
+    "food",
+    "city",
+    "transport",
+    "work",
+    "home",
+}
+
+# Per-topic word fragments: a word counts as matching the topic's roots only if
+# it contains at least one of them (plain substring test).
+TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
+    "transport": (
+        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
+        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
+        "trattor", "carr", "vap", "rota", "ruot",
+    ),
+    "animals": (
+        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
+        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
+    ),
+    "nature": (
+        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
+        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
+        "litoral", "piogg", "nev", "onda", "clim",
+    ),
+    "cinema": (
+        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
+        "comic", "div", "docu", "pellic", "spettacol",
+    ),
+}
+
+# Per-topic word fragments that veto a word for the topic even when one of the
+# required fragments above also occurs in it.
+TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
+    "transport": (
+        "intervist", "intratten", "speriment", "stermin", "investig",
+        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
+        "eccit", "traduz", "fluttu", "sollecit",
+    ),
+    "animals": (
+        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
+        "estens", "diffus", "difensor", "spessor", "maggior",
+    ),
+    "cinema": (
+        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
+        "malumor", "eversor",
+    ),
+}


 def parse_args() -> argparse.Namespace:
@@ -95,6 +161,18 @@ def parse_args() -> argparse.Namespace:
        default=DEFAULT_TOPIC,
        help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
    )
+    parser.add_argument(
+        "--initial-word-count",
+        type=int,
+        default=DEFAULT_INITIAL_WORD_COUNT,
+        help="Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
+    )
+    parser.add_argument(
+        "--themed-fill-count",
+        type=int,
+        default=DEFAULT_THEMED_FILL_WORD_COUNT,
+        help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
+    )
    return parser.parse_args()


@@ -165,42 +243,328 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
    return path.read_text(encoding="utf-8").splitlines()


-def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
-    if not LEXICON_OUTPUT_PATH.exists():
-        lexicon = build_lexicon()
-        LEXICON_OUTPUT_PATH.write_text(
+def load_semantic_payload() -> Dict[str, object]:
+    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
+        lexicon = build_semantic_lexicon()
+        SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
            json.dumps(lexicon, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
+    return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))

-    payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
+
+def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
+    """Return the entry's lexical and semantic topics as two lowercase sets.
+
+    The first set comes from the entry's "topics" list, the second from
+    "semantic_topics" inside its "semantic" section; a missing list gives
+    an empty set.
+    """
+
+    def lowered(values: object) -> set[str]:
+        return {str(value).lower() for value in values}
+
+    lexical_topics = lowered(entry.get("topics", []))
+    semantic_section = entry.get("semantic", {})
+    return lexical_topics, lowered(semantic_section.get("semantic_topics", []))
+
+
+def matches_topic_roots(word: str, selected_topic: str) -> bool:
+    """Tell whether *word* contains a root of the topic and no blocked fragment.
+
+    Topics without an entry in TOPIC_SEED_REQUIRED_SUBSTRINGS never match.
+    """
+    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
+        if fragment in word:
+            return False
+    for fragment in TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ()):
+        if fragment in word:
+            return True
+    return False
+
+
+def topic_relevance(entry: Dict[str, object], topic: str) -> int:
+    """Score how relevant *entry* is to *topic*; the result may be negative.
+
+    For DEFAULT_TOPIC every entry scores 20. Otherwise the score is the sum
+    of the bonuses and penalties listed below that apply to the entry.
+    """
+    selected_topic = topic.strip().lower()
+    if selected_topic == DEFAULT_TOPIC:
+        return 20
+
+    word = str(entry.get("form", ""))
+    topics, semantic_topics = entry_topics(entry)
+    blocked_parts = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
+    adjustments = (
+        (100, selected_topic in topics),
+        (45, selected_topic in semantic_topics),
+        (35, matches_topic_roots(word, selected_topic)),
+        (5, "general" in topics),
+        (-80, any(part in word for part in blocked_parts)),
+        (-15, selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES)),
+    )
+    return sum(points for points, applies in adjustments if applies)
+
+
+def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
+    """Return 100 when *topic* is one of the entry's lexical topics, else 0.
+
+    For DEFAULT_TOPIC every entry scores 20.
+    """
+    selected_topic = topic.strip().lower()
+    if selected_topic == DEFAULT_TOPIC:
+        return 20
+    lexical_topics = entry_topics(entry)[0]
+    if selected_topic in lexical_topics:
+        return 100
+    return 0
+
+
+def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
+    """Build the sort key used to rank fill words for *topic*.
+
+    The key is (topic relevance, quality score, part-of-speech bonus,
+    semantic-match bonus, length bonus, word); larger keys rank first when
+    sorted in reverse.
+    """
+    word = str(entry.get("form", ""))
+    quality = int(entry.get("quality_score", 0))
+
+    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4, "PREP": 2, "CONJ": 2}
+    pos_bonus = pos_weights.get(str(entry.get("pos", "")), 0)
+
+    semantic_bonus = 3 if entry.get("semantic", {}).get("matched") else 0
+
+    word_length = len(word)
+    if 4 <= word_length <= 10:
+        length_bonus = 3
+    elif 2 <= word_length <= 13:
+        length_bonus = 1
+    else:
+        length_bonus = -4
+
+    relevance = topic_relevance(entry, topic)
+    return (relevance, quality, pos_bonus, semantic_bonus, length_bonus, word)
+
+
+def is_general_fill_support(entry: Dict[str, object]) -> bool:
+    """Tell whether *entry* may be used as off-topic support during the fill.
+
+    The entry needs a "quality_score" of at least GENERAL_FILL_MIN_QUALITY, a
+    form no longer than GENERAL_FILL_MAX_LENGTH that does not end in an
+    abstract-looking suffix, and DEFAULT_TOPIC among its topics.
+    """
+    word = str(entry.get("form", ""))
+    quality_ok = int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
+    if not quality_ok or len(word) > GENERAL_FILL_MAX_LENGTH or word.endswith(ABSTRACTISH_SUFFIXES):
+        return False
+    lowered_topics = {str(item).lower() for item in entry.get("topics", [])}
+    return DEFAULT_TOPIC in lowered_topics
+
+
+def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
+    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()

-    def matches(entry: Dict[str, object], selected_topic: str) -> bool:
-        topics = [str(item).lower() for item in entry.get("topics", [])]
-        return selected_topic in topics
-
-    words = [
-        entry["form"]
+    eligible = [
+        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
-        and matches(entry, normalized_topic)
+        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]

-    if words:
-        return words
-
-    if normalized_topic != DEFAULT_TOPIC:
-        return [
-            entry["form"]
-            for entry in payload.get("entries", [])
-            if entry.get("allowed_in_crossword", False)
-            and int(entry.get("difficulty_word", 5)) <= level
-            and matches(entry, DEFAULT_TOPIC)
+    if normalized_topic == DEFAULT_TOPIC:
+        selected = eligible
+    else:
+        strong_topic = [entry for entry in eligible if strong_topic_relevance(entry, normalized_topic) > 0]
+        soft_related = [
+            entry
+            for entry in eligible
+            if entry not in strong_topic
+            and topic_relevance(entry, normalized_topic) > 0
+            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
+            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
+            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
+        soft_related.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)

-    return words
+        general_support = [
+            entry
+            for entry in eligible
+            if entry not in strong_topic
+            and is_general_fill_support(entry)
+        ]
+        general_support.sort(key=lambda entry: lexical_fill_score(entry, DEFAULT_TOPIC), reverse=True)
+        selected = strong_topic + soft_related[:SOFT_RELATED_FILL_LIMIT]
+        selected += [entry for entry in general_support if entry not in selected]
+
+    selected.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
+    return selected
+
+
+def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
+    """Return the "form" of every entry chosen by load_filtered_entries, in the same order."""
+    words: List[str] = []
+    for entry in load_filtered_entries(level, topic):
+        words.append(str(entry["form"]))
+    return words
+
+
+def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
+    """Map each vocabulary word to a copy of its semantic lexicon entry.
+
+    Every copy additionally carries "_topic_relevance" and
+    "_strong_topic_relevance" computed for *topic*. Words absent from the
+    lexicon are left out; if a form occurs more than once, the last
+    occurrence wins.
+    """
+    payload = load_semantic_payload()
+    wanted = set(words)
+    metadata: Dict[str, Dict[str, object]] = {}
+    for entry in payload.get("entries", []):
+        form = str(entry.get("form", ""))
+        if form in wanted:
+            annotated = dict(entry)
+            annotated["_topic_relevance"] = topic_relevance(annotated, topic)
+            annotated["_strong_topic_relevance"] = strong_topic_relevance(annotated, topic)
+            metadata[form] = annotated
+    return metadata
+
+
+def select_initial_words(level: int, topic: str, count: int) -> List[str]:
+    """Pick the seed words used to build the initial grid.
+
+    Lexicon entries that are allowed in crosswords and not harder than
+    *level* are narrowed to the requested topic, filtered for
+    seed-friendliness and then chosen greedily so that each new seed shares
+    as many letters as possible with the seeds already chosen.
+
+    When the requested topic has no eligible entry, selection falls back to
+    DEFAULT_TOPIC and proceeds exactly as if that topic had been requested.
+    (Filtering the fallback pool against the original topic would reject
+    every entry and leave the grid without seeds.)
+
+    Args:
+        level: Highest accepted "difficulty_word" value.
+        topic: Requested topic; surrounding blanks and case are ignored.
+        count: Number of seed words wanted.
+
+    Returns:
+        At most *count* words. For DEFAULT_TOPIC (requested or as fallback)
+        a short list is topped up from the unfiltered pool and then from
+        WORDS; for other topics the list may be shorter than *count*.
+    """
+    payload = load_semantic_payload()
+    normalized_topic = topic.strip().lower()
+    abstract_like_topics = {"abstract", "actions"}
+
+    def has_topic(entry: Dict[str, object], name: str) -> bool:
+        return name in entry_topics(entry)[0]
+
+    def word_score(
+        entry: Dict[str, object], selected_topic: str
+    ) -> tuple[int, int, int, int, int, int, int, str]:
+        # Sort key of a seed candidate; the word itself is the final tie-breaker.
+        topics, _ = entry_topics(entry)
+        quality = int(entry.get("quality_score", 0))
+        semantic = entry.get("semantic", {})
+        word = str(entry.get("form", ""))
+        pos = str(entry.get("pos", ""))
+        on_topic = selected_topic in topics
+
+        topic_bonus = (4 if on_topic else 0) + (1 if "general" in topics else 0)
+        pos_bonus = {"NOUN": 4, "ADJ": 1}.get(pos, 0)
+
+        # Concrete-leaning topics push down abstract, action-like and non-noun words.
+        concreteness_penalty = 0
+        if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
+            if "abstract" in topics and not on_topic:
+                concreteness_penalty -= 3
+            if "actions" in topics and not on_topic:
+                concreteness_penalty -= 2
+            if word.endswith(ABSTRACTISH_SUFFIXES):
+                concreteness_penalty -= 4
+            if pos != "NOUN":
+                concreteness_penalty -= 3
+
+        length = len(word)
+        if 5 <= length <= 10:
+            length_bonus = 3
+        elif 4 <= length <= 12:
+            length_bonus = 1
+        else:
+            length_bonus = -2
+
+        return (
+            topic_bonus,
+            pos_bonus,
+            concreteness_penalty,
+            quality,
+            1 if semantic.get("matched") else 0,
+            min(3, len(semantic.get("glosses", []))),
+            length_bonus,
+            word,
+        )
+
+    def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
+        word = str(entry.get("form", ""))
+        if len(word) < 4 or len(word) > 13:
+            return False
+        concrete = selected_topic in CONCRETE_TOPICS
+        if concrete and str(entry.get("pos", "")) != "NOUN":
+            return False
+        if concrete and word.endswith(ABSTRACTISH_SUFFIXES):
+            return False
+        if any(part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())):
+            return False
+        required_substrings = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
+        if (
+            concrete
+            and required_substrings
+            and selected_topic != DEFAULT_TOPIC
+            and not any(part in word for part in required_substrings)
+        ):
+            return False
+        if selected_topic != DEFAULT_TOPIC and not has_topic(entry, selected_topic):
+            return False
+        return True
+
+    def overlap_score(left: str, right: str) -> int:
+        # Number of letters the two words have in common, counting repetitions.
+        shared = set(left) & set(right)
+        return sum(min(left.count(ch), right.count(ch)) for ch in shared)
+
+    def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
+        if not entries or target_count <= 0:
+            return []
+
+        # Each score is computed once; its last element is the word itself.
+        ranked_scores = sorted((word_score(entry, selected_topic) for entry in entries), reverse=True)
+        chosen: List[str] = [ranked_scores[0][-1]]
+        chosen_lookup = {chosen[0]}
+
+        while len(chosen) < target_count:
+            best_key = None
+            best_word = None
+            for score in ranked_scores:
+                word = score[-1]
+                if word in chosen_lookup:
+                    continue
+                overlaps = [overlap_score(word, existing) for existing in chosen]
+                strongest = max(overlaps)
+                same_length_penalty = -sum(1 for existing in chosen if len(existing) == len(word))
+                key = (
+                    1 if strongest >= 2 else 0,
+                    sum(overlaps),
+                    strongest,
+                    same_length_penalty,
+                    len(set(word)),
+                    score,
+                )
+                if best_key is None or key > best_key:
+                    best_key = key
+                    best_word = word
+            if best_word is None:
+                break
+            chosen.append(best_word)
+            chosen_lookup.add(best_word)
+
+        return chosen
+
+    eligible = [
+        entry
+        for entry in payload.get("entries", [])
+        if entry.get("allowed_in_crossword", False)
+        and int(entry.get("difficulty_word", 5)) <= level
+    ]
+
+    topical_pool = [entry for entry in eligible if has_topic(entry, normalized_topic)]
+    if normalized_topic == DEFAULT_TOPIC or not topical_pool:
+        # Either the general topic was requested or the topic is unknown to
+        # the lexicon: from here on behave as for the general topic.
+        effective_topic = DEFAULT_TOPIC
+        pool = [entry for entry in eligible if has_topic(entry, DEFAULT_TOPIC)]
+    else:
+        effective_topic = normalized_topic
+        pool = topical_pool
+
+    strict_pool = [entry for entry in pool if is_seed_friendly(entry, effective_topic)]
+    selected = pick_seed_set(strict_pool, effective_topic, count)
+
+    if effective_topic == DEFAULT_TOPIC:
+        if len(selected) < count:
+            # First top-up: the same pool without the seed-friendliness filter.
+            for word in pick_seed_set(pool, effective_topic, count):
+                if len(selected) >= count:
+                    break
+                if word not in selected:
+                    selected.append(word)
+        if len(selected) < count:
+            # Last resort: the built-in word list.
+            for word in WORDS:
+                if word in selected:
+                    continue
+                selected.append(word)
+                if len(selected) >= count:
+                    break
+
+    return selected[:count]


 def main() -> None:
@@ -209,9 +573,10 @@ def main() -> None:
    ensure_lexicon(args)
    ensure_semantic_lexicon(args)
    difficulty_level = parse_difficulty(args.difficulty)
+    initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)

    generator = CrosswordGenerator(
-        WORDS,
+        initial_words,
        diffxy=args.diffxy,
        time_limit_seconds=args.time_limit,
        max_candidates_per_word=args.max_candidates,
@@ -220,6 +585,7 @@ def main() -> None:
    initial_state = generator.solve()

    print("Griglia iniziale")
+    print(f"Parole-seme richieste: {len(initial_words)}")
    print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
    print(f"Intersezioni: {initial_state.intersections}")
    print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
@@ -229,17 +595,24 @@ def main() -> None:
        print(f"Seed: {args.seed}")
    print()
    print(render_grid(initial_state.grid, initial_state.placements))
+    print()
+    print("Parole-seme selezionate:")
+    print(", ".join(initial_words))

    if args.skip_fill:
        return

    vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
    metadata = load_vocabulary_metadata()
+    semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, args.topic) if not args.vocabulary else {}
    filler = CrosswordFiller(
        initial_state,
        vocabulary,
        target_empty_ratio=args.target_empty_ratio,
        vocabulary_metadata=metadata,
+        semantic_metadata=semantic_metadata,
+        selected_topic=args.topic,
+        max_themed_fill_words=args.themed_fill_count,
        seed=args.seed,
    )
    final_state = filler.fill()