feat: collega il lessico semantico al filler

2026-04-14 18:56:17 +02:00
parent 77c7e709b6
commit b172b9c04b
15 changed files with 2255563 additions and 9 deletions
--- a/pycache/build_lexicon.cpython-313.pyc
+++ b/pycache/build_lexicon.cpython-313.pyc
--- a/pycache/build_semantic_lexicon.cpython-313.pyc
+++ b/pycache/build_semantic_lexicon.cpython-313.pyc
--- a/pycache/crossword_filler.cpython-313.pyc
+++ b/pycache/crossword_filler.cpython-313.pyc
--- a/pycache/crossword_generator.cpython-313.pyc
+++ b/pycache/crossword_generator.cpython-313.pyc
--- a/build_lexicon.py
+++ b/build_lexicon.py
@@ -0,0 +1,249 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List
+
+from build_vocabulary import (
+    FILTERED_OUTPUT_PATH,
+    METADATA_OUTPUT_PATH,
+    build_vocabulary,
+)
+
+
+# Destination of the generated base lexicon; the file sits next to this module.
+LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it.json")
+
+# Maps a vocabulary metadata tag to the part-of-speech label stored in the lexicon.
+# infer_pos() returns the label of the first tag that appears in this table.
+POS_BY_TAG = {
+    "function": "PREP",
+    "verb_infinitive": "VERB",
+    "adverb": "ADV",
+    "adjective_like": "ADJ",
+    "noun_like": "NOUN",
+}
+
+# (minimum quality, register label) pairs listed from the highest threshold down.
+# infer_register() returns the label of the first threshold the quality reaches,
+# so the order of this list matters.
+REGISTER_BY_QUALITY = [
+    (8, "common"),
+    (5, "standard"),
+    (3, "formal"),
+    (0, "rare"),
+]
+
+# Exact-match word sets per topic. infer_topics() adds a topic when the word is
+# a member of its set; the same word may be listed under several topics.
+TOPIC_KEYWORDS = {
+    "animals": {
+        "cane", "gatto", "lupo", "volpe", "orso", "pesce", "cervo", "cavallo", "capra", "pecora",
+        "leone", "tigre", "zebra", "aquila", "falco", "serpente", "vipera", "gabbiano", "anatra",
+        "passero", "coniglio", "castoro", "bruco", "cigno", "asino", "alpaca",
+    },
+    "plants": {
+        "albero", "pianta", "fiore", "foglia", "radice", "seme", "bosco", "selva", "ulivo", "quercia",
+        "ortica", "edera", "aloe", "tulipano", "spiga", "polline", "grano", "erba",
+    },
+    "nature": {
+        "natura", "bosco", "selva", "montagna", "collina", "roccia", "pietra", "fiume", "lago", "mare",
+        "riva", "fonte", "onda", "vento", "fuoco", "terra", "sole", "luna", "aurora", "nuvola",
+        "nebbia", "deserto", "isola", "greto", "radice", "fiore", "foglia", "erba", "zolla",
+    },
+    "ecology": {
+        "ambiente", "ecologia", "natura", "bosco", "energia", "acqua", "terra", "clima", "sorgere",
+        "fonte", "solare", "verde", "ulivo", "pianta", "polline", "grano", "radice",
+    },
+    "geography": {
+        "montagna", "collina", "isola", "deserto", "equatore", "ovest", "oriente", "riva", "mare",
+        "lago", "fiume", "ponte", "confine", "quota", "pianeta", "roccia", "greto",
+    },
+    "weather": {
+        "vento", "nebbia", "aurora", "pioggia", "sole", "nuvola", "tempesta", "brina", "sereno",
+        "clima", "goccia",
+    },
+    "sea": {
+        "mare", "onda", "vela", "barca", "porto", "pesce", "ancora", "scoglio", "riva", "veliero",
+    },
+    "mountain": {
+        "montagna", "quota", "vetta", "roccia", "greto", "collina", "sentiero", "alpino",
+    },
+    "health": {
+        "salute", "febbre", "medico", "cura", "respiro", "diuretico", "anemico", "vigore", "energia",
+        "dente", "cuore", "corpo", "viso",
+    },
+    "science": {
+        "atomo", "energia", "metodo", "equatore", "digitale", "misura", "tecnica", "triangolo",
+        "microfibra", "microscopio", "algoritmo", "motore", "materia", "liquido",
+    },
+    "sport": {
+        "calcio", "atleta", "sportivo", "gol", "pallone", "gara", "trionfo", "primato", "allenatore",
+        "stadio", "squadra", "rete",
+    },
+    "history": {
+        "re", "principe", "regno", "impero", "senato", "console", "legione", "vittoria", "epoca",
+        "origine", "ritorno",
+    },
+    "school": {
+        "libro", "quaderno", "lezione", "classe", "studiare", "maestro", "scuola", "esame", "penna",
+        "aula", "figura", "titolo",
+    },
+    "cinema": {
+        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
+        "finale", "figura",
+    },
+    "literature": {
+        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
+        "leggenda", "scrivere", "titolo",
+    },
+    "food": {
+        "pane", "cacao", "gelato", "burro", "latte", "mandorla", "nocciola", "cena", "pranzo",
+        "zuppa", "zucchero", "acqua", "fiore", "frutto",
+    },
+    "city": {
+        "porta", "strada", "piazza", "ponte", "palazzo", "cortile", "villaggio", "citta", "urbano",
+        "casale", "balcone", "finestra", "stazione",
+    },
+    "transport": {
+        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
+        "volo", "aeroporto", "vettura",
+    },
+    "work": {
+        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
+        "mestiere", "servire",
+    },
+    "home": {
+        "casa", "finestra", "porta", "parete", "divano", "tavolo", "sedia", "camera", "balcone",
+        "camino", "tetto", "cortile", "vasca",
+    },
+}
+
+# Word endings per topic. infer_topics() adds the topic when the word ends with
+# any of the listed strings, so a whole word listed here also matches every
+# longer word that merely ends with it.
+# NOTE(review): the "actions" endings are applied to every word regardless of
+# its tags, so non-verbs such as "mare" also receive "actions" - confirm this
+# is intended.
+TOPIC_SUFFIXES = {
+    "actions": ("are", "ere", "ire"),
+    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
+    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
+    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
+    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
+    "geography": ("montagna", "isola", "deserto", "confine"),
+    "city": ("strada", "palazzo", "porta", "ponte"),
+}
+
+
+def infer_pos(tags: List[str]) -> str:
+    """Return the part of speech for the first tag listed in POS_BY_TAG.
+
+    Tags are examined in the order given; "NOUN" is returned when none of
+    them maps to a part of speech.
+    """
+    known_labels = (POS_BY_TAG[tag] for tag in tags if tag in POS_BY_TAG)
+    return next(known_labels, "NOUN")
+
+
+def infer_topics(word: str, tags: List[str]) -> List[str]:
+    """Return the alphabetically sorted topics inferred for a word.
+
+    Every word gets "general". Further topics come from the "verb_infinitive"
+    tag, a fixed list of abstract-noun endings, exact membership in
+    TOPIC_KEYWORDS and the endings in TOPIC_SUFFIXES. Some topics imply
+    others: plants imply ecology, and any of animals, plants, sea, mountain,
+    weather or geography implies nature.
+    """
+    abstract_endings = ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")
+    found = {"general"}
+
+    if "verb_infinitive" in tags:
+        found.add("actions")
+    if word.endswith(abstract_endings):
+        found.add("abstract")
+
+    found.update(topic for topic, keywords in TOPIC_KEYWORDS.items() if word in keywords)
+    # str.endswith accepts the suffix tuples stored in TOPIC_SUFFIXES directly.
+    found.update(topic for topic, suffixes in TOPIC_SUFFIXES.items() if word.endswith(suffixes))
+
+    # Implied topics; the result is a set, so the order of these checks is irrelevant.
+    if "plants" in found:
+        found.add("ecology")
+    if found & {"animals", "plants", "sea", "mountain", "weather", "geography"}:
+        found.add("nature")
+
+    return sorted(found)
+
+
+def infer_register(quality: int) -> str:
+    """Return the register label of the first threshold that `quality` reaches.
+
+    Thresholds are tried in the order of REGISTER_BY_QUALITY; "rare" is the
+    fallback when no threshold is met.
+    """
+    reached = (label for threshold, label in REGISTER_BY_QUALITY if quality >= threshold)
+    return next(reached, "rare")
+
+
+def frequency_from_quality(quality: int, index: int, total: int) -> tuple[int, float]:
+    """Derive a 1-based rank and a 0..1 frequency score for a word.
+
+    The score combines the word's position in the list (70% weight, first
+    word highest) with its quality clamped to 0..10 (up to +0.5), capped at
+    1.0 and rounded to four decimals.
+    """
+    # max(1, ...) avoids a division by zero when the list holds a single word.
+    span = max(1, total - 1)
+    normalized_rank = 1.0 - index / span
+    clamped_quality = min(max(quality, 0), 10)
+    quality_boost = clamped_quality / 20.0
+    capped = min(1.0, normalized_rank * 0.7 + quality_boost)
+    return index + 1, round(capped, 4)
+
+
+def load_words() -> List[str]:
+    """Return the non-blank, stripped lines of the filtered vocabulary file.
+
+    The vocabulary is rebuilt first when either the filtered word list or
+    its metadata file is missing.
+    """
+    if not (FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists()):
+        build_vocabulary()
+
+    raw_lines = FILTERED_OUTPUT_PATH.read_text(encoding="utf-8").splitlines()
+    stripped_lines = (line.strip() for line in raw_lines)
+    return [word for word in stripped_lines if word]
+
+
+def load_metadata() -> Dict[str, Dict[str, object]]:
+    """Return the parsed vocabulary metadata JSON, rebuilding the vocabulary if the file is missing."""
+    if not METADATA_OUTPUT_PATH.exists():
+        build_vocabulary()
+    raw_json = METADATA_OUTPUT_PATH.read_text(encoding="utf-8")
+    return json.loads(raw_json)
+
+
+def build_lexicon() -> Dict[str, object]:
+    """Assemble the base lexicon payload from the filtered vocabulary and its metadata.
+
+    Returns:
+        A dict with a "meta" header (language, version, sources, generation
+        timestamp, entry count) and an "entries" list holding one record per
+        vocabulary word, in vocabulary order.
+    """
+    vocabulary = load_words()
+    metadata = load_metadata()
+    vocabulary_size = len(vocabulary)
+
+    def make_entry(position: int, word: str) -> Dict[str, object]:
+        """Build the lexicon record for the word at `position` in the vocabulary."""
+        word_meta = metadata.get(word, {})
+        tags = list(word_meta.get("tags", []))
+        quality = int(word_meta.get("quality", 0))
+        rank, score = frequency_from_quality(quality, position, vocabulary_size)
+        # Quality is folded into a 1..5 band, then mirrored so that a higher
+        # quality yields a lower difficulty.
+        quality_band = max(1, min(5, quality // 2 + 1))
+        return {
+            "form": word,
+            "normalized_form": word,
+            "lemma": word,
+            "pos": infer_pos(tags),
+            "length": len(word),
+            "frequency_rank": rank,
+            "frequency_score": score,
+            "difficulty_word": max(1, min(5, 6 - quality_band)),
+            "allowed_in_crossword": True,
+            "quality_score": max(0, min(10, quality)),
+            "topics": infer_topics(word, tags),
+            "morph_features": {},
+            "register": infer_register(quality),
+            "source_flags": ["from_filtered_vocabulary", "from_metadata_heuristics"],
+            "crossword_flags": tags,
+            "notes": "",
+        }
+
+    entries = [make_entry(position, word) for position, word in enumerate(vocabulary)]
+    header = {
+        "language": "it",
+        "version": 1,
+        "sources": ["vocaboli_it_filtrato.txt", "vocaboli_it_metadata.json"],
+        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+        "entry_count": len(entries),
+    }
+    return {"meta": header, "entries": entries}
+
+
+def main() -> None:
+    """Build the base lexicon, write it to LEXICON_OUTPUT_PATH as JSON and print a summary."""
+    lexicon = build_lexicon()
+    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
+    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
+
+    entry_count = lexicon["meta"]["entry_count"]
+    print(f"Lessico generato: {LEXICON_OUTPUT_PATH}")
+    print(f"Voci generate: {entry_count}")
+
+
+if __name__ == "__main__":
+    main()
--- a/build_semantic_lexicon.py
+++ b/build_semantic_lexicon.py
@@ -0,0 +1,426 @@
+from __future__ import annotations
+
+import json
+import re
+import unicodedata
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
+
+
+# Location of the IWN-OMW LMF XML file, resolved relative to this module.
+IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
+# Destination of the semantically enriched lexicon, written next to this module.
+SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
+
+# Maps the lowercase partOfSpeech code of a Lemma element to the POS labels used
+# by the lexicon; parse_iwn() falls back to "NOUN" for any other code.
+IWN_POS_MAP = {
+    "n": "NOUN",
+    "v": "VERB",
+    "a": "ADJ",
+    "s": "ADJ",
+    "r": "ADV",
+}
+
+# Keyword sets per topic used by semantic_topics_from_text(): a topic is
+# assigned when any of its keywords equals one of the normalized tokens
+# extracted from glosses and lemmas. Keywords are written in the accent-free
+# lowercase form produced by normalize_word() (e.g. "citta").
+SEMANTIC_TOPIC_KEYWORDS = {
+    "animals": {
+        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
+        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
+    },
+    "plants": {
+        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
+        "ghianda", "bosco", "radice", "seme", "vegetale",
+    },
+    "nature": {
+        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
+        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
+    },
+    "ecology": {
+        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
+        "ecosistema", "acqua", "terra",
+    },
+    "geography": {
+        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
+        "isola", "mare", "lago", "fiume",
+    },
+    "weather": {
+        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
+        "atmosfera",
+    },
+    "sea": {
+        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
+        "nave", "fondale",
+    },
+    "mountain": {
+        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
+    },
+    "health": {
+        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
+        "terapia", "cervello", "respiro",
+    },
+    "science": {
+        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
+        "chimica", "biologia", "strumento",
+    },
+    "sport": {
+        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
+        "agonistico",
+    },
+    "history": {
+        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
+    },
+    "school": {
+        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
+        "quaderno", "aula",
+    },
+    "cinema": {
+        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
+        "teatro",
+    },
+    "literature": {
+        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
+        "letteratura",
+    },
+    "food": {
+        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
+        "gelato",
+    },
+    "city": {
+        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
+        "quartiere",
+    },
+    "transport": {
+        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
+        "aereo", "automobile",
+    },
+    "work": {
+        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
+    },
+    "home": {
+        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
+        "tavolo", "sedia",
+    },
+}
+
+
+def normalize_word(text: str) -> str:
+    """Fold `text` to a lowercase string containing only the letters a-z.
+
+    Accents are removed through NFKD decomposition followed by an ASCII
+    encode that drops whatever cannot be represented; underscores, spaces,
+    digits and punctuation are removed as well.
+    """
+    decomposed = unicodedata.normalize("NFKD", text)
+    ascii_bytes = decomposed.encode("ascii", "ignore")
+    lowered = ascii_bytes.decode("ascii").lower()
+    without_underscores = lowered.replace("_", "")
+    return re.sub(r"[^a-z]", "", without_underscores)
+
+
+def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
+    """Return the sorted topics whose keywords occur in any of the given texts.
+
+    Each part contributes two kinds of tokens: its whole normalized form (so
+    a single-word or underscore-joined lemma is still matched as one unit)
+    and every individual word it contains, accent-folded and lowercased.
+    A topic is reported when at least one of its SEMANTIC_TOPIC_KEYWORDS
+    equals one of those tokens.
+
+    Args:
+        parts: Glosses, lemmas or other free text fragments.
+
+    Returns:
+        Topic names in alphabetical order, without duplicates.
+    """
+    tokens = set()
+    for part in parts:
+        whole = normalize_word(part)
+        if not whole:
+            continue
+        tokens.add(whole)
+        # Tokenize BEFORE collapsing the text: normalize_word() strips spaces
+        # and punctuation, so splitting its output can only return the same
+        # concatenated blob and the words of a multi-word gloss would never
+        # match a keyword.
+        folded = (
+            unicodedata.normalize("NFKD", part)
+            .encode("ascii", "ignore")
+            .decode("ascii")
+            .lower()
+        )
+        tokens.update(re.findall(r"[a-z]+", folded))
+
+    return sorted(
+        topic
+        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
+        if not tokens.isdisjoint(keywords)
+    )
+
+
+def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
+    """Parse the IWN-OMW LMF XML file into synset and lemma lookup tables.
+
+    Returns:
+        A pair ``(synsets, entries_by_norm)``. ``synsets`` maps a synset id
+        to a dict with its id, ILI, definition, outgoing relations, lemmas
+        and POS. ``entries_by_norm`` maps a normalized written form to the
+        list of lexical entries (written form, POS, senses) that share it.
+    """
+    xml_text = IWN_XML_PATH.read_text(encoding="utf-8")
+    # NOTE(review): presumably repairs a tag closed by a stray comment
+    # terminator in the distributed file - confirm against the XML.
+    xml_text = xml_text.replace('"  -->', '">')
+    root = ET.fromstring(xml_text)
+
+    synsets: Dict[str, Dict[str, object]] = {}
+    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
+
+    # First pass: register every synset with its definition and relations.
+    # "lemmas" and "pos" are filled in by the lexical-entry pass below.
+    for synset in root.findall(".//Synset"):
+        synset_id = synset.attrib.get("id", "")
+        relations = [
+            {
+                "type": relation.attrib.get("relType", ""),
+                "target": relation.attrib.get("target", ""),
+                "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""),
+            }
+            for relation in synset.findall("SynsetRelation")
+        ]
+        synsets[synset_id] = {
+            "id": synset_id,
+            "ili": synset.attrib.get("ili", ""),
+            "definition": (synset.findtext("Definition") or "").strip(),
+            "relations": relations,
+            "lemmas": [],
+            "pos": "",
+        }
+
+    # Second pass: attach each lemma to the synsets of its senses and index
+    # the lexical entries by normalized written form.
+    for lexical_entry in root.findall(".//LexicalEntry"):
+        lemma = lexical_entry.find("Lemma")
+        if lemma is None:
+            continue
+
+        written_form = lemma.attrib.get("writtenForm", "").strip()
+        normalized_form = normalize_word(written_form)
+        if not normalized_form:
+            continue
+
+        # Unknown or missing part-of-speech codes default to "NOUN".
+        pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN")
+        senses = lexical_entry.findall("Sense")
+        sense_payloads = []
+
+        for sense in senses:
+            synset_id = sense.attrib.get("synset", "")
+            sense_id = sense.attrib.get("id", "")
+            # Senses pointing to a synset that was not registered are ignored.
+            if not synset_id or synset_id not in synsets:
+                continue
+
+            synsets[synset_id]["lemmas"].append(written_form)
+            # The synset POS is overwritten by every lemma that references it,
+            # so the last lexical entry processed wins.
+            synsets[synset_id]["pos"] = pos
+            sense_payloads.append(
+                {
+                    "sense_id": sense_id,
+                    "synset_id": synset_id,
+                    "pos": pos,
+                }
+            )
+
+        # Entries without any resolvable sense are not indexed.
+        if sense_payloads:
+            entries_by_norm[normalized_form].append(
+                {
+                    "written_form": written_form,
+                    "normalized_form": normalized_form,
+                    "pos": pos,
+                    "senses": sense_payloads,
+                }
+            )
+
+    # Remove duplicate lemmas per synset while keeping first-seen order.
+    for synset in synsets.values():
+        unique_lemmas = []
+        seen = set()
+        for lemma in synset["lemmas"]:
+            if lemma not in seen:
+                seen.add(lemma)
+                unique_lemmas.append(lemma)
+        synset["lemmas"] = unique_lemmas
+
+    return synsets, entries_by_norm
+
+
+def score_sense(
+    synset_id: str,
+    current_topics: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> int:
+    """Score how well a synset fits the topics already assigned to a word.
+
+    The score is ten points per topic shared between `current_topics` and
+    the topics inferred from the synset's definition and lemmas, plus the
+    length of the definition in characters. An unknown synset id scores 0.
+    """
+    record = synsets.get(synset_id, {})
+    gloss = str(record.get("definition", ""))
+    texts = [gloss, *record.get("lemmas", [])]
+    synset_topics = set(semantic_topics_from_text(texts))
+    word_topics = {str(topic) for topic in current_topics}
+    shared = synset_topics & word_topics
+    return 10 * len(shared) + len(gloss)
+
+
+def best_candidate(
+    candidates: List[Dict[str, object]],
+    expected_pos: str,
+    current_topics: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> Dict[str, object]:
+    """Return the candidate lexical entry with the highest combined score.
+
+    A candidate earns 100 points when its POS equals `expected_pos`, plus the
+    best score_sense() value among its senses (0 when it has none). Ties go
+    to the candidate that appears first. An empty `candidates` list raises
+    IndexError.
+    """
+
+    def total_score(candidate: Dict[str, object]) -> int:
+        """Sum the POS bonus and the best sense score of one candidate."""
+        sense_scores = [
+            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
+            for sense in candidate.get("senses", [])
+        ]
+        pos_points = 100 if candidate["pos"] == expected_pos else 0
+        return pos_points + max([0, *sense_scores])
+
+    scored = [(total_score(candidate), candidate) for candidate in candidates]
+    # sorted() is stable, so equally scored candidates keep their input order.
+    by_score = sorted(scored, key=lambda pair: pair[0], reverse=True)
+    return by_score[0][1]
+
+
+def dedupe_keep_order(items: Iterable[str]) -> List[str]:
+    """Return the stripped, non-empty string forms of `items` without duplicates.
+
+    Each item is converted with str() and stripped; the first occurrence of
+    every resulting text is kept, in input order.
+    """
+    cleaned = (str(item).strip() for item in items)
+    # dict keys keep insertion order, which yields first-seen deduplication.
+    return list(dict.fromkeys(text for text in cleaned if text))
+
+
+def resolve_related_lemmas(
+    synset_ids: Iterable[str],
+    relation_type: str,
+    synsets: Dict[str, Dict[str, object]],
+) -> List[str]:
+    """Collect the lemmas of every synset reached through one relation type.
+
+    For each known synset in `synset_ids`, follows its relations whose
+    "type" equals `relation_type` and gathers the lemmas of the targets that
+    exist in `synsets`. The result is deduplicated in first-seen order.
+    """
+    gathered = []
+    for synset_id in synset_ids:
+        source = synsets.get(synset_id) or {}
+        matching_targets = (
+            relation.get("target", "")
+            for relation in source.get("relations", [])
+            if relation.get("type") == relation_type
+        )
+        for target_id in matching_targets:
+            target = synsets.get(target_id)
+            if target:
+                gathered.extend(target.get("lemmas", []))
+    return dedupe_keep_order(gathered)
+
+
+def collect_relation_terms(
+    synset_ids: Iterable[str],
+    relation_types: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> Dict[str, List[str]]:
+    """Map each relation type to at most 20 lemmas related to the given synsets.
+
+    Args:
+        synset_ids: Ids of the source synsets; any iterable, including a
+            one-shot iterator.
+        relation_types: Relation type names to resolve.
+        synsets: Synset table as produced by parse_iwn().
+
+    Returns:
+        A dict keyed by relation type whose values are the first 20
+        deduplicated lemmas returned by resolve_related_lemmas().
+    """
+    # The ids are walked once per relation type; materialize them so that a
+    # generator argument is not exhausted after the first relation type.
+    source_ids = list(synset_ids)
+    return {
+        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:20]
+        for relation_type in relation_types
+    }
+
+
+def enrich_entry(
+    entry: Dict[str, object],
+    synsets: Dict[str, Dict[str, object]],
+    entries_by_norm: Dict[str, List[Dict[str, object]]],
+) -> Dict[str, object]:
+    """Attach IWN-OMW semantic data to a lexicon entry and return it.
+
+    The entry is modified in place: a "semantic" key is always added, and
+    "topics" is extended when a match is found.
+
+    Args:
+        entry: Lexicon record with at least "form", "lemma",
+            "normalized_form", "pos" and "topics".
+        synsets: Synset table as produced by parse_iwn().
+        entries_by_norm: Lexical entries indexed by normalized written form.
+
+    Returns:
+        The same `entry` object that was passed in.
+    """
+    # Look the word up under each distinct normalization of its three forms.
+    normalized_candidates = dedupe_keep_order(
+        [
+            normalize_word(str(entry.get("form", ""))),
+            normalize_word(str(entry.get("lemma", ""))),
+            normalize_word(str(entry.get("normalized_form", ""))),
+        ]
+    )
+    matches: List[Dict[str, object]] = []
+    for candidate_key in normalized_candidates:
+        matches.extend(entries_by_norm.get(candidate_key, []))
+
+    # No lexical entry found: record an empty semantic payload.
+    # Note that this payload has no "selected_form" key, unlike the matched one.
+    if not matches:
+        entry["semantic"] = {
+            "source": "iwn-omw",
+            "matched": False,
+            "match_count": 0,
+            "synsets": [],
+            "synonyms": [],
+            "raw_relation_terms": {},
+            "glosses": [],
+            "semantic_topics": [],
+        }
+        return entry
+
+    # Pick the best lexical entry, then order its senses from best to worst score.
+    selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets)
+    sorted_senses = sorted(
+        selected.get("senses", []),
+        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets),
+        reverse=True,
+    )
+    synset_ids = [sense["synset_id"] for sense in sorted_senses]
+    synset_payloads = []
+    glosses = []
+    synonyms = []
+    # Only the three best-scoring synsets are detailed in the payload.
+    for synset_id in synset_ids[:3]:
+        synset = synsets.get(synset_id)
+        if not synset:
+            continue
+        gloss = str(synset.get("definition", "")).strip()
+        glosses.append(gloss)
+        synset_payloads.append(
+            {
+                "id": synset_id,
+                "pos": synset.get("pos", ""),
+                "definition": gloss,
+                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
+                "raw_relation_terms": collect_relation_terms(
+                    [synset_id],
+                    ("hypernym", "hyponym", "similar"),
+                    synsets,
+                ),
+            }
+        )
+        synonyms.extend(synset.get("lemmas", []))
+
+    # Entry-level relation terms cover ALL senses of the selected entry,
+    # not just the three synsets detailed above.
+    raw_relation_terms = collect_relation_terms(
+        synset_ids,
+        ("hypernym", "hyponym", "similar"),
+        synsets,
+    )
+    # Drop the word itself from its synonyms and cap the list at 20.
+    synonyms = [
+        lemma
+        for lemma in dedupe_keep_order(synonyms)
+        if normalize_word(lemma) != normalize_word(str(entry.get("form", "")))
+    ][:20]
+    glosses = dedupe_keep_order(glosses)
+    # Existing topics come first, followed by topics inferred from glosses,
+    # synonyms and related lemmas.
+    semantic_topics = dedupe_keep_order(
+        list(entry.get("topics", []))
+        + semantic_topics_from_text(
+            glosses
+            + synonyms
+            + raw_relation_terms.get("hypernym", [])
+            + raw_relation_terms.get("hyponym", [])
+            + raw_relation_terms.get("similar", [])
+        )
+    )
+    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
+    entry["semantic"] = {
+        "source": "iwn-omw",
+        "matched": True,
+        "match_count": len(matches),
+        "selected_form": selected.get("written_form", ""),
+        "synsets": synset_payloads,
+        "synonyms": synonyms,
+        "raw_relation_terms": raw_relation_terms,
+        "glosses": glosses,
+        "semantic_topics": semantic_topics,
+    }
+    return entry
+
+
+def build_semantic_lexicon() -> Dict[str, object]:
+    """Enrich every entry of the base lexicon with IWN-OMW semantic data.
+
+    Returns:
+        A dict with a "meta" header and the enriched "entries" list.
+
+    Raises:
+        FileNotFoundError: If the base lexicon or the IWN-OMW XML file is missing.
+    """
+    if not LEXICON_OUTPUT_PATH.exists():
+        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
+    if not IWN_XML_PATH.exists():
+        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")
+
+    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
+    synsets, entries_by_norm = parse_iwn()
+
+    # Each entry is shallow-copied so the loaded payload's dicts are not the
+    # objects that enrich_entry() modifies.
+    enriched_entries = [
+        enrich_entry(dict(base_entry), synsets, entries_by_norm)
+        for base_entry in base_lexicon.get("entries", [])
+    ]
+
+    header = {
+        "language": "it",
+        "version": 1,
+        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
+        "sources": [
+            "lexicon_it.json",
+            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
+        ],
+        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+        "entry_count": len(enriched_entries),
+        "semantic_source": "IWN-OMW v1.0",
+    }
+    return {"meta": header, "entries": enriched_entries}
+
+
+def main() -> None:
+    """Build the semantic lexicon, write it as JSON and print match statistics."""
+    payload = build_semantic_lexicon()
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
+
+    # Count the entries for which enrich_entry() found a lexical match.
+    matched_flags = [
+        bool(entry.get("semantic", {}).get("matched"))
+        for entry in payload["entries"]
+    ]
+    matched = matched_flags.count(True)
+    entry_count = payload["meta"]["entry_count"]
+    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
+    print(f"Voci totali: {entry_count}")
+    print(f"Voci con match semantico: {matched}")
+
+
+if __name__ == "__main__":
+    main()
--- a/create_passo4.bat
+++ b/create_passo4.bat
@@ -0,0 +1,55 @@
+@echo off
+REM Switches to (or creates) the branch "passo4", stages every change, commits
+REM and pushes it to origin. An optional first argument overrides the commit message.
+setlocal
+
+REM Run from the directory that contains this script.
+cd /d "%~dp0"
+
+set "BRANCH_NAME=passo4"
+set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
+
+REM Use the first command-line argument as commit message when one is given.
+if not "%~1"=="" (
+    set "COMMIT_MSG=%~1"
+)
+
+echo Repository: %cd%
+echo Branch target: %BRANCH_NAME%
+echo Commit message: %COMMIT_MSG%
+echo.
+
+REM Abort when the current directory is not inside a Git work tree.
+git rev-parse --is-inside-work-tree >nul 2>nul
+if errorlevel 1 (
+    echo Errore: questa cartella non e' un repository Git.
+    exit /b 1
+)
+
+REM Create the branch when it does not exist locally, otherwise check it out.
+git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
+if errorlevel 1 (
+    echo Creo il branch %BRANCH_NAME%...
+    git checkout -b %BRANCH_NAME%
+) else (
+    echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
+    git checkout %BRANCH_NAME%
+)
+if errorlevel 1 exit /b 1
+
+echo.
+echo Aggiungo le modifiche...
+REM NOTE(review): "git add ." stages everything under this directory that is not
+REM ignored, including generated files - confirm that .gitignore covers them.
+git add .
+if errorlevel 1 exit /b 1
+
+echo.
+echo Creo il commit...
+git commit -m "%COMMIT_MSG%"
+if errorlevel 1 (
+    echo.
+    echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
+    exit /b 1
+)
+
+echo.
+echo Eseguo il push del branch %BRANCH_NAME%...
+REM -u sets origin/%BRANCH_NAME% as the upstream of the local branch.
+git push -u origin %BRANCH_NAME%
+if errorlevel 1 exit /b 1
+
+echo.
+echo Operazione completata con successo.
+endlocal
--- a/crossword_filler.py
+++ b/crossword_filler.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 import json
 from pathlib import Path
+import random
 import sys
 import time
 from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -72,6 +73,7 @@ class CrosswordFiller:
        *,
        target_empty_ratio: float = TARGET_EMPTY_RATIO,
        vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
+        seed: Optional[int] = None,
    ) -> None:
        self.state = state.copy()
        self.initial_state = state.copy()
@@ -81,6 +83,8 @@ class CrosswordFiller:
        self.vocabulary = self._normalize_vocabulary(vocabulary)
        self.words_by_length = self._index_vocabulary(self.vocabulary)
        self.vocabulary_metadata = vocabulary_metadata or {}
+        self.seed = seed
+        self.rng = random.Random(seed)
        self.bounds = self._compute_bounds(self.state.grid)
        self.total_cells = self._area(self.bounds)
        self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio)))
@@ -181,6 +185,10 @@ class CrosswordFiller:

        collected = list(unique.values())
        collected.sort(key=self._slot_priority, reverse=True)
+        if len(collected) > 1:
+            top_slice = collected[: min(MAX_SLOT_CANDIDATES, len(collected))]
+            self.rng.shuffle(top_slice)
+            collected = top_slice + collected[min(MAX_SLOT_CANDIDATES, len(collected)) :]
        return collected

    def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
@@ -292,7 +300,7 @@ class CrosswordFiller:
            return None

        candidates.sort(key=lambda item: item.local_score, reverse=True)
-        return candidates[0]
+        return self.rng.choice(candidates[: min(3, len(candidates))])

    def _word_quality(self, word: str) -> int:
        metadata = self.vocabulary_metadata.get(word)
--- a/crossword_generator.py
+++ b/crossword_generator.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 from dataclasses import dataclass
 import locale
+import random
 import sys
 import time
 from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -114,6 +115,7 @@ class CrosswordGenerator:
        max_candidates_per_word: int = 12,
        time_limit_seconds: float = 8.0,
        diffxy: int = DIFFXY,
+        seed: Optional[int] = None,
    ) -> None:
        normalized = [self._normalize(word) for word in words]
        unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
@@ -122,6 +124,8 @@ class CrosswordGenerator:
        self.max_candidates_per_word = max_candidates_per_word
        self.time_limit_seconds = time_limit_seconds
        self.diffxy = diffxy
+        self.seed = seed
+        self.rng = random.Random(seed)
        self.started_at = 0.0
        self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
        self.nodes_visited = 0
@@ -213,6 +217,8 @@ class CrosswordGenerator:
            reverse=True,
        )
        candidates = candidates[: self.max_candidates_per_word]
+        if len(candidates) > 1:
+            self.rng.shuffle(candidates)

        next_remaining = [word for word in remaining_words if word != next_word]
        for placement in candidates:
@@ -253,6 +259,10 @@ class CrosswordGenerator:
                word,
            ),
        )
+        if len(ranked_words) > 1:
+            top_slice = ranked_words[: min(5, len(ranked_words))]
+            self.rng.shuffle(top_slice)
+            ranked_words = top_slice + ranked_words[min(5, len(ranked_words)) :]

        best_word = ranked_words[0]
        best_key: Optional[Tuple[int, int, int, str]] = None
--- a/iwn-omw-main.zip
+++ b/iwn-omw-main.zip
--- a/iwn-omw-main/IWN-OMW-main/README.md
+++ b/iwn-omw-main/IWN-OMW-main/README.md
@@ -0,0 +1,21 @@
+# IWN-OMW
+This is the repository for the Open Italian WordNet, i.e. ItalWordNet versions compliant with the Open Multilingual WordNet guidelines and initiative.
+
+IWN-OMW is a new LMF version of the ItalWordNet resource converted and formatted according to the guidelines and requirements defined by the Open Multilingual Wordnet initiative (OMW, https://omwn.org/). This current version is derived from the ItalWordNet v.2. (IWN) database (http://hdl.handle.net/20.500.11752/ILC-62).
+
+NB: 'dc:relation', when used, contains links to equivalent Senses in the RDF version of the SIMPLE Italian lexicon.
+
+## Licence
+
+CC-BY-SA 4.0
+
+## Citation
+
+If you use this resource please cite: 
+
+Quochi, Valeria, Roberto Bartolini, and Monica Monachini (to appear) ‘ItalwordNet goes open’. *LiLT Special Issues on Open Multilingual
+WordNets*. CSLI Publications.
+
+And 
+
+Roventini, Adriana, Antonietta Alonge, Francesca Bertagna, Nicoletta Calzolari, J. Cancila, C. Girardi, Bernardo Magnini, Rita Marinelli, Manuela Speranza, and Antonio Zampolli (2003) "ItalwordNet: building a large semantic database for the automatic treatment of Italian". *Linguistica Computazionale* 18-19:745-791.
--- a/iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
+++ b/iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
--- a/lexicon_it.json
+++ b/lexicon_it.json
--- a/lexicon_it_semantic.json
+++ b/lexicon_it_semantic.json
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 from __future__ import annotations

 import argparse
+import json
 from pathlib import Path
-from typing import List
+from typing import Dict, List

 from build_vocabulary import (
    FILTERED_OUTPUT_PATH,
@@ -10,39 +11,61 @@ from build_vocabulary import (
    OUTPUT_PATH,
    build_vocabulary,
 )
+from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
+from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
 from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
 from crossword_generator import CrosswordGenerator, WORDS, render_grid


+DIFFICULTY_ALIASES: Dict[str, int] = {
+    "easy": 1,
+    "medium": 2,
+    "hard": 4,
+    "expert": 5,
+}
+
+DEFAULT_TOPIC = "general"
+
+
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
    parser.add_argument(
        "--build-vocabulary",
        action="store_true",
-        help="Rigenera il vocabolario esteso, filtrato e i metadati prima dell'esecuzione.",
+        help="Rigenera i file lessicali intermedi: vocabolario esteso, filtrato e metadati.",
+    )
+    parser.add_argument(
+        "--build-lexicon",
+        action="store_true",
+        help="Rigenera `lexicon_it.json` prima dell'esecuzione.",
    )
    parser.add_argument(
        "--skip-fill",
        action="store_true",
-        help="Genera solo la griglia iniziale senza eseguire il filler.",
+        help="Genera solo la griglia iniziale e salta il riempimento con il filler.",
+    )
+    parser.add_argument(
+        "--build-semantic-lexicon",
+        action="store_true",
+        help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
    )
    parser.add_argument(
        "--vocabulary",
        type=Path,
        default=None,
-        help="Percorso opzionale a un vocabolario personalizzato.",
+        help="Percorso opzionale a un vocabolario testuale personalizzato da usare al posto di quello di default.",
    )
    parser.add_argument(
        "--target-empty-ratio",
        type=float,
        default=1 / 6,
-        help="Rapporto target di celle vuote residue dopo il filler.",
+        help="Rapporto target di celle vuote residue dopo il filler. Esempio: 0.1667 lascia circa un sesto di celle vuote.",
    )
    parser.add_argument(
        "--time-limit",
        type=float,
        default=8.0,
-        help="Tempo massimo in secondi per la fase di generazione iniziale.",
+        help="Tempo massimo in secondi per la fase di generazione iniziale della griglia.",
    )
    parser.add_argument(
        "--max-candidates",
@@ -54,7 +77,23 @@ def parse_args() -> argparse.Namespace:
        "--diffxy",
        type=int,
        default=7,
-        help="Differenza massima preferita tra larghezza e altezza della griglia.",
+        help="Differenza massima preferita tra larghezza e altezza della griglia iniziale.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Seed casuale per ottenere varianti riproducibili del cruciverba: stesso seed, stesso risultato.",
+    )
+    parser.add_argument(
+        "--difficulty",
+        default="medium",
+        help="Difficolta lessicale del filler. Alias testuali: easy, medium, hard, expert. Internamente mappati a livelli numerici 1-5.",
+    )
+    parser.add_argument(
+        "--topic",
+        default=DEFAULT_TOPIC,
+        help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
    )
    return parser.parse_args()

@@ -73,21 +112,110 @@ def ensure_vocabulary(args: argparse.Namespace) -> None:
    print(f"- parole filtrate: {totals['filtered_words']}")


+def ensure_lexicon(args: argparse.Namespace) -> None:
+    needs_build = args.build_lexicon or not LEXICON_OUTPUT_PATH.exists()
+    if not needs_build:
+        return
+
+    lexicon = build_lexicon()
+    LEXICON_OUTPUT_PATH.write_text(
+        json.dumps(lexicon, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    print("Lessico rigenerato")
+    print(f"- file: {LEXICON_OUTPUT_PATH}")
+    print(f"- voci: {lexicon['meta']['entry_count']}")
+
+
+def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
+    needs_build = args.build_semantic_lexicon or not SEMANTIC_LEXICON_OUTPUT_PATH.exists()
+    if not needs_build:
+        return
+
+    lexicon = build_semantic_lexicon()
+    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
+        json.dumps(lexicon, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    matched = sum(1 for entry in lexicon["entries"] if entry.get("semantic", {}).get("matched"))
+    print("Lessico semantico rigenerato")
+    print(f"- file: {SEMANTIC_LEXICON_OUTPUT_PATH}")
+    print(f"- voci: {lexicon['meta']['entry_count']}")
+    print(f"- match semantici: {matched}")
+
+
+def parse_difficulty(value: str) -> int:
+    text = str(value).strip().lower()
+    if text in DIFFICULTY_ALIASES:
+        return DIFFICULTY_ALIASES[text]
+    try:
+        level = int(text)
+    except ValueError as exc:
+        raise SystemExit(
+            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
+        ) from exc
+    if not 1 <= level <= 5:
+        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
+    return level
+
+
 def load_selected_vocabulary(path: Path | None) -> List[str]:
    if path is None:
        return load_vocabulary()
    return path.read_text(encoding="utf-8").splitlines()


+def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
+    if not LEXICON_OUTPUT_PATH.exists():
+        lexicon = build_lexicon()
+        LEXICON_OUTPUT_PATH.write_text(
+            json.dumps(lexicon, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
+    normalized_topic = topic.strip().lower()
+
+    def matches(entry: Dict[str, object], selected_topic: str) -> bool:
+        topics = [str(item).lower() for item in entry.get("topics", [])]
+        return selected_topic in topics
+
+    words = [
+        entry["form"]
+        for entry in payload.get("entries", [])
+        if entry.get("allowed_in_crossword", False)
+        and int(entry.get("difficulty_word", 5)) <= level
+        and matches(entry, normalized_topic)
+    ]
+
+    if words:
+        return words
+
+    if normalized_topic != DEFAULT_TOPIC:
+        return [
+            entry["form"]
+            for entry in payload.get("entries", [])
+            if entry.get("allowed_in_crossword", False)
+            and int(entry.get("difficulty_word", 5)) <= level
+            and matches(entry, DEFAULT_TOPIC)
+        ]
+
+    return words
+
+
 def main() -> None:
    args = parse_args()
    ensure_vocabulary(args)
+    ensure_lexicon(args)
+    ensure_semantic_lexicon(args)
+    difficulty_level = parse_difficulty(args.difficulty)

    generator = CrosswordGenerator(
        WORDS,
        diffxy=args.diffxy,
        time_limit_seconds=args.time_limit,
        max_candidates_per_word=args.max_candidates,
+        seed=args.seed,
    )
    initial_state = generator.solve()

@@ -95,19 +223,24 @@ def main() -> None:
    print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
    print(f"Intersezioni: {initial_state.intersections}")
    print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
+    print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}")
+    print(f"Tema filler: {args.topic}")
+    if args.seed is not None:
+        print(f"Seed: {args.seed}")
    print()
    print(render_grid(initial_state.grid, initial_state.placements))

    if args.skip_fill:
        return

-    vocabulary = load_selected_vocabulary(args.vocabulary)
+    vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
    metadata = load_vocabulary_metadata()
    filler = CrosswordFiller(
        initial_state,
        vocabulary,
        target_empty_ratio=args.target_empty_ratio,
        vocabulary_metadata=metadata,
+        seed=args.seed,
    )
    final_state = filler.fill()