feat: collega il lessico semantico al filler

2026-04-14 18:56:17 +02:00
parent 77c7e709b6
commit b172b9c04b
15 changed files with 2255563 additions and 9 deletions
--- a/pycache/build_lexicon.cpython-313.pyc
+++ b/pycache/build_lexicon.cpython-313.pyc
--- a/pycache/build_semantic_lexicon.cpython-313.pyc
+++ b/pycache/build_semantic_lexicon.cpython-313.pyc
--- a/pycache/crossword_filler.cpython-313.pyc
+++ b/pycache/crossword_filler.cpython-313.pyc
--- a/pycache/crossword_generator.cpython-313.pyc
+++ b/pycache/crossword_generator.cpython-313.pyc
--- a/build_lexicon.py
+++ b/build_lexicon.py
@@ -0,0 +1,249 @@
 from __future__ import annotations
 import json
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, List
 from build_vocabulary import (
    FILTERED_OUTPUT_PATH,
    METADATA_OUTPUT_PATH,
    build_vocabulary,
 )
 # Destination of the generated base lexicon, stored next to this module.
 LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it.json")
 # Vocabulary metadata tag -> part-of-speech label written to each lexicon entry.
 POS_BY_TAG = {
    "function": "PREP",
    "verb_infinitive": "VERB",
    "adverb": "ADV",
    "adjective_like": "ADJ",
    "noun_like": "NOUN",
 }
 # (minimum quality, register label) pairs. infer_register scans them in order
 # and returns the label of the first threshold the quality reaches, so the
 # list is kept from the highest threshold to the lowest.
 REGISTER_BY_QUALITY = [
    (8, "common"),
    (5, "standard"),
    (3, "formal"),
    (0, "rare"),
 ]
 # Topic name -> set of exact word forms. infer_topics adds the topic when the
 # word being classified is a member of the set; a word may belong to several
 # topics (e.g. "bosco" is listed under plants, nature and ecology).
 TOPIC_KEYWORDS = {
    "animals": {
        "cane", "gatto", "lupo", "volpe", "orso", "pesce", "cervo", "cavallo", "capra", "pecora",
        "leone", "tigre", "zebra", "aquila", "falco", "serpente", "vipera", "gabbiano", "anatra",
        "passero", "coniglio", "castoro", "bruco", "cigno", "asino", "alpaca",
    },
    "plants": {
        "albero", "pianta", "fiore", "foglia", "radice", "seme", "bosco", "selva", "ulivo", "quercia",
        "ortica", "edera", "aloe", "tulipano", "spiga", "polline", "grano", "erba",
    },
    "nature": {
        "natura", "bosco", "selva", "montagna", "collina", "roccia", "pietra", "fiume", "lago", "mare",
        "riva", "fonte", "onda", "vento", "fuoco", "terra", "sole", "luna", "aurora", "nuvola",
        "nebbia", "deserto", "isola", "greto", "radice", "fiore", "foglia", "erba", "zolla",
    },
    "ecology": {
        "ambiente", "ecologia", "natura", "bosco", "energia", "acqua", "terra", "clima", "sorgere",
        "fonte", "solare", "verde", "ulivo", "pianta", "polline", "grano", "radice",
    },
    "geography": {
        "montagna", "collina", "isola", "deserto", "equatore", "ovest", "oriente", "riva", "mare",
        "lago", "fiume", "ponte", "confine", "quota", "pianeta", "roccia", "greto",
    },
    "weather": {
        "vento", "nebbia", "aurora", "pioggia", "sole", "nuvola", "tempesta", "brina", "sereno",
        "clima", "goccia",
    },
    "sea": {
        "mare", "onda", "vela", "barca", "porto", "pesce", "ancora", "scoglio", "riva", "veliero",
    },
    "mountain": {
        "montagna", "quota", "vetta", "roccia", "greto", "collina", "sentiero", "alpino",
    },
    "health": {
        "salute", "febbre", "medico", "cura", "respiro", "diuretico", "anemico", "vigore", "energia",
        "dente", "cuore", "corpo", "viso",
    },
    "science": {
        "atomo", "energia", "metodo", "equatore", "digitale", "misura", "tecnica", "triangolo",
        "microfibra", "microscopio", "algoritmo", "motore", "materia", "liquido",
    },
    "sport": {
        "calcio", "atleta", "sportivo", "gol", "pallone", "gara", "trionfo", "primato", "allenatore",
        "stadio", "squadra", "rete",
    },
    "history": {
        "re", "principe", "regno", "impero", "senato", "console", "legione", "vittoria", "epoca",
        "origine", "ritorno",
    },
    "school": {
        "libro", "quaderno", "lezione", "classe", "studiare", "maestro", "scuola", "esame", "penna",
        "aula", "figura", "titolo",
    },
    "cinema": {
        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
        "finale", "figura",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
        "leggenda", "scrivere", "titolo",
    },
    "food": {
        "pane", "cacao", "gelato", "burro", "latte", "mandorla", "nocciola", "cena", "pranzo",
        "zuppa", "zucchero", "acqua", "fiore", "frutto",
    },
    "city": {
        "porta", "strada", "piazza", "ponte", "palazzo", "cortile", "villaggio", "citta", "urbano",
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
        "volo", "aeroporto", "vettura",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
        "mestiere", "servire",
    },
    "home": {
        "casa", "finestra", "porta", "parete", "divano", "tavolo", "sedia", "camera", "balcone",
        "camino", "tetto", "cortile", "vasca",
    },
 }
 # Topic name -> word endings. infer_topics adds the topic when the word ends
 # with any of the listed strings (plain str.endswith, no word-boundary check).
 # NOTE(review): several endings are whole words, so unrelated words match too,
 # e.g. "amare" ends with "mare" (nature) and "mare" ends with "are" (actions).
 # Confirm whether this over-matching is intended.
 TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
    "geography": ("montagna", "isola", "deserto", "confine"),
    "city": ("strada", "palazzo", "porta", "ponte"),
 }
 def infer_pos(tags: List[str]) -> str:
    """Return the part of speech for the first tag listed in ``POS_BY_TAG``.

    Tags are examined in the order given; ``"NOUN"`` is returned when none of
    them is a known tag (including when ``tags`` is empty).
    """
    recognised = (POS_BY_TAG[tag] for tag in tags if tag in POS_BY_TAG)
    return next(recognised, "NOUN")
 def infer_topics(word: str, tags: List[str]) -> List[str]:
    """Return the alphabetically sorted topics inferred for ``word``.

    Every word gets ``"general"``. Further topics come from the
    ``"verb_infinitive"`` tag, from a fixed list of abstract-noun endings,
    from exact membership in ``TOPIC_KEYWORDS`` and from the endings in
    ``TOPIC_SUFFIXES``. Some topics then imply broader ones.
    """
    abstract_endings = ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")
    found = {"general"}
    if "verb_infinitive" in tags:
        found.add("actions")
    if word.endswith(abstract_endings):
        found.add("abstract")
    found.update(
        name for name, vocabulary in TOPIC_KEYWORDS.items() if word in vocabulary
    )
    found.update(
        name for name, endings in TOPIC_SUFFIXES.items() if word.endswith(endings)
    )
    # Implied topics: any of these narrower topics also counts as "nature",
    # and plants additionally count as "ecology".
    if found & {"animals", "plants", "sea", "mountain", "weather", "geography"}:
        found.add("nature")
    if "plants" in found:
        found.add("ecology")
    return sorted(found)
 def infer_register(quality: int) -> str:
    """Return the register label of the first threshold that ``quality`` reaches.

    Thresholds are taken from ``REGISTER_BY_QUALITY`` in list order; a quality
    below every threshold yields ``"rare"``.
    """
    reached = (label for minimum, label in REGISTER_BY_QUALITY if quality >= minimum)
    return next(reached, "rare")
 def frequency_from_quality(quality: int, index: int, total: int) -> tuple[int, float]:
    """Derive a 1-based rank and a frequency score in ``[0, 1]`` for a word.

    Args:
        quality: Quality value of the word; clamped to ``0..10`` for the boost.
        index: Zero-based position of the word in the word list.
        total: Number of words in the list.

    Returns:
        ``(rank, score)`` where ``rank`` is ``index + 1`` and ``score`` mixes
        the list position (weight 0.7) with the quality boost (at most 0.5),
        capped at 1.0 and rounded to four decimals.
    """
    # max(1, ...) keeps the division valid for lists of zero or one word.
    span = max(1, total - 1)
    normalized_rank = 1.0 - index / span
    clamped_quality = max(0, min(10, quality))
    quality_boost = clamped_quality / 20.0
    combined = normalized_rank * 0.7 + quality_boost
    return index + 1, round(min(1.0, combined), 4)
 def load_words() -> List[str]:
    """Return the non-blank, stripped lines of the filtered vocabulary file.

    ``build_vocabulary()`` is run first when either the filtered word list or
    the metadata file is missing.
    """
    if not (FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists()):
        build_vocabulary()
    raw_lines = FILTERED_OUTPUT_PATH.read_text(encoding="utf-8").splitlines()
    stripped = (raw.strip() for raw in raw_lines)
    return [word for word in stripped if word]
 def load_metadata() -> Dict[str, Dict[str, object]]:
    """Return the parsed JSON content of the vocabulary metadata file.

    ``build_vocabulary()`` is run first when the metadata file is missing.
    """
    metadata_path = METADATA_OUTPUT_PATH
    if not metadata_path.exists():
        build_vocabulary()
    raw_json = metadata_path.read_text(encoding="utf-8")
    return json.loads(raw_json)
 def build_lexicon() -> Dict[str, object]:
    """Assemble the base lexicon from the filtered word list and its metadata.

    Returns:
        A dict with a ``"meta"`` header (language, version, sources,
        generation timestamp, entry count) and an ``"entries"`` list holding
        one record per word, in word-list order.
    """
    words = load_words()
    metadata = load_metadata()
    word_count = len(words)
    entries = []
    for position, word in enumerate(words):
        info = metadata.get(word, {})
        tags = list(info.get("tags", []))
        quality = int(info.get("quality", 0))
        rank, score = frequency_from_quality(quality, position, word_count)
        # Difficulty is the inverse of the quality band: band 1..5 maps to
        # difficulty 5..1.
        quality_band = max(1, min(5, quality // 2 + 1))
        difficulty = max(1, min(5, 6 - quality_band))
        entries.append(
            {
                "form": word,
                "normalized_form": word,
                "lemma": word,
                "pos": infer_pos(tags),
                "length": len(word),
                "frequency_rank": rank,
                "frequency_score": score,
                "difficulty_word": difficulty,
                "allowed_in_crossword": True,
                "quality_score": max(0, min(10, quality)),
                "topics": infer_topics(word, tags),
                "morph_features": {},
                "register": infer_register(quality),
                "source_flags": ["from_filtered_vocabulary", "from_metadata_heuristics"],
                "crossword_flags": tags,
                "notes": "",
            }
        )
    header = {
        "language": "it",
        "version": 1,
        "sources": ["vocaboli_it_filtrato.txt", "vocaboli_it_metadata.json"],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(entries),
    }
    return {"meta": header, "entries": entries}
 def main() -> None:
    """Build the base lexicon, write it as JSON and print a short summary."""
    lexicon = build_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
    print(f"Lessico generato: {LEXICON_OUTPUT_PATH}")
    entry_count = lexicon["meta"]["entry_count"]
    print(f"Voci generate: {entry_count}")
 if __name__ == "__main__":
    main()
--- a/build_semantic_lexicon.py
+++ b/build_semantic_lexicon.py
@@ -0,0 +1,426 @@
 from __future__ import annotations
 import json
 import re
 import unicodedata
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Iterable, List, Tuple
 from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
 # Location of the IWN-OMW LMF XML file, inside the "iwn-omw-main" folder that
 # sits next to this module.
 IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
 # Destination of the enriched lexicon, stored next to this module.
 SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
 # partOfSpeech attribute value of an IWN lemma -> lexicon POS label.
 # parse_iwn falls back to "NOUN" for values that are not listed here.
 IWN_POS_MAP = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",
    "r": "ADV",
 }
 # Topic name -> keywords looked for in glosses, synonyms and related lemmas.
 # semantic_topics_from_text assigns a topic when at least one of its keywords
 # is among the tokens extracted from the text. Keywords are lowercase ASCII
 # (e.g. "citta" without the accent) to match the normalised tokens.
 SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
        "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
        "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
        "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
        "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
        "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
        "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
        "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
        "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
        "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
        "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
        "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
        "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
        "tavolo", "sedia",
    },
 }
 def normalize_word(text: str) -> str:
    """Reduce ``text`` to its lowercase ASCII letters only.

    Accented characters are decomposed (NFKD) so their base letter survives;
    everything that is not ``a``-``z`` after lowercasing (spaces, digits,
    underscores, punctuation) is removed.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    lowered = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    return "".join(re.findall(r"[a-z]", lowered))
 def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Return the sorted topics whose keywords occur in any of ``parts``.

    Each part (a gloss, a lemma, a related term) is folded to lowercase ASCII
    and split into alphabetic words, so a multi-word definition or an
    underscore-joined lemma contributes every one of its words. The fully
    collapsed form produced by ``normalize_word`` is kept as an extra token.

    Args:
        parts: Text fragments to scan; fragments without any letter are ignored.

    Returns:
        Alphabetically sorted names of the topics in ``SEMANTIC_TOPIC_KEYWORDS``
        that have at least one keyword among the extracted tokens.
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            continue
        tokens.add(collapsed)
        # Split into words BEFORE non-letters are discarded: normalize_word
        # strips spaces and punctuation, so tokenising its output would only
        # ever give back the single collapsed string and no word of a
        # multi-word gloss could match a keyword.
        folded = unicodedata.normalize("NFKD", part).encode("ascii", "ignore").decode("ascii")
        tokens.update(re.findall(r"[a-z]+", folded.lower()))
    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if not tokens.isdisjoint(keywords)
    )
 def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Parse the IWN-OMW XML file into synset and lexical-entry lookups.

    Returns:
        A pair ``(synsets, entries_by_norm)``:

        * ``synsets`` maps a synset id to a dict with the keys ``id``, ``ili``,
          ``definition``, ``relations``, ``lemmas`` and ``pos``.
        * ``entries_by_norm`` maps a normalised written form to the list of
          lexical entries sharing it; each entry holds ``written_form``,
          ``normalized_form``, ``pos`` and ``senses``.
    """
    xml_text = IWN_XML_PATH.read_text(encoding="utf-8")
    # NOTE(review): rewrites the sequence `"  -->` into `">` before parsing,
    # presumably to repair malformed markup in the distributed file - confirm.
    xml_text = xml_text.replace('"  -->', '">')
    root = ET.fromstring(xml_text)
    synsets: Dict[str, Dict[str, object]] = {}
    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    # Pass 1: register every synset; lemmas and pos are filled in by pass 2.
    for synset in root.findall(".//Synset"):
        synset_id = synset.attrib.get("id", "")
        relations = [
            {
                "type": relation.attrib.get("relType", ""),
                "target": relation.attrib.get("target", ""),
                # Namespaced "type" attribute of the relation; empty when absent.
                "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""),
            }
            for relation in synset.findall("SynsetRelation")
        ]
        synsets[synset_id] = {
            "id": synset_id,
            "ili": synset.attrib.get("ili", ""),
            "definition": (synset.findtext("Definition") or "").strip(),
            "relations": relations,
            "lemmas": [],
            "pos": "",
        }
    # Pass 2: attach each lexical entry to the synsets its senses point to.
    for lexical_entry in root.findall(".//LexicalEntry"):
        lemma = lexical_entry.find("Lemma")
        if lemma is None:
            continue
        written_form = lemma.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        # Skip entries whose written form contains no ASCII letter at all.
        if not normalized_form:
            continue
        pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN")
        senses = lexical_entry.findall("Sense")
        sense_payloads = []
        for sense in senses:
            synset_id = sense.attrib.get("synset", "")
            sense_id = sense.attrib.get("id", "")
            # Ignore senses that reference a synset not declared in the file.
            if not synset_id or synset_id not in synsets:
                continue
            synsets[synset_id]["lemmas"].append(written_form)
            # The synset pos is overwritten by every entry linked to it, so it
            # ends up as the pos of the last such entry.
            synsets[synset_id]["pos"] = pos
            sense_payloads.append(
                {
                    "sense_id": sense_id,
                    "synset_id": synset_id,
                    "pos": pos,
                }
            )
        # Entries without any usable sense are not indexed.
        if sense_payloads:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": sense_payloads,
                }
            )
    # Remove duplicate lemmas from every synset, keeping first-seen order.
    for synset in synsets.values():
        unique_lemmas = []
        seen = set()
        for lemma in synset["lemmas"]:
            if lemma not in seen:
                seen.add(lemma)
                unique_lemmas.append(lemma)
        synset["lemmas"] = unique_lemmas
    return synsets, entries_by_norm
 def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
 ) -> int:
    """Score how well a synset fits the topics an entry already has.

    The score is ten points per topic shared between ``current_topics`` and
    the topics inferred from the synset's definition and lemmas, plus the
    length of the definition. An unknown ``synset_id`` scores 0.
    """
    synset = synsets.get(synset_id, {})
    definition = str(synset.get("definition", ""))
    texts = [definition]
    texts.extend(synset.get("lemmas", []))
    inferred = set(semantic_topics_from_text(texts))
    wanted = {str(topic) for topic in current_topics}
    shared = inferred.intersection(wanted)
    return 10 * len(shared) + len(definition)
 def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
 ) -> Dict[str, object]:
    """Return the candidate with the highest combined score.

    A candidate earns 100 points when its ``pos`` equals ``expected_pos`` plus
    the best ``score_sense`` value among its senses (0 without senses). Ties
    keep the candidate that appears first in ``candidates``.

    Raises:
        IndexError: If ``candidates`` is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        pos_points = 100 if candidate["pos"] == expected_pos else 0
        sense_scores = [
            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            for sense in candidate.get("senses", [])
        ]
        return pos_points + max([0, *sense_scores])

    scored = [(total_score(candidate), candidate) for candidate in candidates]
    # The sort is stable, so equally scored candidates keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[0][1]
 def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the distinct non-empty items, stripped, in first-seen order.

    Every item is converted with ``str()`` and stripped of surrounding
    whitespace before comparison; items that end up empty are dropped.
    """
    cleaned = (str(item).strip() for item in items)
    # dict.fromkeys keeps one key per distinct value, in insertion order.
    return [text for text in dict.fromkeys(cleaned) if text]
 def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
 ) -> List[str]:
    """Collect the lemmas of synsets reached through one relation type.

    For every known synset in ``synset_ids``, follows its relations whose
    ``type`` equals ``relation_type`` and gathers the lemmas of each known
    target synset. The result is de-duplicated in first-seen order.
    """
    related = []
    for synset_id in synset_ids:
        source = synsets.get(synset_id)
        if not source:
            continue
        target_ids = (
            relation.get("target", "")
            for relation in source.get("relations", [])
            if relation.get("type") == relation_type
        )
        for target_id in target_ids:
            target = synsets.get(target_id)
            if target:
                related.extend(target.get("lemmas", []))
    return dedupe_keep_order(related)
 def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
    limit: int = 20,
 ) -> Dict[str, List[str]]:
    """Map each relation type to the lemmas reachable through it.

    Args:
        synset_ids: Synsets whose relations are followed. Any iterable works,
            including one-shot iterators.
        relation_types: Relation type names to resolve; each becomes a key of
            the result.
        synsets: Synset lookup as produced by ``parse_iwn``.
        limit: Maximum number of lemmas kept per relation type (default 20).

    Returns:
        ``{relation_type: lemmas}`` with at most ``limit`` lemmas per type.
    """
    # The ids are walked once per relation type, so materialise them first;
    # otherwise an iterator would be exhausted after the first type.
    ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(ids, relation_type, synsets)[:limit]
        for relation_type in relation_types
    }
 def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
 ) -> Dict[str, object]:
    """Attach IWN-OMW semantic data to a lexicon entry.

    The entry is modified in place and also returned: a ``"semantic"`` dict is
    always added and, when a match is found, ``"topics"`` is extended with the
    topics inferred from glosses, synonyms and related terms.

    Args:
        entry: Lexicon entry; its ``form``, ``lemma``, ``normalized_form``,
            ``pos`` and ``topics`` keys are read.
        synsets: Synset lookup as returned by ``parse_iwn``.
        entries_by_norm: Lexical entries grouped by normalised written form,
            as returned by ``parse_iwn``.

    Returns:
        The same ``entry`` object.
    """
    # Distinct normalised keys under which the word may be indexed.
    normalized_candidates = dedupe_keep_order(
        [
            normalize_word(str(entry.get("form", ""))),
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = []
    for candidate_key in normalized_candidates:
        matches.extend(entries_by_norm.get(candidate_key, []))
    # No lexical entry found: record an empty, unmatched semantic block.
    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry
    selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets)
    # Order the senses of the chosen entry from best to worst topic fit.
    sorted_senses = sorted(
        selected.get("senses", []),
        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets),
        reverse=True,
    )
    synset_ids = [sense["synset_id"] for sense in sorted_senses]
    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-scoring synsets are described in detail.
    for synset_id in synset_ids[:3]:
        synset = synsets.get(synset_id)
        if not synset:
            continue
        gloss = str(synset.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": synset.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id],
                    ("hypernym", "hyponym", "similar"),
                    synsets,
                ),
            }
        )
        synonyms.extend(synset.get("lemmas", []))
    # Entry-level relation terms use every synset of the chosen entry, not
    # just the three detailed above.
    raw_relation_terms = collect_relation_terms(
        synset_ids,
        ("hypernym", "hyponym", "similar"),
        synsets,
    )
    # Drop the word itself from its synonyms and keep at most twenty.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(synonyms)
        if normalize_word(lemma) != normalize_word(str(entry.get("form", "")))
    ][:20]
    glosses = dedupe_keep_order(glosses)
    # Existing topics first, followed by the topics inferred from the texts.
    semantic_topics = dedupe_keep_order(
        list(entry.get("topics", []))
        + semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
            + raw_relation_terms.get("hyponym", [])
            + raw_relation_terms.get("similar", [])
        )
    )
    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
 def build_semantic_lexicon() -> Dict[str, object]:
    """Enrich every entry of the base lexicon with IWN-OMW semantic data.

    Returns:
        A dict with a ``"meta"`` header and the enriched ``"entries"`` list.

    Raises:
        FileNotFoundError: If the base lexicon or the IWN-OMW XML file is
            missing.
    """
    if not LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    if not IWN_XML_PATH.exists():
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")
    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()
    # Each entry is copied first so the loaded payload is left untouched.
    enriched = [
        enrich_entry(dict(raw_entry), synsets, entries_by_norm)
        for raw_entry in base_lexicon.get("entries", [])
    ]
    header = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": header, "entries": enriched}
 def main() -> None:
    """Build the semantic lexicon, write it as JSON and print match statistics."""
    payload = build_semantic_lexicon()
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
    matched_entries = [
        entry
        for entry in payload["entries"]
        if entry.get("semantic", {}).get("matched")
    ]
    matched = len(matched_entries)
    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")
 if __name__ == "__main__":
    main()
--- a/create_passo4.bat
+++ b/create_passo4.bat
@@ -0,0 +1,55 @@
@echo off
 rem Switches to (or creates) the target branch, stages every change, commits
 rem and pushes the branch to origin. An optional first argument replaces the
 rem default commit message.
 setlocal
 rem Work from the directory that contains this script.
 cd /d "%~dp0"
 set "BRANCH_NAME=passo4"
 set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
 if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
 )
 echo Repository: %cd%
 echo Branch target: %BRANCH_NAME%
 echo Commit message: %COMMIT_MSG%
 echo.
 rem Abort when the current directory is not inside a Git work tree.
 git rev-parse --is-inside-work-tree >nul 2>nul
 if errorlevel 1 (
    echo Errore: questa cartella non e' un repository Git.
    exit /b 1
 )
 rem Create the local branch if it does not exist yet, otherwise check it out.
 git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
 if errorlevel 1 (
    echo Creo il branch %BRANCH_NAME%...
    git checkout -b %BRANCH_NAME%
 ) else (
    echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
    git checkout %BRANCH_NAME%
 )
 rem Stop if the checkout above failed.
 if errorlevel 1 exit /b 1
 echo.
 echo Aggiungo le modifiche...
 rem Stages everything in the working tree, including untracked files.
 git add .
 if errorlevel 1 exit /b 1
 echo.
 echo Creo il commit...
 git commit -m "%COMMIT_MSG%"
 rem A failed commit (for example nothing to commit) ends the script before the push.
 if errorlevel 1 (
    echo.
    echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
    exit /b 1
 )
 echo.
 echo Eseguo il push del branch %BRANCH_NAME%...
 rem -u sets origin/%BRANCH_NAME% as the upstream of the local branch.
 git push -u origin %BRANCH_NAME%
 if errorlevel 1 exit /b 1
 echo.
 echo Operazione completata con successo.
 endlocal
--- a/crossword_filler.py
+++ b/crossword_filler.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 import json
 from pathlib import Path
 import random
 import sys
 import time
 from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -72,6 +73,7 @@ class CrosswordFiller:
        *,
        target_empty_ratio: float = TARGET_EMPTY_RATIO,
        vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
        seed: Optional[int] = None,
    ) -> None:
        self.state = state.copy()
        self.initial_state = state.copy()
@@ -81,6 +83,8 @@ class CrosswordFiller:
        self.vocabulary = self._normalize_vocabulary(vocabulary)
        self.words_by_length = self._index_vocabulary(self.vocabulary)
        self.vocabulary_metadata = vocabulary_metadata or {}
        self.seed = seed
        self.rng = random.Random(seed)
        self.bounds = self._compute_bounds(self.state.grid)
        self.total_cells = self._area(self.bounds)
        self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio)))
@@ -181,6 +185,10 @@ class CrosswordFiller:
        collected = list(unique.values())
        collected.sort(key=self._slot_priority, reverse=True)
        if len(collected) > 1:
            top_slice = collected[: min(MAX_SLOT_CANDIDATES, len(collected))]
            self.rng.shuffle(top_slice)
            collected = top_slice + collected[min(MAX_SLOT_CANDIDATES, len(collected)) :]
        return collected
    def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
@@ -292,7 +300,7 @@ class CrosswordFiller:
            return None
        candidates.sort(key=lambda item: item.local_score, reverse=True)
-        return candidates[0]
+        return self.rng.choice(candidates[: min(3, len(candidates))])
    def _word_quality(self, word: str) -> int:
        metadata = self.vocabulary_metadata.get(word)
--- a/crossword_generator.py
+++ b/crossword_generator.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 from dataclasses import dataclass
 import locale
 import random
 import sys
 import time
 from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -114,6 +115,7 @@ class CrosswordGenerator:
        max_candidates_per_word: int = 12,
        time_limit_seconds: float = 8.0,
        diffxy: int = DIFFXY,
        seed: Optional[int] = None,
    ) -> None:
        normalized = [self._normalize(word) for word in words]
        unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
@@ -122,6 +124,8 @@ class CrosswordGenerator:
        self.max_candidates_per_word = max_candidates_per_word
        self.time_limit_seconds = time_limit_seconds
        self.diffxy = diffxy
        self.seed = seed
        self.rng = random.Random(seed)
        self.started_at = 0.0
        self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
        self.nodes_visited = 0
@@ -213,6 +217,8 @@ class CrosswordGenerator:
            reverse=True,
        )
        candidates = candidates[: self.max_candidates_per_word]
        if len(candidates) > 1:
            self.rng.shuffle(candidates)
        next_remaining = [word for word in remaining_words if word != next_word]
        for placement in candidates:
@@ -253,6 +259,10 @@ class CrosswordGenerator:
                word,
            ),
        )
        if len(ranked_words) > 1:
            top_slice = ranked_words[: min(5, len(ranked_words))]
            self.rng.shuffle(top_slice)
            ranked_words = top_slice + ranked_words[min(5, len(ranked_words)) :]
        best_word = ranked_words[0]
        best_key: Optional[Tuple[int, int, int, str]] = None
--- a/iwn-omw-main.zip
+++ b/iwn-omw-main.zip
--- a/iwn-omw-main/IWN-OMW-main/README.md
+++ b/iwn-omw-main/IWN-OMW-main/README.md
@@ -0,0 +1,21 @@
 # IWN-OMW
 This is the repository for the Open Italian WordNet, i.e. ItalWordNet versions compliant with the Open Multilingual WordNet guidelines and initiative.
 IWN-OMW is a new LMF version of the ItalWordNet resource converted and formatted according to the guidelines and requirements defined by the Open Multilingual Wordnet initiative (OMW, https://omwn.org/). This current version is derived from the ItalWordNet v.2. (IWN) database (http://hdl.handle.net/20.500.11752/ILC-62).
 NB: 'dc:relation', when used, contains links to equivalent Senses in the RDF version of the SIMPLE Italian lexicon.
 ## Licence
 CC-BY-SA 4.0
 ## Citation
 If you use this resource please cite: 
 Quochi, Valeria, Roberto Bartolini, and Monica Monachini (to appear) ‘ItalwordNet goes open’. *LiLT Special Issues on Open Multilingual
 WordNets*. CSLI Publications.
 And 
 Roventini, Adriana, Antonietta Alonge, Francesca Bertagna, Nicoletta Calzolari, J. Cancila, C. Girardi, Bernardo Magnini, Rita Marinelli, Manuela Speranza, and Antonio Zampolli (2003) "ItalwordNet: building a large semantic database for the automatic treatment of Italian". *Linguistica Computazionale* 18-19:745-791.
--- a/iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
+++ b/iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
--- a/lexicon_it.json
+++ b/lexicon_it.json
--- a/lexicon_it_semantic.json
+++ b/lexicon_it_semantic.json
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 import argparse
 import json
 from pathlib import Path
-from typing import List
+from typing import Dict, List
 from build_vocabulary import (
    FILTERED_OUTPUT_PATH,
@@ -10,39 +11,61 @@ from build_vocabulary import (
    OUTPUT_PATH,
    build_vocabulary,
 )
 from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
 from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
 from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
 from crossword_generator import CrosswordGenerator, WORDS, render_grid
 DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
 }
 DEFAULT_TOPIC = "general"
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
    parser.add_argument(
        "--build-vocabulary",
        action="store_true",
-        help="Rigenera il vocabolario esteso, filtrato e i metadati prima dell'esecuzione.",
+        help="Rigenera i file lessicali intermedi: vocabolario esteso, filtrato e metadati.",
    )
    parser.add_argument(
        "--build-lexicon",
        action="store_true",
        help="Rigenera `lexicon_it.json` prima dell'esecuzione.",
    )
    parser.add_argument(
        "--skip-fill",
        action="store_true",
-        help="Genera solo la griglia iniziale senza eseguire il filler.",
+        help="Genera solo la griglia iniziale e salta il riempimento con il filler.",
    )
    parser.add_argument(
        "--build-semantic-lexicon",
        action="store_true",
        help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
    )
    parser.add_argument(
        "--vocabulary",
        type=Path,
        default=None,
-        help="Percorso opzionale a un vocabolario personalizzato.",
+        help="Percorso opzionale a un vocabolario testuale personalizzato da usare al posto di quello di default.",
    )
    parser.add_argument(
        "--target-empty-ratio",
        type=float,
        default=1 / 6,
-        help="Rapporto target di celle vuote residue dopo il filler.",
+        help="Rapporto target di celle vuote residue dopo il filler. Esempio: 0.1667 lascia circa un sesto di celle vuote.",
    )
    parser.add_argument(
        "--time-limit",
        type=float,
        default=8.0,
-        help="Tempo massimo in secondi per la fase di generazione iniziale.",
+        help="Tempo massimo in secondi per la fase di generazione iniziale della griglia.",
    )
    parser.add_argument(
        "--max-candidates",
@@ -54,7 +77,23 @@ def parse_args() -> argparse.Namespace:
        "--diffxy",
        type=int,
        default=7,
-        help="Differenza massima preferita tra larghezza e altezza della griglia.",
+        help="Differenza massima preferita tra larghezza e altezza della griglia iniziale.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Seed casuale per ottenere varianti riproducibili del cruciverba: stesso seed, stesso risultato.",
    )
    parser.add_argument(
        "--difficulty",
        default="medium",
        help="Difficolta lessicale del filler. Alias testuali: easy, medium, hard, expert. Internamente mappati a livelli numerici 1-5.",
    )
    parser.add_argument(
        "--topic",
        default=DEFAULT_TOPIC,
        help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
    )
    return parser.parse_args()
@@ -73,21 +112,110 @@ def ensure_vocabulary(args: argparse.Namespace) -> None:
    print(f"- parole filtrate: {totals['filtered_words']}")
 def ensure_lexicon(args: argparse.Namespace) -> None:
     """Regenerate the lexicon JSON file when asked to, or when it is missing.

     Nothing happens if ``args.build_lexicon`` is falsy and
     ``LEXICON_OUTPUT_PATH`` already exists. Otherwise the result of
     ``build_lexicon()`` is written to ``LEXICON_OUTPUT_PATH`` as indented
     UTF-8 JSON and a short summary is printed.

     Args:
         args: Parsed command-line namespace; only ``build_lexicon`` is read.
     """
     # Guard clause: an existing file is reused unless a rebuild was requested.
     if not args.build_lexicon and LEXICON_OUTPUT_PATH.exists():
         return
     fresh_lexicon = build_lexicon()
     serialized = json.dumps(fresh_lexicon, ensure_ascii=False, indent=2)
     LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
     print("Lessico rigenerato")
     print(f"- file: {LEXICON_OUTPUT_PATH}")
     print(f"- voci: {fresh_lexicon['meta']['entry_count']}")
 def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
     """Regenerate the semantic lexicon JSON file when asked to, or when missing.

     Nothing happens if ``args.build_semantic_lexicon`` is falsy and
     ``SEMANTIC_LEXICON_OUTPUT_PATH`` already exists. Otherwise the result of
     ``build_semantic_lexicon()`` is written to that path as indented UTF-8
     JSON, and a summary is printed that includes how many entries carry a
     truthy ``semantic.matched`` value.

     Args:
         args: Parsed command-line namespace; only ``build_semantic_lexicon``
             is read.
     """
     # Guard clause: an existing file is reused unless a rebuild was requested.
     if not args.build_semantic_lexicon and SEMANTIC_LEXICON_OUTPUT_PATH.exists():
         return
     semantic_lexicon = build_semantic_lexicon()
     serialized = json.dumps(semantic_lexicon, ensure_ascii=False, indent=2)
     SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
     # Count the entries whose optional "semantic" mapping reports a match.
     matched_count = 0
     for item in semantic_lexicon["entries"]:
         semantic_info = item.get("semantic", {})
         if semantic_info.get("matched"):
             matched_count += 1
     print("Lessico semantico rigenerato")
     print(f"- file: {SEMANTIC_LEXICON_OUTPUT_PATH}")
     print(f"- voci: {semantic_lexicon['meta']['entry_count']}")
     print(f"- match semantici: {matched_count}")
 def parse_difficulty(value: str) -> int:
     """Convert the ``--difficulty`` option into a numeric level.

     The value is stripped and lower-cased, then looked up in
     ``DIFFICULTY_ALIASES``. If it is not an alias it must parse as an
     integer between 1 and 5 inclusive.

     Args:
         value: Raw option value (a textual alias or a number).

     Returns:
         The numeric difficulty level.

     Raises:
         SystemExit: If the value is neither a known alias nor an integer,
             or if the integer lies outside 1..5.
     """
     normalized = str(value).strip().lower()
     # Textual aliases take precedence over numeric parsing.
     try:
         return DIFFICULTY_ALIASES[normalized]
     except KeyError:
         pass
     try:
         numeric_level = int(normalized)
     except ValueError as exc:
         raise SystemExit(
             "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
         ) from exc
     if numeric_level in range(1, 6):
         return numeric_level
     raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
 def load_selected_vocabulary(path: Path | None) -> List[str]:
     """Load the word list used by the filler.

     Args:
         path: Optional path to a custom vocabulary text file with one word
             per line. When ``None``, the result of ``load_vocabulary()`` is
             returned unchanged.

     Returns:
         The words in file order. For a custom file, surrounding whitespace
         is stripped from each line and empty lines are dropped, so blank or
         padded lines never reach the filler as bogus words.
     """
     if path is None:
         return load_vocabulary()
     # "utf-8-sig" decodes plain UTF-8 as usual but also discards a leading
     # BOM, which would otherwise stick to the first word of the file.
     raw_lines = path.read_text(encoding="utf-8-sig").splitlines()
     stripped_lines = (line.strip() for line in raw_lines)
     return [word for word in stripped_lines if word]
 def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
     """Return the lexicon word forms allowed for a difficulty level and topic.

     The lexicon is read from ``LEXICON_OUTPUT_PATH``; if that file does not
     exist it is first generated with ``build_lexicon()`` and written as
     indented UTF-8 JSON. An entry is selected when its
     ``allowed_in_crossword`` flag is truthy (missing counts as not allowed),
     its ``difficulty_word`` (missing counts as 5) does not exceed ``level``,
     and its lower-cased ``topics`` contain the requested topic.

     If nothing matches the requested topic and that topic is not
     ``DEFAULT_TOPIC``, the same selection is repeated for ``DEFAULT_TOPIC``.

     Args:
         level: Maximum accepted ``difficulty_word`` value.
         topic: Requested topic; compared stripped and lower-cased.

     Returns:
         The ``form`` values of the selected entries, in lexicon order.
     """
     if not LEXICON_OUTPUT_PATH.exists():
         fresh_lexicon = build_lexicon()
         serialized = json.dumps(fresh_lexicon, ensure_ascii=False, indent=2)
         LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
     payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
     requested_topic = topic.strip().lower()
     def select_forms(wanted_topic: str) -> List[str]:
         # Checks run in a fixed order: permission, difficulty, then topic.
         forms: List[str] = []
         for entry in payload.get("entries", []):
             if not entry.get("allowed_in_crossword", False):
                 continue
             within_level = int(entry.get("difficulty_word", 5)) <= level
             if not within_level:
                 continue
             entry_topics = [str(item).lower() for item in entry.get("topics", [])]
             if wanted_topic in entry_topics:
                 forms.append(entry["form"])
         return forms
     selected = select_forms(requested_topic)
     if selected:
         return selected
     # Nothing for the requested topic: fall back to the default topic.
     if requested_topic != DEFAULT_TOPIC:
         return select_forms(DEFAULT_TOPIC)
     return selected
 def main() -> None:
    args = parse_args()
    ensure_vocabulary(args)
    ensure_lexicon(args)
    ensure_semantic_lexicon(args)
    difficulty_level = parse_difficulty(args.difficulty)
    generator = CrosswordGenerator(
        WORDS,
        diffxy=args.diffxy,
        time_limit_seconds=args.time_limit,
        max_candidates_per_word=args.max_candidates,
        seed=args.seed,
    )
    initial_state = generator.solve()
@@ -95,19 +223,24 @@ def main() -> None:
    print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
    print(f"Intersezioni: {initial_state.intersections}")
    print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
    print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}")
    print(f"Tema filler: {args.topic}")
    if args.seed is not None:
        print(f"Seed: {args.seed}")
    print()
    print(render_grid(initial_state.grid, initial_state.placements))
    if args.skip_fill:
        return
-    vocabulary = load_selected_vocabulary(args.vocabulary)
+    vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
    metadata = load_vocabulary_metadata()
    filler = CrosswordFiller(
        initial_state,
        vocabulary,
        target_empty_ratio=args.target_empty_ratio,
        vocabulary_metadata=metadata,
        seed=args.seed,
    )
    final_state = filler.fill()