feat: collega il lessico semantico al filler

This commit is contained in:
2026-04-14 18:56:17 +02:00
parent 77c7e709b6
commit b172b9c04b
15 changed files with 2255563 additions and 9 deletions

Binary file not shown.

Binary file not shown.

249
build_lexicon.py Normal file
View File

@@ -0,0 +1,249 @@
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from build_vocabulary import (
FILTERED_OUTPUT_PATH,
METADATA_OUTPUT_PATH,
build_vocabulary,
)
# Destination of the generated base lexicon; it lives next to this script.
LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it.json")
# Maps a vocabulary metadata tag to the part-of-speech label stored in the
# lexicon. infer_pos falls back to "NOUN" when no tag is listed here.
POS_BY_TAG = {
    "function": "PREP",
    "verb_infinitive": "VERB",
    "adverb": "ADV",
    "adjective_like": "ADJ",
    "noun_like": "NOUN",
}
# (minimum quality, register label) pairs ordered from the highest threshold
# down; infer_register returns the label of the first threshold reached.
REGISTER_BY_QUALITY = [
    (8, "common"),
    (5, "standard"),
    (3, "formal"),
    (0, "rare"),
]
# Exact-match keyword sets per topic: infer_topics assigns a topic to a word
# when the word appears verbatim in that topic's set. The same word may be
# listed under several topics.
TOPIC_KEYWORDS = {
    "animals": {
        "cane", "gatto", "lupo", "volpe", "orso", "pesce", "cervo", "cavallo", "capra", "pecora",
        "leone", "tigre", "zebra", "aquila", "falco", "serpente", "vipera", "gabbiano", "anatra",
        "passero", "coniglio", "castoro", "bruco", "cigno", "asino", "alpaca",
    },
    "plants": {
        "albero", "pianta", "fiore", "foglia", "radice", "seme", "bosco", "selva", "ulivo", "quercia",
        "ortica", "edera", "aloe", "tulipano", "spiga", "polline", "grano", "erba",
    },
    "nature": {
        "natura", "bosco", "selva", "montagna", "collina", "roccia", "pietra", "fiume", "lago", "mare",
        "riva", "fonte", "onda", "vento", "fuoco", "terra", "sole", "luna", "aurora", "nuvola",
        "nebbia", "deserto", "isola", "greto", "radice", "fiore", "foglia", "erba", "zolla",
    },
    "ecology": {
        "ambiente", "ecologia", "natura", "bosco", "energia", "acqua", "terra", "clima", "sorgere",
        "fonte", "solare", "verde", "ulivo", "pianta", "polline", "grano", "radice",
    },
    "geography": {
        "montagna", "collina", "isola", "deserto", "equatore", "ovest", "oriente", "riva", "mare",
        "lago", "fiume", "ponte", "confine", "quota", "pianeta", "roccia", "greto",
    },
    "weather": {
        "vento", "nebbia", "aurora", "pioggia", "sole", "nuvola", "tempesta", "brina", "sereno",
        "clima", "goccia",
    },
    "sea": {
        "mare", "onda", "vela", "barca", "porto", "pesce", "ancora", "scoglio", "riva", "veliero",
    },
    "mountain": {
        "montagna", "quota", "vetta", "roccia", "greto", "collina", "sentiero", "alpino",
    },
    "health": {
        "salute", "febbre", "medico", "cura", "respiro", "diuretico", "anemico", "vigore", "energia",
        "dente", "cuore", "corpo", "viso",
    },
    "science": {
        "atomo", "energia", "metodo", "equatore", "digitale", "misura", "tecnica", "triangolo",
        "microfibra", "microscopio", "algoritmo", "motore", "materia", "liquido",
    },
    "sport": {
        "calcio", "atleta", "sportivo", "gol", "pallone", "gara", "trionfo", "primato", "allenatore",
        "stadio", "squadra", "rete",
    },
    "history": {
        "re", "principe", "regno", "impero", "senato", "console", "legione", "vittoria", "epoca",
        "origine", "ritorno",
    },
    "school": {
        "libro", "quaderno", "lezione", "classe", "studiare", "maestro", "scuola", "esame", "penna",
        "aula", "figura", "titolo",
    },
    "cinema": {
        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
        "finale", "figura",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
        "leggenda", "scrivere", "titolo",
    },
    "food": {
        "pane", "cacao", "gelato", "burro", "latte", "mandorla", "nocciola", "cena", "pranzo",
        "zuppa", "zucchero", "acqua", "fiore", "frutto",
    },
    "city": {
        "porta", "strada", "piazza", "ponte", "palazzo", "cortile", "villaggio", "citta", "urbano",
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
        "volo", "aeroporto", "vettura",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
        "mestiere", "servire",
    },
    "home": {
        "casa", "finestra", "porta", "parete", "divano", "tavolo", "sedia", "camera", "balcone",
        "camino", "tetto", "cortile", "vasca",
    },
}
# Suffix tuples per topic: infer_topics adds the topic when the word ends with
# any of the listed suffixes (the whole word counts as a suffix of itself).
# NOTE(review): the "actions" endings also match non-verbs, e.g. "mare" ends
# with "are" -- confirm this over-matching is acceptable.
TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
    "geography": ("montagna", "isola", "deserto", "confine"),
    "city": ("strada", "palazzo", "porta", "ponte"),
}
def infer_pos(tags: List[str]) -> str:
    """Return the part of speech of the first tag with a known mapping.

    Tags are scanned in order; when none of them is a key of ``POS_BY_TAG``
    the word is treated as a noun.
    """
    mapped = (POS_BY_TAG[tag] for tag in tags if tag in POS_BY_TAG)
    return next(mapped, "NOUN")
def infer_topics(word: str, tags: List[str]) -> List[str]:
    """Return the alphabetically sorted topics inferred for *word*.

    Every word gets ``"general"``. Further topics come from the
    ``verb_infinitive`` tag, a fixed list of word endings, exact membership in
    ``TOPIC_KEYWORDS`` and suffix matches against ``TOPIC_SUFFIXES``. Some
    topics then imply broader ones (e.g. ``plants`` implies ``nature`` and
    ``ecology``).
    """
    abstract_endings = ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")

    found = {"general"}
    if "verb_infinitive" in tags:
        found.add("actions")
    if word.endswith(abstract_endings):
        found.add("abstract")

    found.update(topic for topic, keywords in TOPIC_KEYWORDS.items() if word in keywords)
    found.update(topic for topic, suffixes in TOPIC_SUFFIXES.items() if word.endswith(suffixes))

    # Broader topics implied by the specific ones collected above.
    if "plants" in found:
        found.update({"nature", "ecology"})
    if found & {"animals", "sea", "mountain", "weather", "geography"}:
        found.add("nature")
    return sorted(found)
def infer_register(quality: int) -> str:
    """Return the register label of the first threshold that *quality* reaches.

    Thresholds are read from ``REGISTER_BY_QUALITY`` in order; ``"rare"`` is
    returned when no threshold is reached.
    """
    reached = (label for threshold, label in REGISTER_BY_QUALITY if quality >= threshold)
    return next(reached, "rare")
def frequency_from_quality(quality: int, index: int, total: int) -> tuple[int, float]:
    """Derive a 1-based rank and a 0..1 frequency score for a word.

    The score combines the word's position in the list (weight 0.7, first
    word highest) with its quality clamped to 0..10 (worth up to 0.5); the sum
    is capped at 1.0 and rounded to four decimals.

    Returns:
        ``(rank, frequency_score)`` where ``rank`` is ``index + 1``.
    """
    rank = index + 1
    span = max(1, total - 1)  # avoids dividing by zero for a one-word list
    normalized_rank = 1.0 - index / span
    clamped_quality = min(max(quality, 0), 10)
    raw_score = normalized_rank * 0.7 + clamped_quality / 20.0
    return rank, round(min(1.0, raw_score), 4)
def load_words() -> List[str]:
    """Return the non-empty, stripped lines of the filtered vocabulary file.

    The vocabulary is rebuilt first when either the filtered word list or the
    metadata file is missing.
    """
    if not (FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists()):
        build_vocabulary()
    raw_lines = FILTERED_OUTPUT_PATH.read_text(encoding="utf-8").splitlines()
    stripped = (line.strip() for line in raw_lines)
    return [word for word in stripped if word]
def load_metadata() -> Dict[str, Dict[str, object]]:
    """Return the parsed vocabulary metadata JSON, building it when missing."""
    if not METADATA_OUTPUT_PATH.exists():
        build_vocabulary()
    raw_json = METADATA_OUTPUT_PATH.read_text(encoding="utf-8")
    return json.loads(raw_json)
def build_lexicon() -> Dict[str, object]:
    """Build the base lexicon from the filtered vocabulary and its metadata.

    Each word becomes one entry whose part of speech, topics, register,
    frequency and difficulty are derived heuristically from the metadata tags
    and quality value.

    Returns:
        A dict with a ``"meta"`` header and the ``"entries"`` list.
    """
    words = load_words()
    metadata = load_metadata()
    total = len(words)

    entries = []
    for position, form in enumerate(words):
        info = metadata.get(form, {})
        tags = list(info.get("tags", []))
        quality = int(info.get("quality", 0))
        rank, score = frequency_from_quality(quality, position, total)
        # Higher quality means an easier word: ease is 1..5, difficulty is its mirror.
        ease = max(1, min(5, quality // 2 + 1))
        entries.append(
            {
                "form": form,
                "normalized_form": form,
                "lemma": form,
                "pos": infer_pos(tags),
                "length": len(form),
                "frequency_rank": rank,
                "frequency_score": score,
                "difficulty_word": max(1, min(5, 6 - ease)),
                "allowed_in_crossword": True,
                "quality_score": max(0, min(10, quality)),
                "topics": infer_topics(form, tags),
                "morph_features": {},
                "register": infer_register(quality),
                "source_flags": ["from_filtered_vocabulary", "from_metadata_heuristics"],
                "crossword_flags": tags,
                "notes": "",
            }
        )

    header = {
        "language": "it",
        "version": 1,
        "sources": ["vocaboli_it_filtrato.txt", "vocaboli_it_metadata.json"],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(entries),
    }
    return {"meta": header, "entries": entries}
def main() -> None:
    """Generate the base lexicon, write it to disk and print a short summary."""
    lexicon = build_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")
    entry_count = lexicon["meta"]["entry_count"]
    print(f"Lessico generato: {LEXICON_OUTPUT_PATH}")
    print(f"Voci generate: {entry_count}")


if __name__ == "__main__":
    main()

426
build_semantic_lexicon.py Normal file
View File

@@ -0,0 +1,426 @@
from __future__ import annotations
import json
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
# Location of the IWN-OMW LMF XML file inside the unpacked archive that sits
# next to this script.
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
# Destination of the semantically enriched lexicon.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
# Maps the lower-cased partOfSpeech attribute of a Lemma to the lexicon's POS
# labels; parse_iwn falls back to "NOUN" for codes not listed here.
IWN_POS_MAP = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",
    "r": "ADV",
}
# Keyword sets per topic used by semantic_topics_from_text: a topic is
# assigned when at least one of its keywords is among the tokens extracted
# from the supplied text parts. Keywords are unaccented and lower case.
SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
        "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
        "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
        "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
        "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
        "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
        "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
        "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
        "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
        "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
        "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
        "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
        "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
        "tavolo", "sedia",
    },
}
def normalize_word(text: str) -> str:
    """Reduce *text* to a lookup key made only of the letters ``a``-``z``.

    Accents are removed through NFKD decomposition, the result is lower-cased
    and every remaining character outside ``a``-``z`` (underscores, spaces,
    digits, punctuation) is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_lower = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    return "".join(char for char in ascii_lower if "a" <= char <= "z")
def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Return the sorted topics whose keywords occur in the given text parts.

    Each part (a gloss, a lemma, a related term) contributes two kinds of
    tokens: its fully collapsed form from ``normalize_word`` (so a multi-word
    lemma such as ``cane_lupo`` is still available as ``canelupo``) and its
    individual words.

    The individual words are extracted from the accent-folded text *before*
    separators are removed. Tokenising the output of ``normalize_word``
    instead would always yield a single glued token, because that function
    strips spaces and punctuation, and no keyword inside a multi-word
    definition could ever match.

    Args:
        parts: Text fragments to scan; fragments with no letters are ignored.

    Returns:
        Topic names from ``SEMANTIC_TOPIC_KEYWORDS`` in alphabetical order.
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            continue
        tokens.add(collapsed)
        # Fold accents and case but keep word boundaries so the gloss can be
        # split into separate words.
        folded = (
            unicodedata.normalize("NFKD", part)
            .encode("ascii", "ignore")
            .decode("ascii")
            .lower()
        )
        tokens.update(re.findall(r"[a-z]+", folded))
    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if any(keyword in tokens for keyword in keywords)
    )
def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Parse the IWN-OMW LMF XML file into synset and lemma lookup tables.

    Returns:
        A pair ``(synsets, entries_by_norm)``:

        * ``synsets`` maps a synset id to a dict with its ``id``, ``ili``,
          ``definition``, outgoing ``relations``, member ``lemmas`` (written
          forms, de-duplicated in first-seen order) and ``pos``.
        * ``entries_by_norm`` maps a normalized lemma (see ``normalize_word``)
          to the lexical entries sharing that key, each with its written
          form, POS and the senses that point to a known synset.
    """
    xml_text = IWN_XML_PATH.read_text(encoding="utf-8")
    # NOTE(review): presumably repairs tags that the distributed file closes
    # with `" -->` instead of `">` -- confirm against the XML source.
    xml_text = xml_text.replace('" -->', '">')
    root = ET.fromstring(xml_text)
    synsets: Dict[str, Dict[str, object]] = {}
    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    # First pass: register every synset so senses can be validated against it.
    for synset in root.findall(".//Synset"):
        synset_id = synset.attrib.get("id", "")
        relations = [
            {
                "type": relation.attrib.get("relType", ""),
                "target": relation.attrib.get("target", ""),
                "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""),
            }
            for relation in synset.findall("SynsetRelation")
        ]
        synsets[synset_id] = {
            "id": synset_id,
            "ili": synset.attrib.get("ili", ""),
            "definition": (synset.findtext("Definition") or "").strip(),
            "relations": relations,
            "lemmas": [],
            "pos": "",
        }
    # Second pass: attach lemmas to their synsets and index entries by key.
    for lexical_entry in root.findall(".//LexicalEntry"):
        lemma = lexical_entry.find("Lemma")
        if lemma is None:
            continue
        written_form = lemma.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue
        # Unknown or missing part-of-speech codes are treated as nouns.
        pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN")
        senses = lexical_entry.findall("Sense")
        sense_payloads = []
        for sense in senses:
            synset_id = sense.attrib.get("synset", "")
            sense_id = sense.attrib.get("id", "")
            # Skip senses that reference a synset not declared in the file.
            if not synset_id or synset_id not in synsets:
                continue
            synsets[synset_id]["lemmas"].append(written_form)
            # The synset POS is overwritten by each lemma; the last one wins.
            synsets[synset_id]["pos"] = pos
            sense_payloads.append(
                {
                    "sense_id": sense_id,
                    "synset_id": synset_id,
                    "pos": pos,
                }
            )
        # Entries without any usable sense are not indexed.
        if sense_payloads:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": sense_payloads,
                }
            )
    # Remove duplicate lemmas per synset while keeping first-seen order.
    for synset in synsets.values():
        unique_lemmas = []
        seen = set()
        for lemma in synset["lemmas"]:
            if lemma not in seen:
                seen.add(lemma)
                unique_lemmas.append(lemma)
        synset["lemmas"] = unique_lemmas
    return synsets, entries_by_norm
def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score how well a synset fits the topics already assigned to an entry.

    Each topic shared between *current_topics* and the topics inferred from
    the synset's definition and lemmas is worth 10 points; the length of the
    definition is added on top. An unknown synset id scores 0.
    """
    synset = synsets.get(synset_id, {})
    gloss = str(synset.get("definition", ""))
    evidence = [gloss, *synset.get("lemmas", [])]
    gloss_topics = set(semantic_topics_from_text(evidence))
    wanted_topics = {str(topic) for topic in current_topics}
    shared = gloss_topics & wanted_topics
    return 10 * len(shared) + len(gloss)
def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Return the candidate lexical entry that best matches the lexicon entry.

    A candidate earns 100 points when its POS equals *expected_pos*, plus the
    highest ``score_sense`` value among its senses. Ties are resolved in
    favour of the candidate that comes first in *candidates*.

    Raises:
        IndexError: if *candidates* is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        pos_bonus = 100 if candidate["pos"] == expected_pos else 0
        sense_scores = [
            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            for sense in candidate.get("senses", [])
        ]
        # The leading 0 keeps the bonus non-negative and covers "no senses".
        return pos_bonus + max([0, *sense_scores])

    scored = [(total_score(candidate), candidate) for candidate in candidates]
    # sorted() is stable, so equal scores keep their original relative order.
    by_score = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return by_score[0][1]
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty string forms of *items* without repeats.

    Every item is converted with ``str`` and stripped; empty results are
    dropped and only the first occurrence of each text is kept, in order.
    """
    cleaned = (str(item).strip() for item in items)
    # dict keys keep insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(text for text in cleaned if text))
def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of synsets reachable through one relation type.

    For every known synset in *synset_ids*, each relation whose ``type``
    equals *relation_type* is followed and the lemmas of the target synset
    are gathered. Unknown source or target synsets are ignored. The result is
    de-duplicated in first-seen order.
    """
    collected: List[str] = []
    for synset_id in synset_ids:
        source = synsets.get(synset_id)
        if not source:
            continue
        targets = (
            synsets.get(relation.get("target", ""))
            for relation in source.get("relations", [])
            if relation.get("type") == relation_type
        )
        for target in targets:
            if target:
                collected.extend(target.get("lemmas", []))
    return dedupe_keep_order(collected)
def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, List[str]]:
    """Map each relation type to at most 20 related lemmas.

    Args:
        synset_ids: Source synsets whose relations are followed.
        relation_types: Relation type names to resolve, e.g. ``"hypernym"``.
        synsets: Synset table as produced by ``parse_iwn``.

    Returns:
        ``{relation_type: lemmas}`` where ``lemmas`` is the de-duplicated
        output of ``resolve_related_lemmas`` truncated to 20 items.
    """
    # The ids are walked once per relation type; materialise them so that a
    # one-shot iterator is not exhausted after the first type.
    source_ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:20]
        for relation_type in relation_types
    }
def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach WordNet-derived semantic data to a lexicon entry.

    The entry is modified in place (a ``"semantic"`` key is added and, on a
    match, ``"topics"`` is replaced by the merged topic list) and the same
    dict is returned.

    Args:
        entry: Lexicon entry; its ``form``, ``lemma``, ``normalized_form``,
            ``pos`` and ``topics`` keys are read.
        synsets: Synset table as produced by ``parse_iwn``.
        entries_by_norm: Lemma index as produced by ``parse_iwn``.

    Returns:
        The enriched *entry*.
    """
    # Distinct lookup keys derived from the entry's three textual fields.
    normalized_candidates = dedupe_keep_order(
        [
            normalize_word(str(entry.get("form", ""))),
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = []
    for candidate_key in normalized_candidates:
        matches.extend(entries_by_norm.get(candidate_key, []))
    if not matches:
        # No WordNet entry: record an explicit "unmatched" payload.
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry
    selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets)
    # Order the chosen entry's senses from best to worst fit.
    sorted_senses = sorted(
        selected.get("senses", []),
        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets),
        reverse=True,
    )
    synset_ids = [sense["synset_id"] for sense in sorted_senses]
    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-scoring synsets are described in detail.
    for synset_id in synset_ids[:3]:
        synset = synsets.get(synset_id)
        if not synset:
            continue
        gloss = str(synset.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": synset.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id],
                    ("hypernym", "hyponym", "similar"),
                    synsets,
                ),
            }
        )
        synonyms.extend(synset.get("lemmas", []))
    # Entry-level relation terms cover every sense of the selected candidate,
    # not just the three synsets detailed above.
    raw_relation_terms = collect_relation_terms(
        synset_ids,
        ("hypernym", "hyponym", "similar"),
        synsets,
    )
    # Drop lemmas that are just the entry's own form; keep at most 20.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(synonyms)
        if normalize_word(lemma) != normalize_word(str(entry.get("form", "")))
    ][:20]
    glosses = dedupe_keep_order(glosses)
    # Existing topics first, then topics inferred from glosses and related terms.
    semantic_topics = dedupe_keep_order(
        list(entry.get("topics", []))
        + semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
            + raw_relation_terms.get("hyponym", [])
            + raw_relation_terms.get("similar", [])
        )
    )
    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
def build_semantic_lexicon() -> Dict[str, object]:
    """Build the semantic lexicon by enriching every base-lexicon entry.

    Returns:
        A dict with a ``"meta"`` header and the enriched ``"entries"`` list.

    Raises:
        FileNotFoundError: if the base lexicon or the IWN-OMW XML file is
            missing.
    """
    if not LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    if not IWN_XML_PATH.exists():
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")

    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()
    # Each entry is shallow-copied so the loaded payload is left untouched.
    enriched = [
        enrich_entry(dict(entry), synsets, entries_by_norm)
        for entry in base_lexicon.get("entries", [])
    ]

    header = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": header, "entries": enriched}
def main() -> None:
    """Generate the semantic lexicon, write it to disk and print a summary."""
    payload = build_semantic_lexicon()
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    matched_entries = [
        entry for entry in payload["entries"] if entry.get("semantic", {}).get("matched")
    ]
    matched = len(matched_entries)
    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")


if __name__ == "__main__":
    main()

55
create_passo4.bat Normal file
View File

@@ -0,0 +1,55 @@
@echo off
rem Commits every change in the working tree to the "passo4" branch and
rem pushes that branch to origin.
rem Usage: create_passo4.bat ["commit message"]
rem Exits with code 1 when the folder is not a Git repository or when the
rem checkout, add, commit or push step fails.
setlocal
rem Work from the directory that contains this script.
cd /d "%~dp0"
set "BRANCH_NAME=passo4"
set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
rem An optional first argument overrides the default commit message.
if not "%~1"=="" (
set "COMMIT_MSG=%~1"
)
echo Repository: %cd%
echo Branch target: %BRANCH_NAME%
echo Commit message: %COMMIT_MSG%
echo.
rem Abort early when the script is not run inside a Git work tree.
git rev-parse --is-inside-work-tree >nul 2>nul
if errorlevel 1 (
echo Errore: questa cartella non e' un repository Git.
exit /b 1
)
rem Create the branch when it does not exist yet, otherwise switch to it.
git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
if errorlevel 1 (
echo Creo il branch %BRANCH_NAME%...
git checkout -b %BRANCH_NAME%
) else (
echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
git checkout %BRANCH_NAME%
)
if errorlevel 1 exit /b 1
echo.
echo Aggiungo le modifiche...
rem Stages everything under the current directory.
git add .
if errorlevel 1 exit /b 1
echo.
echo Creo il commit...
rem A failed commit (for example when nothing is staged) stops the script
rem with exit code 1 before any push is attempted.
git commit -m "%COMMIT_MSG%"
if errorlevel 1 (
echo.
echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
exit /b 1
)
echo.
echo Eseguo il push del branch %BRANCH_NAME%...
rem -u sets origin/%BRANCH_NAME% as the upstream of the local branch.
git push -u origin %BRANCH_NAME%
if errorlevel 1 exit /b 1
echo.
echo Operazione completata con successo.
endlocal

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import json import json
from pathlib import Path from pathlib import Path
import random
import sys import sys
import time import time
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -72,6 +73,7 @@ class CrosswordFiller:
*, *,
target_empty_ratio: float = TARGET_EMPTY_RATIO, target_empty_ratio: float = TARGET_EMPTY_RATIO,
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None, vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
seed: Optional[int] = None,
) -> None: ) -> None:
self.state = state.copy() self.state = state.copy()
self.initial_state = state.copy() self.initial_state = state.copy()
@@ -81,6 +83,8 @@ class CrosswordFiller:
self.vocabulary = self._normalize_vocabulary(vocabulary) self.vocabulary = self._normalize_vocabulary(vocabulary)
self.words_by_length = self._index_vocabulary(self.vocabulary) self.words_by_length = self._index_vocabulary(self.vocabulary)
self.vocabulary_metadata = vocabulary_metadata or {} self.vocabulary_metadata = vocabulary_metadata or {}
self.seed = seed
self.rng = random.Random(seed)
self.bounds = self._compute_bounds(self.state.grid) self.bounds = self._compute_bounds(self.state.grid)
self.total_cells = self._area(self.bounds) self.total_cells = self._area(self.bounds)
self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio))) self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio)))
@@ -181,6 +185,10 @@ class CrosswordFiller:
collected = list(unique.values()) collected = list(unique.values())
collected.sort(key=self._slot_priority, reverse=True) collected.sort(key=self._slot_priority, reverse=True)
if len(collected) > 1:
top_slice = collected[: min(MAX_SLOT_CANDIDATES, len(collected))]
self.rng.shuffle(top_slice)
collected = top_slice + collected[min(MAX_SLOT_CANDIDATES, len(collected)) :]
return collected return collected
def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]: def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
@@ -292,7 +300,7 @@ class CrosswordFiller:
return None return None
candidates.sort(key=lambda item: item.local_score, reverse=True) candidates.sort(key=lambda item: item.local_score, reverse=True)
return candidates[0] return self.rng.choice(candidates[: min(3, len(candidates))])
def _word_quality(self, word: str) -> int: def _word_quality(self, word: str) -> int:
metadata = self.vocabulary_metadata.get(word) metadata = self.vocabulary_metadata.get(word)

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import locale import locale
import random
import sys import sys
import time import time
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
@@ -114,6 +115,7 @@ class CrosswordGenerator:
max_candidates_per_word: int = 12, max_candidates_per_word: int = 12,
time_limit_seconds: float = 8.0, time_limit_seconds: float = 8.0,
diffxy: int = DIFFXY, diffxy: int = DIFFXY,
seed: Optional[int] = None,
) -> None: ) -> None:
normalized = [self._normalize(word) for word in words] normalized = [self._normalize(word) for word in words]
unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2)) unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
@@ -122,6 +124,8 @@ class CrosswordGenerator:
self.max_candidates_per_word = max_candidates_per_word self.max_candidates_per_word = max_candidates_per_word
self.time_limit_seconds = time_limit_seconds self.time_limit_seconds = time_limit_seconds
self.diffxy = diffxy self.diffxy = diffxy
self.seed = seed
self.rng = random.Random(seed)
self.started_at = 0.0 self.started_at = 0.0
self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {} self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
self.nodes_visited = 0 self.nodes_visited = 0
@@ -213,6 +217,8 @@ class CrosswordGenerator:
reverse=True, reverse=True,
) )
candidates = candidates[: self.max_candidates_per_word] candidates = candidates[: self.max_candidates_per_word]
if len(candidates) > 1:
self.rng.shuffle(candidates)
next_remaining = [word for word in remaining_words if word != next_word] next_remaining = [word for word in remaining_words if word != next_word]
for placement in candidates: for placement in candidates:
@@ -253,6 +259,10 @@ class CrosswordGenerator:
word, word,
), ),
) )
if len(ranked_words) > 1:
top_slice = ranked_words[: min(5, len(ranked_words))]
self.rng.shuffle(top_slice)
ranked_words = top_slice + ranked_words[min(5, len(ranked_words)) :]
best_word = ranked_words[0] best_word = ranked_words[0]
best_key: Optional[Tuple[int, int, int, str]] = None best_key: Optional[Tuple[int, int, int, str]] = None

BIN
iwn-omw-main.zip Normal file

Binary file not shown.

View File

@@ -0,0 +1,21 @@
# IWN-OMW
This is the repository for the Open Italian WordNet, i.e. ItalWordNet versions compliant with the Open Multilingual WordNet guidelines and initiative.
IWN-OMW is a new LMF version of the ItalWordNet resource converted and formatted according to the guidelines and requirements defined by the Open Multilingual Wordnet initiative (OMW, https://omwn.org/). This current version is derived from the ItalWordNet v.2. (IWN) database (http://hdl.handle.net/20.500.11752/ILC-62).
NB: 'dc:relation', when used, contains links to equivalent Senses in the RDF version of the SIMPLE Italian lexicon.
## Licence
CC-BY-SA 4.0
## Citation
If you use this resource please cite:
Quochi, Valeria, Roberto Bartolini, and Monica Monachini (to appear) "ItalwordNet goes open". *LiLT Special Issues on Open Multilingual
WordNets*. CSLI Publications.
And
Roventini, Adriana, Antonietta Alonge, Francesca Bertagna, Nicoletta Calzolari, J. Cancila, C. Girardi, Bernardo Magnini, Rita Marinelli, Manuela Speranza, and Antonio Zampolli (2003) "ItalwordNet: building a large semantic database for the automatic treatment of Italian". *Linguistica Computazionale* 18-19:745-791.

File diff suppressed because it is too large Load Diff

433535
lexicon_it.json Normal file

File diff suppressed because it is too large Load Diff

1333962
lexicon_it_semantic.json Normal file

File diff suppressed because it is too large Load Diff

149
main.py
View File

@@ -1,8 +1,9 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json
from pathlib import Path from pathlib import Path
from typing import List from typing import Dict, List
from build_vocabulary import ( from build_vocabulary import (
FILTERED_OUTPUT_PATH, FILTERED_OUTPUT_PATH,
@@ -10,39 +11,61 @@ from build_vocabulary import (
OUTPUT_PATH, OUTPUT_PATH,
build_vocabulary, build_vocabulary,
) )
from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
from crossword_generator import CrosswordGenerator, WORDS, render_grid from crossword_generator import CrosswordGenerator, WORDS, render_grid
# Textual aliases accepted by --difficulty and the numeric level (1-5) each
# one maps to. No alias maps to level 3; it is reachable only by passing the
# number itself.
DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
}
# Default value of --topic, and the fallback topic used when the requested
# topic selects no words.
DEFAULT_TOPIC = "general"
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.") parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
parser.add_argument( parser.add_argument(
"--build-vocabulary", "--build-vocabulary",
action="store_true", action="store_true",
help="Rigenera il vocabolario esteso, filtrato e i metadati prima dell'esecuzione.", help="Rigenera i file lessicali intermedi: vocabolario esteso, filtrato e metadati.",
)
parser.add_argument(
"--build-lexicon",
action="store_true",
help="Rigenera `lexicon_it.json` prima dell'esecuzione.",
) )
parser.add_argument( parser.add_argument(
"--skip-fill", "--skip-fill",
action="store_true", action="store_true",
help="Genera solo la griglia iniziale senza eseguire il filler.", help="Genera solo la griglia iniziale e salta il riempimento con il filler.",
)
parser.add_argument(
"--build-semantic-lexicon",
action="store_true",
help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
) )
parser.add_argument( parser.add_argument(
"--vocabulary", "--vocabulary",
type=Path, type=Path,
default=None, default=None,
help="Percorso opzionale a un vocabolario personalizzato.", help="Percorso opzionale a un vocabolario testuale personalizzato da usare al posto di quello di default.",
) )
parser.add_argument( parser.add_argument(
"--target-empty-ratio", "--target-empty-ratio",
type=float, type=float,
default=1 / 6, default=1 / 6,
help="Rapporto target di celle vuote residue dopo il filler.", help="Rapporto target di celle vuote residue dopo il filler. Esempio: 0.1667 lascia circa un sesto di celle vuote.",
) )
parser.add_argument( parser.add_argument(
"--time-limit", "--time-limit",
type=float, type=float,
default=8.0, default=8.0,
help="Tempo massimo in secondi per la fase di generazione iniziale.", help="Tempo massimo in secondi per la fase di generazione iniziale della griglia.",
) )
parser.add_argument( parser.add_argument(
"--max-candidates", "--max-candidates",
@@ -54,7 +77,23 @@ def parse_args() -> argparse.Namespace:
"--diffxy", "--diffxy",
type=int, type=int,
default=7, default=7,
help="Differenza massima preferita tra larghezza e altezza della griglia.", help="Differenza massima preferita tra larghezza e altezza della griglia iniziale.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Seed casuale per ottenere varianti riproducibili del cruciverba: stesso seed, stesso risultato.",
)
parser.add_argument(
"--difficulty",
default="medium",
help="Difficolta lessicale del filler. Alias testuali: easy, medium, hard, expert. Internamente mappati a livelli numerici 1-5.",
)
parser.add_argument(
"--topic",
default=DEFAULT_TOPIC,
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
) )
return parser.parse_args() return parser.parse_args()
@@ -73,21 +112,110 @@ def ensure_vocabulary(args: argparse.Namespace) -> None:
print(f"- parole filtrate: {totals['filtered_words']}") print(f"- parole filtrate: {totals['filtered_words']}")
def ensure_lexicon(args: argparse.Namespace) -> None:
    """Regenerate the base lexicon when requested or when its file is missing.

    Nothing happens when ``--build-lexicon`` was not passed and the lexicon
    file already exists.
    """
    if not args.build_lexicon and LEXICON_OUTPUT_PATH.exists():
        return

    lexicon = build_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    entry_count = lexicon["meta"]["entry_count"]
    print("Lessico rigenerato")
    print(f"- file: {LEXICON_OUTPUT_PATH}")
    print(f"- voci: {entry_count}")
def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
    """Regenerate the semantic lexicon when requested or when it is missing.

    Nothing happens when ``--build-semantic-lexicon`` was not passed and the
    semantic lexicon file already exists.
    """
    if not args.build_semantic_lexicon and SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        return

    lexicon = build_semantic_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    matched_entries = [
        entry for entry in lexicon["entries"] if entry.get("semantic", {}).get("matched")
    ]
    matched = len(matched_entries)
    print("Lessico semantico rigenerato")
    print(f"- file: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"- voci: {lexicon['meta']['entry_count']}")
    print(f"- match semantici: {matched}")
def parse_difficulty(value: str) -> int:
    """Convert a ``--difficulty`` argument to a numeric level between 1 and 5.

    The value is stripped and lower-cased, then looked up among the textual
    aliases; anything else must be an integer in the range 1-5.

    Raises:
        SystemExit: if the value is neither an alias nor an integer, or if
            the integer is outside 1-5.
    """
    key = str(value).strip().lower()
    alias_level = DIFFICULTY_ALIASES.get(key)
    if alias_level is not None:
        return alias_level

    try:
        level = int(key)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if level < 1 or level > 5:
        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
    return level
def load_selected_vocabulary(path: Path | None) -> List[str]: def load_selected_vocabulary(path: Path | None) -> List[str]:
if path is None: if path is None:
return load_vocabulary() return load_vocabulary()
return path.read_text(encoding="utf-8").splitlines() return path.read_text(encoding="utf-8").splitlines()
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the lexicon words allowed for a difficulty level and topic.

    The lexicon file is generated first when it does not exist. A word is
    kept when it is allowed in crosswords, its ``difficulty_word`` does not
    exceed *level* and *topic* (compared case-insensitively) is one of its
    topics. When the requested topic selects nothing, the default topic is
    used instead.

    NOTE(review): this reads the base lexicon (``LEXICON_OUTPUT_PATH``), not
    the semantic one -- confirm that is intended.
    """
    if not LEXICON_OUTPUT_PATH.exists():
        lexicon = build_lexicon()
        LEXICON_OUTPUT_PATH.write_text(
            json.dumps(lexicon, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    wanted_topic = topic.strip().lower()

    def forms_for(selected_topic: str) -> List[str]:
        # Checks run in the same order for every entry: allowed, level, topic.
        selected = []
        for entry in payload.get("entries", []):
            if not entry.get("allowed_in_crossword", False):
                continue
            if int(entry.get("difficulty_word", 5)) > level:
                continue
            entry_topics = [str(item).lower() for item in entry.get("topics", [])]
            if selected_topic in entry_topics:
                selected.append(entry["form"])
        return selected

    words = forms_for(wanted_topic)
    if words or wanted_topic == DEFAULT_TOPIC:
        return words
    # Nothing matched the requested topic: fall back to the default one.
    return forms_for(DEFAULT_TOPIC)
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
ensure_vocabulary(args) ensure_vocabulary(args)
ensure_lexicon(args)
ensure_semantic_lexicon(args)
difficulty_level = parse_difficulty(args.difficulty)
generator = CrosswordGenerator( generator = CrosswordGenerator(
WORDS, WORDS,
diffxy=args.diffxy, diffxy=args.diffxy,
time_limit_seconds=args.time_limit, time_limit_seconds=args.time_limit,
max_candidates_per_word=args.max_candidates, max_candidates_per_word=args.max_candidates,
seed=args.seed,
) )
initial_state = generator.solve() initial_state = generator.solve()
@@ -95,19 +223,24 @@ def main() -> None:
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}") print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
print(f"Intersezioni: {initial_state.intersections}") print(f"Intersezioni: {initial_state.intersections}")
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})") print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}")
print(f"Tema filler: {args.topic}")
if args.seed is not None:
print(f"Seed: {args.seed}")
print() print()
print(render_grid(initial_state.grid, initial_state.placements)) print(render_grid(initial_state.grid, initial_state.placements))
if args.skip_fill: if args.skip_fill:
return return
vocabulary = load_selected_vocabulary(args.vocabulary) vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
metadata = load_vocabulary_metadata() metadata = load_vocabulary_metadata()
filler = CrosswordFiller( filler = CrosswordFiller(
initial_state, initial_state,
vocabulary, vocabulary,
target_empty_ratio=args.target_empty_ratio, target_empty_ratio=args.target_empty_ratio,
vocabulary_metadata=metadata, vocabulary_metadata=metadata,
seed=args.seed,
) )
final_state = filler.fill() final_state = filler.fill()