feat: consolida lessico semantico, temi controllati e filler a quota tematica

This commit is contained in:
2026-04-15 15:37:52 +02:00
parent b172b9c04b
commit a1f8cb8577
8 changed files with 14030 additions and 46434 deletions

View File

@@ -0,0 +1,291 @@
from __future__ import annotations
import argparse
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
from main import parse_difficulty
# Default output file for the enriched lexicon, placed next to this script.
BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json")
# On-disk cache of API responses, also placed next to this script.
BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json")
# Base URL that every endpoint name is appended to.
BABELNET_API_BASE = "https://babelnet.io/v9"
# Name of the environment variable consulted for the API key.
BABELNET_ENV_KEY = "BABELNET_API_KEY"
# Maps the lexicon's part-of-speech tags to the values sent as the "pos"
# request parameter; entries whose tag is missing here are not queried.
POS_TO_BABELNET = {
"NOUN": "NOUN",
"VERB": "VERB",
"ADJ": "ADJECTIVE",
"ADV": "ADVERB",
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the BabelNet enrichment script.

    Returns:
        The ``argparse`` namespace with the attributes ``api_key``,
        ``topic``, ``difficulty``, ``limit``, ``sleep`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
    )
    # Options are declared as (flag, keyword arguments) pairs and registered
    # in this order, which is also the order shown by --help.
    option_table = (
        (
            "--api-key",
            {
                # The environment variable is read once, while building the parser.
                "default": os.environ.get(BABELNET_ENV_KEY),
                "help": f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
            },
        ),
        (
            "--topic",
            {
                "default": None,
                "help": "Topic opzionale da usare per limitare le voci da arricchire.",
            },
        ),
        (
            "--difficulty",
            {
                "default": "medium",
                "help": "Difficolta massima delle voci da arricchire: easy, medium, hard, expert oppure 1-5.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 100,
                "help": "Numero massimo di lemmi da interrogare in questa esecuzione.",
            },
        ),
        (
            "--sleep",
            {
                "type": float,
                "default": 0.2,
                "help": "Pausa tra richieste API, utile per non stressare il servizio.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": BABELNET_OUTPUT_PATH,
                "help": "File JSON di output.",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def load_json(path: Path, default: object) -> object:
    """Read a UTF-8 JSON document, falling back to *default* if it is absent.

    Args:
        path: Location of the JSON file.
        default: Value returned unchanged when *path* does not exist.

    Returns:
        The decoded JSON value, or *default* when there is no such file.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as indented UTF-8 JSON and write it to *path*.

    Non-ASCII characters are written verbatim rather than escaped.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object:
    """Fetch a BabelNet endpoint as JSON, memoizing responses in *cache*.

    The cache key is the request URL *without* the ``key`` parameter, so the
    API key never ends up in a cache that is later written to disk, and
    cached responses stay valid when the key is rotated.

    Args:
        endpoint: Endpoint name appended to ``BABELNET_API_BASE``.
        params: Query parameters, including the ``key`` credential.
        cache: Mutable mapping of cache key to decoded response; updated in
            place when a response is fetched.

    Returns:
        The decoded JSON payload.

    Raises:
        RuntimeError: If the service answers with an HTTP error status.
    """
    base = f"{BABELNET_API_BASE}/{endpoint}"
    url = f"{base}?{urllib.parse.urlencode(params)}"
    public_params = {name: value for name, value in params.items() if name != "key"}
    cache_key = f"{base}?{urllib.parse.urlencode(public_params)}"
    if cache_key in cache:
        return cache[cache_key]
    if url in cache:
        # Entry stored under the old full-URL key (which embeds the API
        # key): re-key it so the credential is dropped from the cache.
        cache[cache_key] = cache.pop(url)
        return cache[cache_key]
    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc
    cache[cache_key] = payload
    return payload
def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Return the entry's ``topics`` values as a set of lower-cased strings.

    A missing ``topics`` field yields an empty set.
    """
    lowered = set()
    for topic in entry.get("topics", []):
        lowered.add(str(topic).lower())
    return lowered
def select_entries(payload: Dict[str, object], topic: Optional[str], difficulty_level: int, limit: int) -> List[Dict[str, object]]:
    """Pick the lexicon entries that should be sent to BabelNet.

    An entry is kept when its ``form`` is purely alphabetic and 3 to 16
    characters long, its ``difficulty_word`` does not exceed
    *difficulty_level*, its ``pos`` has a BabelNet mapping and, if *topic*
    is given, that topic is among the entry's topics.

    Args:
        payload: Decoded lexicon; entries are read from ``payload["entries"]``.
        topic: Optional topic filter, compared case-insensitively.
        difficulty_level: Highest accepted ``difficulty_word`` value.
        limit: Maximum number of entries to return; zero or a negative value
            selects nothing.

    Returns:
        At most *limit* entries, in lexicon order.
    """
    selected: List[Dict[str, object]] = []
    # Without this guard the limit was only checked after the first append,
    # so a limit of 0 still returned one entry.
    if limit <= 0:
        return selected
    normalized_topic = topic.strip().lower() if topic else None
    for entry in payload.get("entries", []):
        word = str(entry.get("form", ""))
        if not word or not word.isalpha():
            continue
        if not 3 <= len(word) <= 16:
            continue
        # Entries without a difficulty are treated as the hardest level (5).
        if int(entry.get("difficulty_word", 5)) > difficulty_level:
            continue
        if str(entry.get("pos", "")) not in POS_TO_BABELNET:
            continue
        if normalized_topic and normalized_topic not in entry_topics(entry):
            continue
        selected.append(entry)
        if len(selected) >= limit:
            break
    return selected
def compact_synset_id(payload: Dict[str, object]) -> Dict[str, object]:
    """Reduce a synset reference to its ``id``, ``pos`` and ``source`` fields.

    Fields missing from *payload* are present in the result with value None.
    """
    kept_fields = ("id", "pos", "source")
    return {field: payload.get(field) for field in kept_fields}
def extract_glosses(payload: Dict[str, object]) -> List[str]:
    """Collect up to five distinct glosses from a synset payload.

    A gloss is kept when its text is non-empty and its ``language`` is
    ``IT``, ``ITA`` (case-insensitive) or missing/empty.
    """
    accepted_languages = {"IT", "ITA", ""}
    collected = []
    # "or []" also covers a payload whose "glosses" field is null.
    for record in payload.get("glosses", []) or []:
        language = str(record.get("language", "")).upper()
        text = str(record.get("gloss", "")).strip()
        if not text:
            continue
        if language in accepted_languages:
            collected.append(text)
    unique = dedupe(collected)
    return unique[:5]
def extract_senses(payload: Dict[str, object]) -> List[str]:
    """Collect up to twenty distinct lemmas from a synset payload.

    For every sense the lemma is taken from ``properties.simpleLemma`` and,
    failing that, from ``fullLemma``; underscores become spaces. A sense is
    kept when its ``language`` is ``IT``, ``ITA`` (case-insensitive) or
    missing/empty.
    """
    senses = []
    # "or []" also covers a payload whose "senses" field is null.
    for item in payload.get("senses", []) or []:
        language = str(item.get("language", "")).upper()
        # A null "properties" value would make a chained .get() raise, so
        # fall back to an empty mapping just like the list fields do.
        properties = item.get("properties") or {}
        lemma = str(properties.get("simpleLemma") or item.get("fullLemma") or "").strip()
        if lemma and language in {"IT", "ITA", ""}:
            senses.append(lemma.replace("_", " "))
    return dedupe(senses)[:20]
def extract_categories(payload: Dict[str, object]) -> List[str]:
    """Collect up to twenty distinct non-empty category names from a synset."""
    # "or []" also covers a payload whose "categories" field is null.
    records = payload.get("categories", []) or []
    names = [str(record.get("category", "")).strip() for record in records]
    return dedupe([name for name in names if name])[:20]
def extract_domains(payload: Dict[str, object]) -> List[str]:
    """Return the domain names attached to a synset payload.

    ``domains`` may be a mapping, in which case the keys with a truthy
    value are returned sorted, or a list, in which case up to twenty
    distinct truthy items are returned in order. Anything else gives ``[]``.
    """
    raw = payload.get("domains", [])
    if isinstance(raw, list):
        return dedupe(str(item) for item in raw if item)[:20]
    if isinstance(raw, dict):
        names = [str(name) for name, weight in raw.items() if weight]
        names.sort()
        return names
    return []
def dedupe(items: Iterable[str]) -> List[str]:
    """Strip every item and drop blanks and repeats, keeping first-seen order."""
    # A dict keeps insertion order, so its keys double as an ordered set.
    ordered_unique = {}
    for raw in items:
        cleaned = str(raw).strip()
        if cleaned:
            ordered_unique.setdefault(cleaned, None)
    return list(ordered_unique)
def _request_with_pause(endpoint: str, params: Dict[str, str], cache: Dict[str, object], sleep_seconds: float) -> object:
    """Call ``request_json`` and pause only if it really queried the API.

    A fetched response is stored under a new cache key, so the cache grows
    exactly when a network request was made; cache hits return immediately.
    """
    size_before = len(cache)
    payload = request_json(endpoint, params, cache)
    if sleep_seconds and len(cache) > size_before:
        time.sleep(sleep_seconds)
    return payload


def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]:
    """Look up one lexicon entry on BabelNet and summarize its synsets.

    Args:
        entry: Lexicon entry; its ``form`` and ``pos`` fields are used.
        api_key: BabelNet API key sent with every request.
        cache: Response cache shared across calls; updated in place.
        sleep_seconds: Pause applied after each request that reached the
            API (cached answers are not throttled).

    Returns:
        A dict with ``matched`` and ``synsets``; failures also carry a
        ``reason`` (``unsupported_pos`` or ``no_synsets``), successful
        lookups also carry ``synset_refs``.

    Raises:
        RuntimeError: Propagated from ``request_json`` on HTTP errors.
    """
    word = str(entry.get("form", ""))
    pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
    if not pos:
        return {"matched": False, "reason": "unsupported_pos", "synsets": []}
    synset_ids = _request_with_pause(
        "getSynsetIds",
        {
            "lemma": word,
            "searchLang": "IT",
            "pos": pos,
            "key": api_key,
        },
        cache,
        sleep_seconds,
    )
    if not isinstance(synset_ids, list) or not synset_ids:
        return {"matched": False, "reason": "no_synsets", "synsets": []}
    synsets = []
    # Only the first three synsets are expanded, to bound requests per word.
    for synset_ref in synset_ids[:3]:
        synset_id = synset_ref.get("id") if isinstance(synset_ref, dict) else str(synset_ref)
        if not synset_id:
            continue
        synset_payload = _request_with_pause(
            "getSynset",
            {
                "id": synset_id,
                "targetLang": "IT",
                "key": api_key,
            },
            cache,
            sleep_seconds,
        )
        if not isinstance(synset_payload, dict):
            continue
        synsets.append(
            {
                "id": synset_id,
                "senses": extract_senses(synset_payload),
                "glosses": extract_glosses(synset_payload),
                "categories": extract_categories(synset_payload),
                "domains": extract_domains(synset_payload),
            }
        )
    return {
        "matched": bool(synsets),
        "synset_refs": [compact_synset_id(item) for item in synset_ids[:5] if isinstance(item, dict)],
        "synsets": synsets,
    }
def build_babelnet_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Enrich the selected semantic-lexicon entries with BabelNet data.

    Args:
        args: Parsed options; ``api_key``, ``topic``, ``difficulty``,
            ``limit`` and ``sleep`` are read.

    Returns:
        A dict with a ``meta`` section describing the run and the list of
        enriched ``entries`` (each a copy of the lexicon entry plus a
        ``babelnet`` field).

    Raises:
        SystemExit: If no API key was supplied.
        FileNotFoundError: If the semantic lexicon file does not exist.
        RuntimeError: Propagated from the API layer on HTTP errors.
    """
    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )
    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico semantico non trovato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    payload = load_json(SEMANTIC_LEXICON_OUTPUT_PATH, {})
    cache = load_json(BABELNET_CACHE_PATH, {})
    # A cache file holding anything but a JSON object is discarded.
    if not isinstance(cache, dict):
        cache = {}
    difficulty_level = parse_difficulty(str(args.difficulty))
    selected_entries = select_entries(payload, args.topic, difficulty_level, args.limit)
    enriched_entries = []
    try:
        for index, entry in enumerate(selected_entries, start=1):
            enriched = dict(entry)
            enriched["babelnet"] = enrich_entry(enriched, args.api_key, cache, args.sleep)
            enriched_entries.append(enriched)
            print(f"[{index}/{len(selected_entries)}] {entry['form']}: {enriched['babelnet'].get('matched')}")
    finally:
        # Persist whatever was fetched even when a request fails part-way,
        # so the next run does not repeat API calls that already succeeded.
        write_json(BABELNET_CACHE_PATH, cache)
    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": SEMANTIC_LEXICON_OUTPUT_PATH.name,
            "source": "BabelNet API",
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "topic": args.topic,
            "difficulty": args.difficulty,
            "requested_limit": args.limit,
            "entry_count": len(enriched_entries),
        },
        "entries": enriched_entries,
    }
def main() -> None:
    """Run the enrichment, write the output file and print a short summary."""
    args = parse_args()
    result = build_babelnet_enrichment(args)
    write_json(args.output, result)
    # Count the entries for which at least one synset was retrieved.
    matched = len([entry for entry in result["entries"] if entry.get("babelnet", {}).get("matched")])
    summary_lines = (
        f"Lessico BabelNet generato: {args.output}",
        f"Voci arricchite: {result['meta']['entry_count']}",
        f"Voci con match BabelNet: {matched}",
    )
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()

View File

@@ -83,8 +83,9 @@ TOPIC_KEYWORDS = {
"aula", "figura", "titolo", "aula", "figura", "titolo",
}, },
"cinema": { "cinema": {
"film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione", "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "cinema",
"finale", "figura", "doppiatore", "documentario", "cinegiornale", "colossal", "commedia", "comparsa",
"controfigura", "diva", "divo", "cabaret", "cartoon",
}, },
"literature": { "literature": {
"libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia", "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
@@ -99,8 +100,12 @@ TOPIC_KEYWORDS = {
"casale", "balcone", "finestra", "stazione", "casale", "balcone", "finestra", "stazione",
}, },
"transport": { "transport": {
"automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota", "automobile", "auto", "automezzo", "autoveicolo", "autovettura", "autobus", "autocarro",
"volo", "aeroporto", "vettura", "aeromobile", "aeroplano", "aeroporto", "ambulanza", "autoambulanza", "astronave",
"barca", "barchetta", "bastimento", "bicicletta", "bici", "bimotore", "bireattore",
"bombardiere", "imbarcazione", "motrice", "motore", "nave", "pista", "porto",
"quadrimotore", "reattore", "rimorchio", "rimorchiatore", "rotaia", "ruota", "trattore",
"treno", "vapore", "vela", "veliero", "vettura", "volante", "volo",
}, },
"work": { "work": {
"lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida", "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
@@ -115,11 +120,6 @@ TOPIC_KEYWORDS = {
TOPIC_SUFFIXES = { TOPIC_SUFFIXES = {
"actions": ("are", "ere", "ire"), "actions": ("are", "ere", "ire"),
"abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"), "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
"animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
"plants": ("fiore", "foglia", "seme", "radice", "erba"),
"nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
"geography": ("montagna", "isola", "deserto", "confine"),
"city": ("strada", "palazzo", "porta", "ponte"),
} }
@@ -135,7 +135,7 @@ def infer_topics(word: str, tags: List[str]) -> List[str]:
if "verb_infinitive" in tags: if "verb_infinitive" in tags:
topics.add("actions") topics.add("actions")
if any(word.endswith(suffix) for suffix in ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")): if any(word.endswith(suffix) for suffix in ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza")):
topics.add("abstract") topics.add("abstract")
for topic, keywords in TOPIC_KEYWORDS.items(): for topic, keywords in TOPIC_KEYWORDS.items():

View File

@@ -9,7 +9,7 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
from typing import Dict, Iterable, List, Tuple from typing import Dict, Iterable, List, Tuple
from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics from build_lexicon import LEXICON_OUTPUT_PATH
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml" IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
@@ -356,8 +356,7 @@ def enrich_entry(
][:20] ][:20]
glosses = dedupe_keep_order(glosses) glosses = dedupe_keep_order(glosses)
semantic_topics = dedupe_keep_order( semantic_topics = dedupe_keep_order(
list(entry.get("topics", [])) semantic_topics_from_text(
+ semantic_topics_from_text(
glosses glosses
+ synonyms + synonyms
+ raw_relation_terms.get("hypernym", []) + raw_relation_terms.get("hypernym", [])
@@ -365,7 +364,6 @@ def enrich_entry(
+ raw_relation_terms.get("similar", []) + raw_relation_terms.get("similar", [])
) )
) )
entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
entry["semantic"] = { entry["semantic"] = {
"source": "iwn-omw", "source": "iwn-omw",
"matched": True, "matched": True,

View File

@@ -4,7 +4,7 @@ setlocal
cd /d "%~dp0" cd /d "%~dp0"
set "BRANCH_NAME=passo4" set "BRANCH_NAME=passo4"
set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet" set "COMMIT_MSG=feat: consolida lessico semantico, temi controllati e filler a quota tematica"
if not "%~1"=="" ( if not "%~1"=="" (
set "COMMIT_MSG=%~1" set "COMMIT_MSG=%~1"
@@ -32,8 +32,8 @@ if errorlevel 1 (
if errorlevel 1 exit /b 1 if errorlevel 1 exit /b 1
echo. echo.
echo Aggiungo le modifiche... echo Aggiungo le modifiche di progetto, escludendo cache Python e cache API...
git add . git add *.py *.bat *.txt lexicon_it.json lexicon_it_semantic.json vocaboli_it_metadata.json package iwn-omw-main
if errorlevel 1 exit /b 1 if errorlevel 1 exit /b 1
echo. echo.

View File

@@ -62,7 +62,7 @@ class FillCandidate:
slot: FillSlot slot: FillSlot
new_letters: int new_letters: int
reused_letters: int reused_letters: int
local_score: Tuple[int, int, int] local_score: Tuple[int, ...]
class CrosswordFiller: class CrosswordFiller:
@@ -73,6 +73,9 @@ class CrosswordFiller:
*, *,
target_empty_ratio: float = TARGET_EMPTY_RATIO, target_empty_ratio: float = TARGET_EMPTY_RATIO,
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None, vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
semantic_metadata: Optional[Dict[str, Dict[str, object]]] = None,
selected_topic: str = "general",
max_themed_fill_words: int = 10,
seed: Optional[int] = None, seed: Optional[int] = None,
) -> None: ) -> None:
self.state = state.copy() self.state = state.copy()
@@ -83,6 +86,9 @@ class CrosswordFiller:
self.vocabulary = self._normalize_vocabulary(vocabulary) self.vocabulary = self._normalize_vocabulary(vocabulary)
self.words_by_length = self._index_vocabulary(self.vocabulary) self.words_by_length = self._index_vocabulary(self.vocabulary)
self.vocabulary_metadata = vocabulary_metadata or {} self.vocabulary_metadata = vocabulary_metadata or {}
self.semantic_metadata = semantic_metadata or {}
self.selected_topic = selected_topic.strip().lower()
self.max_themed_fill_words = max(0, max_themed_fill_words)
self.seed = seed self.seed = seed
self.rng = random.Random(seed) self.rng = random.Random(seed)
self.bounds = self._compute_bounds(self.state.grid) self.bounds = self._compute_bounds(self.state.grid)
@@ -281,9 +287,11 @@ class CrosswordFiller:
new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid) new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid)
reused_letters = slot.fixed_letters reused_letters = slot.fixed_letters
local_score = ( local_score = (
self._semantic_topic_score(word),
reused_letters, reused_letters,
new_letters, new_letters,
self._word_quality(word), self._word_quality(word),
self._semantic_quality(word),
len(set(word)), len(set(word)),
) )
candidates.append( candidates.append(
@@ -311,6 +319,56 @@ class CrosswordFiller:
except (TypeError, ValueError): except (TypeError, ValueError):
return 0 return 0
def _semantic_entry(self, word: str) -> Dict[str, object]:
    """Return the semantic metadata stored for *word*, or an empty dict."""
    metadata = self.semantic_metadata
    return metadata.get(word, {})
def _semantic_quality(self, word: str) -> int:
    """Score how much semantic information is available for *word*.

    Two points for a matched semantic record, plus one per gloss (at most
    three) and one per synonym (at most two).
    """
    semantic = self._semantic_entry(word).get("semantic", {})
    matched_bonus = 2 if semantic.get("matched") else 0
    gloss_bonus = min(3, len(semantic.get("glosses", [])))
    synonym_bonus = min(2, len(semantic.get("synonyms", [])))
    return matched_bonus + gloss_bonus + synonym_bonus
def _semantic_topic_score(self, word: str) -> int:
    """Score how well *word* fits the selected topic.

    Returns 0 when no topic or the ``general`` topic is selected. If the
    entry carries a non-zero ``_topic_relevance`` it is returned as is
    while the themed-word quota is not yet reached, and capped at 10
    afterwards. Otherwise the score is derived from the entry's topic
    lists: 4 for a lexical topic hit, 6 for a semantic topic hit and 1 if
    the entry is also tagged ``general``.
    """
    topic = self.selected_topic
    if not topic or topic == "general":
        return 0
    entry = self._semantic_entry(word)
    try:
        relevance = int(entry.get("_topic_relevance", 0))
    except (TypeError, ValueError):
        # Non-numeric relevance is treated as absent.
        relevance = 0
    if relevance:
        quota_open = self._themed_added_count() < self.max_themed_fill_words
        return relevance if quota_open else min(relevance, 10)
    lexical_topics = {str(item).lower() for item in entry.get("topics", [])}
    semantic = entry.get("semantic", {})
    semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])}
    weighted_hits = (
        (topic in lexical_topics, 4),
        (topic in semantic_topics, 6),
        ("general" in lexical_topics, 1),
    )
    return sum(points for hit, points in weighted_hits if hit)
def _themed_added_count(self) -> int:
    """Count the added words whose ``_strong_topic_relevance`` is positive.

    Words whose relevance value cannot be converted to an integer are
    not counted.
    """
    themed = 0
    for placement in self.added_words:
        raw_relevance = self._semantic_entry(placement.word).get("_strong_topic_relevance", 0)
        try:
            is_themed = int(raw_relevance) > 0
        except (TypeError, ValueError):
            continue
        if is_themed:
            themed += 1
    return themed
def _placement_is_valid(self, slot: FillSlot, word: str) -> bool: def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1) dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
before = (slot.x - dx, slot.y - dy) before = (slot.x - dx, slot.y - dy)
@@ -380,6 +438,7 @@ class CrosswordFiller:
f"vuote={self.empty_cells_count()}/{self.total_cells} " f"vuote={self.empty_cells_count()}/{self.total_cells} "
f"target={self.target_empty_cells} " f"target={self.target_empty_cells} "
f"aggiunte={len(self.added_words)} " f"aggiunte={len(self.added_words)} "
f"tema={self._themed_added_count()}/{self.max_themed_fill_words} "
f"ultima={self.last_word} " f"ultima={self.last_word} "
f"t={elapsed:0.1f}s" f"t={elapsed:0.1f}s"
) )

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

415
main.py
View File

@@ -25,6 +25,72 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
} }
DEFAULT_TOPIC = "general" DEFAULT_TOPIC = "general"
# Default for --initial-word-count: as many seed words as WORDS contains.
DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
# Word endings used (via str.endswith) to flag abstract-looking words, which
# are penalised or excluded when a concrete topic is selected.
ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
# Parts of speech an entry must have to be eligible as fill vocabulary.
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
# Minimum quality_score and maximum length for non-topical support words.
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
# Upper bound on the number of loosely topic-related entries kept.
SOFT_RELATED_FILL_LIMIT = 120
# Default for --themed-fill-count.
DEFAULT_THEMED_FILL_WORD_COUNT = 10
# Topics for which seed words must be nouns without an abstract-looking
# suffix, and for which such suffixes lower the topic relevance.
CONCRETE_TOPICS = {
"animals",
"plants",
"nature",
"ecology",
"geography",
"weather",
"sea",
"mountain",
"health",
"science",
"sport",
"history",
"school",
"cinema",
"literature",
"food",
"city",
"transport",
"work",
"home",
}
# Per topic: substrings of which at least one must occur in a word for it to
# count as matching the topic's roots. Topics not listed have no requirement.
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
"transport": (
"auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
"bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
"trattor", "carr", "vap", "rota", "ruot",
),
"animals": (
"can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
"serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
),
"nature": (
"mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
"rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
"litoral", "piogg", "nev", "onda", "clim",
),
"cinema": (
"film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
"comic", "div", "docu", "pellic", "spettacol",
),
}
# Per topic: substrings that disqualify a word even if it contains one of the
# required substrings (presumably known false positives — confirm with the
# lexicon maintainers).
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
"transport": (
"intervist", "intratten", "speriment", "stermin", "investig",
"intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
"eccit", "traduz", "fluttu", "sollecit",
),
"animals": (
"assicur", "finanz", "coediz", "camerier", "servitor", "indic",
"estens", "diffus", "difensor", "spessor", "maggior",
),
"cinema": (
"manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
"malumor", "eversor",
),
}
def parse_args() -> argparse.Namespace: def parse_args() -> argparse.Namespace:
@@ -95,6 +161,18 @@ def parse_args() -> argparse.Namespace:
default=DEFAULT_TOPIC, default=DEFAULT_TOPIC,
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.", help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
) )
parser.add_argument(
"--initial-word-count",
type=int,
default=DEFAULT_INITIAL_WORD_COUNT,
help="Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
)
parser.add_argument(
"--themed-fill-count",
type=int,
default=DEFAULT_THEMED_FILL_WORD_COUNT,
help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
)
return parser.parse_args() return parser.parse_args()
@@ -165,42 +243,328 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
return path.read_text(encoding="utf-8").splitlines() return path.read_text(encoding="utf-8").splitlines()
def load_filtered_vocabulary(level: int, topic: str) -> List[str]: def load_semantic_payload() -> Dict[str, object]:
if not LEXICON_OUTPUT_PATH.exists(): if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
lexicon = build_lexicon() lexicon = build_semantic_lexicon()
LEXICON_OUTPUT_PATH.write_text( SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
json.dumps(lexicon, ensure_ascii=False, indent=2), json.dumps(lexicon, ensure_ascii=False, indent=2),
encoding="utf-8", encoding="utf-8",
) )
return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
    """Return the entry's lexical and semantic topics as lower-cased sets.

    The first set comes from ``entry["topics"]``, the second from
    ``entry["semantic"]["semantic_topics"]``; missing fields give empty sets.
    """
    lexical = {str(item).lower() for item in entry.get("topics", [])}
    semantic_block = entry.get("semantic", {})
    semantic = set()
    for item in semantic_block.get("semantic_topics", []):
        semantic.add(str(item).lower())
    return lexical, semantic
def matches_topic_roots(word: str, selected_topic: str) -> bool:
    """Tell whether *word* contains a root substring of *selected_topic*.

    A word containing any blocked substring of the topic never matches;
    topics without required substrings never match either.
    """
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return False
    for fragment in TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return True
    return False
def topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Score how relevant *entry* is to *topic*; the result may be negative.

    The default topic always scores 20. Otherwise points are added for a
    lexical topic hit (100), a semantic topic hit (45), a root-substring
    match (35) and the ``general`` tag (5), and subtracted for a blocked
    substring (80) and for an abstract-looking suffix under a concrete
    topic (15).
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20
    word = str(entry.get("form", ""))
    topics, semantic_topics = entry_topics(entry)
    has_blocked_part = any(
        part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
    )
    looks_abstract = selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES)
    # (condition, points) pairs; every satisfied condition contributes.
    adjustments = (
        (selected_topic in topics, 100),
        (selected_topic in semantic_topics, 45),
        (matches_topic_roots(word, selected_topic), 35),
        ("general" in topics, 5),
        (has_blocked_part, -80),
        (looks_abstract, -15),
    )
    return sum(points for applies, points in adjustments if applies)
def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Return 100 if *topic* is one of the entry's lexical topics, else 0.

    The default topic always yields 20.
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20
    lexical_topics = entry_topics(entry)[0]
    if selected_topic in lexical_topics:
        return 100
    return 0
def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank fill-vocabulary entries.

    Returns:
        ``(topic relevance, quality score, part-of-speech bonus, semantic
        bonus, length bonus, word)``; higher tuples rank first when sorted
        in reverse.
    """
    word = str(entry.get("form", ""))
    quality = int(entry.get("quality_score", 0))
    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4, "PREP": 2, "CONJ": 2}
    pos_bonus = pos_weights.get(str(entry.get("pos", "")), 0)
    semantic_bonus = 0
    if entry.get("semantic", {}).get("matched"):
        semantic_bonus = 3
    # Mid-length words are preferred; very short or very long ones penalised.
    size = len(word)
    if 4 <= size <= 10:
        length_bonus = 3
    elif 2 <= size <= 13:
        length_bonus = 1
    else:
        length_bonus = -4
    relevance = topic_relevance(entry, topic)
    return (relevance, quality, pos_bonus, semantic_bonus, length_bonus, word)
def is_general_fill_support(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* may serve as a non-topical support word.

    The entry needs a sufficient quality score, a form no longer than the
    general-fill maximum and without an abstract-looking suffix, and the
    default topic among its lexical topics.
    """
    word = str(entry.get("form", ""))
    good_enough = int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
    if not good_enough or len(word) > GENERAL_FILL_MAX_LENGTH or word.endswith(ABSTRACTISH_SUFFIXES):
        return False
    lexical_topics = {str(item).lower() for item in entry.get("topics", [])}
    return DEFAULT_TOPIC in lexical_topics
def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
    """Select and rank the lexicon entries usable as fill vocabulary.

    Entries must be allowed in crosswords, not exceed *level* in
    difficulty and have an allowed part of speech. For the default topic
    all of them are used; for any other topic the result is made of the
    strongly topical entries, the best loosely related ones (capped at
    ``SOFT_RELATED_FILL_LIMIT``) and general support words.

    Args:
        level: Highest accepted ``difficulty_word`` value.
        topic: Requested topic, compared case-insensitively.

    Returns:
        The selected entries, best ``lexical_fill_score`` first.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]
    if normalized_topic == DEFAULT_TOPIC:
        selected = eligible
    else:
        # Partition once instead of testing "entry not in strong_topic" for
        # every entry: strong relevance depends only on the entry's content,
        # so the predicate gives the same split without the quadratic scans.
        strong_topic = []
        remaining = []
        for entry in eligible:
            if strong_topic_relevance(entry, normalized_topic) > 0:
                strong_topic.append(entry)
            else:
                remaining.append(entry)
        soft_related = [
            entry
            for entry in remaining
            if topic_relevance(entry, normalized_topic) > 0
            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
        soft_related.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
        soft_selected = soft_related[:SOFT_RELATED_FILL_LIMIT]
        general_support = [entry for entry in remaining if is_general_fill_support(entry)]
        general_support.sort(key=lambda entry: lexical_fill_score(entry, DEFAULT_TOPIC), reverse=True)
        selected = strong_topic + soft_selected
        # Support words are never strongly topical, so they can only clash
        # with the (bounded) soft selection.
        selected += [entry for entry in general_support if entry not in soft_selected]
    # NOTE(review): the final ranking is assumed to apply to both branches;
    # confirm against the original indentation.
    selected.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
    return selected
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the word forms of the filtered entries, in ranked order."""
    forms: List[str] = []
    for entry in load_filtered_entries(level, topic):
        forms.append(str(entry["form"]))
    return forms
def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
    """Map each vocabulary word to a copy of its semantic-lexicon entry.

    Every copy additionally carries ``_topic_relevance`` and
    ``_strong_topic_relevance`` computed for *topic*. If the lexicon has
    several entries with the same form, the last one wins.
    """
    payload = load_semantic_payload()
    wanted = set(words)
    by_word: Dict[str, Dict[str, object]] = {}
    for entry in payload.get("entries", []):
        form = str(entry.get("form", ""))
        if form in wanted:
            # Copy so the relevance annotations do not leak into the payload.
            annotated = dict(entry)
            annotated["_topic_relevance"] = topic_relevance(annotated, topic)
            annotated["_strong_topic_relevance"] = strong_topic_relevance(annotated, topic)
            by_word[form] = annotated
    return by_word
def select_initial_words(level: int, topic: str, count: int) -> List[str]:
payload = load_semantic_payload()
normalized_topic = topic.strip().lower()
abstract_like_topics = {"abstract", "actions"}
def matches(entry: Dict[str, object], selected_topic: str) -> bool: def matches(entry: Dict[str, object], selected_topic: str) -> bool:
topics = [str(item).lower() for item in entry.get("topics", [])] topics, semantic_topics = entry_topics(entry)
return selected_topic in topics return selected_topic in topics
words = [ def word_score(entry: Dict[str, object], selected_topic: str) -> tuple[int, int, int, int, int, int, str]:
entry["form"] topics, semantic_topics = entry_topics(entry)
quality = int(entry.get("quality_score", 0))
semantic = entry.get("semantic", {})
semantic_match = 1 if semantic.get("matched") else 0
glossary_bonus = min(3, len(semantic.get("glosses", [])))
word = str(entry.get("form", ""))
length = len(word)
topical_concreteness_penalty = 0
topic_bonus = 0
pos_bonus = 0
if selected_topic in topics:
topic_bonus += 4
if "general" in topics:
topic_bonus += 1
if str(entry.get("pos", "")) == "NOUN":
pos_bonus += 4
elif str(entry.get("pos", "")) == "ADJ":
pos_bonus += 1
if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
if "abstract" in topics and selected_topic not in topics:
topical_concreteness_penalty -= 3
if "actions" in topics and selected_topic not in topics:
topical_concreteness_penalty -= 2
if word.endswith(ABSTRACTISH_SUFFIXES):
topical_concreteness_penalty -= 4
if str(entry.get("pos", "")) != "NOUN":
topical_concreteness_penalty -= 3
if 5 <= length <= 10:
length_bonus = 3
elif 4 <= length <= 12:
length_bonus = 1
else:
length_bonus = -2
return (
topic_bonus,
pos_bonus,
topical_concreteness_penalty,
quality,
semantic_match,
glossary_bonus,
length_bonus,
word,
)
def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
    """Tell whether *entry* is acceptable as a seed word for the topic.

    The form must be 4 to 13 characters long and free of the topic's
    blocked substrings. Under a concrete topic it must also be a noun
    without an abstract-looking suffix and contain one of the topic's
    required substrings (when the topic defines any). Any topic other than
    the default one must appear among the entry's lexical topics.
    """
    word = str(entry.get("form", ""))
    lexical_topics, _ = entry_topics(entry)
    if not 4 <= len(word) <= 13:
        return False
    concrete = selected_topic in CONCRETE_TOPICS
    if concrete:
        if str(entry.get("pos", "")) != "NOUN":
            return False
        if word.endswith(ABSTRACTISH_SUFFIXES):
            return False
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return False
    required = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
    needs_root = concrete and bool(required) and selected_topic != DEFAULT_TOPIC
    if needs_root and not any(fragment in word for fragment in required):
        return False
    if selected_topic == DEFAULT_TOPIC:
        return True
    return selected_topic in lexical_topics
def overlap_score(left: str, right: str) -> int:
    """Count the letters two words share, respecting multiplicity.

    Each letter contributes the smaller of its occurrence counts in the
    two words.
    """
    total = 0
    for letter in set(left):
        if letter in right:
            total += min(left.count(letter), right.count(letter))
    return total
def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
    """Greedily pick seed words that share letters with those already chosen.

    The best-ranked entry (by ``word_score``) is taken first. Each further
    pick maximises, in order: having at least two letters in common with
    some chosen word, the total overlap, the largest single overlap, not
    repeating a length already used, the number of distinct letters, and
    finally ``word_score``.

    Args:
        entries: Candidate lexicon entries.
        selected_topic: Topic forwarded to ``word_score``.
        target_count: Number of words wanted; non-positive values give ``[]``.

    Returns:
        Up to *target_count* word forms, in the order they were picked.
    """
    if not entries or target_count <= 0:
        return []
    ranked = sorted(entries, key=lambda entry: word_score(entry, selected_topic), reverse=True)
    chosen: List[str] = [str(ranked[0]["form"])]
    while len(chosen) < target_count:
        best_entry = None
        best_key = None
        for entry in ranked:
            word = str(entry.get("form", ""))
            if word in chosen:
                continue
            # Compute each pairwise overlap once and reuse it for both the
            # total and the maximum.
            overlaps = [overlap_score(word, existing) for existing in chosen]
            overlap_total = sum(overlaps)
            max_overlap = max(overlaps, default=0)
            distinct_letters = len(set(word))
            same_length_penalty = -sum(1 for existing in chosen if len(existing) == len(word))
            key = (
                1 if max_overlap >= 2 else 0,
                overlap_total,
                max_overlap,
                same_length_penalty,
                distinct_letters,
                word_score(entry, selected_topic),
            )
            if best_key is None or key > best_key:
                best_key = key
                best_entry = entry
        if best_entry is None:
            # Every candidate has been used already.
            break
        chosen.append(str(best_entry["form"]))
    return chosen
eligible = [
entry
for entry in payload.get("entries", []) for entry in payload.get("entries", [])
if entry.get("allowed_in_crossword", False) if entry.get("allowed_in_crossword", False)
and int(entry.get("difficulty_word", 5)) <= level and int(entry.get("difficulty_word", 5)) <= level
and matches(entry, normalized_topic)
] ]
if words: lexical_topical = []
return words for entry in eligible:
topics, semantic_topics = entry_topics(entry)
if normalized_topic in topics:
lexical_topical.append(entry)
fallback = [entry for entry in eligible if matches(entry, DEFAULT_TOPIC)]
if normalized_topic == DEFAULT_TOPIC:
pool = fallback
else:
pool = list(lexical_topical)
if not pool:
pool = fallback
if normalized_topic != DEFAULT_TOPIC: strict_pool = [entry for entry in pool if is_seed_friendly(entry, normalized_topic)]
return [ relaxed_pool = sorted(pool, key=lambda entry: word_score(entry, normalized_topic), reverse=True)
entry["form"]
for entry in payload.get("entries", [])
if entry.get("allowed_in_crossword", False)
and int(entry.get("difficulty_word", 5)) <= level
and matches(entry, DEFAULT_TOPIC)
]
return words selected = pick_seed_set(strict_pool, normalized_topic, count)
if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
relaxed_selected = pick_seed_set(relaxed_pool, normalized_topic, count)
for word in relaxed_selected:
if word not in selected:
selected.append(word)
if len(selected) >= count:
break
if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
for word in WORDS:
if word in selected:
continue
selected.append(word)
if len(selected) >= count:
break
return selected[:count]
def main() -> None: def main() -> None:
@@ -209,9 +573,10 @@ def main() -> None:
ensure_lexicon(args) ensure_lexicon(args)
ensure_semantic_lexicon(args) ensure_semantic_lexicon(args)
difficulty_level = parse_difficulty(args.difficulty) difficulty_level = parse_difficulty(args.difficulty)
initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)
generator = CrosswordGenerator( generator = CrosswordGenerator(
WORDS, initial_words,
diffxy=args.diffxy, diffxy=args.diffxy,
time_limit_seconds=args.time_limit, time_limit_seconds=args.time_limit,
max_candidates_per_word=args.max_candidates, max_candidates_per_word=args.max_candidates,
@@ -220,6 +585,7 @@ def main() -> None:
initial_state = generator.solve() initial_state = generator.solve()
print("Griglia iniziale") print("Griglia iniziale")
print(f"Parole-seme richieste: {len(initial_words)}")
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}") print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
print(f"Intersezioni: {initial_state.intersections}") print(f"Intersezioni: {initial_state.intersections}")
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})") print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
@@ -229,17 +595,24 @@ def main() -> None:
print(f"Seed: {args.seed}") print(f"Seed: {args.seed}")
print() print()
print(render_grid(initial_state.grid, initial_state.placements)) print(render_grid(initial_state.grid, initial_state.placements))
print()
print("Parole-seme selezionate:")
print(", ".join(initial_words))
if args.skip_fill: if args.skip_fill:
return return
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic) vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
metadata = load_vocabulary_metadata() metadata = load_vocabulary_metadata()
semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, args.topic) if not args.vocabulary else {}
filler = CrosswordFiller( filler = CrosswordFiller(
initial_state, initial_state,
vocabulary, vocabulary,
target_empty_ratio=args.target_empty_ratio, target_empty_ratio=args.target_empty_ratio,
vocabulary_metadata=metadata, vocabulary_metadata=metadata,
semantic_metadata=semantic_metadata,
selected_topic=args.topic,
max_themed_fill_words=args.themed_fill_count,
seed=args.seed, seed=args.seed,
) )
final_state = filler.fill() final_state = filler.fill()