feat: consolida lessico semantico, temi controllati e filler a quota tematica
This commit is contained in:
291
build_babelnet_enrichment.py
Normal file
291
build_babelnet_enrichment.py
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List, Optional
|
||||||
|
|
||||||
|
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||||
|
from main import parse_difficulty
|
||||||
|
|
||||||
|
|
||||||
|
# Default output file and on-disk cache of raw API responses; both are
# resolved next to this module.
BABELNET_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_babelnet.json"
BABELNET_CACHE_PATH = Path(__file__).parent / ".babelnet_cache.json"
# Base URL that every endpoint name is appended to.
BABELNET_API_BASE = "https://babelnet.io/v9"
# Name of the environment variable holding the API key.
BABELNET_ENV_KEY = "BABELNET_API_KEY"

# Lexicon part-of-speech tag -> value sent as the ``pos`` request parameter.
# Entries whose tag is not listed here are never sent to the API.
POS_TO_BABELNET = dict(
    NOUN="NOUN",
    VERB="VERB",
    ADJ="ADJECTIVE",
    ADV="ADVERB",
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the BabelNet enrichment script.

    Returns:
        The ``argparse`` namespace with the attributes ``api_key``,
        ``topic``, ``difficulty``, ``limit``, ``sleep`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
    )

    # (flag, add_argument keyword arguments), registered in declaration order
    # so that the generated --help lists them in the same sequence.
    option_specs = (
        (
            "--api-key",
            {
                # The environment variable is read once, at parse time.
                "default": os.environ.get(BABELNET_ENV_KEY),
                "help": f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
            },
        ),
        (
            "--topic",
            {
                "default": None,
                "help": "Topic opzionale da usare per limitare le voci da arricchire.",
            },
        ),
        (
            "--difficulty",
            {
                "default": "medium",
                "help": "Difficolta massima delle voci da arricchire: easy, medium, hard, expert oppure 1-5.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 100,
                "help": "Numero massimo di lemmi da interrogare in questa esecuzione.",
            },
        ),
        (
            "--sleep",
            {
                "type": float,
                "default": 0.2,
                "help": "Pausa tra richieste API, utile per non stressare il servizio.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": BABELNET_OUTPUT_PATH,
                "help": "File JSON di output.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def load_json(path: Path, default: object) -> object:
    """Return the parsed JSON content of *path*, or *default* if it is absent.

    Args:
        path: File to read; decoded as UTF-8.
        default: Value returned unchanged when *path* does not exist.

    Returns:
        The decoded JSON document, or *default*.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||||
|
|
||||||
|
|
||||||
|
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as pretty-printed UTF-8 JSON and store it at *path*.

    The document is first written to a sibling temporary file and then moved
    over *path* with ``os.replace``, so an interruption in the middle of the
    write cannot leave a truncated JSON file behind. This matters for the
    response cache, which is rewritten after every processed entry.

    Args:
        path: Destination file; replaced if it already exists.
        payload: Any JSON-serializable object.

    Raises:
        TypeError: If *payload* is not JSON-serializable (nothing is written).
        OSError: If the temporary file cannot be written or moved.
    """
    # Serialize first: a serialization error must not touch the disk at all.
    text = json.dumps(payload, ensure_ascii=False, indent=2)
    tmp_path = path.with_name(f"{path.name}.tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temporary file no longer exists;
        # it is only left over when the write or the move failed.
        if tmp_path.exists():
            tmp_path.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object:
    """Fetch a BabelNet endpoint as JSON, memoizing responses in *cache*.

    The cache key is the request URL *without* the ``key`` parameter, so the
    secret API key is not written into the cache file and cached responses
    stay valid when the key changes. Entries created by earlier versions,
    which were keyed by the full URL, are moved to the new key on first use.

    Args:
        endpoint: Endpoint name appended to ``BABELNET_API_BASE``.
        params: Query parameters; ``key`` (if present) is sent to the server
            but excluded from the cache key.
        cache: Mutable mapping of cache key -> decoded response. A new item
            is added only when a network request was actually performed.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: If the server answers with an HTTP error status.
    """
    base = f"{BABELNET_API_BASE}/{endpoint}"
    url = f"{base}?{urllib.parse.urlencode(params)}"
    public_params = {name: value for name, value in params.items() if name != "key"}
    cache_key = f"{base}?{urllib.parse.urlencode(public_params)}"

    if cache_key in cache:
        return cache[cache_key]
    if url != cache_key and url in cache:
        # Legacy entry keyed by the full URL (API key included): re-key it so
        # the secret disappears from the cache the next time it is saved.
        cache[cache_key] = cache.pop(url)
        return cache[cache_key]

    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc

    cache[cache_key] = payload
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Return the entry's ``topics`` values as a set of lowercase strings.

    A missing ``topics`` key yields an empty set; non-string items are
    converted with ``str`` before lowercasing.
    """
    lowered = set()
    for label in entry.get("topics", []):
        lowered.add(str(label).lower())
    return lowered
|
||||||
|
|
||||||
|
|
||||||
|
def select_entries(payload: Dict[str, object], topic: Optional[str], difficulty_level: int, limit: int) -> List[Dict[str, object]]:
    """Pick the lexicon entries that should be sent to BabelNet.

    An entry is kept when its ``form`` is purely alphabetic and 3 to 16
    characters long, its ``difficulty_word`` (default 5) does not exceed
    *difficulty_level*, its ``pos`` is a key of ``POS_TO_BABELNET`` and, if a
    topic is given, that topic is among the entry's topics.

    Args:
        payload: Lexicon document; entries are read from ``payload["entries"]``.
        topic: Optional topic filter, compared case-insensitively after
            stripping surrounding whitespace. ``None`` or empty disables it.
        difficulty_level: Maximum accepted ``difficulty_word``.
        limit: Maximum number of entries to return. Zero or a negative value
            selects nothing.

    Returns:
        At most *limit* entries, in lexicon order.
    """
    selected: List[Dict[str, object]] = []
    # Checking the limit only after appending would let one entry through
    # even when the caller asked for none.
    if limit <= 0:
        return selected

    normalized_topic = topic.strip().lower() if topic else None

    for entry in payload.get("entries", []):
        word = str(entry.get("form", ""))
        if not word or not word.isalpha():
            continue
        if len(word) < 3 or len(word) > 16:
            continue
        if int(entry.get("difficulty_word", 5)) > difficulty_level:
            continue
        if str(entry.get("pos", "")) not in POS_TO_BABELNET:
            continue
        if normalized_topic and normalized_topic not in entry_topics(entry):
            continue
        selected.append(entry)
        if len(selected) >= limit:
            break

    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def compact_synset_id(payload: Dict[str, object]) -> Dict[str, object]:
    """Reduce a synset reference to its ``id``, ``pos`` and ``source`` fields.

    Fields missing from *payload* are present in the result with value
    ``None``.
    """
    return {field: payload.get(field) for field in ("id", "pos", "source")}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_glosses(payload: Dict[str, object]) -> List[str]:
    """Collect up to five distinct Italian glosses from a synset payload.

    A gloss is kept when its text is non-empty and its ``language`` is
    ``IT``/``ITA`` (case-insensitive) or not specified. JSON ``null`` values
    for ``gloss`` or ``language`` are treated like missing fields, so a null
    gloss is skipped instead of being stored as the literal string "None".

    Args:
        payload: Synset document; glosses are read from ``payload["glosses"]``
            (a missing or null list yields no glosses).

    Returns:
        The accepted gloss texts, de-duplicated in first-seen order, at most 5.
    """
    glosses = []
    for item in payload.get("glosses", []) or []:
        # ``or ""`` maps an explicit null to the same value as a missing key.
        language = str(item.get("language") or "").upper()
        gloss = str(item.get("gloss") or "").strip()
        if gloss and language in {"IT", "ITA", ""}:
            glosses.append(gloss)
    return dedupe(glosses)[:5]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_senses(payload: Dict[str, object]) -> List[str]:
    """Collect up to twenty distinct Italian lemmas from a synset payload.

    For each sense the lemma is ``properties.simpleLemma`` when available,
    otherwise ``fullLemma``; underscores are replaced by spaces. A sense is
    kept when its lemma is non-empty and its ``language`` is ``IT``/``ITA``
    (case-insensitive) or not specified. A null ``properties`` or
    ``language`` field is treated like a missing one instead of raising.

    Args:
        payload: Synset document; senses are read from ``payload["senses"]``
            (a missing or null list yields no senses).

    Returns:
        The accepted lemmas, de-duplicated in first-seen order, at most 20.
    """
    senses = []
    for item in payload.get("senses", []) or []:
        language = str(item.get("language") or "").upper()
        # ``properties`` may be an explicit null; fall back to an empty dict
        # so that ``fullLemma`` can still be used.
        properties = item.get("properties") or {}
        lemma = str(properties.get("simpleLemma") or item.get("fullLemma") or "").strip()
        if lemma and language in {"IT", "ITA", ""}:
            senses.append(lemma.replace("_", " "))
    return dedupe(senses)[:20]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_categories(payload: Dict[str, object]) -> List[str]:
    """Collect up to twenty distinct category names from a synset payload.

    A JSON ``null`` category is skipped rather than being stored as the
    literal string "None".

    Args:
        payload: Synset document; categories are read from
            ``payload["categories"]`` (a missing or null list yields none).

    Returns:
        The non-empty ``category`` values, de-duplicated in first-seen order,
        at most 20.
    """
    categories = []
    for item in payload.get("categories", []) or []:
        # ``or ""`` maps an explicit null to the same value as a missing key.
        category = str(item.get("category") or "").strip()
        if category:
            categories.append(category)
    return dedupe(categories)[:20]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_domains(payload: Dict[str, object]) -> List[str]:
    """Return the domain names found under ``payload["domains"]``.

    Two shapes are accepted:

    * a mapping: the keys whose value is truthy are returned, sorted;
    * a list: the truthy items are returned as strings, de-duplicated in
      first-seen order and capped at 20.

    Any other value yields an empty list.
    """
    raw_domains = payload.get("domains", [])
    if isinstance(raw_domains, list):
        return dedupe(str(entry) for entry in raw_domains if entry)[:20]
    if isinstance(raw_domains, dict):
        names = [str(name) for name, weight in raw_domains.items() if weight]
        names.sort()
        return names
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the distinct, stripped, non-empty items in first-seen order.

    Each item is converted with ``str`` and stripped of surrounding
    whitespace before comparison; items that become empty are dropped.
    """
    # A dict keeps insertion order, so it serves as an ordered set here.
    unique = {}
    for raw in items:
        cleaned = str(raw).strip()
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]:
    """Look up one lexicon entry on BabelNet and summarize its synsets.

    The synset ids for the entry's ``form`` and part of speech are requested
    first; the first three are then fetched in full and reduced to their
    senses, glosses, categories and domains.

    The throttling pause is applied only after requests that actually reached
    the API. Responses served from *cache* cost no quota, so re-running over
    an already cached lexicon no longer sleeps between lookups.

    Args:
        entry: Lexicon entry; ``form`` and ``pos`` are read.
        api_key: BabelNet API key sent with every request.
        cache: Response cache shared with ``request_json``; may be extended.
        sleep_seconds: Pause after each network request; falsy disables it.

    Returns:
        ``{"matched": False, "reason": ..., "synsets": []}`` when the part of
        speech is unsupported or no synset id is returned; otherwise a dict
        with ``matched``, ``synset_refs`` (compact form of up to five
        references) and ``synsets`` (details of up to three synsets).

    Raises:
        RuntimeError: Propagated from ``request_json`` on HTTP errors.
    """
    word = str(entry.get("form", ""))
    pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
    if not pos:
        return {"matched": False, "reason": "unsupported_pos", "synsets": []}

    def fetch(endpoint: str, params: Dict[str, str]) -> object:
        """Call the API through the cache, pausing only after a real request."""
        known_responses = len(cache)
        response = request_json(endpoint, params, cache)
        # request_json adds an item to the cache only on a miss, so a grown
        # cache means the network was hit and the service deserves the pause.
        if sleep_seconds and len(cache) > known_responses:
            time.sleep(sleep_seconds)
        return response

    synset_ids = fetch(
        "getSynsetIds",
        {
            "lemma": word,
            "searchLang": "IT",
            "pos": pos,
            "key": api_key,
        },
    )

    if not isinstance(synset_ids, list) or not synset_ids:
        return {"matched": False, "reason": "no_synsets", "synsets": []}

    synsets = []
    for synset_ref in synset_ids[:3]:
        # References are normally objects with an "id"; anything else is
        # used as the id itself.
        synset_id = synset_ref.get("id") if isinstance(synset_ref, dict) else str(synset_ref)
        if not synset_id:
            continue
        synset_payload = fetch(
            "getSynset",
            {
                "id": synset_id,
                "targetLang": "IT",
                "key": api_key,
            },
        )
        if not isinstance(synset_payload, dict):
            continue
        synsets.append(
            {
                "id": synset_id,
                "senses": extract_senses(synset_payload),
                "glosses": extract_glosses(synset_payload),
                "categories": extract_categories(synset_payload),
                "domains": extract_domains(synset_payload),
            }
        )

    return {
        "matched": bool(synsets),
        "synset_refs": [compact_synset_id(item) for item in synset_ids[:5] if isinstance(item, dict)],
        "synsets": synsets,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def build_babelnet_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Enrich the selected semantic-lexicon entries with BabelNet data.

    Each selected entry is copied, given a ``babelnet`` field produced by
    ``enrich_entry``, and reported on standard output. The response cache is
    saved after every entry so progress survives an interrupted run.

    Args:
        args: Parsed options; ``api_key``, ``topic``, ``difficulty``,
            ``limit`` and ``sleep`` are read.

    Returns:
        A document with a ``meta`` section describing the run and the
        enriched ``entries``.

    Raises:
        SystemExit: If no API key is available.
        FileNotFoundError: If the semantic lexicon file does not exist.
    """
    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )

    lexicon_path = SEMANTIC_LEXICON_OUTPUT_PATH
    if not lexicon_path.exists():
        raise FileNotFoundError(f"Lessico semantico non trovato: {lexicon_path}")

    lexicon = load_json(lexicon_path, {})
    # A cache file that does not contain a JSON object is ignored.
    loaded_cache = load_json(BABELNET_CACHE_PATH, {})
    cache = loaded_cache if isinstance(loaded_cache, dict) else {}

    max_level = parse_difficulty(str(args.difficulty))
    candidates = select_entries(lexicon, args.topic, max_level, args.limit)
    total = len(candidates)

    results = []
    for position, source_entry in enumerate(candidates, start=1):
        record = dict(source_entry)
        record["babelnet"] = enrich_entry(record, args.api_key, cache, args.sleep)
        results.append(record)
        print(f"[{position}/{total}] {source_entry['form']}: {record['babelnet'].get('matched')}")
        # Checkpoint the cache after every entry.
        write_json(BABELNET_CACHE_PATH, cache)

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": lexicon_path.name,
        "source": "BabelNet API",
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "topic": args.topic,
        "difficulty": args.difficulty,
        "requested_limit": args.limit,
        "entry_count": len(results),
    }
    return {"meta": meta, "entries": results}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the enrichment from the command line and print a short summary."""
    args = parse_args()
    payload = build_babelnet_enrichment(args)
    write_json(args.output, payload)

    # Count the entries for which at least one synset was retrieved.
    matched = 0
    for entry in payload["entries"]:
        if entry.get("babelnet", {}).get("matched"):
            matched += 1

    print(f"Lessico BabelNet generato: {args.output}")
    print(f"Voci arricchite: {payload['meta']['entry_count']}")
    print(f"Voci con match BabelNet: {matched}")


if __name__ == "__main__":
    main()
|
||||||
@@ -83,8 +83,9 @@ TOPIC_KEYWORDS = {
|
|||||||
"aula", "figura", "titolo",
|
"aula", "figura", "titolo",
|
||||||
},
|
},
|
||||||
"cinema": {
|
"cinema": {
|
||||||
"film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
|
"film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "cinema",
|
||||||
"finale", "figura",
|
"doppiatore", "documentario", "cinegiornale", "colossal", "commedia", "comparsa",
|
||||||
|
"controfigura", "diva", "divo", "cabaret", "cartoon",
|
||||||
},
|
},
|
||||||
"literature": {
|
"literature": {
|
||||||
"libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
|
"libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
|
||||||
@@ -99,8 +100,12 @@ TOPIC_KEYWORDS = {
|
|||||||
"casale", "balcone", "finestra", "stazione",
|
"casale", "balcone", "finestra", "stazione",
|
||||||
},
|
},
|
||||||
"transport": {
|
"transport": {
|
||||||
"automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
|
"automobile", "auto", "automezzo", "autoveicolo", "autovettura", "autobus", "autocarro",
|
||||||
"volo", "aeroporto", "vettura",
|
"aeromobile", "aeroplano", "aeroporto", "ambulanza", "autoambulanza", "astronave",
|
||||||
|
"barca", "barchetta", "bastimento", "bicicletta", "bici", "bimotore", "bireattore",
|
||||||
|
"bombardiere", "imbarcazione", "motrice", "motore", "nave", "pista", "porto",
|
||||||
|
"quadrimotore", "reattore", "rimorchio", "rimorchiatore", "rotaia", "ruota", "trattore",
|
||||||
|
"treno", "vapore", "vela", "veliero", "vettura", "volante", "volo",
|
||||||
},
|
},
|
||||||
"work": {
|
"work": {
|
||||||
"lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
|
"lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
|
||||||
@@ -115,11 +120,6 @@ TOPIC_KEYWORDS = {
|
|||||||
TOPIC_SUFFIXES = {
|
TOPIC_SUFFIXES = {
|
||||||
"actions": ("are", "ere", "ire"),
|
"actions": ("are", "ere", "ire"),
|
||||||
"abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
|
"abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
|
||||||
"animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
|
|
||||||
"plants": ("fiore", "foglia", "seme", "radice", "erba"),
|
|
||||||
"nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
|
|
||||||
"geography": ("montagna", "isola", "deserto", "confine"),
|
|
||||||
"city": ("strada", "palazzo", "porta", "ponte"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -135,7 +135,7 @@ def infer_topics(word: str, tags: List[str]) -> List[str]:
|
|||||||
|
|
||||||
if "verb_infinitive" in tags:
|
if "verb_infinitive" in tags:
|
||||||
topics.add("actions")
|
topics.add("actions")
|
||||||
if any(word.endswith(suffix) for suffix in ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")):
|
if any(word.endswith(suffix) for suffix in ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza")):
|
||||||
topics.add("abstract")
|
topics.add("abstract")
|
||||||
|
|
||||||
for topic, keywords in TOPIC_KEYWORDS.items():
|
for topic, keywords in TOPIC_KEYWORDS.items():
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ from datetime import datetime
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterable, List, Tuple
|
from typing import Dict, Iterable, List, Tuple
|
||||||
|
|
||||||
from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
|
from build_lexicon import LEXICON_OUTPUT_PATH
|
||||||
|
|
||||||
|
|
||||||
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
|
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
|
||||||
@@ -356,8 +356,7 @@ def enrich_entry(
|
|||||||
][:20]
|
][:20]
|
||||||
glosses = dedupe_keep_order(glosses)
|
glosses = dedupe_keep_order(glosses)
|
||||||
semantic_topics = dedupe_keep_order(
|
semantic_topics = dedupe_keep_order(
|
||||||
list(entry.get("topics", []))
|
semantic_topics_from_text(
|
||||||
+ semantic_topics_from_text(
|
|
||||||
glosses
|
glosses
|
||||||
+ synonyms
|
+ synonyms
|
||||||
+ raw_relation_terms.get("hypernym", [])
|
+ raw_relation_terms.get("hypernym", [])
|
||||||
@@ -365,7 +364,6 @@ def enrich_entry(
|
|||||||
+ raw_relation_terms.get("similar", [])
|
+ raw_relation_terms.get("similar", [])
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
|
|
||||||
entry["semantic"] = {
|
entry["semantic"] = {
|
||||||
"source": "iwn-omw",
|
"source": "iwn-omw",
|
||||||
"matched": True,
|
"matched": True,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ setlocal
|
|||||||
cd /d "%~dp0"
|
cd /d "%~dp0"
|
||||||
|
|
||||||
set "BRANCH_NAME=passo4"
|
set "BRANCH_NAME=passo4"
|
||||||
set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"
|
set "COMMIT_MSG=feat: consolida lessico semantico, temi controllati e filler a quota tematica"
|
||||||
|
|
||||||
if not "%~1"=="" (
|
if not "%~1"=="" (
|
||||||
set "COMMIT_MSG=%~1"
|
set "COMMIT_MSG=%~1"
|
||||||
@@ -32,8 +32,8 @@ if errorlevel 1 (
|
|||||||
if errorlevel 1 exit /b 1
|
if errorlevel 1 exit /b 1
|
||||||
|
|
||||||
echo.
|
echo.
|
||||||
echo Aggiungo le modifiche...
|
echo Aggiungo le modifiche di progetto, escludendo cache Python e cache API...
|
||||||
git add .
|
git add *.py *.bat *.txt lexicon_it.json lexicon_it_semantic.json vocaboli_it_metadata.json package iwn-omw-main
|
||||||
if errorlevel 1 exit /b 1
|
if errorlevel 1 exit /b 1
|
||||||
|
|
||||||
echo.
|
echo.
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class FillCandidate:
|
|||||||
slot: FillSlot
|
slot: FillSlot
|
||||||
new_letters: int
|
new_letters: int
|
||||||
reused_letters: int
|
reused_letters: int
|
||||||
local_score: Tuple[int, int, int]
|
local_score: Tuple[int, ...]
|
||||||
|
|
||||||
|
|
||||||
class CrosswordFiller:
|
class CrosswordFiller:
|
||||||
@@ -73,6 +73,9 @@ class CrosswordFiller:
|
|||||||
*,
|
*,
|
||||||
target_empty_ratio: float = TARGET_EMPTY_RATIO,
|
target_empty_ratio: float = TARGET_EMPTY_RATIO,
|
||||||
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
|
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
|
||||||
|
semantic_metadata: Optional[Dict[str, Dict[str, object]]] = None,
|
||||||
|
selected_topic: str = "general",
|
||||||
|
max_themed_fill_words: int = 10,
|
||||||
seed: Optional[int] = None,
|
seed: Optional[int] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.state = state.copy()
|
self.state = state.copy()
|
||||||
@@ -83,6 +86,9 @@ class CrosswordFiller:
|
|||||||
self.vocabulary = self._normalize_vocabulary(vocabulary)
|
self.vocabulary = self._normalize_vocabulary(vocabulary)
|
||||||
self.words_by_length = self._index_vocabulary(self.vocabulary)
|
self.words_by_length = self._index_vocabulary(self.vocabulary)
|
||||||
self.vocabulary_metadata = vocabulary_metadata or {}
|
self.vocabulary_metadata = vocabulary_metadata or {}
|
||||||
|
self.semantic_metadata = semantic_metadata or {}
|
||||||
|
self.selected_topic = selected_topic.strip().lower()
|
||||||
|
self.max_themed_fill_words = max(0, max_themed_fill_words)
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.rng = random.Random(seed)
|
self.rng = random.Random(seed)
|
||||||
self.bounds = self._compute_bounds(self.state.grid)
|
self.bounds = self._compute_bounds(self.state.grid)
|
||||||
@@ -281,9 +287,11 @@ class CrosswordFiller:
|
|||||||
new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid)
|
new_letters = sum(1 for cell in slot.cells if cell not in self.state.grid)
|
||||||
reused_letters = slot.fixed_letters
|
reused_letters = slot.fixed_letters
|
||||||
local_score = (
|
local_score = (
|
||||||
|
self._semantic_topic_score(word),
|
||||||
reused_letters,
|
reused_letters,
|
||||||
new_letters,
|
new_letters,
|
||||||
self._word_quality(word),
|
self._word_quality(word),
|
||||||
|
self._semantic_quality(word),
|
||||||
len(set(word)),
|
len(set(word)),
|
||||||
)
|
)
|
||||||
candidates.append(
|
candidates.append(
|
||||||
@@ -311,6 +319,56 @@ class CrosswordFiller:
|
|||||||
except (TypeError, ValueError):
|
except (TypeError, ValueError):
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def _semantic_entry(self, word: str) -> Dict[str, object]:
|
||||||
|
return self.semantic_metadata.get(word, {})
|
||||||
|
|
||||||
|
def _semantic_quality(self, word: str) -> int:
|
||||||
|
entry = self._semantic_entry(word)
|
||||||
|
semantic = entry.get("semantic", {})
|
||||||
|
score = 0
|
||||||
|
if semantic.get("matched"):
|
||||||
|
score += 2
|
||||||
|
score += min(3, len(semantic.get("glosses", [])))
|
||||||
|
score += min(2, len(semantic.get("synonyms", [])))
|
||||||
|
return score
|
||||||
|
|
||||||
|
def _semantic_topic_score(self, word: str) -> int:
|
||||||
|
if not self.selected_topic or self.selected_topic == "general":
|
||||||
|
return 0
|
||||||
|
|
||||||
|
entry = self._semantic_entry(word)
|
||||||
|
try:
|
||||||
|
relevance = int(entry.get("_topic_relevance", 0))
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
relevance = 0
|
||||||
|
if relevance:
|
||||||
|
if self._themed_added_count() < self.max_themed_fill_words:
|
||||||
|
return relevance
|
||||||
|
return min(relevance, 10)
|
||||||
|
|
||||||
|
topics = {str(item).lower() for item in entry.get("topics", [])}
|
||||||
|
semantic = entry.get("semantic", {})
|
||||||
|
semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])}
|
||||||
|
score = 0
|
||||||
|
if self.selected_topic in topics:
|
||||||
|
score += 4
|
||||||
|
if self.selected_topic in semantic_topics:
|
||||||
|
score += 6
|
||||||
|
if "general" in topics:
|
||||||
|
score += 1
|
||||||
|
return score
|
||||||
|
|
||||||
|
def _themed_added_count(self) -> int:
|
||||||
|
total = 0
|
||||||
|
for placement in self.added_words:
|
||||||
|
entry = self._semantic_entry(placement.word)
|
||||||
|
try:
|
||||||
|
if int(entry.get("_strong_topic_relevance", 0)) > 0:
|
||||||
|
total += 1
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
continue
|
||||||
|
return total
|
||||||
|
|
||||||
def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
|
def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
|
||||||
dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
|
dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
|
||||||
before = (slot.x - dx, slot.y - dy)
|
before = (slot.x - dx, slot.y - dy)
|
||||||
@@ -380,6 +438,7 @@ class CrosswordFiller:
|
|||||||
f"vuote={self.empty_cells_count()}/{self.total_cells} "
|
f"vuote={self.empty_cells_count()}/{self.total_cells} "
|
||||||
f"target={self.target_empty_cells} "
|
f"target={self.target_empty_cells} "
|
||||||
f"aggiunte={len(self.added_words)} "
|
f"aggiunte={len(self.added_words)} "
|
||||||
|
f"tema={self._themed_added_count()}/{self.max_themed_fill_words} "
|
||||||
f"ultima={self.last_word} "
|
f"ultima={self.last_word} "
|
||||||
f"t={elapsed:0.1f}s"
|
f"t={elapsed:0.1f}s"
|
||||||
)
|
)
|
||||||
|
|||||||
1346
lexicon_it.json
1346
lexicon_it.json
File diff suppressed because it is too large
Load Diff
58313
lexicon_it_semantic.json
58313
lexicon_it_semantic.json
File diff suppressed because it is too large
Load Diff
421
main.py
421
main.py
@@ -25,6 +25,72 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_TOPIC = "general"
|
DEFAULT_TOPIC = "general"
|
||||||
|
# Default for --initial-word-count: one seed per built-in word.
DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
# Word endings that mark a form as abstract-looking.
ABSTRACTISH_SUFFIXES = (
    "zione", "zioni", "mento", "menti", "ita",
    "ezza", "anza", "enza", "ismo",
)
# Parts of speech accepted among the filler candidates.
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
# Minimum quality score and maximum length for general/soft filler words.
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
# Maximum number of loosely related entries kept for a themed puzzle.
SOFT_RELATED_FILL_LIMIT = 120
# Default for --themed-fill-count.
DEFAULT_THEMED_FILL_WORD_COUNT = 10
# Topics for which abstract-looking words receive a relevance penalty.
CONCRETE_TOPICS = {
    "animals", "plants", "nature", "ecology", "geography",
    "weather", "sea", "mountain", "health", "science",
    "sport", "history", "school", "cinema", "literature",
    "food", "city", "transport", "work", "home",
}

# Per-topic word fragments; a word containing one of them counts as matching
# the topic's roots.
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist",
        "vol", "aer", "bici", "cicl", "rimorch", "reattor", "vettur",
        "ambul", "imbarc", "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc",
        "aquil", "anatr", "cavall", "serpent", "tig",
        "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch",
        "mont", "collin", "isol", "rocc", "terra",
        "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi",
        "dialog", "comic", "div", "docu", "pellic", "spettacol",
    ),
}

# Per-topic word fragments that veto a root match and lower the relevance.
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz",
        "esalt", "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus",
        "difensor", "estens", "malumor", "eversor",
    ),
}
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
@@ -95,6 +161,18 @@ def parse_args() -> argparse.Namespace:
|
|||||||
default=DEFAULT_TOPIC,
|
default=DEFAULT_TOPIC,
|
||||||
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
|
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--initial-word-count",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_INITIAL_WORD_COUNT,
|
||||||
|
help="Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--themed-fill-count",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_THEMED_FILL_WORD_COUNT,
|
||||||
|
help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@@ -165,42 +243,328 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
|
|||||||
return path.read_text(encoding="utf-8").splitlines()
|
return path.read_text(encoding="utf-8").splitlines()
|
||||||
|
|
||||||
|
|
||||||
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
|
def load_semantic_payload() -> Dict[str, object]:
|
||||||
if not LEXICON_OUTPUT_PATH.exists():
|
if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
|
||||||
lexicon = build_lexicon()
|
lexicon = build_semantic_lexicon()
|
||||||
LEXICON_OUTPUT_PATH.write_text(
|
SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
|
||||||
json.dumps(lexicon, ensure_ascii=False, indent=2),
|
json.dumps(lexicon, ensure_ascii=False, indent=2),
|
||||||
encoding="utf-8",
|
encoding="utf-8",
|
||||||
)
|
)
|
||||||
|
return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
|
|
||||||
|
def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
    """Return the entry's topic labels as two lowercase sets.

    Returns:
        ``(topics, semantic_topics)``: the first from ``entry["topics"]``,
        the second from ``entry["semantic"]["semantic_topics"]``. Missing
        keys yield empty sets.
    """

    def lowered(labels) -> set[str]:
        return {str(label).lower() for label in labels}

    plain = lowered(entry.get("topics", []))
    semantic_section = entry.get("semantic", {})
    derived = lowered(semantic_section.get("semantic_topics", []))
    return plain, derived
|
||||||
|
|
||||||
|
|
||||||
|
def matches_topic_roots(word: str, selected_topic: str) -> bool:
    """Tell whether *word* contains a root fragment of *selected_topic*.

    A blocked fragment for the topic vetoes the match. Topics without any
    configured root fragments never match.
    """
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return False
    for fragment in TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Score how relevant *entry* is to *topic*.

    The default topic always scores 20. For any other topic the score is the
    sum of the weights of the conditions that hold:

    * +100 the topic is among the entry's topics;
    * +45 the topic is among its semantic topics;
    * +35 the word matches the topic's root fragments;
    * +5 the entry also belongs to the ``general`` topic;
    * -80 the word contains a blocked fragment for the topic;
    * -15 the topic is a concrete one and the word has an abstract-looking
      suffix.
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20

    word = str(entry.get("form", ""))
    topics, semantic_topics = entry_topics(entry)
    blocked_fragments = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())

    adjustments = (
        (selected_topic in topics, 100),
        (selected_topic in semantic_topics, 45),
        (matches_topic_roots(word, selected_topic), 35),
        ("general" in topics, 5),
        (any(fragment in word for fragment in blocked_fragments), -80),
        (selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES), -15),
    )
    return sum(weight for applies, weight in adjustments if applies)
|
||||||
|
|
||||||
|
|
||||||
|
def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Return 100 when *topic* is one of the entry's own topics, else 0.

    The default topic always yields 20. Semantic topics are not considered.
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20
    own_topics = entry_topics(entry)[0]
    if selected_topic in own_topics:
        return 100
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank filler candidates.

    Returns:
        ``(topic relevance, quality score, part-of-speech bonus, semantic
        bonus, length bonus, word)``; larger tuples denote better candidates.
    """
    word = str(entry.get("form", ""))
    quality = int(entry.get("quality_score", 0))
    pos = str(entry.get("pos", ""))
    semantic = entry.get("semantic", {})

    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4, "PREP": 2, "CONJ": 2}
    pos_bonus = pos_weights.get(pos, 0)

    semantic_bonus = 3 if semantic.get("matched") else 0

    # Mid-length words are preferred; very short or very long ones are
    # penalized.
    length = len(word)
    if 4 <= length <= 10:
        length_bonus = 3
    elif 2 <= length <= 13:
        length_bonus = 1
    else:
        length_bonus = -4

    return (
        topic_relevance(entry, topic),
        quality,
        pos_bonus,
        semantic_bonus,
        length_bonus,
        word,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def is_general_fill_support(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* may be used as generic, non-themed filler.

    The entry must reach the minimum quality score, be no longer than the
    maximum filler length, not end with an abstract-looking suffix and
    belong to the default topic.
    """
    word = str(entry.get("form", ""))
    rejected = (
        int(entry.get("quality_score", 0)) < GENERAL_FILL_MIN_QUALITY
        or len(word) > GENERAL_FILL_MAX_LENGTH
        or word.endswith(ABSTRACTISH_SUFFIXES)
    )
    if rejected:
        return False
    own_topics = {str(label).lower() for label in entry.get("topics", [])}
    return DEFAULT_TOPIC in own_topics
|
||||||
|
|
||||||
|
|
||||||
|
def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
    """Return the lexicon entries usable as fill for *topic* at *level*.

    Entries are first restricted to those flagged ``allowed_in_crossword``,
    with ``difficulty_word`` not above *level* and a part of speech listed in
    ``FILL_ALLOWED_POS``.

    For ``DEFAULT_TOPIC`` every eligible entry is kept. For any other topic
    the result is composed of three groups:

    * entries with a positive ``strong_topic_relevance``;
    * at most ``SOFT_RELATED_FILL_LIMIT`` other entries with a positive
      ``topic_relevance`` that also pass the quality, length and suffix
      thresholds, best ranked first;
    * other entries accepted by ``is_general_fill_support`` that are not
      already present.

    The final list is sorted by ``lexical_fill_score`` for the requested
    topic, best first.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()

    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]

    if normalized_topic == DEFAULT_TOPIC:
        selected = eligible
    else:
        # Partition once instead of testing ``entry not in strong_topic`` for
        # every entry: list membership on dicts compares whole dictionaries
        # and made this step quadratic in the lexicon size.
        strong_topic: List[Dict[str, object]] = []
        remaining: List[Dict[str, object]] = []
        for entry in eligible:
            if strong_topic_relevance(entry, normalized_topic) > 0:
                strong_topic.append(entry)
            else:
                remaining.append(entry)

        soft_related = [
            entry
            for entry in remaining
            if topic_relevance(entry, normalized_topic) > 0
            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
        soft_related.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
        soft_kept = soft_related[:SOFT_RELATED_FILL_LIMIT]

        general_support = [entry for entry in remaining if is_general_fill_support(entry)]
        general_support.sort(key=lambda entry: lexical_fill_score(entry, DEFAULT_TOPIC), reverse=True)

        selected = strong_topic + soft_kept
        # General-support entries never overlap the strong group, so only the
        # capped soft-related slice has to be checked for duplicates.
        selected += [entry for entry in general_support if entry not in soft_kept]

    selected.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the word forms of the entries chosen by ``load_filtered_entries``."""
    forms: List[str] = []
    for item in load_filtered_entries(level, topic):
        forms.append(str(item["form"]))
    return forms
|
||||||
|
|
||||||
|
|
||||||
|
def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
    """Map each requested word to a copy of its lexicon entry plus topic scores.

    Every copied entry gains two keys, ``_topic_relevance`` and
    ``_strong_topic_relevance``, computed for *topic*. Words without a
    matching entry are absent from the result; when several entries share a
    form, the last one in the payload wins.
    """
    payload = load_semantic_payload()
    wanted = set(words)
    by_word: Dict[str, Dict[str, object]] = {}

    for raw_entry in payload.get("entries", []):
        form = str(raw_entry.get("form", ""))
        if form in wanted:
            # Shallow copy so the loaded payload is not mutated by the extra keys.
            record = dict(raw_entry)
            record["_topic_relevance"] = topic_relevance(record, topic)
            record["_strong_topic_relevance"] = strong_topic_relevance(record, topic)
            by_word[form] = record

    return by_word
|
||||||
|
|
||||||
|
|
||||||
|
def select_initial_words(level: int, topic: str, count: int) -> List[str]:
    """Choose up to *count* seed words for the initial grid.

    Candidates are the lexicon entries flagged ``allowed_in_crossword`` whose
    ``difficulty_word`` does not exceed *level*. For ``DEFAULT_TOPIC`` the
    pool is made of the entries tagged with that topic; for any other topic
    it is made of the entries tagged with the topic, falling back to the
    default-topic entries when none exist.

    Seeds are picked greedily from the pool entries accepted by
    ``is_seed_friendly``. Only for ``DEFAULT_TOPIC`` is a short selection
    topped up, first from the unfiltered pool and then from ``WORDS``.

    NOTE(review): the block nesting below was reconstructed from a view that
    had lost its indentation; confirm it against the repository file.

    Args:
        level: Maximum ``difficulty_word`` allowed for a candidate.
        topic: Requested topic; stripped and lower-cased before use.
        count: Maximum number of seed words to return.

    Returns:
        At most *count* word forms.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
    # Topics for which the concreteness penalties in word_score are skipped.
    abstract_like_topics = {"abstract", "actions"}

    def matches(entry: Dict[str, object], selected_topic: str) -> bool:
        # True when the entry is tagged with selected_topic.
        topics, semantic_topics = entry_topics(entry)
        return selected_topic in topics

    def word_score(entry: Dict[str, object], selected_topic: str) -> tuple[int, int, int, int, int, int, str]:
        # Sort key for seed candidates; larger tuples rank first.
        topics, semantic_topics = entry_topics(entry)
        quality = int(entry.get("quality_score", 0))
        semantic = entry.get("semantic", {})
        semantic_match = 1 if semantic.get("matched") else 0
        # Number of glosses, capped at 3.
        glossary_bonus = min(3, len(semantic.get("glosses", [])))
        word = str(entry.get("form", ""))
        length = len(word)
        topical_concreteness_penalty = 0
        topic_bonus = 0
        pos_bonus = 0
        if selected_topic in topics:
            topic_bonus += 4
        if "general" in topics:
            topic_bonus += 1
        if str(entry.get("pos", "")) == "NOUN":
            pos_bonus += 4
        elif str(entry.get("pos", "")) == "ADJ":
            pos_bonus += 1
        # NOTE(review): the four penalties below are assumed to be nested
        # under this condition -- confirm against the original indentation.
        if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
            if "abstract" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 3
            if "actions" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 2
            if word.endswith(ABSTRACTISH_SUFFIXES):
                topical_concreteness_penalty -= 4
            if str(entry.get("pos", "")) != "NOUN":
                topical_concreteness_penalty -= 3
        if 5 <= length <= 10:
            length_bonus = 3
        elif 4 <= length <= 12:
            length_bonus = 1
        else:
            length_bonus = -2
        return (
            topic_bonus,
            pos_bonus,
            topical_concreteness_penalty,
            quality,
            semantic_match,
            glossary_bonus,
            length_bonus,
            word,
        )

    def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
        # Strict filter applied to the pool before seeds are picked.
        word = str(entry.get("form", ""))
        pos = str(entry.get("pos", ""))
        topics, semantic_topics = entry_topics(entry)
        topic_hit = selected_topic in topics
        # Only forms of 4 to 13 characters are accepted.
        if len(word) < 4 or len(word) > 13:
            return False
        # Topics listed in CONCRETE_TOPICS accept only nouns without an
        # abstract-looking suffix.
        if selected_topic in CONCRETE_TOPICS and pos != "NOUN":
            return False
        if selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES):
            return False
        # Per-topic substring deny list.
        blocked_substrings = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
        if any(part in word for part in blocked_substrings):
            return False
        # Per-topic substring allow list, enforced only for non-default
        # topics listed in CONCRETE_TOPICS.
        required_substrings = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
        if (
            selected_topic in CONCRETE_TOPICS
            and required_substrings
            and selected_topic != DEFAULT_TOPIC
            and not any(part in word for part in required_substrings)
        ):
            return False
        if selected_topic != DEFAULT_TOPIC and not topic_hit:
            return False
        return True

    def overlap_score(left: str, right: str) -> int:
        # Count of letters the two words share, respecting multiplicity.
        shared = set(left) & set(right)
        return sum(min(left.count(ch), right.count(ch)) for ch in shared)

    def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
        # Greedy selection: start from the best-scored entry, then keep
        # adding the entry whose key (computed against the words already
        # chosen) is the largest.
        if not entries:
            return []

        ranked = sorted(entries, key=lambda entry: word_score(entry, selected_topic), reverse=True)
        chosen: List[str] = []
        chosen_entries: List[Dict[str, object]] = []

        # The top-ranked entry is always taken, whatever target_count is;
        # the caller trims the final list to the requested size.
        first = ranked[0]
        chosen.append(str(first["form"]))
        chosen_entries.append(first)

        while len(chosen) < target_count:
            best_entry = None
            best_key = None
            for entry in ranked:
                word = str(entry.get("form", ""))
                if word in chosen:
                    continue
                overlap_total = sum(overlap_score(word, existing) for existing in chosen)
                max_overlap = max((overlap_score(word, existing) for existing in chosen), default=0)
                distinct_letters = len(set(word))
                # Negative count of already chosen words with the same length.
                same_length_penalty = -sum(1 for existing in chosen if len(existing) == len(word))
                key = (
                    1 if max_overlap >= 2 else 0,
                    overlap_total,
                    max_overlap,
                    same_length_penalty,
                    distinct_letters,
                    word_score(entry, selected_topic),
                )
                # Strict comparison: on equal keys the earlier entry in
                # ranked is kept.
                if best_key is None or key > best_key:
                    best_key = key
                    best_entry = entry
            # No entry left to add.
            if best_entry is None:
                break
            chosen.append(str(best_entry["form"]))
            chosen_entries.append(best_entry)

        return chosen

    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
    ]

    # Entries tagged with the requested topic.
    lexical_topical = []
    for entry in eligible:
        topics, semantic_topics = entry_topics(entry)
        if normalized_topic in topics:
            lexical_topical.append(entry)
    # Entries tagged with the default topic.
    fallback = [entry for entry in eligible if matches(entry, DEFAULT_TOPIC)]
    if normalized_topic == DEFAULT_TOPIC:
        pool = fallback
    else:
        pool = list(lexical_topical)
        if not pool:
            pool = fallback

    strict_pool = [entry for entry in pool if is_seed_friendly(entry, normalized_topic)]
    relaxed_pool = sorted(pool, key=lambda entry: word_score(entry, normalized_topic), reverse=True)

    selected = pick_seed_set(strict_pool, normalized_topic, count)
    # Top-up from the unfiltered pool happens only for the default topic.
    if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
        relaxed_selected = pick_seed_set(relaxed_pool, normalized_topic, count)
        for word in relaxed_selected:
            if word not in selected:
                selected.append(word)
            if len(selected) >= count:
                break

    # Last resort, again only for the default topic: the WORDS list.
    if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
        for word in WORDS:
            if word in selected:
                continue
            selected.append(word)
            if len(selected) >= count:
                break

    return selected[:count]
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
@@ -209,9 +573,10 @@ def main() -> None:
|
|||||||
ensure_lexicon(args)
|
ensure_lexicon(args)
|
||||||
ensure_semantic_lexicon(args)
|
ensure_semantic_lexicon(args)
|
||||||
difficulty_level = parse_difficulty(args.difficulty)
|
difficulty_level = parse_difficulty(args.difficulty)
|
||||||
|
initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)
|
||||||
|
|
||||||
generator = CrosswordGenerator(
|
generator = CrosswordGenerator(
|
||||||
WORDS,
|
initial_words,
|
||||||
diffxy=args.diffxy,
|
diffxy=args.diffxy,
|
||||||
time_limit_seconds=args.time_limit,
|
time_limit_seconds=args.time_limit,
|
||||||
max_candidates_per_word=args.max_candidates,
|
max_candidates_per_word=args.max_candidates,
|
||||||
@@ -220,6 +585,7 @@ def main() -> None:
|
|||||||
initial_state = generator.solve()
|
initial_state = generator.solve()
|
||||||
|
|
||||||
print("Griglia iniziale")
|
print("Griglia iniziale")
|
||||||
|
print(f"Parole-seme richieste: {len(initial_words)}")
|
||||||
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
|
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
|
||||||
print(f"Intersezioni: {initial_state.intersections}")
|
print(f"Intersezioni: {initial_state.intersections}")
|
||||||
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
||||||
@@ -229,17 +595,24 @@ def main() -> None:
|
|||||||
print(f"Seed: {args.seed}")
|
print(f"Seed: {args.seed}")
|
||||||
print()
|
print()
|
||||||
print(render_grid(initial_state.grid, initial_state.placements))
|
print(render_grid(initial_state.grid, initial_state.placements))
|
||||||
|
print()
|
||||||
|
print("Parole-seme selezionate:")
|
||||||
|
print(", ".join(initial_words))
|
||||||
|
|
||||||
if args.skip_fill:
|
if args.skip_fill:
|
||||||
return
|
return
|
||||||
|
|
||||||
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
|
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
|
||||||
metadata = load_vocabulary_metadata()
|
metadata = load_vocabulary_metadata()
|
||||||
|
semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, args.topic) if not args.vocabulary else {}
|
||||||
filler = CrosswordFiller(
|
filler = CrosswordFiller(
|
||||||
initial_state,
|
initial_state,
|
||||||
vocabulary,
|
vocabulary,
|
||||||
target_empty_ratio=args.target_empty_ratio,
|
target_empty_ratio=args.target_empty_ratio,
|
||||||
vocabulary_metadata=metadata,
|
vocabulary_metadata=metadata,
|
||||||
|
semantic_metadata=semantic_metadata,
|
||||||
|
selected_topic=args.topic,
|
||||||
|
max_themed_fill_words=args.themed_fill_count,
|
||||||
seed=args.seed,
|
seed=args.seed,
|
||||||
)
|
)
|
||||||
final_state = filler.fill()
|
final_state = filler.fill()
|
||||||
|
|||||||
Reference in New Issue
Block a user