feat: consolida lessico semantico, temi controllati e filler a quota tematica
This commit is contained in:
421
main.py
421
main.py
@@ -25,6 +25,72 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
|
||||
}
|
||||
|
||||
# Topic assumed when none is requested; the relevance helpers return a flat
# score for it instead of ranking entries by theme.
DEFAULT_TOPIC = "general"
# Default number of seed words: one per entry of WORDS.
DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
# Word endings treated as markers of abstract vocabulary (checked with
# str.endswith, which accepts a tuple).
ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
# Parts of speech an entry must have to be offered to the filler.
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
# Quality floor and length ceiling for non-thematic (and softly related) filler words.
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
# Maximum number of softly related entries kept in the filler vocabulary.
SOFT_RELATED_FILL_LIMIT = 120
# Default for the --themed-fill-count command-line option.
DEFAULT_THEMED_FILL_WORD_COUNT = 10
# Topics for which abstract-looking words are penalised and seed words must be nouns.
CONCRETE_TOPICS = {
    "animals",
    "plants",
    "nature",
    "ecology",
    "geography",
    "weather",
    "sea",
    "mountain",
    "health",
    "science",
    "sport",
    "history",
    "school",
    "cinema",
    "literature",
    "food",
    "city",
    "transport",
    "work",
    "home",
}

# Per-topic substrings: a word counts as rooted in the topic when it contains
# at least one of them (plain substring test, not a stem match).
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
        "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
        "comic", "div", "docu", "pellic", "spettacol",
    ),
}

# Per-topic substrings that veto a word for that topic even when a required
# substring also matches.
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
        "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
        "malumor", "eversor",
    ),
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
@@ -95,6 +161,18 @@ def parse_args() -> argparse.Namespace:
|
||||
default=DEFAULT_TOPIC,
|
||||
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--initial-word-count",
|
||||
type=int,
|
||||
default=DEFAULT_INITIAL_WORD_COUNT,
|
||||
help="Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--themed-fill-count",
|
||||
type=int,
|
||||
default=DEFAULT_THEMED_FILL_WORD_COUNT,
|
||||
help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -165,42 +243,328 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
|
||||
return path.read_text(encoding="utf-8").splitlines()
|
||||
|
||||
|
||||
def load_semantic_payload() -> Dict[str, object]:
    """Return the semantic lexicon parsed from ``SEMANTIC_LEXICON_OUTPUT_PATH``.

    If the file does not exist it is first generated with
    ``build_semantic_lexicon()`` and written as indented UTF-8 JSON.  The
    payload is always read back from disk, so the returned object is what
    was serialized rather than the in-memory build result.
    """
    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        lexicon = build_semantic_lexicon()
        SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
            json.dumps(lexicon, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
    """Return ``(topics, semantic_topics)`` of a lexicon entry as lowercase sets.

    ``topics`` comes from ``entry["topics"]`` and ``semantic_topics`` from
    ``entry["semantic"]["semantic_topics"]``.  A missing key, a ``None``
    value (JSON ``null``) or a non-dict ``semantic`` yields an empty set
    instead of raising.
    """
    semantic = entry.get("semantic")
    raw_semantic_topics = semantic.get("semantic_topics") if isinstance(semantic, dict) else None
    topics = {str(item).lower() for item in entry.get("topics") or ()}
    semantic_topics = {str(item).lower() for item in raw_semantic_topics or ()}
    return topics, semantic_topics
|
||||
|
||||
|
||||
def matches_topic_roots(word: str, selected_topic: str) -> bool:
    """Tell whether ``word`` contains a root substring registered for the topic.

    A blocked substring for the topic vetoes the word.  Topics without an
    entry in ``TOPIC_SEED_REQUIRED_SUBSTRINGS`` never match.  The test is a
    case-sensitive substring check.
    """
    blocked = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
    if any(part in word for part in blocked):
        return False
    roots = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ())
    # any() over an empty tuple is already False, so no explicit emptiness test.
    return any(part in word for part in roots)
|
||||
|
||||
|
||||
def topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Score how well a lexicon entry fits ``topic`` (higher is better).

    For the default topic every entry scores a flat 20.  Otherwise the score
    is the sum of:

    * +100 when the topic is among the entry's ``topics``;
    * +45 when it is among the entry's semantic topics;
    * +35 when the word matches the topic's root substrings;
    * +5 when the entry is also tagged with the default topic;
    * -80 when the word contains a blocked substring for the topic;
    * -15 when the topic is concrete and the word has an abstract-looking suffix.

    The result can be negative.
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20

    word = str(entry.get("form", ""))
    topics, semantic_topics = entry_topics(entry)

    score = 0
    if selected_topic in topics:
        score += 100
    if selected_topic in semantic_topics:
        score += 45
    if matches_topic_roots(word, selected_topic):
        score += 35
    if DEFAULT_TOPIC in topics:
        score += 5

    if any(part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())):
        score -= 80
    if selected_topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES):
        score -= 15
    return score
|
||||
|
||||
|
||||
def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Return 100 when the entry is explicitly tagged with ``topic``, else 0.

    Only the entry's ``topics`` list counts here; semantic topics and root
    substrings are ignored.  For the default topic every entry scores 20.
    """
    selected_topic = topic.strip().lower()
    if selected_topic == DEFAULT_TOPIC:
        return 20
    topics, _ = entry_topics(entry)
    return 100 if selected_topic in topics else 0
|
||||
|
||||
|
||||
# Part-of-speech bonus used by lexical_fill_score; unknown tags score 0.
_FILL_POS_BONUS: Dict[str, int] = {
    "NOUN": 12,
    "VERB": 8,
    "ADJ": 6,
    "ADV": 4,
    "PREP": 2,
    "CONJ": 2,
}


def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank filler candidates for ``topic``.

    The tuple compares, in order: topic relevance, ``quality_score``,
    part-of-speech bonus, a bonus of 3 when ``semantic.matched`` is truthy,
    a length bonus (3 for 4-10 letters, 1 for other lengths in 2-13, -4
    otherwise) and finally the word itself as a tie-breaker.
    """
    word = str(entry.get("form", ""))
    # A JSON null for "semantic" is treated like a missing key.
    semantic = entry.get("semantic") or {}

    length = len(word)
    if 4 <= length <= 10:
        length_bonus = 3
    elif 2 <= length <= 13:
        length_bonus = 1
    else:
        length_bonus = -4

    return (
        topic_relevance(entry, topic),
        int(entry.get("quality_score", 0)),
        _FILL_POS_BONUS.get(str(entry.get("pos", "")), 0),
        3 if semantic.get("matched") else 0,
        length_bonus,
        word,
    )
|
||||
|
||||
|
||||
def is_general_fill_support(entry: Dict[str, object]) -> bool:
    """Tell whether an entry may serve as non-thematic filler.

    The entry must reach ``GENERAL_FILL_MIN_QUALITY``, be at most
    ``GENERAL_FILL_MAX_LENGTH`` letters long, not end with an
    abstract-looking suffix, and be tagged with the default topic.
    """
    if int(entry.get("quality_score", 0)) < GENERAL_FILL_MIN_QUALITY:
        return False
    word = str(entry.get("form", ""))
    if len(word) > GENERAL_FILL_MAX_LENGTH or word.endswith(ABSTRACTISH_SUFFIXES):
        return False
    # A JSON null for "topics" is treated like a missing key.
    return DEFAULT_TOPIC in {str(item).lower() for item in entry.get("topics") or ()}
|
||||
|
||||
|
||||
def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
    """Return the lexicon entries offered to the filler, best candidates first.

    An entry is eligible when it is allowed in crosswords, its
    ``difficulty_word`` does not exceed ``level`` and its part of speech is
    in ``FILL_ALLOWED_POS``.  For the default topic all eligible entries are
    returned.  For any other topic the result is the union of:

    * every entry explicitly tagged with the topic;
    * up to ``SOFT_RELATED_FILL_LIMIT`` softly related entries (positive
      topic relevance, good quality, short, not abstract-looking);
    * general-purpose support entries not already selected.

    The final list is sorted by ``lexical_fill_score`` for the topic,
    descending.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()

    # NOTE(review): eligibility is deliberately not topic-filtered here, so
    # that soft and general candidates survive - confirm against the caller.
    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]

    if normalized_topic == DEFAULT_TOPIC:
        selected = eligible
    else:
        # Partition once instead of testing `entry not in strong_topic` per
        # entry: membership of the strong group depends only on the entry's
        # own topics, so the predicate decides it directly.
        strong_topic: List[Dict[str, object]] = []
        remainder: List[Dict[str, object]] = []
        for entry in eligible:
            if strong_topic_relevance(entry, normalized_topic) > 0:
                strong_topic.append(entry)
            else:
                remainder.append(entry)

        soft_related = [
            entry
            for entry in remainder
            if topic_relevance(entry, normalized_topic) > 0
            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
        # Sorted before slicing so the cap keeps the best soft candidates.
        soft_related.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
        soft_kept = soft_related[:SOFT_RELATED_FILL_LIMIT]

        general_support = [entry for entry in remainder if is_general_fill_support(entry)]
        general_support.sort(key=lambda entry: lexical_fill_score(entry, DEFAULT_TOPIC), reverse=True)

        selected = strong_topic + soft_kept
        # General entries can never be in the strong group, so checking the
        # (bounded) soft slice is enough to avoid duplicates.
        selected += [entry for entry in general_support if entry not in soft_kept]

    selected.sort(key=lambda entry: lexical_fill_score(entry, normalized_topic), reverse=True)
    return selected
|
||||
|
||||
|
||||
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the word forms of ``load_filtered_entries(level, topic)``, in the same order."""
    return [str(entry["form"]) for entry in load_filtered_entries(level, topic)]
|
||||
|
||||
|
||||
def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
    """Map each requested word to a copy of its lexicon entry plus relevance scores.

    Each returned entry is a shallow copy extended with ``_topic_relevance``
    and ``_strong_topic_relevance`` computed for ``topic``.  Words absent
    from the lexicon are omitted; if a form occurs more than once in the
    lexicon the last occurrence wins.
    """
    wanted = set(words)
    metadata: Dict[str, Dict[str, object]] = {}
    for entry in load_semantic_payload().get("entries", []):
        word = str(entry.get("form", ""))
        if word not in wanted:
            continue
        metadata[word] = {
            **entry,
            "_topic_relevance": topic_relevance(entry, topic),
            "_strong_topic_relevance": strong_topic_relevance(entry, topic),
        }
    return metadata
|
||||
|
||||
|
||||
def select_initial_words(level: int, topic: str, count: int) -> List[str]:
    """Pick up to ``count`` seed words for the initial grid.

    Candidates are lexicon entries allowed in crosswords whose
    ``difficulty_word`` does not exceed ``level``.  The pool is the set of
    entries tagged with ``topic``; when that is empty (or the topic is the
    default one) the entries tagged with the default topic are used.  Seeds
    are chosen greedily from the seed-friendly part of the pool, favouring
    words that share letters with the seeds already chosen.

    Only for the default topic, a shortfall is topped up first from the
    unfiltered pool and then from ``WORDS``.  For any other topic fewer than
    ``count`` words may be returned.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
    abstract_like_topics = {"abstract", "actions"}

    def word_score(
        entry: Dict[str, object], selected_topic: str
    ) -> tuple[int, int, int, int, int, int, int, str]:
        """Intrinsic ranking key of an entry as a seed (larger is better)."""
        topics, _ = entry_topics(entry)
        word = str(entry.get("form", ""))
        pos = str(entry.get("pos", ""))
        semantic = entry.get("semantic", {})
        on_topic = selected_topic in topics

        topic_bonus = (4 if on_topic else 0) + (1 if "general" in topics else 0)
        pos_bonus = {"NOUN": 4, "ADJ": 1}.get(pos, 0)

        # Penalties only apply to themed, non-abstract topics.
        # NOTE(review): the suffix and non-noun penalties are assumed to sit
        # inside this guard together with the tag penalties - confirm.
        penalty = 0
        if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
            if not on_topic:
                if "abstract" in topics:
                    penalty -= 3
                if "actions" in topics:
                    penalty -= 2
            if word.endswith(ABSTRACTISH_SUFFIXES):
                penalty -= 4
            if pos != "NOUN":
                penalty -= 3

        length = len(word)
        if 5 <= length <= 10:
            length_bonus = 3
        elif 4 <= length <= 12:
            length_bonus = 1
        else:
            length_bonus = -2

        return (
            topic_bonus,
            pos_bonus,
            penalty,
            int(entry.get("quality_score", 0)),
            1 if semantic.get("matched") else 0,
            min(3, len(semantic.get("glosses", []))),
            length_bonus,
            word,
        )

    def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
        """Apply the strict seed filters (length, noun-only, suffix, substrings, tag)."""
        word = str(entry.get("form", ""))
        if len(word) < 4 or len(word) > 13:
            return False

        concrete = selected_topic in CONCRETE_TOPICS
        if concrete and str(entry.get("pos", "")) != "NOUN":
            return False
        if concrete and word.endswith(ABSTRACTISH_SUFFIXES):
            return False

        if any(part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())):
            return False

        required = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
        if (
            concrete
            and required
            and selected_topic != DEFAULT_TOPIC
            and not any(part in word for part in required)
        ):
            return False

        if selected_topic != DEFAULT_TOPIC and selected_topic not in entry_topics(entry)[0]:
            return False
        return True

    def overlap_score(left: str, right: str) -> int:
        """Count the letters two words have in common, respecting multiplicity."""
        return sum(min(left.count(ch), right.count(ch)) for ch in set(left) & set(right))

    def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
        """Greedily choose seeds: best-ranked entry first, then best crossing partners."""
        if not entries:
            return []

        # Score each entry once; the greedy rounds below reuse the result.
        ranked = sorted(
            ((word_score(entry, selected_topic), entry) for entry in entries),
            key=lambda pair: pair[0],
            reverse=True,
        )

        chosen: List[str] = [str(ranked[0][1]["form"])]
        chosen_lookup = set(chosen)

        while len(chosen) < target_count:
            best_entry = None
            best_key = None
            for score, entry in ranked:
                word = str(entry.get("form", ""))
                if word in chosen_lookup:
                    continue
                overlaps = [overlap_score(word, existing) for existing in chosen]
                max_overlap = max(overlaps, default=0)
                key = (
                    1 if max_overlap >= 2 else 0,
                    sum(overlaps),
                    max_overlap,
                    # Negative count: fewer already-chosen words of this length is better.
                    -sum(1 for existing in chosen if len(existing) == len(word)),
                    len(set(word)),
                    score,
                )
                # Strict comparison keeps the earliest-ranked entry on ties.
                if best_key is None or key > best_key:
                    best_key = key
                    best_entry = entry
            if best_entry is None:
                break
            picked = str(best_entry["form"])
            chosen.append(picked)
            chosen_lookup.add(picked)

        return chosen

    def top_up(current: List[str], candidates: List[str]) -> None:
        """Append unseen candidates to ``current`` until it holds ``count`` words."""
        for word in candidates:
            if len(current) >= count:
                break
            if word not in current:
                current.append(word)

    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
    ]

    fallback = [entry for entry in eligible if DEFAULT_TOPIC in entry_topics(entry)[0]]
    if normalized_topic == DEFAULT_TOPIC:
        pool = fallback
    else:
        pool = [entry for entry in eligible if normalized_topic in entry_topics(entry)[0]] or fallback

    strict_pool = [entry for entry in pool if is_seed_friendly(entry, normalized_topic)]
    selected = pick_seed_set(strict_pool, normalized_topic, count)

    if normalized_topic == DEFAULT_TOPIC:
        if len(selected) < count:
            # pick_seed_set ranks its input itself, so the pool needs no pre-sort.
            top_up(selected, pick_seed_set(pool, normalized_topic, count))
        if len(selected) < count:
            top_up(selected, WORDS)

    return selected[:count]
|
||||
|
||||
|
||||
def main() -> None:
|
||||
@@ -209,9 +573,10 @@ def main() -> None:
|
||||
ensure_lexicon(args)
|
||||
ensure_semantic_lexicon(args)
|
||||
difficulty_level = parse_difficulty(args.difficulty)
|
||||
initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)
|
||||
|
||||
generator = CrosswordGenerator(
|
||||
WORDS,
|
||||
initial_words,
|
||||
diffxy=args.diffxy,
|
||||
time_limit_seconds=args.time_limit,
|
||||
max_candidates_per_word=args.max_candidates,
|
||||
@@ -220,6 +585,7 @@ def main() -> None:
|
||||
initial_state = generator.solve()
|
||||
|
||||
print("Griglia iniziale")
|
||||
print(f"Parole-seme richieste: {len(initial_words)}")
|
||||
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
|
||||
print(f"Intersezioni: {initial_state.intersections}")
|
||||
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
||||
@@ -229,17 +595,24 @@ def main() -> None:
|
||||
print(f"Seed: {args.seed}")
|
||||
print()
|
||||
print(render_grid(initial_state.grid, initial_state.placements))
|
||||
print()
|
||||
print("Parole-seme selezionate:")
|
||||
print(", ".join(initial_words))
|
||||
|
||||
if args.skip_fill:
|
||||
return
|
||||
|
||||
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
|
||||
metadata = load_vocabulary_metadata()
|
||||
semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, args.topic) if not args.vocabulary else {}
|
||||
filler = CrosswordFiller(
|
||||
initial_state,
|
||||
vocabulary,
|
||||
target_empty_ratio=args.target_empty_ratio,
|
||||
vocabulary_metadata=metadata,
|
||||
semantic_metadata=semantic_metadata,
|
||||
selected_topic=args.topic,
|
||||
max_themed_fill_words=args.themed_fill_count,
|
||||
seed=args.seed,
|
||||
)
|
||||
final_state = filler.fill()
|
||||
|
||||
Reference in New Issue
Block a user