Files
cruciverba_1/clue_generator.py

424 lines
12 KiB
Python

from __future__ import annotations
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
from crossword_generator import HORIZONTAL, Placement
@dataclass(frozen=True)
class Clue:
    """Immutable clue attached to one placed word of the crossword grid."""

    # Sequential clue number; generate_clues assigns it starting from 1.
    number: int
    # The answer word, copied from the placement.
    word: str
    # Direction label; generate_clues stores "orizzontale" or "verticale".
    direction: str
    # Coordinates copied from the placement (placement.x / placement.y).
    x: int
    y: int
    # Clue text shown to the solver.
    text: str
    # Origin of the text (a candidate's source, or "fallback").
    source: str
@dataclass(frozen=True)
class ClueCandidate:
    """Immutable candidate definition considered when choosing a clue."""

    # Cleaned clue text (add_candidate stores the clean_definition output).
    text: str
    # Label of the data source the text came from (e.g. "semantic", "babelnet").
    source: str
    # Candidate family used for scoring bonuses, e.g. "semantic_definition",
    # "semantic_gloss", "refined_sense", "babelnet_best_gloss", "babelnet_gloss".
    family: str
    # Style hint computed by candidate_hint: "direct", "balanced" or "oblique".
    difficulty_hint: str
    # Integer topicality score; score_candidate rewards values > 0 and >= 40.
    topic_score: int
    # Whether the candidate is considered strongly tied to the topic.
    strong_topic: bool
# Accepted difficulty spellings mapped to the canonical level names.
# Numeric strings "1".."5" are aliases ("4" and "5" both mean "expert");
# normalize_difficulty falls back to "medium" for anything not listed here.
DIFFICULTY_ALIASES = {
    "1": "easy",
    "2": "medium",
    "3": "hard",
    "4": "expert",
    "5": "expert",
    "easy": "easy",
    "medium": "medium",
    "hard": "hard",
    "expert": "expert",
}
# Lower-case fragments that mark a clue as generic boilerplate.
# score_candidate subtracts 120 points when any of them occurs in the
# lower-cased clue text.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",
    "termine lessicale collegato",
    "collegato a:",
)
def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lower-cased form.

    The payload is read through ``load_json``, called with
    ``{"entries": []}`` as its second argument (presumably the default used
    when the file cannot be read -- confirm against ``load_json``).

    Returns an empty dict when the payload is not a dict. Entries that are
    not dicts, or that have no truthy ``form``, are skipped. When several
    entries share the same lower-cased form, the last one wins.
    """
    payload = load_json(path, {"entries": []})
    if not isinstance(payload, dict):
        return {}

    index: Dict[str, Dict[str, object]] = {}
    for record in payload.get("entries", []) or []:
        if not isinstance(record, dict) or not record.get("form"):
            continue
        index[str(record.get("form", "")).lower()] = record
    return index
def normalize_difficulty(value: Optional[str]) -> str:
    """Return the canonical difficulty name for *value*.

    The value is stringified, stripped and lower-cased before being looked
    up in ``DIFFICULTY_ALIASES``. Falsy or unknown values yield "medium".
    """
    lookup_key = str(value or "medium").strip().lower()
    return DIFFICULTY_ALIASES.get(lookup_key, "medium")
def clean_definition(text: str, answer: str) -> str:
    """Turn a raw dictionary definition into a printable clue.

    Steps: drop ``[...]`` annotations, collapse whitespace, mask every
    case-insensitive occurrence of *answer* with "questa parola", remove
    parentheses left empty, tidy spaces before ``,`` / ``;``, capitalise the
    first letter and terminate with a single period.

    Args:
        text: Raw definition; falsy values are treated as empty.
        answer: The crossword answer that must not appear in the clue.

    Returns:
        The cleaned clue ending in ".", or "" when nothing usable remains.
    """
    clue = re.sub(r"\[[^\]]*\]", " ", str(text or ""))
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    # An empty pattern matches between every character, which would splice
    # the placeholder all over the clue; only mask a real answer.
    if answer:
        clue = re.sub(re.escape(answer), "questa parola", clue, flags=re.IGNORECASE)

    clue = re.sub(r"\(\s*\)", "", clue)
    # Removing empty parentheses can leave doubled or dangling spaces and
    # punctuation, so normalise once more after all substitutions.
    clue = re.sub(r"\s+", " ", clue)
    clue = re.sub(r"\s+([,;])", r"\1", clue)
    clue = clue.strip(" .;:-")
    if not clue:
        # Nothing but annotations/empty parentheses: no clue rather than ".".
        return ""

    if clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether *synset* carries one of the topic's "strong" domains.

    Returns True without inspecting the synset when *topic* is falsy or
    "general", or when ``TOPIC_DOMAIN_RULES`` lists no strong domains for
    the topic. Otherwise domain names are compared upper-cased.
    """
    is_generic_topic = not topic or topic == "general"
    if is_generic_topic:
        return True

    topic_rules = TOPIC_DOMAIN_RULES.get(topic, {})
    wanted = {str(name).upper() for name in topic_rules.get("strong", ())}
    if not wanted:
        return True

    tagged = (str(name).upper() for name in synset.get("domains", []) or [])
    return any(name in wanted for name in tagged)
def text_contains_answer(text: str, answer: str) -> bool:
    """Return True when *answer* occurs in *text*, ignoring case.

    The answer is matched literally (regex metacharacters are escaped) and
    as a plain substring, not as a whole word.
    """
    literal = re.compile(re.escape(answer), re.IGNORECASE)
    return literal.search(text) is not None
def directness_score(text: str) -> int:
    """Estimate how plainly *text* names the category of the answer.

    Each category keyword found in the lower-cased text is worth 8 points;
    the presence of an explanatory marker ("cioè", "ossia", "ovvero") adds
    4 more points once.
    """
    category_words = (
        "strumento",
        "veicolo",
        "animale",
        "pianta",
        "titolo",
        "edificio",
        "persona",
        "luogo",
        "malattia",
        "farmaco",
        "mezzo",
        "parte di",
    )
    explanatory_markers = ("cioè", "ossia", "ovvero")

    folded = text.lower()
    hits = sum(1 for word in category_words if word in folded)
    total = 8 * hits
    for marker in explanatory_markers:
        if marker in folded:
            total += 4
            break
    return total
def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the (min, max) preferred clue length for *difficulty*.

    Any difficulty other than "easy", "medium" or "hard" gets the
    expert range (14, 50).
    """
    ranges = {
        "easy": (24, 90),
        "medium": (20, 75),
        "hard": (16, 60),
    }
    return ranges.get(difficulty, (14, 50))
def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Score *candidate* as a clue for *answer* at the given difficulty.

    Higher is better. Texts that are empty or shorter than 12 characters
    get the sentinel score -10_000. Otherwise the score is the sum of
    independent adjustments: generic-boilerplate penalty, answer leakage,
    fit to the preferred length range, directness, candidate family,
    difficulty-hint match, semicolon usage, topic relevance, register
    markers and overall length.
    """
    clue_text = candidate.text
    if not clue_text or len(clue_text) < 12:
        return -10_000

    folded = clue_text.lower()
    clue_len = len(clue_text)
    total = 0

    # Boilerplate clues and clues that leak the answer are heavily penalised.
    if any(fragment in folded for fragment in GENERIC_CLUE_PATTERNS):
        total -= 120
    total += -140 if text_contains_answer(clue_text, answer) else 40

    # Length fit: a bonus inside the range, a distance-based penalty outside
    # (too-short clues are penalised half as much as too-long ones).
    shortest, longest = preferred_length_range(difficulty)
    if clue_len > longest:
        total -= clue_len - longest
    elif clue_len < shortest:
        total -= (shortest - clue_len) // 2
    else:
        total += 28

    # Direct wording helps easier levels and hurts harder ones.
    directness = directness_score(clue_text)
    directness_delta = {
        "easy": 2 * directness,
        "medium": directness,
        "hard": -max(0, directness - 6),
    }.get(difficulty, -directness)
    total += directness_delta

    bonus_by_family = {
        "semantic_definition": 56,
        "semantic_gloss": 34,
        "refined_sense": 30,
        "babelnet_best_gloss": 18,
        "babelnet_gloss": 10,
        "fallback": 0,
    }
    total += bonus_by_family.get(candidate.family, 0)

    hints_by_difficulty = {
        "easy": {"direct", "balanced"},
        "medium": {"balanced", "direct"},
        "hard": {"balanced", "oblique"},
        "expert": {"oblique", "balanced"},
    }
    accepted_hints = hints_by_difficulty.get(difficulty, {"balanced"})
    if candidate.difficulty_hint in accepted_hints:
        total += 18

    # Semicolon-separated clues are rewarded on easy, penalised on hard/expert.
    if ";" in clue_text:
        if difficulty == "easy":
            total += 8
        elif difficulty in {"hard", "expert"}:
            total -= 8

    # Topic relevance; BabelNet glosses with no topic score are penalised.
    if candidate.topic_score >= 40:
        total += 18
    elif candidate.topic_score > 0:
        total += 8
    elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}:
        total -= 140

    if candidate.strong_topic:
        total += 10

    # Register abbreviations such as "(fig.)" are penalised on easy/medium.
    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", folded):
        total -= 28

    # Very long clues are penalised cumulatively.
    if clue_len > 120:
        total -= 45
    if clue_len > 180:
        total -= 90
    return total
def candidate_hint(text: str, family: str) -> str:
    """Classify a clue text as "direct", "oblique" or "balanced".

    Rules are applied in order: short (<= 70 chars) semantic definitions or
    glosses are "direct"; texts mentioning a figurative/literary marker are
    "oblique"; texts longer than 85 characters are "direct"; everything
    else is "balanced".
    """
    size = len(text)
    if size <= 70 and family in {"semantic_definition", "semantic_gloss"}:
        return "direct"

    folded = text.lower()
    for marker in ("fig.", "figurato", "poetico", "letterario"):
        if marker in folded:
            return "oblique"

    return "direct" if size > 85 else "balanced"
def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean *text* and append it to *candidates* unless empty or duplicate.

    Both *candidates* and *seen* are mutated in place. Duplicates are
    detected per family, on the lower-cased cleaned text. The difficulty
    hint of the new candidate is derived from the cleaned text.
    """
    cleaned = clean_definition(text, answer)
    dedupe_key = (cleaned.lower(), family)
    if not cleaned or dedupe_key in seen:
        return

    seen.add(dedupe_key)
    hint = candidate_hint(cleaned, family)
    candidates.append(
        ClueCandidate(
            text=cleaned,
            source=source,
            family=family,
            difficulty_hint=hint,
            topic_score=topic_score,
            strong_topic=strong_topic,
        )
    )
def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``semantic`` section.

    Each synset ``definition`` becomes a "semantic_definition" candidate and
    each item of the section-level ``glosses`` list becomes a
    "semantic_gloss" candidate, all with source "semantic".

    Missing values (``None``, empty strings, other falsy items) are skipped
    before stringifying: ``str(None)`` would otherwise produce the bogus
    clue "None.".

    Returns:
        The candidates in discovery order; an empty list when ``semantic``
        is absent or not a dict.
    """
    semantic = entry.get("semantic", {})
    if not isinstance(semantic, dict):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    for synset in semantic.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        definition = synset.get("definition")
        if not definition:
            # Key absent or explicitly null/empty: nothing to turn into a clue.
            continue
        add_candidate(
            candidates,
            seen,
            text=str(definition),
            answer=answer,
            source="semantic",
            family="semantic_definition",
        )

    for gloss in semantic.get("glosses", []) or []:
        if not gloss:
            continue
        add_candidate(
            candidates,
            seen,
            text=str(gloss),
            answer=answer,
            source="semantic",
            family="semantic_gloss",
        )
    return candidates
def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``babelnet`` section.

    Only sections whose ``status`` is "enriched" or "ambiguous" are used.
    Glosses of ``best_synset`` become "babelnet_best_gloss" candidates that
    carry the synset's own ``topic_score``. Glosses of the remaining
    ``synsets`` become "babelnet_gloss" candidates; when a specific topic
    (not "general") is requested, synsets without a strong topic domain are
    dropped and the surviving ones get a topic score of 40.

    Missing glosses (``None``, empty strings, other falsy items) are skipped
    before stringifying: ``str(None)`` would otherwise produce the bogus
    clue "None.".

    Returns:
        The candidates in discovery order, possibly empty.
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}:
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    best_synset = babelnet.get("best_synset", {})
    if isinstance(best_synset, dict):
        topic_score = int(best_synset.get("topic_score", 0) or 0)
        strong_topic = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(best_synset, topic)
        for gloss in best_synset.get("glosses", []) or []:
            if not gloss:
                continue
            add_candidate(
                candidates,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=topic_score,
                strong_topic=strong_topic,
            )

    # Loop-invariant: whether the caller asked for a specific topic.
    topic_specific = bool(topic) and topic != "general"
    for synset in babelnet.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        if topic_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        # Any synset reaching this point under a specific topic has already
        # passed the strong-domain check, so the check is not repeated.
        topic_score = 40 if topic_specific else 0
        for gloss in synset.get("glosses", []) or []:
            if not gloss:
                continue
            add_candidate(
                candidates,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=topic_score,
                strong_topic=topic_score >= 40,
            )
    return candidates
def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``senses`` list.

    Each sense ``definition`` becomes a "refined_sense" candidate. Its topic
    score is the sense ``confidence`` scaled to an integer percentage, and it
    counts as strongly on-topic when the confidence is at least 0.75.

    Senses whose definition is missing or null are skipped, and a missing or
    null ``source`` falls back to "refined": stringifying ``None`` would
    otherwise leak the literal "None" into the clue text or source label.

    Returns:
        The candidates in discovery order; an empty list when ``senses`` is
        absent or not a list.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in senses:
        if not isinstance(sense, dict):
            continue
        definition = sense.get("definition")
        if not definition:
            continue
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            candidates,
            seen,
            text=str(definition),
            answer=answer,
            source=str(sense.get("source") or "refined"),
            family="refined_sense",
            topic_score=int(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return candidates
def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a generic clue when no real definition is available.

    When the entry lists at least one topic other than "general", the clue
    names the part of speech (or "lessicale" when unknown) and the topics;
    otherwise a fixed "solve it from the crossings" clue is returned.

    Args:
        entry: Lexicon entry; ``pos`` and ``topics`` may be missing or null.
        answer: Unused; kept for signature compatibility with callers.

    Returns:
        The fallback clue text.
    """
    # `or ""` also covers an explicit null, which str() would render "none".
    pos = str(entry.get("pos") or "").strip().lower()

    raw_topics = entry.get("topics") or []
    if isinstance(raw_topics, str):
        # A bare string is one topic, not a sequence of characters.
        raw_topics = [raw_topics]

    topics = ", ".join(
        str(topic) for topic in raw_topics if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."
def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather every clue candidate available for *entry*.

    Sources are queried and concatenated in a fixed order: semantic
    definitions/glosses, refined senses, then BabelNet glosses.
    """
    return [
        *semantic_candidates(entry, answer),
        *refined_sense_candidates(entry, answer),
        *babelnet_candidates(entry, answer, topic),
    ]
def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Pick the best-ranked candidate, or None when there are none.

    Candidates are ranked by score, then by topic score, then by text
    length (longer first). Among fully tied candidates the earliest one in
    *candidates* is returned.
    """

    def ranking_key(item: ClueCandidate) -> Tuple[int, int, int]:
        return (
            score_candidate(item, answer, difficulty),
            item.topic_score,
            len(item.text),
        )

    # max() returns the first maximal element, matching the head of a
    # stable descending sort.
    return max(candidates, key=ranking_key, default=None)
def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Return ``(clue_text, source)`` for *word*.

    The word is looked up lower-cased in *entries*. Unknown words (or empty
    entries) get the generic crossings clue with source "fallback". Known
    words get their best-scoring candidate for the normalised difficulty,
    or the entry-based fallback clue when no candidate exists.
    """
    answer = word.lower()
    entry = entries.get(answer, {})
    if entry:
        level = normalize_difficulty(difficulty)
        pool = all_candidates(entry, answer, topic)
        winner = choose_candidate(pool, answer, level)
        if winner:
            return winner.text, winner.source
        return fallback_definition(entry, answer), "fallback"
    return "Termine da ricavare dagli incroci.", "fallback"
def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Build one ``Clue`` per placement, numbered from 1 in iteration order.

    The clue text and source come from ``definition_for_word``; the
    direction is labelled "orizzontale" when the placement direction equals
    ``HORIZONTAL`` and "verticale" otherwise.
    """
    result = []
    for position, placed in enumerate(placements, start=1):
        definition, origin = definition_for_word(placed.word, entries, topic, difficulty)
        if placed.direction == HORIZONTAL:
            label = "orizzontale"
        else:
            label = "verticale"
        fields = {
            "number": position,
            "word": placed.word,
            "direction": label,
            "x": placed.x,
            "y": placed.y,
            "text": definition,
            "source": origin,
        }
        result.append(Clue(**fields))
    return result