alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
423
clue_generator.py
Normal file
423
clue_generator.py
Normal file
@@ -0,0 +1,423 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
|
||||
from crossword_generator import HORIZONTAL, Placement
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Clue:
    """Immutable clue record for one placed crossword word."""

    # Clue number within the generated list.
    number: int
    # Answer word exactly as carried by the placement.
    word: str
    # Direction label of the slot.
    direction: str
    # Grid coordinates of the slot.
    x: int
    y: int
    # Clue sentence shown for the word.
    text: str
    # Where the clue text came from (for example "fallback").
    source: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClueCandidate:
    """Immutable candidate clue text together with the metadata used to rank it."""

    # Candidate clue sentence.
    text: str
    # Label of the data source that produced the text.
    source: str
    # Candidate family name (for example "semantic_definition" or "babelnet_gloss").
    family: str
    # Style hint such as "direct", "balanced" or "oblique".
    difficulty_hint: str
    # Integer topic relevance score.
    topic_score: int
    # Whether the candidate is considered strongly tied to the topic.
    strong_topic: bool
|
||||
|
||||
|
||||
# Accepted difficulty spellings mapped to the canonical level name.
DIFFICULTY_ALIASES = {
    # Numeric levels; both "4" and "5" resolve to "expert".
    "1": "easy",
    "2": "medium",
    "3": "hard",
    "4": "expert",
    "5": "expert",
    # Canonical names resolve to themselves.
    "easy": "easy",
    "medium": "medium",
    "hard": "hard",
    "expert": "expert",
}
|
||||
|
||||
# Lower-case fragments that mark a clue as generic filler; candidates whose
# lower-cased text contains one of them are penalised during scoring.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",
    "termine lessicale collegato",
    "collegato a:",
)
|
||||
|
||||
|
||||
def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lower-cased form.

    Args:
        path: Location of the enriched lexicon JSON file.

    Returns:
        A mapping from each entry's lower-cased ``form`` to the entry dict.
        Entries that are not dicts or have no truthy ``form`` are skipped, and
        an empty mapping is returned when the payload is not a dict. When two
        entries share a form, the later one wins.
    """
    data = load_json(path, {"entries": []})
    if not isinstance(data, dict):
        return {}

    by_form: Dict[str, Dict[str, object]] = {}
    for record in data.get("entries", []) or []:
        if not isinstance(record, dict):
            continue
        form = record.get("form")
        if not form:
            continue
        by_form[str(form).lower()] = record
    return by_form
|
||||
|
||||
|
||||
def normalize_difficulty(value: Optional[str]) -> str:
    """Map a user-supplied difficulty to its canonical level name.

    Falsy values and spellings missing from ``DIFFICULTY_ALIASES`` both
    resolve to ``"medium"``. Matching ignores case and surrounding whitespace.
    """
    raw = value if value else "medium"
    alias = str(raw).strip().lower()
    return DIFFICULTY_ALIASES.get(alias, "medium")
|
||||
|
||||
|
||||
def clean_definition(text: str, answer: str) -> str:
    """Turn a raw definition into a clue sentence that does not spell the answer.

    The text is stripped of ``[...]`` annotations, whitespace-normalised and
    trimmed of edge punctuation. Every case-insensitive occurrence of
    ``answer`` is replaced by "questa parola", empty parentheses and spaces
    before ``,``/``;`` are removed, the first letter is capitalised and a
    final period is appended.

    Args:
        text: Raw definition or gloss; falsy values are treated as empty.
        answer: The word being clued; an empty answer masks nothing.

    Returns:
        The cleaned clue ending in ".", or "" when nothing usable remains.
    """
    clue = str(text or "")
    clue = re.sub(r"\[[^\]]*\]", " ", clue)
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if answer:
        # An empty pattern matches at every position, which would splice the
        # placeholder between all characters, so only mask a real answer.
        # NOTE(review): this is a plain substring match, so the answer is also
        # masked when it occurs inside a longer word.
        clue = re.sub(re.escape(answer), "questa parola", clue, flags=re.IGNORECASE)

    clue = re.sub(r"\(\s*\)", "", clue)
    clue = re.sub(r"\s+,", ",", clue)
    clue = re.sub(r"\s+;", ";", clue)

    # Dropping empty parentheses can leave doubled or trailing spaces and
    # dangling edge punctuation, so normalise once more before finishing.
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
|
||||
|
||||
|
||||
def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether a synset carries one of the topic's "strong" domains.

    Returns ``True`` when there is nothing to filter on: no topic, the
    "general" topic, or a topic with no "strong" domains configured in
    ``TOPIC_DOMAIN_RULES``. Otherwise returns whether the synset's ``domains``
    share at least one name with the strong domains, ignoring case.
    """
    topic_is_specific = bool(topic) and topic != "general"
    if not topic_is_specific:
        return True

    topic_rules = TOPIC_DOMAIN_RULES.get(topic, {})
    wanted = {str(name).upper() for name in topic_rules.get("strong", ())}
    if not wanted:
        return True

    present = {str(name).upper() for name in synset.get("domains", []) or []}
    return not wanted.isdisjoint(present)
|
||||
|
||||
|
||||
def text_contains_answer(text: str, answer: str) -> bool:
    """Return whether ``answer`` occurs in ``text``, ignoring case.

    The answer is matched literally (regex metacharacters are escaped) and as
    a plain substring. An empty answer is never reported as contained.
    """
    if not answer:
        # An empty pattern matches any text, which would flag every clue as
        # leaking the answer.
        return False
    return re.search(re.escape(answer), text, flags=re.IGNORECASE) is not None
|
||||
|
||||
|
||||
def directness_score(text: str) -> int:
    """Estimate how plainly a clue names what the answer is.

    Each category keyword found in the lower-cased text adds 8 points, and the
    presence of any restatement marker ("cioè", "ossia", "ovvero") adds a
    single 4-point bonus. Keywords are matched as substrings.
    """
    haystack = text.lower()
    category_keywords = (
        "strumento",
        "veicolo",
        "animale",
        "pianta",
        "titolo",
        "edificio",
        "persona",
        "luogo",
        "malattia",
        "farmaco",
        "mezzo",
        "parte di",
    )
    restatement_markers = ("cioè", "ossia", "ovvero")

    total = 8 * sum(1 for keyword in category_keywords if keyword in haystack)
    if any(marker in haystack for marker in restatement_markers):
        total += 4
    return total
|
||||
|
||||
|
||||
def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the preferred ``(min, max)`` clue length for a difficulty level.

    Any difficulty other than "easy", "medium" or "hard" gets the shortest
    range, ``(14, 50)``.
    """
    ranges_by_level = {
        "easy": (24, 90),
        "medium": (20, 75),
        "hard": (16, 60),
    }
    return ranges_by_level.get(difficulty, (14, 50))
|
||||
|
||||
|
||||
def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Score a candidate clue for the given answer and difficulty.

    Higher is better. Texts that are empty or shorter than 12 characters get
    the sentinel score ``-10_000``. Otherwise the score is the sum of
    independent adjustments: generic-filler and answer-leak penalties, fit
    with the preferred length range, directness weighted by difficulty, a
    per-family bonus, agreement between the candidate's hint and the
    difficulty, semicolon usage, topic relevance, register markers and
    penalties for very long text.
    """
    clue_text = candidate.text
    if not clue_text or len(clue_text) < 12:
        return -10_000

    haystack = clue_text.lower()
    size = len(clue_text)
    total = 0

    # Generic filler clues are heavily penalised.
    if any(pattern in haystack for pattern in GENERIC_CLUE_PATTERNS):
        total -= 120

    # A clue that contains the answer is penalised; a clean one is rewarded.
    total += -140 if text_contains_answer(clue_text, answer) else 40

    # Length fit: bonus inside the preferred range, full penalty per extra
    # character above it, half penalty per missing character below it.
    shortest, longest = preferred_length_range(difficulty)
    if size > longest:
        total -= size - longest
    elif size < shortest:
        total -= (shortest - size) // 2
    else:
        total += 28

    # Directness helps easier levels and hurts harder ones.
    directness = directness_score(clue_text)
    if difficulty == "easy":
        total += 2 * directness
    elif difficulty == "medium":
        total += directness
    elif difficulty == "hard":
        total -= max(0, directness - 6)
    else:
        total -= directness

    bonus_by_family = {
        "semantic_definition": 56,
        "semantic_gloss": 34,
        "refined_sense": 30,
        "babelnet_best_gloss": 18,
        "babelnet_gloss": 10,
        "fallback": 0,
    }
    total += bonus_by_family.get(candidate.family, 0)

    hints_by_level = {
        "easy": {"direct", "balanced"},
        "medium": {"balanced", "direct"},
        "hard": {"balanced", "oblique"},
        "expert": {"oblique", "balanced"},
    }
    if candidate.difficulty_hint in hints_by_level.get(difficulty, {"balanced"}):
        total += 18

    # Semicolons are rewarded on easy and penalised on hard/expert.
    if ";" in clue_text:
        if difficulty == "easy":
            total += 8
        elif difficulty in {"hard", "expert"}:
            total -= 8

    # Topic relevance; BabelNet glosses with no topic score are penalised.
    if candidate.topic_score >= 40:
        total += 18
    elif candidate.topic_score > 0:
        total += 8
    elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}:
        total -= 140

    if candidate.strong_topic:
        total += 10

    # Register markers such as "(fig.)" are penalised on easy and medium.
    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", haystack):
        total -= 28

    # Very long clues pay extra, cumulatively.
    if size > 120:
        total -= 45
    if size > 180:
        total -= 90

    return total
|
||||
|
||||
|
||||
def candidate_hint(text: str, family: str) -> str:
    """Classify a clue text as "direct", "oblique" or "balanced".

    Semantic-family texts of at most 70 characters are "direct". Otherwise a
    figurative/literary marker makes the text "oblique", texts longer than 85
    characters are "direct", and everything else is "balanced".
    """
    size = len(text)
    is_semantic_family = family in {"semantic_definition", "semantic_gloss"}
    if is_semantic_family and size <= 70:
        return "direct"

    haystack = text.lower()
    figurative_markers = ("fig.", "figurato", "poetico", "letterario")
    if any(marker in haystack for marker in figurative_markers):
        return "oblique"

    return "direct" if size > 85 else "balanced"
|
||||
|
||||
|
||||
def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean a raw text and append it to ``candidates`` unless it is a duplicate.

    The text is passed through ``clean_definition``. Nothing is added when the
    cleaned text is empty or when the same lower-cased text was already
    recorded in ``seen`` for the same family. Both ``candidates`` and ``seen``
    are mutated in place.
    """
    clue_text = clean_definition(text, answer)
    dedupe_key = (clue_text.lower(), family)
    if not clue_text or dedupe_key in seen:
        return

    seen.add(dedupe_key)
    candidate = ClueCandidate(
        text=clue_text,
        source=source,
        family=family,
        difficulty_hint=candidate_hint(clue_text, family),
        topic_score=topic_score,
        strong_topic=strong_topic,
    )
    candidates.append(candidate)
|
||||
|
||||
|
||||
def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``semantic`` section.

    Synset definitions come first (family "semantic_definition"), followed by
    the section's glosses (family "semantic_gloss"). Returns an empty list
    when the section is not a dict.
    """
    section = entry.get("semantic", {})
    if not isinstance(section, dict):
        return []

    definitions = [
        str(synset.get("definition", ""))
        for synset in section.get("synsets", []) or []
        if isinstance(synset, dict)
    ]
    glosses = [str(gloss) for gloss in section.get("glosses", []) or []]

    found: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for family, raw_texts in (
        ("semantic_definition", definitions),
        ("semantic_gloss", glosses),
    ):
        for raw_text in raw_texts:
            add_candidate(
                found,
                seen,
                text=raw_text,
                answer=answer,
                source="semantic",
                family=family,
            )
    return found
|
||||
|
||||
|
||||
def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``babelnet`` section.

    Only sections whose ``status`` is "enriched" or "ambiguous" are used.
    Glosses of ``best_synset`` are added first (family "babelnet_best_gloss"),
    then glosses of every listed synset (family "babelnet_gloss"). When a
    specific topic is requested, listed synsets that fail the strong-domain
    check are skipped and the remaining ones get a topic score of 40.
    """
    section = entry.get("babelnet", {})
    if not isinstance(section, dict):
        return []
    if section.get("status") not in {"enriched", "ambiguous"}:
        return []

    found: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    topic_is_specific = bool(topic) and topic != "general"

    best = section.get("best_synset", {})
    if isinstance(best, dict):
        best_score = int(best.get("topic_score", 0) or 0)
        best_is_strong = bool(best.get("strong_topic")) or synset_has_strong_topic_domain(best, topic)
        for gloss in best.get("glosses", []) or []:
            add_candidate(
                found,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=best_score,
                strong_topic=best_is_strong,
            )

    for synset in section.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        if topic_is_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        # With a specific topic, any synset reaching this point has already
        # passed the strong-domain check, so its score depends on the topic only.
        synset_score = 40 if topic_is_specific else 0
        for gloss in synset.get("glosses", []) or []:
            add_candidate(
                found,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=synset_score,
                strong_topic=topic_is_specific,
            )

    return found
|
||||
|
||||
|
||||
def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``senses`` list.

    Each dict sense contributes its ``definition`` under the "refined_sense"
    family. The sense's ``confidence`` is converted to a 0-100 style topic
    score, and a confidence of at least 0.75 marks the candidate as strongly
    on-topic. Returns an empty list when ``senses`` is not a list.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in senses:
        if not isinstance(sense, dict):
            continue
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            candidates,
            seen,
            text=str(sense.get("definition", "")),
            answer=answer,
            source=str(sense.get("source", "refined")),
            family="refined_sense",
            # Round rather than truncate: binary floats make products such as
            # 0.29 * 100 land just below the integer (28.999...), and int()
            # would then report 28 instead of 29.
            topic_score=round(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return candidates
|
||||
|
||||
|
||||
def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a generic clue for an entry that has no usable candidate.

    Args:
        entry: Lexicon entry; its ``pos`` and ``topics`` values are read.
        answer: The word being clued (not used in the generated text).

    Returns:
        A sentence naming the entry's non-"general" topics when it has any,
        otherwise the generic "Termine da ricavare dagli incroci." clue.
    """
    # Use `or` so that an explicit null in the data behaves like a missing
    # key: a null `pos` must not render as "none", and a null `topics` must
    # not be iterated.
    pos = str(entry.get("pos") or "").lower()
    raw_topics = entry.get("topics") or []
    topics = ", ".join(
        str(topic)
        for topic in raw_topics
        if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."
|
||||
|
||||
|
||||
def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather every clue candidate available for an entry.

    Candidates are concatenated in a fixed order: semantic candidates, then
    refined senses, then BabelNet glosses.
    """
    return [
        *semantic_candidates(entry, answer),
        *refined_sense_candidates(entry, answer),
        *babelnet_candidates(entry, answer, topic),
    ]
|
||||
|
||||
|
||||
def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Pick the best-ranked candidate, or ``None`` when there are none.

    Candidates are ranked by score, then by topic score, then by text length,
    all descending. Among fully tied candidates the earliest in ``candidates``
    is returned.
    """

    def rank(candidate: ClueCandidate) -> Tuple[int, int, int]:
        return (
            score_candidate(candidate, answer, difficulty),
            candidate.topic_score,
            len(candidate.text),
        )

    # max() returns the first maximal item it meets, which is the same item a
    # stable descending sort would place at the front.
    return max(candidates, key=rank, default=None)
|
||||
|
||||
|
||||
def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Return the ``(clue text, source)`` pair for a word.

    The word is looked up lower-cased in ``entries``. Words with no entry, or
    with an empty one, get the generic fallback clue. Otherwise the
    best-ranked candidate for the normalised difficulty is used, falling back
    to ``fallback_definition`` when the entry yields no candidate. The source
    is "fallback" for both fallback paths.
    """
    answer = word.lower()
    entry = entries.get(answer, {})
    if entry:
        level = normalize_difficulty(difficulty)
        winner = choose_candidate(all_candidates(entry, answer, topic), answer, level)
        if winner:
            return winner.text, winner.source
        return fallback_definition(entry, answer), "fallback"

    return "Termine da ricavare dagli incroci.", "fallback"
|
||||
|
||||
|
||||
def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Build one ``Clue`` per placement, numbered from 1 in iteration order.

    The clue text and source come from ``definition_for_word``. The direction
    label is "orizzontale" for placements whose direction equals
    ``HORIZONTAL`` and "verticale" for all others.
    """
    result: List[Clue] = []
    for index, slot in enumerate(placements, start=1):
        clue_text, origin = definition_for_word(slot.word, entries, topic, difficulty)
        if slot.direction == HORIZONTAL:
            label = "orizzontale"
        else:
            label = "verticale"
        clue = Clue(
            number=index,
            word=slot.word,
            direction=label,
            x=slot.x,
            y=slot.y,
            text=clue_text,
            source=origin,
        )
        result.append(clue)
    return result
|
||||
Reference in New Issue
Block a user