from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
from crossword_generator import HORIZONTAL, Placement


@dataclass(frozen=True)
class Clue:
    """A numbered clue for one placed word, together with the clue's origin."""

    number: int  # 1-based position of the placement in the input sequence
    word: str  # the answer, copied from the placement
    direction: str  # "orizzontale" or "verticale"
    x: int  # copied from the placement
    y: int  # copied from the placement
    text: str  # the clue shown to the solver
    source: str  # label of the data source that produced the text


@dataclass(frozen=True)
class ClueCandidate:
    """One possible clue text for an answer, plus the metadata used to rank it."""

    text: str  # cleaned clue text
    source: str  # label of the data source (e.g. "semantic", "babelnet")
    family: str  # candidate family, used for the per-family score bonus
    difficulty_hint: str  # "direct", "balanced" or "oblique"
    topic_score: int  # topical relevance; 0 means unknown/none
    strong_topic: bool  # whether the candidate is strongly tied to the topic


# Accepted spellings of a difficulty level (numeric strings or names) mapped
# onto the four canonical levels.
DIFFICULTY_ALIASES = {
    "1": "easy",
    "2": "medium",
    "3": "hard",
    "4": "expert",
    "5": "expert",
    "easy": "easy",
    "medium": "medium",
    "hard": "hard",
    "expert": "expert",
}

# Lowercase substrings that mark a clue as a generic placeholder; candidates
# containing one of them are penalised when scored.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",
    "termine lessicale collegato",
    "collegato a:",
)


def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lowercase form.

    Args:
        path: Location of the enriched lexicon JSON file.

    Returns:
        A mapping from ``str(entry["form"]).lower()`` to the entry dict.
        Non-dict entries and entries without a truthy ``"form"`` are skipped;
        when two entries share a form the later one wins.  An empty dict is
        returned when the payload is not a dict.
    """
    # NOTE(review): the second argument is presumably the default returned by
    # load_json when the file is missing or unreadable; confirm in its module.
    payload = load_json(path, {"entries": []})
    if not isinstance(payload, dict):
        return {}
    return {
        str(entry.get("form", "")).lower(): entry
        for entry in payload.get("entries", []) or []
        if isinstance(entry, dict) and entry.get("form")
    }


def normalize_difficulty(value: Optional[str]) -> str:
    """Map a user-supplied difficulty onto "easy", "medium", "hard" or "expert".

    Falsy or unrecognised values fall back to "medium".
    """
    return DIFFICULTY_ALIASES.get(str(value or "medium").strip().lower(), "medium")


def clean_definition(text: str, answer: str) -> str:
    """Turn a raw dictionary definition into a displayable clue.

    Square-bracketed annotations are removed, whitespace is collapsed, every
    case-insensitive occurrence of ``answer`` is replaced by "questa parola",
    emptied parentheses are dropped, the first letter is capitalised and a
    final period is appended.

    Args:
        text: Raw definition or gloss; falsy values are treated as empty.
        answer: The word being clued, masked wherever it appears in ``text``.

    Returns:
        The cleaned clue ending in ".", or "" when nothing usable remains.
    """
    clue = str(text or "")
    clue = re.sub(r"\[[^\]]*\]", " ", clue)
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""
    # An empty pattern matches at every position, which would scatter the
    # placeholder between all characters, so only mask a non-empty answer.
    # NOTE(review): the match is a plain substring match, so a short answer is
    # also masked inside longer words; confirm whether that is intended.
    if answer:
        clue = re.sub(re.escape(answer), "questa parola", clue, flags=re.IGNORECASE)
    clue = re.sub(r"\(\s*\)", "", clue)
    clue = re.sub(r"\s+,", ",", clue)
    clue = re.sub(r"\s+;", ";", clue)
    # Removing emptied parentheses can leave doubled or trailing spaces.
    clue = re.sub(r"\s+", " ", clue).strip()
    if not clue:
        # Nothing survived the cleanup: report "no clue" rather than a bare ".".
        return ""
    if clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
# Words that suggest a definition names its category outright.
_DIRECT_KEYWORDS = (
    "strumento",
    "veicolo",
    "animale",
    "pianta",
    "titolo",
    "edificio",
    "persona",
    "luogo",
    "malattia",
    "farmaco",
    "mezzo",
    "parte di",
)

# Connectives that introduce an explicit rephrasing.
_EXPLANATORY_MARKERS = ("cioè", "ossia", "ovvero")

# Score bonus granted per candidate family; unknown families get 0.
_FAMILY_BONUS = {
    "semantic_definition": 56,
    "semantic_gloss": 34,
    "refined_sense": 30,
    "babelnet_best_gloss": 18,
    "babelnet_gloss": 10,
    "fallback": 0,
}

# Difficulty hints that are rewarded at each difficulty level.
_DIFFICULTY_PREFERENCES = {
    "easy": {"direct", "balanced"},
    "medium": {"balanced", "direct"},
    "hard": {"balanced", "oblique"},
    "expert": {"oblique", "balanced"},
}

_BABELNET_FAMILIES = frozenset({"babelnet_best_gloss", "babelnet_gloss"})

# Parenthesised usage labels such as "(fig.)" in lowercased text.
_USAGE_LABEL_RE = re.compile(r"\((?:mil|fig|lett|fam)\.\)")


def _as_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` to "" instead of "None"."""
    return "" if value is None else str(value)


def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether a synset belongs to one of the topic's strong domains.

    Returns ``True`` without inspecting the synset when there is no topic, the
    topic is "general", or the topic has no strong domains configured.
    Domain names are compared case-insensitively.
    """
    if not topic or topic == "general":
        return True
    rules = TOPIC_DOMAIN_RULES.get(topic, {})
    strong_domains = {str(domain).upper() for domain in rules.get("strong", ()) or ()}
    if not strong_domains:
        return True
    domains = {str(domain).upper() for domain in synset.get("domains", []) or []}
    return not strong_domains.isdisjoint(domains)


def text_contains_answer(text: str, answer: str) -> bool:
    """Return whether ``answer`` occurs in ``text`` (case-insensitive substring).

    An empty answer never counts as contained; the empty pattern would
    otherwise match every text.
    """
    if not answer:
        return False
    return re.search(re.escape(answer), text, flags=re.IGNORECASE) is not None


def directness_score(text: str) -> int:
    """Score how plainly a clue names its subject.

    Each category keyword found adds 8; the presence of any explanatory
    connective adds 4 once.
    """
    lowered = text.lower()
    score = 8 * sum(1 for keyword in _DIRECT_KEYWORDS if keyword in lowered)
    if any(marker in lowered for marker in _EXPLANATORY_MARKERS):
        score += 4
    return score


def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the (min, max) preferred clue length in characters.

    Any difficulty other than "easy", "medium" or "hard" gets the expert range.
    """
    if difficulty == "easy":
        return 24, 90
    if difficulty == "medium":
        return 20, 75
    if difficulty == "hard":
        return 16, 60
    return 14, 50


def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Rank a candidate clue for ``answer`` at the given difficulty.

    Higher is better.  Texts shorter than 12 characters (or empty) get the
    sentinel score -10_000.

    Args:
        candidate: The clue candidate to evaluate.
        answer: The word being clued; clues that leak it are penalised.
        difficulty: Canonical difficulty ("easy", "medium", "hard", "expert").

    Returns:
        The integer score.
    """
    text = candidate.text
    # Reject unusable text before touching it any further.
    if not text or len(text) < 12:
        return -10_000

    lowered = text.lower()
    score = 0

    if any(pattern in lowered for pattern in GENERIC_CLUE_PATTERNS):
        score -= 120
    score += -140 if text_contains_answer(text, answer) else 40

    # Length: bonus inside the preferred window, otherwise a penalty that is
    # the full excess when too long and half the shortfall when too short.
    min_len, max_len = preferred_length_range(difficulty)
    length = len(text)
    if min_len <= length <= max_len:
        score += 28
    elif length > max_len:
        score -= length - max_len
    else:
        score -= (min_len - length) // 2

    # Directness helps easier puzzles and hurts harder ones.
    directness = directness_score(text)
    if difficulty == "easy":
        score += directness * 2
    elif difficulty == "medium":
        score += directness
    elif difficulty == "hard":
        score -= max(0, directness - 6)
    else:
        score -= directness

    score += _FAMILY_BONUS.get(candidate.family, 0)

    if candidate.difficulty_hint in _DIFFICULTY_PREFERENCES.get(difficulty, {"balanced"}):
        score += 18

    # A semicolon is rewarded on easy puzzles and penalised on hard ones.
    if ";" in text:
        if difficulty == "easy":
            score += 8
        elif difficulty in {"hard", "expert"}:
            score -= 8

    if candidate.topic_score >= 40:
        score += 18
    elif candidate.topic_score > 0:
        score += 8
    elif candidate.family in _BABELNET_FAMILIES:
        # BabelNet glosses with no topical support are heavily penalised.
        score -= 140
    if candidate.strong_topic:
        score += 10

    if difficulty in {"easy", "medium"} and _USAGE_LABEL_RE.search(lowered):
        score -= 28

    # Very long clues are penalised on top of the window penalty (cumulative).
    if length > 120:
        score -= 45
    if length > 180:
        score -= 90
    return score


def candidate_hint(text: str, family: str) -> str:
    """Classify a clue as "direct", "oblique" or "balanced".

    Semantic definitions/glosses of at most 70 characters are "direct"; texts
    flagged as figurative, poetic or literary are "oblique"; remaining texts
    longer than 85 characters are "direct"; everything else is "balanced".
    """
    if family in {"semantic_definition", "semantic_gloss"} and len(text) <= 70:
        return "direct"
    lowered = text.lower()
    if any(marker in lowered for marker in ("fig.", "figurato", "poetico", "letterario")):
        return "oblique"
    if len(text) > 85:
        return "direct"
    return "balanced"


def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean ``text`` and append it to ``candidates`` unless empty or duplicate.

    Both ``candidates`` and ``seen`` are mutated in place.  Duplicates are
    detected per family on the lowercased cleaned text.
    """
    cleaned = clean_definition(text, answer)
    if not cleaned:
        return
    key = (cleaned.lower(), family)
    if key in seen:
        return
    seen.add(key)
    candidates.append(
        ClueCandidate(
            text=cleaned,
            source=source,
            family=family,
            difficulty_hint=candidate_hint(cleaned, family),
            topic_score=topic_score,
            strong_topic=strong_topic,
        )
    )


def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect candidates from ``entry["semantic"]`` (synset definitions, then glosses)."""
    semantic = entry.get("semantic", {})
    if not isinstance(semantic, dict):
        return []
    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for synset in semantic.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        add_candidate(
            candidates,
            seen,
            # A null definition must not become the literal text "None".
            text=_as_text(synset.get("definition")),
            answer=answer,
            source="semantic",
            family="semantic_definition",
        )
    for gloss in semantic.get("glosses", []) or []:
        add_candidate(
            candidates,
            seen,
            text=_as_text(gloss),
            answer=answer,
            source="semantic",
            family="semantic_gloss",
        )
    return candidates


def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect candidates from ``entry["babelnet"]``.

    Only entries whose status is "enriched" or "ambiguous" contribute.  Glosses
    of the best synset come first; glosses of the other synsets follow, and
    when a specific topic is requested only synsets in one of its strong
    domains are kept (scored 40 and flagged as strong).
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}:
        return []
    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    topic_is_specific = bool(topic) and topic != "general"

    best_synset = babelnet.get("best_synset", {})
    if isinstance(best_synset, dict):
        best_score = int(best_synset.get("topic_score", 0) or 0)
        best_is_strong = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(
            best_synset, topic
        )
        for gloss in best_synset.get("glosses", []) or []:
            add_candidate(
                candidates,
                seen,
                text=_as_text(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=best_score,
                strong_topic=best_is_strong,
            )

    for synset in babelnet.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        if topic_is_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        # Every synset that survives the filter under a specific topic is
        # strongly on-topic; without a specific topic nothing is known.
        topic_score = 40 if topic_is_specific else 0
        for gloss in synset.get("glosses", []) or []:
            add_candidate(
                candidates,
                seen,
                text=_as_text(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=topic_score,
                strong_topic=topic_is_specific,
            )
    return candidates


def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect candidates from ``entry["senses"]``.

    Each sense's confidence (0.0 when missing) becomes a 0-100 topic score and
    a confidence of at least 0.75 marks the candidate as strongly topical.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []
    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in senses:
        if not isinstance(sense, dict):
            continue
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            candidates,
            seen,
            text=_as_text(sense.get("definition")),
            answer=answer,
            source=str(sense.get("source") or "refined"),
            family="refined_sense",
            # round(), not int(): 0.29 * 100 is 28.999... and would truncate to 28.
            topic_score=round(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return candidates


def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a generic clue for an entry that yielded no usable candidate.

    Args:
        entry: Lexicon entry; its "pos" and "topics" fields are read, and
            missing or null values are tolerated.
        answer: Unused; kept for interface compatibility.

    Returns:
        A sentence naming the entry's non-"general" topics, or the generic
        "derive it from the crossings" clue when there are none.
    """
    pos = str(entry.get("pos") or "").lower()
    topics = ", ".join(
        str(topic)
        for topic in entry.get("topics") or []
        if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."


def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather candidates from every source: semantic, refined senses, BabelNet (in that order)."""
    return [
        *semantic_candidates(entry, answer),
        *refined_sense_candidates(entry, answer),
        *babelnet_candidates(entry, answer, topic),
    ]


def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Return the best candidate, or ``None`` when there are none.

    Candidates are compared by score, then topic score, then text length
    (longer wins); among full ties the earliest candidate is kept.
    """
    return max(
        candidates,
        key=lambda candidate: (
            score_candidate(candidate, answer, difficulty),
            candidate.topic_score,
            len(candidate.text),
        ),
        default=None,
    )


def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Pick the clue text for ``word``.

    Args:
        word: The answer; looked up in ``entries`` by its lowercase form.
        entries: Lexicon entries keyed by lowercase form.
        topic: Optional puzzle topic used to filter/score BabelNet glosses.
        difficulty: Optional difficulty, normalised before scoring.

    Returns:
        ``(clue_text, source)``; ``source`` is "fallback" when no entry or no
        candidate is available.
    """
    answer = word.lower()
    entry = entries.get(answer, {})
    if not entry:
        return "Termine da ricavare dagli incroci.", "fallback"
    level = normalize_difficulty(difficulty)
    best = choose_candidate(all_candidates(entry, answer, topic), answer, level)
    if best:
        return best.text, best.source
    return fallback_definition(entry, answer), "fallback"


def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Create one clue per placement, numbered from 1 in iteration order."""
    clues: List[Clue] = []
    for number, placement in enumerate(placements, start=1):
        text, source = definition_for_word(placement.word, entries, topic, difficulty)
        direction = "orizzontale" if placement.direction == HORIZONTAL else "verticale"
        clues.append(
            Clue(
                number=number,
                word=placement.word,
                direction=direction,
                x=placement.x,
                y=placement.y,
                text=text,
                source=source,
            )
        )
    return clues