424 lines
12 KiB
Python
424 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
|
|
|
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
|
|
from crossword_generator import HORIZONTAL, Placement
|
|
|
|
|
|
@dataclass(frozen=True)
class Clue:
    """Immutable record for one numbered clue of a generated crossword."""

    # Sequential clue number.
    number: int
    # The answer word this clue belongs to.
    word: str
    # Human-readable direction label.
    direction: str
    # Grid coordinates of the placed word.
    x: int
    y: int
    # The clue sentence shown to the solver.
    text: str
    # Label describing where the clue text came from.
    source: str
|
|
|
|
|
|
@dataclass(frozen=True)
class ClueCandidate:
    """Immutable candidate clue text together with the metadata used to rank it."""

    # Candidate clue sentence.
    text: str
    # Label describing which data source produced the text.
    source: str
    # Candidate family name (e.g. "semantic_definition", "babelnet_gloss").
    family: str
    # Style hint for the text: "direct", "balanced" or "oblique".
    difficulty_hint: str
    # Integer relevance score for the requested topic.
    topic_score: int
    # Whether the candidate is considered strongly tied to the topic.
    strong_topic: bool
|
|
|
|
|
|
# Maps every accepted difficulty spelling to its canonical label.
# Numeric strings "1".."5" are aliases (both "4" and "5" mean "expert");
# the canonical labels map to themselves.
DIFFICULTY_ALIASES = {
    **dict(zip("12345", ("easy", "medium", "hard", "expert", "expert"))),
    **{label: label for label in ("easy", "medium", "hard", "expert")},
}
|
|
|
|
# Lower-case substrings that mark a clue text as a generic placeholder
# rather than a real definition.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",  # "term to be worked out from the crossings"
    "termine lessicale collegato",  # "lexical term related ..."
    "collegato a:",  # "related to:"
)
|
|
|
|
|
|
def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lower-cased form.

    Args:
        path: Location handed to ``load_json``.

    Returns:
        A mapping from the lower-cased ``"form"`` of each entry to the entry
        dict itself. Items that are not dicts or have a falsy ``"form"`` are
        skipped; when two entries share a form the later one wins. An empty
        dict is returned when the loaded payload is not a dict.
    """
    data = load_json(path, {"entries": []})
    if not isinstance(data, dict):
        return {}

    index: Dict[str, Dict[str, object]] = {}
    for record in data.get("entries", []) or []:
        if not isinstance(record, dict):
            continue
        if not record.get("form"):
            continue
        index[str(record.get("form", "")).lower()] = record
    return index
|
|
|
|
|
|
def normalize_difficulty(value: Optional[str]) -> str:
    """Translate a user-supplied difficulty into a canonical label.

    Falsy input is treated as ``"medium"``. The value is stringified, trimmed
    and lower-cased before being looked up in ``DIFFICULTY_ALIASES``; unknown
    spellings also resolve to ``"medium"``.
    """
    raw = value if value else "medium"
    lookup_key = str(raw).strip().lower()
    return DIFFICULTY_ALIASES.get(lookup_key, "medium")
|
|
|
|
|
|
def clean_definition(text: str, answer: str) -> str:
    """Turn a raw definition into a presentable clue sentence.

    Steps, in order:

    1. Drop ``[...]`` annotations and collapse whitespace.
    2. Trim leading/trailing spaces and ``. ; : -`` characters.
    3. Replace whole-word, case-insensitive occurrences of ``answer`` with the
       placeholder ``"questa parola"`` so the clue does not spell out the
       solution.
    4. Remove parentheses left empty, tidy spaces before ``,``/``;`` and
       normalise whitespace again.
    5. Capitalise the first character and append a final period.

    Args:
        text: Raw definition; falsy values are treated as empty.
        answer: The solution word to mask. A falsy answer disables masking.

    Returns:
        The cleaned clue ending in ``"."``, or ``""`` when nothing usable
        remains.
    """
    clue = str(text or "")
    clue = re.sub(r"\[[^\]]*\]", " ", clue)
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if answer:
        # An empty pattern would match between every character, so masking is
        # skipped for a falsy answer. The lookarounds restrict the match to
        # whole words: a short answer such as "re" must not corrupt "regno".
        masked = r"(?<!\w)" + re.escape(answer) + r"(?!\w)"
        clue = re.sub(masked, "questa parola", clue, flags=re.IGNORECASE)

    clue = re.sub(r"\(\s*\)", "", clue)
    clue = re.sub(r"\s+([,;])", r"\1", clue)
    # Removing empty parentheses can leave doubled or trailing spaces (or
    # nothing at all), so normalise once more before finishing the sentence.
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
|
|
|
|
|
|
def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether a synset belongs to one of the topic's "strong" domains.

    Returns ``True`` without inspecting the synset when the topic is falsy or
    ``"general"``, or when ``TOPIC_DOMAIN_RULES`` lists no ``"strong"`` domains
    for the topic. Otherwise the synset's ``"domains"`` are compared with the
    strong domains, case-insensitively (both sides are upper-cased).
    """
    is_generic_topic = (not topic) or topic == "general"
    if is_generic_topic:
        return True

    topic_rules = TOPIC_DOMAIN_RULES.get(topic, {})
    wanted = {str(name).upper() for name in topic_rules.get("strong", ())}
    if not wanted:
        return True

    return any(str(name).upper() in wanted for name in synset.get("domains", []) or [])
|
|
|
|
|
|
def text_contains_answer(text: str, answer: str) -> bool:
    """Return ``True`` when ``answer`` occurs in ``text``, ignoring case.

    The answer is matched literally (regex metacharacters are escaped) and
    anywhere in the text, including inside longer words.
    """
    literal = re.compile(re.escape(answer), re.IGNORECASE)
    return literal.search(text) is not None
|
|
|
|
|
|
def directness_score(text: str) -> int:
    """Estimate how plainly a clue names the category of its answer.

    Each category keyword found in the lower-cased text is worth 8 points;
    the presence of at least one explanatory marker ("cioè", "ossia",
    "ovvero") adds a single bonus of 4 points.
    """
    haystack = text.lower()
    category_keywords = (
        "strumento",
        "veicolo",
        "animale",
        "pianta",
        "titolo",
        "edificio",
        "persona",
        "luogo",
        "malattia",
        "farmaco",
        "mezzo",
        "parte di",
    )
    explanatory_markers = ("cioè", "ossia", "ovvero")

    keyword_points = 8 * sum(1 for keyword in category_keywords if keyword in haystack)
    marker_points = 4 if any(marker in haystack for marker in explanatory_markers) else 0
    return keyword_points + marker_points
|
|
|
|
|
|
def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min, max)`` preferred clue length for a difficulty.

    Known levels are "easy", "medium" and "hard"; any other value gets the
    shortest range, ``(14, 50)``.
    """
    ranges = (
        ("easy", (24, 90)),
        ("medium", (20, 75)),
        ("hard", (16, 60)),
    )
    for level, bounds in ranges:
        if difficulty == level:
            return bounds
    return 14, 50
|
|
|
|
|
|
def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Rank a candidate clue for the given answer and difficulty.

    Higher is better. Texts that are empty or shorter than 12 characters get
    the sentinel ``-10_000``. Otherwise the score is the sum of independent
    adjustments: generic-placeholder and answer-leak penalties, a length fit
    against ``preferred_length_range``, a directness term whose sign depends
    on the difficulty, a per-family bonus, a bonus when the candidate's hint
    suits the difficulty, a semicolon adjustment, topic bonuses/penalties,
    a usage-label penalty for easier levels and penalties for very long text.
    """
    text = candidate.text
    lowered = text.lower()

    if not text or len(text) < 12:
        return -10_000

    total = 0

    # Placeholder-like wording is heavily discouraged.
    if any(pattern in lowered for pattern in GENERIC_CLUE_PATTERNS):
        total -= 120

    # A clue that contains its own answer is penalised; otherwise rewarded.
    total += -140 if text_contains_answer(text, answer) else 40

    # Length fit: bonus inside the preferred window, otherwise a penalty that
    # grows with the overshoot (full) or the undershoot (halved).
    shortest, longest = preferred_length_range(difficulty)
    size = len(text)
    if size > longest:
        total -= size - longest
    elif size < shortest:
        total -= (shortest - size) // 2
    else:
        total += 28

    # Directness helps on easier levels and hurts on harder ones.
    plainness = directness_score(text)
    if difficulty == "easy":
        total += plainness * 2
    elif difficulty == "medium":
        total += plainness
    elif difficulty == "hard":
        total -= max(0, plainness - 6)
    else:
        total -= plainness

    bonus_by_family = {
        "semantic_definition": 56,
        "semantic_gloss": 34,
        "refined_sense": 30,
        "babelnet_best_gloss": 18,
        "babelnet_gloss": 10,
        "fallback": 0,
    }
    total += bonus_by_family.get(candidate.family, 0)

    hints_by_difficulty = {
        "easy": {"direct", "balanced"},
        "medium": {"balanced", "direct"},
        "hard": {"balanced", "oblique"},
        "expert": {"oblique", "balanced"},
    }
    if candidate.difficulty_hint in hints_by_difficulty.get(difficulty, {"balanced"}):
        total += 18

    # Semicolons are welcome on "easy" and unwelcome on "hard"/"expert".
    if ";" in text:
        if difficulty == "easy":
            total += 8
        elif difficulty in {"hard", "expert"}:
            total -= 8

    if candidate.topic_score >= 40:
        total += 18
    elif candidate.topic_score > 0:
        total += 8
    elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}:
        # BabelNet glosses with no topic score at all are strongly demoted.
        total -= 140

    if candidate.strong_topic:
        total += 10

    # Usage labels such as "(fig.)" are penalised on the easier levels.
    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", lowered):
        total -= 28

    # Both long-text penalties apply cumulatively beyond 180 characters.
    if size > 120:
        total -= 45
    if size > 180:
        total -= 90

    return total
|
|
|
|
|
|
def candidate_hint(text: str, family: str) -> str:
    """Classify a clue text as "direct", "oblique" or "balanced".

    Rules are checked in order: semantic definitions/glosses of at most 70
    characters are "direct"; texts mentioning a figurative/literary marker are
    "oblique"; texts longer than 85 characters are "direct"; everything else
    is "balanced".
    """
    lowered = text.lower()
    size = len(text)

    if family in {"semantic_definition", "semantic_gloss"} and size <= 70:
        return "direct"

    for marker in ("fig.", "figurato", "poetico", "letterario"):
        if marker in lowered:
            return "oblique"

    return "direct" if size > 85 else "balanced"
|
|
|
|
|
|
def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean ``text`` and append it to ``candidates`` unless empty or duplicate.

    The text is passed through ``clean_definition``; an empty result is
    ignored. Duplicates are detected per family using the lower-cased cleaned
    text, and ``seen`` is updated in place. The candidate's difficulty hint is
    derived with ``candidate_hint``.
    """
    normalized = clean_definition(text, answer)
    if not normalized:
        return

    dedupe_key = (normalized.lower(), family)
    if dedupe_key in seen:
        return
    seen.add(dedupe_key)

    hint = candidate_hint(normalized, family)
    new_candidate = ClueCandidate(
        text=normalized,
        source=source,
        family=family,
        difficulty_hint=hint,
        topic_score=topic_score,
        strong_topic=strong_topic,
    )
    candidates.append(new_candidate)
|
|
|
|
|
|
def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``"semantic"`` section.

    Every dict in ``semantic["synsets"]`` contributes its ``"definition"``
    (family ``"semantic_definition"``) and every item of
    ``semantic["glosses"]`` contributes a ``"semantic_gloss"`` candidate.
    Missing, null or otherwise falsy texts are skipped.

    Args:
        entry: Lexicon entry dict.
        answer: Solution word, forwarded to ``add_candidate`` for masking.

    Returns:
        The candidates in discovery order; empty when ``"semantic"`` is not
        a dict.
    """
    semantic = entry.get("semantic", {})
    if not isinstance(semantic, dict):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    for synset in semantic.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        definition = synset.get("definition")
        # A null definition must be skipped: str(None) would otherwise become
        # the literal clue text "None.".
        if not definition:
            continue
        add_candidate(
            candidates,
            seen,
            text=str(definition),
            answer=answer,
            source="semantic",
            family="semantic_definition",
        )

    for gloss in semantic.get("glosses", []) or []:
        # Same guard as above for null/empty list items.
        if not gloss:
            continue
        add_candidate(
            candidates,
            seen,
            text=str(gloss),
            answer=answer,
            source="semantic",
            family="semantic_gloss",
        )

    return candidates
|
|
|
|
|
|
def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``"babelnet"`` section.

    Nothing is produced unless ``"babelnet"`` is a dict whose ``"status"`` is
    ``"enriched"`` or ``"ambiguous"``.

    * Glosses of ``best_synset`` become ``"babelnet_best_gloss"`` candidates,
      carrying the synset's own ``"topic_score"`` and a strong-topic flag that
      is true when the synset says so or when
      ``synset_has_strong_topic_domain`` accepts it.
    * Glosses of each dict in ``synsets`` become ``"babelnet_gloss"``
      candidates. For a specific (non-"general") topic, synsets outside the
      topic's strong domains are skipped and the rest get ``topic_score`` 40;
      without a specific topic every synset is kept with ``topic_score`` 0.

    Null or empty glosses are skipped.

    Args:
        entry: Lexicon entry dict.
        answer: Solution word, forwarded to ``add_candidate`` for masking.
        topic: Requested topic, or a falsy value / ``"general"`` for none.

    Returns:
        The candidates in discovery order.
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}:
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    best_synset = babelnet.get("best_synset", {})
    if isinstance(best_synset, dict):
        topic_score = int(best_synset.get("topic_score", 0) or 0)
        strong_topic = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(best_synset, topic)
        for gloss in best_synset.get("glosses", []) or []:
            # str(None) would yield the literal clue "None."; skip such items.
            if not gloss:
                continue
            add_candidate(
                candidates,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=topic_score,
                strong_topic=strong_topic,
            )

    topic_specific = bool(topic) and topic != "general"
    for synset in babelnet.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        # The domain check runs once per synset: it both filters the synset
        # and decides its topic score.
        if topic_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        topic_score = 40 if topic_specific else 0
        for gloss in synset.get("glosses", []) or []:
            if not gloss:
                continue
            add_candidate(
                candidates,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=topic_score,
                strong_topic=topic_score >= 40,
            )

    return candidates
|
|
|
|
|
|
def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``"senses"`` list.

    Each dict sense contributes its ``"definition"`` as a ``"refined_sense"``
    candidate. The sense's ``"confidence"`` (default 0.0) is scaled to an
    integer ``topic_score`` (``confidence * 100``, truncated) and a confidence
    of at least 0.75 marks the candidate as strong. The candidate source is
    the sense's ``"source"``, or ``"refined"`` when that is missing or empty.

    Args:
        entry: Lexicon entry dict.
        answer: Solution word, forwarded to ``add_candidate`` for masking.

    Returns:
        The candidates in discovery order; empty when ``"senses"`` is not a
        list.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in senses:
        if not isinstance(sense, dict):
            continue
        definition = sense.get("definition")
        # A null definition must not be stringified into the clue "None.".
        if not definition:
            continue
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            candidates,
            seen,
            text=str(definition),
            answer=answer,
            # `or` also covers an explicit null, which .get()'s default would
            # let through as the string "None".
            source=str(sense.get("source") or "refined"),
            family="refined_sense",
            topic_score=int(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return candidates
|
|
|
|
|
|
def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a generic clue for an entry that has no usable definition.

    When the entry lists at least one topic other than "general", the clue
    names the part of speech (or "lessicale" when it is missing) and the
    topics. Otherwise a fixed "work it out from the crossings" sentence is
    returned.

    Args:
        entry: Lexicon entry dict; ``"pos"`` and ``"topics"`` are read and may
            be missing or null. A bare string in ``"topics"`` is treated as a
            single topic.
        answer: Unused; kept so the signature matches the other clue builders.

    Returns:
        The fallback clue sentence.
    """
    # `or ""` keeps a null part of speech from being rendered as "none".
    pos = str(entry.get("pos") or "").lower()

    # `or []` tolerates an explicit null, like the other readers in this file.
    raw_topics = entry.get("topics") or []
    if isinstance(raw_topics, str):
        # Iterating a bare string would split it into single characters.
        raw_topics = [raw_topics]

    topics = ", ".join(
        str(topic) for topic in raw_topics if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."
|
|
|
|
|
|
def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather every clue candidate available for an entry.

    The result concatenates, in this order, the semantic candidates, the
    refined-sense candidates and the BabelNet candidates (the only group that
    takes the topic into account).
    """
    return [
        *semantic_candidates(entry, answer),
        *refined_sense_candidates(entry, answer),
        *babelnet_candidates(entry, answer, topic),
    ]
|
|
|
|
|
|
def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Pick the best candidate, or ``None`` when there are none.

    Candidates are compared by ``score_candidate`` first, then by topic score,
    then by text length (longer wins). When several candidates tie on all
    three, the one that appears first in ``candidates`` is returned.
    """

    def ranking(candidate: ClueCandidate) -> Tuple[int, int, int]:
        return (
            score_candidate(candidate, answer, difficulty),
            candidate.topic_score,
            len(candidate.text),
        )

    # max() returns the first maximal item, matching a stable descending sort.
    return max(candidates, key=ranking, default=None)
|
|
|
|
|
|
def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Return the clue text and its source label for ``word``.

    The word is looked up lower-cased in ``entries``. A missing or empty entry
    yields the fixed generic clue with source ``"fallback"``. Otherwise the
    best-ranked candidate for the normalised difficulty is returned, or the
    entry-specific fallback definition (also labelled ``"fallback"``) when no
    candidate exists.
    """
    answer = word.lower()
    entry = entries.get(answer, {})

    if entry:
        level = normalize_difficulty(difficulty)
        pool = all_candidates(entry, answer, topic)
        winner = choose_candidate(pool, answer, level)
        if winner:
            return winner.text, winner.source
        return fallback_definition(entry, answer), "fallback"

    return "Termine da ricavare dagli incroci.", "fallback"
|
|
|
|
|
|
def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Create one ``Clue`` per placement, numbered from 1 in iteration order.

    The clue text and source come from ``definition_for_word``. The direction
    label is ``"orizzontale"`` when the placement's direction equals
    ``HORIZONTAL`` and ``"verticale"`` otherwise; word and coordinates are
    copied from the placement.
    """

    def build(number: int, placement: Placement) -> Clue:
        clue_text, clue_source = definition_for_word(placement.word, entries, topic, difficulty)
        label = "orizzontale" if placement.direction == HORIZONTAL else "verticale"
        return Clue(
            number=number,
            word=placement.word,
            direction=label,
            x=placement.x,
            y=placement.y,
            text=clue_text,
            source=clue_source,
        )

    return [build(number, placement) for number, placement in enumerate(placements, start=1)]
|