# cruciverba_1/curate_lexicon_alpha.py

from __future__ import annotations

import argparse
import json
import re
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH


# Default output files, created next to this module.
CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")

# Difficulty levels for which curate_entry picks one clue definition each.
DIFFICULTIES = ("easy", "medium", "hard", "expert")

# Literal substring fixes (garbled text -> corrected text) that normalize_text
# applies to definition text. Keys are matched case-sensitively.
TEXT_REPLACEMENTS = {
    " ngrandimento": " ingrandimento",
    "superificie": "superficie",
    "quantitaaa": "quantità",
    "quantitaaaa": "quantità",
    "quantit": "quantità",
    "sanit_militare": "sanità_militare",
    " unaparola ": " una parola ",
    "questa parola, ": "",
    "questa parola; ": "",
}

# Regexes used by suspicious_proper_noun_definition. They are searched in
# accent-folded, lowercased text, which is why they are written without
# accents (e.g. "citta").
SUSPICIOUS_PROPER_PATTERNS = (
    r"\bepisodio\b",
    r"\bfilm\b",
    r"\bserie tv\b",
    r"\bfamiglia\b",
    r"\bcomune italiano\b",
    r"\bfrazione del comune\b",
    r"\bcitta metropolitana\b",
    r"\bpersonaggio\b",
    r"\balbum\b",
    r"\bcognome\b",
    r"\bnome proprio\b",
)

# Topic name -> substrings looked up by topic_alignment_score in the folded,
# lowercased definition; each hint found for one of the entry's topics adds
# to the alignment score. Several hints are word stems (e.g. "sacerdot").
DOMAIN_HINTS = {
    "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"),
    "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"),
    "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"),
    "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"),
    "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"),
    "sea": ("acque", "salate", "superficie terrestre", "oceano"),
}

# Regexes used by likely_abstract_detour, searched in accent-folded,
# lowercased text.
ABSTRACT_PATTERNS = (
    r"\bgrande quantita\b",
    r"\bfigurato\b",
    r"\bsenso figurato\b",
)


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    ``--input``, ``--output`` and ``--review-output`` are paths that default
    to the module-level path constants; ``--max-review`` is an integer cap on
    the exported review items, where 0 means "export all".
    """
    cli = argparse.ArgumentParser(
        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
    )
    # The three path options only differ by flag, default and help text.
    path_options = (
        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--output", CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
        (
            "--review-output",
            TO_BE_REVIEW_OUTPUT_PATH,
            "File JSON con le voci che richiedono revisione umana.",
        ),
    )
    for flag, fallback, description in path_options:
        cli.add_argument(flag, type=Path, default=fallback, help=description)
    cli.add_argument(
        "--max-review",
        type=int,
        default=0,
        help="Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
    )
    return cli.parse_args()


def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize *payload* as indented, non-ASCII-escaped JSON into *path* (UTF-8)."""
    # Serialize fully before opening the file so a serialization error
    # never leaves a half-written file behind.
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)


def dedupe(items: Iterable[str]) -> List[str]:
    """Return the distinct non-blank items, compared case-insensitively.

    Every item is converted with ``str`` and stripped; blank results are
    skipped. For each lowercase key the first spelling encountered is kept,
    and the output preserves first-occurrence order.
    """
    first_seen: Dict[str, str] = {}
    for raw in items:
        stripped = str(raw).strip()
        if stripped:
            # setdefault keeps the earliest spelling for each lowercase key.
            first_seen.setdefault(stripped.lower(), stripped)
    return list(first_seen.values())


# Translation table for the accented vowels handled by ascii_fold. Uppercase
# forms are included so that ``ascii_fold(text).lower()`` also folds "À"/"È"
# (otherwise lowering afterwards would reintroduce "à"/"è").
_ASCII_FOLD_TABLE = str.maketrans(
    "àáèéìíòóùúÀÁÈÉÌÍÒÓÙÚ",
    "aaeeiioouuAAEEIIOOUU",
)


def ascii_fold(text: str) -> str:
    """Replace grave/acute accented vowels with their unaccented letter.

    The input is converted with ``str`` first. Both lowercase and uppercase
    accented vowels are folded and case is preserved ("È" -> "E"); every
    other character is returned unchanged.
    """
    return str(text).translate(_ASCII_FOLD_TABLE)


def normalize_text(text: str) -> str:
    """Clean one definition fragment and return it as a capitalized sentence.

    Steps: strip, apply the ``TEXT_REPLACEMENTS`` fixes, collapse whitespace,
    normalize spacing around ``;`` and ``,``, strip leading/trailing
    `` .;:-``, uppercase a lowercase first letter and append a final period.

    Returns ``""`` when the input is empty or contains nothing but the
    stripped punctuation (so no bare ``"."`` is ever produced).
    """
    value = str(text or "").strip()
    if not value:
        return ""

    # Apply all fixes in ONE pass, longest key first, so that
    #   * "quantitaaaa" is not half-consumed by the shorter "quantitaaa", and
    #   * replaced text is never re-scanned by another key.
    # Keys ending in a word character must not be followed by another word
    # character; this keeps "quantit" from rewriting already-correct words
    # such as "quantità" (-> "quantitàà") or "quantitativo".
    alternatives = [
        re.escape(old) + (r"(?!\w)" if re.match(r"\w", old[-1]) else "")
        for old in sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
        if old
    ]
    if alternatives:
        value = re.sub(
            "|".join(alternatives),
            # The lookahead is zero-width, so group(0) is exactly the key.
            lambda match: TEXT_REPLACEMENTS[match.group(0)],
            value,
        )

    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        # Nothing but punctuation/whitespace was left: not a usable fragment.
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."


def split_definition_text(text: str) -> List[str]:
    """Split a raw definition into normalized fragments.

    The text is cut at a semicolon followed by whitespace, or at a period
    followed by whitespace and a letter (matched case-insensitively). Each
    fragment goes through ``normalize_text``; empty results are dropped.
    """
    raw = str(text or "").strip()
    if not raw:
        return []
    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", raw, flags=re.IGNORECASE)
    return [clean for clean in map(normalize_text, fragments) if clean]


def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* looks like a common (non-proper) word.

    True when the ``form`` is non-empty, starts with a lowercase character
    and the entry carries no ``name_tags``.
    """
    form = str(entry.get("form", ""))
    if not form:
        return False
    if not form[0].islower():
        return False
    return not entry.get("name_tags")


def definition_mentions_answer(text: str, answer: str) -> bool:
    """Tell whether *answer* occurs inside *text*, ignoring case and accents.

    Both strings are accent-folded and lowercased, then compared with a plain
    substring test. An empty (or whitespace-only) answer never counts as
    mentioned: an empty needle would otherwise match every definition.

    NOTE(review): the match is a substring match, so a short answer can be
    found inside a longer, unrelated word.
    """
    folded_answer = ascii_fold(answer).lower()
    if not folded_answer.strip():
        return False
    return folded_answer in ascii_fold(text).lower()


def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Flag a definition that matches a proper-noun pattern on a common word.

    Only entries accepted by ``entry_is_common_word`` are checked; the text
    is accent-folded and lowercased before searching each pattern of
    ``SUSPICIOUS_PROPER_PATTERNS``.
    """
    if entry_is_common_word(entry):
        folded = ascii_fold(text).lower()
        for pattern in SUSPICIOUS_PROPER_PATTERNS:
            if re.search(pattern, folded):
                return True
    return False


def likely_abstract_detour(text: str) -> bool:
    """Tell whether the folded, lowercased *text* matches any ``ABSTRACT_PATTERNS`` regex."""
    folded = ascii_fold(text).lower()
    for pattern in ABSTRACT_PATTERNS:
        if re.search(pattern, folded) is not None:
            return True
    return False


def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect the lowercased topic labels attached by the enrichment stages.

    Reads ``entry["semantic"]["semantic_topics"]`` and then
    ``entry["wiktextract"]["topic_hints"]`` (each only when the section is a
    dict) and returns the labels de-duplicated in first-seen order.

    Falsy items (e.g. JSON ``null``) are skipped, as in ``lexical_topics``,
    so that ``None`` is not turned into a bogus ``"none"`` topic.
    """
    collected: List[str] = []
    for section_name, field in (
        ("semantic", "semantic_topics"),
        ("wiktextract", "topic_hints"),
    ):
        section = entry.get(section_name, {})
        if isinstance(section, dict):
            collected.extend(str(item).lower() for item in section.get(field) or [] if item)
    return dedupe(collected)


def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's own ``topics`` lowercased, skipping falsy items."""
    lowered: List[str] = []
    for topic in entry.get("topics") or ():
        if topic:
            lowered.append(str(topic).lower())
    return lowered


def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score how well *text* matches the entry's topics.

    For every distinct topic of the entry (lexical and semantic) that has an
    entry in ``DOMAIN_HINTS``, each hint found as a substring of the folded,
    lowercased text contributes 16 points.
    """
    folded = ascii_fold(text).lower()
    entry_topics = {*lexical_topics(entry), *semantic_topics(entry)}
    hits = sum(
        1
        for topic in entry_topics
        for hint in DOMAIN_HINTS.get(topic, ())
        if hint in folded
    )
    return 16 * hits


def candidate_style(text: str) -> str:
    """Classify a definition as ``"direct"``, ``"balanced"`` or ``"oblique"``.

    ``"direct"`` when the text is longer than 90 characters or contains a
    semicolon; otherwise ``"balanced"`` when the folded, lowercased text
    contains one of the marker substrings; otherwise ``"oblique"``.
    """
    folded = ascii_fold(text).lower()
    if len(text) > 90 or ";" in text:
        return "direct"
    for marker in ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di"):
        if marker in folded:
            return "balanced"
    return "oblique"


def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min, max)`` preferred clue length for *difficulty*.

    Any value other than ``"easy"``, ``"medium"`` or ``"hard"`` gets the
    tightest window, ``(12, 55)``.
    """
    windows = (
        ("easy", (18, 90)),
        ("medium", (18, 78)),
        ("hard", (14, 62)),
    )
    for level, window in windows:
        if difficulty == level:
            return window
    return 12, 55


def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Wrap a definition fragment into a candidate record.

    The text is passed through ``normalize_text`` and its style is derived
    with ``candidate_style``; the remaining fields are stored as given.
    """
    normalized = normalize_text(text)
    candidate: Dict[str, object] = {
        "text": normalized,
        "source": source,
        "family": family,
        "confidence": confidence,
    }
    candidate["style"] = candidate_style(normalized)
    candidate["priority"] = priority
    return candidate


def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather every definition candidate available for *entry*.

    Sources are visited in this order, each contributing fragments produced
    by ``split_definition_text``:

    1. ``semantic.synsets[*].definition``  (family ``semantic_definition``)
    2. ``semantic.glosses``                (family ``semantic_gloss``)
    3. ``senses[*].definition``            (family ``refined_sense``)
    4. ``babelnet.best_synset.glosses``    (family ``babelnet_gloss``)
    5. ``wiktextract.definitions``         (family ``wiktextract_definition``)

    Priority decreases with the position inside each source. A candidate is
    kept only once per (lowercased text, family) pair. Missing or ``null``
    definition values are ignored instead of being stringified to "None".
    """
    candidates: List[Dict[str, object]] = []
    seen = set()

    def pieces_of(raw: object) -> List[str]:
        # A JSON null must yield no fragments; str(None) would otherwise
        # produce the bogus definition "None.".
        return split_definition_text("" if raw is None else str(raw))

    def register(piece: str, *, source: str, family: str, confidence: float, priority: int) -> None:
        # Build the candidate and keep it unless an identical text was
        # already collected for the same family.
        candidate = build_candidate(
            piece,
            source=source,
            family=family,
            confidence=confidence,
            priority=priority,
        )
        text = candidate["text"]
        key = (text.lower(), candidate["family"])
        if text and key not in seen:
            seen.add(key)
            candidates.append(candidate)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            for piece in pieces_of(synset.get("definition")):
                register(
                    piece,
                    source="semantic",
                    family="semantic_definition",
                    confidence=0.9,
                    priority=max(0, 100 - index * 12),
                )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            for piece in pieces_of(gloss):
                register(
                    piece,
                    source="semantic_gloss",
                    family="semantic_gloss",
                    confidence=0.8,
                    priority=max(0, 90 - index * 10),
                )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        for piece in pieces_of(sense.get("definition")):
            # A missing or null source falls back to "refined"; senses that
            # originate from the semantic stage are relabelled "refined" too.
            source = str(sense.get("source") or "refined")
            register(
                piece,
                source="refined" if source == "semantic" else source,
                family="refined_sense",
                confidence=float(sense.get("confidence", 0.7) or 0.7),
                priority=max(0, 80 - index * 8),
            )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            # Glosses from a non-"enriched" BabelNet match are trusted less.
            confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                for piece in pieces_of(gloss):
                    register(
                        piece,
                        source="babelnet",
                        family="babelnet_gloss",
                        confidence=confidence,
                        priority=max(0, 60 - index * 8),
                    )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            for piece in pieces_of(definition):
                register(
                    piece,
                    source="wiktextract",
                    family="wiktextract_definition",
                    confidence=confidence,
                    priority=max(0, 88 - index * 9),
                )

    return candidates


def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Compute the ranking score of *candidate* for *entry* at *difficulty*.

    Texts shorter than 12 characters get the sentinel ``-10_000``. Otherwise
    the score is the sum of: answer-leak penalty/bonus, proper-noun and
    abstract-detour penalties, a length-window term, source and family
    bonuses, the candidate priority, a confidence term, topic alignment
    (with a penalty when a concrete topic finds no hint), a style bonus tied
    to the difficulty, register-label and semicolon penalties, and a small
    penalty when the entry is already flagged ``needs_review``.
    """
    text = str(candidate["text"])
    answer = str(entry.get("form", "")).lower()
    source = str(candidate.get("source"))
    family = str(candidate.get("family"))
    confidence = float(candidate.get("confidence", 0.0) or 0.0)

    text_length = len(text)
    if text_length < 12:
        return -10_000

    # A definition that contains the answer gives the solution away.
    total = -140 if definition_mentions_answer(text, answer) else 30

    if suspicious_proper_noun_definition(text, entry):
        total -= 220
    if likely_abstract_detour(text):
        total -= 80

    # Length: flat bonus inside the window; full excess as penalty when too
    # long, half of the deficit when too short.
    shortest, longest = length_window(difficulty)
    if text_length > longest:
        total -= text_length - longest
    elif text_length < shortest:
        total -= (shortest - text_length) // 2
    else:
        total += 24

    total += {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }.get(source, 10)
    total += {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }.get(family, 0)

    total += int(candidate.get("priority", 0) or 0)
    total += int(confidence * 35)

    alignment = topic_alignment_score(text, entry)
    total += alignment
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    has_concrete_topic = bool(
        entry_topics & {"religion", "transport", "health", "nature", "geography", "sea"}
    )
    if has_concrete_topic and alignment == 0:
        total -= 45

    # Style bonus: each difficulty rewards specific clue styles.
    style = str(candidate.get("style"))
    style_rewards = {
        "easy": (("direct",), 16),
        "medium": (("direct", "balanced"), 14),
        "hard": (("balanced",), 10),
        "expert": (("oblique",), 10),
    }
    rewarded_styles, reward = style_rewards.get(difficulty, ((), 0))
    if style in rewarded_styles:
        total += reward

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
        total -= 30
    if difficulty in {"hard", "expert"} and ";" in text:
        total -= 10

    if entry.get("needs_review"):
        total -= 8

    return total


def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Return the top-ranked candidate for *difficulty*, or ``None`` if there is none.

    Ranking key, highest first: score, confidence, priority, then shorter
    text. Among candidates with an identical key the earliest one in
    *candidates* wins.
    """

    def rank(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0)),
            float(candidate.get("priority", 0.0)),
            -len(str(candidate.get("text", ""))),
        )

    # max() returns the first maximal item, exactly like taking element 0 of
    # a stable descending sort.
    return max(candidates, key=rank, default=None)


def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reason codes for which *entry* needs human review.

    Reads ``preferred_definition`` / ``preferred_source`` from *entry*
    together with its topics, BabelNet and Wiktextract status, form, part of
    speech and senses. Possible codes, in emission order:
    ``no_viable_definition``, ``flagged_by_refined_stage``,
    ``proper_noun_collision``, ``babelnet_ambiguous``,
    ``wiktextract_missing``, ``only_general_topics``, ``very_short_word``,
    ``function_word``, ``unresolved_sense_topics``,
    ``candidate_mentions_answer``.

    ``babelnet``, ``wiktextract`` and ``senses`` may be missing, ``null`` or
    of an unexpected type; such sections are treated as absent.
    """
    reasons: List[str] = []
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))
    babelnet = entry.get("babelnet")
    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""
    wiktextract = entry.get("wiktextract")
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))

    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    if preferred_source == "babelnet":
        # `or []` keeps a null "senses" value from raising TypeError.
        senses = [sense for sense in entry.get("senses") or [] if isinstance(sense, dict)]
        # A sense whose topics are missing, or contain None, shows up as
        # "None" in the string form of the value.
        if any("None" in str(sense.get("topics")) for sense in senses):
            reasons.append("unresolved_sense_topics")
    if preferred_definition and definition_mentions_answer(preferred_definition, form.lower()):
        reasons.append("candidate_mentions_answer")

    return dedupe(reasons)


def _build_review_item(
    curated: Dict[str, object],
    candidates: Sequence[Dict[str, object]],
    reasons: List[str],
) -> Dict[str, object]:
    """Assemble the record exported for an entry that needs human review."""
    semantic = curated.get("semantic") or {}
    babelnet = curated.get("babelnet") or {}
    wiktextract = curated.get("wiktextract") or {}
    pool_fields = ("text", "source", "family", "confidence", "priority")
    return {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": curated["preferred_definition"],
        "preferred_source": curated["preferred_source"],
        "clue_definitions": curated["clue_definitions"],
        "review_reasons": reasons,
        "semantic_glosses": semantic.get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": babelnet.get("status"),
        "babelnet_best_synset": babelnet.get("best_synset"),
        "wiktextract_status": wiktextract.get("status"),
        "wiktextract": curated.get("wiktextract"),
        # Only the first 12 candidates are exported.
        "candidate_pool": [
            {field: candidate[field] for field in pool_fields}
            for candidate in candidates[:12]
        ],
    }


def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry.

    Works on a deep copy of *entry*: collects the definition candidates,
    picks the best one per difficulty, stores the curated fields and the
    review reasons on the copy, and computes ``alpha_ready``.

    Returns ``(curated_entry, review_item)`` where ``review_item`` is
    ``None`` when no review reason was raised.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for level in DIFFICULTIES:
        winner = choose_best_candidate(candidates, curated, level)
        if not winner:
            continue
        clue_definitions[level] = str(winner["text"])
        clue_sources[level] = str(winner["source"])
        clue_scores[level] = score_candidate(winner, curated, level)

    # The medium clue is the reference definition, the easy one the fallback.
    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    sense_fields = ("source", "family", "confidence", "priority")
    curated.update(
        {
            "curated_glosses": dedupe(candidate["text"] for candidate in candidates),
            "curated_senses": [
                {
                    "definition": candidate["text"],
                    **{field: candidate[field] for field in sense_fields},
                }
                for candidate in candidates
            ],
            "preferred_definition": preferred_definition,
            "preferred_source": preferred_source,
            "clue_definitions": clue_definitions,
            "clue_sources": clue_sources,
            "clue_scores": clue_scores,
            "curation_notes": curation_notes,
        }
    )

    reasons = review_reasons(curated, candidates)
    # Any of these reasons blocks the entry from the alpha milestone.
    blocking = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and not blocking.intersection(reasons)
    curated["review_reasons"] = reasons

    review_item = _build_review_item(curated, candidates, reasons) if reasons else None
    return curated, review_item


def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon and build both output payloads.

    Loads ``args.input``, curates each dict entry with ``curate_entry`` and
    returns ``(curated_payload, review_payload)``. When ``args.max_review``
    is positive, only that many review items are kept.

    Raises:
        ValueError: if the input document is not a dict with an "entries" key.
    """
    payload = load_json(args.input)
    if not (isinstance(payload, dict) and "entries" in payload):
        raise ValueError(f"Lessico refined non valido: {args.input}")

    outcomes = [
        curate_entry(raw_entry)
        for raw_entry in payload.get("entries", []) or []
        if isinstance(raw_entry, dict)
    ]
    curated_entries: List[Dict[str, object]] = [curated for curated, _ in outcomes]
    review_entries: List[Dict[str, object]] = [item for _, item in outcomes if item]

    limit = args.max_review
    if limit > 0:
        del review_entries[limit:]

    def base_meta(entry_count: int) -> Dict[str, object]:
        # Fields shared by the metadata of both payloads.
        return {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": entry_count,
        }

    curated_meta = base_meta(len(curated_entries))
    curated_meta["alpha_ready_count"] = len(
        [item for item in curated_entries if item.get("alpha_ready")]
    )
    # Counts the exported review items, i.e. after the optional cap.
    curated_meta["review_count"] = len(review_entries)
    review_meta = base_meta(len(review_entries))

    return (
        {"meta": curated_meta, "entries": curated_entries},
        {"meta": review_meta, "entries": review_entries},
    )


def main() -> None:
    """Run the curation from the command line: parse, curate, write, report."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)

    for destination, document in (
        (args.output, curated_payload),
        (args.review_output, review_payload),
    ):
        write_json(destination, document)

    curated_meta = curated_payload["meta"]
    review_meta = review_payload["meta"]
    summary = (
        f"Lessico curated generato: {args.output}",
        f"Voci totali: {curated_meta['entry_count']}",
        f"Voci alpha_ready: {curated_meta['alpha_ready_count']}",
        f"Voci da revisionare: {review_meta['entry_count']}",
        f"File review generato: {args.review_output}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()