# cruciverba_1/build_review_priority.py

from __future__ import annotations

import argparse
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple


# Default input and output files are siblings of this module.
_MODULE_DIR = Path(__file__).parent
REVIEW_INPUT_PATH = _MODULE_DIR / "to_be_review.json"
PRIORITY_OUTPUT_PATH = _MODULE_DIR / "to_be_review_priority.json"

# Points added to an entry's priority score for each review reason it carries
# (reasons missing from this table get a small default inside priority_score).
REASON_WEIGHTS = dict(
    no_viable_definition=100,
    proper_noun_collision=90,
    candidate_mentions_answer=85,
    function_word=80,
    very_short_word=75,
    wiktextract_missing=55,
    only_general_topics=45,
    flagged_by_refined_stage=35,
    unresolved_sense_topics=30,
    babelnet_ambiguous=20,
)

# Points added according to the (lower-cased) preferred definition source.
SOURCE_WEIGHTS = dict(
    fallback=50,
    babelnet=18,
    semantic=8,
    wiktextract=6,
)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the priority-review builder.

    Options:
        --input: review file to read (defaults to ``REVIEW_INPUT_PATH``).
        --output: prioritized file to write (defaults to
            ``PRIORITY_OUTPUT_PATH``).
        --top: maximum number of entries to keep; ``0`` keeps all of them.

    Returns:
        The namespace produced by ``argparse`` from the process arguments.
    """
    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )
    # (flag, type, default, help) for every option, in declaration order.
    option_specs = (
        ("--input", Path, REVIEW_INPUT_PATH, "File to_be_review.json di partenza."),
        ("--output", Path, PRIORITY_OUTPUT_PATH, "File to_be_review_priority.json da generare."),
        ("--top", int, 0, "Numero massimo di voci da tenere nel file priority. 0 = tutte."),
    )
    for flag, value_type, fallback, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=help_text)
    return cli.parse_args()


def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write *payload* to *path* as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    # Serialize before opening the file so a failure leaves the target untouched.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)


def _text_or_empty(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` (JSON ``null``) to ``""``."""
    return "" if value is None else str(value)


def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the ranking key of a review entry.

    The numeric score is built from:

    * the weight of every review reason (``REASON_WEIGHTS``, 5 for reasons
      not listed there);
    * the weight of the preferred source (``SOURCE_WEIGHTS``, 0 if unknown);
    * +40 when there is no preferred definition;
    * +20 when no clue definition is filled in, +8 when exactly one is;
    * +3 per review reason, capped at five reasons;
    * -120 for forms of at most two characters, -35 for three characters.

    Fields that are missing or ``null`` are treated as empty, so a ``null``
    definition counts as a missing definition instead of the text ``"None"``.

    Args:
        entry: One review entry as decoded from the review JSON file.

    Returns:
        A tuple ``(score, severe_reason_count, is_fallback_source,
        -len(preferred_definition), form)``. ``build_priority_review`` sorts
        on it in descending order, so later items only break ties.
    """
    severe_reasons = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}

    reasons = [str(item) for item in entry.get("review_reasons") or []]
    # JSON null arrives as None; str(None) would yield the truthy text "None".
    source = _text_or_empty(entry.get("preferred_source")).lower()
    preferred_definition = _text_or_empty(entry.get("preferred_definition"))
    clue_definitions = entry.get("clue_definitions") or {}
    form = _text_or_empty(entry.get("form"))

    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    # Only clue definitions with visible text count; null values do not.
    clue_count = sum(
        1 for value in clue_definitions.values() if _text_or_empty(value).strip()
    )
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the list despite their other weights.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_count = sum(1 for reason in reasons if reason in severe_reasons)
    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )


def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify an entry as ``"red"``, ``"orange"`` or ``"yellow"``.

    The first tier that shares at least one reason with the entry's
    ``review_reasons`` wins; entries matching no tier are ``"yellow"``.
    """
    found = {str(item) for item in entry.get("review_reasons", []) or []}
    # Tiers are checked from most to least severe.
    tiers = (
        ("red", ("no_viable_definition", "proper_noun_collision", "candidate_mentions_answer")),
        ("orange", ("function_word", "very_short_word", "wiktextract_missing", "only_general_topics")),
    )
    for colour, triggers in tiers:
        if any(trigger in found for trigger in triggers):
            return colour
    return "yellow"


def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its priority data.

    The copy gains ``priority_score`` (first item of *score_tuple*) and
    ``priority_bucket``; the input entry is left unmodified.
    """
    return {
        **entry,
        "priority_score": score_tuple[0],
        "priority_bucket": priority_bucket(entry),
    }


def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Build the prioritized review document from the input review file.

    Entries are ranked by ``priority_score`` (highest first), optionally
    truncated to ``args.top`` items, and annotated via ``compact_entry``.
    "Practical" entries are those whose form is longer than two characters.

    Args:
        args: Parsed options; ``args.input`` is the review file to read and
            ``args.top`` the maximum number of entries to keep (values
            ``<= 0`` keep every entry).

    Returns:
        A dict with ``meta`` (counts, bucket statistics, generation time),
        ``entries`` (all ranked entries) and ``practical_entries``.

    Raises:
        ValueError: If the input file is not a JSON object with an
            ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries") or [] if isinstance(entry, dict)]

    # Score each entry once and reuse the tuple for both ranking and output.
    # The sort is stable, so ties keep their input order exactly as before.
    scored = [(priority_score(entry), entry) for entry in entries]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]

    # A null form has no usable text, so it must not count as practical.
    practical_entries = [
        item
        for item in compact_entries
        if item.get("form") is not None and len(str(item["form"])) > 2
    ]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # `or []` keeps a null review_reasons from raising, matching the scoring code.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in item.get("review_reasons") or []
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }


def main() -> None:
    """Run the CLI: build the prioritized review, save it and print a summary."""
    options = parse_args()
    result = build_priority_review(options)
    write_json(options.output, result)

    print(f"Review priority generato: {options.output}")
    summary = result["meta"]
    print(f"Voci nel priority file: {summary['entry_count']}")
    print(f"Bucket: {summary['bucket_counts']}")

# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()