alpha01 backoffice: crossword engine, lexicon curation and JSON contract

2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions
--- a/build_review_priority.py
+++ b/build_review_priority.py
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import argparse
+import json
+from collections import Counter
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+
+# Default input/output files; both sit in the same directory as this script.
+REVIEW_INPUT_PATH = Path(__file__).parent / "to_be_review.json"
+PRIORITY_OUTPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
+
+# Points added to an entry's score for each review reason it carries.
+# Reasons not listed here fall back to a weight of 5 in priority_score().
+REASON_WEIGHTS = dict(
+    no_viable_definition=100,
+    proper_noun_collision=90,
+    candidate_mentions_answer=85,
+    function_word=80,
+    very_short_word=75,
+    wiktextract_missing=55,
+    only_general_topics=45,
+    flagged_by_refined_stage=35,
+    unresolved_sense_topics=30,
+    babelnet_ambiguous=20,
+)
+
+# Points added according to the entry's lower-cased ``preferred_source``.
+# Sources not listed here add nothing.
+SOURCE_WEIGHTS = dict(
+    fallback=50,
+    babelnet=18,
+    semantic=8,
+    wiktextract=6,
+)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the priority-review builder.
+
+    Returns:
+        A namespace with ``input`` and ``output`` (both ``Path``) and ``top``
+        (``int``; 0 means "keep every entry").
+    """
+    cli = argparse.ArgumentParser(
+        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
+    )
+
+    # The two path options share the same shape, so register them from a table.
+    path_options = (
+        ("--input", REVIEW_INPUT_PATH, "File to_be_review.json di partenza."),
+        ("--output", PRIORITY_OUTPUT_PATH, "File to_be_review_priority.json da generare."),
+    )
+    for flag, fallback, help_text in path_options:
+        cli.add_argument(flag, type=Path, default=fallback, help=help_text)
+
+    cli.add_argument(
+        "--top",
+        type=int,
+        default=0,
+        help="Numero massimo di voci da tenere nel file priority. 0 = tutte.",
+    )
+    return cli.parse_args()
+
+
+def load_json(path: Path) -> Dict[str, object]:
+    """Read *path* and return the parsed JSON document.
+
+    The file is decoded with ``utf-8-sig`` so that a leading UTF-8 byte-order
+    mark is skipped: ``json.loads`` refuses text that still starts with a BOM.
+    Files without a BOM decode exactly as they would with plain UTF-8.
+
+    Raises:
+        OSError: If the file cannot be read.
+        json.JSONDecodeError: If the content is not valid JSON.
+    """
+    return json.loads(path.read_text(encoding="utf-8-sig"))
+
+
+def write_json(path: Path, payload: Dict[str, object]) -> None:
+    """Serialize *payload* as 2-space-indented JSON and write it to *path* as UTF-8.
+
+    Non-ASCII characters are written verbatim rather than as ``\\uXXXX`` escapes.
+    """
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    path.write_text(serialized, encoding="utf-8")
+
+
+def _field_text(entry: Dict[str, object], key: str) -> str:
+    """Return ``entry[key]`` as text, mapping a missing or null value to ``""``."""
+    value = entry.get(key)
+    return "" if value is None else str(value)
+
+
+def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
+    """Compute the sort key used to rank a review entry.
+
+    The first element is the numeric priority; the remaining elements only
+    break ties when two entries share the same score.
+
+    Score components:
+      * the weight of every review reason (unknown reasons count 5);
+      * the weight of the preferred source (unknown sources count 0);
+      * +40 when there is no preferred definition (blank counts as missing);
+      * +20 when no clue definition is filled in, +8 when exactly one is;
+      * +3 per review reason, capped at five reasons;
+      * -120 for forms of at most two characters, -35 for three characters.
+
+    JSON ``null`` values are treated like missing keys, so they never turn
+    into the literal text ``"None"``.
+
+    Args:
+        entry: One review entry as loaded from the review JSON.
+
+    Returns:
+        ``(score, severe_reason_count, is_fallback_source,
+        -len(preferred_definition), form)``.
+    """
+    severe_reasons = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
+
+    reasons = [str(item) for item in entry.get("review_reasons", []) or []]
+    source = _field_text(entry, "preferred_source").lower()
+    preferred_definition = _field_text(entry, "preferred_definition")
+    form = _field_text(entry, "form")
+
+    # Anything other than a mapping (null, list, string...) means "no clues".
+    clue_definitions = entry.get("clue_definitions")
+    if not isinstance(clue_definitions, dict):
+        clue_definitions = {}
+
+    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
+    score += SOURCE_WEIGHTS.get(source, 0)
+
+    # Blank definitions are as unusable as absent ones (same rule as for clues).
+    if not preferred_definition.strip():
+        score += 40
+
+    clue_count = sum(
+        1
+        for value in clue_definitions.values()
+        if value is not None and str(value).strip()
+    )
+    if clue_count == 0:
+        score += 20
+    elif clue_count == 1:
+        score += 8
+
+    score += min(len(reasons), 5) * 3
+
+    # Very short forms are pushed down the list regardless of their reasons.
+    if len(form) <= 2:
+        score -= 120
+    elif len(form) == 3:
+        score -= 35
+
+    severe_count = sum(1 for reason in reasons if reason in severe_reasons)
+    return (
+        score,
+        severe_count,
+        int(source == "fallback"),
+        -len(preferred_definition),
+        form,
+    )
+
+
+def priority_bucket(entry: Dict[str, object]) -> str:
+    """Classify an entry as ``"red"``, ``"orange"`` or ``"yellow"``.
+
+    The tiers are checked from most to least severe and the first tier sharing
+    at least one reason with the entry wins; entries matching no tier (or
+    carrying no reasons at all) are ``"yellow"``.
+    """
+    flagged = set(map(str, entry.get("review_reasons", []) or []))
+    tiers = (
+        ("red", {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}),
+        ("orange", {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}),
+    )
+    for colour, triggers in tiers:
+        if not flagged.isdisjoint(triggers):
+            return colour
+    return "yellow"
+
+
+def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
+    """Return a shallow copy of *entry* annotated with its score and bucket.
+
+    Only the first element of *score_tuple* (the numeric score) is stored; the
+    input entry itself is left unmodified.
+    """
+    return {
+        **entry,
+        "priority_score": score_tuple[0],
+        "priority_bucket": priority_bucket(entry),
+    }
+
+
+def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
+    """Load the review file, rank its entries and assemble the priority payload.
+
+    Args:
+        args: Parsed CLI options. ``args.input`` is the review JSON to read and
+            ``args.top`` caps how many ranked entries are kept (values <= 0
+            keep everything).
+
+    Returns:
+        A dict with ``meta`` (counts and generation metadata), ``entries``
+        (ranked entries annotated with ``priority_score`` and
+        ``priority_bucket``) and ``practical_entries`` (the subset whose
+        ``form`` is longer than two characters).
+
+    Raises:
+        ValueError: If the input is not a JSON object with an ``entries`` key.
+    """
+    payload = load_json(args.input)
+    if not isinstance(payload, dict) or "entries" not in payload:
+        raise ValueError(f"File review non valido: {args.input}")
+
+    # Items that are not JSON objects are ignored rather than treated as errors.
+    entries = [entry for entry in payload.get("entries", []) or [] if isinstance(entry, dict)]
+
+    # Score every entry exactly once: the same tuple drives both the ordering
+    # and the ``priority_score`` written to the output.
+    scored = sorted(
+        ((priority_score(entry), entry) for entry in entries),
+        key=lambda pair: pair[0],
+        reverse=True,
+    )
+
+    if args.top > 0:
+        scored = scored[: args.top]
+
+    compact_entries = [compact_entry(entry, score) for score, entry in scored]
+
+    # A null form is not a usable word, so it must not be measured as the
+    # four-character text "None".
+    practical_entries = [
+        item
+        for item in compact_entries
+        if item.get("form") is not None and len(str(item["form"])) > 2
+    ]
+
+    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
+    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
+    reason_counter: Counter = Counter()
+    for item in compact_entries:
+        # ``review_reasons`` may be null in the JSON; treat that as "no reasons".
+        reason_counter.update(str(reason) for reason in item.get("review_reasons") or [])
+
+    return {
+        "meta": {
+            "language": "it",
+            "version": 1,
+            "base_review": args.input.name,
+            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+            "entry_count": len(compact_entries),
+            "bucket_counts": dict(bucket_counter),
+            "practical_entry_count": len(practical_entries),
+            "practical_bucket_counts": dict(practical_bucket_counter),
+            "top_reason_counts": dict(reason_counter.most_common(12)),
+        },
+        "entries": compact_entries,
+        "practical_entries": practical_entries,
+    }
+
+
+def main() -> None:
+    """CLI entry point: build the prioritized review, write it and print a summary."""
+    options = parse_args()
+    report = build_priority_review(options)
+    write_json(options.output, report)
+
+    meta = report["meta"]
+    print(f"Review priority generato: {options.output}")
+    print(f"Voci nel priority file: {meta['entry_count']}")
+    print(f"Bucket: {meta['bucket_counts']}")
+
+
+# Run the builder only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()