alpha01 backoffice: crossword engine, lexicon curation and JSON contract

This commit is contained in:
2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions

182
build_review_priority.py Normal file
View File

@@ -0,0 +1,182 @@
from __future__ import annotations
import argparse
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple
# Default input/output files live next to this script.
REVIEW_INPUT_PATH: Path = Path(__file__).parent / "to_be_review.json"
PRIORITY_OUTPUT_PATH: Path = Path(__file__).parent / "to_be_review_priority.json"

# Points added to an entry's priority score for each review reason it carries.
# Reasons not listed here are scored with a small default by priority_score.
REASON_WEIGHTS: Dict[str, int] = {
    # Reasons that priority_bucket maps to the "red" bucket.
    "no_viable_definition": 100,
    "proper_noun_collision": 90,
    "candidate_mentions_answer": 85,
    # Reasons that priority_bucket maps to the "orange" bucket.
    "function_word": 80,
    "very_short_word": 75,
    "wiktextract_missing": 55,
    "only_general_topics": 45,
    # Remaining reasons fall into the "yellow" bucket.
    "flagged_by_refined_stage": 35,
    "unresolved_sense_topics": 30,
    "babelnet_ambiguous": 20,
}

# Points added according to the (lower-cased) preferred_source of an entry.
SOURCE_WEIGHTS: Dict[str, int] = {
    "fallback": 50,
    "babelnet": 18,
    "semantic": 8,
    "wiktextract": 6,
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the priority-review builder.

    Returns:
        A namespace with ``input`` and ``output`` (both ``Path``) and ``top``
        (``int``; 0 means "keep every entry").
    """
    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )
    # (flag, converter, default, help text) for every supported option.
    option_specs = (
        ("--input", Path, REVIEW_INPUT_PATH, "File to_be_review.json di partenza."),
        ("--output", Path, PRIORITY_OUTPUT_PATH, "File to_be_review_priority.json da generare."),
        ("--top", int, 0, "Numero massimo di voci da tenere nel file priority. 0 = tutte."),
    )
    for flag, converter, fallback, help_text in option_specs:
        cli.add_argument(flag, type=converter, default=fallback, help=help_text)
    return cli.parse_args()
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    (as written by some editors) is dropped instead of making ``json.loads``
    fail; files without a BOM are read exactly as plain UTF-8.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The parsed JSON value. The type is not validated here; callers that
        need a mapping must check it themselves.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8-sig"))
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize *payload* as indented JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written verbatim rather than as ``\\uXXXX``
    escapes, and nested structures are indented by two spaces.
    """
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")
def _as_text(value: object) -> str:
    """Return *value* as a string, mapping ``None`` (JSON null) to ``""``."""
    return "" if value is None else str(value)


def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank a review entry.

    The first element is a numeric score built from:

    * the weight of every review reason (unknown reasons count 5);
    * the weight of the lower-cased preferred source;
    * +40 when the preferred definition is missing or blank;
    * +20 / +8 when the entry has zero / exactly one non-blank clue;
    * +3 per reason, capped at five reasons;
    * -120 for forms of at most two characters, -35 for three characters.

    The remaining elements break ties: number of severe reasons, whether the
    source is ``"fallback"``, the negated definition length (shorter
    definitions compare higher) and finally the form itself.

    JSON ``null`` values are treated as missing rather than as the text
    ``"None"``, and a ``clue_definitions`` value that is not a mapping is
    treated as "no clues" instead of raising.

    Args:
        entry: One review entry as loaded from the review JSON file.

    Returns:
        ``(score, severe_count, is_fallback, -definition_length, form)``.
    """
    reasons = [str(item) for item in entry.get("review_reasons") or []]
    source = _as_text(entry.get("preferred_source")).lower()
    preferred_definition = _as_text(entry.get("preferred_definition"))
    form = _as_text(entry.get("form"))
    clue_definitions = entry.get("clue_definitions")
    if not isinstance(clue_definitions, dict):
        # Malformed or null clue container: score it as having no clues.
        clue_definitions = {}

    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    # A whitespace-only definition is as useless as a missing one; clues
    # below are judged the same way.
    if not preferred_definition.strip():
        score += 40

    clue_count = sum(1 for value in clue_definitions.values() if _as_text(value).strip())
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_count = sum(
        1
        for reason in reasons
        if reason in {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    )
    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )
def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify an entry as ``"red"``, ``"orange"`` or ``"yellow"``.

    The entry's review reasons are checked against the tiers in order and the
    first tier sharing at least one reason wins; entries matching no tier
    (including entries without reasons) are ``"yellow"``.
    """
    found = set(map(str, entry.get("review_reasons", []) or []))
    tiers = (
        ("red", {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}),
        ("orange", {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}),
    )
    for colour, triggers in tiers:
        if not found.isdisjoint(triggers):
            return colour
    return "yellow"
def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its score and bucket.

    Only the first element of *score_tuple* (the numeric score) is stored,
    under ``"priority_score"``; ``"priority_bucket"`` is derived from the
    entry itself. The input entry is not modified.
    """
    return {
        **entry,
        "priority_score": score_tuple[0],
        "priority_bucket": priority_bucket(entry),
    }
def _form_length(entry: Dict[str, object]) -> int:
    """Return the length of the entry's ``form``; a missing or null form counts as 0."""
    form = entry.get("form")
    return 0 if form is None else len(str(form))


def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Build the prioritized review document from the input review file.

    Entries are ranked by ``priority_score`` in descending order, optionally
    truncated to the first ``args.top`` items, and annotated through
    ``compact_entry``. "Practical" entries are the ranked entries whose form
    is longer than two characters.

    Args:
        args: Parsed CLI options; ``input`` (path of the review file) and
            ``top`` (maximum number of entries, values <= 0 keep all) are read.

    Returns:
        A dict with ``"meta"`` (counts and generation info), ``"entries"``
        and ``"practical_entries"``.

    Raises:
        ValueError: If the input document is not a JSON object or has no
            ``"entries"`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries") or [] if isinstance(entry, dict)]

    # Score every entry once and reuse the key both for ranking and for the
    # annotation, instead of calling priority_score twice per entry.
    scored = [(priority_score(entry), entry) for entry in entries]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]
    practical_entries = [item for item in compact_entries if _form_length(item) > 2]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # `or []` keeps a null "review_reasons" from raising, matching how the
    # scoring functions read the same field.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in item.get("review_reasons") or []
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }
def main() -> None:
    """Run the CLI: build the prioritized review, write it, print a summary."""
    cli_args = parse_args()
    report = build_priority_review(cli_args)
    write_json(cli_args.output, report)

    print(f"Review priority generato: {cli_args.output}")
    summary = report["meta"]
    print(f"Voci nel priority file: {summary['entry_count']}")
    print(f"Bucket: {summary['bucket_counts']}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()