"""Build a prioritized review file from ``to_be_review.json``.

Every entry of the input review file is scored from its review reasons,
preferred source, definition/clue availability and form length. Entries are
then sorted from most to least urgent, tagged with a colour bucket
(``red`` / ``orange`` / ``yellow``) and written to
``to_be_review_priority.json`` together with summary counters.
"""

from __future__ import annotations

import argparse
import json
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
PRIORITY_OUTPUT_PATH = Path(__file__).with_name("to_be_review_priority.json")

# Score added for each known review reason attached to an entry.
REASON_WEIGHTS = {
    "no_viable_definition": 100,
    "proper_noun_collision": 90,
    "candidate_mentions_answer": 85,
    "function_word": 80,
    "very_short_word": 75,
    "wiktextract_missing": 55,
    "only_general_topics": 45,
    "flagged_by_refined_stage": 35,
    "unresolved_sense_topics": 30,
    "babelnet_ambiguous": 20,
}
# Score added for a review reason that is not listed in REASON_WEIGHTS.
DEFAULT_REASON_WEIGHT = 5

# Score added depending on the (lower-cased) preferred source of the entry.
SOURCE_WEIGHTS = {
    "fallback": 50,
    "babelnet": 18,
    "semantic": 8,
    "wiktextract": 6,
}

# Reasons that put an entry in the "red" bucket and count as severe.
SEVERE_REASONS = frozenset(
    {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
)
# Reasons that put an entry in the "orange" bucket when no severe one applies.
MODERATE_REASONS = frozenset(
    {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}
)


def _as_text(value: object) -> str:
    """Return *value* as a string, mapping ``None`` (JSON ``null``) to ``""``.

    A plain ``str(None)`` yields the literal ``"None"``, which would make a
    null field look like a four-character, non-empty value.
    """
    return "" if value is None else str(value)


def _review_reasons(entry: Dict[str, object]) -> List[str]:
    """Return the entry's review reasons as a list of strings.

    A missing, null or empty ``review_reasons`` field gives an empty list.
    A bare string is treated as a single reason instead of being iterated
    character by character.
    """
    raw = entry.get("review_reasons")
    if not raw:
        return []
    if isinstance(raw, str):
        return [raw]
    return [str(item) for item in raw]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Returns:
        Namespace with ``input`` and ``output`` (``Path``) and ``top`` (``int``).
    """
    parser = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=REVIEW_INPUT_PATH,
        help="File to_be_review.json di partenza.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=PRIORITY_OUTPUT_PATH,
        help="File to_be_review_priority.json da generare.",
    )
    parser.add_argument(
        "--top",
        type=int,
        default=0,
        help="Numero massimo di voci da tenere nel file priority. 0 = tutte.",
    )
    return parser.parse_args(argv)


def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 and return the decoded JSON document.

    The decoded value is returned as-is; its shape is not validated here.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII text."""
    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")


def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank a review entry.

    Args:
        entry: One review entry. The fields read are ``review_reasons``,
            ``preferred_source``, ``preferred_definition``,
            ``clue_definitions`` and ``form``. Missing or null fields are
            treated as empty.

    Returns:
        ``(score, severe_count, is_fallback, -len(preferred_definition), form)``.
        Larger tuples mean higher priority when sorted in descending order.
    """
    reasons = _review_reasons(entry)
    source = _as_text(entry.get("preferred_source")).lower()
    preferred_definition = _as_text(entry.get("preferred_definition"))
    clue_definitions = entry.get("clue_definitions") or {}
    form = _as_text(entry.get("form"))

    score = sum(REASON_WEIGHTS.get(reason, DEFAULT_REASON_WEIGHT) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    # Only clue definitions with visible text count; null values do not.
    clue_count = sum(1 for value in clue_definitions.values() if _as_text(value).strip())
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    # Small bonus per reason, capped at five reasons.
    score += min(len(reasons), 5) * 3

    # Short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_count = sum(1 for reason in reasons if reason in SEVERE_REASONS)
    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )


def priority_bucket(entry: Dict[str, object]) -> str:
    """Return the colour bucket for *entry*.

    ``"red"`` if any severe reason is present, otherwise ``"orange"`` if any
    moderate reason is present, otherwise ``"yellow"``.
    """
    reasons = set(_review_reasons(entry))
    if reasons & SEVERE_REASONS:
        return "red"
    if reasons & MODERATE_REASONS:
        return "orange"
    return "yellow"


def compact_entry(
    entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]
) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its priority data.

    Args:
        entry: The original review entry; it is not modified.
        score_tuple: Result of :func:`priority_score` for this entry; only
            its first element (the numeric score) is stored.

    Returns:
        The copy, with ``priority_score`` and ``priority_bucket`` added.
    """
    compact = dict(entry)
    compact["priority_score"] = score_tuple[0]
    compact["priority_bucket"] = priority_bucket(entry)
    return compact


def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Build the prioritized review payload from the input review file.

    Args:
        args: Namespace providing ``input`` (path of the review file) and
            ``top`` (maximum number of entries to keep; values <= 0 keep all).

    Returns:
        A dict with ``meta`` (summary counters), ``entries`` (ranked, annotated
        entries) and ``practical_entries`` (the kept entries whose form is
        longer than two characters).

    Raises:
        ValueError: If the input document is not a JSON object with an
            ``entries`` key.
        OSError, json.JSONDecodeError: Propagated from reading/parsing the
            input file.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries") or [] if isinstance(entry, dict)]

    # Score each entry once and reuse the tuple for sorting and annotation.
    # list.sort is stable also with reverse=True, so ties keep input order.
    scored = [(priority_score(entry), entry) for entry in entries]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]
    # The filter runs after --top truncation, so this is a subset of the
    # kept entries.
    practical_entries = [
        item for item in compact_entries if len(_as_text(item.get("form"))) > 2
    ]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(
        item["priority_bucket"] for item in practical_entries
    )
    reason_counter = Counter(
        reason for item in compact_entries for reason in _review_reasons(item)
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Run the CLI: build the priority review, write it and print a summary.

    Args:
        argv: Argument list forwarded to :func:`parse_args`; ``None`` uses
            the process command line.
    """
    args = parse_args(argv)
    payload = build_priority_review(args)
    write_json(args.output, payload)
    print(f"Review priority generato: {args.output}")
    print(f"Voci nel priority file: {payload['meta']['entry_count']}")
    print(f"Bucket: {payload['meta']['bucket_counts']}")


if __name__ == "__main__":
    main()