"""Prepare a prioritised batch of entries for a lexical rescue patch.

The script reads a priority JSON file, selects entries from the requested
priority bucket (up to a limit), merges them with any records already present
in the output patch so that previously edited records are kept, and writes the
resulting patch back as JSON.
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List

PRIORITY_INPUT_PATH = Path(__file__).with_name("to_be_review_priority.json")
PATCH_OUTPUT_PATH = Path(__file__).with_name("treccani_rescue_patch.json")

# Sort rank of each known status; any other status sorts after all of these.
_STATUS_RANK = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}
_UNKNOWN_STATUS_RANK = 9


def parse_args() -> argparse.Namespace:
    """Parse the command-line options (--input, --output, --limit, --bucket)."""
    parser = argparse.ArgumentParser(
        description=(
            "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
            "manuale/assistita di rescue lessicale."
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=PRIORITY_INPUT_PATH,
        help="File to_be_review_priority.json di partenza.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=PATCH_OUTPUT_PATH,
        help="Patch JSON da generare o aggiornare.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Numero massimo di voci da preparare nel lotto.",
    )
    parser.add_argument(
        "--bucket",
        default="red",
        help="Bucket di priorita da considerare: red, orange, yellow oppure all.",
    )
    return parser.parse_args()


def load_json(path: Path, default: object) -> object:
    """Return the parsed JSON content of *path*, or *default* if it is missing.

    Only a missing file falls back to *default*; malformed JSON still raises
    ``json.JSONDecodeError`` so a corrupt file is never silently replaced.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return default
    return json.loads(text)


def write_json(path: Path, payload: object) -> None:
    """Serialise *payload* to *path* as indented UTF-8 JSON (non-ASCII kept)."""
    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")


def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a fresh ``pending`` patch record from a priority-file entry.

    The ``current_*`` fields are copied from the entry; the ``rescue_*``
    fields start empty.
    """
    return {
        "form": entry.get("form"),
        "lemma": entry.get("lemma"),
        "pos": entry.get("pos"),
        "priority_bucket": entry.get("priority_bucket"),
        "priority_score": entry.get("priority_score"),
        "review_reasons": entry.get("review_reasons", []),
        "current_topics": entry.get("topics", []),
        "current_definition": entry.get("preferred_definition", ""),
        "current_source": entry.get("preferred_source", ""),
        "rescue_definition": "",
        "rescue_source": "treccani_rescue",
        "rescue_topics": [],
        "rescue_semantic_tags": [],
        "rescue_notes": "",
        "status": "pending",
    }


def _normalize_form(value: object) -> str:
    """Return the lookup key for a form: stripped and lower-cased.

    ``None`` maps to the empty string so a JSON ``null`` form is treated as
    missing rather than as the literal text ``"none"``.
    """
    if value is None:
        return ""
    return str(value).strip().lower()


def _as_list(value: object) -> List[object]:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _priority_score(item: Dict[str, object]) -> float:
    """Return the numeric priority score of *item*, or 0.0 if unusable."""
    try:
        score = float(item.get("priority_score") or 0)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # NaN compares false with everything and would make the sort unstable.
    return 0.0 if score != score else score


def _sort_key(item: Dict[str, object]) -> tuple:
    """Order records by status rank, then descending score, then form."""
    status_rank = _STATUS_RANK.get(str(item.get("status", "pending")), _UNKNOWN_STATUS_RANK)
    return (status_rank, -_priority_score(item), str(item.get("form", "")))


def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Build the patch payload for the batch described by *args*.

    Reads ``args.input`` (priority file) and ``args.output`` (existing patch,
    if any). Entries matching ``args.bucket`` are selected in file order until
    ``args.limit`` distinct forms are collected (at least one). A selected
    form that already has a record in the existing patch reuses that record;
    otherwise a new pending record is built. Existing records that were not
    selected are carried over unchanged.

    Returns:
        A dict with a ``meta`` section and the sorted ``entries`` list.

    Raises:
        ValueError: if the priority file does not contain a JSON object.
    """
    payload = load_json(args.input, {"entries": []})
    if not isinstance(payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    existing_patch = load_json(args.output, {"entries": []})
    if not isinstance(existing_patch, dict):
        existing_patch = {"entries": []}

    # Key existing records with the same normalisation used for source
    # entries; otherwise a form with stray whitespace never matches its own
    # record and gets duplicated on every run.
    existing_by_form = {
        _normalize_form(entry.get("form")): entry
        for entry in _as_list(existing_patch.get("entries"))
        if isinstance(entry, dict) and entry.get("form")
    }

    bucket = str(args.bucket or "red").strip().lower()
    limit = max(1, int(args.limit))
    source_entries = _as_list(payload.get("practical_entries")) or _as_list(payload.get("entries"))

    merged_records: List[Dict[str, object]] = []
    seen = set()
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = _normalize_form(entry.get("form"))
        # The patch is keyed by form, so a repeated form must not yield a
        # second record (nor consume a slot of the limit).
        if not form or form in seen:
            continue
        seen.add(form)
        if form in existing_by_form:
            merged_records.append(existing_by_form[form])
        else:
            merged_records.append(build_record(entry))
        if len(seen) >= limit:
            break

    # Keep records from earlier runs that are not part of this batch.
    merged_records.extend(
        entry for form, entry in existing_by_form.items() if form not in seen
    )
    merged_records.sort(key=_sort_key)

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "entry_count": len(merged_records),
        },
        "entries": merged_records,
    }


def main() -> None:
    """Run the CLI: build the patch, write it, and print a short summary."""
    args = parse_args()
    payload = build_patch(args)
    write_json(args.output, payload)
    print(f"Patch rescue generata: {args.output}")
    print(f"Voci nel lotto: {payload['meta']['entry_count']}")


if __name__ == "__main__":
    main()