# Source: cruciverba_1/build_treccani_rescue_patch.py
# (repository viewer listing: 154 lines, 4.7 KiB, Python)

from __future__ import annotations
import argparse
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List
# Default input and output files sit in the same directory as this script.
PRIORITY_INPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
PATCH_OUTPUT_PATH = Path(__file__).parent / "treccani_rescue_patch.json"
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the rescue-patch builder.

    Recognised options (all optional):
        --input   priority JSON file to read (default: PRIORITY_INPUT_PATH)
        --output  patch JSON file to create or update (default: PATCH_OUTPUT_PATH)
        --limit   maximum number of entries in the batch (default: 100)
        --bucket  priority bucket to select (default: "red")

    Returns:
        The namespace built by ``argparse`` from the process arguments.
    """
    summary = (
        "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
        "manuale/assistita di rescue lessicale."
    )
    # One (flag, add_argument keyword arguments) pair per option, in CLI order.
    option_specs = (
        (
            "--input",
            {
                "type": Path,
                "default": PRIORITY_INPUT_PATH,
                "help": "File to_be_review_priority.json di partenza.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": PATCH_OUTPUT_PATH,
                "help": "Patch JSON da generare o aggiornare.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 100,
                "help": "Numero massimo di voci da preparare nel lotto.",
            },
        ),
        (
            "--bucket",
            {
                "default": "red",
                "help": "Bucket di priorita da considerare: red, orange, yellow oppure all.",
            },
        ),
    )

    cli = argparse.ArgumentParser(description=summary)
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
def load_json(path: Path, default: object) -> object:
    """Read and decode a UTF-8 JSON file, falling back to ``default``.

    Args:
        path: File to read.
        default: Value returned unchanged when ``path`` does not exist.

    Returns:
        The decoded JSON document, or ``default`` if the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
def write_json(path: Path, payload: object) -> None:
    """Serialize ``payload`` as JSON and write it to ``path``.

    The output is indented by two spaces, keeps non-ASCII characters
    unescaped, and is encoded as UTF-8. An existing file is overwritten.

    Args:
        path: Destination file.
        payload: JSON-serializable object to store.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a fresh, still-to-be-filled rescue record for a priority entry.

    Identification and priority fields are copied from ``entry`` (``None``
    when absent), the current topics/definition/source are carried over under
    ``current_*`` keys, and the ``rescue_*`` fields start empty with status
    ``"pending"``.

    Args:
        entry: One entry of the priority file.

    Returns:
        A new dict ready to be stored in the patch file.
    """
    record: Dict[str, object] = {}

    # Fields copied under the same key.
    for key in ("form", "lemma", "pos", "priority_bucket", "priority_score"):
        record[key] = entry.get(key)

    # Snapshot of what the lexicon currently holds for this form.
    record["review_reasons"] = entry.get("review_reasons", [])
    record["current_topics"] = entry.get("topics", [])
    record["current_definition"] = entry.get("preferred_definition", "")
    record["current_source"] = entry.get("preferred_source", "")

    # Blank slots to be completed during the rescue pass.
    record.update(
        rescue_definition="",
        rescue_source="treccani_rescue",
        rescue_topics=[],
        rescue_semantic_tags=[],
        rescue_notes="",
        status="pending",
    )
    return record
# Workflow order used when sorting records; unknown statuses sort last.
_STATUS_ORDER = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}


def _normalize_form(value: object) -> str:
    """Return the lookup key of a form: stripped and lower-cased ("" if missing)."""
    return str(value or "").strip().lower()


def _record_sort_key(item: Dict[str, object]) -> tuple:
    """Order records by workflow status, then descending score, then form."""
    try:
        score = int(item.get("priority_score", 0) or 0)
    except (TypeError, ValueError, OverflowError):
        # A hand-edited patch may carry a non-numeric score; rank it as 0
        # rather than aborting the whole run during sorting.
        score = 0
    return (
        _STATUS_ORDER.get(str(item.get("status", "pending")), 9),
        -score,
        str(item.get("form", "")),
    )


def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Build the rescue patch document for the requested batch.

    Entries are read from ``args.input`` (key ``practical_entries``, falling
    back to ``entries``), filtered by ``args.bucket`` (``"all"`` disables the
    filter) and capped at ``args.limit`` distinct forms (at least one).
    Records already present in ``args.output`` are reused as they are, so work
    done on them is kept; existing records outside the current batch are
    retained as well.

    Forms are matched case-insensitively and ignoring surrounding whitespace,
    on both the priority file and the existing patch. Each form appears at
    most once in the result, and entries whose form is missing or blank are
    skipped.

    Args:
        args: Parsed options providing ``input``, ``output``, ``limit`` and
            ``bucket``.

    Returns:
        A dict with a ``meta`` section and the sorted ``entries`` list.

    Raises:
        ValueError: If the priority file does not contain a JSON object.
    """
    payload = load_json(args.input, {"entries": []})
    if not isinstance(payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    # An unreadable shape in the existing patch is treated as "no patch yet".
    existing_patch = load_json(args.output, {"entries": []})
    if not isinstance(existing_patch, dict):
        existing_patch = {"entries": []}

    # Index existing records with the same normalization used for lookups
    # below, so " Casa " in the patch matches "casa" in the priority file.
    existing_by_form: Dict[str, Dict[str, object]] = {}
    for record in existing_patch.get("entries", []) or []:
        if not isinstance(record, dict):
            continue
        key = _normalize_form(record.get("form"))
        if key:
            existing_by_form[key] = record

    bucket = str(args.bucket or "red").strip().lower()
    # The batch always holds at least one entry, even for a limit <= 0.
    limit = max(1, int(args.limit))
    source_entries = payload.get("practical_entries") or payload.get("entries") or []

    merged_records: List[Dict[str, object]] = []
    seen = set()
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = _normalize_form(entry.get("form"))
        # Skip blank forms and repeats so a form never yields two records
        # and duplicates do not consume batch slots.
        if not form or form in seen:
            continue
        seen.add(form)
        if form in existing_by_form:
            merged_records.append(existing_by_form[form])
        else:
            merged_records.append(build_record(entry))
        if len(seen) >= limit:
            break

    # Keep previously patched forms that are not part of this batch.
    for form, record in existing_by_form.items():
        if form not in seen:
            merged_records.append(record)

    merged_records.sort(key=_record_sort_key)

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            # Records the requested limit, not the clamped one.
            "batch_limit": int(args.limit),
            "entry_count": len(merged_records),
        },
        "entries": merged_records,
    }
def main() -> None:
    """Run the command-line workflow: parse options, build the patch, save it.

    Prints the output path and the number of entries in the written patch.
    """
    options = parse_args()
    patch = build_patch(options)
    destination = options.output
    write_json(destination, patch)
    print(f"Patch rescue generata: {destination}")
    meta = patch["meta"]
    print(f"Voci nel lotto: {meta['entry_count']}")


if __name__ == "__main__":
    main()