alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
153
build_treccani_rescue_patch.py
Normal file
153
build_treccani_rescue_patch.py
Normal file
@@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
# Default input and output files sit in the same directory as this script, so
# the defaults do not depend on the current working directory.
PRIORITY_INPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
PATCH_OUTPUT_PATH = Path(__file__).parent / "treccani_rescue_patch.json"
|
||||
|
||||
|
||||
def parse_args(argv: "List[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of the rescue-patch builder.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process command line, exactly as before; passing a list
            lets callers drive the parser programmatically.

    Returns:
        Namespace with ``input`` and ``output`` (``Path``), ``limit``
        (``int``) and ``bucket`` (``str``).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
            "manuale/assistita di rescue lessicale."
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=PRIORITY_INPUT_PATH,
        help="File to_be_review_priority.json di partenza.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=PATCH_OUTPUT_PATH,
        help="Patch JSON da generare o aggiornare.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Numero massimo di voci da preparare nel lotto.",
    )
    parser.add_argument(
        "--bucket",
        default="red",
        help="Bucket di priorita da considerare: red, orange, yellow oppure all.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Read and decode a JSON file, returning *default* when it does not exist.

    Args:
        path: File to read.
        default: Value returned unchanged when *path* is missing.

    Returns:
        The decoded JSON value, or *default* if the file is absent.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        OSError: For read failures other than a missing file.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading
        # BOM, which json.loads would otherwise reject; hand-edited files
        # saved by some editors carry one.
        text = path.read_text(encoding="utf-8-sig")
    except (FileNotFoundError, NotADirectoryError):
        # Reading first and handling the failure avoids the check-then-read
        # race of an exists() test while keeping the "missing -> default" rule.
        return default
    return json.loads(text)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as indented, non-ASCII-escaped JSON into *path*.

    The document is rendered to a string first and then written as UTF-8,
    replacing any previous content of the file.
    """
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(rendered)
|
||||
|
||||
|
||||
def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a blank, pending rescue record from one priority entry.

    Identification and priority fields are copied from *entry*, its current
    topics/definition/source are stored under ``current_*`` keys, and the
    ``rescue_*`` fields start empty so they can be filled in later.

    Args:
        entry: One entry of the priority file.

    Returns:
        A new dict with status ``"pending"``.
    """
    # Fields copied verbatim; a missing key yields None.
    record: Dict[str, object] = {
        key: entry.get(key)
        for key in ("form", "lemma", "pos", "priority_bucket", "priority_score")
    }

    # Snapshot of what the lexicon currently holds for this form.
    record["review_reasons"] = entry.get("review_reasons", [])
    record["current_topics"] = entry.get("topics", [])
    record["current_definition"] = entry.get("preferred_definition", "")
    record["current_source"] = entry.get("preferred_source", "")

    # Empty slots for the rescue work.
    record.update(
        {
            "rescue_definition": "",
            "rescue_source": "treccani_rescue",
            "rescue_topics": [],
            "rescue_semantic_tags": [],
            "rescue_notes": "",
            "status": "pending",
        }
    )
    return record
|
||||
|
||||
|
||||
def _normalize_form(value: object) -> str:
    """Return the lookup key of a form: stripped and lower-cased ("" for None)."""
    return "" if value is None else str(value).strip().lower()


def _priority_score(record: Dict[str, object]) -> int:
    """Return the record's priority score as an int, or 0 if missing or unparsable."""
    raw = record.get("priority_score", 0) or 0
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        # Accept decimal strings such as "3.5" that int() alone rejects.
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return 0


def _record_sort_key(record: Dict[str, object]) -> tuple:
    """Order records by workflow status, then descending score, then form."""
    status_rank = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}
    return (
        status_rank.get(str(record.get("status", "pending")), 9),
        -_priority_score(record),
        str(record.get("form", "")),
    )


def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Build the rescue patch, merging the priority batch with the existing patch.

    Entries are read from ``args.input`` (``practical_entries`` if present and
    non-empty, otherwise ``entries``), filtered by ``args.bucket`` and capped
    at ``args.limit`` (at least one). A form already present in the patch at
    ``args.output`` keeps its existing record; other selected forms get a
    fresh record from ``build_record``. Existing records that were not
    selected are kept as well. Forms are compared stripped and
    case-insensitively, and a form appearing several times in the priority
    file is selected only once.

    Args:
        args: Parsed options providing ``input``, ``output``, ``limit`` and
            ``bucket``.

    Returns:
        Dict with a ``meta`` section and the sorted ``entries`` list.

    Raises:
        ValueError: If the priority file does not contain a JSON object.
    """
    payload = load_json(args.input, {"entries": []})
    if not isinstance(payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    existing_patch = load_json(args.output, {"entries": []})
    if not isinstance(existing_patch, dict):
        # A patch file that is not a JSON object is treated as empty.
        existing_patch = {"entries": []}

    # Index existing records with the same normalization used for source
    # forms below, so hand-edited records with stray whitespace still match.
    existing_by_form: Dict[str, Dict[str, object]] = {}
    for record in existing_patch.get("entries", []) or []:
        if not isinstance(record, dict):
            continue
        key = _normalize_form(record.get("form"))
        if key:
            existing_by_form[key] = record

    bucket = str(args.bucket or "red").strip().lower()
    limit = max(1, int(args.limit))
    source_entries = payload.get("practical_entries") or payload.get("entries") or []

    selected: List[tuple] = []
    selected_forms = set()
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = _normalize_form(entry.get("form"))
        # Skip blank forms and repeats: a repeated form must neither produce a
        # second record nor consume a slot of the batch limit.
        if not form or form in selected_forms:
            continue
        selected_forms.add(form)
        selected.append((form, entry))
        if len(selected) >= limit:
            break

    # Records already in the patch win over freshly built ones.
    merged_records = [
        existing_by_form[form] if form in existing_by_form else build_record(entry)
        for form, entry in selected
    ]
    # Keep every existing record that was not part of this batch.
    merged_records.extend(
        record for form, record in existing_by_form.items() if form not in selected_forms
    )
    merged_records.sort(key=_record_sort_key)

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "entry_count": len(merged_records),
        },
        "entries": merged_records,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the patch, save it and print a short report."""
    options = parse_args()
    patch = build_patch(options)
    write_json(options.output, patch)

    entry_total = patch["meta"]["entry_count"]
    print(f"Patch rescue generata: {options.output}")
    print(f"Voci nel lotto: {entry_total}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user