154 lines
4.7 KiB
Python
154 lines
4.7 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
|
|
# Default input/output files live next to this script, so the tool behaves the
# same regardless of the current working directory.
PRIORITY_INPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
PATCH_OUTPUT_PATH = Path(__file__).parent / "treccani_rescue_patch.json"
|
|
|
|
|
|
def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the rescue-batch extractor.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            ``argparse`` read ``sys.argv[1:]``, which is what a plain
            ``parse_args()`` call has always done; passing a list lets the
            parser be driven programmatically.

    Returns:
        Namespace with ``input`` and ``output`` (``Path``), ``limit`` (``int``)
        and ``bucket`` (``str``).
    """
    parser = argparse.ArgumentParser(
        description=(
            "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
            "manuale/assistita di rescue lessicale."
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=PRIORITY_INPUT_PATH,
        help="File to_be_review_priority.json di partenza.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=PATCH_OUTPUT_PATH,
        help="Patch JSON da generare o aggiornare.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Numero massimo di voci da preparare nel lotto.",
    )
    # The bucket is deliberately not restricted with ``choices``: the value is
    # normalised (strip + lower-case) by the consumer, so e.g. "RED" is valid.
    parser.add_argument(
        "--bucket",
        default="red",
        help="Bucket di priorita da considerare: red, orange, yellow oppure all.",
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
def load_json(path: Path, default: object) -> object:
    """Read and decode a UTF-8 JSON file, or return *default* if it is absent.

    Args:
        path: File to read.
        default: Value returned as-is (same object, not a copy) when *path*
            does not exist.

    Returns:
        The decoded JSON document, or *default* when there is no such file.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
        OSError: For read failures other than "file not found".
    """
    # Attempt the read directly instead of checking ``exists()`` first: the
    # file could disappear between the check and the read.
    try:
        raw_text = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # NotADirectoryError covers a path whose parent component is a regular
        # file, which ``Path.exists()`` also reports as "missing".
        return default
    return json.loads(raw_text)
|
|
|
|
|
|
def write_json(path: Path, payload: object) -> None:
    """Serialise *payload* as indented UTF-8 JSON and store it at *path*.

    The text is first written to a sibling ``<name>.tmp`` file and then moved
    over *path*, so a failure or interruption while writing does not leave a
    truncated file where a previous patch (possibly holding manual edits) was.

    Args:
        path: Destination file; replaced if it already exists.
        payload: JSON-serialisable object.

    Raises:
        TypeError: If *payload* contains values ``json`` cannot serialise.
        OSError: If the temporary file cannot be written or moved into place.
    """
    # Serialise before touching the filesystem so a bad payload changes nothing.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    staging_path = path.with_name(f"{path.name}.tmp")
    try:
        staging_path.write_text(serialized, encoding="utf-8")
        staging_path.replace(path)
    except OSError:
        # Best-effort cleanup of the half-written staging file; the original
        # error is the one worth reporting, so a failed unlink is ignored.
        try:
            staging_path.unlink()
        except OSError:
            pass
        raise
|
|
|
|
|
|
def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a fresh, still-to-be-filled rescue record for a priority entry.

    The first group of fields is copied from *entry* (some under a
    ``current_`` prefixed name); the ``rescue_*`` fields start out empty and
    ``status`` starts as ``"pending"``.

    Args:
        entry: One entry of the priority file.

    Returns:
        A new dict with the record fields in a fixed key order.
    """
    # (record key, key read from the entry, fallback when the key is absent)
    copied_fields = (
        ("form", "form", None),
        ("lemma", "lemma", None),
        ("pos", "pos", None),
        ("priority_bucket", "priority_bucket", None),
        ("priority_score", "priority_score", None),
        ("review_reasons", "review_reasons", []),
        ("current_topics", "topics", []),
        ("current_definition", "preferred_definition", ""),
        ("current_source", "preferred_source", ""),
    )
    record: Dict[str, object] = {
        target_key: entry.get(source_key, fallback)
        for target_key, source_key, fallback in copied_fields
    }
    # Blank slots to be filled in during the manual/assisted rescue pass.
    record.update(
        {
            "rescue_definition": "",
            "rescue_source": "treccani_rescue",
            "rescue_topics": [],
            "rescue_semantic_tags": [],
            "rescue_notes": "",
            "status": "pending",
        }
    )
    return record
|
|
|
|
|
|
def _normalize_form(raw: object) -> str:
    """Return the lookup key for a ``form`` value: stripped and lower-cased, "" for ``None``."""
    if raw is None:
        # ``str(None)`` would yield the bogus form "none".
        return ""
    return str(raw).strip().lower()


def _priority_score(item: Dict[str, object]) -> float:
    """Return the numeric ``priority_score`` of a record, or 0.0 when missing or not a number."""
    try:
        score = float(item.get("priority_score") or 0)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares unequal to itself and would make the sort order inconsistent.
    return score if score == score else 0.0


def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Build the rescue patch payload from the priority file and any existing patch.

    Entries are read from ``practical_entries`` (or, failing that, ``entries``)
    of the priority file, filtered by bucket, and taken in file order until the
    batch limit is reached. Each distinct form is taken once. For a form that
    already has a record in the existing patch, that record is reused untouched
    so work already done on it is kept; otherwise a blank record is created.
    Records of the existing patch that are not part of this batch are carried
    over as well, so ``entry_count`` can exceed the limit.

    Args:
        args: Parsed options providing ``input``, ``output``, ``limit`` and
            ``bucket``. A limit below 1 is treated as 1.

    Returns:
        A dict with a ``meta`` section and the merged ``entries`` list, sorted
        by status (pending first), then descending priority score, then form.

    Raises:
        ValueError: If the priority file does not contain a JSON object.
    """
    payload = load_json(args.input, {"entries": []})
    if not isinstance(payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    # A malformed existing patch is treated as empty rather than as an error.
    existing_patch = load_json(args.output, {"entries": []})
    if not isinstance(existing_patch, dict):
        existing_patch = {"entries": []}

    # Keyed with the same normalisation used for priority entries below, so
    # stray whitespace or case differences cannot make the lookup miss.
    existing_by_form = {
        _normalize_form(entry.get("form")): entry
        for entry in existing_patch.get("entries", []) or []
        if isinstance(entry, dict) and entry.get("form")
    }

    bucket = str(args.bucket or "red").strip().lower()
    batch_size = max(1, int(args.limit))
    source_entries = payload.get("practical_entries") or payload.get("entries") or []

    merged_records: List[Dict[str, object]] = []
    seen = set()
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = _normalize_form(entry.get("form"))
        # Skip entries without a form and repeated forms: a repeat would emit
        # a duplicate record and use up a slot of the batch.
        if not form or form in seen:
            continue
        seen.add(form)
        if form in existing_by_form:
            merged_records.append(existing_by_form[form])
        else:
            merged_records.append(build_record(entry))
        if len(seen) >= batch_size:
            break

    # Never drop records that are already in the patch but not in this batch.
    merged_records.extend(
        entry for form, entry in existing_by_form.items() if form not in seen
    )

    status_rank = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}
    merged_records.sort(
        key=lambda item: (
            status_rank.get(str(item.get("status", "pending")), 9),
            -_priority_score(item),
            str(item.get("form", "")),
        )
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "entry_count": len(merged_records),
        },
        "entries": merged_records,
    }
|
|
|
|
|
|
def main() -> None:
    """Run the tool: parse options, build the patch, save it and print a summary."""
    args = parse_args()
    payload = build_patch(args)
    write_json(args.output, payload)
    # Two-line report: where the patch was written and how many records it holds.
    print(
        f"Patch rescue generata: {args.output}",
        f"Voci nel lotto: {payload['meta']['entry_count']}",
        sep="\n",
    )
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|