alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
182
build_review_priority.py
Normal file
182
build_review_priority.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
# Default input/output files, located in the same directory as this script
# (not relative to the current working directory).
REVIEW_INPUT_PATH = Path(__file__).parent / "to_be_review.json"
PRIORITY_OUTPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
|
||||
|
||||
# Points added to an entry's priority score for each review reason it carries.
# Reasons that are not listed here are scored with a default of 5 by
# priority_score().
REASON_WEIGHTS: Dict[str, int] = {
    # Reasons that priority_bucket() maps to the "red" bucket.
    "no_viable_definition": 100,
    "proper_noun_collision": 90,
    "candidate_mentions_answer": 85,
    # Reasons that priority_bucket() maps to the "orange" bucket.
    "function_word": 80,
    "very_short_word": 75,
    "wiktextract_missing": 55,
    "only_general_topics": 45,
    # Remaining reasons (bucket "yellow" when they appear alone).
    "flagged_by_refined_stage": 35,
    "unresolved_sense_topics": 30,
    "babelnet_ambiguous": 20,
}
|
||||
|
||||
# Points added to an entry's priority score according to its (lower-cased)
# "preferred_source"; sources that are not listed here add nothing.
SOURCE_WEIGHTS: Dict[str, int] = {
    "fallback": 50,
    "babelnet": 18,
    "semantic": 8,
    "wiktextract": 6,
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the prioritisation script.

    Returns:
        A namespace with ``input`` and ``output`` (both ``Path``, defaulting
        to ``REVIEW_INPUT_PATH`` and ``PRIORITY_OUTPUT_PATH``) and ``top``
        (``int``, default ``0``, which the help text documents as "keep all
        entries").
    """
    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )

    # (flag, converter, default, help text) for every supported option.
    option_table = (
        ("--input", Path, REVIEW_INPUT_PATH, "File to_be_review.json di partenza."),
        ("--output", Path, PRIORITY_OUTPUT_PATH, "File to_be_review_priority.json da generare."),
        ("--top", int, 0, "Numero massimo di voci da tenere nel file priority. 0 = tutte."),
    )
    for flag, converter, fallback, help_text in option_table:
        cli.add_argument(flag, type=converter, default=fallback, help=help_text)

    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json`` decodes from the file. The annotation expects a
        JSON object, but the type is not checked here.
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialise *payload* as JSON and write it to *path* in UTF-8.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.

    Args:
        path: Destination file; it is opened in write mode, so existing
            content is replaced.
        payload: JSON-serialisable data to store.
    """
    # Serialise before opening the file so that an encoding failure does not
    # leave a truncated file behind.
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def _text_or_empty(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` (JSON ``null``) to ``""``."""
    return "" if value is None else str(value)


def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank a review entry.

    The first element is an additive score made of:

    * the ``REASON_WEIGHTS`` value of every review reason (5 for reasons that
      are not in the table);
    * the ``SOURCE_WEIGHTS`` value of the lower-cased preferred source (0 for
      unknown sources);
    * +40 when the preferred definition is empty;
    * +20 when no clue definition is non-blank, +8 when exactly one is;
    * +3 per review reason, counting at most five reasons;
    * -120 for forms of at most two characters, -35 for three-character forms.

    JSON ``null`` values are handled like missing keys: a null definition,
    source, form or clue value counts as empty instead of being turned into
    the literal string ``"None"``.

    Args:
        entry: One review entry, as decoded from the review JSON file.

    Returns:
        ``(score, severe_reason_count, is_fallback_source,
        -len(preferred_definition), form)``. Tuples compare element by
        element, so the fields after the score only break ties.
    """
    reasons = [str(item) for item in (entry.get("review_reasons") or [])]
    source = _text_or_empty(entry.get("preferred_source")).lower()
    preferred_definition = _text_or_empty(entry.get("preferred_definition"))
    clue_definitions = entry.get("clue_definitions") or {}
    form = _text_or_empty(entry.get("form"))

    score = 0
    for reason in reasons:
        score += REASON_WEIGHTS.get(reason, 5)

    # An empty source can never match a weight, so there is nothing to look up.
    if source:
        score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    # Only clue values that are present and non-blank count as usable clues.
    clue_count = sum(
        1
        for value in clue_definitions.values()
        if value is not None and str(value).strip()
    )
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    # Small bonus for entries flagged for several reasons, capped at five.
    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_reasons = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    severe_count = sum(1 for reason in reasons if reason in severe_reasons)

    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )
|
||||
|
||||
|
||||
def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify a review entry into a colour bucket based on its reasons.

    Args:
        entry: One review entry; its ``review_reasons`` may be missing or
            empty.

    Returns:
        ``"red"`` if any of the most severe reasons is present, otherwise
        ``"orange"`` if any of the intermediate reasons is present, otherwise
        ``"yellow"``.
    """
    flagged = {str(reason) for reason in (entry.get("review_reasons", []) or [])}

    # Checked in order: the first tier sharing a reason with the entry wins.
    tiers = (
        ("red", {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}),
        ("orange", {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}),
    )
    for colour, triggers in tiers:
        if not flagged.isdisjoint(triggers):
            return colour
    return "yellow"
|
||||
|
||||
|
||||
def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its priority data.

    Args:
        entry: The review entry to copy; it is not modified.
        score_tuple: Sort key of the entry; only its first element (the
            numeric score) is stored.

    Returns:
        A new dict with every key of *entry* plus ``priority_score`` and
        ``priority_bucket``.
    """
    numeric_score = score_tuple[0]
    return {
        **entry,
        "priority_score": numeric_score,
        "priority_bucket": priority_bucket(entry),
    }
|
||||
|
||||
|
||||
def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Build the prioritised review document from the input review file.

    Entries that are not JSON objects are ignored. The remaining entries are
    sorted by ``priority_score`` in descending order, optionally truncated to
    the first ``args.top`` items, and annotated through ``compact_entry``.

    Args:
        args: Parsed options; ``args.input`` is the review file to read and
            ``args.top`` the maximum number of entries to keep (values
            ``<= 0`` keep everything).

    Returns:
        A dict with ``meta`` (counts and generation metadata), ``entries``
        (all ranked entries) and ``practical_entries`` (ranked entries whose
        form is longer than two characters).

    Raises:
        ValueError: If the input document is not a JSON object or has no
            ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in (payload.get("entries") or []) if isinstance(entry, dict)]

    # Score every entry exactly once: the tuple is both the sort key and the
    # source of the stored priority_score. list.sort is stable, so ties keep
    # their input order exactly as sorted(..., reverse=True) would.
    scored = [(priority_score(entry), entry) for entry in entries]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]

    # "Practical" entries exclude forms of at most two characters; a null or
    # missing form counts as empty.
    practical_entries = []
    for item in compact_entries:
        form = item.get("form")
        if form is not None and len(str(form)) > 2:
            practical_entries.append(item)

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # `or []` mirrors the scoring functions: a null review_reasons value must
    # not abort the report after the entry has already been ranked.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in (item.get("review_reasons") or [])
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            # Local time with its UTC offset, truncated to whole seconds.
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the script: parse options, build the report, write it, print a summary."""
    options = parse_args()
    report = build_priority_review(options)
    write_json(options.output, report)

    meta = report["meta"]
    print(f"Review priority generato: {options.output}")
    print(f"Voci nel priority file: {meta['entry_count']}")
    print(f"Bucket: {meta['bucket_counts']}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user