# File-viewer header (was duplicated by extraction): 183 lines, 5.5 KiB, Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
|
|
# Default input and output files; both are resolved in the directory that
# contains this script, independent of the current working directory.
REVIEW_INPUT_PATH = Path(__file__).parent / "to_be_review.json"
PRIORITY_OUTPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
|
|
|
|
# Points added to an entry's priority score for each of its review reasons.
# priority_score() adds 5 for any reason that is not listed here.
REASON_WEIGHTS: Dict[str, int] = {
    # Reasons that priority_bucket() maps to the "red" bucket.
    "no_viable_definition": 100,
    "proper_noun_collision": 90,
    "candidate_mentions_answer": 85,
    # Reasons that priority_bucket() maps to the "orange" bucket.
    "function_word": 80,
    "very_short_word": 75,
    "wiktextract_missing": 55,
    "only_general_topics": 45,
    # Remaining reasons (entries carrying only these end up "yellow").
    "flagged_by_refined_stage": 35,
    "unresolved_sense_topics": 30,
    "babelnet_ambiguous": 20,
}
|
|
|
|
# Points added to an entry's priority score according to its lower-cased
# "preferred_source"; priority_score() adds 0 for sources not listed here.
SOURCE_WEIGHTS: Dict[str, int] = {
    "fallback": 50,
    "babelnet": 18,
    "semantic": 8,
    "wiktextract": 6,
}
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        A namespace with ``input`` (Path), ``output`` (Path) and ``top``
        (int; ``0`` means "keep every entry").
    """
    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )

    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output lists them the same way.
    option_specs = (
        (
            "--input",
            {
                "type": Path,
                "default": REVIEW_INPUT_PATH,
                "help": "File to_be_review.json di partenza.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": PRIORITY_OUTPUT_PATH,
                "help": "File to_be_review_priority.json da generare.",
            },
        ),
        (
            "--top",
            {
                "type": int,
                "default": 0,
                "help": "Numero massimo di voci da tenere nel file priority. 0 = tutte.",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()
|
|
|
|
|
|
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* and return the decoded JSON document.

    The file is decoded with ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark (commonly added by Windows editors) is skipped instead of making the
    JSON parser fail; files without a BOM are decoded exactly as plain UTF-8.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.loads`` produces for the file content. Callers must
        still check that it is a dict, since the top-level JSON value may be
        of another type.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the content is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8-sig"))
|
|
|
|
|
|
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize *payload* as indented JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written as-is (no ``\\uXXXX`` escapes) and the
    output uses a two-space indent. An existing file is overwritten.
    """
    # Serialize first so the target file is only opened once the payload is
    # known to be encodable.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
|
|
|
|
|
# Reasons counted separately as the second sort key of priority_score().
_SEVERE_REASONS = frozenset(
    {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
)


def _text_or_empty(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` (JSON ``null``) to ``""``."""
    return "" if value is None else str(value)


def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank a review entry.

    The first element is a numeric score built from:

    * the weight of every review reason (``REASON_WEIGHTS``, 5 if unknown);
    * the weight of the preferred source (``SOURCE_WEIGHTS``, 0 if unknown);
    * +40 when there is no preferred definition;
    * +20 when no clue definition is filled in, +8 when exactly one is;
    * +3 per review reason, capped at five reasons;
    * -120 for forms of at most two characters, -35 for three characters.

    JSON ``null`` values for ``preferred_source``, ``preferred_definition``,
    ``form`` and individual clue definitions are treated as empty text.
    Passing them through ``str()`` directly would yield the literal
    ``"None"``, which would hide a missing definition and count as a
    four-character form.

    Args:
        entry: One review entry as decoded from JSON.

    Returns:
        ``(score, severe_reason_count, is_fallback_source,
        -len(preferred_definition), form)``.
    """
    reasons = [str(item) for item in entry.get("review_reasons") or []]
    source = _text_or_empty(entry.get("preferred_source")).lower()
    preferred_definition = _text_or_empty(entry.get("preferred_definition"))
    clue_definitions = entry.get("clue_definitions") or {}
    form = _text_or_empty(entry.get("form"))

    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    # Only clue definitions with non-blank text count as available clues.
    clue_count = sum(
        1 for value in clue_definitions.values() if _text_or_empty(value).strip()
    )
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    # Small bonus for entries flagged for several reasons, capped at five.
    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_count = sum(1 for reason in reasons if reason in _SEVERE_REASONS)

    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )
|
|
|
|
|
|
def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify an entry as ``"red"``, ``"orange"`` or ``"yellow"``.

    The buckets are checked from most to least severe and the first one that
    shares at least one reason with the entry's ``review_reasons`` wins.
    Entries with no matching reason (or no reasons at all) are ``"yellow"``.
    """
    raw_reasons = entry.get("review_reasons", []) or []
    labels = {str(reason) for reason in raw_reasons}

    # Ordered from most to least severe; order decides which bucket wins.
    tiers = (
        (
            "red",
            {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"},
        ),
        (
            "orange",
            {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"},
        ),
    )
    for colour, triggers in tiers:
        if not labels.isdisjoint(triggers):
            return colour
    return "yellow"
|
|
|
|
|
|
def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its priority data.

    The copy carries two extra keys: ``priority_score`` (the first element of
    *score_tuple*) and ``priority_bucket`` (from ``priority_bucket(entry)``).
    If *entry* already has those keys their values are replaced. The input
    dict itself is left untouched.
    """
    return {
        **entry,
        "priority_score": score_tuple[0],
        "priority_bucket": priority_bucket(entry),
    }
|
|
|
|
|
|
def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Build the prioritized review payload from the input review file.

    Entries that are not JSON objects are ignored. The remaining ones are
    ranked by ``priority_score`` (highest first), optionally truncated to
    ``args.top`` items, and annotated through ``compact_entry``.

    Args:
        args: Parsed command-line options; ``input`` (Path) and ``top`` (int)
            are read here.

    Returns:
        A dict with three keys: ``meta`` (counts and generation info),
        ``entries`` (all ranked entries) and ``practical_entries`` (the
        ranked entries whose form is longer than two characters).

    Raises:
        ValueError: If the input file is not a JSON object with an
            ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries") or [] if isinstance(entry, dict)]

    # Score every entry exactly once and reuse the result for both sorting and
    # annotation. Sorting on the score alone keeps the sort stable and avoids
    # ever comparing the entry dicts themselves.
    scored = sorted(
        ((priority_score(entry), entry) for entry in entries),
        key=lambda pair: pair[0],
        reverse=True,
    )

    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]

    # A null form is treated like a missing one (no characters), so it is not
    # counted as a practical entry.
    practical_entries = [
        item
        for item in compact_entries
        if item.get("form") is not None and len(str(item["form"])) > 2
    ]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # "review_reasons" may be null in the input; guard with ``or []`` just like
    # the scoring functions do, instead of failing on iteration.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in item.get("review_reasons") or []
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }
|
|
|
|
|
|
def main() -> None:
    """Run the script: parse options, build the priority file, report totals."""
    cli_args = parse_args()
    report = build_priority_review(cli_args)
    write_json(cli_args.output, report)

    print(f"Review priority generato: {cli_args.output}")
    summary = report["meta"]
    print(f"Voci nel priority file: {summary['entry_count']}")
    print(f"Bucket: {summary['bucket_counts']}")
|
|
|
|
|
|
# Run the command-line entry point only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|