alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
611
curate_lexicon_alpha.py
Normal file
611
curate_lexicon_alpha.py
Normal file
@@ -0,0 +1,611 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default output files, placed next to this script.
CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")

# Difficulty levels for which a clue definition is selected, in iteration order.
DIFFICULTIES = ("easy", "medium", "hard", "expert")

# Literal text fixes (typo / truncation -> corrected text) consumed by
# normalize_text. Leading and trailing spaces in the keys are significant.
# NOTE(review): "quantit" is a prefix of its own replacement "quantità", so a
# consumer doing plain substring replacement turns an already-correct
# "quantità" into "quantitàà" — confirm the consumer guards against that.
TEXT_REPLACEMENTS = {
    " ngrandimento": " ingrandimento",
    "superificie": "superficie",
    "quantitaaa": "quantità",
    "quantitaaaa": "quantità",
    "quantit": "quantità",
    "sanit_militare": "sanità_militare",
    " unaparola ": " una parola ",
    "questa parola, ": "",
    "questa parola; ": "",
}

# Regexes matched against accent-folded, lower-cased definitions; a hit on a
# common (lower-case, untagged) word suggests the definition actually
# describes a proper noun.
SUSPICIOUS_PROPER_PATTERNS = (
    r"\bepisodio\b",
    r"\bfilm\b",
    r"\bserie tv\b",
    r"\bfamiglia\b",
    r"\bcomune italiano\b",
    r"\bfrazione del comune\b",
    r"\bcitta metropolitana\b",
    r"\bpersonaggio\b",
    r"\balbum\b",
    r"\bcognome\b",
    r"\bnome proprio\b",
)

# Topic name -> substrings whose presence in an accent-folded definition
# counts as evidence that the definition belongs to that topic.
DOMAIN_HINTS = {
    "religion": (
        "monastero",
        "abbazia",
        "sacerdot",
        "prete",
        "vescovo",
        "clero",
        "religios",
    ),
    "transport": (
        "veicolo",
        "motore",
        "aereo",
        "treno",
        "nave",
        "trasport",
        "rimorch",
        "reattor",
    ),
    "health": (
        "malat",
        "ferit",
        "ospedal",
        "medic",
        "sanitar",
        "cura",
        "paziente",
    ),
    "nature": (
        "animale",
        "pianta",
        "mare",
        "bosco",
        "albero",
        "fiore",
        "montagna",
        "acque",
        "salate",
    ),
    "geography": (
        "comune",
        "paese",
        "regione",
        "provincia",
        "isola",
        "citta",
        "territorio",
    ),
    "sea": (
        "acque",
        "salate",
        "superficie terrestre",
        "oceano",
    ),
}

# Regexes (on accent-folded, lower-cased text) marking figurative or abstract
# definitions.
ABSTRACT_PATTERNS = (
    r"\bgrande quantita\b",
    r"\bfigurato\b",
    r"\bsenso figurato\b",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the curation script.

    Returns:
        Namespace exposing ``input``, ``output`` and ``review_output`` as
        ``Path`` objects and ``max_review`` as an int (default 0).
    """
    cli = argparse.ArgumentParser(
        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
    )

    # The three path options share the same shape, so register them from a table.
    path_options = (
        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--output", CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
        (
            "--review-output",
            TO_BE_REVIEW_OUTPUT_PATH,
            "File JSON con le voci che richiedono revisione umana.",
        ),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_path, help=help_text)

    cli.add_argument(
        "--max-review",
        type=int,
        default=0,
        help="Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialise *payload* as 2-space-indented JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    Serialisation completes before the file is opened for writing.
    """
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items with case-insensitive duplicates removed.

    Each item is converted with ``str`` and stripped. The first spelling seen
    for a given lower-cased key is kept, and first-occurrence order is
    preserved.
    """
    first_by_key: Dict[str, str] = {}
    for raw in items:
        stripped = str(raw).strip()
        if stripped:
            # setdefault keeps the earliest spelling; dicts preserve insertion order.
            first_by_key.setdefault(stripped.lower(), stripped)
    return list(first_by_key.values())
|
||||
|
||||
|
||||
def ascii_fold(text: str) -> str:
    """Return *text* with diacritical marks removed from its letters.

    The previous implementation only mapped ten lower-case accented vowels,
    so upper-case accents (``"À"``, ``"È"``) and any other diacritic survived
    folding. Callers lower-case *after* folding, which meant ``"CITTÀ"``
    became ``"città"`` and missed the accent-free patterns. This version
    strips every combining mark, regardless of case.

    Args:
        text: Value to fold; it is converted with ``str`` first.

    Returns:
        The folded string. Characters without a canonical decomposition are
        returned unchanged.
    """
    # Function-scope import keeps this block independent of the file header.
    import unicodedata

    # NFD splits "à" into "a" + COMBINING GRAVE ACCENT; dropping the combining
    # characters leaves the bare base letters.
    decomposed = unicodedata.normalize("NFD", str(text))
    base_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose whatever is left so unrelated scripts are not left decomposed.
    return unicodedata.normalize("NFC", base_only)
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Clean one definition fragment into a capitalised sentence ending in ".".

    Steps: apply ``TEXT_REPLACEMENTS``, collapse whitespace, normalise the
    spacing around ``;`` and ``,``, strip surrounding spaces and ``.;:-``
    characters, upper-case a lower-case first letter and append a full stop.

    Two defects of the earlier version are fixed:

    * Input consisting only of punctuation (e.g. ``"..."``) returned ``"."``.
      It now returns ``""`` so callers that test for an empty result can skip
      it.
    * Replacements were applied one after another with ``str.replace``. A key
      could then match inside an already-correct word (``"quantit"`` inside
      ``"quantità"`` produced ``"quantitàà"``) or inside the output of an
      earlier replacement. Replacements now run in a single pass, longest key
      first, and a key that starts or ends with a word character must start or
      end at a word boundary.

    Args:
        text: Raw fragment; ``None`` and other falsy values are treated as empty.

    Returns:
        The normalised sentence, or ``""`` when nothing meaningful remains.
    """
    value = str(text or "").strip()
    if not value:
        return ""

    def is_word_char(ch: str) -> bool:
        return ch.isalnum() or ch == "_"

    alternatives = []
    for old in sorted(TEXT_REPLACEMENTS, key=len, reverse=True):
        if not old:
            continue  # an empty key would match everywhere
        pattern = re.escape(old)
        # Guard word-like edges so a key cannot match inside a longer word.
        if is_word_char(old[0]):
            pattern = r"(?<!\w)" + pattern
        if is_word_char(old[-1]):
            pattern = pattern + r"(?!\w)"
        alternatives.append(pattern)
    if alternatives:
        # The lookarounds are zero-width, so group(0) is exactly the matched key.
        value = re.sub(
            "|".join(alternatives),
            lambda match: TEXT_REPLACEMENTS[match.group(0)],
            value,
        )

    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        # Only punctuation or whitespace was left: report "no text".
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."
|
||||
|
||||
|
||||
def split_definition_text(text: str) -> List[str]:
    """Split a raw definition into normalised sentence-like pieces.

    The text is cut at ``;`` followed by whitespace, and at ``.`` followed by
    whitespace and a letter. The lookahead lists lower-case letters, but the
    pattern is compiled with ``re.IGNORECASE``, so upper-case letters qualify
    too. Each piece goes through ``normalize_text``; empty results are dropped.

    Args:
        text: Raw definition; falsy values are treated as empty.

    Returns:
        The cleaned pieces in their original order (possibly empty).
    """
    raw = str(text or "").strip()
    if not raw:
        return []
    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", raw, flags=re.IGNORECASE)
    return [cleaned for cleaned in map(normalize_text, fragments) if cleaned]
|
||||
|
||||
|
||||
def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* looks like a common word rather than a proper noun.

    An entry qualifies when its ``form`` is non-empty, starts with a
    lower-case character, and the entry has no (truthy) ``name_tags``.
    """
    form = str(entry.get("form", ""))
    if not form or not form[0].islower():
        return False
    return not entry.get("name_tags")
|
||||
|
||||
|
||||
def definition_mentions_answer(text: str, answer: str) -> bool:
    """Report whether the definition *text* contains the *answer* itself.

    Both strings are accent-folded and lower-cased, then *answer* is searched
    as a plain substring of *text*, so it also matches inside longer words.

    An empty or whitespace-only answer previously matched every text (an
    empty pattern matches anywhere). It now returns ``False``, so entries with
    a missing form are not reported as giving away their answer.

    Args:
        text: Candidate definition.
        answer: The word being clued.

    Returns:
        ``True`` when the folded answer occurs in the folded text.
    """
    needle = ascii_fold(answer).lower()
    if not needle.strip():
        return False
    haystack = ascii_fold(text).lower()
    # A literal substring test is equivalent to searching the escaped pattern.
    return needle in haystack
|
||||
|
||||
|
||||
def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Detect a proper-noun style definition attached to a common word.

    Returns ``False`` for entries that ``entry_is_common_word`` rejects.
    Otherwise the accent-folded, lower-cased *text* is tested against every
    regex in ``SUSPICIOUS_PROPER_PATTERNS``; any hit returns ``True``.
    """
    if entry_is_common_word(entry):
        folded = ascii_fold(text).lower()
        for pattern in SUSPICIOUS_PROPER_PATTERNS:
            if re.search(pattern, folded):
                return True
    return False
|
||||
|
||||
|
||||
def likely_abstract_detour(text: str) -> bool:
    """Return ``True`` when the folded, lower-cased *text* matches any ``ABSTRACT_PATTERNS`` regex."""
    folded = ascii_fold(text).lower()
    for pattern in ABSTRACT_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
|
||||
|
||||
|
||||
def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect the lower-cased topic labels found in the enrichment sections.

    Reads ``entry["semantic"]["semantic_topics"]`` and then
    ``entry["wiktextract"]["topic_hints"]``. A section that is not a dict is
    ignored. The combined list is passed through ``dedupe``.
    """
    gathered: List[str] = []
    lookups = (("semantic", "semantic_topics"), ("wiktextract", "topic_hints"))
    for section_key, list_key in lookups:
        section = entry.get(section_key, {})
        if not isinstance(section, dict):
            continue
        gathered += [str(label).lower() for label in section.get(list_key, []) or []]
    return dedupe(gathered)
|
||||
|
||||
|
||||
def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the truthy items of ``entry["topics"]`` as lower-cased strings.

    A missing or falsy ``topics`` value yields an empty list.
    """
    lowered: List[str] = []
    for topic in entry.get("topics", []) or []:
        if topic:
            lowered.append(str(topic).lower())
    return lowered
|
||||
|
||||
|
||||
def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score how well a definition matches the entry's topics.

    For every distinct topic of the entry (lexical and semantic), each
    ``DOMAIN_HINTS`` substring found in the accent-folded, lower-cased *text*
    contributes 16 points. Topics without hints contribute nothing.
    """
    folded = ascii_fold(text).lower()
    topics = set(lexical_topics(entry)).union(semantic_topics(entry))
    hits = sum(
        1
        for topic in topics
        for hint in DOMAIN_HINTS.get(topic, ())
        if hint in folded
    )
    return 16 * hits
|
||||
|
||||
|
||||
def candidate_style(text: str) -> str:
    """Classify a definition as ``"direct"``, ``"balanced"`` or ``"oblique"``.

    * ``"direct"``: the text contains ``;`` or is longer than 90 characters.
    * ``"balanced"``: otherwise, when the folded, lower-cased text contains
      one of a fixed set of marker substrings.
    * ``"oblique"``: everything else.
    """
    balanced_markers = ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")
    folded = ascii_fold(text).lower()

    if ";" in text or len(text) > 90:
        return "direct"
    for marker in balanced_markers:
        if marker in folded:
            return "balanced"
    return "oblique"
|
||||
|
||||
|
||||
def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min, max)`` definition length used for *difficulty*.

    Any value other than ``"easy"``, ``"medium"`` or ``"hard"`` gets the
    ``(12, 55)`` window.
    """
    windows = {
        "easy": (18, 90),
        "medium": (18, 78),
        "hard": (14, 62),
    }
    return windows.get(difficulty, (12, 55))
|
||||
|
||||
|
||||
def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Assemble a candidate record for one definition fragment.

    The text is cleaned with ``normalize_text`` and its style label is
    computed by ``candidate_style`` on the cleaned text. The other fields are
    stored as given.

    Returns:
        Dict with keys ``text``, ``source``, ``family``, ``confidence``,
        ``style`` and ``priority``, in that order.
    """
    normalized = normalize_text(text)
    record: Dict[str, object] = dict(
        text=normalized,
        source=source,
        family=family,
        confidence=confidence,
    )
    record["style"] = candidate_style(normalized)
    record["priority"] = priority
    return record
|
||||
|
||||
|
||||
def _definition_pieces(raw: object) -> List[str]:
    """Split one raw definition/gloss value into cleaned pieces; ``None`` yields none."""
    if raw is None:
        # JSON null must not be stringified into the bogus clue "None.".
        return []
    return split_definition_text(str(raw))


def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather every candidate clue definition available for *entry*.

    Sources are visited in this order, which is also the order of the result:

    1. ``semantic.synsets[*].definition``   (family ``semantic_definition``)
    2. ``semantic.glosses[*]``              (family ``semantic_gloss``)
    3. ``senses[*].definition``             (family ``refined_sense``)
    4. ``babelnet.best_synset.glosses[*]``  (family ``babelnet_gloss``)
    5. ``wiktextract.definitions[*]``       (family ``wiktextract_definition``)

    Each raw value is split into pieces and turned into a candidate with
    ``build_candidate``. The priority decreases with the item's position in
    its list. A candidate is skipped when its text is empty or when the same
    (lower-cased text, family) pair was already collected.

    Compared with the earlier copy-pasted version, ``null`` definitions and
    glosses are skipped instead of becoming the literal text ``"None"``, and
    a ``null`` sense source falls back to ``"refined"``.

    Args:
        entry: Lexicon entry; sections that are missing or of the wrong type
            are ignored.

    Returns:
        Candidate dicts as produced by ``build_candidate``.
    """
    candidates: List[Dict[str, object]] = []
    seen = set()

    def register(
        pieces: List[str],
        *,
        source: str,
        family: str,
        confidence: float,
        priority: int,
    ) -> None:
        """Build a candidate for each piece and keep the ones not seen yet."""
        for piece in pieces:
            candidate = build_candidate(
                piece,
                source=source,
                family=family,
                confidence=confidence,
                priority=priority,
            )
            text = str(candidate["text"])
            key = (text.lower(), candidate["family"])
            if text and key not in seen:
                seen.add(key)
                candidates.append(candidate)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        # The index counts every list item, including skipped non-dict ones.
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            register(
                _definition_pieces(synset.get("definition", "")),
                source="semantic",
                family="semantic_definition",
                confidence=0.9,
                priority=max(0, 100 - index * 12),
            )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            register(
                _definition_pieces(gloss),
                source="semantic_gloss",
                family="semantic_gloss",
                confidence=0.8,
                priority=max(0, 90 - index * 10),
            )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        pieces = _definition_pieces(sense.get("definition", ""))
        if not pieces:
            # As before, source/confidence are only read when there is text.
            continue
        raw_source = sense.get("source", "refined")
        sense_source = "refined" if raw_source is None else str(raw_source)
        register(
            pieces,
            # Senses labelled "semantic" are reported as "refined" here.
            source="refined" if sense_source == "semantic" else sense_source,
            family="refined_sense",
            confidence=float(sense.get("confidence", 0.7) or 0.7),
            priority=max(0, 80 - index * 8),
        )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                register(
                    _definition_pieces(gloss),
                    source="babelnet",
                    family="babelnet_gloss",
                    confidence=confidence,
                    priority=max(0, 60 - index * 8),
                )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            register(
                _definition_pieces(definition),
                source="wiktextract",
                family="wiktextract_definition",
                confidence=confidence,
                priority=max(0, 88 - index * 9),
            )

    return candidates
|
||||
|
||||
|
||||
def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Compute the heuristic score of *candidate* as a clue for *entry*.

    Texts shorter than 12 characters get the sentinel ``-10_000``. Otherwise
    the score is the sum of:

    * -140 if the text contains the answer, else +30;
    * -220 for a suspicious proper-noun definition;
    * -80 for a figurative/abstract definition;
    * +24 inside the difficulty's length window, otherwise a penalty equal to
      the overshoot, or half the undershoot;
    * bonuses for source and family, the candidate's priority, and
      ``int(confidence * 35)``;
    * the topic alignment score, with -45 when the entry has a topic that is
      a ``DOMAIN_HINTS`` key but no hint matched;
    * a bonus when the candidate's style suits the difficulty;
    * -30 for register markers such as ``(fig.)`` on easy/medium;
    * -10 for a ``;`` on hard/expert;
    * -8 when the entry is flagged ``needs_review``.
    """
    text = str(candidate["text"])
    answer = str(entry.get("form", "")).lower()
    source = str(candidate.get("source"))
    family = str(candidate.get("family"))
    confidence = float(candidate.get("confidence", 0.0) or 0.0)

    text_length = len(text)
    if text_length < 12:
        return -10_000

    total = -140 if definition_mentions_answer(text, answer) else 30
    if suspicious_proper_noun_definition(text, entry):
        total -= 220
    if likely_abstract_detour(text):
        total -= 80

    shortest, longest = length_window(difficulty)
    if text_length > longest:
        total -= text_length - longest
    elif text_length < shortest:
        total -= (shortest - text_length) // 2
    else:
        total += 24

    source_bonus = {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }
    family_bonus = {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }
    total += source_bonus.get(source, 10)
    total += family_bonus.get(family, 0)
    total += int(candidate.get("priority", 0) or 0)
    total += int(confidence * 35)

    alignment = topic_alignment_score(text, entry)
    total += alignment
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    concrete = {"religion", "transport", "health", "nature", "geography", "sea"}
    if alignment == 0 and entry_topics & concrete:
        total -= 45

    # difficulty -> (styles that earn a bonus, bonus)
    style_rewards = {
        "easy": (("direct",), 16),
        "medium": (("direct", "balanced"), 14),
        "hard": (("balanced",), 10),
        "expert": (("oblique",), 10),
    }
    style = str(candidate.get("style"))
    rewarded_styles, reward = style_rewards.get(difficulty, ((), 0))
    if style in rewarded_styles:
        total += reward

    if difficulty in ("easy", "medium"):
        if re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
            total -= 30
    if difficulty in ("hard", "expert") and ";" in text:
        total -= 10

    if entry.get("needs_review"):
        total -= 8

    return total
|
||||
|
||||
|
||||
def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Pick the highest-ranked candidate for *difficulty*, or ``None`` if there is none.

    Candidates are ranked by score, then confidence, then priority, then
    shorter text. When several candidates tie on every criterion, the one
    that appears first in *candidates* wins. The winner is returned whatever
    its score, including heavily penalised ones.
    """

    def ranking_key(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0)),
            float(candidate.get("priority", 0.0)),
            -len(str(candidate.get("text", ""))),
        )

    # max() returns the first maximal element, which is the same element a
    # stable descending sort would put first.
    return max(candidates, key=ranking_key, default=None)
|
||||
|
||||
|
||||
def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reasons why *entry* should be reviewed by a human.

    Possible reasons, in the order they are checked:
    ``no_viable_definition``, ``flagged_by_refined_stage``,
    ``proper_noun_collision``, ``babelnet_ambiguous``, ``wiktextract_missing``,
    ``only_general_topics``, ``very_short_word``, ``function_word``,
    ``unresolved_sense_topics`` and ``candidate_mentions_answer``.

    Robustness fixes over the earlier version:

    * a ``babelnet`` section that is not a dict no longer raises;
    * ``"senses": null`` no longer raises when the preferred source is
      babelnet;
    * an entry with an empty form is no longer reported as
      ``candidate_mentions_answer`` (an empty answer matched every text).

    Args:
        entry: Entry whose ``preferred_definition`` / ``preferred_source``
            fields are read if present.
        candidates: Candidate pool collected for the entry.

    Returns:
        De-duplicated reason codes; empty when nothing needs review.
    """
    reasons: List[str] = []
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))

    babelnet = entry.get("babelnet")
    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""

    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))
    senses = entry.get("senses") or []  # tolerate a missing or null list

    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    if preferred_source == "babelnet" and any(
        # Looks for the text "None" in the stringified topics, so it matches a
        # missing value as well as a None/"None" item inside a list.
        "None" in str(sense.get("topics"))
        for sense in senses
        if isinstance(sense, dict)
    ):
        reasons.append("unresolved_sense_topics")
    if (
        preferred_definition
        and form.strip()  # an empty answer would trivially "match"
        and definition_mentions_answer(preferred_definition, form.lower())
    ):
        reasons.append("candidate_mentions_answer")

    return dedupe(reasons)
|
||||
|
||||
|
||||
def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry and, if needed, build its review record.

    The entry is deep-copied, so the argument is not modified. The copy gains:
    ``curated_glosses``, ``curated_senses``, ``preferred_definition``,
    ``preferred_source``, ``clue_definitions`` / ``clue_sources`` /
    ``clue_scores`` (keyed by difficulty), ``curation_notes``, ``alpha_ready``
    and ``review_reasons``.

    The preferred definition is the ``medium`` clue, falling back to ``easy``.
    An entry is ``alpha_ready`` when it has a preferred definition and none of
    the severe review reasons applies.

    Unlike the earlier version, ``semantic`` / ``babelnet`` / ``wiktextract``
    sections that are not dicts are treated as empty when the review record
    is built, instead of raising ``AttributeError``. This matches the
    ``isinstance`` guards used elsewhere in this file.

    Args:
        entry: Refined lexicon entry.

    Returns:
        ``(curated_entry, review_item)``; ``review_item`` is ``None`` when the
        entry has no review reasons.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    def section(name: str) -> Dict[str, object]:
        """Return the named sub-dict of the entry, or ``{}`` if it is not a dict."""
        value = curated.get(name)
        return value if isinstance(value, dict) else {}

    shared_fields = ("source", "family", "confidence", "priority")

    def exported(candidate: Dict[str, object], text_key: str) -> Dict[str, object]:
        """Serialise a candidate without its style, storing the text under *text_key*."""
        record: Dict[str, object] = {text_key: candidate["text"]}
        for field in shared_fields:
            record[field] = candidate[field]
        return record

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    curation_notes: List[str] = []

    for difficulty in DIFFICULTIES:
        best = choose_best_candidate(candidates, curated, difficulty)
        if best:
            clue_definitions[difficulty] = str(best["text"])
            clue_sources[difficulty] = str(best["source"])
            clue_scores[difficulty] = score_candidate(best, curated, difficulty)

    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    # A missing medium clue counts as weak too (default is below the threshold).
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    curated["curated_glosses"] = dedupe(candidate["text"] for candidate in candidates)
    curated["curated_senses"] = [exported(candidate, "definition") for candidate in candidates]
    curated["preferred_definition"] = preferred_definition
    curated["preferred_source"] = preferred_source
    curated["clue_definitions"] = clue_definitions
    curated["clue_sources"] = clue_sources
    curated["clue_scores"] = clue_scores
    curated["curation_notes"] = curation_notes

    # review_reasons reads the preferred_* fields stored just above.
    reasons = review_reasons(curated, candidates)
    severe = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and not severe.intersection(reasons)
    curated["review_reasons"] = reasons

    review_item: Optional[Dict[str, object]] = None
    if reasons:
        babelnet = section("babelnet")
        review_item = {
            "form": curated.get("form"),
            "lemma": curated.get("lemma"),
            "pos": curated.get("pos"),
            "topics": curated.get("topics"),
            "topic_suggestions": curated.get("topic_suggestions"),
            "preferred_definition": preferred_definition,
            "preferred_source": preferred_source,
            "clue_definitions": clue_definitions,
            "review_reasons": reasons,
            "semantic_glosses": section("semantic").get("glosses", []),
            "senses": curated.get("senses", []),
            "babelnet_status": babelnet.get("status"),
            "babelnet_best_synset": babelnet.get("best_synset"),
            "wiktextract_status": section("wiktextract").get("status"),
            "wiktextract": curated.get("wiktextract"),
            # Only the first 12 candidates are exported for review.
            "candidate_pool": [exported(candidate, "text") for candidate in candidates[:12]],
        }

    return curated, review_item
|
||||
|
||||
|
||||
def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon and build both output payloads.

    Entries that are not dicts are skipped. When ``args.max_review`` is
    positive, the review list is cut to that many items. The ``review_count``
    in the curated payload and the ``entry_count`` in the review payload both
    report the length of that (possibly truncated) list.

    The timestamp is now taken once and shared by both payloads. The earlier
    version called ``datetime.now()`` twice, so the paired files could carry
    different ``generated_at`` values.

    Args:
        args: Parsed options; ``input`` (a path) and ``max_review`` are read.

    Returns:
        ``(curated_payload, review_payload)``, each with ``meta`` and
        ``entries`` keys.

    Raises:
        ValueError: If the input JSON is not an object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []
    for entry in payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        curated, review_item = curate_entry(entry)
        curated_entries.append(curated)
        if review_item:
            review_entries.append(review_item)

    if args.max_review > 0:
        review_entries = review_entries[: args.max_review]

    # One timestamp for both files, so they identify the same run.
    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")
    base_lexicon = args.input.name

    curated_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": base_lexicon,
            "generated_at": generated_at,
            "entry_count": len(curated_entries),
            "alpha_ready_count": sum(1 for item in curated_entries if item.get("alpha_ready")),
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }
    review_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": base_lexicon,
            "generated_at": generated_at,
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }
    return curated_payload, review_payload
|
||||
|
||||
|
||||
def main() -> None:
    """Run the curation: parse options, build both payloads, write them and print a summary."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)

    write_json(args.output, curated_payload)
    write_json(args.review_output, review_payload)

    summary_lines = (
        f"Lessico curated generato: {args.output}",
        f"Voci totali: {curated_payload['meta']['entry_count']}",
        f"Voci alpha_ready: {curated_payload['meta']['alpha_ready_count']}",
        f"Voci da revisionare: {review_payload['meta']['entry_count']}",
        f"File review generato: {args.review_output}",
    )
    for line in summary_lines:
        print(line)


# Run the curation only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user