from __future__ import annotations import argparse import json import re from copy import deepcopy from datetime import datetime from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Tuple from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json") TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json") DIFFICULTIES = ("easy", "medium", "hard", "expert") TEXT_REPLACEMENTS = { " ngrandimento": " ingrandimento", "superificie": "superficie", "quantitaaa": "quantità", "quantitaaaa": "quantità", "quantit": "quantità", "sanit_militare": "sanità_militare", " unaparola ": " una parola ", "questa parola, ": "", "questa parola; ": "", } SUSPICIOUS_PROPER_PATTERNS = ( r"\bepisodio\b", r"\bfilm\b", r"\bserie tv\b", r"\bfamiglia\b", r"\bcomune italiano\b", r"\bfrazione del comune\b", r"\bcitta metropolitana\b", r"\bpersonaggio\b", r"\balbum\b", r"\bcognome\b", r"\bnome proprio\b", ) DOMAIN_HINTS = { "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"), "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"), "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"), "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"), "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"), "sea": ("acque", "salate", "superficie terrestre", "oceano"), } ABSTRACT_PATTERNS = ( r"\bgrande quantita\b", r"\bfigurato\b", r"\bsenso figurato\b", ) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json." 
) parser.add_argument( "--input", type=Path, default=REFINED_LEXICON_OUTPUT_PATH, help="Lessico refined di partenza.", ) parser.add_argument( "--output", type=Path, default=CURATED_LEXICON_OUTPUT_PATH, help="Lessico curated da generare.", ) parser.add_argument( "--review-output", type=Path, default=TO_BE_REVIEW_OUTPUT_PATH, help="File JSON con le voci che richiedono revisione umana.", ) parser.add_argument( "--max-review", type=int, default=0, help="Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.", ) return parser.parse_args() def load_json(path: Path) -> Dict[str, object]: return json.loads(path.read_text(encoding="utf-8")) def write_json(path: Path, payload: Dict[str, object]) -> None: path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") def dedupe(items: Iterable[str]) -> List[str]: result: List[str] = [] seen = set() for item in items: text = str(item).strip() if not text: continue key = text.lower() if key in seen: continue seen.add(key) result.append(text) return result def ascii_fold(text: str) -> str: replacements = str.maketrans( { "à": "a", "á": "a", "è": "e", "é": "e", "ì": "i", "í": "i", "ò": "o", "ó": "o", "ù": "u", "ú": "u", } ) return str(text).translate(replacements) def normalize_text(text: str) -> str: value = str(text or "").strip() if not value: return "" for old, new in TEXT_REPLACEMENTS.items(): value = value.replace(old, new) value = re.sub(r"\s+", " ", value) value = re.sub(r"\s*;\s*", "; ", value) value = re.sub(r"\s*,\s*", ", ", value) value = value.strip(" .;:-") if value and value[0].islower(): value = value[0].upper() + value[1:] return value + "." 
# Sentence/fragment separator used by split_definition_text(): a semicolon
# followed by whitespace, or a period followed by whitespace and a letter.
_DEFINITION_SPLIT_RE = re.compile(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", flags=re.IGNORECASE)

# Score bonus per candidate source; unknown sources get 10.
_SOURCE_BONUS = {
    "semantic": 55,
    "semantic_gloss": 40,
    "babelnet": 24,
    "refined": 30,
    "wiktextract": 52,
}

# Score bonus per candidate family; unknown families get 0.
_FAMILY_BONUS = {
    "semantic_definition": 30,
    "semantic_gloss": 18,
    "babelnet_gloss": 8,
    "refined_sense": 22,
    "wiktextract_definition": 28,
}

# Topics for which a definition is expected to contain a domain hint.
_CONCRETE_TOPICS = frozenset({"religion", "transport", "health", "nature", "geography", "sea"})

# Review reasons that prevent an entry from being marked alpha_ready.
_SEVERE_REASONS = frozenset({"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"})


def _as_dict(value: object) -> Dict[str, object]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def split_definition_text(text: str) -> List[str]:
    """Split a raw definition into normalized sentences.

    The text is cut at semicolons and at sentence-ending periods; each piece
    goes through ``normalize_text`` and empty results are dropped.
    """
    value = str(text or "").strip()
    if not value:
        return []
    normalized = []
    for piece in _DEFINITION_SPLIT_RE.split(value):
        cleaned = normalize_text(piece)
        if cleaned:
            normalized.append(cleaned)
    return normalized


def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Return True when the entry form starts lowercase and has no name tags."""
    form = str(entry.get("form", ""))
    if not form or not form[:1].islower():
        return False
    return not (entry.get("name_tags") or [])


def definition_mentions_answer(text: str, answer: str) -> bool:
    """Return True when *answer* occurs inside *text*.

    The comparison is a substring test on ascii-folded, lowercased strings.
    An empty or blank answer never counts as mentioned (an empty needle would
    otherwise match every text).
    """
    needle = str(answer or "").strip()
    if not needle:
        return False
    return ascii_fold(needle).lower() in ascii_fold(text).lower()


def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Return True when a common-word entry has a proper-noun style definition.

    Entries that are not common words (see ``entry_is_common_word``) are never
    flagged.
    """
    if not entry_is_common_word(entry):
        return False
    lowered = ascii_fold(text).lower()
    return any(re.search(pattern, lowered) for pattern in SUSPICIOUS_PROPER_PATTERNS)


def likely_abstract_detour(text: str) -> bool:
    """Return True when *text* matches one of ``ABSTRACT_PATTERNS``."""
    lowered = ascii_fold(text).lower()
    return any(re.search(pattern, lowered) for pattern in ABSTRACT_PATTERNS)


def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect the lowercased, de-duplicated topics from the enrichment data.

    Reads ``semantic.semantic_topics`` and ``wiktextract.topic_hints``; either
    section is ignored when it is not a dict.
    """
    topics: List[str] = []
    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        topics.extend(str(item).lower() for item in semantic.get("semantic_topics", []) or [])
    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        topics.extend(str(item).lower() for item in wiktextract.get("topic_hints", []) or [])
    return dedupe(topics)


def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's own ``topics`` lowercased, skipping falsy items."""
    return [str(item).lower() for item in entry.get("topics", []) or [] if item]


def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score how well *text* matches the entry topics.

    Every ``DOMAIN_HINTS`` substring of every entry topic found in the
    ascii-folded, lowercased text adds 16 points.
    """
    lowered = ascii_fold(text).lower()
    topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    score = 0
    for topic in topics:
        for hint in DOMAIN_HINTS.get(topic, ()):
            if hint in lowered:
                score += 16
    return score


def candidate_style(text: str) -> str:
    """Classify a definition as ``"direct"``, ``"balanced"`` or ``"oblique"``.

    Texts containing a semicolon or longer than 90 characters are "direct";
    texts containing one of a few descriptive markers are "balanced";
    everything else is "oblique".
    """
    lowered = ascii_fold(text).lower()
    if ";" in text or len(text) > 90:
        return "direct"
    if any(marker in lowered for marker in ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")):
        return "balanced"
    return "oblique"


def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min_len, max_len)`` preferred text length for a difficulty.

    Any difficulty other than easy/medium/hard gets the "expert" window.
    """
    if difficulty == "easy":
        return 18, 90
    if difficulty == "medium":
        return 18, 78
    if difficulty == "hard":
        return 14, 62
    return 12, 55


def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Build a candidate record from a definition text.

    The text is normalized and its style is derived from the normalized form;
    the remaining fields are stored as given.
    """
    cleaned = normalize_text(text)
    return {
        "text": cleaned,
        "source": source,
        "family": family,
        "confidence": confidence,
        "style": candidate_style(cleaned),
        "priority": priority,
    }


def _add_candidates(
    candidates: List[Dict[str, object]],
    seen: set,
    raw_text: object,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int,
) -> None:
    """Split *raw_text* and append each new, non-empty piece as a candidate.

    A piece is new when its ``(lowercased text, family)`` pair is not in
    *seen*. A missing (``None``) text yields no candidate.
    """
    # raw_text is passed through unconverted: split_definition_text() maps a
    # falsy value to "", whereas str(None) would produce a "None." candidate.
    for piece in split_definition_text(raw_text):
        candidate = build_candidate(
            piece,
            source=source,
            family=family,
            confidence=confidence,
            priority=priority,
        )
        key = (candidate["text"].lower(), candidate["family"])
        if candidate["text"] and key not in seen:
            seen.add(key)
            candidates.append(candidate)


def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather every candidate definition available for *entry*.

    Sources are visited in a fixed order: semantic synset definitions,
    semantic glosses, refined senses, BabelNet glosses of the best synset and
    wiktextract definitions. Within a source, later items get a lower
    priority. Duplicates (same text and family) are kept only once.
    """
    candidates: List[Dict[str, object]] = []
    seen: set = set()

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            _add_candidates(
                candidates,
                seen,
                synset.get("definition", ""),
                source="semantic",
                family="semantic_definition",
                confidence=0.9,
                priority=max(0, 100 - index * 12),
            )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            _add_candidates(
                candidates,
                seen,
                gloss,
                source="semantic_gloss",
                family="semantic_gloss",
                confidence=0.8,
                priority=max(0, 90 - index * 10),
            )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        sense_source = str(sense.get("source") or "refined")
        _add_candidates(
            candidates,
            seen,
            sense.get("definition", ""),
            # Senses tagged "semantic" are relabelled so they do not collect
            # the bonus reserved for the semantic section itself.
            source="refined" if sense_source == "semantic" else sense_source,
            family="refined_sense",
            confidence=float(sense.get("confidence", 0.7) or 0.7),
            priority=max(0, 80 - index * 8),
        )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                _add_candidates(
                    candidates,
                    seen,
                    gloss,
                    source="babelnet",
                    family="babelnet_gloss",
                    confidence=confidence,
                    priority=max(0, 60 - index * 8),
                )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            _add_candidates(
                candidates,
                seen,
                definition,
                source="wiktextract",
                family="wiktextract_definition",
                confidence=confidence,
                priority=max(0, 88 - index * 9),
            )

    return candidates


def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Score a candidate definition for *entry* at the given difficulty.

    Returns:
        ``-10_000`` for texts shorter than 12 characters; otherwise the sum of
        the bonuses and penalties below (higher is better).
    """
    text = str(candidate["text"])
    length = len(text)
    if length < 12:
        return -10_000

    answer = str(entry.get("form", "")).lower()
    source = str(candidate.get("source"))
    family = str(candidate.get("family"))
    confidence = float(candidate.get("confidence", 0.0) or 0.0)
    style = str(candidate.get("style"))

    score = 0

    # A clue must not give the answer away.
    if definition_mentions_answer(text, answer):
        score -= 140
    else:
        score += 30
    if suspicious_proper_noun_definition(text, entry):
        score -= 220
    if likely_abstract_detour(text):
        score -= 80

    # Length: bonus inside the window, full penalty per extra character when
    # too long, half penalty per missing character when too short.
    min_len, max_len = length_window(difficulty)
    if min_len <= length <= max_len:
        score += 24
    elif length > max_len:
        score -= length - max_len
    else:
        score -= (min_len - length) // 2

    score += _SOURCE_BONUS.get(source, 10)
    score += _FAMILY_BONUS.get(family, 0)
    score += int(candidate.get("priority", 0) or 0)
    score += int(confidence * 35)

    alignment = topic_alignment_score(text, entry)
    score += alignment
    topical = set(lexical_topics(entry)) | set(semantic_topics(entry))
    if topical.intersection(_CONCRETE_TOPICS) and alignment == 0:
        # Entry has a concrete topic but the text shows no hint of it.
        score -= 45

    if difficulty == "easy" and style == "direct":
        score += 16
    elif difficulty == "medium" and style in {"direct", "balanced"}:
        score += 14
    elif difficulty == "hard" and style == "balanced":
        score += 10
    elif difficulty == "expert" and style == "oblique":
        score += 10

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
        score -= 30
    if difficulty in {"hard", "expert"} and ";" in text:
        score -= 10
    if entry.get("needs_review"):
        score -= 8
    return score


def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Return the highest-ranked candidate, or ``None`` when there are none.

    Candidates are ranked by score, then confidence, then priority, then
    shorter text; on a full tie the earliest candidate wins.
    """

    def rank(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0) or 0.0),
            float(candidate.get("priority", 0.0) or 0.0),
            -len(str(candidate.get("text", ""))),
        )

    # max() returns the first maximal item, matching a stable descending sort.
    return max(candidates, key=rank, default=None)


def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reasons why *entry* should be looked at by a human.

    Args:
        entry: The curated entry; ``preferred_definition`` and
            ``preferred_source`` are read from it.
        candidates: The candidate pool collected for the entry.

    Returns:
        De-duplicated reason codes, in the order the checks are performed.
    """
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))
    babelnet_status = str(_as_dict(entry.get("babelnet")).get("status", ""))
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))
    senses = [sense for sense in entry.get("senses") or [] if isinstance(sense, dict)]

    reasons: List[str] = []
    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    # The string test catches both a missing topics field and a None item.
    if preferred_source == "babelnet" and any("None" in str(sense.get("topics")) for sense in senses):
        reasons.append("unresolved_sense_topics")
    if preferred_definition and definition_mentions_answer(preferred_definition, form.lower()):
        reasons.append("candidate_mentions_answer")
    return dedupe(reasons)


def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry.

    The input is deep-copied, so *entry* itself is not modified.

    Returns:
        ``(curated, review_item)`` where ``curated`` is the entry copy with
        the curation fields added and ``review_item`` is a summary for human
        review, or ``None`` when the entry has no review reasons.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for difficulty in DIFFICULTIES:
        best = choose_best_candidate(candidates, curated, difficulty)
        if best:
            clue_definitions[difficulty] = str(best["text"])
            clue_sources[difficulty] = str(best["source"])
            clue_scores[difficulty] = score_candidate(best, curated, difficulty)

    # The medium clue is the preferred one, with the easy clue as fallback.
    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    curated["curated_glosses"] = dedupe(candidate["text"] for candidate in candidates)
    curated["curated_senses"] = [
        {
            "definition": candidate["text"],
            "source": candidate["source"],
            "family": candidate["family"],
            "confidence": candidate["confidence"],
            "priority": candidate["priority"],
        }
        for candidate in candidates
    ]
    curated["preferred_definition"] = preferred_definition
    curated["preferred_source"] = preferred_source
    curated["clue_definitions"] = clue_definitions
    curated["clue_sources"] = clue_sources
    curated["clue_scores"] = clue_scores
    curated["curation_notes"] = curation_notes

    reasons = review_reasons(curated, candidates)
    curated["alpha_ready"] = bool(preferred_definition) and not _SEVERE_REASONS.intersection(reasons)
    curated["review_reasons"] = reasons

    if not reasons:
        return curated, None

    babelnet = _as_dict(curated.get("babelnet"))
    review_item = {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": preferred_definition,
        "preferred_source": preferred_source,
        "clue_definitions": clue_definitions,
        "review_reasons": reasons,
        "semantic_glosses": _as_dict(curated.get("semantic")).get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": babelnet.get("status"),
        "babelnet_best_synset": babelnet.get("best_synset"),
        "wiktextract_status": _as_dict(curated.get("wiktextract")).get("status"),
        "wiktextract": curated.get("wiktextract"),
        # Only the first 12 candidates are exported for review.
        "candidate_pool": [
            {
                "text": candidate["text"],
                "source": candidate["source"],
                "family": candidate["family"],
                "confidence": candidate["confidence"],
                "priority": candidate["priority"],
            }
            for candidate in candidates[:12]
        ],
    }
    return curated, review_item


def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon.

    Args:
        args: Parsed options; ``input`` and ``max_review`` are read.

    Returns:
        ``(curated_payload, review_payload)``, each with ``meta`` and
        ``entries`` keys.

    Raises:
        ValueError: If the input document is not a dict with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []
    for entry in payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        curated, review_item = curate_entry(entry)
        curated_entries.append(curated)
        if review_item:
            review_entries.append(review_item)

    if args.max_review > 0:
        review_entries = review_entries[: args.max_review]

    # One timestamp for the whole run, so both files agree on generated_at.
    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")

    curated_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": generated_at,
            "entry_count": len(curated_entries),
            "alpha_ready_count": sum(1 for item in curated_entries if item.get("alpha_ready")),
            # Counted after the --max-review limit has been applied.
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }
    review_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": generated_at,
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }
    return curated_payload, review_payload


def main() -> None:
    """Run the curation: parse options, write both JSON files, print a summary."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)
    write_json(args.output, curated_payload)
    write_json(args.review_output, review_payload)
    print(f"Lessico curated generato: {args.output}")
    print(f"Voci totali: {curated_payload['meta']['entry_count']}")
    print(f"Voci alpha_ready: {curated_payload['meta']['alpha_ready_count']}")
    print(f"Voci da revisionare: {review_payload['meta']['entry_count']}")
    print(f"File review generato: {args.review_output}")


if __name__ == "__main__":
    main()